libcpp: Implement C++23 P2290R3 - Delimited escape sequences [PR106645]

Message ID YvyWH6JZYXjkPO49@tucnak
State New, archived
Headers
Series libcpp: Implement C++23 P2290R3 - Delimited escape sequences [PR106645] |

Commit Message

Jakub Jelinek Aug. 17, 2022, 7:17 a.m. UTC
  Hi!

The following patch implements the C++23 P2290R3 paper.

Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

2022-08-17  Jakub Jelinek  <jakub@redhat.com>

	PR c++/106645
libcpp/
	* include/cpplib.h (struct cpp_options): Implement
	P2290R3 - Delimited escape sequences.  Add delimited_escape_seqs
	member.
	* init.cc (struct lang_flags): Likewise.
	(lang_defaults): Add delim column.
	(cpp_set_lang): Copy over delimited_escape_seqs.
	* charset.cc (_cpp_valid_ucn): Handle delimited escape sequences.
	(convert_hex): Likewise.
	(convert_oct): Likewise.
	(convert_escape): Call convert_oct even for \o.
	(_cpp_interpret_identifier): Handle delimited escape sequences.
	* lex.cc (get_bidi_ucn_1): Likewise.  Add end argument, fill it in.
	(get_bidi_ucn): Adjust get_bidi_ucn_1 caller.  Use end argument to
	compute num_bytes.
gcc/testsuite/
	* c-c++-common/cpp/delimited-escape-seq-1.c: New test.
	* c-c++-common/cpp/delimited-escape-seq-2.c: New test.
	* c-c++-common/cpp/delimited-escape-seq-3.c: New test.
	* c-c++-common/Wbidi-chars-24.c: New test.
	* gcc.dg/cpp/delimited-escape-seq-1.c: New test.
	* gcc.dg/cpp/delimited-escape-seq-2.c: New test.
	* g++.dg/cpp/delimited-escape-seq-1.C: New test.
	* g++.dg/cpp/delimited-escape-seq-2.C: New test.


	Jakub
  

Comments

Jason Merrill Aug. 17, 2022, 8:47 p.m. UTC | #1
On 8/17/22 00:17, Jakub Jelinek wrote:
> Hi!
> 
> The following patch implements the C++23 P2290R3 paper.
> 
> Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?
> 
> 2022-08-17  Jakub Jelinek  <jakub@redhat.com>
> 
> 	PR c++/106645
> libcpp/
> 	* include/cpplib.h (struct cpp_options): Implement
> 	P2290R3 - Delimited escape sequences.  Add delimite_escape_seqs
> 	member.
> 	* init.cc (struct lang_flags): Likewise.
> 	(lang_defaults): Add delim column.
> 	(cpp_set_lang): Copy over delimite_escape_seqs.
> 	* charset.cc (_cpp_valid_ucn): Handle delimited escape sequences.
> 	(convert_hex): Likewise.
> 	(convert_oct): Likewise.
> 	(convert_escape): Call convert_oct even for \o.
> 	(_cpp_interpret_identifier): Handle delimited escape sequences.
> 	* lex.cc (get_bidi_ucn_1): Likewise.  Add end argument, fill it in.
> 	(get_bidi_ucn): Adjust get_bidi_ucn_1 caller.  Use end argument to
> 	compute num_bytes.
> gcc/testsuite/
> 	* c-c++-common/cpp/delimited-escape-seq-1.c: New test.
> 	* c-c++-common/cpp/delimited-escape-seq-2.c: New test.
> 	* c-c++-common/cpp/delimited-escape-seq-3.c: New test.
> 	* c-c++-common/Wbidi-chars-24.c: New test.
> 	* gcc.dg/cpp/delimited-escape-seq-1.c: New test.
> 	* gcc.dg/cpp/delimited-escape-seq-2.c: New test.
> 	* g++.dg/cpp/delimited-escape-seq-1.C: New test.
> 	* g++.dg/cpp/delimited-escape-seq-2.C: New test.
> 
> --- libcpp/include/cpplib.h.jj	2022-08-10 09:06:53.268209449 +0200
> +++ libcpp/include/cpplib.h	2022-08-15 19:32:53.743213474 +0200
> @@ -519,6 +519,9 @@ struct cpp_options
>     /* Nonzero for C++23 size_t literals.  */
>     unsigned char size_t_literals;
>   
> +  /* Nonzero for C++23 delimited escape sequences.  */
> +  unsigned char delimited_escape_seqs;
> +
>     /* Holds the name of the target (execution) character set.  */
>     const char *narrow_charset;
>   
> --- libcpp/init.cc.jj	2022-08-10 09:06:53.268209449 +0200
> +++ libcpp/init.cc	2022-08-15 16:09:01.403020485 +0200
> @@ -96,34 +96,35 @@ struct lang_flags
>     char dfp_constants;
>     char size_t_literals;
>     char elifdef;
> +  char delimited_escape_seqs;
>   };
>   
>   static const struct lang_flags lang_defaults[] =
> -{ /*              c99 c++ xnum xid c11 std digr ulit rlit udlit bincst digsep trig u8chlit vaopt scope dfp szlit elifdef */
> -  /* GNUC89   */  { 0,  0,  1,  0,  0,  0,  1,   0,   0,   0,    0,     0,     0,   0,      1,   1,     0,   0,   0 },
> -  /* GNUC99   */  { 1,  0,  1,  1,  0,  0,  1,   1,   1,   0,    0,     0,     0,   0,      1,   1,     0,   0,   0 },
> -  /* GNUC11   */  { 1,  0,  1,  1,  1,  0,  1,   1,   1,   0,    0,     0,     0,   0,      1,   1,     0,   0,   0 },
> -  /* GNUC17   */  { 1,  0,  1,  1,  1,  0,  1,   1,   1,   0,    0,     0,     0,   0,      1,   1,     0,   0,   0 },
> -  /* GNUC2X   */  { 1,  0,  1,  1,  1,  0,  1,   1,   1,   0,    1,     1,     0,   1,      1,   1,     1,   0,   1 },
> -  /* STDC89   */  { 0,  0,  0,  0,  0,  1,  0,   0,   0,   0,    0,     0,     1,   0,      0,   0,     0,   0,   0 },
> -  /* STDC94   */  { 0,  0,  0,  0,  0,  1,  1,   0,   0,   0,    0,     0,     1,   0,      0,   0,     0,   0,   0 },
> -  /* STDC99   */  { 1,  0,  1,  1,  0,  1,  1,   0,   0,   0,    0,     0,     1,   0,      0,   0,     0,   0,   0 },
> -  /* STDC11   */  { 1,  0,  1,  1,  1,  1,  1,   1,   0,   0,    0,     0,     1,   0,      0,   0,     0,   0,   0 },
> -  /* STDC17   */  { 1,  0,  1,  1,  1,  1,  1,   1,   0,   0,    0,     0,     1,   0,      0,   0,     0,   0,   0 },
> -  /* STDC2X   */  { 1,  0,  1,  1,  1,  1,  1,   1,   0,   0,    1,     1,     1,   1,      0,   1,     1,   0,   1 },
> -  /* GNUCXX   */  { 0,  1,  1,  1,  0,  0,  1,   0,   0,   0,    0,     0,     0,   0,      1,   1,     0,   0,   0 },
> -  /* CXX98    */  { 0,  1,  0,  1,  0,  1,  1,   0,   0,   0,    0,     0,     1,   0,      0,   1,     0,   0,   0 },
> -  /* GNUCXX11 */  { 1,  1,  1,  1,  1,  0,  1,   1,   1,   1,    0,     0,     0,   0,      1,   1,     0,   0,   0 },
> -  /* CXX11    */  { 1,  1,  0,  1,  1,  1,  1,   1,   1,   1,    0,     0,     1,   0,      0,   1,     0,   0,   0 },
> -  /* GNUCXX14 */  { 1,  1,  1,  1,  1,  0,  1,   1,   1,   1,    1,     1,     0,   0,      1,   1,     0,   0,   0 },
> -  /* CXX14    */  { 1,  1,  0,  1,  1,  1,  1,   1,   1,   1,    1,     1,     1,   0,      0,   1,     0,   0,   0 },
> -  /* GNUCXX17 */  { 1,  1,  1,  1,  1,  0,  1,   1,   1,   1,    1,     1,     0,   1,      1,   1,     0,   0,   0 },
> -  /* CXX17    */  { 1,  1,  1,  1,  1,  1,  1,   1,   1,   1,    1,     1,     0,   1,      0,   1,     0,   0,   0 },
> -  /* GNUCXX20 */  { 1,  1,  1,  1,  1,  0,  1,   1,   1,   1,    1,     1,     0,   1,      1,   1,     0,   0,   0 },
> -  /* CXX20    */  { 1,  1,  1,  1,  1,  1,  1,   1,   1,   1,    1,     1,     0,   1,      1,   1,     0,   0,   0 },
> -  /* GNUCXX23 */  { 1,  1,  1,  1,  1,  0,  1,   1,   1,   1,    1,     1,     0,   1,      1,   1,     0,   1,   1 },
> -  /* CXX23    */  { 1,  1,  1,  1,  1,  1,  1,   1,   1,   1,    1,     1,     0,   1,      1,   1,     0,   1,   1 },
> -  /* ASM      */  { 0,  0,  1,  0,  0,  0,  0,   0,   0,   0,    0,     0,     0,   0,      0,   0,     0,   0,   0 }
> +{ /*              c99 c++ xnum xid c11 std digr ulit rlit udlit bincst digsep trig u8chlit vaopt scope dfp szlit elifdef delim */
> +  /* GNUC89   */  { 0,  0,  1,  0,  0,  0,  1,   0,   0,   0,    0,     0,     0,   0,      1,   1,     0,   0,   0,      0 },
> +  /* GNUC99   */  { 1,  0,  1,  1,  0,  0,  1,   1,   1,   0,    0,     0,     0,   0,      1,   1,     0,   0,   0,      0 },
> +  /* GNUC11   */  { 1,  0,  1,  1,  1,  0,  1,   1,   1,   0,    0,     0,     0,   0,      1,   1,     0,   0,   0,      0 },
> +  /* GNUC17   */  { 1,  0,  1,  1,  1,  0,  1,   1,   1,   0,    0,     0,     0,   0,      1,   1,     0,   0,   0,      0 },
> +  /* GNUC2X   */  { 1,  0,  1,  1,  1,  0,  1,   1,   1,   0,    1,     1,     0,   1,      1,   1,     1,   0,   1,      0 },
> +  /* STDC89   */  { 0,  0,  0,  0,  0,  1,  0,   0,   0,   0,    0,     0,     1,   0,      0,   0,     0,   0,   0,      0 },
> +  /* STDC94   */  { 0,  0,  0,  0,  0,  1,  1,   0,   0,   0,    0,     0,     1,   0,      0,   0,     0,   0,   0,      0 },
> +  /* STDC99   */  { 1,  0,  1,  1,  0,  1,  1,   0,   0,   0,    0,     0,     1,   0,      0,   0,     0,   0,   0,      0 },
> +  /* STDC11   */  { 1,  0,  1,  1,  1,  1,  1,   1,   0,   0,    0,     0,     1,   0,      0,   0,     0,   0,   0,      0 },
> +  /* STDC17   */  { 1,  0,  1,  1,  1,  1,  1,   1,   0,   0,    0,     0,     1,   0,      0,   0,     0,   0,   0,      0 },
> +  /* STDC2X   */  { 1,  0,  1,  1,  1,  1,  1,   1,   0,   0,    1,     1,     1,   1,      0,   1,     1,   0,   1,      0 },
> +  /* GNUCXX   */  { 0,  1,  1,  1,  0,  0,  1,   0,   0,   0,    0,     0,     0,   0,      1,   1,     0,   0,   0,      0 },
> +  /* CXX98    */  { 0,  1,  0,  1,  0,  1,  1,   0,   0,   0,    0,     0,     1,   0,      0,   1,     0,   0,   0,      0 },
> +  /* GNUCXX11 */  { 1,  1,  1,  1,  1,  0,  1,   1,   1,   1,    0,     0,     0,   0,      1,   1,     0,   0,   0,      0 },
> +  /* CXX11    */  { 1,  1,  0,  1,  1,  1,  1,   1,   1,   1,    0,     0,     1,   0,      0,   1,     0,   0,   0,      0 },
> +  /* GNUCXX14 */  { 1,  1,  1,  1,  1,  0,  1,   1,   1,   1,    1,     1,     0,   0,      1,   1,     0,   0,   0,      0 },
> +  /* CXX14    */  { 1,  1,  0,  1,  1,  1,  1,   1,   1,   1,    1,     1,     1,   0,      0,   1,     0,   0,   0,      0 },
> +  /* GNUCXX17 */  { 1,  1,  1,  1,  1,  0,  1,   1,   1,   1,    1,     1,     0,   1,      1,   1,     0,   0,   0,      0 },
> +  /* CXX17    */  { 1,  1,  1,  1,  1,  1,  1,   1,   1,   1,    1,     1,     0,   1,      0,   1,     0,   0,   0,      0 },
> +  /* GNUCXX20 */  { 1,  1,  1,  1,  1,  0,  1,   1,   1,   1,    1,     1,     0,   1,      1,   1,     0,   0,   0,      0 },
> +  /* CXX20    */  { 1,  1,  1,  1,  1,  1,  1,   1,   1,   1,    1,     1,     0,   1,      1,   1,     0,   0,   0,      0 },
> +  /* GNUCXX23 */  { 1,  1,  1,  1,  1,  0,  1,   1,   1,   1,    1,     1,     0,   1,      1,   1,     0,   1,   1,      1 },
> +  /* CXX23    */  { 1,  1,  1,  1,  1,  1,  1,   1,   1,   1,    1,     1,     0,   1,      1,   1,     0,   1,   1,      1 },
> +  /* ASM      */  { 0,  0,  1,  0,  0,  0,  0,   0,   0,   0,    0,     0,     0,   0,      0,   0,     0,   0,   0,      0 }
>   };
>   
>   /* Sets internal flags correctly for a given language.  */
> @@ -153,6 +154,7 @@ cpp_set_lang (cpp_reader *pfile, enum c_
>     CPP_OPTION (pfile, dfp_constants)		 = l->dfp_constants;
>     CPP_OPTION (pfile, size_t_literals)		 = l->size_t_literals;
>     CPP_OPTION (pfile, elifdef)			 = l->elifdef;
> +  CPP_OPTION (pfile, delimited_escape_seqs)	 = l->delimited_escape_seqs;
>   }
>   
>   /* Initialize library global state.  */
> --- libcpp/charset.cc.jj	2022-08-15 12:52:43.213902801 +0200
> +++ libcpp/charset.cc	2022-08-16 11:42:27.729948705 +0200
> @@ -1081,6 +1081,7 @@ _cpp_valid_ucn (cpp_reader *pfile, const
>     unsigned int length;
>     const uchar *str = *pstr;
>     const uchar *base = str - 2;
> +  bool delimited = false;
>   
>     if (!CPP_OPTION (pfile, cplusplus) && !CPP_OPTION (pfile, c99))
>       cpp_error (pfile, CPP_DL_WARNING,
> @@ -1095,7 +1096,17 @@ _cpp_valid_ucn (cpp_reader *pfile, const
>   	         (int) str[-1]);
>   
>     if (str[-1] == 'u')
> -    length = 4;
> +    {
> +      length = 4;
> +      if (str < limit && *str == '{')
> +	{
> +	  str++;
> +	  length = 32;

/* Magic value to indicate no digits seen.  */

> +	  delimited = true;
> +	  if (loc_reader)
> +	    char_range->m_finish = loc_reader->get_next ().m_finish;
> +	}
> +    }
>     else if (str[-1] == 'U')
>       length = 8;
>     else
> @@ -1107,6 +1118,8 @@ _cpp_valid_ucn (cpp_reader *pfile, const
>     result = 0;
>     do
>       {
> +      if (str == limit)
> +	break;
>         c = *str;
>         if (!ISXDIGIT (c))
>   	break;
> @@ -1116,9 +1129,41 @@ _cpp_valid_ucn (cpp_reader *pfile, const
>   	  gcc_assert (char_range);
>   	  char_range->m_finish = loc_reader->get_next ().m_finish;
>   	}
> +      if (delimited)
> +	{
> +	  if (!result)
> +	    /* Accept arbitrary number of leading zeros.  */
> +	    length = 16;
> +	  else if (length == 8)
> +	    {
> +	      /* Make sure we detect overflows.  */
> +	      result |= 0x8000000;
> +	      ++length;
> +	    }

Is the 16 above there so that this case is reached only after we have read
8 digits following the leading zeroes?

> +	}
> +
>         result = (result << 4) + hex_value (c);
>       }
> -  while (--length && str < limit);
> +  while (--length);
> +
> +  if (delimited
> +      && str < limit
> +      && *str == '}'
> +      && (length != 32 || !identifier_pos))
> +    {
> +      if (length == 32)
> +	cpp_error (pfile, CPP_DL_ERROR,
> +		   "empty delimited escape sequence");
> +      else if (!CPP_OPTION (pfile, delimited_escape_seqs)
> +	       && CPP_OPTION (pfile, cpp_pedantic))
> +	cpp_error (pfile, CPP_DL_PEDWARN,
> +		   "delimited escape sequences are only valid in C++23");
> +      str++;
> +      length = 0;
> +      delimited = false;
> +      if (loc_reader)
> +	char_range->m_finish = loc_reader->get_next ().m_finish;

Here and in other functions, the pattern of incrementing the input pointer 
and updating m_finish seems like it should be a macro?

> +    }
>   
>     /* Partial UCNs are not valid in strings, but decompose into
>        multiple tokens in identifiers, so we can't give a helpful
> @@ -1132,9 +1177,14 @@ _cpp_valid_ucn (cpp_reader *pfile, const
>     *pstr = str;
>     if (length)
>       {
> -      cpp_error (pfile, CPP_DL_ERROR,
> -		 "incomplete universal character name %.*s",
> -		 (int) (str - base), base);
> +      if (!delimited)
> +	cpp_error (pfile, CPP_DL_ERROR,
> +		   "incomplete universal character name %.*s",
> +		   (int) (str - base), base);
> +      else
> +	cpp_error (pfile, CPP_DL_ERROR,
> +		   "'\\u{' not terminated with '}' after %.*s",
> +		   (int) (str - base), base);
>         result = 1;
>       }
>     /* The C99 standard permits $, @ and ` to be specified as UCNs.  We use
> @@ -1392,6 +1442,8 @@ convert_hex (cpp_reader *pfile, const uc
>     int digits_found = 0;
>     size_t width = cvt.width;
>     size_t mask = width_to_mask (width);
> +  bool delimited = false;
> +  const uchar *base = from - 1;
>   
>     /* loc_reader and ranges must either be both NULL, or both be non-NULL.  */
>     gcc_assert ((loc_reader != NULL) == (ranges != NULL));
> @@ -1407,6 +1459,14 @@ convert_hex (cpp_reader *pfile, const uc
>     if (loc_reader)
>       char_range.m_finish = loc_reader->get_next ().m_finish;
>   
> +  if (from < limit && *from == '{')
> +    {
> +      delimited = true;
> +      from++;
> +      if (loc_reader)
> +	char_range.m_finish = loc_reader->get_next ().m_finish;
> +    }
> +
>     while (from < limit)
>       {
>         c = *from;
> @@ -1420,12 +1480,37 @@ convert_hex (cpp_reader *pfile, const uc
>         digits_found = 1;
>       }
>   
> +  if (delimited && from < limit && *from == '}')
> +    {
> +      from++;
> +      if (!digits_found)
> +	{
> +	  cpp_error (pfile, CPP_DL_ERROR,
> +		     "empty delimited escape sequence");
> +	  return from;
> +	}
> +     else if (!CPP_OPTION (pfile, delimited_escape_seqs)
> +	      && CPP_OPTION (pfile, cpp_pedantic))
> +	cpp_error (pfile, CPP_DL_PEDWARN,
> +		   "delimited escape sequences are only valid in C++23");
> +      delimited = false;
> +      if (loc_reader)
> +	char_range.m_finish = loc_reader->get_next ().m_finish;
> +    }
> +
>     if (!digits_found)
>       {
>         cpp_error (pfile, CPP_DL_ERROR,
>   		 "\\x used with no following hex digits");
>         return from;
>       }
> +  else if (delimited)
> +    {
> +      cpp_error (pfile, CPP_DL_ERROR,
> +		 "'\\x{' not terminated with '}' after %.*s",
> +		 (int) (from - base), base);
> +      return from;
> +    }
>   
>     if (overflow | (n != (n & mask)))
>       {
> @@ -1459,13 +1544,31 @@ convert_oct (cpp_reader *pfile, const uc
>   	     cpp_substring_ranges *ranges)
>   {
>     size_t count = 0;
> -  cppchar_t c, n = 0;
> +  cppchar_t c, n = 0, overflow = 0;
>     size_t width = cvt.width;
>     size_t mask = width_to_mask (width);
> +  bool delimited = false;
> +  const uchar *base = from - 1;
>   
>     /* loc_reader and ranges must either be both NULL, or both be non-NULL.  */
>     gcc_assert ((loc_reader != NULL) == (ranges != NULL));
>   
> +  if (from < limit && *from == 'o')
> +    {
> +      from++;
> +      if (loc_reader)
> +	char_range.m_finish = loc_reader->get_next ().m_finish;
> +      if (from == limit || *from != '{')
> +	cpp_error (pfile, CPP_DL_ERROR, "'\\o' not followed by '{'");
> +      else
> +	{
> +	  from++;
> +	  if (loc_reader)
> +	    char_range.m_finish = loc_reader->get_next ().m_finish;
> +	  delimited = true;
> +	}
> +    }
> +
>     while (from < limit && count++ < 3)
>       {
>         c = *from;
> @@ -1474,10 +1577,42 @@ convert_oct (cpp_reader *pfile, const uc
>         from++;
>         if (loc_reader)
>   	char_range.m_finish = loc_reader->get_next ().m_finish;
> +      if (delimited)
> +	{
> +	  count = 2;
> +	  overflow |= n ^ (n << 3 >> 3);
> +	}
>         n = (n << 3) + c - '0';
>       }
>   
> -  if (n != (n & mask))
> +  if (delimited)
> +    {
> +      if (from < limit && *from == '}')
> +	{
> +	  from++;
> +	  if (count == 1)
> +	    {
> +	      cpp_error (pfile, CPP_DL_ERROR,
> +			 "empty delimited escape sequence");
> +	      return from;
> +	    }
> +	  else if (!CPP_OPTION (pfile, delimited_escape_seqs)
> +		   && CPP_OPTION (pfile, cpp_pedantic))
> +	    cpp_error (pfile, CPP_DL_PEDWARN,
> +		       "delimited escape sequences are only valid in C++23");
> +	  if (loc_reader)
> +	    char_range.m_finish = loc_reader->get_next ().m_finish;
> +	}
> +      else
> +	{
> +	  cpp_error (pfile, CPP_DL_ERROR,
> +		     "'\\o{' not terminated with '}' after %.*s",
> +		     (int) (from - base), base);
> +	  return from;
> +	}
> +    }
> +
> +  if (overflow | (n != (n & mask)))
>       {
>         cpp_error (pfile, CPP_DL_PEDWARN,
>   		 "octal escape sequence out of range");
> @@ -1535,6 +1670,7 @@ convert_escape (cpp_reader *pfile, const
>   
>       case '0':  case '1':  case '2':  case '3':
>       case '4':  case '5':  case '6':  case '7':
> +    case 'o':
>         return convert_oct (pfile, from, limit, tbuf, cvt,
>   			  char_range, loc_reader, ranges);
>   
> @@ -2119,15 +2255,23 @@ _cpp_interpret_identifier (cpp_reader *p
>   	cppchar_t value = 0;
>   	size_t bufleft = len - (bufp - buf);
>   	int rval;
> +	bool delimited = false;
>   
>   	idp += 2;
> +	if (length == 4 && id[idp] == '{')
> +	  {
> +	    delimited = true;
> +	    idp++;
> +	  }
>   	while (length && idp < len && ISXDIGIT (id[idp]))
>   	  {
>   	    value = (value << 4) + hex_value (id[idp]);
>   	    idp++;
> -	    length--;
> +	    if (!delimited)
> +	      length--;
>   	  }
> -	idp--;
> +	if (!delimited)
> +	  idp--;

Don't we need to check that the first non-xdigit is a }?

>   
>   	/* Special case for EBCDIC: if the identifier contains
>   	   a '$' specified using a UCN, translate it to EBCDIC.  */
> --- libcpp/lex.cc.jj	2022-05-23 10:59:06.235591348 +0200
> +++ libcpp/lex.cc	2022-08-16 11:57:53.772823661 +0200
> @@ -1426,19 +1426,35 @@ get_bidi_utf8 (cpp_reader *pfile, const
>   /* Parse a UCN where P points just past \u or \U and return its bidi code.  */
>   
>   static bidi::kind
> -get_bidi_ucn_1 (const unsigned char *p, bool is_U)
> +get_bidi_ucn_1 (const unsigned char *p, bool is_U, const unsigned char **end)
>   {
>     /* 6.4.3 Universal Character Names
>         \u hex-quad
>         \U hex-quad hex-quad
> +      \u { simple-hexadecimal-digit-sequence }
>        where \unnnn means \U0000nnnn.  */
>   
> +  *end = p + 4;
>     if (is_U)
>       {
>         if (p[0] != '0' || p[1] != '0' || p[2] != '0' || p[3] != '0')
>   	return bidi::kind::NONE;
>         /* Skip 4B so we can treat \u and \U the same below.  */
>         p += 4;
> +      *end += 4;
> +    }
> +  else if (p[0] == '{')
> +    {
> +      p++;
> +      while (*p == '0')
> +	p++;
> +      if (p[0] != '2'
> +	  || p[1] != '0'
> +	  || !ISXDIGIT (p[2])
> +	  || !ISXDIGIT (p[3])
> +	  || p[4] != '}')
> +	return bidi::kind::NONE;
> +      *end = p + 5;
>       }
>   
>     /* All code points we are looking for start with 20xx.  */
> @@ -1499,14 +1515,15 @@ get_bidi_ucn_1 (const unsigned char *p,
>      If the kind is not NONE, write the location to *OUT.*/
>   
>   static bidi::kind
> -get_bidi_ucn (cpp_reader *pfile,  const unsigned char *p, bool is_U,
> +get_bidi_ucn (cpp_reader *pfile, const unsigned char *p, bool is_U,
>   	      location_t *out)
>   {
> -  bidi::kind result = get_bidi_ucn_1 (p, is_U);
> +  const unsigned char *end;
> +  bidi::kind result = get_bidi_ucn_1 (p, is_U, &end);
>     if (result != bidi::kind::NONE)
>       {
>         const unsigned char *start = p - 2;
> -      size_t num_bytes = 2 + (is_U ? 8 : 4);
> +      size_t num_bytes = end - start;
>         *out = get_location_for_byte_range_in_cur_line (pfile, start, num_bytes);
>       }
>     return result;
> --- gcc/testsuite/c-c++-common/cpp/delimited-escape-seq-1.c.jj	2022-08-16 10:47:38.693022740 +0200
> +++ gcc/testsuite/c-c++-common/cpp/delimited-escape-seq-1.c	2022-08-16 12:18:42.235477632 +0200
> @@ -0,0 +1,92 @@
> +/* P2290R3 - Delimited escape sequences */
> +/* { dg-do run } */
> +/* { dg-require-effective-target wchar } */
> +/* { dg-options "-std=gnu99 -Wno-c++-compat" { target c } } */
> +/* { dg-options "-std=c++23" { target c++ } } */
> +
> +#ifndef __cplusplus
> +#include <wchar.h>
> +typedef __CHAR16_TYPE__ char16_t;
> +typedef __CHAR32_TYPE__ char32_t;
> +#endif
> +
> +const char32_t *a = U"\u{1234}\u{10fffd}\u{000000000000000000000000000000000000000000000000000000000001234}\u{10FFFD}";
> +const char32_t *b = U"\x{1234}\x{10fffd}\x{000000000000000000000000000000000000000000000000000000000001234}";
> +const char32_t *c = U"\o{1234}\o{4177775}\o{000000000000000000000000000000000000000000000000000000000000000000000000004177775}";
> +const char16_t *d = u"\u{1234}\u{bFFd}\u{00000000000000000000000000000001234}";
> +const char16_t *e = u"\x{1234}\x{BffD}\x{000001234}";
> +const char16_t *f = u"\o{1234}\o{137775}\o{000000000000000137775}";
> +const wchar_t *g = L"\u{1234}\u{bFFd}\u{00000000000000000000000000000001234}";
> +const wchar_t *h = L"\x{1234}\x{bFFd}\x{000001234}";
> +const wchar_t *i = L"\o{1234}\o{137775}\o{000000000000000137775}";
> +#ifdef __cplusplus
> +const char *j = "\u{34}\u{000000000000000003D}";
> +#endif
> +const char *k = "\x{34}\x{000000000000000003D}";
> +const char *l = "\o{34}\o{000000000000000176}";
> +
> +#if U'\u{1234}' != U'\u1234' || U'\u{10fffd}' != U'\U0010FFFD' \
> +    || U'\x{00000001234}' != U'\x1234' || U'\x{010fffd}' != U'\x10FFFD' \
> +    || U'\o{1234}' != U'\x29c' || U'\o{004177775}' != U'\x10FFFD' \
> +    || u'\u{1234}' != u'\u1234' || u'\u{0bffd}' != u'\uBFFD' \
> +    || u'\x{00000001234}' != u'\x1234' || u'\x{0Bffd}' != u'\x0bFFD' \
> +    || u'\o{1234}' != u'\x29c' || u'\o{00137775}' != u'\xBFFD' \
> +    || L'\u{1234}' != L'\u1234' || L'\u{0bffd}' != L'\uBFFD' \
> +    || L'\x{00000001234}' != L'\x1234' || L'\x{0bffd}' != L'\x0bFFD' \
> +    || L'\o{1234}' != L'\x29c' || L'\o{00137775}' != L'\xBFFD' \
> +    || '\x{34}' != '\x034' || '\x{0003d}' != '\x003D' \
> +    || '\o{34}' != '\x1C' || '\o{176}' != '\x007E'
> +#error Bad
> +#endif
> +#ifdef __cplusplus
> +#if '\u{0000000034}' != '\u0034' || '\u{3d}' != '\u003D'
> +#error Bad
> +#endif
> +#endif
> +
> +int
> +main ()
> +{
> +  if (a[0] != U'\u1234' || a[0] != U'\u{1234}'
> +      || a[1] != U'\U0010FFFD' || a[1] != U'\u{000010fFfD}'
> +      || a[2] != a[0]
> +      || a[3] != a[1]
> +      || b[0] != U'\x1234' || b[0] != U'\x{001234}'
> +      || b[1] != U'\x10FFFD' || b[1] != U'\x{0010fFfD}'
> +      || b[2] != b[0]
> +      || c[0] != U'\x29c' || c[0] != U'\o{001234}'
> +      || c[1] != U'\x10FFFD' || c[1] != U'\o{4177775}'
> +      || c[2] != c[1])
> +    __builtin_abort ();
> +  if (d[0] != u'\u1234' || d[0] != u'\u{1234}'
> +      || d[1] != u'\U0000BFFD' || d[1] != u'\u{00000bFfD}'
> +      || d[2] != d[0]
> +      || e[0] != u'\x1234' || e[0] != u'\x{001234}'
> +      || e[1] != u'\xBFFD' || e[1] != u'\x{00bFfD}'
> +      || e[2] != e[0]
> +      || f[0] != u'\x29c' || f[0] != u'\o{001234}'
> +      || f[1] != u'\xbFFD' || f[1] != u'\o{137775}'
> +      || f[2] != f[1])
> +    __builtin_abort ();
> +  if (g[0] != L'\u1234' || g[0] != L'\u{1234}'
> +      || g[1] != L'\U0000BFFD' || g[1] != L'\u{00000bFfD}'
> +      || g[2] != g[0]
> +      || h[0] != L'\x1234' || h[0] != L'\x{001234}'
> +      || h[1] != L'\xBFFD' || h[1] != L'\x{00bFfD}'
> +      || h[2] != h[0]
> +      || i[0] != L'\x29c' || i[0] != L'\o{001234}'
> +      || i[1] != L'\xbFFD' || i[1] != L'\o{137775}'
> +      || i[2] != i[1])
> +    __builtin_abort ();
> +#ifdef __cplusplus
> +  if (j[0] != '\u0034' || j[0] != '\u{034}'
> +      || j[1] != '\U0000003D' || j[1] != '\u{000003d}')
> +    __builtin_abort ();
> +#endif
> +  if (k[0] != '\x034' || k[0] != '\x{0034}'
> +      || k[1] != '\x3D' || k[1] != '\x{3d}'
> +      || l[0] != '\x1c' || l[0] != '\o{0034}'
> +      || l[1] != '\x07e' || l[1] != '\o{176}' || l[1] != '\176')
> +    __builtin_abort ();
> +  return 0;
> +}
> --- gcc/testsuite/c-c++-common/cpp/delimited-escape-seq-2.c.jj	2022-08-16 10:47:41.846981390 +0200
> +++ gcc/testsuite/c-c++-common/cpp/delimited-escape-seq-2.c	2022-08-16 12:18:58.807260607 +0200
> @@ -0,0 +1,18 @@
> +/* P2290R3 - Delimited escape sequences */
> +/* { dg-do compile } */
> +/* { dg-options "-std=gnu99 -Wno-c++-compat" { target c } } */
> +/* { dg-options "-std=c++23" { target c++ } } */
> +
> +int jalape\u{f1}o = 42;
> +
> +int
> +caf\u{000e9} (void)
> +{
> +  return jalape\u00F1o;
> +}
> +
> +int
> +test (void)
> +{
> +  return caf\u00e9 ();
> +}
> --- gcc/testsuite/c-c++-common/cpp/delimited-escape-seq-3.c.jj	2022-08-16 12:18:19.308777922 +0200
> +++ gcc/testsuite/c-c++-common/cpp/delimited-escape-seq-3.c	2022-08-16 12:41:23.693648138 +0200
> @@ -0,0 +1,33 @@
> +/* P2290R3 - Delimited escape sequences */
> +/* { dg-do compile } */
> +/* { dg-require-effective-target wchar } */
> +/* { dg-options "-std=gnu99 -Wno-c++-compat" { target c } } */
> +/* { dg-options "-std=c++23" { target c++ } } */
> +
> +#ifndef __cplusplus
> +typedef __CHAR32_TYPE__ char32_t;
> +#endif
> +
> +const char32_t *a = U"\u{}";				/* { dg-error "empty delimited escape sequence" } */
> +							/* { dg-error "is not a valid universal character" "" { target c } .-1 } */
> +const char32_t *b = U"\u{12" "34}";			/* { dg-error "'\\\\u\\{' not terminated with '\\}' after" } */
> +const char32_t *c = U"\u{0000ffffffff}";		/* { dg-error "is not a valid universal character" } */
> +const char32_t *d = U"\u{010000edcb}";			/* { dg-error "is not a valid universal character" } */
> +const char32_t *e = U"\u{02000000000000000000edcb}";	/* { dg-error "is not a valid universal character" } */
> +const char32_t *f = U"\u{123ghij}";			/* { dg-error "'\\\\u\\{' not terminated with '\\}' after" } */
> +const char32_t *g = U"\u{123.}";			/* { dg-error "'\\\\u\\{' not terminated with '\\}' after" } */
> +const char32_t *h = U"\u{.}";				/* { dg-error "'\\\\u\\{' not terminated with '\\}' after" } */
> +const char32_t *i = U"\x{}";				/* { dg-error "empty delimited escape sequence" } */
> +const char32_t *j = U"\x{12" "34}";			/* { dg-error "'\\\\x\\{' not terminated with '\\}' after" } */
> +const char32_t *k = U"\x{0000ffffffff}";
> +const char32_t *l = U"\x{010000edcb}";			/* { dg-warning "hex escape sequence out of range" } */
> +const char32_t *m = U"\x{02000000000000000000edcb}";	/* { dg-warning "hex escape sequence out of range" } */
> +const char32_t *n = U"\x{123ghij}";			/* { dg-error "'\\\\x\\{' not terminated with '\\}' after" } */
> +const char32_t *o = U"\x{123.}";			/* { dg-error "'\\\\x\\{' not terminated with '\\}' after" } */
> +const char32_t *p = U"\o{}";				/* { dg-error "empty delimited escape sequence" } */
> +const char32_t *q = U"\o{12" "34}";			/* { dg-error "'\\\\o\\{' not terminated with '\\}' after" } */
> +const char32_t *r = U"\o{0000037777777777}";
> +const char32_t *s = U"\o{040000166713}";		/* { dg-warning "octal escape sequence out of range" } */
> +const char32_t *t = U"\o{02000000000000000000000166713}";/* { dg-warning "octal escape sequence out of range" } */
> +const char32_t *u = U"\o{1238}";			/* { dg-error "'\\\\o\\{' not terminated with '\\}' after" } */
> +const char32_t *v = U"\o{.}";				/* { dg-error "'\\\\o\\{' not terminated with '\\}' after" } */
> --- gcc/testsuite/c-c++-common/Wbidi-chars-24.c.jj	2022-08-16 12:03:19.350561676 +0200
> +++ gcc/testsuite/c-c++-common/Wbidi-chars-24.c	2022-08-16 12:06:46.381851525 +0200
> @@ -0,0 +1,28 @@
> +/* PR preprocessor/103026 */
> +/* { dg-do compile } */
> +/* { dg-options "-Wbidi-chars=ucn,unpaired" } */
> +/* Test nesting of bidi chars in various contexts.  */
> +
> +void
> +g1 ()
> +{
> +  const char *s1 = "a b c LRE\u{202a} 1 2 3 PDI\u{00000000000000000000000002069} x y z";
> +/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
> +  const char *s2 = "a b c RLE\u{00202b} 1 2 3 PDI\u{2069} x y z";
> +/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
> +  const char *s3 = "a b c LRO\u{000000202d} 1 2 3 PDI\u{02069} x y z";
> +/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
> +  const char *s4 = "a b c RLO\u{202e} 1 2 3 PDI\u{00000002069} x y z";
> +/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
> +  const char *s5 = "a b c LRI\u{002066} 1 2 3 PDF\u{202C} x y z";
> +/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
> +  const char *s6 = "a b c RLI\u{02067} 1 2 3 PDF\u{202c} x y z";
> +/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
> +  const char *s7 = "a b c FSI\u{0002068} 1 2 3 PDF\u{0202c} x y z";
> +/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
> +}
> +
> +int A\u{202a}B\u{2069}C;
> +/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
> +int a\u{00000202b}B\u{000000002069}c;
> +/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
> --- gcc/testsuite/gcc.dg/cpp/delimited-escape-seq-1.c.jj	2022-08-16 10:47:38.693022740 +0200
> +++ gcc/testsuite/gcc.dg/cpp/delimited-escape-seq-1.c	2022-08-16 12:46:56.508291006 +0200
> @@ -0,0 +1,10 @@
> +/* P2290R3 - Delimited escape sequences */
> +/* { dg-do compile } */
> +/* { dg-require-effective-target wchar } */
> +/* { dg-options "-std=gnu99 -Wno-c++-compat -pedantic" } */
> +
> +typedef __CHAR32_TYPE__ char32_t;
> +
> +const char32_t *a = U"\u{1234}";	/* { dg-warning "delimited escape sequences are only valid in" } */
> +const char32_t *b = U"\x{1234}";	/* { dg-warning "delimited escape sequences are only valid in" } */
> +const char32_t *c = U"\o{1234}";	/* { dg-warning "delimited escape sequences are only valid in" } */
> --- gcc/testsuite/gcc.dg/cpp/delimited-escape-seq-2.c.jj	2022-08-16 10:47:41.846981390 +0200
> +++ gcc/testsuite/gcc.dg/cpp/delimited-escape-seq-2.c	2022-08-16 12:47:05.955167423 +0200
> @@ -0,0 +1,10 @@
> +/* P2290R3 - Delimited escape sequences */
> +/* { dg-do compile } */
> +/* { dg-require-effective-target wchar } */
> +/* { dg-options "-std=gnu99 -Wno-c++-compat -pedantic-errors" } */
> +
> +typedef __CHAR32_TYPE__ char32_t;
> +
> +const char32_t *a = U"\u{1234}";	/* { dg-error "delimited escape sequences are only valid in" } */
> +const char32_t *b = U"\x{1234}";	/* { dg-error "delimited escape sequences are only valid in" } */
> +const char32_t *c = U"\o{1234}";	/* { dg-error "delimited escape sequences are only valid in" } */
> --- gcc/testsuite/g++.dg/cpp/delimited-escape-seq-1.C.jj	2022-08-16 12:46:43.368462901 +0200
> +++ gcc/testsuite/g++.dg/cpp/delimited-escape-seq-1.C	2022-08-16 12:49:21.532393786 +0200
> @@ -0,0 +1,8 @@
> +// P2290R3 - Delimited escape sequences
> +// { dg-do compile { target c++11 } }
> +// { dg-require-effective-target wchar }
> +// { dg-options "-pedantic" }
> +
> +const char32_t *a = U"\u{1234}";	// { dg-warning "delimited escape sequences are only valid in" "" { target c++20_down } }
> +const char32_t *b = U"\x{1234}";	// { dg-warning "delimited escape sequences are only valid in" "" { target c++20_down } }
> +const char32_t *c = U"\o{1234}";	// { dg-warning "delimited escape sequences are only valid in" "" { target c++20_down } }
> --- gcc/testsuite/g++.dg/cpp/delimited-escape-seq-2.C.jj	2022-08-16 12:46:46.281424798 +0200
> +++ gcc/testsuite/g++.dg/cpp/delimited-escape-seq-2.C	2022-08-16 12:49:33.761233803 +0200
> @@ -0,0 +1,8 @@
> +// P2290R3 - Delimited escape sequences
> +// { dg-do compile { target c++11 } }
> +// { dg-require-effective-target wchar }
> +// { dg-options "-pedantic-errors" }
> +
> +const char32_t *a = U"\u{1234}";	// { dg-error "delimited escape sequences are only valid in" "" { target c++20_down } }
> +const char32_t *b = U"\x{1234}";	// { dg-error "delimited escape sequences are only valid in" "" { target c++20_down } }
> +const char32_t *c = U"\o{1234}";	// { dg-error "delimited escape sequences are only valid in" "" { target c++20_down } }
> 
> 	Jakub
>
  
Jakub Jelinek Aug. 17, 2022, 9:19 p.m. UTC | #2
On Wed, Aug 17, 2022 at 04:47:19PM -0400, Jason Merrill via Gcc-patches wrote:
> > +	  length = 32;
> 
> /* Magic value to indicate no digits seen.  */

Indeed, will add the comment.

> > +	  delimited = true;
> > +	  if (loc_reader)
> > +	    char_range->m_finish = loc_reader->get_next ().m_finish;
> > +	}
> > +    }
> >     else if (str[-1] == 'U')
> >       length = 8;
> >     else
> > @@ -1107,6 +1118,8 @@ _cpp_valid_ucn (cpp_reader *pfile, const
> >     result = 0;
> >     do
> >       {
> > +      if (str == limit)
> > +	break;
> >         c = *str;
> >         if (!ISXDIGIT (c))
> >   	break;
> > @@ -1116,9 +1129,41 @@ _cpp_valid_ucn (cpp_reader *pfile, const
> >   	  gcc_assert (char_range);
> >   	  char_range->m_finish = loc_reader->get_next ().m_finish;
> >   	}
> > +      if (delimited)
> > +	{
> > +	  if (!result)
> > +	    /* Accept arbitrary number of leading zeros.  */
> > +	    length = 16;
> > +	  else if (length == 8)
> > +	    {
> > +	      /* Make sure we detect overflows.  */
> > +	      result |= 0x8000000;
> > +	      ++length;
> > +	    }
> 
> 16 above so that this case happens after we read 8 digits after leading
> zeroes?

Another magic value, less than the "no digits seen" one and greater than 8,
so that it can count 8 digits starting with the first non-zero one, after
which we OR in the overflow flag.  The intent is not to break out of the loop
if there are further digits, just to make sure that overflow is detected.
Another option would be those overflow |= n ^ (n << 4 >> 4);
tests that convert_hex does and just making sure length is never decremented
(except we need a way to distinguish between \u{} and at least one digit).

> > +      if (loc_reader)
> > +	char_range->m_finish = loc_reader->get_next ().m_finish;
> 
> Here and in other functions, the pattern of increment the input pointer and
> update m_finish seems like it should be a macro?

Perhaps, or an inline function.  Before my patch, there were 5 such ifs
(some with char_range.m_finish and others with char_range->m_finish);
the patch adds another 7 such spots.

> > @@ -2119,15 +2255,23 @@ _cpp_interpret_identifier (cpp_reader *p
> >   	cppchar_t value = 0;
> >   	size_t bufleft = len - (bufp - buf);
> >   	int rval;
> > +	bool delimited = false;
> >   	idp += 2;
> > +	if (length == 4 && id[idp] == '{')
> > +	  {
> > +	    delimited = true;
> > +	    idp++;
> > +	  }
> >   	while (length && idp < len && ISXDIGIT (id[idp]))
> >   	  {
> >   	    value = (value << 4) + hex_value (id[idp]);
> >   	    idp++;
> > -	    length--;
> > +	    if (!delimited)
> > +	      length--;
> >   	  }
> > -	idp--;
> > +	if (!delimited)
> > +	  idp--;
> 
> Don't we need to check that the first non-xdigit is a }?

The comments and my understanding of the code say that we first
check what is a valid identifier and the above is only called on
a valid identifier.  So, if it would be delimited \u{ not terminated
with }, then it would fail forms_identifier_p and wouldn't be included
in the range.  Thus e.g. the ISXDIGIT (id[idp]) test is probably not needed
unless delimited is true, because we've checked earlier that it has 4 or 8
hex digits.
But sure, if you want an id[idp] == '}' test or assertion, it can be
added.

	Jakub
  
Jason Merrill Aug. 18, 2022, 2:22 a.m. UTC | #3
On 8/17/22 14:19, Jakub Jelinek wrote:
> On Wed, Aug 17, 2022 at 04:47:19PM -0400, Jason Merrill via Gcc-patches wrote:
>>> +	  length = 32;
>>
>> /* Magic value to indicate no digits seen.  */
> 
> Indeed, will add the comment.
> 
>>> +	  delimited = true;
>>> +	  if (loc_reader)
>>> +	    char_range->m_finish = loc_reader->get_next ().m_finish;
>>> +	}
>>> +    }
>>>      else if (str[-1] == 'U')
>>>        length = 8;
>>>      else
>>> @@ -1107,6 +1118,8 @@ _cpp_valid_ucn (cpp_reader *pfile, const
>>>      result = 0;
>>>      do
>>>        {
>>> +      if (str == limit)
>>> +	break;
>>>          c = *str;
>>>          if (!ISXDIGIT (c))
>>>    	break;
>>> @@ -1116,9 +1129,41 @@ _cpp_valid_ucn (cpp_reader *pfile, const
>>>    	  gcc_assert (char_range);
>>>    	  char_range->m_finish = loc_reader->get_next ().m_finish;
>>>    	}
>>> +      if (delimited)
>>> +	{
>>> +	  if (!result)
>>> +	    /* Accept arbitrary number of leading zeros.  */
>>> +	    length = 16;
>>> +	  else if (length == 8)
>>> +	    {
>>> +	      /* Make sure we detect overflows.  */
>>> +	      result |= 0x8000000;
>>> +	      ++length;
>>> +	    }
>>
>> 16 above so that this case happens after we read 8 digits after leading
>> zeroes?
> 
> Another magic value less than the no digits seen one and >8,
> so that it can count 8 digits with the first non-zero one after
> which to or in the overflow flag.  The intent is not to break the loop
> if there are further digits, just that there will be overflow.
> Another option would be those overflow |= n ^ (n << 4 >> 4);
> tests that convert_hex does and just making sure length is never decremented
> (except we need a way to distinguish between \u{} and at least one digit).

This way is fine, could just use more comment.

>>> +      if (loc_reader)
>>> +	char_range->m_finish = loc_reader->get_next ().m_finish;
>>
>> Here and in other functions, the pattern of increment the input pointer and
>> update m_finish seems like it should be a macro?
> 
> Perhaps or inline function.  Before my patch, there are 5 such ifs
> (some with char_range.m_finish and others char_range->m_finish),
> the patch adds another 7 such spots.

Either way is fine.

>>> @@ -2119,15 +2255,23 @@ _cpp_interpret_identifier (cpp_reader *p
>>>    	cppchar_t value = 0;
>>>    	size_t bufleft = len - (bufp - buf);
>>>    	int rval;
>>> +	bool delimited = false;
>>>    	idp += 2;
>>> +	if (length == 4 && id[idp] == '{')
>>> +	  {
>>> +	    delimited = true;
>>> +	    idp++;
>>> +	  }
>>>    	while (length && idp < len && ISXDIGIT (id[idp]))
>>>    	  {
>>>    	    value = (value << 4) + hex_value (id[idp]);
>>>    	    idp++;
>>> -	    length--;
>>> +	    if (!delimited)
>>> +	      length--;
>>>    	  }
>>> -	idp--;
>>> +	if (!delimited)
>>> +	  idp--;
>>
>> Don't we need to check that the first non-xdigit is a }?
> 
> The comments and my understanding of the code say that we first
> check what is a valid identifier and the above is only called on
> a valid identifier.  So, if it would be delimited \u{ not terminated
> with }, then it would fail forms_identifier_p and wouldn't be included
> in the range.  Thus e.g. the ISXDIGIT (id[id]) test is probably not needed
> unless delimited is true because we've checked earlier that it has 4 or 8
> hex digits.
> But sure, if you want a id[idp] == '}' test or assertion, it can be
> added.

OK, a comment mentioning this should be sufficient.

Jason
  

Patch

--- libcpp/include/cpplib.h.jj	2022-08-10 09:06:53.268209449 +0200
+++ libcpp/include/cpplib.h	2022-08-15 19:32:53.743213474 +0200
@@ -519,6 +519,9 @@  struct cpp_options
   /* Nonzero for C++23 size_t literals.  */
   unsigned char size_t_literals;
 
+  /* Nonzero for C++23 delimited escape sequences.  */
+  unsigned char delimited_escape_seqs;
+
   /* Holds the name of the target (execution) character set.  */
   const char *narrow_charset;
 
--- libcpp/init.cc.jj	2022-08-10 09:06:53.268209449 +0200
+++ libcpp/init.cc	2022-08-15 16:09:01.403020485 +0200
@@ -96,34 +96,35 @@  struct lang_flags
   char dfp_constants;
   char size_t_literals;
   char elifdef;
+  char delimited_escape_seqs;
 };
 
 static const struct lang_flags lang_defaults[] =
-{ /*              c99 c++ xnum xid c11 std digr ulit rlit udlit bincst digsep trig u8chlit vaopt scope dfp szlit elifdef */
-  /* GNUC89   */  { 0,  0,  1,  0,  0,  0,  1,   0,   0,   0,    0,     0,     0,   0,      1,   1,     0,   0,   0 },
-  /* GNUC99   */  { 1,  0,  1,  1,  0,  0,  1,   1,   1,   0,    0,     0,     0,   0,      1,   1,     0,   0,   0 },
-  /* GNUC11   */  { 1,  0,  1,  1,  1,  0,  1,   1,   1,   0,    0,     0,     0,   0,      1,   1,     0,   0,   0 },
-  /* GNUC17   */  { 1,  0,  1,  1,  1,  0,  1,   1,   1,   0,    0,     0,     0,   0,      1,   1,     0,   0,   0 },
-  /* GNUC2X   */  { 1,  0,  1,  1,  1,  0,  1,   1,   1,   0,    1,     1,     0,   1,      1,   1,     1,   0,   1 },
-  /* STDC89   */  { 0,  0,  0,  0,  0,  1,  0,   0,   0,   0,    0,     0,     1,   0,      0,   0,     0,   0,   0 },
-  /* STDC94   */  { 0,  0,  0,  0,  0,  1,  1,   0,   0,   0,    0,     0,     1,   0,      0,   0,     0,   0,   0 },
-  /* STDC99   */  { 1,  0,  1,  1,  0,  1,  1,   0,   0,   0,    0,     0,     1,   0,      0,   0,     0,   0,   0 },
-  /* STDC11   */  { 1,  0,  1,  1,  1,  1,  1,   1,   0,   0,    0,     0,     1,   0,      0,   0,     0,   0,   0 },
-  /* STDC17   */  { 1,  0,  1,  1,  1,  1,  1,   1,   0,   0,    0,     0,     1,   0,      0,   0,     0,   0,   0 },
-  /* STDC2X   */  { 1,  0,  1,  1,  1,  1,  1,   1,   0,   0,    1,     1,     1,   1,      0,   1,     1,   0,   1 },
-  /* GNUCXX   */  { 0,  1,  1,  1,  0,  0,  1,   0,   0,   0,    0,     0,     0,   0,      1,   1,     0,   0,   0 },
-  /* CXX98    */  { 0,  1,  0,  1,  0,  1,  1,   0,   0,   0,    0,     0,     1,   0,      0,   1,     0,   0,   0 },
-  /* GNUCXX11 */  { 1,  1,  1,  1,  1,  0,  1,   1,   1,   1,    0,     0,     0,   0,      1,   1,     0,   0,   0 },
-  /* CXX11    */  { 1,  1,  0,  1,  1,  1,  1,   1,   1,   1,    0,     0,     1,   0,      0,   1,     0,   0,   0 },
-  /* GNUCXX14 */  { 1,  1,  1,  1,  1,  0,  1,   1,   1,   1,    1,     1,     0,   0,      1,   1,     0,   0,   0 },
-  /* CXX14    */  { 1,  1,  0,  1,  1,  1,  1,   1,   1,   1,    1,     1,     1,   0,      0,   1,     0,   0,   0 },
-  /* GNUCXX17 */  { 1,  1,  1,  1,  1,  0,  1,   1,   1,   1,    1,     1,     0,   1,      1,   1,     0,   0,   0 },
-  /* CXX17    */  { 1,  1,  1,  1,  1,  1,  1,   1,   1,   1,    1,     1,     0,   1,      0,   1,     0,   0,   0 },
-  /* GNUCXX20 */  { 1,  1,  1,  1,  1,  0,  1,   1,   1,   1,    1,     1,     0,   1,      1,   1,     0,   0,   0 },
-  /* CXX20    */  { 1,  1,  1,  1,  1,  1,  1,   1,   1,   1,    1,     1,     0,   1,      1,   1,     0,   0,   0 },
-  /* GNUCXX23 */  { 1,  1,  1,  1,  1,  0,  1,   1,   1,   1,    1,     1,     0,   1,      1,   1,     0,   1,   1 },
-  /* CXX23    */  { 1,  1,  1,  1,  1,  1,  1,   1,   1,   1,    1,     1,     0,   1,      1,   1,     0,   1,   1 },
-  /* ASM      */  { 0,  0,  1,  0,  0,  0,  0,   0,   0,   0,    0,     0,     0,   0,      0,   0,     0,   0,   0 }
+{ /*              c99 c++ xnum xid c11 std digr ulit rlit udlit bincst digsep trig u8chlit vaopt scope dfp szlit elifdef delim */
+  /* GNUC89   */  { 0,  0,  1,  0,  0,  0,  1,   0,   0,   0,    0,     0,     0,   0,      1,   1,     0,   0,   0,      0 },
+  /* GNUC99   */  { 1,  0,  1,  1,  0,  0,  1,   1,   1,   0,    0,     0,     0,   0,      1,   1,     0,   0,   0,      0 },
+  /* GNUC11   */  { 1,  0,  1,  1,  1,  0,  1,   1,   1,   0,    0,     0,     0,   0,      1,   1,     0,   0,   0,      0 },
+  /* GNUC17   */  { 1,  0,  1,  1,  1,  0,  1,   1,   1,   0,    0,     0,     0,   0,      1,   1,     0,   0,   0,      0 },
+  /* GNUC2X   */  { 1,  0,  1,  1,  1,  0,  1,   1,   1,   0,    1,     1,     0,   1,      1,   1,     1,   0,   1,      0 },
+  /* STDC89   */  { 0,  0,  0,  0,  0,  1,  0,   0,   0,   0,    0,     0,     1,   0,      0,   0,     0,   0,   0,      0 },
+  /* STDC94   */  { 0,  0,  0,  0,  0,  1,  1,   0,   0,   0,    0,     0,     1,   0,      0,   0,     0,   0,   0,      0 },
+  /* STDC99   */  { 1,  0,  1,  1,  0,  1,  1,   0,   0,   0,    0,     0,     1,   0,      0,   0,     0,   0,   0,      0 },
+  /* STDC11   */  { 1,  0,  1,  1,  1,  1,  1,   1,   0,   0,    0,     0,     1,   0,      0,   0,     0,   0,   0,      0 },
+  /* STDC17   */  { 1,  0,  1,  1,  1,  1,  1,   1,   0,   0,    0,     0,     1,   0,      0,   0,     0,   0,   0,      0 },
+  /* STDC2X   */  { 1,  0,  1,  1,  1,  1,  1,   1,   0,   0,    1,     1,     1,   1,      0,   1,     1,   0,   1,      0 },
+  /* GNUCXX   */  { 0,  1,  1,  1,  0,  0,  1,   0,   0,   0,    0,     0,     0,   0,      1,   1,     0,   0,   0,      0 },
+  /* CXX98    */  { 0,  1,  0,  1,  0,  1,  1,   0,   0,   0,    0,     0,     1,   0,      0,   1,     0,   0,   0,      0 },
+  /* GNUCXX11 */  { 1,  1,  1,  1,  1,  0,  1,   1,   1,   1,    0,     0,     0,   0,      1,   1,     0,   0,   0,      0 },
+  /* CXX11    */  { 1,  1,  0,  1,  1,  1,  1,   1,   1,   1,    0,     0,     1,   0,      0,   1,     0,   0,   0,      0 },
+  /* GNUCXX14 */  { 1,  1,  1,  1,  1,  0,  1,   1,   1,   1,    1,     1,     0,   0,      1,   1,     0,   0,   0,      0 },
+  /* CXX14    */  { 1,  1,  0,  1,  1,  1,  1,   1,   1,   1,    1,     1,     1,   0,      0,   1,     0,   0,   0,      0 },
+  /* GNUCXX17 */  { 1,  1,  1,  1,  1,  0,  1,   1,   1,   1,    1,     1,     0,   1,      1,   1,     0,   0,   0,      0 },
+  /* CXX17    */  { 1,  1,  1,  1,  1,  1,  1,   1,   1,   1,    1,     1,     0,   1,      0,   1,     0,   0,   0,      0 },
+  /* GNUCXX20 */  { 1,  1,  1,  1,  1,  0,  1,   1,   1,   1,    1,     1,     0,   1,      1,   1,     0,   0,   0,      0 },
+  /* CXX20    */  { 1,  1,  1,  1,  1,  1,  1,   1,   1,   1,    1,     1,     0,   1,      1,   1,     0,   0,   0,      0 },
+  /* GNUCXX23 */  { 1,  1,  1,  1,  1,  0,  1,   1,   1,   1,    1,     1,     0,   1,      1,   1,     0,   1,   1,      1 },
+  /* CXX23    */  { 1,  1,  1,  1,  1,  1,  1,   1,   1,   1,    1,     1,     0,   1,      1,   1,     0,   1,   1,      1 },
+  /* ASM      */  { 0,  0,  1,  0,  0,  0,  0,   0,   0,   0,    0,     0,     0,   0,      0,   0,     0,   0,   0,      0 }
 };
 
 /* Sets internal flags correctly for a given language.  */
@@ -153,6 +154,7 @@  cpp_set_lang (cpp_reader *pfile, enum c_
   CPP_OPTION (pfile, dfp_constants)		 = l->dfp_constants;
   CPP_OPTION (pfile, size_t_literals)		 = l->size_t_literals;
   CPP_OPTION (pfile, elifdef)			 = l->elifdef;
+  CPP_OPTION (pfile, delimited_escape_seqs)	 = l->delimited_escape_seqs;
 }
 
 /* Initialize library global state.  */
--- libcpp/charset.cc.jj	2022-08-15 12:52:43.213902801 +0200
+++ libcpp/charset.cc	2022-08-16 11:42:27.729948705 +0200
@@ -1081,6 +1081,7 @@  _cpp_valid_ucn (cpp_reader *pfile, const
   unsigned int length;
   const uchar *str = *pstr;
   const uchar *base = str - 2;
+  bool delimited = false;
 
   if (!CPP_OPTION (pfile, cplusplus) && !CPP_OPTION (pfile, c99))
     cpp_error (pfile, CPP_DL_WARNING,
@@ -1095,7 +1096,17 @@  _cpp_valid_ucn (cpp_reader *pfile, const
 	         (int) str[-1]);
 
   if (str[-1] == 'u')
-    length = 4;
+    {
+      length = 4;
+      if (str < limit && *str == '{')
+	{
+	  str++;
+	  length = 32;
+	  delimited = true;
+	  if (loc_reader)
+	    char_range->m_finish = loc_reader->get_next ().m_finish;
+	}
+    }
   else if (str[-1] == 'U')
     length = 8;
   else
@@ -1107,6 +1118,8 @@  _cpp_valid_ucn (cpp_reader *pfile, const
   result = 0;
   do
     {
+      if (str == limit)
+	break;
       c = *str;
       if (!ISXDIGIT (c))
 	break;
@@ -1116,9 +1129,41 @@  _cpp_valid_ucn (cpp_reader *pfile, const
 	  gcc_assert (char_range);
 	  char_range->m_finish = loc_reader->get_next ().m_finish;
 	}
+      if (delimited)
+	{
+	  if (!result)
+	    /* Accept arbitrary number of leading zeros.  */
+	    length = 16;
+	  else if (length == 8)
+	    {
+	      /* Make sure we detect overflows.  */
+	      result |= 0x8000000;
+	      ++length;
+	    }
+	}
+
       result = (result << 4) + hex_value (c);
     }
-  while (--length && str < limit);
+  while (--length);
+
+  if (delimited
+      && str < limit
+      && *str == '}'
+      && (length != 32 || !identifier_pos))
+    {
+      if (length == 32)
+	cpp_error (pfile, CPP_DL_ERROR,
+		   "empty delimited escape sequence");
+      else if (!CPP_OPTION (pfile, delimited_escape_seqs)
+	       && CPP_OPTION (pfile, cpp_pedantic))
+	cpp_error (pfile, CPP_DL_PEDWARN,
+		   "delimited escape sequences are only valid in C++23");
+      str++;
+      length = 0;
+      delimited = false;
+      if (loc_reader)
+	char_range->m_finish = loc_reader->get_next ().m_finish;
+    }
 
   /* Partial UCNs are not valid in strings, but decompose into
      multiple tokens in identifiers, so we can't give a helpful
@@ -1132,9 +1177,14 @@  _cpp_valid_ucn (cpp_reader *pfile, const
   *pstr = str;
   if (length)
     {
-      cpp_error (pfile, CPP_DL_ERROR,
-		 "incomplete universal character name %.*s",
-		 (int) (str - base), base);
+      if (!delimited)
+	cpp_error (pfile, CPP_DL_ERROR,
+		   "incomplete universal character name %.*s",
+		   (int) (str - base), base);
+      else
+	cpp_error (pfile, CPP_DL_ERROR,
+		   "'\\u{' not terminated with '}' after %.*s",
+		   (int) (str - base), base);
       result = 1;
     }
   /* The C99 standard permits $, @ and ` to be specified as UCNs.  We use
@@ -1392,6 +1442,8 @@  convert_hex (cpp_reader *pfile, const uc
   int digits_found = 0;
   size_t width = cvt.width;
   size_t mask = width_to_mask (width);
+  bool delimited = false;
+  const uchar *base = from - 1;
 
   /* loc_reader and ranges must either be both NULL, or both be non-NULL.  */
   gcc_assert ((loc_reader != NULL) == (ranges != NULL));
@@ -1407,6 +1459,14 @@  convert_hex (cpp_reader *pfile, const uc
   if (loc_reader)
     char_range.m_finish = loc_reader->get_next ().m_finish;
 
+  if (from < limit && *from == '{')
+    {
+      delimited = true;
+      from++;
+      if (loc_reader)
+	char_range.m_finish = loc_reader->get_next ().m_finish;
+    }
+
   while (from < limit)
     {
       c = *from;
@@ -1420,12 +1480,37 @@  convert_hex (cpp_reader *pfile, const uc
       digits_found = 1;
     }
 
+  if (delimited && from < limit && *from == '}')
+    {
+      from++;
+      if (!digits_found)
+	{
+	  cpp_error (pfile, CPP_DL_ERROR,
+		     "empty delimited escape sequence");
+	  return from;
+	}
+     else if (!CPP_OPTION (pfile, delimited_escape_seqs)
+	      && CPP_OPTION (pfile, cpp_pedantic))
+	cpp_error (pfile, CPP_DL_PEDWARN,
+		   "delimited escape sequences are only valid in C++23");
+      delimited = false;
+      if (loc_reader)
+	char_range.m_finish = loc_reader->get_next ().m_finish;
+    }
+
   if (!digits_found)
     {
       cpp_error (pfile, CPP_DL_ERROR,
 		 "\\x used with no following hex digits");
       return from;
     }
+  else if (delimited)
+    {
+      cpp_error (pfile, CPP_DL_ERROR,
+		 "'\\x{' not terminated with '}' after %.*s",
+		 (int) (from - base), base);
+      return from;
+    }
 
   if (overflow | (n != (n & mask)))
     {
@@ -1459,13 +1544,31 @@  convert_oct (cpp_reader *pfile, const uc
 	     cpp_substring_ranges *ranges)
 {
   size_t count = 0;
-  cppchar_t c, n = 0;
+  cppchar_t c, n = 0, overflow = 0;
   size_t width = cvt.width;
   size_t mask = width_to_mask (width);
+  bool delimited = false;
+  const uchar *base = from - 1;
 
   /* loc_reader and ranges must either be both NULL, or both be non-NULL.  */
   gcc_assert ((loc_reader != NULL) == (ranges != NULL));
 
+  if (from < limit && *from == 'o')
+    {
+      from++;
+      if (loc_reader)
+	char_range.m_finish = loc_reader->get_next ().m_finish;
+      if (from == limit || *from != '{')
+	cpp_error (pfile, CPP_DL_ERROR, "'\\o' not followed by '{'");
+      else
+	{
+	  from++;
+	  if (loc_reader)
+	    char_range.m_finish = loc_reader->get_next ().m_finish;
+	  delimited = true;
+	}
+    }
+
   while (from < limit && count++ < 3)
     {
       c = *from;
@@ -1474,10 +1577,42 @@  convert_oct (cpp_reader *pfile, const uc
       from++;
       if (loc_reader)
 	char_range.m_finish = loc_reader->get_next ().m_finish;
+      if (delimited)
+	{
+	  count = 2;
+	  overflow |= n ^ (n << 3 >> 3);
+	}
       n = (n << 3) + c - '0';
     }
 
-  if (n != (n & mask))
+  if (delimited)
+    {
+      if (from < limit && *from == '}')
+	{
+	  from++;
+	  if (count == 1)
+	    {
+	      cpp_error (pfile, CPP_DL_ERROR,
+			 "empty delimited escape sequence");
+	      return from;
+	    }
+	  else if (!CPP_OPTION (pfile, delimited_escape_seqs)
+		   && CPP_OPTION (pfile, cpp_pedantic))
+	    cpp_error (pfile, CPP_DL_PEDWARN,
+		       "delimited escape sequences are only valid in C++23");
+	  if (loc_reader)
+	    char_range.m_finish = loc_reader->get_next ().m_finish;
+	}
+      else
+	{
+	  cpp_error (pfile, CPP_DL_ERROR,
+		     "'\\o{' not terminated with '}' after %.*s",
+		     (int) (from - base), base);
+	  return from;
+	}
+    }
+
+  if (overflow | (n != (n & mask)))
     {
       cpp_error (pfile, CPP_DL_PEDWARN,
 		 "octal escape sequence out of range");
@@ -1535,6 +1670,7 @@  convert_escape (cpp_reader *pfile, const
 
     case '0':  case '1':  case '2':  case '3':
     case '4':  case '5':  case '6':  case '7':
+    case 'o':
       return convert_oct (pfile, from, limit, tbuf, cvt,
 			  char_range, loc_reader, ranges);
 
@@ -2119,15 +2255,23 @@  _cpp_interpret_identifier (cpp_reader *p
 	cppchar_t value = 0;
 	size_t bufleft = len - (bufp - buf);
 	int rval;
+	bool delimited = false;
 
 	idp += 2;
+	if (length == 4 && id[idp] == '{')
+	  {
+	    delimited = true;
+	    idp++;
+	  }
 	while (length && idp < len && ISXDIGIT (id[idp]))
 	  {
 	    value = (value << 4) + hex_value (id[idp]);
 	    idp++;
-	    length--;
+	    if (!delimited)
+	      length--;
 	  }
-	idp--;
+	if (!delimited)
+	  idp--;
 
 	/* Special case for EBCDIC: if the identifier contains
 	   a '$' specified using a UCN, translate it to EBCDIC.  */
--- libcpp/lex.cc.jj	2022-05-23 10:59:06.235591348 +0200
+++ libcpp/lex.cc	2022-08-16 11:57:53.772823661 +0200
@@ -1426,19 +1426,35 @@  get_bidi_utf8 (cpp_reader *pfile, const
 /* Parse a UCN where P points just past \u or \U and return its bidi code.  */
 
 static bidi::kind
-get_bidi_ucn_1 (const unsigned char *p, bool is_U)
+get_bidi_ucn_1 (const unsigned char *p, bool is_U, const unsigned char **end)
 {
   /* 6.4.3 Universal Character Names
       \u hex-quad
       \U hex-quad hex-quad
+      \u { simple-hexadecimal-digit-sequence }
      where \unnnn means \U0000nnnn.  */
 
+  *end = p + 4;
   if (is_U)
     {
       if (p[0] != '0' || p[1] != '0' || p[2] != '0' || p[3] != '0')
 	return bidi::kind::NONE;
       /* Skip 4B so we can treat \u and \U the same below.  */
       p += 4;
+      *end += 4;
+    }
+  else if (p[0] == '{')
+    {
+      p++;
+      while (*p == '0')
+	p++;
+      if (p[0] != '2'
+	  || p[1] != '0'
+	  || !ISXDIGIT (p[2])
+	  || !ISXDIGIT (p[3])
+	  || p[4] != '}')
+	return bidi::kind::NONE;
+      *end = p + 5;
     }
 
   /* All code points we are looking for start with 20xx.  */
@@ -1499,14 +1515,15 @@  get_bidi_ucn_1 (const unsigned char *p,
    If the kind is not NONE, write the location to *OUT.*/
 
 static bidi::kind
-get_bidi_ucn (cpp_reader *pfile,  const unsigned char *p, bool is_U,
+get_bidi_ucn (cpp_reader *pfile, const unsigned char *p, bool is_U,
 	      location_t *out)
 {
-  bidi::kind result = get_bidi_ucn_1 (p, is_U);
+  const unsigned char *end;
+  bidi::kind result = get_bidi_ucn_1 (p, is_U, &end);
   if (result != bidi::kind::NONE)
     {
       const unsigned char *start = p - 2;
-      size_t num_bytes = 2 + (is_U ? 8 : 4);
+      size_t num_bytes = end - start;
       *out = get_location_for_byte_range_in_cur_line (pfile, start, num_bytes);
     }
   return result;
--- gcc/testsuite/c-c++-common/cpp/delimited-escape-seq-1.c.jj	2022-08-16 10:47:38.693022740 +0200
+++ gcc/testsuite/c-c++-common/cpp/delimited-escape-seq-1.c	2022-08-16 12:18:42.235477632 +0200
@@ -0,0 +1,92 @@ 
+/* P2290R3 - Delimited escape sequences */
+/* { dg-do run } */
+/* { dg-require-effective-target wchar } */
+/* { dg-options "-std=gnu99 -Wno-c++-compat" { target c } } */
+/* { dg-options "-std=c++23" { target c++ } } */
+
+#ifndef __cplusplus
+#include <wchar.h>
+typedef __CHAR16_TYPE__ char16_t;
+typedef __CHAR32_TYPE__ char32_t;
+#endif
+
+const char32_t *a = U"\u{1234}\u{10fffd}\u{000000000000000000000000000000000000000000000000000000000001234}\u{10FFFD}";
+const char32_t *b = U"\x{1234}\x{10fffd}\x{000000000000000000000000000000000000000000000000000000000001234}";
+const char32_t *c = U"\o{1234}\o{4177775}\o{000000000000000000000000000000000000000000000000000000000000000000000000004177775}";
+const char16_t *d = u"\u{1234}\u{bFFd}\u{00000000000000000000000000000001234}";
+const char16_t *e = u"\x{1234}\x{BffD}\x{000001234}";
+const char16_t *f = u"\o{1234}\o{137775}\o{000000000000000137775}";
+const wchar_t *g = L"\u{1234}\u{bFFd}\u{00000000000000000000000000000001234}";
+const wchar_t *h = L"\x{1234}\x{bFFd}\x{000001234}";
+const wchar_t *i = L"\o{1234}\o{137775}\o{000000000000000137775}";
+#ifdef __cplusplus
+const char *j = "\u{34}\u{000000000000000003D}";
+#endif
+const char *k = "\x{34}\x{000000000000000003D}";
+const char *l = "\o{34}\o{000000000000000176}";
+
+#if U'\u{1234}' != U'\u1234' || U'\u{10fffd}' != U'\U0010FFFD' \
+    || U'\x{00000001234}' != U'\x1234' || U'\x{010fffd}' != U'\x10FFFD' \
+    || U'\o{1234}' != U'\x29c' || U'\o{004177775}' != U'\x10FFFD' \
+    || u'\u{1234}' != u'\u1234' || u'\u{0bffd}' != u'\uBFFD' \
+    || u'\x{00000001234}' != u'\x1234' || u'\x{0Bffd}' != u'\x0bFFD' \
+    || u'\o{1234}' != u'\x29c' || u'\o{00137775}' != u'\xBFFD' \
+    || L'\u{1234}' != L'\u1234' || L'\u{0bffd}' != L'\uBFFD' \
+    || L'\x{00000001234}' != L'\x1234' || L'\x{0bffd}' != L'\x0bFFD' \
+    || L'\o{1234}' != L'\x29c' || L'\o{00137775}' != L'\xBFFD' \
+    || '\x{34}' != '\x034' || '\x{0003d}' != '\x003D' \
+    || '\o{34}' != '\x1C' || '\o{176}' != '\x007E'
+#error Bad
+#endif
+#ifdef __cplusplus
+#if '\u{0000000034}' != '\u0034' || '\u{3d}' != '\u003D'
+#error Bad
+#endif
+#endif
+
+int
+main ()
+{
+  if (a[0] != U'\u1234' || a[0] != U'\u{1234}'
+      || a[1] != U'\U0010FFFD' || a[1] != U'\u{000010fFfD}'
+      || a[2] != a[0]
+      || a[3] != a[1]
+      || b[0] != U'\x1234' || b[0] != U'\x{001234}'
+      || b[1] != U'\x10FFFD' || b[1] != U'\x{0010fFfD}'
+      || b[2] != b[0]
+      || c[0] != U'\x29c' || c[0] != U'\o{001234}'
+      || c[1] != U'\x10FFFD' || c[1] != U'\o{4177775}'
+      || c[2] != c[1])
+    __builtin_abort ();
+  if (d[0] != u'\u1234' || d[0] != u'\u{1234}'
+      || d[1] != u'\U0000BFFD' || d[1] != u'\u{00000bFfD}'
+      || d[2] != d[0]
+      || e[0] != u'\x1234' || e[0] != u'\x{001234}'
+      || e[1] != u'\xBFFD' || e[1] != u'\x{00bFfD}'
+      || e[2] != e[0]
+      || f[0] != u'\x29c' || f[0] != u'\o{001234}'
+      || f[1] != u'\xbFFD' || f[1] != u'\o{137775}'
+      || f[2] != f[1])
+    __builtin_abort ();
+  if (g[0] != L'\u1234' || g[0] != L'\u{1234}'
+      || g[1] != L'\U0000BFFD' || g[1] != L'\u{00000bFfD}'
+      || g[2] != g[0]
+      || h[0] != L'\x1234' || h[0] != L'\x{001234}'
+      || h[1] != L'\xBFFD' || h[1] != L'\x{00bFfD}'
+      || h[2] != h[0]
+      || i[0] != L'\x29c' || i[0] != L'\o{001234}'
+      || i[1] != L'\xbFFD' || i[1] != L'\o{137775}'
+      || i[2] != i[1])
+    __builtin_abort ();
+#ifdef __cplusplus
+  if (j[0] != '\u0034' || j[0] != '\u{034}'
+      || j[1] != '\U0000003D' || j[1] != '\u{000003d}')
+    __builtin_abort ();
+#endif
+  if (k[0] != '\x034' || k[0] != '\x{0034}'
+      || k[1] != '\x3D' || k[1] != '\x{3d}'
+      || l[0] != '\x1c' || l[0] != '\o{0034}'
+      || l[1] != '\x07e' || l[1] != '\o{176}' || l[1] != '\176')
+    __builtin_abort ();
+  return 0;
+}
--- gcc/testsuite/c-c++-common/cpp/delimited-escape-seq-2.c.jj	2022-08-16 10:47:41.846981390 +0200
+++ gcc/testsuite/c-c++-common/cpp/delimited-escape-seq-2.c	2022-08-16 12:18:58.807260607 +0200
@@ -0,0 +1,18 @@ 
+/* P2290R3 - Delimited escape sequences */
+/* { dg-do compile } */
+/* { dg-options "-std=gnu99 -Wno-c++-compat" { target c } } */
+/* { dg-options "-std=c++23" { target c++ } } */
+
+int jalape\u{f1}o = 42;
+
+int
+caf\u{000e9} (void)
+{
+  return jalape\u00F1o;
+}
+
+int
+test (void)
+{
+  return caf\u00e9 ();
+}
--- gcc/testsuite/c-c++-common/cpp/delimited-escape-seq-3.c.jj	2022-08-16 12:18:19.308777922 +0200
+++ gcc/testsuite/c-c++-common/cpp/delimited-escape-seq-3.c	2022-08-16 12:41:23.693648138 +0200
@@ -0,0 +1,33 @@ 
+/* P2290R3 - Delimited escape sequences */
+/* { dg-do compile } */
+/* { dg-require-effective-target wchar } */
+/* { dg-options "-std=gnu99 -Wno-c++-compat" { target c } } */
+/* { dg-options "-std=c++23" { target c++ } } */
+
+#ifndef __cplusplus
+typedef __CHAR32_TYPE__ char32_t;
+#endif
+
+const char32_t *a = U"\u{}";				/* { dg-error "empty delimited escape sequence" } */
+							/* { dg-error "is not a valid universal character" "" { target c } .-1 } */
+const char32_t *b = U"\u{12" "34}";			/* { dg-error "'\\\\u\\{' not terminated with '\\}' after" } */
+const char32_t *c = U"\u{0000ffffffff}";		/* { dg-error "is not a valid universal character" } */
+const char32_t *d = U"\u{010000edcb}";			/* { dg-error "is not a valid universal character" } */
+const char32_t *e = U"\u{02000000000000000000edcb}";	/* { dg-error "is not a valid universal character" } */
+const char32_t *f = U"\u{123ghij}";			/* { dg-error "'\\\\u\\{' not terminated with '\\}' after" } */
+const char32_t *g = U"\u{123.}";			/* { dg-error "'\\\\u\\{' not terminated with '\\}' after" } */
+const char32_t *h = U"\u{.}";				/* { dg-error "'\\\\u\\{' not terminated with '\\}' after" } */
+const char32_t *i = U"\x{}";				/* { dg-error "empty delimited escape sequence" } */
+const char32_t *j = U"\x{12" "34}";			/* { dg-error "'\\\\x\\{' not terminated with '\\}' after" } */
+const char32_t *k = U"\x{0000ffffffff}";
+const char32_t *l = U"\x{010000edcb}";			/* { dg-warning "hex escape sequence out of range" } */
+const char32_t *m = U"\x{02000000000000000000edcb}";	/* { dg-warning "hex escape sequence out of range" } */
+const char32_t *n = U"\x{123ghij}";			/* { dg-error "'\\\\x\\{' not terminated with '\\}' after" } */
+const char32_t *o = U"\x{123.}";			/* { dg-error "'\\\\x\\{' not terminated with '\\}' after" } */
+const char32_t *p = U"\o{}";				/* { dg-error "empty delimited escape sequence" } */
+const char32_t *q = U"\o{12" "34}";			/* { dg-error "'\\\\o\\{' not terminated with '\\}' after" } */
+const char32_t *r = U"\o{0000037777777777}";
+const char32_t *s = U"\o{040000166713}";		/* { dg-warning "octal escape sequence out of range" } */
+const char32_t *t = U"\o{02000000000000000000000166713}";/* { dg-warning "octal escape sequence out of range" } */
+const char32_t *u = U"\o{1238}";			/* { dg-error "'\\\\o\\{' not terminated with '\\}' after" } */
+const char32_t *v = U"\o{.}";				/* { dg-error "'\\\\o\\{' not terminated with '\\}' after" } */
--- gcc/testsuite/c-c++-common/Wbidi-chars-24.c.jj	2022-08-16 12:03:19.350561676 +0200
+++ gcc/testsuite/c-c++-common/Wbidi-chars-24.c	2022-08-16 12:06:46.381851525 +0200
@@ -0,0 +1,28 @@ 
+/* PR preprocessor/103026 */
+/* { dg-do compile } */
+/* { dg-options "-Wbidi-chars=ucn,unpaired" } */
+/* Test that bidi control characters spelled as delimited UCNs (with varying numbers of leading zeros) are expected to be diagnosed as unpaired in string literals and identifiers.  */
+
+void
+g1 ()
+{
+  const char *s1 = "a b c LRE\u{202a} 1 2 3 PDI\u{00000000000000000000000002069} x y z";
+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
+  const char *s2 = "a b c RLE\u{00202b} 1 2 3 PDI\u{2069} x y z";
+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
+  const char *s3 = "a b c LRO\u{000000202d} 1 2 3 PDI\u{02069} x y z";
+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
+  const char *s4 = "a b c RLO\u{202e} 1 2 3 PDI\u{00000002069} x y z";
+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
+  const char *s5 = "a b c LRI\u{002066} 1 2 3 PDF\u{202C} x y z";
+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
+  const char *s6 = "a b c RLI\u{02067} 1 2 3 PDF\u{202c} x y z";
+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
+  const char *s7 = "a b c FSI\u{0002068} 1 2 3 PDF\u{0202c} x y z";
+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
+}
+
+int A\u{202a}B\u{2069}C;
+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
+int a\u{00000202b}B\u{000000002069}c;
+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
--- gcc/testsuite/gcc.dg/cpp/delimited-escape-seq-1.c.jj	2022-08-16 10:47:38.693022740 +0200
+++ gcc/testsuite/gcc.dg/cpp/delimited-escape-seq-1.c	2022-08-16 12:46:56.508291006 +0200
@@ -0,0 +1,10 @@ 
+/* P2290R3 - Delimited escape sequences.  With -std=gnu99 -pedantic each delimited form below is expected to draw a warning.  */
+/* { dg-do compile } */
+/* { dg-require-effective-target wchar } */
+/* { dg-options "-std=gnu99 -Wno-c++-compat -pedantic" } */
+
+typedef __CHAR32_TYPE__ char32_t;
+
+const char32_t *a = U"\u{1234}";	/* { dg-warning "delimited escape sequences are only valid in" } */
+const char32_t *b = U"\x{1234}";	/* { dg-warning "delimited escape sequences are only valid in" } */
+const char32_t *c = U"\o{1234}";	/* { dg-warning "delimited escape sequences are only valid in" } */
--- gcc/testsuite/gcc.dg/cpp/delimited-escape-seq-2.c.jj	2022-08-16 10:47:41.846981390 +0200
+++ gcc/testsuite/gcc.dg/cpp/delimited-escape-seq-2.c	2022-08-16 12:47:05.955167423 +0200
@@ -0,0 +1,10 @@ 
+/* P2290R3 - Delimited escape sequences.  With -std=gnu99 -pedantic-errors each delimited form below is expected to draw an error.  */
+/* { dg-do compile } */
+/* { dg-require-effective-target wchar } */
+/* { dg-options "-std=gnu99 -Wno-c++-compat -pedantic-errors" } */
+
+typedef __CHAR32_TYPE__ char32_t;
+
+const char32_t *a = U"\u{1234}";	/* { dg-error "delimited escape sequences are only valid in" } */
+const char32_t *b = U"\x{1234}";	/* { dg-error "delimited escape sequences are only valid in" } */
+const char32_t *c = U"\o{1234}";	/* { dg-error "delimited escape sequences are only valid in" } */
--- gcc/testsuite/g++.dg/cpp/delimited-escape-seq-1.C.jj	2022-08-16 12:46:43.368462901 +0200
+++ gcc/testsuite/g++.dg/cpp/delimited-escape-seq-1.C	2022-08-16 12:49:21.532393786 +0200
@@ -0,0 +1,8 @@ 
+// P2290R3 - Delimited escape sequences.  With -pedantic each delimited form below is expected to draw a warning when the c++20_down target selector matches.
+// { dg-do compile { target c++11 } }
+// { dg-require-effective-target wchar }
+// { dg-options "-pedantic" }
+
+const char32_t *a = U"\u{1234}";	// { dg-warning "delimited escape sequences are only valid in" "" { target c++20_down } }
+const char32_t *b = U"\x{1234}";	// { dg-warning "delimited escape sequences are only valid in" "" { target c++20_down } }
+const char32_t *c = U"\o{1234}";	// { dg-warning "delimited escape sequences are only valid in" "" { target c++20_down } }
--- gcc/testsuite/g++.dg/cpp/delimited-escape-seq-2.C.jj	2022-08-16 12:46:46.281424798 +0200
+++ gcc/testsuite/g++.dg/cpp/delimited-escape-seq-2.C	2022-08-16 12:49:33.761233803 +0200
@@ -0,0 +1,8 @@ 
+// P2290R3 - Delimited escape sequences.  With -pedantic-errors each delimited form below is expected to draw an error when the c++20_down target selector matches.
+// { dg-do compile { target c++11 } }
+// { dg-require-effective-target wchar }
+// { dg-options "-pedantic-errors" }
+
+const char32_t *a = U"\u{1234}";	// { dg-error "delimited escape sequences are only valid in" "" { target c++20_down } }
+const char32_t *b = U"\x{1234}";	// { dg-error "delimited escape sequences are only valid in" "" { target c++20_down } }
+const char32_t *c = U"\o{1234}";	// { dg-error "delimited escape sequences are only valid in" "" { target c++20_down } }