[v3,2/3] libcpp: add a function to determine UTF-8 validity of a C string

Message ID 20221109021048.2123704-3-ben.boeckel@kitware.com
State Accepted
Headers
Series RFC: P1689R5 support |

Checks

Context Check Description
snail/gcc-patch-check success Github commit url

Commit Message

Ben Boeckel Nov. 9, 2022, 2:10 a.m. UTC
This simplifies the interface for other UTF-8 validity detections when a
simple "yes" or "no" answer is sufficient.

libcpp/

	* charset.cc: Add `_cpp_valid_utf8_str` which determines whether
	a C string is valid UTF-8 or not.
	* internal.h: Add prototype for `_cpp_valid_utf8_str`.

Signed-off-by: Ben Boeckel <ben.boeckel@kitware.com>
---
 libcpp/charset.cc | 20 ++++++++++++++++++++
 libcpp/internal.h |  2 ++
 2 files changed, 22 insertions(+)
  

Comments

Jason Merrill Nov. 16, 2022, midnight UTC | #1
On 11/8/22 16:10, Ben Boeckel wrote:
> This simplifies the interface for other UTF-8 validity detections when a
> simple "yes" or "no" answer is sufficient.
> 
> libcpp/
> 
> 	* charset.cc: Add `_cpp_valid_utf8_str` which determines whether
> 	a C string is valid UTF-8 or not.
> 	* internal.h: Add prototype for `_cpp_valid_utf8_str`.
> 
> Signed-off-by: Ben Boeckel <ben.boeckel@kitware.com>
> ---
>   libcpp/charset.cc | 20 ++++++++++++++++++++
>   libcpp/internal.h |  2 ++
>   2 files changed, 22 insertions(+)
> 
> diff --git a/libcpp/charset.cc b/libcpp/charset.cc
> index 324b5b19136..e130bc01f48 100644
> --- a/libcpp/charset.cc
> +++ b/libcpp/charset.cc
> @@ -1868,6 +1868,26 @@ _cpp_valid_utf8 (cpp_reader *pfile,
>     return true;
>   }
>   
> +/*  Detect whether a C-string is a valid UTF-8-encoded set of bytes. Returns
> +    `false` if any contained byte sequence encodes an invalid Unicode codepoint
> +    or is not a valid UTF-8 sequence. Returns `true` otherwise. */
> +
> +extern bool
> +_cpp_valid_utf8_str (const char *name)
> +{
> +  const uchar* in = (const uchar*)name;
> +  size_t len = strlen(name);

You're missing a space before (.

> +  cppchar_t cp;
> +
> +  while (*in)
> +    {
> +      if (one_utf8_to_cppchar(&in, &len, &cp))

Here too.

OK with those fixed.

> +	return false;
> +    }
> +
> +  return true;
> +}
> +
>   /* Subroutine of convert_hex and convert_oct.  N is the representation
>      in the execution character set of a numeric escape; write it into the
>      string buffer TBUF and update the end-of-string pointer therein.  WIDE
> diff --git a/libcpp/internal.h b/libcpp/internal.h
> index badfd1b40da..4f2dd4a2f5c 100644
> --- a/libcpp/internal.h
> +++ b/libcpp/internal.h
> @@ -834,6 +834,8 @@ extern bool _cpp_valid_utf8 (cpp_reader *pfile,
>   			     struct normalize_state *nst,
>   			     cppchar_t *cp);
>   
> +extern bool _cpp_valid_utf8_str (const char *str);
> +
>   extern void _cpp_destroy_iconv (cpp_reader *);
>   extern unsigned char *_cpp_convert_input (cpp_reader *, const char *,
>   					  unsigned char *, size_t, size_t,
  

Patch

diff --git a/libcpp/charset.cc b/libcpp/charset.cc
index 324b5b19136..e130bc01f48 100644
--- a/libcpp/charset.cc
+++ b/libcpp/charset.cc
@@ -1868,6 +1868,26 @@  _cpp_valid_utf8 (cpp_reader *pfile,
   return true;
 }
 
+/*  Detect whether a C-string is a valid UTF-8-encoded set of bytes. Returns
+    `false` if any contained byte sequence encodes an invalid Unicode codepoint
+    or is not a valid UTF-8 sequence. Returns `true` otherwise. */
+
+extern bool
+_cpp_valid_utf8_str (const char *name)
+{
+  const uchar* in = (const uchar*)name;
+  size_t len = strlen(name);
+  cppchar_t cp;
+
+  while (*in)
+    {
+      if (one_utf8_to_cppchar(&in, &len, &cp))
+	return false;
+    }
+
+  return true;
+}
+
 /* Subroutine of convert_hex and convert_oct.  N is the representation
    in the execution character set of a numeric escape; write it into the
    string buffer TBUF and update the end-of-string pointer therein.  WIDE
diff --git a/libcpp/internal.h b/libcpp/internal.h
index badfd1b40da..4f2dd4a2f5c 100644
--- a/libcpp/internal.h
+++ b/libcpp/internal.h
@@ -834,6 +834,8 @@  extern bool _cpp_valid_utf8 (cpp_reader *pfile,
 			     struct normalize_state *nst,
 			     cppchar_t *cp);
 
+extern bool _cpp_valid_utf8_str (const char *str);
+
 extern void _cpp_destroy_iconv (cpp_reader *);
 extern unsigned char *_cpp_convert_input (cpp_reader *, const char *,
 					  unsigned char *, size_t, size_t,