libcpp: Add -Winvalid-utf8 warning [PR106655]

The following patch introduces a new warning, -Winvalid-utf8 (similar to
the one clang now has), which diagnoses invalid UTF-8 byte sequences not
only in comments, but also in string/character literals and outside of
them.

The warning is on by default when -finput-charset=UTF-8 is given
explicitly and C++23 compilation is requested; if -{,W}pedantic or
-pedantic-errors is also used, it is actually a pedwarn.

The reason it is on by default only with an explicit -finput-charset=UTF-8
is that sources are often UTF-8, but are sometimes in some ASCII-compatible
single-byte encoding in which non-ASCII characters appear only in
comments.  So leaving the warning off by default in all other cases is IMO
desirable.  The C++23 pedantic mode for UTF-8 source code is
-std=c++23 -pedantic-errors -finput-charset=UTF-8.

2022-09-01  Jakub Jelinek  <jakub@redhat.com>

	PR c++/106655
libcpp/
	* include/cpplib.h (struct cpp_options): Implement C++23
	P2295R6 - Support for UTF-8 as a portable source file encoding.
	Add cpp_warn_invalid_utf8 and cpp_input_charset_explicit fields.
	(enum cpp_warning_reason): Add CPP_W_INVALID_UTF8 enumerator.
	* init.cc (cpp_create_reader): Initialize cpp_warn_invalid_utf8
	and cpp_input_charset_explicit.
	* charset.cc (_cpp_valid_utf8): Adjust function comment.
	* lex.cc (UCS_LIMIT): Define.
	(utf8_continuation): New const variable.
	(utf8_signifier): Move earlier in the file.
	(_cpp_warn_invalid_utf8, _cpp_handle_multibyte_utf8): New functions.
	(_cpp_skip_block_comment): Handle -Winvalid-utf8 warning.
	(skip_line_comment): Likewise.
	(lex_raw_string, lex_string): Likewise.
	(_cpp_lex_direct): Likewise.
gcc/
	* doc/invoke.texi (-Winvalid-utf8): Document it.
gcc/c-family/
	* c.opt (-Winvalid-utf8): New warning.
	* c-opts.cc (c_common_handle_option) <case OPT_finput_charset_>:
	Set cpp_opts->cpp_input_charset_explicit.
	(c_common_post_options): If -finput-charset=UTF-8 is explicit
	in C++23, enable -Winvalid-utf8 by default and if -pedantic
	or -pedantic-errors, make it a pedwarn.
gcc/testsuite/
	* c-c++-common/cpp/Winvalid-utf8-1.c: New test.
	* c-c++-common/cpp/Winvalid-utf8-2.c: New test.
	* c-c++-common/cpp/Winvalid-utf8-3.c: New test.
	* g++.dg/cpp23/Winvalid-utf8-1.C: New test.
	* g++.dg/cpp23/Winvalid-utf8-2.C: New test.
	* g++.dg/cpp23/Winvalid-utf8-3.C: New test.
	* g++.dg/cpp23/Winvalid-utf8-4.C: New test.
	* g++.dg/cpp23/Winvalid-utf8-5.C: New test.
	* g++.dg/cpp23/Winvalid-utf8-6.C: New test.
	* g++.dg/cpp23/Winvalid-utf8-7.C: New test.
	* g++.dg/cpp23/Winvalid-utf8-8.C: New test.
	* g++.dg/cpp23/Winvalid-utf8-9.C: New test.
	* g++.dg/cpp23/Winvalid-utf8-10.C: New test.
	* g++.dg/cpp23/Winvalid-utf8-11.C: New test.
	* g++.dg/cpp23/Winvalid-utf8-12.C: New test.
This commit is contained in:
Jakub Jelinek
2022-09-01 09:48:01 +02:00
parent bdfe0d1ce0
commit 0b8c57ed40
22 changed files with 973 additions and 33 deletions

View File

@@ -1742,9 +1742,9 @@ convert_ucn (cpp_reader *pfile, const uchar *from, const uchar *limit,
case, no diagnostic is emitted, and the return value of FALSE should cause
a new token to be formed.
Unlike _cpp_valid_ucn, this will never be called when lexing a string; only
a potential identifier, or a CPP_OTHER token. NST is unused in the latter
case.
_cpp_valid_utf8 can be called when lexing a potential identifier, or a
CPP_OTHER token or for the purposes of -Winvalid-utf8 warning in string or
character literals. NST is unused when not in a potential identifier.
As in _cpp_valid_ucn, IDENTIFIER_POS is 0 when not in an identifier, 1 for
the start of an identifier, or 2 otherwise. */

View File

@@ -560,6 +560,13 @@ struct cpp_options
cpp_bidirectional_level. */
unsigned char cpp_warn_bidirectional;
/* True if libcpp should warn about invalid UTF-8 characters in comments,
in string/character literals and elsewhere in the source.  2 if it should
be a pedwarn. */
unsigned char cpp_warn_invalid_utf8;
/* True if -finput-charset= option has been used explicitly. */
bool cpp_input_charset_explicit;
/* Dependency generation. */
struct
{
@@ -666,7 +673,8 @@ enum cpp_warning_reason {
CPP_W_CXX11_COMPAT,
CPP_W_CXX20_COMPAT,
CPP_W_EXPANSION_TO_DEFINED,
CPP_W_BIDIRECTIONAL
CPP_W_BIDIRECTIONAL,
CPP_W_INVALID_UTF8
};
/* Callback for header lookup for HEADER, which is the name of a

View File

@@ -227,6 +227,8 @@ cpp_create_reader (enum c_lang lang, cpp_hash_table *table,
CPP_OPTION (pfile, ext_numeric_literals) = 1;
CPP_OPTION (pfile, warn_date_time) = 0;
CPP_OPTION (pfile, cpp_warn_bidirectional) = bidirectional_unpaired;
CPP_OPTION (pfile, cpp_warn_invalid_utf8) = 0;
CPP_OPTION (pfile, cpp_input_charset_explicit) = 0;
/* Default CPP arithmetic to something sensible for the host for the
benefit of dumb users like fix-header. */

View File

@@ -50,6 +50,9 @@ static const struct token_spelling token_spellings[N_TTYPES] = { TTYPE_TABLE };
#define TOKEN_SPELL(token) (token_spellings[(token)->type].category)
#define TOKEN_NAME(token) (token_spellings[(token)->type].name)
/* ISO 10646 defines the UCS codespace as the range 0-0x10FFFF inclusive. */
#define UCS_LIMIT 0x10FFFF
static void add_line_note (cpp_buffer *, const uchar *, unsigned int);
static int skip_line_comment (cpp_reader *);
static void skip_whitespace (cpp_reader *, cppchar_t);
@@ -1704,6 +1707,120 @@ maybe_warn_bidi_on_char (cpp_reader *pfile, bidi::kind kind,
bidi::on_char (kind, ucn_p, loc);
}
/* Lower bound of the byte range [utf8_continuation, utf8_signifier) that
   the -Winvalid-utf8 checks in this file accept as a continuation byte;
   it is also the threshold callers use to detect a byte with its most
   significant bit set.  */
static const cppchar_t utf8_continuation = 0x80;
/* Bytes greater than or equal to this value are treated as the possible
   start of a multibyte UTF-8 sequence; bytes below it are not.  */
static const cppchar_t utf8_signifier = 0xC0;
/* Diagnose, for -Winvalid-utf8, the invalid UTF-8 sequence that begins at
   PFILE->buffer->cur.  The diagnostic is a pedwarn when CPP_PEDANTIC holds
   and cpp_warn_invalid_utf8 is 2, and a CPP_W_INVALID_UTF8 warning
   otherwise.  Between one and four bytes are reported: the first byte,
   plus (only when that byte is >= utf8_signifier) up to three following
   bytes that lie in [utf8_continuation, utf8_signifier).  Return a pointer
   just past the reported bytes.  */
static const uchar *
_cpp_warn_invalid_utf8 (cpp_reader *pfile)
{
  cpp_buffer *buf = pfile->buffer;
  const uchar *seq = buf->cur;
  bool as_pedwarn = (CPP_PEDANTIC (pfile)
                     && CPP_OPTION (pfile, cpp_warn_invalid_utf8) == 2);

  /* Work out how many bytes belong to the diagnosed sequence.  Trailing
     bytes are only inspected one at a time, and only while the previous
     one qualified, so nothing past the first non-continuation byte is
     read.  */
  unsigned int nbytes = 1;
  if (seq[0] >= utf8_signifier)
    while (nbytes < 4
           && seq[nbytes] >= utf8_continuation
           && seq[nbytes] < utf8_signifier)
      nbytes++;

  /* The format strings stay literal so each message is spelled out in
     full for every length.  */
  switch (nbytes)
    {
    case 1:
      if (as_pedwarn)
        cpp_error_with_line (pfile, CPP_DL_PEDWARN,
                             pfile->line_table->highest_line,
                             CPP_BUF_COL (buf),
                             "invalid UTF-8 character <%x>",
                             seq[0]);
      else
        cpp_warning_with_line (pfile, CPP_W_INVALID_UTF8,
                               pfile->line_table->highest_line,
                               CPP_BUF_COL (buf),
                               "invalid UTF-8 character <%x>",
                               seq[0]);
      break;

    case 2:
      if (as_pedwarn)
        cpp_error_with_line (pfile, CPP_DL_PEDWARN,
                             pfile->line_table->highest_line,
                             CPP_BUF_COL (buf),
                             "invalid UTF-8 character <%x><%x>",
                             seq[0], seq[1]);
      else
        cpp_warning_with_line (pfile, CPP_W_INVALID_UTF8,
                               pfile->line_table->highest_line,
                               CPP_BUF_COL (buf),
                               "invalid UTF-8 character <%x><%x>",
                               seq[0], seq[1]);
      break;

    case 3:
      if (as_pedwarn)
        cpp_error_with_line (pfile, CPP_DL_PEDWARN,
                             pfile->line_table->highest_line,
                             CPP_BUF_COL (buf),
                             "invalid UTF-8 character <%x><%x><%x>",
                             seq[0], seq[1], seq[2]);
      else
        cpp_warning_with_line (pfile, CPP_W_INVALID_UTF8,
                               pfile->line_table->highest_line,
                               CPP_BUF_COL (buf),
                               "invalid UTF-8 character <%x><%x><%x>",
                               seq[0], seq[1], seq[2]);
      break;

    default:
      if (as_pedwarn)
        cpp_error_with_line (pfile, CPP_DL_PEDWARN,
                             pfile->line_table->highest_line,
                             CPP_BUF_COL (buf),
                             "invalid UTF-8 character <%x><%x><%x><%x>",
                             seq[0], seq[1], seq[2], seq[3]);
      else
        cpp_warning_with_line (pfile, CPP_W_INVALID_UTF8,
                               pfile->line_table->highest_line,
                               CPP_BUF_COL (buf),
                               "invalid UTF-8 character <%x><%x><%x><%x>",
                               seq[0], seq[1], seq[2], seq[3]);
      break;
    }

  return seq + nbytes;
}
/* Helper shared by the *skip_*_comment and lex*_string routines, called
   for a byte C, located at CUR[-1], that has its most significant bit set.
   If WARN_BIDI_P and C is bidi::utf8_start, run the -Wbidi-chars* check on
   the sequence starting at CUR - 1.  If WARN_INVALID_UTF8_P, additionally
   check that sequence with _cpp_valid_utf8 and emit -Winvalid-utf8 when it
   is rejected or decodes to a value above UCS_LIMIT.  Return a pointer to
   the first character that should be processed next: CUR itself when
   WARN_INVALID_UTF8_P is false, otherwise the position just past the
   accepted or diagnosed sequence.  Note that the diagnosing path resets
   PFILE->buffer->cur to CUR - 1.  */
static inline const uchar *
_cpp_handle_multibyte_utf8 (cpp_reader *pfile, uchar c,
                            const uchar *cur, bool warn_bidi_p,
                            bool warn_invalid_utf8_p)
{
  /* A sequence that begins with bidi::utf8_start might encode a
     bidirectional control character.  */
  if (warn_bidi_p && c == bidi::utf8_start)
    {
      location_t where;
      bidi::kind what = get_bidi_utf8 (pfile, cur - 1, &where);
      maybe_warn_bidi_on_char (pfile, what, /*ucn_p=*/false, where);
    }

  /* Only the bidi check was requested; the caller carries on from CUR.  */
  if (!warn_invalid_utf8_p)
    return cur;

  /* Only bytes at or above utf8_signifier are handed to the validator;
     anything else with the MSB set falls straight through to the
     diagnostic.  */
  cppchar_t code_point;
  const uchar *seq = cur - 1;
  bool well_formed
    = (c >= utf8_signifier
       && _cpp_valid_utf8 (pfile, &seq, pfile->buffer->rlimit, 0, NULL,
                           &code_point)
       && code_point <= UCS_LIMIT);
  if (well_formed)
    return seq;

  /* _cpp_warn_invalid_utf8 reads the offending bytes from buffer->cur.  */
  pfile->buffer->cur = cur - 1;
  return _cpp_warn_invalid_utf8 (pfile);
}
/* Skip a C-style block comment. We find the end of the comment by
seeing if an asterisk is before every '/' we encounter. Returns
nonzero if comment terminated by EOF, zero otherwise.
@@ -1716,6 +1833,8 @@ _cpp_skip_block_comment (cpp_reader *pfile)
const uchar *cur = buffer->cur;
uchar c;
const bool warn_bidi_p = pfile->warn_bidi_p ();
const bool warn_invalid_utf8_p = CPP_OPTION (pfile, cpp_warn_invalid_utf8);
const bool warn_bidi_or_invalid_utf8_p = warn_bidi_p | warn_invalid_utf8_p;
cur++;
if (*cur == '/')
@@ -1765,14 +1884,10 @@ _cpp_skip_block_comment (cpp_reader *pfile)
cur = buffer->cur;
}
/* If this is a beginning of a UTF-8 encoding, it might be
a bidirectional control character. */
else if (__builtin_expect (c == bidi::utf8_start, 0) && warn_bidi_p)
{
location_t loc;
bidi::kind kind = get_bidi_utf8 (pfile, cur - 1, &loc);
maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
}
else if (__builtin_expect (c >= utf8_continuation, 0)
&& warn_bidi_or_invalid_utf8_p)
cur = _cpp_handle_multibyte_utf8 (pfile, c, cur, warn_bidi_p,
warn_invalid_utf8_p);
}
buffer->cur = cur;
@@ -1789,11 +1904,13 @@ skip_line_comment (cpp_reader *pfile)
cpp_buffer *buffer = pfile->buffer;
location_t orig_line = pfile->line_table->highest_line;
const bool warn_bidi_p = pfile->warn_bidi_p ();
const bool warn_invalid_utf8_p = CPP_OPTION (pfile, cpp_warn_invalid_utf8);
const bool warn_bidi_or_invalid_utf8_p = warn_bidi_p | warn_invalid_utf8_p;
if (!warn_bidi_p)
if (!warn_bidi_or_invalid_utf8_p)
while (*buffer->cur != '\n')
buffer->cur++;
else
else if (!warn_invalid_utf8_p)
{
while (*buffer->cur != '\n'
&& *buffer->cur != bidi::utf8_start)
@@ -1813,6 +1930,22 @@ skip_line_comment (cpp_reader *pfile)
maybe_warn_bidi_on_close (pfile, buffer->cur);
}
}
else
{
while (*buffer->cur != '\n')
{
if (*buffer->cur < utf8_continuation)
{
buffer->cur++;
continue;
}
buffer->cur
= _cpp_handle_multibyte_utf8 (pfile, *buffer->cur, buffer->cur + 1,
warn_bidi_p, warn_invalid_utf8_p);
}
if (warn_bidi_p)
maybe_warn_bidi_on_close (pfile, buffer->cur);
}
_cpp_process_line_notes (pfile, true);
return orig_line != pfile->line_table->highest_line;
@@ -1919,8 +2052,6 @@ warn_about_normalization (cpp_reader *pfile,
}
}
static const cppchar_t utf8_signifier = 0xC0;
/* Returns TRUE if the sequence starting at buffer->cur is valid in
an identifier. FIRST is TRUE if this starts an identifier. */
@@ -2361,6 +2492,8 @@ lex_raw_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
{
const uchar *pos = base;
const bool warn_bidi_p = pfile->warn_bidi_p ();
const bool warn_invalid_utf8_p = CPP_OPTION (pfile, cpp_warn_invalid_utf8);
const bool warn_bidi_or_invalid_utf8_p = warn_bidi_p | warn_invalid_utf8_p;
/* 'tis a pity this information isn't passed down from the lexer's
initial categorization of the token. */
@@ -2597,13 +2730,10 @@ lex_raw_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
pos = base = pfile->buffer->cur;
note = &pfile->buffer->notes[pfile->buffer->cur_note];
}
else if (__builtin_expect ((unsigned char) c == bidi::utf8_start, 0)
&& warn_bidi_p)
{
location_t loc;
bidi::kind kind = get_bidi_utf8 (pfile, pos - 1, &loc);
maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
}
else if (__builtin_expect ((unsigned char) c >= utf8_continuation, 0)
&& warn_bidi_or_invalid_utf8_p)
pos = _cpp_handle_multibyte_utf8 (pfile, c, pos, warn_bidi_p,
warn_invalid_utf8_p);
}
if (warn_bidi_p)
@@ -2704,6 +2834,8 @@ lex_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
terminator = '>', type = CPP_HEADER_NAME;
const bool warn_bidi_p = pfile->warn_bidi_p ();
const bool warn_invalid_utf8_p = CPP_OPTION (pfile, cpp_warn_invalid_utf8);
const bool warn_bidi_or_invalid_utf8_p = warn_bidi_p | warn_invalid_utf8_p;
for (;;)
{
cppchar_t c = *cur++;
@@ -2745,12 +2877,10 @@ lex_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
}
else if (c == '\0')
saw_NUL = true;
else if (__builtin_expect (c == bidi::utf8_start, 0) && warn_bidi_p)
{
location_t loc;
bidi::kind kind = get_bidi_utf8 (pfile, cur - 1, &loc);
maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
}
else if (__builtin_expect (c >= utf8_continuation, 0)
&& warn_bidi_or_invalid_utf8_p)
cur = _cpp_handle_multibyte_utf8 (pfile, c, cur, warn_bidi_p,
warn_invalid_utf8_p);
}
if (saw_NUL && !pfile->state.skipping)
@@ -4052,6 +4182,7 @@ _cpp_lex_direct (cpp_reader *pfile)
default:
{
const uchar *base = --buffer->cur;
static int no_warn_cnt;
/* Check for an extended identifier ($ or UCN or UTF-8). */
struct normalize_state nst = INITIAL_NORMALIZE_STATE;
@@ -4072,7 +4203,33 @@ _cpp_lex_direct (cpp_reader *pfile)
const uchar *pstr = base;
cppchar_t s;
if (_cpp_valid_utf8 (pfile, &pstr, buffer->rlimit, 0, NULL, &s))
buffer->cur = pstr;
{
if (s > UCS_LIMIT && CPP_OPTION (pfile, cpp_warn_invalid_utf8))
{
buffer->cur = base;
_cpp_warn_invalid_utf8 (pfile);
}
buffer->cur = pstr;
}
else if (CPP_OPTION (pfile, cpp_warn_invalid_utf8))
{
buffer->cur = base;
const uchar *end = _cpp_warn_invalid_utf8 (pfile);
buffer->cur = base + 1;
no_warn_cnt = end - buffer->cur;
}
}
else if (c >= utf8_continuation
&& CPP_OPTION (pfile, cpp_warn_invalid_utf8))
{
if (no_warn_cnt)
--no_warn_cnt;
else
{
buffer->cur = base;
_cpp_warn_invalid_utf8 (pfile);
buffer->cur = base + 1;
}
}
create_literal (pfile, result, base, buffer->cur - base, CPP_OTHER);
break;