[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[bug-gettext] [PATCH 2/2] c: Interpret string literals lazily
From: Daiki Ueno
Subject: [bug-gettext] [PATCH 2/2] c: Interpret string literals lazily
Date: Thu, 8 May 2014 18:30:16 +0900
* x-c.c (phase7_get): Remove.
(phase5_get): Use 'phase3_get' directly to extract string
literals; use 'arglist_parser_remember_literal' instead of
'arglist_parser_remember'.
(literalstring_parse): New function.
(literalstring_c): New variable.
* x-c.h (SCANNERS_C): Register 'literalstring_c' as a
literalstring_parser.
(literalstring_c): New variable.
---
gettext-tools/src/x-c.c | 515 +++++++++++++++++++++++++-----------------------
gettext-tools/src/x-c.h | 14 +-
2 files changed, 278 insertions(+), 251 deletions(-)
diff --git a/gettext-tools/src/x-c.c b/gettext-tools/src/x-c.c
index 9050433..374fa50 100644
--- a/gettext-tools/src/x-c.c
+++ b/gettext-tools/src/x-c.c
@@ -860,228 +860,226 @@ struct token_ty
};
-/* 7. Replace escape sequences within character strings with their
- single character equivalents. This is called from phase 5, because
- we don't have to worry about the #include argument. There are
- pathological cases which could bite us (like the DOS directory
- separator), but just pretend it can't happen. */
-
-/* Return value of phase7_getc when EOF is reached. */
-#define P7_EOF (-1)
-#define P7_STRING_END (-2)
-
-/* Replace escape sequences within character strings with their single
- character equivalents. */
-#define P7_QUOTES (-3)
-#define P7_QUOTE (-4)
-#define P7_NEWLINE (-5)
-
-/* Convert an UTF-16 or UTF-32 code point to a return value that can be
- distinguished from a single-byte return value. */
-#define UNICODE(code) (0x100 + (code))
-
-/* Test a return value of phase7_getuc whether it designates an UTF-16 or
- UTF-32 code point. */
-#define IS_UNICODE(p7_result) ((p7_result) >= 0x100)
-
-/* Extract the UTF-16 or UTF-32 code of a return value that satisfies
- IS_UNICODE. */
-#define UNICODE_VALUE(p7_result) ((p7_result) - 0x100)
+/* Free the memory pointed to by a 'struct token_ty'. */
+static inline void
+free_token (token_ty *tp)
+{
+ if (tp->type == token_type_name || tp->type == token_type_string_literal)
+ free (tp->string);
+ if (tp->type == token_type_string_literal
+ || tp->type == token_type_objc_special)
+ drop_reference (tp->comment);
+}
-static int
-phase7_getc ()
+static char *
+literalstring_parse (const char *string, lex_pos_ty *pos,
+ enum literalstring_escape_type type)
{
- int c, n, j;
+ struct mixed_string_buffer *bp;
+ const char *p;
- /* Use phase 3, because phase 4 elides comments. */
- c = phase3_getc ();
+ /* Start accumulating the string. */
+ bp = mixed_string_buffer_alloc (lc_string,
+ logical_file_name,
+ line_number);
- /* Return a magic newline indicator, so that we can distinguish
- between the user requesting a newline in the string (e.g. using
- "\n" or "\012") from the user failing to terminate the string or
- character constant. The ANSI C standard says: 3.1.3.4 Character
- Constants contain "any character except single quote, backslash or
- newline; or an escape sequence" and 3.1.4 String Literals contain
- "any character except double quote, backslash or newline; or an
- escape sequence".
-
- Most compilers give a fatal error in this case, however gcc is
- stupidly silent, even though this is a very common typo. OK, so
- "gcc --pedantic" will tell me, but that gripes about too much other
- stuff. Could I have a "gcc -Wnewline-in-string" option, or
- better yet a "gcc -fno-newline-in-string" option, please? Gcc is
- also inconsistent between string literals and character constants:
- you may not embed newlines in character constants; try it, you get
- a useful diagnostic. --PMiller */
- if (c == '\n')
- return P7_NEWLINE;
-
- if (c == '"')
- return P7_QUOTES;
- if (c == '\'')
- return P7_QUOTE;
- if (c != '\\')
- return c;
- c = phase3_getc ();
- switch (c)
+ for (p = string; *p != '\0'; p++)
{
- default:
- /* Unknown escape sequences really should be an error, but just
- ignore them, and let the real compiler complain. */
- phase3_ungetc (c);
- return '\\';
-
- case '"':
- case '\'':
- case '?':
- case '\\':
- return c;
+ int c;
- case 'a':
- return '\a';
- case 'b':
- return '\b';
-
- /* The \e escape is preculiar to gcc, and assumes an ASCII
- character set (or superset). We don't provide support for it
- here. */
-
- case 'f':
- return '\f';
- case 'n':
- return '\n';
- case 'r':
- return '\r';
- case 't':
- return '\t';
- case 'v':
- return '\v';
-
- case 'x':
- c = phase3_getc ();
- switch (c)
+ if (*p != '\\')
{
- default:
- phase3_ungetc (c);
- phase3_ungetc ('x');
- return '\\';
-
- case '0': case '1': case '2': case '3': case '4':
- case '5': case '6': case '7': case '8': case '9':
- case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
- case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
- break;
- }
- n = 0;
- for (;;)
- {
- switch (c)
- {
- default:
- phase3_ungetc (c);
- return n;
-
- case '0': case '1': case '2': case '3': case '4':
- case '5': case '6': case '7': case '8': case '9':
- n = n * 16 + c - '0';
- break;
-
- case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
- n = n * 16 + 10 + c - 'A';
- break;
-
- case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
- n = n * 16 + 10 + c - 'a';
- break;
- }
- c = phase3_getc ();
+ mixed_string_buffer_append_char (bp, *p);
+ continue;
}
- return n;
- case '0': case '1': case '2': case '3':
- case '4': case '5': case '6': case '7':
- n = 0;
- for (j = 0; j < 3; ++j)
+ if (!(type & LET_ANSI_C) && !(type & LET_UNICODE))
{
- n = n * 8 + c - '0';
- c = phase3_getc ();
- switch (c)
- {
- default:
- break;
-
- case '0': case '1': case '2': case '3':
- case '4': case '5': case '6': case '7':
- continue;
- }
- break;
+ mixed_string_buffer_append_char (bp, '\\');
+ continue;
}
- phase3_ungetc (c);
- return n;
- case 'U': case 'u':
- {
- unsigned char buf[8];
+ c = *++p;
- n = 0;
- for (j = 0; j < (c == 'u' ? 4 : 8); j++)
+ if (type & LET_ANSI_C)
+ switch (c)
{
- int c1 = phase3_getc ();
-
- if (c1 >= '0' && c1 <= '9')
- n = (n << 4) + (c1 - '0');
- else if (c1 >= 'A' && c1 <= 'F')
- n = (n << 4) + (c1 - 'A' + 10);
- else if (c1 >= 'a' && c1 <= 'f')
- n = (n << 4) + (c1 - 'a' + 10);
- else
+ case '"':
+ case '\'':
+ case '?':
+ case '\\':
+ mixed_string_buffer_append_char (bp, c);
+ continue;
+
+ case 'a':
+ mixed_string_buffer_append_char (bp, '\a');
+ continue;
+ case 'b':
+ mixed_string_buffer_append_char (bp, '\b');
+ continue;
+
+ /* The \e escape is peculiar to gcc, and assumes an ASCII
+ character set (or superset). We don't provide support for it
+ here. */
+
+ case 'f':
+ mixed_string_buffer_append_char (bp, '\f');
+ continue;
+ case 'n':
+ mixed_string_buffer_append_char (bp, '\n');
+ continue;
+ case 'r':
+ mixed_string_buffer_append_char (bp, '\r');
+ continue;
+ case 't':
+ mixed_string_buffer_append_char (bp, '\t');
+ continue;
+ case 'v':
+ mixed_string_buffer_append_char (bp, '\v');
+ continue;
+
+ case 'x':
+ c = *++p;
+ switch (c)
{
- phase3_ungetc (c1);
- while (--j >= 0)
- phase3_ungetc (buf[j]);
- phase3_ungetc (c);
- return '\\';
+ default:
+ mixed_string_buffer_append_char (bp, '\\');
+ mixed_string_buffer_append_char (bp, 'x');
+ mixed_string_buffer_append_char (bp, c);
+ break;
+
+ case '0': case '1': case '2': case '3': case '4':
+ case '5': case '6': case '7': case '8': case '9':
+ case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
+ case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
+ {
+ int n;
+
+ for (n = 0; ; ++p)
+ {
+ switch (*p)
+ {
+ default:
+ break;
+
+ case '0': case '1': case '2': case '3': case '4':
+ case '5': case '6': case '7': case '8': case '9':
+ n = n * 16 + *p - '0';
+ continue;
+
+ case 'A': case 'B': case 'C': case 'D': case 'E':
+ case 'F':
+ n = n * 16 + 10 + *p - 'A';
+ continue;
+
+ case 'a': case 'b': case 'c': case 'd': case 'e':
+ case 'f':
+ n = n * 16 + 10 + *p - 'a';
+ continue;
+ }
+ break;
+ }
+
+ mixed_string_buffer_append_char (bp, n);
+ --p;
+ }
+ break;
}
+ continue;
+
+ case '0': case '1': case '2': case '3':
+ case '4': case '5': case '6': case '7':
+ {
+ int n, j;
+
+ for (n = 0, j = 0; j < 3; ++j)
+ {
+ n = n * 8 + c - '0';
+ switch (*++p)
+ {
+ default:
+ break;
+
+ case '0': case '1': case '2': case '3':
+ case '4': case '5': case '6': case '7':
+ continue;
+ }
+ break;
+ }
- buf[j] = c1;
+ mixed_string_buffer_append_char (bp, n);
+ --p;
+ }
+ continue;
}
- if (n < 0x110000)
- return UNICODE (n);
+ if (type & LET_UNICODE)
+ switch (c)
+ {
+ case 'U': case 'u':
+ {
+ unsigned char buf[8];
+ int length = c == 'u' ? 4 : 8;
+ int n, j;
- error_with_progname = false;
- error (0, 0, _("%s:%d: warning: invalid Unicode character"),
- logical_file_name, line_number);
- error_with_progname = true;
+ for (n = 0, j = 0; j < length; j++)
+ {
+ int c1 = *++p;
+
+ if (c1 >= '0' && c1 <= '9')
+ n = (n << 4) + (c1 - '0');
+ else if (c1 >= 'A' && c1 <= 'F')
+ n = (n << 4) + (c1 - 'A' + 10);
+ else if (c1 >= 'a' && c1 <= 'f')
+ n = (n << 4) + (c1 - 'a' + 10);
+ else
+ break;
+
+ buf[j] = c1;
+ }
- while (--j >= 0)
- phase3_ungetc (buf[j]);
- phase3_ungetc (c);
- return '\\';
- }
- }
-}
+ if (j == length)
+ {
+ if (n < 0x110000)
+ mixed_string_buffer_append_unicode (bp, n);
+ else
+ {
+ error_with_progname = false;
+ error_at_line (0, 0,
+ pos->file_name, pos->line_number,
+ _("\
+warning: invalid Unicode character"));
+ error_with_progname = true;
+ }
+ }
+ else
+ {
+ int i;
+ mixed_string_buffer_append_char (bp, '\\');
+ mixed_string_buffer_append_char (bp, c);
-static void
-phase7_ungetc (int c)
-{
- phase3_ungetc (c);
-}
+ for (i = 0; i < j; i++)
+ mixed_string_buffer_append_char (bp, buf[i]);
+ --p;
+ }
+ }
+ continue;
+ }
-/* Free the memory pointed to by a 'struct token_ty'. */
-static inline void
-free_token (token_ty *tp)
-{
- if (tp->type == token_type_name || tp->type == token_type_string_literal)
- free (tp->string);
- if (tp->type == token_type_string_literal
- || tp->type == token_type_objc_special)
- drop_reference (tp->comment);
+ mixed_string_buffer_append_char (bp, c);
+ }
+
+ return mixed_string_buffer_done (bp);
}
+struct literalstring_parser literalstring_c =
+ {
+ literalstring_parse
+ };
+
/* 5. Parse each resulting logical line as preprocessing tokens and
white space. Preprocessing tokens and C tokens don't always match. */
@@ -1097,6 +1095,7 @@ phase5_get (token_ty *tp)
static int bufmax;
int bufpos;
int c;
+ int last_was_backslash;
if (phase5_pushback_length)
{
@@ -1276,19 +1275,30 @@ phase5_get (token_ty *tp)
but ignoring it has no effect unless one of the keywords is
"L". Just pretend it won't happen. Also, we don't need to
remember the character constant. */
+ last_was_backslash = false;
for (;;)
{
- c = phase7_getc ();
- if (c == P7_NEWLINE)
+ c = phase3_getc ();
+ if (last_was_backslash)
+ {
+ last_was_backslash = false;
+ continue;
+ }
+ switch (c)
{
+ case '\\':
+ last_was_backslash = true;
+ continue;
+ case '\n':
error_with_progname = false;
error (0, 0, _("%s:%d: warning: unterminated character constant"),
logical_file_name, line_number - 1);
error_with_progname = true;
- phase7_ungetc ('\n');
+ phase3_ungetc ('\n');
+ break;
+ case EOF: case '\'':
break;
}
- if (c == EOF || c == P7_QUOTE)
break;
}
tp->type = token_type_character_constant;
@@ -1296,49 +1306,55 @@ phase5_get (token_ty *tp)
case '"':
{
- struct mixed_string_buffer *bp;
-
- /* Start accumulating the string. */
- bp = mixed_string_buffer_alloc (lc_string,
- logical_file_name,
- line_number);
-
/* We could worry about the 'L' before wide string constants,
but since gettext's argument is not a wide character string,
let the compiler complain about the argument not matching the
prototype. Just pretend it won't happen. */
+ last_was_backslash = false;
+ bufpos = 0;
for (;;)
{
- c = phase7_getc ();
-
- /* Keep line_number in sync. */
- bp->line_number = line_number;
-
- if (c == P7_NEWLINE)
+ c = phase3_getc ();
+ if (last_was_backslash)
{
+ last_was_backslash = false;
+ if (bufpos >= bufmax)
+ {
+ bufmax = 2 * bufmax + 10;
+ buffer = xrealloc (buffer, bufmax);
+ }
+ buffer[bufpos++] = c;
+ continue;
+ }
+ switch (c)
+ {
+ case '\\':
+ last_was_backslash = true;
+ /* FALLTHROUGH */
+ default:
+ if (bufpos >= bufmax)
+ {
+ bufmax = 2 * bufmax + 10;
+ buffer = xrealloc (buffer, bufmax);
+ }
+ buffer[bufpos++] = c;
+ continue;
+
+ case '\n':
error_with_progname = false;
error (0, 0, _("%s:%d: warning: unterminated string literal"),
logical_file_name, line_number - 1);
error_with_progname = true;
- phase7_ungetc ('\n');
+ phase3_ungetc ('\n');
+ break;
+ case EOF: case '"':
break;
}
- if (c == EOF || c == P7_QUOTES)
- break;
- if (c == P7_QUOTE)
- c = '\'';
- if (IS_UNICODE (c))
- {
- assert (UNICODE_VALUE (c) >= 0
- && UNICODE_VALUE (c) < 0x110000);
- mixed_string_buffer_append_unicode (bp,
- UNICODE_VALUE (c));
- }
- else
- mixed_string_buffer_append_char (bp, c);
+ break;
}
+ buffer[bufpos] = 0;
tp->type = token_type_string_literal;
- tp->string = mixed_string_buffer_done (bp);
+ tp->string = xstrdup (buffer);
tp->comment = add_reference (savable_comment);
return;
}
@@ -1914,10 +1930,7 @@ extract_parenthesized (message_list_ty *mlp,
arglist_parser_alloc (mlp,
state ? next_shapes
: NULL)))
{
- xgettext_current_source_encoding = po_charset_utf8;
arglist_parser_done (argparser, arg);
- xgettext_current_source_encoding =
- xgettext_global_source_encoding;
return true;
}
next_context_iter = null_context_list_iterator;
@@ -1926,9 +1939,7 @@ extract_parenthesized (message_list_ty *mlp,
continue;
case xgettext_token_type_rparen:
- xgettext_current_source_encoding = po_charset_utf8;
arglist_parser_done (argparser, arg);
- xgettext_current_source_encoding = xgettext_global_source_encoding;
return false;
case xgettext_token_type_comma:
@@ -1962,20 +1973,32 @@ extract_parenthesized (message_list_ty *mlp,
continue;
case xgettext_token_type_string_literal:
- xgettext_current_source_encoding = po_charset_utf8;
- if (extract_all)
- remember_a_message (mlp, NULL, token.string, inner_context,
- &token.pos, NULL, token.comment);
- else
- arglist_parser_remember (argparser, arg, token.string,
- inner_context,
- token.pos.file_name, token.pos.line_number,
- token.comment);
- xgettext_current_source_encoding = xgettext_global_source_encoding;
- drop_reference (token.comment);
- next_context_iter = null_context_list_iterator;
- selectorcall_context_iter = null_context_list_iterator;
- state = 0;
+ {
+ if (extract_all)
+ {
+ char *string = literalstring_parse (token.string, &token.pos,
+ LET_ANSI_C | LET_UNICODE);
+ free (token.string);
+
+ /* STRING is already in UTF-8. Prevent further conversion. */
+ xgettext_current_source_encoding = po_charset_utf8;
+ remember_a_message (mlp, NULL, string, inner_context,
+ &token.pos, NULL, token.comment);
+ xgettext_current_source_encoding =
+ xgettext_global_source_encoding;
+ }
+ else
+ arglist_parser_remember_literal (argparser, arg, token.string,
+ inner_context,
+ token.pos.file_name,
+ token.pos.line_number,
+ token.comment,
+ LET_ANSI_C | LET_UNICODE);
+ drop_reference (token.comment);
+ next_context_iter = null_context_list_iterator;
+ selectorcall_context_iter = null_context_list_iterator;
+ state = 0;
+ }
continue;
case xgettext_token_type_other:
@@ -1985,9 +2008,7 @@ extract_parenthesized (message_list_ty *mlp,
continue;
case xgettext_token_type_eof:
- xgettext_current_source_encoding = po_charset_utf8;
arglist_parser_done (argparser, arg);
- xgettext_current_source_encoding = xgettext_global_source_encoding;
return true;
default:
diff --git a/gettext-tools/src/x-c.h b/gettext-tools/src/x-c.h
index 28c5b92..8c8bd0d 100644
--- a/gettext-tools/src/x-c.h
+++ b/gettext-tools/src/x-c.h
@@ -43,16 +43,20 @@ extern "C" {
#define SCANNERS_C \
{ "C", extract_c, \
&flag_table_c, \
- &formatstring_c, NULL, NULL }, \
+ &formatstring_c, NULL, \
+ &literalstring_c }, \
{ "C++", extract_c, \
&flag_table_c, \
- &formatstring_c, NULL, NULL }, \
+ &formatstring_c, NULL, \
+ &literalstring_c }, \
{ "ObjectiveC", extract_objc, \
&flag_table_objc, \
- &formatstring_c, &formatstring_objc, NULL }, \
+ &formatstring_c, &formatstring_objc, \
+ &literalstring_c }, \
{ "GCC-source", extract_c, \
&flag_table_gcc_internal, \
- &formatstring_gcc_internal, &formatstring_gfc_internal, NULL }, \
+ &formatstring_gcc_internal, &formatstring_gfc_internal, \
+ &literalstring_c }, \
/* Scan a C/C++ file and add its translatable strings to mdlp. */
extern void extract_c (FILE *fp, const char *real_filename,
@@ -80,6 +84,8 @@ extern void init_flag_table_objc (void);
extern void init_flag_table_gcc_internal (void);
+extern DLL_VARIABLE struct literalstring_parser literalstring_c;
+
#ifdef __cplusplus
}
#endif
--
1.9.0