[bug-gettext] [PATCH 2/2] c: Interpret string literals lazily

bug-gettext
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[bug-gettext] [PATCH 2/2] c: Interpret string literals lazily

From:	Daiki Ueno
Subject:	[bug-gettext] [PATCH 2/2] c: Interpret string literals lazily
Date:	Thu, 8 May 2014 18:30:16 +0900
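* x-c.c (phase7_getc, phase7_ungetc): Remove.
(P7_EOF, P7_STRING_END, P7_QUOTES, P7_QUOTE, P7_NEWLINE, UNICODE,
IS_UNICODE, UNICODE_VALUE): Remove.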
(phase5_get): Use 'phase3_getc' directly to extract string
literals, leaving escape sequences uninterpreted.
(extract_parenthesized): Use 'arglist_parser_remember_literal'
instead of 'arglist_parser_remember'.
(literalstring_parse): New function.
(literalstring_c): New variable.

* x-c.h (SCANNERS_C): Register 'literalstring_c' as a
literalstring_parser.
(literalstring_c): New declaration.
---
 gettext-tools/src/x-c.c | 515 +++++++++++++++++++++++++-----------------------
 gettext-tools/src/x-c.h |  14 +-
 2 files changed, 278 insertions(+), 251 deletions(-)

diff --git a/gettext-tools/src/x-c.c b/gettext-tools/src/x-c.c
index 9050433..374fa50 100644
--- a/gettext-tools/src/x-c.c
+++ b/gettext-tools/src/x-c.c
@@ -860,228 +860,226 @@ struct token_ty
 };
 
 
-/* 7. Replace escape sequences within character strings with their
-   single character equivalents.  This is called from phase 5, because
-   we don't have to worry about the #include argument.  There are
-   pathological cases which could bite us (like the DOS directory
-   separator), but just pretend it can't happen.  */
-
-/* Return value of phase7_getc when EOF is reached.  */
-#define P7_EOF (-1)
-#define P7_STRING_END (-2)
-
-/* Replace escape sequences within character strings with their single
-   character equivalents.  */
-#define P7_QUOTES (-3)
-#define P7_QUOTE (-4)
-#define P7_NEWLINE (-5)
-
-/* Convert an UTF-16 or UTF-32 code point to a return value that can be
-   distinguished from a single-byte return value.  */
-#define UNICODE(code) (0x100 + (code))
-
-/* Test a return value of phase7_getuc whether it designates an UTF-16 or
-   UTF-32 code point.  */
-#define IS_UNICODE(p7_result) ((p7_result) >= 0x100)
-
-/* Extract the UTF-16 or UTF-32 code of a return value that satisfies
-   IS_UNICODE.  */
-#define UNICODE_VALUE(p7_result) ((p7_result) - 0x100)
+/* Free the memory pointed to by a 'struct token_ty'.  */
+static inline void
+free_token (token_ty *tp)
+{
+  if (tp->type == token_type_name || tp->type == token_type_string_literal)
+    free (tp->string);
+  if (tp->type == token_type_string_literal
+      || tp->type == token_type_objc_special)
+    drop_reference (tp->comment);
+}
 
 
-static int
-phase7_getc ()
+static char *
+literalstring_parse (const char *string, lex_pos_ty *pos,
+                     enum literalstring_escape_type type)
 {
-  int c, n, j;
+  struct mixed_string_buffer *bp;
+  const char *p;
 
-  /* Use phase 3, because phase 4 elides comments.  */
-  c = phase3_getc ();
+  /* Start accumulating the string.  */
+  bp = mixed_string_buffer_alloc (lc_string,
+                                  logical_file_name,
+                                  line_number);
 
-  /* Return a magic newline indicator, so that we can distinguish
-     between the user requesting a newline in the string (e.g. using
-     "\n" or "\012") from the user failing to terminate the string or
-     character constant.  The ANSI C standard says: 3.1.3.4 Character
-     Constants contain "any character except single quote, backslash or
-     newline; or an escape sequence" and 3.1.4 String Literals contain
-     "any character except double quote, backslash or newline; or an
-     escape sequence".
-
-     Most compilers give a fatal error in this case, however gcc is
-     stupidly silent, even though this is a very common typo.  OK, so
-     "gcc --pedantic" will tell me, but that gripes about too much other
-     stuff.  Could I have a "gcc -Wnewline-in-string" option, or
-     better yet a "gcc -fno-newline-in-string" option, please?  Gcc is
-     also inconsistent between string literals and character constants:
-     you may not embed newlines in character constants; try it, you get
-     a useful diagnostic.  --PMiller  */
-  if (c == '\n')
-    return P7_NEWLINE;
-
-  if (c == '"')
-    return P7_QUOTES;
-  if (c == '\'')
-    return P7_QUOTE;
-  if (c != '\\')
-    return c;
-  c = phase3_getc ();
-  switch (c)
+  for (p = string; *p != '\0'; p++)
     {
-    default:
-      /* Unknown escape sequences really should be an error, but just
-         ignore them, and let the real compiler complain.  */
-      phase3_ungetc (c);
-      return '\\';
-
-    case '"':
-    case '\'':
-    case '?':
-    case '\\':
-      return c;
+      int c;
 
-    case 'a':
-      return '\a';
-    case 'b':
-      return '\b';
-
-      /* The \e escape is preculiar to gcc, and assumes an ASCII
-         character set (or superset).  We don't provide support for it
-         here.  */
-
-    case 'f':
-      return '\f';
-    case 'n':
-      return '\n';
-    case 'r':
-      return '\r';
-    case 't':
-      return '\t';
-    case 'v':
-      return '\v';
-
-    case 'x':
-      c = phase3_getc ();
-      switch (c)
+      if (*p != '\\')
         {
-        default:
-          phase3_ungetc (c);
-          phase3_ungetc ('x');
-          return '\\';
-
-        case '0': case '1': case '2': case '3': case '4':
-        case '5': case '6': case '7': case '8': case '9':
-        case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
-        case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
-          break;
-        }
-      n = 0;
-      for (;;)
-        {
-          switch (c)
-            {
-            default:
-              phase3_ungetc (c);
-              return n;
-
-            case '0': case '1': case '2': case '3': case '4':
-            case '5': case '6': case '7': case '8': case '9':
-              n = n * 16 + c - '0';
-              break;
-
-            case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
-              n = n * 16 + 10 + c - 'A';
-              break;
-
-            case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
-              n = n * 16 + 10 + c - 'a';
-              break;
-            }
-          c = phase3_getc ();
+          mixed_string_buffer_append_char (bp, *p);
+          continue;
         }
-      return n;
 
-    case '0': case '1': case '2': case '3':
-    case '4': case '5': case '6': case '7':
-      n = 0;
-      for (j = 0; j < 3; ++j)
+      if (!(type & LET_ANSI_C) && !(type & LET_UNICODE))
         {
-          n = n * 8 + c - '0';
-          c = phase3_getc ();
-          switch (c)
-            {
-            default:
-              break;
-
-            case '0': case '1': case '2': case '3':
-            case '4': case '5': case '6': case '7':
-              continue;
-            }
-          break;
+          mixed_string_buffer_append_char (bp, '\\');
+          continue;
         }
-      phase3_ungetc (c);
-      return n;
 
-    case 'U': case 'u':
-      {
-        unsigned char buf[8];
+      c = *++p;
 
-        n = 0;
-        for (j = 0; j < (c == 'u' ? 4 : 8); j++)
+      if (type & LET_ANSI_C)
+        switch (c)
           {
-            int c1 = phase3_getc ();
-
-            if (c1 >= '0' && c1 <= '9')
-              n = (n << 4) + (c1 - '0');
-            else if (c1 >= 'A' && c1 <= 'F')
-              n = (n << 4) + (c1 - 'A' + 10);
-            else if (c1 >= 'a' && c1 <= 'f')
-              n = (n << 4) + (c1 - 'a' + 10);
-            else
+          case '"':
+          case '\'':
+          case '?':
+          case '\\':
+            mixed_string_buffer_append_char (bp, c);
+            continue;
+
+          case 'a':
+            mixed_string_buffer_append_char (bp, '\a');
+            continue;
+          case 'b':
+            mixed_string_buffer_append_char (bp, '\b');
+            continue;
+
+            /* The \e escape is peculiar to gcc, and assumes an ASCII
+               character set (or superset).  We don't provide support for it
+               here.  */
+
+          case 'f':
+            mixed_string_buffer_append_char (bp, '\f');
+            continue;
+          case 'n':
+            mixed_string_buffer_append_char (bp, '\n');
+            continue;
+          case 'r':
+            mixed_string_buffer_append_char (bp, '\r');
+            continue;
+          case 't':
+            mixed_string_buffer_append_char (bp, '\t');
+            continue;
+          case 'v':
+            mixed_string_buffer_append_char (bp, '\v');
+            continue;
+
+          case 'x':
+            c = *++p;
+            switch (c)
               {
-                phase3_ungetc (c1);
-                while (--j >= 0)
-                  phase3_ungetc (buf[j]);
-                phase3_ungetc (c);
-                return '\\';
+              default:
+                mixed_string_buffer_append_char (bp, '\\');
+                mixed_string_buffer_append_char (bp, 'x');
+                mixed_string_buffer_append_char (bp, c);
+                break;
+
+              case '0': case '1': case '2': case '3': case '4':
+              case '5': case '6': case '7': case '8': case '9':
+              case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
+              case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
+                {
+                  int n;
+
+                  for (n = 0; ; ++p)
+                    {
+                      switch (*p)
+                        {
+                        default:
+                          break;
+
+                        case '0': case '1': case '2': case '3': case '4':
+                        case '5': case '6': case '7': case '8': case '9':
+                          n = n * 16 + *p - '0';
+                          continue;
+
+                        case 'A': case 'B': case 'C': case 'D': case 'E':
+                        case 'F':
+                          n = n * 16 + 10 + *p - 'A';
+                          continue;
+
+                        case 'a': case 'b': case 'c': case 'd': case 'e':
+                        case 'f':
+                          n = n * 16 + 10 + *p - 'a';
+                          continue;
+                        }
+                      break;
+                    }
+
+                  mixed_string_buffer_append_char (bp, n);
+                  --p;
+                }
+                break;
               }
+            continue;
+
+          case '0': case '1': case '2': case '3':
+          case '4': case '5': case '6': case '7':
+            {
+              int n, j;
+
+              for (n = 0, j = 0; j < 3; ++j)
+                {
+                  n = n * 8 + c - '0';
+                  switch (*++p)
+                    {
+                    default:
+                      break;
+
+                    case '0': case '1': case '2': case '3':
+                    case '4': case '5': case '6': case '7':
+                      continue;
+                    }
+                  break;
+                }
 
-            buf[j] = c1;
+              mixed_string_buffer_append_char (bp, n);
+              --p;
+            }
+            continue;
           }
 
-        if (n < 0x110000)
-          return UNICODE (n);
+      if (type & LET_UNICODE)
+        switch (c)
+          {
+          case 'U': case 'u':
+            {
+              unsigned char buf[8];
+              int length = c == 'u' ? 4 : 8;
+              int n, j;
 
-        error_with_progname = false;
-        error (0, 0, _("%s:%d: warning: invalid Unicode character"),
-               logical_file_name, line_number);
-        error_with_progname = true;
+              for (n = 0, j = 0; j < length; j++)
+                {
+                  int c1 = *++p;
+
+                  if (c1 >= '0' && c1 <= '9')
+                    n = (n << 4) + (c1 - '0');
+                  else if (c1 >= 'A' && c1 <= 'F')
+                    n = (n << 4) + (c1 - 'A' + 10);
+                  else if (c1 >= 'a' && c1 <= 'f')
+                    n = (n << 4) + (c1 - 'a' + 10);
+                  else
+                    break;
+
+                  buf[j] = c1;
+                }
 
-        while (--j >= 0)
-          phase3_ungetc (buf[j]);
-        phase3_ungetc (c);
-        return '\\';
-      }
-    }
-}
+              if (j == length)
+                {
+                  if (n < 0x110000)
+                    mixed_string_buffer_append_unicode (bp, n);
+                  else
+                    {
+                      error_with_progname = false;
+                      error_at_line (0, 0,
+                                     pos->file_name, pos->line_number,
+                                     _("\
+warning: invalid Unicode character"));
+                      error_with_progname = true;
+                    }
+                }
+              else
+                {
+                  int i;
 
+                  mixed_string_buffer_append_char (bp, '\\');
+                  mixed_string_buffer_append_char (bp, c);
 
-static void
-phase7_ungetc (int c)
-{
-  phase3_ungetc (c);
-}
+                  for (i = 0; i < j; i++)
+                    mixed_string_buffer_append_char (bp, buf[i]);
 
+                  --p;
+                }
+            }
+            continue;
+          }
 
-/* Free the memory pointed to by a 'struct token_ty'.  */
-static inline void
-free_token (token_ty *tp)
-{
-  if (tp->type == token_type_name || tp->type == token_type_string_literal)
-    free (tp->string);
-  if (tp->type == token_type_string_literal
-      || tp->type == token_type_objc_special)
-    drop_reference (tp->comment);
+      mixed_string_buffer_append_char (bp, c);
+    }
+
+  return mixed_string_buffer_done (bp);
 }
 
+struct literalstring_parser literalstring_c =
+  {
+    literalstring_parse
+  };
+
 
 /* 5. Parse each resulting logical line as preprocessing tokens and
    white space.  Preprocessing tokens and C tokens don't always match.  */
@@ -1097,6 +1095,7 @@ phase5_get (token_ty *tp)
   static int bufmax;
   int bufpos;
   int c;
+  int last_was_backslash;
 
   if (phase5_pushback_length)
     {
@@ -1276,19 +1275,30 @@ phase5_get (token_ty *tp)
          but ignoring it has no effect unless one of the keywords is
          "L".  Just pretend it won't happen.  Also, we don't need to
          remember the character constant.  */
+      last_was_backslash = false;
       for (;;)
         {
-          c = phase7_getc ();
-          if (c == P7_NEWLINE)
+          c = phase3_getc ();
+          if (last_was_backslash)
+            {
+              last_was_backslash = false;
+              continue;
+            }
+          switch (c)
             {
+            case '\\':
+              last_was_backslash = true;
+              continue;
+            case '\n':
               error_with_progname = false;
               error (0, 0, _("%s:%d: warning: unterminated character constant"),
                      logical_file_name, line_number - 1);
               error_with_progname = true;
-              phase7_ungetc ('\n');
+              phase3_ungetc ('\n');
+              break;
+            case EOF: case '\'':
               break;
             }
-          if (c == EOF || c == P7_QUOTE)
             break;
         }
       tp->type = token_type_character_constant;
@@ -1296,49 +1306,55 @@ phase5_get (token_ty *tp)
 
     case '"':
       {
-        struct mixed_string_buffer *bp;
-
-        /* Start accumulating the string.  */
-        bp = mixed_string_buffer_alloc (lc_string,
-                                        logical_file_name,
-                                        line_number);
-
         /* We could worry about the 'L' before wide string constants,
            but since gettext's argument is not a wide character string,
            let the compiler complain about the argument not matching the
            prototype.  Just pretend it won't happen.  */
+        last_was_backslash = false;
+        bufpos = 0;
         for (;;)
           {
-            c = phase7_getc ();
-
-            /* Keep line_number in sync.  */
-            bp->line_number = line_number;
-
-            if (c == P7_NEWLINE)
+            c = phase3_getc ();
+            if (last_was_backslash)
               {
+                last_was_backslash = false;
+                if (bufpos >= bufmax)
+                  {
+                    bufmax = 2 * bufmax + 10;
+                    buffer = xrealloc (buffer, bufmax);
+                  }
+                buffer[bufpos++] = c;
+                continue;
+              }
+            switch (c)
+              {
+              case '\\':
+                last_was_backslash = true;
+                /* FALLTHROUGH */
+              default:
+                if (bufpos >= bufmax)
+                  {
+                    bufmax = 2 * bufmax + 10;
+                    buffer = xrealloc (buffer, bufmax);
+                  }
+                buffer[bufpos++] = c;
+                continue;
+
+              case '\n':
                 error_with_progname = false;
                 error (0, 0, _("%s:%d: warning: unterminated string literal"),
                        logical_file_name, line_number - 1);
                 error_with_progname = true;
-                phase7_ungetc ('\n');
+                phase3_ungetc ('\n');
+                break;
+              case EOF: case '"':
                 break;
               }
-            if (c == EOF || c == P7_QUOTES)
-              break;
-            if (c == P7_QUOTE)
-              c = '\'';
-            if (IS_UNICODE (c))
-              {
-                assert (UNICODE_VALUE (c) >= 0
-                        && UNICODE_VALUE (c) < 0x110000);
-                mixed_string_buffer_append_unicode (bp,
-                                                    UNICODE_VALUE (c));
-              }
-            else
-              mixed_string_buffer_append_char (bp, c);
+            break;
           }
+        buffer[bufpos] = 0;
         tp->type = token_type_string_literal;
-        tp->string = mixed_string_buffer_done (bp);
+        tp->string = xstrdup (buffer);
         tp->comment = add_reference (savable_comment);
         return;
       }
@@ -1914,10 +1930,7 @@ extract_parenthesized (message_list_ty *mlp,
                                      arglist_parser_alloc (mlp,
                                                            state ? next_shapes : NULL)))
             {
-              xgettext_current_source_encoding = po_charset_utf8;
               arglist_parser_done (argparser, arg);
-              xgettext_current_source_encoding =
-                xgettext_global_source_encoding;
               return true;
             }
           next_context_iter = null_context_list_iterator;
@@ -1926,9 +1939,7 @@ extract_parenthesized (message_list_ty *mlp,
           continue;
 
         case xgettext_token_type_rparen:
-          xgettext_current_source_encoding = po_charset_utf8;
           arglist_parser_done (argparser, arg);
-          xgettext_current_source_encoding = xgettext_global_source_encoding;
           return false;
 
         case xgettext_token_type_comma:
@@ -1962,20 +1973,32 @@ extract_parenthesized (message_list_ty *mlp,
           continue;
 
         case xgettext_token_type_string_literal:
-          xgettext_current_source_encoding = po_charset_utf8;
-          if (extract_all)
-            remember_a_message (mlp, NULL, token.string, inner_context,
-                                &token.pos, NULL, token.comment);
-          else
-            arglist_parser_remember (argparser, arg, token.string,
-                                     inner_context,
-                                     token.pos.file_name, token.pos.line_number,
-                                     token.comment);
-          xgettext_current_source_encoding = xgettext_global_source_encoding;
-          drop_reference (token.comment);
-          next_context_iter = null_context_list_iterator;
-          selectorcall_context_iter = null_context_list_iterator;
-          state = 0;
+          {
+            if (extract_all)
+              {
+                char *string = literalstring_parse (token.string, &token.pos,
+                                                    LET_ANSI_C | LET_UNICODE);
+                free (token.string);
+
+                /* STRING is already in UTF-8.  Prevent further conversion.  */
+                xgettext_current_source_encoding = po_charset_utf8;
+                remember_a_message (mlp, NULL, string, inner_context,
+                                    &token.pos, NULL, token.comment);
+                xgettext_current_source_encoding =
+                  xgettext_global_source_encoding;
+              }
+            else
+              arglist_parser_remember_literal (argparser, arg, token.string,
+                                               inner_context,
+                                               token.pos.file_name,
+                                               token.pos.line_number,
+                                               token.comment,
+                                               LET_ANSI_C | LET_UNICODE);
+            drop_reference (token.comment);
+            next_context_iter = null_context_list_iterator;
+            selectorcall_context_iter = null_context_list_iterator;
+            state = 0;
+          }
           continue;
 
         case xgettext_token_type_other:
@@ -1985,9 +2008,7 @@ extract_parenthesized (message_list_ty *mlp,
           continue;
 
         case xgettext_token_type_eof:
-          xgettext_current_source_encoding = po_charset_utf8;
           arglist_parser_done (argparser, arg);
-          xgettext_current_source_encoding = xgettext_global_source_encoding;
           return true;
 
         default:
diff --git a/gettext-tools/src/x-c.h b/gettext-tools/src/x-c.h
index 28c5b92..8c8bd0d 100644
--- a/gettext-tools/src/x-c.h
+++ b/gettext-tools/src/x-c.h
@@ -43,16 +43,20 @@ extern "C" {
 #define SCANNERS_C \
   { "C",                extract_c,                                      \
                         &flag_table_c,                                  \
-                        &formatstring_c, NULL, NULL },                        \
+                        &formatstring_c, NULL,                          \
+                        &literalstring_c },                             \
   { "C++",              extract_c,                                      \
                         &flag_table_c,                                  \
-                        &formatstring_c, NULL, NULL },                        \
+                        &formatstring_c, NULL,                          \
+                        &literalstring_c },                             \
   { "ObjectiveC",       extract_objc,                                   \
                         &flag_table_objc,                               \
-                        &formatstring_c, &formatstring_objc, NULL },          \
+                        &formatstring_c, &formatstring_objc,            \
+                        &literalstring_c },                             \
   { "GCC-source",       extract_c,                                      \
                         &flag_table_gcc_internal,                       \
-                        &formatstring_gcc_internal, &formatstring_gfc_internal, NULL }, \
+                        &formatstring_gcc_internal, &formatstring_gfc_internal, \
+                        &literalstring_c },                             \
 
 /* Scan a C/C++ file and add its translatable strings to mdlp.  */
 extern void extract_c (FILE *fp, const char *real_filename,
@@ -80,6 +84,8 @@ extern void init_flag_table_objc (void);
 extern void init_flag_table_gcc_internal (void);
 
 
+extern DLL_VARIABLE struct literalstring_parser literalstring_c;
+
 #ifdef __cplusplus
 }
 #endif
-- 
1.9.0
[Prev in Thread]
Current Thread
[Next in Thread]
[bug-gettext] [PATCH 0/2] xgettext: Delay interpretation of escape sequence, Daiki Ueno, 2014/05/08
- [bug-gettext] [PATCH 1/2] xgettext: Provide a way to interpret string literals lazily, Daiki Ueno, 2014/05/08
- [bug-gettext] [PATCH 2/2] c: Interpret string literals lazily, Daiki Ueno <=
Prev by Date: [bug-gettext] [PATCH 1/2] xgettext: Provide a way to interpret string literals lazily
Next by Date: [bug-gettext] Pre-release gettext-0.19-rc1
Previous by thread: [bug-gettext] [PATCH 1/2] xgettext: Provide a way to interpret string literals lazily
Next by thread: [bug-gettext] Pre-release gettext-0.19-rc1
Index(es):
- Date
- Thread