[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[PATCH 4/8] parsers: don't double escape tnames
From: Akim Demaille
Subject: [PATCH 4/8] parsers: don't double escape tnames
Date: Sat, 29 Dec 2018 17:30:23 +0100
Until now, we escaped the token aliases before saving them into
yytname. As a consequence, we had to introduce yytnamerr to strip this
escaping when "it's useless".
Unfortunately, our escaping is too aggressive: for instance, it
destroys UTF-8 symbols (foreign languages, mathematical symbols, etc.).
Let's stop quoting these symbols.
* src/output.c (prepare_symbols): Don't escape symbols twice.
* data/skeletons/glr.c, data/skeletons/lalr1.cc,
* data/skeletons/lalr1.d, data/skeletons/lalr1.java,
* data/skeletons/yacc.c (yytnamerr): Don't de-quote these symbols.
* tests/javapush.at, tests/regression.at: Adjust expectations.
* tests/regression.at (Token definitions): Make sure we preserve
non-ASCII symbols.
---
data/skeletons/glr.c | 71 ++++++++++-----------------------------
data/skeletons/lalr1.cc | 29 +---------------
data/skeletons/lalr1.d | 31 ++---------------
data/skeletons/lalr1.java | 34 +++----------------
data/skeletons/yacc.c | 39 ++-------------------
src/output.c | 5 ++-
tests/javapush.at | 64 +++++++++++++++++------------------
tests/regression.at | 18 ++++------
8 files changed, 69 insertions(+), 222 deletions(-)
diff --git a/data/skeletons/glr.c b/data/skeletons/glr.c
index ef26c391..02438887 100644
--- a/data/skeletons/glr.c
+++ b/data/skeletons/glr.c
@@ -534,6 +534,20 @@ static void yypdumpstack (struct yyGLRStack* yystackp)
} while (0)
#endif
+/** Grammar symbol */
+typedef int yySymbol;
+
+#if ]b4_api_PREFIX[DEBUG || YYERROR_VERBOSE
+/** A printable representation of TOKEN. */
+static inline const char*
+yytokenName (yySymbol yytoken)
+{
+ if (yytoken == YYEMPTY)
+ return "";
+ else
+ return yytname[yytoken];
+}
+#endif
#if YYERROR_VERBOSE
@@ -558,50 +572,14 @@ yystpcpy (char *yydest, const char *yysrc)
# endif
# ifndef yytnamerr
-/* Copy to YYRES the name of YYTOKEN after stripping away unnecessary
- quotes and backslashes, so that it's suitable for yyerror. The
- heuristic is that double-quoting is unnecessary unless the string
- contains an apostrophe, a comma, or backslash (other than
- backslash-backslash). YYSTR is taken from yytname. If YYRES is
- null, do not copy; instead, return the length of what the result
- would have been. */
+/* Copy to YYRES the name of YYTOKEN. If YYRES is null, do not copy;
+ instead, return the length of what the result would have been. */
static size_t
yytnamerr (char *yyres, int yytoken)
{
- const char *yystr = yytname[yytoken];
- if (*yystr == '"')
- {
- size_t yyn = 0;
- char const *yyp = yystr;
-
- for (;;)
- switch (*++yyp)
- {
- case '\'':
- case ',':
- goto do_not_strip_quotes;
-
- case '\\':
- if (*++yyp != '\\')
- goto do_not_strip_quotes;
- /* Fall through. */
- default:
- if (yyres)
- yyres[yyn] = *yyp;
- yyn++;
- break;
-
- case '"':
- if (yyres)
- yyres[yyn] = '\0';
- return yyn;
- }
- do_not_strip_quotes: ;
- }
-
+ const char *yystr = yytokenName (yytoken);
if (! yyres)
return strlen (yystr);
-
return (size_t) (yystpcpy (yyres, yystr) - yyres);
}
# endif
@@ -614,9 +592,6 @@ typedef int yyStateNum;
/** Rule numbers, as in LALR(1) machine */
typedef int yyRuleNum;
-/** Grammar symbol */
-typedef int yySymbol;
-
/** Item references, as in LALR(1) machine */
typedef short yyItemNum;
@@ -721,18 +696,6 @@ yyMemoryExhausted (yyGLRStack* yystackp)
YYLONGJMP (yystackp->yyexception_buffer, 2);
}
-#if ]b4_api_PREFIX[DEBUG || YYERROR_VERBOSE
-/** A printable representation of TOKEN. */
-static inline const char*
-yytokenName (yySymbol yytoken)
-{
- if (yytoken == YYEMPTY)
- return "";
- else
- return yytname[yytoken];
-}
-#endif
-
/** Fill in YYVSP[YYLOW1 .. YYLOW0-1] from the chain of states starting
* at YYVSP[YYLOW0].yystate.yypred. Leaves YYVSP[YYLOW1].yystate.yypred
* containing the pointer to the next state in the chain. */
diff --git a/data/skeletons/lalr1.cc b/data/skeletons/lalr1.cc
index 5a6091b1..7944c4e7 100644
--- a/data/skeletons/lalr1.cc
+++ b/data/skeletons/lalr1.cc
@@ -512,34 +512,7 @@ m4_if(b4_prefix, [yy], [],
std::string
]b4_parser_class_name[::yytnamerr_ (int yytoken)
{
- const char *yystr = yytname_[yytoken];
- if (*yystr == '"')
- {
- std::string yyr;
- char const *yyp = yystr;
-
- for (;;)
- switch (*++yyp)
- {
- case '\'':
- case ',':
- goto do_not_strip_quotes;
-
- case '\\':
- if (*++yyp != '\\')
- goto do_not_strip_quotes;
- // Fall through.
- default:
- yyr += *yyp;
- break;
-
- case '"':
- return yyr;
- }
- do_not_strip_quotes: ;
- }
-
- return yystr;
+ return yytname_[yytoken];
}
]])[
diff --git a/data/skeletons/lalr1.d b/data/skeletons/lalr1.d
index 6f8ef552..4c0445ee 100644
--- a/data/skeletons/lalr1.d
+++ b/data/skeletons/lalr1.d
@@ -407,39 +407,12 @@ b4_lexer_if([[
return YYNEWSTATE;
}
- /* The name of YYTOKEN after stripping away unnecessary quotes and
- backslashes, so that it's suitable for yyerror. The heuristic is
- that double-quoting is unnecessary unless the string contains an
- apostrophe, a comma, or backslash (other than backslash-backslash).
- YYSTR is taken from yytname. */
+ /* The name of YYTOKEN. */
private final string yytnamerr_ (int yytoken)
{
string yystr = yytname_[yytoken];
- if (yystr[0] == '"')
- {
- string yyr;
- strip_quotes: for (int i = 1; i < yystr.length; i++)
- switch (yystr[i])
- {
- case '\'':
- case ',':
- break strip_quotes;
-
- case '\\':
- if (yystr[++i] != '\\')
- break strip_quotes;
- goto default;
- default:
- yyr ~= yystr[i];
- break;
-
- case '"':
- return yyr;
- }
- }
- else if (yystr=="$end")
+ if (yystr=="$end")
return "end of input";
-
return yystr;
}
diff --git a/data/skeletons/lalr1.java b/data/skeletons/lalr1.java
index a4e48c05..2489d669 100644
--- a/data/skeletons/lalr1.java
+++ b/data/skeletons/lalr1.java
@@ -501,40 +501,14 @@ b4_define_state])[
}
]b4_error_verbose_if([[
- /* The name of YYTOKEN after stripping away unnecessary quotes and
- backslashes, so that it's suitable for yyerror. The heuristic is
- that double-quoting is unnecessary unless the string contains an
- apostrophe, a comma, or backslash (other than backslash-backslash).
- YYSTR is taken from yytname. */
+ /* The name of YYTOKEN. */
private final String yytnamerr_ (int yytoken)
{
String yystr = yytname_[yytoken];
- if (yystr.charAt (0) == '"')
- {
- StringBuffer yyr = new StringBuffer ();
- strip_quotes: for (int i = 1; i < yystr.length (); i++)
- switch (yystr.charAt (i))
- {
- case '\'':
- case ',':
- break strip_quotes;
-
- case '\\':
- if (yystr.charAt(++i) != '\\')
- break strip_quotes;
- /* Fall through. */
- default:
- yyr.append (yystr.charAt (i));
- break;
-
- case '"':
- return yyr.toString ();
- }
- }
- else if (yystr.equals ("$end"))
+ if (yystr.equals ("$end"))
return "end of input";
-
- return yystr;
+ else
+ return yystr;
}
]])[
diff --git a/data/skeletons/yacc.c b/data/skeletons/yacc.c
index 5ec843e1..fd10a004 100644
--- a/data/skeletons/yacc.c
+++ b/data/skeletons/yacc.c
@@ -1041,47 +1041,12 @@ yy_lac (yytype_int16 *yyesa, yytype_int16 **yyes,
# endif
# ifndef yytnamerr
-/* Copy to YYRES the name of YYTOKEN after stripping away unnecessary
- quotes and backslashes, so that it's suitable for yyerror. The
- heuristic is that double-quoting is unnecessary unless the string
- contains an apostrophe, a comma, or backslash (other than
- backslash-backslash). YYSTR is taken from yytname. If YYRES is
- null, do not copy; instead, return the length of what the result
- would have been. */
+/* Copy to YYRES the name of YYTOKEN. If YYRES is null, do not copy;
+ instead, return the length of what the result would have been. */
static YYSIZE_T
yytnamerr (char *yyres, int yytoken)
{
const char *yystr = yytname[yytoken];
- if (*yystr == '"')
- {
- YYSIZE_T yyn = 0;
- char const *yyp = yystr;
-
- for (;;)
- switch (*++yyp)
- {
- case '\'':
- case ',':
- goto do_not_strip_quotes;
-
- case '\\':
- if (*++yyp != '\\')
- goto do_not_strip_quotes;
- /* Fall through. */
- default:
- if (yyres)
- yyres[yyn] = *yyp;
- yyn++;
- break;
-
- case '"':
- if (yyres)
- yyres[yyn] = '\0';
- return yyn;
- }
- do_not_strip_quotes: ;
- }
-
if (! yyres)
return yystrlen (yystr);
diff --git a/src/output.c b/src/output.c
index e87f9812..a90e8266 100644
--- a/src/output.c
+++ b/src/output.c
@@ -165,7 +165,10 @@ prepare_symbols (void)
set_quoting_flags (qo, QA_SPLIT_TRIGRAPHS);
for (int i = 0; i < nsyms; i++)
{
- char *cp = quotearg_alloc (symbols[i]->tag, -1, qo);
+ char *cp =
+ symbols[i]->tag[0] == '"'
+ ? xstrdup (symbols[i]->tag)
+ : quotearg_alloc (symbols[i]->tag, -1, qo);
/* Width of the next token, including the two quotes, the
comma and the space. */
int width = strlen (cp) + 2;
diff --git a/tests/javapush.at b/tests/javapush.at
index 8749301a..557903ce 100644
--- a/tests/javapush.at
+++ b/tests/javapush.at
@@ -726,121 +726,121 @@ total = 256
total = 64
]])
-AT_DATA([locations],[[Next token is token "number" (1.1: 1)
+AT_DATA([locations],[[Next token is token number (1.1: 1)
Next token is token '+' (1.2: 1)
-Next token is token "number" (1.3: 2)
+Next token is token number (1.3: 2)
Next token is token '*' (1.4: 2)
-Next token is token "number" (1.5: 3)
+Next token is token number (1.5: 3)
Next token is token '=' (1.6: 3)
Next token is token '=' (1.6: 3)
Next token is token '=' (1.6: 3)
-Next token is token "number" (1.7: 7)
+Next token is token number (1.7: 7)
Next token is token '\n' (2.0: 7)
Next token is token '\n' (2.0: 7)
-Next token is token "number" (2.1: 1)
+Next token is token number (2.1: 1)
Next token is token '+' (2.2: 1)
-Next token is token "number" (2.3: 2)
+Next token is token number (2.3: 2)
Next token is token '*' (2.4: 2)
Next token is token '-' (2.5: 2)
-Next token is token "number" (2.6: 3)
+Next token is token number (2.6: 3)
Next token is token '=' (2.7: 3)
Next token is token '=' (2.7: 3)
Next token is token '=' (2.7: 3)
Next token is token '=' (2.7: 3)
Next token is token '-' (2.8: 3)
-Next token is token "number" (2.9: 5)
+Next token is token number (2.9: 5)
Next token is token '\n' (3.0: 5)
Next token is token '\n' (3.0: 5)
Next token is token '\n' (3.0: 5)
Next token is token '\n' (4.0: 5)
Next token is token '-' (4.1: 5)
-Next token is token "number" (4.2: 1)
+Next token is token number (4.2: 1)
Next token is token '^' (4.3: 1)
-Next token is token "number" (4.4: 2)
+Next token is token number (4.4: 2)
Next token is token '=' (4.5: 2)
Next token is token '=' (4.5: 2)
Next token is token '=' (4.5: 2)
Next token is token '-' (4.6: 2)
-Next token is token "number" (4.7: 1)
+Next token is token number (4.7: 1)
Next token is token '\n' (5.0: 1)
Next token is token '\n' (5.0: 1)
Next token is token '\n' (5.0: 1)
Next token is token '(' (5.1: 1)
Next token is token '-' (5.2: 1)
-Next token is token "number" (5.3: 1)
+Next token is token number (5.3: 1)
Next token is token ')' (5.4: 1)
Next token is token ')' (5.4: 1)
Next token is token '^' (5.5: 1)
-Next token is token "number" (5.6: 2)
+Next token is token number (5.6: 2)
Next token is token '=' (5.7: 2)
Next token is token '=' (5.7: 2)
-Next token is token "number" (5.8: 1)
+Next token is token number (5.8: 1)
Next token is token '\n' (6.0: 1)
Next token is token '\n' (6.0: 1)
Next token is token '\n' (7.0: 1)
Next token is token '-' (7.1: 1)
Next token is token '-' (7.2: 1)
Next token is token '-' (7.3: 1)
-Next token is token "number" (7.4: 1)
+Next token is token number (7.4: 1)
Next token is token '=' (7.5: 1)
Next token is token '=' (7.5: 1)
Next token is token '=' (7.5: 1)
Next token is token '=' (7.5: 1)
Next token is token '-' (7.6: 1)
-Next token is token "number" (7.7: 1)
+Next token is token number (7.7: 1)
Next token is token '\n' (8.0: 1)
Next token is token '\n' (8.0: 1)
Next token is token '\n' (8.0: 1)
Next token is token '\n' (9.0: 1)
-Next token is token "number" (9.1: 1)
+Next token is token number (9.1: 1)
Next token is token '-' (9.2: 1)
-Next token is token "number" (9.3: 2)
+Next token is token number (9.3: 2)
Next token is token '-' (9.4: 2)
Next token is token '-' (9.4: 2)
-Next token is token "number" (9.5: 3)
+Next token is token number (9.5: 3)
Next token is token '=' (9.6: 3)
Next token is token '=' (9.6: 3)
Next token is token '-' (9.7: 3)
-Next token is token "number" (9.8: 4)
+Next token is token number (9.8: 4)
Next token is token '\n' (10.0: 4)
Next token is token '\n' (10.0: 4)
Next token is token '\n' (10.0: 4)
-Next token is token "number" (10.1: 1)
+Next token is token number (10.1: 1)
Next token is token '-' (10.2: 1)
Next token is token '(' (10.3: 1)
-Next token is token "number" (10.4: 2)
+Next token is token number (10.4: 2)
Next token is token '-' (10.5: 2)
-Next token is token "number" (10.6: 3)
+Next token is token number (10.6: 3)
Next token is token ')' (10.7: 3)
Next token is token ')' (10.7: 3)
Next token is token '=' (10.8: 3)
Next token is token '=' (10.8: 3)
-Next token is token "number" (10.9: 2)
+Next token is token number (10.9: 2)
Next token is token '\n' (11.0: 2)
Next token is token '\n' (11.0: 2)
Next token is token '\n' (12.0: 2)
-Next token is token "number" (12.1: 2)
+Next token is token number (12.1: 2)
Next token is token '^' (12.2: 2)
-Next token is token "number" (12.3: 2)
+Next token is token number (12.3: 2)
Next token is token '^' (12.4: 2)
-Next token is token "number" (12.5: 3)
+Next token is token number (12.5: 3)
Next token is token '=' (12.6: 3)
Next token is token '=' (12.6: 3)
Next token is token '=' (12.6: 3)
-Next token is token "number" (12.7: 256)
+Next token is token number (12.7: 256)
Next token is token '\n' (13.0: 256)
Next token is token '\n' (13.0: 256)
Next token is token '(' (13.1: 256)
-Next token is token "number" (13.2: 2)
+Next token is token number (13.2: 2)
Next token is token '^' (13.3: 2)
-Next token is token "number" (13.4: 2)
+Next token is token number (13.4: 2)
Next token is token ')' (13.5: 2)
Next token is token ')' (13.5: 2)
Next token is token '^' (13.6: 2)
-Next token is token "number" (13.7: 3)
+Next token is token number (13.7: 3)
Next token is token '=' (13.8: 3)
Next token is token '=' (13.8: 3)
-Next token is token "number" (13.9: 64)
+Next token is token number (13.9: 64)
Next token is token '\n' (14.0: 64)
Next token is token '\n' (14.0: 64)
]])
diff --git a/tests/regression.at b/tests/regression.at
index 0530b1e5..147e2c5e 100644
--- a/tests/regression.at
+++ b/tests/regression.at
@@ -433,11 +433,12 @@ AT_DATA_GRAMMAR([input.y],
%token 'd' D_TOKEN
%token SPECIAL "\\\'\?\"\a\b\f\n\r\t\v\001\201\x001\x000081"
%token SPECIAL "\\\'\?\"\a\b\f\n\r\t\v\001\201\x001\x000081"
+%token MAGIC "∃¬∩∪∀"
%%
-exp: "a" "\\\'\?\"\a\b\f\n\r\t\v\001\201\x001\x000081??!";
+exp: "a" MAGIC;
%%
]AT_YYERROR_DEFINE[
-]AT_YYLEX_DEFINE([{ SPECIAL }])[
+]AT_YYLEX_DEFINE([{ MAGIC }])[
]AT_MAIN_DEFINE[
]])
AT_BISON_OPTION_POPDEFS
@@ -454,14 +455,9 @@ input.y:22.16-60: warning: symbol "\\'?\"\a\b\f\n\r\t\v\001\201\001\201" used mo
]])
AT_COMPILE([input])
-# Checking the error message here guarantees that yytname, which does contain
-# C-string literals, does have the trigraph escaped correctly. Thus, the
-# symbol name reported by the parser is exactly the same as that reported by
-# Bison itself.
-AT_DATA([experr],
-[[syntax error, unexpected "\\'?\"\a\b\f\n\r\t\v\001\201\001\201??!", expecting a
+AT_PARSER_CHECK([./input], 1, [],
+[[syntax error, unexpected ∃¬∩∪∀, expecting a
]])
-AT_PARSER_CHECK([./input], 1, [], [experr])
AT_CLEANUP
@@ -736,8 +732,8 @@ static const yytype_uint8 yyrline[] =
};
static const char *const yytname[] =
{
- "$end", "error", "$undefined", "\"if\"", "\"const\"", "\"then\"",
- "\"else\"", "$accept", "statement", "struct_stat", "if", "else", YY_NULLPTR
+ "$end", "error", "$undefined", "if", "const", "then", "else", "$accept",
+ "statement", "struct_stat", "if", "else", YY_NULLPTR
};
static const yytype_uint16 yytoknum[] =
{
--
2.20.0
- [PATCH 0/8] Revamp the handling token string aliases in error messages, Akim Demaille, 2018/12/29
- [PATCH 6/8] tests: check that internationalization of token works, Akim Demaille, 2018/12/29
- [PATCH 7/8] translate bison's own tokens, Akim Demaille, 2018/12/29
- [PATCH 4/8] parsers: don't double escape tnames,
Akim Demaille <=
- [PATCH 2/8] parsers: revamp the interface of yytnamerr, Akim Demaille, 2018/12/29
- [PATCH 1/8] yacc.c: avoid negated if, Akim Demaille, 2018/12/29
- [PATCH 3/8] tests: no longer play with trigraphs, Akim Demaille, 2018/12/29
- [PATCH 5/8] parsers: support translatable token aliases, Akim Demaille, 2018/12/29
- [PATCH 8/8] regen, Akim Demaille, 2018/12/29