gawk-diffs
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[gawk-diffs] [SCM] gawk branch, gawk-4.1-stable, updated. gawk-4.1.0-734


From: Arnold Robbins
Subject: [gawk-diffs] [SCM] gawk branch, gawk-4.1-stable, updated. gawk-4.1.0-734-g243b097
Date: Tue, 25 Aug 2015 17:30:09 +0000

This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "gawk".

The branch, gawk-4.1-stable has been updated
       via  243b097279a89d456fda4a400412482d70b3665c (commit)
       via  278fe876bb18938803ac1c36b028adb8cef6fe84 (commit)
      from  96cc85ac9ba06ab6b9edface5e4c34392a07a98d (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.

- Log -----------------------------------------------------------------
http://git.sv.gnu.org/cgit/gawk.git/commit/?id=243b097279a89d456fda4a400412482d70b3665c

commit 243b097279a89d456fda4a400412482d70b3665c
Author: Arnold D. Robbins <address@hidden>
Date:   Tue Aug 25 20:29:15 2015 +0300

    Make MAYBE_NUM -> NUMBER smarter, clear STRING.

diff --git a/ChangeLog b/ChangeLog
index 05f5342..e685dd6 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -6,6 +6,16 @@
        of unexpected data.  Make the lint warning an unconditional
        warning.
 
+       Unrelated:
+
+       * awk.h: Add explanatory comment on the flags related to
+       types and values.
+       * mpfr.c (mpg_force_number): If setting NUMBER, clear STRING also
+       when clearing MAYBE_NUM.
+       (set_PREC): Check STRCUR instead of STRING.
+       * node.c (r_force_number): If setting NUMBER, clear STRING also
+       when clearing MAYBE_NUM.
+
 2015-08-15         Arnold D. Robbins     <address@hidden>
 
        * dfa.c (dfamust): Restore c90 compat by moving some
diff --git a/awk.h b/awk.h
index 1eb2688..c3a3cba 100644
--- a/awk.h
+++ b/awk.h
@@ -404,6 +404,37 @@ typedef struct exp_node {
 #              define  MALLOC  0x0001       /* can be free'd */
 
 /* type = Node_val */
+       /*
+        * STRING and NUMBER are mutually exclusive. They represent the
+        * type of a value as assigned.
+        *
+        * STRCUR and NUMCUR are not mutually exclusive. They represent that
+        * the particular type of value is up to date.  For example,
+        *
+        *      a = 5           # NUMBER | NUMCUR
+        *      b = a ""        # Adds STRCUR to a, since a string value
+        *                      # is now available. But the type hasn't changed!
+        *
+        *      a = "42"        # STRING | STRCUR
+        *      b = a + 0       # Adds NUMCUR to a, since numeric value
+        *                      # is now available. But the type hasn't changed!
+        *
+        * MAYBE_NUM is the joker.  It means "this is string data, but
+        * the user may have really wanted it to be a number. If we have
+        * to guess, like in a comparison, turn it into a number."
+        * For example,    gawk -v a=42 ....
+        * Here, `a' gets STRING|STRCUR|MAYBE_NUM and then when used where
+        * a number is needed, it gets turned into a NUMBER and STRING
+        * is cleared.
+        *
+        * WSTRCUR is for efficiency. If in a multibyte locale, and we
+        * need to do something character based (substr, length, etc.)
+        * we create the corresponding wide character string and store it,
+        * and add WSTRCUR to the flags so that we don't have to do the
+        * conversion more than once.
+        *
+        * We hope that the rest of the flags are self-explanatory. :-)
+        */
 #              define  STRING  0x0002       /* assigned as string */
 #              define  STRCUR  0x0004       /* string value is current */
 #              define  NUMCUR  0x0008       /* numeric value is current */
diff --git a/mpfr.c b/mpfr.c
index a89b2bc..4e4e12d 100644
--- a/mpfr.c
+++ b/mpfr.c
@@ -347,7 +347,7 @@ mpg_force_number(NODE *n)
                return n;
 
        if ((n->flags & MAYBE_NUM) != 0) {
-               n->flags &= ~MAYBE_NUM;
+               n->flags &= ~(MAYBE_NUM|STRING);
                newflags = NUMBER;
        }
 
@@ -525,7 +525,7 @@ set_PREC()
        if ((val->flags & MAYBE_NUM) != 0)
                force_number(val);
 
-       if ((val->flags & (STRING|NUMBER)) == STRING) {
+       if ((val->flags & STRCUR) != 0) {
                int i, j;
 
                /* emulate IEEE-754 binary format */
diff --git a/node.c b/node.c
index de77114..a7c19db 100644
--- a/node.c
+++ b/node.c
@@ -76,7 +76,7 @@ r_force_number(NODE *n)
                        return n;
                } else if (n->stlen == 4 && is_ieee_magic_val(n->stptr)) {
                        if ((n->flags & MAYBE_NUM) != 0)
-                               n->flags &= ~MAYBE_NUM;
+                               n->flags &= ~(MAYBE_NUM|STRING);
                        n->flags |= NUMBER|NUMCUR;
                        n->numbr = get_ieee_magic_val(n->stptr);
 
@@ -103,7 +103,7 @@ r_force_number(NODE *n)
 
        if ((n->flags & MAYBE_NUM) != 0) {
                newflags = NUMBER;
-               n->flags &= ~MAYBE_NUM;
+               n->flags &= ~(MAYBE_NUM|STRING);
        } else
                newflags = 0;
 

http://git.sv.gnu.org/cgit/gawk.git/commit/?id=278fe876bb18938803ac1c36b028adb8cef6fe84

commit 278fe876bb18938803ac1c36b028adb8cef6fe84
Author: Arnold D. Robbins <address@hidden>
Date:   Tue Aug 25 19:51:54 2015 +0300

    Improve handling of invalid data in UTF locales.

diff --git a/ChangeLog b/ChangeLog
index 62647c1..05f5342 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,11 @@
+2015-08-25         Arnold D. Robbins     <address@hidden>
+
+       * node.c (str2wstr): Upon finding an invalid character, if
+       using UTF-8, use the replacement character instead of skipping
+       it. Helps match() and other functions work better in the face
+       of unexpected data.  Make the lint warning an unconditional
+       warning.
+
 2015-08-15         Arnold D. Robbins     <address@hidden>
 
        * dfa.c (dfamust): Restore c90 compat by moving some
diff --git a/node.c b/node.c
index 1741a13..de77114 100644
--- a/node.c
+++ b/node.c
@@ -717,22 +717,37 @@ str2wstr(NODE *n, size_t **ptr)
                case (size_t) -2:
                case (size_t) -1:
                        /*
-                        * Just skip the bad byte and keep going, so that
-                        * we get a more-or-less full string, instead of
-                        * stopping early. This is particularly important
-                        * for match() where we need to build the indices.
-                        */
-                       sp++;
-                       src_count--;
-                       /*
                         * mbrtowc(3) says the state of mbs becomes undefined
                         * after a bad character, so reset it.
                         */
                        memset(& mbs, 0, sizeof(mbs));
-                       /* And warn the user something's wrong */
-                       if (do_lint && ! warned) {
+
+                       /* Warn the user something's wrong */
+                       if (! warned) {
                                warned = true;
-                               lintwarn(_("Invalid multibyte data detected. There may be a mismatch between your data and your locale."));
+                               warning(_("Invalid multibyte data detected. There may be a mismatch between your data and your locale."));
+                       }
+
+                       /*
+                        * 8/2015: If we're using UTF, then instead of just
+                        * skipping the character, plug in the Unicode
+                        * replacement character. In most cases this gives
+                        * us "better" results, in that character counts
+                        * and string lengths tend to make more sense.
+                        *
+                        * Otherwise, just skip the bad byte and keep going,
+                        * so that we get a more-or-less full string, instead of
+                        * stopping early. This is particularly important
+                        * for match() where we need to build the indices.
+                        */
+                       if (using_utf8()) {
+                               count = 1;
+                               wc = 0xFFFD;    /* unicode replacement character */
+                               goto set_wc;
+                       } else {
+                               /* skip it and keep going */
+                               sp++;
+                               src_count--;
                        }
                        break;
 
@@ -740,6 +755,7 @@ str2wstr(NODE *n, size_t **ptr)
                        count = 1;
                        /* fall through */
                default:
+               set_wc:
                        *wsp++ = wc;
                        src_count -= count;
                        while (count--)  {
diff --git a/test/ChangeLog b/test/ChangeLog
index cc7576e..7b9e273 100644
--- a/test/ChangeLog
+++ b/test/ChangeLog
@@ -1,3 +1,9 @@
+2015-08-25         Arnold D. Robbins     <address@hidden>
+
+       * mbstr1.ok: Updated after code change.
+       * Makefile.am (mbstr2): New test.
+       * mbstr2.awk, mbstr2.in, mbstr2.ok: New files.
+
 2015-06-25         Arnold D. Robbins     <address@hidden>
 
        * Makefile.am (negtime): Fix out-of-tree test run.
diff --git a/test/Makefile.am b/test/Makefile.am
index 5e72014..14ebf54 100644
--- a/test/Makefile.am
+++ b/test/Makefile.am
@@ -531,6 +531,9 @@ EXTRA_DIST = \
        mbprintf4.ok \
        mbstr1.awk \
        mbstr1.ok \
+       mbstr2.awk \
+       mbstr2.in \
+       mbstr2.ok \
        membug1.awk \
        membug1.in \
        membug1.ok \
@@ -1062,7 +1065,7 @@ GAWK_EXT_TESTS = \
        incdupe incdupe2 incdupe3 incdupe4 incdupe5 incdupe6 incdupe7 \
        include include2 indirectbuiltin indirectcall indirectcall2 \
        lint lintold lintwarn \
-       manyfiles match1 match2 match3 mbstr1 \
+       manyfiles match1 match2 match3 mbstr1 mbstr2 \
        nastyparm negtime next nondec nondec2 \
        patsplit posix printfbad1 printfbad2 printfbad3 printfbad4 printhuge procinfs \
        profile0 profile1 profile2 profile3 profile4 profile5 profile6 profile7 pty1 \
@@ -1705,6 +1708,12 @@ mbstr1::
        AWKPATH="$(srcdir)" $(AWK) -f $@.awk  >_$@ 2>&1 || echo EXIT CODE: $$? >>_$@
        @-$(CMP) "$(srcdir)"/$@.ok _$@ && rm -f _$@
 
+mbstr2::
+       @echo $@
+       @[ -z "$$GAWKLOCALE" ] && GAWKLOCALE=en_US.UTF-8; \
+       AWKPATH="$(srcdir)" $(AWK) -f $@.awk < "$(srcdir)"/$@.in >_$@ 2>&1 || echo EXIT CODE: $$? >>_$@
+       @-$(CMP) "$(srcdir)"/$@.ok _$@ && rm -f _$@
+
 printfbad2: printfbad2.ok
        @echo $@
        @$(AWK) --lint -f "$(srcdir)"/$@.awk "$(srcdir)"/$@.in 2>&1 | sed 's;$(srcdir)/;;g' >_$@ || echo EXIT CODE: $$?  >>_$@
diff --git a/test/Makefile.in b/test/Makefile.in
index 0116f3a..a78b3e6 100644
--- a/test/Makefile.in
+++ b/test/Makefile.in
@@ -788,6 +788,9 @@ EXTRA_DIST = \
        mbprintf4.ok \
        mbstr1.awk \
        mbstr1.ok \
+       mbstr2.awk \
+       mbstr2.in \
+       mbstr2.ok \
        membug1.awk \
        membug1.in \
        membug1.ok \
@@ -1318,7 +1321,7 @@ GAWK_EXT_TESTS = \
        incdupe incdupe2 incdupe3 incdupe4 incdupe5 incdupe6 incdupe7 \
        include include2 indirectbuiltin indirectcall indirectcall2 \
        lint lintold lintwarn \
-       manyfiles match1 match2 match3 mbstr1 \
+       manyfiles match1 match2 match3 mbstr1 mbstr2 \
        nastyparm negtime next nondec nondec2 \
        patsplit posix printfbad1 printfbad2 printfbad3 printfbad4 printhuge procinfs \
        profile0 profile1 profile2 profile3 profile4 profile5 profile6 profile7 pty1 \
@@ -2143,6 +2146,12 @@ mbstr1::
        AWKPATH="$(srcdir)" $(AWK) -f $@.awk  >_$@ 2>&1 || echo EXIT CODE: $$? >>_$@
        @-$(CMP) "$(srcdir)"/$@.ok _$@ && rm -f _$@
 
+mbstr2::
+       @echo $@
+       @[ -z "$$GAWKLOCALE" ] && GAWKLOCALE=en_US.UTF-8; \
+       AWKPATH="$(srcdir)" $(AWK) -f $@.awk < "$(srcdir)"/$@.in >_$@ 2>&1 || echo EXIT CODE: $$? >>_$@
+       @-$(CMP) "$(srcdir)"/$@.ok _$@ && rm -f _$@
+
 printfbad2: printfbad2.ok
        @echo $@
        @$(AWK) --lint -f "$(srcdir)"/$@.awk "$(srcdir)"/$@.in 2>&1 | sed 's;$(srcdir)/;;g' >_$@ || echo EXIT CODE: $$?  >>_$@
diff --git a/test/mbstr1.ok b/test/mbstr1.ok
index dcb4347..3fd1bf8 100644
--- a/test/mbstr1.ok
+++ b/test/mbstr1.ok
@@ -1,2 +1,3 @@
+gawk: mbstr1.awk:2: warning: Invalid multibyte data detected. There may be a mismatch between your data and your locale.
 4
 1
diff --git a/test/mbstr2.awk b/test/mbstr2.awk
new file mode 100644
index 0000000..4f2c8cc
--- /dev/null
+++ b/test/mbstr2.awk
@@ -0,0 +1 @@
+match($0,/:deathdate=2007....:/) { print substr($0,RSTART+11,RLENGTH-16) }
diff --git a/test/mbstr2.in b/test/mbstr2.in
new file mode 100644
index 0000000..36e971a
--- /dev/null
+++ b/test/mbstr2.in
@@ -0,0 +1,4 @@
+missile:deathdate=20070306:
+P”rr”:deathdate=20070306:
+wizard:deathdate=20071103:
+Daithí:deathdate=20071103:
diff --git a/test/mbstr2.ok b/test/mbstr2.ok
new file mode 100644
index 0000000..29ac876
--- /dev/null
+++ b/test/mbstr2.ok
@@ -0,0 +1,5 @@
+2007
+gawk: mbstr2.awk:1: (FILENAME=- FNR=2) warning: Invalid multibyte data detected. There may be a mismatch between your data and your locale.
+2007
+2007
+2007

-----------------------------------------------------------------------

Summary of changes:
 ChangeLog        |   18 ++++++++++++++++++
 awk.h            |   31 +++++++++++++++++++++++++++++++
 mpfr.c           |    4 ++--
 node.c           |   42 +++++++++++++++++++++++++++++-------------
 test/ChangeLog   |    6 ++++++
 test/Makefile.am |   11 ++++++++++-
 test/Makefile.in |   11 ++++++++++-
 test/mbstr1.ok   |    1 +
 test/mbstr2.awk  |    1 +
 test/mbstr2.in   |    4 ++++
 test/mbstr2.ok   |    5 +++++
 11 files changed, 117 insertions(+), 17 deletions(-)
 create mode 100644 test/mbstr2.awk
 create mode 100644 test/mbstr2.in
 create mode 100644 test/mbstr2.ok


hooks/post-receive
-- 
gawk



reply via email to

[Prev in Thread] Current Thread [Next in Thread]