emacs-diffs
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Emacs-diffs] master 512e988: Add support for Unicode whitespace in [:blank:]


From: Philipp Stephani
Subject: [Emacs-diffs] master 512e988: Add support for Unicode whitespace in [:blank:]
Date: Fri, 6 Jan 2017 19:20:19 +0000 (UTC)

branch: master
commit 512e9886be693f61f9d1932f19461bf4482fba51
Author: Philipp Stephani <address@hidden>
Commit: Philipp Stephani <address@hidden>

    Add support for Unicode whitespace in [:blank:]
    
    See Bug#25366.
    
    * src/character.c (blankp): New function for checking Unicode
    horizontal whitespace.
    * src/regex.c (ISBLANK): Use 'blankp' for non-ASCII horizontal
    whitespace.
    (BIT_BLANK): New bit for range table.
    (re_wctype_to_bit, execute_charset): Use it.
    * test/lisp/subr-tests.el (subr-tests--string-match-p--blank): Add
    unit test for [:blank:] character class.
    * test/src/regex-tests.el (test): Adapt unit test.
    * doc/lispref/searching.texi (Char Classes): Document new Unicode
    behavior for [:blank:].
---
 doc/lispref/searching.texi |    6 +++++-
 etc/NEWS                   |    6 ++++++
 src/character.c            |   17 +++++++++++++++++
 src/character.h            |    1 +
 src/regex.c                |   12 ++++++++----
 test/lisp/subr-tests.el    |   10 ++++++++++
 test/src/regex-tests.el    |    2 +-
 7 files changed, 48 insertions(+), 6 deletions(-)

diff --git a/doc/lispref/searching.texi b/doc/lispref/searching.texi
index b011d14..67d4c22 100644
--- a/doc/lispref/searching.texi
+++ b/doc/lispref/searching.texi
@@ -553,7 +553,11 @@ characters whose Unicode @samp{general-category} property
 (@pxref{Character Properties}) indicates they are alphabetic
 characters.
 @item [:blank:]
-This matches space and tab only.
+This matches horizontal whitespace, as defined by Annex C of the
+Unicode Technical Standard #18.  In particular, it matches spaces,
+tabs, and other characters whose Unicode @samp{general-category}
+property (@pxref{Character Properties}) indicates they are spacing
+separators.
 @item [:cntrl:]
 This matches any @acronym{ASCII} control character.
 @item [:digit:]
diff --git a/etc/NEWS b/etc/NEWS
index d91204b..051b97e 100644
--- a/etc/NEWS
+++ b/etc/NEWS
@@ -710,6 +710,12 @@ of curved quotes in format arguments to functions like 'message' and
 now generate less chatter and more-compact diagnostics.  The auxiliary
 function 'check-declare-errmsg' has been removed.
 
++++
+** The regular expression character class [:blank:] now matches
+Unicode horizontal whitespace as defined in the Unicode Technical
+Standard #18.  If you only want to match space and tab, use [ \t]
+instead.
+
 
 * Lisp Changes in Emacs 26.1
 
diff --git a/src/character.c b/src/character.c
index b594af0..bc99daf 100644
--- a/src/character.c
+++ b/src/character.c
@@ -1038,6 +1038,23 @@ printablep (int c)
            || gen_cat == UNICODE_CATEGORY_Cn)); /* unassigned */
 }
 
+/* Return true if C is a horizontal whitespace character, as defined
+   by http://www.unicode.org/reports/tr18/tr18-19.html#blank.  */
+bool
+blankp (int c)
+{
+  /* Fast path for ASCII characters that are always assumed to
+     constitute horizontal whitespace.  */
+  if (c == ' ' || c == '\t')
+    return true;
+
+  Lisp_Object category = CHAR_TABLE_REF (Vunicode_category_table, c);
+  if (! INTEGERP (category))
+    return false;
+
+  return XINT (category) == UNICODE_CATEGORY_Zs; /* separator, space */
+}
+
 void
 syms_of_character (void)
 {
diff --git a/src/character.h b/src/character.h
index fc8a0dd..62d252e 100644
--- a/src/character.h
+++ b/src/character.h
@@ -680,6 +680,7 @@ extern bool alphabeticp (int);
 extern bool alphanumericp (int);
 extern bool graphicp (int);
 extern bool printablep (int);
+extern bool blankp (int);
 
 /* Return a translation table of id number ID.  */
 #define GET_TRANSLATION_TABLE(id) \
diff --git a/src/regex.c b/src/regex.c
index ae3fde8..7e70c49 100644
--- a/src/regex.c
+++ b/src/regex.c
@@ -310,11 +310,12 @@ enum syntaxcode { Swhitespace = 0, Sword = 1, Ssymbol = 2 };
                     || ((c) >= 'a' && (c) <= 'f')      \
                     || ((c) >= 'A' && (c) <= 'F'))
 
-/* This is only used for single-byte characters.  */
-# define ISBLANK(c) ((c) == ' ' || (c) == '\t')
-
 /* The rest must handle multibyte characters.  */
 
+# define ISBLANK(c) (IS_REAL_ASCII (c)                  \
+                     ? ((c) == ' ' || (c) == '\t')      \
+                     : blankp (c))
+
 # define ISGRAPH(c) (SINGLE_BYTE_CHAR_P (c)                            \
                     ? (c) > ' ' && !((c) >= 0177 && (c) <= 0240)       \
                     : graphicp (c))
@@ -1790,6 +1791,7 @@ struct range_table_work_area
 #define BIT_ALNUM      0x80
 #define BIT_GRAPH      0x100
 #define BIT_PRINT      0x200
+#define BIT_BLANK       0x400
 
 
 /* Set the bit for character C in a list.  */
@@ -2066,8 +2068,9 @@ re_wctype_to_bit (re_wctype_t cc)
     case RECC_SPACE: return BIT_SPACE;
     case RECC_GRAPH: return BIT_GRAPH;
     case RECC_PRINT: return BIT_PRINT;
+    case RECC_BLANK: return BIT_BLANK;
     case RECC_ASCII: case RECC_DIGIT: case RECC_XDIGIT: case RECC_CNTRL:
-    case RECC_BLANK: case RECC_UNIBYTE: case RECC_ERROR: return 0;
+    case RECC_UNIBYTE: case RECC_ERROR: return 0;
     default:
       abort ();
     }
@@ -4658,6 +4661,7 @@ execute_charset (const_re_char **pp, unsigned c, unsigned corig, bool unibyte)
          (class_bits & BIT_ALNUM && ISALNUM (c)) ||
          (class_bits & BIT_ALPHA && ISALPHA (c)) ||
          (class_bits & BIT_SPACE && ISSPACE (c)) ||
+          (class_bits & BIT_BLANK && ISBLANK (c)) ||
          (class_bits & BIT_WORD  && ISWORD  (c)) ||
          ((class_bits & BIT_UPPER) &&
           (ISUPPER (c) || (corig != c &&
diff --git a/test/lisp/subr-tests.el b/test/lisp/subr-tests.el
index 3c5dbcd..a3b08e9 100644
--- a/test/lisp/subr-tests.el
+++ b/test/lisp/subr-tests.el
@@ -271,5 +271,15 @@ indirectly `mapbacktrace'."
   (let ((frame-lists (subr-test--frames-1 'subr-test--frames-2)))
     (should (equal (car frame-lists) (cdr frame-lists)))))
 
+(ert-deftest subr-tests--string-match-p--blank ()
+  "Test that [:blank:] matches horizontal whitespace, cf. Bug#25366."
+  (should (equal (string-match-p "\\`[[:blank:]]\\'" " ") 0))
+  (should (equal (string-match-p "\\`[[:blank:]]\\'" "\t") 0))
+  (should-not (string-match-p "\\`[[:blank:]]\\'" "\n"))
+  (should-not (string-match-p "\\`[[:blank:]]\\'" "a"))
+  (should (equal (string-match-p "\\`[[:blank:]]\\'" "\N{HAIR SPACE}") 0))
+  (should (equal (string-match-p "\\`[[:blank:]]\\'" "\u3000") 0))
+  (should-not (string-match-p "\\`[[:blank:]]\\'" "\N{LINE SEPARATOR}")))
+
 (provide 'subr-tests)
 ;;; subr-tests.el ends here
diff --git a/test/src/regex-tests.el b/test/src/regex-tests.el
index 74c2711..db187fd 100644
--- a/test/src/regex-tests.el
+++ b/test/src/regex-tests.el
@@ -80,7 +80,7 @@ character) must match a string \"\u2420\"."
                 ("print" "abcłąka\u2620-, " "\t\n\1")
 
                 ("space" " \t\n\u2001" "abcABCł0123")
-                ("blank" " \t" "\n\u2001")
+                ("blank" " \t\u2001" "\n")
 
                 ("ascii" "abcABC012 \t\n\1" "łą\u2620")
                 ("nonascii" "łą\u2622" "abcABC012 \t\n\1")



reply via email to

[Prev in Thread] Current Thread [Next in Thread]