emacs-elpa-diffs
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[elpa] externals/xr 6085257 1/3: Detect overlap in character alternative


From: Mattias Engdegård
Subject: [elpa] externals/xr 6085257 1/3: Detect overlap in character alternatives
Date: Fri, 1 Mar 2019 13:09:47 -0500 (EST)

branch: externals/xr
commit 608525744b4b354b166b4f1b9143734d44460d97
Author: Mattias Engdegård <address@hidden>
Commit: Mattias Engdegård <address@hidden>

    Detect overlap in character alternatives
    
    Detecting duplication and overlap in character alternatives turns
    out to be a fruitful way of finding bugs.
    Avoid escaping \ in messages where it applies to single characters.
    Increment version to 1.4.
---
 xr-test.el |  34 +++++++----
 xr.el      | 202 +++++++++++++++++++++++++++++++++++++++----------------------
 2 files changed, 149 insertions(+), 87 deletions(-)

diff --git a/xr-test.el b/xr-test.el
index b79c211..7beee0a 100644
--- a/xr-test.el
+++ b/xr-test.el
@@ -163,15 +163,17 @@
   (should (equal (xr "[^]-c]")
                  '(not (any "]-c"))))
   (should (equal (xr "[-^]")
-                 '(any "-" "^")))
+                 '(any "^-")))
   (should (equal (xr "[a-z-+/*%0-4[:xdigit:]]")
-                 '(any "a-z" "-" "+/*%" "0-4" xdigit)))
+                 '(any "0-4a-z" "%*+/-" xdigit)))
   (should (equal (xr "[^]A-Za-z-]*")
-                 '(zero-or-more (not (any "]" "A-Za-z" "-")))))
+                 '(zero-or-more (not (any "A-Za-z" "]-")))))
   (should (equal (xr "[+*%A-Ka-k0-3${-}]")
-                 '(any "+*%" "A-Ka-k0-3" "$" "{-}")))
+                 '(any "0-3A-Ka-k{-}" "$%*+")))
   (should (equal (xr "[^\\\\o][A-\\\\][A-\\\\-a]")
-                 '(seq (not (any "\\o")) (any "A-\\") (any "A-\\\\-a"))))
+                 '(seq (not (any "\\o")) (any "A-\\") (any "A-a"))))
+  (should (equal (xr "[^A-FFGI-LI-Mb-da-eg-ki-ns-tz-v]")
+                 '(not (any "A-FI-Ma-eg-ns-t" "G"))))
   )
 
 (ert-deftest xr-empty ()
@@ -197,7 +199,7 @@
                  '(seq bow (group (or "catch" "finally")) eow
                        (not (any "_")))))
   (should (equal (xr "[ \t\n]*:\\([^:]+\\|$\\)")
-                 '(seq (zero-or-more (any " \t\n")) ":"
+                 '(seq (zero-or-more (any "\t\n ")) ":"
                        (group (or (one-or-more (not (any ":")))
                                   eol)))))
   )
@@ -265,11 +267,11 @@
   (should (equal (xr-lint "^**$")
                  '((1 . "Unescaped literal `*'"))))
   (should (equal (xr-lint "a[\\\\[]")
-                 '((2 . "Escaped `\\' inside character alternative"))))
+                 '((3 . "Duplicated `\\' inside character alternative"))))
   (should (equal (xr-lint "\\{\\(+\\|?\\)\\[\\]\\}\\\t")
-                 '((0 . "Escaped non-special character `{'")
-                   (4 . "Unescaped literal `+'")
-                   (7 . "Unescaped literal `?'")
+                 '((0  . "Escaped non-special character `{'")
+                   (4  . "Unescaped literal `+'")
+                   (7  . "Unescaped literal `?'")
                    (14 . "Escaped non-special character `}'")
                    (16 . "Escaped non-special character `\\t'"))))
   (should (equal (xr-lint "\\}\\w\\a\\b\\%")
@@ -277,12 +279,18 @@
                    (4 . "Escaped non-special character `a'")
                    (8 . "Escaped non-special character `%'"))))
   (should (equal (xr-lint "a?+b+?\\(?:c?\\)*d\\{3\\}+e*?\\{2,5\\}")
-                 '((2 . "Repetition of repetition")
+                 '((2  . "Repetition of repetition")
                    (14 . "Repetition of repetition")
                    (25 . "Repetition of repetition"))))
   (should (equal (xr-lint "[]-Qa-fz-t]")
-                '((1 . "Reversed range `]-Q' matches nothing")
-                  (7 . "Reversed range `z-t' matches nothing"))))
+                 '((1 . "Reversed range `]-Q' matches nothing")
+                   (7 . "Reversed range `z-t' matches nothing"))))
+  (should (equal (xr-lint "[^A-FFGI-LI-Mb-da-eg-ki-ns-t33-7]")
+                 '((5  . "Character `F' included in range `A-F'")
+                   (10 . "Ranges `I-L' and `I-M' overlap")
+                   (16 . "Ranges `a-e' and `b-d' overlap")
+                   (22 . "Ranges `g-k' and `i-n' overlap")
+                   (29 . "Character `3' included in range `3-7'"))))
   )
 
 (provide 'xr-test)
diff --git a/xr.el b/xr.el
index 853bfc6..3b804ad 100644
--- a/xr.el
+++ b/xr.el
@@ -3,7 +3,7 @@
 ;; Copyright (C) 2019 Free Software Foundation, Inc.
 
 ;; Author: Mattias Engdegård <address@hidden>
-;; Version: 1.3
+;; Version: 1.4
 ;; Keywords: lisp, maint, regexps
 
 ;; This program is free software; you can redistribute it and/or modify
@@ -77,20 +77,23 @@
     (push (cons (1- position) message) (car warnings))))
 
 (defun xr--parse-char-alt (negated warnings)
-  (let ((set nil))
+  (let ((intervals nil)
+        (classes nil))
     (cond
      ;; Initial ]-x range
-     ((looking-at (rx "]-" (group (not (any "]")))))
-      (if (>= (string-to-char (match-string 1)) ?\])
-         (push (match-string 0) set)
-        (xr--report warnings (point)
-                   (format "Reversed range `%s' matches nothing"
-                           (match-string 0))))
+     ((looking-at (rx "]-" (not (any "]"))))
+      (let ((end (aref (match-string 0) 2)))
+        (if (>= end ?\])
+            (push (vector ?\] end (point)) intervals)
+          (xr--report warnings (point)
+                      (format "Reversed range `%s' matches nothing"
+                              (match-string 0)))))
       (goto-char (match-end 0)))
      ;; Initial ]
      ((looking-at "]")
-      (push "]" set)
+      (push (vector ?\] ?\] (point)) intervals)
       (forward-char 1)))
+
     (while (not (looking-at "]"))
       (cond
        ;; character class
@@ -101,71 +104,121 @@
                              lower multibyte nonascii print punct space
                              unibyte upper word xdigit)))
             (error "No character class `%s'" sym))
-          (push sym set)
+          (if (memq sym classes)
+              (xr--report warnings (point)
+                          (format "Duplicated character class `[:%s:]'" sym))
+            (push sym classes))
           (goto-char (match-end 0))))
        ;; character range
-       ((looking-at (rx (not (any "]")) "-" (not (any "]"))))
-        (let ((range (match-string 0)))
-          ;; We render [a-z] as (any "a-z") instead of (any (?a . ?z))
-          ;; for readability and brevity, and because the latter would
-          ;; become (97 . 122) when printed.
-          ;; TODO: Possibly convert "[0-9]" to digit, and
-          ;; "[0-9a-fA-F]" (and permutations) to hex-digit.
-         (cond
-          ((<= (aref range 0) (aref range 2))
-            (let ((prev (car set)))
-              ;; Merge with preceding range if any.
-              (if (and (stringp prev)
-                       (>= (length prev) 3)
-                       (eq (aref prev 1) ?-))
-                  (setq set (cons (concat prev range) (cdr set)))
-               (push range set))))
-          (t
+       ((looking-at (rx (group (not (any "]"))) "-" (group (not (any "]")))))
+        (let ((start (string-to-char (match-string 1)))
+              (end   (string-to-char (match-string 2))))
+          (cond
+           ((<= start end)
+            (push (vector start end (point)) intervals))
+           (t
             (xr--report warnings (point)
-                       (format "Reversed range `%s' matches nothing"
-                               range))))
+                        (format "Reversed range `%s' matches nothing"
+                                (match-string 0)))))
           (goto-char (match-end 0))))
        ((looking-at (rx eos))
         (error "Unterminated character alternative"))
        ;; plain character (including ^ or -)
        (t
-        (let* ((ch (following-char))
-               (ch-str (char-to-string ch)))
-          (cond
-           ;; Duplicated \ are common enough for us to remove them (and warn).
-           ((and (eq ch ?\\)
-                 (stringp (car set))
-                 (eq (string-to-char (substring (car set) -1)) ?\\))
-            (xr--report warnings (1- (point))
-                        "Escaped `\\' inside character alternative"))
-           ;; Merge with the previous string if neither contains "-".
-           ((and (stringp (car set))
-                 (not (eq ch ?-))
-                 (not (string-match "-" (car set))))
-            (setq set (cons (concat (car set) ch-str) (cdr set))))
-           (t
-            (push ch-str set))))
+        (let ((ch (following-char)))
+          (push (vector ch ch (point)) intervals))
         (forward-char 1))))
 
     (forward-char 1)                    ; eat the ]
-    (cond
-     ;; Non-negated single-char set, like [$]
-     ((and (not negated)
-           (= (length set) 1)
-           (stringp (car set))
-           (= (length (car set)) 1))
-      (car set))
-     ;; Single named class set, like [[:space:]]
-     ((and (= (length set) 1)
-           (symbolp (car set)))
-      (if negated
-          (list 'not (car set))
-        (car set)))
-     ;; Anything else.
-     (negated
-      (list 'not (cons 'any (reverse set))))
-     (t
-      (cons 'any (reverse set))))))
+
+    ;; Detect duplicates and overlapping intervals.
+    (let* ((sorted
+            (sort (nreverse intervals)
+                  (lambda (a b) (< (aref a 0) (aref b 0)))))
+           (s sorted))
+      (while (cdr s)
+        (let ((this (car s))
+              (next (cadr s)))
+          (when (>= (aref this 1) (aref next 0))
+            (let ((message
+                   (cond
+                    ;; Duplicate character: drop it and warn.
+                    ((and (eq (aref this 0) (aref this 1))
+                          (eq (aref next 0) (aref next 1)))
+                     (setcdr s (cddr s))
+                     (format "Duplicated `%c' inside character alternative"
+                             (aref this 0)))
+                    ;; Duplicate range: drop it and warn.
+                    ((and (eq (aref this 0) (aref next 0))
+                          (eq (aref this 1) (aref next 1)))
+                     (setcdr s (cddr s))
+                     (format "Duplicated `%c-%c' inside character alternative"
+                             (aref this 0) (aref this 1)))
+                    ;; Character in range: drop it and warn.
+                    ((eq (aref this 0) (aref this 1))
+                     (setcar s next)
+                     (setcdr s (cddr s))
+                     (format "Character `%c' included in range `%c-%c'"
+                             (aref this 0) (aref next 0) (aref next 1)))
+                    ;; Same but other way around.
+                    ((eq (aref next 0) (aref next 1))
+                     (setcdr s (cddr s))
+                     (format "Character `%c' included in range `%c-%c'"
+                             (aref next 0) (aref this 0) (aref this 1)))
+                    ;; Overlapping ranges: merge and warn.
+                    (t
+                     (let ((this-end (aref this 1)))
+                       (aset this 1 (max (aref this 1) (aref next 1)))
+                       (setcdr s (cddr s))
+                       (format "Ranges `%c-%c' and `%c-%c' overlap"
+                               (aref this 0) this-end
+                               (aref next 0) (aref next 1)))))))
+              (xr--report warnings (max (aref this 2) (aref next 2))
+                          (xr--escape-string message nil)))))
+        (setq s (cdr s)))
+            
+      ;; Gather ranges and single characters separately.
+      ;; We make no attempts at merging adjacent intervals/characters,
+      ;; nor at splitting short intervals such as "a-b"; if the user
+      ;; wrote it that way, there was probably a reason for it.
+      (let ((ranges nil)
+            (chars nil))
+        (mapc (lambda (interv)
+                (if (eq (aref interv 0) (aref interv 1))
+                    (push (aref interv 0) chars)
+                  (push (string (aref interv 0) ?- (aref interv 1))
+                        ranges)))
+              sorted)
+        
+        (cond
+         ;; Non-negated single-char set, like [$]: make a string.
+         ((and (= (length chars) 1)
+               (not negated)
+               (null ranges)
+               (null classes))
+          (string (car chars)))
+         ;; Single named class, like [[:space:]]: use the symbol.
+         ((and (= (length classes) 1)
+               (null chars)
+               (null ranges))
+          (if negated
+              (list 'not (car classes))
+            (car classes)))
+         ;; Anything else: produce (any ...)
+         (t
+          ;; Put dash last of all single characters.
+          (when (memq ?- chars)
+            (setq chars (cons ?- (delq ?- chars))))
+          (let* ((set (cons 'any
+                            (append
+                             (and ranges
+                                  (list (apply #'concat (nreverse ranges))))
+                             (and chars
+                                  (list (apply #'string (nreverse chars))))
+                             (nreverse classes)))))
+            (if negated
+                (list 'not set)
+              set))))))))
 
 ;; Reverse a sequence, flatten any (seq ...) inside, and concatenate
 ;; adjacent strings.
@@ -255,7 +308,7 @@
                      (?!  . comment-delimiter)))))
     (when (not sym)
       (error "Unknown syntax code `%s'"
-             (xr--escape-string (char-to-string syntax-code))))
+             (xr--escape-string (char-to-string syntax-code) nil)))
     (let ((item (list 'syntax (cdr sym))))
       (if negated (list 'not item) item))))
 
@@ -466,7 +519,7 @@
           ;; makes it unlikely to be a serious error.
           (xr--report warnings (match-beginning 0)
                       (format "Escaped non-special character `%s'"
-                              (xr--escape-string (match-string 2))))))
+                              (xr--escape-string (match-string 2) nil)))))
 
        (t (error "Backslash at end of regexp"))))
 
@@ -494,6 +547,7 @@
 
 (defun xr--parse (re-string warnings)
   (with-temp-buffer
+    (set-buffer-multibyte t)
     (insert re-string)
     (goto-char (point-min))
     (let ((rx (xr--parse-alt warnings)))
@@ -517,10 +571,11 @@ Return a list of (OFFSET . COMMENT) where COMMENT applies 
at OFFSET
 in RE-STRING."
   (let ((warnings (list nil)))
     (xr--parse re-string warnings)
-    (reverse (car warnings))))
+    (sort (car warnings) #'car-less-than-car)))
 
 ;; Escape non-printing characters in a string for maximum readability.
-(defun xr--escape-string (string)
+;; If ESCAPE-BACKSLASH, also escape \, otherwise don't.
+(defun xr--escape-string (string escape-backslash)
   ;; Translate control and raw chars to escape sequences for readability.
   ;; We prefer hex escapes (\xHH) since that is usually what the user wants,
   ;; but use octal (\OOO) if a legitimate hex digit follows, as
@@ -532,8 +587,6 @@ in RE-STRING."
             (xdigit (substring s 1))
             (transl (assq c
                           '((?\" . "\\\"")
-                            (?\\ . "\\\\")
-                            (?\a . "\\a")
                             (?\b . "\\b")
                             (?\t . "\\t")
                             (?\n . "\\n")
@@ -542,10 +595,11 @@ in RE-STRING."
                             (?\r . "\\r")
                             (?\e . "\\e")))))
        (concat
-        (if transl
-            (cdr transl)
-          (format (if (zerop (length xdigit)) "\\x%02x" "\\%03o")
-                  c))
+        (cond (transl (cdr transl))
+              ((eq c ?\\)
+               (if escape-backslash "\\\\" "\\"))
+              ((zerop (length xdigit)) (format "\\x%02x" c))
+              (t (format (format "\\%03o" c))))
         xdigit)))
    string 'fixedcase 'literal))
 
@@ -563,7 +617,7 @@ in RE-STRING."
           (rest (mapcar #'xr--rx-to-string (cdr rx))))
       (concat "(" (mapconcat #'identity (cons first rest) " ") ")")))
    ((stringp rx)
-    (concat "\"" (xr--escape-string rx) "\""))
+    (concat "\"" (xr--escape-string rx t) "\""))
    (t (prin1-to-string rx))))
 
 (defun xr-pp-rx-to-str (rx)



reply via email to

[Prev in Thread] Current Thread [Next in Thread]