[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[elpa] externals/xr 6085257 1/3: Detect overlap in character alternative
From: |
Mattias Engdegård |
Subject: |
[elpa] externals/xr 6085257 1/3: Detect overlap in character alternatives |
Date: |
Fri, 1 Mar 2019 13:09:47 -0500 (EST) |
branch: externals/xr
commit 608525744b4b354b166b4f1b9143734d44460d97
Author: Mattias Engdegård <address@hidden>
Commit: Mattias Engdegård <address@hidden>
Detect overlap in character alternatives
Detecting duplication and overlap in character alternatives turns
out to be a fruitful way of finding bugs.
Avoid escaping \ in messages where it applies to single characters.
Increment version to 1.4.
---
xr-test.el | 34 +++++++----
xr.el | 202 +++++++++++++++++++++++++++++++++++++++----------------------
2 files changed, 149 insertions(+), 87 deletions(-)
diff --git a/xr-test.el b/xr-test.el
index b79c211..7beee0a 100644
--- a/xr-test.el
+++ b/xr-test.el
@@ -163,15 +163,17 @@
(should (equal (xr "[^]-c]")
'(not (any "]-c"))))
(should (equal (xr "[-^]")
- '(any "-" "^")))
+ '(any "^-")))
(should (equal (xr "[a-z-+/*%0-4[:xdigit:]]")
- '(any "a-z" "-" "+/*%" "0-4" xdigit)))
+ '(any "0-4a-z" "%*+/-" xdigit)))
(should (equal (xr "[^]A-Za-z-]*")
- '(zero-or-more (not (any "]" "A-Za-z" "-")))))
+ '(zero-or-more (not (any "A-Za-z" "]-")))))
(should (equal (xr "[+*%A-Ka-k0-3${-}]")
- '(any "+*%" "A-Ka-k0-3" "$" "{-}")))
+ '(any "0-3A-Ka-k{-}" "$%*+")))
(should (equal (xr "[^\\\\o][A-\\\\][A-\\\\-a]")
- '(seq (not (any "\\o")) (any "A-\\") (any "A-\\\\-a"))))
+ '(seq (not (any "\\o")) (any "A-\\") (any "A-a"))))
+ (should (equal (xr "[^A-FFGI-LI-Mb-da-eg-ki-ns-tz-v]")
+ '(not (any "A-FI-Ma-eg-ns-t" "G"))))
)
(ert-deftest xr-empty ()
@@ -197,7 +199,7 @@
'(seq bow (group (or "catch" "finally")) eow
(not (any "_")))))
(should (equal (xr "[ \t\n]*:\\([^:]+\\|$\\)")
- '(seq (zero-or-more (any " \t\n")) ":"
+ '(seq (zero-or-more (any "\t\n ")) ":"
(group (or (one-or-more (not (any ":")))
eol)))))
)
@@ -265,11 +267,11 @@
(should (equal (xr-lint "^**$")
'((1 . "Unescaped literal `*'"))))
(should (equal (xr-lint "a[\\\\[]")
- '((2 . "Escaped `\\' inside character alternative"))))
+ '((3 . "Duplicated `\\' inside character alternative"))))
(should (equal (xr-lint "\\{\\(+\\|?\\)\\[\\]\\}\\\t")
- '((0 . "Escaped non-special character `{'")
- (4 . "Unescaped literal `+'")
- (7 . "Unescaped literal `?'")
+ '((0 . "Escaped non-special character `{'")
+ (4 . "Unescaped literal `+'")
+ (7 . "Unescaped literal `?'")
(14 . "Escaped non-special character `}'")
(16 . "Escaped non-special character `\\t'"))))
(should (equal (xr-lint "\\}\\w\\a\\b\\%")
@@ -277,12 +279,18 @@
(4 . "Escaped non-special character `a'")
(8 . "Escaped non-special character `%'"))))
(should (equal (xr-lint "a?+b+?\\(?:c?\\)*d\\{3\\}+e*?\\{2,5\\}")
- '((2 . "Repetition of repetition")
+ '((2 . "Repetition of repetition")
(14 . "Repetition of repetition")
(25 . "Repetition of repetition"))))
(should (equal (xr-lint "[]-Qa-fz-t]")
- '((1 . "Reversed range `]-Q' matches nothing")
- (7 . "Reversed range `z-t' matches nothing"))))
+ '((1 . "Reversed range `]-Q' matches nothing")
+ (7 . "Reversed range `z-t' matches nothing"))))
+ (should (equal (xr-lint "[^A-FFGI-LI-Mb-da-eg-ki-ns-t33-7]")
+ '((5 . "Character `F' included in range `A-F'")
+ (10 . "Ranges `I-L' and `I-M' overlap")
+ (16 . "Ranges `a-e' and `b-d' overlap")
+ (22 . "Ranges `g-k' and `i-n' overlap")
+ (29 . "Character `3' included in range `3-7'"))))
)
(provide 'xr-test)
diff --git a/xr.el b/xr.el
index 853bfc6..3b804ad 100644
--- a/xr.el
+++ b/xr.el
@@ -3,7 +3,7 @@
;; Copyright (C) 2019 Free Software Foundation, Inc.
;; Author: Mattias Engdegård <address@hidden>
-;; Version: 1.3
+;; Version: 1.4
;; Keywords: lisp, maint, regexps
;; This program is free software; you can redistribute it and/or modify
@@ -77,20 +77,23 @@
(push (cons (1- position) message) (car warnings))))
(defun xr--parse-char-alt (negated warnings)
- (let ((set nil))
+ (let ((intervals nil)
+ (classes nil))
(cond
;; Initial ]-x range
- ((looking-at (rx "]-" (group (not (any "]")))))
- (if (>= (string-to-char (match-string 1)) ?\])
- (push (match-string 0) set)
- (xr--report warnings (point)
- (format "Reversed range `%s' matches nothing"
- (match-string 0))))
+ ((looking-at (rx "]-" (not (any "]"))))
+ (let ((end (aref (match-string 0) 2)))
+ (if (>= end ?\])
+ (push (vector ?\] end (point)) intervals)
+ (xr--report warnings (point)
+ (format "Reversed range `%s' matches nothing"
+ (match-string 0)))))
(goto-char (match-end 0)))
;; Initial ]
((looking-at "]")
- (push "]" set)
+ (push (vector ?\] ?\] (point)) intervals)
(forward-char 1)))
+
(while (not (looking-at "]"))
(cond
;; character class
@@ -101,71 +104,121 @@
lower multibyte nonascii print punct space
unibyte upper word xdigit)))
(error "No character class `%s'" sym))
- (push sym set)
+ (if (memq sym classes)
+ (xr--report warnings (point)
+ (format "Duplicated character class `[:%s:]'" sym))
+ (push sym classes))
(goto-char (match-end 0))))
;; character range
- ((looking-at (rx (not (any "]")) "-" (not (any "]"))))
- (let ((range (match-string 0)))
- ;; We render [a-z] as (any "a-z") instead of (any (?a . ?z))
- ;; for readability and brevity, and because the latter would
- ;; become (97 . 122) when printed.
- ;; TODO: Possibly convert "[0-9]" to digit, and
- ;; "[0-9a-fA-F]" (and permutations) to hex-digit.
- (cond
- ((<= (aref range 0) (aref range 2))
- (let ((prev (car set)))
- ;; Merge with preceding range if any.
- (if (and (stringp prev)
- (>= (length prev) 3)
- (eq (aref prev 1) ?-))
- (setq set (cons (concat prev range) (cdr set)))
- (push range set))))
- (t
+ ((looking-at (rx (group (not (any "]"))) "-" (group (not (any "]")))))
+ (let ((start (string-to-char (match-string 1)))
+ (end (string-to-char (match-string 2))))
+ (cond
+ ((<= start end)
+ (push (vector start end (point)) intervals))
+ (t
(xr--report warnings (point)
- (format "Reversed range `%s' matches nothing"
- range))))
+ (format "Reversed range `%s' matches nothing"
+ (match-string 0)))))
(goto-char (match-end 0))))
((looking-at (rx eos))
(error "Unterminated character alternative"))
;; plain character (including ^ or -)
(t
- (let* ((ch (following-char))
- (ch-str (char-to-string ch)))
- (cond
- ;; Duplicated \ are common enough for us to remove them (and warn).
- ((and (eq ch ?\\)
- (stringp (car set))
- (eq (string-to-char (substring (car set) -1)) ?\\))
- (xr--report warnings (1- (point))
- "Escaped `\\' inside character alternative"))
- ;; Merge with the previous string if neither contains "-".
- ((and (stringp (car set))
- (not (eq ch ?-))
- (not (string-match "-" (car set))))
- (setq set (cons (concat (car set) ch-str) (cdr set))))
- (t
- (push ch-str set))))
+ (let ((ch (following-char)))
+ (push (vector ch ch (point)) intervals))
(forward-char 1))))
(forward-char 1) ; eat the ]
- (cond
- ;; Non-negated single-char set, like [$]
- ((and (not negated)
- (= (length set) 1)
- (stringp (car set))
- (= (length (car set)) 1))
- (car set))
- ;; Single named class set, like [[:space:]]
- ((and (= (length set) 1)
- (symbolp (car set)))
- (if negated
- (list 'not (car set))
- (car set)))
- ;; Anything else.
- (negated
- (list 'not (cons 'any (reverse set))))
- (t
- (cons 'any (reverse set))))))
+
+ ;; Detect duplicates and overlapping intervals.
+ (let* ((sorted
+ (sort (nreverse intervals)
+ (lambda (a b) (< (aref a 0) (aref b 0)))))
+ (s sorted))
+ (while (cdr s)
+ (let ((this (car s))
+ (next (cadr s)))
+ (when (>= (aref this 1) (aref next 0))
+ (let ((message
+ (cond
+ ;; Duplicate character: drop it and warn.
+ ((and (eq (aref this 0) (aref this 1))
+ (eq (aref next 0) (aref next 1)))
+ (setcdr s (cddr s))
+ (format "Duplicated `%c' inside character alternative"
+ (aref this 0)))
+ ;; Duplicate range: drop it and warn.
+ ((and (eq (aref this 0) (aref next 0))
+ (eq (aref this 1) (aref next 1)))
+ (setcdr s (cddr s))
+ (format "Duplicated `%c-%c' inside character alternative"
+ (aref this 0) (aref this 1)))
+ ;; Character in range: drop it and warn.
+ ((eq (aref this 0) (aref this 1))
+ (setcar s next)
+ (setcdr s (cddr s))
+ (format "Character `%c' included in range `%c-%c'"
+ (aref this 0) (aref next 0) (aref next 1)))
+ ;; Same but other way around.
+ ((eq (aref next 0) (aref next 1))
+ (setcdr s (cddr s))
+ (format "Character `%c' included in range `%c-%c'"
+ (aref next 0) (aref this 0) (aref this 1)))
+ ;; Overlapping ranges: merge and warn.
+ (t
+ (let ((this-end (aref this 1)))
+ (aset this 1 (max (aref this 1) (aref next 1)))
+ (setcdr s (cddr s))
+ (format "Ranges `%c-%c' and `%c-%c' overlap"
+ (aref this 0) this-end
+ (aref next 0) (aref next 1)))))))
+ (xr--report warnings (max (aref this 2) (aref next 2))
+ (xr--escape-string message nil)))))
+ (setq s (cdr s)))
+
+ ;; Gather ranges and single characters separately.
+ ;; We make no attempts at merging adjacent intervals/characters,
+ ;; nor at splitting short intervals such as "a-b"; if the user
+ ;; wrote it that way, there was probably a reason for it.
+ (let ((ranges nil)
+ (chars nil))
+ (mapc (lambda (interv)
+ (if (eq (aref interv 0) (aref interv 1))
+ (push (aref interv 0) chars)
+ (push (string (aref interv 0) ?- (aref interv 1))
+ ranges)))
+ sorted)
+
+ (cond
+ ;; Non-negated single-char set, like [$]: make a string.
+ ((and (= (length chars) 1)
+ (not negated)
+ (null ranges)
+ (null classes))
+ (string (car chars)))
+ ;; Single named class, like [[:space:]]: use the symbol.
+ ((and (= (length classes) 1)
+ (null chars)
+ (null ranges))
+ (if negated
+ (list 'not (car classes))
+ (car classes)))
+ ;; Anything else: produce (any ...)
+ (t
+ ;; Put dash last of all single characters.
+ (when (memq ?- chars)
+ (setq chars (cons ?- (delq ?- chars))))
+ (let* ((set (cons 'any
+ (append
+ (and ranges
+ (list (apply #'concat (nreverse ranges))))
+ (and chars
+ (list (apply #'string (nreverse chars))))
+ (nreverse classes)))))
+ (if negated
+ (list 'not set)
+ set))))))))
;; Reverse a sequence, flatten any (seq ...) inside, and concatenate
;; adjacent strings.
@@ -255,7 +308,7 @@
(?! . comment-delimiter)))))
(when (not sym)
(error "Unknown syntax code `%s'"
- (xr--escape-string (char-to-string syntax-code))))
+ (xr--escape-string (char-to-string syntax-code) nil)))
(let ((item (list 'syntax (cdr sym))))
(if negated (list 'not item) item))))
@@ -466,7 +519,7 @@
;; makes it unlikely to be a serious error.
(xr--report warnings (match-beginning 0)
(format "Escaped non-special character `%s'"
- (xr--escape-string (match-string 2))))))
+ (xr--escape-string (match-string 2) nil)))))
(t (error "Backslash at end of regexp"))))
@@ -494,6 +547,7 @@
(defun xr--parse (re-string warnings)
(with-temp-buffer
+ (set-buffer-multibyte t)
(insert re-string)
(goto-char (point-min))
(let ((rx (xr--parse-alt warnings)))
@@ -517,10 +571,11 @@ Return a list of (OFFSET . COMMENT) where COMMENT applies
at OFFSET
in RE-STRING."
(let ((warnings (list nil)))
(xr--parse re-string warnings)
- (reverse (car warnings))))
+ (sort (car warnings) #'car-less-than-car)))
;; Escape non-printing characters in a string for maximum readability.
-(defun xr--escape-string (string)
+;; If ESCAPE-BACKSLASH, also escape \, otherwise don't.
+(defun xr--escape-string (string escape-backslash)
;; Translate control and raw chars to escape sequences for readability.
;; We prefer hex escapes (\xHH) since that is usually what the user wants,
;; but use octal (\OOO) if a legitimate hex digit follows, as
@@ -532,8 +587,6 @@ in RE-STRING."
(xdigit (substring s 1))
(transl (assq c
'((?\" . "\\\"")
- (?\\ . "\\\\")
- (?\a . "\\a")
(?\b . "\\b")
(?\t . "\\t")
(?\n . "\\n")
@@ -542,10 +595,11 @@ in RE-STRING."
(?\r . "\\r")
(?\e . "\\e")))))
(concat
- (if transl
- (cdr transl)
- (format (if (zerop (length xdigit)) "\\x%02x" "\\%03o")
- c))
+ (cond (transl (cdr transl))
+ ((eq c ?\\)
+ (if escape-backslash "\\\\" "\\"))
+ ((zerop (length xdigit)) (format "\\x%02x" c))
+ (t (format (format "\\%03o" c))))
xdigit)))
string 'fixedcase 'literal))
@@ -563,7 +617,7 @@ in RE-STRING."
(rest (mapcar #'xr--rx-to-string (cdr rx))))
(concat "(" (mapconcat #'identity (cons first rest) " ") ")")))
((stringp rx)
- (concat "\"" (xr--escape-string rx) "\""))
+ (concat "\"" (xr--escape-string rx t) "\""))
(t (prin1-to-string rx))))
(defun xr-pp-rx-to-str (rx)