--- regexp-opt.el 2019-02-16 23:59:22.000000000 -0200 +++ regexp-opt3.el 2019-04-12 10:14:25.871968393 -0300 @@ -1,4 +1,4 @@ -;;; regexp-opt.el --- generate efficient regexps to match strings -*- lexical-binding: t -*- +;;; regexp-opt2.el --- generate efficient regexps to match strings -*- lexical-binding: t -*- ;; Copyright (C) 1994-2019 Free Software Foundation, Inc. @@ -37,10 +37,10 @@ ;; For example: ;; ;; (let ((strings '("cond" "if" "when" "unless" "while" -;; "let" "let*" "progn" "prog1" "prog2" -;; "save-restriction" "save-excursion" "save-window-excursion" -;; "save-current-buffer" "save-match-data" -;; "catch" "throw" "unwind-protect" "condition-case"))) +;; "let" "let*" "progn" "prog1" "prog2" +;; "save-restriction" "save-excursion" "save-window-excursion" +;; "save-current-buffer" "save-match-data" +;; "catch" "throw" "unwind-protect" "condition-case"))) ;; (concat "(" (regexp-opt strings t) "\\>")) ;; => "(\\(c\\(atch\\|ond\\(ition-case\\)?\\)\\|if\\|let\\*?\\|prog[12n]\\|save-\\(current-buffer\\|excursion\\|match-data\\|restriction\\|window-excursion\\)\\|throw\\|un\\(less\\|wind-protect\\)\\|wh\\(en\\|ile\\)\\)\\>" ;; @@ -71,18 +71,15 @@ ;; your code for such changes to have effect in your code. ;; Originally written for font-lock.el, from an idea from Stig's hl319.el, with -;; thanks for ideas also to Michael Ernst, Bob Glickstein, Dan Nicolaescu and -;; Stefan Monnier. +;; thanks for ideas also to Michael Ernst, Bob Glickstein, Dan Nicolaescu, +;; Stefan Monnier and Miguel Frasson. + ;; No doubt `regexp-opt' doesn't always produce optimal regexps, so code, ideas ;; or any other information to improve things are welcome. -;; -;; One possible improvement would be to compile '("aa" "ab" "ba" "bb") -;; into "[ab][ab]" rather than "a[ab]\\|b[ab]". I'm not sure it's worth -;; it but if someone knows how to do it without going through too many -;; contortions, I'm all ears. ;;; Code: +;; original ;;;###autoload (defun regexp-opt (strings &optional paren) "Return a regexp to match a string in the list STRINGS. @@ -117,12 +114,12 @@ (defun simplified-regexp-opt (strings &optional paren) (let ((parens (cond ((stringp paren) (cons paren \"\\\\)\")) - ((eq paren \\='words) \\='(\"\\\\\\=<\\\\(\" . \"\\\\)\\\\>\")) - ((eq paren \\='symbols) \\='(\"\\\\_<\\\\(\" . \"\\\\)\\\\_>\")) - ((null paren) \\='(\"\\\\(?:\" . \"\\\\)\")) - (t \\='(\"\\\\(\" . \"\\\\)\"))))) + ((eq paren 'words) '(\"\\\\\\=<\\\\(\" . \"\\\\)\\\\>\")) + ((eq paren 'symbols) '(\"\\\\_<\\\\(\" . \"\\\\)\\\\_>\")) + ((null paren) '(\"\\\\(?:\" . \"\\\\)\")) + (t '(\"\\\\(\" . \"\\\\)\"))))) (concat (car paren) - (mapconcat \\='regexp-quote strings \"\\\\|\") + (mapconcat 'regexp-quote strings \"\\\\|\") (cdr paren))))" (save-match-data ;; Recurse on the sorted list. @@ -141,6 +138,80 @@ (t re))))) ;;;###autoload +(defun regexp-opt2 (strings &optional paren) + "Return a regexp to match a string in the list STRINGS. +Each string should be unique in STRINGS and should not contain +any regexps, quoted or not. Optional PAREN specifies how the +returned regexp is surrounded by grouping constructs. + +The optional argument PAREN can be any of the following: + +a string + the resulting regexp is preceded by PAREN and followed by + \\), e.g. use \"\\\\(?1:\" to produce an explicitly numbered + group. + +`words' + the resulting regexp is surrounded by \\=\\<\\( and \\)\\>. + +`symbols' + the resulting regexp is surrounded by \\_<\\( and \\)\\_>. + +non-nil + the resulting regexp is surrounded by \\( and \\). + +nil + the resulting regexp is surrounded by \\(?: and \\), if it is + necessary to ensure that a postfix operator appended to it will + apply to the whole expression. + +The resulting regexp is equivalent to but usually more efficient +than that of a simplified version: + + (defun simplified-regexp-opt (strings &optional paren) + (let ((parens + (cond ((stringp paren) (cons paren \"\\\\)\")) + ((eq paren \\='words) \\='(\"\\\\\\=<\\\\(\" . \"\\\\)\\\\>\")) + ((eq paren \\='symbols) \\='(\"\\\\_<\\\\(\" . \"\\\\)\\\\_>\")) + ((null paren) \\='(\"\\\\(?:\" . \"\\\\)\")) + (t \\='(\"\\\\(\" . \"\\\\)\"))))) + (concat (car paren) + (mapconcat \\='regexp-quote strings \"\\\\|\") + (cdr paren))))" + (let* ((regexp-opt-branches-alist nil); storage of `regexp-opt-branch' + ; made local + (root-fa nil)) + (dolist (s strings) + (setq root-fa (regexp-opt-add-string-to-node s root-fa))) + ;; + ;; Simplification + ;; + (regexp-opt-perform-simplification root-fa) + ;; end of simplification + ;; + ;; output regexp stored in root-fa + ;; return according to PAREN and type + ;; + ;;(regexp-opt-print-node root-fa 'graphviz) + (let* ((re-type (regexp-opt-typed-regexp root-fa)) + (type (cdr re-type)) + (re (if (or (eq type 'sequence) (eq type 'empty)) + (concat "\\(?:" (car re-type) "\\)") + (car re-type))) + ;; re-group is the re beginning with "\\(?:" + (re-group (if (eq type 'single) (concat "\\(?:" re "\\)") re)) + ;; open is the group beginning string if necessary + (open (cond ((stringp paren) paren) + ((eq paren 'words) "\\<\\(") + ((eq paren 'symbols) "\\_<\\(") + (paren "\\("))) + (close (cond ((eq paren 'words) "\\>") + ((eq paren 'symbols) "\\_>") + (t "")))) + ; if OPEN, replace "\\(?:" by OPEN + (if open (concat open (substring re-group 4) close) re)))) + +;;;###autoload (defun regexp-opt-depth (regexp) "Return the depth of REGEXP. This means the number of non-shy regexp grouping constructs @@ -151,17 +222,18 @@ ;; Count the number of open parentheses in REGEXP. (let ((count 0) start last) (while (string-match "\\\\(\\(\\?[0-9]*:\\)?" regexp start) - (setq start (match-end 0)) ; Start of next search. - (when (and (not (match-beginning 1)) - (subregexp-context-p regexp (match-beginning 0) last)) - ;; It's not a shy group and it's not inside brackets or after - ;; a backslash: it's really a group-open marker. - (setq last start) ; Speed up next regexp-opt-re-context-p. - (setq count (1+ count)))) + (setq start (match-end 0)) ; Start of next search. + (when (and (not (match-beginning 1)) + (subregexp-context-p regexp (match-beginning 0) last)) + ;; It's not a shy group and it's not inside brackets or after + ;; a backslash: it's really a group-open marker. + (setq last start) ; Speed up next regexp-opt-re-context-p. + (setq count (1+ count)))) count))) ;;; Workhorse functions. +;; kept for the original `regexp-opt' (defun regexp-opt-group (strings &optional paren lax) "Return a regexp to match a string in the sorted list STRINGS. If PAREN non-nil, output regexp parentheses around returned regexp. @@ -256,63 +328,893 @@ close-group)))))))))) +;; Rationale for simplification +;; ============================ +;; +;; If a list of strings could be arranged in a regexp splitable in +;; units, and we arrange these strings in finite automata (oriented +;; graph with labeled arrows and nameless nodes), from a point a set +;; of subgraphs will reproduce same pattern (are `equal'), so we +;; simplify by making the `equal's become `eq's, merginging nodes, +;; what closes a biurcation, meaning a smaller alternative group in +;; regexp. +;; +;; Implementation of optimization by finite automata FA +;; ==================================================== +;; +;; Regexp automata are a type of directed graph with labeled arrows. +;; +;; NODE = (ARROW1 ARROW2 ...) +;; A node is a list of arrows pointing out from it. +;; +;; ARROW = (CHAR . NODE) +;; An arrow is a cons of its label and a node pointed by it. +;; +;; Each node must have a unique arrow with CHAR, must be sorted +;; because we will compare nodes with `equal'. +;; +;; Since the empty string "" must be included in a natural way, all +;; strings are included in FAs like a null-terminated strings, what +;; means "ending with epsilon arrow" in FA jargon. The string "" is +;; just an epsilon arrow (arrow with empty label). The char 0 +;; represents the `epsilon' arrows of FA. +;; +;; A path in the graph from start node to end node (nil) represents a +;; string with that characteres, epsilon arrows (char=0) adding no +;; chars to string. +;; +;; Charsets, alternative groups or `?' constructions make a +;; bifurcation in the path. Example: regexp "a?\\(b\\|cd\\)e" is +;; implemented by the FA +;; +;; epsilon-| +;; / v +;; >1 --a--> 2 ---b---> 3 --e--> 5 --epsilon--> nil +;; | ^ +;; \-c-> 4 -d-/ +;; +;; When translating strings to FA, the "unique arrow with CHAR" rule +;; already groups similarities at the beginning of regexps. For +;; example, from strings ("abd" "acd"), we get the FA: +;; +;; >1 --a--> 2 --b--> 3 --d--> 4 --epsilon--> nil +;; | ^ +;; +---c--> 5 --d--> 6 --epsilon----/ +;; +;; This FA is internally stored as +;; ((?a (?b (?d (0))) +;; (?c (?d (0))))) +;; +;; Note that not all such graphs come from a regexp automaton. We will +;; not allow a graph with arrows connecting branches of an +;; alternative. For example, it is not allowed an arrow from 3 +;; pointing to 5, getting the following graph: +;; +;; >1 --a--> 2 --b--> 3 --c--> 4 --f--> F +;; | g ^ (! not allowed graph) +;; | V | ( sibling branches connected) +;; +---d--> 5 --e--> 6 +;; +;; Althought we could build a (complicated) regexp from it (namely +;; "a\\(b\\(c\\|ge\\)\\|de\\)f" we could accomplish it with a graph +;; without connections between sibling branches (detaching 6 from 5) +;; +;; +--------c--------+ +;; | v +;; >1 --a--> 2 --b--> 3 --g--> 5 --e--> 4 --f--> F +;; | ^ +;; | | +;; +-----d----> 6 ------e-----+ +;; +;; Keeping this in mind, it is only allowed merge nodes that have the +;; same `source' and `sink' nodes, that is, are in sibling branches. +;; In the example above, nodes 6, 3 and 4 are in sibling branches +;; (source=2, sink=4), but not 5 (its source is 3). If we merge 5 and +;; 6 we return to the previous not allowed graph. +;; +;; *** Simplification 1: merge `equal' nodes *** +;; +;; Start from strings ("abd" "acd"), we get the FA: +;; +;; >1 --a--> 2 --b--> 3 --d--> 4 --epsilon--> nil +;; | ^ +;; +---c--> 5 --d--> 6 --epsilon----/ +;; +;; Look at nodes 3 and 5 in sibling branches: both trees starting from +;; 3 and 5 (d -> epsilon ->), which are internally stored as (?d (0)), +;; are `equal' but not `eq'; so we "merge" nodes 3 and 5, making arrow +;; "c" from 2 point node 3 instead of 5, so these trees are `eq' now: +;; +;; >1 --a--> 2 --b--> 3 --d--> 4 --epsilon--> nil +;; | ^ +;; +---c---/ +;; +;; Internally: ((?a (?b #1) +;; (?c #1))) where #1=(?d (0)) +;; +;; No other nodes are `equal'. We finished. +;; This FA translates to regexp "a[bc]d". +;; +;; *** Simplification 2: node equal to subset of its source +;; +;; This simplification produces `?' constructions. Start from strings +;; ("abcf" "adef" "af") +;; +;; +---f--> 9 ----------epsilon -------------\ +;; | v +;; >1 --a--> 2 --b--> 3 --c--> 4 --f--> 5 --epsilon--> nil +;; | ^ +;; +---d--> 6 --e--> 7 --f--> 8 --epsilon----/ +;; +;; After simplification 1 (merge 4 and 7): +;; +;; +---f--> 9 ----------epsilon -------------\ +;; | v +;; >1 --a--> 2 --b--> 3 --c--> 4 --f--> 5 --epsilon--> nil +;; | ^ +;; +---d--> 6 --e---/ +;; +;; (This is the regexp "a\\(\\(bc\\|de\\)f\\|f\\)" ) +;; +;; Now, notice that node 4 = {f -> epsilon ->}, internally ((?f (0))), +;; is `equal' to a subset of arrows of 2, its source. We could split +;; node 2 with an epsilon arrow in two nodes +;; +;; +--f--> 9 --epsilon--> nil epsilon--> 10 --f--> 9 --epsilon--> nil +;; | | +;; 2 --b--> ... is equivalent to 2 --b--> ... +;; | | +;; +---d--> ... +---d--> ... +;; +;; This new node 10 also has source 2, like 4, and is `equal' to 4, so +;; we merge 10 and 4 by simplification 1 (node 10 disappears again!): +;; +;; +---epsilon-------+ +;; | v +;; >1 --a--> 2 --b--> 3 --c--> 4 --f--> 5 --epsilon--> nil +;; | ^ +;; +---d--> 6 --e--> 7 +;; +;; This FA yields the simpler regexp "a\\(bc\\|de\\)?f" +;; +;; Summarizing, if a node N is similar to a subtree os its source S, +;; we delete the corresponding arrows from S and add an epsilon arrow +;; from S to N. +;; +;; Sibling branches +;; ================ +;; +;; Sibling branches are those that start in the same node (source +;; node) and end in the same node (sink node). This concept is +;; related to build alternative groups. +;; +;; The source node of a node S is the one most nested where one but +;; not all paths from it lead to N. The sink is analogous with arrows +;; in oposite directions. Source and sink nodes mark beginning and +;; end of an alternative group of the corresponding regexp. For +;; example, in the FA below +;; +;; +--------c--------+ +;; | v +;; >1 --a--> 2 --b--> 3 --g--> 5 --e--> 4 --f--> 7 --epsilon--> nil +;; | ^ +;; | | +;; +-----d----> 6 ------e-----+ +;; +;; for regexp "a\\(b\\(c\\|ge\\)\\|de\\)f", there are 2 sets of (non +;; trivial) sibling branches: {2 -> 4} and {3 -> 4}, this last nested +;; in the previous. Here, nodes 3 and 6 are in sibling branches of +;; {2 -> 4}. 5 does not belong to {2 -> 4} because its source is 3, +;; not 2. Nodes 2 and 4 belong to the trivial root branch {1 -> nil}. +;; +;; The initial conversion from string to nodes only creates sources +;; (nil is the only sink). The simplifications explained only create +;; sinks, never sources. + +;; It may happen that a source has several sinks, so we need +;; to compute all possible intersections to compute all sinks. + +;; Example of sink detection: +;; +---a---> 2 --x---\ +;; | v +;; | --b--> 3 --y--> 4 ---\ +;; | / v +;; >1 ------c-----> 5 --e--> 7 --epsilon--> nil +;; \ ^ +;; \-d--> 6 -f-/ +;; For source 1: +;; path for a = (2 4 7 nil) +;; path for b = (3 4 7 nil) +;; path for c = (5 7 nil) +;; path for d = (6 5 7 nil) +;; Intersections +;; ((7 nil) (7 nil) (7 nil) (4 7 nil) (7 nil) (7 nil) (7 nil) +;; (7 nil) (7 nil) (7 nil) (5 7 nil)) +;; So we detect the sinks: 4, 5, 7 + +(defun regexp-opt-perform-simplification (fa) + "Perform simplifications to FA, by side effects." + ;; We need to substitute nodes that are `equal' to be `eq', so we + ;; need to know the "parents" of that node to set cdr's of arrows. + ;; Se we make a `parents-alist' that associates nodes with all + ;; nodes with an arrow to it. + ;; + ;; Simplification happens at source nodes (nodes with more than 1 + ;; arrow from it), so we gather a list of sources too. Sources + ;; will be sorted from most external to most nested. + (let ((sources) + (parents-alist) + ;; nodes to be processed: list of (node parent depth) + (nodes (list (list fa t 0))) + processed-sources + node-parent-depth node depth parent + new-node source-without-arrows) + (while nodes + (setq node-parent-depth (pop nodes) + node (car node-parent-depth) + parent (cadr node-parent-depth) + depth (caddr node-parent-depth)) + (while (and node (= (length node) 1)) ; if 1 arrow, follow up + ; to a source + (push parent (alist-get node parents-alist)) + (setq parent node + node (cdar node))) + (when node ; must be a source, push all subnodes to `nodes' + (push (cons node depth) sources) + (push parent (alist-get node parents-alist)) + (dolist (arrow node) + (push (list (cdr arrow) node (1+ depth)) nodes)))) + ;; Sort sources by depth, from most nested to most external. + (setq sources (mapcar #'car (sort sources + (lambda (x y) (> (cdr x) (cdr y)))))) + ;; start of simplification + ;; + (dolist (source sources) + (setq simplified t) + ;; repeat simplification until source is fully simplified + (while simplified + ;; debug + ;;(regexp-opt-print-node fa 'graphviz "png" "/home/sme/re-temp") + ;;(read-from-minibuffer "Press ENTER to continue.") + (setq simplified nil) + ;; It's safe to process all branches completely + (dolist (branch (regexp-opt-branch source t)) + (setq nodes (cdr branch)) + (while nodes + (setq to (pop nodes)) + ;; Simplification 1 + (dolist (from nodes) + (when (equal from to) + (setq simplified t) + ;; make all arrows that pointed to FROM now point + ;; to TO and update parents-alist of TO + (dolist (parent (alist-get from parents-alist)) + (dolist (arrow parent) + (when (eq (cdr arrow) from) + (setcdr arrow to))) + (or (memq parent (alist-get to parents-alist)) + (push parent (alist-get to parents-alist)))) + ;; verify if from is last sink of some previous source + ;; if so, must recompute branch for that source + (catch 'from-last-sink + (dolist (prev-source processed-sources) + (when (eq from + (caar (last (regexp-opt-branch prev-source)))) + (regexp-opt-branch prev-source t) + (throw 'from-last-sink nil)))))) + ;; eliminate all simplified nodes from `nodes' + (setq nodes (delete to nodes))))) + ;; Simplification 2 + ;; + ;; need to create a epsilon arrow to subnode: if SOURCE + ;; has arrow 0, TO must have an arrow 0 so it will be + ;; removed from SOURCE in this simplification + (catch 'simplification2 + (dolist (branch (regexp-opt-branch source)) + (dolist (node (cdr branch)) + (unless (and (assq 0 source) (not (assq 0 node))) + ;; sorted list of chars of NODE: + ;; CHARS = (mapcar 'car node) + ;; list of same arrows in SOURCE: + ;; NEW-NODE = (mapcar (lambda (c) (assq c source)) + ;; CHARS) + (setq new-node (mapcar (lambda (char) + (assq char source)) + (mapcar 'car node))) + (when (equal new-node node) + (setq simplified t) + ;; remove arrows of source commom to node, + ;; keeping cons of SOURCE, add arrow (0 . node) + (setf source-without-arrows (regexp-opt-difference + source new-node #'eq) + (car source) (cons 0 node) + (cdr source) source-without-arrows) + ;; add source as parrent of node + (push source (alist-get node parents-alist)) + ;; recompute branch because last-sink may have changed + (regexp-opt-branch source t) + ;; when simplified, no need to look for another simplif. + (throw 'simplification2 nil)))))) + (push source processed-sources)))) + +;; (defun regexp-opt-perform-simplification (fa) +;; "Perform simplifications to FA, by side effects." +;; ;; We need to substitute nodes that are `equal' to be `eq', so we +;; ;; need to know the "parents" of that node to set cdr's of arrows. +;; ;; Se we make a `parents-alist' that associates nodes with all +;; ;; nodes with an arrow to it. +;; ;; +;; ;; Simplification happens at source nodes (nodes with more than 1 +;; ;; arrow from it), so we gather a list of sources too. Sources +;; ;; will be sorted from most external to most nested. +;; (let ((sources) +;; (parents-alist) +;; ;; nodes to be processed: list of (node parent depth) +;; (nodes (list (list fa t 0))) +;; node-parent-depth node depth parent) +;; (while nodes +;; (setq node-parent-depth (pop nodes) +;; node (car node-parent-depth) +;; parent (cadr node-parent-depth) +;; depth (caddr node-parent-depth)) +;; (while (and node (= (length node) 1)) ; if 1 arrow, follow up +;; ; to a source +;; (push parent (alist-get node parents-alist)) +;; (setq parent node +;; node (cdar node))) +;; (when node ; must be a source, push all subnodes to `nodes' +;; (push (cons node depth) sources) +;; (push parent (alist-get node parents-alist)) +;; (dolist (arrow node) +;; (push (list (cdr arrow) node (1+ depth)) nodes)))) +;; ;; Sort sources by depth, from most nested to most external. +;; (setq sources (mapcar #'car (sort sources +;; (lambda (x y) (> (cdr x) (cdr y)))))) +;; ;; start of simplification +;; (let* ((simplify t) +;; new-node source-without-arrows) +;; ;; redo simplifications until no simplification is made +;; (while simplify +;; (setq simplify nil +;; ;; force branch recomputation +;; regexp-opt-branches-alist nil) +;; (dolist (source sources) +;; (setq source-simplified nil) +;; ;; It's safe to process all branches completely +;; (dolist (branch (regexp-opt-branch source)) +;; (setq sink (car branch) +;; nodes (cdr branch)) +;; (while nodes +;; (setq to (pop nodes)) +;; ;; Simplification 1 +;; (dolist (from nodes) +;; (when (equal from to) +;; (setq simplify t) +;; ;; make all arrows that pointed to FROM now point +;; ;; to TO and update parents-alist of TO +;; (dolist (parent (alist-get from parents-alist)) +;; (dolist (arrow parent) +;; (when (eq (cdr arrow) from) +;; (setcdr arrow to))) +;; (or (memq parent (alist-get to parents-alist)) +;; (push parent (alist-get to parents-alist)))) +;; ;; vanish FROM and nested sources from SOURCES +;; (when (> (length from) 1) +;; (setq sources (regexp-opt-remove-source from sources))))) +;; ;; eliminate all simplified nodes from `nodes' +;; (setq nodes (delete to nodes)) +;; ;; Simplification 2 +;; ;; +;; ;; need to create a epsilon arrow to subnode: if SOURCE +;; ;; has arrow 0, TO must have an arrow 0 so it will be +;; ;; removed from SOURCE in this simplification +;; (unless (and (assq 0 source) (not (assq 0 to))) +;; ;; sorted list of chars of TO: +;; ;; CHARS = (mapcar 'car to) +;; ;; list of same arrows in SOURCE: +;; ;; NEW-NODE = (mapcar (lambda (c) (assq c source)) +;; ;; CHARS) +;; (setq new-node (mapcar (lambda (char) +;; (assq char source)) +;; (mapcar 'car to))) +;; (when (equal new-node to) +;; (setq simplify t) +;; ;; remove arrows of source commom to node, +;; ;; keeping cons of SOURCE, add arrow (0 . to) +;; (setf source-without-arrows (regexp-opt-difference +;; source new-node #'eq) +;; (car source) (cons 0 to) +;; (cdr source) source-without-arrows) +;; ;; add source as parrent of node +;; (push source (alist-get to parents-alist))))))))))) + +(defun regexp-opt-add-string-to-node (s node &optional position) + "Add string S to NODE and return NODE. +POSITION is the start position of S considered, 0 if POSITION is +nil. Arrows are inserted sorted by sorted by char." + (let* ((len (length s)) + (pos (if position position 0)) + (char (if (> len pos) (aref s pos) 0)) ; epsilon arrow if pos=len + (arrow (assq char node)) + (child-node (when (> len pos) + (regexp-opt-add-string-to-node s (cdr arrow) + (1+ pos))))) + (cond ((and node arrow) (setcdr arrow child-node) node) + (node (sort (cons (cons char child-node) node) + (lambda (x y) (< (car x) (car y))))) + (t (list (cons char child-node)))))) + +(defvar regexp-opt-branches-alist nil + "Variable holding branch computation or other. +Before simplification, holds source information. +During simplification, holds info that can be reused until a +simplification changes scenario.") + +(defun regexp-opt-branch (source &optional recompute) + "Return a list of branches associated with SOURCE. +The return value is a list of (SINK . NODES). SINK is a node +that finishes a bifurcation started in SOURCE. A SOURCE may have +several sinks. Both SOURCE and SINK do not belong to NODES. The +order of sinks in the returned value is from the most nested to +the most external. + +If RECOMPUTE is nonnil, recomputes branch for SOURCE. If +RECOMPUTE is the symbol `recursive', recompute recursively all +nested sources encountered." + (unless (> (length source) 1) + (error "Should be a list with at least 2 elements: %s" source)) + (let* ((branch-info (alist-get source regexp-opt-branches-alist)) + (recur (when (eq recompute 'recursive) 'recursive)) + (last-sink (caar (last branch-info)))) ; reuse last-sink from + ; previous computation + ; to limit path following + (when (or recompute + (not branch-info)) + ;; + ;; compute branch-info anew + ;; + (let (path paths + node nodes + sink sinks + intersections) + (dolist (arrow source) ; looping in all arrows of each source + (setq node (cdr arrow) + path (list node)) + (while (and node (not (eq node last-sink))) + (setq node (if (= (length node) 1) + (cdar node) ; 1 arrow, follow arrow + ;; a source; its most external sink is the car + ;; of last branch-info + (caar (last (regexp-opt-branch node recur))))) + (push node path)) + ;; save on paths list + (push (nreverse path) paths)) + ;; Compute intersections of all paths to compute sinks. + ;; + ;; To detect the nesting, we sort by intersection length: the + ;; most external, shorter is the intersection. + (setq intersections (sort (regexp-opt-all-intersections t paths) + (lambda (x y) (> (length x) (length y)))) + sinks (regexp-opt-uniq (mapcar 'car intersections)) + ;; make branch-info an alist in same order that sinks + branch-info (mapcar #'list sinks)) + ;; Now select which nodes are in corresponding branches + (dolist (path paths) + (setq nodes nil) + (while path + (while (and path (not (memq (car path) sinks))) + (push (pop path) nodes)) + (when path ; means (car path) is in `sinks' + (setq sink (pop path)) + ;; add elements of `nodes' to assq of `sink' in `branch-info' + (setf (alist-get sink branch-info) + (append nodes (alist-get sink branch-info))) + ;; this sink is a node for next sink, if any + (setq nodes (list sink))))) + ;; eliminate duplicated nodes in branches + (dolist (branch branch-info) + (setcdr branch (regexp-opt-uniq (cdr branch)))) + ;; end of branch-info computation + ;; + ;; save branch-info to `regexp-opt-branches-alist' + (setf (alist-get source regexp-opt-branches-alist) branch-info))) + branch-info)) + +(defun regexp-opt-typed-regexp (node) + "Return (REGEXP . TYPE) corresponding to NODE. +REGEXP is a string and TYPE is one of the symbols: +empty = empty; +single = single char or charset or ? construction; +group = \\(?: ... \\); +sequence = sequence of singles. + +A regexp of type `single' or `group' can be followed by * or + +contructions." + (let ((re "") + (type 'empty) + re-type-lastsink type-s) + (while node + (if (= (length node) 1) + ;; 1 arrow, follow it + (if (= (caar node) 0) ; epsilon arrow, skip node + (setq node (cdar node)) + (setq re (concat re (regexp-quote (char-to-string (caar node)))) + type (if (eq type 'empty) 'single 'sequence) + node (cdar node))) + ;; else, a source + (setq re-type-lastsink (regexp-opt-typed-regexp-source node) + ;; re-type-lastsink = (RE TYPE LAST-SINK) for source NODE + re (concat re (car re-type-lastsink)) + type-s (cadr re-type-lastsink) + type (cond ((eq type 'empty) type-s) + ((eq type-s 'empty) type) + (t 'sequence)) + node (caddr re-type-lastsink)))) + (cons re type))) + +(defun regexp-opt-typed-regexp-source (source) + "Return (REGEXP TYPE LAST-SINK) corresponding to SOURCE. +REGEXP and TYPE are as in `regexp-opt-typed-regexp'." + ;; Idea of implementation: + ;; + ;; A source can have nested sources. The most nested sources + ;; do not depend on other sources, so starting with the most + ;; nested sources to the most external, process each source. + ;; The regexp must be the regexp for the most external source + ;; and sink. + ;; + ;; For each source, the most nested sinks do not depend on + ;; others, only on arrow paths. So we process all arrow paths + ;; first and then resolve the sinks in order from the most + ;; nested to the most external. + (let* ((branch-info (regexp-opt-branch source)) + ;; sinks are also stored in `branch-info' + (sinks (mapcar #'car branch-info)) + (last-sink (car (last sinks))) + (sink-chars-alist nil) + (sink-re-alist nil) + ;; `chars-node-list' is a list of (LIST-OBJS . SUBNODE) + ;; if at an arrow, LIST-OBJS is (list CHAR) + ;; if a sink, LIST-OBJS is (list SINK) and we get the regexp + ;; from association of SINK in sink-re-alist. + ;; + ;; populate `chars-node-list' in order, first arrows, then + ;; sinks, from the most nested to the most external (except + ;; the last, that we don't follow) + (chars-node-list (append (mapcar (lambda (arrow) + (cons (list (car arrow)) + (cdr arrow))) + source) + (mapcar (lambda (sink) + (cons (list sink) sink)) + (butlast sinks)))) + re-type) + ;; Follow paths until a sink is found. + ;; Then save this path to sink-chars-alist for this former sink. + (dolist (chars-node chars-node-list) + (let* ((chars (car chars-node)) + (node (cdr chars-node)) + (sinkp (not (characterp (car chars)))) + re-type-lastsink) + ;; allow start from an original sink + (while (or sinkp (not (memq node sinks))) + (setq sinkp nil) + ;; follow path + (if (= (length node) 1) + (setq chars (cons (caar node) chars) ; save char to chars + node (cdar node)) ; follow arrow + ;; means it is a source -- recursion + (setq re-type-lastsink + (regexp-opt-typed-regexp-source node)) + ;; save typed-regexp to chars + (push (cons (car re-type-lastsink) + (cadr re-type-lastsink)) + chars) + ;; node <- last-sink + (setq node (caddr re-type-lastsink)))) + ;; node is a sink, save chars with node as key + (push (nreverse chars) (alist-get node sink-chars-alist)))) + ;; + ;; now process each sink of this source in order + (dolist (sink sinks) + (let* ((list-chars (alist-get sink sink-chars-alist)) + (singles nil) + (compounds nil) + (question (member '(0) list-chars)) + re type + re-obj type-obj aux) + (when question + (setq list-chars (delete '(0) list-chars))) + ;; eliminate all epsilon arrows left from all chars + (setq list-chars (mapcar (lambda (chars) (delq 0 chars)) list-chars)) + ;; separate all alone chars in singles + (dolist (chars list-chars) + (setq re "" type 'empty) + ;; + ;; so far, chars only contains chars, sinks or typed-regexps + ;; (cdr is symbol) + (if (= (length chars) 1) + (cond ((characterp (car chars)) ; 1 char => singles + (push (car chars) singles)) + ((and (cdar chars) + (symbolp (cdar chars))) ; typed-regexp => compounds + (push (car chars) compounds)) + (t + ;; a sink, for sure a most nested, already processed + ;; the association is a typed-regexp + (push (alist-get (car chars) sink-re-alist) compounds))) + ;; + (dolist (obj chars) + (cond ((characterp obj) + (setq re-obj (regexp-quote (char-to-string obj)) + type-obj 'single)) + ;; typed-regexp + ((and (cdr obj) (symbolp (cdr obj))) + (setq re-obj (car obj) + type-obj (cdr obj))) + ;; obj=sink -> lookup in `sink-re-alist' + (t (setq aux (alist-get obj sink-re-alist) + re-obj (car aux) + type-obj (cdr aux)))) + (setq re (concat re re-obj) + type (cond ((eq type 'empty) type-obj) + ((eq type-obj 'empty) type) + (t 'sequence)))) + (push (cons re type) compounds))) + compounds + ;; process singles into compounds + (cond ((> (length singles) 1) + (push (cons (regexp-opt-charset singles) 'single) + compounds)) + ((= (length singles) 1) + (push (cons (regexp-quote (char-to-string (car singles))) + 'single) + compounds))) + ;; output according to compounds and question + (cond ((> (length compounds) 1) + (setq re (concat "\\(?:" (mapconcat #'car compounds "\\|") "\\)") + type 'group) + (when question (setq re (concat re "?") + type 'single))) + ;; + ((= (length compounds) 1) + (setq re (caar compounds) + type (cdar compounds)) + (cond ((and question (eq type 'single)) + (setq re (concat re "?"))) + (question (setq re (concat "\\(?:" re "\\)?") + type 'single)))) + ((= (length compounds) 0) ; this may happen with node (0) + (setq re "" ; because question becomes nonnil + type 'empty))) ; and compounds is empty + ;; save (RE . TYPE) to sink-re-alist + (setf (alist-get sink sink-re-alist) (cons re type)))) + ;; return (RE TYPE LAST-SINK) + (setq re-type (alist-get last-sink sink-re-alist)) + (list (car re-type) (cdr re-type) last-sink))) + +(defun regexp-opt-remove-source (source sources) + "Return SOURCES with SOURCE and all nested sources removed." + (dolist (branch (regexp-opt-branch source)) + ;; each branch = (SINK . NODES) + (dolist (node (cdr branch)) + (when (> (length node) 1) + (setq sources (regexp-opt-remove-source node sources))))) + (delq source sources)) + (defun regexp-opt-charset (chars) "Return a regexp to match a character in CHARS. CHARS should be a list of characters." ;; The basic idea is to find character ranges. Also we take care in the ;; position of character set meta characters in the character set regexp. ;; - (let* ((charmap (make-char-table 'regexp-opt-charset)) - (start -1) (end -2) - (charset "") - (bracket "") (dash "") (caret "")) + (let* ((charmap (make-char-table 'case-table)) + (start -1) (end -2) + (charset "") + (bracket "") (dash "") (caret "")) ;; ;; Make a character map but extract character set meta characters. (dolist (char chars) (cond ((eq char ?\]) - (setq bracket "]")) + (setq bracket "]")) ((eq char ?^) - (setq caret "^")) + (setq caret "^")) ((eq char ?-) - (setq dash "-")) + (setq dash "-")) (t - (aset charmap char t)))) + (aset charmap char t)))) ;; ;; Make a character set from the map using ranges where applicable. (map-char-table (lambda (c v) (when v - (if (consp c) - (if (= (1- (car c)) end) (setq end (cdr c)) - (if (> end (+ start 2)) - (setq charset (format "%s%c-%c" charset start end)) - (while (>= end start) - (setq charset (format "%s%c" charset start)) - (setq start (1+ start)))) - (setq start (car c) end (cdr c))) - (if (= (1- c) end) (setq end c) - (if (> end (+ start 2)) - (setq charset (format "%s%c-%c" charset start end)) - (while (>= end start) - (setq charset (format "%s%c" charset start)) - (setq start (1+ start)))) - (setq start c end c))))) + (if (consp c) + (if (= (1- (car c)) end) (setq end (cdr c)) + (if (> end (+ start 2)) + (setq charset (format "%s%c-%c" charset start end)) + (while (>= end start) + (setq charset (format "%s%c" charset start)) + (setq start (1+ start)))) + (setq start (car c) end (cdr c))) + (if (= (1- c) end) (setq end c) + (if (> end (+ start 2)) + (setq charset (format "%s%c-%c" charset start end)) + (while (>= end start) + (setq charset (format "%s%c" charset start)) + (setq start (1+ start)))) + (setq start c end c))))) charmap) (when (>= end start) (if (> end (+ start 2)) - (setq charset (format "%s%c-%c" charset start end)) - (while (>= end start) - (setq charset (format "%s%c" charset start)) - (setq start (1+ start))))) + (setq charset (format "%s%c-%c" charset start end)) + (while (>= end start) + (setq charset (format "%s%c" charset start)) + (setq start (1+ start))))) ;; ;; Make sure a caret is not first and a dash is first or last. (if (and (string-equal charset "") (string-equal bracket "")) - (if (string-equal dash "") + (if (string-equal dash "") "\\^" ; [^] is not a valid regexp (concat "[" dash caret "]")) (concat "[" bracket charset caret dash "]")))) +;; Due to a bug with nil as elements, that is fixed only in Emacs +;; 27.2, I reimplemented `seq-intersection', `seq-uniq' and +;; `seq-difference'. Later, if seq is loaded by default, we could +;; eliminate these functions, changing code accordingly (notice that +;; TESTFN is `eq' for intersection and uniq!) + +(defun regexp-opt-intersection (list1 list2) + "Return a list of the elements that appear in both LIST1 and LIST2. +Equality is defined by `eq'." + (let (intersection) + (dolist (x list1 (nreverse intersection)) + (when (memq x list2) (push x intersection))))) + +(defun regexp-opt-uniq (list) + "Return a list of the elements of LIST with duplicates removed. +Equality is defined by `eq'." + (let (result) + (dolist (el list (nreverse result)) + (unless (memq el result) (push el result))))) + +(defun regexp-opt-difference (list1 list2 &optional testfn) + "My version of seq-difference, for use with Emacs 26.1." + (let ((fn (if testfn testfn #'equal)) + diff s2) + (dolist (x list1 (nreverse diff)) + (setq s2 list2) + (while (and s2 (not (funcall fn x (car s2)))) + (setq s2 (cdr s2))) + (unless s2 + (push x diff))))) + +(defun regexp-opt-all-intersections (x rest) + "Return the list of all possible intersections. +Use (regexp-opt-all-intersections t REST) for the list of all +possible intersections of combinations of elements of REST with +at least 2 elements. + +For example, if A, B, C, D are lists, + (regexp-opt-all-intersections t (A B C D)) + -> (A∩B A∩B∩C A∩B∩C∩D A∩B∩D A∩C A∩C∩D A∩D B∩C B∩C∩D B∩D C∩D) +in this order, where X∩Y is computed comparing elements with `eq'. + +If ELT is a list, return the list of intersections of ELT with +all combinations of elements of REST. This feature is used for +recursion." + ;; Idea for implementation: + ;; + ;; called with ELT=A, A is a list -> all intersections that have A + ;; (regexp-opt-all-intersections A '(B C D)) returns + ;; (A∩B) + (regexp-opt-all-intersections A∩B '(C D)) + ;; + (A∩C) + (regexp-opt-all-intersections A∩C '(D)) + ;; + (A∩D) + ;; = (A∩B) + ;; + (A∩B∩C) + (regexp-opt-all-intersections A∩B∩C '(D)) + (A∩B∩D) + ;; + (A∩C) + (A∩C∩D) + (A∩D) + ;; = (A∩B A∩B∩C A∩B∩C∩D A∩B∩D A∩C A∩C∩D A∩D) + ;; + ;; Called with ELT not a list + ;; (regexp-opt-all-intersections t '(A B C D)) returns + ;; (regexp-opt-all-intersections A '(B C D)) + ;; + (regexp-opt-all-intersections B '(C D)) + ;; + (regexp-opt-all-intersections C '(D)) + ;; = (A∩B A∩B∩C A∩B∩C∩D A∩B∩D A∩C A∩C∩D A∩D) + ;; + (B∩C B∩C∩D B∩D) + ;; + (C∩D) + ;; = (A∩B A∩B∩C A∩B∩C∩D A∩B∩D A∩C A∩C∩D A∩D B∩C B∩C∩D B∩D C∩D) + ;; ... that is the set of all possible intersections of elements of + ;; '(A B C D) where A, B, C, D are lists + (if (listp x) + (mapcan (lambda (y) + (let ((x∩y (regexp-opt-intersection x y)) + (tail (cdr (memq y rest)))) + (if tail + (cons x∩y + (regexp-opt-all-intersections x∩y tail)) + (list x∩y)))) + rest) + (mapcan (lambda (l) + (let ((tail (cdr (memq l rest)))) + (when tail (regexp-opt-all-intersections l tail)))) + rest))) + +;; Debuging: regexp-opt-print-node prints NODE in human readable form. +;; If ou have graphviz installed, get a nice graph with +;; (regexp-opt-print-node node 'graphviz) +;; or just use +;; (regexp-opt-print-node node) +;; for a list of arrows to numbered nodes. + +;; (defun regexp-opt-print-node (node &optional driver ext tempfile-no-ext) +;; "Print structure of NODE in human readable form. +;; Optional second argument DRIVER should be `buffer' (default if +;; nil) or `graphviz'. If DRIVER is `buffer', insert a list of +;; arrows in current buffer. If DRIVER is `graphviz', pass node +;; structure to graphviz and display the output image in a buffer. +;; For graphviz driver, other arguments EXT and TEMPFILE-NO-EXT +;; shuold be strings where EXT (default to \"svg\") is the image +;; extension to be output by graphviz and TEMPFILE-NO-EXT (default +;; to \"re-temp\" is the filename without extension for saving and +;; output for graphviz." +;; (let* ((to-be-printed (list node)) +;; (graphviz (eq driver 'graphviz)) +;; (buf (when graphviz (get-buffer-create "*regexp-dot*"))) +;; (last-buffer (current-buffer)) +;; (compilation-ask-about-save nil) +;; printed-nodes nodes +;; from to char +;; from-num to-num +;; input output +;; compilation-going-on) +;; (unless ext (setq ext "png")) +;; (unless tempfile-no-ext (setq tempfile-no-ext "re-temp")) +;; (when graphviz +;; (setq input (concat tempfile-no-ext ".dot") +;; output (concat tempfile-no-ext "." ext)) +;; (set-buffer buf) +;; (delete-region (point-min) (point-max)) +;; (insert "digraph regexpfa {\nrankdir=LR;\nnode [shape = circle];")) +;; (while to-be-printed +;; (setq from (pop to-be-printed)) +;; (unless (memq from printed-nodes) +;; (push from printed-nodes) +;; (unless (memq from nodes) (push from nodes)) +;; (setq from-num (length (memq from nodes))) +;; (dolist (arrow from) +;; (setq to (cdr arrow) +;; char (if (= (car arrow) 0) ?ε (car arrow))) +;; (push to to-be-printed) +;; (unless (memq to nodes) (push to nodes)) +;; (setq to-num (length (memq to nodes))) +;; (cond ((eq driver 'graphviz) +;; (insert (format "\n%3d -> %3d [ label = %S ];" +;; from-num to-num (char-to-string char)))) +;; ((or (not driver) (eq driver 'buffer)) +;; (insert (format "\n%3d -- %c --> %3d" +;; from-num char to-num))))))) +;; (when graphviz +;; (insert "\n}\n") +;; ;; write to temp, compile and display +;; (write-file (concat tempfile-no-ext ".dot")) +;; (set-buffer last-buffer) +;; (kill-buffer buf) +;; (when (get-buffer "*preview-re*") +;; (kill-buffer "*preview-re*")) +;; (setq compilation-going-on t) +;; (add-hook 'compilation-finish-functions +;; (lambda (a b) (ignore a b) (setq compilation-going-on nil))) +;; (compile (concat "dot -T" ext " \"" input "\" \"-o" output "\"")) +;; (while compilation-going-on (sleep-for 0.1)) +;; (when (file-exists-p output) +;; (auto-image-file-mode 1) +;; (set-buffer (find-file-noselect output)) +;; (rename-buffer "*preview-re*") +;; (sleep-for 0.1) +;; (display-buffer (get-buffer "*preview-re*")))))) + (provide 'regexp-opt) ;;; regexp-opt.el ends here