[Top][All Lists]

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: peg.el --- Parsing Expression Grammars in Emacs Lisp

From: Helmut Eller
Subject: Re: peg.el --- Parsing Expression Grammars in Emacs Lisp
Date: Thu, 27 Nov 2008 10:13:59 +0100
User-agent: Gnus/5.13 (Gnus v5.13) Emacs/23.0.60 (gnu/linux)

New in this version:

 * fix a few bugs, most notably *-expressions didn't backtrack properly.

 * a more convenient syntax for character sets, e.g. [A-Z "-]"] now
   means all charecters from A to Z, minus, and right brackets.

 * more realistic examples

 * minimal test suite


;;; peg.el --- Parsing Expression Grammars in Emacs Lisp
;; Copyright 2008  Helmut Eller <address@hidden>.
;; This file is licensed under the terms of the GNU General Public
;; License as distributed with Emacs (press C-h C-c for details).
;;; Commentary:
;; Parsing Expression Grammars (PEG) are a formalism in the spirit of
;; Context Free Grammars (CFG) with some simplifications which makes
;; the implementation of PEGs as top-down parser particularly simple
;; and easy to understand [**].
;; This file implements a macro `peg-parse' which parses the current
;; buffer according to a PEG.  E.g. we can match integers with a PEG
;; like this:
;;  (peg-parse (number   sign digit (* digit))
;;             (sign     (or "+" "-" ""))
;;             (digit    '"09")))
;; In contrast to regexps, PEGs allow us to define recursive rules.  A
;; PEG is a list of rules.  A rule is written as (NAME . PE).
;; E.g. (sign (or "+" "-" "")) is a rule with the name "sign".  For
;; convenience, PE is implicitly wrapped in a and. The syntax for
;; Parsing Expression (PE) is a follows:
;; Description          Lisp            Haskell as in [*]
;; Sequence             (and e1 e2)     e1 e2
;; Prioritized Choice   (or e1 e2)      e1 / e2
;; Not-predicate        (not e)         !e
;; And-predicate        (if e)          &e
;; Any character        (any)           .
;; Literal string       "abc"           "abc"
;; Character C          (char c)        'c'
;; Zero-or-more         (* e)           e*
;; One-or-more          (+ e)           e+
;; Optional             (opt e)         e?
;; Character range      (range a b)     [a-b]
;; Character set        [a-b "+*" ?x]   [a-b+*x]
;; `peg-parse' also supports parsing actions, i.e. Lisp snippets which
;; are executed when a PE matches.  This can be used to construct
;; syntax trees or for similar tasks.  Actions are written as
;;  (action FORM)          ; evaluate FORM
;;  `(VAR... -- FORM...)   ; stack action
;; Actions don't consume input, but are executed at the point of
;; match.  A "stack action" takes VARs from the "value stack" and
;; pushes the result of evaluating FORMs to that stack.  See
;; `peg-ex-parse-int' for an example.
;; References:
;; [*] Bryan Ford. Parsing Expression Grammars: a Recognition-Based
;; Syntactic Foundation. In POPL'04: Proceedings of the 31st ACM
;; SIGPLAN-SIGACT symposium on Principles of Programming Languages,
;; pages 111-122, New York, NY, USA, 2004. ACM Press.
;; [**] Baker, Henry G. "Pragmatic Parsing in Common Lisp".  ACM Lisp
;; Pointers 4(2), April--June 1991, pp. 3--15.

;;; Code:

(defmacro peg-parse (&rest rules)
  "Match RULES at point.
Return (T STACK) if the match succeed and nil on failure."
  (peg-translate-rules rules))

(defmacro peg-parse-exp (exp)
  "Match the parsing expression EXP at point.
Note: a PE can't \"call\" rules by name."
  `(let ((peg-thunks nil))
     (when ,(peg-translate-exp (peg-normalize exp))
       (peg-postprocess peg-thunks))))

;; A table of the PEG rules.  Used during compilation to resolve
;; references to named rules.
(defvar peg-rules)

;; used at runtime for backtracking.  It's a list ((POS . THUNK)...).
;; Each THUNK is executed at the corresponding POS.  Thunks are
;; executed in a postprocessing step, not during parsing.
(defvar peg-thunks)

;; The basic idea is to translate each rule to a lisp function.
;; The result looks like
;;   (let ((rule1 (lambda () code-for-rule1))
;;         ...
;;         (ruleN (lambda () code-for-ruleN)))
;;     (funcall rule1))
;; code-for-ruleX returns t if the rule matches and nil otherwise.
(defun peg-translate-rules (rules)
  "Translate the PEG RULES, to a top-down parser."
  (let ((peg-rules (make-hash-table :size 20)))
    (dolist (rule rules)
      (puthash (car rule) 'defer peg-rules))
    (dolist (rule rules)
      (puthash (car rule) (peg-normalize `(and . ,(cdr rule))) peg-rules))
    (peg-check-cycles peg-rules)
    `(let ((peg-thunks '()))
       (let ,(mapcar (lambda (rule)
                       (let ((name (car rule)))
                           (lambda () 
                             ,(peg-translate-exp (gethash name peg-rules))))))
         (when (funcall ,(car (car rules)))
           (peg-postprocess peg-thunks))))))

(defun peg-method-table-name (method-name)
  (intern (format "peg-%s-methods" method-name)))

(defmacro peg-define-method-table (name)
  (let ((tab (peg-method-table-name name)))
     (defvar ,tab)
     (setq ,tab (make-hash-table :size 20)))))

(defmacro peg-add-method (method type args &rest body)
  (declare (indent 3))
  `(puthash ',type (lambda ,args . ,body) ,(peg-method-table-name method)))

(peg-define-method-table normalize)

;; Internally we use a regularized syntax, e.g. we only have binary OR
;; nodes.  Regularized nodes are lists of the form (OP ARGS...).
(defun peg-normalize (exp)
  "Return a \"normalized\" form of EXP."
  (cond ((and (consp exp)
              (let ((fun (gethash (car exp) peg-normalize-methods)))
                (and fun 
                     (apply fun (cdr exp))))))
        ((stringp exp)
         (let ((len (length exp)))
           (cond ((zerop len) '(null))
                 ((= len 1) `(char ,(aref exp 0)))
                 (t `(str ,exp)))))
        ((and (symbolp exp) exp)
         (when (not (gethash exp peg-rules))
           (error "Reference to undefined PEG rule: %S" exp))
         `(call ,exp))
        ((vectorp exp)
         (peg-normalize `(set . ,(append exp '()))))
         (error "Invalid parsing expression: %S" exp))))

(defvar peg-leaf-types '(null fail any call action char range str eob

(dolist (type peg-leaf-types)
  (puthash type `(lambda (&rest args) (cons ',type args)) 

(peg-add-method normalize or (&rest args)
  (cond ((null args) '(fail))
        ((null (cdr args)) (peg-normalize (car args)))
        (t `(or ,(peg-normalize (car args)) 
                ,(peg-normalize `(or . ,(cdr args)))))))

(peg-add-method normalize and (&rest args)
  (cond ((null args) '(null))
        ((null (cdr args)) (peg-normalize (car args)))
        (t `(and ,(peg-normalize (car args)) 
                 ,(peg-normalize `(and . ,(cdr args)))))))

(peg-add-method normalize * (&rest args)
  `(* ,(peg-normalize `(and . ,args))))

(peg-add-method normalize + (&rest args)
  (let ((e (peg-normalize `(and . ,args))))
    `(and ,e (* ,e))))

(peg-add-method normalize opt (&rest args)
  (let ((e (peg-normalize `(and . ,args))))
    `(or ,e (null))))

(peg-add-method normalize if (&rest args)
  `(if ,(peg-normalize `(and . ,args))))

(peg-add-method normalize not (&rest args)
  `(not ,(peg-normalize `(and . ,args))))

(peg-add-method normalize \` (form)
  (unless (member '-- form)
    (error "Malformed stack action: %S" form))
  (let ((args (cdr (member '-- (reverse form))))
        (values (cdr (member '-- form))))
    (let ((form `(let ,(mapcar (lambda (var) `(,var (pop peg-stack)))
                   (setq peg-stack 
                         (append (list . ,values) peg-stack)))))
      `(action ,form))))

(peg-add-method normalize set (&rest specs)
  (cond ((null specs) '(fail))
        ((and (null (cdr specs))
              (let ((range (peg-range-designator (car specs))))
                (and range `(range . ,range)))))
         (let ((args '()))
           (while specs
             (let* ((spec (pop specs))
                    (range (peg-range-designator spec)))
               (cond (range
                      (push (cons (car range) (cadr range)) args))
                     ((or (and (symbolp spec) (eq spec nil))
                          (characterp spec))
                      (push spec args))
                     ((stringp spec)
                      (setq args (append spec args)))
                     (t (error "Invalid set specifier: %S" spec)))))
           `(set ,(rx-to-string `(any . ,args) t))))))

(peg-add-method normalize *list (&rest args)
   `(and `(-- ())
         (* ,@args `(l e -- (cons e l)))
         `(l -- (nreverse l)))))

(peg-add-method normalize substring (&rest args)
   `(and `(-- (point))
         `(start -- 
                 (buffer-substring-no-properties start (point))))))

(defun peg-range-designator (x)
  (and (symbolp x)
       (let ((str (symbol-name x)))
         (and (= (length str) 3)
              (eq (aref str 1) ?-)
              (list (aref str 0) (aref str 2))))))

(peg-add-method normalize quote (form)
  (error "quote is reverved for future use"))

(peg-define-method-table translate)

;; This is the main translation function.
(defun peg-translate-exp (exp)
  "Return the ELisp code to match the PE EXP."
  (let ((translator (or (gethash (car exp) peg-translate-methods)
                        (error "No translator for: %S" (car exp)))))
    (apply translator (cdr exp))))

(peg-add-method translate and (e1 e2)
  `(and ,(peg-translate-exp e1)
        ,(peg-translate-exp e2)))

(peg-add-method translate or (e1 e2)
  (let ((cp (peg-make-choicepoint)))
    `(,@(peg-save-choicepoint cp)
      (or ,(peg-translate-exp e1)
          (,@(peg-restore-choicepoint cp)
           ,(peg-translate-exp e2))))))

;; Choicepoints are used for backtracking.  At a choicepoint we save
;; enough state, so that we can continue from there if needed.
(defun peg-make-choicepoint ()
  (cons (make-symbol "point") (make-symbol "thunks")))

(defun peg-save-choicepoint (choicepoint)
  `(let ((,(car choicepoint) (point))
         (,(cdr choicepoint) peg-thunks))))

(defun peg-restore-choicepoint (choicepoint)
     (goto-char ,(car choicepoint))
     (setq peg-thunks ,(cdr choicepoint))))

;; match empty strings
(peg-add-method translate null ()

;; match nothing
(peg-add-method translate fail ()

(peg-add-method translate eob ()

(peg-add-method translate * (e)
  (let ((cp (peg-make-choicepoint)))
    `(progn (while (,@(peg-save-choicepoint cp)
                    (cond (,(peg-translate-exp e))
                          (t ,(peg-restore-choicepoint cp)

(peg-add-method translate if (e)
  (let ((cp (peg-make-choicepoint)))
    `(,@(peg-save-choicepoint cp)
      (when ,(peg-translate-exp e)
        ,(peg-restore-choicepoint cp)

(peg-add-method translate not (e)
  (let ((cp (peg-make-choicepoint)))
    `(,@(peg-save-choicepoint cp)
      (when (not ,(peg-translate-exp e))
        ,(peg-restore-choicepoint cp)

(peg-add-method translate any ()
  '(when (not (eobp))

(peg-add-method translate char (c)
  `(when (eq (char-after) ',c)

(peg-add-method translate set (charset-regexp)
  `(when (looking-at ',charset-regexp)

(peg-add-method translate range (from to)
  `(when (and (<= ',from (char-after))
              (<= (char-after) ',to))

(peg-add-method translate str (str)
  `(when (looking-at ',(regexp-quote str))
     (goto-char (match-end 0))

(peg-add-method translate call (name)
  (or (gethash name peg-rules) 
      (error "Reference to unknown rule: %S" name))
  `(funcall ,name))

(peg-add-method translate action (form)
     (push (cons (point) (lambda () ,form)) peg-thunks)

(defvar peg-stack)
(defun peg-postprocess (thunks)
  "Execute \"actions\"."
  (let  ((peg-stack '()))
    (dolist (thunk (reverse thunks))
      (goto-char (car thunk))
      (funcall (cdr thunk)))
    (list t peg-stack)))

;; Left recursion is presumably a common mistate when using PEGs.
;; Here we try to detect such mistakes.  Essentailly we traverse the
;; graph as long as we can without consuming input.  When we find a
;; recursive call we signal an error.

(defun peg-check-cycles (peg-rules)
  (maphash (lambda (name exp)
             (peg-detect-cycles exp (list name))
             (dolist (node (peg-find-star-nodes exp))
               (peg-detect-cycles node '())))

(defun peg-find-star-nodes (exp)
  (let ((type (car exp)))
    (cond ((memq type peg-leaf-types) '())
          (t (let ((kids (apply #'append 
                                (mapcar #'peg-find-star-nodes (cdr exp)))))
               (if (eq type '*)
                   (cons exp kids)

(peg-define-method-table detect-cycles)

(defun peg-detect-cycles (exp path)
  "Signal an error on a cycle.
Otherwise traverse EXP recursively and return T if EXP can match
without consuming input.  Return nil if EXP definetly consumes
input.  PATH is the list of rules that we have visited so far."
  (apply (or (gethash (car exp) peg-detect-cycles-methods) 
             (error "No detect-cycle method for: %S" exp))
         path (cdr exp)))

(peg-add-method detect-cycles call (path name)
  (cond ((member name path)
         (error "Possible left recursion: %s"
                (mapconcat (lambda (x) (format "%s" x))
                           (reverse (cons name path)) " -> ")))
         (peg-detect-cycles (gethash name peg-rules) (cons name path)))))

(peg-add-method detect-cycles and (path e1 e2)
  (and (peg-detect-cycles e1 path)
       (peg-detect-cycles e2 path)))

(peg-add-method detect-cycles or (path e1 e2)
  (or (peg-detect-cycles e1 path)
      (peg-detect-cycles e2 path)))

(peg-add-method detect-cycles * (path e) 
  (when (peg-detect-cycles e path)
    (error "Infinite *-loop: %S matches empty string" e))

(peg-add-method detect-cycles if  (path e) (peg-unary-nullable e path))
(peg-add-method detect-cycles not (path e) (peg-unary-nullable e path))

(defun peg-unary-nullable (exp path)
  (peg-detect-cycles exp path)

(peg-add-method detect-cycles any   (path)       nil)
(peg-add-method detect-cycles char  (path c)     nil)
(peg-add-method detect-cycles set   (path s)     nil)
(peg-add-method detect-cycles range (path c1 c2) nil)
(peg-add-method detect-cycles str   (path s)     (equal s ""))
(peg-add-method detect-cycles null  (path)       t)
(peg-add-method detect-cycles fail  (path)       nil)
(peg-add-method detect-cycles eob   (path)       t)
(peg-add-method detect-cycles action (path form) t)

;;; Tests:

(defmacro peg-parse-string (rules string)
     (insert ,string)
     (goto-char (point-min))
     (peg-parse . ,rules)))

(defun peg-test ()
  (assert (peg-parse-string ((s "a")) "a"))
  (assert (not (peg-parse-string ((s "a")) "b")))
  (assert (peg-parse-string ((s (not "a"))) "b"))
  (assert (not (peg-parse-string ((s (not "a"))) "a")))
  (assert (peg-parse-string ((s (if "a"))) "a"))
  (assert (not (peg-parse-string ((s (if "a"))) "b")))
  (assert (peg-parse-string ((s "ab")) "ab"))
  (assert (not (peg-parse-string ((s "ab")) "ba")))
  (assert (not (peg-parse-string ((s "ab")) "a")))
  (assert (peg-parse-string ((s (range ?0 ?9))) "0"))
  (assert (not (peg-parse-string ((s (range ?0 ?9))) "a")))
  (assert (peg-parse-string ((s [0-9])) "0"))
  (assert (peg-parse-string ((s (any))) "0"))
  (assert (not (peg-parse-string ((s (any))) "")))
  (assert (peg-parse-string ((s (eob))) ""))
  (assert (peg-parse-string ((s (not (eob)))) "a"))
  (assert (peg-parse-string ((s (or "a" "b"))) "a"))
  (assert (peg-parse-string ((s (or "a" "b"))) "b"))
  (assert (not (peg-parse-string ((s (or "a" "b"))) "c")))
  (assert (peg-parse-string ((s (and "a" "b"))) "ab"))
  (assert (peg-parse-string ((s (and "a" "b"))) "abc"))
  (assert (not (peg-parse-string ((s (and "a" "b"))) "ba")))
  (assert (peg-parse-string ((s (and "a" "b" "c"))) "abc"))
  (assert (peg-parse-string ((s (* "a") "b" (eob))) "ab"))
  (assert (not (peg-parse-string ((s (* "a") "b" (eob))) "abc")))
  (assert (peg-parse-string ((s "")) "abc"))
  (assert (peg-parse-string ((s "" (eob))) ""))
  (assert (peg-parse-string ((s (opt "a") "b")) "abc"))
  (assert (peg-parse-string ((s (opt "a") "b")) "bc"))
  (assert (not (peg-parse-string ((s (or))) "ab")))
  (assert (peg-parse-string ((s (and))) "ab")))
(when (featurep 'cl) 

;;; Examples:

;; peg-ex-recognize-int recognizes integers.  An integer begins with a
;; optional sign, then follows one or more digits.  Digits are all
;; characters from 0 to 9.
;; Notes: 
;; 1) "" matches the empty sequence, i.e. matches without
;; consuming input.  
;; 2) [0-9] is the character range from 0 to 9.  This can also be
;; written as (range ?0 ?9). [Note that 0-9 is a symbol.]
(defun peg-ex-recognize-int ()
  (peg-parse (number   sign digit (* digit))
             (sign     (or "+" "-" ""))
             (digit    [0-9])))

;; peg-ex-parse-int recognizes integers and computes the corresponding
;; value.  The grammer is the same as for `peg-ex-recognize-int' added
;; with parsing actions.  Unfortunaletly, the actions add quite a bit
;; of clutter.
;; The action for the sign rule pushes t on the stack for a minus sign
;; and nil for plus or no sign.
;; The action for the digit rule pushes the value for a single digit.
;; The action `(a b -- (+ (* a 10) b)), takes two items from the stack
;; and pushes the first digit times 10 added to second digit.
;; The action `(minus val -- (if minus (- val) val)), negates the
;; value if the minus flag is true.
(defun peg-ex-parse-int ()
  (peg-parse (number sign
                     (* digit `(a b -- (+ (* a 10) b)))
                     `(minus val -- (if minus (- val) val)))
             (sign (or (and "+" `(-- nil))
                       (and "-" `(-- t))
                       (and "" `(-- nil))))
             (digit [0-9] `(-- (- (char-before) ?0)))))

;; Put point after the ) and press C-x C-e
;; (peg-ex-parse-int)-234234

;; Parse arithmetic expressions and compute the result as side effect.
(defun peg-ex-arith ()
   (expr (or (and ws sum eol)
             (and (* (not eol) (any)) eol error)))
   (sum product (* (or (and plus product `(a b -- (+ a b)))
                       (and minus product `(a b -- (- a b))))))
   (product value (* (or (and times value `(a b -- (* a b)))
                         (and divide value `(a b -- (/ a b))))))
   (value (or (and (substring number) `(string -- (string-to-number string)))
              (and open sum close)))
   (number (+ [0-9]) ws)
   (plus "+" ws)
   (minus "-" ws)
   (times "*" ws)
   (divide "/" ws)
   (open "(" ws)
   (close ")" ws)
   (ws (* (or " " "\t")))
   (eol (or "\n" "\r\n" "\r"))
   (error (action (error "Parse error at: %s" (point))))))

;; (peg-ex-arith)   1 + 2 * 3 * (4 + 5)

;; Parse URI according to RFC 2396.
(defun peg-ex-uri ()
   (URI-reference (or absoluteURI relativeURI) 
                  (or (and "#" (substring fragment))
                      `(-- nil)))
   (absoluteURI (substring scheme) ":" (or hier-part opaque-part)
                `(scheme user host port path query -- (list :scheme scheme 
                                                            :user user
                                                            :host host
                                                            :port port
                                                            :path path
                                                            :query query)))
   (hier-part (or net-path abs-path) 
              (or (and "?" (substring query))
                  `(-- nil)))
   (net-path "//" authority (or abs-path `(-- nil)))
   (abs-path "/" path-segments)
   (path-segments segment (*list "/" segment) `(s l -- (cons s l)))
   (segment (substring (* pchar) (* ";" param)))
   (param (* pchar))
   (pchar (or unreserved escaped [":@&=+$,"]))
   (query (* uric))
   (fragment (* uric))
   (relativeURI (or net-path abs-path rel-path) (opt "?" query))
   (rel-path rel-segment (opt abs-path))
   (rel-segment (+ unreserved escaped [";@&=+$,"]))
   (authority (or server reg-name))
   (server (or (and (or (and (substring userinfo) "@")
                        `(-- nil))
               `(-- nil nil nil)))
   (userinfo (* (or unreserved escaped [";:&=+$,"])))
   (hostport (substring host) (or (and ":" (substring port))
                                  `(-- nil)))
   (host (or hostname ipv4address))
   (hostname (* domainlabel ".") toplabel (opt "."))
   (domainlabel alphanum
                (opt (* (or alphanum "-") (if alphanum))
   (toplabel alpha
             (* (or alphanum "-") (if alphanum))
   (ipv4address (+ digit) "." (+ digit) "." (+ digit) "." (+ digit))
   (port (* digit))
   (scheme alpha (* (or alpha digit ["+-."])))
   (reg-name (or unreserved escaped ["$,;:@&=+"]))
   (opaque-part uric-no-slash (* uric))
   (uric (or reserved unreserved escaped))
   (uric-no-slash (or unreserved escaped [";?:@&=+$,"]))
   (reserved (set ";/?:@&=+$,"))
   (unreserved (or alphanum mark))
   (escaped "%" hex hex)
   (hex (or digit [A-F] [a-f]))
   (mark (set "-_.!~*'()"))
   (alphanum (or alpha digit))
   (alpha (or lowalpha upalpha))
   (lowalpha [a-z])
   (upalpha [A-Z])
   (digit [0-9])))

;; (peg-ex-uri)file:/bar/baz.html?foo=df
;; (peg-ex-uri)http://address@hidden:8080/bar/baz.html?x=1#foo

;; Parse a lisp style Sexp.
;; [To keep the example short, ' and . are handled as ordinary symbol.]
(defun peg-ex-lisp ()
   (sexp (* (or blank comment)) (or string list number symbol))
   (ws (* blank))
   (blank [" \n\t"])
   (comment ";" (* (not (or "\n" (eob))) (any)))
   (string "\"" (substring  (* (not "\"") (any))) "\"")
   (number (substring (opt (set "+-")) (+ digit))
           (if terminating)
           `(string -- (string-to-number string)))
   (symbol (substring (and symchar (* (not terminating) symchar)))
           `(s -- (intern s)))
   (symchar [a-z A-Z 0-9 "-;!#%&'*+,./:;<=>address@hidden|}~"])
   (list "("            `(-- (cons nil nil)) `(hd -- hd hd)
         (* sexp        `(tl e -- (setcdr tl (list e)))
            ) ws ")"    `(hd tl -- (cdr hd)))
   (digit [0-9])
   (terminating (or (set " \n\t();\"'") (eob)))))

;; (peg-ex-lisp)
;; We try to detect left recursion and report it as error.
(defun peg-ex-left-recursion ()
  (eval '(peg-parse (exp (or term
                             (and exp "+" exp)))
                    (term (or digit
                              (and term "*" term)))
                    (digit [0-9]))))

(defun peg-ex-infinite-loop ()
  (eval '(peg-parse (exp (* (or "x"
                                (action (foo))))))))

(provide 'peg)

;;; peg.el ends here

reply via email to

[Prev in Thread] Current Thread [Next in Thread]