[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
bug#73188: [PATCH v2] PEG: Add full support for PEG + some extensions
From: |
Ekaitz Zarraga |
Subject: |
bug#73188: [PATCH v2] PEG: Add full support for PEG + some extensions |
Date: |
Thu, 12 Sep 2024 22:57:22 +0200 |
This commit adds support for PEG as described in:
<https://bford.info/pub/lang/peg.pdf>
It adds support for the missing features (comments, underscores in
identifiers and escaping) while keeping the extensions (dashes in
identifiers, < and <--).
The naming system tries to be as close as possible to the one proposed
in the paper.
* module/ice-9/peg/string-peg.scm: Rewrite PEG parser.
* test-suite/tests/peg.test: Fix import
---
Previus version did not make proper character and escaping control.
module/ice-9/peg/string-peg.scm | 438 ++++++++++++++++++++------------
test-suite/tests/peg.test | 32 ++-
2 files changed, 292 insertions(+), 178 deletions(-)
diff --git a/module/ice-9/peg/string-peg.scm b/module/ice-9/peg/string-peg.scm
index 45ed14bb1..05755693a 100644
--- a/module/ice-9/peg/string-peg.scm
+++ b/module/ice-9/peg/string-peg.scm
@@ -1,6 +1,6 @@
;;;; string-peg.scm --- representing PEG grammars as strings
;;;;
-;;;; Copyright (C) 2010, 2011 Free Software Foundation, Inc.
+;;;; Copyright (C) 2010, 2011, 2023 Free Software Foundation, Inc.
;;;;
;;;; This library is free software; you can redistribute it and/or
;;;; modify it under the terms of the GNU Lesser General Public
@@ -22,6 +22,7 @@
define-peg-string-patterns
peg-grammar)
#:use-module (ice-9 peg using-parsers)
+ #:use-module (srfi srfi-1)
#:use-module (ice-9 peg codegen)
#:use-module (ice-9 peg simplify-tree))
@@ -38,22 +39,57 @@
;; Grammar for PEGs in PEG grammar.
(define peg-as-peg
-"grammar <-- (nonterminal ('<--' / '<-' / '<') sp pattern)+
-pattern <-- alternative (SLASH sp alternative)*
-alternative <-- ([!&]? sp suffix)+
-suffix <-- primary ([*+?] sp)*
-primary <-- '(' sp pattern ')' sp / '.' sp / literal / charclass / nonterminal
!'<'
-literal <-- ['] (!['] .)* ['] sp
-charclass <-- LB (!']' (CCrange / CCsingle))* RB sp
-CCrange <-- . '-' .
-CCsingle <-- .
-nonterminal <-- [a-zA-Z0-9-]+ sp
-sp < [ \t\n]*
-SLASH < '/'
-LB < '['
-RB < ']'
+"# Hierarchical syntax
+Grammar <-- Spacing Definition+ EndOfFile
+Definition <-- Identifier LEFTARROW Expression
+
+Expression <-- Sequence (SLASH Sequence)*
+Sequence <-- Prefix*
+Prefix <-- (AND / NOT)? Suffix
+Suffix <-- Primary (QUESTION / STAR / PLUS)?
+Primary <-- Identifier !LEFTARROW
+ / OPEN Expression CLOSE
+ / Literal / Class / DOT
+
+# Lexical syntax
+Identifier <-- IdentStart IdentCont* Spacing
+# NOTE: `-` is an extension
+IdentStart <- [a-zA-Z_] / '-'
+IdentCont <- IdentStart / [0-9]
+
+Literal <-- SQUOTE (!SQUOTE Char)* SQUOTE Spacing
+ / DQUOTE (!DQUOTE Char)* DQUOTE Spacing
+Class <-- '[' (!']' Range)* ']' Spacing
+Range <-- Char '-' Char / Char
+Char <-- '\\\\' [nrt'\"\\[\\]\\\\]
+ / '\\\\' [0-7][0-7][0-7]
+ / '\\\\' [0-7][0-7]?
+ / !'\\\\' .
+
+# NOTE: `<--` and `<` are extensions
+LEFTARROW <- ('<--' / '<-' / '<') Spacing
+SQUOTE <-- [']
+DQUOTE <-- [\"]
+OPENBRACKET < '['
+CLOSEBRACKET < ']'
+SLASH < '/' Spacing
+AND <-- '&' Spacing
+NOT <-- '!' Spacing
+QUESTION <-- '?' Spacing
+STAR <-- '*' Spacing
+PLUS <-- '+' Spacing
+OPEN < '(' Spacing
+CLOSE < ')' Spacing
+DOT <-- '.' Spacing
+
+Spacing < (Space / Comment)*
+Comment < '#' (!EndOfLine .)* EndOfLine
+Space < ' ' / '\t' / EndOfLine
+EndOfLine < '\\r\\n' / '\\n' / '\\r'
+EndOfFile < !.
")
+
(define-syntax define-sexp-parser
(lambda (x)
(syntax-case x ()
@@ -63,35 +99,78 @@ RB < ']'
(syn (wrap-parser-for-users x matchf accumsym #'sym)))
#`(define sym #,syn))))))
-(define-sexp-parser peg-grammar all
- (+ (and peg-nonterminal (or "<--" "<-" "<") peg-sp peg-pattern)))
-(define-sexp-parser peg-pattern all
- (and peg-alternative
- (* (and (ignore "/") peg-sp peg-alternative))))
-(define-sexp-parser peg-alternative all
- (+ (and (? (or "!" "&")) peg-sp peg-suffix)))
-(define-sexp-parser peg-suffix all
- (and peg-primary (* (and (or "*" "+" "?") peg-sp))))
-(define-sexp-parser peg-primary all
- (or (and "(" peg-sp peg-pattern ")" peg-sp)
- (and "." peg-sp)
- peg-literal
- peg-charclass
- (and peg-nonterminal (not-followed-by "<"))))
-(define-sexp-parser peg-literal all
- (and "'" (* (and (not-followed-by "'") peg-any)) "'" peg-sp))
-(define-sexp-parser peg-charclass all
- (and (ignore "[")
- (* (and (not-followed-by "]")
- (or charclass-range charclass-single)))
- (ignore "]")
- peg-sp))
-(define-sexp-parser charclass-range all (and peg-any "-" peg-any))
-(define-sexp-parser charclass-single all peg-any)
-(define-sexp-parser peg-nonterminal all
- (and (+ (or (range #\a #\z) (range #\A #\Z) (range #\0 #\9) "-")) peg-sp))
-(define-sexp-parser peg-sp none
- (* (or " " "\t" "\n")))
+(define-sexp-parser Grammar all
+ (and Spacing (+ Definition) EndOfFile))
+(define-sexp-parser Definition all
+ (and Identifier LEFTARROW Expression))
+(define-sexp-parser Expression all
+ (and Sequence (* (and SLASH Sequence))))
+(define-sexp-parser Sequence all
+ (* Prefix))
+(define-sexp-parser Prefix all
+ (and (? (or AND NOT)) Suffix))
+(define-sexp-parser Suffix all
+ (and Primary (? (or QUESTION STAR PLUS))))
+(define-sexp-parser Primary all
+ (or (and Identifier (not-followed-by LEFTARROW))
+ (and OPEN Expression CLOSE)
+ Literal
+ Class
+ DOT))
+(define-sexp-parser Identifier all
+ (and IdentStart (* IdentCont) Spacing))
+(define-sexp-parser IdentStart body
+ (or (range #\a #\z) (range #\A #\Z) "_" "-")) ; NOTE: - is an extension
+(define-sexp-parser IdentCont body
+ (or IdentStart (range #\0 #\9)))
+(define-sexp-parser Literal all
+ (or (and SQUOTE (* (and (not-followed-by SQUOTE) Char)) SQUOTE Spacing)
+ (and DQUOTE (* (and (not-followed-by DQUOTE) Char)) DQUOTE Spacing)))
+(define-sexp-parser Class all
+ (and OPENBRACKET (* (and (not-followed-by CLOSEBRACKET) Range)) CLOSEBRACKET
Spacing))
+(define-sexp-parser Range all
+ (or (and Char DASH Char) Char))
+(define-sexp-parser Char all
+ (or (and "\\" (or "n" "r" "t" "'" "[" "]" "\\"))
+ (and "\\" (range #\0 #\7) (range #\0 #\7) (range #\0 #\7))
+ (and "\\" (range #\0 #\7) (? (range #\0 #\7)))
+ (and (not-followed-by "\\") peg-any)))
+(define-sexp-parser LEFTARROW body
+ (and (or "<--" "<-" "<") Spacing)) ; NOTE: <-- and < are extensions
+(define-sexp-parser SLASH none
+ (and "/" Spacing))
+(define-sexp-parser AND all
+ (and "&" Spacing))
+(define-sexp-parser NOT all
+ (and "!" Spacing))
+(define-sexp-parser QUESTION all
+ (and "?" Spacing))
+(define-sexp-parser STAR all
+ (and "*" Spacing))
+(define-sexp-parser PLUS all
+ (and "+" Spacing))
+(define-sexp-parser OPEN none
+ (and "(" Spacing))
+(define-sexp-parser CLOSE none
+ (and ")" Spacing))
+(define-sexp-parser DOT all
+ (and "." Spacing))
+(define-sexp-parser SQUOTE none "'")
+(define-sexp-parser DQUOTE none "\"")
+(define-sexp-parser OPENBRACKET none "[")
+(define-sexp-parser CLOSEBRACKET none "]")
+(define-sexp-parser DASH none "-")
+(define-sexp-parser Spacing none
+ (* (or Space Comment)))
+(define-sexp-parser Comment none
+ (and "#" (* (and (not-followed-by EndOfLine) peg-any)) EndOfLine))
+(define-sexp-parser Space none
+ (or " " "\t" EndOfLine))
+(define-sexp-parser EndOfLine none
+ (or "\r\n" "\n" "\r"))
+(define-sexp-parser EndOfFile none
+ (not-followed-by peg-any))
+
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;; PARSE STRING PEGS
@@ -101,7 +180,7 @@ RB < ']'
;; will define all of the nonterminals in the grammar with equivalent
;; PEG s-expressions.
(define (peg-parser str for-syntax)
- (let ((parsed (match-pattern peg-grammar str)))
+ (let ((parsed (match-pattern Grammar str)))
(if (not parsed)
(begin
;; (display "Invalid PEG grammar!\n")
@@ -110,132 +189,160 @@ RB < ']'
(cond
((or (not (list? lst)) (null? lst))
lst)
- ((eq? (car lst) 'peg-grammar)
- #`(begin
- #,@(map (lambda (x) (peg-nonterm->defn x for-syntax))
- (context-flatten (lambda (lst) (<= (depth lst) 2))
- (cdr lst))))))))))
+ ((eq? (car lst) 'Grammar)
+ (Grammar->defn lst for-syntax)))))))
-;; Macro wrapper for PEG-PARSER. Parses PEG grammars expressed as strings and
-;; defines all the appropriate nonterminals.
-(define-syntax define-peg-string-patterns
- (lambda (x)
- (syntax-case x ()
- ((_ str)
- (peg-parser (syntax->datum #'str) x)))))
+;; (Grammar (Definition ...) (Definition ...))
+(define (Grammar->defn lst for-syntax)
+ #`(begin
+ #,@(map (lambda (x) (Definition->defn x for-syntax))
+ (context-flatten (lambda (lst) (<= (depth lst) 1))
+ (cdr lst)))))
-;; lst has format (nonterm grabber pattern), where
-;; nonterm is a symbol (the name of the nonterminal),
-;; grabber is a string (either "<", "<-" or "<--"), and
-;; pattern is the parse of a PEG pattern expressed as as string.
-(define (peg-nonterm->defn lst for-syntax)
- (let* ((nonterm (car lst))
- (grabber (cadr lst))
- (pattern (caddr lst))
- (nonterm-name (datum->syntax for-syntax
- (string->symbol (cadr nonterm)))))
- #`(define-peg-pattern #,nonterm-name
+;; (Definition (Identifier "Something") "<-" (Expression ...))
+;; `-> (define-peg-pattern Something 'all ...)
+(define (Definition->defn lst for-syntax)
+ (let ((identifier (second (second lst)))
+ (grabber (third lst))
+ (expression (fourth lst)))
+ #`(define-peg-pattern #,(datum->syntax for-syntax
+ (string->symbol identifier))
#,(cond
((string=? grabber "<--") (datum->syntax for-syntax 'all))
((string=? grabber "<-") (datum->syntax for-syntax 'body))
(else (datum->syntax for-syntax 'none)))
- #,(compressor (peg-pattern->defn pattern for-syntax) for-syntax))))
-
-;; lst has format ('peg-pattern ...).
-;; After the context-flatten, (cdr lst) has format
-;; (('peg-alternative ...) ...), where the outer list is a collection
-;; of elements from a '/' alternative.
-(define (peg-pattern->defn lst for-syntax)
- #`(or #,@(map (lambda (x) (peg-alternative->defn x for-syntax))
- (context-flatten (lambda (x) (eq? (car x) 'peg-alternative))
- (cdr lst)))))
-
-;; lst has format ('peg-alternative ...).
-;; After the context-flatten, (cdr lst) has the format
-;; (item ...), where each item has format either ("!" ...), ("&" ...),
-;; or ('peg-suffix ...).
-(define (peg-alternative->defn lst for-syntax)
- #`(and #,@(map (lambda (x) (peg-body->defn x for-syntax))
- (context-flatten (lambda (x) (or (string? (car x))
- (eq? (car x) 'peg-suffix)))
- (cdr lst)))))
-
-;; lst has the format either
-;; ("!" ('peg-suffix ...)), ("&" ('peg-suffix ...)), or
-;; ('peg-suffix ...).
-(define (peg-body->defn lst for-syntax)
- (cond
- ((equal? (car lst) "&")
- #`(followed-by #,(peg-suffix->defn (cadr lst) for-syntax)))
- ((equal? (car lst) "!")
- #`(not-followed-by #,(peg-suffix->defn (cadr lst) for-syntax)))
- ((eq? (car lst) 'peg-suffix)
- (peg-suffix->defn lst for-syntax))
- (else `(peg-parse-body-fail ,lst))))
-
-;; lst has format ('peg-suffix <peg-primary> (? (/ "*" "?" "+")))
-(define (peg-suffix->defn lst for-syntax)
- (let ((inner-defn (peg-primary->defn (cadr lst) for-syntax)))
- (cond
- ((null? (cddr lst))
- inner-defn)
- ((equal? (caddr lst) "*")
- #`(* #,inner-defn))
- ((equal? (caddr lst) "?")
- #`(? #,inner-defn))
- ((equal? (caddr lst) "+")
- #`(+ #,inner-defn)))))
-
-;; Parse a primary.
-(define (peg-primary->defn lst for-syntax)
- (let ((el (cadr lst)))
+ #,(compressor (Expression->defn expression for-syntax) for-syntax))))
+
+;; (Expression (Sequence X))
+;; `-> (X)
+;; (Expression (Sequence X) (Sequence Y))
+;; `-> (or X Y)
+;; (Expression (Sequence X) ((Sequence Y) (Sequence Z) ...))
+;; `-> (or X Y Z ...)
+(define (Expression->defn lst for-syntax)
+ (let ((first-sequence (second lst))
+ (rest (cddr lst)))
+ #`(or #,(Sequence->defn first-sequence for-syntax)
+ #,@(map (lambda (x)
+ (Sequence->defn x for-syntax))
+ (keyword-flatten '(Sequence) rest)))))
+
+
+(define (Sequence->defn lst for-syntax)
+ #`(and #,@(map (lambda (x) (Prefix->defn x for-syntax)) (cdr lst))))
+
+
+;; (Prefix (Suffix ...))
+;; `-> (...)
+;; (Prefix (NOT "!") (Suffix ...))
+;; `-> (not-followed-by ...)
+;; (Prefix (AND "&") (Suffix ...))
+;; `-> (followed-by ...)
+(define (Prefix->defn lst for-syntax)
+ (let ((suffix (second lst)))
+ (case (car suffix)
+ ('AND #`(followed-by #,(Suffix->defn (third lst) for-syntax)))
+ ('NOT #`(not-followed-by #,(Suffix->defn (third lst) for-syntax)))
+ (else (Suffix->defn suffix for-syntax)))))
+
+;; (Suffix (Primary ...))
+;; `-> (...)
+;; (Suffix (Primary ...) (STAR "*"))
+;; `-> (* ...)
+;; (Suffix (Primary ...) (QUESTION "?"))
+;; `-> (? ...)
+;; (Suffix (Primary ...) (PLUS "+"))
+;; `-> (+ ...)
+(define (Suffix->defn lst for-syntax)
+ (let* ((primary (second lst))
+ (out (Primary->defn primary for-syntax))
+ (extra (cddr lst)))
+ (if (null? extra)
+ out
+ (case (caar extra)
+ ('QUESTION #`(? #,out))
+ ('STAR #`(* #,out))
+ ('PLUS #`(+ #,out))))))
+
+(define (Primary->defn lst for-syntax)
+ (let ((value (second lst)))
+ (case (car value)
+ ('DOT #'peg-any)
+ ('Identifier (Identifier->defn value for-syntax))
+ ('Expression (Expression->defn value for-syntax))
+ ('Literal (Literal->defn value for-syntax))
+ ('Class (Class->defn value for-syntax)))))
+
+;; (Identifier "hello")
+;; `-> hello
+(define (Identifier->defn lst for-syntax)
+ (datum->syntax for-syntax (string->symbol (second lst))))
+
+;; (Literal (Char "a") (Char "b") (Char "c"))
+;; `-> "abc"
+(define (Literal->defn lst for-syntax)
+ (apply string (map (lambda (x) (Char->defn x for-syntax)) (cdr lst))))
+
+;; TODO: empty Class can happen: `[]`, but what does it represent?
+;; (Class ...)
+;; `-> (or ...)
+(define (Class->defn lst for-syntax)
+ #`(or #,@(map (lambda (x) (Range->defn x for-syntax))
+ (cdr lst))))
+
+;; For one character:
+;; (Range (Char "a"))
+;; `-> "a"
+;; Or for a range:
+;; (Range (Char "a") (Char "b"))
+;; `-> (range #\a #\b)
+(define (Range->defn lst for-syntax)
(cond
- ((list? el)
- (cond
- ((eq? (car el) 'peg-literal)
- (peg-literal->defn el for-syntax))
- ((eq? (car el) 'peg-charclass)
- (peg-charclass->defn el for-syntax))
- ((eq? (car el) 'peg-nonterminal)
- (datum->syntax for-syntax (string->symbol (cadr el))))))
- ((string? el)
+ ((= 2 (length lst))
+ (second (second lst)))
+ ((= 3 (length lst))
+ #`(range
+ #,(Char->defn (second lst) for-syntax)
+ #,(Char->defn (third lst) for-syntax)))))
+
+;; (Char "a")
+;; `-> #\a
+;; (Char "\\n")
+;; `-> #\newline
+;; (Char "\\135")
+;; `-> #\]
+(define (Char->defn lst for-syntax)
+ (let* ((charstr (second lst))
+ (first (string-ref charstr 0)))
(cond
- ((equal? el "(")
- (peg-pattern->defn (caddr lst) for-syntax))
- ((equal? el ".")
- (datum->syntax for-syntax 'peg-any))
- (else (datum->syntax for-syntax
- `(peg-parse-any unknown-string ,lst)))))
- (else (datum->syntax for-syntax
- `(peg-parse-any unknown-el ,lst))))))
-
-;; Trims characters off the front and end of STR.
-;; (trim-1chars "'ab'") -> "ab"
-(define (trim-1chars str) (substring str 1 (- (string-length str) 1)))
-
-;; Parses a literal.
-(define (peg-literal->defn lst for-syntax)
- (datum->syntax for-syntax (trim-1chars (cadr lst))))
-
-;; Parses a charclass.
-(define (peg-charclass->defn lst for-syntax)
- #`(or
- #,@(map
- (lambda (cc)
- (cond
- ((eq? (car cc) 'charclass-range)
- #`(range #,(datum->syntax
- for-syntax
- (string-ref (cadr cc) 0))
- #,(datum->syntax
- for-syntax
- (string-ref (cadr cc) 2))))
- ((eq? (car cc) 'charclass-single)
- (datum->syntax for-syntax (cadr cc)))))
- (context-flatten
- (lambda (x) (or (eq? (car x) 'charclass-range)
- (eq? (car x) 'charclass-single)))
- (cdr lst)))))
+ ((= 1 (string-length charstr)) first)
+ ((char-numeric? (string-ref charstr 1))
+ (integer->char
+ (reduce + 0
+ (map
+ (lambda (x y)
+ (* (- (char->integer x) (char->integer #\0)) y))
+ (reverse (string->list charstr 1))
+ '(1 8 64)))))
+ (else
+ (case (string-ref charstr 1)
+ ((#\n) #\newline)
+ ((#\r) #\return)
+ ((#\t) #\tab)
+ ((#\') #\')
+ ((#\]) #\])
+ ((#\\) #\\)
+ ((#\[) #\[))))))
+
+(define peg-grammar Grammar)
+
+;; Macro wrapper for PEG-PARSER. Parses PEG grammars expressed as strings and
+;; defines all the appropriate nonterminals.
+(define-syntax define-peg-string-patterns
+ (lambda (x)
+ (syntax-case x ()
+ ((_ str)
+ (peg-parser (syntax->datum #'str) x)))))
;; Compresses a list to save the optimizer work.
;; e.g. (or (and a)) -> a
@@ -263,11 +370,10 @@ RB < ']'
(let ((string (syntax->datum #'str-stx)))
(compile-peg-pattern
(compressor
- (peg-pattern->defn
- (peg:tree (match-pattern peg-pattern string)) #'str-stx)
+ (Expression->defn
+ (peg:tree (match-pattern Expression string)) #'str-stx)
#'str-stx)
(if (eq? accum 'all) 'body accum))))
(else (error "Bad embedded PEG string" args))))
(add-peg-compiler! 'peg peg-string-compile)
-
diff --git a/test-suite/tests/peg.test b/test-suite/tests/peg.test
index f516571e8..556145e72 100644
--- a/test-suite/tests/peg.test
+++ b/test-suite/tests/peg.test
@@ -28,17 +28,25 @@
;; the nonterminals defined in the PEG parser written with
;; S-expressions.
(define grammar-mapping
- '((grammar peg-grammar)
- (pattern peg-pattern)
- (alternative peg-alternative)
- (suffix peg-suffix)
- (primary peg-primary)
- (literal peg-literal)
- (charclass peg-charclass)
- (CCrange charclass-range)
- (CCsingle charclass-single)
- (nonterminal peg-nonterminal)
- (sp peg-sp)))
+ '((Grammar Grammar)
+ (Definition Definition)
+ (Expression Expression)
+ (Sequence Sequence)
+ (Prefix Prefix)
+ (Suffix Suffix)
+ (Primary Primary)
+ (Identifier Identifier)
+ (Literal Literal)
+ (Class Class)
+ (Range Range)
+ (Char Char)
+ (LEFTARROW LEFTARROW)
+ (AND AND)
+ (NOT NOT)
+ (QUESTION QUESTION)
+ (STAR STAR)
+ (PLUS PLUS)
+ (DOT DOT)))
;; Transforms the nonterminals defined in the PEG parser written as a PEG to
the nonterminals defined in the PEG parser written with S-expressions.
(define (grammar-transform x)
@@ -69,7 +77,7 @@
(peg:tree (match-pattern (@@ (ice-9 peg) peg-grammar) (@@ (ice-9 peg)
peg-as-peg)))
(tree-map
grammar-transform
- (peg:tree (match-pattern grammar (@@ (ice-9 peg) peg-as-peg)))))))
+ (peg:tree (match-pattern (@@ (ice-9 peg) peg-grammar) (@@ (ice-9 peg)
peg-as-peg)))))))
;; A grammar for pascal-style comments from Wikipedia.
(define comment-grammar
--
2.45.2