[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[PATCH devel 4/6] htmlprag: Allow users to parameterize the parent const
From: |
Maxim Cournoyer |
Subject: |
[PATCH devel 4/6] htmlprag: Allow users to parameterize the parent constraints. |
Date: |
Thu, 4 Mar 2021 21:35:22 -0500 |
These constraints are used to make the tokenizer more lenient in the
face of incomplete/invalid HTML; unfortunately it causes valid HTML to
be parsed incorrectly. This change allows users to parameterize the
alist that defines which elements are treated specially via the
%PARENT-CONSTRAINTS parameter, or to disable the pragmatic behavior
completely via the %STRICT-TOKENIZER? parameter or the STRICT?
keyword argument of the HTML->SXML procedure (and its variants).
* src/htmlprag.scm: Update doc.
(%default-parent-constraints): New variable.
(%parent-constraints): New parameter.
(%strict-tokenizer?): Likewise.
(parse-html/tokenizer)[strict?]: New keyword argument. Adjust to use
the newly added parameters and argument.
(htmlprag-internal:parse-html)[strict?]: New argument.
(test-htmlprag): Add tests.
(html->sxml-0nf, html->sxml-1nf, html->sxml-2nf)[strict?]: New argument.
---
src/htmlprag.scm | 143 ++++++++++++++++++++++++++++++++++-------------
1 file changed, 104 insertions(+), 39 deletions(-)
diff --git a/src/htmlprag.scm b/src/htmlprag.scm
index 3bd352b..79a7b2f 100644
--- a/src/htmlprag.scm
+++ b/src/htmlprag.scm
@@ -1,6 +1,7 @@
;; (htmlprag) -- pragmatic parsing of real-world HTML
;; Copyright (C) 2003-2004 Neil W. Van Dyke <neil at neilvandyke.org>
;; Modified 2004 by Andy Wingo to fit in with guile-lib.
+;; Modified 2021 by Maxim Cournoyer to parameterize the parent constraints.
;; This program is free software: you can redistribute it and/or modify
;; it under the terms of the GNU Lesser General Public License as
@@ -39,7 +40,12 @@
;;; defeat a strict or validating parser. HtmlPrag's handling of errors is
;;; intended to generally emulate popular Web browsers' interpretation of the
;;; structure of erroneous HTML. We euphemistically term this kind of parse
-;;; ``pragmatic.''
+;;; ``pragmatic.'' To disable the pragmatic behavior and parse HTML more
+;;; rigidly, the @code{%strict-tokenizer?} parameter can be set to
+;;; @code{#true}. In this mode of operation, one ended HTML tags will not be
+;;; treated specially, for example, and their content will be coalesced. On
+;;; the other side, valid HTML will parse more accurately. When working with
+;;; HTML known to be valid, it makes sense to use this mode of operation.
;;;
;;; HtmlPrag also has some support for [XHTML], although XML namespace
;;; qualifiers [XML-Names] are currently accepted but stripped from the
@@ -1076,6 +1082,48 @@
;;; input port. This procedure is used internally, and generally should not be
;;; called directly.
+;;; The alist below defines constraints about what possible parents a
+;;; tag may have. This exists to allow parsing malformed HTML, e.g.,
+;;; in the presence of missing closing tags. The drawback is that
+;;; this restricts where these tags may nested; for example, the
+;;; following HTML fragment
+;;; "<body><blockquote><p>foo</p>\n</blockquote></body>" parses to
+;;; '(*TOP* (body (blockquote) (p "foo") "\n")), which incorrectly
+;;; forces the 'p' tag to be a child of 'body' rather than of
+;;; 'blockquote'.
+(define %default-parent-constraints
+ '((area . (map))
+ (body . (html))
+ (caption . (table))
+ (colgroup . (table))
+ (dd . (dl))
+ (dt . (dl))
+ (frame . (frameset))
+ (head . (html))
+ (isindex . (head))
+ (li . (dir menu ol ul))
+ (meta . (head))
+ (noframes . (frameset))
+ (option . (select))
+ (p . (body td th))
+ (param . (applet))
+ (tbody . (table))
+ (td . (tr))
+ (th . (tr))
+ (thead . (table))
+ (title . (head))
+ (tr . (table tbody thead))))
+
+;;; The following parameter enables users to parameterize which
+;;; constraints to use when tokenizing HTML.
+(define %parent-constraints (make-parameter %default-parent-constraints))
+
+;;; The following switch is disabled for historical reasons. When true,
+;;; it disables the use of the above %parent-constraints parameter.
+;;; TODO: Set to #true when bumping the major version, which is a
+;;; better default in modern times, where most HTML is valid.
+(define %strict-tokenizer? (make-parameter #false))
+
(define parse-html/tokenizer
;; TODO: Document the algorithm, then see if rewriting as idiomatic Scheme
;; can make it more clear.
@@ -1084,32 +1132,9 @@
;; acceptable way to parse old HTML that uses the `p' element
;; as a paragraph terminator.
htmlprag-internal:empty-elements)
- (parent-constraints
- ;; TODO: Maybe make this an option.
- '((area . (map))
- (body . (html))
- (caption . (table))
- (colgroup . (table))
- (dd . (dl))
- (dt . (dl))
- (frame . (frameset))
- (head . (html))
- (isindex . (head))
- (li . (dir menu ol ul))
- (meta . (head))
- (noframes . (frameset))
- (option . (select))
- (p . (body td th))
- (param . (applet))
- (tbody . (table))
- (td . (tr))
- (th . (tr))
- (thead . (table))
- (title . (head))
- (tr . (table tbody thead))))
(start-tag-name (lambda (tag-token) (car tag-token)))
(end-tag-name (lambda (tag-token) (list-ref tag-token 1))))
- (lambda (tokenizer normalized?)
+ (lambda* (tokenizer normalized? #:key (strict? 'unset))
;; Example `begs' value:
;;
;; ( ((head ...) . ( (title ...) ))
@@ -1169,7 +1194,14 @@
(add-to-current-beg tok))
((eqv? kind shtml-start-symbol)
(let* ((name (start-tag-name tok))
- (cell (assq name parent-constraints)))
+ ;; If STRICT? is a boolean, it means the
+ ;; user explicitly provided it, in which
+ ;; case it takes precedence over the
+ ;; %strict-tokenizer? parameter.
+ (cell (and (not (if (boolean? strict?)
+ strict?
+ (%strict-tokenizer?)))
+ (assq name
(%parent-constraints)))))
(and cell (finish-begs-upto (cdr cell) begs))
(add-to-current-beg tok)
(or (memq name empty-elements)
@@ -1207,7 +1239,7 @@
;; variants, and should not be used directly by programs. The interface is
;; likely to change in future versions of HtmlPrag.
-(define (htmlprag-internal:parse-html input normalized? top?)
+(define (htmlprag-internal:parse-html input normalized? top? strict?)
(let ((parse
(lambda ()
(parse-html/tokenizer
@@ -1219,15 +1251,16 @@
"invalid input type"
input)))
normalized?)
- normalized?))))
+ normalized?
+ #:strict? strict?))))
(if top?
(cons shtml-top-symbol (parse))
(parse))))
-;;; @defproc html->sxml-0nf input
-;;; @defprocx html->sxml-1nf input
-;;; @defprocx html->sxml-2nf input
-;;; @defprocx html->sxml input
+;;; @defproc html->sxml-0nf input strict?
+;;; @defprocx html->sxml-1nf input strict?
+;;; @defprocx html->sxml-2nf input strict?
+;;; @defprocx html->sxml input strict?
;;;
;;; Permissively parse HTML from @var{input}, which is either an input port or
;;; a string, and emit an SHTML equivalent or approximation. To borrow and
@@ -1257,7 +1290,15 @@
;;; Note that in the emitted SHTML the text token @code{"still < bold"} is
;;; @emph{not} inside the @code{b} element, which represents an unfortunate
;;; failure to emulate all the quirks-handling behavior of some popular Web
-;;; browsers.
+;;; browsers. When correctness is preferred over pragmatism, the
+;;; @code{%strict-tokenizer?} parameter can be set to true. In the above
+;;; example, the unbound elements would nested, but valid HTML would parse
+;;; correctly, without the parsing quirk mentioned above. Alternatively, the
+;;; @code{strict?} keyword argument can be set to true, in which case it takes
+;;; precedence over the value of the @code{%strict-tokenizer?} parameter.
+;;; Finally, the %parent-constraints parameter can also be used to customize
+;;; which elements should be treated specially, when operating in the default
+;;; pragmatic mode.
;;;
;;; The procedures @code{html->sxml-@var{n}nf} for @var{n} 0 through 2
;;; correspond to 0th through 2nd normal forms of SXML as specified in [SXML],
@@ -1267,9 +1308,12 @@
;;; @code{html->sxml-0nf}, and can be used in scripts and interactively, when
;;; terseness is important and any normal form of SXML would suffice.
-(define (html->sxml-0nf input) (htmlprag-internal:parse-html input #f #t))
-(define (html->sxml-1nf input) (htmlprag-internal:parse-html input #f #t))
-(define (html->sxml-2nf input) (htmlprag-internal:parse-html input #t #t))
+(define* (html->sxml-0nf input #:key (strict? 'unset))
+ (htmlprag-internal:parse-html input #f #t strict?))
+(define* (html->sxml-1nf input #:key (strict? 'unset))
+ (htmlprag-internal:parse-html input #f #t strict?))
+(define* (html->sxml-2nf input #:key (strict? 'unset))
+ (htmlprag-internal:parse-html input #t #t strict?))
(define html->sxml html->sxml-0nf)
(define html->shtml html->sxml-0nf)
@@ -1660,7 +1704,7 @@
;;; @defproc test-htmlprag
;;;
;;; Run the test suite. A log will be printed to the default output port.
-;;; Returns true iff all tests pass.
+;;; Returns true if all tests pass.
(define (test-htmlprag)
(letrec ((passed 0)
@@ -1708,10 +1752,10 @@
(display ";; ")
(write expected)
(newline))))))
- (t1 (lambda (input expected)
+ (t1 (lambda* (input expected #:key (strict? 'unset))
(test html->shtml
'html->shtml
- (list input)
+ (list input #:strict? strict?)
(cons shtml-top-symbol expected))))
(t2 (lambda (input expected)
(test shtml->html
@@ -2019,6 +2063,20 @@
"<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\""
" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\">"))
+ (parameterize ((%strict-tokenizer? #true))
+ ;; When using the strict tokenizer mode, the 'p' tag is correctly parsed
+ ;; as a child of blockquote.
+ (t1 "<body><blockquote><p>foo</p>\n</blockquote></body>"
+ '((body (blockquote (p "foo") "\n"))))
+ ;; Ensure the strict? argument takes precedence over %strict-tokenizer?.
+ (t1 "<body><blockquote><p>foo</p>\n</blockquote></body>"
+ '((body (blockquote) (p "foo") "\n"))
+ #:strict? #false)
+ ;; In strict tokenizer mode, missing closing tags are not handled
+ ;; specially.
+ (t1 "<body><blockquote><p>foo\n</blockquote></body>"
+ '((body (blockquote (p "foo\n"))))))
+
;; TODO: Write more test cases for HTML encoding.
;; TODO: Document this.
@@ -2213,6 +2271,9 @@ shtml-entity-value
make-html-tokenizer
tokenize-html
shtml-token-kind
+%default-parent-constraints
+%parent-constraints
+%strict-tokenizer?
parse-html/tokenizer
html->sxml-0nf
html->sxml-1nf
@@ -2227,3 +2288,7 @@ test-htmlprag
)
;;; arch-tag: 491d7e61-5690-4b76-bc8f-d70315c10ed5
;;; htmlprag.scm ends here
+
+;; Local Variables:
+;; fill-column: 78
+;; End:
--
2.30.1