emacs-pretest-bug
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

xml.el changes


From: Dave Love
Subject: xml.el changes
Date: 13 Mar 2003 23:04:04 +0000
User-agent: Gnus/5.09 (Gnus v5.9.0) Emacs/21.2

I made these changes to xml.el mainly for non-ASCII and to deal with
character syntax reliably, e.g. for testing a parse in *scratch*.  For
skipping whitespace I reverted to using skip-chars-forward, which is
byte-coded, rather than the use of re-search-forward.

I may work on it later to eliminate some of the fixmes I noted.

2003-03-13  Dave Love  <address@hidden>

        * xml.el (xml-parse-region): Use with-syntax-table.  Allow leading
        comments.  Add autoload cookie.
        (xml-parse-file): Add autoload cookie.
        (xml-parse-attlist, xml-parse-dtd): Use char classes for non-ASCII
        letters.
        (xml-parse-tag, xml-parse-tag, xml-parse-attlist)
        (xml-parse-attlist, xml-parse-attlist, xml-skip-dtd)
        (xml-parse-dtd): Use skip-chars-forward.
        (xml-parse-tag): Pass parse-dtd to recursive call.  Revert change
        of 2002-12-16 (see XML [44]).
        (xml-parse-dtd): Skip comment declarations.
        (xml-substitute-entity): New.
        (xml-substitute-special): Use it.

Index: xml.el
===================================================================
RCS file: /cvsroot/emacs/emacs/lisp/xml.el,v
retrieving revision 1.15
diff -u -p -c -r1.15 xml.el
cvs server: conflicting specifications of output style
*** xml.el      14 Feb 2003 09:58:04 -0000      1.15
--- xml.el      13 Mar 2003 22:44:07 -0000
***************
*** 1,10 ****
  ;;; xml.el --- XML parser
  
! ;; Copyright (C) 2000, 2001 Free Software Foundation, Inc.
  
  ;; Author: Emmanuel Briot  <address@hidden>
  ;; Maintainer: Emmanuel Briot <address@hidden>
! ;; Keywords: xml
  
  ;; This file is part of GNU Emacs.
  
--- 1,10 ----
  ;;; xml.el --- XML parser
  
! ;; Copyright (C) 2000, 2001, 2003 Free Software Foundation, Inc.
  
  ;; Author: Emmanuel Briot  <address@hidden>
  ;; Maintainer: Emmanuel Briot <address@hidden>
! ;; Keywords: xml, data
  
  ;; This file is part of GNU Emacs.
  
***************
*** 25,42 ****
  
  ;;; Commentary:
  
! ;; This file contains a full XML parser. It parses a file, and returns a list
! ;; that can be used internally by any other lisp file.
! ;; See some example in todo.el
  
  ;;; FILE FORMAT
  
! ;; It does not parse the DTD, if present in the XML file, but knows how to
! ;; ignore it. The XML file is assumed to be well-formed. In case of error, the
! ;; parsing stops and the XML file is shown where the parsing stopped.
  ;;
! ;; It also knows how to ignore comments, as well as the special ?xml? tag
! ;; in the XML file.
  ;;
  ;; The XML file should have the following format:
  ;;    <node1 attr1="name1" attr2="name2" ...>value
--- 25,43 ----
  
  ;;; Commentary:
  
! ;; This file contains an incomplete non-validating XML parser.  It
! ;; parses a file, and returns a list that can be used internally by
! ;; any other lisp libraries.
  
  ;;; FILE FORMAT
  
! ;; The document type declaration may either be ignored or (optionally)
! ;; parsed, but currently the parsing will only accept element
! ;; declarations.  The XML file is assumed to be well-formed. In case
! ;; of error, the parsing stops and the XML file is shown where the
! ;; parsing stopped.
  ;;
! ;; It also knows how to ignore comments and processing instructions.
  ;;
  ;; The XML file should have the following format:
  ;;    <node1 attr1="name1" attr2="name2" ...>value
*************** An empty string is returned if the attri
*** 114,119 ****
--- 115,121 ----
  ;;**
  ;;*******************************************************************
  
+ ;;;###autoload
  (defun xml-parse-file (file &optional parse-dtd)
    "Parse the well-formed XML FILE.
  If FILE is already edited, this will keep the buffer alive.
*************** If PARSE-DTD is non-nil, the DTD is pars
*** 135,140 ****
--- 137,156 ----
        (kill-buffer (current-buffer)))
        xml)))
  
+ ;; Get space syntax correct per XML [3].
+ (defvar xml-syntax-table
+   (let ((table (make-syntax-table)))
+     (dotimes (c 31)
+       (modify-syntax-entry c "." table))
+     (dolist (c '(?\t ?\n ?\r))
+       (modify-syntax-entry c " " table))
+     table))
+ 
+ ;; Fixme:  This needs re-writing to deal with the XML grammar properly, i.e.
+ ;;   document    ::=    prolog element Misc*
+ ;;   prolog    ::=    XMLDecl? Misc* (doctypedecl Misc*)?
+ 
+ ;;;###autoload
  (defun xml-parse-region (beg end &optional buffer parse-dtd)
    "Parse the region from BEG to END in BUFFER.
  If BUFFER is nil, it defaults to the current buffer.
*************** Returns the XML list for the region, or 
*** 142,173 ****
  is not a well-formed XML file.
  If PARSE-DTD is non-nil, the DTD is parsed rather than skipped,
  and returned as the first element of the list"
!   (let (xml result dtd)
!     (save-excursion
!       (if buffer
!         (set-buffer buffer))
!       (goto-char beg)
!       (while (< (point) end)
!       (if (search-forward "<" end t)
!           (progn
!             (forward-char -1)
!             (if (null xml)
!                 (progn
!                   (setq result (xml-parse-tag end parse-dtd))
!                   (cond
!                    ((null result))
!                    ((listp (car result))
!                     (setq dtd (car result))
!                     (add-to-list 'xml (cdr result)))
!                    (t
!                     (add-to-list 'xml result))))
! 
!               ;;  translation of rule [1] of XML specifications
!               (error "XML files can have only one toplevel tag")))
!         (goto-char end)))
!       (if parse-dtd
!         (cons dtd (reverse xml))
!       (reverse xml)))))
  
  
  (defun xml-parse-tag (end &optional parse-dtd)
--- 158,193 ----
  is not a well-formed XML file.
  If PARSE-DTD is non-nil, the DTD is parsed rather than skipped,
  and returned as the first element of the list"
!   ;; Use fixed syntax table to ensure regexp char classes and syntax
!   ;; specs DTRT.
!   (with-syntax-table (standard-syntax-table)
!     (let (xml result dtd)
!       (save-excursion
!       (if buffer
!           (set-buffer buffer))
!       (goto-char beg)
!       (while (< (point) end)
!         (if (search-forward "<" end t)
!             (progn
!               (forward-char -1)
!               (if (null xml)
!                   (progn
!                     (setq result (xml-parse-tag end parse-dtd))
!                     (cond
!                      ((null result))
!                      ((listp (car result))
!                       (setq dtd (car result))
!                       (if (cdr result) ; possible leading comment
!                           (add-to-list 'xml (cdr result))))
!                      (t
!                       (add-to-list 'xml result))))
! 
!                 ;;  translation of rule [1] of XML specifications
!                 (error "XML files can have only one toplevel tag")))
!           (goto-char end)))
!       (if parse-dtd
!           (cons dtd (reverse xml))
!         (reverse xml))))))
  
  
  (defun xml-parse-tag (end &optional parse-dtd)
*************** Returns one of:
*** 184,191 ****
     ;; beginning of a document)
     ((looking-at "<\\?")
      (search-forward "?>" end)
!     (goto-char (- (re-search-forward "[^[:space:]]") 1))
!     (xml-parse-tag end))
     ;;  Character data (CDATA) sections, in which no tag should be interpreted
     ((looking-at "<!\\[CDATA\\[")
      (let ((pos (match-end 0)))
--- 204,211 ----
     ;; beginning of a document)
     ((looking-at "<\\?")
      (search-forward "?>" end)
!     (skip-chars-forward " \t\n\r" end)
!     (xml-parse-tag end parse-dtd))
     ;;  Character data (CDATA) sections, in which no tag should be interpreted
     ((looking-at "<!\\[CDATA\\[")
      (let ((pos (match-end 0)))
*************** Returns one of:
*** 198,204 ****
        (if parse-dtd
          (setq dtd (xml-parse-dtd end))
        (xml-skip-dtd end))
!       (goto-char (- (re-search-forward "[^[:space:]]") 1))
        (if dtd
          (cons dtd (xml-parse-tag end))
        (xml-parse-tag end))))
--- 218,224 ----
        (if parse-dtd
          (setq dtd (xml-parse-dtd end))
        (xml-skip-dtd end))
!       (skip-chars-forward " \t\n\r" end)
        (if dtd
          (cons dtd (xml-parse-tag end))
        (xml-parse-tag end))))
*************** Returns one of:
*** 219,227 ****
           pos)
  
        ;; is this an empty element ?
!       (if (looking-at "/[[:space:]]*>")
          (progn
            (forward-char 2)
            (nreverse (cons '("") children)))
  
        ;; is this a valid start tag ?
--- 239,249 ----
           pos)
  
        ;; is this an empty element ?
!       (if (looking-at "/>")
          (progn
            (forward-char 2)
+           ;; Fixme:  Inconsistent with the nil content returned from
+           ;; `<tag></tag>'.
            (nreverse (cons '("") children)))
  
        ;; is this a valid start tag ?
*************** Returns one of:
*** 277,288 ****
  
  (defun xml-parse-attlist (end)
    "Return the attribute-list that point is looking at.
! The search for attributes end at the position END in the current buffer.
! Leaves the point on the first non-blank character after the tag."
    (let ((attlist ())
        name)
!     (goto-char (- (re-search-forward "[^[:space:]]") 1))
!     (while (looking-at 
"\\([a-zA-Z_:][-a-zA-Z0-9._:]*\\)[[:space:]]*=[[:space:]]*")
        (setq name (intern (match-string 1)))
        (goto-char (match-end 0))
  
--- 299,310 ----
  
  (defun xml-parse-attlist (end)
    "Return the attribute-list that point is looking at.
! The search for attributes ends at the position END in the current buffer.
! Leave point at the first non-blank character after the tag."
    (let ((attlist ())
        name)
!     (skip-chars-forward " \t\n\r" end)
!     (while (looking-at "\\([[:alpha:]_:][-[:alnum:]._:]*\\)\\s-*=\\s-*")
        (setq name (intern (match-string 1)))
        (goto-char (match-end 0))
  
*************** Leaves the point on the first non-blank 
*** 298,304 ****
  
        (push (cons name (match-string-no-properties 1)) attlist)
        (goto-char (match-end 0))
!       (goto-char (- (re-search-forward "[^[:space:]]") 1))
        (if (> (point) end)
          (error "XML: end of attribute list not found before end of region"))
        )
--- 320,326 ----
  
        (push (cons name (match-string-no-properties 1)) attlist)
        (goto-char (match-end 0))
!       (skip-chars-forward " \t\n\r" end)
        (if (> (point) end)
          (error "XML: end of attribute list not found before end of region"))
        )
*************** Leaves the point on the first non-blank 
*** 312,317 ****
--- 334,341 ----
  ;;**
  ;;*******************************************************************
  
+ ;; Fixme: This fails at least if the DTD contains conditional sections.
+ 
  (defun xml-skip-dtd (end)
    "Skip the DTD that point is looking at.
  The DTD must end before the position END in the current buffer.
*************** This follows the rule [28] in the XML sp
*** 323,329 ****
    (condition-case nil
        (progn
        (forward-word 1)  ;; name of the document
!       (goto-char (- (re-search-forward "[^[:space:]]") 1))
        (if (looking-at "\\[")
            (re-search-forward "\\][[:space:]]*>" end)
          (search-forward ">" end)))
--- 347,353 ----
    (condition-case nil
        (progn
        (forward-word 1)  ;; name of the document
!       (skip-chars-forward " \t\n\r" end)
        (if (looking-at "\\[")
            (re-search-forward "\\][[:space:]]*>" end)
          (search-forward ">" end)))
*************** This follows the rule [28] in the XML sp
*** 333,339 ****
    "Parse the DTD that point is looking at.
  The DTD must end before the position END in the current buffer."
    (forward-char (length "<!DOCTYPE"))
!   (goto-char (- (re-search-forward "[^[:space:]]") 1))
    (if (looking-at ">")
        (error "XML: invalid DTD (excepting name of the document)"))
  
--- 357,363 ----
    "Parse the DTD that point is looking at.
  The DTD must end before the position END in the current buffer."
    (forward-char (length "<!DOCTYPE"))
!   (skip-chars-forward " \t\n\r" end)
    (if (looking-at ">")
        (error "XML: invalid DTD (excepting name of the document)"))
  
*************** The DTD must end before the position END
*** 343,349 ****
        type element end-pos)
      (goto-char (match-end 0))
  
!     (goto-char (- (re-search-forward "[^[:space:]]") 1))
  
      ;;  External DTDs => don't know how to handle them yet
      (if (looking-at "SYSTEM")
--- 367,373 ----
        type element end-pos)
      (goto-char (match-end 0))
  
!     (skip-chars-forward " \t\n\r" end)
  
      ;;  External DTDs => don't know how to handle them yet
      (if (looking-at "SYSTEM")
*************** The DTD must end before the position END
*** 353,366 ****
        (error "XML: Unknown declaration in the DTD"))
  
      ;;  Parse the rest of the DTD
      (forward-char 1)
      (while (and (not (looking-at "[[:space:]]*\\]"))
                (<= (point) end))
        (cond
  
         ;;  Translation of rule [45] of XML specifications
         ((looking-at
!        
"[[:space:]]*<!ELEMENT[[:space:]]+\\([a-zA-Z0-9.%;]+\\)[[:space:]]+\\([^>]+\\)>")
  
        (setq element (intern (match-string-no-properties 1))
              type    (match-string-no-properties 2))
--- 377,392 ----
        (error "XML: Unknown declaration in the DTD"))
  
      ;;  Parse the rest of the DTD
+     ;;  Fixme: Deal with ENTITY, ATTLIST, NOTATION, PIs.
      (forward-char 1)
      (while (and (not (looking-at "[[:space:]]*\\]"))
                (<= (point) end))
+       (skip-chars-forward " \t\n\r" end)
        (cond
  
         ;;  Translation of rule [45] of XML specifications
         ((looking-at
!        "<!ELEMENT\\s-+\\([[:alnum:].%;]+\\)\\s-+\\([^>]+\\)>")
  
        (setq element (intern (match-string-no-properties 1))
              type    (match-string-no-properties 2))
*************** The DTD must end before the position END
*** 381,393 ****
  
        ;;  rule [45]: the element declaration must be unique
        (if (assoc element dtd)
!           (error "XML: elements declaration must be unique in a DTD (<%s>)"
                   (symbol-name element)))
  
        ;;  Store the element in the DTD
        (push (list element type) dtd)
        (goto-char end-pos))
! 
  
         (t
        (error "XML: Invalid DTD item"))
--- 407,420 ----
  
        ;;  rule [45]: the element declaration must be unique
        (if (assoc element dtd)
!           (error "XML: element declarations must be unique in a DTD (<%s>)"
                   (symbol-name element)))
  
        ;;  Store the element in the DTD
        (push (list element type) dtd)
        (goto-char end-pos))
!        ((looking-at "<!--")
!       (search-forward "-->" end))
  
         (t
        (error "XML: Invalid DTD item"))
*************** The DTD must end before the position END
*** 440,459 ****
  ;;**
  ;;*******************************************************************
  
  (defun xml-substitute-special (string)
!   "Return STRING, after subsituting special XML sequences."
!   (while (string-match "&lt;" string)
!     (setq string (replace-match "<"  t nil string)))
!   (while (string-match "&gt;" string)
!     (setq string (replace-match ">"  t nil string)))
!   (while (string-match "&apos;" string)
!     (setq string (replace-match "'"  t nil string)))
!   (while (string-match "&quot;" string)
!     (setq string (replace-match "\"" t nil string)))
!   ;; This goes last so it doesn't confuse the matches above.
!   (while (string-match "&amp;" string)
!     (setq string (replace-match "&"  t nil string)))
!   string)
  
  ;;*******************************************************************
  ;;**
--- 467,501 ----
  ;;**
  ;;*******************************************************************
  
+ ;; Fixme:  Take declared entities from the DTD when they're available.
+ (defun xml-substitute-entity (match)
+   (save-match-data
+     (let ((match1 (match-string 1 str)))
+       (cond ((string= match1 "lt") "<")
+           ((string= match1 "gt") ">")
+           ((string= match1 "apos") "'")
+           ((string= match1 "quot") "\"")
+           ((string= match1 "amp") "&")
+           ((and (string-match "#\\([0-9]+\\)" match1)
+                 (let ((c (decode-char
+                           'ucs
+                           (string-to-number (match-string 1 match1)))))
+                   (if c (string c))))) ; else unrepresentable
+           ((and (string-match "#x\\([[:xdigit:]]+\\)" match1)
+                 (let ((c (decode-char
+                           'ucs
+                           (string-to-number (match-string 1 match1) 16))))
+                   (if c (string c)))))
+           ;; default to asis
+           (t match)))))
+ 
  (defun xml-substitute-special (string)
!   "Return STRING, after subsituting entity references."
!   ;; This originally made repeated passes through the string from the
!   ;; beginning, which isn't correct, since then either "&amp;amp;" or
!   ;; "&#38;amp;" won't DTRT.
!   (replace-regexp-in-string "&\\([^;]+\\);"
!                           #'xml-substitute-entity string t t))
  
  ;;*******************************************************************
  ;;**

reply via email to

[Prev in Thread] Current Thread [Next in Thread]