;;; -*-Scheme-*-
;;;
-;;; $Id: xml-parser.scm,v 1.2 2001/07/05 20:47:41 cph Exp $
+;;; $Id: xml-parser.scm,v 1.3 2001/07/06 20:50:47 cph Exp $
;;;
;;; Copyright (c) 2001 Massachusetts Institute of Technology
;;;
;;; standard, second edition, 6 October 2000. Each such comment marks
;;; the code that corresponds to that rule.
-;;; **** TO DO ****
-;;; * Attribute-value normalization (p. 29).
-
(declare (usual-integrations))
\f
;;;; Utilities
(define char-set:char-data
(char-set-difference char-set:xml-char (char-set #\< #\&)))
-(define-*parser-macro S ;[3]
- `(NOISE (+ (ALPHABET CHAR-SET:XML-WHITESPACE))))
-
-(define-*parser-macro S?
- `(NOISE (* (ALPHABET CHAR-SET:XML-WHITESPACE))))
-
(define char-set:xml-whitespace
(char-set #\space #\tab #\return #\linefeed))
-(define-*parser-macro (bracket description open close . body)
- (let ((v (generate-uninterned-symbol)))
- `(WITH-POINTER ,v
- (SEQ ,open
- ,@body
- (ALT ,close
- (SEXP
- (LAMBDA (BUFFER)
- BUFFER
- (ERROR
- ,(if (string? description)
- (string-append "Unterminated " description " at")
- `(STRING-APPEND "Unterminated " ,description " at"))
- (PARSER-BUFFER-POSITION-STRING ,v)))))))))
-
-(define-*parser-macro (sbracket description open close . body)
- `(BRACKET ,description (NOISE (STRING ,open)) (NOISE (STRING ,close))
- ,@body))
-
-(define-*parser-macro (require-success message body)
- `(ALT ,body
- (SEXP
- (LAMBDA (BUFFER)
- (ERROR ,(if (string? message)
- (string-append message " at")
- `(STRING-APPEND ,message " at"))
- (PARSER-BUFFER-POSITION-STRING BUFFER))))))
+(define (coalesce-strings! elements)
+ (do ((elements elements (cdr elements)))
+ ((not (pair? elements)))
+ (if (and (string? (car elements))
+ (pair? (cdr elements))
+ (string? (cadr elements)))
+ (begin
+ (set-car! elements
+ (string-append (car elements)
+ (cadr elements)))
+ (set-cdr! elements (cddr elements))))))
\f
(define (make-xml-char-reference n)
- (if (not (or (= n #x9)
- (= n #xA)
- (= n #xD)
- (<= #x20 n #xD7FF)
- (<= #xE000 n #xFFFD)
- (<= #x10000 n #x10FFFF)))
+ (if (not (valid-xml-code-point? n))
(error "Disallowed Unicode character code:" n))
(integer->unicode-string n))
+(define (valid-xml-code-point? n)
+ (and (< n #x110000)
+ (if (< n #xD800)
+ (or (>= n #x20)
+ (= n #x9)
+ (= n #xA)
+ (= n #xD))
+ (and (>= n #xE000)
+ (not (or (= n #xFFFE)
+ (= n #xFFFF)))))))
+
(define (integer->unicode-string n)
- (let ((initial-char
- (lambda (n offset)
- (integer->char
- (fix:or (fix:and (fix:lsh #xFF (fix:+ n 1)) #xFF)
- (fix:lsh n -6)))))
- (subsequent-char
- (lambda (offset)
- (integer->char
- (fix:or #x80
- (fix:and (fix:lsh n (fix:- 0 offset))
- #x3F))))))
- (declare (integrate-operator initial-char subsequent-char))
- (if (not (and (<= 0 n) (< n #x80000000)))
- (error:bad-range-argument n 'INTEGER->UNICODE-STRING))
- (cond ((< n #x00000080)
- (string (integer->char n)))
- ((< n #x00000800)
- (string (initial-char 5 6)
- (subsequent-char 6)))
- ((< n #x00010000)
- (string (initial-char 4 12)
- (subsequent-char 12)
- (subsequent-char 6)))
- ((< n #x00200000)
- (string (initial-char 3 18)
- (subsequent-char 18)
- (subsequent-char 12)
- (subsequent-char 6)))
- ((< n #x04000000)
- (string (initial-char 2 24)
- (subsequent-char 24)
- (subsequent-char 18)
- (subsequent-char 12)
- (subsequent-char 6)))
- (else
- (string (initial-char 1 30)
- (subsequent-char 30)
- (subsequent-char 24)
- (subsequent-char 18)
- (subsequent-char 12)
- (subsequent-char 6))))))
-\f
-(define (normalize-line-endings string)
- (if (string-find-next-char string #\return)
- (let ((end (string-length string)))
- (let ((step-over-eol
- (lambda (index)
- (fix:+ index
- (if (and (fix:< (fix:+ index 1) end)
- (char=? (string-ref string (fix:+ index 1))
- #\linefeed))
- 2
- 1)))))
- (let ((n
- (let loop ((start 0) (n 0))
- (let ((index
- (substring-find-next-char string start end
- #\return)))
- (if index
- (loop (step-over-eol index)
- (fix:+ n (fix:+ (fix:- index start) 1)))
- (fix:+ n (fix:- end start)))))))
- (let ((string* (make-string n)))
- (let loop ((start 0) (start* 0))
- (let ((index
- (substring-find-next-char string start end
- #\return)))
- (if index
- (let ((start*
- (substring-move! string start index
- string* start*)))
- (string-set! string* start* #\newline)
- (loop (step-over-eol index)
- (fix:+ start* 1)))
- (substring-move! string start end string* start*))))
- string*))))
- string))
+
+ (define-integrable (initial-char n offset)
+ (integer->char
+ (fix:or (fix:and (fix:lsh #xFF (fix:+ n 1)) #xFF)
+ (fix:lsh n (fix:- 0 offset)))))
+
+ (define-integrable (subsequent-char offset)
+ (integer->char
+ (fix:or #x80
+ (fix:and (fix:lsh n (fix:- 0 offset)) #x3F))))
+
+ (if (not (and (<= 0 n) (< n #x80000000)))
+ (error:bad-range-argument n 'INTEGER->UNICODE-STRING))
+ (cond ((< n #x00000080)
+ (string (integer->char n)))
+ ((< n #x00000800)
+ (string (initial-char 5 6)
+ (subsequent-char 6)))
+ ((< n #x00010000)
+ (string (initial-char 4 12)
+ (subsequent-char 12)
+ (subsequent-char 6)))
+ ((< n #x00200000)
+ (string (initial-char 3 18)
+ (subsequent-char 18)
+ (subsequent-char 12)
+ (subsequent-char 6)))
+ ((< n #x04000000)
+ (string (initial-char 2 24)
+ (subsequent-char 24)
+ (subsequent-char 18)
+ (subsequent-char 12)
+ (subsequent-char 6)))
+ (else
+ (string (initial-char 1 30)
+ (subsequent-char 30)
+ (subsequent-char 24)
+ (subsequent-char 18)
+ (subsequent-char 12)
+ (subsequent-char 6)))))
\f
;;;; Top level
-(define parse-xml-document ;[1,22]
- (*parser
- (transform
- (lambda (v)
- (make-xml-document (vector-ref v 0)
- (vector-ref v 1)
- (vector-ref v 2)
- (vector-ref v 3)
- (vector-ref v 4)
- (vector-ref v 5)))
- (complete
- (seq (alt (top-level parse-declaration)
- (values #f))
- (encapsulate vector->list
- (* (top-level parse-misc)))
- (alt (seq (top-level parse-dtd)
- (encapsulate vector->list
- (* (top-level parse-misc))))
- (values #f '()))
- (top-level parse-element)
- (encapsulate vector->list
- (* (top-level parse-misc))))))))
+(define (parse-xml-document buffer) ;[1,22]
+ (let* ((declaration (parse-declaration buffer))
+ (misc-1 (parse-misc buffer))
+ (dtd (parse-dtd buffer))
+ (misc-2 (if dtd (parse-misc buffer) '()))
+ (element
+ (fluid-let ((*dtd* dtd))
+ (parse-root-element buffer)))
+ (misc-3 (parse-misc buffer)))
+ (if (peek-parser-buffer-char buffer)
+ (error "Unparsed content in input at"
+ (parser-buffer-position-string buffer)))
+ (make-xml-document declaration
+ misc-1
+ dtd
+ misc-2
+ element
+ misc-3)))
+
+(define *dtd*)
(define parse-misc ;[27]
(*parser
- (alt parse-comment
- parse-processing-instructions
- (element-transform normalize-line-endings
- (match (+ (alphabet char-set:xml-whitespace)))))))
+ (encapsulate vector->list
+ (* (top-level
+ (alt parse-comment
+ parse-processing-instructions
+ (element-transform normalize-line-endings
+ (match (+ (alphabet char-set:xml-whitespace))))))))))
\f
(define parse-declaration ;[23,24,32,80]
(*parser
- (encapsulate (lambda (v) (transform-declaration (vector-ref v 0)))
- (sbracket "XML declaration" "<?xml" "?>"
- parse-attribute-list))))
+ (top-level
+ (transform (lambda (v) (transform-declaration (vector-ref v 0)))
+ (sbracket "XML declaration" "<?xml" "?>"
+ parse-attribute-list)))))
(define (transform-declaration attributes)
(let ((finish
\f
;;;; Elements
+(define parse-root-element
+ (*parser (top-level parse-element)))
+
(define (parse-element buffer) ;[39]
(let ((p (get-parser-buffer-pointer buffer)))
(let ((v (parse-start-tag buffer)))
(define parse-start-tag ;[40,44]
(*parser
- (bracket "start tag"
- (seq (noise (string "<")) maybe-parse-name)
- (match (alt (string ">") (string "/>")))
- parse-attribute-list)))
+ (top-level
+ (bracket "start tag"
+ (seq (noise (string "<")) maybe-parse-name)
+ (match (alt (string ">") (string "/>")))
+ parse-attribute-list))))
(define parse-end-tag ;[42]
(*parser
- (sbracket "end tag" "</" ">"
- parse-name
- S?)))
+ (top-level
+ (sbracket "end tag" "</" ">"
+ parse-name
+ S?))))
(define parse-content ;[43]
(*parser
(*parser (sbracket description start end parser))))
(define (terminated-region-parser description alphabet . ends)
+ description
(let ((matcher
(lambda (buffer)
(let loop ()
(terminated-region-parser "character data" char-set:char-data "]]>"))
(define parse-comment ;[15]
- (let ((parser (bracketed-region-parser "comment" "<!--" "-->")))
- (*parser (element-transform make-xml-comment parser))))
+ (let ((parse-body
+ (terminated-region-parser "comment" char-set:xml-char "--")))
+ (*parser
+ (element-transform make-xml-comment
+ (sbracket "comment" "<!--" "-->"
+ parse-body)))))
(define parse-cdata-section ;[18,19,20,21]
(bracketed-region-parser "CDATA section" "<![CDATA[" "]]>"))
;;;; Names and references
(define parse-name
- (*parser (require-success "malformed XML name" maybe-parse-name)))
+ (*parser (require-success "Malformed XML name" maybe-parse-name)))
(define maybe-parse-name ;[5]
(*parser
(define parse-name-token
(*parser
- (require-success "malformed XML name token"
+ (require-success "Malformed XML name token"
maybe-parse-name-token)))
(define maybe-parse-name-token ;[7]
(define parse-attribute-list
(*parser
- (encapsulate vector->list
- (seq (* parse-attribute)
- S?))))
+ (with-pointer p
+ (encapsulate
+ (lambda (v)
+ (let ((alist (vector->list v)))
+ (do ((alist alist (cdr alist)))
+ ((not (pair? alist)))
+ (let ((entry (assq (caar alist) (cdr alist))))
+ (if entry
+ (error "Duplicate entry in attribute list at"
+ (parser-buffer-position-string p)))))
+ alist))
+ (seq (* parse-attribute)
+ S?)))))
(define parse-attribute ;[41,25]
(*parser
(seq S
maybe-parse-name
S?
- (require-success "missing attribute separator"
+ (require-success "Missing attribute separator"
(noise (string "=")))
S?
parse-attribute-value))))
(define parse-attribute-value ;[10]
(let ((parser (attribute-value-parser char-set:char-data parse-reference)))
- (*parser (require-success "malformed attribute value" parser))))
+ (*parser
+ (element-transform normalize-attribute-value
+ (require-success "Malformed attribute value"
+ parser)))))
+\f
+;;;; Normalization
-(define (coalesce-strings! elements)
- (do ((elements elements (cdr elements)))
- ((not (pair? elements)))
- (if (and (string? (car elements))
- (pair? (cdr elements))
- (string? (cadr elements)))
- (begin
- (set-car! elements
- (string-append (car elements)
- (cadr elements)))
- (set-cdr! elements (cddr elements))))))
+(define (normalize-line-endings string #!optional always-copy?)
+ (if (string-find-next-char string #\return)
+ (let ((end (string-length string)))
+ (let ((step-over-eol
+ (lambda (index)
+ (fix:+ index
+ (if (and (fix:< (fix:+ index 1) end)
+ (char=? (string-ref string (fix:+ index 1))
+ #\linefeed))
+ 2
+ 1)))))
+ (let ((n
+ (let loop ((start 0) (n 0))
+ (let ((index
+ (substring-find-next-char string start end
+ #\return)))
+ (if index
+ (loop (step-over-eol index)
+ (fix:+ n (fix:+ (fix:- index start) 1)))
+ (fix:+ n (fix:- end start)))))))
+ (let ((string* (make-string n)))
+ (let loop ((start 0) (start* 0))
+ (let ((index
+ (substring-find-next-char string start end
+ #\return)))
+ (if index
+ (let ((start*
+ (substring-move! string start index
+ string* start*)))
+ (string-set! string* start* #\newline)
+ (loop (step-over-eol index)
+ (fix:+ start* 1)))
+ (substring-move! string start end string* start*))))
+ string*))))
+ (if (and (not (default-object? always-copy?))
+ always-copy?)
+ (string-copy string)
+ string)))
+
+(define (normalize-attribute-value value)
+ (cond ((pair? value)
+ (map normalize-attribute-value value))
+ ((string? value)
+ (let ((string (normalize-line-endings value #t)))
+ (let ((n (string-length string)))
+ (do ((i 0 (fix:+ i 1)))
+ ((fix:= i n))
+ (if (or (char=? (string-ref string i) #\tab)
+ (char=? (string-ref string i) #\newline))
+ (string-set! string i #\space))))
+ string))
+ (else value)))
\f
;;;; Document-type declarations
(define parse-dtd ;[28]
(*parser
- (encapsulate
- (lambda (v)
- (make-xml-dtd (vector-ref v 0)
- (vector-ref v 1)
- (vector-ref v 2)))
- (sbracket "document-type declaration" "<!DOCTYPE" ">"
- (require-success "Malformed document type"
- (seq S
- parse-name
- (alt (seq S
- parse-external-id)
- (values #f))
- S?
- (alt (seq (encapsulate vector->list
- (sbracket "internal DTD" "[" "]"
- (* (alt parse-markup-decl
- parse-decl-separator))))
- S?)
- (values #f))))))))
-
-(define parse-markup-decl ;[29]
- (*parser
- (alt parse-element-decl
- parse-attlist-decl
- parse-entity-decl
- parse-notation-decl
- parse-processing-instructions
- parse-comment)))
+ (top-level
+ (encapsulate
+ (lambda (v)
+ (make-xml-dtd (vector-ref v 0)
+ (vector-ref v 1)
+ (vector-ref v 2)))
+ (sbracket "document-type declaration" "<!DOCTYPE" ">"
+ (require-success "Malformed document type"
+ (seq S
+ parse-name
+ (alt (seq S
+ parse-external-id)
+ (values #f))
+ S?
+ (alt (seq (encapsulate vector->list
+ (sbracket "internal DTD" "[" "]"
+ (* (alt parse-markup-decl
+ parse-decl-separator))))
+ S?)
+ (values #f)))))))))
(define parse-decl-separator ;[28a]
(*parser
(alt parse-parameter-entity-reference
S)))
+(define parse-markup-decl ;[29]
+ (*parser
+ (alt parse-!element
+ parse-!attlist
+ parse-!entity
+ parse-!notation
+ parse-processing-instructions
+ parse-comment)))
+
(define parse-external-subset ;[30]
(*parser
(seq (? parse-text-decl)
parse-conditional-section
parse-decl-separator))))
\f
-(define parse-element-decl ;[45]
+(define parse-!element ;[45]
(letrec
((parse-children ;[47,49,50]
(*parser
(encapsulate encapsulate-suffix
- (seq (sbracket "element-declaration type" "(" ")"
+ (seq (sbracket "!ELEMENT type" "(" ")"
S?
(alt (encapsulate (lambda (v) (cons 'ALT (vector->list v)))
(seq parse-cp
(*parser
(encapsulate
- (lambda (v)
- (make-xml-element-declaration (vector-ref v 0)
- (vector-ref v 1)))
+ (lambda (v) (make-xml-!element (vector-ref v 0) (vector-ref v 1)))
(sbracket "element declaration" "<!ELEMENT" ">"
S
parse-name
(element-transform xml-intern (match (string "ANY")))
;;[51]
(encapsulate (lambda (v) (cons 'MIX (vector->list v)))
- (alt (sbracket "element-declaration type" "(" ")"
+ (with-pointer p
+ (seq (noise (string "("))
S?
(noise (string "#PCDATA"))
- S?)
- (sbracket "element-declaration type" "(" ")*"
- S?
- (noise (string "#PCDATA"))
- (* (seq S?
- (noise (string "|"))
- S?
- parse-name))
- S?)))
+ (alt (seq S?
+ (noise (string ")")))
+ (seq (* (seq S?
+ (noise (string "|"))
+ S?
+ parse-name))
+ S?
+ (noise (string ")*")))
+
+ (sexp
+ (lambda (buffer)
+ buffer
+ (error "Unterminated !ELEMENT type at"
+ (parser-buffer-position-string p))))))))
parse-children))))))
\f
-(define parse-attlist-decl ;[52,53]
+(define parse-!attlist ;[52,53]
(*parser
(encapsulate
- (lambda (v)
- (make-xml-attribute-declaration (vector-ref v 0)
- (vector-ref v 1)))
+ (lambda (v) (make-xml-!attlist (vector-ref v 0) (vector-ref v 1)))
(sbracket "attribute-list declaration" "<!ATTLIST" ">"
S
parse-name
parse-attribute-value))))))
S?))))
\f
-(define parse-conditional-section ;[61]
- (*parser
- (alt parse-include-section
- parse-ignore-section)))
-
-(define-integrable conditional-start "<![")
-(define-integrable conditional-end "]]>")
-
-(define parse-include-section ;[62]
- (*parser
- (encapsulate
- (lambda (v)
- (make-xml-include-section (vector->list v)))
- (bracket "include section"
- (seq (noise (string conditional-start))
- S?
- (noise (string "INCLUDE"))
- S?
- (noise (string "[")))
- (noise (string conditional-end))
- parse-external-subset-decl))))
-
-(define parse-ignore-section ;[63]
- (*parser
- (encapsulate
- (lambda (v)
- (make-xml-ignore-section (vector->list v)))
- (bracket "ignore section"
- (seq (noise (string conditional-start))
- S?
- (noise (string "IGNORE"))
- S?
- (noise (string "[")))
- (noise (string conditional-end))
- (* parse-ignore-section-contents)))))
-
-(define parse-ignore-section-contents ;[64,65]
- (let ((parser
- (terminated-region-parser "ignore section" char-set:xml-char
- conditional-start conditional-end)))
- (*parser
- (seq parser
- (* (seq (sbracket "ignore section" conditional-start conditional-end
- parse-ignore-section-contents)
- parser))))))
-\f
-(define parse-entity-decl ;[70,71,72,73,74]
+(define parse-!entity ;[70,71,72,73,74,76]
(*parser
(sbracket "entity declaration" "<!ENTITY" ">"
(seq S
(alt (encapsulate
(lambda (v)
- (make-xml-parameter-entity-declaration (vector-ref v 0)
- (vector-ref v 1)))
+ (make-xml-parameter-!entity (vector-ref v 0)
+ (vector-ref v 1)))
(seq (noise (string "%"))
S
parse-name
parse-external-id)))
(encapsulate
(lambda (v)
- (make-xml-entity-declaration (vector-ref v 0)
- (vector-ref v 1)))
+ (if (fix:= (vector-length v) 2)
+ (make-xml-!entity (vector-ref v 0) (vector-ref v 1))
+ (make-xml-unparsed-!entity (vector-ref v 0)
+ (vector-ref v 1)
+ (vector-ref v 2))))
(seq parse-name
S
(alt parse-entity-value
(seq parse-external-id
- (? parse-ndata-decl))))))
+ (? (seq S
+ (noise (string "NDATA"))
+ S
+ parse-name)))))))
S?))))
+(define parse-!notation ;[82,83]
+ (*parser
+ (encapsulate
+ (lambda (v) (make-xml-!notation (vector-ref v 0) (vector-ref v 1)))
+ (sbracket "notation declaration" "<!NOTATION" ">"
+ S
+ parse-name
+ S
+ (alt parse-external-id
+ (encapsulate
+ (lambda (v) (make-xml-external-id (vector-ref v 0) #f))
+ (seq (noise (string "PUBLIC"))
+ S
+ parse-public-id-literal)))
+ S?))))
+
(define parse-external-id ;[75]
(*parser
(alt (encapsulate
(alt (sbracket description "\"" "\"" (match (* (alphabet a1))))
(sbracket description "'" "'" (match (* (alphabet a2))))))))
+(define parse-system-literal ;[11]
+ (string-parser "system literal" char-set:xml-char))
+
(define parse-public-id-literal ;[12,13]
(string-parser
"public-ID literal"
(char-set-union char-set:alphanumeric
(string->char-set " \r\n-'()+,./:=?;!*#@$_%"))))
-
-(define parse-system-literal ;[11]
- (string-parser "system literal" char-set:xml-char))
\f
-(define parse-ndata-decl ;[76]
+(define parse-conditional-section ;[61]
(*parser
- (seq S
- (noise (string "NDATA"))
- S
- parse-name)))
+ (alt parse-!include
+ parse-!ignore)))
+
+(define-integrable conditional-start "<![")
+(define-integrable conditional-end "]]>")
-(define parse-notation-decl ;[82,83]
+(define parse-!include ;[62]
(*parser
- (sbracket "notation declaration" "<!NOTATION" ">"
- S
- parse-name
- S
- (alt parse-external-id
- (encapsulate
- (lambda (v)
- (make-xml-external-id (vector-ref v 0) #f))
- (seq (noise (string "PUBLIC"))
- S
- parse-public-id-literal)))
- S?)))
+ (encapsulate (lambda (v) (make-xml-!include (vector->list v)))
+ (bracket "include section"
+ (seq (noise (string conditional-start))
+ S?
+ (noise (string "INCLUDE"))
+ S?
+ (noise (string "[")))
+ (noise (string conditional-end))
+ parse-external-subset-decl))))
+
+(define parse-!ignore ;[63]
+ (*parser
+ (encapsulate (lambda (v) (make-xml-!ignore (vector->list v)))
+ (bracket "ignore section"
+ (seq (noise (string conditional-start))
+ S?
+ (noise (string "IGNORE"))
+ S?
+ (noise (string "[")))
+ (noise (string conditional-end))
+ (* parse-!ignore-contents)))))
+
+(define parse-!ignore-contents ;[64,65]
+ (let ((parser
+ (terminated-region-parser "ignore section" char-set:xml-char
+ conditional-start conditional-end)))
+ (*parser
+ (seq parser
+ (* (seq (sbracket "ignore section" conditional-start conditional-end
+ parse-!ignore-contents)
+ parser))))))
;;; Edwin Variables:
;;; Eval: (scheme-indent-method 'encapsulate 1)