From 21008d4793a9565e745e86c908fe595fb9c860a5 Mon Sep 17 00:00:00 2001 From: Chris Hanson Date: Thu, 12 Jul 2001 03:24:32 +0000 Subject: [PATCH] Update to use new Unicode support in the parser language. This should improve the conformance. --- v7/src/xml/compile.scm | 3 +- v7/src/xml/ed-ffi.scm | 5 +- v7/src/xml/parser-macro.scm | 6 +- v7/src/xml/xml-chars.scm | 394 ++++++++++++++++++++++++++++++++++++ v7/src/xml/xml-parser.scm | 199 ++++++------------ v7/src/xml/xml.pkg | 4 +- 6 files changed, 469 insertions(+), 142 deletions(-) create mode 100644 v7/src/xml/xml-chars.scm diff --git a/v7/src/xml/compile.scm b/v7/src/xml/compile.scm index 92ede4875..5dec17c01 100644 --- a/v7/src/xml/compile.scm +++ b/v7/src/xml/compile.scm @@ -1,6 +1,6 @@ ;;; -*-Scheme-*- ;;; -;;; $Id: compile.scm,v 1.1 2001/07/06 20:50:37 cph Exp $ +;;; $Id: compile.scm,v 1.2 2001/07/12 03:20:48 cph Exp $ ;;; ;;; Copyright (c) 2001 Massachusetts Institute of Technology ;;; @@ -25,6 +25,7 @@ (load "parser-macro") (for-each compile-file '("xml-struct" + "xml-chars" "xml-parser" ;;"xml-output" )) diff --git a/v7/src/xml/ed-ffi.scm b/v7/src/xml/ed-ffi.scm index 0e0b4b8ec..7f46b8a9c 100644 --- a/v7/src/xml/ed-ffi.scm +++ b/v7/src/xml/ed-ffi.scm @@ -1,6 +1,6 @@ ;;; -*-Scheme-*- ;;; -;;; $Id: ed-ffi.scm,v 1.1 2001/07/06 20:50:39 cph Exp $ +;;; $Id: ed-ffi.scm,v 1.2 2001/07/12 03:24:32 cph Exp $ ;;; ;;; Copyright (c) 2001 Massachusetts Institute of Technology ;;; @@ -22,5 +22,6 @@ ;;;; XML: Edwin buffer packaging info (standard-scheme-find-file-initialization - '#(("xml-struct" (runtime xml structure) system-global-syntax-table) + '#(("xml-chars" (runtime xml parser) system-global-syntax-table) + ("xml-struct" (runtime xml structure) system-global-syntax-table) ("xml-parser" (runtime xml parser) system-global-syntax-table))) \ No newline at end of file diff --git a/v7/src/xml/parser-macro.scm b/v7/src/xml/parser-macro.scm index adef80b3d..b397a7e5d 100644 --- a/v7/src/xml/parser-macro.scm +++ b/v7/src/xml/parser-macro.scm @@ -1,6 +1,6 @@ ;;; -*-Scheme-*- ;;; -;;; $Id: parser-macro.scm,v 1.3 2001/07/10 17:50:11 cph Exp $ +;;; $Id: parser-macro.scm,v 1.4 2001/07/12 03:20:50 cph Exp $ ;;; ;;; Copyright (c) 2001 Massachusetts Institute of Technology ;;; @@ -23,10 +23,10 @@ (declare (usual-integrations)) -(define-*matcher-macro S `(+ (ALPHABET CHAR-SET:XML-WHITESPACE))) +(define-*matcher-macro S `(+ (CHAR-SET CHAR-SET:XML-WHITESPACE))) (define-*parser-macro S `(NOISE S)) -(define-*matcher-macro S? `(* (ALPHABET CHAR-SET:XML-WHITESPACE))) +(define-*matcher-macro S? `(* (CHAR-SET CHAR-SET:XML-WHITESPACE))) (define-*parser-macro S? `(NOISE S?)) (define-*parser-macro (bracket description open close . body) diff --git a/v7/src/xml/xml-chars.scm b/v7/src/xml/xml-chars.scm new file mode 100644 index 000000000..6f012466e --- /dev/null +++ b/v7/src/xml/xml-chars.scm @@ -0,0 +1,394 @@ +;;; -*-Scheme-*- +;;; +;;; $Id: xml-chars.scm,v 1.1 2001/07/12 03:20:56 cph Exp $ +;;; +;;; Copyright (c) 2001 Massachusetts Institute of Technology +;;; +;;; This program is free software; you can redistribute it and/or +;;; modify it under the terms of the GNU General Public License as +;;; published by the Free Software Foundation; either version 2 of the +;;; License, or (at your option) any later version. +;;; +;;; This program is distributed in the hope that it will be useful, +;;; but WITHOUT ANY WARRANTY; without even the implied warranty of +;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;;; General Public License for more details. +;;; +;;; You should have received a copy of the GNU General Public License +;;; along with this program; if not, write to the Free Software +;;; Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA +;;; 02111-1307, USA. + +;;;; XML characters + +(declare (usual-integrations)) + +(define alphabet:xml-base-char + (code-points->alphabet + '((#x0041 . #x005A) + (#x0061 . #x007A) + (#x00C0 . #x00D6) + (#x00D8 . #x00F6) + (#x00F8 . #x00FF) + (#x0100 . #x0131) + (#x0134 . #x013E) + (#x0141 . #x0148) + (#x014A . #x017E) + (#x0180 . #x01C3) + (#x01CD . #x01F0) + (#x01F4 . #x01F5) + (#x01FA . #x0217) + (#x0250 . #x02A8) + (#x02BB . #x02C1) + #x0386 + (#x0388 . #x038A) + #x038C + (#x038E . #x03A1) + (#x03A3 . #x03CE) + (#x03D0 . #x03D6) + #x03DA + #x03DC + #x03DE + #x03E0 + (#x03E2 . #x03F3) + (#x0401 . #x040C) + (#x040E . #x044F) + (#x0451 . #x045C) + (#x045E . #x0481) + (#x0490 . #x04C4) + (#x04C7 . #x04C8) + (#x04CB . #x04CC) + (#x04D0 . #x04EB) + (#x04EE . #x04F5) + (#x04F8 . #x04F9) + (#x0531 . #x0556) + #x0559 + (#x0561 . #x0586) + (#x05D0 . #x05EA) + (#x05F0 . #x05F2) + (#x0621 . #x063A) + (#x0641 . #x064A) + (#x0671 . #x06B7) + (#x06BA . #x06BE) + (#x06C0 . #x06CE) + (#x06D0 . #x06D3) + #x06D5 + (#x06E5 . #x06E6) + (#x0905 . #x0939) + #x093D + (#x0958 . #x0961) + (#x0985 . #x098C) + (#x098F . #x0990) + (#x0993 . #x09A8) + (#x09AA . #x09B0) + #x09B2 + (#x09B6 . #x09B9) + (#x09DC . #x09DD) + (#x09DF . #x09E1) + (#x09F0 . #x09F1) + (#x0A05 . #x0A0A) + (#x0A0F . #x0A10) + (#x0A13 . #x0A28) + (#x0A2A . #x0A30) + (#x0A32 . #x0A33) + (#x0A35 . #x0A36) + (#x0A38 . #x0A39) + (#x0A59 . #x0A5C) + #x0A5E + (#x0A72 . #x0A74) + (#x0A85 . #x0A8B) + #x0A8D + (#x0A8F . #x0A91) + (#x0A93 . #x0AA8) + (#x0AAA . #x0AB0) + (#x0AB2 . #x0AB3) + (#x0AB5 . #x0AB9) + #x0ABD + #x0AE0 + (#x0B05 . #x0B0C) + (#x0B0F . #x0B10) + (#x0B13 . #x0B28) + (#x0B2A . #x0B30) + (#x0B32 . #x0B33) + (#x0B36 . #x0B39) + #x0B3D + (#x0B5C . #x0B5D) + (#x0B5F . #x0B61) + (#x0B85 . #x0B8A) + (#x0B8E . #x0B90) + (#x0B92 . #x0B95) + (#x0B99 . #x0B9A) + #x0B9C + (#x0B9E . #x0B9F) + (#x0BA3 . #x0BA4) + (#x0BA8 . #x0BAA) + (#x0BAE . #x0BB5) + (#x0BB7 . #x0BB9) + (#x0C05 . #x0C0C) + (#x0C0E . #x0C10) + (#x0C12 . #x0C28) + (#x0C2A . #x0C33) + (#x0C35 . #x0C39) + (#x0C60 . #x0C61) + (#x0C85 . #x0C8C) + (#x0C8E . #x0C90) + (#x0C92 . #x0CA8) + (#x0CAA . #x0CB3) + (#x0CB5 . #x0CB9) + #x0CDE + (#x0CE0 . #x0CE1) + (#x0D05 . #x0D0C) + (#x0D0E . #x0D10) + (#x0D12 . #x0D28) + (#x0D2A . #x0D39) + (#x0D60 . #x0D61) + (#x0E01 . #x0E2E) + #x0E30 + (#x0E32 . #x0E33) + (#x0E40 . #x0E45) + (#x0E81 . #x0E82) + #x0E84 + (#x0E87 . #x0E88) + #x0E8A + #x0E8D + (#x0E94 . #x0E97) + (#x0E99 . #x0E9F) + (#x0EA1 . #x0EA3) + #x0EA5 + #x0EA7 + (#x0EAA . #x0EAB) + (#x0EAD . #x0EAE) + #x0EB0 + (#x0EB2 . #x0EB3) + #x0EBD + (#x0EC0 . #x0EC4) + (#x0F40 . #x0F47) + (#x0F49 . #x0F69) + (#x10A0 . #x10C5) + (#x10D0 . #x10F6) + #x1100 + (#x1102 . #x1103) + (#x1105 . #x1107) + #x1109 + (#x110B . #x110C) + (#x110E . #x1112) + #x113C + #x113E + #x1140 + #x114C + #x114E + #x1150 + (#x1154 . #x1155) + #x1159 + (#x115F . #x1161) + #x1163 + #x1165 + #x1167 + #x1169 + (#x116D . #x116E) + (#x1172 . #x1173) + #x1175 + #x119E + #x11A8 + #x11AB + (#x11AE . #x11AF) + (#x11B7 . #x11B8) + #x11BA + (#x11BC . #x11C2) + #x11EB + #x11F0 + #x11F9 + (#x1E00 . #x1E9B) + (#x1EA0 . #x1EF9) + (#x1F00 . #x1F15) + (#x1F18 . #x1F1D) + (#x1F20 . #x1F45) + (#x1F48 . #x1F4D) + (#x1F50 . #x1F57) + #x1F59 + #x1F5B + #x1F5D + (#x1F5F . #x1F7D) + (#x1F80 . #x1FB4) + (#x1FB6 . #x1FBC) + #x1FBE + (#x1FC2 . #x1FC4) + (#x1FC6 . #x1FCC) + (#x1FD0 . #x1FD3) + (#x1FD6 . #x1FDB) + (#x1FE0 . #x1FEC) + (#x1FF2 . #x1FF4) + (#x1FF6 . #x1FFC) + #x2126 + (#x212A . #x212B) + #x212E + (#x2180 . #x2182) + (#x3041 . #x3094) + (#x30A1 . #x30FA) + (#x3105 . #x312C) + (#xAC00 . #xD7A3)))) + +(define alphabet:xml-ideographic + (code-points->alphabet + '(#x3007 + (#x3021 . #x3029) + (#x4E00 . #x9FA5)))) + +(define alphabet:xml-combining-char + (code-points->alphabet + '((#x0300 . #x0345) + (#x0360 . #x0361) + (#x0483 . #x0486) + (#x0591 . #x05A1) + (#x05A3 . #x05B9) + (#x05BB . #x05BD) + #x05BF + (#x05C1 . #x05C2) + #x05C4 + (#x064B . #x0652) + #x0670 + (#x06D6 . #x06DC) + (#x06DD . #x06DF) + (#x06E0 . #x06E4) + (#x06E7 . #x06E8) + (#x06EA . #x06ED) + (#x0901 . #x0903) + #x093C + (#x093E . #x094C) + #x094D + (#x0951 . #x0954) + (#x0962 . #x0963) + (#x0981 . #x0983) + #x09BC + #x09BE + #x09BF + (#x09C0 . #x09C4) + (#x09C7 . #x09C8) + (#x09CB . #x09CD) + #x09D7 + (#x09E2 . #x09E3) + #x0A02 + #x0A3C + #x0A3E + #x0A3F + (#x0A40 . #x0A42) + (#x0A47 . #x0A48) + (#x0A4B . #x0A4D) + (#x0A70 . #x0A71) + (#x0A81 . #x0A83) + #x0ABC + (#x0ABE . #x0AC5) + (#x0AC7 . #x0AC9) + (#x0ACB . #x0ACD) + (#x0B01 . #x0B03) + #x0B3C + (#x0B3E . #x0B43) + (#x0B47 . #x0B48) + (#x0B4B . #x0B4D) + (#x0B56 . #x0B57) + (#x0B82 . #x0B83) + (#x0BBE . #x0BC2) + (#x0BC6 . #x0BC8) + (#x0BCA . #x0BCD) + #x0BD7 + (#x0C01 . #x0C03) + (#x0C3E . #x0C44) + (#x0C46 . #x0C48) + (#x0C4A . #x0C4D) + (#x0C55 . #x0C56) + (#x0C82 . #x0C83) + (#x0CBE . #x0CC4) + (#x0CC6 . #x0CC8) + (#x0CCA . #x0CCD) + (#x0CD5 . #x0CD6) + (#x0D02 . #x0D03) + (#x0D3E . #x0D43) + (#x0D46 . #x0D48) + (#x0D4A . #x0D4D) + #x0D57 + #x0E31 + (#x0E34 . #x0E3A) + (#x0E47 . #x0E4E) + #x0EB1 + (#x0EB4 . #x0EB9) + (#x0EBB . #x0EBC) + (#x0EC8 . #x0ECD) + (#x0F18 . #x0F19) + #x0F35 + #x0F37 + #x0F39 + #x0F3E + #x0F3F + (#x0F71 . #x0F84) + (#x0F86 . #x0F8B) + (#x0F90 . #x0F95) + #x0F97 + (#x0F99 . #x0FAD) + (#x0FB1 . #x0FB7) + #x0FB9 + (#x20D0 . #x20DC) + #x20E1 + (#x302A . #x302F) + #x3099 + #x309A))) + +(define alphabet:xml-digit + (code-points->alphabet + '((#x0030 . #x0039) + (#x0660 . #x0669) + (#x06F0 . #x06F9) + (#x0966 . #x096F) + (#x09E6 . #x09EF) + (#x0A66 . #x0A6F) + (#x0AE6 . #x0AEF) + (#x0B66 . #x0B6F) + (#x0BE7 . #x0BEF) + (#x0C66 . #x0C6F) + (#x0CE6 . #x0CEF) + (#x0D66 . #x0D6F) + (#x0E50 . #x0E59) + (#x0ED0 . #x0ED9) + (#x0F20 . #x0F29)))) + +(define alphabet:xml-extender + (code-points->alphabet + '(#x00B7 + #x02D0 + #x02D1 + #x0387 + #x0640 + #x0E46 + #x0EC6 + #x3005 + (#x3031 . #x3035) + (#x309D . #x309E) + (#x30FC . #x30FE)))) + +(define alphabet:xml-char + (code-points->alphabet + '(#x0009 + #x000A + #x000D + (#x0020 . #xD7FF) + (#xE000 . #xFFFD) + (#x10000 . #x10FFFF)))) + +(define alphabet:char-data + (alphabet- alphabet:xml-char + (string->alphabet "<&"))) + +(define alphabet:name-initial + (alphabet+ alphabet:xml-base-char + alphabet:xml-ideographic + (string->alphabet "_:"))) + +(define alphabet:name-subsequent ;[4] + (alphabet+ alphabet:xml-base-char + alphabet:xml-ideographic + alphabet:xml-digit + alphabet:xml-combining-char + alphabet:xml-extender + (string->alphabet ".-_:"))) + +(define char-set:xml-whitespace + (char-set #\space #\tab #\return #\linefeed)) \ No newline at end of file diff --git a/v7/src/xml/xml-parser.scm b/v7/src/xml/xml-parser.scm index dc65014da..809aa870b 100644 --- a/v7/src/xml/xml-parser.scm +++ b/v7/src/xml/xml-parser.scm @@ -1,6 +1,6 @@ ;;; -*-Scheme-*- ;;; -;;; $Id: xml-parser.scm,v 1.6 2001/07/10 19:34:32 cph Exp $ +;;; $Id: xml-parser.scm,v 1.7 2001/07/12 03:21:00 cph Exp $ ;;; ;;; Copyright (c) 2001 Massachusetts Institute of Technology ;;; @@ -29,26 +29,6 @@ ;;;; Utilities -(define char-set:xml-char ;[2], loose UTF-8 - ;; The upper range of this alphabet would normally be #xFE, but XML - ;; doesn't use any characters larger than #x10FFFF, so the largest - ;; byte that can be seen is #xF4. - (char-set-union (char-set #\tab #\linefeed #\return) - (ascii-range->char-set #x20 #xF5))) - -(define char-set:char-data - (char-set-difference char-set:xml-char (char-set #\< #\&))) - -(define char-set:xml-whitespace - (char-set #\space #\tab #\return #\linefeed)) - -(define (string-parser description alphabet) - (let ((a1 (char-set-difference alphabet (char-set #\"))) - (a2 (char-set-difference alphabet (char-set #\')))) - (*parser - (alt (sbracket description "\"" "\"" (match (* (alphabet a1)))) - (sbracket description "'" "'" (match (* (alphabet a2)))))))) - (define (perror ptr msg . irritants) (apply error (string-append msg @@ -97,64 +77,17 @@ (perror ptr (string-append "Malformed " description) value)) v) (vector value)))) - -(define (make-xml-char-reference n p) - (if (not (valid-xml-code-point? n)) - (perror p "Disallowed Unicode code point" n)) - (integer->unicode-string n)) - -(define (valid-xml-code-point? n) - (and (< n #x110000) - (if (< n #xD800) - (or (>= n #x20) - (= n #x9) - (= n #xA) - (= n #xD)) - (and (>= n #xE000) - (not (or (= n #xFFFE) - (= n #xFFFF))))))) - -(define (integer->unicode-string n) - - (define-integrable (initial-char n offset) - (integer->char - (fix:or (fix:and (fix:lsh #xFF (fix:+ n 1)) #xFF) - (fix:lsh n (fix:- 0 offset))))) - - (define-integrable (subsequent-char offset) - (integer->char - (fix:or #x80 - (fix:and (fix:lsh n (fix:- 0 offset)) #x3F)))) - - (if (not (and (<= 0 n) (< n #x80000000))) - (error:bad-range-argument n 'INTEGER->UNICODE-STRING)) - (cond ((< n #x00000080) - (string (integer->char n))) - ((< n #x00000800) - (string (initial-char 5 6) - (subsequent-char 6))) - ((< n #x00010000) - (string (initial-char 4 12) - (subsequent-char 12) - (subsequent-char 6))) - ((< n #x00200000) - (string (initial-char 3 18) - (subsequent-char 18) - (subsequent-char 12) - (subsequent-char 6))) - ((< n #x04000000) - (string (initial-char 2 24) - (subsequent-char 24) - (subsequent-char 18) - (subsequent-char 12) - (subsequent-char 6))) - (else - (string (initial-char 1 30) - (subsequent-char 30) - (subsequent-char 24) - (subsequent-char 18) - (subsequent-char 12) - (subsequent-char 6))))) + +(define (string-parser description alphabet) + (let ((a1 (alphabet- alphabet (string->alphabet "\""))) + (a2 (alphabet- alphabet (string->alphabet "'")))) + (*parser + (alt (sbracket description "\"" "\"" (match (* (alphabet a1)))) + (sbracket description "'" "'" (match (* (alphabet a2)))))))) + +(define alphabet:alphabetic (char-set->alphabet char-set:alphabetic)) +(define alphabet:numeric (char-set->alphabet char-set:numeric)) +(define alphabet:alphanumeric (char-set->alphabet char-set:alphanumeric)) ;;;; Top level @@ -197,8 +130,7 @@ (* (top-level (alt parse-comment parse-processing-instructions - (map normalize-line-endings - (match (+ (alphabet char-set:xml-whitespace)))))))))) + (map normalize-line-endings (match S)))))))) (define parse-declaration ;[23,24,32,80] (*parser @@ -259,14 +191,14 @@ (finish (caddr results) (cadr results) (car results))))))) (define match-xml-version ;[26] - (let ((a (char-set-union char-set:alphanumeric (string->char-set "_.:-")))) + (let ((a (alphabet+ alphabet:alphanumeric (string->alphabet "_.:-")))) (*matcher (complete (+ (alphabet a)))))) (define match-encoding ;[81] - (let ((a (char-set-union char-set:alphanumeric (string->char-set "_.-")))) + (let ((a (alphabet+ alphabet:alphanumeric (string->alphabet "_.-")))) (*matcher (complete - (seq (alphabet char-set:alphabetic) + (seq (alphabet alphabet:alphabetic) (* (alphabet a))))))) ;;;; Elements @@ -327,8 +259,7 @@ ;;;; Other markup (define (bracketed-region-parser description start end) - (let ((parser - (terminated-region-parser description char-set:xml-char end))) + (let ((parser (terminated-region-parser description alphabet:xml-char end))) (*parser (sbracket description start end parser)))) (define (terminated-region-parser description alphabet . ends) @@ -343,16 +274,16 @@ (lambda (end) (match-parser-buffer-string-no-advance buffer end)))) - (match-parser-buffer-char-in-set buffer alphabet)) + (match-utf8-char-in-alphabet buffer alphabet)) (loop) #t)))) (define parse-char-data ;[14] - (terminated-region-parser "character data" char-set:char-data "]]>")) + (terminated-region-parser "character data" alphabet:char-data "]]>")) (define parse-comment ;[15] (let ((match-body - (terminated-region-matcher "comment" char-set:xml-char "--"))) + (terminated-region-matcher "comment" alphabet:xml-char "--"))) (*parser (sbracket "comment" "" (noise match-body))))) @@ -365,7 +296,7 @@ (start "")) (let ((parse-body - (terminated-region-parser description char-set:xml-char end))) + (terminated-region-parser description alphabet:xml-char end))) (*parser (encapsulate (lambda (v) @@ -389,10 +320,12 @@ (define maybe-parse-name ;[5] (*parser (map xml-intern (match match-name)))) -(define match-name - (*matcher - (seq (alphabet char-set:name-initial) - (* (alphabet char-set:name-subsequent))))) +(define (match-name buffer) ;[5] + (and (match-utf8-char-in-alphabet buffer alphabet:name-initial) + (let loop () + (if (match-utf8-char-in-alphabet buffer alphabet:name-subsequent) + (loop) + #t)))) (define parse-name-token (*parser @@ -400,31 +333,30 @@ maybe-parse-name-token))) (define maybe-parse-name-token ;[7] - (*parser - (map xml-intern - (match (+ (alphabet char-set:name-subsequent)))))) + (*parser (map xml-intern (match match-name-token)))) -(define char-set:name-initial - (char-set-union char-set:alphabetic - (string->char-set "_:") - (ascii-range->char-set #x80 #xF5))) - -(define char-set:name-subsequent ;[4], loose UTF-8 - (char-set-union char-set:alphanumeric - (string->char-set ".-_:") - (ascii-range->char-set #x80 #xF5))) +(define (match-name-token buffer) + (and (match-utf8-char-in-alphabet buffer alphabet:name-subsequent) + (let loop () + (if (match-utf8-char-in-alphabet buffer alphabet:name-subsequent) + (loop) + #t)))) (define parse-char-reference ;[66] - (*parser - (with-pointer p - (sbracket "character reference" "&#" ";" - (alt (map (lambda (s) - (make-xml-char-reference (string->number s 10) p)) - (match (+ (alphabet char-set:numeric)))) - (seq (noise (string "x")) - (map (lambda (s) - (make-xml-char-reference (string->number s 16) p)) - (match (+ (alphabet "0-9a-fA-f")))))))))) + (let ((make-ref + (lambda (s r p) + (let ((n (string->number s r))) + (if (not (code-point-in-alphabet? n alphabet:xml-char)) + (perror p "Disallowed Unicode code point" n)) + (code-point->utf8-string n))))) + (*parser + (with-pointer p + (sbracket "character reference" "&#" ";" + (alt (map (lambda (s) (make-ref s 10 p)) + (match (+ (alphabet alphabet:numeric)))) + (seq (noise (string "x")) + (map (lambda (s) (make-ref s 16 p)) + (match (+ (char-set "0-9a-fA-f"))))))))))) (define parse-reference ;[67] (*parser @@ -443,8 +375,8 @@ (match (seq (string "&") (alt (seq (string "#") - (alt (+ (alphabet char-set:numeric)) - (seq (string "x") (+ (alphabet "0-9a-fA-f"))))) + (alt (+ (alphabet alphabet:numeric)) + (seq (string "x") (+ (char-set "0-9a-fA-f"))))) match-name) (string ";"))))) @@ -486,8 +418,8 @@ parse-attribute-value)))) (define (attribute-value-parser alphabet parse-reference) - (let ((a1 (char-set-difference alphabet (char-set #\"))) - (a2 (char-set-difference alphabet (char-set #\')))) + (let ((a1 (alphabet- alphabet (string->alphabet "\""))) + (a2 (alphabet- alphabet (string->alphabet "'")))) (*parser (encapsulate (lambda (v) (coalesce-elements (vector->list v))) (alt (sbracket "attribute value" "\"" "\"" @@ -499,7 +431,7 @@ (define parse-entity-value ;[9] (attribute-value-parser - (char-set-difference char-set:xml-char (char-set #\% #\&)) + (alphabet- alphabet:xml-char (string->alphabet "%&")) (*parser (alt parse-char-reference parse-entity-reference-deferred @@ -507,7 +439,7 @@ (define parse-attribute-value ;[10] (let ((parser - (attribute-value-parser char-set:char-data + (attribute-value-parser alphabet:char-data parse-reference-deferred))) (*parser (with-pointer p @@ -764,13 +696,13 @@ S? parse-cp))))) S?) - (? (match (alphabet "?*+"))))))) + (? (match (char-set "?*+"))))))) (parse-cp ;[48] (*parser (alt (encapsulate encapsulate-suffix (seq maybe-parse-name - (? (match (alphabet "?*+"))))) + (? (match (char-set "?*+"))))) parse-children))) (encapsulate-suffix @@ -941,13 +873,12 @@ parse-system-literal)))))) (define parse-system-literal ;[11] - (string-parser "system literal" char-set:xml-char)) + (string-parser "system literal" alphabet:xml-char)) (define parse-public-id-literal ;[12,13] - (string-parser - "public-ID literal" - (char-set-union char-set:alphanumeric - (string->char-set " \r\n-'()+,./:=?;!*#@$_%")))) + (string-parser "public-ID literal" + (alphabet+ alphabet:alphanumeric + (string->alphabet " \r\n-'()+,./:=?;!*#@$_%")))) ;;;; External subset @@ -964,9 +895,9 @@ parse-decl-separator)))) (define external-decl-parser - (let ((a1 (char-set-difference char-set:xml-char (char-set #\% #\" #\' #\>))) - (a2 (char-set-difference char-set:xml-char (char-set #\"))) - (a3 (char-set-difference char-set:xml-char (char-set #\')))) + (let ((a1 (alphabet- alphabet:xml-char (string->alphabet "%\"'>"))) + (a2 (alphabet- alphabet:xml-char (string->alphabet "\""))) + (a3 (alphabet- alphabet:xml-char (string->alphabet "'")))) (lambda (prefix parse-decl) (*parser (with-pointer p @@ -979,7 +910,7 @@ (seq (match prefix) (require-success "Malformed markup declaration" - (seq + (seq (* (alt (match (alt (* (alphabet a1)) (seq (char #\") (* (alphabet a2)) (char #\")) @@ -1051,7 +982,7 @@ match-!ignore))))) (define match-!ignore ;[65] - (terminated-region-matcher "ignore section" char-set:xml-char + (terminated-region-matcher "ignore section" alphabet:xml-char conditional-start conditional-end)) (define parse-parameterized-conditional diff --git a/v7/src/xml/xml.pkg b/v7/src/xml/xml.pkg index 03e287530..495a768b6 100644 --- a/v7/src/xml/xml.pkg +++ b/v7/src/xml/xml.pkg @@ -1,6 +1,6 @@ ;;; -*-Scheme-*- ;;; -;;; $Id: xml.pkg,v 1.2 2001/07/10 05:30:24 cph Exp $ +;;; $Id: xml.pkg,v 1.3 2001/07/12 03:20:53 cph Exp $ ;;; ;;; Copyright (c) 2001 Massachusetts Institute of Technology ;;; @@ -133,7 +133,7 @@ xml-unparsed-!entity?)) (define-package (runtime xml parser) - (files "xml-parser") + (files "xml-chars" "xml-parser") (parent ()) (export () parse-xml-document)) \ No newline at end of file -- 2.25.1