From 43541203d722fb5e017e731fea67d709cca92a6b Mon Sep 17 00:00:00 2001 From: Chris Hanson Date: Mon, 31 May 2010 02:41:21 -0700 Subject: [PATCH] Eliminate all trace of the alphabet abstraction. --- doc/ref-manual/characters.texi | 194 ++++-------- doc/ref-manual/io.texi | 23 +- src/runtime/chrset.scm | 24 +- src/runtime/parser-buffer.scm | 12 - src/runtime/regsexp.scm | 14 +- src/runtime/runtime.pkg | 22 +- src/star-parser/matcher.scm | 6 +- src/xml/turtle.scm | 100 +++--- src/xml/xml-chars.scm | 542 ++++++++++++++++----------------- src/xml/xml-names.scm | 16 +- src/xml/xml-parser.scm | 65 ++-- src/xml/xml-struct.scm | 2 +- src/xml/xml.pkg | 12 +- 13 files changed, 443 insertions(+), 589 deletions(-) diff --git a/doc/ref-manual/characters.texi b/doc/ref-manual/characters.texi index 1dcd983ca..72342bca8 100644 --- a/doc/ref-manual/characters.texi +++ b/doc/ref-manual/characters.texi @@ -520,10 +520,8 @@ corresponding to @var{code}. @cindex set, of characters MIT/GNU Scheme's character-set abstraction is used to represent groups -of characters, such as the letters or digits. Character sets may -contain only @acronym{ISO-8859-1} characters; use the @dfn{alphabet} -abstraction (@pxref{Unicode} if you need to cover the entire Unicode -range. +of characters, such as the letters or digits. A character set may +contain any Unicode character. @deffn procedure char-set? object @cindex type predicate, for character set @@ -541,8 +539,10 @@ Returns @code{#t} if @var{object} is a character set; otherwise returns @defvrx variable char-set:graphic @defvrx variable char-set:not-graphic @defvrx variable char-set:standard -These variables contain predefined character sets. -To see the contents of one of these sets, use @code{char-set-members}. +These variables contain predefined character sets. At present, these +character sets contain only @acronym{ISO-8859-1} characters; in the +future they will contain all the relevant Unicode characters. To see +the contents of one of these sets, use @code{char-set->scalar-values}. @cindex alphabetic character (defn) @cindex character, alphabetic (defn) @@ -596,10 +596,6 @@ These predicates are defined in terms of the respective character sets defined above. @end deffn -@deffn procedure char-set-members char-set -Returns a newly allocated list of the characters in @var{char-set}. -@end deffn - @deffn procedure char-set-member? char-set char Returns @code{#t} if @var{char} is in @var{char-set}; otherwise returns @code{#f}. @@ -612,15 +608,14 @@ exactly the same characters; otherwise returns @code{#f}. @deffn procedure char-set char @dots{} @cindex construction, of character set -Returns a character set consisting of the specified @acronym{ISO-8859-1} -characters. With no arguments, @code{char-set} returns an empty -character set. +Returns a character set consisting of the specified characters. With no +arguments, @code{char-set} returns an empty character set. @end deffn @deffn procedure chars->char-set chars Returns a character set consisting of @var{chars}, which must be a list -of @acronym{ISO-8859-1} characters. This is equivalent to @code{(apply -char-set @var{chars})}. +of characters. This is equivalent to @code{(apply char-set +@var{chars})}. @end deffn @deffn procedure string->char-set string @@ -628,28 +623,35 @@ Returns a character set consisting of all the characters that occur in @var{string}. @end deffn -@deffn procedure ascii-range->char-set lower upper -@var{Lower} and @var{upper} must be exact non-negative integers -representing @acronym{ISO-8859-1} character codes, and @var{lower} must -be less than or equal to @var{upper}. This procedure creates and -returns a new character set consisting of the characters whose -@acronym{ISO-8859-1} codes are between @var{lower} (inclusive) and -@var{upper} (exclusive). +@deffn procedure scalar-values->char-set items +Returns a character set containing the scalar values described by +@var{items}. @var{Items} must satisfy +@code{well-formed-scalar-values-list?}. +@end deffn -For historical reasons, the name of this procedure refers to -``@acronym{ASCII}'' rather than ``@acronym{ISO-8859-1}''. +@deffn procedure char-set->scalar-values char-set +Returns a well-formed scalar-values list that describes the scalar values +represented by @var{char-set}. @end deffn -@deffn procedure predicate->char-set predicate -@var{Predicate} must be a procedure of one argument. -@code{predicate->char-set} creates and returns a character set -consisting of the @acronym{ISO-8859-1} characters for which -@var{predicate} is true. +@deffn procedure well-formed-scalar-values-list? object +Returns @code{#t} if @var{object} is a well-formed scalar-values list, +otherwise returns @code{#f}. A well-formed scalar-values list is a +proper list, each element of which is either a unicode scalar value or a +pair of unicode scalar values. A pair of scalar values represents a +contiguous range of scalar values. The @sc{car} of the pair is the +inclusive lower limit, and the @sc{cdr} is the exclusive upper limit. +The lower limit must be strictly less than to the upper limit. +@end deffn + +@deffn procedure char-set-invert char-set +Returns a character set consisting of the characters that are not in +@var{char-set}. @end deffn -@deffn procedure char-set-difference char-set1 char-set2 +@deffn procedure char-set-difference char-set1 char-set @dots{} Returns a character set consisting of the characters that are in -@var{char-set1} but aren't in @var{char-set2}. +@var{char-set1} but aren't in any of the @var{char-set}s. @end deffn @deffn procedure char-set-intersection char-set @dots{} @@ -662,9 +664,31 @@ Returns a character set consisting of the characters that are in at least one o the @var{char-set}s. @end deffn -@deffn procedure char-set-invert char-set -Returns a character set consisting of the @acronym{ISO-8859-1} -characters that are not in @var{char-set}. +@deffn procedure 8-bit-char-set? char-set +Returns @code{#t} if @var{char-set} contains only 8-bit scalar values, +otherwise returns @code{#f}. +@end deffn + +@deffn procedure ascii-range->char-set lower upper +This procedure is obsolete. Instead use + +@example +(scalar-values->char-set (list (cons @var{lower} @var{upper}))) +@end example +@end deffn + +@deffn procedure char-set-members char-set +This procedure is obsolete; instead use @code{char-set->scalar-values}. + +Returns a newly allocated list of the @acronym{ISO-8859-1} characters in +@var{char-set}, ignoring any characters outside of that set. +@end deffn + +@deffn procedure predicate->char-set predicate +@var{Predicate} must be a procedure of one argument. +@code{predicate->char-set} creates and returns a character set +consisting of the @acronym{ISO-8859-1} characters for which +@var{predicate} is true. @end deffn @node Unicode, , Character Sets, Characters @@ -703,7 +727,7 @@ if @var{object} is a character with no bucky bits and whose code satisfies @code{unicode-scalar-value?}. @end deffn -The Unicode implementation consists of three parts: +The Unicode implementation consists of these parts: @itemize @bullet @item @@ -714,17 +738,11 @@ that support the full Unicode character set with constant-time access. @acronym{I/O} procedures that read and write Unicode characters in several external representations, specifically @acronym{UTF-8}, @acronym{UTF-16}, and @acronym{UTF-32}. - -@item -An @dfn{alphabet} abstraction, which is an efficient implementation of -sets of Unicode scalar values (similar to the @code{char-set} -abstraction). @end itemize @menu * Wide Strings:: * Unicode Representations:: -* Alphabets:: @end menu @node Wide Strings, Unicode Representations, Unicode, Unicode @@ -817,7 +835,7 @@ accumulated output is returned as a wide string. This is equivalent to: @end example @end deffn -@node Unicode Representations, Alphabets, Wide Strings, Unicode +@node Unicode Representations, , Wide Strings, Unicode @subsection Unicode Representations @cindex Unicode external representations @@ -899,95 +917,3 @@ that stream as a byte vector. The arguments @var{start} and @var{end} allow specification of a substring; they default to zero and @var{string}'s length, respectively. @end deffn - -@node Alphabets, , Unicode Representations, Unicode -@subsection Alphabets - -@cindex Alphabet, Unicode -Applications often need to manipulate sets of characters, such as the -set of alphabetic characters or the set of whitespace characters. The -@dfn{alphabet} abstraction provides an efficient implementation of -sets of Unicode scalar values. - -@deffn procedure alphabet? object -Returns @code{#t} if @var{object} is a Unicode alphabet, otherwise -returns @code{#f}. -@end deffn - -@deffn procedure alphabet unicode-char @dots{} -Returns a Unicode alphabet containing the Unicode characters passed as -arguments. -@end deffn - -@deffn procedure scalar-values->alphabet items -Returns a Unicode alphabet containing the scalar values described by -@var{items}. @var{Items} must satisfy -@code{well-formed-scalar-values-list?}. -@end deffn - -@deffn procedure alphabet->scalar-values alphabet -Returns a well-formed scalar-values list that describes the scalar values -represented by @var{alphabet}. -@end deffn - -@deffn procedure well-formed-scalar-values-list? object -Returns @code{#t} if @var{object} is a well-formed scalar-values list, -otherwise returns @code{#f}. A well-formed scalar-values list is a -proper list, each element of which is either a unicode scalar value or a -pair of unicode scalar values. A pair of scalar values represents a -contiguous range of scalar values. The @sc{car} of the pair is the -lower limit, and the @sc{cdr} is the upper limit. Both limits are -inclusive, and the lower limit must be less than or equal to the upper -limit. -@end deffn - -@deffn procedure char-in-alphabet? char alphabet -Returns @code{#t} if @var{char} is a member of @var{alphabet}, -otherwise returns @code{#f}. -@end deffn - -Character sets and alphabets can be converted to one another, provided -that the alphabet contains only 8-bit scalar values. This is true -because 8-bit scalar values in Unicode map directly to -@acronym{ISO-8859-1} characters, which is what character sets contain. - -@deffn procedure char-set->alphabet char-set -Returns a Unicode alphabet containing the scalar values that correspond -to characters that are members of @var{char-set}. -@end deffn - -@deffn procedure alphabet->char-set alphabet -Returns a character set containing the characters that correspond to -8-bit scalar values that are members of @var{alphabet}. (Scalar values -outside the 8-bit range are ignored.) -@end deffn - -@deffn procedure string->alphabet string -Returns a Unicode alphabet containing the scalar values corresponding to -the characters in @var{string}. Equivalent to - -@example -(char-set->alphabet (string->char-set @var{string})) -@end example -@end deffn - -@deffn procedure alphabet->string alphabet -Returns a newly-allocated string containing the characters corresponding -to the 8-bit scalar values in @var{alphabet}. (Scalar values outside -the 8-bit range are ignored.) -@end deffn - -@deffn procedure 8-bit-alphabet? alphabet -Returns @code{#t} if @var{alphabet} contains only 8-bit scalar values, -otherwise returns @code{#f}. -@end deffn - -@deffn procedure alphabet+ alphabet @dots{} -Returns a Unicode alphabet that contains each scalar value that is a -member of any of the @var{alphabet} arguments. -@end deffn - -@deffn procedure alphabet- alphabet1 alphabet2 -Returns a Unicode alphabet that contains each scalar value that is a -member of @var{alphabet1} and is not a member of @var{alphabet2}. -@end deffn diff --git a/doc/ref-manual/io.texi b/doc/ref-manual/io.texi index c1d21b0e7..0f69a24ff 100644 --- a/doc/ref-manual/io.texi +++ b/doc/ref-manual/io.texi @@ -60,7 +60,8 @@ a port that you specify. The current output port is initially @code{console-i/o-port}, but Scheme provides procedures that let you change the current output port to be a file or string. -All ports read or write only @acronym{ISO-8859-1} characters. +Nearly all ports read or write Unicode characters; the exceptions are +those for which non-Unicode character coding has been specified. Every port is either an input port, an output port, or both. The following predicates distinguish all of the possible cases. @@ -1920,10 +1921,7 @@ with buffering and backtracking. A parser-buffer pointer is a pointer into the stream of characters provided by a parser buffer. Note that all of the procedures defined here consider a parser buffer -to contain a stream of 8-bit characters in the @acronym{ISO-8859-1} -character set, except for @code{match-utf8-char-in-alphabet} which -treats it as a stream of Unicode characters encoded as 8-bit bytes in -the @acronym{UTF-8} encoding. +to contain a stream of Unicode characters. There are several constructors for parser buffers: @@ -2077,15 +2075,6 @@ These procedures match the specified substring against @var{buffer}'s contents. The @samp{-ci} procedures do case-insensitive matching. @end deffn -@deffn procedure match-utf8-char-in-alphabet buffer alphabet -This procedure treats @var{buffer}'s contents as @acronym{UTF-8} -encoded Unicode characters and matches the next such character against -@var{alphabet}, which must be a Unicode alphabet (@pxref{Unicode}). -@acronym{UTF-8} represents characters with 1 to 6 bytes, so a -successful match can move the internal pointer forward by as many as 6 -bytes. -@end deffn - The remaining procedures provide information that can be used to identify locations in a parser buffer's stream. @@ -2355,12 +2344,6 @@ character set. The @var{expression} operand is a Scheme expression that must evaluate to a character set at run time. @end deffn -@deffn {matcher expression} alphabet expression -These expressions match a single character that is a member of a given -Unicode alphabet (@pxref{Unicode}). The @var{expression} operand is a -Scheme expression that must evaluate to an alphabet at run time. -@end deffn - @deffn {matcher expression} end-of-input The @code{end-of-input} expression is successful only when there are no more characters available to be matched. diff --git a/src/runtime/chrset.scm b/src/runtime/chrset.scm index 8ad7ca20b..fd44250be 100644 --- a/src/runtime/chrset.scm +++ b/src/runtime/chrset.scm @@ -579,26 +579,4 @@ USA. (error:bad-range-argument start 'ASCII-RANGE->CHAR-SET)) (if (not (fix:<= end #x100)) (error:bad-range-argument end 'ASCII-RANGE->CHAR-SET)) - (%scalar-values->char-set (list (cons start (fix:- end 1))))) - -(define (char-in-alphabet? char alphabet) - (char-set-member? alphabet char)) - -(define (scalar-values->alphabet items) - (scalar-values->char-set - (map (lambda (range) - (if (and (pair? range) - (unicode-scalar-value? (car range)) - (unicode-scalar-value? (cdr range))) - (cons (car range) (fix:+ (cdr range) 1)) - range)) - items))) - -(define (alphabet->scalar-values alphabet) - (map (lambda (range) - (if (pair? range) - (set-cdr! range (fix:- (cdr range) 1)))) - (char-set->scalar-values alphabet))) - -(define (char-set->alphabet char-set) - char-set) \ No newline at end of file + (%scalar-values->char-set (list (cons start (fix:- end 1))))) \ No newline at end of file diff --git a/src/runtime/parser-buffer.scm b/src/runtime/parser-buffer.scm index 88032afe2..19cf1bc68 100644 --- a/src/runtime/parser-buffer.scm +++ b/src/runtime/parser-buffer.scm @@ -239,18 +239,6 @@ USA. (define-integrable (char-in-set? char set) (char-set-member? set char)) -(define (match-parser-buffer-char-in-alphabet buffer alphabet) - (match-char buffer alphabet char-in-alphabet?)) - -(define (match-parser-buffer-char-not-in-alphabet buffer alphabet) - (match-char-not buffer alphabet char-in-alphabet?)) - -(define (match-parser-buffer-char-in-alphabet-no-advance buffer alphabet) - (match-char-no-advance buffer alphabet char-in-alphabet?)) - -(define (match-parser-buffer-char-not-in-alphabet-no-advance buffer alphabet) - (match-char-not-no-advance buffer alphabet char-in-alphabet?)) - (define-integrable (match-char buffer reference compare) (and (guarantee-buffer-chars buffer 1) (let ((char diff --git a/src/runtime/regsexp.scm b/src/runtime/regsexp.scm index 0ff55a736..4f40645b0 100644 --- a/src/runtime/regsexp.scm +++ b/src/runtime/regsexp.scm @@ -77,14 +77,14 @@ USA. (error "Ill-formed regular s-expression:" regsexp)))) (define (%compile-char-set items) - (scalar-values->alphabet + (scalar-values->char-set (append-map (lambda (item) (cond ((well-formed-scalar-value-range? item) (list item)) ((unicode-char? item) (list (char->integer item))) - ((alphabet? item) - (alphabet->scalar-values item)) + ((char-set? item) + (char-set->scalar-values item)) ((string? item) (map char->integer (string->list item))) (else @@ -307,21 +307,21 @@ USA. (fail))) (succeed position groups fail))))))))) -(define (insn:char-set alphabet) +(define (insn:char-set char-set) (lambda (succeed) (lambda (position groups fail) (if (let ((char (next-char position))) (and char - (char-in-alphabet? char alphabet))) + (char-set-member? char-set char))) (succeed (next-position position) groups fail) (fail))))) -(define (insn:inverse-char-set alphabet) +(define (insn:inverse-char-set char-set) (lambda (succeed) (lambda (position groups fail) (if (let ((char (next-char position))) (and char - (not (char-in-alphabet? char alphabet)))) + (not (char-set-member? char-set char)))) (succeed (next-position position) groups fail) (fail))))) diff --git a/src/runtime/runtime.pkg b/src/runtime/runtime.pkg index cea35327b..9bfb10ce3 100644 --- a/src/runtime/runtime.pkg +++ b/src/runtime/runtime.pkg @@ -1084,29 +1084,16 @@ USA. (files "chrset") (parent (runtime)) (export () - (8-bit-alphabet? 8-bit-char-set?) - ( ) - (alphabet char-set) - (alphabet+ char-set-union) - (alphabet- char-set-difference) - (alphabet->char-set char-set->alphabet) - (alphabet->string char-set->string) - (alphabet-predicate char-set-predicate) - (alphabet? char-set?) - (string->alphabet string->char-set) 8-bit-char-set? - alphabet->scalar-values ascii-range->char-set char-alphabetic? char-alphanumeric? char-ctl? char-graphic? - char-in-alphabet? char-lower-case? char-numeric? char-set - char-set->alphabet char-set->scalar-values char-set-difference char-set-intersection @@ -1151,7 +1138,6 @@ USA. guarantee-well-formed-scalar-value-list guarantee-well-formed-scalar-value-range predicate->char-set - scalar-values->alphabet scalar-values->char-set string->char-set well-formed-scalar-value-list? @@ -5184,7 +5170,6 @@ USA. (files "parser-buffer") (parent (runtime)) (export () - (match-utf8-char-in-alphabet match-parser-buffer-char-in-alphabet) *match-string *match-symbol *match-utf8-string @@ -5201,13 +5186,12 @@ USA. match-parser-buffer-char match-parser-buffer-char-ci match-parser-buffer-char-ci-no-advance - match-parser-buffer-char-in-alphabet - match-parser-buffer-char-in-alphabet-no-advance + + match-parser-buffer-char-in-set match-parser-buffer-char-in-set-no-advance match-parser-buffer-char-no-advance - match-parser-buffer-char-not-in-alphabet - match-parser-buffer-char-not-in-alphabet-no-advance + match-parser-buffer-char-not-in-set match-parser-buffer-char-not-in-set-no-advance match-parser-buffer-not-char diff --git a/src/star-parser/matcher.scm b/src/star-parser/matcher.scm index 323be2a58..be4f965a6 100644 --- a/src/star-parser/matcher.scm +++ b/src/star-parser/matcher.scm @@ -170,7 +170,7 @@ USA. external-bindings internal-bindings)))) -(define-matcher-preprocessor '(CHAR CHAR-CI NOT-CHAR NOT-CHAR-CI ALPHABET) +(define-matcher-preprocessor '(CHAR CHAR-CI NOT-CHAR NOT-CHAR-CI) (lambda (expression external-bindings internal-bindings) external-bindings internal-bindings (check-1-arg expression) @@ -317,10 +317,6 @@ USA. `(MATCH-PARSER-BUFFER-CHAR-IN-SET ,*buffer-name* ,(protect char-set free-names))) -(define-atomic-matcher (alphabet alphabet) - `(MATCH-PARSER-BUFFER-CHAR-IN-ALPHABET ,*buffer-name* - ,(protect alphabet free-names))) - (define-atomic-matcher (string string) `(MATCH-PARSER-BUFFER-STRING ,*buffer-name* ,(protect string free-names))) diff --git a/src/xml/turtle.scm b/src/xml/turtle.scm index 605e144d7..b9eeb1cd3 100644 --- a/src/xml/turtle.scm +++ b/src/xml/turtle.scm @@ -161,8 +161,8 @@ USA. (define match:name (*matcher - (seq (alphabet alphabet:name-start-char) - (* (alphabet alphabet:name-char))))) + (seq (char-set char-set:name-start-char) + (* (char-set char-set:name-char))))) (define parse:prefix (*parser @@ -171,8 +171,8 @@ USA. (define match:prefix-name (*matcher - (? (seq (alphabet alphabet:prefix-name-start-char) - (* (alphabet alphabet:name-char)))))) + (? (seq (char-set char-set:prefix-name-start-char) + (* (char-set char-set:name-char)))))) ;;;; Literals @@ -259,7 +259,7 @@ USA. (map (lambda (s) (make-rdf-literal s xsd:boolean)) (match (alt "true" "false"))))) -;;;; Alphabets +;;;; Character sets (define char-set:turtle-hex (string->char-set "0123456789ABCDEF")) @@ -277,52 +277,52 @@ USA. (char-set-union char-set:turtle-lower char-set:turtle-digit)) -(define alphabet:name-start-char - (scalar-values->alphabet - '((#x0041 . #x005A) +(define char-set:name-start-char + (scalar-values->char-set + '((#x0041 . #x005B) #x005F - (#x0061 . #x007A) - (#x00C0 . #x00D6) - (#x00D8 . #x00F6) - (#x00F8 . #x02FF) - (#x0370 . #x037D) - (#x037F . #x1FFF) - (#x200C . #x200D) - (#x2070 . #x218F) - (#x2C00 . #x2FEF) - (#x3001 . #xD7FF) - (#xF900 . #xFDCF) - (#xFDF0 . #xFFFD) - (#x10000 . #xEFFFF)))) - -(define alphabet:name-char - (alphabet+ alphabet:name-start-char - (scalar-values->alphabet - '(#x002D - (#x0030 . #x0039) - #x00B7 - (#x0300 . #x036F) - (#x203F . #x2040))))) - -(define alphabet:prefix-name-start-char - (alphabet- alphabet:name-start-char (alphabet #\_))) - -(define alphabet:character - (scalar-values->alphabet '((#x20 . #x5B) (#x5D . #x10FFFF)))) - -(define alphabet:ucharacter - (alphabet- alphabet:character (alphabet #\>))) - -(define alphabet:scharacter - (alphabet- alphabet:character (alphabet #\"))) - -(define alphabet:lcharacter - (alphabet+ alphabet:character (alphabet #\tab #\newline #\return))) + (#x0061 . #x007B) + (#x00C0 . #x00D7) + (#x00D8 . #x00F7) + (#x00F8 . #x0300) + (#x0370 . #x037E) + (#x037F . #x2000) + (#x200C . #x200E) + (#x2070 . #x2190) + (#x2C00 . #x2FF0) + (#x3001 . #xD800) + (#xF900 . #xFDD0) + (#xFDF0 . #xFFFE) + (#x10000 . #xF0000)))) + +(define char-set:name-char + (char-set-union char-set:name-start-char + (scalar-values->char-set + '(#x002D + (#x0030 . #x003A) + #x00B7 + (#x0300 . #x0370) + (#x203F . #x2041))))) + +(define char-set:prefix-name-start-char + (char-set-difference char-set:name-start-char (char-set #\_))) + +(define char-set:character + (scalar-values->char-set '((#x20 . #x5C) (#x5D . #x110000)))) + +(define char-set:ucharacter + (char-set-difference char-set:character (char-set #\>))) + +(define char-set:scharacter + (char-set-difference char-set:character (char-set #\"))) + +(define char-set:lcharacter + (char-set-union char-set:character (char-set #\tab #\newline #\return))) ;;;; Escaped strings (define (delimited-region-parser name start-delim end-delim - alphabet parse-escapes) + char-set parse-escapes) (lambda (buffer) (let ((output (open-utf8-output-string)) (start (get-parser-buffer-pointer buffer))) @@ -334,7 +334,7 @@ USA. (copy p) (match-parser-buffer-string buffer end-delim) (finish)) - ((match-parser-buffer-char-in-alphabet buffer alphabet) + ((match-parser-buffer-char-in-set buffer char-set) (loop)) ((match-parser-buffer-char-no-advance buffer #\\) (copy p) @@ -409,21 +409,21 @@ USA. (delimited-region-parser "string" "\"" "\"" - alphabet:scharacter + char-set:scharacter parse:scharacter-escape)) (define parse:long-string (delimited-region-parser "long string" "\"\"\"" "\"\"\"" - alphabet:lcharacter + char-set:lcharacter parse:scharacter-escape)) (define parse:uriref (delimited-region-parser "URI reference" "<" ">" - alphabet:ucharacter + char-set:ucharacter parse:ucharacter-escape)) ;;;; Whitespace diff --git a/src/xml/xml-chars.scm b/src/xml/xml-chars.scm index e7952a3e4..ca4649cd2 100644 --- a/src/xml/xml-chars.scm +++ b/src/xml/xml-chars.scm @@ -27,335 +27,335 @@ USA. (declare (usual-integrations)) -(define alphabet:xml-base-char - (scalar-values->alphabet - '((#x0041 . #x005A) - (#x0061 . #x007A) - (#x00C0 . #x00D6) - (#x00D8 . #x00F6) - (#x00F8 . #x00FF) - (#x0100 . #x0131) - (#x0134 . #x013E) - (#x0141 . #x0148) - (#x014A . #x017E) - (#x0180 . #x01C3) - (#x01CD . #x01F0) - (#x01F4 . #x01F5) - (#x01FA . #x0217) - (#x0250 . #x02A8) - (#x02BB . #x02C1) +(define char-set:xml-base-char + (scalar-values->char-set + '((#x0041 . #x005B) + (#x0061 . #x007B) + (#x00C0 . #x00D7) + (#x00D8 . #x00F7) + (#x00F8 . #x0100) + (#x0100 . #x0132) + (#x0134 . #x013F) + (#x0141 . #x0149) + (#x014A . #x017F) + (#x0180 . #x01C4) + (#x01CD . #x01F1) + (#x01F4 . #x01F6) + (#x01FA . #x0218) + (#x0250 . #x02A9) + (#x02BB . #x02C2) #x0386 - (#x0388 . #x038A) + (#x0388 . #x038B) #x038C - (#x038E . #x03A1) - (#x03A3 . #x03CE) - (#x03D0 . #x03D6) + (#x038E . #x03A2) + (#x03A3 . #x03CF) + (#x03D0 . #x03D7) #x03DA #x03DC #x03DE #x03E0 - (#x03E2 . #x03F3) - (#x0401 . #x040C) - (#x040E . #x044F) - (#x0451 . #x045C) - (#x045E . #x0481) - (#x0490 . #x04C4) - (#x04C7 . #x04C8) - (#x04CB . #x04CC) - (#x04D0 . #x04EB) - (#x04EE . #x04F5) - (#x04F8 . #x04F9) - (#x0531 . #x0556) + (#x03E2 . #x03F4) + (#x0401 . #x040D) + (#x040E . #x0450) + (#x0451 . #x045D) + (#x045E . #x0482) + (#x0490 . #x04C5) + (#x04C7 . #x04C9) + (#x04CB . #x04CD) + (#x04D0 . #x04EC) + (#x04EE . #x04F6) + (#x04F8 . #x04FA) + (#x0531 . #x0557) #x0559 - (#x0561 . #x0586) - (#x05D0 . #x05EA) - (#x05F0 . #x05F2) - (#x0621 . #x063A) - (#x0641 . #x064A) - (#x0671 . #x06B7) - (#x06BA . #x06BE) - (#x06C0 . #x06CE) - (#x06D0 . #x06D3) + (#x0561 . #x0587) + (#x05D0 . #x05EB) + (#x05F0 . #x05F3) + (#x0621 . #x063B) + (#x0641 . #x064B) + (#x0671 . #x06B8) + (#x06BA . #x06BF) + (#x06C0 . #x06CF) + (#x06D0 . #x06D4) #x06D5 - (#x06E5 . #x06E6) - (#x0905 . #x0939) + (#x06E5 . #x06E7) + (#x0905 . #x093A) #x093D - (#x0958 . #x0961) - (#x0985 . #x098C) - (#x098F . #x0990) - (#x0993 . #x09A8) - (#x09AA . #x09B0) + (#x0958 . #x0962) + (#x0985 . #x098D) + (#x098F . #x0991) + (#x0993 . #x09A9) + (#x09AA . #x09B1) #x09B2 - (#x09B6 . #x09B9) - (#x09DC . #x09DD) - (#x09DF . #x09E1) - (#x09F0 . #x09F1) - (#x0A05 . #x0A0A) - (#x0A0F . #x0A10) - (#x0A13 . #x0A28) - (#x0A2A . #x0A30) - (#x0A32 . #x0A33) - (#x0A35 . #x0A36) - (#x0A38 . #x0A39) - (#x0A59 . #x0A5C) + (#x09B6 . #x09BA) + (#x09DC . #x09DE) + (#x09DF . #x09E2) + (#x09F0 . #x09F2) + (#x0A05 . #x0A0B) + (#x0A0F . #x0A11) + (#x0A13 . #x0A29) + (#x0A2A . #x0A31) + (#x0A32 . #x0A34) + (#x0A35 . #x0A37) + (#x0A38 . #x0A3A) + (#x0A59 . #x0A5D) #x0A5E - (#x0A72 . #x0A74) - (#x0A85 . #x0A8B) + (#x0A72 . #x0A75) + (#x0A85 . #x0A8C) #x0A8D - (#x0A8F . #x0A91) - (#x0A93 . #x0AA8) - (#x0AAA . #x0AB0) - (#x0AB2 . #x0AB3) - (#x0AB5 . #x0AB9) + (#x0A8F . #x0A92) + (#x0A93 . #x0AA9) + (#x0AAA . #x0AB1) + (#x0AB2 . #x0AB4) + (#x0AB5 . #x0ABA) #x0ABD #x0AE0 - (#x0B05 . #x0B0C) - (#x0B0F . #x0B10) - (#x0B13 . #x0B28) - (#x0B2A . #x0B30) - (#x0B32 . #x0B33) - (#x0B36 . #x0B39) + (#x0B05 . #x0B0D) + (#x0B0F . #x0B11) + (#x0B13 . #x0B29) + (#x0B2A . #x0B31) + (#x0B32 . #x0B34) + (#x0B36 . #x0B3A) #x0B3D - (#x0B5C . #x0B5D) - (#x0B5F . #x0B61) - (#x0B85 . #x0B8A) - (#x0B8E . #x0B90) - (#x0B92 . #x0B95) - (#x0B99 . #x0B9A) + (#x0B5C . #x0B5E) + (#x0B5F . #x0B62) + (#x0B85 . #x0B8B) + (#x0B8E . #x0B91) + (#x0B92 . #x0B96) + (#x0B99 . #x0B9B) #x0B9C - (#x0B9E . #x0B9F) - (#x0BA3 . #x0BA4) - (#x0BA8 . #x0BAA) - (#x0BAE . #x0BB5) - (#x0BB7 . #x0BB9) - (#x0C05 . #x0C0C) - (#x0C0E . #x0C10) - (#x0C12 . #x0C28) - (#x0C2A . #x0C33) - (#x0C35 . #x0C39) - (#x0C60 . #x0C61) - (#x0C85 . #x0C8C) - (#x0C8E . #x0C90) - (#x0C92 . #x0CA8) - (#x0CAA . #x0CB3) - (#x0CB5 . #x0CB9) + (#x0B9E . #x0BA0) + (#x0BA3 . #x0BA5) + (#x0BA8 . #x0BAB) + (#x0BAE . #x0BB6) + (#x0BB7 . #x0BBA) + (#x0C05 . #x0C0D) + (#x0C0E . #x0C11) + (#x0C12 . #x0C29) + (#x0C2A . #x0C34) + (#x0C35 . #x0C3A) + (#x0C60 . #x0C62) + (#x0C85 . #x0C8D) + (#x0C8E . #x0C91) + (#x0C92 . #x0CA9) + (#x0CAA . #x0CB4) + (#x0CB5 . #x0CBA) #x0CDE - (#x0CE0 . #x0CE1) - (#x0D05 . #x0D0C) - (#x0D0E . #x0D10) - (#x0D12 . #x0D28) - (#x0D2A . #x0D39) - (#x0D60 . #x0D61) - (#x0E01 . #x0E2E) + (#x0CE0 . #x0CE2) + (#x0D05 . #x0D0D) + (#x0D0E . #x0D11) + (#x0D12 . #x0D29) + (#x0D2A . #x0D3A) + (#x0D60 . #x0D62) + (#x0E01 . #x0E2F) #x0E30 - (#x0E32 . #x0E33) - (#x0E40 . #x0E45) - (#x0E81 . #x0E82) + (#x0E32 . #x0E34) + (#x0E40 . #x0E46) + (#x0E81 . #x0E83) #x0E84 - (#x0E87 . #x0E88) + (#x0E87 . #x0E89) #x0E8A #x0E8D - (#x0E94 . #x0E97) - (#x0E99 . #x0E9F) - (#x0EA1 . #x0EA3) + (#x0E94 . #x0E98) + (#x0E99 . #x0EA0) + (#x0EA1 . #x0EA4) #x0EA5 #x0EA7 - (#x0EAA . #x0EAB) - (#x0EAD . #x0EAE) + (#x0EAA . #x0EAC) + (#x0EAD . #x0EAF) #x0EB0 - (#x0EB2 . #x0EB3) + (#x0EB2 . #x0EB4) #x0EBD - (#x0EC0 . #x0EC4) - (#x0F40 . #x0F47) - (#x0F49 . #x0F69) - (#x10A0 . #x10C5) - (#x10D0 . #x10F6) + (#x0EC0 . #x0EC5) + (#x0F40 . #x0F48) + (#x0F49 . #x0F6A) + (#x10A0 . #x10C6) + (#x10D0 . #x10F7) #x1100 - (#x1102 . #x1103) - (#x1105 . #x1107) + (#x1102 . #x1104) + (#x1105 . #x1108) #x1109 - (#x110B . #x110C) - (#x110E . #x1112) + (#x110B . #x110D) + (#x110E . #x1113) #x113C #x113E #x1140 #x114C #x114E #x1150 - (#x1154 . #x1155) + (#x1154 . #x1156) #x1159 - (#x115F . #x1161) + (#x115F . #x1162) #x1163 #x1165 #x1167 #x1169 - (#x116D . #x116E) - (#x1172 . #x1173) + (#x116D . #x116F) + (#x1172 . #x1174) #x1175 #x119E #x11A8 #x11AB - (#x11AE . #x11AF) - (#x11B7 . #x11B8) + (#x11AE . #x11B0) + (#x11B7 . #x11B9) #x11BA - (#x11BC . #x11C2) + (#x11BC . #x11C3) #x11EB #x11F0 #x11F9 - (#x1E00 . #x1E9B) - (#x1EA0 . #x1EF9) - (#x1F00 . #x1F15) - (#x1F18 . #x1F1D) - (#x1F20 . #x1F45) - (#x1F48 . #x1F4D) - (#x1F50 . #x1F57) + (#x1E00 . #x1E9C) + (#x1EA0 . #x1EFA) + (#x1F00 . #x1F16) + (#x1F18 . #x1F1E) + (#x1F20 . #x1F46) + (#x1F48 . #x1F4E) + (#x1F50 . #x1F58) #x1F59 #x1F5B #x1F5D - (#x1F5F . #x1F7D) - (#x1F80 . #x1FB4) - (#x1FB6 . #x1FBC) + (#x1F5F . #x1F7E) + (#x1F80 . #x1FB5) + (#x1FB6 . #x1FBD) #x1FBE - (#x1FC2 . #x1FC4) - (#x1FC6 . #x1FCC) - (#x1FD0 . #x1FD3) - (#x1FD6 . #x1FDB) - (#x1FE0 . #x1FEC) - (#x1FF2 . #x1FF4) - (#x1FF6 . #x1FFC) + (#x1FC2 . #x1FC5) + (#x1FC6 . #x1FCD) + (#x1FD0 . #x1FD4) + (#x1FD6 . #x1FDC) + (#x1FE0 . #x1FED) + (#x1FF2 . #x1FF5) + (#x1FF6 . #x1FFD) #x2126 - (#x212A . #x212B) + (#x212A . #x212C) #x212E - (#x2180 . #x2182) - (#x3041 . #x3094) - (#x30A1 . #x30FA) - (#x3105 . #x312C) - (#xAC00 . #xD7A3)))) + (#x2180 . #x2183) + (#x3041 . #x3095) + (#x30A1 . #x30FB) + (#x3105 . #x312D) + (#xAC00 . #xD7A4)))) -(define alphabet:xml-ideographic - (scalar-values->alphabet +(define char-set:xml-ideographic + (scalar-values->char-set '(#x3007 - (#x3021 . #x3029) - (#x4E00 . #x9FA5)))) + (#x3021 . #x302A) + (#x4E00 . #x9FA6)))) -(define alphabet:xml-combining-char - (scalar-values->alphabet - '((#x0300 . #x0345) - (#x0360 . #x0361) - (#x0483 . #x0486) - (#x0591 . #x05A1) - (#x05A3 . #x05B9) - (#x05BB . #x05BD) +(define char-set:xml-combining-char + (scalar-values->char-set + '((#x0300 . #x0346) + (#x0360 . #x0362) + (#x0483 . #x0487) + (#x0591 . #x05A2) + (#x05A3 . #x05BA) + (#x05BB . #x05BE) #x05BF - (#x05C1 . #x05C2) + (#x05C1 . #x05C3) #x05C4 - (#x064B . #x0652) + (#x064B . #x0653) #x0670 - (#x06D6 . #x06DC) - (#x06DD . #x06DF) - (#x06E0 . #x06E4) - (#x06E7 . #x06E8) - (#x06EA . #x06ED) - (#x0901 . #x0903) + (#x06D6 . #x06DD) + (#x06DD . #x06E0) + (#x06E0 . #x06E5) + (#x06E7 . #x06E9) + (#x06EA . #x06EE) + (#x0901 . #x0904) #x093C - (#x093E . #x094C) + (#x093E . #x094D) #x094D - (#x0951 . #x0954) - (#x0962 . #x0963) - (#x0981 . #x0983) + (#x0951 . #x0955) + (#x0962 . #x0964) + (#x0981 . #x0984) #x09BC #x09BE #x09BF - (#x09C0 . #x09C4) - (#x09C7 . #x09C8) - (#x09CB . #x09CD) + (#x09C0 . #x09C5) + (#x09C7 . #x09C9) + (#x09CB . #x09CE) #x09D7 - (#x09E2 . #x09E3) + (#x09E2 . #x09E4) #x0A02 #x0A3C #x0A3E #x0A3F - (#x0A40 . #x0A42) - (#x0A47 . #x0A48) - (#x0A4B . #x0A4D) - (#x0A70 . #x0A71) - (#x0A81 . #x0A83) + (#x0A40 . #x0A43) + (#x0A47 . #x0A49) + (#x0A4B . #x0A4E) + (#x0A70 . #x0A72) + (#x0A81 . #x0A84) #x0ABC - (#x0ABE . #x0AC5) - (#x0AC7 . #x0AC9) - (#x0ACB . #x0ACD) - (#x0B01 . #x0B03) + (#x0ABE . #x0AC6) + (#x0AC7 . #x0ACA) + (#x0ACB . #x0ACE) + (#x0B01 . #x0B04) #x0B3C - (#x0B3E . #x0B43) - (#x0B47 . #x0B48) - (#x0B4B . #x0B4D) - (#x0B56 . #x0B57) - (#x0B82 . #x0B83) - (#x0BBE . #x0BC2) - (#x0BC6 . #x0BC8) - (#x0BCA . #x0BCD) + (#x0B3E . #x0B44) + (#x0B47 . #x0B49) + (#x0B4B . #x0B4E) + (#x0B56 . #x0B58) + (#x0B82 . #x0B84) + (#x0BBE . #x0BC3) + (#x0BC6 . #x0BC9) + (#x0BCA . #x0BCE) #x0BD7 - (#x0C01 . #x0C03) - (#x0C3E . #x0C44) - (#x0C46 . #x0C48) - (#x0C4A . #x0C4D) - (#x0C55 . #x0C56) - (#x0C82 . #x0C83) - (#x0CBE . #x0CC4) - (#x0CC6 . #x0CC8) - (#x0CCA . #x0CCD) - (#x0CD5 . #x0CD6) - (#x0D02 . #x0D03) - (#x0D3E . #x0D43) - (#x0D46 . #x0D48) - (#x0D4A . #x0D4D) + (#x0C01 . #x0C04) + (#x0C3E . #x0C45) + (#x0C46 . #x0C49) + (#x0C4A . #x0C4E) + (#x0C55 . #x0C57) + (#x0C82 . #x0C84) + (#x0CBE . #x0CC5) + (#x0CC6 . #x0CC9) + (#x0CCA . #x0CCE) + (#x0CD5 . #x0CD7) + (#x0D02 . #x0D04) + (#x0D3E . #x0D44) + (#x0D46 . #x0D49) + (#x0D4A . #x0D4E) #x0D57 #x0E31 - (#x0E34 . #x0E3A) - (#x0E47 . #x0E4E) + (#x0E34 . #x0E3B) + (#x0E47 . #x0E4F) #x0EB1 - (#x0EB4 . #x0EB9) - (#x0EBB . #x0EBC) - (#x0EC8 . #x0ECD) - (#x0F18 . #x0F19) + (#x0EB4 . #x0EBA) + (#x0EBB . #x0EBD) + (#x0EC8 . #x0ECE) + (#x0F18 . #x0F1A) #x0F35 #x0F37 #x0F39 #x0F3E #x0F3F - (#x0F71 . #x0F84) - (#x0F86 . #x0F8B) - (#x0F90 . #x0F95) + (#x0F71 . #x0F85) + (#x0F86 . #x0F8C) + (#x0F90 . #x0F96) #x0F97 - (#x0F99 . #x0FAD) - (#x0FB1 . #x0FB7) + (#x0F99 . #x0FAE) + (#x0FB1 . #x0FB8) #x0FB9 - (#x20D0 . #x20DC) + (#x20D0 . #x20DD) #x20E1 - (#x302A . #x302F) + (#x302A . #x3030) #x3099 #x309A))) -(define alphabet:xml-digit - (scalar-values->alphabet - '((#x0030 . #x0039) - (#x0660 . #x0669) - (#x06F0 . #x06F9) - (#x0966 . #x096F) - (#x09E6 . #x09EF) - (#x0A66 . #x0A6F) - (#x0AE6 . #x0AEF) - (#x0B66 . #x0B6F) - (#x0BE7 . #x0BEF) - (#x0C66 . #x0C6F) - (#x0CE6 . #x0CEF) - (#x0D66 . #x0D6F) - (#x0E50 . #x0E59) - (#x0ED0 . #x0ED9) - (#x0F20 . #x0F29)))) +(define char-set:xml-digit + (scalar-values->char-set + '((#x0030 . #x003A) + (#x0660 . #x066A) + (#x06F0 . #x06FA) + (#x0966 . #x0970) + (#x09E6 . #x09F0) + (#x0A66 . #x0A70) + (#x0AE6 . #x0AF0) + (#x0B66 . #x0B70) + (#x0BE7 . #x0BF0) + (#x0C66 . #x0C70) + (#x0CE6 . #x0CF0) + (#x0D66 . #x0D70) + (#x0E50 . #x0E5A) + (#x0ED0 . #x0EDA) + (#x0F20 . #x0F2A)))) -(define alphabet:xml-extender - (scalar-values->alphabet +(define char-set:xml-extender + (scalar-values->char-set '(#x00B7 #x02D0 #x02D1 @@ -364,43 +364,43 @@ USA. #x0E46 #x0EC6 #x3005 - (#x3031 . #x3035) - (#x309D . #x309E) - (#x30FC . #x30FE)))) + (#x3031 . #x3036) + (#x309D . #x309F) + (#x30FC . #x30FF)))) -(define alphabet:xml-char - (scalar-values->alphabet +(define char-set:xml-char + (scalar-values->char-set '(#x0009 #x000A #x000D - (#x0020 . #xD7FF) - (#xE000 . #xFFFD) - (#x10000 . #x10FFFF)))) + (#x0020 . #xD800) + (#xE000 . #xFFFE) + (#x10000 . #x110000)))) -(define alphabet:char-data - (alphabet- alphabet:xml-char - (string->alphabet "<&"))) +(define char-set:char-data + (char-set-difference char-set:xml-char + (string->char-set "<&"))) -(define alphabet:name-initial - (alphabet+ alphabet:xml-base-char - alphabet:xml-ideographic - (string->alphabet "_:"))) +(define char-set:name-initial + (char-set-union char-set:xml-base-char + char-set:xml-ideographic + (string->char-set "_:"))) -(define alphabet:name-subsequent ;[4] - (alphabet+ alphabet:xml-base-char - alphabet:xml-ideographic - alphabet:xml-digit - alphabet:xml-combining-char - alphabet:xml-extender - (string->alphabet ".-_:"))) +(define char-set:name-subsequent ;[4] + (char-set-union char-set:xml-base-char + char-set:xml-ideographic + char-set:xml-digit + char-set:xml-combining-char + char-set:xml-extender + (string->char-set ".-_:"))) -(define alphabet:ncname-initial - (alphabet- alphabet:name-initial - (string->alphabet ":"))) +(define char-set:ncname-initial + (char-set-difference char-set:name-initial + (string->char-set ":"))) -(define alphabet:ncname-subsequent - (alphabet- alphabet:name-subsequent - (string->alphabet ":"))) +(define char-set:ncname-subsequent + (char-set-difference char-set:name-subsequent + (string->char-set ":"))) (define char-set:xml-whitespace (char-set #\space #\tab #\return #\linefeed)) \ No newline at end of file diff --git a/src/xml/xml-names.scm b/src/xml/xml-names.scm index de51ec25b..9f8fab1b0 100644 --- a/src/xml/xml-names.scm +++ b/src/xml/xml-names.scm @@ -108,23 +108,23 @@ USA. (define (name-matcher initial subsequent) (lambda (buffer) - (and (match-parser-buffer-char-in-alphabet buffer initial) + (and (match-parser-buffer-char-in-set buffer initial) (let loop () - (if (match-parser-buffer-char-in-alphabet buffer subsequent) + (if (match-parser-buffer-char-in-set buffer subsequent) (loop) #t))))) (define match-ncname - (name-matcher alphabet:ncname-initial - alphabet:ncname-subsequent)) + (name-matcher char-set:ncname-initial + char-set:ncname-subsequent)) (define match:xml-name - (name-matcher alphabet:name-initial - alphabet:name-subsequent)) + (name-matcher char-set:name-initial + char-set:name-subsequent)) (define match:xml-nmtoken - (name-matcher alphabet:name-subsequent - alphabet:name-subsequent)) + (name-matcher char-set:name-subsequent + char-set:name-subsequent)) (define match:xml-qname (*matcher (seq match-ncname (? (seq ":" match-ncname))))) diff --git a/src/xml/xml-parser.scm b/src/xml/xml-parser.scm index e562bbcb5..8942b3e84 100644 --- a/src/xml/xml-parser.scm +++ b/src/xml/xml-parser.scm @@ -66,12 +66,12 @@ USA. (set-cdr! elements (cddr elements))))) elements) -(define (string-parser description alphabet) - (let ((a1 (alphabet- alphabet (string->alphabet "\""))) - (a2 (alphabet- alphabet (string->alphabet "'")))) +(define (string-parser description char-set) + (let ((a1 (char-set-difference char-set (string->char-set "\""))) + (a2 (char-set-difference char-set (string->char-set "'")))) (*parser - (alt (sbracket description "\"" "\"" (match (* (alphabet a1)))) - (sbracket description "'" "'" (match (* (alphabet a2)))))))) + (alt (sbracket description "\"" "\"" (match (* (char-set a1)))) + (sbracket description "'" "'" (match (* (char-set a2)))))))) ;;;; Entry points @@ -469,15 +469,15 @@ USA. (define (bracketed-region-parser description start end) (let ((parser - (terminated-region-parser description alphabet:xml-char #t end))) + (terminated-region-parser description char-set:xml-char #t end))) (*parser (sbracket description start end parser)))) -(define (terminated-region-parser description alphabet must-match? end) +(define (terminated-region-parser description char-set must-match? end) (let ((matcher - (terminated-region-matcher description alphabet must-match? end))) + (terminated-region-matcher description char-set must-match? end))) (*parser (map normalize-line-endings (match matcher))))) -(define (terminated-region-matcher description alphabet must-match? . ends) +(define (terminated-region-matcher description char-set must-match? . ends) description (lambda (buffer) (let loop () @@ -485,7 +485,7 @@ USA. (lambda (end) (match-parser-buffer-string-no-advance buffer end))) #t) - ((match-parser-buffer-char-in-alphabet buffer alphabet) + ((match-parser-buffer-char-in-set buffer char-set) (loop)) (must-match? (let ((p (get-parser-buffer-pointer buffer)) @@ -498,7 +498,7 @@ USA. (define parse-char-data ;[14] (let ((parse-body (terminated-region-parser "character data" - alphabet:char-data + char-set:char-data #f "]]>"))) (*parser @@ -510,7 +510,7 @@ USA. (define parse-comment ;[15] (let ((parse-body - (terminated-region-parser "comment" alphabet:xml-char #t "--"))) + (terminated-region-parser "comment" char-set:xml-char #t "--"))) (*parser (encapsulate (lambda (v) @@ -627,7 +627,7 @@ USA. (start "")) (let ((parse-body - (terminated-region-parser description alphabet:xml-char #t end))) + (terminated-region-parser description char-set:xml-char #t end))) (*parser (with-pointer p (transform @@ -676,7 +676,7 @@ USA. (if (not (unicode-scalar-value? n)) (perror p "Invalid code point" n)) (let ((char (integer->char n))) - (if (not (char-in-alphabet? char alphabet:xml-char)) + (if (not (char-set-member? char-set:xml-char char)) (perror p "Disallowed Unicode character" char)) (call-with-utf8-output-string (lambda (port) @@ -774,9 +774,9 @@ USA. (attribute-list-parser (*parser (map make-xml-name (match match:xml-name))) (lambda (a) a))) -(define (attribute-value-parser alphabet parse-reference) - (let ((a1 (alphabet- alphabet (string->alphabet "\""))) - (a2 (alphabet- alphabet (string->alphabet "'")))) +(define (attribute-value-parser char-set parse-reference) + (let ((a1 (char-set-difference char-set (string->char-set "\""))) + (a2 (char-set-difference char-set (string->char-set "'")))) (*parser (encapsulate (lambda (v) (let ((elements (coalesce-strings! (vector->list v)))) @@ -784,15 +784,15 @@ USA. (list "") elements))) (alt (sbracket "attribute value" "\"" "\"" - (* (alt (match (+ (alphabet a1))) + (* (alt (match (+ (char-set a1))) parse-reference))) (sbracket "attribute value" "'" "'" - (* (alt (match (+ (alphabet a2))) + (* (alt (match (+ (char-set a2))) parse-reference)))))))) (define parse-entity-value ;[9] (attribute-value-parser - (alphabet- alphabet:xml-char (string->alphabet "%&")) + (char-set-difference char-set:xml-char (string->char-set "%&")) (*parser (alt parse-char-reference parse-entity-reference-deferred @@ -810,7 +810,7 @@ USA. (define parse-attribute-value ;[10] (let ((parser - (attribute-value-parser alphabet:char-data + (attribute-value-parser char-set:char-data parse-reference-deferred))) (*parser (map (lambda (elements) @@ -1275,14 +1275,13 @@ USA. parse-system-literal)))))) (define parse-system-literal ;[11] - (string-parser "system literal" alphabet:xml-char)) + (string-parser "system literal" char-set:xml-char)) (define parse-public-id-literal ;[12,13] (string-parser "public-ID literal" - (char-set->alphabet - (char-set-union - char-set:alphanumeric - (string->char-set " \r\n-'()+,./:=?;!*#@$_%"))))) + (char-set-union + char-set:alphanumeric + (string->char-set " \r\n-'()+,./:=?;!*#@$_%")))) ;;;; External subset @@ -1299,9 +1298,9 @@ USA. parse-decl-separator)))) (define external-decl-parser - (let ((a1 (alphabet- alphabet:xml-char (string->alphabet "%\"'>"))) - (a2 (alphabet- alphabet:xml-char (string->alphabet "\""))) - (a3 (alphabet- alphabet:xml-char (string->alphabet "'")))) + (let ((a1 (char-set-difference char-set:xml-char (string->char-set "%\"'>"))) + (a2 (char-set-difference char-set:xml-char (string->char-set "\""))) + (a3 (char-set-difference char-set:xml-char (string->char-set "'")))) (lambda (prefix parse-decl) (*parser (with-pointer p @@ -1313,9 +1312,9 @@ USA. (require-success "Malformed markup declaration" (seq (* (alt (match - (alt (* (alphabet a1)) - (seq (char #\") (* (alphabet a2)) (char #\")) - (seq (char #\') (* (alphabet a3)) (char #\')))) + (alt (* (char-set a1)) + (seq (char #\") (* (char-set a2)) (char #\")) + (seq (char #\') (* (char-set a3)) (char #\')))) parse-parameter-entity-reference)) (match ">")))))))))) @@ -1395,7 +1394,7 @@ USA. match-!ignore))))) (define match-!ignore ;[65] - (terminated-region-matcher "ignore section" alphabet:xml-char #t + (terminated-region-matcher "ignore section" char-set:xml-char #t conditional-start conditional-end)) (define parse-parameterized-conditional diff --git a/src/xml/xml-struct.scm b/src/xml/xml-struct.scm index dad07d18c..d96158155 100644 --- a/src/xml/xml-struct.scm +++ b/src/xml/xml-struct.scm @@ -169,7 +169,7 @@ USA. (string-of-xml-chars? object)))) (define (string-of-xml-chars? string) - (for-all-chars-in-string? (alphabet-predicate alphabet:xml-char) + (for-all-chars-in-string? (char-set-predicate char-set:xml-char) string 0 (string-length string) diff --git a/src/xml/xml.pkg b/src/xml/xml.pkg index 71dfc8b0e..0438a5e1a 100644 --- a/src/xml/xml.pkg +++ b/src/xml/xml.pkg @@ -36,14 +36,14 @@ USA. (files "xml-chars") (parent (runtime xml)) (export () - alphabet:xml-char + char-set:xml-char char-set:xml-whitespace) (export (runtime xml) - alphabet:char-data - alphabet:name-initial - alphabet:name-subsequent - alphabet:ncname-initial - alphabet:ncname-subsequent)) + char-set:char-data + char-set:name-initial + char-set:name-subsequent + char-set:ncname-initial + char-set:ncname-subsequent)) (define-package (runtime xml names) (files "xml-names") -- 2.25.1