From f0d25f71152dd6b6a3cb65c0915d27b7d38f8ec4 Mon Sep 17 00:00:00 2001 From: Chris Hanson Date: Sat, 24 Jul 2004 03:45:54 +0000 Subject: [PATCH] Add support for XHTML predefined entities. These are available only when the document has an XHTML DTD. --- v7/src/xml/compile.scm | 5 +- v7/src/xml/load.scm | 4 +- v7/src/xml/xhtml-entities.scm | 290 ++++++++++++++++++++++++++++++++++ v7/src/xml/xml-output.scm | 31 ++-- v7/src/xml/xml-parser.scm | 38 +++-- v7/src/xml/xml.pkg | 10 +- 6 files changed, 346 insertions(+), 32 deletions(-) create mode 100644 v7/src/xml/xhtml-entities.scm diff --git a/v7/src/xml/compile.scm b/v7/src/xml/compile.scm index b8373aeed..24f9f80cb 100644 --- a/v7/src/xml/compile.scm +++ b/v7/src/xml/compile.scm @@ -1,6 +1,6 @@ #| -*-Scheme-*- -$Id: compile.scm,v 1.13 2004/07/15 18:16:43 cph Exp $ +$Id: compile.scm,v 1.14 2004/07/24 03:45:29 cph Exp $ Copyright 2001,2003,2004 Massachusetts Institute of Technology @@ -43,5 +43,6 @@ USA. "xml-chars" "xml-output" "xml-parser" - "xhtml")))) + "xhtml" + "xhtml-entities")))) (cref/generate-constructors "xml"))) \ No newline at end of file diff --git a/v7/src/xml/load.scm b/v7/src/xml/load.scm index cd0d200f4..95d5cf2b5 100644 --- a/v7/src/xml/load.scm +++ b/v7/src/xml/load.scm @@ -1,6 +1,6 @@ #| -*-Scheme-*- -$Id: load.scm,v 1.13 2004/07/15 18:16:49 cph Exp $ +$Id: load.scm,v 1.14 2004/07/24 03:45:34 cph Exp $ Copyright 2001,2002,2003,2004 Massachusetts Institute of Technology @@ -28,4 +28,4 @@ USA. (with-working-directory-pathname (directory-pathname (current-load-pathname)) (lambda () (package/system-loader "xml" '() 'QUERY))) -(add-subsystem-identification! "XML" '(0 6)) \ No newline at end of file +(add-subsystem-identification! "XML" '(0 7)) \ No newline at end of file diff --git a/v7/src/xml/xhtml-entities.scm b/v7/src/xml/xhtml-entities.scm new file mode 100644 index 000000000..34c58d268 --- /dev/null +++ b/v7/src/xml/xhtml-entities.scm @@ -0,0 +1,290 @@ +#| -*-Scheme-*- + +$Id: xhtml-entities.scm,v 1.1 2004/07/24 03:45:39 cph Exp $ + +Copyright 2004 Massachusetts Institute of Technology + +This file is part of MIT/GNU Scheme. + +MIT/GNU Scheme is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or (at +your option) any later version. + +MIT/GNU Scheme is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +General Public License for more details. + +You should have received a copy of the GNU General Public License +along with MIT/GNU Scheme; if not, write to the Free Software +Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, +USA. + +|# + +;;;; XHTML entity definitions + +(declare (usual-integrations)) + +(define html-entity-alist + '((nbsp #\U+0160) ; no-break space + (iexcl #\U+0161) ; inverted exclamation mark + (cent #\U+0162) ; cent sign + (pound #\U+0163) ; pound sign + (curren #\U+0164) ; currency sign + (yen #\U+0165) ; yen sign + (brvbar #\U+0166) ; broken bar + (sect #\U+0167) ; section sign + (uml #\U+0168) ; diaeresis + (copy #\U+0169) ; copyright sign + (ordf #\U+0170) ; feminine ordinal indicator + (laquo #\U+0171) ; left-pointing double angle quotation mark + (not #\U+0172) ; not sign + (shy #\U+0173) ; soft hyphen + (reg #\U+0174) ; registered sign + (macr #\U+0175) ; macron + (deg #\U+0176) ; degree sign + (plusmn #\U+0177) ; plus-minus sign + (sup2 #\U+0178) ; superscript two + (sup3 #\U+0179) ; superscript three + (acute #\U+0180) ; acute accent + (micro #\U+0181) ; micro sign + (para #\U+0182) ; pilcrow sign + (middot #\U+0183) ; middle dot + (cedil #\U+0184) ; cedilla + (sup1 #\U+0185) ; superscript one + (ordm #\U+0186) ; masculine ordinal indicator + (raquo #\U+0187) ; right-pointing double angle quotation mark + (frac14 #\U+0188) ; vulgar fraction one quarter + (frac12 #\U+0189) ; vulgar fraction one half + (frac34 #\U+0190) ; vulgar fraction three quarters + (iquest #\U+0191) ; inverted question mark + (|Agrave| #\U+0192) ; latin capital letter A with grave + (|Aacute| #\U+0193) ; latin capital letter A with acute + (|Acirc| #\U+0194) ; latin capital letter A with circumflex + (|Atilde| #\U+0195) ; latin capital letter A with tilde + (|Auml| #\U+0196) ; latin capital letter A with diaeresis + (|Aring| #\U+0197) ; latin capital letter A with ring above + (|AElig| #\U+0198) ; latin capital letter AE + (|Ccedil| #\U+0199) ; latin capital letter C with cedilla + (|Egrave| #\U+0200) ; latin capital letter E with grave + (|Eacute| #\U+0201) ; latin capital letter E with acute + (|Ecirc| #\U+0202) ; latin capital letter E with circumflex + (|Euml| #\U+0203) ; latin capital letter E with diaeresis + (|Igrave| #\U+0204) ; latin capital letter I with grave + (|Iacute| #\U+0205) ; latin capital letter I with acute + (|Icirc| #\U+0206) ; latin capital letter I with circumflex + (|Iuml| #\U+0207) ; latin capital letter I with diaeresis + (|ETH| #\U+0208) ; latin capital letter ETH + (|Ntilde| #\U+0209) ; latin capital letter N with tilde + (|Ograve| #\U+0210) ; latin capital letter O with grave + (|Oacute| #\U+0211) ; latin capital letter O with acute + (|Ocirc| #\U+0212) ; latin capital letter O with circumflex + (|Otilde| #\U+0213) ; latin capital letter O with tilde + (|Ouml| #\U+0214) ; latin capital letter O with diaeresis + (times #\U+0215) ; multiplication sign + (|Oslash| #\U+0216) ; latin capital letter O with stroke + (|Ugrave| #\U+0217) ; latin capital letter U with grave + (|Uacute| #\U+0218) ; latin capital letter U with acute + (|Ucirc| #\U+0219) ; latin capital letter U with circumflex + (|Uuml| #\U+0220) ; latin capital letter U with diaeresis + (|Yacute| #\U+0221) ; latin capital letter Y with acute + (|THORN| #\U+0222) ; latin capital letter THORN + (szlig #\U+0223) ; latin small letter sharp s + (agrave #\U+0224) ; latin small letter a with grave + (aacute #\U+0225) ; latin small letter a with acute + (acirc #\U+0226) ; latin small letter a with circumflex + (atilde #\U+0227) ; latin small letter a with tilde + (auml #\U+0228) ; latin small letter a with diaeresis + (aring #\U+0229) ; latin small letter a with ring above + (aelig #\U+0230) ; latin small letter ae + (ccedil #\U+0231) ; latin small letter c with cedilla + (egrave #\U+0232) ; latin small letter e with grave + (eacute #\U+0233) ; latin small letter e with acute + (ecirc #\U+0234) ; latin small letter e with circumflex + (euml #\U+0235) ; latin small letter e with diaeresis + (igrave #\U+0236) ; latin small letter i with grave + (iacute #\U+0237) ; latin small letter i with acute + (icirc #\U+0238) ; latin small letter i with circumflex + (iuml #\U+0239) ; latin small letter i with diaeresis + (eth #\U+0240) ; latin small letter eth + (ntilde #\U+0241) ; latin small letter n with tilde + (ograve #\U+0242) ; latin small letter o with grave + (oacute #\U+0243) ; latin small letter o with acute + (ocirc #\U+0244) ; latin small letter o with circumflex + (otilde #\U+0245) ; latin small letter o with tilde + (ouml #\U+0246) ; latin small letter o with diaeresis + (divide #\U+0247) ; division sign + (oslash #\U+0248) ; latin small letter o with stroke, + (ugrave #\U+0249) ; latin small letter u with grave + (uacute #\U+0250) ; latin small letter u with acute + (ucirc #\U+0251) ; latin small letter u with circumflex + (uuml #\U+0252) ; latin small letter u with diaeresis + (yacute #\U+0253) ; latin small letter y with acute + (thorn #\U+0254) ; latin small letter thorn + (yuml #\U+0255) ; latin small letter y with diaeresis + (|OElig| #\U+0338) ; latin capital ligature OE + (oelig #\U+0339) ; latin small ligature oe + (|Scaron| #\U+0352) ; latin capital letter S with caron + (scaron #\U+0353) ; latin small letter s with caron + (|Yuml| #\U+0376) ; latin capital letter Y with diaeresis + (circ #\U+0710) ; modifier letter circumflex accent + (tilde #\U+0732) ; small tilde + (ensp #\U+8194) ; en space + (emsp #\U+8195) ; em space + (thinsp #\U+8201) ; thin space + (zwnj #\U+8204) ; zero width non-joiner + (zwj #\U+8205) ; zero width joiner + (lrm #\U+8206) ; left-to-right mark + (rlm #\U+8207) ; right-to-left mark + (ndash #\U+8211) ; en dash + (mdash #\U+8212) ; em dash + (lsquo #\U+8216) ; left single quotation mark + (rsquo #\U+8217) ; right single quotation mark + (sbquo #\U+8218) ; single low-9 quotation mark + (ldquo #\U+8220) ; left double quotation mark + (rdquo #\U+8221) ; right double quotation mark + (bdquo #\U+8222) ; double low-9 quotation mark + (dagger #\U+8224) ; dagger + (|Dagger| #\U+8225) ; double dagger + (permil #\U+8240) ; per mille sign + (lsaquo #\U+8249) ; single left-pointing angle quotation mark + (rsaquo #\U+8250) ; single right-pointing angle quotation mark + (euro #\U+8364) ; euro sign + (fnof #\U+0402) ; latin small letter f with hook + (|Alpha| #\U+0913) ; greek capital letter alpha + (|Beta| #\U+0914) ; greek capital letter beta + (|Gamma| #\U+0915) ; greek capital letter gamma + (|Delta| #\U+0916) ; greek capital letter delta + (|Epsilon| #\U+0917) ; greek capital letter epsilon + (|Zeta| #\U+0918) ; greek capital letter zeta + (|Eta| #\U+0919) ; greek capital letter eta + (|Theta| #\U+0920) ; greek capital letter theta + (|Iota| #\U+0921) ; greek capital letter iota + (|Kappa| #\U+0922) ; greek capital letter kappa + (|Lambda| #\U+0923) ; greek capital letter lamda + (|Mu| #\U+0924) ; greek capital letter mu + (|Nu| #\U+0925) ; greek capital letter nu + (|Xi| #\U+0926) ; greek capital letter xi + (|Omicron| #\U+0927) ; greek capital letter omicron + (|Pi| #\U+0928) ; greek capital letter pi + (|Rho| #\U+0929) ; greek capital letter rho + (|Sigma| #\U+0931) ; greek capital letter sigma + (|Tau| #\U+0932) ; greek capital letter tau + (|Upsilon| #\U+0933) ; greek capital letter upsilon + (|Phi| #\U+0934) ; greek capital letter phi + (|Chi| #\U+0935) ; greek capital letter chi + (|Psi| #\U+0936) ; greek capital letter psi + (|Omega| #\U+0937) ; greek capital letter omega + (alpha #\U+0945) ; greek small letter alpha + (beta #\U+0946) ; greek small letter beta + (gamma #\U+0947) ; greek small letter gamma + (delta #\U+0948) ; greek small letter delta + (epsilon #\U+0949) ; greek small letter epsilon + (zeta #\U+0950) ; greek small letter zeta + (eta #\U+0951) ; greek small letter eta + (theta #\U+0952) ; greek small letter theta + (iota #\U+0953) ; greek small letter iota + (kappa #\U+0954) ; greek small letter kappa + (lambda #\U+0955) ; greek small letter lamda + (mu #\U+0956) ; greek small letter mu + (nu #\U+0957) ; greek small letter nu + (xi #\U+0958) ; greek small letter xi + (omicron #\U+0959) ; greek small letter omicron + (pi #\U+0960) ; greek small letter pi + (rho #\U+0961) ; greek small letter rho + (sigmaf #\U+0962) ; greek small letter final sigma + (sigma #\U+0963) ; greek small letter sigma + (tau #\U+0964) ; greek small letter tau + (upsilon #\U+0965) ; greek small letter upsilon + (phi #\U+0966) ; greek small letter phi + (chi #\U+0967) ; greek small letter chi + (psi #\U+0968) ; greek small letter psi + (omega #\U+0969) ; greek small letter omega + (thetasym #\U+0977) ; greek theta symbol + (upsih #\U+0978) ; greek upsilon with hook symbol + (piv #\U+0982) ; greek pi symbol + (bull #\U+8226) ; bullet + (hellip #\U+8230) ; horizontal ellipsis + (prime #\U+8242) ; prime + (|Prime| #\U+8243) ; double prime + (oline #\U+8254) ; overline + (frasl #\U+8260) ; fraction slash + (weierp #\U+8472) ; script capital P + (image #\U+8465) ; black-letter capital I + (real #\U+8476) ; black-letter capital R + (trade #\U+8482) ; trade mark sign + (alefsym #\U+8501) ; alef symbol + (larr #\U+8592) ; leftwards arrow + (uarr #\U+8593) ; upwards arrow + (rarr #\U+8594) ; rightwards arrow + (darr #\U+8595) ; downwards arrow + (harr #\U+8596) ; left right arrow + (crarr #\U+8629) ; downwards arrow with corner leftwards + (|lArr| #\U+8656) ; leftwards double arrow + (|uArr| #\U+8657) ; upwards double arrow + (|rArr| #\U+8658) ; rightwards double arrow + (|dArr| #\U+8659) ; downwards double arrow + (|hArr| #\U+8660) ; left right double arrow + (forall #\U+8704) ; for all + (part #\U+8706) ; partial differential + (exist #\U+8707) ; there exists + (empty #\U+8709) ; empty set + (nabla #\U+8711) ; nabla + (isin #\U+8712) ; element of + (notin #\U+8713) ; not an element of + (ni #\U+8715) ; contains as member + (prod #\U+8719) ; n-ary product + (sum #\U+8721) ; n-ary summation + (minus #\U+8722) ; minus sign + (lowast #\U+8727) ; asterisk operator + (radic #\U+8730) ; square root + (prop #\U+8733) ; proportional to + (infin #\U+8734) ; infinity + (ang #\U+8736) ; angle + (and #\U+8743) ; logical and + (or #\U+8744) ; logical or + (cap #\U+8745) ; intersection + (cup #\U+8746) ; union + (int #\U+8747) ; integral + (there4 #\U+8756) ; therefore + (sim #\U+8764) ; tilde operator + (cong #\U+8773) ; approximately equal to + (asymp #\U+8776) ; almost equal to + (ne #\U+8800) ; not equal to + (equiv #\U+8801) ; identical to + (le #\U+8804) ; less-than or equal to + (ge #\U+8805) ; greater-than or equal to + (sub #\U+8834) ; subset of + (sup #\U+8835) ; superset of + (nsub #\U+8836) ; not a subset of + (sube #\U+8838) ; subset of or equal to + (supe #\U+8839) ; superset of or equal to + (oplus #\U+8853) ; circled plus + (otimes #\U+8855) ; circled times + (perp #\U+8869) ; up tack + (sdot #\U+8901) ; dot operator + (lceil #\U+8968) ; left ceiling + (rceil #\U+8969) ; right ceiling + (lfloor #\U+8970) ; left floor + (rfloor #\U+8971) ; right floor + (lang #\U+9001) ; left-pointing angle bracket + (rang #\U+9002) ; right-pointing angle bracket + (loz #\U+9674) ; lozenge + (spades #\U+9824) ; black spade suit + (clubs #\U+9827) ; black club suit + (hearts #\U+9829) ; black heart suit + (diams #\U+9830) ; black diamond suit + )) + +(define html-entities + (map (lambda (b) (apply make-xml-!entity b)) + html-entity-alist)) + +(define html-char->name-map + (let ((table (make-eq-hash-table))) + (for-each (lambda (b) (hash-table/put! table (cadr b) (car b))) + html-entity-alist) + (lambda (char) + (hash-table/get table char #f)))) \ No newline at end of file diff --git a/v7/src/xml/xml-output.scm b/v7/src/xml/xml-output.scm index 3dd0c426c..658454a12 100644 --- a/v7/src/xml/xml-output.scm +++ b/v7/src/xml/xml-output.scm @@ -1,6 +1,6 @@ #| -*-Scheme-*- -$Id: xml-output.scm,v 1.33 2004/02/26 01:52:24 cph Exp $ +$Id: xml-output.scm,v 1.34 2004/07/24 03:45:49 cph Exp $ Copyright 2001,2002,2003,2004 Massachusetts Institute of Technology @@ -57,19 +57,24 @@ USA. (wide-string->utf8-string (apply xml->wide-string xml options))) (define (write-xml-1 xml port options) - (%write-xml xml (make-ctx port options))) + (%write-xml xml + (apply make-ctx + 'CHAR-MAP (if (and (xml-document? xml) + (html-dtd? (xml-document-dtd xml))) + html-char->name-map + (lambda (char) char #f)) + 'PORT port + options))) (define-structure (ctx (type-descriptor ) - (keyword-constructor %make-ctx) + (keyword-constructor make-ctx) (print-procedure (standard-unparser-method 'XML-OUTPUT-CONTEXT #f))) + (char-map #f read-only #t) (port #f read-only #t) (indent-attributes? #f read-only #t) (indent-dtd? #f read-only #t)) -(define (make-ctx port options) - (apply %make-ctx 'PORT port options)) - (define (emit-char char ctx) (let ((port (ctx-port ctx))) (if (fix:< (char->integer char) #x80) @@ -474,10 +479,16 @@ USA. (define (write-escaped-string string escapes ctx) (for-each-wide-char string (lambda (char) - (let ((e (assq char escapes))) - (if e - (emit-string (cdr e) ctx) - (emit-char char ctx)))))) + (cond ((assq char escapes) + => (lambda (e) + (emit-string (cdr e) ctx))) + (((ctx-char-map ctx) char) + => (lambda (name) + (emit-char #\& ctx) + (emit-string (symbol-name name) ctx) + (emit-char #\; ctx))) + (else + (emit-char char ctx)))))) (define (for-each-wide-char string procedure) (let ((port (open-input-string string))) diff --git a/v7/src/xml/xml-parser.scm b/v7/src/xml/xml-parser.scm index 36ae425b6..71eeb8fa3 100644 --- a/v7/src/xml/xml-parser.scm +++ b/v7/src/xml/xml-parser.scm @@ -1,6 +1,6 @@ #| -*-Scheme-*- -$Id: xml-parser.scm,v 1.62 2004/06/28 03:27:04 cph Exp $ +$Id: xml-parser.scm,v 1.63 2004/07/24 03:45:54 cph Exp $ Copyright 2001,2002,2003,2004 Massachusetts Institute of Technology @@ -232,20 +232,22 @@ USA. (dtd (one-value (fluid-let ((*in-dtd?* #t)) - (parse-dtd buffer)))) - (misc-2 (if dtd (one-value (parse-misc buffer)) '())) - (element - (or (one-value (parse-element buffer)) - (perror buffer "Missing root element"))) - (misc-3 (one-value (parse-misc buffer)))) - (if (peek-parser-buffer-char buffer) - (perror buffer "Unparsed content in input")) - (make-xml-document declaration - misc-1 - dtd - misc-2 - element - misc-3)))))) + (parse-dtd buffer))))) + (if (html-dtd? dtd) + (add-html-entities!)) + (let* ((misc-2 (if dtd (one-value (parse-misc buffer)) '())) + (element + (or (one-value (parse-element buffer)) + (perror buffer "Missing root element"))) + (misc-3 (one-value (parse-misc buffer)))) + (if (peek-parser-buffer-char buffer) + (perror buffer "Unparsed content in input")) + (make-xml-document declaration + misc-1 + dtd + misc-2 + element + misc-3))))))) (define *standalone?*) (define *internal-dtd?*) @@ -1008,6 +1010,12 @@ USA. (car entities) (loop (cdr entities)))))) +(define (add-html-entities!) + (if (pair? *general-entities*) + (begin + (set! *general-entities* (append *general-entities* html-entities)) + unspecific))) + (define (predefined-entities) (list (make-xml-!entity 'lt '("<")) (make-xml-!entity 'gt '(">")) diff --git a/v7/src/xml/xml.pkg b/v7/src/xml/xml.pkg index 2ce4a418a..756795944 100644 --- a/v7/src/xml/xml.pkg +++ b/v7/src/xml/xml.pkg @@ -1,6 +1,6 @@ #| -*-Scheme-*- -$Id: xml.pkg,v 1.51 2004/07/24 03:19:23 cph Exp $ +$Id: xml.pkg,v 1.52 2004/07/24 03:45:44 cph Exp $ Copyright 2001,2002,2003,2004 Massachusetts Institute of Technology @@ -298,7 +298,7 @@ USA. xml->wide-string)) (define-package (runtime xml html) - (files "xhtml") + (files "xhtml" "xhtml-entities") (parent (runtime xml)) (export () (html-document html-1.0-document) @@ -492,4 +492,8 @@ USA. html:ul html:ul? html:var - html:var?)) \ No newline at end of file + html:var?) + (export (runtime xml parser) + html-entities) + (export (runtime xml output) + html-char->name-map)) \ No newline at end of file -- 2.25.1