From 519273265be5285d930b8bcc80392749ed45fccb Mon Sep 17 00:00:00 2001 From: "Taylor R. Campbell" Date: Mon, 31 Jul 2006 18:55:43 +0000 Subject: [PATCH] Tighten XML-CHAR-DATA? and CANONICALIZE-CHAR-DATA so that they refuse characters or entity references not included in the XML-CHAR alphabet. This may have a small performance cost when constructing XML trees, and it will have to change for XML 1.1, which relaxes the restriction on XML character data. It could have been done in the XML writer, since the parser already checks this, but this way offers guarantees about the validity of any XML data in Scheme, and these guarantees are probably good things. --- v7/src/xml/xml-struct.scm | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/v7/src/xml/xml-struct.scm b/v7/src/xml/xml-struct.scm index d61dbe491..6b4000151 100644 --- a/v7/src/xml/xml-struct.scm +++ b/v7/src/xml/xml-struct.scm @@ -1,6 +1,6 @@ #| -*-Scheme-*- -$Id: xml-struct.scm,v 1.54 2006/02/02 20:50:26 cph Exp $ +$Id: xml-struct.scm,v 1.55 2006/07/31 18:55:43 riastradh Exp $ Copyright 2001,2002,2003,2004,2005,2006 Massachusetts Institute of Technology @@ -153,9 +153,17 @@ USA. (define (xml-char-data? object) (or (wide-char? object) - (wide-string? object) - (and (string? object) - (utf8-string-valid? object)))) + (and (or (wide-string? object) + (and (string? object) + (utf8-string-valid? object))) + (string-of-xml-chars? object)))) + +(define (string-of-xml-chars? string) + (for-all-chars-in-string? (alphabet-predicate alphabet:xml-char) + string + 0 + (string-length string) + 'UTF-8)) (define (canonicalize-char-data object) (cond ((wide-char? object) @@ -165,9 +173,12 @@ USA. (write-char object port)))) ((wide-string? object) (wide-string->utf8-string object)) - ((and (string? object) - (utf8-string-valid? object)) - object) + ((string? object) + (cond ((not (utf8-string-valid? object)) + (error:wrong-type-datum object "valid UTF-8 XML char data")) + ((not (string-of-xml-chars? object)) + (error:wrong-type-datum object "well-formed XML char data")) + (else object))) ((uri? object) (uri->string object)) (else (error:wrong-type-datum object "an XML char data")))) @@ -196,7 +207,7 @@ USA. (xml-comment? object) (xml-element? object) (xml-processing-instructions? object))) - + (define (canonicalize-content content) (letrec ((search -- 2.25.1