From 7d326a53c596f65ff5b493b508d4e7cc01cc1ffb Mon Sep 17 00:00:00 2001 From: Chris Hanson Date: Tue, 29 Jul 2003 04:16:28 +0000 Subject: [PATCH] Implement UTFxx validity procedures: UTF8-STRING-VALID? UTF16-BE-STRING-VALID? UTF16-LE-STRING-VALID? UTF16-STRING-VALID? UTF32-BE-STRING-VALID? UTF32-LE-STRING-VALID? UTF32-STRING-VALID? --- v7/src/runtime/runtime.pkg | 9 +- v7/src/runtime/unicode.scm | 211 ++++++++++++++++++++++++++----------- 2 files changed, 157 insertions(+), 63 deletions(-) diff --git a/v7/src/runtime/runtime.pkg b/v7/src/runtime/runtime.pkg index 538de0ea0..5d0d200e6 100644 --- a/v7/src/runtime/runtime.pkg +++ b/v7/src/runtime/runtime.pkg @@ -1,6 +1,6 @@ #| -*-Scheme-*- -$Id: runtime.pkg,v 14.451 2003/07/29 03:46:08 cph Exp $ +$Id: runtime.pkg,v 14.452 2003/07/29 04:16:28 cph Exp $ Copyright 1988,1989,1990,1991,1992,1993 Massachusetts Institute of Technology Copyright 1994,1995,1996,1997,1998,1999 Massachusetts Institute of Technology @@ -4541,18 +4541,25 @@ USA. unicode-code-point? utf16-be-string->wide-string utf16-be-string-length + utf16-be-string-valid? utf16-le-string->wide-string utf16-le-string-length + utf16-le-string-valid? utf16-string->wide-string utf16-string-length + utf16-string-valid? utf32-be-string->wide-string utf32-be-string-length + utf32-be-string-valid? utf32-le-string->wide-string utf32-le-string-length + utf32-le-string-valid? utf32-string->wide-string utf32-string-length + utf32-string-valid? utf8-string->wide-string utf8-string-length + utf8-string-valid? well-formed-code-point-list? wide-char? wide-string diff --git a/v7/src/runtime/unicode.scm b/v7/src/runtime/unicode.scm index c340a817a..6eceba306 100644 --- a/v7/src/runtime/unicode.scm +++ b/v7/src/runtime/unicode.scm @@ -1,6 +1,6 @@ #| -*-Scheme-*- -$Id: unicode.scm,v 1.11 2003/07/03 04:33:50 cph Exp $ +$Id: unicode.scm,v 1.12 2003/07/29 04:16:20 cph Exp $ Copyright 2001,2003 Massachusetts Institute of Technology @@ -89,10 +89,10 @@ USA. 0 (GUARANTEE-SUBSTRING-START-INDEX ,start ,(list-ref form 3) ,caller)))) - ,@(map (lambda (expr) - (make-syntactic-closure environment - (list (list-ref form 2) (list-ref form 3)) - expr)) + ,@(map (let ((excludes + (list (list-ref form 2) (list-ref form 3)))) + (lambda (expr) + (make-syntactic-closure environment excludes expr))) (list-tail form 5))))) (ill-formed-syntax form))))) @@ -107,6 +107,15 @@ USA. (loop start* (fix:+ n 1))) n))) +(define (encoded-string-valid? string start end validate-char) + (let loop ((start start)) + (if (fix:< start end) + (let ((start* (validate-char string start end))) + (if start* + (loop start*) + #f)) + #t))) + (define (read-byte port) (let ((char (read-char port))) (if (eof-object? char) @@ -874,17 +883,53 @@ USA. (with-substring-args string start end caller (encoded-string-length string start end type caller (lambda (string start end) + (validate-utf32-char string start end combiner))))) + +(define (utf32-string-valid? string #!optional start end) + (if (host-big-endian?) + (%utf32-string-valid? string + (if (default-object? start) #f start) + (if (default-object? end) #f end) + utf32-be-bytes->code-point + 'UTF32-STRING-VALID?) + (%utf32-string-valid? string + (if (default-object? start) #f start) + (if (default-object? end) #f end) + utf32-le-bytes->code-point + 'UTF32-STRING-VALID?))) + +(define (utf32-be-string-valid? string #!optional start end) + (%utf32-string-valid? string + (if (default-object? start) #f start) + (if (default-object? end) #f end) + utf32-be-bytes->code-point + 'UTF32-BE-STRING-VALID?)) + +(define (utf32-le-string-valid? string #!optional start end) + (%utf32-string-valid? string + (if (default-object? start) #f start) + (if (default-object? end) #f end) + utf32-le-bytes->code-point + 'UTF32-LE-STRING-VALID?)) + +(define (%utf32-string-valid? string start end combiner caller) + (with-substring-args string start end caller + (encoded-string-valid? string start end + (lambda (string start end) + (validate-utf32-char string start end combiner))))) + +(define (validate-utf32-char string start end combiner) - (define-integrable (n i) - (vector-8b-ref string (fix:+ start i))) + (define-integrable (n i) + (vector-8b-ref string (fix:+ start i))) - (if (fix:< start end) - (let ((start* (fix:+ start 4))) - (and (fix:<= start* end) - (let ((pt (combiner (n 0) (n 1) (n 2) (n 3)))) - (and (unicode-code-point? pt) - start*)))) - start))))) + (if (fix:< start end) + (let ((start* (fix:+ start 4))) + (and (fix:<= start* end) + (let ((pt (combiner (n 0) (n 1) (n 2) (n 3)))) + (and (unicode-code-point? pt) + start*)))) + start)) ;;;; UTF-16 representation @@ -1051,23 +1096,59 @@ USA. (with-substring-args string start end caller (encoded-string-length string start end type caller (lambda (string start end) + (validate-utf16-char string start end combiner))))) - (define-integrable (n i) - (vector-8b-ref string (fix:+ start i))) - - (if (fix:< start end) - (and (fix:<= (fix:+ start 2) end) - (let ((d0 (combiner (n 0) (n 1)))) - (if (high-surrogate? d0) - (and (fix:<= (fix:+ start 4) end) - (let ((d1 (combiner (n 2) (n 3)))) - (and (low-surrogate? d1) - (let ((pt (combine-surrogates d0 d1))) - (and (unicode-code-point? pt) - (fix:+ start 4)))))) - (and (unicode-code-point? d0) - (fix:+ start 2))))) - start))))) +(define (utf16-string-valid? string #!optional start end) + (if (host-big-endian?) + (%utf16-string-valid? string + (if (default-object? start) #f start) + (if (default-object? end) #f end) + be-bytes->digit16 + 'UTF16-STRING-VALID?) + (%utf16-string-valid? string + (if (default-object? start) #f start) + (if (default-object? end) #f end) + le-bytes->digit16 + 'UTF16-STRING-VALID?))) + +(define (utf16-be-string-valid? string #!optional start end) + (%utf16-string-valid? string + (if (default-object? start) #f start) + (if (default-object? end) #f end) + be-bytes->digit16 + 'UTF16-BE-STRING-VALID?)) + +(define (utf16-le-string-valid? string #!optional start end) + (%utf16-string-valid? string + (if (default-object? start) #f start) + (if (default-object? end) #f end) + le-bytes->digit16 + 'UTF16-LE-STRING-VALID?)) + +(define (%utf16-string-valid? string start end combiner caller) + (with-substring-args string start end caller + (encoded-string-valid? string start end + (lambda (string start end) + (validate-utf16-char string start end combiner))))) + +(define (validate-utf16-char string start end combiner) + + (define-integrable (n i) + (vector-8b-ref string (fix:+ start i))) + + (if (fix:< start end) + (and (fix:<= (fix:+ start 2) end) + (let ((d0 (combiner (n 0) (n 1)))) + (if (high-surrogate? d0) + (and (fix:<= (fix:+ start 4) end) + (let ((d1 (combiner (n 2) (n 3)))) + (and (low-surrogate? d1) + (let ((pt (combine-surrogates d0 d1))) + (and (unicode-code-point? pt) + (fix:+ start 4)))))) + (and (unicode-code-point? d0) + (fix:+ start 2))))) + start)) (define-integrable (be-bytes->digit16 b0 b1) (fix:or (fix:lsh b0 8) b1)) @@ -1182,42 +1263,48 @@ USA. (begin (%write-utf8-char char output) (loop))))))))) - + (define (utf8-string-length string #!optional start end) (with-substring-args string start end 'UTF8-STRING-LENGTH (encoded-string-length string start end "8" 'UTF8-STRING-LENGTH - (lambda (string start end) + validate-utf8-char))) - (define-integrable (check-byte i) - (%valid-trailer? (n i))) - - (define-integrable (n i) - (vector-8b-ref string (fix:+ start i))) - - (if (fix:< start end) - (let ((b0 (vector-8b-ref string start))) - (cond ((fix:< b0 #x80) - (fix:+ start 1)) - ((fix:< b0 #xE0) - (and (fix:<= (fix:+ start 2) end) - (check-byte 1) - (%vs2 b0) - (fix:+ start 2))) - ((fix:< b0 #xF0) - (and (fix:<= (fix:+ start 3) end) - (check-byte 1) - (check-byte 2) - (%vs3 b0 (n 1)) - (fix:+ start 3))) - ((fix:< b0 #xF8) - (and (fix:<= (fix:+ start 4) end) - (check-byte 1) - (%vs4 b0 (n 1)) - (check-byte 2) - (check-byte 3) - (fix:+ start 4))) - (else #f))) - start))))) +(define (utf8-string-valid? string #!optional start end) + (with-substring-args string start end 'UTF8-STRING-VALID? + (encoded-string-valid? string start end validate-utf8-char))) + +(define (validate-utf8-char string start end) + + (define-integrable (check-byte i) + (%valid-trailer? (n i))) + + (define-integrable (n i) + (vector-8b-ref string (fix:+ start i))) + + (if (fix:< start end) + (let ((b0 (vector-8b-ref string start))) + (cond ((fix:< b0 #x80) + (fix:+ start 1)) + ((fix:< b0 #xE0) + (and (fix:<= (fix:+ start 2) end) + (check-byte 1) + (%vs2 b0) + (fix:+ start 2))) + ((fix:< b0 #xF0) + (and (fix:<= (fix:+ start 3) end) + (check-byte 1) + (check-byte 2) + (%vs3 b0 (n 1)) + (fix:+ start 3))) + ((fix:< b0 #xF8) + (and (fix:<= (fix:+ start 4) end) + (check-byte 1) + (%vs4 b0 (n 1)) + (check-byte 2) + (check-byte 3) + (fix:+ start 4))) + (else #f))) + start)) (define-integrable (%vc2 b0) (if (not (%vs2 b0)) -- 2.25.1