From: Chris Hanson Date: Mon, 30 Jan 2017 09:42:20 +0000 (-0800) Subject: Rework the UTF-8 codecs: X-Git-Tag: mit-scheme-pucked-9.2.12~227^2~17 X-Git-Url: https://birchwood-abbey.net/git?a=commitdiff_plain;h=1a4c025e53d1befdc8226b00501e351372c3e925;p=mit-scheme.git Rework the UTF-8 codecs: * Allow any scalar value to be used, as required by Unicode. * Implement strict decoding as described in Unicode document. * Change test cases to match new behavior. --- diff --git a/src/runtime/char.scm b/src/runtime/char.scm index d4aac5b26..79074aca9 100644 --- a/src/runtime/char.scm +++ b/src/runtime/char.scm @@ -427,14 +427,14 @@ USA. ;;;; UTF-{8,16,32} encoders (define (char-utf8-byte-length char) - (let ((sv (unicode-char->scalar-value char 'char-utf8-byte-length))) + (let ((sv (char->scalar-value char 'char-utf8-byte-length))) (cond ((fix:< sv #x80) 1) ((fix:< sv #x800) 2) ((fix:< sv #x10000) 3) (else 4)))) (define (encode-utf8-char! bytes index char) - (let ((sv (unicode-char->scalar-value char 'encode-utf8-char!))) + (let ((sv (char->scalar-value char 'encode-utf8-char!))) (define-integrable (initial-byte leader offset) (fix:or leader (fix:lsh sv offset))) @@ -462,13 +462,13 @@ USA. (fix:+ index 4))))) (define (char-utf16-byte-length char) - (if (fix:< (unicode-char->scalar-value char 'char-utf16-byte-length) #x10000) + (if (fix:< (char->scalar-value char 'char-utf16-byte-length) #x10000) 2 4)) (define (utf16-char-encoder setter caller) (lambda (bytes index char) - (let ((sv (unicode-char->scalar-value char caller))) + (let ((sv (char->scalar-value char caller))) (cond ((fix:< sv #x10000) (setter bytes index sv) (fix:+ index 2)) @@ -487,12 +487,12 @@ USA. (utf16-char-encoder bytevector-u16le-set! 'encode-utf16le-char!)) (define (char-utf32-byte-length char) - (unicode-char->scalar-value char 'char-utf32-byte-length) + (char->scalar-value char 'char-utf32-byte-length) 4) (define (utf32-char-encoder setter caller) (lambda (bytes index char) - (setter bytes index (unicode-char->scalar-value char caller)))) + (setter bytes index (char->scalar-value char caller)))) (define encode-utf32be-char! (utf32-char-encoder bytevector-u32be-set! 'encode-utf32be-char!)) @@ -502,13 +502,13 @@ USA. ;;;; UTF-{8,16,32} decoders -(define (initial-byte->utf8-char-length byte) - (guarantee byte? byte 'initial-byte->utf8-char-length) - (cond ((utf8-initial-byte-1? byte) 1) - ((utf8-initial-byte-2? byte) 2) - ((utf8-initial-byte-3? byte) 3) - ((utf8-initial-byte-4? byte) 4) - (else (error "Illegal UTF-8 initial byte:" byte)))) +(define (initial-byte->utf8-char-length b0) + (guarantee u8? b0 'initial-byte->utf8-char-length) + (cond ((utf8-initial-byte-1? b0) 1) + ((utf8-initial-byte-2? b0) 2) + ((utf8-initial-byte-3? b0) 3) + ((utf8-initial-byte-4? b0) 4) + (else (error "Illegal UTF-8 initial byte:" b0)))) (define (decode-utf8-char bytes index) (integer->char @@ -516,91 +516,123 @@ USA. (cond ((utf8-initial-byte-1? b0) b0) ((utf8-initial-byte-2? b0) - (decode-utf8-2 b0 - (bytevector-u8-ref bytes (fix:+ index 1)))) + (let ((b1 (bytevector-u8-ref bytes (fix:+ index 1)))) + (if (not (valid-utf8-sequence-2? b0 b1)) + (error "Ill-formed UTF-8 sequence:" b0 b1)) + (fix:or (extract-bits b0 #x1F 6) + (extract-bits b1 #x3F 0)))) ((utf8-initial-byte-3? b0) - (decode-utf8-3 b0 - (bytevector-u8-ref bytes (fix:+ index 1)) - (bytevector-u8-ref bytes (fix:+ index 2)))) + (let ((b1 (bytevector-u8-ref bytes (fix:+ index 1))) + (b2 (bytevector-u8-ref bytes (fix:+ index 2)))) + (if (not (valid-utf8-sequence-3? b0 b1 b2)) + (error "Ill-formed UTF-8 sequence:" b0 b1 b2)) + (fix:or (fix:or (extract-bits b0 #x0F 12) + (extract-bits b1 #x3F 6)) + (extract-bits b2 #x3F 0)))) ((utf8-initial-byte-4? b0) - (decode-utf8-4 b0 - (bytevector-u8-ref bytes (fix:+ index 1)) - (bytevector-u8-ref bytes (fix:+ index 2)) - (bytevector-u8-ref bytes (fix:+ index 3)))) + (let ((b1 (bytevector-u8-ref bytes (fix:+ index 1))) + (b2 (bytevector-u8-ref bytes (fix:+ index 2))) + (b3 (bytevector-u8-ref bytes (fix:+ index 3)))) + (if (not (valid-utf8-sequence-4? b0 b1 b2 b3)) + (error "Ill-formed UTF-8 sequence:" b0 b1 b2 b3)) + (fix:or (fix:or (extract-bits b0 #x07 18) + (extract-bits b1 #x3F 12)) + (fix:or (extract-bits b2 #x3F 6) + (extract-bits b3 #x3F 0))))) (else (error "Illegal UTF-8 initial byte:" b0)))))) -(define (decode-utf8-2 b0 b1) - (if (not (and (fix:> b0 #xC1) - (utf8-trailing-byte? b1))) - (error "Ill-formed UTF-8 sequence:" b0 b1)) - (fix:or (extract-bits b0 #x1F 6) - (extract-bits b1 #x3F 0))) - -(define (decode-utf8-3 b0 b1 b2) - (if (not (and (or (fix:> b0 #xE0) (fix:> b1 #x9F)) - (utf8-trailing-byte? b1) - (utf8-trailing-byte? b2))) - (error "Ill-formed UTF-8 sequence:" b0 b1 b2)) - (let ((cp - (fix:or (fix:or (extract-bits b0 #x0F 12) - (extract-bits b1 #x3F 6)) - (extract-bits b2 #x3F 0)))) - (guarantee-cp-not-utf16-surrogate cp) - (guarantee-cp-is-character cp) - cp)) - -(define (decode-utf8-4 b0 b1 b2 b3) - (if (not (and (or (fix:> b0 #xF0) (fix:> b1 #x8F)) - (utf8-trailing-byte? b1) - (utf8-trailing-byte? b2) - (utf8-trailing-byte? b3))) - (error "Ill-formed UTF-8 sequence:" b0 b1 b2 b3)) - (let ((cp - (fix:or (fix:or (extract-bits b0 #x07 18) - (extract-bits b1 #x3F 12)) - (fix:or (extract-bits b2 #x3F 6) - (extract-bits b3 #x3F 0))))) - (guarantee-cp-in-range cp) - (guarantee-cp-is-character cp) - cp)) - (define-integrable (utf8-initial-byte-1? byte) (fix:= #x00 (fix:and #x80 byte))) (define-integrable (utf8-initial-byte-2? byte) - (fix:= #xC0 (fix:and #xE0 byte))) + (and (fix:>= byte #xC2) (fix:<= byte #xDF))) (define-integrable (utf8-initial-byte-3? byte) (fix:= #xE0 (fix:and #xF0 byte))) (define-integrable (utf8-initial-byte-4? byte) - (fix:= #xF0 (fix:and #xF8 byte))) - -(define-integrable (utf8-trailing-byte? byte) + (and (fix:>= byte #xF0) (fix:<= byte #xF4))) + +;; code-point range b0 +;; ------------------ ------ +;; U+000000..U+00007F 00..7F +(define-integrable (valid-utf8-sequence-1? b0) + (utf8-initial-byte-1? b0)) + +;; code-point range b0 b1 +;; ------------------ ------ ------ +;; U+000080..U+0007FF C2..DF 80..BF +(define-integrable (valid-utf8-sequence-2? b0 b1) + (and (utf8-initial-byte-2? b0) + (u8:80..BF? b1))) + +;; code-point range b0 b1 b2 +;; ------------------ ------ ------ ------ +;; U+000800..U+000FFF E0 A0..BF 80..BF +;; U+001000..U+00CFFF E1..EC 80..BF 80..BF +;; U+00D000..U+00D7FF ED 80..9F 80..BF +;; U+00E000..U+00FFFF EE..EF 80..BF 80..BF +(define-integrable (valid-utf8-sequence-3? b0 b1 b2) + (and (utf8-initial-byte-3? b0) + (cond ((fix:= b0 #xE0) (u8:A0..BF? b1)) + ((fix:< b0 #xED) (u8:80..BF? b1)) + ((fix:= b0 #xED) (u8:80..9F? b1)) + (else (u8:80..BF? b1))) + (u8:80..BF? b2))) + +;; code-point range b0 b1 b2 b3 +;; ------------------ ------ ------ ------ ------ +;; U+010000..U+03FFFF F0 90..BF 80..BF 80..BF +;; U+040000..U+0FFFFF F1..F3 80..BF 80..BF 80..BF +;; U+100000..U+10FFFF F4 80..8F 80..BF 80..BF +(define-integrable (valid-utf8-sequence-4? b0 b1 b2 b3) + (and (utf8-initial-byte-4? b0) + (cond ((fix:= b0 #xF0) (u8:90..BF? b1)) + ((fix:< b0 #xF4) (u8:80..BF? b1)) + (else (u8:80..8F? b1))) + (u8:80..BF? b2) + (u8:80..BF? b3))) + +;; Trailing bytes: + +(define-integrable (u8:80..8F? byte) + (fix:= #x80 (fix:and #xF0 byte))) + +(define-integrable (u8:80..9F? byte) + (fix:= #x80 (fix:and #xE0 byte))) + +(define-integrable (u8:80..BF? byte) (fix:= #x80 (fix:and #xC0 byte))) + +(define-integrable (u8:90..BF? byte) + (and (fix:>= byte #x90) (fix:<= byte #xBF))) + +(define-integrable (u8:A0..BF? byte) + (and (fix:>= byte #xA0) (fix:<= byte #xBF))) (define (initial-u16->utf16-char-length u16) (guarantee u16? u16 'initial-u16->utf16-char-length) - (if (utf16-high-surrogate? u16) 4 2)) + (if (utf16-low-surrogate? u16) + (error "Illegal initial UTF-16 unit:" u16)) + (if (utf16-high-surrogate? u16) + 4 + 2)) (define (utf16-char-decoder getter) (lambda (bytes index) - (let ((d0 (getter bytes index))) - (if (utf16-low-surrogate? d0) - (error "Ill-formed UTF-16 sequence:" d0)) - (let ((cp - (if (utf16-high-surrogate? d0) - (let ((d1 (getter bytes (fix:+ index 2)))) - (if (not (utf16-low-surrogate? d1)) - (error "Ill-formed UTF-16 sequence:" d0 d1)) - (fix:+ (fix:or (extract-bits d0 #x3FF 10) - (extract-bits d1 #x3FF 0)) - #x10000)) - d0))) - (guarantee-cp-in-range cp) - (guarantee-cp-is-character cp) - (integer->char cp))))) + (integer->char + (let ((d0 (getter bytes index))) + (if (utf16-low-surrogate? d0) + (error "Illegal initial UTF-16 unit:" d0)) + (if (utf16-high-surrogate? d0) + (let ((d1 (getter bytes (fix:+ index 2)))) + (if (not (utf16-low-surrogate? d1)) + (error "Ill-formed UTF-16 sequence:" d0 d1)) + (fix:+ (fix:or (extract-bits d0 #x3FF 10) + (extract-bits d1 #x3FF 0)) + #x10000)) + d0))))) (define decode-utf16be-char (utf16-char-decoder bytevector-u16be-ref)) @@ -609,16 +641,13 @@ USA. (utf16-char-decoder bytevector-u16le-ref)) (define (initial-u32->utf32-char-length u32) - (guarantee u32? u32 'initial-u32->utf32-char-length) + (guarantee unicode-scalar-value? u32 'initial-u32->utf32-char-length) 4) (define (utf32-char-decoder getter) (lambda (bytes index) (let ((u32 (getter bytes index))) - (if (not (< u32 char-code-limit)) - (error "Value is not a code point:" u32)) - (guarantee-cp-not-utf16-surrogate u32) - (guarantee-cp-is-character u32) + (guarantee unicode-scalar-value? u32 'utf32-char-decoder) (integer->char u32)))) (define decode-utf32be-char diff --git a/tests/runtime/test-char.scm b/tests/runtime/test-char.scm index 82b99ff92..43c72ccb8 100644 --- a/tests/runtime/test-char.scm +++ b/tests/runtime/test-char.scm @@ -50,16 +50,16 @@ USA. named-chars))) (define ascii-chars - '(#\u+00 #\u+01 #\u+02 #\u+03 #\u+04 #\u+05 #\u+06 #\u+07 - #\u+08 #\u+09 #\u+0A #\u+0B #\u+0C #\u+0D #\u+0E #\u+0F - #\u+10 #\u+11 #\u+12 #\u+13 #\u+14 #\u+15 #\u+16 #\u+17 - #\u+18 #\u+19 #\u+1A #\u+1B #\u+1C #\u+1D #\u+1E #\u+1F - #\u+20 #\! #\" #\# #\$ #\% #\& #\' #\( #\) #\* #\+ #\, #\- #\. #\/ + '(#\x00 #\x01 #\x02 #\x03 #\x04 #\x05 #\x06 #\x07 + #\x08 #\x09 #\x0A #\x0B #\x0C #\x0D #\x0E #\x0F + #\x10 #\x11 #\x12 #\x13 #\x14 #\x15 #\x16 #\x17 + #\x18 #\x19 #\x1A #\x1B #\x1C #\x1D #\x1E #\x1F + #\x20 #\! #\" #\# #\$ #\% #\& #\' #\( #\) #\* #\+ #\, #\- #\. #\/ #\0 #\1 #\2 #\3 #\4 #\5 #\6 #\7 #\8 #\9 #\: #\; #\< #\= #\> #\? #\@ #\A #\B #\C #\D #\E #\F #\G #\H #\I #\J #\K #\L #\M #\N #\O #\P #\Q #\R #\S #\T #\U #\V #\W #\X #\Y #\Z #\[ #\\ #\] #\^ #\_ #\` #\a #\b #\c #\d #\e #\f #\g #\h #\i #\j #\k #\l #\m #\n #\o #\p #\q - #\r #\s #\t #\u #\v #\w #\x #\y #\z #\{ #\| #\} #\~ #\u+7F)) + #\r #\s #\t #\u #\v #\w #\x #\y #\z #\{ #\| #\} #\~ #\x7F)) (define-test 'basic-ascii (lambda () @@ -99,31 +99,56 @@ USA. ascii-chars) ;; 2.1 First possible sequence of a certain length - (#u8(#x00) #\u+00000000) - (#u8(#xC2 #x80) #\u+00000080) - (#u8(#xE0 #xA0 #x80) #\u+00000800) - (#u8(#xF0 #x90 #x80 #x80) #\u+00010000) + (#u8(#x00) #\x00000000) + (#u8(#xC2 #x80) #\x00000080) + (#u8(#xE0 #xA0 #x80) #\x00000800) + (#u8(#xF0 #x90 #x80 #x80) #\x00010000) ;; 2.2 Last possible sequence of a certain length - (#u8(#x7F) #\u+0000007F) - (#u8(#xDF #xBF) #\u+000007FF) - (#u8(#xEF #xBF #xBD) #\u+0000FFFD) + (#u8(#x7F) #\x0000007F) + (#u8(#xDF #xBF) #\x000007FF) + (#u8(#xEF #xBF #xBD) #\x0000FFFD) + (#u8(#xEF #xBF #xBF) #\x0000FFFF) ;; 2.3 Other boundary conditions - (#u8(#xED #x9F #xBF) #\u+0000D7FF) - (#u8(#xEE #x80 #x80) #\u+0000E000) - (#u8(#xEF #xBF #xBD) #\u+0000FFFD) - (#u8(#xF4 #x8F #xBF #xBD) #\u+0010FFFD) + (#u8(#xED #x9F #xBF) #\x0000D7FF) + (#u8(#xEE #x80 #x80) #\x0000E000) + (#u8(#xEF #xBF #xBD) #\x0000FFFD) + (#u8(#xF4 #x8F #xBF #xBD) #\x0010FFFD) + (#u8(#xF4 #x8F #xBF #xBF) #\x0010FFFF) + + ;; 5.3 Noncharacter code positions + ;; Particularly problematic noncharacters in 16-bit applications: + (#u8(#xEF #xBF #xBE) #\xFFFE) + (#u8(#xEF #xBF #xBF) #\xFFFF) )) + +(define-test 'utf8-initial-byte + (lambda () + (for-each (lambda (b) + (if (memv b invalid-utf8-initial-bytes) + (assert-error + (lambda () (initial-byte->utf8-char-length b))) + (assert-= (cond ((< b #x80) 1) + ((< b #xE0) 2) + ((< b #xF0) 3) + (else 4)) + (initial-byte->utf8-char-length b)))) + (iota #x100)))) + +(define invalid-utf8-initial-bytes + (append (iota (- #xc2 #x80) #x80) + (iota (- #x100 #xf5) #xF5))) (define-test 'invalid-known-length-utf8-sequences (lambda () (for-each (lambda (entry) (let ((bytes (car entry)) (length (cadr entry))) - (assert-= length - (initial-byte->utf8-char-length - (bytevector-u8-ref bytes 0))) + (let ((b0 (bytevector-u8-ref bytes 0))) + (if (not (memv b0 invalid-utf8-initial-bytes)) + (assert-= length + (initial-byte->utf8-char-length b0)))) (assert-error (lambda () (decode-utf8-char bytes 0))))) invalid-known-length-sequences))) @@ -152,28 +177,23 @@ USA. ,@(map (lambda (bytes) (list bytes (+ (bytevector-length bytes) 1))) '( - #u8(#xC0) ; #\u+0000 - #u8(#xE0 #x80) ; #\u+0000 - #u8(#xF0 #x80 #x80) ; #\u+0000 - #u8(#xDF) ; #\u+000007FF - #u8(#xEF #xBF) ; #\u+0000FFFF - #u8(#xF7 #xBF #xBF) ; #\u+001FFFFF + #u8(#xC0) ; #\x0000 + #u8(#xE0 #x80) ; #\x0000 + #u8(#xF0 #x80 #x80) ; #\x0000 + #u8(#xDF) ; #\x000007FF + #u8(#xEF #xBF) ; #\x0000FFFF + #u8(#xF7 #xBF #xBF) ; #\x001FFFFF )) )) -(define-test 'illegal-chars +(define-test 'utf16-surrogates (lambda () (for-each (lambda (cp) + (value-assert unicode-code-point? "code point" cp) (value-assert (lambda (cp) (not (unicode-scalar-value? cp))) "non-scalar value" - cp)) - utf16-surrogates) - (for-each (lambda (cp) - (value-assert unicode-scalar-value? "scalar value" cp)) - illegal-characters) - (for-each (lambda (cp) - (value-assert unicode-code-point? "code point" cp) + cp) (let ((char (integer->char cp))) (value-assert (lambda (char) (not (unicode-char? char))) @@ -182,11 +202,24 @@ USA. (assert-error (lambda () (encode-utf8-char! (make-bytevector 16) 0 char))))) - (append utf16-surrogates illegal-characters)))) + utf16-surrogates))) (define utf16-surrogates (iota #x800 #xD800)) +(define-test 'illegal-chars + (lambda () + (for-each (lambda (cp) + (value-assert unicode-code-point? "code point" cp) + (value-assert unicode-scalar-value? "scalar value" cp) + (let ((char (integer->char cp))) + (value-assert (lambda (char) + (not (unicode-char? char))) + "non-unicode character" + char) + (encode-utf8-char! (make-bytevector 16) 0 char))) + illegal-characters))) + (define illegal-characters `( ;; Other noncharacters: @@ -207,13 +240,9 @@ USA. (define invalid-utf8-sequences `( - ;; 2.2 Last possible sequence of a certain length - #u8(#xEF #xBF #xBF) ; #\u+0000FFFF - #u8(#xF7 #xBF #xBF #xBF) ; #\u+001FFFFF - ;; 2.3 Other boundary conditions - #u8(#xF4 #x8F #xBF #xBF) ; #\u+0010FFFF - #u8(#xF4 #x90 #x80 #x80) ; #\u+00110000 + #u8(#xF4 #x90 #x80 #x80) ; #\x00110000 + #u8(#xF7 #xBF #xBF #xBF) ; #\x001FFFFF ;; 3.1 Unexpected continuation bytes ;; (duplicated below) @@ -244,31 +273,31 @@ USA. (iota #x02 #xFC)) ;; 3.3 Sequences with last continuation byte missing - #u8(#xF8 #x80 #x80 #x80) ; #\u+0000 - #u8(#xFC #x80 #x80 #x80 #x80) ; #\u+0000 - #u8(#xFB #xBF #xBF #xBF) ; #\u+03FFFFFF - #u8(#xFD #xBF #xBF #xBF #xBF) ; #\u+7FFFFFFF + #u8(#xF8 #x80 #x80 #x80) ; #\x0000 + #u8(#xFC #x80 #x80 #x80 #x80) ; #\x0000 + #u8(#xFB #xBF #xBF #xBF) ; #\x03FFFFFF + #u8(#xFD #xBF #xBF #xBF #xBF) ; #\x7FFFFFFF ;; 4.1 Examples of an overlong ASCII character - #u8(#xC0 #xAF) ; #\u+002F - #u8(#xE0 #x80 #xAF) ; #\u+002F - #u8(#xF0 #x80 #x80 #xAF) ; #\u+002F - #u8(#xF8 #x80 #x80 #x80 #xAF) ; #\u+002F - #u8(#xFC #x80 #x80 #x80 #x80 #xAF) ; #\u+002F + #u8(#xC0 #xAF) ; #\x002F + #u8(#xE0 #x80 #xAF) ; #\x002F + #u8(#xF0 #x80 #x80 #xAF) ; #\x002F + #u8(#xF8 #x80 #x80 #x80 #xAF) ; #\x002F + #u8(#xFC #x80 #x80 #x80 #x80 #xAF) ; #\x002F ;; 4.2 Maximum overlong sequences - #u8(#xC1 #xBF) ; #\u+0000007F - #u8(#xE0 #x9F #xBF) ; #\u+000007FF - #u8(#xF0 #x8F #xBF #xBF) ; #\u+0000FFFF - #u8(#xF8 #x87 #xBF #xBF #xBF) ; #\u+001FFFFF - #u8(#xFC #x83 #xBF #xBF #xBF #xBF) ; #\u+03FFFFFF + #u8(#xC1 #xBF) ; #\x0000007F + #u8(#xE0 #x9F #xBF) ; #\x000007FF + #u8(#xF0 #x8F #xBF #xBF) ; #\x0000FFFF + #u8(#xF8 #x87 #xBF #xBF #xBF) ; #\x001FFFFF + #u8(#xFC #x83 #xBF #xBF #xBF #xBF) ; #\x03FFFFFF ;; 4.3 Overlong representation of the NUL character - #u8(#xC0 #x80) ; #\u+0000 - #u8(#xE0 #x80 #x80) ; #\u+0000 - #u8(#xF0 #x80 #x80 #x80) ; #\u+0000 - #u8(#xF8 #x80 #x80 #x80 #x80) ; #\u+0000 - #u8(#xFC #x80 #x80 #x80 #x80 #x80) ; #\u+0000 + #u8(#xC0 #x80) ; #\x0000 + #u8(#xE0 #x80 #x80) ; #\x0000 + #u8(#xF0 #x80 #x80 #x80) ; #\x0000 + #u8(#xF8 #x80 #x80 #x80 #x80) ; #\x0000 + #u8(#xFC #x80 #x80 #x80 #x80 #x80) ; #\x0000 ;; 3.5 Impossible bytes #u8(#xFE) @@ -276,26 +305,21 @@ USA. #u8(#xFE #xFE #xFF #xFF) ;; 5.1 Single UTF-16 surrogates - #u8(#xED #xA0 #x80) ; #\u+D800 - #u8(#xED #xAD #xBF) ; #\u+DB7F - #u8(#xED #xAE #x80) ; #\u+DB80 - #u8(#xED #xAF #xBF) ; #\u+DBFF - #u8(#xED #xB0 #x80) ; #\u+DC00 - #u8(#xED #xBE #x80) ; #\u+DF80 - #u8(#xED #xBF #xBF) ; #\u+DFFF + #u8(#xED #xA0 #x80) ; #\xD800 + #u8(#xED #xAD #xBF) ; #\xDB7F + #u8(#xED #xAE #x80) ; #\xDB80 + #u8(#xED #xAF #xBF) ; #\xDBFF + #u8(#xED #xB0 #x80) ; #\xDC00 + #u8(#xED #xBE #x80) ; #\xDF80 + #u8(#xED #xBF #xBF) ; #\xDFFF ;; 5.2 Paired UTF-16 surrogates - ;; (#\u+D800 #\u+DC00 #u8(#xED #xA0 #x80 #xED #xB0 #x80)) - ;; (#\u+D800 #\u+DFFF #u8(#xED #xA0 #x80 #xED #xBF #xBF)) - ;; (#\u+DB7F #\u+DC00 #u8(#xED #xAD #xBF #xED #xB0 #x80)) - ;; (#\u+DB7F #\u+DFFF #u8(#xED #xAD #xBF #xED #xBF #xBF)) - ;; (#\u+DB80 #\u+DC00 #u8(#xED #xAE #x80 #xED #xB0 #x80)) - ;; (#\u+DB80 #\u+DFFF #u8(#xED #xAE #x80 #xED #xBF #xBF)) - ;; (#\u+DBFF #\u+DC00 #u8(#xED #xAF #xBF #xED #xB0 #x80)) - ;; (#\u+DBFF #\u+DFFF #u8(#xED #xAF #xBF #xED #xBF #xBF)) - - ;; 5.3 Noncharacter code positions - ;; Particularly problematic noncharacters in 16-bit applications: - #u8(#xEF #xBF #xBE) ; #\u+FFFE - #u8(#xEF #xBF #xBF) ; #\u+FFFF + ;; (#\xD800 #\xDC00 #u8(#xED #xA0 #x80 #xED #xB0 #x80)) + ;; (#\xD800 #\xDFFF #u8(#xED #xA0 #x80 #xED #xBF #xBF)) + ;; (#\xDB7F #\xDC00 #u8(#xED #xAD #xBF #xED #xB0 #x80)) + ;; (#\xDB7F #\xDFFF #u8(#xED #xAD #xBF #xED #xBF #xBF)) + ;; (#\xDB80 #\xDC00 #u8(#xED #xAE #x80 #xED #xB0 #x80)) + ;; (#\xDB80 #\xDFFF #u8(#xED #xAE #x80 #xED #xBF #xBF)) + ;; (#\xDBFF #\xDC00 #u8(#xED #xAF #xBF #xED #xB0 #x80)) + ;; (#\xDBFF #\xDFFF #u8(#xED #xAF #xBF #xED #xBF #xBF)) )) \ No newline at end of file