From d7cf9e8a8b14fe56e746dd7ddec03b3e67d6d4a9 Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Mon, 27 May 2019 16:21:08 +0000 Subject: [PATCH] Implement character replacement on ill-formed octet sequences. MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit - (utf8->string bv start end #t) now replaces by U+FFFD. Existing behaviour of (utf8->string bv [start end]) is unchanged so that utf8->string will fail noisily rather than quietly fail to be invertible by string->utf8 on certain inputs. - Generic I/O input now replaces ill-formed octet sequences by U+FFFD. TODO: Add (port/set-coding-error port ) for = replace or = error, perhaps. TODO: This does not exactly implement the replacement algorithm recommended as a best practice by Unicode 9, §3.9, pp. 127-129. That algorithm is inconveneint because our decoder is factored into (a) claiming a length based on the first code unit, and then (b) consuming exactly that many bytes; the algorithm requires us to refactor it so that part (b) can say `never mind' and consume fewer bytes than (a) requeste. --- src/runtime/bytevector.scm | 32 +++++++--- src/runtime/char.scm | 98 +++++++++++++++-------------- src/runtime/generic-io.scm | 15 +++-- src/runtime/runtime.pkg | 1 + tests/runtime/test-char.scm | 120 +++++++++++++++++++++++++++++++----- 5 files changed, 184 insertions(+), 82 deletions(-) diff --git a/src/runtime/bytevector.scm b/src/runtime/bytevector.scm index e11e91740..094c9e5d2 100644 --- a/src/runtime/bytevector.scm +++ b/src/runtime/bytevector.scm @@ -352,26 +352,38 @@ USA. (define-integrable (bytes-decoder getter initial->length decode-char step noun caller) - (lambda (bytevector #!optional start end) + (lambda (bytevector #!optional start end replace?) (let* ((end (fix:end-index end (bytevector-length bytevector) caller)) (start (fix:start-index start end caller)) (builder (string-builder))) (let ((truncated - (lambda (index) - (error (string "Truncated " noun " sequence:") - (bytevector-copy bytevector - index - (fix:min (fix:+ index 4) end)))))) + (if (or (default-object? replace?) (not replace?)) + (lambda (index) + (error (string "Truncated " noun " sequence:") + (bytevector-copy bytevector + index + (fix:min (fix:+ index 4) end)))) + (lambda (index) index char:replacement))) + (ill-formed + (if (or (default-object? replace?) (not replace?)) + (lambda (index) + (error (string "Ill-formed " noun " sequence:") + (bytevector-copy bytevector + index + (fix:min (fix:+ index 4) end)))) + (lambda (index) index char:replacement)))) (let loop ((index start)) (if (fix:<= (fix:+ index step) end) (let ((n (initial->length (getter bytevector index)))) (let ((index* (fix:+ index n))) - (if (not (fix:<= index* end)) - (truncated index)) - (builder (decode-char bytevector index)) + (builder + (if (not (fix:<= index* end)) + (truncated index) + (or (decode-char bytevector index) + (ill-formed index)))) (loop index*))) (if (fix:< index end) - (truncated index))))) + (builder (truncated index)))))) (builder)))) (define utf8->string) diff --git a/src/runtime/char.scm b/src/runtime/char.scm index 257d61ad0..251762e54 100644 --- a/src/runtime/char.scm +++ b/src/runtime/char.scm @@ -414,6 +414,8 @@ USA. (define-integrable (insert-bits word mask shift) (fix:and (fix:lsh word shift) mask)) + +(define char:replacement (integer->char #xfffd)) ;;;; UTF-{8,16,32} encoders @@ -499,43 +501,41 @@ USA. ((utf8-initial-byte-2? b0) 2) ((utf8-initial-byte-3? b0) 3) ((utf8-initial-byte-4? b0) 4) - (else (error "Illegal UTF-8 initial byte:" b0)))) + (else 1))) ;error, eat byte (define (next-char-length:utf8 bv bs be) (and (fix:<= (fix:+ bs 1) be) (initial-byte->utf8-char-length (bytevector-u8-ref bv bs)))) (define (decode-utf8-char bytes index) - (integer->char - (let ((b0 (bytevector-u8-ref bytes index))) - (cond ((utf8-initial-byte-1? b0) - b0) - ((utf8-initial-byte-2? b0) - (let ((b1 (bytevector-u8-ref bytes (fix:+ index 1)))) - (if (not (valid-utf8-sequence-2? b0 b1)) - (error "Ill-formed UTF-8 sequence:" b0 b1)) - (fix:or (extract-bits b0 #x1F 6) - (extract-bits b1 #x3F 0)))) - ((utf8-initial-byte-3? b0) - (let ((b1 (bytevector-u8-ref bytes (fix:+ index 1))) - (b2 (bytevector-u8-ref bytes (fix:+ index 2)))) - (if (not (valid-utf8-sequence-3? b0 b1 b2)) - (error "Ill-formed UTF-8 sequence:" b0 b1 b2)) - (fix:or (fix:or (extract-bits b0 #x0F 12) - (extract-bits b1 #x3F 6)) - (extract-bits b2 #x3F 0)))) - ((utf8-initial-byte-4? b0) - (let ((b1 (bytevector-u8-ref bytes (fix:+ index 1))) - (b2 (bytevector-u8-ref bytes (fix:+ index 2))) - (b3 (bytevector-u8-ref bytes (fix:+ index 3)))) - (if (not (valid-utf8-sequence-4? b0 b1 b2 b3)) - (error "Ill-formed UTF-8 sequence:" b0 b1 b2 b3)) - (fix:or (fix:or (extract-bits b0 #x07 18) - (extract-bits b1 #x3F 12)) - (fix:or (extract-bits b2 #x3F 6) - (extract-bits b3 #x3F 0))))) - (else - (error "Illegal UTF-8 initial byte:" b0)))))) + (let ((b0 (bytevector-u8-ref bytes index))) + (cond ((utf8-initial-byte-1? b0) + (integer->char b0)) + ((utf8-initial-byte-2? b0) + (let ((b1 (bytevector-u8-ref bytes (fix:+ index 1)))) + (and (valid-utf8-sequence-2? b0 b1) + (integer->char + (fix:or (extract-bits b0 #x1F 6) + (extract-bits b1 #x3F 0)))))) + ((utf8-initial-byte-3? b0) + (let ((b1 (bytevector-u8-ref bytes (fix:+ index 1))) + (b2 (bytevector-u8-ref bytes (fix:+ index 2)))) + (and (valid-utf8-sequence-3? b0 b1 b2) + (integer->char + (fix:or (fix:or (extract-bits b0 #x0F 12) + (extract-bits b1 #x3F 6)) + (extract-bits b2 #x3F 0)))))) + ((utf8-initial-byte-4? b0) + (let ((b1 (bytevector-u8-ref bytes (fix:+ index 1))) + (b2 (bytevector-u8-ref bytes (fix:+ index 2))) + (b3 (bytevector-u8-ref bytes (fix:+ index 3)))) + (and (valid-utf8-sequence-4? b0 b1 b2 b3) + (integer->char + (fix:or (fix:or (extract-bits b0 #x07 18) + (extract-bits b1 #x3F 12)) + (fix:or (extract-bits b2 #x3F 6) + (extract-bits b3 #x3F 0))))))) + (else #f)))) (define-integrable (utf8-initial-byte-1? byte) (fix:= #x00 (fix:and #x80 byte))) @@ -608,9 +608,8 @@ USA. (define (initial-u16->utf16-char-length u16) (guarantee u16? u16 'initial-u16->utf16-char-length) - (if (utf16-low-surrogate? u16) - (error "Illegal initial UTF-16 unit:" u16)) - (if (utf16-high-surrogate? u16) + (if (and (not (utf16-low-surrogate? u16)) + (utf16-high-surrogate? u16)) 4 2)) @@ -624,18 +623,16 @@ USA. (define (utf16-char-decoder getter) (lambda (bytes index) - (integer->char - (let ((d0 (getter bytes index))) - (if (utf16-low-surrogate? d0) - (error "Illegal initial UTF-16 unit:" d0)) - (if (utf16-high-surrogate? d0) - (let ((d1 (getter bytes (fix:+ index 2)))) - (if (not (utf16-low-surrogate? d1)) - (error "Ill-formed UTF-16 sequence:" d0 d1)) - (fix:+ (fix:or (extract-bits d0 #x3FF 10) - (extract-bits d1 #x3FF 0)) - #x10000)) - d0))))) + (let ((d0 (getter bytes index))) + (and (not (utf16-low-surrogate? d0)) + (if (utf16-high-surrogate? d0) + (let ((d1 (getter bytes (fix:+ index 2)))) + (and (utf16-low-surrogate? d1) + (integer->char + (fix:+ (fix:or (extract-bits d0 #x3FF 10) + (extract-bits d1 #x3FF 0)) + #x10000)))) + (integer->char d0)))))) (define decode-utf16be-char (utf16-char-decoder bytevector-u16be-ref)) @@ -644,7 +641,7 @@ USA. (utf16-char-decoder bytevector-u16le-ref)) (define (initial-u32->utf32-char-length u32) - (guarantee unicode-scalar-value? u32 'initial-u32->utf32-char-length) + (guarantee u32? u32 'initial-u32->utf32-char-length) 4) (define (next-char-length:utf32le bv bs be) @@ -658,8 +655,8 @@ USA. (define (utf32-char-decoder getter) (lambda (bytes index) (let ((u32 (getter bytes index))) - (guarantee unicode-scalar-value? u32 'utf32-char-decoder) - (integer->char u32)))) + (and (unicode-scalar-value? u32) + (integer->char u32))))) (define decode-utf32be-char (utf32-char-decoder bytevector-u32be-ref)) @@ -702,7 +699,8 @@ USA. (k #f bs) (let ((bs* (fix:+ bs n))) (k (and (fix:<= bs* be) - (decode-char bv bs)) + (or (decode-char bv bs) + char:replacement)) bs*))))))) (define-char-codec 'utf8 diff --git a/src/runtime/generic-io.scm b/src/runtime/generic-io.scm index 6052dfa31..16f3560a7 100644 --- a/src/runtime/generic-io.scm +++ b/src/runtime/generic-io.scm @@ -1354,7 +1354,8 @@ USA. (lambda (ib) (let ((n (initial-byte->utf8-char-length (peek-byte ib)))) (read-bytes! ib 0 n) - (decode-utf8-char (input-buffer-bytes ib) 0)))) + (or (decode-utf8-char (input-buffer-bytes ib) 0) + char:replacement)))) (define-encoder 'utf-8 (lambda (ob char) @@ -1372,7 +1373,8 @@ USA. (bytevector-u16be-ref (input-buffer-bytes ib) 0)))) (if (fix:> n 2) (read-bytes! ib 2 n)) - (decode-utf16be-char (input-buffer-bytes ib) 0)))) + (or (decode-utf16be-char (input-buffer-bytes ib) 0) + char:replacement)))) (define-decoder 'utf-16le (lambda (ib) @@ -1382,7 +1384,8 @@ USA. (bytevector-u16le-ref (input-buffer-bytes ib) 0)))) (if (fix:> n 2) (read-bytes! ib 2 n)) - (decode-utf16le-char (input-buffer-bytes ib) 0)))) + (or (decode-utf16le-char (input-buffer-bytes ib) 0) + char:replacement)))) (define-encoder 'utf-16be (lambda (ob char) @@ -1399,12 +1402,14 @@ USA. (define-decoder 'utf-32be (lambda (ib) (read-bytes! ib 0 4) - (decode-utf32be-char (input-buffer-bytes ib) 0))) + (or (decode-utf32be-char (input-buffer-bytes ib) 0) + char:replacement))) (define-decoder 'utf-32le (lambda (ib) (read-bytes! ib 0 4) - (decode-utf32le-char (input-buffer-bytes ib) 0))) + (or (decode-utf32le-char (input-buffer-bytes ib) 0) + char:replacement))) (define-encoder 'utf-32be (lambda (ob char) diff --git a/src/runtime/runtime.pkg b/src/runtime/runtime.pkg index 3bd89bf6d..f3aca8325 100644 --- a/src/runtime/runtime.pkg +++ b/src/runtime/runtime.pkg @@ -1361,6 +1361,7 @@ USA. char-utf16-byte-length char-utf32-byte-length char-utf8-byte-length + char:replacement decode-utf16be-char decode-utf16le-char decode-utf32be-char diff --git a/tests/runtime/test-char.scm b/tests/runtime/test-char.scm index 2d1af87ec..28a2f88a6 100644 --- a/tests/runtime/test-char.scm +++ b/tests/runtime/test-char.scm @@ -302,14 +302,12 @@ USA. (define-test 'utf8-initial-byte (lambda () (for-each (lambda (b) - (if (memv b invalid-utf8-initial-bytes) - (assert-error - (lambda () (initial-byte->utf8-char-length b))) - (assert-= (initial-byte->utf8-char-length b) - (cond ((< b #x80) 1) - ((< b #xE0) 2) - ((< b #xF0) 3) - (else 4))))) + (assert-= (initial-byte->utf8-char-length b) + (cond ((memv b invalid-utf8-initial-bytes) 1) + ((< b #x80) 1) + ((< b #xE0) 2) + ((< b #xF0) 3) + (else 4)))) (iota #x100)))) (define invalid-utf8-initial-bytes @@ -319,13 +317,16 @@ USA. (define-test 'invalid-known-length-utf8-sequences (lambda () (for-each (lambda (entry) - (let ((bytes (car entry)) - (length (cadr entry))) - (let ((b0 (bytevector-u8-ref bytes 0))) - (if (not (memv b0 invalid-utf8-initial-bytes)) - (assert-= (initial-byte->utf8-char-length b0) - length))) - (assert-error (lambda () (decode-utf8-char bytes 0))))) + (let ((bytes (car entry)) + (length (cadr entry))) + (let* ((b0 (bytevector-u8-ref bytes 0)) + (length* (initial-byte->utf8-char-length b0))) + (assert-= length* + (if (memv b0 invalid-utf8-initial-bytes) + 1 + length)) + (if (<= length* (bytevector-length bytes)) + (assert-false (decode-utf8-char bytes 0)))))) invalid-known-length-sequences))) (define invalid-known-length-sequences @@ -410,7 +411,7 @@ USA. (define-test 'invalid-utf8-sequences (lambda () (for-each (lambda (bytes) - (assert-error (lambda () (decode-utf8-char bytes 0)))) + (assert-false (decode-utf8-char bytes 0))) invalid-utf8-sequences))) (define invalid-utf8-sequences @@ -498,4 +499,89 @@ USA. ;; (#\xDB80 #\xDFFF #u8(#xED #xAE #x80 #xED #xBF #xBF)) ;; (#\xDBFF #\xDC00 #u8(#xED #xAF #xBF #xED #xB0 #x80)) ;; (#\xDBFF #\xDFFF #u8(#xED #xAF #xBF #xED #xBF #xBF)) - )) \ No newline at end of file + )) + +(define (decode-via-port coding octets) + (let* ((binary-port (open-input-bytevector octets)) + (textual-port (binary->textual-port binary-port))) + (port/set-coding textual-port coding) + (read-string (char-set) textual-port))) + +(define-test 'replacement-character/utf8 + (lambda () + (define octets + #u8(#x20 #b10000010 #x20 + #b11000010 0 #x20 + #b11100010 0 0 #x20 + #b11110010 0 0 0 #x20)) + (define n (bytevector-length octets)) + (define string " \xfffd; \xfffd; \xfffd; \xfffd; ") + (assert-error (lambda () (utf8->string octets))) + (assert-error (lambda () (utf8->string octets 0))) + (assert-error (lambda () (utf8->string octets 0 n))) + (assert-equal (utf8->string octets 0 n #t) string) + (assert-equal (decode-via-port 'UTF-8 octets) string))) + +(define-test 'replacement-character/utf16le + (lambda () + (define octets + #u8(#x20 0 + #x00 #xd8 0 0 + #x20 0 + 0 #xdf + #x20 0)) + (define n (bytevector-length octets)) + (define string " \xfffd; \xfffd; ") + (assert-error (lambda () (utf16le->string octets))) + (assert-error (lambda () (utf16le->string octets 0))) + (assert-error (lambda () (utf16le->string octets 0 n))) + (assert-equal (utf16le->string octets 0 n #t) string) + (assert-equal (decode-via-port 'UTF-16LE octets) string))) + +(define-test 'replacement-character/utf16be + (lambda () + (define octets + #u8(0 #x20 + #xd8 #x00 0 0 + 0 #x20 + #xdf 0 + 0 #x20)) + (define n (bytevector-length octets)) + (define string " \xfffd; \xfffd; ") + (assert-error (lambda () (utf16be->string octets))) + (assert-error (lambda () (utf16be->string octets 0))) + (assert-error (lambda () (utf16be->string octets 0 n))) + (assert-equal (utf16be->string octets 0 n #t) string) + (assert-equal (decode-via-port 'UTF-16BE octets) string))) + +(define-test 'replacement-character/utf32le + (lambda () + (define octets + #u8(#x20 0 0 0 + 0 #xd8 0 0 + #x20 0 0 0 + 0 #xdf 0 0 + #x20 0 0 0)) + (define n (bytevector-length octets)) + (define string " \xfffd; \xfffd; ") + (assert-error (lambda () (utf32le->string octets))) + (assert-error (lambda () (utf32le->string octets 0))) + (assert-error (lambda () (utf32le->string octets 0 n))) + (assert-equal (utf32le->string octets 0 n #t) string) + (assert-equal (decode-via-port 'UTF-32LE octets) string))) + +(define-test 'replacement-character:utf32be + (lambda () + (define octets + #u8(0 0 0 #x20 + 0 0 #xd8 0 + 0 0 0 #x20 + 0 0 #xdf 0 + 0 0 0 #x20)) + (define n (bytevector-length octets)) + (define string " \xfffd; \xfffd; ") + (assert-error (lambda () (utf32be->string octets))) + (assert-error (lambda () (utf32be->string octets 0))) + (assert-error (lambda () (utf32be->string octets 0 n))) + (assert-equal (utf32be->string octets 0 n #t) string) + (assert-equal (decode-via-port 'UTF-32BE octets) string))) \ No newline at end of file -- 2.25.1