- (utf8->string bv start end #t) now replaces by U+FFFD.
Existing behaviour of (utf8->string bv [start end]) is unchanged so
that utf8->string will fail noisily rather than quietly fail to be
invertible by string->utf8 on certain inputs.
- Generic I/O input now replaces ill-formed octet sequences by U+FFFD.
TODO: Add (port/set-coding-error port <action>) for <action> =
replace or <action> = error, perhaps.
TODO: This does not exactly implement the replacement algorithm
recommended as a best practice by Unicode 9, ยง3.9, pp. 127-129. That
algorithm is inconveneint because our decoder is factored into (a)
claiming a length based on the first code unit, and then (b)
consuming exactly that many bytes; the algorithm requires us to
refactor it so that part (b) can say `never mind' and consume fewer
bytes than (a) requeste.
\f
(define-integrable (bytes-decoder getter initial->length decode-char step noun
caller)
- (lambda (bytevector #!optional start end)
+ (lambda (bytevector #!optional start end replace?)
(let* ((end (fix:end-index end (bytevector-length bytevector) caller))
(start (fix:start-index start end caller))
(builder (string-builder)))
(let ((truncated
- (lambda (index)
- (error (string "Truncated " noun " sequence:")
- (bytevector-copy bytevector
- index
- (fix:min (fix:+ index 4) end))))))
+ (if (or (default-object? replace?) (not replace?))
+ (lambda (index)
+ (error (string "Truncated " noun " sequence:")
+ (bytevector-copy bytevector
+ index
+ (fix:min (fix:+ index 4) end))))
+ (lambda (index) index char:replacement)))
+ (ill-formed
+ (if (or (default-object? replace?) (not replace?))
+ (lambda (index)
+ (error (string "Ill-formed " noun " sequence:")
+ (bytevector-copy bytevector
+ index
+ (fix:min (fix:+ index 4) end))))
+ (lambda (index) index char:replacement))))
(let loop ((index start))
(if (fix:<= (fix:+ index step) end)
(let ((n (initial->length (getter bytevector index))))
(let ((index* (fix:+ index n)))
- (if (not (fix:<= index* end))
- (truncated index))
- (builder (decode-char bytevector index))
+ (builder
+ (if (not (fix:<= index* end))
+ (truncated index)
+ (or (decode-char bytevector index)
+ (ill-formed index))))
(loop index*)))
(if (fix:< index end)
- (truncated index)))))
+ (builder (truncated index))))))
(builder))))
(define utf8->string)
(define-integrable (insert-bits word mask shift)
(fix:and (fix:lsh word shift) mask))
+
+(define char:replacement (integer->char #xfffd))
\f
;;;; UTF-{8,16,32} encoders
((utf8-initial-byte-2? b0) 2)
((utf8-initial-byte-3? b0) 3)
((utf8-initial-byte-4? b0) 4)
- (else (error "Illegal UTF-8 initial byte:" b0))))
+ (else 1))) ;error, eat byte
(define (next-char-length:utf8 bv bs be)
(and (fix:<= (fix:+ bs 1) be)
(initial-byte->utf8-char-length (bytevector-u8-ref bv bs))))
(define (decode-utf8-char bytes index)
- (integer->char
- (let ((b0 (bytevector-u8-ref bytes index)))
- (cond ((utf8-initial-byte-1? b0)
- b0)
- ((utf8-initial-byte-2? b0)
- (let ((b1 (bytevector-u8-ref bytes (fix:+ index 1))))
- (if (not (valid-utf8-sequence-2? b0 b1))
- (error "Ill-formed UTF-8 sequence:" b0 b1))
- (fix:or (extract-bits b0 #x1F 6)
- (extract-bits b1 #x3F 0))))
- ((utf8-initial-byte-3? b0)
- (let ((b1 (bytevector-u8-ref bytes (fix:+ index 1)))
- (b2 (bytevector-u8-ref bytes (fix:+ index 2))))
- (if (not (valid-utf8-sequence-3? b0 b1 b2))
- (error "Ill-formed UTF-8 sequence:" b0 b1 b2))
- (fix:or (fix:or (extract-bits b0 #x0F 12)
- (extract-bits b1 #x3F 6))
- (extract-bits b2 #x3F 0))))
- ((utf8-initial-byte-4? b0)
- (let ((b1 (bytevector-u8-ref bytes (fix:+ index 1)))
- (b2 (bytevector-u8-ref bytes (fix:+ index 2)))
- (b3 (bytevector-u8-ref bytes (fix:+ index 3))))
- (if (not (valid-utf8-sequence-4? b0 b1 b2 b3))
- (error "Ill-formed UTF-8 sequence:" b0 b1 b2 b3))
- (fix:or (fix:or (extract-bits b0 #x07 18)
- (extract-bits b1 #x3F 12))
- (fix:or (extract-bits b2 #x3F 6)
- (extract-bits b3 #x3F 0)))))
- (else
- (error "Illegal UTF-8 initial byte:" b0))))))
+ (let ((b0 (bytevector-u8-ref bytes index)))
+ (cond ((utf8-initial-byte-1? b0)
+ (integer->char b0))
+ ((utf8-initial-byte-2? b0)
+ (let ((b1 (bytevector-u8-ref bytes (fix:+ index 1))))
+ (and (valid-utf8-sequence-2? b0 b1)
+ (integer->char
+ (fix:or (extract-bits b0 #x1F 6)
+ (extract-bits b1 #x3F 0))))))
+ ((utf8-initial-byte-3? b0)
+ (let ((b1 (bytevector-u8-ref bytes (fix:+ index 1)))
+ (b2 (bytevector-u8-ref bytes (fix:+ index 2))))
+ (and (valid-utf8-sequence-3? b0 b1 b2)
+ (integer->char
+ (fix:or (fix:or (extract-bits b0 #x0F 12)
+ (extract-bits b1 #x3F 6))
+ (extract-bits b2 #x3F 0))))))
+ ((utf8-initial-byte-4? b0)
+ (let ((b1 (bytevector-u8-ref bytes (fix:+ index 1)))
+ (b2 (bytevector-u8-ref bytes (fix:+ index 2)))
+ (b3 (bytevector-u8-ref bytes (fix:+ index 3))))
+ (and (valid-utf8-sequence-4? b0 b1 b2 b3)
+ (integer->char
+ (fix:or (fix:or (extract-bits b0 #x07 18)
+ (extract-bits b1 #x3F 12))
+ (fix:or (extract-bits b2 #x3F 6)
+ (extract-bits b3 #x3F 0)))))))
+ (else #f))))
(define-integrable (utf8-initial-byte-1? byte)
(fix:= #x00 (fix:and #x80 byte)))
\f
(define (initial-u16->utf16-char-length u16)
(guarantee u16? u16 'initial-u16->utf16-char-length)
- (if (utf16-low-surrogate? u16)
- (error "Illegal initial UTF-16 unit:" u16))
- (if (utf16-high-surrogate? u16)
+ (if (and (not (utf16-low-surrogate? u16))
+ (utf16-high-surrogate? u16))
4
2))
(define (utf16-char-decoder getter)
(lambda (bytes index)
- (integer->char
- (let ((d0 (getter bytes index)))
- (if (utf16-low-surrogate? d0)
- (error "Illegal initial UTF-16 unit:" d0))
- (if (utf16-high-surrogate? d0)
- (let ((d1 (getter bytes (fix:+ index 2))))
- (if (not (utf16-low-surrogate? d1))
- (error "Ill-formed UTF-16 sequence:" d0 d1))
- (fix:+ (fix:or (extract-bits d0 #x3FF 10)
- (extract-bits d1 #x3FF 0))
- #x10000))
- d0)))))
+ (let ((d0 (getter bytes index)))
+ (and (not (utf16-low-surrogate? d0))
+ (if (utf16-high-surrogate? d0)
+ (let ((d1 (getter bytes (fix:+ index 2))))
+ (and (utf16-low-surrogate? d1)
+ (integer->char
+ (fix:+ (fix:or (extract-bits d0 #x3FF 10)
+ (extract-bits d1 #x3FF 0))
+ #x10000))))
+ (integer->char d0))))))
(define decode-utf16be-char
(utf16-char-decoder bytevector-u16be-ref))
(utf16-char-decoder bytevector-u16le-ref))
(define (initial-u32->utf32-char-length u32)
- (guarantee unicode-scalar-value? u32 'initial-u32->utf32-char-length)
+ (guarantee u32? u32 'initial-u32->utf32-char-length)
4)
(define (next-char-length:utf32le bv bs be)
(define (utf32-char-decoder getter)
(lambda (bytes index)
(let ((u32 (getter bytes index)))
- (guarantee unicode-scalar-value? u32 'utf32-char-decoder)
- (integer->char u32))))
+ (and (unicode-scalar-value? u32)
+ (integer->char u32)))))
(define decode-utf32be-char
(utf32-char-decoder bytevector-u32be-ref))
(k #f bs)
(let ((bs* (fix:+ bs n)))
(k (and (fix:<= bs* be)
- (decode-char bv bs))
+ (or (decode-char bv bs)
+ char:replacement))
bs*)))))))
(define-char-codec 'utf8
(lambda (ib)
(let ((n (initial-byte->utf8-char-length (peek-byte ib))))
(read-bytes! ib 0 n)
- (decode-utf8-char (input-buffer-bytes ib) 0))))
+ (or (decode-utf8-char (input-buffer-bytes ib) 0)
+ char:replacement))))
(define-encoder 'utf-8
(lambda (ob char)
(bytevector-u16be-ref (input-buffer-bytes ib) 0))))
(if (fix:> n 2)
(read-bytes! ib 2 n))
- (decode-utf16be-char (input-buffer-bytes ib) 0))))
+ (or (decode-utf16be-char (input-buffer-bytes ib) 0)
+ char:replacement))))
(define-decoder 'utf-16le
(lambda (ib)
(bytevector-u16le-ref (input-buffer-bytes ib) 0))))
(if (fix:> n 2)
(read-bytes! ib 2 n))
- (decode-utf16le-char (input-buffer-bytes ib) 0))))
+ (or (decode-utf16le-char (input-buffer-bytes ib) 0)
+ char:replacement))))
(define-encoder 'utf-16be
(lambda (ob char)
(define-decoder 'utf-32be
(lambda (ib)
(read-bytes! ib 0 4)
- (decode-utf32be-char (input-buffer-bytes ib) 0)))
+ (or (decode-utf32be-char (input-buffer-bytes ib) 0)
+ char:replacement)))
(define-decoder 'utf-32le
(lambda (ib)
(read-bytes! ib 0 4)
- (decode-utf32le-char (input-buffer-bytes ib) 0)))
+ (or (decode-utf32le-char (input-buffer-bytes ib) 0)
+ char:replacement)))
(define-encoder 'utf-32be
(lambda (ob char)
char-utf16-byte-length
char-utf32-byte-length
char-utf8-byte-length
+ char:replacement
decode-utf16be-char
decode-utf16le-char
decode-utf32be-char
(define-test 'utf8-initial-byte
(lambda ()
(for-each (lambda (b)
- (if (memv b invalid-utf8-initial-bytes)
- (assert-error
- (lambda () (initial-byte->utf8-char-length b)))
- (assert-= (initial-byte->utf8-char-length b)
- (cond ((< b #x80) 1)
- ((< b #xE0) 2)
- ((< b #xF0) 3)
- (else 4)))))
+ (assert-= (initial-byte->utf8-char-length b)
+ (cond ((memv b invalid-utf8-initial-bytes) 1)
+ ((< b #x80) 1)
+ ((< b #xE0) 2)
+ ((< b #xF0) 3)
+ (else 4))))
(iota #x100))))
(define invalid-utf8-initial-bytes
(define-test 'invalid-known-length-utf8-sequences
(lambda ()
(for-each (lambda (entry)
- (let ((bytes (car entry))
- (length (cadr entry)))
- (let ((b0 (bytevector-u8-ref bytes 0)))
- (if (not (memv b0 invalid-utf8-initial-bytes))
- (assert-= (initial-byte->utf8-char-length b0)
- length)))
- (assert-error (lambda () (decode-utf8-char bytes 0)))))
+ (let ((bytes (car entry))
+ (length (cadr entry)))
+ (let* ((b0 (bytevector-u8-ref bytes 0))
+ (length* (initial-byte->utf8-char-length b0)))
+ (assert-= length*
+ (if (memv b0 invalid-utf8-initial-bytes)
+ 1
+ length))
+ (if (<= length* (bytevector-length bytes))
+ (assert-false (decode-utf8-char bytes 0))))))
invalid-known-length-sequences)))
(define invalid-known-length-sequences
(define-test 'invalid-utf8-sequences
(lambda ()
(for-each (lambda (bytes)
- (assert-error (lambda () (decode-utf8-char bytes 0))))
+ (assert-false (decode-utf8-char bytes 0)))
invalid-utf8-sequences)))
(define invalid-utf8-sequences
;; (#\xDB80 #\xDFFF #u8(#xED #xAE #x80 #xED #xBF #xBF))
;; (#\xDBFF #\xDC00 #u8(#xED #xAF #xBF #xED #xB0 #x80))
;; (#\xDBFF #\xDFFF #u8(#xED #xAF #xBF #xED #xBF #xBF))
- ))
\ No newline at end of file
+ ))
+\f
+(define (decode-via-port coding octets)
+ (let* ((binary-port (open-input-bytevector octets))
+ (textual-port (binary->textual-port binary-port)))
+ (port/set-coding textual-port coding)
+ (read-string (char-set) textual-port)))
+
+(define-test 'replacement-character/utf8
+ (lambda ()
+ (define octets
+ #u8(#x20 #b10000010 #x20
+ #b11000010 0 #x20
+ #b11100010 0 0 #x20
+ #b11110010 0 0 0 #x20))
+ (define n (bytevector-length octets))
+ (define string " \xfffd; \xfffd; \xfffd; \xfffd; ")
+ (assert-error (lambda () (utf8->string octets)))
+ (assert-error (lambda () (utf8->string octets 0)))
+ (assert-error (lambda () (utf8->string octets 0 n)))
+ (assert-equal (utf8->string octets 0 n #t) string)
+ (assert-equal (decode-via-port 'UTF-8 octets) string)))
+
+(define-test 'replacement-character/utf16le
+ (lambda ()
+ (define octets
+ #u8(#x20 0
+ #x00 #xd8 0 0
+ #x20 0
+ 0 #xdf
+ #x20 0))
+ (define n (bytevector-length octets))
+ (define string " \xfffd; \xfffd; ")
+ (assert-error (lambda () (utf16le->string octets)))
+ (assert-error (lambda () (utf16le->string octets 0)))
+ (assert-error (lambda () (utf16le->string octets 0 n)))
+ (assert-equal (utf16le->string octets 0 n #t) string)
+ (assert-equal (decode-via-port 'UTF-16LE octets) string)))
+
+(define-test 'replacement-character/utf16be
+ (lambda ()
+ (define octets
+ #u8(0 #x20
+ #xd8 #x00 0 0
+ 0 #x20
+ #xdf 0
+ 0 #x20))
+ (define n (bytevector-length octets))
+ (define string " \xfffd; \xfffd; ")
+ (assert-error (lambda () (utf16be->string octets)))
+ (assert-error (lambda () (utf16be->string octets 0)))
+ (assert-error (lambda () (utf16be->string octets 0 n)))
+ (assert-equal (utf16be->string octets 0 n #t) string)
+ (assert-equal (decode-via-port 'UTF-16BE octets) string)))
+
+(define-test 'replacement-character/utf32le
+ (lambda ()
+ (define octets
+ #u8(#x20 0 0 0
+ 0 #xd8 0 0
+ #x20 0 0 0
+ 0 #xdf 0 0
+ #x20 0 0 0))
+ (define n (bytevector-length octets))
+ (define string " \xfffd; \xfffd; ")
+ (assert-error (lambda () (utf32le->string octets)))
+ (assert-error (lambda () (utf32le->string octets 0)))
+ (assert-error (lambda () (utf32le->string octets 0 n)))
+ (assert-equal (utf32le->string octets 0 n #t) string)
+ (assert-equal (decode-via-port 'UTF-32LE octets) string)))
+
+(define-test 'replacement-character:utf32be
+ (lambda ()
+ (define octets
+ #u8(0 0 0 #x20
+ 0 0 #xd8 0
+ 0 0 0 #x20
+ 0 0 #xdf 0
+ 0 0 0 #x20))
+ (define n (bytevector-length octets))
+ (define string " \xfffd; \xfffd; ")
+ (assert-error (lambda () (utf32be->string octets)))
+ (assert-error (lambda () (utf32be->string octets 0)))
+ (assert-error (lambda () (utf32be->string octets 0 n)))
+ (assert-equal (utf32be->string octets 0 n #t) string)
+ (assert-equal (decode-via-port 'UTF-32BE octets) string)))
\ No newline at end of file