From d7cf9e8a8b14fe56e746dd7ddec03b3e67d6d4a9 Mon Sep 17 00:00:00 2001
From: Taylor R Campbell <campbell@mumble.net>
Date: Mon, 27 May 2019 16:21:08 +0000
Subject: [PATCH] Implement character replacement on ill-formed octet
 sequences.
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

- (utf8->string bv start end #t) now replaces by U+FFFD.

  Existing behaviour of (utf8->string bv [start end]) is unchanged so
  that utf8->string will fail noisily rather than quietly fail to be
  invertible by string->utf8 on certain inputs.

- Generic I/O input now replaces ill-formed octet sequences by U+FFFD.

  TODO: Add (port/set-coding-error port <action>) for <action> =
  replace or <action> = error, perhaps.

TODO: This does not exactly implement the replacement algorithm
recommended as a best practice by Unicode 9, Â§3.9, pp. 127-129.  That
algorithm is inconveneint because our decoder is factored into (a)
claiming a length based on the first code unit, and then (b)
consuming exactly that many bytes; the algorithm requires us to
refactor it so that part (b) can say `never mind' and consume fewer
bytes than (a) requeste.
---
 src/runtime/bytevector.scm  |  32 +++++++---
 src/runtime/char.scm        |  98 +++++++++++++++--------------
 src/runtime/generic-io.scm  |  15 +++--
 src/runtime/runtime.pkg     |   1 +
 tests/runtime/test-char.scm | 120 +++++++++++++++++++++++++++++++-----
 5 files changed, 184 insertions(+), 82 deletions(-)
diff --git a/src/runtime/bytevector.scm b/src/runtime/bytevector.scm
index e11e91740..094c9e5d2 100644
--- a/src/runtime/bytevector.scm
+++ b/src/runtime/bytevector.scm
@@ -352,26 +352,38 @@ USA.
 
 (define-integrable (bytes-decoder getter initial->length decode-char step noun
 				  caller)
-  (lambda (bytevector #!optional start end)
+  (lambda (bytevector #!optional start end replace?)
     (let* ((end (fix:end-index end (bytevector-length bytevector) caller))
 	   (start (fix:start-index start end caller))
 	   (builder (string-builder)))
       (let ((truncated
-	     (lambda (index)
-	       (error (string "Truncated " noun " sequence:")
-		      (bytevector-copy bytevector
-				       index
-				       (fix:min (fix:+ index 4) end))))))
+	     (if (or (default-object? replace?) (not replace?))
+		 (lambda (index)
+		   (error (string "Truncated " noun " sequence:")
+			  (bytevector-copy bytevector
+					   index
+					   (fix:min (fix:+ index 4) end))))
+		 (lambda (index) index char:replacement)))
+	    (ill-formed
+	     (if (or (default-object? replace?) (not replace?))
+		 (lambda (index)
+		   (error (string "Ill-formed " noun " sequence:")
+			  (bytevector-copy bytevector
+					   index
+					   (fix:min (fix:+ index 4) end))))
+		 (lambda (index) index char:replacement))))
 	(let loop ((index start))
 	  (if (fix:<= (fix:+ index step) end)
 	      (let ((n (initial->length (getter bytevector index))))
 		(let ((index* (fix:+ index n)))
-		  (if (not (fix:<= index* end))
-		      (truncated index))
-		  (builder (decode-char bytevector index))
+		  (builder
+		   (if (not (fix:<= index* end))
+		       (truncated index)
+		       (or (decode-char bytevector index)
+			   (ill-formed index))))
 		  (loop index*)))
 	      (if (fix:< index end)
-		  (truncated index)))))
+		  (builder (truncated index))))))
       (builder))))
 
 (define utf8->string)
diff --git a/src/runtime/char.scm b/src/runtime/char.scm
index 257d61ad0..251762e54 100644
--- a/src/runtime/char.scm
+++ b/src/runtime/char.scm
@@ -414,6 +414,8 @@ USA.
 
 (define-integrable (insert-bits word mask shift)
   (fix:and (fix:lsh word shift) mask))
+
+(define char:replacement (integer->char #xfffd))
 
 ;;;; UTF-{8,16,32} encoders
 
@@ -499,43 +501,41 @@ USA.
 	((utf8-initial-byte-2? b0) 2)
 	((utf8-initial-byte-3? b0) 3)
 	((utf8-initial-byte-4? b0) 4)
-	(else (error "Illegal UTF-8 initial byte:" b0))))
+	(else 1)))			;error, eat byte
 
 (define (next-char-length:utf8 bv bs be)
   (and (fix:<= (fix:+ bs 1) be)
        (initial-byte->utf8-char-length (bytevector-u8-ref bv bs))))
 
 (define (decode-utf8-char bytes index)
-  (integer->char
-   (let ((b0 (bytevector-u8-ref bytes index)))
-     (cond ((utf8-initial-byte-1? b0)
-	    b0)
-	   ((utf8-initial-byte-2? b0)
-	    (let ((b1 (bytevector-u8-ref bytes (fix:+ index 1))))
-	      (if (not (valid-utf8-sequence-2? b0 b1))
-		  (error "Ill-formed UTF-8 sequence:" b0 b1))
-	      (fix:or (extract-bits b0 #x1F 6)
-		      (extract-bits b1 #x3F 0))))
-	   ((utf8-initial-byte-3? b0)
-	    (let ((b1 (bytevector-u8-ref bytes (fix:+ index 1)))
-		  (b2 (bytevector-u8-ref bytes (fix:+ index 2))))
-	      (if (not (valid-utf8-sequence-3? b0 b1 b2))
-		  (error "Ill-formed UTF-8 sequence:" b0 b1 b2))
-	      (fix:or (fix:or (extract-bits b0 #x0F 12)
-			      (extract-bits b1 #x3F 6))
-		      (extract-bits b2 #x3F 0))))
-	   ((utf8-initial-byte-4? b0)
-	    (let ((b1 (bytevector-u8-ref bytes (fix:+ index 1)))
-		  (b2 (bytevector-u8-ref bytes (fix:+ index 2)))
-		  (b3 (bytevector-u8-ref bytes (fix:+ index 3))))
-	      (if (not (valid-utf8-sequence-4? b0 b1 b2 b3))
-		  (error "Ill-formed UTF-8 sequence:" b0 b1 b2 b3))
-	      (fix:or (fix:or (extract-bits b0 #x07 18)
-			      (extract-bits b1 #x3F 12))
-		      (fix:or (extract-bits b2 #x3F 6)
-			      (extract-bits b3 #x3F 0)))))
-	   (else
-	    (error "Illegal UTF-8 initial byte:" b0))))))
+  (let ((b0 (bytevector-u8-ref bytes index)))
+    (cond ((utf8-initial-byte-1? b0)
+	   (integer->char b0))
+	  ((utf8-initial-byte-2? b0)
+	   (let ((b1 (bytevector-u8-ref bytes (fix:+ index 1))))
+	     (and (valid-utf8-sequence-2? b0 b1)
+		  (integer->char
+		   (fix:or (extract-bits b0 #x1F 6)
+			   (extract-bits b1 #x3F 0))))))
+	  ((utf8-initial-byte-3? b0)
+	   (let ((b1 (bytevector-u8-ref bytes (fix:+ index 1)))
+		 (b2 (bytevector-u8-ref bytes (fix:+ index 2))))
+	     (and (valid-utf8-sequence-3? b0 b1 b2)
+		  (integer->char
+		   (fix:or (fix:or (extract-bits b0 #x0F 12)
+				   (extract-bits b1 #x3F 6))
+			   (extract-bits b2 #x3F 0))))))
+	  ((utf8-initial-byte-4? b0)
+	   (let ((b1 (bytevector-u8-ref bytes (fix:+ index 1)))
+		 (b2 (bytevector-u8-ref bytes (fix:+ index 2)))
+		 (b3 (bytevector-u8-ref bytes (fix:+ index 3))))
+	     (and (valid-utf8-sequence-4? b0 b1 b2 b3)
+		  (integer->char
+		   (fix:or (fix:or (extract-bits b0 #x07 18)
+				   (extract-bits b1 #x3F 12))
+			   (fix:or (extract-bits b2 #x3F 6)
+				   (extract-bits b3 #x3F 0)))))))
+	  (else #f))))
 
 (define-integrable (utf8-initial-byte-1? byte)
   (fix:= #x00 (fix:and #x80 byte)))
@@ -608,9 +608,8 @@ USA.
 
 (define (initial-u16->utf16-char-length u16)
   (guarantee u16? u16 'initial-u16->utf16-char-length)
-  (if (utf16-low-surrogate? u16)
-      (error "Illegal initial UTF-16 unit:" u16))
-  (if (utf16-high-surrogate? u16)
+  (if (and (not (utf16-low-surrogate? u16))
+	   (utf16-high-surrogate? u16))
       4
       2))
 
@@ -624,18 +623,16 @@ USA.
 
 (define (utf16-char-decoder getter)
   (lambda (bytes index)
-    (integer->char
-     (let ((d0 (getter bytes index)))
-       (if (utf16-low-surrogate? d0)
-	   (error "Illegal initial UTF-16 unit:" d0))
-       (if (utf16-high-surrogate? d0)
-	   (let ((d1 (getter bytes (fix:+ index 2))))
-	     (if (not (utf16-low-surrogate? d1))
-		 (error "Ill-formed UTF-16 sequence:" d0 d1))
-	     (fix:+ (fix:or (extract-bits d0 #x3FF 10)
-			    (extract-bits d1 #x3FF 0))
-		    #x10000))
-	   d0)))))
+    (let ((d0 (getter bytes index)))
+      (and (not (utf16-low-surrogate? d0))
+	   (if (utf16-high-surrogate? d0)
+	       (let ((d1 (getter bytes (fix:+ index 2))))
+		 (and (utf16-low-surrogate? d1)
+		      (integer->char
+		       (fix:+ (fix:or (extract-bits d0 #x3FF 10)
+				      (extract-bits d1 #x3FF 0))
+			      #x10000))))
+	       (integer->char d0))))))
 
 (define decode-utf16be-char
   (utf16-char-decoder bytevector-u16be-ref))
@@ -644,7 +641,7 @@ USA.
   (utf16-char-decoder bytevector-u16le-ref))
 
 (define (initial-u32->utf32-char-length u32)
-  (guarantee unicode-scalar-value? u32 'initial-u32->utf32-char-length)
+  (guarantee u32? u32 'initial-u32->utf32-char-length)
   4)
 
 (define (next-char-length:utf32le bv bs be)
@@ -658,8 +655,8 @@ USA.
 (define (utf32-char-decoder getter)
   (lambda (bytes index)
     (let ((u32 (getter bytes index)))
-      (guarantee unicode-scalar-value? u32 'utf32-char-decoder)
-      (integer->char u32))))
+      (and (unicode-scalar-value? u32)
+	   (integer->char u32)))))
 
 (define decode-utf32be-char
   (utf32-char-decoder bytevector-u32be-ref))
@@ -702,7 +699,8 @@ USA.
 	   (k #f bs)
 	   (let ((bs* (fix:+ bs n)))
 	     (k (and (fix:<= bs* be)
-		     (decode-char bv bs))
+		     (or (decode-char bv bs)
+			 char:replacement))
 		bs*)))))))
 
 (define-char-codec 'utf8
diff --git a/src/runtime/generic-io.scm b/src/runtime/generic-io.scm
index 6052dfa31..16f3560a7 100644
--- a/src/runtime/generic-io.scm
+++ b/src/runtime/generic-io.scm
@@ -1354,7 +1354,8 @@ USA.
   (lambda (ib)
     (let ((n (initial-byte->utf8-char-length (peek-byte ib))))
       (read-bytes! ib 0 n)
-      (decode-utf8-char (input-buffer-bytes ib) 0))))
+      (or (decode-utf8-char (input-buffer-bytes ib) 0)
+	  char:replacement))))
 
 (define-encoder 'utf-8
   (lambda (ob char)
@@ -1372,7 +1373,8 @@ USA.
 	    (bytevector-u16be-ref (input-buffer-bytes ib) 0))))
       (if (fix:> n 2)
 	  (read-bytes! ib 2 n))
-      (decode-utf16be-char (input-buffer-bytes ib) 0))))
+      (or (decode-utf16be-char (input-buffer-bytes ib) 0)
+	  char:replacement))))
 
 (define-decoder 'utf-16le
   (lambda (ib)
@@ -1382,7 +1384,8 @@ USA.
 	    (bytevector-u16le-ref (input-buffer-bytes ib) 0))))
       (if (fix:> n 2)
 	  (read-bytes! ib 2 n))
-      (decode-utf16le-char (input-buffer-bytes ib) 0))))
+      (or (decode-utf16le-char (input-buffer-bytes ib) 0)
+	  char:replacement))))
 
 (define-encoder 'utf-16be
   (lambda (ob char)
@@ -1399,12 +1402,14 @@ USA.
 (define-decoder 'utf-32be
   (lambda (ib)
     (read-bytes! ib 0 4)
-    (decode-utf32be-char (input-buffer-bytes ib) 0)))
+    (or (decode-utf32be-char (input-buffer-bytes ib) 0)
+	char:replacement)))
 
 (define-decoder 'utf-32le
   (lambda (ib)
     (read-bytes! ib 0 4)
-    (decode-utf32le-char (input-buffer-bytes ib) 0)))
+    (or (decode-utf32le-char (input-buffer-bytes ib) 0)
+	char:replacement)))
 
 (define-encoder 'utf-32be
   (lambda (ob char)
diff --git a/src/runtime/runtime.pkg b/src/runtime/runtime.pkg
index 3bd89bf6d..f3aca8325 100644
--- a/src/runtime/runtime.pkg
+++ b/src/runtime/runtime.pkg
@@ -1361,6 +1361,7 @@ USA.
 	  char-utf16-byte-length
 	  char-utf32-byte-length
 	  char-utf8-byte-length
+	  char:replacement
 	  decode-utf16be-char
 	  decode-utf16le-char
 	  decode-utf32be-char
diff --git a/tests/runtime/test-char.scm b/tests/runtime/test-char.scm
index 2d1af87ec..28a2f88a6 100644
--- a/tests/runtime/test-char.scm
+++ b/tests/runtime/test-char.scm
@@ -302,14 +302,12 @@ USA.
 (define-test 'utf8-initial-byte
   (lambda ()
     (for-each (lambda (b)
-                (if (memv b invalid-utf8-initial-bytes)
-                    (assert-error
-                     (lambda () (initial-byte->utf8-char-length b)))
-                    (assert-= (initial-byte->utf8-char-length b)
-                              (cond ((< b #x80) 1)
-                                    ((< b #xE0) 2)
-                                    ((< b #xF0) 3)
-                                    (else 4)))))
+                (assert-= (initial-byte->utf8-char-length b)
+                          (cond ((memv b invalid-utf8-initial-bytes) 1)
+                                ((< b #x80) 1)
+                                ((< b #xE0) 2)
+                                ((< b #xF0) 3)
+                                (else 4))))
               (iota #x100))))
 
 (define invalid-utf8-initial-bytes
@@ -319,13 +317,16 @@ USA.
 (define-test 'invalid-known-length-utf8-sequences
   (lambda ()
     (for-each (lambda (entry)
-		(let ((bytes (car entry))
-		      (length (cadr entry)))
-                  (let ((b0 (bytevector-u8-ref bytes 0)))
-                    (if (not (memv b0 invalid-utf8-initial-bytes))
-                        (assert-= (initial-byte->utf8-char-length b0)
-                                  length)))
-		  (assert-error (lambda () (decode-utf8-char bytes 0)))))
+                (let ((bytes (car entry))
+                      (length (cadr entry)))
+                  (let* ((b0 (bytevector-u8-ref bytes 0))
+                         (length* (initial-byte->utf8-char-length b0)))
+                    (assert-= length*
+                              (if (memv b0 invalid-utf8-initial-bytes)
+                                  1
+                                  length))
+                    (if (<= length* (bytevector-length bytes))
+                        (assert-false (decode-utf8-char bytes 0))))))
               invalid-known-length-sequences)))
 
 (define invalid-known-length-sequences
@@ -410,7 +411,7 @@ USA.
 (define-test 'invalid-utf8-sequences
   (lambda ()
     (for-each (lambda (bytes)
-                (assert-error (lambda () (decode-utf8-char bytes 0))))
+                (assert-false (decode-utf8-char bytes 0)))
               invalid-utf8-sequences)))
 
 (define invalid-utf8-sequences
@@ -498,4 +499,89 @@ USA.
     ;; (#\xDB80 #\xDFFF #u8(#xED #xAE #x80 #xED #xBF #xBF))
     ;; (#\xDBFF #\xDC00 #u8(#xED #xAF #xBF #xED #xB0 #x80))
     ;; (#\xDBFF #\xDFFF #u8(#xED #xAF #xBF #xED #xBF #xBF))
-    ))
\ No newline at end of file
+    ))
+
+(define (decode-via-port coding octets)
+  (let* ((binary-port (open-input-bytevector octets))
+         (textual-port (binary->textual-port binary-port)))
+    (port/set-coding textual-port coding)
+    (read-string (char-set) textual-port)))
+
+(define-test 'replacement-character/utf8
+  (lambda ()
+    (define octets
+      #u8(#x20 #b10000010 #x20
+               #b11000010 0 #x20
+               #b11100010 0 0 #x20
+               #b11110010 0 0 0 #x20))
+    (define n (bytevector-length octets))
+    (define string " \xfffd; \xfffd; \xfffd; \xfffd; ")
+    (assert-error (lambda () (utf8->string octets)))
+    (assert-error (lambda () (utf8->string octets 0)))
+    (assert-error (lambda () (utf8->string octets 0 n)))
+    (assert-equal (utf8->string octets 0 n #t) string)
+    (assert-equal (decode-via-port 'UTF-8 octets) string)))
+
+(define-test 'replacement-character/utf16le
+  (lambda ()
+    (define octets
+      #u8(#x20 0
+          #x00 #xd8 0 0
+          #x20 0
+          0 #xdf
+          #x20 0))
+    (define n (bytevector-length octets))
+    (define string " \xfffd; \xfffd; ")
+    (assert-error (lambda () (utf16le->string octets)))
+    (assert-error (lambda () (utf16le->string octets 0)))
+    (assert-error (lambda () (utf16le->string octets 0 n)))
+    (assert-equal (utf16le->string octets 0 n #t) string)
+    (assert-equal (decode-via-port 'UTF-16LE octets) string)))
+
+(define-test 'replacement-character/utf16be
+  (lambda ()
+    (define octets
+      #u8(0 #x20
+          #xd8 #x00 0 0
+          0 #x20
+          #xdf 0
+          0 #x20))
+    (define n (bytevector-length octets))
+    (define string " \xfffd; \xfffd; ")
+    (assert-error (lambda () (utf16be->string octets)))
+    (assert-error (lambda () (utf16be->string octets 0)))
+    (assert-error (lambda () (utf16be->string octets 0 n)))
+    (assert-equal (utf16be->string octets 0 n #t) string)
+    (assert-equal (decode-via-port 'UTF-16BE octets) string)))
+
+(define-test 'replacement-character/utf32le
+  (lambda ()
+    (define octets
+      #u8(#x20 0 0 0
+          0 #xd8 0 0
+          #x20 0 0 0
+          0 #xdf 0 0
+          #x20 0 0 0))
+    (define n (bytevector-length octets))
+    (define string " \xfffd; \xfffd; ")
+    (assert-error (lambda () (utf32le->string octets)))
+    (assert-error (lambda () (utf32le->string octets 0)))
+    (assert-error (lambda () (utf32le->string octets 0 n)))
+    (assert-equal (utf32le->string octets 0 n #t) string)
+    (assert-equal (decode-via-port 'UTF-32LE octets) string)))
+
+(define-test 'replacement-character:utf32be
+  (lambda ()
+    (define octets
+      #u8(0 0 0 #x20
+          0 0 #xd8 0
+          0 0 0 #x20
+          0 0 #xdf 0
+          0 0 0 #x20))
+    (define n (bytevector-length octets))
+    (define string " \xfffd; \xfffd; ")
+    (assert-error (lambda () (utf32be->string octets)))
+    (assert-error (lambda () (utf32be->string octets 0)))
+    (assert-error (lambda () (utf32be->string octets 0 n)))
+    (assert-equal (utf32be->string octets 0 n #t) string)
+    (assert-equal (decode-via-port 'UTF-32BE octets) string)))
\ No newline at end of file
-- 
2.25.1