From 9cbcc4bc08c5721464039a79b8af60388f31cd28 Mon Sep 17 00:00:00 2001
From: Chris Hanson <org/chris-hanson/cph>
Date: Wed, 18 Jan 2017 03:00:08 -0800
Subject: [PATCH] Implement UTF-X codecs for chars and strings.

---
 src/runtime/bytevector.scm | 277 +++++++++++++--------------
 src/runtime/char.scm       | 378 ++++++++++++++++++++++++++-----------
 src/runtime/make.scm       |   1 +
 src/runtime/runtime.pkg    |  26 ++-
 4 files changed, 419 insertions(+), 263 deletions(-)

diff --git a/src/runtime/bytevector.scm b/src/runtime/bytevector.scm
index 64247c008..6180def91 100644
--- a/src/runtime/bytevector.scm
+++ b/src/runtime/bytevector.scm
@@ -245,150 +245,135 @@ USA.
 			       index-fixnum?
 			       exact-nonnegative-integer?)))
 
-(define (string->utf8 string #!optional start end)
-  (guarantee string? string 'string->utf8)
-  (let* ((end
-	  (if (default-object? end)
-	      (string-length string)
-	      (begin
-		(guarantee index-fixnum? end 'string->utf8)
-		(if (not (fix:<= end (string-length string)))
-		    (error:bad-range-argument end 'string->utf8))
-		end)))
-	 (start
-	  (if (default-object? start)
-	      0
-	      (begin
-		(guarantee index-fixnum? start 'string->utf8)
-		(if (not (fix:<= start end))
-		    (error:bad-range-argument start 'string->utf8))
-		start))))
-    (let ((buffer (allocate-bytevector (%count-utf8-bytes string start end))))
-      (do ((from start (fix:+ from 1))
-	   (to 0
-	       (fix:+ to
-		      (char-utf8-bytes! buffer to (string-ref string from)))))
-	  ((not (fix:< from end))))
-      buffer)))
-
-(define (%count-utf8-bytes string start end)
-  (do ((index start (fix:+ index 1))
-       (n-bytes 0
-		(fix:+ n-bytes
-		       (char-utf8-byte-length (string-ref string index)))))
-      ((not (fix:< index end)) n-bytes)))
+(define (string-encoder char-byte-length encode-char! caller)
+  (lambda (string #!optional start end)
+    (guarantee string? string caller)
+    (let* ((end
+	    (if (default-object? end)
+		(string-length string)
+		(begin
+		  (guarantee index-fixnum? end caller)
+		  (if (not (fix:<= end (string-length string)))
+		      (error:bad-range-argument end caller))
+		  end)))
+	   (start
+	    (if (default-object? start)
+		0
+		(begin
+		  (guarantee index-fixnum? start caller)
+		  (if (not (fix:<= start end))
+		      (error:bad-range-argument start caller))
+		  start))))
+      (let ((bytes
+	     (allocate-bytevector
+	      (let loop ((index start) (n-bytes 0))
+		(if (fix:< index end)
+		    (loop (fix:+ index 1)
+			  (fix:+ n-bytes
+				 (char-byte-length (string-ref string index))))
+		    n-bytes)))))
+	(let loop ((from start) (to 0))
+	  (if (fix:< from end)
+	      (loop (fix:+ from 1)
+		    (encode-char! bytes to (string-ref string from)))))
+	bytes))))
+
+(define string->utf8)
+(define string->utf16be)
+(define string->utf16le)
+(define string->utf32be)
+(define string->utf32le)
+(add-boot-init!
+ (lambda ()
+   (set! string->utf8
+	 (string-encoder char-utf8-byte-length encode-utf8-char!
+			 'string->utf8))
+   (set! string->utf16be
+	 (string-encoder char-utf16-byte-length encode-utf16be-char!
+			 'string->utf16be))
+   (set! string->utf16le
+	 (string-encoder char-utf16-byte-length encode-utf16le-char!
+			 'string->utf16le))
+   (set! string->utf32be
+	 (string-encoder char-utf32-byte-length encode-utf32be-char!
+			 'string->utf32be))
+   (set! string->utf32le
+	 (string-encoder char-utf32-byte-length encode-utf32le-char!
+			 'string->utf32le))
+   unspecific))
 
-(define (utf8->string bytevector #!optional start end)
-  (guarantee bytevector? bytevector 'utf8->string)
-  (let* ((end
-	  (if (default-object? end)
-	      (bytevector-length bytevector)
-	      (begin
-		(guarantee index-fixnum? end 'utf8->string)
-		(if (not (fix:<= end (bytevector-length bytevector)))
-		    (error:bad-range-argument end 'utf8->string))
-		end)))
-	(start
-	 (if (default-object? start)
-	     0
-	     (begin
-	       (guarantee index-fixnum? start 'utf8->string)
-	       (if (not (fix:<= start end))
-		   (error:bad-range-argument start 'utf8->string))
-	       start))))
-    (%utf8->string bytevector start end)))
-
-(define (%utf8->string bytevector start end)
-  (let ((string (make-string (%count-utf8-chars bytevector start end))))
-    (let loop ((from start) (to 0))
-
-      (define-integrable (get-byte offset)
-	(bytevector-u8-ref bytevector (fix:+ from offset)))
-
-      (define-integrable (put-char! cp)
-	(string-set! string to (integer->char cp)))
-
-      (if (fix:< from end)
-	  (let ((b0 (get-byte 0)))
-	    (cond ((fix:< b0 #x80)
-		   (put-char! b0)
-		   (loop (fix:+ from 1) (fix:+ to 1)))
-		  ((fix:< b0 #xE0)
-		   (put-char! (decode-utf8-2 b0 (get-byte 1)))
-		   (loop (fix:+ from 2) (fix:+ to 1)))
-		  ((fix:< b0 #xF0)
-		   (put-char! (decode-utf8-3 b0 (get-byte 1) (get-byte 2)))
-		   (loop (fix:+ from 3) (fix:+ to 1)))
-		  (else
-		   (put-char!
-		    (decode-utf8-4 b0 (get-byte 1) (get-byte 2) (get-byte 3)))
-		   (loop (fix:+ from 4) (fix:+ to 1)))))))
-    string))
-
-(define (%count-utf8-chars bytevector start end)
-  (let loop ((index start) (n-chars 0))
-    (if (fix:< index end)
-	(let ((b0 (bytevector-u8-ref bytevector index)))
-	  (let ((index*
-		 (fix:+ index
-			(cond ((fix:< b0 #x80) 1)
-			      ((fix:< b0 #xE0) 2)
-			      ((fix:< b0 #xF0) 3)
-			      (else 4)))))
-	    (if (not (fix:<= index* end))
-		(error "Truncated UTF-8 sequence:"
-		       (bytevector-copy bytevector index end)))
-	    (loop index* (fix:+ n-chars 1))))
-	n-chars)))
-
-(define (decode-utf8-2 b0 b1)
-  (if (not (and (fix:> b0 #xC1)
-		(trailing-byte? b1)))
-      (error "Ill-formed UTF-8 sequence:" b0 b1))
-  (fix:or (extract b0 #x1F 6)
-	  (extract b1 #x3F 0)))
-
-(define (decode-utf8-3 b0 b1 b2)
-  (if (not (and (or (fix:> b0 #xE0) (fix:> b1 #x9F))
-		(trailing-byte? b1)
-		(trailing-byte? b2)))
-      (error "Ill-formed UTF-8 sequence:" b0 b1 b2))
-  (let ((cp
-	 (fix:or (fix:or (extract b0 #x0F 12)
-			 (extract b1 #x3F 6))
-		 (extract b2 #x3F 0))))
-    (if (surrogate? cp)
-	(error "Code point is a UTF-16 surrogate:" cp))
-    (if (non-character? cp)
-	(error "Code point is a non-character:" cp))
-    cp))
-
-(define (decode-utf8-4 b0 b1 b2 b3)
-  (if (not (and (or (fix:> b0 #xF0) (fix:> b1 #x8F))
-		(trailing-byte? b1)
-		(trailing-byte? b2)
-		(trailing-byte? b3)))
-      (error "Ill-formed UTF-8 sequence:" b0 b1 b2 b3))
-  (let ((cp
-	 (fix:or (fix:or (extract b0 #x07 18)
-			 (extract b1 #x3F 12))
-		 (fix:or (extract b2 #x3F 6)
-			 (extract b3 #x3F 0)))))
-    (if (not (fix:< cp #x110000))
-	(error "Value is not a code point:" cp))
-    (if (non-character? cp)
-	(error "Code point is a non-character:" cp))
-    cp))
-
-(define-integrable (extract b m n)
-  (fix:lsh (fix:and b m) n))
-
-(define-integrable (trailing-byte? b)
-  (fix:= (fix:and #xC0 b) #x80))
-
-(define-integrable (surrogate? cp)
-  (and (fix:<= #xD800 cp) (fix:< cp #xDFFF)))
-
-(define-integrable (non-character? cp)
-  (or (and (fix:<= #xFDD0 cp) (fix:< cp #xFDF0))
-      (fix:= (fix:and #xFFFE cp) #xFFFE)))
\ No newline at end of file
+(define (bytes-decoder getter initial->length char-length decode-char step noun
+		       caller)
+  (lambda (bytevector #!optional start end)
+    (guarantee bytevector? bytevector caller)
+    (let* ((end
+	    (if (default-object? end)
+		(bytevector-length bytevector)
+		(begin
+		  (guarantee index-fixnum? end caller)
+		  (if (not (fix:<= end (bytevector-length bytevector)))
+		      (error:bad-range-argument end caller))
+		  end)))
+	  (start
+	   (if (default-object? start)
+	       0
+	       (begin
+		 (guarantee index-fixnum? start caller)
+		 (if (not (fix:<= start end))
+		     (error:bad-range-argument start caller))
+		 start)))
+	  (truncated
+	   (lambda (index)
+	     (error (string "Truncated " noun " sequence:")
+		    (bytevector-copy bytevector
+				     index
+				     (fix:min (fix:+ index 4) end))))))
+      (let ((string
+	     (make-string
+	      (let loop ((index start) (n-chars 0))
+		(if (fix:<= (fix:+ index step) end)
+		    (let ((n (initial->length (getter bytevector start))))
+		      (let ((index* (fix:+ index n)))
+			(if (not (fix:<= index* end))
+			    (truncated index))
+			(loop index* (fix:+ n-chars 1))))
+		    (begin
+		      (if (fix:< index end)
+			  (truncated index))
+		      n-chars))))))
+	(let loop ((from start) (to 0))
+	  (if (fix:< from end)
+	      (let ((char (decode-char bytevector start)))
+		(string-set! string to char)
+		(loop (fix:+ from (char-length char))
+		      (fix:+ to 1)))))
+	string))))
+
+(define utf8->string)
+(define utf16be->string)
+(define utf16le->string)
+(define utf32be->string)
+(define utf32le->string)
+(add-boot-init!
+ (lambda ()
+   (set! utf8->string
+	 (bytes-decoder bytevector-u8-ref initial-byte->utf8-char-length
+			char-utf8-byte-length decode-utf8-char 1 "UTF-8"
+			'utf8->string))
+   (set! utf16be->string
+	 (bytes-decoder bytevector-u16be-ref initial-u16->utf16-char-length
+			char-utf16-byte-length decode-utf16be-char 1 "UTF-16BE"
+			'utf16be->string))
+   (set! utf16le->string
+	 (bytes-decoder bytevector-u16le-ref initial-u16->utf16-char-length
+			char-utf16-byte-length decode-utf16le-char 1 "UTF-16LE"
+			'utf16le->string))
+   (set! utf32be->string
+	 (bytes-decoder bytevector-u32be-ref initial-u32->utf32-char-length
+			char-utf32-byte-length decode-utf32be-char 1 "UTF-32BE"
+			'utf32be->string))
+   (set! utf32le->string
+	 (bytes-decoder bytevector-u32le-ref initial-u32->utf32-char-length
+			char-utf32-byte-length decode-utf32le-char 1 "UTF-32LE"
+			'utf32le->string))
+   unspecific))
\ No newline at end of file
diff --git a/src/runtime/char.scm b/src/runtime/char.scm
index 0d37470f3..754c7297a 100644
--- a/src/runtime/char.scm
+++ b/src/runtime/char.scm
@@ -76,45 +76,9 @@ USA.
   (%make-char (char-code char)
 	      (fix:andc (char-bits char) bits)))
 
-(define (unicode-char? object)
-  (and (char? object)
-       (legal-code-32? (char->integer object))))
-
-(define-guarantee unicode-char "a Unicode character")
-
-(define (unicode-scalar-value? object)
-  (and (index-fixnum? object)
-       (fix:< object char-code-limit)
-       (not (surrogate? object))
-       (not (non-character? object))))
-
-(define-guarantee unicode-scalar-value "a Unicode scalar value")
-
-(define (unicode-char->scalar-value char)
-  (let ((cp (char->integer char)))
-    (if (not (legal-code-32? cp))
-	(error:not-unicode-char char 'char-utf8-byte-length))
-    cp))
-
-(define-integrable (legal-code-32? pt)
-  (and (fix:< pt char-code-limit)
-       (not (surrogate? pt))
-       (not (non-character? pt))))
-
-(define-integrable (legal-code-16? pt)
-  (and (not (surrogate? pt))
-       (not (non-character? pt))))
-
-(define-integrable (surrogate? pt)
-  (and (fix:<= #xD800 pt) (fix:< pt #xDFFF)))
-
-(define-integrable (non-character? pt)
-  (or (and (fix:<= #xFDD0 pt) (fix:< pt #xFDF0))
-      (fix:= (fix:and #xFFFE pt) #xFFFE)))
-
 (define (8-bit-char? object)
   (and (char? object)
-       (fix:< (char->integer object) 256)))
+       (fix:< (char->integer object) #x100)))
 
 (define (guarantee-8-bit-char object #!optional caller)
   caller
@@ -123,7 +87,7 @@ USA.
 
 (define (char-ascii? char)
   (let ((n (char->integer char)))
-    (and (fix:< n 256)
+    (and (fix:< n #x100)
 	 n)))
 
 (define (char->ascii char)
@@ -131,84 +95,12 @@ USA.
   (char->integer char))
 
 (define (ascii->char code)
-  (guarantee-limited-index-fixnum code 256 'ASCII->CHAR)
+  (guarantee-limited-index-fixnum code #x100 'ASCII->CHAR)
   (%make-char code 0))
 
 (define (chars->ascii chars)
   (map char->ascii chars))
-
-;;;; UTF-{8,16,32} encoders
-
-(define (char-utf8-byte-length char)
-  (let ((sv (unicode-char->scalar-value char)))
-    (cond ((fix:< sv #x00000080) 1)
-	  ((fix:< sv #x00000800) 2)
-	  ((fix:< sv #x00010000) 3)
-	  (else 4))))
-
-(define (char-utf8-bytes! bytes index char)
-  (let ((sv (unicode-char->scalar-value char)))
-
-    (define-integrable (initial-byte n-bits offset)
-      (fix:or (fix:and (fix:lsh #xFF (fix:+ n-bits 1)) #xFF)
-	      (fix:lsh sv (fix:- 0 offset))))
-
-    (define-integrable (trailing-byte offset)
-      (fix:or #x80 (fix:and (fix:lsh sv (fix:- 0 offset)) #x3F)))
-
-    (cond ((fix:< sv #x00000080)
-	   (bytevector-u8-set! bytes index sv)
-	   (fix:+ index 1))
-	  ((fix:< sv #x00000800)
-	   (bytevector-u8-set! bytes index (initial-byte 5 6))
-	   (bytevector-u8-set! bytes (fix:+ index 1) (trailing-byte 0))
-	   (fix:+ index 2))
-	  ((fix:< sv #x00010000)
-	   (bytevector-u8-set! bytes index (initial-byte 4 12))
-	   (bytevector-u8-set! bytes (fix:+ index 1) (trailing-byte 6))
-	   (bytevector-u8-set! bytes (fix:+ index 2) (trailing-byte 0))
-	   (fix:+ index 3))
-	  (else
-	   (bytevector-u8-set! bytes index (initial-byte 3 18))
-	   (bytevector-u8-set! bytes (fix:+ index 1) (trailing-byte 12))
-	   (bytevector-u8-set! bytes (fix:+ index 2) (trailing-byte 6))
-	   (bytevector-u8-set! bytes (fix:+ index 3) (trailing-byte 0))
-	   (fix:+ index 4)))))
 
-(define (char-utf16-byte-length char)
-  (if (fix:< (unicode-char->scalar-value char) #x00010000) 2 4))
-
-(define (char-utf16be-bytes! bytes index char)
-  (let ((sv (unicode-char->scalar-value char)))
-    (cond ((fix:< sv #x10000)
-	   (bytevector-u16be-set! bytes index sv)
-	   (fix:+ index 2))
-	  (else
-	   (receive (h l) (split-into-utf16-surrogates sv)
-	     (bytevector-u16be-set! bytes index h)
-	     (bytevector-u16be-set! bytes (fix:+ index 1) l))
-	   (fix:+ index 4)))))
-
-(define (char-utf16le-bytes! bytes index char)
-  (let ((sv (unicode-char->scalar-value char)))
-    (cond ((fix:< sv #x10000)
-	   (bytevector-u16le-set! bytes index sv)
-	   (fix:+ index 2))
-	  (else
-	   (receive (h l) (split-into-utf16-surrogates sv)
-	     (bytevector-u16le-set! bytes index l)
-	     (bytevector-u16le-set! bytes (fix:+ index 1) h))
-	   (fix:+ index 4)))))
-
-(define (char-utf32-byte-length char)
-  (if (fix:< (unicode-char->scalar-value char) #x00010000) 2 4))
-
-(define (char-utf32be-bytes! bytes index char)
-  (bytevector-u32be-set! bytes index (unicode-char->scalar-value char)))
-
-(define (char-utf32le-bytes! bytes index char)
-  (bytevector-u32le-set! bytes index (unicode-char->scalar-value char)))
-
 (define (char=? x y)
   (fix:= (char->integer x) (char->integer y)))
 
@@ -449,4 +341,266 @@ USA.
 (define char-bit:meta #x01)
 (define char-bit:control #x02)
 (define char-bit:super #x04)
-(define char-bit:hyper #x08)
\ No newline at end of file
+(define char-bit:hyper #x08)
+
+;;;; Unicode characters
+
+(define (unicode-char? object)
+  (and (char? object)
+       (legal-code-32? (char->integer object))))
+
+(define (unicode-scalar-value? object)
+  (and (index-fixnum? object)
+       (legal-code-32? object)))
+
+(define-guarantee unicode-char "a Unicode character")
+(define-guarantee unicode-scalar-value "a Unicode scalar value")
+
+(define (unicode-char->scalar-value char #!optional caller)
+  (let ((cp (char->integer char)))
+    (if (not (legal-code-32? cp))
+	(error:not-a unicode-char? char caller))
+    cp))
+
+(define-integrable (legal-code-32? cp)
+  (and (fix:< cp char-code-limit)
+       (not (utf16-surrogate? cp))
+       (not (non-character? cp))))
+
+(define (legal-code-16? pt)
+  (and (not (utf16-surrogate? pt))
+       (not (non-character? pt))))
+
+(define-integrable (utf16-surrogate? cp)
+  (fix:= #xD800 (fix:and #xF800 cp)))
+
+(define-integrable (utf16-high-surrogate? cp)
+  (fix:= #xD800 (fix:and #xFC00 cp)))
+
+(define-integrable (utf16-low-surrogate? cp)
+  (fix:= #xDC00 (fix:and #xFC00 cp)))
+
+(define-integrable (non-character? cp)
+  (or (and (fix:<= #xFDD0 cp) (fix:< cp #xFDF0))
+      (fix:= #xFFFE (fix:and #xFFFE cp))))
+
+(define-integrable (guarantee-cp-is-character cp)
+  (if (non-character? cp)
+      (error "Code point is a non-character:" cp)))
+
+(define-integrable (guarantee-cp-in-range cp)
+  (if (not (fix:< cp char-code-limit))
+      (error "Value is not a code point:" cp)))
+
+(define-integrable (guarantee-cp-not-utf16-surrogate cp)
+  (if (utf16-surrogate? cp)
+      (error "Code point is a UTF-16 surrogate:" cp)))
+
+(define-integrable (extract-bits word mask shift)
+  (fix:lsh (fix:and word mask) shift))
+
+(define-integrable (insert-bits word mask shift)
+  (fix:and (fix:lsh word shift) mask))
+
+;;;; UTF-{8,16,32} encoders
+
+(define (char-utf8-byte-length char)
+  (let ((sv (unicode-char->scalar-value char 'char-utf8-byte-length)))
+    (cond ((fix:< sv #x80) 1)
+	  ((fix:< sv #x800) 2)
+	  ((fix:< sv #x10000) 3)
+	  (else 4))))
+
+(define (encode-utf8-char! bytes index char)
+  (let ((sv (unicode-char->scalar-value char 'encode-utf8-char!)))
+
+    (define-integrable (initial-byte leader offset)
+      (fix:or leader (fix:lsh sv offset)))
+
+    (define-integrable (trailing-byte offset)
+      (fix:or #x80 (insert-bits sv #x3F offset)))
+
+    (cond ((fix:< sv #x00000080)
+	   (bytevector-u8-set! bytes index sv)
+	   (fix:+ index 1))
+	  ((fix:< sv #x00000800)
+	   (bytevector-u8-set! bytes index (initial-byte #xC0 -6))
+	   (bytevector-u8-set! bytes (fix:+ index 1) (trailing-byte 0))
+	   (fix:+ index 2))
+	  ((fix:< sv #x00010000)
+	   (bytevector-u8-set! bytes index (initial-byte #xE0 -12))
+	   (bytevector-u8-set! bytes (fix:+ index 1) (trailing-byte -6))
+	   (bytevector-u8-set! bytes (fix:+ index 2) (trailing-byte 0))
+	   (fix:+ index 3))
+	  (else
+	   (bytevector-u8-set! bytes index (initial-byte #xF0 -18))
+	   (bytevector-u8-set! bytes (fix:+ index 1) (trailing-byte -12))
+	   (bytevector-u8-set! bytes (fix:+ index 2) (trailing-byte -6))
+	   (bytevector-u8-set! bytes (fix:+ index 3) (trailing-byte 0))
+	   (fix:+ index 4)))))
+
+(define (char-utf16-byte-length char)
+  (if (fix:< (unicode-char->scalar-value char 'char-utf16-byte-length) #x10000)
+      2
+      4))
+
+(define (utf16-char-encoder setter caller)
+  (lambda (bytes index char)
+    (let ((sv (unicode-char->scalar-value char caller)))
+      (cond ((fix:< sv #x10000)
+	     (setter bytes index sv)
+	     (fix:+ index 2))
+	    (else
+	     (let ((n (fix:- sv #x10000)))
+	       (setter bytes index
+		       (fix:or #xD800 (insert-bits n #x3FF -10)))
+	       (setter bytes (fix:+ index 2)
+		       (fix:or #xDC00 (insert-bits n #x3FF 0))))
+	     (fix:+ index 4))))))
+
+(define encode-utf16be-char!
+  (utf16-char-encoder bytevector-u16be-set! 'encode-utf16be-char!))
+
+(define encode-utf16le-char!
+  (utf16-char-encoder bytevector-u16le-set! 'encode-utf16le-char!))
+
+(define (char-utf32-byte-length char)
+  (unicode-char->scalar-value char 'char-utf32-byte-length)
+  4)
+
+(define (utf32-char-encoder setter caller)
+  (lambda (bytes index char)
+    (setter bytes index (unicode-char->scalar-value char caller))))
+
+(define encode-utf32be-char!
+  (utf32-char-encoder bytevector-u32be-set! 'encode-utf32be-char!))
+
+(define encode-utf32le-char!
+  (utf32-char-encoder bytevector-u32le-set! 'encode-utf32le-char!))
+
+;;;; UTF-{8,16,32} decoders
+
+(define (initial-byte->utf8-char-length byte)
+  (guarantee byte? byte 'initial-byte->utf8-char-length)
+  (cond ((utf8-initial-byte-1? byte) 1)
+	((utf8-initial-byte-2? byte) 2)
+	((utf8-initial-byte-3? byte) 3)
+	((utf8-initial-byte-4? byte) 4)
+	(else (error "Illegal UTF-8 initial byte:" byte))))
+
+(define (decode-utf8-char bytes index)
+  (integer->char
+   (let ((b0 (bytevector-u8-ref bytes index)))
+     (cond ((utf8-initial-byte-1? b0)
+	    b0)
+	   ((utf8-initial-byte-2? b0)
+	    (decode-utf8-2 b0
+			   (bytevector-u8-ref bytes (fix:+ index 1))))
+	   ((utf8-initial-byte-3? b0)
+	    (decode-utf8-3 b0
+			   (bytevector-u8-ref bytes (fix:+ index 1))
+			   (bytevector-u8-ref bytes (fix:+ index 2))))
+	   ((utf8-initial-byte-4? b0)
+	    (decode-utf8-4 b0
+			   (bytevector-u8-ref bytes (fix:+ index 1))
+			   (bytevector-u8-ref bytes (fix:+ index 2))
+			   (bytevector-u8-ref bytes (fix:+ index 3))))
+	   (else
+	    (error "Illegal UTF-8 initial byte:" b0))))))
+
+(define (decode-utf8-2 b0 b1)
+  (if (not (and (fix:> b0 #xC1)
+		(utf8-trailing-byte? b1)))
+      (error "Ill-formed UTF-8 sequence:" b0 b1))
+  (fix:or (extract-bits b0 #x1F 6)
+	  (extract-bits b1 #x3F 0)))
+
+(define (decode-utf8-3 b0 b1 b2)
+  (if (not (and (or (fix:> b0 #xE0) (fix:> b1 #x9F))
+		(utf8-trailing-byte? b1)
+		(utf8-trailing-byte? b2)))
+      (error "Ill-formed UTF-8 sequence:" b0 b1 b2))
+  (let ((cp
+	 (fix:or (fix:or (extract-bits b0 #x0F 12)
+			 (extract-bits b1 #x3F 6))
+		 (extract-bits b2 #x3F 0))))
+    (guarantee-cp-not-utf16-surrogate cp)
+    (guarantee-cp-is-character cp)
+    cp))
+
+(define (decode-utf8-4 b0 b1 b2 b3)
+  (if (not (and (or (fix:> b0 #xF0) (fix:> b1 #x8F))
+		(utf8-trailing-byte? b1)
+		(utf8-trailing-byte? b2)
+		(utf8-trailing-byte? b3)))
+      (error "Ill-formed UTF-8 sequence:" b0 b1 b2 b3))
+  (let ((cp
+	 (fix:or (fix:or (extract-bits b0 #x07 18)
+			 (extract-bits b1 #x3F 12))
+		 (fix:or (extract-bits b2 #x3F 6)
+			 (extract-bits b3 #x3F 0)))))
+    (guarantee-cp-in-range cp)
+    (guarantee-cp-is-character cp)
+    cp))
+
+(define-integrable (utf8-initial-byte-1? byte)
+  (fix:= #x00 (fix:and #x80 byte)))
+
+(define-integrable (utf8-initial-byte-2? byte)
+  (fix:= #xC0 (fix:and #xE0 byte)))
+
+(define-integrable (utf8-initial-byte-3? byte)
+  (fix:= #xE0 (fix:and #xF0 byte)))
+
+(define-integrable (utf8-initial-byte-4? byte)
+  (fix:= #xF0 (fix:and #xF8 byte)))
+
+(define-integrable (utf8-trailing-byte? byte)
+  (fix:= #x80 (fix:and #xC0 byte)))
+
+(define (initial-u16->utf16-char-length u16)
+  (guarantee u16? u16 'initial-u16->utf16-char-length)
+  (if (utf16-high-surrogate? u16) 4 2))
+
+(define (utf16-char-decoder getter)
+  (lambda (bytes index)
+    (let ((d0 (getter bytes index)))
+      (if (utf16-low-surrogate? d0)
+	  (error "Ill-formed UTF-16 sequence:" d0))
+      (let ((cp
+	     (if (utf16-high-surrogate? d0)
+		 (let ((d1 (getter bytes (fix:+ index 2))))
+		   (if (not (utf16-low-surrogate? d1))
+		       (error "Ill-formed UTF-16 sequence:" d0 d1))
+		   (fix:+ (fix:or (extract-bits d0 #x3FF 10)
+				  (extract-bits d1 #x3FF 0))
+			  #x10000))
+		 d0)))
+	(guarantee-cp-in-range cp)
+	(guarantee-cp-is-character cp)
+	(integer->char cp)))))
+
+(define decode-utf16be-char
+  (utf16-char-decoder bytevector-u16be-ref))
+
+(define decode-utf16le-char
+  (utf16-char-decoder bytevector-u16le-ref))
+
+(define (initial-u32->utf32-char-length u32)
+  (guarantee u32? u32 'initial-u32->utf32-char-length)
+  4)
+
+(define (utf32-char-decoder getter)
+  (lambda (bytes index)
+    (let ((u32 (getter bytes index)))
+      (if (not (< u32 char-code-limit))
+	  (error "Value is not a code point:" u32))
+      (guarantee-cp-not-utf16-surrogate u32)
+      (guarantee-cp-is-character u32)
+      (integer->char u32))))
+
+(define decode-utf32be-char
+  (utf32-char-decoder bytevector-u32be-ref))
+
+(define decode-utf32le-char
+  (utf32-char-decoder bytevector-u32le-ref))
\ No newline at end of file
diff --git a/src/runtime/make.scm b/src/runtime/make.scm
index 377b27de5..eed707df4 100644
--- a/src/runtime/make.scm
+++ b/src/runtime/make.scm
@@ -442,6 +442,7 @@ USA.
    ((RUNTIME NUMBER) INITIALIZE-DRAGON4!)
    (RUNTIME MISCELLANEOUS-GLOBAL)
    (RUNTIME CHARACTER)
+   (RUNTIME BYTEVECTOR)
    (RUNTIME CHARACTER-SET)
    (RUNTIME GENSYM)
    (RUNTIME STREAM)
diff --git a/src/runtime/runtime.pkg b/src/runtime/runtime.pkg
index 2c7926eaf..0f3eef825 100644
--- a/src/runtime/runtime.pkg
+++ b/src/runtime/runtime.pkg
@@ -1162,9 +1162,17 @@ USA.
 	  bytevector=?
 	  bytevector?
 	  make-bytevector
+	  string->utf16be
+	  string->utf16le
+	  string->utf32be
+	  string->utf32le
 	  string->utf8
 	  u16?
 	  u32?
+	  utf16be->string
+	  utf16le->string
+	  utf32be->string
+	  utf32le->string
 	  utf8->string)
   (export (runtime predicate-metadata)
 	  register-mit-bytevector-predicates!))
@@ -1278,13 +1286,8 @@ USA.
 	  char-integer-limit
 	  char-upcase
 	  char-utf16-byte-length
-	  char-utf16be-bytes!
-	  char-utf16le-bytes!
 	  char-utf32-byte-length
-	  char-utf32be-bytes!
-	  char-utf32le-bytes!
 	  char-utf8-byte-length
-	  char-utf8-bytes!
 	  char<=?
 	  char<?
 	  char=?
@@ -1294,8 +1297,21 @@ USA.
 	  chars->ascii
 	  clear-char-bits
 	  code->char
+	  decode-utf16be-char
+	  decode-utf16le-char
+	  decode-utf32be-char
+	  decode-utf32le-char
+	  decode-utf8-char
 	  digit->char
+	  encode-utf16be-char!
+	  encode-utf16le-char!
+	  encode-utf32be-char!
+	  encode-utf32le-char!
+	  encode-utf8-char!
 	  guarantee-8-bit-char
+	  initial-byte->utf8-char-length
+	  initial-u16->utf16-char-length
+	  initial-u32->utf32-char-length
 	  integer->char
 	  make-char
 	  name->char
-- 
2.25.1