From: Chris Hanson <org/chris-hanson/cph>
Date: Sun, 8 Jan 2017 07:37:29 +0000 (-0800)
Subject: Move UTF-8 character encoding into char.scm and share with bytevector.
X-Git-Tag: mit-scheme-pucked-9.2.12~227^2~177
X-Git-Url: https://birchwood-abbey.net/git?a=commitdiff_plain;h=805fea6508f2298a54abc13b4f7eeae176848c2a;p=mit-scheme.git

Move UTF-8 character encoding into char.scm and share with bytevector.
---

diff --git a/src/runtime/bytevector.scm b/src/runtime/bytevector.scm
index 1e0247089..281b86247 100644
--- a/src/runtime/bytevector.scm
+++ b/src/runtime/bytevector.scm
@@ -116,60 +116,18 @@ USA.
 		start))))
     (let ((buffer (allocate-bytevector (%count-utf8-bytes string start end))))
       (do ((from start (fix:+ from 1))
-	   (to 0 (fix:+ to (%char->utf8! buffer to (string-ref string from)))))
+	   (to 0
+	       (fix:+ to
+		      (char-utf8-bytes! buffer to (string-ref string from)))))
 	  ((not (fix:< from end))))
       buffer)))
 
-(define (%char->utf8! buffer index char)
-  (let ((cp (char->integer char)))
-
-    (define-integrable (initial-byte n-bits offset)
-      (fix:or (fix:and (fix:lsh #xFF (fix:+ n-bits 1)) #xFF)
-	      (fix:lsh cp (fix:- 0 offset))))
-
-    (define-integrable (trailing-byte offset)
-      (fix:or #x80 (fix:and (fix:lsh cp (fix:- 0 offset)) #x3F)))
-
-    (define-integrable (put-byte! offset byte)
-      (bytevector-u8-set! buffer (fix:+ index offset) byte))
-
-    (cond ((fix:< cp #x00000080)
-	   (put-byte! 0 cp)
-	   1)
-	  ((fix:< cp #x00000800)
-	   (put-byte! 0 (initial-byte 5 6))
-	   (put-byte! 1 (trailing-byte 0))
-	   2)
-	  ((fix:< cp #x00010000)
-	   (if (surrogate? cp)
-	       (error "Code point is a UTF-16 surrogate:" cp))
-	   (if (non-character? cp)
-	       (error "Code point is a non-character:" cp))
-	   (put-byte! 0 (initial-byte 4 12))
-	   (put-byte! 1 (trailing-byte 6))
-	   (put-byte! 2 (trailing-byte 0))
-	   3)
-	  (else
-	   (if (non-character? cp)
-	       (error "Code point is a non-character:" cp))
-	   (put-byte! 0 (initial-byte 3 18))
-	   (put-byte! 1 (trailing-byte 12))
-	   (put-byte! 2 (trailing-byte 6))
-	   (put-byte! 3 (trailing-byte 0))
-	   4))))
-
 (define (%count-utf8-bytes string start end)
   (do ((index start (fix:+ index 1))
-       (n-bytes 0 (fix:+ n-bytes (char-utf8-bytes (string-ref string index)))))
+       (n-bytes 0
+		(fix:+ n-bytes
+		       (char-utf8-byte-length (string-ref string index)))))
       ((not (fix:< index end)) n-bytes)))
-
-(define (char-utf8-bytes char)
-  (let ((cp (char->integer char)))
-    (cond ((fix:< cp #x00000080) 1)
-	  ((fix:< cp #x00000800) 2)
-	  ((fix:< cp #x00010000) 3)
-	  ((fix:< cp #x00110000) 4)
-	  (else (error "Not a unicode character:" char)))))
 
 (define (utf8->string bytevector #!optional start end)
   (guarantee bytevector? bytevector 'utf8->string)
diff --git a/src/runtime/char.scm b/src/runtime/char.scm
index 5d2c5c4de..b6493bf1b 100644
--- a/src/runtime/char.scm
+++ b/src/runtime/char.scm
@@ -90,6 +90,12 @@ USA.
 
 (define-guarantee unicode-scalar-value "a Unicode scalar value")
 
+(define (unicode-char->scalar-value char)
+  (let ((cp (char->integer char)))
+    (if (not (legal-code-32? cp))
+	(error:not-unicode-char char 'char-utf8-byte-length))
+    cp))
+
 (define-integrable (legal-code-32? pt)
   (and (fix:< pt char-code-limit)
        (not (surrogate? pt))
@@ -131,6 +137,52 @@ USA.
 (define (chars->ascii chars)
   (map char->ascii chars))
 
+(define (char-utf8-byte-length char)
+  (%sv-utf8-byte-length (unicode-char->scalar-value char)))
+
+(define (char-utf8-bytes char)
+  (let ((sv (unicode-char->scalar-value char)))
+    (let ((bytes (make-bytevector (%sv-utf8-byte-length sv))))
+      (%sv-utf8-bytes! bytes 0 sv)
+      bytes)))
+
+(define (char-utf8-bytes! bytes index char)
+  (%sv-utf8-bytes! bytes index (unicode-char->scalar-value char)))
+
+(define (%sv-utf8-byte-length sv)
+  (cond ((fix:< sv #x00000080) 1)
+	((fix:< sv #x00000800) 2)
+	((fix:< sv #x00010000) 3)
+	(else 4)))
+
+(define (%sv-utf8-bytes! bytes index sv)
+
+  (define-integrable (initial-byte n-bits offset)
+    (fix:or (fix:and (fix:lsh #xFF (fix:+ n-bits 1)) #xFF)
+	    (fix:lsh sv (fix:- 0 offset))))
+
+  (define-integrable (trailing-byte offset)
+    (fix:or #x80 (fix:and (fix:lsh sv (fix:- 0 offset)) #x3F)))
+
+  (cond ((fix:< sv #x00000080)
+	 (bytevector-u8-set! bytes index sv)
+	 (fix:+ index 1))
+	((fix:< sv #x00000800)
+	 (bytevector-u8-set! bytes index (initial-byte 5 6))
+	 (bytevector-u8-set! bytes (fix:+ index 1) (trailing-byte 0))
+	 (fix:+ index 2))
+	((fix:< sv #x00010000)
+	 (bytevector-u8-set! bytes index (initial-byte 4 12))
+	 (bytevector-u8-set! bytes (fix:+ index 1) (trailing-byte 6))
+	 (bytevector-u8-set! bytes (fix:+ index 2) (trailing-byte 0))
+	 (fix:+ index 3))
+	(else
+	 (bytevector-u8-set! bytes index (initial-byte 3 18))
+	 (bytevector-u8-set! bytes (fix:+ index 1) (trailing-byte 12))
+	 (bytevector-u8-set! bytes (fix:+ index 2) (trailing-byte 6))
+	 (bytevector-u8-set! bytes (fix:+ index 3) (trailing-byte 0))
+	 (fix:+ index 4))))
+
 (define (char=? x y)
   (fix:= (char->integer x) (char->integer y)))
 
diff --git a/src/runtime/runtime.pkg b/src/runtime/runtime.pkg
index 83444f75a..dd3d8e855 100644
--- a/src/runtime/runtime.pkg
+++ b/src/runtime/runtime.pkg
@@ -1233,6 +1233,9 @@ USA.
 	  char-downcase
 	  char-integer-limit
 	  char-upcase
+	  char-utf8-byte-length
+	  char-utf8-bytes
+	  char-utf8-bytes!
 	  char<=?
 	  char<?
 	  char=?
@@ -1257,6 +1260,7 @@ USA.
 	  name->char
 	  radix?
 	  set-char-bits
+	  unicode-char->scalar-value
 	  unicode-char?
 	  unicode-scalar-value?)
   (export (runtime unicode)