Implement UTF-X codecs for chars and strings.
authorChris Hanson <org/chris-hanson/cph>
Wed, 18 Jan 2017 11:00:08 +0000 (03:00 -0800)
committerChris Hanson <org/chris-hanson/cph>
Wed, 18 Jan 2017 11:00:08 +0000 (03:00 -0800)
src/runtime/bytevector.scm
src/runtime/char.scm
src/runtime/make.scm
src/runtime/runtime.pkg

index 64247c008fdda5bcdbc45723b0619c50611d4878..6180def912f20737ea4f226597fe33171d838d05 100644 (file)
@@ -245,150 +245,135 @@ USA.
                               index-fixnum?
                               exact-nonnegative-integer?)))
 \f
-(define (string->utf8 string #!optional start end)
-  (guarantee string? string 'string->utf8)
-  (let* ((end
-         (if (default-object? end)
-             (string-length string)
-             (begin
-               (guarantee index-fixnum? end 'string->utf8)
-               (if (not (fix:<= end (string-length string)))
-                   (error:bad-range-argument end 'string->utf8))
-               end)))
-        (start
-         (if (default-object? start)
-             0
-             (begin
-               (guarantee index-fixnum? start 'string->utf8)
-               (if (not (fix:<= start end))
-                   (error:bad-range-argument start 'string->utf8))
-               start))))
-    (let ((buffer (allocate-bytevector (%count-utf8-bytes string start end))))
-      (do ((from start (fix:+ from 1))
-          (to 0
-              (fix:+ to
-                     (char-utf8-bytes! buffer to (string-ref string from)))))
-         ((not (fix:< from end))))
-      buffer)))
-
-(define (%count-utf8-bytes string start end)
-  (do ((index start (fix:+ index 1))
-       (n-bytes 0
-               (fix:+ n-bytes
-                      (char-utf8-byte-length (string-ref string index)))))
-      ((not (fix:< index end)) n-bytes)))
+(define (string-encoder char-byte-length encode-char! caller)
+  (lambda (string #!optional start end)
+    (guarantee string? string caller)
+    (let* ((end
+           (if (default-object? end)
+               (string-length string)
+               (begin
+                 (guarantee index-fixnum? end caller)
+                 (if (not (fix:<= end (string-length string)))
+                     (error:bad-range-argument end caller))
+                 end)))
+          (start
+           (if (default-object? start)
+               0
+               (begin
+                 (guarantee index-fixnum? start caller)
+                 (if (not (fix:<= start end))
+                     (error:bad-range-argument start caller))
+                 start))))
+      (let ((bytes
+            (allocate-bytevector
+             (let loop ((index start) (n-bytes 0))
+               (if (fix:< index end)
+                   (loop (fix:+ index 1)
+                         (fix:+ n-bytes
+                                (char-byte-length (string-ref string index))))
+                   n-bytes)))))
+       (let loop ((from start) (to 0))
+         (if (fix:< from end)
+             (loop (fix:+ from 1)
+                   (encode-char! bytes to (string-ref string from)))))
+       bytes))))
+
+(define string->utf8)
+(define string->utf16be)
+(define string->utf16le)
+(define string->utf32be)
+(define string->utf32le)
+(add-boot-init!
+ (lambda ()
+   (set! string->utf8
+        (string-encoder char-utf8-byte-length encode-utf8-char!
+                        'string->utf8))
+   (set! string->utf16be
+        (string-encoder char-utf16-byte-length encode-utf16be-char!
+                        'string->utf16be))
+   (set! string->utf16le
+        (string-encoder char-utf16-byte-length encode-utf16le-char!
+                        'string->utf16le))
+   (set! string->utf32be
+        (string-encoder char-utf32-byte-length encode-utf32be-char!
+                        'string->utf32be))
+   (set! string->utf32le
+        (string-encoder char-utf32-byte-length encode-utf32le-char!
+                        'string->utf32le))
+   unspecific))
 \f
-(define (utf8->string bytevector #!optional start end)
-  (guarantee bytevector? bytevector 'utf8->string)
-  (let* ((end
-         (if (default-object? end)
-             (bytevector-length bytevector)
-             (begin
-               (guarantee index-fixnum? end 'utf8->string)
-               (if (not (fix:<= end (bytevector-length bytevector)))
-                   (error:bad-range-argument end 'utf8->string))
-               end)))
-       (start
-        (if (default-object? start)
-            0
-            (begin
-              (guarantee index-fixnum? start 'utf8->string)
-              (if (not (fix:<= start end))
-                  (error:bad-range-argument start 'utf8->string))
-              start))))
-    (%utf8->string bytevector start end)))
-
-(define (%utf8->string bytevector start end)
-  (let ((string (make-string (%count-utf8-chars bytevector start end))))
-    (let loop ((from start) (to 0))
-
-      (define-integrable (get-byte offset)
-       (bytevector-u8-ref bytevector (fix:+ from offset)))
-
-      (define-integrable (put-char! cp)
-       (string-set! string to (integer->char cp)))
-
-      (if (fix:< from end)
-         (let ((b0 (get-byte 0)))
-           (cond ((fix:< b0 #x80)
-                  (put-char! b0)
-                  (loop (fix:+ from 1) (fix:+ to 1)))
-                 ((fix:< b0 #xE0)
-                  (put-char! (decode-utf8-2 b0 (get-byte 1)))
-                  (loop (fix:+ from 2) (fix:+ to 1)))
-                 ((fix:< b0 #xF0)
-                  (put-char! (decode-utf8-3 b0 (get-byte 1) (get-byte 2)))
-                  (loop (fix:+ from 3) (fix:+ to 1)))
-                 (else
-                  (put-char!
-                   (decode-utf8-4 b0 (get-byte 1) (get-byte 2) (get-byte 3)))
-                  (loop (fix:+ from 4) (fix:+ to 1)))))))
-    string))
-\f
-(define (%count-utf8-chars bytevector start end)
-  (let loop ((index start) (n-chars 0))
-    (if (fix:< index end)
-       (let ((b0 (bytevector-u8-ref bytevector index)))
-         (let ((index*
-                (fix:+ index
-                       (cond ((fix:< b0 #x80) 1)
-                             ((fix:< b0 #xE0) 2)
-                             ((fix:< b0 #xF0) 3)
-                             (else 4)))))
-           (if (not (fix:<= index* end))
-               (error "Truncated UTF-8 sequence:"
-                      (bytevector-copy bytevector index end)))
-           (loop index* (fix:+ n-chars 1))))
-       n-chars)))
-
-(define (decode-utf8-2 b0 b1)
-  (if (not (and (fix:> b0 #xC1)
-               (trailing-byte? b1)))
-      (error "Ill-formed UTF-8 sequence:" b0 b1))
-  (fix:or (extract b0 #x1F 6)
-         (extract b1 #x3F 0)))
-
-(define (decode-utf8-3 b0 b1 b2)
-  (if (not (and (or (fix:> b0 #xE0) (fix:> b1 #x9F))
-               (trailing-byte? b1)
-               (trailing-byte? b2)))
-      (error "Ill-formed UTF-8 sequence:" b0 b1 b2))
-  (let ((cp
-        (fix:or (fix:or (extract b0 #x0F 12)
-                        (extract b1 #x3F 6))
-                (extract b2 #x3F 0))))
-    (if (surrogate? cp)
-       (error "Code point is a UTF-16 surrogate:" cp))
-    (if (non-character? cp)
-       (error "Code point is a non-character:" cp))
-    cp))
-
-(define (decode-utf8-4 b0 b1 b2 b3)
-  (if (not (and (or (fix:> b0 #xF0) (fix:> b1 #x8F))
-               (trailing-byte? b1)
-               (trailing-byte? b2)
-               (trailing-byte? b3)))
-      (error "Ill-formed UTF-8 sequence:" b0 b1 b2 b3))
-  (let ((cp
-        (fix:or (fix:or (extract b0 #x07 18)
-                        (extract b1 #x3F 12))
-                (fix:or (extract b2 #x3F 6)
-                        (extract b3 #x3F 0)))))
-    (if (not (fix:< cp #x110000))
-       (error "Value is not a code point:" cp))
-    (if (non-character? cp)
-       (error "Code point is a non-character:" cp))
-    cp))
-
-(define-integrable (extract b m n)
-  (fix:lsh (fix:and b m) n))
-
-(define-integrable (trailing-byte? b)
-  (fix:= (fix:and #xC0 b) #x80))
-
-(define-integrable (surrogate? cp)
-  (and (fix:<= #xD800 cp) (fix:< cp #xDFFF)))
-
-(define-integrable (non-character? cp)
-  (or (and (fix:<= #xFDD0 cp) (fix:< cp #xFDF0))
-      (fix:= (fix:and #xFFFE cp) #xFFFE)))
\ No newline at end of file
+(define (bytes-decoder getter initial->length char-length decode-char step noun
+                      caller)
+  (lambda (bytevector #!optional start end)
+    (guarantee bytevector? bytevector caller)
+    (let* ((end
+           (if (default-object? end)
+               (bytevector-length bytevector)
+               (begin
+                 (guarantee index-fixnum? end caller)
+                 (if (not (fix:<= end (bytevector-length bytevector)))
+                     (error:bad-range-argument end caller))
+                 end)))
+         (start
+          (if (default-object? start)
+              0
+              (begin
+                (guarantee index-fixnum? start caller)
+                (if (not (fix:<= start end))
+                    (error:bad-range-argument start caller))
+                start)))
+         (truncated
+          (lambda (index)
+            (error (string "Truncated " noun " sequence:")
+                   (bytevector-copy bytevector
+                                    index
+                                    (fix:min (fix:+ index 4) end))))))
+      (let ((string
+            (make-string
+             (let loop ((index start) (n-chars 0))
+               (if (fix:<= (fix:+ index step) end)
+                   (let ((n (initial->length (getter bytevector start))))
+                     (let ((index* (fix:+ index n)))
+                       (if (not (fix:<= index* end))
+                           (truncated index))
+                       (loop index* (fix:+ n-chars 1))))
+                   (begin
+                     (if (fix:< index end)
+                         (truncated index))
+                     n-chars))))))
+       (let loop ((from start) (to 0))
+         (if (fix:< from end)
+             (let ((char (decode-char bytevector start)))
+               (string-set! string to char)
+               (loop (fix:+ from (char-length char))
+                     (fix:+ to 1)))))
+       string))))
+
+(define utf8->string)
+(define utf16be->string)
+(define utf16le->string)
+(define utf32be->string)
+(define utf32le->string)
+(add-boot-init!
+ (lambda ()
+   (set! utf8->string
+        (bytes-decoder bytevector-u8-ref initial-byte->utf8-char-length
+                       char-utf8-byte-length decode-utf8-char 1 "UTF-8"
+                       'utf8->string))
+   (set! utf16be->string
+        (bytes-decoder bytevector-u16be-ref initial-u16->utf16-char-length
+                       char-utf16-byte-length decode-utf16be-char 1 "UTF-16BE"
+                       'utf16be->string))
+   (set! utf16le->string
+        (bytes-decoder bytevector-u16le-ref initial-u16->utf16-char-length
+                       char-utf16-byte-length decode-utf16le-char 1 "UTF-16LE"
+                       'utf16le->string))
+   (set! utf32be->string
+        (bytes-decoder bytevector-u32be-ref initial-u32->utf32-char-length
+                       char-utf32-byte-length decode-utf32be-char 1 "UTF-32BE"
+                       'utf32be->string))
+   (set! utf32le->string
+        (bytes-decoder bytevector-u32le-ref initial-u32->utf32-char-length
+                       char-utf32-byte-length decode-utf32le-char 1 "UTF-32LE"
+                       'utf32le->string))
+   unspecific))
\ No newline at end of file
index 0d37470f3bca03fb061d80fe43ed5d3b6303a8a7..754c7297aa7f27f567629d8d6c9d726b28d05478 100644 (file)
@@ -76,45 +76,9 @@ USA.
   (%make-char (char-code char)
              (fix:andc (char-bits char) bits)))
 \f
-(define (unicode-char? object)
-  (and (char? object)
-       (legal-code-32? (char->integer object))))
-
-(define-guarantee unicode-char "a Unicode character")
-
-(define (unicode-scalar-value? object)
-  (and (index-fixnum? object)
-       (fix:< object char-code-limit)
-       (not (surrogate? object))
-       (not (non-character? object))))
-
-(define-guarantee unicode-scalar-value "a Unicode scalar value")
-
-(define (unicode-char->scalar-value char)
-  (let ((cp (char->integer char)))
-    (if (not (legal-code-32? cp))
-       (error:not-unicode-char char 'char-utf8-byte-length))
-    cp))
-
-(define-integrable (legal-code-32? pt)
-  (and (fix:< pt char-code-limit)
-       (not (surrogate? pt))
-       (not (non-character? pt))))
-
-(define-integrable (legal-code-16? pt)
-  (and (not (surrogate? pt))
-       (not (non-character? pt))))
-
-(define-integrable (surrogate? pt)
-  (and (fix:<= #xD800 pt) (fix:< pt #xDFFF)))
-
-(define-integrable (non-character? pt)
-  (or (and (fix:<= #xFDD0 pt) (fix:< pt #xFDF0))
-      (fix:= (fix:and #xFFFE pt) #xFFFE)))
-
 (define (8-bit-char? object)
   (and (char? object)
-       (fix:< (char->integer object) 256)))
+       (fix:< (char->integer object) #x100)))
 
 (define (guarantee-8-bit-char object #!optional caller)
   caller
@@ -123,7 +87,7 @@ USA.
 
 (define (char-ascii? char)
   (let ((n (char->integer char)))
-    (and (fix:< n 256)
+    (and (fix:< n #x100)
         n)))
 
 (define (char->ascii char)
@@ -131,84 +95,12 @@ USA.
   (char->integer char))
 
 (define (ascii->char code)
-  (guarantee-limited-index-fixnum code 256 'ASCII->CHAR)
+  (guarantee-limited-index-fixnum code #x100 'ASCII->CHAR)
   (%make-char code 0))
 
 (define (chars->ascii chars)
   (map char->ascii chars))
-\f
-;;;; UTF-{8,16,32} encoders
-
-(define (char-utf8-byte-length char)
-  (let ((sv (unicode-char->scalar-value char)))
-    (cond ((fix:< sv #x00000080) 1)
-         ((fix:< sv #x00000800) 2)
-         ((fix:< sv #x00010000) 3)
-         (else 4))))
-
-(define (char-utf8-bytes! bytes index char)
-  (let ((sv (unicode-char->scalar-value char)))
-
-    (define-integrable (initial-byte n-bits offset)
-      (fix:or (fix:and (fix:lsh #xFF (fix:+ n-bits 1)) #xFF)
-             (fix:lsh sv (fix:- 0 offset))))
-
-    (define-integrable (trailing-byte offset)
-      (fix:or #x80 (fix:and (fix:lsh sv (fix:- 0 offset)) #x3F)))
-
-    (cond ((fix:< sv #x00000080)
-          (bytevector-u8-set! bytes index sv)
-          (fix:+ index 1))
-         ((fix:< sv #x00000800)
-          (bytevector-u8-set! bytes index (initial-byte 5 6))
-          (bytevector-u8-set! bytes (fix:+ index 1) (trailing-byte 0))
-          (fix:+ index 2))
-         ((fix:< sv #x00010000)
-          (bytevector-u8-set! bytes index (initial-byte 4 12))
-          (bytevector-u8-set! bytes (fix:+ index 1) (trailing-byte 6))
-          (bytevector-u8-set! bytes (fix:+ index 2) (trailing-byte 0))
-          (fix:+ index 3))
-         (else
-          (bytevector-u8-set! bytes index (initial-byte 3 18))
-          (bytevector-u8-set! bytes (fix:+ index 1) (trailing-byte 12))
-          (bytevector-u8-set! bytes (fix:+ index 2) (trailing-byte 6))
-          (bytevector-u8-set! bytes (fix:+ index 3) (trailing-byte 0))
-          (fix:+ index 4)))))
 
-(define (char-utf16-byte-length char)
-  (if (fix:< (unicode-char->scalar-value char) #x00010000) 2 4))
-
-(define (char-utf16be-bytes! bytes index char)
-  (let ((sv (unicode-char->scalar-value char)))
-    (cond ((fix:< sv #x10000)
-          (bytevector-u16be-set! bytes index sv)
-          (fix:+ index 2))
-         (else
-          (receive (h l) (split-into-utf16-surrogates sv)
-            (bytevector-u16be-set! bytes index h)
-            (bytevector-u16be-set! bytes (fix:+ index 1) l))
-          (fix:+ index 4)))))
-
-(define (char-utf16le-bytes! bytes index char)
-  (let ((sv (unicode-char->scalar-value char)))
-    (cond ((fix:< sv #x10000)
-          (bytevector-u16le-set! bytes index sv)
-          (fix:+ index 2))
-         (else
-          (receive (h l) (split-into-utf16-surrogates sv)
-            (bytevector-u16le-set! bytes index l)
-            (bytevector-u16le-set! bytes (fix:+ index 1) h))
-          (fix:+ index 4)))))
-
-(define (char-utf32-byte-length char)
-  (if (fix:< (unicode-char->scalar-value char) #x00010000) 2 4))
-
-(define (char-utf32be-bytes! bytes index char)
-  (bytevector-u32be-set! bytes index (unicode-char->scalar-value char)))
-
-(define (char-utf32le-bytes! bytes index char)
-  (bytevector-u32le-set! bytes index (unicode-char->scalar-value char)))
-\f
 (define (char=? x y)
   (fix:= (char->integer x) (char->integer y)))
 
@@ -449,4 +341,266 @@ USA.
 (define char-bit:meta #x01)
 (define char-bit:control #x02)
 (define char-bit:super #x04)
-(define char-bit:hyper #x08)
\ No newline at end of file
+(define char-bit:hyper #x08)
+\f
+;;;; Unicode characters
+
+(define (unicode-char? object)
+  (and (char? object)
+       (legal-code-32? (char->integer object))))
+
+(define (unicode-scalar-value? object)
+  (and (index-fixnum? object)
+       (legal-code-32? object)))
+
+(define-guarantee unicode-char "a Unicode character")
+(define-guarantee unicode-scalar-value "a Unicode scalar value")
+
+(define (unicode-char->scalar-value char #!optional caller)
+  (let ((cp (char->integer char)))
+    (if (not (legal-code-32? cp))
+       (error:not-a unicode-char? char caller))
+    cp))
+
+(define-integrable (legal-code-32? cp)
+  (and (fix:< cp char-code-limit)
+       (not (utf16-surrogate? cp))
+       (not (non-character? cp))))
+
+(define (legal-code-16? pt)
+  (and (not (utf16-surrogate? pt))
+       (not (non-character? pt))))
+
+(define-integrable (utf16-surrogate? cp)
+  (fix:= #xD800 (fix:and #xF800 cp)))
+
+(define-integrable (utf16-high-surrogate? cp)
+  (fix:= #xD800 (fix:and #xFC00 cp)))
+
+(define-integrable (utf16-low-surrogate? cp)
+  (fix:= #xDC00 (fix:and #xFC00 cp)))
+
+(define-integrable (non-character? cp)
+  (or (and (fix:<= #xFDD0 cp) (fix:< cp #xFDF0))
+      (fix:= #xFFFE (fix:and #xFFFE cp))))
+
+(define-integrable (guarantee-cp-is-character cp)
+  (if (non-character? cp)
+      (error "Code point is a non-character:" cp)))
+
+(define-integrable (guarantee-cp-in-range cp)
+  (if (not (fix:< cp char-code-limit))
+      (error "Value is not a code point:" cp)))
+
+(define-integrable (guarantee-cp-not-utf16-surrogate cp)
+  (if (utf16-surrogate? cp)
+      (error "Code point is a UTF-16 surrogate:" cp)))
+
+(define-integrable (extract-bits word mask shift)
+  (fix:lsh (fix:and word mask) shift))
+
+(define-integrable (insert-bits word mask shift)
+  (fix:and (fix:lsh word shift) mask))
+\f
+;;;; UTF-{8,16,32} encoders
+
+(define (char-utf8-byte-length char)
+  (let ((sv (unicode-char->scalar-value char 'char-utf8-byte-length)))
+    (cond ((fix:< sv #x80) 1)
+         ((fix:< sv #x800) 2)
+         ((fix:< sv #x10000) 3)
+         (else 4))))
+
+(define (encode-utf8-char! bytes index char)
+  (let ((sv (unicode-char->scalar-value char 'encode-utf8-char!)))
+
+    (define-integrable (initial-byte leader offset)
+      (fix:or leader (fix:lsh sv offset)))
+
+    (define-integrable (trailing-byte offset)
+      (fix:or #x80 (insert-bits sv #x3F offset)))
+
+    (cond ((fix:< sv #x00000080)
+          (bytevector-u8-set! bytes index sv)
+          (fix:+ index 1))
+         ((fix:< sv #x00000800)
+          (bytevector-u8-set! bytes index (initial-byte #xC0 -6))
+          (bytevector-u8-set! bytes (fix:+ index 1) (trailing-byte 0))
+          (fix:+ index 2))
+         ((fix:< sv #x00010000)
+          (bytevector-u8-set! bytes index (initial-byte #xE0 -12))
+          (bytevector-u8-set! bytes (fix:+ index 1) (trailing-byte -6))
+          (bytevector-u8-set! bytes (fix:+ index 2) (trailing-byte 0))
+          (fix:+ index 3))
+         (else
+          (bytevector-u8-set! bytes index (initial-byte #xF0 -18))
+          (bytevector-u8-set! bytes (fix:+ index 1) (trailing-byte -12))
+          (bytevector-u8-set! bytes (fix:+ index 2) (trailing-byte -6))
+          (bytevector-u8-set! bytes (fix:+ index 3) (trailing-byte 0))
+          (fix:+ index 4)))))
+
+(define (char-utf16-byte-length char)
+  (if (fix:< (unicode-char->scalar-value char 'char-utf16-byte-length) #x10000)
+      2
+      4))
+
+(define (utf16-char-encoder setter caller)
+  (lambda (bytes index char)
+    (let ((sv (unicode-char->scalar-value char caller)))
+      (cond ((fix:< sv #x10000)
+            (setter bytes index sv)
+            (fix:+ index 2))
+           (else
+            (let ((n (fix:- sv #x10000)))
+              (setter bytes index
+                      (fix:or #xD800 (insert-bits n #x3FF -10)))
+              (setter bytes (fix:+ index 2)
+                      (fix:or #xDC00 (insert-bits n #x3FF 0))))
+            (fix:+ index 4))))))
+
+(define encode-utf16be-char!
+  (utf16-char-encoder bytevector-u16be-set! 'encode-utf16be-char!))
+
+(define encode-utf16le-char!
+  (utf16-char-encoder bytevector-u16le-set! 'encode-utf16le-char!))
+
+(define (char-utf32-byte-length char)
+  (unicode-char->scalar-value char 'char-utf32-byte-length)
+  4)
+
+(define (utf32-char-encoder setter caller)
+  (lambda (bytes index char)
+    (setter bytes index (unicode-char->scalar-value char caller))))
+
+(define encode-utf32be-char!
+  (utf32-char-encoder bytevector-u32be-set! 'encode-utf32be-char!))
+
+(define encode-utf32le-char!
+  (utf32-char-encoder bytevector-u32le-set! 'encode-utf32le-char!))
+\f
+;;;; UTF-{8,16,32} decoders
+
+(define (initial-byte->utf8-char-length byte)
+  (guarantee byte? byte 'initial-byte->utf8-char-length)
+  (cond ((utf8-initial-byte-1? byte) 1)
+       ((utf8-initial-byte-2? byte) 2)
+       ((utf8-initial-byte-3? byte) 3)
+       ((utf8-initial-byte-4? byte) 4)
+       (else (error "Illegal UTF-8 initial byte:" byte))))
+
+(define (decode-utf8-char bytes index)
+  (integer->char
+   (let ((b0 (bytevector-u8-ref bytes index)))
+     (cond ((utf8-initial-byte-1? b0)
+           b0)
+          ((utf8-initial-byte-2? b0)
+           (decode-utf8-2 b0
+                          (bytevector-u8-ref bytes (fix:+ index 1))))
+          ((utf8-initial-byte-3? b0)
+           (decode-utf8-3 b0
+                          (bytevector-u8-ref bytes (fix:+ index 1))
+                          (bytevector-u8-ref bytes (fix:+ index 2))))
+          ((utf8-initial-byte-4? b0)
+           (decode-utf8-4 b0
+                          (bytevector-u8-ref bytes (fix:+ index 1))
+                          (bytevector-u8-ref bytes (fix:+ index 2))
+                          (bytevector-u8-ref bytes (fix:+ index 3))))
+          (else
+           (error "Illegal UTF-8 initial byte:" b0))))))
+
+(define (decode-utf8-2 b0 b1)
+  (if (not (and (fix:> b0 #xC1)
+               (utf8-trailing-byte? b1)))
+      (error "Ill-formed UTF-8 sequence:" b0 b1))
+  (fix:or (extract-bits b0 #x1F 6)
+         (extract-bits b1 #x3F 0)))
+
+(define (decode-utf8-3 b0 b1 b2)
+  (if (not (and (or (fix:> b0 #xE0) (fix:> b1 #x9F))
+               (utf8-trailing-byte? b1)
+               (utf8-trailing-byte? b2)))
+      (error "Ill-formed UTF-8 sequence:" b0 b1 b2))
+  (let ((cp
+        (fix:or (fix:or (extract-bits b0 #x0F 12)
+                        (extract-bits b1 #x3F 6))
+                (extract-bits b2 #x3F 0))))
+    (guarantee-cp-not-utf16-surrogate cp)
+    (guarantee-cp-is-character cp)
+    cp))
+
+(define (decode-utf8-4 b0 b1 b2 b3)
+  (if (not (and (or (fix:> b0 #xF0) (fix:> b1 #x8F))
+               (utf8-trailing-byte? b1)
+               (utf8-trailing-byte? b2)
+               (utf8-trailing-byte? b3)))
+      (error "Ill-formed UTF-8 sequence:" b0 b1 b2 b3))
+  (let ((cp
+        (fix:or (fix:or (extract-bits b0 #x07 18)
+                        (extract-bits b1 #x3F 12))
+                (fix:or (extract-bits b2 #x3F 6)
+                        (extract-bits b3 #x3F 0)))))
+    (guarantee-cp-in-range cp)
+    (guarantee-cp-is-character cp)
+    cp))
+
+(define-integrable (utf8-initial-byte-1? byte)
+  (fix:= #x00 (fix:and #x80 byte)))
+
+(define-integrable (utf8-initial-byte-2? byte)
+  (fix:= #xC0 (fix:and #xE0 byte)))
+
+(define-integrable (utf8-initial-byte-3? byte)
+  (fix:= #xE0 (fix:and #xF0 byte)))
+
+(define-integrable (utf8-initial-byte-4? byte)
+  (fix:= #xF0 (fix:and #xF8 byte)))
+
+(define-integrable (utf8-trailing-byte? byte)
+  (fix:= #x80 (fix:and #xC0 byte)))
+\f
+(define (initial-u16->utf16-char-length u16)
+  (guarantee u16? u16 'initial-u16->utf16-char-length)
+  (if (utf16-high-surrogate? u16) 4 2))
+
+(define (utf16-char-decoder getter)
+  (lambda (bytes index)
+    (let ((d0 (getter bytes index)))
+      (if (utf16-low-surrogate? d0)
+         (error "Ill-formed UTF-16 sequence:" d0))
+      (let ((cp
+            (if (utf16-high-surrogate? d0)
+                (let ((d1 (getter bytes (fix:+ index 2))))
+                  (if (not (utf16-low-surrogate? d1))
+                      (error "Ill-formed UTF-16 sequence:" d0 d1))
+                  (fix:+ (fix:or (extract-bits d0 #x3FF 10)
+                                 (extract-bits d1 #x3FF 0))
+                         #x10000))
+                d0)))
+       (guarantee-cp-in-range cp)
+       (guarantee-cp-is-character cp)
+       (integer->char cp)))))
+
+(define decode-utf16be-char
+  (utf16-char-decoder bytevector-u16be-ref))
+
+(define decode-utf16le-char
+  (utf16-char-decoder bytevector-u16le-ref))
+
+(define (initial-u32->utf32-char-length u32)
+  (guarantee u32? u32 'initial-u32->utf32-char-length)
+  4)
+
+(define (utf32-char-decoder getter)
+  (lambda (bytes index)
+    (let ((u32 (getter bytes index)))
+      (if (not (< u32 char-code-limit))
+         (error "Value is not a code point:" u32))
+      (guarantee-cp-not-utf16-surrogate u32)
+      (guarantee-cp-is-character u32)
+      (integer->char u32))))
+
+(define decode-utf32be-char
+  (utf32-char-decoder bytevector-u32be-ref))
+
+(define decode-utf32le-char
+  (utf32-char-decoder bytevector-u32le-ref))
\ No newline at end of file
index 377b27de5fdabb7b335a84e034e3331212b84f97..eed707df46316ec8cfab98b4852a6f7c5537eef7 100644 (file)
@@ -442,6 +442,7 @@ USA.
    ((RUNTIME NUMBER) INITIALIZE-DRAGON4!)
    (RUNTIME MISCELLANEOUS-GLOBAL)
    (RUNTIME CHARACTER)
+   (RUNTIME BYTEVECTOR)
    (RUNTIME CHARACTER-SET)
    (RUNTIME GENSYM)
    (RUNTIME STREAM)
index 2c7926eafd53cee8cefc06f92b6b1cb875606da7..0f3eef8259381fa25b50cc6f80b693b2f8746e68 100644 (file)
@@ -1162,9 +1162,17 @@ USA.
          bytevector=?
          bytevector?
          make-bytevector
+         string->utf16be
+         string->utf16le
+         string->utf32be
+         string->utf32le
          string->utf8
          u16?
          u32?
+         utf16be->string
+         utf16le->string
+         utf32be->string
+         utf32le->string
          utf8->string)
   (export (runtime predicate-metadata)
          register-mit-bytevector-predicates!))
@@ -1278,13 +1286,8 @@ USA.
          char-integer-limit
          char-upcase
          char-utf16-byte-length
-         char-utf16be-bytes!
-         char-utf16le-bytes!
          char-utf32-byte-length
-         char-utf32be-bytes!
-         char-utf32le-bytes!
          char-utf8-byte-length
-         char-utf8-bytes!
          char<=?
          char<?
          char=?
@@ -1294,8 +1297,21 @@ USA.
          chars->ascii
          clear-char-bits
          code->char
+         decode-utf16be-char
+         decode-utf16le-char
+         decode-utf32be-char
+         decode-utf32le-char
+         decode-utf8-char
          digit->char
+         encode-utf16be-char!
+         encode-utf16le-char!
+         encode-utf32be-char!
+         encode-utf32le-char!
+         encode-utf8-char!
          guarantee-8-bit-char
+         initial-byte->utf8-char-length
+         initial-u16->utf16-char-length
+         initial-u32->utf32-char-length
          integer->char
          make-char
          name->char