Implement UTF-{16,32} converters that use BOM.

author Chris Hanson <org/chris-hanson/cph>

Mon, 18 Nov 2019 00:48:18 +0000 (16:48 -0800)

committer Chris Hanson <org/chris-hanson/cph>

Mon, 18 Nov 2019 00:48:18 +0000 (16:48 -0800)
author Chris Hanson <org/chris-hanson/cph>
Mon, 18 Nov 2019 00:48:18 +0000 (16:48 -0800)
committer Chris Hanson <org/chris-hanson/cph>
Mon, 18 Nov 2019 00:48:18 +0000 (16:48 -0800)
diff --git a/src/runtime/bytevector.scm b/src/runtime/bytevector.scm

index fce5ca9b3d1ba47f5a6fda972457f2b535aaabc0..c4fecb95a1524b42db6c10086a4a4cb888a3a5b2 100644 (file)
--- a/src/runtime/bytevector.scm
+++ b/src/runtime/bytevector.scm
@@ -294,19 +294,21 @@ USA.
    (bytevector-u8-set! bytevector (fix:+ index 3) (u32le-byte3 u32)))
  \f
  (define-integrable (string-encoder char-byte-length allocator encode-char!
-                                  caller)
+                                  bom? caller)
    (lambda (string #!optional start end)
      (let* ((end (fix:end-index end (string-length string) caller))
            (start (fix:start-index start end caller)))
        (let ((bytes
              (allocator
-             (let loop ((index start) (n-bytes 0))
+             (let loop ((index start)
+                        (n-bytes (if bom? (char-byte-length #\bom) 0)))
                 (if (fix:< index end)
                     (loop (fix:+ index 1)
                           (fix:+ n-bytes
                                  (char-byte-length (string-ref string index))))
                     n-bytes)))))
-       (let loop ((from start) (to 0))
+       (let loop ((from start)
+                  (to (if bom? (encode-char! bytes 0 #\bom) 0)))
           (if (fix:< from end)
               (loop (fix:+ from 1)
                     (encode-char! bytes to (string-ref string from)))))
@@ -319,25 +321,47 @@ USA.
  (define string->utf8)
  (define string->utf16be)
  (define string->utf16le)
+(define string->utf16be+bom)
+(define string->utf16le+bom)
+(define string->utf16)
  (define string->utf32be)
  (define string->utf32le)
+(define string->utf32be+bom)
+(define string->utf32le+bom)
+(define string->utf32)
  (add-boot-init!
   (lambda ()
     (set! string->utf8
          (string-encoder char-utf8-byte-length utf8-allocator
-                        encode-utf8-char! 'string->utf8))
+                        encode-utf8-char! #f 'string->utf8))
     (set! string->utf16be
          (string-encoder char-utf16-byte-length allocate-bytevector
-                        encode-utf16be-char! 'string->utf16be))
+                        encode-utf16be-char! #f 'string->utf16be))
     (set! string->utf16le
          (string-encoder char-utf16-byte-length allocate-bytevector
-                        encode-utf16le-char! 'string->utf16le))
+                        encode-utf16le-char! #f 'string->utf16le))
+   (set! string->utf16be+bom
+        (string-encoder char-utf16-byte-length allocate-bytevector
+                        encode-utf16be-char! #t 'string->utf16))
+   (set! string->utf16le+bom
+        (string-encoder char-utf16-byte-length allocate-bytevector
+                        encode-utf16le-char! #t 'string->utf16))
+   (set! string->utf16
+        (if (host-big-endian?) string->utf16be+bom string->utf16le+bom))
     (set! string->utf32be
          (string-encoder char-utf32-byte-length allocate-bytevector
-                        encode-utf32be-char! 'string->utf32be))
+                        encode-utf32be-char! #f 'string->utf32be))
     (set! string->utf32le
          (string-encoder char-utf32-byte-length allocate-bytevector
-                        encode-utf32le-char! 'string->utf32le))
+                        encode-utf32le-char! #f 'string->utf32le))
+   (set! string->utf32be+bom
+        (string-encoder char-utf32-byte-length allocate-bytevector
+                        encode-utf32be-char! #t 'string->utf32))
+   (set! string->utf32le+bom
+        (string-encoder char-utf32-byte-length allocate-bytevector
+                        encode-utf32le-char! #t 'string->utf32))
+   (set! string->utf32
+        (if (host-big-endian?) string->utf32be+bom string->utf32le+bom))
     unspecific))
  \f
  (define-integrable (bytes-decoder getter initial->length decode-char step noun
@@ -388,17 +412,59 @@ USA.
                         decode-utf8-char 1 "UTF-8" 'utf8->string))
     (set! utf16be->string
          (bytes-decoder bytevector-u16be-ref initial-u16->utf16-char-length
-                       decode-utf16be-char 1 "UTF-16BE" 'utf16be->string))
+                       decode-utf16be-char 2 "UTF-16BE" 'utf16be->string))
     (set! utf16le->string
          (bytes-decoder bytevector-u16le-ref initial-u16->utf16-char-length
-                       decode-utf16le-char 1 "UTF-16LE" 'utf16le->string))
+                       decode-utf16le-char 2 "UTF-16LE" 'utf16le->string))
     (set! utf32be->string
          (bytes-decoder bytevector-u32be-ref initial-u32->utf32-char-length
-                       decode-utf32be-char 1 "UTF-32BE" 'utf32be->string))
+                       decode-utf32be-char 4 "UTF-32BE" 'utf32be->string))
     (set! utf32le->string
          (bytes-decoder bytevector-u32le-ref initial-u32->utf32-char-length
-                       decode-utf32le-char 1 "UTF-32LE" 'utf32le->string))
+                       decode-utf32le-char 4 "UTF-32LE" 'utf32le->string))
     unspecific))
+\f
+(define (utf16->string bytevector #!optional start end replace?)
+  (let* ((end (fix:end-index end (bytevector-length bytevector) 'utf16->string))
+        (start (fix:start-index start end 'utf16->string)))
+
+    (define (default)
+      (if (host-big-endian?)
+         (utf16be->string bytevector start end replace?)
+         (utf16le->string bytevector start end replace?)))
+
+    (if (fix:<= (fix:+ start 2) end)
+       (let ((b0 (bytevector-u8-ref bytevector start))
+             (b1 (bytevector-u8-ref bytevector (fix:+ start 1))))
+         (cond ((and (fix:= b0 #xFE) (fix:= b1 #xFF))
+                (utf16be->string bytevector (fix:+ start 2) end replace?))
+               ((and (fix:= b0 #xFF) (fix:= b1 #xFE))
+                (utf16le->string bytevector (fix:+ start 2) end replace?))
+               (else
+                (default))))
+       (default))))
+
+(define (utf32->string bytevector #!optional start end replace?)
+  (let* ((end (fix:end-index end (bytevector-length bytevector) 'utf32->string))
+        (start (fix:start-index start end 'utf32->string)))
+
+    (define (default)
+      (if (host-big-endian?)
+         (utf32be->string bytevector start end replace?)
+         (utf32le->string bytevector start end replace?)))
+
+    (if (fix:<= (fix:+ start 4) end)
+       (let ((b0 (bytevector-u8-ref bytevector start))
+             (b1 (bytevector-u8-ref bytevector (fix:+ start 1)))
+             (b2 (bytevector-u8-ref bytevector (fix:+ start 2)))
+             (b3 (bytevector-u8-ref bytevector (fix:+ start 3))))
+         (cond ((and (fix:= b0 0) (fix:= b1 0) (fix:= b2 #xFE) (fix:= b3 #xFF))
+                (utf32be->string bytevector (fix:+ start 4) end replace?))
+               ((and (fix:= b0 #xFF) (fix:= b1 #xFE) (fix:= b2 0) (fix:= b3 0))
+                (utf32le->string bytevector (fix:+ start 4) end replace?))
+               (else
+                (default))))
+       (default))))
  
  (define (string->iso8859-1 string #!optional start end)
    (let* ((end (fix:end-index end (string-length string) 'string->iso8859-1))
diff --git a/src/runtime/runtime.pkg b/src/runtime/runtime.pkg

index 8e6aa938de75bf9917dc315b4c176816bbea5fa9..1511851512cd70d32414e5aa25915fdac75305a9 100644 (file)
--- a/src/runtime/runtime.pkg
+++ b/src/runtime/runtime.pkg
@@ -1271,16 +1271,20 @@ USA.
           list->bytevector
           make-bytevector
           string->iso8859-1
+         string->utf16
           string->utf16be
           string->utf16le
+         string->utf32
           string->utf32be
           string->utf32le
           string->utf8
           u16?
           u32?
           u8?
+         utf16->string
           utf16be->string
           utf16le->string
+         utf32->string
           utf32be->string
           utf32le->string
           utf8->string
diff --git a/tests/runtime/test-bytevector.scm b/tests/runtime/test-bytevector.scm

index eca6472497395759056b88097fd5307f41e67a26..2e1e734be5e2ae3fdca060f8b531e737d677ab64 100644 (file)
--- a/tests/runtime/test-bytevector.scm
+++ b/tests/runtime/test-bytevector.scm
@@ -512,4 +512,127 @@ USA.
    (map (lambda (i)
          (declare (ignore i))
          (random #x100))
-       (iota (random (+ max-length 1)))))
-\ No newline at end of file
+       (iota (random (+ max-length 1)))))
+\f
+;; These tests taken from SRFI 140.
+(define-test 'utf-converters
+  (lambda ()
+    (assert-equal (string->utf8 "abc")
+                 '#u8(97 98 99))
+    (assert-equal (string->utf8 "xxxabcyyyzzz" 3)
+                 '#u8(97 98 99 121 121 121 122 122 122))
+    (assert-equal (string->utf8 "xxxabcyyyzzz" 3 6)
+                 '#u8(97 98 99))
+
+    (assert-equal (cond-expand (big-endian '#u8(254 255 0 97 0 98 0 99))
+                              (else '#u8(255 254 97 0 98 0 99 0)))
+                 (string->utf16 "abc"))
+    (assert-equal
+     (cond-expand
+       (big-endian
+       '#u8(254 255 0 97 0 98 0 99 0 121 0 121 0 121 0 122 0 122 0 122))
+       (else
+       '#u8(255  254 97 0 98 0 99 0 121 0 121 0 121 0 122 0 122 0 122 0)))
+     (string->utf16 "xxxabcyyyzzz" 3))
+    (assert-equal (cond-expand (big-endian '#u8(254 255 0 97 0 98 0 99))
+                               (else '#u8(255 254 97 0 98 0 99 0)))
+                 (string->utf16 "xxxabcyyyzzz" 3 6))
+
+    (assert-equal (string->utf16be "abc")
+                 '#u8(0 97 0 98 0 99))
+    (assert-equal (string->utf16be "xxxabcyyyzzz" 3)
+                 '#u8(0 97 0 98 0 99 0 121 0 121 0 121 0 122 0 122 0 122))
+    (assert-equal (string->utf16be "xxxabcyyyzzz" 3 6)
+                 '#u8(0 97 0 98 0 99))
+
+    (assert-equal (string->utf16le "abc")
+                 '#u8(97 0 98 0 99 0))
+    (assert-equal (string->utf16le "xxxabcyyyzzz" 3)
+                 '#u8(97 0 98 0 99 0 121 0 121 0 121 0 122 0 122 0 122 0))
+    (assert-equal (string->utf16le "xxxabcyyyzzz" 3 6)
+                 '#u8(97 0 98 0 99 0))
+
+    (assert-equal (utf8->string '#u8(97 98 99))
+                 "abc")
+    (assert-equal (utf8->string '#u8(0 1 2 97 98 99 121 121 121 122 122 122) 3)
+                 "abcyyyzzz")
+    (assert-equal (utf8->string '#u8(41 42 43 97 98 99 100 101 102) 3 6)
+                 "abc")
+
+    (assert-equal (utf16->string '#u8(254 255 0 97 0 98 0 99))
+                 "abc")
+    (assert-equal (utf16->string '#u8(255 254 97 0 98 0 99 0))
+                 "abc")
+
+    (assert-equal (utf16->string (string->utf16 "abc") 2)
+                 "abc")
+    (assert-equal (utf16->string (string->utf16 "abcdef") 4)
+                 "bcdef")
+    (assert-equal (utf16->string (string->utf16 "abcdef") 4 10)
+                 "bcd")
+
+    (assert-equal (utf16be->string '#u8(0 97 0 98 0 99))
+                 "abc")
+    (assert-equal (utf16be->string (string->utf16be "abc") 2)
+                 "bc")
+    (assert-equal (utf16be->string (string->utf16be "abcdef") 2 8)
+                 "bcd")
+
+    (assert-equal (utf16le->string '#u8(97 0 98 0 99 0))
+                 "abc")
+    (assert-equal (utf16le->string (string->utf16le "abc") 2)
+                 "bc")
+    (assert-equal (utf16le->string (string->utf16le "abcdef") 2 8)
+                 "bcd")))
+\f
+;; These tests taken from SRFI 140.
+(define-test 'utf-converters-beyond-bmp
+  (lambda ()
+    (assert-equal (string->utf8 beyond-bmp)
+                 '#u8(97 195 128 206 191
+                         240 157 145 129 240 157 132 147 240 157 132 144 122))
+
+    (if (host-big-endian?)
+       (assert-equal
+        (string->utf16 beyond-bmp)
+        '#u8(254 255 0 97 0 192 3 191
+                 216 53 220 65 216 52 221 19 216 52 221 16 0 122))
+       (assert-equal
+        (string->utf16 beyond-bmp)
+        '#u8(255 254 97 0 192 0 191 3
+                 53 216 65 220 52 216 19 221 52 216 16 221 122 0)))
+
+    (assert-equal
+     (string->utf16be beyond-bmp)
+     '#u8(0 97 0 192 3 191 216 53 220 65 216 52 221 19 216 52 221 16 0 122))
+
+    (assert-equal
+     (string->utf16le beyond-bmp)
+     '#u8(97 0 192 0 191 3 53 216 65 220 52 216 19 221 52 216 16 221 122 0))
+
+    (assert-equal
+     (utf8->string
+      '#u8(97 195 128 206 191
+              240 157 145 129 240 157 132 147 240 157 132 144 122))
+     beyond-bmp)
+
+    (assert-equal (utf16->string (string->utf16 beyond-bmp))
+                 beyond-bmp)
+
+    (assert-equal (utf16->string (string->utf16 beyond-bmp) 2)
+                 beyond-bmp)
+
+    (assert-equal (utf16be->string (string->utf16be beyond-bmp)) beyond-bmp)
+
+    (assert-equal (utf16le->string (string->utf16le beyond-bmp)) beyond-bmp)
+
+    (assert-equal (utf16be->string '#u8(254 255 0 97 0 98 0 99))
+                 (string-append (string (integer->char #xfeff)) "abc"))
+
+    (assert-equal (utf16le->string '#u8(255 254 97 0 98 0 99 0))
+                 (string-append (string (integer->char #xfeff)) "abc"))))
+
+(define beyond-bmp
+  (list->string (map integer->char
+                    '(#x61 #xc0 #x3bf
+                           #x1d441 #x1d113 #x1d110 #x7a))))
+\ No newline at end of file
author	Chris Hanson <org/chris-hanson/cph>
	Mon, 18 Nov 2019 00:48:18 +0000 (16:48 -0800)
committer	Chris Hanson <org/chris-hanson/cph>
	Mon, 18 Nov 2019 00:48:18 +0000 (16:48 -0800)
src/runtime/bytevector.scm		patch \| blob \| history
src/runtime/runtime.pkg		patch \| blob \| history
tests/runtime/test-bytevector.scm		patch \| blob \| history