From 64664b958aab1b15cca967c80852fd472a5c28c1 Mon Sep 17 00:00:00 2001 From: Chris Hanson Date: Sun, 17 Nov 2019 16:48:18 -0800 Subject: [PATCH] Implement UTF-{16,32} converters that use BOM. This is needed for support of SRFI 140. --- src/runtime/bytevector.scm | 90 ++++++++++++++++++--- src/runtime/runtime.pkg | 4 + tests/runtime/test-bytevector.scm | 125 +++++++++++++++++++++++++++++- 3 files changed, 206 insertions(+), 13 deletions(-) diff --git a/src/runtime/bytevector.scm b/src/runtime/bytevector.scm index fce5ca9b3..c4fecb95a 100644 --- a/src/runtime/bytevector.scm +++ b/src/runtime/bytevector.scm @@ -294,19 +294,21 @@ USA. (bytevector-u8-set! bytevector (fix:+ index 3) (u32le-byte3 u32))) (define-integrable (string-encoder char-byte-length allocator encode-char! - caller) + bom? caller) (lambda (string #!optional start end) (let* ((end (fix:end-index end (string-length string) caller)) (start (fix:start-index start end caller))) (let ((bytes (allocator - (let loop ((index start) (n-bytes 0)) + (let loop ((index start) + (n-bytes (if bom? (char-byte-length #\bom) 0))) (if (fix:< index end) (loop (fix:+ index 1) (fix:+ n-bytes (char-byte-length (string-ref string index)))) n-bytes))))) - (let loop ((from start) (to 0)) + (let loop ((from start) + (to (if bom? (encode-char! bytes 0 #\bom) 0))) (if (fix:< from end) (loop (fix:+ from 1) (encode-char! bytes to (string-ref string from))))) @@ -319,25 +321,47 @@ USA. (define string->utf8) (define string->utf16be) (define string->utf16le) +(define string->utf16be+bom) +(define string->utf16le+bom) +(define string->utf16) (define string->utf32be) (define string->utf32le) +(define string->utf32be+bom) +(define string->utf32le+bom) +(define string->utf32) (add-boot-init! (lambda () (set! string->utf8 (string-encoder char-utf8-byte-length utf8-allocator - encode-utf8-char! 'string->utf8)) + encode-utf8-char! #f 'string->utf8)) (set! string->utf16be (string-encoder char-utf16-byte-length allocate-bytevector - encode-utf16be-char! 'string->utf16be)) + encode-utf16be-char! #f 'string->utf16be)) (set! string->utf16le (string-encoder char-utf16-byte-length allocate-bytevector - encode-utf16le-char! 'string->utf16le)) + encode-utf16le-char! #f 'string->utf16le)) + (set! string->utf16be+bom + (string-encoder char-utf16-byte-length allocate-bytevector + encode-utf16be-char! #t 'string->utf16)) + (set! string->utf16le+bom + (string-encoder char-utf16-byte-length allocate-bytevector + encode-utf16le-char! #t 'string->utf16)) + (set! string->utf16 + (if (host-big-endian?) string->utf16be+bom string->utf16le+bom)) (set! string->utf32be (string-encoder char-utf32-byte-length allocate-bytevector - encode-utf32be-char! 'string->utf32be)) + encode-utf32be-char! #f 'string->utf32be)) (set! string->utf32le (string-encoder char-utf32-byte-length allocate-bytevector - encode-utf32le-char! 'string->utf32le)) + encode-utf32le-char! #f 'string->utf32le)) + (set! string->utf32be+bom + (string-encoder char-utf32-byte-length allocate-bytevector + encode-utf32be-char! #t 'string->utf32)) + (set! string->utf32le+bom + (string-encoder char-utf32-byte-length allocate-bytevector + encode-utf32le-char! #t 'string->utf32)) + (set! string->utf32 + (if (host-big-endian?) string->utf32be+bom string->utf32le+bom)) unspecific)) (define-integrable (bytes-decoder getter initial->length decode-char step noun @@ -388,17 +412,59 @@ USA. decode-utf8-char 1 "UTF-8" 'utf8->string)) (set! utf16be->string (bytes-decoder bytevector-u16be-ref initial-u16->utf16-char-length - decode-utf16be-char 1 "UTF-16BE" 'utf16be->string)) + decode-utf16be-char 2 "UTF-16BE" 'utf16be->string)) (set! utf16le->string (bytes-decoder bytevector-u16le-ref initial-u16->utf16-char-length - decode-utf16le-char 1 "UTF-16LE" 'utf16le->string)) + decode-utf16le-char 2 "UTF-16LE" 'utf16le->string)) (set! utf32be->string (bytes-decoder bytevector-u32be-ref initial-u32->utf32-char-length - decode-utf32be-char 1 "UTF-32BE" 'utf32be->string)) + decode-utf32be-char 4 "UTF-32BE" 'utf32be->string)) (set! utf32le->string (bytes-decoder bytevector-u32le-ref initial-u32->utf32-char-length - decode-utf32le-char 1 "UTF-32LE" 'utf32le->string)) + decode-utf32le-char 4 "UTF-32LE" 'utf32le->string)) unspecific)) + +(define (utf16->string bytevector #!optional start end replace?) + (let* ((end (fix:end-index end (bytevector-length bytevector) 'utf16->string)) + (start (fix:start-index start end 'utf16->string))) + + (define (default) + (if (host-big-endian?) + (utf16be->string bytevector start end replace?) + (utf16le->string bytevector start end replace?))) + + (if (fix:<= (fix:+ start 2) end) + (let ((b0 (bytevector-u8-ref bytevector start)) + (b1 (bytevector-u8-ref bytevector (fix:+ start 1)))) + (cond ((and (fix:= b0 #xFE) (fix:= b1 #xFF)) + (utf16be->string bytevector (fix:+ start 2) end replace?)) + ((and (fix:= b0 #xFF) (fix:= b1 #xFE)) + (utf16le->string bytevector (fix:+ start 2) end replace?)) + (else + (default)))) + (default)))) + +(define (utf32->string bytevector #!optional start end replace?) + (let* ((end (fix:end-index end (bytevector-length bytevector) 'utf32->string)) + (start (fix:start-index start end 'utf32->string))) + + (define (default) + (if (host-big-endian?) + (utf32be->string bytevector start end replace?) + (utf32le->string bytevector start end replace?))) + + (if (fix:<= (fix:+ start 4) end) + (let ((b0 (bytevector-u8-ref bytevector start)) + (b1 (bytevector-u8-ref bytevector (fix:+ start 1))) + (b2 (bytevector-u8-ref bytevector (fix:+ start 2))) + (b3 (bytevector-u8-ref bytevector (fix:+ start 3)))) + (cond ((and (fix:= b0 0) (fix:= b1 0) (fix:= b2 #xFE) (fix:= b3 #xFF)) + (utf32be->string bytevector (fix:+ start 4) end replace?)) + ((and (fix:= b0 #xFF) (fix:= b1 #xFE) (fix:= b2 0) (fix:= b3 0)) + (utf32le->string bytevector (fix:+ start 4) end replace?)) + (else + (default)))) + (default)))) (define (string->iso8859-1 string #!optional start end) (let* ((end (fix:end-index end (string-length string) 'string->iso8859-1)) diff --git a/src/runtime/runtime.pkg b/src/runtime/runtime.pkg index 8e6aa938d..151185151 100644 --- a/src/runtime/runtime.pkg +++ b/src/runtime/runtime.pkg @@ -1271,16 +1271,20 @@ USA. list->bytevector make-bytevector string->iso8859-1 + string->utf16 string->utf16be string->utf16le + string->utf32 string->utf32be string->utf32le string->utf8 u16? u32? u8? + utf16->string utf16be->string utf16le->string + utf32->string utf32be->string utf32le->string utf8->string diff --git a/tests/runtime/test-bytevector.scm b/tests/runtime/test-bytevector.scm index eca647249..2e1e734be 100644 --- a/tests/runtime/test-bytevector.scm +++ b/tests/runtime/test-bytevector.scm @@ -512,4 +512,127 @@ USA. (map (lambda (i) (declare (ignore i)) (random #x100)) - (iota (random (+ max-length 1))))) \ No newline at end of file + (iota (random (+ max-length 1))))) + +;; These tests taken from SRFI 140. +(define-test 'utf-converters + (lambda () + (assert-equal (string->utf8 "abc") + '#u8(97 98 99)) + (assert-equal (string->utf8 "xxxabcyyyzzz" 3) + '#u8(97 98 99 121 121 121 122 122 122)) + (assert-equal (string->utf8 "xxxabcyyyzzz" 3 6) + '#u8(97 98 99)) + + (assert-equal (cond-expand (big-endian '#u8(254 255 0 97 0 98 0 99)) + (else '#u8(255 254 97 0 98 0 99 0))) + (string->utf16 "abc")) + (assert-equal + (cond-expand + (big-endian + '#u8(254 255 0 97 0 98 0 99 0 121 0 121 0 121 0 122 0 122 0 122)) + (else + '#u8(255 254 97 0 98 0 99 0 121 0 121 0 121 0 122 0 122 0 122 0))) + (string->utf16 "xxxabcyyyzzz" 3)) + (assert-equal (cond-expand (big-endian '#u8(254 255 0 97 0 98 0 99)) + (else '#u8(255 254 97 0 98 0 99 0))) + (string->utf16 "xxxabcyyyzzz" 3 6)) + + (assert-equal (string->utf16be "abc") + '#u8(0 97 0 98 0 99)) + (assert-equal (string->utf16be "xxxabcyyyzzz" 3) + '#u8(0 97 0 98 0 99 0 121 0 121 0 121 0 122 0 122 0 122)) + (assert-equal (string->utf16be "xxxabcyyyzzz" 3 6) + '#u8(0 97 0 98 0 99)) + + (assert-equal (string->utf16le "abc") + '#u8(97 0 98 0 99 0)) + (assert-equal (string->utf16le "xxxabcyyyzzz" 3) + '#u8(97 0 98 0 99 0 121 0 121 0 121 0 122 0 122 0 122 0)) + (assert-equal (string->utf16le "xxxabcyyyzzz" 3 6) + '#u8(97 0 98 0 99 0)) + + (assert-equal (utf8->string '#u8(97 98 99)) + "abc") + (assert-equal (utf8->string '#u8(0 1 2 97 98 99 121 121 121 122 122 122) 3) + "abcyyyzzz") + (assert-equal (utf8->string '#u8(41 42 43 97 98 99 100 101 102) 3 6) + "abc") + + (assert-equal (utf16->string '#u8(254 255 0 97 0 98 0 99)) + "abc") + (assert-equal (utf16->string '#u8(255 254 97 0 98 0 99 0)) + "abc") + + (assert-equal (utf16->string (string->utf16 "abc") 2) + "abc") + (assert-equal (utf16->string (string->utf16 "abcdef") 4) + "bcdef") + (assert-equal (utf16->string (string->utf16 "abcdef") 4 10) + "bcd") + + (assert-equal (utf16be->string '#u8(0 97 0 98 0 99)) + "abc") + (assert-equal (utf16be->string (string->utf16be "abc") 2) + "bc") + (assert-equal (utf16be->string (string->utf16be "abcdef") 2 8) + "bcd") + + (assert-equal (utf16le->string '#u8(97 0 98 0 99 0)) + "abc") + (assert-equal (utf16le->string (string->utf16le "abc") 2) + "bc") + (assert-equal (utf16le->string (string->utf16le "abcdef") 2 8) + "bcd"))) + +;; These tests taken from SRFI 140. +(define-test 'utf-converters-beyond-bmp + (lambda () + (assert-equal (string->utf8 beyond-bmp) + '#u8(97 195 128 206 191 + 240 157 145 129 240 157 132 147 240 157 132 144 122)) + + (if (host-big-endian?) + (assert-equal + (string->utf16 beyond-bmp) + '#u8(254 255 0 97 0 192 3 191 + 216 53 220 65 216 52 221 19 216 52 221 16 0 122)) + (assert-equal + (string->utf16 beyond-bmp) + '#u8(255 254 97 0 192 0 191 3 + 53 216 65 220 52 216 19 221 52 216 16 221 122 0))) + + (assert-equal + (string->utf16be beyond-bmp) + '#u8(0 97 0 192 3 191 216 53 220 65 216 52 221 19 216 52 221 16 0 122)) + + (assert-equal + (string->utf16le beyond-bmp) + '#u8(97 0 192 0 191 3 53 216 65 220 52 216 19 221 52 216 16 221 122 0)) + + (assert-equal + (utf8->string + '#u8(97 195 128 206 191 + 240 157 145 129 240 157 132 147 240 157 132 144 122)) + beyond-bmp) + + (assert-equal (utf16->string (string->utf16 beyond-bmp)) + beyond-bmp) + + (assert-equal (utf16->string (string->utf16 beyond-bmp) 2) + beyond-bmp) + + (assert-equal (utf16be->string (string->utf16be beyond-bmp)) beyond-bmp) + + (assert-equal (utf16le->string (string->utf16le beyond-bmp)) beyond-bmp) + + (assert-equal (utf16be->string '#u8(254 255 0 97 0 98 0 99)) + (string-append (string (integer->char #xfeff)) "abc")) + + (assert-equal (utf16le->string '#u8(255 254 97 0 98 0 99 0)) + (string-append (string (integer->char #xfeff)) "abc")))) + +(define beyond-bmp + (list->string (map integer->char + '(#x61 #xc0 #x3bf + #x1d441 #x1d113 #x1d110 #x7a)))) \ No newline at end of file -- 2.25.1