(bytevector-u8-set! bytevector (fix:+ index 3) (u32le-byte3 u32)))
\f
(define-integrable (string-encoder char-byte-length allocator encode-char!
- caller)
+ bom? caller)
(lambda (string #!optional start end)
(let* ((end (fix:end-index end (string-length string) caller))
(start (fix:start-index start end caller)))
(let ((bytes
(allocator
- (let loop ((index start) (n-bytes 0))
+ (let loop ((index start)
+ (n-bytes (if bom? (char-byte-length #\bom) 0)))
(if (fix:< index end)
(loop (fix:+ index 1)
(fix:+ n-bytes
(char-byte-length (string-ref string index))))
n-bytes)))))
- (let loop ((from start) (to 0))
+ (let loop ((from start)
+ (to (if bom? (encode-char! bytes 0 #\bom) 0)))
(if (fix:< from end)
(loop (fix:+ from 1)
(encode-char! bytes to (string-ref string from)))))
(define string->utf8)
(define string->utf16be)
(define string->utf16le)
+(define string->utf16be+bom)
+(define string->utf16le+bom)
+(define string->utf16)
(define string->utf32be)
(define string->utf32le)
+(define string->utf32be+bom)
+(define string->utf32le+bom)
+(define string->utf32)
(add-boot-init!
(lambda ()
(set! string->utf8
(string-encoder char-utf8-byte-length utf8-allocator
- encode-utf8-char! 'string->utf8))
+ encode-utf8-char! #f 'string->utf8))
(set! string->utf16be
(string-encoder char-utf16-byte-length allocate-bytevector
- encode-utf16be-char! 'string->utf16be))
+ encode-utf16be-char! #f 'string->utf16be))
(set! string->utf16le
(string-encoder char-utf16-byte-length allocate-bytevector
- encode-utf16le-char! 'string->utf16le))
+ encode-utf16le-char! #f 'string->utf16le))
+ (set! string->utf16be+bom
+ (string-encoder char-utf16-byte-length allocate-bytevector
+ encode-utf16be-char! #t 'string->utf16))
+ (set! string->utf16le+bom
+ (string-encoder char-utf16-byte-length allocate-bytevector
+ encode-utf16le-char! #t 'string->utf16))
+ (set! string->utf16
+ (if (host-big-endian?) string->utf16be+bom string->utf16le+bom))
(set! string->utf32be
(string-encoder char-utf32-byte-length allocate-bytevector
- encode-utf32be-char! 'string->utf32be))
+ encode-utf32be-char! #f 'string->utf32be))
(set! string->utf32le
(string-encoder char-utf32-byte-length allocate-bytevector
- encode-utf32le-char! 'string->utf32le))
+ encode-utf32le-char! #f 'string->utf32le))
+ (set! string->utf32be+bom
+ (string-encoder char-utf32-byte-length allocate-bytevector
+ encode-utf32be-char! #t 'string->utf32))
+ (set! string->utf32le+bom
+ (string-encoder char-utf32-byte-length allocate-bytevector
+ encode-utf32le-char! #t 'string->utf32))
+ (set! string->utf32
+ (if (host-big-endian?) string->utf32be+bom string->utf32le+bom))
unspecific))
\f
(define-integrable (bytes-decoder getter initial->length decode-char step noun
decode-utf8-char 1 "UTF-8" 'utf8->string))
(set! utf16be->string
(bytes-decoder bytevector-u16be-ref initial-u16->utf16-char-length
- decode-utf16be-char 1 "UTF-16BE" 'utf16be->string))
+ decode-utf16be-char 2 "UTF-16BE" 'utf16be->string))
(set! utf16le->string
(bytes-decoder bytevector-u16le-ref initial-u16->utf16-char-length
- decode-utf16le-char 1 "UTF-16LE" 'utf16le->string))
+ decode-utf16le-char 2 "UTF-16LE" 'utf16le->string))
(set! utf32be->string
(bytes-decoder bytevector-u32be-ref initial-u32->utf32-char-length
- decode-utf32be-char 1 "UTF-32BE" 'utf32be->string))
+ decode-utf32be-char 4 "UTF-32BE" 'utf32be->string))
(set! utf32le->string
(bytes-decoder bytevector-u32le-ref initial-u32->utf32-char-length
- decode-utf32le-char 1 "UTF-32LE" 'utf32le->string))
+ decode-utf32le-char 4 "UTF-32LE" 'utf32le->string))
unspecific))
+\f
+(define (utf16->string bytevector #!optional start end replace?)
+ (let* ((end (fix:end-index end (bytevector-length bytevector) 'utf16->string))
+ (start (fix:start-index start end 'utf16->string)))
+
+ (define (default)
+ (if (host-big-endian?)
+ (utf16be->string bytevector start end replace?)
+ (utf16le->string bytevector start end replace?)))
+
+ (if (fix:<= (fix:+ start 2) end)
+ (let ((b0 (bytevector-u8-ref bytevector start))
+ (b1 (bytevector-u8-ref bytevector (fix:+ start 1))))
+ (cond ((and (fix:= b0 #xFE) (fix:= b1 #xFF))
+ (utf16be->string bytevector (fix:+ start 2) end replace?))
+ ((and (fix:= b0 #xFF) (fix:= b1 #xFE))
+ (utf16le->string bytevector (fix:+ start 2) end replace?))
+ (else
+ (default))))
+ (default))))
+
+(define (utf32->string bytevector #!optional start end replace?)
+ (let* ((end (fix:end-index end (bytevector-length bytevector) 'utf32->string))
+ (start (fix:start-index start end 'utf32->string)))
+
+ (define (default)
+ (if (host-big-endian?)
+ (utf32be->string bytevector start end replace?)
+ (utf32le->string bytevector start end replace?)))
+
+ (if (fix:<= (fix:+ start 4) end)
+ (let ((b0 (bytevector-u8-ref bytevector start))
+ (b1 (bytevector-u8-ref bytevector (fix:+ start 1)))
+ (b2 (bytevector-u8-ref bytevector (fix:+ start 2)))
+ (b3 (bytevector-u8-ref bytevector (fix:+ start 3))))
+ (cond ((and (fix:= b0 0) (fix:= b1 0) (fix:= b2 #xFE) (fix:= b3 #xFF))
+ (utf32be->string bytevector (fix:+ start 4) end replace?))
+ ((and (fix:= b0 #xFF) (fix:= b1 #xFE) (fix:= b2 0) (fix:= b3 0))
+ (utf32le->string bytevector (fix:+ start 4) end replace?))
+ (else
+ (default))))
+ (default))))
(define (string->iso8859-1 string #!optional start end)
(let* ((end (fix:end-index end (string-length string) 'string->iso8859-1))
(map (lambda (i)
(declare (ignore i))
(random #x100))
- (iota (random (+ max-length 1)))))
\ No newline at end of file
+ (iota (random (+ max-length 1)))))
+\f
+;; These tests taken from SRFI 140.
+(define-test 'utf-converters
+ (lambda ()
+ (assert-equal (string->utf8 "abc")
+ '#u8(97 98 99))
+ (assert-equal (string->utf8 "xxxabcyyyzzz" 3)
+ '#u8(97 98 99 121 121 121 122 122 122))
+ (assert-equal (string->utf8 "xxxabcyyyzzz" 3 6)
+ '#u8(97 98 99))
+
+ (assert-equal (cond-expand (big-endian '#u8(254 255 0 97 0 98 0 99))
+ (else '#u8(255 254 97 0 98 0 99 0)))
+ (string->utf16 "abc"))
+ (assert-equal
+ (cond-expand
+ (big-endian
+ '#u8(254 255 0 97 0 98 0 99 0 121 0 121 0 121 0 122 0 122 0 122))
+ (else
+ '#u8(255 254 97 0 98 0 99 0 121 0 121 0 121 0 122 0 122 0 122 0)))
+ (string->utf16 "xxxabcyyyzzz" 3))
+ (assert-equal (cond-expand (big-endian '#u8(254 255 0 97 0 98 0 99))
+ (else '#u8(255 254 97 0 98 0 99 0)))
+ (string->utf16 "xxxabcyyyzzz" 3 6))
+
+ (assert-equal (string->utf16be "abc")
+ '#u8(0 97 0 98 0 99))
+ (assert-equal (string->utf16be "xxxabcyyyzzz" 3)
+ '#u8(0 97 0 98 0 99 0 121 0 121 0 121 0 122 0 122 0 122))
+ (assert-equal (string->utf16be "xxxabcyyyzzz" 3 6)
+ '#u8(0 97 0 98 0 99))
+
+ (assert-equal (string->utf16le "abc")
+ '#u8(97 0 98 0 99 0))
+ (assert-equal (string->utf16le "xxxabcyyyzzz" 3)
+ '#u8(97 0 98 0 99 0 121 0 121 0 121 0 122 0 122 0 122 0))
+ (assert-equal (string->utf16le "xxxabcyyyzzz" 3 6)
+ '#u8(97 0 98 0 99 0))
+
+ (assert-equal (utf8->string '#u8(97 98 99))
+ "abc")
+ (assert-equal (utf8->string '#u8(0 1 2 97 98 99 121 121 121 122 122 122) 3)
+ "abcyyyzzz")
+ (assert-equal (utf8->string '#u8(41 42 43 97 98 99 100 101 102) 3 6)
+ "abc")
+
+ (assert-equal (utf16->string '#u8(254 255 0 97 0 98 0 99))
+ "abc")
+ (assert-equal (utf16->string '#u8(255 254 97 0 98 0 99 0))
+ "abc")
+
+ (assert-equal (utf16->string (string->utf16 "abc") 2)
+ "abc")
+ (assert-equal (utf16->string (string->utf16 "abcdef") 4)
+ "bcdef")
+ (assert-equal (utf16->string (string->utf16 "abcdef") 4 10)
+ "bcd")
+
+ (assert-equal (utf16be->string '#u8(0 97 0 98 0 99))
+ "abc")
+ (assert-equal (utf16be->string (string->utf16be "abc") 2)
+ "bc")
+ (assert-equal (utf16be->string (string->utf16be "abcdef") 2 8)
+ "bcd")
+
+ (assert-equal (utf16le->string '#u8(97 0 98 0 99 0))
+ "abc")
+ (assert-equal (utf16le->string (string->utf16le "abc") 2)
+ "bc")
+ (assert-equal (utf16le->string (string->utf16le "abcdef") 2 8)
+ "bcd")))
+\f
+;; These tests taken from SRFI 140.
+(define-test 'utf-converters-beyond-bmp
+ (lambda ()
+ (assert-equal (string->utf8 beyond-bmp)
+ '#u8(97 195 128 206 191
+ 240 157 145 129 240 157 132 147 240 157 132 144 122))
+
+ (if (host-big-endian?)
+ (assert-equal
+ (string->utf16 beyond-bmp)
+ '#u8(254 255 0 97 0 192 3 191
+ 216 53 220 65 216 52 221 19 216 52 221 16 0 122))
+ (assert-equal
+ (string->utf16 beyond-bmp)
+ '#u8(255 254 97 0 192 0 191 3
+ 53 216 65 220 52 216 19 221 52 216 16 221 122 0)))
+
+ (assert-equal
+ (string->utf16be beyond-bmp)
+ '#u8(0 97 0 192 3 191 216 53 220 65 216 52 221 19 216 52 221 16 0 122))
+
+ (assert-equal
+ (string->utf16le beyond-bmp)
+ '#u8(97 0 192 0 191 3 53 216 65 220 52 216 19 221 52 216 16 221 122 0))
+
+ (assert-equal
+ (utf8->string
+ '#u8(97 195 128 206 191
+ 240 157 145 129 240 157 132 147 240 157 132 144 122))
+ beyond-bmp)
+
+ (assert-equal (utf16->string (string->utf16 beyond-bmp))
+ beyond-bmp)
+
+ (assert-equal (utf16->string (string->utf16 beyond-bmp) 2)
+ beyond-bmp)
+
+ (assert-equal (utf16be->string (string->utf16be beyond-bmp)) beyond-bmp)
+
+ (assert-equal (utf16le->string (string->utf16le beyond-bmp)) beyond-bmp)
+
+ (assert-equal (utf16be->string '#u8(254 255 0 97 0 98 0 99))
+ (string-append (string (integer->char #xfeff)) "abc"))
+
+ (assert-equal (utf16le->string '#u8(255 254 97 0 98 0 99 0))
+ (string-append (string (integer->char #xfeff)) "abc"))))
+
+(define beyond-bmp
+ (list->string (map integer->char
+ '(#x61 #xc0 #x3bf
+ #x1d441 #x1d113 #x1d110 #x7a))))
\ No newline at end of file