From: Chris Hanson Date: Wed, 18 Jan 2017 07:47:10 +0000 (-0800) Subject: Implement character encoders for UTF-16 and UTF-32. X-Git-Tag: mit-scheme-pucked-9.2.12~227^2~104 X-Git-Url: https://birchwood-abbey.net/git?a=commitdiff_plain;h=a2c0c487ff3ba0af4db29b12095b1a3b98b42752;p=mit-scheme.git Implement character encoders for UTF-16 and UTF-32. --- diff --git a/src/runtime/char.scm b/src/runtime/char.scm index b6493bf1b..0d37470f3 100644 --- a/src/runtime/char.scm +++ b/src/runtime/char.scm @@ -137,51 +137,77 @@ USA. (define (chars->ascii chars) (map char->ascii chars)) -(define (char-utf8-byte-length char) - (%sv-utf8-byte-length (unicode-char->scalar-value char))) +;;;; UTF-{8,16,32} encoders -(define (char-utf8-bytes char) +(define (char-utf8-byte-length char) (let ((sv (unicode-char->scalar-value char))) - (let ((bytes (make-bytevector (%sv-utf8-byte-length sv)))) - (%sv-utf8-bytes! bytes 0 sv) - bytes))) + (cond ((fix:< sv #x00000080) 1) + ((fix:< sv #x00000800) 2) + ((fix:< sv #x00010000) 3) + (else 4)))) (define (char-utf8-bytes! bytes index char) - (%sv-utf8-bytes! bytes index (unicode-char->scalar-value char))) - -(define (%sv-utf8-byte-length sv) - (cond ((fix:< sv #x00000080) 1) - ((fix:< sv #x00000800) 2) - ((fix:< sv #x00010000) 3) - (else 4))) - -(define (%sv-utf8-bytes! bytes index sv) - - (define-integrable (initial-byte n-bits offset) - (fix:or (fix:and (fix:lsh #xFF (fix:+ n-bits 1)) #xFF) - (fix:lsh sv (fix:- 0 offset)))) - - (define-integrable (trailing-byte offset) - (fix:or #x80 (fix:and (fix:lsh sv (fix:- 0 offset)) #x3F))) - - (cond ((fix:< sv #x00000080) - (bytevector-u8-set! bytes index sv) - (fix:+ index 1)) - ((fix:< sv #x00000800) - (bytevector-u8-set! bytes index (initial-byte 5 6)) - (bytevector-u8-set! bytes (fix:+ index 1) (trailing-byte 0)) - (fix:+ index 2)) - ((fix:< sv #x00010000) - (bytevector-u8-set! bytes index (initial-byte 4 12)) - (bytevector-u8-set! bytes (fix:+ index 1) (trailing-byte 6)) - (bytevector-u8-set! bytes (fix:+ index 2) (trailing-byte 0)) - (fix:+ index 3)) - (else - (bytevector-u8-set! bytes index (initial-byte 3 18)) - (bytevector-u8-set! bytes (fix:+ index 1) (trailing-byte 12)) - (bytevector-u8-set! bytes (fix:+ index 2) (trailing-byte 6)) - (bytevector-u8-set! bytes (fix:+ index 3) (trailing-byte 0)) - (fix:+ index 4)))) + (let ((sv (unicode-char->scalar-value char))) + + (define-integrable (initial-byte n-bits offset) + (fix:or (fix:and (fix:lsh #xFF (fix:+ n-bits 1)) #xFF) + (fix:lsh sv (fix:- 0 offset)))) + + (define-integrable (trailing-byte offset) + (fix:or #x80 (fix:and (fix:lsh sv (fix:- 0 offset)) #x3F))) + + (cond ((fix:< sv #x00000080) + (bytevector-u8-set! bytes index sv) + (fix:+ index 1)) + ((fix:< sv #x00000800) + (bytevector-u8-set! bytes index (initial-byte 5 6)) + (bytevector-u8-set! bytes (fix:+ index 1) (trailing-byte 0)) + (fix:+ index 2)) + ((fix:< sv #x00010000) + (bytevector-u8-set! bytes index (initial-byte 4 12)) + (bytevector-u8-set! bytes (fix:+ index 1) (trailing-byte 6)) + (bytevector-u8-set! bytes (fix:+ index 2) (trailing-byte 0)) + (fix:+ index 3)) + (else + (bytevector-u8-set! bytes index (initial-byte 3 18)) + (bytevector-u8-set! bytes (fix:+ index 1) (trailing-byte 12)) + (bytevector-u8-set! bytes (fix:+ index 2) (trailing-byte 6)) + (bytevector-u8-set! bytes (fix:+ index 3) (trailing-byte 0)) + (fix:+ index 4))))) + +(define (char-utf16-byte-length char) + (if (fix:< (unicode-char->scalar-value char) #x00010000) 2 4)) + +(define (char-utf16be-bytes! bytes index char) + (let ((sv (unicode-char->scalar-value char))) + (cond ((fix:< sv #x10000) + (bytevector-u16be-set! bytes index sv) + (fix:+ index 2)) + (else + (receive (h l) (split-into-utf16-surrogates sv) + (bytevector-u16be-set! bytes index h) + (bytevector-u16be-set! bytes (fix:+ index 1) l)) + (fix:+ index 4))))) + +(define (char-utf16le-bytes! bytes index char) + (let ((sv (unicode-char->scalar-value char))) + (cond ((fix:< sv #x10000) + (bytevector-u16le-set! bytes index sv) + (fix:+ index 2)) + (else + (receive (h l) (split-into-utf16-surrogates sv) + (bytevector-u16le-set! bytes index l) + (bytevector-u16le-set! bytes (fix:+ index 1) h)) + (fix:+ index 4))))) + +(define (char-utf32-byte-length char) + (if (fix:< (unicode-char->scalar-value char) #x00010000) 2 4)) + +(define (char-utf32be-bytes! bytes index char) + (bytevector-u32be-set! bytes index (unicode-char->scalar-value char))) + +(define (char-utf32le-bytes! bytes index char) + (bytevector-u32le-set! bytes index (unicode-char->scalar-value char))) (define (char=? x y) (fix:= (char->integer x) (char->integer y))) diff --git a/src/runtime/runtime.pkg b/src/runtime/runtime.pkg index c813299b4..2c7926eaf 100644 --- a/src/runtime/runtime.pkg +++ b/src/runtime/runtime.pkg @@ -1277,8 +1277,13 @@ USA. char-downcase char-integer-limit char-upcase + char-utf16-byte-length + char-utf16be-bytes! + char-utf16le-bytes! + char-utf32-byte-length + char-utf32be-bytes! + char-utf32le-bytes! char-utf8-byte-length - char-utf8-bytes char-utf8-bytes! char<=? char