From: Chris Hanson Date: Sun, 8 Jan 2017 07:37:29 +0000 (-0800) Subject: Move UTF-8 character encoding into char.scm and share with bytevector. X-Git-Tag: mit-scheme-pucked-9.2.12~227^2~177 X-Git-Url: https://birchwood-abbey.net/git?a=commitdiff_plain;h=805fea6508f2298a54abc13b4f7eeae176848c2a;p=mit-scheme.git Move UTF-8 character encoding into char.scm and share with bytevector. --- diff --git a/src/runtime/bytevector.scm b/src/runtime/bytevector.scm index 1e0247089..281b86247 100644 --- a/src/runtime/bytevector.scm +++ b/src/runtime/bytevector.scm @@ -116,60 +116,18 @@ USA. start)))) (let ((buffer (allocate-bytevector (%count-utf8-bytes string start end)))) (do ((from start (fix:+ from 1)) - (to 0 (fix:+ to (%char->utf8! buffer to (string-ref string from))))) + (to 0 + (fix:+ to + (char-utf8-bytes! buffer to (string-ref string from))))) ((not (fix:< from end)))) buffer))) -(define (%char->utf8! buffer index char) - (let ((cp (char->integer char))) - - (define-integrable (initial-byte n-bits offset) - (fix:or (fix:and (fix:lsh #xFF (fix:+ n-bits 1)) #xFF) - (fix:lsh cp (fix:- 0 offset)))) - - (define-integrable (trailing-byte offset) - (fix:or #x80 (fix:and (fix:lsh cp (fix:- 0 offset)) #x3F))) - - (define-integrable (put-byte! offset byte) - (bytevector-u8-set! buffer (fix:+ index offset) byte)) - - (cond ((fix:< cp #x00000080) - (put-byte! 0 cp) - 1) - ((fix:< cp #x00000800) - (put-byte! 0 (initial-byte 5 6)) - (put-byte! 1 (trailing-byte 0)) - 2) - ((fix:< cp #x00010000) - (if (surrogate? cp) - (error "Code point is a UTF-16 surrogate:" cp)) - (if (non-character? cp) - (error "Code point is a non-character:" cp)) - (put-byte! 0 (initial-byte 4 12)) - (put-byte! 1 (trailing-byte 6)) - (put-byte! 2 (trailing-byte 0)) - 3) - (else - (if (non-character? cp) - (error "Code point is a non-character:" cp)) - (put-byte! 0 (initial-byte 3 18)) - (put-byte! 1 (trailing-byte 12)) - (put-byte! 2 (trailing-byte 6)) - (put-byte! 3 (trailing-byte 0)) - 4)))) - (define (%count-utf8-bytes string start end) (do ((index start (fix:+ index 1)) - (n-bytes 0 (fix:+ n-bytes (char-utf8-bytes (string-ref string index))))) + (n-bytes 0 + (fix:+ n-bytes + (char-utf8-byte-length (string-ref string index))))) ((not (fix:< index end)) n-bytes))) - -(define (char-utf8-bytes char) - (let ((cp (char->integer char))) - (cond ((fix:< cp #x00000080) 1) - ((fix:< cp #x00000800) 2) - ((fix:< cp #x00010000) 3) - ((fix:< cp #x00110000) 4) - (else (error "Not a unicode character:" char))))) (define (utf8->string bytevector #!optional start end) (guarantee bytevector? bytevector 'utf8->string) diff --git a/src/runtime/char.scm b/src/runtime/char.scm index 5d2c5c4de..b6493bf1b 100644 --- a/src/runtime/char.scm +++ b/src/runtime/char.scm @@ -90,6 +90,12 @@ USA. (define-guarantee unicode-scalar-value "a Unicode scalar value") +(define (unicode-char->scalar-value char) + (let ((cp (char->integer char))) + (if (not (legal-code-32? cp)) + (error:not-unicode-char char 'char-utf8-byte-length)) + cp)) + (define-integrable (legal-code-32? pt) (and (fix:< pt char-code-limit) (not (surrogate? pt)) @@ -131,6 +137,52 @@ USA. (define (chars->ascii chars) (map char->ascii chars)) +(define (char-utf8-byte-length char) + (%sv-utf8-byte-length (unicode-char->scalar-value char))) + +(define (char-utf8-bytes char) + (let ((sv (unicode-char->scalar-value char))) + (let ((bytes (make-bytevector (%sv-utf8-byte-length sv)))) + (%sv-utf8-bytes! bytes 0 sv) + bytes))) + +(define (char-utf8-bytes! bytes index char) + (%sv-utf8-bytes! bytes index (unicode-char->scalar-value char))) + +(define (%sv-utf8-byte-length sv) + (cond ((fix:< sv #x00000080) 1) + ((fix:< sv #x00000800) 2) + ((fix:< sv #x00010000) 3) + (else 4))) + +(define (%sv-utf8-bytes! bytes index sv) + + (define-integrable (initial-byte n-bits offset) + (fix:or (fix:and (fix:lsh #xFF (fix:+ n-bits 1)) #xFF) + (fix:lsh sv (fix:- 0 offset)))) + + (define-integrable (trailing-byte offset) + (fix:or #x80 (fix:and (fix:lsh sv (fix:- 0 offset)) #x3F))) + + (cond ((fix:< sv #x00000080) + (bytevector-u8-set! bytes index sv) + (fix:+ index 1)) + ((fix:< sv #x00000800) + (bytevector-u8-set! bytes index (initial-byte 5 6)) + (bytevector-u8-set! bytes (fix:+ index 1) (trailing-byte 0)) + (fix:+ index 2)) + ((fix:< sv #x00010000) + (bytevector-u8-set! bytes index (initial-byte 4 12)) + (bytevector-u8-set! bytes (fix:+ index 1) (trailing-byte 6)) + (bytevector-u8-set! bytes (fix:+ index 2) (trailing-byte 0)) + (fix:+ index 3)) + (else + (bytevector-u8-set! bytes index (initial-byte 3 18)) + (bytevector-u8-set! bytes (fix:+ index 1) (trailing-byte 12)) + (bytevector-u8-set! bytes (fix:+ index 2) (trailing-byte 6)) + (bytevector-u8-set! bytes (fix:+ index 3) (trailing-byte 0)) + (fix:+ index 4)))) + (define (char=? x y) (fix:= (char->integer x) (char->integer y))) diff --git a/src/runtime/runtime.pkg b/src/runtime/runtime.pkg index 83444f75a..dd3d8e855 100644 --- a/src/runtime/runtime.pkg +++ b/src/runtime/runtime.pkg @@ -1233,6 +1233,9 @@ USA. char-downcase char-integer-limit char-upcase + char-utf8-byte-length + char-utf8-bytes + char-utf8-bytes! char<=? charchar radix? set-char-bits + unicode-char->scalar-value unicode-char? unicode-scalar-value?) (export (runtime unicode)