start))))
(let ((buffer (allocate-bytevector (%count-utf8-bytes string start end))))
(do ((from start (fix:+ from 1))
- (to 0 (fix:+ to (%char->utf8! buffer to (string-ref string from)))))
+ (to 0
+ (fix:+ to
+ (char-utf8-bytes! buffer to (string-ref string from)))))
((not (fix:< from end))))
buffer)))
-(define (%char->utf8! buffer index char)
- (let ((cp (char->integer char)))
-
- (define-integrable (initial-byte n-bits offset)
- (fix:or (fix:and (fix:lsh #xFF (fix:+ n-bits 1)) #xFF)
- (fix:lsh cp (fix:- 0 offset))))
-
- (define-integrable (trailing-byte offset)
- (fix:or #x80 (fix:and (fix:lsh cp (fix:- 0 offset)) #x3F)))
-
- (define-integrable (put-byte! offset byte)
- (bytevector-u8-set! buffer (fix:+ index offset) byte))
-
- (cond ((fix:< cp #x00000080)
- (put-byte! 0 cp)
- 1)
- ((fix:< cp #x00000800)
- (put-byte! 0 (initial-byte 5 6))
- (put-byte! 1 (trailing-byte 0))
- 2)
- ((fix:< cp #x00010000)
- (if (surrogate? cp)
- (error "Code point is a UTF-16 surrogate:" cp))
- (if (non-character? cp)
- (error "Code point is a non-character:" cp))
- (put-byte! 0 (initial-byte 4 12))
- (put-byte! 1 (trailing-byte 6))
- (put-byte! 2 (trailing-byte 0))
- 3)
- (else
- (if (non-character? cp)
- (error "Code point is a non-character:" cp))
- (put-byte! 0 (initial-byte 3 18))
- (put-byte! 1 (trailing-byte 12))
- (put-byte! 2 (trailing-byte 6))
- (put-byte! 3 (trailing-byte 0))
- 4))))
-
(define (%count-utf8-bytes string start end)
(do ((index start (fix:+ index 1))
- (n-bytes 0 (fix:+ n-bytes (char-utf8-bytes (string-ref string index)))))
+ (n-bytes 0
+ (fix:+ n-bytes
+ (char-utf8-byte-length (string-ref string index)))))
((not (fix:< index end)) n-bytes)))
-
-(define (char-utf8-bytes char)
- (let ((cp (char->integer char)))
- (cond ((fix:< cp #x00000080) 1)
- ((fix:< cp #x00000800) 2)
- ((fix:< cp #x00010000) 3)
- ((fix:< cp #x00110000) 4)
- (else (error "Not a unicode character:" char)))))
\f
(define (utf8->string bytevector #!optional start end)
(guarantee bytevector? bytevector 'utf8->string)
(define-guarantee unicode-scalar-value "a Unicode scalar value")
+(define (unicode-char->scalar-value char)
+ (let ((cp (char->integer char)))
+ (if (not (legal-code-32? cp))
+ (error:not-unicode-char char 'char-utf8-byte-length))
+ cp))
+
(define-integrable (legal-code-32? pt)
(and (fix:< pt char-code-limit)
(not (surrogate? pt))
(define (chars->ascii chars)
(map char->ascii chars))
\f
+(define (char-utf8-byte-length char)
+ (%sv-utf8-byte-length (unicode-char->scalar-value char)))
+
+(define (char-utf8-bytes char)
+ (let ((sv (unicode-char->scalar-value char)))
+ (let ((bytes (make-bytevector (%sv-utf8-byte-length sv))))
+ (%sv-utf8-bytes! bytes 0 sv)
+ bytes)))
+
+(define (char-utf8-bytes! bytes index char)
+ (%sv-utf8-bytes! bytes index (unicode-char->scalar-value char)))
+
+(define (%sv-utf8-byte-length sv)
+ (cond ((fix:< sv #x00000080) 1)
+ ((fix:< sv #x00000800) 2)
+ ((fix:< sv #x00010000) 3)
+ (else 4)))
+
+(define (%sv-utf8-bytes! bytes index sv)
+
+ (define-integrable (initial-byte n-bits offset)
+ (fix:or (fix:and (fix:lsh #xFF (fix:+ n-bits 1)) #xFF)
+ (fix:lsh sv (fix:- 0 offset))))
+
+ (define-integrable (trailing-byte offset)
+ (fix:or #x80 (fix:and (fix:lsh sv (fix:- 0 offset)) #x3F)))
+
+ (cond ((fix:< sv #x00000080)
+ (bytevector-u8-set! bytes index sv)
+ (fix:+ index 1))
+ ((fix:< sv #x00000800)
+ (bytevector-u8-set! bytes index (initial-byte 5 6))
+ (bytevector-u8-set! bytes (fix:+ index 1) (trailing-byte 0))
+ (fix:+ index 2))
+ ((fix:< sv #x00010000)
+ (bytevector-u8-set! bytes index (initial-byte 4 12))
+ (bytevector-u8-set! bytes (fix:+ index 1) (trailing-byte 6))
+ (bytevector-u8-set! bytes (fix:+ index 2) (trailing-byte 0))
+ (fix:+ index 3))
+ (else
+ (bytevector-u8-set! bytes index (initial-byte 3 18))
+ (bytevector-u8-set! bytes (fix:+ index 1) (trailing-byte 12))
+ (bytevector-u8-set! bytes (fix:+ index 2) (trailing-byte 6))
+ (bytevector-u8-set! bytes (fix:+ index 3) (trailing-byte 0))
+ (fix:+ index 4))))
+\f
(define (char=? x y)
(fix:= (char->integer x) (char->integer y)))