;;; -*-Scheme-*-
;;;
-;;; $Id: unicode.scm,v 1.4 2001/10/04 15:52:39 cph Exp $
+;;; $Id: unicode.scm,v 1.5 2001/10/04 16:28:57 cph Exp $
;;;
;;; Copyright (c) 2001 Massachusetts Institute of Technology
;;;
(values (vector-head lower n) (vector-head upper n))
(values lower upper))))))
\f
-(define (code-point->utf8-string n)
-
- (define-integrable (initial-char n-bits offset)
- (fix:or (fix:and (fix:lsh #xFF (fix:+ n-bits 1)) #xFF)
- (fix:lsh n (fix:- 0 offset))))
-
- (define-integrable (subsequent-char offset)
- (fix:or #x80
- (fix:and (fix:lsh n (fix:- 0 offset)) #x3F)))
+(define (read-utf8-code-point port)
+ (let ((c0 (read-char port))
+ (get-next
+ (lambda ()
+ (let ((c (read-char port)))
+ (if (eof-object? c)
+ (error "EOF while reading UTF-8 code point."))
+ (if (not (and (fix:<= #x80 (char->integer c))
+ (fix:< (char->integer c) #xC0)))
+ (error "Illegal subsequent UTF-8 char:" c))
+ (fix:and (char->integer c) #x3F)))))
+ (cond ((eof-object? c0)
+ c0)
+ ((fix:< (char->integer c0) #x80)
+ (char->integer c0))
+ ((fix:< (char->integer c0) #xE0)
+ (fix:or (fix:lsh (fix:and (char->integer c0) #x1F) 6)
+ (get-next)))
+ ((fix:< (char->integer c0) #xF0)
+ (let* ((n1 (get-next))
+ (n2 (get-next)))
+ (fix:or (fix:lsh (fix:and (char->integer c0) #x0F) 12)
+ (fix:or (fix:lsh n1 6)
+ n2))))
+ ((fix:< (char->integer c0) #xF8)
+ (let* ((n1 (get-next))
+ (n2 (get-next))
+ (n3 (get-next)))
+ (fix:or (fix:lsh (fix:and (char->integer c0) #x07) 18)
+ (fix:or (fix:lsh n1 12)
+ (fix:or (fix:lsh n2 6)
+ n3)))))
+ ((fix:< (char->integer c0) #xFC)
+ (let* ((n1 (get-next))
+ (n2 (get-next))
+ (n3 (get-next))
+ (n4 (get-next)))
+ (+ (* (fix:and (char->integer c0) #x03) #x01000000)
+ (fix:or (fix:lsh n1 18)
+ (fix:lsh n2 12))
+ (fix:or (fix:lsh n3 6)
+ n4))))
+ ((fix:< (char->integer c0) #xFE)
+ (let* ((n1 (get-next))
+ (n2 (get-next))
+ (n3 (get-next))
+ (n4 (get-next))
+ (n5 (get-next)))
+ (+ (* (fix:and (char->integer c0) #x01) #x40000000)
+ (* n1 #x01000000)
+ (fix:or (fix:lsh n2 18)
+ (fix:lsh n3 12))
+ (fix:or (fix:lsh n4 6)
+ n5))))
+ (else
+ (error "Illegal initial UTF-8 char:" c)))))
- (if (not (unicode-code-point? n))
- (error:wrong-type-argument n "unicode code point"
- 'CODE-POINT->UTF8-STRING))
- (cond ((< n #x00000080)
- (let ((s (make-string 1)))
- (vector-8b-set! s 0 n)
- s))
- ((< n #x00000800)
- (let ((s (make-string 2)))
- (vector-8b-set! s 0 (initial-char 5 6))
- (vector-8b-set! s 1 (subsequent-char 0))
- s))
- ((< n #x00010000)
- (let ((s (make-string 3)))
- (vector-8b-set! s 0 (initial-char 4 12))
- (vector-8b-set! s 1 (subsequent-char 6))
- (vector-8b-set! s 2 (subsequent-char 0))
- s))
- ((< n #x00200000)
- (let ((s (make-string 4)))
- (vector-8b-set! s 0 (initial-char 3 18))
- (vector-8b-set! s 1 (subsequent-char 12))
- (vector-8b-set! s 2 (subsequent-char 6))
- (vector-8b-set! s 3 (subsequent-char 0))
- s))
- ((< n #x04000000)
- (let ((s (make-string 5)))
- (vector-8b-set! s 0 (initial-char 2 24))
- (vector-8b-set! s 1 (subsequent-char 18))
- (vector-8b-set! s 2 (subsequent-char 12))
- (vector-8b-set! s 3 (subsequent-char 6))
- (vector-8b-set! s 4 (subsequent-char 0))
- s))
- (else
- (let ((s (make-string 6)))
- (vector-8b-set! s 0 (initial-char 1 30))
- (vector-8b-set! s 1 (subsequent-char 24))
- (vector-8b-set! s 2 (subsequent-char 18))
- (vector-8b-set! s 3 (subsequent-char 12))
- (vector-8b-set! s 4 (subsequent-char 6))
- (vector-8b-set! s 5 (subsequent-char 0))
- s))))
-\f
(define (utf8-string->code-point string)
-
- (define-integrable (test2 index)
- (and (fix:<= #x80 (vector-8b-ref string index))
- (fix:< (vector-8b-ref string index) #xC0)))
-
- (define-integrable (get2 index)
- (fix:and (vector-8b-ref string index) #x3F))
-
- (or (cond ((fix:= (string-length string) 0)
- #f)
- ((fix:< (vector-8b-ref string 0) #x80)
- (and (fix:= (string-length string) 1)
- (vector-8b-ref string 0)))
- ((fix:< (vector-8b-ref string 0) #xE0)
- (and (fix:= (string-length string) 2)
- (test2 1)
- (fix:or (fix:lsh (fix:and (vector-8b-ref string 0) #x1F) 6)
- (get2 1))))
- ((fix:< (vector-8b-ref string 0) #xF0)
- (and (fix:= (string-length string) 3)
- (test2 1)
- (test2 2)
- (fix:or (fix:lsh (fix:and (vector-8b-ref string 0) #x0F) 12)
- (fix:or (fix:lsh (get2 1) 6)
- (get2 2)))))
- ((fix:< (vector-8b-ref string 0) #xF8)
- (and (fix:= (string-length string) 4)
- (test2 1)
- (test2 2)
- (test2 3)
- (fix:or (fix:lsh (fix:and (vector-8b-ref string 0) #x07) 18)
- (fix:or (fix:lsh (get2 1) 12)
- (fix:or (fix:lsh (get2 2) 6)
- (get2 3))))))
- ((fix:< (vector-8b-ref string 0) #xFC)
- (and (fix:= (string-length string) 5)
- (test2 1)
- (test2 2)
- (test2 3)
- (test2 4)
- (+ (* (fix:and (vector-8b-ref string 0) #x03) #x01000000)
- (fix:or (fix:lsh (get2 1) 18)
- (fix:lsh (get2 2) 12))
- (fix:or (fix:lsh (get2 3) 6)
- (get2 4)))))
- ((fix:< (vector-8b-ref string 0) #xFE)
- (and (fix:= (string-length string) 6)
- (test2 1)
- (test2 2)
- (test2 3)
- (test2 4)
- (test2 5)
- (+ (* (fix:and (vector-8b-ref string 0) #x01) #x40000000)
- (* (get2 1) #x01000000)
- (fix:or (fix:lsh (get2 2) 18)
- (fix:lsh (get2 3) 12))
- (fix:or (fix:lsh (get2 4) 6)
- (get2 5)))))
- (else #f))
- (error:wrong-type-argument string "UTF-8 character"
- 'UTF8-STRING->CODE-POINT)))
+ (read-utf8-code-point (string->input-port string)))
\f
(define (read-utf8-code-point-from-source source)
+ ;; This is separately implemented to speed up the parser buffer.
(let ((c0 (source))
(get-next
(lambda ()
(fix:or (fix:lsh n4 6)
n5)))))
(else
- #f)))))
\ No newline at end of file
+ #f)))))
+\f
+(define (write-utf8-code-point n port)
+
+ (define-integrable (initial-char n-bits offset)
+ (fix:or (fix:and (fix:lsh #xFF (fix:+ n-bits 1)) #xFF)
+ (fix:lsh n (fix:- 0 offset))))
+
+ (define-integrable (subsequent-char offset)
+ (fix:or #x80 (fix:and (fix:lsh n (fix:- 0 offset)) #x3F)))
+
+ (define-integrable (output-8b n)
+ (write-char (integer->char n) port))
+
+ (if (not (unicode-code-point? n))
+ (error:wrong-type-argument n "unicode code point"
+ 'CODE-POINT->UTF8-STRING))
+ (cond ((< n #x00000080)
+ (output-8b n))
+ ((< n #x00000800)
+ (output-8b (initial-char 5 6))
+ (output-8b (subsequent-char 0)))
+ ((< n #x00010000)
+ (output-8b (initial-char 4 12))
+ (output-8b (subsequent-char 6))
+ (output-8b (subsequent-char 0)))
+ ((< n #x00200000)
+ (output-8b (initial-char 3 18))
+ (output-8b (subsequent-char 12))
+ (output-8b (subsequent-char 6))
+ (output-8b (subsequent-char 0)))
+ ((< n #x04000000)
+ (output-8b (initial-char 2 24))
+ (output-8b (subsequent-char 18))
+ (output-8b (subsequent-char 12))
+ (output-8b (subsequent-char 6))
+ (output-8b (subsequent-char 0)))
+ (else
+ (output-8b (initial-char 1 30))
+ (output-8b (subsequent-char 24))
+ (output-8b (subsequent-char 18))
+ (output-8b (subsequent-char 12))
+ (output-8b (subsequent-char 6))
+ (output-8b (subsequent-char 0)))))
+
+(define (code-point->utf8-string n)
+ (with-string-output-port
+ (lambda (port)
+ (write-utf8-code-point n port))))