From: Chris Hanson Date: Thu, 4 Oct 2001 16:28:57 +0000 (+0000) Subject: Implement READ-UTF8-CODE-POINT and WRITE-UTF8-CODE-POINT. X-Git-Tag: 20090517-FFI~2525 X-Git-Url: https://birchwood-abbey.net/git?a=commitdiff_plain;h=162e267d31938b053473b034b8025b6d4ad6c37e;p=mit-scheme.git Implement READ-UTF8-CODE-POINT and WRITE-UTF8-CODE-POINT. --- diff --git a/v7/src/star-parser/parser.pkg b/v7/src/star-parser/parser.pkg index d5ad7b780..95adca7d5 100644 --- a/v7/src/star-parser/parser.pkg +++ b/v7/src/star-parser/parser.pkg @@ -1,6 +1,6 @@ ;;; -*-Scheme-*- ;;; -;;; $Id: parser.pkg,v 1.12 2001/10/04 16:27:39 cph Exp $ +;;; $Id: parser.pkg,v 1.13 2001/10/04 16:28:13 cph Exp $ ;;; ;;; Copyright (c) 2001 Massachusetts Institute of Technology ;;; @@ -93,8 +93,10 @@ code-point->utf8-string code-point-in-alphabet? code-points->alphabet + read-utf8-code-point read-utf8-code-point-from-source string->alphabet unicode-code-point? utf8-string->code-point - well-formed-code-points-list?)) \ No newline at end of file + well-formed-code-points-list? + write-utf8-code-point)) \ No newline at end of file diff --git a/v7/src/star-parser/unicode.scm b/v7/src/star-parser/unicode.scm index ed163264f..29de59514 100644 --- a/v7/src/star-parser/unicode.scm +++ b/v7/src/star-parser/unicode.scm @@ -1,6 +1,6 @@ ;;; -*-Scheme-*- ;;; -;;; $Id: unicode.scm,v 1.4 2001/10/04 15:52:39 cph Exp $ +;;; $Id: unicode.scm,v 1.5 2001/10/04 16:28:57 cph Exp $ ;;; ;;; Copyright (c) 2001 Massachusetts Institute of Technology ;;; @@ -347,123 +347,68 @@ (values (vector-head lower n) (vector-head upper n)) (values lower upper)))))) -(define (code-point->utf8-string n) - - (define-integrable (initial-char n-bits offset) - (fix:or (fix:and (fix:lsh #xFF (fix:+ n-bits 1)) #xFF) - (fix:lsh n (fix:- 0 offset)))) - - (define-integrable (subsequent-char offset) - (fix:or #x80 - (fix:and (fix:lsh n (fix:- 0 offset)) #x3F))) +(define (read-utf8-code-point port) + (let ((c0 (read-char port)) + (get-next + (lambda () + (let ((c (read-char port))) + (if (eof-object? c) + (error "EOF while reading UTF-8 code point.")) + (if (not (and (fix:<= #x80 (char->integer c)) + (fix:< (char->integer c) #xC0))) + (error "Illegal subsequent UTF-8 char:" c)) + (fix:and (char->integer c) #x3F))))) + (cond ((eof-object? c0) + c0) + ((fix:< (char->integer c0) #x80) + (char->integer c0)) + ((fix:< (char->integer c0) #xE0) + (fix:or (fix:lsh (fix:and (char->integer c0) #x1F) 6) + (get-next))) + ((fix:< (char->integer c0) #xF0) + (let* ((n1 (get-next)) + (n2 (get-next))) + (fix:or (fix:lsh (fix:and (char->integer c0) #x0F) 12) + (fix:or (fix:lsh n1 6) + n2)))) + ((fix:< (char->integer c0) #xF8) + (let* ((n1 (get-next)) + (n2 (get-next)) + (n3 (get-next))) + (fix:or (fix:lsh (fix:and (char->integer c0) #x07) 18) + (fix:or (fix:lsh n1 12) + (fix:or (fix:lsh n2 6) + n3))))) + ((fix:< (char->integer c0) #xFC) + (let* ((n1 (get-next)) + (n2 (get-next)) + (n3 (get-next)) + (n4 (get-next))) + (+ (* (fix:and (char->integer c0) #x03) #x01000000) + (fix:or (fix:lsh n1 18) + (fix:lsh n2 12)) + (fix:or (fix:lsh n3 6) + n4)))) + ((fix:< (char->integer c0) #xFE) + (let* ((n1 (get-next)) + (n2 (get-next)) + (n3 (get-next)) + (n4 (get-next)) + (n5 (get-next))) + (+ (* (fix:and (char->integer c0) #x01) #x40000000) + (* n1 #x01000000) + (fix:or (fix:lsh n2 18) + (fix:lsh n3 12)) + (fix:or (fix:lsh n4 6) + n5)))) + (else + (error "Illegal initial UTF-8 char:" c))))) - (if (not (unicode-code-point? n)) - (error:wrong-type-argument n "unicode code point" - 'CODE-POINT->UTF8-STRING)) - (cond ((< n #x00000080) - (let ((s (make-string 1))) - (vector-8b-set! s 0 n) - s)) - ((< n #x00000800) - (let ((s (make-string 2))) - (vector-8b-set! s 0 (initial-char 5 6)) - (vector-8b-set! s 1 (subsequent-char 0)) - s)) - ((< n #x00010000) - (let ((s (make-string 3))) - (vector-8b-set! s 0 (initial-char 4 12)) - (vector-8b-set! s 1 (subsequent-char 6)) - (vector-8b-set! s 2 (subsequent-char 0)) - s)) - ((< n #x00200000) - (let ((s (make-string 4))) - (vector-8b-set! s 0 (initial-char 3 18)) - (vector-8b-set! s 1 (subsequent-char 12)) - (vector-8b-set! s 2 (subsequent-char 6)) - (vector-8b-set! s 3 (subsequent-char 0)) - s)) - ((< n #x04000000) - (let ((s (make-string 5))) - (vector-8b-set! s 0 (initial-char 2 24)) - (vector-8b-set! s 1 (subsequent-char 18)) - (vector-8b-set! s 2 (subsequent-char 12)) - (vector-8b-set! s 3 (subsequent-char 6)) - (vector-8b-set! s 4 (subsequent-char 0)) - s)) - (else - (let ((s (make-string 6))) - (vector-8b-set! s 0 (initial-char 1 30)) - (vector-8b-set! s 1 (subsequent-char 24)) - (vector-8b-set! s 2 (subsequent-char 18)) - (vector-8b-set! s 3 (subsequent-char 12)) - (vector-8b-set! s 4 (subsequent-char 6)) - (vector-8b-set! s 5 (subsequent-char 0)) - s)))) - (define (utf8-string->code-point string) - - (define-integrable (test2 index) - (and (fix:<= #x80 (vector-8b-ref string index)) - (fix:< (vector-8b-ref string index) #xC0))) - - (define-integrable (get2 index) - (fix:and (vector-8b-ref string index) #x3F)) - - (or (cond ((fix:= (string-length string) 0) - #f) - ((fix:< (vector-8b-ref string 0) #x80) - (and (fix:= (string-length string) 1) - (vector-8b-ref string 0))) - ((fix:< (vector-8b-ref string 0) #xE0) - (and (fix:= (string-length string) 2) - (test2 1) - (fix:or (fix:lsh (fix:and (vector-8b-ref string 0) #x1F) 6) - (get2 1)))) - ((fix:< (vector-8b-ref string 0) #xF0) - (and (fix:= (string-length string) 3) - (test2 1) - (test2 2) - (fix:or (fix:lsh (fix:and (vector-8b-ref string 0) #x0F) 12) - (fix:or (fix:lsh (get2 1) 6) - (get2 2))))) - ((fix:< (vector-8b-ref string 0) #xF8) - (and (fix:= (string-length string) 4) - (test2 1) - (test2 2) - (test2 3) - (fix:or (fix:lsh (fix:and (vector-8b-ref string 0) #x07) 18) - (fix:or (fix:lsh (get2 1) 12) - (fix:or (fix:lsh (get2 2) 6) - (get2 3)))))) - ((fix:< (vector-8b-ref string 0) #xFC) - (and (fix:= (string-length string) 5) - (test2 1) - (test2 2) - (test2 3) - (test2 4) - (+ (* (fix:and (vector-8b-ref string 0) #x03) #x01000000) - (fix:or (fix:lsh (get2 1) 18) - (fix:lsh (get2 2) 12)) - (fix:or (fix:lsh (get2 3) 6) - (get2 4))))) - ((fix:< (vector-8b-ref string 0) #xFE) - (and (fix:= (string-length string) 6) - (test2 1) - (test2 2) - (test2 3) - (test2 4) - (test2 5) - (+ (* (fix:and (vector-8b-ref string 0) #x01) #x40000000) - (* (get2 1) #x01000000) - (fix:or (fix:lsh (get2 2) 18) - (fix:lsh (get2 3) 12)) - (fix:or (fix:lsh (get2 4) 6) - (get2 5))))) - (else #f)) - (error:wrong-type-argument string "UTF-8 character" - 'UTF8-STRING->CODE-POINT))) + (read-utf8-code-point (string->input-port string))) (define (read-utf8-code-point-from-source source) + ;; This is separately implemented to speed up the parser buffer. (let ((c0 (source)) (get-next (lambda () @@ -521,4 +466,52 @@ (fix:or (fix:lsh n4 6) n5))))) (else - #f))))) \ No newline at end of file + #f))))) + +(define (write-utf8-code-point n port) + + (define-integrable (initial-char n-bits offset) + (fix:or (fix:and (fix:lsh #xFF (fix:+ n-bits 1)) #xFF) + (fix:lsh n (fix:- 0 offset)))) + + (define-integrable (subsequent-char offset) + (fix:or #x80 (fix:and (fix:lsh n (fix:- 0 offset)) #x3F))) + + (define-integrable (output-8b n) + (write-char (integer->char n) port)) + + (if (not (unicode-code-point? n)) + (error:wrong-type-argument n "unicode code point" + 'CODE-POINT->UTF8-STRING)) + (cond ((< n #x00000080) + (output-8b n)) + ((< n #x00000800) + (output-8b (initial-char 5 6)) + (output-8b (subsequent-char 0))) + ((< n #x00010000) + (output-8b (initial-char 4 12)) + (output-8b (subsequent-char 6)) + (output-8b (subsequent-char 0))) + ((< n #x00200000) + (output-8b (initial-char 3 18)) + (output-8b (subsequent-char 12)) + (output-8b (subsequent-char 6)) + (output-8b (subsequent-char 0))) + ((< n #x04000000) + (output-8b (initial-char 2 24)) + (output-8b (subsequent-char 18)) + (output-8b (subsequent-char 12)) + (output-8b (subsequent-char 6)) + (output-8b (subsequent-char 0))) + (else + (output-8b (initial-char 1 30)) + (output-8b (subsequent-char 24)) + (output-8b (subsequent-char 18)) + (output-8b (subsequent-char 12)) + (output-8b (subsequent-char 6)) + (output-8b (subsequent-char 0))))) + +(define (code-point->utf8-string n) + (with-string-output-port + (lambda (port) + (write-utf8-code-point n port))))