From: Chris Hanson <org/chris-hanson/cph>
Date: Thu, 4 Oct 2001 16:28:57 +0000 (+0000)
Subject: Implement READ-UTF8-CODE-POINT and WRITE-UTF8-CODE-POINT.
X-Git-Tag: 20090517-FFI~2525
X-Git-Url: https://birchwood-abbey.net/git?a=commitdiff_plain;h=162e267d31938b053473b034b8025b6d4ad6c37e;p=mit-scheme.git

Implement READ-UTF8-CODE-POINT and WRITE-UTF8-CODE-POINT.
---

diff --git a/v7/src/star-parser/parser.pkg b/v7/src/star-parser/parser.pkg
index d5ad7b780..95adca7d5 100644
--- a/v7/src/star-parser/parser.pkg
+++ b/v7/src/star-parser/parser.pkg
@@ -1,6 +1,6 @@
 ;;; -*-Scheme-*-
 ;;;
-;;; $Id: parser.pkg,v 1.12 2001/10/04 16:27:39 cph Exp $
+;;; $Id: parser.pkg,v 1.13 2001/10/04 16:28:13 cph Exp $
 ;;;
 ;;; Copyright (c) 2001 Massachusetts Institute of Technology
 ;;;
@@ -93,8 +93,10 @@
 	  code-point->utf8-string
 	  code-point-in-alphabet?
 	  code-points->alphabet
+	  read-utf8-code-point
 	  read-utf8-code-point-from-source
 	  string->alphabet
 	  unicode-code-point?
 	  utf8-string->code-point
-	  well-formed-code-points-list?))
\ No newline at end of file
+	  well-formed-code-points-list?
+	  write-utf8-code-point))
\ No newline at end of file
diff --git a/v7/src/star-parser/unicode.scm b/v7/src/star-parser/unicode.scm
index ed163264f..29de59514 100644
--- a/v7/src/star-parser/unicode.scm
+++ b/v7/src/star-parser/unicode.scm
@@ -1,6 +1,6 @@
 ;;; -*-Scheme-*-
 ;;;
-;;; $Id: unicode.scm,v 1.4 2001/10/04 15:52:39 cph Exp $
+;;; $Id: unicode.scm,v 1.5 2001/10/04 16:28:57 cph Exp $
 ;;;
 ;;; Copyright (c) 2001 Massachusetts Institute of Technology
 ;;;
@@ -347,123 +347,68 @@
 	    (values (vector-head lower n) (vector-head upper n))
 	    (values lower upper))))))
 
-(define (code-point->utf8-string n)
-
-  (define-integrable (initial-char n-bits offset)
-    (fix:or (fix:and (fix:lsh #xFF (fix:+ n-bits 1)) #xFF)
-	    (fix:lsh n (fix:- 0 offset))))
-
-  (define-integrable (subsequent-char offset)
-    (fix:or #x80
-	    (fix:and (fix:lsh n (fix:- 0 offset)) #x3F)))
+(define (read-utf8-code-point port)
+  (let ((c0 (read-char port))
+	(get-next
+	 (lambda ()
+	   (let ((c (read-char port)))
+	     (if (eof-object? c)
+		 (error "EOF while reading UTF-8 code point."))
+	     (if (not (and (fix:<= #x80 (char->integer c))
+			   (fix:< (char->integer c) #xC0)))
+		 (error "Illegal subsequent UTF-8 char:" c))
+	     (fix:and (char->integer c) #x3F)))))
+    (cond ((eof-object? c0)
+	   c0)
+	  ((fix:< (char->integer c0) #x80)
+	   (char->integer c0))
+	  ((fix:< (char->integer c0) #xE0)
+	   (fix:or (fix:lsh (fix:and (char->integer c0) #x1F) 6)
+		   (get-next)))
+	  ((fix:< (char->integer c0) #xF0)
+	   (let* ((n1 (get-next))
+		  (n2 (get-next)))
+	     (fix:or (fix:lsh (fix:and (char->integer c0) #x0F) 12)
+		     (fix:or (fix:lsh n1 6)
+			     n2))))
+	  ((fix:< (char->integer c0) #xF8)
+	   (let* ((n1 (get-next))
+		  (n2 (get-next))
+		  (n3 (get-next)))
+	     (fix:or (fix:lsh (fix:and (char->integer c0) #x07) 18)
+		     (fix:or (fix:lsh n1 12)
+			     (fix:or (fix:lsh n2 6)
+				     n3)))))
+	  ((fix:< (char->integer c0) #xFC)
+	   (let* ((n1 (get-next))
+		  (n2 (get-next))
+		  (n3 (get-next))
+		  (n4 (get-next)))
+	     (+ (* (fix:and (char->integer c0) #x03) #x01000000)
+		(fix:or (fix:lsh n1 18)
+			(fix:lsh n2 12))
+		(fix:or (fix:lsh n3 6)
+			n4))))
+	  ((fix:< (char->integer c0) #xFE)
+	   (let* ((n1 (get-next))
+		  (n2 (get-next))
+		  (n3 (get-next))
+		  (n4 (get-next))
+		  (n5 (get-next)))
+	     (+ (* (fix:and (char->integer c0) #x01) #x40000000)
+		(* n1 #x01000000)
+		(fix:or (fix:lsh n2 18)
+			(fix:lsh n3 12))
+		(fix:or (fix:lsh n4 6)
+			n5))))
+	  (else
+	   (error "Illegal initial UTF-8 char:" c)))))
 
-  (if (not (unicode-code-point? n))
-      (error:wrong-type-argument n "unicode code point"
-				 'CODE-POINT->UTF8-STRING))
-  (cond ((< n #x00000080)
-	 (let ((s (make-string 1)))
-	   (vector-8b-set! s 0 n)
-	   s))
-	((< n #x00000800)
-	 (let ((s (make-string 2)))
-	   (vector-8b-set! s 0 (initial-char 5 6))
-	   (vector-8b-set! s 1 (subsequent-char 0))
-	   s))
-	((< n #x00010000)
-	 (let ((s (make-string 3)))
-	   (vector-8b-set! s 0 (initial-char 4 12))
-	   (vector-8b-set! s 1 (subsequent-char 6))
-	   (vector-8b-set! s 2 (subsequent-char 0))
-	   s))
-	((< n #x00200000)
-	 (let ((s (make-string 4)))
-	   (vector-8b-set! s 0 (initial-char 3 18))
-	   (vector-8b-set! s 1 (subsequent-char 12))
-	   (vector-8b-set! s 2 (subsequent-char 6))
-	   (vector-8b-set! s 3 (subsequent-char 0))
-	   s))
-	((< n #x04000000)
-	 (let ((s (make-string 5)))
-	   (vector-8b-set! s 0 (initial-char 2 24))
-	   (vector-8b-set! s 1 (subsequent-char 18))
-	   (vector-8b-set! s 2 (subsequent-char 12))
-	   (vector-8b-set! s 3 (subsequent-char 6))
-	   (vector-8b-set! s 4 (subsequent-char 0))
-	   s))
-	(else
-	 (let ((s (make-string 6)))
-	   (vector-8b-set! s 0 (initial-char 1 30))
-	   (vector-8b-set! s 1 (subsequent-char 24))
-	   (vector-8b-set! s 2 (subsequent-char 18))
-	   (vector-8b-set! s 3 (subsequent-char 12))
-	   (vector-8b-set! s 4 (subsequent-char 6))
-	   (vector-8b-set! s 5 (subsequent-char 0))
-	   s))))
-
 (define (utf8-string->code-point string)
-
-  (define-integrable (test2 index)
-    (and (fix:<= #x80 (vector-8b-ref string index))
-	 (fix:< (vector-8b-ref string index) #xC0)))
-
-  (define-integrable (get2 index)
-    (fix:and (vector-8b-ref string index) #x3F))
-
-  (or (cond ((fix:= (string-length string) 0)
-	     #f)
-	    ((fix:< (vector-8b-ref string 0) #x80)
-	     (and (fix:= (string-length string) 1)
-		  (vector-8b-ref string 0)))
-	    ((fix:< (vector-8b-ref string 0) #xE0)
-	     (and (fix:= (string-length string) 2)
-		  (test2 1)
-		  (fix:or (fix:lsh (fix:and (vector-8b-ref string 0) #x1F) 6)
-			  (get2 1))))
-	    ((fix:< (vector-8b-ref string 0) #xF0)
-	     (and (fix:= (string-length string) 3)
-		  (test2 1)
-		  (test2 2)
-		  (fix:or (fix:lsh (fix:and (vector-8b-ref string 0) #x0F) 12)
-			  (fix:or (fix:lsh (get2 1) 6)
-				  (get2 2)))))
-	    ((fix:< (vector-8b-ref string 0) #xF8)
-	     (and (fix:= (string-length string) 4)
-		  (test2 1)
-		  (test2 2)
-		  (test2 3)
-		  (fix:or (fix:lsh (fix:and (vector-8b-ref string 0) #x07) 18)
-			  (fix:or (fix:lsh (get2 1) 12)
-				  (fix:or (fix:lsh (get2 2) 6)
-					  (get2 3))))))
-	    ((fix:< (vector-8b-ref string 0) #xFC)
-	     (and (fix:= (string-length string) 5)
-		  (test2 1)
-		  (test2 2)
-		  (test2 3)
-		  (test2 4)
-		  (+ (* (fix:and (vector-8b-ref string 0) #x03) #x01000000)
-		     (fix:or (fix:lsh (get2 1) 18)
-			     (fix:lsh (get2 2) 12))
-		     (fix:or (fix:lsh (get2 3) 6)
-			     (get2 4)))))
-	    ((fix:< (vector-8b-ref string 0) #xFE)
-	     (and (fix:= (string-length string) 6)
-		  (test2 1)
-		  (test2 2)
-		  (test2 3)
-		  (test2 4)
-		  (test2 5)
-		  (+ (* (fix:and (vector-8b-ref string 0) #x01) #x40000000)
-		     (* (get2 1) #x01000000)
-		     (fix:or (fix:lsh (get2 2) 18)
-			     (fix:lsh (get2 3) 12))
-		     (fix:or (fix:lsh (get2 4) 6)
-			     (get2 5)))))
-	    (else #f))
-      (error:wrong-type-argument string "UTF-8 character"
-				 'UTF8-STRING->CODE-POINT)))
+  (read-utf8-code-point (string->input-port string)))
 
 (define (read-utf8-code-point-from-source source)
+  ;; This is separately implemented to speed up the parser buffer.
   (let ((c0 (source))
 	(get-next
 	 (lambda ()
@@ -521,4 +466,52 @@
 			  (fix:or (fix:lsh n4 6)
 				  n5)))))
 	       (else
-		#f)))))
\ No newline at end of file
+		#f)))))
+
+(define (write-utf8-code-point n port)
+
+  (define-integrable (initial-char n-bits offset)
+    (fix:or (fix:and (fix:lsh #xFF (fix:+ n-bits 1)) #xFF)
+	    (fix:lsh n (fix:- 0 offset))))
+
+  (define-integrable (subsequent-char offset)
+    (fix:or #x80 (fix:and (fix:lsh n (fix:- 0 offset)) #x3F)))
+
+  (define-integrable (output-8b n)
+    (write-char (integer->char n) port))
+
+  (if (not (unicode-code-point? n))
+      (error:wrong-type-argument n "unicode code point"
+				 'CODE-POINT->UTF8-STRING))
+  (cond ((< n #x00000080)
+	 (output-8b n))
+	((< n #x00000800)
+	 (output-8b (initial-char 5 6))
+	 (output-8b (subsequent-char 0)))
+	((< n #x00010000)
+	 (output-8b (initial-char 4 12))
+	 (output-8b (subsequent-char 6))
+	 (output-8b (subsequent-char 0)))
+	((< n #x00200000)
+	 (output-8b (initial-char 3 18))
+	 (output-8b (subsequent-char 12))
+	 (output-8b (subsequent-char 6))
+	 (output-8b (subsequent-char 0)))
+	((< n #x04000000)
+	 (output-8b (initial-char 2 24))
+	 (output-8b (subsequent-char 18))
+	 (output-8b (subsequent-char 12))
+	 (output-8b (subsequent-char 6))
+	 (output-8b (subsequent-char 0)))
+	(else
+	 (output-8b (initial-char 1 30))
+	 (output-8b (subsequent-char 24))
+	 (output-8b (subsequent-char 18))
+	 (output-8b (subsequent-char 12))
+	 (output-8b (subsequent-char 6))
+	 (output-8b (subsequent-char 0)))))
+
+(define (code-point->utf8-string n)
+  (with-string-output-port
+    (lambda (port)
+      (write-utf8-code-point n port))))