From 5edc2d38c01c5d6181551facbce7ca2bd1e7226b Mon Sep 17 00:00:00 2001 From: Chris Hanson Date: Mon, 14 Apr 2003 19:40:36 +0000 Subject: [PATCH] Add host-endian procedures for UTF-32 and UTF-16. Flesh out the UTF-32 abstraction to correspond to UTF-16. --- v7/src/runtime/runtime.pkg | 18 ++- v7/src/runtime/unicode.scm | 282 ++++++++++++++++++++++++++++--------- 2 files changed, 229 insertions(+), 71 deletions(-) diff --git a/v7/src/runtime/runtime.pkg b/v7/src/runtime/runtime.pkg index 98bec5e2b..3fdf5238e 100644 --- a/v7/src/runtime/runtime.pkg +++ b/v7/src/runtime/runtime.pkg @@ -1,6 +1,6 @@ #| -*-Scheme-*- -$Id: runtime.pkg,v 14.442 2003/04/14 18:59:08 cph Exp $ +$Id: runtime.pkg,v 14.443 2003/04/14 19:40:36 cph Exp $ Copyright 1988,1989,1990,1991,1992,1993 Massachusetts Institute of Technology Copyright 1994,1995,1996,1997,1998,1999 Massachusetts Institute of Technology @@ -4499,9 +4499,9 @@ USA. (files "unicode") (parent (runtime)) (export () + 8-bit-alphabet? - 8-bit-alphabet? alphabet alphabet+ alphabet- @@ -4524,8 +4524,10 @@ USA. open-wide-input-string open-wide-output-string read-utf16-be-char + read-utf16-char read-utf16-le-char read-utf32-be-char + read-utf32-char read-utf32-le-char read-utf8-char string->alphabet @@ -4535,8 +4537,14 @@ USA. utf16-be-string-length utf16-le-string->wide-string utf16-le-string-length + utf16-string->wide-string + utf16-string-length + utf32-be-string->wide-string utf32-be-string-length + utf32-le-string->wide-string utf32-le-string-length + utf32-string->wide-string + utf32-string-length utf8-string->wide-string utf8-string-length well-formed-code-point-list? @@ -4545,6 +4553,10 @@ USA. wide-string->string wide-string->utf16-be-string wide-string->utf16-le-string + wide-string->utf16-string + wide-string->utf32-be-string + wide-string->utf32-le-string + wide-string->utf32-string wide-string->utf8-string wide-string-index? wide-string-length @@ -4552,8 +4564,10 @@ USA. wide-string-set! wide-string? write-utf16-be-char + write-utf16-char write-utf16-le-char write-utf32-be-char + write-utf32-char write-utf32-le-char write-utf8-char) (export (runtime parser-buffer) diff --git a/v7/src/runtime/unicode.scm b/v7/src/runtime/unicode.scm index 640853a17..77ea6c329 100644 --- a/v7/src/runtime/unicode.scm +++ b/v7/src/runtime/unicode.scm @@ -1,6 +1,6 @@ #| -*-Scheme-*- -$Id: unicode.scm,v 1.8 2003/03/07 21:24:45 cph Exp $ +$Id: unicode.scm,v 1.9 2003/04/14 19:40:04 cph Exp $ Copyright 2001,2003 Massachusetts Institute of Technology @@ -116,7 +116,7 @@ USA. (error "Illegal input byte:" b)) b)))) -(define (write-byte byte port) +(define-integrable (write-byte byte port) (write-char (integer->char byte) port)) (define (initialize-package!) @@ -144,9 +144,12 @@ USA. (not (illegal-code? object)) (fix:< object char-code-limit)))) -(define (guarantee-unicode-code-point object caller) +(define-integrable (guarantee-unicode-code-point object caller) (if (not (unicode-code-point? object)) - (error:wrong-type-argument object "Unicode code point" caller))) + (error:not-unicode-code-point object caller))) + +(define (error:not-unicode-code-point object caller) + (error:wrong-type-argument object "Unicode code point" caller)) (define-integrable (illegal-code? pt) (or (fix:= #xD800 (fix:and #xF800 pt)) @@ -159,9 +162,12 @@ USA. (high1 #f read-only #t) (high2 #f read-only #t)) -(define (guarantee-alphabet object caller) +(define-integrable (guarantee-alphabet object caller) (if (not (alphabet? object)) - (error:wrong-type-argument object "Unicode alphabet" caller))) + (error:not-alphabet object caller))) + +(define (error:not-alphabet object caller) + (error:wrong-type-argument object "Unicode alphabet" caller)) (define-integrable (make-alphabet-low) (make-string #x100 (integer->char 0))) @@ -219,9 +225,12 @@ USA. (fix:< (car item) (cdr item))) (unicode-code-point? item))) -(define (guarantee-well-formed-code-point-list object caller) +(define-integrable (guarantee-well-formed-code-point-list object caller) (if (not (well-formed-code-point-list? object)) - (error:wrong-type-argument object "Unicode code-point list" caller))) + (error:not-well-formed-code-point-list object caller))) + +(define (error:not-well-formed-code-point-list object caller) + (error:wrong-type-argument object "Unicode code-point list" caller)) (define (code-points->alphabet items) (guarantee-well-formed-code-point-list items 'CODE-POINTS->ALPHABET) @@ -698,13 +707,18 @@ USA. ;;;; UTF-32 representation +(define (read-utf32-char port) + (if (host-big-endian?) + (read-utf32-be-char port) + (read-utf32-le-char port))) + (define (read-utf32-be-char port) (%read-utf32-char port utf32-be-bytes->code-point 'READ-UTF32-BE-CHAR)) (define (read-utf32-le-char port) (%read-utf32-char port utf32-le-bytes->code-point 'READ-UTF32-LE-CHAR)) -(define (%read-utf32-char port combiner caller) +(define-integrable (%read-utf32-char port combiner caller) (let ((b0 (read-byte port))) (if (eof-object? b0) b0 @@ -731,31 +745,121 @@ USA. (fix:lsh b1 8) b0)) +(define (write-utf32-char char port) + (if (host-big-endian?) + (write-utf32-be-char char port) + (write-utf32-le-char char port))) + (define (write-utf32-be-char char port) (guarantee-wide-char char 'WRITE-UTF32-BE-CHAR) + (%write-utf32-be-char char port)) + +(define (write-utf32-le-char char port) + (guarantee-wide-char char 'WRITE-UTF32-LE-CHAR) + (%write-utf32-le-char char port)) + +(define-integrable (%write-utf32-be-char char port) (let ((pt (char->integer char))) (write-byte 0 port) (write-byte (fix:lsh pt -16) port) (write-byte (fix:lsh pt -8) port) (write-byte (fix:and pt #xFF) port))) -(define (write-utf32-le-char char port) - (guarantee-wide-char char 'WRITE-UTF32-LE-CHAR) +(define-integrable (%write-utf32-le-char char port) (let ((pt (char->integer char))) (write-byte (fix:and pt #xFF) port) (write-byte (fix:lsh pt -8) port) (write-byte (fix:lsh pt -16) port) (write-byte 0 port))) + +(define (utf32-string->wide-string string #!optional start end) + (%utf32-string->wide-string string + (if (default-object? start) #f start) + (if (default-object? end) #f end) + (if (host-big-endian?) + read-utf32-be-char + read-utf32-le-char))) + +(define (utf32-be-string->wide-string string #!optional start end) + (%utf32-string->wide-string string + (if (default-object? start) #f start) + (if (default-object? end) #f end) + read-utf32-be-char)) + +(define (utf32-le-string->wide-string string #!optional start end) + (%utf32-string->wide-string string + (if (default-object? start) #f start) + (if (default-object? end) #f end) + read-utf32-le-char)) + +(define (%utf32-string->wide-string string start end read-utf32-char) + (let ((input (open-input-string string start end))) + (call-with-wide-output-string + (lambda (output) + (let loop () + (let ((char (read-utf32-char input))) + (if (not (eof-object? char)) + (begin + (write-char char output) + (loop))))))))) + +(define (wide-string->utf32-string string #!optional start end) + (%wide-string->utf32-string string + (if (default-object? start) #f start) + (if (default-object? end) #f end) + (if (host-big-endian?) + %write-utf32-be-char + %write-utf32-le-char))) + +(define (wide-string->utf32-be-string string #!optional start end) + (%wide-string->utf32-string string + (if (default-object? start) #f start) + (if (default-object? end) #f end) + %write-utf32-be-char)) + +(define (wide-string->utf32-le-string string #!optional start end) + (%wide-string->utf32-string string + (if (default-object? start) #f start) + (if (default-object? end) #f end) + %write-utf32-le-char)) + +(define (%wide-string->utf32-string string start end write-utf32-char) + (let ((input (open-wide-input-string string start end))) + (call-with-output-string + (lambda (output) + (let loop () + (let ((char (read-char input))) + (if (not (eof-object? char)) + (begin + (write-utf32-char char output) + (loop))))))))) + +(define (utf32-string-length string #!optional start end) + (if (host-big-endian?) + (%utf32-string-length string + (if (default-object? start) #f start) + (if (default-object? end) #f end) + "32BE" utf32-be-bytes->code-point + 'UTF32-STRING-LENGTH) + (%utf32-string-length string + (if (default-object? start) #f start) + (if (default-object? end) #f end) + "32LE" utf32-le-bytes->code-point + 'UTF32-STRING-LENGTH))) (define (utf32-be-string-length string #!optional start end) - (with-substring-args string start end 'UTF32-BE-STRING-LENGTH - (%utf32-string-length string start end "32BE" utf32-be-bytes->code-point - 'UTF32-BE-STRING-LENGTH))) + (%utf32-string-length string + (if (default-object? start) #f start) + (if (default-object? end) #f end) + "32BE" utf32-be-bytes->code-point + 'UTF32-BE-STRING-LENGTH)) (define (utf32-le-string-length string #!optional start end) - (with-substring-args string start end 'UTF32-LE-STRING-LENGTH - (%utf32-string-length string start end "32LE" utf32-le-bytes->code-point - 'UTF32-LE-STRING-LENGTH))) + (%utf32-string-length string + (if (default-object? start) #f start) + (if (default-object? end) #f end) + "32LE" utf32-le-bytes->code-point + 'UTF32-LE-STRING-LENGTH)) (define (%utf32-string-length string start end type combiner caller) (with-substring-args string start end caller @@ -775,13 +879,18 @@ USA. ;;;; UTF-16 representation +(define (read-utf16-char port) + (if (host-big-endian?) + (read-utf16-be-char port) + (read-utf16-le-char port))) + (define (read-utf16-be-char port) (%read-utf16-char port be-bytes->digit16 'READ-UTF16-BE-CHAR)) (define (read-utf16-le-char port) (%read-utf16-char port le-bytes->digit16 'READ-UTF16-LE-CHAR)) -(define (%read-utf16-char port combinator caller) +(define-integrable (%read-utf16-char port combinator caller) (let ((d0 (read-utf16-digit port combinator))) (if (eof-object? d0) d0 @@ -797,7 +906,7 @@ USA. (guarantee-unicode-code-point pt caller) (integer->char pt))))) -(define (read-utf16-digit port combinator) +(define-integrable (read-utf16-digit port combinator) (let ((b0 (read-byte port))) (if (eof-object? b0) b0 @@ -806,46 +915,11 @@ USA. (error "Truncated UTF-16 input.")) (combinator b0 b1))))) -(define-integrable (be-bytes->digit16 b0 b1) - (fix:or (fix:lsh b0 8) b1)) - -(define-integrable (le-bytes->digit16 b0 b1) - (fix:or (fix:lsh b1 8) b0)) - -(define-integrable (high-surrogate? n) - (fix:= #xD800 (fix:and #xFC00 n))) - -(define-integrable (low-surrogate? n) - (fix:= #xDC00 (fix:and #xFC00 n))) - -(define-integrable (combine-surrogates n0 n1) - (fix:+ (fix:+ (fix:lsh (fix:and n0 #x3FF) 10) - (fix:and n1 #x3FF)) - #x10000)) - -(define (utf16-be-string->wide-string string #!optional start end) - (%utf16-string->wide-string string - (if (default-object? start) #f start) - (if (default-object? end) #f end) - read-utf16-be-char)) - -(define (utf16-le-string->wide-string string #!optional start end) - (%utf16-string->wide-string string - (if (default-object? start) #f start) - (if (default-object? end) #f end) - read-utf16-le-char)) +(define (write-utf16-char char port) + (if (host-big-endian?) + (write-utf16-be-char char port) + (write-utf16-le-char char port))) -(define (%utf16-string->wide-string string start end read-utf16-char) - (let ((input (open-input-string string start end))) - (call-with-wide-output-string - (lambda (output) - (let loop () - (let ((char (read-utf16-char input))) - (if (not (eof-object? char)) - (begin - (write-char char output) - (loop))))))))) - (define (write-utf16-be-char char port) (guarantee-wide-char char 'WRITE-UTF16-BE-CHAR) (%write-utf16-be-char char port)) @@ -866,7 +940,7 @@ USA. (output (fix:and digit #x00FF)) (output (fix:lsh digit -8))))) -(define (%write-utf16-char char port dissecter) +(define-integrable (%write-utf16-char char port dissecter) (let ((pt (char->integer char)) (write-byte (lambda (byte) (write-byte byte port)))) (if (fix:< pt #x10000) @@ -874,6 +948,45 @@ USA. (let ((s (fix:- pt #x10000))) (dissecter (fix:or #xD800 (fix:lsh s -10)) write-byte) (dissecter (fix:or #xDC00 (fix:and s #x3FF)) write-byte))))) + +(define (utf16-string->wide-string string #!optional start end) + (%utf16-string->wide-string string + (if (default-object? start) #f start) + (if (default-object? end) #f end) + (if (host-big-endian?) + read-utf16-be-char + read-utf16-le-char))) + +(define (utf16-be-string->wide-string string #!optional start end) + (%utf16-string->wide-string string + (if (default-object? start) #f start) + (if (default-object? end) #f end) + read-utf16-be-char)) + +(define (utf16-le-string->wide-string string #!optional start end) + (%utf16-string->wide-string string + (if (default-object? start) #f start) + (if (default-object? end) #f end) + read-utf16-le-char)) + +(define (%utf16-string->wide-string string start end read-utf16-char) + (let ((input (open-input-string string start end))) + (call-with-wide-output-string + (lambda (output) + (let loop () + (let ((char (read-utf16-char input))) + (if (not (eof-object? char)) + (begin + (write-char char output) + (loop))))))))) + +(define (wide-string->utf16-string string #!optional start end) + (%wide-string->utf16-string string + (if (default-object? start) #f start) + (if (default-object? end) #f end) + (if (host-big-endian?) + %write-utf16-be-char + %write-utf16-le-char))) (define (wide-string->utf16-be-string string #!optional start end) (%wide-string->utf16-string string @@ -888,10 +1001,7 @@ USA. %write-utf16-le-char)) (define (%wide-string->utf16-string string start end write-utf16-char) - (let ((input - (open-wide-input-string string - (if (default-object? start) #f start) - (if (default-object? end) #f end)))) + (let ((input (open-wide-input-string string start end))) (call-with-output-string (lambda (output) (let loop () @@ -901,15 +1011,32 @@ USA. (write-utf16-char char output) (loop))))))))) +(define (utf16-string-length string #!optional start end) + (if (host-big-endian?) + (%utf16-string-length string + (if (default-object? start) #f start) + (if (default-object? end) #f end) + "16BE" be-bytes->digit16 + 'UTF16-STRING-LENGTH) + (%utf16-string-length string + (if (default-object? start) #f start) + (if (default-object? end) #f end) + "16LE" le-bytes->digit16 + 'UTF16-STRING-LENGTH))) + (define (utf16-be-string-length string #!optional start end) - (with-substring-args string start end 'UTF16-BE-STRING-LENGTH - (%utf16-string-length string start end "16BE" be-bytes->digit16 - 'UTF16-BE-STRING-LENGTH))) + (%utf16-string-length string + (if (default-object? start) #f start) + (if (default-object? end) #f end) + "16BE" be-bytes->digit16 + 'UTF16-BE-STRING-LENGTH)) (define (utf16-le-string-length string #!optional start end) - (with-substring-args string start end 'UTF16-LE-STRING-LENGTH - (%utf16-string-length string start end "16LE" le-bytes->digit16 - 'UTF16-LE-STRING-LENGTH))) + (%utf16-string-length string + (if (default-object? start) #f start) + (if (default-object? end) #f end) + "16LE" le-bytes->digit16 + 'UTF16-LE-STRING-LENGTH)) (define (%utf16-string-length string start end type combiner caller) (with-substring-args string start end caller @@ -932,6 +1059,23 @@ USA. (and (unicode-code-point? d0) (fix:+ start 2))))) start))))) + +(define-integrable (be-bytes->digit16 b0 b1) + (fix:or (fix:lsh b0 8) b1)) + +(define-integrable (le-bytes->digit16 b0 b1) + (fix:or (fix:lsh b1 8) b0)) + +(define-integrable (high-surrogate? n) + (fix:= #xD800 (fix:and #xFC00 n))) + +(define-integrable (low-surrogate? n) + (fix:= #xDC00 (fix:and #xFC00 n))) + +(define-integrable (combine-surrogates n0 n1) + (fix:+ (fix:+ (fix:lsh (fix:and n0 #x3FF) 10) + (fix:and n1 #x3FF)) + #x10000)) ;;;; UTF-8 representation -- 2.25.1