From 33590db0e106ce3fcc8241fb75663cffbf2d747b Mon Sep 17 00:00:00 2001 From: Chris Hanson Date: Fri, 28 Feb 2003 04:40:25 +0000 Subject: [PATCH] Major rewrite of Unicode support. New design supports "wide" characters and strings, and has support for UTF-8, UTF-16, and UTF-32 encodings. --- v7/src/runtime/make.scm | 3 +- v7/src/runtime/parser-buffer.scm | 14 +- v7/src/runtime/runtime.pkg | 62 +- v7/src/runtime/unicode.scm | 1088 +++++++++++++++++++++++------- 4 files changed, 899 insertions(+), 268 deletions(-) diff --git a/v7/src/runtime/make.scm b/v7/src/runtime/make.scm index adee95bd3..43f2a29ed 100644 --- a/v7/src/runtime/make.scm +++ b/v7/src/runtime/make.scm @@ -1,6 +1,6 @@ #| -*-Scheme-*- -$Id: make.scm,v 14.89 2003/02/14 18:28:33 cph Exp $ +$Id: make.scm,v 14.90 2003/02/28 04:40:06 cph Exp $ Copyright (c) 1988,1989,1990,1991,1992 Massachusetts Institute of Technology Copyright (c) 1993,1994,1995,1996,1997 Massachusetts Institute of Technology @@ -486,6 +486,7 @@ USA. (RUNTIME PATHNAME) (RUNTIME WORKING-DIRECTORY) (RUNTIME LOAD) + (RUNTIME UNICODE) ;; Syntax (RUNTIME NUMBER-PARSER) (RUNTIME PARSER) diff --git a/v7/src/runtime/parser-buffer.scm b/v7/src/runtime/parser-buffer.scm index a488f4e37..7ad4f6672 100644 --- a/v7/src/runtime/parser-buffer.scm +++ b/v7/src/runtime/parser-buffer.scm @@ -1,6 +1,6 @@ #| -*-Scheme-*- -$Id: parser-buffer.scm,v 1.7 2003/02/14 18:28:33 cph Exp $ +$Id: parser-buffer.scm,v 1.8 2003/02/28 04:40:12 cph Exp $ Copyright 2001,2002,2003 Massachusetts Institute of Technology @@ -180,12 +180,14 @@ USA. (define (match-utf8-char-in-alphabet buffer alphabet) (let ((p (get-parser-buffer-pointer buffer))) - (if (let ((n - (read-utf8-code-point-from-source + (if (let ((char + (read-utf8-char-from-source (lambda () - (read-parser-buffer-char buffer))))) - (and n - (code-point-in-alphabet? n alphabet))) + (let ((char (read-parser-buffer-char buffer))) + (and char + (char->integer char))))))) + (and (not (eof-object? char)) + (char-in-alphabet? char alphabet))) #t (begin (set-parser-buffer-pointer! buffer p) diff --git a/v7/src/runtime/runtime.pkg b/v7/src/runtime/runtime.pkg index d8ac52b76..ea9d539ad 100644 --- a/v7/src/runtime/runtime.pkg +++ b/v7/src/runtime/runtime.pkg @@ -1,6 +1,6 @@ #| -*-Scheme-*- -$Id: runtime.pkg,v 14.429 2003/02/24 21:53:39 cph Exp $ +$Id: runtime.pkg,v 14.430 2003/02/28 04:40:19 cph Exp $ Copyright (c) 1988,1989,1990,1991,1992 Massachusetts Institute of Technology Copyright (c) 1993,1994,1995,1996,1997 Massachusetts Institute of Technology @@ -243,6 +243,8 @@ USA. flo:vector-ref flo:vector-set! flo:zero? + guarantee-index-fixnum + guarantee-limited-index-fixnum index-fixnum? int:* int:+ @@ -916,9 +918,11 @@ USA. code->char digit->char guarantee-char + guarantee-radix integer->char make-char - name->char) + name->char + radix?) (export (runtime string) %charchar-set alphabet->code-points alphabet->string + alphabet-rtd alphabet? + call-with-wide-output-string char-in-alphabet? char-set->alphabet - code-point->utf8-string - code-point-in-alphabet? code-points->alphabet - read-utf8-code-point - read-utf8-code-point-from-source + guarantee-8-bit-alphabet + guarantee-alphabet + guarantee-unicode-code-point + guarantee-well-formed-code-point-list + guarantee-wide-char + guarantee-wide-string + guarantee-wide-string-index + make-wide-string + open-wide-input-string + open-wide-output-string + read-utf16-be-char + read-utf16-le-char + read-utf32-be-char + read-utf32-le-char + read-utf8-char string->alphabet + string->wide-string unicode-code-point? - utf8-string->code-point - well-formed-code-points-list? - write-utf8-code-point)) + utf16-be-string->wide-string + utf16-be-string-length + utf16-le-string->wide-string + utf16-le-string-length + utf32-be-string-length + utf32-le-string-length + utf8-string->wide-string + utf8-string-length + well-formed-code-point-list? + wide-char? + wide-string + wide-string->string + wide-string->utf16-be-string + wide-string->utf16-le-string + wide-string->utf8-string + wide-string-index? + wide-string-length + wide-string-ref + wide-string-rtd + wide-string-set! + wide-string? + write-utf16-be-char + write-utf16-le-char + write-utf32-be-char + write-utf32-le-char + write-utf8-char) + (export (runtime parser-buffer) + read-utf8-char-from-source)) (define-package (runtime url) (files "url") diff --git a/v7/src/runtime/unicode.scm b/v7/src/runtime/unicode.scm index 8132ac5f9..9f2681396 100644 --- a/v7/src/runtime/unicode.scm +++ b/v7/src/runtime/unicode.scm @@ -1,6 +1,6 @@ #| -*-Scheme-*- -$Id: unicode.scm,v 1.5 2003/02/14 18:28:34 cph Exp $ +$Id: unicode.scm,v 1.6 2003/02/28 04:40:25 cph Exp $ Copyright 2001,2003 Massachusetts Institute of Technology @@ -24,8 +24,12 @@ USA. |# ;;;; Unicode support +;;; package: (runtime unicode) -;;; UTF-8 encoding: +;;; See "http://www.cl.cam.ac.uk/~mgk25/unicode.html". +;;; +;;; UTF-8 encoding +;;; ============== ;;; ;;; max code encoding ;;; ---------- ----------------------------------------------------- @@ -35,14 +39,130 @@ USA. ;;; #x00200000 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx ;;; #x04000000 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx ;;; #x80000000 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx +;;; +;;; It is possible to represent codes with over-long sequences, but +;;; this is disallowed. For example, #\A is normally represented as +;;; #x41, but could also be written as #xC1 #x81, or even longer +;;; sequences. +;;; +;;; Additionally, the codes #xD800 through #xDFFF, #xFFFE, and #xFFFF +;;; are disallowed, as they are not valid Unicode characters. +;;; +;;; UTF-16 encoding +;;; =============== +;;; +;;; Codes in the ranges #x0000 through #xD7FF and #xE000 through +;;; #xFFFD are represented as themselves. Codes in the range #x10000 +;;; through #xFFFFF are represented as a pair: +;;; +;;; 110110xxxxxxxxxx 110111xxxxxxxxxx +;;; +;;; where the first 16-bit word contains the MS 10 bits, and the +;;; second contains the LS 10 bits. As for UTF-8, overlong sequences +;;; are disallowed. +;;; +;;; Some UTF-16 documents start with the code #xFEFF, to identify the +;;; endianness of the document. If instead #xFFFE is encountered, the +;;; opposite endianness should be used. (declare (usual-integrations)) +(define-syntax with-substring-args + (sc-macro-transformer + (lambda (form environment) + (if (syntax-match? '(expression expression expression expression + + expression) + (cdr form)) + (let ((string (close-syntax (list-ref form 1) environment)) + (start (close-syntax (list-ref form 2) environment)) + (end (close-syntax (list-ref form 3) environment)) + (caller (close-syntax (list-ref form 4) environment))) + `(BEGIN + (GUARANTEE-STRING ,string ,caller) + (LET* ((,(list-ref form 3) + (IF (OR (DEFAULT-OBJECT? ,end) (NOT ,end)) + (STRING-LENGTH ,string) + (GUARANTEE-SUBSTRING-END-INDEX + ,end (STRING-LENGTH ,string) ,caller))) + (,(list-ref form 2) + (IF (OR (DEFAULT-OBJECT? ,start) (NOT ,start)) + 0 + (GUARANTEE-SUBSTRING-START-INDEX + ,start ,(list-ref form 3) ,caller)))) + ,@(map (lambda (expr) + (make-syntactic-closure environment + (list (list-ref form 2) (list-ref form 3)) + expr)) + (list-tail form 5))))) + (ill-formed-syntax form))))) + +(define (encoded-string-length string start end type caller validate-char) + (let loop ((start start) (n 0)) + (if (fix:< start end) + (let ((start* (validate-char string start end))) + (if (not start*) + (error:wrong-type-argument string + (string-append "UTF-" type " string") + caller)) + (loop start* (fix:+ n 1))) + n))) + +(define (read-byte port) + (let ((char (read-char port))) + (if (eof-object? char) + char + (let ((b (char->integer char))) + (if (not (fix:< b #x100)) + (error "Illegal input byte:" b)) + b)))) + +(define (write-byte byte port) + (write-char (integer->char byte) port)) + +(define (initialize-package!) + (set! ws-output-port-type (make-port-type ws-output-operations #f)) + (set! ws-input-port-type (make-port-type ws-input-operations #f)) + unspecific) + +;;;; Unicode characters + +(define (wide-char? object) + (and (char? object) + (fix:= (char-bits object) 0) + (unicode-code-point? (char-code object)))) + +(define (guarantee-wide-char object caller) + (if (not (wide-char? object)) + (error:not-wide-char object caller))) + +(define (error:not-wide-char object caller) + (error:wrong-type-argument object "Unicode character" caller)) + +(define (unicode-code-point? object) + (and (index-fixnum? object) + (if (fix:< object #x10000) + (not (illegal-code? object)) + (fix:< object char-code-limit)))) + +(define (guarantee-unicode-code-point object caller) + (if (not (unicode-code-point? object)) + (error:wrong-type-argument object "Unicode code point" caller))) + +(define-integrable (illegal-code? pt) + (or (fix:= #xD800 (fix:and #xF800 pt)) + (fix:= #xFFFE (fix:and #xFFFE pt)))) + +;;;; Alphabets + (define-structure (alphabet (type-descriptor alphabet-rtd)) (low #f read-only #t) (high1 #f read-only #t) (high2 #f read-only #t)) +(define (guarantee-alphabet object caller) + (if (not (alphabet? object)) + (error:wrong-type-argument object "Unicode alphabet" caller))) + (define-integrable (make-alphabet-low) (make-string #x100 (integer->char 0))) @@ -60,37 +180,54 @@ USA. (define null-alphabet (make-alphabet (make-alphabet-low) '#() '#())) -(define (unicode-code-point? n) - (and (exact-nonnegative-integer? n) - (< n #x80000000))) - -(define (code-point-in-alphabet? n alphabet) - (if (not (unicode-code-point? n)) - (error:wrong-type-argument n "unicode code point" - 'CODE-POINT-IN-ALPHABET?)) - (if (not (alphabet? alphabet)) - (error:wrong-type-argument alphabet "unicode alphabet" - 'CODE-POINT-IN-ALPHABET?)) - (if (< n #x800) - (alphabet-low-ref (alphabet-low alphabet) n) +(define (char-in-alphabet? char alphabet) + (guarantee-wide-char char 'CHAR-IN-ALPHABET?) + (guarantee-alphabet alphabet 'CHAR-IN-ALPHABET?) + (%code-point-in-alphabet? (char-code char) alphabet)) + +(define (%code-point-in-alphabet? pt alphabet) + (if (fix:< pt #x800) + (alphabet-low-ref (alphabet-low alphabet) pt) (let ((high1 (alphabet-high1 alphabet)) (high2 (alphabet-high2 alphabet))) (let loop ((lower 0) (upper (vector-length high1))) (and (fix:< lower upper) (let ((index (fix:quotient (fix:+ lower upper) 2))) - (cond ((< n (vector-ref high1 index)) + (cond ((fix:< pt (vector-ref high1 index)) (loop lower index)) - ((< (vector-ref high2 index) n) + ((fix:< (vector-ref high2 index) pt) (loop (fix:+ index 1) upper)) (else #t)))))))) - -(define (char-in-alphabet? char alphabet) - (code-point-in-alphabet? (char-code char) alphabet)) +(define (well-formed-code-point-list? items) + (if (pair? items) + (and (well-formed-item? (car items)) + (let loop ((a (car items)) (items (cdr items))) + (or (not (pair? items)) + (let ((b (car items)) + (items (cdr items))) + (and (well-formed-item? b) + (fix:< (if (pair? a) (cdr a) a) + (if (pair? b) (car b) b)) + (loop b items)))))) + (null? items))) + +(define (well-formed-item? item) + (if (pair? item) + (and (unicode-code-point? (car item)) + (unicode-code-point? (cdr item)) + (fix:< (car item) (cdr item))) + (unicode-code-point? item))) + +(define (guarantee-well-formed-code-point-list object caller) + (if (not (well-formed-code-point-list? object)) + (error:wrong-type-argument object "Unicode code-point list" caller))) + (define (code-points->alphabet items) - (if (not (well-formed-code-points-list? items)) - (error:wrong-type-argument items "code-points list" - 'CODE-POINTS->ALPHABET)) + (guarantee-well-formed-code-point-list items 'CODE-POINTS->ALPHABET) + (%code-points->alphabet items)) + +(define (%code-points->alphabet items) (call-with-values (lambda () (split-list items #x800)) (lambda (low-items high-items) (let ((low (make-alphabet-low))) @@ -121,77 +258,55 @@ USA. (if (pair? items) (let ((item (car items))) (cond ((not (pair? item)) - (if (< item limit) + (if (fix:< item limit) (loop (cdr items) (cons item low)) (values low items))) - ((< (cdr item) limit) + ((fix:< (cdr item) limit) (loop (cdr items) (cons item low))) - ((<= limit (car item)) + ((fix:<= limit (car item)) (values low items)) (else (values (cons (cons (car item) (- limit 1)) low) (cons (cons limit (cdr item)) items))))) (values low '())))) - -(define (well-formed-code-points-list? items) - (if (pair? items) - (and (well-formed-item? (car items)) - (let loop ((a (car items)) (items (cdr items))) - (or (not (pair? items)) - (let ((b (car items)) - (items (cdr items))) - (and (well-formed-item? b) - (< (if (pair? a) (cdr a) a) - (if (pair? b) (car b) b)) - (loop b items)))))) - (null? items))) - -(define (well-formed-item? item) - (if (pair? item) - (and (unicode-code-point? (car item)) - (unicode-code-point? (cdr item)) - (< (car item) (cdr item))) - (unicode-code-point? item))) -(define (char-set->alphabet char-set) - (let ((low (make-alphabet-low))) - (do ((i 0 (fix:+ i 1))) - ((fix:= i #x100)) - (if (char-set-member? char-set (integer->char i)) - (alphabet-low-set! low i))) - (make-alphabet low '#() '#()))) - -(define (alphabet->char-set alphabet) - (predicate->char-set (lambda (char) (char-in-alphabet? char alphabet)))) +(define (alphabet . chars) + (%code-points->alphabet + (sorted-chars->wfcp-list (remove-duplicate-chars chars)))) -(define (string->alphabet string) - (if (not (string? string)) - (error:wrong-type-argument string "string" 'STRING->ALPHABET)) - (let ((n (string-length string)) - (low (make-alphabet-low))) - (do ((i 0 (fix:+ i 1))) - ((fix:= i n)) - (alphabet-low-set! low (vector-8b-ref string i))) - (make-alphabet low '#() '#()))) +(define (remove-duplicate-chars chars) + (let ((table (make-eq-hash-table))) + (for-each (lambda (char) + (guarantee-wide-char char 'REMOVE-DUPLICATE-CHARS) + (hash-table/put! table char #t)) + chars) + (hash-table/key-list table))) -(define (alphabet->string alphabet) - (let loop ((i 0) (chars '())) - (if (fix:< i #x100) - (loop (fix:+ i 1) - (if (code-point-in-alphabet? i alphabet) - (cons (integer->char i) chars) - chars)) - (apply string (reverse! chars))))) - -(define (8-bit-alphabet? alphabet) - (and (fix:= (vector-length (alphabet-high1 alphabet)) 0) - (let ((low (alphabet-low alphabet))) - (let loop ((i #x20)) - (or (fix:= i #x100) - (and (fix:= (vector-8b-ref low i) 0) - (loop (fix:+ i 1)))))))) +(define (sorted-chars->wfcp-list chars) + (let ((chars (sort chars charinteger (car chars))) + (chars (cdr chars)) + (items '())) + (if (pair? chars) + (let ((pt* (char->integer (car chars))) + (chars (cdr chars))) + (if (fix:= pt* (fix:+ pt 1)) + (let find-max ((pt* pt*) (chars chars)) + (if (pair? chars) + (let ((pt** (char->integer (car chars))) + (chars (cdr chars))) + (if (fix:= pt** (fix:+ pt* 1)) + (find-max pt** chars) + (loop pt** chars (cons (cons pt pt*) items)))) + (reverse! (cons (cons pt pt*) items)))) + (loop pt* chars (cons pt items)))) + (reverse! (cons pt items)))) + '()))) (define (alphabet->code-points alphabet) + (guarantee-alphabet alphabet 'ALPHABET->CODE-POINTS) (append! (alphabet-low->code-points (alphabet-low alphabet)) (alphabet-high->code-points (alphabet-high1 alphabet) (alphabet-high2 alphabet)))) @@ -227,11 +342,56 @@ USA. result)) (reverse! result))))) +(define (8-bit-alphabet? alphabet) + (and (fix:= (vector-length (alphabet-high1 alphabet)) 0) + (let ((low (alphabet-low alphabet))) + (let loop ((i #x20)) + (or (fix:= i #x100) + (and (fix:= (vector-8b-ref low i) 0) + (loop (fix:+ i 1)))))))) + +(define-integrable (guarantee-8-bit-alphabet object caller) + (if (not (8-bit-alphabet? object)) + (error:not-8-bit-alphabet object caller))) + +(define (error:not-8-bit-alphabet object caller) + (error:wrong-type-argument object "8-bit alphabet" caller)) + +(define (char-set->alphabet char-set) + (guarantee-char-set char-set 'CHAR-SET->ALPHABET) + (let ((low (make-alphabet-low))) + (do ((i 0 (fix:+ i 1))) + ((fix:= i #x100)) + (if (char-set-member? char-set (integer->char i)) + (alphabet-low-set! low i))) + (make-alphabet low '#() '#()))) + +(define (alphabet->char-set alphabet) + (guarantee-8-bit-alphabet alphabet 'ALPHABET->CHAR-SET) + (predicate->char-set (lambda (char) (char-in-alphabet? char alphabet)))) + +(define (string->alphabet string) + (guarantee-string string 'STRING->ALPHABET) + (let ((n (string-length string)) + (low (make-alphabet-low))) + (do ((i 0 (fix:+ i 1))) + ((fix:= i n)) + (alphabet-low-set! low (vector-8b-ref string i))) + (make-alphabet low '#() '#()))) + +(define (alphabet->string alphabet) + (guarantee-8-bit-alphabet alphabet 'ALPHABET->STRING) + (let loop ((i 0) (chars '())) + (if (fix:< i #x100) + (loop (fix:+ i 1) + (if (%code-point-in-alphabet? i alphabet) + (cons (integer->char i) chars) + chars)) + (apply string (reverse! chars))))) + (define (alphabet+ . alphabets) (for-each (lambda (alphabet) - (if (not (alphabet? alphabet)) - (error:wrong-type-argument alphabet "unicode alphabet" - 'ALPHABET+))) + (guarantee-alphabet alphabet 'ALPHABET+)) alphabets) (reduce alphabet+2 null-alphabet alphabets)) @@ -271,11 +431,11 @@ USA. (subvector-move-left! lower1 i1 n1 lower i) (subvector-move-left! upper1 i1 n1 upper i) (fix:+ i (fix:- n1 i1))) - ((< (vector-ref upper1 i1) (vector-ref lower2 i2)) + ((fix:< (vector-ref upper1 i1) (vector-ref lower2 i2)) (vector-set! lower i (vector-ref lower1 i1)) (vector-set! upper i (vector-ref upper1 i1)) (loop (fix:+ i1 1) i2 (fix:+ i 1))) - ((< (vector-ref upper2 i2) (vector-ref lower1 i1)) + ((fix:< (vector-ref upper2 i2) (vector-ref lower1 i1)) (vector-set! lower i (vector-ref lower2 i2)) (vector-set! upper i (vector-ref upper2 i2)) (loop i1 (fix:+ i2 1) (fix:+ i 1))) @@ -325,16 +485,17 @@ USA. (subvector-move-left! lower1 i1 n1 lower i) (subvector-move-left! upper1 i1 n1 upper i) (fix:+ i (fix:- n1 i1))) - ((< (vector-ref upper1 i1) (vector-ref lower2 i2)) + ((fix:< (vector-ref upper1 i1) (vector-ref lower2 i2)) (vector-set! lower i (vector-ref lower1 i1)) (vector-set! upper i (vector-ref upper1 i1)) (loop (fix:+ i1 1) i2 (fix:+ i 1))) - ((< (vector-ref upper2 i2) (vector-ref lower1 i1)) + ((fix:< (vector-ref upper2 i2) (vector-ref lower1 i1)) (loop i1 (fix:+ i2 1) i)) - ((< (vector-ref lower1 i1) (vector-ref lower2 i2)) + ((fix:< (vector-ref lower1 i1) (vector-ref lower2 i2)) (vector-set! lower i (vector-ref lower1 i1)) (vector-set! upper i (- (vector-ref lower2 i2) 1)) - (if (<= (vector-ref upper1 i1) (vector-ref upper2 i2)) + (if (fix:<= (vector-ref upper1 i1) + (vector-ref upper2 i2)) (loop (fix:+ i1 1) (fix:+ i2 1) (fix:+ i 1)) (begin (vector-set! lower (fix:+ i 1) @@ -342,7 +503,7 @@ USA. (vector-set! upper (fix:+ i 1) (vector-ref upper1 i1)) (loop (fix:+ i1 1) (fix:+ i2 1) (fix:+ i 2))))) - ((<= (vector-ref upper1 i1) (vector-ref upper2 i2)) + ((fix:<= (vector-ref upper1 i1) (vector-ref upper2 i2)) (loop (fix:+ i1 1) (fix:+ i2 1) i)) (else (vector-set! lower i (+ (vector-ref upper2 i2) 1)) @@ -352,171 +513,594 @@ USA. (values (vector-head lower n) (vector-head upper n)) (values lower upper)))))) -(define (read-utf8-code-point port) - (let ((c0 (read-char port)) - (get-next +;;;; Unicode strings + +(define-structure (wide-string (type-descriptor wide-string-rtd) + (constructor %make-wide-string)) + (contents #f read-only #t)) + +(define-integrable (guarantee-wide-string object caller) + (if (not (wide-string? object)) + (error:not-wide-string object caller))) + +(define (error:not-wide-string object caller) + (error:wrong-type-argument object "Unicode string" caller)) + +(define (make-wide-string length #!optional char) + (%make-wide-string + (make-vector length + (if (default-object? char) + (integer->char 0) + (begin + (guarantee-wide-char char 'MAKE-WIDE-STRING) + char))))) + +(define (wide-string . chars) + (for-each (lambda (char) (guarantee-wide-char char 'WIDE-STRING)) chars) + (%make-wide-string (list->vector chars))) + +(define (wide-string-length string) + (guarantee-wide-string string 'WIDE-STRING-LENGTH) + (%wide-string-length string)) + +(define-integrable (%wide-string-length string) + (vector-length (wide-string-contents string))) + +(define (wide-string-ref string index) + (guarantee-wide-string string 'WIDE-STRING-REF) + (guarantee-wide-string-index index string 'WIDE-STRING-REF) + (%wide-string-ref string index)) + +(define-integrable (%wide-string-ref string index) + (vector-ref (wide-string-contents string) index)) + +(define (wide-string-set! string index char) + (guarantee-wide-string string 'WIDE-STRING-SET!) + (guarantee-wide-string-index index string 'WIDE-STRING-SET!) + (guarantee-wide-char char 'WIDE-STRING-SET!) + (%wide-string-set! string index char)) + +(define-integrable (%wide-string-set! string index char) + (vector-set! (wide-string-contents string) index char)) + +(define (wide-string-index? index string) + (and (index-fixnum? index) + (fix:< index (%wide-string-length string)))) + +(define-integrable (guarantee-wide-string-index index string caller) + (if (not (wide-string-index? index string)) + (error:not-wide-string-index index caller))) + +(define (error:not-wide-string-index index caller) + (error:wrong-type-argument index "Unicode string index" caller)) + +(define (open-wide-output-string) + (make-port ws-output-port-type (make-ws-output-state))) + +(define (call-with-wide-output-string generator) + (let ((port (open-wide-output-string))) + (generator port) + (get-output-string port))) + +(define ws-output-port-type) + +(define (make-ws-output-state) + (let ((v (make-vector 17))) + (vector-set! v 0 0) + v)) + +(define ws-output-operations + `((WRITE-CHAR + ,(lambda (port char) + (guarantee-wide-char char 'WRITE-CHAR) + (without-interrupts (lambda () - (let ((c (read-char port))) - (if (eof-object? c) - (error "EOF while reading UTF-8 code point.")) - (if (not (and (fix:<= #x80 (char->integer c)) - (fix:< (char->integer c) #xC0))) - (error "Illegal subsequent UTF-8 char:" c)) - (fix:and (char->integer c) #x3F))))) - (cond ((eof-object? c0) - c0) - ((fix:< (char->integer c0) #x80) - (char->integer c0)) - ((fix:< (char->integer c0) #xE0) - (fix:or (fix:lsh (fix:and (char->integer c0) #x1F) 6) - (get-next))) - ((fix:< (char->integer c0) #xF0) - (let* ((n1 (get-next)) - (n2 (get-next))) - (fix:or (fix:lsh (fix:and (char->integer c0) #x0F) 12) - (fix:or (fix:lsh n1 6) - n2)))) - ((fix:< (char->integer c0) #xF8) - (let* ((n1 (get-next)) - (n2 (get-next)) - (n3 (get-next))) - (fix:or (fix:lsh (fix:and (char->integer c0) #x07) 18) - (fix:or (fix:lsh n1 12) - (fix:or (fix:lsh n2 6) - n3))))) - ((fix:< (char->integer c0) #xFC) - (let* ((n1 (get-next)) - (n2 (get-next)) - (n3 (get-next)) - (n4 (get-next))) - (+ (* (fix:and (char->integer c0) #x03) #x01000000) - (fix:or (fix:lsh n1 18) - (fix:lsh n2 12)) - (fix:or (fix:lsh n3 6) - n4)))) - ((fix:< (char->integer c0) #xFE) - (let* ((n1 (get-next)) - (n2 (get-next)) - (n3 (get-next)) - (n4 (get-next)) - (n5 (get-next))) - (+ (* (fix:and (char->integer c0) #x01) #x40000000) - (* n1 #x01000000) - (fix:or (fix:lsh n2 18) - (fix:lsh n3 12)) - (fix:or (fix:lsh n4 6) - n5)))) - (else - (error "Illegal initial UTF-8 char:" c0))))) + (let* ((v (port/state port)) + (n (vector-ref v 0)) + (n* (fix:+ n 1)) + (v + (if (fix:= (vector-length v) n*) + (vector-grow v (fix:+ n* n)) + v))) + (vector-set! v n* char) + (vector-set! v 0 n*)))))) + (EXTRACT-OUTPUT! + ,(lambda (port) + (%make-wide-string + (without-interrupts + (lambda () + (let ((v (port/state port))) + (subvector v 1 (fix:+ (vector-ref v 0) 1)))))))) + (WRITE-SELF + ,(lambda (port port*) + port + (write-string " to wide string" port*))))) + +(define (string->wide-string string #!optional start end) + (let ((input + (open-input-string string + (if (default-object? start) #f start) + (if (default-object? end) #f end)))) + (call-with-wide-output-string + (lambda (output) + (let loop () + (let ((char (read-char input))) + (if (not (eof-object? char)) + (begin + (write-char char output) + (loop))))))))) + +(define (open-wide-input-string string #!optional start end) + (with-substring-args string start end 'OPEN-WIDE-INPUT-STRING + (make-port ws-input-port-type (make-ws-input-state string start end)))) + +(define ws-input-port-type) + +(define-structure (ws-input-state (type vector) + (conc-name ws-input-state/)) + (string #f read-only #t) + start + (end #f read-only #t)) + +(define-integrable (ws-input-port/string port) + (ws-input-state/string (port/state port))) + +(define-integrable (ws-input-port/start port) + (ws-input-state/start (port/state port))) + +(define-integrable (set-ws-input-port/start! port index) + (set-ws-input-state/start! (port/state port) index)) + +(define-integrable (ws-input-port/end port) + (ws-input-state/end (port/state port))) + +(define ws-input-operations + `((CHAR-READY? + ,(lambda (port interval) + interval + (fix:< (ws-input-port/start port) (ws-input-port/end port)))) + (DISCARD-CHAR + ,(lambda (port) + (set-ws-input-port/start! port (fix:+ (ws-input-port/start port) 1)))) + (PEEK-CHAR + ,(lambda (port) + (let ((start (ws-input-port/start port))) + (if (fix:< start (ws-input-port/end port)) + (%wide-string-ref (ws-input-port/string port) + start) + (make-eof-object port))))) + (READ-CHAR + ,(lambda (port) + (let ((start (ws-input-port/start port))) + (if (fix:< start (ws-input-port/end port)) + (begin + (set-ws-input-port/start! port (fix:+ start 1)) + (%wide-string-ref (ws-input-port/string port) start)) + (make-eof-object port))))) + (WRITE-SELF + ,(lambda (port output-port) + port + (write-string " from wide string" output-port))))) + +(define (wide-string->string string #!optional start end) + (let ((input + (open-wide-input-string string + (if (default-object? start) #f start) + (if (default-object? end) #f end)))) + (call-with-output-string + (lambda (output) + (let loop () + (let ((char (read-char input))) + (if (not (eof-object? char)) + (begin + (write-char char output) + (loop))))))))) + +;;;; UTF-32 representation + +(define (read-utf32-be-char port) + (%read-utf32-char port utf32-be-bytes->code-point 'READ-UTF32-BE-CHAR)) + +(define (read-utf32-le-char port) + (%read-utf32-char port utf32-le-bytes->code-point 'READ-UTF32-LE-CHAR)) + +(define (%read-utf32-char port combiner caller) + (let ((b0 (read-byte port))) + (if (eof-object? b0) + b0 + (let* ((b1 (read-byte port)) + (b2 (read-byte port)) + (b3 (read-byte port))) + (if (or (eof-object? b1) + (eof-object? b2) + (eof-object? b3)) + (error "Truncated UTF-32 input.")) + (let ((pt (combiner b0 b1 b2 b3))) + (guarantee-unicode-code-point pt caller) + (integer->char pt)))))) + +(define-integrable (utf32-be-bytes->code-point b0 b1 b2 b3) + (+ (* b0 #x01000000) + (fix:lsh b1 16) + (fix:lsh b2 8) + b3)) + +(define-integrable (utf32-le-bytes->code-point b0 b1 b2 b3) + (+ (* b3 #x01000000) + (fix:lsh b2 16) + (fix:lsh b1 8) + b0)) -(define (utf8-string->code-point string) - (read-utf8-code-point (open-input-string string))) +(define (write-utf32-be-char char port) + (guarantee-wide-char char 'WRITE-UTF32-BE-CHAR) + (let ((pt (char->integer char))) + (write-byte 0 port) + (write-byte (fix:lsh pt -16) port) + (write-byte (fix:lsh pt -8) port) + (write-byte (fix:and pt #xFF) port))) + +(define (write-utf32-le-char char port) + (guarantee-wide-char char 'WRITE-UTF32-LE-CHAR) + (let ((pt (char->integer char))) + (write-byte (fix:and pt #xFF) port) + (write-byte (fix:lsh pt -8) port) + (write-byte (fix:lsh pt -16) port) + (write-byte 0 port))) + +(define (utf32-be-string-length string #!optional start end) + (with-substring-args string start end 'UTF32-BE-STRING-LENGTH + (%utf32-string-length string start end "32BE" utf32-be-bytes->code-point + 'UTF32-BE-STRING-LENGTH))) + +(define (utf32-le-string-length string #!optional start end) + (with-substring-args string start end 'UTF32-LE-STRING-LENGTH + (%utf32-string-length string start end "32LE" utf32-le-bytes->code-point + 'UTF32-LE-STRING-LENGTH))) + +(define (%utf32-string-length string start end type combiner caller) + (with-substring-args string start end caller + (encoded-string-length string start end type caller + (lambda (string start end) + + (define-integrable (n i) + (vector-8b-ref string (fix:+ start i))) + + (if (fix:< start end) + (let ((start* (fix:+ start 4))) + (and (fix:<= start* end) + (let ((pt (combiner (n 0) (n 1) (n 2) (n 3)))) + (and (unicode-code-point? pt) + start*)))) + start))))) + +;;;; UTF-16 representation + +(define (read-utf16-be-char port) + (%read-utf16-char port be-bytes->digit16 'READ-UTF16-BE-CHAR)) + +(define (read-utf16-le-char port) + (%read-utf16-char port le-bytes->digit16 'READ-UTF16-LE-CHAR)) + +(define (%read-utf16-char port combinator caller) + (let ((d0 (read-utf16-digit port combinator))) + (if (eof-object? d0) + d0 + (let ((pt + (if (high-surrogate? d0) + (let ((d1 (read-utf16-digit port combinator))) + (if (eof-object? d1) + (error "Truncated UTF-16 input.")) + (if (not (low-surrogate? d1)) + (error "Illegal UTF-16 subsequent digit:" d1)) + (combine-surrogates d0 d1)) + d0))) + (guarantee-unicode-code-point pt caller) + (integer->char pt))))) + +(define (read-utf16-digit port combinator) + (let ((b0 (read-byte port))) + (if (eof-object? b0) + b0 + (let ((b1 (read-byte port))) + (if (eof-object? b1) + (error "Truncated UTF-16 input.")) + (combinator b0 b1))))) + +(define-integrable (be-bytes->digit16 b0 b1) + (fix:or (fix:lsh b0 8) b1)) + +(define-integrable (le-bytes->digit16 b0 b1) + (fix:or (fix:lsh b1 8) b0)) + +(define-integrable (high-surrogate? n) + (fix:= #xD800 (fix:and #xFC00 n))) + +(define-integrable (low-surrogate? n) + (fix:= #xDC00 (fix:and #xFC00 n))) + +(define-integrable (combine-surrogates n0 n1) + (fix:+ (fix:+ (fix:lsh (fix:and n0 #x3FF) 10) + (fix:and n1 #x3FF)) + #x10000)) + +(define (utf16-be-string->wide-string string #!optional start end) + (%utf16-string->wide-string string + (if (default-object? start) #f start) + (if (default-object? end) #f end) + read-utf16-be-char)) + +(define (utf16-le-string->wide-string string #!optional start end) + (%utf16-string->wide-string string + (if (default-object? start) #f start) + (if (default-object? end) #f end) + read-utf16-le-char)) + +(define (%utf16-string->wide-string string start end read-utf16-char) + (let ((input (open-input-string string start end))) + (call-with-wide-output-string + (lambda (output) + (let loop () + (let ((char (read-utf16-char input))) + (if (not (eof-object? char)) + (begin + (write-char char output) + (loop))))))))) + +(define (write-utf16-be-char char port) + (guarantee-wide-char char 'WRITE-UTF16-BE-CHAR) + (%write-utf16-be-char char port)) + +(define (write-utf16-le-char char port) + (guarantee-wide-char char 'WRITE-UTF16-LE-CHAR) + (%write-utf16-le-char char port)) + +(define-integrable (%write-utf16-be-char char port) + (%write-utf16-char char port + (lambda (digit output) + (output (fix:lsh digit -8)) + (output (fix:and digit #x00FF))))) + +(define-integrable (%write-utf16-le-char char port) + (%write-utf16-char char port + (lambda (digit output) + (output (fix:and digit #x00FF)) + (output (fix:lsh digit -8))))) + +(define (%write-utf16-char char port dissecter) + (let ((pt (char->integer char)) + (write-byte (lambda (byte) (write-byte byte port)))) + (if (fix:< pt #x10000) + (dissecter pt write-byte) + (let ((s (fix:- pt #x10000))) + (dissecter (fix:or #xD800 (fix:lsh s -10)) write-byte) + (dissecter (fix:or #xDC00 (fix:and s #x3FF)) write-byte))))) + +(define (wide-string->utf16-be-string string #!optional start end) + (%wide-string->utf16-string string + (if (default-object? start) #f start) + (if (default-object? end) #f end) + %write-utf16-be-char)) + +(define (wide-string->utf16-le-string string #!optional start end) + (%wide-string->utf16-string string + (if (default-object? start) #f start) + (if (default-object? end) #f end) + %write-utf16-le-char)) + +(define (%wide-string->utf16-string string start end write-utf16-char) + (let ((input + (open-wide-input-string string + (if (default-object? start) #f start) + (if (default-object? end) #f end)))) + (call-with-output-string + (lambda (output) + (let loop () + (let ((char (read-char input))) + (if (not (eof-object? char)) + (begin + (write-utf16-char char output) + (loop))))))))) -(define (read-utf8-code-point-from-source source) - ;; This is separately implemented to speed up the parser buffer. - (let ((c0 (source)) +(define (utf16-be-string-length string #!optional start end) + (with-substring-args string start end 'UTF16-BE-STRING-LENGTH + (%utf16-string-length string start end "16BE" be-bytes->digit16 + 'UTF16-BE-STRING-LENGTH))) + +(define (utf16-le-string-length string #!optional start end) + (with-substring-args string start end 'UTF16-LE-STRING-LENGTH + (%utf16-string-length string start end "16LE" le-bytes->digit16 + 'UTF16-LE-STRING-LENGTH))) + +(define (%utf16-string-length string start end type combiner caller) + (with-substring-args string start end caller + (encoded-string-length string start end type caller + (lambda (string start end) + + (define-integrable (n i) + (vector-8b-ref string (fix:+ start i))) + + (if (fix:< start end) + (and (fix:<= (fix:+ start 2) end) + (let ((d0 (combiner (n 0) (n 1)))) + (if (high-surrogate? d0) + (and (fix:<= (fix:+ start 4) end) + (let ((d1 (combiner (n 2) (n 3)))) + (and (low-surrogate? d1) + (let ((pt (combine-surrogates d0 d1))) + (and (unicode-code-point? pt) + (fix:+ start 4)))))) + (and (unicode-code-point? d0) + (fix:+ start 2))))) + start))))) + +;;;; UTF-8 representation + +(define (read-utf8-char port) + (read-utf8-char-from-source + (lambda () + (let ((b (read-byte port))) + (if (eof-object? b) + #f + b))))) + +(define (read-utf8-char-from-source source) + (let ((b0 (source)) (get-next (lambda () - (let ((c (source))) - (and c - (fix:<= #x80 (char->integer c)) - (fix:< (char->integer c) #xC0) - (fix:and (char->integer c) #x3F)))))) - (and c0 - (cond ((fix:< (char->integer c0) #x80) - (char->integer c0)) - ((fix:< (char->integer c0) #xE0) - (let ((n1 (get-next))) - (and n1 - (fix:or (fix:lsh (fix:and (char->integer c0) #x1F) 6) - n1)))) - ((fix:< (char->integer c0) #xF0) - (let* ((n1 (get-next)) - (n2 (get-next))) - (and n1 n2 - (fix:or (fix:lsh (fix:and (char->integer c0) #x0F) 12) - (fix:or (fix:lsh n1 6) - n2))))) - ((fix:< (char->integer c0) #xF8) - (let* ((n1 (get-next)) - (n2 (get-next)) - (n3 (get-next))) - (and n1 n2 n3 - (fix:or (fix:lsh (fix:and (char->integer c0) #x07) 18) - (fix:or (fix:lsh n1 12) - (fix:or (fix:lsh n2 6) - n3)))))) - ((fix:< (char->integer c0) #xFC) - (let* ((n1 (get-next)) - (n2 (get-next)) - (n3 (get-next)) - (n4 (get-next))) - (and n1 n2 n3 n4 - (+ (* (fix:and (char->integer c0) #x03) #x01000000) - (fix:or (fix:lsh n1 18) - (fix:lsh n2 12)) - (fix:or (fix:lsh n3 6) - n4))))) - ((fix:< (char->integer c0) #xFE) - (let* ((n1 (get-next)) - (n2 (get-next)) - (n3 (get-next)) - (n4 (get-next)) - (n5 (get-next))) - (and n1 n2 n3 n4 n5 - (+ (* (fix:and (char->integer c0) #x01) #x40000000) - (* n1 #x01000000) - (fix:or (fix:lsh n2 18) - (fix:lsh n3 12)) - (fix:or (fix:lsh n4 6) - n5))))) + (let ((b (source))) + (if (not b) + (error "Truncated UTF-8 input.")) + (if (not (%valid-trailer? b)) + (error "Illegal subsequent UTF-8 byte:" b)) + (fix:and b #x3F))))) + (if b0 + (integer->char + (cond ((fix:< b0 #x80) + b0) + ((fix:< b0 #xE0) + (%vc2 b0) + (%cp2 b0 (get-next))) + ((fix:< b0 #xF0) + (let ((b1 (get-next))) + (%vc3 b0 b1) + (%cp3 b0 b1 (get-next)))) + ((fix:< b0 #xF8) + (let ((b1 (get-next))) + (%vc4 b0 b1) + (let ((b2 (get-next))) + (%cp4 b0 b1 b2 (get-next))))) (else - #f))))) + (error "Illegal UTF-8 byte:" b0)))) + (make-eof-object #f)))) + +(define (utf8-string->wide-string string #!optional start end) + (let ((input + (open-input-string string + (if (default-object? start) #f start) + (if (default-object? end) #f end)))) + (call-with-wide-output-string + (lambda (output) + (let loop () + (let ((char (read-utf8-char input))) + (if (not (eof-object? char)) + (begin + (write-char char output) + (loop))))))))) + +(define (write-utf8-char char port) + (guarantee-wide-char char 'WRITE-UTF8-CHAR) + (%write-utf8-char char port)) + +(define (%write-utf8-char char port) + (let ((pt (char->integer char))) + + (define-integrable (initial-char n-bits offset) + (fix:or (fix:and (fix:lsh #xFF (fix:+ n-bits 1)) #xFF) + (fix:lsh pt (fix:- 0 offset)))) + + (define-integrable (subsequent-char offset) + (fix:or #x80 (fix:and (fix:lsh pt (fix:- 0 offset)) #x3F))) + + (cond ((fix:< pt #x00000080) + (write-byte pt port)) + ((fix:< pt #x00000800) + (write-byte (initial-char 5 6) port) + (write-byte (subsequent-char 0) port)) + ((fix:< pt #x00010000) + (write-byte (initial-char 4 12) port) + (write-byte (subsequent-char 6) port) + (write-byte (subsequent-char 0) port)) + (else + (write-byte (initial-char 3 18) port) + (write-byte (subsequent-char 12) port) + (write-byte (subsequent-char 6) port) + (write-byte (subsequent-char 0) port))))) + +(define (wide-string->utf8-string string #!optional start end) + (let ((input + (open-wide-input-string string + (if (default-object? start) #f start) + (if (default-object? end) #f end)))) + (call-with-output-string + (lambda (output) + (let loop () + (let ((char (read-char input))) + (if (not (eof-object? char)) + (begin + (%write-utf8-char char output) + (loop))))))))) -(define (write-utf8-code-point n port) - - (define-integrable (initial-char n-bits offset) - (fix:or (fix:and (fix:lsh #xFF (fix:+ n-bits 1)) #xFF) - (fix:lsh n (fix:- 0 offset)))) - - (define-integrable (subsequent-char offset) - (fix:or #x80 (fix:and (fix:lsh n (fix:- 0 offset)) #x3F))) - - (define-integrable (output-8b n) - (write-char (integer->char n) port)) - - (if (not (unicode-code-point? n)) - (error:wrong-type-argument n "unicode code point" - 'CODE-POINT->UTF8-STRING)) - (cond ((< n #x00000080) - (output-8b n)) - ((< n #x00000800) - (output-8b (initial-char 5 6)) - (output-8b (subsequent-char 0))) - ((< n #x00010000) - (output-8b (initial-char 4 12)) - (output-8b (subsequent-char 6)) - (output-8b (subsequent-char 0))) - ((< n #x00200000) - (output-8b (initial-char 3 18)) - (output-8b (subsequent-char 12)) - (output-8b (subsequent-char 6)) - (output-8b (subsequent-char 0))) - ((< n #x04000000) - (output-8b (initial-char 2 24)) - (output-8b (subsequent-char 18)) - (output-8b (subsequent-char 12)) - (output-8b (subsequent-char 6)) - (output-8b (subsequent-char 0))) - (else - (output-8b (initial-char 1 30)) - (output-8b (subsequent-char 24)) - (output-8b (subsequent-char 18)) - (output-8b (subsequent-char 12)) - (output-8b (subsequent-char 6)) - (output-8b (subsequent-char 0))))) - -(define (code-point->utf8-string n) - (call-with-output-string - (lambda (port) - (write-utf8-code-point n port)))) +(define (utf8-string-length string #!optional start end) + (with-substring-args string start end 'UTF8-STRING-LENGTH + (encoded-string-length string start end "8" 'UTF8-STRING-LENGTH + (lambda (string start end) + + (define-integrable (check-byte i) + (%valid-trailer? (n i))) + + (define-integrable (n i) + (vector-8b-ref string (fix:+ start i))) + + (if (fix:< start end) + (let ((b0 (vector-8b-ref string start))) + (cond ((fix:< b0 #x80) + (fix:+ start 1)) + ((fix:< b0 #xE0) + (and (fix:<= (fix:+ start 2) end) + (check-byte 1) + (%vs2 b0) + (fix:+ start 2))) + ((fix:< b0 #xF0) + (and (fix:<= (fix:+ start 3) end) + (check-byte 1) + (check-byte 2) + (%vs3 b0 (n 1)) + (fix:+ start 3))) + ((fix:< b0 #xF8) + (and (fix:<= (fix:+ start 4) end) + (check-byte 1) + (%vs4 b0 (n 1)) + (check-byte 2) + (check-byte 3) + (fix:+ start 4))) + (else #f))) + start))))) + +(define-integrable (%vc2 b0) + (if (not (%vs2 b0)) + (error "Illegal UTF-8 sequence:" b0))) + +(define-integrable (%vc3 b0 b1) + (if (not (%vs3 b0 b1)) + (error "Illegal UTF-8 sequence:" b0 b1))) + +(define-integrable (%vc4 b0 b1) + (if (not (%vs4 b0 b1)) + (error "Illegal UTF-8 sequence:" b0 b1))) + +(define-integrable (%vs2 b0) + (fix:> b0 #xC1)) + +(define-integrable (%vs3 b0 b1) + (or (fix:> b0 #xE0) (fix:> b1 #x9F))) + +(define-integrable (%vs4 b0 b1) + (or (fix:> b0 #xF0) (fix:> b1 #x8F))) + +(define-integrable (%cp2 b0 b1) + (fix:or (fix:lsh (fix:and b0 #x1F) 6) + b1)) + +(define-integrable (%cp3 b0 b1 b2) + (fix:or (fix:lsh (fix:and b0 #x0F) 12) + (fix:or (fix:lsh b1 6) + b2))) + +(define-integrable (%cp4 b0 b1 b2 b3) + (fix:or (fix:lsh (fix:and b0 #x07) 18) + (fix:or (fix:lsh b1 12) + (fix:or (fix:lsh b2 6) + b3)))) + +(define-integrable (%valid-trailer? n) + (fix:= #x80 (fix:and #xC0 n))) \ No newline at end of file -- 2.25.1