From: Chris Hanson Date: Wed, 11 Jul 2001 21:23:02 +0000 (+0000) Subject: Add unicode support. X-Git-Tag: 20090517-FFI~2663 X-Git-Url: https://birchwood-abbey.net/git?a=commitdiff_plain;h=d7493339ab5cb3ca05f41bf20d0d21f9b5b223b0;p=mit-scheme.git Add unicode support. --- diff --git a/v7/src/star-parser/compile.scm b/v7/src/star-parser/compile.scm index 4f9326c27..1beb6cc28 100644 --- a/v7/src/star-parser/compile.scm +++ b/v7/src/star-parser/compile.scm @@ -1,6 +1,6 @@ ;;; -*-Scheme-*- ;;; -;;; $Id: compile.scm,v 1.1 2001/06/26 18:03:11 cph Exp $ +;;; $Id: compile.scm,v 1.2 2001/07/11 21:22:55 cph Exp $ ;;; ;;; Copyright (c) 2001 Massachusetts Institute of Technology ;;; @@ -29,7 +29,8 @@ "matcher" "parser" "shared" - "synchk")) + "synchk" + "unicode")) (cref/generate-constructors "parser") (sf "parser.con") (sf "parser.ldr"))) \ No newline at end of file diff --git a/v7/src/star-parser/load.scm b/v7/src/star-parser/load.scm index 6d7ae6e41..865b6060d 100644 --- a/v7/src/star-parser/load.scm +++ b/v7/src/star-parser/load.scm @@ -1,6 +1,6 @@ ;;; -*-Scheme-*- ;;; -;;; $Id: load.scm,v 1.5 2001/07/11 00:41:52 cph Exp $ +;;; $Id: load.scm,v 1.6 2001/07/11 21:22:57 cph Exp $ ;;; ;;; Copyright (c) 2001 Massachusetts Institute of Technology ;;; @@ -24,4 +24,4 @@ (lambda () (fluid-let ((*allow-package-redefinition?* #t)) (package/system-loader "parser" '() 'QUERY)))) -(add-subsystem-identification! "*Parser" '(0 5)) \ No newline at end of file +(add-subsystem-identification! "*Parser" '(0 6)) \ No newline at end of file diff --git a/v7/src/star-parser/matcher.scm b/v7/src/star-parser/matcher.scm index 1560ea035..92e389525 100644 --- a/v7/src/star-parser/matcher.scm +++ b/v7/src/star-parser/matcher.scm @@ -1,6 +1,6 @@ ;;; -*-Scheme-*- ;;; -;;; $Id: matcher.scm,v 1.12 2001/07/10 05:04:44 cph Exp $ +;;; $Id: matcher.scm,v 1.13 2001/07/11 21:23:00 cph Exp $ ;;; ;;; Copyright (c) 2001 Massachusetts Institute of Technology ;;; @@ -139,7 +139,7 @@ `(CHAR-CI ,(string-ref string 0)) expression)))) -(define-matcher-preprocessor 'ALPHABET +(define-matcher-preprocessor 'CHAR-SET (lambda (expression external-bindings internal-bindings) internal-bindings (let ((arg (check-1-arg expression))) @@ -152,6 +152,11 @@ external-bindings)) expression)))) +(define-matcher-preprocessor 'ALPHABET + (lambda (expression external-bindings internal-bindings) + external-bindings internal-bindings + expression)) + (define-matcher-preprocessor 'WITH-POINTER (lambda (expression external-bindings internal-bindings) (check-2-args expression (lambda (expression) (symbol? (cadr expression)))) @@ -258,8 +263,11 @@ (define-atomic-matcher (not-char-ci char) `(MATCH-PARSER-BUFFER-NOT-CHAR-CI ,*buffer-name* ,char)) +(define-atomic-matcher (char-set char-set) + `(MATCH-PARSER-BUFFER-CHAR-IN-SET ,*buffer-name* ,char-set)) + (define-atomic-matcher (alphabet alphabet) - `(MATCH-PARSER-BUFFER-CHAR-IN-SET ,*buffer-name* ,alphabet)) + `(MATCH-UTF8-CHAR-IN-ALPHABET ,*buffer-name* ,alphabet)) (define-atomic-matcher (string string) `(MATCH-PARSER-BUFFER-STRING ,*buffer-name* ,string)) diff --git a/v7/src/star-parser/parser.pkg b/v7/src/star-parser/parser.pkg index 6da877b57..e60526b11 100644 --- a/v7/src/star-parser/parser.pkg +++ b/v7/src/star-parser/parser.pkg @@ -1,6 +1,6 @@ ;;; -*-Scheme-*- ;;; -;;; $Id: parser.pkg,v 1.6 2001/07/11 00:41:55 cph Exp $ +;;; $Id: parser.pkg,v 1.7 2001/07/11 21:22:24 cph Exp $ ;;; ;;; Copyright (c) 2001 Massachusetts Institute of Technology ;;; @@ -68,4 +68,21 @@ (parent ()) (export () define-*matcher-expander - define-*parser-expander)) \ No newline at end of file + define-*parser-expander)) + +(define-package (runtime unicode) + (files "unicode") + (parent ()) + (export () + alphabet+ + alphabet- + alphabet? + char-in-alphabet? + code-point->utf8-string + code-point-in-alphabet? + code-points->alphabet + match-utf8-char-in-alphabet + read-utf8-code-point + string->alphabet + unicode-code-point? + utf8-string->code-point)) \ No newline at end of file diff --git a/v7/src/star-parser/unicode.scm b/v7/src/star-parser/unicode.scm new file mode 100644 index 000000000..b11c5a9e0 --- /dev/null +++ b/v7/src/star-parser/unicode.scm @@ -0,0 +1,474 @@ +;;; -*-Scheme-*- +;;; +;;; $Id: unicode.scm,v 1.1 2001/07/11 21:23:02 cph Exp $ +;;; +;;; Copyright (c) 2001 Massachusetts Institute of Technology +;;; +;;; This program is free software; you can redistribute it and/or +;;; modify it under the terms of the GNU General Public License as +;;; published by the Free Software Foundation; either version 2 of the +;;; License, or (at your option) any later version. +;;; +;;; This program is distributed in the hope that it will be useful, +;;; but WITHOUT ANY WARRANTY; without even the implied warranty of +;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;;; General Public License for more details. +;;; +;;; You should have received a copy of the GNU General Public License +;;; along with this program; if not, write to the Free Software +;;; Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA +;;; 02111-1307, USA. + +;;;; Unicode support + +;;; UTF-8 encoding: +;;; +;;; max code encoding +;;; ---------- ----------------------------------------------------- +;;; #x00000080 0xxxxxxx +;;; #x00000800 110xxxxx 10xxxxxx +;;; #x00010000 1110xxxx 10xxxxxx 10xxxxxx +;;; #x00200000 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx +;;; #x04000000 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx +;;; #x80000000 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx + +(declare (usual-integrations)) + +(define-structure (alphabet (type-descriptor alphabet-rtd)) + (low #f read-only #t) + (high1 #f read-only #t) + (high2 #f read-only #t)) + +(define-integrable (make-alphabet-low) + (make-string #x100 (integer->char 0))) + +(define-integrable (alphabet-low-ref low code-point) + (not (fix:= (fix:and (vector-8b-ref low (fix:lsh code-point -3)) + (fix:lsh 1 (fix:and code-point 7))) + 0))) + +(define-integrable (alphabet-low-set! low code-point) + (vector-set! low + (fix:lsh code-point -3) + (fix:or (vector-ref low (fix:lsh code-point -3)) + (fix:lsh 1 (fix:and code-point 7))))) + +(define null-alphabet + (make-alphabet (make-alphabet-low) '#() '#())) + +(define (unicode-code-point? n) + (and (exact-nonnegative-integer? n) + (< n #x80000000))) + +(define (code-point-in-alphabet? n alphabet) + (if (not (unicode-code-point? n)) + (error:wrong-type-argument n "unicode code point" + 'CODE-POINT-IN-ALPHABET?)) + (if (not (alphabet? alphabet)) + (error:wrong-type-argument alphabet "unicode alphabet" + 'CODE-POINT-IN-ALPHABET?)) + (if (< n #x800) + (alphabet-low-ref (alphabet-low alphabet) n) + (let ((high1 (alphabet-high1 alphabet)) + (high2 (alphabet-high2 alphabet))) + (let loop ((lower 0) (upper (vector-length high1))) + (and (fix:< lower upper) + (let ((index (fix:quotient (fix:+ lower upper) 2))) + (cond ((< n (vector-ref high1 index)) + (loop lower index)) + ((< (vector-ref high2 index) n) + (loop (fix:+ index 1) upper)) + (else #t)))))))) + +(define (char-in-alphabet? char alphabet) + (code-point-in-alphabet? (char-code char) alphabet)) + +(define (string->alphabet string) + (if (not (string? string)) + (error:wrong-type-argument string "string" 'STRING->ALPHABET)) + (let ((n (string-length string)) + (low (make-alphabet-low))) + (do ((i 0 (fix:+ i 1))) + ((fix:= i n)) + (alphabet-low-set! low (vector-8b-ref string i))) + (make-alphabet low '#() '#()))) + +(define (code-points->alphabet items) + (if (not (well-formed-items? items)) + (error:wrong-type-argument items "code-points list" + 'CODE-POINTS->ALPHABET)) + (call-with-values (lambda () (split-list items #x800)) + (lambda (low-items high-items) + (let ((low (make-alphabet-low))) + (for-each (lambda (item) + (if (pair? item) + (do ((i (car item) (fix:+ i 1))) + ((fix:> i (cdr item))) + (alphabet-low-set! low i)) + (alphabet-low-set! low item))) + low-items) + (let ((n-high (length high-items))) + (let ((high1 (make-vector n-high)) + (high2 (make-vector n-high))) + (do ((items high-items (cdr items)) + (i 0 (fix:+ i 1))) + ((not (pair? items))) + (if (pair? (car items)) + (begin + (vector-set! high1 i (caar items)) + (vector-set! high2 i (cdar items))) + (begin + (vector-set! high1 i (car items)) + (vector-set! high2 i (car items))))) + (make-alphabet low high1 high2))))))) + +(define (split-list items limit) + (let loop ((items items) (low '())) + (if (pair? items) + (let ((item (car items))) + (cond ((not (pair? item)) + (if (< item limit) + (loop (cdr items) (cons item low)) + (values low items))) + ((< (cdr item) limit) + (loop (cdr items) (cons item low))) + ((<= limit (car item)) + (values low items)) + (else + (values (cons (cons (car item) (- limit 1)) low) + (cons (cons limit (cdr item)) items))))) + (values low '())))) + +(define (well-formed-items? items) + (or (not (pair? items)) + (and (well-formed-item? (car items)) + (let loop ((a (car items)) (items (cdr items))) + (or (not (pair? items)) + (let ((b (car items)) + (items (cdr items))) + (and (well-formed-item? b) + (< (if (pair? a) (cdr a) a) + (if (pair? b) (car b) b)) + (loop b items)))))))) + +(define (well-formed-item? item) + (if (pair? item) + (and (unicode-code-point? (car item)) + (unicode-code-point? (cdr item)) + (< (car item) (cdr item))) + (unicode-code-point? item))) + +(define (alphabet+ . alphabets) + (for-each (lambda (alphabet) + (if (not (alphabet? alphabet)) + (error:wrong-type-argument alphabet "unicode alphabet" + 'ALPHABET+))) + alphabets) + (reduce alphabet+2 null-alphabet alphabets)) + +(define (alphabet+2 a1 a2) + (call-with-values + (lambda () + (alphabet-high+2 (alphabet-high1 a1) + (alphabet-high2 a1) + (alphabet-high1 a2) + (alphabet-high2 a2))) + (lambda (high1 high2) + (make-alphabet (alphabet-low+2 (alphabet-low a1) (alphabet-low a2)) + high1 + high2)))) + +(define (alphabet-low+2 low1 low2) + (let ((low (make-alphabet-low))) + (do ((i 0 (fix:+ i 1))) + ((fix:= i #x100)) + (vector-8b-set! low i + (fix:or (vector-8b-ref low1 i) + (vector-8b-ref low2 i)))) + low)) + +(define (alphabet-high+2 lower1 upper1 lower2 upper2) + (let ((n1 (vector-length lower1)) + (n2 (vector-length lower2))) + (let ((lower (make-vector (fix:+ n1 n2))) + (upper (make-vector (fix:+ n1 n2)))) + (let ((n + (let loop ((i1 0) (i2 0) (i 0)) + (cond ((fix:= i1 n1) + (subvector-move-left! lower2 i2 n2 lower i) + (subvector-move-left! upper2 i2 n2 upper i) + (fix:+ i (fix:- n2 i2))) + ((fix:= i2 n2) + (subvector-move-left! lower1 i1 n1 lower i) + (subvector-move-left! upper1 i1 n1 upper i) + (fix:+ i (fix:- n1 i1))) + ((< (vector-ref upper1 i1) (vector-ref lower2 i2)) + (vector-set! lower i (vector-ref lower1 i1)) + (vector-set! upper i (vector-ref upper1 i1)) + (loop (fix:+ i1 1) i2 (fix:+ i 1))) + ((< (vector-ref upper2 i2) (vector-ref lower1 i1)) + (vector-set! lower i (vector-ref lower2 i2)) + (vector-set! upper i (vector-ref upper2 i2)) + (loop i1 (fix:+ i2 1) (fix:+ i 1))) + (else + (vector-set! lower i + (min (vector-ref lower1 i1) + (vector-ref lower2 i2))) + (vector-set! upper i + (max (vector-ref upper1 i1) + (vector-ref upper2 i2))) + (loop (fix:+ i1 1) (fix:+ i2 1) (fix:+ i 1))))))) + (if (fix:< n (vector-length lower)) + (values (vector-head lower n) (vector-head upper n)) + (values lower upper)))))) + +(define (alphabet- a1 a2) + (call-with-values + (lambda () + (alphabet-high- (alphabet-high1 a1) + (alphabet-high2 a1) + (alphabet-high1 a2) + (alphabet-high2 a2))) + (lambda (high1 high2) + (make-alphabet (alphabet-low- (alphabet-low a1) (alphabet-low a2)) + high1 + high2)))) + +(define (alphabet-low- low1 low2) + (let ((low (make-alphabet-low))) + (do ((i 0 (fix:+ i 1))) + ((fix:= i #x100)) + (vector-8b-set! low i + (fix:and (vector-8b-ref low1 i) + (fix:not (vector-8b-ref low2 i))))) + low)) + +(define (alphabet-high- lower1 upper1 lower2 upper2) + (let ((n1 (vector-length lower1)) + (n2 (vector-length lower2))) + (let ((lower (make-vector (fix:* n1 2))) + (upper (make-vector (fix:* n1 2)))) + (let ((n + (let loop ((i1 0) (i2 0) (i 0)) + (cond ((fix:= i1 n1) + i) + ((fix:= i2 n2) + (subvector-move-left! lower1 i1 n1 lower i) + (subvector-move-left! upper1 i1 n1 upper i) + (fix:+ i (fix:- n1 i1))) + ((< (vector-ref upper1 i1) (vector-ref lower2 i2)) + (vector-set! lower i (vector-ref lower1 i1)) + (vector-set! upper i (vector-ref upper1 i1)) + (loop (fix:+ i1 1) i2 (fix:+ i 1))) + ((< (vector-ref upper2 i2) (vector-ref lower1 i1)) + (loop i1 (fix:+ i2 1) i)) + ((< (vector-ref lower1 i1) (vector-ref lower2 i2)) + (vector-set! lower i (vector-ref lower1 i1)) + (vector-set! upper i (- (vector-ref lower2 i2) 1)) + (if (<= (vector-ref upper1 i1) (vector-ref upper2 i2)) + (loop (fix:+ i1 1) (fix:+ i2 1) (fix:+ i 1)) + (begin + (vector-set! lower (fix:+ i 1) + (+ (vector-ref upper2 i2) 1)) + (vector-set! upper (fix:+ i 1) + (vector-ref upper1 i1)) + (loop (fix:+ i1 1) (fix:+ i2 1) (fix:+ i 2))))) + ((<= (vector-ref upper1 i1) (vector-ref upper2 i2)) + (loop (fix:+ i1 1) (fix:+ i2 1) i)) + (else + (vector-set! lower i (+ (vector-ref upper2 i2) 1)) + (vector-set! upper i (vector-ref upper1 i1)) + (loop (fix:+ i1 1) (fix:+ i2 1) (fix:+ i 1))))))) + (if (fix:< n (vector-length lower)) + (values (vector-head lower n) (vector-head upper n)) + (values lower upper)))))) + +(define (code-point->utf8-string n) + + (define-integrable (initial-char n offset) + (fix:or (fix:and (fix:lsh #xFF (fix:+ n 1)) #xFF) + (fix:lsh n (fix:- 0 offset)))) + + (define-integrable (subsequent-char offset) + (fix:or #x80 + (fix:and (fix:lsh n (fix:- 0 offset)) #x3F))) + + (if (not (unicode-code-point? n)) + (error:wrong-type-argument n "unicode code point" + 'CODE-POINT->UTF8-STRING)) + (cond ((< n #x00000080) + (let ((s (make-string 1))) + (vector-8b-set! s 0 n) + s)) + ((< n #x00000800) + (let ((s (make-string 2))) + (vector-8b-set! s 0 (initial-char 5 6)) + (vector-8b-set! s 1 (subsequent-char 6)) + s)) + ((< n #x00010000) + (let ((s (make-string 3))) + (vector-8b-set! s 0 (initial-char 4 12)) + (vector-8b-set! s 1 (subsequent-char 12)) + (vector-8b-set! s 2 (subsequent-char 6)) + s)) + ((< n #x00200000) + (let ((s (make-string 4))) + (vector-8b-set! s 0 (initial-char 3 18)) + (vector-8b-set! s 1 (subsequent-char 18)) + (vector-8b-set! s 2 (subsequent-char 12)) + (vector-8b-set! s 3 (subsequent-char 6)) + s)) + ((< n #x04000000) + (let ((s (make-string 5))) + (vector-8b-set! s 0 (initial-char 2 24)) + (vector-8b-set! s 1 (subsequent-char 24)) + (vector-8b-set! s 2 (subsequent-char 18)) + (vector-8b-set! s 3 (subsequent-char 12)) + (vector-8b-set! s 4 (subsequent-char 6)) + s)) + (else + (let ((s (make-string 6))) + (vector-8b-set! s 0 (initial-char 1 30)) + (vector-8b-set! s 1 (subsequent-char 30)) + (vector-8b-set! s 2 (subsequent-char 24)) + (vector-8b-set! s 3 (subsequent-char 18)) + (vector-8b-set! s 4 (subsequent-char 12)) + (vector-8b-set! s 5 (subsequent-char 6)) + s)))) + +(define (utf8-string->code-point string) + + (define-integrable (test2 index) + (and (fix:<= #x80 (vector-8b-ref string index)) + (fix:< (vector-8b-ref string index) #xC0))) + + (define-integrable (get2 index) + (fix:and (vector-8b-ref string index) #x3F)) + + (or (cond ((fix:= (string-length string) 0) + #f) + ((fix:< (vector-8b-ref string 0) #x80) + (and (fix:= (string-length string) 1) + (vector-8b-ref string 0))) + ((fix:< (vector-8b-ref string 0) #xE0) + (and (fix:= (string-length string) 2) + (test2 1) + (fix:or (fix:lsh (fix:and (vector-8b-ref string 0) #x1F) 6) + (get2 1)))) + ((fix:< (vector-8b-ref string 0) #xF0) + (and (fix:= (string-length string) 3) + (test2 1) + (test2 2) + (fix:or (fix:lsh (fix:and (vector-8b-ref string 0) #x0F) 12) + (fix:or (fix:lsh (get2 1) 6) + (get2 2))))) + ((fix:< (vector-8b-ref string 0) #xF8) + (and (fix:= (string-length string) 4) + (test2 1) + (test2 2) + (test2 3) + (fix:or (fix:lsh (fix:and (vector-8b-ref string 0) #x07) 18) + (fix:or (fix:lsh (get2 1) 12) + (fix:or (fix:lsh (get2 2) 6) + (get2 3)))))) + ((fix:< (vector-8b-ref string 0) #xFC) + (and (fix:= (string-length string) 5) + (test2 1) + (test2 2) + (test2 3) + (test2 4) + (+ (* (fix:and (vector-8b-ref string 0) #x03) #x01000000) + (fix:or (fix:lsh (get2 1) 18) + (fix:lsh (get2 2) 12)) + (fix:or (fix:lsh (get2 3) 6) + (get2 4))))) + ((fix:< (vector-8b-ref string 0) #xFE) + (and (fix:= (string-length string) 6) + (test2 1) + (test2 2) + (test2 3) + (test2 4) + (test2 5) + (+ (* (fix:and (vector-8b-ref string 0) #x01) #x40000000) + (* (get2 1) #x01000000) + (fix:or (fix:lsh (get2 2) 18) + (fix:lsh (get2 3) 12)) + (fix:or (fix:lsh (get2 4) 6) + (get2 5))))) + (else #f)) + (error:wrong-type-argument string "UTF-8 character" + 'UTF8-STRING->CODE-POINT))) + +(define (match-utf8-char-in-alphabet buffer alphabet) + (let ((p (get-parser-buffer-pointer buffer))) + (let ((n (read-utf8-code-point buffer p))) + (and n + (if (code-point-in-alphabet? n alphabet) + #t + (begin + (set-parser-buffer-pointer! buffer p) + #f)))))) + +(define (read-utf8-code-point buffer p) + (let ((c0 (read-parser-buffer-char buffer)) + (get-next + (lambda () + (let ((c (read-parser-buffer-char buffer))) + (if (and c + (fix:<= #x80 (char->integer c)) + (fix:< (char->integer c) #xC0)) + (fix:and (char->integer c) #x3F) + (begin + (set-parser-buffer-pointer! buffer p) + #f)))))) + (and c0 + (cond ((fix:< (char->integer c0) #x80) + (char->integer c0)) + ((fix:< (char->integer c0) #xE0) + (let ((n1 (get-next))) + (and n1 + (fix:or (fix:lsh (fix:and (char->integer c0) #x1F) 6) + n1)))) + ((fix:< (char->integer c0) #xF0) + (let* ((n1 (get-next)) + (n2 (get-next))) + (and n1 n2 + (fix:or (fix:lsh (fix:and (char->integer c0) #x0F) 12) + (fix:or (fix:lsh n1 6) + n2))))) + ((fix:< (char->integer c0) #xF8) + (let* ((n1 (get-next)) + (n2 (get-next)) + (n3 (get-next))) + (and n1 n2 n3 + (fix:or (fix:lsh (fix:and (char->integer c0) #x07) 18) + (fix:or (fix:lsh n1 12) + (fix:or (fix:lsh n2 6) + n3)))))) + ((fix:< (char->integer c0) #xFC) + (let* ((n1 (get-next)) + (n2 (get-next)) + (n3 (get-next)) + (n4 (get-next))) + (and n1 n2 n3 n4 + (+ (* (fix:and (char->integer c0) #x03) #x01000000) + (fix:or (fix:lsh n1 18) + (fix:lsh n2 12)) + (fix:or (fix:lsh n3 6) + n4))))) + ((fix:< (char->integer c0) #xFE) + (let* ((n1 (get-next)) + (n2 (get-next)) + (n3 (get-next)) + (n4 (get-next)) + (n5 (get-next))) + (and n1 n2 n3 n4 n5 + (+ (* (fix:and (char->integer c0) #x01) #x40000000) + (* n1 #x01000000) + (fix:or (fix:lsh n2 18) + (fix:lsh n3 12)) + (fix:or (fix:lsh n4 6) + n5))))) + (else + (set-parser-buffer-pointer! buffer p) + #f))))) \ No newline at end of file