From 5c3ef4fde3d7deb8cb708f65cc93605901b67d33 Mon Sep 17 00:00:00 2001 From: Chris Hanson Date: Wed, 15 Feb 2017 22:55:36 -0800 Subject: [PATCH] Clean up the character abstraction to be more consistent. * Change unicode-char? correspond to unicode-scalar-value?. * Rename base-char? to bitless-char?. * Eliminate char-integer-limit, unicode-char-code?, and char->scalar-value. --- src/runtime/char.scm | 40 ++++++++++++------------------ src/runtime/chrset.scm | 9 +++---- src/runtime/predicate-metadata.scm | 6 ++--- src/runtime/runtime.pkg | 11 +++----- src/runtime/ucd-glue.scm | 10 ++++++++ src/sf/gconst.scm | 1 - 6 files changed, 35 insertions(+), 42 deletions(-) diff --git a/src/runtime/char.scm b/src/runtime/char.scm index 4ddd4a9a6..d02417876 100644 --- a/src/runtime/char.scm +++ b/src/runtime/char.scm @@ -36,7 +36,6 @@ USA. (define-integrable char-code-limit #x110000) (define-integrable char-bits-limit #x10) -(define-integrable char-integer-limit #x2000000) (define-guarantee char "character") @@ -54,6 +53,10 @@ USA. (define (char-bits char) (fix:lsh (char->integer char) -21)) +(define (bitless-char? object) + (and (char? object) + (fix:< (char->integer object) char-code-limit))) + (define (char-bits-set? bits char) (guarantee-limited-index-fixnum bits char-bits-limit 'CHAR-BITS-SET?) (fix:= bits (fix:and (char-bits char) bits))) @@ -348,39 +351,28 @@ USA. (define (unicode-char? object) (and (char? object) - (unicode-char-code? (char->integer object)))) - -(define (base-char? object) - (and (char? object) - (unicode-code-point? (char->integer object)))) - -(define (unicode-char-code? object) - (and (unicode-scalar-value? object) - (not (non-character? object)))) + (let ((cp (char->integer object))) + (and (fix:< object char-code-limit) + (not (utf16-surrogate? object)))))) (define (unicode-scalar-value? object) (and (unicode-code-point? object) (not (utf16-surrogate? object)))) -(define-integrable (unicode-code-point? object) +(define (unicode-code-point? object) (and (index-fixnum? object) (fix:< object char-code-limit))) (define-guarantee unicode-char "a Unicode character") (define-guarantee unicode-scalar-value "a Unicode scalar value") -(define (char->code-point char #!optional caller) - (let ((n (char->integer char))) - (guarantee unicode-code-point? n caller) - n)) - -(define (char->scalar-value char #!optional caller) +(define (%char->scalar-value char #!optional caller) (let ((n (char->integer char))) (guarantee unicode-scalar-value? n caller) n)) (define (char-general-category char) - (guarantee base-char? char 'char-general-category) + (guarantee bitless-char? char 'char-general-category) (ucd-gc-value char)) (define (code-point-general-category cp) @@ -421,14 +413,14 @@ USA. ;;;; UTF-{8,16,32} encoders (define (char-utf8-byte-length char) - (let ((sv (char->scalar-value char 'char-utf8-byte-length))) + (let ((sv (%char->scalar-value char 'char-utf8-byte-length))) (cond ((fix:< sv #x80) 1) ((fix:< sv #x800) 2) ((fix:< sv #x10000) 3) (else 4)))) (define (encode-utf8-char! bytes index char) - (let ((sv (char->scalar-value char 'encode-utf8-char!))) + (let ((sv (%char->scalar-value char 'encode-utf8-char!))) (define-integrable (initial-byte leader offset) (fix:or leader (fix:lsh sv offset))) @@ -456,13 +448,13 @@ USA. (fix:+ index 4))))) (define (char-utf16-byte-length char) - (if (fix:< (char->scalar-value char 'char-utf16-byte-length) #x10000) + (if (fix:< (%char->scalar-value char 'char-utf16-byte-length) #x10000) 2 4)) (define (utf16-char-encoder setter caller) (lambda (bytes index char) - (let ((sv (char->scalar-value char caller))) + (let ((sv (%char->scalar-value char caller))) (cond ((fix:< sv #x10000) (setter bytes index sv) (fix:+ index 2)) @@ -481,12 +473,12 @@ USA. (utf16-char-encoder bytevector-u16le-set! 'encode-utf16le-char!)) (define (char-utf32-byte-length char) - (char->scalar-value char 'char-utf32-byte-length) + (%char->scalar-value char 'char-utf32-byte-length) 4) (define (utf32-char-encoder setter caller) (lambda (bytes index char) - (setter bytes index (char->scalar-value char caller)))) + (setter bytes index (%char->scalar-value char caller)))) (define encode-utf32be-char! (utf32-char-encoder bytevector-u32be-set! 'encode-utf32be-char!)) diff --git a/src/runtime/chrset.scm b/src/runtime/chrset.scm index 56251effa..9b27294bd 100644 --- a/src/runtime/chrset.scm +++ b/src/runtime/chrset.scm @@ -284,7 +284,7 @@ USA. (define (%cpl-element->ranges elt) (cond ((%range? elt) (list elt)) - ((base-char? elt) (list (char->integer elt))) + ((bitless-char? elt) (list (char->integer elt))) ((ustring? elt) (map char->integer (ustring->list elt))) (else #f))) @@ -336,7 +336,7 @@ USA. (define (cpl-element? object) (or (%range? object) - (base-char? object) + (bitless-char? object) (ustring? object) (char-set? object))) @@ -383,7 +383,7 @@ USA. ;;;; Accessors (define (char-in-set? char char-set) - (guarantee base-char? char 'char-in-set?) + (guarantee bitless-char? char 'char-in-set?) (%code-point-in-char-set? (char->integer char) char-set)) (define (code-point-in-char-set? cp char-set) @@ -495,14 +495,11 @@ USA. (define char-set:not-graphic) (define char-set:not-standard) (define char-set:standard) -(define char-set:unicode) (define char-set:wsp) (define char-standard?) (define char-wsp?) (add-boot-init! (lambda () - (set! char-set:unicode (compute-char-set unicode-char-code?)) - (set! char-set:graphic (%signal->char-set '(#x20 #x7F #xA0 #x100))) (set! char-set:not-graphic (char-set-invert char-set:graphic)) (set! char-graphic? (char-set-predicate char-set:graphic)) diff --git a/src/runtime/predicate-metadata.scm b/src/runtime/predicate-metadata.scm index f63ed45c6..f0f6b3c13 100644 --- a/src/runtime/predicate-metadata.scm +++ b/src/runtime/predicate-metadata.scm @@ -289,8 +289,8 @@ USA. ;; MIT/GNU Scheme: misc (register-predicate! 8-bit-char? '8-bit-char '<= char?) (register-predicate! ascii-char? 'ascii-char '<= 8-bit-char?) - (register-predicate! base-char? 'base-char '<= char?) (register-predicate! bit-string? 'bit-string) + (register-predicate! bitless-char? 'bitless-char '<= char?) (register-predicate! cell? 'cell) (register-predicate! code-point-list? 'code-point-list '<= list?) (register-predicate! compiled-code-address? 'compiled-code-address) @@ -312,13 +312,11 @@ USA. (register-predicate! stack-address? 'stack-address) (register-predicate! thread-mutex? 'thread-mutex) (register-predicate! undefined-value? 'undefined-value) - (register-predicate! unicode-char? 'unicode-char '<= base-char?) + (register-predicate! unicode-char? 'unicode-char '<= bitless-char?) (register-predicate! unicode-code-point? 'unicode-code-point '<= index-fixnum?) (register-predicate! unicode-scalar-value? 'unicode-scalar-value '<= unicode-code-point?) - (register-predicate! unicode-char-code? 'unicode-char-code - '<= unicode-scalar-value?) (register-predicate! uninterned-symbol? 'uninterned-symbol '<= symbol?) (register-predicate! weak-list? 'weak-list) (register-predicate! weak-pair? 'weak-pair) diff --git a/src/runtime/runtime.pkg b/src/runtime/runtime.pkg index c640335d6..04b209a00 100644 --- a/src/runtime/runtime.pkg +++ b/src/runtime/runtime.pkg @@ -1431,12 +1431,11 @@ USA. (export () 8-bit-char? ascii-char? - base-char? + bitless-char? char-8-bit? char->digit char->integer char->name - char->scalar-value char-ascii? char-bit:control char-bit:hyper @@ -1457,7 +1456,6 @@ USA. char-downcase char-foldcase char-general-category - char-integer-limit char-upcase char<=? charchar radix? set-char-bits - unicode-char-code? - unicode-char? unicode-code-point? unicode-scalar-value?) (export (runtime) @@ -1559,7 +1555,9 @@ USA. char-set:not-numeric char-set:not-upper-case char-set:not-whitespace - char-set:numeric) + char-set:numeric + char-set:unicode + unicode-char?) (export (runtime) char-set:folded-symbol-constituent char-set:folded-symbol-initial @@ -1610,7 +1608,6 @@ USA. char-set:not-graphic char-set:not-standard char-set:standard - char-set:unicode char-set:wsp char-set=? char-set? diff --git a/src/runtime/ucd-glue.scm b/src/runtime/ucd-glue.scm index 8bfbd86f0..84a5030d8 100644 --- a/src/runtime/ucd-glue.scm +++ b/src/runtime/ucd-glue.scm @@ -60,6 +60,16 @@ USA. (define-deferred char-set:not-whitespace (char-set-invert char-set:whitespace)) + +(define-deferred char-set:unicode + (compute-char-set + (lambda (cp) + (case (code-point-general-category cp) + ((other:surrogate other:not-assigned) #f) + (else #t))))) + +(define-deferred unicode-char? + (char-set-predicate char-set:unicode)) ;;;; Scheme language: diff --git a/src/sf/gconst.scm b/src/sf/gconst.scm index 090a4323d..1d8817622 100644 --- a/src/sf/gconst.scm +++ b/src/sf/gconst.scm @@ -32,7 +32,6 @@ USA. (define global-constant-objects '(CHAR-BITS-LIMIT CHAR-CODE-LIMIT - CHAR-INTEGER-LIMIT FALSE LAMBDA-TAG:UNNAMED ;needed for cold load SYSTEM-GLOBAL-ENVIRONMENT ;suppresses warnings about (access ...) -- 2.25.1