From a0f0a9052782528fe2c8fea522beb88469fd1ace Mon Sep 17 00:00:00 2001 From: Chris Hanson Date: Wed, 8 Feb 2017 00:21:45 -0800 Subject: [PATCH] Implement "computed" character sets. Also define Unicode symbol characters. --- src/runtime/char.scm | 4 +++ src/runtime/chrset.scm | 76 +++++++++++++++++++++++++++++++++++++++++ src/runtime/runtime.pkg | 4 +++ 3 files changed, 84 insertions(+) diff --git a/src/runtime/char.scm b/src/runtime/char.scm index 860a65819..6943d2fe6 100644 --- a/src/runtime/char.scm +++ b/src/runtime/char.scm @@ -397,6 +397,10 @@ USA. (guarantee unicode-char? char 'char-general-category) (ucd-gc-value (char->integer char))) +(define (unicode-code-point-general-category cp) + (guarantee unicode-code-point? cp 'unicode-code-point-general-category) + (ucd-gc-value cp)) + (define-integrable (utf16-surrogate? cp) (fix:= #xD800 (fix:and #xF800 cp))) diff --git a/src/runtime/chrset.scm b/src/runtime/chrset.scm index d03aedc0f..e2db46da6 100644 --- a/src/runtime/chrset.scm +++ b/src/runtime/chrset.scm @@ -283,6 +283,43 @@ USA. (cdr ranges)))))) (values low '())))) +(define (compute-char-set procedure) + (%make-char-set (%compute-low procedure) + (%code-points->high (%compute-high-ranges procedure)))) + +(define (%compute-low procedure) + (let ((low (%make-low 0))) + (do ((cp 0 (fix:+ cp 1))) + ((not (fix:< cp %low-limit))) + (if (procedure cp) + (%low-set! low cp))) + low)) + +(define (%compute-high-ranges procedure) + (append! (%compute-high-ranges-1 %low-limit #xD800 procedure) + (%compute-high-ranges-1 #xE000 char-code-limit procedure))) + +(define (%compute-high-ranges-1 start end procedure) + (define (find-start cp ranges) + (if (fix:< cp end) + (if (procedure cp) + (find-end (fix:+ cp 1) cp ranges) + (find-start (fix:+ cp 1) ranges)) + (done ranges))) + + (define (find-end cp start ranges) + (if (fix:< cp end) + (if (procedure cp) + (find-end (fix:+ cp 1) start ranges) + (find-start (fix:+ cp 1) + (cons (%make-range start cp) ranges))) + (done (cons (%make-range start end) ranges)))) + + (define (done ranges) + (reverse! ranges)) + + (find-start start '())) + ;;;; Predicates (define (char-in-set? char char-set) @@ -569,6 +606,45 @@ USA. (define-deferred char-set:wsp (char-set #\space #\tab)) (define-deferred char-wsp? (char-set-predicate char-set:wsp)) +;;;; Scheme language: + +(define (symbol-constituent? sv) + (lambda (sv) + (case sv + ;; #\" #\# #\' #\, #\; #\\ #\` #\| + ((#x22 #x23 #x27 #x2c #x3b #x5c #x60 #x7c) #f) + ((#x200C #x200D) #t) + (else + (case (unicode-code-point-general-category sv) + ((letter:uppercase + letter:lowercase + letter:titlecase + letter:modifier + letter:other + mark:nonspacing + number:letter + number:other + punctuation:connector + punctuation:dash + punctuation:other + symbol:math + symbol:currency + symbol:modifier + symbol:other + other:private-use) + #t) + ((mark:spacing-combining + mark:enclosing + number:decimal-digit) + 'subsequent-only) + (else #f)))))) + +(define-deferred char-set:symbol-constituent + (compute-char-set symbol-constituent?)) + +(define-deferred char-set:symbol-initial + (compute-char-set (lambda (sv) (eq? #t (symbol-constituent? sv))))) + ;;;; Backwards compatibility (define (char-set-member? char-set char) diff --git a/src/runtime/runtime.pkg b/src/runtime/runtime.pkg index 534d86517..488770846 100644 --- a/src/runtime/runtime.pkg +++ b/src/runtime/runtime.pkg @@ -1393,6 +1393,7 @@ USA. set-char-bits unicode-char->scalar-value unicode-char? + unicode-code-point-general-category unicode-code-point? unicode-scalar-value->char unicode-scalar-value?)) @@ -1500,6 +1501,8 @@ USA. char-set:upper-case char-set:whitespace char-set:wsp + char-set:symbol-constituent + char-set:symbol-initial char-set=? char-set? char-standard? @@ -1508,6 +1511,7 @@ USA. char-wsp? code-point-list? char-set* + compute-char-set scalar-value-in-char-set? string->char-set) (export (runtime string) -- 2.25.1