Implement "computed" character sets.
authorChris Hanson <org/chris-hanson/cph>
Wed, 8 Feb 2017 08:21:45 +0000 (00:21 -0800)
committerChris Hanson <org/chris-hanson/cph>
Wed, 8 Feb 2017 08:21:45 +0000 (00:21 -0800)
Also define Unicode symbol characters.

src/runtime/char.scm
src/runtime/chrset.scm
src/runtime/runtime.pkg

index 860a65819a57e7df0a80d5a83ff2a14760427a18..6943d2fe68e1f753c8a88821457eb319548facdf 100644 (file)
@@ -397,6 +397,10 @@ USA.
   (guarantee unicode-char? char 'char-general-category)
   (ucd-gc-value (char->integer char)))
 
+(define (unicode-code-point-general-category cp)
+  (guarantee unicode-code-point? cp 'unicode-code-point-general-category)
+  (ucd-gc-value cp))
+
 (define-integrable (utf16-surrogate? cp)
   (fix:= #xD800 (fix:and #xF800 cp)))
 
index d03aedc0f79c809af16b9fe8df86b4a150c98a88..e2db46da6b026cf9ee0fbdbf5924dee64658fc72 100644 (file)
@@ -283,6 +283,43 @@ USA.
                               (cdr ranges))))))
        (values low '()))))
 \f
+(define (compute-char-set procedure)
+  (%make-char-set (%compute-low procedure)
+                 (%code-points->high (%compute-high-ranges procedure))))
+
+(define (%compute-low procedure)
+  (let ((low (%make-low 0)))
+    (do ((cp 0 (fix:+ cp 1)))
+       ((not (fix:< cp %low-limit)))
+      (if (procedure cp)
+         (%low-set! low cp)))
+    low))
+
+(define (%compute-high-ranges procedure)
+  (append! (%compute-high-ranges-1 %low-limit #xD800 procedure)
+          (%compute-high-ranges-1 #xE000 char-code-limit procedure)))
+
+(define (%compute-high-ranges-1 start end procedure)
+  (define (find-start cp ranges)
+    (if (fix:< cp end)
+       (if (procedure cp)
+           (find-end (fix:+ cp 1) cp ranges)
+           (find-start (fix:+ cp 1) ranges))
+       (done ranges)))
+
+  (define (find-end cp start ranges)
+    (if (fix:< cp end)
+       (if (procedure cp)
+           (find-end (fix:+ cp 1) start ranges)
+           (find-start (fix:+ cp 1)
+                       (cons (%make-range start cp) ranges)))
+       (done (cons (%make-range start end) ranges))))
+
+  (define (done ranges)
+    (reverse! ranges))
+
+  (find-start start '()))
+\f
 ;;;; Predicates
 
 (define (char-in-set? char char-set)
@@ -569,6 +606,45 @@ USA.
 (define-deferred char-set:wsp (char-set #\space #\tab))
 (define-deferred char-wsp? (char-set-predicate char-set:wsp))
 \f
+;;;; Scheme language:
+
+(define (symbol-constituent? sv)
+  (lambda (sv)
+    (case sv
+      ;; #\" #\# #\' #\, #\; #\\ #\` #\|
+      ((#x22 #x23 #x27 #x2c #x3b #x5c #x60 #x7c) #f)
+      ((#x200C #x200D) #t)
+      (else
+       (case (unicode-code-point-general-category sv)
+        ((letter:uppercase
+          letter:lowercase
+          letter:titlecase
+          letter:modifier
+          letter:other
+          mark:nonspacing
+          number:letter
+          number:other
+          punctuation:connector
+          punctuation:dash
+          punctuation:other
+          symbol:math
+          symbol:currency
+          symbol:modifier
+          symbol:other
+          other:private-use)
+         #t)
+        ((mark:spacing-combining
+          mark:enclosing
+          number:decimal-digit)
+         'subsequent-only)
+        (else #f))))))
+
+(define-deferred char-set:symbol-constituent
+  (compute-char-set symbol-constituent?))
+
+(define-deferred char-set:symbol-initial
+  (compute-char-set (lambda (sv) (eq? #t (symbol-constituent? sv)))))
+\f
 ;;;; Backwards compatibility
 
 (define (char-set-member? char-set char)
index 534d8651754f6935c49d60e1130b4e96423182e5..48877084632b847f88d7073d657ba17e2957b274 100644 (file)
@@ -1393,6 +1393,7 @@ USA.
          set-char-bits
          unicode-char->scalar-value
          unicode-char?
+         unicode-code-point-general-category
          unicode-code-point?
          unicode-scalar-value->char
          unicode-scalar-value?))
@@ -1500,6 +1501,8 @@ USA.
          char-set:upper-case
          char-set:whitespace
          char-set:wsp
+         char-set:symbol-constituent
+         char-set:symbol-initial
          char-set=?
          char-set?
          char-standard?
@@ -1508,6 +1511,7 @@ USA.
          char-wsp?
          code-point-list?
          char-set*
+         compute-char-set
          scalar-value-in-char-set?
          string->char-set)
   (export (runtime string)