From: Chris Hanson Date: Sun, 23 May 2010 11:50:46 +0000 (-0700) Subject: Change char-set abstraction to handle unicode. X-Git-Tag: 20100708-Gtk~62 X-Git-Url: https://birchwood-abbey.net/git?a=commitdiff_plain;h=1ee48ea8c5277e70dbaa5210206ce683f1809cee;p=mit-scheme.git Change char-set abstraction to handle unicode. --- diff --git a/src/runtime/chrset.scm b/src/runtime/chrset.scm index 88fce3702..ea168d9ac 100644 --- a/src/runtime/chrset.scm +++ b/src/runtime/chrset.scm @@ -28,252 +28,589 @@ USA. (declare (usual-integrations)) -(define-structure (char-set (type-descriptor )) +;;; The character set is stored in two parts. The LOW part is a +;;; bit-vector encoding of the scalar values below %LOW-LIMIT. The +;;; HIGH part is a sequence of scalar-value ranges, each of which has +;;; an inclusive START and an exclusive END. The ranges in the +;;; sequence are all disjoint from one another, and no two ranges are +;;; adjacent. These ranges are sorted so that their STARTs are in +;;; order. +;;; +;;; The HIGH range sequence is implemented as a pair of vectors, one +;;; for the STARTs and one for the ENDs. The two vectors have the +;;; same length. +;;; +;;; For simplicity, character sets are allowed to contain ranges that +;;; contain illegal scalar values. However, CHAR-SET-MEMBER? doesn't +;;; accept illegal characters. + +(define-structure (char-set (type-descriptor ) + (constructor %%make-char-set)) + (low #f read-only #t) + (high-starts #f read-only #t) + (high-ends #f read-only #t) + ;; Backwards compatibility: (table #f read-only #t)) -(define-integrable (guarantee-char-set object procedure) - (if (not (char-set? object)) - (error:wrong-type-argument object "character set" procedure))) +(define-guarantee char-set "character set") -(define-integrable char-set-table-length 256) - -(define (char-set . chars) - (chars->char-set chars)) - -(define (chars->char-set chars) - (if (not (list-of-type? chars - (lambda (char) - (and (char? char) - (fix:< (char->integer char) char-set-table-length))))) - (error:wrong-type-argument chars "ASCII chars" 'CHARS->CHAR-SET)) - (let ((table (make-string char-set-table-length))) - (vector-8b-fill! table 0 char-set-table-length 0) - (do ((chars chars (cdr chars))) - ((not (pair? chars))) - (vector-8b-set! table (char->integer (car chars)) 1)) - (make-char-set table))) - -(define (string->char-set string) - (guarantee-string string 'STRING->CHAR-SET) - (let ((n-chars (string-length string)) - (table (make-string char-set-table-length))) - (vector-8b-fill! table 0 char-set-table-length 0) - (do ((i 0 (fix:+ i 1))) - ((fix:= i n-chars)) - (vector-8b-set! table (vector-8b-ref string i) 1)) - (make-char-set table))) - -(define (ascii-range->char-set lower upper) - (if (not (index-fixnum? lower)) - (error:wrong-type-argument lower "index fixnum" 'ASCII-RANGE->CHAR-SET)) - (if (not (index-fixnum? upper)) - (error:wrong-type-argument upper "index fixnum" 'ASCII-RANGE->CHAR-SET)) - (if (not (fix:<= lower upper)) - (error:bad-range-argument lower 'ASCII-RANGE->CHAR-SET)) - (if (not (fix:<= upper char-set-table-length)) - (error:bad-range-argument upper 'ASCII-RANGE->CHAR-SET)) - (let ((table (make-string char-set-table-length))) - (vector-8b-fill! table 0 lower 0) - (vector-8b-fill! table lower upper 1) - (vector-8b-fill! table upper char-set-table-length 0) - (make-char-set table))) +(define (guarantee-char-sets char-sets #!optional caller) + (for-each (lambda (char-set) (guarantee-char-set char-set caller)) + char-sets)) -(define (predicate->char-set predicate) - (let ((table (make-string char-set-table-length))) - (do ((code 0 (fix:+ code 1))) - ((fix:= code char-set-table-length)) - (vector-8b-set! table code (if (predicate (integer->char code)) 1 0))) - (make-char-set table))) - -(define (char-set=? c1 c2) - (guarantee-char-set c1 'CHAR-SET=?) - (guarantee-char-set c2 'CHAR-SET=?) - (string=? (char-set-table c1) (char-set-table c2))) +(define (%make-char-set low high-starts high-ends) + (%%make-char-set low high-starts high-ends + (let ((table (make-vector-8b #x100))) + (do ((i 0 (fix:+ i 1))) + ((not (fix:< i #x100))) + (vector-8b-set! table i (%low-ref low char-set))) + table))) + +(define-integrable %low-length #x100) +(define-integrable %low-limit #x800) + +(define (%make-low #!optional fill-value) + (make-vector-8b %low-length fill-value)) + +(define (%low-ref low scalar-value) + (not (fix:= (fix:and (vector-8b-ref low (fix:lsh scalar-value -3)) + (fix:lsh 1 (fix:and scalar-value 7))) + 0))) + +(define (%low-set! low scalar-value) + (vector-8b-set! low + (fix:lsh scalar-value -3) + (fix:or (vector-8b-ref low (fix:lsh scalar-value -3)) + (fix:lsh 1 (fix:and scalar-value 7))))) + +(define %null-char-set + (%make-char-set (%make-low 0) '#() '#())) -(define (char-set-members char-set) - (guarantee-char-set char-set 'CHAR-SET-MEMBERS) - (let ((table (char-set-table char-set))) - (let loop ((code (fix:- char-set-table-length 1)) (chars '())) - (if (fix:= code 0) - (if (fix:= 0 (vector-8b-ref table code)) - chars - (cons (integer->char code) chars)) - (loop (fix:- code 1) - (if (fix:= 0 (vector-8b-ref table code)) - chars - (cons (integer->char code) chars))))))) +;;;; Conversion to and from scalar-values list + +(define (well-formed-scalar-value-list? ranges) + (list-of-type? ranges well-formed-scalar-value-range?)) + +(define (well-formed-scalar-value-range? range) + (if (pair? range) + (and (index-fixnum? (car range)) + (index-fixnum? (cdr range)) + (fix:< (car range) (cdr range)) + (fix:<= (cdr range) char-code-limit)) + (and (index-fixnum? range) + (fix:< range char-code-limit)))) + +(define-guarantee well-formed-scalar-value-list "a Unicode scalar-value list") +(define-guarantee well-formed-scalar-value-range "a Unicode scalar-value range") + +(define (%make-range start end) + (if (fix:= (fix:- end start) 1) + start + (cons start end))) + +(define (%range-start range) + (if (pair? range) + (car range) + range)) + +(define (%range-end range) + (if (pair? range) + (car range) + (fix:+ range 1))) + +(define (char-set->scalar-values char-set) + (guarantee-char-set char-set 'CHAR-SET->SCALAR-VALUES) + (reverse! + (%high->scalar-values (char-set-high-starts char-set) + (char-set-high-ends char-set) + (%low->scalar-values (char-set-low char-set) '())))) + +(define (%low->scalar-values low result) + + (define (find-start i result) + (if (fix:< i %low-limit) + (if (%low-ref low i) + (find-end i result) + (find-start (fix:+ i 1) result)) + result)) + + (define (find-end start result) + (let loop ((i (fix:+ start 1))) + (if (fix:< i %low-limit) + (if (%low-ref low i) + (loop (fix:+ i 1)) + (find-start i (cons (%make-range start i) result))) + (cons (%make-range start i) result)))) + + (find-start 0 result)) + +(define (%high->scalar-values starts ends result) + (let ((n (vector-length starts))) + (let loop ((i 0) (result result)) + (if (fix:< i n) + (loop (fix:+ i 1) + (cons (%make-range (vector-ref starts i) + (vector-ref ends i)) + result)) + result)))) + +(define (scalar-values->char-set ranges) + (guarantee-well-formed-scalar-value-list ranges 'SCALAR-VALUES->CHAR-SET) + (%scalar-values->char-set ranges)) + +(define (%scalar-values->char-set ranges) + (receive (low-ranges high-ranges) + (%canonicalize-scalar-value-list ranges) + (receive (high-starts high-ends) + (%scalar-values->high high-ranges) + (%make-char-set (%scalar-values->low low-ranges) + high-starts + high-ends)))) + +(define (%scalar-values->low ranges) + (let ((low (%make-low 0))) + (for-each (lambda (range) + (let ((end (%range-end range))) + (do ((i (%range-start range) (fix:+ i 1))) + ((fix:> i end)) + (%low-set! low i)))) + ranges) + low)) + +(define (%scalar-values->high ranges) + (let ((n-high (length ranges))) + (let ((high-starts (make-vector n-high)) + (high-ends (make-vector n-high))) + (do ((ranges ranges (cdr ranges)) + (i 0 (fix:+ i 1))) + ((not (pair? ranges))) + (vector-set! high-starts i (%range-start (car ranges))) + (vector-set! high-ends i (%range-end (car ranges)))) + (values high-starts high-ends)))) + +(define (%canonicalize-scalar-value-list ranges) + ;; Sort ranges in order, then merge adjacent ranges. + (%split-ranges + (if (pair? ranges) + (let ((ranges (sort ranges %range= (%range-start range) %low-limit) + (values low ranges)) + (else + (values (cons (%make-range (%range-start range) %low-limit) + low) + (cons (%make-range %low-limit (%range-end range)) + (cdr ranges)))))) + (values low '())))) + +;;;; Predicates (define (char-set-member? char-set char) (guarantee-char-set char-set 'CHAR-SET-MEMBER?) - (guarantee-char char 'CHAR-SET-MEMBER?) - (%char-set-member? char-set char)) - -(define (%char-set-member? char-set char) - (and (fix:< (char->integer char) char-set-table-length) - (not (fix:= 0 - (vector-8b-ref (char-set-table char-set) - (char->integer char)))))) + (guarantee-unicode-char char 'CHAR-SET-MEMBER?) + (%scalar-value-in-char-set? (char-code char) char-set)) + +(define (%scalar-value-in-char-set? value char-set) + (if (fix:< value %low-limit) + (%low-ref (char-set-low char-set) value) + (let ((high-starts (char-set-high-starts char-set)) + (high-ends (char-set-high-ends char-set))) + (let loop ((lower 0) (upper (vector-length high-starts))) + (if (fix:< lower upper) + (let ((index (fix:quotient (fix:+ lower upper) 2))) + (cond ((fix:< value (vector-ref high-starts index)) + (loop lower index)) + ((fix:>= value (vector-ref high-ends index)) + (loop (fix:+ index 1) upper)) + (else #t))) + #f))))) + +(define (char-set=? char-set . char-sets) + (guarantee-char-set char-set 'CHAR-SET=?) + (guarantee-char-sets char-sets 'CHAR-SET=?) + (every (lambda (char-set*) + (%=? char-set* char-set)) + char-sets)) + +(define (%=? c1 c2) + (and (%=?-low (char-set-low c1) (char-set-low c2)) + (%=?-high (char-set-high-starts c1) (char-set-high-starts c2)) + (%=?-high (char-set-high-ends c1) (char-set-high-ends c2)))) + +(define (%=?-low l1 l2) + (let loop ((i 0)) + (if (fix:< i %low-length) + (and (fix:= (vector-8b-ref l1 i) (vector-8b-ref l2 i)) + (loop (fix:+ i 1))) + #t))) + +(define (%=?-high h1 h2) + (let ((end (vector-length h1))) + (and (fix:= end (vector-length h2)) + (let loop ((i 0)) + (if (fix:< i end) + (and (fix:= (vector-ref h1 i) (vector-ref h2 i)) + (loop (fix:+ i 1))) + #t))))) + +;;;; 8-bit character sets + +(define (8-bit-char-set? char-set) + (and (char-set? char-set) + (fix:= (vector-length (char-set-high-starts char-set)) 0) + (let ((low (char-set-low char-set))) + (let loop ((i #x20)) + (or (fix:= i %low-length) + (and (fix:= (vector-8b-ref low i) 0) + (loop (fix:+ i 1)))))))) + +(define-guarantee 8-bit-char-set "an 8-bit char-set") + +;;;; Mapping operations (define (char-set-invert char-set) (guarantee-char-set char-set 'CHAR-SET-INVERT) - (predicate->char-set - (lambda (char) - (not (%char-set-member? char-set char))))) - + (%invert char-set)) + +(define-deferred %invert + (%split-map-1 (%low-unary fix:not) + %invert-high)) + +(define (%invert-high starts1 ends1) + (let ((n1 (vector-length starts1))) + + (define (go n i1 prev-end) + (let ((starts (make-vector n)) + (ends (make-vector n))) + (let loop ((i1 i1) (i 0) (prev-end prev-end)) + (if (fix:< i1 n1) + (loop (fix:+ i1 1) + (%high-copy-1 (vector-ref starts1 i1) + (vector-ref ends1 i1) + starts ends i)) + (%high-copy-1 prev-end char-code-limit + starts ends i))) + (values starts ends))) + + (if (and (fix:> n1 0) + (fix:= (vector-ref starts1 0) %low-limit)) + (go n1 1 (vector-ref ends1 0)) + (go (fix:+ n1 1) 0 %low-limit)))) + (define (char-set-union . char-sets) (guarantee-char-sets char-sets 'CHAR-SET-UNION) - (predicate->char-set - (lambda (char) - (there-exists? char-sets - (lambda (char-set) - (%char-set-member? char-set char)))))) + (reduce %union %null-char-set char-sets)) + +(define-deferred %union + (%split-map-2 (%low-binary fix:or) + (%high-binary %high-copy-n %high-copy-n + %high-copy-1 %high-copy-1 + (lambda (start1 end1 start2 end2 starts ends i) + (%high-copy-1 (fix:min start1 start2) + (fix:max end1 end2) + starts ends i))))) (define (char-set-intersection . char-sets) (guarantee-char-sets char-sets 'CHAR-SET-INTERSECTION) - (predicate->char-set - (lambda (char) - (for-all? char-sets - (lambda (char-set) - (%char-set-member? char-set char)))))) - -(define (guarantee-char-sets char-sets procedure) - (for-each (lambda (char-set) (guarantee-char-set char-set procedure)) - char-sets)) - -(define (char-set-difference include exclude) - (guarantee-char-set include 'CHAR-SET-DIFFERENCE) - (guarantee-char-set exclude 'CHAR-SET-DIFFERENCE) - (predicate->char-set - (lambda (char) - (and (%char-set-member? include char) - (not (%char-set-member? exclude char)))))) + (reduce %intersection %null-char-set char-sets)) + +(define-deferred %intersection + (%split-map-2 (%low-binary fix:and) + (%high-binary %high-drop-n %high-drop-n + %high-drop-1 %high-drop-1 + (lambda (start1 end1 start2 end2 starts ends i) + (%high-copy-1 (fix:max start1 start2) + (fix:min end1 end2) + starts ends i))))) + +(define (char-set-difference char-set . char-sets) + (guarantee-char-set char-set 'CHAR-SET-DIFFERENCE) + (guarantee-char-sets char-sets 'CHAR-SET-DIFFERENCE) + (fold-left %difference char-set char-sets)) + +(define-deferred %difference + (%split-map-2 (%low-binary fix:andc) + (%high-binary %high-drop-n %high-copy-n + %high-drop-1 %high-copy-1 + (lambda (start1 end1 start2 end2 starts ends i) + + (define (shave-head i start1 start2) + (if (fix:< start1 start2) + (%high-copy-1 start1 start2 + starts ends i) + i)) + + (define (shave-tail i end1 end2) + (if (fix:< end2 end1) + (%high-copy-1 end2 end1 + starts ends i) + i)) + (shave-tail (shave-head i start1 start2) + end1 + end2))))) -;;;; System Character Sets - -(define char-set:upper-case) -(define char-set:lower-case) -(define char-set:numeric) -(define char-set:graphic) -(define char-set:whitespace) -(define char-set:alphabetic) -(define char-set:alphanumeric) -(define char-set:standard) -(define char-set:newline) - -(define char-set:not-upper-case) -(define char-set:not-lower-case) -(define char-set:not-numeric) -(define char-set:not-graphic) -(define char-set:not-whitespace) -(define char-set:not-alphabetic) -(define char-set:not-alphanumeric) -(define char-set:not-standard) - -;;; Used in RFCs: -(define char-set:ascii) -(define char-set:ctls) -(define char-set:wsp) - -(define (initialize-package!) - (set! char-set:upper-case - (char-set-union (ascii-range->char-set #x41 #x5B) - (ascii-range->char-set #xC0 #xD7) - (ascii-range->char-set #xD8 #xDE))) - (set! char-set:lower-case - (char-set-union (ascii-range->char-set #x61 #x7B) - (ascii-range->char-set #xE0 #xF7) - (ascii-range->char-set #xF8 #xFF))) - (set! char-set:numeric (ascii-range->char-set #x30 #x3A)) - (set! char-set:graphic - (char-set-union (ascii-range->char-set #x20 #x7F) - (ascii-range->char-set #xA0 #x100))) - (set! char-set:whitespace - (char-set #\newline #\tab #\linefeed #\page #\return #\space - (integer->char #xA0))) - (set! char-set:alphabetic - (char-set-union char-set:upper-case char-set:lower-case)) - (set! char-set:alphanumeric - (char-set-union char-set:alphabetic char-set:numeric)) - (set! char-set:standard - (char-set-union char-set:graphic (char-set #\newline))) - (set! char-set:newline (char-set #\newline)) - - (set! char-set:not-upper-case (char-set-invert char-set:upper-case)) - (set! char-set:not-lower-case (char-set-invert char-set:lower-case)) - (set! char-set:not-numeric (char-set-invert char-set:numeric)) - (set! char-set:not-graphic (char-set-invert char-set:graphic)) - (set! char-set:not-whitespace (char-set-invert char-set:whitespace)) - (set! char-set:not-alphabetic (char-set-invert char-set:alphabetic)) - (set! char-set:not-alphanumeric (char-set-invert char-set:alphanumeric)) - (set! char-set:not-standard (char-set-invert char-set:standard)) - - (set! char-set:ascii (ascii-range->char-set #x00 #x80)) - (set! char-set:ctls - (char-set-union (ascii-range->char-set #x00 #x20) - (ascii-range->char-set #x7F #x80))) - (set! char-set:wsp (char-set #\space #\tab)) - unspecific) +;;;; Support for mapping operations + +(define (%split-map-1 %map-low %map-high) + (lambda (c1) + (receive (high-starts high-ends) + (%map-high (char-set-high-starts c1) + (char-set-high-ends c1)) + (%make-char-set (%map-low (char-set-low c1)) + high-starts + high-ends)))) + +(define (%split-map-2 %map-low %map-high) + (lambda (c1 c2) + (receive (high-starts high-ends) + (%map-high (char-set-high-starts c1) + (char-set-high-ends c1) + (char-set-high-starts c2) + (char-set-high-ends c2)) + (%make-char-set (%map-low (char-set-low c1) + (char-set-low c2)) + high-starts + high-ends)))) + +(define (%low-unary operation) + (lambda (low1) + (let ((low* (%make-low))) + (do ((i 0 (fix:+ i 1))) + ((fix:= i %low-length)) + (vector-8b-set! low* i + (operation (vector-8b-ref low1 i)))) + low*))) + +(define (%low-binary operation) + (lambda (low1 low2) + (let ((low (%make-low))) + (do ((i 0 (fix:+ i 1))) + ((fix:= i %low-length)) + (vector-8b-set! low i + (operation (vector-8b-ref low1 i) + (vector-8b-ref low2 i)))) + low))) -(define (char-upper-case? char) - (guarantee-char char 'CHAR-UPPER-CASE?) - (%char-upper-case? char)) - -(define-integrable (%char-upper-case? char) - (%char-set-member? char-set:upper-case char)) - -(define (char-lower-case? char) - (guarantee-char char 'CHAR-LOWER-CASE?) - (%char-lower-case? char)) - -(define-integrable (%char-lower-case? char) - (%char-set-member? char-set:lower-case char)) - -(define (char-numeric? char) - (guarantee-char char 'CHAR-NUMERIC?) - (%char-numeric? char)) - -(define-integrable (%char-numeric? char) - (%char-set-member? char-set:numeric char)) - -(define (char-graphic? char) - (guarantee-char char 'CHAR-GRAPHIC?) - (%char-graphic? char)) - -(define-integrable (%char-graphic? char) - (%char-set-member? char-set:graphic char)) +(define (%high-binary empty-left empty-right + disjoint-left disjoint-right + overlap) + (lambda (starts1 ends1 starts2 ends2) + (let ((n1 (vector-length starts1)) + (n2 (vector-length starts2))) + (let ((starts (make-vector (fix:+ n1 n2))) + (ends (make-vector (fix:+ n1 n2)))) + (let ((n + (let loop ((i1 0) (i2 0) (i 0)) + (cond ((fix:>= i1 n1) + (empty-left starts2 ends2 i2 n2 + starts ends i)) + ((fix:>= i2 n2) + (empty-right starts1 ends1 i1 n1 + starts ends i)) + (else + (let ((start1 (vector-ref starts1 i1)) + (end1 (vector-ref ends1 i1)) + (start2 (vector-ref starts2 i2)) + (end2 (vector-ref ends2 i2))) + (cond ((fix:< end1 start2) + (loop (fix:+ i1 1) + i2 + (disjoint-left start1 end1 + starts ends i))) + ((fix:< end2 start1) + (loop i1 + (fix:+ i2 1) + (disjoint-right start2 end2 + starts ends i))) + (else + (loop (fix:+ i1 1) + (fix:+ i2 1) + (overlap start1 end1 + start2 end2 + starts ends i)))))))))) + (values (vector-head! starts n) + (vector-head! ends n))))))) + +(define (%high-copy-n starts1 ends1 i1 n1 starts ends i) + (subvector-move-left! starts1 i1 n1 starts i) + (subvector-move-left! ends1 i1 n1 ends i) + (fix:+ i (fix:- n1 i1))) + +(define (%high-drop-n starts1 ends1 i1 n1 starts ends i) + starts1 ends1 i1 n1 starts ends + i) + +(define (%high-copy-1 start1 end1 starts ends i) + (vector-set! starts i start1) + (vector-set! ends i end1) + (fix:+ i 1)) + +(define (%high-drop-1 start1 end1 starts ends i) + start1 end1 starts ends + i) + +;;;; Standard character sets + +(define-deferred char-set:upper-case + (char-set-union (ascii-range->char-set #x41 #x5B) + (ascii-range->char-set #xC0 #xD7) + (ascii-range->char-set #xD8 #xDE))) +(define-deferred char-set:not-upper-case (char-set-invert char-set:upper-case)) +(define-deferred char-upper-case? (%char-set-test char-set:upper-case)) + +(define-deferred char-set:lower-case + (char-set-union (ascii-range->char-set #x61 #x7B) + (ascii-range->char-set #xE0 #xF7) + (ascii-range->char-set #xF8 #xFF))) +(define-deferred char-set:not-lower-case (char-set-invert char-set:lower-case)) +(define-deferred char-lower-case? (%char-set-test char-set:lower-case)) + +(define-deferred char-set:numeric (ascii-range->char-set #x30 #x3A)) +(define-deferred char-set:not-numeric (char-set-invert char-set:numeric)) +(define-deferred char-numeric? (%char-set-test char-set:numeric)) + +(define-deferred char-set:graphic + (char-set-union (ascii-range->char-set #x20 #x7F) + (ascii-range->char-set #xA0 #x100))) +(define-deferred char-set:not-graphic (char-set-invert char-set:graphic)) +(define-deferred char-graphic? (%char-set-test char-set:graphic)) + +(define-deferred char-set:whitespace + (char-set #\newline #\tab #\linefeed #\page #\return #\space + (integer->char #xA0))) +(define-deferred char-set:not-whitespace (char-set-invert char-set:whitespace)) +(define-deferred char-whitespace? (%char-set-test char-set:whitespace)) + +(define-deferred char-set:alphabetic + (char-set-union char-set:upper-case char-set:lower-case)) +(define-deferred char-set:not-alphabetic (char-set-invert char-set:alphabetic)) +(define-deferred char-alphabetic? (%char-set-test char-set:alphabetic)) + +(define-deferred char-set:alphanumeric + (char-set-union char-set:alphabetic char-set:numeric)) +(define-deferred char-set:not-alphanumeric + (char-set-invert char-set:alphanumeric)) +(define-deferred char-alphanumeric? (%char-set-test char-set:alphanumeric)) + +(define-deferred char-set:standard + (char-set-union char-set:graphic (char-set #\newline))) +(define-deferred char-set:not-standard (char-set-invert char-set:standard)) +(define-deferred char-standard? (%char-set-test char-set:standard)) + +(define-deferred char-set:newline + (char-set #\newline)) -(define (char-whitespace? char) - (guarantee-char char 'CHAR-WHITESPACE?) - (%char-whitespace? char)) +;;; Used in RFCs: -(define-integrable (%char-whitespace? char) - (%char-set-member? char-set:whitespace char)) +(define-deferred char-set:ascii + (ascii-range->char-set #x00 #x80)) -(define (char-alphabetic? char) - (guarantee-char char 'CHAR-ALPHABETIC?) - (%char-alphabetic? char)) +(define-deferred char-set:ctls + (char-set-union (ascii-range->char-set #x00 #x20) + (ascii-range->char-set #x7F #x80))) +(define-deferred char-ctl? (%char-set-test char-set:ctls)) -(define-integrable (%char-alphabetic? char) - (%char-set-member? char-set:alphabetic char)) +(define-deferred char-set:wsp (char-set #\space #\tab)) +(define-deferred char-wsp? (%char-set-test char-set:wsp)) -(define (char-alphanumeric? char) - (guarantee-char char 'CHAR-ALPHANUMERIC?) - (%char-alphanumeric? char)) +(define (%char-set-test char-set) + (lambda (char) + (char-set-member? char-set char))) + +;;;; Backwards compatibility -(define-integrable (%char-alphanumeric? char) - (%char-set-member? char-set:alphanumeric char)) +(define (string->char-set string) + (guarantee-string string 'STRING->CHAR-SET) + (let ((n (string-length string)) + (low (%make-low 0))) + (do ((i 0 (fix:+ i 1))) + ((fix:= i n)) + (%low-set! low (vector-8b-ref string i))) + (%make-char-set low '#() '#()))) + +(define (char-set->string char-set) + (guarantee-8-bit-char-set char-set 'CHAR-SET->STRING) + (let loop ((i 0) (chars '())) + (if (fix:< i %low-length) + (loop (fix:+ i 1) + (if (%scalar-value-in-char-set? i char-set) + (cons (integer->char i) chars) + chars)) + (apply string (reverse! chars))))) -(define (char-standard? char) - (guarantee-char char 'CHAR-STANDARD?) - (%char-standard? char)) +(define (predicate->char-set predicate) + (%scalar-values->char-set + (filter (lambda (i) + (predicate (integer->char i))) + (iota #x100)))) -(define-integrable (%char-standard? char) - (%char-set-member? char-set:standard char)) +(define (char-set-members char-set) + (guarantee-8-bit-char-set char-set 'CHAR-SET-MEMBERS) + (let ((low (char-set-low char-set))) + (let loop ((code #xFF) (chars '())) + (if (fix:>= code 0) + (loop (fix:- code 1) + (if (%low-ref low code) + (cons (integer->char code) chars) + chars)) + chars)))) -(define (char-ctl? char) - (guarantee-char char 'CHAR-CTL?) - (%char-set-member? char-set:ctls char)) +(define (char-set . chars) + (for-each (lambda (char) + (guarantee-unicode-char char 'CHAR-SET)) + chars) + (%scalar-values->char-set (map char->integer chars))) -(define (char-wsp? char) - (guarantee-char char 'CHAR-WSP?) - (%char-set-member? char-set:wsp char)) \ No newline at end of file +(define (chars->char-set chars) + (guarantee-list-of-type chars unicode-char? "character" 'CHARS->CHAR-SET) + (%scalar-values->char-set (map char->integer chars))) + +(define (ascii-range->char-set start end) + (if (not (index-fixnum? start)) + (error:wrong-type-argument start "index fixnum" 'ASCII-RANGE->CHAR-SET)) + (if (not (index-fixnum? end)) + (error:wrong-type-argument end "index fixnum" 'ASCII-RANGE->CHAR-SET)) + (if (not (fix:<= start end)) + (error:bad-range-argument start 'ASCII-RANGE->CHAR-SET)) + (if (not (fix:<= end #x100)) + (error:bad-range-argument end 'ASCII-RANGE->CHAR-SET)) + (%scalar-values->char-set (list (cons start (fix:- end 1))))) \ No newline at end of file diff --git a/src/runtime/parse.scm b/src/runtime/parse.scm index 17bbca3ba..7b6ea6bf8 100644 --- a/src/runtime/parse.scm +++ b/src/runtime/parse.scm @@ -444,12 +444,12 @@ USA. (previous-char #f) (char (%peek))) (if (or (eof-object? char) - (%char-set-member? atom-delimiters char)) + (char-set-member? atom-delimiters char)) (if quoting? (values (get-output-string port*) quoted? previous-char) (get-output-string port*)) (begin - (if (not (%char-set-member? constituents char)) + (if (not (char-set-member? constituents char)) (error:illegal-char char)) (%discard) (cond ((char=? char #\|) @@ -676,8 +676,8 @@ USA. (lambda () (let ((char (%peek-char port db))) (or (eof-object? char) - (%char-set-member? (db-atom-delimiters db) char)))))) - (if (or (%char-set-member? (db-atom-delimiters db) char) + (char-set-member? (db-atom-delimiters db) char)))))) + (if (or (char-set-member? (db-atom-delimiters db) char) (at-end?)) char (name->char diff --git a/src/runtime/rgxcmp.scm b/src/runtime/rgxcmp.scm index 1f8c2fff0..6f4caca18 100644 --- a/src/runtime/rgxcmp.scm +++ b/src/runtime/rgxcmp.scm @@ -259,43 +259,30 @@ USA. ;;; #\^ must appear anywhere except as the first character in the set. (define (re-compile-char-set pattern negate?) - (let ((length (string-length pattern)) - (table (string-allocate 256))) - (let ((kernel - (lambda (start background foreground) - (let ((adjoin! - (lambda (ascii) - (vector-8b-set! table ascii foreground)))) - (vector-8b-fill! table 0 256 background) - (let loop - ((pattern (substring->list pattern start length))) - (if (pair? pattern) - (if (and (pair? (cdr pattern)) - (char=? (cadr pattern) #\-) - (pair? (cddr pattern))) - (begin - (let ((end (char->ascii (caddr pattern)))) - (let loop - ((index (char->ascii (car pattern)))) - (if (fix:<= index end) - (begin - (vector-8b-set! table - index - foreground) - (loop (fix:+ index 1)))))) - (loop (cdddr pattern))) - (begin - (adjoin! (char->ascii (car pattern))) - (loop (cdr pattern)))))))))) - (if (and (not (fix:zero? length)) - (char=? (string-ref pattern 0) #\^)) - (if negate? - (kernel 1 0 1) - (kernel 1 1 0)) - (if negate? - (kernel 0 1 0) - (kernel 0 0 1)))) - (make-char-set table))) + (define (loop pattern scalar-values) + (if (pair? pattern) + (if (and (pair? (cdr pattern)) + (char=? (cadr pattern) #\-) + (pair? (cddr pattern))) + (loop (cdddr pattern) + (cons (cons (char->integer (car pattern)) + (fix:+ (char->integer (caddr pattern)) 1)) + scalar-values)) + (loop (cdr pattern) + (cons (char->integer (car pattern)) + scalar-values))) + scalar-values)) + + (let ((pattern (string->list pattern))) + (receive (pattern negate?) + (if (and (pair? pattern) + (char=? (car pattern) #\^)) + (values (cdr pattern) (not negate?)) + (values pattern negate?)) + (let ((char-set (scalar-values->char-set (loop pattern '())))) + (if negate? + char-set + (char-set-invert char-set)))))) ;;;; Translation Tables diff --git a/src/runtime/runtime.pkg b/src/runtime/runtime.pkg index cc6e85d6a..137c6182e 100644 --- a/src/runtime/runtime.pkg +++ b/src/runtime/runtime.pkg @@ -647,6 +647,7 @@ USA. vector-fourth vector-grow vector-head + vector-head! vector-length vector-map vector-move! @@ -1070,6 +1071,7 @@ USA. (files "chrset") (parent (runtime)) (export () + 8-bit-char-set? ascii-range->char-set char-alphabetic? char-alphanumeric? @@ -1083,6 +1085,7 @@ USA. char-set-invert char-set-member? char-set-members + char-set->scalar-values char-set-union char-set:alphabetic char-set:alphanumeric @@ -1111,17 +1114,21 @@ USA. char-whitespace? char-wsp? chars->char-set + error:not-8-bit-char-set + error:not-char-set + error:not-well-formed-scalar-value-list + error:not-well-formed-scalar-value-range + guarantee-8-bit-char-set guarantee-char-set + guarantee-well-formed-scalar-value-list + guarantee-well-formed-scalar-value-range predicate->char-set - string->char-set) + scalar-values->char-set + string->char-set + well-formed-scalar-value-list? + well-formed-scalar-value-range?) (export (runtime string) - %char-set-member? - char-set-table) - (export (runtime parser) - %char-set-member?) - (export (runtime regular-expression-compiler) - make-char-set) - (initialization (initialize-package!))) + char-set-table)) (define-package (runtime compiler-info) (files "infstr" "infutl") @@ -5255,8 +5262,6 @@ USA. error:not-utf32-le-string error:not-utf32-string error:not-utf8-string - error:not-well-formed-scalar-value-list - error:not-well-formed-scalar-value-range error:not-wide-string for-all-chars-in-string? for-any-char-in-string? @@ -5273,8 +5278,6 @@ USA. guarantee-utf32-le-string guarantee-utf32-string guarantee-utf8-string - guarantee-well-formed-scalar-value-list - guarantee-well-formed-scalar-value-range guarantee-wide-string guarantee-wide-string-index guarantee-wide-substring @@ -5338,8 +5341,6 @@ USA. utf8-string-length utf8-string-valid? utf8-string? - well-formed-scalar-value-list? - well-formed-scalar-value-range? wide-string wide-string->string wide-string-index? diff --git a/src/runtime/string.scm b/src/runtime/string.scm index 133cf9636..4157af4ce 100644 --- a/src/runtime/string.scm +++ b/src/runtime/string.scm @@ -421,7 +421,7 @@ USA. (if (and allow-runs? (fix:= start index)) result (cons (%substring string start index) result)))) - ((%char-set-member? delimiter (string-ref string index)) + ((char-set-member? delimiter (string-ref string index)) (loop (fix:+ index 1) (fix:+ index 1) (if (and allow-runs? (fix:= start index)) diff --git a/tests/runtime/test-char-set.scm b/tests/runtime/test-char-set.scm new file mode 100644 index 000000000..87a83e488 --- /dev/null +++ b/tests/runtime/test-char-set.scm @@ -0,0 +1,86 @@ +#| -*-Scheme-*- + +Copyright (C) 1986, 1987, 1988, 1989, 1990, 1991, 1992, 1993, 1994, + 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, + 2006, 2007, 2008, 2009, 2010 Massachusetts Institute of Technology + +This file is part of MIT/GNU Scheme. + +MIT/GNU Scheme is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or (at +your option) any later version. + +MIT/GNU Scheme is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +General Public License for more details. + +You should have received a copy of the GNU General Public License +along with MIT/GNU Scheme; if not, write to the Free Software +Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, +USA. + +|# + +;;;; Test of character-set abstraction + +(declare (usual-integrations)) + +(define (test-canonicalize-scalar-value-list n-items n-iter) + (run-cpl-test n-items n-iter canonicalize-scalar-value-list)) + +(define (test-char-set->scalar-values n-items n-iter) + (run-cpl-test n-items n-iter + (lambda (cpl) + (char-set->scalar-values (scalar-values->char-set cpl))))) + +(define (run-cpl-test n-items n-iter procedure) + (do ((i 0 (+ i 1)) + (failures '() + (let ((cpl (make-test-cpl n-items))) + (guarantee-well-formed-scalar-value-list cpl) + (let ((cpl* (procedure cpl))) + (if (canonical-scalar-value-list? cpl*) + failures + (cons (cons cpl cpl*) failures)))))) + ((not (< i n-iter)) + (let ((n-failures (length failures))) + (if (> n-failures 0) + (begin + (write-string "Got ") + (write n-failures) + (write-string " failure") + (if (> n-failures 1) + (write-string "s")) + (write-string " out of ") + (write n-iter) + (newline) + (pp failures))))))) + +(define (make-test-cpl n-items) + (make-initialized-list n-items + (lambda (i) + (let loop () + (let ((n (random #x10000))) + (if (unicode-scalar-value? n) + (let ((m (random #x100))) + (if (fix:= m 0) + n + (if (unicode-scalar-value? (fix:+ n m)) + (fix:+ n m) + (loop)))) + (loop))))))) + +(define (canonical-scalar-value-list? items) + (and (well-formed-scalar-value-list? items) + (if (pair? items) + (let loop ((a (car items)) (items (cdr items))) + (if (pair? items) + (let ((b (car items)) + (items (cdr items))) + (and (fix:< (fix:+ (if (pair? a) (cdr a) a) 1) + (if (pair? b) (car b) b)) + (loop b items))) + #t)) + #t))) \ No newline at end of file