From bd62a5b1c0aed5385a8c721dfc820b23b6e4a05b Mon Sep 17 00:00:00 2001 From: Chris Hanson Date: Sun, 7 May 2017 13:39:06 -0700 Subject: [PATCH] Change NFC normalization to use MAYBE values of NFC_QC. --- src/etc/ucd-converter.scm | 3 +- src/runtime/runtime.pkg | 2 +- src/runtime/ucd-table-nfc_qc.scm | 29 ++++- src/runtime/ustring.scm | 181 +++++++++++++++++-------------- 4 files changed, 122 insertions(+), 93 deletions(-) diff --git a/src/etc/ucd-converter.scm b/src/etc/ucd-converter.scm index 306fad621..d0eefd11c 100644 --- a/src/etc/ucd-converter.scm +++ b/src/etc/ucd-converter.scm @@ -598,8 +598,7 @@ USA. (define (metadata->code-generator metadata) (let ((name (metadata-name metadata)) (type-spec (metadata-type-spec metadata))) - (cond ((string=? name "NFC_QC") code-generator:boolean) - ((eq? type-spec 'boolean) code-generator:boolean) + (cond ((eq? type-spec 'boolean) code-generator:boolean) ((eq? type-spec 'ccc) code-generator:ccc) ((eq? type-spec 'code-point) code-generator:code-point) ((eq? type-spec 'code-point*) code-generator:code-point*) diff --git a/src/runtime/runtime.pkg b/src/runtime/runtime.pkg index f2c0beb6c..13ffdb6be 100644 --- a/src/runtime/runtime.pkg +++ b/src/runtime/runtime.pkg @@ -1324,7 +1324,6 @@ USA. char-changes-when-lower-cased? char-changes-when-upper-cased? char-full-composition-exclusion? - char-nfc-quick-check? char-nfd-quick-check? ucd-canonical-cm-second-keys ucd-canonical-cm-second-values @@ -1334,6 +1333,7 @@ USA. ucd-cf-value ucd-gcb-value ucd-lc-value + ucd-nfc_qc-value ucd-tc-value ucd-uc-value ucd-wb-value)) diff --git a/src/runtime/ucd-table-nfc_qc.scm b/src/runtime/ucd-table-nfc_qc.scm index 66a97fc94..28825c598 100644 --- a/src/runtime/ucd-table-nfc_qc.scm +++ b/src/runtime/ucd-table-nfc_qc.scm @@ -30,10 +30,27 @@ USA. (declare (usual-integrations)) -(define (char-nfc-quick-check? char) - (char-in-set? char char-set:nfc-quick-check)) +(define (ucd-nfc_qc-value char) + (let ((sv (char->integer char))) + (vector-ref |ucd-NFC_QC-table-5| (bytevector-u8-ref |ucd-NFC_QC-table-4| (fix:or (fix:lsh (bytevector-u8-ref |ucd-NFC_QC-table-3| (fix:or (fix:lsh (bytevector-u8-ref |ucd-NFC_QC-table-2| (fix:or (fix:lsh (bytevector-u8-ref |ucd-NFC_QC-table-1| (fix:or (fix:lsh (bytevector-u8-ref |ucd-NFC_QC-table-0| (fix:lsh sv -16)) 4) (fix:and 15 (fix:lsh sv -12)))) 4) (fix:and 15 (fix:lsh sv -8)))) 4) (fix:and 15 (fix:lsh sv -4)))) 4) (fix:and 15 sv)))))) -(define-deferred char-set:nfc-quick-check - (char-set* - '((0 . 768) 773 (781 . 783) 784 786 (789 . 795) (796 . 803) (809 . 813) 815 (818 . 824) (825 . 832) (838 . 884) (885 . 894) (895 . 903) (904 . 1619) (1622 . 2364) (2365 . 2392) (2400 . 2494) (2495 . 2519) (2520 . 2524) 2526 (2528 . 2611) (2612 . 2614) (2615 . 2649) (2652 . 2654) (2655 . 2878) (2879 . 2902) (2904 . 2908) (2910 . 3006) (3007 . 3031) (3032 . 3158) (3159 . 3266) (3267 . 3285) (3287 . 3390) (3391 . 3415) (3416 . 3530) (3531 . 3535) (3536 . 3551) (3552 . 3907) (3908 . 3917) (3918 . 3922) (3923 . 3927) (3928 . 3932) (3933 . 3945) (3946 . 3955) 3956 3959 (3961 . 3969) (3970 . 3987) (3988 . 3997) (3998 . 4002) (4003 . 4007) (4008 . 4012) (4013 . 4025) (4026 . 4142) (4143 . 4449) - (4470 . 4520) (4547 . 6965) (6966 . 8049) 8050 8052 8054 8056 8058 8060 (8062 . 8123) (8124 . 8126) (8127 . 8137) 8138 (8140 . 8147) (8148 . 8155) (8156 . 8163) (8164 . 8171) (8172 . 8174) (8176 . 8185) 8186 8188 (8190 . 8192) (8194 . 8486) (8487 . 8490) (8492 . 9001) (9003 . 10972) (10973 . 12441) (12443 . 63744) (64014 . 64016) 64017 (64019 . 64021) 64031 64033 (64035 . 64037) (64039 . 64042) (64110 . 64112) (64218 . 64285) 64286 (64288 . 64298) 64311 64317 64319 64322 64325 (64335 . 69818) (69819 . 69927) (69928 . 70462) (70463 . 70487) (70488 . 70832) (70833 . 70842) (70843 . 70845) (70846 . 71087) (71088 . 119134) (119141 . 119227) (119233 . 194560) (195102 . 1114112)))) +(define-deferred |ucd-NFC_QC-table-0| + (vector->bytevector '#(0 1 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3))) + +(define-deferred |ucd-NFC_QC-table-1| + (vector->bytevector '#(0 1 2 3 4 4 4 4 4 4 4 4 4 4 4 5 4 6 4 4 4 4 4 4 4 4 4 4 4 7 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 8 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4))) + +(define-deferred |ucd-NFC_QC-table-2| + (vector->bytevector '#(0 0 0 1 0 0 2 0 0 3 4 5 6 7 0 8 9 10 0 0 0 0 0 0 0 0 0 11 0 0 0 12 13 14 0 15 0 0 0 0 0 0 16 0 0 0 0 0 17 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 18 19 20 0 0 0 0 21 22 0 23 24 25 0 0 0 0 0 0 0 0 0 0 0 26 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 18 18 27 0 0 0 0 0))) + +(define-deferred |ucd-NFC_QC-table-3| + (vector->bytevector + '#(0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 2 3 4 5 0 0 6 7 0 0 0 0 0 0 0 0 0 0 0 0 8 0 0 0 0 0 0 0 0 0 0 0 0 0 9 0 10 0 0 0 0 0 11 0 12 0 0 0 0 0 13 0 14 0 0 0 0 0 0 0 0 0 0 0 0 0 11 0 15 0 0 0 0 0 11 0 16 0 0 0 0 0 0 0 17 0 0 0 0 0 0 18 19 0 0 0 0 0 11 0 16 0 0 0 0 0 0 20 21 0 0 0 0 0 0 22 23 24 25 26 22 23 24 0 0 0 0 0 0 11 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 27 28 0 0 29 30 31 0 0 0 0 0 0 32 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 33 0 0 0 34 35 36 37 38 39 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 40 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 41 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 42 0 0 0 0 0 0 0 0 0 0 0 43 0 0 0 0 0 0 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 45 46 47 44 44 44 45 44 44 44 44 44 44 48 0 0 0 49 50 51 52 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 53 0 0 0 0 0 0 16 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 11 0 16 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 54 0 0 0 0 0 0 0 0 0 0 0 0 0 0 21 0 0 0 0 0 0 0 0 0 0 55 56 0 0 0 0 57 58 0 0 0 44 45 0 0 0 0 0 0 0 0 0 0 0 0 0 0))) + +(define-deferred |ucd-NFC_QC-table-4| + (vector->bytevector + '#(0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 0 1 1 1 1 1 1 1 0 0 1 0 1 0 1 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 1 1 1 1 1 0 0 0 0 1 1 0 1 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 2 2 1 2 2 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 2 2 2 2 2 2 2 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 2 2 0 2 0 0 0 2 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 2 2 0 0 2 0 0 0 0 0 0 0 1 1 0 0 0 0 2 2 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 2 0 0 0 0 0 0 0 0 0 2 0 0 0 0 2 0 0 0 0 2 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 2 0 2 2 0 2 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 2 0 2 0 2 0 2 0 2 0 2 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 2 0 0 0 0 0 0 0 0 0 0 2 0 2 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 2 0 0 2 2 0 0 0 0 0 0 0 0 0 2 0 2 0 2 0 0 2 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 2 2 0 0 0 0 0 0 0 0 0 0 0 0 0 2 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 0 2 0 2 0 0 2 2 2 2 2 2 2 2 2 2 0 2 0 2 0 0 2 2 0 0 0 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 2 0 0 0 0 0 0 0 0 0 0 2 2 2 2 2 2 2 2 2 2 2 2 2 0 2 2 2 2 2 0 2 0 2 2 0 2 2 0 2 2 2 2 2 2 2 2 2 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 2 2 2 2 2 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 2 2 2 2 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0))) + +(define-deferred |ucd-NFC_QC-table-5| + #(yes maybe no)) diff --git a/src/runtime/ustring.scm b/src/runtime/ustring.scm index d71c1bf42..f34252f9a 100644 --- a/src/runtime/ustring.scm +++ b/src/runtime/ustring.scm @@ -71,7 +71,7 @@ USA. (define (%string-immutable? string fail) (cond ((legacy-string? string) #f) - ((ustring? string) (not (%ustring-mutable? string))) + ((ustring? string) (%ustring-immutable? string)) ((slice? string) (not (slice-mutable? string))) (else (fail)))) @@ -131,9 +131,12 @@ USA. (fix:or (fix:andc (%ustring-flags string) #x03) cp-size))) -(define (%ustring-mutable? string) +(define-integrable (%ustring-mutable? string) (fix:= 0 (%ustring-cp-size string))) +(define-integrable (%ustring-immutable? string) + (not (%ustring-mutable? string))) + (define-integrable flag:nfc #x04) (define-integrable flag:nfd #x08) @@ -522,14 +525,16 @@ USA. (else (max-loop cp3-ref)))) (define (%string->immutable string) - (unpack-slice string - (lambda (string* start end) - (let ((result - (immutable-ustring-allocate - (fix:- end start) - (%general-max-cp string* start end)))) - (%general-copy! result 0 string* start end) - result)))) + (if (and (ustring? string) (%ustring-immutable? string)) + string + (unpack-slice string + (lambda (string* start end) + (let ((result + (immutable-ustring-allocate + (fix:- end start) + (%general-max-cp string* start end)))) + (%general-copy! result 0 string* start end) + result))))) ;;;; Streaming builder @@ -557,10 +562,7 @@ USA. (else (error "Unsupported argument:" object))))))))) (define (build-string:nfc strings count max-cp) - (let ((result (build-string:immutable strings count max-cp))) - (if (ustring-in-nfc? result) - result - (string->nfc result)))) + (string->nfc (build-string:immutable strings count max-cp))) (define (build-string:immutable strings count max-cp) (let ((result (immutable-ustring-allocate count max-cp))) @@ -861,90 +863,101 @@ USA. (string-in-nfc? string))) (define (string-in-nfc? string) + (let ((qc (string-nfc-qc string 'string-in-nfc?))) + (if (eq? qc 'maybe) + (%string=? string (%string->nfc string)) + qc))) + +(define (string->nfc string) + (if (eq? #t (string-nfc-qc string 'string->nfc)) + (let ((result (%string->immutable string))) + (ustring-in-nfc! result) + result) + (%string->nfc string))) + +(define (%string->nfc string) + (let ((result + (canonical-composition + (if (string-in-nfd? string) + string + (canonical-decomposition&ordering string + (lambda (string* n max-cp) + (declare (ignore n max-cp)) + string*)))))) + (ustring-in-nfc! result) + result)) + +(define (string-nfc-qc string caller) (cond ((legacy-string? string) #t) ((ustring? string) - (if (ustring-mutable? string) - (ustring-nfc-qc? string 0 (ustring-length string)) - (ustring-in-nfc? string))) + (or (ustring-in-nfc? string) + (ustring-nfc-qc string 0 (string-length string)))) ((slice? string) - (unpack-slice string ustring-nfc-qc?)) + (unpack-slice string ustring-nfc-qc)) (else - (error:not-a string? string 'string-in-nfc?)))) - + (error:not-a string? string caller)))) + +(define (ustring-nfc-qc string start end) + (let ((scan + (lambda (sref) + (let loop ((i start) (last-ccc 0) (result #t)) + (if (fix:< i end) + (let ((char (sref string i))) + (if (fix:< (char->integer char) #x300) + (loop (fix:+ i 1) 0 result) + (let ((ccc (ucd-ccc-value char))) + (and (or (fix:= ccc 0) (fix:>= ccc last-ccc)) + (case (ucd-nfc_qc-value char) + ((yes) (loop (fix:+ i 1) ccc result)) + ((maybe) (loop (fix:+ i 1) ccc 'maybe)) + (else #f)))))) + result))))) + (case (ustring-cp-size string) + ((1) #t) + ((2) (scan ustring2-ref)) + (else (scan ustring3-ref))))) + (define (string-in-nfd? string) - (cond ((or (legacy-string? string) (ustring? string)) - (if (ustring-mutable? string) - (ustring-nfd-qc? string 0 (ustring-length string)) - (ustring-in-nfd? string))) + (cond ((legacy-string? string) + (ustring-nfd-qc? string 0 (ustring-length string))) + ((ustring? string) + (or (ustring-in-nfd? string) + (ustring-nfd-qc? string 0 (ustring-length string)))) ((slice? string) (unpack-slice string ustring-nfd-qc?)) (else (error:not-a string? string 'string-in-nfd?)))) -(define (ustring-nfc-qc? string start end) - (case (ustring-cp-size string) - ((1) #t) - ((2) (%ustring-nfc-qc? ustring2-ref string start end)) - (else (%ustring-nfc-qc? ustring3-ref string start end)))) - (define (ustring-nfd-qc? string start end) - (case (ustring-cp-size string) - ((1) (%ustring-nfd-qc? ustring1-ref string start end)) - ((2) (%ustring-nfd-qc? ustring2-ref string start end)) - (else (%ustring-nfd-qc? ustring3-ref string start end)))) - -(define-integrable (string-nqc-loop cp-limit char-nqc?) - (lambda (sref string start end) - (let loop ((i start) (last-ccc 0)) - (if (fix:< i end) - (let ((char (sref string i))) - (if (fix:< (char->integer char) cp-limit) - (loop (fix:+ i 1) 0) - (let ((ccc (ucd-ccc-value char))) - (and (or (fix:= ccc 0) (fix:>= ccc last-ccc)) - (char-nqc? char) - (loop (fix:+ i 1) ccc))))) - #t)))) - -(define %ustring-nfc-qc? (string-nqc-loop #x300 char-nfc-quick-check?)) -(define %ustring-nfd-qc? (string-nqc-loop #xC0 char-nfd-quick-check?)) - -(define (string->nfc string) - (cond ((and (ustring? string) - (ustring-in-nfc? string)) - string) - ((string-in-nfc? string) - (let ((result (%string->immutable string))) - (ustring-in-nfc! result) - result)) - (else - (let ((result - (canonical-composition - (if (string-in-nfd? string) - string - (canonical-decomposition&ordering string - (lambda (string* n max-cp) - (declare (ignore n max-cp)) - string*)))))) - (ustring-in-nfc! result) - result)))) + (let ((scan + (lambda (sref) + (let loop ((i start) (last-ccc 0)) + (if (fix:< i end) + (let ((char (sref string i))) + (if (fix:< (char->integer char) #xC0) + (loop (fix:+ i 1) 0) + (let ((ccc (ucd-ccc-value char))) + (and (or (fix:= ccc 0) (fix:>= ccc last-ccc)) + (char-nfd-quick-check? char) + (loop (fix:+ i 1) ccc))))) + #t))))) + (case (ustring-cp-size string) + ((1) (scan ustring1-ref)) + ((2) (scan ustring2-ref)) + (else (scan ustring3-ref))))) (define (string->nfd string) - (cond ((and (ustring? string) - (ustring-in-nfd? string)) - string) - ((string-in-nfd? string) - (let ((result (%string->immutable string))) - (ustring-in-nfd! result) - result)) - (else - (canonical-decomposition&ordering string - (lambda (string* n max-cp) - (let ((result (immutable-ustring-allocate n max-cp))) - (%general-copy! result 0 string* 0 n) - (ustring-in-nfd! result) - result)))))) + (if (string-in-nfd? string) + (let ((result (%string->immutable string))) + (ustring-in-nfd! result) + result) + (canonical-decomposition&ordering string + (lambda (string* n max-cp) + (let ((result (immutable-ustring-allocate n max-cp))) + (%general-copy! result 0 string* 0 n) + (ustring-in-nfd! result) + result))))) (define (canonical-decomposition&ordering string k) (let ((end (string-length string)) -- 2.25.1