Initial draft of NFC support; still need to write composition.
authorChris Hanson <org/chris-hanson/cph>
Sat, 25 Mar 2017 22:19:56 +0000 (15:19 -0700)
committerChris Hanson <org/chris-hanson/cph>
Sat, 25 Mar 2017 22:19:56 +0000 (15:19 -0700)
src/runtime/runtime.pkg
src/runtime/ustring.scm

index 2e851479c6ef74ce81534ca745cb81009ab13c84..b03755f8fec00dbb64dd4718f706d2eca88e4637 100644 (file)
@@ -974,6 +974,7 @@ USA.
          string
          string*
          string->list
+         string->nfc
          string->nfd
          string->vector
          string-any
@@ -1008,6 +1009,8 @@ USA.
          string-hash
          string-hash-ci
          string-head
+         string-in-nfc?
+         string-in-nfd?
          string-joiner
          string-joiner*
          string-length
@@ -1270,6 +1273,7 @@ USA.
         "ucd-table-cased"
         "ucd-table-ccc"
         "ucd-table-cf"
+        "ucd-table-comp_ex"
         "ucd-table-cwcf"
         "ucd-table-cwl"
         "ucd-table-cwu"
@@ -1277,6 +1281,7 @@ USA.
         "ucd-table-gcb"
         "ucd-table-lc"
         "ucd-table-lower"
+        "ucd-table-nfc_qc"
         "ucd-table-nfd_qc"
         "ucd-table-nt"
         "ucd-table-nv"
@@ -1314,12 +1319,14 @@ USA.
          char-changes-when-case-folded?
          char-changes-when-lower-cased?
          char-changes-when-upper-cased?
+         char-full-composition-exclusion?
          char-nfd-quick-check?
          ucd-ccc-value
          ucd-cf-value
          ucd-canonical-dm-value
          ucd-gcb-value
          ucd-lc-value
+         ucd-nfc_qc-value
          ucd-tc-value
          ucd-uc-value
          ucd-wb-value))
index 9b55f077ffd85bda7d223b18577541c4fd9cb7d7..a3de91069d9fc9bf0106c641bef2ba945f0c7c6d 100644 (file)
@@ -513,31 +513,45 @@ USA.
 ;;;; Normalization
 
 (define (string->nfd string)
-  (if (or (string-ascii? string)       ;ASCII unaffected by normalization
-         (string-in-nfd? string))
+  (if (string-in-nfd? string)
       string
       (canonical-ordering! (canonical-decomposition string))))
 
-(define (string-ascii? string)
-  (let ((n (string-length string)))
-    (let loop ((i 0))
-      (if (fix:< i n)
-         (and (char-ascii? (string-ref string i))
-              (loop (fix:+ i 1)))
-         #t))))
+(define (string->nfc string)
+  (if (string-in-nfc? string)
+      string
+      (canonical-composition (string->nfd string))))
 
 (define (string-in-nfd? string)
-  (let ((n (string-length string)))
+  (let ((end (string-length string)))
     (let loop ((i 0) (last-ccc 0))
-      (if (fix:< i n)
-         (let* ((char (string-ref string i))
-                (ccc (ucd-ccc-value char)))
-           (and (or (fix:= ccc 0)
-                    (fix:>= ccc last-ccc))
-                (char-nfd-quick-check? char)
-                (loop (fix:+ i 1) ccc)))
+      (if (fix:< i end)
+         (let ((char (string-ref string i)))
+           (if (fix:< (char->integer char) #xC0)
+               (loop (fix:+ i 1) 0)
+               (let ((ccc (ucd-ccc-value char)))
+                 (and (or (fix:= ccc 0)
+                          (fix:>= ccc last-ccc))
+                      (char-nfd-quick-check? char)
+                      (loop (fix:+ i 1) ccc)))))
          #t))))
 
+(define (string-in-nfc? string)
+  (let ((end (string-length string)))
+    (let loop ((i 0) (last-ccc 0) (result #t))
+      (if (fix:< i end)
+         (let ((char (string-ref string i)))
+           (if (fix:< (char->integer char) #x300)
+               (loop (fix:+ i 1) 0 result)
+               (let ((ccc (ucd-ccc-value char)))
+                 (and (or (fix:= ccc 0)
+                          (fix:>= ccc last-ccc))
+                      (case (ucd-nfc_qc-value char)
+                        ((yes) (loop (fix:+ i 1) ccc result))
+                        ((maybe) (loop (fix:+ i 1) ccc 'maybe))
+                        (else #f))))))
+         result))))
+
 (define (canonical-decomposition string)
   (let ((end (string-length string))
        (builder (string-builder)))
@@ -587,6 +601,9 @@ USA.
 
     (scan-for-non-starter 0))
   string)
+
+(define (canonical-composition string)
+  string)
 \f
 ;;;; Grapheme clusters