From 63bde0def818d11a257264fd1a37b45c244d8b63 Mon Sep 17 00:00:00 2001
From: Chris Hanson <org/chris-hanson/cph>
Date: Thu, 23 Feb 2017 23:57:01 -0800
Subject: [PATCH] Implement count-grapheme-clusters.

---
 src/runtime/runtime.pkg |   5 +-
 src/runtime/ustring.scm | 141 +++++++++++++++++++++++++++++++++++++++-
 2 files changed, 144 insertions(+), 2 deletions(-)

diff --git a/src/runtime/runtime.pkg b/src/runtime/runtime.pkg
index a7a8411dd..46610ca57 100644
--- a/src/runtime/runtime.pkg
+++ b/src/runtime/runtime.pkg
@@ -1029,6 +1029,7 @@ USA.
 	  substring=?)
   (export ()
 	  (substring string-copy)
+	  count-grapheme-clusters
 	  list->string
 	  make-string
 	  string
@@ -1302,6 +1303,7 @@ USA.
 	 "ucd-table-cwu"
 	 "ucd-table-dm"
 	 "ucd-table-gc"
+	 "ucd-table-gcb"
 	 "ucd-table-lc"
 	 "ucd-table-lower"
 	 "ucd-table-nfd_qc"
@@ -1341,7 +1343,8 @@ USA.
 	  char-changes-when-upper-cased?
 	  char-nfd-quick-check?
 	  ucd-ccc-value
-	  ucd-dm-value))
+	  ucd-dm-value
+	  ucd-gcb-value))
 
 (define-package (runtime ucd-glue)
   (files "ucd-glue")
diff --git a/src/runtime/ustring.scm b/src/runtime/ustring.scm
index 1f34cc266..e5f6f26a3 100644
--- a/src/runtime/ustring.scm
+++ b/src/runtime/ustring.scm
@@ -474,10 +474,19 @@ USA.
 	  #t))))
 
 (define (string->nfd string)
-  (if (string-in-nfd? string)
+  (if (or (string-ascii? string)	;ASCII unaffected by normalization
+	  (string-in-nfd? string))
       string
       (canonical-ordering! (canonical-decomposition string))))
 
+(define (string-ascii? string)
+  (let ((n (string-length string)))
+    (let loop ((i 0))
+      (if (fix:< i n)
+	  (and (char-ascii? (string-ref string i))
+	       (loop (fix:+ i 1)))
+	  #t))))
+
 (define (string-in-nfd? string)
   (let ((n (string-length string)))
     (let loop ((i 0) (last-ccc 0))
@@ -544,6 +553,136 @@ USA.
 	  result))))
 |#
 
+(define (count-grapheme-clusters string)
+  (let ((breaks
+	 (find-grapheme-cluster-breaks string
+				       0
+				       (lambda (i count)
+					 (declare (ignore i))
+					 (fix:+ count 1)))))
+    (if (fix:> breaks 0)
+	(fix:- breaks 1)
+	breaks)))
+
+(define (find-grapheme-cluster-breaks string initial-ctx break)
+  (let ((n (string-length string)))
+
+    (define (state:control i ctx)
+      (normal-transition (get-gcb i) i (break i ctx)))
+
+    (define (state:carriage-return i ctx)
+      (let ((gcb (get-gcb i)))
+	(normal-transition gcb i
+	  (case gcb
+	    ((linefeed) ctx)
+	    (else (break i ctx))))))
+
+    (define (state:extend i ctx)
+      (let ((gcb (get-gcb i)))
+	(normal-transition gcb i
+	  (case gcb
+	    ((extend spacing-mark zero-width-joiner) ctx)
+	    (else (break i ctx))))))
+
+    (define (state:zero-width-joiner i ctx)
+      (let ((gcb (get-gcb i)))
+	(normal-transition gcb i
+	  (case gcb
+	    ((emoji-base-gaz glue-after-zero-width-joiner
+			     extend spacing-mark zero-width-joiner)
+	     ctx)
+	    (else (break i ctx))))))
+
+    (define (state:hangul-syllable-type=l i ctx)
+      (let ((gcb (get-gcb i)))
+	(normal-transition gcb i
+	  (case gcb
+	    ((hangul-syllable-type=l
+	      hangul-syllable-type=lv
+	      hangul-syllable-type=lvt
+	      hangul-syllable-type=v
+	      extend spacing-mark zero-width-joiner)
+	     ctx)
+	    (else (break i ctx))))))
+
+    (define (state:hangul-syllable-type=v i ctx)
+      (let ((gcb (get-gcb i)))
+	(normal-transition gcb i
+	  (case gcb
+	    ((hangul-syllable-type=t hangul-syllable-type=v
+				     extend spacing-mark zero-width-joiner)
+	     ctx)
+	    (else (break i ctx))))))
+
+    (define (state:hangul-syllable-type=t i ctx)
+      (let ((gcb (get-gcb i)))
+	(normal-transition gcb i
+	  (case gcb
+	    ((hangul-syllable-type=t extend spacing-mark zero-width-joiner) ctx)
+	    (else (break i ctx))))))
+
+    (define (state:prepend i ctx)
+      (let ((gcb (get-gcb i)))
+	(normal-transition gcb i
+	  (case gcb
+	    ((control carriage-return linefeed end-of-text) (break i ctx))
+	    (else ctx)))))
+
+    (define (state:emoji-base i ctx)
+      (let ((gcb (get-gcb i)))
+	(normal-transition (if (eq? gcb 'extend) 'emoji-base gcb) i
+	  (case gcb
+	    ((emoji-modifier extend spacing-mark zero-width-joiner) ctx)
+	    (else (break i ctx))))))
+
+    (define (state:regional-indicator i ctx)
+      (let ((gcb
+	     (let ((gcb (get-gcb i)))
+	       (if (eq? gcb 'regional-indicator)
+		   'extend
+		   gcb))))
+	(normal-transition gcb i
+	  (case gcb
+	    ((extend spacing-mark zero-width-joiner) ctx)
+	    (else (break i ctx))))))
+
+    (define (state:end-of-text i ctx)
+      (declare (ignore i))
+      ctx)
+
+    (define (transition state i ctx)
+      (state (fix:+ i 1) ctx))
+
+    (define (normal-transition gcb i ctx)
+      (transition (gcb->state gcb) i ctx))
+
+    (define (gcb->state gcb)
+      (case gcb
+	((control linefeed) state:control)
+	((carriage-return) state:carriage-return)
+	((emoji-base emoji-base-gaz) state:emoji-base)
+	((emoji-modifier extend glue-after-zero-width-joiner spacing-mark other)
+	 state:extend)
+	((hangul-syllable-type=l) state:hangul-syllable-type=l)
+	((hangul-syllable-type=t hangul-syllable-type=lvt)
+	 state:hangul-syllable-type=t)
+	((hangul-syllable-type=v hangul-syllable-type=lv)
+	 state:hangul-syllable-type=v)
+	((prepend) state:prepend)
+	((regional-indicator) state:regional-indicator)
+	((zero-width-joiner) state:zero-width-joiner)
+	((end-of-text) state:end-of-text)
+	(else (error "Unknown gcb value:" gcb))))
+
+    (define (get-gcb i)
+      (if (fix:< i n)
+	  (ucd-gcb-value (string-ref string i))
+	  'end-of-text))
+
+    (if (fix:> n 0)
+	(normal-transition (get-gcb 0) 0 (break 0 initial-ctx))
+	initial-ctx)))
+
 (define (list->string chars)
   (if (every char-8-bit? chars)
       (let ((string (legacy-string-allocate (length chars))))
-- 
2.25.1