From: Chris Hanson <org/chris-hanson/cph>
Date: Sat, 25 Feb 2017 05:08:17 +0000 (-0800)
Subject: Use new GCB values to compress and speed up find-grapheme-cluster-breaks.
X-Git-Tag: mit-scheme-pucked-9.2.12~198^2~23
X-Git-Url: https://birchwood-abbey.net/git?a=commitdiff_plain;h=a8bb470b3944b538ea62c022a278dcb7a68bdaab;p=mit-scheme.git

Use new GCB values to compress and speed up find-grapheme-cluster-breaks.
---

diff --git a/src/runtime/ucd-table-gcb.scm b/src/runtime/ucd-table-gcb.scm
index a6c45e4c3..3a25af12c 100644
--- a/src/runtime/ucd-table-gcb.scm
+++ b/src/runtime/ucd-table-gcb.scm
@@ -38,75 +38,75 @@ USA.
 
 (define (ucd-gcb-entry-0 sv table)
   (declare (ignore sv table))
-  'control)
+  0)
 
 (define (ucd-gcb-entry-1 sv table)
   (declare (ignore sv table))
-  'linefeed)
+  8)
 
 (define (ucd-gcb-entry-2 sv table)
   (declare (ignore sv table))
-  'carriage-return)
+  1)
 
 (define (ucd-gcb-entry-3 sv table)
   (declare (ignore sv table))
-  'other)
+  16)
 
 (define (ucd-gcb-entry-4 sv table)
   (declare (ignore sv table))
-  'extend)
+  5)
 
 (define (ucd-gcb-entry-5 sv table)
   (declare (ignore sv table))
-  'prepend)
+  11)
 
 (define (ucd-gcb-entry-6 sv table)
   (declare (ignore sv table))
-  'spacing-mark)
+  13)
 
 (define (ucd-gcb-entry-7 sv table)
   (declare (ignore sv table))
-  'hangul-syllable-type=l)
+  7)
 
 (define (ucd-gcb-entry-8 sv table)
   (declare (ignore sv table))
-  'hangul-syllable-type=v)
+  15)
 
 (define (ucd-gcb-entry-9 sv table)
   (declare (ignore sv table))
-  'hangul-syllable-type=t)
+  14)
 
 (define (ucd-gcb-entry-10 sv table)
   (declare (ignore sv table))
-  'zero-width-joiner)
+  17)
 
 (define (ucd-gcb-entry-11 sv table)
   (declare (ignore sv table))
-  'emoji-base)
+  2)
 
 (define (ucd-gcb-entry-12 sv table)
   (declare (ignore sv table))
-  'glue-after-zero-width-joiner)
+  6)
 
 (define (ucd-gcb-entry-13 sv table)
   (declare (ignore sv table))
-  'hangul-syllable-type=lv)
+  9)
 
 (define (ucd-gcb-entry-14 sv table)
   (declare (ignore sv table))
-  'hangul-syllable-type=lvt)
+  10)
 
 (define (ucd-gcb-entry-15 sv table)
   (declare (ignore sv table))
-  'regional-indicator)
+  12)
 
 (define (ucd-gcb-entry-16 sv table)
   (declare (ignore sv table))
-  'emoji-modifier)
+  4)
 
 (define (ucd-gcb-entry-17 sv table)
   (declare (ignore sv table))
-  'emoji-base-gaz)
+  3)
 
 (define-deferred ucd-gcb-entry-18
   (let ((offsets (bytevector 0 0 0 0 0 0 0 0 0 0 1 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3 3 3 3 3 3 3 3 3 3 3 3 3 0 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3)))
diff --git a/src/runtime/ustring.scm b/src/runtime/ustring.scm
index e5f6f26a3..82ed9c885 100644
--- a/src/runtime/ustring.scm
+++ b/src/runtime/ustring.scm
@@ -564,123 +564,126 @@ USA.
 	(fix:- breaks 1)
 	breaks)))
 
+(define gcb-names
+  '#(control
+     carriage-return
+     emoji-base
+     emoji-base-gaz
+     emoji-modifier
+     extend
+     glue-after-zwj
+     hst=l
+     linefeed
+     hst=lv
+     hst=lvt
+     prepend
+     regional-indicator
+     spacing-mark
+     hst=t
+     hst=v
+     other
+     zwj))
+
+(define (gcb-code name)
+  (let ((end (vector-length gcb-names)))
+    (let loop ((code 0))
+      (if (not (fix:< code end))
+	  (error "Unknown GCB name:" name))
+      (if (eq? (vector-ref gcb-names code) name)
+	  code
+	  (loop (fix:+ code 1))))))
+
+(define gcb-states
+  (let ((simple-state
+	 (lambda (break?)
+	   (lambda (gcb k)
+	     (k gcb (break? gcb)))))
+	(make-no-breaks
+	 (lambda (names)
+	   (let loop
+	       ((names names)
+		(mask (fix:- (fix:lsh 1 (vector-length gcb-names)) 1)))
+	     (if (pair? names)
+		 (loop (cdr names)
+		       (fix:andc mask (fix:lsh 1 (gcb-code (car names)))))
+		 (lambda (gcb)
+		   (not (fix:= 0 (fix:and mask (fix:lsh 1 gcb)))))))))
+	(make-breaks
+	 (lambda (names)
+	   (let loop
+	       ((names names)
+		(mask 0))
+	     (if (pair? names)
+		 (loop (cdr names)
+		       (fix:or mask (fix:lsh 1 (gcb-code (car names)))))
+		 (lambda (gcb)
+		   (not (fix:= 0 (fix:and mask (fix:lsh 1 gcb))))))))))
+    (let ((state:control (simple-state (make-no-breaks '())))
+	  (state:emoji-base
+	   (let ((gcb:extend (gcb-code 'extend))
+		 (gcb:emoji-base (gcb-code 'emoji-base))
+		 (break?
+		  (make-no-breaks '(emoji-modifier extend spacing-mark zwj))))
+	     (lambda (gcb k)
+	       (if (fix:= gcb gcb:extend)
+		   (k gcb:emoji-base #f)
+		   (k gcb (break? gcb))))))
+	  (state:extend
+	   (simple-state (make-no-breaks '(extend spacing-mark zwj))))
+	  (state:hst=v
+	   (simple-state
+	    (make-no-breaks '(hst=t hst=v extend spacing-mark zwj))))
+	  (state:hst=t
+	   (simple-state (make-no-breaks '(hst=t extend spacing-mark zwj)))))
+      (vector state:control
+	      (simple-state (make-no-breaks '(linefeed)))
+	      state:emoji-base
+	      state:emoji-base
+	      state:extend
+	      state:extend
+	      state:extend
+	      (simple-state
+	       (make-no-breaks
+		'(hst=l hst=lv hst=lvt hst=v extend spacing-mark zwj)))
+	      state:control
+	      state:hst=v
+	      state:hst=t
+	      (simple-state (make-breaks '(control carriage-return linefeed)))
+	      (let ((gcb:regional-indicator (gcb-code 'regional-indicator))
+		    (gcb:extend (gcb-code 'extend))
+		    (break? (make-no-breaks '(extend spacing-mark zwj))))
+		(lambda (gcb k)
+		  (let ((gcb
+			 (if (fix:= gcb gcb:regional-indicator)
+			     gcb:extend
+			     gcb)))
+		    (k gcb (break? gcb)))))
+	      state:extend
+	      state:hst=t
+	      state:hst=v
+	      state:extend
+	      (simple-state
+	       (make-no-breaks
+		'(emoji-base-gaz glue-after-zwj extend spacing-mark zwj)))))))
+
 (define (find-grapheme-cluster-breaks string initial-ctx break)
   (let ((n (string-length string)))
 
-    (define (state:control i ctx)
-      (normal-transition (get-gcb i) i (break i ctx)))
-
-    (define (state:carriage-return i ctx)
-      (let ((gcb (get-gcb i)))
-	(normal-transition gcb i
-	  (case gcb
-	    ((linefeed) ctx)
-	    (else (break i ctx))))))
-
-    (define (state:extend i ctx)
-      (let ((gcb (get-gcb i)))
-	(normal-transition gcb i
-	  (case gcb
-	    ((extend spacing-mark zero-width-joiner) ctx)
-	    (else (break i ctx))))))
-
-    (define (state:zero-width-joiner i ctx)
-      (let ((gcb (get-gcb i)))
-	(normal-transition gcb i
-	  (case gcb
-	    ((emoji-base-gaz glue-after-zero-width-joiner
-			     extend spacing-mark zero-width-joiner)
-	     ctx)
-	    (else (break i ctx))))))
-
-    (define (state:hangul-syllable-type=l i ctx)
-      (let ((gcb (get-gcb i)))
-	(normal-transition gcb i
-	  (case gcb
-	    ((hangul-syllable-type=l
-	      hangul-syllable-type=lv
-	      hangul-syllable-type=lvt
-	      hangul-syllable-type=v
-	      extend spacing-mark zero-width-joiner)
-	     ctx)
-	    (else (break i ctx))))))
-
-    (define (state:hangul-syllable-type=v i ctx)
-      (let ((gcb (get-gcb i)))
-	(normal-transition gcb i
-	  (case gcb
-	    ((hangul-syllable-type=t hangul-syllable-type=v
-				     extend spacing-mark zero-width-joiner)
-	     ctx)
-	    (else (break i ctx))))))
-
-    (define (state:hangul-syllable-type=t i ctx)
-      (let ((gcb (get-gcb i)))
-	(normal-transition gcb i
-	  (case gcb
-	    ((hangul-syllable-type=t extend spacing-mark zero-width-joiner) ctx)
-	    (else (break i ctx))))))
-
-    (define (state:prepend i ctx)
-      (let ((gcb (get-gcb i)))
-	(normal-transition gcb i
-	  (case gcb
-	    ((control carriage-return linefeed end-of-text) (break i ctx))
-	    (else ctx)))))
-
-    (define (state:emoji-base i ctx)
-      (let ((gcb (get-gcb i)))
-	(normal-transition (if (eq? gcb 'extend) 'emoji-base gcb) i
-	  (case gcb
-	    ((emoji-modifier extend spacing-mark zero-width-joiner) ctx)
-	    (else (break i ctx))))))
-
-    (define (state:regional-indicator i ctx)
-      (let ((gcb
-	     (let ((gcb (get-gcb i)))
-	       (if (eq? gcb 'regional-indicator)
-		   'extend
-		   gcb))))
-	(normal-transition gcb i
-	  (case gcb
-	    ((extend spacing-mark zero-width-joiner) ctx)
-	    (else (break i ctx))))))
-
-    (define (state:end-of-text i ctx)
-      (declare (ignore i))
-      ctx)
-
-    (define (transition state i ctx)
-      (state (fix:+ i 1) ctx))
-
-    (define (normal-transition gcb i ctx)
-      (transition (gcb->state gcb) i ctx))
-
-    (define (gcb->state gcb)
-      (case gcb
-	((control linefeed) state:control)
-	((carriage-return) state:carriage-return)
-	((emoji-base emoji-base-gaz) state:emoji-base)
-	((emoji-modifier extend glue-after-zero-width-joiner spacing-mark other)
-	 state:extend)
-	((hangul-syllable-type=l) state:hangul-syllable-type=l)
-	((hangul-syllable-type=t hangul-syllable-type=lvt)
-	 state:hangul-syllable-type=t)
-	((hangul-syllable-type=v hangul-syllable-type=lv)
-	 state:hangul-syllable-type=v)
-	((prepend) state:prepend)
-	((regional-indicator) state:regional-indicator)
-	((zero-width-joiner) state:zero-width-joiner)
-	((end-of-text) state:end-of-text)
-	(else (error "Unknown gcb value:" gcb))))
-
     (define (get-gcb i)
-      (if (fix:< i n)
-	  (ucd-gcb-value (string-ref string i))
-	  'end-of-text))
+      (and (fix:< i n)
+	   (ucd-gcb-value (string-ref string i))))
+
+    (define (transition gcb i ctx)
+      (if gcb
+	  (let ((i* (fix:+ i 1)))
+	    ((vector-ref gcb-states gcb)
+	     (get-gcb i*)
+	     (lambda (gcb* break?)
+	       (transition gcb* i* (if break? (break i* ctx) ctx)))))
+	  ctx))
 
     (if (fix:> n 0)
-	(normal-transition (get-gcb 0) 0 (break 0 initial-ctx))
+	(transition (get-gcb 0) 0 (break 0 initial-ctx))
 	initial-ctx)))
 
 (define (list->string chars)