(define (ucd-gcb-entry-0 sv table)
(declare (ignore sv table))
- 'control)
+ 0)
(define (ucd-gcb-entry-1 sv table)
(declare (ignore sv table))
- 'linefeed)
+ 8)
(define (ucd-gcb-entry-2 sv table)
(declare (ignore sv table))
- 'carriage-return)
+ 1)
(define (ucd-gcb-entry-3 sv table)
(declare (ignore sv table))
- 'other)
+ 16)
(define (ucd-gcb-entry-4 sv table)
(declare (ignore sv table))
- 'extend)
+ 5)
(define (ucd-gcb-entry-5 sv table)
(declare (ignore sv table))
- 'prepend)
+ 11)
(define (ucd-gcb-entry-6 sv table)
(declare (ignore sv table))
- 'spacing-mark)
+ 13)
(define (ucd-gcb-entry-7 sv table)
(declare (ignore sv table))
- 'hangul-syllable-type=l)
+ 7)
(define (ucd-gcb-entry-8 sv table)
(declare (ignore sv table))
- 'hangul-syllable-type=v)
+ 15)
(define (ucd-gcb-entry-9 sv table)
(declare (ignore sv table))
- 'hangul-syllable-type=t)
+ 14)
(define (ucd-gcb-entry-10 sv table)
(declare (ignore sv table))
- 'zero-width-joiner)
+ 17)
(define (ucd-gcb-entry-11 sv table)
(declare (ignore sv table))
- 'emoji-base)
+ 2)
(define (ucd-gcb-entry-12 sv table)
(declare (ignore sv table))
- 'glue-after-zero-width-joiner)
+ 6)
(define (ucd-gcb-entry-13 sv table)
(declare (ignore sv table))
- 'hangul-syllable-type=lv)
+ 9)
(define (ucd-gcb-entry-14 sv table)
(declare (ignore sv table))
- 'hangul-syllable-type=lvt)
+ 10)
(define (ucd-gcb-entry-15 sv table)
(declare (ignore sv table))
- 'regional-indicator)
+ 12)
(define (ucd-gcb-entry-16 sv table)
(declare (ignore sv table))
- 'emoji-modifier)
+ 4)
(define (ucd-gcb-entry-17 sv table)
(declare (ignore sv table))
- 'emoji-base-gaz)
+ 3)
(define-deferred ucd-gcb-entry-18
(let ((offsets (bytevector 0 0 0 0 0 0 0 0 0 0 1 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3 3 3 3 3 3 3 3 3 3 3 3 3 0 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3)))
(fix:- breaks 1)
breaks)))
+(define gcb-names
+ '#(control
+ carriage-return
+ emoji-base
+ emoji-base-gaz
+ emoji-modifier
+ extend
+ glue-after-zwj
+ hst=l
+ linefeed
+ hst=lv
+ hst=lvt
+ prepend
+ regional-indicator
+ spacing-mark
+ hst=t
+ hst=v
+ other
+ zwj))
+
+(define (gcb-code name)
+ (let ((end (vector-length gcb-names)))
+ (let loop ((code 0))
+ (if (not (fix:< code end))
+ (error "Unknown GCB name:" name))
+ (if (eq? (vector-ref gcb-names code) name)
+ code
+ (loop (fix:+ code 1))))))
+\f
+(define gcb-states
+ (let ((simple-state
+ (lambda (break?)
+ (lambda (gcb k)
+ (k gcb (break? gcb)))))
+ (make-no-breaks
+ (lambda (names)
+ (let loop
+ ((names names)
+ (mask (fix:- (fix:lsh 1 (vector-length gcb-names)) 1)))
+ (if (pair? names)
+ (loop (cdr names)
+ (fix:andc mask (fix:lsh 1 (gcb-code (car names)))))
+ (lambda (gcb)
+ (not (fix:= 0 (fix:and mask (fix:lsh 1 gcb)))))))))
+ (make-breaks
+ (lambda (names)
+ (let loop
+ ((names names)
+ (mask 0))
+ (if (pair? names)
+ (loop (cdr names)
+ (fix:or mask (fix:lsh 1 (gcb-code (car names)))))
+ (lambda (gcb)
+ (not (fix:= 0 (fix:and mask (fix:lsh 1 gcb))))))))))
+ (let ((state:control (simple-state (make-no-breaks '())))
+ (state:emoji-base
+ (let ((gcb:extend (gcb-code 'extend))
+ (gcb:emoji-base (gcb-code 'emoji-base))
+ (break?
+ (make-no-breaks '(emoji-modifier extend spacing-mark zwj))))
+ (lambda (gcb k)
+ (if (fix:= gcb gcb:extend)
+ (k gcb:emoji-base #f)
+ (k gcb (break? gcb))))))
+ (state:extend
+ (simple-state (make-no-breaks '(extend spacing-mark zwj))))
+ (state:hst=v
+ (simple-state
+ (make-no-breaks '(hst=t hst=v extend spacing-mark zwj))))
+ (state:hst=t
+ (simple-state (make-no-breaks '(hst=t extend spacing-mark zwj)))))
+ (vector state:control
+ (simple-state (make-no-breaks '(linefeed)))
+ state:emoji-base
+ state:emoji-base
+ state:extend
+ state:extend
+ state:extend
+ (simple-state
+ (make-no-breaks
+ '(hst=l hst=lv hst=lvt hst=v extend spacing-mark zwj)))
+ state:control
+ state:hst=v
+ state:hst=t
+ (simple-state (make-breaks '(control carriage-return linefeed)))
+ (let ((gcb:regional-indicator (gcb-code 'regional-indicator))
+ (gcb:extend (gcb-code 'extend))
+ (break? (make-no-breaks '(extend spacing-mark zwj))))
+ (lambda (gcb k)
+ (let ((gcb
+ (if (fix:= gcb gcb:regional-indicator)
+ gcb:extend
+ gcb)))
+ (k gcb (break? gcb)))))
+ state:extend
+ state:hst=t
+ state:hst=v
+ state:extend
+ (simple-state
+ (make-no-breaks
+ '(emoji-base-gaz glue-after-zwj extend spacing-mark zwj)))))))
+\f
(define (find-grapheme-cluster-breaks string initial-ctx break)
(let ((n (string-length string)))
- (define (state:control i ctx)
- (normal-transition (get-gcb i) i (break i ctx)))
-
- (define (state:carriage-return i ctx)
- (let ((gcb (get-gcb i)))
- (normal-transition gcb i
- (case gcb
- ((linefeed) ctx)
- (else (break i ctx))))))
-
- (define (state:extend i ctx)
- (let ((gcb (get-gcb i)))
- (normal-transition gcb i
- (case gcb
- ((extend spacing-mark zero-width-joiner) ctx)
- (else (break i ctx))))))
-
- (define (state:zero-width-joiner i ctx)
- (let ((gcb (get-gcb i)))
- (normal-transition gcb i
- (case gcb
- ((emoji-base-gaz glue-after-zero-width-joiner
- extend spacing-mark zero-width-joiner)
- ctx)
- (else (break i ctx))))))
-
- (define (state:hangul-syllable-type=l i ctx)
- (let ((gcb (get-gcb i)))
- (normal-transition gcb i
- (case gcb
- ((hangul-syllable-type=l
- hangul-syllable-type=lv
- hangul-syllable-type=lvt
- hangul-syllable-type=v
- extend spacing-mark zero-width-joiner)
- ctx)
- (else (break i ctx))))))
-
- (define (state:hangul-syllable-type=v i ctx)
- (let ((gcb (get-gcb i)))
- (normal-transition gcb i
- (case gcb
- ((hangul-syllable-type=t hangul-syllable-type=v
- extend spacing-mark zero-width-joiner)
- ctx)
- (else (break i ctx))))))
-
- (define (state:hangul-syllable-type=t i ctx)
- (let ((gcb (get-gcb i)))
- (normal-transition gcb i
- (case gcb
- ((hangul-syllable-type=t extend spacing-mark zero-width-joiner) ctx)
- (else (break i ctx))))))
-
- (define (state:prepend i ctx)
- (let ((gcb (get-gcb i)))
- (normal-transition gcb i
- (case gcb
- ((control carriage-return linefeed end-of-text) (break i ctx))
- (else ctx)))))
-
- (define (state:emoji-base i ctx)
- (let ((gcb (get-gcb i)))
- (normal-transition (if (eq? gcb 'extend) 'emoji-base gcb) i
- (case gcb
- ((emoji-modifier extend spacing-mark zero-width-joiner) ctx)
- (else (break i ctx))))))
-
- (define (state:regional-indicator i ctx)
- (let ((gcb
- (let ((gcb (get-gcb i)))
- (if (eq? gcb 'regional-indicator)
- 'extend
- gcb))))
- (normal-transition gcb i
- (case gcb
- ((extend spacing-mark zero-width-joiner) ctx)
- (else (break i ctx))))))
-
- (define (state:end-of-text i ctx)
- (declare (ignore i))
- ctx)
-
- (define (transition state i ctx)
- (state (fix:+ i 1) ctx))
-
- (define (normal-transition gcb i ctx)
- (transition (gcb->state gcb) i ctx))
-
- (define (gcb->state gcb)
- (case gcb
- ((control linefeed) state:control)
- ((carriage-return) state:carriage-return)
- ((emoji-base emoji-base-gaz) state:emoji-base)
- ((emoji-modifier extend glue-after-zero-width-joiner spacing-mark other)
- state:extend)
- ((hangul-syllable-type=l) state:hangul-syllable-type=l)
- ((hangul-syllable-type=t hangul-syllable-type=lvt)
- state:hangul-syllable-type=t)
- ((hangul-syllable-type=v hangul-syllable-type=lv)
- state:hangul-syllable-type=v)
- ((prepend) state:prepend)
- ((regional-indicator) state:regional-indicator)
- ((zero-width-joiner) state:zero-width-joiner)
- ((end-of-text) state:end-of-text)
- (else (error "Unknown gcb value:" gcb))))
-
(define (get-gcb i)
- (if (fix:< i n)
- (ucd-gcb-value (string-ref string i))
- 'end-of-text))
+ (and (fix:< i n)
+ (ucd-gcb-value (string-ref string i))))
+
+ (define (transition gcb i ctx)
+ (if gcb
+ (let ((i* (fix:+ i 1)))
+ ((vector-ref gcb-states gcb)
+ (get-gcb i*)
+ (lambda (gcb* break?)
+ (transition gcb* i* (if break? (break i* ctx) ctx)))))
+ ctx))
(if (fix:> n 0)
- (normal-transition (get-gcb 0) 0 (break 0 initial-ctx))
+ (transition (get-gcb 0) 0 (break 0 initial-ctx))
initial-ctx)))
\f
(define (list->string chars)