From 63bde0def818d11a257264fd1a37b45c244d8b63 Mon Sep 17 00:00:00 2001 From: Chris Hanson Date: Thu, 23 Feb 2017 23:57:01 -0800 Subject: [PATCH] Implement count-grapheme-clusters. --- src/runtime/runtime.pkg | 5 +- src/runtime/ustring.scm | 141 +++++++++++++++++++++++++++++++++++++++- 2 files changed, 144 insertions(+), 2 deletions(-) diff --git a/src/runtime/runtime.pkg b/src/runtime/runtime.pkg index a7a8411dd..46610ca57 100644 --- a/src/runtime/runtime.pkg +++ b/src/runtime/runtime.pkg @@ -1029,6 +1029,7 @@ USA. substring=?) (export () (substring string-copy) + count-grapheme-clusters list->string make-string string @@ -1302,6 +1303,7 @@ USA. "ucd-table-cwu" "ucd-table-dm" "ucd-table-gc" + "ucd-table-gcb" "ucd-table-lc" "ucd-table-lower" "ucd-table-nfd_qc" @@ -1341,7 +1343,8 @@ USA. char-changes-when-upper-cased? char-nfd-quick-check? ucd-ccc-value - ucd-dm-value)) + ucd-dm-value + ucd-gcb-value)) (define-package (runtime ucd-glue) (files "ucd-glue") diff --git a/src/runtime/ustring.scm b/src/runtime/ustring.scm index 1f34cc266..e5f6f26a3 100644 --- a/src/runtime/ustring.scm +++ b/src/runtime/ustring.scm @@ -474,10 +474,19 @@ USA. #t)))) (define (string->nfd string) - (if (string-in-nfd? string) + (if (or (string-ascii? string) ;ASCII unaffected by normalization + (string-in-nfd? string)) string (canonical-ordering! (canonical-decomposition string)))) +(define (string-ascii? string) + (let ((n (string-length string))) + (let loop ((i 0)) + (if (fix:< i n) + (and (char-ascii? (string-ref string i)) + (loop (fix:+ i 1))) + #t)))) + (define (string-in-nfd? string) (let ((n (string-length string))) (let loop ((i 0) (last-ccc 0)) @@ -544,6 +553,136 @@ USA. result)))) |# +(define (count-grapheme-clusters string) + (let ((breaks + (find-grapheme-cluster-breaks string + 0 + (lambda (i count) + (declare (ignore i)) + (fix:+ count 1))))) + (if (fix:> breaks 0) + (fix:- breaks 1) + breaks))) + +(define (find-grapheme-cluster-breaks string initial-ctx break) + (let ((n (string-length string))) + + (define (state:control i ctx) + (normal-transition (get-gcb i) i (break i ctx))) + + (define (state:carriage-return i ctx) + (let ((gcb (get-gcb i))) + (normal-transition gcb i + (case gcb + ((linefeed) ctx) + (else (break i ctx)))))) + + (define (state:extend i ctx) + (let ((gcb (get-gcb i))) + (normal-transition gcb i + (case gcb + ((extend spacing-mark zero-width-joiner) ctx) + (else (break i ctx)))))) + + (define (state:zero-width-joiner i ctx) + (let ((gcb (get-gcb i))) + (normal-transition gcb i + (case gcb + ((emoji-base-gaz glue-after-zero-width-joiner + extend spacing-mark zero-width-joiner) + ctx) + (else (break i ctx)))))) + + (define (state:hangul-syllable-type=l i ctx) + (let ((gcb (get-gcb i))) + (normal-transition gcb i + (case gcb + ((hangul-syllable-type=l + hangul-syllable-type=lv + hangul-syllable-type=lvt + hangul-syllable-type=v + extend spacing-mark zero-width-joiner) + ctx) + (else (break i ctx)))))) + + (define (state:hangul-syllable-type=v i ctx) + (let ((gcb (get-gcb i))) + (normal-transition gcb i + (case gcb + ((hangul-syllable-type=t hangul-syllable-type=v + extend spacing-mark zero-width-joiner) + ctx) + (else (break i ctx)))))) + + (define (state:hangul-syllable-type=t i ctx) + (let ((gcb (get-gcb i))) + (normal-transition gcb i + (case gcb + ((hangul-syllable-type=t extend spacing-mark zero-width-joiner) ctx) + (else (break i ctx)))))) + + (define (state:prepend i ctx) + (let ((gcb (get-gcb i))) + (normal-transition gcb i + (case gcb + ((control carriage-return linefeed end-of-text) (break i ctx)) + (else ctx))))) + + (define (state:emoji-base i ctx) + (let ((gcb (get-gcb i))) + (normal-transition (if (eq? gcb 'extend) 'emoji-base gcb) i + (case gcb + ((emoji-modifier extend spacing-mark zero-width-joiner) ctx) + (else (break i ctx)))))) + + (define (state:regional-indicator i ctx) + (let ((gcb + (let ((gcb (get-gcb i))) + (if (eq? gcb 'regional-indicator) + 'extend + gcb)))) + (normal-transition gcb i + (case gcb + ((extend spacing-mark zero-width-joiner) ctx) + (else (break i ctx)))))) + + (define (state:end-of-text i ctx) + (declare (ignore i)) + ctx) + + (define (transition state i ctx) + (state (fix:+ i 1) ctx)) + + (define (normal-transition gcb i ctx) + (transition (gcb->state gcb) i ctx)) + + (define (gcb->state gcb) + (case gcb + ((control linefeed) state:control) + ((carriage-return) state:carriage-return) + ((emoji-base emoji-base-gaz) state:emoji-base) + ((emoji-modifier extend glue-after-zero-width-joiner spacing-mark other) + state:extend) + ((hangul-syllable-type=l) state:hangul-syllable-type=l) + ((hangul-syllable-type=t hangul-syllable-type=lvt) + state:hangul-syllable-type=t) + ((hangul-syllable-type=v hangul-syllable-type=lv) + state:hangul-syllable-type=v) + ((prepend) state:prepend) + ((regional-indicator) state:regional-indicator) + ((zero-width-joiner) state:zero-width-joiner) + ((end-of-text) state:end-of-text) + (else (error "Unknown gcb value:" gcb)))) + + (define (get-gcb i) + (if (fix:< i n) + (ucd-gcb-value (string-ref string i)) + 'end-of-text)) + + (if (fix:> n 0) + (normal-transition (get-gcb 0) 0 (break 0 initial-ctx)) + initial-ctx))) + (define (list->string chars) (if (every char-8-bit? chars) (let ((string (legacy-string-allocate (length chars)))) -- 2.25.1