From: Chris Hanson Date: Fri, 12 May 2017 03:14:23 +0000 (-0700) Subject: Implement a basic test for testing string searches. X-Git-Tag: mit-scheme-pucked-9.2.12~14^2~60 X-Git-Url: https://birchwood-abbey.net/git?a=commitdiff_plain;h=b50288c5b50ba7a31becc7332c5448bf122a2cd7;p=mit-scheme.git Implement a basic test for testing string searches. --- diff --git a/tests/check.scm b/tests/check.scm index cce540fb7..5e07df496 100644 --- a/tests/check.scm +++ b/tests/check.scm @@ -71,6 +71,7 @@ USA. "runtime/test-rgxcmp" "runtime/test-string" "runtime/test-string-normalization" + "runtime/test-string-search" "runtime/test-thread-queue" "runtime/test-url" ("runtime/test-wttree" (runtime wt-tree)) diff --git a/tests/runtime/test-string-search-data b/tests/runtime/test-string-search-data new file mode 100644 index 000000000..d5fbd5361 --- /dev/null +++ b/tests/runtime/test-string-search-data @@ -0,0 +1,2204 @@ +#| -*-Scheme-*- + +Copyright (C) 1986, 1987, 1988, 1989, 1990, 1991, 1992, 1993, 1994, + 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, + 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, + 2017 Massachusetts Institute of Technology + +This file is part of MIT/GNU Scheme. + +MIT/GNU Scheme is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or (at +your option) any later version. + +MIT/GNU Scheme is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +General Public License for more details. + +You should have received a copy of the GNU General Public License +along with MIT/GNU Scheme; if not, write to the Free Software +Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, +USA. + +|# + +;;;; Unicode strings +;;; package: (runtime ustring) + +;;; For simplicity, the implementation uses a 24-bit encoding for non-8-bit +;;; strings. This is not a good long-term approach and should be revisited once +;;; the runtime system has been converted to this string abstraction. + +(declare (usual-integrations)) + +(define-primitives + (allocate-nm-vector 2) + (legacy-string? string? 1) + (legacy-string-allocate string-allocate 1) + (primitive-byte-ref 2) + (primitive-byte-set! 3) + (primitive-datum-ref 2) + (primitive-datum-set! 3) + (primitive-type-ref 2) + (primitive-type-set! 3)) + +(define-integrable (ustring? object) + (object-type? (ucode-type unicode-string) object)) + +(define (mutable-string? object) + (%string-mutable? object (lambda () #f))) + +(define (string-mutable? string) + (%string-mutable? string + (lambda () + (error:not-a string? string 'string-mutable?)))) + +(define (%string-mutable? string fail) + (cond ((legacy-string? string)) + ((ustring? string) (%ustring-mutable? string)) + ((slice? string) (slice-mutable? string)) + (else (fail)))) + +(define (immutable-string? object) + (%string-immutable? object (lambda () #f))) + +(define (string-immutable? string) + (%string-immutable? string + (lambda () + (error:not-a string? string 'string-immutable?)))) + +(define (%string-immutable? string fail) + (cond ((legacy-string? string) #f) + ((ustring? string) (%ustring-immutable? string)) + ((slice? string) (not (slice-mutable? string))) + (else (fail)))) + +(define (register-ustring-predicates!) + (register-predicate! string? 'string) + (register-predicate! mutable-string? 'mutable-string '<= string?) + (register-predicate! immutable-string? 'immutable-string '<= string?) + (register-predicate! nfc-string? 'nfc-string '<= string?) + (register-predicate! legacy-string? 'legacy-string + '<= string? + '<= mutable-string?) + (register-predicate! ustring? 'unicode-string '<= string?) + (register-predicate! slice? 'string-slice '<= string?) + (register-predicate! 8-bit-string? '8-bit-string '<= string?)) + +;;;; Unicode string layout + +(select-on-bytes-per-word + ;; 32-bit words + (begin + (define-integrable byte->object-offset 3) + (define-integrable byte->object-shift -2) + (define-integrable byte0-index 8)) + ;; 64-bit words + (begin + (define-integrable byte->object-offset 7) + (define-integrable byte->object-shift -3) + (define-integrable byte0-index 16))) + +(define (%ustring-allocate n-bytes length cp-size) + (let ((string + (allocate-nm-vector (ucode-type unicode-string) + (fix:+ 1 + (fix:lsh (fix:+ n-bytes byte->object-offset) + byte->object-shift))))) + (%set-ustring-length! string length) + (%set-ustring-flags! string cp-size) ;assumes cp-size in bottom bits + string)) + +(define-integrable (ustring-length string) + (primitive-datum-ref string 1)) + +(define-integrable (%set-ustring-length! string length) + (primitive-datum-set! string 1 length)) + +(define-integrable (%ustring-flags string) + (primitive-type-ref string 1)) + +(define-integrable (%set-ustring-flags! string flags) + (primitive-type-set! string 1 flags)) + +(define (%ustring-cp-size string) + (fix:and #x03 (%ustring-flags string))) + +(define (%set-ustring-cp-size! string cp-size) + (%set-ustring-flags! string + (fix:or (fix:andc (%ustring-flags string) #x03) + cp-size))) + +(define-integrable (%ustring-mutable? string) + (fix:= 0 (%ustring-cp-size string))) + +(define-integrable (%ustring-immutable? string) + (not (%ustring-mutable? string))) + +(define-integrable flag:nfc #x04) +(define-integrable flag:nfc-set #x08) +(define-integrable flag:nfd #x10) + +(define-integrable (%make-flag-tester flag) + (lambda (string) + (not (fix:= 0 (fix:and flag (%ustring-flags string)))))) + +(define ustring-in-nfc? (%make-flag-tester flag:nfc)) +(define ustring-in-nfc-set? (%make-flag-tester flag:nfc-set)) +(define ustring-in-nfd? (%make-flag-tester flag:nfd)) + +(define (ustring-in-nfc! string nfc?) + (%set-ustring-flags! string + (fix:or (fix:andc (%ustring-flags string) + (fix:or flag:nfc flag:nfc-set)) + (if nfc? + (fix:or flag:nfc flag:nfc-set) + flag:nfc-set)))) + +(define (ustring-in-nfd! string nfd?) + (%set-ustring-flags! string + (if nfd? + (fix:or (%ustring-flags string) flag:nfd) + (fix:andc (%ustring-flags string) flag:nfd)))) + +(define-integrable (ustring1-ref string index) + (integer->char (cp1-ref string index))) + +(define-integrable (ustring1-set! string index char) + (cp1-set! string index (char->integer char))) + +(define-integrable (cp1-ref string index) + (primitive-byte-ref string (cp1-index index))) + +(define-integrable (cp1-set! string index cp) + (primitive-byte-set! string (cp1-index index) cp)) + +(define-integrable (cp1-index index) + (fix:+ byte0-index index)) + +(define-integrable (ustring2-ref string index) + (integer->char (cp2-ref string index))) + +(define-integrable (ustring2-set! string index char) + (cp2-set! string index (char->integer char))) + +(define (cp2-ref string index) + (let ((i (cp2-index index))) + (fix:or (primitive-byte-ref string i) + (fix:lsh (primitive-byte-ref string (fix:+ i 1)) 8)))) + +(define (cp2-set! string index cp) + (let ((i (cp2-index index))) + (primitive-byte-set! string i (fix:and cp #xFF)) + (primitive-byte-set! string (fix:+ i 1) (fix:lsh cp -8)))) + +(define-integrable (cp2-index index) + (fix:+ byte0-index (fix:* 2 index))) + +(define-integrable (ustring3-ref string index) + (integer->char (cp3-ref string index))) + +(define-integrable (ustring3-set! string index char) + (cp3-set! string index (char->integer char))) + +(define (cp3-ref string index) + (let ((i (cp3-index index))) + (fix:or (primitive-byte-ref string i) + (fix:or (fix:lsh (primitive-byte-ref string (fix:+ i 1)) 8) + (fix:lsh (primitive-byte-ref string (fix:+ i 2)) 16))))) + +(define (cp3-set! string index cp) + (let ((i (cp3-index index))) + (primitive-byte-set! string i (fix:and cp #xFF)) + (primitive-byte-set! string (fix:+ i 1) (fix:and (fix:lsh cp -8) #xFF)) + (primitive-byte-set! string (fix:+ i 2) (fix:lsh cp -16)))) + +(define-integrable (cp3-index index) + (fix:+ byte0-index (fix:* 3 index))) + +(define (mutable-ustring-allocate n) + (%ustring-allocate (fix:* 3 n) n 0)) + +(define (immutable-ustring-allocate n max-cp) + (cond ((fix:< max-cp #x100) + (let ((s (%ustring-allocate (fix:+ n 1) n 1))) + (ustring-in-nfc! s #t) + (if (fix:< max-cp #xC0) + (ustring-in-nfd! s #t)) + (ustring1-set! s n #\null) ;zero-terminate for C + s)) + ((fix:< max-cp #x10000) + (let ((s (%ustring-allocate (fix:* 2 n) n 2))) + (if (fix:< max-cp #x300) + (ustring-in-nfc! s #t)) + s)) + (else + (%ustring-allocate (fix:* 3 n) n 3)))) + +;;; Used during cold load. +(define (%ustring1? object) + (or (and (ustring? object) + (fix:= 1 (%ustring-cp-size object))) + (legacy-string? object))) + +;;; Used during cold load. +(define (%ascii-ustring! string) + (%set-ustring-cp-size! string 1) + (ustring-in-nfc! string #t) + (ustring-in-nfd! string #t)) + +;;; Used during cold load. +(define (%ascii-ustring-allocate n) + (let ((s (%ustring-allocate (fix:+ n 1) n 1))) + (ustring-in-nfc! s #t) + (ustring-in-nfd! s #t) + (ustring1-set! s n #\null) ;zero-terminate for C + s)) + +(define (ustring-ref string index) + (case (ustring-cp-size string) + ((1) (ustring1-ref string index)) + ((2) (ustring2-ref string index)) + (else (ustring3-ref string index)))) + +(define (ustring-set! string index char) + (case (ustring-cp-size string) + ((1) (ustring1-set! string index char)) + ((2) (ustring2-set! string index char)) + (else (ustring3-set! string index char)))) + +(define (ustring-cp-size string) + (if (legacy-string? string) + 1 + (%ustring-cp-size string))) + +(define (mutable-ustring? object) + (or (legacy-string? object) + (and (ustring? object) + (%ustring-mutable? object)))) + +(define (ustring-mutable? string) + (or (legacy-string? string) + (%ustring-mutable? string))) + +;;;; String slices + +(define (slice? object) + (and (%record? object) + (fix:= 4 (%record-length object)) + (eq? %slice-tag (%record-ref object 0)))) + +(define-integrable (make-slice string start length) + (%record %slice-tag string start length)) + +(define-integrable %slice-tag + '|#[(runtime ustring)slice]|) + +(define-integrable (slice-string slice) (%record-ref slice 1)) +(define-integrable (slice-start slice) (%record-ref slice 2)) +(define-integrable (slice-length slice) (%record-ref slice 3)) + +(define (slice-end slice) + (fix:+ (slice-start slice) (slice-length slice))) + +(define (slice-mutable? slice) + (ustring-mutable? (slice-string slice))) + +(define (unpack-slice string k) + (if (slice? string) + (k (slice-string string) (slice-start string) (slice-end string)) + (k string 0 (ustring-length string)))) + +(define (translate-slice string start end k) + (if (slice? string) + (k (slice-string string) + (fix:+ (slice-start string) start) + (fix:+ (slice-start string) end)) + (k string start end))) + +;;;; Basic operations + +(define (string? object) + (or (legacy-string? object) + (ustring? object) + (slice? object))) + +(define (make-string k #!optional char) + (guarantee index-fixnum? k 'make-string) + (let ((string (mutable-ustring-allocate k))) + (if (not (default-object? char)) + (do ((i 0 (fix:+ i 1))) + ((not (fix:< i k))) + (ustring3-set! string i char))) + string)) + +(define (string-length string) + (cond ((or (legacy-string? string) (ustring? string)) (ustring-length string)) + ((slice? string) (slice-length string)) + (else (error:not-a string? string 'string-length)))) + +(define (string-ref string index) + (guarantee index-fixnum? index 'string-ref) + (cond ((or (legacy-string? string) (ustring? string)) + (if (not (fix:< index (ustring-length string))) + (error:bad-range-argument index 'string-ref)) + (ustring-ref string index)) + ((slice? string) + (if (not (fix:< index (slice-length string))) + (error:bad-range-argument index 'string-ref)) + (ustring-ref (slice-string string) + (fix:+ (slice-start string) index))) + (else + (error:not-a string? string 'string-ref)))) + +(define (string-set! string index char) + (guarantee mutable-string? string 'string-set!) + (guarantee index-fixnum? index 'string-set!) + (guarantee bitless-char? char 'string-set!) + (if (not (fix:< index (string-length string))) + (error:bad-range-argument index 'string-set!)) + (if (slice? string) + (ustring-set! (slice-string string) + (fix:+ (slice-start string) index) + char) + (ustring-set! string index char))) + +;;;; Slice/Copy + +(define (string-slice string #!optional start end) + (let* ((len (string-length string)) + (end (fix:end-index end len 'string-slice)) + (start (fix:start-index start end 'string-slice))) + (cond ((and (fix:= start 0) (fix:= end len)) + string) + ((slice? string) + (make-slice (slice-string string) + (fix:+ (slice-start string) start) + (fix:- end start))) + (else + (make-slice string + start + (fix:- end start)))))) + +(define (string-copy! to at from #!optional start end) + (let* ((end (fix:end-index end (string-length from) 'string-copy!)) + (start (fix:start-index start end 'string-copy!))) + (guarantee index-fixnum? at 'string-copy!) + (let ((final-at (fix:+ at (fix:- end start)))) + (if (not (fix:<= final-at (string-length to))) + (error:bad-range-argument at 'string-copy!)) + (if (not (string-mutable? to)) + (error:bad-range-argument to 'string-copy!)) + (receive (to at) + (if (slice? to) + (values (slice-string to) + (fix:+ (slice-start to) at)) + (values to at)) + (translate-slice from start end + (lambda (from start end) + (%general-copy! to at from start end)))) + final-at))) + +(define (string-copy string #!optional start end) + (let* ((end (fix:end-index end (string-length string) 'string-copy)) + (start (fix:start-index start end 'string-copy))) + (translate-slice string start end + (lambda (string start end) + (let* ((n (fix:- end start)) + (to + (if (legacy-string? string) + (legacy-string-allocate n) + (mutable-ustring-allocate n)))) + (%general-copy! to 0 string start end) + to))))) + +(define (substring string #!optional start end) + (let* ((len (string-length string)) + (end (fix:end-index end (string-length string) 'substring)) + (start (fix:start-index start end 'substring))) + ;; It shouldn't be necessary to copy immutable substrings, but some of these + ;; find their way to Edwin so we can't return a slice here. We will + ;; probably need to implement a procedure to map an arbitrary string to a + ;; legacy string for Edwin's use. + (if (and (fix:= start 0) + (fix:= end len) + (not (slice? string)) + (ustring-in-nfc? string)) + string + (translate-slice string start end + (lambda (string start end) + (let ((to + (immutable-ustring-allocate + (fix:- end start) + (%general-max-cp string start end)))) + (%general-copy! to 0 string start end) + to)))))) + +(define (string-head string end) + (substring string 0 end)) + +(define (string-tail string start) + (substring string start)) + +(define (%general-copy! to at from start end) + + (define-integrable (copy! j i o) + (primitive-byte-set! to (fix:+ j o) (primitive-byte-ref from (fix:+ i o)))) + + (define-integrable (zero! j o) + (primitive-byte-set! to (fix:+ j o) 0)) + + (case (ustring-cp-size from) + ((1) + (let ((start (cp1-index start)) + (end (cp1-index end))) + (case (ustring-cp-size to) + ((1) + (do ((i start (fix:+ i 1)) + (j (cp1-index at) (fix:+ j 1))) + ((not (fix:< i end))) + (copy! j i 0))) + ((2) + (do ((i start (fix:+ i 1)) + (j (cp2-index at) (fix:+ j 2))) + ((not (fix:< i end))) + (copy! j i 0) + (zero! j 1))) + (else + (do ((i start (fix:+ i 1)) + (j (cp3-index at) (fix:+ j 3))) + ((not (fix:< i end))) + (copy! j i 0) + (zero! j 1) + (zero! j 2)))))) + ((2) + (let ((start (cp2-index start)) + (end (cp2-index end))) + (case (ustring-cp-size to) + ((1) + (do ((i start (fix:+ i 2)) + (j (cp1-index at) (fix:+ j 1))) + ((not (fix:< i end))) + (copy! j i 0))) + ((2) + (do ((i start (fix:+ i 2)) + (j (cp2-index at) (fix:+ j 2))) + ((not (fix:< i end))) + (copy! j i 0) + (copy! j i 1))) + (else + (do ((i start (fix:+ i 2)) + (j (cp3-index at) (fix:+ j 3))) + ((not (fix:< i end))) + (copy! j i 0) + (copy! j i 1) + (zero! j 2)))))) + (else + (let ((start (cp3-index start)) + (end (cp3-index end))) + (case (ustring-cp-size to) + ((1) + (do ((i start (fix:+ i 3)) + (j (cp1-index at) (fix:+ j 1))) + ((not (fix:< i end))) + (copy! j i 0))) + ((2) + (do ((i start (fix:+ i 3)) + (j (cp2-index at) (fix:+ j 2))) + ((not (fix:< i end))) + (copy! j i 0) + (copy! j i 1))) + (else + (do ((i start (fix:+ i 3)) + (j (cp3-index at) (fix:+ j 3))) + ((not (fix:< i end))) + (copy! j i 0) + (copy! j i 1) + (copy! j i 2)))))))) + +(define (%general-max-cp string start end) + + (define-integrable (max-loop cp-ref) + (do ((i start (fix:+ i 1)) + (max-cp 0 + (let ((cp (cp-ref string i))) + (if (fix:> cp max-cp) + cp + max-cp)))) + ((not (fix:< i end)) max-cp))) + + (case (ustring-cp-size string) + ((1) (max-loop cp1-ref)) + ((2) (max-loop cp2-ref)) + (else (max-loop cp3-ref)))) + +(define (%string->immutable string) + (if (and (ustring? string) (%ustring-immutable? string)) + string + (unpack-slice string + (lambda (string* start end) + (let ((result + (immutable-ustring-allocate + (fix:- end start) + (%general-max-cp string* start end)))) + (%general-copy! result 0 string* start end) + result))))) + +;;;; Streaming builder + +(define (string-builder #!optional buffer-length) + (let ((builder + (%make-string-builder + (if (default-object? buffer-length) + 16 + (begin + (guarantee positive-fixnum? buffer-length 'string-builder) + buffer-length))))) + (let ((append-char! (builder 'append-char!)) + (append-string! (builder 'append-string!)) + (build (builder 'build))) + (lambda (#!optional object) + (cond ((bitless-char? object) (append-char! object)) + ((string? object) (append-string! object)) + (else + (case object + ((#!default nfc) (build build-string:nfc)) + ((immutable) (build build-string:immutable)) + ((mutable) (build build-string:mutable)) + ((legacy) (build build-string:legacy)) + ((empty? count max-cp reset!) ((builder object))) + (else (error "Unsupported argument:" object))))))))) + +(define (build-string:nfc strings count max-cp) + (string->nfc (build-string:immutable strings count max-cp))) + +(define (build-string:immutable strings count max-cp) + (let ((result (immutable-ustring-allocate count max-cp))) + (fill-result! strings result) + result)) + +(define (build-string:mutable strings count max-cp) + (declare (ignore max-cp)) + (let ((result (mutable-ustring-allocate count))) + (fill-result! strings result) + result)) + +(define (build-string:legacy strings count max-cp) + (if (not (fix:< max-cp #x100)) + (error "Can't build legacy string:" max-cp)) + (let ((result (legacy-string-allocate count))) + (fill-result! strings result) + result)) + +(define (fill-result! strings result) + (do ((strings strings (cdr strings)) + (i 0 (fix:+ i (string-length (car strings))))) + ((not (pair? strings))) + (unpack-slice (car strings) + (lambda (string start end) + (%general-copy! result i string start end))))) + +(define (%make-string-builder buffer-length) + (let ((buffers) + (buffer) + (start) + (index) + (count) + (max-cp)) + + (define (reset!) + (set! buffers '()) + (set! buffer (mutable-ustring-allocate buffer-length)) + (set! start 0) + (set! index 0) + (set! count 0) + (set! max-cp 0) + unspecific) + + (define (get-partial) + (string-slice buffer start index)) + + (define (append-char! char) + (ustring3-set! buffer index char) + (set! index (fix:+ index 1)) + (set! count (fix:+ count 1)) + (set! max-cp (fix:max max-cp (char->integer char))) + (if (not (fix:< index buffer-length)) + (begin + (set! buffers (cons (get-partial) buffers)) + (set! buffer (mutable-ustring-allocate buffer-length)) + (set! start 0) + (set! index 0) + unspecific))) + + (define (append-string! string) + (let ((length (string-length string))) + (if (fix:> length 0) + (begin + (if (fix:> index start) + (begin + (set! buffers (cons (get-partial) buffers)) + (set! start index))) + (set! buffers (cons string buffers)) + (set! count (fix:+ count length)) + (set! max-cp + (fix:max max-cp + (unpack-slice string %general-max-cp))) + unspecific)))) + + (define (build finish) + (finish (reverse + (if (fix:> index start) + (cons (get-partial) buffers) + buffers)) + count + max-cp)) + + (reset!) + (lambda (operator) + (case operator + ((append-char!) append-char!) + ((append-string!) append-string!) + ((build) build) + ((empty?) (lambda () (fix:= count 0))) + ((count) (lambda () count)) + ((max-cp) (lambda () max-cp)) + ((reset!) reset!) + (else (error "Unknown operator:" operator)))))) + +;;;; Compare + +(define (string-compare string1 string2 if= if< if>) + (%string-compare (string->nfc string1) + (string->nfc string2) + if= if< if>)) + +(define (string-compare-ci string1 string2 if= if< if>) + (%string-compare (string-foldcase string1) + (string-foldcase string2) + if= if< if>)) + +;; Non-Unicode implementation, acceptable to R7RS. +(define-integrable (%string-compare string1 string2 if= if< if>) + (let ((end1 (string-length string1)) + (end2 (string-length string2))) + (let ((end (fix:min end1 end2))) + (let loop ((i 0)) + (if (fix:< i end) + (let ((c1 (string-ref string1 i)) + (c2 (string-ref string2 i))) + (cond ((char)) + (else (loop (fix:+ i 1))))) + (cond ((fix:< end1 end2) (if<)) + ((fix:< end2 end1) (if>)) + (else (if=)))))))) + +(define-integrable (true) #t) +(define-integrable (false) #f) + +(define-integrable (%string-comparison-maker if= if< if>) + (lambda (string1 string2) + (%string-compare string1 string2 if= if< if>))) + +(define %string=? (%string-comparison-maker true false false)) +(define %string? (%string-comparison-maker false false true)) +(define %string>=? (%string-comparison-maker true false true)) + +(define-integrable (string-comparison-maker preprocess compare) + (lambda (string1 string2 . strings) + (let loop + ((string1 (preprocess string1)) + (string2 (preprocess string2)) + (strings strings)) + (if (pair? strings) + (and (compare string1 string2) + (loop string2 (preprocess (car strings)) (cdr strings))) + (compare string1 string2))))) + +(define string=? (string-comparison-maker string->nfc %string=?)) +(define stringnfc %stringnfc %string<=?)) +(define string>? (string-comparison-maker string->nfc %string>?)) +(define string>=? (string-comparison-maker string->nfc %string>=?)) + +(define string-ci=? (string-comparison-maker string-foldcase %string=?)) +(define string-ci? (string-comparison-maker string-foldcase %string>?)) +(define string-ci>=? (string-comparison-maker string-foldcase %string>=?)) + +;;;; Match + +(define (string-match-forward string1 string2) + (guarantee nfc-string? string1 'string-match-forward) + (guarantee nfc-string? string2 'string-match-forward) + (let ((end1 (string-length string1)) + (end2 (string-length string2))) + (let ((end (fix:min end1 end2))) + (let loop ((i 0)) + (if (and (fix:< i end) + (char=? (string-ref string1 i) + (string-ref string2 i))) + (loop (fix:+ i 1)) + i))))) + +(define (string-match-backward string1 string2) + (guarantee nfc-string? string1 'string-match-backward) + (guarantee nfc-string? string2 'string-match-backward) + (let ((s1 (fix:- (string-length string1) 1))) + (let loop ((i s1) (j (fix:- (string-length string2) 1))) + (if (and (fix:>= i 0) + (fix:>= j 0) + (char=? (string-ref string1 i) + (string-ref string2 j))) + (loop (fix:- i 1) + (fix:- j 1)) + (fix:- s1 i))))) + +(define (string-prefix? prefix string #!optional start end) + (%string-prefix? (string->nfc prefix) + (string->nfc (string-slice string start end)))) + +(define (string-prefix-ci? prefix string #!optional start end) + (%string-prefix? (string-foldcase prefix) + (string-foldcase (string-slice string start end)))) + +(define (%string-prefix? prefix string) + (let ((n (string-length prefix))) + (and (fix:<= n (string-length string)) + (let loop ((i 0) (j 0)) + (if (fix:< i n) + (and (eq? (string-ref prefix i) (string-ref string j)) + (loop (fix:+ i 1) (fix:+ j 1))) + #t))))) + +(define (string-suffix? suffix string #!optional start end) + (%string-suffix? (string->nfc suffix) + (string->nfc (string-slice string start end)))) + +(define (string-suffix-ci? suffix string #!optional start end) + (%string-suffix? (string-foldcase suffix) + (string-foldcase (string-slice string start end)))) + +(define (%string-suffix? suffix string) + (let ((n (string-length suffix)) + (n* (string-length string))) + (and (fix:<= n n*) + (let loop ((i 0) (j (fix:- n* n))) + (if (fix:< i n) + (and (eq? (string-ref suffix i) (string-ref string j)) + (loop (fix:+ i 1) (fix:+ j 1))) + #t))))) + +;;;; Case + +(define (string-downcase string) + (case-transform ucd-lc-value string)) + +(define (string-foldcase string) + (case-transform ucd-cf-value string)) + +(define (string-upcase string) + (case-transform ucd-uc-value string)) + +(define (case-transform transform string) + (let ((builder (string-builder)) + (end (string-length string))) + (do ((index 0 (fix:+ index 1))) + ((not (fix:< index end))) + (builder (transform (string-ref string index)))) + (builder))) + +(define (string-titlecase string) + (let ((builder (string-builder))) + (find-word-breaks string 0 + (lambda (end start) + (maybe-titlecase string start end builder) + end)) + (builder))) + +(define (maybe-titlecase string start end builder) + (let loop ((index start)) + (if (fix:< index end) + (let ((char (string-ref string index))) + (if (char-cased? char) + (begin + (builder (ucd-tc-value char)) + (do ((index (fix:+ index 1) (fix:+ index 1))) + ((not (fix:< index end))) + (builder (ucd-lc-value (string-ref string index))))) + (begin + (builder char) + (loop (fix:+ index 1)))))))) + +(define (string-lower-case? string) + (nfd-string-lower-case? (string->nfd string))) + +(define (string-upper-case? string) + (nfd-string-upper-case? (string->nfd string))) + +(define (nfd-string-lower-case? nfd) + (let ((end (string-length nfd))) + (let loop ((i 0)) + (if (fix:< i end) + (and (not (char-changes-when-lower-cased? (string-ref nfd i))) + (loop (fix:+ i 1))) + #t)))) + +(define (nfd-string-upper-case? nfd) + (let ((end (string-length nfd))) + (let loop ((i 0)) + (if (fix:< i end) + (and (not (char-changes-when-upper-cased? (string-ref nfd i))) + (loop (fix:+ i 1))) + #t)))) + +(define (nfd-string-case-folded? nfd) + (let ((end (string-length nfd))) + (let loop ((i 0)) + (if (fix:< i end) + (and (not (char-changes-when-case-folded? (string-ref nfd i))) + (loop (fix:+ i 1))) + #t)))) + +;;;; Normalization + +(define (nfc-string? string) + (and (string? string) + (string-in-nfc? string))) + +(define (string-in-nfc? string) + (let ((full-check + (lambda () + (let ((qc (string-nfc-qc string 'string-in-nfc?))) + (if (eq? qc 'maybe) + (%string=? string (%string->nfc string)) + qc))))) + (if (and (ustring? string) + (%ustring-immutable? string)) + (if (ustring-in-nfc-set? string) + (ustring-in-nfc? string) + (let ((nfc? (full-check))) + (ustring-in-nfc! string nfc?) + nfc?)) + (full-check)))) + +(define (string->nfc string) + (if (and (ustring? string) + (%ustring-immutable? string)) + (if (ustring-in-nfc-set? string) + string + (let ((nfc + (case (string-nfc-qc string 'string->nfc) + ((#t) + string) + ((maybe) + (let ((nfc (%string->nfc string))) + (if (%string=? string nfc) + string + nfc))) + (else + (%string->nfc string))))) + (ustring-in-nfc! nfc #t) + nfc)) + (let ((nfc + (if (eq? #t (string-nfc-qc string 'string->nfc)) + (%string->immutable string) + (%string->nfc string)))) + (ustring-in-nfc! nfc #t) + nfc))) + +(define (%string->nfc string) + (canonical-composition + (if (string-in-nfd? string) + string + (canonical-decomposition&ordering string + (lambda (string* n max-cp) + (declare (ignore n max-cp)) + string*))))) + +(define (string-nfc-qc string caller) + (cond ((legacy-string? string) + #t) + ((ustring? string) + (if (and (%ustring-immutable? string) + (ustring-in-nfc-set? string)) + (ustring-in-nfc? string) + (ustring-nfc-qc string 0 (string-length string)))) + ((slice? string) + (unpack-slice string ustring-nfc-qc)) + (else + (error:not-a string? string caller)))) + +(define (ustring-nfc-qc string start end) + (let ((scan + (lambda (sref) + (let loop ((i start) (last-ccc 0) (result #t)) + (if (fix:< i end) + (let ((char (sref string i))) + (if (fix:< (char->integer char) #x300) + (loop (fix:+ i 1) 0 result) + (let ((ccc (ucd-ccc-value char))) + (and (or (fix:= ccc 0) (fix:>= ccc last-ccc)) + (case (ucd-nfc_qc-value char) + ((yes) (loop (fix:+ i 1) ccc result)) + ((maybe) (loop (fix:+ i 1) ccc 'maybe)) + (else #f)))))) + result))))) + (case (ustring-cp-size string) + ((1) #t) + ((2) (scan ustring2-ref)) + (else (scan ustring3-ref))))) + +(define (string-in-nfd? string) + (cond ((legacy-string? string) + (ustring-nfd-qc? string 0 (ustring-length string))) + ((ustring? string) + (or (ustring-in-nfd? string) + (ustring-nfd-qc? string 0 (ustring-length string)))) + ((slice? string) + (unpack-slice string ustring-nfd-qc?)) + (else + (error:not-a string? string 'string-in-nfd?)))) + +(define (ustring-nfd-qc? string start end) + (let ((scan + (lambda (sref) + (let loop ((i start) (last-ccc 0)) + (if (fix:< i end) + (let ((char (sref string i))) + (if (fix:< (char->integer char) #xC0) + (loop (fix:+ i 1) 0) + (let ((ccc (ucd-ccc-value char))) + (and (or (fix:= ccc 0) (fix:>= ccc last-ccc)) + (char-nfd-quick-check? char) + (loop (fix:+ i 1) ccc))))) + #t))))) + (case (ustring-cp-size string) + ((1) (scan ustring1-ref)) + ((2) (scan ustring2-ref)) + (else (scan ustring3-ref))))) + +(define (string->nfd string) + (if (string-in-nfd? string) + (let ((result (%string->immutable string))) + (ustring-in-nfd! result #t) + result) + (canonical-decomposition&ordering string + (lambda (string* n max-cp) + (let ((result (immutable-ustring-allocate n max-cp))) + (%general-copy! result 0 string* 0 n) + (ustring-in-nfd! result #t) + result))))) + +(define (canonical-decomposition&ordering string k) + (let ((end (string-length string)) + (builder (string-builder))) + (do ((i 0 (fix:+ i 1))) + ((not (fix:< i end))) + (let loop ((char (string-ref string i))) + (if (jamo-precomposed? char) + (jamo-decompose char builder) + (let ((dm (ucd-canonical-dm-value char))) + (cond ((eqv? dm char) + (builder char)) + ;; Canonical decomposition always length 1 or 2. + ;; First char might need recursion, second doesn't: + ((char? dm) + (loop dm)) + (else + (loop (string-ref dm 0)) + (builder (string-ref dm 1)))))))) + (let* ((string (builder 'mutable)) + (end (ustring-length string))) + + (define (scan-for-non-starter i) + (if (fix:< i end) + (let ((ccc (ucd-ccc-value (ustring3-ref string i)))) + (if (fix:= 0 ccc) + (scan-for-non-starter (fix:+ i 1)) + (scan-for-non-starter-pair (list ccc) (fix:+ i 1)))))) + + (define (scan-for-non-starter-pair previous i) + (if (fix:< i end) + (let ((ccc (ucd-ccc-value (ustring3-ref string i)))) + (if (fix:= 0 ccc) + (scan-for-non-starter (fix:+ i 1)) + (scan-for-non-starter-pair (maybe-twiddle previous i ccc) + (fix:+ i 1)))))) + + (define (maybe-twiddle previous i ccc) + (if (and (pair? previous) + (fix:< ccc (car previous))) + (begin + (let ((char (ustring3-ref string (fix:- i 1)))) + (ustring3-set! string (fix:- i 1) (ustring3-ref string i)) + (ustring3-set! string i char)) + (cons (car previous) + (maybe-twiddle (cdr previous) (fix:- i 1) ccc))) + (cons ccc previous))) + + (scan-for-non-starter 0) + (k string end (builder 'max-cp))))) + +(define (canonical-composition string) + (let ((end (string-length string)) + (builder (string-builder)) + (sk ucd-canonical-cm-second-keys) + (sv ucd-canonical-cm-second-values)) + + (define (scan-for-first-char i) + (if (fix:< i end) + (let ((fc (string-ref string i))) + (if (and (jamo-leading? fc) + (fix:< (fix:+ i 1) end) + (jamo-vowel? (string-ref string (fix:+ i 1)))) + (if (and (fix:< (fix:+ i 2) end) + (jamo-trailing? (string-ref string (fix:+ i 2)))) + (begin + (builder + (jamo-compose fc + (string-ref string (fix:+ i 1)) + (string-ref string (fix:+ i 2)))) + (scan-for-first-char (fix:+ i 3))) + (begin + (builder + (jamo-compose fc + (string-ref string (fix:+ i 1)) + #f)) + (scan-for-first-char (fix:+ i 2)))) + (test-first-char (fix:+ i 1) fc))))) + + (define (test-first-char i+1 fc) + (let ((fc-index (and (fix:< i+1 end) (ucd-canonical-cm-value fc)))) + (if fc-index + (let ((combiners (get-combiners i+1))) + (if (pair? combiners) + (let ((j (fix:+ i+1 (length combiners)))) + (scan-combiners fc fc-index combiners) + (scan-for-first-char j)) + (let ((fc* (match-second fc-index (string-ref string i+1)))) + (if fc* + (test-first-char (fix:+ i+1 1) fc*) + (begin + (builder fc) + (scan-for-first-char i+1)))))) + (begin + (builder fc) + (scan-for-first-char i+1))))) + + (define (get-combiners j) + (if (fix:< j end) + (let* ((char (string-ref string j)) + (ccc (ucd-ccc-value char))) + (if (fix:= 0 ccc) + '() + (cons (cons char ccc) (get-combiners (fix:+ j 1))))) + '())) + + (define (scan-combiners fc fc-index combiners) + (let loop ((cs combiners) (last-ccc 0)) + (if (pair? cs) + (let* ((c (car cs)) + (fc* + (and (fix:> (cdr c) last-ccc) + (match-second fc-index (car c))))) + (if fc* + (let ((fc-index* (ucd-canonical-cm-value fc*)) + (combiners* (remove-combiner! c combiners))) + (if fc-index* + (scan-combiners fc* fc-index* combiners*) + (done-matching fc* combiners*))) + (loop (cdr cs) (cdr c)))) + (done-matching fc combiners)))) + + (define (remove-combiner! combiner combiners) + (if (eq? combiner (car combiners)) + (cdr combiners) + (begin + (let loop ((this (cdr combiners)) (prev combiners)) + (if (eq? combiner (car this)) + (set-cdr! prev (cdr this)) + (loop (cdr this) this))) + combiners))) + + (define (done-matching fc combiners) + (builder fc) + (for-each (lambda (combiner) (builder (car combiner))) + combiners)) + + (define (match-second fc-index sc) + (let ((keys (vector-ref sk fc-index))) + (let loop ((start 0) (end (string-length keys))) + (and (fix:< start end) + (let ((m (fix:quotient (fix:+ start end) 2))) + (let ((key (string-ref keys m))) + (cond ((char= (char->integer char) jamo-leading-start) + (fix:< (char->integer char) jamo-leading-end))) + +(define (jamo-vowel? char) + (and (fix:>= (char->integer char) jamo-vowel-start) + (fix:< (char->integer char) jamo-vowel-end))) + +(define (jamo-trailing? char) + (and (fix:>= (char->integer char) jamo-trailing-start) + (fix:< (char->integer char) jamo-trailing-end))) + +(define (jamo-precomposed? char) + (and (fix:>= (char->integer char) jamo-precomposed-start) + (fix:< (char->integer char) jamo-precomposed-end))) + +(define (jamo-decompose precomposed builder) + (let ((pi (fix:- (char->integer precomposed) jamo-precomposed-start))) + (builder + (integer->char (fix:+ jamo-leading-start (fix:quotient pi jamo-ncount)))) + (builder + (integer->char + (fix:+ jamo-vowel-start + (fix:quotient (fix:remainder pi jamo-ncount) jamo-tcount)))) + (let ((ti (fix:remainder pi jamo-tcount))) + (if (fix:> ti 0) + (builder (integer->char (fix:+ jamo-tbase ti))))))) + +(define (jamo-compose leading vowel trailing) + (integer->char + (fix:+ jamo-precomposed-start + (fix:+ (fix:+ (fix:* (fix:- (char->integer leading) + jamo-leading-start) + jamo-ncount) + (fix:* (fix:- (char->integer vowel) + jamo-vowel-start) + jamo-tcount)) + (if trailing + (fix:- (char->integer trailing) jamo-tbase) + 0))))) + +;;;; Grapheme clusters + +(define (grapheme-cluster-length string) + (let ((breaks + (find-grapheme-cluster-breaks string + 0 + (lambda (i count) + (declare (ignore i)) + (fix:+ count 1))))) + (if (fix:> breaks 0) + (fix:- breaks 1) + breaks))) + +(define (grapheme-cluster-slice string start end) + ;; START and END refer to the cluster breaks, they must be <= the number of + ;; clusters in STRING. + (guarantee index-fixnum? start 'grapheme-cluster-slice) + (guarantee index-fixnum? end 'grapheme-cluster-slice) + (if (not (fix:<= start end)) + (error:bad-range-argument start 'grapheme-cluster-slice)) + (let ((start-index #f) + (end-index #f)) + (find-grapheme-cluster-breaks string + 0 + (lambda (index count) + (if (fix:= count start) + (set! start-index index)) + (if (fix:= count end) + (set! end-index index)) + (fix:+ count 1))) + (if (not start-index) + (error:bad-range-argument start 'grapheme-cluster-slice)) + (if (not end-index) + (error:bad-range-argument end 'grapheme-cluster-slice)) + (string-slice string start-index end-index))) + +(define (grapheme-cluster-breaks string) + (reverse! (find-grapheme-cluster-breaks string '() cons))) + +(define (find-grapheme-cluster-breaks string initial-ctx break) + (let ((n (string-length string))) + + (define (get-gcb i) + (ucd-gcb-value (string-ref string i))) + + (define (transition gcb i ctx) + (let ((i* (fix:+ i 1))) + (if (fix:< i* n) + ((vector-ref gcb-states gcb) + (get-gcb i*) + (lambda (gcb* break?) + (transition gcb* i* (if break? (break i* ctx) ctx)))) + (break n ctx)))) + + (if (fix:> n 0) + (transition (get-gcb 0) 0 (break 0 initial-ctx)) + initial-ctx))) + +(define gcb-names + '#(control + carriage-return + emoji-base + emoji-base-gaz + emoji-modifier + extend + glue-after-zwj + hst=l + linefeed + hst=lv + hst=lvt + prepend + regional-indicator + spacing-mark + hst=t + hst=v + other + zwj)) + +(define (name->code namev name) + (let ((end (vector-length namev))) + (let loop ((code 0)) + (if (not (fix:< code end)) + (error "Unknown name:" name)) + (if (eq? (vector-ref namev code) name) + code + (loop (fix:+ code 1)))))) + +(define (make-!selector namev names) + (let loop + ((names names) + (mask (fix:- (fix:lsh 1 (vector-length namev)) 1))) + (if (pair? names) + (loop (cdr names) + (fix:andc mask (fix:lsh 1 (name->code namev (car names))))) + (lambda (code) + (not (fix:= 0 (fix:and mask (fix:lsh 1 code)))))))) + +(define (make-selector namev names) + (let loop + ((names names) + (mask 0)) + (if (pair? names) + (loop (cdr names) + (fix:or mask (fix:lsh 1 (name->code namev (car names))))) + (lambda (code) + (not (fix:= 0 (fix:and mask (fix:lsh 1 code)))))))) + +(define gcb-states + (let ((simple-state + (lambda (break?) + (lambda (gcb k) + (k gcb (break? gcb))))) + (gcb-code + (lambda (name) + (name->code gcb-names name))) + (make-no-breaks + (lambda (names) + (make-!selector gcb-names names))) + (make-breaks + (lambda (names) + (make-selector gcb-names names)))) + (let ((state:control (simple-state (make-no-breaks '()))) + (state:emoji-base + (let ((gcb:extend (gcb-code 'extend)) + (gcb:emoji-base (gcb-code 'emoji-base)) + (break? + (make-no-breaks '(emoji-modifier extend spacing-mark zwj)))) + (lambda (gcb k) + (if (fix:= gcb gcb:extend) + (k gcb:emoji-base #f) + (k gcb (break? gcb)))))) + (state:extend + (simple-state (make-no-breaks '(extend spacing-mark zwj)))) + (state:hst=v + (simple-state + (make-no-breaks '(hst=t hst=v extend spacing-mark zwj)))) + (state:hst=t + (simple-state (make-no-breaks '(hst=t extend spacing-mark zwj))))) + (vector state:control + (simple-state (make-no-breaks '(linefeed))) + state:emoji-base + state:emoji-base + state:extend + state:extend + state:extend + (simple-state + (make-no-breaks + '(hst=l hst=lv hst=lvt hst=v extend spacing-mark zwj))) + state:control + state:hst=v + state:hst=t + (simple-state (make-breaks '(control carriage-return linefeed))) + (let ((gcb:regional-indicator (gcb-code 'regional-indicator)) + (gcb:extend (gcb-code 'extend)) + (break? (make-no-breaks '(extend spacing-mark zwj)))) + (lambda (gcb k) + (let ((gcb + (if (fix:= gcb gcb:regional-indicator) + gcb:extend + gcb))) + (k gcb (break? gcb))))) + state:extend + state:hst=t + state:hst=v + state:extend + (simple-state + (make-no-breaks + '(emoji-base-gaz glue-after-zwj extend spacing-mark zwj))))))) + +;;;; Word breaks + +(define (string-word-breaks string) + (reverse! (find-word-breaks string '() cons))) + +(define (find-word-breaks string initial-ctx break) + (let ((n (string-length string))) + + (define (get-wb i) + (ucd-wb-value (string-ref string i))) + + (define (t1 wb0 i0 ctx) + (if (select:breaker wb0) + (t1-breaker wb0 i0 ctx) + (t1-!breaker wb0 i0 ctx))) + + (define (t1-breaker wb0 i0 ctx) + (let ((i1 (fix:+ i0 1))) + (if (fix:< i1 n) + (let ((wb1 (get-wb i1))) + ((vector-ref wb-states wb0) + wb1 + #f + (lambda (wb1* break?) + (t1 wb1* i1 (if break? (break i1 ctx) ctx))) + k2-none)) + ctx))) + + (define (t1-!breaker wb0 i0 ctx) + (let ((i1 (fix:+ i0 1))) + (if (fix:< i1 n) + (let ((wb1 (get-wb i1))) + (cond ((select:extender wb1) + (t1-!breaker (if (select:zwj wb0) wb1 wb0) i1 ctx)) + ((select:breaker wb1) + (t1-breaker wb1 i1 (break i1 ctx))) + (else + (t2 wb0 wb1 i1 ctx)))) + ctx))) + + (define (t2 wb0 wb1 i1 ctx) + (let find-i2 ((i2 (fix:+ i1 1))) + (if (fix:< i2 n) + (let ((wb2 (get-wb i2))) + (if (select:extender wb2) + (find-i2 (fix:+ i2 1)) + ((vector-ref wb-states wb0) + wb1 + wb2 + (lambda (wb1* break?) + (t2 wb1* wb2 i2 (if break? (break i1 ctx) ctx))) + (lambda () + (t1 wb2 i2 ctx))))) + ((vector-ref wb-states wb0) + wb1 + #f + (lambda (wb1* break?) + (declare (ignore wb1*)) + (if break? (break i1 ctx) ctx)) + k2-none)))) + + (define (k2-none) + (error "Should never be called")) + + (if (fix:< 0 n) + (break n (t1 (get-wb 0) 0 (break 0 initial-ctx))) + initial-ctx))) + +(define wb-names + '#(carriage-return + double-quote + emoji-base + emoji-base-gaz + emoji-modifier + extend-num-let + extend + format + glue-after-zwj + hebrew-letter + katakana + letter + linefeed + mid-num-let + mid-letter + mid-number + newline + numeric + regional-indicator + single-quote + other + zwj)) + +(define select:breaker + (make-selector wb-names '(carriage-return linefeed newline))) + +(define select:extender + (make-selector wb-names '(extend format zwj))) + +(define select:zwj + (make-selector wb-names '(zwj))) + +(define wb-states + (make-vector (vector-length wb-names) + (lambda (wb1 wb2 k1 k2) + (declare (ignore wb2 k2)) + (k1 wb1 #t)))) + +(let ((select:mb/ml/sq + (make-selector wb-names '(mid-num-let mid-letter single-quote))) + (select:mb/mn/sq + (make-selector wb-names '(mid-num-let mid-number single-quote))) + (select:hl/le (make-selector wb-names '(hebrew-letter letter))) + (select:hl (make-selector wb-names '(hebrew-letter))) + (select:numeric (make-selector wb-names '(numeric))) + (select:dq (make-selector wb-names '(double-quote))) + (select:ri (make-selector wb-names '(regional-indicator))) + (break?:hl + (make-!selector wb-names + '(extend-num-let hebrew-letter letter numeric + single-quote))) + (break?:alphanum + (make-!selector wb-names + '(extend-num-let hebrew-letter letter numeric))) + (wb:extend (name->code wb-names 'extend))) + + (define (define-state name state) + (vector-set! wb-states (name->code wb-names name) state)) + + (for-each + (lambda (n.b) + (define-state (car n.b) + (let ((break? (make-!selector wb-names (cdr n.b)))) + (lambda (wb1 wb2 k1 k2) + (declare (ignore wb2 k2)) + (k1 wb1 (break? wb1)))))) + '((carriage-return linefeed) + (emoji-base emoji-modifier) + (emoji-base-gaz emoji-modifier) + (katakana extend-num-let katakana) + (zwj emoji-base-gaz glue-after-zwj) + (extend-num-let extend-num-let hebrew-letter katakana letter numeric))) + + (define-state 'hebrew-letter + (lambda (wb1 wb2 k1 k2) + (if (and wb2 + (or (and (select:mb/ml/sq wb1) + (select:hl/le wb2)) + (and (select:dq wb1) + (select:hl wb2)))) + (k2) + (k1 wb1 (break?:hl wb1))))) + + (define-state 'letter + (lambda (wb1 wb2 k1 k2) + (if (and wb2 + (select:mb/ml/sq wb1) + (select:hl/le wb2)) + (k2) + (k1 wb1 (break?:alphanum wb1))))) + + (define-state 'numeric + (lambda (wb1 wb2 k1 k2) + (if (and wb2 + (select:mb/mn/sq wb1) + (select:numeric wb2)) + (k2) + (k1 wb1 (break?:alphanum wb1))))) + + (define-state 'regional-indicator + (let () + (lambda (wb1 wb2 k1 k2) + (declare (ignore wb2 k2)) + (if (select:ri wb1) + (k1 wb:extend #f) + (k1 wb1 #t)))))) + +;;;; Naive search algorithm + +(define (naive-search-forward pattern pend text tstart tend) + (let ((tlast (fix:- tend pend))) + (let find-match ((tindex tstart)) + (and (fix:<= tindex tlast) + (let match ((pi 0) (ti tindex)) + (if (fix:< pi pend) + (if (char=? (string-ref pattern pi) + (string-ref text ti)) + (match (fix:+ pi 1) (fix:+ ti 1)) + (find-match (fix:+ tindex 1))) + tindex)))))) + +(define (naive-search-backward pattern pend text tstart tend) + (let ((tlast (fix:- tend pend))) + (let find-match ((tindex tlast)) + (and (fix:>= tindex tstart) + (let match ((pi 0) (ti tindex)) + (if (fix:< pi pend) + (if (char=? (string-ref pattern pi) + (string-ref text ti)) + (match (fix:+ pi 1) (fix:+ ti 1)) + (find-match (fix:- tindex 1))) + ti)))))) + +(define (naive-search-all pattern pend text tstart tend) + (let ((tlast (fix:- tend pend))) + (let find-match ((tindex tlast) (matches '())) + (if (fix:>= tindex tstart) + (find-match (fix:- tindex 1) + (let match ((pi 0) (ti tindex)) + (if (fix:< pi pend) + (if (char=? (string-ref pattern pi) + (string-ref text ti)) + (match (fix:+ pi 1) (fix:+ ti 1)) + matches) + (cons tindex matches)))) + matches)))) + +;;; Knuth-Morris-Pratt algorithm + +;;; Donald E. Knuth, James H. Morris, Jr., and Vaughan R. Pratt. Fast pattern +;;; matching in strings. SIAM Journal on Computing, 6(2):323–350, 1977. + +;;; Thomas H. Cormen, Charles E. Leiserson, Ronald L. Rivest, and Clifford +;;; Stein, "Introduction to Algorithms, third edition" (Cambridge: The MIT +;;; Press, 2009), section 32.4. + +(define (kmp-search-forward pattern pend text tstart tend) + (receive (pi new-prefix) (kmp-prefix-function pattern pend) + (let loop ((i tstart) (n-matched 0)) + (and (fix:< i tend) + (let ((n-matched (new-prefix (string-ref text i) n-matched))) + (if (fix:< n-matched pend) + (loop (fix:+ i 1) n-matched) + (fix:- i (fix:- pend 1)))))))) + +(define (kmp-search-all pattern pend text tstart tend) + (receive (pi new-prefix) (kmp-prefix-function pattern pend) + (let loop ((i tstart) (n-matched 0) (matches '())) + (if (fix:< i tend) + (let ((n-matched (new-prefix (string-ref text i) n-matched))) + (if (fix:< n-matched pend) + (loop (fix:+ i 1) n-matched matches) + (loop (fix:+ i 1) + (vector-ref pi (fix:- n-matched 1)) + (cons (fix:- i (fix:- pend 1)) matches)))) + (reverse matches))))) + +(define (kmp-prefix-function pattern pend) + (let ((pi (make-vector pend))) + + (define (compute-pi q k) + (vector-set! pi q k) + (let ((q (fix:+ q 1))) + (if (fix:< q pend) + (compute-pi q (new-prefix (string-ref pattern q) k))))) + + (define (new-prefix char n-matched) + (let loop ((n-matched n-matched)) + (cond ((char=? (string-ref pattern n-matched) char) (fix:+ n-matched 1)) + ((fix:> n-matched 0) (loop (vector-ref pi (fix:- n-matched 1)))) + (else 0)))) + + (compute-pi 0 0) + (values pi new-prefix))) + +;;;; Search top level + +(define-integrable (string-matcher caller naive kmp) + (lambda (pattern text #!optional start end) + (guarantee nfc-string? pattern caller) + (guarantee nfc-string? text caller) + (let ((pend (string-length pattern))) + (if (fix:= 0 pend) + (error:bad-range-argument pend caller)) + (let* ((tend (fix:end-index end (string-length text) caller)) + (tstart (fix:start-index start end caller))) + (if (fix:< pend kmp-pattern-min) + (naive pattern pend text tstart tend) + (kmp pattern pend text tstart tend)))))) + +(define-integrable kmp-pattern-min 8) + +(define string-search-forward + (string-matcher 'string-search-forward + naive-search-forward + kmp-search-forward)) + +(define string-search-backward + (string-matcher 'string-search-backward + naive-search-backward + naive-search-backward)) + +(define string-search-all + (string-matcher 'string-search-all + naive-search-all + kmp-search-all)) + +(define (substring? pattern text) + (and (or (fix:= 0 (string-length pattern)) + (string-search-forward (string->nfc pattern) (string->nfc text))) + #t)) + +;;;; Sequence converters + +(define (list->string chars) + (let ((builder (string-builder))) + (for-each (lambda (char) + (guarantee bitless-char? char 'list->string) + (builder char)) + chars) + (builder 'immutable))) + +(define (string->list string #!optional start end) + (let* ((end (fix:end-index end (string-length string) 'string->list)) + (start (fix:start-index start end 'string->list))) + (translate-slice string start end + (lambda (string start end) + + (define-integrable (%string->list sref) + (do ((i (fix:- end 1) (fix:- i 1)) + (chars '() (cons (sref string i) chars))) + ((not (fix:>= i start)) chars))) + + (case (ustring-cp-size string) + ((1) (%string->list ustring1-ref)) + ((2) (%string->list ustring2-ref)) + (else (%string->list ustring3-ref))))))) + +(define (vector->string vector #!optional start end) + (let* ((end (fix:end-index end (vector-length vector) 'vector->string)) + (start (fix:start-index start end 'vector->string)) + (builder (string-builder))) + (do ((i start (fix:+ i 1))) + ((not (fix:< i end))) + (let ((char (vector-ref vector i))) + (guarantee bitless-char? char 'vector->string) + (builder char))) + (builder 'immutable))) + +(define (string->vector string #!optional start end) + (let* ((end (fix:end-index end (string-length string) 'string->vector)) + (start (fix:start-index start end 'string->vector))) + (translate-slice string start end + (lambda (string start end) + (let ((to (make-vector (fix:- end start)))) + (do ((i start (fix:+ i 1)) + (j 0 (fix:+ j 1))) + ((not (fix:< i end))) + (vector-set! to j (ustring-ref string i))) + to))))) + +;;;; Append + +(define (string-append . strings) + (string-append* strings)) + +(define (string-append* strings) + (let ((builder (string-builder))) + (for-each (lambda (string) + (guarantee string? string 'string-append) + (builder string)) + strings) + (builder))) + +(define (string . objects) + (string* objects)) + +(define (string* objects) + (let ((builder (string-builder))) + (for-each (lambda (object) + (if object + (builder + (cond ((bitless-char? object) object) + ((string? object) object) + ;; Needed during boot load: + ((symbol? object) (symbol->string object)) + (else + (call-with-output-string + (lambda (port) + (display object port)))))))) + objects) + (builder))) + +;;;; Mapping + +(define (mapper-values proc string strings) + (cond ((null? strings) + (values (string-length string) + (lambda (i) + (proc (string-ref string i))))) + ((null? (cdr strings)) + (let* ((string2 (car strings)) + (n (fix:min (string-length string) + (string-length string2)))) + (values n + (lambda (i) + (proc (string-ref string i) + (string-ref string2 i)))))) + (else + (let ((n (min-length string-length string strings))) + (values n + (lambda (i) + (apply proc + (string-ref string i) + (map (lambda (string) + (string-ref string i)) + strings)))))))) + +(define (min-length string-length string strings) + (do ((strings strings (cdr strings)) + (n (string-length string) + (fix:min n (string-length (car strings))))) + ((null? strings) n))) + +(define (string-for-each proc string . strings) + (receive (n proc) (mapper-values proc string strings) + (do ((i 0 (fix:+ i 1))) + ((not (fix:< i n))) + (proc i)))) + +(define (string-map proc string . strings) + (receive (n proc) (mapper-values proc string strings) + (let ((builder (string-builder))) + (do ((i 0 (fix:+ i 1))) + ((not (fix:< i n))) + (builder (proc i))) + (builder)))) + +(define (string-count proc string . strings) + (receive (n proc) (mapper-values proc string strings) + (let loop ((i 0) (count 0)) + (if (fix:< i n) + (loop (fix:+ i 1) + (if (proc i) + (fix:+ count 1) + count)) + count)))) + +(define (string-any proc string . strings) + (receive (n proc) (mapper-values proc string strings) + (let loop ((i 0)) + (and (fix:< i n) + (if (proc i) + #t + (loop (fix:+ i 1))))))) + +(define (string-every proc string . strings) + (receive (n proc) (mapper-values proc string strings) + (let loop ((i 0)) + (if (fix:< i n) + (and (proc i) + (loop (fix:+ i 1))) + #t)))) + +(define (string-find-first-index proc string . strings) + (guarantee nfc-string? string 'string-find-first-index) + (guarantee-list-of nfc-string? strings 'string-find-first-index) + (receive (n proc) (mapper-values proc string strings) + (let loop ((i 0)) + (and (fix:< i n) + (if (proc i) + i + (loop (fix:+ i 1))))))) + +(define (string-find-last-index proc string . strings) + (guarantee nfc-string? string 'string-find-last-index) + (guarantee-list-of nfc-string? strings 'string-find-last-index) + (receive (n proc) (mapper-values proc string strings) + (let loop ((i (fix:- n 1))) + (and (fix:>= i 0) + (if (proc i) + i + (loop (fix:- i 1))))))) + +;;;; Joiner + +(define (string-joiner . options) + (let ((joiner (%string-joiner options 'string-joiner))) + (lambda strings + (joiner strings)))) + +(define (string-joiner* . options) + (%string-joiner options 'string-joiner*)) + +(define (%string-joiner options caller) + (receive (infix prefix suffix) (string-joiner-options options caller) + (let ((infix (string-append suffix infix prefix))) + (lambda (strings) + (if (pair? strings) + (let ((builder (string-builder))) + (builder prefix) + (builder (car strings)) + (for-each (lambda (string) + (builder infix) + (builder string)) + (cdr strings)) + (builder suffix) + (builder)) + ""))))) + +(define-deferred string-joiner-options + (keyword-option-parser + (list (list 'infix string? "") + (list 'prefix string? "") + (list 'suffix string? "")))) + +;;;; Splitter + +(define (string-splitter . options) + (receive (delimiter allow-runs? copy?) + (string-splitter-options options 'string-splitter) + (let ((predicate (char-matcher->predicate delimiter 'string-splitter)) + (get-part (if copy? substring string-slice))) + + (lambda (string #!optional start end) + (let* ((end (fix:end-index end (string-length string) 'string-splitter)) + (start (fix:start-index start end 'string-splitter))) + + (define (find-start start) + (if allow-runs? + (let loop ((index start)) + (if (fix:< index end) + (if (predicate (string-ref string index)) + (loop (fix:+ index 1)) + (find-end index (fix:+ index 1))) + '())) + (find-end start start))) + + (define (find-end start index) + (let loop ((index index)) + (if (fix:< index end) + (if (predicate (string-ref string index)) + (cons (get-part string start index) + (find-start (fix:+ index 1))) + (loop (fix:+ index 1))) + (list (get-part string start end))))) + + (find-start start)))))) + +(define-deferred string-splitter-options + (keyword-option-parser + (list (list 'delimiter char-matcher? char-whitespace?) + (list 'allow-runs? boolean? #t) + (list 'copy? boolean? #f)))) + +(define (char-matcher->predicate matcher caller) + (cond ((char? matcher) (char=-predicate matcher)) + ((char-set? matcher) (char-set-predicate matcher)) + ((unary-procedure? matcher) matcher) + (else (error:not-a char-matcher? matcher caller)))) + +(define (char-matcher? object) + (or (char? object) + (char-set? object) + (unary-procedure? object))) + +;;;; Trimmer/Padder + +(define (string-trimmer . options) + (receive (where to-trim copy?) + (string-trimmer-options options 'string-trimmer) + (let ((predicate (char-matcher->predicate to-trim 'string-trimmer)) + (get-trimmed (if copy? substring string-slice))) + (lambda (string) + (let ((end (string-length string))) + (get-trimmed + string + (if (eq? where 'trailing) + 0 + (let loop ((index 0)) + (if (and (fix:< index end) + (predicate (string-ref string index))) + (loop (fix:+ index 1)) + index))) + (if (eq? where 'leading) + end + (let loop ((index end)) + (if (and (fix:> index 0) + (predicate (string-ref string (fix:- index 1)))) + (loop (fix:- index 1)) + index))))))))) + +(define-deferred string-trimmer-options + (keyword-option-parser + (list (list 'where '(leading trailing both) 'both) + (list 'to-trim char-matcher? char-whitespace?) + (list 'copy? boolean? #f)))) + +(define (string-padder . options) + (receive (where fill-with clip?) + (string-padder-options options 'string-padder) + (lambda (string n) + (guarantee index-fixnum? n 'string-padder) + (let ((cluster-length (grapheme-cluster-length string))) + (cond ((fix:= n cluster-length) + string) + ((fix:< n cluster-length) + (if clip? + (if (eq? where 'leading) + (grapheme-cluster-slice string + (fix:- cluster-length n) + cluster-length) + (grapheme-cluster-slice string 0 n)) + string)) + (else + (let ((builder (string-builder))) + (if (eq? where 'trailing) + (builder string)) + (do ((i cluster-length (fix:+ i 1))) + ((not (fix:< i n))) + (builder fill-with)) + (if (eq? where 'leading) + (builder string)) + (builder)))))))) + +(define (grapheme-cluster-string? object) + (and (string? object) + (fix:= 1 (grapheme-cluster-length object)))) + +(define-deferred string-padder-options + (keyword-option-parser + (list (list 'where '(leading trailing) 'leading) + (list 'fill-with grapheme-cluster-string? " ") + (list 'clip? boolean? #t)))) + +;;;; Miscellaneous + +(define (string-fill! string char #!optional start end) + (guarantee mutable-string? string 'string-fill) + (guarantee bitless-char? char 'string-fill!) + (let* ((end (fix:end-index end (string-length string) 'string-fill!)) + (start (fix:start-index start end 'string-fill!))) + (translate-slice string start end + (lambda (string start end) + (do ((index start (fix:+ index 1))) + ((not (fix:< index end)) unspecific) + (ustring-set! string index char)))))) + +(define (string-replace string char1 char2) + (guarantee bitless-char? char1 'string-replace) + (guarantee bitless-char? char2 'string-replace) + (string-map (lambda (char) + (if (char=? char char1) char2 char)) + string)) + +(define (string-hash string #!optional modulus) + (let ((string* (string-for-primitive (string->nfc string)))) + (if (default-object? modulus) + ((ucode-primitive string-hash) string*) + ((ucode-primitive string-hash-mod) string* modulus)))) + +(define (string-hash-ci string #!optional modulus) + (string-hash (string-foldcase string) modulus)) + +(define (8-bit-string? object) + (and (string? object) + (string-8-bit? object))) + +(define (string-8-bit? string) + (unpack-slice string + (lambda (string start end) + (case (ustring-cp-size string) + ((1) #t) + ((2) (every-loop char-8-bit? ustring2-ref string start end)) + (else (every-loop char-8-bit? ustring3-ref string start end)))))) + +(define (string-for-primitive string) + (if (and (not (slice? string)) + (let ((end (string-length string))) + (case (ustring-cp-size string) + ((1) (every-loop char-ascii? ustring1-ref string 0 end)) + ((2) (every-loop char-ascii? ustring2-ref string 0 end)) + (else (every-loop char-ascii? ustring3-ref string 0 end))))) + string + (string->utf8 string))) + +(define-integrable (every-loop proc ref string start end) + (let loop ((i start)) + (if (fix:< i end) + (and (proc (ref string i)) + (loop (fix:+ i 1))) + #t))) + +;;;;Backwards compatibility + +(define-integrable (string-find-maker finder key->predicate) + (lambda (string key #!optional start end) + (let* ((start (if (default-object? start) 0 start)) + (index + (finder (key->predicate key) + (string-slice string start end)))) + (and index + (fix:+ start index))))) + +(define string-find-next-char + (string-find-maker string-find-first-index char=-predicate)) + +(define string-find-next-char-ci + (string-find-maker string-find-first-index char-ci=-predicate)) + +(define string-find-next-char-in-set + (string-find-maker string-find-first-index char-set-predicate)) + +(define string-find-previous-char + (string-find-maker string-find-last-index char=-predicate)) + +(define string-find-previous-char-ci + (string-find-maker string-find-last-index char-ci=-predicate)) + +(define string-find-previous-char-in-set + (string-find-maker string-find-last-index char-set-predicate)) + +(define-integrable (substring-find-maker string-find) + (lambda (string start end key) + (string-find string key start end))) + +(define substring-find-next-char + (substring-find-maker string-find-next-char)) + +(define substring-find-next-char-ci + (substring-find-maker string-find-next-char-ci)) + +(define substring-find-next-char-in-set + (substring-find-maker string-find-next-char-in-set)) + +(define substring-find-previous-char + (substring-find-maker string-find-previous-char)) + +(define substring-find-previous-char-ci + (substring-find-maker string-find-previous-char-ci)) + +(define substring-find-previous-char-in-set + (substring-find-maker string-find-previous-char-in-set)) + +(define (string-move! string1 string2 start2) + (string-copy! string2 start2 string1)) + +(define (substring-move! string1 start1 end1 string2 start2) + (string-copy! string2 start2 string1 start1 end1)) + +(define (substring-cistring char) + (guarantee bitless-char? char 'char->string) + (let ((s (immutable-ustring-allocate 1 (char->integer char)))) + (ustring-set! s 0 char) + s)) + +(define (legacy-string-trimmer where) + (lambda (string #!optional char-set) + ((string-trimmer 'where where + 'copy? #t + 'to-trim + (if (default-object? char-set) + char-set:whitespace + (char-set-invert char-set))) + string))) + +(define string-trim-left (legacy-string-trimmer 'leading)) +(define string-trim-right (legacy-string-trimmer 'trailing)) +(define string-trim (legacy-string-trimmer 'both)) + +(define (legacy-string-padder where) + (lambda (string n #!optional char) + ((string-padder 'where where + 'fill-with (if (default-object? char) + char + (char->string char))) + string n))) + +(define string-pad-left (legacy-string-padder 'leading)) +(define string-pad-right (legacy-string-padder 'trailing)) + +(define (decorated-string-append prefix infix suffix strings) + ((string-joiner* 'prefix prefix + 'infix infix + 'suffix suffix) + strings)) + +(define (burst-string string delimiter allow-runs?) + ((string-splitter 'delimiter delimiter + 'allow-runs? allow-runs? + 'copy? #t) + string)) \ No newline at end of file diff --git a/tests/runtime/test-string-search.scm b/tests/runtime/test-string-search.scm new file mode 100644 index 000000000..a9d9a410e --- /dev/null +++ b/tests/runtime/test-string-search.scm @@ -0,0 +1,113 @@ +#| -*-Scheme-*- + +Copyright (C) 1986, 1987, 1988, 1989, 1990, 1991, 1992, 1993, 1994, + 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, + 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, + 2017 Massachusetts Institute of Technology + +This file is part of MIT/GNU Scheme. + +MIT/GNU Scheme is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or (at +your option) any later version. + +MIT/GNU Scheme is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +General Public License for more details. + +You should have received a copy of the GNU General Public License +along with MIT/GNU Scheme; if not, write to the Free Software +Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, +USA. + +|# + +;;;; Tests of string search + +(declare (usual-integrations)) + +(define text + (call-with-input-file + (merge-pathnames "test-string-search-data" + (directory-pathname (current-load-pathname))) + (lambda (port) + (let ((builder (string-builder)) + (buffer (make-string #x1000))) + (let loop () + (let ((n (read-string! buffer port))) + (if (> n 0) + (begin + (builder (substring buffer 0 n)) + (loop))))) + (builder 'immutable))))) + +(define patterns + '( + ;; occur frequently: + "define" + "mutable" + "ustring" + "immutable" + "lambda" + ;; occur near start: + "MIT/GNU Scheme" + "Software" + ;; occur near middle: + "test-first-char" + "ucd-canonical-cm-value" + ;; occur near end: + "decorated-string-append" + "burst-string")) + +(define (find-instances pattern) + (let ((pend (string-length pattern)) + (tend (string-length text))) + + (define (loop ti matches) + (if (fix:< ti tend) + (loop (fix:+ ti 1) + (if (match-1 0 ti) + (cons ti matches) + matches)) + (cons pattern (reverse! matches)))) + + (define (match-1 pi ti) + (if (and (fix:< pi pend) + (fix:< ti tend)) + (and (char=? (string-ref pattern pi) + (string-ref text ti)) + (match-1 (fix:+ pi 1) + (fix:+ ti 1))) + #t)) + + (loop 0 '()))) + +(define pattern-instances + (map find-instances patterns)) + +(define-test 'search-tests + (map (lambda (entry) + (let ((pattern (car entry)) + (indices (cdr entry))) + (list + (lambda () + (with-test-properties + (lambda () + (assert-equal (string-search-forward pattern text) + (car indices))) + 'expression `(string-search-forward ,pattern))) + (lambda () + (with-test-properties + (lambda () + (assert-equal (string-search-backward pattern text) + (last indices))) + 'expression `(string-search-backward ,pattern))) + (lambda () + (with-test-properties + (lambda () + (assert-equal (string-search-all pattern text) + indices)) + 'expression `(string-search-all ,pattern)))))) + pattern-instances)) \ No newline at end of file