From 3ccf4a591801954c3d338810b5219f9c36e5ff6b Mon Sep 17 00:00:00 2001 From: Chris Hanson Date: Sun, 30 Apr 2017 23:09:24 -0700 Subject: [PATCH] Implement converter from regexp patterns to regsexp forms. --- src/runtime/chrset.scm | 4 +- src/runtime/chrsyn.scm | 45 ++++++-- src/runtime/regsexp.scm | 201 +++++++++++++++++++++++++++++---- src/runtime/rgxcmp.scm | 4 +- src/runtime/runtime.pkg | 8 +- tests/runtime/test-regsexp.scm | 51 ++++++++- 6 files changed, 273 insertions(+), 40 deletions(-) diff --git a/src/runtime/chrset.scm b/src/runtime/chrset.scm index 49c3ab78f..49c55f918 100644 --- a/src/runtime/chrset.scm +++ b/src/runtime/chrset.scm @@ -267,7 +267,7 @@ USA. (define (%cpl->char-sets cpl) (let loop ((cpl cpl) (ranges '()) (char-sets '())) (cond ((not (pair? cpl)) - (cons (%ranges->char-set (%normalize-ranges ranges)) + (cons (%ranges->char-set (normalize-ranges ranges)) char-sets)) ((%cpl-element->ranges (car cpl)) => (lambda (ranges*) @@ -287,7 +287,7 @@ USA. ((string? elt) (map char->integer (string->list elt))) (else #f))) -(define (%normalize-ranges ranges) +(define (normalize-ranges ranges) (let ((ranges (filter! (lambda (range) (fix:< (%range-start range) diff --git a/src/runtime/chrsyn.scm b/src/runtime/chrsyn.scm index d146c3395..aab6a82a1 100644 --- a/src/runtime/chrsyn.scm +++ b/src/runtime/chrsyn.scm @@ -145,13 +145,27 @@ USA. (fix:<= (fix:and #xf object) 12))) (error:bad-range-argument object procedure))) +(define (char-syntax-code? object) + (and (char? object) + (let ((n (vector-length char-syntax-codes))) + (let loop ((i 0)) + (and (fix:< i n) + (or (char=? (string-ref (vector-ref char-syntax-codes i) 0) + object) + (loop (fix:+ i 1)))))))) + +(define (char->syntax-code table char) + (string-ref (vector-ref char-syntax-codes + (fix:and #xf (get-char-syntax table char))) + 0)) + (define char-syntax-codes '#(" " "." "w" "_" "(" ")" "'" "\"" "$" "\\" "/" "<" ">")) - + (define (substring-find-next-char-of-syntax string start end table code) (guarantee 8-bit-string? string 'substring-find-next-char-of-syntax) (let ((index - (string-find-first-index (syntax-code-predicate table code) + (string-find-first-index (syntax-code-predicate code table) (string-slice string start end)))) (and index (fix:+ start index)))) @@ -160,18 +174,27 @@ USA. (guarantee 8-bit-string? string 'substring-find-next-char-not-of-syntax) (let ((index (string-find-first-index (let ((pred - (syntax-code-predicate table code))) + (syntax-code-predicate code table))) (lambda (char) (not (pred char)))) (string-slice string start end)))) (and index (fix:+ start index)))) -(define (syntax-code-predicate table code) - (lambda (char) - (char=? code (char->syntax-code table char)))) - -(define (char->syntax-code table char) - (string-ref (vector-ref char-syntax-codes - (fix:and #xf (get-char-syntax table char))) - 0)) \ No newline at end of file +(define (syntax-code-predicate code #!optional table) + (guarantee char-syntax-code? code 'syntax-code-predicate) + (let ((entries + (char-syntax-table/entries + (if (default-object? table) + standard-char-syntax-table + (begin + (guarantee char-syntax-table? table 'syntax-code-predicate) + table))))) + (lambda (char) + (let ((cp (char->integer char))) + (and (fix:< cp #x100) + (char=? (string-ref (vector-ref char-syntax-codes + (fix:and #x0F + (vector-ref entries cp))) + 0) + code)))))) \ No newline at end of file diff --git a/src/runtime/regsexp.scm b/src/runtime/regsexp.scm index 52e95a371..11c4547a8 100644 --- a/src/runtime/regsexp.scm +++ b/src/runtime/regsexp.scm @@ -124,7 +124,21 @@ USA. (define-rule '(ANY-CHAR) (lambda () - (%compile-regsexp '(INVERSE-CHAR-SET "\n")))) + (insn:test-char (negate (char=-predicate #\newline))))) + +(define-rule '(test-char datum) + (lambda (predicate) + (insn:test-char + (if (and (pair? predicate) + (eq? (car predicate) 'not) + (pair? (cdr predicate)) + (null? (cddr predicate))) + (negate (cadr predicate)) + predicate)))) + +(define (negate predicate) + (lambda (object) + (not (predicate object)))) (define-rule '(+ FORM) (lambda (regsexp) @@ -142,6 +156,23 @@ USA. (lambda items (insn:inverse-char-set (char-set* items)))) +(define-rule '(char-syntax datum) + (lambda (code) + (insn:test-char + (if (or (char=? code #\-) + (char=? code #\space)) + char-whitespace? + (syntax-code-predicate code))))) + +(define-rule '(inverse-char-syntax datum) + (lambda (code) + (insn:test-char + (negate + (if (or (char=? code #\-) + (char=? code #\space)) + char-whitespace? + (syntax-code-predicate code)))))) + (define-rule '(LINE-START) (lambda () (insn:line-start))) (define-rule '(LINE-END) (lambda () (insn:line-end))) (define-rule '(STRING-START) (lambda () (insn:string-start))) @@ -259,13 +290,24 @@ USA. (succeed position groups fail) (fail))))) -(define (insn:char char) +(define (insn:test-char predicate) (lambda (succeed) (lambda (position groups fail) - (if (eqv? (next-char position) char) + (if (let ((char (next-char position))) + (and char + (predicate char))) (succeed (next-position position) groups fail) (fail))))) +(define (insn:char char) + (insn:test-char (char=-predicate char))) + +(define (insn:char-set char-set) + (insn:test-char (char-set-predicate char-set))) + +(define (insn:inverse-char-set char-set) + (insn:test-char (negate (char-set-predicate char-set)))) + (define (insn:chars chars) (lambda (succeed) (lambda (position groups fail) @@ -292,24 +334,6 @@ USA. (loop (fix:+ i 1) (next-position position)) (fail))) (succeed position groups fail))))))))) - -(define (insn:char-set char-set) - (lambda (succeed) - (lambda (position groups fail) - (if (let ((char (next-char position))) - (and char - (char-in-set? char char-set))) - (succeed (next-position position) groups fail) - (fail))))) - -(define (insn:inverse-char-set char-set) - (lambda (succeed) - (lambda (position groups fail) - (if (let ((char (next-char position))) - (and char - (not (char-in-set? char char-set)))) - (succeed (next-position position) groups fail) - (fail))))) (define (insn:group key insn) (insn:seq (list (%insn:start-group key) @@ -640,4 +664,137 @@ USA. (lambda (p1 p2) (declare (no-type-checks)) (and (eq? (cdr p1) (cdr p2)) - (fix:= (car p1) (car p2))))) \ No newline at end of file + (fix:= (car p1) (car p2))))) + +(define (re-pattern->regsexp pattern) + (let ((end (string-length pattern))) + (let ((index 0) + (this-alt '()) + (prev-alts '()) + (group-number 0) + (pending-groups '())) + + (define (have-next?) + (fix:< index end)) + + (define (get-next) + (let ((char (string-ref pattern index))) + (set! index (fix:+ index 1)) + char)) + + (define (next-is? char) + (and (char=? (string-ref pattern index) char) + (begin + (set! index (fix:+ index 1)) + #t))) + + (define (get-expr) + (let ((alt (get-alt))) + (if (pair? prev-alts) + `(alt ,@(reverse (cons alt prev-alts))) + alt))) + + (define (get-alt) + (let ((exprs (optimize-alt (reverse this-alt) #f))) + (if (= (length exprs) 1) + (car exprs) + `(seq ,@exprs)))) + + (define (optimize-alt exprs builder) + (if (pair? exprs) + (if (char? (car exprs)) + (let ((builder (or builder (string-builder)))) + (builder (car exprs)) + (optimize-alt (cdr exprs) builder)) + (if builder + (cons (builder) + (cons (car exprs) + (optimize-alt (cdr exprs) #f))) + (cons (car exprs) + (optimize-alt (cdr exprs) #f)))) + (if builder + (list (builder)) + '()))) + + (define (dispatch) + (if (have-next?) + (let ((char (get-next))) + (case char + ((#\\) (dispatch-backslash)) + ((#\$) (output-expr '(line-end))) + ((#\^) (output-expr '(line-start))) + ((#\.) (output-expr '(any-char))) + ((#\[) (parse-char-set)) + ((#\*) (replace-last-expr (lambda (expr) `(* ,expr)))) + ((#\+) (replace-last-expr (lambda (expr) `(+ ,expr)))) + ((#\?) (replace-last-expr (lambda (expr) `(? ,expr)))) + (else (output-expr char)))) + (get-expr))) + + (define (dispatch-backslash) + (let ((char (get-next))) + (case char + ((#\<) (output-expr '(word-start))) + ((#\>) (output-expr '(word-end))) + ((#\b) (output-expr '(word-bound))) + ((#\B) (output-expr '(not-word-bound))) + ((#\`) (output-expr '(string-start))) + ((#\') (output-expr '(string-end))) + ((#\w) (output-expr '(char-syntax #\w))) + ((#\W) (output-expr '(inverse-char-syntax #\w))) + ((#\s) (output-expr `(char-syntax ,(get-next)))) + ((#\S) (output-expr `(inverse-char-syntax ,(get-next)))) + ((#\() (start-group)) + ((#\)) (end-group)) + ((#\|) (push-alt)) + (else (error "Unsupported regexp:" (string #\\ char)))))) + + (define (output-expr expr) + (set! this-alt (cons expr this-alt)) + (dispatch)) + + (define (replace-last-expr transform) + (set-car! this-alt (transform (car this-alt))) + (dispatch)) + + (define (start-group) + (set! group-number (fix:+ group-number 1)) + (set! pending-groups + (cons (vector group-number this-alt prev-alts) + pending-groups)) + (set! this-alt '()) + (set! prev-alts '()) + (dispatch)) + + (define (end-group) + (let ((expr `(group ,(vector-ref (car pending-groups) 0) ,(get-expr)))) + (set! this-alt (vector-ref (car pending-groups) 1)) + (set! prev-alts (vector-ref (car pending-groups) 2)) + (set! pending-groups (cdr pending-groups)) + (output-expr expr))) + + (define (push-alt) + (set! prev-alts (cons (get-alt) prev-alts)) + (set! this-alt '()) + (dispatch)) + + (define (parse-char-set) + (let loop + ((chars + (append (if (next-is? #\^) + (list #\^) + '()) + (if (next-is? #\]) + (list #\]) + '())))) + (let ((char (get-next))) + (if (char=? char #\]) + (output-expr + (receive (ranges invert?) + (re-char-pattern->code-points + (list->string (reverse chars))) + (cons (if invert? 'inverse-char-set 'char-set) + (normalize-ranges ranges)))) + (loop (cons char chars)))))) + + (dispatch)))) \ No newline at end of file diff --git a/src/runtime/rgxcmp.scm b/src/runtime/rgxcmp.scm index b566c8904..cd0a263dc 100644 --- a/src/runtime/rgxcmp.scm +++ b/src/runtime/rgxcmp.scm @@ -251,13 +251,13 @@ USA. (define (re-compile-char-set pattern negate?) (receive (scalar-values negate?*) - (re-char-pattern->scalar-values pattern) + (re-char-pattern->code-points pattern) (let ((char-set (char-set* scalar-values))) (if (if negate? (not negate?*) negate?*) (char-set-invert char-set) char-set)))) -(define (re-char-pattern->scalar-values pattern) +(define (re-char-pattern->code-points pattern) (define (loop pattern scalar-values) (if (pair? pattern) (if (and (pair? (cdr pattern)) diff --git a/src/runtime/runtime.pkg b/src/runtime/runtime.pkg index 013dfb5eb..68fef6206 100644 --- a/src/runtime/runtime.pkg +++ b/src/runtime/runtime.pkg @@ -1405,6 +1405,8 @@ USA. code-point-in-char-set? compute-char-set string->char-set) + (export (runtime regular-sexpression) + normalize-ranges) (export (runtime string) (char-set-table %char-set-table))) @@ -5242,6 +5244,7 @@ USA. compile-regsexp compiled-regsexp? condition-type:compile-regsexp + re-pattern->regsexp regsexp-match-input-port regsexp-match-string)) @@ -5278,7 +5281,7 @@ USA. compiled-regexp/byte-stream compiled-regexp/translation-table condition-type:re-compile-pattern - re-char-pattern->scalar-values + re-char-pattern->code-points re-compile-char re-compile-char-set re-compile-pattern @@ -5336,7 +5339,8 @@ USA. standard-char-syntax-table string->char-syntax substring-find-next-char-not-of-syntax - substring-find-next-char-of-syntax) + substring-find-next-char-of-syntax + syntax-code-predicate) (export (runtime regular-expression) char-syntax-table/entries) (initialization (initialize-package!))) diff --git a/tests/runtime/test-regsexp.scm b/tests/runtime/test-regsexp.scm index 30b6e9956..b96ee5dfc 100644 --- a/tests/runtime/test-regsexp.scm +++ b/tests/runtime/test-regsexp.scm @@ -615,4 +615,53 @@ USA. ((seq (group x (seq (any-char) (any-char) (any-char) (any-char))) (* (any-char)) (group-ref x)) - ("beriberi" 8 (x 0 4)))))) \ No newline at end of file + ("beriberi" 8 (x 0 4)))))) + +(define-test 're-pattern->regsexp + (map (lambda (entry) + (lambda () + (assert-equal (re-pattern->regsexp (car entry)) + (cadr entry)))) + '(("[\r\n\t ]*(This file must be converted with BinHex.*[\r\n][\r\n\t ]*:" + (seq (* (char-set (9 . 11) 13 32)) + "(This file must be converted with BinHex" + (* (any-char)) + (char-set 10 13) + (* (char-set (9 . 11) 13 32)) + ":")) + + ("^begin +[0-7]+ +.+$" + (seq (line-start) + "begin" + (+ #\space) + (+ (char-set (48 . 56))) + (+ #\space) + (+ (any-char)) + (line-end))) + + ("\\`8859-[0-9]+\\'" + (seq (string-start) "8859-" (+ (char-set (48 . 58))) (string-end))) + + ("\\`0x\\([0-9A-Fa-f][0-9A-Fa-f]\\)\t0x\\([0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f]\\)\t" + (seq (string-start) + "0x" + (group 1 + (seq (char-set (48 . 58) (65 . 71) (97 . 103)) + (char-set (48 . 58) (65 . 71) (97 . 103)))) + "\t0x" + (group + 2 + (seq (char-set (48 . 58) (65 . 71) (97 . 103)) + (char-set (48 . 58) (65 . 71) (97 . 103)) + (char-set (48 . 58) (65 . 71) (97 . 103)) + (char-set (48 . 58) (65 . 71) (97 . 103)))) + "\t")) + + ("\\`\\s *\\(error:\\)?\\s *\\(.*\\)\\s *\\'" + (seq (string-start) + (* (char-syntax #\space)) + (? (group 1 "error:")) + (* (char-syntax #\space)) + (group 2 (* (any-char))) + (* (char-syntax #\space)) + (string-end)))))) \ No newline at end of file -- 2.25.1