From ac4fbf99076be17c3c3e76c2336a8b7702604919 Mon Sep 17 00:00:00 2001 From: Chris Hanson Date: Sat, 30 Nov 2019 00:08:01 -0800 Subject: [PATCH] Rewrite the NFA regexp interpreter into epsilon/matcher phases. This greatly simplifies the interpreter's operation, and isolates the state elision in the epsilon phase where it belongs. Also added hash-consing of states, so that we can use eq? to compare them. --- src/runtime/regexp-nfa.scm | 347 +++++++++++++++++++------------------ src/runtime/runtime.pkg | 2 +- src/runtime/srfi-115.scm | 2 +- 3 files changed, 178 insertions(+), 173 deletions(-) diff --git a/src/runtime/regexp-nfa.scm b/src/runtime/regexp-nfa.scm index ebed38cf2..4326fa040 100644 --- a/src/runtime/regexp-nfa.scm +++ b/src/runtime/regexp-nfa.scm @@ -31,28 +31,33 @@ USA. ;;;; Compiler -(define (generate-matcher thunk) - (parameterize ((shared-state (make-shared-state))) +(define (compile-matcher thunk) + (parameterize ((compiler-shared-state (make-compiler-shared-state))) (let ((insn (thunk))) - (make-matcher (link-insn insn (terminal-node)))))) + (let ((initial-node (link-insn insn (terminal-node)))) + (make-matcher initial-node + (number-of-nodes) + (number-of-groups)))))) -;; This structure is overkill for now but allows adding additional information -;; from the compiler that can be used to make interpretation more efficient. (define-record-type - (make-matcher initial-node) + (make-matcher initial-node n-nodes n-groups) matcher? - (initial-node matcher-initial-node)) + (initial-node matcher-initial-node) + (n-nodes matcher-n-nodes) + (n-groups matcher-n-groups)) -(define shared-state +(define compiler-shared-state (make-parameter #f)) -(define-record-type - (%make-shared-state group-indices) - shared-state? +(define-record-type + (%make-compiler-shared-state node-indices group-indices) + compiler-shared-state? + (node-indices node-indices) (group-indices group-indices)) -(define (make-shared-state) - (%make-shared-state (make-index-generator 1))) +(define (make-compiler-shared-state) + (%make-compiler-shared-state (node-index-generator) + (group-index-generator))) (define (make-index-generator n) (lambda () @@ -60,8 +65,23 @@ USA. (set! n (fix:+ n 1)) n*))) +(define (node-index-generator) + (make-index-generator 0)) + +(define (next-node-index) + ((node-indices (compiler-shared-state)))) + +(define (number-of-nodes) + (next-node-index)) + +(define (group-index-generator) + (make-index-generator 1)) + (define (next-group-index) - ((group-indices (shared-state)))) + ((group-indices (compiler-shared-state)))) + +(define (number-of-groups) + (- (next-group-index) 1)) ;;;; Instructions @@ -107,18 +127,21 @@ USA. ;;;; Nodes (define-record-type - (%make-node type id datum next) + (%make-node type index id datum next) node? (type node-type) + (index node-index) (id %node-id) (datum node-datum) (next node-next %set-node-next!)) (define (make-node type id datum nodes) - (%make-node type id datum nodes)) + (%make-node type (next-node-index) id datum nodes)) (define (node-id node) - (cons (node-type node) (%node-id node))) + (cons* (node-type node) + (node-index node) + (%node-id node))) (define-print-method node? (standard-print-method 'node node-id)) @@ -336,84 +359,30 @@ USA. ;;;; Interpreter (define (run-matcher matcher string start end) + (parameterize ((run-shared-state (make-run-shared-state matcher))) + (let ((initial + (make-state (matcher-initial-node matcher) + (initial-ctx start)))) + (trace-matcher (lambda (port) (write (list 'initial-state initial) port))) + (let ((final (match-nodes initial string start end))) + (trace-matcher (lambda (port) (write (list 'final-state final) port))) + (and final + (all-groups string start (state-ctx final))))))) + +(define run-shared-state + (make-parameter #f)) - (define (finish state) - (trace-matcher (lambda (port) (write (list 'success state) port))) - (all-groups string start (state-ctx state))) - - (let per-index - ((states - (append-state (make-state (matcher-initial-node matcher) - (initial-ctx start)) - (make-state-set))) - (index start) - (prev-char #f)) - (trace-matcher (lambda (port) (pp (cons index (all-elts states)) port))) - (cond ((no-elts? states) - #f) - ((let ((state (first-elt states))) - (and (terminal-state? state) - state)) - => finish) - ((fix:< index end) - (let ((next-char (string-ref string index))) - (per-index (interpret-states states next-char prev-char) - (fix:+ index 1) - next-char))) - (else - (let ((state - (find terminal-state? - (all-elts (interpret-states states #f prev-char))))) - (and state - (finish state))))))) - -(define (interpret-states states next-char prev-char) +(define-record-type + (%make-run-shared-state state-memoizer) + run-shared-state? + (state-memoizer %state-memoizer)) - (define (loop inputs outputs) - (if (no-elts? inputs) - outputs - (interpret-state (first-elt inputs) (rest-elts inputs) outputs))) +(define (make-run-shared-state matcher) + (%make-run-shared-state (make-state-memoizer matcher))) - (define (interpret-state state inputs outputs) - (trace-matcher (lambda (port) (write state port))) - (let ((node (state-node state)) - (ctx (state-ctx state))) - (case (node-type node) - ((lookaround) - (if ((node-datum node) next-char prev-char) - (continue node ctx inputs outputs) - (loop inputs outputs))) - ((ctx-only) - (continue node ((node-datum node) ctx) inputs outputs)) - ((char) - (match char=? node ctx inputs outputs)) - ((char-ci) - (match char-ci=? node ctx inputs outputs)) - ((char-set) - (match char-set-contains? node ctx inputs outputs)) - ((fail) - (loop inputs outputs)) - ((terminal) - (loop inputs (append-state state outputs))) - (else - (error "Unknown node type:" node))))) - - (define (continue node ctx inputs outputs) - (let* ((next (node-next node)) - (state (make-state next ctx))) - (if (fork-node? next) - (loop (prepend-state state inputs) outputs) - (interpret-state state inputs outputs)))) +(define (state-memoizer) + (%state-memoizer (run-shared-state))) - (define (match pred node ctx inputs outputs) - (if (and next-char (pred (node-datum node) next-char)) - (loop inputs - (append-state (make-state (node-next node) (++index ctx)) - outputs)) - (loop inputs outputs))) - - (loop states (make-state-set))) - (define (succeed next-node ctx) (make-state next-node ctx)) @@ -429,100 +398,136 @@ USA. (define param:trace-regexp-nfa? (make-settable-parameter #f)) -(define-record-type - (make-state node ctx) - state? - (node state-node) - (ctx state-ctx)) +(define (match-nodes initial-state string start end) + (let loop ((states (list initial-state)) (index start) (prev-char #f)) + (trace-matcher (lambda (port) (write (cons* 'index index states) port))) + (let ((next-char + (and (fix:< index end) + (string-ref string index)))) + (let ((states (follow-epsilons states next-char prev-char))) + (cond ((not (pair? states)) #f) + ((terminal-state? (car states)) (car states)) + (else + (let ((states (follow-matchers states next-char))) + (if next-char + (loop states (fix:+ index 1) next-char) + (let ((states + (follow-epsilons states next-char prev-char))) + (and (pair? states) + ;; If a terminal state is present, it's always last. + ;; The follow-X procedures guarantee this. + (let ((state (last states))) + (and (terminal-state? state) + state)))))))))))) + +(define (follow-epsilons states next-char prev-char) + (trace-matcher (lambda (port) (pp (cons 'follow-epsilons states) port))) + (let ((seen (make-strong-eq-hash-table))) + + (define (loop inputs outputs) + (if (pair? inputs) + (follow-state (car inputs) (cdr inputs) outputs) + (reverse! outputs))) + + (define (follow-state state inputs outputs) + (if (seen? state) + (loop inputs outputs) + (let ((node (state-node state)) + (ctx (state-ctx state))) + (trace-matcher (lambda (port) (write state port))) + (case (node-type node) + ((fork) + (loop (fold-right (lambda (node* inputs) + (cons (make-state node* ctx) + inputs)) + inputs + (node-next node)) + outputs)) + ((lookaround) + (if ((node-datum node) next-char prev-char) + (follow-state (make-state (node-next node) ctx) + inputs + outputs) + (loop inputs outputs))) + ((ctx-only) + (follow-state (make-state (node-next node) + ((node-datum node) ctx)) + inputs + outputs)) + ((fail) (loop inputs outputs)) + ((terminal) (reverse! (cons state outputs))) + (else (loop inputs (cons state outputs))))))) -(define-print-method state? - (standard-print-method 'state - (lambda (state) - (cons (hash-object (state-node state)) - (node-id (state-node state)))))) + (define (seen? state) + (let ((p (hash-table-intern! seen state (lambda () (list #f))))) + (if (car p) + #t + (begin + (set-car! p #t) + #f)))) -(define (terminal-state? state) - (terminal-node? (state-node state))) + (loop states '()))) -(define (fork-state? state) - (fork-node? (state-node state))) +(define (follow-matchers states next-char) + (trace-matcher (lambda (port) (pp (cons 'follow-matchers states) port))) -(define (fork-state-threads state) - (map (let ((ctx (state-ctx state))) - (lambda (node) - (make-state node ctx))) - (node-next (state-node state)))) - -(define (make-state-set) - (let loop ((seen '()) (states '())) - - (define (add-to-end state) - (if (fork-state? state) - (if (seen? state) - this - (fold append-state - (loop (cons state seen) states) - (fork-state-threads state))) - (if (seen? state) - this - (loop (cons state seen) - (cons state states))))) - - (define (add-to-start state) - (if (fork-state? state) - (if (seen? state) - this - (fold-right prepend-state - (loop (cons state seen) states) - (fork-state-threads state))) - (if (seen? state) - this - (loop (cons state seen) - (append states (list state)))))) + (define (loop inputs outputs) + (if (pair? inputs) + (follow-state (car inputs) (cdr inputs) outputs) + (reverse! outputs))) - (define (seen? state) - (any (lambda (state*) - (state=? state state*)) - seen)) + (define (follow-state state inputs outputs) + (trace-matcher (lambda (port) (write state port))) + (let ((node (state-node state)) + (ctx (state-ctx state))) + (case (node-type node) + ((char) (match char=? node ctx inputs outputs)) + ((char-ci) (match char-ci=? node ctx inputs outputs)) + ((char-set) (match char-set-contains? node ctx inputs outputs)) + ((fail) (loop inputs outputs)) + ((terminal) (reverse! (cons state outputs))) + (else (error "Unknown node type:" node))))) - (define (empty?) - (null? states)) + (define (match pred node ctx inputs outputs) + (loop inputs + (if (and next-char (pred (node-datum node) next-char)) + (cons (make-state (node-next node) (++index ctx)) outputs) + outputs))) - (define (first) - (last states)) + (loop states '())) + +;;;; States - (define (rest) - (loop seen (except-last-pair states))) +(define-record-type + (%make-state node ctx) + state? + (node state-node) + (ctx state-ctx)) - (define (all) - (reverse states)) +(define (make-state node ctx) + (let ((memoizer (state-memoizer))) + (let ((states (vector-ref memoizer (node-index node)))) + (or (find (lambda (state) + (eq? ctx (state-ctx state))) + states) + (let ((state (%make-state node ctx))) + (vector-set! memoizer (node-index node) (cons state states)) + state))))) - (define this - (%make-state-set add-to-end add-to-start empty? first rest all)) +(define (make-state-memoizer matcher) + (make-vector (matcher-n-nodes matcher) '())) - this)) +(define (state=? s1 s2) + (eq? s1 s2)) -(define-record-type - (%make-state-set append prepend empty? first rest all) - state-set? - (append %state-set-append) - (prepend %state-set-prepend) - (empty? %state-set-empty?) - (first %state-set-first) - (rest %state-set-rest) - (all %state-set-all)) +(define-print-method state? + (standard-print-method 'state + (lambda (state) + (node-id (state-node state))))) -(define (append-state state states) ((%state-set-append states) state)) -(define (prepend-state state states) ((%state-set-prepend states) state)) -(define (no-elts? states) ((%state-set-empty? states))) -(define (first-elt states) ((%state-set-first states))) -(define (rest-elts states) ((%state-set-rest states))) -(define (all-elts states) ((%state-set-all states))) +(define (terminal-state? state) + (terminal-node? (state-node state))) -(define (state=? s1 s2) - (and (eq? (state-node s1) (state-node s2)) - (eq? (state-ctx s1) (state-ctx s2)))) - ;;;; Context (define-record-type diff --git a/src/runtime/runtime.pkg b/src/runtime/runtime.pkg index 21aa4cce3..5a6cf9f3f 100644 --- a/src/runtime/runtime.pkg +++ b/src/runtime/runtime.pkg @@ -5571,7 +5571,7 @@ USA. (files "regexp-nfa") (parent (runtime)) (export (runtime regexp srfi-115) - generate-matcher + compile-matcher group-end group-key group-start diff --git a/src/runtime/srfi-115.scm b/src/runtime/srfi-115.scm index 0f4d395e6..11a66e886 100644 --- a/src/runtime/srfi-115.scm +++ b/src/runtime/srfi-115.scm @@ -44,7 +44,7 @@ USA. (make-regexp (parameterize ((%input-pattern sre) (%submatch-next 1)) - (generate-matcher + (compile-matcher (lambda () (compile-sre sre)))))) -- 2.25.1