From 2b05d8beab7dcb7f75658023f09cc50f299762c0 Mon Sep 17 00:00:00 2001 From: Chris Hanson Date: Mon, 6 Mar 2017 21:52:18 -0800 Subject: [PATCH] Eliminate need for file-attributes parser to use custom parser table. Also add tests for the parser using the conveniently-provided test strings. --- src/runtime/ed-ffi.scm | 2 +- src/runtime/file-attributes.scm | 428 +++++-------------------- src/runtime/make.scm | 2 +- src/runtime/runtime.pkg | 9 +- tests/runtime/test-file-attributes.scm | 57 ++++ 5 files changed, 144 insertions(+), 354 deletions(-) create mode 100644 tests/runtime/test-file-attributes.scm diff --git a/src/runtime/ed-ffi.scm b/src/runtime/ed-ffi.scm index 08dac98ec..c1966d262 100644 --- a/src/runtime/ed-ffi.scm +++ b/src/runtime/ed-ffi.scm @@ -64,7 +64,7 @@ USA. ("error" (runtime error-handler)) ("events" (runtime event-distributor)) ("ffi" (runtime ffi)) - ("file-attributes" (runtime parser file-attributes)) + ("file-attributes" (runtime file-attributes)) ("fileio" (runtime file-i/o-port)) ("fixart" (runtime fixnum-arithmetic)) ("floenv" (runtime floating-point-environment)) diff --git a/src/runtime/file-attributes.scm b/src/runtime/file-attributes.scm index ef2c7b96a..cbd183e8f 100644 --- a/src/runtime/file-attributes.scm +++ b/src/runtime/file-attributes.scm @@ -25,360 +25,92 @@ USA. |# ;;;; File attributes parser -;;; package: (runtime parser file-attributes) +;;; package: (runtime file-attributes) (declare (usual-integrations)) -;;; This code will parse "file attributes line" found in the first -;;; or second line of file and delimited by the special -*- sequence. - -;;; Here are sample attribute lines taken from various files -;;; found in the wild. They won't be parsed because they are not -;;; in the first two lines. - -;;; --------------- -#| -*-Scheme-*- -This file is part of MIT/GNU Scheme. -|# - -#||-*- mode:lisp; - package:(FOOBAR :USE (GLOBAL BAZ) - :SHADOW (CAR CDR CONS)); - base:10 - -*- ||# - -;;; -*- Mode: Java; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- - -;;; -*- Mode: C; tab-width: 4; -*- - -;;; For Emacs: -*- mode:cperl; mode:folding -*- - -;;; -*- Mode:Lisp; Package:XLIB; Syntax:COMMON-LISP; Base:10; Lowercase:T -*- - -;;; -*-mode:C;tab-width:3-*- - -;;; -*-mode:c; c-style:k&r; c-basic-offset:4; -*- - -;;;-*-Mode:LISP;Syntax: Common-Lisp;Package:ib;Base:10-*- - -;;;-*-mode:lisp;parser:read-*- - -;;; -*-Mode:Perl; perl-indent-level:8-*- - -;;; -*-mode:JavaScript;coding:latin-1;-*- Time-stamp: "2006-08-09 16:18:45 ADT" - -;;; -*- Mode: C; indent-tabs-mode:nil; c-basic-offset: 8-*- */ - -;;; -*- coding:utf-8;mode:python;mode:font-lock -*- - -;;; -*- test-case-name: twisted.test.test_htb -*- - -;;; -*- mode: C; c-file-style: "gnu" -*- - -;;;-*- syntax:COMMON-LISP; Package: (ITERATE :use "COMMON-LISP" :colon-mode :external) -*- - -;;; -*- package IDE-ini -*- - -;;; -*- Mode: Emacs-Lisp; outline-regexp: " \n;;;;+" -*- - -;;; It should surprise no one that the following comes from a python file. -;;; -*-*- encoding: utf-8 -*-*- - -;;; --------------- +;;; This code will parse a "file attributes line" found in the first line of a +;;; file and delimited by the special -*- sequence. ;;; The most general case is a series of key/value pairs where the key ;;; is followed by a colon and the pairs are separated or delimited by ;;; semicolons. Whitespace is optional and cannot be relied upon to ;;; delimit the end of a key or a value. - -;;; If the parser used the standard atom delimiters and the system -;;; global parser table, a file attributes line like -;;; "-*-mode:lisp;parser:read-*-" would be interpreted as the symbol -;;; '-*-mode:lisp followed by a comment. We therefore need to run the -;;; parser with modified settings. - -;;; We need two modes. The first is the mode where we are expecting -;;; the key of a key/value pair. Since the key is delimited by a -;;; colon, or by whitespace followed by a colon, we need the colon -;;; character to be atom-delimiter. - -;;; The second mode is when we are reading the value of a key/value -;;; pair. The value is read as an ordinary lisp object. This is -;;; slightly different from the standard settings of the Scheme -;;; reader. - -;;; The actual way we parse the mode line is to stay in the first mode -;;; until we read a colon character. At that point, we switch to the -;;; second mode in order to read a single value and return to the -;;; first mode immediately afterwards. -;; These are the char-sets and parser table for use in the mode where -;; we are parsing anything but a value. (mode 1) -(define char-set/file-attributes-atom-delimiters) -(define char-set/file-attributes-constituents) -(define file-attributes-parser-table) - -(define (parse-file-attributes-item parse port) - ;; Prepare the parser for first mode. - (parameterize* (list (cons param:parser-associate-positions? #f) - (cons param:parser-atom-delimiters - char-set/file-attributes-atom-delimiters) - (cons param:parser-fold-case? #f) - (cons param:parser-constituents - char-set/file-attributes-constituents) - ;; no recursion! - (cons param:parser-enable-attributes? #f) - (cons param:parser-keyword-style #f) - (cons param:parser-radix 10) - (cons param:parser-table file-attributes-parser-table)) - (lambda () - (fluid-let ((*parser-associate-positions?* #!default) - (*parser-atom-delimiters* #!default) - (*parser-canonicalize-symbols?* #!default) - (*parser-constituents* #!default) - (*parser-radix* #!default) - (*parser-table* #!default)) - (parse port system-global-environment))))) - -(define (parse-file-attributes-value parse port) - ;; Prepare the parser for second mode. - (parameterize* (list (cons param:parser-associate-positions? #f) - (cons param:parser-atom-delimiters - char-set/atom-delimiters) - (cons param:parser-fold-case? #f) - (cons param:parser-constituents char-set/constituents) - ;; no recursion! - (cons param:parser-enable-attributes? #f) - ;; enable prefix keywords - (cons param:parser-keyword-style 'prefix) - (cons param:parser-radix 10) - (cons param:parser-table system-global-parser-table)) - (lambda () - (fluid-let ((*parser-associate-positions?* #!default) - (*parser-atom-delimiters* #!default) - (*parser-canonicalize-symbols?* #!default) - (*parser-constituents* #!default) - (*parser-radix* #!default) - (*parser-table* #!default)) - (parse port system-global-environment))))) - -(define (parse-file-attributes-line port db multiline) - (declare (ignore db)) - (tokens->alist - (tokenize-file-attributes-line port multiline))) - -;; If we don't see a COLON or a SEMICOLON often enough, we'll assume -;; that we're confused by an ill-formed attributes line and abandon -;; the parsing. -(define file-attributes-confusion-limit 3) - -(define (tokenize-file-attributes-line port multiline) - (let ((parser (top-level-parser port))) - - (define (tokenize confusion-count tokens) - (if (> confusion-count file-attributes-confusion-limit) - (begin - (warn "Ill-formed file attributes list.") - #f) - - (let ((token (if (and (pair? tokens) - (eq? (car tokens) colon-token)) - (parse-file-attributes-value parser port) - (parse-file-attributes-item parser port)))) - - (cond ((eof-object? token) (if multiline - (error:premature-eof port) - token)) - - ((or (eq? token colon-token) - (eq? token semicolon-token)) - ;; saw a colon or semicolon, we're still on track. - (tokenize 0 (cons token tokens))) - - ((eq? token newline-token) - (if multiline - ;; discard if multiline - (tokenize (+ confusion-count 1) tokens) - ;; If we hit the end of line while parsing a single - ;; line, then the file attributes line is ill-formed. - (begin - (warn "Ill-formed file attributes line.") - #f))) - - ((symbol? token) - (let ((token* (if (null? tokens) - (trim-initial-token token) - token))) - (cond ((not token*) (tokenize confusion-count tokens)) - ((string-suffix? "-*-" (symbol->string token*)) - (let ((token** (trim-final-token token*))) - (if token** - (reverse (cons token** tokens)) - (reverse tokens)))) - (else (tokenize (+ confusion-count 1) - (cons token* tokens)))))) - - (else (tokenize (+ confusion-count 1) - (cons token tokens))))))) - - (tokenize 0 '()))) - -;;; In the case where the file attributes line has spurious *- -;;; characters, and perhaps is not whitespace delimited, these -;;; characters will end up being the first token or prepended to the -;;; first token. Examples: -;;; -*-*- encoding: utf-8 -*-*- -;;; -*-*-*-logrus-*-*-*- -;;; -*-*- coding: latin-1 -*-*- - -(define (trim-initial-token sym) - (if (string-prefix? "*-" (symbol->string sym)) - (do ((token-string (symbol->string sym) (string-tail token-string 2))) - ((not (string-prefix? "*-" token-string)) - (if (zero? (string-length token-string)) - #f - (string->symbol token-string))))) - sym) - -;;; If the final token is a symbol that is not whitespace delimited, -;;; then the end marker will be attached to the token. Furthermore, -;;; if there are spurious -* characters, these will have been attached -;;; as well. Examples: -;;; -*-Scheme-*- -;;; -*-outline-*-*- -(define (trim-final-token sym) - (do ((token-string - (let ((s (symbol->string sym))) - (string-head s (- (string-length s) 3))) - (string-head token-string (- (string-length token-string) 2)))) - ((not (string-suffix? "-*" token-string)) - (if (zero? (string-length token-string)) - #f - (string->symbol token-string))))) - -;;; Given a list of tokens, create an alist of keys and values. -(define (tokens->alist tokens) - - ;; A single token is a mode indicator - (define (parse-mode mode-token) - (list (cons 'MODE mode-token))) - - ;; An attribute consists of a key, colon, value - ;; triplet. The key must be a symbol. - (define (parse-attribute tokens) - (let ((key (car tokens)) - (t1 (cdr tokens))) - (if (or (not (symbol? key)) - (not (pair? t1))) - (ill-formed) - (let ((colon (car t1)) - (t2 (cdr t1))) - (if (or (not (eq? colon colon-token)) - (not (pair? t2))) - (ill-formed) - (let ((value (car t2)) - (t3 (cdr t2))) - (if (not (null? t3)) - (ill-formed) - (cons key value)))))))) - - (define (parse-attributes-alist tokens) - (define (group alist accum tail) - (if (pair? tail) - (let ((token (car tail))) - (if (eq? token semicolon-token) - (let ((entry (parse-attribute (reverse accum)))) - (group (if entry - (cons entry alist) - alist) - '() - (cdr tail))) - (group alist (cons token accum) (cdr tail)))) - (if (null? accum) - (reverse alist) - (reverse (let ((entry (parse-attribute (reverse accum)))) - (if entry - (cons entry alist) - alist)))))) - (group '() '() tokens)) - - (define (ill-formed) - (warn "Ill-formed file attributes list.") - #f) - - (if (pair? tokens) - (cond ((memq semicolon-token tokens) - (parse-attributes-alist tokens)) - - ((memq colon-token tokens) - (list (parse-attribute tokens))) - - ((null? (cdr tokens)) - (parse-mode (car tokens))) - - (else (list tokens))) - #f)) - -(define (initialize-package!) - (let* ((constituents char-set/constituents) - (atom-delimiters - (char-set-union char-set:whitespace - ;; Note that colon is a delimiter! - (string->char-set "()[]{}\":;'`,") - (char-set #\U+00AB #\U+00BB))) - (symbol-leaders - (char-set-difference constituents - (char-set-union atom-delimiters - char-set/number-leaders))) - (special-number-leaders - (string->char-set "bBoOdDxXiIeEsSlL")) - (store-char (lambda (v c h) (vector-set! v (char->integer c) h))) - (store-char-set - (lambda (v c h) - (for-each (lambda (c) (store-char v c h)) - (char-set-members c))))) - (let ((initial (make-vector #x100 #f)) - (special (make-vector #x100 #f))) - (store-char-set initial char-set:whitespace handler:whitespace) - (store-char initial #\newline handler:newline) - (store-char-set initial char-set/number-leaders handler:atom) - (store-char-set initial symbol-leaders handler:symbol) - (store-char-set special special-number-leaders handler:number) - (store-char initial #\( handler:list) - (store-char special #\( handler:vector) - (store-char initial #\) handler:close-parenthesis) - (store-char initial #\: handler:colon) - (store-char initial #\; handler:semicolon) - (store-char initial #\' handler:quote) - (store-char initial #\` handler:quasiquote) - (store-char initial #\, handler:unquote) - (store-char initial #\" handler:string) - (store-char initial #\# handler:special) - (store-char special #\f handler:false) - (store-char special #\F handler:false) - (store-char special #\t handler:true) - (store-char special #\T handler:true) - (store-char special #\\ handler:char) -; (store-char special #\! handler:named-constant) - (set! file-attributes-parser-table (make-parser-table initial special)) - ) - (set! char-set/file-attributes-atom-delimiters atom-delimiters) - (set! char-set/file-attributes-constituents constituents)) - unspecific) - -(define (handler:newline port db ctx char) - (declare (ignore port db ctx char)) - newline-token) - -(define (handler:colon port db ctx char) - (declare (ignore port db ctx char)) - colon-token) - -(define (handler:semicolon port db ctx char) - (declare (ignore port db ctx char)) - semicolon-token) - -(define colon-token (list 'COLON)) -(define newline-token (list 'NEWLINE)) -(define semicolon-token (list 'SEMICOLON)) - +(define (parse-file-attributes-string string) + (let ((start (string-search-forward "-*-" string))) + (and start + (let ((v + (*parse-string parse:attributes-line + (string-slice string start)))) + (and v + (filter-map (lambda (p) + (let ((value + (ignore-errors + (lambda () + (read (open-input-string (cdr p))))))) + (and (not (condition? value)) + (cons (intern (car p)) + value)))) + (vector-ref v 0))))))) + +(define (parse-file-attributes-line port db multiline?) + (declare (ignore db multiline?)) + (parse-file-attributes-string (read-line port))) + +(define parse:attributes-line + (*parser + (encapsulate vector->list + (seq (noise match:leader/trailer) + (noise (* (char-set char-set:whitespace))) + (alt (seq parse:key/value-pair + (* (seq ";" + (noise (* (char-set char-set:whitespace))) + parse:key/value-pair)) + (? (seq ";" + (noise (* (char-set char-set:whitespace)))))) + (encapsulate (lambda (v) + (cons "mode" (vector-ref v 0))) + (seq (match (+ (char-set name-chars))) + (noise (* (char-set char-set:whitespace)))))) + (noise match:leader/trailer) + (noise (* (char-set char-set:unicode))))))) + +(define match:leader/trailer + (*matcher (seq "-*-" (* "*-")))) + +(define parse:key/value-pair + (*parser + (encapsulate (lambda (v) + (cons (vector-ref v 0) + (string-trim (vector-ref v 1)))) + (seq (match (+ (char-set name-chars))) + (noise (* (char-set char-set:whitespace))) + ":" + (match match:value))))) + +(define match:value + (*matcher + (+ (alt (char-set value-chars) + (seq #\- (char-set not-asterisk)) + (seq #\- #\* (char-set not-hyphen)) + (seq #\\ (char-set char-set:unicode)) + (seq #\" + (* (alt (char-set string-chars) + (seq #\\ (char-set char-set:unicode)))) + #\"))))) + +(define-deferred name-chars + (char-set-difference char-set:symbol-constituent (char-set #\:))) + +(define-deferred value-chars + (char-set-difference char-set:unicode (char-set #\; #\" #\\ #\- #\*))) + +(define-deferred not-hyphen + (char-set-difference char-set:unicode (char-set #\-))) + +(define-deferred not-asterisk + (char-set-difference char-set:unicode (char-set #\*))) + +(define-deferred string-chars + (char-set-difference char-set:unicode (char-set #\" #\\))) \ No newline at end of file diff --git a/src/runtime/make.scm b/src/runtime/make.scm index f8a6232bd..e1689ca0f 100644 --- a/src/runtime/make.scm +++ b/src/runtime/make.scm @@ -531,7 +531,7 @@ USA. (RUNTIME NUMBER-PARSER) (RUNTIME OPTIONS) (RUNTIME PARSER) - (RUNTIME PARSER FILE-ATTRIBUTES) + (RUNTIME FILE-ATTRIBUTES) ((RUNTIME PATHNAME) INITIALIZE-PARSER-METHOD!) (RUNTIME UNPARSER) (RUNTIME UNSYNTAXER) diff --git a/src/runtime/runtime.pkg b/src/runtime/runtime.pkg index 06e7c8c5c..72c5fe397 100644 --- a/src/runtime/runtime.pkg +++ b/src/runtime/runtime.pkg @@ -3344,12 +3344,13 @@ USA. repl-environment-value) (initialization (initialize-package!))) -(define-package (runtime parser file-attributes) +(define-package (runtime file-attributes) (files "file-attributes") - (parent (runtime parser)) + (parent (runtime)) + (export (runtime) + parse-file-attributes-string) (export (runtime parser) - parse-file-attributes-line) - (initialization (initialize-package!))) + parse-file-attributes-line)) (define-package (runtime parser-table) (files "partab") diff --git a/tests/runtime/test-file-attributes.scm b/tests/runtime/test-file-attributes.scm new file mode 100644 index 000000000..d1c888fc1 --- /dev/null +++ b/tests/runtime/test-file-attributes.scm @@ -0,0 +1,57 @@ +(define-test 'parse-file-attributes-string + (map + (lambda (p) + (let ((string (car p)) + (expected-value (cdr p))) + (lambda () + (with-test-properties + (lambda () + (assert-equal (parse-file-attributes-string string) + expected-value)) + 'expression `(parse-file-attributes-string ,string))))) + '((" -*-Scheme-*- +This file is part of MIT/GNU Scheme. +" + (mode . scheme)) + ("|-*- mode:lisp; + package:(FOOBAR :USE (GLOBAL BAZ) + :SHADOW (CAR CDR CONS)); + base:10 + -*- |" + (mode . lisp) (package foobar :use (global baz) :shadow (car cdr cons)) (base . 10)) + (" -*- Mode: Java; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*-" + (mode . java) (tab-width . 4) (indent-tabs-mode . nil) (c-basic-offset . 2)) + (" -*- Mode: C; tab-width: 4; -*-" + (mode . c) (tab-width . 4)) + (" For Emacs: -*- mode:cperl; mode:folding -*-" + (mode . cperl) (mode . folding)) + (" -*- Mode:Lisp; Package:XLIB; Syntax:COMMON-LISP; Base:10; Lowercase:T -*-" + (mode . lisp) (package . xlib) (syntax . common-lisp) (base . 10) (lowercase . t)) + (" -*-mode:C;tab-width:3-*-" + (mode . c) (tab-width . 3)) + (" -*-mode:c; c-style:k&r; c-basic-offset:4; -*-" + (mode . c) (c-style . k&r) (c-basic-offset . 4)) + ("-*-Mode:LISP;Syntax: Common-Lisp;Package:ib;Base:10-*-" + (mode . lisp) (syntax . common-lisp) (package . ib) (base . 10)) + ("-*-mode:lisp;parser:read-*-" + (mode . lisp) (parser . read)) + (" -*-Mode:Perl; perl-indent-level:8-*-" + (mode . perl) (perl-indent-level . 8)) + (" -*-mode:JavaScript;coding:latin-1;-*- Time-stamp: \"2006-08-09 16:18:45 ADT\"" + (mode . javascript) (coding . latin-1)) + (" -*- Mode: C; indent-tabs-mode:nil; c-basic-offset: 8-*- */" + (mode . c) (indent-tabs-mode . nil) (c-basic-offset . 8)) + (" -*- coding:utf-8;mode:python;mode:font-lock -*-" + (coding . utf-8) (mode . python) (mode . font-lock)) + (" -*- test-case-name: twisted.test.test_htb -*-" + (test-case-name . twisted.test.test_htb)) + (" -*- mode: C; c-file-style: \"gnu\" -*-" + (mode . c) (c-file-style . "gnu")) + ("-*- syntax:COMMON-LISP; Package: (ITERATE :use \"COMMON-LISP\" :colon-mode :external) -*-" + (syntax . common-lisp) (package iterate :use "COMMON-LISP" :colon-mode :external)) + (" -*- package IDE-ini -*-" + . #f) + (" -*- Mode: Emacs-Lisp; outline-regexp: \" \\n;;;;+\" -*-" + (mode . emacs-lisp) (outline-regexp . " \n;;;;+")) + (" -*-*- encoding: utf-8 -*-*-" + (encoding . utf-8))))) -- 2.25.1