From: Chris Hanson Date: Sat, 10 Jun 2006 04:43:09 +0000 (+0000) Subject: Add parsing support for RDF/Turtle. X-Git-Tag: 20090517-FFI~1023 X-Git-Url: https://birchwood-abbey.net/git?a=commitdiff_plain;h=807a842c72a816b1d406baa00458f768ded07ec3;p=mit-scheme.git Add parsing support for RDF/Turtle. --- diff --git a/v7/src/xml/compile.scm b/v7/src/xml/compile.scm index 35527bf53..ea2e9b2fa 100644 --- a/v7/src/xml/compile.scm +++ b/v7/src/xml/compile.scm @@ -1,6 +1,6 @@ #| -*-Scheme-*- -$Id: compile.scm,v 1.17 2006/02/18 04:31:34 cph Exp $ +$Id: compile.scm,v 1.18 2006/06/10 04:42:43 cph Exp $ Copyright 2001,2003,2004,2005,2006 Massachusetts Institute of Technology @@ -47,5 +47,6 @@ USA. "xhtml" "xhtml-entities" "rdf-struct" - "rdf-nt")))) + "rdf-nt" + "turtle")))) (cref/generate-constructors "xml" 'ALL))) \ No newline at end of file diff --git a/v7/src/xml/load.scm b/v7/src/xml/load.scm index 6e0364abb..0ba65183a 100644 --- a/v7/src/xml/load.scm +++ b/v7/src/xml/load.scm @@ -1,6 +1,6 @@ #| -*-Scheme-*- -$Id: load.scm,v 1.16 2006/02/18 04:31:42 cph Exp $ +$Id: load.scm,v 1.17 2006/06/10 04:42:50 cph Exp $ Copyright 2001,2002,2003,2004,2006 Massachusetts Institute of Technology @@ -28,4 +28,4 @@ USA. (with-working-directory-pathname (directory-pathname (current-load-pathname)) (lambda () (load-package-set "xml"))) -(add-subsystem-identification! "XML" '(0 8)) \ No newline at end of file +(add-subsystem-identification! "XML" '(1 0)) \ No newline at end of file diff --git a/v7/src/xml/test-turtle.scm b/v7/src/xml/test-turtle.scm new file mode 100644 index 000000000..326feeaa0 --- /dev/null +++ b/v7/src/xml/test-turtle.scm @@ -0,0 +1,75 @@ +#| -*-Scheme-*- + +$Id: test-turtle.scm,v 1.1 2006/06/10 04:42:55 cph Exp $ + +Copyright 2006 Massachusetts Institute of Technology + +This file is part of MIT/GNU Scheme. + +MIT/GNU Scheme is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or (at +your option) any later version. + +MIT/GNU Scheme is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +General Public License for more details. + +You should have received a copy of the GNU General Public License +along with MIT/GNU Scheme; if not, write to the Free Software +Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, +USA. + +|# + +(define (run-turtle-tests #!optional root) + (fresh-line) + (let ((base-uri + (string->uri "http://www.w3.org/2001/sw/DataAccess/df1/tests/")) + (root + (if (default-object? root) + "~/xml/turtle-tests/" + root))) + (let ((run-test + (lambda (name) + (write-string name) + (write-string ":") + (do ((i (string-length name) (+ i 1))) + ((not (< i 24)) unspecific) + (write-char #\space)) + (let ((pathname (merge-pathnames name root))) + (let ((c + (ignore-errors + (lambda () + (read-turtle-file pathname base-uri))))) + (if (condition? c) + (write-condition-report c (current-output-port)) + (begin + (write-string "Parsed") + #| + (call-with-output-file + (pathname-new-type pathname "foo") + (lambda (port) + (for-each (lambda (triple) + (write-rdf/nt triple port)) + c))) + |# + )))) + (newline)))) + (do ((i 0 (+ i 1))) + ((not (<= i 25)) unspecific) + (if (not (or (= i 14))) ;test-14 runs out of memory + (run-test (string-append "test-" + (if (< i 10) "0" "") + (number->string i) + ".ttl")))) + (run-test "rdf-schema.ttl") + (run-test "rdfs-namespace.ttl") + (run-test "rdfq-results.ttl") + (do ((i 0 (+ i 1))) + ((not (<= i 14)) unspecific) + (run-test (string-append "bad-" + (if (< i 10) "0" "") + (number->string i) + ".ttl")))))) \ No newline at end of file diff --git a/v7/src/xml/turtle.scm b/v7/src/xml/turtle.scm new file mode 100644 index 000000000..0d4ff46a1 --- /dev/null +++ b/v7/src/xml/turtle.scm @@ -0,0 +1,566 @@ +#| -*-Scheme-*- + +$Id: turtle.scm,v 1.1 2006/06/10 04:43:02 cph Exp $ + +Copyright 2006 Massachusetts Institute of Technology + +This file is part of MIT/GNU Scheme. + +MIT/GNU Scheme is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or (at +your option) any later version. + +MIT/GNU Scheme is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +General Public License for more details. + +You should have received a copy of the GNU General Public License +along with MIT/GNU Scheme; if not, write to the Free Software +Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, +USA. + +|# + +;;;; Parser for RDF/Turtle + +(declare (usual-integrations)) + +(define (read-rdf/turtle-file pathname #!optional base-uri) + (let ((pathname (pathname-default-type pathname "ttl"))) + (call-with-input-file pathname + (lambda (port) + (post-process-parser-output + (parse-turtle-doc (input-port->parser-buffer port)) + (if (default-object? base-uri) + (pathname->uri pathname) + (merge-uris (file-namestring pathname) + (->absolute-uri base-uri 'read-turtle-file)))))))) + +(define (parse-turtle-doc buffer) + (parse:ws* buffer) + (discard-parser-buffer-head! buffer) + (let loop ((items '())) + (if (peek-parser-buffer-char buffer) + (let ((v + (or (parse:directive buffer) + (parse:triples buffer) + (parser-buffer-error buffer "Expected subject")))) + (parse:ws* buffer) + (if (not (match-parser-buffer-char buffer #\.)) + (parser-buffer-error buffer "Expected dot")) + (parse:ws* buffer) + (loop (cons (vector-ref v 0) items))) + (reverse! items)))) + +(define parse:directive + (*parser + (encapsulate (lambda (v) (cons 'prefix (vector->list v))) + (seq "@" + (alt (seq "prefix" parse:ws+) + (error #f "Unknown directive name")) + (alt (seq (alt parse:prefix-name (values #f)) + ":" + parse:ws+) + (error #f "Expected prefix name")) + (alt parse:uriref + (error #f "Expected URI reference")))))) + +(define parse:triples + (*parser + (encapsulate (lambda (v) + (cons* 'triples + (vector-ref v 0) + (vector-ref v 1))) + (seq parse:subject + parse:ws+ + parse:predicate-object-list)))) + +(define parse:predicate-object-list + (*parser + (encapsulate vector->list + (seq parse:predicate-object-list-1 + (* (seq parse:ws* + ";" + parse:ws* + parse:predicate-object-list-1)) + (? (seq parse:ws* + ";")))))) + +(define parse:predicate-object-list-1 + (*parser + (encapsulate (lambda (v) + (cons (vector-ref v 0) + (vector-ref v 1))) + (seq (alt parse:resource + (map (lambda (v) v rdf:type) + (match "a"))) + (alt parse:ws+ + (error #f "Expected whitespace")) + (encapsulate vector->list + (seq parse:object-required + (* (seq parse:ws* + "," + parse:ws* + parse:object-required)))))))) + +(define rdf:type + (string->uri "http://www.w3.org/1999/02/22-rdf-syntax-ns#type")) + +(define parse:subject + (*parser (alt parse:resource parse:blank))) + +(define parse:object-required + (*parser + (alt parse:object + (error #f "Expected object")))) + +(define parse:object + (*parser (alt parse:resource parse:blank parse:literal))) + +(define parse:resource + (*parser + (alt (map string->uri parse:uriref) + (encapsulate (lambda (v) (cons 'qname (vector->list v))) + (seq (alt parse:prefix-name (values #f)) + ":" + (alt parse:name (values #f))))))) + +(define parse:blank + (*parser + (with-pointer p + (alt (map make-rdf-bnode + (seq "_:" parse:name)) + ;; This notation should probably accept whitespace between the + ;; brackets, but the spec is written like this: + (encapsulate (lambda (v) v (make-rdf-bnode)) + "[]") + (map (lambda (pols) (cons 'blank-node pols)) + (seq "[" + parse:ws* + parse:predicate-object-list + parse:ws* + (alt "]" (error p "Malformed blank node")))) + (encapsulate (lambda (v) (cons 'collection (vector->list v))) + (seq "(" + parse:ws* + (alt ")" + (seq parse:object-required + (* (seq parse:ws+ parse:object)) + parse:ws* + (alt ")" + (error #f + "Expected close parenthesis")))))))))) + +(define parse:name + (*parser + (match (seq (alphabet alphabet:name-start-char) + (* (alphabet alphabet:name-char)))))) + +(define parse:prefix-name + (*parser + (match (seq (alphabet alphabet:prefix-name-start-char) + (* (alphabet alphabet:name-char)))))) + +;;;; Literals + +(define parse:literal + (*parser + (alt (encapsulate (lambda (v) + (let ((string (vector-ref v 0)) + (type (vector-ref v 1))) + (if (or (not type) + (symbol? type) + (absolute-uri? type)) + (make-rdf-literal string type) + (list 'typed-literal string type)))) + (seq (alt parse:long-string parse:string) + (alt (seq "@" + (alt parse:language + (error #f "Malformed language"))) + (seq "^^" + (alt parse:resource + (error #f "Malformed datatype"))) + (values #f)))) + parse:double + parse:decimal + parse:integer + parse:boolean))) + +(define parse:language + (*parser + (map utf8-string->symbol + (match (seq (+ (char-set char-set:turtle-lower)) + (* (seq "-" + (+ (char-set char-set:turtle-lower+digit))))))))) + +(define parse:integer + (*parser + (map (lambda (s) + (make-rdf-literal + (if (char=? (string-ref s 0) #\-) + s + (let ((end (string-length s))) + (let loop ((i (if (char=? (string-ref s 0) #\+) 1 0))) + (if (and (fix:< i end) (char=? (string-ref s i) #\0)) + (loop (fix:+ i 1)) + (if (fix:= i 0) + s + (string-tail s i)))))) + xsd:integer)) + (match (seq (? (alt "-" "+")) + (+ (char-set char-set:turtle-digit))))))) + +(define xsd:integer + (string->uri "http://www.w3.org/2001/XMLSchema#integer")) + +(define parse:double + (let ((match:exponent + (*matcher + (seq (alt "e" "E") + (? (alt "-" "+")) + (+ (char-set char-set:turtle-digit)))))) + (*parser + (map (lambda (s) (make-rdf-literal s xsd:double)) + (match (seq (? (alt "-" "+")) + (alt (seq (+ (char-set char-set:turtle-digit)) + "." + (* (char-set char-set:turtle-digit)) + match:exponent) + (seq "." + (+ (char-set char-set:turtle-digit)) + match:exponent) + (seq (+ (char-set char-set:turtle-digit)) + match:exponent)))))))) + +(define xsd:double + (string->uri "http://www.w3.org/2001/XMLSchema#double")) + +(define parse:decimal + (*parser + (map (lambda (s) (make-rdf-literal s xsd:decimal)) + (match (seq (? (alt "-" "+")) + (alt (seq (+ (char-set char-set:turtle-digit)) + "." + (* (char-set char-set:turtle-digit))) + (seq "." + (+ (char-set char-set:turtle-digit))) + ;;(+ (char-set char-set:turtle-digit)) + )))))) + +(define xsd:decimal + (string->uri "http://www.w3.org/2001/XMLSchema#decimal")) + +(define parse:boolean + (*parser + (map (lambda (s) (make-rdf-literal s xsd:boolean)) + (alt "true" "false")))) + +(define xsd:boolean + (string->uri "http://www.w3.org/2001/XMLSchema#boolean")) + +;;;; Alphabets + +(define char-set:turtle-hex + (string->char-set "0123456789ABCDEF")) + +(define char-set:turtle-digit + (string->char-set "0123456789")) + +(define char-set:turtle-ws + (string->char-set "\t\n\r ")) + +(define char-set:turtle-lower + (string->char-set "abcdefghijklmnopqrstuvwxyz")) + +(define char-set:turtle-lower+digit + (char-set-union char-set:turtle-lower + char-set:turtle-digit)) + +(define alphabet:name-start-char + (code-points->alphabet + '((#x0041 . #x005A) + #x005F + (#x0061 . #x007A) + (#x00C0 . #x00D6) + (#x00D8 . #x00F6) + (#x00F8 . #x02FF) + (#x0370 . #x037D) + (#x037F . #x1FFF) + (#x200C . #x200D) + (#x2070 . #x218F) + (#x2C00 . #x2FEF) + (#x3001 . #xD7FF) + (#xF900 . #xFDCF) + (#xFDF0 . #xFFFD) + (#x10000 . #xEFFFF)))) + +(define alphabet:name-char + (alphabet+ alphabet:name-start-char + (code-points->alphabet + '(#x002d + (#x0030 . #x0039) + #x00B7 + (#x0300 . #x036F) + (#x203F . #x2040))))) + +(define alphabet:prefix-name-start-char + (alphabet- alphabet:name-start-char (alphabet #\_))) + +(define alphabet:character + (code-points->alphabet '((#x20 . #x5B) (#x5D . #x10FFFF)))) + +(define alphabet:ucharacter + (alphabet- alphabet:character (alphabet #\>))) + +(define alphabet:scharacter + (alphabet- alphabet:character (alphabet #\"))) + +(define alphabet:lcharacter + (alphabet+ alphabet:character (alphabet #\tab #\newline #\return))) + +;;;; Escaped strings + +(define (delimited-region-parser name start-delim end-delim + alphabet parse-escapes) + (lambda (buffer) + (let ((output (open-output-string)) + (start (get-parser-buffer-pointer buffer))) + + (define (read-head) + (let ((p (get-parser-buffer-pointer buffer))) + (let loop () + (cond ((match-parser-buffer-string-no-advance buffer end-delim) + (copy p) + (match-parser-buffer-string buffer end-delim) + (finish)) + ((match-parser-buffer-char-in-alphabet buffer alphabet) + (loop)) + ((match-parser-buffer-char-no-advance buffer #\\) + (copy p) + (let ((p (get-parser-buffer-pointer buffer))) + (read-parser-buffer-char buffer) + (read-escape p))) + (else + (let ((c (peek-parser-buffer-char buffer))) + (if c + (parser-buffer-error buffer + (emsg "Illegal character") + c) + (parser-buffer-error start + (emsg "Unexpected EOF"))))))))) + + (define (read-escape p) + (let ((v (parse-escapes buffer))) + (if v + (begin + (write-char (vector-ref v 0) output) + (read-head)) + (parser-buffer-error p (emsg "Malformed string escape"))))) + + (define (emsg msg) + (string-append msg " in " name)) + + (define (copy p) + (call-with-parser-buffer-tail buffer p + (lambda (string start end) + (write-substring string start end output)))) + + (define (finish) + (vector (get-output-string output))) + + (port/set-coding output 'utf-8) + (and (match-parser-buffer-string buffer start-delim) + (read-head))))) + +(define (simple-escape-parser . chars) + (lambda (buffer) + (let loop ((chars chars)) + (and (pair? chars) + (if (match-parser-buffer-char buffer (car chars)) + (vector (cadr chars)) + (loop (cddr chars))))))) + +(define parse:scharacter-escape + (let ((parse-simple + (simple-escape-parser #\\ #\\ + #\" #\" + #\t #\tab + #\n #\newline + #\r #\return))) + (*parser + (alt parse-simple + parse:unicode-escape)))) + +(define parse:ucharacter-escape + (let ((parse-simple + (simple-escape-parser #\\ #\\ + #\> #\>))) + (*parser + (alt parse-simple + parse:unicode-escape)))) + +(define parse:unicode-escape + (*parser + (map (lambda (s) (integer->char (string->number s 16 #t))) + (alt (seq "u" (match (n*n 4 (char-set char-set:turtle-hex)))) + (seq "U" (match (n*n 8 (char-set char-set:turtle-hex)))))))) + +(define parse:string + (delimited-region-parser "string" + "\"" + "\"" + alphabet:scharacter + parse:scharacter-escape)) + +(define parse:long-string + (delimited-region-parser "long string" + "\"\"\"" + "\"\"\"" + alphabet:lcharacter + parse:scharacter-escape)) + +(define parse:uriref + (delimited-region-parser "URI reference" + "<" + ">" + alphabet:ucharacter + parse:ucharacter-escape)) + +;;;; Whitespace + +(define parse:ws* + (*parser (noise (* match:ws)))) + +(define parse:ws+ + (*parser (noise (+ match:ws)))) + +(define match:ws + (*matcher + (alt (char-set char-set:turtle-ws) + match:comment))) + +(define match:comment + (*matcher + (seq "#" + (sexp (lambda (buffer) + (let loop () + (if (let ((char (read-parser-buffer-char buffer))) + (and char + (not (char=? char #\newline)))) + (loop))) + #t))))) + +;;;; Post-processing + +;;; This code does prefix expansion and URI merging. + +(define (post-process-parser-output stmts base-uri) + (let ((prefixes + (map (lambda (p) + (cons (cadr p) + (uri->string (merge-uris (caddr p) base-uri)))) + (keep-matching-items stmts + (lambda (stmt) + (eq? (car stmt) 'prefix)))))) + (append-map! (lambda (stmt) + (case (car stmt) + ((triples) + (post-process-triples (cadr stmt) + (cddr stmt) + prefixes + base-uri)) + ((prefix) '()) + (else (error "Unknown statement:" stmt)))) + stmts))) + +(define (post-process-triples subject pols prefixes base-uri) + (receive (subject triples) (post-process-resource subject prefixes base-uri) + (append! triples + (post-process-pols subject + pols + prefixes + base-uri)))) + +(define (post-process-pols subject pols prefixes base-uri) + (append-map! (lambda (pol) + (receive (predicate triples) + (post-process-resource (car pol) prefixes base-uri) + (append! triples + (append-map! (lambda (object) + (receive (object triples) + (post-process-resource object + prefixes + base-uri) + (cons (make-rdf-triple subject + predicate + object) + triples))) + (cdr pol))))) + pols)) + +(define (post-process-resource resource prefixes base-uri) + (cond ((or (absolute-uri? resource) + (rdf-bnode? resource) + (rdf-literal? resource)) + (values resource '())) + ((relative-uri? resource) + (values (merge-uris resource base-uri) '())) + ((pair? resource) + (case (car resource) + ((qname) + (values (post-process-qname (cadr resource) + (caddr resource) + prefixes) + '())) + ((blank-node) + (let ((s (make-rdf-bnode))) + (values s + (post-process-pols s (cdr resource) prefixes base-uri)))) + ((typed-literal) + (receive (uri triples) + (post-process-resource (caddr resource) prefixes base-uri) + (values (make-rdf-literal (cadr resource) uri) + triples))) + ((collection) + (post-process-collection (cdr resource) prefixes base-uri)) + (else + (error "Unknown resource:" resource)))) + (else + (error "Unknown resource:" resource)))) + +(define (post-process-qname prefix local prefixes) + (string->uri + (string-append (cdr + (or (find-matching-item prefixes + (if prefix + (lambda (p) + (and (string? (car p)) + (string=? (car p) prefix))) + (lambda (p) + (not (car p))))) + (error "Unknown prefix:" prefix))) + (or local "")))) + +(define (post-process-collection resources prefixes base-uri) + (if (pair? resources) + (receive (first triples) + (post-process-resource (car resources) prefixes base-uri) + (receive (rest triples*) + (post-process-collection (cdr resources) prefixes base-uri) + (let ((p (make-rdf-bnode))) + (values p + (cons* (make-rdf-triple p rdf:first first) + (make-rdf-triple p rdf:rest rest) + (append! triples triples*)))))) + (values rdf:nil '()))) + +(define rdf:nil + (string->uri "http://www.w3.org/1999/02/22-rdf-syntax-ns#nil")) + +(define rdf:first + (string->uri "http://www.w3.org/1999/02/22-rdf-syntax-ns#first")) + +(define rdf:rest + (string->uri "http://www.w3.org/1999/02/22-rdf-syntax-ns#rest")) \ No newline at end of file diff --git a/v7/src/xml/xml.pkg b/v7/src/xml/xml.pkg index bbd6c1520..03118d018 100644 --- a/v7/src/xml/xml.pkg +++ b/v7/src/xml/xml.pkg @@ -1,6 +1,6 @@ #| -*-Scheme-*- -$Id: xml.pkg,v 1.68 2006/06/10 03:52:00 cph Exp $ +$Id: xml.pkg,v 1.69 2006/06/10 04:43:09 cph Exp $ Copyright 2001,2002,2003,2004,2005,2006 Massachusetts Institute of Technology @@ -572,4 +572,10 @@ USA. read-rdf/nt-file rdf/nt-file->source write-rdf/nt - write-rdf/nt-file)) \ No newline at end of file + write-rdf/nt-file)) + +(define-package (runtime rdf turtle) + (files "turtle") + (parent (runtime rdf)) + (export () + read-rdf/turtle-file)) \ No newline at end of file