From: Chris Hanson Date: Tue, 24 May 2005 04:50:50 +0000 (+0000) Subject: Complete rewrite of URL support. New design implements generic codec X-Git-Tag: 20090517-FFI~1308 X-Git-Url: https://birchwood-abbey.net/git?a=commitdiff_plain;h=47f4fe1a17526bc8f302b9a4d1bb46ac327d1a00;p=mit-scheme.git Complete rewrite of URL support. New design implements generic codec support for URIs as defined in RFC 2396, which is both more general and easier to use than the old design. All names have been changed to use the string "uri" rather than "url". A minimal number of URL procedures has been retained to support IMAIL until it is rewritten to use the new design. The package has been renamed to '(runtime uri). --- diff --git a/v7/src/runtime/ed-ffi.scm b/v7/src/runtime/ed-ffi.scm index 36128c98b..1ed3d43b2 100644 --- a/v7/src/runtime/ed-ffi.scm +++ b/v7/src/runtime/ed-ffi.scm @@ -1,9 +1,9 @@ #| -*- Scheme -*- -$Id: ed-ffi.scm,v 1.36 2003/06/08 05:07:00 cph Exp $ +$Id: ed-ffi.scm,v 1.37 2005/05/24 04:50:08 cph Exp $ Copyright (c) 1991,1996,1997,1999,2000 Massachusetts Institute of Technology -Copyright (c) 2001,2002,2003 Massachusetts Institute of Technology +Copyright (c) 2001,2002,2003,2005 Massachusetts Institute of Technology This file is part of MIT/GNU Scheme. @@ -167,7 +167,7 @@ USA. ("unxprm" (runtime os-primitives)) ("unxpth" (runtime pathname unix)) ("uproc" (runtime procedure)) - ("url" (runtime url)) + ("url" (runtime uri)) ("urtrap" (runtime reference-trap)) ("usrint" (runtime user-interface)) ("utabs" (runtime microcode-tables)) diff --git a/v7/src/runtime/make.scm b/v7/src/runtime/make.scm index c3ba4b374..a1673e8a0 100644 --- a/v7/src/runtime/make.scm +++ b/v7/src/runtime/make.scm @@ -1,10 +1,10 @@ #| -*-Scheme-*- -$Id: make.scm,v 14.97 2004/12/13 03:22:21 cph Exp $ +$Id: make.scm,v 14.98 2005/05/24 04:50:17 cph Exp $ Copyright 1988,1989,1990,1991,1992,1993 Massachusetts Institute of Technology Copyright 1994,1995,1996,1997,1998,2000 Massachusetts Institute of Technology -Copyright 2001,2002,2003,2004 Massachusetts Institute of Technology +Copyright 2001,2002,2003,2004,2005 Massachusetts Institute of Technology This file is part of MIT/GNU Scheme. @@ -522,7 +522,7 @@ USA. (RUNTIME EMACS-INTERFACE) ;; More debugging ((RUNTIME CONTINUATION-PARSER) INITIALIZE-SPECIAL-FRAMES! #f) - (RUNTIME URL))) + (RUNTIME URI))) (if (eq? os-name 'NT) (package-initialize '(RUNTIME WIN32-REGISTRY) 'INITIALIZE-PACKAGE! #f)) diff --git a/v7/src/runtime/runtime.pkg b/v7/src/runtime/runtime.pkg index e4cf6701c..a11702135 100644 --- a/v7/src/runtime/runtime.pkg +++ b/v7/src/runtime/runtime.pkg @@ -1,6 +1,6 @@ #| -*-Scheme-*- -$Id: runtime.pkg,v 14.543 2005/05/20 04:08:10 cph Exp $ +$Id: runtime.pkg,v 14.544 2005/05/24 04:50:28 cph Exp $ Copyright 1988,1989,1990,1991,1992,1993 Massachusetts Institute of Technology Copyright 1994,1995,1996,1997,1998,1999 Massachusetts Institute of Technology @@ -4633,9 +4633,29 @@ USA. char-in-alphabet? char-set->alphabet code-points->alphabet + error:not-8-bit-alphabet + error:not-alphabet + error:not-unicode-code-point + error:not-utf16-be-string + error:not-utf16-le-string + error:not-utf16-string + error:not-utf32-be-string + error:not-utf32-le-string + error:not-utf32-string + error:not-utf8-string + error:not-well-formed-code-point-list + error:not-wide-char + error:not-wide-string guarantee-8-bit-alphabet guarantee-alphabet guarantee-unicode-code-point + guarantee-utf16-be-string + guarantee-utf16-le-string + guarantee-utf16-string + guarantee-utf32-be-string + guarantee-utf32-le-string + guarantee-utf32-string + guarantee-utf8-string guarantee-well-formed-code-point-list guarantee-wide-char guarantee-wide-string @@ -4666,29 +4686,37 @@ USA. read-utf32-le-char read-utf8-char string->alphabet + string->utf8-string string->wide-string unicode-code-point? utf16-be-string->wide-string utf16-be-string-length utf16-be-string-valid? + utf16-be-string? utf16-le-string->wide-string utf16-le-string-length utf16-le-string-valid? + utf16-le-string? utf16-string->wide-string utf16-string-length utf16-string-valid? + utf16-string? utf32-be-string->wide-string utf32-be-string-length utf32-be-string-valid? + utf32-be-string? utf32-le-string->wide-string utf32-le-string-length utf32-le-string-valid? + utf32-le-string? utf32-string->wide-string utf32-string-length utf32-string-valid? + utf32-string? utf8-string->wide-string utf8-string-length utf8-string-valid? + utf8-string? well-formed-code-point-list? wide-char? wide-string @@ -4723,37 +4751,83 @@ USA. (export (runtime input-port) wide-string-contents)) -(define-package (runtime url) +(define-package (runtime uri) (files "url") (parent (runtime)) (export () - url:char-set:alpha - url:char-set:alphadigit - url:char-set:digit - url:char-set:escaped - url:char-set:extra - url:char-set:lowalpha - url:char-set:national - url:char-set:punctuation - url:char-set:reserved - url:char-set:safe - url:char-set:scheme - url:char-set:unescaped + (url:decode-string decode-component) + (url:match:escape match-escape) + (url:parse:hostport parse-hostport) + ->uri + absolute-uri? + char-set:uri-alpha + char-set:uri-alphanum + char-set:uri-digit + char-set:uri-hex + char-set:uri-pchar + char-set:uri-reg-name + char-set:uri-rel-segment + char-set:uri-scheme + char-set:uri-userinfo + char-set:uric + char-set:uric-no-slash + error:not-absolute-uri + error:not-heirarchical-uri + error:not-opaque-uri + error:not-relative-uri + error:not-uri + error:not-uri-authority + error:not-uri-host + error:not-uri-path + error:not-uri-port + error:not-uri-registry-name + error:not-uri-scheme + error:not-uri-server + guarantee-absolute-uri + guarantee-heirarchical-uri + guarantee-opaque-uri + guarantee-relative-uri + guarantee-uri + guarantee-uri-authority + guarantee-uri-host + guarantee-uri-path + guarantee-uri-port + guarantee-uri-registry-name + guarantee-uri-scheme + guarantee-uri-server + heirarchical-uri? + make-uri + make-uri-server + opaque-uri? + parse-uri + relative-uri? + string->uri + uri->string + uri-absolute? + uri-authority + uri-authority? + uri-fragment + uri-heirarchical? + uri-host? + uri-opaque? + uri-path + uri-path-absolute? + uri-path-relative? + uri-path? + uri-port? + uri-query + uri-registry-name? + uri-relative? + uri-scheme + uri-scheme? + uri-server-host + uri-server-port + uri-server-userinfo + uri-server? + uri? url:char-set:unreserved - url:decode-string - url:decode-substring url:encode-string - url:encode-substring - url:match:escape - url:match:host - url:match:hostname - url:match:hostnumber - url:match:uchar - url:match:xchar - url:parse:hostport - url:parse:scheme - url:string-encoded? - url:substring-encoded?)) + write-uri)) (define-package (runtime postgresql) (file-case options diff --git a/v7/src/runtime/symbol.scm b/v7/src/runtime/symbol.scm index cba4d0890..10f78e9ad 100644 --- a/v7/src/runtime/symbol.scm +++ b/v7/src/runtime/symbol.scm @@ -1,8 +1,8 @@ #| -*-Scheme-*- -$Id: symbol.scm,v 1.17 2004/12/23 04:43:48 cph Exp $ +$Id: symbol.scm,v 1.18 2005/05/24 04:50:35 cph Exp $ -Copyright 1992,1993,2001,2003,2004 Massachusetts Institute of Technology +Copyright 1992,1993,2001,2003,2004,2005 Massachusetts Institute of Technology This file is part of MIT/GNU Scheme. @@ -57,9 +57,9 @@ USA. (make-unmapped-unbound-reference-trap))) (define (utf8-string->uninterned-symbol string) - (guarantee-string string 'UTF8-STRING->UNINTERNED-SYMBOL) + (guarantee-utf8-string string 'UTF8-STRING->UNINTERNED-SYMBOL) ((ucode-primitive system-pair-cons) (ucode-type uninterned-symbol) - string + (string-copy string) (make-unmapped-unbound-reference-trap))) (define (string->symbol string) @@ -71,7 +71,7 @@ USA. ((ucode-primitive string->symbol) string*)))) (define (utf8-string->symbol string) - (guarantee-string string 'UTF8-STRING->SYMBOL) + (guarantee-utf8-string string 'UTF8-STRING->SYMBOL) (or ((ucode-primitive find-symbol) string) ((ucode-primitive string->symbol) (string-copy string)))) @@ -80,7 +80,7 @@ USA. (define (substring->symbol string start end) (guarantee-substring string start end 'SUBSTRING->SYMBOL) - ((ucode-primitive string->symbol) (substring->utf8-string string start end))) + ((ucode-primitive string->symbol) (string->utf8-string string start end))) (define (string-head->symbol string end) (substring->symbol string 0 end)) @@ -101,50 +101,6 @@ USA. ((not object) "") (else (error:wrong-type-argument object "symbol component" 'SYMBOL)))) -(define (string->utf8-string string) - (let ((end (string-length string))) - (let ((n (count-non-ascii string 0 end))) - (if (fix:> n 0) - (let ((string* (make-string (fix:+ end n)))) - (%substring->utf8-string string 0 end string*) - string*) - string)))) - -(define (substring->utf8-string string start end) - (let ((string* - (make-string - (fix:+ (fix:- end start) - (count-non-ascii string start end))))) - (%substring->utf8-string string start end string*) - string*)) - -(define (count-non-ascii string start end) - (let loop ((i start) (n 0)) - (if (fix:< i end) - (loop (fix:+ i 1) - (if (fix:< (vector-8b-ref string i) #x80) - n - (fix:+ n 1))) - n))) - -(define (%substring->utf8-string string start end string*) - (let loop ((i start) (i* 0)) - (if (fix:< i end) - (if (fix:< (vector-8b-ref string i) #x80) - (begin - (vector-8b-set! string* i* (vector-8b-ref string i)) - (loop (fix:+ i 1) (fix:+ i* 1))) - (begin - (vector-8b-set! - string* - i* - (fix:or #xC0 (fix:lsh (vector-8b-ref string i) -6))) - (vector-8b-set! - string* - (fix:+ i* 1) - (fix:or #x80 (fix:and (vector-8b-ref string i) #x3F))) - (loop (fix:+ i 1) (fix:+ i* 2))))))) - (define (intern string) (if (string-lower-case? string) (string->symbol string) diff --git a/v7/src/runtime/unicode.scm b/v7/src/runtime/unicode.scm index d89f7ce67..80981ccb1 100644 --- a/v7/src/runtime/unicode.scm +++ b/v7/src/runtime/unicode.scm @@ -1,8 +1,8 @@ #| -*-Scheme-*- -$Id: unicode.scm,v 1.21 2004/12/06 21:27:35 cph Exp $ +$Id: unicode.scm,v 1.22 2005/05/24 04:50:43 cph Exp $ -Copyright 2001,2003,2004 Massachusetts Institute of Technology +Copyright 2001,2003,2004,2005 Massachusetts Institute of Technology This file is part of MIT/GNU Scheme. @@ -148,23 +148,13 @@ USA. (fix:= (char-bits object) 0) (unicode-code-point? (char-code object)))) -(define (guarantee-wide-char object caller) - (if (not (wide-char? object)) - (error:not-wide-char object caller))) - -(define (error:not-wide-char object caller) - (error:wrong-type-argument object "a Unicode character" caller)) +(define-guarantee wide-char "a Unicode character") (define (unicode-code-point? object) (and (index-fixnum? object) (legal-code-32? object))) -(define-integrable (guarantee-unicode-code-point object caller) - (if (not (unicode-code-point? object)) - (error:not-unicode-code-point object caller))) - -(define (error:not-unicode-code-point object caller) - (error:wrong-type-argument object "a Unicode code point" caller)) +(define-guarantee unicode-code-point "a Unicode code point") (define-integrable (legal-code-32? pt) (if (fix:< pt #x10000) @@ -185,12 +175,7 @@ USA. (high1 #f read-only #t) (high2 #f read-only #t)) -(define-integrable (guarantee-alphabet object caller) - (if (not (alphabet? object)) - (error:not-alphabet object caller))) - -(define (error:not-alphabet object caller) - (error:wrong-type-argument object "a Unicode alphabet" caller)) +(define-guarantee alphabet "a Unicode alphabet") (define-integrable (make-alphabet-low) (make-string #x100 (integer->char 0))) @@ -248,12 +233,7 @@ USA. (fix:< (car item) (cdr item))) (unicode-code-point? item))) -(define-integrable (guarantee-well-formed-code-point-list object caller) - (if (not (well-formed-code-point-list? object)) - (error:not-well-formed-code-point-list object caller))) - -(define (error:not-well-formed-code-point-list object caller) - (error:wrong-type-argument object "a Unicode code-point list" caller)) +(define-guarantee well-formed-code-point-list "a Unicode code-point list") (define (code-points->alphabet items) (guarantee-well-formed-code-point-list items 'CODE-POINTS->ALPHABET) @@ -380,12 +360,7 @@ USA. (and (fix:= (vector-8b-ref low i) 0) (loop (fix:+ i 1)))))))) -(define-integrable (guarantee-8-bit-alphabet object caller) - (if (not (8-bit-alphabet? object)) - (error:not-8-bit-alphabet object caller))) - -(define (error:not-8-bit-alphabet object caller) - (error:wrong-type-argument object "an 8-bit alphabet" caller)) +(define-guarantee 8-bit-alphabet "an 8-bit alphabet") (define (char-set->alphabet char-set) (guarantee-char-set char-set 'CHAR-SET->ALPHABET) @@ -545,6 +520,8 @@ USA. (constructor %make-wide-string)) (contents #f read-only #t)) +(define-guarantee wide-string "a Unicode string") + (define (make-wide-string length #!optional char) (%make-wide-string (make-vector length @@ -596,13 +573,6 @@ USA. (vector-set! v2 j (vector-ref v1 i)))) string*)) -(define-integrable (guarantee-wide-string object caller) - (if (not (wide-string? object)) - (error:not-wide-string object caller))) - -(define (error:not-wide-string object caller) - (error:wrong-type-argument object "a Unicode string" caller)) - (define (wide-string-index? index string) (and (index-fixnum? index) (fix:< index (%wide-string-length string)))) @@ -821,6 +791,22 @@ USA. (legal-code-32? (combiner (n 0) (n 1) (n 2) (n 3))) (fix:+ start 4)) start)) + +(define (utf32-string? object) + (and (string? object) + (utf32-string-valid? object))) + +(define (utf32-be-string? object) + (and (string? object) + (utf32-be-string-valid? object))) + +(define (utf32-le-string? object) + (and (string? object) + (utf32-le-string-valid? object))) + +(define-guarantee utf32-string "UTF-32 string") +(define-guarantee utf32-be-string "UTF-32BE string") +(define-guarantee utf32-le-string "UTF-32LE string") ;;;; UTF-16 representation @@ -929,7 +915,7 @@ USA. (define (wide-string->utf16-le-string string #!optional start end) (wide-string->utf-string string start end sink-utf16-le-char 'WIDE-STRING->UTF16-LE-STRING)) - + (define (utf16-string-length string #!optional start end) (if (host-big-endian?) (%utf16-string-length string start end "16BE" be-bytes->digit16 @@ -950,7 +936,7 @@ USA. (encoded-string-length string start end type caller (lambda (string start end) (validate-utf16-char string start end combiner))))) - + (define (utf16-string-valid? string #!optional start end) (if (host-big-endian?) (%utf16-string-valid? string start end be-bytes->digit16 @@ -1004,6 +990,22 @@ USA. (fix:+ (fix:+ (fix:lsh (fix:and n0 #x3FF) 10) (fix:and n1 #x3FF)) #x10000)) + +(define (utf16-string? object) + (and (string? object) + (utf16-string-valid? object))) + +(define (utf16-be-string? object) + (and (string? object) + (utf16-be-string-valid? object))) + +(define (utf16-le-string? object) + (and (string? object) + (utf16-le-string-valid? object))) + +(define-guarantee utf16-string "UTF-16 string") +(define-guarantee utf16-be-string "UTF-16BE string") +(define-guarantee utf16-le-string "UTF-16LE string") ;;;; UTF-8 representation @@ -1090,6 +1092,42 @@ USA. (define (utf8-string-valid? string #!optional start end) (with-substring-args string start end 'UTF8-STRING-VALID? (encoded-string-valid? string start end validate-utf8-char))) + +(define (utf8-string? object) + (and (string? object) + (utf8-string-valid? object))) + +(define-guarantee utf8-string "UTF-8 string") + +(define (string->utf8-string string #!optional start end) + (with-substring-args string start end 'STRING->UTF8-STRING + (let ((string* + (make-string + (fix:+ (fix:- end start) + (let loop ((i start) (n 0)) + (if (fix:< i end) + (loop (fix:+ i 1) + (if (fix:< (vector-8b-ref string i) #x80) + n + (fix:+ n 1))) + n)))))) + (let loop ((i start) (i* 0)) + (if (fix:< i end) + (if (fix:< (vector-8b-ref string i) #x80) + (begin + (vector-8b-set! string* i* (vector-8b-ref string i)) + (loop (fix:+ i 1) (fix:+ i* 1))) + (begin + (vector-8b-set! + string* + i* + (fix:or #xC0 (fix:lsh (vector-8b-ref string i) -6))) + (vector-8b-set! + string* + (fix:+ i* 1) + (fix:or #x80 (fix:and (vector-8b-ref string i) #x3F))) + (loop (fix:+ i 1) (fix:+ i* 2)))))) + string*))) (define (validate-utf8-char string start end) diff --git a/v7/src/runtime/url.scm b/v7/src/runtime/url.scm index 64785d179..43e6cf91a 100644 --- a/v7/src/runtime/url.scm +++ b/v7/src/runtime/url.scm @@ -1,6 +1,6 @@ #| -*-Scheme-*- -$Id: url.scm,v 1.17 2005/05/20 04:07:54 cph Exp $ +$Id: url.scm,v 1.18 2005/05/24 04:50:50 cph Exp $ Copyright 2000,2001,2003,2004,2005 Massachusetts Institute of Technology @@ -23,186 +23,497 @@ USA. |# -;;;; URL Encoding +;;;; URI Encoding +;;; package: (runtime uri) + +;;; Based on RFC 2396 (declare (usual-integrations)) -(define url:char-set:lowalpha) -(define url:char-set:alpha) -(define url:char-set:digit) -(define url:char-set:alphadigit) -(define url:char-set:scheme) -(define url:char-set:safe) -(define url:char-set:extra) -(define url:char-set:national) -(define url:char-set:punctuation) -(define url:char-set:reserved) -(define url:char-set:hex) +(define-record-type + (%make-uri scheme authority path-relative? path query fragment) + uri? + (scheme uri-scheme) + (authority uri-authority) + (path-relative? uri-path-relative?) + (path uri-path) + (query uri-query) + (fragment uri-fragment)) + +(define (make-uri scheme authority path-relative? path query fragment) + (if scheme (guarantee-uri-scheme scheme 'MAKE-URI)) + (if authority (guarantee-uri-authority authority 'MAKE-URI)) + (guarantee-uri-path path 'MAKE-URI) + (if query (guarantee-utf8-string query 'MAKE-URI)) + (if fragment (guarantee-utf8-string fragment 'MAKE-URI)) + (if (string? path) + (begin + (if (not scheme) (error:bad-range-argument scheme 'MAKE-URI)) + (if authority (error:bad-range-argument authority 'MAKE-URI)) + (if path-relative? (error:bad-range-argument path-relative? 'MAKE-URI)) + (if query (error:bad-range-argument query 'MAKE-URI)))) + (if (and scheme path-relative?) + (error:bad-range-argument path-relative? 'MAKE-URI)) + (if (and (null? path) (not authority)) + (error:bad-range-argument path 'MAKE-URI)) + (%make-uri scheme authority (if path-relative? #t #f) path query fragment)) + +(define-integrable (uri-path-absolute? uri) + (not (uri-path-relative? uri))) + +(define-integrable (uri-relative? uri) + (if (uri-scheme uri) #f #t)) + +(define-integrable (uri-absolute? uri) + (if (uri-scheme uri) #t #f)) + +(define-integrable (uri-opaque? uri) + (string? (uri-path uri))) + +(define-integrable (uri-heirarchical? uri) + (not (uri-opaque? uri))) + +(define (relative-uri? object) + (and (uri? object) + (uri-relative? object))) + +(define (absolute-uri? object) + (and (uri? object) + (uri-absolute? object))) + +(define (opaque-uri? object) + (and (uri? object) + (uri-opaque? object))) + +(define (heirarchical-uri? object) + (and (uri? object) + (uri-heirarchical? object))) + +(define-guarantee uri "URI") +(define-guarantee relative-uri "relative URI") +(define-guarantee absolute-uri "absolute URI") +(define-guarantee opaque-uri "opaque URI") +(define-guarantee heirarchical-uri "heirarchical URI") + +(define (uri-scheme? object) + (and (interned-symbol? object) + (complete-match match-scheme (symbol-name object)))) + +(define (uri-path? object) + (or (and (utf8-string? object) + (fix:> (string-length object) 0)) + (list-of-type? object + (lambda (elt) + (or (utf8-string? elt) + (and (pair? elt) + (utf8-string? (car elt)) + (list-of-type? (cdr elt) utf8-string?))))))) + +(define (uri-authority? object) + (or (uri-server? object) + (uri-registry-name? object))) + +(define (uri-registry-name? object) + (and (utf8-string? object) + (fix:> (string-length object) 0))) + +(define-record-type + (%make-uri-server host port userinfo) + uri-server? + (host uri-server-host) + (port uri-server-port) + (userinfo uri-server-userinfo)) + +(define (make-uri-server host port userinfo) + (if host (guarantee-uri-host host 'MAKE-URI-SERVER)) + (if port (guarantee-uri-port port 'MAKE-URI-SERVER)) + (if userinfo (guarantee-utf8-string userinfo 'MAKE-URI-SERVER)) + (if (not host) + (begin + (if port (error:bad-range-argument port 'MAKE-URI-SERVER)) + (if userinfo (error:bad-range-argument userinfo 'MAKE-URI-SERVER)))) + (%make-uri-server host port userinfo)) + +(define (uri-host? object) + (and (string? object) + (complete-match match-host object))) + +(define (uri-port? object) + (exact-nonnegative-integer? object)) + +(define-guarantee uri-scheme "URI scheme") +(define-guarantee uri-path "URI path") +(define-guarantee uri-authority "URI authority") +(define-guarantee uri-registry-name "URI registry name") +(define-guarantee uri-server "URI server") +(define-guarantee uri-host "URI host") +(define-guarantee uri-port "URI port") + +(define char-set:uri-alpha) +(define char-set:uri-digit) +(define char-set:uri-alphanum) +(define char-set:uri-alphanum-) +(define char-set:uri-hex) +(define char-set:uri-scheme) +(define char-set:uric) +(define char-set:uric-no-slash) +(define char-set:uri-reg-name) +(define char-set:uri-userinfo) +(define char-set:uri-rel-segment) +(define char-set:uri-pchar) + +(define parse-fragment) +(define parse-query) +(define parse-reg-name) +(define parse-userinfo) +(define parse-rel-segment) +(define parse-pchar) + (define url:char-set:unreserved) (define url:char-set:unescaped) -(define url:char-set:escaped) (define (initialize-package!) - (set! url:char-set:lowalpha (string->char-set "abcdefghijklmnopqrstuvwxyz")) - (set! url:char-set:alpha - (char-set-union url:char-set:lowalpha - (string->char-set "ABCDEFGHIJKLMNOPQRSTUVWXYZ"))) - (set! url:char-set:digit (string->char-set "0123456789")) - (set! url:char-set:alphadigit - (char-set-union url:char-set:alpha url:char-set:digit)) - (set! url:char-set:scheme - (char-set-union url:char-set:alphadigit (string->char-set "+-."))) - (set! url:char-set:safe (string->char-set "$-_.+")) - (set! url:char-set:extra (string->char-set "!*'(),")) - (set! url:char-set:national (string->char-set "{}|\\^~[]`")) - (set! url:char-set:punctuation (string->char-set "<>#%\"")) - (set! url:char-set:reserved (string->char-set ";/?:@&=")) - (set! url:char-set:hex (string->char-set "0123456789abcdefABCDEF")) + (set! char-set:uri-alpha + (string->char-set + "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ")) + (set! char-set:uri-digit (string->char-set "0123456789")) + (set! char-set:uri-alphanum + (char-set-union char-set:uri-alpha char-set:uri-digit)) + (set! char-set:uri-alphanum- + (char-set-union char-set:uri-alphanum (char-set #\-))) + (set! char-set:uri-hex (string->char-set "0123456789abcdefABCDEF")) + (set! char-set:uri-scheme + (char-set-union char-set:uri-alphanum (string->char-set "+-."))) + (set! char-set:uric + (char-set-union char-set:uri-alphanum + (string->char-set "!'()*-._~") ;mark + (string->char-set "$&+,/:;=?@") ;reserved + )) + (let ((component-chars + (lambda (free) + (char-set-difference char-set:uric (string->char-set free))))) + (set! char-set:uric-no-slash (component-chars "/")) + (set! char-set:uri-reg-name (component-chars "/?")) + (set! char-set:uri-userinfo (component-chars "/?@")) + (set! char-set:uri-rel-segment (component-chars "/:?")) + (set! char-set:uri-pchar (component-chars "/;?"))) + + (set! parse-fragment (component-parser-* char-set:uric)) + (set! parse-query parse-fragment) + (set! parse-reg-name (component-parser-+ char-set:uri-reg-name)) + (set! parse-userinfo (component-parser-* char-set:uri-userinfo)) + (set! parse-rel-segment (component-parser-+ char-set:uri-rel-segment)) + (set! parse-pchar (component-parser-* char-set:uri-pchar)) + + ;; backwards compatibility: (set! url:char-set:unreserved - (char-set-union url:char-set:alphadigit - url:char-set:safe - url:char-set:extra)) + (char-set-union char-set:uri-alphanum + (string->char-set "!$'()*+,-._"))) (set! url:char-set:unescaped (char-set-union url:char-set:unreserved - url:char-set:reserved)) - (set! url:char-set:escaped - (char-set-invert url:char-set:unescaped)) + (string->char-set ";/?:@&="))) unspecific) + +;;;; Parser -(define url:match:uchar - (*matcher - (alt (char-set url:char-set:unreserved) - url:match:escape))) +(define (string->uri string #!optional start end) + (let ((v (complete-parse parse-uri string start end))) + (and v + (vector-ref v 0)))) -(define url:match:xchar - (*matcher - (alt (char-set url:char-set:unescaped) - url:match:escape))) +(define (->uri object) + (cond ((uri? object) object) + ((string? object) (string->uri object)) + ((symbol? object) (string->uri (symbol-name object))) + (else (error:not-uri object '->URI)))) + +(define parse-uri + (*parser + (top-level + (seq (alt parse-absolute-uri + parse-relative-uri + (values #f)) + (alt (seq "#" parse-fragment) + (values #f)))))) + +(define parse-absolute-uri + (*parser + (alt (encapsulate (lambda (v) + (let ((path (vector-ref v 1))) + (%make-uri (vector-ref v 0) + (vector-ref path 0) + (vector-ref path 1) + (vector-ref path 2) + (vector-ref v 2) + (vector-ref v 3)))) + (seq parse-scheme + ":" + (alt parse-net-path parse-abs-path) + (alt (seq "?" parse-query) + (values #f)) + (alt (seq "#" parse-fragment) + (values #f)))) + (encapsulate (lambda (v) + (%make-uri (vector-ref v 0) + #f + #f + (vector-ref v 1) + #f + (vector-ref v 2))) + (seq parse-scheme + ":" + (match (seq (char-set char-set:uric-no-slash) + (* (char-set char-set:uric)))) + (alt (seq "#" parse-fragment) + (values #f))))))) -(define url:match:escape +(define parse-scheme + (*parser + (map intern (match match-scheme)))) + +(define match-scheme (*matcher - (seq "%" - (char-set url:char-set:hex) - (char-set url:char-set:hex)))) + (seq (char-set char-set:uri-alpha) + (* (char-set char-set:uri-scheme))))) + +(define parse-relative-uri + (*parser + (encapsulate (lambda (v) + (let ((path (vector-ref v 0))) + (%make-uri #f + (vector-ref path 0) + (vector-ref path 1) + (vector-ref path 2) + (vector-ref v 1) + (vector-ref v 2)))) + (seq (alt parse-net-path + parse-abs-path + parse-rel-path) + (alt (seq "?" parse-query) + (values #f)) + (alt (seq "#" parse-fragment) + (values #f)))))) -(define url:parse:scheme +(define parse-net-path + (*parser + (encapsulate (lambda (v) (vector (vector-ref v 0) #f (vector-ref v 1))) + (seq "//" + parse-authority + (alt (encapsulate vector->list + (* (seq "/" parse-segment))) + (values '())))))) + +(define parse-abs-path + (*parser + (map (lambda (p) (vector #f #f p)) + (encapsulate vector->list + (* (seq "/" parse-segment)))))) + +(define parse-rel-path (*parser - (map intern - (match (+ url:char-set:scheme))))) + (map (lambda (p) (vector #f #t p)) + (encapsulate vector->list + (seq parse-rel-segment + (* (seq "/" parse-segment))))))) -(define url:parse:hostport +(define parse-segment (*parser - (seq (match url:match:host) - (alt (map string->number - (seq (noise ":") - (match (+ (char-set url:char-set:digit))))) + (encapsulate (lambda (v) + (if (fix:> (vector-length v) 1) + (vector->list v) + (vector-ref v 0))) + (seq parse-pchar + (* (seq ";" parse-pchar)))))) + +(define parse-authority + (*parser + (alt (encapsulate (lambda (v) + (%make-uri-server (vector-ref v 1) + (vector-ref v 2) + (vector-ref v 0))) + (seq (alt parse-userinfo + (values #f)) + parse-hostport)) + parse-reg-name + (values (%make-uri-server #f #f #f))))) + +(define parse-hostport + (*parser + (seq (match match-host) + (alt (seq (noise ":") + (alt (map string->number (match match-digits)) + (values #f))) (values #f))))) -(define url:match:host - (*matcher (alt url:match:hostname url:match:hostnumber))) +(define match-host + (*matcher (alt match-hostname match-ipv4-address))) -(define url:match:hostname +(define match-hostname (let ((match-tail (*matcher - (* (alt (char-set url:char-set:alphadigit) - (seq (+ #\-) - (char-set url:char-set:alphadigit))))))) + (? (seq (* (char-set char-set:uri-alphanum-)) + (char-set char-set:uri-alphanum)))))) (*matcher - (seq (* (seq (char-set url:char-set:alphadigit) + (seq (* (seq (char-set char-set:uri-alphanum) match-tail ".")) - (char-set url:char-set:alpha) - match-tail)))) + (char-set char-set:uri-alpha) + match-tail + (? "."))))) -(define url:match:hostnumber +(define match-ipv4-address (*matcher - (seq (+ (char-set url:char-set:digit)) - "." - (+ (char-set url:char-set:digit)) - "." - (+ (char-set url:char-set:digit)) - "." - (+ (char-set url:char-set:digit))))) + (seq match-digits "." match-digits "." match-digits "." match-digits))) + +(define match-digits + (*matcher (+ (char-set char-set:uri-digit)))) -(define (url:string-encoded? string) - (url:substring-encoded? string 0 (string-length string))) +;;;; Output -(define url:substring-encoded? - (let ((matcher (*matcher (complete (* url:match:xchar))))) - (lambda (string start end) - (matcher (string->parser-buffer string start end))))) +(define (uri->string uri) + (guarantee-uri uri 'URI->STRING) + (call-with-output-string + (lambda (port) + (%write-uri uri port)))) -(define (url:encode-string string) - (url:encode-substring string 0 (string-length string))) - -(define (url:encode-substring string start end) - (let ((n-to-encode - (let loop ((start start) (n-to-encode 0)) - (let ((index - (substring-find-next-char-in-set string start end - url:char-set:escaped))) - (if index - (loop (fix:+ index 1) (fix:+ n-to-encode 1)) - n-to-encode))))) - (if (fix:= 0 n-to-encode) - (substring string start end) - (let ((encoded - (make-string (fix:+ (fix:- end start) (fix:* 2 n-to-encode)))) - (digits "0123456789ABCDEF")) - (let loop ((start start) (i 0)) - (let ((index - (substring-find-next-char-in-set string start end - url:char-set:escaped))) - (if index - (begin - (substring-move! string start index encoded i) - (let ((i (fix:+ i (fix:- index start))) - (code (vector-8b-ref string index))) - (string-set! encoded i #\%) - (string-set! encoded - (fix:+ i 1) - (string-ref digits (fix:lsh code -4))) - (string-set! encoded - (fix:+ i 2) - (string-ref digits (fix:and code #x0F))) - (loop (fix:+ index 1) (fix:+ i 3)))) - (substring-move! string start end encoded i)))) - encoded)))) +(define (uri->symbol uri) + (utf8-string->symbol (uri->string uri))) + +(define (write-uri uri port) + (guarantee-uri uri 'WRITE-URI) + (guarantee-port port 'WRITE-URI) + (%write-uri uri port)) + +(define (%write-uri uri port) + (let ((scheme (uri-scheme uri)) + (authority (uri-authority uri)) + (path-relative? (uri-path-relative? uri)) + (path (uri-path uri)) + (query (uri-query uri)) + (fragment (uri-fragment uri))) + (if scheme + (begin + (write scheme port) + (write-char #\: port))) + (cond ((string? path) + (write-escaped-substring path 0 1 char-set:uric-no-slash port) + (write-escaped-substring path 1 (string-length path) char-set:uric + port)) + (authority + (write-string "//" port) + (write-authority authority port) + (write-abs-path path port)) + (path-relative? + (write-escaped (car path) char-set:uri-rel-segment port) + (write-abs-path (cdr path) port)) + (else + (write-abs-path path port))) + (if query + (begin + (write-char #\? port) + (write-escaped query char-set:uric port))) + (if fragment + (begin + (write-char #\# port) + (write-escaped fragment char-set:uric port))))) + +(define (write-authority authority port) + (if (uri-server? authority) + (begin + (if (uri-server-userinfo authority) + (begin + (write-escaped (uri-server-userinfo authority) + char-set:uri-userinfo + port) + (write-char #\@ port))) + (if (uri-server-host authority) + (write-string (uri-server-host authority) port)) + (if (uri-server-port authority) + (begin + (write-char #\: port) + (write (uri-server-port authority) port)))) + (write-escaped authority char-set:uri-reg-name port))) + +(define (write-abs-path path port) + (let ((write-pchar + (lambda (string) + (write-escaped string char-set:uri-pchar port)))) + (for-each (lambda (segment) + (write-char #\/ port) + (if (string? segment) + (write-pchar segment) + (for-each write-pchar segment))) + path))) -(define (url:decode-string string) - (url:decode-substring string 0 (string-length string))) - -(define (url:decode-substring string start end) - (let ((n-encoded - (let loop ((start start) (n-encoded 0)) - (let ((index (substring-find-next-char string start end #\%))) - (if index - (loop (fix:+ index 1) (fix:+ n-encoded 1)) - n-encoded)))) - (lose - (lambda () - (error "Malformed encoded URL string:" - (substring string start end))))) - (if (fix:= 0 n-encoded) - (substring string start end) - (let ((decoded - (make-string (fix:- (fix:- end start) (fix:* 2 n-encoded))))) - (let loop ((start start) (i 0)) - (let ((index (substring-find-next-char string start end #\%))) - (if index - (begin - (if (not (fix:<= (fix:+ index 3) end)) - (lose)) - (let ((k - (substring->number string - (fix:+ index 1) - (fix:+ index 3) - 16)) - (i* (fix:+ i (fix:- index start)))) - (if (not k) - (lose)) - (substring-move! string start index decoded i) - (vector-8b-set! decoded i* k) - (loop (fix:+ index 3) (fix:+ i* 1)))) - (substring-move! string start end decoded i)))) - decoded)))) \ No newline at end of file +;;;; Escape codecs + +(define (component-parser-* cs) + (*parser + (map decode-component + (match (* (alt (char-set cs) match-escape)))))) + +(define (component-parser-+ cs) + (*parser + (map decode-component + (match (+ (alt (char-set cs) match-escape)))))) + +(define match-escape + (*matcher + (seq "%" + (char-set char-set:uri-hex) + (char-set char-set:uri-hex)))) + +(define (decode-component string) + (if (string-find-next-char string #\%) + (call-with-output-string + (lambda (port) + (let ((end (string-length string))) + (let loop ((i 0)) + (if (fix:< i end) + (if (char=? (string-ref string i) #\%) + (begin + (write-char (integer->char + (substring->number string + (fix:+ i 1) + (fix:+ i 3) + 16 + #t)) + port) + (loop (fix:+ i 3))) + (begin + (write-char (string-ref string i) port) + (loop (fix:+ i 1))))))))) + string)) + +(define (write-escaped string unescaped port) + (write-escaped-substring string 0 (string-length string) unescaped port)) + +(define (write-escaped-substring string start end unescaped port) + (do ((i start (fix:+ i 1))) + ((not (fix:< i end))) + (let ((char (string-ref string i))) + (if (char-set-member? unescaped char) + (write-char char port) + (let ((s (number->string (char->integer char) 16))) + (write-char #\% port) + (if (fix:= (string-length s) 1) + (write-char #\0 port)) + (write-string s port)))))) + +(define (complete-match matcher string #!optional start end) + (let ((buffer (string->parser-buffer string start end))) + (and (matcher buffer) + (not (peek-parser-buffer-char buffer))))) + +(define (complete-parse parser string #!optional start end) + (let ((buffer (string->parser-buffer string start end))) + (let ((v (parser buffer))) + (and v + (not (peek-parser-buffer-char buffer)) + v)))) + +;; backwards compatibility: +(define (url:encode-string string) + (call-with-output-string + (lambda (port) + (write-escaped string url:char-set:unescaped port)))) \ No newline at end of file