From: Chris Hanson Date: Fri, 27 Jan 2017 00:30:33 +0000 (-0800) Subject: Refactor symbol implementation to use UTF-8 bytevectors for names. X-Git-Tag: mit-scheme-pucked-9.2.12~227^2~72 X-Git-Url: https://birchwood-abbey.net/git?a=commitdiff_plain;h=ab51f639b8706056170c90aa7320430e61dd15ac;p=mit-scheme.git Refactor symbol implementation to use UTF-8 bytevectors for names. Primitives handle this correctly since they accept either a legacy string or a bytevector. As long as no one peeks behind the abstraction this should be transparent. However, symbols with non-ASCII names will produce non-legacy strings when asked. AFAIK there are none currently in use. --- diff --git a/src/compiler/base/infnew.scm b/src/compiler/base/infnew.scm index 99d11e5bc..5dd26fef3 100644 --- a/src/compiler/base/infnew.scm +++ b/src/compiler/base/infnew.scm @@ -272,14 +272,14 @@ USA. label-bindings) (let ((map-label/fail (lambda (label) - (let ((key (system-pair-car label))) + (let ((key (symbol-name label))) (let ((datum (hash-table/get labels key no-datum))) (if (eq? datum no-datum) (error "Missing label:" key)) datum)))) (map-label/false (lambda (label) - (hash-table/get labels (system-pair-car label) #f)))) + (hash-table/get labels (symbol-name label) #f)))) (for-each (lambda (label) (set-dbg-label/external?! (map-label/fail label) true)) external-labels) @@ -321,7 +321,7 @@ USA. (let ((offsets (make-rb-tree = <))) (for-each (lambda (binding) (let ((offset (cdr binding)) - (name (system-pair-car (car binding)))) + (name (symbol-name (car binding)))) (let ((datum (rb-tree/lookup offsets offset #f))) (if datum (set-cdr! datum (cons name (cdr datum))) diff --git a/src/runtime/error.scm b/src/runtime/error.scm index d3064e2dc..4a3feea9c 100644 --- a/src/runtime/error.scm +++ b/src/runtime/error.scm @@ -66,7 +66,7 @@ USA. (lambda (n-fields field-indexes) (%make-condition-type (cond ((string? name) (string-copy name)) - ((symbol? name) (symbol->utf8-string name)) + ((symbol? name) (symbol->string name)) ((not name) "(anonymous)") (else (error:wrong-type-argument name "condition-type name" diff --git a/src/runtime/runtime.pkg b/src/runtime/runtime.pkg index 9cd20059b..033c1845c 100644 --- a/src/runtime/runtime.pkg +++ b/src/runtime/runtime.pkg @@ -712,6 +712,7 @@ USA. (parent (runtime)) (export () ;; BEGIN deprecated bindings + (substring->symbol string->symbol) (symbol-append symbol) error:not-interned-symbol error:not-symbol @@ -727,20 +728,14 @@ USA. string->uninterned-symbol string-head->symbol string-tail->symbol - substring->symbol symbol symbol->string - symbol->utf8-string - symbol->wide-string symbol-hash - symbol-hash-mod symbol-name symbol? symbol? - uninterned-symbol? - utf8-string->symbol - utf8-string->uninterned-symbol)) + uninterned-symbol?)) (define-package (runtime microcode-data) (files "udata") @@ -1220,7 +1215,9 @@ USA. ;; vector->ustring ) (export (runtime predicate-metadata) - register-ustring-predicates!)) + register-ustring-predicates!) + (export (runtime symbol) + %ustring*)) (define-package (runtime bytevector) (files "bytevector") @@ -1263,7 +1260,9 @@ USA. utf32le->string utf8->string) (export (runtime predicate-metadata) - register-mit-bytevector-predicates!)) + register-mit-bytevector-predicates!) + (export (runtime symbol) + %legacy-string->bytevector)) (define-package (runtime 1d-property) (files "prop1d") diff --git a/src/runtime/symbol.scm b/src/runtime/symbol.scm index 45e3eb1fa..2ab047c63 100644 --- a/src/runtime/symbol.scm +++ b/src/runtime/symbol.scm @@ -44,125 +44,62 @@ USA. (define-guarantee interned-symbol "interned symbol") (define-guarantee uninterned-symbol "uninterned symbol") -(define (string->uninterned-symbol string) - (make-uninterned-symbol (if (string? string) - (or (ascii-string-copy string) - (string->utf8-string string)) - (wide-string->utf8-string string)))) - -(define (utf8-string->uninterned-symbol string) - (make-uninterned-symbol (if (utf8-string? string) - (string-copy string) - (wide-string->utf8-string string)))) - -(define (make-uninterned-symbol string) +(define (string->uninterned-symbol string #!optional start end) ((ucode-primitive system-pair-cons) (ucode-type uninterned-symbol) - string + (string->utf8 string start end) (make-unmapped-unbound-reference-trap))) -(define (string->symbol string) - ((ucode-primitive string->symbol) (if (string? string) - (or (ascii-string-copy string) - (string->utf8-string string)) - (wide-string->utf8-string string)))) +(define (string->symbol string #!optional start end) + ((ucode-primitive string->symbol) (string->utf8 string start end))) -(define (utf8-string->symbol string) - (if (utf8-string? string) - (or ((ucode-primitive find-symbol) string) - ((ucode-primitive string->symbol) (string-copy string))) - ((ucode-primitive string->symbol) (wide-string->utf8-string string)))) - -(define (substring->symbol string start end) - (guarantee-substring string start end 'SUBSTRING->SYMBOL) - ((ucode-primitive string->symbol) (string->utf8-string string start end))) +(define (symbol->string symbol) + (guarantee symbol? symbol 'symbol->string) + (utf8->string + (let ((name (system-pair-car symbol))) + (cond ((bytevector? name) name) + ((legacy-string? name) (%legacy-string->bytevector name)) + (else (error "Illegal symbol name:" name)))))) (define (string-head->symbol string end) - (substring->symbol string 0 end)) + (string->symbol (ustring-copy string 0 end))) (define (string-tail->symbol string start) - (substring->symbol string start (string-length string))) + (string->symbol (ustring-copy string start))) (define (symbol . objects) - ((ucode-primitive string->symbol) (apply utf8-string objects))) - + (string->symbol (%ustring* objects 'symbol))) + (define (intern string) - ((ucode-primitive string->symbol) - (utf8-string-downcase - (if (string? string) - string - (wide-string->utf8-string string))))) + (string->symbol (cold-load-downcase string))) (define (intern-soft string) - ((ucode-primitive find-symbol) - (utf8-string-downcase - (if (string? string) - string - (wide-string->utf8-string string))))) + ((ucode-primitive find-symbol) (string->utf8 (cold-load-downcase string)))) -(define (utf8-string-downcase string) +(define (cold-load-downcase string) (if (ascii-string? string) ;; Needed during cold load. - (string-downcase string) - (call-with-utf8-input-string string - (lambda (input) - (call-with-utf8-output-string - (lambda (output) - (let loop () - (let ((c (read-char input))) - (if (not (eof-object? c)) - (begin - (write-char (char-downcase c) output) - (loop))))))))))) + (legacy-string-downcase string) + (ustring-downcase string))) (define (ascii-string? string) - (let ((end (string-length string))) - (let loop ((i 0)) - (if (fix:< i end) - (and (fix:< (vector-8b-ref string i) #x80) - (loop (fix:+ i 1))) - #t)))) + (and (legacy-string? string) + (let ((end (legacy-string-length string))) + (let loop ((i 0)) + (if (fix:< i end) + (and (fix:< (vector-8b-ref string i) #x80) + (loop (fix:+ i 1))) + #t))))) (define (symbol-name symbol) - (guarantee-symbol symbol 'SYMBOL-NAME) - (system-pair-car symbol)) - -(define (symbol-hash symbol) - (string-hash (symbol-name symbol))) - -(define (symbol-hash-mod symbol modulus) - (string-hash-mod (symbol-name symbol) modulus)) - -(define (%symbol? x y) - (guarantee-symbol x 'SYMBOL>?) - (guarantee-symbol y 'SYMBOL>?) - (%symbolutf8-string symbol) - (string-copy (symbol-name symbol))) - -(define (symbol->wide-string symbol) - (utf8-string->wide-string (symbol-name symbol))) - -(define (symbol->string symbol) - ;; `Gensyms' are constructed with this, so try the fast copy first. - (or (ascii-string-copy (symbol-name symbol)) - (utf8-string->string (symbol-name symbol)))) \ No newline at end of file + (legacy-stringuninterned-symbol + (string->uninterned-symbol (string-append (symbol-name (identifier->symbol name)) "-value"))) names))) (output/let @@ -426,7 +426,7 @@ USA. (let ((mapping-table (rename-database/mapping-table renames))) (or (hash-table/get mapping-table key #f) (let ((mapped-identifier - (utf8-string->uninterned-symbol + (string->uninterned-symbol (symbol-name (identifier->symbol identifier))))) (hash-table/put! mapping-table key mapped-identifier) (hash-table/put! (rename-database/unmapping-table renames) @@ -445,7 +445,7 @@ USA. ;; with a nicer name. The decorations on this name are just ;; that -- decorations, for human legibility. It is the use of ;; an uninterned symbol that guarantees uniqueness. - (utf8-string->uninterned-symbol + (string->uninterned-symbol (string-append "." (symbol-name (identifier->symbol identifier)) "." diff --git a/src/runtime/url.scm b/src/runtime/url.scm index e86945f17..4430bc40a 100644 --- a/src/runtime/url.scm +++ b/src/runtime/url.scm @@ -549,7 +549,7 @@ USA. ;;;; Output (define (uri->symbol uri) - (utf8-string->symbol (uri->string uri))) + (string->symbol (uri->string uri))) (define (write-uri uri port) (write-string (uri->string uri) port)) diff --git a/src/xml/turtle.scm b/src/xml/turtle.scm index cda67b6c9..988b47863 100644 --- a/src/xml/turtle.scm +++ b/src/xml/turtle.scm @@ -202,7 +202,7 @@ USA. (define parse:language (*parser - (map utf8-string->symbol + (map string->symbol (match (seq (+ (char-set char-set:turtle-lower)) (* (seq "-" (+ (char-set char-set:turtle-lower+digit))))))))) diff --git a/src/xml/xml-names.scm b/src/xml/xml-names.scm index 829a3d339..3f654b455 100644 --- a/src/xml/xml-names.scm +++ b/src/xml/xml-names.scm @@ -142,7 +142,7 @@ USA. (begin (if (not (string-predicate object)) (error:bad-range-argument object constructor)) - (utf8-string->symbol object)) + (string->symbol object)) (begin (guarantee-symbol object constructor) (if (not (string-predicate (symbol-name object))) @@ -233,7 +233,7 @@ USA. (let ((s (symbol-name qname))) (let ((c (string-find-next-char s #\:))) (if c - (utf8-string->symbol (string-head s c)) + (string->symbol (string-head s c)) (null-xml-name-prefix))))) (define (xml-qname-local qname) @@ -244,5 +244,5 @@ USA. (let ((s (symbol-name qname))) (let ((c (string-find-next-char s #\:))) (if c - (utf8-string->symbol (string-tail s (fix:+ c 1))) + (string->symbol (string-tail s (fix:+ c 1))) qname)))) \ No newline at end of file diff --git a/src/xml/xml-rpc.scm b/src/xml/xml-rpc.scm index 96ed74ccf..2307499fd 100644 --- a/src/xml/xml-rpc.scm +++ b/src/xml/xml-rpc.scm @@ -82,7 +82,7 @@ USA. (require (xml-name=? (xml-element-name elt) '|methodCall|)) (values (let ((s (content-string (named-child '|methodName| elt)))) (require (re-string-match "\\`[a-zA-Z0-9_.:/]+\\'" s)) - (utf8-string->symbol s)) + (string->symbol s)) (let ((elt (%named-child 'params elt))) (if elt (parse-params elt) @@ -248,7 +248,7 @@ USA. (named-children 'value (single-named-child 'data elt)))) ((struct) (map (lambda (elt) - (cons (utf8-string->symbol + (cons (string->symbol (content-string (named-child 'name elt))) (decode-value (named-child 'value elt)))) (named-children 'member elt))) @@ -291,7 +291,7 @@ USA. ((string? object) (encode-string object)) ((symbol? object) - (encode-string (symbol->utf8-string object))) + (encode-string (symbol->string object))) ((decoded-time? object) (rpc-elt:date-time (decoded-time->xml-rpc-iso8601-string object))) ((and (pair? object) @@ -302,7 +302,7 @@ USA. (rpc-elt:struct (map (lambda (item) (rpc-elt:member - (rpc-elt:name (symbol->utf8-string (car item))) + (rpc-elt:name (symbol->string (car item))) (encode-value (cdr item)))) (cdr object)))) ((list? object)