From: Chris Hanson Date: Thu, 23 Dec 2004 04:44:18 +0000 (+0000) Subject: Change symbol names to use UTF-8 encoding. X-Git-Tag: 20090517-FFI~1407 X-Git-Url: https://birchwood-abbey.net/git?a=commitdiff_plain;h=bfcd71042c4135fe6e174db9d8fc8aa61cbd558f;p=mit-scheme.git Change symbol names to use UTF-8 encoding. --- diff --git a/v7/src/runtime/runtime.pkg b/v7/src/runtime/runtime.pkg index 23e365937..46cc1f464 100644 --- a/v7/src/runtime/runtime.pkg +++ b/v7/src/runtime/runtime.pkg @@ -1,6 +1,6 @@ #| -*-Scheme-*- -$Id: runtime.pkg,v 14.523 2004/12/20 04:38:49 cph Exp $ +$Id: runtime.pkg,v 14.524 2004/12/23 04:43:38 cph Exp $ Copyright 1988,1989,1990,1991,1992,1993 Massachusetts Institute of Technology Copyright 1994,1995,1996,1997,1998,1999 Massachusetts Institute of Technology @@ -501,13 +501,16 @@ USA. substring->symbol symbol symbol->string + symbol->utf8-string symbol-append symbol-hash symbol-hash-mod symbol-name symbolsymbol + utf8-string->uninterned-symbol) (export (runtime parser) %string->symbol)) diff --git a/v7/src/runtime/symbol.scm b/v7/src/runtime/symbol.scm index eb24c8e4d..cba4d0890 100644 --- a/v7/src/runtime/symbol.scm +++ b/v7/src/runtime/symbol.scm @@ -1,6 +1,6 @@ #| -*-Scheme-*- -$Id: symbol.scm,v 1.16 2004/07/15 04:07:40 cph Exp $ +$Id: symbol.scm,v 1.17 2004/12/23 04:43:48 cph Exp $ Copyright 1992,1993,2001,2003,2004 Massachusetts Institute of Technology @@ -52,44 +52,98 @@ USA. (define (string->uninterned-symbol string) (guarantee-string string 'STRING->UNINTERNED-SYMBOL) + ((ucode-primitive system-pair-cons) (ucode-type uninterned-symbol) + (string->utf8-string string) + (make-unmapped-unbound-reference-trap))) + +(define (utf8-string->uninterned-symbol string) + (guarantee-string string 'UTF8-STRING->UNINTERNED-SYMBOL) ((ucode-primitive system-pair-cons) (ucode-type uninterned-symbol) string (make-unmapped-unbound-reference-trap))) (define (string->symbol string) - ;; Calling STRING-COPY prevents the symbol from being affected if - ;; the string is mutated. The string is copied only if the symbol - ;; is created. + (guarantee-string string 'STRING->SYMBOL) + (let ((string* (string->utf8-string string))) + (if (eq? string* string) + (or ((ucode-primitive find-symbol) string) + ((ucode-primitive string->symbol) (string-copy string))) + ((ucode-primitive string->symbol) string*)))) + +(define (utf8-string->symbol string) + (guarantee-string string 'UTF8-STRING->SYMBOL) (or ((ucode-primitive find-symbol) string) ((ucode-primitive string->symbol) (string-copy string)))) (define (%string->symbol string) - (or ((ucode-primitive find-symbol) string) - ((ucode-primitive string->symbol) string))) + ((ucode-primitive string->symbol) (string->utf8-string string))) (define (substring->symbol string start end) - ((ucode-primitive string->symbol) (substring string start end))) + (guarantee-substring string start end 'SUBSTRING->SYMBOL) + ((ucode-primitive string->symbol) (substring->utf8-string string start end))) (define (string-head->symbol string end) - ((ucode-primitive string->symbol) (string-head string end))) + (substring->symbol string 0 end)) (define (string-tail->symbol string start) - ((ucode-primitive string->symbol) (string-tail string start))) + (substring->symbol string start (string-length string))) (define (symbol . objects) ((ucode-primitive string->symbol) - (apply string-append - (map (lambda (object) - (cond ((symbol? object) (symbol-name object)) - ((string? object) object) - ((char? object) (string object)) - ((number? object) (number->string object)) - ((not object) "") - (else - (error:wrong-type-argument object - "symbol component" - 'SYMBOL)))) - objects)))) + (apply string-append (map ->utf8-string objects)))) + +(define (->utf8-string object) + (cond ((symbol? object) (symbol-name object)) + ((string? object) (string->utf8-string object)) + ((wide-string? object) (wide-string->utf8-string object)) + ((wide-char? object) (wide-string->utf8-string (wide-string object))) + ((number? object) (number->string object)) + ((not object) "") + (else (error:wrong-type-argument object "symbol component" 'SYMBOL)))) + +(define (string->utf8-string string) + (let ((end (string-length string))) + (let ((n (count-non-ascii string 0 end))) + (if (fix:> n 0) + (let ((string* (make-string (fix:+ end n)))) + (%substring->utf8-string string 0 end string*) + string*) + string)))) + +(define (substring->utf8-string string start end) + (let ((string* + (make-string + (fix:+ (fix:- end start) + (count-non-ascii string start end))))) + (%substring->utf8-string string start end string*) + string*)) + +(define (count-non-ascii string start end) + (let loop ((i start) (n 0)) + (if (fix:< i end) + (loop (fix:+ i 1) + (if (fix:< (vector-8b-ref string i) #x80) + n + (fix:+ n 1))) + n))) + +(define (%substring->utf8-string string start end string*) + (let loop ((i start) (i* 0)) + (if (fix:< i end) + (if (fix:< (vector-8b-ref string i) #x80) + (begin + (vector-8b-set! string* i* (vector-8b-ref string i)) + (loop (fix:+ i 1) (fix:+ i* 1))) + (begin + (vector-8b-set! + string* + i* + (fix:or #xC0 (fix:lsh (vector-8b-ref string i) -6))) + (vector-8b-set! + string* + (fix:+ i* 1) + (fix:or #x80 (fix:and (vector-8b-ref string i) #x3F))) + (loop (fix:+ i 1) (fix:+ i* 2))))))) (define (intern string) (if (string-lower-case? string) @@ -106,20 +160,23 @@ USA. (guarantee-symbol symbol 'SYMBOL-NAME) (system-pair-car symbol)) -(define-integrable (symbol->string symbol) - (string-copy (symbol-name symbol))) - (define (symbol-append . symbols) ((ucode-primitive string->symbol) - (apply string-append (map symbol-name symbols)))) + (apply string-append + (map (lambda (symbol) + (guarantee-symbol symbol 'SYMBOL-APPEND) + (system-pair-car symbol)) + symbols)))) -(define-integrable (symbol-hash symbol) +(define (symbol-hash symbol) (string-hash (symbol-name symbol))) -(define-integrable (symbol-hash-mod symbol modulus) +(define (symbol-hash-mod symbol modulus) (string-hash-mod (symbol-name symbol) modulus)) (define (symbolutf8-string symbol) + (string-copy (symbol-name symbol))) + +(define (symbol->string symbol) + (wide-string->string (utf8-string->wide-string (symbol-name symbol)))) \ No newline at end of file diff --git a/v7/src/xml/xml-names.scm b/v7/src/xml/xml-names.scm index 7de854304..69025b783 100644 --- a/v7/src/xml/xml-names.scm +++ b/v7/src/xml/xml-names.scm @@ -1,6 +1,6 @@ #| -*-Scheme-*- -$Id: xml-names.scm,v 1.7 2004/10/14 02:48:51 cph Exp $ +$Id: xml-names.scm,v 1.8 2004/12/23 04:44:18 cph Exp $ Copyright 2003,2004 Massachusetts Institute of Technology @@ -43,7 +43,7 @@ USA. ((#f) #f) ((ILLEGAL) iri) (else - (let ((prefix (string-head->symbol s c))) + (let ((prefix (utf8-string->symbol (string-head s c)))) (or (and (eq? prefix 'xml) (not (eq? iri xml-iri))) (and (eq? prefix 'xmlns) @@ -84,7 +84,7 @@ USA. (begin (if (not (string-is-xml-nmtoken? object)) (error:bad-range-argument object 'MAKE-XML-NMTOKEN)) - (string->symbol object)) + (utf8-string->symbol object)) (begin (guarantee-xml-nmtoken object 'MAKE-XML-NMTOKEN) object))) @@ -203,7 +203,7 @@ USA. (begin (if (not (string-is-xml-name? object)) (error:bad-range-argument object 'MAKE-XML-QNAME)) - (string->symbol object)) + (utf8-string->symbol object)) (begin (guarantee-xml-qname object 'MAKE-XML-QNAME) object))) @@ -221,21 +221,21 @@ USA. (define (xml-qname-string qname) (guarantee-xml-qname qname 'XML-QNAME-STRING) - (symbol->string qname)) + (symbol->utf8-string qname)) (define (xml-qname-local qname) (let ((s (symbol-name qname))) (let ((c (find-prefix-separator s))) (if (or (not c) (eq? c 'ILLEGAL)) qname - (string-tail->symbol s (fix:+ c 1)))))) + (utf8-string->symbol (string-tail s (fix:+ c 1))))))) (define (xml-qname-prefix qname) (let ((s (symbol-name qname))) (let ((c (find-prefix-separator s))) (if (or (not c) (eq? c 'ILLEGAL)) (null-xml-name-prefix) - (string-head->symbol s c))))) + (utf8-string->symbol (string-head s c)))))) (define (find-prefix-separator s) (let ((c (string-find-next-char s #\:)))