From: Chris Hanson <org/chris-hanson/cph>
Date: Thu, 23 Dec 2004 04:44:18 +0000 (+0000)
Subject: Change symbol names to use UTF-8 encoding.
X-Git-Tag: 20090517-FFI~1407
X-Git-Url: https://birchwood-abbey.net/git?a=commitdiff_plain;h=bfcd71042c4135fe6e174db9d8fc8aa61cbd558f;p=mit-scheme.git

Change symbol names to use UTF-8 encoding.
---

diff --git a/v7/src/runtime/runtime.pkg b/v7/src/runtime/runtime.pkg
index 23e365937..46cc1f464 100644
--- a/v7/src/runtime/runtime.pkg
+++ b/v7/src/runtime/runtime.pkg
@@ -1,6 +1,6 @@
 #| -*-Scheme-*-
 
-$Id: runtime.pkg,v 14.523 2004/12/20 04:38:49 cph Exp $
+$Id: runtime.pkg,v 14.524 2004/12/23 04:43:38 cph Exp $
 
 Copyright 1988,1989,1990,1991,1992,1993 Massachusetts Institute of Technology
 Copyright 1994,1995,1996,1997,1998,1999 Massachusetts Institute of Technology
@@ -501,13 +501,16 @@ USA.
 	  substring->symbol
 	  symbol
 	  symbol->string
+	  symbol->utf8-string
 	  symbol-append
 	  symbol-hash
 	  symbol-hash-mod
 	  symbol-name
 	  symbol<?
 	  symbol?
-	  uninterned-symbol?)
+	  uninterned-symbol?
+	  utf8-string->symbol
+	  utf8-string->uninterned-symbol)
   (export (runtime parser)
 	  %string->symbol))
 
diff --git a/v7/src/runtime/symbol.scm b/v7/src/runtime/symbol.scm
index eb24c8e4d..cba4d0890 100644
--- a/v7/src/runtime/symbol.scm
+++ b/v7/src/runtime/symbol.scm
@@ -1,6 +1,6 @@
 #| -*-Scheme-*-
 
-$Id: symbol.scm,v 1.16 2004/07/15 04:07:40 cph Exp $
+$Id: symbol.scm,v 1.17 2004/12/23 04:43:48 cph Exp $
 
 Copyright 1992,1993,2001,2003,2004 Massachusetts Institute of Technology
 
@@ -52,44 +52,98 @@ USA.
 
 (define (string->uninterned-symbol string)
   (guarantee-string string 'STRING->UNINTERNED-SYMBOL)
+  ((ucode-primitive system-pair-cons) (ucode-type uninterned-symbol)
+				      (string->utf8-string string)
+				      (make-unmapped-unbound-reference-trap)))
+
+(define (utf8-string->uninterned-symbol string)
+  (guarantee-string string 'UTF8-STRING->UNINTERNED-SYMBOL)
   ((ucode-primitive system-pair-cons) (ucode-type uninterned-symbol)
 				      string
 				      (make-unmapped-unbound-reference-trap)))
 
 (define (string->symbol string)
-  ;; Calling STRING-COPY prevents the symbol from being affected if
-  ;; the string is mutated.  The string is copied only if the symbol
-  ;; is created.
+  (guarantee-string string 'STRING->SYMBOL)
+  (let ((string* (string->utf8-string string)))
+    (if (eq? string* string)
+	(or ((ucode-primitive find-symbol) string)
+	    ((ucode-primitive string->symbol) (string-copy string)))
+	((ucode-primitive string->symbol) string*))))
+
+(define (utf8-string->symbol string)
+  (guarantee-string string 'UTF8-STRING->SYMBOL)
   (or ((ucode-primitive find-symbol) string)
       ((ucode-primitive string->symbol) (string-copy string))))
 
 (define (%string->symbol string)
-  (or ((ucode-primitive find-symbol) string)
-      ((ucode-primitive string->symbol) string)))
+  ((ucode-primitive string->symbol) (string->utf8-string string)))
 
 (define (substring->symbol string start end)
-  ((ucode-primitive string->symbol) (substring string start end)))
+  (guarantee-substring string start end 'SUBSTRING->SYMBOL)
+  ((ucode-primitive string->symbol) (substring->utf8-string string start end)))
 
 (define (string-head->symbol string end)
-  ((ucode-primitive string->symbol) (string-head string end)))
+  (substring->symbol string 0 end))
 
 (define (string-tail->symbol string start)
-  ((ucode-primitive string->symbol) (string-tail string start)))
+  (substring->symbol string start (string-length string)))
 
 (define (symbol . objects)
   ((ucode-primitive string->symbol)
-   (apply string-append
-	  (map (lambda (object)
-		 (cond ((symbol? object) (symbol-name object))
-		       ((string? object) object)
-		       ((char? object) (string object))
-		       ((number? object) (number->string object))
-		       ((not object) "")
-		       (else
-			(error:wrong-type-argument object
-						   "symbol component"
-						   'SYMBOL))))
-	       objects))))
+   (apply string-append (map ->utf8-string objects))))
+
+(define (->utf8-string object)
+  (cond ((symbol? object) (symbol-name object))
+	((string? object) (string->utf8-string object))
+	((wide-string? object) (wide-string->utf8-string object))
+	((wide-char? object) (wide-string->utf8-string (wide-string object)))
+	((number? object) (number->string object))
+	((not object) "")
+	(else (error:wrong-type-argument object "symbol component" 'SYMBOL))))
+
+(define (string->utf8-string string)
+  (let ((end (string-length string)))
+    (let ((n (count-non-ascii string 0 end)))
+      (if (fix:> n 0)
+	  (let ((string* (make-string (fix:+ end n))))
+	    (%substring->utf8-string string 0 end string*)
+	    string*)
+	  string))))
+
+(define (substring->utf8-string string start end)
+  (let ((string*
+	 (make-string
+	  (fix:+ (fix:- end start)
+		 (count-non-ascii string start end)))))
+    (%substring->utf8-string string start end string*)
+    string*))
+
+(define (count-non-ascii string start end)
+  (let loop ((i start) (n 0))
+    (if (fix:< i end)
+	(loop (fix:+ i 1)
+	      (if (fix:< (vector-8b-ref string i) #x80)
+		  n
+		  (fix:+ n 1)))
+	n)))
+
+(define (%substring->utf8-string string start end string*)
+  (let loop ((i start) (i* 0))
+    (if (fix:< i end)
+	(if (fix:< (vector-8b-ref string i) #x80)
+	    (begin
+	      (vector-8b-set! string* i* (vector-8b-ref string i))
+	      (loop (fix:+ i 1) (fix:+ i* 1)))
+	    (begin
+	      (vector-8b-set!
+	       string*
+	       i*
+	       (fix:or #xC0 (fix:lsh (vector-8b-ref string i) -6)))
+	      (vector-8b-set!
+	       string*
+	       (fix:+ i* 1)
+	       (fix:or #x80 (fix:and (vector-8b-ref string i) #x3F)))
+	      (loop (fix:+ i 1) (fix:+ i* 2)))))))
 
 (define (intern string)
   (if (string-lower-case? string)
@@ -106,20 +160,23 @@ USA.
   (guarantee-symbol symbol 'SYMBOL-NAME)
   (system-pair-car symbol))
 
-(define-integrable (symbol->string symbol)
-  (string-copy (symbol-name symbol)))
-
 (define (symbol-append . symbols)
   ((ucode-primitive string->symbol)
-   (apply string-append (map symbol-name symbols))))
+   (apply string-append
+	  (map (lambda (symbol)
+		 (guarantee-symbol symbol 'SYMBOL-APPEND)
+		 (system-pair-car symbol))
+	       symbols))))
 
-(define-integrable (symbol-hash symbol)
+(define (symbol-hash symbol)
   (string-hash (symbol-name symbol)))
 
-(define-integrable (symbol-hash-mod symbol modulus)
+(define (symbol-hash-mod symbol modulus)
   (string-hash-mod (symbol-name symbol) modulus))
 
 (define (symbol<? x y)
+  (guarantee-symbol x 'SYMBOL<?)
+  (guarantee-symbol y 'SYMBOL<?)
   (let ((sx (system-pair-car x))
 	(sy (system-pair-car y)))
     (let ((lx (string-length sx))
@@ -131,4 +188,10 @@ USA.
 		((fix:= (vector-8b-ref sx i) (vector-8b-ref sy i))
 		 (loop (fix:+ i 1)))
 		(else
-		 (fix:< (vector-8b-ref sx i) (vector-8b-ref sy i)))))))))
\ No newline at end of file
+		 (fix:< (vector-8b-ref sx i) (vector-8b-ref sy i)))))))))
+
+(define (symbol->utf8-string symbol)
+  (string-copy (symbol-name symbol)))
+
+(define (symbol->string symbol)
+  (wide-string->string (utf8-string->wide-string (symbol-name symbol))))
\ No newline at end of file
diff --git a/v7/src/xml/xml-names.scm b/v7/src/xml/xml-names.scm
index 7de854304..69025b783 100644
--- a/v7/src/xml/xml-names.scm
+++ b/v7/src/xml/xml-names.scm
@@ -1,6 +1,6 @@
 #| -*-Scheme-*-
 
-$Id: xml-names.scm,v 1.7 2004/10/14 02:48:51 cph Exp $
+$Id: xml-names.scm,v 1.8 2004/12/23 04:44:18 cph Exp $
 
 Copyright 2003,2004 Massachusetts Institute of Technology
 
@@ -43,7 +43,7 @@ USA.
 	    ((#f) #f)
 	    ((ILLEGAL) iri)
 	    (else
-	     (let ((prefix (string-head->symbol s c)))
+	     (let ((prefix (utf8-string->symbol (string-head s c))))
 	       (or (and (eq? prefix 'xml)
 			(not (eq? iri xml-iri)))
 		   (and (eq? prefix 'xmlns)
@@ -84,7 +84,7 @@ USA.
       (begin
 	(if (not (string-is-xml-nmtoken? object))
 	    (error:bad-range-argument object 'MAKE-XML-NMTOKEN))
-	(string->symbol object))
+	(utf8-string->symbol object))
       (begin
 	(guarantee-xml-nmtoken object 'MAKE-XML-NMTOKEN)
 	object)))
@@ -203,7 +203,7 @@ USA.
       (begin
 	(if (not (string-is-xml-name? object))
 	    (error:bad-range-argument object 'MAKE-XML-QNAME))
-	(string->symbol object))
+	(utf8-string->symbol object))
       (begin
 	(guarantee-xml-qname object 'MAKE-XML-QNAME)
 	object)))
@@ -221,21 +221,21 @@ USA.
 
 (define (xml-qname-string qname)
   (guarantee-xml-qname qname 'XML-QNAME-STRING)
-  (symbol->string qname))
+  (symbol->utf8-string qname))
 
 (define (xml-qname-local qname)
   (let ((s (symbol-name qname)))
     (let ((c (find-prefix-separator s)))
       (if (or (not c) (eq? c 'ILLEGAL))
 	  qname
-	  (string-tail->symbol s (fix:+ c 1))))))
+	  (utf8-string->symbol (string-tail s (fix:+ c 1)))))))
 
 (define (xml-qname-prefix qname)
   (let ((s (symbol-name qname)))
     (let ((c (find-prefix-separator s)))
       (if (or (not c) (eq? c 'ILLEGAL))
 	  (null-xml-name-prefix)
-	  (string-head->symbol s c)))))
+	  (utf8-string->symbol (string-head s c))))))
 
 (define (find-prefix-separator s)
   (let ((c (string-find-next-char s #\:)))