From: Chris Hanson Date: Mon, 18 Aug 2008 00:12:53 +0000 (+0000) Subject: Export procedures for managing UTF-16 surrogate pairs. X-Git-Tag: 20090517-FFI~244 X-Git-Url: https://birchwood-abbey.net/git?a=commitdiff_plain;h=cb4bfa686299b7a07cfd1ac87e02873f76f7cb42;p=mit-scheme.git Export procedures for managing UTF-16 surrogate pairs. --- diff --git a/v7/src/runtime/runtime.pkg b/v7/src/runtime/runtime.pkg index f01271450..f924cc68f 100644 --- a/v7/src/runtime/runtime.pkg +++ b/v7/src/runtime/runtime.pkg @@ -1,6 +1,6 @@ #| -*-Scheme-*- -$Id: runtime.pkg,v 14.655 2008/07/27 04:24:13 cph Exp $ +$Id: runtime.pkg,v 14.656 2008/08/18 00:12:49 cph Exp $ Copyright (C) 1986, 1987, 1988, 1989, 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, @@ -4880,11 +4880,14 @@ USA. char-in-alphabet? char-set->alphabet code-points->alphabet + combine-utf16-surrogates error:not-8-bit-alphabet error:not-alphabet error:not-unicode-code-point error:not-utf16-be-string + error:not-utf16-high-surrogate error:not-utf16-le-string + error:not-utf16-low-surrogate error:not-utf16-string error:not-utf32-be-string error:not-utf32-le-string @@ -4899,7 +4902,9 @@ USA. guarantee-alphabet guarantee-unicode-code-point guarantee-utf16-be-string + guarantee-utf16-high-surrogate guarantee-utf16-le-string + guarantee-utf16-low-surrogate guarantee-utf16-string guarantee-utf32-be-string guarantee-utf32-le-string @@ -4925,6 +4930,7 @@ USA. open-utf32-output-string open-utf8-input-string open-utf8-output-string + split-into-utf16-surrogates string->alphabet string->utf16-be-string string->utf16-le-string @@ -4940,10 +4946,12 @@ USA. utf16-be-string-length utf16-be-string-valid? utf16-be-string? + utf16-high-surrogate? utf16-le-string->wide-string utf16-le-string-length utf16-le-string-valid? utf16-le-string? + utf16-low-surrogate? utf16-string->wide-string utf16-string-length utf16-string-valid? diff --git a/v7/src/runtime/unicode.scm b/v7/src/runtime/unicode.scm index 886910fbe..5ffae99a5 100644 --- a/v7/src/runtime/unicode.scm +++ b/v7/src/runtime/unicode.scm @@ -1,6 +1,6 @@ #| -*-Scheme-*- -$Id: unicode.scm,v 1.41 2008/08/17 09:42:29 cph Exp $ +$Id: unicode.scm,v 1.42 2008/08/18 00:12:53 cph Exp $ Copyright (C) 1986, 1987, 1988, 1989, 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, @@ -937,9 +937,9 @@ Not used at the moment. (if (fix:< start end) (and (fix:<= (fix:+ start 2) end) (let ((d0 (combiner (n 0) (n 1)))) - (if (high-surrogate? d0) + (if (utf16-high-surrogate? d0) (and (fix:<= (fix:+ start 4) end) - (low-surrogate? (combiner (n 2) (n 3))) + (utf16-low-surrogate? (combiner (n 2) (n 3))) (fix:+ start 4)) (and (legal-code-16? d0) (fix:+ start 2))))) @@ -951,17 +951,19 @@ Not used at the moment. (define (le-octets->digit16 b0 b1) (fix:or (fix:lsh b1 8) b0)) -(define-integrable (high-surrogate? n) - (fix:= #xD800 (fix:and #xFC00 n))) - -(define-integrable (low-surrogate? n) - (fix:= #xDC00 (fix:and #xFC00 n))) - -(define-integrable (combine-surrogates n0 n1) - (fix:+ (fix:+ (fix:lsh (fix:and n0 #x3FF) 10) - (fix:and n1 #x3FF)) +(define (combine-utf16-surrogates h l) + (guarantee-utf16-high-surrogate h 'combine-utf16-surrogates) + (guarantee-utf16-low-surrogate l 'combine-utf16-surrogates) + (fix:+ (fix:+ (fix:lsh (fix:and h #x3FF) 10) + (fix:and l #x3FF)) #x10000)) +(define (split-into-utf16-surrogates n) + (guarantee-unicode-code-point n 'split-into-utf16-surrogates) + (let ((n (fix:- n #x10000))) + (values (fix:or (fix:and (fix:lsh n -10) #x03FF) #xD800) + (fix:or (fix:and n #x03FF) #xDC00)))) + (define (utf16-string? object) (and (string? object) (utf16-string-valid? object))) @@ -974,9 +976,19 @@ Not used at the moment. (and (string? object) (utf16-le-string-valid? object))) +(define (utf16-high-surrogate? n) + (and (index-fixnum? n) + (fix:= #xD800 (fix:and #xFC00 n)))) + +(define (utf16-low-surrogate? n) + (and (index-fixnum? n) + (fix:= #xDC00 (fix:and #xFC00 n)))) + (define-guarantee utf16-string "UTF-16 string") (define-guarantee utf16-be-string "UTF-16BE string") (define-guarantee utf16-le-string "UTF-16LE string") +(define-guarantee utf16-high-surrogate "UTF-16 high surrogate") +(define-guarantee utf16-low-surrogate "UTF-16 low surrogate") ;;;; UTF-8 representation