Implement tests for characters, particularly UTF-8 codec.

author Chris Hanson <org/chris-hanson/cph>

Fri, 20 Jan 2017 09:46:10 +0000 (01:46 -0800)

committer Chris Hanson <org/chris-hanson/cph>

Fri, 20 Jan 2017 09:46:10 +0000 (01:46 -0800)
author Chris Hanson <org/chris-hanson/cph>
Fri, 20 Jan 2017 09:46:10 +0000 (01:46 -0800)
committer Chris Hanson <org/chris-hanson/cph>
Fri, 20 Jan 2017 09:46:10 +0000 (01:46 -0800)
diff --git a/tests/runtime/test-char.scm b/tests/runtime/test-char.scm

new file mode 100644 (file)

index 0000000..e8705a6
--- /dev/null
+++ b/tests/runtime/test-char.scm
@@ -0,0 +1,286 @@
+#| -*-Scheme-*-
+
+Copyright (C) 1986, 1987, 1988, 1989, 1990, 1991, 1992, 1993, 1994,
+    1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005,
+    2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016,
+    2017 Massachusetts Institute of Technology
+
+This file is part of MIT/GNU Scheme.
+
+MIT/GNU Scheme is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or (at
+your option) any later version.
+
+MIT/GNU Scheme is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with MIT/GNU Scheme; if not, write to the Free Software
+Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301,
+USA.
+
+|#
+
+;;;; Test of character abstraction
+
+(declare (usual-integrations))
+\f
+(define named-chars
+  '((#\null #x00)
+    (#\alarm #x07)
+    (#\backspace #x08)
+    (#\tab #x09)
+    (#\newline #x0A)
+    (#\page #x0C)
+    (#\return #x0D)
+    (#\escape #x1B)
+    (#\space #x20)
+    (#\delete #x7F)))
+
+(define-test 'named-chars
+  (lambda ()
+    (for-each (lambda (entry)
+               (let ((char (car entry))
+                     (code (cadr entry)))
+                 (assert-= code (char->integer char))
+                 (assert-eq char (integer->char code))))
+             named-chars)))
+
+(define ascii-chars
+  '(#\u+00 #\u+01 #\u+02 #\u+03 #\u+04 #\u+05 #\u+06 #\u+07
+    #\u+08 #\u+09 #\u+0A #\u+0B #\u+0C #\u+0D #\u+0E #\u+0F
+    #\u+10 #\u+11 #\u+12 #\u+13 #\u+14 #\u+15 #\u+16 #\u+17
+    #\u+18 #\u+19 #\u+1A #\u+1B #\u+1C #\u+1D #\u+1E #\u+1F
+    #\u+20 #\! #\" #\# #\$ #\% #\& #\' #\( #\) #\* #\+ #\, #\- #\. #\/
+    #\0 #\1 #\2 #\3 #\4 #\5 #\6 #\7 #\8 #\9 #\: #\; #\< #\= #\> #\? #\@
+    #\A #\B #\C #\D #\E #\F #\G #\H #\I #\J #\K #\L #\M #\N #\O #\P #\Q
+    #\R #\S #\T #\U #\V #\W #\X #\Y #\Z #\[ #\\ #\] #\^ #\_ #\`
+    #\a #\b #\c #\d #\e #\f #\g #\h #\i #\j #\k #\l #\m #\n #\o #\p #\q
+    #\r #\s #\t #\u #\v #\w #\x #\y #\z #\{ #\| #\} #\~ #\u+7F))
+
+(define-test 'basic-ascii
+  (lambda ()
+    (for-each basic-char-tests
+              ascii-chars
+              (iota #x80))))
+
+(define (basic-char-tests char code)
+  (assert-true (char? char))
+  (assert-eq char (integer->char code))
+  (assert-= code (char->integer char)))
+\f
+(define-test 'valid-utf8-sequences
+  (lambda ()
+    (for-each (lambda (entry)
+                (let ((bytes (car entry))
+                      (char (cadr entry)))
+                 (let ((n (bytevector-length bytes)))
+                   (assert-= n
+                             (initial-byte->utf8-char-length
+                              (bytevector-u8-ref bytes 0)))
+                   (assert-= n
+                             (char-utf8-byte-length char))
+                   (assert-eq char (decode-utf8-char bytes 0))
+                   (do ((offset 0 (+ offset 1)))
+                       ((not (< offset 8)))
+                     (let ((bv (make-bytevector 16))
+                           (m (+ offset n)))
+                       (assert-= m (encode-utf8-char! bv offset char))
+                       (assert-equal bytes (bytevector-copy bv offset m)))))))
+              valid-utf8-sequences)))
+
+(define valid-utf8-sequences
+  `(
+    ,@(map (lambda (char)
+            (list (bytevector (char->integer char)) char))
+          ascii-chars)
+
+    ;; 2.1  First possible sequence of a certain length
+    (#u8(#x00) #\u+00000000)
+    (#u8(#xC2 #x80) #\u+00000080)
+    (#u8(#xE0 #xA0 #x80) #\u+00000800)
+    (#u8(#xF0 #x90 #x80 #x80) #\u+00010000)
+
+    ;; 2.2  Last possible sequence of a certain length
+    (#u8(#x7F) #\u+0000007F)
+    (#u8(#xDF #xBF) #\u+000007FF)
+    (#u8(#xEF #xBF #xBD) #\u+0000FFFD)
+
+    ;; 2.3  Other boundary conditions
+    (#u8(#xED #x9F #xBF) #\u+0000D7FF)
+    (#u8(#xEE #x80 #x80) #\u+0000E000)
+    (#u8(#xEF #xBF #xBD) #\u+0000FFFD)
+    (#u8(#xF4 #x8F #xBF #xBD) #\u+0010FFFD)
+    ))
+\f
+(define-test 'invalid-known-length-utf8-sequences
+  (lambda ()
+    (for-each (lambda (entry)
+               (let ((bytes (car entry))
+                     (length (cadr entry)))
+                 (assert-= length
+                           (initial-byte->utf8-char-length
+                             (bytevector-u8-ref bytes 0)))
+                 (assert-error (lambda () (decode-utf8-char bytes 0)))))
+              invalid-known-length-sequences)))
+
+(define invalid-known-length-sequences
+  `(
+    ;; 3.2  Lonely start characters
+    ;; 3.2.1  All 32 first bytes of 2-byte sequences (0xc0-0xdf),
+    ;;        each followed by a space character:
+    ,@(map (lambda (initial-byte)
+             (list (bytevector initial-byte #x20) 2))
+           (iota #x20 #xC0))
+
+    ;; 3.2.2  All 16 first bytes of 3-byte sequences (0xe0-0xef),
+    ;;        each followed by a space character:
+    ,@(map (lambda (initial-byte)
+             (list (bytevector initial-byte #x20) 3))
+           (iota #x10 #xE0))
+
+    ;; 3.2.3  All 8 first bytes of 4-byte sequences (0xf0-0xf7),
+    ;;        each followed by a space character:
+    ,@(map (lambda (initial-byte)
+             (list (bytevector initial-byte #x20) 4))
+           (iota #x08 #xF0))
+
+    ;; 3.3  Sequences with last continuation byte missing
+    ,@(map (lambda (bytes)
+             (list bytes (+ (bytevector-length bytes) 1)))
+           '(
+             #u8(#xC0)                     ; #\u+0000
+             #u8(#xE0 #x80)                ; #\u+0000
+             #u8(#xF0 #x80 #x80)           ; #\u+0000
+             #u8(#xDF)                     ; #\u+000007FF
+             #u8(#xEF #xBF)                ; #\u+0000FFFF
+             #u8(#xF7 #xBF #xBF)           ; #\u+001FFFFF
+             ))
+    ))
+
+(define-test 'illegal-chars
+  (lambda ()
+    (for-each (lambda (cp)
+               (assert-false (unicode-scalar-value? cp))
+               (let ((char (integer->char cp)))
+                 (assert-false (unicode-char? char))
+                 (assert-error
+                  (lambda ()
+                    (encode-utf8-char! (make-bytevector 16) 0 char)))))
+             illegal-characters)))
+
+(define illegal-characters
+  `(
+    ;; Other noncharacters:
+    ,@(iota #x20 #xFDD0)
+    ,@(append-map (lambda (plane)
+                   (let ((prefix (* plane #x10000)))
+                     (list (+ prefix #xFFFE)
+                           (+ prefix #xFFFF))))
+                 (iota #x11 #x01))
+    ))
+\f
+(define-test 'invalid-utf8-sequences
+  (lambda ()
+    (for-each (lambda (bytes)
+                (assert-error (lambda () (decode-utf8-char bytes 0))))
+              invalid-utf8-sequences)))
+
+(define invalid-utf8-sequences
+  `(
+
+    ;; 2.2  Last possible sequence of a certain length
+    #u8(#xEF #xBF #xBF)                        ; #\u+0000FFFF
+    #u8(#xF7 #xBF #xBF #xBF)           ; #\u+001FFFFF
+
+    ;; 2.3  Other boundary conditions
+    #u8(#xF4 #x8F #xBF #xBF)           ; #\u+0010FFFF
+    #u8(#xF4 #x90 #x80 #x80)           ; #\u+00110000
+
+    ;; 3.1  Unexpected continuation bytes
+    ;; (duplicated below)
+    ;; ,@(map bytevector
+    ;;        (iota #x20 #x80))
+
+    ;; 2 continuation bytes: #u8(#x80 #xBF)
+    ;; 3 continuation bytes: #u8(#x80 #xBF #x80)
+    ;; 4 continuation bytes: #u8(#x80 #xBF #x80 #xBF)
+    ;; 5 continuation bytes: #u8(#x80 #xBF #x80 #xBF #x80)
+    ;; 6 continuation bytes: #u8(#x80 #xBF #x80 #xBF #x80 #xBF)
+    ;; 7 continuation bytes: #u8(#x80 #xBF #x80 #xBF #x80 #xBF #x80)
+
+    ;; 3.1.9  Sequence of all 64 possible continuation bytes (0x80-0xbf):
+    ,@(map bytevector
+           (iota #x40 #x80))
+
+    ;; 3.2.4  All 4 first bytes of 5-byte sequences (0xf8-0xfb),
+    ;;        each followed by a space character:
+    ,@(map (lambda (initial-byte)
+             (bytevector initial-byte #x20))
+           (iota #x04 #xF8))
+
+    ;; 3.2.5  All 2 first bytes of 6-byte sequences (0xfc-0xfd),
+    ;;        each followed by a space character:
+    ,@(map (lambda (initial-byte)
+             (bytevector initial-byte #x20))
+           (iota #x02 #xFC))
+
+    ;; 3.3  Sequences with last continuation byte missing
+    #u8(#xF8 #x80 #x80 #x80)           ; #\u+0000
+    #u8(#xFC #x80 #x80 #x80 #x80)      ; #\u+0000
+    #u8(#xFB #xBF #xBF #xBF)           ; #\u+03FFFFFF
+    #u8(#xFD #xBF #xBF #xBF #xBF)      ; #\u+7FFFFFFF
+
+    ;; 4.1  Examples of an overlong ASCII character
+    #u8(#xC0 #xAF)                      ; #\u+002F
+    #u8(#xE0 #x80 #xAF)                 ; #\u+002F
+    #u8(#xF0 #x80 #x80 #xAF)            ; #\u+002F
+    #u8(#xF8 #x80 #x80 #x80 #xAF)       ; #\u+002F
+    #u8(#xFC #x80 #x80 #x80 #x80 #xAF)  ; #\u+002F
+
+    ;; 4.2  Maximum overlong sequences
+    #u8(#xC1 #xBF)                      ; #\u+0000007F
+    #u8(#xE0 #x9F #xBF)                 ; #\u+000007FF
+    #u8(#xF0 #x8F #xBF #xBF)            ; #\u+0000FFFF
+    #u8(#xF8 #x87 #xBF #xBF #xBF)       ; #\u+001FFFFF
+    #u8(#xFC #x83 #xBF #xBF #xBF #xBF)  ; #\u+03FFFFFF
+
+    ;; 4.3  Overlong representation of the NUL character
+    #u8(#xC0 #x80)                      ; #\u+0000
+    #u8(#xE0 #x80 #x80)                 ; #\u+0000
+    #u8(#xF0 #x80 #x80 #x80)            ; #\u+0000
+    #u8(#xF8 #x80 #x80 #x80 #x80)       ; #\u+0000
+    #u8(#xFC #x80 #x80 #x80 #x80 #x80)  ; #\u+0000
+
+    ;; 3.5  Impossible bytes
+    #u8(#xFE)
+    #u8(#xFF)
+    #u8(#xFE #xFE #xFF #xFF)
+
+    ;; 5.1 Single UTF-16 surrogates
+    #u8(#xED #xA0 #x80)                 ; #\u+D800
+    #u8(#xED #xAD #xBF)                 ; #\u+DB7F
+    #u8(#xED #xAE #x80)                 ; #\u+DB80
+    #u8(#xED #xAF #xBF)                 ; #\u+DBFF
+    #u8(#xED #xB0 #x80)                 ; #\u+DC00
+    #u8(#xED #xBE #x80)                 ; #\u+DF80
+    #u8(#xED #xBF #xBF)                 ; #\u+DFFF
+
+    ;; 5.2 Paired UTF-16 surrogates
+    ;; (#\u+D800 #\u+DC00 #u8(#xED #xA0 #x80 #xED #xB0 #x80))
+    ;; (#\u+D800 #\u+DFFF #u8(#xED #xA0 #x80 #xED #xBF #xBF))
+    ;; (#\u+DB7F #\u+DC00 #u8(#xED #xAD #xBF #xED #xB0 #x80))
+    ;; (#\u+DB7F #\u+DFFF #u8(#xED #xAD #xBF #xED #xBF #xBF))
+    ;; (#\u+DB80 #\u+DC00 #u8(#xED #xAE #x80 #xED #xB0 #x80))
+    ;; (#\u+DB80 #\u+DFFF #u8(#xED #xAE #x80 #xED #xBF #xBF))
+    ;; (#\u+DBFF #\u+DC00 #u8(#xED #xAF #xBF #xED #xB0 #x80))
+    ;; (#\u+DBFF #\u+DFFF #u8(#xED #xAF #xBF #xED #xBF #xBF))
+
+    ;; 5.3 Noncharacter code positions
+    ;; Particularly problematic noncharacters in 16-bit applications:
+    #u8(#xEF #xBF #xBE)                 ; #\u+FFFE
+    #u8(#xEF #xBF #xBF)                 ; #\u+FFFF
+    ))
+\ No newline at end of file
author	Chris Hanson <org/chris-hanson/cph>
	Fri, 20 Jan 2017 09:46:10 +0000 (01:46 -0800)
committer	Chris Hanson <org/chris-hanson/cph>
	Fri, 20 Jan 2017 09:46:10 +0000 (01:46 -0800)