From 1c3cfd2bb4f4da23eff585d8c5f02be40ee0b878 Mon Sep 17 00:00:00 2001 From: Chris Hanson Date: Thu, 9 Feb 2017 22:14:53 -0800 Subject: [PATCH] Add metadata to all of the XML properties. --- src/etc/ucd-converter.scm | 63 ++- src/etc/ucd-raw-props/names.scm | 676 +++++++++++++++++++++++++++----- 2 files changed, 627 insertions(+), 112 deletions(-) diff --git a/src/etc/ucd-converter.scm b/src/etc/ucd-converter.scm index 074449edd..b3cfa2887 100644 --- a/src/etc/ucd-converter.scm +++ b/src/etc/ucd-converter.scm @@ -56,18 +56,57 @@ USA. (define (raw-file-name name) (merge-pathnames (ustring-append name ".scm") raw-directory)) -(define ucd-version-file-name - (raw-file-name "version")) - -(define (prop-file-name prop-name) - (raw-file-name (ustring-append "prop-" prop-name))) +(define (read-ucd-property-metadata) + (let ((properties (read-file (raw-file-name "names")))) + (for-each (lambda (metadata) + (if (not (and (list? metadata) + (= 3 (length metadata)) + (string? (car metadata)) + (symbol? (cadr metadata)) + (property-type? (caddr metadata)))) + (error "Ill-formed property metadata record:" metadata))) + properties) + properties)) + +(define (property-type? object) + (or (simple-property-type? object) + (and (pair? object) + (symbol? (car object)) + (list? (cdr object)) + (case (car object) + ((enum) + (or (every string? (cdr object)) + (every (lambda (elt) + (and (pair? elt) + (string? (car elt)) + (or (boolean? (cdr elt)) + (symbol? (cdr elt))))) + (cdr object)))) + ((regex) + (and (= 2 (length object)) + (string? (cadr object)))) + ((or) + (and (= 3 (length object)) + (string? (cadr object)) + (simple-property-type? (caddr object)))) + (else #f))))) + +(define (simple-property-type? 
object) + (case object + ((boolean byte code-point code-point* code-point+ exact-rational + list-of-script string) + #t) + (else #f))) + +(define ucd-property-metadata + (read-ucd-property-metadata)) (define all-ucd-prop-names - (read-file (raw-file-name "names"))) - + (map car ucd-property-metadata)) + (define (write-standard-property-files document) (let ((ucd-version (ucd-description document))) - (call-with-output-file ucd-version-file-name + (call-with-output-file (ucd-version-file-name) (lambda (port) (write-line ucd-version port))) (for-each (lambda (prop-name) @@ -111,10 +150,16 @@ USA. (newline port)) (define (read-ucd-version-file) - (car (read-file ucd-version-file-name))) + (car (read-file (ucd-version-file-name)))) (define (read-prop-file prop-name) (read-file (prop-file-name prop-name))) + +(define (ucd-version-file-name) + (raw-file-name "version")) + +(define (prop-file-name prop-name) + (raw-file-name (ustring-append "prop-" prop-name))) ;;;; UCD property extraction diff --git a/src/etc/ucd-raw-props/names.scm b/src/etc/ucd-raw-props/names.scm index 2b9718df8..706644649 100644 --- a/src/etc/ucd-raw-props/names.scm +++ b/src/etc/ucd-raw-props/names.scm @@ -1,103 +1,573 @@ -"AHex" -"Alpha" -"Bidi_C" -"Bidi_M" -"CE" -"CI" -"CWCF" -"CWCM" -"CWKCF" -"CWL" -"CWT" -"CWU" -"Cased" -"Comp_Ex" -"DI" -"Dash" -"Dep" -"Dia" -"Ext" -"FC_NFKC" -"GCB" -"Gr_Base" -"Gr_Ext" -"Gr_Link" -"Hex" -"Hyphen" -"IDC" -"IDS" -"IDSB" -"IDST" -"Ideo" -"InMC" -"InPC" -"InSC" -"JSN" -"Join_C" -"LOE" -"Lower" -"Math" -"NChar" -"NFC_QC" -"NFD_QC" -"NFKC_CF" -"NFKC_QC" -"NFKD_QC" -"OAlpha" -"ODI" -"OGr_Ext" -"OIDC" -"OIDS" -"OLower" -"OMath" -"OUpper" -"PCM" -"Pat_Syn" -"Pat_WS" -"QMark" -"Radical" -"SB" -"SD" -"STerm" -"Term" -"UIdeo" -"Upper" -"VS" -"WB" -"WSpace" -"XIDC" -"XIDS" -"XO_NFC" -"XO_NFD" -"XO_NFKC" -"XO_NFKD" -"age" -"bc" -"blk" -"bmg" -"bpb" -"bpt" -"ccc" -"cf" -"dm" -"dt" -"ea" -"gc" -"hst" -"isc" -"jg" -"jt" -"lb" -"lc" -"na" -"na1" -"nt" -"nv" -"sc" -"scf" -"scx" 
-"slc" -"stc" -"suc" -"tc" -"uc" +("AHex" ascii-hex-digit boolean) +("Alpha" alphabetic boolean) +("Bidi_C" bidi-control boolean) +("Bidi_M" bidi-mirrored boolean) +("CE" composition-exclusion boolean) +("CI" case-ignorable boolean) +("CWCF" changes-when-case-folded boolean) +("CWCM" changes-when-case-mapped boolean) +("CWKCF" changes-when-nfkc-case-folded boolean) +("CWL" changes-when-lower-cased boolean) +("CWT" changes-when-title-cased boolean) +("CWU" changes-when-upper-cased boolean) +("Cased" cased boolean) +("Comp_Ex" full-composition-exclusion boolean) +("DI" default-ignorable-code-point boolean) +("Dash" dash boolean) +("Dep" deprecated boolean) +("Dia" diacritic boolean) +("Ext" extender boolean) +("FC_NFKC" fc-nfkc-closure (or "#" code-point+)) +("GCB" grapheme-cluster-break + (enum "CN" "CR" "EB" "EBG" "EM" "EX" "GAZ" "L" "LF" "LV" "LVT" + "PP" "RI" "SM" "T" "V" "XX" "ZWJ")) +("Gr_Base" grapheme-base boolean) +("Gr_Ext" grapheme-extend boolean) +("Gr_Link" grapheme-link boolean) +("Hex" hex-digit boolean) +("Hyphen" hyphen boolean) +("IDC" id-continue boolean) +("IDS" id-start boolean) +("IDSB" ids-binary-operator boolean) +("IDST" ids-trinary-operator boolean) +("Ideo" ideographic boolean) +("InMC" indic-matra-category + (enum "Right" + "Left" + "Visual_Order_Left" + "Left_And_Right" + "Top" + "Bottom" + "Top_And_Bottom" + "Top_And_Right" + "Top_And_Left" + "Top_And_Left_And_Right" + "Bottom_And_Right" + "Top_And_Bottom_And_Right" + "Overstruck" + "Invisible" + "NA")) +("InPC" indic-positional-category + (enum "Bottom" + "Bottom_And_Right" + "Left" + "Left_And_Right" + "NA" + "Overstruck" + "Right" + "Top" + "Top_And_Bottom" + "Top_And_Bottom_And_Right" + "Top_And_Left" + "Top_And_Left_And_Right" + "Top_And_Right" + "Visual_Order_Left")) +("InSC" indic-syllabic-category + (enum "Avagraha" + "Bindu" + "Brahmi_Joining_Number" + "Cantillation_Mark" + "Consonant" + "Consonant_Dead" + "Consonant_Final" + "Consonant_Head_Letter" + "Consonant_Killer" + 
"Consonant_Medial" + "Consonant_Placeholder" + "Consonant_Preceding_Repha" + "Consonant_Prefixed" + "Consonant_Repha" + "Consonant_Subjoined" + "Consonant_Succeeding_Repha" + "Consonant_With_Stacker" + "Gemination_Mark" + "Invisible_Stacker" + "Joiner" + "Modifying_Letter" + "Non_Joiner" + "Nukta" + "Number" + "Number_Joiner" + "Other" + "Pure_Killer" + "Register_Shifter" + "Syllable_Modifier" + "Tone_Letter" + "Tone_Mark" + "Virama" + "Visarga" + "Vowel" + "Vowel_Dependent" + "Vowel_Independent")) +("JSN" jamo-short-name (regex "[A-Z]{0,3}")) +("Join_C" join-control boolean) +("LOE" logical-order-exception boolean) +("Lower" lower-case boolean) +("Math" math boolean) +("NChar" noncharacter-code-point boolean) +("NFC_QC" nfc-quick-check (enum "Y" "N" "M")) +("NFD_QC" nfd-quick-check (enum "Y" "N")) +("NFKC_CF" nfkc-case-fold (or "#" code-point*)) +("NFKC_QC" nfkc-quick-check (enum "Y" "N" "M")) +("NFKD_QC" nfkd-quick-check (enum "Y" "N")) +("OAlpha" other-alphabetic boolean) +("ODI" other-default-ignorable-code-point boolean) +("OGr_Ext" other-grapheme-extend boolean) +("OIDC" other-id-continue boolean) +("OIDS" other-id-start boolean) +("OLower" other-lower-case boolean) +("OMath" other-math boolean) +("OUpper" other-upper-case boolean) +("PCM" prepended-concatenation-mark boolean) +("Pat_Syn" pattern-syntax boolean) +("Pat_WS" pattern-white-space boolean) +("QMark" quotation-mark boolean) +("Radical" radical boolean) +("SB" sentence-break + (enum "AT" "CL" "CR" "EX" "FO" "LE" "LF" "LO" "NU" "SC" "SE" + "SP" "ST" "UP" "XX")) +("SD" soft-dotted boolean) +("STerm" sentence-terminal boolean) +("Term" terminal-punctuation boolean) +("UIdeo" unified-ideograph boolean) +("Upper" upper-case boolean) +("VS" variation-selector boolean) +("WB" word-break + (enum "CR" "DQ" "EB" "EBG" "EM" "EX" "Extend" "FO" "GAZ" "HL" "KA" "LE" "LF" + "MB" "ML" "MN" "NL" "NU" "RI" "SQ" "XX" "ZWJ")) +("WSpace" white-space boolean) +("XIDC" xid-continue boolean) +("XIDS" xid-start boolean)
+("XO_NFC" expands-on-nfc boolean) +("XO_NFD" expands-on-nfd boolean) +("XO_NFKC" expands-on-nfkc boolean) +("XO_NFKD" expands-on-nfkd boolean) +("age" age + (enum "1.1" "2.0" "2.1" "3.0" "3.1" "3.2" "4.0" "4.1" "5.0" "5.1" "5.2" + "6.0" "6.1" "6.2" "6.3" "7.0" "8.0" "9.0" "unassigned")) +("bc" bidirectional-class + (enum "AL" "AN" "B" "BN" "CS" "EN" "ES" "ET" "FSI" "L" "LRE" "LRI" "LRO" + "NSM" "ON" "PDF" "PDI" "R" "RLE" "RLI" "RLO" "S" "WS")) +("blk" block + (enum "Adlam" + "Aegean_Numbers" + "Ahom" + "Alchemical" + "Alphabetic_PF" + "Anatolian_Hieroglyphs" + "Ancient_Greek_Music" + "Ancient_Greek_Numbers" + "Ancient_Symbols" + "Arabic" + "Arabic_Ext_A" + "Arabic_Math" + "Arabic_PF_A" + "Arabic_PF_B" + "Arabic_Sup" + "Armenian" + "Arrows" + "ASCII" + "Avestan" + "Balinese" + "Bamum" + "Bamum_Sup" + "Bassa_Vah" + "Batak" + "Bengali" + "Bhaiksuki" + "Block_Elements" + "Bopomofo" + "Bopomofo_Ext" + "Box_Drawing" + "Brahmi" + "Braille" + "Buginese" + "Buhid" + "Byzantine_Music" + "Carian" + "Caucasian_Albanian" + "Chakma" + "Cham" + "Cherokee" + "Cherokee_Sup" + "CJK" + "CJK_Compat" + "CJK_Compat_Forms" + "CJK_Compat_Ideographs" + "CJK_Compat_Ideographs_Sup" + "CJK_Ext_A" + "CJK_Ext_B" + "CJK_Ext_C" + "CJK_Ext_D" + "CJK_Ext_E" + "CJK_Radicals_Sup" + "CJK_Strokes" + "CJK_Symbols" + "Compat_Jamo" + "Control_Pictures" + "Coptic" + "Coptic_Epact_Numbers" + "Counting_Rod" + "Cuneiform" + "Cuneiform_Numbers" + "Currency_Symbols" + "Cypriot_Syllabary" + "Cyrillic" + "Cyrillic_Ext_A" + "Cyrillic_Ext_B" + "Cyrillic_Ext_C" + "Cyrillic_Sup" + "Deseret" + "Devanagari" + "Devanagari_Ext" + "Diacriticals" + "Diacriticals_For_Symbols" + "Diacriticals_Sup" + "Diacriticals_Ext" + "Dingbats" + "Domino" + "Duployan" + "Early_Dynastic_Cuneiform" + "Egyptian_Hieroglyphs" + "Elbasan" + "Emoticons" + "Enclosed_Alphanum" + "Enclosed_Alphanum_Sup" + "Enclosed_CJK" + "Enclosed_Ideographic_Sup" + "Ethiopic" + "Ethiopic_Ext" + "Ethiopic_Ext_A" + "Ethiopic_Sup" + "Geometric_Shapes" +
"Geometric_Shapes_Ext" + "Georgian" + "Georgian_Sup" + "Glagolitic" + "Glagolitic_Sup" + "Gothic" + "Grantha" + "Greek" + "Greek_Ext" + "Gujarati" + "Gurmukhi" + "Half_And_Full_Forms" + "Half_Marks" + "Hangul" + "Hanunoo" + "Hatran" + "Hebrew" + "High_PU_Surrogates" + "High_Surrogates" + "Hiragana" + "IDC" + "Ideographic_Symbols" + "Imperial_Aramaic" + "Indic_Number_Forms" + "Inscriptional_Pahlavi" + "Inscriptional_Parthian" + "IPA_Ext" + "Jamo" + "Jamo_Ext_A" + "Jamo_Ext_B" + "Javanese" + "Kaithi" + "Kana_Sup" + "Kanbun" + "Kangxi" + "Kannada" + "Katakana" + "Katakana_Ext" + "Kayah_Li" + "Kharoshthi" + "Khmer" + "Khmer_Symbols" + "Khojki" + "Khudawadi" + "Lao" + "Latin_1_Sup" + "Latin_Ext_A" + "Latin_Ext_Additional" + "Latin_Ext_B" + "Latin_Ext_C" + "Latin_Ext_D" + "Latin_Ext_E" + "Lepcha" + "Letterlike_Symbols" + "Limbu" + "Linear_A" + "Linear_B_Ideograms" + "Linear_B_Syllabary" + "Lisu" + "Low_Surrogates" + "Lycian" + "Lydian" + "Mahajani" + "Mahjong" + "Malayalam" + "Mandaic" + "Manichaean" + "Marchen" + "Math_Alphanum" + "Math_Operators" + "Meetei_Mayek" + "Meetei_Mayek_Ext" + "Mende_Kikakui" + "Meroitic_Cursive" + "Meroitic_Hieroglyphs" + "Miao" + "Misc_Arrows" + "Misc_Math_Symbols_A" + "Misc_Math_Symbols_B" + "Misc_Pictographs" + "Misc_Symbols" + "Misc_Technical" + "Modi" + "Modifier_Letters" + "Modifier_Tone_Letters" + "Mongolian" + "Mongolian_Sup" + "Mro" + "Music" + "Multani" + "Myanmar" + "Myanmar_Ext_A" + "Myanmar_Ext_B" + "Nabataean" + "NB" + "New_Tai_Lue" + "Newa" + "NKo" + "Number_Forms" + "OCR" + "Ogham" + "Ol_Chiki" + "Old_Hungarian" + "Old_Italic" + "Old_North_Arabian" + "Old_Permic" + "Old_Persian" + "Old_South_Arabian" + "Old_Turkic" + "Oriya" + "Ornamental_Dingbats" + "Osage" + "Osmanya" + "Pahawh_Hmong" + "Palmyrene" + "Pau_Cin_Hau" + "Phags_Pa" + "Phaistos" + "Phoenician" + "Phonetic_Ext" + "Phonetic_Ext_Sup" + "Playing_Cards" + "Psalter_Pahlavi" + "PUA" + "Punctuation" + "Rejang" + "Rumi" + "Runic" + "Samaritan" + "Saurashtra" + "Sharada" + 
"Shavian" + "Shorthand_Format_Controls" + "Siddham" + "Sinhala" + "Sinhala_Archaic_Numbers" + "Small_Forms" + "Sora_Sompeng" + "Specials" + "Sundanese" + "Sundanese_Sup" + "Sup_Arrows_A" + "Sup_Arrows_B" + "Sup_Arrows_C" + "Sup_Math_Operators" + "Sup_PUA_A" + "Sup_PUA_B" + "Sup_Punctuation" + "Sup_Symbols_And_Pictographs" + "Super_And_Sub" + "Sutton_SignWriting" + "Syloti_Nagri" + "Syriac" + "Tagalog" + "Tagbanwa" + "Tags" + "Tai_Le" + "Tai_Tham" + "Tai_Viet" + "Tai_Xuan_Jing" + "Takri" + "Tamil" + "Tangut" + "Tangut_Components" + "Telugu" + "Thaana" + "Thai" + "Tibetan" + "Tifinagh" + "Tirhuta" + "Transport_And_Map" + "UCAS" + "UCAS_Ext" + "Ugaritic" + "Vai" + "Vedic_Ext" + "Vertical_Forms" + "VS" + "VS_Sup" + "Warang_Citi" + "Yi_Radicals" + "Yi_Syllables" + "Yijing")) +("bmg" mirror-image (or "" code-point)) +("bpb" bidi-paired-bracket (or "#" code-point)) +("bpt" bidi-paired-bracket-type (enum "o" "c" "n")) +("ccc" combining-class byte) ;<= 254 +("cf" case-folding (or "#" code-point+)) +("dm" decomposition-mapping (or "#" code-point*)) +("dt" decomposition-type + (enum "can" "com" "enc" "fin" "font" "fra" "init" "iso" "med" + "nar" "nb" "sml" "sqr" "sub" "sup" "vert" "wide" "none")) +("ea" east-asian-width (enum "A" "F" "H" "N" "Na" "W")) +("gc" general-category + (enum ("Lu" . letter:uppercase) + ("Ll" . letter:lowercase) + ("Lt" . letter:titlecase) + ("Lm" . letter:modifier) + ("Lo" . letter:other) + ("Mn" . mark:nonspacing) + ("Mc" . mark:spacing-combining) + ("Me" . mark:enclosing) + ("Nd" . number:decimal-digit) + ("Nl" . number:letter) + ("No" . number:other) + ("Pc" . punctuation:connector) + ("Pd" . punctuation:dash) + ("Ps" . punctuation:open) + ("Pe" . punctuation:close) + ("Pi" . punctuation:initial-quote) + ("Pf" . punctuation:final-quote) + ("Po" . punctuation:other) + ("Sm" . symbol:math) + ("Sc" . symbol:currency) + ("Sk" . symbol:modifier) + ("So" . symbol:other) + ("Zs" . separator:space) + ("Zl" . separator:line) + ("Zp" . 
separator:paragraph) + ("Cc" . other:control) + ("Cf" . other:format) + ("Cs" . other:surrogate) + ("Co" . other:private-use) + ("Cn" . other:not-assigned))) +("hst" hangul-syllable-type (enum "L" "LV" "LVT" "T" "V" "NA")) +("isc" iso-10646-comment string) +("jg" joining-group + (enum "African_Feh" "African_Noon" "African_Qaf" + "Ain" "Alaph" "Alef" "Alef_Maqsurah" + "Beh" "Beth" "Burushaski_Yeh_Barree" + "Dal" "Dalath_Rish" "E" + "Farsi_Yeh" "Fe" "Feh" "Final_Semkath" + "Gaf" "Gamal" + "Hah" "Hamza_On_Heh_Goal" "He" + "Heh" "Heh_Goal" "Heth" + "Kaf" "Kaph" "Khaph" "Knotted_Heh" + "Lam" "Lamadh" + "Manichaean_Aleph" + "Manichaean_Ayin" + "Manichaean_Beth" + "Manichaean_Daleth" + "Manichaean_Dhamedh" + "Manichaean_Five" + "Manichaean_Gimel" + "Manichaean_Heth" + "Manichaean_Hundred" + "Manichaean_Kaph" + "Manichaean_Lamedh" + "Manichaean_Mem" + "Manichaean_Nun" + "Manichaean_One" + "Manichaean_Pe" + "Manichaean_Qoph" + "Manichaean_Resh" + "Manichaean_Sadhe" + "Manichaean_Samekh" + "Manichaean_Taw" + "Manichaean_Ten" + "Manichaean_Teth" + "Manichaean_Thamedh" + "Manichaean_Twenty" + "Manichaean_Waw" + "Manichaean_Yodh" + "Manichaean_Zayin" + "Meem" "Mim" + "No_Joining_Group" "Noon" "Nun" "Nya" + "Pe" "Qaf" "Qaph" "Reh" "Reversed_Pe" + "Rohingya_Yeh" + "Sad" "Sadhe" "Seen" "Semkath" "Shin" + "Straight_Waw" + "Swash_Kaf" "Syriac_Waw" "Tah" "Taw" + "Teh_Marbuta" "Teh_Marbuta_Goal" "Teth" "Waw" "Yeh" + "Yeh_Barree" "Yeh_With_Tail" "Yudh" + "Yudh_He" "Zain" "Zhain")) +("jt" joining-class (enum "U" "C" "T" "D" "L" "R")) +("lb" line-break + (enum "AI" "AL" "B2" "BA" "BB" "BK" "CB" "CJ" "CL" "CM" "CP" "CR" "EB" + "EM" "EX" "GL" "H2" "H3" "HL" "HY" "ID" "IN" "IS" "JL" "JT" "JV" + "LF" "NL" "NS" "NU" "OP" "PO" "PR" "QU" "RI" "SA" "SG" "SP" "SY" + "WJ" "XX" "ZW" "ZWJ")) +("lc" lower-case (or "#" code-point+)) +("na" name string) +("na1" name string) +("nt" numeric-type + (enum ("None" . #f) + ("De" . decimal) + ("Di" . digit) + ("Nu" . 
numeric))) +("nv" numeric-value (or "NaN" exact-rational)) +("sc" script + (enum "Adlm" "Aghb" "Ahom" "Arab" "Armi" "Armn" "Avst" + "Bali" "Bamu" "Bass" "Batk" "Beng" "Bhks" + "Bopo" "Brah" "Brai" "Bugi" "Buhd" + "Cakm" "Cans" "Cari" "Cham" "Cher" "Copt" "Cprt" + "Cyrl" + "Deva" "Dsrt" "Dupl" + "Elba" "Egyp" "Ethi" + "Geor" "Glag" "Goth" "Gran" "Grek" "Gujr" "Guru" + "Hang" "Hani" "Hano" "Hatr" "Hebr" "Hira" "Hluw" + "Hmng" "Hrkt" "Hung" + "Ital" + "Java" + "Kali" "Kana" "Khar" "Khmr" "Khoj" "Knda" "Kthi" + "Lana" "Laoo" "Latn" "Lepc" "Limb" "Lina" "Linb" + "Lisu" "Lyci" "Lydi" + "Mahj" "Mand" "Mani" "Marc" + "Mend" "Merc" "Mero" "Mlym" + "Modi" "Mong" "Mroo" "Mtei" "Mult" "Mymr" + "Narb" "Nbat" "Newa" "Nkoo" + "Ogam" "Olck" "Orkh" "Orya" "Osge" "Osma" + "Palm" "Pauc" "Perm" "Phag" "Phli" "Phlp" "Phnx" + "Plrd" "Prti" + "Qaai" + "Rjng" "Runr" + "Samr" "Sarb" "Saur" "Sgnw" "Shaw" "Shrd" "Sidd" + "Sind" "Sinh" "Sora" "Sund" "Sylo" "Syrc" + "Tagb" "Takr" "Tale" "Talu" "Taml" "Tang" "Tavt" + "Telu" "Tfng" "Tglg" "Thaa" "Thai" "Tibt" "Tirh" + "Ugar" + "Vaii" + "Wara" + "Xpeo" "Xsux" + "Yiii" + "Zinh" "Zyyy" "Zzzz")) +("scf" simple-case-folding (or "#" code-point)) +("scx" script-extension list-of-script) +("slc" simple-lower-case (or "#" code-point)) +("stc" simple-title-case (or "#" code-point)) +("suc" simple-upper-case (or "#" code-point)) +("tc" title-case (or "#" code-point+)) +("uc" upper-case (or "#" code-point+)) -- 2.25.1