diff --git a/src/etc/unicode.py b/src/etc/unicode.py index 48c14c0cd710..0f6e1c4c6062 100755 --- a/src/etc/unicode.py +++ b/src/etc/unicode.py @@ -26,11 +26,15 @@ def fetch(f): def load_unicode_data(f): fetch(f) gencats = {} + combines = [] canon_decomp = {} compat_decomp = {} curr_cat = "" + curr_combine = "" c_lo = 0 c_hi = 0 + com_lo = 0 + com_hi = 0 for line in fileinput.input(f): fields = line.split(";") if len(fields) != 15: @@ -69,7 +73,21 @@ def load_unicode_data(f): c_lo = code c_hi = code - return (canon_decomp, compat_decomp, gencats) + if curr_combine == "": + curr_combine = combine + com_lo = code + com_hi = code + + if curr_combine == combine: + com_hi = code + else: + if curr_combine != "0": + combines.append((com_lo, com_hi, curr_combine)) + curr_combine = combine + com_lo = code + com_hi = code + + return (canon_decomp, compat_decomp, gencats, combines) def load_derived_core_properties(f): @@ -193,7 +211,7 @@ def format_table_content(f, content, indent): line = " "*indent + chunk f.write(line) -def emit_decomp_module(f, canon, compat): +def emit_decomp_module(f, canon, compat, combine): canon_keys = canon.keys() canon_keys.sort() @@ -217,8 +235,26 @@ def emit_decomp_module(f, canon, compat): } None => None } + }\n +""") + + f.write(""" + fn bsearch_range_value_table(c: char, r: &'static [(char, char, u8)]) -> u8 { + use cmp::{Equal, Less, Greater}; + match r.bsearch(|&(lo, hi, _)| { + if lo <= c && c <= hi { Equal } + else if hi < c { Less } + else { Greater } + }) { + Some(idx) => { + let (_, _, result) = r[idx]; + result + } + None => 0 + } }\n\n """) + f.write(" // Canonical decompositions\n") f.write(" static canonical_table : &'static [(char, &'static [char])] = &[\n") data = "" @@ -237,6 +273,7 @@ def emit_decomp_module(f, canon, compat): data += "])" format_table_content(f, data, 8) f.write("\n ];\n\n") + f.write(" // Compatibility decompositions\n") f.write(" static compatibility_table : &'static [(char, &'static [char])] = &[\n") data = "" @@ -255,10 +292,22 @@ def emit_decomp_module(f, canon, compat): data += "])" format_table_content(f, data, 8) f.write("\n ];\n\n") + + f.write(" static combining_class_table : &'static [(char, char, u8)] = &[\n") + ix = 0 + for pair in combine: + f.write(ch_prefix(ix)) + f.write("(%s, %s, %s)" % (escape_char(pair[0]), escape_char(pair[1]), pair[2])) + ix += 1 + f.write("\n ];\n") + f.write(" pub fn canonical(c: char, i: &fn(char)) " + "{ d(c, i, false); }\n\n") f.write(" pub fn compatibility(c: char, i: &fn(char)) " +"{ d(c, i, true); }\n\n") + f.write(" pub fn canonical_combining_class(c: char) -> u8 {\n" + + " bsearch_range_value_table(c, combining_class_table)\n" + + " }\n\n") f.write(" fn d(c: char, i: &fn(char), k: bool) {\n") f.write(" use iterator::Iterator;\n"); @@ -302,7 +351,7 @@ for i in [r]: os.remove(i); rf = open(r, "w") -(canon_decomp, compat_decomp, gencats) = load_unicode_data("UnicodeData.txt") +(canon_decomp, compat_decomp, gencats, combines) = load_unicode_data("UnicodeData.txt") # Preamble rf.write('''// Copyright 2012-2013 The Rust Project Developers. See the COPYRIGHT @@ -324,7 +373,7 @@ rf.write('''// Copyright 2012-2013 The Rust Project Developers. See the COPYRIGH emit_property_module(rf, "general_category", gencats) -emit_decomp_module(rf, canon_decomp, compat_decomp) +emit_decomp_module(rf, canon_decomp, compat_decomp, combines) derived = load_derived_core_properties("DerivedCoreProperties.txt") emit_property_module(rf, "derived_property", derived) diff --git a/src/libstd/unicode.rs b/src/libstd/unicode.rs index 587b5a79838b..6d763b58cd12 100644 --- a/src/libstd/unicode.rs +++ b/src/libstd/unicode.rs @@ -1469,6 +1469,22 @@ pub mod decompose { } + fn bsearch_range_value_table(c: char, r: &'static [(char, char, u8)]) -> u8 { + use cmp::{Equal, Less, Greater}; + match r.bsearch(|&(lo, hi, _)| { + if lo <= c && c <= hi { Equal } + else if hi < c { Less } + else { Greater } + }) { + Some(idx) => { + let (_, _, result) = r[idx]; + result + } + None => 0 + } + } + + // Canonical decompositions static canonical_table : &'static [(char, &'static [char])] = &[ ('\xc0', &['\x41', '\u0300']), ('\xc1', &['\x41', '\u0301']), ('\xc2', &['\x41', '\u0302']), @@ -3465,10 +3481,160 @@ pub mod decompose { &['\u53ef']) ]; + static combining_class_table : &'static [(char, char, u8)] = &[ + ('\u0300', '\u0314', 230), ('\u0315', '\u0315', 232), + ('\u0316', '\u0319', 220), ('\u031a', '\u031a', 232), + ('\u031b', '\u031b', 216), ('\u031c', '\u0320', 220), + ('\u0321', '\u0322', 202), ('\u0323', '\u0326', 220), + ('\u0327', '\u0328', 202), ('\u0329', '\u0333', 220), + ('\u0334', '\u0338', 1), ('\u0339', '\u033c', 220), + ('\u033d', '\u0344', 230), ('\u0345', '\u0345', 240), + ('\u0346', '\u0346', 230), ('\u0347', '\u0349', 220), + ('\u034a', '\u034c', 230), ('\u034d', '\u034e', 220), + ('\u0350', '\u0352', 230), ('\u0353', '\u0356', 220), + ('\u0357', '\u0357', 230), ('\u0358', '\u0358', 232), + ('\u0359', '\u035a', 220), ('\u035b', '\u035b', 230), + ('\u035c', '\u035c', 233), ('\u035d', '\u035e', 234), + ('\u035f', '\u035f', 233), ('\u0360', '\u0361', 234), + ('\u0362', '\u0362', 233), ('\u0363', '\u036f', 230), + ('\u0483', '\u0487', 230), ('\u0591', '\u0591', 220), + ('\u0592', '\u0595', 230), ('\u0596', '\u0596', 220), + ('\u0597', '\u0599', 230), ('\u059a', '\u059a', 222), + ('\u059b', '\u059b', 220), ('\u059c', '\u05a1', 230), + ('\u05a2', '\u05a7', 220), ('\u05a8', '\u05a9', 230), + ('\u05aa', '\u05aa', 220), ('\u05ab', '\u05ac', 230), + ('\u05ad', '\u05ad', 222), ('\u05ae', '\u05ae', 228), + ('\u05af', '\u05af', 230), ('\u05b0', '\u05b0', 10), + ('\u05b1', '\u05b1', 11), ('\u05b2', '\u05b2', 12), + ('\u05b3', '\u05b3', 13), ('\u05b4', '\u05b4', 14), + ('\u05b5', '\u05b5', 15), ('\u05b6', '\u05b6', 16), + ('\u05b7', '\u05b7', 17), ('\u05b8', '\u05b8', 18), + ('\u05b9', '\u05ba', 19), ('\u05bb', '\u05bb', 20), + ('\u05bc', '\u05bc', 21), ('\u05bd', '\u05bd', 22), + ('\u05bf', '\u05bf', 23), ('\u05c1', '\u05c1', 24), + ('\u05c2', '\u05c2', 25), ('\u05c4', '\u05c4', 230), + ('\u05c5', '\u05c5', 220), ('\u05c7', '\u05c7', 18), + ('\u0610', '\u0617', 230), ('\u0618', '\u0618', 30), + ('\u0619', '\u0619', 31), ('\u061a', '\u061a', 32), + ('\u064b', '\u064b', 27), ('\u064c', '\u064c', 28), + ('\u064d', '\u064d', 29), ('\u064e', '\u064e', 30), + ('\u064f', '\u064f', 31), ('\u0650', '\u0650', 32), + ('\u0651', '\u0651', 33), ('\u0652', '\u0652', 34), + ('\u0653', '\u0654', 230), ('\u0655', '\u0656', 220), + ('\u0657', '\u065b', 230), ('\u065c', '\u065c', 220), + ('\u065d', '\u065e', 230), ('\u065f', '\u065f', 220), + ('\u0670', '\u0670', 35), ('\u06d6', '\u06dc', 230), + ('\u06df', '\u06e2', 230), ('\u06e3', '\u06e3', 220), + ('\u06e4', '\u06e4', 230), ('\u06e7', '\u06e8', 230), + ('\u06ea', '\u06ea', 220), ('\u06eb', '\u06ec', 230), + ('\u06ed', '\u06ed', 220), ('\u0711', '\u0711', 36), + ('\u0730', '\u0730', 230), ('\u0731', '\u0731', 220), + ('\u0732', '\u0733', 230), ('\u0734', '\u0734', 220), + ('\u0735', '\u0736', 230), ('\u0737', '\u0739', 220), + ('\u073a', '\u073a', 230), ('\u073b', '\u073c', 220), + ('\u073d', '\u073d', 230), ('\u073e', '\u073e', 220), + ('\u073f', '\u0741', 230), ('\u0742', '\u0742', 220), + ('\u0743', '\u0743', 230), ('\u0744', '\u0744', 220), + ('\u0745', '\u0745', 230), ('\u0746', '\u0746', 220), + ('\u0747', '\u0747', 230), ('\u0748', '\u0748', 220), + ('\u0749', '\u074a', 230), ('\u07eb', '\u07f1', 230), + ('\u07f2', '\u07f2', 220), ('\u07f3', '\u07f3', 230), + ('\u0816', '\u0819', 230), ('\u081b', '\u0823', 230), + ('\u0825', '\u0827', 230), ('\u0829', '\u082d', 230), + ('\u0859', '\u085b', 220), ('\u08e4', '\u08e5', 230), + ('\u08e6', '\u08e6', 220), ('\u08e7', '\u08e8', 230), + ('\u08e9', '\u08e9', 220), ('\u08ea', '\u08ec', 230), + ('\u08ed', '\u08ef', 220), ('\u08f0', '\u08f0', 27), + ('\u08f1', '\u08f1', 28), ('\u08f2', '\u08f2', 29), + ('\u08f3', '\u08f5', 230), ('\u08f6', '\u08f6', 220), + ('\u08f7', '\u08f8', 230), ('\u08f9', '\u08fa', 220), + ('\u08fb', '\u08fe', 230), ('\u093c', '\u093c', 7), + ('\u094d', '\u094d', 9), ('\u0951', '\u0951', 230), + ('\u0952', '\u0952', 220), ('\u0953', '\u0954', 230), + ('\u09bc', '\u09bc', 7), ('\u09cd', '\u09cd', 9), + ('\u0a3c', '\u0a3c', 7), ('\u0a4d', '\u0a4d', 9), + ('\u0abc', '\u0abc', 7), ('\u0acd', '\u0acd', 9), + ('\u0b3c', '\u0b3c', 7), ('\u0b4d', '\u0b4d', 9), + ('\u0bcd', '\u0bcd', 9), ('\u0c4d', '\u0c4d', 9), + ('\u0c55', '\u0c55', 84), ('\u0c56', '\u0c56', 91), + ('\u0cbc', '\u0cbc', 7), ('\u0ccd', '\u0ccd', 9), + ('\u0d4d', '\u0d4d', 9), ('\u0dca', '\u0dca', 9), + ('\u0e38', '\u0e39', 103), ('\u0e3a', '\u0e3a', 9), + ('\u0e48', '\u0e4b', 107), ('\u0eb8', '\u0eb9', 118), + ('\u0ec8', '\u0ecb', 122), ('\u0f18', '\u0f19', 220), + ('\u0f35', '\u0f35', 220), ('\u0f37', '\u0f37', 220), + ('\u0f39', '\u0f39', 216), ('\u0f71', '\u0f71', 129), + ('\u0f72', '\u0f72', 130), ('\u0f74', '\u0f74', 132), + ('\u0f7a', '\u0f7d', 130), ('\u0f80', '\u0f80', 130), + ('\u0f82', '\u0f83', 230), ('\u0f84', '\u0f84', 9), + ('\u0f86', '\u0f87', 230), ('\u0fc6', '\u0fc6', 220), + ('\u1037', '\u1037', 7), ('\u1039', '\u103a', 9), + ('\u108d', '\u108d', 220), ('\u135d', '\u135f', 230), + ('\u1714', '\u1714', 9), ('\u1734', '\u1734', 9), + ('\u17d2', '\u17d2', 9), ('\u17dd', '\u17dd', 230), + ('\u18a9', '\u18a9', 228), ('\u1939', '\u1939', 222), + ('\u193a', '\u193a', 230), ('\u193b', '\u193b', 220), + ('\u1a17', '\u1a17', 230), ('\u1a18', '\u1a18', 220), + ('\u1a60', '\u1a60', 9), ('\u1a75', '\u1a7c', 230), + ('\u1a7f', '\u1a7f', 220), ('\u1b34', '\u1b34', 7), + ('\u1b44', '\u1b44', 9), ('\u1b6b', '\u1b6b', 230), + ('\u1b6c', '\u1b6c', 220), ('\u1b6d', '\u1b73', 230), + ('\u1baa', '\u1bab', 9), ('\u1be6', '\u1be6', 7), + ('\u1bf2', '\u1bf3', 9), ('\u1c37', '\u1c37', 7), + ('\u1cd0', '\u1cd2', 230), ('\u1cd4', '\u1cd4', 1), + ('\u1cd5', '\u1cd9', 220), ('\u1cda', '\u1cdb', 230), + ('\u1cdc', '\u1cdf', 220), ('\u1ce0', '\u1ce0', 230), + ('\u1ce2', '\u1ce8', 1), ('\u1ced', '\u1ced', 220), + ('\u1cf4', '\u1cf4', 230), ('\u1dc0', '\u1dc1', 230), + ('\u1dc2', '\u1dc2', 220), ('\u1dc3', '\u1dc9', 230), + ('\u1dca', '\u1dca', 220), ('\u1dcb', '\u1dcc', 230), + ('\u1dcd', '\u1dcd', 234), ('\u1dce', '\u1dce', 214), + ('\u1dcf', '\u1dcf', 220), ('\u1dd0', '\u1dd0', 202), + ('\u1dd1', '\u1de6', 230), ('\u1dfc', '\u1dfc', 233), + ('\u1dfd', '\u1dfd', 220), ('\u1dfe', '\u1dfe', 230), + ('\u1dff', '\u1dff', 220), ('\u20d0', '\u20d1', 230), + ('\u20d2', '\u20d3', 1), ('\u20d4', '\u20d7', 230), + ('\u20d8', '\u20da', 1), ('\u20db', '\u20dc', 230), + ('\u20e1', '\u20e1', 230), ('\u20e5', '\u20e6', 1), + ('\u20e7', '\u20e7', 230), ('\u20e8', '\u20e8', 220), + ('\u20e9', '\u20e9', 230), ('\u20ea', '\u20eb', 1), + ('\u20ec', '\u20ef', 220), ('\u20f0', '\u20f0', 230), + ('\u2cef', '\u2cf1', 230), ('\u2d7f', '\u2d7f', 9), + ('\u2de0', '\u2dff', 230), ('\u302a', '\u302a', 218), + ('\u302b', '\u302b', 228), ('\u302c', '\u302c', 232), + ('\u302d', '\u302d', 222), ('\u302e', '\u302f', 224), + ('\u3099', '\u309a', 8), ('\ua66f', '\ua66f', 230), + ('\ua674', '\ua67d', 230), ('\ua69f', '\ua69f', 230), + ('\ua6f0', '\ua6f1', 230), ('\ua806', '\ua806', 9), + ('\ua8c4', '\ua8c4', 9), ('\ua8e0', '\ua8f1', 230), + ('\ua92b', '\ua92d', 220), ('\ua953', '\ua953', 9), + ('\ua9b3', '\ua9b3', 7), ('\ua9c0', '\ua9c0', 9), + ('\uaab0', '\uaab0', 230), ('\uaab2', '\uaab3', 230), + ('\uaab4', '\uaab4', 220), ('\uaab7', '\uaab8', 230), + ('\uaabe', '\uaabf', 230), ('\uaac1', '\uaac1', 230), + ('\uaaf6', '\uaaf6', 9), ('\uabed', '\uabed', 9), + ('\ufb1e', '\ufb1e', 26), ('\ufe20', '\ufe26', 230), + ('\U000101fd', '\U000101fd', 220), ('\U00010a0d', '\U00010a0d', 220), + ('\U00010a0f', '\U00010a0f', 230), ('\U00010a38', '\U00010a38', 230), + ('\U00010a39', '\U00010a39', 1), ('\U00010a3a', '\U00010a3a', 220), + ('\U00010a3f', '\U00010a3f', 9), ('\U00011046', '\U00011046', 9), + ('\U000110b9', '\U000110b9', 9), ('\U000110ba', '\U000110ba', 7), + ('\U00011100', '\U00011102', 230), ('\U00011133', '\U00011134', 9), + ('\U000111c0', '\U000111c0', 9), ('\U000116b6', '\U000116b6', 9), + ('\U000116b7', '\U000116b7', 7), ('\U0001d165', '\U0001d166', 216), + ('\U0001d167', '\U0001d169', 1), ('\U0001d16d', '\U0001d16d', 226), + ('\U0001d16e', '\U0001d172', 216), ('\U0001d17b', '\U0001d182', 220), + ('\U0001d185', '\U0001d189', 230), ('\U0001d18a', '\U0001d18b', 220), + ('\U0001d1aa', '\U0001d1ad', 230), ('\U0001d242', '\U0001d244', 230) + ]; pub fn canonical(c: char, i: &fn(char)) { d(c, i, false); } pub fn compatibility(c: char, i: &fn(char)) { d(c, i, true); } + pub fn canonical_combining_class(c: char) -> u8 { + bsearch_range_value_table(c, combining_class_table) + } + fn d(c: char, i: &fn(char), k: bool) { use iterator::Iterator; if c <= '\x7f' { i(c); return; }