From 8c54d5bf406fbfdbebd1a4553f430fca02b2c117 Mon Sep 17 00:00:00 2001 From: Florian Zeitz Date: Mon, 12 May 2014 22:25:38 +0200 Subject: [PATCH] core: Move Hangul decomposition into unicode.rs --- src/etc/unicode.py | 79 +++++++++++++++++++++++++++++++----------- src/libcore/char.rs | 54 ++++------------------------- src/libcore/unicode.rs | 48 +++++++++++++++++++++++-- 3 files changed, 111 insertions(+), 70 deletions(-) diff --git a/src/etc/unicode.py b/src/etc/unicode.py index e98c65ca50ee..f079ef73cd8e 100755 --- a/src/etc/unicode.py +++ b/src/etc/unicode.py @@ -321,17 +321,24 @@ def emit_core_decomp_module(f, canon, compat): format_table_content(f, data, 8) f.write("\n ];\n\n") - f.write(" pub fn canonical(c: char, i: |char|) " - + "{ d(c, i, false); }\n\n") - f.write(" pub fn compatibility(c: char, i: |char|) " - +"{ d(c, i, true); }\n\n") - f.write(" fn d(c: char, i: |char|, k: bool) {\n") - f.write(" use iter::Iterator;\n"); - - f.write(" if c <= '\\x7f' { i(c); return; }\n") - - # First check the canonical decompositions f.write(""" + pub fn decompose_canonical(c: char, i: |char|) { d(c, i, false); } + + pub fn decompose_compatible(c: char, i: |char|) { d(c, i, true); } + + fn d(c: char, i: |char|, k: bool) { + use iter::Iterator; + + // 7-bit ASCII never decomposes + if c <= '\\x7f' { i(c); return; } + + // Perform decomposition for Hangul + if (c as u32) >= S_BASE && (c as u32) < (S_BASE + S_COUNT) { + decompose_hangul(c, i); + return; + } + + // First check the canonical decompositions match bsearch_table(c, canonical_table) { Some(canon) => { for x in canon.iter() { @@ -340,13 +347,12 @@ def emit_core_decomp_module(f, canon, compat): return; } None => () - }\n\n""") + } - # Bottom out if we're not doing compat. - f.write(" if !k { i(c); return; }\n") + // Bottom out if we're not doing compat. 
+ if !k { i(c); return; } - # Then check the compatibility decompositions - f.write(""" + // Then check the compatibility decompositions match bsearch_table(c, compatibility_table) { Some(compat) => { for x in compat.iter() { @@ -355,12 +361,45 @@ def emit_core_decomp_module(f, canon, compat): return; } None => () - }\n\n""") + } - # Finally bottom out. - f.write(" i(c);\n") - f.write(" }\n") - f.write("}\n\n") + // Finally bottom out. + i(c); + } + + // Constants from Unicode 6.2.0 Section 3.12 Conjoining Jamo Behavior + static S_BASE: u32 = 0xAC00; + static L_BASE: u32 = 0x1100; + static V_BASE: u32 = 0x1161; + static T_BASE: u32 = 0x11A7; + static L_COUNT: u32 = 19; + static V_COUNT: u32 = 21; + static T_COUNT: u32 = 28; + static N_COUNT: u32 = (V_COUNT * T_COUNT); + static S_COUNT: u32 = (L_COUNT * N_COUNT); + + // Decompose a precomposed Hangul syllable + fn decompose_hangul(s: char, f: |char|) { + use mem::transmute; + + let si = s as u32 - S_BASE; + + let li = si / N_COUNT; + unsafe { + f(transmute(L_BASE + li)); + + let vi = (si % N_COUNT) / T_COUNT; + f(transmute(V_BASE + vi)); + + let ti = si % T_COUNT; + if ti > 0 { + f(transmute(T_BASE + ti)); + } + } + } +} + +""") def emit_std_decomp_module(f, combine): f.write("pub mod decompose {\n"); diff --git a/src/libcore/char.rs b/src/libcore/char.rs index ca5e56f0649c..71a2d75715b5 100644 --- a/src/libcore/char.rs +++ b/src/libcore/char.rs @@ -27,7 +27,12 @@ use mem::transmute; use option::{None, Option, Some}; use iter::{Iterator, range_step}; -use unicode::{derived_property, property, general_category, decompose, conversions}; +use unicode::{derived_property, property, general_category, conversions}; + +/// Returns the canonical decomposition of a character. +pub use unicode::decompose::decompose_canonical; +/// Returns the compatibility decomposition of a character. 
+pub use unicode::decompose::decompose_compatible; #[cfg(not(test))] use cmp::{Eq, Ord, TotalEq, TotalOrd, Ordering}; #[cfg(not(test))] use default::Default; @@ -285,53 +290,6 @@ pub fn from_digit(num: uint, radix: uint) -> Option { } } -// Constants from Unicode 6.2.0 Section 3.12 Conjoining Jamo Behavior -static S_BASE: u32 = 0xAC00; -static L_BASE: u32 = 0x1100; -static V_BASE: u32 = 0x1161; -static T_BASE: u32 = 0x11A7; -static L_COUNT: u32 = 19; -static V_COUNT: u32 = 21; -static T_COUNT: u32 = 28; -static N_COUNT: u32 = (V_COUNT * T_COUNT); -static S_COUNT: u32 = (L_COUNT * N_COUNT); - -// Decompose a precomposed Hangul syllable -fn decompose_hangul(s: char, f: |char|) { - let si = s as u32 - S_BASE; - - let li = si / N_COUNT; - unsafe { - f(transmute(L_BASE + li)); - - let vi = (si % N_COUNT) / T_COUNT; - f(transmute(V_BASE + vi)); - - let ti = si % T_COUNT; - if ti > 0 { - f(transmute(T_BASE + ti)); - } - } -} - -/// Returns the canonical decomposition of a character -pub fn decompose_canonical(c: char, f: |char|) { - if (c as u32) < S_BASE || (c as u32) >= (S_BASE + S_COUNT) { - decompose::canonical(c, f); - } else { - decompose_hangul(c, f); - } -} - -/// Returns the compatibility decomposition of a character -pub fn decompose_compatible(c: char, f: |char|) { - if (c as u32) < S_BASE || (c as u32) >= (S_BASE + S_COUNT) { - decompose::compatibility(c, f); - } else { - decompose_hangul(c, f); - } -} - /// /// Returns the hexadecimal Unicode escape of a `char` /// diff --git a/src/libcore/unicode.rs b/src/libcore/unicode.rs index b3298bde0554..bffde2323bf9 100644 --- a/src/libcore/unicode.rs +++ b/src/libcore/unicode.rs @@ -2121,14 +2121,24 @@ pub mod decompose { &['\u53ef']) ]; - pub fn canonical(c: char, i: |char|) { d(c, i, false); } - pub fn compatibility(c: char, i: |char|) { d(c, i, true); } + pub fn decompose_canonical(c: char, i: |char|) { d(c, i, false); } + + pub fn decompose_compatible(c: char, i: |char|) { d(c, i, true); } fn d(c: char, i: 
|char|, k: bool) { use iter::Iterator; + + // 7-bit ASCII never decomposes if c <= '\x7f' { i(c); return; } + // Perform decomposition for Hangul + if (c as u32) >= S_BASE && (c as u32) < (S_BASE + S_COUNT) { + decompose_hangul(c, i); + return; + } + + // First check the canonical decompositions match bsearch_table(c, canonical_table) { Some(canon) => { for x in canon.iter() { @@ -2139,8 +2149,10 @@ pub mod decompose { None => () } + // Bottom out if we're not doing compat. if !k { i(c); return; } + // Then check the compatibility decompositions match bsearch_table(c, compatibility_table) { Some(compat) => { for x in compat.iter() { @@ -2151,8 +2163,40 @@ pub mod decompose { None => () } + // Finally bottom out. i(c); } + + // Constants from Unicode 6.2.0 Section 3.12 Conjoining Jamo Behavior + static S_BASE: u32 = 0xAC00; + static L_BASE: u32 = 0x1100; + static V_BASE: u32 = 0x1161; + static T_BASE: u32 = 0x11A7; + static L_COUNT: u32 = 19; + static V_COUNT: u32 = 21; + static T_COUNT: u32 = 28; + static N_COUNT: u32 = (V_COUNT * T_COUNT); + static S_COUNT: u32 = (L_COUNT * N_COUNT); + + // Decompose a precomposed Hangul syllable + fn decompose_hangul(s: char, f: |char|) { + use mem::transmute; + + let si = s as u32 - S_BASE; + + let li = si / N_COUNT; + unsafe { + f(transmute(L_BASE + li)); + + let vi = (si % N_COUNT) / T_COUNT; + f(transmute(V_BASE + vi)); + + let ti = si % T_COUNT; + if ti > 0 { + f(transmute(T_BASE + ti)); + } + } + } } pub mod derived_property {