Auto merge of #21488 - aturon:os-str, r=alexcrichton

Per [RFC 517](https://github.com/rust-lang/rfcs/pull/575/), this commit introduces platform-native strings. The API is essentially as described in the RFC. The WTF-8 implementation is adapted from @SimonSapin's [implementation](https://github.com/SimonSapin/rust-wtf8). To make this work, some encoding and decoding functionality in `libcore` is now exported in a "raw" fashion reusable for WTF-8. These exports are *not* reexported in `std`, nor are they stable.
2015-01-24 19:39:52 +00:00 · 2015-01-24 19:39:52 +00:00 · bb7cc4eb26
commit bb7cc4eb26
parent 76fbb35831 c5369ebc7f
12 changed files with 1850 additions and 92 deletions
--- a/src/libcore/char.rs
+++ b/src/libcore/char.rs
@ -258,49 +258,69 @@ impl CharExt for char {
    #[inline]
    #[unstable = "pending decision about Iterator/Writer/Reader"]
    fn encode_utf8(self, dst: &mut [u8]) -> Option<uint> {
-        // Marked #[inline] to allow llvm optimizing it away
-        let code = self as u32;
-        if code < MAX_ONE_B && dst.len() >= 1 {
-            dst[0] = code as u8;
-            Some(1)
-        } else if code < MAX_TWO_B && dst.len() >= 2 {
-            dst[0] = (code >> 6u & 0x1F_u32) as u8 | TAG_TWO_B;
-            dst[1] = (code & 0x3F_u32) as u8 | TAG_CONT;
-            Some(2)
-        } else if code < MAX_THREE_B && dst.len() >= 3  {
-            dst[0] = (code >> 12u & 0x0F_u32) as u8 | TAG_THREE_B;
-            dst[1] = (code >>  6u & 0x3F_u32) as u8 | TAG_CONT;
-            dst[2] = (code & 0x3F_u32) as u8 | TAG_CONT;
-            Some(3)
-        } else if dst.len() >= 4 {
-            dst[0] = (code >> 18u & 0x07_u32) as u8 | TAG_FOUR_B;
-            dst[1] = (code >> 12u & 0x3F_u32) as u8 | TAG_CONT;
-            dst[2] = (code >>  6u & 0x3F_u32) as u8 | TAG_CONT;
-            dst[3] = (code & 0x3F_u32) as u8 | TAG_CONT;
-            Some(4)
-        } else {
-            None
-        }
+        encode_utf8_raw(self as u32, dst)
    }

    #[inline]
    #[unstable = "pending decision about Iterator/Writer/Reader"]
    fn encode_utf16(self, dst: &mut [u16]) -> Option<uint> {
-        // Marked #[inline] to allow llvm optimizing it away
-        let mut ch = self as u32;
-        if (ch & 0xFFFF_u32) == ch  && dst.len() >= 1 {
-            // The BMP falls through (assuming non-surrogate, as it should)
-            dst[0] = ch as u16;
-            Some(1)
-        } else if dst.len() >= 2 {
-            // Supplementary planes break into surrogates.
-            ch -= 0x1_0000_u32;
-            dst[0] = 0xD800_u16 | ((ch >> 10) as u16);
-            dst[1] = 0xDC00_u16 | ((ch as u16) & 0x3FF_u16);
-            Some(2)
-        } else {
-            None
-        }
+        encode_utf16_raw(self as u32, dst)
+    }
+}
+
+/// Encodes a raw u32 value as UTF-8 into the provided byte buffer,
+/// and then returns the number of bytes written.
+///
+/// If the buffer is not large enough, nothing will be written into it
+/// and a `None` will be returned.
+#[inline]
+#[unstable]
+pub fn encode_utf8_raw(code: u32, dst: &mut [u8]) -> Option<uint> {
+    // Marked #[inline] to allow llvm optimizing it away
+    if code < MAX_ONE_B && dst.len() >= 1 {
+        dst[0] = code as u8;
+        Some(1)
+    } else if code < MAX_TWO_B && dst.len() >= 2 {
+        dst[0] = (code >> 6u & 0x1F_u32) as u8 | TAG_TWO_B;
+        dst[1] = (code & 0x3F_u32) as u8 | TAG_CONT;
+        Some(2)
+    } else if code < MAX_THREE_B && dst.len() >= 3  {
+        dst[0] = (code >> 12u & 0x0F_u32) as u8 | TAG_THREE_B;
+        dst[1] = (code >>  6u & 0x3F_u32) as u8 | TAG_CONT;
+        dst[2] = (code & 0x3F_u32) as u8 | TAG_CONT;
+        Some(3)
+    } else if dst.len() >= 4 {
+        dst[0] = (code >> 18u & 0x07_u32) as u8 | TAG_FOUR_B;
+        dst[1] = (code >> 12u & 0x3F_u32) as u8 | TAG_CONT;
+        dst[2] = (code >>  6u & 0x3F_u32) as u8 | TAG_CONT;
+        dst[3] = (code & 0x3F_u32) as u8 | TAG_CONT;
+        Some(4)
+    } else {
+        None
+    }
+}
+
+/// Encodes a raw u32 value as UTF-16 into the provided `u16` buffer,
+/// and then returns the number of `u16`s written.
+///
+/// If the buffer is not large enough, nothing will be written into it
+/// and a `None` will be returned.
+#[inline]
+#[unstable]
+pub fn encode_utf16_raw(mut ch: u32, dst: &mut [u16]) -> Option<uint> {
+    // Marked #[inline] to allow llvm optimizing it away
+    if (ch & 0xFFFF_u32) == ch  && dst.len() >= 1 {
+        // The BMP falls through (assuming non-surrogate, as it should)
+        dst[0] = ch as u16;
+        Some(1)
+    } else if dst.len() >= 2 {
+        // Supplementary planes break into surrogates.
+        ch -= 0x1_0000_u32;
+        dst[0] = 0xD800_u16 | ((ch >> 10) as u16);
+        dst[1] = 0xDC00_u16 | ((ch as u16) & 0x3FF_u16);
+        Some(2)
+    } else {
+        None
    }
 }

--- a/src/libcore/str/mod.rs
+++ b/src/libcore/str/mod.rs
@ -305,43 +305,52 @@ fn unwrap_or_0(opt: Option<&u8>) -> u8 {
    }
 }

+/// Reads the next code point out of a byte iterator (assuming a
+/// UTF-8-like encoding).
+#[unstable]
+pub fn next_code_point(bytes: &mut slice::Iter<u8>) -> Option<u32> {
+    // Decode UTF-8
+    let x = match bytes.next() {
+        None => return None,
+        Some(&next_byte) if next_byte < 128 => return Some(next_byte as u32),
+        Some(&next_byte) => next_byte,
+    };
+
+    // Multibyte case follows
+    // Decode from a byte combination out of: [[[x y] z] w]
+    // NOTE: Performance is sensitive to the exact formulation here
+    let init = utf8_first_byte!(x, 2);
+    let y = unwrap_or_0(bytes.next());
+    let mut ch = utf8_acc_cont_byte!(init, y);
+    if x >= 0xE0 {
+        // [[x y z] w] case
+        // 5th bit in 0xE0 .. 0xEF is always clear, so `init` is still valid
+        let z = unwrap_or_0(bytes.next());
+        let y_z = utf8_acc_cont_byte!((y & CONT_MASK) as u32, z);
+        ch = init << 12 | y_z;
+        if x >= 0xF0 {
+            // [x y z w] case
+            // use only the lower 3 bits of `init`
+            let w = unwrap_or_0(bytes.next());
+            ch = (init & 7) << 18 | utf8_acc_cont_byte!(y_z, w);
+        }
+    }
+
+    Some(ch)
+}
+
 #[stable]
 impl<'a> Iterator for Chars<'a> {
    type Item = char;

    #[inline]
    fn next(&mut self) -> Option<char> {
-        // Decode UTF-8, using the valid UTF-8 invariant
-        let x = match self.iter.next() {
-            None => return None,
-            Some(&next_byte) if next_byte < 128 => return Some(next_byte as char),
-            Some(&next_byte) => next_byte,
-        };
-
-        // Multibyte case follows
-        // Decode from a byte combination out of: [[[x y] z] w]
-        // NOTE: Performance is sensitive to the exact formulation here
-        let init = utf8_first_byte!(x, 2);
-        let y = unwrap_or_0(self.iter.next());
-        let mut ch = utf8_acc_cont_byte!(init, y);
-        if x >= 0xE0 {
-            // [[x y z] w] case
-            // 5th bit in 0xE0 .. 0xEF is always clear, so `init` is still valid
-            let z = unwrap_or_0(self.iter.next());
-            let y_z = utf8_acc_cont_byte!((y & CONT_MASK) as u32, z);
-            ch = init << 12 | y_z;
-            if x >= 0xF0 {
-                // [x y z w] case
-                // use only the lower 3 bits of `init`
-                let w = unwrap_or_0(self.iter.next());
-                ch = (init & 7) << 18 | utf8_acc_cont_byte!(y_z, w);
+        next_code_point(&mut self.iter).map(|ch| {
+            // str invariant says `ch` is a valid Unicode Scalar Value
+            unsafe {
+                mem::transmute(ch)
            }
-        }
-
-        // str invariant says `ch` is a valid Unicode Scalar Value
-        unsafe {
-            Some(mem::transmute(ch))
-        }
+        })
    }

    #[inline]
@ -1517,25 +1526,8 @@ impl StrExt for str {

    #[inline]
    fn char_range_at(&self, i: uint) -> CharRange {
-        if self.as_bytes()[i] < 128u8 {
-            return CharRange {ch: self.as_bytes()[i] as char, next: i + 1 };
-        }
-
-        // Multibyte case is a fn to allow char_range_at to inline cleanly
-        fn multibyte_char_range_at(s: &str, i: uint) -> CharRange {
-            let mut val = s.as_bytes()[i] as u32;
-            let w = UTF8_CHAR_WIDTH[val as uint] as uint;
-            assert!((w != 0));
-
-            val = utf8_first_byte!(val, w);
-            val = utf8_acc_cont_byte!(val, s.as_bytes()[i + 1]);
-            if w > 2 { val = utf8_acc_cont_byte!(val, s.as_bytes()[i + 2]); }
-            if w > 3 { val = utf8_acc_cont_byte!(val, s.as_bytes()[i + 3]); }
-
-            return CharRange {ch: unsafe { mem::transmute(val) }, next: i + w};
-        }
-
-        return multibyte_char_range_at(self, i);
+        let (c, n) = char_range_at_raw(self.as_bytes(), i);
+        CharRange { ch: unsafe { mem::transmute(c) }, next: n }
    }

    #[inline]
@ -1653,6 +1645,32 @@ impl StrExt for str {
    fn parse<T: FromStr>(&self) -> Option<T> { FromStr::from_str(self) }
 }

+/// Pluck a code point out of a UTF-8-like byte slice and return the
+/// index of the next code point.
+#[inline]
+#[unstable]
+pub fn char_range_at_raw(bytes: &[u8], i: uint) -> (u32, usize) {
+    if bytes[i] < 128u8 {
+        return (bytes[i] as u32, i + 1);
+    }
+
+    // Multibyte case is a fn to allow char_range_at to inline cleanly
+    fn multibyte_char_range_at(bytes: &[u8], i: uint) -> (u32, usize) {
+        let mut val = bytes[i] as u32;
+        let w = UTF8_CHAR_WIDTH[val as uint] as uint;
+        assert!((w != 0));
+
+        val = utf8_first_byte!(val, w);
+        val = utf8_acc_cont_byte!(val, bytes[i + 1]);
+        if w > 2 { val = utf8_acc_cont_byte!(val, bytes[i + 2]); }
+        if w > 3 { val = utf8_acc_cont_byte!(val, bytes[i + 3]); }
+
+        return (val, i + w);
+    }
+
+    multibyte_char_range_at(bytes, i)
+}
+
 #[stable]
 impl<'a> Default for &'a str {
    #[stable]