diff --git a/src/libcollections/lib.rs b/src/libcollections/lib.rs index 7658611d809a..5179b04f8824 100644 --- a/src/libcollections/lib.rs +++ b/src/libcollections/lib.rs @@ -40,6 +40,7 @@ #![feature(str_char)] #![feature(slice_patterns)] #![feature(debug_builders)] +#![feature(utf8_error)] #![cfg_attr(test, feature(rand, rustc_private, test, hash, collections))] #![cfg_attr(test, allow(deprecated))] // rand diff --git a/src/libcollections/string.rs b/src/libcollections/string.rs index 441d0f2c5df7..9c9f2d628b82 100644 --- a/src/libcollections/string.rs +++ b/src/libcollections/string.rs @@ -132,7 +132,7 @@ impl String { /// /// let invalid_vec = vec![240, 144, 128]; /// let s = String::from_utf8(invalid_vec).err().unwrap(); - /// assert_eq!(s.utf8_error(), Utf8Error::TooShort); + /// let err = s.utf8_error(); /// assert_eq!(s.into_bytes(), [240, 144, 128]); /// ``` #[inline] @@ -156,14 +156,10 @@ impl String { /// ``` #[stable(feature = "rust1", since = "1.0.0")] pub fn from_utf8_lossy<'a>(v: &'a [u8]) -> Cow<'a, str> { - let mut i = 0; + let mut i; match str::from_utf8(v) { Ok(s) => return Cow::Borrowed(s), - Err(e) => { - if let Utf8Error::InvalidByte(firstbad) = e { - i = firstbad; - } - } + Err(e) => i = e.valid_up_to(), } const TAG_CONT_U8: u8 = 128; @@ -188,9 +184,9 @@ impl String { }; } - // subseqidx is the index of the first byte of the subsequence we're looking at. - // It's used to copy a bunch of contiguous good codepoints at once instead of copying - // them one by one. + // subseqidx is the index of the first byte of the subsequence we're + // looking at. It's used to copy a bunch of contiguous good codepoints + // at once instead of copying them one by one. let mut subseqidx = i; while i < total { diff --git a/src/libcollectionstest/str.rs b/src/libcollectionstest/str.rs index 15f15900e783..cacafab4e3c1 100644 --- a/src/libcollectionstest/str.rs +++ b/src/libcollectionstest/str.rs @@ -1502,7 +1502,7 @@ fn test_str_from_utf8() { assert_eq!(from_utf8(xs), Ok("ศไทย中华Việt Nam")); let xs = b"hello\xFF"; - assert_eq!(from_utf8(xs), Err(Utf8Error::TooShort)); + assert!(from_utf8(xs).is_err()); } #[test] diff --git a/src/libcollectionstest/string.rs b/src/libcollectionstest/string.rs index 5d6aa8ac0dcc..3184f842e9ae 100644 --- a/src/libcollectionstest/string.rs +++ b/src/libcollectionstest/string.rs @@ -45,7 +45,6 @@ fn test_from_utf8() { let xs = b"hello\xFF".to_vec(); let err = String::from_utf8(xs).err().unwrap(); - assert_eq!(err.utf8_error(), Utf8Error::TooShort); assert_eq!(err.into_bytes(), b"hello\xff".to_vec()); } diff --git a/src/libcore/str/mod.rs b/src/libcore/str/mod.rs index 9bc760b56ec4..fc623f21167c 100644 --- a/src/libcore/str/mod.rs +++ b/src/libcore/str/mod.rs @@ -106,19 +106,19 @@ Section: Creating a string /// Errors which can occur when attempting to interpret a byte slice as a `str`. #[derive(Copy, Eq, PartialEq, Clone, Debug)] -#[unstable(feature = "core", - reason = "error enumeration recently added and definitions may be refined")] -pub enum Utf8Error { - /// An invalid byte was detected at the byte offset given. - /// - /// The offset is guaranteed to be in bounds of the slice in question, and - /// the byte at the specified offset was the first invalid byte in the - /// sequence detected. - InvalidByte(usize), +#[stable(feature = "rust1", since = "1.0.0")] +pub struct Utf8Error { + valid_up_to: usize, +} - /// The byte slice was invalid because more bytes were needed but no more - /// bytes were available. - TooShort, +impl Utf8Error { + /// Returns the index in the given string up to which valid UTF-8 was + /// verified. + /// + /// Starting at the index provided, but not necessarily at it precisely, an + /// invalid UTF-8 encoding sequence was found. + #[unstable(feature = "utf8_error", reason = "method just added")] + pub fn valid_up_to(&self) -> usize { self.valid_up_to } } /// Converts a slice of bytes to a string slice without performing any @@ -147,14 +147,7 @@ pub unsafe fn from_utf8_unchecked<'a>(v: &'a [u8]) -> &'a str { #[stable(feature = "rust1", since = "1.0.0")] impl fmt::Display for Utf8Error { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - match *self { - Utf8Error::InvalidByte(n) => { - write!(f, "invalid utf-8: invalid byte at index {}", n) - } - Utf8Error::TooShort => { - write!(f, "invalid utf-8: byte slice too short") - } - } + write!(f, "invalid utf-8: invalid byte near index {}", self.valid_up_to) } } @@ -1218,14 +1211,16 @@ fn run_utf8_validation_iterator(iter: &mut slice::Iter) // restore the iterator we had at the start of this codepoint. macro_rules! err { () => {{ *iter = old.clone(); - return Err(Utf8Error::InvalidByte(whole.len() - iter.as_slice().len())) + return Err(Utf8Error { + valid_up_to: whole.len() - iter.as_slice().len() + }) }}} macro_rules! next { () => { match iter.next() { Some(a) => *a, // we needed data, but there was none: error! - None => return Err(Utf8Error::TooShort), + None => err!(), } }} diff --git a/src/libstd/error.rs b/src/libstd/error.rs index c9babeb32301..96087bf1183d 100644 --- a/src/libstd/error.rs +++ b/src/libstd/error.rs @@ -122,10 +122,7 @@ impl Error for str::ParseBoolError { #[stable(feature = "rust1", since = "1.0.0")] impl Error for str::Utf8Error { fn description(&self) -> &str { - match *self { - str::Utf8Error::TooShort => "invalid utf-8: not enough bytes", - str::Utf8Error::InvalidByte(..) => "invalid utf-8: corrupt contents", - } + "invalid utf-8: corrupt contents" } }