From f329030b095aa30ce29be0c3459615d85506747b Mon Sep 17 00:00:00 2001 From: Alex Crichton Date: Fri, 10 Apr 2015 16:05:09 -0700 Subject: [PATCH] std: Stabilize the Utf8Error type The meaning of each variant of this enum was somewhat ambiguous and it's unclear that we wouldn't even want to add more enumeration values in the future. As a result this error has been altered to instead become an opaque structure. Learning about the "first invalid byte index" is still an unstable feature, but the type itself is now stable. --- src/libcollections/lib.rs | 1 + src/libcollections/string.rs | 16 +++++-------- src/libcollectionstest/str.rs | 2 +- src/libcollectionstest/string.rs | 1 - src/libcore/str/mod.rs | 39 ++++++++++++++------------------ src/libstd/error.rs | 5 +--- 6 files changed, 26 insertions(+), 38 deletions(-) diff --git a/src/libcollections/lib.rs b/src/libcollections/lib.rs index 7658611d809a..5179b04f8824 100644 --- a/src/libcollections/lib.rs +++ b/src/libcollections/lib.rs @@ -40,6 +40,7 @@ #![feature(str_char)] #![feature(slice_patterns)] #![feature(debug_builders)] +#![feature(utf8_error)] #![cfg_attr(test, feature(rand, rustc_private, test, hash, collections))] #![cfg_attr(test, allow(deprecated))] // rand diff --git a/src/libcollections/string.rs b/src/libcollections/string.rs index 441d0f2c5df7..9c9f2d628b82 100644 --- a/src/libcollections/string.rs +++ b/src/libcollections/string.rs @@ -132,7 +132,7 @@ impl String { /// /// let invalid_vec = vec![240, 144, 128]; /// let s = String::from_utf8(invalid_vec).err().unwrap(); - /// assert_eq!(s.utf8_error(), Utf8Error::TooShort); + /// let err = s.utf8_error(); /// assert_eq!(s.into_bytes(), [240, 144, 128]); /// ``` #[inline] @@ -156,14 +156,10 @@ impl String { /// ``` #[stable(feature = "rust1", since = "1.0.0")] pub fn from_utf8_lossy<'a>(v: &'a [u8]) -> Cow<'a, str> { - let mut i = 0; + let mut i; match str::from_utf8(v) { Ok(s) => return Cow::Borrowed(s), - Err(e) => { - if let 
Utf8Error::InvalidByte(firstbad) = e { - i = firstbad; - } - } + Err(e) => i = e.valid_up_to(), } const TAG_CONT_U8: u8 = 128; @@ -188,9 +184,9 @@ impl String { }; } - // subseqidx is the index of the first byte of the subsequence we're looking at. - // It's used to copy a bunch of contiguous good codepoints at once instead of copying - // them one by one. + // subseqidx is the index of the first byte of the subsequence we're + // looking at. It's used to copy a bunch of contiguous good codepoints + // at once instead of copying them one by one. let mut subseqidx = i; while i < total { diff --git a/src/libcollectionstest/str.rs b/src/libcollectionstest/str.rs index 15f15900e783..cacafab4e3c1 100644 --- a/src/libcollectionstest/str.rs +++ b/src/libcollectionstest/str.rs @@ -1502,7 +1502,7 @@ fn test_str_from_utf8() { assert_eq!(from_utf8(xs), Ok("ศไทย中华Việt Nam")); let xs = b"hello\xFF"; - assert_eq!(from_utf8(xs), Err(Utf8Error::TooShort)); + assert!(from_utf8(xs).is_err()); } #[test] diff --git a/src/libcollectionstest/string.rs b/src/libcollectionstest/string.rs index 5d6aa8ac0dcc..3184f842e9ae 100644 --- a/src/libcollectionstest/string.rs +++ b/src/libcollectionstest/string.rs @@ -45,7 +45,6 @@ fn test_from_utf8() { let xs = b"hello\xFF".to_vec(); let err = String::from_utf8(xs).err().unwrap(); - assert_eq!(err.utf8_error(), Utf8Error::TooShort); assert_eq!(err.into_bytes(), b"hello\xff".to_vec()); } diff --git a/src/libcore/str/mod.rs b/src/libcore/str/mod.rs index 9bc760b56ec4..fc623f21167c 100644 --- a/src/libcore/str/mod.rs +++ b/src/libcore/str/mod.rs @@ -106,19 +106,19 @@ Section: Creating a string /// Errors which can occur when attempting to interpret a byte slice as a `str`. #[derive(Copy, Eq, PartialEq, Clone, Debug)] -#[unstable(feature = "core", - reason = "error enumeration recently added and definitions may be refined")] -pub enum Utf8Error { - /// An invalid byte was detected at the byte offset given. 
- /// - /// The offset is guaranteed to be in bounds of the slice in question, and - /// the byte at the specified offset was the first invalid byte in the - /// sequence detected. - InvalidByte(usize), +#[stable(feature = "rust1", since = "1.0.0")] +pub struct Utf8Error { + valid_up_to: usize, +} - /// The byte slice was invalid because more bytes were needed but no more - /// bytes were available. - TooShort, +impl Utf8Error { + /// Returns the index in the given string up to which valid UTF-8 was + /// verified. + /// + /// Starting at the index provided, but not necessarily at it precisely, an + /// invalid UTF-8 encoding sequence was found. + #[unstable(feature = "utf8_error", reason = "method just added")] + pub fn valid_up_to(&self) -> usize { self.valid_up_to } } /// Converts a slice of bytes to a string slice without performing any @@ -147,14 +147,7 @@ pub unsafe fn from_utf8_unchecked<'a>(v: &'a [u8]) -> &'a str { #[stable(feature = "rust1", since = "1.0.0")] impl fmt::Display for Utf8Error { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - match *self { - Utf8Error::InvalidByte(n) => { - write!(f, "invalid utf-8: invalid byte at index {}", n) - } - Utf8Error::TooShort => { - write!(f, "invalid utf-8: byte slice too short") - } - } + write!(f, "invalid utf-8: invalid byte near index {}", self.valid_up_to) } } @@ -1218,14 +1211,16 @@ fn run_utf8_validation_iterator(iter: &mut slice::Iter) // restore the iterator we had at the start of this codepoint. macro_rules! err { () => {{ *iter = old.clone(); - return Err(Utf8Error::InvalidByte(whole.len() - iter.as_slice().len())) + return Err(Utf8Error { + valid_up_to: whole.len() - iter.as_slice().len() + }) }}} macro_rules! next { () => { match iter.next() { Some(a) => *a, // we needed data, but there was none: error! 
- None => return Err(Utf8Error::TooShort), + None => err!(), } }} diff --git a/src/libstd/error.rs b/src/libstd/error.rs index c9babeb32301..96087bf1183d 100644 --- a/src/libstd/error.rs +++ b/src/libstd/error.rs @@ -122,10 +122,7 @@ impl Error for str::ParseBoolError { #[stable(feature = "rust1", since = "1.0.0")] impl Error for str::Utf8Error { fn description(&self) -> &str { - match *self { - str::Utf8Error::TooShort => "invalid utf-8: not enough bytes", - str::Utf8Error::InvalidByte(..) => "invalid utf-8: corrupt contents", - } + "invalid utf-8: corrupt contents" } }