From 560944b982385623655f1e8503af5e7b4ca0a436 Mon Sep 17 00:00:00 2001 From: Clar Charr Date: Mon, 13 Feb 2017 20:37:42 -0500 Subject: [PATCH 1/7] Add From> implementations. --- src/libcollections/string.rs | 16 +++++++++++++++ src/libcollections/vec.rs | 16 +++++++++++++++ src/libstd/ffi/c_str.rs | 33 ++++++++++++++++++++++++------- src/libstd/ffi/os_str.rs | 34 +++++++++++++++++++++++++------- src/libstd/path.rs | 34 +++++++++++++++++++++++++------- src/libstd/sys/redox/os_str.rs | 6 ++++++ src/libstd/sys/unix/os_str.rs | 6 ++++++ src/libstd/sys/windows/os_str.rs | 6 ++++++ src/libstd/sys_common/wtf8.rs | 6 ++++++ 9 files changed, 136 insertions(+), 21 deletions(-) diff --git a/src/libcollections/string.rs b/src/libcollections/string.rs index 43323676ab45..13c99a2d59be 100644 --- a/src/libcollections/string.rs +++ b/src/libcollections/string.rs @@ -1974,6 +1974,22 @@ impl<'a> From<&'a str> for String { } } +// note: test pulls in libstd, which causes errors here +#[cfg(not(test))] +#[stable(feature = "string_from_box", since = "1.17.0")] +impl From> for String { + fn from(s: Box) -> String { + s.into_string() + } +} + +#[stable(feature = "box_from_str", since = "1.17.0")] +impl Into> for String { + fn into(self) -> Box { + self.into_boxed_str() + } +} + #[stable(feature = "string_from_cow_str", since = "1.14.0")] impl<'a> From> for String { fn from(s: Cow<'a, str>) -> String { diff --git a/src/libcollections/vec.rs b/src/libcollections/vec.rs index d38c9f6e1cf8..e4a6af33409e 100644 --- a/src/libcollections/vec.rs +++ b/src/libcollections/vec.rs @@ -1897,6 +1897,22 @@ impl<'a, T> From> for Vec where [T]: ToOwned> { } } +// note: test pulls in libstd, which causes errors here +#[cfg(not(test))] +#[stable(feature = "vec_from_box", since = "1.17.0")] +impl From> for Vec { + fn from(s: Box<[T]>) -> Vec { + s.into_vec() + } +} + +#[stable(feature = "box_from_vec", since = "1.17.0")] +impl Into> for Vec { + fn into(self) -> Box<[T]> { + self.into_boxed_slice() + } +} + #[stable(feature = "rust1", since = "1.0.0")] impl<'a> From<&'a str> for Vec { fn from(s: &'a str) -> Vec { diff --git a/src/libstd/ffi/c_str.rs b/src/libstd/ffi/c_str.rs index bc678fcb8385..a3f4154ba4af 100644 --- a/src/libstd/ffi/c_str.rs +++ b/src/libstd/ffi/c_str.rs @@ -304,7 +304,7 @@ impl CString { } /// Converts this `CString` into a boxed `CStr`. - #[unstable(feature = "into_boxed_c_str", issue = "0")] + #[unstable(feature = "into_boxed_c_str", issue = "40380")] pub fn into_boxed_c_str(self) -> Box { unsafe { mem::transmute(self.into_inner()) } } @@ -394,6 +394,20 @@ impl<'a> From<&'a CStr> for Box { } } +#[stable(feature = "c_string_from_box", since = "1.17.0")] +impl From> for CString { + fn from(s: Box) -> CString { + s.into_c_string() + } +} + +#[stable(feature = "box_from_c_string", since = "1.17.0")] +impl Into> for CString { + fn into(self) -> Box { + self.into_boxed_c_str() + } +} + #[stable(feature = "default_box_extra", since = "1.17.0")] impl Default for Box { fn default() -> Box { @@ -694,6 +708,12 @@ impl CStr { pub fn to_string_lossy(&self) -> Cow { String::from_utf8_lossy(self.to_bytes()) } + + /// Converts a `Box` into a `CString` without copying or allocating. + #[unstable(feature = "into_boxed_c_str", issue = "40380")] + pub fn into_c_string(self: Box) -> CString { + unsafe { mem::transmute(self) } + } } #[stable(feature = "rust1", since = "1.0.0")] @@ -888,12 +908,11 @@ mod tests { fn into_boxed() { let orig: &[u8] = b"Hello, world!\0"; let cstr = CStr::from_bytes_with_nul(orig).unwrap(); - let cstring = cstr.to_owned(); - let box1: Box = Box::from(cstr); - let box2 = cstring.into_boxed_c_str(); - assert_eq!(cstr, &*box1); - assert_eq!(box1, box2); - assert_eq!(&*box2, cstr); + let boxed: Box = Box::from(cstr); + let cstring = cstr.to_owned().into_boxed_c_str().into_c_string(); + assert_eq!(cstr, &*boxed); + assert_eq!(&*boxed, &*cstring); + assert_eq!(&*cstring, cstr); } #[test] diff --git a/src/libstd/ffi/os_str.rs b/src/libstd/ffi/os_str.rs index 7b8bf42e0a74..c9c207d8b8eb 100644 --- a/src/libstd/ffi/os_str.rs +++ b/src/libstd/ffi/os_str.rs @@ -206,7 +206,7 @@ impl OsString { } /// Converts this `OsString` into a boxed `OsStr`. - #[unstable(feature = "into_boxed_os_str", issue = "0")] + #[unstable(feature = "into_boxed_os_str", issue = "40380")] pub fn into_boxed_os_str(self) -> Box { unsafe { mem::transmute(self.inner.into_box()) } } @@ -442,6 +442,13 @@ impl OsStr { self.inner.inner.len() } + /// Converts a `Box` into an `OsString` without copying or allocating. + #[unstable(feature = "into_boxed_os_str", issue = "40380")] + pub fn into_os_string(self: Box) -> OsString { + let inner: Box = unsafe { mem::transmute(self) }; + OsString { inner: Buf::from_box(inner) } + } + /// Gets the underlying byte representation. /// /// Note: it is *crucial* that this API is private, to avoid @@ -458,6 +465,20 @@ impl<'a> From<&'a OsStr> for Box { } } +#[stable(feature = "os_string_from_box", since = "1.17.0")] +impl<'a> From> for OsString { + fn from(boxed: Box) -> OsString { + boxed.into_os_string() + } +} + +#[stable(feature = "box_from_c_string", since = "1.17.0")] +impl Into> for OsString { + fn into(self) -> Box { + self.into_boxed_os_str() + } +} + #[stable(feature = "box_default_extra", since = "1.17.0")] impl Default for Box { fn default() -> Box { @@ -766,12 +787,11 @@ mod tests { fn into_boxed() { let orig = "Hello, world!"; let os_str = OsStr::new(orig); - let os_string = os_str.to_owned(); - let box1: Box = Box::from(os_str); - let box2 = os_string.into_boxed_os_str(); - assert_eq!(os_str, &*box1); - assert_eq!(box1, box2); - assert_eq!(&*box2, os_str); + let boxed: Box = Box::from(os_str); + let os_string = os_str.to_owned().into_boxed_os_str().into_os_string(); + assert_eq!(os_str, &*boxed); + assert_eq!(&*boxed, &*os_string); + assert_eq!(&*os_string, os_str); } #[test] diff --git a/src/libstd/path.rs b/src/libstd/path.rs index 245a6d945b5a..49b01bc08537 100644 --- a/src/libstd/path.rs +++ b/src/libstd/path.rs @@ -1196,7 +1196,7 @@ impl PathBuf { } /// Converts this `PathBuf` into a boxed `Path`. - #[unstable(feature = "into_boxed_path", issue = "0")] + #[unstable(feature = "into_boxed_path", issue = "40380")] pub fn into_boxed_path(self) -> Box { unsafe { mem::transmute(self.inner.into_boxed_os_str()) } } @@ -1210,6 +1210,20 @@ impl<'a> From<&'a Path> for Box { } } +#[stable(feature = "path_buf_from_box", since = "1.17.0")] +impl<'a> From> for PathBuf { + fn from(boxed: Box) -> PathBuf { + boxed.into_path_buf() + } +} + +#[stable(feature = "box_from_path_buf", since = "1.17.0")] +impl Into> for PathBuf { + fn into(self) -> Box { + self.into_boxed_path() + } +} + #[stable(feature = "box_default_extra", since = "1.17.0")] impl Default for Box { fn default() -> Box { @@ -2089,6 +2103,13 @@ impl Path { pub fn is_dir(&self) -> bool { fs::metadata(self).map(|m| m.is_dir()).unwrap_or(false) } + + /// Converts a `Box` into a `PathBuf` without copying or allocating. + #[unstable(feature = "into_boxed_path", issue = "40380")] + pub fn into_path_buf(self: Box) -> PathBuf { + let inner: Box = unsafe { mem::transmute(self) }; + PathBuf { inner: OsString::from(inner) } + } } #[stable(feature = "rust1", since = "1.0.0")] @@ -3703,12 +3724,11 @@ mod tests { fn into_boxed() { let orig: &str = "some/sort/of/path"; let path = Path::new(orig); - let path_buf = path.to_owned(); - let box1: Box = Box::from(path); - let box2 = path_buf.into_boxed_path(); - assert_eq!(path, &*box1); - assert_eq!(box1, box2); - assert_eq!(&*box2, path); + let boxed: Box = Box::from(path); + let path_buf = path.to_owned().into_boxed_path().into_path_buf(); + assert_eq!(path, &*boxed); + assert_eq!(&*boxed, &*path_buf); + assert_eq!(&*path_buf, path); } #[test] diff --git a/src/libstd/sys/redox/os_str.rs b/src/libstd/sys/redox/os_str.rs index 0f967863899c..90b8289524e2 100644 --- a/src/libstd/sys/redox/os_str.rs +++ b/src/libstd/sys/redox/os_str.rs @@ -99,6 +99,12 @@ impl Buf { pub fn into_box(self) -> Box { unsafe { mem::transmute(self.inner.into_boxed_slice()) } } + + #[inline] + pub fn from_box(boxed: Box) -> Buf { + let inner: Box<[u8]> = unsafe { mem::transmute(boxed) }; + Buf { inner: inner.into_vec() } + } } impl Slice { diff --git a/src/libstd/sys/unix/os_str.rs b/src/libstd/sys/unix/os_str.rs index 938bcfc6d162..225924168a5b 100644 --- a/src/libstd/sys/unix/os_str.rs +++ b/src/libstd/sys/unix/os_str.rs @@ -99,6 +99,12 @@ impl Buf { pub fn into_box(self) -> Box { unsafe { mem::transmute(self.inner.into_boxed_slice()) } } + + #[inline] + pub fn from_box(boxed: Box) -> Buf { + let inner: Box<[u8]> = unsafe { mem::transmute(boxed) }; + Buf { inner: inner.into_vec() } + } } impl Slice { diff --git a/src/libstd/sys/windows/os_str.rs b/src/libstd/sys/windows/os_str.rs index 04e45dcf5496..810b67b785b5 100644 --- a/src/libstd/sys/windows/os_str.rs +++ b/src/libstd/sys/windows/os_str.rs @@ -93,6 +93,12 @@ impl Buf { pub fn into_box(self) -> Box { unsafe { mem::transmute(self.inner.into_box()) } } + + #[inline] + pub fn from_box(boxed: Box) -> Buf { + let inner: Box = unsafe { mem::transmute(boxed) }; + Buf { inner: Wtf8Buf::from_box(inner) } + } } impl Slice { diff --git a/src/libstd/sys_common/wtf8.rs b/src/libstd/sys_common/wtf8.rs index 1d61181a4ee0..28cab10e8f9c 100644 --- a/src/libstd/sys_common/wtf8.rs +++ b/src/libstd/sys_common/wtf8.rs @@ -346,6 +346,12 @@ impl Wtf8Buf { pub fn into_box(self) -> Box { unsafe { mem::transmute(self.bytes.into_boxed_slice()) } } + + /// Converts a `Box` into a `Wtf8Buf`. + pub fn from_box(boxed: Box) -> Wtf8Buf { + let bytes: Box<[u8]> = unsafe { mem::transmute(boxed) }; + Wtf8Buf { bytes: bytes.into_vec() } + } } /// Create a new WTF-8 string from an iterator of code points. From e31e264c55be03e7ca9477bfb32ffa03387ac8a2 Mon Sep 17 00:00:00 2001 From: Tatsuyuki Ishi Date: Thu, 9 Mar 2017 17:49:37 +0900 Subject: [PATCH 2/7] rustbuild: Make save-analysis an option --- configure | 1 + src/bootstrap/config.rs | 4 ++++ src/bootstrap/config.toml.example | 3 +++ src/bootstrap/dist.rs | 10 +--------- src/bootstrap/lib.rs | 2 +- src/ci/run.sh | 1 + 6 files changed, 11 insertions(+), 10 deletions(-) diff --git a/configure b/configure index d8861dacafac..fae457e9c0b1 100755 --- a/configure +++ b/configure @@ -645,6 +645,7 @@ opt dist-host-only 0 "only install bins for the host architecture" opt inject-std-version 1 "inject the current compiler version of libstd into programs" opt llvm-version-check 1 "check if the LLVM version is supported, build anyway" opt codegen-tests 1 "run the src/test/codegen tests" +opt save-analysis 0 "save API analysis data" opt option-checking 1 "complain about unrecognized options in this configure script" opt ninja 0 "build LLVM using the Ninja generator (for MSVC, requires building in the correct environment)" opt locked-deps 0 "force Cargo.lock to be up to date" diff --git a/src/bootstrap/config.rs b/src/bootstrap/config.rs index 87c35e0502ce..b65277035498 100644 --- a/src/bootstrap/config.rs +++ b/src/bootstrap/config.rs @@ -73,6 +73,7 @@ pub struct Config { pub rustc_default_ar: Option, pub rust_optimize_tests: bool, pub rust_debuginfo_tests: bool, + pub rust_save_analysis: bool, pub rust_dist_src: bool, pub build: String, @@ -223,6 +224,7 @@ struct Rust { optimize_tests: Option, debuginfo_tests: Option, codegen_tests: Option, + save_analysis: Option, } /// TOML representation of how each build target is configured. @@ -347,6 +349,7 @@ impl Config { set(&mut config.rust_optimize_tests, rust.optimize_tests); set(&mut config.rust_debuginfo_tests, rust.debuginfo_tests); set(&mut config.codegen_tests, rust.codegen_tests); + set(&mut config.rust_save_analysis, rust.save_analysis); set(&mut config.rust_rpath, rust.rpath); set(&mut config.debug_jemalloc, rust.debug_jemalloc); set(&mut config.use_jemalloc, rust.use_jemalloc); @@ -453,6 +456,7 @@ impl Config { ("LOCAL_REBUILD", self.local_rebuild), ("NINJA", self.ninja), ("CODEGEN_TESTS", self.codegen_tests), + ("SAVE_ANALYSIS", self.rust_save_analysis), ("LOCKED_DEPS", self.locked_deps), ("VENDOR", self.vendor), ("FULL_BOOTSTRAP", self.full_bootstrap), diff --git a/src/bootstrap/config.toml.example b/src/bootstrap/config.toml.example index 776bd729119e..42cf3dcabf4e 100644 --- a/src/bootstrap/config.toml.example +++ b/src/bootstrap/config.toml.example @@ -229,6 +229,9 @@ # saying that the FileCheck executable is missing, you may want to disable this. #codegen-tests = true +# Flag indicating whether the API analysis data should be saved. +#save-analysis = false + # ============================================================================= # Options for specific targets # diff --git a/src/bootstrap/dist.rs b/src/bootstrap/dist.rs index 5c4b718490c0..30f4f6b33df4 100644 --- a/src/bootstrap/dist.rs +++ b/src/bootstrap/dist.rs @@ -313,16 +313,8 @@ pub fn rust_src_location(build: &Build) -> PathBuf { pub fn analysis(build: &Build, compiler: &Compiler, target: &str) { println!("Dist analysis"); - if build.config.channel != "nightly" { - println!("\tskipping - not on nightly channel"); - return; - } if compiler.host != build.config.build { - println!("\tskipping - not a build host"); - return - } - if compiler.stage != 2 { - println!("\tskipping - not stage2"); + println!("\tskipping, not a build host"); return } diff --git a/src/bootstrap/lib.rs b/src/bootstrap/lib.rs index 4831b3808374..f234db98bc3f 100644 --- a/src/bootstrap/lib.rs +++ b/src/bootstrap/lib.rs @@ -524,7 +524,7 @@ impl Build { .env(format!("CFLAGS_{}", target), self.cflags(target).join(" ")); } - if self.config.channel == "nightly" && compiler.is_final_stage(self) { + if self.config.rust_save_analysis && compiler.is_final_stage(self) { cargo.env("RUSTC_SAVE_ANALYSIS", "api".to_string()); } diff --git a/src/ci/run.sh b/src/ci/run.sh index 4c4836d7ca23..55c6196b1ae7 100755 --- a/src/ci/run.sh +++ b/src/ci/run.sh @@ -42,6 +42,7 @@ fi if [ "$DEPLOY$DEPLOY_ALT" != "" ]; then RUST_CONFIGURE_ARGS="$RUST_CONFIGURE_ARGS --release-channel=nightly" RUST_CONFIGURE_ARGS="$RUST_CONFIGURE_ARGS --enable-llvm-static-stdcpp" + RUST_CONFIGURE_ARGS="$RUST_CONFIGURE_ARGS --enable-save-analysis" if [ "$NO_LLVM_ASSERTIONS" = "1" ]; then RUST_CONFIGURE_ARGS="$RUST_CONFIGURE_ARGS --disable-llvm-assertions" From 5a88c7e5a16227bb74d78d36ba4f37ebf5dec8d4 Mon Sep 17 00:00:00 2001 From: Tatsuyuki Ishi Date: Fri, 10 Mar 2017 09:35:17 +0900 Subject: [PATCH 3/7] rustbuild: Skip saving analysis when disabled --- src/bootstrap/dist.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/bootstrap/dist.rs b/src/bootstrap/dist.rs index 30f4f6b33df4..b44f0834f303 100644 --- a/src/bootstrap/dist.rs +++ b/src/bootstrap/dist.rs @@ -311,6 +311,10 @@ pub fn rust_src_location(build: &Build) -> PathBuf { /// Creates a tarball of save-analysis metadata, if available. pub fn analysis(build: &Build, compiler: &Compiler, target: &str) { + if !build.config.rust_save_analysis { + return + } + println!("Dist analysis"); if compiler.host != build.config.build { From d4040c3a3fa9cc416f2f997b52946d6cd6b17e48 Mon Sep 17 00:00:00 2001 From: Tatsuyuki Ishi Date: Sat, 11 Mar 2017 20:00:01 +0900 Subject: [PATCH 4/7] rustbuild: Add save-analysis to install --- src/bootstrap/install.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/bootstrap/install.rs b/src/bootstrap/install.rs index ba8442ebd8c3..249f241a151b 100644 --- a/src/bootstrap/install.rs +++ b/src/bootstrap/install.rs @@ -49,6 +49,10 @@ pub fn install(build: &Build, stage: u32, host: &str) { install_sh(&build, "docs", "rust-docs", stage, host, &prefix, &docdir, &libdir, &mandir, &empty_dir); } + if build.config.rust_save_analysis { + install_sh(&build, "analysis", "rust-analysis", stage, host, &prefix, + &docdir, &libdir, &mandir, &empty_dir); + } install_sh(&build, "std", "rust-std", stage, host, &prefix, &docdir, &libdir, &mandir, &empty_dir); install_sh(&build, "rustc", "rustc", stage, host, &prefix, From 182044248ca2aa569844a25e73f90e5bc2fd05d3 Mon Sep 17 00:00:00 2001 From: Simon Sapin Date: Thu, 2 Mar 2017 17:27:57 +0100 Subject: [PATCH 5/7] Add Utf8Error::resume_from, to help incremental and/or lossy decoding. Without this, code outside of the standard library needs to reimplement most of the logic `from_utf8` to interpret the bytes after `valid_up_to()`. --- src/libcollectionstest/lib.rs | 1 + src/libcollectionstest/str.rs | 30 ++++++++++++++ src/libcore/str/mod.rs | 78 +++++++++++++++++++++++++---------- 3 files changed, 87 insertions(+), 22 deletions(-) diff --git a/src/libcollectionstest/lib.rs b/src/libcollectionstest/lib.rs index d97d9b8ab83f..a7018daf0984 100644 --- a/src/libcollectionstest/lib.rs +++ b/src/libcollectionstest/lib.rs @@ -28,6 +28,7 @@ #![feature(test)] #![feature(unboxed_closures)] #![feature(unicode)] +#![feature(utf8_error_resume_from)] extern crate collections; extern crate test; diff --git a/src/libcollectionstest/str.rs b/src/libcollectionstest/str.rs index 8071c7e8c20d..5de74d68b9ec 100644 --- a/src/libcollectionstest/str.rs +++ b/src/libcollectionstest/str.rs @@ -540,6 +540,36 @@ fn from_utf8_mostly_ascii() { } } +#[test] +fn from_utf8_error() { + macro_rules! test { + ($input: expr, $expected_valid_up_to: expr, $expected_resume_from: expr) => { + let error = from_utf8($input).unwrap_err(); + assert_eq!(error.valid_up_to(), $expected_valid_up_to); + assert_eq!(error.resume_from(), $expected_resume_from); + } + } + test!(b"A\xC3\xA9 \xFF ", 4, Some(5)); + test!(b"A\xC3\xA9 \x80 ", 4, Some(5)); + test!(b"A\xC3\xA9 \xC1 ", 4, Some(5)); + test!(b"A\xC3\xA9 \xC1", 4, Some(5)); + test!(b"A\xC3\xA9 \xC2", 4, None); + test!(b"A\xC3\xA9 \xC2 ", 4, Some(5)); + test!(b"A\xC3\xA9 \xC2\xC0", 4, Some(5)); + test!(b"A\xC3\xA9 \xE0", 4, None); + test!(b"A\xC3\xA9 \xE0\x9F", 4, Some(5)); + test!(b"A\xC3\xA9 \xE0\xA0", 4, None); + test!(b"A\xC3\xA9 \xE0\xA0\xC0", 4, Some(6)); + test!(b"A\xC3\xA9 \xE0\xA0 ", 4, Some(6)); + test!(b"A\xC3\xA9 \xED\xA0\x80 ", 4, Some(5)); + test!(b"A\xC3\xA9 \xF1", 4, None); + test!(b"A\xC3\xA9 \xF1\x80", 4, None); + test!(b"A\xC3\xA9 \xF1\x80\x80", 4, None); + test!(b"A\xC3\xA9 \xF1 ", 4, Some(5)); + test!(b"A\xC3\xA9 \xF1\x80 ", 4, Some(6)); + test!(b"A\xC3\xA9 \xF1\x80\x80 ", 4, Some(7)); +} + #[test] fn test_as_bytes() { // no null diff --git a/src/libcore/str/mod.rs b/src/libcore/str/mod.rs index 52e330163105..eb13d28e82d2 100644 --- a/src/libcore/str/mod.rs +++ b/src/libcore/str/mod.rs @@ -125,13 +125,14 @@ Section: Creating a string #[stable(feature = "rust1", since = "1.0.0")] pub struct Utf8Error { valid_up_to: usize, + invalid_length: Option, } impl Utf8Error { /// Returns the index in the given string up to which valid UTF-8 was /// verified. /// - /// It is the maximum index such that `from_utf8(input[..index])` + /// It is the maximum index such that `from_utf8(&input[..index])` /// would return `Ok(_)`. /// /// # Examples @@ -152,6 +153,21 @@ impl Utf8Error { /// ``` #[stable(feature = "utf8_error", since = "1.5.0")] pub fn valid_up_to(&self) -> usize { self.valid_up_to } + + /// Provide more information about the failure: + /// + /// * `None`: the end of the input was reached unexpectedly. + /// `self.valid_up_to()` is 1 to 3 bytes from the end of the input. + /// If a byte stream (such as a file or a network socket) is being decoded incrementally, + /// this could be a valid `char` whose UTF-8 byte sequence is spanning multiple chunks. + /// + /// * `Some(index)`: an unexpected byte was encountered. + /// The index provided is where decoding should resume + /// (after inserting a U+FFFD REPLACEMENT CHARACTER) in case of lossy decoding. + #[unstable(feature = "utf8_error_resume_from", reason ="new", issue = "0")] + pub fn resume_from(&self) -> Option { + self.invalid_length.map(|l| self.valid_up_to + l as usize) + } } /// Converts a slice of bytes to a string slice. @@ -300,7 +316,12 @@ pub unsafe fn from_utf8_unchecked(v: &[u8]) -> &str { #[stable(feature = "rust1", since = "1.0.0")] impl fmt::Display for Utf8Error { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!(f, "invalid utf-8: invalid byte near index {}", self.valid_up_to) + if let Some(invalid_length) = self.invalid_length { + write!(f, "invalid utf-8 sequence of {} bytes from index {}", + invalid_length, self.valid_up_to) + } else { + write!(f, "incomplete utf-8 byte sequence from index {}", self.valid_up_to) + } } } @@ -1241,17 +1262,20 @@ fn run_utf8_validation(v: &[u8]) -> Result<(), Utf8Error> { while index < len { let old_offset = index; - macro_rules! err { () => {{ - return Err(Utf8Error { - valid_up_to: old_offset - }) - }}} + macro_rules! err { + ($invalid_length: expr) => { + return Err(Utf8Error { + valid_up_to: old_offset, + invalid_length: $invalid_length, + }) + } + } macro_rules! next { () => {{ index += 1; // we needed data, but there was none: error! if index >= len { - err!() + err!(None) } v[index] }}} @@ -1259,7 +1283,6 @@ fn run_utf8_validation(v: &[u8]) -> Result<(), Utf8Error> { let first = v[index]; if first >= 128 { let w = UTF8_CHAR_WIDTH[first as usize]; - let second = next!(); // 2-byte encoding is for codepoints \u{0080} to \u{07ff} // first C2 80 last DF BF // 3-byte encoding is for codepoints \u{0800} to \u{ffff} @@ -1279,25 +1302,36 @@ fn run_utf8_validation(v: &[u8]) -> Result<(), Utf8Error> { // UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) / // %xF4 %x80-8F 2( UTF8-tail ) match w { - 2 => if second & !CONT_MASK != TAG_CONT_U8 {err!()}, + 2 => if next!() & !CONT_MASK != TAG_CONT_U8 { + err!(Some(1)) + }, 3 => { - match (first, second, next!() & !CONT_MASK) { - (0xE0 , 0xA0 ... 0xBF, TAG_CONT_U8) | - (0xE1 ... 0xEC, 0x80 ... 0xBF, TAG_CONT_U8) | - (0xED , 0x80 ... 0x9F, TAG_CONT_U8) | - (0xEE ... 0xEF, 0x80 ... 0xBF, TAG_CONT_U8) => {} - _ => err!() + match (first, next!()) { + (0xE0 , 0xA0 ... 0xBF) | + (0xE1 ... 0xEC, 0x80 ... 0xBF) | + (0xED , 0x80 ... 0x9F) | + (0xEE ... 0xEF, 0x80 ... 0xBF) => {} + _ => err!(Some(1)) + } + if next!() & !CONT_MASK != TAG_CONT_U8 { + err!(Some(2)) } } 4 => { - match (first, second, next!() & !CONT_MASK, next!() & !CONT_MASK) { - (0xF0 , 0x90 ... 0xBF, TAG_CONT_U8, TAG_CONT_U8) | - (0xF1 ... 0xF3, 0x80 ... 0xBF, TAG_CONT_U8, TAG_CONT_U8) | - (0xF4 , 0x80 ... 0x8F, TAG_CONT_U8, TAG_CONT_U8) => {} - _ => err!() + match (first, next!()) { + (0xF0 , 0x90 ... 0xBF) | + (0xF1 ... 0xF3, 0x80 ... 0xBF) | + (0xF4 , 0x80 ... 0x8F) => {} + _ => err!(Some(1)) + } + if next!() & !CONT_MASK != TAG_CONT_U8 { + err!(Some(2)) + } + if next!() & !CONT_MASK != TAG_CONT_U8 { + err!(Some(3)) } } - _ => err!() + _ => err!(Some(1)) } index += 1; } else { From b5f16a10e9406fc1c19294fee1c33e507a17458e Mon Sep 17 00:00:00 2001 From: Simon Sapin Date: Mon, 6 Mar 2017 22:06:30 +0100 Subject: [PATCH 6/7] Replace Utf8Error::resume_from with Utf8Error::error_len Their relationship is: * `resume_from = error_len.map(|l| l + valid_up_to)` * error_len is always one of None, Some(1), Some(2), or Some(3). When I started using resume_from I almost always ended up subtracting valid_up_to to obtain error_len. Therefore the latter is what should be provided in the first place. --- src/libcollectionstest/lib.rs | 2 +- src/libcollectionstest/str.rs | 30 +++++++++++++++--------------- src/libcore/str/mod.rs | 22 ++++++++++++---------- 3 files changed, 28 insertions(+), 26 deletions(-) diff --git a/src/libcollectionstest/lib.rs b/src/libcollectionstest/lib.rs index a7018daf0984..98d0b1c8e156 100644 --- a/src/libcollectionstest/lib.rs +++ b/src/libcollectionstest/lib.rs @@ -28,7 +28,7 @@ #![feature(test)] #![feature(unboxed_closures)] #![feature(unicode)] -#![feature(utf8_error_resume_from)] +#![feature(utf8_error_error_len)] extern crate collections; extern crate test; diff --git a/src/libcollectionstest/str.rs b/src/libcollectionstest/str.rs index 5de74d68b9ec..c9b7104fec4f 100644 --- a/src/libcollectionstest/str.rs +++ b/src/libcollectionstest/str.rs @@ -543,31 +543,31 @@ fn from_utf8_mostly_ascii() { #[test] fn from_utf8_error() { macro_rules! test { - ($input: expr, $expected_valid_up_to: expr, $expected_resume_from: expr) => { + ($input: expr, $expected_valid_up_to: expr, $expected_error_len: expr) => { let error = from_utf8($input).unwrap_err(); assert_eq!(error.valid_up_to(), $expected_valid_up_to); - assert_eq!(error.resume_from(), $expected_resume_from); + assert_eq!(error.error_len(), $expected_error_len); } } - test!(b"A\xC3\xA9 \xFF ", 4, Some(5)); - test!(b"A\xC3\xA9 \x80 ", 4, Some(5)); - test!(b"A\xC3\xA9 \xC1 ", 4, Some(5)); - test!(b"A\xC3\xA9 \xC1", 4, Some(5)); + test!(b"A\xC3\xA9 \xFF ", 4, Some(1)); + test!(b"A\xC3\xA9 \x80 ", 4, Some(1)); + test!(b"A\xC3\xA9 \xC1 ", 4, Some(1)); + test!(b"A\xC3\xA9 \xC1", 4, Some(1)); test!(b"A\xC3\xA9 \xC2", 4, None); - test!(b"A\xC3\xA9 \xC2 ", 4, Some(5)); - test!(b"A\xC3\xA9 \xC2\xC0", 4, Some(5)); + test!(b"A\xC3\xA9 \xC2 ", 4, Some(1)); + test!(b"A\xC3\xA9 \xC2\xC0", 4, Some(1)); test!(b"A\xC3\xA9 \xE0", 4, None); - test!(b"A\xC3\xA9 \xE0\x9F", 4, Some(5)); + test!(b"A\xC3\xA9 \xE0\x9F", 4, Some(1)); test!(b"A\xC3\xA9 \xE0\xA0", 4, None); - test!(b"A\xC3\xA9 \xE0\xA0\xC0", 4, Some(6)); - test!(b"A\xC3\xA9 \xE0\xA0 ", 4, Some(6)); - test!(b"A\xC3\xA9 \xED\xA0\x80 ", 4, Some(5)); + test!(b"A\xC3\xA9 \xE0\xA0\xC0", 4, Some(2)); + test!(b"A\xC3\xA9 \xE0\xA0 ", 4, Some(2)); + test!(b"A\xC3\xA9 \xED\xA0\x80 ", 4, Some(1)); test!(b"A\xC3\xA9 \xF1", 4, None); test!(b"A\xC3\xA9 \xF1\x80", 4, None); test!(b"A\xC3\xA9 \xF1\x80\x80", 4, None); - test!(b"A\xC3\xA9 \xF1 ", 4, Some(5)); - test!(b"A\xC3\xA9 \xF1\x80 ", 4, Some(6)); - test!(b"A\xC3\xA9 \xF1\x80\x80 ", 4, Some(7)); + test!(b"A\xC3\xA9 \xF1 ", 4, Some(1)); + test!(b"A\xC3\xA9 \xF1\x80 ", 4, Some(2)); + test!(b"A\xC3\xA9 \xF1\x80\x80 ", 4, Some(3)); } #[test] diff --git a/src/libcore/str/mod.rs b/src/libcore/str/mod.rs index eb13d28e82d2..63b12932c3d6 100644 --- a/src/libcore/str/mod.rs +++ b/src/libcore/str/mod.rs @@ -125,7 +125,7 @@ Section: Creating a string #[stable(feature = "rust1", since = "1.0.0")] pub struct Utf8Error { valid_up_to: usize, - invalid_length: Option, + error_len: Option, } impl Utf8Error { @@ -161,12 +161,14 @@ impl Utf8Error { /// If a byte stream (such as a file or a network socket) is being decoded incrementally, /// this could be a valid `char` whose UTF-8 byte sequence is spanning multiple chunks. /// - /// * `Some(index)`: an unexpected byte was encountered. - /// The index provided is where decoding should resume + /// * `Some(len)`: an unexpected byte was encountered. + /// The length provided is that of the invalid byte sequence + /// that starts at the index given by `valid_up_to()`. + /// Decoding should resume after that sequence /// (after inserting a U+FFFD REPLACEMENT CHARACTER) in case of lossy decoding. - #[unstable(feature = "utf8_error_resume_from", reason ="new", issue = "0")] - pub fn resume_from(&self) -> Option { - self.invalid_length.map(|l| self.valid_up_to + l as usize) + #[unstable(feature = "utf8_error_error_len", reason ="new", issue = "0")] + pub fn error_len(&self) -> Option { + self.error_len.map(|len| len as usize) } } @@ -316,9 +318,9 @@ pub unsafe fn from_utf8_unchecked(v: &[u8]) -> &str { #[stable(feature = "rust1", since = "1.0.0")] impl fmt::Display for Utf8Error { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - if let Some(invalid_length) = self.invalid_length { + if let Some(error_len) = self.error_len { write!(f, "invalid utf-8 sequence of {} bytes from index {}", - invalid_length, self.valid_up_to) + error_len, self.valid_up_to) } else { write!(f, "incomplete utf-8 byte sequence from index {}", self.valid_up_to) } @@ -1263,10 +1265,10 @@ fn run_utf8_validation(v: &[u8]) -> Result<(), Utf8Error> { while index < len { let old_offset = index; macro_rules! err { - ($invalid_length: expr) => { + ($error_len: expr) => { return Err(Utf8Error { valid_up_to: old_offset, - invalid_length: $invalid_length, + error_len: $error_len, }) } } From 73370c543ea130a3d6d9097aa56b786c72dc6c94 Mon Sep 17 00:00:00 2001 From: Simon Sapin Date: Mon, 13 Mar 2017 23:54:06 +0100 Subject: [PATCH 7/7] Add tracking issue number for Utf8Error::error_len --- src/libcore/str/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/libcore/str/mod.rs b/src/libcore/str/mod.rs index 63b12932c3d6..2919adc1cbc6 100644 --- a/src/libcore/str/mod.rs +++ b/src/libcore/str/mod.rs @@ -166,7 +166,7 @@ impl Utf8Error { /// that starts at the index given by `valid_up_to()`. /// Decoding should resume after that sequence /// (after inserting a U+FFFD REPLACEMENT CHARACTER) in case of lossy decoding. - #[unstable(feature = "utf8_error_error_len", reason ="new", issue = "0")] + #[unstable(feature = "utf8_error_error_len", reason ="new", issue = "40494")] pub fn error_len(&self) -> Option { self.error_len.map(|len| len as usize) }