From 3ac00a94895f4d9a8119ef35494fddb9d66436ae Mon Sep 17 00:00:00 2001 From: Huon Wilson Date: Tue, 11 Jun 2013 21:37:22 +1000 Subject: [PATCH] std: remove substr & str::count_*, methodise char_len, implement slice_chars. The confusing mixture of byte index and character count meant that every use of .substr was incorrect; replaced by slice_chars which only uses character indices. The old behaviour of `.substr(start, n)` can be emulated via `.slice_from(start).slice_chars(0, n)`. --- doc/tutorial.md | 2 +- src/libextra/rope.rs | 24 +++--- src/libextra/terminfo/searcher.rs | 6 +- src/librustc/driver/driver.rs | 2 +- src/librustc/middle/resolve.rs | 12 +-- src/librustc/middle/trans/expr.rs | 2 +- src/librusti/rusti.rc | 5 +- src/libstd/str.rs | 117 +++++++++++------------------- src/libstd/unstable/extfmt.rs | 8 +- src/test/run-pass/utf8_chars.rs | 2 +- 10 files changed, 76 insertions(+), 104 deletions(-) diff --git a/doc/tutorial.md b/doc/tutorial.md index f487398d19c2..a45803611666 100644 --- a/doc/tutorial.md +++ b/doc/tutorial.md @@ -1410,7 +1410,7 @@ let new_favorite_crayon_name = favorite_crayon_name.trim(); if favorite_crayon_name.len() > 5 { // Create a substring - println(favorite_crayon_name.substr(0, 5)); + println(favorite_crayon_name.slice_chars(0, 5)); } ~~~ diff --git a/src/libextra/rope.rs b/src/libextra/rope.rs index de5496414234..0856c256c5b1 100644 --- a/src/libextra/rope.rs +++ b/src/libextra/rope.rs @@ -84,9 +84,9 @@ pub fn of_str(str: @~str) -> Rope { * * # Return value * - * A rope representing the same string as `str.substr(byte_offset, - * byte_len)`. Depending on `byte_len`, this rope may be empty, flat - * or complex. + * A rope representing the same string as `str.slice(byte_offset, + * byte_offset + byte_len)`. Depending on `byte_len`, this rope may + * be empty, flat or complex. * * # Performance note * @@ -588,7 +588,7 @@ pub mod node { * * char_len - The number of chars in the leaf. * * content - Contents of the leaf. * - * Note that we can have `char_len < str::char_len(content)`, if + * Note that we can have `char_len < content.char_len()`, if * this leaf is only a subset of the string. Also note that the * string can be shared between several ropes, e.g. for indexing * purposes. @@ -680,7 +680,7 @@ pub mod node { */ pub fn of_substr(str: @~str, byte_start: uint, byte_len: uint) -> @Node { return of_substr_unsafer(str, byte_start, byte_len, - str::count_chars(*str, byte_start, byte_len)); + str.slice(byte_start, byte_start + byte_len).char_len()); } /** @@ -734,7 +734,7 @@ pub mod node { if i == 0u { first_leaf_char_len } else { hint_max_leaf_char_len }; let chunk_byte_len = - str::count_bytes(*str, offset, chunk_char_len); + str.slice_from(offset).slice_chars(0, chunk_char_len).len(); nodes[i] = @Leaf(Leaf { byte_offset: offset, byte_len: chunk_byte_len, @@ -938,7 +938,7 @@ pub mod node { match (*node) { node::Leaf(x) => { let char_len = - str::count_chars(*x.content, byte_offset, byte_len); + x.content.slice(byte_offset, byte_offset + byte_len).char_len(); return @Leaf(Leaf { byte_offset: byte_offset, byte_len: byte_len, @@ -1002,9 +1002,9 @@ pub mod node { return node; } let byte_offset = - str::count_bytes(*x.content, 0u, char_offset); + x.content.slice_chars(0, char_offset).len(); let byte_len = - str::count_bytes(*x.content, byte_offset, char_len); + x.content.slice_from(byte_offset).slice_chars(0, char_len).len(); return @Leaf(Leaf { byte_offset: byte_offset, byte_len: byte_len, @@ -1312,7 +1312,7 @@ mod tests { let sample = @~"0123456789ABCDE"; let r = of_str(sample); - assert_eq!(char_len(r), str::char_len(*sample)); + assert_eq!(char_len(r), sample.char_len()); assert!(rope_to_string(r) == *sample); } @@ -1328,7 +1328,7 @@ mod tests { } let sample = @copy *buf; let r = of_str(sample); - assert!(char_len(r) == str::char_len(*sample)); + assert_eq!(char_len(r), sample.char_len()); assert!(rope_to_string(r) == *sample); let mut string_iter = 0u; @@ -1374,7 +1374,7 @@ mod tests { } } - assert_eq!(len, str::char_len(*sample)); + assert_eq!(len, sample.char_len()); } #[test] diff --git a/src/libextra/terminfo/searcher.rs b/src/libextra/terminfo/searcher.rs index ecc99f74626c..48a3e1e9c695 100644 --- a/src/libextra/terminfo/searcher.rs +++ b/src/libextra/terminfo/searcher.rs @@ -27,7 +27,7 @@ pub fn get_dbpath_for_term(term: &str) -> Option<~path> { let homedir = os::homedir(); let mut dirs_to_search = ~[]; - let first_char = term.substr(0, 1); + let first_char = term.char_at(0); // Find search directory match getenv("TERMINFO") { @@ -57,12 +57,12 @@ pub fn get_dbpath_for_term(term: &str) -> Option<~path> { // Look for the terminal in all of the search directories for dirs_to_search.each |p| { - let newp = ~p.push_many(&[first_char.to_owned(), term.to_owned()]); + let newp = ~p.push_many(&[str::from_char(first_char), term.to_owned()]); if os::path_exists(p) && os::path_exists(newp) { return Some(newp); } // on some installations the dir is named after the hex of the char (e.g. OS X) - let newp = ~p.push_many(&[fmt!("%x", first_char[0] as uint), term.to_owned()]); + let newp = ~p.push_many(&[fmt!("%x", first_char as uint), term.to_owned()]); if os::path_exists(p) && os::path_exists(newp) { return Some(newp); } diff --git a/src/librustc/driver/driver.rs b/src/librustc/driver/driver.rs index 2b6f4067ccae..7a631b5fc683 100644 --- a/src/librustc/driver/driver.rs +++ b/src/librustc/driver/driver.rs @@ -590,7 +590,7 @@ pub fn build_session_options(binary: @~str, // FIXME: #4318 Instead of to_ascii and to_str_ascii, could use // to_ascii_consume and to_str_consume to not do a unnecessary copy. - let level_short = level_name.substr(0,1); + let level_short = level_name.slice_chars(0, 1); let level_short = level_short.to_ascii().to_upper().to_str_ascii(); let flags = vec::append(getopts::opt_strs(matches, level_short), getopts::opt_strs(matches, level_name)); diff --git a/src/librustc/middle/resolve.rs b/src/librustc/middle/resolve.rs index 7c9877a838ad..1adb991c96e7 100644 --- a/src/librustc/middle/resolve.rs +++ b/src/librustc/middle/resolve.rs @@ -2678,14 +2678,14 @@ impl Resolver { match module_prefix_result { Failed => { let mpath = self.idents_to_str(module_path); - match self.idents_to_str(module_path).rfind(':') { + match mpath.rfind(':') { Some(idx) => { self.session.span_err(span, fmt!("unresolved import: could not find `%s` \ - in `%s`", mpath.substr(idx, - mpath.len() - idx), - // idx - 1 to account for the extra - // colon - mpath.substr(0, idx - 1))); + in `%s`", + // idx +- 1 to account for the colons + // on either side + mpath.slice_from(idx + 1), + mpath.slice_to(idx - 1))); }, None => (), }; diff --git a/src/librustc/middle/trans/expr.rs b/src/librustc/middle/trans/expr.rs index 5b286a783119..65755345ac3d 100644 --- a/src/librustc/middle/trans/expr.rs +++ b/src/librustc/middle/trans/expr.rs @@ -1704,5 +1704,5 @@ fn trans_assign_op(bcx: block, } fn shorten(x: ~str) -> ~str { - if x.len() > 60 { x.substr(0, 60).to_owned() } else { x } + if x.char_len() > 60 { x.slice_chars(0, 60).to_owned() } else { x } } diff --git a/src/librusti/rusti.rc b/src/librusti/rusti.rc index f8b39d9fbc72..3e1883062f33 100644 --- a/src/librusti/rusti.rc +++ b/src/librusti/rusti.rc @@ -284,7 +284,7 @@ fn run_cmd(repl: &mut Repl, _in: @io::Reader, _out: @io::Writer, for args.each |arg| { let (crate, filename) = if arg.ends_with(".rs") || arg.ends_with(".rc") { - (arg.substr(0, arg.len() - 3).to_owned(), copy *arg) + (arg.slice_to(arg.len() - 3).to_owned(), copy *arg) } else { (copy *arg, arg + ".rs") }; @@ -342,7 +342,8 @@ pub fn run_line(repl: &mut Repl, in: @io::Reader, out: @io::Writer, line: ~str, // FIXME #5898: conflicts with Cell.take(), so can't be at the top level use core::iterator::IteratorUtil; - let full = line.substr(1, line.len() - 1); + // drop the : and the \n (one byte each) + let full = line.slice(1, line.len() - 1); let split: ~[~str] = full.word_iter().transform(|s| s.to_owned()).collect(); let len = split.len(); diff --git a/src/libstd/str.rs b/src/libstd/str.rs index 20d2194de07b..0c99dd4f9fe0 100644 --- a/src/libstd/str.rs +++ b/src/libstd/str.rs @@ -848,15 +848,6 @@ fn match_at<'a,'b>(haystack: &'a str, needle: &'b str, at: uint) -> bool { return true; } - -/* -Section: String properties -*/ - -/// Returns the number of characters that a string holds -#[inline(always)] -pub fn char_len(s: &str) -> uint { count_chars(s, 0u, s.len()) } - /* Section: Misc */ @@ -974,46 +965,6 @@ pub fn with_capacity(capacity: uint) -> ~str { buf } -/** - * As char_len but for a slice of a string - * - * # Arguments - * - * * s - A valid string - * * start - The position inside `s` where to start counting in bytes - * * end - The position where to stop counting - * - * # Return value - * - * The number of Unicode characters in `s` between the given indices. - */ -pub fn count_chars(s: &str, start: uint, end: uint) -> uint { - assert!(s.is_char_boundary(start)); - assert!(s.is_char_boundary(end)); - let mut (i, len) = (start, 0u); - while i < end { - let next = s.char_range_at(i).next; - len += 1u; - i = next; - } - return len; -} - -/// Counts the number of bytes taken by the first `n` chars in `s` -/// starting from `start`. -pub fn count_bytes<'b>(s: &'b str, start: uint, n: uint) -> uint { - assert!(s.is_char_boundary(start)); - let mut (end, cnt) = (start, n); - let l = s.len(); - while cnt > 0u { - assert!(end < l); - let next = s.char_range_at(end).next; - cnt -= 1u; - end = next; - } - end - start -} - /// Given a first byte, determine how many bytes are in this UTF-8 character pub fn utf8_char_width(b: u8) -> uint { let byte: uint = b as uint; @@ -1394,11 +1345,14 @@ pub trait StrSlice<'self> { fn is_alphanumeric(&self) -> bool; fn len(&self) -> uint; fn char_len(&self) -> uint; + fn slice(&self, begin: uint, end: uint) -> &'self str; fn slice_from(&self, begin: uint) -> &'self str; fn slice_to(&self, end: uint) -> &'self str; + + fn slice_chars(&self, begin: uint, end: uint) -> &'self str; + fn starts_with(&self, needle: &str) -> bool; - fn substr(&self, begin: uint, n: uint) -> &'self str; fn escape_default(&self) -> ~str; fn escape_unicode(&self) -> ~str; fn trim(&self) -> &'self str; @@ -1595,7 +1549,8 @@ impl<'self> StrSlice<'self> for &'self str { } /// Returns the number of characters that a string holds #[inline] - fn char_len(&self) -> uint { char_len(*self) } + fn char_len(&self) -> uint { self.iter().count() } + /** * Returns a slice of the given string from the byte range * [`begin`..`end`) @@ -1626,6 +1581,32 @@ impl<'self> StrSlice<'self> for &'self str { fn slice_to(&self, end: uint) -> &'self str { self.slice(0, end) } + + /// Returns a slice of the string from the char range + /// [`begin`..`end`). + /// + /// Fails if `begin` > `end` or the either `begin` or `end` are + /// beyond the last character of the string. + fn slice_chars(&self, begin: uint, end: uint) -> &'self str { + assert!(begin <= end); + // not sure how to use the iterators for this nicely. + let mut (position, count) = (0, 0); + let l = self.len(); + while count < begin && position < l { + position = self.char_range_at(position).next; + count += 1; + } + if count < begin { fail!("Attempted to begin slice_chars beyond end of string") } + let start_byte = position; + while count < end && position < l { + position = self.char_range_at(position).next; + count += 1; + } + if count < end { fail!("Attempted to end slice_chars beyond end of string") } + + self.slice(start_byte, position) + } + /// Returns true if `needle` is a prefix of the string. fn starts_with<'a>(&self, needle: &'a str) -> bool { let (self_len, needle_len) = (self.len(), needle.len()); @@ -1641,16 +1622,6 @@ impl<'self> StrSlice<'self> for &'self str { else { match_at(*self, needle, self_len - needle_len) } } - /** - * Take a substring of another. - * - * Returns a string containing `n` characters starting at byte offset - * `begin`. - */ - #[inline] - fn substr(&self, begin: uint, n: uint) -> &'self str { - self.slice(begin, begin + count_bytes(*self, begin, n)) - } /// Escape each char in `s` with char::escape_default. #[inline] fn escape_default(&self) -> ~str { escape_default(*self) } @@ -2367,14 +2338,14 @@ mod tests { assert_eq!("\u2620".len(), 3u); assert_eq!("\U0001d11e".len(), 4u); - assert_eq!(char_len(""), 0u); - assert_eq!(char_len("hello world"), 11u); - assert_eq!(char_len("\x63"), 1u); - assert_eq!(char_len("\xa2"), 1u); - assert_eq!(char_len("\u03c0"), 1u); - assert_eq!(char_len("\u2620"), 1u); - assert_eq!(char_len("\U0001d11e"), 1u); - assert_eq!(char_len("ประเทศไทย中华Việt Nam"), 19u); + assert_eq!("".char_len(), 0u); + assert_eq!("hello world".char_len(), 11u); + assert_eq!("\x63".char_len(), 1u); + assert_eq!("\xa2".char_len(), 1u); + assert_eq!("\u03c0".char_len(), 1u); + assert_eq!("\u2620".char_len(), 1u); + assert_eq!("\U0001d11e".char_len(), 1u); + assert_eq!("ประเทศไทย中华Việt Nam".char_len(), 19u); } #[test] @@ -2509,13 +2480,13 @@ mod tests { } #[test] - fn test_substr() { - fn t(a: &str, b: &str, start: int) { - assert_eq!(a.substr(start as uint, b.len()), b); + fn test_slice_chars() { + fn t(a: &str, b: &str, start: uint) { + assert_eq!(a.slice_chars(start, start + b.char_len()), b); } t("hello", "llo", 2); t("hello", "el", 1); - assert_eq!("ะเทศไท", "ประเทศไทย中华Việt Nam".substr(6u, 6u)); + assert_eq!("ะเทศไท", "ประเทศไทย中华Việt Nam".slice_chars(2, 8)); } #[test] diff --git a/src/libstd/unstable/extfmt.rs b/src/libstd/unstable/extfmt.rs index 8a6f58d7992e..7d9ce585d7c5 100644 --- a/src/libstd/unstable/extfmt.rs +++ b/src/libstd/unstable/extfmt.rs @@ -325,7 +325,7 @@ pub mod ct { 'o' => TyOctal, 'f' => TyFloat, '?' => TyPoly, - _ => err(~"unknown type in conversion: " + s.substr(i, 1)) + _ => err(fmt!("unknown type in conversion: %c", s.char_at(i))) }; Parsed::new(t, i + 1) @@ -546,7 +546,7 @@ pub mod rt { // displayed let unpadded = match cv.precision { CountImplied => s, - CountIs(max) => if (max as uint) < str::char_len(s) { + CountIs(max) => if (max as uint) < s.char_len() { s.slice(0, max as uint) } else { s @@ -584,7 +584,7 @@ pub mod rt { ~"" } else { let s = uint::to_str_radix(num, radix); - let len = str::char_len(s); + let len = s.char_len(); if len < prec { let diff = prec - len; let pad = str::from_chars(vec::from_elem(diff, '0')); @@ -614,7 +614,7 @@ pub mod rt { } CountIs(width) => { width as uint } }; - let strlen = str::char_len(s) + headsize; + let strlen = s.char_len() + headsize; if uwidth <= strlen { for head.iter().advance |&c| { buf.push_char(c); diff --git a/src/test/run-pass/utf8_chars.rs b/src/test/run-pass/utf8_chars.rs index 11f0d59c7002..c126a84e7822 100644 --- a/src/test/run-pass/utf8_chars.rs +++ b/src/test/run-pass/utf8_chars.rs @@ -21,7 +21,7 @@ pub fn main() { let schs: ~[char] = s.iter().collect(); assert!(s.len() == 10u); - assert!(str::char_len(s) == 4u); + assert!(s.char_len() == 4u); assert!(schs.len() == 4u); assert!(str::from_chars(schs) == s); assert!(s.char_at(0u) == 'e');