auto merge of #5980 : Kimundi/rust/ascii-encoding, r=thestinger
Added Ascii type to use for byte inputs that are known to contain Ascii only.
This commit is contained in:
commit
05f9586d06
6 changed files with 320 additions and 31 deletions
|
|
@ -1,4 +1,4 @@
|
|||
// Copyright 2012 The Rust Project Developers. See the COPYRIGHT
|
||||
// Copyright 2012-2013 The Rust Project Developers. See the COPYRIGHT
|
||||
// file at the top-level directory of this distribution and at
|
||||
// http://rust-lang.org/COPYRIGHT.
|
||||
//
|
||||
|
|
@ -234,6 +234,21 @@ pub fn escape_default(c: char) -> ~str {
|
|||
}
|
||||
}
|
||||
|
||||
/// Returns the number of bytes this character would need if encoded in UTF-8
|
||||
pub fn len_utf8_bytes(c: char) -> uint {
|
||||
static max_one_b: uint = 128u;
|
||||
static max_two_b: uint = 2048u;
|
||||
static max_three_b: uint = 65536u;
|
||||
static max_four_b: uint = 2097152u;
|
||||
|
||||
let code = c as uint;
|
||||
if code < max_one_b { 1u }
|
||||
else if code < max_two_b { 2u }
|
||||
else if code < max_three_b { 3u }
|
||||
else if code < max_four_b { 4u }
|
||||
else { fail!(~"invalid character!") }
|
||||
}
|
||||
|
||||
/**
|
||||
* Compare two chars
|
||||
*
|
||||
|
|
@ -334,7 +349,6 @@ fn test_escape_default() {
|
|||
assert_eq!(escape_default('\U0001d4b6'), ~"\\U0001d4b6");
|
||||
}
|
||||
|
||||
|
||||
#[test]
|
||||
fn test_escape_unicode() {
|
||||
assert_eq!(escape_unicode('\x00'), ~"\\x00");
|
||||
|
|
|
|||
|
|
@ -164,6 +164,9 @@ pub mod vec;
|
|||
pub mod at_vec;
|
||||
pub mod str;
|
||||
|
||||
#[path = "str/ascii.rs"]
|
||||
pub mod ascii;
|
||||
|
||||
pub mod ptr;
|
||||
pub mod owned;
|
||||
pub mod managed;
|
||||
|
|
|
|||
|
|
@ -45,9 +45,10 @@ pub use path::Path;
|
|||
pub use path::PosixPath;
|
||||
pub use path::WindowsPath;
|
||||
pub use ptr::Ptr;
|
||||
pub use ascii::{Ascii, AsciiCast, OwnedAsciiCast, AsciiStr};
|
||||
pub use str::{StrSlice, OwnedStr};
|
||||
pub use to_bytes::IterBytes;
|
||||
pub use to_str::ToStr;
|
||||
pub use to_str::{ToStr, ToStrConsume};
|
||||
pub use tuple::{CopyableTuple, ImmutableTuple, ExtendedTupleOps};
|
||||
pub use vec::{CopyableVector, ImmutableVector};
|
||||
pub use vec::{ImmutableEqVector, ImmutableCopyableVector};
|
||||
|
|
|
|||
|
|
@ -1,4 +1,4 @@
|
|||
// Copyright 2012 The Rust Project Developers. See the COPYRIGHT
|
||||
// Copyright 2012-2013 The Rust Project Developers. See the COPYRIGHT
|
||||
// file at the top-level directory of this distribution and at
|
||||
// http://rust-lang.org/COPYRIGHT.
|
||||
//
|
||||
|
|
@ -789,16 +789,18 @@ pub fn each_split_within<'a>(ss: &'a str,
|
|||
|
||||
/// Convert a string to lowercase. ASCII only
|
||||
pub fn to_lower(s: &str) -> ~str {
|
||||
map(s,
|
||||
|c| unsafe{(libc::tolower(c as libc::c_char)) as char}
|
||||
)
|
||||
do map(s) |c| {
|
||||
assert!(char::is_ascii(c));
|
||||
(unsafe{libc::tolower(c as libc::c_char)}) as char
|
||||
}
|
||||
}
|
||||
|
||||
/// Convert a string to uppercase. ASCII only
|
||||
pub fn to_upper(s: &str) -> ~str {
|
||||
map(s,
|
||||
|c| unsafe{(libc::toupper(c as libc::c_char)) as char}
|
||||
)
|
||||
do map(s) |c| {
|
||||
assert!(char::is_ascii(c));
|
||||
(unsafe{libc::toupper(c as libc::c_char)}) as char
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -2317,20 +2319,20 @@ pub mod raw {
|
|||
}
|
||||
|
||||
/// Removes the last byte from a string and returns it. (Not UTF-8 safe).
|
||||
pub fn pop_byte(s: &mut ~str) -> u8 {
|
||||
pub unsafe fn pop_byte(s: &mut ~str) -> u8 {
|
||||
let len = len(*s);
|
||||
assert!((len > 0u));
|
||||
let b = s[len - 1u];
|
||||
unsafe { set_len(s, len - 1u) };
|
||||
set_len(s, len - 1u);
|
||||
return b;
|
||||
}
|
||||
|
||||
/// Removes the first byte from a string and returns it. (Not UTF-8 safe).
|
||||
pub fn shift_byte(s: &mut ~str) -> u8 {
|
||||
pub unsafe fn shift_byte(s: &mut ~str) -> u8 {
|
||||
let len = len(*s);
|
||||
assert!((len > 0u));
|
||||
let b = s[0];
|
||||
*s = unsafe { raw::slice_bytes_owned(*s, 1u, len) };
|
||||
*s = raw::slice_bytes_owned(*s, 1u, len);
|
||||
return b;
|
||||
}
|
||||
|
||||
|
|
@ -3096,12 +3098,11 @@ mod tests {
|
|||
|
||||
#[test]
|
||||
fn test_to_lower() {
|
||||
unsafe {
|
||||
assert!(~"" == map(~"",
|
||||
|c| libc::tolower(c as c_char) as char));
|
||||
assert!(~"ymca" == map(~"YMCA",
|
||||
|c| libc::tolower(c as c_char) as char));
|
||||
}
|
||||
// libc::tolower, and hence str::to_lower
|
||||
// are culturally insensitive: they only work for ASCII
|
||||
// (see Issue #1347)
|
||||
assert!(~"" == to_lower(""));
|
||||
assert!(~"ymca" == to_lower("YMCA"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
|
@ -3346,7 +3347,7 @@ mod tests {
|
|||
#[test]
|
||||
fn test_shift_byte() {
|
||||
let mut s = ~"ABC";
|
||||
let b = raw::shift_byte(&mut s);
|
||||
let b = unsafe{raw::shift_byte(&mut s)};
|
||||
assert!((s == ~"BC"));
|
||||
assert!((b == 65u8));
|
||||
}
|
||||
|
|
@ -3354,7 +3355,7 @@ mod tests {
|
|||
#[test]
|
||||
fn test_pop_byte() {
|
||||
let mut s = ~"ABC";
|
||||
let b = raw::pop_byte(&mut s);
|
||||
let b = unsafe{raw::pop_byte(&mut s)};
|
||||
assert!((s == ~"AB"));
|
||||
assert!((b == 67u8));
|
||||
}
|
||||
|
|
@ -3666,12 +3667,8 @@ mod tests {
|
|||
|
||||
#[test]
|
||||
fn test_map() {
|
||||
unsafe {
|
||||
assert!(~"" == map(~"", |c|
|
||||
libc::toupper(c as c_char) as char));
|
||||
assert!(~"YMCA" == map(~"ymca",
|
||||
|c| libc::toupper(c as c_char) as char));
|
||||
}
|
||||
assert!(~"" == map(~"", |c| unsafe {libc::toupper(c as c_char)} as char));
|
||||
assert!(~"YMCA" == map(~"ymca", |c| unsafe {libc::toupper(c as c_char)} as char));
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
|
@ -3685,11 +3682,11 @@ mod tests {
|
|||
|
||||
#[test]
|
||||
fn test_any() {
|
||||
assert!(false == any(~"", char::is_uppercase));
|
||||
assert!(false == any(~"", char::is_uppercase));
|
||||
assert!(false == any(~"ymca", char::is_uppercase));
|
||||
assert!(true == any(~"YMCA", char::is_uppercase));
|
||||
assert!(true == any(~"yMCA", char::is_uppercase));
|
||||
assert!(true == any(~"Ymcy", char::is_uppercase));
|
||||
assert!(true == any(~"yMCA", char::is_uppercase));
|
||||
assert!(true == any(~"Ymcy", char::is_uppercase));
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
|
|
|||
268
src/libcore/str/ascii.rs
Normal file
268
src/libcore/str/ascii.rs
Normal file
|
|
@ -0,0 +1,268 @@
|
|||
// Copyright 2013 The Rust Project Developers. See the COPYRIGHT
|
||||
// file at the top-level directory of this distribution and at
|
||||
// http://rust-lang.org/COPYRIGHT.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
|
||||
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
|
||||
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
|
||||
// option. This file may not be copied, modified, or distributed
|
||||
// except according to those terms.
|
||||
|
||||
use to_str::{ToStr,ToStrConsume};
|
||||
use str;
|
||||
use cast;
|
||||
|
||||
/// Datatype to hold one ascii character. It is 8 bits long.
|
||||
#[deriving(Clone, Eq)]
|
||||
pub struct Ascii { priv chr: u8 }
|
||||
|
||||
pub impl Ascii {
|
||||
/// Converts an ascii character into a `u8`.
|
||||
#[inline(always)]
|
||||
fn to_byte(self) -> u8 {
|
||||
self.chr
|
||||
}
|
||||
|
||||
/// Converts an ascii character into a `char`.
|
||||
#[inline(always)]
|
||||
fn to_char(self) -> char {
|
||||
self.chr as char
|
||||
}
|
||||
|
||||
/// Convert to lowercase.
|
||||
#[inline(always)]
|
||||
fn to_lower(self) -> Ascii {
|
||||
if self.chr >= 65 && self.chr <= 90 {
|
||||
Ascii{chr: self.chr | 0x20 }
|
||||
} else {
|
||||
self
|
||||
}
|
||||
}
|
||||
|
||||
/// Convert to uppercase.
|
||||
#[inline(always)]
|
||||
fn to_upper(self) -> Ascii {
|
||||
if self.chr >= 97 && self.chr <= 122 {
|
||||
Ascii{chr: self.chr & !0x20 }
|
||||
} else {
|
||||
self
|
||||
}
|
||||
}
|
||||
|
||||
/// Compares two ascii characters for equality, ignoring case.
|
||||
#[inline(always)]
|
||||
fn eq_ignore_case(self, other: Ascii) -> bool {
|
||||
self.to_lower().chr == other.to_lower().chr
|
||||
}
|
||||
}
|
||||
|
||||
impl ToStr for Ascii {
|
||||
#[inline(always)]
|
||||
fn to_str(&self) -> ~str { str::from_bytes(['\'' as u8, self.chr, '\'' as u8]) }
|
||||
}
|
||||
|
||||
/// Trait for converting into an ascii type.
|
||||
pub trait AsciiCast<T> {
|
||||
/// Convert to an ascii type
|
||||
fn to_ascii(&self) -> T;
|
||||
|
||||
/// Check if convertible to ascii
|
||||
fn is_ascii(&self) -> bool;
|
||||
}
|
||||
|
||||
impl<'self> AsciiCast<&'self[Ascii]> for &'self [u8] {
|
||||
#[inline(always)]
|
||||
fn to_ascii(&self) -> &'self[Ascii] {
|
||||
assert!(self.is_ascii());
|
||||
unsafe{ cast::transmute(*self) }
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn is_ascii(&self) -> bool {
|
||||
for self.each |b| {
|
||||
if !b.is_ascii() { return false; }
|
||||
}
|
||||
true
|
||||
}
|
||||
}
|
||||
|
||||
impl<'self> AsciiCast<&'self[Ascii]> for &'self str {
|
||||
#[inline(always)]
|
||||
fn to_ascii(&self) -> &'self[Ascii] {
|
||||
assert!(self.is_ascii());
|
||||
let (p,len): (*u8, uint) = unsafe{ cast::transmute(*self) };
|
||||
unsafe{ cast::transmute((p, len - 1))}
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn is_ascii(&self) -> bool {
|
||||
for self.each |b| {
|
||||
if !b.is_ascii() { return false; }
|
||||
}
|
||||
true
|
||||
}
|
||||
}
|
||||
|
||||
impl AsciiCast<Ascii> for u8 {
|
||||
#[inline(always)]
|
||||
fn to_ascii(&self) -> Ascii {
|
||||
assert!(self.is_ascii());
|
||||
Ascii{ chr: *self }
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn is_ascii(&self) -> bool {
|
||||
*self & 128 == 0u8
|
||||
}
|
||||
}
|
||||
|
||||
impl AsciiCast<Ascii> for char {
|
||||
#[inline(always)]
|
||||
fn to_ascii(&self) -> Ascii {
|
||||
assert!(self.is_ascii());
|
||||
Ascii{ chr: *self as u8 }
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn is_ascii(&self) -> bool {
|
||||
*self - ('\x7F' & *self) == '\x00'
|
||||
}
|
||||
}
|
||||
|
||||
/// Trait for copyless casting to an ascii vector.
|
||||
pub trait OwnedAsciiCast {
|
||||
/// Take ownership and cast to an ascii vector without trailing zero element.
|
||||
fn to_ascii_consume(self) -> ~[Ascii];
|
||||
}
|
||||
|
||||
impl OwnedAsciiCast for ~[u8] {
|
||||
#[inline(always)]
|
||||
fn to_ascii_consume(self) -> ~[Ascii] {
|
||||
assert!(self.is_ascii());
|
||||
unsafe {cast::transmute(self)}
|
||||
}
|
||||
}
|
||||
|
||||
impl OwnedAsciiCast for ~str {
|
||||
#[inline(always)]
|
||||
fn to_ascii_consume(self) -> ~[Ascii] {
|
||||
assert!(self.is_ascii());
|
||||
let mut s = self;
|
||||
unsafe {
|
||||
str::raw::pop_byte(&mut s);
|
||||
cast::transmute(s)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Trait for converting an ascii type to a string. Needed to convert `&[Ascii]` to `~str`
|
||||
pub trait AsciiStr {
|
||||
/// Convert to a string.
|
||||
fn to_str_ascii(&self) -> ~str;
|
||||
|
||||
/// Convert to vector representing a lower cased ascii string.
|
||||
fn to_lower(&self) -> ~[Ascii];
|
||||
|
||||
/// Convert to vector representing an upper cased ascii string.
|
||||
fn to_upper(&self) -> ~[Ascii];
|
||||
|
||||
}
|
||||
|
||||
impl<'self> AsciiStr for &'self [Ascii] {
|
||||
#[inline(always)]
|
||||
fn to_str_ascii(&self) -> ~str {
|
||||
let mut cpy = self.to_owned();
|
||||
cpy.push(0u8.to_ascii());
|
||||
unsafe {cast::transmute(cpy)}
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn to_lower(&self) -> ~[Ascii] {
|
||||
self.map(|a| a.to_lower())
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn to_upper(&self) -> ~[Ascii] {
|
||||
self.map(|a| a.to_upper())
|
||||
}
|
||||
}
|
||||
|
||||
impl ToStrConsume for ~[Ascii] {
|
||||
#[inline(always)]
|
||||
fn to_str_consume(self) -> ~str {
|
||||
let mut cpy = self;
|
||||
cpy.push(0u8.to_ascii());
|
||||
unsafe {cast::transmute(cpy)}
|
||||
}
|
||||
}
|
||||
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
macro_rules! v2ascii (
|
||||
( [$($e:expr),*]) => ( [$(Ascii{chr:$e}),*]);
|
||||
(~[$($e:expr),*]) => (~[$(Ascii{chr:$e}),*]);
|
||||
)
|
||||
|
||||
#[test]
|
||||
fn test_ascii() {
|
||||
assert_eq!(65u8.to_ascii().to_byte(), 65u8);
|
||||
assert_eq!(65u8.to_ascii().to_char(), 'A');
|
||||
assert_eq!('A'.to_ascii().to_char(), 'A');
|
||||
assert_eq!('A'.to_ascii().to_byte(), 65u8);
|
||||
|
||||
assert_eq!('A'.to_ascii().to_lower().to_char(), 'a');
|
||||
assert_eq!('Z'.to_ascii().to_lower().to_char(), 'z');
|
||||
assert_eq!('a'.to_ascii().to_upper().to_char(), 'A');
|
||||
assert_eq!('z'.to_ascii().to_upper().to_char(), 'Z');
|
||||
|
||||
assert_eq!('@'.to_ascii().to_lower().to_char(), '@');
|
||||
assert_eq!('['.to_ascii().to_lower().to_char(), '[');
|
||||
assert_eq!('`'.to_ascii().to_upper().to_char(), '`');
|
||||
assert_eq!('{'.to_ascii().to_upper().to_char(), '{');
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_ascii_vec() {
|
||||
assert_eq!((&[40u8, 32u8, 59u8]).to_ascii(), v2ascii!([40, 32, 59]));
|
||||
assert_eq!("( ;".to_ascii(), v2ascii!([40, 32, 59]));
|
||||
// FIXME: #5475 borrowchk error, owned vectors do not live long enough
|
||||
// if chained-from directly
|
||||
let v = ~[40u8, 32u8, 59u8]; assert_eq!(v.to_ascii(), v2ascii!([40, 32, 59]));
|
||||
let v = ~"( ;"; assert_eq!(v.to_ascii(), v2ascii!([40, 32, 59]));
|
||||
|
||||
assert_eq!("abCDef&?#".to_ascii().to_lower().to_str_ascii(), ~"abcdef&?#");
|
||||
assert_eq!("abCDef&?#".to_ascii().to_upper().to_str_ascii(), ~"ABCDEF&?#");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_owned_ascii_vec() {
|
||||
// FIXME: #4318 Compiler crashes on moving self
|
||||
//assert_eq!(~"( ;".to_ascii_consume(), v2ascii!(~[40, 32, 59]));
|
||||
//assert_eq!(~[40u8, 32u8, 59u8].to_ascii_consume(), v2ascii!(~[40, 32, 59]));
|
||||
//assert_eq!(~"( ;".to_ascii_consume_with_null(), v2ascii!(~[40, 32, 59, 0]));
|
||||
//assert_eq!(~[40u8, 32u8, 59u8].to_ascii_consume_with_null(),
|
||||
// v2ascii!(~[40, 32, 59, 0]));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_ascii_to_str() { assert_eq!(v2ascii!([40, 32, 59]).to_str_ascii(), ~"( ;"); }
|
||||
|
||||
#[test]
|
||||
fn test_ascii_to_str_consume() {
|
||||
// FIXME: #4318 Compiler crashes on moving self
|
||||
//assert_eq!(v2ascii!(~[40, 32, 59]).to_str_consume(), ~"( ;");
|
||||
}
|
||||
|
||||
#[test] #[should_fail]
|
||||
fn test_ascii_vec_fail_u8_slice() { (&[127u8, 128u8, 255u8]).to_ascii(); }
|
||||
|
||||
#[test] #[should_fail]
|
||||
fn test_ascii_vec_fail_str_slice() { "zoä华".to_ascii(); }
|
||||
|
||||
#[test] #[should_fail]
|
||||
fn test_ascii_fail_u8_slice() { 255u8.to_ascii(); }
|
||||
|
||||
#[test] #[should_fail]
|
||||
fn test_ascii_fail_char_slice() { 'λ'.to_ascii(); }
|
||||
}
|
||||
|
|
@ -20,6 +20,12 @@ pub trait ToStr {
|
|||
fn to_str(&self) -> ~str;
|
||||
}
|
||||
|
||||
/// Trait for converting a type to a string, consuming it in the process.
|
||||
pub trait ToStrConsume {
|
||||
/// Consume and convert to a string.
|
||||
fn to_str_consume(self) -> ~str;
|
||||
}
|
||||
|
||||
impl ToStr for bool {
|
||||
#[inline(always)]
|
||||
fn to_str(&self) -> ~str { ::bool::to_str(*self) }
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue