511 lines
16 KiB
Rust
511 lines
16 KiB
Rust
//! A character type.
|
||
//!
|
||
//! The `char` type represents a single character. More specifically, since
|
||
//! 'character' isn't a well-defined concept in Unicode, `char` is a '[Unicode
|
||
//! scalar value]', which is similar to, but not the same as, a '[Unicode code
|
||
//! point]'.
|
||
//!
|
||
//! [Unicode scalar value]: http://www.unicode.org/glossary/#unicode_scalar_value
|
||
//! [Unicode code point]: http://www.unicode.org/glossary/#code_point
|
||
//!
|
||
//! This module exists for technical reasons; the primary documentation for
//! `char` is directly on [the `char` primitive type](../../std/primitive.char.html)
//! itself.
|
||
//!
|
||
//! This module is the home of the iterator implementations for the iterators
|
||
//! implemented on `char`, as well as some useful constants and conversion
|
||
//! functions that convert various types to `char`.
|
||
|
||
#![allow(non_snake_case)]
|
||
#![stable(feature = "core_char", since = "1.2.0")]
|
||
|
||
mod convert;
|
||
mod decode;
|
||
mod methods;
|
||
|
||
// stable re-exports
|
||
#[stable(feature = "char_from_unchecked", since = "1.5.0")]
|
||
pub use self::convert::from_u32_unchecked;
|
||
#[stable(feature = "try_from", since = "1.34.0")]
|
||
pub use self::convert::CharTryFromError;
|
||
#[stable(feature = "char_from_str", since = "1.20.0")]
|
||
pub use self::convert::ParseCharError;
|
||
#[stable(feature = "rust1", since = "1.0.0")]
|
||
pub use self::convert::{from_digit, from_u32};
|
||
#[stable(feature = "decode_utf16", since = "1.9.0")]
|
||
pub use self::decode::{decode_utf16, DecodeUtf16, DecodeUtf16Error};
|
||
#[stable(feature = "unicode_version", since = "1.45.0")]
|
||
pub use crate::unicode::UNICODE_VERSION;
|
||
|
||
use crate::fmt::{self, Write};
|
||
use crate::iter::FusedIterator;
|
||
|
||
// Tag bits and code-point thresholds for encoding characters as UTF-8.

// Tag bits of a continuation byte (`10xx_xxxx`).
const TAG_CONT: u8 = 0b1000_0000;
// Tag bits of the leading byte of a two-byte sequence (`110x_xxxx`).
const TAG_TWO_B: u8 = 0b1100_0000;
// Tag bits of the leading byte of a three-byte sequence (`1110_xxxx`).
const TAG_THREE_B: u8 = 0b1110_0000;
// Tag bits of the leading byte of a four-byte sequence (`1111_0xxx`).
const TAG_FOUR_B: u8 = 0b1111_0000;

// Exclusive upper bound of the one-byte range (0x80).
const MAX_ONE_B: u32 = 1 << 7;
// Exclusive upper bound of the two-byte range (0x800).
const MAX_TWO_B: u32 = 1 << 11;
// Exclusive upper bound of the three-byte range (0x10000).
const MAX_THREE_B: u32 = 1 << 16;
|
||
|
||
/*
|
||
Lu Uppercase_Letter an uppercase letter
|
||
Ll Lowercase_Letter a lowercase letter
|
||
Lt Titlecase_Letter a digraphic character, with first part uppercase
|
||
Lm Modifier_Letter a modifier letter
|
||
Lo Other_Letter other letters, including syllables and ideographs
|
||
Mn Nonspacing_Mark a nonspacing combining mark (zero advance width)
|
||
Mc Spacing_Mark a spacing combining mark (positive advance width)
|
||
Me Enclosing_Mark an enclosing combining mark
|
||
Nd Decimal_Number a decimal digit
|
||
Nl Letter_Number a letterlike numeric character
|
||
No Other_Number a numeric character of other type
|
||
Pc Connector_Punctuation a connecting punctuation mark, like a tie
|
||
Pd Dash_Punctuation a dash or hyphen punctuation mark
|
||
Ps Open_Punctuation an opening punctuation mark (of a pair)
|
||
Pe Close_Punctuation a closing punctuation mark (of a pair)
|
||
Pi Initial_Punctuation an initial quotation mark
|
||
Pf Final_Punctuation a final quotation mark
|
||
Po Other_Punctuation a punctuation mark of other type
|
||
Sm Math_Symbol a symbol of primarily mathematical use
|
||
Sc Currency_Symbol a currency sign
|
||
Sk Modifier_Symbol a non-letterlike modifier symbol
|
||
So Other_Symbol a symbol of other type
|
||
Zs Space_Separator a space character (of various non-zero widths)
|
||
Zl Line_Separator U+2028 LINE SEPARATOR only
|
||
Zp Paragraph_Separator U+2029 PARAGRAPH SEPARATOR only
|
||
Cc Control a C0 or C1 control code
|
||
Cf Format a format control character
|
||
Cs Surrogate a surrogate code point
|
||
Co Private_Use a private-use character
|
||
Cn Unassigned a reserved unassigned code point or a noncharacter
|
||
*/
|
||
|
||
/// The highest valid code point a `char` can have.
|
||
///
|
||
/// A [`char`] is a [Unicode Scalar Value], which means that it is a [Code
|
||
/// Point], but only ones within a certain range. `MAX` is the highest valid
|
||
/// code point that's a valid [Unicode Scalar Value].
|
||
///
|
||
/// [`char`]: ../../std/primitive.char.html
|
||
/// [Unicode Scalar Value]: http://www.unicode.org/glossary/#unicode_scalar_value
|
||
/// [Code Point]: http://www.unicode.org/glossary/#code_point
|
||
#[stable(feature = "rust1", since = "1.0.0")]
|
||
pub const MAX: char = char::MAX;
|
||
|
||
/// `U+FFFD REPLACEMENT CHARACTER` (<28>) is used in Unicode to represent a
|
||
/// decoding error.
|
||
///
|
||
/// It can occur, for example, when giving ill-formed UTF-8 bytes to
|
||
/// [`String::from_utf8_lossy`](../../std/string/struct.String.html#method.from_utf8_lossy).
|
||
#[stable(feature = "decode_utf16", since = "1.9.0")]
|
||
pub const REPLACEMENT_CHARACTER: char = char::REPLACEMENT_CHARACTER;
|
||
|
||
/// Returns an iterator that yields the hexadecimal Unicode escape of a
|
||
/// character, as `char`s.
|
||
///
|
||
/// This `struct` is created by the [`escape_unicode`] method on [`char`]. See
|
||
/// its documentation for more.
|
||
///
|
||
/// [`escape_unicode`]: ../../std/primitive.char.html#method.escape_unicode
|
||
/// [`char`]: ../../std/primitive.char.html
|
||
#[derive(Clone, Debug)]
|
||
#[stable(feature = "rust1", since = "1.0.0")]
|
||
pub struct EscapeUnicode {
|
||
c: char,
|
||
state: EscapeUnicodeState,
|
||
|
||
// The index of the next hex digit to be printed (0 if none),
|
||
// i.e., the number of remaining hex digits to be printed;
|
||
// increasing from the least significant digit: 0x543210
|
||
hex_digit_idx: usize,
|
||
}
|
||
|
||
// The variants are declared in this order on purpose: each variant's
// representation then equals the remaining length of the escape (hex digits
// aside). This likely makes `len()` a single load from memory and
// inline-worthy. Do not reorder.
#[derive(Clone, Debug)]
enum EscapeUnicodeState {
    Done,
    RightBrace,
    Value,
    LeftBrace,
    Type,
    Backslash,
}
|
||
|
||
#[stable(feature = "rust1", since = "1.0.0")]
|
||
impl Iterator for EscapeUnicode {
|
||
type Item = char;
|
||
|
||
fn next(&mut self) -> Option<char> {
|
||
match self.state {
|
||
EscapeUnicodeState::Backslash => {
|
||
self.state = EscapeUnicodeState::Type;
|
||
Some('\\')
|
||
}
|
||
EscapeUnicodeState::Type => {
|
||
self.state = EscapeUnicodeState::LeftBrace;
|
||
Some('u')
|
||
}
|
||
EscapeUnicodeState::LeftBrace => {
|
||
self.state = EscapeUnicodeState::Value;
|
||
Some('{')
|
||
}
|
||
EscapeUnicodeState::Value => {
|
||
let hex_digit = ((self.c as u32) >> (self.hex_digit_idx * 4)) & 0xf;
|
||
let c = from_digit(hex_digit, 16).unwrap();
|
||
if self.hex_digit_idx == 0 {
|
||
self.state = EscapeUnicodeState::RightBrace;
|
||
} else {
|
||
self.hex_digit_idx -= 1;
|
||
}
|
||
Some(c)
|
||
}
|
||
EscapeUnicodeState::RightBrace => {
|
||
self.state = EscapeUnicodeState::Done;
|
||
Some('}')
|
||
}
|
||
EscapeUnicodeState::Done => None,
|
||
}
|
||
}
|
||
|
||
#[inline]
|
||
fn size_hint(&self) -> (usize, Option<usize>) {
|
||
let n = self.len();
|
||
(n, Some(n))
|
||
}
|
||
|
||
#[inline]
|
||
fn count(self) -> usize {
|
||
self.len()
|
||
}
|
||
|
||
fn last(self) -> Option<char> {
|
||
match self.state {
|
||
EscapeUnicodeState::Done => None,
|
||
|
||
EscapeUnicodeState::RightBrace
|
||
| EscapeUnicodeState::Value
|
||
| EscapeUnicodeState::LeftBrace
|
||
| EscapeUnicodeState::Type
|
||
| EscapeUnicodeState::Backslash => Some('}'),
|
||
}
|
||
}
|
||
}
|
||
|
||
#[stable(feature = "exact_size_escape", since = "1.11.0")]
|
||
impl ExactSizeIterator for EscapeUnicode {
|
||
#[inline]
|
||
fn len(&self) -> usize {
|
||
// The match is a single memory access with no branching
|
||
self.hex_digit_idx
|
||
+ match self.state {
|
||
EscapeUnicodeState::Done => 0,
|
||
EscapeUnicodeState::RightBrace => 1,
|
||
EscapeUnicodeState::Value => 2,
|
||
EscapeUnicodeState::LeftBrace => 3,
|
||
EscapeUnicodeState::Type => 4,
|
||
EscapeUnicodeState::Backslash => 5,
|
||
}
|
||
}
|
||
}
|
||
|
||
#[stable(feature = "fused", since = "1.26.0")]
|
||
impl FusedIterator for EscapeUnicode {}
|
||
|
||
#[stable(feature = "char_struct_display", since = "1.16.0")]
|
||
impl fmt::Display for EscapeUnicode {
|
||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||
for c in self.clone() {
|
||
f.write_char(c)?;
|
||
}
|
||
Ok(())
|
||
}
|
||
}
|
||
|
||
/// An iterator that yields the literal escape code of a `char`.
|
||
///
|
||
/// This `struct` is created by the [`escape_default`] method on [`char`]. See
|
||
/// its documentation for more.
|
||
///
|
||
/// [`escape_default`]: ../../std/primitive.char.html#method.escape_default
|
||
/// [`char`]: ../../std/primitive.char.html
|
||
#[derive(Clone, Debug)]
|
||
#[stable(feature = "rust1", since = "1.0.0")]
|
||
pub struct EscapeDefault {
|
||
state: EscapeDefaultState,
|
||
}
|
||
|
||
#[derive(Clone, Debug)]
|
||
enum EscapeDefaultState {
|
||
Done,
|
||
Char(char),
|
||
Backslash(char),
|
||
Unicode(EscapeUnicode),
|
||
}
|
||
|
||
#[stable(feature = "rust1", since = "1.0.0")]
|
||
impl Iterator for EscapeDefault {
|
||
type Item = char;
|
||
|
||
fn next(&mut self) -> Option<char> {
|
||
match self.state {
|
||
EscapeDefaultState::Backslash(c) => {
|
||
self.state = EscapeDefaultState::Char(c);
|
||
Some('\\')
|
||
}
|
||
EscapeDefaultState::Char(c) => {
|
||
self.state = EscapeDefaultState::Done;
|
||
Some(c)
|
||
}
|
||
EscapeDefaultState::Done => None,
|
||
EscapeDefaultState::Unicode(ref mut iter) => iter.next(),
|
||
}
|
||
}
|
||
|
||
#[inline]
|
||
fn size_hint(&self) -> (usize, Option<usize>) {
|
||
let n = self.len();
|
||
(n, Some(n))
|
||
}
|
||
|
||
#[inline]
|
||
fn count(self) -> usize {
|
||
self.len()
|
||
}
|
||
|
||
fn nth(&mut self, n: usize) -> Option<char> {
|
||
match self.state {
|
||
EscapeDefaultState::Backslash(c) if n == 0 => {
|
||
self.state = EscapeDefaultState::Char(c);
|
||
Some('\\')
|
||
}
|
||
EscapeDefaultState::Backslash(c) if n == 1 => {
|
||
self.state = EscapeDefaultState::Done;
|
||
Some(c)
|
||
}
|
||
EscapeDefaultState::Backslash(_) => {
|
||
self.state = EscapeDefaultState::Done;
|
||
None
|
||
}
|
||
EscapeDefaultState::Char(c) => {
|
||
self.state = EscapeDefaultState::Done;
|
||
|
||
if n == 0 { Some(c) } else { None }
|
||
}
|
||
EscapeDefaultState::Done => None,
|
||
EscapeDefaultState::Unicode(ref mut i) => i.nth(n),
|
||
}
|
||
}
|
||
|
||
fn last(self) -> Option<char> {
|
||
match self.state {
|
||
EscapeDefaultState::Unicode(iter) => iter.last(),
|
||
EscapeDefaultState::Done => None,
|
||
EscapeDefaultState::Backslash(c) | EscapeDefaultState::Char(c) => Some(c),
|
||
}
|
||
}
|
||
}
|
||
|
||
#[stable(feature = "exact_size_escape", since = "1.11.0")]
|
||
impl ExactSizeIterator for EscapeDefault {
|
||
fn len(&self) -> usize {
|
||
match self.state {
|
||
EscapeDefaultState::Done => 0,
|
||
EscapeDefaultState::Char(_) => 1,
|
||
EscapeDefaultState::Backslash(_) => 2,
|
||
EscapeDefaultState::Unicode(ref iter) => iter.len(),
|
||
}
|
||
}
|
||
}
|
||
|
||
#[stable(feature = "fused", since = "1.26.0")]
|
||
impl FusedIterator for EscapeDefault {}
|
||
|
||
#[stable(feature = "char_struct_display", since = "1.16.0")]
|
||
impl fmt::Display for EscapeDefault {
|
||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||
for c in self.clone() {
|
||
f.write_char(c)?;
|
||
}
|
||
Ok(())
|
||
}
|
||
}
|
||
|
||
/// An iterator that yields the literal escape code of a `char`.
|
||
///
|
||
/// This `struct` is created by the [`escape_debug`] method on [`char`]. See its
|
||
/// documentation for more.
|
||
///
|
||
/// [`escape_debug`]: ../../std/primitive.char.html#method.escape_debug
|
||
/// [`char`]: ../../std/primitive.char.html
|
||
#[stable(feature = "char_escape_debug", since = "1.20.0")]
|
||
#[derive(Clone, Debug)]
|
||
pub struct EscapeDebug(EscapeDefault);
|
||
|
||
#[stable(feature = "char_escape_debug", since = "1.20.0")]
|
||
impl Iterator for EscapeDebug {
|
||
type Item = char;
|
||
fn next(&mut self) -> Option<char> {
|
||
self.0.next()
|
||
}
|
||
fn size_hint(&self) -> (usize, Option<usize>) {
|
||
self.0.size_hint()
|
||
}
|
||
}
|
||
|
||
#[stable(feature = "char_escape_debug", since = "1.20.0")]
|
||
impl ExactSizeIterator for EscapeDebug {}
|
||
|
||
#[stable(feature = "fused", since = "1.26.0")]
|
||
impl FusedIterator for EscapeDebug {}
|
||
|
||
#[stable(feature = "char_escape_debug", since = "1.20.0")]
|
||
impl fmt::Display for EscapeDebug {
|
||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||
fmt::Display::fmt(&self.0, f)
|
||
}
|
||
}
|
||
|
||
/// Returns an iterator that yields the lowercase equivalent of a `char`.
|
||
///
|
||
/// This `struct` is created by the [`to_lowercase`] method on [`char`]. See
|
||
/// its documentation for more.
|
||
///
|
||
/// [`to_lowercase`]: ../../std/primitive.char.html#method.to_lowercase
|
||
/// [`char`]: ../../std/primitive.char.html
|
||
#[stable(feature = "rust1", since = "1.0.0")]
|
||
#[derive(Debug, Clone)]
|
||
pub struct ToLowercase(CaseMappingIter);
|
||
|
||
#[stable(feature = "rust1", since = "1.0.0")]
|
||
impl Iterator for ToLowercase {
|
||
type Item = char;
|
||
fn next(&mut self) -> Option<char> {
|
||
self.0.next()
|
||
}
|
||
fn size_hint(&self) -> (usize, Option<usize>) {
|
||
self.0.size_hint()
|
||
}
|
||
}
|
||
|
||
#[stable(feature = "fused", since = "1.26.0")]
|
||
impl FusedIterator for ToLowercase {}
|
||
|
||
#[stable(feature = "exact_size_case_mapping_iter", since = "1.35.0")]
|
||
impl ExactSizeIterator for ToLowercase {}
|
||
|
||
/// Returns an iterator that yields the uppercase equivalent of a `char`.
|
||
///
|
||
/// This `struct` is created by the [`to_uppercase`] method on [`char`]. See
|
||
/// its documentation for more.
|
||
///
|
||
/// [`to_uppercase`]: ../../std/primitive.char.html#method.to_uppercase
|
||
/// [`char`]: ../../std/primitive.char.html
|
||
#[stable(feature = "rust1", since = "1.0.0")]
|
||
#[derive(Debug, Clone)]
|
||
pub struct ToUppercase(CaseMappingIter);
|
||
|
||
#[stable(feature = "rust1", since = "1.0.0")]
|
||
impl Iterator for ToUppercase {
|
||
type Item = char;
|
||
fn next(&mut self) -> Option<char> {
|
||
self.0.next()
|
||
}
|
||
fn size_hint(&self) -> (usize, Option<usize>) {
|
||
self.0.size_hint()
|
||
}
|
||
}
|
||
|
||
#[stable(feature = "fused", since = "1.26.0")]
|
||
impl FusedIterator for ToUppercase {}
|
||
|
||
#[stable(feature = "exact_size_case_mapping_iter", since = "1.35.0")]
|
||
impl ExactSizeIterator for ToUppercase {}
|
||
|
||
// Holds the `char`s that are still to be yielded (at most three); each
// variant is named after how many remain.
#[derive(Debug, Clone)]
enum CaseMappingIter {
    Three(char, char, char),
    Two(char, char),
    One(char),
    Zero,
}

impl CaseMappingIter {
    // Builds the iterator from a fixed-size table entry in which trailing
    // `'\0'` slots are padding. The first slot is always kept, even when it
    // is `'\0'` itself, and a `'\0'` in the middle slot is kept whenever the
    // last slot is occupied.
    fn new(chars: [char; 3]) -> CaseMappingIter {
        match chars {
            [first, '\0', '\0'] => CaseMappingIter::One(first),
            [first, second, '\0'] => CaseMappingIter::Two(first, second),
            [first, second, third] => CaseMappingIter::Three(first, second, third),
        }
    }
}

impl Iterator for CaseMappingIter {
    type Item = char;

    // Yields the front `char` and shrinks to the variant holding the rest.
    fn next(&mut self) -> Option<char> {
        let (head, rest) = match *self {
            CaseMappingIter::Zero => return None,
            CaseMappingIter::One(only) => (only, CaseMappingIter::Zero),
            CaseMappingIter::Two(head, last) => (head, CaseMappingIter::One(last)),
            CaseMappingIter::Three(head, mid, last) => (head, CaseMappingIter::Two(mid, last)),
        };
        *self = rest;
        Some(head)
    }

    // The remaining length is exact and follows directly from the variant.
    fn size_hint(&self) -> (usize, Option<usize>) {
        let remaining = match *self {
            CaseMappingIter::Zero => 0,
            CaseMappingIter::One(_) => 1,
            CaseMappingIter::Two(..) => 2,
            CaseMappingIter::Three(..) => 3,
        };
        (remaining, Some(remaining))
    }
}

impl fmt::Display for CaseMappingIter {
    // Writes the remaining `char`s in order, stopping at the first write
    // error. Iterating over a clone leaves `self` untouched.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        for ch in self.clone() {
            f.write_char(ch)?;
        }
        Ok(())
    }
}
|
||
|
||
#[stable(feature = "char_struct_display", since = "1.16.0")]
|
||
impl fmt::Display for ToLowercase {
|
||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||
fmt::Display::fmt(&self.0, f)
|
||
}
|
||
}
|
||
|
||
#[stable(feature = "char_struct_display", since = "1.16.0")]
|
||
impl fmt::Display for ToUppercase {
|
||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||
fmt::Display::fmt(&self.0, f)
|
||
}
|
||
}
|