Rollup merge of #66015 - popzxc:refactor-librustc_parser, r=matklad
librustc_lexer: Refactor the module. This PR introduces a refactoring of `librustc_lexer` in order to improve readability. All the changes performed are purely cosmetic and do not introduce any changes to the lexer logic or performance. The newly introduced modules `literal`, `token` and `utils` are just copy-pasted from `lib.rs` and do not contain even cosmetic changes (I decided to do it this way so that the changes are easier to review by looking only at the diff). r? @petrochenkov cc @Centril @matklad
This commit is contained in:
commit
81550a00d1
2 changed files with 161 additions and 124 deletions
|
|
@ -41,10 +41,20 @@ impl<'a> Cursor<'a> {
|
|||
/// If requested position doesn't exist, `EOF_CHAR` is returned.
|
||||
/// However, getting `EOF_CHAR` doesn't always mean actual end of file,
|
||||
/// it should be checked with `is_eof` method.
|
||||
pub(crate) fn nth_char(&self, n: usize) -> char {
|
||||
fn nth_char(&self, n: usize) -> char {
|
||||
self.chars().nth(n).unwrap_or(EOF_CHAR)
|
||||
}
|
||||
|
||||
/// Peeks the next symbol from the input stream without consuming it.
|
||||
pub(crate) fn first(&self) -> char {
|
||||
self.nth_char(0)
|
||||
}
|
||||
|
||||
/// Peeks the second symbol from the input stream without consuming it.
|
||||
pub(crate) fn second(&self) -> char {
|
||||
self.nth_char(1)
|
||||
}
|
||||
|
||||
/// Checks if there is nothing more to consume.
|
||||
pub(crate) fn is_eof(&self) -> bool {
|
||||
self.chars.as_str().is_empty()
|
||||
|
|
|
|||
|
|
@ -18,6 +18,8 @@ mod cursor;
|
|||
pub mod unescape;
|
||||
|
||||
use crate::cursor::{Cursor, EOF_CHAR};
|
||||
use self::TokenKind::*;
|
||||
use self::LiteralKind::*;
|
||||
|
||||
/// Parsed token.
|
||||
/// It doesn't contain information about data that has been parsed,
|
||||
|
|
@ -116,7 +118,6 @@ pub enum TokenKind {
|
|||
/// Unknown token, not expected by the lexer, e.g. "№"
|
||||
Unknown,
|
||||
}
|
||||
use self::TokenKind::*;
|
||||
|
||||
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
|
||||
pub enum LiteralKind {
|
||||
|
|
@ -137,7 +138,6 @@ pub enum LiteralKind {
|
|||
/// "br"abc"", "br#"abc"#", "br####"ab"###"c"####", "br#"a"
|
||||
RawByteStr { n_hashes: usize, started: bool, terminated: bool },
|
||||
}
|
||||
use self::LiteralKind::*;
|
||||
|
||||
/// Base of numeric literal encoding according to its prefix.
|
||||
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
|
||||
|
|
@ -241,14 +241,13 @@ pub fn is_id_continue(c: char) -> bool {
|
|||
|| (c > '\x7f' && unicode_xid::UnicodeXID::is_xid_continue(c))
|
||||
}
|
||||
|
||||
|
||||
impl Cursor<'_> {
|
||||
/// Parses a token from the input string.
|
||||
fn advance_token(&mut self) -> Token {
|
||||
let first_char = self.bump().unwrap();
|
||||
let token_kind = match first_char {
|
||||
// Slash, comment or block comment.
|
||||
'/' => match self.nth_char(0) {
|
||||
'/' => match self.first() {
|
||||
'/' => self.line_comment(),
|
||||
'*' => self.block_comment(),
|
||||
_ => Slash,
|
||||
|
|
@ -257,8 +256,8 @@ impl Cursor<'_> {
|
|||
// Whitespace sequence.
|
||||
c if is_whitespace(c) => self.whitespace(),
|
||||
|
||||
// Raw string literal or identifier.
|
||||
'r' => match (self.nth_char(0), self.nth_char(1)) {
|
||||
// Raw identifier, raw string literal or identifier.
|
||||
'r' => match (self.first(), self.second()) {
|
||||
('#', c1) if is_id_start(c1) => self.raw_ident(),
|
||||
('#', _) | ('"', _) => {
|
||||
let (n_hashes, started, terminated) = self.raw_double_quoted_string();
|
||||
|
|
@ -273,7 +272,7 @@ impl Cursor<'_> {
|
|||
},
|
||||
|
||||
// Byte literal, byte string literal, raw byte string literal or identifier.
|
||||
'b' => match (self.nth_char(0), self.nth_char(1)) {
|
||||
'b' => match (self.first(), self.second()) {
|
||||
('\'', _) => {
|
||||
self.bump();
|
||||
let terminated = self.single_quoted_string();
|
||||
|
|
@ -366,31 +365,23 @@ impl Cursor<'_> {
|
|||
}
|
||||
|
||||
fn line_comment(&mut self) -> TokenKind {
|
||||
debug_assert!(self.prev() == '/' && self.nth_char(0) == '/');
|
||||
debug_assert!(self.prev() == '/' && self.first() == '/');
|
||||
self.bump();
|
||||
loop {
|
||||
match self.nth_char(0) {
|
||||
'\n' => break,
|
||||
EOF_CHAR if self.is_eof() => break,
|
||||
_ => {
|
||||
self.bump();
|
||||
}
|
||||
}
|
||||
}
|
||||
self.eat_while(|c| c != '\n');
|
||||
LineComment
|
||||
}
|
||||
|
||||
fn block_comment(&mut self) -> TokenKind {
|
||||
debug_assert!(self.prev() == '/' && self.nth_char(0) == '*');
|
||||
debug_assert!(self.prev() == '/' && self.first() == '*');
|
||||
self.bump();
|
||||
let mut depth = 1usize;
|
||||
while let Some(c) = self.bump() {
|
||||
match c {
|
||||
'/' if self.nth_char(0) == '*' => {
|
||||
'/' if self.first() == '*' => {
|
||||
self.bump();
|
||||
depth += 1;
|
||||
}
|
||||
'*' if self.nth_char(0) == '/' => {
|
||||
'*' if self.first() == '/' => {
|
||||
self.bump();
|
||||
depth -= 1;
|
||||
if depth == 0 {
|
||||
|
|
@ -409,31 +400,27 @@ impl Cursor<'_> {
|
|||
|
||||
fn whitespace(&mut self) -> TokenKind {
|
||||
debug_assert!(is_whitespace(self.prev()));
|
||||
while is_whitespace(self.nth_char(0)) {
|
||||
self.bump();
|
||||
}
|
||||
self.eat_while(is_whitespace);
|
||||
Whitespace
|
||||
}
|
||||
|
||||
fn raw_ident(&mut self) -> TokenKind {
|
||||
debug_assert!(
|
||||
self.prev() == 'r'
|
||||
&& self.nth_char(0) == '#'
|
||||
&& is_id_start(self.nth_char(1))
|
||||
&& self.first() == '#'
|
||||
&& is_id_start(self.second())
|
||||
);
|
||||
// Eat "#" symbol.
|
||||
self.bump();
|
||||
self.bump();
|
||||
while is_id_continue(self.nth_char(0)) {
|
||||
self.bump();
|
||||
}
|
||||
// Eat the identifier part of RawIdent.
|
||||
self.eat_identifier();
|
||||
RawIdent
|
||||
}
|
||||
|
||||
fn ident(&mut self) -> TokenKind {
|
||||
debug_assert!(is_id_start(self.prev()));
|
||||
while is_id_continue(self.nth_char(0)) {
|
||||
self.bump();
|
||||
}
|
||||
// Start is already eaten, eat the rest of identifier.
|
||||
self.eat_while(is_id_continue);
|
||||
Ident
|
||||
}
|
||||
|
||||
|
|
@ -442,7 +429,7 @@ impl Cursor<'_> {
|
|||
let mut base = Base::Decimal;
|
||||
if first_digit == '0' {
|
||||
// Attempt to parse encoding base.
|
||||
let has_digits = match self.nth_char(0) {
|
||||
let has_digits = match self.first() {
|
||||
'b' => {
|
||||
base = Base::Binary;
|
||||
self.bump();
|
||||
|
|
@ -476,23 +463,23 @@ impl Cursor<'_> {
|
|||
self.eat_decimal_digits();
|
||||
};
|
||||
|
||||
match self.nth_char(0) {
|
||||
match self.first() {
|
||||
// Don't be greedy if this is actually an
|
||||
// integer literal followed by field/method access or a range pattern
|
||||
// (`0..2` and `12.foo()`)
|
||||
'.' if self.nth_char(1) != '.'
|
||||
&& !is_id_start(self.nth_char(1)) =>
|
||||
'.' if self.second() != '.'
|
||||
&& !is_id_start(self.second()) =>
|
||||
{
|
||||
// might have stuff after the ., and if it does, it needs to start
|
||||
// with a number
|
||||
self.bump();
|
||||
let mut empty_exponent = false;
|
||||
if self.nth_char(0).is_digit(10) {
|
||||
if self.first().is_digit(10) {
|
||||
self.eat_decimal_digits();
|
||||
match self.nth_char(0) {
|
||||
match self.first() {
|
||||
'e' | 'E' => {
|
||||
self.bump();
|
||||
empty_exponent = self.float_exponent().is_err()
|
||||
empty_exponent = !self.eat_float_exponent();
|
||||
}
|
||||
_ => (),
|
||||
}
|
||||
|
|
@ -501,7 +488,7 @@ impl Cursor<'_> {
|
|||
}
|
||||
'e' | 'E' => {
|
||||
self.bump();
|
||||
let empty_exponent = self.float_exponent().is_err();
|
||||
let empty_exponent = !self.eat_float_exponent();
|
||||
Float { base, empty_exponent }
|
||||
}
|
||||
_ => Int { base, empty_int: false },
|
||||
|
|
@ -510,65 +497,76 @@ impl Cursor<'_> {
|
|||
|
||||
fn lifetime_or_char(&mut self) -> TokenKind {
|
||||
debug_assert!(self.prev() == '\'');
|
||||
let mut starts_with_number = false;
|
||||
|
||||
// Check if the first symbol after '\'' is a valid identifier
|
||||
// character or a number (not a digit followed by '\'').
|
||||
if (is_id_start(self.nth_char(0))
|
||||
|| self.nth_char(0).is_digit(10) && {
|
||||
starts_with_number = true;
|
||||
true
|
||||
})
|
||||
&& self.nth_char(1) != '\''
|
||||
{
|
||||
self.bump();
|
||||
let can_be_a_lifetime = if self.second() == '\'' {
|
||||
// It's surely not a lifetime.
|
||||
false
|
||||
} else {
|
||||
// If the first symbol is valid for identifier, it can be a lifetime.
|
||||
// Also check if it's a number for a better error reporting (so '0 will
|
||||
// be reported as invalid lifetime and not as unterminated char literal).
|
||||
is_id_start(self.first()) || self.first().is_digit(10)
|
||||
};
|
||||
|
||||
// Skip the identifier.
|
||||
while is_id_continue(self.nth_char(0)) {
|
||||
self.bump();
|
||||
if !can_be_a_lifetime {
|
||||
let terminated = self.single_quoted_string();
|
||||
let suffix_start = self.len_consumed();
|
||||
if terminated {
|
||||
self.eat_literal_suffix();
|
||||
}
|
||||
|
||||
return if self.nth_char(0) == '\'' {
|
||||
self.bump();
|
||||
let kind = Char { terminated: true };
|
||||
Literal { kind, suffix_start: self.len_consumed() }
|
||||
} else {
|
||||
Lifetime { starts_with_number }
|
||||
};
|
||||
let kind = Char { terminated };
|
||||
return Literal { kind, suffix_start };
|
||||
}
|
||||
|
||||
// This is not a lifetime (checked above), parse a char literal.
|
||||
let terminated = self.single_quoted_string();
|
||||
let suffix_start = self.len_consumed();
|
||||
if terminated {
|
||||
self.eat_literal_suffix();
|
||||
// Either a lifetime or a character literal with
|
||||
// length greater than 1.
|
||||
|
||||
let starts_with_number = self.first().is_digit(10);
|
||||
|
||||
// Skip the literal contents.
|
||||
// First symbol can be a number (which isn't a valid identifier start),
|
||||
// so skip it without any checks.
|
||||
self.bump();
|
||||
self.eat_while(is_id_continue);
|
||||
|
||||
// Check if after skipping literal contents we've met a closing
|
||||
// single quote (which means that user attempted to create a
|
||||
// string with single quotes).
|
||||
if self.first() == '\'' {
|
||||
self.bump();
|
||||
let kind = Char { terminated: true };
|
||||
return Literal { kind, suffix_start: self.len_consumed() };
|
||||
}
|
||||
let kind = Char { terminated };
|
||||
return Literal { kind, suffix_start };
|
||||
|
||||
return Lifetime { starts_with_number };
|
||||
}
|
||||
|
||||
fn single_quoted_string(&mut self) -> bool {
|
||||
debug_assert!(self.prev() == '\'');
|
||||
// Parse `'''` as a single char literal.
|
||||
if self.nth_char(0) == '\'' && self.nth_char(1) == '\'' {
|
||||
// Check if it's a one-symbol literal.
|
||||
if self.second() == '\'' && self.first() != '\\' {
|
||||
self.bump();
|
||||
self.bump();
|
||||
return true;
|
||||
}
|
||||
|
||||
// Literal has more than one symbol.
|
||||
|
||||
// Parse until either quotes are terminated or error is detected.
|
||||
let mut first = true;
|
||||
loop {
|
||||
match self.nth_char(0) {
|
||||
// Probably beginning of the comment, which we don't want to include
|
||||
// to the error report.
|
||||
'/' if !first => break,
|
||||
// Newline without following '\'' means unclosed quote, stop parsing.
|
||||
'\n' if self.nth_char(1) != '\'' => break,
|
||||
// End of file, stop parsing.
|
||||
EOF_CHAR if self.is_eof() => break,
|
||||
match self.first() {
|
||||
// Quotes are terminated, finish parsing.
|
||||
'\'' => {
|
||||
self.bump();
|
||||
return true;
|
||||
}
|
||||
// Probably beginning of the comment, which we don't want to include
|
||||
// to the error report.
|
||||
'/' => break,
|
||||
// Newline without following '\'' means unclosed quote, stop parsing.
|
||||
'\n' if self.second() != '\'' => break,
|
||||
// End of file, stop parsing.
|
||||
EOF_CHAR if self.is_eof() => break,
|
||||
// Escaped slash is considered one character, so bump twice.
|
||||
'\\' => {
|
||||
self.bump();
|
||||
|
|
@ -579,8 +577,8 @@ impl Cursor<'_> {
|
|||
self.bump();
|
||||
}
|
||||
}
|
||||
first = false;
|
||||
}
|
||||
// String was not terminated.
|
||||
false
|
||||
}
|
||||
|
||||
|
|
@ -588,62 +586,71 @@ impl Cursor<'_> {
|
|||
/// if string is terminated.
|
||||
fn double_quoted_string(&mut self) -> bool {
|
||||
debug_assert!(self.prev() == '"');
|
||||
loop {
|
||||
match self.nth_char(0) {
|
||||
while let Some(c) = self.bump() {
|
||||
match c {
|
||||
'"' => {
|
||||
self.bump();
|
||||
return true;
|
||||
}
|
||||
EOF_CHAR if self.is_eof() => return false,
|
||||
'\\' if self.nth_char(1) == '\\' || self.nth_char(1) == '"' => {
|
||||
'\\' if self.first() == '\\' || self.first() == '"' => {
|
||||
// Bump again to skip escaped character.
|
||||
self.bump();
|
||||
}
|
||||
_ => (),
|
||||
}
|
||||
self.bump();
|
||||
}
|
||||
// End of file reached.
|
||||
false
|
||||
}
|
||||
|
||||
/// Eats the double-quoted string and returns a tuple of
|
||||
/// (amount of the '#' symbols, raw string started, raw string terminated)
|
||||
fn raw_double_quoted_string(&mut self) -> (usize, bool, bool) {
|
||||
debug_assert!(self.prev() == 'r');
|
||||
// Count opening '#' symbols.
|
||||
let n_hashes = {
|
||||
let mut acc: usize = 0;
|
||||
loop {
|
||||
match self.bump() {
|
||||
Some('#') => acc += 1,
|
||||
Some('"') => break acc,
|
||||
None | Some(_) => return (acc, false, false),
|
||||
}
|
||||
}
|
||||
};
|
||||
let mut started: bool = false;
|
||||
let mut finished: bool = false;
|
||||
|
||||
// Skip the string itself and check that amount of closing '#'
|
||||
// symbols is equal to the amount of opening ones.
|
||||
loop {
|
||||
match self.bump() {
|
||||
Some('"') => {
|
||||
let mut acc = n_hashes;
|
||||
while self.nth_char(0) == '#' && acc > 0 {
|
||||
self.bump();
|
||||
acc -= 1;
|
||||
}
|
||||
if acc == 0 {
|
||||
return (n_hashes, true, true);
|
||||
}
|
||||
}
|
||||
Some(_) => (),
|
||||
None => return (n_hashes, true, false),
|
||||
}
|
||||
// Count opening '#' symbols.
|
||||
let n_hashes = self.eat_while(|c| c == '#');
|
||||
|
||||
// Check that string is started.
|
||||
match self.bump() {
|
||||
Some('"') => started = true,
|
||||
_ => return (n_hashes, started, finished),
|
||||
}
|
||||
|
||||
// Skip the string contents and on each '#' character met, check if this is
|
||||
// a raw string termination.
|
||||
while !finished {
|
||||
self.eat_while(|c| c != '"');
|
||||
|
||||
if self.is_eof() {
|
||||
return (n_hashes, started, finished);
|
||||
}
|
||||
|
||||
// Eat closing double quote.
|
||||
self.bump();
|
||||
|
||||
// Check that amount of closing '#' symbols
|
||||
// is equal to the amount of opening ones.
|
||||
let mut hashes_left = n_hashes;
|
||||
let is_closing_hash = |c| {
|
||||
if c == '#' && hashes_left != 0 {
|
||||
hashes_left -= 1;
|
||||
true
|
||||
} else {
|
||||
false
|
||||
}
|
||||
};
|
||||
finished = self.eat_while(is_closing_hash) == n_hashes;
|
||||
}
|
||||
|
||||
(n_hashes, started, finished)
|
||||
}
|
||||
|
||||
fn eat_decimal_digits(&mut self) -> bool {
|
||||
let mut has_digits = false;
|
||||
loop {
|
||||
match self.nth_char(0) {
|
||||
match self.first() {
|
||||
'_' => {
|
||||
self.bump();
|
||||
}
|
||||
|
|
@ -660,7 +667,7 @@ impl Cursor<'_> {
|
|||
fn eat_hexadecimal_digits(&mut self) -> bool {
|
||||
let mut has_digits = false;
|
||||
loop {
|
||||
match self.nth_char(0) {
|
||||
match self.first() {
|
||||
'_' => {
|
||||
self.bump();
|
||||
}
|
||||
|
|
@ -674,23 +681,43 @@ impl Cursor<'_> {
|
|||
has_digits
|
||||
}
|
||||
|
||||
fn float_exponent(&mut self) -> Result<(), ()> {
|
||||
/// Eats the float exponent. Returns true if at least one digit was met,
|
||||
/// and returns false otherwise.
|
||||
fn eat_float_exponent(&mut self) -> bool {
|
||||
debug_assert!(self.prev() == 'e' || self.prev() == 'E');
|
||||
if self.nth_char(0) == '-' || self.nth_char(0) == '+' {
|
||||
if self.first() == '-' || self.first() == '+' {
|
||||
self.bump();
|
||||
}
|
||||
if self.eat_decimal_digits() { Ok(()) } else { Err(()) }
|
||||
self.eat_decimal_digits()
|
||||
}
|
||||
|
||||
// Eats the suffix if it's an identifier.
|
||||
// Eats the suffix of the literal, e.g. "_u8".
|
||||
fn eat_literal_suffix(&mut self) {
|
||||
if !is_id_start(self.nth_char(0)) {
|
||||
self.eat_identifier();
|
||||
}
|
||||
|
||||
// Eats the identifier.
|
||||
fn eat_identifier(&mut self) {
|
||||
if !is_id_start(self.first()) {
|
||||
return;
|
||||
}
|
||||
self.bump();
|
||||
|
||||
while is_id_continue(self.nth_char(0)) {
|
||||
self.eat_while(is_id_continue);
|
||||
}
|
||||
|
||||
/// Eats symbols while predicate returns true or until the end of file is reached.
|
||||
/// Returns amount of eaten symbols.
|
||||
fn eat_while<F>(&mut self, mut predicate: F) -> usize
|
||||
where
|
||||
F: FnMut(char) -> bool
|
||||
{
|
||||
let mut eaten: usize = 0;
|
||||
while predicate(self.first()) && !self.is_eof() {
|
||||
eaten += 1;
|
||||
self.bump();
|
||||
}
|
||||
|
||||
eaten
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue