Rollup merge of #66015 - popzxc:refactor-librustc_parser, r=matklad

librustc_lexer: Refactor the module

This PR introduces a refactoring of the `librustc_lexer` in order to improve readability.

All the changes performed are only cosmetic and do not introduce any changes to the lexer logic or performance.

Newly introduced modules `literal`, `token` and `utils` are just copy-pasted from `lib.rs` and do not contain even cosmetic changes (I decided to do it this way so that the changes are easier to review by looking only at the diff).

r? @petrochenkov

cc @Centril @matklad
This commit is contained in:
Mazdak Farrokhzad 2019-11-06 07:03:05 +01:00 committed by GitHub
commit 81550a00d1
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 161 additions and 124 deletions

View file

@ -41,10 +41,20 @@ impl<'a> Cursor<'a> {
/// If requested position doesn't exist, `EOF_CHAR` is returned.
/// However, getting `EOF_CHAR` doesn't always mean actual end of file,
/// it should be checked with `is_eof` method.
pub(crate) fn nth_char(&self, n: usize) -> char {
fn nth_char(&self, n: usize) -> char {
self.chars().nth(n).unwrap_or(EOF_CHAR)
}
/// Peeks the next symbol from the input stream without consuming it.
///
/// Shorthand for `nth_char(0)`: when there is no such symbol, `EOF_CHAR`
/// is returned, so use `is_eof` to detect the actual end of input.
pub(crate) fn first(&self) -> char {
self.nth_char(0)
}
/// Peeks the second symbol from the input stream without consuming it.
///
/// Shorthand for `nth_char(1)`: when there is no such symbol, `EOF_CHAR`
/// is returned, so use `is_eof` to detect the actual end of input.
pub(crate) fn second(&self) -> char {
self.nth_char(1)
}
/// Checks if there is nothing more to consume.
pub(crate) fn is_eof(&self) -> bool {
self.chars.as_str().is_empty()

View file

@ -18,6 +18,8 @@ mod cursor;
pub mod unescape;
use crate::cursor::{Cursor, EOF_CHAR};
use self::TokenKind::*;
use self::LiteralKind::*;
/// Parsed token.
/// It doesn't contain information about data that has been parsed,
@ -116,7 +118,6 @@ pub enum TokenKind {
/// Unknown token, not expected by the lexer, e.g. "№"
Unknown,
}
use self::TokenKind::*;
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub enum LiteralKind {
@ -137,7 +138,6 @@ pub enum LiteralKind {
/// "br"abc"", "br#"abc"#", "br####"ab"###"c"####", "br#"a"
RawByteStr { n_hashes: usize, started: bool, terminated: bool },
}
use self::LiteralKind::*;
/// Base of numeric literal encoding according to its prefix.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
@ -241,14 +241,13 @@ pub fn is_id_continue(c: char) -> bool {
|| (c > '\x7f' && unicode_xid::UnicodeXID::is_xid_continue(c))
}
impl Cursor<'_> {
/// Parses a token from the input string.
fn advance_token(&mut self) -> Token {
let first_char = self.bump().unwrap();
let token_kind = match first_char {
// Slash, comment or block comment.
'/' => match self.nth_char(0) {
'/' => match self.first() {
'/' => self.line_comment(),
'*' => self.block_comment(),
_ => Slash,
@ -257,8 +256,8 @@ impl Cursor<'_> {
// Whitespace sequence.
c if is_whitespace(c) => self.whitespace(),
// Raw string literal or identifier.
'r' => match (self.nth_char(0), self.nth_char(1)) {
// Raw identifier, raw string literal or identifier.
'r' => match (self.first(), self.second()) {
('#', c1) if is_id_start(c1) => self.raw_ident(),
('#', _) | ('"', _) => {
let (n_hashes, started, terminated) = self.raw_double_quoted_string();
@ -273,7 +272,7 @@ impl Cursor<'_> {
},
// Byte literal, byte string literal, raw byte string literal or identifier.
'b' => match (self.nth_char(0), self.nth_char(1)) {
'b' => match (self.first(), self.second()) {
('\'', _) => {
self.bump();
let terminated = self.single_quoted_string();
@ -366,31 +365,23 @@ impl Cursor<'_> {
}
fn line_comment(&mut self) -> TokenKind {
debug_assert!(self.prev() == '/' && self.nth_char(0) == '/');
debug_assert!(self.prev() == '/' && self.first() == '/');
self.bump();
loop {
match self.nth_char(0) {
'\n' => break,
EOF_CHAR if self.is_eof() => break,
_ => {
self.bump();
}
}
}
self.eat_while(|c| c != '\n');
LineComment
}
fn block_comment(&mut self) -> TokenKind {
debug_assert!(self.prev() == '/' && self.nth_char(0) == '*');
debug_assert!(self.prev() == '/' && self.first() == '*');
self.bump();
let mut depth = 1usize;
while let Some(c) = self.bump() {
match c {
'/' if self.nth_char(0) == '*' => {
'/' if self.first() == '*' => {
self.bump();
depth += 1;
}
'*' if self.nth_char(0) == '/' => {
'*' if self.first() == '/' => {
self.bump();
depth -= 1;
if depth == 0 {
@ -409,31 +400,27 @@ impl Cursor<'_> {
fn whitespace(&mut self) -> TokenKind {
debug_assert!(is_whitespace(self.prev()));
while is_whitespace(self.nth_char(0)) {
self.bump();
}
self.eat_while(is_whitespace);
Whitespace
}
fn raw_ident(&mut self) -> TokenKind {
debug_assert!(
self.prev() == 'r'
&& self.nth_char(0) == '#'
&& is_id_start(self.nth_char(1))
&& self.first() == '#'
&& is_id_start(self.second())
);
// Eat "#" symbol.
self.bump();
self.bump();
while is_id_continue(self.nth_char(0)) {
self.bump();
}
// Eat the identifier part of RawIdent.
self.eat_identifier();
RawIdent
}
fn ident(&mut self) -> TokenKind {
debug_assert!(is_id_start(self.prev()));
while is_id_continue(self.nth_char(0)) {
self.bump();
}
// Start is already eaten, eat the rest of identifier.
self.eat_while(is_id_continue);
Ident
}
@ -442,7 +429,7 @@ impl Cursor<'_> {
let mut base = Base::Decimal;
if first_digit == '0' {
// Attempt to parse encoding base.
let has_digits = match self.nth_char(0) {
let has_digits = match self.first() {
'b' => {
base = Base::Binary;
self.bump();
@ -476,23 +463,23 @@ impl Cursor<'_> {
self.eat_decimal_digits();
};
match self.nth_char(0) {
match self.first() {
// Don't be greedy if this is actually an
// integer literal followed by field/method access or a range pattern
// (`0..2` and `12.foo()`)
'.' if self.nth_char(1) != '.'
&& !is_id_start(self.nth_char(1)) =>
'.' if self.second() != '.'
&& !is_id_start(self.second()) =>
{
// might have stuff after the ., and if it does, it needs to start
// with a number
self.bump();
let mut empty_exponent = false;
if self.nth_char(0).is_digit(10) {
if self.first().is_digit(10) {
self.eat_decimal_digits();
match self.nth_char(0) {
match self.first() {
'e' | 'E' => {
self.bump();
empty_exponent = self.float_exponent().is_err()
empty_exponent = !self.eat_float_exponent();
}
_ => (),
}
@ -501,7 +488,7 @@ impl Cursor<'_> {
}
'e' | 'E' => {
self.bump();
let empty_exponent = self.float_exponent().is_err();
let empty_exponent = !self.eat_float_exponent();
Float { base, empty_exponent }
}
_ => Int { base, empty_int: false },
@ -510,65 +497,76 @@ impl Cursor<'_> {
fn lifetime_or_char(&mut self) -> TokenKind {
debug_assert!(self.prev() == '\'');
let mut starts_with_number = false;
// Check if the first symbol after '\'' is a valid identifier
// character or a number (not a digit followed by '\'').
if (is_id_start(self.nth_char(0))
|| self.nth_char(0).is_digit(10) && {
starts_with_number = true;
true
})
&& self.nth_char(1) != '\''
{
self.bump();
let can_be_a_lifetime = if self.second() == '\'' {
// It's surely not a lifetime.
false
} else {
// If the first symbol is valid for identifier, it can be a lifetime.
// Also check if it's a number for a better error reporting (so '0 will
// be reported as invalid lifetime and not as unterminated char literal).
is_id_start(self.first()) || self.first().is_digit(10)
};
// Skip the identifier.
while is_id_continue(self.nth_char(0)) {
self.bump();
if !can_be_a_lifetime {
let terminated = self.single_quoted_string();
let suffix_start = self.len_consumed();
if terminated {
self.eat_literal_suffix();
}
return if self.nth_char(0) == '\'' {
self.bump();
let kind = Char { terminated: true };
Literal { kind, suffix_start: self.len_consumed() }
} else {
Lifetime { starts_with_number }
};
let kind = Char { terminated };
return Literal { kind, suffix_start };
}
// This is not a lifetime (checked above), parse a char literal.
let terminated = self.single_quoted_string();
let suffix_start = self.len_consumed();
if terminated {
self.eat_literal_suffix();
// Either a lifetime or a character literal with
// length greater than 1.
let starts_with_number = self.first().is_digit(10);
// Skip the literal contents.
// First symbol can be a number (which isn't a valid identifier start),
// so skip it without any checks.
self.bump();
self.eat_while(is_id_continue);
// Check if after skipping literal contents we've met a closing
// single quote (which means that user attempted to create a
// string with single quotes).
if self.first() == '\'' {
self.bump();
let kind = Char { terminated: true };
return Literal { kind, suffix_start: self.len_consumed() };
}
let kind = Char { terminated };
return Literal { kind, suffix_start };
return Lifetime { starts_with_number };
}
fn single_quoted_string(&mut self) -> bool {
debug_assert!(self.prev() == '\'');
// Parse `'''` as a single char literal.
if self.nth_char(0) == '\'' && self.nth_char(1) == '\'' {
// Check if it's a one-symbol literal.
if self.second() == '\'' && self.first() != '\\' {
self.bump();
self.bump();
return true;
}
// Literal has more than one symbol.
// Parse until either quotes are terminated or error is detected.
let mut first = true;
loop {
match self.nth_char(0) {
// Probably beginning of the comment, which we don't want to include
// to the error report.
'/' if !first => break,
// Newline without following '\'' means unclosed quote, stop parsing.
'\n' if self.nth_char(1) != '\'' => break,
// End of file, stop parsing.
EOF_CHAR if self.is_eof() => break,
match self.first() {
// Quotes are terminated, finish parsing.
'\'' => {
self.bump();
return true;
}
// Probably beginning of the comment, which we don't want to include
// to the error report.
'/' => break,
// Newline without following '\'' means unclosed quote, stop parsing.
'\n' if self.second() != '\'' => break,
// End of file, stop parsing.
EOF_CHAR if self.is_eof() => break,
// Escaped slash is considered one character, so bump twice.
'\\' => {
self.bump();
@ -579,8 +577,8 @@ impl Cursor<'_> {
self.bump();
}
}
first = false;
}
// String was not terminated.
false
}
@ -588,62 +586,71 @@ impl Cursor<'_> {
/// if string is terminated.
fn double_quoted_string(&mut self) -> bool {
debug_assert!(self.prev() == '"');
loop {
match self.nth_char(0) {
while let Some(c) = self.bump() {
match c {
'"' => {
self.bump();
return true;
}
EOF_CHAR if self.is_eof() => return false,
'\\' if self.nth_char(1) == '\\' || self.nth_char(1) == '"' => {
'\\' if self.first() == '\\' || self.first() == '"' => {
// Bump again to skip escaped character.
self.bump();
}
_ => (),
}
self.bump();
}
// End of file reached.
false
}
/// Eats the double-quoted string and returns a tuple of
/// (amount of the '#' symbols, raw string started, raw string terminated)
fn raw_double_quoted_string(&mut self) -> (usize, bool, bool) {
debug_assert!(self.prev() == 'r');
// Count opening '#' symbols.
let n_hashes = {
let mut acc: usize = 0;
loop {
match self.bump() {
Some('#') => acc += 1,
Some('"') => break acc,
None | Some(_) => return (acc, false, false),
}
}
};
let mut started: bool = false;
let mut finished: bool = false;
// Skip the string itself and check that amount of closing '#'
// symbols is equal to the amount of opening ones.
loop {
match self.bump() {
Some('"') => {
let mut acc = n_hashes;
while self.nth_char(0) == '#' && acc > 0 {
self.bump();
acc -= 1;
}
if acc == 0 {
return (n_hashes, true, true);
}
}
Some(_) => (),
None => return (n_hashes, true, false),
}
// Count opening '#' symbols.
let n_hashes = self.eat_while(|c| c == '#');
// Check that string is started.
match self.bump() {
Some('"') => started = true,
_ => return (n_hashes, started, finished),
}
// Skip the string contents and, on each '"' character met, check if this is
// a raw string termination.
while !finished {
self.eat_while(|c| c != '"');
if self.is_eof() {
return (n_hashes, started, finished);
}
// Eat closing double quote.
self.bump();
// Check that amount of closing '#' symbols
// is equal to the amount of opening ones.
let mut hashes_left = n_hashes;
let is_closing_hash = |c| {
if c == '#' && hashes_left != 0 {
hashes_left -= 1;
true
} else {
false
}
};
finished = self.eat_while(is_closing_hash) == n_hashes;
}
(n_hashes, started, finished)
}
fn eat_decimal_digits(&mut self) -> bool {
let mut has_digits = false;
loop {
match self.nth_char(0) {
match self.first() {
'_' => {
self.bump();
}
@ -660,7 +667,7 @@ impl Cursor<'_> {
fn eat_hexadecimal_digits(&mut self) -> bool {
let mut has_digits = false;
loop {
match self.nth_char(0) {
match self.first() {
'_' => {
self.bump();
}
@ -674,23 +681,43 @@ impl Cursor<'_> {
has_digits
}
fn float_exponent(&mut self) -> Result<(), ()> {
/// Eats the float exponent. Returns true if at least one digit was met,
/// and returns false otherwise.
fn eat_float_exponent(&mut self) -> bool {
debug_assert!(self.prev() == 'e' || self.prev() == 'E');
if self.nth_char(0) == '-' || self.nth_char(0) == '+' {
if self.first() == '-' || self.first() == '+' {
self.bump();
}
if self.eat_decimal_digits() { Ok(()) } else { Err(()) }
self.eat_decimal_digits()
}
// Eats the suffix if it's an identifier.
// Eats the suffix of the literal, e.g. "_u8".
fn eat_literal_suffix(&mut self) {
if !is_id_start(self.nth_char(0)) {
self.eat_identifier();
}
// Eats the identifier.
fn eat_identifier(&mut self) {
if !is_id_start(self.first()) {
return;
}
self.bump();
while is_id_continue(self.nth_char(0)) {
self.eat_while(is_id_continue);
}
/// Consumes symbols for as long as `predicate` accepts the upcoming one,
/// stopping early once the end of file is reached.
/// Returns the number of symbols that were consumed.
fn eat_while<F>(&mut self, mut predicate: F) -> usize
where
    F: FnMut(char) -> bool,
{
    let mut count = 0usize;
    loop {
        // The predicate is consulted before the end-of-file check, so a
        // stateful predicate observes exactly the same sequence of calls.
        if !predicate(self.first()) || self.is_eof() {
            return count;
        }
        self.bump();
        count += 1;
    }
}
}