Rollup merge of #66015 - popzxc:refactor-librustc_parser, r=matklad

librustc_lexer: Refactor the module. This PR introduces a refactoring of `librustc_lexer` in order to improve readability. All the changes performed are purely cosmetic and do not introduce any changes to the lexer logic or performance. The newly introduced modules `literal`, `token` and `utils` are just copy-pasted from `lib.rs` and do not contain even cosmetic changes (I decided to do it this way so that it is easier to review the changes by looking only at the diff). r? @petrochenkov cc @Centril @matklad
2019-11-06 07:03:05 +01:00 · 2019-11-06 07:03:05 +01:00 · 81550a00d1
commit 81550a00d1
parent 24af0c94b3 31735b02c9
2 changed files with 161 additions and 124 deletions
--- a/src/librustc_lexer/src/cursor.rs
+++ b/src/librustc_lexer/src/cursor.rs
@ -41,10 +41,20 @@ impl<'a> Cursor<'a> {
    /// If requested position doesn't exist, `EOF_CHAR` is returned.
    /// However, getting `EOF_CHAR` doesn't always mean actual end of file,
    /// it should be checked with `is_eof` method.
-    pub(crate) fn nth_char(&self, n: usize) -> char {
+    fn nth_char(&self, n: usize) -> char {
        self.chars().nth(n).unwrap_or(EOF_CHAR)
    }

+    /// Peeks the next symbol from the input stream without consuming it.
+    pub(crate) fn first(&self) -> char {
+        self.nth_char(0)
+    }
+
+    /// Peeks the second symbol from the input stream without consuming it.
+    pub(crate) fn second(&self) -> char {
+        self.nth_char(1)
+    }
+
    /// Checks if there is nothing more to consume.
    pub(crate) fn is_eof(&self) -> bool {
        self.chars.as_str().is_empty()
--- a/src/librustc_lexer/src/lib.rs
+++ b/src/librustc_lexer/src/lib.rs
@ -18,6 +18,8 @@ mod cursor;
 pub mod unescape;

 use crate::cursor::{Cursor, EOF_CHAR};
+use self::TokenKind::*;
+use self::LiteralKind::*;

 /// Parsed token.
 /// It doesn't contain information about data that has been parsed,
@ -116,7 +118,6 @@ pub enum TokenKind {
    /// Unknown token, not expected by the lexer, e.g. "№"
    Unknown,
 }
-use self::TokenKind::*;

 #[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
 pub enum LiteralKind {
@ -137,7 +138,6 @@ pub enum LiteralKind {
    /// "br"abc"", "br#"abc"#", "br####"ab"###"c"####", "br#"a"
    RawByteStr { n_hashes: usize, started: bool, terminated: bool },
 }
-use self::LiteralKind::*;

 /// Base of numeric literal encoding according to its prefix.
 #[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
@ -241,14 +241,13 @@ pub fn is_id_continue(c: char) -> bool {
        || (c > '\x7f' && unicode_xid::UnicodeXID::is_xid_continue(c))
 }

-
 impl Cursor<'_> {
    /// Parses a token from the input string.
    fn advance_token(&mut self) -> Token {
        let first_char = self.bump().unwrap();
        let token_kind = match first_char {
            // Slash, comment or block comment.
-            '/' => match self.nth_char(0) {
+            '/' => match self.first() {
                '/' => self.line_comment(),
                '*' => self.block_comment(),
                _ => Slash,
@ -257,8 +256,8 @@ impl Cursor<'_> {
            // Whitespace sequence.
            c if is_whitespace(c) => self.whitespace(),

-            // Raw string literal or identifier.
-            'r' => match (self.nth_char(0), self.nth_char(1)) {
+            // Raw identifier, raw string literal or identifier.
+            'r' => match (self.first(), self.second()) {
                ('#', c1) if is_id_start(c1) => self.raw_ident(),
                ('#', _) | ('"', _) => {
                    let (n_hashes, started, terminated) = self.raw_double_quoted_string();
@ -273,7 +272,7 @@ impl Cursor<'_> {
            },

            // Byte literal, byte string literal, raw byte string literal or identifier.
-            'b' => match (self.nth_char(0), self.nth_char(1)) {
+            'b' => match (self.first(), self.second()) {
                ('\'', _) => {
                    self.bump();
                    let terminated = self.single_quoted_string();
@ -366,31 +365,23 @@ impl Cursor<'_> {
    }

    fn line_comment(&mut self) -> TokenKind {
-        debug_assert!(self.prev() == '/' && self.nth_char(0) == '/');
+        debug_assert!(self.prev() == '/' && self.first() == '/');
        self.bump();
-        loop {
-            match self.nth_char(0) {
-                '\n' => break,
-                EOF_CHAR if self.is_eof() => break,
-                _ => {
-                    self.bump();
-                }
-            }
-        }
+        self.eat_while(|c| c != '\n');
        LineComment
    }

    fn block_comment(&mut self) -> TokenKind {
-        debug_assert!(self.prev() == '/' && self.nth_char(0) == '*');
+        debug_assert!(self.prev() == '/' && self.first() == '*');
        self.bump();
        let mut depth = 1usize;
        while let Some(c) = self.bump() {
            match c {
-                '/' if self.nth_char(0) == '*' => {
+                '/' if self.first() == '*' => {
                    self.bump();
                    depth += 1;
                }
-                '*' if self.nth_char(0) == '/' => {
+                '*' if self.first() == '/' => {
                    self.bump();
                    depth -= 1;
                    if depth == 0 {
@ -409,31 +400,27 @@ impl Cursor<'_> {

    fn whitespace(&mut self) -> TokenKind {
        debug_assert!(is_whitespace(self.prev()));
-        while is_whitespace(self.nth_char(0)) {
-            self.bump();
-        }
+        self.eat_while(is_whitespace);
        Whitespace
    }

    fn raw_ident(&mut self) -> TokenKind {
        debug_assert!(
            self.prev() == 'r'
-                && self.nth_char(0) == '#'
-                && is_id_start(self.nth_char(1))
+                && self.first() == '#'
+                && is_id_start(self.second())
        );
+        // Eat "#" symbol.
        self.bump();
-        self.bump();
-        while is_id_continue(self.nth_char(0)) {
-            self.bump();
-        }
+        // Eat the identifier part of RawIdent.
+        self.eat_identifier();
        RawIdent
    }

    fn ident(&mut self) -> TokenKind {
        debug_assert!(is_id_start(self.prev()));
-        while is_id_continue(self.nth_char(0)) {
-            self.bump();
-        }
+        // Start is already eaten, eat the rest of identifier.
+        self.eat_while(is_id_continue);
        Ident
    }

@ -442,7 +429,7 @@ impl Cursor<'_> {
        let mut base = Base::Decimal;
        if first_digit == '0' {
            // Attempt to parse encoding base.
-            let has_digits = match self.nth_char(0) {
+            let has_digits = match self.first() {
                'b' => {
                    base = Base::Binary;
                    self.bump();
@ -476,23 +463,23 @@ impl Cursor<'_> {
            self.eat_decimal_digits();
        };

-        match self.nth_char(0) {
+        match self.first() {
            // Don't be greedy if this is actually an
            // integer literal followed by field/method access or a range pattern
            // (`0..2` and `12.foo()`)
-            '.' if self.nth_char(1) != '.'
-                && !is_id_start(self.nth_char(1)) =>
+            '.' if self.second() != '.'
+                && !is_id_start(self.second()) =>
            {
                // might have stuff after the ., and if it does, it needs to start
                // with a number
                self.bump();
                let mut empty_exponent = false;
-                if self.nth_char(0).is_digit(10) {
+                if self.first().is_digit(10) {
                    self.eat_decimal_digits();
-                    match self.nth_char(0) {
+                    match self.first() {
                        'e' | 'E' => {
                            self.bump();
-                            empty_exponent = self.float_exponent().is_err()
+                            empty_exponent = !self.eat_float_exponent();
                        }
                        _ => (),
                    }
@ -501,7 +488,7 @@ impl Cursor<'_> {
            }
            'e' | 'E' => {
                self.bump();
-                let empty_exponent = self.float_exponent().is_err();
+                let empty_exponent = !self.eat_float_exponent();
                Float { base, empty_exponent }
            }
            _ => Int { base, empty_int: false },
@ -510,65 +497,76 @@ impl Cursor<'_> {

    fn lifetime_or_char(&mut self) -> TokenKind {
        debug_assert!(self.prev() == '\'');
-        let mut starts_with_number = false;

-        // Check if the first symbol after '\'' is a valid identifier
-        // character or a number (not a digit followed by '\'').
-        if (is_id_start(self.nth_char(0))
-            || self.nth_char(0).is_digit(10) && {
-                starts_with_number = true;
-                true
-            })
-            && self.nth_char(1) != '\''
-        {
-            self.bump();
+        let can_be_a_lifetime = if self.second() == '\'' {
+            // It's surely not a lifetime.
+            false
+        } else {
+            // If the first symbol is valid for identifier, it can be a lifetime.
+            // Also check if it's a number for a better error reporting (so '0 will
+            // be reported as invalid lifetime and not as unterminated char literal).
+            is_id_start(self.first()) || self.first().is_digit(10)
+        };

-            // Skip the identifier.
-            while is_id_continue(self.nth_char(0)) {
-                self.bump();
+        if !can_be_a_lifetime {
+            let terminated = self.single_quoted_string();
+            let suffix_start = self.len_consumed();
+            if terminated {
+                self.eat_literal_suffix();
            }
-
-            return if self.nth_char(0) == '\'' {
-                self.bump();
-                let kind = Char { terminated: true };
-                Literal { kind, suffix_start: self.len_consumed() }
-            } else {
-                Lifetime { starts_with_number }
-            };
+            let kind = Char { terminated };
+            return Literal { kind, suffix_start };
        }

-        // This is not a lifetime (checked above), parse a char literal.
-        let terminated = self.single_quoted_string();
-        let suffix_start = self.len_consumed();
-        if terminated {
-            self.eat_literal_suffix();
+        // Either a lifetime or a character literal with
+        // length greater than 1.
+
+        let starts_with_number = self.first().is_digit(10);
+
+        // Skip the literal contents.
+        // First symbol can be a number (which isn't a valid identifier start),
+        // so skip it without any checks.
+        self.bump();
+        self.eat_while(is_id_continue);
+
+        // Check if after skipping literal contents we've met a closing
+        // single quote (which means that user attempted to create a
+        // string with single quotes).
+        if self.first() == '\'' {
+            self.bump();
+            let kind = Char { terminated: true };
+            return Literal { kind, suffix_start: self.len_consumed() };
        }
-        let kind = Char { terminated };
-        return Literal { kind, suffix_start };
+
+        return Lifetime { starts_with_number };
    }

    fn single_quoted_string(&mut self) -> bool {
        debug_assert!(self.prev() == '\'');
-        // Parse `'''` as a single char literal.
-        if self.nth_char(0) == '\'' && self.nth_char(1) == '\'' {
+        // Check if it's a one-symbol literal.
+        if self.second() == '\'' && self.first() != '\\' {
            self.bump();
+            self.bump();
+            return true;
        }
+
+        // Literal has more than one symbol.
+
        // Parse until either quotes are terminated or error is detected.
-        let mut first = true;
        loop {
-            match self.nth_char(0) {
-                // Probably beginning of the comment, which we don't want to include
-                // to the error report.
-                '/' if !first => break,
-                // Newline without following '\'' means unclosed quote, stop parsing.
-                '\n' if self.nth_char(1) != '\'' => break,
-                // End of file, stop parsing.
-                EOF_CHAR if self.is_eof() => break,
+            match self.first() {
                // Quotes are terminated, finish parsing.
                '\'' => {
                    self.bump();
                    return true;
                }
+                // Probably beginning of the comment, which we don't want to include
+                // to the error report.
+                '/' => break,
+                // Newline without following '\'' means unclosed quote, stop parsing.
+                '\n' if self.second() != '\'' => break,
+                // End of file, stop parsing.
+                EOF_CHAR if self.is_eof() => break,
                // Escaped slash is considered one character, so bump twice.
                '\\' => {
                    self.bump();
@ -579,8 +577,8 @@ impl Cursor<'_> {
                    self.bump();
                }
            }
-            first = false;
        }
+        // String was not terminated.
        false
    }

@ -588,62 +586,71 @@ impl Cursor<'_> {
    /// if string is terminated.
    fn double_quoted_string(&mut self) -> bool {
        debug_assert!(self.prev() == '"');
-        loop {
-            match self.nth_char(0) {
+        while let Some(c) = self.bump() {
+            match c {
                '"' => {
-                    self.bump();
                    return true;
                }
-                EOF_CHAR if self.is_eof() => return false,
-                '\\' if self.nth_char(1) == '\\' || self.nth_char(1) == '"' => {
+                '\\' if self.first() == '\\' || self.first() == '"' => {
+                    // Bump again to skip escaped character.
                    self.bump();
                }
                _ => (),
            }
-            self.bump();
        }
+        // End of file reached.
+        false
    }

    /// Eats the double-quoted string and returns a tuple of
    /// (amount of the '#' symbols, raw string started, raw string terminated)
    fn raw_double_quoted_string(&mut self) -> (usize, bool, bool) {
        debug_assert!(self.prev() == 'r');
-        // Count opening '#' symbols.
-        let n_hashes = {
-            let mut acc: usize = 0;
-            loop {
-                match self.bump() {
-                    Some('#') => acc += 1,
-                    Some('"') => break acc,
-                    None | Some(_) => return (acc, false, false),
-                }
-            }
-        };
+        let mut started: bool = false;
+        let mut finished: bool = false;

-        // Skip the string itself and check that amount of closing '#'
-        // symbols is equal to the amount of opening ones.
-        loop {
-            match self.bump() {
-                Some('"') => {
-                    let mut acc = n_hashes;
-                    while self.nth_char(0) == '#' && acc > 0 {
-                        self.bump();
-                        acc -= 1;
-                    }
-                    if acc == 0 {
-                        return (n_hashes, true, true);
-                    }
-                }
-                Some(_) => (),
-                None => return (n_hashes, true, false),
-            }
+        // Count opening '#' symbols.
+        let n_hashes = self.eat_while(|c| c == '#');
+
+        // Check that string is started.
+        match self.bump() {
+            Some('"') => started = true,
+            _ => return (n_hashes, started, finished),
        }
+
+        // Skip the string contents and on each '#' character met, check if this is
+        // a raw string termination.
+        while !finished {
+            self.eat_while(|c| c != '"');
+
+            if self.is_eof() {
+                return (n_hashes, started, finished);
+            }
+
+            // Eat closing double quote.
+            self.bump();
+
+            // Check that amount of closing '#' symbols
+            // is equal to the amount of opening ones.
+            let mut hashes_left = n_hashes;
+            let is_closing_hash = |c| {
+                if c == '#' && hashes_left != 0 {
+                    hashes_left -= 1;
+                    true
+                } else {
+                    false
+                }
+            };
+            finished = self.eat_while(is_closing_hash) == n_hashes;
+        }
+
+        (n_hashes, started, finished)
    }

    fn eat_decimal_digits(&mut self) -> bool {
        let mut has_digits = false;
        loop {
-            match self.nth_char(0) {
+            match self.first() {
                '_' => {
                    self.bump();
                }
@ -660,7 +667,7 @@ impl Cursor<'_> {
    fn eat_hexadecimal_digits(&mut self) -> bool {
        let mut has_digits = false;
        loop {
-            match self.nth_char(0) {
+            match self.first() {
                '_' => {
                    self.bump();
                }
@ -674,23 +681,43 @@ impl Cursor<'_> {
        has_digits
    }

-    fn float_exponent(&mut self) -> Result<(), ()> {
+    /// Eats the float exponent. Returns true if at least one digit was met,
+    /// and returns false otherwise.
+    fn eat_float_exponent(&mut self) -> bool {
        debug_assert!(self.prev() == 'e' || self.prev() == 'E');
-        if self.nth_char(0) == '-' || self.nth_char(0) == '+' {
+        if self.first() == '-' || self.first() == '+' {
            self.bump();
        }
-        if self.eat_decimal_digits() { Ok(()) } else { Err(()) }
+        self.eat_decimal_digits()
    }

-    // Eats the suffix if it's an identifier.
+    // Eats the suffix of the literal, e.g. "_u8".
    fn eat_literal_suffix(&mut self) {
-        if !is_id_start(self.nth_char(0)) {
+        self.eat_identifier();
+    }
+
+    // Eats the identifier.
+    fn eat_identifier(&mut self) {
+        if !is_id_start(self.first()) {
            return;
        }
        self.bump();

-        while is_id_continue(self.nth_char(0)) {
+        self.eat_while(is_id_continue);
+    }
+
+    /// Eats symbols while predicate returns true or until the end of file is reached.
+    /// Returns amount of eaten symbols.
+    fn eat_while<F>(&mut self, mut predicate: F) -> usize
+    where
+        F: FnMut(char) -> bool
+    {
+        let mut eaten: usize = 0;
+        while predicate(self.first()) && !self.is_eof() {
+            eaten += 1;
            self.bump();
        }
+
+        eaten
    }
 }