diff --git a/src/librustdoc/html/highlight.rs b/src/librustdoc/html/highlight.rs
index af7a822bc29e..3cb5cdc04396 100644
--- a/src/librustdoc/html/highlight.rs
+++ b/src/librustdoc/html/highlight.rs
@@ -18,7 +18,6 @@ use std::io;
use syntax::parse;
use syntax::parse::lexer;
-use syntax::codemap::{BytePos, Span};
use html::escape::Escape;
@@ -59,38 +58,30 @@ fn doit(sess: &parse::ParseSess, mut lexer: lexer::StringReader,
None => {}
}
try!(write!(out, "class='rust {}'>\n", class.unwrap_or("")));
- let mut last = BytePos(0);
let mut is_attribute = false;
let mut is_macro = false;
let mut is_macro_nonterminal = false;
loop {
let next = lexer.next_token();
- let test = if next.tok == t::EOF {lexer.pos} else {next.sp.lo};
- // The lexer consumes all whitespace and non-doc-comments when iterating
- // between tokens. If this token isn't directly adjacent to our last
- // token, then we need to emit the whitespace/comment.
- //
- // If the gap has any '/' characters then we consider the whole thing a
- // comment. This will classify some whitespace as a comment, but that
- // doesn't matter too much for syntax highlighting purposes.
- if test > last {
- let snip = sess.span_diagnostic.cm.span_to_snippet(Span {
- lo: last,
- hi: test,
- expn_info: None,
- }).unwrap();
- if snip.as_slice().contains("/") {
- try!(write!(out, "",
- Escape(snip.as_slice())));
- } else {
- try!(write!(out, "{}", Escape(snip.as_slice())));
- }
- }
- last = next.sp.hi;
+ let snip = |sp| sess.span_diagnostic.cm.span_to_snippet(sp).unwrap();
+
if next.tok == t::EOF { break }
let klass = match next.tok {
+ t::WS => {
+ try!(write!(out, "{}", Escape(snip(next.sp).as_slice())));
+ continue
+ },
+ t::COMMENT => {
+ try!(write!(out, "",
+ Escape(snip(next.sp).as_slice())));
+ continue
+ },
+ t::SHEBANG(s) => {
+ try!(write!(out, "{}", Escape(s.as_str())));
+ continue
+ },
// If this '&' token is directly adjacent to another token, assume
// that it's the address-of operator instead of the and-operator.
// This allows us to give all pointers their own class (`Box` and
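With this change the highlighter no longer reconstructs inter-token gaps from byte positions: whitespace and comments arrive as ordinary tokens and are classified directly. A minimal sketch of that shape, assuming a toy `Token` type that carries its text inline (the real code recovers the text from the token's span via `span_to_snippet`):

```rust
// Illustrative only, not the rustdoc code: classify trivia tokens
// directly instead of computing the gaps between token spans.
enum Token {
    Ws(String),
    Comment(String),
    Ident(String),
    Eof,
}

fn highlight(tokens: Vec<Token>) -> String {
    let mut out = String::new();
    for tok in tokens {
        match tok {
            // Whitespace is emitted verbatim; no span arithmetic needed.
            Token::Ws(s) => out.push_str(&s),
            // Each comment token gets the `comment` class exactly once.
            Token::Comment(s) => {
                out.push_str(&format!("<span class='comment'>{}</span>", s));
            }
            Token::Ident(s) => out.push_str(&s),
            Token::Eof => break,
        }
    }
    out
}

fn main() {
    let toks = vec![
        Token::Comment("// a comment".into()),
        Token::Ws("\n".into()),
        Token::Ident("main".into()),
        Token::Eof,
    ];
    assert_eq!(
        highlight(toks),
        "<span class='comment'>// a comment</span>\nmain"
    );
}
```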
diff --git a/src/libsyntax/parse/attr.rs b/src/libsyntax/parse/attr.rs
index b2297ec770cc..c227d8a0fedc 100644
--- a/src/libsyntax/parse/attr.rs
+++ b/src/libsyntax/parse/attr.rs
@@ -34,7 +34,7 @@ impl<'a> ParserAttr for Parser<'a> {
fn parse_outer_attributes(&mut self) -> Vec<ast::Attribute> {
let mut attrs: Vec<ast::Attribute> = Vec::new();
loop {
- debug!("parse_outer_attributes: self.token={:?}",
+ debug!("parse_outer_attributes: self.token={}",
self.token);
match self.token {
token::POUND => {
diff --git a/src/libsyntax/parse/lexer/comments.rs b/src/libsyntax/parse/lexer/comments.rs
index c5dd10382a95..3f3a8a723f10 100644
--- a/src/libsyntax/parse/lexer/comments.rs
+++ b/src/libsyntax/parse/lexer/comments.rs
@@ -13,7 +13,7 @@ use codemap::{BytePos, CharPos, CodeMap, Pos};
use diagnostic;
use parse::lexer::{is_whitespace, Reader};
use parse::lexer::{StringReader, TokenAndSpan};
-use parse::lexer::{is_line_non_doc_comment, is_block_non_doc_comment};
+use parse::lexer::is_block_doc_comment;
use parse::lexer;
use parse::token;
@@ -42,9 +42,9 @@ pub struct Comment {
}
pub fn is_doc_comment(s: &str) -> bool {
- (s.starts_with("///") && !is_line_non_doc_comment(s)) ||
+ (s.starts_with("///") && super::is_doc_comment(s)) ||
s.starts_with("//!") ||
- (s.starts_with("/**") && !is_block_non_doc_comment(s)) ||
+ (s.starts_with("/**") && is_block_doc_comment(s)) ||
s.starts_with("/*!")
}
@@ -260,7 +260,7 @@ fn read_block_comment(rdr: &mut StringReader,
rdr.bump();
rdr.bump();
}
- if !is_block_non_doc_comment(curr_line.as_slice()) {
+ if is_block_doc_comment(curr_line.as_slice()) {
return
}
assert!(!curr_line.as_slice().contains_char('\n'));
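The two predicates this file now calls into can be restated in isolation: `///` opens a doc comment unless a fourth `/` follows, `/**` unless a third `*` follows, and `//!`/`/*!` always do. A self-contained sketch of the same logic (hypothetical free functions, mirroring `is_doc_comment` and `is_block_doc_comment` in `parse::lexer`):

```rust
// `////...` is a ruler, not documentation; `/***...` is decoration.
fn is_line_doc_comment(s: &str) -> bool {
    (s.starts_with("///") && s.as_bytes().get(3) != Some(&b'/'))
        || s.starts_with("//!")
}

fn is_block_doc_comment(s: &str) -> bool {
    (s.starts_with("/**") && s.as_bytes().get(3) != Some(&b'*'))
        || s.starts_with("/*!")
}

fn main() {
    assert!(is_line_doc_comment("/// docs"));
    assert!(!is_line_doc_comment("//// a ruler, not docs"));
    assert!(is_block_doc_comment("/*! inner docs */"));
    assert!(!is_block_doc_comment("/*** decorative ***/"));
}
```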
diff --git a/src/libsyntax/parse/lexer/mod.rs b/src/libsyntax/parse/lexer/mod.rs
index 61a37f77d348..947f3d59b86f 100644
--- a/src/libsyntax/parse/lexer/mod.rs
+++ b/src/libsyntax/parse/lexer/mod.rs
@@ -187,7 +187,7 @@ impl<'a> StringReader<'a> {
/// Advance peek_tok and peek_span to refer to the next token, and
/// possibly update the interner.
fn advance_token(&mut self) {
- match self.consume_whitespace_and_comments() {
+ match self.scan_whitespace_or_comment() {
Some(comment) => {
self.peek_span = comment.sp;
self.peek_tok = comment.tok;
@@ -339,8 +339,7 @@ impl<'a> StringReader<'a> {
/// PRECONDITION: self.curr is not whitespace
/// Eats any kind of comment.
- /// Returns a Some(sugared-doc-attr) if one exists, None otherwise
- fn consume_any_line_comment(&mut self) -> Option<TokenAndSpan> {
+ fn scan_comment(&mut self) -> Option<TokenAndSpan> {
match self.curr {
Some(c) => {
if c.is_whitespace() {
@@ -375,28 +374,32 @@ impl<'a> StringReader<'a> {
}
self.bump();
}
- let ret = self.with_str_from(start_bpos, |string| {
+ return self.with_str_from(start_bpos, |string| {
// but comments with only more "/"s are not
- if !is_line_non_doc_comment(string) {
- Some(TokenAndSpan{
- tok: token::DOC_COMMENT(str_to_ident(string)),
- sp: codemap::mk_sp(start_bpos, self.last_pos)
- })
+ let tok = if is_doc_comment(string) {
+ token::DOC_COMMENT(str_to_ident(string))
} else {
- None
- }
- });
+ token::COMMENT
+ };
- if ret.is_some() {
- return ret;
- }
+ return Some(TokenAndSpan{
+ tok: tok,
+ sp: codemap::mk_sp(start_bpos, self.last_pos)
+ });
+ });
} else {
+ let start_bpos = self.last_pos - BytePos(2);
while !self.curr_is('\n') && !self.is_eof() { self.bump(); }
+ return Some(TokenAndSpan {
+ tok: token::COMMENT,
+ sp: codemap::mk_sp(start_bpos, self.last_pos)
+ });
}
- // Restart whitespace munch.
- self.consume_whitespace_and_comments()
}
- Some('*') => { self.bump(); self.bump(); self.consume_block_comment() }
+ Some('*') => {
+ self.bump(); self.bump();
+ self.scan_block_comment()
+ }
_ => None
}
} else if self.curr_is('#') {
@@ -412,9 +415,15 @@ impl<'a> StringReader<'a> {
let cmap = CodeMap::new();
cmap.files.borrow_mut().push(self.filemap.clone());
let loc = cmap.lookup_char_pos_adj(self.last_pos);
+ debug!("Skipping a shebang");
if loc.line == 1u && loc.col == CharPos(0u) {
+ // FIXME: Add shebang "token", return it
+ let start = self.last_pos;
while !self.curr_is('\n') && !self.is_eof() { self.bump(); }
- return self.consume_whitespace_and_comments();
+ return Some(TokenAndSpan {
+ tok: token::SHEBANG(self.ident_from(start)),
+ sp: codemap::mk_sp(start, self.last_pos)
+ });
}
}
None
@@ -423,15 +432,33 @@ impl<'a> StringReader<'a> {
}
}
- /// EFFECT: eats whitespace and comments.
- /// Returns a Some(sugared-doc-attr) if one exists, None otherwise.
- fn consume_whitespace_and_comments(&mut self) -> Option<TokenAndSpan> {
- while is_whitespace(self.curr) { self.bump(); }
- return self.consume_any_line_comment();
+ /// If there is whitespace, shebang, or a comment, scan it. Otherwise,
+ /// return None.
+ fn scan_whitespace_or_comment(&mut self) -> Option<TokenAndSpan> {
+ match self.curr.unwrap_or('\0') {
+ // # to handle shebang at start of file -- this is the entry point
+ // for skipping over all "junk"
+ '/' | '#' => {
+ let c = self.scan_comment();
+ debug!("scanning a comment {}", c);
+ c
+ },
+ c if is_whitespace(Some(c)) => {
+ let start_bpos = self.last_pos;
+ while is_whitespace(self.curr) { self.bump(); }
+ let c = Some(TokenAndSpan {
+ tok: token::WS,
+ sp: codemap::mk_sp(start_bpos, self.last_pos)
+ });
+ debug!("scanning whitespace: {}", c);
+ c
+ },
+ _ => None
+ }
}
/// Might return a sugared-doc-attr
- fn consume_block_comment(&mut self) -> Option<TokenAndSpan> {
+ fn scan_block_comment(&mut self) -> Option<TokenAndSpan> {
// block comments starting with "/**" or "/*!" are doc-comments
let is_doc_comment = self.curr_is('*') || self.curr_is('!');
let start_bpos = self.last_pos - BytePos(2);
@@ -466,28 +493,23 @@ impl<'a> StringReader<'a> {
self.bump();
}
- let res = if is_doc_comment {
- self.with_str_from(start_bpos, |string| {
- // but comments with only "*"s between two "/"s are not
- if !is_block_non_doc_comment(string) {
- let string = if has_cr {
- self.translate_crlf(start_bpos, string,
- "bare CR not allowed in block doc-comment")
- } else { string.into_maybe_owned() };
- Some(TokenAndSpan{
- tok: token::DOC_COMMENT(str_to_ident(string.as_slice())),
- sp: codemap::mk_sp(start_bpos, self.last_pos)
- })
- } else {
- None
- }
- })
- } else {
- None
- };
+ self.with_str_from(start_bpos, |string| {
+ // but comments with only "*"s between two "/"s are not
+ let tok = if is_block_doc_comment(string) {
+ let string = if has_cr {
+ self.translate_crlf(start_bpos, string,
+ "bare CR not allowed in block doc-comment")
+ } else { string.into_maybe_owned() };
+ token::DOC_COMMENT(str_to_ident(string.as_slice()))
+ } else {
+ token::COMMENT
+ };
- // restart whitespace munch.
- if res.is_some() { res } else { self.consume_whitespace_and_comments() }
+ Some(TokenAndSpan{
+ tok: tok,
+ sp: codemap::mk_sp(start_bpos, self.last_pos)
+ })
+ })
}
/// Scan through any digits (base `radix`) or underscores, and return how
@@ -1242,12 +1264,18 @@ fn in_range(c: Option<char>, lo: char, hi: char) -> bool {
fn is_dec_digit(c: Option<char>) -> bool { return in_range(c, '0', '9'); }
-pub fn is_line_non_doc_comment(s: &str) -> bool {
- s.starts_with("////")
+pub fn is_doc_comment(s: &str) -> bool {
+ let res = (s.starts_with("///") && *s.as_bytes().get(3).unwrap_or(&b' ') != b'/')
+ || s.starts_with("//!");
+ debug!("is `{}` a doc comment? {}", s, res);
+ res
}
-pub fn is_block_non_doc_comment(s: &str) -> bool {
- s.starts_with("/***")
+pub fn is_block_doc_comment(s: &str) -> bool {
+ let res = (s.starts_with("/**") && *s.as_bytes().get(3).unwrap_or(&b' ') != b'*')
+ || s.starts_with("/*!");
+ debug!("is `{}` a doc comment? {}", s, res);
+ res
}
fn ident_start(c: Option<char>) -> bool {
@@ -1383,9 +1411,9 @@ mod test {
}
#[test] fn line_doc_comments() {
- assert!(!is_line_non_doc_comment("///"));
- assert!(!is_line_non_doc_comment("/// blah"));
- assert!(is_line_non_doc_comment("////"));
+ assert!(is_doc_comment("///"));
+ assert!(is_doc_comment("/// blah"));
+ assert!(!is_doc_comment("////"));
}
#[test] fn nested_block_comments() {
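The net effect of `scan_whitespace_or_comment` is that the lexer peeks at one character and then scans a maximal whitespace run or a whole comment as a single token, rather than silently discarding the text. A standalone sketch of that dispatch over a plain `&str` (`scan_junk` and the `Junk` type are hypothetical; the real code drives the `StringReader` and records spans):

```rust
#[derive(Debug, PartialEq)]
enum Junk {
    Ws(String),
    Comment(String),
    None,
}

fn scan_junk(src: &str) -> (Junk, &str) {
    match src.chars().next() {
        // A maximal whitespace run becomes a single WS-style token.
        Some(c) if c.is_whitespace() => {
            let end = src.find(|c: char| !c.is_whitespace()).unwrap_or(src.len());
            (Junk::Ws(src[..end].to_string()), &src[end..])
        }
        // A line comment runs to the end of the line (or of the input).
        Some('/') if src.starts_with("//") => {
            let end = src.find('\n').unwrap_or(src.len());
            (Junk::Comment(src[..end].to_string()), &src[end..])
        }
        _ => (Junk::None, src),
    }
}

fn main() {
    let (tok, rest) = scan_junk("  // hi\nfn");
    assert_eq!(tok, Junk::Ws("  ".to_string()));
    let (tok, rest) = scan_junk(rest);
    assert_eq!(tok, Junk::Comment("// hi".to_string()));
    assert_eq!(rest, "\nfn");
}
```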
diff --git a/src/libsyntax/parse/parser.rs b/src/libsyntax/parse/parser.rs
index e0bcb41a7536..51f2c74d3aef 100644
--- a/src/libsyntax/parse/parser.rs
+++ b/src/libsyntax/parse/parser.rs
@@ -325,10 +325,24 @@ fn is_plain_ident_or_underscore(t: &token::Token) -> bool {
is_plain_ident(t) || *t == token::UNDERSCORE
}
+/// Get a token the parser cares about
+fn real_token(rdr: &mut Reader) -> TokenAndSpan {
+ let mut t = rdr.next_token();
+ loop {
+ match t.tok {
+ token::WS | token::COMMENT | token::SHEBANG(_) => {
+ t = rdr.next_token();
+ },
+ _ => break
+ }
+ }
+ t
+}
+
impl<'a> Parser<'a> {
pub fn new(sess: &'a ParseSess, cfg: ast::CrateConfig,
mut rdr: Box<Reader>) -> Parser<'a> {
- let tok0 = rdr.next_token();
+ let tok0 = real_token(rdr);
let span = tok0.sp;
let placeholder = TokenAndSpan {
tok: token::UNDERSCORE,
@@ -864,7 +878,7 @@ impl<'a> Parser<'a> {
None
};
let next = if self.buffer_start == self.buffer_end {
- self.reader.next_token()
+ real_token(self.reader)
} else {
// Avoid token copies with `replace`.
let buffer_start = self.buffer_start as uint;
@@ -908,7 +922,7 @@ impl<'a> Parser<'a> {
-> R {
let dist = distance as int;
while self.buffer_length() < dist {
- self.buffer[self.buffer_end as uint] = self.reader.next_token();
+ self.buffer[self.buffer_end as uint] = real_token(self.reader);
self.buffer_end = (self.buffer_end + 1) & 3;
}
f(&self.buffer[((self.buffer_start + dist - 1) & 3) as uint].tok)
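`real_token` is the single point where the parser learns about the new trivia tokens; everything downstream still sees the old token stream. A minimal analogue over a plain iterator, assuming a toy `Token` enum in place of `token::Token` and the `Reader` trait:

```rust
#[derive(Debug, PartialEq)]
enum Token {
    Ws,
    Comment,
    Shebang(String),
    Ident(String),
    Eof,
}

fn real_token(reader: &mut impl Iterator<Item = Token>) -> Token {
    loop {
        match reader.next().unwrap_or(Token::Eof) {
            // Trivia: keep pulling tokens.
            Token::Ws | Token::Comment | Token::Shebang(_) => continue,
            // Anything else is a token the parser actually consumes.
            t => return t,
        }
    }
}

fn main() {
    // Toy input: a shebang line followed by trivia and one real token.
    let mut toks = vec![
        Token::Shebang("#!".into()),
        Token::Ws,
        Token::Comment,
        Token::Ident("main".into()),
    ]
    .into_iter();
    assert_eq!(real_token(&mut toks), Token::Ident("main".into()));
    assert_eq!(real_token(&mut toks), Token::Eof);
}
```

Keeping the filter in one helper means the token buffer and lookahead code above it stay otherwise unchanged.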
diff --git a/src/libsyntax/parse/token.rs b/src/libsyntax/parse/token.rs
index 83d373d033b3..e65f9f208a31 100644
--- a/src/libsyntax/parse/token.rs
+++ b/src/libsyntax/parse/token.rs
@@ -97,8 +97,18 @@ pub enum Token {
/* For interpolation */
INTERPOLATED(Nonterminal),
-
DOC_COMMENT(Ident),
+
+ // Junk. These carry no data because we don't really care about the data
+ // they *would* carry, and don't really want to allocate a new ident for
+ // them. Instead, users could extract that from the associated span.
+
+ /// Whitespace
+ WS,
+ /// Comment
+ COMMENT,
+ SHEBANG(Ident),
+
EOF,
}
@@ -231,6 +241,10 @@ pub fn to_string(t: &Token) -> String {
/* Other */
DOC_COMMENT(s) => get_ident(s).get().to_string(),
EOF => "".to_string(),
+ WS => " ".to_string(),
+ COMMENT => "/* */".to_string(),
+ SHEBANG(s) => format!("/* shebang: {}*/", s.as_str()),
+
INTERPOLATED(ref nt) => {
match nt {
&NtExpr(ref e) => ::print::pprust::expr_to_string(&**e),
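As the comment on `WS` and `COMMENT` says, these tokens carry no payload; a consumer that wants the text slices the source by the token's span. A sketch of that lookup, with a byte-range `Span` standing in for the real codemap spans:

```rust
// Hypothetical stand-in for codemap spans: a half-open byte range.
struct Span {
    lo: usize,
    hi: usize,
}

// Recover a token's text from the original source, as span_to_snippet does.
fn snippet<'a>(src: &'a str, sp: &Span) -> &'a str {
    &src[sp.lo..sp.hi]
}

fn main() {
    let src = "let x = 1; // answer";
    // Pretend the lexer reported a COMMENT token with this span.
    let comment_span = Span { lo: 11, hi: 20 };
    assert_eq!(snippet(src, &comment_span), "// answer");
}
```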