diff --git a/src/librustdoc/html/highlight.rs b/src/librustdoc/html/highlight.rs index af7a822bc29e..3cb5cdc04396 100644 --- a/src/librustdoc/html/highlight.rs +++ b/src/librustdoc/html/highlight.rs @@ -18,7 +18,6 @@ use std::io; use syntax::parse; use syntax::parse::lexer; -use syntax::codemap::{BytePos, Span}; use html::escape::Escape; @@ -59,38 +58,30 @@ fn doit(sess: &parse::ParseSess, mut lexer: lexer::StringReader, None => {} } try!(write!(out, "class='rust {}'>\n", class.unwrap_or(""))); - let mut last = BytePos(0); let mut is_attribute = false; let mut is_macro = false; let mut is_macro_nonterminal = false; loop { let next = lexer.next_token(); - let test = if next.tok == t::EOF {lexer.pos} else {next.sp.lo}; - // The lexer consumes all whitespace and non-doc-comments when iterating - // between tokens. If this token isn't directly adjacent to our last - // token, then we need to emit the whitespace/comment. - // - // If the gap has any '/' characters then we consider the whole thing a - // comment. This will classify some whitespace as a comment, but that - // doesn't matter too much for syntax highlighting purposes. - if test > last { - let snip = sess.span_diagnostic.cm.span_to_snippet(Span { - lo: last, - hi: test, - expn_info: None, - }).unwrap(); - if snip.as_slice().contains("/") { - try!(write!(out, "{}", - Escape(snip.as_slice()))); - } else { - try!(write!(out, "{}", Escape(snip.as_slice()))); - } - } - last = next.sp.hi; + let snip = |sp| sess.span_diagnostic.cm.span_to_snippet(sp).unwrap(); + if next.tok == t::EOF { break } let klass = match next.tok { + t::WS => { + try!(write!(out, "{}", Escape(snip(next.sp).as_slice()))); + continue + }, + t::COMMENT => { + try!(write!(out, "{}", + Escape(snip(next.sp).as_slice()))); + continue + }, + t::SHEBANG(s) => { + try!(write!(out, "{}", Escape(s.as_str()))); + continue + }, // If this '&' token is directly adjacent to another token, assume // that it's the address-of operator instead of the and-operator. // This allows us to give all pointers their own class (`Box` and diff --git a/src/libsyntax/parse/attr.rs b/src/libsyntax/parse/attr.rs index b2297ec770cc..c227d8a0fedc 100644 --- a/src/libsyntax/parse/attr.rs +++ b/src/libsyntax/parse/attr.rs @@ -34,7 +34,7 @@ impl<'a> ParserAttr for Parser<'a> { fn parse_outer_attributes(&mut self) -> Vec { let mut attrs: Vec = Vec::new(); loop { - debug!("parse_outer_attributes: self.token={:?}", + debug!("parse_outer_attributes: self.token={}", self.token); match self.token { token::POUND => { diff --git a/src/libsyntax/parse/lexer/comments.rs b/src/libsyntax/parse/lexer/comments.rs index c5dd10382a95..3f3a8a723f10 100644 --- a/src/libsyntax/parse/lexer/comments.rs +++ b/src/libsyntax/parse/lexer/comments.rs @@ -13,7 +13,7 @@ use codemap::{BytePos, CharPos, CodeMap, Pos}; use diagnostic; use parse::lexer::{is_whitespace, Reader}; use parse::lexer::{StringReader, TokenAndSpan}; -use parse::lexer::{is_line_non_doc_comment, is_block_non_doc_comment}; +use parse::lexer::is_block_doc_comment; use parse::lexer; use parse::token; @@ -42,9 +42,9 @@ pub struct Comment { } pub fn is_doc_comment(s: &str) -> bool { - (s.starts_with("///") && !is_line_non_doc_comment(s)) || + (s.starts_with("///") && super::is_doc_comment(s)) || s.starts_with("//!") || - (s.starts_with("/**") && !is_block_non_doc_comment(s)) || + (s.starts_with("/**") && is_block_doc_comment(s)) || s.starts_with("/*!") } @@ -260,7 +260,7 @@ fn read_block_comment(rdr: &mut StringReader, rdr.bump(); rdr.bump(); } - if !is_block_non_doc_comment(curr_line.as_slice()) { + if is_block_doc_comment(curr_line.as_slice()) { return } assert!(!curr_line.as_slice().contains_char('\n')); diff --git a/src/libsyntax/parse/lexer/mod.rs b/src/libsyntax/parse/lexer/mod.rs index 61a37f77d348..947f3d59b86f 100644 --- a/src/libsyntax/parse/lexer/mod.rs +++ b/src/libsyntax/parse/lexer/mod.rs @@ -187,7 +187,7 @@ impl<'a> StringReader<'a> { /// Advance peek_tok and peek_span to refer to the next token, and /// possibly update the interner. fn advance_token(&mut self) { - match self.consume_whitespace_and_comments() { + match self.scan_whitespace_or_comment() { Some(comment) => { self.peek_span = comment.sp; self.peek_tok = comment.tok; @@ -339,8 +339,7 @@ impl<'a> StringReader<'a> { /// PRECONDITION: self.curr is not whitespace /// Eats any kind of comment. - /// Returns a Some(sugared-doc-attr) if one exists, None otherwise - fn consume_any_line_comment(&mut self) -> Option { + fn scan_comment(&mut self) -> Option { match self.curr { Some(c) => { if c.is_whitespace() { @@ -375,28 +374,32 @@ impl<'a> StringReader<'a> { } self.bump(); } - let ret = self.with_str_from(start_bpos, |string| { + return self.with_str_from(start_bpos, |string| { // but comments with only more "/"s are not - if !is_line_non_doc_comment(string) { - Some(TokenAndSpan{ - tok: token::DOC_COMMENT(str_to_ident(string)), - sp: codemap::mk_sp(start_bpos, self.last_pos) - }) + let tok = if is_doc_comment(string) { + token::DOC_COMMENT(str_to_ident(string)) } else { - None - } - }); + token::COMMENT + }; - if ret.is_some() { - return ret; - } + return Some(TokenAndSpan{ + tok: tok, + sp: codemap::mk_sp(start_bpos, self.last_pos) + }); + }); } else { + let start_bpos = self.last_pos - BytePos(2); while !self.curr_is('\n') && !self.is_eof() { self.bump(); } + return Some(TokenAndSpan { + tok: token::COMMENT, + sp: codemap::mk_sp(start_bpos, self.last_pos) + }); } - // Restart whitespace munch. - self.consume_whitespace_and_comments() } - Some('*') => { self.bump(); self.bump(); self.consume_block_comment() } + Some('*') => { + self.bump(); self.bump(); + self.scan_block_comment() + } _ => None } } else if self.curr_is('#') { @@ -412,9 +415,15 @@ impl<'a> StringReader<'a> { let cmap = CodeMap::new(); cmap.files.borrow_mut().push(self.filemap.clone()); let loc = cmap.lookup_char_pos_adj(self.last_pos); + debug!("Skipping a shebang"); if loc.line == 1u && loc.col == CharPos(0u) { + // FIXME: Add shebang "token", return it + let start = self.last_pos; while !self.curr_is('\n') && !self.is_eof() { self.bump(); } - return self.consume_whitespace_and_comments(); + return Some(TokenAndSpan { + tok: token::SHEBANG(self.ident_from(start)), + sp: codemap::mk_sp(start, self.last_pos) + }); } } None @@ -423,15 +432,33 @@ impl<'a> StringReader<'a> { } } - /// EFFECT: eats whitespace and comments. - /// Returns a Some(sugared-doc-attr) if one exists, None otherwise. - fn consume_whitespace_and_comments(&mut self) -> Option { - while is_whitespace(self.curr) { self.bump(); } - return self.consume_any_line_comment(); + /// If there is whitespace, shebang, or a comment, scan it. Otherwise, + /// return None. + fn scan_whitespace_or_comment(&mut self) -> Option { + match self.curr.unwrap_or('\0') { + // # to handle shebang at start of file -- this is the entry point + // for skipping over all "junk" + '/' | '#' => { + let c = self.scan_comment(); + debug!("scanning a comment {}", c); + c + }, + c if is_whitespace(Some(c)) => { + let start_bpos = self.last_pos; + while is_whitespace(self.curr) { self.bump(); } + let c = Some(TokenAndSpan { + tok: token::WS, + sp: codemap::mk_sp(start_bpos, self.last_pos) + }); + debug!("scanning whitespace: {}", c); + c + }, + _ => None + } } /// Might return a sugared-doc-attr - fn consume_block_comment(&mut self) -> Option { + fn scan_block_comment(&mut self) -> Option { // block comments starting with "/**" or "/*!" are doc-comments let is_doc_comment = self.curr_is('*') || self.curr_is('!'); let start_bpos = self.last_pos - BytePos(2); @@ -466,28 +493,23 @@ impl<'a> StringReader<'a> { self.bump(); } - let res = if is_doc_comment { - self.with_str_from(start_bpos, |string| { - // but comments with only "*"s between two "/"s are not - if !is_block_non_doc_comment(string) { - let string = if has_cr { - self.translate_crlf(start_bpos, string, - "bare CR not allowed in block doc-comment") - } else { string.into_maybe_owned() }; - Some(TokenAndSpan{ - tok: token::DOC_COMMENT(str_to_ident(string.as_slice())), - sp: codemap::mk_sp(start_bpos, self.last_pos) - }) - } else { - None - } - }) - } else { - None - }; + self.with_str_from(start_bpos, |string| { + // but comments with only "*"s between two "/"s are not + let tok = if is_block_doc_comment(string) { + let string = if has_cr { + self.translate_crlf(start_bpos, string, + "bare CR not allowed in block doc-comment") + } else { string.into_maybe_owned() }; + token::DOC_COMMENT(str_to_ident(string.as_slice())) + } else { + token::COMMENT + }; - // restart whitespace munch. - if res.is_some() { res } else { self.consume_whitespace_and_comments() } + Some(TokenAndSpan{ + tok: tok, + sp: codemap::mk_sp(start_bpos, self.last_pos) + }) + }) } /// Scan through any digits (base `radix`) or underscores, and return how @@ -1242,12 +1264,18 @@ fn in_range(c: Option, lo: char, hi: char) -> bool { fn is_dec_digit(c: Option) -> bool { return in_range(c, '0', '9'); } -pub fn is_line_non_doc_comment(s: &str) -> bool { - s.starts_with("////") +pub fn is_doc_comment(s: &str) -> bool { + let res = (s.starts_with("///") && *s.as_bytes().get(3).unwrap_or(&b' ') != b'/') + || s.starts_with("//!"); + debug!("is `{}` a doc comment? {}", s, res); + res } -pub fn is_block_non_doc_comment(s: &str) -> bool { - s.starts_with("/***") +pub fn is_block_doc_comment(s: &str) -> bool { + let res = (s.starts_with("/**") && *s.as_bytes().get(3).unwrap_or(&b' ') != b'*') + || s.starts_with("/*!"); + debug!("is `{}` a doc comment? {}", s, res); + res } fn ident_start(c: Option) -> bool { @@ -1383,9 +1411,9 @@ mod test { } #[test] fn line_doc_comments() { - assert!(!is_line_non_doc_comment("///")); - assert!(!is_line_non_doc_comment("/// blah")); - assert!(is_line_non_doc_comment("////")); + assert!(is_doc_comment("///")); + assert!(is_doc_comment("/// blah")); + assert!(!is_doc_comment("////")); } #[test] fn nested_block_comments() { diff --git a/src/libsyntax/parse/parser.rs b/src/libsyntax/parse/parser.rs index e0bcb41a7536..51f2c74d3aef 100644 --- a/src/libsyntax/parse/parser.rs +++ b/src/libsyntax/parse/parser.rs @@ -325,10 +325,24 @@ fn is_plain_ident_or_underscore(t: &token::Token) -> bool { is_plain_ident(t) || *t == token::UNDERSCORE } +/// Get a token the parser cares about +fn real_token(rdr: &mut Reader) -> TokenAndSpan { + let mut t = rdr.next_token(); + loop { + match t.tok { + token::WS | token::COMMENT | token::SHEBANG(_) => { + t = rdr.next_token(); + }, + _ => break + } + } + t +} + impl<'a> Parser<'a> { pub fn new(sess: &'a ParseSess, cfg: ast::CrateConfig, mut rdr: Box) -> Parser<'a> { - let tok0 = rdr.next_token(); + let tok0 = real_token(rdr); let span = tok0.sp; let placeholder = TokenAndSpan { tok: token::UNDERSCORE, @@ -864,7 +878,7 @@ impl<'a> Parser<'a> { None }; let next = if self.buffer_start == self.buffer_end { - self.reader.next_token() + real_token(self.reader) } else { // Avoid token copies with `replace`. let buffer_start = self.buffer_start as uint; @@ -908,7 +922,7 @@ impl<'a> Parser<'a> { -> R { let dist = distance as int; while self.buffer_length() < dist { - self.buffer[self.buffer_end as uint] = self.reader.next_token(); + self.buffer[self.buffer_end as uint] = real_token(self.reader); self.buffer_end = (self.buffer_end + 1) & 3; } f(&self.buffer[((self.buffer_start + dist - 1) & 3) as uint].tok) diff --git a/src/libsyntax/parse/token.rs b/src/libsyntax/parse/token.rs index 83d373d033b3..e65f9f208a31 100644 --- a/src/libsyntax/parse/token.rs +++ b/src/libsyntax/parse/token.rs @@ -97,8 +97,18 @@ pub enum Token { /* For interpolation */ INTERPOLATED(Nonterminal), - DOC_COMMENT(Ident), + + // Junk. These carry no data because we don't really care about the data + // they *would* carry, and don't really want to allocate a new ident for + // them. Instead, users could extract that from the associated span. + + /// Whitespace + WS, + /// Comment + COMMENT, + SHEBANG(Ident), + EOF, } @@ -231,6 +241,10 @@ pub fn to_string(t: &Token) -> String { /* Other */ DOC_COMMENT(s) => get_ident(s).get().to_string(), EOF => "".to_string(), + WS => " ".to_string(), + COMMENT => "/* */".to_string(), + SHEBANG(s) => format!("/* shebang: {}*/", s.as_str()), + INTERPOLATED(ref nt) => { match nt { &NtExpr(ref e) => ::print::pprust::expr_to_string(&**e),