From ac0bb4126c835ea89313cbb4f534c37bc14a2694 Mon Sep 17 00:00:00 2001 From: mcarton Date: Thu, 26 May 2016 22:53:38 +0200 Subject: [PATCH] Improve markdown parsing for the doc lint --- clippy_lints/src/doc.rs | 198 +++++++++++++++++++++++--------------- tests/compile-fail/doc.rs | 7 ++ 2 files changed, 128 insertions(+), 77 deletions(-) diff --git a/clippy_lints/src/doc.rs b/clippy_lints/src/doc.rs index cf32c1731fa3..fd4640c21584 100644 --- a/clippy_lints/src/doc.rs +++ b/clippy_lints/src/doc.rs @@ -51,6 +51,8 @@ impl EarlyLintPass for Doc { } pub fn check_attrs<'a>(cx: &EarlyContext, valid_idents: &[String], attrs: &'a [ast::Attribute]) { + let mut docs = vec![]; + let mut in_multiline = false; for attr in attrs { if attr.node.is_sugared_doc { @@ -66,37 +68,20 @@ pub fn check_attrs<'a>(cx: &EarlyContext, valid_idents: &[String], attrs: &'a [a in_multiline = !in_multiline; } if !in_multiline { - check_doc(cx, valid_idents, real_doc, span); + docs.push((real_doc, span)); } } } } } -} -macro_rules! jump_to { - // Get the next character’s first byte UTF-8 friendlyly. - (@next_char, $chars: expr, $len: expr) => {{ - if let Some(&(pos, _)) = $chars.peek() { - pos - } else { - $len - } - }}; - - // Jump to the next `$c`. If no such character is found, give up. - ($chars: expr, $c: expr, $len: expr) => {{ - if $chars.find(|&(_, c)| c == $c).is_some() { - jump_to!(@next_char, $chars, $len) - } - else { - return; - } - }}; + for (doc, span) in docs { + let _ = check_doc(cx, valid_idents, doc, span); + } } #[allow(while_let_loop)] // #362 -pub fn check_doc(cx: &EarlyContext, valid_idents: &[String], doc: &str, span: Span) { +pub fn check_doc(cx: &EarlyContext, valid_idents: &[String], doc: &str, span: Span) -> Result<(), ()> { // In markdown, `_` can be used to emphasize something, or, is a raw `_` depending on context. // There really is no markdown specification that would disambiguate this properly. This is // what GitHub and Rustdoc do: @@ -108,8 +93,8 @@ pub fn check_doc(cx: &EarlyContext, valid_idents: &[String], doc: &str, span: Sp // (_baz_) → (baz) // foo _ bar _ baz → foo _ bar _ baz - /// Character that can appear in a word - fn is_word_char(c: char) -> bool { + /// Character that can appear in a path + fn is_path_char(c: char) -> bool { match c { t if t.is_alphanumeric() => true, ':' | '_' => true, @@ -117,81 +102,140 @@ pub fn check_doc(cx: &EarlyContext, valid_idents: &[String], doc: &str, span: Sp } } - #[allow(cast_possible_truncation)] - fn word_span(mut span: Span, begin: usize, end: usize) -> Span { - debug_assert_eq!(end as u32 as usize, end); - debug_assert_eq!(begin as u32 as usize, begin); - span.hi = span.lo + BytePos(end as u32); - span.lo = span.lo + BytePos(begin as u32); - span + #[derive(Clone, Debug)] + struct Parser<'a> { + link: bool, + line: &'a str, + span: Span, + current_word_begin: usize, + new_line: bool, + pos: usize, } - let mut new_line = true; - let len = doc.len(); - let mut chars = doc.char_indices().peekable(); - let mut current_word_begin = 0; + impl<'a> Parser<'a> { + fn advance_begin(&mut self) { + self.current_word_begin = self.pos; + } + + fn peek(&self) -> Option { + self.line[self.pos..].chars().next() + } + + fn jump_to(&mut self, n: char) -> Result<(), ()> { + while let Some(c) = self.next() { + if c == n { + self.advance_begin(); + return Ok(()); + } + } + + return Err(()); + } + + fn put_back(&mut self, c: char) { + self.pos -= c.len_utf8(); + } + + #[allow(cast_possible_truncation)] + fn word(&self) -> (&'a str, Span) { + let begin = self.current_word_begin; + let end = self.pos; + + debug_assert_eq!(end as u32 as usize, end); + debug_assert_eq!(begin as u32 as usize, begin); + + let mut span = self.span; + span.hi = span.lo + BytePos(end as u32); + span.lo = span.lo + BytePos(begin as u32); + + (&self.line[begin..end], span) + } + } + + impl<'a> Iterator for Parser<'a> { + type Item = char; + + fn next(&mut self) -> Option { + let mut chars = self.line[self.pos..].chars(); + let c = chars.next(); + + if let Some(c) = c { + self.pos += c.len_utf8(); + } else { + // TODO: new line + } + + c + } + } + + let mut parser = Parser { + link: false, + line: doc, + span: span, + current_word_begin: 0, + new_line: true, + pos: 0, + }; + loop { - match chars.next() { - Some((_, c)) => { + match parser.next() { + Some(c) => { match c { '#' if new_line => { // don’t warn on titles - current_word_begin = jump_to!(chars, '\n', len); + try!(parser.jump_to('\n')); } '`' => { - current_word_begin = jump_to!(chars, '`', len); + try!(parser.jump_to('`')); } '[' => { - let end = jump_to!(chars, ']', len); - let link_text = &doc[current_word_begin + 1..end]; - let word_span = word_span(span, current_word_begin + 1, end + 1); - - match chars.peek() { - Some(&(_, c)) => { - // Trying to parse a link. Let’s ignore the link. - - // FIXME: how does markdown handles such link? - // https://en.wikipedia.org/w/index.php?title=) - match c { - '(' => { // inline link - current_word_begin = jump_to!(chars, ')', len); - check_doc(cx, valid_idents, link_text, word_span); - } - '[' => { // reference link - current_word_begin = jump_to!(chars, ']', len); - check_doc(cx, valid_idents, link_text, word_span); - } - ':' => { // reference link - current_word_begin = jump_to!(chars, '\n', len); - } - _ => { // automatic reference link - current_word_begin = jump_to!(@next_char, chars, len); - check_doc(cx, valid_idents, link_text, word_span); - } + // Check for a reference definition `[foo]:` at the beginning of a line + let mut link = true; + if parser.new_line { + let mut lookup_parser = parser.clone(); + if let Some(_) = lookup_parser.find(|&c| c == ']') { + if let Some(':') = lookup_parser.next() { + try!(lookup_parser.jump_to(')')); + parser = lookup_parser; + link = false; } } - None => return, + } + + parser.advance_begin(); + parser.link = link; + } + ']' if parser.link => { + parser.link = false; + + match parser.peek() { + Some('(') => try!(parser.jump_to(')')), + Some('[') => try!(parser.jump_to(']')), + Some(_) => continue, + None => return Err(()), } } - // anything that’s neither alphanumeric nor '_' is not part of an ident anyway - c if !c.is_alphanumeric() && c != '_' => { - current_word_begin = jump_to!(@next_char, chars, len); + c if !is_path_char(c) => { + parser.advance_begin(); } _ => { - let end = match chars.find(|&(_, c)| !is_word_char(c)) { - Some((end, _)) => end, - None => len, - }; - let word_span = word_span(span, current_word_begin, end); - check_word(cx, valid_idents, &doc[current_word_begin..end], word_span); - current_word_begin = jump_to!(@next_char, chars, len); + if let Some(c) = parser.find(|&c| !is_path_char(c)) { + parser.put_back(c); + } + + let (word, span) = parser.word(); + check_word(cx, valid_idents, word, span); + parser.advance_begin(); } } - new_line = c == '\n' || (new_line && c.is_whitespace()); + parser.new_line = c == '\n' || (parser.new_line && c.is_whitespace()); } None => break, } } + + Ok(()) } fn check_word(cx: &EarlyContext, valid_idents: &[String], word: &str, span: Span) { diff --git a/tests/compile-fail/doc.rs b/tests/compile-fail/doc.rs index eca9d79354c9..f5c1e4768e97 100755 --- a/tests/compile-fail/doc.rs +++ b/tests/compile-fail/doc.rs @@ -12,6 +12,8 @@ /// Markdown is _weird_. I mean _really weird_. This \_ is ok. So is `_`. But not Foo::some_fun //~^ ERROR: you should put `Foo::some_fun` between ticks /// which should be reported only once despite being __doubly bad__. +/// Here be ::is::a::global:path. +//~^ ERROR: you should put `is::a::global:path` between ticks /// be_sure_we_got_to_the_end_of_it //~^ ERROR: you should put `be_sure_we_got_to_the_end_of_it` between ticks fn foo_bar() { @@ -141,3 +143,8 @@ fn issue900() { //~^ ERROR: you should put `be_sure_we_got_to_the_end_of_it` between ticks fn issue883() { } + +/// `foo_bar +/// baz_quz` +fn multiline() { +}