From ac0bb4126c835ea89313cbb4f534c37bc14a2694 Mon Sep 17 00:00:00 2001
From: mcarton <cartonmartin+git@gmail.com>
Date: Thu, 26 May 2016 22:53:38 +0200
Subject: [PATCH] Improve markdown parsing for the doc lint

---
 clippy_lints/src/doc.rs   | 198 +++++++++++++++++++++++---------------
 tests/compile-fail/doc.rs |   7 ++
 2 files changed, 128 insertions(+), 77 deletions(-)
diff --git a/clippy_lints/src/doc.rs b/clippy_lints/src/doc.rs
index cf32c1731fa3..fd4640c21584 100644
--- a/clippy_lints/src/doc.rs
+++ b/clippy_lints/src/doc.rs
@@ -51,6 +51,8 @@ impl EarlyLintPass for Doc {
 }
 
 pub fn check_attrs<'a>(cx: &EarlyContext, valid_idents: &[String], attrs: &'a [ast::Attribute]) {
+    let mut docs = vec![];
+
     let mut in_multiline = false;
     for attr in attrs {
         if attr.node.is_sugared_doc {
@@ -66,37 +68,20 @@ pub fn check_attrs<'a>(cx: &EarlyContext, valid_idents: &[String], attrs: &'a [a
                         in_multiline = !in_multiline;
                     }
                     if !in_multiline {
-                        check_doc(cx, valid_idents, real_doc, span);
+                        docs.push((real_doc, span));
                     }
                 }
             }
         }
     }
-}
 
-macro_rules! jump_to {
-    // Get the next character’s first byte UTF-8 friendlyly.
-    (@next_char, $chars: expr, $len: expr) => {{
-        if let Some(&(pos, _)) = $chars.peek() {
-            pos
-        } else {
-            $len
-        }
-    }};
-
-    // Jump to the next `$c`. If no such character is found, give up.
-    ($chars: expr, $c: expr, $len: expr) => {{
-        if $chars.find(|&(_, c)| c == $c).is_some() {
-            jump_to!(@next_char, $chars, $len)
-        }
-        else {
-            return;
-        }
-    }};
+    for (doc, span) in docs {
+        let _ = check_doc(cx, valid_idents, doc, span);
+    }
 }
 
 #[allow(while_let_loop)] // #362
-pub fn check_doc(cx: &EarlyContext, valid_idents: &[String], doc: &str, span: Span) {
+pub fn check_doc(cx: &EarlyContext, valid_idents: &[String], doc: &str, span: Span) -> Result<(), ()> {
     // In markdown, `_` can be used to emphasize something, or, is a raw `_` depending on context.
     // There really is no markdown specification that would disambiguate this properly. This is
     // what GitHub and Rustdoc do:
@@ -108,8 +93,8 @@ pub fn check_doc(cx: &EarlyContext, valid_idents: &[String], doc: &str, span: Sp
     // (_baz_)             → (<em>baz</em>)
     // foo _ bar _ baz     → foo _ bar _ baz
 
-    /// Character that can appear in a word
-    fn is_word_char(c: char) -> bool {
+    /// Character that can appear in a path
+    fn is_path_char(c: char) -> bool {
         match c {
             t if t.is_alphanumeric() => true,
             ':' | '_' => true,
@@ -117,81 +102,140 @@ pub fn check_doc(cx: &EarlyContext, valid_idents: &[String], doc: &str, span: Sp
         }
     }
 
-    #[allow(cast_possible_truncation)]
-    fn word_span(mut span: Span, begin: usize, end: usize) -> Span {
-        debug_assert_eq!(end as u32 as usize, end);
-        debug_assert_eq!(begin as u32 as usize, begin);
-        span.hi = span.lo + BytePos(end as u32);
-        span.lo = span.lo + BytePos(begin as u32);
-        span
+    #[derive(Clone, Debug)]
+    struct Parser<'a> {
+        link: bool,
+        line: &'a str,
+        span: Span,
+        current_word_begin: usize,
+        new_line: bool,
+        pos: usize,
     }
 
-    let mut new_line = true;
-    let len = doc.len();
-    let mut chars = doc.char_indices().peekable();
-    let mut current_word_begin = 0;
+    impl<'a> Parser<'a> {
+        fn advance_begin(&mut self) {
+            self.current_word_begin = self.pos;
+        }
+
+        fn peek(&self) -> Option<char> {
+            self.line[self.pos..].chars().next()
+        }
+
+        fn jump_to(&mut self, n: char) -> Result<(), ()> {
+            while let Some(c) = self.next() {
+                if c == n {
+                    self.advance_begin();
+                    return Ok(());
+                }
+            }
+
+            return Err(());
+        }
+
+        fn put_back(&mut self, c: char) {
+            self.pos -= c.len_utf8();
+        }
+
+        #[allow(cast_possible_truncation)]
+        fn word(&self) -> (&'a str, Span) {
+            let begin = self.current_word_begin;
+            let end = self.pos;
+
+            debug_assert_eq!(end as u32 as usize, end);
+            debug_assert_eq!(begin as u32 as usize, begin);
+
+            let mut span = self.span;
+            span.hi = span.lo + BytePos(end as u32);
+            span.lo = span.lo + BytePos(begin as u32);
+
+            (&self.line[begin..end], span)
+        }
+    }
+
+    impl<'a> Iterator for Parser<'a> {
+        type Item = char;
+
+        fn next(&mut self) -> Option<char> {
+            let mut chars = self.line[self.pos..].chars();
+            let c = chars.next();
+
+            if let Some(c) = c {
+                self.pos += c.len_utf8();
+            } else {
+                // TODO: new line
+            }
+
+            c
+        }
+    }
+
+    let mut parser = Parser {
+        link: false,
+        line: doc,
+        span: span,
+        current_word_begin: 0,
+        new_line: true,
+        pos: 0,
+    };
+
     loop {
-        match chars.next() {
-            Some((_, c)) => {
+        match parser.next() {
+            Some(c) => {
                 match c {
                     '#' if new_line => { // don’t warn on titles
-                        current_word_begin = jump_to!(chars, '\n', len);
+                        try!(parser.jump_to('\n'));
                     }
                     '`' => {
-                        current_word_begin = jump_to!(chars, '`', len);
+                        try!(parser.jump_to('`'));
                     }
                     '[' => {
-                        let end = jump_to!(chars, ']', len);
-                        let link_text = &doc[current_word_begin + 1..end];
-                        let word_span = word_span(span, current_word_begin + 1, end + 1);
-
-                        match chars.peek() {
-                            Some(&(_, c)) => {
-                                // Trying to parse a link. Let’s ignore the link.
-
-                                // FIXME: how does markdown handles such link?
-                                // https://en.wikipedia.org/w/index.php?title=)
-                                match c {
-                                    '(' => { // inline link
-                                        current_word_begin = jump_to!(chars, ')', len);
-                                        check_doc(cx, valid_idents, link_text, word_span);
-                                    }
-                                    '[' => { // reference link
-                                        current_word_begin = jump_to!(chars, ']', len);
-                                        check_doc(cx, valid_idents, link_text, word_span);
-                                    }
-                                    ':' => { // reference link
-                                        current_word_begin = jump_to!(chars, '\n', len);
-                                    }
-                                    _ => { // automatic reference link
-                                        current_word_begin = jump_to!(@next_char, chars, len);
-                                        check_doc(cx, valid_idents, link_text, word_span);
-                                    }
+                        // Check for a reference definition `[foo]:` at the beginning of a line
+                        let mut link = true;
+                        if parser.new_line {
+                            let mut lookup_parser = parser.clone();
+                            if let Some(_) = lookup_parser.find(|&c| c == ']') {
+                                if let Some(':') = lookup_parser.next() {
+                                    try!(lookup_parser.jump_to(')'));
+                                    parser = lookup_parser;
+                                    link = false;
                                 }
                             }
-                            None => return,
+                        }
+
+                        parser.advance_begin();
+                        parser.link = link;
+                    }
+                    ']' if parser.link => {
+                        parser.link = false;
+
+                        match parser.peek() {
+                            Some('(') => try!(parser.jump_to(')')),
+                            Some('[') => try!(parser.jump_to(']')),
+                            Some(_) => continue,
+                            None => return Err(()),
                         }
                     }
-                    // anything that’s neither alphanumeric nor '_' is not part of an ident anyway
-                    c if !c.is_alphanumeric() && c != '_' => {
-                        current_word_begin = jump_to!(@next_char, chars, len);
+                    c if !is_path_char(c) => {
+                        parser.advance_begin();
                     }
                     _ => {
-                        let end = match chars.find(|&(_, c)| !is_word_char(c)) {
-                            Some((end, _)) => end,
-                            None => len,
-                        };
-                        let word_span = word_span(span, current_word_begin, end);
-                        check_word(cx, valid_idents, &doc[current_word_begin..end], word_span);
-                        current_word_begin = jump_to!(@next_char, chars, len);
+                        if let Some(c) = parser.find(|&c| !is_path_char(c)) {
+                            parser.put_back(c);
+                        }
+
+                        let (word, span) = parser.word();
+                        check_word(cx, valid_idents, word, span);
+                        parser.advance_begin();
                     }
                 }
 
-                new_line = c == '\n' || (new_line && c.is_whitespace());
+                parser.new_line = c == '\n' || (parser.new_line && c.is_whitespace());
             }
             None => break,
         }
     }
+
+    Ok(())
 }
 
 fn check_word(cx: &EarlyContext, valid_idents: &[String], word: &str, span: Span) {
diff --git a/tests/compile-fail/doc.rs b/tests/compile-fail/doc.rs
index eca9d79354c9..f5c1e4768e97 100755
--- a/tests/compile-fail/doc.rs
+++ b/tests/compile-fail/doc.rs
@@ -12,6 +12,8 @@
 /// Markdown is _weird_. I mean _really weird_.  This \_ is ok. So is `_`. But not Foo::some_fun
 //~^ ERROR: you should put `Foo::some_fun` between ticks
 /// which should be reported only once despite being __doubly bad__.
+/// Here be ::is::a::global:path.
+//~^ ERROR: you should put `is::a::global:path` between ticks
 /// be_sure_we_got_to_the_end_of_it
 //~^ ERROR: you should put `be_sure_we_got_to_the_end_of_it` between ticks
 fn foo_bar() {
@@ -141,3 +143,8 @@ fn issue900() {
 //~^ ERROR: you should put `be_sure_we_got_to_the_end_of_it` between ticks
 fn issue883() {
 }
+
+/// `foo_bar
+/// baz_quz`
+fn multiline() {
+}