auto merge of #11490 : wting/rust/wting_11362_update_extract_tests, r=alexcrichton

Refactored the file quite a bit, I can add unit tests if desired. There's a few changes from the previous version's behavior: - destination directory will be created if it doesn't exist - strings and file is written as unicode I have a few questions, but will ask them in #11362.
2014-01-28 14:11:33 -08:00 · 2014-01-28 14:11:33 -08:00 · edfb546e4b
commit edfb546e4b
parent c6bd05303c 9f60e7c306
18 changed files with 333 additions and 208 deletions
--- a/src/etc/extract-tests.py
+++ b/src/etc/extract-tests.py
@ -1,77 +1,209 @@
 # xfail-license
+# -*- coding: utf-8 -*-
+"""
+Script for extracting compilable fragments from markdown documentation. See
+prep.js for a description of the format recognized by this tool. Expects
+a directory fragments/ to exist under the current directory, and writes the
+fragments in there as individual .rs files.
+"""
+from __future__ import print_function
+from codecs import open
+from collections import deque
+from itertools import imap
+import os
+import re
+import sys

-# Script for extracting compilable fragments from markdown
-# documentation. See prep.js for a description of the format
-# recognized by this tool. Expects a directory fragments/ to exist
-# under the current directory, and writes the fragments in there as
-# individual .rs files.
+# regexes
+CHAPTER_NAME_REGEX = re.compile(r'# (.*)')
+CODE_BLOCK_DELIM_REGEX = re.compile(r'~~~')
+COMMENT_REGEX = re.compile(r'^# ')
+COMPILER_DIRECTIVE_REGEX = re.compile(r'\#\[(.*)\];')
+ELLIPSES_REGEX = re.compile(r'\.\.\.')
+EXTERN_MOD_REGEX = re.compile(r'\bextern mod extra\b')
+MAIN_FUNCTION_REGEX = re.compile(r'\bfn main\b')
+TAGS_REGEX = re.compile(r'\.([\w-]*)')

-import sys, re
+# tags to ignore
+IGNORE_TAGS = \
+        frozenset(["abnf", "ebnf", "field", "keyword", "notrust", "precedence"])

-if len(sys.argv) < 3:
-    print("Please provide an input filename")
-    sys.exit(1)
+# header for code snippet files
+OUTPUT_BLOCK_HEADER = '\n'.join((
+    "#[ deny(warnings) ];",
+    "#[ allow(unused_variable) ];",
+    "#[ allow(dead_assignment) ];",
+    "#[ allow(unused_mut) ];",
+    "#[ allow(attribute_usage) ];",
+    "#[ allow(dead_code) ];",
+    "#[ feature(macro_rules, globs, struct_variant, managed_boxes) ];\n",))

-filename = sys.argv[1]
-dest = sys.argv[2]
-f = open(filename)
-lines = f.readlines()
-f.close()

-cur = 0
-line = ""
-chapter = ""
-chapter_n = 0
+def add_extern_mod(block):
+    if not has_extern_mod(block):
+        # add `extern mod extra;` after compiler directives
+        directives = []
+        while len(block) and is_compiler_directive(block[0]):
+            directives.append(block.popleft())

-while cur < len(lines):
-    line = lines[cur]
-    cur += 1
-    chap = re.match("# (.*)", line)
-    if chap:
-        chapter = re.sub(r"\W", "_", chap.group(1)).lower()
-        chapter_n = 1
-    elif re.match("~~~", line):
-        # Parse the tags that open a code block in the pandoc format:
-        # ~~~ {.tag1 .tag2}
-        tags = re.findall("\.([\w-]*)", line)
-        block = ""
-        ignore = "notrust" in tags or "ignore" in tags
-        # Some tags used by the language ref that indicate not rust
-        ignore |= "ebnf" in tags
-        ignore |= "abnf" in tags
-        ignore |= "keyword" in tags
-        ignore |= "field" in tags
-        ignore |= "precedence" in tags
-        xfail = "xfail-test" in tags
-        while cur < len(lines):
-            line = lines[cur]
-            cur += 1
-            if re.match("~~~", line):
-                break
-            else:
-                # Lines beginning with '# ' are turned into valid code
-                line = re.sub("^# ", "", line)
-                # Allow ellipses in code snippets
-                line = re.sub("\.\.\.", "", line)
-                block += line
-        if not ignore:
-            if not re.search(r"\bfn main\b", block):
-                block = "fn main() {\n" + block + "\n}\n"
-            if not re.search(r"\bextern mod extra\b", block):
-                block = "extern mod extra;\n" + block
-            block = """#[ deny(warnings) ];
-#[ allow(unused_variable) ];\n
-#[ allow(dead_assignment) ];\n
-#[ allow(unused_mut) ];\n
-#[ allow(attribute_usage) ];\n
-#[ allow(dead_code) ];\n
-#[ feature(macro_rules, globs, struct_variant, managed_boxes) ];\n
-""" + block
-            if xfail:
-                block = "// xfail-test\n" + block
-            filename = (dest + "/" + str(chapter)
-                        + "_" + str(chapter_n) + ".rs")
-            chapter_n += 1
-            f = open(filename, 'w')
-            f.write(block)
-            f.close()
+        block.appendleft("\nextern mod extra;\n\n")
+        block.extendleft(reversed(directives))
+
+    return block
+
+
+def add_main_function(block):
+    if not has_main_function(block):
+        prepend_spaces = lambda x: '    ' + x
+        block = deque(imap(prepend_spaces, block))
+        block.appendleft("\nfn main() {\n")
+        block.append("\n}\n")
+    return block
+
+
+def extract_code_fragments(dest_dir, lines):
+    """
+    Extracts all the code fragments from a file that do not have ignored tags
+    writing them to the following file:
+
+        [dest dir]/[chapter name]_[chapter_index].rs
+    """
+    chapter_name = None
+    chapter_index = 0
+
+    for line in lines:
+        if is_chapter_title(line):
+            chapter_name = get_chapter_name(line)
+            chapter_index = 1
+            continue
+
+        if not is_code_block_delim(line):
+            continue
+
+        assert chapter_name, "Chapter name missing for code block."
+        tags = get_tags(line)
+        block = get_code_block(lines)
+
+        if tags & IGNORE_TAGS:
+            continue
+
+        block = add_extern_mod(add_main_function(block))
+        block.appendleft(OUTPUT_BLOCK_HEADER)
+
+        if "ignore" in tags:
+            block.appendleft("//xfail-test\n")
+        elif "should_fail" in tags:
+            block.appendleft("//should-fail\n")
+
+        output_filename = os.path.join(
+                dest_dir,
+                chapter_name + '_' + str(chapter_index) + '.rs')
+
+        write_file(output_filename, block)
+        chapter_index += 1
+
+
+def has_extern_mod(block):
+    """Checks if a code block has the line `extern mod extra`."""
+    find_extern_mod = lambda x: re.search(EXTERN_MOD_REGEX, x)
+    return any(imap(find_extern_mod, block))
+
+
+def has_main_function(block):
+    """Checks if a code block has a main function."""
+    find_main_fn = lambda x: re.search(MAIN_FUNCTION_REGEX, x)
+    return any(imap(find_main_fn, block))
+
+
+def is_chapter_title(line):
+    return re.match(CHAPTER_NAME_REGEX, line)
+
+
+def is_code_block_delim(line):
+    return re.match(CODE_BLOCK_DELIM_REGEX, line)
+
+
+def is_compiler_directive(line):
+    return re.match(COMPILER_DIRECTIVE_REGEX, line)
+
+
+def get_chapter_name(line):
+    """Get the chapter name from a `# Containers` line."""
+    return re.sub(
+            r'\W',
+            '_',
+            re.match(CHAPTER_NAME_REGEX, line).group(1)).lower()
+
+
+def get_code_block(lines):
+    """
+    Get a code block surrounded by ~~~, for example:
+
+        1: ~~~ { .tag }
+        2: let u: ~[u32] = ~[0, 1, 2];
+        3: let v: &[u32] = &[0, 1, 2, 3];
+        4: let w: [u32, .. 5] = [0, 1, 2, 3, 4];
+        5:
+        6: println!("u: {}, v: {}, w: {}", u.len(), v.len(), w.len());
+        7: ~~~
+
+    Returns lines 2-6. Assumes line 1 has been consumed by the caller.
+    """
+    strip_comments = lambda x: re.sub(COMMENT_REGEX, '', x)
+    strip_ellipses = lambda x: re.sub(ELLIPSES_REGEX, '', x)
+
+    result = deque()
+
+    for line in lines:
+        if is_code_block_delim(line):
+            break
+        result.append(strip_comments(strip_ellipses(line)))
+    return result
+
+
+def get_lines(filename):
+    with open(filename) as f:
+        for line in f:
+            yield line
+
+
+def get_tags(line):
+    """
+    Retrieves all tags from the line format:
+        ~~~ { .tag1 .tag2 .tag3 }
+    """
+    return set(re.findall(TAGS_REGEX, line))
+
+
+def write_file(filename, lines):
+    with open(filename, 'w', encoding='utf-8') as f:
+        for line in lines:
+            f.write(unicode(line, encoding='utf-8', errors='replace'))
+
+
+def main(argv=None):
+    if not argv:
+        argv = sys.argv
+
+    if len(sys.argv) < 2:
+        sys.stderr.write("Please provide an input filename.")
+        sys.exit(1)
+    elif len(sys.argv) < 3:
+        sys.stderr.write("Please provide a destination directory.")
+        sys.exit(1)
+
+    input_file = sys.argv[1]
+    dest_dir = sys.argv[2]
+
+    if not os.path.exists(input_file):
+        sys.stderr.write("Input file does not exist.")
+        sys.exit(1)
+
+    if not os.path.exists(dest_dir):
+        os.mkdir(dest_dir)
+
+    extract_code_fragments(dest_dir, get_lines(input_file))
+
+
+if __name__ == "__main__":
+    sys.exit(main())