parsing for the macro system

Paul Stansifer 2012-06-12 10:59:50 -07:00
parent 650dfe58a3
commit 4f104954a6
8 changed files with 370 additions and 29 deletions

View file

@ -377,6 +377,17 @@ enum token_tree {
    tt_flat(span, token::token)
}

+#[auto_serialize]
+type matcher = spanned<matcher_>;
+
+#[auto_serialize]
+enum matcher_ {
+    mtc_tok(token::token),
+    /* body, separator, zero ok? : */
+    mtc_rep([matcher], option<token::token>, bool),
+    mtc_bb(ident, ident, uint)
+}
+
#[auto_serialize]
type mac = spanned<mac_>;

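To make the three variants concrete: under the demo syntax added to parser.rs below, a rule such as `sep , many(parse e = expr)` -- zero or more comma-separated expressions, each bound to `e` -- should come out of `parse_matcher` as an `mtc_rep` wrapping a single `mtc_bb`. The same shape as a present-day Rust sketch (names hypothetical, tokens simplified to chars):

// Sketch only -- mirrors the shape of ast::matcher_, not the commit's types.
#[derive(Clone, Debug)]
enum Matcher {
    Tok(char),                                   // mtc_tok: one literal token
    Rep(Vec<Matcher>, Option<char>, bool),       // mtc_rep: body, separator, zero ok?
    BlackBox(&'static str, &'static str, usize), // mtc_bb: binder, NT name, name index
}

// `sep , many(parse e = expr)` would presumably be encoded as:
fn example_matcher() -> Matcher {
    Matcher::Rep(vec![Matcher::BlackBox("e", "expr", 0)], Some(','), true)
}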
View file

@ -0,0 +1,243 @@
// Earley-like parser for macros.
import parse::token;
import parse::token::{token, EOF, to_str, whole_nt};
import parse::lexer::{reader, tt_reader, tt_reader_as_reader};
import parse::parser::{parser,SOURCE_FILE};
import parse::common::parser_common;
import parse::parse_sess;
import dvec::{dvec, extensions};
import ast::{matcher, mtc_tok, mtc_rep, mtc_bb};

/* This is an Earley-like parser, without support for nonterminals. This
means that there are no completer or predictor rules, and therefore no need to
store one column per token: instead, there's a set of current Earley items and
a set of next ones. Instead of NTs, we have a special case for Kleene
star. The big-O, in pathological cases, is worse than traditional Earley
parsing, but it's an easier fit for Macro-by-Example-style rules, and I think
the overhead is lower. */

/* to avoid costly uniqueness checks, we require that `mtc_rep` always has a
nonempty body. */

enum matcher_pos_up { /* to break a circularity */
    matcher_pos_up(option<matcher_pos>)
}

fn is_some(&&mpu: matcher_pos_up) -> bool {
    alt mpu {
      matcher_pos_up(none) { false }
      _ { true }
    }
}

type matcher_pos = ~{
    elts: [ast::matcher], // maybe should be /& ? Need to understand regions.
    sep: option<token>,
    mut idx: uint,
    mut up: matcher_pos_up, // mutable for swapping only
    matches: [dvec<@arb_depth>]
};

fn copy_up(&& mpu: matcher_pos_up) -> matcher_pos {
    alt mpu {
      matcher_pos_up(some(mp)) { copy mp }
      _ { fail }
    }
}

fn count_names(ms: [matcher]/&) -> uint {
    vec::foldl(0u, ms, {|ct, m|
        ct + alt m.node {
          mtc_tok(_) { 0u }
          mtc_rep(more_ms, _, _) { count_names(more_ms) }
          mtc_bb(_,_,_) { 1u }
        }})
}
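// e.g. for the matcher `sep , many(parse e = expr)` -- one `mtc_bb` nested
// inside an `mtc_rep` -- count_names returns 1u: binders are counted at any
// depth, so `e` still gets name index 0.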
fn new_matcher_pos(ms: [matcher], sep: option<token>) -> matcher_pos {
    ~{elts: ms, sep: sep, mut idx: 0u, mut up: matcher_pos_up(none),
      matches: copy vec::from_fn(count_names(ms), {|_i| dvec::dvec()}) }
}

/* logically, an arb_depth should contain only one kind of nonterminal */
enum arb_depth { leaf(whole_nt), seq([@arb_depth]) }
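// e.g. matching `f(x), 17, []` against `sep , many(parse e = expr)` binds
// `e` to seq([@leaf(w_expr(..)), @leaf(w_expr(..)), @leaf(w_expr(..))]):
// one leaf per iteration, one level of `seq` per enclosing repetition.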
type earley_item = matcher_pos;

fn parse(sess: parse_sess, cfg: ast::crate_cfg, rdr: reader, ms: [matcher])
    -> [@arb_depth] {
    let mut cur_eis = [];
    vec::push(cur_eis, new_matcher_pos(ms, none));

    loop {
        let mut bb_eis = []; // black-box parsed by parser.rs
        let mut next_eis = []; // or proceed normally
        let mut eof_eis = [];

        let {tok: tok, sp: _} = rdr.peek();

        /* we append new items to this while we go */
        while cur_eis.len() > 0u { /* for each Earley Item */
            let mut ei = vec::pop(cur_eis);

            let idx = ei.idx;
            let len = ei.elts.len();

            /* at end of sequence */
            if idx >= len {
                // can't move out of `alt`s, so:
                if is_some(ei.up) {
                    // hack: a matcher sequence is repeating iff it has a
                    // parent (the top level is just a container)

                    // disregard separator, try to go up
                    // (remove this condition to make trailing seps ok)
                    if idx == len {
                        // pop from the matcher position
                        let new_pos = copy_up(ei.up);

                        // update matches (the MBE "parse tree") by appending
                        // each tree as a subtree.
                        // I bet this is a perf problem: we're preemptively
                        // doing a lot of array work that will get thrown away
                        // most of the time.
                        for ei.matches.eachi() { |idx, elt|
                            new_pos.matches[idx].push(@seq(elt.get()));
                        }

                        new_pos.idx += 1u;
                        vec::push(cur_eis, new_pos);
                    }

                    // can we go around again?
                    // the *_t vars are workarounds for the lack of unary move
                    alt copy ei.sep {
                      some(t) if idx == len { // we need a separator
                        if tok == t { //pass the separator
                            let ei_t <- ei;
                            ei_t.idx += 1u;
                            vec::push(next_eis, ei_t);
                        }
                      }
                      _ { // we don't need a separator
                        let ei_t <- ei;
                        ei_t.idx = 0u;
                        vec::push(cur_eis, ei_t);
                      }
                    }
                } else {
                    vec::push(eof_eis, ei);
                }
            } else {
                alt copy ei.elts[idx].node {
                  /* need to descend into sequence */
                  mtc_rep(matchers, sep, zero_ok) {
                    if zero_ok {
                        let new_ei = copy ei;
                        new_ei.idx += 1u;
                        vec::push(cur_eis, new_ei);
                    }
                    let matches = vec::map(ei.matches, // fresh, same size:
                                           {|_m| dvec::<@arb_depth>()});
                    let ei_t <- ei;
                    vec::push(cur_eis, ~{
                        elts: matchers, sep: sep, mut idx: 0u,
                        mut up: matcher_pos_up(some(ei_t)),
                        matches: matches
                    });
                  }
                  mtc_bb(_,_,_) { vec::push(bb_eis, ei) }
                  mtc_tok(t) {
                    let ei_t <- ei;
                    if t == tok { ei_t.idx += 1u; vec::push(next_eis, ei_t)}
                  }
                }
            }
        }

        /* error messages here could be improved with links to orig. rules */
        if tok == EOF {
            if eof_eis.len() == 1u {
                let ret_val = vec::map(eof_eis[0u].matches, {|dv| dv.pop()});
                ret ret_val; /* success */
            } else if eof_eis.len() > 1u {
                rdr.fatal("Ambiguity: multiple successful parses");
            } else {
                rdr.fatal("Unexpected end of macro invocation");
            }
        } else {
            if (bb_eis.len() > 0u && next_eis.len() > 0u)
                || bb_eis.len() > 1u {
                let nts = str::connect(vec::map(bb_eis, {|ei|
                    alt ei.elts[ei.idx].node
                    { mtc_bb(_,name,_) { *name } _ { fail; } }
                }), " or ");
                rdr.fatal(#fmt["Local ambiguity: multiple parsing options: \
                                built-in NTs %s or %u other options.",
                               nts, next_eis.len()]);
            } else if (bb_eis.len() == 0u && next_eis.len() == 0u) {
                rdr.fatal("No rules expected the token "
                          + to_str(*rdr.interner(), tok));
            } else if (next_eis.len() > 0u) {
                /* Now process the next token */
                while(next_eis.len() > 0u) {
                    vec::push(cur_eis, vec::pop(next_eis));
                }
                rdr.next_token();
            } else /* bb_eis.len() == 1 */ {
                let rust_parser = parser(sess, cfg, rdr.dup(), SOURCE_FILE);

                let ei = vec::pop(bb_eis);
                alt ei.elts[ei.idx].node {
                  mtc_bb(_, name, idx) {
                    ei.matches[idx].push(@leaf(
                        parse_nt(rust_parser, *name)));
                    ei.idx += 1u;
                  }
                  _ { fail; }
                }

                vec::push(cur_eis,ei);

                /* this would fail if zero-length tokens existed */
                while rdr.peek().sp.lo < rust_parser.span.lo {
                    rdr.next_token();
                }
            }
        }

        assert cur_eis.len() > 0u;
    }
}

fn parse_nt(p: parser, name: str) -> whole_nt {
    alt name {
      "item" { alt p.parse_item([], ast::public) {
        some(i) { token::w_item(i) }
        none { p.fatal("expected an item keyword") }
      }}
      "block" { token::w_block(p.parse_block()) }
      "stmt" { token::w_stmt(p.parse_stmt([])) }
      "pat" { token::w_pat(p.parse_pat()) }
      "expr" { token::w_expr(p.parse_expr()) }
      "ty" { token::w_ty(p.parse_ty(false /* no need to disambiguate*/)) }
      // this could be handled like a token, since it is one
      "ident" { token::w_ident(p.parse_ident()) }
      "path" { token::w_path(p.parse_path_with_tps(false)) }
      _ { p.fatal("Unsupported builtin nonterminal parser: " + name)}
    }
}
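// Note: parse_nt runs the ordinary Rust parser on a dup() of the macro
// parser's reader; the `while rdr.peek().sp.lo < rust_parser.span.lo` loop
// in parse() above then skips the macro parser past whatever the black-box
// parse consumed.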
// Local Variables:
// mode: rust;
// fill-column: 78;
// indent-tabs-mode: nil
// c-basic-offset: 4
// buffer-file-coding-system: utf-8-unix
// End:

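The item-stepping loop above is easier to see in isolation. Here is a stripped-down, runnable sketch of the same technique in present-day Rust -- a recognizer only, with no black-box nonterminals and no match capture, tokens simplified to chars, and the matcher_pos_up parent link rendered as Option<Box<Pos>>. All names are illustrative, not the commit's:

// matcher shape: literal tokens plus Kleene-style repetition
#[derive(Clone)]
enum M {
    Tok(char),
    Rep(Vec<M>, Option<char>, bool), // body, separator, zero ok?
}

// an "Earley item": how far we are into one matcher sequence
#[derive(Clone)]
struct Pos {
    elts: Vec<M>,
    sep: Option<char>,
    idx: usize,
    up: Option<Box<Pos>>, // enclosing position; None at top level
}

fn recognize(ms: &[M], input: &str) -> bool {
    let mut cur = vec![Pos { elts: ms.to_vec(), sep: None, idx: 0, up: None }];
    let mut toks = input.chars();
    let mut tok = toks.next(); // None plays the role of EOF
    loop {
        let mut next: Vec<Pos> = vec![]; // items that consumed `tok`
        let mut eof_ok = false;          // some item finished the top level
        while let Some(mut ei) = cur.pop() {
            let (idx, len) = (ei.idx, ei.elts.len());
            if idx >= len {
                // at the end of a sequence
                if ei.up.is_some() {
                    // repeating iff there is a parent; first try to pop out
                    // (this idx == len guard is what rejects trailing seps)
                    if idx == len {
                        let mut parent = ei.up.as_deref().unwrap().clone();
                        parent.idx += 1;
                        cur.push(parent);
                    }
                    // then try to go around again
                    match ei.sep {
                        Some(s) if idx == len => {
                            if tok == Some(s) {
                                ei.idx += 1; // step over the separator
                                next.push(ei);
                            }
                        }
                        _ => {
                            ei.idx = 0; // restart the repetition body
                            cur.push(ei);
                        }
                    }
                } else {
                    eof_ok = true;
                }
            } else {
                match ei.elts[idx].clone() {
                    M::Rep(body, sep, zero_ok) => {
                        if zero_ok {
                            // also try skipping the repetition entirely
                            let mut skip = ei.clone();
                            skip.idx += 1;
                            cur.push(skip);
                        }
                        // descend into the body, remembering where we were
                        cur.push(Pos { elts: body, sep, idx: 0,
                                       up: Some(Box::new(ei)) });
                    }
                    M::Tok(t) => {
                        if tok == Some(t) {
                            ei.idx += 1;
                            next.push(ei);
                        } // else: this item dies
                    }
                }
            }
        }
        match tok {
            None => return eof_ok,
            Some(_) if next.is_empty() => return false, // no rule expected it
            Some(_) => { cur = next; tok = toks.next(); }
        }
    }
}

fn main() {
    let ms = [M::Rep(vec![M::Tok('a')], Some(','), true)];
    assert!(recognize(&ms, ""));      // zero repetitions allowed (`many`)
    assert!(recognize(&ms, "a,a,a"));
    assert!(!recognize(&ms, "a,a,")); // trailing separator rejected
}

As in the original, an item that reaches the end of a repetition both pops out to its parent and tries to go around again, so the parser stays set-based instead of backtracking; it also shares the original's assumption that repetition bodies are nonempty.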
View file

@ -199,8 +199,9 @@ fn gather_comments_and_literals(span_diagnostic: diagnostic::span_handler,
        let bstart = rdr.pos;
        rdr.next_token();
+       //discard, and look ahead; we're working with internal state
-       let {tok: tok, sp: sp} = rdr.next_token();
+       let {tok: tok, sp: sp} = rdr.peek();
        if token::is_lit(tok) {
            let s = get_str_from(rdr, bstart);
            vec::push(literals, {lit: s, pos: sp.lo});

View file

@ -111,8 +111,8 @@ impl parser_common for parser {
        if !self.eat_keyword(word) {
            self.fatal("expecting " + word + ", found " +
                       token_to_str(self.reader, self.token));
        }
-   }
+   }

    fn is_restricted_keyword(word: str) -> bool {
        self.restricted_keywords.contains_key(word)

View file

@ -13,13 +13,17 @@ iface reader {
    fn is_eof() -> bool;
    fn next_token() -> {tok: token::token, sp: span};
    fn fatal(str) -> !;
+   fn span_diag() -> diagnostic::span_handler;
+   fn interner() -> @interner::interner<@str>;
+   fn peek() -> {tok: token::token, sp: span};
+   fn dup() -> reader;
}

enum tt_frame_up { /* to break a circularity */
    tt_frame_up(option<tt_frame>)
}

/* TODO: figure out how to have a uniquely linked stack, and change to `~` */
#[doc = "an unzipping of `token_tree`s"]
type tt_frame = @{
    readme: [ast::token_tree],
@ -27,7 +31,7 @@ type tt_frame = @{
    up: tt_frame_up
};

-type tt_reader = ~{
+type tt_reader = @{
    span_diagnostic: diagnostic::span_handler,
    interner: @interner::interner<@str>,
    mut cur: tt_frame,
@ -39,7 +43,7 @@ type tt_reader = ~{
fn new_tt_reader(span_diagnostic: diagnostic::span_handler,
                 itr: @interner::interner<@str>, src: [ast::token_tree])
    -> tt_reader {
-   let r = ~{span_diagnostic: span_diagnostic, interner: itr,
+   let r = @{span_diagnostic: span_diagnostic, interner: itr,
              mut cur: @{readme: src, mut idx: 0u,
                         up: tt_frame_up(option::none)},
              /* dummy values, never read: */
@ -62,7 +66,7 @@ pure fn dup_tt_frame(&&f: tt_frame) -> tt_frame {
}

pure fn dup_tt_reader(&&r: tt_reader) -> tt_reader {
-   ~{span_diagnostic: r.span_diagnostic, interner: r.interner,
+   @{span_diagnostic: r.span_diagnostic, interner: r.interner,
      mut cur: dup_tt_frame(r.cur),
      mut cur_tok: r.cur_tok, mut cur_span: r.cur_span}
}
@ -75,13 +79,17 @@ type string_reader = @{
    mut curr: char,
    mut chpos: uint,
    filemap: codemap::filemap,
-   interner: @interner::interner<@str>
+   interner: @interner::interner<@str>,
+   /* cached: */
+   mut peek_tok: token::token,
+   mut peek_span: span
};

fn new_string_reader(span_diagnostic: diagnostic::span_handler,
                     filemap: codemap::filemap,
                     itr: @interner::interner<@str>) -> string_reader {
    let r = new_low_level_string_reader(span_diagnostic, filemap, itr);
+   string_advance_token(r); /* fill in peek_* */
    ret r;
}
@ -93,7 +101,10 @@ fn new_low_level_string_reader(span_diagnostic: diagnostic::span_handler,
    let r = @{span_diagnostic: span_diagnostic, src: filemap.src,
              mut col: 0u, mut pos: 0u, mut curr: -1 as char,
              mut chpos: filemap.start_pos.ch,
-             filemap: filemap, interner: itr};
+             filemap: filemap, interner: itr,
+             /* dummy values; not read */
+             mut peek_tok: token::EOF,
+             mut peek_span: ast_util::mk_sp(0u,0u)};
    if r.pos < (*filemap.src).len() {
        let next = str::char_range_at(*r.src, r.pos);
        r.pos = next.next;
@ -102,23 +113,29 @@ fn new_low_level_string_reader(span_diagnostic: diagnostic::span_handler,
    ret r;
}

+fn dup_string_reader(&&r: string_reader) -> string_reader {
+   @{span_diagnostic: r.span_diagnostic, src: r.src,
+     mut col: r.col, mut pos: r.pos, mut curr: r.curr, mut chpos: r.chpos,
+     filemap: r.filemap, interner: r.interner,
+     mut peek_tok: r.peek_tok, mut peek_span: r.peek_span}
+}
+
impl string_reader_as_reader of reader for string_reader {
    fn is_eof() -> bool { is_eof(self) }
    fn next_token() -> {tok: token::token, sp: span} {
-       consume_whitespace_and_comments(self);
-       let start_chpos = self.chpos;
-       let tok = if is_eof(self) {
-           token::EOF
-       } else {
-           next_token_inner(self)
-       };
-       ret {tok: tok, sp: ast_util::mk_sp(start_chpos, self.chpos)};
+       let ret_val = {tok: self.peek_tok, sp: self.peek_span};
+       string_advance_token(self);
+       ret ret_val;
    }
    fn fatal(m: str) -> ! {
-       self.span_diagnostic.span_fatal(
-           ast_util::mk_sp(self.chpos, self.chpos), m)
+       self.span_diagnostic.span_fatal(copy self.peek_span, m)
    }
+   fn span_diag() -> diagnostic::span_handler { self.span_diagnostic }
+   fn interner() -> @interner::interner<@str> { self.interner }
+   fn peek() -> {tok: token::token, sp: span} {
+       {tok: self.peek_tok, sp: self.peek_span}
+   }
+   fn dup() -> reader { dup_string_reader(self) as reader }
}

impl tt_reader_as_reader of reader for tt_reader {
@ -135,13 +152,25 @@ impl tt_reader_as_reader of reader for tt_reader {
    fn fatal(m: str) -> ! {
        self.span_diagnostic.span_fatal(copy self.cur_span, m);
    }
+   fn span_diag() -> diagnostic::span_handler { self.span_diagnostic }
+   fn interner() -> @interner::interner<@str> { self.interner }
+   fn peek() -> {tok: token::token, sp: span} {
+       { tok: self.cur_tok, sp: self.cur_span }
+   }
+   fn dup() -> reader { dup_tt_reader(self) as reader }
}

+fn string_advance_token(&&r: string_reader) {
+   consume_whitespace_and_comments(r);
+
+   if is_eof(r) {
+       r.peek_tok = token::EOF;
+   } else {
+       let start_chpos = r.chpos;
+       r.peek_tok = next_token_inner(r);
+       r.peek_span = ast_util::mk_sp(start_chpos, r.chpos);
+   };
+}
fn tt_next_token(&&r: tt_reader) -> {tok: token::token, sp: span} {
@ -149,11 +178,11 @@ fn tt_next_token(&&r: tt_reader) -> {tok: token::token, sp: span} {
    if r.cur.idx >= vec::len(r.cur.readme) {
        /* done with this set; pop */
        alt r.cur.up {
-         tt_frame_up(option::none) {
+         tt_frame_up(none) {
            r.cur_tok = token::EOF;
            ret ret_val;
          }
-         tt_frame_up(option::some(tt_f)) {
+         tt_frame_up(some(tt_f)) {
            r.cur = tt_f;
            /* the above `if` would need to be a `while` if we didn't know
               that the last thing in a `tt_delim` is always a `tt_flat` */
@ -165,11 +194,10 @@ fn tt_next_token(&&r: tt_reader) -> {tok: token::token, sp: span} {
       between popping and pushing until we got to an actual `tt_flat` */
    loop { /* because it's easiest, this handles `tt_delim` not starting
              with a `tt_flat`, even though it won't happen */
-       alt r.cur.readme[r.cur.idx] {
+       alt copy r.cur.readme[r.cur.idx] {
          tt_delim(tts) {
-           /* TODO: this copy should be a unary move, once they exist */
            r.cur = @{readme: tts, mut idx: 0u,
-                     up: tt_frame_up(option::some(copy r.cur)) };
+                     up: tt_frame_up(option::some(r.cur)) };
          }
          tt_flat(sp, tok) {
            r.cur_span = sp; r.cur_tok = tok;

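The string_reader changes implement the classic one-token-lookahead cache: the reader always holds the already-lexed next token, so peek() costs nothing and next_token() hands out the cached token and lexes the following one into the cache. A minimal sketch of the same protocol in present-day Rust (hypothetical names, chars standing in for tokens):

struct Reader<I: Iterator<Item = char>> {
    src: I,
    peek_tok: Option<char>, // the cache, like peek_tok/peek_span; None == EOF
}

impl<I: Iterator<Item = char>> Reader<I> {
    fn new(mut src: I) -> Self {
        let peek_tok = src.next(); // fill the cache up front,
        Reader { src, peek_tok }   // like new_string_reader does
    }
    fn peek(&self) -> Option<char> { self.peek_tok }
    fn next_token(&mut self) -> Option<char> {
        let ret_val = self.peek_tok;
        self.peek_tok = self.src.next(); // like string_advance_token
        ret_val
    }
}

fn main() {
    let mut r = Reader::new("ab".chars());
    assert_eq!(r.peek(), Some('a')); // peeking never consumes
    assert_eq!(r.next_token(), Some('a'));
    assert_eq!(r.next_token(), Some('b'));
    assert_eq!(r.next_token(), None);
}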
View file

@ -803,10 +803,11 @@ class parser {
        } else if self.token == token::POUND
            && self.look_ahead(1u) == token::POUND {
            self.bump(); self.bump();
-           let macname = self.parse_path_without_tps();
-           let macbody = self.parse_token_tree();
-           ret pexpr(self.mk_mac_expr(lo, self.span.hi,
-                                      mac_invoc_tt(macname, macbody)));
+           //let macname = self.parse_path_without_tps();
+           //let macbody = self.parse_token_tree();
+           //ret pexpr(self.mk_mac_expr(lo, self.span.hi,
+           //                           mac_invoc_tt(macname, macbody)));
+           ret pexpr(self.parse_tt_mac_demo());
        } else if self.token == token::POUND
            && self.look_ahead(1u) == token::LT {
            self.bump(); self.bump();
@ -1084,6 +1085,58 @@ class parser {
        };
    }

+   /* temporary */
+   fn parse_tt_mac_demo() -> @expr {
+       let ms = self.parse_seq(token::LBRACE, token::RBRACE,
+                               common::seq_sep_none(),
+                               {|p| p.parse_matcher(@mut 0u)}).node;
+       let tt = self.parse_token_tree();
+       alt tt {
+         tt_delim(tts) {
+           let rdr = lexer::new_tt_reader(self.reader.span_diag(),
+                                          self.reader.interner(), tts)
+               as reader;
+           ext::earley_parser::parse(self.sess, self.cfg, rdr, ms);
+         }
+         _ { fail; }
+       }
+       ret self.mk_expr(0u, 0u, expr_break);
+   }
+
+   fn parse_matcher(name_idx: @mut uint) -> matcher {
+       let lo = self.span.lo;
+
+       let mut sep = none;
+       if self.eat_keyword("sep") { sep = some(self.token); self.bump(); }
+
+       let m = if self.is_keyword("many")||self.is_keyword("at_least_one") {
+           let zero_ok = self.is_keyword("many");
+           self.bump();
+           let ms = (self.parse_seq(token::LPAREN, token::RPAREN,
+                                    common::seq_sep_none(),
+                                    {|p| p.parse_matcher(name_idx)}).node);
+           if ms.len() == 0u {
+               self.fatal("repetition body must be nonempty");
+           }
+           mtc_rep(ms, sep, zero_ok)
+       } else if option::is_some(sep) {
+           self.fatal("`sep <tok>` must precede `many` or `at_least_one`");
+       } else if self.eat_keyword("parse") {
+           let bound_to = self.parse_ident();
+           self.expect(token::EQ);
+           let nt_name = self.parse_ident();
+           let m = mtc_bb(bound_to, nt_name, *name_idx);
+           *name_idx += 1u;
+           m
+       } else {
+           let m = mtc_tok(self.token);
+           self.bump();
+           m
+       };
+       ret spanned(lo, self.span.hi, m);
+   }

    fn parse_prefix_expr() -> pexpr {
        let lo = self.span.lo;

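Together with the `##` hook above, parse_tt_mac_demo reads a brace-delimited matcher list, then one delimited token tree, and hands both to the Earley-style parser. A hypothetical invocation this demo plumbing would accept (not an example from the commit):

    ## { sep , many(parse e = expr) } (17, f(x), [1, 2])

parse_matcher turns the braced part into, schematically, `mtc_rep([mtc_bb(e, expr, 0u)], some(COMMA), true)`, and each of the three expressions is parsed by the `expr` black box. Note that the match result is discarded and a dummy `expr_break` expression is returned, so at this stage the demo exercises parsing only.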
View file

@ -85,7 +85,7 @@ enum token {
#[auto_serialize]
#[doc = "For interpolation during macro expansion."]
-enum whole_nonterminal {
+enum whole_nt {
    w_item(@ast::item),
    w_block(ast::blk),
    w_stmt(@ast::stmt),
@ -257,7 +257,9 @@ fn contextual_keyword_table() -> hashmap<str, ()> {
"self", "send", "static",
"to",
"use",
"with"
"with",
/* temp */
"sep", "many", "at_least_one", "parse"
];
for keys.each {|word|
words.insert(word, ());

View file

@ -29,6 +29,7 @@ mod util {
mod parse {
    export parser;
    export common;
+   export lexer;
    export token;
    export comments;
@ -64,6 +65,8 @@ mod ext {
    mod qquote;
    mod build;
+   mod earley_parser;
    mod fmt;
    mod env;
    mod simplext;