parsing for the macro system

Paul Stansifer 2012-06-12 10:59:50 -07:00
parent 650dfe58a3
commit 4f104954a6
8 changed files with 370 additions and 29 deletions

View file

@ -377,6 +377,17 @@ enum token_tree {
    tt_flat(span, token::token)
}

+#[auto_serialize]
+type matcher = spanned<matcher_>;
+
+#[auto_serialize]
+enum matcher_ {
+    mtc_tok(token::token),
+    /* body, separator, zero ok? : */
+    mtc_rep([matcher], option<token::token>, bool),
+    mtc_bb(ident, ident, uint)
+}
+
#[auto_serialize]
type mac = spanned<mac_>;

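To make the three variants concrete: under the demo syntax added to parser.rs below, a rule such as `sep , many(parse e = expr)` -- zero or more comma-separated expressions, each bound to `e` -- should come out of `parse_matcher` as an `mtc_rep` wrapping a single `mtc_bb`. The same shape as a present-day Rust sketch (names hypothetical, tokens simplified to chars):

// Sketch only -- mirrors the shape of ast::matcher_, not the commit's types.
#[derive(Clone, Debug)]
enum Matcher {
    Tok(char),                                   // mtc_tok: one literal token
    Rep(Vec<Matcher>, Option<char>, bool),       // mtc_rep: body, separator, zero ok?
    BlackBox(&'static str, &'static str, usize), // mtc_bb: binder, NT name, name index
}

// `sep , many(parse e = expr)` would presumably be encoded as:
fn example_matcher() -> Matcher {
    Matcher::Rep(vec![Matcher::BlackBox("e", "expr", 0)], Some(','), true)
}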
View file

@ -0,0 +1,243 @@
// Earley-like parser for macros.
import parse::token;
import parse::token::{token, EOF, to_str, whole_nt};
import parse::lexer::{reader, tt_reader, tt_reader_as_reader};
import parse::parser::{parser,SOURCE_FILE};
import parse::common::parser_common;
import parse::parse_sess;
import dvec::{dvec, extensions};
import ast::{matcher, mtc_tok, mtc_rep, mtc_bb};

/* This is an Earley-like parser, without support for nonterminals. This
means that there are no completer or predictor rules, and therefore no need to
store one column per token: instead, there's a set of current Earley items and
a set of next ones. Instead of NTs, we have a special case for Kleene
star. The big-O, in pathological cases, is worse than traditional Earley
parsing, but it's an easier fit for Macro-by-Example-style rules, and I think
the overhead is lower. */

/* to avoid costly uniqueness checks, we require that `mtc_rep` always has a
nonempty body. */

enum matcher_pos_up { /* to break a circularity */
    matcher_pos_up(option<matcher_pos>)
}

fn is_some(&&mpu: matcher_pos_up) -> bool {
    alt mpu {
      matcher_pos_up(none) { false }
      _ { true }
    }
}

type matcher_pos = ~{
    elts: [ast::matcher], // maybe should be /& ? Need to understand regions.
    sep: option<token>,
    mut idx: uint,
    mut up: matcher_pos_up, // mutable for swapping only
    matches: [dvec<@arb_depth>]
};

fn copy_up(&& mpu: matcher_pos_up) -> matcher_pos {
    alt mpu {
      matcher_pos_up(some(mp)) { copy mp }
      _ { fail }
    }
}

fn count_names(ms: [matcher]/&) -> uint {
    vec::foldl(0u, ms, {|ct, m|
        ct + alt m.node {
          mtc_tok(_) { 0u }
          mtc_rep(more_ms, _, _) { count_names(more_ms) }
          mtc_bb(_,_,_) { 1u }
        }})
}
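// e.g. for the matcher `sep , many(parse e = expr)` -- one `mtc_bb` nested
// inside an `mtc_rep` -- count_names returns 1u: binders are counted at any
// depth, so `e` still gets name index 0.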
fn new_matcher_pos(ms: [matcher], sep: option<token>) -> matcher_pos {
    ~{elts: ms, sep: sep, mut idx: 0u, mut up: matcher_pos_up(none),
      matches: copy vec::from_fn(count_names(ms), {|_i| dvec::dvec()}) }
}

/* logically, an arb_depth should contain only one kind of nonterminal */
enum arb_depth { leaf(whole_nt), seq([@arb_depth]) }
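// e.g. matching `f(x), 17, []` against `sep , many(parse e = expr)` binds
// `e` to seq([@leaf(w_expr(..)), @leaf(w_expr(..)), @leaf(w_expr(..))]):
// one leaf per iteration, one level of `seq` per enclosing repetition.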
type earley_item = matcher_pos;

fn parse(sess: parse_sess, cfg: ast::crate_cfg, rdr: reader, ms: [matcher])
    -> [@arb_depth] {
    let mut cur_eis = [];
    vec::push(cur_eis, new_matcher_pos(ms, none));

    loop {
        let mut bb_eis = []; // black-box parsed by parser.rs
        let mut next_eis = []; // or proceed normally
        let mut eof_eis = [];

        let {tok: tok, sp: _} = rdr.peek();

        /* we append new items to this while we go */
        while cur_eis.len() > 0u { /* for each Earley Item */
            let mut ei = vec::pop(cur_eis);

            let idx = ei.idx;
            let len = ei.elts.len();

            /* at end of sequence */
            if idx >= len {
                // can't move out of `alt`s, so:
                if is_some(ei.up) {
                    // hack: a matcher sequence is repeating iff it has a
                    // parent (the top level is just a container)

                    // disregard separator, try to go up
                    // (remove this condition to make trailing seps ok)
                    if idx == len {
                        // pop from the matcher position
                        let new_pos = copy_up(ei.up);

                        // update matches (the MBE "parse tree") by appending
                        // each tree as a subtree.
                        // I bet this is a perf problem: we're preemptively
                        // doing a lot of array work that will get thrown away
                        // most of the time.
                        for ei.matches.eachi() { |idx, elt|
                            new_pos.matches[idx].push(@seq(elt.get()));
                        }

                        new_pos.idx += 1u;
                        vec::push(cur_eis, new_pos);
                    }

                    // can we go around again?
                    // the *_t vars are workarounds for the lack of unary move
                    alt copy ei.sep {
                      some(t) if idx == len { // we need a separator
                        if tok == t { //pass the separator
                            let ei_t <- ei;
                            ei_t.idx += 1u;
                            vec::push(next_eis, ei_t);
                        }
                      }
                      _ { // we don't need a separator
                        let ei_t <- ei;
                        ei_t.idx = 0u;
                        vec::push(cur_eis, ei_t);
                      }
                    }
                } else {
                    vec::push(eof_eis, ei);
                }
            } else {
                alt copy ei.elts[idx].node {
                  /* need to descend into sequence */
                  mtc_rep(matchers, sep, zero_ok) {
                    if zero_ok {
                        let new_ei = copy ei;
                        new_ei.idx += 1u;
                        vec::push(cur_eis, new_ei);
                    }
                    let matches = vec::map(ei.matches, // fresh, same size:
                                           {|_m| dvec::<@arb_depth>()});
                    let ei_t <- ei;
                    vec::push(cur_eis, ~{
                        elts: matchers, sep: sep, mut idx: 0u,
                        mut up: matcher_pos_up(some(ei_t)),
                        matches: matches
                    });
                  }
                  mtc_bb(_,_,_) { vec::push(bb_eis, ei) }
                  mtc_tok(t) {
                    let ei_t <- ei;
                    if t == tok { ei_t.idx += 1u; vec::push(next_eis, ei_t)}
                  }
                }
            }
        }

        /* error messages here could be improved with links to orig. rules */
        if tok == EOF {
            if eof_eis.len() == 1u {
                let ret_val = vec::map(eof_eis[0u].matches, {|dv| dv.pop()});
                ret ret_val; /* success */
            } else if eof_eis.len() > 1u {
                rdr.fatal("Ambiguity: multiple successful parses");
            } else {
                rdr.fatal("Unexpected end of macro invocation");
            }
        } else {
            if (bb_eis.len() > 0u && next_eis.len() > 0u)
                || bb_eis.len() > 1u {
                let nts = str::connect(vec::map(bb_eis, {|ei|
                    alt ei.elts[ei.idx].node
                    { mtc_bb(_,name,_) { *name } _ { fail; } }
                }), " or ");
                rdr.fatal(#fmt["Local ambiguity: multiple parsing options: \
                                built-in NTs %s or %u other options.",
                               nts, next_eis.len()]);
            } else if (bb_eis.len() == 0u && next_eis.len() == 0u) {
                rdr.fatal("No rules expected the token "
                          + to_str(*rdr.interner(), tok));
            } else if (next_eis.len() > 0u) {
                /* Now process the next token */
                while(next_eis.len() > 0u) {
                    vec::push(cur_eis, vec::pop(next_eis));
                }
                rdr.next_token();
            } else /* bb_eis.len() == 1 */ {
                let rust_parser = parser(sess, cfg, rdr.dup(), SOURCE_FILE);

                let ei = vec::pop(bb_eis);
                alt ei.elts[ei.idx].node {
                  mtc_bb(_, name, idx) {
                    ei.matches[idx].push(@leaf(
                        parse_nt(rust_parser, *name)));
                    ei.idx += 1u;
                  }
                  _ { fail; }
                }

                vec::push(cur_eis,ei);

                /* this would fail if zero-length tokens existed */
                while rdr.peek().sp.lo < rust_parser.span.lo {
                    rdr.next_token();
                }
            }
        }

        assert cur_eis.len() > 0u;
    }
}

fn parse_nt(p: parser, name: str) -> whole_nt {
    alt name {
      "item" { alt p.parse_item([], ast::public) {
        some(i) { token::w_item(i) }
        none { p.fatal("expected an item keyword") }
      }}
      "block" { token::w_block(p.parse_block()) }
      "stmt" { token::w_stmt(p.parse_stmt([])) }
      "pat" { token::w_pat(p.parse_pat()) }
      "expr" { token::w_expr(p.parse_expr()) }
      "ty" { token::w_ty(p.parse_ty(false /* no need to disambiguate*/)) }
      // this could be handled like a token, since it is one
      "ident" { token::w_ident(p.parse_ident()) }
      "path" { token::w_path(p.parse_path_with_tps(false)) }
      _ { p.fatal("Unsupported builtin nonterminal parser: " + name)}
    }
}
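// Note: parse_nt runs the ordinary Rust parser on a dup() of the macro
// parser's reader; the `while rdr.peek().sp.lo < rust_parser.span.lo` loop
// in parse() above then skips the macro parser past whatever the black-box
// parse consumed.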
// Local Variables:
// mode: rust;
// fill-column: 78;
// indent-tabs-mode: nil
// c-basic-offset: 4
// buffer-file-coding-system: utf-8-unix
// End:

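The item-stepping loop above is easier to see in isolation. Here is a stripped-down, runnable sketch of the same technique in present-day Rust -- a recognizer only, with no black-box nonterminals and no match capture, tokens simplified to chars, and the matcher_pos_up parent link rendered as Option<Box<Pos>>. All names are illustrative, not the commit's:

// matcher shape: literal tokens plus Kleene-style repetition
#[derive(Clone)]
enum M {
    Tok(char),
    Rep(Vec<M>, Option<char>, bool), // body, separator, zero ok?
}

// an "Earley item": how far we are into one matcher sequence
#[derive(Clone)]
struct Pos {
    elts: Vec<M>,
    sep: Option<char>,
    idx: usize,
    up: Option<Box<Pos>>, // enclosing position; None at top level
}

fn recognize(ms: &[M], input: &str) -> bool {
    let mut cur = vec![Pos { elts: ms.to_vec(), sep: None, idx: 0, up: None }];
    let mut toks = input.chars();
    let mut tok = toks.next(); // None plays the role of EOF
    loop {
        let mut next: Vec<Pos> = vec![]; // items that consumed `tok`
        let mut eof_ok = false;          // some item finished the top level
        while let Some(mut ei) = cur.pop() {
            let (idx, len) = (ei.idx, ei.elts.len());
            if idx >= len {
                // at the end of a sequence
                if ei.up.is_some() {
                    // repeating iff there is a parent; first try to pop out
                    // (this idx == len guard is what rejects trailing seps)
                    if idx == len {
                        let mut parent = ei.up.as_deref().unwrap().clone();
                        parent.idx += 1;
                        cur.push(parent);
                    }
                    // then try to go around again
                    match ei.sep {
                        Some(s) if idx == len => {
                            if tok == Some(s) {
                                ei.idx += 1; // step over the separator
                                next.push(ei);
                            }
                        }
                        _ => {
                            ei.idx = 0; // restart the repetition body
                            cur.push(ei);
                        }
                    }
                } else {
                    eof_ok = true;
                }
            } else {
                match ei.elts[idx].clone() {
                    M::Rep(body, sep, zero_ok) => {
                        if zero_ok {
                            // also try skipping the repetition entirely
                            let mut skip = ei.clone();
                            skip.idx += 1;
                            cur.push(skip);
                        }
                        // descend into the body, remembering where we were
                        cur.push(Pos { elts: body, sep, idx: 0,
                                       up: Some(Box::new(ei)) });
                    }
                    M::Tok(t) => {
                        if tok == Some(t) {
                            ei.idx += 1;
                            next.push(ei);
                        } // else: this item dies
                    }
                }
            }
        }
        match tok {
            None => return eof_ok,
            Some(_) if next.is_empty() => return false, // no rule expected it
            Some(_) => { cur = next; tok = toks.next(); }
        }
    }
}

fn main() {
    let ms = [M::Rep(vec![M::Tok('a')], Some(','), true)];
    assert!(recognize(&ms, ""));      // zero repetitions allowed (`many`)
    assert!(recognize(&ms, "a,a,a"));
    assert!(!recognize(&ms, "a,a,")); // trailing separator rejected
}

As in the original, an item that reaches the end of a repetition both pops out to its parent and tries to go around again, so the parser stays set-based instead of backtracking; it also shares the original's assumption that repetition bodies are nonempty.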
View file

@ -199,8 +199,9 @@ fn gather_comments_and_literals(span_diagnostic: diagnostic::span_handler,
        let bstart = rdr.pos;
        rdr.next_token();
+       //discard, and look ahead; we're working with internal state
-       let {tok: tok, sp: sp} = rdr.next_token();
+       let {tok: tok, sp: sp} = rdr.peek();
        if token::is_lit(tok) {
            let s = get_str_from(rdr, bstart);
            vec::push(literals, {lit: s, pos: sp.lo});

View file

@ -111,8 +111,8 @@ impl parser_common for parser {
        if !self.eat_keyword(word) {
            self.fatal("expecting " + word + ", found " +
                       token_to_str(self.reader, self.token));
        }
-   }
+   }

    fn is_restricted_keyword(word: str) -> bool {
        self.restricted_keywords.contains_key(word)

View file

@ -13,13 +13,17 @@ iface reader {
    fn is_eof() -> bool;
    fn next_token() -> {tok: token::token, sp: span};
    fn fatal(str) -> !;
+   fn span_diag() -> diagnostic::span_handler;
+   fn interner() -> @interner::interner<@str>;
+   fn peek() -> {tok: token::token, sp: span};
+   fn dup() -> reader;
}

enum tt_frame_up { /* to break a circularity */
    tt_frame_up(option<tt_frame>)
}

/* TODO: figure out how to have a uniquely linked stack, and change to `~` */
#[doc = "an unzipping of `token_tree`s"]
type tt_frame = @{
    readme: [ast::token_tree],
@ -27,7 +31,7 @@ type tt_frame = @{
    up: tt_frame_up
};

-type tt_reader = ~{
+type tt_reader = @{
    span_diagnostic: diagnostic::span_handler,
    interner: @interner::interner<@str>,
    mut cur: tt_frame,
@ -39,7 +43,7 @@ type tt_reader = ~{
fn new_tt_reader(span_diagnostic: diagnostic::span_handler,
                 itr: @interner::interner<@str>, src: [ast::token_tree])
    -> tt_reader {
-   let r = ~{span_diagnostic: span_diagnostic, interner: itr,
+   let r = @{span_diagnostic: span_diagnostic, interner: itr,
              mut cur: @{readme: src, mut idx: 0u,
                         up: tt_frame_up(option::none)},
              /* dummy values, never read: */
@ -62,7 +66,7 @@ pure fn dup_tt_frame(&&f: tt_frame) -> tt_frame {
}

pure fn dup_tt_reader(&&r: tt_reader) -> tt_reader {
-   ~{span_diagnostic: r.span_diagnostic, interner: r.interner,
+   @{span_diagnostic: r.span_diagnostic, interner: r.interner,
      mut cur: dup_tt_frame(r.cur),
      mut cur_tok: r.cur_tok, mut cur_span: r.cur_span}
}
@ -75,13 +79,17 @@ type string_reader = @{
    mut curr: char,
    mut chpos: uint,
    filemap: codemap::filemap,
-   interner: @interner::interner<@str>
+   interner: @interner::interner<@str>,
+   /* cached: */
+   mut peek_tok: token::token,
+   mut peek_span: span
};

fn new_string_reader(span_diagnostic: diagnostic::span_handler,
                     filemap: codemap::filemap,
                     itr: @interner::interner<@str>) -> string_reader {
    let r = new_low_level_string_reader(span_diagnostic, filemap, itr);
+   string_advance_token(r); /* fill in peek_* */
    ret r;
}
@ -93,7 +101,10 @@ fn new_low_level_string_reader(span_diagnostic: diagnostic::span_handler,
    let r = @{span_diagnostic: span_diagnostic, src: filemap.src,
              mut col: 0u, mut pos: 0u, mut curr: -1 as char,
              mut chpos: filemap.start_pos.ch,
-             filemap: filemap, interner: itr};
+             filemap: filemap, interner: itr,
+             /* dummy values; not read */
+             mut peek_tok: token::EOF,
+             mut peek_span: ast_util::mk_sp(0u,0u)};
    if r.pos < (*filemap.src).len() {
        let next = str::char_range_at(*r.src, r.pos);
        r.pos = next.next;
@ -102,23 +113,29 @@ fn new_low_level_string_reader(span_diagnostic: diagnostic::span_handler,
    ret r;
}

+fn dup_string_reader(&&r: string_reader) -> string_reader {
+   @{span_diagnostic: r.span_diagnostic, src: r.src,
+     mut col: r.col, mut pos: r.pos, mut curr: r.curr, mut chpos: r.chpos,
+     filemap: r.filemap, interner: r.interner,
+     mut peek_tok: r.peek_tok, mut peek_span: r.peek_span}
+}
+
impl string_reader_as_reader of reader for string_reader {
    fn is_eof() -> bool { is_eof(self) }
    fn next_token() -> {tok: token::token, sp: span} {
-       consume_whitespace_and_comments(self);
-       let start_chpos = self.chpos;
-       let tok = if is_eof(self) {
-           token::EOF
-       } else {
-           next_token_inner(self)
-       };
-       ret {tok: tok, sp: ast_util::mk_sp(start_chpos, self.chpos)};
+       let ret_val = {tok: self.peek_tok, sp: self.peek_span};
+       string_advance_token(self);
+       ret ret_val;
    }
    fn fatal(m: str) -> ! {
-       self.span_diagnostic.span_fatal(
-           ast_util::mk_sp(self.chpos, self.chpos), m)
+       self.span_diagnostic.span_fatal(copy self.peek_span, m)
    }
+   fn span_diag() -> diagnostic::span_handler { self.span_diagnostic }
+   fn interner() -> @interner::interner<@str> { self.interner }
+   fn peek() -> {tok: token::token, sp: span} {
+       {tok: self.peek_tok, sp: self.peek_span}
+   }
+   fn dup() -> reader { dup_string_reader(self) as reader }
}

impl tt_reader_as_reader of reader for tt_reader {
@ -135,13 +152,25 @@ impl tt_reader_as_reader of reader for tt_reader {
    fn fatal(m: str) -> ! {
        self.span_diagnostic.span_fatal(copy self.cur_span, m);
    }
+   fn span_diag() -> diagnostic::span_handler { self.span_diagnostic }
+   fn interner() -> @interner::interner<@str> { self.interner }
+   fn peek() -> {tok: token::token, sp: span} {
+       { tok: self.cur_tok, sp: self.cur_span }
+   }
+   fn dup() -> reader { dup_tt_reader(self) as reader }
}

+fn string_advance_token(&&r: string_reader) {
+   consume_whitespace_and_comments(r);
+
+   if is_eof(r) {
+       r.peek_tok = token::EOF;
+   } else {
+       let start_chpos = r.chpos;
+       r.peek_tok = next_token_inner(r);
+       r.peek_span = ast_util::mk_sp(start_chpos, r.chpos);
+   };
+}
fn tt_next_token(&&r: tt_reader) -> {tok: token::token, sp: span} {
@ -149,11 +178,11 @@ fn tt_next_token(&&r: tt_reader) -> {tok: token::token, sp: span} {
    if r.cur.idx >= vec::len(r.cur.readme) {
        /* done with this set; pop */
        alt r.cur.up {
-         tt_frame_up(option::none) {
+         tt_frame_up(none) {
            r.cur_tok = token::EOF;
            ret ret_val;
          }
-         tt_frame_up(option::some(tt_f)) {
+         tt_frame_up(some(tt_f)) {
            r.cur = tt_f;
            /* the above `if` would need to be a `while` if we didn't know
               that the last thing in a `tt_delim` is always a `tt_flat` */
@ -165,11 +194,10 @@ fn tt_next_token(&&r: tt_reader) -> {tok: token::token, sp: span} {
       between popping and pushing until we got to an actual `tt_flat` */
    loop { /* because it's easiest, this handles `tt_delim` not starting
              with a `tt_flat`, even though it won't happen */
-       alt r.cur.readme[r.cur.idx] {
+       alt copy r.cur.readme[r.cur.idx] {
          tt_delim(tts) {
-           /* TODO: this copy should be a unary move, once they exist */
            r.cur = @{readme: tts, mut idx: 0u,
-                     up: tt_frame_up(option::some(copy r.cur)) };
+                     up: tt_frame_up(option::some(r.cur)) };
          }
          tt_flat(sp, tok) {
            r.cur_span = sp; r.cur_tok = tok;

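The string_reader changes implement the classic one-token-lookahead cache: the reader always holds the already-lexed next token, so peek() costs nothing and next_token() hands out the cached token and lexes the following one into the cache. A minimal sketch of the same protocol in present-day Rust (hypothetical names, chars standing in for tokens):

struct Reader<I: Iterator<Item = char>> {
    src: I,
    peek_tok: Option<char>, // the cache, like peek_tok/peek_span; None == EOF
}

impl<I: Iterator<Item = char>> Reader<I> {
    fn new(mut src: I) -> Self {
        let peek_tok = src.next(); // fill the cache up front,
        Reader { src, peek_tok }   // like new_string_reader does
    }
    fn peek(&self) -> Option<char> { self.peek_tok }
    fn next_token(&mut self) -> Option<char> {
        let ret_val = self.peek_tok;
        self.peek_tok = self.src.next(); // like string_advance_token
        ret_val
    }
}

fn main() {
    let mut r = Reader::new("ab".chars());
    assert_eq!(r.peek(), Some('a')); // peeking never consumes
    assert_eq!(r.next_token(), Some('a'));
    assert_eq!(r.next_token(), Some('b'));
    assert_eq!(r.next_token(), None);
}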
View file

@ -803,10 +803,11 @@ class parser {
        } else if self.token == token::POUND
            && self.look_ahead(1u) == token::POUND {
            self.bump(); self.bump();
-           let macname = self.parse_path_without_tps();
-           let macbody = self.parse_token_tree();
-           ret pexpr(self.mk_mac_expr(lo, self.span.hi,
-                                      mac_invoc_tt(macname, macbody)));
+           //let macname = self.parse_path_without_tps();
+           //let macbody = self.parse_token_tree();
+           //ret pexpr(self.mk_mac_expr(lo, self.span.hi,
+           //                           mac_invoc_tt(macname, macbody)));
+           ret pexpr(self.parse_tt_mac_demo());
        } else if self.token == token::POUND
            && self.look_ahead(1u) == token::LT {
            self.bump(); self.bump();
@ -1084,6 +1085,58 @@ class parser {
        };
    }

+   /* temporary */
+   fn parse_tt_mac_demo() -> @expr {
+       let ms = self.parse_seq(token::LBRACE, token::RBRACE,
+                               common::seq_sep_none(),
+                               {|p| p.parse_matcher(@mut 0u)}).node;
+       let tt = self.parse_token_tree();
+       alt tt {
+         tt_delim(tts) {
+           let rdr = lexer::new_tt_reader(self.reader.span_diag(),
+                                          self.reader.interner(), tts)
+               as reader;
+           ext::earley_parser::parse(self.sess, self.cfg, rdr, ms);
+         }
+         _ { fail; }
+       }
+       ret self.mk_expr(0u, 0u, expr_break);
+   }
+
+   fn parse_matcher(name_idx: @mut uint) -> matcher {
+       let lo = self.span.lo;
+
+       let mut sep = none;
+       if self.eat_keyword("sep") { sep = some(self.token); self.bump(); }
+
+       let m = if self.is_keyword("many")||self.is_keyword("at_least_one") {
+           let zero_ok = self.is_keyword("many");
+           self.bump();
+           let ms = (self.parse_seq(token::LPAREN, token::RPAREN,
+                                    common::seq_sep_none(),
+                                    {|p| p.parse_matcher(name_idx)}).node);
+           if ms.len() == 0u {
+               self.fatal("repetition body must be nonempty");
+           }
+           mtc_rep(ms, sep, zero_ok)
+       } else if option::is_some(sep) {
+           self.fatal("`sep <tok>` must precede `many` or `at_least_one`");
+       } else if self.eat_keyword("parse") {
+           let bound_to = self.parse_ident();
+           self.expect(token::EQ);
+           let nt_name = self.parse_ident();
+           let m = mtc_bb(bound_to, nt_name, *name_idx);
+           *name_idx += 1u;
+           m
+       } else {
+           let m = mtc_tok(self.token);
+           self.bump();
+           m
+       };
+       ret spanned(lo, self.span.hi, m);
+   }

    fn parse_prefix_expr() -> pexpr {
        let lo = self.span.lo;

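Together with the `##` hook above, parse_tt_mac_demo reads a brace-delimited matcher list, then one delimited token tree, and hands both to the Earley-style parser. A hypothetical invocation this demo plumbing would accept (not an example from the commit):

    ## { sep , many(parse e = expr) } (17, f(x), [1, 2])

parse_matcher turns the braced part into, schematically, `mtc_rep([mtc_bb(e, expr, 0u)], some(COMMA), true)`, and each of the three expressions is parsed by the `expr` black box. Note that the match result is discarded and a dummy `expr_break` expression is returned, so at this stage the demo exercises parsing only.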
View file

@ -85,7 +85,7 @@ enum token {
#[auto_serialize]
#[doc = "For interpolation during macro expansion."]
-enum whole_nonterminal {
+enum whole_nt {
    w_item(@ast::item),
    w_block(ast::blk),
    w_stmt(@ast::stmt),
@ -257,7 +257,9 @@ fn contextual_keyword_table() -> hashmap<str, ()> {
"self", "send", "static",
"to",
"use",
"with"
"with",
/* temp */
"sep", "many", "at_least_one", "parse"
];
for keys.each {|word|
words.insert(word, ());

View file

@ -29,6 +29,7 @@ mod util {
mod parse {
    export parser;
    export common;
+   export lexer;
    export token;
    export comments;
@ -64,6 +65,8 @@ mod ext {
    mod qquote;
    mod build;
+   mod earley_parser;
    mod fmt;
    mod env;
    mod simplext;