Make lexer buffer the whole file

This way, it won't have to go through a bunch of calls for each byte fetched.
2011-04-08 16:48:17 +02:00 · 2011-04-08 16:48:17 +02:00 · 094d31f5e4
commit 094d31f5e4
parent cae703c0b1
2 changed files with 82 additions and 67 deletions
--- a/src/comp/front/lexer.rs
+++ b/src/comp/front/lexer.rs
@ -11,24 +11,26 @@ import util.common;
 import util.common.new_str_hash;

 state type reader = state obj {
-                          fn is_eof() -> bool;
-                          fn curr() -> char;
-                          fn next() -> char;
-                          impure fn bump();
-                          fn mark();
-                          fn get_filename() -> str;
-                          fn get_mark_pos() -> common.pos;
-                          fn get_curr_pos() -> common.pos;
-                          fn get_keywords() -> hashmap[str,token.token];
-                          fn get_reserved() -> hashmap[str,()];
+    fn is_eof() -> bool;
+    fn curr() -> char;
+    fn next() -> char;
+    impure fn init();
+    impure fn bump();
+    fn mark();
+    fn get_filename() -> str;
+    fn get_mark_pos() -> common.pos;
+    fn get_curr_pos() -> common.pos;
+    fn get_keywords() -> hashmap[str,token.token];
+    fn get_reserved() -> hashmap[str,()];
 };

 impure fn new_reader(io.reader rdr, str filename) -> reader
 {
-    state obj reader(io.reader rdr,
+    state obj reader(str file,
                     str filename,
-                     mutable char c,
-                     mutable char n,
+                     uint len,
+                     mutable uint pos,
+                     mutable char ch,
                     mutable uint mark_line,
                     mutable uint mark_col,
                     mutable uint line,
@ -36,64 +38,69 @@ impure fn new_reader(io.reader rdr, str filename) -> reader
                     hashmap[str,token.token] keywords,
                     hashmap[str,()] reserved) {

-            fn is_eof() -> bool {
-                ret c == (-1) as char;
+        fn is_eof() -> bool { // True while `ch` holds the -1 sentinel: its constructed value, and what bump() stores once the buffer is used up.
+            ret ch == -1 as char;
+        }
+
+        fn get_curr_pos() -> common.pos { // Current line/col counters (maintained by bump()) packed into a position record.
+            ret rec(line=line, col=col);
+        }
+
+        fn get_mark_pos() -> common.pos { // Line/col saved by the most recent mark(); the constructed values if mark() was never called.
+            ret rec(line=mark_line, col=mark_col);
+        }
+
+        fn get_filename() -> str { // The filename string handed to new_reader; the reader itself never opens it.
+            ret filename;
+        }
+
+        fn curr() -> char { // The character the reader is currently positioned on (-1 as char at end of input).
+            ret ch;
+        }
+
+        fn next() -> char { // Peek at the character following curr() without changing any reader state.
+            if (pos < len) {ret _str.char_at(file, pos);} // pos was set from char_range_at's _1 field when `ch` was loaded
+            else {ret -1 as char;} // nothing left in the buffer: same sentinel value is_eof() compares against
+        }
+            
+        impure fn init() { // Load the first character into `ch`; new_reader calls this once right after constructing the object.
+            if (pos < len) { // empty buffer: skip, so `ch` keeps the value it was constructed with
+                auto next = _str.char_range_at(file, pos); // _0 is taken as the character, _1 as the new pos
+                pos = next._1;
+                ch = next._0;
             }
+        }

-            fn get_curr_pos() -> common.pos {
-                ret rec(line=line, col=col);
-            }
-
-            fn get_mark_pos() -> common.pos {
-                ret rec(line=mark_line, col=mark_col);
-            }
-
-            fn get_filename() -> str {
-                ret filename;
-            }
-
-            fn curr() -> char {
-                ret c;
-            }
-
-            fn next() -> char {
-                ret n;
-            }
-
-            impure fn bump() {
-
-                let char prev = c;
-
-                c = n;
-
-                if (c == (-1) as char) {
-                    ret;
-                }
-
-                if (prev == '\n') {
+        impure fn bump() { // Advance to the next character, updating line/col for the character being left behind.
+            if (pos < len) {
+                if (ch == '\n') { // leaving a newline: start a new line at column 0
                     line += 1u;
                     col = 0u;
                 } else {
                     col += 1u;
                 }
-
-                n = rdr.read_char();
-            }
-
-            fn mark() {
-                mark_line = line;
-                mark_col = col;
-            }
-
-            fn get_keywords() -> hashmap[str,token.token] {
-                ret keywords;
-            }
-
-            fn get_reserved() -> hashmap[str,()] {
-                ret reserved;
+                auto next = _str.char_range_at(file, pos); // _0 is taken as the new `ch`, _1 as the new pos
+                pos = next._1;
+                ch = next._0;
+            } else {
+                ch = -1 as char; // buffer exhausted: store the EOF sentinel; line/col are left untouched
             }
         }

+        fn mark() { // Snapshot the current line/col so get_mark_pos() can report them later.
+            mark_line = line;
+            mark_col = col;
+        }
+
+        fn get_keywords() -> hashmap[str,token.token] { // The keyword table that new_reader builds and passes in at construction.
+            ret keywords;
+        }
+
+        fn get_reserved() -> hashmap[str,()] { // The reserved-word set that new_reader builds and passes in at construction.
+            ret reserved;
+        }
+    }
+
    auto keywords = new_str_hash[token.token]();

    keywords.insert("mod", token.MOD);
@ -208,13 +215,14 @@ impure fn new_reader(io.reader rdr, str filename) -> reader
    reserved.insert("m128", ()); // IEEE 754-2008 'decimal128'
    reserved.insert("dec", ());  // One of m32, m64, m128

-    ret reader(rdr, filename, rdr.read_char(),
-               rdr.read_char(), 1u, 0u, 1u, 0u, keywords, reserved);
+    auto file = _str.unsafe_from_bytes(rdr.read_whole_stream());
+    auto rd = reader(file, filename, _str.byte_len(file), 0u, -1 as char,
+                     1u, 0u, 1u, 0u, keywords, reserved);
+    rd.init();
+    ret rd;
 }


-
-
 fn in_range(char c, char lo, char hi) -> bool {
    ret lo <= c && c <= hi;
 }
@ -689,7 +697,6 @@ impure fn next_token(reader rdr) -> token.token {

        case ('"') {
            rdr.bump();
-            // FIXME: general utf8-consumption support.
            while (rdr.curr() != '"') {
                alt (rdr.curr()) {
                    case ('\\') {
@ -850,7 +857,7 @@ impure fn read_block_comment(reader rdr) -> cmnt {

 impure fn gather_comments(str path) -> vec[cmnt] {
    auto srdr = io.file_reader(path);
-    auto rdr = lexer.new_reader(srdr, path);
+    auto rdr = new_reader(srdr, path);
    let vec[cmnt] comments = vec();
    while (!rdr.is_eof()) {
        while (true) {
--- a/src/lib/io.rs
+++ b/src/lib/io.rs
@ -41,6 +41,7 @@ type reader =
          impure fn read_le_uint(uint size) -> uint;
          impure fn read_le_int(uint size) -> int;
          impure fn read_be_uint(uint size) -> uint;
+          impure fn read_whole_stream() -> vec[u8];

          impure fn seek(int offset, seek_style whence);
          impure fn tell() -> uint; // FIXME: eventually u64
@ -170,6 +171,13 @@ state obj new_reader(buf_reader rdr) {
        }
        ret val;
    }
+    impure fn read_whole_stream() -> vec[u8] { // Keep reading from rdr until it reports eof and return every byte read, in order.
+        let vec[u8] buf = vec();
+        while (!rdr.eof()) { // NOTE(review): the loop exits only via eof() - confirm a failed read also ends it
+            buf += rdr.read(2048u); // append the result of one read(2048u) call per iteration
+        }
+        ret buf;
+    }
    impure fn seek(int offset, seek_style whence) {
        ret rdr.seek(offset, whence);
    }