From b8a4c1415b154fa1e5bd8bb54e681f0f5e21e2a4 Mon Sep 17 00:00:00 2001
From: Simon Sapin
Date: Fri, 13 Jun 2014 18:56:24 +0100
Subject: [PATCH] Add br##"xx"## raw byte string literals.

---
Reviewer notes (this section is not part of the commit message):

* Lexer: `next_token_inner` now also looks at `nextnextch()` so that `br"` and
  `br#` are not lexed as an identifier starting with `b`; the `b` branch
  dispatches `Some('r')` to the new `parse_raw_byte_string`.
* `parse_raw_byte_string` counts the leading `#` characters, raises
  "unterminated raw string" at EOF, rejects any other character before the
  opening quote, and reports "raw byte string must be ASCII" through
  `err_span_char` for every char above '\x7F'. The literal ends at a `"`
  followed by `hash_count` `#` characters; a shorter run resumes scanning at
  the character that broke the run.
* Token: new `LIT_BINARY_RAW(Rc<Vec<u8>>, uint)`. The parser lowers it to the
  same `LitBinary` value as `LIT_BINARY`, so the hash count only matters for
  pretty-printing in `to_str`.
* NOTE(review): this copy of the patch was rebuilt from a whitespace-collapsed
  extract in which `<...>` spans had been stripped. Line breaks, the generic
  arguments (`Rc<Vec<u8>>`, `collect::<String>()`) and the license header lines
  are restored and agree with the hunk headers and the diffstat, but leading
  indentation is reconstructed -- confirm context lines against the base blobs
  (or apply with `--ignore-whitespace`) before relying on it. The author's
  e-mail address was lost in extraction and has not been guessed.

 src/librustdoc/html/highlight.rs              |  2 +-
 src/libsyntax/parse/lexer/mod.rs              | 56 ++++++++++++++++++-
 src/libsyntax/parse/parser.rs                 |  1 +
 src/libsyntax/parse/token.rs                  |  7 +++
 src/test/compile-fail/raw-byte-string-eof.rs  | 16 ++++++
 .../compile-fail/raw-byte-string-literals.rs  | 17 ++++++
 src/test/run-pass/byte-literals.rs            |  8 ++-
 7 files changed, 102 insertions(+), 5 deletions(-)
 create mode 100644 src/test/compile-fail/raw-byte-string-eof.rs
 create mode 100644 src/test/compile-fail/raw-byte-string-literals.rs

diff --git a/src/librustdoc/html/highlight.rs b/src/librustdoc/html/highlight.rs
index 172a1be7b4e5..daa9ee3da844 100644
--- a/src/librustdoc/html/highlight.rs
+++ b/src/librustdoc/html/highlight.rs
@@ -140,7 +140,7 @@ fn doit(sess: &parse::ParseSess, mut lexer: lexer::StringReader,
             }
 
             // text literals
-            t::LIT_BYTE(..) | t::LIT_BINARY(..) |
+            t::LIT_BYTE(..) | t::LIT_BINARY(..) | t::LIT_BINARY_RAW(..) |
                 t::LIT_CHAR(..) | t::LIT_STR(..) | t::LIT_STR_RAW(..) => "string",
 
             // number literals
diff --git a/src/libsyntax/parse/lexer/mod.rs b/src/libsyntax/parse/lexer/mod.rs
index 59bcf059fcd3..31f15fd7495a 100644
--- a/src/libsyntax/parse/lexer/mod.rs
+++ b/src/libsyntax/parse/lexer/mod.rs
@@ -650,12 +650,13 @@ impl<'a> StringReader<'a> {
     /// token, and updates the interner
     fn next_token_inner(&mut self) -> token::Token {
         let c = self.curr;
-        if ident_start(c) && match (c.unwrap(), self.nextch()) {
+        if ident_start(c) && match (c.unwrap(), self.nextch(), self.nextnextch()) {
             // Note: r as in r" or r#" is part of a raw string literal,
             // b as in b' is part of a byte literal.
             // They are not identifiers, and are handled further down.
-           ('r', Some('"')) | ('r', Some('#')) |
-           ('b', Some('"')) | ('b', Some('\'')) => false,
+           ('r', Some('"'), _) | ('r', Some('#'), _) |
+           ('b', Some('"'), _) | ('b', Some('\''), _) |
+           ('b', Some('r'), Some('"')) | ('b', Some('r'), Some('#')) => false,
            _ => true
         } {
             let start = self.last_pos;
@@ -863,6 +864,7 @@ impl<'a> StringReader<'a> {
         return match self.curr {
             Some('\'') => parse_byte(self),
             Some('"') => parse_byte_string(self),
+            Some('r') => parse_raw_byte_string(self),
             _ => unreachable!()  // Should have been a token::IDENT above.
         };
 
@@ -978,6 +980,54 @@ impl<'a> StringReader<'a> {
             self_.bump();
             return token::LIT_BINARY(Rc::new(value));
         }
+
+        fn parse_raw_byte_string(self_: &mut StringReader) -> token::Token {
+            let start_bpos = self_.last_pos;
+            self_.bump();
+            let mut hash_count = 0u;
+            while self_.curr_is('#') {
+                self_.bump();
+                hash_count += 1;
+            }
+
+            if self_.is_eof() {
+                self_.fatal_span(start_bpos, self_.last_pos, "unterminated raw string");
+            } else if !self_.curr_is('"') {
+                self_.fatal_span_char(start_bpos, self_.last_pos,
+                                "only `#` is allowed in raw string delimitation; \
+                                 found illegal character",
+                                self_.curr.unwrap());
+            }
+            self_.bump();
+            let content_start_bpos = self_.last_pos;
+            let mut content_end_bpos;
+            'outer: loop {
+                match self_.curr {
+                    None => self_.fatal_span(start_bpos, self_.last_pos,
+                                             "unterminated raw string"),
+                    Some('"') => {
+                        content_end_bpos = self_.last_pos;
+                        for _ in range(0, hash_count) {
+                            self_.bump();
+                            if !self_.curr_is('#') {
+                                continue 'outer;
+                            }
+                        }
+                        break;
+                    },
+                    Some(c) => if c > '\x7F' {
+                        self_.err_span_char(self_.last_pos, self_.last_pos,
+                                            "raw byte string must be ASCII", c);
+                    }
+                }
+                self_.bump();
+            }
+            self_.bump();
+            let bytes = self_.with_str_from_to(content_start_bpos,
+                                               content_end_bpos,
+                                               |s| s.as_bytes().to_owned());
+            return token::LIT_BINARY_RAW(Rc::new(bytes), hash_count);
+        }
       }
       '"' => {
         let mut accum_str = String::new();
diff --git a/src/libsyntax/parse/parser.rs b/src/libsyntax/parse/parser.rs
index 826d28ef3ff8..ae2ec216bee6 100644
--- a/src/libsyntax/parse/parser.rs
+++ b/src/libsyntax/parse/parser.rs
@@ -1529,6 +1529,7 @@ impl<'a> Parser<'a> {
             token::LIT_STR_RAW(s, n) => {
                 LitStr(self.id_to_interned_str(s), ast::RawStr(n))
             }
+            token::LIT_BINARY_RAW(ref v, _) |
             token::LIT_BINARY(ref v) => LitBinary(v.clone()),
             token::LPAREN => { self.expect(&token::RPAREN); LitNil },
             _ => { self.unexpected_last(tok); }
diff --git a/src/libsyntax/parse/token.rs b/src/libsyntax/parse/token.rs
index b76dcaf0b94c..a2af417ed79a 100644
--- a/src/libsyntax/parse/token.rs
+++ b/src/libsyntax/parse/token.rs
@@ -88,6 +88,7 @@ pub enum Token {
     LIT_STR(ast::Ident),
     LIT_STR_RAW(ast::Ident, uint), /* raw str delimited by n hash symbols */
     LIT_BINARY(Rc<Vec<u8>>),
+    LIT_BINARY_RAW(Rc<Vec<u8>>, uint), /* raw binary str delimited by n hash symbols */
 
     /* Name components */
     // an identifier contains an "is_mod_name" boolean,
@@ -243,6 +244,10 @@ pub fn to_str(t: &Token) -> String {
             "b\"{}\"",
             v.iter().map(|&b| b as char).collect::<String>().escape_default())
       }
+      LIT_BINARY_RAW(ref s, n) => {
+        format!("br{delim}\"{string}\"{delim}",
+                 delim="#".repeat(n), string=s.as_slice().to_ascii().as_str_ascii())
+      }
 
       /* Name components */
       IDENT(s, _) => get_ident(s).get().to_string(),
@@ -298,6 +303,7 @@ pub fn can_begin_expr(t: &Token) -> bool {
       LIT_STR(_) => true,
       LIT_STR_RAW(_, _) => true,
       LIT_BINARY(_) => true,
+      LIT_BINARY_RAW(_, _) => true,
       POUND => true,
       AT => true,
       NOT => true,
@@ -338,6 +344,7 @@ pub fn is_lit(t: &Token) -> bool {
       LIT_STR(_) => true,
       LIT_STR_RAW(_, _) => true,
       LIT_BINARY(_) => true,
+      LIT_BINARY_RAW(_, _) => true,
       _ => false
     }
 }
diff --git a/src/test/compile-fail/raw-byte-string-eof.rs b/src/test/compile-fail/raw-byte-string-eof.rs
new file mode 100644
index 000000000000..83ea9db39b79
--- /dev/null
+++ b/src/test/compile-fail/raw-byte-string-eof.rs
@@ -0,0 +1,16 @@
+// Copyright 2014 The Rust Project Developers. See the COPYRIGHT
+// file at the top-level directory of this distribution and at
+// http://rust-lang.org/COPYRIGHT.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+
+pub fn main() {
+    br##"a"#;  //~ unterminated raw string
+}
+
+
diff --git a/src/test/compile-fail/raw-byte-string-literals.rs b/src/test/compile-fail/raw-byte-string-literals.rs
new file mode 100644
index 000000000000..7a3d1b2318a3
--- /dev/null
+++ b/src/test/compile-fail/raw-byte-string-literals.rs
@@ -0,0 +1,17 @@
+// Copyright 2014 The Rust Project Developers. See the COPYRIGHT
+// file at the top-level directory of this distribution and at
+// http://rust-lang.org/COPYRIGHT.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+
+pub fn main() {
+    br"é";  //~ raw byte string must be ASCII
+    br##~"a"~##;  //~ only `#` is allowed in raw string delimitation
+}
+
+
diff --git a/src/test/run-pass/byte-literals.rs b/src/test/run-pass/byte-literals.rs
index 58df7dc8efd2..5317fdc391fd 100644
--- a/src/test/run-pass/byte-literals.rs
+++ b/src/test/run-pass/byte-literals.rs
@@ -11,6 +11,7 @@
 
 static FOO: u8 = b'\xF0';
 static BAR: &'static [u8] = b"a\xF0\t";
+static BAZ: &'static [u8] = br"a\n";
 
 pub fn main() {
     assert_eq!(b'a', 97u8);
@@ -24,7 +25,6 @@ pub fn main() {
     assert_eq!(b'\xF0', 240u8);
     assert_eq!(FOO, 240u8);
 
-    // FIXME: Do we want this to be valid?
     assert_eq!([42, ..b'\t'].as_slice(), &[42, 42, 42, 42, 42, 42, 42, 42, 42]);
 
     match 42 {
@@ -47,4 +47,10 @@ pub fn main() {
         b"a\n" => {},
         _ => fail!(),
     }
+
+    assert_eq!(BAZ, &[97u8, 92u8, 110u8]);
+    assert_eq!(br"a\n", &[97u8, 92u8, 110u8]);
+    assert_eq!(br"a\n", b"a\\n");
+    assert_eq!(br###"a"##b"###, &[97u8, 34u8, 35u8, 35u8, 98u8]);
+    assert_eq!(br###"a"##b"###, b"a\"##b");
 }