From ef46314d1e6418b57a1d0a4efb5853dcaf8077d4 Mon Sep 17 00:00:00 2001 From: Daniel Patterson Date: Tue, 24 Jul 2012 23:21:32 -0400 Subject: [PATCH] std: integrating erickt's url encoding/decoding from github.com/erickt/rust-uri into std::net::url --- src/libstd/net_url.rs | 389 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 385 insertions(+), 4 deletions(-) diff --git a/src/libstd/net_url.rs b/src/libstd/net_url.rs index 5f77e7d414fb..c54d624cfe6e 100644 --- a/src/libstd/net_url.rs +++ b/src/libstd/net_url.rs @@ -1,6 +1,17 @@ //! Types/fns concerning URLs (see RFC 3986) -export url, userinfo, query, from_str, to_str, get_scheme; +import map; +import map::{hashmap, str_hash}; +import io::{reader, reader_util}; +import dvec::{dvec, extensions}; + +export url, userinfo, query; +export from_str, to_str; +export get_scheme; + +export encode, decode; +export encode_component, decode_component; +export encode_form_urlencoded, decode_form_urlencoded; type url = { scheme: ~str, @@ -28,6 +39,223 @@ fn userinfo(-user: ~str, -pass: option<~str>) -> userinfo { {user: user, pass: pass} } +fn encode_inner(s: ~str, full_url: bool) -> ~str { + do io::with_str_reader(s) |rdr| { + let mut out = ~""; + + while !rdr.eof() { + let ch = rdr.read_byte() as char; + alt ch { + // unreserved: + 'A' to 'Z' | + 'a' to 'z' | + '0' to '9' | + '-' | '.' | '_' | '~' { + str::push_char(out, ch); + } + _ { + if full_url { + alt ch { + // gen-delims: + ':' | '/' | '?' | '#' | '[' | ']' | '@' | + + // sub-delims: + '!' | '$' | '&' | '"' | '(' | ')' | '*' | + '+' | ',' | ';' | '=' { + str::push_char(out, ch); + } + + _ { out += #fmt("%%%X", ch as uint); } + } + } else { + out += #fmt("%%%X", ch as uint); + } + } + } + } + + out + } +} + +/** Encodes a URI, percent encoding all characters except the unreserved and + * reserved (gen-delims and sub-delims) ones. + * + * This function is compliant with RFC 3986. 
+ */ +fn encode(s: ~str) -> ~str { + encode_inner(s, true) +} + +/** Encodes a URI component by replacing reserved characters with percent encoded + * character sequences. + * + * This function is compliant with RFC 3986. + */ +fn encode_component(s: ~str) -> ~str { + encode_inner(s, false) +} + +fn decode_inner(s: ~str, full_url: bool) -> ~str { + do io::with_str_reader(s) |rdr| { + let mut out = ~""; + + while !rdr.eof() { + alt rdr.read_char() { + '%' { + let bytes = rdr.read_bytes(2u); + let ch = uint::parse_buf(bytes, 16u).get() as char; + + if full_url { + // Only decode some characters: + alt ch { + // gen-delims: + ':' | '/' | '?' | '#' | '[' | ']' | '@' | + + // sub-delims: + '!' | '$' | '&' | '"' | '(' | ')' | '*' | + '+' | ',' | ';' | '=' { + str::push_char(out, '%'); + str::push_char(out, bytes[0u] as char); + str::push_char(out, bytes[1u] as char); + } + + ch { str::push_char(out, ch); } + } + } else { + str::push_char(out, ch); + } + } + ch { str::push_char(out, ch); } + } + } + + out + } +} + +/** Decode a string encoded with percent encoding. + * + * Escapes of reserved characters, which encode leaves as-is, are kept. + */ +fn decode(s: ~str) -> ~str { + decode_inner(s, true) +} + +/** Decode a string encoded with percent encoding. + */ +fn decode_component(s: ~str) -> ~str { + decode_inner(s, false) +} + +fn encode_plus(s: ~str) -> ~str { + do io::with_str_reader(s) |rdr| { + let mut out = ~""; + + while !rdr.eof() { + let ch = rdr.read_byte() as char; + alt ch { + 'A' to 'Z' | 'a' to 'z' | '0' to '9' | '_' | '.' | '-' { + str::push_char(out, ch); + } + ' ' { str::push_char(out, '+'); } + _ { out += #fmt("%%%X", ch as uint); } + } + } + + out + } +} + +/** Encode a hashmap to the 'application/x-www-form-urlencoded' media type. 
+ */ +fn encode_form_urlencoded(m: hashmap<~str, @dvec<@~str>>) -> ~str { + let mut out = ~""; + let mut first = true; + + for m.each |key, values| { + let key = encode_plus(key); + + for (*values).each |value| { + if first { + first = false; + } else { + str::push_char(out, '&'); + first = false; + } + + out += #fmt("%s=%s", key, encode_plus(*value)); + } + } + + out +} + +/** Decode a string encoded with the 'application/x-www-form-urlencoded' media + * type into a hashmap. + */ +fn decode_form_urlencoded(s: ~[u8]) -> hashmap<~str, @dvec<@~str>> { + do io::with_bytes_reader(s) |rdr| { + let m = str_hash(); + let mut key = ~""; + let mut value = ~""; + let mut parsing_key = true; + + while !rdr.eof() { + alt rdr.read_char() { + '&' | ';' { + if key != ~"" && value != ~"" { + let values = alt m.find(key) { + some(values) { values } + none { + let values = @dvec(); + m.insert(key, values); + values + } + }; + (*values).push(@value) + } + + parsing_key = true; + key = ~""; + value = ~""; + } + '=' { parsing_key = false; } + ch { + let ch = alt ch { + '%' { + uint::parse_buf(rdr.read_bytes(2u), 16u).get() as char + } + '+' { ' ' } + ch { ch } + }; + + if parsing_key { + str::push_char(key, ch) + } else { + str::push_char(value, ch) + } + } + } + } + + if key != ~"" && value != ~"" { + let values = alt m.find(key) { + some(values) { values } + none { + let values = @dvec(); + m.insert(key, values); + values + } + }; + (*values).push(@value) + } + + m + } +} + + fn split_char_first(s: ~str, c: char) -> (~str, ~str) { let mut v = str::splitn_char(s, c, 1); if v.len() == 1 { @@ -62,7 +290,7 @@ fn query_from_str(rawquery: ~str) -> query { if str::len(rawquery) != 0 { for str::split_char(rawquery, '&').each |p| { let (k, v) = split_char_first(p, '='); - vec::push(query, (k, v)); + vec::push(query, (decode_component(k), decode_component(v))); }; } return query; @@ -72,7 +300,7 @@ fn query_to_str(query: query) -> ~str { let mut strvec = ~[]; for query.each |kv| { let (k, v) = 
kv; - strvec += ~[fmt!{"%s=%s", k, v}]; + strvec += ~[#fmt("%s=%s", encode_component(k), encode_component(v))]; }; return str::connect(strvec, ~"&"); } @@ -130,7 +358,7 @@ fn from_str(rawurl: ~str) -> result::result { let (rest, query) = split_char_first(rest, '?'); let query = query_from_str(query); let (host, pth) = split_char_first(rest, '/'); - let mut path = pth; + let mut path = decode_component(pth); if str::len(path) != 0 { str::unshift_char(path, '/'); } @@ -242,4 +470,157 @@ mod tests { assert to_str(result::unwrap(from_str(url))) == url; } + #[test] + fn test_url_component_encoding() { + let url = ~"http://rust-lang.org/doc%20uments?ba%25d%20=%23%26%2B"; + let u = result::unwrap(from_str(url)); + assert u.path == ~"/doc uments"; + assert u.query.find(|kv| kv.first() == ~"ba%d ") + .get().second() == ~"#&+"; + } + + #[test] + fn test_encode() { + assert encode(~"") == ~""; + assert encode(~"http://example.com") == ~"http://example.com"; + assert encode(~"foo bar% baz") == ~"foo%20bar%25%20baz"; + assert encode(~" ") == ~"%20"; + assert encode(~"!") == ~"!"; + assert encode(~"\"") == ~"\""; + assert encode(~"#") == ~"#"; + assert encode(~"$") == ~"$"; + assert encode(~"%") == ~"%25"; + assert encode(~"&") == ~"&"; + assert encode(~"'") == ~"%27"; + assert encode(~"(") == ~"("; + assert encode(~")") == ~")"; + assert encode(~"*") == ~"*"; + assert encode(~"+") == ~"+"; + assert encode(~",") == ~","; + assert encode(~"/") == ~"/"; + assert encode(~":") == ~":"; + assert encode(~";") == ~";"; + assert encode(~"=") == ~"="; + assert encode(~"?") == ~"?"; + assert encode(~"@") == ~"@"; + assert encode(~"[") == ~"["; + assert encode(~"]") == ~"]"; + } + + #[test] + fn test_encode_component() { + assert encode_component(~"") == ~""; + assert encode_component(~"http://example.com") == + ~"http%3A%2F%2Fexample.com"; + assert encode_component(~"foo bar% baz") == ~"foo%20bar%25%20baz"; + assert encode_component(~" ") == ~"%20"; + assert encode_component(~"!") == 
~"%21"; + assert encode_component(~"#") == ~"%23"; + assert encode_component(~"$") == ~"%24"; + assert encode_component(~"%") == ~"%25"; + assert encode_component(~"&") == ~"%26"; + assert encode_component(~"'") == ~"%27"; + assert encode_component(~"(") == ~"%28"; + assert encode_component(~")") == ~"%29"; + assert encode_component(~"*") == ~"%2A"; + assert encode_component(~"+") == ~"%2B"; + assert encode_component(~",") == ~"%2C"; + assert encode_component(~"/") == ~"%2F"; + assert encode_component(~":") == ~"%3A"; + assert encode_component(~";") == ~"%3B"; + assert encode_component(~"=") == ~"%3D"; + assert encode_component(~"?") == ~"%3F"; + assert encode_component(~"@") == ~"%40"; + assert encode_component(~"[") == ~"%5B"; + assert encode_component(~"]") == ~"%5D"; + } + + #[test] + fn test_decode() { + assert decode(~"") == ~""; + assert decode(~"abc/def 123") == ~"abc/def 123"; + assert decode(~"abc%2Fdef%20123") == ~"abc%2Fdef 123"; + assert decode(~"%20") == ~" "; + assert decode(~"%21") == ~"%21"; + assert decode(~"%22") == ~"%22"; + assert decode(~"%23") == ~"%23"; + assert decode(~"%24") == ~"%24"; + assert decode(~"%25") == ~"%"; + assert decode(~"%26") == ~"%26"; + assert decode(~"%27") == ~"'"; + assert decode(~"%28") == ~"%28"; + assert decode(~"%29") == ~"%29"; + assert decode(~"%2A") == ~"%2A"; + assert decode(~"%2B") == ~"%2B"; + assert decode(~"%2C") == ~"%2C"; + assert decode(~"%2F") == ~"%2F"; + assert decode(~"%3A") == ~"%3A"; + assert decode(~"%3B") == ~"%3B"; + assert decode(~"%3D") == ~"%3D"; + assert decode(~"%3F") == ~"%3F"; + assert decode(~"%40") == ~"%40"; + assert decode(~"%5B") == ~"%5B"; + assert decode(~"%5D") == ~"%5D"; + } + + #[test] + fn test_decode_component() { + assert decode_component(~"") == ~""; + assert decode_component(~"abc/def 123") == ~"abc/def 123"; + assert decode_component(~"abc%2Fdef%20123") == ~"abc/def 123"; + assert decode_component(~"%20") == ~" "; + assert decode_component(~"%21") == ~"!"; + assert 
decode_component(~"%22") == ~"\""; + assert decode_component(~"%23") == ~"#"; + assert decode_component(~"%24") == ~"$"; + assert decode_component(~"%25") == ~"%"; + assert decode_component(~"%26") == ~"&"; + assert decode_component(~"%27") == ~"'"; + assert decode_component(~"%28") == ~"("; + assert decode_component(~"%29") == ~")"; + assert decode_component(~"%2A") == ~"*"; + assert decode_component(~"%2B") == ~"+"; + assert decode_component(~"%2C") == ~","; + assert decode_component(~"%2F") == ~"/"; + assert decode_component(~"%3A") == ~":"; + assert decode_component(~"%3B") == ~";"; + assert decode_component(~"%3D") == ~"="; + assert decode_component(~"%3F") == ~"?"; + assert decode_component(~"%40") == ~"@"; + assert decode_component(~"%5B") == ~"["; + assert decode_component(~"%5D") == ~"]"; + } + + #[test] + fn test_encode_form_urlencoded() { + let m = str_hash(); + assert encode_form_urlencoded(m) == ~""; + + m.insert(~"", @dvec()); + m.insert(~"foo", @dvec()); + assert encode_form_urlencoded(m) == ~""; + + let m = str_hash(); + m.insert(~"foo", @dvec::from_vec(~[mut @~"bar", @~"123"])); + assert encode_form_urlencoded(m) == ~"foo=bar&foo=123"; + + let m = str_hash(); + m.insert(~"foo bar", @dvec::from_vec(~[mut @~"abc", @~"12 = 34"])); + assert encode_form_urlencoded(m) == ~"foo+bar=abc&foo+bar=12+%3D+34"; + } + + #[test] + fn test_decode_form_urlencoded() { + import map::hash_from_strs; + + assert decode_form_urlencoded(~[]) == str_hash(); + + let s = str::bytes(~"a=1&foo+bar=abc&foo+bar=12+%3D+34"); + assert decode_form_urlencoded(s) == hash_from_strs(~[ + (~"a", @dvec::from_elem(@~"1")), + (~"foo bar", @dvec::from_vec(~[mut @~"abc", @~"12 = 34"])) + ]); + } + } +