auto merge of #8545 : blake2-ppc/rust/iterbytes, r=alexcrichton

Address issue #5257. For example, these values all had the same hash value: ("aaa", "bbb", "ccc"), ("aaab", "bb", "ccc"), and ("aaabbb", "", "ccc"). IterBytes for &[A] now includes the length before calling iter_bytes on each element. IterBytes for &str is now terminated by a byte that does not appear in UTF-8, so only one more byte is processed when hashing strings.
2013-08-18 12:41:59 -07:00 · 2013-08-18 12:41:59 -07:00 · e185b049af
commit e185b049af
parent 88bd2155d7 bfa1331cd7
3 changed files with 88 additions and 60 deletions
--- a/src/libstd/hash.rs
+++ b/src/libstd/hash.rs
@ -409,6 +409,14 @@ mod tests {

    use uint;

+    // Hash just the bytes of the slice, without length prefix
+    struct Bytes<'self>(&'self [u8]);
+    impl<'self> IterBytes for Bytes<'self> {
+        fn iter_bytes(&self, _lsb0: bool, f: &fn(&[u8]) -> bool) -> bool {
+            f(**self)
+        }
+    }
+
    #[test]
    fn test_siphash() {
        let vecs : [[u8, ..8], ..64] = [
@ -496,7 +504,7 @@ mod tests {
        while t < 64 {
            debug!("siphash test %?", t);
            let vec = u8to64_le!(vecs[t], 0);
-            let out = buf.hash_keyed(k0, k1);
+            let out = Bytes(buf.as_slice()).hash_keyed(k0, k1);
            debug!("got %?, expected %?", out, vec);
            assert_eq!(vec, out);

@ -587,4 +595,18 @@ mod tests {
    fn test_float_hashes_of_zero() {
        assert_eq!(0.0.hash(), (-0.0).hash());
    }
+
+    #[test]
+    fn test_hash_no_concat_alias() {
+        let s = ("aa", "bb");
+        let t = ("aabb", "");
+        let u = ("a", "abb");
+
+        let v = (&[1u8], &[0u8, 0], &[0u8]);
+        let w = (&[1u8, 0, 0, 0], &[], &[]);
+
+        assert!(v != w);
+        assert!(s.hash() != t.hash() && s.hash() != u.hash());
+        assert!(v.hash() != w.hash());
+    }
 }
--- a/src/libstd/str/ascii.rs
+++ b/src/libstd/str/ascii.rs
@ -376,7 +376,6 @@ static ASCII_UPPER_MAP: &'static [u8] = &[
 #[cfg(test)]
 mod tests {
    use super::*;
-    use to_bytes::ToBytes;
    use str::from_char;

    macro_rules! v2ascii (
@ -445,7 +444,6 @@ mod tests {

    #[test]
    fn test_ascii_to_bytes() {
-        assert_eq!(v2ascii!(~[40, 32, 59]).to_bytes(false), ~[40u8, 32u8, 59u8]);
        assert_eq!(v2ascii!(~[40, 32, 59]).into_bytes(), ~[40u8, 32u8, 59u8]);
    }

--- a/src/libstd/to_bytes.rs
+++ b/src/libstd/to_bytes.rs
@ -15,37 +15,43 @@ The `ToBytes` and `IterBytes` traits
 */

 use cast;
+use container::Container;
 use io;
 use io::Writer;
 use iterator::Iterator;
 use option::{None, Option, Some};
-use str::StrSlice;
-use vec::ImmutableVector;
+use str::{Str, StrSlice};
+use vec::{Vector, ImmutableVector};

 pub type Cb<'self> = &'self fn(buf: &[u8]) -> bool;

-/**
- * A trait to implement in order to make a type hashable;
- * This works in combination with the trait `Hash::Hash`, and
- * may in the future be merged with that trait or otherwise
- * modified when default methods and trait inheritance are
- * completed.
- */
+///
+/// A trait to implement in order to make a type hashable.
+/// This works in combination with the trait `std::hash::Hash`, and
+/// may in the future be merged with that trait or otherwise
+/// modified when default methods and trait inheritance are
+/// completed.
+///
+/// IterBytes should be implemented so that the extent of the
+/// produced byte stream can be discovered, given the original
+/// type.
+/// For example, the IterBytes implementation for vectors emits
+/// its length first, and enums should emit their discriminant.
+///
 pub trait IterBytes {
-    /**
-     * Call the provided callback `f` one or more times with
-     * byte-slices that should be used when computing a hash
-     * value or otherwise "flattening" the structure into
-     * a sequence of bytes. The `lsb0` parameter conveys
-     * whether the caller is asking for little-endian bytes
-     * (`true`) or big-endian (`false`); this should only be
-     * relevant in implementations that represent a single
-     * multi-byte datum such as a 32 bit integer or 64 bit
-     * floating-point value. It can be safely ignored for
-     * larger structured types as they are usually processed
-     * left-to-right in declaration order, regardless of
-     * underlying memory endianness.
-     */
+    /// Call the provided callback `f` one or more times with
+    /// byte-slices that should be used when computing a hash
+    /// value or otherwise "flattening" the structure into
+    /// a sequence of bytes. The `lsb0` parameter conveys
+    /// whether the caller is asking for little-endian bytes
+    /// (`true`) or big-endian (`false`); this should only be
+    /// relevant in implementations that represent a single
+    /// multi-byte datum such as a 32 bit integer or 64 bit
+    /// floating-point value. It can be safely ignored for
+    /// larger structured types as they are usually processed
+    /// left-to-right in declaration order, regardless of
+    /// underlying memory endianness.
+    ///
    fn iter_bytes(&self, lsb0: bool, f: Cb) -> bool;
 }

@ -224,74 +230,76 @@ impl IterBytes for f64 {
 impl<'self,A:IterBytes> IterBytes for &'self [A] {
    #[inline]
    fn iter_bytes(&self, lsb0: bool, f: Cb) -> bool {
+        self.len().iter_bytes(lsb0, |b| f(b)) &&
        self.iter().advance(|elt| elt.iter_bytes(lsb0, |b| f(b)))
    }
 }

-impl<A:IterBytes,B:IterBytes> IterBytes for (A,B) {
-  #[inline]
-  fn iter_bytes(&self, lsb0: bool, f: Cb) -> bool {
-    match *self {
-      (ref a, ref b) => { a.iter_bytes(lsb0, |b| f(b)) &&
-                          b.iter_bytes(lsb0, |b| f(b)) }
+impl<A: IterBytes> IterBytes for (A, ) {
+    fn iter_bytes(&self, lsb0: bool, f: Cb) -> bool {
+        match *self {
+            (ref a, ) => a.iter_bytes(lsb0, |b| f(b))
+        }
    }
-  }
 }

-impl<A:IterBytes,B:IterBytes,C:IterBytes> IterBytes for (A,B,C) {
-  #[inline]
-  fn iter_bytes(&self, lsb0: bool, f: Cb) -> bool {
-    match *self {
-      (ref a, ref b, ref c) => {
-        a.iter_bytes(lsb0, |b| f(b)) &&
-        b.iter_bytes(lsb0, |b| f(b)) &&
-        c.iter_bytes(lsb0, |b| f(b))
-      }
-    }
-  }
-}
+macro_rules! iter_bytes_tuple(
+    ($($A:ident),+) => (
+        impl<$($A: IterBytes),+> IterBytes for ($($A),+) {
+            fn iter_bytes(&self, lsb0: bool, f: Cb) -> bool {
+                match *self {
+                    ($(ref $A),+) => {
+                        $(
+                            $A .iter_bytes(lsb0, |b| f(b))
+                        )&&+
+                    }
+                }
+            }
+        }
+    )
+)

-// Move this to vec, probably.
-fn borrow<'x,A>(a: &'x [A]) -> &'x [A] {
-    a
-}
+iter_bytes_tuple!(A, B)
+iter_bytes_tuple!(A, B, C)
+iter_bytes_tuple!(A, B, C, D)
+iter_bytes_tuple!(A, B, C, D, E)
+iter_bytes_tuple!(A, B, C, D, E, F)
+iter_bytes_tuple!(A, B, C, D, E, F, G)
+iter_bytes_tuple!(A, B, C, D, E, F, G, H)

 impl<A:IterBytes> IterBytes for ~[A] {
    #[inline]
    fn iter_bytes(&self, lsb0: bool, f: Cb) -> bool {
-        borrow(*self).iter_bytes(lsb0, f)
+        self.as_slice().iter_bytes(lsb0, f)
    }
 }

 impl<A:IterBytes> IterBytes for @[A] {
    #[inline]
    fn iter_bytes(&self, lsb0: bool, f: Cb) -> bool {
-        borrow(*self).iter_bytes(lsb0, f)
+        self.as_slice().iter_bytes(lsb0, f)
    }
 }

 impl<'self> IterBytes for &'self str {
    #[inline]
    fn iter_bytes(&self, _lsb0: bool, f: Cb) -> bool {
-        f(self.as_bytes())
+        // Terminate the string with a byte that does not appear in UTF-8
+        f(self.as_bytes()) && f([0xFF])
    }
 }

 impl IterBytes for ~str {
    #[inline]
-    fn iter_bytes(&self, _lsb0: bool, f: Cb) -> bool {
-        // this should possibly include the null terminator, but that
-        // breaks .find_equiv on hashmaps.
-        f(self.as_bytes())
+    fn iter_bytes(&self, lsb0: bool, f: Cb) -> bool {
+        self.as_slice().iter_bytes(lsb0, f)
    }
 }

 impl IterBytes for @str {
    #[inline]
-    fn iter_bytes(&self, _lsb0: bool, f: Cb) -> bool {
-        // this should possibly include the null terminator, but that
-        // breaks .find_equiv on hashmaps.
-        f(self.as_bytes())
+    fn iter_bytes(&self, lsb0: bool, f: Cb) -> bool {
+        self.as_slice().iter_bytes(lsb0, f)
    }
 }