From 7c04b3c5bd5b7e6fe70273d3e021818f86da4967 Mon Sep 17 00:00:00 2001 From: Alexis Beingessner Date: Sun, 5 Oct 2014 09:48:38 -0400 Subject: [PATCH] flesh out BTree docs --- src/libcollections/btree/map.rs | 43 +++++++++++++++++++++++++++++++++ src/libcollections/btree/set.rs | 5 ++++ 2 files changed, 48 insertions(+) diff --git a/src/libcollections/btree/map.rs b/src/libcollections/btree/map.rs index 956c3279d047..a061f9dcaef6 100644 --- a/src/libcollections/btree/map.rs +++ b/src/libcollections/btree/map.rs @@ -29,6 +29,47 @@ use ringbuf::RingBuf; /// A map based on a B-Tree. +/// +/// B-Trees represent a fundamental compromise between cache-efficiency and actually minimizing +/// the amount of work performed in a search. In theory, a binary search tree (BST) is the optimal +/// choice for a sorted map, as a perfectly balanced BST performs the theoretical minimum amount of +/// comparisons necessary to find an element (log<sub>2</sub>n). However, in practice the way this +/// is done is *very* inefficient for modern computer architectures. In particular, every element +/// is stored in its own individually heap-allocated node. This means that every single insertion +/// triggers a heap-allocation, and every single comparison should be a cache-miss. Since these +/// are both notably expensive things to do in practice, we are forced to at the very least reconsider +/// the BST strategy. +/// +/// A B-Tree instead makes each node contain B-1 to 2B-1 elements in a contiguous array. By doing +/// this, we reduce the number of allocations by a factor of B, and improve cache efficiency in +/// searches. However, this does mean that searches will have to do *more* comparisons on average. +/// The precise number of comparisons depends on the node search strategy used. For optimal cache +/// efficiency, one could search the nodes linearly. For optimal comparisons, one could search +/// the node using binary search. 
As a compromise, one could also perform a linear search +/// that initially only checks every i<sup>th</sup> element for some choice of i. +/// +/// Currently, our implementation simply performs naive linear search. This provides excellent +/// performance on *small* nodes of elements which are cheap to compare. However, in the future we +/// would like to further explore choosing the optimal search strategy based on the choice of B, +/// and possibly other factors. Using linear search, searching for a random element is expected +/// to take O(B log<sub>B</sub>n) comparisons, which is generally worse than a BST. In practice, +/// however, performance is excellent. `BTreeMap` is able to readily outperform `TreeMap` under +/// many workloads, and is competitive where it doesn't. BTreeMap also generally *scales* better +/// than TreeMap, making it more appropriate for large datasets. +/// +/// However, `TreeMap` may still be more appropriate to use in many contexts. If elements are very +/// large or expensive to compare, `TreeMap` may be more appropriate. It won't allocate any +/// more space than is needed, and will perform the minimal number of comparisons necessary. +/// `TreeMap` also provides much better performance stability guarantees. Generally, very few +/// changes need to be made to update a BST, and two updates are expected to take about the same +/// amount of time on roughly equal sized BSTs. However, a B-Tree's performance is much more +/// amortized. If a node is overfull, it must be split into two nodes. If a node is underfull, it +/// may be merged with another. Both of these operations are relatively expensive to perform, and +/// it's possible to force one to occur at every single level of the tree in a single insertion or +/// deletion. In fact, a malicious or otherwise unlucky sequence of insertions and deletions can +/// force this degenerate behaviour to occur on every operation. 
While the total amount of work +/// done on each operation isn't *catastrophic*, and *is* still bounded by O(B log<sub>B</sub>n), +/// it is certainly much slower when this happens. #[deriving(Clone)] pub struct BTreeMap { root: Node, @@ -93,6 +134,8 @@ impl BTreeMap { } /// Makes a new empty BTreeMap with the given B. + /// + /// B cannot be less than 2. pub fn with_b(b: uint) -> BTreeMap { assert!(b > 1, "B must be greater than 1"); BTreeMap { diff --git a/src/libcollections/btree/set.rs b/src/libcollections/btree/set.rs index b21af89742c9..8958f0ef5bee 100644 --- a/src/libcollections/btree/set.rs +++ b/src/libcollections/btree/set.rs @@ -23,6 +23,9 @@ use core::fmt::Show; use {Mutable, Set, MutableSet, MutableMap, Map}; /// A set based on a B-Tree. +/// +/// See BTreeMap's documentation for a detailed discussion of this collection's performance +/// benefits and drawbacks. #[deriving(Clone, Hash, PartialEq, Eq, Ord, PartialOrd)] pub struct BTreeSet{ map: BTreeMap, @@ -65,6 +68,8 @@ impl BTreeSet { } /// Makes a new BTreeSet with the given B. + /// + /// B cannot be less than 2. pub fn with_b(b: uint) -> BTreeSet { BTreeSet { map: BTreeMap::with_b(b) } }