rustdoc-search: yet another stringdex optimization attempt

This one uses a different tactic. It shouldn't significantly
increase the amount of downloaded index data, but still reduces
the amount of disk usage.

This one works by changing the suffix-only node representation
to omit some data that's needed for checking. Since those nodes
make up the bulk of the tree, it reduces the data they store,
but also requires validating the match by fetching the name
itself (but the names list is pretty small, and when I tried
it with wordnet "indexing" it was about the same).
This commit is contained in:
Michael Howell 2025-08-26 19:46:50 -07:00
parent 5ab69249f3
commit 80e18051cb
5 changed files with 648 additions and 165 deletions

View file

@ -5225,9 +5225,9 @@ dependencies = [
[[package]]
name = "stringdex"
version = "0.0.1-alpha4"
version = "0.0.1-alpha9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2841fd43df5b1ff1b042e167068a1fe9b163dc93041eae56ab2296859013a9a0"
checksum = "7081029913fd7d591c0112182aba8c98ae886b4f12edb208130496cd17dc3c15"
dependencies = [
"stacker",
]

View file

@ -21,7 +21,7 @@ rustdoc-json-types = { path = "../rustdoc-json-types" }
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0"
smallvec = "1.8.1"
stringdex = { version = "0.0.1-alpha4" }
stringdex = { version = "0.0.1-alpha9" }
tempfile = "3"
threadpool = "1.8.1"
tracing = "0.1"

View file

@ -1211,7 +1211,7 @@ class DocSearch {
* will never fulfill.
*/
async buildIndex() {
const nn = this.database.getIndex("normalizedName");
const nn = this.database.getData("normalizedName");
if (!nn) {
return;
}
@ -3706,7 +3706,7 @@ class DocSearch {
* @returns {AsyncGenerator<rustdoc.ResultObject>}
*/
async function*(currentCrate) {
const index = this.database.getIndex("normalizedName");
const index = this.database.getData("normalizedName");
if (!index) {
return;
}
@ -3835,8 +3835,7 @@ class DocSearch {
};
if (elem.normalizedPathLast === "") {
// faster full-table scan for this specific case.
const nameData = this.database.getData("name");
const l = nameData ? nameData.length : 0;
const l = index.length;
for (let id = 0; id < l; ++id) {
if (!idDuplicates.has(id)) {
idDuplicates.add(id);
@ -3938,7 +3937,7 @@ class DocSearch {
* @returns {AsyncGenerator<rustdoc.ResultObject>}
*/
async function*(inputs, output, typeInfo, currentCrate) {
const index = this.database.getIndex("normalizedName");
const index = this.database.getData("normalizedName");
if (!index) {
return;
}

View file

@ -5,17 +5,8 @@ declare namespace stringdex {
* The client interface to Stringdex.
*/
interface Database {
getIndex(colname: string): SearchTree|undefined;
getData(colname: string): DataColumn|undefined;
}
/**
* A search index file.
*/
interface SearchTree {
trie(): Trie;
search(name: Uint8Array|string): Promise<Trie?>;
searchLev(name: Uint8Array|string): AsyncGenerator<Trie>;
}
/**
* A compressed node in the search tree.
*
@ -29,9 +20,7 @@ declare namespace stringdex {
matches(): RoaringBitmap;
substringMatches(): AsyncGenerator<RoaringBitmap>;
prefixMatches(): AsyncGenerator<RoaringBitmap>;
keys(): Uint8Array;
keysExcludeSuffixOnly(): Uint8Array;
children(): [number, Promise<Trie>][];
childrenExcludeSuffixOnly(): [number, Promise<Trie>][];
child(id: number): Promise<Trie>?;
}
@ -41,6 +30,8 @@ declare namespace stringdex {
interface DataColumn {
isEmpty(id: number): boolean;
at(id: number): Promise<Uint8Array|undefined>;
search(name: Uint8Array|string): Promise<Trie?>;
searchLev(name: Uint8Array|string): AsyncGenerator<Trie>;
length: number,
}
/**

File diff suppressed because it is too large Load diff