rustdoc-search: shard the search result descriptions
The descriptions are, on almost all crates[^1], the majority of the size of the search index, even though they aren't really used for searching. This makes it relatively easy to separate them into their own files. This commit also bumps us to ES8. Out of the browsers we support, all of them support async functions according to caniuse. https://caniuse.com/async-functions [^1]: <https://microsoft.github.io/windows-docs-rs/>, a crate with 44MiB of pure names and no descriptions for them, is an outlier and should not be counted.
This commit is contained in:
parent
351890d682
commit
5b44bfda7f
11 changed files with 427 additions and 228 deletions
|
|
@ -184,40 +184,15 @@ pub(crate) enum RenderTypeId {
|
|||
|
||||
impl RenderTypeId {
|
||||
pub fn write_to_string(&self, string: &mut String) {
|
||||
// (sign, value)
|
||||
let (sign, id): (bool, u32) = match &self {
|
||||
let id: i32 = match &self {
|
||||
// 0 is a sentinel, everything else is one-indexed
|
||||
// concrete type
|
||||
RenderTypeId::Index(idx) if *idx >= 0 => (false, (idx + 1isize).try_into().unwrap()),
|
||||
RenderTypeId::Index(idx) if *idx >= 0 => (idx + 1isize).try_into().unwrap(),
|
||||
// generic type parameter
|
||||
RenderTypeId::Index(idx) => (true, (-*idx).try_into().unwrap()),
|
||||
RenderTypeId::Index(idx) => (*idx).try_into().unwrap(),
|
||||
_ => panic!("must convert render types to indexes before serializing"),
|
||||
};
|
||||
// zig-zag encoding
|
||||
let value: u32 = (id << 1) | (if sign { 1 } else { 0 });
|
||||
// Self-terminating hex use capital letters for everything but the
|
||||
// least significant digit, which is lowercase. For example, decimal 17
|
||||
// would be `` Aa `` if zig-zag encoding weren't used.
|
||||
//
|
||||
// Zig-zag encoding, however, stores the sign bit as the last bit.
|
||||
// This means, in the last hexit, 1 is actually `c`, -1 is `b`
|
||||
// (`a` is the imaginary -0), and, because all the bits are shifted
|
||||
// by one, `` A` `` is actually 8 and `` Aa `` is -8.
|
||||
//
|
||||
// https://rust-lang.github.io/rustc-dev-guide/rustdoc-internals/search.html
|
||||
// describes the encoding in more detail.
|
||||
let mut shift: u32 = 28;
|
||||
let mut mask: u32 = 0xF0_00_00_00;
|
||||
while shift < 32 {
|
||||
let hexit = (value & mask) >> shift;
|
||||
if hexit != 0 || shift == 0 {
|
||||
let hex =
|
||||
char::try_from(if shift == 0 { '`' } else { '@' } as u32 + hexit).unwrap();
|
||||
string.push(hex);
|
||||
}
|
||||
shift = shift.wrapping_sub(4);
|
||||
mask = mask >> 4;
|
||||
}
|
||||
search_index::write_vlqhex_to_string(id, string);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -17,12 +17,25 @@ use crate::html::format::join_with_double_colon;
|
|||
use crate::html::markdown::short_markdown_summary;
|
||||
use crate::html::render::{self, IndexItem, IndexItemFunctionType, RenderType, RenderTypeId};
|
||||
|
||||
/// The serialized search description sharded version
|
||||
///
|
||||
/// The `index` is a JSON-encoded list of names and other information.
|
||||
///
|
||||
/// The desc has newlined descriptions, split up by size into 1MiB shards.
|
||||
/// For example, `(4, "foo\nbar\nbaz\nquux")`.
|
||||
pub(crate) struct SerializedSearchIndex {
|
||||
pub(crate) index: String,
|
||||
pub(crate) desc: Vec<(usize, String)>,
|
||||
}
|
||||
|
||||
const DESC_INDEX_SHARD_LEN: usize = 1024 * 1024;
|
||||
|
||||
/// Builds the search index from the collected metadata
|
||||
pub(crate) fn build_index<'tcx>(
|
||||
krate: &clean::Crate,
|
||||
cache: &mut Cache,
|
||||
tcx: TyCtxt<'tcx>,
|
||||
) -> String {
|
||||
) -> SerializedSearchIndex {
|
||||
let mut itemid_to_pathid = FxHashMap::default();
|
||||
let mut primitives = FxHashMap::default();
|
||||
let mut associated_types = FxHashMap::default();
|
||||
|
|
@ -318,7 +331,6 @@ pub(crate) fn build_index<'tcx>(
|
|||
.collect::<Vec<_>>();
|
||||
|
||||
struct CrateData<'a> {
|
||||
doc: String,
|
||||
items: Vec<&'a IndexItem>,
|
||||
paths: Vec<(ItemType, Vec<Symbol>)>,
|
||||
// The String is alias name and the vec is the list of the elements with this alias.
|
||||
|
|
@ -327,6 +339,9 @@ pub(crate) fn build_index<'tcx>(
|
|||
aliases: &'a BTreeMap<String, Vec<usize>>,
|
||||
// Used when a type has more than one impl with an associated item with the same name.
|
||||
associated_item_disambiguators: &'a Vec<(usize, String)>,
|
||||
// A list of shard lengths encoded as vlqhex. See the comment in write_vlqhex_to_string
|
||||
// for information on the format.
|
||||
descindex: String,
|
||||
}
|
||||
|
||||
struct Paths {
|
||||
|
|
@ -408,7 +423,6 @@ pub(crate) fn build_index<'tcx>(
|
|||
let mut names = Vec::with_capacity(self.items.len());
|
||||
let mut types = String::with_capacity(self.items.len());
|
||||
let mut full_paths = Vec::with_capacity(self.items.len());
|
||||
let mut descriptions = Vec::with_capacity(self.items.len());
|
||||
let mut parents = Vec::with_capacity(self.items.len());
|
||||
let mut functions = String::with_capacity(self.items.len());
|
||||
let mut deprecated = Vec::with_capacity(self.items.len());
|
||||
|
|
@ -431,7 +445,6 @@ pub(crate) fn build_index<'tcx>(
|
|||
parents.push(item.parent_idx.map(|x| x + 1).unwrap_or(0));
|
||||
|
||||
names.push(item.name.as_str());
|
||||
descriptions.push(&item.desc);
|
||||
|
||||
if !item.path.is_empty() {
|
||||
full_paths.push((index, &item.path));
|
||||
|
|
@ -454,14 +467,12 @@ pub(crate) fn build_index<'tcx>(
|
|||
let has_aliases = !self.aliases.is_empty();
|
||||
let mut crate_data =
|
||||
serializer.serialize_struct("CrateData", if has_aliases { 9 } else { 8 })?;
|
||||
crate_data.serialize_field("doc", &self.doc)?;
|
||||
crate_data.serialize_field("t", &types)?;
|
||||
crate_data.serialize_field("n", &names)?;
|
||||
// Serialize as an array of item indices and full paths
|
||||
crate_data.serialize_field("q", &full_paths)?;
|
||||
crate_data.serialize_field("d", &descriptions)?;
|
||||
crate_data.serialize_field("i", &parents)?;
|
||||
crate_data.serialize_field("f", &functions)?;
|
||||
crate_data.serialize_field("D", &self.descindex)?;
|
||||
crate_data.serialize_field("c", &deprecated)?;
|
||||
crate_data.serialize_field("p", &paths)?;
|
||||
crate_data.serialize_field("b", &self.associated_item_disambiguators)?;
|
||||
|
|
@ -472,16 +483,46 @@ pub(crate) fn build_index<'tcx>(
|
|||
}
|
||||
}
|
||||
|
||||
// Collect the index into a string
|
||||
format!(
|
||||
let desc = {
|
||||
let mut result = Vec::new();
|
||||
let mut set = String::new();
|
||||
let mut len: usize = 0;
|
||||
for desc in std::iter::once(&crate_doc).chain(crate_items.iter().map(|item| &item.desc)) {
|
||||
if set.len() >= DESC_INDEX_SHARD_LEN {
|
||||
result.push((len, std::mem::replace(&mut set, String::new())));
|
||||
len = 0;
|
||||
} else if len != 0 {
|
||||
set.push('\n');
|
||||
}
|
||||
set.push_str(&desc);
|
||||
len += 1;
|
||||
}
|
||||
result.push((len, std::mem::replace(&mut set, String::new())));
|
||||
result
|
||||
};
|
||||
|
||||
let descindex = {
|
||||
let mut descindex = String::with_capacity(desc.len() * 4);
|
||||
for &(len, _) in desc.iter() {
|
||||
write_vlqhex_to_string(len.try_into().unwrap(), &mut descindex);
|
||||
}
|
||||
descindex
|
||||
};
|
||||
|
||||
assert_eq!(crate_items.len() + 1, desc.iter().map(|(len, _)| *len).sum::<usize>());
|
||||
|
||||
// The index, which is actually used to search, is JSON
|
||||
// It uses `JSON.parse(..)` to actually load, since JSON
|
||||
// parses faster than the full JavaScript syntax.
|
||||
let index = format!(
|
||||
r#"["{}",{}]"#,
|
||||
krate.name(tcx),
|
||||
serde_json::to_string(&CrateData {
|
||||
doc: crate_doc,
|
||||
items: crate_items,
|
||||
paths: crate_paths,
|
||||
aliases: &aliases,
|
||||
associated_item_disambiguators: &associated_item_disambiguators,
|
||||
descindex,
|
||||
})
|
||||
.expect("failed serde conversion")
|
||||
// All these `replace` calls are because we have to go through JS string for JSON content.
|
||||
|
|
@ -489,7 +530,45 @@ pub(crate) fn build_index<'tcx>(
|
|||
.replace('\'', r"\'")
|
||||
// We need to escape double quotes for the JSON.
|
||||
.replace("\\\"", "\\\\\"")
|
||||
)
|
||||
);
|
||||
SerializedSearchIndex { index, desc }
|
||||
}
|
||||
|
||||
pub(crate) fn write_vlqhex_to_string(n: i32, string: &mut String) {
|
||||
let (sign, magnitude): (bool, u32) =
|
||||
if n >= 0 { (false, n.try_into().unwrap()) } else { (true, (-n).try_into().unwrap()) };
|
||||
// zig-zag encoding
|
||||
let value: u32 = (magnitude << 1) | (if sign { 1 } else { 0 });
|
||||
// Self-terminating hex use capital letters for everything but the
|
||||
// least significant digit, which is lowercase. For example, decimal 17
|
||||
// would be `` Aa `` if zig-zag encoding weren't used.
|
||||
//
|
||||
// Zig-zag encoding, however, stores the sign bit as the last bit.
|
||||
// This means, in the last hexit, 1 is actually `c`, -1 is `b`
|
||||
// (`a` is the imaginary -0), and, because all the bits are shifted
|
||||
// by one, `` A` `` is actually 8 and `` Aa `` is -8.
|
||||
//
|
||||
// https://rust-lang.github.io/rustc-dev-guide/rustdoc-internals/search.html
|
||||
// describes the encoding in more detail.
|
||||
let mut shift: u32 = 28;
|
||||
let mut mask: u32 = 0xF0_00_00_00;
|
||||
// first skip leading zeroes
|
||||
while shift < 32 {
|
||||
let hexit = (value & mask) >> shift;
|
||||
if hexit != 0 || shift == 0 {
|
||||
break;
|
||||
}
|
||||
shift = shift.wrapping_sub(4);
|
||||
mask = mask >> 4;
|
||||
}
|
||||
// now write the rest
|
||||
while shift < 32 {
|
||||
let hexit = (value & mask) >> shift;
|
||||
let hex = char::try_from(if shift == 0 { '`' } else { '@' } as u32 + hexit).unwrap();
|
||||
string.push(hex);
|
||||
shift = shift.wrapping_sub(4);
|
||||
mask = mask >> 4;
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn get_function_type_for_search<'tcx>(
|
||||
|
|
|
|||
|
|
@ -24,6 +24,7 @@ use crate::formats::cache::Cache;
|
|||
use crate::formats::item_type::ItemType;
|
||||
use crate::formats::Impl;
|
||||
use crate::html::format::Buffer;
|
||||
use crate::html::render::search_index::SerializedSearchIndex;
|
||||
use crate::html::render::{AssocItemLink, ImplRenderingParameters};
|
||||
use crate::html::{layout, static_files};
|
||||
use crate::visit::DocVisitor;
|
||||
|
|
@ -46,7 +47,7 @@ use crate::{try_err, try_none};
|
|||
pub(super) fn write_shared(
|
||||
cx: &mut Context<'_>,
|
||||
krate: &Crate,
|
||||
search_index: String,
|
||||
search_index: SerializedSearchIndex,
|
||||
options: &RenderOptions,
|
||||
) -> Result<(), Error> {
|
||||
// Write out the shared files. Note that these are shared among all rustdoc
|
||||
|
|
@ -312,7 +313,7 @@ pub(super) fn write_shared(
|
|||
let dst = cx.dst.join(&format!("search-index{}.js", cx.shared.resource_suffix));
|
||||
let (mut all_indexes, mut krates) =
|
||||
try_err!(collect_json(&dst, krate.name(cx.tcx()).as_str()), &dst);
|
||||
all_indexes.push(search_index);
|
||||
all_indexes.push(search_index.index);
|
||||
krates.push(krate.name(cx.tcx()).to_string());
|
||||
krates.sort();
|
||||
|
||||
|
|
@ -335,6 +336,32 @@ else if (window.initSearch) window.initSearch(searchIndex);
|
|||
Ok(v.into_bytes())
|
||||
})?;
|
||||
|
||||
let search_desc_dir = cx.dst.join(format!("search.desc/{krate}", krate = krate.name(cx.tcx())));
|
||||
if Path::new(&search_desc_dir).exists() {
|
||||
try_err!(std::fs::remove_dir_all(&search_desc_dir), &search_desc_dir);
|
||||
}
|
||||
try_err!(std::fs::create_dir_all(&search_desc_dir), &search_desc_dir);
|
||||
let kratename = krate.name(cx.tcx()).to_string();
|
||||
for (i, (_, data)) in search_index.desc.into_iter().enumerate() {
|
||||
let output_filename = static_files::suffix_path(
|
||||
&format!("{kratename}-desc-{i}-.js"),
|
||||
&cx.shared.resource_suffix,
|
||||
);
|
||||
let path = search_desc_dir.join(output_filename);
|
||||
try_err!(
|
||||
std::fs::write(
|
||||
&path,
|
||||
&format!(
|
||||
r##"searchState.loadedDescShard({kratename}, {i}, {data})"##,
|
||||
kratename = serde_json::to_string(&kratename).unwrap(),
|
||||
data = serde_json::to_string(&data).unwrap(),
|
||||
)
|
||||
.into_bytes()
|
||||
),
|
||||
&path
|
||||
);
|
||||
}
|
||||
|
||||
write_invocation_specific("crates.js", &|| {
|
||||
let krates = krates.iter().map(|k| format!("\"{k}\"")).join(",");
|
||||
Ok(format!("window.ALL_CRATES = [{krates}];").into_bytes())
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue