diff --git a/src/tools/coverage-dump/src/covfun.rs b/src/tools/coverage-dump/src/covfun.rs index 82ebd33d0d1c..458fd680429a 100644 --- a/src/tools/coverage-dump/src/covfun.rs +++ b/src/tools/coverage-dump/src/covfun.rs @@ -5,7 +5,8 @@ use std::sync::OnceLock; use anyhow::{Context, anyhow}; use regex::Regex; -use crate::parser::{Parser, unescape_llvm_string_contents}; +use crate::llvm_utils::unescape_llvm_string_contents; +use crate::parser::Parser; pub(crate) fn dump_covfun_mappings( llvm_ir: &str, diff --git a/src/tools/coverage-dump/src/llvm_utils.rs b/src/tools/coverage-dump/src/llvm_utils.rs new file mode 100644 index 000000000000..017fdbec0fc2 --- /dev/null +++ b/src/tools/coverage-dump/src/llvm_utils.rs @@ -0,0 +1,46 @@ +use std::sync::OnceLock; + +use regex::bytes; + +#[cfg(test)] +mod tests; + +/// Given the raw contents of a string literal in LLVM IR assembly, decodes any +/// backslash escapes and returns a vector containing the resulting byte string. +pub(crate) fn unescape_llvm_string_contents(contents: &str) -> Vec<u8> { + let escape_re = { + static RE: OnceLock<bytes::Regex> = OnceLock::new(); + // LLVM IR supports two string escapes: `\\` and `\xx`. + RE.get_or_init(|| bytes::Regex::new(r"\\\\|\\([0-9A-Za-z]{2})").unwrap()) + }; + + fn u8_from_hex_digits(digits: &[u8]) -> u8 { + // We know that the input contains exactly 2 hex digits, so these calls + // should never fail. + assert_eq!(digits.len(), 2); + let digits = std::str::from_utf8(digits).unwrap(); + u8::from_str_radix(digits, 16).unwrap() + } + + escape_re + .replace_all(contents.as_bytes(), |captures: &bytes::Captures<'_>| { + let byte = match captures.get(1) { + None => b'\\', + Some(hex_digits) => u8_from_hex_digits(hex_digits.as_bytes()), + }; + [byte] + }) + .into_owned() +} + +/// LLVM's profiler/coverage metadata often uses an MD5 hash truncated to +/// 64 bits as a way to associate data stored in different tables/sections. 
+pub(crate) fn truncated_md5(bytes: &[u8]) -> u64 { + use md5::{Digest, Md5}; + let mut hasher = Md5::new(); + hasher.update(bytes); + let hash: [u8; 8] = hasher.finalize().as_slice()[..8].try_into().unwrap(); + // The truncated hash is explicitly little-endian, regardless of host + // or target platform. (See `MD5Result::low` in LLVM's `MD5.h`.) + u64::from_le_bytes(hash) +} diff --git a/src/tools/coverage-dump/src/parser/tests.rs b/src/tools/coverage-dump/src/llvm_utils/tests.rs similarity index 100% rename from src/tools/coverage-dump/src/parser/tests.rs rename to src/tools/coverage-dump/src/llvm_utils/tests.rs diff --git a/src/tools/coverage-dump/src/main.rs b/src/tools/coverage-dump/src/main.rs index b21e3e292f2b..6408b97a06fd 100644 --- a/src/tools/coverage-dump/src/main.rs +++ b/src/tools/coverage-dump/src/main.rs @@ -1,4 +1,5 @@ mod covfun; +mod llvm_utils; mod parser; mod prf_names; diff --git a/src/tools/coverage-dump/src/parser.rs b/src/tools/coverage-dump/src/parser.rs index 0bd4abdae3ef..f26a57b43b33 100644 --- a/src/tools/coverage-dump/src/parser.rs +++ b/src/tools/coverage-dump/src/parser.rs @@ -1,38 +1,4 @@ -#[cfg(test)] -mod tests; - -use std::sync::OnceLock; - use anyhow::ensure; -use regex::bytes; - -/// Given the raw contents of a string literal in LLVM IR assembly, decodes any -/// backslash escapes and returns a vector containing the resulting byte string. -pub(crate) fn unescape_llvm_string_contents(contents: &str) -> Vec<u8> { - let escape_re = { - static RE: OnceLock<bytes::Regex> = OnceLock::new(); - // LLVM IR supports two string escapes: `\\` and `\xx`. - RE.get_or_init(|| bytes::Regex::new(r"\\\\|\\([0-9A-Za-z]{2})").unwrap()) - }; - - fn u8_from_hex_digits(digits: &[u8]) -> u8 { - // We know that the input contains exactly 2 hex digits, so these calls - // should never fail. 
- assert_eq!(digits.len(), 2); - let digits = std::str::from_utf8(digits).unwrap(); - u8::from_str_radix(digits, 16).unwrap() - } - - escape_re - .replace_all(contents.as_bytes(), |captures: &bytes::Captures<'_>| { - let byte = match captures.get(1) { - None => b'\\', - Some(hex_digits) => u8_from_hex_digits(hex_digits.as_bytes()), - }; - [byte] - }) - .into_owned() -} pub(crate) struct Parser<'a> { rest: &'a [u8], diff --git a/src/tools/coverage-dump/src/prf_names.rs b/src/tools/coverage-dump/src/prf_names.rs index 96d097c79a31..fe193efd8e5f 100644 --- a/src/tools/coverage-dump/src/prf_names.rs +++ b/src/tools/coverage-dump/src/prf_names.rs @@ -4,7 +4,8 @@ use std::sync::OnceLock; use anyhow::{anyhow, ensure}; use regex::Regex; -use crate::parser::{Parser, unescape_llvm_string_contents}; +use crate::llvm_utils::{truncated_md5, unescape_llvm_string_contents}; +use crate::parser::Parser; /// Scans through the contents of an LLVM IR assembly file to find `__llvm_prf_names` /// entries, decodes them, and creates a table that maps name hash values to @@ -25,18 +26,6 @@ pub(crate) fn make_function_names_table(llvm_ir: &str) -> anyhow::Result u64 { - use md5::{Digest, Md5}; - let mut hasher = Md5::new(); - hasher.update(bytes); - let hash: [u8; 8] = hasher.finalize().as_slice()[..8].try_into().unwrap(); - // The truncated hash is explicitly little-endian, regardless of host - // or target platform. (See `MD5Result::low` in LLVM's `MD5.h`.) - u64::from_le_bytes(hash) - } - fn demangle_if_able(symbol_name_bytes: &[u8]) -> anyhow::Result { // In practice, raw symbol names should always be ASCII. let symbol_name_str = std::str::from_utf8(symbol_name_bytes)?;