coverage-dump: Extract some common code to an llvm_utils submodule
This commit is contained in:
parent
a3d5562fcf
commit
89319f2e12
6 changed files with 51 additions and 48 deletions
|
|
@ -5,7 +5,8 @@ use std::sync::OnceLock;
|
|||
use anyhow::{Context, anyhow};
|
||||
use regex::Regex;
|
||||
|
||||
use crate::parser::{Parser, unescape_llvm_string_contents};
|
||||
use crate::llvm_utils::unescape_llvm_string_contents;
|
||||
use crate::parser::Parser;
|
||||
|
||||
pub(crate) fn dump_covfun_mappings(
|
||||
llvm_ir: &str,
|
||||
|
|
|
|||
46
src/tools/coverage-dump/src/llvm_utils.rs
Normal file
46
src/tools/coverage-dump/src/llvm_utils.rs
Normal file
|
|
@ -0,0 +1,46 @@
|
|||
use std::sync::OnceLock;
|
||||
|
||||
use regex::bytes;
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests;
|
||||
|
||||
/// Given the raw contents of a string literal in LLVM IR assembly, decodes any
|
||||
/// backslash escapes and returns a vector containing the resulting byte string.
|
||||
pub(crate) fn unescape_llvm_string_contents(contents: &str) -> Vec<u8> {
|
||||
let escape_re = {
|
||||
static RE: OnceLock<bytes::Regex> = OnceLock::new();
|
||||
// LLVM IR supports two string escapes: `\\` and `\xx`.
|
||||
RE.get_or_init(|| bytes::Regex::new(r"\\\\|\\([0-9A-Za-z]{2})").unwrap())
|
||||
};
|
||||
|
||||
fn u8_from_hex_digits(digits: &[u8]) -> u8 {
|
||||
// We know that the input contains exactly 2 hex digits, so these calls
|
||||
// should never fail.
|
||||
assert_eq!(digits.len(), 2);
|
||||
let digits = std::str::from_utf8(digits).unwrap();
|
||||
u8::from_str_radix(digits, 16).unwrap()
|
||||
}
|
||||
|
||||
escape_re
|
||||
.replace_all(contents.as_bytes(), |captures: &bytes::Captures<'_>| {
|
||||
let byte = match captures.get(1) {
|
||||
None => b'\\',
|
||||
Some(hex_digits) => u8_from_hex_digits(hex_digits.as_bytes()),
|
||||
};
|
||||
[byte]
|
||||
})
|
||||
.into_owned()
|
||||
}
|
||||
|
||||
/// LLVM's profiler/coverage metadata often uses an MD5 hash truncated to
|
||||
/// 64 bits as a way to associate data stored in different tables/sections.
|
||||
pub(crate) fn truncated_md5(bytes: &[u8]) -> u64 {
|
||||
use md5::{Digest, Md5};
|
||||
let mut hasher = Md5::new();
|
||||
hasher.update(bytes);
|
||||
let hash: [u8; 8] = hasher.finalize().as_slice()[..8].try_into().unwrap();
|
||||
// The truncated hash is explicitly little-endian, regardless of host
|
||||
// or target platform. (See `MD5Result::low` in LLVM's `MD5.h`.)
|
||||
u64::from_le_bytes(hash)
|
||||
}
|
||||
|
|
@ -1,4 +1,5 @@
|
|||
mod covfun;
|
||||
mod llvm_utils;
|
||||
mod parser;
|
||||
mod prf_names;
|
||||
|
||||
|
|
|
|||
|
|
@ -1,38 +1,4 @@
|
|||
#[cfg(test)]
|
||||
mod tests;
|
||||
|
||||
use std::sync::OnceLock;
|
||||
|
||||
use anyhow::ensure;
|
||||
use regex::bytes;
|
||||
|
||||
/// Given the raw contents of a string literal in LLVM IR assembly, decodes any
|
||||
/// backslash escapes and returns a vector containing the resulting byte string.
|
||||
pub(crate) fn unescape_llvm_string_contents(contents: &str) -> Vec<u8> {
|
||||
let escape_re = {
|
||||
static RE: OnceLock<bytes::Regex> = OnceLock::new();
|
||||
// LLVM IR supports two string escapes: `\\` and `\xx`.
|
||||
RE.get_or_init(|| bytes::Regex::new(r"\\\\|\\([0-9A-Za-z]{2})").unwrap())
|
||||
};
|
||||
|
||||
fn u8_from_hex_digits(digits: &[u8]) -> u8 {
|
||||
// We know that the input contains exactly 2 hex digits, so these calls
|
||||
// should never fail.
|
||||
assert_eq!(digits.len(), 2);
|
||||
let digits = std::str::from_utf8(digits).unwrap();
|
||||
u8::from_str_radix(digits, 16).unwrap()
|
||||
}
|
||||
|
||||
escape_re
|
||||
.replace_all(contents.as_bytes(), |captures: &bytes::Captures<'_>| {
|
||||
let byte = match captures.get(1) {
|
||||
None => b'\\',
|
||||
Some(hex_digits) => u8_from_hex_digits(hex_digits.as_bytes()),
|
||||
};
|
||||
[byte]
|
||||
})
|
||||
.into_owned()
|
||||
}
|
||||
|
||||
pub(crate) struct Parser<'a> {
|
||||
rest: &'a [u8],
|
||||
|
|
|
|||
|
|
@ -4,7 +4,8 @@ use std::sync::OnceLock;
|
|||
use anyhow::{anyhow, ensure};
|
||||
use regex::Regex;
|
||||
|
||||
use crate::parser::{Parser, unescape_llvm_string_contents};
|
||||
use crate::llvm_utils::{truncated_md5, unescape_llvm_string_contents};
|
||||
use crate::parser::Parser;
|
||||
|
||||
/// Scans through the contents of an LLVM IR assembly file to find `__llvm_prf_names`
|
||||
/// entries, decodes them, and creates a table that maps name hash values to
|
||||
|
|
@ -25,18 +26,6 @@ pub(crate) fn make_function_names_table(llvm_ir: &str) -> anyhow::Result<HashMap
|
|||
Some(payload)
|
||||
}
|
||||
|
||||
/// LLVM's profiler/coverage metadata often uses an MD5 hash truncated to
|
||||
/// 64 bits as a way to associate data stored in different tables/sections.
|
||||
fn truncated_md5(bytes: &[u8]) -> u64 {
|
||||
use md5::{Digest, Md5};
|
||||
let mut hasher = Md5::new();
|
||||
hasher.update(bytes);
|
||||
let hash: [u8; 8] = hasher.finalize().as_slice()[..8].try_into().unwrap();
|
||||
// The truncated hash is explicitly little-endian, regardless of host
|
||||
// or target platform. (See `MD5Result::low` in LLVM's `MD5.h`.)
|
||||
u64::from_le_bytes(hash)
|
||||
}
|
||||
|
||||
fn demangle_if_able(symbol_name_bytes: &[u8]) -> anyhow::Result<String> {
|
||||
// In practice, raw symbol names should always be ASCII.
|
||||
let symbol_name_str = std::str::from_utf8(symbol_name_bytes)?;
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue