Use load intrinsic and loop for intrinsic-test programs. Add --release flag back to intrinsic-test programs.

This commit is contained in:
James McGregor 2022-06-21 18:42:39 +01:00 committed by Amanieu d'Antras
parent e79701c56e
commit 893bbdd717
6 changed files with 184 additions and 109 deletions

View file

@ -67,20 +67,6 @@ vrnd64xq_f64
vrnd64z_f64
vrnd64zq_f64
# Takes too long to compile tests
vcopyq_laneq_u8
vcopyq_laneq_s8
vcopyq_laneq_p8
vcopyq_lane_u8
vcopyq_lane_s8
vcopyq_lane_p8
vcopy_laneq_u8
vcopy_laneq_s8
vcopy_laneq_p8
vcopy_lane_u8
vcopy_lane_s8
vcopy_lane_p8
# QEMU 6.0 doesn't support these instructions
vmmlaq_s32
vmmlaq_u32

View file

@ -1,6 +1,6 @@
use std::ops::Range;
use crate::types::IntrinsicType;
use crate::types::{IntrinsicType, TypeKind};
use crate::Language;
/// An argument for the intrinsic.
@ -90,49 +90,108 @@ impl ArgumentList {
.join(", ")
}
/// Creates a line that initializes this argument for C code.
/// e.g. `int32x2_t a = { 0x1, 0x2 };`
pub fn init_random_values_c(&self, pass: usize) -> String {
/// Creates a line for each argument that initializes an array for C from which `loads` argument
/// values can be loaded as a sliding window.
/// e.g `const int32x2_t a_vals = {0x3effffff, 0x3effffff, 0x3f7fffff}`, if loads=2.
pub fn gen_arglists_c(&self, loads: u32) -> String {
self.iter()
.filter_map(|arg| {
(!arg.has_constraint()).then(|| {
format!(
"{ty} {name} = {{ {values} }};",
ty = arg.to_c_type(),
"const {ty} {name}_vals[] = {{ {values} }};",
ty = arg.ty.c_scalar_type(),
name = arg.name,
values = arg.ty.populate_random(pass, &Language::C)
values = arg.ty.populate_random(loads, &Language::C)
)
})
})
.collect::<Vec<_>>()
.join("\n ")
.join("\n")
}
/// Creates a line that initializes this argument for Rust code.
/// e.g. `let a = transmute([0x1, 0x2]);`
pub fn init_random_values_rust(&self, pass: usize) -> String {
/// Creates a line for each argument that initializes an array for Rust from which `loads` argument
/// values can be loaded as a sliding window, e.g `const A_VALS: [u32; 20] = [...];`
pub fn gen_arglists_rust(&self, loads: u32) -> String {
self.iter()
.filter_map(|arg| {
(!arg.has_constraint()).then(|| {
if arg.is_simd() {
format!(
"let {name} = ::std::mem::transmute([{values}]);",
name = arg.name,
values = arg.ty.populate_random(pass, &Language::Rust),
)
} else {
format!(
"let {name} = {value};",
name = arg.name,
value = arg.ty.populate_random(pass, &Language::Rust)
)
}
format!(
"const {upper_name}_VALS: [{ty}; {load_size}] = unsafe{{ [{values}] }};",
upper_name = arg.name.to_uppercase(),
ty = arg.ty.rust_scalar_type(),
load_size = arg.ty.num_lanes() * arg.ty.num_vectors() + loads - 1,
values = arg.ty.populate_random(loads, &Language::Rust)
)
})
})
.collect::<Vec<_>>()
.join("\n")
}
/// Creates a line for each argument that initalizes the argument from an array [arg]_vals at
/// an offset i using a load intrinsic, in C.
/// e.g `uint8x8_t a = vld1_u8(&a_vals[i]);`
pub fn load_values_c(&self, p64_armv7_workaround: bool) -> String {
self.iter()
.filter_map(|arg| {
// The ACLE doesn't support 64-bit polynomial loads on Armv7
// This and the cast are a workaround for this
let armv7_p64 = if let TypeKind::Poly = arg.ty.kind() {
p64_armv7_workaround
} else {
false
};
(!arg.has_constraint()).then(|| {
format!(
"{ty} {name} = {open_cast}{load}(&{name}_vals[i]){close_cast};",
ty = arg.to_c_type(),
name = arg.name,
load = if arg.is_simd() {
arg.ty.get_load_function(p64_armv7_workaround)
} else {
"*".to_string()
},
open_cast = if armv7_p64 {
format!("cast<{}>(", arg.to_c_type())
} else {
"".to_string()
},
close_cast = if armv7_p64 {
")".to_string()
} else {
"".to_string()
}
)
})
})
.collect::<Vec<_>>()
.join("\n ")
}
/// Creates a line for each argument that initalizes the argument from array [ARG]_VALS at
/// an offset i using a load intrinsic, in Rust.
/// e.g `let a = vld1_u8(A_VALS.as_ptr().offset(i));`
pub fn load_values_rust(&self) -> String {
self.iter()
.filter_map(|arg| {
(!arg.has_constraint()).then(|| {
format!(
"let {name} = {load}({upper_name}_VALS.as_ptr().offset(i));",
name = arg.name,
upper_name = arg.name.to_uppercase(),
load = if arg.is_simd() {
arg.ty.get_load_function(false)
} else {
"*".to_string()
},
)
})
})
.collect::<Vec<_>>()
.join("\n ")
}
pub fn iter(&self) -> std::slice::Iter<'_, Argument> {
self.args.iter()
}

View file

@ -20,8 +20,9 @@ pub struct Intrinsic {
impl Intrinsic {
/// Generates a std::cout for the intrinsics results that will match the
/// rust debug output format for the return type.
pub fn print_result_c(&self, index: usize, additional: &str) -> String {
/// rust debug output format for the return type. The generated line assumes
/// there is an int i in scope which is the current pass number.
pub fn print_result_c(&self, additional: &str) -> String {
let lanes = if self.results.num_vectors() > 1 {
(0..self.results.num_vectors())
.map(|vector| {
@ -72,7 +73,7 @@ impl Intrinsic {
};
format!(
r#"std::cout << "Result {additional}-{idx}: {ty}" << std::fixed << std::setprecision(150) << {lanes} << "{close}" << std::endl;"#,
r#"std::cout << "Result {additional}-" << i+1 << ": {ty}" << std::fixed << std::setprecision(150) << {lanes} << "{close}" << std::endl;"#,
ty = if self.results.is_simd() {
format!("{}(", self.results.c_type())
} else {
@ -81,11 +82,31 @@ impl Intrinsic {
close = if self.results.is_simd() { ")" } else { "" },
lanes = lanes,
additional = additional,
idx = index,
)
}
pub fn generate_pass_rust(&self, index: usize, additional: &str) -> String {
pub fn generate_loop_c(
&self,
additional: &str,
passes: u32,
p64_armv7_workaround: bool,
) -> String {
format!(
r#" {{
for (int i=0; i<{passes}; i++) {{
{loaded_args}
auto __return_value = {intrinsic_call}({args});
{print_result}
}}
}}"#,
loaded_args = self.arguments.load_values_c(p64_armv7_workaround),
intrinsic_call = self.name,
args = self.arguments.as_call_param_c(),
print_result = self.print_result_c(additional)
)
}
pub fn generate_loop_rust(&self, additional: &str, passes: u32) -> String {
let constraints = self.arguments.as_constraint_parameters_rust();
let constraints = if !constraints.is_empty() {
format!("::<{}>", constraints)
@ -94,32 +115,20 @@ impl Intrinsic {
};
format!(
r#"
unsafe {{
{initialized_args}
let res = {intrinsic_call}{const}({args});
println!("Result {additional}-{idx}: {{:.150?}}", res);
}}"#,
initialized_args = self.arguments.init_random_values_rust(index),
r#" {{
for i in 0..{passes} {{
unsafe {{
{loaded_args}
let __return_value = {intrinsic_call}{const}({args});
println!("Result {additional}-{{}}: {{:.150?}}", i+1, __return_value);
}}
}}
}}"#,
loaded_args = self.arguments.load_values_rust(),
intrinsic_call = self.name,
const = constraints,
args = self.arguments.as_call_param_rust(),
additional = additional,
idx = index,
const = constraints,
)
}
pub fn generate_pass_c(&self, index: usize, additional: &str) -> String {
format!(
r#" {{
{initialized_args}
auto __return_value = {intrinsic_call}({args});
{print_result}
}}"#,
initialized_args = self.arguments.init_random_values_c(index),
intrinsic_call = self.name,
args = self.arguments.as_call_param_c(),
print_result = self.print_result_c(index, additional)
)
}
}

View file

@ -23,13 +23,21 @@ mod intrinsic;
mod types;
mod values;
// The number of times each intrinsic will be called.
const PASSES: u32 = 20;
#[derive(Debug, PartialEq)]
pub enum Language {
Rust,
C,
}
fn gen_code_c(intrinsic: &Intrinsic, constraints: &[&Argument], name: String) -> String {
fn gen_code_c(
intrinsic: &Intrinsic,
constraints: &[&Argument],
name: String,
p64_armv7_workaround: bool,
) -> String {
if let Some((current, constraints)) = constraints.split_last() {
let range = current
.constraints
@ -47,19 +55,25 @@ fn gen_code_c(intrinsic: &Intrinsic, constraints: &[&Argument], name: String) ->
name = current.name,
ty = current.ty.c_type(),
val = i,
pass = gen_code_c(intrinsic, constraints, format!("{}-{}", name, i))
pass = gen_code_c(
intrinsic,
constraints,
format!("{}-{}", name, i),
p64_armv7_workaround
)
)
})
.collect()
} else {
(1..20)
.map(|idx| intrinsic.generate_pass_c(idx, &name))
.collect::<Vec<_>>()
.join("\n")
intrinsic.generate_loop_c(&name, PASSES, p64_armv7_workaround)
}
}
fn generate_c_program(header_files: &[&str], intrinsic: &Intrinsic) -> String {
fn generate_c_program(
header_files: &[&str],
intrinsic: &Intrinsic,
p64_armv7_workaround: bool,
) -> String {
let constraints = intrinsic
.arguments
.iter()
@ -75,7 +89,7 @@ fn generate_c_program(header_files: &[&str], intrinsic: &Intrinsic) -> String {
template<typename T1, typename T2> T1 cast(T2 x) {{
static_assert(sizeof(T1) == sizeof(T2), "sizeof T1 and T2 must be the same");
T1 ret = 0;
T1 ret{{}};
memcpy(&ret, &x, sizeof(T1));
return ret;
}}
@ -95,6 +109,8 @@ std::ostream& operator<<(std::ostream& os, poly128_t value) {{
}}
#endif
{arglists}
int main(int argc, char **argv) {{
{passes}
return 0;
@ -104,7 +120,13 @@ int main(int argc, char **argv) {{
.map(|header| format!("#include <{}>", header))
.collect::<Vec<_>>()
.join("\n"),
passes = gen_code_c(intrinsic, constraints.as_slice(), Default::default()),
arglists = intrinsic.arguments.gen_arglists_c(PASSES),
passes = gen_code_c(
intrinsic,
constraints.as_slice(),
Default::default(),
p64_armv7_workaround
),
)
}
@ -131,10 +153,7 @@ fn gen_code_rust(intrinsic: &Intrinsic, constraints: &[&Argument], name: String)
})
.collect()
} else {
(1..20)
.map(|idx| intrinsic.generate_pass_rust(idx, &name))
.collect::<Vec<_>>()
.join("\n")
intrinsic.generate_loop_rust(&name, PASSES)
}
}
@ -153,11 +172,14 @@ fn generate_rust_program(intrinsic: &Intrinsic, a32: bool) -> String {
#![allow(non_upper_case_globals)]
use core_arch::arch::{target_arch}::*;
{arglists}
fn main() {{
{passes}
}}
"#,
target_arch = if a32 { "arm" } else { "aarch64" },
arglists = intrinsic.arguments.gen_arglists_rust(PASSES),
passes = gen_code_rust(intrinsic, &constraints, Default::default())
)
}
@ -203,7 +225,7 @@ fn build_c(intrinsics: &Vec<Intrinsic>, compiler: &str, a32: bool) -> bool {
let c_filename = format!(r#"c_programs/{}.cpp"#, i.name);
let mut file = File::create(&c_filename).unwrap();
let c_code = generate_c_program(&["arm_neon.h", "arm_acle.h"], &i);
let c_code = generate_c_program(&["arm_neon.h", "arm_acle.h"], &i, a32);
file.write_all(c_code.into_bytes().as_slice()).unwrap();
compile_c(&c_filename, &i, compiler, a32)
})
@ -259,7 +281,7 @@ path = "{intrinsic}/main.rs""#,
.current_dir("rust_programs")
.arg("-c")
.arg(format!(
"cargo {toolchain} build --target {target}",
"cargo {toolchain} build --target {target} --release",
toolchain = toolchain,
target = if a32 {
"armv7-unknown-linux-gnueabihf"
@ -407,7 +429,7 @@ fn compare_outputs(intrinsics: &Vec<Intrinsic>, toolchain: &str, runner: &str, a
.current_dir("rust_programs")
.arg("-c")
.arg(format!(
"cargo {toolchain} run --target {target} --bin {intrinsic}",
"cargo {toolchain} run --target {target} --bin {intrinsic} --release",
intrinsic = intrinsic.name,
toolchain = toolchain,
target = if a32 {

View file

@ -1,7 +1,7 @@
use std::fmt;
use std::str::FromStr;
use crate::values::values_for_pass;
use crate::values::value_for_array;
use crate::Language;
#[derive(Debug, PartialEq, Copy, Clone)]
@ -160,8 +160,7 @@ impl IntrinsicType {
}
}
#[allow(unused)]
fn c_scalar_type(&self) -> String {
pub fn c_scalar_type(&self) -> String {
format!(
"{prefix}{bits}_t",
prefix = self.kind().c_prefix(),
@ -169,7 +168,7 @@ impl IntrinsicType {
)
}
fn rust_scalar_type(&self) -> String {
pub fn rust_scalar_type(&self) -> String {
format!(
"{prefix}{bits}",
prefix = self.kind().rust_prefix(),
@ -289,18 +288,19 @@ impl IntrinsicType {
}
}
/// Generates a comma list of values that can be used to initialize an
/// argument for the intrinsic call.
/// Generates a comma list of values that can be used to initialize the array that
/// an argument for the intrinsic call is loaded from.
/// This is determistic based on the pass number.
///
/// * `pass`: The pass index, i.e. the iteration index for the call to an intrinsic
/// * `loads`: The number of values that need to be loaded from the argument array
/// * e.g for argument type uint32x2, loads=2 results in a string representing 4 32-bit values
///
/// Returns a string such as
/// * `0x1, 0x7F, 0xFF` if `language` is `Language::C`
/// * `0x1 as _, 0x7F as _, 0xFF as _` if `language` is `Language::Rust`
pub fn populate_random(&self, pass: usize, language: &Language) -> String {
pub fn populate_random(&self, loads: u32, language: &Language) -> String {
match self {
IntrinsicType::Ptr { child, .. } => child.populate_random(pass, language),
IntrinsicType::Ptr { child, .. } => child.populate_random(loads, language),
IntrinsicType::Type {
bit_len: Some(bit_len),
kind,
@ -308,11 +308,11 @@ impl IntrinsicType {
vec_len,
..
} if kind == &TypeKind::Int || kind == &TypeKind::UInt || kind == &TypeKind::Poly => (0
..(simd_len.unwrap_or(1) * vec_len.unwrap_or(1)))
..(simd_len.unwrap_or(1) * vec_len.unwrap_or(1) + loads - 1))
.map(|i| {
format!(
"{}{}",
values_for_pass(*bit_len, i, pass),
value_for_array(*bit_len, i),
match language {
&Language::Rust => format!(" as {ty} ", ty = self.rust_scalar_type()),
&Language::C => String::from(""),
@ -327,15 +327,15 @@ impl IntrinsicType {
simd_len,
vec_len,
..
} => (0..(simd_len.unwrap_or(1) * vec_len.unwrap_or(1)))
} => (0..(simd_len.unwrap_or(1) * vec_len.unwrap_or(1) + loads - 1))
.map(|i| {
format!(
"{}({})",
match language {
&Language::Rust => "f32::from_bits",
&Language::Rust => "std::mem::transmute",
&Language::C => "cast<float, uint32_t>",
},
values_for_pass(32, i, pass),
value_for_array(32, i),
)
})
.collect::<Vec<_>>()
@ -346,15 +346,15 @@ impl IntrinsicType {
simd_len,
vec_len,
..
} => (0..(simd_len.unwrap_or(1) * vec_len.unwrap_or(1)))
} => (0..(simd_len.unwrap_or(1) * vec_len.unwrap_or(1) + loads - 1))
.map(|i| {
format!(
"{}({}{})",
match language {
&Language::Rust => "f64::from_bits",
&Language::Rust => "std::mem::transmute",
&Language::C => "cast<double, uint64_t>",
},
values_for_pass(64, i, pass),
value_for_array(64, i),
match language {
&Language::Rust => " as u64",
&Language::C => "",
@ -368,10 +368,9 @@ impl IntrinsicType {
}
/// Determines the load function for this type.
#[allow(unused)]
pub fn get_load_function(&self) -> String {
pub fn get_load_function(&self, armv7_p64_workaround: bool) -> String {
match self {
IntrinsicType::Ptr { child, .. } => child.get_load_function(),
IntrinsicType::Ptr { child, .. } => child.get_load_function(armv7_p64_workaround),
IntrinsicType::Type {
kind: k,
bit_len: Some(bl),
@ -379,7 +378,7 @@ impl IntrinsicType {
vec_len,
..
} => {
let quad = if (simd_len.unwrap_or(1) * bl) > 64 {
let quad = if simd_len.unwrap_or(1) * bl > 64 {
"q"
} else {
""
@ -390,7 +389,8 @@ impl IntrinsicType {
TypeKind::UInt => "u",
TypeKind::Int => "s",
TypeKind::Float => "f",
TypeKind::Poly => "p",
// The ACLE doesn't support 64-bit polynomial loads on Armv7
TypeKind::Poly => if armv7_p64_workaround && *bl == 64 {"s"} else {"p"},
x => todo!("get_load_function TypeKind: {:#?}", x),
},
size = bl,

View file

@ -1,9 +1,8 @@
/// Gets a hex constant value for a single lane in in a determistic way
/// Gets a hex constant value for a single value in the argument values array in a determistic way
/// * `bits`: The number of bits for the type, only 8, 16, 32, 64 are valid values
/// * `simd`: The index of the simd lane we are generating for
/// * `pass`: The index of the pass we are generating the values for
pub fn values_for_pass(bits: u32, simd: u32, pass: usize) -> String {
let index = pass + (simd as usize);
/// * `index`: The position in the array we are generating for
pub fn value_for_array(bits: u32, index: u32) -> String {
let index = index as usize;
if bits == 8 {
format!("{:#X}", VALUES_8[index % VALUES_8.len()])