add vcopy neon instructions (#1139)

This commit is contained in:
Sparrow Li 2021-04-24 08:49:11 +08:00 committed by GitHub
parent 03e109a2f3
commit 8852d07441
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 1646 additions and 13 deletions

File diff suppressed because it is too large Load diff

View file

@ -359,6 +359,118 @@ extern "C" {
fn vsriq_n_s64_(a: int64x2_t, b: int64x2_t, n: i32) -> int64x2_t;
}
/// Duplicate vector element to vector or scalar
///
/// Inserts lane `N2` of `b` into lane `N1` of `a`. Both operands are
/// single-lane 64-bit vectors here, so the only valid indices are
/// `N1 == 0` and `N2 == 0`, and the result is simply `b` (`a` is fully
/// overwritten, which is why it is named `_a`).
#[inline]
#[target_feature(enable = "neon")]
// Replacing the sole lane needs no data rearrangement, so the test harness
// asserts that no real instruction (`nop`) is emitted.
#[cfg_attr(test, assert_instr(nop, N1 = 0, N2 = 0))]
// Lets callers pass the two lane indices as value arguments in positions
// 1 and 3 (the pre-const-generics calling convention).
#[rustc_legacy_const_generics(1, 3)]
pub unsafe fn vcopy_lane_s64<const N1: i32, const N2: i32>(
_a: int64x1_t,
b: int64x1_t,
) -> int64x1_t {
// Compile-time bounds checks: a one-lane vector only has lane 0.
static_assert!(N1 : i32 where N1 == 0);
static_assert!(N2 : i32 where N2 == 0);
b
}
/// Duplicate vector element to vector or scalar
///
/// Unsigned variant of `vcopy_lane_s64`: inserts lane `N2` of `b` into
/// lane `N1` of `a`. With one-lane vectors only `N1 == 0` / `N2 == 0`
/// are valid and the result is simply `b`; `_a` contributes nothing.
#[inline]
#[target_feature(enable = "neon")]
// No data movement needed for a whole-vector copy, hence `nop`.
#[cfg_attr(test, assert_instr(nop, N1 = 0, N2 = 0))]
// Accepts the lane indices as legacy value arguments in positions 1 and 3.
#[rustc_legacy_const_generics(1, 3)]
pub unsafe fn vcopy_lane_u64<const N1: i32, const N2: i32>(
_a: uint64x1_t,
b: uint64x1_t,
) -> uint64x1_t {
// Compile-time bounds checks: lane 0 is the only lane.
static_assert!(N1 : i32 where N1 == 0);
static_assert!(N2 : i32 where N2 == 0);
b
}
/// Duplicate vector element to vector or scalar
///
/// Polynomial variant of `vcopy_lane_s64`: inserts lane `N2` of `b` into
/// lane `N1` of `a`. With one-lane vectors only `N1 == 0` / `N2 == 0`
/// are valid and the result is simply `b`; `_a` contributes nothing.
#[inline]
#[target_feature(enable = "neon")]
// No data movement needed for a whole-vector copy, hence `nop`.
#[cfg_attr(test, assert_instr(nop, N1 = 0, N2 = 0))]
// Accepts the lane indices as legacy value arguments in positions 1 and 3.
#[rustc_legacy_const_generics(1, 3)]
pub unsafe fn vcopy_lane_p64<const N1: i32, const N2: i32>(
_a: poly64x1_t,
b: poly64x1_t,
) -> poly64x1_t {
// Compile-time bounds checks: lane 0 is the only lane.
static_assert!(N1 : i32 where N1 == 0);
static_assert!(N2 : i32 where N2 == 0);
b
}
/// Duplicate vector element to vector or scalar
///
/// Floating-point variant of `vcopy_lane_s64`: inserts lane `N2` of `b`
/// into lane `N1` of `a`. With one-lane vectors only `N1 == 0` / `N2 == 0`
/// are valid and the result is simply `b`; `_a` contributes nothing.
#[inline]
#[target_feature(enable = "neon")]
// No data movement needed for a whole-vector copy, hence `nop`.
#[cfg_attr(test, assert_instr(nop, N1 = 0, N2 = 0))]
// Accepts the lane indices as legacy value arguments in positions 1 and 3.
#[rustc_legacy_const_generics(1, 3)]
pub unsafe fn vcopy_lane_f64<const N1: i32, const N2: i32>(
_a: float64x1_t,
b: float64x1_t,
) -> float64x1_t {
// Compile-time bounds checks: lane 0 is the only lane.
static_assert!(N1 : i32 where N1 == 0);
static_assert!(N2 : i32 where N2 == 0);
b
}
/// Duplicate vector element to vector or scalar
///
/// Copies lane `LANE2` of the two-lane vector `b` into lane `LANE1` of the
/// single-lane vector `a`. `LANE1` must be 0 and `LANE2` may be 0 or 1;
/// the result is just the selected lane of `b`, so `_a` only fixes the type.
#[inline]
#[target_feature(enable = "neon")]
// A single lane extract/move folds away, hence the `nop` codegen assertion.
#[cfg_attr(test, assert_instr(nop, LANE1 = 0, LANE2 = 1))]
// Accepts the lane indices as legacy value arguments in positions 1 and 3.
#[rustc_legacy_const_generics(1, 3)]
pub unsafe fn vcopy_laneq_s64<const LANE1: i32, const LANE2: i32>(
_a: int64x1_t,
b: int64x2_t,
) -> int64x1_t {
// Destination has a single lane, so only index 0 is valid.
static_assert!(LANE1 : i32 where LANE1 == 0);
// One immediate bit suffices to index a two-lane source: LANE2 in 0..=1.
static_assert_imm1!(LANE2);
// Extract the chosen i64 lane and reinterpret it as a one-lane vector.
transmute::<i64, _>(simd_extract(b, LANE2 as u32))
}
/// Duplicate vector element to vector or scalar
///
/// Unsigned variant of `vcopy_laneq_s64`: copies lane `LANE2` of the
/// two-lane `b` into lane `LANE1` (necessarily 0) of the one-lane `a`;
/// the result is the selected lane of `b` and `_a` only fixes the type.
#[inline]
#[target_feature(enable = "neon")]
// A single lane extract/move folds away, hence the `nop` codegen assertion.
#[cfg_attr(test, assert_instr(nop, LANE1 = 0, LANE2 = 1))]
// Accepts the lane indices as legacy value arguments in positions 1 and 3.
#[rustc_legacy_const_generics(1, 3)]
pub unsafe fn vcopy_laneq_u64<const LANE1: i32, const LANE2: i32>(
_a: uint64x1_t,
b: uint64x2_t,
) -> uint64x1_t {
// Destination has a single lane, so only index 0 is valid.
static_assert!(LANE1 : i32 where LANE1 == 0);
// LANE2 must fit in one bit: 0..=1 for a two-lane source.
static_assert_imm1!(LANE2);
// Extract the chosen u64 lane and reinterpret it as a one-lane vector.
transmute::<u64, _>(simd_extract(b, LANE2 as u32))
}
/// Duplicate vector element to vector or scalar
///
/// Polynomial variant of `vcopy_laneq_s64`: copies lane `LANE2` of the
/// two-lane `b` into lane `LANE1` (necessarily 0) of the one-lane `a`;
/// the result is the selected lane of `b` and `_a` only fixes the type.
#[inline]
#[target_feature(enable = "neon")]
// A single lane extract/move folds away, hence the `nop` codegen assertion.
#[cfg_attr(test, assert_instr(nop, LANE1 = 0, LANE2 = 1))]
// Accepts the lane indices as legacy value arguments in positions 1 and 3.
#[rustc_legacy_const_generics(1, 3)]
pub unsafe fn vcopy_laneq_p64<const LANE1: i32, const LANE2: i32>(
_a: poly64x1_t,
b: poly64x2_t,
) -> poly64x1_t {
// Destination has a single lane, so only index 0 is valid.
static_assert!(LANE1 : i32 where LANE1 == 0);
// LANE2 must fit in one bit: 0..=1 for a two-lane source.
static_assert_imm1!(LANE2);
// Extract the lane (poly64 lanes extract as u64 here) and reinterpret it
// as a one-lane polynomial vector.
transmute::<u64, _>(simd_extract(b, LANE2 as u32))
}
/// Duplicate vector element to vector or scalar
///
/// Floating-point variant of `vcopy_laneq_s64`: copies lane `LANE2` of the
/// two-lane `b` into lane `LANE1` (necessarily 0) of the one-lane `a`;
/// the result is the selected lane of `b` and `_a` only fixes the type.
#[inline]
#[target_feature(enable = "neon")]
// A single lane extract/move folds away, hence the `nop` codegen assertion.
#[cfg_attr(test, assert_instr(nop, LANE1 = 0, LANE2 = 1))]
// Accepts the lane indices as legacy value arguments in positions 1 and 3.
#[rustc_legacy_const_generics(1, 3)]
pub unsafe fn vcopy_laneq_f64<const LANE1: i32, const LANE2: i32>(
_a: float64x1_t,
b: float64x2_t,
) -> float64x1_t {
// Destination has a single lane, so only index 0 is valid.
static_assert!(LANE1 : i32 where LANE1 == 0);
// LANE2 must fit in one bit: 0..=1 for a two-lane source.
static_assert_imm1!(LANE2);
// Extract the chosen f64 lane and reinterpret it as a one-lane vector.
transmute::<f64, _>(simd_extract(b, LANE2 as u32))
}
/// Load multiple single-element structures to one, two, three, or four registers.
#[inline]
#[target_feature(enable = "neon")]
@ -3793,6 +3905,78 @@ mod tests {
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
// Copying lane 0 of `b` over lane 0 of `a` must yield `b` for one-lane vectors.
unsafe fn test_vcopy_lane_s64() {
let a: i64x1 = i64x1::new(1);
let b: i64x1 = i64x1::new(0x7F_FF_FF_FF_FF_FF_FF_FF); // i64::MAX
let e: i64x1 = i64x1::new(0x7F_FF_FF_FF_FF_FF_FF_FF); // expected: all of `b`
let r: i64x1 = transmute(vcopy_lane_s64::<0, 0>(transmute(a), transmute(b)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
// Unsigned one-lane copy: the result must equal `b`, replacing `a` entirely.
unsafe fn test_vcopy_lane_u64() {
let a: u64x1 = u64x1::new(1);
let b: u64x1 = u64x1::new(0xFF_FF_FF_FF_FF_FF_FF_FF); // u64::MAX
let e: u64x1 = u64x1::new(0xFF_FF_FF_FF_FF_FF_FF_FF); // expected: all of `b`
let r: u64x1 = transmute(vcopy_lane_u64::<0, 0>(transmute(a), transmute(b)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
// Polynomial one-lane copy; poly64 vectors are driven via i64x1 bit patterns.
unsafe fn test_vcopy_lane_p64() {
let a: i64x1 = i64x1::new(1);
let b: i64x1 = i64x1::new(0x7F_FF_FF_FF_FF_FF_FF_FF);
let e: i64x1 = i64x1::new(0x7F_FF_FF_FF_FF_FF_FF_FF); // expected: all of `b`
let r: i64x1 = transmute(vcopy_lane_p64::<0, 0>(transmute(a), transmute(b)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
// f64 one-lane copy; a bare f64 is bit-identical to float64x1_t, so plain
// transmutes drive the intrinsic. Exact equality is fine: no arithmetic occurs.
unsafe fn test_vcopy_lane_f64() {
let a: f64 = 1.;
let b: f64 = 0.;
let e: f64 = 0.; // expected: the value of `b`
let r: f64 = transmute(vcopy_lane_f64::<0, 0>(transmute(a), transmute(b)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
// Extracting lane 1 of the two-lane `b` must produce that lane's value,
// discarding `a` completely.
unsafe fn test_vcopy_laneq_s64() {
let a: i64x1 = i64x1::new(1);
let b: i64x2 = i64x2::new(0, 0x7F_FF_FF_FF_FF_FF_FF_FF); // lane 1 = i64::MAX
let e: i64x1 = i64x1::new(0x7F_FF_FF_FF_FF_FF_FF_FF);
let r: i64x1 = transmute(vcopy_laneq_s64::<0, 1>(transmute(a), transmute(b)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
// Unsigned variant: lane 1 of `b` becomes the whole one-lane result.
unsafe fn test_vcopy_laneq_u64() {
let a: u64x1 = u64x1::new(1);
let b: u64x2 = u64x2::new(0, 0xFF_FF_FF_FF_FF_FF_FF_FF); // lane 1 = u64::MAX
let e: u64x1 = u64x1::new(0xFF_FF_FF_FF_FF_FF_FF_FF);
let r: u64x1 = transmute(vcopy_laneq_u64::<0, 1>(transmute(a), transmute(b)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
// Polynomial variant, driven via i64 bit patterns: lane 1 of `b` is the result.
unsafe fn test_vcopy_laneq_p64() {
let a: i64x1 = i64x1::new(1);
let b: i64x2 = i64x2::new(0, 0x7F_FF_FF_FF_FF_FF_FF_FF); // lane 1 selected
let e: i64x1 = i64x1::new(0x7F_FF_FF_FF_FF_FF_FF_FF);
let r: i64x1 = transmute(vcopy_laneq_p64::<0, 1>(transmute(a), transmute(b)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
// f64 variant: lane 1 of `b` (0.5, exactly representable) becomes the result,
// so exact equality is safe.
unsafe fn test_vcopy_laneq_f64() {
let a: f64 = 1.;
let b: f64x2 = f64x2::new(0., 0.5); // lane 1 = 0.5
let e: f64 = 0.5;
let r: f64 = transmute(vcopy_laneq_f64::<0, 1>(transmute(a), transmute(b)));
assert_eq!(r, e);
}
#[simd_test(enable = "neon")]
unsafe fn test_vceq_u64() {
test_cmp_u64(

View file

@ -721,6 +721,124 @@ generate float64x1_t:uint64x1_t, float64x2_t:uint64x2_t
arm = vacge.s
generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t
/// Insert vector element from another vector element
name = vcopy
lane-suffixes
constn = LANE1:LANE2
multi_fn = static_assert_imm-in0_exp_len-LANE1
multi_fn = static_assert_imm-in_exp_len-LANE2
multi_fn = matchn-in0_exp_len-LANE1, simd_shuffle-out_len-noext, a, b, {ins-in0_len-in0_len-LANE2}
a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
b = 0, MAX, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
n = 0:1
validate MAX, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
aarch64 = mov
generate int8x8_t, int8x16_t, int16x4_t, int16x8_t, int32x2_t, int32x4_t, int64x2_t
generate uint8x8_t, uint8x16_t, uint16x4_t, uint16x8_t, uint32x2_t, uint32x4_t, uint64x2_t
generate poly8x8_t, poly8x16_t, poly16x4_t, poly16x8_t, poly64x2_t
/// Insert vector element from another vector element
name = vcopy
lane-suffixes
constn = LANE1:LANE2
multi_fn = static_assert_imm-in0_exp_len-LANE1
multi_fn = static_assert_imm-in_exp_len-LANE2
multi_fn = matchn-in0_exp_len-LANE1, simd_shuffle-out_len-noext, a, b, {ins-in0_len-in0_len-LANE2}
a = 1., 2., 3., 4.
b = 0., 0.5, 0., 0.
n = 0:1
validate 0.5, 2., 3., 4.
aarch64 = mov
generate float32x2_t, float32x4_t, float64x2_t
/// Insert vector element from another vector element
name = vcopy
lane-suffixes
constn = LANE1:LANE2
multi_fn = static_assert_imm-in0_exp_len-LANE1
multi_fn = static_assert_imm-in_exp_len-LANE2
multi_fn = simd_shuffle-in_len-noext, a:in_t, a, a, {asc-0-in_len}
multi_fn = matchn-in0_exp_len-LANE1, simd_shuffle-out_len-noext, a, b, {ins-in0_len-in_len-LANE2}
a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
b = 0, MAX, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
n = 0:1
validate MAX, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
aarch64 = mov
generate int8x8_t:int8x16_t:int8x8_t, int16x4_t:int16x8_t:int16x4_t, int32x2_t:int32x4_t:int32x2_t
generate uint8x8_t:uint8x16_t:uint8x8_t, uint16x4_t:uint16x8_t:uint16x4_t, uint32x2_t:uint32x4_t:uint32x2_t
generate poly8x8_t:poly8x16_t:poly8x8_t, poly16x4_t:poly16x8_t:poly16x4_t
/// Insert vector element from another vector element
name = vcopy
lane-suffixes
constn = LANE1:LANE2
multi_fn = static_assert_imm-in0_exp_len-LANE1
multi_fn = static_assert_imm-in_exp_len-LANE2
multi_fn = simd_shuffle-in_len-noext, a:in_t, a, a, {asc-0-in_len}
multi_fn = matchn-in0_exp_len-LANE1, simd_shuffle-out_len-noext, a, b, {ins-in0_len-in_len-LANE2}
a = 1., 2., 3., 4.
b = 0., 0.5, 0., 0.
n = 0:1
validate 0.5, 2., 3., 4.
aarch64 = mov
generate float32x2_t:float32x4_t:float32x2_t
/// Insert vector element from another vector element
name = vcopy
lane-suffixes
constn = LANE1:LANE2
multi_fn = static_assert_imm-in0_exp_len-LANE1
multi_fn = static_assert_imm-in_exp_len-LANE2
multi_fn = simd_shuffle-in0_len-noext, b:in_t0, b, b, {asc-0-in0_len}
multi_fn = matchn-in0_exp_len-LANE1, simd_shuffle-out_len-noext, a, b, {ins-in0_len-in0_len-LANE2}
a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
b = 0, MAX, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
n = 0:1
validate MAX, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
aarch64 = mov
generate int8x16_t:int8x8_t:int8x16_t, int16x8_t:int16x4_t:int16x8_t, int32x4_t:int32x2_t:int32x4_t
generate uint8x16_t:uint8x8_t:uint8x16_t, uint16x8_t:uint16x4_t:uint16x8_t, uint32x4_t:uint32x2_t:uint32x4_t
generate poly8x16_t:poly8x8_t:poly8x16_t, poly16x8_t:poly16x4_t:poly16x8_t
/// Insert vector element from another vector element
name = vcopy
lane-suffixes
constn = LANE1:LANE2
multi_fn = static_assert_imm-in0_exp_len-LANE1
multi_fn = static_assert_imm-in_exp_len-LANE2
multi_fn = simd_shuffle-in0_len-noext, b:in_t0, b, b, {asc-0-in0_len}
multi_fn = matchn-in0_exp_len-LANE1, simd_shuffle-out_len-noext, a, b, {ins-in0_len-in0_len-LANE2}
a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
b = MAX, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
n = 1:0
validate 1, MAX
aarch64 = zip1
generate int64x2_t:int64x1_t:int64x2_t, uint64x2_t:uint64x1_t:uint64x2_t, poly64x2_t:poly64x1_t:poly64x2_t
/// Insert vector element from another vector element
name = vcopy
lane-suffixes
constn = LANE1:LANE2
multi_fn = static_assert_imm-in0_exp_len-LANE1
multi_fn = static_assert_imm-in_exp_len-LANE2
multi_fn = simd_shuffle-in0_len-noext, b:in_t0, b, b, {asc-0-in0_len}
multi_fn = matchn-in0_exp_len-LANE1, simd_shuffle-out_len-noext, a, b, {ins-in0_len-in0_len-LANE2}
a = 1., 2., 3., 4.
b = 0.5, 0., 0., 0.
n = 1:0
validate 1., 0.5, 3., 4.
aarch64 = mov
generate float32x4_t:float32x2_t:float32x4_t
aarch64 = zip1
generate float64x2_t:float64x1_t:float64x2_t
/// Floating-point convert to higher precision long
name = vcvt
double-suffixes
@ -1036,7 +1154,7 @@ generate float32x2_t:f32, float32x4_t:f32, float64x1_t:f64, float64x2_t:f64
name = vext
constn = N
multi_fn = static_assert_imm-out_exp_len-N
multi_fn = matchn-out_exp_len, simd_shuffle-out_len-noext, a, b, {asc-n-out_len}
multi_fn = matchn-out_exp_len-N, simd_shuffle-out_len-noext, a, b, {asc-n-out_len}
a = 0, 8, 8, 9, 8, 9, 9, 11, 8, 9, 9, 11, 9, 11, 14, 15
b = 9, 11, 14, 15, 16, 17, 18, 19, 0, 8, 8, 9, 8, 9, 9, 11
n = HFLEN
@ -1050,7 +1168,7 @@ generate int*_t, uint*_t, poly8x8_t, poly8x16_t, poly16x4_t, poly16x8_t
name = vext
constn = N
multi_fn = static_assert_imm-out_exp_len-N
multi_fn = matchn-out_exp_len, simd_shuffle-out_len-noext, a, b, {asc-n-out_len}
multi_fn = matchn-out_exp_len-N, simd_shuffle-out_len-noext, a, b, {asc-n-out_len}
a = 0, 8, 8, 9, 8, 9, 9, 11, 8, 9, 9, 11, 9, 11, 14, 15
b = 9, 11, 14, 15, 16, 17, 18, 19, 0, 8, 8, 9, 8, 9, 9, 11
n = HFLEN
@ -1066,7 +1184,7 @@ generate int64x2_t, uint64x2_t
name = vext
constn = N
multi_fn = static_assert_imm-out_exp_len-N
multi_fn = matchn-out_exp_len, simd_shuffle-out_len-noext, a, b, {asc-n-out_len}
multi_fn = matchn-out_exp_len-N, simd_shuffle-out_len-noext, a, b, {asc-n-out_len}
a = 0., 2., 2., 3.
b = 3., 4., 5., 6.,
n = HFLEN

View file

@ -949,21 +949,50 @@ fn gen_aarch64(
String::new()
};
let const_declare = if let Some(constn) = constn {
format!(r#"<const {}: i32>"#, constn)
if constn.contains(":") {
let constns: Vec<_> = constn.split(':').map(|v| v.to_string()).collect();
assert_eq!(constns.len(), 2);
format!(r#"<const {}: i32, const {}: i32>"#, constns[0], constns[1])
} else {
format!(r#"<const {}: i32>"#, constn)
}
} else {
String::new()
};
let const_assert = if let Some(constn) = constn {
format!(
r#", {} = {}"#,
constn,
map_val(in_t[1], current_tests[0].3.as_ref().unwrap())
)
if constn.contains(":") {
let constns: Vec<_> = constn.split(':').map(|v| v.to_string()).collect();
let const_test = current_tests[0].3.as_ref().unwrap();
let const_tests: Vec<_> = const_test.split(':').map(|v| v.to_string()).collect();
assert_eq!(constns.len(), 2);
assert_eq!(const_tests.len(), 2);
format!(
r#", {} = {}, {} = {}"#,
constns[0],
map_val(in_t[1], &const_tests[0]),
constns[1],
map_val(in_t[1], &const_tests[1]),
)
} else {
format!(
r#", {} = {}"#,
constn,
map_val(in_t[1], current_tests[0].3.as_ref().unwrap())
)
}
} else {
String::new()
};
let const_legacy = if constn.is_some() {
format!("\n#[rustc_legacy_const_generics({})]", para_num)
let const_legacy = if let Some(constn) = constn {
if constn.contains(":") {
format!(
"\n#[rustc_legacy_const_generics({}, {})]",
para_num - 1,
para_num + 1
)
} else {
format!("\n#[rustc_legacy_const_generics({})]", para_num)
}
} else {
String::new()
};
@ -1105,7 +1134,16 @@ fn gen_test(
let c: Vec<String> = c.iter().take(len_in[2]).cloned().collect();
let e: Vec<String> = e.iter().take(len_out).cloned().collect();
let const_value = if let Some(constn) = n {
format!(r#"::<{}>"#, map_val(in_t[1], constn))
if constn.contains(":") {
let constns: Vec<_> = constn.split(':').map(|v| v.to_string()).collect();
format!(
r#"::<{}, {}>"#,
map_val(in_t[1], &constns[0]),
map_val(in_t[1], &constns[1])
)
} else {
format!(r#"::<{}>"#, map_val(in_t[1], constn))
}
} else {
String::new()
};
@ -1739,11 +1777,41 @@ fn get_call(
let len = match &*fn_format[2] {
"out_len" => type_len(out_t),
"in_len" => type_len(in_t[1]),
"in0_len" => type_len(in_t[0]),
"halflen" => type_len(in_t[1]) / 2,
_ => 0,
};
return asc(start, len);
}
if fn_name.starts_with("ins") {
let fn_format: Vec<_> = fn_name.split('-').map(|v| v.to_string()).collect();
let n = n.unwrap();
let len = match &*fn_format[1] {
"out_len" => type_len(out_t),
"in_len" => type_len(in_t[1]),
"in0_len" => type_len(in_t[0]),
_ => 0,
};
let offset = match &*fn_format[2] {
"out_len" => type_len(out_t),
"in_len" => type_len(in_t[1]),
"in0_len" => type_len(in_t[0]),
_ => 0,
};
let mut s = String::from("[");
for i in 0..len {
if i != 0 {
s.push_str(", ");
}
if i == n as usize {
s.push_str(&format!("{} + {} as u32", offset.to_string(), fn_format[3]));
} else {
s.push_str(&i.to_string());
}
}
s.push_str("]");
return s;
}
if fn_name.starts_with("static_assert_imm") {
let fn_format: Vec<_> = fn_name.split('-').map(|v| v.to_string()).collect();
let len = match &*fn_format[1] {
@ -1751,6 +1819,8 @@ fn get_call(
"out_bits_exp_len" => type_bits_exp_len(out_t),
"in_exp_len" => type_exp_len(in_t[1]),
"in_bits_exp_len" => type_bits_exp_len(in_t[1]),
"in0_exp_len" => type_exp_len(in_t[0]),
"in1_exp_len" => type_exp_len(in_t[1]),
"in2_exp_len" => type_exp_len(in_t[2]),
_ => 0,
};
@ -1796,9 +1866,10 @@ fn get_call(
let len = match &*fn_format[1] {
"out_exp_len" => type_exp_len(out_t),
"in_exp_len" => type_exp_len(in_t[1]),
"in0_exp_len" => type_exp_len(in_t[0]),
_ => 0,
};
let mut call = format!("match N & 0b{} {{\n", "1".repeat(len));
let mut call = format!("match {} & 0b{} {{\n", &fn_format[2], "1".repeat(len));
let mut sub_call = String::new();
for p in 1..params.len() {
if !sub_call.is_empty() {
@ -1946,6 +2017,8 @@ fn get_call(
} else if fn_format[1] == "nosuffix" {
} else if fn_format[1] == "in_len" {
fn_name.push_str(&type_len(in_t[1]).to_string());
} else if fn_format[1] == "in0_len" {
fn_name.push_str(&type_len(in_t[0]).to_string());
} else if fn_format[1] == "out_len" {
fn_name.push_str(&type_len(out_t).to_string());
} else if fn_format[1] == "halflen" {