diff --git a/src/tools/miri/src/shims/x86/mod.rs b/src/tools/miri/src/shims/x86/mod.rs index f2d120df21c4..b71aec02166b 100644 --- a/src/tools/miri/src/shims/x86/mod.rs +++ b/src/tools/miri/src/shims/x86/mod.rs @@ -18,6 +18,7 @@ mod sse; mod sse2; mod sse3; mod sse41; +mod sse42; mod ssse3; impl<'tcx> EvalContextExt<'tcx> for crate::MiriInterpCx<'tcx> {} @@ -137,6 +138,11 @@ pub(super) trait EvalContextExt<'tcx>: crate::MiriInterpCxExt<'tcx> { this, link_name, abi, args, dest, ); } + name if name.starts_with("sse42.") => { + return sse42::EvalContextExt::emulate_x86_sse42_intrinsic( + this, link_name, abi, args, dest, + ); + } name if name.starts_with("aesni.") => { return aesni::EvalContextExt::emulate_x86_aesni_intrinsic( this, link_name, abi, args, dest, diff --git a/src/tools/miri/src/shims/x86/sse42.rs b/src/tools/miri/src/shims/x86/sse42.rs new file mode 100644 index 000000000000..4e50d5e5dcfb --- /dev/null +++ b/src/tools/miri/src/shims/x86/sse42.rs @@ -0,0 +1,500 @@ +use rustc_middle::mir; +use rustc_middle::ty::layout::LayoutOf as _; +use rustc_middle::ty::Ty; +use rustc_span::Symbol; +use rustc_target::abi::Size; +use rustc_target::spec::abi::Abi; + +use crate::*; + +/// A bitmask constant for scrutinizing the immediate byte provided +/// to the string comparison intrinsics. It distinuishes between +/// 16-bit integers and 8-bit integers. See [`compare_strings`] +/// for more details about the immediate byte. +const USE_WORDS: u8 = 1; + +/// A bitmask constant for scrutinizing the immediate byte provided +/// to the string comparison intrinsics. It distinuishes between +/// signed integers and unsigned integers. See [`compare_strings`] +/// for more details about the immediate byte. +const USE_SIGNED: u8 = 2; + +/// The main worker for the string comparison intrinsics, where the given +/// strings are analyzed according to the given immediate byte. +/// +/// # Arguments +/// +/// * `str1` - The first string argument. It is always a length 16 array of bytes +/// or a length 8 array of two-byte words. +/// * `str2` - The second string argument. It is always a length 16 array of bytes +/// or a length 8 array of two-byte words. +/// * `len` is the length values of the supplied strings. It is distinct from the operand length +/// in that it describes how much of `str1` and `str2` will be used for the calculation and may +/// be smaller than the array length of `str1` and `str2`. The string length is counted in bytes +/// if using byte operands and in two-byte words when using two-byte word operands. +/// If the value is `None`, the length of a string is determined by the first +/// null value inside the string. +/// * `imm` is the immediate byte argument supplied to the intrinsic. The byte influences +/// the operation as follows: +/// +/// ```text +/// 0babccddef +/// || | |||- Use of bytes vs use of two-byte words inside the operation. +/// || | || +/// || | ||- Use of signed values versus use of unsigned values. +/// || | | +/// || | |- The comparison operation performed. A total of four operations are available. +/// || | * Equal any: Checks which characters of `str2` are inside `str1`. +/// || | * String ranges: Check if characters in `str2` are inside the provided character ranges. +/// || | Adjacent characters in `str1` constitute one range. +/// || | * String comparison: Mark positions where `str1` and `str2` have the same character. +/// || | * Substring search: Mark positions where `str1` is a substring in `str2`. +/// || | +/// || |- Result Polarity. The result bits may be subjected to a bitwise complement +/// || if these bits are set. +/// || +/// ||- Output selection. This bit has two meanings depending on the instruction. +/// | If the instruction is generating a mask, it distinguishes between a bit mask +/// | and a byte mask. Otherwise it distinguishes between the most significand bit +/// | and the least significand bit when generating an index. +/// | +/// |- This bit is ignored. It is expected that this bit is set to zero, but it is +/// not a requirement. +/// ``` +/// +/// # Returns +/// +/// A result mask. The bit at index `i` inside the mask is set if 'str2' starting at `i` +/// fulfills the test as defined inside the immediate byte. +/// The mask may be negated if negation flags inside the immediate byte are set. +/// +/// For more information, see the Intel Software Developer's Manual, Vol. 2b, Chapter 4.1. +#[allow(clippy::arithmetic_side_effects)] +fn compare_strings<'tcx>( + this: &mut MiriInterpCx<'tcx>, + str1: &OpTy<'tcx>, + str2: &OpTy<'tcx>, + len: Option<(u64, u64)>, + imm: u8, +) -> InterpResult<'tcx, i32> { + let default_len = default_len::(imm); + let (len1, len2) = if let Some(t) = len { + t + } else { + let len1 = implicit_len(this, str1, imm)?.unwrap_or(default_len); + let len2 = implicit_len(this, str2, imm)?.unwrap_or(default_len); + (len1, len2) + }; + + let mut result = 0; + match (imm >> 2) & 3 { + 0 => { + // Equal any: Checks which characters of `str2` are inside `str1`. + for i in 0..len2 { + let ch2 = this.read_immediate(&this.project_index(str2, i)?)?; + + for j in 0..len1 { + let ch1 = this.read_immediate(&this.project_index(str1, j)?)?; + + let eq = this.binary_op(mir::BinOp::Eq, &ch1, &ch2)?; + if eq.to_scalar().to_bool()? { + result |= 1 << i; + break; + } + } + } + } + 1 => { + // String ranges: Check if characters in `str2` are inside the provided character ranges. + // Adjacent characters in `str1` constitute one range. + let len1 = len1 - (len1 & 1); + let get_ch = |ch: Scalar| -> InterpResult<'tcx, i32> { + let result = match (imm & USE_WORDS != 0, imm & USE_SIGNED != 0) { + (true, true) => i32::from(ch.to_i16()?), + (true, false) => i32::from(ch.to_u16()?), + (false, true) => i32::from(ch.to_i8()?), + (false, false) => i32::from(ch.to_u8()?), + }; + Ok(result) + }; + + for i in 0..len2 { + for j in (0..len1).step_by(2) { + let ch2 = get_ch(this.read_scalar(&this.project_index(str2, i)?)?)?; + let ch1_1 = get_ch(this.read_scalar(&this.project_index(str1, j)?)?)?; + let ch1_2 = get_ch(this.read_scalar(&this.project_index(str1, j + 1)?)?)?; + + if ch1_1 <= ch2 && ch2 <= ch1_2 { + result |= 1 << i; + } + } + } + } + 2 => { + // String comparison: Mark positions where `str1` and `str2` have the same character. + result = (1 << default_len) - 1; + result ^= (1 << len1.max(len2)) - 1; + + for i in 0..len1.min(len2) { + let ch1 = this.read_immediate(&this.project_index(str1, i)?)?; + let ch2 = this.read_immediate(&this.project_index(str2, i)?)?; + let eq = this.binary_op(mir::BinOp::Eq, &ch1, &ch2)?; + result |= i32::from(eq.to_scalar().to_bool()?) << i; + } + } + 3 => { + // Substring search: Mark positions where `str1` is a substring in `str2`. + if len1 == 0 { + result = (1 << default_len) - 1; + } else if len1 <= len2 { + for i in 0..len2 { + if len1 > len2 - i { + break; + } + + result |= 1 << i; + + for j in 0..len1 { + let k = i + j; + + if k >= default_len { + break; + } else { + let ch1 = this.read_immediate(&this.project_index(str1, j)?)?; + let ch2 = this.read_immediate(&this.project_index(str2, k)?)?; + let ne = this.binary_op(mir::BinOp::Ne, &ch1, &ch2)?; + + if ne.to_scalar().to_bool()? { + result &= !(1 << i); + break; + } + } + } + } + } + } + _ => unreachable!(), + } + + // Polarity: Possibly perform a bitwise complement on the result. + match (imm >> 4) & 3 { + 3 => result ^= (1 << len1) - 1, + 1 => result ^= (1 << default_len) - 1, + _ => (), + } + + Ok(result) +} + +/// Obtain the arguments of the intrinsic based on its name. +/// The result is a tuple with the following values: +/// * The first string argument. +/// * The second string argument. +/// * The string length values, if the intrinsic requires them. +/// * The immediate instruction byte. +/// +/// The string arguments will be transmuted into arrays of bytes +/// or two-byte words, depending on the value of the immediate byte. +/// Originally, they are [__m128i](https://doc.rust-lang.org/stable/core/arch/x86_64/struct.__m128i.html) values +/// corresponding to the x86 128-bit integer SIMD type. +fn deconstruct_args<'tcx>( + unprefixed_name: &str, + this: &mut MiriInterpCx<'tcx>, + link_name: Symbol, + abi: Abi, + args: &[OpTy<'tcx>], +) -> InterpResult<'tcx, (OpTy<'tcx>, OpTy<'tcx>, Option<(u64, u64)>, u8)> { + let array_layout_fn = |this: &mut MiriInterpCx<'tcx>, imm: u8| { + if imm & USE_WORDS != 0 { + this.layout_of(Ty::new_array(this.tcx.tcx, this.tcx.types.u16, 8)) + } else { + this.layout_of(Ty::new_array(this.tcx.tcx, this.tcx.types.u8, 16)) + } + }; + + // The fourth letter of each string comparison intrinsic is either 'e' for "explicit" or 'i' for "implicit". + // The distinction will correspond to the intrinsics type signature. In this constext, "explicit" and "implicit" + // refer to the way the string length is determined. The length is either passed explicitly in the "explicit" + // case or determined by a null terminator in the "implicit" case. + let is_explicit = match unprefixed_name.as_bytes().get(4) { + Some(&b'e') => true, + Some(&b'i') => false, + _ => unreachable!(), + }; + + if is_explicit { + let [str1, len1, str2, len2, imm] = + this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?; + let imm = this.read_scalar(imm)?.to_u8()?; + + let default_len = default_len::(imm); + let len1 = u64::from(this.read_scalar(len1)?.to_u32()?.min(default_len)); + let len2 = u64::from(this.read_scalar(len2)?.to_u32()?.min(default_len)); + + let array_layout = array_layout_fn(this, imm)?; + let str1 = str1.transmute(array_layout, this)?; + let str2 = str2.transmute(array_layout, this)?; + + Ok((str1, str2, Some((len1, len2)), imm)) + } else { + let [str1, str2, imm] = this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?; + let imm = this.read_scalar(imm)?.to_u8()?; + + let array_layout = array_layout_fn(this, imm)?; + let str1 = str1.transmute(array_layout, this)?; + let str2 = str2.transmute(array_layout, this)?; + + Ok((str1, str2, None, imm)) + } +} + +/// Calculate the c-style string length for a given string `str`. +/// The string is either a length 16 array of bytes a length 8 array of two-byte words. +fn implicit_len<'tcx>( + this: &mut MiriInterpCx<'tcx>, + str: &OpTy<'tcx>, + imm: u8, +) -> InterpResult<'tcx, Option> { + let mut result = None; + let zero = ImmTy::from_int(0, str.layout.field(this, 0)); + + for i in 0..default_len::(imm) { + let ch = this.read_immediate(&this.project_index(str, i)?)?; + let is_zero = this.binary_op(mir::BinOp::Eq, &ch, &zero)?; + if is_zero.to_scalar().to_bool()? { + result = Some(i); + break; + } + } + Ok(result) +} + +#[inline] +fn default_len>(imm: u8) -> T { + if imm & USE_WORDS != 0 { T::from(8u8) } else { T::from(16u8) } +} + +impl<'tcx> EvalContextExt<'tcx> for crate::MiriInterpCx<'tcx> {} +pub(super) trait EvalContextExt<'tcx>: crate::MiriInterpCxExt<'tcx> { + fn emulate_x86_sse42_intrinsic( + &mut self, + link_name: Symbol, + abi: Abi, + args: &[OpTy<'tcx>], + dest: &MPlaceTy<'tcx>, + ) -> InterpResult<'tcx, EmulateItemResult> { + let this = self.eval_context_mut(); + this.expect_target_feature_for_intrinsic(link_name, "sse4.2")?; + // Prefix should have already been checked. + let unprefixed_name = link_name.as_str().strip_prefix("llvm.x86.sse42.").unwrap(); + + match unprefixed_name { + // Used to implement the `_mm_cmpestrm` and the `_mm_cmpistrm` functions. + // These functions compare the input strings and return the resulting mask. + // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#ig_expand=1044,922 + "pcmpistrm128" | "pcmpestrm128" => { + let (str1, str2, len, imm) = + deconstruct_args(unprefixed_name, this, link_name, abi, args)?; + let mask = compare_strings(this, &str1, &str2, len, imm)?; + + // The sixth bit inside the immediate byte distiguishes + // between a bit mask or a byte mask when generating a mask. + if imm & 0b100_0000 != 0 { + let (array_layout, size) = if imm & USE_WORDS != 0 { + (this.layout_of(Ty::new_array(this.tcx.tcx, this.tcx.types.u16, 8))?, 2) + } else { + (this.layout_of(Ty::new_array(this.tcx.tcx, this.tcx.types.u8, 16))?, 1) + }; + let size = Size::from_bytes(size); + let dest = dest.transmute(array_layout, this)?; + + for i in 0..default_len::(imm) { + let result = helpers::bool_to_simd_element(mask & (1 << i) != 0, size); + this.write_scalar(result, &this.project_index(&dest, i)?)?; + } + } else { + let layout = this.layout_of(this.tcx.types.i128)?; + let dest = dest.transmute(layout, this)?; + this.write_scalar(Scalar::from_i128(i128::from(mask)), &dest)?; + } + } + + // Used to implement the `_mm_cmpestra` and the `_mm_cmpistra` functions. + // These functions compare the input strings and return `1` if the end of the second + // input string is not reached and the resulting mask is zero, and `0` otherwise. + // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#ig_expand=919,1041 + "pcmpistria128" | "pcmpestria128" => { + let (str1, str2, len, imm) = + deconstruct_args(unprefixed_name, this, link_name, abi, args)?; + let result = if compare_strings(this, &str1, &str2, len, imm)? != 0 { + false + } else if let Some((_, len)) = len { + len >= default_len::(imm) + } else { + implicit_len(this, &str1, imm)?.is_some() + }; + + this.write_scalar(Scalar::from_i32(i32::from(result)), dest)?; + } + + // Used to implement the `_mm_cmpestri` and the `_mm_cmpistri` functions. + // These functions compare the input strings and return the bit index + // for most significant or least significant bit of the resulting mask. + // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#ig_expand=921,1043 + "pcmpistri128" | "pcmpestri128" => { + let (str1, str2, len, imm) = + deconstruct_args(unprefixed_name, this, link_name, abi, args)?; + let mask = compare_strings(this, &str1, &str2, len, imm)?; + + let len = default_len::(imm); + // The sixth bit inside the immediate byte distiguishes between the least + // significant bit and the most significant bit when generating an index. + let result = if imm & 0b100_0000 != 0 { + // most significant bit + 31u32.wrapping_sub(mask.leading_zeros()).min(len) + } else { + // least significant bit + mask.trailing_zeros().min(len) + }; + this.write_scalar(Scalar::from_i32(i32::try_from(result).unwrap()), dest)?; + } + + // Used to implement the `_mm_cmpestro` and the `_mm_cmpistro` functions. + // These functions compare the input strings and return the lowest bit of the + // resulting mask. + // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#ig_expand=923,1045 + "pcmpistrio128" | "pcmpestrio128" => { + let (str1, str2, len, imm) = + deconstruct_args(unprefixed_name, this, link_name, abi, args)?; + let mask = compare_strings(this, &str1, &str2, len, imm)?; + this.write_scalar(Scalar::from_i32(mask & 1), dest)?; + } + + // Used to implement the `_mm_cmpestrc` and the `_mm_cmpistrc` functions. + // These functions compare the input strings and return `1` if the resulting + // mask was non-zero, and `0` otherwise. + // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#ig_expand=920,1042 + "pcmpistric128" | "pcmpestric128" => { + let (str1, str2, len, imm) = + deconstruct_args(unprefixed_name, this, link_name, abi, args)?; + let mask = compare_strings(this, &str1, &str2, len, imm)?; + this.write_scalar(Scalar::from_i32(i32::from(mask != 0)), dest)?; + } + + // Used to implement the `_mm_cmpistrz` and the `_mm_cmpistrs` functions. + // These functions return `1` if the string end has been reached and `0` otherwise. + // Since these functions define the string length implicitly, it is equal to a + // search for a null terminator (see `deconstruct_args` for more details). + // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#ig_expand=924,925 + "pcmpistriz128" | "pcmpistris128" => { + let [str1, str2, imm] = + this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?; + let imm = this.read_scalar(imm)?.to_u8()?; + + let str = if unprefixed_name == "pcmpistris128" { str1 } else { str2 }; + let array_layout = if imm & USE_WORDS != 0 { + this.layout_of(Ty::new_array(this.tcx.tcx, this.tcx.types.u16, 8))? + } else { + this.layout_of(Ty::new_array(this.tcx.tcx, this.tcx.types.u8, 16))? + }; + let str = str.transmute(array_layout, this)?; + let result = implicit_len(this, &str, imm)?.is_some(); + + this.write_scalar(Scalar::from_i32(i32::from(result)), dest)?; + } + + // Used to implement the `_mm_cmpestrz` and the `_mm_cmpestrs` functions. + // These functions return 1 if the explicitly passed string length is smaller + // than 16 for byte-sized operands or 8 for word-sized operands. + // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#ig_expand=1046,1047 + "pcmpestriz128" | "pcmpestris128" => { + let [_, len1, _, len2, imm] = + this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?; + let len = if unprefixed_name == "pcmpestris128" { len1 } else { len2 }; + let len = this.read_scalar(len)?.to_i32()?; + let imm = this.read_scalar(imm)?.to_u8()?; + this.write_scalar( + Scalar::from_i32(i32::from(len < default_len::(imm))), + dest, + )?; + } + + // Used to implement the `_mm_crc32_u{8, 16, 32, 64}` functions. + // These functions calculate a 32-bit CRC using `0x11EDC6F41` + // as the polynomial, also known as CRC32C. + // https://datatracker.ietf.org/doc/html/rfc3720#section-12.1 + "crc32.32.8" | "crc32.32.16" | "crc32.32.32" | "crc32.64.64" => { + let bit_size = match unprefixed_name { + "crc32.32.8" => 8, + "crc32.32.16" => 16, + "crc32.32.32" => 32, + "crc32.64.64" => 64, + _ => unreachable!(), + }; + + if bit_size == 64 && this.tcx.sess.target.arch != "x86_64" { + return Ok(EmulateItemResult::NotSupported); + } + + let [left, right] = + this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?; + let left = this.read_scalar(left)?; + let right = this.read_scalar(right)?; + + let crc = if bit_size == 64 { + // The 64-bit version will only consider the lower 32 bits, + // while the upper 32 bits get discarded. + #[allow(clippy::cast_possible_truncation)] + u128::from((left.to_u64()? as u32).reverse_bits()) + } else { + u128::from(left.to_u32()?.reverse_bits()) + }; + let v = match bit_size { + 8 => u128::from(right.to_u8()?.reverse_bits()), + 16 => u128::from(right.to_u16()?.reverse_bits()), + 32 => u128::from(right.to_u32()?.reverse_bits()), + 64 => u128::from(right.to_u64()?.reverse_bits()), + _ => unreachable!(), + }; + + // Perform polynomial division modulo 2. + // The algorithm for the division is an adapted version of the + // schoolbook division algorithm used for normal integer or polynomial + // division. In this context, the quotient is not calculated, since + // only the remainder is needed. + // + // The algorithm works as follows: + // 1. Pull down digits until division can be performed. In the context of division + // modulo 2 it means locating the most significant digit of the dividend and shifting + // the divisor such that the position of the divisors most significand digit and the + // dividends most significand digit match. + // 2. Perform a division and determine the remainder. Since it is arithmetic modulo 2, + // this operation is a simple bitwise exclusive or. + // 3. Repeat steps 1. and 2. until the full remainder is calculated. This is the case + // once the degree of the remainder polynomial is smaller than the degree of the + // divisor polynomial. In other words, the number of leading zeros of the remainder + // is larger than the number of leading zeros of the divisor. It is important to + // note that standard arithmetic comparison is not applicable here: + // 0b10011 / 0b11111 = 0b01100 is a valid division, even though the dividend is + // smaller than the divisor. + let mut dividend = (crc << bit_size) ^ (v << 32); + const POLYNOMIAL: u128 = 0x11EDC6F41; + while dividend.leading_zeros() <= POLYNOMIAL.leading_zeros() { + dividend ^= + (POLYNOMIAL << POLYNOMIAL.leading_zeros()) >> dividend.leading_zeros(); + } + + let result = u32::try_from(dividend).unwrap().reverse_bits(); + let result = if bit_size == 64 { + Scalar::from_u64(u64::from(result)) + } else { + Scalar::from_u32(result) + }; + + this.write_scalar(result, dest)?; + } + _ => return Ok(EmulateItemResult::NotSupported), + } + Ok(EmulateItemResult::NeedsReturn) + } +} diff --git a/src/tools/miri/tests/pass/shims/x86/intrinsics-x86-sse42.rs b/src/tools/miri/tests/pass/shims/x86/intrinsics-x86-sse42.rs new file mode 100644 index 000000000000..586caf4f280c --- /dev/null +++ b/src/tools/miri/tests/pass/shims/x86/intrinsics-x86-sse42.rs @@ -0,0 +1,423 @@ +// Ignore everything except x86 and x86_64 +// Any new targets that are added to CI should be ignored here. +// (We cannot use `cfg`-based tricks here since the `target-feature` flags below only work on x86.) +//@ignore-target-aarch64 +//@ignore-target-arm +//@ignore-target-avr +//@ignore-target-s390x +//@ignore-target-thumbv7em +//@ignore-target-wasm32 +//@compile-flags: -C target-feature=+sse4.2 + +#[cfg(target_arch = "x86")] +use std::arch::x86::*; +#[cfg(target_arch = "x86_64")] +use std::arch::x86_64::*; +use std::mem::transmute; + +fn main() { + assert!(is_x86_feature_detected!("sse4.2")); + + unsafe { + test_sse42(); + } +} + +#[target_feature(enable = "sse4.2")] +unsafe fn test_sse42() { + // Mostly copied from library/stdarch/crates/core_arch/src/x86/sse42.rs + + test_crc(); + test_cmp(); + test_str(); +} + +#[target_feature(enable = "sse4.2")] +unsafe fn test_crc() { + #[target_feature(enable = "sse4.2")] + unsafe fn test_mm_crc32_u8() { + let crc = 0x2aa1e72b; + let v = 0x2a; + let i = _mm_crc32_u8(crc, v); + assert_eq!(i, 0xf24122e4); + + let crc = 0x61343ec4; + let v = 0xef; + let i = _mm_crc32_u8(crc, v); + assert_eq!(i, 0xb95511db); + } + test_mm_crc32_u8(); + + #[target_feature(enable = "sse4.2")] + unsafe fn test_mm_crc32_u16() { + let crc = 0x8ecec3b5; + let v = 0x22b; + let i = _mm_crc32_u16(crc, v); + assert_eq!(i, 0x13bb2fb); + + let crc = 0x150bc664; + let v = 0xa6c0; + let i = _mm_crc32_u16(crc, v); + assert_eq!(i, 0xab04fe4e); + } + test_mm_crc32_u16(); + + #[target_feature(enable = "sse4.2")] + unsafe fn test_mm_crc32_u32() { + let crc = 0xae2912c8; + let v = 0x845fed; + let i = _mm_crc32_u32(crc, v); + assert_eq!(i, 0xffae2ed1); + + let crc = 0x1a198fe3; + let v = 0x885585c2; + let i = _mm_crc32_u32(crc, v); + assert_eq!(i, 0x22443a7b); + } + test_mm_crc32_u32(); + + #[cfg(target_arch = "x86_64")] + #[target_feature(enable = "sse4.2")] + unsafe fn test_mm_crc32_u64() { + let crc = 0x7819dccd3e824; + let v = 0x2a22b845fed; + let i = _mm_crc32_u64(crc, v); + assert_eq!(i, 0xbb6cdc6c); + + let crc = 0x6dd960387fe13819; + let v = 0x1a7ea8fb571746b0; + let i = _mm_crc32_u64(crc, v); + assert_eq!(i, 0x315b4f6); + } + #[cfg(not(target_arch = "x86_64"))] + unsafe fn test_mm_crc32_u64() {} + test_mm_crc32_u64(); +} + +#[target_feature(enable = "sse4.2")] +unsafe fn test_cmp() { + let a = _mm_set_epi64x(0x2a, 0); + let b = _mm_set1_epi64x(0x00); + let i = _mm_cmpgt_epi64(a, b); + assert_eq_m128i(i, _mm_set_epi64x(0xffffffffffffffffu64 as i64, 0x00)); +} + +#[target_feature(enable = "sse4.2")] +unsafe fn test_str() { + #[target_feature(enable = "sse4.2")] + unsafe fn str_to_m128i(s: &[u8]) -> __m128i { + assert!(s.len() <= 16); + let slice = &mut [0u8; 16]; + std::ptr::copy_nonoverlapping(s.as_ptr(), slice.as_mut_ptr(), s.len()); + _mm_loadu_si128(slice.as_ptr() as *const _) + } + + // Test the `_mm_cmpistrm` intrinsic. + #[target_feature(enable = "sse4.2")] + unsafe fn test_mm_cmpistrm() { + let a = str_to_m128i(b"Hello! Good-Bye!"); + let b = str_to_m128i(b"hello! good-bye!"); + let i = _mm_cmpistrm::<_SIDD_UNIT_MASK>(a, b); + #[rustfmt::skip] + let res = _mm_setr_epi8( + 0x00, !0, !0, !0, !0, !0, !0, 0x00, + !0, !0, !0, !0, 0x00, !0, !0, !0, + ); + assert_eq_m128i(i, res); + } + test_mm_cmpistrm(); + + // Test the `_mm_cmpistri` intrinsic. + #[target_feature(enable = "sse4.2")] + unsafe fn test_mm_cmpistri() { + let a = str_to_m128i(b"Hello"); + let b = str_to_m128i(b" Hello "); + let i = _mm_cmpistri::<_SIDD_CMP_EQUAL_ORDERED>(a, b); + assert_eq!(3, i); + } + test_mm_cmpistri(); + + // Test the `_mm_cmpistrz` intrinsic. + #[target_feature(enable = "sse4.2")] + unsafe fn test_mm_cmpistrz() { + let a = str_to_m128i(b""); + let b = str_to_m128i(b"Hello"); + let i = _mm_cmpistrz::<_SIDD_CMP_EQUAL_ORDERED>(a, b); + assert_eq!(1, i); + } + test_mm_cmpistrz(); + + // Test the `_mm_cmpistrc` intrinsic. + #[target_feature(enable = "sse4.2")] + unsafe fn test_mm_cmpistrc() { + let a = str_to_m128i(b" "); + let b = str_to_m128i(b" ! "); + let i = _mm_cmpistrc::<_SIDD_UNIT_MASK>(a, b); + assert_eq!(1, i); + } + test_mm_cmpistrc(); + + // Test the `_mm_cmpistrs` intrinsic. + #[target_feature(enable = "sse4.2")] + unsafe fn test_mm_cmpistrs() { + let a = str_to_m128i(b"Hello"); + let b = str_to_m128i(b""); + let i = _mm_cmpistrs::<_SIDD_CMP_EQUAL_ORDERED>(a, b); + assert_eq!(1, i); + } + test_mm_cmpistrs(); + + // Test the `_mm_cmpistro` intrinsic. + #[target_feature(enable = "sse4.2")] + unsafe fn test_mm_cmpistro() { + #[rustfmt::skip] + let a_bytes = _mm_setr_epi8( + 0x00, 0x47, 0x00, 0x65, 0x00, 0x6c, 0x00, 0x6c, + 0x00, 0x6f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + ); + #[rustfmt::skip] + let b_bytes = _mm_setr_epi8( + 0x00, 0x48, 0x00, 0x65, 0x00, 0x6c, 0x00, 0x6c, + 0x00, 0x6f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + ); + let a = a_bytes; + let b = b_bytes; + let i = _mm_cmpistro::<{ _SIDD_UWORD_OPS | _SIDD_UNIT_MASK }>(a, b); + assert_eq!(0, i); + } + test_mm_cmpistro(); + + // Test the `_mm_cmpistra` intrinsic. + #[target_feature(enable = "sse4.2")] + unsafe fn test_mm_cmpistra() { + let a = str_to_m128i(b""); + let b = str_to_m128i(b"Hello!!!!!!!!!!!"); + let i = _mm_cmpistra::<_SIDD_UNIT_MASK>(a, b); + assert_eq!(1, i); + } + test_mm_cmpistra(); + + // Test the `_mm_cmpestrm` intrinsic. + #[target_feature(enable = "sse4.2")] + unsafe fn test_mm_cmpestrm() { + let a = str_to_m128i(b"Hello!"); + let b = str_to_m128i(b"Hello."); + let i = _mm_cmpestrm::<_SIDD_UNIT_MASK>(a, 5, b, 5); + #[rustfmt::skip] + let r = _mm_setr_epi8( + !0, !0, !0, !0, !0, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + ); + assert_eq_m128i(i, r); + } + test_mm_cmpestrm(); + + // Test the `_mm_cmpestri` intrinsic. + #[target_feature(enable = "sse4.2")] + unsafe fn test_mm_cmpestri() { + let a = str_to_m128i(b"bar - garbage"); + let b = str_to_m128i(b"foobar"); + let i = _mm_cmpestri::<_SIDD_CMP_EQUAL_ORDERED>(a, 3, b, 6); + assert_eq!(3, i); + } + test_mm_cmpestri(); + + // Test the `_mm_cmpestrz` intrinsic. + #[target_feature(enable = "sse4.2")] + unsafe fn test_mm_cmpestrz() { + let a = str_to_m128i(b""); + let b = str_to_m128i(b"Hello"); + let i = _mm_cmpestrz::<_SIDD_CMP_EQUAL_ORDERED>(a, 16, b, 6); + assert_eq!(1, i); + } + test_mm_cmpestrz(); + + // Test the `_mm_cmpestrs` intrinsic. + #[target_feature(enable = "sse4.2")] + unsafe fn test_mm_cmpestrc() { + let va = str_to_m128i(b"!!!!!!!!"); + let vb = str_to_m128i(b" "); + let i = _mm_cmpestrc::<_SIDD_UNIT_MASK>(va, 7, vb, 7); + assert_eq!(0, i); + } + test_mm_cmpestrc(); + + // Test the `_mm_cmpestrs` intrinsic. + #[target_feature(enable = "sse4.2")] + unsafe fn test_mm_cmpestrs() { + #[rustfmt::skip] + let a_bytes = _mm_setr_epi8( + 0x00, 0x48, 0x00, 0x65, 0x00, 0x6c, 0x00, 0x6c, + 0x00, 0x6f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + ); + let a = a_bytes; + let b = _mm_set1_epi8(0x00); + let i = _mm_cmpestrs::<_SIDD_UWORD_OPS>(a, 8, b, 0); + assert_eq!(0, i); + } + test_mm_cmpestrs(); + + // Test the `_mm_cmpestro` intrinsic. + #[target_feature(enable = "sse4.2")] + unsafe fn test_mm_cmpestro() { + let a = str_to_m128i(b"Hello"); + let b = str_to_m128i(b"World"); + let i = _mm_cmpestro::<_SIDD_UBYTE_OPS>(a, 5, b, 5); + assert_eq!(0, i); + } + test_mm_cmpestro(); + + // Test the `_mm_cmpestra` intrinsic. + #[target_feature(enable = "sse4.2")] + unsafe fn test_mm_cmpestra() { + let a = str_to_m128i(b"Cannot match a"); + let b = str_to_m128i(b"Null after 14"); + let i = _mm_cmpestra::<{ _SIDD_CMP_EQUAL_EACH | _SIDD_UNIT_MASK }>(a, 14, b, 16); + assert_eq!(1, i); + } + test_mm_cmpestra(); + + // Additional tests not inside the standard library. + + // Test the subset functionality of the intrinsic. + unsafe fn test_subset() { + let a = str_to_m128i(b"ABCDEFG"); + let b = str_to_m128i(b"ABC UVW XYZ EFG"); + + let i = _mm_cmpistrm::<{ _SIDD_CMP_EQUAL_ANY | _SIDD_UNIT_MASK }>(a, b); + #[rustfmt::skip] + let res = _mm_setr_epi8( + !0, !0, !0, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, !0, !0, !0, 0x00, + ); + assert_eq_m128i(i, res); + } + test_subset(); + + // Properly test index generation. + unsafe fn test_index() { + let a = str_to_m128i(b"Hello"); + let b = str_to_m128i(b"Hello Hello H"); + + let i = _mm_cmpistri::<{ _SIDD_CMP_EQUAL_EACH | _SIDD_LEAST_SIGNIFICANT }>(a, b); + assert_eq!(i, 0); + + let i = _mm_cmpistri::<{ _SIDD_CMP_EQUAL_EACH | _SIDD_MOST_SIGNIFICANT }>(a, b); + assert_eq!(i, 15); + + let a = str_to_m128i(b"Hello"); + let b = str_to_m128i(b" "); + let i = _mm_cmpistri::<{ _SIDD_CMP_EQUAL_EACH | _SIDD_MOST_SIGNIFICANT }>(a, b); + assert_eq!(i, 16); + } + test_index(); + + // Properly test the substring functionality of the intrinsics. + #[target_feature(enable = "sse4.2")] + unsafe fn test_substring() { + let a = str_to_m128i(b"Hello"); + let b = str_to_m128i(b"Hello Hello H"); + + let i = _mm_cmpistrm::<{ _SIDD_CMP_EQUAL_ORDERED | _SIDD_UNIT_MASK }>(a, b); + #[rustfmt::skip] + let res = _mm_setr_epi8( + !0, 0x00, 0x00, 0x00, 0x00, 0x00, !0, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + ); + assert_eq_m128i(i, res); + } + test_substring(); + + // Test the range functionality of the intrinsics. + // Will also test signed values and word-sized values. + #[target_feature(enable = "sse4.2")] + unsafe fn test_ranges() { + let a = _mm_setr_epi16(0, 1, 7, 8, 0, 0, -100, 100); + let b = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8); + + let i = + _mm_cmpestrm::<{ _SIDD_SWORD_OPS | _SIDD_CMP_RANGES | _SIDD_UNIT_MASK }>(a, 2, b, 8); + let res = _mm_setr_epi16(!0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m128i(i, res); + + let i = + _mm_cmpestrm::<{ _SIDD_SWORD_OPS | _SIDD_CMP_RANGES | _SIDD_UNIT_MASK }>(a, 3, b, 8); + let res = _mm_setr_epi16(!0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m128i(i, res); + + let i = + _mm_cmpestrm::<{ _SIDD_SWORD_OPS | _SIDD_CMP_RANGES | _SIDD_UNIT_MASK }>(a, 4, b, 8); + let res = _mm_setr_epi16(!0, 0, 0, 0, 0, 0, !0, !0); + assert_eq_m128i(i, res); + + let i = + _mm_cmpestrm::<{ _SIDD_SWORD_OPS | _SIDD_CMP_RANGES | _SIDD_UNIT_MASK }>(a, 6, b, 8); + let res = _mm_setr_epi16(!0, 0, 0, 0, 0, 0, !0, !0); + assert_eq_m128i(i, res); + + let i = + _mm_cmpestrm::<{ _SIDD_SWORD_OPS | _SIDD_CMP_RANGES | _SIDD_UNIT_MASK }>(a, 8, b, 8); + let res = _mm_setr_epi16(!0, !0, !0, !0, !0, !0, !0, !0); + assert_eq_m128i(i, res); + } + test_ranges(); + + // Confirm that the polarity bits work as indended. + #[target_feature(enable = "sse4.2")] + unsafe fn test_polarity() { + let a = str_to_m128i(b"Hello!"); + let b = str_to_m128i(b"hello?"); + + let i = _mm_cmpistrm::< + { (_SIDD_MASKED_NEGATIVE_POLARITY ^ _SIDD_NEGATIVE_POLARITY) | _SIDD_UNIT_MASK }, + >(a, b); + #[rustfmt::skip] + let res = _mm_setr_epi8( + 0x00, !0, !0, !0, !0, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + ); + assert_eq_m128i(i, res); + + let i = _mm_cmpistrm::<{ _SIDD_MASKED_NEGATIVE_POLARITY | _SIDD_UNIT_MASK }>(a, b); + #[rustfmt::skip] + let res = _mm_setr_epi8( + !0, 0x00, 0x00, 0x00, 0x00, !0, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + ); + assert_eq_m128i(i, res); + + let i = _mm_cmpistrm::<{ _SIDD_NEGATIVE_POLARITY | _SIDD_UNIT_MASK }>(a, b); + #[rustfmt::skip] + let res = _mm_setr_epi8( + !0, 0x00, 0x00, 0x00, 0x00, !0, !0, !0, + !0, !0, !0, !0, !0, !0, !0, !0, + ); + assert_eq_m128i(i, res); + } + test_polarity(); + + // Test the code path in which the intrinsic is supposed to + // return a bit mask instead of a byte mask. + #[target_feature(enable = "sse4.2")] + unsafe fn test_bitmask() { + let a = str_to_m128i(b"Hello! Good-Bye!"); + let b = str_to_m128i(b"hello! good-bye!"); + + let i = _mm_cmpistrm::<0>(a, b); + #[rustfmt::skip] + let res = _mm_setr_epi32(0b11101111_01111110, 0, 0, 0); + assert_eq_m128i(i, res); + + let i = _mm_cmpistrm::<_SIDD_MASKED_NEGATIVE_POLARITY>(a, b); + #[rustfmt::skip] + let res = _mm_setr_epi32(0b00010000_10000001, 0, 0, 0); + assert_eq_m128i(i, res); + } + test_bitmask(); +} + +#[track_caller] +#[target_feature(enable = "sse2")] +pub unsafe fn assert_eq_m128i(a: __m128i, b: __m128i) { + assert_eq!(transmute::<_, [u64; 2]>(a), transmute::<_, [u64; 2]>(b)) +}