Update the intrinsic checker tool (#1258)

2021-12-04 13:03:30 +00:00 · 2021-12-04 13:03:30 +00:00 · 937978eeef
commit 937978eeef
parent 972030f2b2
20 changed files with 1204 additions and 794 deletions
--- a/library/stdarch/.github/workflows/main.yml
+++ b/library/stdarch/.github/workflows/main.yml
@ -116,7 +116,6 @@ jobs:
          os: ubuntu-latest
        - target: armv7-unknown-linux-gnueabihf
          os: ubuntu-latest
-          rustflags: -C target-feature=+neon
        - target: mips-unknown-linux-gnu
          os: ubuntu-latest
          norun: true
--- a/library/stdarch/ci/docker/aarch64-unknown-linux-gnu/Dockerfile
+++ b/library/stdarch/ci/docker/aarch64-unknown-linux-gnu/Dockerfile
@ -1,4 +1,4 @@
-FROM ubuntu:20.04
+FROM ubuntu:21.10
 RUN apt-get update && apt-get install -y --no-install-recommends \
  gcc \
  g++ \
@ -10,7 +10,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
  qemu-user \
  make \
  file \
-  clang-12 \
+  clang-13 \
  lld

 ENV CARGO_TARGET_AARCH64_UNKNOWN_LINUX_GNU_LINKER=aarch64-linux-gnu-gcc \
--- a/library/stdarch/ci/docker/armv7-unknown-linux-gnueabihf/Dockerfile
+++ b/library/stdarch/ci/docker/armv7-unknown-linux-gnueabihf/Dockerfile
@ -1,13 +1,17 @@
-FROM ubuntu:18.04
+FROM ubuntu:21.10
 RUN apt-get update && apt-get install -y --no-install-recommends \
  gcc \
+  g++ \
  ca-certificates \
  libc6-dev \
  gcc-arm-linux-gnueabihf \
+  g++-arm-linux-gnueabihf \
  libc6-dev-armhf-cross \
  qemu-user \
  make \
-  file
+  file \
+  clang-13 \
+  lld
 ENV CARGO_TARGET_ARMV7_UNKNOWN_LINUX_GNUEABIHF_LINKER=arm-linux-gnueabihf-gcc \
    CARGO_TARGET_ARMV7_UNKNOWN_LINUX_GNUEABIHF_RUNNER="qemu-arm -L /usr/arm-linux-gnueabihf" \
    OBJDUMP=arm-linux-gnueabihf-objdump
--- a/library/stdarch/ci/run-docker.sh
+++ b/library/stdarch/ci/run-docker.sh
@ -25,7 +25,7 @@ run() {
      --env NORUN \
      --env RUSTFLAGS \
      --env STDARCH_TEST_NORUN \
-      --volume "$(dirname "$(dirname "$(command -v cargo)")")":/cargo \
+      --volume "${HOME}/.cargo":/cargo \
      --volume "$(rustc --print sysroot)":/rust:ro \
      --volume "$(pwd)":/checkout:ro \
      --volume "$(pwd)"/target:/checkout/target \
--- a/library/stdarch/ci/run.sh
+++ b/library/stdarch/ci/run.sh
@ -37,6 +37,13 @@ case ${TARGET} in
    mips-* | mipsel-*)
 	export RUSTFLAGS="${RUSTFLAGS} -C llvm-args=-fast-isel=false"
 	;;
+    # Some of our test dependencies use the deprecated `gcc` crates which is
+    # missing a fix from https://github.com/alexcrichton/cc-rs/pull/627. Apply
+    # the workaround manually here.
+    armv7-*eabihf | thumbv7-*eabihf)
+        export RUSTFLAGS="${RUSTFLAGS} -Ctarget-feature=+neon"
+        export TARGET_CFLAGS="-mfpu=vfpv3-d16"
+        ;;
 esac

 echo "RUSTFLAGS=${RUSTFLAGS}"
@ -122,7 +129,10 @@ esac

 if [ "${TARGET}" = "aarch64-unknown-linux-gnu" ]; then
    export CPPFLAGS="-fuse-ld=lld -I/usr/aarch64-linux-gnu/include/ -I/usr/aarch64-linux-gnu/include/c++/9/aarch64-linux-gnu/"
-    cargo run ${INTRINSIC_TEST} --release --bin intrinsic-test -- crates/intrinsic-test/acle/tools/intrinsic_db/advsimd.csv --runner "${CARGO_TARGET_AARCH64_UNKNOWN_LINUX_GNU_RUNNER}" --cppcompiler "clang++-12" --skip crates/intrinsic-test/missing.txt
+    RUST_LOG=warn cargo run ${INTRINSIC_TEST} --release --bin intrinsic-test -- crates/intrinsic-test/acle/tools/intrinsic_db/advsimd.csv --runner "${CARGO_TARGET_AARCH64_UNKNOWN_LINUX_GNU_RUNNER}" --cppcompiler "clang++-13" --skip crates/intrinsic-test/missing_aarch64.txt
+elif [ "${TARGET}" = "armv7-unknown-linux-gnueabihf" ]; then
+    export CPPFLAGS="-fuse-ld=lld -I/usr/arm-linux-gnueabihf/include/ -I/usr/arm-linux-gnueabihf/include/c++/9/arm-linux-gnueabihf/"
+    RUST_LOG=warn cargo run ${INTRINSIC_TEST} --release --bin intrinsic-test -- crates/intrinsic-test/acle/tools/intrinsic_db/advsimd.csv --runner "${CARGO_TARGET_ARMV7_UNKNOWN_LINUX_GNUEABIHF_RUNNER}" --cppcompiler "clang++-13" --skip crates/intrinsic-test/missing_arm.txt --a32
 fi

 if [ "$NORUN" != "1" ] && [ "$NOSTD" != 1 ]; then
--- a/library/stdarch/crates/core_arch/src/aarch64/neon/generated.rs
+++ b/library/stdarch/crates/core_arch/src/aarch64/neon/generated.rs
@ -4455,7 +4455,7 @@ pub unsafe fn vnegq_s64(a: int64x2_t) -> int64x2_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(neg))]
 pub unsafe fn vnegd_s64(a: i64) -> i64 {
-    -a
+    a.wrapping_neg()
 }

 /// Negate
@ -5213,7 +5213,7 @@ pub unsafe fn vld2q_s64(a: *const i64) -> int64x2x2_t {
        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld2.v2i64.p0v2i64")]
        fn vld2q_s64_(ptr: *const int64x2_t) -> int64x2x2_t;
    }
-    vld2q_s64_(a.cast())
+    vld2q_s64_(a as _)
 }

 /// Load multiple 2-element structures to two registers
@ -5242,7 +5242,7 @@ pub unsafe fn vld2_f64(a: *const f64) -> float64x1x2_t {
        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld2.v1f64.p0v1f64")]
        fn vld2_f64_(ptr: *const float64x1_t) -> float64x1x2_t;
    }
-    vld2_f64_(a.cast())
+    vld2_f64_(a as _)
 }

 /// Load multiple 2-element structures to two registers
@ -5255,7 +5255,7 @@ pub unsafe fn vld2q_f64(a: *const f64) -> float64x2x2_t {
        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld2.v2f64.p0v2f64")]
        fn vld2q_f64_(ptr: *const float64x2_t) -> float64x2x2_t;
    }
-    vld2q_f64_(a.cast())
+    vld2q_f64_(a as _)
 }

 /// Load single 2-element structure and replicate to all lanes of two registers
@ -5268,7 +5268,7 @@ pub unsafe fn vld2q_dup_s64(a: *const i64) -> int64x2x2_t {
        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld2r.v2i64.p0i64")]
        fn vld2q_dup_s64_(ptr: *const i64) -> int64x2x2_t;
    }
-    vld2q_dup_s64_(a.cast())
+    vld2q_dup_s64_(a as _)
 }

 /// Load single 2-element structure and replicate to all lanes of two registers
@ -5297,7 +5297,7 @@ pub unsafe fn vld2_dup_f64(a: *const f64) -> float64x1x2_t {
        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld2r.v1f64.p0f64")]
        fn vld2_dup_f64_(ptr: *const f64) -> float64x1x2_t;
    }
-    vld2_dup_f64_(a.cast())
+    vld2_dup_f64_(a as _)
 }

 /// Load single 2-element structure and replicate to all lanes of two registers
@ -5310,7 +5310,7 @@ pub unsafe fn vld2q_dup_f64(a: *const f64) -> float64x2x2_t {
        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld2r.v2f64.p0f64")]
        fn vld2q_dup_f64_(ptr: *const f64) -> float64x2x2_t;
    }
-    vld2q_dup_f64_(a.cast())
+    vld2q_dup_f64_(a as _)
 }

 /// Load multiple 2-element structures to two registers
@ -5325,7 +5325,7 @@ pub unsafe fn vld2q_lane_s8<const LANE: i32>(a: *const i8, b: int8x16x2_t) -> in
        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld2lane.v16i8.p0i8")]
        fn vld2q_lane_s8_(a: int8x16_t, b: int8x16_t, n: i64, ptr: *const i8) -> int8x16x2_t;
    }
-    vld2q_lane_s8_(b.0, b.1, LANE as i64, a.cast())
+    vld2q_lane_s8_(b.0, b.1, LANE as i64, a as _)
 }

 /// Load multiple 2-element structures to two registers
@ -5340,7 +5340,7 @@ pub unsafe fn vld2_lane_s64<const LANE: i32>(a: *const i64, b: int64x1x2_t) -> i
        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld2lane.v1i64.p0i8")]
        fn vld2_lane_s64_(a: int64x1_t, b: int64x1_t, n: i64, ptr: *const i8) -> int64x1x2_t;
    }
-    vld2_lane_s64_(b.0, b.1, LANE as i64, a.cast())
+    vld2_lane_s64_(b.0, b.1, LANE as i64, a as _)
 }

 /// Load multiple 2-element structures to two registers
@ -5355,7 +5355,7 @@ pub unsafe fn vld2q_lane_s64<const LANE: i32>(a: *const i64, b: int64x2x2_t) ->
        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld2lane.v2i64.p0i8")]
        fn vld2q_lane_s64_(a: int64x2_t, b: int64x2_t, n: i64, ptr: *const i8) -> int64x2x2_t;
    }
-    vld2q_lane_s64_(b.0, b.1, LANE as i64, a.cast())
+    vld2q_lane_s64_(b.0, b.1, LANE as i64, a as _)
 }

 /// Load multiple 2-element structures to two registers
@ -5430,7 +5430,7 @@ pub unsafe fn vld2_lane_f64<const LANE: i32>(a: *const f64, b: float64x1x2_t) ->
        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld2lane.v1f64.p0i8")]
        fn vld2_lane_f64_(a: float64x1_t, b: float64x1_t, n: i64, ptr: *const i8) -> float64x1x2_t;
    }
-    vld2_lane_f64_(b.0, b.1, LANE as i64, a.cast())
+    vld2_lane_f64_(b.0, b.1, LANE as i64, a as _)
 }

 /// Load multiple 2-element structures to two registers
@ -5445,7 +5445,7 @@ pub unsafe fn vld2q_lane_f64<const LANE: i32>(a: *const f64, b: float64x2x2_t) -
        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld2lane.v2f64.p0i8")]
        fn vld2q_lane_f64_(a: float64x2_t, b: float64x2_t, n: i64, ptr: *const i8) -> float64x2x2_t;
    }
-    vld2q_lane_f64_(b.0, b.1, LANE as i64, a.cast())
+    vld2q_lane_f64_(b.0, b.1, LANE as i64, a as _)
 }

 /// Load multiple 3-element structures to three registers
@ -5458,7 +5458,7 @@ pub unsafe fn vld3q_s64(a: *const i64) -> int64x2x3_t {
        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld3.v2i64.p0v2i64")]
        fn vld3q_s64_(ptr: *const int64x2_t) -> int64x2x3_t;
    }
-    vld3q_s64_(a.cast())
+    vld3q_s64_(a as _)
 }

 /// Load multiple 3-element structures to three registers
@ -5487,7 +5487,7 @@ pub unsafe fn vld3_f64(a: *const f64) -> float64x1x3_t {
        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld3.v1f64.p0v1f64")]
        fn vld3_f64_(ptr: *const float64x1_t) -> float64x1x3_t;
    }
-    vld3_f64_(a.cast())
+    vld3_f64_(a as _)
 }

 /// Load multiple 3-element structures to three registers
@ -5500,7 +5500,7 @@ pub unsafe fn vld3q_f64(a: *const f64) -> float64x2x3_t {
        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld3.v2f64.p0v2f64")]
        fn vld3q_f64_(ptr: *const float64x2_t) -> float64x2x3_t;
    }
-    vld3q_f64_(a.cast())
+    vld3q_f64_(a as _)
 }

 /// Load single 3-element structure and replicate to all lanes of three registers
@ -5513,7 +5513,7 @@ pub unsafe fn vld3q_dup_s64(a: *const i64) -> int64x2x3_t {
        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld3r.v2i64.p0i64")]
        fn vld3q_dup_s64_(ptr: *const i64) -> int64x2x3_t;
    }
-    vld3q_dup_s64_(a.cast())
+    vld3q_dup_s64_(a as _)
 }

 /// Load single 3-element structure and replicate to all lanes of three registers
@ -5542,7 +5542,7 @@ pub unsafe fn vld3_dup_f64(a: *const f64) -> float64x1x3_t {
        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld3r.v1f64.p0f64")]
        fn vld3_dup_f64_(ptr: *const f64) -> float64x1x3_t;
    }
-    vld3_dup_f64_(a.cast())
+    vld3_dup_f64_(a as _)
 }

 /// Load single 3-element structure and replicate to all lanes of three registers
@ -5555,7 +5555,7 @@ pub unsafe fn vld3q_dup_f64(a: *const f64) -> float64x2x3_t {
        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld3r.v2f64.p0f64")]
        fn vld3q_dup_f64_(ptr: *const f64) -> float64x2x3_t;
    }
-    vld3q_dup_f64_(a.cast())
+    vld3q_dup_f64_(a as _)
 }

 /// Load multiple 3-element structures to two registers
@ -5570,7 +5570,7 @@ pub unsafe fn vld3q_lane_s8<const LANE: i32>(a: *const i8, b: int8x16x3_t) -> in
        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld3lane.v16i8.p0i8")]
        fn vld3q_lane_s8_(a: int8x16_t, b: int8x16_t, c: int8x16_t, n: i64, ptr: *const i8) -> int8x16x3_t;
    }
-    vld3q_lane_s8_(b.0, b.1, b.2, LANE as i64, a.cast())
+    vld3q_lane_s8_(b.0, b.1, b.2, LANE as i64, a as _)
 }

 /// Load multiple 3-element structures to two registers
@ -5585,7 +5585,7 @@ pub unsafe fn vld3_lane_s64<const LANE: i32>(a: *const i64, b: int64x1x3_t) -> i
        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld3lane.v1i64.p0i8")]
        fn vld3_lane_s64_(a: int64x1_t, b: int64x1_t, c: int64x1_t, n: i64, ptr: *const i8) -> int64x1x3_t;
    }
-    vld3_lane_s64_(b.0, b.1, b.2, LANE as i64, a.cast())
+    vld3_lane_s64_(b.0, b.1, b.2, LANE as i64, a as _)
 }

 /// Load multiple 3-element structures to two registers
@ -5600,7 +5600,7 @@ pub unsafe fn vld3q_lane_s64<const LANE: i32>(a: *const i64, b: int64x2x3_t) ->
        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld3lane.v2i64.p0i8")]
        fn vld3q_lane_s64_(a: int64x2_t, b: int64x2_t, c: int64x2_t, n: i64, ptr: *const i8) -> int64x2x3_t;
    }
-    vld3q_lane_s64_(b.0, b.1, b.2, LANE as i64, a.cast())
+    vld3q_lane_s64_(b.0, b.1, b.2, LANE as i64, a as _)
 }

 /// Load multiple 3-element structures to three registers
@ -5675,7 +5675,7 @@ pub unsafe fn vld3_lane_f64<const LANE: i32>(a: *const f64, b: float64x1x3_t) ->
        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld3lane.v1f64.p0i8")]
        fn vld3_lane_f64_(a: float64x1_t, b: float64x1_t, c: float64x1_t, n: i64, ptr: *const i8) -> float64x1x3_t;
    }
-    vld3_lane_f64_(b.0, b.1, b.2, LANE as i64, a.cast())
+    vld3_lane_f64_(b.0, b.1, b.2, LANE as i64, a as _)
 }

 /// Load multiple 3-element structures to three registers
@ -5690,7 +5690,7 @@ pub unsafe fn vld3q_lane_f64<const LANE: i32>(a: *const f64, b: float64x2x3_t) -
        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld3lane.v2f64.p0i8")]
        fn vld3q_lane_f64_(a: float64x2_t, b: float64x2_t, c: float64x2_t, n: i64, ptr: *const i8) -> float64x2x3_t;
    }
-    vld3q_lane_f64_(b.0, b.1, b.2, LANE as i64, a.cast())
+    vld3q_lane_f64_(b.0, b.1, b.2, LANE as i64, a as _)
 }

 /// Load multiple 4-element structures to four registers
@ -5703,7 +5703,7 @@ pub unsafe fn vld4q_s64(a: *const i64) -> int64x2x4_t {
        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld4.v2i64.p0v2i64")]
        fn vld4q_s64_(ptr: *const int64x2_t) -> int64x2x4_t;
    }
-    vld4q_s64_(a.cast())
+    vld4q_s64_(a as _)
 }

 /// Load multiple 4-element structures to four registers
@ -5732,7 +5732,7 @@ pub unsafe fn vld4_f64(a: *const f64) -> float64x1x4_t {
        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld4.v1f64.p0v1f64")]
        fn vld4_f64_(ptr: *const float64x1_t) -> float64x1x4_t;
    }
-    vld4_f64_(a.cast())
+    vld4_f64_(a as _)
 }

 /// Load multiple 4-element structures to four registers
@ -5745,7 +5745,7 @@ pub unsafe fn vld4q_f64(a: *const f64) -> float64x2x4_t {
        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld4.v2f64.p0v2f64")]
        fn vld4q_f64_(ptr: *const float64x2_t) -> float64x2x4_t;
    }
-    vld4q_f64_(a.cast())
+    vld4q_f64_(a as _)
 }

 /// Load single 4-element structure and replicate to all lanes of four registers
@ -5758,7 +5758,7 @@ pub unsafe fn vld4q_dup_s64(a: *const i64) -> int64x2x4_t {
        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld4r.v2i64.p0i64")]
        fn vld4q_dup_s64_(ptr: *const i64) -> int64x2x4_t;
    }
-    vld4q_dup_s64_(a.cast())
+    vld4q_dup_s64_(a as _)
 }

 /// Load single 4-element structure and replicate to all lanes of four registers
@ -5787,7 +5787,7 @@ pub unsafe fn vld4_dup_f64(a: *const f64) -> float64x1x4_t {
        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld4r.v1f64.p0f64")]
        fn vld4_dup_f64_(ptr: *const f64) -> float64x1x4_t;
    }
-    vld4_dup_f64_(a.cast())
+    vld4_dup_f64_(a as _)
 }

 /// Load single 4-element structure and replicate to all lanes of four registers
@ -5800,7 +5800,7 @@ pub unsafe fn vld4q_dup_f64(a: *const f64) -> float64x2x4_t {
        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld4r.v2f64.p0f64")]
        fn vld4q_dup_f64_(ptr: *const f64) -> float64x2x4_t;
    }
-    vld4q_dup_f64_(a.cast())
+    vld4q_dup_f64_(a as _)
 }

 /// Load multiple 4-element structures to four registers
@ -5815,7 +5815,7 @@ pub unsafe fn vld4q_lane_s8<const LANE: i32>(a: *const i8, b: int8x16x4_t) -> in
        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld4lane.v16i8.p0i8")]
        fn vld4q_lane_s8_(a: int8x16_t, b: int8x16_t, c: int8x16_t, d: int8x16_t, n: i64, ptr: *const i8) -> int8x16x4_t;
    }
-    vld4q_lane_s8_(b.0, b.1, b.2, b.3, LANE as i64, a.cast())
+    vld4q_lane_s8_(b.0, b.1, b.2, b.3, LANE as i64, a as _)
 }

 /// Load multiple 4-element structures to four registers
@ -5830,7 +5830,7 @@ pub unsafe fn vld4_lane_s64<const LANE: i32>(a: *const i64, b: int64x1x4_t) -> i
        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld4lane.v1i64.p0i8")]
        fn vld4_lane_s64_(a: int64x1_t, b: int64x1_t, c: int64x1_t, d: int64x1_t, n: i64, ptr: *const i8) -> int64x1x4_t;
    }
-    vld4_lane_s64_(b.0, b.1, b.2, b.3, LANE as i64, a.cast())
+    vld4_lane_s64_(b.0, b.1, b.2, b.3, LANE as i64, a as _)
 }

 /// Load multiple 4-element structures to four registers
@ -5845,7 +5845,7 @@ pub unsafe fn vld4q_lane_s64<const LANE: i32>(a: *const i64, b: int64x2x4_t) ->
        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld4lane.v2i64.p0i8")]
        fn vld4q_lane_s64_(a: int64x2_t, b: int64x2_t, c: int64x2_t, d: int64x2_t, n: i64, ptr: *const i8) -> int64x2x4_t;
    }
-    vld4q_lane_s64_(b.0, b.1, b.2, b.3, LANE as i64, a.cast())
+    vld4q_lane_s64_(b.0, b.1, b.2, b.3, LANE as i64, a as _)
 }

 /// Load multiple 4-element structures to four registers
@ -5920,7 +5920,7 @@ pub unsafe fn vld4_lane_f64<const LANE: i32>(a: *const f64, b: float64x1x4_t) ->
        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld4lane.v1f64.p0i8")]
        fn vld4_lane_f64_(a: float64x1_t, b: float64x1_t, c: float64x1_t, d: float64x1_t, n: i64, ptr: *const i8) -> float64x1x4_t;
    }
-    vld4_lane_f64_(b.0, b.1, b.2, b.3, LANE as i64, a.cast())
+    vld4_lane_f64_(b.0, b.1, b.2, b.3, LANE as i64, a as _)
 }

 /// Load multiple 4-element structures to four registers
@ -5935,7 +5935,7 @@ pub unsafe fn vld4q_lane_f64<const LANE: i32>(a: *const f64, b: float64x2x4_t) -
        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld4lane.v2f64.p0i8")]
        fn vld4q_lane_f64_(a: float64x2_t, b: float64x2_t, c: float64x2_t, d: float64x2_t, n: i64, ptr: *const i8) -> float64x2x4_t;
    }
-    vld4q_lane_f64_(b.0, b.1, b.2, b.3, LANE as i64, a.cast())
+    vld4q_lane_f64_(b.0, b.1, b.2, b.3, LANE as i64, a as _)
 }

 /// Store multiple single-element structures from one, two, three, or four registers
@ -6046,7 +6046,7 @@ pub unsafe fn vst2q_s64(a: *mut i64, b: int64x2x2_t) {
        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st2.v2i64.p0i8")]
        fn vst2q_s64_(a: int64x2_t, b: int64x2_t, ptr: *mut i8);
    }
-    vst2q_s64_(b.0, b.1, a.cast())
+    vst2q_s64_(b.0, b.1, a as _)
 }

 /// Store multiple 2-element structures from two registers
@ -6075,7 +6075,7 @@ pub unsafe fn vst2_f64(a: *mut f64, b: float64x1x2_t) {
        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st2.v1f64.p0i8")]
        fn vst2_f64_(a: float64x1_t, b: float64x1_t, ptr: *mut i8);
    }
-    vst2_f64_(b.0, b.1, a.cast())
+    vst2_f64_(b.0, b.1, a as _)
 }

 /// Store multiple 2-element structures from two registers
@ -6088,7 +6088,7 @@ pub unsafe fn vst2q_f64(a: *mut f64, b: float64x2x2_t) {
        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st2.v2f64.p0i8")]
        fn vst2q_f64_(a: float64x2_t, b: float64x2_t, ptr: *mut i8);
    }
-    vst2q_f64_(b.0, b.1, a.cast())
+    vst2q_f64_(b.0, b.1, a as _)
 }

 /// Store multiple 2-element structures from two registers
@ -6103,7 +6103,7 @@ pub unsafe fn vst2q_lane_s8<const LANE: i32>(a: *mut i8, b: int8x16x2_t) {
        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st2lane.v16i8.p0i8")]
        fn vst2q_lane_s8_(a: int8x16_t, b: int8x16_t, n: i64, ptr: *mut i8);
    }
-    vst2q_lane_s8_(b.0, b.1, LANE as i64, a.cast())
+    vst2q_lane_s8_(b.0, b.1, LANE as i64, a as _)
 }

 /// Store multiple 2-element structures from two registers
@ -6118,7 +6118,7 @@ pub unsafe fn vst2_lane_s64<const LANE: i32>(a: *mut i64, b: int64x1x2_t) {
        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st2lane.v1i64.p0i8")]
        fn vst2_lane_s64_(a: int64x1_t, b: int64x1_t, n: i64, ptr: *mut i8);
    }
-    vst2_lane_s64_(b.0, b.1, LANE as i64, a.cast())
+    vst2_lane_s64_(b.0, b.1, LANE as i64, a as _)
 }

 /// Store multiple 2-element structures from two registers
@ -6133,7 +6133,7 @@ pub unsafe fn vst2q_lane_s64<const LANE: i32>(a: *mut i64, b: int64x2x2_t) {
        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st2lane.v2i64.p0i8")]
        fn vst2q_lane_s64_(a: int64x2_t, b: int64x2_t, n: i64, ptr: *mut i8);
    }
-    vst2q_lane_s64_(b.0, b.1, LANE as i64, a.cast())
+    vst2q_lane_s64_(b.0, b.1, LANE as i64, a as _)
 }

 /// Store multiple 2-element structures from two registers
@ -6208,7 +6208,7 @@ pub unsafe fn vst2_lane_f64<const LANE: i32>(a: *mut f64, b: float64x1x2_t) {
        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st2lane.v1f64.p0i8")]
        fn vst2_lane_f64_(a: float64x1_t, b: float64x1_t, n: i64, ptr: *mut i8);
    }
-    vst2_lane_f64_(b.0, b.1, LANE as i64, a.cast())
+    vst2_lane_f64_(b.0, b.1, LANE as i64, a as _)
 }

 /// Store multiple 2-element structures from two registers
@ -6223,7 +6223,7 @@ pub unsafe fn vst2q_lane_f64<const LANE: i32>(a: *mut f64, b: float64x2x2_t) {
        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st2lane.v2f64.p0i8")]
        fn vst2q_lane_f64_(a: float64x2_t, b: float64x2_t, n: i64, ptr: *mut i8);
    }
-    vst2q_lane_f64_(b.0, b.1, LANE as i64, a.cast())
+    vst2q_lane_f64_(b.0, b.1, LANE as i64, a as _)
 }

 /// Store multiple 3-element structures from three registers
@ -6236,7 +6236,7 @@ pub unsafe fn vst3q_s64(a: *mut i64, b: int64x2x3_t) {
        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st3.v2i64.p0i8")]
        fn vst3q_s64_(a: int64x2_t, b: int64x2_t, c: int64x2_t, ptr: *mut i8);
    }
-    vst3q_s64_(b.0, b.1, b.2, a.cast())
+    vst3q_s64_(b.0, b.1, b.2, a as _)
 }

 /// Store multiple 3-element structures from three registers
@ -6265,7 +6265,7 @@ pub unsafe fn vst3_f64(a: *mut f64, b: float64x1x3_t) {
        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st3.v1f64.p0i8")]
        fn vst3_f64_(a: float64x1_t, b: float64x1_t, c: float64x1_t, ptr: *mut i8);
    }
-    vst3_f64_(b.0, b.1, b.2, a.cast())
+    vst3_f64_(b.0, b.1, b.2, a as _)
 }

 /// Store multiple 3-element structures from three registers
@ -6278,7 +6278,7 @@ pub unsafe fn vst3q_f64(a: *mut f64, b: float64x2x3_t) {
        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st3.v2f64.p0i8")]
        fn vst3q_f64_(a: float64x2_t, b: float64x2_t, c: float64x2_t, ptr: *mut i8);
    }
-    vst3q_f64_(b.0, b.1, b.2, a.cast())
+    vst3q_f64_(b.0, b.1, b.2, a as _)
 }

 /// Store multiple 3-element structures from three registers
@ -6293,7 +6293,7 @@ pub unsafe fn vst3q_lane_s8<const LANE: i32>(a: *mut i8, b: int8x16x3_t) {
        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st3lane.v16i8.p0i8")]
        fn vst3q_lane_s8_(a: int8x16_t, b: int8x16_t, c: int8x16_t, n: i64, ptr: *mut i8);
    }
-    vst3q_lane_s8_(b.0, b.1, b.2, LANE as i64, a.cast())
+    vst3q_lane_s8_(b.0, b.1, b.2, LANE as i64, a as _)
 }

 /// Store multiple 3-element structures from three registers
@ -6308,7 +6308,7 @@ pub unsafe fn vst3_lane_s64<const LANE: i32>(a: *mut i64, b: int64x1x3_t) {
        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st3lane.v1i64.p0i8")]
        fn vst3_lane_s64_(a: int64x1_t, b: int64x1_t, c: int64x1_t, n: i64, ptr: *mut i8);
    }
-    vst3_lane_s64_(b.0, b.1, b.2, LANE as i64, a.cast())
+    vst3_lane_s64_(b.0, b.1, b.2, LANE as i64, a as _)
 }

 /// Store multiple 3-element structures from three registers
@ -6323,7 +6323,7 @@ pub unsafe fn vst3q_lane_s64<const LANE: i32>(a: *mut i64, b: int64x2x3_t) {
        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st3lane.v2i64.p0i8")]
        fn vst3q_lane_s64_(a: int64x2_t, b: int64x2_t, c: int64x2_t, n: i64, ptr: *mut i8);
    }
-    vst3q_lane_s64_(b.0, b.1, b.2, LANE as i64, a.cast())
+    vst3q_lane_s64_(b.0, b.1, b.2, LANE as i64, a as _)
 }

 /// Store multiple 3-element structures from three registers
@ -6398,7 +6398,7 @@ pub unsafe fn vst3_lane_f64<const LANE: i32>(a: *mut f64, b: float64x1x3_t) {
        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st3lane.v1f64.p0i8")]
        fn vst3_lane_f64_(a: float64x1_t, b: float64x1_t, c: float64x1_t, n: i64, ptr: *mut i8);
    }
-    vst3_lane_f64_(b.0, b.1, b.2, LANE as i64, a.cast())
+    vst3_lane_f64_(b.0, b.1, b.2, LANE as i64, a as _)
 }

 /// Store multiple 3-element structures from three registers
@ -6413,7 +6413,7 @@ pub unsafe fn vst3q_lane_f64<const LANE: i32>(a: *mut f64, b: float64x2x3_t) {
        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st3lane.v2f64.p0i8")]
        fn vst3q_lane_f64_(a: float64x2_t, b: float64x2_t, c: float64x2_t, n: i64, ptr: *mut i8);
    }
-    vst3q_lane_f64_(b.0, b.1, b.2, LANE as i64, a.cast())
+    vst3q_lane_f64_(b.0, b.1, b.2, LANE as i64, a as _)
 }

 /// Store multiple 4-element structures from four registers
@ -6426,7 +6426,7 @@ pub unsafe fn vst4q_s64(a: *mut i64, b: int64x2x4_t) {
        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st4.v2i64.p0i8")]
        fn vst4q_s64_(a: int64x2_t, b: int64x2_t, c: int64x2_t, d: int64x2_t, ptr: *mut i8);
    }
-    vst4q_s64_(b.0, b.1, b.2, b.3, a.cast())
+    vst4q_s64_(b.0, b.1, b.2, b.3, a as _)
 }

 /// Store multiple 4-element structures from four registers
@ -6455,7 +6455,7 @@ pub unsafe fn vst4_f64(a: *mut f64, b: float64x1x4_t) {
        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st4.v1f64.p0i8")]
        fn vst4_f64_(a: float64x1_t, b: float64x1_t, c: float64x1_t, d: float64x1_t, ptr: *mut i8);
    }
-    vst4_f64_(b.0, b.1, b.2, b.3, a.cast())
+    vst4_f64_(b.0, b.1, b.2, b.3, a as _)
 }

 /// Store multiple 4-element structures from four registers
@ -6468,7 +6468,7 @@ pub unsafe fn vst4q_f64(a: *mut f64, b: float64x2x4_t) {
        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st4.v2f64.p0i8")]
        fn vst4q_f64_(a: float64x2_t, b: float64x2_t, c: float64x2_t, d: float64x2_t, ptr: *mut i8);
    }
-    vst4q_f64_(b.0, b.1, b.2, b.3, a.cast())
+    vst4q_f64_(b.0, b.1, b.2, b.3, a as _)
 }

 /// Store multiple 4-element structures from four registers
@ -6483,7 +6483,7 @@ pub unsafe fn vst4q_lane_s8<const LANE: i32>(a: *mut i8, b: int8x16x4_t) {
        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st4lane.v16i8.p0i8")]
        fn vst4q_lane_s8_(a: int8x16_t, b: int8x16_t, c: int8x16_t, d: int8x16_t, n: i64, ptr: *mut i8);
    }
-    vst4q_lane_s8_(b.0, b.1, b.2, b.3, LANE as i64, a.cast())
+    vst4q_lane_s8_(b.0, b.1, b.2, b.3, LANE as i64, a as _)
 }

 /// Store multiple 4-element structures from four registers
@ -6498,7 +6498,7 @@ pub unsafe fn vst4_lane_s64<const LANE: i32>(a: *mut i64, b: int64x1x4_t) {
        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st4lane.v1i64.p0i8")]
        fn vst4_lane_s64_(a: int64x1_t, b: int64x1_t, c: int64x1_t, d: int64x1_t, n: i64, ptr: *mut i8);
    }
-    vst4_lane_s64_(b.0, b.1, b.2, b.3, LANE as i64, a.cast())
+    vst4_lane_s64_(b.0, b.1, b.2, b.3, LANE as i64, a as _)
 }

 /// Store multiple 4-element structures from four registers
@ -6513,7 +6513,7 @@ pub unsafe fn vst4q_lane_s64<const LANE: i32>(a: *mut i64, b: int64x2x4_t) {
        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st4lane.v2i64.p0i8")]
        fn vst4q_lane_s64_(a: int64x2_t, b: int64x2_t, c: int64x2_t, d: int64x2_t, n: i64, ptr: *mut i8);
    }
-    vst4q_lane_s64_(b.0, b.1, b.2, b.3, LANE as i64, a.cast())
+    vst4q_lane_s64_(b.0, b.1, b.2, b.3, LANE as i64, a as _)
 }

 /// Store multiple 4-element structures from four registers
@ -6588,7 +6588,7 @@ pub unsafe fn vst4_lane_f64<const LANE: i32>(a: *mut f64, b: float64x1x4_t) {
        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st4lane.v1f64.p0i8")]
        fn vst4_lane_f64_(a: float64x1_t, b: float64x1_t, c: float64x1_t, d: float64x1_t, n: i64, ptr: *mut i8);
    }
-    vst4_lane_f64_(b.0, b.1, b.2, b.3, LANE as i64, a.cast())
+    vst4_lane_f64_(b.0, b.1, b.2, b.3, LANE as i64, a as _)
 }

 /// Store multiple 4-element structures from four registers
@ -6603,7 +6603,7 @@ pub unsafe fn vst4q_lane_f64<const LANE: i32>(a: *mut f64, b: float64x2x4_t) {
        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st4lane.v2f64.p0i8")]
        fn vst4q_lane_f64_(a: float64x2_t, b: float64x2_t, c: float64x2_t, d: float64x2_t, n: i64, ptr: *mut i8);
    }
-    vst4q_lane_f64_(b.0, b.1, b.2, b.3, LANE as i64, a.cast())
+    vst4q_lane_f64_(b.0, b.1, b.2, b.3, LANE as i64, a as _)
 }

 /// Multiply
@ -7512,7 +7512,7 @@ pub unsafe fn vsubq_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(nop))]
 pub unsafe fn vsubd_s64(a: i64, b: i64) -> i64 {
-    a - b
+    a.wrapping_sub(b)
 }

 /// Subtract
@ -7520,7 +7520,7 @@ pub unsafe fn vsubd_s64(a: i64, b: i64) -> i64 {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(nop))]
 pub unsafe fn vsubd_u64(a: u64, b: u64) -> u64 {
-    a - b
+    a.wrapping_sub(b)
 }

 /// Add
@ -7528,7 +7528,7 @@ pub unsafe fn vsubd_u64(a: u64, b: u64) -> u64 {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(nop))]
 pub unsafe fn vaddd_s64(a: i64, b: i64) -> i64 {
-    a + b
+    a.wrapping_add(b)
 }

 /// Add
@ -7536,7 +7536,7 @@ pub unsafe fn vaddd_s64(a: i64, b: i64) -> i64 {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(nop))]
 pub unsafe fn vaddd_u64(a: u64, b: u64) -> u64 {
-    a + b
+    a.wrapping_add(b)
 }

 /// Floating-point add across vector
@ -11536,7 +11536,7 @@ pub unsafe fn vrshrn_high_n_u64<const N: i32>(a: uint32x2_t, b: uint64x2_t) -> u
 pub unsafe fn vrsrad_n_s64<const N: i32>(a: i64, b: i64) -> i64 {
    static_assert!(N : i32 where N >= 1 && N <= 64);
    let b: i64 = vrshrd_n_s64::<N>(b);
-    a + b
+    a.wrapping_add(b)
 }

 /// Ungisned rounding shift right and accumulate.
@ -11547,7 +11547,7 @@ pub unsafe fn vrsrad_n_s64<const N: i32>(a: i64, b: i64) -> i64 {
 pub unsafe fn vrsrad_n_u64<const N: i32>(a: u64, b: u64) -> u64 {
    static_assert!(N : i32 where N >= 1 && N <= 64);
    let b: u64 = vrshrd_n_u64::<N>(b);
-    a + b
+    a.wrapping_add(b)
 }

 /// Rounding subtract returning high narrow
@ -17802,7 +17802,7 @@ mod test {
        let a: [f64; 2] = [0., 1.];
        let e: [f64; 1] = [1.];
        let mut r: [f64; 1] = [0f64; 1];
-        vst1_lane_f64::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        vst1_lane_f64::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
        assert_eq!(r, e);
    }

@ -17811,7 +17811,7 @@ mod test {
        let a: [f64; 3] = [0., 1., 2.];
        let e: [f64; 2] = [1., 0.];
        let mut r: [f64; 2] = [0f64; 2];
-        vst1q_lane_f64::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        vst1q_lane_f64::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
        assert_eq!(r, e);
    }

@ -17820,7 +17820,7 @@ mod test {
        let a: [f64; 3] = [0., 1., 2.];
        let e: [f64; 2] = [1., 2.];
        let mut r: [f64; 2] = [0f64; 2];
-        vst1_f64_x2(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        vst1_f64_x2(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
        assert_eq!(r, e);
    }

@ -17829,7 +17829,7 @@ mod test {
        let a: [f64; 5] = [0., 1., 2., 3., 4.];
        let e: [f64; 4] = [1., 2., 3., 4.];
        let mut r: [f64; 4] = [0f64; 4];
-        vst1q_f64_x2(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        vst1q_f64_x2(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
        assert_eq!(r, e);
    }

@ -17838,7 +17838,7 @@ mod test {
        let a: [f64; 4] = [0., 1., 2., 3.];
        let e: [f64; 3] = [1., 2., 3.];
        let mut r: [f64; 3] = [0f64; 3];
-        vst1_f64_x3(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        vst1_f64_x3(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
        assert_eq!(r, e);
    }

@ -17847,7 +17847,7 @@ mod test {
        let a: [f64; 7] = [0., 1., 2., 3., 4., 5., 6.];
        let e: [f64; 6] = [1., 2., 3., 4., 5., 6.];
        let mut r: [f64; 6] = [0f64; 6];
-        vst1q_f64_x3(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        vst1q_f64_x3(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
        assert_eq!(r, e);
    }

@ -17856,7 +17856,7 @@ mod test {
        let a: [f64; 5] = [0., 1., 2., 3., 4.];
        let e: [f64; 4] = [1., 2., 3., 4.];
        let mut r: [f64; 4] = [0f64; 4];
-        vst1_f64_x4(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        vst1_f64_x4(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
        assert_eq!(r, e);
    }

@ -17865,7 +17865,7 @@ mod test {
        let a: [f64; 9] = [0., 1., 2., 3., 4., 5., 6., 7., 8.];
        let e: [f64; 8] = [1., 2., 3., 4., 5., 6., 7., 8.];
        let mut r: [f64; 8] = [0f64; 8];
-        vst1q_f64_x4(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        vst1q_f64_x4(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
        assert_eq!(r, e);
    }

@ -17874,7 +17874,7 @@ mod test {
        let a: [i64; 5] = [0, 1, 2, 2, 3];
        let e: [i64; 4] = [1, 2, 2, 3];
        let mut r: [i64; 4] = [0i64; 4];
-        vst2q_s64(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        vst2q_s64(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
        assert_eq!(r, e);
    }

@ -17883,7 +17883,7 @@ mod test {
        let a: [u64; 5] = [0, 1, 2, 2, 3];
        let e: [u64; 4] = [1, 2, 2, 3];
        let mut r: [u64; 4] = [0u64; 4];
-        vst2q_u64(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        vst2q_u64(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
        assert_eq!(r, e);
    }

@ -17892,7 +17892,7 @@ mod test {
        let a: [u64; 5] = [0, 1, 2, 2, 3];
        let e: [u64; 4] = [1, 2, 2, 3];
        let mut r: [u64; 4] = [0u64; 4];
-        vst2q_p64(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        vst2q_p64(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
        assert_eq!(r, e);
    }

@ -17901,7 +17901,7 @@ mod test {
        let a: [f64; 3] = [0., 1., 2.];
        let e: [f64; 2] = [1., 2.];
        let mut r: [f64; 2] = [0f64; 2];
-        vst2_f64(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        vst2_f64(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
        assert_eq!(r, e);
    }

@ -17910,7 +17910,7 @@ mod test {
        let a: [f64; 5] = [0., 1., 2., 2., 3.];
        let e: [f64; 4] = [1., 2., 2., 3.];
        let mut r: [f64; 4] = [0f64; 4];
-        vst2q_f64(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        vst2q_f64(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
        assert_eq!(r, e);
    }

@ -17919,7 +17919,7 @@ mod test {
        let a: [i8; 33] = [0, 1, 2, 2, 3, 2, 3, 4, 5, 2, 3, 4, 5, 6, 7, 8, 9, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17];
        let e: [i8; 32] = [1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
        let mut r: [i8; 32] = [0i8; 32];
-        vst2q_lane_s8::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        vst2q_lane_s8::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
        assert_eq!(r, e);
    }

@ -17928,7 +17928,7 @@ mod test {
        let a: [i64; 3] = [0, 1, 2];
        let e: [i64; 2] = [1, 2];
        let mut r: [i64; 2] = [0i64; 2];
-        vst2_lane_s64::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        vst2_lane_s64::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
        assert_eq!(r, e);
    }

@ -17937,7 +17937,7 @@ mod test {
        let a: [i64; 5] = [0, 1, 2, 2, 3];
        let e: [i64; 4] = [1, 2, 0, 0];
        let mut r: [i64; 4] = [0i64; 4];
-        vst2q_lane_s64::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        vst2q_lane_s64::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
        assert_eq!(r, e);
    }

@ -17946,7 +17946,7 @@ mod test {
        let a: [u8; 33] = [0, 1, 2, 2, 3, 2, 3, 4, 5, 2, 3, 4, 5, 6, 7, 8, 9, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17];
        let e: [u8; 32] = [1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
        let mut r: [u8; 32] = [0u8; 32];
-        vst2q_lane_u8::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        vst2q_lane_u8::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
        assert_eq!(r, e);
    }

@ -17955,7 +17955,7 @@ mod test {
        let a: [u64; 3] = [0, 1, 2];
        let e: [u64; 2] = [1, 2];
        let mut r: [u64; 2] = [0u64; 2];
-        vst2_lane_u64::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        vst2_lane_u64::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
        assert_eq!(r, e);
    }

@ -17964,7 +17964,7 @@ mod test {
        let a: [u64; 5] = [0, 1, 2, 2, 3];
        let e: [u64; 4] = [1, 2, 0, 0];
        let mut r: [u64; 4] = [0u64; 4];
-        vst2q_lane_u64::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        vst2q_lane_u64::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
        assert_eq!(r, e);
    }

@ -17973,7 +17973,7 @@ mod test {
        let a: [u8; 33] = [0, 1, 2, 2, 3, 2, 3, 4, 5, 2, 3, 4, 5, 6, 7, 8, 9, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17];
        let e: [u8; 32] = [1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
        let mut r: [u8; 32] = [0u8; 32];
-        vst2q_lane_p8::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        vst2q_lane_p8::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
        assert_eq!(r, e);
    }

@ -17982,7 +17982,7 @@ mod test {
        let a: [u64; 3] = [0, 1, 2];
        let e: [u64; 2] = [1, 2];
        let mut r: [u64; 2] = [0u64; 2];
-        vst2_lane_p64::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        vst2_lane_p64::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
        assert_eq!(r, e);
    }

@ -17991,7 +17991,7 @@ mod test {
        let a: [u64; 5] = [0, 1, 2, 2, 3];
        let e: [u64; 4] = [1, 2, 0, 0];
        let mut r: [u64; 4] = [0u64; 4];
-        vst2q_lane_p64::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        vst2q_lane_p64::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
        assert_eq!(r, e);
    }

@ -18000,7 +18000,7 @@ mod test {
        let a: [f64; 3] = [0., 1., 2.];
        let e: [f64; 2] = [1., 2.];
        let mut r: [f64; 2] = [0f64; 2];
-        vst2_lane_f64::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        vst2_lane_f64::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
        assert_eq!(r, e);
    }

@ -18009,7 +18009,7 @@ mod test {
        let a: [f64; 5] = [0., 1., 2., 2., 3.];
        let e: [f64; 4] = [1., 2., 0., 0.];
        let mut r: [f64; 4] = [0f64; 4];
-        vst2q_lane_f64::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        vst2q_lane_f64::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
        assert_eq!(r, e);
    }

@ -18018,7 +18018,7 @@ mod test {
        let a: [i64; 7] = [0, 1, 2, 2, 4, 2, 4];
        let e: [i64; 6] = [1, 2, 2, 2, 4, 4];
        let mut r: [i64; 6] = [0i64; 6];
-        vst3q_s64(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        vst3q_s64(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
        assert_eq!(r, e);
    }

@ -18027,7 +18027,7 @@ mod test {
        let a: [u64; 7] = [0, 1, 2, 2, 4, 2, 4];
        let e: [u64; 6] = [1, 2, 2, 2, 4, 4];
        let mut r: [u64; 6] = [0u64; 6];
-        vst3q_u64(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        vst3q_u64(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
        assert_eq!(r, e);
    }

@ -18036,7 +18036,7 @@ mod test {
        let a: [u64; 7] = [0, 1, 2, 2, 4, 2, 4];
        let e: [u64; 6] = [1, 2, 2, 2, 4, 4];
        let mut r: [u64; 6] = [0u64; 6];
-        vst3q_p64(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        vst3q_p64(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
        assert_eq!(r, e);
    }

@ -18045,7 +18045,7 @@ mod test {
        let a: [f64; 4] = [0., 1., 2., 2.];
        let e: [f64; 3] = [1., 2., 2.];
        let mut r: [f64; 3] = [0f64; 3];
-        vst3_f64(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        vst3_f64(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
        assert_eq!(r, e);
    }

@ -18054,7 +18054,7 @@ mod test {
        let a: [f64; 7] = [0., 1., 2., 2., 4., 2., 4.];
        let e: [f64; 6] = [1., 2., 2., 2., 4., 4.];
        let mut r: [f64; 6] = [0f64; 6];
-        vst3q_f64(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        vst3q_f64(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
        assert_eq!(r, e);
    }

@ -18063,7 +18063,7 @@ mod test {
        let a: [i8; 49] = [0, 1, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8, 13, 14, 15, 16, 2, 4, 7, 8, 13, 14, 15, 16, 25, 26, 27, 28, 29, 30, 31, 32, 2, 4, 7, 8, 13, 14, 15, 16, 41, 42, 43, 44, 45, 46, 47, 48];
        let e: [i8; 48] = [1, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
        let mut r: [i8; 48] = [0i8; 48];
-        vst3q_lane_s8::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        vst3q_lane_s8::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
        assert_eq!(r, e);
    }

@ -18072,7 +18072,7 @@ mod test {
        let a: [i64; 4] = [0, 1, 2, 2];
        let e: [i64; 3] = [1, 2, 2];
        let mut r: [i64; 3] = [0i64; 3];
-        vst3_lane_s64::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        vst3_lane_s64::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
        assert_eq!(r, e);
    }

@ -18081,7 +18081,7 @@ mod test {
        let a: [i64; 7] = [0, 1, 2, 2, 4, 2, 4];
        let e: [i64; 6] = [1, 2, 2, 0, 0, 0];
        let mut r: [i64; 6] = [0i64; 6];
-        vst3q_lane_s64::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        vst3q_lane_s64::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
        assert_eq!(r, e);
    }

@ -18090,7 +18090,7 @@ mod test {
        let a: [u8; 49] = [0, 1, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8, 13, 14, 15, 16, 2, 4, 7, 8, 13, 14, 15, 16, 25, 26, 27, 28, 29, 30, 31, 32, 2, 4, 7, 8, 13, 14, 15, 16, 41, 42, 43, 44, 45, 46, 47, 48];
        let e: [u8; 48] = [1, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
        let mut r: [u8; 48] = [0u8; 48];
-        vst3q_lane_u8::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        vst3q_lane_u8::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
        assert_eq!(r, e);
    }

@ -18099,7 +18099,7 @@ mod test {
        let a: [u64; 4] = [0, 1, 2, 2];
        let e: [u64; 3] = [1, 2, 2];
        let mut r: [u64; 3] = [0u64; 3];
-        vst3_lane_u64::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        vst3_lane_u64::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
        assert_eq!(r, e);
    }

@ -18108,7 +18108,7 @@ mod test {
        let a: [u64; 7] = [0, 1, 2, 2, 4, 2, 4];
        let e: [u64; 6] = [1, 2, 2, 0, 0, 0];
        let mut r: [u64; 6] = [0u64; 6];
-        vst3q_lane_u64::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        vst3q_lane_u64::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
        assert_eq!(r, e);
    }

@ -18117,7 +18117,7 @@ mod test {
        let a: [u8; 49] = [0, 1, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8, 13, 14, 15, 16, 2, 4, 7, 8, 13, 14, 15, 16, 25, 26, 27, 28, 29, 30, 31, 32, 2, 4, 7, 8, 13, 14, 15, 16, 41, 42, 43, 44, 45, 46, 47, 48];
        let e: [u8; 48] = [1, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
        let mut r: [u8; 48] = [0u8; 48];
-        vst3q_lane_p8::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        vst3q_lane_p8::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
        assert_eq!(r, e);
    }

@ -18126,7 +18126,7 @@ mod test {
        let a: [u64; 4] = [0, 1, 2, 2];
        let e: [u64; 3] = [1, 2, 2];
        let mut r: [u64; 3] = [0u64; 3];
-        vst3_lane_p64::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        vst3_lane_p64::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
        assert_eq!(r, e);
    }

@ -18135,7 +18135,7 @@ mod test {
        let a: [u64; 7] = [0, 1, 2, 2, 4, 2, 4];
        let e: [u64; 6] = [1, 2, 2, 0, 0, 0];
        let mut r: [u64; 6] = [0u64; 6];
-        vst3q_lane_p64::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        vst3q_lane_p64::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
        assert_eq!(r, e);
    }

@ -18144,7 +18144,7 @@ mod test {
        let a: [f64; 4] = [0., 1., 2., 2.];
        let e: [f64; 3] = [1., 2., 2.];
        let mut r: [f64; 3] = [0f64; 3];
-        vst3_lane_f64::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        vst3_lane_f64::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
        assert_eq!(r, e);
    }

@ -18153,7 +18153,7 @@ mod test {
        let a: [f64; 7] = [0., 1., 2., 2., 3., 2., 3.];
        let e: [f64; 6] = [1., 2., 2., 0., 0., 0.];
        let mut r: [f64; 6] = [0f64; 6];
-        vst3q_lane_f64::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        vst3q_lane_f64::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
        assert_eq!(r, e);
    }

@ -18162,7 +18162,7 @@ mod test {
        let a: [i64; 9] = [0, 1, 2, 2, 6, 2, 6, 6, 8];
        let e: [i64; 8] = [1, 2, 2, 6, 2, 6, 6, 8];
        let mut r: [i64; 8] = [0i64; 8];
-        vst4q_s64(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        vst4q_s64(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
        assert_eq!(r, e);
    }

@ -18171,7 +18171,7 @@ mod test {
        let a: [u64; 9] = [0, 1, 2, 2, 6, 2, 6, 6, 8];
        let e: [u64; 8] = [1, 2, 2, 6, 2, 6, 6, 8];
        let mut r: [u64; 8] = [0u64; 8];
-        vst4q_u64(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        vst4q_u64(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
        assert_eq!(r, e);
    }

@ -18180,7 +18180,7 @@ mod test {
        let a: [u64; 9] = [0, 1, 2, 2, 6, 2, 6, 6, 8];
        let e: [u64; 8] = [1, 2, 2, 6, 2, 6, 6, 8];
        let mut r: [u64; 8] = [0u64; 8];
-        vst4q_p64(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        vst4q_p64(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
        assert_eq!(r, e);
    }

@ -18189,7 +18189,7 @@ mod test {
        let a: [f64; 5] = [0., 1., 2., 2., 6.];
        let e: [f64; 4] = [1., 2., 2., 6.];
        let mut r: [f64; 4] = [0f64; 4];
-        vst4_f64(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        vst4_f64(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
        assert_eq!(r, e);
    }

@ -18198,7 +18198,7 @@ mod test {
        let a: [f64; 9] = [0., 1., 2., 2., 6., 2., 6., 6., 8.];
        let e: [f64; 8] = [1., 2., 2., 6., 2., 6., 6., 8.];
        let mut r: [f64; 8] = [0f64; 8];
-        vst4q_f64(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        vst4q_f64(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
        assert_eq!(r, e);
    }

@ -18207,7 +18207,7 @@ mod test {
        let a: [i8; 65] = [0, 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 43, 44, 8, 16, 44, 48, 6, 8, 8, 16, 8, 16, 16, 32, 8, 16, 44, 48, 16, 32, 48, 64];
        let e: [i8; 64] = [1, 2, 2, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
        let mut r: [i8; 64] = [0i8; 64];
-        vst4q_lane_s8::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        vst4q_lane_s8::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
        assert_eq!(r, e);
    }

@ -18216,7 +18216,7 @@ mod test {
        let a: [i64; 5] = [0, 1, 2, 2, 6];
        let e: [i64; 4] = [1, 2, 2, 6];
        let mut r: [i64; 4] = [0i64; 4];
-        vst4_lane_s64::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        vst4_lane_s64::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
        assert_eq!(r, e);
    }

@ -18225,7 +18225,7 @@ mod test {
        let a: [i64; 9] = [0, 1, 2, 2, 6, 2, 6, 6, 8];
        let e: [i64; 8] = [1, 2, 2, 6, 0, 0, 0, 0];
        let mut r: [i64; 8] = [0i64; 8];
-        vst4q_lane_s64::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        vst4q_lane_s64::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
        assert_eq!(r, e);
    }

@ -18234,7 +18234,7 @@ mod test {
        let a: [u8; 65] = [0, 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 43, 44, 8, 16, 44, 48, 6, 8, 8, 16, 8, 16, 16, 32, 8, 16, 44, 48, 16, 32, 48, 64];
        let e: [u8; 64] = [1, 2, 2, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
        let mut r: [u8; 64] = [0u8; 64];
-        vst4q_lane_u8::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        vst4q_lane_u8::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
        assert_eq!(r, e);
    }

@ -18243,7 +18243,7 @@ mod test {
        let a: [u64; 5] = [0, 1, 2, 2, 6];
        let e: [u64; 4] = [1, 2, 2, 6];
        let mut r: [u64; 4] = [0u64; 4];
-        vst4_lane_u64::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        vst4_lane_u64::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
        assert_eq!(r, e);
    }

@ -18252,7 +18252,7 @@ mod test {
        let a: [u64; 9] = [0, 1, 2, 2, 6, 2, 6, 6, 8];
        let e: [u64; 8] = [1, 2, 2, 6, 0, 0, 0, 0];
        let mut r: [u64; 8] = [0u64; 8];
-        vst4q_lane_u64::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        vst4q_lane_u64::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
        assert_eq!(r, e);
    }

@ -18261,7 +18261,7 @@ mod test {
        let a: [u8; 65] = [0, 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 43, 44, 8, 16, 44, 48, 6, 8, 8, 16, 8, 16, 16, 32, 8, 16, 44, 48, 16, 32, 48, 64];
        let e: [u8; 64] = [1, 2, 2, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
        let mut r: [u8; 64] = [0u8; 64];
-        vst4q_lane_p8::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        vst4q_lane_p8::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
        assert_eq!(r, e);
    }

@ -18270,7 +18270,7 @@ mod test {
        let a: [u64; 5] = [0, 1, 2, 2, 6];
        let e: [u64; 4] = [1, 2, 2, 6];
        let mut r: [u64; 4] = [0u64; 4];
-        vst4_lane_p64::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        vst4_lane_p64::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
        assert_eq!(r, e);
    }

@ -18279,7 +18279,7 @@ mod test {
        let a: [u64; 9] = [0, 1, 2, 2, 6, 2, 6, 6, 8];
        let e: [u64; 8] = [1, 2, 2, 6, 0, 0, 0, 0];
        let mut r: [u64; 8] = [0u64; 8];
-        vst4q_lane_p64::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        vst4q_lane_p64::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
        assert_eq!(r, e);
    }

@ -18288,7 +18288,7 @@ mod test {
        let a: [f64; 5] = [0., 1., 2., 2., 6.];
        let e: [f64; 4] = [1., 2., 2., 6.];
        let mut r: [f64; 4] = [0f64; 4];
-        vst4_lane_f64::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        vst4_lane_f64::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
        assert_eq!(r, e);
    }

@ -18297,7 +18297,7 @@ mod test {
        let a: [f64; 9] = [0., 1., 2., 2., 6., 2., 6., 6., 8.];
        let e: [f64; 8] = [1., 2., 2., 6., 0., 0., 0., 0.];
        let mut r: [f64; 8] = [0f64; 8];
-        vst4q_lane_f64::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        vst4q_lane_f64::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
        assert_eq!(r, e);
    }

--- a/library/stdarch/crates/core_arch/src/aarch64/neon/mod.rs
+++ b/library/stdarch/crates/core_arch/src/aarch64/neon/mod.rs
@ -2814,7 +2814,7 @@ pub unsafe fn vshrd_n_u64<const N: i32>(a: u64) -> u64 {
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn vsrad_n_s64<const N: i32>(a: i64, b: i64) -> i64 {
    static_assert!(N : i32 where N >= 1 && N <= 64);
-    a + vshrd_n_s64::<N>(b)
+    a.wrapping_add(vshrd_n_s64::<N>(b))
 }

 /// Unsigned shift right and accumulate
@ -2824,7 +2824,7 @@ pub unsafe fn vsrad_n_s64<const N: i32>(a: i64, b: i64) -> i64 {
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn vsrad_n_u64<const N: i32>(a: u64, b: u64) -> u64 {
    static_assert!(N : i32 where N >= 1 && N <= 64);
-    a + vshrd_n_u64::<N>(b)
+    a.wrapping_add(vshrd_n_u64::<N>(b))
 }

 /// Shift Left and Insert (immediate)
--- a/library/stdarch/crates/core_arch/src/arm_shared/neon/generated.rs
+++ b/library/stdarch/crates/core_arch/src/arm_shared/neon/generated.rs
--- a/library/stdarch/crates/core_arch/src/arm_shared/neon/mod.rs
+++ b/library/stdarch/crates/core_arch/src/arm_shared/neon/mod.rs
@ -6,8 +6,7 @@ mod generated;
 pub use self::generated::*;

 use crate::{
-    convert::TryInto, core_arch::simd::*, core_arch::simd_llvm::*, hint::unreachable_unchecked,
-    mem::transmute,
+    core_arch::simd::*, core_arch::simd_llvm::*, hint::unreachable_unchecked, mem::transmute,
 };
 #[cfg(test)]
 use stdarch_test::assert_instr;
--- a/library/stdarch/crates/core_arch/src/simd.rs
+++ b/library/stdarch/crates/core_arch/src/simd.rs
@ -10,7 +10,7 @@ macro_rules! simd_ty {

        #[allow(clippy::use_self)]
        impl $id {
-            #[inline]
+            #[inline(always)]
            pub(crate) const fn new($($elem_name: $elem_ty),*) -> Self {
                $id($($elem_name),*)
            }
@ -43,12 +43,12 @@ macro_rules! simd_m_ty {

        #[allow(clippy::use_self)]
        impl $id {
-            #[inline]
+            #[inline(always)]
            const fn bool_to_internal(x: bool) -> $ety {
                [0 as $ety, !(0 as $ety)][x as usize]
            }

-            #[inline]
+            #[inline(always)]
            pub(crate) const fn new($($elem_name: bool),*) -> Self {
                $id($(Self::bool_to_internal($elem_name)),*)
            }
--- a/library/stdarch/crates/intrinsic-test/missing.txt
+++ b/library/stdarch/crates/intrinsic-test/missing.txt
@ -1,115 +0,0 @@
-vmmlaq_s32
-vmmlaq_u32
-vrnd32x_f64
-vrnd32xq_f64
-vrnd32z_f64
-vrnd32zq_f64
-vrnd64x_f64
-vrnd64z_f64
-vrnd64zq_f64
-vsm3partw1q_u32
-vsm3partw2q_u32
-vsm3tt1bq_u32
-vsm3tt2aq_u32
-vsm3tt2bq_u32
-vsm4ekeyq_u32
-vsm4eq_u32
-vsudot_lane_s32
-vsudot_laneq_s32
-vsudotq_lane_s32
-vsudotq_laneq_s32
-vusdot_lane_s32
-vusdot_laneq_s32
-vusdot_s32
-vusdotq_lane_s32
-vusdotq_laneq_s32
-vusdotq_s32
-vcls_u16
-vcls_u32
-vcls_u8
-vclsq_u16
-vclsq_u32
-vclsq_u8
-vcreate_s16
-vcreate_u16
-vpaddq_s64
-vpaddq_u64
-vqshlu_n_s16
-vqshlu_n_s32
-vqshlu_n_s64
-vqshlu_n_s8
-vqshlub_n_s8
-vqshlud_n_s64
-vqshluh_n_s16
-vqshluq_n_s16
-vqshluq_n_s32
-vqshluq_n_s64
-vqshluq_n_s8
-vqshlus_n_s32
-vrax1q_u64
-vreinterpretq_p128_f32
-vreinterpretq_p128_f64
-vreinterpretq_p128_p16
-vreinterpretq_p128_p8
-vreinterpretq_p128_s16
-vreinterpretq_p128_s32
-vreinterpretq_p128_s64
-vreinterpretq_p128_s8
-vreinterpretq_p128_u16
-vreinterpretq_p128_u32
-vreinterpretq_p128_u64
-vreinterpretq_p128_u8
-vrnd32x_f32
-vrnd32xq_f32
-vrnd32z_f32
-vrnd32zq_f32
-vrnd64x_f32
-vrnd64xq_f32
-vrnd64xq_f64
-vrnd64z_f32
-vrnd64zq_f32
-vsha512h2q_u64
-vsha512hq_u64
-vsha512su0q_u64
-vsha512su1q_u64
-vslid_n_s64
-vslid_n_u64
-vsm3ss1q_u32
-vsm3tt1aq_u32
-vsrid_n_s64
-vsrid_n_u64
-vusmmlaq_s32
-vxarq_u64
-vadd_p16
-vadd_p64
-vadd_p8
-vaddq_p16
-vaddq_p64
-vaddq_p8
-vbcaxq_s16
-vbcaxq_s32
-vbcaxq_s64
-vbcaxq_s8
-vbcaxq_u16
-vbcaxq_u32
-vbcaxq_u64
-vbcaxq_u8
-veor3q_s16
-veor3q_s32
-veor3q_s64
-veor3q_s8
-veor3q_u16
-veor3q_u32
-veor3q_u64
-veor3q_u8
-vshld_s64
-vshld_u64
-vcopyq_laneq_u8
-vcopyq_laneq_s8
-vcopyq_laneq_p8
-vcopyq_lane_u8
-vcopyq_lane_s8
-vcopyq_lane_p8
-vcopy_laneq_u8
-vcopy_laneq_s8
-vcopy_laneq_p8
--- a/library/stdarch/crates/intrinsic-test/missing_aarch64.txt
+++ b/library/stdarch/crates/intrinsic-test/missing_aarch64.txt
@ -0,0 +1,133 @@
+# Not implemented in stdarch yet
+vbfdot_f32
+vbfdot_lane_f32
+vbfdot_laneq_f32
+vbfdotq_f32
+vbfdotq_lane_f32
+vbfdotq_laneq_f32
+vbfmlalbq_f32
+vbfmlalbq_lane_f32
+vbfmlalbq_laneq_f32
+vbfmlaltq_f32
+vbfmlaltq_lane_f32
+vbfmlaltq_laneq_f32
+vbfmmlaq_f32
+vsudot_laneq_s32
+vsudot_lane_s32
+vsudotq_laneq_s32
+vsudotq_lane_s32
+vusdot_laneq_s32
+vusdot_lane_s32
+vusdotq_laneq_s32
+vusdotq_lane_s32
+vusdotq_s32
+vusdot_s32
+
+# Implemented in Clang but missing from CSV
+vcmla_f64
+vcmla_lane_f64
+vcmla_laneq_f64
+vcmlaq_lane_f64
+vcmlaq_laneq_f64
+vcmlaq_rot180_lane_f64
+vcmlaq_rot180_laneq_f64
+vcmlaq_rot270_lane_f64
+vcmlaq_rot270_laneq_f64
+vcmlaq_rot90_lane_f64
+vcmlaq_rot90_laneq_f64
+vcmla_rot180_f64
+vcmla_rot180_lane_f64
+vcmla_rot180_laneq_f64
+vcmla_rot270_f64
+vcmla_rot270_lane_f64
+vcmla_rot270_laneq_f64
+vcmla_rot90_f64
+vcmla_rot90_lane_f64
+vcmla_rot90_laneq_f64
+
+# Implemented in Clang and stdarch but missing from CSV
+vmov_n_p64
+vmovq_n_p64
+vreinterpret_f32_p64
+vreinterpret_p64_s64
+vreinterpretq_f32_p128
+vreinterpretq_f32_p64
+vreinterpretq_p128_p64
+vreinterpretq_p64_p128
+vtst_p16
+vtstq_p16
+
+# Missing from both Clang and stdarch
+vrnd32x_f64
+vrnd32xq_f64
+vrnd32z_f64
+vrnd32zq_f64
+vrnd64x_f64
+vrnd64xq_f64
+vrnd64z_f64
+vrnd64zq_f64
+
+# Takes too long to compile tests
+vcopyq_laneq_u8
+vcopyq_laneq_s8
+vcopyq_laneq_p8
+vcopyq_lane_u8
+vcopyq_lane_s8
+vcopyq_lane_p8
+vcopy_laneq_u8
+vcopy_laneq_s8
+vcopy_laneq_p8
+vcopy_lane_u8
+vcopy_lane_s8
+vcopy_lane_p8
+
+# QEMU 6.0 doesn't support these instructions
+vmmlaq_s32
+vmmlaq_u32
+vsm3partw1q_u32
+vsm3partw2q_u32
+vsm3ss1q_u32
+vsm3tt1aq_u32
+vsm3tt1bq_u32
+vsm3tt2aq_u32
+vsm3tt2bq_u32
+vsm4ekeyq_u32
+vsm4eq_u32
+vusmmlaq_s32
+
+# LLVM select error in debug builds
+vqshlu_n_s16
+vqshlu_n_s32
+vqshlu_n_s64
+vqshlu_n_s8
+vqshlub_n_s8
+vqshlud_n_s64
+vqshluh_n_s16
+vqshluq_n_s16
+vqshluq_n_s32
+vqshluq_n_s64
+vqshluq_n_s8
+vqshlus_n_s32
+
+# These tests produce a different result from C but only in debug builds of
+# stdarch. This likely both a bug in stdarch (expanding to a different LLVM
+# intrinsic) and a bug in LLVM (incorrect optimization changing the behavior of
+# integer operations).
+vqrdmlah_lane_s16
+vqrdmlah_lane_s32
+vqrdmlah_laneq_s16
+vqrdmlah_laneq_s32
+vqrdmlah_s16
+vqrdmlah_s32
+vqrdmlahh_lane_s16
+vqrdmlahh_laneq_s16
+vqrdmlahh_s16
+vqrdmlahq_lane_s16
+vqrdmlahq_lane_s32
+vqrdmlahq_laneq_s16
+vqrdmlahq_laneq_s32
+vqrdmlahq_s16
+vqrdmlahq_s32
+vqrdmlahs_lane_s32
+vqrdmlahs_laneq_s32
+vqrdmlahs_s32
--- a/library/stdarch/crates/intrinsic-test/missing_arm.txt
+++ b/library/stdarch/crates/intrinsic-test/missing_arm.txt
@ -0,0 +1,334 @@
+# Not implemented in stdarch yet
+vbfdot_f32
+vbfdot_lane_f32
+vbfdot_laneq_f32
+vbfdotq_f32
+vbfdotq_lane_f32
+vbfdotq_laneq_f32
+vbfmlalbq_f32
+vbfmlalbq_lane_f32
+vbfmlalbq_laneq_f32
+vbfmlaltq_f32
+vbfmlaltq_lane_f32
+vbfmlaltq_laneq_f32
+vbfmmlaq_f32
+vsudot_laneq_s32
+vsudot_lane_s32
+vsudotq_laneq_s32
+vsudotq_lane_s32
+vusdot_laneq_s32
+vusdot_lane_s32
+vusdotq_laneq_s32
+vusdotq_lane_s32
+vusdotq_s32
+vusdot_s32
+
+# Implemented in Clang and stdarch but missing from CSV
+vtst_p16
+vtstq_p16
+
+# QEMU 6.0 doesn't support these instructions
+vmmlaq_s32
+vmmlaq_u32
+vusmmlaq_s32
+
+# Implemented in Clang and stdarch for A64 only even though CSV claims A32 support
+__crc32d
+__crc32cd
+vaddq_p64
+vbsl_p64
+vbslq_p64
+vceq_p64
+vceqq_p64
+vceqz_p64
+vceqzq_p64
+vcombine_p64
+vcopy_lane_p64
+vcopy_laneq_p64
+vcopyq_lane_p64
+vcopyq_laneq_p64
+vcreate_p64
+vdup_lane_p64
+vdup_n_p64
+vdupq_lane_p64
+vdupq_n_p64
+vext_p64
+vextq_p64
+vget_high_p64
+vget_lane_p64
+vget_low_p64
+vgetq_lane_p64
+vmovn_high_s16
+vmovn_high_s32
+vmovn_high_s64
+vmovn_high_u16
+vmovn_high_u32
+vmovn_high_u64
+vmull_high_p64
+vmull_p64
+vreinterpret_p16_p64
+vreinterpret_p64_f32
+vreinterpret_p64_p16
+vreinterpret_p64_p8
+vreinterpret_p64_s16
+vreinterpret_p64_s32
+vreinterpret_p64_s8
+vreinterpret_p64_u16
+vreinterpret_p64_u32
+vreinterpret_p64_u64
+vreinterpret_p64_u8
+vreinterpret_p8_p64
+vreinterpretq_f64_u64
+vreinterpretq_p128_f32
+vreinterpretq_p128_p16
+vreinterpretq_p128_p8
+vreinterpretq_p128_s16
+vreinterpretq_p128_s32
+vreinterpretq_p128_s64
+vreinterpretq_p128_s8
+vreinterpretq_p128_u16
+vreinterpretq_p128_u32
+vreinterpretq_p128_u64
+vreinterpretq_p128_u8
+vreinterpretq_p16_p64
+vreinterpretq_p64_f32
+vreinterpretq_p64_p16
+vreinterpretq_p64_p8
+vreinterpretq_p64_s16
+vreinterpretq_p64_s32
+vreinterpretq_p64_s64
+vreinterpretq_p64_s8
+vreinterpretq_p64_u16
+vreinterpretq_p64_u32
+vreinterpretq_p64_u64
+vreinterpretq_p64_u8
+vreinterpretq_p8_p64
+vreinterpretq_s16_p64
+vreinterpretq_s32_p64
+vreinterpretq_s64_p64
+vreinterpretq_s8_p64
+vreinterpretq_u16_p64
+vreinterpretq_u32_p64
+vreinterpretq_u64_p64
+vreinterpretq_u8_p64
+vreinterpret_s16_p64
+vreinterpret_s32_p64
+vreinterpret_s64_p64
+vreinterpret_s8_p64
+vreinterpret_u16_p64
+vreinterpret_u32_p64
+vreinterpret_u64_p64
+vreinterpret_u8_p64
+vrndn_f64
+vrndnq_f64
+vset_lane_p64
+vsetq_lane_p64
+vsli_n_p64
+vsliq_n_p64
+vsri_n_p64
+vsriq_n_p64
+vtst_p64
+vtstq_p64
+
+# Present in Clang header but triggers an ICE due to lack of backend support.
+vcmla_f32
+vcmla_lane_f32
+vcmla_laneq_f32
+vcmla_rot180_f32
+vcmla_rot180_lane_f32
+vcmla_rot180_laneq_f32
+vcmla_rot270_f32
+vcmla_rot270_lane_f32
+vcmla_rot270_laneq_f32
+vcmla_rot90_f32
+vcmla_rot90_lane_f32
+vcmla_rot90_laneq_f32
+vcmlaq_f32
+vcmlaq_lane_f32
+vcmlaq_laneq_f32
+vcmlaq_rot180_f32
+vcmlaq_rot180_lane_f32
+vcmlaq_rot180_laneq_f32
+vcmlaq_rot270_f32
+vcmlaq_rot270_lane_f32
+vcmlaq_rot270_laneq_f32
+vcmlaq_rot90_f32
+vcmlaq_rot90_lane_f32
+vcmlaq_rot90_laneq_f32
+
+# Implemented in stdarch for A64 only, Clang support both A32/A64
+vadd_s64
+vadd_u64
+vcaddq_rot270_f32
+vcaddq_rot90_f32
+vcadd_rot270_f32
+vcadd_rot90_f32
+vcombine_f32
+vcombine_p16
+vcombine_p8
+vcombine_s16
+vcombine_s32
+vcombine_s64
+vcombine_s8
+vcombine_u16
+vcombine_u32
+vcombine_u64
+vcombine_u8
+vcvtaq_s32_f32
+vcvtaq_u32_f32
+vcvta_s32_f32
+vcvta_u32_f32
+vcvtmq_s32_f32
+vcvtmq_u32_f32
+vcvtm_s32_f32
+vcvtm_u32_f32
+vcvtnq_s32_f32
+vcvtnq_u32_f32
+vcvtn_s32_f32
+vcvtn_u32_f32
+vcvtpq_s32_f32
+vcvtpq_u32_f32
+vcvtp_s32_f32
+vcvtp_u32_f32
+vdot_lane_s32
+vdot_lane_u32
+vdotq_lane_s32
+vdotq_lane_u32
+vdotq_s32
+vdotq_u32
+vdot_s32
+vdot_u32
+vqdmulh_lane_s16
+vqdmulh_lane_s32
+vqdmulhq_lane_s16
+vqdmulhq_lane_s32
+vrnda_f32
+vrnda_f32
+vrndaq_f32
+vrndaq_f32
+vrnd_f32
+vrnd_f32
+vrndi_f32
+vrndi_f32
+vrndiq_f32
+vrndiq_f32
+vrndm_f32
+vrndm_f32
+vrndmq_f32
+vrndmq_f32
+vrndns_f32
+vrndp_f32
+vrndpq_f32
+vrndq_f32
+vrndq_f32
+vrndx_f32
+vrndxq_f32
+
+# LLVM select error in debug builds
+vqrshrn_n_s16
+vqrshrn_n_s32
+vqrshrn_n_s64
+vqrshrn_n_u16
+vqrshrn_n_u32
+vqrshrn_n_u64
+vqrshrun_n_s16
+vqrshrun_n_s32
+vqrshrun_n_s64
+vqshrn_n_s16
+vqshrn_n_s32
+vqshrn_n_s64
+vqshrn_n_u16
+vqshrn_n_u32
+vqshrn_n_u64
+vqshrun_n_s16
+vqshrun_n_s32
+vqshrun_n_s64
+vrshrn_n_s16
+vrshrn_n_s32
+vrshrn_n_s64
+vrshrn_n_u16
+vrshrn_n_u32
+vrshrn_n_u64
+vshrq_n_u64
+vshr_n_u64
+
+# Failing tests: stdarch has incorrect results compared to Clang
+vqshlu_n_s16
+vqshlu_n_s32
+vqshlu_n_s64
+vqshlu_n_s8
+vqshluq_n_s16
+vqshluq_n_s32
+vqshluq_n_s64
+vqshluq_n_s8
+vsli_n_p16
+vsli_n_p8
+vsli_n_s16
+vsli_n_s32
+vsli_n_s64
+vsli_n_s8
+vsli_n_u16
+vsli_n_u32
+vsli_n_u64
+vsli_n_u8
+vsliq_n_p16
+vsliq_n_p8
+vsliq_n_s16
+vsliq_n_s32
+vsliq_n_s64
+vsliq_n_s8
+vsliq_n_u16
+vsliq_n_u32
+vsliq_n_u64
+vsliq_n_u8
+vsri_n_p16
+vsri_n_p8
+vsri_n_s16
+vsri_n_s32
+vsri_n_s64
+vsri_n_s8
+vsri_n_u16
+vsri_n_u32
+vsri_n_u64
+vsri_n_u8
+vsriq_n_p16
+vsriq_n_p8
+vsriq_n_s16
+vsriq_n_s32
+vsriq_n_s64
+vsriq_n_s8
+vsriq_n_u16
+vsriq_n_u32
+vsriq_n_u64
+vsriq_n_u8
+
+# These produce a different result on Clang depending on the optimization level.
+# This is definitely a bug in LLVM.
+vadd_f32
+vaddq_f32
+vcvt_s32_f32
+vcvt_u32_f32
+vcvtq_s32_f32
+vcvtq_u32_f32
+vfma_f32
+vfma_n_f32
+vfmaq_f32
+vfmaq_n_f32
+vfms_f32
+vfmsq_f32
+vmla_f32
+vmla_lane_f32
+vmla_n_f32
+vmlaq_f32
+vmlaq_lane_f32
+vmlaq_n_f32
+vmls_f32
+vmls_lane_f32
+vmls_n_f32
+vmlsq_f32
+vmlsq_lane_f32
+vmlsq_n_f32
+vmul_lane_f32
+vmul_n_f32
+vmulq_lane_f32
+vmulq_n_f32
--- a/library/stdarch/crates/intrinsic-test/src/acle_csv_parser.rs
+++ b/library/stdarch/crates/intrinsic-test/src/acle_csv_parser.rs
@ -82,11 +82,17 @@ impl Into<Intrinsic> for ACLEIntrinsicLine {
            })
            .collect();
        let arguments = ArgumentList { args };
+        let a64_only = match &*self.supported_architectures {
+            "A64" => true,
+            "v7/A32/A64" | "A32/A64" => false,
+            _ => panic!("Invalid supported architectures"),
+        };

        Intrinsic {
            name: name.to_string(),
            arguments,
            results,
+            a64_only,
        }
    }
 }
--- a/library/stdarch/crates/intrinsic-test/src/intrinsic.rs
+++ b/library/stdarch/crates/intrinsic-test/src/intrinsic.rs
@ -13,6 +13,9 @@ pub struct Intrinsic {

    /// The return type of this intrinsic.
    pub results: IntrinsicType,
+
+    /// Whether this intrinsic is only available on A64.
+    pub a64_only: bool,
 }

 impl Intrinsic {
--- a/library/stdarch/crates/intrinsic-test/src/main.rs
+++ b/library/stdarch/crates/intrinsic-test/src/main.rs
@ -72,12 +72,15 @@ fn generate_c_program(header_files: &[&str], intrinsic: &Intrinsic) -> String {
 #include <cstring>
 #include <iomanip>
 #include <sstream>
+
 template<typename T1, typename T2> T1 cast(T2 x) {{
  static_assert(sizeof(T1) == sizeof(T2), "sizeof T1 and T2 must be the same");
  T1 ret = 0;
  memcpy(&ret, &x, sizeof(T1));
  return ret;
 }}
+
+#ifdef __aarch64__
 std::ostream& operator<<(std::ostream& os, poly128_t value) {{
  std::stringstream temp;
  do {{
@ -90,6 +93,8 @@ std::ostream& operator<<(std::ostream& os, poly128_t value) {{
  os << res;
  return os;
 }}
+#endif
+
 int main(int argc, char **argv) {{
 {passes}
    return 0;
@ -133,7 +138,7 @@ fn gen_code_rust(intrinsic: &Intrinsic, constraints: &[&Argument], name: String)
    }
 }

-fn generate_rust_program(intrinsic: &Intrinsic) -> String {
+fn generate_rust_program(intrinsic: &Intrinsic, a32: bool) -> String {
    let constraints = intrinsic
        .arguments
        .iter()
@ -146,25 +151,26 @@ fn generate_rust_program(intrinsic: &Intrinsic) -> String {
 #![feature(stdsimd)]
 #![allow(overflowing_literals)]
 #![allow(non_upper_case_globals)]
-use core_arch::arch::aarch64::*;
+use core_arch::arch::{target_arch}::*;

 fn main() {{
 {passes}
 }}
 "#,
+        target_arch = if a32 { "arm" } else { "aarch64" },
        passes = gen_code_rust(intrinsic, &constraints, Default::default())
    )
 }

-fn compile_c(c_filename: &str, intrinsic: &Intrinsic, compiler: &str) -> bool {
+fn compile_c(c_filename: &str, intrinsic: &Intrinsic, compiler: &str, a32: bool) -> bool {
    let flags = std::env::var("CPPFLAGS").unwrap_or("".into());

    let output = Command::new("sh")
        .arg("-c")
        .arg(format!(
            "{cpp} {cppflags} {arch_flags} -Wno-narrowing -O2 -target {target} -o c_programs/{intrinsic} {filename}",
-            target = "aarch64-unknown-linux-gnu",
-            arch_flags = "-march=armv8.6-a+crypto+sha3+crc+dotprod",
+            target = if a32 { "armv7-unknown-linux-gnueabihf" } else { "aarch64-unknown-linux-gnu" },
+            arch_flags = if a32 { "-march=armv8.6-a+crypto+crc+dotprod" } else { "-march=armv8.6-a+crypto+sha3+crc+dotprod" },
            filename = c_filename,
            intrinsic = intrinsic.name,
            cpp = compiler,
@ -175,19 +181,13 @@ fn compile_c(c_filename: &str, intrinsic: &Intrinsic, compiler: &str) -> bool {
        if output.status.success() {
            true
        } else {
-            let stderr = std::str::from_utf8(&output.stderr).unwrap_or("");
-            if stderr.contains("error: use of undeclared identifier") {
-                warn!("Skipping intrinsic due to no support: {}", intrinsic.name);
-                true
-            } else {
-                error!(
-                    "Failed to compile code for intrinsic: {}\n\nstdout:\n{}\n\nstderr:\n{}",
-                    intrinsic.name,
-                    std::str::from_utf8(&output.stdout).unwrap_or(""),
-                    std::str::from_utf8(&output.stderr).unwrap_or("")
-                );
-                false
-            }
+            error!(
+                "Failed to compile code for intrinsic: {}\n\nstdout:\n{}\n\nstderr:\n{}",
+                intrinsic.name,
+                std::str::from_utf8(&output.stdout).unwrap_or(""),
+                std::str::from_utf8(&output.stderr).unwrap_or("")
+            );
+            false
        }
    } else {
        error!("Command failed: {:#?}", output);
@ -195,7 +195,7 @@ fn compile_c(c_filename: &str, intrinsic: &Intrinsic, compiler: &str) -> bool {
    }
 }

-fn build_c(intrinsics: &Vec<Intrinsic>, compiler: &str) -> bool {
+fn build_c(intrinsics: &Vec<Intrinsic>, compiler: &str, a32: bool) -> bool {
    let _ = std::fs::create_dir("c_programs");
    intrinsics
        .par_iter()
@ -205,20 +205,20 @@ fn build_c(intrinsics: &Vec<Intrinsic>, compiler: &str) -> bool {

            let c_code = generate_c_program(&["arm_neon.h", "arm_acle.h"], &i);
            file.write_all(c_code.into_bytes().as_slice()).unwrap();
-            compile_c(&c_filename, &i, compiler)
+            compile_c(&c_filename, &i, compiler, a32)
        })
        .find_any(|x| !x)
        .is_none()
 }

-fn build_rust(intrinsics: &Vec<Intrinsic>, toolchain: &str) -> bool {
+fn build_rust(intrinsics: &Vec<Intrinsic>, toolchain: &str, a32: bool) -> bool {
    intrinsics.iter().for_each(|i| {
        let rust_dir = format!(r#"rust_programs/{}"#, i.name);
        let _ = std::fs::create_dir_all(&rust_dir);
        let rust_filename = format!(r#"{}/main.rs"#, rust_dir);
        let mut file = File::create(&rust_filename).unwrap();

-        let c_code = generate_rust_program(&i);
+        let c_code = generate_rust_program(&i, a32);
        file.write_all(c_code.into_bytes().as_slice()).unwrap();
    });

@ -259,10 +259,15 @@ path = "{intrinsic}/main.rs""#,
        .current_dir("rust_programs")
        .arg("-c")
        .arg(format!(
-            "cargo {toolchain} build --release --target {target}",
+            "cargo {toolchain} build --target {target}",
            toolchain = toolchain,
-            target = "aarch64-unknown-linux-gnu",
+            target = if a32 {
+                "armv7-unknown-linux-gnueabihf"
+            } else {
+                "aarch64-unknown-linux-gnu"
+            },
        ))
+        .env("RUSTFLAGS", "-Cdebuginfo=0")
        .output();
    if let Ok(output) = output {
        if output.status.success() {
@ -317,6 +322,12 @@ fn main() {
                .long("skip")
                .help("Filename for a list of intrinsics to skip (one per line)"),
        )
+        .arg(
+            Arg::with_name("A32")
+                .takes_value(false)
+                .long("a32")
+                .help("Run tests for A32 instrinsics instead of A64"),
+        )
        .get_matches();

    let filename = matches.value_of("INPUT").unwrap();
@ -328,10 +339,15 @@ fn main() {
    let c_runner = matches.value_of("RUNNER").unwrap_or("");
    let skip = if let Some(filename) = matches.value_of("SKIP") {
        let data = std::fs::read_to_string(&filename).expect("Failed to open file");
-        data.lines().map(String::from).collect_vec()
+        data.lines()
+            .map(str::trim)
+            .filter(|s| !s.contains('#'))
+            .map(String::from)
+            .collect_vec()
    } else {
        Default::default()
    };
+    let a32 = matches.is_present("A32");

    let intrinsics = get_acle_intrinsics(filename);

@ -352,18 +368,19 @@ fn main() {
        .filter(|i| !i.arguments.iter().any(|a| a.is_ptr()))
        .filter(|i| !i.arguments.iter().any(|a| a.ty.inner_size() == 128))
        .filter(|i| !skip.contains(&i.name))
+        .filter(|i| !(a32 && i.a64_only))
        .collect::<Vec<_>>();
    intrinsics.dedup();

-    if !build_c(&intrinsics, cpp_compiler) {
+    if !build_c(&intrinsics, cpp_compiler, a32) {
        std::process::exit(2);
    }

-    if !build_rust(&intrinsics, &toolchain) {
+    if !build_rust(&intrinsics, &toolchain, a32) {
        std::process::exit(3);
    }

-    if !compare_outputs(&intrinsics, &toolchain, &c_runner) {
+    if !compare_outputs(&intrinsics, &toolchain, &c_runner, a32) {
        std::process::exit(1)
    }
 }
@ -374,7 +391,7 @@ enum FailureReason {
    Difference(String, String, String),
 }

-fn compare_outputs(intrinsics: &Vec<Intrinsic>, toolchain: &str, runner: &str) -> bool {
+fn compare_outputs(intrinsics: &Vec<Intrinsic>, toolchain: &str, runner: &str, a32: bool) -> bool {
    let intrinsics = intrinsics
        .par_iter()
        .filter_map(|intrinsic| {
@ -390,11 +407,16 @@ fn compare_outputs(intrinsics: &Vec<Intrinsic>, toolchain: &str, runner: &str) -
                .current_dir("rust_programs")
                .arg("-c")
                .arg(format!(
-                    "cargo {toolchain} run --release --target {target} --bin {intrinsic}",
+                    "cargo {toolchain} run --target {target} --bin {intrinsic}",
                    intrinsic = intrinsic.name,
                    toolchain = toolchain,
-                    target = "aarch64-unknown-linux-gnu",
+                    target = if a32 {
+                        "armv7-unknown-linux-gnueabihf"
+                    } else {
+                        "aarch64-unknown-linux-gnu"
+                    },
                ))
+                .env("RUSTFLAGS", "-Cdebuginfo=0")
                .output();

            let (c, rust) = match (c, rust) {
--- a/library/stdarch/crates/intrinsic-test/src/types.rs
+++ b/library/stdarch/crates/intrinsic-test/src/types.rs
@ -258,6 +258,9 @@ impl IntrinsicType {
    /// This is required for 8 bit types due to printing as the 8 bit types use
    /// a char and when using that in `std::cout` it will print as a character,
    /// which means value of 0 will be printed as a null byte.
+    ///
+    /// This is also needed for polynomial types because we want them to be
+    /// printed as unsigned integers to match Rust's `Debug` impl.
    pub fn c_promotion(&self) -> &str {
        match *self {
            IntrinsicType::Type {
@ -267,9 +270,21 @@ impl IntrinsicType {
            } if bit_len == 8 => match kind {
                TypeKind::Int => "(int)",
                TypeKind::UInt => "(unsigned int)",
-                TypeKind::Poly => "(unsigned int)",
+                TypeKind::Poly => "(unsigned int)(uint8_t)",
                _ => "",
            },
+            IntrinsicType::Type {
+                kind: TypeKind::Poly,
+                bit_len: Some(bit_len),
+                ..
+            } => match bit_len {
+                8 => unreachable!("handled above"),
+                16 => "(uint16_t)",
+                32 => "(uint32_t)",
+                64 => "(uint64_t)",
+                128 => "",
+                _ => panic!("invalid bit_len"),
+            },
            _ => "",
        }
    }
--- a/library/stdarch/crates/stdarch-gen/neon.spec
+++ b/library/stdarch/crates/stdarch-gen/neon.spec
@ -896,7 +896,7 @@ validate BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1,

 arm = vcls
 aarch64 = cls
-generate uint*_t
+generate uint8x8_t:int8x8_t, uint8x16_t:int8x16_t, uint16x4_t:int16x4_t, uint16x8_t:int16x8_t, uint32x2_t:int32x2_t, uint32x4_t:int32x4_t

 /// Count leading zero bits
 name = vclz
@ -2058,7 +2058,7 @@ generate int*_t

 /// Negate
 name = vneg
-multi_fn = -a
+multi_fn = a.wrapping_neg()
 a = 1
 validate -1

@ -4055,7 +4055,7 @@ generate float*_t

 /// Subtract
 name = vsub
-multi_fn = a - b
+multi_fn = a.wrapping_sub(b)
 a = 3
 b = 2
 validate 1
@ -4065,7 +4065,7 @@ generate i64, u64

 /// Add
 name = vadd
-multi_fn = a + b
+multi_fn = a.wrapping_add(b)
 a = 1
 b = 2
 validate 3
@ -5894,7 +5894,7 @@ name = vqshl
 n-suffix
 constn = N
 multi_fn = static_assert_imm-out_bits_exp_len-N
-multi_fn = vqshl-self-noext, a, {vdup-nself-noext, N.try_into().unwrap()}
+multi_fn = vqshl-self-noext, a, {vdup-nself-noext, N as _}
 a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
 n = 2
 validate 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60
@ -5921,7 +5921,7 @@ name = vqshl
 n-suffix
 constn = N
 multi_fn = static_assert_imm-out_bits_exp_len-N
-multi_fn = vqshl-self-noext, a, {vdup-nsigned-noext, N.try_into().unwrap()}
+multi_fn = vqshl-self-noext, a, {vdup-nsigned-noext, N as _}
 a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
 n = 2
 validate 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60
@ -6480,7 +6480,7 @@ name = vrshr
 n-suffix
 constn = N
 multi_fn = static_assert-N-1-bits
-multi_fn = vrshl-self-noext, a, {vdup-nself-noext, (-N).try_into().unwrap()}
+multi_fn = vrshl-self-noext, a, {vdup-nself-noext, (-N) as _}
 a = 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64
 n = 2
 validate 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
@ -6507,7 +6507,7 @@ name = vrshr
 n-suffix
 constn = N
 multi_fn = static_assert-N-1-bits
-multi_fn = vrshl-self-noext, a, {vdup-nsigned-noext, (-N).try_into().unwrap()}
+multi_fn = vrshl-self-noext, a, {vdup-nsigned-noext, (-N) as _}
 a = 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64
 n = 2
 validate 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
@ -6613,7 +6613,7 @@ n-suffix
 constn = N
 multi_fn = static_assert-N-1-bits
 multi_fn = vrshr-nself-::<N>, b:in_t, b
-multi_fn = a + b
+multi_fn = a.wrapping_add(b)
 a = 1
 b = 4
 n = 2
@ -6628,7 +6628,7 @@ n-suffix
 constn = N
 multi_fn = static_assert-N-1-bits
 multi_fn = vrshr-nself-::<N>, b:in_t, b
-multi_fn = a + b
+multi_fn = a.wrapping_add(b)
 a = 1
 b = 4
 n = 2
@ -6804,7 +6804,7 @@ name = vshl
 n-suffix
 constn = N
 multi_fn = static_assert_imm-out_bits_exp_len-N
-multi_fn = simd_shl, a, {vdup-nself-noext, N.try_into().unwrap()}
+multi_fn = simd_shl, a, {vdup-nself-noext, N as _}
 a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
 n = 2
 validate 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64
@ -6818,7 +6818,7 @@ name = vshll
 n-suffix
 constn = N
 multi_fn = static_assert-N-0-bits
-multi_fn = simd_shl, {simd_cast, a}, {vdup-nout-noext, N.try_into().unwrap()}
+multi_fn = simd_shl, {simd_cast, a}, {vdup-nout-noext, N as _}
 a = 1, 2, 3, 4, 5, 6, 7, 8
 n = 2
 validate 4, 8, 12, 16, 20, 24, 28, 32
@ -6851,7 +6851,7 @@ n-suffix
 constn = N
 multi_fn = static_assert-N-1-bits
 multi_fn = fix_right_shift_imm-N-bits
-multi_fn = simd_shr, a, {vdup-nself-noext, n.try_into().unwrap()}
+multi_fn = simd_shr, a, {vdup-nself-noext, n as _}
 a = 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64
 n = 2
 validate 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
@ -6867,7 +6867,7 @@ name = vshrn_n
 no-q
 constn = N
 multi_fn = static_assert-N-1-halfbits
-multi_fn = simd_cast, {simd_shr, a, {vdup-nself-noext, N.try_into().unwrap()}}
+multi_fn = simd_cast, {simd_shr, a, {vdup-nself-noext, N as _}}
 a = 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64
 n = 2
 validate 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
--- a/library/stdarch/crates/stdarch-gen/src/main.rs
+++ b/library/stdarch/crates/stdarch-gen/src/main.rs
@ -1304,7 +1304,7 @@ fn gen_aarch64(
                };
                format!(
                    r#"{}
-    {}{}({}, {} as i64, a.cast())"#,
+    {}{}({}, {} as i64, a as _)"#,
                    multi_calls,
                    ext_c,
                    current_fn,
@ -1327,7 +1327,7 @@ fn gen_aarch64(
                }
            }
        } else if link_aarch64.is_some() && matches!(fn_type, Fntype::Store) {
-            let cast = if is_vstx(&name) { ".cast()" } else { "" };
+            let cast = if is_vstx(&name) { " as _" } else { "" };
            match type_sub_len(in_t[1]) {
                1 => format!(r#"{}{}(b, a{})"#, ext_c, current_fn, cast),
                2 => format!(r#"{}{}(b.0, b.1, a{})"#, ext_c, current_fn, cast),
@ -1336,7 +1336,7 @@ fn gen_aarch64(
                _ => panic!("unsupported type: {}", in_t[1]),
            }
        } else if link_aarch64.is_some() && is_vldx(&name) {
-            format!(r#"{}{}(a.cast())"#, ext_c, current_fn,)
+            format!(r#"{}{}(a as _)"#, ext_c, current_fn,)
        } else {
            let trans: [&str; 2] = if link_t[3] != out_t {
                ["transmute(", ")"]
@ -1553,7 +1553,7 @@ fn gen_store_test(
        let a: [{}; {}] = {};
        let e: [{}; {}] = {};
        let mut r: [{}; {}] = [0{}; {}];
-        {}{}(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        {}{}(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr() as _));
        assert_eq!(r, e);
 "#,
            type_to_native_type(in_t[1]),
@ -2196,7 +2196,7 @@ fn gen_arm(
                        _ => "",
                    };
                    format!(
-                        "{}(a.cast(), {}, {}, {})",
+                        "{}(a as _, {}, {}, {})",
                        current_fn,
                        subs,
                        constn.as_deref().unwrap(),
@ -2235,7 +2235,7 @@ fn gen_arm(
            } else if matches!(fn_type, Fntype::Store) {
                let (cast, size) = if is_vstx(&name) {
                    (
-                        ".cast()",
+                        " as _",
                        format!(", {}", type_bits(&type_to_sub_type(in_t[1])) / 8),
                    )
                } else {
@ -2276,7 +2276,7 @@ fn gen_arm(
                            _ => "",
                        };
                        format!(
-                            "{}({}, {} as i64, a.cast())",
+                            "{}({}, {} as i64, a as _)",
                            current_fn,
                            subs,
                            constn.as_deref().unwrap()
@ -2307,7 +2307,7 @@ fn gen_arm(
                        _ => String::new(),
                    }
                } else if matches!(fn_type, Fntype::Store) {
-                    let cast = if is_vstx(&name) { ".cast()" } else { "" };
+                    let cast = if is_vstx(&name) { " as _" } else { "" };
                    match type_sub_len(in_t[1]) {
                        1 => format!("{}(b, a{})", current_fn, cast),
                        2 => format!("{}(b.0, b.1, a{})", current_fn, cast),
@ -2316,7 +2316,7 @@ fn gen_arm(
                        _ => String::new(),
                    }
                } else if link_aarch64.is_some() && is_vldx(&name) {
-                    format!("{}(a.cast())", current_fn)
+                    format!("{}(a as _)", current_fn)
                } else {
                    String::new()
                };
--- a/library/stdarch/rustfmt.toml
+++ b/library/stdarch/rustfmt.toml