Implement RISC-V Zk extension intrinsics

2023-08-06 15:41:34 +02:00 · 2023-08-06 15:41:34 +02:00 · f0be271de9
commit f0be271de9
parent e301d8bba4
7 changed files with 1458 additions and 171 deletions
--- a/library/stdarch/crates/core_arch/src/mod.rs
+++ b/library/stdarch/crates/core_arch/src/mod.rs
@ -66,6 +66,7 @@ pub mod arch {
    #[doc(cfg(any(target_arch = "riscv32")))]
    #[unstable(feature = "riscv_ext_intrinsics", issue = "114544")]
    pub mod riscv32 {
+        pub use crate::core_arch::riscv32::*;
        pub use crate::core_arch::riscv_shared::*;
    }

@ -279,6 +280,10 @@ mod aarch64;
 #[doc(cfg(any(target_arch = "arm")))]
 mod arm;

+#[cfg(any(target_arch = "riscv32", doc))]
+#[doc(cfg(any(target_arch = "riscv32")))]
+mod riscv32;
+
 #[cfg(any(target_arch = "riscv64", doc))]
 #[doc(cfg(any(target_arch = "riscv64")))]
 mod riscv64;
--- a/library/stdarch/crates/core_arch/src/riscv32/mod.rs
+++ b/library/stdarch/crates/core_arch/src/riscv32/mod.rs
@ -0,0 +1,5 @@
+//! RISC-V RV32 specific intrinsics
+
+mod zk;
+
+pub use zk::*;
--- a/library/stdarch/crates/core_arch/src/riscv32/zk.rs
+++ b/library/stdarch/crates/core_arch/src/riscv32/zk.rs
@ -0,0 +1,458 @@
+#[allow(unused)]
+use core::arch::asm;
+
+#[allow(unused)]
+macro_rules! constify_imm2 {
+    ($imm2:expr, $expand:ident) => {
+        #[allow(overflowing_literals)]
+        match $imm2 & 0b11 {
+            0b00 => $expand!(0),
+            0b01 => $expand!(1),
+            0b10 => $expand!(2),
+            _ => $expand!(3),
+        }
+    };
+}
+
+/// AES final round encryption instruction for RV32.
+///
+/// This instruction sources a single byte from rs2 according to bs. To this it applies the
+/// forward AES SBox operation, before XOR’ing the result with rs1. This instruction must
+/// always be implemented such that its execution latency does not depend on the data being
+/// operated on.
+///
+/// Source: RISC-V Cryptography Extensions Volume I: Scalar & Entropy Source Instructions
+///
+/// Version: v1.0.1
+///
+/// Section: 3.3
+///
+/// # Note
+///
+/// The `bs` parameter is expected to be a constant value and only the bottom 2 bits of `bs` are
+/// used.
+///
+/// # Safety
+///
+/// This function is safe to use if the `zkne` target feature is present.
+#[target_feature(enable = "zkne")]
+#[cfg_attr(test, assert_instr(aes32esi))]
+#[inline]
+pub unsafe fn aes32esi(rs1: u32, rs2: u32, bs: u8) -> u32 {
+    macro_rules! aes32esi {
+            ($imm2:expr) => {{
+                let value: u32;
+                unsafe {
+                    asm!(
+                        concat!("aes32esi {rd},{rs1},{rs2},", $imm2),
+                        rd = lateout(reg) value,
+                        rs1 = in(reg) rs1,
+                        rs2 = in(reg) rs2,
+                        options(pure, nomem, nostack),
+                    );
+                }
+                value
+            }}
+        }
+    constify_imm2!(bs, aes32esi)
+}
+
+/// AES middle round encryption instruction for RV32.
+///
+/// This instruction sources a single byte from rs2 according to bs. To this it applies the
+/// forward AES SBox operation, and a partial forward MixColumn, before XOR’ing the result with
+/// rs1. This instruction must always be implemented such that its execution latency does not
+/// depend on the data being operated on.
+///
+/// Source: RISC-V Cryptography Extensions Volume I: Scalar & Entropy Source Instructions
+///
+/// Version: v1.0.1
+///
+/// Section: 3.4
+///
+/// # Note
+///
+/// The `bs` parameter is expected to be a constant value and only the bottom 2 bits of `bs` are
+/// used.
+///
+/// # Safety
+///
+/// This function is safe to use if the `zkne` target feature is present.
+#[target_feature(enable = "zkne")]
+#[cfg_attr(test, assert_instr(aes32esmi))]
+#[inline]
+pub unsafe fn aes32esmi(rs1: u32, rs2: u32, bs: u8) -> u32 {
+    macro_rules! aes32esmi {
+            ($imm2:expr) => {{
+                let value: u32;
+                unsafe {
+                    asm!(
+                        concat!("aes32esmi {rd},{rs1},{rs2},", $imm2),
+                        rd = lateout(reg) value,
+                        rs1 = in(reg) rs1,
+                        rs2 = in(reg) rs2,
+                        options(pure, nomem, nostack),
+                    );
+                }
+                value
+            }}
+        }
+    constify_imm2!(bs, aes32esmi)
+}
+
+/// AES final round decryption instruction for RV32.
+///
+/// This instruction sources a single byte from rs2 according to bs. To this it applies the
+/// inverse AES SBox operation, and XOR’s the result with rs1. This instruction must always be
+/// implemented such that its execution latency does not depend on the data being operated on.
+///
+/// Source: RISC-V Cryptography Extensions Volume I: Scalar & Entropy Source Instructions
+///
+/// Version: v1.0.1
+///
+/// Section: 3.1
+///
+/// # Note
+///
+/// The `bs` parameter is expected to be a constant value and only the bottom 2 bits of `bs` are
+/// used.
+///
+/// # Safety
+///
+/// This function is safe to use if the `zknd` target feature is present.
+#[target_feature(enable = "zknd")]
+#[cfg_attr(test, assert_instr(aes32dsi))]
+#[inline]
+pub unsafe fn aes32dsi(rs1: u32, rs2: u32, bs: u8) -> u32 {
+    macro_rules! aes32dsi {
+            ($imm2:expr) => {{
+                let value: u32;
+                unsafe {
+                    asm!(
+                        concat!("aes32dsi {rd},{rs1},{rs2},", $imm2),
+                        rd = lateout(reg) value,
+                        rs1 = in(reg) rs1,
+                        rs2 = in(reg) rs2,
+                        options(pure, nomem, nostack),
+                    );
+                }
+                value
+            }}
+        }
+    constify_imm2!(bs, aes32dsi)
+}
+
+#[target_feature(enable = "zknd")]
+#[cfg_attr(test, assert_instr(aes32dsmi))]
+#[inline]
+/// AES middle round decryption instruction for RV32.
+///
+/// This instruction sources a single byte from rs2 according to bs. To this it applies the
+/// inverse AES SBox operation, and a partial inverse MixColumn, before XOR’ing the result with
+/// rs1. This instruction must always be implemented such that its execution latency does not
+/// depend on the data being operated on.
+///
+/// Source: RISC-V Cryptography Extensions Volume I: Scalar & Entropy Source Instructions
+///
+/// Version: v1.0.1
+///
+/// Section: 3.2
+///
+/// # Note
+///
+/// The `bs` parameter is expected to be a constant value and only the bottom 2 bits of `bs` are
+/// used.
+///
+/// # Safety
+///
+/// This function is safe to use if the `zknd` target feature is present.
+pub unsafe fn aes32dsmi(rs1: u32, rs2: u32, bs: u8) -> u32 {
+    macro_rules! aes32dsmi {
+            ($imm2:expr) => {{
+                let value: u32;
+                unsafe {
+                    asm!(
+                        concat!("aes32dsmi {rd},{rs1},{rs2},", $imm2),
+                        rd = lateout(reg) value,
+                        rs1 = in(reg) rs1,
+                        rs2 = in(reg) rs2,
+                        options(pure, nomem, nostack),
+                    );
+                }
+                value
+            }}
+        }
+    constify_imm2!(bs, aes32dsmi)
+}
+
+/// Place upper/lower halves of the source register into odd/even bits of the destination
+/// respectively.
+///
+/// This instruction places bits in the low half of the source register into the even bit
+/// positions of the destination, and bits in the high half of the source register into the odd
+/// bit positions of the destination. It is the inverse of the unzip instruction. This
+/// instruction is available only on RV32.
+///
+/// Source: RISC-V Cryptography Extensions Volume I: Scalar & Entropy Source Instructions
+///
+/// Version: v1.0.1
+///
+/// Section: 3.49
+///
+/// # Safety
+///
+/// This function is safe to use if the `zbkb` target feature is present.
+#[target_feature(enable = "zbkb")]
+#[cfg_attr(test, assert_instr(zip))]
+#[inline]
+pub unsafe fn zip(rs: usize) -> usize {
+    let value: usize;
+    unsafe {
+        asm!(
+            "zip {rd},{rs}",
+            rd = lateout(reg) value,
+            rs = in(reg) rs,
+            options(pure, nomem, nostack),
+        )
+    }
+    value
+}
+
+/// Place odd and even bits of the source word into upper/lower halves of the destination.
+///
+/// This instruction places the even bits of the source register into the low half of the
+/// destination, and the odd bits of the source into the high bits of the destination. It is
+/// the inverse of the zip instruction. This instruction is available only on RV32.
+///
+/// Source: RISC-V Cryptography Extensions Volume I: Scalar & Entropy Source Instructions
+///
+/// Version: v1.0.1
+///
+/// Section: 3.45
+///
+/// # Safety
+///
+/// This function is safe to use if the `zbkb` target feature is present.
+#[target_feature(enable = "zbkb")]
+#[cfg_attr(test, assert_instr(unzip))]
+#[inline]
+pub unsafe fn unzip(rs: usize) -> usize {
+    let value: usize;
+    unsafe {
+        asm!(
+            "unzip {rd},{rs}",
+            rd = lateout(reg) value,
+            rs = in(reg) rs,
+            options(pure, nomem, nostack),
+        )
+    }
+    value
+}
+
+/// Implements the high half of the Sigma0 transformation, as used in the SHA2-512 hash
+/// function \[49\] (Section 4.1.3).
+///
+/// This instruction is implemented on RV32 only. Used to compute the Sigma0 transform of the
+/// SHA2-512 hash function in conjunction with the sha512sig0l instruction. The transform is a
+/// 64-bit to 64-bit function, so the input and output are each represented by two 32-bit
+/// registers. This instruction must always be implemented such that its execution latency does
+/// not depend on the data being operated on.
+///
+/// Source: RISC-V Cryptography Extensions Volume I: Scalar & Entropy Source Instructions
+///
+/// Version: v1.0.1
+///
+/// Section: 3.31
+///
+/// # Safety
+///
+/// This function is safe to use if the `zknh` target feature is present.
+#[target_feature(enable = "zknh")]
+#[cfg_attr(test, assert_instr(sha512sig0h))]
+#[inline]
+pub unsafe fn sha512sig0h(rs1: u32, rs2: u32) -> u32 {
+    let value: u32;
+    unsafe {
+        asm!(
+            "sha512sig0h {rd},{rs1},{rs2}",
+            rd = lateout(reg) value,
+            rs1 = in(reg) rs1,
+            rs2 = in(reg) rs2,
+            options(pure, nomem, nostack),
+        )
+    }
+    value
+}
+
+/// Implements the low half of the Sigma0 transformation, as used in the SHA2-512 hash function
+/// \[49\] (Section 4.1.3).
+///
+/// This instruction is implemented on RV32 only. Used to compute the Sigma0 transform of the
+/// SHA2-512 hash function in conjunction with the sha512sig0h instruction. The transform is a
+/// 64-bit to 64-bit function, so the input and output are each represented by two 32-bit
+/// registers. This instruction must always be implemented such that its execution latency does
+/// not depend on the data being operated on.
+///
+/// Source: RISC-V Cryptography Extensions Volume I: Scalar & Entropy Source Instructions
+///
+/// Version: v1.0.1
+///
+/// Section: 3.32
+///
+/// # Safety
+///
+/// This function is safe to use if the `zknh` target feature is present.
+#[target_feature(enable = "zknh")]
+#[cfg_attr(test, assert_instr(sha512sig0l))]
+#[inline]
+pub unsafe fn sha512sig0l(rs1: u32, rs2: u32) -> u32 {
+    let value: u32;
+    unsafe {
+        asm!(
+            "sha512sig0l {rd},{rs1},{rs2}",
+            rd = lateout(reg) value,
+            rs1 = in(reg) rs1,
+            rs2 = in(reg) rs2,
+            options(pure, nomem, nostack),
+        )
+    }
+    value
+}
+
+/// Implements the high half of the Sigma1 transformation, as used in the SHA2-512 hash
+/// function \[49\] (Section 4.1.3).
+///
+/// This instruction is implemented on RV32 only. Used to compute the Sigma1 transform of the
+/// SHA2-512 hash function in conjunction with the sha512sig1l instruction. The transform is a
+/// 64-bit to 64-bit function, so the input and output are each represented by two 32-bit
+/// registers. This instruction must always be implemented such that its execution latency does
+/// not depend on the data being operated on.
+///
+/// Source: RISC-V Cryptography Extensions Volume I: Scalar & Entropy Source Instructions
+///
+/// Version: v1.0.1
+///
+/// Section: 3.33
+///
+/// # Safety
+///
+/// This function is safe to use if the `zknh` target feature is present.
+#[target_feature(enable = "zknh")]
+#[cfg_attr(test, assert_instr(sha512sig1h))]
+#[inline]
+pub unsafe fn sha512sig1h(rs1: u32, rs2: u32) -> u32 {
+    let value: u32;
+    unsafe {
+        asm!(
+            "sha512sig1h {rd},{rs1},{rs2}",
+            rd = lateout(reg) value,
+            rs1 = in(reg) rs1,
+            rs2 = in(reg) rs2,
+            options(pure, nomem, nostack),
+        )
+    }
+    value
+}
+
+/// Implements the low half of the Sigma1 transformation, as used in the SHA2-512 hash function
+/// \[49\] (Section 4.1.3).
+///
+/// This instruction is implemented on RV32 only. Used to compute the Sigma1 transform of the
+/// SHA2-512 hash function in conjunction with the sha512sig1h instruction. The transform is a
+/// 64-bit to 64-bit function, so the input and output are each represented by two 32-bit
+/// registers. This instruction must always be implemented such that its execution latency does
+/// not depend on the data being operated on.
+///
+/// Source: RISC-V Cryptography Extensions Volume I: Scalar & Entropy Source Instructions
+///
+/// Version: v1.0.1
+///
+/// Section: 3.34
+///
+/// # Safety
+///
+/// This function is safe to use if the `zknh` target feature is present.
+#[target_feature(enable = "zknh")]
+#[cfg_attr(test, assert_instr(sha512sig1l))]
+#[inline]
+pub unsafe fn sha512sig1l(rs1: u32, rs2: u32) -> u32 {
+    let value: u32;
+    unsafe {
+        asm!(
+            "sha512sig1l {rd},{rs1},{rs2}",
+            rd = lateout(reg) value,
+            rs1 = in(reg) rs1,
+            rs2 = in(reg) rs2,
+            options(pure, nomem, nostack),
+        )
+    }
+    value
+}
+
+/// Implements the Sum0 transformation, as used in the SHA2-512 hash function \[49\] (Section
+/// 4.1.3).
+///
+/// This instruction is implemented on RV32 only. Used to compute the Sum0 transform of the
+/// SHA2-512 hash function. The transform is a 64-bit to 64-bit function, so the input and
+/// output is represented by two 32-bit registers. This instruction must always be implemented
+/// such that its execution latency does not depend on the data being operated on.
+///
+/// Source: RISC-V Cryptography Extensions Volume I: Scalar & Entropy Source Instructions
+///
+/// Version: v1.0.1
+///
+/// Section: 3.35
+///
+/// # Safety
+///
+/// This function is safe to use if the `zknh` target feature is present.
+#[target_feature(enable = "zknh")]
+#[cfg_attr(test, assert_instr(sha512sum0r))]
+#[inline]
+pub unsafe fn sha512sum0r(rs1: u32, rs2: u32) -> u32 {
+    let value: u32;
+    unsafe {
+        asm!(
+            "sha512sum0r {rd},{rs1},{rs2}",
+            rd = lateout(reg) value,
+            rs1 = in(reg) rs1,
+            rs2 = in(reg) rs2,
+            options(pure, nomem, nostack),
+        )
+    }
+    value
+}
+
+/// Implements the Sum1 transformation, as used in the SHA2-512 hash function \[49\] (Section
+/// 4.1.3).
+///
+/// This instruction is implemented on RV32 only. Used to compute the Sum1 transform of the
+/// SHA2-512 hash function. The transform is a 64-bit to 64-bit function, so the input and
+/// output is represented by two 32-bit registers. This instruction must always be implemented
+/// such that its execution latency does not depend on the data being operated on.
+///
+/// Source: RISC-V Cryptography Extensions Volume I: Scalar & Entropy Source Instructions
+///
+/// Version: v1.0.1
+///
+/// Section: 3.36
+///
+/// # Safety
+///
+/// This function is safe to use if the `zknh` target feature is present.
+#[target_feature(enable = "zknh")]
+#[cfg_attr(test, assert_instr(sha512sum1r))]
+#[inline]
+pub unsafe fn sha512sum1r(rs1: u32, rs2: u32) -> u32 {
+    let value: u32;
+    unsafe {
+        asm!(
+            "sha512sum1r {rd},{rs1},{rs2}",
+            rd = lateout(reg) value,
+            rs1 = in(reg) rs1,
+            rs2 = in(reg) rs2,
+            options(pure, nomem, nostack),
+        )
+    }
+    value
+}
--- a/library/stdarch/crates/core_arch/src/riscv64/mod.rs
+++ b/library/stdarch/crates/core_arch/src/riscv64/mod.rs
@ -1,6 +1,10 @@
 //! RISC-V RV64 specific intrinsics
 use crate::arch::asm;

+mod zk;
+
+pub use zk::*;
+
 /// Loads virtual machine memory by unsigned word integer
 ///
 /// This instruction performs an explicit memory access as though `V=1`;
--- a/library/stdarch/crates/core_arch/src/riscv64/zk.rs
+++ b/library/stdarch/crates/core_arch/src/riscv64/zk.rs
@ -0,0 +1,388 @@
+#[allow(unused)]
+use core::arch::asm;
+
+#[allow(unused)]
+macro_rules! constify_imm_0_until_10 {
+    ($imm2:expr, $expand:ident) => {
+        match $imm2 {
+            1 => $expand!(1),
+            2 => $expand!(2),
+            3 => $expand!(3),
+            4 => $expand!(4),
+            5 => $expand!(5),
+            6 => $expand!(6),
+            7 => $expand!(7),
+            8 => $expand!(8),
+            9 => $expand!(9),
+            10 => $expand!(10),
+            _ => $expand!(0),
+        }
+    };
+}
+
+/// AES final round encryption instruction for RV64.
+///
+/// Uses the two 64-bit source registers to represent the entire AES state, and produces half
+/// of the next round output, applying the ShiftRows and SubBytes steps. This instruction must
+/// always be implemented such that its execution latency does not depend on the data being
+/// operated on.
+///
+/// Source: RISC-V Cryptography Extensions Volume I: Scalar & Entropy Source Instructions
+///
+/// Version: v1.0.1
+///
+/// Section: 3.7
+///
+/// # Safety
+///
+/// This function is safe to use if the `zkne` target feature is present.
+#[target_feature(enable = "zkne")]
+#[cfg_attr(test, assert_instr(aes64es))]
+#[inline]
+pub unsafe fn aes64es(rs1: u64, rs2: u64) -> u64 {
+    let value: u64;
+    unsafe {
+        asm!(
+            "aes64es {rd},{rs1},{rs2}",
+            rd = lateout(reg) value,
+            rs1 = in(reg) rs1,
+            rs2 = in(reg) rs2,
+            options(pure, nomem, nostack),
+        )
+    }
+    value
+}
+
+/// AES middle round encryption instruction for RV64.
+///
+/// Uses the two 64-bit source registers to represent the entire AES state, and produces half
+/// of the next round output, applying the ShiftRows, SubBytes and MixColumns steps. This
+/// instruction must always be implemented such that its execution latency does not depend on
+/// the data being operated on.
+///
+/// Source: RISC-V Cryptography Extensions Volume I: Scalar & Entropy Source Instructions
+///
+/// Version: v1.0.1
+///
+/// Section: 3.8
+///
+/// # Safety
+///
+/// This function is safe to use if the `zkne` target feature is present.
+#[target_feature(enable = "zkne")]
+#[cfg_attr(test, assert_instr(aes64esm))]
+#[inline]
+pub unsafe fn aes64esm(rs1: u64, rs2: u64) -> u64 {
+    let value: u64;
+    unsafe {
+        asm!(
+            "aes64esm {rd},{rs1},{rs2}",
+            rd = lateout(reg) value,
+            rs1 = in(reg) rs1,
+            rs2 = in(reg) rs2,
+            options(pure, nomem, nostack),
+        )
+    }
+    value
+}
+
+/// AES final round decryption instruction for RV64.
+///
+/// Uses the two 64-bit source registers to represent the entire AES state, and produces half
+/// of the next round output, applying the Inverse ShiftRows and SubBytes steps. This
+/// instruction must always be implemented such that its execution latency does not depend on
+/// the data being operated on.
+///
+/// Source: RISC-V Cryptography Extensions Volume I: Scalar & Entropy Source Instructions
+///
+/// Version: v1.0.1
+///
+/// Section: 3.5
+///
+/// # Safety
+///
+/// This function is safe to use if the `zknd` target feature is present.
+#[target_feature(enable = "zknd")]
+#[cfg_attr(test, assert_instr(aes64ds))]
+#[inline]
+pub unsafe fn aes64ds(rs1: u64, rs2: u64) -> u64 {
+    let value: u64;
+    unsafe {
+        asm!(
+            "aes64ds {rd},{rs1},{rs2}",
+            rd = lateout(reg) value,
+            rs1 = in(reg) rs1,
+            rs2 = in(reg) rs2,
+            options(pure, nomem, nostack),
+        )
+    }
+    value
+}
+
+/// AES middle round decryption instruction for RV64.
+///
+/// Uses the two 64-bit source registers to represent the entire AES state, and produces half
+/// of the next round output, applying the Inverse ShiftRows, SubBytes and MixColumns steps.
+/// This instruction must always be implemented such that its execution latency does not depend
+/// on the data being operated on.
+///
+/// Source: RISC-V Cryptography Extensions Volume I: Scalar & Entropy Source Instructions
+///
+/// Version: v1.0.1
+///
+/// Section: 3.6
+///
+/// # Safety
+///
+/// This function is safe to use if the `zknd` target feature is present.
+#[target_feature(enable = "zknd")]
+#[cfg_attr(test, assert_instr(aes64dsm))]
+#[inline]
+pub unsafe fn aes64dsm(rs1: u64, rs2: u64) -> u64 {
+    let value: u64;
+    unsafe {
+        asm!(
+            "aes64dsm {rd},{rs1},{rs2}",
+            rd = lateout(reg) value,
+            rs1 = in(reg) rs1,
+            rs2 = in(reg) rs2,
+            options(pure, nomem, nostack),
+        )
+    }
+    value
+}
+
+/// This instruction implements part of the KeySchedule operation for the AES Block cipher
+/// involving the SBox operation.
+///
+/// This instruction implements the rotation, SubBytes and Round Constant addition steps of the
+/// AES block cipher Key Schedule. This instruction must always be implemented such that its
+/// execution latency does not depend on the data being operated on. Note that rnum must be in
+/// the range 0x0..0xA. The values 0xB..0xF are reserved.
+///
+/// Source: RISC-V Cryptography Extensions Volume I: Scalar & Entropy Source Instructions
+///
+/// Version: v1.0.1
+///
+/// Section: 3.10
+///
+/// # Note
+///
+/// The `rnum` parameter is expected to be a constant value inside the range of `0..=10`, if a
+/// value outside the valid range is given it uses `rnum=0`.
+///
+/// # Safety
+///
+/// This function is safe to use if the `zkne` or `zknd` target feature is present.
+#[target_feature(enable = "zkne", enable = "zknd")]
+#[cfg_attr(test, assert_instr(aes64ks1i))]
+#[inline]
+pub unsafe fn aes64ks1i(rs1: u64, rnum: u8) -> u64 {
+    macro_rules! aes64ks1i {
+            ($imm_0_until_10:expr) => {{
+                let value: u64;
+                unsafe {
+                    asm!(
+                        concat!("aes64ks1i {rd},{rs1},", $imm_0_until_10),
+                        rd = lateout(reg) value,
+                        rs1 = in(reg) rs1,
+                        options(pure, nomem, nostack),
+                    )
+                }
+                value
+            }}
+        }
+    constify_imm_0_until_10!(rnum, aes64ks1i)
+}
+
+/// This instruction implements part of the KeySchedule operation for the AES Block cipher.
+///
+/// This instruction implements the additional XOR’ing of key words as part of the AES block
+/// cipher Key Schedule. This instruction must always be implemented such that its execution
+/// latency does not depend on the data being operated on.
+///
+/// Source: RISC-V Cryptography Extensions Volume I: Scalar & Entropy Source Instructions
+///
+/// Version: v1.0.1
+///
+/// Section: 3.11
+///
+/// # Safety
+///
+/// This function is safe to use if the `zkne` or `zknd` target feature is present.
+#[target_feature(enable = "zkne", enable = "zknd")]
+#[cfg_attr(test, assert_instr(aes64ks2))]
+#[inline]
+pub unsafe fn aes64ks2(rs1: u64, rs2: u64) -> u64 {
+    let value: u64;
+    unsafe {
+        asm!(
+            "aes64ks2 {rd},{rs1},{rs2}",
+            rd = lateout(reg) value,
+            rs1 = in(reg) rs1,
+            rs2 = in(reg) rs2,
+            options(pure, nomem, nostack),
+        )
+    }
+    value
+}
+
+/// Pack the low 16-bits of rs1 and rs2 into rd on RV64
+///
+/// This instruction packs the low 16 bits of rs1 and rs2 into the 32 least-significant bits of
+/// rd, sign extending the 32-bit result to the rest of rd. This instruction only exists on
+/// RV64 based systems.
+///
+/// Source: RISC-V Cryptography Extensions Volume I: Scalar & Entropy Source Instructions
+///
+/// Version: v1.0.1
+///
+/// Section: 3.26
+///
+/// # Safety
+///
+/// This function is safe to use if the `zbkb` target feature is present.
+#[target_feature(enable = "zbkb")]
+#[cfg_attr(test, assert_instr(packw))]
+#[inline]
+pub unsafe fn packw(rs1: u64, rs2: u64) -> u64 {
+    let value: u64;
+    unsafe {
+        asm!(
+            "packw {rd},{rs1},{rs2}",
+            rd = lateout(reg) value,
+            rs1 = in(reg) rs1,
+            rs2 = in(reg) rs2,
+            options(pure, nomem, nostack),
+        )
+    }
+    value
+}
+
+/// Implements the Sigma0 transformation function as used in the SHA2-512 hash function \[49\]
+/// (Section 4.1.3).
+///
+/// This instruction is supported for the RV64 base architecture. It implements the Sigma0
+/// transform of the SHA2-512 hash function. \[49\]. This instruction must always be
+/// implemented such that its execution latency does not depend on the data being operated on.
+///
+/// Source: RISC-V Cryptography Extensions Volume I: Scalar & Entropy Source Instructions
+///
+/// Version: v1.0.1
+///
+/// Section: 3.37
+///
+/// # Safety
+///
+/// This function is safe to use if the `zknh` target feature is present.
+#[target_feature(enable = "zknh")]
+#[cfg_attr(test, assert_instr(sha512sig0))]
+#[inline]
+pub unsafe fn sha512sig0(rs1: u64) -> u64 {
+    let value: u64;
+    unsafe {
+        asm!(
+            "sha512sig0 {rd},{rs1}",
+            rd = lateout(reg) value,
+            rs1 = in(reg) rs1,
+            options(pure, nomem, nostack),
+        )
+    }
+    value
+}
+
+/// Implements the Sigma1 transformation function as used in the SHA2-512 hash function \[49\]
+/// (Section 4.1.3).
+///
+/// This instruction is supported for the RV64 base architecture. It implements the Sigma1
+/// transform of the SHA2-512 hash function. \[49\]. This instruction must always be
+/// implemented such that its execution latency does not depend on the data being operated on.
+///
+/// Source: RISC-V Cryptography Extensions Volume I: Scalar & Entropy Source Instructions
+///
+/// Version: v1.0.1
+///
+/// Section: 3.38
+///
+/// # Safety
+///
+/// This function is safe to use if the `zknh` target feature is present.
+#[target_feature(enable = "zknh")]
+#[cfg_attr(test, assert_instr(sha512sig1))]
+#[inline]
+pub unsafe fn sha512sig1(rs1: u64) -> u64 {
+    let value: u64;
+    unsafe {
+        asm!(
+            "sha512sig1 {rd},{rs1}",
+            rd = lateout(reg) value,
+            rs1 = in(reg) rs1,
+            options(pure, nomem, nostack),
+        )
+    }
+    value
+}
+
+/// Implements the Sum0 transformation function as used in the SHA2-512 hash function \[49\]
+/// (Section 4.1.3).
+///
+/// This instruction is supported for the RV64 base architecture. It implements the Sum0
+/// transform of the SHA2-512 hash function. \[49\]. This instruction must always be
+/// implemented such that its execution latency does not depend on the data being operated on.
+///
+/// Source: RISC-V Cryptography Extensions Volume I: Scalar & Entropy Source Instructions
+///
+/// Version: v1.0.1
+///
+/// Section: 3.39
+///
+/// # Safety
+///
+/// This function is safe to use if the `zknh` target feature is present.
+#[target_feature(enable = "zknh")]
+#[cfg_attr(test, assert_instr(sha512sum0))]
+#[inline]
+pub unsafe fn sha512sum0(rs1: u64) -> u64 {
+    let value: u64;
+    unsafe {
+        asm!(
+            "sha512sum0 {rd},{rs1}",
+            rd = lateout(reg) value,
+            rs1 = in(reg) rs1,
+            options(pure, nomem, nostack),
+        )
+    }
+    value
+}
+
+/// Implements the Sum1 transformation function as used in the SHA2-512 hash function \[49\]
+/// (Section 4.1.3).
+///
+/// This instruction is supported for the RV64 base architecture. It implements the Sum1
+/// transform of the SHA2-512 hash function. \[49\]. This instruction must always be
+/// implemented such that its execution latency does not depend on the data being operated on.
+///
+/// Source: RISC-V Cryptography Extensions Volume I: Scalar & Entropy Source Instructions
+///
+/// Version: v1.0.1
+///
+/// Section: 3.40
+///
+/// # Safety
+///
+/// This function is safe to use if the `zknh` target feature is present.
+#[target_feature(enable = "zknh")]
+#[cfg_attr(test, assert_instr(sha512sum1))]
+#[inline]
+pub unsafe fn sha512sum1(rs1: u64) -> u64 {
+    let value: u64;
+    unsafe {
+        asm!(
+            "sha512sum1 {rd},{rs1}",
+            rd = lateout(reg) value,
+            rs1 = in(reg) rs1,
+            options(pure, nomem, nostack),
+        )
+    }
+    value
+}
--- a/library/stdarch/crates/core_arch/src/riscv_shared/mod.rs
+++ b/library/stdarch/crates/core_arch/src/riscv_shared/mod.rs
@ -1,8 +1,11 @@
 //! Shared RISC-V intrinsics
+
+mod zk;
 mod p;

 #[unstable(feature = "stdsimd", issue = "27731")]
 pub use p::*;
+pub use zk::*;

 use crate::arch::asm;

@ -628,179 +631,9 @@ pub fn frflags() -> u32 {
 /// and then writing a new value obtained from the five least-significant bits of
 /// input variable `value` into `fflags`.
 #[inline]
+#[unstable(feature = "stdsimd", issue = "27731")]
 pub fn fsflags(value: u32) -> u32 {
    let original: u32;
    unsafe { asm!("fsflags {}, {}", out(reg) original, in(reg) value, options(nomem, nostack)) }
    original
 }
-
-/// `P0` transformation function as is used in the SM3 hash algorithm
-///
-/// This function is included in `Zksh` extension. It's defined as:
-///
-/// ```text
-/// P0(X) = X ⊕ (X ≪ 9) ⊕ (X ≪ 17)
-/// ```
-///
-/// where `⊕` represents 32-bit xor, and `≪ k` represents rotate left by `k` bits.
-///
-/// In the SM3 algorithm, the `P0` transformation is used as `E ← P0(TT2)` when the
-/// compression function `CF` uses the intermediate value `TT2` to calculate
-/// the variable `E` in one iteration for subsequent processes.
-///
-/// According to RISC-V Cryptography Extensions, Volume I, the execution latency of
-/// this instruction must always be independent from the data it operates on.
-#[inline]
-#[target_feature(enable = "zksh")]
-pub fn sm3p0(x: u32) -> u32 {
-    let ans: u32;
-    unsafe { asm!("sm3p0 {}, {}", lateout(reg) ans, in(reg) x, options(pure, nomem, nostack)) };
-    ans
-}
-
-/// `P1` transformation function as is used in the SM3 hash algorithm
-///
-/// This function is included in `Zksh` extension. It's defined as:
-///
-/// ```text
-/// P1(X) = X ⊕ (X ≪ 15) ⊕ (X ≪ 23)
-/// ```
-///
-/// where `⊕` represents 32-bit xor, and `≪ k` represents rotate left by `k` bits.
-///
-/// In the SM3 algorithm, the `P1` transformation is used to expand message,
-/// where expanded word `Wj` can be generated from the previous words.
-/// The whole process can be described as the following pseudocode:
-///
-/// ```text
-/// FOR j=16 TO 67
-///     Wj ← P1(Wj−16 ⊕ Wj−9 ⊕ (Wj−3 ≪ 15)) ⊕ (Wj−13 ≪ 7) ⊕ Wj−6
-/// ENDFOR
-/// ```
-///
-/// According to RISC-V Cryptography Extensions, Volume I, the execution latency of
-/// this instruction must always be independent from the data it operates on.
-#[inline]
-#[target_feature(enable = "zksh")]
-pub fn sm3p1(x: u32) -> u32 {
-    let ans: u32;
-    unsafe { asm!("sm3p1 {}, {}", lateout(reg) ans, in(reg) x, options(pure, nomem, nostack)) };
-    ans
-}
-
-/// Accelerates the round function `F` in the SM4 block cipher algorithm
-///
-/// This instruction is included in extension `Zksed`. It's defined as:
-///
-/// ```text
-/// SM4ED(x, a, BS) = x ⊕ T(ai)
-/// ... where
-/// ai = a.bytes[BS]
-/// T(ai) = L(τ(ai))
-/// bi = τ(ai) = SM4-S-Box(ai)
-/// ci = L(bi) = bi ⊕ (bi ≪ 2) ⊕ (bi ≪ 10) ⊕ (bi ≪ 18) ⊕ (bi ≪ 24)
-/// SM4ED = (ci ≪ (BS * 8)) ⊕ x
-/// ```
-///
-/// where `⊕` represents 32-bit xor, and `≪ k` represents rotate left by `k` bits.
-/// As is defined above, `T` is a combined transformation of non linear S-Box transform `τ`
-/// and linear layer transform `L`.
-///
-/// In the SM4 algorithm, the round function `F` is defined as:
-///
-/// ```text
-/// F(x0, x1, x2, x3, rk) = x0 ⊕ T(x1 ⊕ x2 ⊕ x3 ⊕ rk)
-/// ... where
-/// T(A) = L(τ(A))
-/// B = τ(A) = (SM4-S-Box(a0), SM4-S-Box(a1), SM4-S-Box(a2), SM4-S-Box(a3))
-/// C = L(B) = B ⊕ (B ≪ 2) ⊕ (B ≪ 10) ⊕ (B ≪ 18) ⊕ (B ≪ 24)
-/// ```
-///
-/// It can be implemented by `sm4ed` instruction like:
-///
-/// ```no_run
-/// # #[cfg(any(target_arch = "riscv32", target_arch = "riscv64"))]
-/// # fn round_function(x0: u32, x1: u32, x2: u32, x3: u32, rk: u32) -> u32 {
-/// # #[cfg(target_arch = "riscv32")] use core::arch::riscv32::sm4ed;
-/// # #[cfg(target_arch = "riscv64")] use core::arch::riscv64::sm4ed;
-/// let a = x1 ^ x2 ^ x3 ^ rk;
-/// let c0 = sm4ed::<0>(x0, a);
-/// let c1 = sm4ed::<1>(c0, a); // c1 represents c[0..=1], etc.
-/// let c2 = sm4ed::<2>(c1, a);
-/// let c3 = sm4ed::<3>(c2, a);
-/// return c3; // c3 represents c[0..=3]
-/// # }
-/// ```
-///
-/// According to RISC-V Cryptography Extensions, Volume I, the execution latency of
-/// this instruction must always be independent from the data it operates on.
-#[inline]
-#[target_feature(enable = "zksed")]
-pub fn sm4ed<const BS: u8>(x: u32, a: u32) -> u32 {
-    static_assert!(BS <= 3);
-    let ans: u32;
-    unsafe {
-        asm!("sm4ed {}, {}, {}, {}", lateout(reg) ans, in(reg) x, in(reg) a, const BS, options(pure, nomem, nostack))
-    };
-    ans
-}
-
-/// Accelerates the key schedule operation in the SM4 block cipher algorithm
-///
-/// This instruction is included in extension `Zksed`. It's defined as:
-///
-/// ```text
-/// SM4KS(x, k, BS) = x ⊕ T'(ki)
-/// ... where
-/// ki = k.bytes[BS]
-/// T'(ki) = L'(τ(ki))
-/// bi = τ(ki) = SM4-S-Box(ki)
-/// ci = L'(bi) = bi ⊕ (bi ≪ 13) ⊕ (bi ≪ 23)
-/// SM4KS = (ci ≪ (BS * 8)) ⊕ x
-/// ```
-///
-/// where `⊕` represents 32-bit xor, and `≪ k` represents rotate left by `k` bits.
-/// As is defined above, `T'` is a combined transformation of non linear S-Box transform `τ`
-/// and the replaced linear layer transform `L'`.
-///
-/// In the SM4 algorithm, the key schedule is defined as:
-///
-/// ```text
-/// rk[i] = K[i+4] = K[i] ⊕ T'(K[i+1] ⊕ K[i+2] ⊕ K[i+3] ⊕ CK[i])
-/// ... where
-/// K[0..=3] = MK[0..=3] ⊕ FK[0..=3]
-/// T'(K) = L'(τ(K))
-/// B = τ(K) = (SM4-S-Box(k0), SM4-S-Box(k1), SM4-S-Box(k2), SM4-S-Box(k3))
-/// C = L'(B) = B ⊕ (B ≪ 13) ⊕ (B ≪ 23)
-/// ```
-///
-/// where `MK` represents the input 128-bit encryption key,
-/// constants `FK` and `CK` are fixed system configuration constant values defined by the SM4 algorithm.
-/// Hence, the key schedule operation can be implemented by `sm4ks` instruction like:
-///
-/// ```no_run
-/// # #[cfg(any(target_arch = "riscv32", target_arch = "riscv64"))]
-/// # fn key_schedule(k0: u32, k1: u32, k2: u32, k3: u32, ck_i: u32) -> u32 {
-/// # #[cfg(target_arch = "riscv32")] use core::arch::riscv32::sm4ks;
-/// # #[cfg(target_arch = "riscv64")] use core::arch::riscv64::sm4ks;
-/// let k = k1 ^ k2 ^ k3 ^ ck_i;
-/// let c0 = sm4ks::<0>(k0, k);
-/// let c1 = sm4ks::<1>(c0, k); // c1 represents c[0..=1], etc.
-/// let c2 = sm4ks::<2>(c1, k);
-/// let c3 = sm4ks::<3>(c2, k);
-/// return c3; // c3 represents c[0..=3]
-/// # }
-/// ```
-///
-/// According to RISC-V Cryptography Extensions, Volume I, the execution latency of
-/// this instruction must always be independent from the data it operates on.
-#[inline]
-#[target_feature(enable = "zksed")]
-pub fn sm4ks<const BS: u8>(x: u32, k: u32) -> u32 {
-    static_assert!(BS <= 3);
-    let ans: u32;
-    unsafe {
-        asm!("sm4ks {}, {}, {}, {}", lateout(reg) ans, in(reg) x, in(reg) k, const BS, options(pure, nomem, nostack))
-    };
-    ans
-}
--- a/library/stdarch/crates/core_arch/src/riscv_shared/zk.rs
+++ b/library/stdarch/crates/core_arch/src/riscv_shared/zk.rs
@ -0,0 +1,594 @@
+#[allow(unused)]
+use core::arch::asm;
+
+#[allow(unused)]
+macro_rules! constify_imm2 {
+    ($imm2:expr, $expand:ident) => { // `$expand` names a macro that is invoked with 0, 1, 2 or 3
+        #[allow(overflowing_literals)]
+        match $imm2 & 0b11 { // only the low two bits of `$imm2` select the arm
+            0b00 => $expand!(0),
+            0b01 => $expand!(1),
+            0b10 => $expand!(2),
+            _ => $expand!(3), // after masking, the only value left is 0b11
+        }
+    };
+}
+
+/// Packs the lower halves of `rs1` and `rs2` into one register-sized value.
+///
+/// The `pack` instruction takes the low XLEN/2 bits of each operand: the bits taken from
+/// `rs1` become the lower half of the result and the bits taken from `rs2` become the upper
+/// half.
+///
+/// Source: RISC-V Cryptography Extensions Volume I: Scalar & Entropy Source Instructions
+///
+/// Version: v1.0.1
+///
+/// Section: 3.17
+///
+/// # Safety
+///
+/// This function is safe to use if the `zbkb` target feature is present.
+#[target_feature(enable = "zbkb")]
+#[cfg_attr(test, assert_instr(pack))]
+#[inline]
+pub unsafe fn pack(rs1: usize, rs2: usize) -> usize {
+    unsafe {
+        let packed: usize;
+        asm!(
+            "pack {rd},{rs1},{rs2}",
+            rd = lateout(reg) packed,
+            rs1 = in(reg) rs1,
+            rs2 = in(reg) rs2,
+            options(pure, nomem, nostack),
+        );
+        packed
+    }
+}
+
+/// Packs the least-significant bytes of `rs1` and `rs2` into one value.
+///
+/// The `packh` instruction places the low byte of each operand into the 16 least-significant
+/// bits of the result and zero extends the remaining bits.
+///
+/// Source: RISC-V Cryptography Extensions Volume I: Scalar & Entropy Source Instructions
+///
+/// Version: v1.0.1
+///
+/// Section: 3.18
+///
+/// # Safety
+///
+/// This function is safe to use if the `zbkb` target feature is present.
+#[target_feature(enable = "zbkb")]
+#[cfg_attr(test, assert_instr(packh))]
+#[inline]
+pub unsafe fn packh(rs1: usize, rs2: usize) -> usize {
+    unsafe {
+        let packed: usize;
+        asm!(
+            "packh {rd},{rs1},{rs2}",
+            rd = lateout(reg) packed,
+            rs1 = in(reg) rs1,
+            rs2 = in(reg) rs2,
+            options(pure, nomem, nostack),
+        );
+        packed
+    }
+}
+
+/// Reverses the bit order inside every byte of the source register.
+///
+/// Each byte of `rs` is processed independently by the `brev8` instruction: the bits within
+/// the byte are mirrored.
+///
+/// Source: RISC-V Cryptography Extensions Volume I: Scalar & Entropy Source Instructions
+///
+/// Version: v1.0.1
+///
+/// Section: 3.13
+///
+/// # Safety
+///
+/// This function is safe to use if the `zbkb` target feature is present.
+#[target_feature(enable = "zbkb")]
+#[cfg_attr(test, assert_instr(brev8))]
+#[inline]
+pub unsafe fn brev8(rs: usize) -> usize {
+    unsafe {
+        let reversed: usize;
+        asm!(
+            "brev8 {rd},{rs}",
+            rd = lateout(reg) reversed,
+            rs = in(reg) rs,
+            options(pure, nomem, nostack),
+        );
+        reversed
+    }
+}
+
+/// Byte-wise lookup of indices into a vector held in registers.
+///
+/// The `xperm8` instruction works on bytes. `rs1` is treated as a vector of XLEN/8 8-bit
+/// elements and `rs2` as a vector of XLEN/8 8-bit indexes. Every element of `rs2` is replaced
+/// by the element of `rs1` that it indexes, or by zero when the index is out of bounds.
+///
+/// Source: RISC-V Cryptography Extensions Volume I: Scalar & Entropy Source Instructions
+///
+/// Version: v1.0.1
+///
+/// Section: 3.47
+///
+/// # Safety
+///
+/// This function is safe to use if the `zbkx` target feature is present.
+#[target_feature(enable = "zbkx")]
+#[cfg_attr(test, assert_instr(xperm8))]
+#[inline]
+pub unsafe fn xperm8(rs1: usize, rs2: usize) -> usize {
+    unsafe {
+        let permuted: usize;
+        asm!(
+            "xperm8 {rd},{rs1},{rs2}",
+            rd = lateout(reg) permuted,
+            rs1 = in(reg) rs1,
+            rs2 = in(reg) rs2,
+            options(pure, nomem, nostack),
+        );
+        permuted
+    }
+}
+
+/// Nibble-wise lookup of indices into a vector.
+///
+/// The `xperm4` instruction works on nibbles. `rs1` is treated as a vector of XLEN/4 4-bit
+/// elements and `rs2` as a vector of XLEN/4 4-bit indexes. Every element of `rs2` is replaced
+/// by the element of `rs1` that it indexes, or by zero when the index is out of bounds.
+///
+/// Source: RISC-V Cryptography Extensions Volume I: Scalar & Entropy Source Instructions
+///
+/// Version: v1.0.1
+///
+/// Section: 3.48
+///
+/// # Safety
+///
+/// This function is safe to use if the `zbkx` target feature is present.
+#[target_feature(enable = "zbkx")]
+#[cfg_attr(test, assert_instr(xperm4))]
+#[inline]
+pub unsafe fn xperm4(rs1: usize, rs2: usize) -> usize {
+    unsafe {
+        let permuted: usize;
+        asm!(
+            "xperm4 {rd},{rs1},{rs2}",
+            rd = lateout(reg) permuted,
+            rs1 = in(reg) rs1,
+            rs2 = in(reg) rs2,
+            options(pure, nomem, nostack),
+        );
+        permuted
+    }
+}
+
+/// Computes the Sigma0 transformation used by the SHA2-256 hash function \[49\]
+/// (Section 4.1.2).
+///
+/// The instruction exists on both the RV32 and RV64 base architectures. On RV32 the whole
+/// XLEN-wide source register is operated on. On RV64 only the low 32 bits of the source
+/// register are operated on, and the result is sign extended to XLEN bits. Despite the
+/// SHA2-256 name, the instruction serves both the SHA2-224 and SHA2-256 parameterisations
+/// described in \[49\]. Implementations must guarantee that the execution latency does not
+/// depend on the data being operated on.
+///
+/// Source: RISC-V Cryptography Extensions Volume I: Scalar & Entropy Source Instructions
+///
+/// Version: v1.0.1
+///
+/// Section: 3.27
+///
+/// # Safety
+///
+/// This function is safe to use if the `zknh` target feature is present.
+#[target_feature(enable = "zknh")]
+#[cfg_attr(test, assert_instr(sha256sig0))]
+#[inline]
+pub unsafe fn sha256sig0(rs1: usize) -> usize {
+    unsafe {
+        let sigma0: usize;
+        asm!(
+            "sha256sig0 {rd},{rs1}",
+            rd = lateout(reg) sigma0,
+            rs1 = in(reg) rs1,
+            options(pure, nomem, nostack),
+        );
+        sigma0
+    }
+}
+
+/// Computes the Sigma1 transformation used by the SHA2-256 hash function \[49\]
+/// (Section 4.1.2).
+///
+/// The instruction exists on both the RV32 and RV64 base architectures. On RV32 the whole
+/// XLEN-wide source register is operated on. On RV64 only the low 32 bits of the source
+/// register are operated on, and the result is sign extended to XLEN bits. Despite the
+/// SHA2-256 name, the instruction serves both the SHA2-224 and SHA2-256 parameterisations
+/// described in \[49\]. Implementations must guarantee that the execution latency does not
+/// depend on the data being operated on.
+///
+/// Source: RISC-V Cryptography Extensions Volume I: Scalar & Entropy Source Instructions
+///
+/// Version: v1.0.1
+///
+/// Section: 3.28
+///
+/// # Safety
+///
+/// This function is safe to use if the `zknh` target feature is present.
+#[target_feature(enable = "zknh")]
+#[cfg_attr(test, assert_instr(sha256sig1))]
+#[inline]
+pub unsafe fn sha256sig1(rs1: usize) -> usize {
+    unsafe {
+        let sigma1: usize;
+        asm!(
+            "sha256sig1 {rd},{rs1}",
+            rd = lateout(reg) sigma1,
+            rs1 = in(reg) rs1,
+            options(pure, nomem, nostack),
+        );
+        sigma1
+    }
+}
+
+/// Implements the Sum0 transformation function as used in the SHA2-256 hash function \[49\]
+/// (Section 4.1.2).
+///
+/// This instruction is supported for both RV32 and RV64 base architectures. For RV32, the
+/// entire XLEN source register is operated on. For RV64, the low 32 bits of the source
+/// register are operated on, and the result sign extended to XLEN bits. Though named for
+/// SHA2-256, the instruction works for both the SHA2-224 and SHA2-256 parameterisations as
+/// described in \[49\]. This instruction must always be implemented such that its execution
+/// latency does not depend on the data being operated on.
+///
+/// Source: RISC-V Cryptography Extensions Volume I: Scalar & Entropy Source Instructions
+///
+/// Version: v1.0.1
+///
+/// Section: 3.29
+///
+/// # Safety
+///
+/// This function is safe to use if the `zknh` target feature is present.
+#[target_feature(enable = "zknh")]
+#[cfg_attr(test, assert_instr(sha256sum0))]
+#[inline]
+pub unsafe fn sha256sum0(rs1: usize) -> usize {
+    let value: usize;
+    unsafe {
+        asm!(
+            // The mnemonic must be `sha256sum0`, matching the `assert_instr` check above.
+            "sha256sum0 {rd},{rs1}",
+            rd = lateout(reg) value,
+            rs1 = in(reg) rs1,
+            options(pure, nomem, nostack),
+        )
+    }
+    value
+}
+
+/// Implements the Sum1 transformation function as used in the SHA2-256 hash function \[49\]
+/// (Section 4.1.2).
+///
+/// This instruction is supported for both RV32 and RV64 base architectures. For RV32, the
+/// entire XLEN source register is operated on. For RV64, the low 32 bits of the source
+/// register are operated on, and the result sign extended to XLEN bits. Though named for
+/// SHA2-256, the instruction works for both the SHA2-224 and SHA2-256 parameterisations as
+/// described in \[49\]. This instruction must always be implemented such that its execution
+/// latency does not depend on the data being operated on.
+///
+/// Source: RISC-V Cryptography Extensions Volume I: Scalar & Entropy Source Instructions
+///
+/// Version: v1.0.1
+///
+/// Section: 3.30
+///
+/// # Safety
+///
+/// This function is safe to use if the `zknh` target feature is present.
+#[target_feature(enable = "zknh")]
+#[cfg_attr(test, assert_instr(sha256sum1))]
+#[inline]
+pub unsafe fn sha256sum1(rs1: usize) -> usize {
+    let value: usize;
+    unsafe {
+        asm!(
+            // The mnemonic must be `sha256sum1`, matching the `assert_instr` check above.
+            "sha256sum1 {rd},{rs1}",
+            rd = lateout(reg) value,
+            rs1 = in(reg) rs1,
+            options(pure, nomem, nostack),
+        )
+    }
+    value
+}
+
+/// Accelerates the block encrypt/decrypt operation of the SM4 block cipher \[5, 31\].
+///
+/// Implements a T-tables in hardware style approach to accelerating the SM4 round function. A
+/// byte is extracted from rs2 based on bs, to which the SBox and linear layer transforms are
+/// applied, before the result is XOR’d with rs1 and written back to rd. This instruction
+/// exists on RV32 and RV64 base architectures. On RV64, the 32-bit result is sign extended to
+/// XLEN bits. This instruction must always be implemented such that its execution latency does
+/// not depend on the data being operated on.
+///
+/// Source: RISC-V Cryptography Extensions Volume I: Scalar & Entropy Source Instructions
+///
+/// Version: v1.0.1
+///
+/// Section: 3.43
+///
+/// # Note
+///
+/// The `bs` parameter is expected to be a constant value and only the bottom 2 bits of `bs` are
+/// used.
+///
+/// # Safety
+///
+/// This function is safe to use if the `zksed` target feature is present.
+///
+/// # Details
+///
+/// Accelerates the round function `F` in the SM4 block cipher algorithm
+///
+/// This instruction is included in extension `Zksed`. It's defined as:
+///
+/// ```text
+/// SM4ED(x, a, BS) = x ⊕ T(ai)
+/// ... where
+/// ai = a.bytes[BS]
+/// T(ai) = L(τ(ai))
+/// bi = τ(ai) = SM4-S-Box(ai)
+/// ci = L(bi) = bi ⊕ (bi ≪ 2) ⊕ (bi ≪ 10) ⊕ (bi ≪ 18) ⊕ (bi ≪ 24)
+/// SM4ED = (ci ≪ (BS * 8)) ⊕ x
+/// ```
+///
+/// where `⊕` represents 32-bit xor, and `≪ k` represents rotate left by `k` bits.
+/// As is defined above, `T` is a combined transformation of non linear S-Box transform `τ`
+/// and linear layer transform `L`.
+///
+/// In the SM4 algorithm, the round function `F` is defined as:
+///
+/// ```text
+/// F(x0, x1, x2, x3, rk) = x0 ⊕ T(x1 ⊕ x2 ⊕ x3 ⊕ rk)
+/// ... where
+/// T(A) = L(τ(A))
+/// B = τ(A) = (SM4-S-Box(a0), SM4-S-Box(a1), SM4-S-Box(a2), SM4-S-Box(a3))
+/// C = L(B) = B ⊕ (B ≪ 2) ⊕ (B ≪ 10) ⊕ (B ≪ 18) ⊕ (B ≪ 24)
+/// ```
+///
+/// It can be implemented by `sm4ed` instruction like:
+///
+/// ```no_run
+/// # #[cfg(any(target_arch = "riscv32", target_arch = "riscv64"))]
+/// # fn round_function(x0: usize, x1: usize, x2: usize, x3: usize, rk: usize) -> usize {
+/// # #[cfg(target_arch = "riscv32")] use core::arch::riscv32::sm4ed;
+/// # #[cfg(target_arch = "riscv64")] use core::arch::riscv64::sm4ed;
+/// let a = x1 ^ x2 ^ x3 ^ rk;
+/// // SAFETY: the caller must have verified that the `zksed` target feature is present.
+/// unsafe {
+///     let c0 = sm4ed(x0, a, 0);
+///     let c1 = sm4ed(c0, a, 1); // c1 represents c[0..=1], etc.
+///     let c2 = sm4ed(c1, a, 2);
+///     let c3 = sm4ed(c2, a, 3);
+///     c3 // c3 represents c[0..=3]
+/// }
+/// # }
+/// ```
+#[target_feature(enable = "zksed")]
+#[cfg_attr(test, assert_instr(sm4ed))]
+#[inline]
+pub unsafe fn sm4ed(rs1: usize, rs2: usize, bs: u8) -> usize {
+    // Expanded once per possible `bs` value so the byte selector is baked into the
+    // instruction text as an immediate.
+    macro_rules! sm4ed {
+        ($imm2:expr) => {{
+            let value: usize;
+            unsafe {
+                asm!(
+                    concat!("sm4ed {rd},{rs1},{rs2},", $imm2),
+                    rd = lateout(reg) value,
+                    rs1 = in(reg) rs1,
+                    rs2 = in(reg) rs2,
+                    options(pure, nomem, nostack),
+                )
+            }
+            value
+        }}
+    }
+    constify_imm2!(bs, sm4ed)
+}
+
+/// Accelerates the Key Schedule operation of the SM4 block cipher \[5, 31\].
+///
+/// Implements a T-tables in hardware style approach to accelerating the SM4 Key Schedule. A
+/// byte is extracted from rs2 based on bs, to which the SBox and linear layer transforms are
+/// applied, before the result is XOR’d with rs1 and written back to rd. This instruction
+/// exists on RV32 and RV64 base architectures. On RV64, the 32-bit result is sign extended to
+/// XLEN bits. This instruction must always be implemented such that its execution latency does
+/// not depend on the data being operated on.
+///
+/// Source: RISC-V Cryptography Extensions Volume I: Scalar & Entropy Source Instructions
+///
+/// Version: v1.0.1
+///
+/// Section: 3.44
+///
+/// # Note
+///
+/// The `bs` parameter is expected to be a constant value and only the bottom 2 bits of `bs` are
+/// used.
+///
+/// # Safety
+///
+/// This function is safe to use if the `zksed` target feature is present.
+///
+/// # Details
+///
+/// Accelerates the key schedule operation in the SM4 block cipher algorithm
+///
+/// This instruction is included in extension `Zksed`. It's defined as:
+///
+/// ```text
+/// SM4KS(x, k, BS) = x ⊕ T'(ki)
+/// ... where
+/// ki = k.bytes[BS]
+/// T'(ki) = L'(τ(ki))
+/// bi = τ(ki) = SM4-S-Box(ki)
+/// ci = L'(bi) = bi ⊕ (bi ≪ 13) ⊕ (bi ≪ 23)
+/// SM4KS = (ci ≪ (BS * 8)) ⊕ x
+/// ```
+///
+/// where `⊕` represents 32-bit xor, and `≪ k` represents rotate left by `k` bits.
+/// As is defined above, `T'` is a combined transformation of non linear S-Box transform `τ`
+/// and the replaced linear layer transform `L'`.
+///
+/// In the SM4 algorithm, the key schedule is defined as:
+///
+/// ```text
+/// rk[i] = K[i+4] = K[i] ⊕ T'(K[i+1] ⊕ K[i+2] ⊕ K[i+3] ⊕ CK[i])
+/// ... where
+/// K[0..=3] = MK[0..=3] ⊕ FK[0..=3]
+/// T'(K) = L'(τ(K))
+/// B = τ(K) = (SM4-S-Box(k0), SM4-S-Box(k1), SM4-S-Box(k2), SM4-S-Box(k3))
+/// C = L'(B) = B ⊕ (B ≪ 13) ⊕ (B ≪ 23)
+/// ```
+///
+/// where `MK` represents the input 128-bit encryption key,
+/// constants `FK` and `CK` are fixed system configuration constant values defined by the SM4
+/// algorithm. Hence, the key schedule operation can be implemented by `sm4ks` instruction like:
+///
+/// ```no_run
+/// # #[cfg(any(target_arch = "riscv32", target_arch = "riscv64"))]
+/// # fn key_schedule(k0: usize, k1: usize, k2: usize, k3: usize, ck_i: usize) -> usize {
+/// # #[cfg(target_arch = "riscv32")] use core::arch::riscv32::sm4ks;
+/// # #[cfg(target_arch = "riscv64")] use core::arch::riscv64::sm4ks;
+/// let k = k1 ^ k2 ^ k3 ^ ck_i;
+/// // SAFETY: the caller must have verified that the `zksed` target feature is present.
+/// unsafe {
+///     let c0 = sm4ks(k0, k, 0);
+///     let c1 = sm4ks(c0, k, 1); // c1 represents c[0..=1], etc.
+///     let c2 = sm4ks(c1, k, 2);
+///     let c3 = sm4ks(c2, k, 3);
+///     c3 // c3 represents c[0..=3]
+/// }
+/// # }
+/// ```
+#[target_feature(enable = "zksed")]
+#[cfg_attr(test, assert_instr(sm4ks))]
+#[inline]
+pub unsafe fn sm4ks(rs1: usize, rs2: usize, bs: u8) -> usize {
+    // Expanded once per possible `bs` value so the byte selector is baked into the
+    // instruction text as an immediate.
+    macro_rules! sm4ks {
+        ($imm2:expr) => {{
+            let value: usize;
+            unsafe {
+                asm!(
+                    concat!("sm4ks {rd},{rs1},{rs2},", $imm2),
+                    rd = lateout(reg) value,
+                    rs1 = in(reg) rs1,
+                    rs2 = in(reg) rs2,
+                    options(pure, nomem, nostack),
+                )
+            }
+            value
+        }}
+    }
+    constify_imm2!(bs, sm4ks)
+}
+
+/// Computes the P0 transformation function of the SM3 hash function \[4, 30\].
+///
+/// The instruction exists on both the RV32 and RV64 base architectures and implements the P0
+/// transform of the SM3 hash function \[4, 30\]. Implementations must guarantee that the
+/// execution latency does not depend on the data being operated on.
+///
+/// Source: RISC-V Cryptography Extensions Volume I: Scalar & Entropy Source Instructions
+///
+/// Version: v1.0.1
+///
+/// Section: 3.41
+///
+/// # Safety
+///
+/// This function is safe to use if the `zksh` target feature is present.
+///
+/// # Details
+///
+/// `P0` transformation function as is used in the SM3 hash algorithm
+///
+/// This function belongs to the `Zksh` extension and is defined as:
+///
+/// ```text
+/// P0(X) = X ⊕ (X ≪ 9) ⊕ (X ≪ 17)
+/// ```
+///
+/// where `⊕` represents 32-bit xor, and `≪ k` represents rotate left by `k` bits.
+///
+/// In the SM3 algorithm, the `P0` transformation is used as `E ← P0(TT2)` when the
+/// compression function `CF` uses the intermediate value `TT2` to calculate
+/// the variable `E` in one iteration for subsequent processes.
+#[target_feature(enable = "zksh")]
+#[cfg_attr(test, assert_instr(sm3p0))]
+#[inline]
+pub unsafe fn sm3p0(rs1: usize) -> usize {
+    unsafe {
+        let p0: usize;
+        asm!(
+            "sm3p0 {rd},{rs1}",
+            rd = lateout(reg) p0,
+            rs1 = in(reg) rs1,
+            options(pure, nomem, nostack),
+        );
+        p0
+    }
+}
+
+/// Computes the P1 transformation function of the SM3 hash function \[4, 30\].
+///
+/// The instruction exists on both the RV32 and RV64 base architectures and implements the P1
+/// transform of the SM3 hash function \[4, 30\]. Implementations must guarantee that the
+/// execution latency does not depend on the data being operated on.
+///
+/// Source: RISC-V Cryptography Extensions Volume I: Scalar & Entropy Source Instructions
+///
+/// Version: v1.0.1
+///
+/// Section: 3.42
+///
+/// # Safety
+///
+/// This function is safe to use if the `zksh` target feature is present.
+///
+/// # Details
+///
+/// `P1` transformation function as is used in the SM3 hash algorithm
+///
+/// This function belongs to the `Zksh` extension and is defined as:
+///
+/// ```text
+/// P1(X) = X ⊕ (X ≪ 15) ⊕ (X ≪ 23)
+/// ```
+///
+/// where `⊕` represents 32-bit xor, and `≪ k` represents rotate left by `k` bits.
+///
+/// In the SM3 algorithm, the `P1` transformation is used to expand message,
+/// where expanded word `Wj` can be generated from the previous words.
+/// The whole process can be described as the following pseudocode:
+///
+/// ```text
+/// FOR j=16 TO 67
+///     Wj ← P1(Wj−16 ⊕ Wj−9 ⊕ (Wj−3 ≪ 15)) ⊕ (Wj−13 ≪ 7) ⊕ Wj−6
+/// ENDFOR
+/// ```
+#[target_feature(enable = "zksh")]
+#[cfg_attr(test, assert_instr(sm3p1))]
+#[inline]
+pub unsafe fn sm3p1(rs1: usize) -> usize {
+    unsafe {
+        let p1: usize;
+        asm!(
+            "sm3p1 {rd},{rs1}",
+            rd = lateout(reg) p1,
+            rs1 = in(reg) rs1,
+            options(pure, nomem, nostack),
+        );
+        p1
+    }
+}