From 4636c59df5a7be4e47758588ad188bcb1f666f7c Mon Sep 17 00:00:00 2001
From: Antoni Boucher <bouanto@zoho.com>
Date: Fri, 29 Apr 2022 23:14:26 -0400
Subject: [PATCH] Add more SIMD

---
 src/builder.rs        |  2 +-
 src/consts.rs         | 15 +-------
 src/context.rs        |  9 +++++
 src/intrinsic/llvm.rs | 19 ++++++++++
 src/intrinsic/simd.rs | 87 +++++++++++++++++++++++++++++++++++--------
 5 files changed, 102 insertions(+), 30 deletions(-)

diff --git a/src/builder.rs b/src/builder.rs
index 9a5cf785a1f5..f0b93c3d5170 100644
--- a/src/builder.rs
+++ b/src/builder.rs
@@ -1343,7 +1343,7 @@ impl<'a, 'gcc, 'tcx> Builder<'a, 'gcc, 'tcx> {
                 }
                 // TODO(antoyo): switch to using new_vector_access.
                 let array = self.context.new_bitcast(None, v2, array_type);
-                for i in 0..vec_num_units {
+                for i in 0..(mask_num_units - vec_num_units) {
                     elements.push(self.context.new_array_access(None, array, self.context.new_rvalue_from_int(self.int_type, i as i32)).to_rvalue());
                 }
                 let v1 = self.context.new_rvalue_from_vector(None, result_type, &elements);
diff --git a/src/consts.rs b/src/consts.rs
index 4350c00e94a7..4b517fd85f05 100644
--- a/src/consts.rs
+++ b/src/consts.rs
@@ -27,12 +27,7 @@ impl<'gcc, 'tcx> CodegenCx<'gcc, 'tcx> {
         }
         // NOTE: since bitcast makes a value non-constant, don't bitcast if not necessary as some
         // SIMD builtins require a constant value.
-        if value.get_type() != typ {
-            self.context.new_bitcast(None, value, typ)
-        }
-        else {
-            value
-        }
+        self.bitcast_if_needed(value, typ)
     }
 }
 
@@ -86,13 +81,7 @@ impl<'gcc, 'tcx> StaticMethods for CodegenCx<'gcc, 'tcx> {
 
         // TODO(antoyo): set alignment.
 
-        let value =
-            if value.get_type() != gcc_type {
-                self.context.new_bitcast(None, value, gcc_type)
-            }
-            else {
-                value
-            };
+        let value = self.bitcast_if_needed(value, gcc_type);
         global.global_set_initializer_rvalue(value);
 
         // As an optimization, all shared statics which do not have interior
diff --git a/src/context.rs b/src/context.rs
index 83c4683a6683..92b30ef9b4d8 100644
--- a/src/context.rs
+++ b/src/context.rs
@@ -279,6 +279,15 @@ impl<'gcc, 'tcx> CodegenCx<'gcc, 'tcx> {
     pub fn sess(&self) -> &Session {
         &self.tcx.sess
     }
+
+    /// Gives back `value` typed as `expected_type`, bitcasting only when its type differs.
+    pub fn bitcast_if_needed(&self, value: RValue<'gcc>, expected_type: Type<'gcc>) -> RValue<'gcc> {
+        // A bitcast makes a value non-constant, so only emit one on a real type mismatch.
+        if value.get_type() != expected_type {
+            return self.context.new_bitcast(None, value, expected_type);
+        }
+        value
+    }
 }
 
 impl<'gcc, 'tcx> BackendTypes for CodegenCx<'gcc, 'tcx> {
diff --git a/src/intrinsic/llvm.rs b/src/intrinsic/llvm.rs
index 4b41b0ba6e78..aab93b927558 100644
--- a/src/intrinsic/llvm.rs
+++ b/src/intrinsic/llvm.rs
@@ -21,6 +21,25 @@ pub fn intrinsic<'gcc, 'tcx>(name: &str, cx: &CodegenCx<'gcc, 'tcx>) -> Function
         "llvm.x86.xgetbv" => "__builtin_ia32_xgetbv",
         // NOTE: this doc specifies the equivalent GCC builtins: http://huonw.github.io/llvmint/llvmint/x86/index.html
         "llvm.sqrt.v2f64" => "__builtin_ia32_sqrtpd",
+
+        // The above doc points to unknown builtins for the following, so override them:
+        "llvm.x86.avx2.gather.d.d" => "__builtin_ia32_gathersiv4si",
+        "llvm.x86.avx2.gather.d.d.256" => "__builtin_ia32_gathersiv8si",
+        "llvm.x86.avx2.gather.d.ps" => "__builtin_ia32_gathersiv4sf",
+        "llvm.x86.avx2.gather.d.ps.256" => "__builtin_ia32_gathersiv8sf",
+        "llvm.x86.avx2.gather.d.q" => "__builtin_ia32_gathersiv2di",
+        "llvm.x86.avx2.gather.d.q.256" => "__builtin_ia32_gathersiv4di",
+        "llvm.x86.avx2.gather.d.pd" => "__builtin_ia32_gathersiv2df",
+        "llvm.x86.avx2.gather.d.pd.256" => "__builtin_ia32_gathersiv4df",
+        "llvm.x86.avx2.gather.q.d" => "__builtin_ia32_gatherdiv4si",
+        "llvm.x86.avx2.gather.q.d.256" => "__builtin_ia32_gatherdiv4si256",
+        "llvm.x86.avx2.gather.q.ps" => "__builtin_ia32_gatherdiv4sf",
+        "llvm.x86.avx2.gather.q.ps.256" => "__builtin_ia32_gatherdiv4sf256",
+        "llvm.x86.avx2.gather.q.q" => "__builtin_ia32_gatherdiv2di",
+        "llvm.x86.avx2.gather.q.q.256" => "__builtin_ia32_gatherdiv4di",
+        "llvm.x86.avx2.gather.q.pd" => "__builtin_ia32_gatherdiv2df",
+        "llvm.x86.avx2.gather.q.pd.256" => "__builtin_ia32_gatherdiv4df",
+        "" => "",
         // NOTE: this file is generated by https://github.com/GuillaumeGomez/llvmint/blob/master/generate_list.py
         _ => include!("archs.rs"),
     };
diff --git a/src/intrinsic/simd.rs b/src/intrinsic/simd.rs
index e5753e318c7f..9204fbdfaba7 100644
--- a/src/intrinsic/simd.rs
+++ b/src/intrinsic/simd.rs
@@ -202,14 +202,8 @@ pub fn generic_simd_intrinsic<'a, 'gcc, 'tcx>(bx: &mut Builder<'a, 'gcc, 'tcx>,
             };
         let builtin = bx.context.get_target_builtin_function(func_name);
         let param1_type = builtin.get_param(0).to_rvalue().get_type();
-        let vector =
-            if vector.get_type() != param1_type {
-                // TODO(antoyo): perhaps use __builtin_convertvector for vector casting.
-                bx.context.new_bitcast(None, vector, param1_type)
-            }
-            else {
-                vector
-            };
+        // TODO(antoyo): perhaps use __builtin_convertvector for vector casting.
+        let vector = bx.cx.bitcast_if_needed(vector, param1_type);
         let result = bx.context.new_call(None, builtin, &[vector, value, bx.context.new_cast(None, index, bx.int_type)]);
         // TODO(antoyo): perhaps use __builtin_convertvector for vector casting.
         return Ok(bx.context.new_bitcast(None, result, vector.get_type()));
@@ -539,18 +533,79 @@ pub fn generic_simd_intrinsic<'a, 'gcc, 'tcx>(bx: &mut Builder<'a, 'gcc, 'tcx>,
         let vec_ty = bx.cx.type_vector(elem_ty, in_len as u64);
 
         let func = bx.context.get_target_builtin_function(builtin_name);
-        let param1_type = func.get_parameter(0).get_type();
-        let lhs =
-            if lhs.get_type() != param1_type {
-                bx.context.new_bitcast(None, lhs, param1_type)
-            }
-            else {
-                lhs
-            };
+        let param1_type = func.get_param(0).to_rvalue().get_type();
+        let param2_type = func.get_param(1).to_rvalue().get_type();
+        let lhs = bx.cx.bitcast_if_needed(lhs, param1_type);
+        let rhs = bx.cx.bitcast_if_needed(rhs, param2_type);
         let result = bx.context.new_call(None, func, &[lhs, rhs]);
         // TODO(antoyo): perhaps use __builtin_convertvector for vector casting.
         return Ok(bx.context.new_bitcast(None, result, vec_ty));
     }
 
+    // Generates the handler for one arithmetic SIMD reduction intrinsic (`sym::$name`).
+    // `$integer_reduce` / `$float_reduce` are the builder methods used for integer and float
+    // elements, `$ordered` tells whether `args[1]` supplies an accumulator, `$op` combines that
+    // accumulator with an integer result and `$identity` seeds unordered float reductions.
+    macro_rules! arith_red {
+        ($name:ident : $integer_reduce:ident, $float_reduce:ident, $ordered:expr, $op:ident,
+         $identity:expr) => {
+            if name == sym::$name {
+                // The result must have the element type of the input vector.
+                require!(
+                    ret_ty == in_elem,
+                    "expected return type `{}` (element of input `{}`), found `{}`",
+                    in_elem, in_ty, ret_ty
+                );
+                return match in_elem.kind() {
+                    ty::Int(_) | ty::Uint(_) => {
+                        // Emit the reduction once; both branches below reuse `r`.
+                        let r = bx.$integer_reduce(args[0].immediate());
+                        if $ordered {
+                            // if overflow occurs, the result is the
+                            // mathematical result modulo 2^n:
+                            Ok(bx.$op(args[1].immediate(), r))
+                        } else {
+                            Ok(r)
+                        }
+                    }
+                    ty::Float(f) => {
+                        let acc = if $ordered {
+                            // ordered arithmetic reductions take an accumulator
+                            args[1].immediate()
+                        } else {
+                            // unordered arithmetic reductions use the identity accumulator
+                            match f.bit_width() {
+                                32 => bx.const_real(bx.type_f32(), $identity),
+                                64 => bx.const_real(bx.type_f64(), $identity),
+                                v => return_error!(
+                                    r#"
+unsupported {} from `{}` with element `{}` of size `{}` to `{}`"#,
+                                    sym::$name, in_ty, in_elem, v, ret_ty
+                                ),
+                            }
+                        };
+                        Ok(bx.$float_reduce(acc, args[0].immediate()))
+                    }
+                    _ => return_error!(
+                        "unsupported {} from `{}` with element `{}` to `{}`",
+                        sym::$name,
+                        in_ty,
+                        in_elem,
+                        ret_ty
+                    ),
+                };
+            }
+        };
+    }
+
+    // TODO: use a recursive algorithm a-la Hacker's Delight.
+    arith_red!(
+        simd_reduce_add_unordered: vector_reduce_add,
+        vector_reduce_fadd_fast,
+        false,
+        add,
+        0.0
+    );
+
     unimplemented!("simd {}", name);
 }