Merge pull request #172 from rust-lang/feature/more-simd

Implement more SIMD intrinsics
2022-05-04 23:39:04 -04:00 · 2022-05-04 23:39:04 -04:00 · e062c37125
commit e062c37125
parent 852735da05 4a9744059f
9 changed files with 580 additions and 35 deletions
--- a/src/asm.rs
+++ b/src/asm.rs
@ -595,7 +595,7 @@ fn reg_to_gcc(reg: InlineAsmRegOrRegClass) -> ConstraintOrRegister {
            InlineAsmRegClass::X86(X86InlineAsmRegClass::xmm_reg)
            | InlineAsmRegClass::X86(X86InlineAsmRegClass::ymm_reg) => "x",
            InlineAsmRegClass::X86(X86InlineAsmRegClass::zmm_reg) => "v",
-            InlineAsmRegClass::X86(X86InlineAsmRegClass::kreg) => unimplemented!(),
+            InlineAsmRegClass::X86(X86InlineAsmRegClass::kreg) => "Yk",
            InlineAsmRegClass::Wasm(WasmInlineAsmRegClass::local) => unimplemented!(),
            InlineAsmRegClass::X86(
                X86InlineAsmRegClass::x87_reg | X86InlineAsmRegClass::mmx_reg,
--- a/src/builder.rs
+++ b/src/builder.rs
@ -7,6 +7,7 @@ use gccjit::{
    BinaryOp,
    Block,
    ComparisonOp,
+    Context,
    Function,
    LValue,
    RValue,
@ -47,6 +48,7 @@ use rustc_target::spec::{HasTargetSpec, Target};

 use crate::common::{SignType, TypeReflection, type_is_pointer};
 use crate::context::CodegenCx;
+use crate::intrinsic::llvm;
 use crate::type_of::LayoutGccExt;

 // TODO(antoyo)
@ -216,11 +218,17 @@ impl<'a, 'gcc, 'tcx> Builder<'a, 'gcc, 'tcx> {
            return Cow::Borrowed(args);
        }

+        let func_name = format!("{:?}", func_ptr);
+
        let casted_args: Vec<_> = param_types
            .into_iter()
            .zip(args.iter())
            .enumerate()
            .map(|(index, (expected_ty, &actual_val))| {
+                if llvm::ignore_arg_cast(&func_name, index, args.len()) {
+                    return actual_val;
+                }
+
                let actual_ty = actual_val.get_type();
                if expected_ty != actual_ty {
                    if !actual_ty.is_vector() && !expected_ty.is_vector() && actual_ty.is_integral() && expected_ty.is_integral() && actual_ty.get_size() != expected_ty.get_size() {
@ -297,6 +305,8 @@ impl<'a, 'gcc, 'tcx> Builder<'a, 'gcc, 'tcx> {
        if return_type != void_type {
            unsafe { RETURN_VALUE_COUNT += 1 };
            let result = current_func.new_local(None, return_type, &format!("ptrReturnValue{}", unsafe { RETURN_VALUE_COUNT }));
+            let func_name = format!("{:?}", func_ptr);
+            let args = llvm::adjust_intrinsic_arguments(&self, gcc_func, args, &func_name);
            self.block.add_assignment(None, result, self.cx.context.new_call_through_ptr(None, func_ptr, &args));
            result.to_rvalue()
        }
@ -1316,6 +1326,11 @@ impl<'a, 'gcc, 'tcx> Builder<'a, 'gcc, 'tcx> {
                element_type
            }
            else {
+                #[cfg(feature="master")]
+                {
+                    self.cx.type_ix(element_type.get_size() as u64 * 8)
+                }
+                #[cfg(not(feature="master"))]
                self.int_type
            };
        for i in 0..mask_num_units {
@ -1343,7 +1358,7 @@ impl<'a, 'gcc, 'tcx> Builder<'a, 'gcc, 'tcx> {
                }
                // TODO(antoyo): switch to using new_vector_access.
                let array = self.context.new_bitcast(None, v2, array_type);
-                for i in 0..vec_num_units {
+                for i in 0..(mask_num_units - vec_num_units) {
                    elements.push(self.context.new_array_access(None, array, self.context.new_rvalue_from_int(self.int_type, i as i32)).to_rvalue());
                }
                let v1 = self.context.new_rvalue_from_vector(None, result_type, &elements);
@ -1380,6 +1395,98 @@ impl<'a, 'gcc, 'tcx> Builder<'a, 'gcc, 'tcx> {
    pub fn shuffle_vector(&mut self, _v1: RValue<'gcc>, _v2: RValue<'gcc>, _mask: RValue<'gcc>) -> RValue<'gcc> {
        unimplemented!();
    }
+
+    /// Reduces the lanes of the vector `src` to a single value using `op`.
+    ///
+    /// Each round builds a permutation mask that selects lane `(i + shift) % element_count` for
+    /// lane `i`, permutes the running vector with it and combines the running vector with the
+    /// permuted copy through `op`. `shift` starts at 1 and doubles every round until it reaches
+    /// `element_count`; lane 0 of the final vector is returned.
+    ///
+    /// Panics if the type of `src` is not a vector type.
+    ///
+    /// NOTE(review): lane 0 presumably only holds the complete reduction when `op` is associative
+    /// and commutative; for lane counts that are not a power of two the wrap-around makes some
+    /// lanes contribute more than once — confirm against callers.
+    #[cfg(feature="master")]
+    pub fn vector_reduce<F>(&mut self, src: RValue<'gcc>, op: F) -> RValue<'gcc>
+    where F: Fn(RValue<'gcc>, RValue<'gcc>, &'gcc Context<'gcc>) -> RValue<'gcc>
+    {
+        let vector_type = src.get_type().unqualified().dyncast_vector().expect("vector type");
+        let element_count = vector_type.get_num_units();
+        // Lane indices 0..element_count, reused as the base of every permutation mask.
+        let mut vector_elements = vec![];
+        for i in 0..element_count {
+            vector_elements.push(i);
+        }
+        let mask_type = self.context.new_vector_type(self.int_type, element_count as u64);
+        let mut shift = 1;
+        let mut res = src;
+        while shift < element_count {
+            // Mask lane i selects source lane (i + shift), wrapping around at the end.
+            let vector_elements: Vec<_> =
+                vector_elements.iter()
+                    .map(|i| self.context.new_rvalue_from_int(self.int_type, ((i + shift) % element_count) as i32))
+                    .collect();
+            let mask = self.context.new_rvalue_from_vector(None, mask_type, &vector_elements);
+            let shifted = self.context.new_rvalue_vector_perm(None, res, res, mask);
+            shift *= 2;
+            res = op(res, shifted, &self.context);
+        }
+        // The result of the reduction is read from lane 0.
+        self.context.new_vector_access(None, res, self.context.new_rvalue_zero(self.int_type))
+            .to_rvalue()
+    }
+
+    /// Variant compiled when the `master` feature is disabled: it has no implementation and
+    /// always panics through `unimplemented!()`.
+    #[cfg(not(feature="master"))]
+    pub fn vector_reduce<F>(&mut self, src: RValue<'gcc>, op: F) -> RValue<'gcc>
+    where F: Fn(RValue<'gcc>, RValue<'gcc>, &'gcc Context<'gcc>) -> RValue<'gcc>
+    {
+        unimplemented!();
+    }
+
+    /// Reduces the lanes of `src` with the binary operation `op`.
+    ///
+    /// The combining step handed to `vector_reduce` applies `op` to both operands, using the type
+    /// of the left operand as the result type.
+    pub fn vector_reduce_op(&mut self, src: RValue<'gcc>, op: BinaryOp) -> RValue<'gcc> {
+        let combine = |lhs: RValue<'gcc>, rhs: RValue<'gcc>, ctx: &'gcc Context<'gcc>| {
+            ctx.new_binary_op(None, op, lhs.get_type(), lhs, rhs)
+        };
+        self.vector_reduce(src, combine)
+    }
+
+    /// Not implemented yet: both arguments are ignored and the call always panics through
+    /// `unimplemented!()`.
+    pub fn vector_reduce_fadd_fast(&mut self, _acc: RValue<'gcc>, _src: RValue<'gcc>) -> RValue<'gcc> {
+        unimplemented!();
+    }
+
+    /// Not implemented yet: both arguments are ignored and the call always panics through
+    /// `unimplemented!()`.
+    pub fn vector_reduce_fmul_fast(&mut self, _acc: RValue<'gcc>, _src: RValue<'gcc>) -> RValue<'gcc> {
+        unimplemented!();
+    }
+
+    /// Reduces `src` to the smallest of its lanes.
+    ///
+    /// Inspired by Hacker's Delight min implementation: `difference_or_zero(a, b)` is
+    /// `(a - b) & (b >= a)`, so `b + difference_or_zero(a, b)` is `a` in the lanes where `b >= a`
+    /// and `b` in the other lanes, i.e. the smaller operand (assuming the comparison produces
+    /// all-ones lanes where it holds and zero lanes elsewhere).
+    pub fn vector_reduce_min(&mut self, src: RValue<'gcc>) -> RValue<'gcc> {
+        self.vector_reduce(src, |a, b, context| {
+            let differences_or_zeros = difference_or_zero(a, b, context);
+            // min(a, b) = b + ((a - b) & (b >= a))
+            context.new_binary_op(None, BinaryOp::Plus, b.get_type(), b, differences_or_zeros)
+        })
+    }
+
+    /// Reduces `src` to the largest of its lanes.
+    ///
+    /// Inspired by Hacker's Delight max implementation: `difference_or_zero(a, b)` is
+    /// `(a - b) & (b >= a)`, so `a - difference_or_zero(a, b)` is `b` in the lanes where `b >= a`
+    /// and `a` in the other lanes, i.e. the larger operand (assuming the comparison produces
+    /// all-ones lanes where it holds and zero lanes elsewhere).
+    pub fn vector_reduce_max(&mut self, src: RValue<'gcc>) -> RValue<'gcc> {
+        self.vector_reduce(src, |a, b, context| {
+            let differences_or_zeros = difference_or_zero(a, b, context);
+            // max(a, b) = a - ((a - b) & (b >= a))
+            context.new_binary_op(None, BinaryOp::Minus, a.get_type(), a, differences_or_zeros)
+        })
+    }
+
+    /// Selects, lane by lane, `then_val` where `cond` is non-zero and `else_val` elsewhere.
+    ///
+    /// The selection is done with bit masks: `(masks & then_val) | (!masks & else_val)`, where
+    /// `masks` is the result of comparing `cond` against a zero vector.
+    ///
+    /// Panics if the type of `cond` is not a vector type.
+    pub fn vector_select(&mut self, cond: RValue<'gcc>, then_val: RValue<'gcc>, else_val: RValue<'gcc>) -> RValue<'gcc> {
+        // cond is a vector of integers, not of bools.
+        let cond_type = cond.get_type();
+        let vector_type = cond_type.unqualified().dyncast_vector().expect("vector type");
+        let num_units = vector_type.get_num_units();
+        let element_type = vector_type.get_element_type();
+        let zeros = vec![self.context.new_rvalue_zero(element_type); num_units];
+        let zeros = self.context.new_rvalue_from_vector(None, cond_type, &zeros);
+
+        // NOTE: the masking below relies on the comparison yielding all-ones lanes where `cond`
+        // is non-zero and zero lanes elsewhere.
+        let masks = self.context.new_comparison(None, ComparisonOp::NotEquals, cond, zeros);
+        let then_vals = masks & then_val;
+
+        // Complement the masks bit by bit (XOR with all ones). Adding one instead would turn the
+        // zero lanes into 1 and keep only the lowest bit of the corresponding `else_val` lanes.
+        let minus_ones = vec![self.context.new_rvalue_from_int(element_type, -1); num_units];
+        let minus_ones = self.context.new_rvalue_from_vector(None, cond_type, &minus_ones);
+        let inverted_masks = masks ^ minus_ones;
+        // NOTE: sometimes, the type of else_val can be different than the type of then_val in
+        // libgccjit (vector of int vs vector of int32_t), but they should be the same for the AND
+        // operation to work.
+        let else_val = self.context.new_bitcast(None, else_val, then_val.get_type());
+        let else_vals = inverted_masks & else_val;
+
+        then_vals | else_vals
+    }
+}
+
+/// Computes `(a - b) & (b >= a)` on the given values: the difference `a - b` is bitwise ANDed
+/// with the result of the `GreaterThanEquals` comparison of `b` against `a`.
+///
+/// NOTE(review): presumably the comparison yields all-ones lanes where `b >= a` and zero lanes
+/// elsewhere, so that the result is `a - b` in the former lanes and 0 in the latter — confirm.
+fn difference_or_zero<'gcc>(a: RValue<'gcc>, b: RValue<'gcc>, context: &'gcc Context<'gcc>) -> RValue<'gcc> {
+    let difference = a - b;
+    let masks = context.new_comparison(None, ComparisonOp::GreaterThanEquals, b, a);
+    difference & masks
 }

 impl<'a, 'gcc, 'tcx> StaticBuilderMethods for Builder<'a, 'gcc, 'tcx> {
--- a/src/common.rs
+++ b/src/common.rs
@ -117,8 +117,8 @@ impl<'gcc, 'tcx> ConstMethods<'tcx> for CodegenCx<'gcc, 'tcx> {
        unimplemented!();
    }

-    fn const_real(&self, _t: Type<'gcc>, _val: f64) -> RValue<'gcc> {
-        unimplemented!();
+    fn const_real(&self, typ: Type<'gcc>, val: f64) -> RValue<'gcc> {
+        self.context.new_rvalue_from_double(typ, val)
    }

    fn const_str(&self, s: Symbol) -> (RValue<'gcc>, RValue<'gcc>) {
--- a/src/consts.rs
+++ b/src/consts.rs
@ -27,12 +27,7 @@ impl<'gcc, 'tcx> CodegenCx<'gcc, 'tcx> {
        }
        // NOTE: since bitcast makes a value non-constant, don't bitcast if not necessary as some
        // SIMD builtins require a constant value.
-        if value.get_type() != typ {
-            self.context.new_bitcast(None, value, typ)
-        }
-        else {
-            value
-        }
+        self.bitcast_if_needed(value, typ)
    }
 }

@ -86,13 +81,7 @@ impl<'gcc, 'tcx> StaticMethods for CodegenCx<'gcc, 'tcx> {

        // TODO(antoyo): set alignment.

-        let value =
-            if value.get_type() != gcc_type {
-                self.context.new_bitcast(None, value, gcc_type)
-            }
-            else {
-                value
-            };
+        let value = self.bitcast_if_needed(value, gcc_type);
        global.global_set_initializer_rvalue(value);

        // As an optimization, all shared statics which do not have interior
--- a/src/context.rs
+++ b/src/context.rs
@ -35,6 +35,7 @@ pub struct CodegenCx<'gcc, 'tcx> {
    pub normal_function_addresses: RefCell<FxHashSet<RValue<'gcc>>>,

    pub functions: RefCell<FxHashMap<String, Function<'gcc>>>,
+    pub intrinsics: RefCell<FxHashMap<String, Function<'gcc>>>,

    pub tls_model: gccjit::TlsModel,

@ -184,6 +185,7 @@ impl<'gcc, 'tcx> CodegenCx<'gcc, 'tcx> {
            current_func: RefCell::new(None),
            normal_function_addresses: Default::default(),
            functions: RefCell::new(functions),
+            intrinsics: RefCell::new(FxHashMap::default()),

            tls_model,

@ -279,6 +281,15 @@ impl<'gcc, 'tcx> CodegenCx<'gcc, 'tcx> {
    pub fn sess(&self) -> &Session {
        &self.tcx.sess
    }
+
+    /// Returns `value` as is when it already has `expected_type`, otherwise a bitcast of `value`
+    /// to `expected_type`.
+    pub fn bitcast_if_needed(&self, value: RValue<'gcc>, expected_type: Type<'gcc>) -> RValue<'gcc> {
+        // Nothing to do when the types already agree: hand the value back untouched.
+        if value.get_type() == expected_type {
+            return value;
+        }
+        self.context.new_bitcast(None, value, expected_type)
+    }
 }

 impl<'gcc, 'tcx> BackendTypes for CodegenCx<'gcc, 'tcx> {
@ -306,8 +317,16 @@ impl<'gcc, 'tcx> MiscMethods<'tcx> for CodegenCx<'gcc, 'tcx> {
    }

    fn get_fn_addr(&self, instance: Instance<'tcx>) -> RValue<'gcc> {
-        let func = get_fn(self, instance);
-        let func = self.rvalue_as_function(func);
+        let func_name = self.tcx.symbol_name(instance).name;
+
+        let func =
+            if self.intrinsics.borrow().contains_key(func_name) {
+                self.intrinsics.borrow()[func_name].clone()
+            }
+            else {
+                let func = get_fn(self, instance);
+                self.rvalue_as_function(func)
+            };
        let ptr = func.get_address(None);

        // TODO(antoyo): don't do this twice: i.e. in declare_fn and here.
--- a/src/declare.rs
+++ b/src/declare.rs
@ -11,7 +11,7 @@ use crate::intrinsic::llvm;
 impl<'gcc, 'tcx> CodegenCx<'gcc, 'tcx> {
    pub fn get_or_insert_global(&self, name: &str, ty: Type<'gcc>, is_tls: bool, link_section: Option<Symbol>) -> LValue<'gcc> {
        if self.globals.borrow().contains_key(name) {
-            let typ = self.globals.borrow().get(name).expect("global").get_type();
+            let typ = self.globals.borrow()[name].get_type();
            let global = self.context.new_global(None, GlobalKind::Imported, typ, name);
            if is_tls {
                global.set_tls_model(self.tls_model);
@ -103,11 +103,13 @@ impl<'gcc, 'tcx> CodegenCx<'gcc, 'tcx> {
 /// update the declaration and return existing Value instead.
 fn declare_raw_fn<'gcc>(cx: &CodegenCx<'gcc, '_>, name: &str, _callconv: () /*llvm::CallConv*/, return_type: Type<'gcc>, param_types: &[Type<'gcc>], variadic: bool) -> Function<'gcc> {
    if name.starts_with("llvm.") {
-        return llvm::intrinsic(name, cx);
+        let intrinsic = llvm::intrinsic(name, cx);
+        cx.intrinsics.borrow_mut().insert(name.to_string(), intrinsic);
+        return intrinsic;
    }
    let func =
        if cx.functions.borrow().contains_key(name) {
-            *cx.functions.borrow().get(name).expect("function")
+            cx.functions.borrow()[name]
        }
        else {
            let params: Vec<_> = param_types.into_iter().enumerate()
--- a/src/intrinsic/llvm.rs
+++ b/src/intrinsic/llvm.rs
@ -1,6 +1,172 @@
-use gccjit::Function;
+use std::borrow::Cow;

-use crate::context::CodegenCx;
+use gccjit::{Function, FunctionPtrType, RValue, ToRValue};
+
+use crate::{context::CodegenCx, builder::Builder};
+
+/// Completes the argument list of a call to a GCC builtin that takes more parameters than the
+/// caller supplied.
+///
+/// Nothing is changed when `gcc_func` already has as many parameters as there are `args`, or
+/// when `func_name` is not one of the builtin names handled below. Otherwise the missing
+/// arguments are created here (an uninitialized local named `undefined_for_intrinsic`, a zero
+/// vector, or integer constants such as `-1`) and spliced in at the positions the builtin
+/// expects.
+///
+/// * `builder` - used to create the constants and the placeholder local.
+/// * `gcc_func` - function pointer type of the builtin; provides the parameter count and types.
+/// * `args` - arguments supplied by the caller.
+/// * `func_name` - name of the builtin, matched against the names below.
+///
+/// Returns the possibly extended argument list.
+///
+/// Panics if `args` is empty for one of the builtins whose last argument is popped, or if an
+/// expected vector parameter type is not a vector type.
+pub fn adjust_intrinsic_arguments<'a, 'b, 'gcc, 'tcx>(builder: &Builder<'a, 'gcc, 'tcx>, gcc_func: FunctionPtrType<'gcc>, mut args: Cow<'b, [RValue<'gcc>]>, func_name: &str) -> Cow<'b, [RValue<'gcc>]> {
+    // Some LLVM intrinsics do not map 1-to-1 to GCC intrinsics, so we add the missing
+    // arguments here.
+    if gcc_func.get_param_count() != args.len() {
+        match &*func_name {
+            "__builtin_ia32_pmuldq512_mask" | "__builtin_ia32_pmuludq512_mask"
+                // FIXME(antoyo): the following intrinsics have 4 (or 5) arguments according to the doc, but are defined with 2 (or 3) arguments in library/stdarch/crates/core_arch/src/x86/avx512f.rs.
+                | "__builtin_ia32_pmaxsd512_mask" | "__builtin_ia32_pmaxsq512_mask" | "__builtin_ia32_pmaxsq256_mask"
+                | "__builtin_ia32_pmaxsq128_mask" | "__builtin_ia32_maxps512_mask" | "__builtin_ia32_maxpd512_mask"
+                | "__builtin_ia32_pmaxud512_mask" | "__builtin_ia32_pmaxuq512_mask" | "__builtin_ia32_pmaxuq256_mask"
+                | "__builtin_ia32_pmaxuq128_mask"
+                | "__builtin_ia32_pminsd512_mask" | "__builtin_ia32_pminsq512_mask" | "__builtin_ia32_pminsq256_mask"
+                | "__builtin_ia32_pminsq128_mask" | "__builtin_ia32_minps512_mask" | "__builtin_ia32_minpd512_mask"
+                | "__builtin_ia32_pminud512_mask" | "__builtin_ia32_pminuq512_mask" | "__builtin_ia32_pminuq256_mask"
+                | "__builtin_ia32_pminuq128_mask" | "__builtin_ia32_sqrtps512_mask" | "__builtin_ia32_sqrtpd512_mask"
+                => {
+                    // TODO: refactor by separating those intrinsics outside of this branch.
+                    // For these builtins the two new arguments go before the last supplied
+                    // argument instead of at the end.
+                    let add_before_last_arg =
+                        match &*func_name {
+                            "__builtin_ia32_maxps512_mask" | "__builtin_ia32_maxpd512_mask"
+                                | "__builtin_ia32_minps512_mask" | "__builtin_ia32_minpd512_mask"
+                                | "__builtin_ia32_sqrtps512_mask" | "__builtin_ia32_sqrtpd512_mask" => true,
+                            _ => false,
+                        };
+                    // For these builtins the new vector argument is a zero vector rather than
+                    // an uninitialized local.
+                    let new_first_arg_is_zero =
+                        match &*func_name {
+                            "__builtin_ia32_pmaxuq256_mask" | "__builtin_ia32_pmaxuq128_mask"
+                                | "__builtin_ia32_pminuq256_mask" | "__builtin_ia32_pminuq128_mask" => true,
+                            _ => false
+                        };
+                    // The sqrt builtins take one vector operand fewer, so the parameters to fill
+                    // in sit one position earlier.
+                    let arg3_index =
+                        match &*func_name {
+                            "__builtin_ia32_sqrtps512_mask" | "__builtin_ia32_sqrtpd512_mask" => 1,
+                            _ => 2,
+                        };
+                    let mut new_args = args.to_vec();
+                    let arg3_type = gcc_func.get_param_type(arg3_index);
+                    let first_arg =
+                        if new_first_arg_is_zero {
+                            let vector_type = arg3_type.dyncast_vector().expect("vector type");
+                            let zero = builder.context.new_rvalue_zero(vector_type.get_element_type());
+                            let num_units = vector_type.get_num_units();
+                            builder.context.new_rvalue_from_vector(None, arg3_type, &vec![zero; num_units])
+                        }
+                        else {
+                            builder.current_func().new_local(None, arg3_type, "undefined_for_intrinsic").to_rvalue()
+                        };
+                    if add_before_last_arg {
+                        new_args.insert(new_args.len() - 1, first_arg);
+                    }
+                    else {
+                        new_args.push(first_arg);
+                    }
+                    let arg4_index =
+                        match &*func_name {
+                            "__builtin_ia32_sqrtps512_mask" | "__builtin_ia32_sqrtpd512_mask" => 2,
+                            _ => 3,
+                        };
+                    let arg4_type = gcc_func.get_param_type(arg4_index);
+                    // NOTE(review): -1 is presumably an all-lanes mask — confirm.
+                    let minus_one = builder.context.new_rvalue_from_int(arg4_type, -1);
+                    if add_before_last_arg {
+                        new_args.insert(new_args.len() - 1, minus_one);
+                    }
+                    else {
+                        new_args.push(minus_one);
+                    }
+                    args = new_args.into();
+                },
+                "__builtin_ia32_pternlogd512_mask" | "__builtin_ia32_pternlogd256_mask"
+                    | "__builtin_ia32_pternlogd128_mask" | "__builtin_ia32_pternlogq512_mask"
+                    | "__builtin_ia32_pternlogq256_mask" | "__builtin_ia32_pternlogq128_mask" => {
+                        // Append -1 as the fifth argument.
+                        let mut new_args = args.to_vec();
+                        let arg5_type = gcc_func.get_param_type(4);
+                        let minus_one = builder.context.new_rvalue_from_int(arg5_type, -1);
+                        new_args.push(minus_one);
+                        args = new_args.into();
+                    },
+                    "__builtin_ia32_vfmaddps512_mask" | "__builtin_ia32_vfmaddpd512_mask" => {
+                        let mut new_args = args.to_vec();
+
+                        // With 4 supplied arguments, the last one is set aside and pushed back
+                        // after the new -1 argument.
+                        let mut last_arg = None;
+                        if args.len() == 4 {
+                            last_arg = new_args.pop();
+                        }
+
+                        let arg4_type = gcc_func.get_param_type(3);
+                        let minus_one = builder.context.new_rvalue_from_int(arg4_type, -1);
+                        new_args.push(minus_one);
+
+                        if args.len() == 3 {
+                            // Both llvm.fma.v16f32 and llvm.x86.avx512.vfmadd.ps.512 map to
+                            // the same GCC intrinsic, but the former has 3 parameters and the
+                            // latter has 4 so it doesn't require this additional argument.
+                            let arg5_type = gcc_func.get_param_type(4);
+                            new_args.push(builder.context.new_rvalue_from_int(arg5_type, 4));
+                        }
+
+                        if let Some(last_arg) = last_arg {
+                            new_args.push(last_arg);
+                        }
+
+                        args = new_args.into();
+                    },
+                    "__builtin_ia32_addps512_mask" | "__builtin_ia32_addpd512_mask"
+                        | "__builtin_ia32_subps512_mask" | "__builtin_ia32_subpd512_mask"
+                        | "__builtin_ia32_mulps512_mask" | "__builtin_ia32_mulpd512_mask"
+                        | "__builtin_ia32_divps512_mask" | "__builtin_ia32_divpd512_mask" => {
+                        // Insert an uninitialized vector and -1 before the last supplied argument.
+                        let mut new_args = args.to_vec();
+                        let last_arg = new_args.pop().expect("last arg");
+                        let arg3_type = gcc_func.get_param_type(2);
+                        let undefined = builder.current_func().new_local(None, arg3_type, "undefined_for_intrinsic").to_rvalue();
+                        new_args.push(undefined);
+                        let arg4_type = gcc_func.get_param_type(3);
+                        let minus_one = builder.context.new_rvalue_from_int(arg4_type, -1);
+                        new_args.push(minus_one);
+                        new_args.push(last_arg);
+                        args = new_args.into();
+                    },
+                    "__builtin_ia32_vfmaddsubps512_mask" | "__builtin_ia32_vfmaddsubpd512_mask" => {
+                        // Insert -1 before the last supplied argument.
+                        let mut new_args = args.to_vec();
+                        let last_arg = new_args.pop().expect("last arg");
+                        let arg4_type = gcc_func.get_param_type(3);
+                        let minus_one = builder.context.new_rvalue_from_int(arg4_type, -1);
+                        new_args.push(minus_one);
+                        new_args.push(last_arg);
+                        args = new_args.into();
+                    },
+                    _ => (),
+        }
+    }
+
+    args
+}
+
+/// Tells whether the type cast of the argument at position `index` must be skipped for a call
+/// to the builtin `func_name` that is given `args_len` arguments.
+///
+/// Returns `true` only for the last argument of the builtins listed below; for the two
+/// `vfmadd` builtins this additionally requires exactly 4 arguments. Every other combination,
+/// including an empty argument list, returns `false`.
+pub fn ignore_arg_cast(func_name: &str, index: usize, args_len: usize) -> bool {
+    // NOTE: these intrinsics have missing parameters before the last one, so ignore the
+    // last argument type check.
+    // FIXME(antoyo): find a way to refactor in order to avoid this hack.
+    // `checked_sub` keeps an empty argument list from underflowing `args_len - 1`.
+    let is_last_arg = args_len.checked_sub(1) == Some(index);
+    match func_name {
+        "__builtin_ia32_maxps512_mask" | "__builtin_ia32_maxpd512_mask"
+            | "__builtin_ia32_minps512_mask" | "__builtin_ia32_minpd512_mask" | "__builtin_ia32_sqrtps512_mask"
+            | "__builtin_ia32_sqrtpd512_mask" | "__builtin_ia32_addps512_mask" | "__builtin_ia32_addpd512_mask"
+            | "__builtin_ia32_subps512_mask" | "__builtin_ia32_subpd512_mask"
+            | "__builtin_ia32_mulps512_mask" | "__builtin_ia32_mulpd512_mask"
+            | "__builtin_ia32_divps512_mask" | "__builtin_ia32_divpd512_mask"
+            | "__builtin_ia32_vfmaddsubps512_mask" | "__builtin_ia32_vfmaddsubpd512_mask" => is_last_arg,
+        // Since there are two LLVM intrinsics that map to each of these GCC builtins and only
+        // one of them has a missing parameter before the last one, we check the number of
+        // arguments to distinguish those cases.
+        "__builtin_ia32_vfmaddps512_mask" | "__builtin_ia32_vfmaddpd512_mask" => args_len == 4 && is_last_arg,
+        _ => false,
+    }
+}

 #[cfg(not(feature="master"))]
 pub fn intrinsic<'gcc, 'tcx>(name: &str, cx: &CodegenCx<'gcc, 'tcx>) -> Function<'gcc> {
@ -21,6 +187,59 @@ pub fn intrinsic<'gcc, 'tcx>(name: &str, cx: &CodegenCx<'gcc, 'tcx>) -> Function
        "llvm.x86.xgetbv" => "__builtin_ia32_xgetbv",
        // NOTE: this doc specifies the equivalent GCC builtins: http://huonw.github.io/llvmint/llvmint/x86/index.html
        "llvm.sqrt.v2f64" => "__builtin_ia32_sqrtpd",
+        "llvm.x86.avx512.pmul.dq.512" => "__builtin_ia32_pmuldq512_mask",
+        "llvm.x86.avx512.pmulu.dq.512" => "__builtin_ia32_pmuludq512_mask",
+        "llvm.x86.avx512.mask.pmaxs.q.256" => "__builtin_ia32_pmaxsq256_mask",
+        "llvm.x86.avx512.mask.pmaxs.q.128" => "__builtin_ia32_pmaxsq128_mask",
+        "llvm.x86.avx512.max.ps.512" => "__builtin_ia32_maxps512_mask",
+        "llvm.x86.avx512.max.pd.512" => "__builtin_ia32_maxpd512_mask",
+        "llvm.x86.avx512.mask.pmaxu.q.256" => "__builtin_ia32_pmaxuq256_mask",
+        "llvm.x86.avx512.mask.pmaxu.q.128" => "__builtin_ia32_pmaxuq128_mask",
+        "llvm.x86.avx512.mask.pmins.q.256" => "__builtin_ia32_pminsq256_mask",
+        "llvm.x86.avx512.mask.pmins.q.128" => "__builtin_ia32_pminsq128_mask",
+        "llvm.x86.avx512.min.ps.512" => "__builtin_ia32_minps512_mask",
+        "llvm.x86.avx512.min.pd.512" => "__builtin_ia32_minpd512_mask",
+        "llvm.x86.avx512.mask.pminu.q.256" => "__builtin_ia32_pminuq256_mask",
+        "llvm.x86.avx512.mask.pminu.q.128" => "__builtin_ia32_pminuq128_mask",
+        "llvm.fma.v16f32" => "__builtin_ia32_vfmaddps512_mask",
+        "llvm.fma.v8f64" => "__builtin_ia32_vfmaddpd512_mask",
+        "llvm.x86.avx512.vfmaddsub.ps.512" => "__builtin_ia32_vfmaddsubps512_mask",
+        "llvm.x86.avx512.vfmaddsub.pd.512" => "__builtin_ia32_vfmaddsubpd512_mask",
+        "llvm.x86.avx512.pternlog.d.512" => "__builtin_ia32_pternlogd512_mask",
+        "llvm.x86.avx512.pternlog.d.256" => "__builtin_ia32_pternlogd256_mask",
+        "llvm.x86.avx512.pternlog.d.128" => "__builtin_ia32_pternlogd128_mask",
+        "llvm.x86.avx512.pternlog.q.512" => "__builtin_ia32_pternlogq512_mask",
+        "llvm.x86.avx512.pternlog.q.256" => "__builtin_ia32_pternlogq256_mask",
+        "llvm.x86.avx512.pternlog.q.128" => "__builtin_ia32_pternlogq128_mask",
+        "llvm.x86.avx512.add.ps.512" => "__builtin_ia32_addps512_mask",
+        "llvm.x86.avx512.add.pd.512" => "__builtin_ia32_addpd512_mask",
+        "llvm.x86.avx512.sub.ps.512" => "__builtin_ia32_subps512_mask",
+        "llvm.x86.avx512.sub.pd.512" => "__builtin_ia32_subpd512_mask",
+        "llvm.x86.avx512.mul.ps.512" => "__builtin_ia32_mulps512_mask",
+        "llvm.x86.avx512.mul.pd.512" => "__builtin_ia32_mulpd512_mask",
+        "llvm.x86.avx512.div.ps.512" => "__builtin_ia32_divps512_mask",
+        "llvm.x86.avx512.div.pd.512" => "__builtin_ia32_divpd512_mask",
+        "llvm.x86.avx512.vfmadd.ps.512" => "__builtin_ia32_vfmaddps512_mask",
+        "llvm.x86.avx512.vfmadd.pd.512" => "__builtin_ia32_vfmaddpd512_mask",
+
+        // The above doc points to unknown builtins for the following, so override them:
+        "llvm.x86.avx2.gather.d.d" => "__builtin_ia32_gathersiv4si",
+        "llvm.x86.avx2.gather.d.d.256" => "__builtin_ia32_gathersiv8si",
+        "llvm.x86.avx2.gather.d.ps" => "__builtin_ia32_gathersiv4sf",
+        "llvm.x86.avx2.gather.d.ps.256" => "__builtin_ia32_gathersiv8sf",
+        "llvm.x86.avx2.gather.d.q" => "__builtin_ia32_gathersiv2di",
+        "llvm.x86.avx2.gather.d.q.256" => "__builtin_ia32_gathersiv4di",
+        "llvm.x86.avx2.gather.d.pd" => "__builtin_ia32_gathersiv2df",
+        "llvm.x86.avx2.gather.d.pd.256" => "__builtin_ia32_gathersiv4df",
+        "llvm.x86.avx2.gather.q.d" => "__builtin_ia32_gatherdiv4si",
+        "llvm.x86.avx2.gather.q.d.256" => "__builtin_ia32_gatherdiv4si256",
+        "llvm.x86.avx2.gather.q.ps" => "__builtin_ia32_gatherdiv4sf",
+        "llvm.x86.avx2.gather.q.ps.256" => "__builtin_ia32_gatherdiv4sf256",
+        "llvm.x86.avx2.gather.q.q" => "__builtin_ia32_gatherdiv2di",
+        "llvm.x86.avx2.gather.q.q.256" => "__builtin_ia32_gatherdiv4di",
+        "llvm.x86.avx2.gather.q.pd" => "__builtin_ia32_gatherdiv2df",
+        "llvm.x86.avx2.gather.q.pd.256" => "__builtin_ia32_gatherdiv4df",
+        "" => "",
        // NOTE: this file is generated by https://github.com/GuillaumeGomez/llvmint/blob/master/generate_list.py
        _ => include!("archs.rs"),
    };
--- a/src/intrinsic/simd.rs
+++ b/src/intrinsic/simd.rs
@ -1,15 +1,17 @@
 use std::cmp::Ordering;

-use gccjit::{RValue, Type, ToRValue};
+use gccjit::{BinaryOp, RValue, Type, ToRValue};
 use rustc_codegen_ssa::base::compare_simd_types;
 use rustc_codegen_ssa::common::{TypeKind, span_invalid_monomorphization_error};
 use rustc_codegen_ssa::mir::operand::OperandRef;
+use rustc_codegen_ssa::mir::place::PlaceRef;
 use rustc_codegen_ssa::traits::{BaseTypeMethods, BuilderMethods};
 use rustc_hir as hir;
 use rustc_middle::span_bug;
 use rustc_middle::ty::layout::HasTyCtxt;
 use rustc_middle::ty::{self, Ty};
 use rustc_span::{Span, Symbol, sym};
+use rustc_target::abi::Align;

 use crate::builder::Builder;
 use crate::intrinsic;
@ -55,7 +57,53 @@ pub fn generic_simd_intrinsic<'a, 'gcc, 'tcx>(bx: &mut Builder<'a, 'gcc, 'tcx>,
    let sig =
        tcx.normalize_erasing_late_bound_regions(ty::ParamEnv::reveal_all(), callee_ty.fn_sig(tcx));
    let arg_tys = sig.inputs();
-    let name_str = name.as_str();
+
+    if name == sym::simd_select_bitmask {
+        require_simd!(arg_tys[1], "argument");
+        let (len, _) = arg_tys[1].simd_size_and_type(bx.tcx());
+
+        let expected_int_bits = (len.max(8) - 1).next_power_of_two();
+        let expected_bytes = len / 8 + ((len % 8 > 0) as u64);
+
+        let mask_ty = arg_tys[0];
+        let mut mask = match mask_ty.kind() {
+            ty::Int(i) if i.bit_width() == Some(expected_int_bits) => args[0].immediate(),
+            ty::Uint(i) if i.bit_width() == Some(expected_int_bits) => args[0].immediate(),
+            ty::Array(elem, len)
+                if matches!(elem.kind(), ty::Uint(ty::UintTy::U8))
+                    && len.try_eval_usize(bx.tcx, ty::ParamEnv::reveal_all())
+                        == Some(expected_bytes) =>
+            {
+                let place = PlaceRef::alloca(bx, args[0].layout);
+                args[0].val.store(bx, place);
+                let int_ty = bx.type_ix(expected_bytes * 8);
+                let ptr = bx.pointercast(place.llval, bx.cx.type_ptr_to(int_ty));
+                bx.load(int_ty, ptr, Align::ONE)
+            }
+            _ => return_error!(
+                "invalid bitmask `{}`, expected `u{}` or `[u8; {}]`",
+                mask_ty,
+                expected_int_bits,
+                expected_bytes
+            ),
+        };
+
+        let arg1 = args[1].immediate();
+        let arg1_type = arg1.get_type();
+        let arg1_vector_type = arg1_type.unqualified().dyncast_vector().expect("vector type");
+        let arg1_element_type = arg1_vector_type.get_element_type();
+
+        let mut elements = vec![];
+        let one = bx.context.new_rvalue_one(mask.get_type());
+        for _ in 0..len {
+            let element = bx.context.new_cast(None, mask & one, arg1_element_type);
+            elements.push(element);
+            mask = mask >> one;
+        }
+        let vector_mask = bx.context.new_rvalue_from_vector(None, arg1_type, &elements);
+
+        return Ok(bx.vector_select(vector_mask, arg1, args[2].immediate()));
+    }

    // every intrinsic below takes a SIMD vector as its first argument
    require_simd!(arg_tys[0], "input");
@ -102,7 +150,7 @@ pub fn generic_simd_intrinsic<'a, 'gcc, 'tcx>(bx: &mut Builder<'a, 'gcc, 'tcx>,
        ));
    }

-    if let Some(stripped) = name_str.strip_prefix("simd_shuffle") {
+    if let Some(stripped) = name.as_str().strip_prefix("simd_shuffle") {
        let n: u64 =
            if stripped.is_empty() {
                // Make sure this is actually an array, since typeck only checks the length-suffixed
@ -172,6 +220,7 @@ pub fn generic_simd_intrinsic<'a, 'gcc, 'tcx>(bx: &mut Builder<'a, 'gcc, 'tcx>,
        // NOTE: we cannot cast to an array and assign to its element here because the value might
        // not be an l-value. So, call a builtin to set the element.
        // TODO(antoyo): perhaps we could create a new vector or maybe there's a GIMPLE instruction for that?
+        // TODO(antoyo): don't use target specific builtins here.
        let func_name =
            match in_len {
                2 => {
@ -202,14 +251,8 @@ pub fn generic_simd_intrinsic<'a, 'gcc, 'tcx>(bx: &mut Builder<'a, 'gcc, 'tcx>,
            };
        let builtin = bx.context.get_target_builtin_function(func_name);
        let param1_type = builtin.get_param(0).to_rvalue().get_type();
-        let vector =
-            if vector.get_type() != param1_type {
-                // TODO(antoyo): perhaps use __builtin_convertvector for vector casting.
-                bx.context.new_bitcast(None, vector, param1_type)
-            }
-            else {
-                vector
-            };
+        // TODO(antoyo): perhaps use __builtin_convertvector for vector casting.
+        let vector = bx.cx.bitcast_if_needed(vector, param1_type);
        let result = bx.context.new_call(None, builtin, &[vector, value, bx.context.new_cast(None, index, bx.int_type)]);
        // TODO(antoyo): perhaps use __builtin_convertvector for vector casting.
        return Ok(bx.context.new_bitcast(None, result, vector.get_type()));
@ -228,6 +271,24 @@ pub fn generic_simd_intrinsic<'a, 'gcc, 'tcx>(bx: &mut Builder<'a, 'gcc, 'tcx>,
        return Ok(bx.context.new_vector_access(None, vector, args[1].immediate()).to_rvalue());
    }

+    if name == sym::simd_select {
+        let m_elem_ty = in_elem;
+        let m_len = in_len;
+        require_simd!(arg_tys[1], "argument");
+        let (v_len, _) = arg_tys[1].simd_size_and_type(bx.tcx());
+        require!(
+            m_len == v_len,
+            "mismatched lengths: mask length `{}` != other vector length `{}`",
+            m_len,
+            v_len
+        );
+        match m_elem_ty.kind() {
+            ty::Int(_) => {}
+            _ => return_error!("mask element type is `{}`, expected `i_`", m_elem_ty),
+        }
+        return Ok(bx.vector_select(args[0].immediate(), args[1].immediate(), args[2].immediate()));
+    }
+
    if name == sym::simd_cast {
        require_simd!(ret_ty, "return");
        let (out_len, out_elem) = ret_ty.simd_size_and_type(bx.tcx());
@ -336,6 +397,10 @@ pub fn generic_simd_intrinsic<'a, 'gcc, 'tcx>(bx: &mut Builder<'a, 'gcc, 'tcx>,
                });
            }
            (Style::Int(_), Style::Float) => {
+                // TODO: add support for internal functions in libgccjit to get access to IFN_VEC_CONVERT, which
+                // behaves like __builtin_convertvector?
+                // Or maybe provide convert_vector as an API, since it might not be easy to get the
+                // types of internal functions.
                unimplemented!();
            }
            (Style::Float, Style::Int(_)) => {
@ -539,10 +604,150 @@ pub fn generic_simd_intrinsic<'a, 'gcc, 'tcx>(bx: &mut Builder<'a, 'gcc, 'tcx>,
        let vec_ty = bx.cx.type_vector(elem_ty, in_len as u64);

        let func = bx.context.get_target_builtin_function(builtin_name);
+        let param1_type = func.get_param(0).to_rvalue().get_type();
+        let param2_type = func.get_param(1).to_rvalue().get_type();
+        let lhs = bx.cx.bitcast_if_needed(lhs, param1_type);
+        let rhs = bx.cx.bitcast_if_needed(rhs, param2_type);
        let result = bx.context.new_call(None, func, &[lhs, rhs]);
        // TODO(antoyo): perhaps use __builtin_convertvector for vector casting.
        return Ok(bx.context.new_bitcast(None, result, vec_ty));
    }

+    macro_rules! arith_red { // Expands to the handler for one arithmetic reduction intrinsic.
+        ($name:ident : $vec_op:expr, $float_reduce:ident, $ordered:expr, $op:ident,
+         $identity:expr) => { // NOTE: `$identity` is accepted but not used by the expansion yet.
+            if name == sym::$name { // Only acts when the current intrinsic is `$name`.
+                require!( // The return type must be exactly the input's element type.
+                    ret_ty == in_elem,
+                    "expected return type `{}` (element of input `{}`), found `{}`",
+                    in_elem,
+                    in_ty,
+                    ret_ty
+                );
+                return match in_elem.kind() {
+                    ty::Int(_) | ty::Uint(_) => {
+                        let r = bx.vector_reduce_op(args[0].immediate(), $vec_op); // Reduce all lanes with `$vec_op`.
+                        if $ordered {
+                            // Ordered: fold the accumulator `args[1]` into the reduction with `$op`.
+                            // If overflow occurs, the result is the mathematical result modulo 2^n:
+                            Ok(bx.$op(args[1].immediate(), r))
+                        }
+                        else {
+                            Ok(r) // Unordered: the lane-wise reduction itself is the result.
+                        }
+                    }
+                    ty::Float(_) => {
+                        if $ordered {
+                            // ordered arithmetic reductions take an accumulator
+                            let acc = args[1].immediate();
+                            Ok(bx.$float_reduce(acc, args[0].immediate()))
+                        }
+                        else {
+                            Ok(bx.vector_reduce_op(args[0].immediate(), $vec_op)) // Unordered: no accumulator argument is read.
+                        }
+                    }
+                    _ => return_error!( // Any other element kind is reported through `return_error!`.
+                        "unsupported {} from `{}` with element `{}` to `{}`",
+                        sym::$name,
+                        in_ty,
+                        in_elem,
+                        ret_ty
+                    ),
+                };
+            }
+        };
+    }
+
+    arith_red!(
+        simd_reduce_add_unordered: BinaryOp::Plus,
+        vector_reduce_fadd_fast,
+        false,
+        add,
+        0.0 // TODO: Use this argument.
+    );
+    arith_red!(
+        simd_reduce_mul_unordered: BinaryOp::Mult,
+        vector_reduce_fmul_fast,
+        false,
+        mul,
+        1.0
+    );
+
+    macro_rules! minmax_red { // Expands to the handler for one min/max reduction intrinsic.
+        ($name:ident: $reduction:ident) => { // `$name`: member of `sym` to match; `$reduction`: `bx` method applied to the input vector.
+            if name == sym::$name { // Only acts when the current intrinsic is `$name`; otherwise nothing happens here.
+                require!( // The return type must be exactly the input's element type.
+                    ret_ty == in_elem,
+                    "expected return type `{}` (element of input `{}`), found `{}`",
+                    in_elem,
+                    in_ty,
+                    ret_ty
+                );
+                return match in_elem.kind() { // Leaves the enclosing function with the reduction or an error.
+                    ty::Int(_) | ty::Uint(_) | ty::Float(_) => Ok(bx.$reduction(args[0].immediate())),
+                    _ => return_error!( // Element kinds other than integers and floats are reported through `return_error!`.
+                        "unsupported {} from `{}` with element `{}` to `{}`",
+                        sym::$name,
+                        in_ty,
+                        in_elem,
+                        ret_ty
+                    ),
+                };
+            }
+        };
+    }
+
+    minmax_red!(simd_reduce_min: vector_reduce_min);
+    minmax_red!(simd_reduce_max: vector_reduce_max);
+
+    macro_rules! bitwise_red { // Expands to the handler for one bitwise reduction intrinsic.
+        ($name:ident : $op:expr, $boolean:expr) => { // `$op`: operator given to `vector_reduce_op`; `$boolean`: selects the i1-based path.
+            if name == sym::$name { // Only acts when the current intrinsic is `$name`.
+                let input = if !$boolean { // Non-boolean form: check the return type and reduce the input vector as-is.
+                    require!(
+                        ret_ty == in_elem,
+                        "expected return type `{}` (element of input `{}`), found `{}`",
+                        in_elem,
+                        in_ty,
+                        ret_ty
+                    );
+                    args[0].immediate()
+                } else { // Boolean form: validate the element kind, then narrow every lane.
+                    match in_elem.kind() {
+                        ty::Int(_) | ty::Uint(_) => {}
+                        _ => return_error!(
+                            "unsupported {} from `{}` with element `{}` to `{}`",
+                            sym::$name,
+                            in_ty,
+                            in_elem,
+                            ret_ty
+                        ),
+                    }
+
+                    // Boolean reductions operate on vectors of i1, so truncate the input to an i1 vector of the same length:
+                    let i1 = bx.type_i1();
+                    let i1xn = bx.type_vector(i1, in_len as u64);
+                    bx.trunc(args[0].immediate(), i1xn)
+                };
+                return match in_elem.kind() { // Only integer element kinds are reduced; anything else is an error.
+                    ty::Int(_) | ty::Uint(_) => {
+                        let r = bx.vector_reduce_op(input, $op);
+                        Ok(if !$boolean { r } else { bx.zext(r, bx.type_bool()) }) // Boolean results are zero-extended to `type_bool()`.
+                    }
+                    _ => return_error!(
+                        "unsupported {} from `{}` with element `{}` to `{}`",
+                        sym::$name,
+                        in_ty,
+                        in_elem,
+                        ret_ty
+                    ),
+                };
+            }
+        };
+    }
+
+    bitwise_red!(simd_reduce_and: BinaryOp::BitwiseAnd, false);
+    bitwise_red!(simd_reduce_or: BinaryOp::BitwiseOr, false);
+
    unimplemented!("simd {}", name);
 }
--- a/src/type_.rs
+++ b/src/type_.rs
@ -247,6 +247,10 @@ impl<'gcc, 'tcx> CodegenCx<'gcc, 'tcx> {

        self.context.new_array_type(None, ty, len)
    }
+
+    pub fn type_bool(&self) -> Type<'gcc> { // Returns the backend type for `bool`, as produced by the context's `new_type::<bool>()`.
+        self.context.new_type::<bool>()
+    }
 }

 pub fn struct_fields<'gcc, 'tcx>(cx: &CodegenCx<'gcc, 'tcx>, layout: TyAndLayout<'tcx>) -> (Vec<Type<'gcc>>, bool) {