diff --git a/src/asm.rs b/src/asm.rs
index 1cae78114c9d..738c990fa827 100644
--- a/src/asm.rs
+++ b/src/asm.rs
@@ -595,7 +595,7 @@ fn reg_to_gcc(reg: InlineAsmRegOrRegClass) -> ConstraintOrRegister {
         InlineAsmRegClass::X86(X86InlineAsmRegClass::xmm_reg)
         | InlineAsmRegClass::X86(X86InlineAsmRegClass::ymm_reg) => "x",
         InlineAsmRegClass::X86(X86InlineAsmRegClass::zmm_reg) => "v",
-        InlineAsmRegClass::X86(X86InlineAsmRegClass::kreg) => unimplemented!(),
+        InlineAsmRegClass::X86(X86InlineAsmRegClass::kreg) => "Yk",
         InlineAsmRegClass::Wasm(WasmInlineAsmRegClass::local) => unimplemented!(),
         InlineAsmRegClass::X86(
             X86InlineAsmRegClass::x87_reg | X86InlineAsmRegClass::mmx_reg,
diff --git a/src/builder.rs b/src/builder.rs
index 9a5cf785a1f5..8fa78c7e189b 100644
--- a/src/builder.rs
+++ b/src/builder.rs
@@ -7,6 +7,7 @@ use gccjit::{
     BinaryOp,
     Block,
     ComparisonOp,
+    Context,
     Function,
     LValue,
     RValue,
@@ -47,6 +48,7 @@ use rustc_target::spec::{HasTargetSpec, Target};
 
 use crate::common::{SignType, TypeReflection, type_is_pointer};
 use crate::context::CodegenCx;
+use crate::intrinsic::llvm;
 use crate::type_of::LayoutGccExt;
 
 // TODO(antoyo)
@@ -216,11 +218,17 @@ impl<'a, 'gcc, 'tcx> Builder<'a, 'gcc, 'tcx> {
             return Cow::Borrowed(args);
         }
 
+        let func_name = format!("{:?}", func_ptr);
+
         let casted_args: Vec<_> = param_types
             .into_iter()
             .zip(args.iter())
             .enumerate()
             .map(|(index, (expected_ty, &actual_val))| {
+                if llvm::ignore_arg_cast(&func_name, index, args.len()) {
+                    return actual_val;
+                }
+
                 let actual_ty = actual_val.get_type();
                 if expected_ty != actual_ty {
                     if !actual_ty.is_vector() && !expected_ty.is_vector() && actual_ty.is_integral() && expected_ty.is_integral() && actual_ty.get_size() != expected_ty.get_size() {
@@ -297,6 +305,8 @@ impl<'a, 'gcc, 'tcx> Builder<'a, 'gcc, 'tcx> {
         if return_type != void_type {
             unsafe { RETURN_VALUE_COUNT += 1 };
             let result = current_func.new_local(None, return_type, &format!("ptrReturnValue{}", unsafe { RETURN_VALUE_COUNT }));
+            let func_name = format!("{:?}", func_ptr);
+            let args = llvm::adjust_intrinsic_arguments(&self, gcc_func, args, &func_name);
             self.block.add_assignment(None, result, self.cx.context.new_call_through_ptr(None, func_ptr, &args));
             result.to_rvalue()
         }
@@ -1316,6 +1326,11 @@ impl<'a, 'gcc, 'tcx> Builder<'a, 'gcc, 'tcx> {
                 element_type
             }
             else {
+                #[cfg(feature="master")]
+                {
+                    self.cx.type_ix(element_type.get_size() as u64 * 8)
+                }
+                #[cfg(not(feature="master"))]
                 self.int_type
             };
         for i in 0..mask_num_units {
@@ -1343,7 +1358,7 @@ impl<'a, 'gcc, 'tcx> Builder<'a, 'gcc, 'tcx> {
         }
 
         // TODO(antoyo): switch to using new_vector_access.
         let array = self.context.new_bitcast(None, v2, array_type);
-        for i in 0..vec_num_units {
+        for i in 0..(mask_num_units - vec_num_units) {
             elements.push(self.context.new_array_access(None, array, self.context.new_rvalue_from_int(self.int_type, i as i32)).to_rvalue());
         }
         let v1 = self.context.new_rvalue_from_vector(None, result_type, &elements);
@@ -1380,6 +1395,98 @@ impl<'a, 'gcc, 'tcx> Builder<'a, 'gcc, 'tcx> {
     pub fn shuffle_vector(&mut self, _v1: RValue<'gcc>, _v2: RValue<'gcc>, _mask: RValue<'gcc>) -> RValue<'gcc> {
         unimplemented!();
     }
+
+    #[cfg(feature="master")]
+    pub fn vector_reduce<F>(&mut self, src: RValue<'gcc>, op: F) -> RValue<'gcc>
+    where F: Fn(RValue<'gcc>, RValue<'gcc>, &'gcc Context<'gcc>) -> RValue<'gcc>
+    {
+        let vector_type = src.get_type().unqualified().dyncast_vector().expect("vector type");
+        let element_count = vector_type.get_num_units();
+        let mut vector_elements = vec![];
+        for i in 0..element_count {
+            vector_elements.push(i);
+        }
+        let mask_type = self.context.new_vector_type(self.int_type, element_count as u64);
+        let mut shift = 1;
+        let mut res = src;
+        while shift < element_count {
+            let vector_elements: Vec<_> =
+                vector_elements.iter()
+                    .map(|i| self.context.new_rvalue_from_int(self.int_type, ((i + shift) % element_count) as i32))
+                    .collect();
+            let mask = self.context.new_rvalue_from_vector(None, mask_type, &vector_elements);
+            let shifted = self.context.new_rvalue_vector_perm(None, res, res, mask);
+            shift *= 2;
+            res = op(res, shifted, &self.context);
+        }
+        self.context.new_vector_access(None, res, self.context.new_rvalue_zero(self.int_type))
+            .to_rvalue()
+    }
+
+    #[cfg(not(feature="master"))]
+    pub fn vector_reduce<F>(&mut self, src: RValue<'gcc>, op: F) -> RValue<'gcc>
+    where F: Fn(RValue<'gcc>, RValue<'gcc>, &'gcc Context<'gcc>) -> RValue<'gcc>
+    {
+        unimplemented!();
+    }
+
+    pub fn vector_reduce_op(&mut self, src: RValue<'gcc>, op: BinaryOp) -> RValue<'gcc> {
+        self.vector_reduce(src, |a, b, context| context.new_binary_op(None, op, a.get_type(), a, b))
+    }
+
+    pub fn vector_reduce_fadd_fast(&mut self, _acc: RValue<'gcc>, _src: RValue<'gcc>) -> RValue<'gcc> {
+        unimplemented!();
+    }
+
+    pub fn vector_reduce_fmul_fast(&mut self, _acc: RValue<'gcc>, _src: RValue<'gcc>) -> RValue<'gcc> {
+        unimplemented!();
+    }
+
+    // Inspired by Hacker's Delight min implementation.
+    pub fn vector_reduce_min(&mut self, src: RValue<'gcc>) -> RValue<'gcc> {
+        self.vector_reduce(src, |a, b, context| {
+            let differences_or_zeros = difference_or_zero(a, b, context);
+            context.new_binary_op(None, BinaryOp::Minus, a.get_type(), a, differences_or_zeros)
+        })
+    }
+
+    // Inspired by Hacker's Delight max implementation.
+    pub fn vector_reduce_max(&mut self, src: RValue<'gcc>) -> RValue<'gcc> {
+        self.vector_reduce(src, |a, b, context| {
+            let differences_or_zeros = difference_or_zero(a, b, context);
+            context.new_binary_op(None, BinaryOp::Plus, b.get_type(), b, differences_or_zeros)
+        })
+    }
+
+    pub fn vector_select(&mut self, cond: RValue<'gcc>, then_val: RValue<'gcc>, else_val: RValue<'gcc>) -> RValue<'gcc> {
+        // cond is a vector of integers, not of bools.
+        let cond_type = cond.get_type();
+        let vector_type = cond_type.unqualified().dyncast_vector().expect("vector type");
+        let num_units = vector_type.get_num_units();
+        let element_type = vector_type.get_element_type();
+        let zeros = vec![self.context.new_rvalue_zero(element_type); num_units];
+        let zeros = self.context.new_rvalue_from_vector(None, cond_type, &zeros);
+
+        let masks = self.context.new_comparison(None, ComparisonOp::NotEquals, cond, zeros);
+        let then_vals = masks & then_val;
+
+        let ones = vec![self.context.new_rvalue_one(element_type); num_units];
+        let ones = self.context.new_rvalue_from_vector(None, cond_type, &ones);
+        let inverted_masks = masks + ones;
+        // NOTE: sometimes, the type of else_val can be different than the type of then_val in
+        // libgccjit (vector of int vs vector of int32_t), but they should be the same for the AND
+        // operation to work.
+        let else_val = self.context.new_bitcast(None, else_val, then_val.get_type());
+        let else_vals = inverted_masks & else_val;
+
+        then_vals | else_vals
+    }
+}
+
+fn difference_or_zero<'gcc>(a: RValue<'gcc>, b: RValue<'gcc>, context: &'gcc Context<'gcc>) -> RValue<'gcc> {
+    let difference = a - b;
+    let masks = context.new_comparison(None, ComparisonOp::GreaterThanEquals, b, a);
+    difference & masks
 }
 
 impl<'a, 'gcc, 'tcx> StaticBuilderMethods for Builder<'a, 'gcc, 'tcx> {
diff --git a/src/common.rs b/src/common.rs
index 703e20947fe8..e4a08da446bb 100644
--- a/src/common.rs
+++ b/src/common.rs
@@ -117,8 +117,8 @@ impl<'gcc, 'tcx> ConstMethods<'tcx> for CodegenCx<'gcc, 'tcx> {
         unimplemented!();
     }
 
-    fn const_real(&self, _t: Type<'gcc>, _val: f64) -> RValue<'gcc> {
-        unimplemented!();
+    fn const_real(&self, typ: Type<'gcc>, val: f64) -> RValue<'gcc> {
+        self.context.new_rvalue_from_double(typ, val)
     }
 
     fn const_str(&self, s: Symbol) -> (RValue<'gcc>, RValue<'gcc>) {
diff --git a/src/consts.rs b/src/consts.rs
index 4350c00e94a7..4b517fd85f05 100644
--- a/src/consts.rs
+++ b/src/consts.rs
@@ -27,12 +27,7 @@ impl<'gcc, 'tcx> CodegenCx<'gcc, 'tcx> {
         }
         // NOTE: since bitcast makes a value non-constant, don't bitcast if not necessary as some
         // SIMD builtins require a constant value.
-        if value.get_type() != typ {
-            self.context.new_bitcast(None, value, typ)
-        }
-        else {
-            value
-        }
+        self.bitcast_if_needed(value, typ)
     }
 }
 
@@ -86,13 +81,7 @@ impl<'gcc, 'tcx> StaticMethods for CodegenCx<'gcc, 'tcx> {
         // TODO(antoyo): set alignment.
-        let value =
-            if value.get_type() != gcc_type {
-                self.context.new_bitcast(None, value, gcc_type)
-            }
-            else {
-                value
-            };
+        let value = self.bitcast_if_needed(value, gcc_type);
         global.global_set_initializer_rvalue(value);
 
         // As an optimization, all shared statics which do not have interior
diff --git a/src/context.rs b/src/context.rs
index 83c4683a6683..4bc8c5a6760e 100644
--- a/src/context.rs
+++ b/src/context.rs
@@ -35,6 +35,7 @@ pub struct CodegenCx<'gcc, 'tcx> {
     pub normal_function_addresses: RefCell<FxHashSet<RValue<'gcc>>>,
 
     pub functions: RefCell<HashMap<String, Function<'gcc>>>,
+    pub intrinsics: RefCell<FxHashMap<String, Function<'gcc>>>,
 
     pub tls_model: gccjit::TlsModel,
 
@@ -184,6 +185,7 @@ impl<'gcc, 'tcx> CodegenCx<'gcc, 'tcx> {
             current_func: RefCell::new(None),
             normal_function_addresses: Default::default(),
             functions: RefCell::new(functions),
+            intrinsics: RefCell::new(FxHashMap::default()),
 
             tls_model,
 
@@ -279,6 +281,15 @@ impl<'gcc, 'tcx> CodegenCx<'gcc, 'tcx> {
     pub fn sess(&self) -> &Session {
         &self.tcx.sess
     }
+
+    pub fn bitcast_if_needed(&self, value: RValue<'gcc>, expected_type: Type<'gcc>) -> RValue<'gcc> {
+        if value.get_type() != expected_type {
+            self.context.new_bitcast(None, value, expected_type)
+        }
+        else {
+            value
+        }
+    }
 }
 
 impl<'gcc, 'tcx> BackendTypes for CodegenCx<'gcc, 'tcx> {
@@ -306,8 +317,16 @@ impl<'gcc, 'tcx> MiscMethods<'tcx> for CodegenCx<'gcc, 'tcx> {
     }
 
     fn get_fn_addr(&self, instance: Instance<'tcx>) -> RValue<'gcc> {
-        let func = get_fn(self, instance);
-        let func = self.rvalue_as_function(func);
+        let func_name = self.tcx.symbol_name(instance).name;
+
+        let func =
+            if self.intrinsics.borrow().contains_key(func_name) {
+                self.intrinsics.borrow()[func_name].clone()
+            }
+            else {
+                let func = get_fn(self, instance);
+                self.rvalue_as_function(func)
+            };
         let ptr = func.get_address(None);
 
         // TODO(antoyo): don't do this twice: i.e. in declare_fn and here.
diff --git a/src/declare.rs b/src/declare.rs
index 43017376916d..a619e2f77125 100644
--- a/src/declare.rs
+++ b/src/declare.rs
@@ -11,7 +11,7 @@ use crate::intrinsic::llvm;
 impl<'gcc, 'tcx> CodegenCx<'gcc, 'tcx> {
     pub fn get_or_insert_global(&self, name: &str, ty: Type<'gcc>, is_tls: bool, link_section: Option<Symbol>) -> LValue<'gcc> {
         if self.globals.borrow().contains_key(name) {
-            let typ = self.globals.borrow().get(name).expect("global").get_type();
+            let typ = self.globals.borrow()[name].get_type();
             let global = self.context.new_global(None, GlobalKind::Imported, typ, name);
             if is_tls {
                 global.set_tls_model(self.tls_model);
@@ -103,11 +103,13 @@ impl<'gcc, 'tcx> CodegenCx<'gcc, 'tcx> {
 /// update the declaration and return existing Value instead.
 fn declare_raw_fn<'gcc>(cx: &CodegenCx<'gcc, '_>, name: &str, _callconv: () /*llvm::CallConv*/, return_type: Type<'gcc>, param_types: &[Type<'gcc>], variadic: bool) -> Function<'gcc> {
     if name.starts_with("llvm.") {
-        return llvm::intrinsic(name, cx);
+        let intrinsic = llvm::intrinsic(name, cx);
+        cx.intrinsics.borrow_mut().insert(name.to_string(), intrinsic);
+        return intrinsic;
     }
     let func =
         if cx.functions.borrow().contains_key(name) {
-            *cx.functions.borrow().get(name).expect("function")
+            cx.functions.borrow()[name]
         }
         else {
             let params: Vec<_> = param_types.into_iter().enumerate()
diff --git a/src/intrinsic/llvm.rs b/src/intrinsic/llvm.rs
index 4b41b0ba6e78..1b089f08f764 100644
--- a/src/intrinsic/llvm.rs
+++ b/src/intrinsic/llvm.rs
@@ -1,6 +1,172 @@
-use gccjit::Function;
+use std::borrow::Cow;
 
-use crate::context::CodegenCx;
+use gccjit::{Function, FunctionPtrType, RValue, ToRValue};
+
+use crate::{context::CodegenCx, builder::Builder};
+
+pub fn adjust_intrinsic_arguments<'a, 'b, 'gcc, 'tcx>(builder: &Builder<'a, 'gcc, 'tcx>, gcc_func: FunctionPtrType<'gcc>, mut args: Cow<'b, [RValue<'gcc>]>, func_name: &str) -> Cow<'b, [RValue<'gcc>]> {
+    // Some LLVM intrinsics do not map 1-to-1 to GCC intrinsics, so we add the missing
+    // arguments here.
+    if gcc_func.get_param_count() != args.len() {
+        match &*func_name {
+            "__builtin_ia32_pmuldq512_mask" | "__builtin_ia32_pmuludq512_mask"
+                // FIXME(antoyo): the following intrinsics have 4 (or 5) arguments according to the doc, but are defined with 2 (or 3) arguments in library/stdarch/crates/core_arch/src/x86/avx512f.rs.
+                | "__builtin_ia32_pmaxsd512_mask" | "__builtin_ia32_pmaxsq512_mask" | "__builtin_ia32_pmaxsq256_mask"
+                | "__builtin_ia32_pmaxsq128_mask" | "__builtin_ia32_maxps512_mask" | "__builtin_ia32_maxpd512_mask"
+                | "__builtin_ia32_pmaxud512_mask" | "__builtin_ia32_pmaxuq512_mask" | "__builtin_ia32_pmaxuq256_mask"
+                | "__builtin_ia32_pmaxuq128_mask"
+                | "__builtin_ia32_pminsd512_mask" | "__builtin_ia32_pminsq512_mask" | "__builtin_ia32_pminsq256_mask"
+                | "__builtin_ia32_pminsq128_mask" | "__builtin_ia32_minps512_mask" | "__builtin_ia32_minpd512_mask"
+                | "__builtin_ia32_pminud512_mask" | "__builtin_ia32_pminuq512_mask" | "__builtin_ia32_pminuq256_mask"
+                | "__builtin_ia32_pminuq128_mask" | "__builtin_ia32_sqrtps512_mask" | "__builtin_ia32_sqrtpd512_mask"
+                => {
+                // TODO: refactor by separating those intrinsics outside of this branch.
+                let add_before_last_arg =
+                    match &*func_name {
+                        "__builtin_ia32_maxps512_mask" | "__builtin_ia32_maxpd512_mask"
+                        | "__builtin_ia32_minps512_mask" | "__builtin_ia32_minpd512_mask"
+                        | "__builtin_ia32_sqrtps512_mask" | "__builtin_ia32_sqrtpd512_mask" => true,
+                        _ => false,
+                    };
+                let new_first_arg_is_zero =
+                    match &*func_name {
+                        "__builtin_ia32_pmaxuq256_mask" | "__builtin_ia32_pmaxuq128_mask"
+                        | "__builtin_ia32_pminuq256_mask" | "__builtin_ia32_pminuq128_mask" => true,
+                        _ => false
+                    };
+                let arg3_index =
+                    match &*func_name {
+                        "__builtin_ia32_sqrtps512_mask" | "__builtin_ia32_sqrtpd512_mask" => 1,
+                        _ => 2,
+                    };
+                let mut new_args = args.to_vec();
+                let arg3_type = gcc_func.get_param_type(arg3_index);
+                let first_arg =
+                    if new_first_arg_is_zero {
+                        let vector_type = arg3_type.dyncast_vector().expect("vector type");
+                        let zero = builder.context.new_rvalue_zero(vector_type.get_element_type());
+                        let num_units = vector_type.get_num_units();
+                        builder.context.new_rvalue_from_vector(None, arg3_type, &vec![zero; num_units])
+                    }
+                    else {
+                        builder.current_func().new_local(None, arg3_type, "undefined_for_intrinsic").to_rvalue()
+                    };
+                if add_before_last_arg {
+                    new_args.insert(new_args.len() - 1, first_arg);
+                }
+                else {
+                    new_args.push(first_arg);
+                }
+                let arg4_index =
+                    match &*func_name {
+                        "__builtin_ia32_sqrtps512_mask" | "__builtin_ia32_sqrtpd512_mask" => 2,
+                        _ => 3,
+                    };
+                let arg4_type = gcc_func.get_param_type(arg4_index);
+                let minus_one = builder.context.new_rvalue_from_int(arg4_type, -1);
+                if add_before_last_arg {
+                    new_args.insert(new_args.len() - 1, minus_one);
+                }
+                else {
+                    new_args.push(minus_one);
+                }
+                args = new_args.into();
+            },
+            "__builtin_ia32_pternlogd512_mask" | "__builtin_ia32_pternlogd256_mask"
+            | "__builtin_ia32_pternlogd128_mask" | "__builtin_ia32_pternlogq512_mask"
+            | "__builtin_ia32_pternlogq256_mask" | "__builtin_ia32_pternlogq128_mask" => {
+                let mut new_args = args.to_vec();
+                let arg5_type = gcc_func.get_param_type(4);
+                let minus_one = builder.context.new_rvalue_from_int(arg5_type, -1);
+                new_args.push(minus_one);
+                args = new_args.into();
+            },
+            "__builtin_ia32_vfmaddps512_mask" | "__builtin_ia32_vfmaddpd512_mask" => {
+                let mut new_args = args.to_vec();
+
+                let mut last_arg = None;
+                if args.len() == 4 {
+                    last_arg = new_args.pop();
+                }
+
+                let arg4_type = gcc_func.get_param_type(3);
+                let minus_one = builder.context.new_rvalue_from_int(arg4_type, -1);
+                new_args.push(minus_one);
+
+                if args.len() == 3 {
+                    // Both llvm.fma.v16f32 and llvm.x86.avx512.vfmadd.ps.512 map to
+                    // the same GCC intrinsic, but the former has 3 parameters and the
+                    // latter has 4, so the latter doesn't require this additional argument.
+                    let arg5_type = gcc_func.get_param_type(4);
+                    new_args.push(builder.context.new_rvalue_from_int(arg5_type, 4));
+                }
+
+                if let Some(last_arg) = last_arg {
+                    new_args.push(last_arg);
+                }
+
+                args = new_args.into();
+            },
+            "__builtin_ia32_addps512_mask" | "__builtin_ia32_addpd512_mask"
+            | "__builtin_ia32_subps512_mask" | "__builtin_ia32_subpd512_mask"
+            | "__builtin_ia32_mulps512_mask" | "__builtin_ia32_mulpd512_mask"
+            | "__builtin_ia32_divps512_mask" | "__builtin_ia32_divpd512_mask" => {
+                let mut new_args = args.to_vec();
+                let last_arg = new_args.pop().expect("last arg");
+                let arg3_type = gcc_func.get_param_type(2);
+                let undefined = builder.current_func().new_local(None, arg3_type, "undefined_for_intrinsic").to_rvalue();
+                new_args.push(undefined);
+                let arg4_type = gcc_func.get_param_type(3);
+                let minus_one = builder.context.new_rvalue_from_int(arg4_type, -1);
+                new_args.push(minus_one);
+                new_args.push(last_arg);
+                args = new_args.into();
+            },
+            "__builtin_ia32_vfmaddsubps512_mask" | "__builtin_ia32_vfmaddsubpd512_mask" => {
+                let mut new_args = args.to_vec();
+                let last_arg = new_args.pop().expect("last arg");
+                let arg4_type = gcc_func.get_param_type(3);
+                let minus_one = builder.context.new_rvalue_from_int(arg4_type, -1);
+                new_args.push(minus_one);
+                new_args.push(last_arg);
+                args = new_args.into();
+            },
+            _ => (),
+        }
+    }
+
+    args
+}
+
+pub fn ignore_arg_cast(func_name: &str, index: usize, args_len: usize) -> bool {
+    // NOTE: these intrinsics have missing parameters before the last one, so ignore the
+    // last argument type check.
+    // FIXME(antoyo): find a way to refactor in order to avoid this hack.
+    match func_name {
+        "__builtin_ia32_maxps512_mask" | "__builtin_ia32_maxpd512_mask"
+        | "__builtin_ia32_minps512_mask" | "__builtin_ia32_minpd512_mask" | "__builtin_ia32_sqrtps512_mask"
+        | "__builtin_ia32_sqrtpd512_mask" | "__builtin_ia32_addps512_mask" | "__builtin_ia32_addpd512_mask"
+        | "__builtin_ia32_subps512_mask" | "__builtin_ia32_subpd512_mask"
+        | "__builtin_ia32_mulps512_mask" | "__builtin_ia32_mulpd512_mask"
+        | "__builtin_ia32_divps512_mask" | "__builtin_ia32_divpd512_mask"
+        | "__builtin_ia32_vfmaddsubps512_mask" | "__builtin_ia32_vfmaddsubpd512_mask" => {
+            if index == args_len - 1 {
+                return true;
+            }
+        },
+        "__builtin_ia32_vfmaddps512_mask" | "__builtin_ia32_vfmaddpd512_mask" => {
+            // Since there are two LLVM intrinsics that map to each of these GCC builtins and only
+            // one of them has a missing parameter before the last one, we check the number of
+            // arguments to distinguish those cases.
+            if args_len == 4 && index == args_len - 1 {
+                return true;
+            }
+        },
+        _ => (),
+    }
+
+    false
+}
 
 #[cfg(not(feature="master"))]
 pub fn intrinsic<'gcc, 'tcx>(name: &str, cx: &CodegenCx<'gcc, 'tcx>) -> Function<'gcc> {
@@ -21,6 +187,59 @@ pub fn intrinsic<'gcc, 'tcx>(name: &str, cx: &CodegenCx<'gcc, 'tcx>) -> Function<'gcc> {
         "llvm.x86.xgetbv" => "__builtin_ia32_xgetbv",
         // NOTE: this doc specifies the equivalent GCC builtins: http://huonw.github.io/llvmint/llvmint/x86/index.html
         "llvm.sqrt.v2f64" => "__builtin_ia32_sqrtpd",
+        "llvm.x86.avx512.pmul.dq.512" => "__builtin_ia32_pmuldq512_mask",
+        "llvm.x86.avx512.pmulu.dq.512" => "__builtin_ia32_pmuludq512_mask",
+        "llvm.x86.avx512.mask.pmaxs.q.256" => "__builtin_ia32_pmaxsq256_mask",
+        "llvm.x86.avx512.mask.pmaxs.q.128" => "__builtin_ia32_pmaxsq128_mask",
+        "llvm.x86.avx512.max.ps.512" => "__builtin_ia32_maxps512_mask",
+        "llvm.x86.avx512.max.pd.512" => "__builtin_ia32_maxpd512_mask",
+        "llvm.x86.avx512.mask.pmaxu.q.256" => "__builtin_ia32_pmaxuq256_mask",
+        "llvm.x86.avx512.mask.pmaxu.q.128" => "__builtin_ia32_pmaxuq128_mask",
+        "llvm.x86.avx512.mask.pmins.q.256" => "__builtin_ia32_pminsq256_mask",
+        "llvm.x86.avx512.mask.pmins.q.128" => "__builtin_ia32_pminsq128_mask",
+        "llvm.x86.avx512.min.ps.512" => "__builtin_ia32_minps512_mask",
+        "llvm.x86.avx512.min.pd.512" => "__builtin_ia32_minpd512_mask",
+        "llvm.x86.avx512.mask.pminu.q.256" => "__builtin_ia32_pminuq256_mask",
+        "llvm.x86.avx512.mask.pminu.q.128" => "__builtin_ia32_pminuq128_mask",
+        "llvm.fma.v16f32" => "__builtin_ia32_vfmaddps512_mask",
+        "llvm.fma.v8f64" => "__builtin_ia32_vfmaddpd512_mask",
+        "llvm.x86.avx512.vfmaddsub.ps.512" => "__builtin_ia32_vfmaddsubps512_mask",
+        "llvm.x86.avx512.vfmaddsub.pd.512" => "__builtin_ia32_vfmaddsubpd512_mask",
+        "llvm.x86.avx512.pternlog.d.512" => "__builtin_ia32_pternlogd512_mask",
+        "llvm.x86.avx512.pternlog.d.256" => "__builtin_ia32_pternlogd256_mask",
+        "llvm.x86.avx512.pternlog.d.128" => "__builtin_ia32_pternlogd128_mask",
+        "llvm.x86.avx512.pternlog.q.512" => "__builtin_ia32_pternlogq512_mask",
+        "llvm.x86.avx512.pternlog.q.256" => "__builtin_ia32_pternlogq256_mask",
+        "llvm.x86.avx512.pternlog.q.128" => "__builtin_ia32_pternlogq128_mask",
+        "llvm.x86.avx512.add.ps.512" => "__builtin_ia32_addps512_mask",
+        "llvm.x86.avx512.add.pd.512" => "__builtin_ia32_addpd512_mask",
+        "llvm.x86.avx512.sub.ps.512" => "__builtin_ia32_subps512_mask",
+        "llvm.x86.avx512.sub.pd.512" => "__builtin_ia32_subpd512_mask",
+        "llvm.x86.avx512.mul.ps.512" => "__builtin_ia32_mulps512_mask",
+        "llvm.x86.avx512.mul.pd.512" => "__builtin_ia32_mulpd512_mask",
+        "llvm.x86.avx512.div.ps.512" => "__builtin_ia32_divps512_mask",
+        "llvm.x86.avx512.div.pd.512" => "__builtin_ia32_divpd512_mask",
+        "llvm.x86.avx512.vfmadd.ps.512" => "__builtin_ia32_vfmaddps512_mask",
+        "llvm.x86.avx512.vfmadd.pd.512" => "__builtin_ia32_vfmaddpd512_mask",
+
+        // The above doc points to unknown builtins for the following, so override them:
+        "llvm.x86.avx2.gather.d.d" => "__builtin_ia32_gathersiv4si",
+        "llvm.x86.avx2.gather.d.d.256" => "__builtin_ia32_gathersiv8si",
+        "llvm.x86.avx2.gather.d.ps" => "__builtin_ia32_gathersiv4sf",
+        "llvm.x86.avx2.gather.d.ps.256" => "__builtin_ia32_gathersiv8sf",
+        "llvm.x86.avx2.gather.d.q" => "__builtin_ia32_gathersiv2di",
+        "llvm.x86.avx2.gather.d.q.256" => "__builtin_ia32_gathersiv4di",
+        "llvm.x86.avx2.gather.d.pd" => "__builtin_ia32_gathersiv2df",
+        "llvm.x86.avx2.gather.d.pd.256" => "__builtin_ia32_gathersiv4df",
+        "llvm.x86.avx2.gather.q.d" => "__builtin_ia32_gatherdiv4si",
+        "llvm.x86.avx2.gather.q.d.256" => "__builtin_ia32_gatherdiv4si256",
+        "llvm.x86.avx2.gather.q.ps" => "__builtin_ia32_gatherdiv4sf",
+        "llvm.x86.avx2.gather.q.ps.256" => "__builtin_ia32_gatherdiv4sf256",
+        "llvm.x86.avx2.gather.q.q" => "__builtin_ia32_gatherdiv2di",
+        "llvm.x86.avx2.gather.q.q.256" => "__builtin_ia32_gatherdiv4di",
+        "llvm.x86.avx2.gather.q.pd" => "__builtin_ia32_gatherdiv2df",
+        "llvm.x86.avx2.gather.q.pd.256" => "__builtin_ia32_gatherdiv4df",
+        "" => "",
         // NOTE: this file is generated by https://github.com/GuillaumeGomez/llvmint/blob/master/generate_list.py
         _ => include!("archs.rs"),
     };
diff --git a/src/intrinsic/simd.rs b/src/intrinsic/simd.rs
index f1167bc3a3b6..6c2834fccf30 100644
--- a/src/intrinsic/simd.rs
+++ b/src/intrinsic/simd.rs
@@ -1,15 +1,17 @@
 use std::cmp::Ordering;
 
-use gccjit::{RValue, Type, ToRValue};
+use gccjit::{BinaryOp, RValue, Type, ToRValue};
 use rustc_codegen_ssa::base::compare_simd_types;
 use rustc_codegen_ssa::common::{TypeKind, span_invalid_monomorphization_error};
 use rustc_codegen_ssa::mir::operand::OperandRef;
+use rustc_codegen_ssa::mir::place::PlaceRef;
 use rustc_codegen_ssa::traits::{BaseTypeMethods, BuilderMethods};
 use rustc_hir as hir;
 use rustc_middle::span_bug;
 use rustc_middle::ty::layout::HasTyCtxt;
 use rustc_middle::ty::{self, Ty};
 use rustc_span::{Span, Symbol, sym};
+use rustc_target::abi::Align;
 
 use crate::builder::Builder;
 use crate::intrinsic;
@@ -55,7 +57,53 @@ pub fn generic_simd_intrinsic<'a, 'gcc, 'tcx>(bx: &mut Builder<'a, 'gcc, 'tcx>,
     let sig = tcx.normalize_erasing_late_bound_regions(ty::ParamEnv::reveal_all(), callee_ty.fn_sig(tcx));
     let arg_tys = sig.inputs();
 
-    let name_str = name.as_str();
+
+    if name == sym::simd_select_bitmask {
+        require_simd!(arg_tys[1], "argument");
+        let (len, _) = arg_tys[1].simd_size_and_type(bx.tcx());
+
+        let expected_int_bits = (len.max(8) - 1).next_power_of_two();
+        let expected_bytes = len / 8 + ((len % 8 > 0) as u64);
+
+        let mask_ty = arg_tys[0];
+        let mut mask = match mask_ty.kind() {
+            ty::Int(i) if i.bit_width() == Some(expected_int_bits) => args[0].immediate(),
+            ty::Uint(i) if i.bit_width() == Some(expected_int_bits) => args[0].immediate(),
+            ty::Array(elem, len)
+                if matches!(elem.kind(), ty::Uint(ty::UintTy::U8))
+                    && len.try_eval_usize(bx.tcx, ty::ParamEnv::reveal_all())
+                        == Some(expected_bytes) =>
+            {
+                let place = PlaceRef::alloca(bx, args[0].layout);
+                args[0].val.store(bx, place);
+                let int_ty = bx.type_ix(expected_bytes * 8);
+                let ptr = bx.pointercast(place.llval, bx.cx.type_ptr_to(int_ty));
+                bx.load(int_ty, ptr, Align::ONE)
+            }
+            _ => return_error!(
+                "invalid bitmask `{}`, expected `u{}` or `[u8; {}]`",
+                mask_ty,
+                expected_int_bits,
+                expected_bytes
+            ),
+        };
+
+        let arg1 = args[1].immediate();
+        let arg1_type = arg1.get_type();
+        let arg1_vector_type = arg1_type.unqualified().dyncast_vector().expect("vector type");
+        let arg1_element_type = arg1_vector_type.get_element_type();
+
+        let mut elements = vec![];
+        let one = bx.context.new_rvalue_one(mask.get_type());
+        for _ in 0..len {
+            let element = bx.context.new_cast(None, mask & one, arg1_element_type);
+            elements.push(element);
+            mask = mask >> one;
+        }
+        let vector_mask = bx.context.new_rvalue_from_vector(None, arg1_type, &elements);
+
+        return Ok(bx.vector_select(vector_mask, arg1, args[2].immediate()));
+    }
 
     // every intrinsic below takes a SIMD vector as its first argument
     require_simd!(arg_tys[0], "input");
@@ -102,7 +150,7 @@ pub fn generic_simd_intrinsic<'a, 'gcc, 'tcx>(bx: &mut Builder<'a, 'gcc, 'tcx>,
         ));
     }
 
-    if let Some(stripped) = name_str.strip_prefix("simd_shuffle") {
+    if let Some(stripped) = name.as_str().strip_prefix("simd_shuffle") {
         let n: u64 =
             if stripped.is_empty() {
                 // Make sure this is actually an array, since typeck only checks the length-suffixed
@@ -172,6 +220,7 @@ pub fn generic_simd_intrinsic<'a, 'gcc, 'tcx>(bx: &mut Builder<'a, 'gcc, 'tcx>,
         // NOTE: we cannot cast to an array and assign to its element here because the value might
        // not be an l-value. So, call a builtin to set the element.
         // TODO(antoyo): perhaps we could create a new vector or maybe there's a GIMPLE instruction for that?
+        // TODO(antoyo): don't use target specific builtins here.
         let func_name =
             match in_len {
                 2 => {
@@ -202,14 +251,8 @@ pub fn generic_simd_intrinsic<'a, 'gcc, 'tcx>(bx: &mut Builder<'a, 'gcc, 'tcx>,
             };
         let builtin = bx.context.get_target_builtin_function(func_name);
         let param1_type = builtin.get_param(0).to_rvalue().get_type();
-        let vector =
-            if vector.get_type() != param1_type {
-                // TODO(antoyo): perhaps use __builtin_convertvector for vector casting.
-                bx.context.new_bitcast(None, vector, param1_type)
-            }
-            else {
-                vector
-            };
+        // TODO(antoyo): perhaps use __builtin_convertvector for vector casting.
+        let vector = bx.cx.bitcast_if_needed(vector, param1_type);
         let result = bx.context.new_call(None, builtin, &[vector, value, bx.context.new_cast(None, index, bx.int_type)]);
         // TODO(antoyo): perhaps use __builtin_convertvector for vector casting.
         return Ok(bx.context.new_bitcast(None, result, vector.get_type()));
@@ -228,6 +271,24 @@ pub fn generic_simd_intrinsic<'a, 'gcc, 'tcx>(bx: &mut Builder<'a, 'gcc, 'tcx>,
         return Ok(bx.context.new_vector_access(None, vector, args[1].immediate()).to_rvalue());
     }
 
+    if name == sym::simd_select {
+        let m_elem_ty = in_elem;
+        let m_len = in_len;
+        require_simd!(arg_tys[1], "argument");
+        let (v_len, _) = arg_tys[1].simd_size_and_type(bx.tcx());
+        require!(
+            m_len == v_len,
+            "mismatched lengths: mask length `{}` != other vector length `{}`",
+            m_len,
+            v_len
+        );
+        match m_elem_ty.kind() {
+            ty::Int(_) => {}
+            _ => return_error!("mask element type is `{}`, expected `i_`", m_elem_ty),
+        }
+        return Ok(bx.vector_select(args[0].immediate(), args[1].immediate(), args[2].immediate()));
+    }
+
     if name == sym::simd_cast {
         require_simd!(ret_ty, "return");
         let (out_len, out_elem) = ret_ty.simd_size_and_type(bx.tcx());
@@ -336,6 +397,10 @@ pub fn generic_simd_intrinsic<'a, 'gcc, 'tcx>(bx: &mut Builder<'a, 'gcc, 'tcx>,
                 });
             }
             (Style::Int(_), Style::Float) => {
+                // TODO: add support for internal functions in libgccjit to get access to IFN_VEC_CONVERT, which
+                // does something like __builtin_convertvector?
+                // Or maybe provide convert_vector as an API since it might not be easy to get the
+                // types of internal functions.
                 unimplemented!();
             }
             (Style::Float, Style::Int(_)) => {
@@ -539,10 +604,150 @@ pub fn generic_simd_intrinsic<'a, 'gcc, 'tcx>(bx: &mut Builder<'a, 'gcc, 'tcx>,
         let vec_ty = bx.cx.type_vector(elem_ty, in_len as u64);
 
         let func = bx.context.get_target_builtin_function(builtin_name);
+        let param1_type = func.get_param(0).to_rvalue().get_type();
+        let param2_type = func.get_param(1).to_rvalue().get_type();
+        let lhs = bx.cx.bitcast_if_needed(lhs, param1_type);
+        let rhs = bx.cx.bitcast_if_needed(rhs, param2_type);
         let result = bx.context.new_call(None, func, &[lhs, rhs]);
         // TODO(antoyo): perhaps use __builtin_convertvector for vector casting.
         return Ok(bx.context.new_bitcast(None, result, vec_ty));
     }
 
+    macro_rules! arith_red {
+        ($name:ident : $vec_op:expr, $float_reduce:ident, $ordered:expr, $op:ident,
+         $identity:expr) => {
+            if name == sym::$name {
+                require!(
+                    ret_ty == in_elem,
+                    "expected return type `{}` (element of input `{}`), found `{}`",
+                    in_elem,
+                    in_ty,
+                    ret_ty
+                );
+                return match in_elem.kind() {
+                    ty::Int(_) | ty::Uint(_) => {
+                        let r = bx.vector_reduce_op(args[0].immediate(), $vec_op);
+                        if $ordered {
+                            // if overflow occurs, the result is the
+                            // mathematical result modulo 2^n:
+                            Ok(bx.$op(args[1].immediate(), r))
+                        }
+                        else {
+                            Ok(bx.vector_reduce_op(args[0].immediate(), $vec_op))
+                        }
+                    }
+                    ty::Float(_) => {
+                        if $ordered {
+                            // ordered arithmetic reductions take an accumulator
+                            let acc = args[1].immediate();
+                            Ok(bx.$float_reduce(acc, args[0].immediate()))
+                        }
+                        else {
+                            Ok(bx.vector_reduce_op(args[0].immediate(), $vec_op))
+                        }
+                    }
+                    _ => return_error!(
+                        "unsupported {} from `{}` with element `{}` to `{}`",
+                        sym::$name,
+                        in_ty,
+                        in_elem,
+                        ret_ty
+                    ),
+                };
+            }
+        };
+    }
+
+    arith_red!(
+        simd_reduce_add_unordered: BinaryOp::Plus,
+        vector_reduce_fadd_fast,
+        false,
+        add,
+        0.0 // TODO: Use this argument.
+    );
+    arith_red!(
+        simd_reduce_mul_unordered: BinaryOp::Mult,
+        vector_reduce_fmul_fast,
+        false,
+        mul,
+        1.0
+    );
+
+    macro_rules! minmax_red {
+        ($name:ident: $reduction:ident) => {
+            if name == sym::$name {
+                require!(
+                    ret_ty == in_elem,
+                    "expected return type `{}` (element of input `{}`), found `{}`",
+                    in_elem,
+                    in_ty,
+                    ret_ty
+                );
+                return match in_elem.kind() {
+                    ty::Int(_) | ty::Uint(_) | ty::Float(_) => Ok(bx.$reduction(args[0].immediate())),
+                    _ => return_error!(
+                        "unsupported {} from `{}` with element `{}` to `{}`",
+                        sym::$name,
+                        in_ty,
+                        in_elem,
+                        ret_ty
+                    ),
+                };
+            }
+        };
+    }
+
+    minmax_red!(simd_reduce_min: vector_reduce_min);
+    minmax_red!(simd_reduce_max: vector_reduce_max);
+
+    macro_rules! bitwise_red {
+        ($name:ident : $op:expr, $boolean:expr) => {
+            if name == sym::$name {
+                let input = if !$boolean {
+                    require!(
+                        ret_ty == in_elem,
+                        "expected return type `{}` (element of input `{}`), found `{}`",
+                        in_elem,
+                        in_ty,
+                        ret_ty
+                    );
+                    args[0].immediate()
+                } else {
+                    match in_elem.kind() {
+                        ty::Int(_) | ty::Uint(_) => {}
+                        _ => return_error!(
+                            "unsupported {} from `{}` with element `{}` to `{}`",
+                            sym::$name,
+                            in_ty,
+                            in_elem,
+                            ret_ty
+                        ),
+                    }
+
+                    // boolean reductions operate on vectors of i1s:
+                    let i1 = bx.type_i1();
+                    let i1xn = bx.type_vector(i1, in_len as u64);
+                    bx.trunc(args[0].immediate(), i1xn)
+                };
+                return match in_elem.kind() {
+                    ty::Int(_) | ty::Uint(_) => {
+                        let r = bx.vector_reduce_op(input, $op);
+                        Ok(if !$boolean { r } else { bx.zext(r, bx.type_bool()) })
+                    }
+                    _ => return_error!(
+                        "unsupported {} from `{}` with element `{}` to `{}`",
+                        sym::$name,
+                        in_ty,
+                        in_elem,
+                        ret_ty
+                    ),
+                };
+            }
+        };
+    }
+
+    bitwise_red!(simd_reduce_and: BinaryOp::BitwiseAnd, false);
+    bitwise_red!(simd_reduce_or: BinaryOp::BitwiseOr, false);
+
     unimplemented!("simd {}", name);
 }
diff --git a/src/type_.rs b/src/type_.rs
index db2b5ea8ab20..002b95db36de 100644
--- a/src/type_.rs
+++ b/src/type_.rs
@@ -247,6 +247,10 @@ impl<'gcc, 'tcx> CodegenCx<'gcc, 'tcx> {
 
         self.context.new_array_type(None, ty, len)
     }
+
+    pub fn type_bool(&self) -> Type<'gcc> {
+        self.context.new_type::<bool>()
+    }
 }
 
 pub fn struct_fields<'gcc, 'tcx>(cx: &CodegenCx<'gcc, 'tcx>, layout: TyAndLayout<'tcx>) -> (Vec<Type<'gcc>>, bool) {