diff --git a/Readme.md b/Readme.md
index fe23a2676966..5bbd29fceba2 100644
--- a/Readme.md
+++ b/Readme.md
@@ -127,6 +127,10 @@ To get the `rustc` command to run in `gdb`, add the `--verbose` flag to `cargo b
 * Build the stage2 compiler (`rustup toolchain link debug-current build/x86_64-unknown-linux-gnu/stage2`).
 * Clean and rebuild the codegen with `debug-current` in the file `rust-toolchain`.
 
+### How to use [mem-trace](https://github.com/antoyo/mem-trace)
+
+`rustc` needs to be built without `jemalloc` so that `mem-trace` can overload `malloc`: since `jemalloc` is linked statically, an `LD_PRELOAD`-ed library won't have a chance to intercept the calls to `malloc`.
+
 ### How to build a cross-compiling libgccjit
 
 #### Building libgccjit
diff --git a/src/base.rs b/src/base.rs
index e4ecbd46f0c4..19c981309d75 100644
--- a/src/base.rs
+++ b/src/base.rs
@@ -81,11 +81,17 @@ pub fn compile_codegen_unit<'tcx>(tcx: TyCtxt<'tcx>, cgu_name: Symbol, supports_
         // TODO(antoyo): only add the following cli argument if the feature is supported.
         context.add_command_line_option("-msse2");
         context.add_command_line_option("-mavx2");
-        context.add_command_line_option("-msha");
-        context.add_command_line_option("-mpclmul");
         // FIXME(antoyo): the following causes an illegal instruction on vmovdqu64 in std_example on my CPU.
         // Only add if the CPU supports it.
-        //context.add_command_line_option("-mavx512f");
+        /*context.add_command_line_option("-mavx512f");
+        context.add_command_line_option("-msha");
+        context.add_command_line_option("-mpclmul");
+        context.add_command_line_option("-mfma");
+        context.add_command_line_option("-mfma4");
+        context.add_command_line_option("-mavx512vpopcntdq");
+        context.add_command_line_option("-mavx512vl");
+        context.add_command_line_option("-m64");
+        context.add_command_line_option("-mbmi");*/
         for arg in &tcx.sess.opts.cg.llvm_args {
             context.add_command_line_option(arg);
         }
diff --git a/src/builder.rs b/src/builder.rs
index 3e1f56c183ac..3804a958e691 100644
--- a/src/builder.rs
+++ b/src/builder.rs
@@ -213,7 +213,7 @@ impl<'a, 'gcc, 'tcx> Builder<'a, 'gcc, 'tcx> {
                 let actual_ty = actual_val.get_type();
                 if expected_ty != actual_ty {
-                    if !actual_ty.is_vector() && !expected_ty.is_vector() && actual_ty.is_integral() && expected_ty.is_integral() && actual_ty.get_size() != expected_ty.get_size() {
+                    if !actual_ty.is_vector() && !expected_ty.is_vector() && actual_ty.is_integral() && expected_ty.is_integral() {
                         self.context.new_cast(None, actual_val, expected_ty)
                     }
                     else if on_stack_param_indices.contains(&index) {
@@ -1390,18 +1390,20 @@ impl<'a, 'gcc, 'tcx> Builder<'a, 'gcc, 'tcx> {
     where F: Fn(RValue<'gcc>, RValue<'gcc>, &'gcc Context<'gcc>) -> RValue<'gcc>
     {
         let vector_type = src.get_type().unqualified().dyncast_vector().expect("vector type");
+        let element_type = vector_type.get_element_type();
+        let mask_element_type = self.type_ix(element_type.get_size() as u64 * 8);
         let element_count = vector_type.get_num_units();
         let mut vector_elements = vec![];
         for i in 0..element_count {
             vector_elements.push(i);
         }
-        let mask_type = self.context.new_vector_type(self.int_type, element_count as u64);
+        let mask_type = self.context.new_vector_type(mask_element_type, element_count as u64);
         let mut shift = 1;
         let mut res = src;
         while shift < element_count {
             let vector_elements: Vec<_> = vector_elements.iter()
-                .map(|i| self.context.new_rvalue_from_int(self.int_type, ((i + shift) % element_count) as i32))
+                .map(|i| self.context.new_rvalue_from_int(mask_element_type, ((i + shift) % element_count) as i32))
                 .collect();
             let mask = self.context.new_rvalue_from_vector(None, mask_type, &vector_elements);
             let shifted = self.context.new_rvalue_vector_perm(None, res, res, mask);
diff --git a/src/intrinsic/llvm.rs b/src/intrinsic/llvm.rs
index 42cf06c8c7ab..f2faae070284 100644
--- a/src/intrinsic/llvm.rs
+++ b/src/intrinsic/llvm.rs
@@ -288,7 +288,10 @@ pub fn adjust_intrinsic_return_value<'a, 'gcc, 'tcx>(builder: &Builder<'a, 'gcc,
     match func_name {
         "__builtin_ia32_vfmaddss3_round" | "__builtin_ia32_vfmaddsd3_round" => {
             let zero = builder.context.new_rvalue_zero(builder.int_type);
-            return_value = builder.context.new_vector_access(None, return_value, zero).to_rvalue();
+            #[cfg(feature="master")]
+            {
+                return_value = builder.context.new_vector_access(None, return_value, zero).to_rvalue();
+            }
         },
         "__builtin_ia32_addcarryx_u64" | "__builtin_ia32_sbb_u64" | "__builtin_ia32_addcarryx_u32" | "__builtin_ia32_sbb_u32" => {
             // Both llvm.x86.addcarry.32 and llvm.x86.addcarryx.u32 points to the same GCC builtin,
diff --git a/src/intrinsic/simd.rs b/src/intrinsic/simd.rs
index bf5d555736ae..8f9862414e60 100644
--- a/src/intrinsic/simd.rs
+++ b/src/intrinsic/simd.rs
@@ -216,7 +216,7 @@ pub fn generic_simd_intrinsic<'a, 'gcc, 'tcx>(bx: &mut Builder<'a, 'gcc, 'tcx>,
         let variable = bx.current_func().new_local(None, vector.get_type(), "new_vector");
         bx.llbb().add_assignment(None, variable, vector);
         let lvalue = bx.context.new_vector_access(None, variable.to_rvalue(), index);
-        // TODO: si simd_insert est constant, utiliser BIT_REF…
+        // TODO: if simd_insert is constant, use BIT_REF.
        bx.llbb().add_assignment(None, lvalue, value);
         return Ok(variable.to_rvalue());
     }
@@ -252,6 +252,7 @@ pub fn generic_simd_intrinsic<'a, 'gcc, 'tcx>(bx: &mut Builder<'a, 'gcc, 'tcx>,
         return Ok(bx.vector_select(args[0].immediate(), args[1].immediate(), args[2].immediate()));
     }
 
+    #[cfg(feature="master")]
     if name == sym::simd_cast {
         require_simd!(ret_ty, "return");
         let (out_len, out_elem) = ret_ty.simd_size_and_type(bx.tcx());