Skip to content
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.

Commit d4c86cf

Browse files
committed Nov 10, 2023
Auto merge of #117779 - bjorn3:sync_cg_clif-2023-11-10, r=bjorn3
Subtree update for rustc_codegen_cranelift Significantly improved support for simd intrinsics. r? `@ghost` `@rustbot` label +A-codegen +A-cranelift +T-compiler
2 parents 3d0e99d + d186b49 commit d4c86cf

File tree

13 files changed

+961
-67
lines changed

13 files changed

+961
-67
lines changed
 

‎compiler/rustc_codegen_cranelift/Readme.md

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -76,8 +76,6 @@ configuration options.
7676

7777
## Not yet supported
7878

79-
* Inline assembly ([no cranelift support](https://github.com/bytecodealliance/wasmtime/issues/1041))
80-
* On UNIX there is support for invoking an external assembler for `global_asm!` and `asm!`.
8179
* SIMD ([tracked here](https://github.com/rust-lang/rustc_codegen_cranelift/issues/171), `std::simd` fully works, `std::arch` is partially supported)
8280
* Unwinding on panics ([no cranelift support](https://github.com/bytecodealliance/wasmtime/issues/1677), `-Cpanic=abort` is enabled by default)
8381

‎compiler/rustc_codegen_cranelift/build_system/tests.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,7 @@ const BASE_SYSROOT_SUITE: &[TestCase] = &[
9999
TestCase::build_bin_and_run("aot.mod_bench", "example/mod_bench.rs", &[]),
100100
TestCase::build_bin_and_run("aot.issue-72793", "example/issue-72793.rs", &[]),
101101
TestCase::build_bin("aot.issue-59326", "example/issue-59326.rs"),
102+
TestCase::build_bin_and_run("aot.neon", "example/neon.rs", &[]),
102103
];
103104

104105
pub(crate) static RAND_REPO: GitRepo = GitRepo::github(

‎compiler/rustc_codegen_cranelift/config.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ aot.float-minmax-pass
4242
aot.mod_bench
4343
aot.issue-72793
4444
aot.issue-59326
45+
aot.neon
4546

4647
testsuite.extended_sysroot
4748
test.rust-random/rand
Lines changed: 234 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,234 @@
1+
// Most of these tests are copied from https://github.com/japaric/stdsimd/blob/0f4413d01c4f0c3ffbc5a69e9a37fbc7235b31a9/coresimd/arm/neon.rs
2+
3+
#![feature(portable_simd)]
4+
5+
#[cfg(target_arch = "aarch64")]
6+
use std::arch::aarch64::*;
7+
use std::mem::transmute;
8+
use std::simd::*;
9+
10+
#[cfg(target_arch = "aarch64")]
11+
unsafe fn test_vpmin_s8() {
12+
let a = i8x8::from([1, -2, 3, -4, 5, 6, 7, 8]);
13+
let b = i8x8::from([0, 3, 2, 5, 4, 7, 6, 9]);
14+
let e = i8x8::from([-2, -4, 5, 7, 0, 2, 4, 6]);
15+
let r: i8x8 = transmute(vpmin_s8(transmute(a), transmute(b)));
16+
assert_eq!(r, e);
17+
}
18+
19+
#[cfg(target_arch = "aarch64")]
20+
unsafe fn test_vpmin_s16() {
21+
let a = i16x4::from([1, 2, 3, -4]);
22+
let b = i16x4::from([0, 3, 2, 5]);
23+
let e = i16x4::from([1, -4, 0, 2]);
24+
let r: i16x4 = transmute(vpmin_s16(transmute(a), transmute(b)));
25+
assert_eq!(r, e);
26+
}
27+
28+
#[cfg(target_arch = "aarch64")]
29+
unsafe fn test_vpmin_s32() {
30+
let a = i32x2::from([1, -2]);
31+
let b = i32x2::from([0, 3]);
32+
let e = i32x2::from([-2, 0]);
33+
let r: i32x2 = transmute(vpmin_s32(transmute(a), transmute(b)));
34+
assert_eq!(r, e);
35+
}
36+
37+
#[cfg(target_arch = "aarch64")]
38+
unsafe fn test_vpmin_u8() {
39+
let a = u8x8::from([1, 2, 3, 4, 5, 6, 7, 8]);
40+
let b = u8x8::from([0, 3, 2, 5, 4, 7, 6, 9]);
41+
let e = u8x8::from([1, 3, 5, 7, 0, 2, 4, 6]);
42+
let r: u8x8 = transmute(vpmin_u8(transmute(a), transmute(b)));
43+
assert_eq!(r, e);
44+
}
45+
46+
#[cfg(target_arch = "aarch64")]
47+
unsafe fn test_vpmin_u16() {
48+
let a = u16x4::from([1, 2, 3, 4]);
49+
let b = u16x4::from([0, 3, 2, 5]);
50+
let e = u16x4::from([1, 3, 0, 2]);
51+
let r: u16x4 = transmute(vpmin_u16(transmute(a), transmute(b)));
52+
assert_eq!(r, e);
53+
}
54+
55+
#[cfg(target_arch = "aarch64")]
56+
unsafe fn test_vpmin_u32() {
57+
let a = u32x2::from([1, 2]);
58+
let b = u32x2::from([0, 3]);
59+
let e = u32x2::from([1, 0]);
60+
let r: u32x2 = transmute(vpmin_u32(transmute(a), transmute(b)));
61+
assert_eq!(r, e);
62+
}
63+
64+
#[cfg(target_arch = "aarch64")]
65+
unsafe fn test_vpmin_f32() {
66+
let a = f32x2::from([1., -2.]);
67+
let b = f32x2::from([0., 3.]);
68+
let e = f32x2::from([-2., 0.]);
69+
let r: f32x2 = transmute(vpmin_f32(transmute(a), transmute(b)));
70+
assert_eq!(r, e);
71+
}
72+
73+
#[cfg(target_arch = "aarch64")]
74+
unsafe fn test_vpmax_s8() {
75+
let a = i8x8::from([1, -2, 3, -4, 5, 6, 7, 8]);
76+
let b = i8x8::from([0, 3, 2, 5, 4, 7, 6, 9]);
77+
let e = i8x8::from([1, 3, 6, 8, 3, 5, 7, 9]);
78+
let r: i8x8 = transmute(vpmax_s8(transmute(a), transmute(b)));
79+
assert_eq!(r, e);
80+
}
81+
82+
#[cfg(target_arch = "aarch64")]
83+
unsafe fn test_vpmax_s16() {
84+
let a = i16x4::from([1, 2, 3, -4]);
85+
let b = i16x4::from([0, 3, 2, 5]);
86+
let e = i16x4::from([2, 3, 3, 5]);
87+
let r: i16x4 = transmute(vpmax_s16(transmute(a), transmute(b)));
88+
assert_eq!(r, e);
89+
}
90+
91+
#[cfg(target_arch = "aarch64")]
92+
unsafe fn test_vpmax_s32() {
93+
let a = i32x2::from([1, -2]);
94+
let b = i32x2::from([0, 3]);
95+
let e = i32x2::from([1, 3]);
96+
let r: i32x2 = transmute(vpmax_s32(transmute(a), transmute(b)));
97+
assert_eq!(r, e);
98+
}
99+
100+
#[cfg(target_arch = "aarch64")]
101+
unsafe fn test_vpmax_u8() {
102+
let a = u8x8::from([1, 2, 3, 4, 5, 6, 7, 8]);
103+
let b = u8x8::from([0, 3, 2, 5, 4, 7, 6, 9]);
104+
let e = u8x8::from([2, 4, 6, 8, 3, 5, 7, 9]);
105+
let r: u8x8 = transmute(vpmax_u8(transmute(a), transmute(b)));
106+
assert_eq!(r, e);
107+
}
108+
109+
#[cfg(target_arch = "aarch64")]
110+
unsafe fn test_vpmax_u16() {
111+
let a = u16x4::from([1, 2, 3, 4]);
112+
let b = u16x4::from([0, 3, 2, 5]);
113+
let e = u16x4::from([2, 4, 3, 5]);
114+
let r: u16x4 = transmute(vpmax_u16(transmute(a), transmute(b)));
115+
assert_eq!(r, e);
116+
}
117+
118+
#[cfg(target_arch = "aarch64")]
119+
unsafe fn test_vpmax_u32() {
120+
let a = u32x2::from([1, 2]);
121+
let b = u32x2::from([0, 3]);
122+
let e = u32x2::from([2, 3]);
123+
let r: u32x2 = transmute(vpmax_u32(transmute(a), transmute(b)));
124+
assert_eq!(r, e);
125+
}
126+
127+
#[cfg(target_arch = "aarch64")]
128+
unsafe fn test_vpmax_f32() {
129+
let a = f32x2::from([1., -2.]);
130+
let b = f32x2::from([0., 3.]);
131+
let e = f32x2::from([1., 3.]);
132+
let r: f32x2 = transmute(vpmax_f32(transmute(a), transmute(b)));
133+
assert_eq!(r, e);
134+
}
135+
136+
#[cfg(target_arch = "aarch64")]
137+
unsafe fn test_vpadd_s16() {
138+
let a = i16x4::from([1, 2, 3, 4]);
139+
let b = i16x4::from([0, -1, -2, -3]);
140+
let r: i16x4 = transmute(vpadd_s16(transmute(a), transmute(b)));
141+
let e = i16x4::from([3, 7, -1, -5]);
142+
assert_eq!(r, e);
143+
}
144+
#[cfg(target_arch = "aarch64")]
145+
unsafe fn test_vpadd_s32() {
146+
let a = i32x2::from([1, 2]);
147+
let b = i32x2::from([0, -1]);
148+
let r: i32x2 = transmute(vpadd_s32(transmute(a), transmute(b)));
149+
let e = i32x2::from([3, -1]);
150+
assert_eq!(r, e);
151+
}
152+
#[cfg(target_arch = "aarch64")]
153+
unsafe fn test_vpadd_s8() {
154+
let a = i8x8::from([1, 2, 3, 4, 5, 6, 7, 8]);
155+
let b = i8x8::from([0, -1, -2, -3, -4, -5, -6, -7]);
156+
let r: i8x8 = transmute(vpadd_s8(transmute(a), transmute(b)));
157+
let e = i8x8::from([3, 7, 11, 15, -1, -5, -9, -13]);
158+
assert_eq!(r, e);
159+
}
160+
#[cfg(target_arch = "aarch64")]
161+
unsafe fn test_vpadd_u16() {
162+
let a = u16x4::from([1, 2, 3, 4]);
163+
let b = u16x4::from([30, 31, 32, 33]);
164+
let r: u16x4 = transmute(vpadd_u16(transmute(a), transmute(b)));
165+
let e = u16x4::from([3, 7, 61, 65]);
166+
assert_eq!(r, e);
167+
}
168+
#[cfg(target_arch = "aarch64")]
169+
unsafe fn test_vpadd_u32() {
170+
let a = u32x2::from([1, 2]);
171+
let b = u32x2::from([30, 31]);
172+
let r: u32x2 = transmute(vpadd_u32(transmute(a), transmute(b)));
173+
let e = u32x2::from([3, 61]);
174+
assert_eq!(r, e);
175+
}
176+
#[cfg(target_arch = "aarch64")]
177+
unsafe fn test_vpadd_u8() {
178+
let a = u8x8::from([1, 2, 3, 4, 5, 6, 7, 8]);
179+
let b = u8x8::from([30, 31, 32, 33, 34, 35, 36, 37]);
180+
let r: u8x8 = transmute(vpadd_u8(transmute(a), transmute(b)));
181+
let e = u8x8::from([3, 7, 11, 15, 61, 65, 69, 73]);
182+
assert_eq!(r, e);
183+
}
184+
185+
#[cfg(target_arch = "aarch64")]
186+
unsafe fn test_vqsub_u8() {
187+
let a = u8x8::from([1, 2, 3, 4, 5, 6, 7, 0xff]);
188+
let b = u8x8::from([30, 1, 1, 1, 34, 0xff, 36, 37]);
189+
let r: u8x8 = transmute(vqsub_u8(transmute(a), transmute(b)));
190+
let e = u8x8::from([0, 1, 2, 3, 0, 0, 0, 218]);
191+
assert_eq!(r, e);
192+
}
193+
194+
#[cfg(target_arch = "aarch64")]
195+
unsafe fn test_vqadd_u8() {
196+
let a = u8x8::from([1, 2, 3, 4, 5, 6, 7, 0xff]);
197+
let b = u8x8::from([30, 1, 1, 1, 34, 0xff, 36, 37]);
198+
let r: u8x8 = transmute(vqadd_u8(transmute(a), transmute(b)));
199+
let e = u8x8::from([31, 3, 4, 5, 39, 0xff, 43, 0xff]);
200+
assert_eq!(r, e);
201+
}
202+
203+
#[cfg(target_arch = "aarch64")]
204+
fn main() {
205+
unsafe {
206+
test_vpmin_s8();
207+
test_vpmin_s16();
208+
test_vpmin_s32();
209+
test_vpmin_u8();
210+
test_vpmin_u16();
211+
test_vpmin_u32();
212+
test_vpmin_f32();
213+
test_vpmax_s8();
214+
test_vpmax_s16();
215+
test_vpmax_s32();
216+
test_vpmax_u8();
217+
test_vpmax_u16();
218+
test_vpmax_u32();
219+
test_vpmax_f32();
220+
221+
test_vpadd_s16();
222+
test_vpadd_s32();
223+
test_vpadd_s8();
224+
test_vpadd_u16();
225+
test_vpadd_u32();
226+
test_vpadd_u8();
227+
228+
test_vqsub_u8();
229+
test_vqadd_u8();
230+
}
231+
}
232+
233+
#[cfg(not(target_arch = "aarch64"))]
234+
fn main() {}

‎compiler/rustc_codegen_cranelift/patches/stdlib-lock.toml

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -58,9 +58,9 @@ dependencies = [
5858

5959
[[package]]
6060
name = "compiler_builtins"
61-
version = "0.1.100"
61+
version = "0.1.103"
6262
source = "registry+https://github.com/rust-lang/crates.io-index"
63-
checksum = "d6c0f24437059853f0fa64afc51f338f93647a3de4cf3358ba1bb4171a199775"
63+
checksum = "a3b73c3443a5fd2438d7ba4853c64e4c8efc2404a9e28a9234cc2d5eebc6c242"
6464
dependencies = [
6565
"cc",
6666
"rustc-std-workspace-core",
@@ -158,9 +158,9 @@ dependencies = [
158158

159159
[[package]]
160160
name = "libc"
161-
version = "0.2.149"
161+
version = "0.2.150"
162162
source = "registry+https://github.com/rust-lang/crates.io-index"
163-
checksum = "a08173bc88b7955d1b3145aa561539096c421ac8debde8cbc3612ec635fee29b"
163+
checksum = "89d92a4743f9a61002fae18374ed11e7973f530cb3a3255fb354818118b2203c"
164164
dependencies = [
165165
"rustc-std-workspace-core",
166166
]
@@ -415,7 +415,6 @@ dependencies = [
415415
name = "unwind"
416416
version = "0.0.0"
417417
dependencies = [
418-
"cc",
419418
"cfg-if",
420419
"compiler_builtins",
421420
"core",
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
[toolchain]
2-
channel = "nightly-2023-10-29"
2+
channel = "nightly-2023-11-10"
33
components = ["rust-src", "rustc-dev", "llvm-tools"]

‎compiler/rustc_codegen_cranelift/scripts/test_rustc_tests.sh

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -146,6 +146,11 @@ rm tests/ui/process/nofile-limit.rs # TODO some AArch64 linking issue
146146

147147
rm tests/ui/stdio-is-blocking.rs # really slow with unoptimized libstd
148148

149+
# rustc bugs
150+
# ==========
151+
# https://github.com/rust-lang/rust/pull/116447#issuecomment-1790451463
152+
rm tests/ui/coroutine/gen_block_*.rs
153+
149154
cp ../dist/bin/rustdoc-clif ../dist/bin/rustdoc # some tests expect bin/rustdoc to exist
150155

151156
# prevent $(RUSTDOC) from picking up the sysroot built by x.py. It conflicts with the one used by

‎compiler/rustc_codegen_cranelift/src/inline_asm.rs

Lines changed: 92 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ use crate::prelude::*;
1313
enum CInlineAsmOperand<'tcx> {
1414
In {
1515
reg: InlineAsmRegOrRegClass,
16-
value: CValue<'tcx>,
16+
value: Value,
1717
},
1818
Out {
1919
reg: InlineAsmRegOrRegClass,
@@ -23,7 +23,7 @@ enum CInlineAsmOperand<'tcx> {
2323
InOut {
2424
reg: InlineAsmRegOrRegClass,
2525
_late: bool,
26-
in_value: CValue<'tcx>,
26+
in_value: Value,
2727
out_place: Option<CPlace<'tcx>>,
2828
},
2929
Const {
@@ -47,17 +47,20 @@ pub(crate) fn codegen_inline_asm<'tcx>(
4747
// Used by panic_abort on Windows, but uses a syntax which only happens to work with
4848
// asm!() by accident and breaks with the GNU assembler as well as global_asm!() for
4949
// the LLVM backend.
50-
if template[0] == InlineAsmTemplatePiece::String("int $$0x29".to_string()) {
50+
if template.len() == 1
51+
&& template[0] == InlineAsmTemplatePiece::String("int $$0x29".to_string())
52+
{
5153
fx.bcx.ins().trap(TrapCode::User(1));
5254
return;
5355
}
5456

5557
let operands = operands
5658
.into_iter()
5759
.map(|operand| match *operand {
58-
InlineAsmOperand::In { reg, ref value } => {
59-
CInlineAsmOperand::In { reg, value: crate::base::codegen_operand(fx, value) }
60-
}
60+
InlineAsmOperand::In { reg, ref value } => CInlineAsmOperand::In {
61+
reg,
62+
value: crate::base::codegen_operand(fx, value).load_scalar(fx),
63+
},
6164
InlineAsmOperand::Out { reg, late, ref place } => CInlineAsmOperand::Out {
6265
reg,
6366
late,
@@ -67,7 +70,7 @@ pub(crate) fn codegen_inline_asm<'tcx>(
6770
CInlineAsmOperand::InOut {
6871
reg,
6972
_late: late,
70-
in_value: crate::base::codegen_operand(fx, in_value),
73+
in_value: crate::base::codegen_operand(fx, in_value).load_scalar(fx),
7174
out_place: out_place.map(|place| crate::base::codegen_place(fx, place)),
7275
}
7376
}
@@ -165,15 +168,15 @@ pub(crate) fn codegen_inline_asm<'tcx>(
165168
for (i, operand) in operands.iter().enumerate() {
166169
match operand {
167170
CInlineAsmOperand::In { reg: _, value } => {
168-
inputs.push((asm_gen.stack_slots_input[i].unwrap(), value.load_scalar(fx)));
171+
inputs.push((asm_gen.stack_slots_input[i].unwrap(), *value));
169172
}
170173
CInlineAsmOperand::Out { reg: _, late: _, place } => {
171174
if let Some(place) = place {
172175
outputs.push((asm_gen.stack_slots_output[i].unwrap(), *place));
173176
}
174177
}
175178
CInlineAsmOperand::InOut { reg: _, _late: _, in_value, out_place } => {
176-
inputs.push((asm_gen.stack_slots_input[i].unwrap(), in_value.load_scalar(fx)));
179+
inputs.push((asm_gen.stack_slots_input[i].unwrap(), *in_value));
177180
if let Some(out_place) = out_place {
178181
outputs.push((asm_gen.stack_slots_output[i].unwrap(), *out_place));
179182
}
@@ -726,3 +729,83 @@ fn call_inline_asm<'tcx>(
726729
place.write_cvalue(fx, CValue::by_val(value, place.layout()));
727730
}
728731
}
732+
733+
pub(crate) fn codegen_xgetbv<'tcx>(
734+
fx: &mut FunctionCx<'_, '_, 'tcx>,
735+
xcr_no: Value,
736+
ret: CPlace<'tcx>,
737+
) {
738+
// FIXME add .eh_frame unwind info directives
739+
740+
let operands = vec![
741+
CInlineAsmOperand::In {
742+
reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::cx)),
743+
value: xcr_no,
744+
},
745+
CInlineAsmOperand::Out {
746+
reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::ax)),
747+
late: true,
748+
place: Some(ret),
749+
},
750+
CInlineAsmOperand::Out {
751+
reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::dx)),
752+
late: true,
753+
place: None,
754+
},
755+
];
756+
let options = InlineAsmOptions::NOSTACK | InlineAsmOptions::PURE | InlineAsmOptions::NOMEM;
757+
758+
let mut inputs = Vec::new();
759+
let mut outputs = Vec::new();
760+
761+
let mut asm_gen = InlineAssemblyGenerator {
762+
tcx: fx.tcx,
763+
arch: fx.tcx.sess.asm_arch.unwrap(),
764+
enclosing_def_id: fx.instance.def_id(),
765+
template: &[InlineAsmTemplatePiece::String(
766+
"
767+
xgetbv
768+
// out = rdx << 32 | rax
769+
shl rdx, 32
770+
or rax, rdx
771+
"
772+
.to_string(),
773+
)],
774+
operands: &operands,
775+
options,
776+
registers: Vec::new(),
777+
stack_slots_clobber: Vec::new(),
778+
stack_slots_input: Vec::new(),
779+
stack_slots_output: Vec::new(),
780+
stack_slot_size: Size::from_bytes(0),
781+
};
782+
asm_gen.allocate_registers();
783+
asm_gen.allocate_stack_slots();
784+
785+
let inline_asm_index = fx.cx.inline_asm_index.get();
786+
fx.cx.inline_asm_index.set(inline_asm_index + 1);
787+
let asm_name = format!(
788+
"__inline_asm_{}_n{}",
789+
fx.cx.cgu_name.as_str().replace('.', "__").replace('-', "_"),
790+
inline_asm_index
791+
);
792+
793+
let generated_asm = asm_gen.generate_asm_wrapper(&asm_name);
794+
fx.cx.global_asm.push_str(&generated_asm);
795+
796+
for (i, operand) in operands.iter().enumerate() {
797+
match operand {
798+
CInlineAsmOperand::In { reg: _, value } => {
799+
inputs.push((asm_gen.stack_slots_input[i].unwrap(), *value));
800+
}
801+
CInlineAsmOperand::Out { reg: _, late: _, place } => {
802+
if let Some(place) = place {
803+
outputs.push((asm_gen.stack_slots_output[i].unwrap(), *place));
804+
}
805+
}
806+
_ => unreachable!(),
807+
}
808+
}
809+
810+
call_inline_asm(fx, &asm_name, asm_gen.stack_slot_size, inputs, outputs);
811+
}

‎compiler/rustc_codegen_cranelift/src/intrinsics/llvm.rs

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,21 @@ pub(crate) fn codegen_llvm_intrinsic_call<'tcx>(
5151
});
5252
}
5353

54+
_ if intrinsic.starts_with("llvm.fma.v") => {
55+
intrinsic_args!(fx, args => (x,y,z); intrinsic);
56+
57+
simd_trio_for_each_lane(
58+
fx,
59+
x,
60+
y,
61+
z,
62+
ret,
63+
&|fx, _lane_ty, _res_lane_ty, lane_x, lane_y, lane_z| {
64+
fx.bcx.ins().fma(lane_x, lane_y, lane_z)
65+
},
66+
);
67+
}
68+
5469
_ => {
5570
fx.tcx
5671
.sess

‎compiler/rustc_codegen_cranelift/src/intrinsics/llvm_aarch64.rs

Lines changed: 90 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -44,15 +44,19 @@ pub(crate) fn codegen_aarch64_llvm_intrinsic_call<'tcx>(
4444
});
4545
}
4646

47-
_ if intrinsic.starts_with("llvm.aarch64.neon.sqadd.v") => {
47+
_ if intrinsic.starts_with("llvm.aarch64.neon.sqadd.v")
48+
|| intrinsic.starts_with("llvm.aarch64.neon.uqadd.v") =>
49+
{
4850
intrinsic_args!(fx, args => (x, y); intrinsic);
4951

5052
simd_pair_for_each_lane_typed(fx, x, y, ret, &|fx, x_lane, y_lane| {
5153
crate::num::codegen_saturating_int_binop(fx, BinOp::Add, x_lane, y_lane)
5254
});
5355
}
5456

55-
_ if intrinsic.starts_with("llvm.aarch64.neon.sqsub.v") => {
57+
_ if intrinsic.starts_with("llvm.aarch64.neon.sqsub.v")
58+
|| intrinsic.starts_with("llvm.aarch64.neon.uqsub.v") =>
59+
{
5660
intrinsic_args!(fx, args => (x, y); intrinsic);
5761

5862
simd_pair_for_each_lane_typed(fx, x, y, ret, &|fx, x_lane, y_lane| {
@@ -156,6 +160,90 @@ pub(crate) fn codegen_aarch64_llvm_intrinsic_call<'tcx>(
156160
});
157161
}
158162

163+
_ if intrinsic.starts_with("llvm.aarch64.neon.umaxp.v") => {
164+
intrinsic_args!(fx, args => (x, y); intrinsic);
165+
166+
simd_horizontal_pair_for_each_lane(
167+
fx,
168+
x,
169+
y,
170+
ret,
171+
&|fx, _lane_ty, _res_lane_ty, x_lane, y_lane| fx.bcx.ins().umax(x_lane, y_lane),
172+
);
173+
}
174+
175+
_ if intrinsic.starts_with("llvm.aarch64.neon.smaxp.v") => {
176+
intrinsic_args!(fx, args => (x, y); intrinsic);
177+
178+
simd_horizontal_pair_for_each_lane(
179+
fx,
180+
x,
181+
y,
182+
ret,
183+
&|fx, _lane_ty, _res_lane_ty, x_lane, y_lane| fx.bcx.ins().smax(x_lane, y_lane),
184+
);
185+
}
186+
187+
_ if intrinsic.starts_with("llvm.aarch64.neon.uminp.v") => {
188+
intrinsic_args!(fx, args => (x, y); intrinsic);
189+
190+
simd_horizontal_pair_for_each_lane(
191+
fx,
192+
x,
193+
y,
194+
ret,
195+
&|fx, _lane_ty, _res_lane_ty, x_lane, y_lane| fx.bcx.ins().umin(x_lane, y_lane),
196+
);
197+
}
198+
199+
_ if intrinsic.starts_with("llvm.aarch64.neon.sminp.v") => {
200+
intrinsic_args!(fx, args => (x, y); intrinsic);
201+
202+
simd_horizontal_pair_for_each_lane(
203+
fx,
204+
x,
205+
y,
206+
ret,
207+
&|fx, _lane_ty, _res_lane_ty, x_lane, y_lane| fx.bcx.ins().smin(x_lane, y_lane),
208+
);
209+
}
210+
211+
_ if intrinsic.starts_with("llvm.aarch64.neon.fminp.v") => {
212+
intrinsic_args!(fx, args => (x, y); intrinsic);
213+
214+
simd_horizontal_pair_for_each_lane(
215+
fx,
216+
x,
217+
y,
218+
ret,
219+
&|fx, _lane_ty, _res_lane_ty, x_lane, y_lane| fx.bcx.ins().fmin(x_lane, y_lane),
220+
);
221+
}
222+
223+
_ if intrinsic.starts_with("llvm.aarch64.neon.fmaxp.v") => {
224+
intrinsic_args!(fx, args => (x, y); intrinsic);
225+
226+
simd_horizontal_pair_for_each_lane(
227+
fx,
228+
x,
229+
y,
230+
ret,
231+
&|fx, _lane_ty, _res_lane_ty, x_lane, y_lane| fx.bcx.ins().fmax(x_lane, y_lane),
232+
);
233+
}
234+
235+
_ if intrinsic.starts_with("llvm.aarch64.neon.addp.v") => {
236+
intrinsic_args!(fx, args => (x, y); intrinsic);
237+
238+
simd_horizontal_pair_for_each_lane(
239+
fx,
240+
x,
241+
y,
242+
ret,
243+
&|fx, _lane_ty, _res_lane_ty, x_lane, y_lane| fx.bcx.ins().iadd(x_lane, y_lane),
244+
);
245+
}
246+
159247
// FIXME generalize vector types
160248
"llvm.aarch64.neon.tbl1.v16i8" => {
161249
intrinsic_args!(fx, args => (t, idx); intrinsic);
@@ -172,25 +260,6 @@ pub(crate) fn codegen_aarch64_llvm_intrinsic_call<'tcx>(
172260
}
173261
}
174262

175-
// FIXME generalize vector types
176-
"llvm.aarch64.neon.umaxp.v16i8" => {
177-
intrinsic_args!(fx, args => (a, b); intrinsic);
178-
179-
// FIXME add helper for horizontal pairwise operations
180-
for i in 0..8 {
181-
let lane1 = a.value_lane(fx, i * 2).load_scalar(fx);
182-
let lane2 = a.value_lane(fx, i * 2 + 1).load_scalar(fx);
183-
let res = fx.bcx.ins().umax(lane1, lane2);
184-
ret.place_lane(fx, i).to_ptr().store(fx, res, MemFlags::trusted());
185-
}
186-
for i in 0..8 {
187-
let lane1 = b.value_lane(fx, i * 2).load_scalar(fx);
188-
let lane2 = b.value_lane(fx, i * 2 + 1).load_scalar(fx);
189-
let res = fx.bcx.ins().umax(lane1, lane2);
190-
ret.place_lane(fx, 8 + i).to_ptr().store(fx, res, MemFlags::trusted());
191-
}
192-
}
193-
194263
/*
195264
_ if intrinsic.starts_with("llvm.aarch64.neon.sshl.v")
196265
|| intrinsic.starts_with("llvm.aarch64.neon.sqshl.v")

‎compiler/rustc_codegen_cranelift/src/intrinsics/llvm_x86.rs

Lines changed: 403 additions & 29 deletions
Large diffs are not rendered by default.

‎compiler/rustc_codegen_cranelift/src/intrinsics/mod.rs

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -132,6 +132,65 @@ fn simd_pair_for_each_lane<'tcx>(
132132
}
133133
}
134134

135+
fn simd_horizontal_pair_for_each_lane<'tcx>(
136+
fx: &mut FunctionCx<'_, '_, 'tcx>,
137+
x: CValue<'tcx>,
138+
y: CValue<'tcx>,
139+
ret: CPlace<'tcx>,
140+
f: &dyn Fn(&mut FunctionCx<'_, '_, 'tcx>, Ty<'tcx>, Ty<'tcx>, Value, Value) -> Value,
141+
) {
142+
assert_eq!(x.layout(), y.layout());
143+
let layout = x.layout();
144+
145+
let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx);
146+
let lane_layout = fx.layout_of(lane_ty);
147+
let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx);
148+
let ret_lane_layout = fx.layout_of(ret_lane_ty);
149+
assert_eq!(lane_count, ret_lane_count);
150+
151+
for lane_idx in 0..lane_count {
152+
let src = if lane_idx < (lane_count / 2) { x } else { y };
153+
let src_idx = lane_idx % (lane_count / 2);
154+
155+
let lhs_lane = src.value_lane(fx, src_idx * 2).load_scalar(fx);
156+
let rhs_lane = src.value_lane(fx, src_idx * 2 + 1).load_scalar(fx);
157+
158+
let res_lane = f(fx, lane_layout.ty, ret_lane_layout.ty, lhs_lane, rhs_lane);
159+
let res_lane = CValue::by_val(res_lane, ret_lane_layout);
160+
161+
ret.place_lane(fx, lane_idx).write_cvalue(fx, res_lane);
162+
}
163+
}
164+
165+
fn simd_trio_for_each_lane<'tcx>(
166+
fx: &mut FunctionCx<'_, '_, 'tcx>,
167+
x: CValue<'tcx>,
168+
y: CValue<'tcx>,
169+
z: CValue<'tcx>,
170+
ret: CPlace<'tcx>,
171+
f: &dyn Fn(&mut FunctionCx<'_, '_, 'tcx>, Ty<'tcx>, Ty<'tcx>, Value, Value, Value) -> Value,
172+
) {
173+
assert_eq!(x.layout(), y.layout());
174+
let layout = x.layout();
175+
176+
let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx);
177+
let lane_layout = fx.layout_of(lane_ty);
178+
let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx);
179+
let ret_lane_layout = fx.layout_of(ret_lane_ty);
180+
assert_eq!(lane_count, ret_lane_count);
181+
182+
for lane_idx in 0..lane_count {
183+
let x_lane = x.value_lane(fx, lane_idx).load_scalar(fx);
184+
let y_lane = y.value_lane(fx, lane_idx).load_scalar(fx);
185+
let z_lane = z.value_lane(fx, lane_idx).load_scalar(fx);
186+
187+
let res_lane = f(fx, lane_layout.ty, ret_lane_layout.ty, x_lane, y_lane, z_lane);
188+
let res_lane = CValue::by_val(res_lane, ret_lane_layout);
189+
190+
ret.place_lane(fx, lane_idx).write_cvalue(fx, res_lane);
191+
}
192+
}
193+
135194
fn simd_reduce<'tcx>(
136195
fx: &mut FunctionCx<'_, '_, 'tcx>,
137196
val: CValue<'tcx>,

‎compiler/rustc_codegen_cranelift/src/value_and_place.rs

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -243,6 +243,34 @@ impl<'tcx> CValue<'tcx> {
243243
let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx);
244244
let lane_layout = fx.layout_of(lane_ty);
245245
assert!(lane_idx < lane_count);
246+
247+
match self.0 {
248+
CValueInner::ByVal(_) | CValueInner::ByValPair(_, _) => unreachable!(),
249+
CValueInner::ByRef(ptr, None) => {
250+
let field_offset = lane_layout.size * lane_idx;
251+
let field_ptr = ptr.offset_i64(fx, i64::try_from(field_offset.bytes()).unwrap());
252+
CValue::by_ref(field_ptr, lane_layout)
253+
}
254+
CValueInner::ByRef(_, Some(_)) => unreachable!(),
255+
}
256+
}
257+
258+
/// Like [`CValue::value_field`] except using the passed type as lane type instead of the one
259+
/// specified by the vector type.
260+
pub(crate) fn value_typed_lane(
261+
self,
262+
fx: &mut FunctionCx<'_, '_, 'tcx>,
263+
lane_ty: Ty<'tcx>,
264+
lane_idx: u64,
265+
) -> CValue<'tcx> {
266+
let layout = self.1;
267+
assert!(layout.ty.is_simd());
268+
let (orig_lane_count, orig_lane_ty) = layout.ty.simd_size_and_type(fx.tcx);
269+
let lane_layout = fx.layout_of(lane_ty);
270+
assert!(
271+
(lane_idx + 1) * lane_layout.size <= orig_lane_count * fx.layout_of(orig_lane_ty).size
272+
);
273+
246274
match self.0 {
247275
CValueInner::ByVal(_) | CValueInner::ByValPair(_, _) => unreachable!(),
248276
CValueInner::ByRef(ptr, None) => {
@@ -734,6 +762,34 @@ impl<'tcx> CPlace<'tcx> {
734762
}
735763
}
736764

765+
/// Like [`CPlace::place_field`] except using the passed type as lane type instead of the one
766+
/// specified by the vector type.
767+
pub(crate) fn place_typed_lane(
768+
self,
769+
fx: &mut FunctionCx<'_, '_, 'tcx>,
770+
lane_ty: Ty<'tcx>,
771+
lane_idx: u64,
772+
) -> CPlace<'tcx> {
773+
let layout = self.layout();
774+
assert!(layout.ty.is_simd());
775+
let (orig_lane_count, orig_lane_ty) = layout.ty.simd_size_and_type(fx.tcx);
776+
let lane_layout = fx.layout_of(lane_ty);
777+
assert!(
778+
(lane_idx + 1) * lane_layout.size <= orig_lane_count * fx.layout_of(orig_lane_ty).size
779+
);
780+
781+
match self.inner {
782+
CPlaceInner::Var(_, _) => unreachable!(),
783+
CPlaceInner::VarPair(_, _, _) => unreachable!(),
784+
CPlaceInner::Addr(ptr, None) => {
785+
let field_offset = lane_layout.size * lane_idx;
786+
let field_ptr = ptr.offset_i64(fx, i64::try_from(field_offset.bytes()).unwrap());
787+
CPlace::for_ptr(field_ptr, lane_layout)
788+
}
789+
CPlaceInner::Addr(_, Some(_)) => unreachable!(),
790+
}
791+
}
792+
737793
pub(crate) fn place_index(
738794
self,
739795
fx: &mut FunctionCx<'_, '_, 'tcx>,

0 commit comments

Comments (0)
Please sign in to comment.