Skip to content
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.

Commit d4c86cf

Browse files
committed Nov 10, 2023
Auto merge of #117779 - bjorn3:sync_cg_clif-2023-11-10, r=bjorn3
Subtree update for rustc_codegen_cranelift Significantly improved support for simd intrinsics. r? `@ghost` `@rustbot` label +A-codegen +A-cranelift +T-compiler
2 parents 3d0e99d + d186b49 commit d4c86cf

File tree

13 files changed

+961
-67
lines changed

13 files changed

+961
-67
lines changed
 

‎compiler/rustc_codegen_cranelift/Readme.md

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -76,8 +76,6 @@ configuration options.
7676

7777
## Not yet supported
7878

79-
* Inline assembly ([no cranelift support](https://github.com/bytecodealliance/wasmtime/issues/1041))
80-
* On UNIX there is support for invoking an external assembler for `global_asm!` and `asm!`.
8179
* SIMD ([tracked here](https://github.com/rust-lang/rustc_codegen_cranelift/issues/171), `std::simd` fully works, `std::arch` is partially supported)
8280
* Unwinding on panics ([no cranelift support](https://github.com/bytecodealliance/wasmtime/issues/1677), `-Cpanic=abort` is enabled by default)
8381

‎compiler/rustc_codegen_cranelift/build_system/tests.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,7 @@ const BASE_SYSROOT_SUITE: &[TestCase] = &[
9999
TestCase::build_bin_and_run("aot.mod_bench", "example/mod_bench.rs", &[]),
100100
TestCase::build_bin_and_run("aot.issue-72793", "example/issue-72793.rs", &[]),
101101
TestCase::build_bin("aot.issue-59326", "example/issue-59326.rs"),
102+
TestCase::build_bin_and_run("aot.neon", "example/neon.rs", &[]),
102103
];
103104

104105
pub(crate) static RAND_REPO: GitRepo = GitRepo::github(

‎compiler/rustc_codegen_cranelift/config.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ aot.float-minmax-pass
4242
aot.mod_bench
4343
aot.issue-72793
4444
aot.issue-59326
45+
aot.neon
4546

4647
testsuite.extended_sysroot
4748
test.rust-random/rand
Lines changed: 234 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,234 @@
1+
// Most of these tests are copied from https://github.com/japaric/stdsimd/blob/0f4413d01c4f0c3ffbc5a69e9a37fbc7235b31a9/coresimd/arm/neon.rs
2+
3+
#![feature(portable_simd)]
4+
5+
#[cfg(target_arch = "aarch64")]
6+
use std::arch::aarch64::*;
7+
use std::mem::transmute;
8+
use std::simd::*;
9+
10+
#[cfg(target_arch = "aarch64")]
11+
unsafe fn test_vpmin_s8() {
12+
let a = i8x8::from([1, -2, 3, -4, 5, 6, 7, 8]);
13+
let b = i8x8::from([0, 3, 2, 5, 4, 7, 6, 9]);
14+
let e = i8x8::from([-2, -4, 5, 7, 0, 2, 4, 6]);
15+
let r: i8x8 = transmute(vpmin_s8(transmute(a), transmute(b)));
16+
assert_eq!(r, e);
17+
}
18+
19+
#[cfg(target_arch = "aarch64")]
20+
unsafe fn test_vpmin_s16() {
21+
let a = i16x4::from([1, 2, 3, -4]);
22+
let b = i16x4::from([0, 3, 2, 5]);
23+
let e = i16x4::from([1, -4, 0, 2]);
24+
let r: i16x4 = transmute(vpmin_s16(transmute(a), transmute(b)));
25+
assert_eq!(r, e);
26+
}
27+
28+
#[cfg(target_arch = "aarch64")]
29+
unsafe fn test_vpmin_s32() {
30+
let a = i32x2::from([1, -2]);
31+
let b = i32x2::from([0, 3]);
32+
let e = i32x2::from([-2, 0]);
33+
let r: i32x2 = transmute(vpmin_s32(transmute(a), transmute(b)));
34+
assert_eq!(r, e);
35+
}
36+
37+
#[cfg(target_arch = "aarch64")]
38+
unsafe fn test_vpmin_u8() {
39+
let a = u8x8::from([1, 2, 3, 4, 5, 6, 7, 8]);
40+
let b = u8x8::from([0, 3, 2, 5, 4, 7, 6, 9]);
41+
let e = u8x8::from([1, 3, 5, 7, 0, 2, 4, 6]);
42+
let r: u8x8 = transmute(vpmin_u8(transmute(a), transmute(b)));
43+
assert_eq!(r, e);
44+
}
45+
46+
#[cfg(target_arch = "aarch64")]
47+
unsafe fn test_vpmin_u16() {
48+
let a = u16x4::from([1, 2, 3, 4]);
49+
let b = u16x4::from([0, 3, 2, 5]);
50+
let e = u16x4::from([1, 3, 0, 2]);
51+
let r: u16x4 = transmute(vpmin_u16(transmute(a), transmute(b)));
52+
assert_eq!(r, e);
53+
}
54+
55+
#[cfg(target_arch = "aarch64")]
56+
unsafe fn test_vpmin_u32() {
57+
let a = u32x2::from([1, 2]);
58+
let b = u32x2::from([0, 3]);
59+
let e = u32x2::from([1, 0]);
60+
let r: u32x2 = transmute(vpmin_u32(transmute(a), transmute(b)));
61+
assert_eq!(r, e);
62+
}
63+
64+
#[cfg(target_arch = "aarch64")]
65+
unsafe fn test_vpmin_f32() {
66+
let a = f32x2::from([1., -2.]);
67+
let b = f32x2::from([0., 3.]);
68+
let e = f32x2::from([-2., 0.]);
69+
let r: f32x2 = transmute(vpmin_f32(transmute(a), transmute(b)));
70+
assert_eq!(r, e);
71+
}
72+
73+
#[cfg(target_arch = "aarch64")]
74+
unsafe fn test_vpmax_s8() {
75+
let a = i8x8::from([1, -2, 3, -4, 5, 6, 7, 8]);
76+
let b = i8x8::from([0, 3, 2, 5, 4, 7, 6, 9]);
77+
let e = i8x8::from([1, 3, 6, 8, 3, 5, 7, 9]);
78+
let r: i8x8 = transmute(vpmax_s8(transmute(a), transmute(b)));
79+
assert_eq!(r, e);
80+
}
81+
82+
#[cfg(target_arch = "aarch64")]
83+
unsafe fn test_vpmax_s16() {
84+
let a = i16x4::from([1, 2, 3, -4]);
85+
let b = i16x4::from([0, 3, 2, 5]);
86+
let e = i16x4::from([2, 3, 3, 5]);
87+
let r: i16x4 = transmute(vpmax_s16(transmute(a), transmute(b)));
88+
assert_eq!(r, e);
89+
}
90+
91+
#[cfg(target_arch = "aarch64")]
92+
unsafe fn test_vpmax_s32() {
93+
let a = i32x2::from([1, -2]);
94+
let b = i32x2::from([0, 3]);
95+
let e = i32x2::from([1, 3]);
96+
let r: i32x2 = transmute(vpmax_s32(transmute(a), transmute(b)));
97+
assert_eq!(r, e);
98+
}
99+
100+
#[cfg(target_arch = "aarch64")]
101+
unsafe fn test_vpmax_u8() {
102+
let a = u8x8::from([1, 2, 3, 4, 5, 6, 7, 8]);
103+
let b = u8x8::from([0, 3, 2, 5, 4, 7, 6, 9]);
104+
let e = u8x8::from([2, 4, 6, 8, 3, 5, 7, 9]);
105+
let r: u8x8 = transmute(vpmax_u8(transmute(a), transmute(b)));
106+
assert_eq!(r, e);
107+
}
108+
109+
#[cfg(target_arch = "aarch64")]
110+
unsafe fn test_vpmax_u16() {
111+
let a = u16x4::from([1, 2, 3, 4]);
112+
let b = u16x4::from([0, 3, 2, 5]);
113+
let e = u16x4::from([2, 4, 3, 5]);
114+
let r: u16x4 = transmute(vpmax_u16(transmute(a), transmute(b)));
115+
assert_eq!(r, e);
116+
}
117+
118+
#[cfg(target_arch = "aarch64")]
119+
unsafe fn test_vpmax_u32() {
120+
let a = u32x2::from([1, 2]);
121+
let b = u32x2::from([0, 3]);
122+
let e = u32x2::from([2, 3]);
123+
let r: u32x2 = transmute(vpmax_u32(transmute(a), transmute(b)));
124+
assert_eq!(r, e);
125+
}
126+
127+
#[cfg(target_arch = "aarch64")]
128+
unsafe fn test_vpmax_f32() {
129+
let a = f32x2::from([1., -2.]);
130+
let b = f32x2::from([0., 3.]);
131+
let e = f32x2::from([1., 3.]);
132+
let r: f32x2 = transmute(vpmax_f32(transmute(a), transmute(b)));
133+
assert_eq!(r, e);
134+
}
135+
136+
#[cfg(target_arch = "aarch64")]
137+
unsafe fn test_vpadd_s16() {
138+
let a = i16x4::from([1, 2, 3, 4]);
139+
let b = i16x4::from([0, -1, -2, -3]);
140+
let r: i16x4 = transmute(vpadd_s16(transmute(a), transmute(b)));
141+
let e = i16x4::from([3, 7, -1, -5]);
142+
assert_eq!(r, e);
143+
}
144+
#[cfg(target_arch = "aarch64")]
145+
unsafe fn test_vpadd_s32() {
146+
let a = i32x2::from([1, 2]);
147+
let b = i32x2::from([0, -1]);
148+
let r: i32x2 = transmute(vpadd_s32(transmute(a), transmute(b)));
149+
let e = i32x2::from([3, -1]);
150+
assert_eq!(r, e);
151+
}
152+
#[cfg(target_arch = "aarch64")]
153+
unsafe fn test_vpadd_s8() {
154+
let a = i8x8::from([1, 2, 3, 4, 5, 6, 7, 8]);
155+
let b = i8x8::from([0, -1, -2, -3, -4, -5, -6, -7]);
156+
let r: i8x8 = transmute(vpadd_s8(transmute(a), transmute(b)));
157+
let e = i8x8::from([3, 7, 11, 15, -1, -5, -9, -13]);
158+
assert_eq!(r, e);
159+
}
160+
#[cfg(target_arch = "aarch64")]
161+
unsafe fn test_vpadd_u16() {
162+
let a = u16x4::from([1, 2, 3, 4]);
163+
let b = u16x4::from([30, 31, 32, 33]);
164+
let r: u16x4 = transmute(vpadd_u16(transmute(a), transmute(b)));
165+
let e = u16x4::from([3, 7, 61, 65]);
166+
assert_eq!(r, e);
167+
}
168+
#[cfg(target_arch = "aarch64")]
169+
unsafe fn test_vpadd_u32() {
170+
let a = u32x2::from([1, 2]);
171+
let b = u32x2::from([30, 31]);
172+
let r: u32x2 = transmute(vpadd_u32(transmute(a), transmute(b)));
173+
let e = u32x2::from([3, 61]);
174+
assert_eq!(r, e);
175+
}
176+
#[cfg(target_arch = "aarch64")]
177+
unsafe fn test_vpadd_u8() {
178+
let a = u8x8::from([1, 2, 3, 4, 5, 6, 7, 8]);
179+
let b = u8x8::from([30, 31, 32, 33, 34, 35, 36, 37]);
180+
let r: u8x8 = transmute(vpadd_u8(transmute(a), transmute(b)));
181+
let e = u8x8::from([3, 7, 11, 15, 61, 65, 69, 73]);
182+
assert_eq!(r, e);
183+
}
184+
185+
#[cfg(target_arch = "aarch64")]
186+
unsafe fn test_vqsub_u8() {
187+
let a = u8x8::from([1, 2, 3, 4, 5, 6, 7, 0xff]);
188+
let b = u8x8::from([30, 1, 1, 1, 34, 0xff, 36, 37]);
189+
let r: u8x8 = transmute(vqsub_u8(transmute(a), transmute(b)));
190+
let e = u8x8::from([0, 1, 2, 3, 0, 0, 0, 218]);
191+
assert_eq!(r, e);
192+
}
193+
194+
#[cfg(target_arch = "aarch64")]
195+
unsafe fn test_vqadd_u8() {
196+
let a = u8x8::from([1, 2, 3, 4, 5, 6, 7, 0xff]);
197+
let b = u8x8::from([30, 1, 1, 1, 34, 0xff, 36, 37]);
198+
let r: u8x8 = transmute(vqadd_u8(transmute(a), transmute(b)));
199+
let e = u8x8::from([31, 3, 4, 5, 39, 0xff, 43, 0xff]);
200+
assert_eq!(r, e);
201+
}
202+
203+
#[cfg(target_arch = "aarch64")]
204+
fn main() {
205+
unsafe {
206+
test_vpmin_s8();
207+
test_vpmin_s16();
208+
test_vpmin_s32();
209+
test_vpmin_u8();
210+
test_vpmin_u16();
211+
test_vpmin_u32();
212+
test_vpmin_f32();
213+
test_vpmax_s8();
214+
test_vpmax_s16();
215+
test_vpmax_s32();
216+
test_vpmax_u8();
217+
test_vpmax_u16();
218+
test_vpmax_u32();
219+
test_vpmax_f32();
220+
221+
test_vpadd_s16();
222+
test_vpadd_s32();
223+
test_vpadd_s8();
224+
test_vpadd_u16();
225+
test_vpadd_u32();
226+
test_vpadd_u8();
227+
228+
test_vqsub_u8();
229+
test_vqadd_u8();
230+
}
231+
}
232+
233+
#[cfg(not(target_arch = "aarch64"))]
234+
fn main() {}

‎compiler/rustc_codegen_cranelift/patches/stdlib-lock.toml

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -58,9 +58,9 @@ dependencies = [
5858

5959
[[package]]
6060
name = "compiler_builtins"
61-
version = "0.1.100"
61+
version = "0.1.103"
6262
source = "registry+https://github.com/rust-lang/crates.io-index"
63-
checksum = "d6c0f24437059853f0fa64afc51f338f93647a3de4cf3358ba1bb4171a199775"
63+
checksum = "a3b73c3443a5fd2438d7ba4853c64e4c8efc2404a9e28a9234cc2d5eebc6c242"
6464
dependencies = [
6565
"cc",
6666
"rustc-std-workspace-core",
@@ -158,9 +158,9 @@ dependencies = [
158158

159159
[[package]]
160160
name = "libc"
161-
version = "0.2.149"
161+
version = "0.2.150"
162162
source = "registry+https://github.com/rust-lang/crates.io-index"
163-
checksum = "a08173bc88b7955d1b3145aa561539096c421ac8debde8cbc3612ec635fee29b"
163+
checksum = "89d92a4743f9a61002fae18374ed11e7973f530cb3a3255fb354818118b2203c"
164164
dependencies = [
165165
"rustc-std-workspace-core",
166166
]
@@ -415,7 +415,6 @@ dependencies = [
415415
name = "unwind"
416416
version = "0.0.0"
417417
dependencies = [
418-
"cc",
419418
"cfg-if",
420419
"compiler_builtins",
421420
"core",
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
[toolchain]
2-
channel = "nightly-2023-10-29"
2+
channel = "nightly-2023-11-10"
33
components = ["rust-src", "rustc-dev", "llvm-tools"]

‎compiler/rustc_codegen_cranelift/scripts/test_rustc_tests.sh

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -146,6 +146,11 @@ rm tests/ui/process/nofile-limit.rs # TODO some AArch64 linking issue
146146

147147
rm tests/ui/stdio-is-blocking.rs # really slow with unoptimized libstd
148148

149+
# rustc bugs
150+
# ==========
151+
# https://github.com/rust-lang/rust/pull/116447#issuecomment-1790451463
152+
rm tests/ui/coroutine/gen_block_*.rs
153+
149154
cp ../dist/bin/rustdoc-clif ../dist/bin/rustdoc # some tests expect bin/rustdoc to exist
150155

151156
# prevent $(RUSTDOC) from picking up the sysroot built by x.py. It conflicts with the one used by

‎compiler/rustc_codegen_cranelift/src/inline_asm.rs

Lines changed: 92 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ use crate::prelude::*;
1313
enum CInlineAsmOperand<'tcx> {
1414
In {
1515
reg: InlineAsmRegOrRegClass,
16-
value: CValue<'tcx>,
16+
value: Value,
1717
},
1818
Out {
1919
reg: InlineAsmRegOrRegClass,
@@ -23,7 +23,7 @@ enum CInlineAsmOperand<'tcx> {
2323
InOut {
2424
reg: InlineAsmRegOrRegClass,
2525
_late: bool,
26-
in_value: CValue<'tcx>,
26+
in_value: Value,
2727
out_place: Option<CPlace<'tcx>>,
2828
},
2929
Const {
@@ -47,17 +47,20 @@ pub(crate) fn codegen_inline_asm<'tcx>(
4747
// Used by panic_abort on Windows, but uses a syntax which only happens to work with
4848
// asm!() by accident and breaks with the GNU assembler as well as global_asm!() for
4949
// the LLVM backend.
50-
if template[0] == InlineAsmTemplatePiece::String("int $$0x29".to_string()) {
50+
if template.len() == 1
51+
&& template[0] == InlineAsmTemplatePiece::String("int $$0x29".to_string())
52+
{
5153
fx.bcx.ins().trap(TrapCode::User(1));
5254
return;
5355
}
5456

5557
let operands = operands
5658
.into_iter()
5759
.map(|operand| match *operand {
58-
InlineAsmOperand::In { reg, ref value } => {
59-
CInlineAsmOperand::In { reg, value: crate::base::codegen_operand(fx, value) }
60-
}
60+
InlineAsmOperand::In { reg, ref value } => CInlineAsmOperand::In {
61+
reg,
62+
value: crate::base::codegen_operand(fx, value).load_scalar(fx),
63+
},
6164
InlineAsmOperand::Out { reg, late, ref place } => CInlineAsmOperand::Out {
6265
reg,
6366
late,
@@ -67,7 +70,7 @@ pub(crate) fn codegen_inline_asm<'tcx>(
6770
CInlineAsmOperand::InOut {
6871
reg,
6972
_late: late,
70-
in_value: crate::base::codegen_operand(fx, in_value),
73+
in_value: crate::base::codegen_operand(fx, in_value).load_scalar(fx),
7174
out_place: out_place.map(|place| crate::base::codegen_place(fx, place)),
7275
}
7376
}
@@ -165,15 +168,15 @@ pub(crate) fn codegen_inline_asm<'tcx>(
165168
for (i, operand) in operands.iter().enumerate() {
166169
match operand {
167170
CInlineAsmOperand::In { reg: _, value } => {
168-
inputs.push((asm_gen.stack_slots_input[i].unwrap(), value.load_scalar(fx)));
171+
inputs.push((asm_gen.stack_slots_input[i].unwrap(), *value));
169172
}
170173
CInlineAsmOperand::Out { reg: _, late: _, place } => {
171174
if let Some(place) = place {
172175
outputs.push((asm_gen.stack_slots_output[i].unwrap(), *place));
173176
}
174177
}
175178
CInlineAsmOperand::InOut { reg: _, _late: _, in_value, out_place } => {
176-
inputs.push((asm_gen.stack_slots_input[i].unwrap(), in_value.load_scalar(fx)));
179+
inputs.push((asm_gen.stack_slots_input[i].unwrap(), *in_value));
177180
if let Some(out_place) = out_place {
178181
outputs.push((asm_gen.stack_slots_output[i].unwrap(), *out_place));
179182
}
@@ -726,3 +729,83 @@ fn call_inline_asm<'tcx>(
726729
place.write_cvalue(fx, CValue::by_val(value, place.layout()));
727730
}
728731
}
732+
733+
pub(crate) fn codegen_xgetbv<'tcx>(
734+
fx: &mut FunctionCx<'_, '_, 'tcx>,
735+
xcr_no: Value,
736+
ret: CPlace<'tcx>,
737+
) {
738+
// FIXME add .eh_frame unwind info directives
739+
740+
let operands = vec![
741+
CInlineAsmOperand::In {
742+
reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::cx)),
743+
value: xcr_no,
744+
},
745+
CInlineAsmOperand::Out {
746+
reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::ax)),
747+
late: true,
748+
place: Some(ret),
749+
},
750+
CInlineAsmOperand::Out {
751+
reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::dx)),
752+
late: true,
753+
place: None,
754+
},
755+
];
756+
let options = InlineAsmOptions::NOSTACK | InlineAsmOptions::PURE | InlineAsmOptions::NOMEM;
757+
758+
let mut inputs = Vec::new();
759+
let mut outputs = Vec::new();
760+
761+
let mut asm_gen = InlineAssemblyGenerator {
762+
tcx: fx.tcx,
763+
arch: fx.tcx.sess.asm_arch.unwrap(),
764+
enclosing_def_id: fx.instance.def_id(),
765+
template: &[InlineAsmTemplatePiece::String(
766+
"
767+
xgetbv
768+
// out = rdx << 32 | rax
769+
shl rdx, 32
770+
or rax, rdx
771+
"
772+
.to_string(),
773+
)],
774+
operands: &operands,
775+
options,
776+
registers: Vec::new(),
777+
stack_slots_clobber: Vec::new(),
778+
stack_slots_input: Vec::new(),
779+
stack_slots_output: Vec::new(),
780+
stack_slot_size: Size::from_bytes(0),
781+
};
782+
asm_gen.allocate_registers();
783+
asm_gen.allocate_stack_slots();
784+
785+
let inline_asm_index = fx.cx.inline_asm_index.get();
786+
fx.cx.inline_asm_index.set(inline_asm_index + 1);
787+
let asm_name = format!(
788+
"__inline_asm_{}_n{}",
789+
fx.cx.cgu_name.as_str().replace('.', "__").replace('-', "_"),
790+
inline_asm_index
791+
);
792+
793+
let generated_asm = asm_gen.generate_asm_wrapper(&asm_name);
794+
fx.cx.global_asm.push_str(&generated_asm);
795+
796+
for (i, operand) in operands.iter().enumerate() {
797+
match operand {
798+
CInlineAsmOperand::In { reg: _, value } => {
799+
inputs.push((asm_gen.stack_slots_input[i].unwrap(), *value));
800+
}
801+
CInlineAsmOperand::Out { reg: _, late: _, place } => {
802+
if let Some(place) = place {
803+
outputs.push((asm_gen.stack_slots_output[i].unwrap(), *place));
804+
}
805+
}
806+
_ => unreachable!(),
807+
}
808+
}
809+
810+
call_inline_asm(fx, &asm_name, asm_gen.stack_slot_size, inputs, outputs);
811+
}

‎compiler/rustc_codegen_cranelift/src/intrinsics/llvm.rs

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,21 @@ pub(crate) fn codegen_llvm_intrinsic_call<'tcx>(
5151
});
5252
}
5353

54+
_ if intrinsic.starts_with("llvm.fma.v") => {
55+
intrinsic_args!(fx, args => (x,y,z); intrinsic);
56+
57+
simd_trio_for_each_lane(
58+
fx,
59+
x,
60+
y,
61+
z,
62+
ret,
63+
&|fx, _lane_ty, _res_lane_ty, lane_x, lane_y, lane_z| {
64+
fx.bcx.ins().fma(lane_x, lane_y, lane_z)
65+
},
66+
);
67+
}
68+
5469
_ => {
5570
fx.tcx
5671
.sess

‎compiler/rustc_codegen_cranelift/src/intrinsics/llvm_aarch64.rs

Lines changed: 90 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -44,15 +44,19 @@ pub(crate) fn codegen_aarch64_llvm_intrinsic_call<'tcx>(
4444
});
4545
}
4646

47-
_ if intrinsic.starts_with("llvm.aarch64.neon.sqadd.v") => {
47+
_ if intrinsic.starts_with("llvm.aarch64.neon.sqadd.v")
48+
|| intrinsic.starts_with("llvm.aarch64.neon.uqadd.v") =>
49+
{
4850
intrinsic_args!(fx, args => (x, y); intrinsic);
4951

5052
simd_pair_for_each_lane_typed(fx, x, y, ret, &|fx, x_lane, y_lane| {
5153
crate::num::codegen_saturating_int_binop(fx, BinOp::Add, x_lane, y_lane)
5254
});
5355
}
5456

55-
_ if intrinsic.starts_with("llvm.aarch64.neon.sqsub.v") => {
57+
_ if intrinsic.starts_with("llvm.aarch64.neon.sqsub.v")
58+
|| intrinsic.starts_with("llvm.aarch64.neon.uqsub.v") =>
59+
{
5660
intrinsic_args!(fx, args => (x, y); intrinsic);
5761

5862
simd_pair_for_each_lane_typed(fx, x, y, ret, &|fx, x_lane, y_lane| {
@@ -156,6 +160,90 @@ pub(crate) fn codegen_aarch64_llvm_intrinsic_call<'tcx>(
156160
});
157161
}
158162

163+
_ if intrinsic.starts_with("llvm.aarch64.neon.umaxp.v") => {
164+
intrinsic_args!(fx, args => (x, y); intrinsic);
165+
166+
simd_horizontal_pair_for_each_lane(
167+
fx,
168+
x,
169+
y,
170+
ret,
171+
&|fx, _lane_ty, _res_lane_ty, x_lane, y_lane| fx.bcx.ins().umax(x_lane, y_lane),
172+
);
173+
}
174+
175+
_ if intrinsic.starts_with("llvm.aarch64.neon.smaxp.v") => {
176+
intrinsic_args!(fx, args => (x, y); intrinsic);
177+
178+
simd_horizontal_pair_for_each_lane(
179+
fx,
180+
x,
181+
y,
182+
ret,
183+
&|fx, _lane_ty, _res_lane_ty, x_lane, y_lane| fx.bcx.ins().smax(x_lane, y_lane),
184+
);
185+
}
186+
187+
_ if intrinsic.starts_with("llvm.aarch64.neon.uminp.v") => {
188+
intrinsic_args!(fx, args => (x, y); intrinsic);
189+
190+
simd_horizontal_pair_for_each_lane(
191+
fx,
192+
x,
193+
y,
194+
ret,
195+
&|fx, _lane_ty, _res_lane_ty, x_lane, y_lane| fx.bcx.ins().umin(x_lane, y_lane),
196+
);
197+
}
198+
199+
_ if intrinsic.starts_with("llvm.aarch64.neon.sminp.v") => {
200+
intrinsic_args!(fx, args => (x, y); intrinsic);
201+
202+
simd_horizontal_pair_for_each_lane(
203+
fx,
204+
x,
205+
y,
206+
ret,
207+
&|fx, _lane_ty, _res_lane_ty, x_lane, y_lane| fx.bcx.ins().smin(x_lane, y_lane),
208+
);
209+
}
210+
211+
_ if intrinsic.starts_with("llvm.aarch64.neon.fminp.v") => {
212+
intrinsic_args!(fx, args => (x, y); intrinsic);
213+
214+
simd_horizontal_pair_for_each_lane(
215+
fx,
216+
x,
217+
y,
218+
ret,
219+
&|fx, _lane_ty, _res_lane_ty, x_lane, y_lane| fx.bcx.ins().fmin(x_lane, y_lane),
220+
);
221+
}
222+
223+
_ if intrinsic.starts_with("llvm.aarch64.neon.fmaxp.v") => {
224+
intrinsic_args!(fx, args => (x, y); intrinsic);
225+
226+
simd_horizontal_pair_for_each_lane(
227+
fx,
228+
x,
229+
y,
230+
ret,
231+
&|fx, _lane_ty, _res_lane_ty, x_lane, y_lane| fx.bcx.ins().fmax(x_lane, y_lane),
232+
);
233+
}
234+
235+
_ if intrinsic.starts_with("llvm.aarch64.neon.addp.v") => {
236+
intrinsic_args!(fx, args => (x, y); intrinsic);
237+
238+
simd_horizontal_pair_for_each_lane(
239+
fx,
240+
x,
241+
y,
242+
ret,
243+
&|fx, _lane_ty, _res_lane_ty, x_lane, y_lane| fx.bcx.ins().iadd(x_lane, y_lane),
244+
);
245+
}
246+
159247
// FIXME generalize vector types
160248
"llvm.aarch64.neon.tbl1.v16i8" => {
161249
intrinsic_args!(fx, args => (t, idx); intrinsic);
@@ -172,25 +260,6 @@ pub(crate) fn codegen_aarch64_llvm_intrinsic_call<'tcx>(
172260
}
173261
}
174262

175-
// FIXME generalize vector types
176-
"llvm.aarch64.neon.umaxp.v16i8" => {
177-
intrinsic_args!(fx, args => (a, b); intrinsic);
178-
179-
// FIXME add helper for horizontal pairwise operations
180-
for i in 0..8 {
181-
let lane1 = a.value_lane(fx, i * 2).load_scalar(fx);
182-
let lane2 = a.value_lane(fx, i * 2 + 1).load_scalar(fx);
183-
let res = fx.bcx.ins().umax(lane1, lane2);
184-
ret.place_lane(fx, i).to_ptr().store(fx, res, MemFlags::trusted());
185-
}
186-
for i in 0..8 {
187-
let lane1 = b.value_lane(fx, i * 2).load_scalar(fx);
188-
let lane2 = b.value_lane(fx, i * 2 + 1).load_scalar(fx);
189-
let res = fx.bcx.ins().umax(lane1, lane2);
190-
ret.place_lane(fx, 8 + i).to_ptr().store(fx, res, MemFlags::trusted());
191-
}
192-
}
193-
194263
/*
195264
_ if intrinsic.starts_with("llvm.aarch64.neon.sshl.v")
196265
|| intrinsic.starts_with("llvm.aarch64.neon.sqshl.v")

‎compiler/rustc_codegen_cranelift/src/intrinsics/llvm_x86.rs

Lines changed: 403 additions & 29 deletions
Large diffs are not rendered by default.

‎compiler/rustc_codegen_cranelift/src/intrinsics/mod.rs

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -132,6 +132,65 @@ fn simd_pair_for_each_lane<'tcx>(
132132
}
133133
}
134134

135+
fn simd_horizontal_pair_for_each_lane<'tcx>(
136+
fx: &mut FunctionCx<'_, '_, 'tcx>,
137+
x: CValue<'tcx>,
138+
y: CValue<'tcx>,
139+
ret: CPlace<'tcx>,
140+
f: &dyn Fn(&mut FunctionCx<'_, '_, 'tcx>, Ty<'tcx>, Ty<'tcx>, Value, Value) -> Value,
141+
) {
142+
assert_eq!(x.layout(), y.layout());
143+
let layout = x.layout();
144+
145+
let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx);
146+
let lane_layout = fx.layout_of(lane_ty);
147+
let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx);
148+
let ret_lane_layout = fx.layout_of(ret_lane_ty);
149+
assert_eq!(lane_count, ret_lane_count);
150+
151+
for lane_idx in 0..lane_count {
152+
let src = if lane_idx < (lane_count / 2) { x } else { y };
153+
let src_idx = lane_idx % (lane_count / 2);
154+
155+
let lhs_lane = src.value_lane(fx, src_idx * 2).load_scalar(fx);
156+
let rhs_lane = src.value_lane(fx, src_idx * 2 + 1).load_scalar(fx);
157+
158+
let res_lane = f(fx, lane_layout.ty, ret_lane_layout.ty, lhs_lane, rhs_lane);
159+
let res_lane = CValue::by_val(res_lane, ret_lane_layout);
160+
161+
ret.place_lane(fx, lane_idx).write_cvalue(fx, res_lane);
162+
}
163+
}
164+
165+
fn simd_trio_for_each_lane<'tcx>(
166+
fx: &mut FunctionCx<'_, '_, 'tcx>,
167+
x: CValue<'tcx>,
168+
y: CValue<'tcx>,
169+
z: CValue<'tcx>,
170+
ret: CPlace<'tcx>,
171+
f: &dyn Fn(&mut FunctionCx<'_, '_, 'tcx>, Ty<'tcx>, Ty<'tcx>, Value, Value, Value) -> Value,
172+
) {
173+
assert_eq!(x.layout(), y.layout());
174+
let layout = x.layout();
175+
176+
let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx);
177+
let lane_layout = fx.layout_of(lane_ty);
178+
let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx);
179+
let ret_lane_layout = fx.layout_of(ret_lane_ty);
180+
assert_eq!(lane_count, ret_lane_count);
181+
182+
for lane_idx in 0..lane_count {
183+
let x_lane = x.value_lane(fx, lane_idx).load_scalar(fx);
184+
let y_lane = y.value_lane(fx, lane_idx).load_scalar(fx);
185+
let z_lane = z.value_lane(fx, lane_idx).load_scalar(fx);
186+
187+
let res_lane = f(fx, lane_layout.ty, ret_lane_layout.ty, x_lane, y_lane, z_lane);
188+
let res_lane = CValue::by_val(res_lane, ret_lane_layout);
189+
190+
ret.place_lane(fx, lane_idx).write_cvalue(fx, res_lane);
191+
}
192+
}
193+
135194
fn simd_reduce<'tcx>(
136195
fx: &mut FunctionCx<'_, '_, 'tcx>,
137196
val: CValue<'tcx>,

‎compiler/rustc_codegen_cranelift/src/value_and_place.rs

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -243,6 +243,34 @@ impl<'tcx> CValue<'tcx> {
243243
let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx);
244244
let lane_layout = fx.layout_of(lane_ty);
245245
assert!(lane_idx < lane_count);
246+
247+
match self.0 {
248+
CValueInner::ByVal(_) | CValueInner::ByValPair(_, _) => unreachable!(),
249+
CValueInner::ByRef(ptr, None) => {
250+
let field_offset = lane_layout.size * lane_idx;
251+
let field_ptr = ptr.offset_i64(fx, i64::try_from(field_offset.bytes()).unwrap());
252+
CValue::by_ref(field_ptr, lane_layout)
253+
}
254+
CValueInner::ByRef(_, Some(_)) => unreachable!(),
255+
}
256+
}
257+
258+
/// Like [`CValue::value_field`] except using the passed type as lane type instead of the one
259+
/// specified by the vector type.
260+
pub(crate) fn value_typed_lane(
261+
self,
262+
fx: &mut FunctionCx<'_, '_, 'tcx>,
263+
lane_ty: Ty<'tcx>,
264+
lane_idx: u64,
265+
) -> CValue<'tcx> {
266+
let layout = self.1;
267+
assert!(layout.ty.is_simd());
268+
let (orig_lane_count, orig_lane_ty) = layout.ty.simd_size_and_type(fx.tcx);
269+
let lane_layout = fx.layout_of(lane_ty);
270+
assert!(
271+
(lane_idx + 1) * lane_layout.size <= orig_lane_count * fx.layout_of(orig_lane_ty).size
272+
);
273+
246274
match self.0 {
247275
CValueInner::ByVal(_) | CValueInner::ByValPair(_, _) => unreachable!(),
248276
CValueInner::ByRef(ptr, None) => {
@@ -734,6 +762,34 @@ impl<'tcx> CPlace<'tcx> {
734762
}
735763
}
736764

765+
/// Like [`CPlace::place_field`] except using the passed type as lane type instead of the one
766+
/// specified by the vector type.
767+
pub(crate) fn place_typed_lane(
768+
self,
769+
fx: &mut FunctionCx<'_, '_, 'tcx>,
770+
lane_ty: Ty<'tcx>,
771+
lane_idx: u64,
772+
) -> CPlace<'tcx> {
773+
let layout = self.layout();
774+
assert!(layout.ty.is_simd());
775+
let (orig_lane_count, orig_lane_ty) = layout.ty.simd_size_and_type(fx.tcx);
776+
let lane_layout = fx.layout_of(lane_ty);
777+
assert!(
778+
(lane_idx + 1) * lane_layout.size <= orig_lane_count * fx.layout_of(orig_lane_ty).size
779+
);
780+
781+
match self.inner {
782+
CPlaceInner::Var(_, _) => unreachable!(),
783+
CPlaceInner::VarPair(_, _, _) => unreachable!(),
784+
CPlaceInner::Addr(ptr, None) => {
785+
let field_offset = lane_layout.size * lane_idx;
786+
let field_ptr = ptr.offset_i64(fx, i64::try_from(field_offset.bytes()).unwrap());
787+
CPlace::for_ptr(field_ptr, lane_layout)
788+
}
789+
CPlaceInner::Addr(_, Some(_)) => unreachable!(),
790+
}
791+
}
792+
737793
pub(crate) fn place_index(
738794
self,
739795
fx: &mut FunctionCx<'_, '_, 'tcx>,

0 commit comments

Comments (0)
Please sign in to comment.