diff --git a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp index 73343e1c80f33..db36e68c8feec 100644 --- a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp +++ b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp @@ -15,6 +15,7 @@ #include "Utils/AMDGPUBaseInfo.h" #include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineOperand.h" #define DEBUG_TYPE "si-shrink-instructions" @@ -44,6 +45,7 @@ class SIShrinkInstructions { void shrinkMIMG(MachineInstr &MI) const; void shrinkMadFma(MachineInstr &MI) const; bool shrinkScalarLogicOp(MachineInstr &MI) const; + bool shrinkToBitset(MachineInstr &MI) const; bool tryReplaceDeadSDST(MachineInstr &MI) const; bool instAccessReg(iterator_range<MachineInstr::const_mop_iterator> &&R, Register Reg, unsigned SubReg) const; @@ -577,8 +579,7 @@ bool SIShrinkInstructions::shrinkScalarLogicOp(MachineInstr &MI) const { const bool IsUndef = SrcReg->isUndef(); const bool IsKill = SrcReg->isKill(); MI.setDesc(TII->get(Opc)); - if (Opc == AMDGPU::S_BITSET0_B32 || - Opc == AMDGPU::S_BITSET1_B32) { + if (Opc == AMDGPU::S_BITSET0_B32 || Opc == AMDGPU::S_BITSET1_B32) { Src0->ChangeToImmediate(NewImm); // Remove the immediate and add the tied input.
MI.getOperand(2).ChangeToRegister(Dest->getReg(), /*IsDef*/ false,
@@ -594,6 +595,116 @@ bool SIShrinkInstructions::shrinkScalarLogicOp(MachineInstr &MI) const {
   return false;
 }
 
+// Fold a one-bit mask produced by a shift into a single bit set/clear
+// instruction.
+//
+// case 1:
+// From:
+//   s_lshl_b32 s1, 1, s1
+//   s_or_b32 s0, s0, s1
+// To:
+//   s_bitset1_b32 s0, s1
+//
+// case 2:
+// From:
+//   s_lshl_b32 s1, 1, s1
+//   s_andn2_b32 s0, s0, s1
+// To:
+//   s_bitset0_b32 s0, s1
+//
+// \p MI is the S_OR_B32 / S_ANDN2_B32 candidate. Returns true if \p MI was
+// rewritten in place and the feeding S_LSHL_B32 was erased. Returns false,
+// leaving the code unmodified, if the pattern does not match or the rewrite is
+// not known to be safe.
+bool SIShrinkInstructions::shrinkToBitset(MachineInstr &MI) const {
+  MachineOperand *Dest = &MI.getOperand(0);
+  MachineOperand *Src0 = &MI.getOperand(1);
+  MachineOperand *Src1 = &MI.getOperand(2);
+
+  // The rewritten instruction ties its input to the destination, so the value
+  // being updated must already be in the destination register.
+  if (!Src0->isReg() || !Src1->isReg() || Dest->getReg() != Src0->getReg())
+    return false;
+
+  // S_BITSET* does not write SCC the way the logic ops and the shift do, so
+  // those SCC results must be unused before they can be dropped.
+  const auto HasLiveSCCDef = [](const MachineInstr &Inst) {
+    const MachineOperand *SCCDef =
+        Inst.findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr);
+    return SCCDef && !SCCDef->isDead();
+  };
+  if (HasLiveSCCDef(MI))
+    return false;
+
+  MachineInstr *Shl = MRI->getUniqueVRegDef(Src1->getReg());
+  if (!Shl || Shl->getOpcode() != AMDGPU::S_LSHL_B32 ||
+      MI.getParent() != Shl->getParent() || HasLiveSCCDef(*Shl))
+    return false;
+
+  // Only "1 << reg" is a single-bit mask whose position is the shift amount.
+  const MachineOperand &ShlBase = Shl->getOperand(1);
+  const MachineOperand &ShlAmt = Shl->getOperand(2);
+  if (!ShlBase.isImm() || ShlBase.getImm() != 1 || !ShlAmt.isReg())
+    return false;
+
+  const Register DestReg = Dest->getReg();
+  const Register MaskReg = Shl->getOperand(0).getReg();
+  const Register BitIdxReg = ShlAmt.getReg();
+  const bool BitIdxUndef = ShlAmt.isUndef();
+
+  // The shift is erased below, so nothing but MI may consume its result. The
+  // shift itself is listed as a user when it shifts a register in place.
+  for (const MachineInstr &UseMI : MRI->use_nodbg_instructions(MaskReg)) {
+    if (&UseMI != &MI && &UseMI != Shl)
+      return false;
+  }
+
+  // Inspect the instructions strictly between the shift and MI: the bit index
+  // is read at MI after the rewrite, so it must reach MI unmodified, and the
+  // mask MI currently reads must really be the one produced by the shift.
+  const TargetRegisterInfo *RegInfo = MRI->getTargetRegisterInfo();
+  const auto FirstBetween = std::next(Shl->getIterator());
+  const auto MIIt = MI.getIterator();
+  const auto BlockEnd = MI.getParent()->instr_end();
+  bool BitIdxKilled = ShlAmt.isKill();
+  for (auto I = FirstBetween; I != MIIt; ++I) {
+    // Running off the block means the shift is located after MI.
+    if (I == BlockEnd)
+      return false;
+    if (I->modifiesRegister(BitIdxReg, RegInfo) ||
+        I->modifiesRegister(MaskReg, RegInfo))
+      return false;
+    BitIdxKilled |= I->killsRegister(BitIdxReg, RegInfo);
+  }
+  // With an in-place shift the bit index takes over the mask's live range.
+  if (MaskReg == BitIdxReg)
+    BitIdxKilled |= Src1->isKill();
+
+  // Every check passed; only now start modifying code. The bit index stays
+  // live up to MI, so earlier kill flags on it are no longer valid.
+  for (auto I = FirstBetween; I != MIIt; ++I)
+    I->clearRegisterKills(BitIdxReg, RegInfo);
+
+  const bool DestInKill = Src0->isKill();
+  const bool DestInUndef = Src0->isUndef();
+  const unsigned NewOpc = MI.getOpcode() == AMDGPU::S_OR_B32
+                              ? AMDGPU::S_BITSET1_B32
+                              : AMDGPU::S_BITSET0_B32;
+  MI.setDesc(TII->get(NewOpc));
+
+  // Operand 1 becomes the bit index, operand 2 the tied input that carries the
+  // value being updated.
+  Src0->setReg(BitIdxReg);
+  Src0->setIsKill(BitIdxKilled);
+  Src0->setIsUndef(BitIdxUndef);
+  MI.getOperand(2).ChangeToRegister(DestReg, /*isDef=*/false,
+                                    /*isImp=*/false, DestInKill,
+                                    /*isDead=*/false, DestInUndef);
+  MI.tieOperands(0, 2);
+  Shl->eraseFromParent();
+  return true;
+}
+
 // This is the same as MachineInstr::readsRegister/modifiesRegister except
 // it takes subregs into account.
bool SIShrinkInstructions::instAccessReg( @@ -951,6 +1012,12 @@ bool SIShrinkInstructions::run(MachineFunction &MF) { continue; } + if (MI.getOpcode() == AMDGPU::S_ANDN2_B32 || + MI.getOpcode() == AMDGPU::S_OR_B32) { + if (shrinkToBitset(MI)) + continue; + } + if (TII->isMIMG(MI.getOpcode()) && ST->getGeneration() >= AMDGPUSubtarget::GFX10 && MF.getProperties().hasProperty( diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll index 8319e112f526e..bada7eb3eba93 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll @@ -728,9 +728,8 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10W32-NEXT: s_ff1_i32_b32 s2, s1 ; GFX10W32-NEXT: v_readlane_b32 s3, v0, s2 -; GFX10W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX10W32-NEXT: v_writelane_b32 v1, s0, s2 -; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6 +; GFX10W32-NEXT: s_bitset0_b32 s1, s2 ; GFX10W32-NEXT: s_add_i32 s0, s0, s3 ; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0 ; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_1 @@ -813,9 +812,8 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W32-NEXT: s_ctz_i32_b32 s2, s1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11W32-NEXT: v_readlane_b32 s3, v1, s2 -; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2 -; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX11W32-NEXT: s_bitset0_b32 s1, s2 ; GFX11W32-NEXT: s_add_i32 s0, s0, s3 ; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0 ; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_1 @@ -898,9 +896,8 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1 ; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2 -; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX12W32-NEXT: 
v_writelane_b32 v0, s0, s2 -; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX12W32-NEXT: s_bitset0_b32 s1, s2 ; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3 ; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0 @@ -1120,9 +1117,8 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10W32-NEXT: s_ff1_i32_b32 s2, s1 ; GFX10W32-NEXT: v_readlane_b32 s3, v0, s2 -; GFX10W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX10W32-NEXT: v_writelane_b32 v1, s0, s2 -; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6 +; GFX10W32-NEXT: s_bitset0_b32 s1, s2 ; GFX10W32-NEXT: s_add_i32 s0, s0, s3 ; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0 ; GFX10W32-NEXT: s_cbranch_scc1 .LBB3_1 @@ -1211,9 +1207,8 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX11W32-NEXT: s_ctz_i32_b32 s2, s1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11W32-NEXT: v_readlane_b32 s3, v1, s2 -; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2 -; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX11W32-NEXT: s_bitset0_b32 s1, s2 ; GFX11W32-NEXT: s_add_i32 s0, s0, s3 ; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0 ; GFX11W32-NEXT: s_cbranch_scc1 .LBB3_1 @@ -1301,9 +1296,8 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1 ; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2 -; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2 -; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX12W32-NEXT: s_bitset0_b32 s1, s2 ; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3 ; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0 @@ -2183,9 +2177,8 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10W32-NEXT: s_ff1_i32_b32 s2, s1 ; GFX10W32-NEXT: v_readlane_b32 s3, 
v0, s2 -; GFX10W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX10W32-NEXT: v_writelane_b32 v1, s0, s2 -; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6 +; GFX10W32-NEXT: s_bitset0_b32 s1, s2 ; GFX10W32-NEXT: s_add_i32 s0, s0, s3 ; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0 ; GFX10W32-NEXT: s_cbranch_scc1 .LBB7_1 @@ -2268,9 +2261,8 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W32-NEXT: s_ctz_i32_b32 s2, s1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11W32-NEXT: v_readlane_b32 s3, v1, s2 -; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2 -; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX11W32-NEXT: s_bitset0_b32 s1, s2 ; GFX11W32-NEXT: s_add_i32 s0, s0, s3 ; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0 ; GFX11W32-NEXT: s_cbranch_scc1 .LBB7_1 @@ -2354,9 +2346,8 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1 ; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2 -; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2 -; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX12W32-NEXT: s_bitset0_b32 s1, s2 ; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3 ; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll index 9775a37276dfd..15ad3c03a2b9c 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll @@ -620,9 +620,8 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 -; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2 -; 
GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6 +; GFX1032_ITERATIVE-NEXT: s_bitset0_b32 s1, s2 ; GFX1032_ITERATIVE-NEXT: s_add_i32 s0, s0, s3 ; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 @@ -706,9 +705,8 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2 -; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2 -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX1132_ITERATIVE-NEXT: s_bitset0_b32 s1, s2 ; GFX1132_ITERATIVE-NEXT: s_add_i32 s0, s0, s3 ; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 @@ -4027,9 +4025,8 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 -; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6 +; GFX1032_ITERATIVE-NEXT: s_bitset0_b32 s1, s2 ; GFX1032_ITERATIVE-NEXT: s_add_i32 s0, s0, s3 ; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB10_1 @@ -4113,9 +4110,8 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2 -; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2 -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX1132_ITERATIVE-NEXT: s_bitset0_b32 s1, s2 ; GFX1132_ITERATIVE-NEXT: s_add_i32 s0, s0, s3 ; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 
s1, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB10_1 @@ -6436,9 +6432,8 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 -; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6 +; GFX1032_ITERATIVE-NEXT: s_bitset0_b32 s1, s2 ; GFX1032_ITERATIVE-NEXT: s_and_b32 s0, s0, s3 ; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB15_1 @@ -6522,9 +6517,8 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2 -; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2 -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX1132_ITERATIVE-NEXT: s_bitset0_b32 s1, s2 ; GFX1132_ITERATIVE-NEXT: s_and_b32 s0, s0, s3 ; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB15_1 @@ -7068,10 +7062,9 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s3, s2 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 -; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s3 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s3 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s3 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s8 +; GFX1032_ITERATIVE-NEXT: s_bitset0_b32 s2, s3 ; GFX1032_ITERATIVE-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7] ; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB16_1 @@ -7166,10 +7159,9 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) 
{ ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3 ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s3 -; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s3 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3 -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s8 +; GFX1132_ITERATIVE-NEXT: s_bitset0_b32 s2, s3 ; GFX1132_ITERATIVE-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7] ; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB16_1 @@ -7797,9 +7789,8 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 -; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6 +; GFX1032_ITERATIVE-NEXT: s_bitset0_b32 s1, s2 ; GFX1032_ITERATIVE-NEXT: s_or_b32 s0, s0, s3 ; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB17_1 @@ -7883,9 +7874,8 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2 -; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2 -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX1132_ITERATIVE-NEXT: s_bitset0_b32 s1, s2 ; GFX1132_ITERATIVE-NEXT: s_or_b32 s0, s0, s3 ; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB17_1 @@ -8428,10 +8418,9 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s3, s2 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3 ; 
GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 -; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s3 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s3 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s3 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s8 +; GFX1032_ITERATIVE-NEXT: s_bitset0_b32 s2, s3 ; GFX1032_ITERATIVE-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7] ; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB18_1 @@ -8526,10 +8515,9 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3 ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s3 -; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s3 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3 -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s8 +; GFX1132_ITERATIVE-NEXT: s_bitset0_b32 s2, s3 ; GFX1132_ITERATIVE-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7] ; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB18_1 @@ -9157,9 +9145,8 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 -; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6 +; GFX1032_ITERATIVE-NEXT: s_bitset0_b32 s1, s2 ; GFX1032_ITERATIVE-NEXT: s_xor_b32 s0, s0, s3 ; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB19_1 @@ -9243,9 +9230,8 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, 
s2 -; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2 -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX1132_ITERATIVE-NEXT: s_bitset0_b32 s1, s2 ; GFX1132_ITERATIVE-NEXT: s_xor_b32 s0, s0, s3 ; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB19_1 @@ -9788,10 +9774,9 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s3, s2 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 -; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s3 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s3 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s3 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s8 +; GFX1032_ITERATIVE-NEXT: s_bitset0_b32 s2, s3 ; GFX1032_ITERATIVE-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7] ; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB20_1 @@ -9886,10 +9871,9 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3 ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s3 -; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s3 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3 -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s8 +; GFX1132_ITERATIVE-NEXT: s_bitset0_b32 s2, s3 ; GFX1132_ITERATIVE-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7] ; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB20_1 @@ -10517,9 +10501,8 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 -; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; 
GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6 +; GFX1032_ITERATIVE-NEXT: s_bitset0_b32 s1, s2 ; GFX1032_ITERATIVE-NEXT: s_max_i32 s0, s0, s3 ; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB21_1 @@ -10603,9 +10586,8 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2 -; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2 -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX1132_ITERATIVE-NEXT: s_bitset0_b32 s1, s2 ; GFX1132_ITERATIVE-NEXT: s_max_i32 s0, s0, s3 ; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB21_1 @@ -12345,9 +12327,8 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 -; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6 +; GFX1032_ITERATIVE-NEXT: s_bitset0_b32 s1, s2 ; GFX1032_ITERATIVE-NEXT: s_min_i32 s0, s0, s3 ; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB24_1 @@ -12431,9 +12412,8 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2 -; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2 -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX1132_ITERATIVE-NEXT: s_bitset0_b32 s1, s2 ; 
GFX1132_ITERATIVE-NEXT: s_min_i32 s0, s0, s3 ; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB24_1 @@ -14173,9 +14153,8 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 -; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6 +; GFX1032_ITERATIVE-NEXT: s_bitset0_b32 s1, s2 ; GFX1032_ITERATIVE-NEXT: s_max_u32 s0, s0, s3 ; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB27_1 @@ -14259,9 +14238,8 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2 -; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2 -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX1132_ITERATIVE-NEXT: s_bitset0_b32 s1, s2 ; GFX1132_ITERATIVE-NEXT: s_max_u32 s0, s0, s3 ; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB27_1 @@ -15988,9 +15966,8 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 -; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6 +; GFX1032_ITERATIVE-NEXT: s_bitset0_b32 s1, s2 ; GFX1032_ITERATIVE-NEXT: s_min_u32 s0, s0, s3 ; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB30_1 @@ -16074,9 +16051,8 @@ define amdgpu_kernel void 
@umin_i32_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2 -; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2 -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX1132_ITERATIVE-NEXT: s_bitset0_b32 s1, s2 ; GFX1132_ITERATIVE-NEXT: s_min_u32 s0, s0, s3 ; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB30_1 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll index 6a82dbeec5e2f..dc7e010e60c2e 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll @@ -727,9 +727,8 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10W32-NEXT: s_ff1_i32_b32 s2, s1 ; GFX10W32-NEXT: v_readlane_b32 s3, v0, s2 -; GFX10W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX10W32-NEXT: v_writelane_b32 v1, s0, s2 -; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6 +; GFX10W32-NEXT: s_bitset0_b32 s1, s2 ; GFX10W32-NEXT: s_add_i32 s0, s0, s3 ; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0 ; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_1 @@ -812,9 +811,8 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W32-NEXT: s_ctz_i32_b32 s2, s1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11W32-NEXT: v_readlane_b32 s3, v1, s2 -; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2 -; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX11W32-NEXT: s_bitset0_b32 s1, s2 ; GFX11W32-NEXT: s_add_i32 s0, s0, s3 ; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0 ; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_1 @@ -897,9 +895,8 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr 
addrspace(1) %out, ptr addr ; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1 ; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2 -; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2 -; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX12W32-NEXT: s_bitset0_b32 s1, s2 ; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3 ; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0 @@ -1777,9 +1774,8 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10W32-NEXT: s_ff1_i32_b32 s2, s1 ; GFX10W32-NEXT: v_readlane_b32 s3, v0, s2 -; GFX10W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX10W32-NEXT: v_writelane_b32 v1, s0, s2 -; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6 +; GFX10W32-NEXT: s_bitset0_b32 s1, s2 ; GFX10W32-NEXT: s_add_i32 s0, s0, s3 ; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0 ; GFX10W32-NEXT: s_cbranch_scc1 .LBB6_1 @@ -1862,9 +1858,8 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W32-NEXT: s_ctz_i32_b32 s2, s1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11W32-NEXT: v_readlane_b32 s3, v1, s2 -; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2 -; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX11W32-NEXT: s_bitset0_b32 s1, s2 ; GFX11W32-NEXT: s_add_i32 s0, s0, s3 ; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0 ; GFX11W32-NEXT: s_cbranch_scc1 .LBB6_1 @@ -1948,9 +1943,8 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1 ; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2 -; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2 -; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX12W32-NEXT: s_bitset0_b32 s1, s2 ; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3 ; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0 diff --git 
a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll index dd4c0b0625ea8..f4a5838fc7188 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll @@ -747,9 +747,8 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10W32-NEXT: s_ff1_i32_b32 s2, s1 ; GFX10W32-NEXT: v_readlane_b32 s3, v0, s2 -; GFX10W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX10W32-NEXT: v_writelane_b32 v1, s0, s2 -; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6 +; GFX10W32-NEXT: s_bitset0_b32 s1, s2 ; GFX10W32-NEXT: s_add_i32 s0, s0, s3 ; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0 ; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_1 @@ -834,9 +833,8 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W32-NEXT: s_ctz_i32_b32 s2, s1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11W32-NEXT: v_readlane_b32 s3, v1, s2 -; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2 -; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX11W32-NEXT: s_bitset0_b32 s1, s2 ; GFX11W32-NEXT: s_add_i32 s0, s0, s3 ; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0 ; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_1 @@ -920,9 +918,8 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1 ; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2 -; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2 -; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX12W32-NEXT: s_bitset0_b32 s1, s2 ; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3 ; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0 @@ -1948,9 +1945,8 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W32-NEXT: ; =>This Inner Loop Header: 
Depth=1 ; GFX10W32-NEXT: s_ff1_i32_b32 s2, s1 ; GFX10W32-NEXT: v_readlane_b32 s3, v0, s2 -; GFX10W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX10W32-NEXT: v_writelane_b32 v1, s0, s2 -; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6 +; GFX10W32-NEXT: s_bitset0_b32 s1, s2 ; GFX10W32-NEXT: s_add_i32 s0, s0, s3 ; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0 ; GFX10W32-NEXT: s_cbranch_scc1 .LBB7_1 @@ -2035,9 +2031,8 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W32-NEXT: s_ctz_i32_b32 s2, s1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11W32-NEXT: v_readlane_b32 s3, v1, s2 -; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2 -; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX11W32-NEXT: s_bitset0_b32 s1, s2 ; GFX11W32-NEXT: s_add_i32 s0, s0, s3 ; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0 ; GFX11W32-NEXT: s_cbranch_scc1 .LBB7_1 @@ -2122,9 +2117,8 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1 ; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2 -; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2 -; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX12W32-NEXT: s_bitset0_b32 s1, s2 ; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3 ; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0 diff --git a/llvm/test/CodeGen/AMDGPU/bitset01.ll b/llvm/test/CodeGen/AMDGPU/bitset01.ll new file mode 100644 index 0000000000000..edea33d70b50e --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/bitset01.ll @@ -0,0 +1,89 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -global-isel -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; https://github.com/llvm/llvm-project/issues/130245 + +define amdgpu_ps i32 @s_bitset1_b32(i32 inreg %src0, i32 inreg %bit.index) {
+; SI-LABEL: s_bitset1_b32: +; SI: ; %bb.0: +; SI-NEXT: s_bitset1_b32 s0, s1 +; SI-NEXT: ; return to shader part epilog + %set.bit.at.index = shl i32 1, %bit.index + %or = or i32 %src0, %set.bit.at.index + ret i32 %or +} + +define amdgpu_ps i32 @s_bitset1_b32_no_opt(i32 inreg %src0, i32 inreg %bit.index) { +; SI-LABEL: s_bitset1_b32_no_opt: +; SI: ; %bb.0: +; SI-NEXT: s_lshl_b32 s1, 2, s1 +; SI-NEXT: s_or_b32 s0, s0, s1 +; SI-NEXT: ; return to shader part epilog + %set.bit.at.index = shl i32 2, %bit.index + %or = or i32 %src0, %set.bit.at.index + ret i32 %or +} + +define amdgpu_ps float @s_bitset1_b32_v(i32 %src0, i32 %bit.index) { +; SI-LABEL: s_bitset1_b32_v: +; SI: ; %bb.0: +; SI-NEXT: v_lshl_or_b32 v0, 1, v1, v0 +; SI-NEXT: ; return to shader part epilog + %set.bit.at.index = shl i32 1, %bit.index + %or = or i32 %src0, %set.bit.at.index + %bc = bitcast i32 %or to float + ret float %bc +} + +define amdgpu_ps i32 @s_bitset0_b32(i32 inreg %src0, i32 inreg %bit.index) { +; SI-LABEL: s_bitset0_b32: +; SI: ; %bb.0: +; SI-NEXT: s_bitset0_b32 s0, s1 +; SI-NEXT: ; return to shader part epilog + %set.bit.at.index = shl i32 1, %bit.index + %other.bits = xor i32 %set.bit.at.index, -1 + %and = and i32 %src0, %other.bits + ret i32 %and +} + +define amdgpu_ps i32 @s_bitset0_b32_no_opt(i32 inreg %src0, i32 inreg %bit.index) { +; SI-LABEL: s_bitset0_b32_no_opt: +; SI: ; %bb.0: +; SI-NEXT: s_lshl_b32 s1, 10, s1 +; SI-NEXT: s_andn2_b32 s0, s0, s1 +; SI-NEXT: ; return to shader part epilog + %set.bit.at.index = shl i32 10, %bit.index + %other.bits = xor i32 %set.bit.at.index, -1 + %and = and i32 %src0, %other.bits + ret i32 %and +} + +define amdgpu_ps float @s_bitset0_b32_v(i32 %src0, i32 %bit.index) { +; SI-LABEL: s_bitset0_b32_v: +; SI: ; %bb.0: +; SI-NEXT: v_lshlrev_b32_e64 v1, v1, 1 +; SI-NEXT: v_not_b32_e32 v1, v1 +; SI-NEXT: v_and_b32_e32 v0, v0, v1 +; SI-NEXT: ; return to shader part epilog + %set.bit.at.index = shl i32 1, %bit.index + %other.bits = xor i32 
%set.bit.at.index, -1 + %and = and i32 %src0, %other.bits + %bc = bitcast i32 %and to float + ret float %bc +} + +define amdgpu_ps <4 x i32> @v_test(<4 x i32> %src0, <4 x i32> %bit.index) { +; SI-LABEL: v_test: +; SI: ; %bb.0: +; SI-NEXT: v_lshl_or_b32 v0, 1, v4, v0 +; SI-NEXT: v_lshl_or_b32 v1, 1, v5, v1 +; SI-NEXT: v_lshl_or_b32 v2, 1, v6, v2 +; SI-NEXT: v_lshl_or_b32 v3, 1, v7, v3 +; SI-NEXT: v_readfirstlane_b32 s0, v0 +; SI-NEXT: v_readfirstlane_b32 s1, v1 +; SI-NEXT: v_readfirstlane_b32 s2, v2 +; SI-NEXT: v_readfirstlane_b32 s3, v3 +; SI-NEXT: ; return to shader part epilog + %set.bit.at.index = shl <4 x i32> <i32 1, i32 1, i32 1, i32 1>, %bit.index + %or = or <4 x i32> %src0, %set.bit.at.index + ret <4 x i32> %or +}