diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp index 4f9beeaacfaee..ba661348ca5b5 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp @@ -308,7 +308,7 @@ bool AMDGPURegBankLegalize::runOnMachineFunction(MachineFunction &MF) { // Opcodes that support pretty much all combinations of reg banks and LLTs // (except S1). There is no point in writing rules for them. if (Opc == AMDGPU::G_BUILD_VECTOR || Opc == AMDGPU::G_UNMERGE_VALUES || - Opc == AMDGPU::G_MERGE_VALUES) { + Opc == AMDGPU::G_MERGE_VALUES || Opc == AMDGPU::G_BITCAST) { RBLHelper.applyMappingTrivial(*MI); continue; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp index a7c1d7ab98adf..7ff822c6f6580 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp @@ -171,6 +171,62 @@ void RegBankLegalizeHelper::lowerVccExtToSel(MachineInstr &MI) { MI.eraseFromParent(); } +const std::pair<Register, Register> +RegBankLegalizeHelper::unpackZExt(Register Reg) { + auto PackedS32 = B.buildBitcast(SgprRB_S32, Reg); + auto Mask = B.buildConstant(SgprRB_S32, 0x0000ffff); + auto Lo = B.buildAnd(SgprRB_S32, PackedS32, Mask); + auto Hi = B.buildLShr(SgprRB_S32, PackedS32, B.buildConstant(SgprRB_S32, 16)); + return {Lo.getReg(0), Hi.getReg(0)}; +} + +const std::pair<Register, Register> +RegBankLegalizeHelper::unpackSExt(Register Reg) { + auto PackedS32 = B.buildBitcast(SgprRB_S32, Reg); + auto Lo = B.buildSExtInReg(SgprRB_S32, PackedS32, 16); + auto Hi = B.buildAShr(SgprRB_S32, PackedS32, B.buildConstant(SgprRB_S32, 16)); + return {Lo.getReg(0), Hi.getReg(0)}; +} + +const std::pair<Register, Register> +RegBankLegalizeHelper::unpackAExt(Register Reg) { + auto PackedS32 = B.buildBitcast(SgprRB_S32, Reg); + auto Lo = PackedS32; + auto Hi = B.buildLShr(SgprRB_S32, PackedS32, B.buildConstant(SgprRB_S32, 16)); + return 
{Lo.getReg(0), Hi.getReg(0)}; +} + +void RegBankLegalizeHelper::lowerUnpackBitShift(MachineInstr &MI) { + Register Lo, Hi; + switch (MI.getOpcode()) { + case AMDGPU::G_SHL: { + auto [Val0, Val1] = unpackAExt(MI.getOperand(1).getReg()); + auto [Amt0, Amt1] = unpackAExt(MI.getOperand(2).getReg()); + Lo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0, Amt0}).getReg(0); + Hi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val1, Amt1}).getReg(0); + break; + } + case AMDGPU::G_LSHR: { + auto [Val0, Val1] = unpackZExt(MI.getOperand(1).getReg()); + auto [Amt0, Amt1] = unpackZExt(MI.getOperand(2).getReg()); + Lo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0, Amt0}).getReg(0); + Hi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val1, Amt1}).getReg(0); + break; + } + case AMDGPU::G_ASHR: { + auto [Val0, Val1] = unpackSExt(MI.getOperand(1).getReg()); + auto [Amt0, Amt1] = unpackSExt(MI.getOperand(2).getReg()); + Lo = B.buildAShr(SgprRB_S32, Val0, Amt0).getReg(0); + Hi = B.buildAShr(SgprRB_S32, Val1, Amt1).getReg(0); + break; + } + default: + llvm_unreachable("Unpack lowering not implemented"); + } + B.buildBuildVectorTrunc(MI.getOperand(0).getReg(), {Lo, Hi}); + MI.eraseFromParent(); +} + static bool isSignedBFE(MachineInstr &MI) { if (GIntrinsic *GI = dyn_cast<GIntrinsic>(&MI)) return (GI->is(Intrinsic::amdgcn_sbfe)); @@ -306,6 +362,33 @@ void RegBankLegalizeHelper::lowerSplitTo32Select(MachineInstr &MI) { MI.eraseFromParent(); } +void RegBankLegalizeHelper::lowerSplitTo32SExtInReg(MachineInstr &MI) { + auto Op1 = B.buildUnmerge(VgprRB_S32, MI.getOperand(1).getReg()); + int Amt = MI.getOperand(2).getImm(); + Register Lo, Hi; + // Hi|Lo: s sign bit, ?/x bits changed/not changed by sign-extend + if (Amt <= 32) { + auto Freeze = B.buildFreeze(VgprRB_S32, Op1.getReg(0)); + if (Amt == 32) { + // Hi|Lo: ????????|sxxxxxxx -> ssssssss|sxxxxxxx + Lo = Freeze.getReg(0); + } else { + // Hi|Lo: ????????|???sxxxx -> ssssssss|ssssxxxx + Lo = B.buildSExtInReg(VgprRB_S32, Freeze, Amt).getReg(0); + 
} + + auto SignExtCst = B.buildConstant(SgprRB_S32, 31); + Hi = B.buildAShr(VgprRB_S32, Lo, SignExtCst).getReg(0); + } else { + // Hi|Lo: ?????sxx|xxxxxxxx -> ssssssxx|xxxxxxxx + Lo = Op1.getReg(0); + Hi = B.buildSExtInReg(VgprRB_S32, Op1.getReg(1), Amt - 32).getReg(0); + } + + B.buildMergeLikeInstr(MI.getOperand(0).getReg(), {Lo, Hi}); + MI.eraseFromParent(); +} + void RegBankLegalizeHelper::lower(MachineInstr &MI, const RegBankLLTMapping &Mapping, SmallSet<Register, 4> &WaterfallSgprs) { @@ -328,6 +411,8 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI, MI.eraseFromParent(); return; } + case UnpackBitShift: + return lowerUnpackBitShift(MI); case Ext32To64: { const RegisterBank *RB = MRI.getRegBank(MI.getOperand(0).getReg()); MachineInstrBuilder Hi; @@ -394,6 +479,8 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI, return lowerSplitTo32(MI); case SplitTo32Select: return lowerSplitTo32Select(MI); + case SplitTo32SExtInReg: + return lowerSplitTo32SExtInReg(MI); case SplitLoad: { LLT DstTy = MRI.getType(MI.getOperand(0).getReg()); unsigned Size = DstTy.getSizeInBits(); @@ -483,6 +570,13 @@ LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) { case SgprP5: case VgprP5: return LLT::pointer(5, 32); + case SgprV2S16: + case VgprV2S16: + case UniInVgprV2S16: + return LLT::fixed_vector(2, 16); + case SgprV2S32: + case VgprV2S32: + return LLT::fixed_vector(2, 32); case SgprV4S32: case VgprV4S32: case UniInVgprV4S32: @@ -556,6 +650,8 @@ RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) { case SgprP3: case SgprP4: case SgprP5: + case SgprV2S16: + case SgprV2S32: case SgprV4S32: case SgprB32: case SgprB64: @@ -565,6 +661,7 @@ RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) { case SgprB512: case UniInVcc: case UniInVgprS32: + case UniInVgprV2S16: case UniInVgprV4S32: case UniInVgprB32: case UniInVgprB64: @@ -586,6 +683,8 @@ RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) { case VgprP3: case VgprP4: case 
VgprP5: + case VgprV2S16: + case VgprV2S32: case VgprV4S32: case VgprB32: case VgprB64: @@ -623,6 +722,8 @@ void RegBankLegalizeHelper::applyMappingDst( case SgprP3: case SgprP4: case SgprP5: + case SgprV2S16: + case SgprV2S32: case SgprV4S32: case Vgpr16: case Vgpr32: @@ -632,6 +733,8 @@ void RegBankLegalizeHelper::applyMappingDst( case VgprP3: case VgprP4: case VgprP5: + case VgprV2S16: + case VgprV2S32: case VgprV4S32: { assert(Ty == getTyFromID(MethodIDs[OpIdx])); assert(RB == getRegBankFromID(MethodIDs[OpIdx])); @@ -666,6 +769,7 @@ void RegBankLegalizeHelper::applyMappingDst( break; } case UniInVgprS32: + case UniInVgprV2S16: case UniInVgprV4S32: { assert(Ty == getTyFromID(MethodIDs[OpIdx])); assert(RB == SgprRB); @@ -739,6 +843,8 @@ void RegBankLegalizeHelper::applyMappingSrc( case SgprP3: case SgprP4: case SgprP5: + case SgprV2S16: + case SgprV2S32: case SgprV4S32: { assert(Ty == getTyFromID(MethodIDs[i])); assert(RB == getRegBankFromID(MethodIDs[i])); @@ -764,6 +870,8 @@ void RegBankLegalizeHelper::applyMappingSrc( case VgprP3: case VgprP4: case VgprP5: + case VgprV2S16: + case VgprV2S32: case VgprV4S32: { assert(Ty == getTyFromID(MethodIDs[i])); if (RB != VgprRB) { diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h index a9011ba07b8e6..50bd86dc15a1f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h @@ -111,10 +111,15 @@ class RegBankLegalizeHelper { SmallSet<Register, 4> &SgprWaterfallOperandRegs); void lowerVccExtToSel(MachineInstr &MI); + const std::pair<Register, Register> unpackZExt(Register Reg); + const std::pair<Register, Register> unpackSExt(Register Reg); + const std::pair<Register, Register> unpackAExt(Register Reg); + void lowerUnpackBitShift(MachineInstr &MI); void lowerV_BFE(MachineInstr &MI); void lowerS_BFE(MachineInstr &MI); void lowerSplitTo32(MachineInstr &MI); void lowerSplitTo32Select(MachineInstr &MI); + void lowerSplitTo32SExtInReg(MachineInstr &MI); }; } // end 
namespace AMDGPU diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp index f803217f82e6c..89056b0271f12 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp @@ -60,6 +60,8 @@ bool matchUniformityAndLLT(Register Reg, UniformityLLTOpPredicateID UniID, return MRI.getType(Reg) == LLT::pointer(4, 64); case P5: return MRI.getType(Reg) == LLT::pointer(5, 32); + case V2S32: + return MRI.getType(Reg) == LLT::fixed_vector(2, 32); case V4S32: return MRI.getType(Reg) == LLT::fixed_vector(4, 32); case B32: @@ -92,6 +94,8 @@ bool matchUniformityAndLLT(Register Reg, UniformityLLTOpPredicateID UniID, return MRI.getType(Reg) == LLT::pointer(4, 64) && MUI.isUniform(Reg); case UniP5: return MRI.getType(Reg) == LLT::pointer(5, 32) && MUI.isUniform(Reg); + case UniV2S16: + return MRI.getType(Reg) == LLT::fixed_vector(2, 16) && MUI.isUniform(Reg); case UniB32: return MRI.getType(Reg).getSizeInBits() == 32 && MUI.isUniform(Reg); case UniB64: @@ -122,6 +126,8 @@ bool matchUniformityAndLLT(Register Reg, UniformityLLTOpPredicateID UniID, return MRI.getType(Reg) == LLT::pointer(4, 64) && MUI.isDivergent(Reg); case DivP5: return MRI.getType(Reg) == LLT::pointer(5, 32) && MUI.isDivergent(Reg); + case DivV2S16: + return MRI.getType(Reg) == LLT::fixed_vector(2, 16) && MUI.isDivergent(Reg); case DivB32: return MRI.getType(Reg).getSizeInBits() == 32 && MUI.isDivergent(Reg); case DivB64: @@ -435,7 +441,7 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST, MachineRegisterInfo &_MRI) : ST(&_ST), MRI(&_MRI) { - addRulesForGOpcs({G_ADD}, Standard) + addRulesForGOpcs({G_ADD, G_SUB}, Standard) .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}}) .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}}); @@ -452,11 +458,36 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST, .Div(B64, {{VgprB64}, {VgprB64, VgprB64}, SplitTo32}); addRulesForGOpcs({G_SHL}, 
Standard) + .Uni(S16, {{Sgpr32Trunc}, {Sgpr32AExt, Sgpr32ZExt}}) + .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}}) + .Uni(V2S16, {{SgprV2S16}, {SgprV2S16, SgprV2S16}, UnpackBitShift}) + .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}}) + .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}}) + .Uni(S64, {{Sgpr64}, {Sgpr64, Sgpr32}}) .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}}) + .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr32}}); + + addRulesForGOpcs({G_LSHR}, Standard) + .Uni(S16, {{Sgpr32Trunc}, {Sgpr32ZExt, Sgpr32ZExt}}) + .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}}) + .Uni(V2S16, {{SgprV2S16}, {SgprV2S16, SgprV2S16}, UnpackBitShift}) + .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}}) + .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}}) .Uni(S64, {{Sgpr64}, {Sgpr64, Sgpr32}}) + .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}}) .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr32}}); - addRulesForGOpcs({G_LSHR}, Standard).Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}}); + addRulesForGOpcs({G_ASHR}, Standard) + .Uni(S16, {{Sgpr32Trunc}, {Sgpr32SExt, Sgpr32ZExt}}) + .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}}) + .Uni(V2S16, {{SgprV2S16}, {SgprV2S16, SgprV2S16}, UnpackBitShift}) + .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}}) + .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}}) + .Uni(S64, {{Sgpr64}, {Sgpr64, Sgpr32}}) + .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}}) + .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr32}}); + + addRulesForGOpcs({G_FRAME_INDEX}).Any({{UniP5, _}, {{SgprP5}, {None}}}); addRulesForGOpcs({G_UBFX, G_SBFX}, Standard) .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32, Sgpr32}, S_BFE}) @@ -515,6 +546,8 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST, .Any({{DivS16, S32}, {{Vgpr16}, {Vgpr32}}}) .Any({{UniS32, S64}, {{Sgpr32}, {Sgpr64}}}) .Any({{DivS32, S64}, {{Vgpr32}, {Vgpr64}}}) + .Any({{UniV2S16, V2S32}, {{SgprV2S16}, {SgprV2S32}}}) + .Any({{DivV2S16, V2S32}, {{VgprV2S16}, {VgprV2S32}}}) // This is non-trivial. VgprToVccCopy is done using compare instruction. 
.Any({{DivS1, DivS16}, {{Vcc}, {Vgpr16}, VgprToVccCopy}}) .Any({{DivS1, DivS32}, {{Vcc}, {Vgpr32}, VgprToVccCopy}}) @@ -550,6 +583,12 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST, .Any({{UniS32, S16}, {{Sgpr32}, {Sgpr16}}}) .Any({{DivS32, S16}, {{Vgpr32}, {Vgpr16}}}); + addRulesForGOpcs({G_SEXT_INREG}) + .Any({{UniS32, S32}, {{Sgpr32}, {Sgpr32}}}) + .Any({{DivS32, S32}, {{Vgpr32}, {Vgpr32}}}) + .Any({{UniS64, S64}, {{Sgpr64}, {Sgpr64}}}) + .Any({{DivS64, S64}, {{Vgpr64}, {Vgpr64}, SplitTo32SExtInReg}}); + bool hasUnalignedLoads = ST->getGeneration() >= AMDGPUSubtarget::GFX12; bool hasSMRDSmall = ST->hasScalarSubwordLoads(); diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h index 1c70597024b6a..bddfb8dd1913f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h @@ -75,6 +75,10 @@ enum UniformityLLTOpPredicateID { V3S32, V4S32, + UniV2S16, + + DivV2S16, + // B types B32, B64, @@ -117,7 +121,9 @@ enum RegBankLLTMappingApplyID { SgprP3, SgprP4, SgprP5, + SgprV2S16, SgprV4S32, + SgprV2S32, SgprB32, SgprB64, SgprB96, @@ -134,6 +140,8 @@ enum RegBankLLTMappingApplyID { VgprP3, VgprP4, VgprP5, + VgprV2S16, + VgprV2S32, VgprB32, VgprB64, VgprB96, @@ -145,6 +153,7 @@ enum RegBankLLTMappingApplyID { // Dst only modifiers: read-any-lane and truncs UniInVcc, UniInVgprS32, + UniInVgprV2S16, UniInVgprV4S32, UniInVgprB32, UniInVgprB64, @@ -173,11 +182,13 @@ enum LoweringMethodID { DoNotLower, VccExtToSel, UniExtToSel, + UnpackBitShift, S_BFE, V_BFE, VgprToVccCopy, SplitTo32, SplitTo32Select, + SplitTo32SExtInReg, Ext32To64, UniCstExt, SplitLoad, diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll index 4c6bf6503ca1f..ff03cf1231d08 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll @@ -1,9 +1,9 @@ ; NOTE: Assertions 
have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6 %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s define i8 @v_ashr_i8(i8 %value, i8 %amount) { ; GFX6-LABEL: v_ashr_i8: @@ -70,14 +70,29 @@ define i8 @v_ashr_i8_7(i8 %value) { } define amdgpu_ps i8 @s_ashr_i8(i8 inreg %value, i8 inreg %amount) { -; GCN-LABEL: s_ashr_i8: -; GCN: ; %bb.0: -; GCN-NEXT: s_sext_i32_i8 s0, s0 -; GCN-NEXT: s_ashr_i32 s0, s0, s1 -; GCN-NEXT: ; return to shader part epilog +; GFX6-LABEL: s_ashr_i8: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_sext_i32_i8 s0, s0 +; GFX6-NEXT: s_ashr_i32 s0, s0, s1 +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: s_ashr_i8: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_and_b32 s1, s1, 0xff +; GFX8-NEXT: 
s_sext_i32_i8 s0, s0 +; GFX8-NEXT: s_ashr_i32 s0, s0, s1 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: s_ashr_i8: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_and_b32 s1, s1, 0xff +; GFX9-NEXT: s_sext_i32_i8 s0, s0 +; GFX9-NEXT: s_ashr_i32 s0, s0, s1 +; GFX9-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: s_ashr_i8: ; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_and_b32 s1, s1, 0xff ; GFX10PLUS-NEXT: s_sext_i32_i8 s0, s0 ; GFX10PLUS-NEXT: s_ashr_i32 s0, s0, s1 ; GFX10PLUS-NEXT: ; return to shader part epilog diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll index be1dc7f0c67f9..6baa10bb48621 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll @@ -1,9 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6 %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck 
-check-prefixes=GFX10PLUS,GFX10 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s define i8 @v_lshr_i8(i8 %value, i8 %amount) { ; GFX6-LABEL: v_lshr_i8: @@ -69,15 +69,33 @@ define i8 @v_lshr_i8_7(i8 %value) { } define amdgpu_ps i8 @s_lshr_i8(i8 inreg %value, i8 inreg %amount) { -; GCN-LABEL: s_lshr_i8: -; GCN: ; %bb.0: -; GCN-NEXT: s_and_b32 s0, s0, 0xff -; GCN-NEXT: s_lshr_b32 s0, s0, s1 -; GCN-NEXT: ; return to shader part epilog +; GFX6-LABEL: s_lshr_i8: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_and_b32 s0, s0, 0xff +; GFX6-NEXT: s_lshr_b32 s0, s0, s1 +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: s_lshr_i8: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_and_b32 s0, s0, 0xff +; GFX8-NEXT: s_and_b32 s1, s1, 0xff +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX8-NEXT: s_lshr_b32 s0, s0, s1 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: s_lshr_i8: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_and_b32 s0, s0, 0xff +; GFX9-NEXT: s_and_b32 s1, s1, 0xff +; GFX9-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX9-NEXT: s_lshr_b32 s0, s0, s1 +; GFX9-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: s_lshr_i8: ; GFX10PLUS: ; %bb.0: ; GFX10PLUS-NEXT: s_and_b32 s0, s0, 0xff +; GFX10PLUS-NEXT: s_and_b32 s1, s1, 0xff +; GFX10PLUS-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX10PLUS-NEXT: s_lshr_b32 s0, s0, s1 ; GFX10PLUS-NEXT: ; return to shader part epilog %result = lshr i8 %value, %amount @@ -93,18 +111,21 @@ define amdgpu_ps i8 @s_lshr_i8_7(i8 inreg %value) { ; GFX8-LABEL: s_lshr_i8_7: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_and_b32 s0, s0, 0xff +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX8-NEXT: s_lshr_b32 s0, s0, 7 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_lshr_i8_7: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_and_b32 s0, s0, 0xff +; GFX9-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX9-NEXT: s_lshr_b32 s0, s0, 7 ; GFX9-NEXT: ; return to shader part epilog ; ; 
GFX10PLUS-LABEL: s_lshr_i8_7: ; GFX10PLUS: ; %bb.0: ; GFX10PLUS-NEXT: s_and_b32 s0, s0, 0xff +; GFX10PLUS-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX10PLUS-NEXT: s_lshr_b32 s0, s0, 7 ; GFX10PLUS-NEXT: ; return to shader part epilog %result = lshr i8 %value, 7 @@ -831,22 +852,22 @@ define amdgpu_ps i32 @s_lshr_v2i16(<2 x i16> inreg %value, <2 x i16> inreg %amou ; ; GFX9-LABEL: s_lshr_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_lshr_b32 s2, s0, 16 -; GFX9-NEXT: s_and_b32 s0, s0, 0xffff +; GFX9-NEXT: s_and_b32 s2, s0, 0xffff +; GFX9-NEXT: s_lshr_b32 s0, s0, 16 ; GFX9-NEXT: s_lshr_b32 s3, s1, 16 -; GFX9-NEXT: s_lshr_b32 s0, s0, s1 -; GFX9-NEXT: s_lshr_b32 s1, s2, s3 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX9-NEXT: s_lshr_b32 s1, s2, s1 +; GFX9-NEXT: s_lshr_b32 s0, s0, s3 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s1, s0 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: s_lshr_v2i16: ; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_lshr_b32 s2, s0, 16 -; GFX10PLUS-NEXT: s_and_b32 s0, s0, 0xffff +; GFX10PLUS-NEXT: s_and_b32 s2, s0, 0xffff +; GFX10PLUS-NEXT: s_lshr_b32 s0, s0, 16 ; GFX10PLUS-NEXT: s_lshr_b32 s3, s1, 16 -; GFX10PLUS-NEXT: s_lshr_b32 s0, s0, s1 -; GFX10PLUS-NEXT: s_lshr_b32 s1, s2, s3 -; GFX10PLUS-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX10PLUS-NEXT: s_lshr_b32 s1, s2, s1 +; GFX10PLUS-NEXT: s_lshr_b32 s0, s0, s3 +; GFX10PLUS-NEXT: s_pack_ll_b32_b16 s0, s1, s0 ; GFX10PLUS-NEXT: ; return to shader part epilog %result = lshr <2 x i16> %value, %amount %cast = bitcast <2 x i16> %result to i32 @@ -1024,34 +1045,34 @@ define amdgpu_ps <2 x i32> @s_lshr_v4i16(<4 x i16> inreg %value, <4 x i16> inreg ; ; GFX9-LABEL: s_lshr_v4i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_lshr_b32 s4, s0, 16 -; GFX9-NEXT: s_and_b32 s0, s0, 0xffff +; GFX9-NEXT: s_and_b32 s4, s0, 0xffff +; GFX9-NEXT: s_lshr_b32 s0, s0, 16 ; GFX9-NEXT: s_lshr_b32 s5, s2, 16 -; GFX9-NEXT: s_lshr_b32 s0, s0, s2 -; GFX9-NEXT: s_lshr_b32 s2, s4, s5 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2 -; GFX9-NEXT: s_lshr_b32 
s2, s1, 16 -; GFX9-NEXT: s_and_b32 s1, s1, 0xffff +; GFX9-NEXT: s_lshr_b32 s2, s4, s2 +; GFX9-NEXT: s_lshr_b32 s0, s0, s5 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s2, s0 +; GFX9-NEXT: s_and_b32 s2, s1, 0xffff +; GFX9-NEXT: s_lshr_b32 s1, s1, 16 ; GFX9-NEXT: s_lshr_b32 s4, s3, 16 -; GFX9-NEXT: s_lshr_b32 s1, s1, s3 -; GFX9-NEXT: s_lshr_b32 s2, s2, s4 -; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s2 +; GFX9-NEXT: s_lshr_b32 s2, s2, s3 +; GFX9-NEXT: s_lshr_b32 s1, s1, s4 +; GFX9-NEXT: s_pack_ll_b32_b16 s1, s2, s1 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: s_lshr_v4i16: ; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_lshr_b32 s4, s0, 16 -; GFX10PLUS-NEXT: s_and_b32 s0, s0, 0xffff +; GFX10PLUS-NEXT: s_and_b32 s4, s0, 0xffff +; GFX10PLUS-NEXT: s_lshr_b32 s0, s0, 16 ; GFX10PLUS-NEXT: s_lshr_b32 s5, s2, 16 -; GFX10PLUS-NEXT: s_lshr_b32 s0, s0, s2 -; GFX10PLUS-NEXT: s_lshr_b32 s2, s4, s5 -; GFX10PLUS-NEXT: s_lshr_b32 s4, s1, 16 -; GFX10PLUS-NEXT: s_and_b32 s1, s1, 0xffff +; GFX10PLUS-NEXT: s_lshr_b32 s2, s4, s2 +; GFX10PLUS-NEXT: s_lshr_b32 s0, s0, s5 +; GFX10PLUS-NEXT: s_and_b32 s4, s1, 0xffff +; GFX10PLUS-NEXT: s_lshr_b32 s1, s1, 16 ; GFX10PLUS-NEXT: s_lshr_b32 s5, s3, 16 -; GFX10PLUS-NEXT: s_lshr_b32 s1, s1, s3 -; GFX10PLUS-NEXT: s_lshr_b32 s3, s4, s5 -; GFX10PLUS-NEXT: s_pack_ll_b32_b16 s0, s0, s2 -; GFX10PLUS-NEXT: s_pack_ll_b32_b16 s1, s1, s3 +; GFX10PLUS-NEXT: s_lshr_b32 s3, s4, s3 +; GFX10PLUS-NEXT: s_lshr_b32 s1, s1, s5 +; GFX10PLUS-NEXT: s_pack_ll_b32_b16 s0, s2, s0 +; GFX10PLUS-NEXT: s_pack_ll_b32_b16 s1, s3, s1 ; GFX10PLUS-NEXT: ; return to shader part epilog %result = lshr <4 x i16> %value, %amount %cast = bitcast <4 x i16> %result to <2 x i32> @@ -1221,58 +1242,58 @@ define amdgpu_ps <4 x i32> @s_lshr_v8i16(<8 x i16> inreg %value, <8 x i16> inreg ; ; GFX9-LABEL: s_lshr_v8i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_lshr_b32 s8, s0, 16 -; GFX9-NEXT: s_and_b32 s0, s0, 0xffff +; GFX9-NEXT: s_and_b32 s8, s0, 0xffff +; GFX9-NEXT: s_lshr_b32 s0, s0, 16 ; GFX9-NEXT: 
s_lshr_b32 s9, s4, 16 -; GFX9-NEXT: s_lshr_b32 s0, s0, s4 -; GFX9-NEXT: s_lshr_b32 s4, s8, s9 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s4 -; GFX9-NEXT: s_lshr_b32 s4, s1, 16 -; GFX9-NEXT: s_and_b32 s1, s1, 0xffff +; GFX9-NEXT: s_lshr_b32 s4, s8, s4 +; GFX9-NEXT: s_lshr_b32 s0, s0, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s4, s0 +; GFX9-NEXT: s_and_b32 s4, s1, 0xffff +; GFX9-NEXT: s_lshr_b32 s1, s1, 16 ; GFX9-NEXT: s_lshr_b32 s8, s5, 16 -; GFX9-NEXT: s_lshr_b32 s1, s1, s5 -; GFX9-NEXT: s_lshr_b32 s4, s4, s8 -; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s4 -; GFX9-NEXT: s_lshr_b32 s4, s2, 16 -; GFX9-NEXT: s_and_b32 s2, s2, 0xffff -; GFX9-NEXT: s_lshr_b32 s5, s6, 16 -; GFX9-NEXT: s_lshr_b32 s2, s2, s6 ; GFX9-NEXT: s_lshr_b32 s4, s4, s5 -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s4 -; GFX9-NEXT: s_lshr_b32 s4, s3, 16 -; GFX9-NEXT: s_and_b32 s3, s3, 0xffff +; GFX9-NEXT: s_lshr_b32 s1, s1, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s1, s4, s1 +; GFX9-NEXT: s_and_b32 s4, s2, 0xffff +; GFX9-NEXT: s_lshr_b32 s2, s2, 16 +; GFX9-NEXT: s_lshr_b32 s5, s6, 16 +; GFX9-NEXT: s_lshr_b32 s4, s4, s6 +; GFX9-NEXT: s_lshr_b32 s2, s2, s5 +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s4, s2 +; GFX9-NEXT: s_and_b32 s4, s3, 0xffff +; GFX9-NEXT: s_lshr_b32 s3, s3, 16 ; GFX9-NEXT: s_lshr_b32 s5, s7, 16 -; GFX9-NEXT: s_lshr_b32 s3, s3, s7 -; GFX9-NEXT: s_lshr_b32 s4, s4, s5 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s4 +; GFX9-NEXT: s_lshr_b32 s4, s4, s7 +; GFX9-NEXT: s_lshr_b32 s3, s3, s5 +; GFX9-NEXT: s_pack_ll_b32_b16 s3, s4, s3 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: s_lshr_v8i16: ; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_lshr_b32 s8, s0, 16 -; GFX10PLUS-NEXT: s_and_b32 s0, s0, 0xffff +; GFX10PLUS-NEXT: s_and_b32 s8, s0, 0xffff +; GFX10PLUS-NEXT: s_lshr_b32 s0, s0, 16 ; GFX10PLUS-NEXT: s_lshr_b32 s9, s4, 16 -; GFX10PLUS-NEXT: s_lshr_b32 s0, s0, s4 -; GFX10PLUS-NEXT: s_lshr_b32 s4, s8, s9 -; GFX10PLUS-NEXT: s_lshr_b32 s8, s1, 16 -; GFX10PLUS-NEXT: s_and_b32 s1, s1, 0xffff +; GFX10PLUS-NEXT: 
s_lshr_b32 s4, s8, s4 +; GFX10PLUS-NEXT: s_lshr_b32 s0, s0, s9 +; GFX10PLUS-NEXT: s_and_b32 s8, s1, 0xffff +; GFX10PLUS-NEXT: s_lshr_b32 s1, s1, 16 ; GFX10PLUS-NEXT: s_lshr_b32 s9, s5, 16 -; GFX10PLUS-NEXT: s_lshr_b32 s1, s1, s5 -; GFX10PLUS-NEXT: s_lshr_b32 s5, s8, s9 -; GFX10PLUS-NEXT: s_pack_ll_b32_b16 s0, s0, s4 -; GFX10PLUS-NEXT: s_pack_ll_b32_b16 s1, s1, s5 -; GFX10PLUS-NEXT: s_lshr_b32 s4, s2, 16 -; GFX10PLUS-NEXT: s_and_b32 s2, s2, 0xffff +; GFX10PLUS-NEXT: s_lshr_b32 s5, s8, s5 +; GFX10PLUS-NEXT: s_lshr_b32 s1, s1, s9 +; GFX10PLUS-NEXT: s_pack_ll_b32_b16 s0, s4, s0 +; GFX10PLUS-NEXT: s_pack_ll_b32_b16 s1, s5, s1 +; GFX10PLUS-NEXT: s_and_b32 s4, s2, 0xffff +; GFX10PLUS-NEXT: s_lshr_b32 s2, s2, 16 ; GFX10PLUS-NEXT: s_lshr_b32 s5, s6, 16 -; GFX10PLUS-NEXT: s_lshr_b32 s2, s2, s6 -; GFX10PLUS-NEXT: s_lshr_b32 s4, s4, s5 -; GFX10PLUS-NEXT: s_lshr_b32 s5, s3, 16 -; GFX10PLUS-NEXT: s_and_b32 s3, s3, 0xffff +; GFX10PLUS-NEXT: s_lshr_b32 s4, s4, s6 +; GFX10PLUS-NEXT: s_lshr_b32 s2, s2, s5 +; GFX10PLUS-NEXT: s_and_b32 s5, s3, 0xffff +; GFX10PLUS-NEXT: s_lshr_b32 s3, s3, 16 ; GFX10PLUS-NEXT: s_lshr_b32 s6, s7, 16 -; GFX10PLUS-NEXT: s_lshr_b32 s3, s3, s7 -; GFX10PLUS-NEXT: s_lshr_b32 s5, s5, s6 -; GFX10PLUS-NEXT: s_pack_ll_b32_b16 s2, s2, s4 -; GFX10PLUS-NEXT: s_pack_ll_b32_b16 s3, s3, s5 +; GFX10PLUS-NEXT: s_lshr_b32 s5, s5, s7 +; GFX10PLUS-NEXT: s_lshr_b32 s3, s3, s6 +; GFX10PLUS-NEXT: s_pack_ll_b32_b16 s2, s4, s2 +; GFX10PLUS-NEXT: s_pack_ll_b32_b16 s3, s5, s3 ; GFX10PLUS-NEXT: ; return to shader part epilog %result = lshr <8 x i16> %value, %amount %cast = bitcast <8 x i16> %result to <4 x i32> @@ -1605,8 +1626,9 @@ define i65 @v_lshr_i65(i65 %value, i65 %amount) { ; GFX6-LABEL: v_lshr_i65: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, 1, v2 +; GFX6-NEXT: v_mov_b32_e32 v4, 1 ; GFX6-NEXT: v_mov_b32_e32 v5, 0 +; GFX6-NEXT: v_and_b32_e32 v4, 1, v2 ; GFX6-NEXT: v_sub_i32_e32 v8, vcc, 64, v3 ; GFX6-NEXT: 
v_add_i32_e32 v2, vcc, 0xffffffc0, v3 ; GFX6-NEXT: v_lshr_b64 v[6:7], v[0:1], v3 @@ -1627,8 +1649,9 @@ define i65 @v_lshr_i65(i65 %value, i65 %amount) { ; GFX8-LABEL: v_lshr_i65: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v4, 1, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, 1 ; GFX8-NEXT: v_mov_b32_e32 v5, 0 +; GFX8-NEXT: v_and_b32_e32 v4, 1, v2 ; GFX8-NEXT: v_sub_u32_e32 v8, vcc, 64, v3 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0xffffffc0, v3 ; GFX8-NEXT: v_lshrrev_b64 v[6:7], v3, v[0:1] @@ -1649,8 +1672,9 @@ define i65 @v_lshr_i65(i65 %value, i65 %amount) { ; GFX9-LABEL: v_lshr_i65: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v4, 1, v2 +; GFX9-NEXT: v_mov_b32_e32 v4, 1 ; GFX9-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-NEXT: v_and_b32_e32 v4, 1, v2 ; GFX9-NEXT: v_sub_u32_e32 v8, 64, v3 ; GFX9-NEXT: v_add_u32_e32 v2, 0xffffffc0, v3 ; GFX9-NEXT: v_lshrrev_b64 v[6:7], v3, v[0:1] @@ -1671,6 +1695,7 @@ define i65 @v_lshr_i65(i65 %value, i65 %amount) { ; GFX10-LABEL: v_lshr_i65: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v4, 1 ; GFX10-NEXT: v_mov_b32_e32 v5, 0 ; GFX10-NEXT: v_and_b32_e32 v4, 1, v2 ; GFX10-NEXT: v_sub_nc_u32_e32 v2, 64, v3 @@ -1693,21 +1718,22 @@ define i65 @v_lshr_i65(i65 %value, i65 %amount) { ; GFX11-LABEL: v_lshr_i65: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v3 +; GFX11-NEXT: v_mov_b32_e32 v4, 1 ; GFX11-NEXT: v_dual_mov_b32 v5, 0 :: v_dual_and_b32 v4, 1, v2 ; GFX11-NEXT: v_sub_nc_u32_e32 v2, 64, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v10, 0xffffffc0, v3 ; GFX11-NEXT: v_lshrrev_b64 v[6:7], v3, v[0:1] -; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v3 ; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v3 ; GFX11-NEXT: v_lshlrev_b64 v[8:9], v2, v[4:5] -; GFX11-NEXT: v_or_b32_e32 v2, v6, v8 -; GFX11-NEXT: v_or_b32_e32 v6, v7, v9 -; GFX11-NEXT: v_add_nc_u32_e32 
v10, 0xffffffc0, v3 ; GFX11-NEXT: v_lshrrev_b64 v[10:11], v10, v[4:5] ; GFX11-NEXT: v_lshrrev_b64 v[4:5], v3, v[4:5] -; GFX11-NEXT: v_cndmask_b32_e32 v5, v11, v6, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v2, v6, v8 +; GFX11-NEXT: v_or_b32_e32 v6, v7, v9 ; GFX11-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v1, v5, v1, s0 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v11, v6, vcc_lo ; GFX11-NEXT: v_cndmask_b32_e64 v0, v2, v0, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v1, v5, v1, s0 ; GFX11-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc_lo ; GFX11-NEXT: s_setpc_b64 s[30:31] %result = lshr i65 %value, %amount @@ -1719,8 +1745,9 @@ define i65 @v_lshr_i65_33(i65 %value) { ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v3, v1 -; GFX6-NEXT: v_and_b32_e32 v0, 1, v2 +; GFX6-NEXT: v_mov_b32_e32 v0, 1 ; GFX6-NEXT: v_mov_b32_e32 v1, 0 +; GFX6-NEXT: v_and_b32_e32 v0, 1, v2 ; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 31 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 1, v3 ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 @@ -1731,8 +1758,9 @@ define i65 @v_lshr_i65_33(i65 %value) { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: v_and_b32_e32 v0, 1, v2 +; GFX8-NEXT: v_mov_b32_e32 v0, 1 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v2 ; GFX8-NEXT: v_lshlrev_b64 v[0:1], 31, v[0:1] ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 1, v3 ; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 @@ -1743,8 +1771,9 @@ define i65 @v_lshr_i65_33(i65 %value) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v2 +; GFX9-NEXT: v_mov_b32_e32 v0, 1 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v2 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 31, v[0:1] ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 1, v3 ; GFX9-NEXT: v_or_b32_e32 v0, v2, v0 @@ -1755,6 +1784,7 @@ define i65 @v_lshr_i65_33(i65 %value) { ; GFX10: ; %bb.0: ; 
GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v3, v1 +; GFX10-NEXT: v_mov_b32_e32 v0, 1 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 1, v3 @@ -1766,8 +1796,8 @@ define i65 @v_lshr_i65_33(i65 %value) { ; GFX11-LABEL: v_lshr_i65_33: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_and_b32 v0, 1, v2 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v0, 1 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 1, v2 ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 1, v3 ; GFX11-NEXT: v_lshlrev_b64 v[0:1], 31, v[0:1] ; GFX11-NEXT: v_or_b32_e32 v0, v2, v0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-ashr.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-ashr.mir index 615cfec2b31cf..a0cb85f710443 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-ashr.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-ashr.mir @@ -1,6 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass=regbankselect -regbankselect-fast -o - %s | FileCheck %s -# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass=regbankselect -regbankselect-greedy -o - %s | FileCheck %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass="amdgpu-regbankselect,amdgpu-regbanklegalize" -o - %s | FileCheck %s --- name: ashr_s32_ss @@ -206,8 +205,7 @@ body: | ; CHECK-NEXT: [[ASHR:%[0-9]+]]:sgpr(s32) = G_ASHR [[BITCAST]], [[C]](s32) ; CHECK-NEXT: [[BITCAST1:%[0-9]+]]:sgpr(s32) = G_BITCAST [[COPY1]](<2 x s16>) ; CHECK-NEXT: [[SEXT_INREG1:%[0-9]+]]:sgpr(s32) = G_SEXT_INREG [[BITCAST1]], 16 - ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 16 - ; CHECK-NEXT: [[ASHR1:%[0-9]+]]:sgpr(s32) = G_ASHR [[BITCAST1]], [[C1]](s32) + ; CHECK-NEXT: 
[[ASHR1:%[0-9]+]]:sgpr(s32) = G_ASHR [[BITCAST1]], [[C]](s32) ; CHECK-NEXT: [[ASHR2:%[0-9]+]]:sgpr(s32) = G_ASHR [[SEXT_INREG]], [[SEXT_INREG1]](s32) ; CHECK-NEXT: [[ASHR3:%[0-9]+]]:sgpr(s32) = G_ASHR [[ASHR]], [[ASHR1]](s32) ; CHECK-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ASHR2]](s32), [[ASHR3]](s32) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-lshr.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-lshr.mir index c5024924a4d32..60b89bf42031d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-lshr.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-lshr.mir @@ -1,6 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass=regbankselect -regbankselect-fast -o - %s | FileCheck %s -# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass=regbankselect -regbankselect-greedy -o - %s | FileCheck %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass="amdgpu-regbankselect,amdgpu-regbanklegalize" -o - %s | FileCheck %s --- name: lshr_s32_ss @@ -201,15 +200,13 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(<2 x s16>) = COPY $sgpr0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(<2 x s16>) = COPY $sgpr1 ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:sgpr(s32) = G_BITCAST [[COPY]](<2 x s16>) - ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 16 - ; CHECK-NEXT: [[LSHR:%[0-9]+]]:sgpr(s32) = G_LSHR [[BITCAST]], [[C]](s32) - ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 65535 - ; CHECK-NEXT: [[AND:%[0-9]+]]:sgpr(s32) = G_AND [[BITCAST]], [[C1]] + ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 65535 + ; CHECK-NEXT: [[AND:%[0-9]+]]:sgpr(s32) = G_AND [[BITCAST]], [[C]] + ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 16 + ; CHECK-NEXT: [[LSHR:%[0-9]+]]:sgpr(s32) = G_LSHR [[BITCAST]], [[C1]](s32) ; CHECK-NEXT: [[BITCAST1:%[0-9]+]]:sgpr(s32) = G_BITCAST 
[[COPY1]](<2 x s16>) - ; CHECK-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 16 - ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:sgpr(s32) = G_LSHR [[BITCAST1]], [[C2]](s32) - ; CHECK-NEXT: [[C3:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 65535 - ; CHECK-NEXT: [[AND1:%[0-9]+]]:sgpr(s32) = G_AND [[BITCAST1]], [[C3]] + ; CHECK-NEXT: [[AND1:%[0-9]+]]:sgpr(s32) = G_AND [[BITCAST1]], [[C]] + ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:sgpr(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) ; CHECK-NEXT: [[LSHR2:%[0-9]+]]:sgpr(s32) = G_LSHR [[AND]], [[AND1]](s32) ; CHECK-NEXT: [[LSHR3:%[0-9]+]]:sgpr(s32) = G_LSHR [[LSHR]], [[LSHR1]](s32) ; CHECK-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR2]](s32), [[LSHR3]](s32) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-sext-inreg.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-sext-inreg.mir index cf0ca2c9eb634..1a8fa56a7f799 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-sext-inreg.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-sext-inreg.mir @@ -1,6 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-fast | FileCheck %s -# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-greedy | FileCheck %s +# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass="amdgpu-regbankselect,amdgpu-regbanklegalize" %s -verify-machineinstrs -o - | FileCheck %s --- name: sext_inreg_s_s32_1 @@ -137,7 +136,7 @@ body: | ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY]](s64) ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:vgpr(s32) = G_FREEZE [[UV]] ; CHECK-NEXT: [[SEXT_INREG:%[0-9]+]]:vgpr(s32) = G_SEXT_INREG [[FREEZE]], 1 - ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 31 + ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 31 ; CHECK-NEXT: [[ASHR:%[0-9]+]]:vgpr(s32) = G_ASHR 
[[SEXT_INREG]], [[C]](s32) ; CHECK-NEXT: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[SEXT_INREG]](s32), [[ASHR]](s32) ; CHECK-NEXT: S_ENDPGM 0, implicit [[MV]](s64) @@ -162,7 +161,7 @@ body: | ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY]](s64) ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:vgpr(s32) = G_FREEZE [[UV]] ; CHECK-NEXT: [[SEXT_INREG:%[0-9]+]]:vgpr(s32) = G_SEXT_INREG [[FREEZE]], 31 - ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 31 + ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 31 ; CHECK-NEXT: [[ASHR:%[0-9]+]]:vgpr(s32) = G_ASHR [[SEXT_INREG]], [[C]](s32) ; CHECK-NEXT: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[SEXT_INREG]](s32), [[ASHR]](s32) ; CHECK-NEXT: S_ENDPGM 0, implicit [[MV]](s64) @@ -186,7 +185,7 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1 ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY]](s64) ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:vgpr(s32) = G_FREEZE [[UV]] - ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 31 + ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 31 ; CHECK-NEXT: [[ASHR:%[0-9]+]]:vgpr(s32) = G_ASHR [[FREEZE]], [[C]](s32) ; CHECK-NEXT: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[FREEZE]](s32), [[ASHR]](s32) ; CHECK-NEXT: S_ENDPGM 0, implicit [[MV]](s64) @@ -209,9 +208,8 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1 ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY]](s64) - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[UV]](s32) - ; CHECK-NEXT: [[SEXT_INREG:%[0-9]+]]:vgpr(s32) = G_SEXT_INREG [[COPY1]], 1 - ; CHECK-NEXT: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[COPY1]](s32), [[SEXT_INREG]](s32) + ; CHECK-NEXT: [[SEXT_INREG:%[0-9]+]]:vgpr(s32) = G_SEXT_INREG [[UV1]], 1 + ; CHECK-NEXT: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[UV]](s32), [[SEXT_INREG]](s32) ; CHECK-NEXT: S_ENDPGM 0, implicit [[MV]](s64) 
%0:_(s64) = COPY $vgpr0_vgpr1 %1:_(s64) = G_SEXT_INREG %0, 33 @@ -232,9 +230,8 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1 ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY]](s64) - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[UV]](s32) - ; CHECK-NEXT: [[SEXT_INREG:%[0-9]+]]:vgpr(s32) = G_SEXT_INREG [[COPY1]], 3 - ; CHECK-NEXT: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[COPY1]](s32), [[SEXT_INREG]](s32) + ; CHECK-NEXT: [[SEXT_INREG:%[0-9]+]]:vgpr(s32) = G_SEXT_INREG [[UV1]], 3 + ; CHECK-NEXT: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[UV]](s32), [[SEXT_INREG]](s32) ; CHECK-NEXT: S_ENDPGM 0, implicit [[MV]](s64) %0:_(s64) = COPY $vgpr0_vgpr1 %1:_(s64) = G_SEXT_INREG %0, 35 @@ -255,9 +252,8 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1 ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY]](s64) - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[UV]](s32) - ; CHECK-NEXT: [[SEXT_INREG:%[0-9]+]]:vgpr(s32) = G_SEXT_INREG [[COPY1]], 31 - ; CHECK-NEXT: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[COPY1]](s32), [[SEXT_INREG]](s32) + ; CHECK-NEXT: [[SEXT_INREG:%[0-9]+]]:vgpr(s32) = G_SEXT_INREG [[UV1]], 31 + ; CHECK-NEXT: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[UV]](s32), [[SEXT_INREG]](s32) ; CHECK-NEXT: S_ENDPGM 0, implicit [[MV]](s64) %0:_(s64) = COPY $vgpr0_vgpr1 %1:_(s64) = G_SEXT_INREG %0, 63 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-shl.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-shl.mir index b4290ea0a4203..6bdf8e7e1de6f 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-shl.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-shl.mir @@ -1,6 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass=regbankselect 
-regbankselect-fast -o - %s | FileCheck %s -# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass=regbankselect -regbankselect-greedy -o - %s | FileCheck %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass="amdgpu-regbankselect,amdgpu-regbanklegalize" -o - %s | FileCheck %s --- name: shl_s32_ss @@ -204,8 +203,7 @@ body: | ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 16 ; CHECK-NEXT: [[LSHR:%[0-9]+]]:sgpr(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; CHECK-NEXT: [[BITCAST1:%[0-9]+]]:sgpr(s32) = G_BITCAST [[COPY1]](<2 x s16>) - ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 16 - ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:sgpr(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) + ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:sgpr(s32) = G_LSHR [[BITCAST1]], [[C]](s32) ; CHECK-NEXT: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[BITCAST]], [[BITCAST1]](s32) ; CHECK-NEXT: [[SHL1:%[0-9]+]]:sgpr(s32) = G_SHL [[LSHR]], [[LSHR1]](s32) ; CHECK-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[SHL]](s32), [[SHL1]](s32) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll index 46b75eb55cb52..a9b3deb3e49f4 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll @@ -1,9 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6 %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck 
-check-prefixes=GFX10PLUS,GFX11 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s define i8 @v_sext_inreg_i8_4(i8 %value) { ; GCN-LABEL: v_sext_inreg_i8_4: @@ -1077,13 +1077,13 @@ define i64 @v_sext_inreg_i64_23(i64 %value) { ; GCN-LABEL: v_sext_inreg_i64_23: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_bfe_i32 v1, v0, 0, 9 +; GCN-NEXT: v_bfe_i32 v1, v1, 0, 9 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX10PLUS-LABEL: v_sext_inreg_i64_23: ; GFX10PLUS: ; %bb.0: ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10PLUS-NEXT: v_bfe_i32 v1, v0, 0, 9 +; GFX10PLUS-NEXT: v_bfe_i32 v1, v1, 0, 9 ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %shl = shl i64 %value, 23 %ashr = ashr i64 %shl, 23 @@ -1170,13 +1170,13 @@ define i64 @v_sext_inreg_i64_31(i64 %value) { ; GCN-LABEL: v_sext_inreg_i64_31: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_bfe_i32 v1, v0, 0, 1 +; GCN-NEXT: v_bfe_i32 v1, v1, 0, 1 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX10PLUS-LABEL: v_sext_inreg_i64_31: ; GFX10PLUS: ; %bb.0: ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10PLUS-NEXT: v_bfe_i32 v1, v0, 0, 1 +; GFX10PLUS-NEXT: v_bfe_i32 v1, v1, 0, 1 ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %shl = shl i64 %value, 31 %ashr = ashr i64 %shl, 31 @@ -1262,15 +1262,15 @@ define <2 x i64> 
@v_sext_inreg_v2i64_16(<2 x i64> %value) { ; GCN-LABEL: v_sext_inreg_v2i64_16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_bfe_i32 v1, v0, 0, 16 -; GCN-NEXT: v_bfe_i32 v3, v2, 0, 16 +; GCN-NEXT: v_bfe_i32 v1, v1, 0, 16 +; GCN-NEXT: v_bfe_i32 v3, v3, 0, 16 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX10PLUS-LABEL: v_sext_inreg_v2i64_16: ; GFX10PLUS: ; %bb.0: ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10PLUS-NEXT: v_bfe_i32 v1, v0, 0, 16 -; GFX10PLUS-NEXT: v_bfe_i32 v3, v2, 0, 16 +; GFX10PLUS-NEXT: v_bfe_i32 v1, v1, 0, 16 +; GFX10PLUS-NEXT: v_bfe_i32 v3, v3, 0, 16 ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %shl = shl <2 x i64> %value, %ashr = ashr <2 x i64> %shl, @@ -1281,15 +1281,15 @@ define <2 x i64> @v_sext_inreg_v2i64_31(<2 x i64> %value) { ; GCN-LABEL: v_sext_inreg_v2i64_31: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_bfe_i32 v1, v0, 0, 1 -; GCN-NEXT: v_bfe_i32 v3, v2, 0, 1 +; GCN-NEXT: v_bfe_i32 v1, v1, 0, 1 +; GCN-NEXT: v_bfe_i32 v3, v3, 0, 1 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX10PLUS-LABEL: v_sext_inreg_v2i64_31: ; GFX10PLUS: ; %bb.0: ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10PLUS-NEXT: v_bfe_i32 v1, v0, 0, 1 -; GFX10PLUS-NEXT: v_bfe_i32 v3, v2, 0, 1 +; GFX10PLUS-NEXT: v_bfe_i32 v1, v1, 0, 1 +; GFX10PLUS-NEXT: v_bfe_i32 v3, v3, 0, 1 ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %shl = shl <2 x i64> %value, %ashr = ashr <2 x i64> %shl, diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll index 139652eb55e3d..2f03c7156babc 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll @@ -1,9 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6 %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=fiji < %s | FileCheck 
-check-prefixes=GCN,GFX8 %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s define i8 @v_shl_i8(i8 %value, i8 %amount) { ; GFX6-LABEL: v_shl_i8: @@ -64,13 +64,26 @@ define i8 @v_shl_i8_7(i8 %value) { } define amdgpu_ps i8 @s_shl_i8(i8 inreg %value, i8 inreg %amount) { -; GCN-LABEL: s_shl_i8: -; GCN: ; %bb.0: -; GCN-NEXT: s_lshl_b32 s0, s0, s1 -; GCN-NEXT: ; return to shader part epilog +; GFX6-LABEL: s_shl_i8: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_lshl_b32 s0, s0, s1 +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: s_shl_i8: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_and_b32 s1, s1, 0xff +; GFX8-NEXT: s_lshl_b32 s0, s0, s1 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: s_shl_i8: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_and_b32 s1, s1, 0xff +; GFX9-NEXT: s_lshl_b32 s0, s0, s1 +; GFX9-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: s_shl_i8: ; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_and_b32 s1, s1, 0xff ; 
GFX10PLUS-NEXT: s_lshl_b32 s0, s0, s1 ; GFX10PLUS-NEXT: ; return to shader part epilog %result = shl i8 %value, %amount