struct MFMAPaddingRatioParser : public cl::parser<unsigned> {
  bool parse(cl::Option &O, StringRef ArgName, StringRef Arg, unsigned &Value) {
    if (Arg.getAsInteger(0, Value))
      return O.error("'" + Arg + "' value invalid for uint argument!");
    if (Value > 100)
      return O.error("'" + Arg + "' value must be in the range [0, 100]!");
    return false;
  }
};

static cl::opt<unsigned, false, MFMAPaddingRatioParser>
    MFMAPaddingRatio("amdgpu-mfma-padding-ratio", cl::init(0), cl::Hidden,
                     cl::desc("Fill a percentage of the latency between "
                              "neighboring MFMA with s_nops."));

static cl::opt<unsigned> MaxExhaustiveHazardSearch(
    "amdgpu-max-exhaustive-hazard-search", cl::init(128), cl::Hidden,
    cl::desc("Maximum function size for exhausive hazard search"));
GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF)
    : IsHazardRecognizerMode(false), CurrCycleInstr(nullptr), MF(MF),
      TRI(TII.getRegisterInfo()), TSchedModel(TII.getSchedModel()),
      UseVALUReadHazardExhaustiveSearch(false),
      ClauseUses(TRI.getNumRegUnits()), ClauseDefs(TRI.getNumRegUnits()) {

void GCNHazardRecognizer::Reset() {
  EmittedInstrs.clear();
}
static bool isDivFMas(unsigned Opcode) {
  return Opcode == AMDGPU::V_DIV_FMAS_F32_e64 || Opcode == AMDGPU::V_DIV_FMAS_F64_e64;
}

static bool isSGetReg(unsigned Opcode) { return Opcode == AMDGPU::S_GETREG_B32; }

static bool isSSetReg(unsigned Opcode) {
  switch (Opcode) {
  case AMDGPU::S_SETREG_B32:
  case AMDGPU::S_SETREG_B32_mode:
  case AMDGPU::S_SETREG_IMM32_B32:
  case AMDGPU::S_SETREG_IMM32_B32_mode:
    return true;
  }
  return false;
}

static bool isRWLane(unsigned Opcode) {
  return Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32;
}

static bool isRFE(unsigned Opcode) { return Opcode == AMDGPU::S_RFE_B64; }

static bool isSMovRel(unsigned Opcode) {
  switch (Opcode) {
  case AMDGPU::S_MOVRELS_B32:
  case AMDGPU::S_MOVRELS_B64:
  case AMDGPU::S_MOVRELD_B32:
  case AMDGPU::S_MOVRELD_B64:
    return true;
  }
  return false;
}
static bool isXDL(const GCNSubtarget &ST, const MachineInstr &MI) {
  unsigned Opcode = MI.getOpcode();
      Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
      Opcode == AMDGPU::V_ACCVGPR_READ_B32_e64)
    return false;

  if (!ST.hasGFX940Insts())

static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII,
                                    const MachineInstr &MI) {
  if (TII.isAlwaysGDS(MI.getOpcode()))
    return true;

  switch (MI.getOpcode()) {
  case AMDGPU::S_SENDMSG:
  case AMDGPU::S_SENDMSGHALT:
  case AMDGPU::S_TTRACEDATA:
    return true;
  case AMDGPU::DS_PERMUTE_B32:
  case AMDGPU::DS_BPERMUTE_B32:
    return false;
  default:
    if (TII.isDS(MI.getOpcode())) {
      int GDS = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
                                           AMDGPU::OpName::gds);
      if (MI.getOperand(GDS).getImm())
        return true;
    }
    return false;
  }
}

static bool isPermlane(const MachineInstr &MI) {
  unsigned Opcode = MI.getOpcode();
  return Opcode == AMDGPU::V_PERMLANE16_B32_e64 ||
         Opcode == AMDGPU::V_PERMLANE64_B32 ||
         Opcode == AMDGPU::V_PERMLANEX16_B32_e64 ||
         Opcode == AMDGPU::V_PERMLANE16_VAR_B32_e64 ||
         Opcode == AMDGPU::V_PERMLANEX16_VAR_B32_e64 ||
         Opcode == AMDGPU::V_PERMLANE16_SWAP_B32_e32 ||
         Opcode == AMDGPU::V_PERMLANE16_SWAP_B32_e64 ||
         Opcode == AMDGPU::V_PERMLANE32_SWAP_B32_e32 ||
         Opcode == AMDGPU::V_PERMLANE32_SWAP_B32_e64;
}

static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) {
  const MachineOperand *RegOp =
      TII->getNamedOperand(RegInstr, AMDGPU::OpName::simm16);
ScheduleHazardRecognizer::HazardType
GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
  MachineInstr *MI = SU->getInstr();

  if (checkFPAtomicToDenormModeHazard(MI) > 0)
    return NoopHazard;

      && checkVMEMHazards(MI) > 0)
    return NoopHazard;

  if (isDivFMas(MI->getOpcode()) && checkDivFMasHazards(MI) > 0)
    return NoopHazard;

  if (isRWLane(MI->getOpcode()) && checkRWLaneHazards(MI) > 0)
    return NoopHazard;

  if (isSGetReg(MI->getOpcode()) && checkGetRegHazards(MI) > 0)
    return NoopHazard;

  if (isSSetReg(MI->getOpcode()) && checkSetRegHazards(MI) > 0)
    return NoopHazard;

  if (isRFE(MI->getOpcode()) && checkRFEHazards(MI) > 0)
    return NoopHazard;

        MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
        MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
       MI->readsRegister(AMDGPU::LDS_DIRECT, nullptr))) &&
      checkReadM0Hazards(MI) > 0)
    return NoopHazard;

  if (MI->isInlineAsm() && checkInlineAsmHazards(MI) > 0)
    return NoopHazard;
static void insertNoopsInBundle(MachineInstr *MI, const SIInstrInfo &TII,
                                unsigned Quantity) {
  while (Quantity > 0) {
    unsigned Arg = std::min(Quantity, 8u);

GCNHazardRecognizer::getMFMAPipelineWaitStates(const MachineInstr &MI) const {

void GCNHazardRecognizer::processBundle() {
  for (; MI != E && MI->isInsideBundle(); ++MI) {
    CurrCycleInstr = &*MI;

    if (IsHazardRecognizerMode) {
      fixHazards(CurrCycleInstr);

    for (unsigned i = 0, e = std::min(WaitStates, MaxLookAhead - 1); i < e; ++i)
      EmittedInstrs.push_front(nullptr);

    EmittedInstrs.push_front(CurrCycleInstr);

  CurrCycleInstr = nullptr;
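// Illustrative sketch (not from the original file): the bookkeeping pattern
// used by processBundle above, where a fixed-depth scoreboard records one
// slot per cycle, with nullptr standing in for wait states and a pointer for
// the issued instruction. Names (Scoreboard, advance, FakeInstr) are
// hypothetical; only the push-nullptr-then-push-instruction shape mirrors the
// code above.
#include <algorithm>
#include <cstddef>
#include <deque>

struct FakeInstr { int Id; };

class Scoreboard {
  std::deque<const FakeInstr *> Emitted; // front() is the most recent cycle
  unsigned MaxLookAhead;

public:
  explicit Scoreboard(unsigned LookAhead) : MaxLookAhead(LookAhead) {}

  // Record an instruction that consumed WaitStates cycles before issuing.
  void advance(const FakeInstr *MI, unsigned WaitStates) {
    for (unsigned I = 0, E = std::min(WaitStates, MaxLookAhead - 1); I < E; ++I)
      Emitted.push_front(nullptr);
    Emitted.push_front(MI);
    while (Emitted.size() > MaxLookAhead) // keep only the visible window
      Emitted.pop_back();
  }

  std::size_t depth() const { return Emitted.size(); }
};

int main() {
  Scoreboard SB(5);
  FakeInstr A{0}, B{1};
  SB.advance(&A, 0);
  SB.advance(&B, 3); // three stall cycles, then B
  return SB.depth() == 5 ? 0 : 1;
}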
  assert(IsHazardRecognizerMode);

  if (MI->isInsideBundle())

  IsHazardRecognizerMode = true;

  CurrCycleInstr = nullptr;
unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) {
    return std::max(WaitStates, checkSMRDHazards(MI));

  WaitStates = std::max(WaitStates, checkNSAtoVMEMHazard(MI));
  WaitStates = std::max(WaitStates, checkFPAtomicToDenormModeHazard(MI));
  WaitStates = std::max(WaitStates, checkVMEMHazards(MI));
  WaitStates = std::max(WaitStates, checkVALUHazards(MI));
  WaitStates = std::max(WaitStates, checkDPPHazards(MI));
  WaitStates = std::max(WaitStates, checkDivFMasHazards(MI));
  WaitStates = std::max(WaitStates, checkRWLaneHazards(MI));
  WaitStates = std::max(WaitStates, checkMAIVALUHazards(MI));

  if (MI->isInlineAsm())
    return std::max(WaitStates, checkInlineAsmHazards(MI));

    return std::max(WaitStates, checkGetRegHazards(MI));
    return std::max(WaitStates, checkSetRegHazards(MI));
    return std::max(WaitStates, checkRFEHazards(MI));

       MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
       MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
      MI->readsRegister(AMDGPU::LDS_DIRECT, nullptr)))
    return std::max(WaitStates, checkReadM0Hazards(MI));

    return std::max(WaitStates, checkMAIHazards(MI));
    return std::max(WaitStates, checkMAILdStHazards(MI));
    return std::max(WaitStates, checkPermlaneHazards(MI));
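// Illustrative sketch (not from the original file): PreEmitNoopsCommon-style
// aggregation. Each hazard rule reports how many wait states it still needs,
// and the pass pads with the maximum of those, since a single run of s_nops
// satisfies every rule at once. All names below are hypothetical.
#include <algorithm>
#include <cstdio>
#include <initializer_list>

static unsigned noopsToInsert(std::initializer_list<int> PerRuleWaitStates) {
  int Needed = 0;
  for (int W : PerRuleWaitStates)
    Needed = std::max(Needed, W); // rules are independent; the max covers all
  return static_cast<unsigned>(std::max(0, Needed));
}

int main() {
  // Three rules ask for 0, 2 and 1 wait states; two noops satisfy all of them.
  std::printf("%u\n", noopsToInsert({0, 2, 1}));
  return 0;
}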
void GCNHazardRecognizer::EmitNoop() { EmittedInstrs.push_front(nullptr); }

void GCNHazardRecognizer::AdvanceCycle() {
  if (!CurrCycleInstr) {
    EmittedInstrs.push_front(nullptr);

  if (!NumWaitStates) {
    CurrCycleInstr = nullptr;
    return;
  }

  EmittedInstrs.push_front(CurrCycleInstr);

  EmittedInstrs.push_front(nullptr);

  CurrCycleInstr = nullptr;
}

void GCNHazardRecognizer::RecedeCycle() {
  llvm_unreachable("hazard recognizer does not support bottom-up scheduling.");
}
template <typename StateT>
static bool
hasHazard(StateT State,
          function_ref<HazardFnResult(StateT &, const MachineInstr &)> IsHazard,
          function_ref<void(StateT &, const MachineInstr &)> UpdateState,
          const MachineBasicBlock *MBB,
          MachineBasicBlock::const_reverse_instr_iterator I,
          DenseSet<const MachineBasicBlock *> &Visited) {
    switch (IsHazard(State, *I)) {

    if (I->isInlineAsm() || I->isMetaInstruction())
      continue;

    UpdateState(State, *I);

    if (!Visited.insert(Pred).second)
      continue;
    if (hasHazard(State, IsHazard, UpdateState, Pred, Pred->instr_rbegin(),

static int
getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
                   const MachineBasicBlock *MBB,
                   MachineBasicBlock::const_reverse_instr_iterator I,
                   int WaitStates, IsExpiredFn IsExpired,
                   DenseSet<const MachineBasicBlock *> &Visited,
                   GetNumWaitStatesFn GetNumWaitStates) {
    if (I->isInlineAsm())
      continue;

    WaitStates += GetNumWaitStates(*I);

    if (IsExpired(*I, WaitStates))
      return std::numeric_limits<int>::max();

  int MinWaitStates = std::numeric_limits<int>::max();
    if (!Visited.insert(Pred).second)
      continue;
    int W = getWaitStatesSince(IsHazard, Pred, Pred->instr_rbegin(), WaitStates,
                               IsExpired, Visited, GetNumWaitStates);
    MinWaitStates = std::min(MinWaitStates, W);

  return MinWaitStates;

  return getWaitStatesSince(IsHazard, MI->getParent(),
                            std::next(MI->getReverseIterator()),
                            0, IsExpired, Visited);
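// Illustrative sketch (not from the original file): the core "wait states
// since" query reduced to a single basic block. It walks instructions in
// reverse, accumulating each instruction's wait-state cost, and stops when a
// predicate matches or the search window expires. Names (Instr,
// waitStatesSince) are hypothetical.
#include <functional>
#include <limits>
#include <vector>

struct Instr {
  int Opcode;
  int Cycles; // wait states this instruction accounts for
};

static int waitStatesSince(const std::vector<Instr> &Block, std::size_t Pos,
                           const std::function<bool(const Instr &)> &IsHazard,
                           int Limit) {
  int WaitStates = 0;
  // Scan backwards from the instruction *before* Pos.
  for (std::size_t I = Pos; I-- > 0;) {
    if (IsHazard(Block[I]))
      return WaitStates;
    WaitStates += Block[I].Cycles;
    if (WaitStates >= Limit) // expired: no hazard within the window
      return std::numeric_limits<int>::max();
  }
  return std::numeric_limits<int>::max();
}

int main() {
  std::vector<Instr> Block = {{1, 1}, {2, 1}, {3, 1}, {4, 1}};
  // Distance from the instruction at index 3 back to the most recent opcode 1.
  int W = waitStatesSince(Block, 3, [](const Instr &I) { return I.Opcode == 1; }, 10);
  return W == 2 ? 0 : 1;
}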
int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) {
  if (IsHazardRecognizerMode) {
      return WaitStates >= Limit;
    return ::getWaitStatesSince(IsHazard, CurrCycleInstr, IsExpiredFn);

      if (MI->isInlineAsm())
        continue;

    if (WaitStates >= Limit)
      break;

  return std::numeric_limits<int>::max();
}

int GCNHazardRecognizer::getWaitStatesSinceDef(unsigned Reg,
                                               IsHazardFn IsHazardDef,
                                               int Limit) {
    return IsHazardDef(MI) && MI.modifiesRegister(Reg, TRI);
int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard,
                                                  int Limit) {

int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) {
  bool IsSMRD = TII.isSMRD(*MEM);

  if (ClauseDefs.none())
    return 0;

  return ClauseDefs.anyCommon(ClauseUses) ? 1 : 0;
}
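// Illustrative sketch (not from the original file): the soft-clause test
// above reduces to "do the register units defined so far in the clause
// overlap the units the new memory instruction uses?". Plain std::bitset
// stands in for llvm::BitVector; names are hypothetical.
#include <bitset>
#include <cstdio>

static int softClauseWaitStates(const std::bitset<64> &ClauseDefs,
                                const std::bitset<64> &ClauseUses) {
  if (ClauseDefs.none())
    return 0;
  // One wait state (an s_nop) is enough to break the clause when a def feeds
  // a later use inside the same clause.
  return (ClauseDefs & ClauseUses).any() ? 1 : 0;
}

int main() {
  std::bitset<64> Defs, Uses;
  Defs.set(12); // an earlier load in the clause wrote register unit 12
  Uses.set(12); // the next load addresses with that same unit
  std::printf("%d\n", softClauseWaitStates(Defs, Uses));
  return 0;
}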
int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
  int WaitStatesNeeded = 0;

  WaitStatesNeeded = checkSoftClauseHazards(SMRD);

    return WaitStatesNeeded;

  int SmrdSgprWaitStates = 4;

    int WaitStatesNeededForUse =
        SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
                                                   SmrdSgprWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    int WaitStatesNeededForUse =
        SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(),
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkVMEMHazards(MachineInstr *VMEM) {
  int WaitStatesNeeded = checkSoftClauseHazards(VMEM);

  const int VmemSgprWaitStates = 5;

    int WaitStatesNeededForUse =
        VmemSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
                                                   VmemSgprWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

  return WaitStatesNeeded;
}
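// Illustrative sketch (not from the original file): the arithmetic used by the
// SMRD/VMEM checks above. A memory instruction that reads an SGPR needs N wait
// states after a VALU wrote that SGPR; whatever distance has already elapsed
// is subtracted, and only the remainder must be filled with noops. Names are
// hypothetical.
#include <algorithm>
#include <cstdio>

static int waitStatesNeededForUse(int RequiredWaitStates, int WaitStatesSinceDef) {
  // A non-positive result means the hazard window has already closed.
  return std::max(0, RequiredWaitStates - WaitStatesSinceDef);
}

int main() {
  const int VmemSgprWaitStates = 5; // VMEM reading an SGPR written by a VALU
  // Only two instructions separate the VALU write from the VMEM read, so three
  // more wait states are required.
  std::printf("%d\n", waitStatesNeededForUse(VmemSgprWaitStates, 2));
  return 0;
}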
  int DppVgprWaitStates = 2;
  int DppExecWaitStates = 5;
  int WaitStatesNeeded = 0;
  auto IsHazardDefFn = [TII](const MachineInstr &MI) {
    return TII->isVALU(MI);
  };

    int WaitStatesNeededForUse =
        DppVgprWaitStates - getWaitStatesSinceDef(
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

  WaitStatesNeeded = std::max(
      WaitStatesNeeded,
      DppExecWaitStates - getWaitStatesSinceDef(AMDGPU::EXEC, IsHazardDefFn,
                                                DppExecWaitStates));

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkDivFMasHazards(MachineInstr *DivFMas) {
  const int DivFMasWaitStates = 4;
  auto IsHazardDefFn = [TII](const MachineInstr &MI) {
    return TII->isVALU(MI);
  };
  int WaitStatesNeeded = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn,
                                               DivFMasWaitStates);

  return DivFMasWaitStates - WaitStatesNeeded;
}
int GCNHazardRecognizer::checkGetRegHazards(MachineInstr *GetRegInstr) {
  unsigned GetRegHWReg = getHWReg(TII, *GetRegInstr);
  const int GetRegWaitStates = 2;

  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, GetRegWaitStates);
  return GetRegWaitStates - WaitStatesNeeded;
}

int GCNHazardRecognizer::checkSetRegHazards(MachineInstr *SetRegInstr) {
  unsigned HWReg = getHWReg(TII, *SetRegInstr);

  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, SetRegWaitStates);
  return SetRegWaitStates - WaitStatesNeeded;
}

int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) {
  unsigned Opcode = MI.getOpcode();

    VDataRCID = Desc.operands()[VDataIdx].RegClass;

    const MachineOperand *SOffset =
        TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
        (!SOffset || !SOffset->isReg()))

  if (TII->isMIMG(MI)) {

  if (TII->isFLAT(MI)) {
int
GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def,
                                            MachineRegisterInfo &MRI) {
  int WaitStatesNeeded = 0;

  if (!TRI->isVectorRegister(MRI, Def.getReg()))
    return WaitStatesNeeded;

    int DataIdx = createsVALUHazard(MI);
    return DataIdx >= 0 &&
           TRI->regsOverlap(MI.getOperand(DataIdx).getReg(), Reg);

  int WaitStatesNeededForDef =
      VALUWaitStates - getWaitStatesSince(IsHazardFn, VALUWaitStates);
  WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);

  return WaitStatesNeeded;
static const MachineOperand *
getDstSelForwardingOperand(const MachineInstr &MI, const GCNSubtarget &ST) {
  unsigned Opcode = MI.getOpcode();

    if (auto *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel))
      return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

    if (TII->getNamedImmOperand(MI, AMDGPU::OpName::src0_modifiers) &
      return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

        (TII->getNamedImmOperand(MI, AMDGPU::OpName::src2_modifiers) &
      return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

    return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

static bool consumesDstSelForwardingOperand(const MachineInstr *VALU,
                                            const MachineOperand *Dst,
                                            const SIRegisterInfo *TRI) {
  for (auto &Operand : VALU->operands()) {
    if (Operand.isReg() && TRI->regsOverlap(Dst->getReg(), Operand.getReg())) {
      return true;
int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
  int WaitStatesNeeded = 0;

    const int TransDefWaitstates = 1;
        if (Use.isReg() && TRI->regsOverlap(Def, Use.getReg()))
          return true;

    int WaitStatesNeededForDef =
        TransDefWaitstates -
        getWaitStatesSince(IsTransDefFn, TransDefWaitstates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);

    const int Shift16DefWaitstates = 1;
      if (ProducerMI.isInlineAsm()) {
        for (auto &Def : ProducerMI.all_defs()) {

    int WaitStatesNeededForDef =
        Shift16DefWaitstates -
        getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);

    const int VALUWriteSGPRVALUReadWaitstates = 2;
    const int VALUWriteEXECRWLane = 4;
    const int VALUWriteVGPRReadlaneRead = 1;

      return MI.modifiesRegister(UseReg, TRI);

      int WaitStatesNeededForDef =
          VALUWriteSGPRVALUReadWaitstates -
          getWaitStatesSince(IsVALUDefSGPRFn,
                             VALUWriteSGPRVALUReadWaitstates);
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);

    if (VALU->readsRegister(AMDGPU::VCC, TRI)) {
      int WaitStatesNeededForDef =
          VALUWriteSGPRVALUReadWaitstates -
          getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteSGPRVALUReadWaitstates);
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);

    switch (VALU->getOpcode()) {
    case AMDGPU::V_READLANE_B32:
    case AMDGPU::V_READFIRSTLANE_B32: {
      int WaitStatesNeededForDef =
          VALUWriteVGPRReadlaneRead -
          getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteVGPRReadlaneRead);
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);

    case AMDGPU::V_WRITELANE_B32: {
      int WaitStatesNeededForDef =
          VALUWriteEXECRWLane -
          getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteEXECRWLane);
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);

    return WaitStatesNeeded;

    WaitStatesNeeded =
        std::max(WaitStatesNeeded, checkVALUHazardsHelper(Def, MRI));

  return WaitStatesNeeded;
}
int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) {
  int WaitStatesNeeded = 0;

    if (Op.isReg() && Op.isDef()) {
      if (!TRI.isVectorRegister(MRI, Op.getReg()))
        continue;

      WaitStatesNeeded =
          std::max(WaitStatesNeeded, checkVALUHazardsHelper(Op, MRI));

    const int Shift16DefWaitstates = 1;

    auto IsShift16BitDefFn = [this, &IA](const MachineInstr &ProducerMI) {
        return IA->modifiesRegister(Dst->getReg(), &TRI) ||
               IA->readsRegister(Dst->getReg(), &TRI);

      if (ProducerMI.isInlineAsm()) {
        for (auto &Def : ProducerMI.all_defs()) {
          if (IA->modifiesRegister(Def.getReg(), &TRI) ||
              IA->readsRegister(Def.getReg(), &TRI)) {
            return true;

    int WaitStatesNeededForDef =
        Shift16DefWaitstates -
        getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);

  return WaitStatesNeeded;
}
int GCNHazardRecognizer::checkRWLaneHazards(MachineInstr *RWLane) {
  const MachineOperand *LaneSelectOp =
      TII->getNamedOperand(*RWLane, AMDGPU::OpName::src1);

  const int RWLaneWaitStates = 4;
  int WaitStatesSince = getWaitStatesSinceDef(LaneSelectReg, IsHazardFn,
                                              RWLaneWaitStates);

  return RWLaneWaitStates - WaitStatesSince;
}

int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) {
  const int RFEWaitStates = 1;

  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, RFEWaitStates);
  return RFEWaitStates - WaitStatesNeeded;
}

int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) {
  const int ReadM0WaitStates = 1;
  return ReadM0WaitStates -
         getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn, ReadM0WaitStates);
}
void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
  fixVMEMtoScalarWriteHazards(MI);
  fixVcmpxPermlaneHazards(MI);
  fixSMEMtoVectorWriteHazards(MI);
  fixVcmpxExecWARHazard(MI);
  fixLdsBranchVmemWARHazard(MI);

  fixLdsDirectVALUHazard(MI);
  fixLdsDirectVMEMHazard(MI);

  fixVALUPartialForwardingHazard(MI);
  fixVALUTransUseHazard(MI);

  fixShift64HighRegBug(MI);
  fixVALUMaskWriteHazard(MI);
  fixVALUReadSGPRHazard(MI);
  fixRequiredExportPriority(MI);
}

static bool isVCmpXWritesExec(const SIInstrInfo &TII, const SIRegisterInfo &TRI,
                              const MachineInstr &MI) {
  return (TII.isVOPC(MI) ||
          (MI.isCompare() && (TII.isVOP3(MI) || TII.isSDWA(MI)))) &&
         MI.modifiesRegister(AMDGPU::EXEC, &TRI);
}
bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) {
    unsigned Opc = MI.getOpcode();
           Opc != AMDGPU::V_NOP_e64 && Opc != AMDGPU::V_NOP_sdwa;

      std::numeric_limits<int>::max())
    return false;

  auto *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
  bool IsUndef = Src0->isUndef();
          TII->get(AMDGPU::V_MOV_B32_e32))

bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) {
  if (MI->getNumDefs() == 0)
    return false;

          I.findRegisterUseOperand(Def.getReg(), TRI, false);

           (MI.getOpcode() == AMDGPU::S_WAITCNT &&
            !MI.getOperand(0).getImm()) ||
           (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&

      std::numeric_limits<int>::max())
    return false;

          TII->get(AMDGPU::S_WAITCNT_DEPCTR))
bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) {
  switch (MI->getOpcode()) {
  case AMDGPU::V_READLANE_B32:
  case AMDGPU::V_READFIRSTLANE_B32:
    SDSTName = AMDGPU::OpName::vdst;
    break;
  default:
    SDSTName = AMDGPU::OpName::sdst;
    break;
  }

    for (const auto &MO : MI->implicit_operands()) {
      if (MO.isDef() &&
          TRI->isSGPRClass(TRI->getPhysRegBaseClass(MO.getReg()))) {

    if (TII->isSALU(MI)) {
      switch (MI.getOpcode()) {
      case AMDGPU::S_SETVSKIP:
      case AMDGPU::S_VERSION:
      case AMDGPU::S_WAITCNT_VSCNT:
      case AMDGPU::S_WAITCNT_VMCNT:
      case AMDGPU::S_WAITCNT_EXPCNT:

      case AMDGPU::S_WAITCNT_LGKMCNT:
        return (MI.getOperand(1).getImm() == 0) &&
               (MI.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
      case AMDGPU::S_WAITCNT: {
        const int64_t Imm = MI.getOperand(0).getImm();
        return (Decoded.DsCnt == 0);

    if (TII->isSOPP(MI))

      std::numeric_limits<int>::max())
    return false;

          TII->get(AMDGPU::S_MOV_B32), AMDGPU::SGPR_NULL)
bool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr *MI) {
  if (!MI->modifiesRegister(AMDGPU::EXEC, TRI))
    return false;

    return I.readsRegister(AMDGPU::EXEC, TRI);

    if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst))
      return true;
    for (auto MO : MI.implicit_operands())
      if (MO.isDef() &&
          TRI->isSGPRClass(TRI->getPhysRegBaseClass(MO.getReg())))
        return true;

    if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&

      std::numeric_limits<int>::max())
    return false;

          TII->get(AMDGPU::S_WAITCNT_DEPCTR))

static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF,
                                                 const GCNSubtarget &ST) {
  if (!ST.hasLdsBranchVmemWARHazard())
    return false;

  bool HasLds = false;
  bool HasVmem = false;
  for (auto &MBB : MF) {
    for (auto &MI : MBB) {
      if (HasLds && HasVmem)
        return true;

static bool isStoreCountWaitZero(const MachineInstr &I) {
  return I.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
         I.getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
         !I.getOperand(1).getImm();
}

bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) {
  if (!RunLdsBranchVmemWARHazardFixup)
    return false;

  auto InstType = IsHazardInst(*MI);

    auto InstType2 = IsHazardInst(I);
    return InstType2 && InstType != InstType2;

    auto InstType2 = IsHazardInst(I);
    if (InstType == InstType2)

        std::numeric_limits<int>::max();

      std::numeric_limits<int>::max())

          TII->get(AMDGPU::S_WAITCNT_VSCNT))
bool GCNHazardRecognizer::fixLdsDirectVALUHazard(MachineInstr *MI) {
  const int NoHazardWaitStates = 15;

  bool VisitedTrans = false;
    return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI);

    if (WaitStates >= NoHazardWaitStates)

  auto Count = ::getWaitStatesSince(IsHazardFn, MI->getParent(),
                                    std::next(MI->getReverseIterator()), 0,

  auto *WaitVdstOp = TII.getNamedOperand(*MI, AMDGPU::OpName::waitvdst);
  WaitVdstOp->setImm(std::min(Count, NoHazardWaitStates));

bool GCNHazardRecognizer::fixLdsDirectVMEMHazard(MachineInstr *MI) {
    return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI);

           (I.getOpcode() == AMDGPU::S_WAITCNT && !I.getOperand(0).getImm()) ||
           (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
           !TII.getNamedOperand(I, AMDGPU::OpName::waitvsrc)->getImm());

      std::numeric_limits<int>::max())
    return false;

  if (LdsdirCanWait) {
    TII.getNamedOperand(*MI, AMDGPU::OpName::waitvsrc)->setImm(0);

          TII.get(AMDGPU::S_WAITCNT_DEPCTR))
bool GCNHazardRecognizer::fixVALUPartialForwardingHazard(MachineInstr *MI) {
  if (SrcVGPRs.size() <= 1)
    return false;

  const int Intv1plus2MaxVALUs = 2;
  const int Intv3MaxVALUs = 4;
  const int IntvMaxVALUs = 6;
  const int NoHazardVALUWaitStates = IntvMaxVALUs + 2;

    int ExecPos = std::numeric_limits<int>::max();

    if (State.VALUs > NoHazardVALUWaitStates)
      return HazardExpired;

        (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
      return HazardExpired;

    bool Changed = false;
      if (!State.DefPos.count(Src) && I.modifiesRegister(Src, &TRI)) {
        State.DefPos[Src] = State.VALUs;

    if (State.ExecPos == std::numeric_limits<int>::max()) {
      if (!State.DefPos.empty() && I.modifiesRegister(AMDGPU::EXEC, &TRI)) {
        State.ExecPos = State.VALUs;

    if (State.VALUs > Intv3MaxVALUs && State.DefPos.empty())
      return HazardExpired;

      return NoHazardFound;

    if (State.ExecPos == std::numeric_limits<int>::max())
      return NoHazardFound;

    int PreExecPos = std::numeric_limits<int>::max();
    int PostExecPos = std::numeric_limits<int>::max();

    for (auto Entry : State.DefPos) {
      int DefVALUs = Entry.second;
      if (DefVALUs != std::numeric_limits<int>::max()) {
        if (DefVALUs >= State.ExecPos)
          PreExecPos = std::min(PreExecPos, DefVALUs);
        else
          PostExecPos = std::min(PostExecPos, DefVALUs);

    if (PostExecPos == std::numeric_limits<int>::max())
      return NoHazardFound;

    int Intv3VALUs = PostExecPos;
    if (Intv3VALUs > Intv3MaxVALUs)
      return HazardExpired;

    int Intv2VALUs = (State.ExecPos - PostExecPos) - 1;
    if (Intv2VALUs > Intv1plus2MaxVALUs)
      return HazardExpired;

    if (PreExecPos == std::numeric_limits<int>::max())
      return NoHazardFound;

    int Intv1VALUs = PreExecPos - State.ExecPos;
    if (Intv1VALUs > Intv1plus2MaxVALUs)
      return HazardExpired;

    if (Intv1VALUs + Intv2VALUs > Intv1plus2MaxVALUs)
      return HazardExpired;

  auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {

  if (!hasHazard<StateType>(State, IsHazardFn, UpdateStateFn, MI->getParent(),
                            std::next(MI->getReverseIterator()), Visited))
    return false;

          TII.get(AMDGPU::S_WAITCNT_DEPCTR))
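// Illustrative sketch (not from the original file): the three-way
// HazardFound / HazardExpired / NoHazardFound result pattern used by the
// stateful searches above, reduced to a single block. A backward scan carries
// mutable state, and each step either finds the hazard, declares the window
// expired, or keeps looking. Names (Step, State, classify, searchBackwards)
// are hypothetical.
#include <vector>

enum Result { HazardFound, HazardExpired, NoHazardFound };

struct Step { bool DefinesSrc; bool IsExpiry; };

struct State { int VALUs = 0; bool SawDef = false; };

static Result classify(State &S, const Step &I, int MaxVALUs) {
  if (S.VALUs > MaxVALUs || I.IsExpiry)
    return HazardExpired;            // window closed: stop searching
  if (I.DefinesSrc)
    S.SawDef = true;
  if (S.SawDef)
    return HazardFound;              // interesting producer located
  return NoHazardFound;              // keep walking backwards
}

static bool searchBackwards(const std::vector<Step> &Blk, int MaxVALUs) {
  State S;
  for (auto It = Blk.rbegin(); It != Blk.rend(); ++It) {
    switch (classify(S, *It, MaxVALUs)) {
    case HazardFound:   return true;
    case HazardExpired: return false;
    case NoHazardFound: ++S.VALUs; break; // count the instruction and continue
    }
  }
  return false;
}

int main() {
  // A source def two instructions back, inside the window: hazard reported.
  return searchBackwards({{true, false}, {false, false}, {false, false}}, 4) ? 0 : 1;
}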
bool GCNHazardRecognizer::fixVALUTransUseHazard(MachineInstr *MI) {
  const int IntvMaxVALUs = 5;
  const int IntvMaxTRANS = 1;

    if (State.VALUs > IntvMaxVALUs || State.TRANS > IntvMaxTRANS)
      return HazardExpired;

        (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
         I.getOperand(0).getImm() == 0x0fff))
      return HazardExpired;

      if (I.modifiesRegister(Src, &TRI)) {

    return NoHazardFound;

  auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {

  if (!hasHazard<StateType>(State, IsHazardFn, UpdateStateFn, MI->getParent(),
                            std::next(MI->getReverseIterator()), Visited))
    return false;

          TII.get(AMDGPU::S_WAITCNT_DEPCTR))

      TII->getNamedOperand(*MI, AMDGPU::OpName::src0)->getReg();
      TII->getNamedOperand(*MI, AMDGPU::OpName::src1)->getReg();
      TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg();

    if (TRI->regsOverlap(PrevDstReg, CurSrc0Reg) ||
        TRI->regsOverlap(PrevDstReg, CurSrc1Reg)) {

          TII->getNamedOperand(*MI, AMDGPU::OpName::src2)->getReg();
      if (TRI->regsOverlap(PrevDstReg, CurIndex))
        return true;

      std::numeric_limits<int>::max())
    return false;

  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::V_NOP_e32));
bool GCNHazardRecognizer::fixShift64HighRegBug(MachineInstr *MI) {
  switch (MI->getOpcode()) {
  default:
    return false;
  case AMDGPU::V_LSHLREV_B64_e64:
  case AMDGPU::V_LSHRREV_B64_e64:
  case AMDGPU::V_ASHRREV_I64_e64:
    break;
  }

  if (!TRI.isVGPR(MRI, AmtReg) || ((AmtReg - AMDGPU::VGPR0) & 7) != 7)
    return false;

  if (AmtReg != AMDGPU::VGPR255 && MRI.isPhysRegUsed(AmtReg + 1))
    return false;

  bool OverlappedSrc = Src1->isReg() && TRI.regsOverlap(Src1->getReg(), AmtReg);
  bool OverlappedDst = MI->modifiesRegister(AmtReg, &TRI);
  bool Overlapped = OverlappedSrc || OverlappedDst;

  assert(!OverlappedDst || !OverlappedSrc ||
         Src1->getReg() == MI->getOperand(0).getReg());

  static_assert(AMDGPU::VGPR0 + 1 == AMDGPU::VGPR1);

  for (MCRegister Reg : Overlapped ? AMDGPU::VReg_64_Align2RegClass
                                   : AMDGPU::VGPR_32RegClass) {
    if (!MI->modifiesRegister(Reg, &TRI) && !MI->readsRegister(Reg, &TRI)) {

      NewAmtLo = TRI.getSubReg(NewReg, AMDGPU::sub0);

  MI->getOperand(0).setReg(NewReg);
  if (OverlappedSrc) {
  int NSAtoVMEMWaitStates = 1;

  const auto *Offset = TII->getNamedOperand(*MI, AMDGPU::OpName::offset);

    return Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA &&
           TII->getInstSizeInBytes(I) >= 16;

  return NSAtoVMEMWaitStates - getWaitStatesSince(IsHazardFn, 1);
}

int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(MachineInstr *MI) {
  int FPAtomicToDenormModeWaitStates = 3;

  if (MI->getOpcode() != AMDGPU::S_DENORM_MODE)
    return 0;

    switch (MI.getOpcode()) {
    case AMDGPU::S_WAITCNT:
    case AMDGPU::S_WAITCNT_VSCNT:
    case AMDGPU::S_WAITCNT_VMCNT:
    case AMDGPU::S_WAITCNT_EXPCNT:
    case AMDGPU::S_WAITCNT_LGKMCNT:
    case AMDGPU::S_WAIT_IDLE:
      return true;

  return FPAtomicToDenormModeWaitStates -

int GCNHazardRecognizer::checkMFMAPadding(MachineInstr *MI) {
  int NeighborMFMALatency = 0;
  auto IsNeighboringMFMA = [&NeighborMFMALatency,
                            this](const MachineInstr &MI) {
    NeighborMFMALatency = this->getMFMAPipelineWaitStates(MI);

  const int MaxMFMAPipelineWaitStates = 16;
  int WaitStatesSinceNeighborMFMA =
      getWaitStatesSince(IsNeighboringMFMA, MaxMFMAPipelineWaitStates);

  int NeighborMFMAPaddingNeeded =
      (NeighborMFMALatency * MFMAPaddingRatio / 100) -
      WaitStatesSinceNeighborMFMA;

  return std::max(0, NeighborMFMAPaddingNeeded);
}
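// Illustrative sketch (not from the original file): how the percentage-based
// amdgpu-mfma-padding-ratio option can translate an MFMA's pipeline latency
// and the wait states already elapsed into a number of s_nop cycles still
// needed. Names (computePaddingWaitStates, latency, elapsed) are hypothetical.
#include <algorithm>
#include <cassert>
#include <cstdio>

static int computePaddingWaitStates(int latency, int elapsed, unsigned ratioPercent) {
  assert(ratioPercent <= 100 && "ratio is a percentage");
  // Target a fraction of the producer's latency, then subtract what has
  // already passed; never return a negative padding amount.
  int target = latency * static_cast<int>(ratioPercent) / 100;
  return std::max(0, target - elapsed);
}

int main() {
  // A 16-pass MFMA with a 50% padding ratio and 3 wait states already elapsed
  // would still need 5 s_nop cycles under this model.
  std::printf("%d\n", computePaddingWaitStates(16, 3, 50));
  return 0;
}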
  int WaitStatesNeeded = 0;
  unsigned Opc = MI->getOpcode();

  if (Opc != AMDGPU::V_ACCVGPR_READ_B32_e64) {
    const int LegacyVALUWritesVGPRWaitStates = 2;
    const int VALUWritesExecWaitStates = 4;
    const int MaxWaitStates = 4;

    int WaitStatesNeededForUse = VALUWritesExecWaitStates -
      getWaitStatesSinceDef(AMDGPU::EXEC, IsVALUFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded < MaxWaitStates) {
        const int MaxWaitStates = 2;

        int WaitStatesNeededForUse = LegacyVALUWritesVGPRWaitStates -
          getWaitStatesSinceDef(Use.getReg(), IsVALUFn, MaxWaitStates);
        WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

        if (WaitStatesNeeded == MaxWaitStates)

    if (Op.isDef() && Opc != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
      continue;

    const int MFMAWritesAGPROverlappedSrcABWaitStates = 4;
    const int MFMAWritesAGPROverlappedSrcCWaitStates = 2;
    const int MFMA4x4WritesAGPRAccVgprReadWaitStates = 4;
    const int MFMA16x16WritesAGPRAccVgprReadWaitStates = 10;
    const int MFMA32x32WritesAGPRAccVgprReadWaitStates = 18;
    const int MFMA4x4WritesAGPRAccVgprWriteWaitStates = 1;
    const int MFMA16x16WritesAGPRAccVgprWriteWaitStates = 7;
    const int MFMA32x32WritesAGPRAccVgprWriteWaitStates = 15;
    const int MaxWaitStates = 18;

    unsigned HazardDefLatency = 0;

    auto IsOverlappedMFMAFn = [Reg, &HazardDefLatency,
                               this](const MachineInstr &MI) {
      HazardDefLatency =
          std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
      return TRI.regsOverlap(DstReg, Reg);
    };

    int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn,
                                                   MaxWaitStates);
    int NeedWaitStates = MFMAWritesAGPROverlappedSrcABWaitStates;
    int OpNo = Op.getOperandNo();
    if (OpNo == SrcCIdx) {
      NeedWaitStates = MFMAWritesAGPROverlappedSrcCWaitStates;
    } else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64) {
      switch (HazardDefLatency) {
      case 2:  NeedWaitStates = MFMA4x4WritesAGPRAccVgprReadWaitStates;
               break;
      case 8:  NeedWaitStates = MFMA16x16WritesAGPRAccVgprReadWaitStates;
               break;
      case 16: [[fallthrough]];
      default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprReadWaitStates;
      }
    } else if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
      switch (HazardDefLatency) {
      case 2:  NeedWaitStates = MFMA4x4WritesAGPRAccVgprWriteWaitStates;
               break;
      case 8:  NeedWaitStates = MFMA16x16WritesAGPRAccVgprWriteWaitStates;
               break;
      case 16: [[fallthrough]];
      default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprWriteWaitStates;
      }
    }

    int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded == MaxWaitStates)
      return WaitStatesNeeded;

      if (MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
        return false;
      return TRI.regsOverlap(Reg, DstReg);

    const int AccVGPRWriteMFMAReadSrcCWaitStates = 1;
    const int AccVGPRWriteMFMAReadSrcABWaitStates = 3;
    const int AccVGPRWriteAccVgprReadWaitStates = 3;
    NeedWaitStates = AccVGPRWriteMFMAReadSrcABWaitStates;
    if (OpNo == SrcCIdx)
      NeedWaitStates = AccVGPRWriteMFMAReadSrcCWaitStates;
    else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64)
      NeedWaitStates = AccVGPRWriteAccVgprReadWaitStates;

    WaitStatesNeededForUse = NeedWaitStates -
      getWaitStatesSinceDef(Reg, IsAccVgprWriteFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded == MaxWaitStates)
      return WaitStatesNeeded;

  if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
    const int MFMA4x4ReadSrcCAccVgprWriteWaitStates = 0;
    const int MFMA16x16ReadSrcCAccVgprWriteWaitStates = 5;
    const int MFMA32x32ReadSrcCAccVgprWriteWaitStates = 13;
    const int MaxWaitStates = 13;
    Register DstReg = MI->getOperand(0).getReg();
    unsigned HazardDefLatency = 0;

    auto IsSrcCMFMAFn = [DstReg, &HazardDefLatency,
                         this](const MachineInstr &MI) {
      HazardDefLatency =
          std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
      return TRI.regsOverlap(Reg, DstReg);
    };

    int WaitStatesSince = getWaitStatesSince(IsSrcCMFMAFn, MaxWaitStates);
    switch (HazardDefLatency) {
    case 2:  NeedWaitStates = MFMA4x4ReadSrcCAccVgprWriteWaitStates;
             break;
    case 8:  NeedWaitStates = MFMA16x16ReadSrcCAccVgprWriteWaitStates;
             break;
    case 16: [[fallthrough]];
    default: NeedWaitStates = MFMA32x32ReadSrcCAccVgprWriteWaitStates;
    }

    int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSince;
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }

  WaitStatesNeeded = std::max(WaitStatesNeeded, checkMFMAPadding(MI));

  return WaitStatesNeeded;
// Return statements from the GFX940/GFX950 N-pass wait-state helpers:
  return NumPasses + 1 + IsGFX950;
  return NumPasses + 1 + (NumPasses != 2 && IsGFX950);
  return NumPasses + 2;
  return NumPasses + 3;

  int WaitStatesNeeded = 0;
  unsigned Opc = MI->getOpcode();

    return WaitStatesNeeded;
  const int VALUWritesExecWaitStates = 4;
  int WaitStatesNeededForUse = VALUWritesExecWaitStates -
    getWaitStatesSinceDef(AMDGPU::EXEC, IsLegacyVALUFn,
                          VALUWritesExecWaitStates);
  WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

  const int LegacyVALUNotDotWritesVGPRWaitStates = 2;
  const int SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates = 2;
  const int SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates = 8;
  const int SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates = 16;
  const int SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates = 3;
  const int SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates = 9;
  const int SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates = 17;
  const int DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 9;
  const int GFX950_DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 17;
  const int DMFMA4x4WritesVGPROverlappedSrcCWaitStates = 4;
  const int SMFMA4x4WritesVGPROverlappedSrcABWaitStates = 5;
  const int SMFMA16x16WritesVGPROverlappedSrcABWaitStates = 11;
  const int SMFMA32x32WritesVGPROverlappedSrcABWaitStates = 19;
  const int DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates = 6;
  const int DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 11;
  const int GFX950_DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 19;
  const int DMFMA4x4WritesVGPRFullSrcCWaitStates = 4;
  const int GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates = 2;
  const int MaxWaitStates = 19;

    auto IsOverlappedMFMAFn = [Reg, &FullReg, &MI1,
                               this](const MachineInstr &MI) {
      FullReg = (DstReg == Reg);
      return TRI.regsOverlap(DstReg, Reg);
    };

    WaitStatesNeededForUse = LegacyVALUNotDotWritesVGPRWaitStates -
      getWaitStatesSinceDef(Reg, IsLegacyVALUNotDotFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    int NumWaitStates =
        getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn, MaxWaitStates);
    if (NumWaitStates == std::numeric_limits<int>::max())
      continue;

    int NeedWaitStates = 0;
    if (OpNo == SrcCIdx) {
    } else if (FullReg) {
      if ((Opc == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
           Opc == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64) &&
          (Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
           Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64))
        NeedWaitStates = DMFMA4x4WritesVGPRFullSrcCWaitStates;
               TSchedModel.computeInstrLatency(MI1) == 2)
        NeedWaitStates = GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates;

      case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
      case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
      case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
      case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
        NeedWaitStates =
            ST.hasGFX950Insts()
                ? GFX950_DMFMA16x16WritesVGPROverlappedSrcCWaitStates
                : DMFMA16x16WritesVGPROverlappedSrcCWaitStates;
        break;
      case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
      case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
        NeedWaitStates = DMFMA4x4WritesVGPROverlappedSrcCWaitStates;
        break;
      default:
        int NumPasses = TSchedModel.computeInstrLatency(MI1);
                NumPasses, ST.hasGFX950Insts())

        switch (NumPasses) {
        case 2:
          NeedWaitStates =
              isDGEMM(Opc) ? SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates
                           : SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates;
          break;
        case 8:
          NeedWaitStates =
              isDGEMM(Opc) ? SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates
                           : SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates;
          break;
        case 16:
          NeedWaitStates =
              isDGEMM(Opc) ? SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates
                           : SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates;
          break;

      case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
      case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
      case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
      case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
        NeedWaitStates =
            ST.hasGFX950Insts()
                ? GFX950_DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates
                : DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates;
        break;
      case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
      case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
        NeedWaitStates = DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates;
        break;
      default:
        int NumPasses = TSchedModel.computeInstrLatency(MI1);

        switch (NumPasses) {
        case 2:
          NeedWaitStates = SMFMA4x4WritesVGPROverlappedSrcABWaitStates;
          break;
        case 8:
          NeedWaitStates = SMFMA16x16WritesVGPROverlappedSrcABWaitStates;
          break;
        case 16:
          NeedWaitStates = SMFMA32x32WritesVGPROverlappedSrcABWaitStates;
          break;

    if (WaitStatesNeeded >= NeedWaitStates)
      continue;

    WaitStatesNeededForUse = NeedWaitStates - NumWaitStates;
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded == MaxWaitStates)
      break;

  WaitStatesNeeded = std::max(WaitStatesNeeded, checkMFMAPadding(MI));

  return WaitStatesNeeded;
  int WaitStatesNeeded = 0;

    return MI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64;

  const int AccVgprReadLdStWaitStates = 2;
  const int VALUWriteAccVgprRdWrLdStDepVALUWaitStates = 1;
  const int MaxWaitStates = 2;

    int WaitStatesNeededForUse = AccVgprReadLdStWaitStates -
      getWaitStatesSinceDef(Reg, IsAccVgprReadFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded == MaxWaitStates)
      return WaitStatesNeeded;

      if (MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64 &&
          MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
        return false;

      return getWaitStatesSinceDef(Reg, IsVALUFn, 2) <
             std::numeric_limits<int>::max();

    WaitStatesNeededForUse = VALUWriteAccVgprRdWrLdStDepVALUWaitStates -
      getWaitStatesSince(IsVALUAccVgprRdWrCheckFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

  return WaitStatesNeeded;
         "this is a different vcmpx+permlane hazard");

  const int VCmpXWritesExecWaitStates = 4;
  const int VALUWritesVDstWaitStates = 2;
  int WaitStatesNeeded = 0;

  int WaitStatesSinceDef =
      VALUWritesVDstWaitStates -
      getWaitStatesSinceDef(Reg, IsVALUFn,
                            VALUWritesVDstWaitStates);
  WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesSinceDef);
  if (WaitStatesNeeded >= VALUWritesVDstWaitStates)

  int VCmpXHazardWaits =
      VCmpXWritesExecWaitStates -
      getWaitStatesSince(IsVCmpXWritesExecFn, VCmpXWritesExecWaitStates);

  WaitStatesNeeded = std::max(WaitStatesNeeded, VCmpXHazardWaits);
  return WaitStatesNeeded;
}

// Return statements from further GFX940 N-pass wait-state helpers:
  return NumPasses + 2;
  return NumPasses + 3;
  return NumPasses + 3;
  return NumPasses + 2;
  int WaitStatesNeeded = 0;

        !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))

        !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))

  bool DGEMMAfterVALUWrite = false;
  auto IsDGEMMHazard = [&DGEMMAfterVALUWrite, this](const MachineInstr &MI) {
      DGEMMAfterVALUWrite = true;

    if (!TII.isVALU(MI) || !DGEMMAfterVALUWrite)
      return false;

                                           AMDGPU::OpName::src2);

  if (IsMemOrExport || IsVALU) {
    const int SMFMA4x4WriteVgprVALUMemExpReadWaitStates = 5;
    const int SMFMA16x16WriteVgprVALUMemExpReadWaitStates = 11;
    const int SMFMA32x32WriteVgprVALUMemExpReadWaitStates = 19;
    const int DMFMA4x4WriteVgprMemExpReadWaitStates = 9;
    const int DMFMA16x16WriteVgprMemExpReadWaitStates = 18;
    const int DMFMA4x4WriteVgprVALUReadWaitStates = 6;
    const int DMFMA16x16WriteVgprVALUReadWaitStates = 11;
    const int GFX950_DMFMA16x16WriteVgprVALUReadWaitStates = 19;
    const int DotWriteSameDotReadSrcAB = 3;
    const int DotWriteDifferentVALURead = 3;
    const int DMFMABetweenVALUWriteVMEMRead = 2;
    const int MaxWaitStates = 19;

      int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
                                                     MaxWaitStates);

      int NeedWaitStates = 0;
      if (DOT->getOpcode() == MI->getOpcode()) {
        if (&Use - &MI->getOperand(0) != SrcCIdx)
          NeedWaitStates = DotWriteSameDotReadSrcAB;
      } else {
        NeedWaitStates = DotWriteDifferentVALURead;
      }

      int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

      DGEMMAfterVALUWrite = false;
      if (TRI.isVectorRegister(MRI, Reg)) {
        int WaitStatesNeededForUse =
            DMFMABetweenVALUWriteVMEMRead -
            getWaitStatesSinceDef(Reg, IsDGEMMHazard,
                                  DMFMABetweenVALUWriteVMEMRead);

        WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

      WaitStatesSinceDef =
          getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates);

      unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
      int NumPasses = HazardDefLatency;
      int NeedWaitStates = MaxWaitStates;

      if (isDGEMM(MFMA->getOpcode())) {
        switch (HazardDefLatency) {
          NeedWaitStates = IsMemOrExport ? DMFMA4x4WriteVgprMemExpReadWaitStates
                                         : DMFMA4x4WriteVgprVALUReadWaitStates;

          NeedWaitStates =
              IsMemOrExport
                  ? DMFMA16x16WriteVgprMemExpReadWaitStates
                  : (ST.hasGFX950Insts()
                         ? GFX950_DMFMA16x16WriteVgprVALUReadWaitStates
                         : DMFMA16x16WriteVgprVALUReadWaitStates);

        switch (HazardDefLatency) {
          NeedWaitStates = SMFMA4x4WriteVgprVALUMemExpReadWaitStates;
          NeedWaitStates = SMFMA16x16WriteVgprVALUMemExpReadWaitStates;
          NeedWaitStates = SMFMA32x32WriteVgprVALUMemExpReadWaitStates;

      int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

      if (WaitStatesNeeded == MaxWaitStates)

    unsigned Opc = MI->getOpcode();
    const int DMFMAToFMA64WaitStates = 2;
    if ((Opc == AMDGPU::V_FMA_F64_e64 ||
         Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64 ||
         Opc == AMDGPU::V_FMAC_F64_dpp) &&
        WaitStatesNeeded < DMFMAToFMA64WaitStates) {
      int WaitStatesNeededForUse = DMFMAToFMA64WaitStates -
        getWaitStatesSince(IsDGEMMFn, DMFMAToFMA64WaitStates);
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
    }

  if (!IsVALU && !IsMemOrExport)
    return WaitStatesNeeded;
  const int SMFMA4x4WriteVgprVALUWawWaitStates = 5;
  const int SMFMA16x16WriteVgprVALUWawWaitStates = 11;
  const int SMFMA32x32WriteVgprVALUWawWaitStates = 19;
  const int SMFMA4x4ReadVgprVALUWarWaitStates = 1;
  const int GFX940_XDL4PassReadVgprVALUWarWaitStates = 3;
  const int SMFMA16x16ReadVgprVALUWarWaitStates = 7;
  const int SMFMA32x32ReadVgprVALUWarWaitStates = 15;
  const int DMFMA4x4WriteVgprVALUWriteWaitStates = 6;
  const int DMFMA16x16WriteVgprVALUWriteWaitStates = 11;
  const int DotWriteDifferentVALUWrite = 3;
  const int MaxWaitStates = 19;
  const int MaxWarWaitStates = 15;

    int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
                                                   MaxWaitStates);
    if (DOT && DOT->getOpcode() != MI->getOpcode())
      WaitStatesNeeded = std::max(WaitStatesNeeded, DotWriteDifferentVALUWrite -
                                                    WaitStatesSinceDef);

    WaitStatesSinceDef =
        getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates);

      int NeedWaitStates = MaxWaitStates;
      int NumPasses = TSchedModel.computeInstrLatency(MFMA);

      if (isDGEMM(MFMA->getOpcode())) {
        switch (NumPasses) {
          NeedWaitStates = DMFMA4x4WriteVgprVALUWriteWaitStates;
          NeedWaitStates = DMFMA16x16WriteVgprVALUWriteWaitStates;

        switch (NumPasses) {
          NeedWaitStates = SMFMA4x4WriteVgprVALUWawWaitStates;
          NeedWaitStates = SMFMA16x16WriteVgprVALUWawWaitStates;
          NeedWaitStates = SMFMA32x32WriteVgprVALUWawWaitStates;

      int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

      if (WaitStatesNeeded == MaxWaitStates)

        !MI.readsRegister(Reg, &TRI))

        TII.getNamedOperand(MI, AMDGPU::OpName::src2);

    int WaitStatesSinceUse = getWaitStatesSince(IsSMFMAReadAsCFn,
                                                MaxWarWaitStates);

    unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
    int NeedWaitStates = MaxWaitStates;
    switch (HazardDefLatency) {
    case 2:  NeedWaitStates = SMFMA4x4ReadVgprVALUWarWaitStates;
               NeedWaitStates = GFX940_XDL4PassReadVgprVALUWarWaitStates;
    case 8:  NeedWaitStates = SMFMA16x16ReadVgprVALUWarWaitStates;
    case 16: [[fallthrough]];
    default: NeedWaitStates = SMFMA32x32ReadVgprVALUWarWaitStates;
    }

    int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceUse;
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

  return WaitStatesNeeded;
bool GCNHazardRecognizer::ShouldPreferAnother(SUnit *SU) {
    return MAI != nullptr;

  if (IsMFMAFn(*MI)) {
    int W = getWaitStatesSince(IsMFMAFn, 16);
      return W < (int)TSchedModel.computeInstrLatency(MAI);

static void updateGetPCBundle(MachineInstr *NewMI) {
  while (I->isBundledWithPred())
    --I;

  if (I->getOpcode() != AMDGPU::S_GETPC_B64)
    return;

  const unsigned NewBytes = 4;
         "Unexpected instruction insertion in bundle");

  while (NextMI != End && NextMI->isBundledWithPred()) {
    for (auto &Operand : NextMI->operands()) {
      if (Operand.isGlobal())
        Operand.setOffset(Operand.getOffset() + NewBytes);
bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {
  if (!SDSTOp || !SDSTOp->isReg())
    return false;

  if (HazardReg == AMDGPU::EXEC ||
      HazardReg == AMDGPU::EXEC_LO ||
      HazardReg == AMDGPU::EXEC_HI ||
      HazardReg == AMDGPU::M0)
    return false;

    switch (I.getOpcode()) {
    case AMDGPU::V_ADDC_U32_e32:
    case AMDGPU::V_ADDC_U32_dpp:
    case AMDGPU::V_CNDMASK_B16_fake16_e32:
    case AMDGPU::V_CNDMASK_B16_fake16_dpp:
    case AMDGPU::V_CNDMASK_B32_e32:
    case AMDGPU::V_CNDMASK_B32_dpp:
    case AMDGPU::V_DIV_FMAS_F32_e64:
    case AMDGPU::V_DIV_FMAS_F64_e64:
    case AMDGPU::V_SUBB_U32_e32:
    case AMDGPU::V_SUBB_U32_dpp:
    case AMDGPU::V_SUBBREV_U32_e32:
    case AMDGPU::V_SUBBREV_U32_dpp:
      // These read the carry/mask implicitly through VCC.
      return HazardReg == AMDGPU::VCC ||
             HazardReg == AMDGPU::VCC_LO ||
             HazardReg == AMDGPU::VCC_HI;
    case AMDGPU::V_ADDC_U32_e64:
    case AMDGPU::V_ADDC_U32_e64_dpp:
    case AMDGPU::V_CNDMASK_B16_fake16_e64:
    case AMDGPU::V_CNDMASK_B16_fake16_e64_dpp:
    case AMDGPU::V_CNDMASK_B32_e64:
    case AMDGPU::V_CNDMASK_B32_e64_dpp:
    case AMDGPU::V_SUBB_U32_e64:
    case AMDGPU::V_SUBB_U32_e64_dpp:
    case AMDGPU::V_SUBBREV_U32_e64:
    case AMDGPU::V_SUBBREV_U32_e64_dpp: {
      return TRI.regsOverlap(SSRCOp->getReg(), HazardReg);
    }

    if (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&

    for (int OpNo = 0, End = I.getNumOperands(); OpNo < End; ++OpNo) {

      if (OpReg == AMDGPU::EXEC ||
          OpReg == AMDGPU::EXEC_LO ||
          OpReg == AMDGPU::EXEC_HI)

      if (Op.isImplicit()) {
        if (OpReg == AMDGPU::VCC ||
            OpReg == AMDGPU::VCC_LO ||
            OpReg == AMDGPU::VCC_HI)

      if (TRI.isSGPRReg(MRI, OpReg))

      if (!TII.isInlineConstant(Op, OpInfo))

      std::numeric_limits<int>::max())
    return false;

  auto NextMI = std::next(MI->getIterator());

  auto NewMI = BuildMI(*MI->getParent(), NextMI, MI->getDebugLoc(),
                       TII.get(AMDGPU::S_WAITCNT_DEPCTR))
static std::optional<unsigned> sgprPairNumber(Register Reg,
                                              const SIRegisterInfo &TRI) {
  switch (Reg) {
  case AMDGPU::EXEC_LO:
  case AMDGPU::EXEC_HI:
  case AMDGPU::SGPR_NULL:
  case AMDGPU::SGPR_NULL64:
    return {};
  default:
    break;
  }
  unsigned RegN = TRI.getEncodingValue(Reg);
  return (RegN >> 1) & 0x3f;
}
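// Illustrative sketch (not from the original file): the pairing trick used by
// sgprPairNumber above. Consecutive SGPRs share a hazard-tracking slot, so an
// SGPR's hardware encoding is halved and masked to index one of 64 pair
// entries. Plain integers stand in for Register/SIRegisterInfo; the function
// name below is hypothetical.
#include <cassert>
#include <cstdio>

static unsigned sgprPairIndex(unsigned EncodingValue) {
  return (EncodingValue >> 1) & 0x3f; // s0/s1 -> 0, s2/s3 -> 1, ...
}

int main() {
  assert(sgprPairIndex(0) == sgprPairIndex(1)); // s0 and s1 share a slot
  assert(sgprPairIndex(2) == 1);
  std::printf("s10 and s11 map to pair %u\n", sgprPairIndex(10));
  return 0;
}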
void GCNHazardRecognizer::computeVALUHazardSGPRs(MachineFunction *MMF) {
  if (!VALUReadHazardSGPRs.empty())
    return;

  UseVALUReadHazardExhaustiveSearch =

  bool UseVALUUseCache =
  VALUReadHazardSGPRs.resize(64, !UseVALUUseCache);
  if (!UseVALUUseCache)
    return;

  BitVector SALUWriteSGPRs(64), ReadSGPRs(64);

      if (!IsVALU && !IsSALU)
        continue;

        if (Op.isImplicit() && !(Reg == AMDGPU::VCC_LO ||
                                 Reg == AMDGPU::VCC_HI || Reg == AMDGPU::VCC))
          continue;

        if (!TRI.isSGPRReg(MRI, Reg))
          continue;

        if (IsVALU && Op.isUse()) {
          if (InCycle || (ReadSGPRs[*RegN] && SALUWriteSGPRs[*RegN]))
            VALUReadHazardSGPRs.set(*RegN);
          ReadSGPRs.set(*RegN);
        } else if (IsSALU) {
            SALUWriteSGPRs.set(*RegN);
            ReadSGPRs.set(*RegN);
bool GCNHazardRecognizer::fixVALUReadSGPRHazard(MachineInstr *MI) {
  if (!(MIIsSALU || MIIsVALU))
    return false;

  const MachineOperand *SDSTOp =
      TII.getNamedOperand(*MI, AMDGPU::OpName::sdst);
  if (!SDSTOp || !SDSTOp->isReg())
    return false;

  if (HazardReg == AMDGPU::EXEC || HazardReg == AMDGPU::EXEC_LO ||
      HazardReg == AMDGPU::EXEC_HI || HazardReg == AMDGPU::M0)
    return false;

  auto NextMI = std::next(MI->getIterator());
  auto NewMI = BuildMI(*MI->getParent(), NextMI, MI->getDebugLoc(),
                       TII.get(AMDGPU::S_WAITCNT_DEPCTR))

  computeVALUHazardSGPRs(MI->getMF());

  if (VALUReadHazardSGPRs.none())
    return false;

  const bool IsSetPC = (MI->isCall() || MI->isReturn()) &&
                       !(MI->getOpcode() == AMDGPU::S_ENDPGM ||
                         MI->getOpcode() == AMDGPU::S_ENDPGM_SAVED);

    if (MIIsSALU && Op.isImplicit())
      continue;

    if (!TRI.isSGPRReg(MRI, OpReg))
      continue;

    if (!VALUReadHazardSGPRs[*RegN])
      continue;

  if (SGPRsUsed.empty())
    return false;

    if (IsSetPC && I.getNumDefs() > 0)
      return true;

      return I.modifiesRegister(Reg, &TRI);

    if (Count >= SALUExpiryCount)
      return true;

    if (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&

  auto WaitStatesFn = [this, &SGPRsUsed](const MachineInstr &I) {

                     [this, &I](Register Reg) {
                       return I.readsRegister(Reg, &TRI);
                     }))

  int WaitStates = ::getWaitStatesSince(IsHazardFn, MI->getParent(),
                                        std::next(MI->getReverseIterator()), 0,

  if (WaitStates >= SALUExpiryCount)
    return false;

  if (UseVALUReadHazardExhaustiveSearch) {
      if (Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO || Reg == AMDGPU::VCC_HI)
      return Register(AMDGPU::SGPR0_SGPR1 + *RegN);

    auto SearchHazardFn = [this, hazardPair,
                           &SGPRsUsed](const MachineInstr &I) {
        return I.readsRegister(hazardPair(Reg), &TRI);

    auto SearchExpiredFn = [&](const MachineInstr &I, int Count) {

    if (::getWaitStatesSince(SearchHazardFn, MI, SearchExpiredFn) ==
        std::numeric_limits<int>::max())
      return false;

  auto NewMI = BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
                       TII.get(AMDGPU::S_WAITCNT_DEPCTR))

static bool ensureEntrySetPrio(MachineFunction *MF, int Priority,
                               const SIInstrInfo &TII) {
  if (EntryMBB.begin() != EntryMBB.end()) {
    auto &EntryMI = *EntryMBB.begin();
    if (EntryMI.getOpcode() == AMDGPU::S_SETPRIO &&
        EntryMI.getOperand(0).getImm() >= Priority)
      return false;
bool GCNHazardRecognizer::fixRequiredExportPriority(MachineInstr *MI) {
  const int MaxPriority = 3;
  const int NormalPriority = 2;
  const int PostExportPriority = 0;

  auto It = MI->getIterator();
  switch (MI->getOpcode()) {
  case AMDGPU::S_ENDPGM:
  case AMDGPU::S_ENDPGM_SAVED:
  case AMDGPU::S_ENDPGM_ORDERED_PS_DONE:
  case AMDGPU::SI_RETURN_TO_EPILOG:

  case AMDGPU::S_SETPRIO: {
    auto &PrioOp = MI->getOperand(0);
    int Prio = PrioOp.getImm();
    bool InWA = (Prio == PostExportPriority) &&
                (It != MBB->begin() && TII.isEXP(*std::prev(It)));
    if (InWA || Prio >= NormalPriority)
      return false;
    PrioOp.setImm(std::min(Prio + NormalPriority, MaxPriority));
    return true;
  }

  if (!TII.isEXP(*MI))
    return false;

  bool Changed = false;

  auto NextMI = std::next(It);
  bool EndOfShader = false;
  if (NextMI != MBB->end()) {
    if (TII.isEXP(*NextMI))

    if (NextMI->getOpcode() == AMDGPU::S_SETPRIO &&
        NextMI->getOperand(0).getImm() == PostExportPriority)

    EndOfShader = NextMI->getOpcode() == AMDGPU::S_ENDPGM;
  }

      .addImm(PostExportPriority);

      .addReg(AMDGPU::SGPR_NULL)
unsigned const MachineRegisterInfo * MRI
Provides AMDGPU specific target descriptions.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Analysis containing CSE Info
static cl::opt< unsigned, false, MFMAPaddingRatioParser > MFMAPaddingRatio("amdgpu-mfma-padding-ratio", cl::init(0), cl::Hidden, cl::desc("Fill a percentage of the latency between " "neighboring MFMA with s_nops."))
static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF, const GCNSubtarget &ST)
static bool consumesDstSelForwardingOperand(const MachineInstr *VALU, const MachineOperand *Dst, const SIRegisterInfo *TRI)
Checks whether the provided MI "consumes" the operand with a Dest sel fowarding issue Dst .
static bool isSGetReg(unsigned Opcode)
static bool breaksSMEMSoftClause(MachineInstr *MI)
static bool isXDL(const GCNSubtarget &ST, const MachineInstr &MI)
static bool isLdsDma(const MachineInstr &MI)
static bool isRFE(unsigned Opcode)
static bool isRWLane(unsigned Opcode)
static std::optional< unsigned > sgprPairNumber(Register Reg, const SIRegisterInfo &TRI)
static bool isSMovRel(unsigned Opcode)
static const MachineOperand * getDstSelForwardingOperand(const MachineInstr &MI, const GCNSubtarget &ST)
Dest sel forwarding issue occurs if additional logic is needed to swizzle / pack the computed value i...
static int GFX940_XDL_N_PassWritesVGPROverlappedSGEMMDGEMMSrcCWaitStates(int NumPasses, bool IsGFX950)
static void updateGetPCBundle(MachineInstr *NewMI)
static bool isStoreCountWaitZero(const MachineInstr &I)
static bool breaksVMEMSoftClause(MachineInstr *MI)
static bool isVCmpXWritesExec(const SIInstrInfo &TII, const SIRegisterInfo &TRI, const MachineInstr &MI)
static bool isSSetReg(unsigned Opcode)
static bool hasHazard(StateT State, function_ref< HazardFnResult(StateT &, const MachineInstr &)> IsHazard, function_ref< void(StateT &, const MachineInstr &)> UpdateState, const MachineBasicBlock *MBB, MachineBasicBlock::const_reverse_instr_iterator I, DenseSet< const MachineBasicBlock * > &Visited)
static void addRegUnits(const SIRegisterInfo &TRI, BitVector &BV, MCRegister Reg)
static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr)
static bool isDivFMas(unsigned Opcode)
static int GFX940_XDL_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses)
enum { HazardFound, HazardExpired, NoHazardFound } HazardFnResult
static int GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(int NumPasses)
static int GFX940_SMFMA_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses)
static bool isDGEMM(unsigned Opcode)
static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard, const MachineBasicBlock *MBB, MachineBasicBlock::const_reverse_instr_iterator I, int WaitStates, IsExpiredFn IsExpired, DenseSet< const MachineBasicBlock * > &Visited, GetNumWaitStatesFn GetNumWaitStates=SIInstrInfo::getNumWaitStates)
static int GFX940_SMFMA_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses)
static int GFX940_SMFMA_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(int NumPasses)
static bool ensureEntrySetPrio(MachineFunction *MF, int Priority, const SIInstrInfo &TII)
static void addRegsToSet(const SIRegisterInfo &TRI, iterator_range< MachineInstr::const_mop_iterator > Ops, BitVector &DefSet, BitVector &UseSet)
static void insertNoopsInBundle(MachineInstr *MI, const SIInstrInfo &TII, unsigned Quantity)
static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII, const MachineInstr &MI)
static cl::opt< unsigned > MaxExhaustiveHazardSearch("amdgpu-max-exhaustive-hazard-search", cl::init(128), cl::Hidden, cl::desc("Maximum function size for exhausive hazard search"))
static bool isPermlane(const MachineInstr &MI)
static int GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(int NumPasses)
static int GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses)
static int GFX940_XDL_N_PassWritesVGPROverlappedXDLOrSMFMASrcCWaitStates(int NumPasses, bool IsGFX950)
AMD GCN specific subclass of TargetSubtarget.
static Register UseReg(const MachineOperand &MO)
const HexagonInstrInfo * TII
unsigned const TargetRegisterInfo * TRI
This file builds on the ADT/GraphTraits.h file to build a generic graph post order iterator.
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static const uint32_t IV[8]
void resize(unsigned N, bool t=false)
resize - Grow or shrink the bitvector.
bool anyCommon(const BitVector &RHS) const
Test if any common bits are set.
bool none() const
none - Returns true if none of the bits are set.
bool empty() const
empty - Tests whether there are no bits in this bitvector.
This class represents an Operation in the Expression.
Implements a dense probed hash-table based set.
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
void EmitNoop() override
EmitNoop - This callback is invoked when a noop was added to the instruction stream.
void Reset() override
Reset - This callback is invoked when a new block of instructions is about to be schedule.
unsigned PreEmitNoops(MachineInstr *) override
This overload will be used when the hazard recognizer is being used by a non-scheduling pass,...
void EmitInstruction(SUnit *SU) override
EmitInstruction - This callback is invoked when an instruction is emitted, to advance the hazard stat...
function_ref< bool(const MachineInstr &)> IsHazardFn
void AdvanceCycle() override
AdvanceCycle - This callback is invoked whenever the next top-down instruction to be scheduled cannot...
unsigned PreEmitNoopsCommon(MachineInstr *)
bool ShouldPreferAnother(SUnit *SU) override
ShouldPreferAnother - This callback may be invoked if getHazardType returns NoHazard.
HazardType getHazardType(SUnit *SU, int Stalls) override
getHazardType - Return the hazard type of emitting this node.
GCNHazardRecognizer(const MachineFunction &MF)
void RecedeCycle() override
RecedeCycle - This callback is invoked whenever the next bottom-up instruction to be scheduled cannot...
bool hasShift64HighRegBug() const
bool hasFPAtomicToDenormModeHazard() const
bool hasLdsBranchVmemWARHazard() const
bool hasGFX90AInsts() const
bool hasDstSelForwardingHazard() const
bool hasCvtScaleForwardingHazard() const
const SIInstrInfo * getInstrInfo() const override
bool hasVALUMaskWriteHazard() const
bool needsAlignedVGPRs() const
Return if operations acting on VGPR tuples require even alignment.
bool hasVcmpxExecWARHazard() const
bool hasReadM0MovRelInterpHazard() const
const SIRegisterInfo * getRegisterInfo() const override
bool hasRequiredExportPriority() const
bool hasLdsWaitVMSRC() const
bool hasExtendedWaitCounts() const
bool hasVcmpxPermlaneHazard() const
bool hasGFX950Insts() const
bool has12DWordStoreHazard() const
bool hasVALUPartialForwardingHazard() const
bool hasNoDataDepHazard() const
unsigned getSetRegWaitStates() const
Number of hazard wait states for s_setreg_b32/s_setreg_imm32_b32.
bool hasTransForwardingHazard() const
bool hasGFX940Insts() const
bool hasReadM0LdsDmaHazard() const
bool hasVALUReadSGPRHazard() const
bool hasSMEMtoVectorWriteHazard() const
bool hasVMEMtoScalarWriteHazard() const
bool hasNSAtoVMEMBug() const
bool hasVDecCoExecHazard() const
bool hasReadM0SendMsgHazard() const
bool hasReadM0LdsDirectHazard() const
bool isXNACKEnabled() const
bool hasSMRDReadVALUDefHazard() const
A read of an SGPR by SMRD instruction requires 4 wait states when the SGPR was written by a VALU inst...
bool hasRFEHazards() const
bool hasVMEMReadSGPRVALUDefHazard() const
A read of an SGPR by a VMEM instruction requires 5 wait states when the SGPR was written by a VALU In...
bool hasVALUTransUseHazard() const
bool hasLdsDirect() const
void compute(FunctionT &F)
Compute the cycle info for a function.
CycleT * getCycle(const BlockT *Block) const
Find the innermost cycle containing a given block.
Describe properties that are true of each instruction in the target description file.
ArrayRef< MCOperandInfo > operands() const
This holds information about one operand of a machine instruction, indicating the register class for ...
Wrapper class representing physical registers. Should be passed by value.
Instructions::const_reverse_iterator const_reverse_instr_iterator
reverse_instr_iterator instr_rend()
Instructions::iterator instr_iterator
instr_iterator instr_end()
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
iterator_range< pred_iterator > predecessors()
bool hasCalls() const
Return true if the current function has any function calls.
unsigned getInstructionCount() const
Return the number of MachineInstrs in this MachineFunction.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
const MachineBasicBlock & front() const
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addDef(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register definition operand.
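These builder methods chain onto BuildMI (listed further down in this index). A minimal sketch of emitting an s_nop before an existing instruction, assuming a SIInstrInfo reference TII is in scope:

// Sketch: insert "s_nop Imm" immediately before MI via the builder interface.
static void emitSNop(MachineInstr *MI, const SIInstrInfo &TII, unsigned Imm) {
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::S_NOP))
      .addImm(Imm);   // s_nop's immediate operand selects how many cycles to idle
}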
Representation of each machine instruction.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
const MachineBasicBlock * getParent() const
bool mayStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly modify memory.
bool isBundled() const
Return true if this instruction is part of a bundle.
MachineOperand class - Representation of each machine instruction operand.
void setImm(int64_t immVal)
bool isReg() const
isReg - Tests if this is a MO_Register operand.
void setReg(Register Reg)
Change the register this operand corresponds to.
void setIsKill(bool Val=true)
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
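A sketch of how these setters are typically used when a fix rewrites an operand in place (the helper name and the choice of registers are illustrative):

// Sketch: retarget the first use of OldReg to NewReg and clear its undef flag.
static void retargetFirstUse(MachineInstr &MI, Register OldReg, Register NewReg) {
  for (MachineOperand &Op : MI.uses()) {
    if (Op.isReg() && Op.getReg() == OldReg) {
      Op.setReg(NewReg);       // point the operand at the replacement register
      Op.setIsUndef(false);    // the new value is live at this use
      break;
    }
  }
}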
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
bool isPhysRegUsed(MCRegister PhysReg, bool SkipRegMaskTest=false) const
Return true if the specified register is modified or read in this function.
Wrapper class representing virtual and physical registers.
static bool isMAI(const MachineInstr &MI)
static bool isDS(const MachineInstr &MI)
static bool isVMEM(const MachineInstr &MI)
static bool isSOPP(const MachineInstr &MI)
static bool isSMRD(const MachineInstr &MI)
static bool isMTBUF(const MachineInstr &MI)
static bool isEXP(const MachineInstr &MI)
static bool isSALU(const MachineInstr &MI)
static bool isSDWA(const MachineInstr &MI)
void insertNoops(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned Quantity) const override
static bool isVINTRP(const MachineInstr &MI)
static bool isDOT(const MachineInstr &MI)
static bool isSWMMAC(const MachineInstr &MI)
static bool isLDSDIR(const MachineInstr &MI)
bool isBufferSMRD(const MachineInstr &MI) const
static bool isTRANS(const MachineInstr &MI)
static bool isMUBUF(const MachineInstr &MI)
static bool isSegmentSpecificFLAT(const MachineInstr &MI)
static bool isDPP(const MachineInstr &MI)
static bool isMFMA(const MachineInstr &MI)
static bool isFPAtomic(const MachineInstr &MI)
static bool isMIMG(const MachineInstr &MI)
static unsigned getNumWaitStates(const MachineInstr &MI)
Return the number of wait states that result from executing this instruction.
static bool isWMMA(const MachineInstr &MI)
static bool isFLAT(const MachineInstr &MI)
static bool isVALU(const MachineInstr &MI)
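These static opcode-class predicates are the building blocks of the hazard searches, which usually wrap them in small lambdas. A sketch of that shape (the lambda name is illustrative):

// Sketch: hazard predicates are small lambdas over the SIInstrInfo classifiers;
// such a predicate is then handed to the backwards wait-state search.
auto IsVALUOrTransFn = [](const MachineInstr &MI) {
  return SIInstrInfo::isVALU(MI) || SIInstrInfo::isTRANS(MI);
};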
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which interpolation parameters to load.
unsigned getOccupancy() const
Scheduling unit. This is a node in the scheduling DAG.
bool isInstr() const
Returns true if this SUnit refers to a machine instruction as opposed to an SDNode.
MachineInstr * getInstr() const
Returns the representative MachineInstr for this SUnit.
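Because a scheduling unit can wrap either a MachineInstr or an SDNode, the first step of any instruction-level query is to recover the underlying MachineInstr. A minimal sketch (the helper name is hypothetical):

// Sketch: only SUnits that wrap a MachineInstr are interesting here.
static MachineInstr *getNodeInstr(SUnit *SU) {
  return SU->isInstr() ? SU->getInstr() : nullptr;
}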
unsigned getMaxLookAhead() const
unsigned MaxLookAhead
MaxLookAhead - Indicate the number of cycles in the scoreboard state.
virtual void EmitNoops(unsigned Quantity)
EmitNoops - This callback is invoked when noops were added to the instruction stream.
size_type size() const
Determine the number of elements in the SetVector.
bool insert(const value_type &X)
Insert a new element into the SetVector.
A SetVector that performs no allocations if smaller than a certain size.
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less than N).
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
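These small containers are the usual way to deduplicate work during a backwards search over the CFG. A sketch of visiting each predecessor block at most once (the helper name is illustrative):

#include "llvm/ADT/SmallSet.h"
using namespace llvm;

// Sketch: deduplicate predecessor blocks during a hazard search.
static void visitPredsOnce(const MachineBasicBlock &MBB) {
  SmallSet<const MachineBasicBlock *, 8> Visited;
  for (const MachineBasicBlock *Pred : MBB.predecessors()) {
    if (!Visited.insert(Pred).second)
      continue;               // insert().second is false when Pred was already seen
    // ... scan Pred bottom-up for the hazard-producing instruction here.
  }
}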
StringRef - Represent a constant reference to a string, i.e. a character array and a length, which need not be null terminated.
bool getAsInteger(unsigned Radix, T &Result) const
Parse the current string as an integer of the specified radix.
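getAsInteger is the standard way a cl::parser validates a numeric option value. A minimal sketch of parsing a percentage (the function name and the 0-100 bound are illustrative):

#include "llvm/ADT/StringRef.h"

// Sketch: parse a decimal string; getAsInteger returns true on *failure*.
static bool parsePercent(llvm::StringRef Arg, unsigned &Value) {
  return !Arg.getAsInteger(/*Radix=*/10, Value) && Value <= 100;
}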
CodeGenOptLevel getOptLevel() const
Returns the optimization level: None, Less, Default, or Aggressive.
ProcResIter getWriteProcResEnd(const MCSchedClassDesc *SC) const
const MCSchedClassDesc * resolveSchedClass(const MachineInstr *MI) const
Return the MCSchedClassDesc for this instruction.
ProcResIter getWriteProcResBegin(const MCSchedClassDesc *SC) const
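Together, resolveSchedClass and the write-proc-res iterators let a pass derive how long an instruction occupies its pipeline from the scheduling model. A sketch, assuming TSchedModel is an initialized TargetSchedModel (the function name is illustrative):

#include <algorithm>

// Sketch: take the largest ReleaseAtCycle over all write resources as the
// number of cycles MI occupies its pipeline.
static unsigned getPipelineCycles(const llvm::TargetSchedModel &TSchedModel,
                                  const llvm::MachineInstr &MI) {
  const llvm::MCSchedClassDesc *SC = TSchedModel.resolveSchedClass(&MI);
  unsigned Cycles = 0;
  for (auto I = TSchedModel.getWriteProcResBegin(SC),
            E = TSchedModel.getWriteProcResEnd(SC); I != E; ++I)
    Cycles = std::max(Cycles, unsigned(I->ReleaseAtCycle));
  return Cycles;
}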
A Use represents the edge between a Value definition and its users.
unsigned getOperandNo() const
Return the operand # of this use in its User.
LLVM Value Representation.
std::pair< iterator, bool > insert(const ValueT &V)
An efficient, type-erasing, non-owning reference to a callable.
self_iterator getIterator()
A range adaptor for a pair of iterators.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
unsigned encodeFieldVaVdst(unsigned Encoded, unsigned VaVdst)
unsigned decodeFieldSaSdst(unsigned Encoded)
unsigned encodeFieldVmVsrc(unsigned Encoded, unsigned VmVsrc)
unsigned encodeFieldSaSdst(unsigned Encoded, unsigned SaSdst)
unsigned decodeFieldVaVdst(unsigned Encoded)
unsigned decodeFieldVmVsrc(unsigned Encoded)
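These helpers pack and unpack individual fields of the s_waitcnt_depctr immediate (assumed here to live in the AMDGPU::DepCtr namespace). A sketch of tightening the va_vdst field of an existing immediate (the helper name is illustrative):

// Sketch: force va_vdst to 0 in an existing s_waitcnt_depctr, i.e. wait for
// all outstanding VALU destination writes before the next instruction.
static void forceVaVdstZero(llvm::MachineInstr &DepCtrMI) {
  llvm::MachineOperand &Imm = DepCtrMI.getOperand(0);
  unsigned Enc = Imm.getImm();
  if (llvm::AMDGPU::DepCtr::decodeFieldVaVdst(Enc) != 0)
    Imm.setImm(llvm::AMDGPU::DepCtr::encodeFieldVaVdst(Enc, 0));
}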
LLVM_READONLY const MIMGInfo * getMIMGInfo(unsigned Opc)
void decodeWaitcnt(const IsaVersion &Version, unsigned Waitcnt, unsigned &Vmcnt, unsigned &Expcnt, unsigned &Lgkmcnt)
Decodes Vmcnt, Expcnt and Lgkmcnt from given Waitcnt for given isa Version, and writes decoded values into Vmcnt, Expcnt and Lgkmcnt.
LLVM_READONLY int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIdx)
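A sketch of combining getNamedOperandIdx with decodeWaitcnt to inspect the counters carried by an s_waitcnt, assuming ST is the GCNSubtarget (the helper name is illustrative):

// Sketch: locate the simm16 operand of an s_waitcnt and split it into counters.
static void inspectWaitcnt(const llvm::MachineInstr &MI, const llvm::GCNSubtarget &ST) {
  using namespace llvm;
  int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::simm16);
  unsigned Imm = MI.getOperand(Idx).getImm();
  AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST.getCPU());
  unsigned Vmcnt, Expcnt, Lgkmcnt;
  AMDGPU::decodeWaitcnt(IV, Imm, Vmcnt, Expcnt, Lgkmcnt);
  // Vmcnt == 0 here would mean the instruction waits for all VMEM ops to finish.
}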
FPType getFPDstSelType(unsigned Opc)
bool isGFX12Plus(const MCSubtargetInfo &STI)
bool isEntryFunctionCC(CallingConv::ID CC)
IsaVersion getIsaVersion(StringRef GPU)
bool getMAIIsGFX940XDL(unsigned Opc)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, uint64_t NamedIdx)
unsigned getRegBitWidth(const TargetRegisterClass &RC)
Get the size in bits of a register from the register class RC.
bool getMAIIsDGEMM(unsigned Opc)
Returns true if the MAI operation is a double-precision GEMM.
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
@ SC
CHAIN = SC CHAIN, Imm128 - System call.
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
Reg
All possible values of the reg field in the ModR/M byte.
initializer< Ty > init(const Ty &Val)
NodeAddr< DefNode * > Def
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
iterator_range< po_iterator< T > > post_order(const T &G)
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
auto reverse(ContainerTy &&C)
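These range adaptors keep the bottom-up hazard scans readable. A sketch of walking a block in reverse and testing a predicate over its instructions (the helper name and the mayStore predicate are illustrative):

// Sketch: walk a block bottom-up and ask whether any instruction may store.
static bool blockHasStore(const llvm::MachineBasicBlock &MBB) {
  using namespace llvm;
  return any_of(reverse(MBB.instrs()),
                [](const MachineInstr &MI) { return MI.mayStore(); });
}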
static std::tuple< typename Fields::ValueType... > decode(uint64_t Encoded)
Instruction set architecture version.
Represents the counter values to wait for in an s_waitcnt instruction.
Description of the encoding of one expression Op.
Summarize the scheduling resources required for an instruction of a particular scheduling class.
uint16_t ReleaseAtCycle
Cycle at which the resource will be released by an instruction, relatively to the cycle in which the ...