1//===-- GCNHazardRecognizers.cpp - GCN Hazard Recognizer Impls ------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements hazard recognizers for scheduling on GCN processors.
10//
11//===----------------------------------------------------------------------===//
12
13#include "GCNHazardRecognizer.h"
14#include "GCNSubtarget.h"
22
23using namespace llvm;
24
25namespace {
26
27struct MFMAPaddingRatioParser : public cl::parser<unsigned> {
28 MFMAPaddingRatioParser(cl::Option &O) : cl::parser<unsigned>(O) {}
29
30 bool parse(cl::Option &O, StringRef ArgName, StringRef Arg, unsigned &Value) {
31 if (Arg.getAsInteger(0, Value))
32 return O.error("'" + Arg + "' value invalid for uint argument!");
33
34 if (Value > 100)
35 return O.error("'" + Arg + "' value must be in the range [0, 100]!");
36
37 return false;
38 }
39};
40
41} // end anonymous namespace
42
43static cl::opt<unsigned, false, MFMAPaddingRatioParser>
44 MFMAPaddingRatio("amdgpu-mfma-padding-ratio", cl::init(0), cl::Hidden,
45 cl::desc("Fill a percentage of the latency between "
46 "neighboring MFMA with s_nops."));
47
49 "amdgpu-max-exhaustive-hazard-search", cl::init(128), cl::Hidden,
50 cl::desc("Maximum function size for exhausive hazard search"));
51
52//===----------------------------------------------------------------------===//
53// Hazard Recognizer Implementation
54//===----------------------------------------------------------------------===//
55
56static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF,
57 const GCNSubtarget &ST);
58
59GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF)
60 : IsHazardRecognizerMode(false), CurrCycleInstr(nullptr), MF(MF),
61 ST(MF.getSubtarget<GCNSubtarget>()), TII(*ST.getInstrInfo()),
62 TRI(TII.getRegisterInfo()), TSchedModel(TII.getSchedModel()),
63 UseVALUReadHazardExhaustiveSearch(false),
64 ClauseUses(TRI.getNumRegUnits()), ClauseDefs(TRI.getNumRegUnits()) {
65 MaxLookAhead = MF.getRegInfo().isPhysRegUsed(AMDGPU::AGPR0) ? 19 : 5;
66 RunLdsBranchVmemWARHazardFixup = shouldRunLdsBranchVmemWARHazardFixup(MF, ST);
67}
68
69void GCNHazardRecognizer::Reset() {
70 EmittedInstrs.clear();
71}
72
73void GCNHazardRecognizer::EmitInstruction(SUnit *SU) {
74 EmitInstruction(SU->getInstr());
75}
76
77void GCNHazardRecognizer::EmitInstruction(MachineInstr *MI) {
78 CurrCycleInstr = MI;
79}
80
81static bool isDivFMas(unsigned Opcode) {
82 return Opcode == AMDGPU::V_DIV_FMAS_F32_e64 || Opcode == AMDGPU::V_DIV_FMAS_F64_e64;
83}
84
85static bool isSGetReg(unsigned Opcode) {
86 return Opcode == AMDGPU::S_GETREG_B32;
87}
88
89static bool isSSetReg(unsigned Opcode) {
90 switch (Opcode) {
91 case AMDGPU::S_SETREG_B32:
92 case AMDGPU::S_SETREG_B32_mode:
93 case AMDGPU::S_SETREG_IMM32_B32:
94 case AMDGPU::S_SETREG_IMM32_B32_mode:
95 return true;
96 }
97 return false;
98}
99
100static bool isRWLane(unsigned Opcode) {
101 return Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32;
102}
103
104static bool isRFE(unsigned Opcode) {
105 return Opcode == AMDGPU::S_RFE_B64;
106}
107
108static bool isSMovRel(unsigned Opcode) {
109 switch (Opcode) {
110 case AMDGPU::S_MOVRELS_B32:
111 case AMDGPU::S_MOVRELS_B64:
112 case AMDGPU::S_MOVRELD_B32:
113 case AMDGPU::S_MOVRELD_B64:
114 return true;
115 default:
116 return false;
117 }
118}
119
120static bool isDGEMM(unsigned Opcode) {
121 return AMDGPU::getMAIIsDGEMM(Opcode);
122}
123
124static bool isXDL(const GCNSubtarget &ST, const MachineInstr &MI) {
125 unsigned Opcode = MI.getOpcode();
126
127 if (!SIInstrInfo::isMAI(MI) ||
128 isDGEMM(Opcode) ||
129 Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
130 Opcode == AMDGPU::V_ACCVGPR_READ_B32_e64)
131 return false;
132
133 if (!ST.hasGFX940Insts())
134 return true;
135
136 return AMDGPU::getMAIIsGFX940XDL(Opcode);
137}
138
139static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII,
140 const MachineInstr &MI) {
141 if (TII.isAlwaysGDS(MI.getOpcode()))
142 return true;
143
144 switch (MI.getOpcode()) {
145 case AMDGPU::S_SENDMSG:
146 case AMDGPU::S_SENDMSGHALT:
147 case AMDGPU::S_TTRACEDATA:
148 return true;
149 // These DS opcodes don't support GDS.
150 case AMDGPU::DS_NOP:
151 case AMDGPU::DS_PERMUTE_B32:
152 case AMDGPU::DS_BPERMUTE_B32:
153 return false;
154 default:
155 if (TII.isDS(MI.getOpcode())) {
156 int GDS = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
157 AMDGPU::OpName::gds);
158 if (MI.getOperand(GDS).getImm())
159 return true;
160 }
161 return false;
162 }
163}
164
165static bool isPermlane(const MachineInstr &MI) {
166 unsigned Opcode = MI.getOpcode();
167 return Opcode == AMDGPU::V_PERMLANE16_B32_e64 ||
168 Opcode == AMDGPU::V_PERMLANE64_B32 ||
169 Opcode == AMDGPU::V_PERMLANEX16_B32_e64 ||
170 Opcode == AMDGPU::V_PERMLANE16_VAR_B32_e64 ||
171 Opcode == AMDGPU::V_PERMLANEX16_VAR_B32_e64 ||
172 Opcode == AMDGPU::V_PERMLANE16_SWAP_B32_e32 ||
173 Opcode == AMDGPU::V_PERMLANE16_SWAP_B32_e64 ||
174 Opcode == AMDGPU::V_PERMLANE32_SWAP_B32_e32 ||
175 Opcode == AMDGPU::V_PERMLANE32_SWAP_B32_e64;
176}
177
178static bool isLdsDma(const MachineInstr &MI) {
179 return SIInstrInfo::isVALU(MI) &&
180 (SIInstrInfo::isMUBUF(MI) || SIInstrInfo::isFLAT(MI));
181}
182
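// Returns the hardware register ID encoded in the simm16 operand of an
// s_getreg/s_setreg style instruction; the remaining offset/size fields of
// the encoding are ignored here.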
183static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) {
184 const MachineOperand *RegOp = TII->getNamedOperand(RegInstr,
185 AMDGPU::OpName::simm16);
186 return std::get<0>(AMDGPU::Hwreg::HwregEncoding::decode(RegOp->getImm()));
187}
188
189ScheduleHazardRecognizer::HazardType
190GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
191 MachineInstr *MI = SU->getInstr();
192 // If we are not in "HazardRecognizerMode" and therefore not being run from
193 // the scheduler, track possible stalls from hazards but don't insert noops.
194 auto HazardType = IsHazardRecognizerMode ? NoopHazard : Hazard;
195
196 if (MI->isBundle())
197 return NoHazard;
198
199 if (SIInstrInfo::isSMRD(*MI) && checkSMRDHazards(MI) > 0)
200 return HazardType;
201
202 if (ST.hasNSAtoVMEMBug() && checkNSAtoVMEMHazard(MI) > 0)
203 return HazardType;
204
205 if (checkFPAtomicToDenormModeHazard(MI) > 0)
206 return HazardType;
207
208 if (ST.hasNoDataDepHazard())
209 return NoHazard;
210
211 // FIXME: Should flat be considered vmem?
212 if ((SIInstrInfo::isVMEM(*MI) ||
213 SIInstrInfo::isFLAT(*MI))
214 && checkVMEMHazards(MI) > 0)
215 return HazardType;
216
217 if (SIInstrInfo::isVALU(*MI) && checkVALUHazards(MI) > 0)
218 return HazardType;
219
220 if (SIInstrInfo::isDPP(*MI) && checkDPPHazards(MI) > 0)
221 return HazardType;
222
223 if (isDivFMas(MI->getOpcode()) && checkDivFMasHazards(MI) > 0)
224 return HazardType;
225
226 if (isRWLane(MI->getOpcode()) && checkRWLaneHazards(MI) > 0)
227 return HazardType;
228
229 if ((SIInstrInfo::isVALU(*MI) || SIInstrInfo::isVMEM(*MI) ||
230 SIInstrInfo::isFLAT(*MI) || SIInstrInfo::isDS(*MI) ||
231 SIInstrInfo::isEXP(*MI)) && checkMAIVALUHazards(MI) > 0)
232 return HazardType;
233
234 if (isSGetReg(MI->getOpcode()) && checkGetRegHazards(MI) > 0)
235 return HazardType;
236
237 if (isSSetReg(MI->getOpcode()) && checkSetRegHazards(MI) > 0)
238 return HazardType;
239
240 if (isRFE(MI->getOpcode()) && checkRFEHazards(MI) > 0)
241 return HazardType;
242
243 if (((ST.hasReadM0MovRelInterpHazard() &&
244 (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode()) ||
245 MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
246 MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
247 (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI)) ||
248 (ST.hasReadM0LdsDmaHazard() && isLdsDma(*MI)) ||
249 (ST.hasReadM0LdsDirectHazard() &&
250 MI->readsRegister(AMDGPU::LDS_DIRECT, /*TRI=*/nullptr))) &&
251 checkReadM0Hazards(MI) > 0)
252 return HazardType;
253
254 if (SIInstrInfo::isMAI(*MI) && checkMAIHazards(MI) > 0)
255 return HazardType;
256
257 if ((SIInstrInfo::isVMEM(*MI) ||
258 SIInstrInfo::isFLAT(*MI) ||
259 SIInstrInfo::isDS(*MI)) && checkMAILdStHazards(MI) > 0)
260 return HazardType;
261
262 if (MI->isInlineAsm() && checkInlineAsmHazards(MI) > 0)
263 return HazardType;
264
265 return NoHazard;
266}
267
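// Emit enough S_NOP instructions to cover \p Quantity wait states. A single
// S_NOP with immediate N covers N + 1 wait states and its field is capped at
// 8, so larger quantities are split across multiple instructions.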
268static void insertNoopsInBundle(MachineInstr *MI, const SIInstrInfo &TII,
269 unsigned Quantity) {
270 while (Quantity > 0) {
271 unsigned Arg = std::min(Quantity, 8u);
272 Quantity -= Arg;
273 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::S_NOP))
274 .addImm(Arg - 1);
275 }
276}
277
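// The number of wait states an MFMA occupies is taken from the scheduling
// model: the ReleaseAtCycle of the first write resource of its sched class.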
278unsigned
279GCNHazardRecognizer::getMFMAPipelineWaitStates(const MachineInstr &MI) const {
280 const MCSchedClassDesc *SC = TSchedModel.resolveSchedClass(&MI);
281 assert(TSchedModel.getWriteProcResBegin(SC) !=
282 TSchedModel.getWriteProcResEnd(SC));
283 return TSchedModel.getWriteProcResBegin(SC)->ReleaseAtCycle;
284}
285
286void GCNHazardRecognizer::processBundle() {
287 MachineBasicBlock::instr_iterator MI = std::next(CurrCycleInstr->getIterator());
288 MachineBasicBlock::instr_iterator E = CurrCycleInstr->getParent()->instr_end();
289 // Check bundled MachineInstrs for hazards.
290 for (; MI != E && MI->isInsideBundle(); ++MI) {
291 CurrCycleInstr = &*MI;
292 unsigned WaitStates = PreEmitNoopsCommon(CurrCycleInstr);
293
294 if (IsHazardRecognizerMode) {
295 fixHazards(CurrCycleInstr);
296
297 insertNoopsInBundle(CurrCycleInstr, TII, WaitStates);
298 }
299
300 // It’s unnecessary to track more than MaxLookAhead instructions. Since we
301 // include the bundled MI directly after, only add a maximum of
302 // (MaxLookAhead - 1) noops to EmittedInstrs.
303 for (unsigned i = 0, e = std::min(WaitStates, MaxLookAhead - 1); i < e; ++i)
304 EmittedInstrs.push_front(nullptr);
305
306 EmittedInstrs.push_front(CurrCycleInstr);
307 EmittedInstrs.resize(MaxLookAhead);
308 }
309 CurrCycleInstr = nullptr;
310}
311
312void GCNHazardRecognizer::runOnInstruction(MachineInstr *MI) {
313 assert(IsHazardRecognizerMode);
314
315 unsigned NumPreNoops = PreEmitNoops(MI);
316 EmitNoops(NumPreNoops);
317 if (MI->isInsideBundle())
318 insertNoopsInBundle(MI, TII, NumPreNoops);
319 else
320 TII.insertNoops(*MI->getParent(), MachineBasicBlock::iterator(MI),
321 NumPreNoops);
322 CurrCycleInstr = MI;
323 AdvanceCycle();
324}
325
326unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) {
327 IsHazardRecognizerMode = true;
328 CurrCycleInstr = MI;
329 unsigned W = PreEmitNoopsCommon(MI);
330 fixHazards(MI);
331 CurrCycleInstr = nullptr;
332 return W;
333}
334
335unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) {
336 if (MI->isBundle())
337 return 0;
338
339 int WaitStates = 0;
340
341 if (SIInstrInfo::isSMRD(*MI))
342 return std::max(WaitStates, checkSMRDHazards(MI));
343
344 if (ST.hasNSAtoVMEMBug())
345 WaitStates = std::max(WaitStates, checkNSAtoVMEMHazard(MI));
346
347 WaitStates = std::max(WaitStates, checkFPAtomicToDenormModeHazard(MI));
348
349 if (ST.hasNoDataDepHazard())
350 return WaitStates;
351
352 if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isFLAT(*MI))
353 WaitStates = std::max(WaitStates, checkVMEMHazards(MI));
354
355 if (SIInstrInfo::isVALU(*MI))
356 WaitStates = std::max(WaitStates, checkVALUHazards(MI));
357
358 if (SIInstrInfo::isDPP(*MI))
359 WaitStates = std::max(WaitStates, checkDPPHazards(MI));
360
361 if (isDivFMas(MI->getOpcode()))
362 WaitStates = std::max(WaitStates, checkDivFMasHazards(MI));
363
364 if (isRWLane(MI->getOpcode()))
365 WaitStates = std::max(WaitStates, checkRWLaneHazards(MI));
366
367 if ((SIInstrInfo::isVALU(*MI) || SIInstrInfo::isVMEM(*MI) ||
368 SIInstrInfo::isFLAT(*MI) || SIInstrInfo::isDS(*MI) ||
369 SIInstrInfo::isEXP(*MI)) && checkMAIVALUHazards(MI) > 0)
370 WaitStates = std::max(WaitStates, checkMAIVALUHazards(MI));
371
372 if (MI->isInlineAsm())
373 return std::max(WaitStates, checkInlineAsmHazards(MI));
374
375 if (isSGetReg(MI->getOpcode()))
376 return std::max(WaitStates, checkGetRegHazards(MI));
377
378 if (isSSetReg(MI->getOpcode()))
379 return std::max(WaitStates, checkSetRegHazards(MI));
380
381 if (isRFE(MI->getOpcode()))
382 return std::max(WaitStates, checkRFEHazards(MI));
383
384 if ((ST.hasReadM0MovRelInterpHazard() &&
385 (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode()) ||
386 MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
387 MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
388 (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI)) ||
389 (ST.hasReadM0LdsDmaHazard() && isLdsDma(*MI)) ||
390 (ST.hasReadM0LdsDirectHazard() &&
391 MI->readsRegister(AMDGPU::LDS_DIRECT, /*TRI=*/nullptr)))
392 return std::max(WaitStates, checkReadM0Hazards(MI));
393
394 if (SIInstrInfo::isMAI(*MI))
395 return std::max(WaitStates, checkMAIHazards(MI));
396
397 if (SIInstrInfo::isVMEM(*MI) ||
398 SIInstrInfo::isFLAT(*MI) ||
399 SIInstrInfo::isDS(*MI))
400 return std::max(WaitStates, checkMAILdStHazards(MI));
401
402 if (ST.hasGFX950Insts() && isPermlane(*MI))
403 return std::max(WaitStates, checkPermlaneHazards(MI));
404
405 return WaitStates;
406}
407
408void GCNHazardRecognizer::EmitNoop() {
409 EmittedInstrs.push_front(nullptr);
410}
411
412void GCNHazardRecognizer::AdvanceCycle() {
413 // When the scheduler detects a stall, it will call AdvanceCycle() without
414 // emitting any instructions.
415 if (!CurrCycleInstr) {
416 EmittedInstrs.push_front(nullptr);
417 return;
418 }
419
420 if (CurrCycleInstr->isBundle()) {
421 processBundle();
422 return;
423 }
424
425 unsigned NumWaitStates = TII.getNumWaitStates(*CurrCycleInstr);
426 if (!NumWaitStates) {
427 CurrCycleInstr = nullptr;
428 return;
429 }
430
431 // Keep track of emitted instructions
432 EmittedInstrs.push_front(CurrCycleInstr);
433
434 // Add a nullptr for each additional wait state after the first. Make sure
435 // not to add more than getMaxLookAhead() items to the list, since we
436 // truncate the list to that size right after this loop.
437 for (unsigned i = 1, e = std::min(NumWaitStates, getMaxLookAhead());
438 i < e; ++i) {
439 EmittedInstrs.push_front(nullptr);
440 }
441
442 // getMaxLookAhead() is the largest number of wait states we will ever need
443 // to insert, so there is no point in keeping track of more than that many
444 // wait states.
445 EmittedInstrs.resize(getMaxLookAhead());
446
447 CurrCycleInstr = nullptr;
448}
449
450void GCNHazardRecognizer::RecedeCycle() {
451 llvm_unreachable("hazard recognizer does not support bottom-up scheduling.");
452}
453
454//===----------------------------------------------------------------------===//
455// Helper Functions
456//===----------------------------------------------------------------------===//
457
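// Outcome of a backwards hazard scan: the hazard was found, the search window
// expired before one was found, or this instruction is neutral and the walk
// continues.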
458using HazardFnResult = enum { HazardFound, HazardExpired, NoHazardFound };
459
460using IsExpiredFn = function_ref<bool(const MachineInstr &, int WaitStates)>;
461using GetNumWaitStatesFn = function_ref<unsigned int(const MachineInstr &)>;
462
463// Search for a hazard in a block and its predecessors.
464template <typename StateT>
465static bool
466hasHazard(StateT State,
467 function_ref<HazardFnResult(StateT &, const MachineInstr &)> IsHazard,
468 function_ref<void(StateT &, const MachineInstr &)> UpdateState,
469 const MachineBasicBlock *MBB,
470 MachineBasicBlock::const_reverse_instr_iterator I,
471 DenseSet<const MachineBasicBlock *> &Visited) {
472 for (auto E = MBB->instr_rend(); I != E; ++I) {
473 // No need to look at parent BUNDLE instructions.
474 if (I->isBundle())
475 continue;
476
477 switch (IsHazard(State, *I)) {
478 case HazardFound:
479 return true;
480 case HazardExpired:
481 return false;
482 default:
483 // Continue search
484 break;
485 }
486
487 if (I->isInlineAsm() || I->isMetaInstruction())
488 continue;
489
490 UpdateState(State, *I);
491 }
492
493 for (MachineBasicBlock *Pred : MBB->predecessors()) {
494 if (!Visited.insert(Pred).second)
495 continue;
496
497 if (hasHazard(State, IsHazard, UpdateState, Pred, Pred->instr_rbegin(),
498 Visited))
499 return true;
500 }
501
502 return false;
503}
504
505// Returns the minimum number of wait states since \p I, walking all
506// predecessors. Only scans until \p IsExpired returns true.
507// Can only be run in hazard recognizer mode.
508static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
509 const MachineBasicBlock *MBB,
510 MachineBasicBlock::const_reverse_instr_iterator I, int WaitStates,
511 IsExpiredFn IsExpired, DenseSet<const MachineBasicBlock *> &Visited,
512 GetNumWaitStatesFn GetNumWaitStates = SIInstrInfo::getNumWaitStates) {
513 for (auto E = MBB->instr_rend(); I != E; ++I) {
514 // Don't add WaitStates for parent BUNDLE instructions.
515 if (I->isBundle())
516 continue;
517
518 if (IsHazard(*I))
519 return WaitStates;
520
521 if (I->isInlineAsm())
522 continue;
523
524 WaitStates += GetNumWaitStates(*I);
525
526 if (IsExpired(*I, WaitStates))
527 return std::numeric_limits<int>::max();
528 }
529
530 int MinWaitStates = std::numeric_limits<int>::max();
531 for (MachineBasicBlock *Pred : MBB->predecessors()) {
532 if (!Visited.insert(Pred).second)
533 continue;
534
535 int W = getWaitStatesSince(IsHazard, Pred, Pred->instr_rbegin(), WaitStates,
536 IsExpired, Visited, GetNumWaitStates);
537
538 MinWaitStates = std::min(MinWaitStates, W);
539 }
540
541 return MinWaitStates;
542}
543
544static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
545 const MachineInstr *MI, IsExpiredFn IsExpired) {
546 DenseSet<const MachineBasicBlock *> Visited;
547 return getWaitStatesSince(IsHazard, MI->getParent(),
548 std::next(MI->getReverseIterator()),
549 0, IsExpired, Visited);
550}
551
552int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) {
553 if (IsHazardRecognizerMode) {
554 auto IsExpiredFn = [Limit](const MachineInstr &, int WaitStates) {
555 return WaitStates >= Limit;
556 };
557 return ::getWaitStatesSince(IsHazard, CurrCycleInstr, IsExpiredFn);
558 }
559
560 int WaitStates = 0;
561 for (MachineInstr *MI : EmittedInstrs) {
562 if (MI) {
563 if (IsHazard(*MI))
564 return WaitStates;
565
566 if (MI->isInlineAsm())
567 continue;
568 }
569 ++WaitStates;
570
571 if (WaitStates >= Limit)
572 break;
573 }
574 return std::numeric_limits<int>::max();
575}
576
577int GCNHazardRecognizer::getWaitStatesSinceDef(unsigned Reg,
578 IsHazardFn IsHazardDef,
579 int Limit) {
580 const SIRegisterInfo *TRI = ST.getRegisterInfo();
581
582 auto IsHazardFn = [IsHazardDef, TRI, Reg](const MachineInstr &MI) {
583 return IsHazardDef(MI) && MI.modifiesRegister(Reg, TRI);
584 };
585
586 return getWaitStatesSince(IsHazardFn, Limit);
587}
588
589int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard,
590 int Limit) {
591 auto IsHazardFn = [IsHazard](const MachineInstr &MI) {
592 return isSSetReg(MI.getOpcode()) && IsHazard(MI);
593 };
594
595 return getWaitStatesSince(IsHazardFn, Limit);
596}
597
598//===----------------------------------------------------------------------===//
599// No-op Hazard Detection
600//===----------------------------------------------------------------------===//
601
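// Clause tracking is done on register units rather than whole registers so
// that partial overlaps between differently sized registers are detected.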
602static void addRegUnits(const SIRegisterInfo &TRI, BitVector &BV,
603 MCRegister Reg) {
604 for (MCRegUnit Unit : TRI.regunits(Reg))
605 BV.set(Unit);
606}
607
608static void addRegsToSet(const SIRegisterInfo &TRI,
609 iterator_range<MachineInstr::const_mop_iterator> Ops,
610 BitVector &DefSet, BitVector &UseSet) {
611 for (const MachineOperand &Op : Ops) {
612 if (Op.isReg())
613 addRegUnits(TRI, Op.isDef() ? DefSet : UseSet, Op.getReg().asMCReg());
614 }
615}
616
617void GCNHazardRecognizer::addClauseInst(const MachineInstr &MI) {
618 addRegsToSet(TRI, MI.operands(), ClauseDefs, ClauseUses);
619}
620
621static bool breaksSMEMSoftClause(MachineInstr *MI) {
622 return !SIInstrInfo::isSMRD(*MI);
623}
624
625static bool breaksVMEMSoftClause(MachineInstr *MI) {
626 return !SIInstrInfo::isVMEM(*MI) && !SIInstrInfo::isFLAT(*MI);
627}
628
629int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) {
630 // SMEM soft clauses are only present on VI+, and only matter if XNACK is
631 // enabled.
632 if (!ST.isXNACKEnabled())
633 return 0;
634
635 bool IsSMRD = TII.isSMRD(*MEM);
636
637 resetClause();
638
639 // A soft-clause is any group of consecutive SMEM instructions. The
640 // instructions in this group may return out of order and/or may be
641 // replayed (i.e. the same instruction issued more than once).
642 //
643 // In order to handle these situations correctly we need to make sure that
644 // when a clause has more than one instruction, no instruction in the clause
645 // writes to a register that is read by another instruction in the clause
646 // (including itself). If we encounter this situation, we need to break the
647 // clause by inserting a non SMEM instruction.
648
649 for (MachineInstr *MI : EmittedInstrs) {
650 // When we hit a non-SMEM instruction then we have passed the start of the
651 // clause and we can stop.
652 if (!MI)
653 break;
654
655 if (IsSMRD ? breaksSMEMSoftClause(MI) : breaksVMEMSoftClause(MI))
656 break;
657
658 addClauseInst(*MI);
659 }
660
661 if (ClauseDefs.none())
662 return 0;
663
664 // We need to make sure not to put loads and stores in the same clause if they
665 // use the same address. For now, just start a new clause whenever we see a
666 // store.
667 if (MEM->mayStore())
668 return 1;
669
670 addClauseInst(*MEM);
671
672 // If the set of defs and uses intersect then we cannot add this instruction
673 // to the clause, so we have a hazard.
674 return ClauseDefs.anyCommon(ClauseUses) ? 1 : 0;
675}
676
677int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
678 int WaitStatesNeeded = 0;
679
680 WaitStatesNeeded = checkSoftClauseHazards(SMRD);
681
682 // This SMRD hazard only affects SI.
683 if (!ST.hasSMRDReadVALUDefHazard())
684 return WaitStatesNeeded;
685
686 // A read of an SGPR by SMRD instruction requires 4 wait states when the
687 // SGPR was written by a VALU instruction.
688 int SmrdSgprWaitStates = 4;
689 auto IsHazardDefFn = [this](const MachineInstr &MI) {
690 return TII.isVALU(MI);
691 };
692 auto IsBufferHazardDefFn = [this](const MachineInstr &MI) {
693 return TII.isSALU(MI);
694 };
695
696 bool IsBufferSMRD = TII.isBufferSMRD(*SMRD);
697
698 for (const MachineOperand &Use : SMRD->uses()) {
699 if (!Use.isReg())
700 continue;
701 int WaitStatesNeededForUse =
702 SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
703 SmrdSgprWaitStates);
704 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
705
706 // This fixes what appears to be undocumented hardware behavior in SI where
707 // s_mov writing a descriptor and s_buffer_load_dword reading the descriptor
708 // needs some number of nops in between. We don't know how many we need, but
709 // let's use 4. This wasn't discovered before probably because the only
710 // case when this happens is when we expand a 64-bit pointer into a full
711 // descriptor and use s_buffer_load_dword instead of s_load_dword, which was
712 // probably never encountered in the closed-source land.
713 if (IsBufferSMRD) {
714 int WaitStatesNeededForUse =
715 SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(),
716 IsBufferHazardDefFn,
717 SmrdSgprWaitStates);
718 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
719 }
720 }
721
722 return WaitStatesNeeded;
723}
724
725int GCNHazardRecognizer::checkVMEMHazards(MachineInstr* VMEM) {
726 if (!ST.hasVMEMReadSGPRVALUDefHazard())
727 return 0;
728
729 int WaitStatesNeeded = checkSoftClauseHazards(VMEM);
730
731 // A read of an SGPR by a VMEM instruction requires 5 wait states when the
732 // SGPR was written by a VALU Instruction.
733 const int VmemSgprWaitStates = 5;
734 auto IsHazardDefFn = [this](const MachineInstr &MI) {
735 return TII.isVALU(MI);
736 };
737 for (const MachineOperand &Use : VMEM->uses()) {
738 if (!Use.isReg() || TRI.isVectorRegister(MF.getRegInfo(), Use.getReg()))
739 continue;
740
741 int WaitStatesNeededForUse =
742 VmemSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
743 VmemSgprWaitStates);
744 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
745 }
746 return WaitStatesNeeded;
747}
748
749int GCNHazardRecognizer::checkDPPHazards(MachineInstr *DPP) {
750 const SIRegisterInfo *TRI = ST.getRegisterInfo();
751 const SIInstrInfo *TII = ST.getInstrInfo();
752
753 // Check for DPP VGPR read after VALU VGPR write and EXEC write.
754 int DppVgprWaitStates = 2;
755 int DppExecWaitStates = 5;
756 int WaitStatesNeeded = 0;
757 auto IsHazardDefFn = [TII](const MachineInstr &MI) {
758 return TII->isVALU(MI);
759 };
760
761 for (const MachineOperand &Use : DPP->uses()) {
762 if (!Use.isReg() || !TRI->isVGPR(MF.getRegInfo(), Use.getReg()))
763 continue;
764 int WaitStatesNeededForUse =
765 DppVgprWaitStates - getWaitStatesSinceDef(
766 Use.getReg(),
767 [](const MachineInstr &) { return true; },
768 DppVgprWaitStates);
769 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
770 }
771
772 WaitStatesNeeded = std::max(
773 WaitStatesNeeded,
774 DppExecWaitStates - getWaitStatesSinceDef(AMDGPU::EXEC, IsHazardDefFn,
775 DppExecWaitStates));
776
777 return WaitStatesNeeded;
778}
779
780int GCNHazardRecognizer::checkDivFMasHazards(MachineInstr *DivFMas) {
781 const SIInstrInfo *TII = ST.getInstrInfo();
782
783 // v_div_fmas requires 4 wait states after a write to vcc from a VALU
784 // instruction.
785 const int DivFMasWaitStates = 4;
786 auto IsHazardDefFn = [TII](const MachineInstr &MI) {
787 return TII->isVALU(MI);
788 };
789 int WaitStatesNeeded = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn,
790 DivFMasWaitStates);
791
792 return DivFMasWaitStates - WaitStatesNeeded;
793}
794
795int GCNHazardRecognizer::checkGetRegHazards(MachineInstr *GetRegInstr) {
796 const SIInstrInfo *TII = ST.getInstrInfo();
797 unsigned GetRegHWReg = getHWReg(TII, *GetRegInstr);
798
799 const int GetRegWaitStates = 2;
800 auto IsHazardFn = [TII, GetRegHWReg](const MachineInstr &MI) {
801 return GetRegHWReg == getHWReg(TII, MI);
802 };
803 int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, GetRegWaitStates);
804
805 return GetRegWaitStates - WaitStatesNeeded;
806}
807
808int GCNHazardRecognizer::checkSetRegHazards(MachineInstr *SetRegInstr) {
809 const SIInstrInfo *TII = ST.getInstrInfo();
810 unsigned HWReg = getHWReg(TII, *SetRegInstr);
811
812 const int SetRegWaitStates = ST.getSetRegWaitStates();
813 auto IsHazardFn = [TII, HWReg](const MachineInstr &MI) {
814 return HWReg == getHWReg(TII, MI);
815 };
816 int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, SetRegWaitStates);
817 return SetRegWaitStates - WaitStatesNeeded;
818}
819
820int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) {
821 if (!MI.mayStore())
822 return -1;
823
824 const SIInstrInfo *TII = ST.getInstrInfo();
825 unsigned Opcode = MI.getOpcode();
826 const MCInstrDesc &Desc = MI.getDesc();
827
828 int VDataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
829 int VDataRCID = -1;
830 if (VDataIdx != -1)
831 VDataRCID = Desc.operands()[VDataIdx].RegClass;
832
833 if (TII->isMUBUF(MI) || TII->isMTBUF(MI)) {
834 // There is no hazard if the instruction does not use vector regs
835 // (like wbinvl1)
836 if (VDataIdx == -1)
837 return -1;
838 // For MUBUF/MTBUF instructions this hazard only exists if the
839 // instruction is not using a register in the soffset field.
840 const MachineOperand *SOffset =
841 TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
842 // If we have no soffset operand, then assume this field has been
843 // hardcoded to zero.
844 if (AMDGPU::getRegBitWidth(VDataRCID) > 64 &&
845 (!SOffset || !SOffset->isReg()))
846 return VDataIdx;
847 }
848
849 // MIMG instructions create a hazard if they don't use a 256-bit T# and
850 // the store size is greater than 8 bytes and they have more than two bits
851 // of their dmask set.
852 // All our MIMG definitions use a 256-bit T#, so we can skip checking for them.
853 if (TII->isMIMG(MI)) {
854 int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc);
855 assert(SRsrcIdx != -1 &&
856 AMDGPU::getRegBitWidth(Desc.operands()[SRsrcIdx].RegClass) == 256);
857 (void)SRsrcIdx;
858 }
859
860 if (TII->isFLAT(MI)) {
861 // There is no hazard if the instruction does not use vector regs
862 if (VDataIdx == -1)
863 return -1;
864
865 if (AMDGPU::getRegBitWidth(VDataRCID) > 64)
866 return VDataIdx;
867 }
868
869 return -1;
870}
871
872int
873GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def,
874 const MachineRegisterInfo &MRI) {
875 // Helper to check for the hazard where VMEM instructions that store more than
876 // 8 bytes can have their store data overwritten by the next instruction.
877 const SIRegisterInfo *TRI = ST.getRegisterInfo();
878
879 const int VALUWaitStates = ST.hasGFX940Insts() ? 2 : 1;
880 int WaitStatesNeeded = 0;
881
882 if (!TRI->isVectorRegister(MRI, Def.getReg()))
883 return WaitStatesNeeded;
884 Register Reg = Def.getReg();
885 auto IsHazardFn = [this, Reg, TRI](const MachineInstr &MI) {
886 int DataIdx = createsVALUHazard(MI);
887 return DataIdx >= 0 &&
888 TRI->regsOverlap(MI.getOperand(DataIdx).getReg(), Reg);
889 };
890
891 int WaitStatesNeededForDef =
892 VALUWaitStates - getWaitStatesSince(IsHazardFn, VALUWaitStates);
893 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
894
895 return WaitStatesNeeded;
896}
897
898/// Dest sel forwarding issue occurs if additional logic is needed to swizzle /
899/// pack the computed value into correct bit position of the dest register. This
900/// occurs if we have SDWA with dst_sel != DWORD or if we have op_sel with
901/// dst_sel that is not aligned to the register. This function analyzes the \p
902/// MI and \returns an operand with dst forwarding issue, or nullptr if
903/// none exists.
904static const MachineOperand *
905getDstSelForwardingOperand(const MachineInstr &MI, const GCNSubtarget &ST) {
906 if (!SIInstrInfo::isVALU(MI))
907 return nullptr;
908
909 const SIInstrInfo *TII = ST.getInstrInfo();
910
911 unsigned Opcode = MI.getOpcode();
912
913 // There are three different types of instructions
914 // which produce forwarded dest: 1. SDWA with dst_sel != DWORD, 2. VOP3
915 // which write hi bits (e.g. op_sel[3] == 1), and 3. FP8DstSelInst
916 // (instructions with dest byte sel, e.g. CVT_SR_BF8_F32) and
917 // op_sel[3:2]
918 // != 0
919 if (SIInstrInfo::isSDWA(MI)) {
920 // Type 1: SDWA with dst_sel != DWORD
921 if (auto *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel))
922 if (DstSel->getImm() != AMDGPU::SDWA::DWORD)
923 return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
924 }
925
926 AMDGPU::FPType IsFP4OrFP8ConvOpc = AMDGPU::getFPDstSelType(Opcode);
927 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::op_sel)) {
928 // Type 2: VOP3 which write the hi bits
929 if (TII->getNamedImmOperand(MI, AMDGPU::OpName::src0_modifiers) &
931 return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
932
933 // Type 3: FP8DstSelInst with op_sel[3:2] != 0)
934 if (IsFP4OrFP8ConvOpc == AMDGPU::FPType::FP8 &&
935 (TII->getNamedImmOperand(MI, AMDGPU::OpName::src2_modifiers) &
937 return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
938 }
939
940 // Special case: nop is required for all the opsel values for fp4 sr variant
941 // cvt scale instructions
942 if (IsFP4OrFP8ConvOpc == AMDGPU::FPType::FP4)
943 return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
944
945 return nullptr;
946}
947
948/// Checks whether the provided \p MI "consumes" the operand with a Dest sel
949/// forwarding issue \p Dst. We may "consume" the Dst via a standard explicit
950/// RAW, or through irregular ways (e.g. implicit RAW, certain types of WAW)
951static bool consumesDstSelForwardingOperand(const MachineInstr *VALU,
952 const MachineOperand *Dst,
953 const SIRegisterInfo *TRI) {
954 // We must consider implicit reads of the VALU. SDWA with dst_sel and
955 // UNUSED_PRESERVE will implicitly read the result from forwarded dest,
956 // and we must account for that hazard.
957 // We also must account for WAW hazards. In particular, WAW with dest
958 // preserve semantics (e.g. VOP3 with op_sel, VOP2 &&
959 // !zeroesHigh16BitsOfDest) will read the forwarded dest for parity
960 // check for ECC. Without accounting for this hazard, the ECC will be
961 // wrong.
962 // TODO: limit to RAW (including implicit reads) + problematic WAW (i.e.
963 // complete zeroesHigh16BitsOfDest)
964 for (auto &Operand : VALU->operands()) {
965 if (Operand.isReg() && TRI->regsOverlap(Dst->getReg(), Operand.getReg())) {
966 return true;
967 }
968 }
969 return false;
970}
971
972int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
973 int WaitStatesNeeded = 0;
974
975 if (ST.hasTransForwardingHazard()) {
976 const int TransDefWaitstates = 1;
977
978 auto IsTransDefFn = [this, VALU](const MachineInstr &MI) {
979 if (!SIInstrInfo::isTRANS(MI))
980 return false;
981 const SIRegisterInfo *TRI = ST.getRegisterInfo();
982 const SIInstrInfo *TII = ST.getInstrInfo();
983 Register Def = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)->getReg();
984
985 for (const MachineOperand &Use : VALU->explicit_uses()) {
986 if (Use.isReg() && TRI->regsOverlap(Def, Use.getReg()))
987 return true;
988 }
989
990 return false;
991 };
992
993 int WaitStatesNeededForDef =
994 TransDefWaitstates -
995 getWaitStatesSince(IsTransDefFn, TransDefWaitstates);
996 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
997 }
998
999 if (ST.hasDstSelForwardingHazard()) {
1000 const int Shift16DefWaitstates = 1;
1001
1002 auto IsShift16BitDefFn = [this, VALU](const MachineInstr &ProducerMI) {
1003 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1004 const MachineOperand *ForwardedDst =
1005 getDstSelForwardingOperand(ProducerMI, ST);
1006 if (ForwardedDst) {
1007 return consumesDstSelForwardingOperand(VALU, ForwardedDst, TRI);
1008 }
1009
1010 if (ProducerMI.isInlineAsm()) {
1011 // Assume inline asm has dst forwarding hazard
1012 for (auto &Def : ProducerMI.all_defs()) {
1013 if (consumesDstSelForwardingOperand(VALU, &Def, TRI))
1014 return true;
1015 }
1016 }
1017
1018 return false;
1019 };
1020
1021 int WaitStatesNeededForDef =
1022 Shift16DefWaitstates -
1023 getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates);
1024 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1025 }
1026
1027 if (ST.hasVDecCoExecHazard()) {
1028 const int VALUWriteSGPRVALUReadWaitstates = 2;
1029 const int VALUWriteEXECRWLane = 4;
1030 const int VALUWriteVGPRReadlaneRead = 1;
1031
1032 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1033 const MachineRegisterInfo &MRI = MF.getRegInfo();
1034 Register UseReg;
1035 auto IsVALUDefSGPRFn = [&UseReg, TRI](const MachineInstr &MI) {
1036 if (!SIInstrInfo::isVALU(MI))
1037 return false;
1038 return MI.modifiesRegister(UseReg, TRI);
1039 };
1040
1041 for (const MachineOperand &Use : VALU->explicit_uses()) {
1042 if (!Use.isReg())
1043 continue;
1044
1045 UseReg = Use.getReg();
1046 if (TRI->isSGPRReg(MRI, UseReg)) {
1047 int WaitStatesNeededForDef =
1048 VALUWriteSGPRVALUReadWaitstates -
1049 getWaitStatesSince(IsVALUDefSGPRFn,
1050 VALUWriteSGPRVALUReadWaitstates);
1051 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1052 }
1053 }
1054
1055 if (VALU->readsRegister(AMDGPU::VCC, TRI)) {
1056 UseReg = AMDGPU::VCC;
1057 int WaitStatesNeededForDef =
1058 VALUWriteSGPRVALUReadWaitstates -
1059 getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteSGPRVALUReadWaitstates);
1060 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1061 }
1062
1063 switch (VALU->getOpcode()) {
1064 case AMDGPU::V_READLANE_B32:
1065 case AMDGPU::V_READFIRSTLANE_B32: {
1066 MachineOperand *Src = TII.getNamedOperand(*VALU, AMDGPU::OpName::src0);
1067 UseReg = Src->getReg();
1068 int WaitStatesNeededForDef =
1069 VALUWriteVGPRReadlaneRead -
1070 getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteVGPRReadlaneRead);
1071 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1072 }
1073 [[fallthrough]];
1074 case AMDGPU::V_WRITELANE_B32: {
1075 UseReg = AMDGPU::EXEC;
1076 int WaitStatesNeededForDef =
1077 VALUWriteEXECRWLane -
1078 getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteEXECRWLane);
1079 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1080 break;
1081 }
1082 default:
1083 break;
1084 }
1085 }
1086
1087 // This checks for the hazard where VMEM instructions that store more than
1088 // 8 bytes can have their store data overwritten by the next instruction.
1089 if (!ST.has12DWordStoreHazard())
1090 return WaitStatesNeeded;
1091
1092 const MachineRegisterInfo &MRI = MF.getRegInfo();
1093
1094 for (const MachineOperand &Def : VALU->defs()) {
1095 WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Def, MRI));
1096 }
1097
1098 return WaitStatesNeeded;
1099}
1100
1101int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) {
1102 // This checks for hazards associated with inline asm statements.
1103 // Since inline asms can contain just about anything, we use this
1104 // to call/leverage other check*Hazard routines. Note that
1105 // this function doesn't attempt to address all possible inline asm
1106 // hazards (good luck), but is a collection of what has been
1107 // problematic thus far.
1108
1109 // see checkVALUHazards()
1110 if (!ST.has12DWordStoreHazard() &&
1111 !ST.hasDstSelForwardingHazard())
1112 return 0;
1113
1114 const MachineRegisterInfo &MRI = MF.getRegInfo();
1115 int WaitStatesNeeded = 0;
1116
1117 for (const MachineOperand &Op :
1119 if (Op.isReg() && Op.isDef()) {
1120 if (!TRI.isVectorRegister(MRI, Op.getReg()))
1121 continue;
1122
1123 if (ST.has12DWordStoreHazard()) {
1124 WaitStatesNeeded =
1125 std::max(WaitStatesNeeded, checkVALUHazardsHelper(Op, MRI));
1126 }
1127 }
1128 }
1129
1130 if (ST.hasDstSelForwardingHazard()) {
1131 const int Shift16DefWaitstates = 1;
1132
1133 auto IsShift16BitDefFn = [this, &IA](const MachineInstr &ProducerMI) {
1134 const MachineOperand *Dst = getDstSelForwardingOperand(ProducerMI, ST);
1135 // Assume inline asm reads the dst
1136 if (Dst)
1137 return IA->modifiesRegister(Dst->getReg(), &TRI) ||
1138 IA->readsRegister(Dst->getReg(), &TRI);
1139
1140 if (ProducerMI.isInlineAsm()) {
1141 // If MI is inline asm, assume it has dst forwarding hazard
1142 for (auto &Def : ProducerMI.all_defs()) {
1143 if (IA->modifiesRegister(Def.getReg(), &TRI) ||
1144 IA->readsRegister(Def.getReg(), &TRI)) {
1145 return true;
1146 }
1147 }
1148 }
1149
1150 return false;
1151 };
1152
1153 int WaitStatesNeededForDef =
1154 Shift16DefWaitstates -
1155 getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates);
1156 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1157 }
1158
1159 return WaitStatesNeeded;
1160}
1161
1162int GCNHazardRecognizer::checkRWLaneHazards(MachineInstr *RWLane) {
1163 const SIInstrInfo *TII = ST.getInstrInfo();
1164 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1165 const MachineRegisterInfo &MRI = MF.getRegInfo();
1166
1167 const MachineOperand *LaneSelectOp =
1168 TII->getNamedOperand(*RWLane, AMDGPU::OpName::src1);
1169
1170 if (!LaneSelectOp->isReg() || !TRI->isSGPRReg(MRI, LaneSelectOp->getReg()))
1171 return 0;
1172
1173 Register LaneSelectReg = LaneSelectOp->getReg();
1174 auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isVALU(MI); };
1175
1176 const int RWLaneWaitStates = 4;
1177 int WaitStatesSince = getWaitStatesSinceDef(LaneSelectReg, IsHazardFn,
1178 RWLaneWaitStates);
1179 return RWLaneWaitStates - WaitStatesSince;
1180}
1181
1182int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) {
1183 if (!ST.hasRFEHazards())
1184 return 0;
1185
1186 const SIInstrInfo *TII = ST.getInstrInfo();
1187
1188 const int RFEWaitStates = 1;
1189
1190 auto IsHazardFn = [TII](const MachineInstr &MI) {
1191 return getHWReg(TII, MI) == AMDGPU::Hwreg::ID_TRAPSTS;
1192 };
1193 int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, RFEWaitStates);
1194 return RFEWaitStates - WaitStatesNeeded;
1195}
1196
1197int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) {
1198 const SIInstrInfo *TII = ST.getInstrInfo();
1199 const int ReadM0WaitStates = 1;
1200 auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isSALU(MI); };
1201 return ReadM0WaitStates -
1202 getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn, ReadM0WaitStates);
1203}
1204
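// Apply all single-instruction hazard fixups to \p MI. Each fix* routine
// checks its own subtarget predicate and makes no changes when it does not
// apply.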
1205void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
1206 fixVMEMtoScalarWriteHazards(MI);
1207 fixVcmpxPermlaneHazards(MI);
1208 fixSMEMtoVectorWriteHazards(MI);
1209 fixVcmpxExecWARHazard(MI);
1210 fixLdsBranchVmemWARHazard(MI);
1211 if (ST.hasLdsDirect()) {
1212 fixLdsDirectVALUHazard(MI);
1213 fixLdsDirectVMEMHazard(MI);
1214 }
1215 fixVALUPartialForwardingHazard(MI);
1216 fixVALUTransUseHazard(MI);
1217 fixWMMAHazards(MI);
1218 fixShift64HighRegBug(MI);
1219 fixVALUMaskWriteHazard(MI);
1220 fixVALUReadSGPRHazard(MI);
1221 fixRequiredExportPriority(MI);
1222}
1223
1224static bool isVCmpXWritesExec(const SIInstrInfo &TII, const SIRegisterInfo &TRI,
1225 const MachineInstr &MI) {
1226 return (TII.isVOPC(MI) ||
1227 (MI.isCompare() && (TII.isVOP3(MI) || TII.isSDWA(MI)))) &&
1228 MI.modifiesRegister(AMDGPU::EXEC, &TRI);
1229}
1230
1231bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) {
1232 if (!ST.hasVcmpxPermlaneHazard() || !isPermlane(*MI))
1233 return false;
1234
1235 const SIInstrInfo *TII = ST.getInstrInfo();
1236 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1237 auto IsHazardFn = [TII, TRI](const MachineInstr &MI) {
1238 return isVCmpXWritesExec(*TII, *TRI, MI);
1239 };
1240
1241 auto IsExpiredFn = [](const MachineInstr &MI, int) {
1242 unsigned Opc = MI.getOpcode();
1243 return SIInstrInfo::isVALU(MI) && Opc != AMDGPU::V_NOP_e32 &&
1244 Opc != AMDGPU::V_NOP_e64 && Opc != AMDGPU::V_NOP_sdwa;
1245 };
1246
1247 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1248 std::numeric_limits<int>::max())
1249 return false;
1250
1251 // V_NOP will be discarded by SQ.
1252 // Use V_MOV_B32 v?, v?. Register must be alive so use src0 of V_PERMLANE*
1253 // which is always a VGPR and available.
1254 auto *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
1255 Register Reg = Src0->getReg();
1256 bool IsUndef = Src0->isUndef();
1257 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1258 TII->get(AMDGPU::V_MOV_B32_e32))
1259 .addReg(Reg, RegState::Define | (IsUndef ? RegState::Dead : 0))
1260 .addReg(Reg, IsUndef ? RegState::Undef : RegState::Kill);
1261
1262 return true;
1263}
1264
1265bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) {
1266 if (!ST.hasVMEMtoScalarWriteHazard())
1267 return false;
1269
1270 if (!SIInstrInfo::isSALU(*MI) && !SIInstrInfo::isSMRD(*MI))
1271 return false;
1272
1273 if (MI->getNumDefs() == 0)
1274 return false;
1275
1276 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1277
1278 auto IsHazardFn = [TRI, MI](const MachineInstr &I) {
1279 if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isFLAT(I) &&
1280 !SIInstrInfo::isDS(I))
1281 return false;
1282
1283 for (const MachineOperand &Def : MI->defs()) {
1284 const MachineOperand *Op =
1285 I.findRegisterUseOperand(Def.getReg(), TRI, false);
1286 if (!Op)
1287 continue;
1288 return true;
1289 }
1290 return false;
1291 };
1292
1293 auto IsExpiredFn = [](const MachineInstr &MI, int) {
1294 return SIInstrInfo::isVALU(MI) ||
1295 (MI.getOpcode() == AMDGPU::S_WAITCNT &&
1296 !MI.getOperand(0).getImm()) ||
1297 (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1298 AMDGPU::DepCtr::decodeFieldVmVsrc(MI.getOperand(0).getImm()) == 0);
1299 };
1300
1301 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1302 std::numeric_limits<int>::max())
1303 return false;
1304
1305 const SIInstrInfo *TII = ST.getInstrInfo();
1306 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1307 TII->get(AMDGPU::S_WAITCNT_DEPCTR))
1308 .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0));
1309 return true;
1310}
1311
1312bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) {
1313 if (!ST.hasSMEMtoVectorWriteHazard())
1314 return false;
1316
1317 if (!SIInstrInfo::isVALU(*MI))
1318 return false;
1319
1320 unsigned SDSTName;
1321 switch (MI->getOpcode()) {
1322 case AMDGPU::V_READLANE_B32:
1323 case AMDGPU::V_READFIRSTLANE_B32:
1324 SDSTName = AMDGPU::OpName::vdst;
1325 break;
1326 default:
1327 SDSTName = AMDGPU::OpName::sdst;
1328 break;
1329 }
1330
1331 const SIInstrInfo *TII = ST.getInstrInfo();
1332 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1333 const AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST.getCPU());
1334 const MachineOperand *SDST = TII->getNamedOperand(*MI, SDSTName);
1335 if (!SDST) {
1336 for (const auto &MO : MI->implicit_operands()) {
1337 if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegBaseClass(MO.getReg()))) {
1338 SDST = &MO;
1339 break;
1340 }
1341 }
1342 }
1343
1344 if (!SDST)
1345 return false;
1346
1347 const Register SDSTReg = SDST->getReg();
1348 auto IsHazardFn = [SDSTReg, TRI](const MachineInstr &I) {
1349 return SIInstrInfo::isSMRD(I) && I.readsRegister(SDSTReg, TRI);
1350 };
1351
1352 auto IsExpiredFn = [TII, IV](const MachineInstr &MI, int) {
1353 if (TII->isSALU(MI)) {
1354 switch (MI.getOpcode()) {
1355 case AMDGPU::S_SETVSKIP:
1356 case AMDGPU::S_VERSION:
1357 case AMDGPU::S_WAITCNT_VSCNT:
1358 case AMDGPU::S_WAITCNT_VMCNT:
1359 case AMDGPU::S_WAITCNT_EXPCNT:
1360 // These instructions cannot mitigate the hazard.
1361 return false;
1362 case AMDGPU::S_WAITCNT_LGKMCNT:
1363 // Reducing lgkmcnt count to 0 always mitigates the hazard.
1364 return (MI.getOperand(1).getImm() == 0) &&
1365 (MI.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
1366 case AMDGPU::S_WAITCNT: {
1367 const int64_t Imm = MI.getOperand(0).getImm();
1368 AMDGPU::Waitcnt Decoded = AMDGPU::decodeWaitcnt(IV, Imm);
1369 // DsCnt corresponds to LGKMCnt here.
1370 return (Decoded.DsCnt == 0);
1371 }
1372 default:
1373 // SOPP instructions cannot mitigate the hazard.
1374 if (TII->isSOPP(MI))
1375 return false;
1376 // At this point the SALU can be assumed to mitigate the hazard
1377 // because either:
1378 // (a) it is independent of the at risk SMEM (breaking chain),
1379 // or
1380 // (b) it is dependent on the SMEM, in which case an appropriate
1381 // s_waitcnt lgkmcnt _must_ exist between it and the at risk
1382 // SMEM instruction.
1383 return true;
1384 }
1385 }
1386 return false;
1387 };
1388
1389 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1390 std::numeric_limits<int>::max())
1391 return false;
1392
1393 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1394 TII->get(AMDGPU::S_MOV_B32), AMDGPU::SGPR_NULL)
1395 .addImm(0);
1396 return true;
1397}
1398
1399bool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr *MI) {
1400 if (!ST.hasVcmpxExecWARHazard())
1401 return false;
1403
1404 if (!SIInstrInfo::isVALU(*MI))
1405 return false;
1406
1407 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1408 if (!MI->modifiesRegister(AMDGPU::EXEC, TRI))
1409 return false;
1410
1411 auto IsHazardFn = [TRI](const MachineInstr &I) {
1412 if (!SIInstrInfo::isVALU(I))
1413 return false;
1414 return I.readsRegister(AMDGPU::EXEC, TRI);
1415 };
1416
1417 const SIInstrInfo *TII = ST.getInstrInfo();
1418 auto IsExpiredFn = [TII, TRI](const MachineInstr &MI, int) {
1419 if (SIInstrInfo::isVALU(MI)) {
1420 if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst))
1421 return true;
1422 for (auto MO : MI.implicit_operands())
1423 if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegBaseClass(MO.getReg())))
1424 return true;
1425 }
1426 if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1427 AMDGPU::DepCtr::decodeFieldSaSdst(MI.getOperand(0).getImm()) == 0)
1428 return true;
1429 return false;
1430 };
1431
1432 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1433 std::numeric_limits<int>::max())
1434 return false;
1435
1436 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1437 TII->get(AMDGPU::S_WAITCNT_DEPCTR))
1438 .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));
1439 return true;
1440}
1441
1442static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF,
1443 const GCNSubtarget &ST) {
1444 if (!ST.hasLdsBranchVmemWARHazard())
1445 return false;
1446
1447 // Check if the necessary condition for the hazard is met: both LDS and VMEM
1448 // instructions need to appear in the same function.
1449 bool HasLds = false;
1450 bool HasVmem = false;
1451 for (auto &MBB : MF) {
1452 for (auto &MI : MBB) {
1453 HasLds |= SIInstrInfo::isDS(MI);
1454 HasVmem |=
1456 if (HasLds && HasVmem)
1457 return true;
1458 }
1459 }
1460 return false;
1461}
1462
1463static bool isStoreCountWaitZero(const MachineInstr &I) {
1464 return I.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
1465 I.getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
1466 !I.getOperand(1).getImm();
1467}
1468
1469bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) {
1470 if (!RunLdsBranchVmemWARHazardFixup)
1471 return false;
1472
1475
1476 auto IsHazardInst = [](const MachineInstr &MI) {
1477 if (SIInstrInfo::isDS(MI))
1478 return 1;
1480 return 2;
1481 return 0;
1482 };
1483
1484 auto InstType = IsHazardInst(*MI);
1485 if (!InstType)
1486 return false;
1487
1488 auto IsExpiredFn = [&IsHazardInst](const MachineInstr &I, int) {
1489 return IsHazardInst(I) || isStoreCountWaitZero(I);
1490 };
1491
1492 auto IsHazardFn = [InstType, &IsHazardInst](const MachineInstr &I) {
1493 if (!I.isBranch())
1494 return false;
1495
1496 auto IsHazardFn = [InstType, IsHazardInst](const MachineInstr &I) {
1497 auto InstType2 = IsHazardInst(I);
1498 return InstType2 && InstType != InstType2;
1499 };
1500
1501 auto IsExpiredFn = [InstType, &IsHazardInst](const MachineInstr &I, int) {
1502 auto InstType2 = IsHazardInst(I);
1503 if (InstType == InstType2)
1504 return true;
1505
1506 return isStoreCountWaitZero(I);
1507 };
1508
1509 return ::getWaitStatesSince(IsHazardFn, &I, IsExpiredFn) !=
1510 std::numeric_limits<int>::max();
1511 };
1512
1513 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1514 std::numeric_limits<int>::max())
1515 return false;
1516
1517 const SIInstrInfo *TII = ST.getInstrInfo();
1518 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1519 TII->get(AMDGPU::S_WAITCNT_VSCNT))
1520 .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1521 .addImm(0);
1522
1523 return true;
1524}
1525
1526bool GCNHazardRecognizer::fixLdsDirectVALUHazard(MachineInstr *MI) {
1527 if (!SIInstrInfo::isLDSDIR(*MI))
1528 return false;
1529
1530 const int NoHazardWaitStates = 15;
1531 const MachineOperand *VDST = TII.getNamedOperand(*MI, AMDGPU::OpName::vdst);
1532 const Register VDSTReg = VDST->getReg();
1533
1534 bool VisitedTrans = false;
1535 auto IsHazardFn = [this, VDSTReg, &VisitedTrans](const MachineInstr &I) {
1536 if (!SIInstrInfo::isVALU(I))
1537 return false;
1538 VisitedTrans = VisitedTrans || SIInstrInfo::isTRANS(I);
1539 // Cover both WAR and WAW
1540 return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI);
1541 };
1542 auto IsExpiredFn = [&](const MachineInstr &I, int WaitStates) {
1543 if (WaitStates >= NoHazardWaitStates)
1544 return true;
1545 // Instructions which cause va_vdst==0 expire hazard
1548 };
1549 auto GetWaitStatesFn = [](const MachineInstr &MI) {
1550 return SIInstrInfo::isVALU(MI) ? 1 : 0;
1551 };
1552
1553 DenseSet<const MachineBasicBlock *> Visited;
1554 auto Count = ::getWaitStatesSince(IsHazardFn, MI->getParent(),
1555 std::next(MI->getReverseIterator()), 0,
1556 IsExpiredFn, Visited, GetWaitStatesFn);
1557
1558 // Transcendentals can execute in parallel to other VALUs.
1559 // This makes va_vdst count unusable with a mixture of VALU and TRANS.
1560 if (VisitedTrans)
1561 Count = 0;
1562
1563 MachineOperand *WaitVdstOp =
1564 TII.getNamedOperand(*MI, AMDGPU::OpName::waitvdst);
1565 WaitVdstOp->setImm(std::min(Count, NoHazardWaitStates));
1566
1567 return true;
1568}
1569
1570bool GCNHazardRecognizer::fixLdsDirectVMEMHazard(MachineInstr *MI) {
1571 if (!SIInstrInfo::isLDSDIR(*MI))
1572 return false;
1573
1574 const MachineOperand *VDST = TII.getNamedOperand(*MI, AMDGPU::OpName::vdst);
1575 const Register VDSTReg = VDST->getReg();
1576
1577 auto IsHazardFn = [this, VDSTReg](const MachineInstr &I) {
1580 return false;
1581 return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI);
1582 };
1583 bool LdsdirCanWait = ST.hasLdsWaitVMSRC();
1584 // TODO: On GFX12 the hazard should expire on S_WAIT_LOADCNT/SAMPLECNT/BVHCNT
1585 // according to the type of VMEM instruction.
1586 auto IsExpiredFn = [this, LdsdirCanWait](const MachineInstr &I, int) {
1588 (I.getOpcode() == AMDGPU::S_WAITCNT && !I.getOperand(0).getImm()) ||
1589 (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1590 AMDGPU::DepCtr::decodeFieldVmVsrc(I.getOperand(0).getImm()) == 0) ||
1591 (LdsdirCanWait && SIInstrInfo::isLDSDIR(I) &&
1592 !TII.getNamedOperand(I, AMDGPU::OpName::waitvsrc)->getImm());
1593 };
1594
1595 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1596 std::numeric_limits<int>::max())
1597 return false;
1598
1599 if (LdsdirCanWait) {
1600 TII.getNamedOperand(*MI, AMDGPU::OpName::waitvsrc)->setImm(0);
1601 } else {
1602 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1603 TII.get(AMDGPU::S_WAITCNT_DEPCTR))
1604 .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0));
1605 }
1606
1607 return true;
1608}
1609
1610bool GCNHazardRecognizer::fixVALUPartialForwardingHazard(MachineInstr *MI) {
1611 if (!ST.hasVALUPartialForwardingHazard())
1612 return false;
1614
1615 if (!ST.isWave64() || !SIInstrInfo::isVALU(*MI))
1616 return false;
1617
1619
1620 for (const MachineOperand &Use : MI->explicit_uses()) {
1621 if (Use.isReg() && TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
1622 SrcVGPRs.insert(Use.getReg());
1623 }
1624
1625 // Only applies with >= 2 unique VGPR sources
1626 if (SrcVGPRs.size() <= 1)
1627 return false;
1628
1629 // Look for the following pattern:
1630 // Va <- VALU [PreExecPos]
1631 // intv1
1632 // Exec <- SALU [ExecPos]
1633 // intv2
1634 // Vb <- VALU [PostExecPos]
1635 // intv3
1636 // MI Va, Vb (WaitState = 0)
1637 //
1638 // Where:
1639 // intv1 + intv2 <= 2 VALUs
1640 // intv3 <= 4 VALUs
1641 //
1642 // If found, insert an appropriate S_WAITCNT_DEPCTR before MI.
1643
1644 const int Intv1plus2MaxVALUs = 2;
1645 const int Intv3MaxVALUs = 4;
1646 const int IntvMaxVALUs = 6;
1647 const int NoHazardVALUWaitStates = IntvMaxVALUs + 2;
1648
1649 struct StateType {
1650 SmallDenseMap<Register, int, 4> DefPos;
1651 int ExecPos = std::numeric_limits<int>::max();
1652 int VALUs = 0;
1653 };
1654
1655 StateType State;
1656
1657 // This overloads expiry testing with all the hazard detection
1658 auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) {
1659 // Too many VALU states have passed
1660 if (State.VALUs > NoHazardVALUWaitStates)
1661 return HazardExpired;
1662
1663 // Instructions which cause va_vdst==0 expire hazard
1666 (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1667 AMDGPU::DepCtr::decodeFieldVaVdst(I.getOperand(0).getImm()) == 0))
1668 return HazardExpired;
1669
1670 // Track registers writes
1671 bool Changed = false;
1672 if (SIInstrInfo::isVALU(I)) {
1673 for (Register Src : SrcVGPRs) {
1674 if (!State.DefPos.count(Src) && I.modifiesRegister(Src, &TRI)) {
1675 State.DefPos[Src] = State.VALUs;
1676 Changed = true;
1677 }
1678 }
1679 } else if (SIInstrInfo::isSALU(I)) {
1680 if (State.ExecPos == std::numeric_limits<int>::max()) {
1681 if (!State.DefPos.empty() && I.modifiesRegister(AMDGPU::EXEC, &TRI)) {
1682 State.ExecPos = State.VALUs;
1683 Changed = true;
1684 }
1685 }
1686 }
1687
1688 // Early expiration: too many VALUs in intv3
1689 if (State.VALUs > Intv3MaxVALUs && State.DefPos.empty())
1690 return HazardExpired;
1691
1692 // Only evaluate state if something changed
1693 if (!Changed)
1694 return NoHazardFound;
1695
1696 // Determine positions of VALUs pre/post exec change
1697 if (State.ExecPos == std::numeric_limits<int>::max())
1698 return NoHazardFound;
1699
1700 int PreExecPos = std::numeric_limits<int>::max();
1701 int PostExecPos = std::numeric_limits<int>::max();
1702
1703 for (auto Entry : State.DefPos) {
1704 int DefVALUs = Entry.second;
1705 if (DefVALUs != std::numeric_limits<int>::max()) {
1706 if (DefVALUs >= State.ExecPos)
1707 PreExecPos = std::min(PreExecPos, DefVALUs);
1708 else
1709 PostExecPos = std::min(PostExecPos, DefVALUs);
1710 }
1711 }
1712
1713 // Need a VALUs post exec change
1714 if (PostExecPos == std::numeric_limits<int>::max())
1715 return NoHazardFound;
1716
1717 // Too many VALUs in intv3?
1718 int Intv3VALUs = PostExecPos;
1719 if (Intv3VALUs > Intv3MaxVALUs)
1720 return HazardExpired;
1721
1722 // Too many VALUs in intv2?
1723 int Intv2VALUs = (State.ExecPos - PostExecPos) - 1;
1724 if (Intv2VALUs > Intv1plus2MaxVALUs)
1725 return HazardExpired;
1726
1727 // Need a VALUs pre exec change
1728 if (PreExecPos == std::numeric_limits<int>::max())
1729 return NoHazardFound;
1730
1731 // Too many VALUs in intv1?
1732 int Intv1VALUs = PreExecPos - State.ExecPos;
1733 if (Intv1VALUs > Intv1plus2MaxVALUs)
1734 return HazardExpired;
1735
1736 // Too many VALUs in intv1 + intv2
1737 if (Intv1VALUs + Intv2VALUs > Intv1plus2MaxVALUs)
1738 return HazardExpired;
1739
1740 return HazardFound;
1741 };
1742 auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {
1743 if (SIInstrInfo::isVALU(MI))
1744 State.VALUs += 1;
1745 };
1746
1747 DenseSet<const MachineBasicBlock *> Visited;
1748 if (!hasHazard<StateType>(State, IsHazardFn, UpdateStateFn, MI->getParent(),
1749 std::next(MI->getReverseIterator()), Visited))
1750 return false;
1751
1752 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1753 TII.get(AMDGPU::S_WAITCNT_DEPCTR))
1754 .addImm(0x0fff);
1755
1756 return true;
1757}
1758
1759bool GCNHazardRecognizer::fixVALUTransUseHazard(MachineInstr *MI) {
1760 if (!ST.hasVALUTransUseHazard())
1761 return false;
1763
1764 if (!SIInstrInfo::isVALU(*MI))
1765 return false;
1766
1767 SmallSet<Register, 4> SrcVGPRs;
1768
1769 for (const MachineOperand &Use : MI->explicit_uses()) {
1770 if (Use.isReg() && TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
1771 SrcVGPRs.insert(Use.getReg());
1772 }
1773
1774 // Look for the following pattern:
1775 // Va <- TRANS VALU
1776 // intv
1777 // MI Va (WaitState = 0)
1778 //
1779 // Where:
1780 // intv <= 5 VALUs / 1 TRANS
1781 //
1782 // If found, insert an appropriate S_WAITCNT_DEPCTR before MI.
1783
1784 const int IntvMaxVALUs = 5;
1785 const int IntvMaxTRANS = 1;
1786
1787 struct StateType {
1788 int VALUs = 0;
1789 int TRANS = 0;
1790 };
1791
1792 StateType State;
1793
1794 // This overloads expiry testing with all the hazard detection
1795 auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) {
1796 // Too many VALU states have passed
1797 if (State.VALUs > IntvMaxVALUs || State.TRANS > IntvMaxTRANS)
1798 return HazardExpired;
1799
1800 // Instructions which cause va_vdst==0 expire hazard
1803 (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1804 I.getOperand(0).getImm() == 0x0fff))
1805 return HazardExpired;
1806
1807 // Track registers writes
1808 if (SIInstrInfo::isTRANS(I)) {
1809 for (Register Src : SrcVGPRs) {
1810 if (I.modifiesRegister(Src, &TRI)) {
1811 return HazardFound;
1812 }
1813 }
1814 }
1815
1816 return NoHazardFound;
1817 };
1818 auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {
1819 if (SIInstrInfo::isVALU(MI))
1820 State.VALUs += 1;
1821 if (SIInstrInfo::isTRANS(MI))
1822 State.TRANS += 1;
1823 };
1824
1825 DenseSet<const MachineBasicBlock *> Visited;
1826 if (!hasHazard<StateType>(State, IsHazardFn, UpdateStateFn, MI->getParent(),
1827 std::next(MI->getReverseIterator()), Visited))
1828 return false;
1829
1830 // Hazard is observed - insert a wait on va_dst counter to ensure hazard is
1831 // avoided.
1832 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1833 TII.get(AMDGPU::S_WAITCNT_DEPCTR))
1834 .addImm(AMDGPU::DepCtr::encodeFieldVaVdst(0));
1835
1836 return true;
1837}
1838
1839bool GCNHazardRecognizer::fixWMMAHazards(MachineInstr *MI) {
1840 if (!SIInstrInfo::isWMMA(*MI) && !SIInstrInfo::isSWMMAC(*MI))
1841 return false;
1842
1843 const SIInstrInfo *TII = ST.getInstrInfo();
1844 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1845
1846 auto IsHazardFn = [MI, TII, TRI, this](const MachineInstr &I) {
1847 if (!SIInstrInfo::isWMMA(I) && !SIInstrInfo::isSWMMAC(I))
1848 return false;
1849
1850 // Src0(matrix A) or Src1(matrix B) of the current wmma instruction overlaps
1851 // with the dest(matrix D) of the previous wmma.
1852 const Register CurSrc0Reg =
1853 TII->getNamedOperand(*MI, AMDGPU::OpName::src0)->getReg();
1854 const Register CurSrc1Reg =
1855 TII->getNamedOperand(*MI, AMDGPU::OpName::src1)->getReg();
1856
1857 const Register PrevDstReg =
1858 TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg();
1859
1860 if (TRI->regsOverlap(PrevDstReg, CurSrc0Reg) ||
1861 TRI->regsOverlap(PrevDstReg, CurSrc1Reg)) {
1862 return true;
1863 }
1864
1865 // GFX12+ allows overlap of matrix C with PrevDstReg (hardware will stall)
1866 // but Index can't overlap with PrevDstReg.
1867 if (AMDGPU::isGFX12Plus(ST)) {
1868 if (SIInstrInfo::isSWMMAC(*MI)) {
1869 const Register CurIndex =
1870 TII->getNamedOperand(*MI, AMDGPU::OpName::src2)->getReg();
1871 if (TRI->regsOverlap(PrevDstReg, CurIndex))
1872 return true;
1873 }
1874 return false;
1875 }
1876
1877 return false;
1878 };
1879
1880 auto IsExpiredFn = [](const MachineInstr &I, int) {
1881 return SIInstrInfo::isVALU(I);
1882 };
1883
1884 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1885 std::numeric_limits<int>::max())
1886 return false;
1887
1888 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::V_NOP_e32));
1889
1890 return true;
1891}
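// Illustrative back-to-back WMMA pair (assumed encoding, not from this file):
//   v_wmma_f32_16x16x16_f16 v[0:7], v[8:15], v[16:23], v[0:7]
//   v_wmma_f32_16x16x16_f16 v[24:31], v[0:7], v[16:23], v[24:31]
// The second WMMA reads the first one's D matrix (v[0:7]) as its A matrix,
// so fixWMMAHazards() above inserts a V_NOP between them unless another VALU
// already separates the pair.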
1892
1893bool GCNHazardRecognizer::fixShift64HighRegBug(MachineInstr *MI) {
1894 if (!ST.hasShift64HighRegBug())
1895 return false;
1896 assert(!ST.hasExtendedWaitCounts());
1897
1898 switch (MI->getOpcode()) {
1899 default:
1900 return false;
1901 case AMDGPU::V_LSHLREV_B64_e64:
1902 case AMDGPU::V_LSHRREV_B64_e64:
1903 case AMDGPU::V_ASHRREV_I64_e64:
1904 break;
1905 }
1906
1907 MachineOperand *Amt = TII.getNamedOperand(*MI, AMDGPU::OpName::src0);
1908 if (!Amt->isReg())
1909 return false;
1910
1911 Register AmtReg = Amt->getReg();
1912 const MachineRegisterInfo &MRI = MF.getRegInfo();
1913 // Check if this is the last VGPR in the allocation block.
1914 if (!TRI.isVGPR(MRI, AmtReg) || ((AmtReg - AMDGPU::VGPR0) & 7) != 7)
1915 return false;
1916
1917 if (AmtReg != AMDGPU::VGPR255 && MRI.isPhysRegUsed(AmtReg + 1))
1918 return false;
1919
1920 MachineOperand *Src1 = TII.getNamedOperand(*MI, AMDGPU::OpName::src1);
1921 bool OverlappedSrc = Src1->isReg() && TRI.regsOverlap(Src1->getReg(), AmtReg);
1922 bool OverlappedDst = MI->modifiesRegister(AmtReg, &TRI);
1923 bool Overlapped = OverlappedSrc || OverlappedDst;
1924
1925 assert(!OverlappedDst || !OverlappedSrc ||
1926 Src1->getReg() == MI->getOperand(0).getReg());
1927 assert(ST.needsAlignedVGPRs());
1928 static_assert(AMDGPU::VGPR0 + 1 == AMDGPU::VGPR1);
1929
1930 Register NewReg;
1931 for (MCRegister Reg : Overlapped ? AMDGPU::VReg_64_Align2RegClass
1932 : AMDGPU::VGPR_32RegClass) {
1933 if (!MI->modifiesRegister(Reg, &TRI) && !MI->readsRegister(Reg, &TRI)) {
1934 NewReg = Reg;
1935 break;
1936 }
1937 }
1938
1939 Register NewAmt = Overlapped ? (Register)TRI.getSubReg(NewReg, AMDGPU::sub1)
1940 : NewReg;
1941 Register NewAmtLo;
1942
1943 if (Overlapped)
1944 NewAmtLo = TRI.getSubReg(NewReg, AMDGPU::sub0);
1945
1946 DebugLoc DL = MI->getDebugLoc();
1947 MachineBasicBlock *MBB = MI->getParent();
1948 // Insert a full waitcnt because the found register might be pending a wait.
1949 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_WAITCNT))
1950 .addImm(0);
1951
1952 // Insert V_SWAP_B32 instruction(s) and run hazard recognizer on them.
1953 if (Overlapped)
1954 runOnInstruction(
1955 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SWAP_B32), NewAmtLo)
1956 .addDef(AmtReg - 1)
1957 .addReg(AmtReg - 1, RegState::Undef)
1958 .addReg(NewAmtLo, RegState::Undef));
1959 runOnInstruction(BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SWAP_B32), NewAmt)
1960 .addDef(AmtReg)
1961 .addReg(AmtReg, RegState::Undef)
1962 .addReg(NewAmt, RegState::Undef));
1963
1964 // Instructions emitted after the current instruction will be processed by the
1965 // parent loop of the hazard recognizer in a natural way.
1966 BuildMI(*MBB, std::next(MI->getIterator()), DL, TII.get(AMDGPU::V_SWAP_B32),
1967 AmtReg)
1968 .addDef(NewAmt)
1969 .addReg(NewAmt)
1970 .addReg(AmtReg);
1971 if (Overlapped)
1972 BuildMI(*MBB, std::next(MI->getIterator()), DL, TII.get(AMDGPU::V_SWAP_B32),
1973 AmtReg - 1)
1974 .addDef(NewAmtLo)
1975 .addReg(NewAmtLo)
1976 .addReg(AmtReg - 1);
1977
1978 // Re-running the hazard recognizer on the modified instruction is not
1979 // necessary; the inserted V_SWAP_B32 has already both read and written the
1980 // new registers, so hazards related to them have already been handled.
1981 Amt->setReg(NewAmt);
1982 Amt->setIsKill(false);
1983 // We do not update liveness, so verifier may see it as undef.
1984 Amt->setIsUndef();
1985 if (OverlappedDst)
1986 MI->getOperand(0).setReg(NewReg);
1987 if (OverlappedSrc) {
1988 Src1->setReg(NewReg);
1989 Src1->setIsKill(false);
1990 Src1->setIsUndef();
1991 }
1992
1993 return true;
1994}
1995
1996int GCNHazardRecognizer::checkNSAtoVMEMHazard(MachineInstr *MI) {
1997 int NSAtoVMEMWaitStates = 1;
1998
1999 if (!ST.hasNSAtoVMEMBug())
2000 return 0;
2001
2002 if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isMTBUF(*MI))
2003 return 0;
2004
2005 const SIInstrInfo *TII = ST.getInstrInfo();
2006 const auto *Offset = TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
2007 if (!Offset || (Offset->getImm() & 6) == 0)
2008 return 0;
2009
2010 auto IsHazardFn = [TII](const MachineInstr &I) {
2011 if (!SIInstrInfo::isMIMG(I))
2012 return false;
2013 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(I.getOpcode());
2014 return Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA &&
2015 TII->getInstSizeInBytes(I) >= 16;
2016 };
2017
2018 return NSAtoVMEMWaitStates - getWaitStatesSince(IsHazardFn, 1);
2019}
2020
2021int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(MachineInstr *MI) {
2022 int FPAtomicToDenormModeWaitStates = 3;
2023
2024 if (!ST.hasFPAtomicToDenormModeHazard())
2025 return 0;
2026 assert(!ST.hasExtendedWaitCounts());
2027
2028 if (MI->getOpcode() != AMDGPU::S_DENORM_MODE)
2029 return 0;
2030
2031 auto IsHazardFn = [](const MachineInstr &I) {
2032 if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isFLAT(I))
2033 return false;
2034 return SIInstrInfo::isFPAtomic(I);
2035 };
2036
2037 auto IsExpiredFn = [](const MachineInstr &MI, int WaitStates) {
2038 if (WaitStates >= 3 || SIInstrInfo::isVALU(MI))
2039 return true;
2040
2041 switch (MI.getOpcode()) {
2042 case AMDGPU::S_WAITCNT:
2043 case AMDGPU::S_WAITCNT_VSCNT:
2044 case AMDGPU::S_WAITCNT_VMCNT:
2045 case AMDGPU::S_WAITCNT_EXPCNT:
2046 case AMDGPU::S_WAITCNT_LGKMCNT:
2047 case AMDGPU::S_WAIT_IDLE:
2048 return true;
2049 default:
2050 break;
2051 }
2052
2053 return false;
2054 };
2055
2056 return FPAtomicToDenormModeWaitStates -
2057 ::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn);
2058}
2059
2060int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) {
2061 assert(SIInstrInfo::isMAI(*MI));
2062
2063 return ST.hasGFX90AInsts() ? checkMAIHazards90A(MI) : checkMAIHazards908(MI);
2064}
2065
2066int GCNHazardRecognizer::checkMFMAPadding(MachineInstr *MI) {
2067 // Early exit if no padding is requested.
2068 if (MFMAPaddingRatio == 0)
2069 return 0;
2070
2071 auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
2072 if (!SIInstrInfo::isMFMA(*MI) || MFI->getOccupancy() < 2)
2073 return 0;
2074
2075 int NeighborMFMALatency = 0;
2076 auto IsNeighboringMFMA = [&NeighborMFMALatency,
2077 this](const MachineInstr &MI) {
2078 if (!SIInstrInfo::isMFMA(MI))
2079 return false;
2080
2081 NeighborMFMALatency = this->getMFMAPipelineWaitStates(MI);
2082 return true;
2083 };
2084
2085 const int MaxMFMAPipelineWaitStates = 16;
2086 int WaitStatesSinceNeighborMFMA =
2087 getWaitStatesSince(IsNeighboringMFMA, MaxMFMAPipelineWaitStates);
2088
2089 int NeighborMFMAPaddingNeeded =
2090 (NeighborMFMALatency * MFMAPaddingRatio / 100) -
2091 WaitStatesSinceNeighborMFMA;
2092
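 // Worked example (for illustration): with amdgpu-mfma-padding-ratio=50, a
 // neighboring MFMA latency of 16 and 3 wait states already elapsed,
 // 16 * 50 / 100 - 3 = 5 additional wait states of padding are requested.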
2093 return std::max(0, NeighborMFMAPaddingNeeded);
2094}
2095
2096int GCNHazardRecognizer::checkMAIHazards908(MachineInstr *MI) {
2097 int WaitStatesNeeded = 0;
2098 unsigned Opc = MI->getOpcode();
2099
2100 auto IsVALUFn = [](const MachineInstr &MI) {
2101 return SIInstrInfo::isVALU(MI) || MI.isInlineAsm();
2102 };
2103
2104 if (Opc != AMDGPU::V_ACCVGPR_READ_B32_e64) { // MFMA or v_accvgpr_write
2105 const int LegacyVALUWritesVGPRWaitStates = 2;
2106 const int VALUWritesExecWaitStates = 4;
2107 const int MaxWaitStates = 4;
2108
2109 int WaitStatesNeededForUse = VALUWritesExecWaitStates -
2110 getWaitStatesSinceDef(AMDGPU::EXEC, IsVALUFn, MaxWaitStates);
2111 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2112
2113 if (WaitStatesNeeded < MaxWaitStates) {
2114 for (const MachineOperand &Use : MI->explicit_uses()) {
2115 const int MaxWaitStates = 2;
2116
2117 if (!Use.isReg() || !TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
2118 continue;
2119
2120 int WaitStatesNeededForUse = LegacyVALUWritesVGPRWaitStates -
2121 getWaitStatesSinceDef(Use.getReg(), IsVALUFn, MaxWaitStates);
2122 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2123
2124 if (WaitStatesNeeded == MaxWaitStates)
2125 break;
2126 }
2127 }
2128 }
2129
2130 for (const MachineOperand &Op : MI->explicit_operands()) {
2131 if (!Op.isReg() || !TRI.isAGPR(MF.getRegInfo(), Op.getReg()))
2132 continue;
2133
2134 if (Op.isDef() && Opc != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
2135 continue;
2136
2137 const int MFMAWritesAGPROverlappedSrcABWaitStates = 4;
2138 const int MFMAWritesAGPROverlappedSrcCWaitStates = 2;
2139 const int MFMA4x4WritesAGPRAccVgprReadWaitStates = 4;
2140 const int MFMA16x16WritesAGPRAccVgprReadWaitStates = 10;
2141 const int MFMA32x32WritesAGPRAccVgprReadWaitStates = 18;
2142 const int MFMA4x4WritesAGPRAccVgprWriteWaitStates = 1;
2143 const int MFMA16x16WritesAGPRAccVgprWriteWaitStates = 7;
2144 const int MFMA32x32WritesAGPRAccVgprWriteWaitStates = 15;
2145 const int MaxWaitStates = 18;
2146 Register Reg = Op.getReg();
2147 unsigned HazardDefLatency = 0;
2148
2149 auto IsOverlappedMFMAFn = [Reg, &HazardDefLatency,
2150 this](const MachineInstr &MI) {
2151 if (!SIInstrInfo::isMFMA(MI))
2152 return false;
2153 Register DstReg = MI.getOperand(0).getReg();
2154 if (DstReg == Reg)
2155 return false;
2156 HazardDefLatency =
2157 std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
2158 return TRI.regsOverlap(DstReg, Reg);
2159 };
2160
2161 int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn,
2162 MaxWaitStates);
2163 int NeedWaitStates = MFMAWritesAGPROverlappedSrcABWaitStates;
2164 int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
2165 int OpNo = Op.getOperandNo();
2166 if (OpNo == SrcCIdx) {
2167 NeedWaitStates = MFMAWritesAGPROverlappedSrcCWaitStates;
2168 } else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64) {
2169 switch (HazardDefLatency) {
2170 case 2: NeedWaitStates = MFMA4x4WritesAGPRAccVgprReadWaitStates;
2171 break;
2172 case 8: NeedWaitStates = MFMA16x16WritesAGPRAccVgprReadWaitStates;
2173 break;
2174 case 16: [[fallthrough]];
2175 default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprReadWaitStates;
2176 break;
2177 }
2178 } else if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
2179 switch (HazardDefLatency) {
2180 case 2: NeedWaitStates = MFMA4x4WritesAGPRAccVgprWriteWaitStates;
2181 break;
2182 case 8: NeedWaitStates = MFMA16x16WritesAGPRAccVgprWriteWaitStates;
2183 break;
2184 case 16: [[fallthrough]];
2185 default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprWriteWaitStates;
2186 break;
2187 }
2188 }
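 // The 2, 8 and 16 cycle producer latencies in the switches above correspond
 // to the 4x4, 16x16 and 32x32 MFMA variants named in the wait-state
 // constants.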
2189
2190 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2191 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2192
2193 if (WaitStatesNeeded == MaxWaitStates)
2194 return WaitStatesNeeded; // Early exit.
2195
2196 auto IsAccVgprWriteFn = [Reg, this](const MachineInstr &MI) {
2197 if (MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
2198 return false;
2199 Register DstReg = MI.getOperand(0).getReg();
2200 return TRI.regsOverlap(Reg, DstReg);
2201 };
2202
2203 const int AccVGPRWriteMFMAReadSrcCWaitStates = 1;
2204 const int AccVGPRWriteMFMAReadSrcABWaitStates = 3;
2205 const int AccVGPRWriteAccVgprReadWaitStates = 3;
2206 NeedWaitStates = AccVGPRWriteMFMAReadSrcABWaitStates;
2207 if (OpNo == SrcCIdx)
2208 NeedWaitStates = AccVGPRWriteMFMAReadSrcCWaitStates;
2209 else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64)
2210 NeedWaitStates = AccVGPRWriteAccVgprReadWaitStates;
2211
2212 WaitStatesNeededForUse = NeedWaitStates -
2213 getWaitStatesSinceDef(Reg, IsAccVgprWriteFn, MaxWaitStates);
2214 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2215
2216 if (WaitStatesNeeded == MaxWaitStates)
2217 return WaitStatesNeeded; // Early exit.
2218 }
2219
2220 if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
2221 const int MFMA4x4ReadSrcCAccVgprWriteWaitStates = 0;
2222 const int MFMA16x16ReadSrcCAccVgprWriteWaitStates = 5;
2223 const int MFMA32x32ReadSrcCAccVgprWriteWaitStates = 13;
2224 const int MaxWaitStates = 13;
2225 Register DstReg = MI->getOperand(0).getReg();
2226 unsigned HazardDefLatency = 0;
2227
2228 auto IsSrcCMFMAFn = [DstReg, &HazardDefLatency,
2229 this](const MachineInstr &MI) {
2230 if (!SIInstrInfo::isMFMA(MI))
2231 return false;
2232 Register Reg = TII.getNamedOperand(MI, AMDGPU::OpName::src2)->getReg();
2233 HazardDefLatency =
2234 std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
2235 return TRI.regsOverlap(Reg, DstReg);
2236 };
2237
2238 int WaitStatesSince = getWaitStatesSince(IsSrcCMFMAFn, MaxWaitStates);
2239 int NeedWaitStates;
2240 switch (HazardDefLatency) {
2241 case 2: NeedWaitStates = MFMA4x4ReadSrcCAccVgprWriteWaitStates;
2242 break;
2243 case 8: NeedWaitStates = MFMA16x16ReadSrcCAccVgprWriteWaitStates;
2244 break;
2245 case 16: [[fallthrough]];
2246 default: NeedWaitStates = MFMA32x32ReadSrcCAccVgprWriteWaitStates;
2247 break;
2248 }
2249
2250 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSince;
2251 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2252 }
2253
2254 // Pad neighboring MFMA with noops for better inter-wave performance.
2255 WaitStatesNeeded = std::max(WaitStatesNeeded, checkMFMAPadding(MI));
2256
2257 return WaitStatesNeeded;
2258}
2259
2260static int
2261GFX940_XDL_N_PassWritesVGPROverlappedXDLOrSMFMASrcCWaitStates(int NumPasses,
2262 bool IsGFX950) {
2263 // xdl def cycles | gfx940 | gfx950
2264 // 2 pass | 3 | 4
2265 // 4 pass | 5 | 6
2266 // 8 pass | 9 | 10
2267 // 16 pass | 17 | 18
2268 return NumPasses + 1 + IsGFX950;
2269}
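// For example, per the table above, an 8-pass producer requires
// 8 + 1 = 9 wait states on gfx940 and 8 + 1 + 1 = 10 on gfx950.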
2270
2271static int
2272GFX940_XDL_N_PassWritesVGPROverlappedSGEMMDGEMMSrcCWaitStates(int NumPasses,
2273 bool IsGFX950) {
2274 // xdl def cycles | gfx940 | gfx950
2275 // 2 pass | 3 | 3
2276 // 4 pass | 5 | 6
2277 // 8 pass | 9 | 10
2278 // 16 pass | 17 | 18
2279 return NumPasses + 1 + (NumPasses != 2 && IsGFX950);
2280}
2281
2282static int
2283GFX940_SMFMA_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(int NumPasses) {
2284 // 2 pass -> 2
2285 // 4 pass -> 4
2286 // 8 pass -> 8
2287 // 16 pass -> 16
2288 return NumPasses;
2289}
2290
2291static int
2292GFX940_SMFMA_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses) {
2293 // 2 pass -> 4
2294 // 4 pass -> 6
2295 // 8 pass -> 10
2296 // 16 pass -> 18
2297 return NumPasses + 2;
2298}
2299
2300static int GFX940_XDL_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses) {
2301 // 2 pass -> 5
2302 // 4 pass -> 7
2303 // 8 pass -> 11
2304 // 16 pass -> 19
2305 return NumPasses + 3;
2306}
2307
2308int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) {
2309 int WaitStatesNeeded = 0;
2310 unsigned Opc = MI->getOpcode();
2311
2312 auto IsLegacyVALUFn = [](const MachineInstr &MI) {
2313 return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMFMA(MI);
2314 };
2315
2316 auto IsLegacyVALUNotDotFn = [](const MachineInstr &MI) {
2317 return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMFMA(MI) &&
2318 !SIInstrInfo::isDOT(MI);
2319 };
2320
2321 if (!SIInstrInfo::isMFMA(*MI))
2322 return WaitStatesNeeded;
2323
2324 const int VALUWritesExecWaitStates = 4;
2325 int WaitStatesNeededForUse = VALUWritesExecWaitStates -
2326 getWaitStatesSinceDef(AMDGPU::EXEC, IsLegacyVALUFn,
2327 VALUWritesExecWaitStates);
2328 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2329
2330 int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
2331
2332 // Loop for both DGEMM and S/HGEMM 2nd instruction.
2333 for (const MachineOperand &Use : MI->explicit_uses()) {
2334 const int LegacyVALUNotDotWritesVGPRWaitStates = 2;
2335 const int SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates = 2;
2336 const int SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates = 8;
2337 const int SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates = 16;
2338 const int SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates = 3;
2339 const int SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates = 9;
2340 const int SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates = 17;
2341 const int DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 9;
2342 const int GFX950_DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 17;
2343 const int DMFMA4x4WritesVGPROverlappedSrcCWaitStates = 4;
2344 const int SMFMA4x4WritesVGPROverlappedSrcABWaitStates = 5;
2345 const int SMFMA16x16WritesVGPROverlappedSrcABWaitStates = 11;
2346 const int SMFMA32x32WritesVGPROverlappedSrcABWaitStates = 19;
2347 const int DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates = 6;
2348 const int DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 11;
2349 const int GFX950_DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 19;
2350 const int DMFMA4x4WritesVGPRFullSrcCWaitStates = 4;
2351 const int GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates = 2;
2352 const int MaxWaitStates = 19;
2353
2354 if (!Use.isReg())
2355 continue;
2356 Register Reg = Use.getReg();
2357 bool FullReg;
2358 const MachineInstr *MI1;
2359
2360 auto IsOverlappedMFMAFn = [Reg, &FullReg, &MI1,
2361 this](const MachineInstr &MI) {
2362 if (!SIInstrInfo::isMFMA(MI))
2363 return false;
2364 Register DstReg = MI.getOperand(0).getReg();
2365 FullReg = (DstReg == Reg);
2366 MI1 = &MI;
2367 return TRI.regsOverlap(DstReg, Reg);
2368 };
2369
2370 WaitStatesNeededForUse = LegacyVALUNotDotWritesVGPRWaitStates -
2371 getWaitStatesSinceDef(Reg, IsLegacyVALUNotDotFn, MaxWaitStates);
2372 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2373
2374 int NumWaitStates =
2375 getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn, MaxWaitStates);
2376 if (NumWaitStates == std::numeric_limits<int>::max())
2377 continue;
2378
2379 int OpNo = Use.getOperandNo();
2380 unsigned Opc1 = MI1->getOpcode();
2381 int NeedWaitStates = 0;
2382 if (OpNo == SrcCIdx) {
2383 if (!isDGEMM(Opc) && (!ST.hasGFX940Insts() && isDGEMM(Opc1))) {
2384 NeedWaitStates = 0;
2385 } else if (FullReg) {
2386 if ((Opc == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
2387 Opc == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64) &&
2388 (Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
2389 Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64))
2390 NeedWaitStates = DMFMA4x4WritesVGPRFullSrcCWaitStates;
2391 else if (ST.hasGFX940Insts() &&
2392 TSchedModel.computeInstrLatency(MI1) == 2)
2393 NeedWaitStates = GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates;
2394 } else {
2395 switch (Opc1) {
2396 case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
2397 case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
2398 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
2399 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
2400 if (!isXDL(ST, *MI))
2401 NeedWaitStates =
2402 ST.hasGFX950Insts()
2403 ? GFX950_DMFMA16x16WritesVGPROverlappedSrcCWaitStates
2404 : DMFMA16x16WritesVGPROverlappedSrcCWaitStates;
2405 break;
2406 case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
2407 case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
2408 if (!isXDL(ST, *MI))
2409 NeedWaitStates = DMFMA4x4WritesVGPROverlappedSrcCWaitStates;
2410 break;
2411 default:
2412 int NumPasses = TSchedModel.computeInstrLatency(MI1);
2413 if (ST.hasGFX940Insts()) {
2414 if (isXDL(ST, *MI) && !isXDL(ST, *MI1))
2415 break;
2416
2417 NeedWaitStates =
2418 isXDL(ST, *MI1)
2419 ? (isXDL(ST, *MI)
2420 ? GFX940_XDL_N_PassWritesVGPROverlappedXDLOrSMFMASrcCWaitStates(
2421 NumPasses, ST.hasGFX950Insts())
2422 : GFX940_XDL_N_PassWritesVGPROverlappedSGEMMDGEMMSrcCWaitStates(
2423 NumPasses, ST.hasGFX950Insts()))
2424 : GFX940_SMFMA_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(
2425 NumPasses);
2426 break;
2427 }
2428
2429 switch (NumPasses) {
2430 case 2:
2431 NeedWaitStates =
2432 isDGEMM(Opc) ? SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates
2433 : SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates;
2434 break;
2435 case 8:
2436 NeedWaitStates =
2437 isDGEMM(Opc)
2438 ? SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates
2439 : SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates;
2440 break;
2441 case 16:
2442 NeedWaitStates =
2443 isDGEMM(Opc)
2444 ? SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates
2445 : SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates;
2446 break;
2447 default:
2448 llvm_unreachable("unexpected number of passes");
2449 }
2450 }
2451 }
2452 } else {
2453 switch (Opc1) {
2454 case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
2455 case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
2456 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
2457 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
2458 NeedWaitStates =
2459 ST.hasGFX950Insts()
2460 ? GFX950_DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates
2461 : DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates;
2462 break;
2463 case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
2464 case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
2465 NeedWaitStates = DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates;
2466 break;
2467 default:
2468 int NumPasses = TSchedModel.computeInstrLatency(MI1);
2469
2470 if (ST.hasGFX940Insts()) {
2471 NeedWaitStates =
2472 isXDL(ST, *MI1)
2473 ? GFX940_XDL_N_PassWritesVGPROverlappedSrcABWaitStates(
2474 NumPasses)
2475 : GFX940_SMFMA_N_PassWritesVGPROverlappedSrcABWaitStates(
2476 NumPasses);
2477 break;
2478 }
2479
2480 switch (NumPasses) {
2481 case 2:
2482 NeedWaitStates = SMFMA4x4WritesVGPROverlappedSrcABWaitStates;
2483 break;
2484 case 4:
2485 llvm_unreachable("unexpected number of passes for mfma");
2486 case 8:
2487 NeedWaitStates = SMFMA16x16WritesVGPROverlappedSrcABWaitStates;
2488 break;
2489 case 16:
2490 default:
2491 NeedWaitStates = SMFMA32x32WritesVGPROverlappedSrcABWaitStates;
2492 }
2493 }
2494 }
2495 if (WaitStatesNeeded >= NeedWaitStates)
2496 continue;
2497
2498 WaitStatesNeededForUse = NeedWaitStates - NumWaitStates;
2499 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2500
2501 if (WaitStatesNeeded == MaxWaitStates)
2502 break;
2503 }
2504
2505 // Pad neighboring MFMA with noops for better inter-wave performance.
2506 WaitStatesNeeded = std::max(WaitStatesNeeded, checkMFMAPadding(MI));
2507
2508 return WaitStatesNeeded;
2509}
2510
2511int GCNHazardRecognizer::checkMAILdStHazards(MachineInstr *MI) {
2512 // On gfx90a+ relevant hazards are checked in checkMAIVALUHazards()
2513 if (!ST.hasMAIInsts() || ST.hasGFX90AInsts())
2514 return 0;
2515
2516 int WaitStatesNeeded = 0;
2517
2518 auto IsAccVgprReadFn = [](const MachineInstr &MI) {
2519 return MI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64;
2520 };
2521
2522 for (const MachineOperand &Op : MI->explicit_uses()) {
2523 if (!Op.isReg() || !TRI.isVGPR(MF.getRegInfo(), Op.getReg()))
2524 continue;
2525
2526 Register Reg = Op.getReg();
2527
2528 const int AccVgprReadLdStWaitStates = 2;
2529 const int VALUWriteAccVgprRdWrLdStDepVALUWaitStates = 1;
2530 const int MaxWaitStates = 2;
2531
2532 int WaitStatesNeededForUse = AccVgprReadLdStWaitStates -
2533 getWaitStatesSinceDef(Reg, IsAccVgprReadFn, MaxWaitStates);
2534 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2535
2536 if (WaitStatesNeeded == MaxWaitStates)
2537 return WaitStatesNeeded; // Early exit.
2538
2539 auto IsVALUAccVgprRdWrCheckFn = [Reg, this](const MachineInstr &MI) {
2540 if (MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64 &&
2541 MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
2542 return false;
2543 auto IsVALUFn = [](const MachineInstr &MI) {
2544 return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMFMA(MI);
2545 };
2546 return getWaitStatesSinceDef(Reg, IsVALUFn, 2 /*MaxWaitStates*/) <
2547 std::numeric_limits<int>::max();
2548 };
2549
2550 WaitStatesNeededForUse = VALUWriteAccVgprRdWrLdStDepVALUWaitStates -
2551 getWaitStatesSince(IsVALUAccVgprRdWrCheckFn, MaxWaitStates);
2552 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2553 }
2554
2555 return WaitStatesNeeded;
2556}
2557
2558int GCNHazardRecognizer::checkPermlaneHazards(MachineInstr *MI) {
2560 "this is a different vcmpx+permlane hazard");
2561 const SIRegisterInfo *TRI = ST.getRegisterInfo();
2562 const SIInstrInfo *TII = ST.getInstrInfo();
2563
2564 auto IsVCmpXWritesExecFn = [TII, TRI](const MachineInstr &MI) {
2565 return isVCmpXWritesExec(*TII, *TRI, MI);
2566 };
2567
2568 auto IsVALUFn = [](const MachineInstr &MI) {
2569 return SIInstrInfo::isVALU(MI);
2570 };
2571
2572 const int VCmpXWritesExecWaitStates = 4;
2573 const int VALUWritesVDstWaitStates = 2;
2574 int WaitStatesNeeded = 0;
2575
2576 for (const MachineOperand &Op : MI->explicit_uses()) {
2577 if (!Op.isReg() || !TRI->isVGPR(MF.getRegInfo(), Op.getReg()))
2578 continue;
2579 Register Reg = Op.getReg();
2580
2581 int WaitStatesSinceDef =
2582 VALUWritesVDstWaitStates -
2583 getWaitStatesSinceDef(Reg, IsVALUFn,
2584 /*MaxWaitStates=*/VALUWritesVDstWaitStates);
2585 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesSinceDef);
2586 if (WaitStatesNeeded >= VALUWritesVDstWaitStates)
2587 break;
2588 }
2589
2590 int VCmpXHazardWaits =
2591 VCmpXWritesExecWaitStates -
2592 getWaitStatesSince(IsVCmpXWritesExecFn, VCmpXWritesExecWaitStates);
2593
2594 WaitStatesNeeded = std::max(WaitStatesNeeded, VCmpXHazardWaits);
2595 return WaitStatesNeeded;
2596}
2597
2598static int GFX940_SMFMA_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses) {
2599 // 2 pass -> 4
2600 // 4 pass -> 6
2601 // 8 pass -> 10
2602 // 16 pass -> 18
2603 return NumPasses + 2;
2604}
2605
2606static int GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses) {
2607 // 2 pass -> 5
2608 // 4 pass -> 7
2609 // 8 pass -> 11
2610 // 16 pass -> 19
2611 return NumPasses + 3;
2612}
2613
2614static int GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(int NumPasses) {
2615 // 2 pass -> 5
2616 // 4 pass -> 7
2617 // 8 pass -> 11
2618 // 16 pass -> 19
2619 return NumPasses + 3;
2620}
2621
2622static int GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(int NumPasses) {
2623 // 2 pass -> 4
2624 // 4 pass -> 6
2625 // 8 pass -> 10
2626 // 16 pass -> 18
2627 return NumPasses + 2;
2628}
2629
2630int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) {
2631 if (!ST.hasGFX90AInsts())
2632 return 0;
2633
2634 auto IsDGEMMFn = [](const MachineInstr &MI) -> bool {
2635 return isDGEMM(MI.getOpcode());
2636 };
2637
2638 // This is checked in checkMAIHazards90A()
2639 if (SIInstrInfo::isMFMA(*MI))
2640 return 0;
2641
2642 const MachineRegisterInfo &MRI = MF.getRegInfo();
2643
2644 int WaitStatesNeeded = 0;
2645
2646 bool IsMem = SIInstrInfo::isVMEM(*MI) ||
2647 SIInstrInfo::isFLAT(*MI) ||
2648 SIInstrInfo::isDS(*MI);
2649 bool IsMemOrExport = IsMem || SIInstrInfo::isEXP(*MI);
2650 bool IsVALU = SIInstrInfo::isVALU(*MI);
2651
2652 const MachineInstr *MFMA = nullptr;
2653 unsigned Reg;
2654 auto IsMFMAWriteFn = [&Reg, &MFMA, this](const MachineInstr &MI) {
2655 if (!SIInstrInfo::isMFMA(MI) ||
2656 !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))
2657 return false;
2658 MFMA = &MI;
2659 return true;
2660 };
2661
2662 const MachineInstr *DOT = nullptr;
2663 auto IsDotWriteFn = [&Reg, &DOT, this](const MachineInstr &MI) {
2664 if (!SIInstrInfo::isDOT(MI) ||
2665 !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))
2666 return false;
2667 DOT = &MI;
2668 return true;
2669 };
2670
2671 bool DGEMMAfterVALUWrite = false;
2672 auto IsDGEMMHazard = [&DGEMMAfterVALUWrite, this](const MachineInstr &MI) {
2673 // Found DGEMM on reverse traversal to def.
2674 if (isDGEMM(MI.getOpcode()))
2675 DGEMMAfterVALUWrite = true;
2676
2677 // Only hazard if register is defined by a VALU and a DGEMM is found
2678 // after the def.
2679 if (!TII.isVALU(MI) || !DGEMMAfterVALUWrite)
2680 return false;
2681
2682 return true;
2683 };
2684
2685 int SrcCIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
2686 AMDGPU::OpName::src2);
2687
2688 if (IsMemOrExport || IsVALU) {
2689 const int SMFMA4x4WriteVgprVALUMemExpReadWaitStates = 5;
2690 const int SMFMA16x16WriteVgprVALUMemExpReadWaitStates = 11;
2691 const int SMFMA32x32WriteVgprVALUMemExpReadWaitStates = 19;
2692 const int DMFMA4x4WriteVgprMemExpReadWaitStates = 9;
2693 const int DMFMA16x16WriteVgprMemExpReadWaitStates = 18;
2694 const int DMFMA4x4WriteVgprVALUReadWaitStates = 6;
2695 const int DMFMA16x16WriteVgprVALUReadWaitStates = 11;
2696 const int GFX950_DMFMA16x16WriteVgprVALUReadWaitStates = 19;
2697 const int DotWriteSameDotReadSrcAB = 3;
2698 const int DotWriteDifferentVALURead = 3;
2699 const int DMFMABetweenVALUWriteVMEMRead = 2;
2700 const int MaxWaitStates = 19;
2701
2702 for (const MachineOperand &Use : MI->explicit_uses()) {
2703 if (!Use.isReg())
2704 continue;
2705 Reg = Use.getReg();
2706
2707 DOT = nullptr;
2708 int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
2709 MaxWaitStates);
2710 if (DOT) {
2711 int NeedWaitStates = 0;
2712 if (DOT->getOpcode() == MI->getOpcode()) {
2713 if (&Use - &MI->getOperand(0) != SrcCIdx)
2714 NeedWaitStates = DotWriteSameDotReadSrcAB;
2715 } else {
2716 NeedWaitStates = DotWriteDifferentVALURead;
2717 }
2718
2719 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2720 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2721 }
2722
2723 // Workaround for HW data hazard bug observed only in GFX90A. When there
2724 // is a DGEMM instruction in-between a VALU and a VMEM instruction it
2725 // causes the SQ to incorrectly not insert two wait states between the two
2726 // instructions needed to avoid data hazard.
2727 if (IsMem && ST.hasGFX90AInsts() && !ST.hasGFX940Insts()) {
2728 DGEMMAfterVALUWrite = false;
2729 if (TRI.isVectorRegister(MRI, Reg)) {
2730 int WaitStatesNeededForUse =
2731 DMFMABetweenVALUWriteVMEMRead -
2732 getWaitStatesSinceDef(Reg, IsDGEMMHazard,
2733 DMFMABetweenVALUWriteVMEMRead);
2734
2735 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2736 }
2737 }
2738
2739 MFMA = nullptr;
2740 WaitStatesSinceDef =
2741 getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates);
2742 if (!MFMA)
2743 continue;
2744
2745 unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
2746 int NumPasses = HazardDefLatency;
2747 int NeedWaitStates = MaxWaitStates;
2748
2749 if (isDGEMM(MFMA->getOpcode())) {
2750 switch (HazardDefLatency) {
2751 case 4:
2752 NeedWaitStates = IsMemOrExport ? DMFMA4x4WriteVgprMemExpReadWaitStates
2753 : DMFMA4x4WriteVgprVALUReadWaitStates;
2754 break;
2755 case 8:
2756 case 16:
2757 NeedWaitStates =
2758 IsMemOrExport
2759 ? DMFMA16x16WriteVgprMemExpReadWaitStates
2760 : (ST.hasGFX950Insts()
2761 ? GFX950_DMFMA16x16WriteVgprVALUReadWaitStates
2762 : DMFMA16x16WriteVgprVALUReadWaitStates);
2763 break;
2764 default:
2765 llvm_unreachable("unexpected dgemm");
2766 }
2767 } else if (ST.hasGFX940Insts()) {
2768 NeedWaitStates =
2769 isXDL(ST, *MFMA)
2770 ? GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(NumPasses)
2771 : GFX940_SMFMA_N_PassWriteVgprVALUMemExpReadWaitStates(
2772 NumPasses);
2773 } else {
2774 switch (HazardDefLatency) {
2775 case 2:
2776 NeedWaitStates = SMFMA4x4WriteVgprVALUMemExpReadWaitStates;
2777 break;
2778 case 8:
2779 NeedWaitStates = SMFMA16x16WriteVgprVALUMemExpReadWaitStates;
2780 break;
2781 case 16:
2782 NeedWaitStates = SMFMA32x32WriteVgprVALUMemExpReadWaitStates;
2783 break;
2784 default:
2785 llvm_unreachable("unexpected number of passes for mfma");
2786 }
2787 }
2788
2789 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2790 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2791
2792 if (WaitStatesNeeded == MaxWaitStates)
2793 break;
2794 }
2795 }
2796
2797 unsigned Opc = MI->getOpcode();
2798 const int DMFMAToFMA64WaitStates = 2;
2799 if ((Opc == AMDGPU::V_FMA_F64_e64 ||
2800 Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64 ||
2801 Opc == AMDGPU::V_FMAC_F64_dpp) &&
2802 WaitStatesNeeded < DMFMAToFMA64WaitStates) {
2803 int WaitStatesNeededForUse = DMFMAToFMA64WaitStates -
2804 getWaitStatesSince(IsDGEMMFn, DMFMAToFMA64WaitStates);
2805 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2806 }
2807
2808 if (!IsVALU && !IsMemOrExport)
2809 return WaitStatesNeeded;
2810
2811 for (const MachineOperand &Def : MI->defs()) {
2812 const int SMFMA4x4WriteVgprVALUWawWaitStates = 5;
2813 const int SMFMA16x16WriteVgprVALUWawWaitStates = 11;
2814 const int SMFMA32x32WriteVgprVALUWawWaitStates = 19;
2815 const int SMFMA4x4ReadVgprVALUWarWaitStates = 1;
2816 const int GFX940_XDL4PassReadVgprVALUWarWaitStates = 3;
2817 const int SMFMA16x16ReadVgprVALUWarWaitStates = 7;
2818 const int SMFMA32x32ReadVgprVALUWarWaitStates = 15;
2819 const int DMFMA4x4WriteVgprVALUWriteWaitStates = 6;
2820 const int DMFMA16x16WriteVgprVALUWriteWaitStates = 11;
2821 const int DotWriteDifferentVALUWrite = 3;
2822 const int MaxWaitStates = 19;
2823 const int MaxWarWaitStates = 15;
2824
2825 Reg = Def.getReg();
2826
2827 DOT = nullptr;
2828 int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
2829 MaxWaitStates);
2830 if (DOT && DOT->getOpcode() != MI->getOpcode())
2831 WaitStatesNeeded = std::max(WaitStatesNeeded, DotWriteDifferentVALUWrite -
2832 WaitStatesSinceDef);
2833
2834 MFMA = nullptr;
2835 WaitStatesSinceDef =
2836 getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates);
2837 if (MFMA) {
2838 int NeedWaitStates = MaxWaitStates;
2839 int NumPasses = TSchedModel.computeInstrLatency(MFMA);
2840
2841 if (isDGEMM(MFMA->getOpcode())) {
2842 switch (NumPasses) {
2843 case 4:
2844 NeedWaitStates = DMFMA4x4WriteVgprVALUWriteWaitStates;
2845 break;
2846 case 8:
2847 case 16:
2848 NeedWaitStates = DMFMA16x16WriteVgprVALUWriteWaitStates;
2849 break;
2850 default:
2851 llvm_unreachable("unexpected number of cycles for dgemm");
2852 }
2853 } else if (ST.hasGFX940Insts()) {
2854 NeedWaitStates =
2855 isXDL(ST, *MFMA)
2856 ? GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(NumPasses)
2857 : GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(NumPasses);
2858 } else {
2859 switch (NumPasses) {
2860 case 2:
2861 NeedWaitStates = SMFMA4x4WriteVgprVALUWawWaitStates;
2862 break;
2863 case 8:
2864 NeedWaitStates = SMFMA16x16WriteVgprVALUWawWaitStates;
2865 break;
2866 case 16:
2867 NeedWaitStates = SMFMA32x32WriteVgprVALUWawWaitStates;
2868 break;
2869 default:
2870 llvm_unreachable("Unexpected number of passes for mfma");
2871 }
2872 }
2873
2874 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2875 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2876
2877 if (WaitStatesNeeded == MaxWaitStates)
2878 break;
2879 }
2880
2881 auto IsSMFMAReadAsCFn = [&Reg, &MFMA, this](const MachineInstr &MI) {
2882 if (!SIInstrInfo::isMFMA(MI) || isDGEMM(MI.getOpcode()) ||
2883 !MI.readsRegister(Reg, &TRI))
2884 return false;
2885
2886 if (ST.hasGFX940Insts() && !isXDL(ST, MI))
2887 return false;
2888
2889 const MachineOperand *SrcC =
2890 TII.getNamedOperand(MI, AMDGPU::OpName::src2);
2891 assert(SrcC);
2892 if (!SrcC->isReg() || !TRI.regsOverlap(SrcC->getReg(), Reg))
2893 return false;
2894
2895 MFMA = &MI;
2896 return true;
2897 };
2898
2899 MFMA = nullptr;
2900 int WaitStatesSinceUse = getWaitStatesSince(IsSMFMAReadAsCFn,
2901 MaxWarWaitStates);
2902 if (!MFMA)
2903 continue;
2904
2905 unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
2906 int NeedWaitStates = MaxWaitStates;
2907 switch (HazardDefLatency) {
2908 case 2: NeedWaitStates = SMFMA4x4ReadVgprVALUWarWaitStates;
2909 break;
2910 case 4: assert(ST.hasGFX940Insts());
2911 NeedWaitStates = GFX940_XDL4PassReadVgprVALUWarWaitStates;
2912 break;
2913 case 8: NeedWaitStates = SMFMA16x16ReadVgprVALUWarWaitStates;
2914 break;
2915 case 16: [[fallthrough]];
2916 default: NeedWaitStates = SMFMA32x32ReadVgprVALUWarWaitStates;
2917 break;
2918 }
2919
2920 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceUse;
2921 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2922 }
2923
2924 return WaitStatesNeeded;
2925}
2926
2927bool GCNHazardRecognizer::ShouldPreferAnother(SUnit *SU) {
2928 if (!SU->isInstr())
2929 return false;
2930
2931 const MachineInstr *MAI = nullptr;
2932
2933 auto IsMFMAFn = [&MAI](const MachineInstr &MI) {
2934 MAI = nullptr;
2935 if (SIInstrInfo::isMFMA(MI))
2936 MAI = &MI;
2937 return MAI != nullptr;
2938 };
2939
2940 MachineInstr *MI = SU->getInstr();
2941 if (IsMFMAFn(*MI)) {
2942 int W = getWaitStatesSince(IsMFMAFn, 16);
2943 if (MAI)
2944 return W < (int)TSchedModel.computeInstrLatency(MAI);
2945 }
2946
2947 return false;
2948}
2949
2950// Adjust global offsets for instructions bundled with S_GETPC_B64 after
2951// insertion of a new instruction.
2952static void updateGetPCBundle(MachineInstr *NewMI) {
2953 if (!NewMI->isBundled())
2954 return;
2955
2956 // Find start of bundle.
2957 auto I = NewMI->getIterator();
2958 while (I->isBundledWithPred())
2959 I--;
2960 if (I->isBundle())
2961 I++;
2962
2963 // Bail if this is not an S_GETPC bundle.
2964 if (I->getOpcode() != AMDGPU::S_GETPC_B64)
2965 return;
2966
2967 // Update offsets of any references in the bundle.
2968 const unsigned NewBytes = 4;
2969 assert(NewMI->getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
2970 "Unexpected instruction insertion in bundle");
2971 auto NextMI = std::next(NewMI->getIterator());
2972 auto End = NewMI->getParent()->end();
2973 while (NextMI != End && NextMI->isBundledWithPred()) {
2974 for (auto &Operand : NextMI->operands()) {
2975 if (Operand.isGlobal())
2976 Operand.setOffset(Operand.getOffset() + NewBytes);
2977 }
2978 NextMI++;
2979 }
2980}
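// Illustrative bundle (assumed, not from this file):
//   s_getpc_b64 s[0:1]
//   s_add_u32   s0, s0, sym@rel32@lo+4
//   s_addc_u32  s1, s1, sym@rel32@hi+12
// Inserting a 4-byte s_waitcnt_depctr after s_getpc_b64 moves the remaining
// bundle instructions 4 bytes further from the captured PC, so their global
// offsets are increased by NewBytes (4) above.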
2981
2982bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {
2983 if (!ST.hasVALUMaskWriteHazard())
2984 return false;
2985 assert(!ST.hasExtendedWaitCounts());
2986
2987 if (!ST.isWave64() || !SIInstrInfo::isSALU(*MI))
2988 return false;
2989
2990 // The hazard sequence is three instructions:
2991 // 1. VALU reads SGPR as mask
2992 // 2. SALU writes SGPR
2993 // 3. SALU reads SGPR
2994 // The hazard can expire if the distance between 2 and 3 is sufficient.
2995 // In practice this happens <10% of the time, hence this always assumes
2996 // the hazard exists if 1 and 2 are present to avoid searching.
2997
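 // Illustrative sequence (assumed, not from this file):
 //   v_cndmask_b32 v0, v1, v2, s[2:3]     ; (1) VALU reads s[2:3] as mask
 //   s_mov_b64     s[2:3], exec           ; (2) SALU writes s[2:3]  <- MI
 //   s_and_b64     s[4:5], s[2:3], s[6:7] ; (3) SALU reads s[2:3]
 // An s_waitcnt_depctr sa_sdst(0) is inserted after (2) to break the chain.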
2998 const MachineOperand *SDSTOp = TII.getNamedOperand(*MI, AMDGPU::OpName::sdst);
2999 if (!SDSTOp || !SDSTOp->isReg())
3000 return false;
3001
3002 const Register HazardReg = SDSTOp->getReg();
3003 if (HazardReg == AMDGPU::EXEC ||
3004 HazardReg == AMDGPU::EXEC_LO ||
3005 HazardReg == AMDGPU::EXEC_HI ||
3006 HazardReg == AMDGPU::M0)
3007 return false;
3008
3009 auto IsHazardFn = [HazardReg, this](const MachineInstr &I) {
3010 switch (I.getOpcode()) {
3011 case AMDGPU::V_ADDC_U32_e32:
3012 case AMDGPU::V_ADDC_U32_dpp:
3013 case AMDGPU::V_CNDMASK_B16_fake16_e32:
3014 case AMDGPU::V_CNDMASK_B16_fake16_dpp:
3015 case AMDGPU::V_CNDMASK_B32_e32:
3016 case AMDGPU::V_CNDMASK_B32_dpp:
3017 case AMDGPU::V_DIV_FMAS_F32_e64:
3018 case AMDGPU::V_DIV_FMAS_F64_e64:
3019 case AMDGPU::V_SUBB_U32_e32:
3020 case AMDGPU::V_SUBB_U32_dpp:
3021 case AMDGPU::V_SUBBREV_U32_e32:
3022 case AMDGPU::V_SUBBREV_U32_dpp:
3023 // These implicitly read VCC as mask source.
3024 return HazardReg == AMDGPU::VCC ||
3025 HazardReg == AMDGPU::VCC_LO ||
3026 HazardReg == AMDGPU::VCC_HI;
3027 case AMDGPU::V_ADDC_U32_e64:
3028 case AMDGPU::V_ADDC_U32_e64_dpp:
3029 case AMDGPU::V_CNDMASK_B16_fake16_e64:
3030 case AMDGPU::V_CNDMASK_B16_fake16_e64_dpp:
3031 case AMDGPU::V_CNDMASK_B32_e64:
3032 case AMDGPU::V_CNDMASK_B32_e64_dpp:
3033 case AMDGPU::V_SUBB_U32_e64:
3034 case AMDGPU::V_SUBB_U32_e64_dpp:
3035 case AMDGPU::V_SUBBREV_U32_e64:
3036 case AMDGPU::V_SUBBREV_U32_e64_dpp: {
3037 // Only check mask register overlaps.
3038 const MachineOperand *SSRCOp = TII.getNamedOperand(I, AMDGPU::OpName::src2);
3039 assert(SSRCOp);
3040 return TRI.regsOverlap(SSRCOp->getReg(), HazardReg);
3041 }
3042 default:
3043 return false;
3044 }
3045 };
3046
3047 const MachineRegisterInfo &MRI = MF.getRegInfo();
3048 auto IsExpiredFn = [&MRI, this](const MachineInstr &I, int) {
3049 // s_waitcnt_depctr sa_sdst(0) mitigates hazard.
3050 if (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
3051 AMDGPU::DepCtr::decodeFieldSaSdst(I.getOperand(0).getImm()) == 0)
3052 return true;
3053
3054 // VALU access to any SGPR or literal constant other than HazardReg
3055 // mitigates hazard. No need to check HazardReg here as this will
3056 // only be called when !IsHazardFn.
3057 if (!SIInstrInfo::isVALU(I))
3058 return false;
3059 for (int OpNo = 0, End = I.getNumOperands(); OpNo < End; ++OpNo) {
3060 const MachineOperand &Op = I.getOperand(OpNo);
3061 if (Op.isReg()) {
3062 Register OpReg = Op.getReg();
3063 // Only consider uses
3064 if (!Op.isUse())
3065 continue;
3066 // Ignore EXEC
3067 if (OpReg == AMDGPU::EXEC ||
3068 OpReg == AMDGPU::EXEC_LO ||
3069 OpReg == AMDGPU::EXEC_HI)
3070 continue;
3071 // Ignore all implicit uses except VCC
3072 if (Op.isImplicit()) {
3073 if (OpReg == AMDGPU::VCC ||
3074 OpReg == AMDGPU::VCC_LO ||
3075 OpReg == AMDGPU::VCC_HI)
3076 return true;
3077 continue;
3078 }
3079 if (TRI.isSGPRReg(MRI, OpReg))
3080 return true;
3081 } else {
3082 const MCInstrDesc &InstDesc = I.getDesc();
3083 const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
3084 if (!TII.isInlineConstant(Op, OpInfo))
3085 return true;
3086 }
3087 }
3088 return false;
3089 };
3090
3091 // Check for hazard
3092 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
3093 std::numeric_limits<int>::max())
3094 return false;
3095
3096 auto NextMI = std::next(MI->getIterator());
3097
3098 // Add s_waitcnt_depctr sa_sdst(0) after SALU write.
3099 auto NewMI = BuildMI(*MI->getParent(), NextMI, MI->getDebugLoc(),
3100 TII.get(AMDGPU::S_WAITCNT_DEPCTR))
3101 .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));
3102
3103 // SALU write may be s_getpc in a bundle.
3104 updateGetPCBundle(NewMI);
3105
3106 return true;
3107}
3108
3109// Return the numeric ID 0-63 of a 64b SGPR pair for a given SGPR.
3110// i.e. SGPR0 = SGPR0_SGPR1 = 0, SGPR3 = SGPR2_SGPR3 = 1, etc
3111static std::optional<unsigned> sgprPairNumber(Register Reg,
3112 const SIRegisterInfo &TRI) {
3113 switch (Reg) {
3114 case AMDGPU::M0:
3115 case AMDGPU::EXEC:
3116 case AMDGPU::EXEC_LO:
3117 case AMDGPU::EXEC_HI:
3118 case AMDGPU::SGPR_NULL:
3119 case AMDGPU::SGPR_NULL64:
3120 return {};
3121 default:
3122 break;
3123 }
3124 unsigned RegN = TRI.getEncodingValue(Reg);
3125 if (RegN > 127)
3126 return {};
3127 return (RegN >> 1) & 0x3f;
3128}
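// For example, sgprPairNumber(AMDGPU::SGPR4, TRI) and
// sgprPairNumber(AMDGPU::SGPR5, TRI) both return 2, while M0, EXEC and
// SGPR_NULL yield std::nullopt.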
3129
3130// For VALUReadSGPRHazard: pre-compute a bit vector of all SGPRs used by VALUs.
3131void GCNHazardRecognizer::computeVALUHazardSGPRs(MachineFunction *MMF) {
3132 assert(MMF == &MF);
3133
3134 // Assume non-empty vector means it has already been computed.
3135 if (!VALUReadHazardSGPRs.empty())
3136 return;
3137
3138 auto CallingConv = MF.getFunction().getCallingConv();
3139 bool IsCallFree =
3140 AMDGPU::isEntryFunctionCC(CallingConv) && !MF.getFrameInfo().hasCalls();
3141
3142 // Exhaustive search is only viable in non-caller/callee functions where
3143 // VALUs will be exposed to the hazard recognizer.
3144 UseVALUReadHazardExhaustiveSearch =
3145 IsCallFree && MF.getTarget().getOptLevel() > CodeGenOptLevel::None &&
3146 MF.getInstructionCount() < MaxExhaustiveHazardSearch;
3147
3148 // Consider all SGPRs hazards if the shader uses function calls or is callee.
3149 bool UseVALUUseCache =
3150 IsCallFree && MF.getTarget().getOptLevel() > CodeGenOptLevel::None;
3151 VALUReadHazardSGPRs.resize(64, !UseVALUUseCache);
3152 if (!UseVALUUseCache)
3153 return;
3154
3155 // Perform a post ordered reverse scan to find VALUs which read an SGPR
3156 // before a SALU write to the same SGPR. This provides a reduction in
3157 // hazard insertion when all VALU access to an SGPR occurs after its last
3158 // SALU write, when compared to a linear scan.
3159 const MachineRegisterInfo &MRI = MF.getRegInfo();
3160 BitVector SALUWriteSGPRs(64), ReadSGPRs(64);
3161 MachineCycleInfo CI;
3162 CI.compute(*MMF);
3163
3164 for (auto *MBB : post_order(&MF)) {
3165 bool InCycle = CI.getCycle(MBB) != nullptr;
3166 for (auto &MI : reverse(MBB->instrs())) {
3167 bool IsVALU = SIInstrInfo::isVALU(MI);
3168 bool IsSALU = SIInstrInfo::isSALU(MI);
3169 if (!IsVALU && !IsSALU)
3170 continue;
3171
3172 for (const MachineOperand &Op : MI.operands()) {
3173 if (!Op.isReg())
3174 continue;
3175 Register Reg = Op.getReg();
3176 assert(!Op.getSubReg());
3177 // Only consider implicit operands of VCC.
3178 if (Op.isImplicit() && !(Reg == AMDGPU::VCC_LO ||
3179 Reg == AMDGPU::VCC_HI || Reg == AMDGPU::VCC))
3180 continue;
3181 if (!TRI.isSGPRReg(MRI, Reg))
3182 continue;
3183 auto RegN = sgprPairNumber(Reg, TRI);
3184 if (!RegN)
3185 continue;
3186 if (IsVALU && Op.isUse()) {
3187 // Note: any access within a cycle must be considered a hazard.
3188 if (InCycle || (ReadSGPRs[*RegN] && SALUWriteSGPRs[*RegN]))
3189 VALUReadHazardSGPRs.set(*RegN);
3190 ReadSGPRs.set(*RegN);
3191 } else if (IsSALU) {
3192 if (Op.isDef())
3193 SALUWriteSGPRs.set(*RegN);
3194 else
3195 ReadSGPRs.set(*RegN);
3196 }
3197 }
3198 }
3199 }
3200}
3201
3202bool GCNHazardRecognizer::fixVALUReadSGPRHazard(MachineInstr *MI) {
3203 if (!ST.hasVALUReadSGPRHazard())
3204 return false;
3205
3206 // The hazard sequence is fundamentally three instructions:
3207 // 1. VALU reads SGPR
3208 // 2. SALU writes SGPR
3209 // 3. VALU/SALU reads SGPR
3210 // Try to avoid searching for (1) because the expiry point of the hazard is
3211 // indeterminate; however, the hazard between (2) and (3) can expire if the
3212 // gap contains sufficient SALU instructions with no usage of SGPR from (1).
3213 // Note: SGPRs must be considered as 64-bit pairs as the hazard exists
3214 // even if only individual SGPRs are accessed.
3215
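 // Illustrative sequence (assumed, not from this file):
 //   v_add_f32 v0, s4, v1   ; (1) VALU reads s4 (SGPR pair s[4:5])
 //   s_mov_b32 s5, 0        ; (2) SALU writes into the same 64-bit pair
 //   s_add_u32 s6, s5, s7   ; (3) SALU reads s5 again
 // (3) needs an s_waitcnt_depctr sa_sdst(0) unless enough unrelated SALU
 // instructions separate it from (2).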
3216 bool MIIsSALU = SIInstrInfo::isSALU(*MI);
3217 bool MIIsVALU = SIInstrInfo::isVALU(*MI);
3218 if (!(MIIsSALU || MIIsVALU))
3219 return false;
3220
3221 // Avoid expensive search when compile time is priority by
3222 // mitigating every SALU which writes an SGPR.
3223 if (MF.getTarget().getOptLevel() == CodeGenOptLevel::None) {
3224 if (!SIInstrInfo::isSALU(*MI) || SIInstrInfo::isSOPP(*MI))
3225 return false;
3226
3227 const MachineOperand *SDSTOp =
3228 TII.getNamedOperand(*MI, AMDGPU::OpName::sdst);
3229 if (!SDSTOp || !SDSTOp->isReg())
3230 return false;
3231
3232 const Register HazardReg = SDSTOp->getReg();
3233 if (HazardReg == AMDGPU::EXEC || HazardReg == AMDGPU::EXEC_LO ||
3234 HazardReg == AMDGPU::EXEC_HI || HazardReg == AMDGPU::M0)
3235 return false;
3236
3237 // Add s_wait_alu sa_sdst(0) after SALU write.
3238 auto NextMI = std::next(MI->getIterator());
3239 auto NewMI = BuildMI(*MI->getParent(), NextMI, MI->getDebugLoc(),
3240 TII.get(AMDGPU::S_WAITCNT_DEPCTR))
3241 .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));
3242
3243 // SALU write may be s_getpc in a bundle.
3244 updateGetPCBundle(NewMI);
3245
3246 return true;
3247 }
3248
3249 // Pre-compute set of SGPR pairs read by VALUs.
3250 // Note: pass mutable pointer to MachineFunction for CycleInfo.
3251 computeVALUHazardSGPRs(MI->getMF());
3252
3253 // If no VALUs hazard SGPRs exist then nothing to do.
3254 if (VALUReadHazardSGPRs.none())
3255 return false;
3256
3257 // All SGPR writes before a call/return must be flushed as the callee/caller
3258 // will not see the hazard chain, i.e. (2) to (3) described above.
3259 const bool IsSetPC = (MI->isCall() || MI->isReturn()) &&
3260 !(MI->getOpcode() == AMDGPU::S_ENDPGM ||
3261 MI->getOpcode() == AMDGPU::S_ENDPGM_SAVED);
3262
3263 // Collect all SGPR sources for MI which are read by a VALU.
3264 const MachineRegisterInfo &MRI = MF.getRegInfo();
3265 SmallSet<Register, 4> SGPRsUsed;
3266
3267 if (!IsSetPC) {
3268 for (const MachineOperand &Op : MI->all_uses()) {
3269 Register OpReg = Op.getReg();
3270
3271 // Only consider VCC implicit uses on VALUs.
3272 // The only expected SALU implicit access is SCC which is no hazard.
3273 if (MIIsSALU && Op.isImplicit())
3274 continue;
3275
3276 if (!TRI.isSGPRReg(MRI, OpReg))
3277 continue;
3278
3279 auto RegN = sgprPairNumber(OpReg, TRI);
3280 if (!RegN)
3281 continue;
3282
3283 if (!VALUReadHazardSGPRs[*RegN])
3284 continue;
3285
3286 SGPRsUsed.insert(OpReg);
3287 }
3288
3289 // No SGPRs -> nothing to do.
3290 if (SGPRsUsed.empty())
3291 return false;
3292 }
3293
3294 // A hazard is any SALU which writes one of the SGPRs read by MI.
3295 auto IsHazardFn = [this, IsSetPC, &SGPRsUsed](const MachineInstr &I) {
3296 if (!SIInstrInfo::isSALU(I))
3297 return false;
3298 // Ensure SGPR flush before call/return by conservatively assuming every
3299 // SALU writes an SGPR.
3300 if (IsSetPC && I.getNumDefs() > 0)
3301 return true;
3302 // Check for any register writes.
3303 return any_of(SGPRsUsed, [this, &I](Register Reg) {
3304 return I.modifiesRegister(Reg, &TRI);
3305 });
3306 };
3307
3308 const int SALUExpiryCount = SIInstrInfo::isSALU(*MI) ? 10 : 11;
3309 auto IsExpiredFn = [&](const MachineInstr &I, int Count) {
3310 if (Count >= SALUExpiryCount)
3311 return true;
3312 // s_wait_alu sa_sdst(0) on path mitigates hazard.
3313 if (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
3314 AMDGPU::DepCtr::decodeFieldSaSdst(I.getOperand(0).getImm()) == 0)
3315 return true;
3316 return false;
3317 };
3318
3319 auto WaitStatesFn = [this, &SGPRsUsed](const MachineInstr &I) {
3320 // Only count true SALUs as wait states.
3321 if (!SIInstrInfo::isSALU(I) || SIInstrInfo::isSOPP(I))
3322 return 0;
3323 // SALU must be unrelated to any hazard registers.
3324 if (any_of(SGPRsUsed,
3325 [this, &I](Register Reg) { return I.readsRegister(Reg, &TRI); }))
3326 return 0;
3327 return 1;
3328 };
3329
3330 // Check for the hazard.
3331 DenseSet<const MachineBasicBlock *> Visited;
3332 int WaitStates = ::getWaitStatesSince(IsHazardFn, MI->getParent(),
3333 std::next(MI->getReverseIterator()), 0,
3334 IsExpiredFn, Visited, WaitStatesFn);
3335
3336 if (WaitStates >= SALUExpiryCount)
3337 return false;
3338
3339 // Validate hazard through an exhaustive search.
3340 if (UseVALUReadHazardExhaustiveSearch) {
3341 // A hazard is any VALU which reads one of the paired SGPRs read by MI.
3342 // This is searching for (1) in the hazard description.
3343 auto hazardPair = [this](Register Reg) {
3344 if (Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO || Reg == AMDGPU::VCC_HI)
3345 return Register(AMDGPU::VCC);
3346 auto RegN = sgprPairNumber(Reg, TRI);
3347 return Register(AMDGPU::SGPR0_SGPR1 + *RegN);
3348 };
3349 auto SearchHazardFn = [this, hazardPair,
3350 &SGPRsUsed](const MachineInstr &I) {
3351 if (!SIInstrInfo::isVALU(I))
3352 return false;
3353 // Check for any register reads.
3354 return any_of(SGPRsUsed, [this, hazardPair, &I](Register Reg) {
3355 return I.readsRegister(hazardPair(Reg), &TRI);
3356 });
3357 };
3358 auto SearchExpiredFn = [&](const MachineInstr &I, int Count) {
3359 return false;
3360 };
3361 if (::getWaitStatesSince(SearchHazardFn, MI, SearchExpiredFn) ==
3362 std::numeric_limits<int>::max())
3363 return false;
3364 }
3365
3366 // Add s_wait_alu sa_sdst(0) before SALU read.
3367 auto NewMI = BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
3368 TII.get(AMDGPU::S_WAITCNT_DEPCTR))
3369 .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));
3370
3371 // SALU read may be after s_getpc in a bundle.
3372 updateGetPCBundle(NewMI);
3373
3374 return true;
3375}
3376
3377static bool ensureEntrySetPrio(MachineFunction *MF, int Priority,
3378 const SIInstrInfo &TII) {
3379 MachineBasicBlock &EntryMBB = MF->front();
3380 if (EntryMBB.begin() != EntryMBB.end()) {
3381 auto &EntryMI = *EntryMBB.begin();
3382 if (EntryMI.getOpcode() == AMDGPU::S_SETPRIO &&
3383 EntryMI.getOperand(0).getImm() >= Priority)
3384 return false;
3385 }
3386
3387 BuildMI(EntryMBB, EntryMBB.begin(), DebugLoc(), TII.get(AMDGPU::S_SETPRIO))
3388 .addImm(Priority);
3389 return true;
3390}
3391
3392bool GCNHazardRecognizer::fixRequiredExportPriority(MachineInstr *MI) {
3393 if (!ST.hasRequiredExportPriority())
3394 return false;
3395
3396 // Assume the following shader types will never have exports,
3397 // and avoid adding or adjusting S_SETPRIO.
3398 MachineBasicBlock *MBB = MI->getParent();
3399 MachineFunction *MF = MBB->getParent();
3400 auto CC = MF->getFunction().getCallingConv();
3401 switch (CC) {
3402 case CallingConv::AMDGPU_CS:
3403 case CallingConv::AMDGPU_CS_Chain:
3404 case CallingConv::AMDGPU_CS_ChainPreserve:
3405 case CallingConv::AMDGPU_KERNEL:
3406 return false;
3407 default:
3408 break;
3409 }
3410
3411 const int MaxPriority = 3;
3412 const int NormalPriority = 2;
3413 const int PostExportPriority = 0;
3414
3415 auto It = MI->getIterator();
3416 switch (MI->getOpcode()) {
3417 case AMDGPU::S_ENDPGM:
3418 case AMDGPU::S_ENDPGM_SAVED:
3419 case AMDGPU::S_ENDPGM_ORDERED_PS_DONE:
3420 case AMDGPU::SI_RETURN_TO_EPILOG:
3421 // Ensure shader with calls raises priority at entry.
3422 // This ensures correct priority if exports exist in callee.
3423 if (MF->getFrameInfo().hasCalls())
3424 return ensureEntrySetPrio(MF, NormalPriority, TII);
3425 return false;
3426 case AMDGPU::S_SETPRIO: {
3427 // Raise minimum priority unless in workaround.
3428 auto &PrioOp = MI->getOperand(0);
3429 int Prio = PrioOp.getImm();
3430 bool InWA = (Prio == PostExportPriority) &&
3431 (It != MBB->begin() && TII.isEXP(*std::prev(It)));
3432 if (InWA || Prio >= NormalPriority)
3433 return false;
3434 PrioOp.setImm(std::min(Prio + NormalPriority, MaxPriority));
3435 return true;
3436 }
3437 default:
3438 if (!TII.isEXP(*MI))
3439 return false;
3440 break;
3441 }
3442
3443 // Check entry priority at each export (as there will only be a few).
3444 // Note: amdgpu_gfx can only be a callee, so defer to caller setprio.
3445 bool Changed = false;
3446 if (CC != CallingConv::AMDGPU_Gfx)
3447 Changed = ensureEntrySetPrio(MF, NormalPriority, TII);
3448
3449 auto NextMI = std::next(It);
3450 bool EndOfShader = false;
3451 if (NextMI != MBB->end()) {
3452 // Only need WA at end of sequence of exports.
3453 if (TII.isEXP(*NextMI))
3454 return Changed;
3455 // Assume appropriate S_SETPRIO after export means WA already applied.
3456 if (NextMI->getOpcode() == AMDGPU::S_SETPRIO &&
3457 NextMI->getOperand(0).getImm() == PostExportPriority)
3458 return Changed;
3459 EndOfShader = NextMI->getOpcode() == AMDGPU::S_ENDPGM;
3460 }
3461
3462 const DebugLoc &DL = MI->getDebugLoc();
3463
3464 // Lower priority.
3465 BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_SETPRIO))
3466 .addImm(PostExportPriority);
3467
3468 if (!EndOfShader) {
3469 // Wait for exports to complete.
3470 BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_WAITCNT_EXPCNT))
3471 .addReg(AMDGPU::SGPR_NULL)
3472 .addImm(0);
3473 }
3474
3475 BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_NOP)).addImm(0);
3476 BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_NOP)).addImm(0);
3477
3478 if (!EndOfShader) {
3479 // Return to normal (higher) priority.
3480 BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_SETPRIO))
3481 .addImm(NormalPriority);
3482 }
3483
3484 return true;
3485}
unsigned const MachineRegisterInfo * MRI
Provides AMDGPU specific target descriptions.
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Analysis containing CSE Info
Definition: CSEInfo.cpp:28
bool End
Definition: ELF_riscv.cpp:480
static cl::opt< unsigned, false, MFMAPaddingRatioParser > MFMAPaddingRatio("amdgpu-mfma-padding-ratio", cl::init(0), cl::Hidden, cl::desc("Fill a percentage of the latency between " "neighboring MFMA with s_nops."))
static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF, const GCNSubtarget &ST)
static bool consumesDstSelForwardingOperand(const MachineInstr *VALU, const MachineOperand *Dst, const SIRegisterInfo *TRI)
Checks whether the provided MI "consumes" the operand with a Dest sel fowarding issue Dst .
static bool isSGetReg(unsigned Opcode)
static bool breaksSMEMSoftClause(MachineInstr *MI)
static bool isXDL(const GCNSubtarget &ST, const MachineInstr &MI)
static bool isLdsDma(const MachineInstr &MI)
static bool isRFE(unsigned Opcode)
static bool isRWLane(unsigned Opcode)
static std::optional< unsigned > sgprPairNumber(Register Reg, const SIRegisterInfo &TRI)
static bool isSMovRel(unsigned Opcode)
static const MachineOperand * getDstSelForwardingOperand(const MachineInstr &MI, const GCNSubtarget &ST)
Dest sel forwarding issue occurs if additional logic is needed to swizzle / pack the computed value i...
static int GFX940_XDL_N_PassWritesVGPROverlappedSGEMMDGEMMSrcCWaitStates(int NumPasses, bool IsGFX950)
static void updateGetPCBundle(MachineInstr *NewMI)
static bool isStoreCountWaitZero(const MachineInstr &I)
static bool breaksVMEMSoftClause(MachineInstr *MI)
static bool isVCmpXWritesExec(const SIInstrInfo &TII, const SIRegisterInfo &TRI, const MachineInstr &MI)
static bool isSSetReg(unsigned Opcode)
static bool hasHazard(StateT State, function_ref< HazardFnResult(StateT &, const MachineInstr &)> IsHazard, function_ref< void(StateT &, const MachineInstr &)> UpdateState, const MachineBasicBlock *MBB, MachineBasicBlock::const_reverse_instr_iterator I, DenseSet< const MachineBasicBlock * > &Visited)
static void addRegUnits(const SIRegisterInfo &TRI, BitVector &BV, MCRegister Reg)
static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr)
static bool isDivFMas(unsigned Opcode)
static int GFX940_XDL_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses)
enum { HazardFound, HazardExpired, NoHazardFound } HazardFnResult
static int GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(int NumPasses)
static int GFX940_SMFMA_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses)
static bool isDGEMM(unsigned Opcode)
static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard, const MachineBasicBlock *MBB, MachineBasicBlock::const_reverse_instr_iterator I, int WaitStates, IsExpiredFn IsExpired, DenseSet< const MachineBasicBlock * > &Visited, GetNumWaitStatesFn GetNumWaitStates=SIInstrInfo::getNumWaitStates)
static int GFX940_SMFMA_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses)
static int GFX940_SMFMA_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(int NumPasses)
static bool ensureEntrySetPrio(MachineFunction *MF, int Priority, const SIInstrInfo &TII)
static void addRegsToSet(const SIRegisterInfo &TRI, iterator_range< MachineInstr::const_mop_iterator > Ops, BitVector &DefSet, BitVector &UseSet)
static void insertNoopsInBundle(MachineInstr *MI, const SIInstrInfo &TII, unsigned Quantity)
static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII, const MachineInstr &MI)
static cl::opt< unsigned > MaxExhaustiveHazardSearch("amdgpu-max-exhaustive-hazard-search", cl::init(128), cl::Hidden, cl::desc("Maximum function size for exhausive hazard search"))
static bool isPermlane(const MachineInstr &MI)
static int GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(int NumPasses)
static int GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses)
static int GFX940_XDL_N_PassWritesVGPROverlappedXDLOrSMFMASrcCWaitStates(int NumPasses, bool IsGFX950)
AMD GCN specific subclass of TargetSubtarget.
static Register UseReg(const MachineOperand &MO)
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
#define I(x, y, z)
Definition: MD5.cpp:58
unsigned const TargetRegisterInfo * TRI
if(PassOpts->AAPipeline)
This file builds on the ADT/GraphTraits.h file to build a generic graph post order iterator.
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static const uint32_t IV[8]
Definition: blake3_impl.h:78
void resize(unsigned N, bool t=false)
resize - Grow or shrink the bitvector.
Definition: BitVector.h:341
bool anyCommon(const BitVector &RHS) const
Test if any common bits are set.
Definition: BitVector.h:489
BitVector & set()
Definition: BitVector.h:351
bool none() const
none - Returns true if none of the bits are set.
Definition: BitVector.h:188
bool empty() const
empty - Tests whether there are no bits in this bitvector.
Definition: BitVector.h:156
A debug info location.
Definition: DebugLoc.h:33
Implements a dense probed hash-table based set.
Definition: DenseSet.h:278
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These methods get and set the calling convention of this function.
Definition: Function.h:277
void EmitNoop() override
EmitNoop - This callback is invoked when a noop was added to the instruction stream.
void Reset() override
Reset - This callback is invoked when a new block of instructions is about to be scheduled.
unsigned PreEmitNoops(MachineInstr *) override
This overload will be used when the hazard recognizer is being used by a non-scheduling pass, which does not use SUnits.
void EmitInstruction(SUnit *SU) override
EmitInstruction - This callback is invoked when an instruction is emitted, to advance the hazard state.
function_ref< bool(const MachineInstr &)> IsHazardFn
void AdvanceCycle() override
AdvanceCycle - This callback is invoked whenever the next top-down instruction to be scheduled cannot issue in the current cycle, either because of latency or resource conflicts.
unsigned PreEmitNoopsCommon(MachineInstr *)
bool ShouldPreferAnother(SUnit *SU) override
ShouldPreferAnother - This callback may be invoked if getHazardType returns NoHazard.
HazardType getHazardType(SUnit *SU, int Stalls) override
getHazardType - Return the hazard type of emitting this node.
GCNHazardRecognizer(const MachineFunction &MF)
void RecedeCycle() override
RecedeCycle - This callback is invoked whenever the next bottom-up instruction to be scheduled cannot issue in the current cycle, either because of latency or resource conflicts.
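These overrides implement the ScheduleHazardRecognizer callbacks. Outside the scheduler, a post-RA style driver would exercise them roughly as sketched below; the helper name and loop are illustrative, not code from this file, and assume the recognizer's MachineInstr-based EmitInstruction overload:

#include "GCNHazardRecognizer.h"

static void fixHazardsInBlock(GCNHazardRecognizer &HR, const SIInstrInfo &TII,
                              MachineBasicBlock &MBB) {
  HR.Reset();                                    // fresh state per block
  for (MachineInstr &MI : MBB) {
    if (unsigned Noops = HR.PreEmitNoops(&MI))   // wait states still required
      TII.insertNoops(MBB, MI.getIterator(), Noops);
    HR.EmitInstruction(&MI);                     // record MI in the window
    HR.AdvanceCycle();
  }
}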
bool hasShift64HighRegBug() const
bool hasFPAtomicToDenormModeHazard() const
bool hasLdsBranchVmemWARHazard() const
bool hasGFX90AInsts() const
bool hasDstSelForwardingHazard() const
bool hasMAIInsts() const
Definition: GCNSubtarget.h:837
bool hasCvtScaleForwardingHazard() const
const SIInstrInfo * getInstrInfo() const override
Definition: GCNSubtarget.h:279
bool hasVALUMaskWriteHazard() const
bool needsAlignedVGPRs() const
Return if operations acting on VGPR tuples require even alignment.
bool hasVcmpxExecWARHazard() const
bool hasReadM0MovRelInterpHazard() const
const SIRegisterInfo * getRegisterInfo() const override
Definition: GCNSubtarget.h:291
bool hasRequiredExportPriority() const
bool hasLdsWaitVMSRC() const
bool hasExtendedWaitCounts() const
bool hasVcmpxPermlaneHazard() const
bool hasGFX950Insts() const
bool has12DWordStoreHazard() const
bool hasVALUPartialForwardingHazard() const
bool hasNoDataDepHazard() const
Definition: GCNSubtarget.h:950
unsigned getSetRegWaitStates() const
Number of hazard wait states for s_setreg_b32/s_setreg_imm32_b32.
Definition: GCNSubtarget.h:519
bool hasTransForwardingHazard() const
bool hasGFX940Insts() const
bool hasReadM0LdsDmaHazard() const
bool hasVALUReadSGPRHazard() const
bool hasSMEMtoVectorWriteHazard() const
bool hasVMEMtoScalarWriteHazard() const
bool hasNSAtoVMEMBug() const
bool hasVDecCoExecHazard() const
bool hasReadM0SendMsgHazard() const
bool hasReadM0LdsDirectHazard() const
bool isXNACKEnabled() const
Definition: GCNSubtarget.h:619
bool hasSMRDReadVALUDefHazard() const
A read of an SGPR by an SMRD instruction requires 4 wait states when the SGPR was written by a VALU instruction.
Definition: GCNSubtarget.h:504
bool hasRFEHazards() const
Definition: GCNSubtarget.h:514
bool hasVMEMReadSGPRVALUDefHazard() const
A read of an SGPR by a VMEM instruction requires 5 wait states when the SGPR was written by a VALU instruction.
Definition: GCNSubtarget.h:510
bool isWave64() const
bool hasVALUTransUseHazard() const
bool hasLdsDirect() const
void compute(FunctionT &F)
Compute the cycle info for a function.
CycleT * getCycle(const BlockT *Block) const
Find the innermost cycle containing a given block.
Describe properties that are true of each instruction in the target description file.
Definition: MCInstrDesc.h:198
ArrayRef< MCOperandInfo > operands() const
Definition: MCInstrDesc.h:239
This holds information about one operand of a machine instruction, indicating the register class for register operands, etc.
Definition: MCInstrDesc.h:85
Wrapper class representing physical registers. Should be passed by value.
Definition: MCRegister.h:33
Instructions::const_reverse_iterator const_reverse_instr_iterator
reverse_instr_iterator instr_rend()
Instructions::iterator instr_iterator
instr_iterator instr_end()
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
iterator_range< pred_iterator > predecessors()
bool hasCalls() const
Return true if the current function has any function calls.
unsigned getInstructionCount() const
Return the number of MachineInstrs in this MachineFunction.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do so.
const MachineBasicBlock & front() const
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addDef(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register definition operand.
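Hazard fixes that cannot be expressed as plain wait states are materialized with these builder calls. A hedged sketch of inserting a single s_nop before an instruction; the helper name is hypothetical, and the chunking needed when more than 8 wait states are required is deliberately left out:

// Sketch only: S_NOP's immediate encodes the number of *extra* wait states,
// so N states are encoded as N - 1.
static void emitSNopBefore(MachineBasicBlock &MBB,
                           MachineBasicBlock::iterator MI,
                           const SIInstrInfo &TII, unsigned WaitStates) {
  assert(WaitStates >= 1 && WaitStates <= 8 && "larger counts need chunking");
  BuildMI(MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_NOP))
      .addImm(WaitStates - 1);
}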
Representation of each machine instruction.
Definition: MachineInstr.h:69
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
Definition: MachineInstr.h:575
const MachineBasicBlock * getParent() const
Definition: MachineInstr.h:347
bool isBundle() const
bool mayStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly modify memory.
bool isBundled() const
Return true if this instruction is part of a bundle.
Definition: MachineInstr.h:472
MachineOperand class - Representation of each machine instruction operand.
void setImm(int64_t immVal)
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
void setReg(Register Reg)
Change the register this operand corresponds to.
void setIsKill(bool Val=true)
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
MachineRegisterInfo - Keep track of information for virtual and physical registers, including register classes, use/def chains for registers, etc.
bool isPhysRegUsed(MCRegister PhysReg, bool SkipRegMaskTest=false) const
Return true if the specified register is modified or read in this function.
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
static bool isMAI(const MachineInstr &MI)
Definition: SIInstrInfo.h:801
static bool isDS(const MachineInstr &MI)
Definition: SIInstrInfo.h:563
static bool isVMEM(const MachineInstr &MI)
Definition: SIInstrInfo.h:441
static bool isSOPP(const MachineInstr &MI)
Definition: SIInstrInfo.h:481
static bool isSMRD(const MachineInstr &MI)
Definition: SIInstrInfo.h:553
static bool isMTBUF(const MachineInstr &MI)
Definition: SIInstrInfo.h:545
static bool isEXP(const MachineInstr &MI)
Definition: SIInstrInfo.h:658
static bool isSALU(const MachineInstr &MI)
Definition: SIInstrInfo.h:417
static bool isSDWA(const MachineInstr &MI)
Definition: SIInstrInfo.h:521
void insertNoops(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned Quantity) const override
static bool isVINTRP(const MachineInstr &MI)
Definition: SIInstrInfo.h:793
static bool isDOT(const MachineInstr &MI)
Definition: SIInstrInfo.h:814
static bool isSWMMAC(const MachineInstr &MI)
Definition: SIInstrInfo.h:830
static bool isLDSDIR(const MachineInstr &MI)
Definition: SIInstrInfo.h:842
bool isBufferSMRD(const MachineInstr &MI) const
static bool isTRANS(const MachineInstr &MI)
Definition: SIInstrInfo.h:777
static bool isMUBUF(const MachineInstr &MI)
Definition: SIInstrInfo.h:537
static bool isSegmentSpecificFLAT(const MachineInstr &MI)
Definition: SIInstrInfo.h:627
static bool isDPP(const MachineInstr &MI)
Definition: SIInstrInfo.h:769
static bool isMFMA(const MachineInstr &MI)
Definition: SIInstrInfo.h:809
static bool isFPAtomic(const MachineInstr &MI)
Definition: SIInstrInfo.h:925
static bool isMIMG(const MachineInstr &MI)
Definition: SIInstrInfo.h:589
static unsigned getNumWaitStates(const MachineInstr &MI)
Return the number of wait states that result from executing this instruction.
static bool isWMMA(const MachineInstr &MI)
Definition: SIInstrInfo.h:818
static bool isFLAT(const MachineInstr &MI)
Definition: SIInstrInfo.h:621
static bool isVALU(const MachineInstr &MI)
Definition: SIInstrInfo.h:425
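Most hazard scans classify instructions with the static SIInstrInfo predicates above before looking at operands. A small illustrative combination; the predicate name and the exact mix are hypothetical, not taken from this file:

// Hypothetical classifier: memory operations whose stores matter for a
// WAR-style hazard scan.
static bool isVMEMOrFlatStore(const MachineInstr &MI) {
  return (SIInstrInfo::isVMEM(MI) || SIInstrInfo::isFLAT(MI)) &&
         MI.mayStore();
}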
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which interpolation parameters to load.
Scheduling unit. This is a node in the scheduling DAG.
Definition: ScheduleDAG.h:242
bool isInstr() const
Returns true if this SUnit refers to a machine instruction as opposed to an SDNode.
Definition: ScheduleDAG.h:378
MachineInstr * getInstr() const
Returns the representative MachineInstr for this SUnit.
Definition: ScheduleDAG.h:390
unsigned MaxLookAhead
MaxLookAhead - Indicate the number of cycles in the scoreboard state.
virtual void EmitNoops(unsigned Quantity)
EmitNoops - This callback is invoked when noops were added to the instruction stream.
size_type size() const
Determine the number of elements in the SetVector.
Definition: SetVector.h:98
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition: SetVector.h:162
A SetVector that performs no allocations if smaller than a certain size.
Definition: SetVector.h:370
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less than N).
Definition: SmallSet.h:132
bool empty() const
Definition: SmallSet.h:168
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition: SmallSet.h:181
StringRef - Represent a constant reference to a string, i.e. a character array and a length, which need not be null terminated.
Definition: StringRef.h:51
bool getAsInteger(unsigned Radix, T &Result) const
Parse the current string as an integer of the specified radix.
Definition: StringRef.h:470
CodeGenOptLevel getOptLevel() const
Returns the optimization level: None, Less, Default, or Aggressive.
ProcResIter getWriteProcResEnd(const MCSchedClassDesc *SC) const
const MCSchedClassDesc * resolveSchedClass(const MachineInstr *MI) const
Return the MCSchedClassDesc for this instruction.
ProcResIter getWriteProcResBegin(const MCSchedClassDesc *SC) const
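The scheduling-model queries above are the raw material for pass-count style wait states: resolve the instruction's scheduling class, then inspect its write resource entries. A sketch of that pattern, assuming MI is the MFMA being examined and TSchedModel is the recognizer's member; the actual derivation used by this file may differ:

const MCSchedClassDesc *SC = TSchedModel.resolveSchedClass(&MI);
unsigned MaxReleaseCycle = 0;
// Take the latest release cycle over all write resource entries.
for (const MCWriteProcResEntry &WPR :
     make_range(TSchedModel.getWriteProcResBegin(SC),
                TSchedModel.getWriteProcResEnd(SC)))
  if (WPR.ReleaseAtCycle > MaxReleaseCycle)
    MaxReleaseCycle = WPR.ReleaseAtCycle;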
std::pair< iterator, bool > insert(const ValueT &V)
Definition: DenseSet.h:213
An efficient, type-erasing, non-owning reference to a callable.
self_iterator getIterator()
Definition: ilist_node.h:132
A range adaptor for a pair of iterators.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
unsigned encodeFieldVaVdst(unsigned Encoded, unsigned VaVdst)
unsigned decodeFieldSaSdst(unsigned Encoded)
unsigned encodeFieldVmVsrc(unsigned Encoded, unsigned VmVsrc)
unsigned encodeFieldSaSdst(unsigned Encoded, unsigned SaSdst)
unsigned decodeFieldVaVdst(unsigned Encoded)
unsigned decodeFieldVmVsrc(unsigned Encoded)
LLVM_READONLY const MIMGInfo * getMIMGInfo(unsigned Opc)
void decodeWaitcnt(const IsaVersion &Version, unsigned Waitcnt, unsigned &Vmcnt, unsigned &Expcnt, unsigned &Lgkmcnt)
Decodes Vmcnt, Expcnt and Lgkmcnt from given Waitcnt for given isa Version, and writes decoded values into Vmcnt, Expcnt and Lgkmcnt respectively.
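These helpers decode and re-encode the packed wait-count immediates that many fixups rewrite. A hedged usage sketch; WaitcntImm and DepCtrImm are hypothetical immediate values, and ST is the recognizer's subtarget member:

AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST.getCPU());
unsigned Vmcnt, Expcnt, Lgkmcnt;
AMDGPU::decodeWaitcnt(IV, WaitcntImm, Vmcnt, Expcnt, Lgkmcnt);

// Force va_vdst to 0 in an s_waitcnt_depctr immediate, leaving the other
// fields untouched, then read back vm_vsrc from the result.
unsigned NewDepCtr = AMDGPU::DepCtr::encodeFieldVaVdst(DepCtrImm, 0);
unsigned VmVsrc = AMDGPU::DepCtr::decodeFieldVmVsrc(NewDepCtr);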
LLVM_READONLY int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIdx)
FPType getFPDstSelType(unsigned Opc)
bool isGFX12Plus(const MCSubtargetInfo &STI)
bool isEntryFunctionCC(CallingConv::ID CC)
IsaVersion getIsaVersion(StringRef GPU)
bool getMAIIsGFX940XDL(unsigned Opc)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, uint64_t NamedIdx)
unsigned getRegBitWidth(const TargetRegisterClass &RC)
Get the size in bits of a register from the register class RC.
bool getMAIIsDGEMM(unsigned Opc)
Returns true if MAI operation is a double precision GEMM.
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
Definition: CallingConv.h:197
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
Definition: CallingConv.h:200
@ AMDGPU_Gfx
Used for AMD graphics targets.
Definition: CallingConv.h:232
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
Definition: CallingConv.h:249
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
Definition: CallingConv.h:245
@ Dead
Unused definition.
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:329
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
iterator_range< po_iterator< T > > post_order(const T &G)
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1746
auto reverse(ContainerTy &&C)
Definition: STLExtras.h:420
static std::tuple< typename Fields::ValueType... > decode(uint64_t Encoded)
Instruction set architecture version.
Definition: TargetParser.h:130
Represents the counter values to wait for in an s_waitcnt instruction.
Summarize the scheduling resources required for an instruction of a particular scheduling class.
Definition: MCSchedule.h:121
uint16_t ReleaseAtCycle
Cycle at which the resource will be released by an instruction, relative to the cycle in which the instruction is issued (assuming no stalls in between).
Definition: MCSchedule.h:71