[AMDGPU] IGLP: Fixes for VMEM load detection and unsigned int handling #135090
Conversation
Fixes:
- detection of VMEM_READs which are FLAT loads.
- unsigned int underflows in `MFMASmallGemmSingleWaveOpt::applyIGLPStrategy`.
- resetting the global static DSW counters for new runs.

This LLVM defect was identified via the AMD Fuzzing project.

Author: Robert Imschweiler (ro-i)

Full diff: https://github.com/llvm/llvm-project/pull/135090.diff
2 Files Affected:
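To make the second bullet concrete before reading the diff: a minimal standalone sketch (not part of the patch) of the underflow pattern that the loop-bound rewrites below eliminate. When the minuend is smaller than the subtrahend, unsigned subtraction wraps to a value near 2^32, so a bound like `I < DSRCount - 4` spins almost forever instead of skipping the loop. The `DSRCount = 2` value is a hypothetical input of the kind a reduced fuzzer test can produce.

```cpp
#include <cstdio>

int main() {
  // Hypothetical input: a reduced test that produces fewer than 4 DS_READs.
  unsigned DSRCount = 2;

  // Buggy bound: 2u - 4u wraps to 4294967294, so a loop with this bound
  // would iterate roughly 2^32 times instead of zero times.
  std::printf("wrapped bound: %u\n", DSRCount - 4);

  // Fixed form used in the patch: move the subtraction into the start index.
  // When DSRCount < 4, the condition is false immediately and the body never
  // runs, with no unsigned arithmetic involved.
  for (unsigned I = 4; I < DSRCount; ++I)
    std::puts("unreachable for DSRCount < 4");

  return 0;
}
```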
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
index 7b4d00c8214cb..cea3bcf4b31df 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
@@ -80,6 +80,10 @@ enum class SchedGroupMask {
LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
};
+static bool handleAsVMEMInstr(const MachineInstr &MI, const SIInstrInfo *TII) {
+ return TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI));
+}
+
class SchedGroup;
// InstructionRule class is used to enact a filter which determines whether or
@@ -1891,7 +1895,7 @@ class MFMASmallGemmSingleWaveOpt final : public IGLPStrategy {
}
}
- assert(Cache->size());
+ assert(!MFMAsFound || Cache->size());
auto *DAG = SyncPipe[0].DAG;
for (auto &Elt : *Cache) {
if (DAG->IsReachable(Elt, const_cast<SUnit *>(SU)))
@@ -1994,7 +1998,7 @@ class MFMASmallGemmSingleWaveOpt final : public IGLPStrategy {
}
if (NumBits < 128) {
- assert(TII->isVMEM(*MI) && MI->mayLoad());
+ assert(handleAsVMEMInstr(*MI, TII) && MI->mayLoad());
if (NumBits + TRI.getRegSizeInBits(*TRI.getRegClassForOperandReg(
MRI, MI->getOperand(0))) <=
128)
@@ -2079,6 +2083,9 @@ class MFMASmallGemmSingleWaveOpt final : public IGLPStrategy {
static unsigned DSWCount = 0;
static unsigned DSWWithPermCount = 0;
static unsigned DSWWithSharedVMEMCount = 0;
+static void resetDSWCounters() {
+ DSWCount = DSWWithPermCount = DSWWithSharedVMEMCount = 0;
+}
bool MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
@@ -2138,7 +2145,7 @@ bool MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
for (auto &Succ : Pred.getSUnit()->Succs) {
auto *MI = Succ.getSUnit()->getInstr();
- if (!TII->isVMEM(*MI) || !MI->mayLoad())
+ if (!handleAsVMEMInstr(*MI, TII) || !MI->mayLoad())
continue;
if (MissedAny || !VMEMLookup.size()) {
@@ -2200,7 +2207,7 @@ bool MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
// Interleave MFMA with DS_READ prefetch
- for (unsigned I = 0; I < DSRCount - 4; ++I) {
+ for (unsigned I = 4; I < DSRCount; ++I) {
SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
SchedGroupMask::DS_READ, 1, PipelineSyncID, DAG, TII);
SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
@@ -2213,7 +2220,7 @@ bool MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
// Phase 2a: Loop carried dependency with V_PERM
// Schedule VPerm & DS_WRITE as closely as possible to the VMEM_READ they
// depend on. Interleave MFMA to keep XDL unit busy throughout.
- for (unsigned I = 0; I < DSWWithPermCount - DSWWithSharedVMEMCount; ++I) {
+ for (unsigned I = DSWWithSharedVMEMCount; I < DSWWithPermCount; ++I) {
SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
SchedGroupMask::VALU, 4, PipelineSyncID, DAG, TII);
SG->addRule(std::make_shared<IsPermForDSW>(TII, SG->getSGID(), true));
@@ -2250,7 +2257,7 @@ bool MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
// Phase 2b: Loop carried dependency without V_PERM
// Schedule DS_WRITE as closely as possible to the VMEM_READ they depend on.
// Interleave MFMA to keep XDL unit busy throughout.
- for (unsigned I = 0; I < DSWCount - DSWWithPermCount; I++) {
+ for (unsigned I = DSWWithPermCount; I < DSWCount; I++) {
SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII);
SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
@@ -2426,17 +2433,15 @@ bool SchedGroup::canAddMI(const MachineInstr &MI) const {
Result = true;
else if (((SGMask & SchedGroupMask::VMEM) != SchedGroupMask::NONE) &&
- (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI))))
+ handleAsVMEMInstr(MI, TII))
Result = true;
else if (((SGMask & SchedGroupMask::VMEM_READ) != SchedGroupMask::NONE) &&
- MI.mayLoad() &&
- (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI))))
+ MI.mayLoad() && handleAsVMEMInstr(MI, TII))
Result = true;
else if (((SGMask & SchedGroupMask::VMEM_WRITE) != SchedGroupMask::NONE) &&
- MI.mayStore() &&
- (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI))))
+ MI.mayStore() && handleAsVMEMInstr(MI, TII))
Result = true;
else if (((SGMask & SchedGroupMask::DS) != SchedGroupMask::NONE) &&
@@ -2703,5 +2708,7 @@ bool IGroupLPDAGMutation::initIGLPOpt(SUnit &SU) {
/// for a given region.
std::unique_ptr<ScheduleDAGMutation>
llvm::createIGroupLPDAGMutation(AMDGPU::SchedulingPhase Phase) {
+ if (Phase == AMDGPU::SchedulingPhase::Initial)
+ resetDSWCounters();
return std::make_unique<IGroupLPDAGMutation>(Phase);
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll
index 08c0d15432915..3ce25c0fd1fef 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll
@@ -321,6 +321,28 @@ entry:
ret void
}
+; Check fixes for:
+; - detection of VMEM_READS which are FLAT loads.
+; - unsigned int underflows in MFMASmallGemmSingleWaveOpt::applyIGLPStrategy.
+; - resetting global static DSWCounters for new runs.
+; (reduced fuzzer-generated test case)
+define amdgpu_kernel void @test_iglp_opt_flat_load(ptr %ptr1, ptr %ptr2, ptr addrspace(3) %ptr3, ptr addrspace(3) %ptr4) {
+entry:
+ %LGV2 = load <8 x half>, ptr %ptr1, align 16
+ %LGV = load i1, ptr %ptr2, align 1
+ call void @llvm.amdgcn.iglp.opt(i32 1)
+ %C = fcmp ugt <8 x half> zeroinitializer, %LGV2
+ store <8 x i1> %C, ptr addrspace(3) %ptr3, align 1
+ br i1 %LGV, label %common.ret, label %F
+
+common.ret: ; preds = %F, %entry
+ ret void
+
+F: ; preds = %entry
+ store <32 x float> zeroinitializer, ptr addrspace(3) %ptr4, align 128
+ br label %common.ret
+}
+
declare void @llvm.amdgcn.iglp.opt(i32) #1
declare i32 @llvm.amdgcn.workitem.id.x() #1
declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32) #1
Review thread on the new test comment:

  ; - detection of VMEM_READS which are FLAT loads.
  ; - unsigned int underflows in MFMASmallGemmSingleWaveOpt::applyIGLPStrategy.
  ; - resetting global static DSWCounters for new runs.
  ; (reduced fuzzer-generated test case)
Reviewer: Not sure how it checks for all these things.
Author (ro-i): I mean, maybe not for every unsigned int underflow I detected there. (I detected a lot while reducing the initial fuzzer test, but I wasn't sure whether it made sense to try to find a specific test case for every issue I encountered...)
Review thread on handleAsVMEMInstr:

  static bool handleAsVMEMInstr(const MachineInstr &MI, const SIInstrInfo *TII) {
    return TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI));
  }
Reviewer: I don't understand this condition. isVMEM should cover it. There should also be no instructions that are both FLAT and DS.
Author (ro-i): I just copied the condition I found in the original code and put it in a separate little utility function so I could reuse it in the places where the condition had not been updated manually. The `TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI))` condition was introduced in https://reviews.llvm.org/D128158, but not reused consistently. Now it is used consistently. But I can check what happens if it is not used at all...
Author (ro-i): If I only use `TII->isVMEM(MI)` in all the places where I currently use `handleAsVMEMInstr`, some tests fail:

  LLVM :: CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll
  LLVM :: CodeGen/AMDGPU/sched-barrier-hang-weak-dep.mir
  LLVM :: CodeGen/AMDGPU/sched-barrier-pre-RA.mir
  LLVM :: CodeGen/AMDGPU/sched-group-barrier-pipeline-solver.mir
  LLVM :: CodeGen/AMDGPU/sched-group-barrier-pre-RA.mir

There is, for example:

  GLOBAL_STORE_DWORDX4_SADDR %14:vgpr_32, %135:vreg_128_align2, %143.sub2_sub3:sgpr_128, 0, 0, implicit $exec :: (store (s128) into %ir.gep2, align 128, addrspace 1)

which is both FLAT and !DS.
Reviewer: Yes, it is a FLAT-encoded instruction and not a DS-encoded instruction. Both of those are trivially true. But it should be covered by isVMEM.
Author (ro-i): Apparently not:

  (gdb) p MI.dump()
  GLOBAL_STORE_DWORDX4_SADDR %14:vgpr_32, %135:vreg_128_align2, %143.sub2_sub3:sgpr_128, 0, 0, implicit $exec :: (store (s128) into %ir.gep2, align 128, addrspace 1)
  $1 = void
  (gdb) p TII->isVMEM(MI)
  $2 = false
  (gdb) p TII->isFLAT(MI)
  $3 = true
  (gdb) p TII->isDS(MI)
  $4 = false
Author (ro-i): (Let me look into why this isn't the case although it should be.)
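For readers following this thread, a self-contained sketch of a plausible explanation. It assumes (rather than quotes) the shape of the `SIInstrInfo` queries at the time of this discussion: if `isVMEM` only tests the buffer/image encodings, a FLAT-encoded `GLOBAL_*` instruction falls through it, which matches the gdb output above and motivates the helper's extra clause. All types here are stand-ins, not LLVM's real definitions.

```cpp
#include <cstdio>

// Stand-in for llvm::MachineInstr: just the encoding bits we care about.
struct MachineInstr {
  bool MUBUF = false, MTBUF = false, Image = false, Flat = false, DS = false;
};

struct SIInstrInfo {
  // Assumed shape of the queries at the time of this thread.
  bool isMUBUF(const MachineInstr &MI) const { return MI.MUBUF; }
  bool isMTBUF(const MachineInstr &MI) const { return MI.MTBUF; }
  bool isImage(const MachineInstr &MI) const { return MI.Image; }
  bool isFLAT(const MachineInstr &MI) const { return MI.Flat; }
  bool isDS(const MachineInstr &MI) const { return MI.DS; }
  bool isVMEM(const MachineInstr &MI) const {
    return isMUBUF(MI) || isMTBUF(MI) || isImage(MI); // no FLAT bit here
  }
};

// The patch's helper: additionally treat FLAT-but-not-DS instructions as VMEM.
static bool handleAsVMEMInstr(const MachineInstr &MI, const SIInstrInfo *TII) {
  return TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI));
}

int main() {
  MachineInstr GlobalStore;  // models GLOBAL_STORE_DWORDX4_SADDR
  GlobalStore.Flat = true;   // FLAT-encoded, not DS-encoded
  SIInstrInfo TII;
  std::printf("isVMEM: %d, handleAsVMEMInstr: %d\n",
              TII.isVMEM(GlobalStore), handleAsVMEMInstr(GlobalStore, &TII));
  // prints: isVMEM: 0, handleAsVMEMInstr: 1
  return 0;
}
```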
Review thread on the DSW counters:

  static unsigned DSWCount = 0;
  static unsigned DSWWithPermCount = 0;
  static unsigned DSWWithSharedVMEMCount = 0;
  static void resetDSWCounters() {
    DSWCount = DSWWithPermCount = DSWWithSharedVMEMCount = 0;
  }
Reviewer: Using global variables here is completely wrong; these need to be pass members.
Author (ro-i): I know, but since the code seems to be experimental, I thought it might be expected that I don't change it too much?
Reviewer: I'd split fixing this into a separate patch from the rest, and fix the static variables.
Author (ro-i): But if I may, I'm happy to adapt that as well.
Author (ro-i): Alright, will do.
Reviewer (replying to "... because the original authors actually wanted to maintain state"): And doing it like this is completely invalid. This needs to be per-function state at most. Also note that `SIMachineFunctionInfo` is definitely not the correct place to put this either, since this is still information that only needs to be local to the pass.
Author (ro-i): So you're saying that we should make `DSWCount`, `DSWWithPermCount`, and `DSWWithSharedVMEMCount` static variables of `MFMASmallGemmSingleWaveOpt::applyIGLPStrategy()` because they are only used in that function?
However, this wouldn't work for these class-level static members, for example:
llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp, lines 891 to 914 at ae0aa2d:

  class MFMAExpInterleaveOpt final : public IGLPStrategy {
  private:
    // The count of TRANS SUs involved in the interleaved pipeline
    static unsigned TransPipeCount;
    // The count of MFMA SUs involved in the interleaved pipeline
    static unsigned MFMAPipeCount;
    // The count of Add SUs involved in the interleaved pipeline
    static unsigned AddPipeCount;
    // The number of transitive MFMA successors for each TRANS SU
    static unsigned MFMAEnablement;
    // The number of transitive TRANS predecessors for each MFMA SU
    static unsigned ExpRequirement;
    // The count of independent "chains" of MFMA instructions in the pipeline
    static unsigned MFMAChains;
    // The length of each independent "chain" of MFMA instructions
    static unsigned MFMAChainLength;
    // Whether or not the pipeline has V_CVT instructions
    static bool HasCvt;
    // Whether or not there are instructions between the TRANS instruction and
    // V_CVT
    static bool HasChainBetweenCvt;
    // The first occuring DS_READ which feeds an MFMA chain
    static std::optional<unsigned> FirstPipeDSR;
    // The MFMAPipe SUs with no MFMA predecessors
Because e.g. `TransPipeCount` is used in `MFMAExpInterleaveOpt::applyIGLPStrategy()` as well as in `MFMAExpInterleaveOpt::analyzeDAG()`. I mean, `MFMAExpInterleaveOpt` hasn't bothered me / the fuzzer (yet?) and maybe there is no problem, but we might as well fix it while we're at it... (?)
Reviewer: They should not be static anywhere. These are not static values; they belong to the current compile context. These need to be non-static, function-instanced variables that are thread safe and not shared between unrelated compiles in the same process.
Author (ro-i): Just to be sure: that would imply that values from previous stages (`AMDGPU::SchedulingPhase`) should not be cached, and that I should remove this behavior? So that, for example, `DSWWithSharedVMEMCount` is recalculated on every call to `MFMASmallGemmSingleWaveOpt::applyIGLPStrategy()`?
Reviewer: I don't know; maybe it should be cached within a single function. But certainly not across functions or compilations.
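For illustration, one possible shape of the reviewer's suggestion, written as a hypothetical self-contained sketch rather than the actual follow-up patch: the counters become ordinary members of the strategy object, so each compilation constructs fresh state, nothing leaks between unrelated compiles, and the `resetDSWCounters()` hook becomes unnecessary. The base class here is a stand-in for the real `IGLPStrategy`.

```cpp
#include <cstdio>

struct IGLPStrategy {  // stand-in for the real base class
  virtual bool applyIGLPStrategy() = 0;
  virtual ~IGLPStrategy() = default;
};

class MFMASmallGemmSingleWaveOpt final : public IGLPStrategy {
  // Per-instance counters, replacing the former file-scope statics; they are
  // zero-initialized by construction, so no reset hook is needed.
  unsigned DSWCount = 0;
  unsigned DSWWithPermCount = 0;
  unsigned DSWWithSharedVMEMCount = 0;

public:
  bool applyIGLPStrategy() override {
    // The DAG analysis would populate the counters here, for this run only.
    ++DSWCount;
    std::printf("DSWCount for this instance: %u\n", DSWCount);
    return true;
  }
};

int main() {
  // Each scheduling run builds its own strategy object, so counts start at
  // zero and are never shared between unrelated compilations.
  MFMASmallGemmSingleWaveOpt A, B;
  A.applyIGLPStrategy();  // prints 1
  B.applyIGLPStrategy();  // prints 1, not 2: no shared state
  return 0;
}
```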