[AMDGPU] IGLP: Fixes for VMEM load detection and unsigned int handling #135090
Conversation
Fixes:
- detection of VMEM_READs which are FLAT loads.
- unsigned int underflows in `MFMASmallGemmSingleWaveOpt::applyIGLPStrategy`.
- resetting the global static DSW counters for new runs.

This LLVM defect was identified via the AMD Fuzzing project.

Author: Robert Imschweiler (ro-i)

Full diff: https://github.com/llvm/llvm-project/pull/135090.diff
2 Files Affected:
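To make the second bullet concrete before reading the diff: a minimal standalone sketch (not part of the patch) of the underflow pattern that the loop-bound rewrites below eliminate. When the minuend is smaller than the subtrahend, unsigned subtraction wraps to a value near 2^32, so a bound like `I < DSRCount - 4` spins almost forever instead of skipping the loop. The `DSRCount = 2` value is a hypothetical input of the kind a reduced fuzzer test can produce.

```cpp
#include <cstdio>

int main() {
  // Hypothetical input: a reduced test that produces fewer than 4 DS_READs.
  unsigned DSRCount = 2;

  // Buggy bound: 2u - 4u wraps to 4294967294, so a loop with this bound
  // would iterate roughly 2^32 times instead of zero times.
  std::printf("wrapped bound: %u\n", DSRCount - 4);

  // Fixed form used in the patch: move the subtraction into the start index.
  // When DSRCount < 4, the condition is false immediately and the body never
  // runs, with no unsigned arithmetic involved.
  for (unsigned I = 4; I < DSRCount; ++I)
    std::puts("unreachable for DSRCount < 4");

  return 0;
}
```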
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
index 7b4d00c8214cb..cea3bcf4b31df 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
@@ -80,6 +80,10 @@ enum class SchedGroupMask {
LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
};
+static bool handleAsVMEMInstr(const MachineInstr &MI, const SIInstrInfo *TII) {
+ return TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI));
+}
+
class SchedGroup;
// InstructionRule class is used to enact a filter which determines whether or
@@ -1891,7 +1895,7 @@ class MFMASmallGemmSingleWaveOpt final : public IGLPStrategy {
}
}
- assert(Cache->size());
+ assert(!MFMAsFound || Cache->size());
auto *DAG = SyncPipe[0].DAG;
for (auto &Elt : *Cache) {
if (DAG->IsReachable(Elt, const_cast<SUnit *>(SU)))
@@ -1994,7 +1998,7 @@ class MFMASmallGemmSingleWaveOpt final : public IGLPStrategy {
}
if (NumBits < 128) {
- assert(TII->isVMEM(*MI) && MI->mayLoad());
+ assert(handleAsVMEMInstr(*MI, TII) && MI->mayLoad());
if (NumBits + TRI.getRegSizeInBits(*TRI.getRegClassForOperandReg(
MRI, MI->getOperand(0))) <=
128)
@@ -2079,6 +2083,9 @@ class MFMASmallGemmSingleWaveOpt final : public IGLPStrategy {
static unsigned DSWCount = 0;
static unsigned DSWWithPermCount = 0;
static unsigned DSWWithSharedVMEMCount = 0;
+static void resetDSWCounters() {
+ DSWCount = DSWWithPermCount = DSWWithSharedVMEMCount = 0;
+}
bool MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
@@ -2138,7 +2145,7 @@ bool MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
for (auto &Succ : Pred.getSUnit()->Succs) {
auto *MI = Succ.getSUnit()->getInstr();
- if (!TII->isVMEM(*MI) || !MI->mayLoad())
+ if (!handleAsVMEMInstr(*MI, TII) || !MI->mayLoad())
continue;
if (MissedAny || !VMEMLookup.size()) {
@@ -2200,7 +2207,7 @@ bool MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
// Interleave MFMA with DS_READ prefetch
- for (unsigned I = 0; I < DSRCount - 4; ++I) {
+ for (unsigned I = 4; I < DSRCount; ++I) {
SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
SchedGroupMask::DS_READ, 1, PipelineSyncID, DAG, TII);
SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
@@ -2213,7 +2220,7 @@ bool MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
// Phase 2a: Loop carried dependency with V_PERM
// Schedule VPerm & DS_WRITE as closely as possible to the VMEM_READ they
// depend on. Interleave MFMA to keep XDL unit busy throughout.
- for (unsigned I = 0; I < DSWWithPermCount - DSWWithSharedVMEMCount; ++I) {
+ for (unsigned I = DSWWithSharedVMEMCount; I < DSWWithPermCount; ++I) {
SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
SchedGroupMask::VALU, 4, PipelineSyncID, DAG, TII);
SG->addRule(std::make_shared<IsPermForDSW>(TII, SG->getSGID(), true));
@@ -2250,7 +2257,7 @@ bool MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
// Phase 2b: Loop carried dependency without V_PERM
// Schedule DS_WRITE as closely as possible to the VMEM_READ they depend on.
// Interleave MFMA to keep XDL unit busy throughout.
- for (unsigned I = 0; I < DSWCount - DSWWithPermCount; I++) {
+ for (unsigned I = DSWWithPermCount; I < DSWCount; I++) {
SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII);
SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
@@ -2426,17 +2433,15 @@ bool SchedGroup::canAddMI(const MachineInstr &MI) const {
Result = true;
else if (((SGMask & SchedGroupMask::VMEM) != SchedGroupMask::NONE) &&
- (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI))))
+ handleAsVMEMInstr(MI, TII))
Result = true;
else if (((SGMask & SchedGroupMask::VMEM_READ) != SchedGroupMask::NONE) &&
- MI.mayLoad() &&
- (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI))))
+ MI.mayLoad() && handleAsVMEMInstr(MI, TII))
Result = true;
else if (((SGMask & SchedGroupMask::VMEM_WRITE) != SchedGroupMask::NONE) &&
- MI.mayStore() &&
- (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI))))
+ MI.mayStore() && handleAsVMEMInstr(MI, TII))
Result = true;
else if (((SGMask & SchedGroupMask::DS) != SchedGroupMask::NONE) &&
@@ -2703,5 +2708,7 @@ bool IGroupLPDAGMutation::initIGLPOpt(SUnit &SU) {
/// for a given region.
std::unique_ptr<ScheduleDAGMutation>
llvm::createIGroupLPDAGMutation(AMDGPU::SchedulingPhase Phase) {
+ if (Phase == AMDGPU::SchedulingPhase::Initial)
+ resetDSWCounters();
return std::make_unique<IGroupLPDAGMutation>(Phase);
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll
index 08c0d15432915..3ce25c0fd1fef 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll
@@ -321,6 +321,28 @@ entry:
ret void
}
+; Check fixes for:
+; - detection of VMEM_READS which are FLAT loads.
+; - unsigned int underflows in MFMASmallGemmSingleWaveOpt::applyIGLPStrategy.
+; - resetting global static DSWCounters for new runs.
+; (reduced fuzzer-generated test case)
+define amdgpu_kernel void @test_iglp_opt_flat_load(ptr %ptr1, ptr %ptr2, ptr addrspace(3) %ptr3, ptr addrspace(3) %ptr4) {
+entry:
+ %LGV2 = load <8 x half>, ptr %ptr1, align 16
+ %LGV = load i1, ptr %ptr2, align 1
+ call void @llvm.amdgcn.iglp.opt(i32 1)
+ %C = fcmp ugt <8 x half> zeroinitializer, %LGV2
+ store <8 x i1> %C, ptr addrspace(3) %ptr3, align 1
+ br i1 %LGV, label %common.ret, label %F
+
+common.ret: ; preds = %F, %entry
+ ret void
+
+F: ; preds = %entry
+ store <32 x float> zeroinitializer, ptr addrspace(3) %ptr4, align 128
+ br label %common.ret
+}
+
declare void @llvm.amdgcn.iglp.opt(i32) #1
declare i32 @llvm.amdgcn.workitem.id.x() #1
declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32) #1
Review thread on the new test comment:

  ; - detection of VMEM_READS which are FLAT loads.
  ; - unsigned int underflows in MFMASmallGemmSingleWaveOpt::applyIGLPStrategy.
  ; - resetting global static DSWCounters for new runs.
  ; (reduced fuzzer-generated test case)
Reviewer: Not sure how it checks for all these things.
Author (ro-i): I mean, maybe not for every unsigned int underflow I detected there. (I detected a lot while reducing the initial fuzzer test, but I wasn't sure whether it made sense to try to find a specific test case for every issue I encountered...)
Review thread on handleAsVMEMInstr:

  static bool handleAsVMEMInstr(const MachineInstr &MI, const SIInstrInfo *TII) {
    return TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI));
  }
Reviewer: I don't understand this condition. isVMEM should cover it. There should also be no instructions that are both FLAT and DS.
Author (ro-i): I just copied the condition I found in the original code and put it in a separate little utility function so I could reuse it in the places where the condition had not been updated manually. The `TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI))` condition was introduced in https://reviews.llvm.org/D128158, but not reused consistently. Now it is used consistently. But I can check what happens if it is not used at all...
Author (ro-i): If I only use `TII->isVMEM(MI)` in all the places where I currently use `handleAsVMEMInstr`, some tests fail:

  LLVM :: CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll
  LLVM :: CodeGen/AMDGPU/sched-barrier-hang-weak-dep.mir
  LLVM :: CodeGen/AMDGPU/sched-barrier-pre-RA.mir
  LLVM :: CodeGen/AMDGPU/sched-group-barrier-pipeline-solver.mir
  LLVM :: CodeGen/AMDGPU/sched-group-barrier-pre-RA.mir

There is, for example:

  GLOBAL_STORE_DWORDX4_SADDR %14:vgpr_32, %135:vreg_128_align2, %143.sub2_sub3:sgpr_128, 0, 0, implicit $exec :: (store (s128) into %ir.gep2, align 128, addrspace 1)

which is both FLAT and !DS.
Reviewer: Yes, it is a FLAT-encoded instruction and not a DS-encoded instruction. Both of those are trivially true. But it should be covered by isVMEM.
Author (ro-i): Apparently not:

  (gdb) p MI.dump()
  GLOBAL_STORE_DWORDX4_SADDR %14:vgpr_32, %135:vreg_128_align2, %143.sub2_sub3:sgpr_128, 0, 0, implicit $exec :: (store (s128) into %ir.gep2, align 128, addrspace 1)
  $1 = void
  (gdb) p TII->isVMEM(MI)
  $2 = false
  (gdb) p TII->isFLAT(MI)
  $3 = true
  (gdb) p TII->isDS(MI)
  $4 = false
Author (ro-i): (Let me look into why this isn't the case although it should be.)
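For readers following this thread, a self-contained sketch of a plausible explanation. It assumes (rather than quotes) the shape of the `SIInstrInfo` queries at the time of this discussion: if `isVMEM` only tests the buffer/image encodings, a FLAT-encoded `GLOBAL_*` instruction falls through it, which matches the gdb output above and motivates the helper's extra clause. All types here are stand-ins, not LLVM's real definitions.

```cpp
#include <cstdio>

// Stand-in for llvm::MachineInstr: just the encoding bits we care about.
struct MachineInstr {
  bool MUBUF = false, MTBUF = false, Image = false, Flat = false, DS = false;
};

struct SIInstrInfo {
  // Assumed shape of the queries at the time of this thread.
  bool isMUBUF(const MachineInstr &MI) const { return MI.MUBUF; }
  bool isMTBUF(const MachineInstr &MI) const { return MI.MTBUF; }
  bool isImage(const MachineInstr &MI) const { return MI.Image; }
  bool isFLAT(const MachineInstr &MI) const { return MI.Flat; }
  bool isDS(const MachineInstr &MI) const { return MI.DS; }
  bool isVMEM(const MachineInstr &MI) const {
    return isMUBUF(MI) || isMTBUF(MI) || isImage(MI); // no FLAT bit here
  }
};

// The patch's helper: additionally treat FLAT-but-not-DS instructions as VMEM.
static bool handleAsVMEMInstr(const MachineInstr &MI, const SIInstrInfo *TII) {
  return TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI));
}

int main() {
  MachineInstr GlobalStore;  // models GLOBAL_STORE_DWORDX4_SADDR
  GlobalStore.Flat = true;   // FLAT-encoded, not DS-encoded
  SIInstrInfo TII;
  std::printf("isVMEM: %d, handleAsVMEMInstr: %d\n",
              TII.isVMEM(GlobalStore), handleAsVMEMInstr(GlobalStore, &TII));
  // prints: isVMEM: 0, handleAsVMEMInstr: 1
  return 0;
}
```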
Review thread on the DSW counters:

  static unsigned DSWCount = 0;
  static unsigned DSWWithPermCount = 0;
  static unsigned DSWWithSharedVMEMCount = 0;
  static void resetDSWCounters() {
    DSWCount = DSWWithPermCount = DSWWithSharedVMEMCount = 0;
  }
Reviewer: Using global variables here is completely wrong; these need to be pass members.
Author (ro-i): I know, but since the code seems to be experimental, I thought it might be expected that I don't change it too much?
Reviewer: I'd split fixing this into a separate patch from the rest, and fix the static variables.
Author (ro-i): But if I may, I'm happy to adapt that as well.
Author (ro-i): Alright, will do.
Reviewer (replying to "... because the original authors actually wanted to maintain state"): And doing it like this is completely invalid. This needs to be per-function state at most. Also note that `SIMachineFunctionInfo` is definitely not the correct place to put this either, since this is still information that only needs to be local to the pass.
Author (ro-i): So you're saying that we should make `DSWCount`, `DSWWithPermCount`, and `DSWWithSharedVMEMCount` static variables of `MFMASmallGemmSingleWaveOpt::applyIGLPStrategy()` because they are only used in that function?
However, this wouldn't work for these class-level static members, for example:
llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp, lines 891 to 914 at ae0aa2d:

  class MFMAExpInterleaveOpt final : public IGLPStrategy {
  private:
    // The count of TRANS SUs involved in the interleaved pipeline
    static unsigned TransPipeCount;
    // The count of MFMA SUs involved in the interleaved pipeline
    static unsigned MFMAPipeCount;
    // The count of Add SUs involved in the interleaved pipeline
    static unsigned AddPipeCount;
    // The number of transitive MFMA successors for each TRANS SU
    static unsigned MFMAEnablement;
    // The number of transitive TRANS predecessors for each MFMA SU
    static unsigned ExpRequirement;
    // The count of independent "chains" of MFMA instructions in the pipeline
    static unsigned MFMAChains;
    // The length of each independent "chain" of MFMA instructions
    static unsigned MFMAChainLength;
    // Whether or not the pipeline has V_CVT instructions
    static bool HasCvt;
    // Whether or not there are instructions between the TRANS instruction and
    // V_CVT
    static bool HasChainBetweenCvt;
    // The first occuring DS_READ which feeds an MFMA chain
    static std::optional<unsigned> FirstPipeDSR;
    // The MFMAPipe SUs with no MFMA predecessors
Because e.g. `TransPipeCount` is used in `MFMAExpInterleaveOpt::applyIGLPStrategy()` as well as in `MFMAExpInterleaveOpt::analyzeDAG()`. I mean, `MFMAExpInterleaveOpt` hasn't bothered me / the fuzzer (yet?) and maybe there is no problem, but we might as well fix it while we're at it... (?)
Reviewer: They should not be static anywhere. These are not static values; they belong to the current compile context. These need to be non-static, function-instanced variables that are thread safe and not shared between unrelated compiles in the same process.
Author (ro-i): Just to be sure: that would imply that values from previous stages (`AMDGPU::SchedulingPhase`) should not be cached, and that I should remove this behavior? So that, for example, `DSWWithSharedVMEMCount` is recalculated on every call to `MFMASmallGemmSingleWaveOpt::applyIGLPStrategy()`?
Reviewer: I don't know; maybe it should be cached within a single function. But certainly not across functions or compilations.
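For illustration, one possible shape of the reviewer's suggestion, written as a hypothetical self-contained sketch rather than the actual follow-up patch: the counters become ordinary members of the strategy object, so each compilation constructs fresh state, nothing leaks between unrelated compiles, and the `resetDSWCounters()` hook becomes unnecessary. The base class here is a stand-in for the real `IGLPStrategy`.

```cpp
#include <cstdio>

struct IGLPStrategy {  // stand-in for the real base class
  virtual bool applyIGLPStrategy() = 0;
  virtual ~IGLPStrategy() = default;
};

class MFMASmallGemmSingleWaveOpt final : public IGLPStrategy {
  // Per-instance counters, replacing the former file-scope statics; they are
  // zero-initialized by construction, so no reset hook is needed.
  unsigned DSWCount = 0;
  unsigned DSWWithPermCount = 0;
  unsigned DSWWithSharedVMEMCount = 0;

public:
  bool applyIGLPStrategy() override {
    // The DAG analysis would populate the counters here, for this run only.
    ++DSWCount;
    std::printf("DSWCount for this instance: %u\n", DSWCount);
    return true;
  }
};

int main() {
  // Each scheduling run builds its own strategy object, so counts start at
  // zero and are never shared between unrelated compilations.
  MFMASmallGemmSingleWaveOpt A, B;
  A.applyIGLPStrategy();  // prints 1
  B.applyIGLPStrategy();  // prints 1, not 2: no shared state
  return 0;
}
```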