[VectorCombine] Support simplification to scalar store for multiple insertelt #132820
Conversation
…nsertelt
Previously, we supported simplifying load-insertelt-store to getelementptr-store when only one insertelt exists. This patch supports multiple insertelements.
Proof: https://alive2.llvm.org/ce/z/QTspTf
e3274a2 to e589ed0 (Compare)
@llvm/pr-subscribers-vectorizers @llvm/pr-subscribers-llvm-transforms
Author: hanbeom (ParkHanbum)
Changes
Previously, we supported simplifying load-insertelt-store to getelementptr-store when only one insertelt exists. This patch supports multiple insertelements.
Proof: https://alive2.llvm.org/ce/z/QTspTf
Patch is 20.57 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/132820.diff
2 Files Affected:
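As a quick illustration of the fold, here is a minimal LLVM IR sketch drawn from the insert_store2 test added below (it is not part of the diff itself, and the %gep6/%gep7 names are hypothetical):

; before: vector load, two insertelements, vector store
  %0 = load <8 x i16>, ptr %q
  %vec1 = insertelement <8 x i16> %0, i16 %s, i32 6
  %vec2 = insertelement <8 x i16> %vec1, i16 %s, i32 7
  store <8 x i16> %vec2, ptr %q, align 1
; after: one scalar store through an inbounds GEP per insert, with scalarized alignments
  %gep6 = getelementptr inbounds <8 x i16>, ptr %q, i32 0, i32 6
  store i16 %s, ptr %gep6, align 4
  %gep7 = getelementptr inbounds <8 x i16>, ptr %q, i32 0, i32 7
  store i16 %s, ptr %gep7, align 2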
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 4bfe41a5ed00d..483a344d33fb7 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -115,7 +115,7 @@ class VectorCombine {
bool scalarizeVPIntrinsic(Instruction &I);
bool foldExtractedCmps(Instruction &I);
bool foldBinopOfReductions(Instruction &I);
- bool foldSingleElementStore(Instruction &I);
+ bool foldInsertElementsStore(Instruction &I);
bool scalarizeLoadExtract(Instruction &I);
bool foldConcatOfBoolMasks(Instruction &I);
bool foldPermuteOfBinops(Instruction &I);
@@ -1493,58 +1493,88 @@ static Align computeAlignmentAfterScalarization(Align VectorAlignment,
// %0 = bitcast <4 x i32>* %a to i32*
// %1 = getelementptr inbounds i32, i32* %0, i64 0, i64 1
// store i32 %b, i32* %1
-bool VectorCombine::foldSingleElementStore(Instruction &I) {
+bool VectorCombine::foldInsertElementsStore(Instruction &I) {
auto *SI = cast<StoreInst>(&I);
if (!SI->isSimple() || !isa<VectorType>(SI->getValueOperand()->getType()))
return false;
- // TODO: Combine more complicated patterns (multiple insert) by referencing
- // TargetTransformInfo.
- Instruction *Source;
- Value *NewElement;
- Value *Idx;
- if (!match(SI->getValueOperand(),
- m_InsertElt(m_Instruction(Source), m_Value(NewElement),
- m_Value(Idx))))
- return false;
-
- if (auto *Load = dyn_cast<LoadInst>(Source)) {
- auto VecTy = cast<VectorType>(SI->getValueOperand()->getType());
- Value *SrcAddr = Load->getPointerOperand()->stripPointerCasts();
- // Don't optimize for atomic/volatile load or store. Ensure memory is not
- // modified between, vector type matches store size, and index is inbounds.
- if (!Load->isSimple() || Load->getParent() != SI->getParent() ||
- !DL->typeSizeEqualsStoreSize(Load->getType()->getScalarType()) ||
- SrcAddr != SI->getPointerOperand()->stripPointerCasts())
- return false;
+ Value *Source = SI->getValueOperand();
+ // Track back multiple inserts.
+ SmallVector<std::pair<Value *, Value *>, 4> InsertElements;
+ Value *Base = Source;
+ while (auto *Insert = dyn_cast<InsertElementInst>(Base)) {
+ if (!Insert->hasOneUse())
+ break;
+ Value *InsertVal = Insert->getOperand(1);
+ Value *Idx = Insert->getOperand(2);
+ InsertElements.push_back({InsertVal, Idx});
+ Base = Insert->getOperand(0);
+ }
- auto ScalarizableIdx = canScalarizeAccess(VecTy, Idx, Load, AC, DT);
- if (ScalarizableIdx.isUnsafe() ||
- isMemModifiedBetween(Load->getIterator(), SI->getIterator(),
- MemoryLocation::get(SI), AA))
- return false;
+ if (InsertElements.empty())
+ return false;
- // Ensure we add the load back to the worklist BEFORE its users so they can
- // erased in the correct order.
- Worklist.push(Load);
+ auto *Load = dyn_cast<LoadInst>(Base);
+ if (!Load)
+ return false;
+ auto VecTy = cast<VectorType>(SI->getValueOperand()->getType());
+ Value *SrcAddr = Load->getPointerOperand()->stripPointerCasts();
+ // Don't optimize for atomic/volatile load or store. Ensure memory is not
+ // modified between, vector type matches store size, and index is inbounds.
+ if (!Load->isSimple() || Load->getParent() != SI->getParent() ||
+ !DL->typeSizeEqualsStoreSize(Load->getType()->getScalarType()) ||
+ SrcAddr != SI->getPointerOperand()->stripPointerCasts())
+ return false;
+
+ if (isMemModifiedBetween(Load->getIterator(), SI->getIterator(),
+ MemoryLocation::get(SI), AA))
+ return false;
+
+ for (size_t i = 0; i < InsertElements.size(); i++) {
+ Value *Idx = InsertElements[i].second;
+ auto ScalarizableIdx = canScalarizeAccess(VecTy, Idx, Load, AC, DT);
+ if (ScalarizableIdx.isUnsafe())
+ return false;
if (ScalarizableIdx.isSafeWithFreeze())
ScalarizableIdx.freeze(Builder, *cast<Instruction>(Idx));
+ }
+
+ // Ensure we add the load back to the worklist BEFORE its users so they can
+ // erased in the correct order.
+ Worklist.push(Load);
+ stable_sort(InsertElements, [](const std::pair<Value *, Value *> &A,
+ const std::pair<Value *, Value *> &B) {
+ bool AIsConst = isa<ConstantInt>(A.second);
+ bool BIsConst = isa<ConstantInt>(B.second);
+ if (AIsConst != BIsConst)
+ return AIsConst;
+
+ if (AIsConst && BIsConst)
+ return cast<ConstantInt>(A.second)->getZExtValue() <
+ cast<ConstantInt>(B.second)->getZExtValue();
+ return false;
+ });
+
+ StoreInst *NSI;
+ for (size_t i = 0; i < InsertElements.size(); i++) {
+ Value *InsertVal = InsertElements[i].first;
+ Value *Idx = InsertElements[i].second;
+
Value *GEP = Builder.CreateInBoundsGEP(
SI->getValueOperand()->getType(), SI->getPointerOperand(),
{ConstantInt::get(Idx->getType(), 0), Idx});
- StoreInst *NSI = Builder.CreateStore(NewElement, GEP);
+ NSI = Builder.CreateStore(InsertVal, GEP);
NSI->copyMetadata(*SI);
Align ScalarOpAlignment = computeAlignmentAfterScalarization(
- std::max(SI->getAlign(), Load->getAlign()), NewElement->getType(), Idx,
+ std::max(SI->getAlign(), Load->getAlign()), InsertVal->getType(), Idx,
*DL);
NSI->setAlignment(ScalarOpAlignment);
- replaceValue(I, *NSI);
- eraseInstruction(I);
- return true;
}
- return false;
+ replaceValue(I, *NSI);
+ eraseInstruction(I);
+ return true;
}
/// Try to scalarize vector loads feeding extractelement instructions.
@@ -3527,7 +3557,7 @@ bool VectorCombine::run() {
}
if (Opcode == Instruction::Store)
- MadeChange |= foldSingleElementStore(I);
+ MadeChange |= foldInsertElementsStore(I);
// If this is an early pipeline invocation of this pass, we are done.
if (TryEarlyFoldsOnly)
diff --git a/llvm/test/Transforms/VectorCombine/load-insert-store.ll b/llvm/test/Transforms/VectorCombine/load-insert-store.ll
index 93565c1a708eb..33b4562844720 100644
--- a/llvm/test/Transforms/VectorCombine/load-insert-store.ll
+++ b/llvm/test/Transforms/VectorCombine/load-insert-store.ll
@@ -16,6 +16,78 @@ entry:
ret void
}
+define void @insert_store2(ptr %q, i16 zeroext %s) {
+; CHECK-LABEL: @insert_store2(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds <8 x i16>, ptr [[Q:%.*]], i32 0, i32 6
+; CHECK-NEXT: store i16 [[S:%.*]], ptr [[TMP0]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds <8 x i16>, ptr [[Q]], i32 0, i32 7
+; CHECK-NEXT: store i16 [[S]], ptr [[TMP1]], align 2
+; CHECK-NEXT: ret void
+;
+entry:
+ %0 = load <8 x i16>, ptr %q
+ %vec1 = insertelement <8 x i16> %0, i16 %s, i32 6
+ %vec2 = insertelement <8 x i16> %vec1, i16 %s, i32 7
+ store <8 x i16> %vec2, ptr %q, align 1
+ ret void
+}
+
+define void @insert_store3(ptr %q, i16 zeroext %s) {
+; CHECK-LABEL: @insert_store3(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds <8 x i16>, ptr [[Q:%.*]], i32 0, i32 5
+; CHECK-NEXT: store i16 [[S:%.*]], ptr [[TMP0]], align 2
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds <8 x i16>, ptr [[Q]], i32 0, i32 6
+; CHECK-NEXT: store i16 [[S]], ptr [[TMP1]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds <8 x i16>, ptr [[Q]], i32 0, i32 7
+; CHECK-NEXT: store i16 [[S]], ptr [[TMP2]], align 2
+; CHECK-NEXT: ret void
+;
+entry:
+ %0 = load <8 x i16>, ptr %q
+ %vec1 = insertelement <8 x i16> %0, i16 %s, i32 5
+ %vec2 = insertelement <8 x i16> %vec1, i16 %s, i32 6
+ %vec3 = insertelement <8 x i16> %vec2, i16 %s, i32 7
+ store <8 x i16> %vec3, ptr %q, align 1
+ ret void
+}
+
+define void @insert_store8(ptr %q, i16 zeroext %s) {
+; CHECK-LABEL: @insert_store8(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds <8 x i16>, ptr [[Q:%.*]], i32 0, i32 0
+; CHECK-NEXT: store i16 [[S:%.*]], ptr [[TMP0]], align 16
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds <8 x i16>, ptr [[Q]], i32 0, i32 1
+; CHECK-NEXT: store i16 [[S]], ptr [[TMP1]], align 2
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds <8 x i16>, ptr [[Q]], i32 0, i32 2
+; CHECK-NEXT: store i16 [[S]], ptr [[TMP2]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds <8 x i16>, ptr [[Q]], i32 0, i32 3
+; CHECK-NEXT: store i16 [[S]], ptr [[TMP3]], align 2
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, ptr [[Q]], i32 0, i32 4
+; CHECK-NEXT: store i16 [[S]], ptr [[TMP4]], align 8
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds <8 x i16>, ptr [[Q]], i32 0, i32 5
+; CHECK-NEXT: store i16 [[S]], ptr [[TMP5]], align 2
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds <8 x i16>, ptr [[Q]], i32 0, i32 6
+; CHECK-NEXT: store i16 [[S]], ptr [[TMP6]], align 4
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds <8 x i16>, ptr [[Q]], i32 0, i32 7
+; CHECK-NEXT: store i16 [[S]], ptr [[TMP7]], align 2
+; CHECK-NEXT: ret void
+;
+entry:
+ %0 = load <8 x i16>, ptr %q
+ %vec1 = insertelement <8 x i16> %0, i16 %s, i32 0
+ %vec2 = insertelement <8 x i16> %vec1, i16 %s, i32 1
+ %vec3 = insertelement <8 x i16> %vec2, i16 %s, i32 2
+ %vec4 = insertelement <8 x i16> %vec3, i16 %s, i32 3
+ %vec5 = insertelement <8 x i16> %vec4, i16 %s, i32 4
+ %vec6 = insertelement <8 x i16> %vec5, i16 %s, i32 5
+ %vec7 = insertelement <8 x i16> %vec6, i16 %s, i32 6
+ %vec8 = insertelement <8 x i16> %vec7, i16 %s, i32 7
+ store <8 x i16> %vec8, ptr %q, align 1
+ ret void
+}
+
define void @insert_store_i16_align1(ptr %q, i16 zeroext %s) {
; CHECK-LABEL: @insert_store_i16_align1(
; CHECK-NEXT: entry:
@@ -827,3 +899,257 @@ bb:
declare i32 @bar(i32, i1) readonly
declare double @llvm.log2.f64(double)
+
+define void @insert_store_gap(ptr %q, i16 zeroext %s) {
+; CHECK-LABEL: @insert_store_gap(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds <8 x i16>, ptr [[Q:%.*]], i32 0, i32 2
+; CHECK-NEXT: store i16 [[S:%.*]], ptr [[TMP0]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds <8 x i16>, ptr [[Q]], i32 0, i32 5
+; CHECK-NEXT: store i16 [[S]], ptr [[TMP1]], align 2
+; CHECK-NEXT: ret void
+;
+entry:
+ %0 = load <8 x i16>, ptr %q
+ %vec1 = insertelement <8 x i16> %0, i16 %s, i32 2
+ %vec2 = insertelement <8 x i16> %vec1, i16 %s, i32 5
+ store <8 x i16> %vec2, ptr %q
+ ret void
+}
+
+define void @insert_store_reverse(ptr %q, i16 zeroext %s) {
+; CHECK-LABEL: @insert_store_reverse(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds <8 x i16>, ptr [[Q:%.*]], i32 0, i32 5
+; CHECK-NEXT: store i16 [[S:%.*]], ptr [[TMP0]], align 2
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds <8 x i16>, ptr [[Q]], i32 0, i32 6
+; CHECK-NEXT: store i16 [[S]], ptr [[TMP1]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds <8 x i16>, ptr [[Q]], i32 0, i32 7
+; CHECK-NEXT: store i16 [[S]], ptr [[TMP2]], align 2
+; CHECK-NEXT: ret void
+;
+entry:
+ %0 = load <8 x i16>, ptr %q
+ %vec1 = insertelement <8 x i16> %0, i16 %s, i32 7
+ %vec2 = insertelement <8 x i16> %vec1, i16 %s, i32 6
+ %vec3 = insertelement <8 x i16> %vec2, i16 %s, i32 5
+ store <8 x i16> %vec3, ptr %q
+ ret void
+}
+
+define void @insert_store_duplicate(ptr %q, i16 zeroext %s) {
+; CHECK-LABEL: @insert_store_duplicate(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds <8 x i16>, ptr [[Q:%.*]], i32 0, i32 3
+; CHECK-NEXT: store i16 [[S:%.*]], ptr [[TMP0]], align 2
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds <8 x i16>, ptr [[Q]], i32 0, i32 3
+; CHECK-NEXT: store i16 [[S]], ptr [[TMP1]], align 2
+; CHECK-NEXT: ret void
+;
+entry:
+ %0 = load <8 x i16>, ptr %q
+ %vec1 = insertelement <8 x i16> %0, i16 %s, i32 3
+ %vec2 = insertelement <8 x i16> %vec1, i16 %s, i32 3
+ store <8 x i16> %vec2, ptr %q
+ ret void
+}
+
+define void @insert_store_i32(ptr %q, i32 zeroext %s) {
+; CHECK-LABEL: @insert_store_i32(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds <4 x i32>, ptr [[Q:%.*]], i32 0, i32 2
+; CHECK-NEXT: store i32 [[S:%.*]], ptr [[TMP0]], align 8
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds <4 x i32>, ptr [[Q]], i32 0, i32 3
+; CHECK-NEXT: store i32 [[S]], ptr [[TMP1]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ %0 = load <4 x i32>, ptr %q
+ %vec1 = insertelement <4 x i32> %0, i32 %s, i32 2
+ %vec2 = insertelement <4 x i32> %vec1, i32 %s, i32 3
+ store <4 x i32> %vec2, ptr %q
+ ret void
+}
+
+define void @insert_store_i8(ptr %q, i8 zeroext %s) {
+; CHECK-LABEL: @insert_store_i8(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds <16 x i8>, ptr [[Q:%.*]], i32 0, i32 8
+; CHECK-NEXT: store i8 [[S:%.*]], ptr [[TMP0]], align 8
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds <16 x i8>, ptr [[Q]], i32 0, i32 9
+; CHECK-NEXT: store i8 [[S]], ptr [[TMP1]], align 1
+; CHECK-NEXT: ret void
+;
+entry:
+ %0 = load <16 x i8>, ptr %q
+ %vec1 = insertelement <16 x i8> %0, i8 %s, i32 8
+ %vec2 = insertelement <16 x i8> %vec1, i8 %s, i32 9
+ store <16 x i8> %vec2, ptr %q
+ ret void
+}
+
+define void @insert_store_alignment(ptr %q, i16 zeroext %s) {
+; CHECK-LABEL: @insert_store_alignment(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds <8 x i16>, ptr [[Q:%.*]], i32 0, i32 0
+; CHECK-NEXT: store i16 [[S:%.*]], ptr [[TMP0]], align 16
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds <8 x i16>, ptr [[Q]], i32 0, i32 4
+; CHECK-NEXT: store i16 [[S]], ptr [[TMP1]], align 8
+; CHECK-NEXT: ret void
+;
+entry:
+ %0 = load <8 x i16>, ptr %q, align 16
+ %vec1 = insertelement <8 x i16> %0, i16 %s, i32 0
+ %vec2 = insertelement <8 x i16> %vec1, i16 %s, i32 4
+ store <8 x i16> %vec2, ptr %q, align 16
+ ret void
+}
+
+define void @insert_store_size(ptr %q, i16 zeroext %s) {
+; CHECK-LABEL: @insert_store_size(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds <16 x i16>, ptr [[Q:%.*]], i32 0, i32 8
+; CHECK-NEXT: store i16 [[S:%.*]], ptr [[TMP0]], align 16
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds <16 x i16>, ptr [[Q]], i32 0, i32 12
+; CHECK-NEXT: store i16 [[S]], ptr [[TMP1]], align 8
+; CHECK-NEXT: ret void
+;
+entry:
+ %0 = load <16 x i16>, ptr %q
+ %vec1 = insertelement <16 x i16> %0, i16 %s, i32 8
+ %vec2 = insertelement <16 x i16> %vec1, i16 %s, i32 12
+ store <16 x i16> %vec2, ptr %q
+ ret void
+}
+
+define void @insert_store_nonconst4(ptr %q, i8 zeroext %s, i32 %idx1, i32 %idx2, i32 %idx3, i32 %idx4) {
+; CHECK-LABEL: @insert_store_nonconst4(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[Q:%.*]], align 16
+; CHECK-NEXT: [[VECINS1:%.*]] = insertelement <16 x i8> [[TMP0]], i8 [[S:%.*]], i32 [[IDX1:%.*]]
+; CHECK-NEXT: [[VECINS2:%.*]] = insertelement <16 x i8> [[VECINS1]], i8 [[S]], i32 [[IDX2:%.*]]
+; CHECK-NEXT: [[VECINS3:%.*]] = insertelement <16 x i8> [[VECINS2]], i8 [[S]], i32 [[IDX3:%.*]]
+; CHECK-NEXT: [[VECINS4:%.*]] = insertelement <16 x i8> [[VECINS3]], i8 [[S]], i32 [[IDX4:%.*]]
+; CHECK-NEXT: store <16 x i8> [[VECINS4]], ptr [[Q]], align 16
+; CHECK-NEXT: ret void
+;
+entry:
+ %0 = load <16 x i8>, ptr %q
+ %vecins1 = insertelement <16 x i8> %0, i8 %s, i32 %idx1
+ %vecins2 = insertelement <16 x i8> %vecins1, i8 %s, i32 %idx2
+ %vecins3 = insertelement <16 x i8> %vecins2, i8 %s, i32 %idx3
+ %vecins4 = insertelement <16 x i8> %vecins3, i8 %s, i32 %idx4
+ store <16 x i8> %vecins4, ptr %q
+ ret void
+}
+
+define void @insert_store_vscale_nonconst2(ptr %q, i8 zeroext %s, i32 %idx1, i32 %idx2) {
+; CHECK-LABEL: @insert_store_vscale_nonconst2(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load <vscale x 16 x i8>, ptr [[Q:%.*]], align 16
+; CHECK-NEXT: [[VECINS1:%.*]] = insertelement <vscale x 16 x i8> [[TMP0]], i8 [[S:%.*]], i32 [[IDX1:%.*]]
+; CHECK-NEXT: [[VECINS2:%.*]] = insertelement <vscale x 16 x i8> [[VECINS1]], i8 [[S]], i32 [[IDX2:%.*]]
+; CHECK-NEXT: store <vscale x 16 x i8> [[VECINS2]], ptr [[Q]], align 16
+; CHECK-NEXT: ret void
+;
+entry:
+ %0 = load <vscale x 16 x i8>, ptr %q
+ %vecins1 = insertelement <vscale x 16 x i8> %0, i8 %s, i32 %idx1
+ %vecins2 = insertelement <vscale x 16 x i8> %vecins1, i8 %s, i32 %idx2
+ store <vscale x 16 x i8> %vecins2, ptr %q
+ ret void
+}
+
+define void @insert_store_nonconst_large_alignment2(ptr %q, i32 zeroext %s, i32 %idx1, i32 %idx2) {
+; CHECK-LABEL: @insert_store_nonconst_large_alignment2(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP1:%.*]] = icmp ult i32 [[IDX1:%.*]], 4
+; CHECK-NEXT: [[CMP2:%.*]] = icmp ult i32 [[IDX2:%.*]], 4
+; CHECK-NEXT: call void @llvm.assume(i1 [[CMP1]])
+; CHECK-NEXT: call void @llvm.assume(i1 [[CMP2]])
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds <4 x i32>, ptr [[Q:%.*]], i32 0, i32 [[IDX2]]
+; CHECK-NEXT: store i32 [[S:%.*]], ptr [[TMP0]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds <4 x i32>, ptr [[Q]], i32 0, i32 [[IDX1]]
+; CHECK-NEXT: store i32 [[S]], ptr [[TMP1]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ %cmp1 = icmp ult i32 %idx1, 4
+ %cmp2 = icmp ult i32 %idx2, 4
+ call void @llvm.assume(i1 %cmp1)
+ call void @llvm.assume(i1 %cmp2)
+ %i = load <4 x i32>, ptr %q, align 128
+ %vecins1 = insertelement <4 x i32> %i, i32 %s, i32 %idx1
+ %vecins2 = insertelement <4 x i32> %vecins1, i32 %s, i32 %idx2
+ store <4 x i32> %vecins2, ptr %q, align 128
+ ret void
+}
+
+define void @insert_store_nonconst_align_maximum_8_2(ptr %q, i64 %s, i32 %idx1, i32 %idx2) {
+; CHECK-LABEL: @insert_store_nonconst_align_maximum_8_2(
+; CHECK-NEXT: [[CMP1:%.*]] = icmp ult i32 [[IDX1:%.*]], 2
+; CHECK-NEXT: [[CMP2:%.*]] = icmp ult i32 [[IDX2:%.*]], 2
+; CHECK-NEXT: call void @llvm.assume(i1 [[CMP1]])
+; CHECK-NEXT: call void @llvm.assume(i1 [[CMP2]])
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds <8 x i64>, ptr [[Q:%.*]], i32 0, i32 [[IDX2]]
+; CHECK-NEXT: store i64 [[S:%.*]], ptr [[TMP1]], align 8
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds <8 x i64>, ptr [[Q]], i32 0, i32 [[IDX1]]
+; CHECK-NEXT: store i64 [[S]], ptr [[TMP2]], align 8
+; CHECK-NEXT: ret void
+;
+ %cmp1 = icmp ult i32 %idx1, 2
+ %cmp2 = icmp ult i32 %idx2, 2
+ call void @llvm.assume(i1 %cmp1)
+ call void @llvm.assume(i1 %cmp2)
+ %i = load <8 x i64>, ptr %q, align 8
+ %vecins1 = insertelement <8 x i64> %i, i64 %s, i32 %idx1
+ %vecins2 = insertelement <8 x i64> %vecins1, i64 %s, i32 %idx2
+ store <8 x i64> %vecins2, ptr %q, align 8
+ ret void
+}
+
+define void @insert_store_nonconst_align_maximum_4_2(ptr %q, i64 %s, i32 %idx1, i32 %idx2) {
+; CHECK-LABEL: @insert_store_nonconst_align_maximum_4_2(
+; CHECK-NEXT: [[CMP1:%.*]] = icmp ult i32 [[IDX1:%.*]], 2
+; CHECK-NEXT: [[CMP2:%.*]] = icmp ult i32 [[IDX2:%.*]], 2
+; CHECK-NEXT: call void @llvm.assume(i1 [[CMP1]])
+; CHECK-NEXT: call void @llvm.assume(i1 [[CMP2]])
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds <8 x i64>, ptr [[Q:%.*]], i32 0, i32 [[IDX2]]
+; CHECK-NEXT: store i64 [[S:%.*]], ptr [[TMP1]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds <8 x i64>, ptr [[Q]], i32 0, i32 [[IDX1]]
+; CHECK-NEXT: store i64 [[S]], ptr [[TMP2]], align 4
+; CHECK-NEXT: ret void
+;
+ %cmp1 = icmp ult i32 %idx1, 2
+ %cmp2 = icmp ult i32 %idx2, 2
+ call void @llvm.assume(i1 %cmp1)
+ call void @llvm.assume(i1 %cmp2)
+ %i = load <8 x i64>, ptr %q, align 4
+ %vecins1 = insertelement <8 x i64> %i, i64 %s, i32 %idx1
+ %vecins2 = insertelement <8 x i64> %vecins1, i64 %s, i32 %idx2
+ store <8 x i64> %vecins2, ptr %q, align 4
+ ret void
+}
+
+define void @insert_store_nonconst_align_larger_2(ptr %q, i64 %s, i32 %idx1, i32 %idx2) {
+; CHECK-LABEL: @insert_store_nonconst_align_larger_2(
+; CHECK-NEXT: [[CMP1:%.*]] = icmp ult i32 [[IDX1:%.*]], 2
+; CHECK-NEXT: [[CMP2:%.*]] = icmp ult i32 [[IDX2:%.*]], 2
+; CHECK-NEXT: call void @llvm.assume(i1 [[CMP1]])
+; CHECK-NEXT: call void @llvm.assume(i1 [[CMP2]])
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds <8 x i64>, ptr [[Q:%.*]], i32 0, i32 [[IDX2]]
+; CHECK-NEXT: store...
[truncated]
@RKSimon I’ve implemented the items that were registered as TODO in VectorCombine. Could someone review this PR? Thanks!
https://llvm.org/docs/DeveloperPolicy.html#obtaining-commit-access
@RKSimon Thanks to you, I'm now a committer. Can I add you as a reviewer?
Previously, we supported simplifying load-insertelt-store to getelementptr-store
when only one insertelt exists.
This patch supports multiple insertelements.
Proof: https://alive2.llvm.org/ce/z/QTspTf
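For the variable-index case, the following sketch is assembled from the insert_store_nonconst_large_alignment2 test above (not part of the patch; %gep1/%gep2 are hypothetical names): once llvm.assume proves each index in bounds, the same fold applies to multiple non-constant inserts, while unguarded variable indices (as in insert_store_nonconst4) are left untouched.

; before: both indices are proven < 4 by llvm.assume
  %cmp1 = icmp ult i32 %idx1, 4
  %cmp2 = icmp ult i32 %idx2, 4
  call void @llvm.assume(i1 %cmp1)
  call void @llvm.assume(i1 %cmp2)
  %i = load <4 x i32>, ptr %q, align 128
  %vecins1 = insertelement <4 x i32> %i, i32 %s, i32 %idx1
  %vecins2 = insertelement <4 x i32> %vecins1, i32 %s, i32 %idx2
  store <4 x i32> %vecins2, ptr %q, align 128
; after: two scalar stores through inbounds GEPs (order matches the test's CHECK lines)
  %gep2 = getelementptr inbounds <4 x i32>, ptr %q, i32 0, i32 %idx2
  store i32 %s, ptr %gep2, align 4
  %gep1 = getelementptr inbounds <4 x i32>, ptr %q, i32 0, i32 %idx1
  store i32 %s, ptr %gep1, align 4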