diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index af94dc01c8c5c..faf8d7923db79 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -4526,6 +4526,12 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF, return false; } +static bool hasReplicatorRegion(VPlan &Plan) { + return any_of(VPBlockUtils::blocksOnly<VPRegionBlock>(vp_depth_first_shallow( + Plan.getVectorLoopRegion()->getEntry())), + [](auto *VPRB) { return VPRB->isReplicator(); }); +} + #ifndef NDEBUG VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() { InstructionCost ExpectedCost = CM.expectedCost(ElementCount::getFixed(1)); @@ -4598,6 +4604,15 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() { continue; } + if (CM.OptForSize && !ForceVectorization && hasReplicatorRegion(*P)) { + LLVM_DEBUG( + dbgs() + << "LV: Not considering vector loop of width " << VF + << " because it would cause replicated blocks to be generated," + << " which isn't allowed when optimizing for size.\n"); + continue; + } + if (isMoreProfitable(Candidate, ChosenFactor, P->hasScalarTail())) ChosenFactor = Candidate; } @@ -7771,6 +7786,14 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() { << " because it will not generate any vector instructions.\n"); continue; } + if (CM.OptForSize && !ForceVectorization && hasReplicatorRegion(*P)) { + LLVM_DEBUG( + dbgs() + << "LV: Not considering vector loop of width " << VF + << " because it would cause replicated blocks to be generated," + << " which isn't allowed when optimizing for size.\n"); + continue; + } InstructionCost Cost = cost(*P, VF); VectorizationFactor CurrentFactor(VF, Cost, ScalarCost); diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll b/llvm/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll index e6d93ea192e56..c18f9f2fae06b 100644 --- 
a/llvm/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll @@ -1,22 +1,22 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; REQUIRES: asserts -; RUN: opt < %s -passes=loop-vectorize -disable-output -debug-only=loop-vectorize 2>&1 | FileCheck %s --check-prefix=COST -; RUN: opt < %s -passes=loop-vectorize,instcombine,simplifycfg -force-vector-width=2 -simplifycfg-require-and-preserve-domtree=1 -S | FileCheck %s +; RUN: opt < %s -passes=loop-vectorize -force-vector-interleave=1 -disable-output -debug-only=loop-vectorize 2>&1 | FileCheck %s --check-prefix=COST +; RUN: opt < %s -passes=loop-vectorize,instcombine,simplifycfg -force-vector-interleave=1 -force-vector-width=2 -simplifycfg-require-and-preserve-domtree=1 -S | FileCheck %s target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" target triple = "aarch64--linux-gnu" ; This test checks that we correctly compute the scalarized operands for a -; user-specified vectorization factor when interleaving is disabled. We use the -; "optsize" attribute to disable all interleaving calculations. A cost of 4 -; for %var4 indicates that we would scalarize it's operand (%var3), giving +; user-specified vectorization factor when interleaving is disabled. We use +; -force-vector-interleave=1 to disable all interleaving calculations. A cost of +; 4 for %var4 indicates that we would scalarize its operand (%var3), giving ; %var4 a lower scalarization overhead. 
; ; COST-LABEL: predicated_udiv_scalarized_operand ; COST: Cost of 5 for VF 2: profitable to scalarize %var4 = udiv i64 %var2, %var3 ; ; -define i64 @predicated_udiv_scalarized_operand(ptr %a, i64 %x) optsize { +define i64 @predicated_udiv_scalarized_operand(ptr %a, i64 %x) { ; CHECK-LABEL: @predicated_udiv_scalarized_operand( ; CHECK-NEXT: entry: ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll index 4435c31e3b189..b96a768bba24d 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll @@ -1472,55 +1472,29 @@ exit: ret void } -define void @redundant_branch_and_tail_folding(ptr %dst, i1 %c) optsize { +define void @redundant_branch_and_tail_folding(ptr %dst, i1 %c) { ; DEFAULT-LABEL: define void @redundant_branch_and_tail_folding( -; DEFAULT-SAME: ptr [[DST:%.*]], i1 [[C:%.*]]) #[[ATTR4:[0-9]+]] { +; DEFAULT-SAME: ptr [[DST:%.*]], i1 [[C:%.*]]) { ; DEFAULT-NEXT: [[ENTRY:.*]]: ; DEFAULT-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; DEFAULT: [[VECTOR_PH]]: ; DEFAULT-NEXT: br label %[[VECTOR_BODY:.*]] ; DEFAULT: [[VECTOR_BODY]]: -; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE6:.*]] ] -; DEFAULT-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE6]] ] -; DEFAULT-NEXT: [[TMP0:%.*]] = icmp ule <4 x i64> [[VEC_IND]], splat (i64 20) -; DEFAULT-NEXT: [[TMP1:%.*]] = add nuw nsw <4 x i64> [[VEC_IND]], splat (i64 1) -; DEFAULT-NEXT: [[TMP2:%.*]] = trunc <4 x i64> [[TMP1]] to <4 x i32> -; DEFAULT-NEXT: [[TMP3:%.*]] = extractelement <4 x i1> [[TMP0]], i32 0 -; DEFAULT-NEXT: br i1 [[TMP3]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] -; DEFAULT: [[PRED_STORE_IF]]: -; 
DEFAULT-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP2]], i32 0 -; DEFAULT-NEXT: store i32 [[TMP4]], ptr [[DST]], align 4 -; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE]] -; DEFAULT: [[PRED_STORE_CONTINUE]]: -; DEFAULT-NEXT: [[TMP5:%.*]] = extractelement <4 x i1> [[TMP0]], i32 1 -; DEFAULT-NEXT: br i1 [[TMP5]], label %[[PRED_STORE_IF1:.*]], label %[[PRED_STORE_CONTINUE2:.*]] -; DEFAULT: [[PRED_STORE_IF1]]: -; DEFAULT-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP2]], i32 1 -; DEFAULT-NEXT: store i32 [[TMP6]], ptr [[DST]], align 4 -; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE2]] -; DEFAULT: [[PRED_STORE_CONTINUE2]]: -; DEFAULT-NEXT: [[TMP7:%.*]] = extractelement <4 x i1> [[TMP0]], i32 2 -; DEFAULT-NEXT: br i1 [[TMP7]], label %[[PRED_STORE_IF3:.*]], label %[[PRED_STORE_CONTINUE4:.*]] -; DEFAULT: [[PRED_STORE_IF3]]: -; DEFAULT-NEXT: [[TMP8:%.*]] = extractelement <4 x i32> [[TMP2]], i32 2 -; DEFAULT-NEXT: store i32 [[TMP8]], ptr [[DST]], align 4 -; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE4]] -; DEFAULT: [[PRED_STORE_CONTINUE4]]: -; DEFAULT-NEXT: [[TMP9:%.*]] = extractelement <4 x i1> [[TMP0]], i32 3 -; DEFAULT-NEXT: br i1 [[TMP9]], label %[[PRED_STORE_IF5:.*]], label %[[PRED_STORE_CONTINUE6]] -; DEFAULT: [[PRED_STORE_IF5]]: -; DEFAULT-NEXT: [[TMP10:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3 -; DEFAULT-NEXT: store i32 [[TMP10]], ptr [[DST]], align 4 -; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE6]] -; DEFAULT: [[PRED_STORE_CONTINUE6]]: -; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; DEFAULT-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) -; DEFAULT-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 24 -; DEFAULT-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]] +; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; DEFAULT-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], 
%[[VECTOR_BODY]] ] +; DEFAULT-NEXT: [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], splat (i64 4) +; DEFAULT-NEXT: [[TMP0:%.*]] = add nuw nsw <4 x i64> [[STEP_ADD]], splat (i64 1) +; DEFAULT-NEXT: [[TMP1:%.*]] = trunc <4 x i64> [[TMP0]] to <4 x i32> +; DEFAULT-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3 +; DEFAULT-NEXT: store i32 [[TMP2]], ptr [[DST]], align 4 +; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; DEFAULT-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD]], splat (i64 4) +; DEFAULT-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16 +; DEFAULT-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]] ; DEFAULT: [[MIDDLE_BLOCK]]: -; DEFAULT-NEXT: br label %[[EXIT:.*]] +; DEFAULT-NEXT: br i1 false, label %[[EXIT:.*]], label %[[SCALAR_PH]] ; DEFAULT: [[SCALAR_PH]]: -; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] +; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] ; DEFAULT-NEXT: br label %[[LOOP_HEADER:.*]] ; DEFAULT: [[LOOP_HEADER]]: ; DEFAULT-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] @@ -1537,7 +1511,7 @@ define void @redundant_branch_and_tail_folding(ptr %dst, i1 %c) optsize { ; DEFAULT-NEXT: ret void ; ; PRED-LABEL: define void @redundant_branch_and_tail_folding( -; PRED-SAME: ptr [[DST:%.*]], i1 [[C:%.*]]) #[[ATTR4:[0-9]+]] { +; PRED-SAME: ptr [[DST:%.*]], i1 [[C:%.*]]) { ; PRED-NEXT: [[ENTRY:.*]]: ; PRED-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; PRED: [[VECTOR_PH]]: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/optsize_minsize.ll b/llvm/test/Transforms/LoopVectorize/AArch64/optsize_minsize.ll index 3239d2c2e9388..5c876b760e943 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/optsize_minsize.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/optsize_minsize.ll @@ -228,7 +228,6 @@ for.cond.cleanup: ; This should 
be vectorized and tail predicated without optsize, as that's ; faster, but not with optsize, as it's much larger. -; FIXME: Currently we avoid tail predication only with minsize define void @tail_predicate_without_optsize(ptr %p, i8 %a, i8 %b, i8 %c, i32 %n) { ; DEFAULT-LABEL: define void @tail_predicate_without_optsize( ; DEFAULT-SAME: ptr [[P:%.*]], i8 [[A:%.*]], i8 [[B:%.*]], i8 [[C:%.*]], i32 [[N:%.*]]) { @@ -428,182 +427,9 @@ define void @tail_predicate_without_optsize(ptr %p, i8 %a, i8 %b, i8 %c, i32 %n) ; OPTSIZE-LABEL: define void @tail_predicate_without_optsize( ; OPTSIZE-SAME: ptr [[P:%.*]], i8 [[A:%.*]], i8 [[B:%.*]], i8 [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { ; OPTSIZE-NEXT: [[ENTRY:.*]]: -; OPTSIZE-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] -; OPTSIZE: [[VECTOR_PH]]: -; OPTSIZE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i8> poison, i8 [[A]], i64 0 -; OPTSIZE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT]], <16 x i8> poison, <16 x i32> zeroinitializer -; OPTSIZE-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <16 x i8> poison, i8 [[B]], i64 0 -; OPTSIZE-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT3]], <16 x i8> poison, <16 x i32> zeroinitializer -; OPTSIZE-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <16 x i8> poison, i8 [[C]], i64 0 -; OPTSIZE-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT5]], <16 x i8> poison, <16 x i32> zeroinitializer -; OPTSIZE-NEXT: br label %[[VECTOR_BODY:.*]] -; OPTSIZE: [[VECTOR_BODY]]: -; OPTSIZE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE36:.*]] ] -; OPTSIZE-NEXT: [[VEC_IND:%.*]] = phi <16 x i8> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE36]] ] -; OPTSIZE-NEXT: [[VEC_IND1:%.*]] = phi <16 x i8> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT2:%.*]], %[[PRED_STORE_CONTINUE36]] ] -; OPTSIZE-NEXT: 
[[TMP72:%.*]] = icmp ule <16 x i8> [[VEC_IND]], splat (i8 14) -; OPTSIZE-NEXT: [[TMP1:%.*]] = mul <16 x i8> [[BROADCAST_SPLAT]], [[VEC_IND1]] -; OPTSIZE-NEXT: [[TMP2:%.*]] = lshr <16 x i8> [[VEC_IND1]], splat (i8 1) -; OPTSIZE-NEXT: [[TMP3:%.*]] = mul <16 x i8> [[TMP2]], [[BROADCAST_SPLAT4]] -; OPTSIZE-NEXT: [[TMP4:%.*]] = add <16 x i8> [[TMP3]], [[TMP1]] -; OPTSIZE-NEXT: [[TMP5:%.*]] = lshr <16 x i8> [[VEC_IND1]], splat (i8 2) -; OPTSIZE-NEXT: [[TMP6:%.*]] = mul <16 x i8> [[TMP5]], [[BROADCAST_SPLAT6]] -; OPTSIZE-NEXT: [[TMP7:%.*]] = add <16 x i8> [[TMP4]], [[TMP6]] -; OPTSIZE-NEXT: [[TMP8:%.*]] = extractelement <16 x i1> [[TMP72]], i32 0 -; OPTSIZE-NEXT: br i1 [[TMP8]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] -; OPTSIZE: [[PRED_STORE_IF]]: -; OPTSIZE-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 0 -; OPTSIZE-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP9]] -; OPTSIZE-NEXT: [[TMP11:%.*]] = extractelement <16 x i8> [[TMP7]], i32 0 -; OPTSIZE-NEXT: store i8 [[TMP11]], ptr [[TMP10]], align 1 -; OPTSIZE-NEXT: br label %[[PRED_STORE_CONTINUE]] -; OPTSIZE: [[PRED_STORE_CONTINUE]]: -; OPTSIZE-NEXT: [[TMP12:%.*]] = extractelement <16 x i1> [[TMP72]], i32 1 -; OPTSIZE-NEXT: br i1 [[TMP12]], label %[[PRED_STORE_IF7:.*]], label %[[PRED_STORE_CONTINUE8:.*]] -; OPTSIZE: [[PRED_STORE_IF7]]: -; OPTSIZE-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 1 -; OPTSIZE-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP13]] -; OPTSIZE-NEXT: [[TMP15:%.*]] = extractelement <16 x i8> [[TMP7]], i32 1 -; OPTSIZE-NEXT: store i8 [[TMP15]], ptr [[TMP14]], align 1 -; OPTSIZE-NEXT: br label %[[PRED_STORE_CONTINUE8]] -; OPTSIZE: [[PRED_STORE_CONTINUE8]]: -; OPTSIZE-NEXT: [[TMP16:%.*]] = extractelement <16 x i1> [[TMP72]], i32 2 -; OPTSIZE-NEXT: br i1 [[TMP16]], label %[[PRED_STORE_IF9:.*]], label %[[PRED_STORE_CONTINUE10:.*]] -; OPTSIZE: [[PRED_STORE_IF9]]: -; OPTSIZE-NEXT: [[TMP17:%.*]] = add i64 [[INDEX]], 2 -; OPTSIZE-NEXT: [[TMP18:%.*]] = 
getelementptr inbounds i8, ptr [[P]], i64 [[TMP17]] -; OPTSIZE-NEXT: [[TMP19:%.*]] = extractelement <16 x i8> [[TMP7]], i32 2 -; OPTSIZE-NEXT: store i8 [[TMP19]], ptr [[TMP18]], align 1 -; OPTSIZE-NEXT: br label %[[PRED_STORE_CONTINUE10]] -; OPTSIZE: [[PRED_STORE_CONTINUE10]]: -; OPTSIZE-NEXT: [[TMP20:%.*]] = extractelement <16 x i1> [[TMP72]], i32 3 -; OPTSIZE-NEXT: br i1 [[TMP20]], label %[[PRED_STORE_IF11:.*]], label %[[PRED_STORE_CONTINUE12:.*]] -; OPTSIZE: [[PRED_STORE_IF11]]: -; OPTSIZE-NEXT: [[TMP21:%.*]] = add i64 [[INDEX]], 3 -; OPTSIZE-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP21]] -; OPTSIZE-NEXT: [[TMP23:%.*]] = extractelement <16 x i8> [[TMP7]], i32 3 -; OPTSIZE-NEXT: store i8 [[TMP23]], ptr [[TMP22]], align 1 -; OPTSIZE-NEXT: br label %[[PRED_STORE_CONTINUE12]] -; OPTSIZE: [[PRED_STORE_CONTINUE12]]: -; OPTSIZE-NEXT: [[TMP24:%.*]] = extractelement <16 x i1> [[TMP72]], i32 4 -; OPTSIZE-NEXT: br i1 [[TMP24]], label %[[PRED_STORE_IF13:.*]], label %[[PRED_STORE_CONTINUE14:.*]] -; OPTSIZE: [[PRED_STORE_IF13]]: -; OPTSIZE-NEXT: [[TMP25:%.*]] = add i64 [[INDEX]], 4 -; OPTSIZE-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP25]] -; OPTSIZE-NEXT: [[TMP27:%.*]] = extractelement <16 x i8> [[TMP7]], i32 4 -; OPTSIZE-NEXT: store i8 [[TMP27]], ptr [[TMP26]], align 1 -; OPTSIZE-NEXT: br label %[[PRED_STORE_CONTINUE14]] -; OPTSIZE: [[PRED_STORE_CONTINUE14]]: -; OPTSIZE-NEXT: [[TMP28:%.*]] = extractelement <16 x i1> [[TMP72]], i32 5 -; OPTSIZE-NEXT: br i1 [[TMP28]], label %[[PRED_STORE_IF15:.*]], label %[[PRED_STORE_CONTINUE16:.*]] -; OPTSIZE: [[PRED_STORE_IF15]]: -; OPTSIZE-NEXT: [[TMP29:%.*]] = add i64 [[INDEX]], 5 -; OPTSIZE-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP29]] -; OPTSIZE-NEXT: [[TMP31:%.*]] = extractelement <16 x i8> [[TMP7]], i32 5 -; OPTSIZE-NEXT: store i8 [[TMP31]], ptr [[TMP30]], align 1 -; OPTSIZE-NEXT: br label %[[PRED_STORE_CONTINUE16]] -; OPTSIZE: [[PRED_STORE_CONTINUE16]]: 
-; OPTSIZE-NEXT: [[TMP32:%.*]] = extractelement <16 x i1> [[TMP72]], i32 6 -; OPTSIZE-NEXT: br i1 [[TMP32]], label %[[PRED_STORE_IF17:.*]], label %[[PRED_STORE_CONTINUE18:.*]] -; OPTSIZE: [[PRED_STORE_IF17]]: -; OPTSIZE-NEXT: [[TMP33:%.*]] = add i64 [[INDEX]], 6 -; OPTSIZE-NEXT: [[TMP34:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP33]] -; OPTSIZE-NEXT: [[TMP35:%.*]] = extractelement <16 x i8> [[TMP7]], i32 6 -; OPTSIZE-NEXT: store i8 [[TMP35]], ptr [[TMP34]], align 1 -; OPTSIZE-NEXT: br label %[[PRED_STORE_CONTINUE18]] -; OPTSIZE: [[PRED_STORE_CONTINUE18]]: -; OPTSIZE-NEXT: [[TMP36:%.*]] = extractelement <16 x i1> [[TMP72]], i32 7 -; OPTSIZE-NEXT: br i1 [[TMP36]], label %[[PRED_STORE_IF19:.*]], label %[[PRED_STORE_CONTINUE20:.*]] -; OPTSIZE: [[PRED_STORE_IF19]]: -; OPTSIZE-NEXT: [[TMP37:%.*]] = add i64 [[INDEX]], 7 -; OPTSIZE-NEXT: [[TMP38:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP37]] -; OPTSIZE-NEXT: [[TMP39:%.*]] = extractelement <16 x i8> [[TMP7]], i32 7 -; OPTSIZE-NEXT: store i8 [[TMP39]], ptr [[TMP38]], align 1 -; OPTSIZE-NEXT: br label %[[PRED_STORE_CONTINUE20]] -; OPTSIZE: [[PRED_STORE_CONTINUE20]]: -; OPTSIZE-NEXT: [[TMP40:%.*]] = extractelement <16 x i1> [[TMP72]], i32 8 -; OPTSIZE-NEXT: br i1 [[TMP40]], label %[[PRED_STORE_IF21:.*]], label %[[PRED_STORE_CONTINUE22:.*]] -; OPTSIZE: [[PRED_STORE_IF21]]: -; OPTSIZE-NEXT: [[TMP41:%.*]] = add i64 [[INDEX]], 8 -; OPTSIZE-NEXT: [[TMP42:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP41]] -; OPTSIZE-NEXT: [[TMP43:%.*]] = extractelement <16 x i8> [[TMP7]], i32 8 -; OPTSIZE-NEXT: store i8 [[TMP43]], ptr [[TMP42]], align 1 -; OPTSIZE-NEXT: br label %[[PRED_STORE_CONTINUE22]] -; OPTSIZE: [[PRED_STORE_CONTINUE22]]: -; OPTSIZE-NEXT: [[TMP44:%.*]] = extractelement <16 x i1> [[TMP72]], i32 9 -; OPTSIZE-NEXT: br i1 [[TMP44]], label %[[PRED_STORE_IF23:.*]], label %[[PRED_STORE_CONTINUE24:.*]] -; OPTSIZE: [[PRED_STORE_IF23]]: -; OPTSIZE-NEXT: [[TMP45:%.*]] = add i64 [[INDEX]], 9 -; 
OPTSIZE-NEXT: [[TMP46:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP45]] -; OPTSIZE-NEXT: [[TMP47:%.*]] = extractelement <16 x i8> [[TMP7]], i32 9 -; OPTSIZE-NEXT: store i8 [[TMP47]], ptr [[TMP46]], align 1 -; OPTSIZE-NEXT: br label %[[PRED_STORE_CONTINUE24]] -; OPTSIZE: [[PRED_STORE_CONTINUE24]]: -; OPTSIZE-NEXT: [[TMP48:%.*]] = extractelement <16 x i1> [[TMP72]], i32 10 -; OPTSIZE-NEXT: br i1 [[TMP48]], label %[[PRED_STORE_IF25:.*]], label %[[PRED_STORE_CONTINUE26:.*]] -; OPTSIZE: [[PRED_STORE_IF25]]: -; OPTSIZE-NEXT: [[TMP49:%.*]] = add i64 [[INDEX]], 10 -; OPTSIZE-NEXT: [[TMP50:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP49]] -; OPTSIZE-NEXT: [[TMP51:%.*]] = extractelement <16 x i8> [[TMP7]], i32 10 -; OPTSIZE-NEXT: store i8 [[TMP51]], ptr [[TMP50]], align 1 -; OPTSIZE-NEXT: br label %[[PRED_STORE_CONTINUE26]] -; OPTSIZE: [[PRED_STORE_CONTINUE26]]: -; OPTSIZE-NEXT: [[TMP52:%.*]] = extractelement <16 x i1> [[TMP72]], i32 11 -; OPTSIZE-NEXT: br i1 [[TMP52]], label %[[PRED_STORE_IF27:.*]], label %[[PRED_STORE_CONTINUE28:.*]] -; OPTSIZE: [[PRED_STORE_IF27]]: -; OPTSIZE-NEXT: [[TMP53:%.*]] = add i64 [[INDEX]], 11 -; OPTSIZE-NEXT: [[TMP54:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP53]] -; OPTSIZE-NEXT: [[TMP55:%.*]] = extractelement <16 x i8> [[TMP7]], i32 11 -; OPTSIZE-NEXT: store i8 [[TMP55]], ptr [[TMP54]], align 1 -; OPTSIZE-NEXT: br label %[[PRED_STORE_CONTINUE28]] -; OPTSIZE: [[PRED_STORE_CONTINUE28]]: -; OPTSIZE-NEXT: [[TMP56:%.*]] = extractelement <16 x i1> [[TMP72]], i32 12 -; OPTSIZE-NEXT: br i1 [[TMP56]], label %[[PRED_STORE_IF29:.*]], label %[[PRED_STORE_CONTINUE30:.*]] -; OPTSIZE: [[PRED_STORE_IF29]]: -; OPTSIZE-NEXT: [[TMP57:%.*]] = add i64 [[INDEX]], 12 -; OPTSIZE-NEXT: [[TMP58:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP57]] -; OPTSIZE-NEXT: [[TMP59:%.*]] = extractelement <16 x i8> [[TMP7]], i32 12 -; OPTSIZE-NEXT: store i8 [[TMP59]], ptr [[TMP58]], align 1 -; OPTSIZE-NEXT: br label %[[PRED_STORE_CONTINUE30]] 
-; OPTSIZE: [[PRED_STORE_CONTINUE30]]: -; OPTSIZE-NEXT: [[TMP60:%.*]] = extractelement <16 x i1> [[TMP72]], i32 13 -; OPTSIZE-NEXT: br i1 [[TMP60]], label %[[PRED_STORE_IF31:.*]], label %[[PRED_STORE_CONTINUE32:.*]] -; OPTSIZE: [[PRED_STORE_IF31]]: -; OPTSIZE-NEXT: [[TMP61:%.*]] = add i64 [[INDEX]], 13 -; OPTSIZE-NEXT: [[TMP62:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP61]] -; OPTSIZE-NEXT: [[TMP63:%.*]] = extractelement <16 x i8> [[TMP7]], i32 13 -; OPTSIZE-NEXT: store i8 [[TMP63]], ptr [[TMP62]], align 1 -; OPTSIZE-NEXT: br label %[[PRED_STORE_CONTINUE32]] -; OPTSIZE: [[PRED_STORE_CONTINUE32]]: -; OPTSIZE-NEXT: [[TMP64:%.*]] = extractelement <16 x i1> [[TMP72]], i32 14 -; OPTSIZE-NEXT: br i1 [[TMP64]], label %[[PRED_STORE_IF33:.*]], label %[[PRED_STORE_CONTINUE34:.*]] -; OPTSIZE: [[PRED_STORE_IF33]]: -; OPTSIZE-NEXT: [[TMP65:%.*]] = add i64 [[INDEX]], 14 -; OPTSIZE-NEXT: [[TMP66:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP65]] -; OPTSIZE-NEXT: [[TMP67:%.*]] = extractelement <16 x i8> [[TMP7]], i32 14 -; OPTSIZE-NEXT: store i8 [[TMP67]], ptr [[TMP66]], align 1 -; OPTSIZE-NEXT: br label %[[PRED_STORE_CONTINUE34]] -; OPTSIZE: [[PRED_STORE_CONTINUE34]]: -; OPTSIZE-NEXT: [[TMP68:%.*]] = extractelement <16 x i1> [[TMP72]], i32 15 -; OPTSIZE-NEXT: br i1 [[TMP68]], label %[[PRED_STORE_IF35:.*]], label %[[PRED_STORE_CONTINUE36]] -; OPTSIZE: [[PRED_STORE_IF35]]: -; OPTSIZE-NEXT: [[TMP69:%.*]] = add i64 [[INDEX]], 15 -; OPTSIZE-NEXT: [[TMP70:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP69]] -; OPTSIZE-NEXT: [[TMP71:%.*]] = extractelement <16 x i8> [[TMP7]], i32 15 -; OPTSIZE-NEXT: store i8 [[TMP71]], ptr [[TMP70]], align 1 -; OPTSIZE-NEXT: br label %[[PRED_STORE_CONTINUE36]] -; OPTSIZE: [[PRED_STORE_CONTINUE36]]: -; OPTSIZE-NEXT: [[VEC_IND_NEXT]] = add <16 x i8> [[VEC_IND]], splat (i8 16) -; OPTSIZE-NEXT: [[VEC_IND_NEXT2]] = add <16 x i8> [[VEC_IND1]], splat (i8 16) -; OPTSIZE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; OPTSIZE-NEXT: 
br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] -; OPTSIZE: [[MIDDLE_BLOCK]]: -; OPTSIZE-NEXT: br label %[[FOR_COND_CLEANUP:.*]] -; OPTSIZE: [[SCALAR_PH]]: -; OPTSIZE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] ; OPTSIZE-NEXT: br label %[[FOR_BODY:.*]] ; OPTSIZE: [[FOR_BODY]]: -; OPTSIZE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] +; OPTSIZE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] ; OPTSIZE-NEXT: [[TMP0:%.*]] = trunc nuw nsw i64 [[INDVARS_IV]] to i8 ; OPTSIZE-NEXT: [[MUL:%.*]] = mul i8 [[A]], [[TMP0]] ; OPTSIZE-NEXT: [[SHR:%.*]] = lshr i8 [[TMP0]], 1 @@ -616,7 +442,7 @@ define void @tail_predicate_without_optsize(ptr %p, i8 %a, i8 %b, i8 %c, i32 %n) ; OPTSIZE-NEXT: store i8 [[ADD10]], ptr [[ARRAYIDX]], align 1 ; OPTSIZE-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; OPTSIZE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 15 -; OPTSIZE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; OPTSIZE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP:.*]], label %[[FOR_BODY]] ; OPTSIZE: [[FOR_COND_CLEANUP]]: ; OPTSIZE-NEXT: ret void ; @@ -800,7 +626,7 @@ define void @sve_tail_predicate_without_minsize(ptr %p, i8 %a, i8 %b, i8 %c, i32 ; OPTSIZE-NEXT: [[TMP24:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true) ; OPTSIZE-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] ; OPTSIZE-NEXT: [[TMP25:%.*]] = extractelement [[TMP24]], i32 0 -; OPTSIZE-NEXT: br i1 [[TMP25]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; OPTSIZE-NEXT: br i1 [[TMP25]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; OPTSIZE: [[MIDDLE_BLOCK]]: ; OPTSIZE-NEXT: br label %[[FOR_COND_CLEANUP:.*]] ; OPTSIZE: [[SCALAR_PH]]: @@ -820,7 +646,7 @@ define 
void @sve_tail_predicate_without_minsize(ptr %p, i8 %a, i8 %b, i8 %c, i32 ; OPTSIZE-NEXT: store i8 [[ADD10]], ptr [[ARRAYIDX]], align 1 ; OPTSIZE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; OPTSIZE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 15 -; OPTSIZE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; OPTSIZE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; OPTSIZE: [[FOR_COND_CLEANUP]]: ; OPTSIZE-NEXT: ret void ; @@ -1008,7 +834,7 @@ define void @dont_vectorize_with_minsize() { ; OPTSIZE-NEXT: store <8 x i16> [[TMP9]], ptr [[TMP7]], align 2 ; OPTSIZE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; OPTSIZE-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64 -; OPTSIZE-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; OPTSIZE-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; OPTSIZE: [[MIDDLE_BLOCK]]: ; OPTSIZE-NEXT: br i1 true, label %[[FOR_COND_CLEANUP:.*]], label %[[SCALAR_PH]] ; OPTSIZE: [[SCALAR_PH]]: @@ -1028,7 +854,7 @@ define void @dont_vectorize_with_minsize() { ; OPTSIZE-NEXT: store i16 [[ADD]], ptr [[ARRAYIDX4]], align 2 ; OPTSIZE-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; OPTSIZE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 64 -; OPTSIZE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; OPTSIZE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; OPTSIZE: [[FOR_COND_CLEANUP]]: ; OPTSIZE-NEXT: ret void ; @@ -1187,7 +1013,7 @@ define void @vectorization_forced_minsize_reduce_width() { ; OPTSIZE-NEXT: store <8 x i16> [[TMP9]], ptr [[TMP7]], align 2 ; OPTSIZE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; OPTSIZE-NEXT: [[TMP10:%.*]] = icmp eq i64 
[[INDEX_NEXT]], 64 -; OPTSIZE-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; OPTSIZE-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; OPTSIZE: [[MIDDLE_BLOCK]]: ; OPTSIZE-NEXT: br i1 true, label %[[FOR_COND_CLEANUP:.*]], label %[[SCALAR_PH]] ; OPTSIZE: [[SCALAR_PH]]: @@ -1207,7 +1033,7 @@ define void @vectorization_forced_minsize_reduce_width() { ; OPTSIZE-NEXT: store i16 [[ADD]], ptr [[ARRAYIDX4]], align 2 ; OPTSIZE-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; OPTSIZE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 64 -; OPTSIZE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; OPTSIZE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; OPTSIZE: [[FOR_COND_CLEANUP]]: ; OPTSIZE-NEXT: ret void ; @@ -1285,4 +1111,3 @@ attributes #0 = { "target-features"="+sve" } !0 = distinct !{!0, !1} !1 = !{!"llvm.loop.vectorize.enable", i1 true} - diff --git a/llvm/test/Transforms/LoopVectorize/ARM/optsize_minsize.ll b/llvm/test/Transforms/LoopVectorize/ARM/optsize_minsize.ll index 1e91fc9c7c56d..1ce272bcfb783 100644 --- a/llvm/test/Transforms/LoopVectorize/ARM/optsize_minsize.ll +++ b/llvm/test/Transforms/LoopVectorize/ARM/optsize_minsize.ll @@ -224,7 +224,6 @@ for.cond.cleanup: ; This should be vectorized and tail predicated without optsize, as that's ; faster, but not with optsize, as it's much larger. 
-; FIXME: Currently we avoid tail predication only with minsize define void @tail_predicate_without_optsize(ptr %p, i8 %a, i8 %b, i8 %c, i32 %n) { ; DEFAULT-LABEL: define void @tail_predicate_without_optsize( ; DEFAULT-SAME: ptr [[P:%.*]], i8 [[A:%.*]], i8 [[B:%.*]], i8 [[C:%.*]], i32 [[N:%.*]]) { @@ -424,182 +423,9 @@ define void @tail_predicate_without_optsize(ptr %p, i8 %a, i8 %b, i8 %c, i32 %n) ; OPTSIZE-LABEL: define void @tail_predicate_without_optsize( ; OPTSIZE-SAME: ptr [[P:%.*]], i8 [[A:%.*]], i8 [[B:%.*]], i8 [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { ; OPTSIZE-NEXT: [[ENTRY:.*]]: -; OPTSIZE-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] -; OPTSIZE: [[VECTOR_PH]]: -; OPTSIZE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i8> poison, i8 [[A]], i64 0 -; OPTSIZE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT]], <16 x i8> poison, <16 x i32> zeroinitializer -; OPTSIZE-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <16 x i8> poison, i8 [[B]], i64 0 -; OPTSIZE-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT3]], <16 x i8> poison, <16 x i32> zeroinitializer -; OPTSIZE-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <16 x i8> poison, i8 [[C]], i64 0 -; OPTSIZE-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT5]], <16 x i8> poison, <16 x i32> zeroinitializer -; OPTSIZE-NEXT: br label %[[VECTOR_BODY:.*]] -; OPTSIZE: [[VECTOR_BODY]]: -; OPTSIZE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE36:.*]] ] -; OPTSIZE-NEXT: [[VEC_IND:%.*]] = phi <16 x i8> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE36]] ] -; OPTSIZE-NEXT: [[VEC_IND1:%.*]] = phi <16 x i8> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT2:%.*]], %[[PRED_STORE_CONTINUE36]] ] -; OPTSIZE-NEXT: [[TMP72:%.*]] = icmp ule <16 x i8> [[VEC_IND]], splat (i8 14) -; OPTSIZE-NEXT: [[TMP1:%.*]] = mul <16 x i8> 
[[BROADCAST_SPLAT]], [[VEC_IND1]] -; OPTSIZE-NEXT: [[TMP2:%.*]] = lshr <16 x i8> [[VEC_IND1]], splat (i8 1) -; OPTSIZE-NEXT: [[TMP3:%.*]] = mul <16 x i8> [[TMP2]], [[BROADCAST_SPLAT4]] -; OPTSIZE-NEXT: [[TMP4:%.*]] = add <16 x i8> [[TMP3]], [[TMP1]] -; OPTSIZE-NEXT: [[TMP5:%.*]] = lshr <16 x i8> [[VEC_IND1]], splat (i8 2) -; OPTSIZE-NEXT: [[TMP6:%.*]] = mul <16 x i8> [[TMP5]], [[BROADCAST_SPLAT6]] -; OPTSIZE-NEXT: [[TMP7:%.*]] = add <16 x i8> [[TMP4]], [[TMP6]] -; OPTSIZE-NEXT: [[TMP8:%.*]] = extractelement <16 x i1> [[TMP72]], i32 0 -; OPTSIZE-NEXT: br i1 [[TMP8]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] -; OPTSIZE: [[PRED_STORE_IF]]: -; OPTSIZE-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 0 -; OPTSIZE-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP9]] -; OPTSIZE-NEXT: [[TMP11:%.*]] = extractelement <16 x i8> [[TMP7]], i32 0 -; OPTSIZE-NEXT: store i8 [[TMP11]], ptr [[TMP10]], align 1 -; OPTSIZE-NEXT: br label %[[PRED_STORE_CONTINUE]] -; OPTSIZE: [[PRED_STORE_CONTINUE]]: -; OPTSIZE-NEXT: [[TMP12:%.*]] = extractelement <16 x i1> [[TMP72]], i32 1 -; OPTSIZE-NEXT: br i1 [[TMP12]], label %[[PRED_STORE_IF7:.*]], label %[[PRED_STORE_CONTINUE8:.*]] -; OPTSIZE: [[PRED_STORE_IF7]]: -; OPTSIZE-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 1 -; OPTSIZE-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP13]] -; OPTSIZE-NEXT: [[TMP15:%.*]] = extractelement <16 x i8> [[TMP7]], i32 1 -; OPTSIZE-NEXT: store i8 [[TMP15]], ptr [[TMP14]], align 1 -; OPTSIZE-NEXT: br label %[[PRED_STORE_CONTINUE8]] -; OPTSIZE: [[PRED_STORE_CONTINUE8]]: -; OPTSIZE-NEXT: [[TMP16:%.*]] = extractelement <16 x i1> [[TMP72]], i32 2 -; OPTSIZE-NEXT: br i1 [[TMP16]], label %[[PRED_STORE_IF9:.*]], label %[[PRED_STORE_CONTINUE10:.*]] -; OPTSIZE: [[PRED_STORE_IF9]]: -; OPTSIZE-NEXT: [[TMP17:%.*]] = add i64 [[INDEX]], 2 -; OPTSIZE-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP17]] -; OPTSIZE-NEXT: [[TMP19:%.*]] = extractelement <16 x i8> 
[[TMP7]], i32 2 -; OPTSIZE-NEXT: store i8 [[TMP19]], ptr [[TMP18]], align 1 -; OPTSIZE-NEXT: br label %[[PRED_STORE_CONTINUE10]] -; OPTSIZE: [[PRED_STORE_CONTINUE10]]: -; OPTSIZE-NEXT: [[TMP20:%.*]] = extractelement <16 x i1> [[TMP72]], i32 3 -; OPTSIZE-NEXT: br i1 [[TMP20]], label %[[PRED_STORE_IF11:.*]], label %[[PRED_STORE_CONTINUE12:.*]] -; OPTSIZE: [[PRED_STORE_IF11]]: -; OPTSIZE-NEXT: [[TMP21:%.*]] = add i64 [[INDEX]], 3 -; OPTSIZE-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP21]] -; OPTSIZE-NEXT: [[TMP23:%.*]] = extractelement <16 x i8> [[TMP7]], i32 3 -; OPTSIZE-NEXT: store i8 [[TMP23]], ptr [[TMP22]], align 1 -; OPTSIZE-NEXT: br label %[[PRED_STORE_CONTINUE12]] -; OPTSIZE: [[PRED_STORE_CONTINUE12]]: -; OPTSIZE-NEXT: [[TMP24:%.*]] = extractelement <16 x i1> [[TMP72]], i32 4 -; OPTSIZE-NEXT: br i1 [[TMP24]], label %[[PRED_STORE_IF13:.*]], label %[[PRED_STORE_CONTINUE14:.*]] -; OPTSIZE: [[PRED_STORE_IF13]]: -; OPTSIZE-NEXT: [[TMP25:%.*]] = add i64 [[INDEX]], 4 -; OPTSIZE-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP25]] -; OPTSIZE-NEXT: [[TMP27:%.*]] = extractelement <16 x i8> [[TMP7]], i32 4 -; OPTSIZE-NEXT: store i8 [[TMP27]], ptr [[TMP26]], align 1 -; OPTSIZE-NEXT: br label %[[PRED_STORE_CONTINUE14]] -; OPTSIZE: [[PRED_STORE_CONTINUE14]]: -; OPTSIZE-NEXT: [[TMP28:%.*]] = extractelement <16 x i1> [[TMP72]], i32 5 -; OPTSIZE-NEXT: br i1 [[TMP28]], label %[[PRED_STORE_IF15:.*]], label %[[PRED_STORE_CONTINUE16:.*]] -; OPTSIZE: [[PRED_STORE_IF15]]: -; OPTSIZE-NEXT: [[TMP29:%.*]] = add i64 [[INDEX]], 5 -; OPTSIZE-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP29]] -; OPTSIZE-NEXT: [[TMP31:%.*]] = extractelement <16 x i8> [[TMP7]], i32 5 -; OPTSIZE-NEXT: store i8 [[TMP31]], ptr [[TMP30]], align 1 -; OPTSIZE-NEXT: br label %[[PRED_STORE_CONTINUE16]] -; OPTSIZE: [[PRED_STORE_CONTINUE16]]: -; OPTSIZE-NEXT: [[TMP32:%.*]] = extractelement <16 x i1> [[TMP72]], i32 6 -; OPTSIZE-NEXT: br i1 [[TMP32]], 
label %[[PRED_STORE_IF17:.*]], label %[[PRED_STORE_CONTINUE18:.*]] -; OPTSIZE: [[PRED_STORE_IF17]]: -; OPTSIZE-NEXT: [[TMP33:%.*]] = add i64 [[INDEX]], 6 -; OPTSIZE-NEXT: [[TMP34:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP33]] -; OPTSIZE-NEXT: [[TMP35:%.*]] = extractelement <16 x i8> [[TMP7]], i32 6 -; OPTSIZE-NEXT: store i8 [[TMP35]], ptr [[TMP34]], align 1 -; OPTSIZE-NEXT: br label %[[PRED_STORE_CONTINUE18]] -; OPTSIZE: [[PRED_STORE_CONTINUE18]]: -; OPTSIZE-NEXT: [[TMP36:%.*]] = extractelement <16 x i1> [[TMP72]], i32 7 -; OPTSIZE-NEXT: br i1 [[TMP36]], label %[[PRED_STORE_IF19:.*]], label %[[PRED_STORE_CONTINUE20:.*]] -; OPTSIZE: [[PRED_STORE_IF19]]: -; OPTSIZE-NEXT: [[TMP37:%.*]] = add i64 [[INDEX]], 7 -; OPTSIZE-NEXT: [[TMP38:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP37]] -; OPTSIZE-NEXT: [[TMP39:%.*]] = extractelement <16 x i8> [[TMP7]], i32 7 -; OPTSIZE-NEXT: store i8 [[TMP39]], ptr [[TMP38]], align 1 -; OPTSIZE-NEXT: br label %[[PRED_STORE_CONTINUE20]] -; OPTSIZE: [[PRED_STORE_CONTINUE20]]: -; OPTSIZE-NEXT: [[TMP40:%.*]] = extractelement <16 x i1> [[TMP72]], i32 8 -; OPTSIZE-NEXT: br i1 [[TMP40]], label %[[PRED_STORE_IF21:.*]], label %[[PRED_STORE_CONTINUE22:.*]] -; OPTSIZE: [[PRED_STORE_IF21]]: -; OPTSIZE-NEXT: [[TMP41:%.*]] = add i64 [[INDEX]], 8 -; OPTSIZE-NEXT: [[TMP42:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP41]] -; OPTSIZE-NEXT: [[TMP43:%.*]] = extractelement <16 x i8> [[TMP7]], i32 8 -; OPTSIZE-NEXT: store i8 [[TMP43]], ptr [[TMP42]], align 1 -; OPTSIZE-NEXT: br label %[[PRED_STORE_CONTINUE22]] -; OPTSIZE: [[PRED_STORE_CONTINUE22]]: -; OPTSIZE-NEXT: [[TMP44:%.*]] = extractelement <16 x i1> [[TMP72]], i32 9 -; OPTSIZE-NEXT: br i1 [[TMP44]], label %[[PRED_STORE_IF23:.*]], label %[[PRED_STORE_CONTINUE24:.*]] -; OPTSIZE: [[PRED_STORE_IF23]]: -; OPTSIZE-NEXT: [[TMP45:%.*]] = add i64 [[INDEX]], 9 -; OPTSIZE-NEXT: [[TMP46:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP45]] -; OPTSIZE-NEXT: [[TMP47:%.*]] = 
extractelement <16 x i8> [[TMP7]], i32 9 -; OPTSIZE-NEXT: store i8 [[TMP47]], ptr [[TMP46]], align 1 -; OPTSIZE-NEXT: br label %[[PRED_STORE_CONTINUE24]] -; OPTSIZE: [[PRED_STORE_CONTINUE24]]: -; OPTSIZE-NEXT: [[TMP48:%.*]] = extractelement <16 x i1> [[TMP72]], i32 10 -; OPTSIZE-NEXT: br i1 [[TMP48]], label %[[PRED_STORE_IF25:.*]], label %[[PRED_STORE_CONTINUE26:.*]] -; OPTSIZE: [[PRED_STORE_IF25]]: -; OPTSIZE-NEXT: [[TMP49:%.*]] = add i64 [[INDEX]], 10 -; OPTSIZE-NEXT: [[TMP50:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP49]] -; OPTSIZE-NEXT: [[TMP51:%.*]] = extractelement <16 x i8> [[TMP7]], i32 10 -; OPTSIZE-NEXT: store i8 [[TMP51]], ptr [[TMP50]], align 1 -; OPTSIZE-NEXT: br label %[[PRED_STORE_CONTINUE26]] -; OPTSIZE: [[PRED_STORE_CONTINUE26]]: -; OPTSIZE-NEXT: [[TMP52:%.*]] = extractelement <16 x i1> [[TMP72]], i32 11 -; OPTSIZE-NEXT: br i1 [[TMP52]], label %[[PRED_STORE_IF27:.*]], label %[[PRED_STORE_CONTINUE28:.*]] -; OPTSIZE: [[PRED_STORE_IF27]]: -; OPTSIZE-NEXT: [[TMP53:%.*]] = add i64 [[INDEX]], 11 -; OPTSIZE-NEXT: [[TMP54:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP53]] -; OPTSIZE-NEXT: [[TMP55:%.*]] = extractelement <16 x i8> [[TMP7]], i32 11 -; OPTSIZE-NEXT: store i8 [[TMP55]], ptr [[TMP54]], align 1 -; OPTSIZE-NEXT: br label %[[PRED_STORE_CONTINUE28]] -; OPTSIZE: [[PRED_STORE_CONTINUE28]]: -; OPTSIZE-NEXT: [[TMP56:%.*]] = extractelement <16 x i1> [[TMP72]], i32 12 -; OPTSIZE-NEXT: br i1 [[TMP56]], label %[[PRED_STORE_IF29:.*]], label %[[PRED_STORE_CONTINUE30:.*]] -; OPTSIZE: [[PRED_STORE_IF29]]: -; OPTSIZE-NEXT: [[TMP57:%.*]] = add i64 [[INDEX]], 12 -; OPTSIZE-NEXT: [[TMP58:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP57]] -; OPTSIZE-NEXT: [[TMP59:%.*]] = extractelement <16 x i8> [[TMP7]], i32 12 -; OPTSIZE-NEXT: store i8 [[TMP59]], ptr [[TMP58]], align 1 -; OPTSIZE-NEXT: br label %[[PRED_STORE_CONTINUE30]] -; OPTSIZE: [[PRED_STORE_CONTINUE30]]: -; OPTSIZE-NEXT: [[TMP60:%.*]] = extractelement <16 x i1> [[TMP72]], i32 13 
-; OPTSIZE-NEXT: br i1 [[TMP60]], label %[[PRED_STORE_IF31:.*]], label %[[PRED_STORE_CONTINUE32:.*]] -; OPTSIZE: [[PRED_STORE_IF31]]: -; OPTSIZE-NEXT: [[TMP61:%.*]] = add i64 [[INDEX]], 13 -; OPTSIZE-NEXT: [[TMP62:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP61]] -; OPTSIZE-NEXT: [[TMP63:%.*]] = extractelement <16 x i8> [[TMP7]], i32 13 -; OPTSIZE-NEXT: store i8 [[TMP63]], ptr [[TMP62]], align 1 -; OPTSIZE-NEXT: br label %[[PRED_STORE_CONTINUE32]] -; OPTSIZE: [[PRED_STORE_CONTINUE32]]: -; OPTSIZE-NEXT: [[TMP64:%.*]] = extractelement <16 x i1> [[TMP72]], i32 14 -; OPTSIZE-NEXT: br i1 [[TMP64]], label %[[PRED_STORE_IF33:.*]], label %[[PRED_STORE_CONTINUE34:.*]] -; OPTSIZE: [[PRED_STORE_IF33]]: -; OPTSIZE-NEXT: [[TMP65:%.*]] = add i64 [[INDEX]], 14 -; OPTSIZE-NEXT: [[TMP66:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP65]] -; OPTSIZE-NEXT: [[TMP67:%.*]] = extractelement <16 x i8> [[TMP7]], i32 14 -; OPTSIZE-NEXT: store i8 [[TMP67]], ptr [[TMP66]], align 1 -; OPTSIZE-NEXT: br label %[[PRED_STORE_CONTINUE34]] -; OPTSIZE: [[PRED_STORE_CONTINUE34]]: -; OPTSIZE-NEXT: [[TMP68:%.*]] = extractelement <16 x i1> [[TMP72]], i32 15 -; OPTSIZE-NEXT: br i1 [[TMP68]], label %[[PRED_STORE_IF35:.*]], label %[[PRED_STORE_CONTINUE36]] -; OPTSIZE: [[PRED_STORE_IF35]]: -; OPTSIZE-NEXT: [[TMP69:%.*]] = add i64 [[INDEX]], 15 -; OPTSIZE-NEXT: [[TMP70:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP69]] -; OPTSIZE-NEXT: [[TMP71:%.*]] = extractelement <16 x i8> [[TMP7]], i32 15 -; OPTSIZE-NEXT: store i8 [[TMP71]], ptr [[TMP70]], align 1 -; OPTSIZE-NEXT: br label %[[PRED_STORE_CONTINUE36]] -; OPTSIZE: [[PRED_STORE_CONTINUE36]]: -; OPTSIZE-NEXT: [[VEC_IND_NEXT]] = add <16 x i8> [[VEC_IND]], splat (i8 16) -; OPTSIZE-NEXT: [[VEC_IND_NEXT2]] = add <16 x i8> [[VEC_IND1]], splat (i8 16) -; OPTSIZE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; OPTSIZE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] -; OPTSIZE: 
[[MIDDLE_BLOCK]]: -; OPTSIZE-NEXT: br label %[[FOR_COND_CLEANUP:.*]] -; OPTSIZE: [[SCALAR_PH]]: -; OPTSIZE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] ; OPTSIZE-NEXT: br label %[[FOR_BODY:.*]] ; OPTSIZE: [[FOR_BODY]]: -; OPTSIZE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] +; OPTSIZE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] ; OPTSIZE-NEXT: [[TMP0:%.*]] = trunc nuw nsw i64 [[INDVARS_IV]] to i8 ; OPTSIZE-NEXT: [[MUL:%.*]] = mul i8 [[A]], [[TMP0]] ; OPTSIZE-NEXT: [[SHR:%.*]] = lshr i8 [[TMP0]], 1 @@ -612,7 +438,7 @@ define void @tail_predicate_without_optsize(ptr %p, i8 %a, i8 %b, i8 %c, i32 %n) ; OPTSIZE-NEXT: store i8 [[ADD10]], ptr [[ARRAYIDX]], align 1 ; OPTSIZE-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; OPTSIZE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 15 -; OPTSIZE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; OPTSIZE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP:.*]], label %[[FOR_BODY]] ; OPTSIZE: [[FOR_COND_CLEANUP]]: ; OPTSIZE-NEXT: ret void ; @@ -737,7 +563,7 @@ define void @dont_vectorize_with_minsize() { ; OPTSIZE-NEXT: store <4 x i16> [[TMP9]], ptr [[TMP7]], align 2 ; OPTSIZE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP0]], 4 ; OPTSIZE-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64 -; OPTSIZE-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; OPTSIZE-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; OPTSIZE: [[MIDDLE_BLOCK]]: ; OPTSIZE-NEXT: br i1 true, label %[[FOR_COND_CLEANUP:.*]], label %[[SCALAR_PH]] ; OPTSIZE: [[SCALAR_PH]]: @@ -757,7 +583,7 @@ define void @dont_vectorize_with_minsize() { ; OPTSIZE-NEXT: store i16 [[ADD]], ptr [[ARRAYIDX4]], align 2 ; OPTSIZE-NEXT: [[INDVARS_IV_NEXT]] 
= add nuw nsw i64 [[INDVARS_IV]], 1 ; OPTSIZE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 64 -; OPTSIZE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; OPTSIZE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; OPTSIZE: [[FOR_COND_CLEANUP]]: ; OPTSIZE-NEXT: ret void ; @@ -906,7 +732,7 @@ define void @vectorization_forced() { ; OPTSIZE-NEXT: store <4 x i16> [[TMP9]], ptr [[TMP7]], align 2 ; OPTSIZE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP0]], 4 ; OPTSIZE-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64 -; OPTSIZE-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; OPTSIZE-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; OPTSIZE: [[MIDDLE_BLOCK]]: ; OPTSIZE-NEXT: br i1 true, label %[[FOR_COND_CLEANUP:.*]], label %[[SCALAR_PH]] ; OPTSIZE: [[SCALAR_PH]]: @@ -926,7 +752,7 @@ define void @vectorization_forced() { ; OPTSIZE-NEXT: store i16 [[ADD]], ptr [[ARRAYIDX4]], align 2 ; OPTSIZE-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; OPTSIZE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 64 -; OPTSIZE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; OPTSIZE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; OPTSIZE: [[FOR_COND_CLEANUP]]: ; OPTSIZE-NEXT: ret void ; @@ -1002,4 +828,3 @@ for.cond.cleanup: !0 = distinct !{!0, !1} !1 = !{!"llvm.loop.vectorize.enable", i1 true} - diff --git a/llvm/test/Transforms/LoopVectorize/X86/tail_loop_folding.ll b/llvm/test/Transforms/LoopVectorize/X86/tail_loop_folding.ll index c38410091d7bc..5fc9e64147801 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/tail_loop_folding.ll +++ 
b/llvm/test/Transforms/LoopVectorize/X86/tail_loop_folding.ll @@ -220,7 +220,7 @@ for.body: %sum.1 = add nuw nsw i32 %add, %sum.0 %lftr.wideiv = trunc i64 %indvars.iv.next to i32 %exitcond = icmp eq i32 %lftr.wideiv, %N - br i1 %exitcond, label %for.cond.cleanup, label %for.body + br i1 %exitcond, label %for.cond.cleanup, label %for.body, !llvm.loop !6 for.cond.cleanup: ret i32 %sum.1 @@ -233,6 +233,5 @@ attributes #0 = { nounwind optsize uwtable "target-cpu"="core-avx2" "target-feat !7 = !{!"llvm.loop.vectorize.predicate.enable", i1 true} !8 = !{!"llvm.loop.vectorize.enable", i1 true} -!10 = distinct !{!10, !11, !12} +!10 = distinct !{!10, !11} !11 = !{!"llvm.loop.vectorize.predicate.enable", i1 false} -!12 = !{!"llvm.loop.vectorize.enable", i1 true} diff --git a/llvm/test/Transforms/LoopVectorize/pr45679-fold-tail-by-masking.ll b/llvm/test/Transforms/LoopVectorize/pr45679-fold-tail-by-masking.ll index 0b6b789c1dcea..b6aea9c6c27c5 100644 --- a/llvm/test/Transforms/LoopVectorize/pr45679-fold-tail-by-masking.ll +++ b/llvm/test/Transforms/LoopVectorize/pr45679-fold-tail-by-masking.ll @@ -10,7 +10,7 @@ target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" ; -force-vector-interleave, but is a multiple of the internally computed MaxVF; ; e.g., when all types are i32 lead to MaxVF=1. -define void @pr45679(ptr %A) optsize { +define void @pr45679(ptr %A) { ; CHECK-LABEL: @pr45679( ; CHECK-NEXT: entry: ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] diff --git a/llvm/test/Transforms/LoopVectorize/struct-return.ll b/llvm/test/Transforms/LoopVectorize/struct-return.ll index 1b2a809a552d8..57d0aa3787c45 100644 --- a/llvm/test/Transforms/LoopVectorize/struct-return.ll +++ b/llvm/test/Transforms/LoopVectorize/struct-return.ll @@ -211,7 +211,7 @@ exit: ; Test crafted to exercise computePredInstDiscount with struct results ; (mainly it does not crash). 
; CHECK-REMARKS: remark: {{.*}} vectorized loop -define void @scalarized_predicated_struct_return(ptr %a) optsize { +define void @scalarized_predicated_struct_return(ptr %a) { ; CHECK-LABEL: define void @scalarized_predicated_struct_return ; CHECK: vector.body: ; CHECK: pred.store.if: diff --git a/llvm/test/Transforms/LoopVectorize/tail-folding-counting-down.ll b/llvm/test/Transforms/LoopVectorize/tail-folding-counting-down.ll index d2d99827d5f35..40a1eb477a212 100644 --- a/llvm/test/Transforms/LoopVectorize/tail-folding-counting-down.ll +++ b/llvm/test/Transforms/LoopVectorize/tail-folding-counting-down.ll @@ -43,7 +43,7 @@ while.end: ; Make sure a loop is successfully vectorized with fold-tail when the backedge ; taken count is constant and used inside the loop. Issue revealed by D76992. ; -define void @reuse_const_btc(ptr %A) optsize { +define void @reuse_const_btc(ptr %A) { ; CHECK-LABEL: @reuse_const_btc ; CHECK: {{%.*}} = icmp ule <4 x i32> {{%.*}}, splat (i32 13) ; CHECK: {{%.*}} = select <4 x i1> {{%.*}}, <4 x i32> splat (i32 12), <4 x i32> splat (i32 13) diff --git a/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll b/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll index cf9991d68fce6..43172cc2f170a 100644 --- a/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll @@ -1035,32 +1035,32 @@ exit: ; Test case with a dead GEP between the load and store regions. Dead recipes ; need to be removed before merging. 
-define void @merge_with_dead_gep_between_regions(i32 %n, ptr noalias %src, ptr noalias %dst) optsize { +define void @merge_with_dead_gep_between_regions(i32 %n, i32 %k, ptr noalias %src, ptr noalias %dst) { ; CHECK-LABEL: LV: Checking a loop in 'merge_with_dead_gep_between_regions' ; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VF:%.+]]> = VF ; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count -; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count ; CHECK-NEXT: Live-in ir<%n> = original trip-count ; CHECK-EMPTY: ; CHECK-NEXT: ir-bb: ; CHECK-NEXT: Successor(s): vector.ph ; CHECK-EMPTY: ; CHECK-NEXT: vector.ph: +; CHECK-NEXT: vp<[[END:%.+]]> = DERIVED-IV ir<%n> + vp<[[VEC_TC]]> * ir<-1> ; CHECK-NEXT: Successor(s): vector loop ; CHECK-EMPTY: ; CHECK-NEXT: vector loop: { ; CHECK-NEXT: vector.body: ; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION +; CHECK-NEXT: ir<%iv> = WIDEN-INDUCTION ir<%n>, ir<-1>, vp<[[VF]]> ; CHECK-NEXT: vp<[[DERIVED_IV:%.+]]> = DERIVED-IV ir<%n> + vp<[[CAN_IV]]> * ir<-1> -; CHECK-NEXT: EMIT vp<[[WIDE_IV:%.+]]> = WIDEN-CANONICAL-INDUCTION vp<[[CAN_IV]]> -; CHECK-NEXT: EMIT vp<[[MASK:%.+]]> = icmp ule vp<[[WIDE_IV]]>, vp<[[BTC]]> +; CHECK-NEXT: WIDEN ir<%cond> = icmp ult ir<%iv>, ir<%k> ; CHECK-NEXT: Successor(s): pred.store ; CHECK-EMPTY: ; CHECK-NEXT: pred.store: { ; CHECK-NEXT: pred.store.entry: -; CHECK-NEXT: BRANCH-ON-MASK vp<[[MASK]]> +; CHECK-NEXT: BRANCH-ON-MASK ir<%cond> ; CHECK-NEXT: Successor(s): pred.store.if, pred.store.continue ; CHECK-EMPTY: ; CHECK-NEXT: pred.store.if: @@ -1074,19 +1074,31 @@ define void @merge_with_dead_gep_between_regions(i32 %n, ptr noalias %src, ptr n ; CHECK-NEXT: pred.store.continue: ; CHECK-NEXT: No successors ; CHECK-NEXT: } -; CHECK-NEXT: Successor(s): loop.1 +; CHECK-NEXT: Successor(s): loop.then.1 ; CHECK-EMPTY: -; CHECK-NEXT: loop.1: -; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT:%.+]]> = add vp<[[CAN_IV]]>, 
vp<[[VFxUF]]> +; CHECK-NEXT: loop.then.1: +; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT:%.+]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]> ; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VEC_TC]]> ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; CHECK-NEXT: Successor(s): middle.block ; CHECK-EMPTY: ; CHECK-NEXT: middle.block: -; CHECK-NEXT: Successor(s): ir-bb +; CHECK-NEXT: EMIT vp<[[CMP:%.+]]> = icmp eq ir<%n>, vp<[[VEC_TC]]> +; CHECK-NEXT: EMIT branch-on-cond vp<[[CMP]]> +; CHECK-NEXT: Successor(s): ir-bb, scalar.ph +; CHECK-EMPTY: +; CHECK-NEXT: scalar.ph: +; CHECK-NEXT: EMIT vp<[[RESUME:%.+]]> = resume-phi vp<[[END]]>, ir<%n> +; CHECK-NEXT: Successor(s): ir-bb +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: IR %iv = phi i32 [ %n, %entry ], [ %iv.next, %loop.latch ] (extra operand: vp<[[RESUME]]> from scalar.ph) +; CHECK-NEXT: IR %iv.next = add nsw i32 %iv, -1 +; CHECK-NEXT: IR %cond = icmp ult i32 %iv, %k +; CHECK-NEXT: No successors ; CHECK-EMPTY: -; CHECK-NEXT: ir-bb +; CHECK-NEXT: ir-bb: ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; @@ -1094,13 +1106,20 @@ entry: br label %loop loop: - %iv = phi i32[ %n, %entry ], [ %iv.next, %loop ] + %iv = phi i32[ %n, %entry ], [ %iv.next, %loop.latch ] %iv.next = add nsw i32 %iv, -1 + %cond = icmp ult i32 %iv, %k + br i1 %cond, label %loop.then, label %loop.latch + +loop.then: %gep.src = getelementptr inbounds i32, ptr %src, i32 %iv %l = load i32, ptr %gep.src, align 16 %dead_gep = getelementptr inbounds i32, ptr %dst, i64 1 %gep.dst = getelementptr inbounds i32, ptr %dst, i32 %iv store i32 %l, ptr %gep.dst, align 16 + br label %loop.latch + +loop.latch: %ec = icmp eq i32 %iv.next, 0 br i1 %ec, label %exit, label %loop