[LV] Vectorize cases with larger number of RT checks, execute only if profitable.
This patch replaces the tight hard cut-off for the number of runtime checks with a more accurate cost-driven approach. The new approach allows vectorization with a larger number of runtime checks in general, but only executes the vector loop (and runtime checks) if considered profitable at runtime. Profitable here means that the cost-model indicates that the runtime check cost + vector loop cost < scalar loop cost. To do that, LV computes the minimum trip count for which runtime check cost + vector-loop-cost < scalar loop cost. Note that there is still a hard cut-off to avoid excessive compile-time/code-size increases, but it is much larger than the original limit. The performance impact on standard test-suites like SPEC2000/SPEC2006/MultiSource is mostly neutral, but the new approach can give substantial gains in cases where we failed to vectorize before due to the over-aggressive cut-offs. On AArch64 with -O3, I didn't observe any regressions outside the noise level (<0.4%) and there are the following execution time improvements. Both `IRSmk` and `srad` are relatively short running, but the changes are far above the noise level for them on my benchmark system. 
``` CFP2006/447.dealII/447.dealII -1.9% CINT2017rate/525.x264_r/525.x264_r -2.2% ASC_Sequoia/IRSmk/IRSmk -9.2% Rodinia/srad/srad -36.1% ``` `size` regressions on AArch64 with -O3 are ``` MultiSource/Applications/hbd/hbd 90256.00 106768.00 18.3% MultiSourc...ks/ASCI_Purple/SMG2000/smg2000 240676.00 257268.00 6.9% MultiSourc...enchmarks/mafft/pairlocalalign 472603.00 489131.00 3.5% External/S...2017rate/525.x264_r/525.x264_r 613831.00 630343.00 2.7% External/S...NT2006/464.h264ref/464.h264ref 818920.00 835448.00 2.0% External/S...te/538.imagick_r/538.imagick_r 1994730.00 2027754.00 1.7% MultiSourc...nchmarks/tramp3d-v4/tramp3d-v4 1236471.00 1253015.00 1.3% MultiSource/Applications/oggenc/oggenc 2108147.00 2124675.00 0.8% External/S.../CFP2006/447.dealII/447.dealII 4742999.00 4759559.00 0.3% External/S...rate/510.parest_r/510.parest_r 14206377.00 14239433.00 0.2% ``` Reviewed By: lebedev.ri, ebrevnov, dmgreen Differential Revision: https://reviews.llvm.org/D109368
This commit is contained in:
parent
aa78c5298e
commit
644a965c1e
|
@ -219,16 +219,9 @@ public:
|
|||
ExactFPMathInst = I;
|
||||
}
|
||||
|
||||
void addRuntimePointerChecks(unsigned Num) { NumRuntimePointerChecks = Num; }
|
||||
|
||||
Instruction *getExactFPInst() { return ExactFPMathInst; }
|
||||
|
||||
unsigned getNumRuntimePointerChecks() const {
|
||||
return NumRuntimePointerChecks;
|
||||
}
|
||||
|
||||
private:
|
||||
unsigned NumRuntimePointerChecks = 0;
|
||||
Instruction *ExactFPMathInst = nullptr;
|
||||
};
|
||||
|
||||
|
|
|
@ -993,7 +993,6 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
|
|||
}
|
||||
}
|
||||
|
||||
Requirements->addRuntimePointerChecks(LAI->getNumRuntimePointerChecks());
|
||||
PSE.addPredicate(LAI->getPSE().getPredicate());
|
||||
return true;
|
||||
}
|
||||
|
|
|
@ -33,7 +33,6 @@ class LoopInfo;
|
|||
class LoopVectorizationLegality;
|
||||
class LoopVectorizationCostModel;
|
||||
class PredicatedScalarEvolution;
|
||||
class LoopVectorizationRequirements;
|
||||
class LoopVectorizeHints;
|
||||
class OptimizationRemarkEmitter;
|
||||
class TargetTransformInfo;
|
||||
|
@ -191,6 +190,10 @@ struct VectorizationFactor {
|
|||
/// Cost of the scalar loop.
|
||||
InstructionCost ScalarCost;
|
||||
|
||||
/// The minimum trip count required to make vectorization profitable, e.g. due
|
||||
/// to runtime checks.
|
||||
ElementCount MinProfitableTripCount;
|
||||
|
||||
VectorizationFactor(ElementCount Width, InstructionCost Cost,
|
||||
InstructionCost ScalarCost)
|
||||
: Width(Width), Cost(Cost), ScalarCost(ScalarCost) {}
|
||||
|
@ -268,8 +271,6 @@ class LoopVectorizationPlanner {
|
|||
|
||||
const LoopVectorizeHints &Hints;
|
||||
|
||||
LoopVectorizationRequirements &Requirements;
|
||||
|
||||
OptimizationRemarkEmitter *ORE;
|
||||
|
||||
SmallVector<VPlanPtr, 4> VPlans;
|
||||
|
@ -285,10 +286,9 @@ public:
|
|||
InterleavedAccessInfo &IAI,
|
||||
PredicatedScalarEvolution &PSE,
|
||||
const LoopVectorizeHints &Hints,
|
||||
LoopVectorizationRequirements &Requirements,
|
||||
OptimizationRemarkEmitter *ORE)
|
||||
: OrigLoop(L), LI(LI), TLI(TLI), TTI(TTI), Legal(Legal), CM(CM), IAI(IAI),
|
||||
PSE(PSE), Hints(Hints), Requirements(Requirements), ORE(ORE) {}
|
||||
PSE(PSE), Hints(Hints), ORE(ORE) {}
|
||||
|
||||
/// Plan how to best vectorize, return the best VF and its cost, or None if
|
||||
/// vectorization and interleaving should be avoided up front.
|
||||
|
|
|
@ -196,10 +196,9 @@ static cl::opt<unsigned> TinyTripCountVectorThreshold(
|
|||
"value are vectorized only if no scalar iteration overheads "
|
||||
"are incurred."));
|
||||
|
||||
static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold(
|
||||
"pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
|
||||
cl::desc("The maximum allowed number of runtime memory checks with a "
|
||||
"vectorize(enable) pragma."));
|
||||
static cl::opt<unsigned> VectorizeMemoryCheckThreshold(
|
||||
"vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
|
||||
cl::desc("The maximum allowed number of runtime memory checks"));
|
||||
|
||||
// Option prefer-predicate-over-epilogue indicates that an epilogue is undesired,
|
||||
// that predication is preferred, and this lists all options. I.e., the
|
||||
|
@ -442,6 +441,7 @@ public:
|
|||
const TargetLibraryInfo *TLI,
|
||||
const TargetTransformInfo *TTI, AssumptionCache *AC,
|
||||
OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
|
||||
ElementCount MinProfitableTripCount,
|
||||
unsigned UnrollFactor, LoopVectorizationLegality *LVL,
|
||||
LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
|
||||
ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
|
||||
|
@ -453,6 +453,11 @@ public:
|
|||
// of the original loop header may change as the transformation happens.
|
||||
OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
|
||||
OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
|
||||
|
||||
if (MinProfitableTripCount.isZero())
|
||||
this->MinProfitableTripCount = VecWidth;
|
||||
else
|
||||
this->MinProfitableTripCount = MinProfitableTripCount;
|
||||
}
|
||||
|
||||
virtual ~InnerLoopVectorizer() = default;
|
||||
|
@ -656,6 +661,8 @@ protected:
|
|||
/// vector elements.
|
||||
ElementCount VF;
|
||||
|
||||
ElementCount MinProfitableTripCount;
|
||||
|
||||
/// The vectorization unroll factor to use. Each scalar is vectorized to this
|
||||
/// many different vector instructions.
|
||||
unsigned UF;
|
||||
|
@ -735,6 +742,7 @@ public:
|
|||
LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
|
||||
ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
|
||||
: InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
|
||||
ElementCount::getFixed(1),
|
||||
ElementCount::getFixed(1), UnrollFactor, LVL, CM,
|
||||
BFI, PSI, Check) {}
|
||||
|
||||
|
@ -783,8 +791,8 @@ public:
|
|||
BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
|
||||
GeneratedRTChecks &Checks)
|
||||
: InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
|
||||
EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI,
|
||||
Checks),
|
||||
EPI.MainLoopVF, EPI.MainLoopVF, EPI.MainLoopUF, LVL,
|
||||
CM, BFI, PSI, Checks),
|
||||
EPI(EPI) {}
|
||||
|
||||
// Override this function to handle the more complex control flow around the
|
||||
|
@ -1849,14 +1857,17 @@ class GeneratedRTChecks {
|
|||
|
||||
DominatorTree *DT;
|
||||
LoopInfo *LI;
|
||||
TargetTransformInfo *TTI;
|
||||
|
||||
SCEVExpander SCEVExp;
|
||||
SCEVExpander MemCheckExp;
|
||||
|
||||
bool CostTooHigh = false;
|
||||
|
||||
public:
|
||||
GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
|
||||
const DataLayout &DL)
|
||||
: DT(DT), LI(LI), SCEVExp(SE, DL, "scev.check"),
|
||||
TargetTransformInfo *TTI, const DataLayout &DL)
|
||||
: DT(DT), LI(LI), TTI(TTI), SCEVExp(SE, DL, "scev.check"),
|
||||
MemCheckExp(SE, DL, "scev.check") {}
|
||||
|
||||
/// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
|
||||
|
@ -1867,6 +1878,15 @@ public:
|
|||
void Create(Loop *L, const LoopAccessInfo &LAI,
|
||||
const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC) {
|
||||
|
||||
// Hard cutoff to limit compile-time increase in case a very large number of
|
||||
// runtime checks needs to be generated.
|
||||
// TODO: Skip cutoff if the loop is guaranteed to execute, e.g. due to
|
||||
// profile info.
|
||||
CostTooHigh =
|
||||
LAI.getNumRuntimePointerChecks() > VectorizeMemoryCheckThreshold;
|
||||
if (CostTooHigh)
|
||||
return;
|
||||
|
||||
BasicBlock *LoopHeader = L->getHeader();
|
||||
BasicBlock *Preheader = L->getLoopPreheader();
|
||||
|
||||
|
@ -1938,6 +1958,44 @@ public:
|
|||
}
|
||||
}
|
||||
|
||||
InstructionCost getCost() {
|
||||
if (SCEVCheckBlock || MemCheckBlock)
|
||||
LLVM_DEBUG(dbgs() << "Calculating cost of runtime checks:\n");
|
||||
|
||||
if (CostTooHigh) {
|
||||
InstructionCost Cost;
|
||||
Cost.setInvalid();
|
||||
LLVM_DEBUG(dbgs() << " number of checks exceeded threshold\n");
|
||||
return Cost;
|
||||
}
|
||||
|
||||
InstructionCost RTCheckCost = 0;
|
||||
if (SCEVCheckBlock)
|
||||
for (Instruction &I : *SCEVCheckBlock) {
|
||||
if (SCEVCheckBlock->getTerminator() == &I)
|
||||
continue;
|
||||
InstructionCost C =
|
||||
TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput);
|
||||
LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n");
|
||||
RTCheckCost += C;
|
||||
}
|
||||
if (MemCheckBlock)
|
||||
for (Instruction &I : *MemCheckBlock) {
|
||||
if (MemCheckBlock->getTerminator() == &I)
|
||||
continue;
|
||||
InstructionCost C =
|
||||
TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput);
|
||||
LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n");
|
||||
RTCheckCost += C;
|
||||
}
|
||||
|
||||
if (SCEVCheckBlock || MemCheckBlock)
|
||||
LLVM_DEBUG(dbgs() << "Total cost of runtime checks: " << RTCheckCost
|
||||
<< "\n");
|
||||
|
||||
return RTCheckCost;
|
||||
}
|
||||
|
||||
/// Remove the created SCEV & memory runtime check blocks & instructions, if
|
||||
/// unused.
|
||||
~GeneratedRTChecks() {
|
||||
|
@ -2880,9 +2938,16 @@ void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) {
|
|||
// If tail is to be folded, vector loop takes care of all iterations.
|
||||
Type *CountTy = Count->getType();
|
||||
Value *CheckMinIters = Builder.getFalse();
|
||||
Value *Step = createStepForVF(Builder, CountTy, VF, UF);
|
||||
auto CreateStep = [&]() {
|
||||
// Create step with max(MinProfitableTripCount, UF * VF).
|
||||
if (UF * VF.getKnownMinValue() < MinProfitableTripCount.getKnownMinValue())
|
||||
return createStepForVF(Builder, CountTy, MinProfitableTripCount, 1);
|
||||
return createStepForVF(Builder, CountTy, VF, UF);
|
||||
};
|
||||
|
||||
if (!Cost->foldTailByMasking())
|
||||
CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check");
|
||||
CheckMinIters =
|
||||
Builder.CreateICmp(P, Count, CreateStep(), "min.iters.check");
|
||||
else if (VF.isScalable()) {
|
||||
// vscale is not necessarily a power-of-2, which means we cannot guarantee
|
||||
// an overflow to zero when updating induction variables and so an
|
||||
|
@ -2894,8 +2959,9 @@ void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) {
|
|||
Value *LHS = Builder.CreateSub(MaxUIntTripCount, Count);
|
||||
|
||||
// Don't execute the vector loop if (UMax - n) < (VF * UF).
|
||||
CheckMinIters = Builder.CreateICmp(ICmpInst::ICMP_ULT, LHS, Step);
|
||||
CheckMinIters = Builder.CreateICmp(ICmpInst::ICMP_ULT, LHS, CreateStep());
|
||||
}
|
||||
|
||||
// Create new preheader for vector loop.
|
||||
LoopVectorPreHeader =
|
||||
SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
|
||||
|
@ -2920,7 +2986,6 @@ void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) {
|
|||
}
|
||||
|
||||
BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) {
|
||||
|
||||
BasicBlock *const SCEVCheckBlock =
|
||||
RTChecks.emitSCEVChecks(Bypass, LoopVectorPreHeader, LoopExitBlock);
|
||||
if (!SCEVCheckBlock)
|
||||
|
@ -7363,14 +7428,6 @@ LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
|
|||
return VectorizationFactor::Disabled();
|
||||
}
|
||||
|
||||
bool LoopVectorizationPlanner::requiresTooManyRuntimeChecks() const {
|
||||
unsigned NumRuntimePointerChecks = Requirements.getNumRuntimePointerChecks();
|
||||
return (NumRuntimePointerChecks >
|
||||
VectorizerParams::RuntimeMemoryCheckThreshold &&
|
||||
!Hints.allowReordering()) ||
|
||||
NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold;
|
||||
}
|
||||
|
||||
Optional<VectorizationFactor>
|
||||
LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
|
||||
assert(OrigLoop->isInnermost() && "Inner loop expected.");
|
||||
|
@ -10103,8 +10160,7 @@ static bool processLoopInVPlanNativePath(
|
|||
// Use the planner for outer loop vectorization.
|
||||
// TODO: CM is not used at this point inside the planner. Turn CM into an
|
||||
// optional argument if we don't need it in the future.
|
||||
LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE, Hints,
|
||||
Requirements, ORE);
|
||||
LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE, Hints, ORE);
|
||||
|
||||
// Get user vectorization factor.
|
||||
ElementCount UserVF = Hints.getWidth();
|
||||
|
@ -10123,10 +10179,10 @@ static bool processLoopInVPlanNativePath(
|
|||
VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
|
||||
|
||||
{
|
||||
GeneratedRTChecks Checks(*PSE.getSE(), DT, LI,
|
||||
GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI,
|
||||
F->getParent()->getDataLayout());
|
||||
InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
|
||||
&CM, BFI, PSI, Checks);
|
||||
InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
|
||||
VF.Width, 1, LVL, &CM, BFI, PSI, Checks);
|
||||
LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
|
||||
<< L->getHeader()->getParent()->getName() << "\"\n");
|
||||
LVP.executePlan(VF.Width, 1, BestPlan, LB, DT, false);
|
||||
|
@ -10183,6 +10239,94 @@ static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
|
|||
}
|
||||
}
|
||||
|
||||
static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
|
||||
VectorizationFactor &VF, Loop *L,
|
||||
ScalarEvolution &SE) {
|
||||
InstructionCost CheckCost = Checks.getCost();
|
||||
if (!CheckCost.isValid())
|
||||
return false;
|
||||
|
||||
// When interleaving only scalar and vector cost will be equal, which in turn
|
||||
// would lead to a divide by 0. Fall back to hard threshold.
|
||||
if (VF.Width.isScalar()) {
|
||||
if (CheckCost > VectorizeMemoryCheckThreshold) {
|
||||
LLVM_DEBUG(
|
||||
dbgs()
|
||||
<< "LV: Interleaving only is not profitable due to runtime checks\n");
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// First, compute the minimum iteration count required so that the vector
|
||||
// loop outperforms the scalar loop.
|
||||
// The total cost of the scalar loop is
|
||||
// ScalarC * TC
|
||||
// where
|
||||
// * TC is the actual trip count of the loop.
|
||||
// * ScalarC is the cost of a single scalar iteration.
|
||||
//
|
||||
// The total cost of the vector loop is
|
||||
// RtC + VecC * (TC / VF) + EpiC
|
||||
// where
|
||||
// * RtC is the cost of the generated runtime checks
|
||||
// * VecC is the cost of a single vector iteration.
|
||||
// * TC is the actual trip count of the loop
|
||||
// * VF is the vectorization factor
|
||||
// * EpiCost is the cost of the generated epilogue, including the cost
|
||||
// of the remaining scalar operations.
|
||||
//
|
||||
// Vectorization is profitable once the total vector cost is less than the
|
||||
// total scalar cost:
|
||||
// RtC + VecC * (TC / VF) + EpiC < ScalarC * TC
|
||||
//
|
||||
// Now we can compute the minimum required trip count TC as
|
||||
// (RtC + EpiC) / (ScalarC - (VecC / VF)) < TC
|
||||
//
|
||||
// For now we assume the epilogue cost EpiC = 0 for simplicity. Note that
|
||||
// the computations are performed on doubles, not integers and the result
|
||||
// is rounded up, hence we get an upper estimate of the TC.
|
||||
unsigned IntVF = VF.Width.getKnownMinValue();
|
||||
double ScalarC = *VF.ScalarCost.getValue();
|
||||
double VecCOverVF = double(*VF.Cost.getValue()) / IntVF;
|
||||
double RtC = *CheckCost.getValue();
|
||||
double MinTC1 = RtC / (ScalarC - VecCOverVF);
|
||||
|
||||
// Second, compute a minimum iteration count so that the cost of the
|
||||
// runtime checks is only a fraction of the total scalar loop cost. This
|
||||
// adds a loop-dependent bound on the overhead incurred if the runtime
|
||||
// checks fail. In case the runtime checks fail, the cost is RtC + ScalarC
|
||||
// * TC. To bound the runtime check to be a fraction 1/X of the scalar
|
||||
// cost, compute
|
||||
// RtC < ScalarC * TC * (1 / X) ==> RtC * X / ScalarC < TC
|
||||
double MinTC2 = RtC * 10 / ScalarC;
|
||||
|
||||
// Now pick the larger minimum. If it is not a multiple of VF, choose the
|
||||
// next closest multiple of VF. This should partly compensate for ignoring
|
||||
// the epilogue cost.
|
||||
uint64_t MinTC = std::ceil(std::max(MinTC1, MinTC2));
|
||||
VF.MinProfitableTripCount = ElementCount::getFixed(alignTo(MinTC, IntVF));
|
||||
|
||||
LLVM_DEBUG(
|
||||
dbgs() << "LV: Minimum required TC for runtime checks to be profitable:"
|
||||
<< VF.MinProfitableTripCount << "\n");
|
||||
|
||||
// Skip vectorization if the expected trip count is less than the minimum
|
||||
// required trip count.
|
||||
if (auto ExpectedTC = getSmallBestKnownTC(SE, L)) {
|
||||
if (ElementCount::isKnownLT(ElementCount::getFixed(*ExpectedTC),
|
||||
VF.MinProfitableTripCount)) {
|
||||
LLVM_DEBUG(dbgs() << "LV: Vectorization is not beneficial: expected "
|
||||
"trip count < minimum profitable VF ("
|
||||
<< *ExpectedTC << " < " << VF.MinProfitableTripCount
|
||||
<< ")\n");
|
||||
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
|
||||
: InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
|
||||
!EnableLoopInterleaving),
|
||||
|
@ -10340,8 +10484,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
|
|||
CM.collectElementTypesForWidening();
|
||||
|
||||
// Use the planner for vectorization.
|
||||
LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE, Hints,
|
||||
Requirements, ORE);
|
||||
LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE, Hints, ORE);
|
||||
|
||||
// Get user vectorization factor and interleave count.
|
||||
ElementCount UserVF = Hints.getWidth();
|
||||
|
@ -10353,21 +10496,9 @@ bool LoopVectorizePass::processLoop(Loop *L) {
|
|||
VectorizationFactor VF = VectorizationFactor::Disabled();
|
||||
unsigned IC = 1;
|
||||
|
||||
GeneratedRTChecks Checks(*PSE.getSE(), DT, LI,
|
||||
GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI,
|
||||
F->getParent()->getDataLayout());
|
||||
if (MaybeVF) {
|
||||
if (LVP.requiresTooManyRuntimeChecks()) {
|
||||
ORE->emit([&]() {
|
||||
return OptimizationRemarkAnalysisAliasing(
|
||||
DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(),
|
||||
L->getHeader())
|
||||
<< "loop not vectorized: cannot prove it is safe to reorder "
|
||||
"memory operations";
|
||||
});
|
||||
LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
|
||||
Hints.emitRemarkWithHints();
|
||||
return false;
|
||||
}
|
||||
VF = *MaybeVF;
|
||||
// Select the interleave count.
|
||||
IC = CM.selectInterleaveCount(VF.Width, *VF.Cost.getValue());
|
||||
|
@ -10377,6 +10508,13 @@ bool LoopVectorizePass::processLoop(Loop *L) {
|
|||
// they turn out to not be profitable.
|
||||
if (VF.Width.isVector() || SelectedIC > 1)
|
||||
Checks.Create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC);
|
||||
|
||||
// Check if it is profitable to vectorize with runtime checks.
|
||||
bool ForceVectorization =
|
||||
Hints.getForce() == LoopVectorizeHints::FK_Enabled;
|
||||
if (!ForceVectorization &&
|
||||
!areRuntimeChecksProfitable(Checks, VF, L, *PSE.getSE()))
|
||||
return false;
|
||||
}
|
||||
|
||||
// Identify the diagnostic messages that should be produced.
|
||||
|
@ -10533,8 +10671,9 @@ bool LoopVectorizePass::processLoop(Loop *L) {
|
|||
if (!MainILV.areSafetyChecksAdded())
|
||||
DisableRuntimeUnroll = true;
|
||||
} else {
|
||||
InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
|
||||
&LVL, &CM, BFI, PSI, Checks);
|
||||
InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
|
||||
VF.MinProfitableTripCount, IC, &LVL, &CM, BFI,
|
||||
PSI, Checks);
|
||||
|
||||
VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
|
||||
LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false);
|
||||
|
|
|
@ -1,4 +1,5 @@
|
|||
; RUN: opt -loop-vectorize -mtriple=arm64-apple-iphoneos -vectorizer-min-trip-count=8 -S %s | FileCheck %s
|
||||
; RUN: opt -loop-vectorize -mtriple=arm64-apple-iphoneos -vectorizer-min-trip-count=8 -S %s | FileCheck --check-prefixes=CHECK,DEFAULT %s
|
||||
; RUN: opt -loop-vectorize -mtriple=arm64-apple-iphoneos -vectorizer-min-trip-count=8 -vectorize-memory-check-threshold=1 -S %s | FileCheck --check-prefixes=CHECK,THRESHOLD %s
|
||||
|
||||
; Tests for loops with large numbers of runtime checks. Check that loops are
|
||||
; vectorized, if the loop trip counts are large and the impact of the runtime
|
||||
|
@ -57,11 +58,13 @@ exit:
|
|||
ret void
|
||||
}
|
||||
|
||||
; FIXME
|
||||
; The trip count in the loop in this function is high enough to warrant large runtime checks.
|
||||
; CHECK-LABEL: define {{.*}} @test_tc_big_enough
|
||||
; CHECK-NOT: vector.memcheck
|
||||
; CHECK-NOT: vector.body
|
||||
; DEFAULT: vector.memcheck
|
||||
; DEFAULT: vector.body
|
||||
; THRESHOLD-NOT: vector.memcheck
|
||||
; THRESHOLD-NOT: vector.body
|
||||
;
|
||||
define void @test_tc_big_enough(i16* %ptr.1, i16* %ptr.2, i16* %ptr.3, i16* %ptr.4, i64 %off.1, i64 %off.2) {
|
||||
entry:
|
||||
br label %loop
|
||||
|
@ -112,8 +115,11 @@ exit:
|
|||
|
||||
define void @test_tc_unknown(i16* %ptr.1, i16* %ptr.2, i16* %ptr.3, i16* %ptr.4, i64 %off.1, i64 %off.2, i64 %N) {
|
||||
; CHECK-LABEL: define void @test_tc_unknown
|
||||
; CHECK-NOT: vector.memcheck
|
||||
; CHECK-NOT: vector.body
|
||||
; DEFAULT: [[ADD:%.+]] = add i64 %N, 1
|
||||
; DEFAULT-NEXT: [[C:%.+]] = icmp ult i64 [[ADD]], 16
|
||||
; DEFAULT-NEXT: br i1 [[C]], label %scalar.ph, label %vector.memcheck
|
||||
; THRESHOLD-NOT: vector.memcheck
|
||||
; THRESHOLD-NOT: vector.body
|
||||
;
|
||||
entry:
|
||||
br label %loop
|
||||
|
|
|
@ -10,9 +10,9 @@ define void @simple_memset(i32 %val, i32* %ptr, i64 %n) #0 {
|
|||
; CHECK-LABEL: @simple_memset(
|
||||
; CHECK-NEXT: entry:
|
||||
; CHECK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1)
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = sub i64 -1, [[UMAX]]
|
||||
; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
|
||||
; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = sub i64 -1, [[UMAX]]
|
||||
; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP2]], [[TMP1]]
|
||||
; CHECK-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
|
||||
; CHECK: vector.ph:
|
||||
|
|
|
@ -8,9 +8,9 @@ define void @simple_memset(i32 %val, i32* %ptr, i64 %n) #0 {
|
|||
; CHECK-LABEL: @simple_memset(
|
||||
; CHECK-NEXT: entry:
|
||||
; CHECK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1)
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = sub i64 -1, [[UMAX]]
|
||||
; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
|
||||
; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = sub i64 -1, [[UMAX]]
|
||||
; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP2]], [[TMP1]]
|
||||
; CHECK-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
|
||||
; CHECK: vector.ph:
|
||||
|
@ -102,9 +102,9 @@ define void @cond_memset(i32 %val, i32* noalias readonly %cond_ptr, i32* noalias
|
|||
; CHECK-LABEL: @cond_memset(
|
||||
; CHECK-NEXT: entry:
|
||||
; CHECK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1)
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = sub i64 -1, [[UMAX]]
|
||||
; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
|
||||
; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = sub i64 -1, [[UMAX]]
|
||||
; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP2]], [[TMP1]]
|
||||
; CHECK-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
|
||||
; CHECK: vector.ph:
|
||||
|
|
|
@ -10,9 +10,9 @@ define void @simple_memset(i32 %val, i32* %ptr, i64 %n) #0 {
|
|||
; CHECK-LABEL: @simple_memset(
|
||||
; CHECK-NEXT: entry:
|
||||
; CHECK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1)
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = sub i64 -1, [[UMAX]]
|
||||
; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
|
||||
; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = sub i64 -1, [[UMAX]]
|
||||
; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP2]], [[TMP1]]
|
||||
; CHECK-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
|
||||
; CHECK: vector.ph:
|
||||
|
@ -63,9 +63,9 @@ define void @simple_memcpy(i32* noalias %dst, i32* noalias %src, i64 %n) #0 {
|
|||
; CHECK-LABEL: @simple_memcpy(
|
||||
; CHECK-NEXT: entry:
|
||||
; CHECK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1)
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = sub i64 -1, [[UMAX]]
|
||||
; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
|
||||
; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = sub i64 -1, [[UMAX]]
|
||||
; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP2]], [[TMP1]]
|
||||
; CHECK-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
|
||||
; CHECK: vector.ph:
|
||||
|
@ -123,9 +123,9 @@ define void @copy_stride4(i32* noalias %dst, i32* noalias %src, i64 %n) #0 {
|
|||
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[UMAX]], -1
|
||||
; CHECK-NEXT: [[TMP1:%.*]] = lshr i64 [[TMP0]], 2
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
|
||||
; CHECK-NEXT: [[TMP5:%.*]] = sub i64 -1, [[TMP2]]
|
||||
; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
|
||||
; CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 4
|
||||
; CHECK-NEXT: [[TMP5:%.*]] = sub i64 -1, [[TMP2]]
|
||||
; CHECK-NEXT: [[TMP6:%.*]] = icmp ult i64 [[TMP5]], [[TMP4]]
|
||||
; CHECK-NEXT: br i1 [[TMP6]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
|
||||
; CHECK: vector.ph:
|
||||
|
@ -193,9 +193,9 @@ define void @simple_gather_scatter(i32* noalias %dst, i32* noalias %src, i32* no
|
|||
; CHECK-LABEL: @simple_gather_scatter(
|
||||
; CHECK-NEXT: entry:
|
||||
; CHECK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1)
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = sub i64 -1, [[UMAX]]
|
||||
; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
|
||||
; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = sub i64 -1, [[UMAX]]
|
||||
; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP2]], [[TMP1]]
|
||||
; CHECK-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
|
||||
; CHECK: vector.ph:
|
||||
|
@ -253,9 +253,9 @@ while.end.loopexit: ; preds = %while.body
|
|||
define void @uniform_load(i32* noalias %dst, i32* noalias readonly %src, i64 %n) #0 {
|
||||
; CHECK-LABEL: @uniform_load(
|
||||
; CHECK-NEXT: entry:
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = sub i64 -1, [[N:%.*]]
|
||||
; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
|
||||
; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = sub i64 -1, [[N:%.*]]
|
||||
; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP2]], [[TMP1]]
|
||||
; CHECK-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
|
||||
; CHECK: vector.ph:
|
||||
|
@ -312,9 +312,9 @@ for.end: ; preds = %for.body, %entry
|
|||
define void @cond_uniform_load(i32* noalias %dst, i32* noalias readonly %src, i32* noalias readonly %cond, i64 %n) #0 {
|
||||
; CHECK-LABEL: @cond_uniform_load(
|
||||
; CHECK-NEXT: entry:
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = sub i64 -1, [[N:%.*]]
|
||||
; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
|
||||
; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = sub i64 -1, [[N:%.*]]
|
||||
; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP2]], [[TMP1]]
|
||||
; CHECK-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
|
||||
; CHECK: vector.ph:
|
||||
|
@ -389,9 +389,9 @@ for.end: ; preds = %for.inc, %entry
|
|||
define void @uniform_store(i32* noalias %dst, i32* noalias readonly %src, i64 %n) #0 {
|
||||
; CHECK-LABEL: @uniform_store(
|
||||
; CHECK-NEXT: entry:
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = sub i64 -1, [[N:%.*]]
|
||||
; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
|
||||
; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = sub i64 -1, [[N:%.*]]
|
||||
; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP2]], [[TMP1]]
|
||||
; CHECK-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
|
||||
; CHECK: vector.ph:
|
||||
|
@ -445,9 +445,9 @@ define void @simple_fdiv(float* noalias %dst, float* noalias %src, i64 %n) #0 {
|
|||
; CHECK-LABEL: @simple_fdiv(
|
||||
; CHECK-NEXT: entry:
|
||||
; CHECK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1)
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = sub i64 -1, [[UMAX]]
|
||||
; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
|
||||
; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = sub i64 -1, [[UMAX]]
|
||||
; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP2]], [[TMP1]]
|
||||
; CHECK-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
|
||||
; CHECK: vector.ph:
|
||||
|
@ -507,9 +507,9 @@ define i32 @add_reduction_i32(i32* %ptr, i64 %n) #0 {
|
|||
; CHECK-LABEL: @add_reduction_i32(
|
||||
; CHECK-NEXT: entry:
|
||||
; CHECK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1)
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = sub i64 -1, [[UMAX]]
|
||||
; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
|
||||
; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = sub i64 -1, [[UMAX]]
|
||||
; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP2]], [[TMP1]]
|
||||
; CHECK-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
|
||||
; CHECK: vector.ph:
|
||||
|
@ -563,9 +563,9 @@ define float @add_reduction_f32(float* %ptr, i64 %n) #0 {
|
|||
; CHECK-LABEL: @add_reduction_f32(
|
||||
; CHECK-NEXT: entry:
|
||||
; CHECK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1)
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = sub i64 -1, [[UMAX]]
|
||||
; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
|
||||
; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = sub i64 -1, [[UMAX]]
|
||||
; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP2]], [[TMP1]]
|
||||
; CHECK-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
|
||||
; CHECK: vector.ph:
|
||||
|
@ -617,9 +617,9 @@ while.end.loopexit: ; preds = %while.body
|
|||
define i32 @cond_xor_reduction(i32* noalias %a, i32* noalias %cond, i64 %N) #0 {
|
||||
; CHECK-LABEL: @cond_xor_reduction(
|
||||
; CHECK-NEXT: entry:
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = sub i64 -1, [[N:%.*]]
|
||||
; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
|
||||
; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = sub i64 -1, [[N:%.*]]
|
||||
; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP2]], [[TMP1]]
|
||||
; CHECK-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
|
||||
; CHECK: vector.ph:
|
||||
|
|
|
@ -863,7 +863,7 @@ define void @test_gather_not_profitable_pr48429(i32 %d, float* readonly noalias
|
|||
; AVX512-NEXT: [[TMP1:%.*]] = add nsw i64 [[TMP0]], -4
|
||||
; AVX512-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP1]], 2
|
||||
; AVX512-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1
|
||||
; AVX512-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP3]], 16
|
||||
; AVX512-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP3]], 32
|
||||
; AVX512-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
|
||||
; AVX512: vector.memcheck:
|
||||
; AVX512-NEXT: [[TMP4:%.*]] = shl nsw i64 [[IDX_EXT]], 2
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
; REQUIRES: asserts
|
||||
|
||||
; RUN: opt -runtime-memory-check-threshold=9 -passes='loop-vectorize' -mtriple=x86_64-unknown-linux -S -debug %s 2>&1 | FileCheck %s
|
||||
; RUN: opt -passes='loop-vectorize' -mtriple=x86_64-unknown-linux -S -debug %s 2>&1 | FileCheck %s
|
||||
|
||||
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
|
||||
|
||||
|
@ -10,11 +10,63 @@ declare double @llvm.pow.f64(double, double)
|
|||
|
||||
; Test case where the memory runtime checks and vector body is more expensive
|
||||
; than running the scalar loop.
|
||||
; TODO: should not be vectorized.
|
||||
define void @test(double* nocapture %A, double* nocapture %B, double* nocapture %C, double* nocapture %D, double* nocapture %E) {
|
||||
|
||||
; CHECK: Calculating cost of runtime checks:
|
||||
; CHECK-NEXT: 0 for {{.+}} = getelementptr double, double* %A, i64 16
|
||||
; CHECK-NEXT: 0 for {{.+}} = bitcast double*
|
||||
; CHECK-NEXT: 0 for {{.+}} = getelementptr double, double* %B, i64 16
|
||||
; CHECK-NEXT: 0 for {{.+}} = bitcast double*
|
||||
; CHECK-NEXT: 0 for {{.+}} = getelementptr double, double* %E, i64 16
|
||||
; CHECK-NEXT: 0 for {{.+}} = bitcast double*
|
||||
; CHECK-NEXT: 0 for {{.+}} = getelementptr double, double* %C, i64 16
|
||||
; CHECK-NEXT: 0 for {{.+}} = bitcast double*
|
||||
; CHECK-NEXT: 0 for {{.+}} = getelementptr double, double* %D, i64 16
|
||||
; CHECK-NEXT: 0 for {{.+}} = bitcast double*
|
||||
; CHECK-NEXT: 1 for {{.+}} = icmp ult i8*
|
||||
; CHECK-NEXT: 1 for {{.+}} = icmp ult i8*
|
||||
; CHECK-NEXT: 1 for {{.+}} = and i1
|
||||
; CHECK-NEXT: 1 for {{.+}} = icmp ult i8*
|
||||
; CHECK-NEXT: 1 for {{.+}} = icmp ult i8*
|
||||
; CHECK-NEXT: 1 for {{.+}} = and i1
|
||||
; CHECK-NEXT: 1 for {{.+}} = or i1
|
||||
; CHECK-NEXT: 1 for {{.+}} = icmp ult i8*
|
||||
; CHECK-NEXT: 1 for {{.+}} = icmp ult i8*
|
||||
; CHECK-NEXT: 1 for {{.+}} = and i1
|
||||
; CHECK-NEXT: 1 for {{.+}} = or i1
|
||||
; CHECK-NEXT: 1 for {{.+}} = icmp ult i8*
|
||||
; CHECK-NEXT: 1 for {{.+}} = icmp ult i8*
|
||||
; CHECK-NEXT: 1 for {{.+}} = and i1
|
||||
; CHECK-NEXT: 1 for {{.+}} = or i1
|
||||
; CHECK-NEXT: 1 for {{.+}} = icmp ult i8*
|
||||
; CHECK-NEXT: 1 for {{.+}} = icmp ult i8*
|
||||
; CHECK-NEXT: 1 for {{.+}} = and i1
|
||||
; CHECK-NEXT: 1 for {{.+}} = or i1
|
||||
; CHECK-NEXT: 1 for {{.+}} = icmp ult i8*
|
||||
; CHECK-NEXT: 1 for {{.+}} = icmp ult i8*
|
||||
; CHECK-NEXT: 1 for {{.+}} = and i1
|
||||
; CHECK-NEXT: 1 for {{.+}} = or i1
|
||||
; CHECK-NEXT: 1 for {{.+}} = icmp ult i8*
|
||||
; CHECK-NEXT: 1 for {{.+}} = icmp ult i8*
|
||||
; CHECK-NEXT: 1 for {{.+}} = and i1
|
||||
; CHECK-NEXT: 1 for {{.+}} = or i1
|
||||
; CHECK-NEXT: 1 for {{.+}} = icmp ult i8*
|
||||
; CHECK-NEXT: 1 for {{.+}} = icmp ult i8*
|
||||
; CHECK-NEXT: 1 for {{.+}} = and i1
|
||||
; CHECK-NEXT: 1 for {{.+}} = or i1
|
||||
; CHECK-NEXT: 1 for {{.+}} = icmp ult i8*
|
||||
; CHECK-NEXT: 1 for {{.+}} = icmp ult i8*
|
||||
; CHECK-NEXT: 1 for {{.+}} = and i1
|
||||
; CHECK-NEXT: 1 for {{.+}} = or i1
|
||||
; CHECK-NEXT: Total cost of runtime checks: 35
|
||||
|
||||
; CHECK: LV: Vectorization is not beneficial: expected trip count < minimum profitable VF (16 < 70)
|
||||
;
|
||||
; CHECK-LABEL: @test(
|
||||
; CHECK: vector.memcheck
|
||||
; CHECK: vector.body
|
||||
; CHECK-NEXT: entry:
|
||||
; CHECK-NEXT: br label %for.body
|
||||
; CHECK-NOT: vector.memcheck
|
||||
; CHECK-NOT: vector.body
|
||||
;
|
||||
entry:
|
||||
br label %for.body
|
||||
|
|
|
@ -15,7 +15,7 @@ define void @foo(i8 addrspace(1)* align 8 dereferenceable_or_null(16), i8 addrsp
|
|||
; CHECK-NEXT: [[DOT12:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[TMP1:%.*]], i64 16
|
||||
; CHECK-NEXT: [[DOT13:%.*]] = bitcast i8 addrspace(1)* [[DOT12]] to i8 addrspace(1)* addrspace(1)*
|
||||
; CHECK-NEXT: [[UMAX2:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP2:%.*]], i64 1)
|
||||
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[UMAX2]], 16
|
||||
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[UMAX2]], 20
|
||||
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
|
||||
; CHECK: vector.memcheck:
|
||||
; CHECK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP2]], i64 1)
|
||||
|
|
|
@ -40,7 +40,7 @@ define i32 @main(i32* %ptr) {
|
|||
; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[TMP4]], 1
|
||||
; CHECK-NEXT: [[UMIN1:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP2]], i32 [[TMP4]])
|
||||
; CHECK-NEXT: [[TMP6:%.*]] = sub i32 [[TMP5]], [[UMIN1]]
|
||||
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP6]], 8
|
||||
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP6]], 32
|
||||
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
|
||||
; CHECK: vector.scevcheck:
|
||||
; CHECK-NEXT: [[TMP7:%.*]] = add i8 [[CONV3]], -1
|
||||
|
|
|
@ -22,7 +22,7 @@ define {} addrspace(10)* @japi1_vect_42283({} addrspace(10)** nocapture readonly
|
|||
; CHECK-NEXT: [[DOTELT1:%.*]] = getelementptr inbounds { {} addrspace(10)*, i64 }, { {} addrspace(10)*, i64 } addrspace(10)* [[TMP6]], i64 0, i32 1
|
||||
; CHECK-NEXT: [[DOTUNPACK2:%.*]] = load i64, i64 addrspace(10)* [[DOTELT1]], align 8, !tbaa [[TBAA8]]
|
||||
; CHECK-NEXT: [[TMP11:%.*]] = add nsw i64 [[TMP2]], 1
|
||||
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP11]], 16
|
||||
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP11]], 28
|
||||
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
|
||||
; CHECK: vector.scevcheck:
|
||||
; CHECK-NEXT: [[MUL:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 16, i64 [[TMP2]])
|
||||
|
|
|
@ -1,5 +1,4 @@
|
|||
; RUN: opt < %s -loop-vectorize -dce -instcombine -pass-remarks=loop-vectorize -pass-remarks-analysis=loop-vectorize -pass-remarks-missed=loop-vectorize -S 2>&1 | FileCheck %s -check-prefix=OVERRIDE
|
||||
; RUN: opt < %s -loop-vectorize -pragma-vectorize-memory-check-threshold=6 -dce -instcombine -pass-remarks=loop-vectorize -pass-remarks-analysis=loop-vectorize -pass-remarks-missed=loop-vectorize -S 2>&1 | FileCheck %s
|
||||
; RUN: opt < %s -loop-vectorize -dce -instcombine -pass-remarks=loop-vectorize -pass-remarks-analysis=loop-vectorize -pass-remarks-missed=loop-vectorize -S 2>&1 | FileCheck %s
|
||||
|
||||
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
|
||||
|
||||
|
@ -8,20 +7,12 @@ target triple = "x86_64-unknown-linux"
|
|||
; First loop produced diagnostic pass remark.
|
||||
;CHECK: remark: {{.*}}:0:0: vectorized loop (vectorization width: 4, interleaved count: 2)
|
||||
; Second loop produces diagnostic analysis remark.
|
||||
;CHECK: remark: {{.*}}:0:0: loop not vectorized: cannot prove it is safe to reorder memory operations
|
||||
|
||||
; First loop produced diagnostic pass remark.
|
||||
;OVERRIDE: remark: {{.*}}:0:0: vectorized loop (vectorization width: 4, interleaved count: 2)
|
||||
; Second loop produces diagnostic pass remark.
|
||||
;OVERRIDE: remark: {{.*}}:0:0: loop not vectorized: cannot prove it is safe to reorder memory operations
|
||||
;CHECK: remark: {{.*}}:0:0: vectorized loop (vectorization width: 4, interleaved count: 1)
|
||||
|
||||
; We are vectorizing with 6 runtime checks.
|
||||
;CHECK-LABEL: func1x6(
|
||||
;CHECK: <4 x i32>
|
||||
;CHECK: ret
|
||||
;OVERRIDE-LABEL: func1x6(
|
||||
;OVERRIDE: <4 x i32>
|
||||
;OVERRIDE: ret
|
||||
define i32 @func1x6(i32* nocapture %out, i32* nocapture %A, i32* nocapture %B, i32* nocapture %C, i32* nocapture %D, i32* nocapture %E, i32* nocapture %F) {
|
||||
entry:
|
||||
br label %for.body
|
||||
|
@ -52,14 +43,10 @@ for.end: ; preds = %for.body
|
|||
ret i32 undef
|
||||
}
|
||||
|
||||
; We are not vectorizing with 12 runtime checks.
|
||||
; We are vectorizing with 12 runtime checks.
|
||||
;CHECK-LABEL: func2x6(
|
||||
;CHECK-NOT: <4 x i32>
|
||||
;CHECK: <4 x i32>
|
||||
;CHECK: ret
|
||||
; We vectorize with 12 checks if a vectorization hint is provided.
|
||||
;OVERRIDE-LABEL: func2x6(
|
||||
;OVERRIDE-NOT: <4 x i32>
|
||||
;OVERRIDE: ret
|
||||
define i32 @func2x6(i32* nocapture %out, i32* nocapture %out2, i32* nocapture %A, i32* nocapture %B, i32* nocapture %C, i32* nocapture %D, i32* nocapture %E, i32* nocapture %F) {
|
||||
entry:
|
||||
br label %for.body
|
||||
|
@ -100,4 +87,3 @@ for.body: ; preds = %for.body, %entry
|
|||
for.end: ; preds = %for.body
|
||||
ret i32 undef
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in a new issue