diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h index ce3ff2ee7aecab20f159ffb99fda4b758951db09..f9d7d4fbbe5033bdd669fab4f96865f80b93dfb0 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -726,6 +726,8 @@ public: bool isProfitableToLoopVersioning() const; + bool isProfitableToDirectPrefetch() const; + bool useAA() const; /// Return true if this type is legal. @@ -1674,6 +1676,7 @@ public: virtual bool isTruncateFree(Type *Ty1, Type *Ty2) = 0; virtual bool isProfitableToHoist(Instruction *I) = 0; virtual bool isProfitableToLoopVersioning() = 0; + virtual bool isProfitableToDirectPrefetch() = 0; virtual bool useAA() = 0; virtual bool isTypeLegal(Type *Ty) = 0; virtual unsigned getRegUsageForType(Type *Ty) = 0; @@ -2144,6 +2147,9 @@ public: bool isProfitableToLoopVersioning() override { return Impl.isProfitableToLoopVersioning(); } + bool isProfitableToDirectPrefetch() override { + return Impl.isProfitableToDirectPrefetch(); + } bool useAA() override { return Impl.useAA(); } bool isTypeLegal(Type *Ty) override { return Impl.isTypeLegal(Ty); } unsigned getRegUsageForType(Type *Ty) override { diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h index 3a99b02bc3637eed73572cb1d6891eef48e87689..80479ade16f4d126e3288d00262df28736096a3b 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -319,6 +319,8 @@ public: bool isProfitableToLoopVersioning() const { return false; } + bool isProfitableToDirectPrefetch() const { return true; } + bool useAA() const { return false; } bool isTypeLegal(Type *Ty) const { return false; } diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h index 0cd6599d42966a1f8197934a7ecaa953cf986341..50c7ecdde77f13ae047ecca75c4bbfbf6fb6acd8 100644 --- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -403,6 +403,8 @@ public: bool isProfitableToLoopVersioning() const { return false; } + bool isProfitableToDirectPrefetch() const { return true; } + bool useAA() const { return getST()->useAA(); } bool isTypeLegal(Type *Ty) { diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp index e26c7ad3dd90bf505a7a85645352ef7568363dde..02da1e1dc65b96ca94c5b474049829a6e792e0bd 100644 --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -490,6 +490,10 @@ bool TargetTransformInfo::isProfitableToLoopVersioning() const { return TTIImpl->isProfitableToLoopVersioning(); } +bool TargetTransformInfo::isProfitableToDirectPrefetch() const { + return TTIImpl->isProfitableToDirectPrefetch(); +} + bool TargetTransformInfo::useAA() const { return TTIImpl->useAA(); } bool TargetTransformInfo::isTypeLegal(Type *Ty) const { diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp index 52268ec9fa3b42f80eca328bce5f580dc7035d3e..ec481250224b27d50695866ffbc4479d904d5ee2 100644 --- a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp +++ b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp @@ -263,22 +263,28 @@ void AArch64Subtarget::initializeProperties() { break; case TSV110: CacheLineSize = 64; + PrefetchDistance = 1024; PrefFunctionLogAlignment = 4; PrefLoopLogAlignment = 2; + MinPrefetchStride = 4; break; case HIP09: CacheLineSize 
= 64; + PrefetchDistance = 1024; PrefFunctionLogAlignment = 4; PrefLoopLogAlignment = 2; VScaleForTuning = 2; DefaultSVETFOpts = TailFoldingOpts::Simple; + MinPrefetchStride = 4; break; case HIP12: CacheLineSize = 64; + PrefetchDistance = 1024; PrefFunctionLogAlignment = 4; PrefLoopLogAlignment = 2; VScaleForTuning = 2; DefaultSVETFOpts = TailFoldingOpts::Simple; + MinPrefetchStride = 4; break; case ThunderX3T110: CacheLineSize = 64; diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.h b/llvm/lib/Target/AArch64/AArch64Subtarget.h index 48a818455d1a7a81ffa80dd014fd34079ad243f1..557044820d0694bf70edbafb390049336d3d0125 100644 --- a/llvm/lib/Target/AArch64/AArch64Subtarget.h +++ b/llvm/lib/Target/AArch64/AArch64Subtarget.h @@ -213,6 +213,16 @@ public: } } + bool isHiSiliconHIPProc() const { + switch (ARMProcFamily) { + case HIP09: + case HIP12: + return true; + default: + return false; + } + } + bool isXRaySupported() const override { return true; } unsigned getMinVectorRegisterBitWidth() const { diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 6f45c03ae977616e0cb9bc5fd3f8ffb2791095ed..74b27770f35c44993d36b12a4f12a4e2c055a2c9 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -395,6 +395,10 @@ bool AArch64TTIImpl::isProfitableToLoopVersioning() const { return ST->isHiSiliconProc() || ForceEnableExperimentalOpt; } +bool AArch64TTIImpl::isProfitableToDirectPrefetch() const { + return !ST->isHiSiliconHIPProc(); +} + InstructionCost AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) { diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h index 0f4a4d7dc80460651365486d2ca8b1bd0651ef81..eae92c7aa5b6c8eacbba6e121db050ad86ab6a66 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -94,6 +94,8 @@ public: bool isProfitableToLoopVersioning() const; + bool isProfitableToDirectPrefetch() const; + /// @} /// \name Vector TTI Implementations diff --git a/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp b/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp index 7c2770979a900fa7e953a712b192144c27bee3ce..bd395becc40acbeacf1f1fa19210d4074e41e2c1 100644 --- a/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp +++ b/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp @@ -14,21 +14,34 @@ #include "llvm/InitializePasses.h" #include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/ADT/SmallSet.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/CodeMetrics.h" +#include "llvm/Analysis/DomTreeUpdater.h" +#include "llvm/Analysis/IVDescriptors.h" #include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/LoopIterator.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/IntrinsicsAArch64.h" #include "llvm/IR/Module.h" +#include "llvm/IR/ReplaceConstant.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" +#include "llvm/Target/TargetMachine.h" #include "llvm/Transforms/Scalar.h" #include 
"llvm/Transforms/Utils.h" +#include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/LoopSimplify.h" +#include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" #define DEBUG_TYPE "loop-data-prefetch" @@ -37,9 +50,9 @@ using namespace llvm; // By default, we limit this to creating 16 PHIs (which is a little over half // of the allocatable register set). -static cl::opt -PrefetchWrites("loop-prefetch-writes", cl::Hidden, cl::init(false), - cl::desc("Prefetch write addresses")); +static cl::opt PrefetchWrites("loop-prefetch-writes", cl::Hidden, + cl::init(false), + cl::desc("Prefetch write addresses")); static cl::opt PrefetchDistance("prefetch-distance", @@ -54,31 +67,136 @@ static cl::opt MaxPrefetchIterationsAhead( "max-prefetch-iters-ahead", cl::desc("Max number of iterations to prefetch ahead"), cl::Hidden); +static cl::opt + IndirectLoadPrefetch("indirect-load-prefetch", cl::Hidden, cl::init(false), + cl::desc("Enable indirect load prefetch")); + +static cl::opt PrefetchIterationsAhead( + "indirect-prefetch-iters-ahead", + cl::desc("Number of iterations for indirect-load prefetch"), cl::Hidden, + cl::init(0)); + +static cl::opt SkipIntermediate( + "indirect-prefetch-skip-intermediate", cl::Hidden, cl::init(false), + cl::desc( + "Skip prefetching intermediate loads while doing indirect prefetch")); + +static cl::opt IndirectionLevel( + "indirect-level", + cl::desc("Indirection level considered for indirect load prefetch"), + cl::Hidden, cl::init(2)); + +static cl::opt RandomAccessPrefetch( + "random-access-prefetch", cl::Hidden, cl::init(false), + cl::desc("Enable random offset indirect load prefetch")); + +static cl::opt + EnableNonFaultyLoad("prefetch-with-nonfaulty-load", cl::Hidden, + cl::init(false), + cl::desc("Prefetch with non-faulty Load instruction.")); + +static cl::opt CachelineSize("prefetch-cache-line-size", + cl::desc("Specify cache line size"), + cl::Hidden, cl::init(64)); + +static cl::opt + OuterLoopPrefetch("outer-loop-prefetch", cl::Hidden, cl::init(false), + cl::desc("Enable prefetch in outer loops")); + +static cl::opt + DisableDirectLoadPrefetch("disable-direct-prefetch", cl::Hidden, + cl::init(false), + cl::desc("Disable direct load prefetch")); + +static cl::opt + PrefetchLoopDepth("prefetch-loop-depth", + cl::desc("Least loop depth to insert prefetch"), + cl::Hidden, cl::init(1)); + STATISTIC(NumPrefetches, "Number of prefetches inserted"); +STATISTIC(NumIndPrefetches, "Number of indirect prefetches inserted"); namespace { +// Helper function to return a type with the same size as +// given step size +static Type *getPtrTypefromPHI(PHINode *PHI, int64_t StepSize) { + Type *Int8Ty = Type::getInt8Ty(PHI->getParent()->getContext()); + return ArrayType::get(Int8Ty, StepSize); +} + /// Loop prefetch implementation class. 
class LoopDataPrefetch { public: - LoopDataPrefetch(AssumptionCache *AC, DominatorTree *DT, LoopInfo *LI, - ScalarEvolution *SE, const TargetTransformInfo *TTI, + LoopDataPrefetch(AliasAnalysis *AA, AssumptionCache *AC, DominatorTree *DT, + LoopInfo *LI, ScalarEvolution *SE, + const TargetTransformInfo *TTI, OptimizationRemarkEmitter *ORE) - : AC(AC), DT(DT), LI(LI), SE(SE), TTI(TTI), ORE(ORE) {} + : AA(AA), AC(AC), DT(DT), LI(LI), SE(SE), TTI(TTI), ORE(ORE) {} bool run(); private: bool runOnLoop(Loop *L); + Value *getCanonicalishSizeVariable(Loop *L, PHINode *PHI) const; + Value * + getLoopIterationNumber(Loop *L, + SmallPtrSet &LoopAuxIndPHINodes, + ValueMap &AuxIndBounds); + /// If prefetch instruction is not inserted, need to clean iteration + /// instructions in the preheader. + void cleanLoopIterationNumber(Value *NumIterations); + /// Returns whether the auxiliary induction variable can generate bound. + /// If it can, add PHI to LoopAuxIndPHINodes + bool canGetAuxIndVarBound(Loop *L, PHINode *PHI, + SmallPtrSet &LoopAuxIndPHINodes); + + /// Generate bound for the auxiliary induction variable at the + /// preheader and add it to AuxIndBounds. + /// Returns whether the bound was successfully generated. + bool getAuxIndVarBound(Loop *L, PHINode *PHI, Value *NumIterations, + ValueMap &AuxIndBounds); + + bool insertPrefetcherForIndirectLoad( + Loop *L, unsigned Idx, Value *NumIterations, + SmallVector &CandidateMemoryLoads, + SmallSetVector &DependentInsts, + ValueMap &AuxIndBounds, + SmallVectorImpl> &Transforms, + unsigned ItersAhead); + + bool findCandidateMemoryLoads( + Instruction *I, SmallSetVector &InstList, + SmallPtrSet &InstSet, + SmallVector &CandidateMemoryLoads, + std::vector> &DependentInstList, + SmallPtrSet LoopAuxIndPHINodes); + + /// Helper function to determine whether the given load is in + /// CandidateMemoryLoads. If yes, add the candidate's depending inst to the + /// list + bool isLoadInCandidateMemoryLoads( + LoadInst *LoadI, SmallSetVector &InstList, + SmallPtrSet &InstSet, + SmallVector &CandidateMemoryLoads, + std::vector> &DependentInstList); + + /// Returns whether the given loop can do indirect prefetch and should be + /// processed to insert prefetches for indirect loads. + bool canDoIndirectPrefetch(Loop *L); + + bool isCrcHashDataAccess(Instruction *I, Instruction *PrefetchingLoad); + bool isIntermediateLoadSupported(Loop *L, LoadInst *&CandidateLoad, + SmallSetVector &InstList); + bool doIndirectPrefetch(Loop *L, unsigned ItersAhead); /// Check if the stride of the accesses is large enough to /// warrant a prefetch. 
bool isStrideLargeEnough(const SCEVAddRecExpr *AR, unsigned TargetMinStride); unsigned getMinPrefetchStride(unsigned NumMemAccesses, unsigned NumStridedMemAccesses, - unsigned NumPrefetches, - bool HasCall) { + unsigned NumPrefetches, bool HasCall) { if (MinPrefetchStride.getNumOccurrences() > 0) return MinPrefetchStride; return TTI->getMinPrefetchStride(NumMemAccesses, NumStridedMemAccesses, @@ -103,6 +221,15 @@ private: return TTI->enableWritePrefetching(); } + bool isSupportsNonFaultyLoad(Module *M) { + if (EnableNonFaultyLoad.getNumOccurrences() > 0) + return EnableNonFaultyLoad; + Triple TargetTriple = Triple(M->getTargetTriple()); + return TTI->supportsScalableVectors() && + TargetTriple.getArch() == Triple::aarch64; + } + + AliasAnalysis *AA; AssumptionCache *AC; DominatorTree *DT; LoopInfo *LI; @@ -120,6 +247,7 @@ public: } void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); AU.addRequired(); AU.addRequired(); AU.addPreserved(); @@ -140,6 +268,7 @@ public: char LoopDataPrefetchLegacyPass::ID = 0; INITIALIZE_PASS_BEGIN(LoopDataPrefetchLegacyPass, "loop-data-prefetch", "Loop Data Prefetch", false, false) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) @@ -169,8 +298,825 @@ bool LoopDataPrefetch::isStrideLargeEnough(const SCEVAddRecExpr *AR, return TargetMinStride <= AbsStride; } +/// Use the induction variable to generate value representing the total num of +/// iterations for the loop in the preheader. +Value *LoopDataPrefetch::getLoopIterationNumber( + Loop *L, SmallPtrSet &LoopAuxIndPHINodes, + ValueMap &AuxIndBounds) { + Value *LoopBoundValue; + Value *LoopStepValue; + Value *LoopStartValue; + Value *LoopPreHeader; + Value *NumIterations; + + // Use induction variable to derive number of iterations for the loop which + // will be used to calculate the upper bound for other auxiliary induction + // variables. + PHINode *PHI = L->getInductionVariable(*SE); + if (PHI == nullptr) + return nullptr; + + auto LoopLB = L->getBounds(*SE); + if (!LoopLB) + return nullptr; + + LoopStartValue = &(LoopLB->getInitialIVValue()); + LoopStepValue = LoopLB->getStepValue(); + LoopBoundValue = &(LoopLB->getFinalIVValue()); + LoopPreHeader = L->getLoopPreheader(); + + if (LoopStartValue == nullptr || LoopStepValue == nullptr || + LoopBoundValue == nullptr || LoopPreHeader == nullptr) + return nullptr; + + // Step should be constant. + if (!isa(SE->getSCEV(LoopStepValue))) + return nullptr; + + // Make sure each of them is invariant so we can use them in the preheader. + if (!L->isLoopInvariant(LoopBoundValue) || + !L->isLoopInvariant(LoopStepValue) || !L->isLoopInvariant(LoopStartValue)) + return nullptr; + + // Generate instruction that calculated the total number of iterations of the + // loop in the preheader. + IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); + Value *Range = Builder.CreateSub(LoopBoundValue, LoopStartValue); + NumIterations = Builder.CreateSDiv(Range, LoopStepValue); + + LoopAuxIndPHINodes.insert(PHI); + Value *Bound = nullptr; + // If the step is positive, the upper bound isn't included, i.e. accessing + // [bound] is not legal, so subtract the bound by LoopStepValue to prevent out + // of bounds memory access. 
+ if (SE->isKnownNegative(SE->getSCEV(LoopStepValue))) + Bound = LoopBoundValue; + else + Bound = Builder.CreateSub(LoopBoundValue, LoopStepValue); + AuxIndBounds.insert(std::pair(PHI, Bound)); + return NumIterations; +} + +/// If prefetch instruction is not inserted. Need to clean iteration instruction +/// in the preheader. +void LoopDataPrefetch::cleanLoopIterationNumber(Value *NumIterations) { + RecursivelyDeleteTriviallyDeadInstructions(NumIterations); +} + +/// Returns whether the auxiliary induction variable can generate bound. +/// If it can generate a bound, add PHI to LoopAuxIndPHINodes +bool LoopDataPrefetch::canGetAuxIndVarBound( + Loop *L, PHINode *PHI, SmallPtrSet &LoopAuxIndPHINodes) { + Value *AuxIndVarStartValue = + PHI->getIncomingValueForBlock(L->getLoopPreheader()); + if (AuxIndVarStartValue == nullptr) + return false; + + const SCEV *LSCEV = SE->getSCEV(PHI); + const SCEVAddRecExpr *LSCEVAddRec = dyn_cast(LSCEV); + + if (LSCEVAddRec == nullptr) + return false; + + // Currently, we only support constant steps. + if (dyn_cast(LSCEVAddRec->getStepRecurrence(*SE))) { + InductionDescriptor IndDesc; + if (!InductionDescriptor::isInductionPHI(PHI, L, SE, IndDesc)) + return false; + + if (IndDesc.getInductionOpcode() != Instruction::Add && + IndDesc.getInductionOpcode() != Instruction::Sub && + IndDesc.getKind() != InductionDescriptor::IK_PtrInduction) + return false; + return true; + } + return false; +} + +/// Generate bound for the auxiliary induction variable at the preheader and add +/// it to AuxIndBounds. Returns whether the bound was successfully generated. +bool LoopDataPrefetch::getAuxIndVarBound( + Loop *L, PHINode *PHI, Value *NumIterations, + ValueMap &AuxIndBounds) { + IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); + Value *AuxIndVarStartValue = + PHI->getIncomingValueForBlock(L->getLoopPreheader()); + if (AuxIndVarStartValue == nullptr) + return false; + + const SCEV *LSCEV = SE->getSCEV(PHI); + const SCEVAddRecExpr *LSCEVAddRec = dyn_cast(LSCEV); + + // Currently, we only support constant steps. + if (const SCEVConstant *ConstPtrDiff = + dyn_cast(LSCEVAddRec->getStepRecurrence(*SE))) { + Value *AuxIndVarBound; + InductionDescriptor IndDesc; + if (!InductionDescriptor::isInductionPHI(PHI, L, SE, IndDesc)) + return false; + + // Calculate the upper bound for the auxiliary induction variable. + Value *CastedNumIterations = + Builder.CreateSExtOrTrunc(NumIterations, ConstPtrDiff->getType()); + + // Subtract one from CastedNumIterations as we want the bound to be in + // bounds. If there are N iterations, the first iteration will access the + // array at offset 0. On the N-th iteration, it will access the array at + // offset N-1, not N. 
+ CastedNumIterations = Builder.CreateSub( + CastedNumIterations, ConstantInt::get(ConstPtrDiff->getType(), 1)); + // The induction operator is add / sub + if (IndDesc.getInductionOpcode() == Instruction::Add || + IndDesc.getInductionOpcode() == Instruction::Sub) { + Value *Range = + Builder.CreateMul(ConstPtrDiff->getValue(), CastedNumIterations); + AuxIndVarBound = Builder.CreateAdd(Range, AuxIndVarStartValue); + } else if (IndDesc.getKind() == InductionDescriptor::IK_PtrInduction) { + // The induction variable is a pointer + int64_t StepSize = ConstPtrDiff->getAPInt().getSExtValue(); + if (SE->isKnownNegative(ConstPtrDiff)) { + StepSize = -StepSize; + CastedNumIterations = Builder.CreateMul( + ConstantInt::getSigned(ConstPtrDiff->getType(), -1), + CastedNumIterations); + } + Type *GEPType = getPtrTypefromPHI(PHI, StepSize); + AuxIndVarBound = Builder.CreateInBoundsGEP(GEPType, AuxIndVarStartValue, + CastedNumIterations); + } else + return false; + + LLVM_DEBUG(dbgs() << "Added " + << (isa(SE->getSCEV(AuxIndVarBound)) + ? "Constant " + : "") + << "AuxIndVarBound " << *AuxIndVarBound + << " for AuxIndVar:" << *PHI << "\n"); + AuxIndBounds.insert(std::pair(PHI, AuxIndVarBound)); + + return true; + } + return false; +} + +// Helper function to calculate the step for a given loop +static uint64_t getStep(PHINode *PN, ScalarEvolution *SE) { + // Get the constant step for the induction phi so we can use it to calculate + // how much we should increase the induction for prefetching. + uint64_t Step = 0; + const SCEV *LSCEV = SE->getSCEV(PN); + const SCEVAddRecExpr *LSCEVAddRec = dyn_cast(LSCEV); + + if (LSCEVAddRec == nullptr) + return Step; + + if (const SCEVConstant *ConstPtrDiff = + dyn_cast(LSCEVAddRec->getStepRecurrence(*SE))) { + Step = ConstPtrDiff->getAPInt().getZExtValue(); + } + return Step; +} + +// Helper function to determine if the loop step is positive +static bool isPositiveStep(PHINode *PN, ScalarEvolution *SE) { + bool PositiveStep = true; + const SCEV *LSCEV = SE->getSCEV(PN); + const SCEVAddRecExpr *LSCEVAddRec = dyn_cast(LSCEV); + if (const SCEVConstant *ConstPtrDiff = + dyn_cast(LSCEVAddRec->getStepRecurrence(*SE))) { + if (SE->isKnownNegative(ConstPtrDiff)) { + PositiveStep = false; + } + } + return PositiveStep; +} + +// Helper function to calculate the step type of a PHI node. If the PHI node is +// not a pointer type, get the type of the PHI node itself. Otherwise, get the +// integer type of the PHI's step/offset value. +static Type *getStepTypeFromPHINode(PHINode *PN, ScalarEvolution *SE) { + // Get the constant step for the induction phi so we can use it to calculate + // how much we should increase the induction for prefetching. + Type *T = PN->getType(); + if (!T->isPointerTy()) + return T; + + const SCEV *LSCEV = SE->getSCEV(PN); + const SCEVAddRecExpr *LSCEVAddRec = dyn_cast(LSCEV); + if (const SCEVConstant *ConstPtrDiff = + dyn_cast(LSCEVAddRec->getStepRecurrence(*SE))) + return ConstPtrDiff->getType(); + + return T; +} + +/// This function will take an instr list that contains indirect loads and +/// transform them into prefetchers. E.g.
Transform the following indirect load +/// A[B[i]]: +/// phi indvar [0] [bound] +/// idxB = gep *B, indvar +/// offsetA = load *idxB +/// idxA = gep *A, offsetA +/// valueA = load *idxA +/// To an indirect load with prefetchers N iterations ahead: +/// phi indvar [0] [bound] +/// offsetN = add indvar, N +/// offset2N = add indvar, 2N +/// compare = icmp offsetN, bound +/// offsetN = select compare, offsetN, bound +/// preIdxN = gep *B, offsetN +/// preIdx2N = gep *B, offset2N +/// call prefetch(preIdx2N) +/// preOffsetA = load preIdxN +/// preIdxA = gep *A, preOffsetA +/// call prefetch(preIdxA) +/// idxB = gep *B, indvar +/// offsetA = load *idxB +/// idxA = gep *A, offsetA +/// valueA = load *idxA +bool LoopDataPrefetch::insertPrefetcherForIndirectLoad( + Loop *L, unsigned Idx, Value *NumIterations, + SmallVector &CandidateMemoryLoads, + SmallSetVector &DependentInsts, + ValueMap &AuxIndBounds, + SmallVectorImpl> &Transforms, + unsigned ItersAhead) { + bool PositiveStep = true; + Instruction *TargetIndirectLoad = CandidateMemoryLoads[Idx]; + IRBuilder<> Builder(TargetIndirectLoad); + Module *M = TargetIndirectLoad->getModule(); + Type *I32Ty = Type::getInt32Ty(TargetIndirectLoad->getParent()->getContext()); + + bool isRandomAccess = false; + bool isCallDependency = false; + for (auto *I : DependentInsts) { + isCallDependency |= isa(I); + if (isCrcHashDataAccess(I, TargetIndirectLoad)) { + isRandomAccess = true; + break; + } + } + // CallInst dependencies are only supported for random access with CRC. + if (!isRandomAccess && (isCallDependency || !canDoIndirectPrefetch(L))) + return false; + + // If indirect load prefetch is not specified, exit for non-random cases. + if (!IndirectLoadPrefetch && !isRandomAccess) + return false; + + LLVM_DEBUG(dbgs() << "Inserting indirect prefetchers for\t" + << *TargetIndirectLoad << "\twith " << DependentInsts.size() + << " dependent instructions\n"); + + // Keep track of the number of prefetches left to process among the + // DependentInst list. We assume that for a given IndirectionLevel N, we will + // have N prefetches to do, unless we are skipping intermediate loads, in + // which case we only do 1 prefetch. + size_t NumPrefetchesLeft = SkipIntermediate ? 1 : IndirectionLevel; + int64_t Step; + while (!DependentInsts.empty()) { + Instruction *DependentInst = DependentInsts.pop_back_val(); + Instruction *Inst = dyn_cast(DependentInst); + + switch (Inst->getOpcode()) { + case Instruction::PHI: { + // Get the constant step for the induction phi so we can use it to + // calculate how much we should increase the induction for prefetching. + PHINode *PN = dyn_cast(Inst); + Step = getStep(PN, SE); + PositiveStep = isPositiveStep(PN, SE); + Type *InstType = getStepTypeFromPHINode(PN, SE); + if (!PositiveStep) + Step = -Step; + + // Make sure the phi node is i64 or i32. + if (!InstType->isIntegerTy(64) && !InstType->isIntegerTy(32)) + return false; + + // Create the bound for this PHI if needed: + if (!AuxIndBounds.count(PN)) + getAuxIndVarBound(L, PN, NumIterations, AuxIndBounds); + + // We create values based on the induction variable so we can use them to + // generate prefetchers later on. The first value (indvar + IterationAhead + // * step) will be used for the load of the prefetched address and it must + // not exceed the bound. The second value (indvar + 2 * IterationAhead + // * step) will be used to generate the prefetcher for the load of the + // address. The subsequent values are generated in a similar fashion to + // generate prefetchers for the offsets of all dependent loads.
+ + // Insert the new instruction after all PHI node. + auto InsertionPoint = Inst; + if (auto FirstNonPHI = Inst->getParent()->getFirstNonPHI()) + InsertionPoint = FirstNonPHI->getPrevNode(); + + for (size_t i = 0; i < NumPrefetchesLeft; i++) { + if (i > 0 && SkipIntermediate) + break; + + if (Transforms.size() < i + 1) { + Transforms.push_back(DenseMap()); + } else if (Transforms[i].count(Inst)) + continue; + + // Create the new operation for the target load + Value *NewOp = nullptr; + if (Inst->getType()->isPointerTy()) { + Type *GEPType = getPtrTypefromPHI(PN, Step); + int64_t Offset = + PrefetchIterationsAhead ? PrefetchIterationsAhead : ItersAhead; + if (!PositiveStep) + Offset = -Offset; + // Do not need to calculate Offset * Step as it is calculated + // implicitly within the GEP instruction + NewOp = Builder.CreateInBoundsGEP( + GEPType, Inst, + ConstantInt::getSigned(InstType, (i + 1) * Offset)); + } else { + // FullStep is the initial offset for the new value, taking into + // account, both Step and the number of iterations ahead to prefetch. + // If indirect prefetch iterations ahead is enabled, we directly use + // the supplied indirect-prefetch-iters-ahead value. + int64_t FullStep = PrefetchIterationsAhead + ? PrefetchIterationsAhead * Step + : ItersAhead * Step; + + Instruction::BinaryOps BiOp = + PositiveStep ? Instruction::Add : Instruction::Sub; + NewOp = Builder.CreateBinOp( + BiOp, Inst, + ConstantInt::get(Inst->getType(), (i + 1) * FullStep)); + } + + if (auto NewOpInstr = dyn_cast(NewOp)) { + NewOpInstr->moveAfter(InsertionPoint); + InsertionPoint = NewOpInstr; + } + + // Create the new operations for the offset loads + if (i > 0 && i == NumPrefetchesLeft - 1) { + Transforms[i].insert(std::pair(Inst, NewOp)); + } else { + Value *NewCmp = Builder.CreateICmp( + PositiveStep ? CmpInst::ICMP_SLT : CmpInst::ICMP_SGT, NewOp, + AuxIndBounds[cast(Inst)]); + Value *NewSelect = + Builder.CreateSelect(NewCmp, NewOp, AuxIndBounds[PN]); + Transforms[i].insert(std::pair(Inst, NewSelect)); + + if (auto NewCmpInstr = dyn_cast(NewCmp)) { + NewCmpInstr->moveAfter(InsertionPoint); + InsertionPoint = NewCmpInstr; + } + + if (auto NewSelectInstr = dyn_cast(NewSelect)) { + NewSelectInstr->moveAfter(InsertionPoint); + InsertionPoint = NewSelectInstr; + } + } + } + break; + } + case Instruction::Load: { + LoadInst *LoadI = dyn_cast(Inst); + Value *LoadPtr = LoadI->getPointerOperand(); + if (!SkipIntermediate) + NumPrefetchesLeft--; + + auto GeneratePrefetcher = [&](llvm::Value *PrefetchPtr) { + Function *PrefetchFunc = Intrinsic::getDeclaration( + M, Intrinsic::prefetch, LoadPtr->getType()); + Value *PrefetchArg[] = {PrefetchPtr, ConstantInt::get(I32Ty, 0), + ConstantInt::get(I32Ty, 3), + ConstantInt::get(I32Ty, 1)}; + CallInst *PrefetchCall = CallInst::Create(PrefetchFunc, PrefetchArg); + return PrefetchCall; + }; + + auto CloneNonFaultyLoad = [&](LoadInst *Load, int PrefetchLevel, + Value *LoadOperand) { + auto DL = Load->getParent()->getModule()->getDataLayout(); + auto ScalableWidth = + TTI->getRegisterBitWidth(TargetTransformInfo::RGK_ScalableVector) + .getKnownMinValue(); + + auto LDType = Load->getType(); + unsigned LDTypeSize = (LDType->isPointerTy()) + ? 
DL.getTypeStoreSizeInBits(LDType) + : LDType->getScalarSizeInBits(); + int ElementCount = ScalableWidth / LDTypeSize; + Type *PredTy = ScalableVectorType::get( + Type::getInt1Ty(Load->getParent()->getContext()), ElementCount); + auto *PTruePat = + ConstantInt::get(I32Ty, 1 /* = AArch64SVEPredPattern::vl1*/); + + auto *PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, + {PredTy}, {PTruePat}); + PTrue->moveAfter(Load); + Type *ScaledLoadTy = ScalableVectorType::get(LDType, ElementCount); + auto *Ldnf1 = Builder.CreateIntrinsic( + Intrinsic::aarch64_sve_ldnf1, {ScaledLoadTy}, {PTrue, LoadOperand}); + Ldnf1->moveAfter(PTrue); + auto *ExtractIdx = ConstantInt::get(I32Ty, 0); + Instruction *Element = dyn_cast( + Builder.CreateExtractElement(Ldnf1, ExtractIdx)); + Element->moveAfter(Ldnf1); + Ldnf1->replaceUsesOfWith(LoadOperand, + Transforms[PrefetchLevel][LoadOperand]); + return Element; + }; + + if (!DependentInsts.empty()) { + // For any intermediate (not last) load, we generate a load for all the + // offset at min(indvar+N*IterationsAhead*step, bound)] for each N up to + // NumPrefetchesLeft - 1, and generate a prefetcher at + // (indvar+(N+1)*IterationAhead*step) for the offset load. + Instruction *PrefetchOffsetLoad = nullptr; + for (size_t i = 0; i < NumPrefetchesLeft; i++) { + if (Transforms[i].count(LoadI)) + continue; + if (isSupportsNonFaultyLoad(M)) + PrefetchOffsetLoad = CloneNonFaultyLoad(LoadI, i, LoadPtr); + else { + PrefetchOffsetLoad = LoadI->clone(); + Builder.Insert(PrefetchOffsetLoad); + PrefetchOffsetLoad->moveAfter(LoadI); + PrefetchOffsetLoad->replaceUsesOfWith(LoadPtr, + Transforms[i][LoadPtr]); + } + Transforms[i].insert( + std::pair(LoadI, PrefetchOffsetLoad)); + } + + if (SkipIntermediate) + break; + + // Create a prefetcher for the offset load. + if (PrefetchOffsetLoad) { + CallInst *PrefetchCall = + GeneratePrefetcher(Transforms[NumPrefetchesLeft][LoadPtr]); + PrefetchCall->insertAfter(PrefetchOffsetLoad); + NumIndPrefetches++; + } + } else { + CallInst *PrefetchCall = GeneratePrefetcher(Transforms[0][LoadPtr]); + PrefetchCall->insertAfter(LoadI); + NumIndPrefetches++; + } + break; + } + default: { + // For other types of instructions, we make a clone of the instruction and + // replace operands that we already transformed before. + for (size_t j = 0; j < NumPrefetchesLeft; j++) { + if (j >= Transforms.size() || Transforms[j].count(Inst)) + continue; + Instruction *TransformedInst = Inst->clone(); + Builder.Insert(TransformedInst); + TransformedInst->moveAfter(Inst); + for (unsigned i = 0; i < TransformedInst->getNumOperands(); i++) { + Value *Operand = TransformedInst->getOperand(i); + if (Transforms[j].count(Operand)) + TransformedInst->replaceUsesOfWith(Operand, Transforms[j][Operand]); + } + + Transforms[j].insert( + std::pair(Inst, TransformedInst)); + } + break; + } + } + } + return true; +} + +/// Find the indirect load that depends on the auxiliary induction variable and +/// construct an instr list that contains loop variant instruction from the +/// target load to the candidate phi instr. 
+bool LoopDataPrefetch::findCandidateMemoryLoads( + Instruction *I, SmallSetVector &InstList, + SmallPtrSet &InstSet, + SmallVector &CandidateMemoryLoads, + std::vector> &DependentInstList, + SmallPtrSet LoopAuxIndPHINodes) { + bool ret = false; + + for (Use &U : I->operands()) { + // If the value is loop invariant, just continue. + if (LI->getLoopFor(I->getParent())->isLoopInvariant(U.get())) + continue; + + Instruction *OperandInst = dyn_cast(U.get()); + if (OperandInst != nullptr) { + switch (OperandInst->getOpcode()) { + case Instruction::Load: { + // Check if the load instruction that it depends on is already in the + // candidates. If yes, add the candidate's dependent instructions to the + // list. If not, the load instruction it depends on is invalid. + LoadInst *LoadI = dyn_cast(OperandInst); + if (isLoadInCandidateMemoryLoads(LoadI, InstList, InstSet, + CandidateMemoryLoads, + DependentInstList)) { + // We do not return early in case there are other auxiliary induction + // variables to check. + ret = true; + } + break; + } + case Instruction::PHI: { + // Check if the PHI is a loop auxiliary induction PHI. If yes, we found a + // valid load dependent on a loop auxiliary induction variable. If not, + // this is an invalid candidate. + PHINode *PhiInst = dyn_cast(OperandInst); + if (LoopAuxIndPHINodes.contains(PhiInst)) { + // In order to prevent the size of SmallVector from going out of + // bounds for large cases, only the last access of the element is + // retained. Update the position of OperandInst in the InstList. + if (InstList.count(OperandInst)) + InstList.remove(OperandInst); + InstList.insert(OperandInst); + return true; + } + break; + } + case Instruction::Call: { + if (OperandInst->mayReadOrWriteMemory()) + return false; + CallInst *Call = dyn_cast(OperandInst); + if (!Call->doesNotThrow()) + return false; + + // Use DFS to search through the operands. + InstList.insert(OperandInst); + if (findCandidateMemoryLoads(OperandInst, InstList, InstSet, + CandidateMemoryLoads, DependentInstList, + LoopAuxIndPHINodes)) { + // We do not return early in case there are other auxiliary + // induction variables to check + ret = true; + } else { + // If the operand isn't dependent on an auxiliary induction + // variable, remove any instructions added to DependentInstList from + // this operand + if (InstList.count(OperandInst)) + InstList.remove(OperandInst); + InstList.insert(OperandInst); + return false; + } + break; + } + case Instruction::Invoke: { + // We currently cannot handle the case where the indirect load depends + // on other functions. + return false; + } + default: { + // Use DFS to search through the operands. + if (InstList.count(OperandInst)) + InstList.remove(OperandInst); + InstList.insert(OperandInst); + if (findCandidateMemoryLoads(OperandInst, InstList, InstSet, + CandidateMemoryLoads, DependentInstList, + LoopAuxIndPHINodes)) { + // We do not return early in case there are other auxiliary induction + // variables to check + ret = true; + } else { + // If the operand isn't dependent on an auxiliary induction variable, + // remove any instructions added to DependentInstList from this + // operand + InstList.remove(OperandInst); + } + } + } + } + } + return ret; +} + +/// Helper function to determine whether the given load is in +/// CandidateMemoryLoads. If yes, add the candidate's dependent instructions +/// to the list.
+bool LoopDataPrefetch::isLoadInCandidateMemoryLoads( + LoadInst *LoadI, SmallSetVector &InstList, + SmallPtrSet &InstSet, + SmallVector &CandidateMemoryLoads, + std::vector> &DependentInstList) { + size_t CandidateLoadIndex = 0; + for (auto CandidateMemoryLoad : CandidateMemoryLoads) { + if (LoadI == CandidateMemoryLoad) + break; + CandidateLoadIndex++; + } + + if (CandidateLoadIndex >= CandidateMemoryLoads.size() || InstSet.count(LoadI)) + return false; + + for (auto CandidateInst : DependentInstList[CandidateLoadIndex]) { + if (InstList.count(CandidateInst)) + InstList.remove(CandidateInst); + InstList.insert(CandidateInst); + InstSet.insert(CandidateInst); + } + return true; +} + +/// Returns whether the given loop should be processed to insert prefetches for +/// indirect loads. +bool LoopDataPrefetch::canDoIndirectPrefetch(Loop *L) { + // Support innermost loops in a simple form. However, the parent of the + // inner loop will be processed as well in the case of nested loops. If + // IndirectionLevel is low, only allow single-block loops; otherwise, allow + // up to 5 blocks under certain conditions. + if (!L->isInnermost() || !L->getLoopPreheader() || + (IndirectionLevel <= 3 && L->getNumBlocks() != 1) || + (IndirectionLevel > 3 && L->getNumBlocks() == 1) || L->getNumBlocks() > 5) + return false; + return true; +} + +/// Check if the load depends on CRC hash functions. +bool LoopDataPrefetch::isCrcHashDataAccess(Instruction *I, + Instruction *PrefetchingLoad) { + if (llvm::IntrinsicInst *II = dyn_cast(I)) + // If CRC functions are used for offset calculation then the offset will be + // random. To avoid cache misses, data prefetch is needed. + switch (II->getIntrinsicID()) { + case Intrinsic::aarch64_crc32b: + case Intrinsic::aarch64_crc32cb: + case Intrinsic::aarch64_crc32h: + case Intrinsic::aarch64_crc32ch: + case Intrinsic::aarch64_crc32w: + case Intrinsic::aarch64_crc32cw: + case Intrinsic::aarch64_crc32x: + case Intrinsic::aarch64_crc32cx: { + // Check that the candidate load's index is incremented by 1. + if (auto *LI = dyn_cast(PrefetchingLoad)) { + if (auto *GEPI = dyn_cast(LI->getPointerOperand())) { + // The data access will be consecutive if the GEP has a single index. + if (GEPI->getNumOperands() > 2) + return false; + auto *PtrIndices = dyn_cast(GEPI->getOperand(1)); + if (!PtrIndices || isa(PtrIndices)) + return true; + for (auto &U : PtrIndices->uses()) + if (auto *PN = dyn_cast(U.getUser())) + if (getStep(PN, SE) <= 1) + return true; + } + } + break; + } + } + return false; +} + +bool LoopDataPrefetch::isIntermediateLoadSupported( + Loop *L, LoadInst *&CandidateLoad, + SmallSetVector &InstList) { + BasicBlock *DependentBB = nullptr; + for (auto *I : InstList) { + if (isSupportsNonFaultyLoad(CandidateLoad->getModule())) { + if (LoadInst *IntermediateLoad = dyn_cast(I)) { + if (IntermediateLoad == CandidateLoad) + continue; + // If the intermediate load has a scalar type, an SVE non-faulting load + // can be used. + auto *LoadTy = IntermediateLoad->getType(); + if (!LoadTy->isIntegerTy() && !LoadTy->isFloatingPointTy() && + !LoadTy->isPointerTy()) { + return false; + } + } + } else { + // If the intermediate load is in a different basic block then there is + // a chance of a segmentation fault.
+ if (DependentBB && isa(I) && DependentBB != I->getParent() && + L->contains(I->getParent())) { + return false; + } + if (L->contains(I->getParent())) + DependentBB = I->getParent(); + } + } + return true; +} + +bool LoopDataPrefetch::doIndirectPrefetch(Loop *L, unsigned ItersAhead) { + // List of valid phi nodes that indirect loads can depend on. + SmallPtrSet LoopAuxIndPHINodes; + // Map of valid phi node to its bound value in the preheader. + ValueMap AuxIndBounds; + // Candidate memory loads in the loop. + SmallVector CandidateMemoryLoads; + // List of instructions from phi to load. + std::vector> DependentInstList; + // Pointer operands of stores in the loop. + SmallVector LoopStorePtrs; + bool MadeChange = false; + + // Get loop induction and auxiliary induction phis. (They will be candidates + // for phi node matching during construction of the candidate instructions.) + // We also use the phi nodes to determine the loop upper bound. + Value *NumIterations = + getLoopIterationNumber(L, LoopAuxIndPHINodes, AuxIndBounds); + if (NumIterations == nullptr) + return MadeChange; + + if (!RandomAccessPrefetch && !canDoIndirectPrefetch(L)) { + cleanLoopIterationNumber(NumIterations); + return MadeChange; + } + + // Find candidate auxiliary induction variables which the indirect load + // could depend on. + for (auto &I : *L->getHeader()) + if (PHINode *PHI = dyn_cast(&I)) { + InductionDescriptor IndDesc; + if (InductionDescriptor::isInductionPHI(PHI, L, SE, IndDesc) && + L->getInductionVariable(*SE) != PHI) { + if (canGetAuxIndVarBound(L, PHI, LoopAuxIndPHINodes)) + LoopAuxIndPHINodes.insert(PHI); + } + } + + // Also search for candidates in the parent loop of the current innermost + // loop. This will capture more opportunities in the outer loop. + SmallVector BBList; + for (auto &BB : L->blocks()) + BBList.push_back(BB); + if (L->getParentLoop()) + for (auto &BB : L->getParentLoop()->blocks()) { + // We don't want to repeat blocks in the case of nested loops. + if (L->contains(BB)) + continue; + BBList.push_back(BB); + } + + // Iterate through the loop and keep track of the memory loads and the + // instruction list they depend on. + for (const auto BB : BBList) { + for (auto &I : *BB) + if (LoadInst *LoadI = dyn_cast(&I)) { + SmallSetVector InstList; + SmallSet InstSet; + InstList.insert(LoadI); + InstSet.insert(LoadI); + if (findCandidateMemoryLoads(LoadI, InstList, InstSet, + CandidateMemoryLoads, DependentInstList, + LoopAuxIndPHINodes)) { + if (!isIntermediateLoadSupported(L, LoadI, InstList)) + continue; + LLVM_DEBUG(dbgs() << "Found load candidate " << *LoadI << "\n"); + CandidateMemoryLoads.push_back(LoadI); + DependentInstList.push_back(InstList); + } + } else if (StoreInst *StoreI = dyn_cast(&I)) { + // Keep track of store insts to avoid conflicts. + LoopStorePtrs.push_back(StoreI->getPointerOperand()); + } + } + + // Keep track of previously transformed instrs for the offset and target + // loads so we can reuse them. + SmallVector> Transforms; + for (unsigned i = 0; i < CandidateMemoryLoads.size(); i++) { + SmallSetVector DependentInsts = DependentInstList[i]; + unsigned NumLoads = 0; + bool NoConflict = true; + // Find candidates that contain indirect loads and check that the load of + // the offset doesn't alias with other stores. + for (auto DependentInst : DependentInsts) { + if (LoadInst *LoadI = dyn_cast(DependentInst)) { + NumLoads++; + // For the load of the target address offset, we avoid the load + // conflicting with stores in the same loop.
+ if (NumLoads == IndirectionLevel) { + Value *LoadPtr = LoadI->getPointerOperand(); + for (Value *StorePtr : LoopStorePtrs) + if (AA->isMustAlias(LoadPtr, StorePtr)) { + NoConflict = false; + break; + } + } + } + } + + // Prefetch all indirect loads without conflict to the offset load. + if (NumLoads == IndirectionLevel && NoConflict) { + MadeChange |= insertPrefetcherForIndirectLoad( + L, i, NumIterations, CandidateMemoryLoads, DependentInsts, + AuxIndBounds, Transforms, ItersAhead); + } + } + + cleanLoopIterationNumber(NumIterations); + return MadeChange; +} + PreservedAnalyses LoopDataPrefetchPass::run(Function &F, FunctionAnalysisManager &AM) { + AliasAnalysis *AA = &AM.getResult(F); DominatorTree *DT = &AM.getResult(F); LoopInfo *LI = &AM.getResult(F); ScalarEvolution *SE = &AM.getResult(F); @@ -179,8 +1125,16 @@ PreservedAnalyses LoopDataPrefetchPass::run(Function &F, &AM.getResult(F); const TargetTransformInfo *TTI = &AM.getResult(F); - LoopDataPrefetch LDP(AC, DT, LI, SE, TTI, ORE); - bool Changed = LDP.run(); + // Ensure loops are in simplified form which is a pre-requisite for loop data + // prefetch pass. Added only for new PM since the legacy PM has already added + // LoopSimplify pass as a dependency. + bool Changed = false; + for (auto &L : *LI) { + Changed |= simplifyLoop(L, DT, LI, SE, AC, nullptr, false); + } + + LoopDataPrefetch LDP(AA, AC, DT, LI, SE, TTI, ORE); + Changed |= LDP.run(); if (Changed) { PreservedAnalyses PA; @@ -196,6 +1150,7 @@ bool LoopDataPrefetchLegacyPass::runOnFunction(Function &F) { if (skipFunction(F)) return false; + AliasAnalysis *AA = &getAnalysis().getAAResults(); DominatorTree *DT = &getAnalysis().getDomTree(); LoopInfo *LI = &getAnalysis().getLoopInfo(); ScalarEvolution *SE = &getAnalysis().getSE(); @@ -206,7 +1161,7 @@ bool LoopDataPrefetchLegacyPass::runOnFunction(Function &F) { const TargetTransformInfo *TTI = &getAnalysis().getTTI(F); - LoopDataPrefetch LDP(AC, DT, LI, SE, TTI, ORE); + LoopDataPrefetch LDP(AA, AC, DT, LI, SE, TTI, ORE); return LDP.run(); } @@ -214,14 +1169,26 @@ bool LoopDataPrefetch::run() { // If PrefetchDistance is not set, don't run the pass. This gives an // opportunity for targets to run this pass for selected subtargets only // (whose TTI sets PrefetchDistance and CacheLineSize). - if (getPrefetchDistance() == 0 || TTI->getCacheLineSize() == 0) { + if (getPrefetchDistance() == 0 || + (TTI->getCacheLineSize() == 0 && CachelineSize == 0)) { LLVM_DEBUG(dbgs() << "Please set both PrefetchDistance and CacheLineSize " "for loop data prefetch.\n"); return false; } bool MadeChange = false; + if (DisableDirectLoadPrefetch.getNumOccurrences() == 0 && + !TTI->isProfitableToDirectPrefetch()) { + LLVM_DEBUG(dbgs() << "Disabling direct load prefetching.\n"); + DisableDirectLoadPrefetch = true; + } + if (RandomAccessPrefetch) { + OuterLoopPrefetch = true; + } + if (DisableDirectLoadPrefetch && !IndirectLoadPrefetch && + !RandomAccessPrefetch) + return MadeChange; for (Loop *I : *LI) for (Loop *L : depth_first(I)) MadeChange |= runOnLoop(L); @@ -274,10 +1241,18 @@ struct Prefetch { bool LoopDataPrefetch::runOnLoop(Loop *L) { bool MadeChange = false; - // Only prefetch in the inner-most loop - if (!L->isInnermost()) + if (L->getLoopDepth() < PrefetchLoopDepth) return MadeChange; + bool IsInnerMost = true; + // Prefetch outer loop if needed. 
+ if (!L->isInnermost()) { + if (OuterLoopPrefetch) + IsInnerMost = false; + else + return MadeChange; + } + SmallPtrSet EphValues; CodeMetrics::collectEphemeralValues(L, AC, EphValues); @@ -323,78 +1298,101 @@ bool LoopDataPrefetch::runOnLoop(Loop *L) { unsigned NumMemAccesses = 0; unsigned NumStridedMemAccesses = 0; SmallVector Prefetches; - for (const auto BB : L->blocks()) - for (auto &I : *BB) { - Value *PtrValue; - Instruction *MemI; - - if (LoadInst *LMemI = dyn_cast(&I)) { - MemI = LMemI; - PtrValue = LMemI->getPointerOperand(); - } else if (StoreInst *SMemI = dyn_cast(&I)) { - if (!doPrefetchWrites()) continue; - MemI = SMemI; - PtrValue = SMemI->getPointerOperand(); - } else continue; - - unsigned PtrAddrSpace = PtrValue->getType()->getPointerAddressSpace(); - if (!TTI->shouldPrefetchAddressSpace(PtrAddrSpace)) - continue; - NumMemAccesses++; - if (L->isLoopInvariant(PtrValue)) - continue; - - const SCEV *LSCEV = SE->getSCEV(PtrValue); - const SCEVAddRecExpr *LSCEVAddRec = dyn_cast(LSCEV); - if (!LSCEVAddRec) - continue; - NumStridedMemAccesses++; - - // We don't want to double prefetch individual cache lines. If this - // access is known to be within one cache line of some other one that - // has already been prefetched, then don't prefetch this one as well. - bool DupPref = false; - for (auto &Pref : Prefetches) { - const SCEV *PtrDiff = SE->getMinusSCEV(LSCEVAddRec, Pref.LSCEVAddRec); - if (const SCEVConstant *ConstPtrDiff = - dyn_cast(PtrDiff)) { - int64_t PD = std::abs(ConstPtrDiff->getValue()->getSExtValue()); - if (PD < (int64_t) TTI->getCacheLineSize()) { - Pref.addInstruction(MemI, DT, PD); - DupPref = true; - break; + if (!DisableDirectLoadPrefetch) { + for (const auto BB : L->blocks()) { + // If this is not the innermost loop, we avoid prefetching in subloops. + for (auto &I : *BB) { + Value *PtrValue = nullptr; + Instruction *MemI; + + if (LoadInst *LMemI = dyn_cast(&I)) { + MemI = LMemI; + PtrValue = LMemI->getPointerOperand(); + } else if (StoreInst *SMemI = dyn_cast(&I)) { + if (!doPrefetchWrites()) + continue; + MemI = SMemI; + PtrValue = SMemI->getPointerOperand(); + } else + continue; + + if (!PtrValue) + continue; + if (getPrefetchDistance() == 0) + continue; + + unsigned PtrAddrSpace = PtrValue->getType()->getPointerAddressSpace(); + if (!TTI->shouldPrefetchAddressSpace(PtrAddrSpace)) + continue; + NumMemAccesses++; + if (L->isLoopInvariant(PtrValue)) + continue; + + const SCEV *LSCEV = SE->getSCEV(PtrValue); + const SCEVAddRecExpr *LSCEVAddRec = dyn_cast(LSCEV); + if (!LSCEVAddRec) + continue; + NumStridedMemAccesses++; + + // For outer loops, we only prefetch memory instructions whose stride + // depends on the current loop. + if (!IsInnerMost && LSCEVAddRec->getLoop() != L) + continue; + + // We don't want to double prefetch individual cache lines. If this + // access is known to be within one cache line of some other one that + // has already been prefetched, then don't prefetch this one as well. + bool DupPref = false; + for (auto &Pref : Prefetches) { + const SCEV *PtrDiff = SE->getMinusSCEV(LSCEVAddRec, Pref.LSCEVAddRec); + if (const SCEVConstant *ConstPtrDiff = - dyn_cast(PtrDiff)) { + dyn_cast(PtrDiff)) { + int64_t PD = std::abs(ConstPtrDiff->getValue()->getSExtValue()); + // Use the CachelineSize value from the compiler option. + int64_t CacheLineSize = CachelineSize.getNumOccurrences() + ? CachelineSize + : TTI->getCacheLineSize(); + // If the TTI CacheLineSize is zero then the default CachelineSize will + // be used. + CacheLineSize = CacheLineSize ?
CacheLineSize : CachelineSize; + if (PD < (int64_t)CacheLineSize) { + Pref.addInstruction(MemI, DT, PD); + DupPref = true; + break; + } } } + if (!DupPref) + Prefetches.push_back(Prefetch(LSCEVAddRec, MemI)); } - if (!DupPref) - Prefetches.push_back(Prefetch(LSCEVAddRec, MemI)); } + } - unsigned TargetMinStride = - getMinPrefetchStride(NumMemAccesses, NumStridedMemAccesses, - Prefetches.size(), HasCall); + unsigned TargetMinStride = getMinPrefetchStride( + NumMemAccesses, NumStridedMemAccesses, Prefetches.size(), HasCall); LLVM_DEBUG(dbgs() << "Prefetching " << ItersAhead - << " iterations ahead (loop size: " << LoopSize << ") in " - << L->getHeader()->getParent()->getName() << ": " << *L); - LLVM_DEBUG(dbgs() << "Loop has: " - << NumMemAccesses << " memory accesses, " - << NumStridedMemAccesses << " strided memory accesses, " - << Prefetches.size() << " potential prefetch(es), " - << "a minimum stride of " << TargetMinStride << ", " - << (HasCall ? "calls" : "no calls") << ".\n"); + << " iterations ahead (loop size: " << LoopSize << ") in " + << L->getHeader()->getParent()->getName() << ": " << *L); + LLVM_DEBUG(dbgs() << "Loop has: " << NumMemAccesses << " memory accesses, " + << NumStridedMemAccesses << " strided memory accesses, " + << Prefetches.size() << " potential prefetch(es), " + << "a minimum stride of " << TargetMinStride << ", " + << (HasCall ? "calls" : "no calls") << ".\n"); for (auto &P : Prefetches) { // Check if the stride of the accesses is large enough to warrant a - // prefetch. + // prefetch. If MinPrefetchStride <= 1, no need to check if any stride + // goes. if (!isStrideLargeEnough(P.LSCEVAddRec, TargetMinStride)) continue; BasicBlock *BB = P.InsertPt->getParent(); SCEVExpander SCEVE(*SE, BB->getModule()->getDataLayout(), "prefaddr"); - const SCEV *NextLSCEV = SE->getAddExpr(P.LSCEVAddRec, SE->getMulExpr( - SE->getConstant(P.LSCEVAddRec->getType(), ItersAhead), - P.LSCEVAddRec->getStepRecurrence(*SE))); + const SCEV *NextLSCEV = SE->getAddExpr( + P.LSCEVAddRec, + SE->getMulExpr(SE->getConstant(P.LSCEVAddRec->getType(), ItersAhead), + P.LSCEVAddRec->getStepRecurrence(*SE))); if (!SCEVE.isSafeToExpand(NextLSCEV)) continue; @@ -405,24 +1403,26 @@ bool LoopDataPrefetch::runOnLoop(Loop *L) { IRBuilder<> Builder(P.InsertPt); Module *M = BB->getParent()->getParent(); Type *I32 = Type::getInt32Ty(BB->getContext()); - Function *PrefetchFunc = Intrinsic::getDeclaration( - M, Intrinsic::prefetch, PrefPtrValue->getType()); - Builder.CreateCall( - PrefetchFunc, - {PrefPtrValue, - ConstantInt::get(I32, P.Writes), - ConstantInt::get(I32, 3), ConstantInt::get(I32, 1)}); + Function *PrefetchFunc = Intrinsic::getDeclaration(M, Intrinsic::prefetch, + PrefPtrValue->getType()); + Builder.CreateCall(PrefetchFunc, + {PrefPtrValue, ConstantInt::get(I32, P.Writes), + ConstantInt::get(I32, IsInnerMost ? 3 : 0), + ConstantInt::get(I32, 1)}); ++NumPrefetches; LLVM_DEBUG(dbgs() << " Access: " - << *P.MemI->getOperand(isa(P.MemI) ? 0 : 1) - << ", SCEV: " << *P.LSCEVAddRec << "\n"); + << *P.MemI->getOperand(isa(P.MemI) ? 
0 : 1) + << ", SCEV: " << *P.LSCEVAddRec << "\n"); ORE->emit([&]() { - return OptimizationRemark(DEBUG_TYPE, "Prefetched", P.MemI) - << "prefetched memory access"; - }); + return OptimizationRemark(DEBUG_TYPE, "Prefetched", P.MemI) + << "prefetched memory access"; + }); MadeChange = true; } + if (IndirectLoadPrefetch || RandomAccessPrefetch) + MadeChange |= doIndirectPrefetch(L, ItersAhead); + return MadeChange; } diff --git a/llvm/test/Transforms/LoopDataPrefetch/AArch64/indirect-load-prefetch_crc.ll b/llvm/test/Transforms/LoopDataPrefetch/AArch64/indirect-load-prefetch_crc.ll new file mode 100644 index 0000000000000000000000000000000000000000..e6a37ae38f0683ae8d56aa4b4b862e6444bd14e7 --- /dev/null +++ b/llvm/test/Transforms/LoopDataPrefetch/AArch64/indirect-load-prefetch_crc.ll @@ -0,0 +1,109 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -passes=loop-data-prefetch --prefetch-distance=1024 --random-access-prefetch=true -disable-direct-prefetch -S | FileCheck %s --check-prefixes=CHECK,CHECK-NON-SVE +; RUN: opt < %s -mcpu=tsv110 -passes=loop-data-prefetch --random-access-prefetch=true -disable-direct-prefetch -S | FileCheck %s --check-prefixes=CHECK,CHECK-NON-SVE +; RUN: opt < %s -mcpu=hip09 -passes=loop-data-prefetch --random-access-prefetch=true --prefetch-with-nonfaulty-load=false -S | FileCheck %s --check-prefixes=CHECK,CHECK-NON-SVE +; RUN: opt < %s -mcpu=hip12 -passes=loop-data-prefetch --random-access-prefetch=true -S | FileCheck %s --check-prefixes=CHECK,CHECK-SVE + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64-unknown-linux-gnu" + +declare i32 @llvm.aarch64.crc32w(i32, i32) + +; Function Attrs: mustprogress nofree nosync nounwind willreturn memory(argmem: read) uwtable +define dso_local noundef i32 @_Z12matchcolumnsPiiS_ii(ptr nocapture noundef readonly %A, i32 noundef %B, ptr nocapture noundef readonly %Key, i32 noundef %index, i32 noundef %count) { +; CHECK-LABEL: @_Z12matchcolumnsPiiS_ii( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret i32 [[ADD:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV22:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT23:%.*]], [[IF_END:%.*]] ] +; CHECK-NEXT: [[SUM_020:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[ADD]], [[IF_END]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDVARS_IV22]], 60 +; CHECK-NEXT: [[TMP1:%.*]] = icmp slt i64 [[TMP0]], 99 +; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i64 [[TMP0]], i64 99 +; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDVARS_IV22]], 120 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDVARS_IV22]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-NON-SVE-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 4 +; CHECK-SVE-NEXT: [[TMP7_1:%.*]] = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 1) +; CHECK-SVE-NEXT: [[TMP7_2:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.ldnf1.nxv4i32(<vscale x 4 x i1> [[TMP7_1]], ptr [[TMP5]]) +; CHECK-SVE-NEXT: [[TMP7:%.*]] = extractelement <vscale x 4 x i32> [[TMP7_2]], i32 0 +; CHECK-NEXT: call void @llvm.prefetch.p0(ptr [[TMP4]], i32 0, i32 3, i32 1) +; CHECK-NEXT: [[TMP8:%.*]] = tail call i32 @llvm.aarch64.crc32w(i32 [[TMP6]], i32 -1) +; CHECK-NEXT: [[TMP9:%.*]] = tail call i32 @llvm.aarch64.crc32w(i32 [[TMP7]], i32 -1) +; CHECK-NEXT: [[AND:%.*]] = and
i32 [[TMP8]], 255 +; CHECK-NEXT: [[TMP10:%.*]] = and i32 [[TMP9]], 255 +; CHECK-NEXT: [[IDXPROM1:%.*]] = zext i32 [[AND]] to i64 +; CHECK-NEXT: [[TMP11:%.*]] = zext i32 [[TMP10]] to i64 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[KEY:%.*]], i64 [[IDXPROM1]] +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[KEY]], i64 [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 +; CHECK-NEXT: call void @llvm.prefetch.p0(ptr [[TMP12]], i32 0, i32 3, i32 1) +; CHECK-NEXT: [[CMP3_NOT:%.*]] = icmp eq i32 [[TMP13]], [[B:%.*]] +; CHECK-NEXT: br i1 [[CMP3_NOT]], label [[IF_END]], label [[DO_BODY_PREHEADER:%.*]] +; CHECK: do.body.preheader: +; CHECK-NEXT: br label [[DO_BODY:%.*]] +; CHECK: do.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[DO_BODY]] ], [ [[IDXPROM1]], [[DO_BODY_PREHEADER]] ] +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[KEY]], i64 [[INDVARS_IV_NEXT]] +; CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4 +; CHECK-NEXT: [[CMP6_NOT:%.*]] = icmp eq i32 [[TMP14]], [[B]] +; CHECK-NEXT: br i1 [[CMP6_NOT]], label [[IF_END_LOOPEXIT:%.*]], label [[DO_BODY]] +; CHECK: if.end.loopexit: +; CHECK-NEXT: [[TMP15:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 +; CHECK-NEXT: br label [[IF_END]] +; CHECK: if.end: +; CHECK-NEXT: [[AKEY_1:%.*]] = phi i32 [ [[AND]], [[FOR_BODY]] ], [ [[TMP15]], [[IF_END_LOOPEXIT]] ] +; CHECK-NEXT: [[IDXPROM7:%.*]] = sext i32 [[AKEY_1]] to i64 +; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IDXPROM7]] +; CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr [[ARRAYIDX8]], align 4 +; CHECK-NEXT: [[ADD]] = add nsw i32 [[TMP16]], [[SUM_020]] +; CHECK-NEXT: [[INDVARS_IV_NEXT23]] = add nuw nsw i64 [[INDVARS_IV22]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT23]], 100 +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]] +; +entry: + br label %for.body + +for.cond.cleanup: + ret i32 %add + +for.body: + %indvars.iv22 = phi i64 [ 0, %entry ], [ %indvars.iv.next23, %if.end ] + %sum.020 = phi i32 [ 0, %entry ], [ %add, %if.end ] + %arrayidx = getelementptr inbounds i32, ptr %A, i64 %indvars.iv22 + %0 = load i32, ptr %arrayidx, align 4 + %1 = tail call i32 @llvm.aarch64.crc32w(i32 %0, i32 -1) + %and = and i32 %1, 255 + %idxprom1 = zext i32 %and to i64 + %arrayidx2 = getelementptr inbounds i32, ptr %Key, i64 %idxprom1 + %2 = load i32, ptr %arrayidx2, align 4 + %cmp3.not = icmp eq i32 %2, %B + br i1 %cmp3.not, label %if.end, label %do.body + +do.body: + %indvars.iv = phi i64 [ %idxprom1, %for.body ], [ %indvars.iv.next, %do.body ] + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %arrayidx5 = getelementptr inbounds i32, ptr %Key, i64 %indvars.iv.next + %3 = load i32, ptr %arrayidx5, align 4 + %cmp6.not = icmp eq i32 %3, %B + br i1 %cmp6.not, label %if.end.loopexit, label %do.body + +if.end.loopexit: + %4 = trunc i64 %indvars.iv.next to i32 + br label %if.end + +if.end: + %AKey.1 = phi i32 [ %and, %for.body ], [ %4, %if.end.loopexit ] + %idxprom7 = sext i32 %AKey.1 to i64 + %arrayidx8 = getelementptr inbounds i32, ptr %A, i64 %idxprom7 + %5 = load i32, ptr %arrayidx8, align 4 + %add = add nsw i32 %5, %sum.020 + %indvars.iv.next23 = add nuw nsw i64 %indvars.iv22, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next23, 100 + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +}
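For illustration, the test above covers the CRC-driven random-access path; below is a minimal sketch (not taken from the patch or its tests) of the plain A[B[i]] pattern that the -indirect-load-prefetch path is aimed at. Assuming an AArch64 target, it could be fed to the pass with the flags added in this patch, for example: opt -passes=loop-data-prefetch -prefetch-distance=1024 -indirect-load-prefetch -S. With the default -indirect-level=2 the pass would attempt to prefetch both the B[i] offset load and the A[B[i]] load; the exact placement depends on the target's PrefetchDistance and MinPrefetchStride tuning.

; Hypothetical input (not part of this patch): sum += A[B[i]] in a
; single-block innermost loop with a preheader, the shape that
; canDoIndirectPrefetch() accepts at the default indirection level.
target triple = "aarch64-unknown-linux-gnu"

define i32 @sum_indirect(ptr %A, ptr %B, i64 %n) {
entry:
  br label %for.body

for.body:
  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
  %sum = phi i32 [ 0, %entry ], [ %sum.next, %for.body ]
  %b.addr = getelementptr inbounds i32, ptr %B, i64 %i
  %idx = load i32, ptr %b.addr, align 4            ; intermediate (offset) load
  %idx.ext = sext i32 %idx to i64
  %a.addr = getelementptr inbounds i32, ptr %A, i64 %idx.ext
  %val = load i32, ptr %a.addr, align 4            ; indirect load A[B[i]]
  %sum.next = add nsw i32 %sum, %val
  %i.next = add nuw nsw i64 %i, 1
  %done = icmp eq i64 %i.next, %n
  br i1 %done, label %exit, label %for.body

exit:
  ret i32 %sum.next
}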