From 03ef4dc4b25a2e576db147f2c0f39c94d39a822a Mon Sep 17 00:00:00 2001 From: Ramkrishnan NK Date: Wed, 27 Aug 2025 14:19:10 -0400 Subject: [PATCH] [LoopDataPrefetch] Add support for indirect load prefetch The feature of inserting prefetch intrinsics for indirect load in loop is added. The default prefetch distance set for subtarget tsv110, hip09 and hip12. By default direct load prefetch is disabled for hip09 and hip12. --- .../llvm/Analysis/TargetTransformInfo.h | 6 + .../llvm/Analysis/TargetTransformInfoImpl.h | 2 + llvm/include/llvm/CodeGen/BasicTTIImpl.h | 2 + llvm/lib/Analysis/TargetTransformInfo.cpp | 4 + llvm/lib/Target/AArch64/AArch64Subtarget.cpp | 6 + llvm/lib/Target/AArch64/AArch64Subtarget.h | 10 + .../AArch64/AArch64TargetTransformInfo.cpp | 4 + .../AArch64/AArch64TargetTransformInfo.h | 2 + .../Transforms/Scalar/LoopDataPrefetch.cpp | 1166 +++++++++++++++-- .../AArch64/indirect-load-prefetch_crc.ll | 109 ++ 10 files changed, 1228 insertions(+), 83 deletions(-) create mode 100644 llvm/test/Transforms/LoopDataPrefetch/AArch64/indirect-load-prefetch_crc.ll diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h index ce3ff2ee7aec..f9d7d4fbbe50 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -726,6 +726,8 @@ public: bool isProfitableToLoopVersioning() const; + bool isProfitableToDirectPrefetch() const; + bool useAA() const; /// Return true if this type is legal. @@ -1674,6 +1676,7 @@ public: virtual bool isTruncateFree(Type *Ty1, Type *Ty2) = 0; virtual bool isProfitableToHoist(Instruction *I) = 0; virtual bool isProfitableToLoopVersioning() = 0; + virtual bool isProfitableToDirectPrefetch() = 0; virtual bool useAA() = 0; virtual bool isTypeLegal(Type *Ty) = 0; virtual unsigned getRegUsageForType(Type *Ty) = 0; @@ -2144,6 +2147,9 @@ public: bool isProfitableToLoopVersioning() override { return Impl.isProfitableToLoopVersioning(); } + bool isProfitableToDirectPrefetch() override { + return Impl.isProfitableToDirectPrefetch(); + } bool useAA() override { return Impl.useAA(); } bool isTypeLegal(Type *Ty) override { return Impl.isTypeLegal(Ty); } unsigned getRegUsageForType(Type *Ty) override { diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h index 3a99b02bc363..80479ade16f4 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -319,6 +319,8 @@ public: bool isProfitableToLoopVersioning() const { return false; } + bool isProfitableToDirectPrefetch() const { return true; } + bool useAA() const { return false; } bool isTypeLegal(Type *Ty) const { return false; } diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h index 0cd6599d4296..50c7ecdde77f 100644 --- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -403,6 +403,8 @@ public: bool isProfitableToLoopVersioning() const { return false; } + bool isProfitableToDirectPrefetch() const { return true; } + bool useAA() const { return getST()->useAA(); } bool isTypeLegal(Type *Ty) { diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp index e26c7ad3dd90..02da1e1dc65b 100644 --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -490,6 +490,10 @@ bool 
TargetTransformInfo::isProfitableToLoopVersioning() const { return TTIImpl->isProfitableToLoopVersioning(); } +bool TargetTransformInfo::isProfitableToDirectPrefetch() const { + return TTIImpl->isProfitableToDirectPrefetch(); +} + bool TargetTransformInfo::useAA() const { return TTIImpl->useAA(); } bool TargetTransformInfo::isTypeLegal(Type *Ty) const { diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp index 52268ec9fa3b..ec481250224b 100644 --- a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp +++ b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp @@ -263,22 +263,28 @@ void AArch64Subtarget::initializeProperties() { break; case TSV110: CacheLineSize = 64; + PrefetchDistance = 1024; PrefFunctionLogAlignment = 4; PrefLoopLogAlignment = 2; + MinPrefetchStride = 4; break; case HIP09: CacheLineSize = 64; + PrefetchDistance = 1024; PrefFunctionLogAlignment = 4; PrefLoopLogAlignment = 2; VScaleForTuning = 2; DefaultSVETFOpts = TailFoldingOpts::Simple; + MinPrefetchStride = 4; break; case HIP12: CacheLineSize = 64; + PrefetchDistance = 1024; PrefFunctionLogAlignment = 4; PrefLoopLogAlignment = 2; VScaleForTuning = 2; DefaultSVETFOpts = TailFoldingOpts::Simple; + MinPrefetchStride = 4; break; case ThunderX3T110: CacheLineSize = 64; diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.h b/llvm/lib/Target/AArch64/AArch64Subtarget.h index 48a818455d1a..557044820d06 100644 --- a/llvm/lib/Target/AArch64/AArch64Subtarget.h +++ b/llvm/lib/Target/AArch64/AArch64Subtarget.h @@ -213,6 +213,16 @@ public: } } + bool isHiSiliconHIPProc() const { + switch (ARMProcFamily) { + case HIP09: + case HIP12: + return true; + default: + return false; + } + } + bool isXRaySupported() const override { return true; } unsigned getMinVectorRegisterBitWidth() const { diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 6f45c03ae977..74b27770f35c 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -395,6 +395,10 @@ bool AArch64TTIImpl::isProfitableToLoopVersioning() const { return ST->isHiSiliconProc() || ForceEnableExperimentalOpt; } +bool AArch64TTIImpl::isProfitableToDirectPrefetch() const { + return !ST->isHiSiliconHIPProc(); +} + InstructionCost AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) { diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h index 0f4a4d7dc804..eae92c7aa5b6 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -94,6 +94,8 @@ public: bool isProfitableToLoopVersioning() const; + bool isProfitableToDirectPrefetch() const; + /// @} /// \name Vector TTI Implementations diff --git a/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp b/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp index 7c2770979a90..bd395becc40a 100644 --- a/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp +++ b/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp @@ -14,21 +14,34 @@ #include "llvm/InitializePasses.h" #include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/ADT/SmallSet.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/CodeMetrics.h" +#include "llvm/Analysis/DomTreeUpdater.h" +#include "llvm/Analysis/IVDescriptors.h" #include 
"llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/LoopIterator.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/IntrinsicsAArch64.h" #include "llvm/IR/Module.h" +#include "llvm/IR/ReplaceConstant.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" +#include "llvm/Target/TargetMachine.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils.h" +#include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/LoopSimplify.h" +#include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" #define DEBUG_TYPE "loop-data-prefetch" @@ -37,9 +50,9 @@ using namespace llvm; // By default, we limit this to creating 16 PHIs (which is a little over half // of the allocatable register set). -static cl::opt -PrefetchWrites("loop-prefetch-writes", cl::Hidden, cl::init(false), - cl::desc("Prefetch write addresses")); +static cl::opt PrefetchWrites("loop-prefetch-writes", cl::Hidden, + cl::init(false), + cl::desc("Prefetch write addresses")); static cl::opt PrefetchDistance("prefetch-distance", @@ -54,31 +67,136 @@ static cl::opt MaxPrefetchIterationsAhead( "max-prefetch-iters-ahead", cl::desc("Max number of iterations to prefetch ahead"), cl::Hidden); +static cl::opt + IndirectLoadPrefetch("indirect-load-prefetch", cl::Hidden, cl::init(false), + cl::desc("Enable indirect load prefetch")); + +static cl::opt PrefetchIterationsAhead( + "indirect-prefetch-iters-ahead", + cl::desc("Number of iterations for indirect-load prefetch"), cl::Hidden, + cl::init(0)); + +static cl::opt SkipIntermediate( + "indirect-prefetch-skip-intermediate", cl::Hidden, cl::init(false), + cl::desc( + "Skip prefetching intermediate loads while doing indirect prefetch")); + +static cl::opt IndirectionLevel( + "indirect-level", + cl::desc("Indirection level considered for indirect load prefetch"), + cl::Hidden, cl::init(2)); + +static cl::opt RandomAccessPrefetch( + "random-access-prefetch", cl::Hidden, cl::init(false), + cl::desc("Enable random offset indirect load prefetch")); + +static cl::opt + EnableNonFaultyLoad("prefetch-with-nonfaulty-load", cl::Hidden, + cl::init(false), + cl::desc("Prefetch with non-faulty Load instruction.")); + +static cl::opt CachelineSize("prefetch-cache-line-size", + cl::desc("Specify cache line size"), + cl::Hidden, cl::init(64)); + +static cl::opt + OuterLoopPrefetch("outer-loop-prefetch", cl::Hidden, cl::init(false), + cl::desc("Enable prefetch in outer loops")); + +static cl::opt + DisableDirectLoadPrefetch("disable-direct-prefetch", cl::Hidden, + cl::init(false), + cl::desc("Disable direct load prefetch")); + +static cl::opt + PrefetchLoopDepth("prefetch-loop-depth", + cl::desc("Least loop depth to insert prefetch"), + cl::Hidden, cl::init(1)); + STATISTIC(NumPrefetches, "Number of prefetches inserted"); +STATISTIC(NumIndPrefetches, "Number of indirect prefetches inserted"); namespace { +// Helper function to return a type with the same size as +// given step size +static Type *getPtrTypefromPHI(PHINode *PHI, int64_t StepSize) { + Type *Int8Ty = Type::getInt8Ty(PHI->getParent()->getContext()); + return ArrayType::get(Int8Ty, StepSize); +} + /// Loop prefetch implementation class. 
class LoopDataPrefetch { public: - LoopDataPrefetch(AssumptionCache *AC, DominatorTree *DT, LoopInfo *LI, - ScalarEvolution *SE, const TargetTransformInfo *TTI, + LoopDataPrefetch(AliasAnalysis *AA, AssumptionCache *AC, DominatorTree *DT, + LoopInfo *LI, ScalarEvolution *SE, + const TargetTransformInfo *TTI, OptimizationRemarkEmitter *ORE) - : AC(AC), DT(DT), LI(LI), SE(SE), TTI(TTI), ORE(ORE) {} + : AA(AA), AC(AC), DT(DT), LI(LI), SE(SE), TTI(TTI), ORE(ORE) {} bool run(); private: bool runOnLoop(Loop *L); + Value *getCanonicalishSizeVariable(Loop *L, PHINode *PHI) const; + Value * + getLoopIterationNumber(Loop *L, + SmallPtrSet &LoopAuxIndPHINodes, + ValueMap &AuxIndBounds); + /// If prefetch instruction is not inserted, need to clean iteration + /// instructions in the preheader. + void cleanLoopIterationNumber(Value *NumIterations); + /// Returns whether the auxiliary induction variable can generate bound. + /// If it can, add PHI to LoopAuxIndPHINodes + bool canGetAuxIndVarBound(Loop *L, PHINode *PHI, + SmallPtrSet &LoopAuxIndPHINodes); + + /// Generate bound for the auxiliary induction variable at the + /// preheader and add it to AuxIndBounds. + /// Returns whether the bound was successfully generated. + bool getAuxIndVarBound(Loop *L, PHINode *PHI, Value *NumIterations, + ValueMap &AuxIndBounds); + + bool insertPrefetcherForIndirectLoad( + Loop *L, unsigned Idx, Value *NumIterations, + SmallVector &CandidateMemoryLoads, + SmallSetVector &DependentInsts, + ValueMap &AuxIndBounds, + SmallVectorImpl> &Transforms, + unsigned ItersAhead); + + bool findCandidateMemoryLoads( + Instruction *I, SmallSetVector &InstList, + SmallPtrSet &InstSet, + SmallVector &CandidateMemoryLoads, + std::vector> &DependentInstList, + SmallPtrSet LoopAuxIndPHINodes); + + /// Helper function to determine whether the given load is in + /// CandidateMemoryLoads. If yes, add the candidate's depending inst to the + /// list + bool isLoadInCandidateMemoryLoads( + LoadInst *LoadI, SmallSetVector &InstList, + SmallPtrSet &InstSet, + SmallVector &CandidateMemoryLoads, + std::vector> &DependentInstList); + + /// Returns whether the given loop can do indirect prefetch and should be + /// processed to insert prefetches for indirect loads. + bool canDoIndirectPrefetch(Loop *L); + + bool isCrcHashDataAccess(Instruction *I, Instruction *PrefetchingLoad); + bool isIntermediateLoadSupported(Loop *L, LoadInst *&CandidateLoad, + SmallSetVector &InstList); + bool doIndirectPrefetch(Loop *L, unsigned ItersAhead); /// Check if the stride of the accesses is large enough to /// warrant a prefetch. 
bool isStrideLargeEnough(const SCEVAddRecExpr *AR, unsigned TargetMinStride); unsigned getMinPrefetchStride(unsigned NumMemAccesses, unsigned NumStridedMemAccesses, - unsigned NumPrefetches, - bool HasCall) { + unsigned NumPrefetches, bool HasCall) { if (MinPrefetchStride.getNumOccurrences() > 0) return MinPrefetchStride; return TTI->getMinPrefetchStride(NumMemAccesses, NumStridedMemAccesses, @@ -103,6 +221,15 @@ private: return TTI->enableWritePrefetching(); } + bool isSupportsNonFaultyLoad(Module *M) { + if (EnableNonFaultyLoad.getNumOccurrences() > 0) + return EnableNonFaultyLoad; + Triple TargetTriple = Triple(M->getTargetTriple()); + return TTI->supportsScalableVectors() && + TargetTriple.getArch() == Triple::aarch64; + } + + AliasAnalysis *AA; AssumptionCache *AC; DominatorTree *DT; LoopInfo *LI; @@ -120,6 +247,7 @@ public: } void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); AU.addRequired(); AU.addRequired(); AU.addPreserved(); @@ -140,6 +268,7 @@ public: char LoopDataPrefetchLegacyPass::ID = 0; INITIALIZE_PASS_BEGIN(LoopDataPrefetchLegacyPass, "loop-data-prefetch", "Loop Data Prefetch", false, false) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) @@ -169,8 +298,825 @@ bool LoopDataPrefetch::isStrideLargeEnough(const SCEVAddRecExpr *AR, return TargetMinStride <= AbsStride; } +/// Use the induction variable to generate value representing the total num of +/// iterations for the loop in the preheader. +Value *LoopDataPrefetch::getLoopIterationNumber( + Loop *L, SmallPtrSet &LoopAuxIndPHINodes, + ValueMap &AuxIndBounds) { + Value *LoopBoundValue; + Value *LoopStepValue; + Value *LoopStartValue; + Value *LoopPreHeader; + Value *NumIterations; + + // Use induction variable to derive number of iterations for the loop which + // will be used to calculate the upper bound for other auxiliary induction + // variables. + PHINode *PHI = L->getInductionVariable(*SE); + if (PHI == nullptr) + return nullptr; + + auto LoopLB = L->getBounds(*SE); + if (!LoopLB) + return nullptr; + + LoopStartValue = &(LoopLB->getInitialIVValue()); + LoopStepValue = LoopLB->getStepValue(); + LoopBoundValue = &(LoopLB->getFinalIVValue()); + LoopPreHeader = L->getLoopPreheader(); + + if (LoopStartValue == nullptr || LoopStepValue == nullptr || + LoopBoundValue == nullptr || LoopPreHeader == nullptr) + return nullptr; + + // Step should be constant. + if (!isa(SE->getSCEV(LoopStepValue))) + return nullptr; + + // Make sure each of them is invariant so we can use them in the preheader. + if (!L->isLoopInvariant(LoopBoundValue) || + !L->isLoopInvariant(LoopStepValue) || !L->isLoopInvariant(LoopStartValue)) + return nullptr; + + // Generate instruction that calculated the total number of iterations of the + // loop in the preheader. + IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); + Value *Range = Builder.CreateSub(LoopBoundValue, LoopStartValue); + NumIterations = Builder.CreateSDiv(Range, LoopStepValue); + + LoopAuxIndPHINodes.insert(PHI); + Value *Bound = nullptr; + // If the step is positive, the upper bound isn't included, i.e. accessing + // [bound] is not legal, so subtract the bound by LoopStepValue to prevent out + // of bounds memory access. 
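+  // For example (illustrative only, value names are not from the IR): for a
+  // canonical loop `for (i = Start; i < Bound; i += Step)` with a positive
+  // constant Step, the preheader code built here is roughly
+  //   %range = sub %Bound, %Start
+  //   %iters = sdiv %range, %Step
+  //   %bound = sub %Bound, %Step   ; last value the IV may legally take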
+ if (SE->isKnownNegative(SE->getSCEV(LoopStepValue))) + Bound = LoopBoundValue; + else + Bound = Builder.CreateSub(LoopBoundValue, LoopStepValue); + AuxIndBounds.insert(std::pair(PHI, Bound)); + return NumIterations; +} + +/// If prefetch instruction is not inserted. Need to clean iteration instruction +/// in the preheader. +void LoopDataPrefetch::cleanLoopIterationNumber(Value *NumIterations) { + RecursivelyDeleteTriviallyDeadInstructions(NumIterations); +} + +/// Returns whether the auxiliary induction variable can generate bound. +/// If it can generate a bound, add PHI to LoopAuxIndPHINodes +bool LoopDataPrefetch::canGetAuxIndVarBound( + Loop *L, PHINode *PHI, SmallPtrSet &LoopAuxIndPHINodes) { + Value *AuxIndVarStartValue = + PHI->getIncomingValueForBlock(L->getLoopPreheader()); + if (AuxIndVarStartValue == nullptr) + return false; + + const SCEV *LSCEV = SE->getSCEV(PHI); + const SCEVAddRecExpr *LSCEVAddRec = dyn_cast(LSCEV); + + if (LSCEVAddRec == nullptr) + return false; + + // Currently, we only support constant steps. + if (dyn_cast(LSCEVAddRec->getStepRecurrence(*SE))) { + InductionDescriptor IndDesc; + if (!InductionDescriptor::isInductionPHI(PHI, L, SE, IndDesc)) + return false; + + if (IndDesc.getInductionOpcode() != Instruction::Add && + IndDesc.getInductionOpcode() != Instruction::Sub && + IndDesc.getKind() != InductionDescriptor::IK_PtrInduction) + return false; + return true; + } + return false; +} + +/// Generate bound for the auxiliary induction variable at the preheader and add +/// it to AuxIndBounds. Returns whether the bound was successfully generated. +bool LoopDataPrefetch::getAuxIndVarBound( + Loop *L, PHINode *PHI, Value *NumIterations, + ValueMap &AuxIndBounds) { + IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); + Value *AuxIndVarStartValue = + PHI->getIncomingValueForBlock(L->getLoopPreheader()); + if (AuxIndVarStartValue == nullptr) + return false; + + const SCEV *LSCEV = SE->getSCEV(PHI); + const SCEVAddRecExpr *LSCEVAddRec = dyn_cast(LSCEV); + + // Currently, we only support constant steps. + if (const SCEVConstant *ConstPtrDiff = + dyn_cast(LSCEVAddRec->getStepRecurrence(*SE))) { + Value *AuxIndVarBound; + InductionDescriptor IndDesc; + if (!InductionDescriptor::isInductionPHI(PHI, L, SE, IndDesc)) + return false; + + // Calculate the upper bound for the auxiliary induction variable. + Value *CastedNumIterations = + Builder.CreateSExtOrTrunc(NumIterations, ConstPtrDiff->getType()); + + // Subtract one from CastedNumIterations as we want the bound to be in + // bounds. If there are N iterations, the first iteration will access the + // array at offset 0. On the N-th iteration, it will access the array at + // offset N-1, not N. 
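+    // For instance (illustrative numbers): with NumIterations = 100 and an
+    // add/sub induction of constant step 4 starting at S, the bound computed
+    // below is S + 4 * (100 - 1) = S + 396, i.e. the value the auxiliary
+    // induction variable holds on the last iteration.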
+ CastedNumIterations = Builder.CreateSub( + CastedNumIterations, ConstantInt::get(ConstPtrDiff->getType(), 1)); + // Teh induction operator is add / sub + if (IndDesc.getInductionOpcode() == Instruction::Add || + IndDesc.getInductionOpcode() == Instruction::Sub) { + Value *Range = + Builder.CreateMul(ConstPtrDiff->getValue(), CastedNumIterations); + AuxIndVarBound = Builder.CreateAdd(Range, AuxIndVarStartValue); + } else if (IndDesc.getKind() == InductionDescriptor::IK_PtrInduction) { + // The induction variable is a pointer + int64_t StepSize = ConstPtrDiff->getAPInt().getSExtValue(); + if (SE->isKnownNegative(ConstPtrDiff)) { + StepSize = -StepSize; + CastedNumIterations = Builder.CreateMul( + ConstantInt::getSigned(ConstPtrDiff->getType(), -1), + CastedNumIterations); + } + Type *GEPType = getPtrTypefromPHI(PHI, StepSize); + AuxIndVarBound = Builder.CreateInBoundsGEP(GEPType, AuxIndVarStartValue, + CastedNumIterations); + } else + return false; + + LLVM_DEBUG(dbgs() << "Added " + << (isa(SE->getSCEV(AuxIndVarBound)) + ? "Constant " + : "") + << "AuxIndVarBound " << *AuxIndVarBound + << " for AuxIndVar:" << *PHI << "\n"); + AuxIndBounds.insert(std::pair(PHI, AuxIndVarBound)); + + return true; + } + return false; +} + +// Helper function to calculate the step for a given loop +static uint64_t getStep(PHINode *PN, ScalarEvolution *SE) { + // Get the constant step for the induction phi so we can use it to calculate + // how much we should increase the induction for prefetching. + uint64_t Step = 0; + const SCEV *LSCEV = SE->getSCEV(PN); + const SCEVAddRecExpr *LSCEVAddRec = dyn_cast(LSCEV); + + if (LSCEVAddRec == nullptr) + return Step; + + if (const SCEVConstant *ConstPtrDiff = + dyn_cast(LSCEVAddRec->getStepRecurrence(*SE))) { + Step = ConstPtrDiff->getAPInt().getZExtValue(); + } + return Step; +} + +// Helper function to determine if the loop step is positive +static bool isPositiveStep(PHINode *PN, ScalarEvolution *SE) { + bool PositiveStep = true; + const SCEV *LSCEV = SE->getSCEV(PN); + const SCEVAddRecExpr *LSCEVAddRec = dyn_cast(LSCEV); + if (const SCEVConstant *ConstPtrDiff = + dyn_cast(LSCEVAddRec->getStepRecurrence(*SE))) { + if (SE->isKnownNegative(ConstPtrDiff)) { + PositiveStep = false; + } + } + return PositiveStep; +} + +// Helper function to calculate the step type of a PHI node. If the PHI node is +// not a pointer type, get the type PHI Node itself. Otherwise, get the integer +// type of the PHI's step/offset value. +static Type *getStepTypeFromPHINode(PHINode *PN, ScalarEvolution *SE) { + // Get the constant step for the induction phi so we can use it to calculate + // how much we should increase the induction for prefetching. + Type *T = PN->getType(); + if (!T->isPointerTy()) + return T; + + const SCEV *LSCEV = SE->getSCEV(PN); + const SCEVAddRecExpr *LSCEVAddRec = dyn_cast(LSCEV); + if (const SCEVConstant *ConstPtrDiff = + dyn_cast(LSCEVAddRec->getStepRecurrence(*SE))) + return ConstPtrDiff->getType(); + + return T; +} + +/// This function will take an instr list that contains indirect loads and +/// transform them into prefetchers. E.g. 
Transform following indirect load +/// A[B[i]]: +/// phi indvar [0] [bound] +/// idxB = gep *B, indvar +/// offsetA = load * idxB +/// idxA = gep *A, offsetA +/// valueA = load *idxA +/// To indirect load with prefetchers N iteration ahead: +/// phi indvar [0] [bound] +/// offsetN = add indvar, N +/// offset2N = add indvar, 2N +/// compare = icmp offsetN, bound +/// offsetN = select compare, offsetN, bound +/// preIdxN = gep *B, offsetN +/// preIdx2N = get *B, offset2N +/// call prefetch(preIdx2N) +/// preOffsetA = load preIdxN +/// preIdxA = gep *A, preOffsetA +/// call prefetch(preIdxA) +/// idxB = gep *B, indvar +/// offsetA = load *idxB +/// idxA = gep *A, offsetA +/// valueA = load *idxA +bool LoopDataPrefetch::insertPrefetcherForIndirectLoad( + Loop *L, unsigned Idx, Value *NumIterations, + SmallVector &CandidateMemoryLoads, + SmallSetVector &DependentInsts, + ValueMap &AuxIndBounds, + SmallVectorImpl> &Transforms, + unsigned ItersAhead) { + bool PositiveStep = true; + Instruction *TargetIndirectLoad = CandidateMemoryLoads[Idx]; + IRBuilder<> Builder(TargetIndirectLoad); + Module *M = TargetIndirectLoad->getModule(); + Type *I32Ty = Type::getInt32Ty(TargetIndirectLoad->getParent()->getContext()); + + bool isRandomAccess = false; + bool isCallDependency = false; + for (auto *I : DependentInsts) { + isCallDependency |= isa(I); + if (isCrcHashDataAccess(I, TargetIndirectLoad)) { + isRandomAccess = true; + break; + } + } + // CallInst dependency only support for Random access with CRC. + if (!isRandomAccess && (isCallDependency || !canDoIndirectPrefetch(L))) + return false; + + // If indirect load prefetch is not specified then, exit for non random cases. + if (!IndirectLoadPrefetch && !isRandomAccess) + return false; + + LLVM_DEBUG(dbgs() << "Inserting indirect prefetchers for\t" + << *TargetIndirectLoad << "\twith " << DependentInsts.size() + << " dependent instructions\n"); + + // Keep track of the number of prefetches left to process among the + // DependentInst List. We assume that for given indirectLevel N, we will have + // N prefetches to do, unless we are skipping intermediate loads, then we are + // only doing 1 prefetch. + size_t NumPrefetchesLeft = SkipIntermediate ? 1 : IndirectionLevel; + int64_t Step; + while (!DependentInsts.empty()) { + Instruction *DependentInst = DependentInsts.pop_back_val(); + Instruction *Inst = dyn_cast(DependentInst); + + switch (Inst->getOpcode()) { + case Instruction::PHI: { + // Get the constant step for the induction phi so we can use it to + // calculate how much we should increase the induction for prefetching. + PHINode *PN = dyn_cast(Inst); + Step = getStep(PN, SE); + PositiveStep = isPositiveStep(PN, SE); + Type *InstType = getStepTypeFromPHINode(PN, SE); + if (!PositiveStep) + Step = -Step; + + // Make sure phi node is i64 or i32. + if (!InstType->isIntegerTy(64) && !InstType->isIntegerTy(32)) + return false; + + // Create the bound for this PHI if needed: + if (!AuxIndBounds.count(PN)) + getAuxIndVarBound(L, PN, NumIterations, AuxIndBounds); + + // We create values based on the induction variable so we can use it to + // generate prefetcher later on. The first value (indvar + IterationAhead + // * step) will be used for the load of prefetched address and it must + // not exceeding the bound. The second value (indvar + 2 * IterationAhead + // * step) will be used to generate prefether for the load of address. + // The subsequent values are generated in a similar fashion to generate + // prefetchers for offset of all dependent loads. 
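+      // An illustrative sketch (made-up value names) for an integer IV with
+      // step 1, ItersAhead = 8 and the default indirection level of 2:
+      //   %offsetN  = add i64 %indvar, 8
+      //   %cmpN     = icmp slt i64 %offsetN, %bound
+      //   %selN     = select i1 %cmpN, i64 %offsetN, i64 %bound
+      //   %offset2N = add i64 %indvar, 16   ; the last level is not clamped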
+ + // Insert the new instruction after all PHI node. + auto InsertionPoint = Inst; + if (auto FirstNonPHI = Inst->getParent()->getFirstNonPHI()) + InsertionPoint = FirstNonPHI->getPrevNode(); + + for (size_t i = 0; i < NumPrefetchesLeft; i++) { + if (i > 0 && SkipIntermediate) + break; + + if (Transforms.size() < i + 1) { + Transforms.push_back(DenseMap()); + } else if (Transforms[i].count(Inst)) + continue; + + // Create the new operation for the target load + Value *NewOp = nullptr; + if (Inst->getType()->isPointerTy()) { + Type *GEPType = getPtrTypefromPHI(PN, Step); + int64_t Offset = + PrefetchIterationsAhead ? PrefetchIterationsAhead : ItersAhead; + if (!PositiveStep) + Offset = -Offset; + // Do not need to calculate Offset * Step as it is calculated + // implicitly within the GEP instruction + NewOp = Builder.CreateInBoundsGEP( + GEPType, Inst, + ConstantInt::getSigned(InstType, (i + 1) * Offset)); + } else { + // FullStep is the initial offset for the new value, taking into + // account, both Step and the number of iterations ahead to prefetch. + // If indirect prefetch iterations ahead is enabled, we directly use + // the supplied indirect-prefetch-iters-ahead value. + int64_t FullStep = PrefetchIterationsAhead + ? PrefetchIterationsAhead * Step + : ItersAhead * Step; + + Instruction::BinaryOps BiOp = + PositiveStep ? Instruction::Add : Instruction::Sub; + NewOp = Builder.CreateBinOp( + BiOp, Inst, + ConstantInt::get(Inst->getType(), (i + 1) * FullStep)); + } + + if (auto NewOpInstr = dyn_cast(NewOp)) { + NewOpInstr->moveAfter(InsertionPoint); + InsertionPoint = NewOpInstr; + } + + // Create the new operations for the offset loads + if (i > 0 && i == NumPrefetchesLeft - 1) { + Transforms[i].insert(std::pair(Inst, NewOp)); + } else { + Value *NewCmp = Builder.CreateICmp( + PositiveStep ? CmpInst::ICMP_SLT : CmpInst::ICMP_SGT, NewOp, + AuxIndBounds[cast(Inst)]); + Value *NewSelect = + Builder.CreateSelect(NewCmp, NewOp, AuxIndBounds[PN]); + Transforms[i].insert(std::pair(Inst, NewSelect)); + + if (auto NewCmpInstr = dyn_cast(NewCmp)) { + NewCmpInstr->moveAfter(InsertionPoint); + InsertionPoint = NewCmpInstr; + } + + if (auto NewSelectInstr = dyn_cast(NewSelect)) { + NewSelectInstr->moveAfter(InsertionPoint); + InsertionPoint = NewSelectInstr; + } + } + } + break; + } + case Instruction::Load: { + LoadInst *LoadI = dyn_cast(Inst); + Value *LoadPtr = LoadI->getPointerOperand(); + if (!SkipIntermediate) + NumPrefetchesLeft--; + + auto GeneratePrefetcher = [&](llvm::Value *PrefetchPtr) { + Function *PrefetchFunc = Intrinsic::getDeclaration( + M, Intrinsic::prefetch, LoadPtr->getType()); + Value *PrefetchArg[] = {PrefetchPtr, ConstantInt::get(I32Ty, 0), + ConstantInt::get(I32Ty, 3), + ConstantInt::get(I32Ty, 1)}; + CallInst *PrefetchCall = CallInst::Create(PrefetchFunc, PrefetchArg); + return PrefetchCall; + }; + + auto CloneNonFaultyLoad = [&](LoadInst *Load, int PrefetchLevel, + Value *LoadOperand) { + auto DL = Load->getParent()->getModule()->getDataLayout(); + auto ScalableWidth = + TTI->getRegisterBitWidth(TargetTransformInfo::RGK_ScalableVector) + .getKnownMinValue(); + + auto LDType = Load->getType(); + unsigned LDTypeSize = (LDType->isPointerTy()) + ? 
DL.getTypeStoreSizeInBits(LDType) + : LDType->getScalarSizeInBits(); + int ElementCount = ScalableWidth / LDTypeSize; + Type *PredTy = ScalableVectorType::get( + Type::getInt1Ty(Load->getParent()->getContext()), ElementCount); + auto *PTruePat = + ConstantInt::get(I32Ty, 1 /* = AArch64SVEPredPattern::vl1*/); + + auto *PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, + {PredTy}, {PTruePat}); + PTrue->moveAfter(Load); + Type *ScaledLoadTy = ScalableVectorType::get(LDType, ElementCount); + auto *Ldnf1 = Builder.CreateIntrinsic( + Intrinsic::aarch64_sve_ldnf1, {ScaledLoadTy}, {PTrue, LoadOperand}); + Ldnf1->moveAfter(PTrue); + auto *ExtractIdx = ConstantInt::get(I32Ty, 0); + Instruction *Element = dyn_cast( + Builder.CreateExtractElement(Ldnf1, ExtractIdx)); + Element->moveAfter(Ldnf1); + Ldnf1->replaceUsesOfWith(LoadOperand, + Transforms[PrefetchLevel][LoadOperand]); + return Element; + }; + + if (!DependentInsts.empty()) { + // For any intermediate (not last) load, we generate a load for all the + // offset at min(indvar+N*IterationsAhead*step, bound)] for each N up to + // NumPrefetchesLeft - 1, and generate a prefetcher at + // (indvar+(N+1)*IterationAhead*step) for the offset load. + Instruction *PrefetchOffsetLoad = nullptr; + for (size_t i = 0; i < NumPrefetchesLeft; i++) { + if (Transforms[i].count(LoadI)) + continue; + if (isSupportsNonFaultyLoad(M)) + PrefetchOffsetLoad = CloneNonFaultyLoad(LoadI, i, LoadPtr); + else { + PrefetchOffsetLoad = LoadI->clone(); + Builder.Insert(PrefetchOffsetLoad); + PrefetchOffsetLoad->moveAfter(LoadI); + PrefetchOffsetLoad->replaceUsesOfWith(LoadPtr, + Transforms[i][LoadPtr]); + } + Transforms[i].insert( + std::pair(LoadI, PrefetchOffsetLoad)); + } + + if (SkipIntermediate) + break; + + // Create a prefetcher for the offset load. + if (PrefetchOffsetLoad) { + CallInst *PrefetchCall = + GeneratePrefetcher(Transforms[NumPrefetchesLeft][LoadPtr]); + PrefetchCall->insertAfter(PrefetchOffsetLoad); + NumIndPrefetches++; + } + } else { + CallInst *PrefetchCall = GeneratePrefetcher(Transforms[0][LoadPtr]); + PrefetchCall->insertAfter(LoadI); + NumIndPrefetches++; + } + break; + } + default: { + // For other types of instructions, we make a clone of the instruction and + // replace operands that we already transformed before. + for (size_t j = 0; j < NumPrefetchesLeft; j++) { + if (j >= Transforms.size() || Transforms[j].count(Inst)) + continue; + Instruction *TransformedInst = Inst->clone(); + Builder.Insert(TransformedInst); + TransformedInst->moveAfter(Inst); + for (unsigned i = 0; i < TransformedInst->getNumOperands(); i++) { + Value *Operand = TransformedInst->getOperand(i); + if (Transforms[j].count(Operand)) + TransformedInst->replaceUsesOfWith(Operand, Transforms[j][Operand]); + } + + Transforms[j].insert( + std::pair(Inst, TransformedInst)); + } + break; + } + } + } + return true; +} + +/// Find the indirect load that depends on the auxiliary induction variable and +/// construct an instr list that contains loop variant instruction from the +/// target load to the candidate phi instr. 
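+/// The search is a depth-first walk over the operands of the target load; it
+/// succeeds once it reaches a PHI node contained in LoopAuxIndPHINodes. For a
+/// target load of the form `A[B[i]]` (illustrative) the collected chain is
+/// roughly: phi %i -> gep %B, %i -> load (offset) -> gep %A, offset -> load
+/// (target).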
+bool LoopDataPrefetch::findCandidateMemoryLoads( + Instruction *I, SmallSetVector &InstList, + SmallPtrSet &InstSet, + SmallVector &CandidateMemoryLoads, + std::vector> &DependentInstList, + SmallPtrSet LoopAuxIndPHINodes) { + bool ret = false; + + for (Use &U : I->operands()) { + // If value is loop invariant, just continue + if (LI->getLoopFor(I->getParent())->isLoopInvariant(U.get())) + continue; + + Instruction *OperandInst = dyn_cast(U.get()); + if (OperandInst != nullptr) { + switch (OperandInst->getOpcode()) { + case Instruction::Load: { + // Check if the load instruction that it depends on is already in the + // candidate. If yes, add the candidate's depending instr to the list. + // If not, the load instruction it depends on is invalid. + LoadInst *LoadI = dyn_cast(OperandInst); + if (isLoadInCandidateMemoryLoads(LoadI, InstList, InstSet, + CandidateMemoryLoads, + DependentInstList)) { + // We do not return early in case there are other auxiliary induction + // variables to check. + ret = true; + } + break; + } + case Instruction::PHI: { + // Check if PHI is the loop auxiliary induction PHI. If yes, found a + // valid load dependent on loop auxiliary induction variable. If not, + // invalid candidate. + PHINode *PhiInst = dyn_cast(OperandInst); + if (LoopAuxIndPHINodes.contains(PhiInst)) { + // In order to prevent the size of SmallVector from going out of + // bounds for large cases, only the last access of the element is + // retained. Update the position of OperandInst in the InstList. + if (InstList.count(OperandInst)) + InstList.remove(OperandInst); + InstList.insert(OperandInst); + return true; + } + break; + } + case Instruction::Call: { + if (OperandInst->mayReadOrWriteMemory()) + return false; + CallInst *Call = dyn_cast(OperandInst); + if (!Call->doesNotThrow()) + return false; + + // Use DFS to search though the operands. + InstList.insert(OperandInst); + if (findCandidateMemoryLoads(OperandInst, InstList, InstSet, + CandidateMemoryLoads, DependentInstList, + LoopAuxIndPHINodes)) { + // We do not return early in case there are other auxiliary + // induction variable to check + ret = true; + } else { + // If the Operand isn't dependent on an auxiliary induction + // variable, remove any instructions added to DependentInstList from + // this operand + if (InstList.count(OperandInst)) + InstList.remove(OperandInst); + InstList.insert(OperandInst); + return false; + } + break; + } + case Instruction::Invoke: { + // We currently can not handle case where indirect load depends on other + // functions yet. + return false; + } + default: { + // Use DFS to search though the operands. + if (InstList.count(OperandInst)) + InstList.remove(OperandInst); + InstList.insert(OperandInst); + if (findCandidateMemoryLoads(OperandInst, InstList, InstSet, + CandidateMemoryLoads, DependentInstList, + LoopAuxIndPHINodes)) { + // We do not return early in case there are other auxiliary induction + // variables to check + ret = true; + } else { + // If the operand isn't dependent on an auxiliary induction variable, + // remove any instructions added to DependentInstList from this + // operand + InstList.remove(OperandInst); + } + } + } + } + } + return ret; +} + +/// Helper function to determine whether the given load is in +/// CandidateMemoryLoads. If Yes, add the candidate's depending instr to the +/// list. 
+bool LoopDataPrefetch::isLoadInCandidateMemoryLoads( + LoadInst *LoadI, SmallSetVector &InstList, + SmallPtrSet &InstSet, + SmallVector &CandidateMemoryLoads, + std::vector> &DependentInstList) { + size_t CandidateLoadIndex = 0; + for (auto CandidateMemoryLoad : CandidateMemoryLoads) { + if (LoadI == CandidateMemoryLoad) + break; + CandidateLoadIndex++; + } + + if (CandidateLoadIndex >= CandidateMemoryLoads.size() || InstSet.count(LoadI)) + return false; + + for (auto CandidateInst : DependentInstList[CandidateLoadIndex]) { + if (InstList.count(CandidateInst)) + InstList.remove(CandidateInst); + InstList.insert(CandidateInst); + InstSet.insert(CandidateInst); + } + return true; +} + +/// Returns whether the given loop should be processed to insert prefetches for +/// indirect loads. +bool LoopDataPrefetch::canDoIndirectPrefetch(Loop *L) { + // Support inner most loops in a simple form. However, the parent of inner + // loop will be processed as well in the case of nested loops. If + // indirectLevel is low, only allow one block loop, otherwise, allow up to 5 + // under certain conditions. + if (!L->isInnermost() || !L->getLoopPreheader() || + (IndirectionLevel <= 3 && L->getNumBlocks() != 1) || + (IndirectionLevel > 3 && L->getNumBlocks() == 1) || L->getNumBlocks() > 5) + return false; + return true; +} + +/// Check if the load depends on Crc Hash functions. +bool LoopDataPrefetch::isCrcHashDataAccess(Instruction *I, + Instruction *PrefetchingLoad) { + if (llvm::IntrinsicInst *II = dyn_cast(I)) + // If CRC functions are used for offset calculation then offset will be + // random. To avoid cache misses, data prefetch is needed. + switch (II->getIntrinsicID()) { + case Intrinsic::aarch64_crc32b: + case Intrinsic::aarch64_crc32cb: + case Intrinsic::aarch64_crc32h: + case Intrinsic::aarch64_crc32ch: + case Intrinsic::aarch64_crc32w: + case Intrinsic::aarch64_crc32cw: + case Intrinsic::aarch64_crc32x: + case Intrinsic::aarch64_crc32cx: { + // Checking Candidate load is incremented by 1. + if (auto *LI = dyn_cast(PrefetchingLoad)) { + if (auto *GEPI = dyn_cast(LI->getPointerOperand())) { + // The data access will be consecutive, if the gep has one indices. + if (GEPI->getNumOperands() > 2) + return false; + auto *PtrIndices = dyn_cast(GEPI->getOperand(1)); + if (!PtrIndices || isa(PtrIndices)) + return true; + for (auto &U : PtrIndices->uses()) + if (auto *PN = dyn_cast(U.getUser())) + if (getStep(PN, SE) <= 1) + return true; + } + } + break; + } + } + return false; +} + +bool LoopDataPrefetch::isIntermediateLoadSupported( + Loop *L, LoadInst *&CandidateLoad, + SmallSetVector &InstList) { + BasicBlock *DependentBB = nullptr; + for (auto *I : InstList) { + if (isSupportsNonFaultyLoad(CandidateLoad->getModule())) { + if (LoadInst *IntermediateLoad = dyn_cast(I)) { + if (IntermediateLoad == CandidateLoad) + continue; + // If intermediate load is scalable then, using sve non-faulting + // can be used. + auto *LoadTy = IntermediateLoad->getType(); + if (!LoadTy->isIntegerTy() && !LoadTy->isFloatingPointTy() && + !LoadTy->isPointerTy()) { + return false; + } + } + } else { + // If the intermediate load is in a differnt basicblock then, there is + // a chance of segmenttation fault. 
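+      // (The cloned offset load runs ItersAhead iterations early and is not
+      // protected by the control flow guarding the original load, so allowing
+      // it to come from a different block inside the loop could touch memory
+      // the original program never accesses.)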
+ if (DependentBB && isa(I) && DependentBB != I->getParent() && + L->contains(I->getParent())) { + return false; + } + if (L->contains(I->getParent())) + DependentBB = I->getParent(); + } + } + return true; +} + +bool LoopDataPrefetch::doIndirectPrefetch(Loop *L, unsigned ItersAhead) { + // List of valid phi nodes that indirect loads can depend on. + SmallPtrSet LoopAuxIndPHINodes; + // Map of valid phi node to its bound value in the preheader. + ValueMap AuxIndBounds; + // Candidate memory loads in the loop. + SmallVector CandidateMemoryLoads; + // List of instruction from phi to load. + std::vector> DependentInstList; + // List of store instr in the loop. + SmallVector LoopStorePtrs; + bool MadeChange = false; + + // Get loop induction and auxiliary induction phis. (They will be candidates + // for phi node matching during construction of the candidate instructions.) + // And we use the phi nodes to determine the loop upperbound. + Value *NumIterations = + getLoopIterationNumber(L, LoopAuxIndPHINodes, AuxIndBounds); + if (NumIterations == nullptr) + return MadeChange; + + if (!RandomAccessPrefetch && !canDoIndirectPrefetch(L)) { + cleanLoopIterationNumber(NumIterations); + return MadeChange; + } + + // Find candidate auxiliary induction variables which could be a dependent for + // the indirect load. + for (auto &I : *L->getHeader()) + if (PHINode *PHI = dyn_cast(&I)) { + InductionDescriptor IndDesc; + if (InductionDescriptor::isInductionPHI(PHI, L, SE, IndDesc) && + L->getInductionVariable(*SE) != PHI) { + if (canGetAuxIndVarBound(L, PHI, LoopAuxIndPHINodes)) + LoopAuxIndPHINodes.insert(PHI); + } + } + + // Will search for candidates in the parent loop of the current inner most + // loop. This will capture more opportunities in the outer loop. + SmallVector BBList; + for (auto &BB : L->blocks()) + BBList.push_back(BB); + if (L->getParentLoop()) + for (auto &BB : L->getParentLoop()->blocks()) { + // We don't want to repeat blocks in the case of nested loops. + if (L->contains(BB)) + continue; + BBList.push_back(BB); + } + + // Iterate through the loop and keep track of the memory loads and the + // instruction list they depend on. + for (const auto BB : BBList) { + for (auto &I : *BB) + if (LoadInst *LoadI = dyn_cast(&I)) { + SmallSetVector InstList; + SmallSet InstSet; + InstList.insert(LoadI); + InstSet.insert(LoadI); + if (findCandidateMemoryLoads(LoadI, InstList, InstSet, + CandidateMemoryLoads, DependentInstList, + LoopAuxIndPHINodes)) { + if (!isIntermediateLoadSupported(L, LoadI, InstList)) + continue; + LLVM_DEBUG(dbgs() << "Found load candidate " << *LoadI << "\n"); + CandidateMemoryLoads.push_back(LoadI); + DependentInstList.push_back(InstList); + } + } else if (StoreInst *StoreI = dyn_cast(&I)) { + // Keep track of store insts to avoid conflict. + LoopStorePtrs.push_back(StoreI->getPointerOperand()); + } + } + + // Keep track of previously transformed instrs for offset load and target + // loads so we can reuse them. + SmallVector> Transforms; + for (unsigned i = 0; i < CandidateMemoryLoads.size(); i++) { + SmallSetVector DependentInsts = DependentInstList[i]; + unsigned NumLoads = 0; + bool NoConflict = true; + // Find candidate that contains indirect loads and check load for offset + // doesn't alias with other stores. + for (auto DependentInst : DependentInsts) { + if (LoadInst *LoadI = dyn_cast(DependentInst)) { + NumLoads++; + // For the load of target address offset, we avoid the load being + // conflict with stores in the same loop. 
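+        // For example (illustrative), in
+        //   B[i] = g(i); ... = A[B[i]];
+        // the prefetcher would read B[i + ItersAhead] before that element is
+        // written, so a must-aliasing store disqualifies the candidate.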
+ if (NumLoads == IndirectionLevel) { + Value *LoadPtr = LoadI->getPointerOperand(); + for (Value *StorePtr : LoopStorePtrs) + if (AA->isMustAlias(LoadPtr, StorePtr)) { + NoConflict = false; + break; + } + } + } + } + + // Prefetch all indirect loads without conflict to the offset load. + if (NumLoads == IndirectionLevel && NoConflict) { + MadeChange |= insertPrefetcherForIndirectLoad( + L, i, NumIterations, CandidateMemoryLoads, DependentInsts, + AuxIndBounds, Transforms, ItersAhead); + } + } + + cleanLoopIterationNumber(NumIterations); + return MadeChange; +} + PreservedAnalyses LoopDataPrefetchPass::run(Function &F, FunctionAnalysisManager &AM) { + AliasAnalysis *AA = &AM.getResult(F); DominatorTree *DT = &AM.getResult(F); LoopInfo *LI = &AM.getResult(F); ScalarEvolution *SE = &AM.getResult(F); @@ -179,8 +1125,16 @@ PreservedAnalyses LoopDataPrefetchPass::run(Function &F, &AM.getResult(F); const TargetTransformInfo *TTI = &AM.getResult(F); - LoopDataPrefetch LDP(AC, DT, LI, SE, TTI, ORE); - bool Changed = LDP.run(); + // Ensure loops are in simplified form which is a pre-requisite for loop data + // prefetch pass. Added only for new PM since the legacy PM has already added + // LoopSimplify pass as a dependency. + bool Changed = false; + for (auto &L : *LI) { + Changed |= simplifyLoop(L, DT, LI, SE, AC, nullptr, false); + } + + LoopDataPrefetch LDP(AA, AC, DT, LI, SE, TTI, ORE); + Changed |= LDP.run(); if (Changed) { PreservedAnalyses PA; @@ -196,6 +1150,7 @@ bool LoopDataPrefetchLegacyPass::runOnFunction(Function &F) { if (skipFunction(F)) return false; + AliasAnalysis *AA = &getAnalysis().getAAResults(); DominatorTree *DT = &getAnalysis().getDomTree(); LoopInfo *LI = &getAnalysis().getLoopInfo(); ScalarEvolution *SE = &getAnalysis().getSE(); @@ -206,7 +1161,7 @@ bool LoopDataPrefetchLegacyPass::runOnFunction(Function &F) { const TargetTransformInfo *TTI = &getAnalysis().getTTI(F); - LoopDataPrefetch LDP(AC, DT, LI, SE, TTI, ORE); + LoopDataPrefetch LDP(AA, AC, DT, LI, SE, TTI, ORE); return LDP.run(); } @@ -214,14 +1169,26 @@ bool LoopDataPrefetch::run() { // If PrefetchDistance is not set, don't run the pass. This gives an // opportunity for targets to run this pass for selected subtargets only // (whose TTI sets PrefetchDistance and CacheLineSize). - if (getPrefetchDistance() == 0 || TTI->getCacheLineSize() == 0) { + if (getPrefetchDistance() == 0 || + (TTI->getCacheLineSize() == 0 && CachelineSize == 0)) { LLVM_DEBUG(dbgs() << "Please set both PrefetchDistance and CacheLineSize " "for loop data prefetch.\n"); return false; } bool MadeChange = false; + if (DisableDirectLoadPrefetch.getNumOccurrences() == 0 && + !TTI->isProfitableToDirectPrefetch()) { + LLVM_DEBUG(dbgs() << "Disabling direct load prefetching.\n"); + DisableDirectLoadPrefetch = true; + } + if (RandomAccessPrefetch) { + OuterLoopPrefetch = true; + } + if (DisableDirectLoadPrefetch && !IndirectLoadPrefetch && + !RandomAccessPrefetch) + return MadeChange; for (Loop *I : *LI) for (Loop *L : depth_first(I)) MadeChange |= runOnLoop(L); @@ -274,10 +1241,18 @@ struct Prefetch { bool LoopDataPrefetch::runOnLoop(Loop *L) { bool MadeChange = false; - // Only prefetch in the inner-most loop - if (!L->isInnermost()) + if (L->getLoopDepth() < PrefetchLoopDepth) return MadeChange; + bool IsInnerMost = true; + // Prefetch outer loop if needed. 
+ if (!L->isInnermost()) { + if (OuterLoopPrefetch) + IsInnerMost = false; + else + return MadeChange; + } + SmallPtrSet EphValues; CodeMetrics::collectEphemeralValues(L, AC, EphValues); @@ -323,78 +1298,101 @@ bool LoopDataPrefetch::runOnLoop(Loop *L) { unsigned NumMemAccesses = 0; unsigned NumStridedMemAccesses = 0; SmallVector Prefetches; - for (const auto BB : L->blocks()) - for (auto &I : *BB) { - Value *PtrValue; - Instruction *MemI; - - if (LoadInst *LMemI = dyn_cast(&I)) { - MemI = LMemI; - PtrValue = LMemI->getPointerOperand(); - } else if (StoreInst *SMemI = dyn_cast(&I)) { - if (!doPrefetchWrites()) continue; - MemI = SMemI; - PtrValue = SMemI->getPointerOperand(); - } else continue; - - unsigned PtrAddrSpace = PtrValue->getType()->getPointerAddressSpace(); - if (!TTI->shouldPrefetchAddressSpace(PtrAddrSpace)) - continue; - NumMemAccesses++; - if (L->isLoopInvariant(PtrValue)) - continue; - - const SCEV *LSCEV = SE->getSCEV(PtrValue); - const SCEVAddRecExpr *LSCEVAddRec = dyn_cast(LSCEV); - if (!LSCEVAddRec) - continue; - NumStridedMemAccesses++; - - // We don't want to double prefetch individual cache lines. If this - // access is known to be within one cache line of some other one that - // has already been prefetched, then don't prefetch this one as well. - bool DupPref = false; - for (auto &Pref : Prefetches) { - const SCEV *PtrDiff = SE->getMinusSCEV(LSCEVAddRec, Pref.LSCEVAddRec); - if (const SCEVConstant *ConstPtrDiff = - dyn_cast(PtrDiff)) { - int64_t PD = std::abs(ConstPtrDiff->getValue()->getSExtValue()); - if (PD < (int64_t) TTI->getCacheLineSize()) { - Pref.addInstruction(MemI, DT, PD); - DupPref = true; - break; + if (!DisableDirectLoadPrefetch) { + for (const auto BB : L->blocks()) { + // If this is not inner most, we avoid prefetching in sub loops. + for (auto &I : *BB) { + Value *PtrValue = nullptr; + Instruction *MemI; + + if (LoadInst *LMemI = dyn_cast(&I)) { + MemI = LMemI; + PtrValue = LMemI->getPointerOperand(); + } else if (StoreInst *SMemI = dyn_cast(&I)) { + if (!doPrefetchWrites()) + continue; + MemI = SMemI; + PtrValue = SMemI->getPointerOperand(); + } else + continue; + + if (!PtrValue) + continue; + if (getPrefetchDistance() == 0) + continue; + + unsigned PtrAddrSpace = PtrValue->getType()->getPointerAddressSpace(); + if (!TTI->shouldPrefetchAddressSpace(PtrAddrSpace)) + continue; + NumMemAccesses++; + if (L->isLoopInvariant(PtrValue)) + continue; + + const SCEV *LSCEV = SE->getSCEV(PtrValue); + const SCEVAddRecExpr *LSCEVAddRec = dyn_cast(LSCEV); + if (!LSCEVAddRec) + continue; + NumStridedMemAccesses++; + + // For outer loops, we only prefetch memory instruction with stride + // depending on the current loop. + if (!IsInnerMost && LSCEVAddRec->getLoop() != L) + continue; + + // We don't want to double prefetch individual cache lines. If this + // access is known to be within one cache line of some other one that + // has already been prefetched, then don't prefetch this one as well. + bool DupPref = false; + for (auto &Pref : Prefetches) { + const SCEV *PtrDiff = SE->getMinusSCEV(LSCEVAddRec, Pref.LSCEVAddRec); + if (const SCEVConstant *ConstPtrDiff = + dyn_cast(PtrDiff)) { + int64_t PD = std::abs(ConstPtrDiff->getValue()->getSExtValue()); + // Use the CachelineSize value from compiler option. + int64_t CacheLineSize = CachelineSize.getNumOccurrences() + ? CachelineSize + : TTI->getCacheLineSize(); + // if TTI CacheLineSize is zero then, default CachelineSize will + // use. + CacheLineSize = CacheLineSize ? 
CacheLineSize : CachelineSize; + if (PD < (int64_t)CacheLineSize) { + Pref.addInstruction(MemI, DT, PD); + DupPref = true; + break; + } } } + if (!DupPref) + Prefetches.push_back(Prefetch(LSCEVAddRec, MemI)); } - if (!DupPref) - Prefetches.push_back(Prefetch(LSCEVAddRec, MemI)); } + } - unsigned TargetMinStride = - getMinPrefetchStride(NumMemAccesses, NumStridedMemAccesses, - Prefetches.size(), HasCall); + unsigned TargetMinStride = getMinPrefetchStride( + NumMemAccesses, NumStridedMemAccesses, Prefetches.size(), HasCall); LLVM_DEBUG(dbgs() << "Prefetching " << ItersAhead - << " iterations ahead (loop size: " << LoopSize << ") in " - << L->getHeader()->getParent()->getName() << ": " << *L); - LLVM_DEBUG(dbgs() << "Loop has: " - << NumMemAccesses << " memory accesses, " - << NumStridedMemAccesses << " strided memory accesses, " - << Prefetches.size() << " potential prefetch(es), " - << "a minimum stride of " << TargetMinStride << ", " - << (HasCall ? "calls" : "no calls") << ".\n"); + << " iterations ahead (loop size: " << LoopSize << ") in " + << L->getHeader()->getParent()->getName() << ": " << *L); + LLVM_DEBUG(dbgs() << "Loop has: " << NumMemAccesses << " memory accesses, " + << NumStridedMemAccesses << " strided memory accesses, " + << Prefetches.size() << " potential prefetch(es), " + << "a minimum stride of " << TargetMinStride << ", " + << (HasCall ? "calls" : "no calls") << ".\n"); for (auto &P : Prefetches) { // Check if the stride of the accesses is large enough to warrant a - // prefetch. + // prefetch. If MinPrefetchStride <= 1, no need to check if any stride + // goes. if (!isStrideLargeEnough(P.LSCEVAddRec, TargetMinStride)) continue; BasicBlock *BB = P.InsertPt->getParent(); SCEVExpander SCEVE(*SE, BB->getModule()->getDataLayout(), "prefaddr"); - const SCEV *NextLSCEV = SE->getAddExpr(P.LSCEVAddRec, SE->getMulExpr( - SE->getConstant(P.LSCEVAddRec->getType(), ItersAhead), - P.LSCEVAddRec->getStepRecurrence(*SE))); + const SCEV *NextLSCEV = SE->getAddExpr( + P.LSCEVAddRec, + SE->getMulExpr(SE->getConstant(P.LSCEVAddRec->getType(), ItersAhead), + P.LSCEVAddRec->getStepRecurrence(*SE))); if (!SCEVE.isSafeToExpand(NextLSCEV)) continue; @@ -405,24 +1403,26 @@ bool LoopDataPrefetch::runOnLoop(Loop *L) { IRBuilder<> Builder(P.InsertPt); Module *M = BB->getParent()->getParent(); Type *I32 = Type::getInt32Ty(BB->getContext()); - Function *PrefetchFunc = Intrinsic::getDeclaration( - M, Intrinsic::prefetch, PrefPtrValue->getType()); - Builder.CreateCall( - PrefetchFunc, - {PrefPtrValue, - ConstantInt::get(I32, P.Writes), - ConstantInt::get(I32, 3), ConstantInt::get(I32, 1)}); + Function *PrefetchFunc = Intrinsic::getDeclaration(M, Intrinsic::prefetch, + PrefPtrValue->getType()); + Builder.CreateCall(PrefetchFunc, + {PrefPtrValue, ConstantInt::get(I32, P.Writes), + ConstantInt::get(I32, IsInnerMost ? 3 : 0), + ConstantInt::get(I32, 1)}); ++NumPrefetches; LLVM_DEBUG(dbgs() << " Access: " - << *P.MemI->getOperand(isa(P.MemI) ? 0 : 1) - << ", SCEV: " << *P.LSCEVAddRec << "\n"); + << *P.MemI->getOperand(isa(P.MemI) ? 
0 : 1) + << ", SCEV: " << *P.LSCEVAddRec << "\n"); ORE->emit([&]() { - return OptimizationRemark(DEBUG_TYPE, "Prefetched", P.MemI) - << "prefetched memory access"; - }); + return OptimizationRemark(DEBUG_TYPE, "Prefetched", P.MemI) + << "prefetched memory access"; + }); MadeChange = true; } + if (IndirectLoadPrefetch || RandomAccessPrefetch) + MadeChange |= doIndirectPrefetch(L, ItersAhead); + return MadeChange; } diff --git a/llvm/test/Transforms/LoopDataPrefetch/AArch64/indirect-load-prefetch_crc.ll b/llvm/test/Transforms/LoopDataPrefetch/AArch64/indirect-load-prefetch_crc.ll new file mode 100644 index 000000000000..e6a37ae38f06 --- /dev/null +++ b/llvm/test/Transforms/LoopDataPrefetch/AArch64/indirect-load-prefetch_crc.ll @@ -0,0 +1,109 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -passes=loop-data-prefetch --prefetch-distance=1024 --random-access-prefetch=true -disable-direct-prefetch -S | FileCheck %s --check-prefixes=CHECK,CHECK-NON-SVE +; RUN: opt < %s -mcpu=tsv110 -passes=loop-data-prefetch --random-access-prefetch=true -disable-direct-prefetch -S | FileCheck %s --check-prefixes=CHECK,CHECK-NON-SVE +; RUN: opt < %s -mcpu=hip09 -passes=loop-data-prefetch --random-access-prefetch=true --prefetch-with-nonfaulty-load=false -S | FileCheck %s --check-prefixes=CHECK,CHECK-NON-SVE +; RUN: opt < %s -mcpu=hip12 -passes=loop-data-prefetch --random-access-prefetch=true -S | FileCheck %s --check-prefixes=CHECK,CHECK-SVE + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64-unknown-linux-gun" + +declare i32 @llvm.aarch64.crc32w(i32, i32) + +; Function Attrs: mustprogress nofree nosync nounwind willreturn memory(argmem: read) uwtable +define dso_local noundef i32 @_Z12matchcolumnsPiiS_ii(ptr nocapture noundef readonly %A, i32 noundef %B, ptr nocapture noundef readonly %Key, i32 noundef %index, i32 noundef %count) { +; CHECK-LABEL: @_Z12matchcolumnsPiiS_ii( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret i32 [[ADD:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV22:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT23:%.*]], [[IF_END:%.*]] ] +; CHECK-NEXT: [[SUM_020:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[ADD]], [[IF_END]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDVARS_IV22]], 60 +; CHECK-NEXT: [[TMP1:%.*]] = icmp slt i64 [[TMP0]], 99 +; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i64 [[TMP0]], i64 99 +; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDVARS_IV22]], 120 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDVARS_IV22]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-NON-SVE-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 4 +; CHECK-SVE-NEXT: [[TMP7_1:%.*]] = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 1) +; CHECK-SVE-NEXT: [[TMP7_2:%.*]] = call @llvm.aarch64.sve.ldnf1.nxv4i32( [[TMP7_1]], ptr [[TMP5]]) +; CHECK-SVE-NEXT: [[TMP7:%.*]] = extractelement [[TMP7_2]], i32 0 +; CHECK-NEXT: call void @llvm.prefetch.p0(ptr [[TMP4]], i32 0, i32 3, i32 1) +; CHECK-NEXT: [[TMP8:%.*]] = tail call i32 @llvm.aarch64.crc32w(i32 [[TMP6]], i32 -1) +; CHECK-NEXT: [[TMP9:%.*]] = tail call i32 @llvm.aarch64.crc32w(i32 [[TMP7]], i32 -1) +; CHECK-NEXT: [[AND:%.*]] = and i32 [[TMP8]], 255 +; CHECK-NEXT: [[TMP10:%.*]] = and 
i32 [[TMP9]], 255 +; CHECK-NEXT: [[IDXPROM1:%.*]] = zext i32 [[AND]] to i64 +; CHECK-NEXT: [[TMP11:%.*]] = zext i32 [[TMP10]] to i64 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[KEY:%.*]], i64 [[IDXPROM1]] +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[KEY]], i64 [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 +; CHECK-NEXT: call void @llvm.prefetch.p0(ptr [[TMP12]], i32 0, i32 3, i32 1) +; CHECK-NEXT: [[CMP3_NOT:%.*]] = icmp eq i32 [[TMP13]], [[B:%.*]] +; CHECK-NEXT: br i1 [[CMP3_NOT]], label [[IF_END]], label [[DO_BODY_PREHEADER:%.*]] +; CHECK: do.body.preheader: +; CHECK-NEXT: br label [[DO_BODY:%.*]] +; CHECK: do.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[DO_BODY]] ], [ [[IDXPROM1]], [[DO_BODY_PREHEADER]] ] +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[KEY]], i64 [[INDVARS_IV_NEXT]] +; CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4 +; CHECK-NEXT: [[CMP6_NOT:%.*]] = icmp eq i32 [[TMP14]], [[B]] +; CHECK-NEXT: br i1 [[CMP6_NOT]], label [[IF_END_LOOPEXIT:%.*]], label [[DO_BODY]] +; CHECK: if.end.loopexit: +; CHECK-NEXT: [[TMP15:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 +; CHECK-NEXT: br label [[IF_END]] +; CHECK: if.end: +; CHECK-NEXT: [[AKEY_1:%.*]] = phi i32 [ [[AND]], [[FOR_BODY]] ], [ [[TMP15]], [[IF_END_LOOPEXIT]] ] +; CHECK-NEXT: [[IDXPROM7:%.*]] = sext i32 [[AKEY_1]] to i64 +; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IDXPROM7]] +; CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr [[ARRAYIDX8]], align 4 +; CHECK-NEXT: [[ADD]] = add nsw i32 [[TMP16]], [[SUM_020]] +; CHECK-NEXT: [[INDVARS_IV_NEXT23]] = add nuw nsw i64 [[INDVARS_IV22]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT23]], 100 +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]] +; +entry: + br label %for.body + +for.cond.cleanup: + ret i32 %add + +for.body: + %indvars.iv22 = phi i64 [ 0, %entry ], [ %indvars.iv.next23, %if.end ] + %sum.020 = phi i32 [ 0, %entry ], [ %add, %if.end ] + %arrayidx = getelementptr inbounds i32, ptr %A, i64 %indvars.iv22 + %0 = load i32, ptr %arrayidx, align 4 + %1 = tail call i32 @llvm.aarch64.crc32w(i32 %0, i32 -1) + %and = and i32 %1, 255 + %idxprom1 = zext i32 %and to i64 + %arrayidx2 = getelementptr inbounds i32, ptr %Key, i64 %idxprom1 + %2 = load i32, ptr %arrayidx2, align 4 + %cmp3.not = icmp eq i32 %2, %B + br i1 %cmp3.not, label %if.end, label %do.body + +do.body: + %indvars.iv = phi i64 [ %idxprom1, %for.body ], [ %indvars.iv.next, %do.body ] + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %arrayidx5 = getelementptr inbounds i32, ptr %Key, i64 %indvars.iv.next + %3 = load i32, ptr %arrayidx5, align 4 + %cmp6.not = icmp eq i32 %3, %B + br i1 %cmp6.not, label %if.end.loopexit, label %do.body + +if.end.loopexit: + %4 = trunc i64 %indvars.iv.next to i32 + br label %if.end + +if.end: + %AKey.1 = phi i32 [ %and, %for.body ], [ %4, %if.end.loopexit ] + %idxprom7 = sext i32 %AKey.1 to i64 + %arrayidx8 = getelementptr inbounds i32, ptr %A, i64 %idxprom7 + %5 = load i32, ptr %arrayidx8, align 4 + %add = add nsw i32 %5, %sum.020 + %indvars.iv.next23 = add nuw nsw i64 %indvars.iv22, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next23, 100 + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} -- Gitee