diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h index ce3ff2ee7aecab20f159ffb99fda4b758951db09..f9d7d4fbbe5033bdd669fab4f96865f80b93dfb0 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -726,6 +726,8 @@ public: bool isProfitableToLoopVersioning() const; + bool isProfitableToDirectPrefetch() const; + bool useAA() const; /// Return true if this type is legal. @@ -1674,6 +1676,7 @@ public: virtual bool isTruncateFree(Type *Ty1, Type *Ty2) = 0; virtual bool isProfitableToHoist(Instruction *I) = 0; virtual bool isProfitableToLoopVersioning() = 0; + virtual bool isProfitableToDirectPrefetch() = 0; virtual bool useAA() = 0; virtual bool isTypeLegal(Type *Ty) = 0; virtual unsigned getRegUsageForType(Type *Ty) = 0; @@ -2144,6 +2147,9 @@ public: bool isProfitableToLoopVersioning() override { return Impl.isProfitableToLoopVersioning(); } + bool isProfitableToDirectPrefetch() override { + return Impl.isProfitableToDirectPrefetch(); + } bool useAA() override { return Impl.useAA(); } bool isTypeLegal(Type *Ty) override { return Impl.isTypeLegal(Ty); } unsigned getRegUsageForType(Type *Ty) override { diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h index 3a99b02bc3637eed73572cb1d6891eef48e87689..80479ade16f4d126e3288d00262df28736096a3b 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -319,6 +319,8 @@ public: bool isProfitableToLoopVersioning() const { return false; } + bool isProfitableToDirectPrefetch() const { return true; } + bool useAA() const { return false; } bool isTypeLegal(Type *Ty) const { return false; } diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h index 0cd6599d42966a1f8197934a7ecaa953cf986341..50c7ecdde77f13ae047ecca75c4bbfbf6fb6acd8 100644 --- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -403,6 +403,8 @@ public: bool isProfitableToLoopVersioning() const { return false; } + bool isProfitableToDirectPrefetch() const { return true; } + bool useAA() const { return getST()->useAA(); } bool isTypeLegal(Type *Ty) { diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp index e26c7ad3dd90bf505a7a85645352ef7568363dde..02da1e1dc65b96ca94c5b474049829a6e792e0bd 100644 --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -490,6 +490,10 @@ bool TargetTransformInfo::isProfitableToLoopVersioning() const { return TTIImpl->isProfitableToLoopVersioning(); } +bool TargetTransformInfo::isProfitableToDirectPrefetch() const { + return TTIImpl->isProfitableToDirectPrefetch(); +} + bool TargetTransformInfo::useAA() const { return TTIImpl->useAA(); } bool TargetTransformInfo::isTypeLegal(Type *Ty) const { diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp index 52268ec9fa3b42f80eca328bce5f580dc7035d3e..ec481250224b27d50695866ffbc4479d904d5ee2 100644 --- a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp +++ b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp @@ -263,22 +263,28 @@ void AArch64Subtarget::initializeProperties() { break; case TSV110: CacheLineSize = 64; + PrefetchDistance = 1024; PrefFunctionLogAlignment = 4; PrefLoopLogAlignment = 2; + MinPrefetchStride = 4; break; case HIP09: CacheLineSize 
= 64; + PrefetchDistance = 1024; PrefFunctionLogAlignment = 4; PrefLoopLogAlignment = 2; VScaleForTuning = 2; DefaultSVETFOpts = TailFoldingOpts::Simple; + MinPrefetchStride = 4; break; case HIP12: CacheLineSize = 64; + PrefetchDistance = 1024; PrefFunctionLogAlignment = 4; PrefLoopLogAlignment = 2; VScaleForTuning = 2; DefaultSVETFOpts = TailFoldingOpts::Simple; + MinPrefetchStride = 4; break; case ThunderX3T110: CacheLineSize = 64; diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.h b/llvm/lib/Target/AArch64/AArch64Subtarget.h index 48a818455d1a7a81ffa80dd014fd34079ad243f1..557044820d0694bf70edbafb390049336d3d0125 100644 --- a/llvm/lib/Target/AArch64/AArch64Subtarget.h +++ b/llvm/lib/Target/AArch64/AArch64Subtarget.h @@ -213,6 +213,16 @@ public: } } + bool isHiSiliconHIPProc() const { + switch (ARMProcFamily) { + case HIP09: + case HIP12: + return true; + default: + return false; + } + } + bool isXRaySupported() const override { return true; } unsigned getMinVectorRegisterBitWidth() const { diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 6f45c03ae977616e0cb9bc5fd3f8ffb2791095ed..74b27770f35c44993d36b12a4f12a4e2c055a2c9 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -395,6 +395,10 @@ bool AArch64TTIImpl::isProfitableToLoopVersioning() const { return ST->isHiSiliconProc() || ForceEnableExperimentalOpt; } +bool AArch64TTIImpl::isProfitableToDirectPrefetch() const { + return !ST->isHiSiliconHIPProc(); +} + InstructionCost AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) { diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h index 0f4a4d7dc80460651365486d2ca8b1bd0651ef81..eae92c7aa5b6c8eacbba6e121db050ad86ab6a66 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -94,6 +94,8 @@ public: bool isProfitableToLoopVersioning() const; + bool isProfitableToDirectPrefetch() const; + /// @} /// \name Vector TTI Implementations diff --git a/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp b/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp index 7c2770979a900fa7e953a712b192144c27bee3ce..bd395becc40acbeacf1f1fa19210d4074e41e2c1 100644 --- a/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp +++ b/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp @@ -14,21 +14,34 @@ #include "llvm/InitializePasses.h" #include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/ADT/SmallSet.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/CodeMetrics.h" +#include "llvm/Analysis/DomTreeUpdater.h" +#include "llvm/Analysis/IVDescriptors.h" #include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/LoopIterator.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/IntrinsicsAArch64.h" #include "llvm/IR/Module.h" +#include "llvm/IR/ReplaceConstant.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" +#include "llvm/Target/TargetMachine.h" #include "llvm/Transforms/Scalar.h" #include 
"llvm/Transforms/Utils.h" +#include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/LoopSimplify.h" +#include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" #define DEBUG_TYPE "loop-data-prefetch" @@ -37,9 +50,9 @@ using namespace llvm; // By default, we limit this to creating 16 PHIs (which is a little over half // of the allocatable register set). -static cl::opt -PrefetchWrites("loop-prefetch-writes", cl::Hidden, cl::init(false), - cl::desc("Prefetch write addresses")); +static cl::opt PrefetchWrites("loop-prefetch-writes", cl::Hidden, + cl::init(false), + cl::desc("Prefetch write addresses")); static cl::opt PrefetchDistance("prefetch-distance", @@ -54,31 +67,136 @@ static cl::opt MaxPrefetchIterationsAhead( "max-prefetch-iters-ahead", cl::desc("Max number of iterations to prefetch ahead"), cl::Hidden); +static cl::opt + IndirectLoadPrefetch("indirect-load-prefetch", cl::Hidden, cl::init(false), + cl::desc("Enable indirect load prefetch")); + +static cl::opt PrefetchIterationsAhead( + "indirect-prefetch-iters-ahead", + cl::desc("Number of iterations for indirect-load prefetch"), cl::Hidden, + cl::init(0)); + +static cl::opt SkipIntermediate( + "indirect-prefetch-skip-intermediate", cl::Hidden, cl::init(false), + cl::desc( + "Skip prefetching intermediate loads while doing indirect prefetch")); + +static cl::opt IndirectionLevel( + "indirect-level", + cl::desc("Indirection level considered for indirect load prefetch"), + cl::Hidden, cl::init(2)); + +static cl::opt RandomAccessPrefetch( + "random-access-prefetch", cl::Hidden, cl::init(false), + cl::desc("Enable random offset indirect load prefetch")); + +static cl::opt + EnableNonFaultyLoad("prefetch-with-nonfaulty-load", cl::Hidden, + cl::init(false), + cl::desc("Prefetch with non-faulty Load instruction.")); + +static cl::opt CachelineSize("prefetch-cache-line-size", + cl::desc("Specify cache line size"), + cl::Hidden, cl::init(64)); + +static cl::opt + OuterLoopPrefetch("outer-loop-prefetch", cl::Hidden, cl::init(false), + cl::desc("Enable prefetch in outer loops")); + +static cl::opt + DisableDirectLoadPrefetch("disable-direct-prefetch", cl::Hidden, + cl::init(false), + cl::desc("Disable direct load prefetch")); + +static cl::opt + PrefetchLoopDepth("prefetch-loop-depth", + cl::desc("Least loop depth to insert prefetch"), + cl::Hidden, cl::init(1)); + STATISTIC(NumPrefetches, "Number of prefetches inserted"); +STATISTIC(NumIndPrefetches, "Number of indirect prefetches inserted"); namespace { +// Helper function to return a type with the same size as +// given step size +static Type *getPtrTypefromPHI(PHINode *PHI, int64_t StepSize) { + Type *Int8Ty = Type::getInt8Ty(PHI->getParent()->getContext()); + return ArrayType::get(Int8Ty, StepSize); +} + /// Loop prefetch implementation class. 
class LoopDataPrefetch { public: - LoopDataPrefetch(AssumptionCache *AC, DominatorTree *DT, LoopInfo *LI, - ScalarEvolution *SE, const TargetTransformInfo *TTI, + LoopDataPrefetch(AliasAnalysis *AA, AssumptionCache *AC, DominatorTree *DT, + LoopInfo *LI, ScalarEvolution *SE, + const TargetTransformInfo *TTI, OptimizationRemarkEmitter *ORE) - : AC(AC), DT(DT), LI(LI), SE(SE), TTI(TTI), ORE(ORE) {} + : AA(AA), AC(AC), DT(DT), LI(LI), SE(SE), TTI(TTI), ORE(ORE) {} bool run(); private: bool runOnLoop(Loop *L); + Value *getCanonicalishSizeVariable(Loop *L, PHINode *PHI) const; + Value * + getLoopIterationNumber(Loop *L, + SmallPtrSet &LoopAuxIndPHINodes, + ValueMap &AuxIndBounds); + /// If prefetch instruction is not inserted, need to clean iteration + /// instructions in the preheader. + void cleanLoopIterationNumber(Value *NumIterations); + /// Returns whether the auxiliary induction variable can generate bound. + /// If it can, add PHI to LoopAuxIndPHINodes + bool canGetAuxIndVarBound(Loop *L, PHINode *PHI, + SmallPtrSet &LoopAuxIndPHINodes); + + /// Generate bound for the auxiliary induction variable at the + /// preheader and add it to AuxIndBounds. + /// Returns whether the bound was successfully generated. + bool getAuxIndVarBound(Loop *L, PHINode *PHI, Value *NumIterations, + ValueMap &AuxIndBounds); + + bool insertPrefetcherForIndirectLoad( + Loop *L, unsigned Idx, Value *NumIterations, + SmallVector &CandidateMemoryLoads, + SmallSetVector &DependentInsts, + ValueMap &AuxIndBounds, + SmallVectorImpl> &Transforms, + unsigned ItersAhead); + + bool findCandidateMemoryLoads( + Instruction *I, SmallSetVector &InstList, + SmallPtrSet &InstSet, + SmallVector &CandidateMemoryLoads, + std::vector> &DependentInstList, + SmallPtrSet LoopAuxIndPHINodes); + + /// Helper function to determine whether the given load is in + /// CandidateMemoryLoads. If yes, add the candidate's depending inst to the + /// list + bool isLoadInCandidateMemoryLoads( + LoadInst *LoadI, SmallSetVector &InstList, + SmallPtrSet &InstSet, + SmallVector &CandidateMemoryLoads, + std::vector> &DependentInstList); + + /// Returns whether the given loop can do indirect prefetch and should be + /// processed to insert prefetches for indirect loads. + bool canDoIndirectPrefetch(Loop *L); + + bool isCrcHashDataAccess(Instruction *I, Instruction *PrefetchingLoad); + bool isIntermediateLoadSupported(Loop *L, LoadInst *&CandidateLoad, + SmallSetVector &InstList); + bool doIndirectPrefetch(Loop *L, unsigned ItersAhead); /// Check if the stride of the accesses is large enough to /// warrant a prefetch. 
bool isStrideLargeEnough(const SCEVAddRecExpr *AR, unsigned TargetMinStride); unsigned getMinPrefetchStride(unsigned NumMemAccesses, unsigned NumStridedMemAccesses, - unsigned NumPrefetches, - bool HasCall) { + unsigned NumPrefetches, bool HasCall) { if (MinPrefetchStride.getNumOccurrences() > 0) return MinPrefetchStride; return TTI->getMinPrefetchStride(NumMemAccesses, NumStridedMemAccesses, @@ -103,6 +221,15 @@ private: return TTI->enableWritePrefetching(); } + bool isSupportsNonFaultyLoad(Module *M) { + if (EnableNonFaultyLoad.getNumOccurrences() > 0) + return EnableNonFaultyLoad; + Triple TargetTriple = Triple(M->getTargetTriple()); + return TTI->supportsScalableVectors() && + TargetTriple.getArch() == Triple::aarch64; + } + + AliasAnalysis *AA; AssumptionCache *AC; DominatorTree *DT; LoopInfo *LI; @@ -120,6 +247,7 @@ public: } void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); AU.addRequired(); AU.addRequired(); AU.addPreserved(); @@ -140,6 +268,7 @@ public: char LoopDataPrefetchLegacyPass::ID = 0; INITIALIZE_PASS_BEGIN(LoopDataPrefetchLegacyPass, "loop-data-prefetch", "Loop Data Prefetch", false, false) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) @@ -169,8 +298,825 @@ bool LoopDataPrefetch::isStrideLargeEnough(const SCEVAddRecExpr *AR, return TargetMinStride <= AbsStride; } +/// Use the induction variable to generate value representing the total num of +/// iterations for the loop in the preheader. +Value *LoopDataPrefetch::getLoopIterationNumber( + Loop *L, SmallPtrSet &LoopAuxIndPHINodes, + ValueMap &AuxIndBounds) { + Value *LoopBoundValue; + Value *LoopStepValue; + Value *LoopStartValue; + Value *LoopPreHeader; + Value *NumIterations; + + // Use induction variable to derive number of iterations for the loop which + // will be used to calculate the upper bound for other auxiliary induction + // variables. + PHINode *PHI = L->getInductionVariable(*SE); + if (PHI == nullptr) + return nullptr; + + auto LoopLB = L->getBounds(*SE); + if (!LoopLB) + return nullptr; + + LoopStartValue = &(LoopLB->getInitialIVValue()); + LoopStepValue = LoopLB->getStepValue(); + LoopBoundValue = &(LoopLB->getFinalIVValue()); + LoopPreHeader = L->getLoopPreheader(); + + if (LoopStartValue == nullptr || LoopStepValue == nullptr || + LoopBoundValue == nullptr || LoopPreHeader == nullptr) + return nullptr; + + // Step should be constant. + if (!isa(SE->getSCEV(LoopStepValue))) + return nullptr; + + // Make sure each of them is invariant so we can use them in the preheader. + if (!L->isLoopInvariant(LoopBoundValue) || + !L->isLoopInvariant(LoopStepValue) || !L->isLoopInvariant(LoopStartValue)) + return nullptr; + + // Generate instruction that calculated the total number of iterations of the + // loop in the preheader. + IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); + Value *Range = Builder.CreateSub(LoopBoundValue, LoopStartValue); + NumIterations = Builder.CreateSDiv(Range, LoopStepValue); + + LoopAuxIndPHINodes.insert(PHI); + Value *Bound = nullptr; + // If the step is positive, the upper bound isn't included, i.e. accessing + // [bound] is not legal, so subtract the bound by LoopStepValue to prevent out + // of bounds memory access. 
+ if (SE->isKnownNegative(SE->getSCEV(LoopStepValue))) + Bound = LoopBoundValue; + else + Bound = Builder.CreateSub(LoopBoundValue, LoopStepValue); + AuxIndBounds.insert(std::pair(PHI, Bound)); + return NumIterations; +} + +/// If prefetch instruction is not inserted. Need to clean iteration instruction +/// in the preheader. +void LoopDataPrefetch::cleanLoopIterationNumber(Value *NumIterations) { + RecursivelyDeleteTriviallyDeadInstructions(NumIterations); +} + +/// Returns whether the auxiliary induction variable can generate bound. +/// If it can generate a bound, add PHI to LoopAuxIndPHINodes +bool LoopDataPrefetch::canGetAuxIndVarBound( + Loop *L, PHINode *PHI, SmallPtrSet &LoopAuxIndPHINodes) { + Value *AuxIndVarStartValue = + PHI->getIncomingValueForBlock(L->getLoopPreheader()); + if (AuxIndVarStartValue == nullptr) + return false; + + const SCEV *LSCEV = SE->getSCEV(PHI); + const SCEVAddRecExpr *LSCEVAddRec = dyn_cast(LSCEV); + + if (LSCEVAddRec == nullptr) + return false; + + // Currently, we only support constant steps. + if (dyn_cast(LSCEVAddRec->getStepRecurrence(*SE))) { + InductionDescriptor IndDesc; + if (!InductionDescriptor::isInductionPHI(PHI, L, SE, IndDesc)) + return false; + + if (IndDesc.getInductionOpcode() != Instruction::Add && + IndDesc.getInductionOpcode() != Instruction::Sub && + IndDesc.getKind() != InductionDescriptor::IK_PtrInduction) + return false; + return true; + } + return false; +} + +/// Generate bound for the auxiliary induction variable at the preheader and add +/// it to AuxIndBounds. Returns whether the bound was successfully generated. +bool LoopDataPrefetch::getAuxIndVarBound( + Loop *L, PHINode *PHI, Value *NumIterations, + ValueMap &AuxIndBounds) { + IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); + Value *AuxIndVarStartValue = + PHI->getIncomingValueForBlock(L->getLoopPreheader()); + if (AuxIndVarStartValue == nullptr) + return false; + + const SCEV *LSCEV = SE->getSCEV(PHI); + const SCEVAddRecExpr *LSCEVAddRec = dyn_cast(LSCEV); + + // Currently, we only support constant steps. + if (const SCEVConstant *ConstPtrDiff = + dyn_cast(LSCEVAddRec->getStepRecurrence(*SE))) { + Value *AuxIndVarBound; + InductionDescriptor IndDesc; + if (!InductionDescriptor::isInductionPHI(PHI, L, SE, IndDesc)) + return false; + + // Calculate the upper bound for the auxiliary induction variable. + Value *CastedNumIterations = + Builder.CreateSExtOrTrunc(NumIterations, ConstPtrDiff->getType()); + + // Subtract one from CastedNumIterations as we want the bound to be in + // bounds. If there are N iterations, the first iteration will access the + // array at offset 0. On the N-th iteration, it will access the array at + // offset N-1, not N. 
+ CastedNumIterations = Builder.CreateSub( + CastedNumIterations, ConstantInt::get(ConstPtrDiff->getType(), 1)); + // The induction operator is add / sub + if (IndDesc.getInductionOpcode() == Instruction::Add || + IndDesc.getInductionOpcode() == Instruction::Sub) { + Value *Range = + Builder.CreateMul(ConstPtrDiff->getValue(), CastedNumIterations); + AuxIndVarBound = Builder.CreateAdd(Range, AuxIndVarStartValue); + } else if (IndDesc.getKind() == InductionDescriptor::IK_PtrInduction) { + // The induction variable is a pointer + int64_t StepSize = ConstPtrDiff->getAPInt().getSExtValue(); + if (SE->isKnownNegative(ConstPtrDiff)) { + StepSize = -StepSize; + CastedNumIterations = Builder.CreateMul( + ConstantInt::getSigned(ConstPtrDiff->getType(), -1), + CastedNumIterations); + } + Type *GEPType = getPtrTypefromPHI(PHI, StepSize); + AuxIndVarBound = Builder.CreateInBoundsGEP(GEPType, AuxIndVarStartValue, + CastedNumIterations); + } else + return false; + + LLVM_DEBUG(dbgs() << "Added " + << (isa(SE->getSCEV(AuxIndVarBound)) + ? "Constant " + : "") + << "AuxIndVarBound " << *AuxIndVarBound + << " for AuxIndVar:" << *PHI << "\n"); + AuxIndBounds.insert(std::pair(PHI, AuxIndVarBound)); + + return true; + } + return false; +} + +// Helper function to calculate the step for a given loop +static uint64_t getStep(PHINode *PN, ScalarEvolution *SE) { + // Get the constant step for the induction phi so we can use it to calculate + // how much we should increase the induction for prefetching. + uint64_t Step = 0; + const SCEV *LSCEV = SE->getSCEV(PN); + const SCEVAddRecExpr *LSCEVAddRec = dyn_cast(LSCEV); + + if (LSCEVAddRec == nullptr) + return Step; + + if (const SCEVConstant *ConstPtrDiff = + dyn_cast(LSCEVAddRec->getStepRecurrence(*SE))) { + Step = ConstPtrDiff->getAPInt().getZExtValue(); + } + return Step; +} + +// Helper function to determine if the loop step is positive +static bool isPositiveStep(PHINode *PN, ScalarEvolution *SE) { + bool PositiveStep = true; + const SCEV *LSCEV = SE->getSCEV(PN); + const SCEVAddRecExpr *LSCEVAddRec = dyn_cast(LSCEV); + if (const SCEVConstant *ConstPtrDiff = + dyn_cast(LSCEVAddRec->getStepRecurrence(*SE))) { + if (SE->isKnownNegative(ConstPtrDiff)) { + PositiveStep = false; + } + } + return PositiveStep; +} + +// Helper function to calculate the step type of a PHI node. If the PHI node is +// not a pointer type, get the type of the PHI node itself. Otherwise, get the +// integer type of the PHI's step/offset value. +static Type *getStepTypeFromPHINode(PHINode *PN, ScalarEvolution *SE) { + // Get the constant step for the induction phi so we can use it to calculate + // how much we should increase the induction for prefetching. + Type *T = PN->getType(); + if (!T->isPointerTy()) + return T; + + const SCEV *LSCEV = SE->getSCEV(PN); + const SCEVAddRecExpr *LSCEVAddRec = dyn_cast(LSCEV); + if (const SCEVConstant *ConstPtrDiff = + dyn_cast(LSCEVAddRec->getStepRecurrence(*SE))) + return ConstPtrDiff->getType(); + + return T; +} + +/// This function will take an instr list that contains indirect loads and +/// transform them into prefetchers. E.g.
Transform the following indirect load +/// A[B[i]]: +/// phi indvar [0] [bound] +/// idxB = gep *B, indvar +/// offsetA = load *idxB +/// idxA = gep *A, offsetA +/// valueA = load *idxA +/// To an indirect load with prefetchers N iterations ahead: +/// phi indvar [0] [bound] +/// offsetN = add indvar, N +/// offset2N = add indvar, 2N +/// compare = icmp offsetN, bound +/// offsetN = select compare, offsetN, bound +/// preIdxN = gep *B, offsetN +/// preIdx2N = gep *B, offset2N +/// call prefetch(preIdx2N) +/// preOffsetA = load preIdxN +/// preIdxA = gep *A, preOffsetA +/// call prefetch(preIdxA) +/// idxB = gep *B, indvar +/// offsetA = load *idxB +/// idxA = gep *A, offsetA +/// valueA = load *idxA +bool LoopDataPrefetch::insertPrefetcherForIndirectLoad( + Loop *L, unsigned Idx, Value *NumIterations, + SmallVector &CandidateMemoryLoads, + SmallSetVector &DependentInsts, + ValueMap &AuxIndBounds, + SmallVectorImpl> &Transforms, + unsigned ItersAhead) { + bool PositiveStep = true; + Instruction *TargetIndirectLoad = CandidateMemoryLoads[Idx]; + IRBuilder<> Builder(TargetIndirectLoad); + Module *M = TargetIndirectLoad->getModule(); + Type *I32Ty = Type::getInt32Ty(TargetIndirectLoad->getParent()->getContext()); + + bool isRandomAccess = false; + bool isCallDependency = false; + for (auto *I : DependentInsts) { + isCallDependency |= isa(I); + if (isCrcHashDataAccess(I, TargetIndirectLoad)) { + isRandomAccess = true; + break; + } + } + // CallInst dependencies are only supported for random access with CRC. + if (!isRandomAccess && (isCallDependency || !canDoIndirectPrefetch(L))) + return false; + + // If indirect load prefetch is not specified, exit for non-random cases. + if (!IndirectLoadPrefetch && !isRandomAccess) + return false; + + LLVM_DEBUG(dbgs() << "Inserting indirect prefetchers for\t" + << *TargetIndirectLoad << "\twith " << DependentInsts.size() + << " dependent instructions\n"); + + // Keep track of the number of prefetches left to process among the + // DependentInst list. We assume that for a given IndirectionLevel N, we will + // have N prefetches to do, unless we are skipping intermediate loads, in + // which case we only do 1 prefetch. + size_t NumPrefetchesLeft = SkipIntermediate ? 1 : IndirectionLevel; + int64_t Step; + while (!DependentInsts.empty()) { + Instruction *DependentInst = DependentInsts.pop_back_val(); + Instruction *Inst = dyn_cast(DependentInst); + + switch (Inst->getOpcode()) { + case Instruction::PHI: { + // Get the constant step for the induction phi so we can use it to + // calculate how much we should increase the induction for prefetching. + PHINode *PN = dyn_cast(Inst); + Step = getStep(PN, SE); + PositiveStep = isPositiveStep(PN, SE); + Type *InstType = getStepTypeFromPHINode(PN, SE); + if (!PositiveStep) + Step = -Step; + + // Make sure the phi node is i64 or i32. + if (!InstType->isIntegerTy(64) && !InstType->isIntegerTy(32)) + return false; + + // Create the bound for this PHI if needed: + if (!AuxIndBounds.count(PN)) + getAuxIndVarBound(L, PN, NumIterations, AuxIndBounds); + + // We create values based on the induction variable so we can use them to + // generate prefetchers later on. The first value (indvar + IterationAhead + // * step) will be used for the load of the prefetched address and it must + // not exceed the bound. The second value (indvar + 2 * IterationAhead + // * step) will be used to generate the prefetcher for the load of the + // address. The subsequent values are generated in a similar fashion to + // generate prefetchers for the offsets of all dependent loads.
+ + // Insert the new instruction after all PHI node. + auto InsertionPoint = Inst; + if (auto FirstNonPHI = Inst->getParent()->getFirstNonPHI()) + InsertionPoint = FirstNonPHI->getPrevNode(); + + for (size_t i = 0; i < NumPrefetchesLeft; i++) { + if (i > 0 && SkipIntermediate) + break; + + if (Transforms.size() < i + 1) { + Transforms.push_back(DenseMap()); + } else if (Transforms[i].count(Inst)) + continue; + + // Create the new operation for the target load + Value *NewOp = nullptr; + if (Inst->getType()->isPointerTy()) { + Type *GEPType = getPtrTypefromPHI(PN, Step); + int64_t Offset = + PrefetchIterationsAhead ? PrefetchIterationsAhead : ItersAhead; + if (!PositiveStep) + Offset = -Offset; + // Do not need to calculate Offset * Step as it is calculated + // implicitly within the GEP instruction + NewOp = Builder.CreateInBoundsGEP( + GEPType, Inst, + ConstantInt::getSigned(InstType, (i + 1) * Offset)); + } else { + // FullStep is the initial offset for the new value, taking into + // account, both Step and the number of iterations ahead to prefetch. + // If indirect prefetch iterations ahead is enabled, we directly use + // the supplied indirect-prefetch-iters-ahead value. + int64_t FullStep = PrefetchIterationsAhead + ? PrefetchIterationsAhead * Step + : ItersAhead * Step; + + Instruction::BinaryOps BiOp = + PositiveStep ? Instruction::Add : Instruction::Sub; + NewOp = Builder.CreateBinOp( + BiOp, Inst, + ConstantInt::get(Inst->getType(), (i + 1) * FullStep)); + } + + if (auto NewOpInstr = dyn_cast(NewOp)) { + NewOpInstr->moveAfter(InsertionPoint); + InsertionPoint = NewOpInstr; + } + + // Create the new operations for the offset loads + if (i > 0 && i == NumPrefetchesLeft - 1) { + Transforms[i].insert(std::pair(Inst, NewOp)); + } else { + Value *NewCmp = Builder.CreateICmp( + PositiveStep ? CmpInst::ICMP_SLT : CmpInst::ICMP_SGT, NewOp, + AuxIndBounds[cast(Inst)]); + Value *NewSelect = + Builder.CreateSelect(NewCmp, NewOp, AuxIndBounds[PN]); + Transforms[i].insert(std::pair(Inst, NewSelect)); + + if (auto NewCmpInstr = dyn_cast(NewCmp)) { + NewCmpInstr->moveAfter(InsertionPoint); + InsertionPoint = NewCmpInstr; + } + + if (auto NewSelectInstr = dyn_cast(NewSelect)) { + NewSelectInstr->moveAfter(InsertionPoint); + InsertionPoint = NewSelectInstr; + } + } + } + break; + } + case Instruction::Load: { + LoadInst *LoadI = dyn_cast(Inst); + Value *LoadPtr = LoadI->getPointerOperand(); + if (!SkipIntermediate) + NumPrefetchesLeft--; + + auto GeneratePrefetcher = [&](llvm::Value *PrefetchPtr) { + Function *PrefetchFunc = Intrinsic::getDeclaration( + M, Intrinsic::prefetch, LoadPtr->getType()); + Value *PrefetchArg[] = {PrefetchPtr, ConstantInt::get(I32Ty, 0), + ConstantInt::get(I32Ty, 3), + ConstantInt::get(I32Ty, 1)}; + CallInst *PrefetchCall = CallInst::Create(PrefetchFunc, PrefetchArg); + return PrefetchCall; + }; + + auto CloneNonFaultyLoad = [&](LoadInst *Load, int PrefetchLevel, + Value *LoadOperand) { + auto DL = Load->getParent()->getModule()->getDataLayout(); + auto ScalableWidth = + TTI->getRegisterBitWidth(TargetTransformInfo::RGK_ScalableVector) + .getKnownMinValue(); + + auto LDType = Load->getType(); + unsigned LDTypeSize = (LDType->isPointerTy()) + ? 
DL.getTypeStoreSizeInBits(LDType) + : LDType->getScalarSizeInBits(); + int ElementCount = ScalableWidth / LDTypeSize; + Type *PredTy = ScalableVectorType::get( + Type::getInt1Ty(Load->getParent()->getContext()), ElementCount); + auto *PTruePat = + ConstantInt::get(I32Ty, 1 /* = AArch64SVEPredPattern::vl1*/); + + auto *PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, + {PredTy}, {PTruePat}); + PTrue->moveAfter(Load); + Type *ScaledLoadTy = ScalableVectorType::get(LDType, ElementCount); + auto *Ldnf1 = Builder.CreateIntrinsic( + Intrinsic::aarch64_sve_ldnf1, {ScaledLoadTy}, {PTrue, LoadOperand}); + Ldnf1->moveAfter(PTrue); + auto *ExtractIdx = ConstantInt::get(I32Ty, 0); + Instruction *Element = dyn_cast( + Builder.CreateExtractElement(Ldnf1, ExtractIdx)); + Element->moveAfter(Ldnf1); + Ldnf1->replaceUsesOfWith(LoadOperand, + Transforms[PrefetchLevel][LoadOperand]); + return Element; + }; + + if (!DependentInsts.empty()) { + // For any intermediate (not last) load, we generate a load for all the + // offset at min(indvar+N*IterationsAhead*step, bound)] for each N up to + // NumPrefetchesLeft - 1, and generate a prefetcher at + // (indvar+(N+1)*IterationAhead*step) for the offset load. + Instruction *PrefetchOffsetLoad = nullptr; + for (size_t i = 0; i < NumPrefetchesLeft; i++) { + if (Transforms[i].count(LoadI)) + continue; + if (isSupportsNonFaultyLoad(M)) + PrefetchOffsetLoad = CloneNonFaultyLoad(LoadI, i, LoadPtr); + else { + PrefetchOffsetLoad = LoadI->clone(); + Builder.Insert(PrefetchOffsetLoad); + PrefetchOffsetLoad->moveAfter(LoadI); + PrefetchOffsetLoad->replaceUsesOfWith(LoadPtr, + Transforms[i][LoadPtr]); + } + Transforms[i].insert( + std::pair(LoadI, PrefetchOffsetLoad)); + } + + if (SkipIntermediate) + break; + + // Create a prefetcher for the offset load. + if (PrefetchOffsetLoad) { + CallInst *PrefetchCall = + GeneratePrefetcher(Transforms[NumPrefetchesLeft][LoadPtr]); + PrefetchCall->insertAfter(PrefetchOffsetLoad); + NumIndPrefetches++; + } + } else { + CallInst *PrefetchCall = GeneratePrefetcher(Transforms[0][LoadPtr]); + PrefetchCall->insertAfter(LoadI); + NumIndPrefetches++; + } + break; + } + default: { + // For other types of instructions, we make a clone of the instruction and + // replace operands that we already transformed before. + for (size_t j = 0; j < NumPrefetchesLeft; j++) { + if (j >= Transforms.size() || Transforms[j].count(Inst)) + continue; + Instruction *TransformedInst = Inst->clone(); + Builder.Insert(TransformedInst); + TransformedInst->moveAfter(Inst); + for (unsigned i = 0; i < TransformedInst->getNumOperands(); i++) { + Value *Operand = TransformedInst->getOperand(i); + if (Transforms[j].count(Operand)) + TransformedInst->replaceUsesOfWith(Operand, Transforms[j][Operand]); + } + + Transforms[j].insert( + std::pair(Inst, TransformedInst)); + } + break; + } + } + } + return true; +} + +/// Find the indirect load that depends on the auxiliary induction variable and +/// construct an instr list that contains loop variant instruction from the +/// target load to the candidate phi instr. 
+bool LoopDataPrefetch::findCandidateMemoryLoads( + Instruction *I, SmallSetVector &InstList, + SmallPtrSet &InstSet, + SmallVector &CandidateMemoryLoads, + std::vector> &DependentInstList, + SmallPtrSet LoopAuxIndPHINodes) { + bool ret = false; + + for (Use &U : I->operands()) { + // If the value is loop invariant, just continue. + if (LI->getLoopFor(I->getParent())->isLoopInvariant(U.get())) + continue; + + Instruction *OperandInst = dyn_cast(U.get()); + if (OperandInst != nullptr) { + switch (OperandInst->getOpcode()) { + case Instruction::Load: { + // Check if the load instruction that it depends on is already in the + // candidates. If yes, add the candidate's dependent instructions to the + // list. If not, the load instruction it depends on is invalid. + LoadInst *LoadI = dyn_cast(OperandInst); + if (isLoadInCandidateMemoryLoads(LoadI, InstList, InstSet, + CandidateMemoryLoads, + DependentInstList)) { + // We do not return early in case there are other auxiliary induction + // variables to check. + ret = true; + } + break; + } + case Instruction::PHI: { + // Check if the PHI is a loop auxiliary induction PHI. If yes, we found a + // valid load dependent on a loop auxiliary induction variable. If not, + // this is an invalid candidate. + PHINode *PhiInst = dyn_cast(OperandInst); + if (LoopAuxIndPHINodes.contains(PhiInst)) { + // In order to prevent the size of SmallVector from going out of + // bounds for large cases, only the last access of the element is + // retained. Update the position of OperandInst in the InstList. + if (InstList.count(OperandInst)) + InstList.remove(OperandInst); + InstList.insert(OperandInst); + return true; + } + break; + } + case Instruction::Call: { + if (OperandInst->mayReadOrWriteMemory()) + return false; + CallInst *Call = dyn_cast(OperandInst); + if (!Call->doesNotThrow()) + return false; + + // Use DFS to search through the operands. + InstList.insert(OperandInst); + if (findCandidateMemoryLoads(OperandInst, InstList, InstSet, + CandidateMemoryLoads, DependentInstList, + LoopAuxIndPHINodes)) { + // We do not return early in case there are other auxiliary + // induction variables to check + ret = true; + } else { + // If the operand isn't dependent on an auxiliary induction + // variable, remove any instructions added to DependentInstList from + // this operand + if (InstList.count(OperandInst)) + InstList.remove(OperandInst); + InstList.insert(OperandInst); + return false; + } + break; + } + case Instruction::Invoke: { + // We currently cannot handle the case where the indirect load depends + // on other functions. + return false; + } + default: { + // Use DFS to search through the operands. + if (InstList.count(OperandInst)) + InstList.remove(OperandInst); + InstList.insert(OperandInst); + if (findCandidateMemoryLoads(OperandInst, InstList, InstSet, + CandidateMemoryLoads, DependentInstList, + LoopAuxIndPHINodes)) { + // We do not return early in case there are other auxiliary induction + // variables to check + ret = true; + } else { + // If the operand isn't dependent on an auxiliary induction variable, + // remove any instructions added to DependentInstList from this + // operand + InstList.remove(OperandInst); + } + } + } + } + } + return ret; +} + +/// Helper function to determine whether the given load is in +/// CandidateMemoryLoads. If yes, add the candidate's dependent instructions +/// to the list.
+bool LoopDataPrefetch::isLoadInCandidateMemoryLoads( + LoadInst *LoadI, SmallSetVector &InstList, + SmallPtrSet &InstSet, + SmallVector &CandidateMemoryLoads, + std::vector> &DependentInstList) { + size_t CandidateLoadIndex = 0; + for (auto CandidateMemoryLoad : CandidateMemoryLoads) { + if (LoadI == CandidateMemoryLoad) + break; + CandidateLoadIndex++; + } + + if (CandidateLoadIndex >= CandidateMemoryLoads.size() || InstSet.count(LoadI)) + return false; + + for (auto CandidateInst : DependentInstList[CandidateLoadIndex]) { + if (InstList.count(CandidateInst)) + InstList.remove(CandidateInst); + InstList.insert(CandidateInst); + InstSet.insert(CandidateInst); + } + return true; +} + +/// Returns whether the given loop should be processed to insert prefetches for +/// indirect loads. +bool LoopDataPrefetch::canDoIndirectPrefetch(Loop *L) { + // Support innermost loops in a simple form. However, the parent of the + // inner loop will be processed as well in the case of nested loops. If + // IndirectionLevel is low, only allow single-block loops; otherwise, allow + // up to 5 blocks under certain conditions. + if (!L->isInnermost() || !L->getLoopPreheader() || + (IndirectionLevel <= 3 && L->getNumBlocks() != 1) || + (IndirectionLevel > 3 && L->getNumBlocks() == 1) || L->getNumBlocks() > 5) + return false; + return true; +} + +/// Check if the load depends on CRC hash functions. +bool LoopDataPrefetch::isCrcHashDataAccess(Instruction *I, + Instruction *PrefetchingLoad) { + if (llvm::IntrinsicInst *II = dyn_cast(I)) + // If CRC functions are used for offset calculation then the offset will be + // random. To avoid cache misses, data prefetch is needed. + switch (II->getIntrinsicID()) { + case Intrinsic::aarch64_crc32b: + case Intrinsic::aarch64_crc32cb: + case Intrinsic::aarch64_crc32h: + case Intrinsic::aarch64_crc32ch: + case Intrinsic::aarch64_crc32w: + case Intrinsic::aarch64_crc32cw: + case Intrinsic::aarch64_crc32x: + case Intrinsic::aarch64_crc32cx: { + // Check that the candidate load's index is incremented by 1. + if (auto *LI = dyn_cast(PrefetchingLoad)) { + if (auto *GEPI = dyn_cast(LI->getPointerOperand())) { + // The data access will be consecutive if the GEP has a single index. + if (GEPI->getNumOperands() > 2) + return false; + auto *PtrIndices = dyn_cast(GEPI->getOperand(1)); + if (!PtrIndices || isa(PtrIndices)) + return true; + for (auto &U : PtrIndices->uses()) + if (auto *PN = dyn_cast(U.getUser())) + if (getStep(PN, SE) <= 1) + return true; + } + } + break; + } + } + return false; +} + +bool LoopDataPrefetch::isIntermediateLoadSupported( + Loop *L, LoadInst *&CandidateLoad, + SmallSetVector &InstList) { + BasicBlock *DependentBB = nullptr; + for (auto *I : InstList) { + if (isSupportsNonFaultyLoad(CandidateLoad->getModule())) { + if (LoadInst *IntermediateLoad = dyn_cast(I)) { + if (IntermediateLoad == CandidateLoad) + continue; + // If the intermediate load has a scalar type, an SVE non-faulting load + // can be used. + auto *LoadTy = IntermediateLoad->getType(); + if (!LoadTy->isIntegerTy() && !LoadTy->isFloatingPointTy() && + !LoadTy->isPointerTy()) { + return false; + } + } + } else { + // If the intermediate load is in a different basic block then there is + // a chance of a segmentation fault.
+ if (DependentBB && isa(I) && DependentBB != I->getParent() && + L->contains(I->getParent())) { + return false; + } + if (L->contains(I->getParent())) + DependentBB = I->getParent(); + } + } + return true; +} + +bool LoopDataPrefetch::doIndirectPrefetch(Loop *L, unsigned ItersAhead) { + // List of valid phi nodes that indirect loads can depend on. + SmallPtrSet LoopAuxIndPHINodes; + // Map of valid phi node to its bound value in the preheader. + ValueMap AuxIndBounds; + // Candidate memory loads in the loop. + SmallVector CandidateMemoryLoads; + // List of instructions from phi to load. + std::vector> DependentInstList; + // Pointer operands of stores in the loop. + SmallVector LoopStorePtrs; + bool MadeChange = false; + + // Get loop induction and auxiliary induction phis. (They will be candidates + // for phi node matching during construction of the candidate instructions.) + // We also use the phi nodes to determine the loop upper bound. + Value *NumIterations = + getLoopIterationNumber(L, LoopAuxIndPHINodes, AuxIndBounds); + if (NumIterations == nullptr) + return MadeChange; + + if (!RandomAccessPrefetch && !canDoIndirectPrefetch(L)) { + cleanLoopIterationNumber(NumIterations); + return MadeChange; + } + + // Find candidate auxiliary induction variables which the indirect load + // could depend on. + for (auto &I : *L->getHeader()) + if (PHINode *PHI = dyn_cast(&I)) { + InductionDescriptor IndDesc; + if (InductionDescriptor::isInductionPHI(PHI, L, SE, IndDesc) && + L->getInductionVariable(*SE) != PHI) { + if (canGetAuxIndVarBound(L, PHI, LoopAuxIndPHINodes)) + LoopAuxIndPHINodes.insert(PHI); + } + } + + // Also search for candidates in the parent loop of the current innermost + // loop. This will capture more opportunities in the outer loop. + SmallVector BBList; + for (auto &BB : L->blocks()) + BBList.push_back(BB); + if (L->getParentLoop()) + for (auto &BB : L->getParentLoop()->blocks()) { + // We don't want to repeat blocks in the case of nested loops. + if (L->contains(BB)) + continue; + BBList.push_back(BB); + } + + // Iterate through the loop and keep track of the memory loads and the + // instruction list they depend on. + for (const auto BB : BBList) { + for (auto &I : *BB) + if (LoadInst *LoadI = dyn_cast(&I)) { + SmallSetVector InstList; + SmallSet InstSet; + InstList.insert(LoadI); + InstSet.insert(LoadI); + if (findCandidateMemoryLoads(LoadI, InstList, InstSet, + CandidateMemoryLoads, DependentInstList, + LoopAuxIndPHINodes)) { + if (!isIntermediateLoadSupported(L, LoadI, InstList)) + continue; + LLVM_DEBUG(dbgs() << "Found load candidate " << *LoadI << "\n"); + CandidateMemoryLoads.push_back(LoadI); + DependentInstList.push_back(InstList); + } + } else if (StoreInst *StoreI = dyn_cast(&I)) { + // Keep track of store insts to avoid conflicts. + LoopStorePtrs.push_back(StoreI->getPointerOperand()); + } + } + + // Keep track of previously transformed instrs for the offset and target + // loads so we can reuse them. + SmallVector> Transforms; + for (unsigned i = 0; i < CandidateMemoryLoads.size(); i++) { + SmallSetVector DependentInsts = DependentInstList[i]; + unsigned NumLoads = 0; + bool NoConflict = true; + // Find candidates that contain indirect loads and check that the load of + // the offset doesn't alias with other stores. + for (auto DependentInst : DependentInsts) { + if (LoadInst *LoadI = dyn_cast(DependentInst)) { + NumLoads++; + // For the load of the target address offset, we avoid the load + // conflicting with stores in the same loop.
+ if (NumLoads == IndirectionLevel) { + Value *LoadPtr = LoadI->getPointerOperand(); + for (Value *StorePtr : LoopStorePtrs) + if (AA->isMustAlias(LoadPtr, StorePtr)) { + NoConflict = false; + break; + } + } + } + } + + // Prefetch all indirect loads without conflict to the offset load. + if (NumLoads == IndirectionLevel && NoConflict) { + MadeChange |= insertPrefetcherForIndirectLoad( + L, i, NumIterations, CandidateMemoryLoads, DependentInsts, + AuxIndBounds, Transforms, ItersAhead); + } + } + + cleanLoopIterationNumber(NumIterations); + return MadeChange; +} + PreservedAnalyses LoopDataPrefetchPass::run(Function &F, FunctionAnalysisManager &AM) { + AliasAnalysis *AA = &AM.getResult(F); DominatorTree *DT = &AM.getResult(F); LoopInfo *LI = &AM.getResult(F); ScalarEvolution *SE = &AM.getResult(F); @@ -179,8 +1125,16 @@ PreservedAnalyses LoopDataPrefetchPass::run(Function &F, &AM.getResult(F); const TargetTransformInfo *TTI = &AM.getResult(F); - LoopDataPrefetch LDP(AC, DT, LI, SE, TTI, ORE); - bool Changed = LDP.run(); + // Ensure loops are in simplified form which is a pre-requisite for loop data + // prefetch pass. Added only for new PM since the legacy PM has already added + // LoopSimplify pass as a dependency. + bool Changed = false; + for (auto &L : *LI) { + Changed |= simplifyLoop(L, DT, LI, SE, AC, nullptr, false); + } + + LoopDataPrefetch LDP(AA, AC, DT, LI, SE, TTI, ORE); + Changed |= LDP.run(); if (Changed) { PreservedAnalyses PA; @@ -196,6 +1150,7 @@ bool LoopDataPrefetchLegacyPass::runOnFunction(Function &F) { if (skipFunction(F)) return false; + AliasAnalysis *AA = &getAnalysis().getAAResults(); DominatorTree *DT = &getAnalysis().getDomTree(); LoopInfo *LI = &getAnalysis().getLoopInfo(); ScalarEvolution *SE = &getAnalysis().getSE(); @@ -206,7 +1161,7 @@ bool LoopDataPrefetchLegacyPass::runOnFunction(Function &F) { const TargetTransformInfo *TTI = &getAnalysis().getTTI(F); - LoopDataPrefetch LDP(AC, DT, LI, SE, TTI, ORE); + LoopDataPrefetch LDP(AA, AC, DT, LI, SE, TTI, ORE); return LDP.run(); } @@ -214,14 +1169,26 @@ bool LoopDataPrefetch::run() { // If PrefetchDistance is not set, don't run the pass. This gives an // opportunity for targets to run this pass for selected subtargets only // (whose TTI sets PrefetchDistance and CacheLineSize). - if (getPrefetchDistance() == 0 || TTI->getCacheLineSize() == 0) { + if (getPrefetchDistance() == 0 || + (TTI->getCacheLineSize() == 0 && CachelineSize == 0)) { LLVM_DEBUG(dbgs() << "Please set both PrefetchDistance and CacheLineSize " "for loop data prefetch.\n"); return false; } bool MadeChange = false; + if (DisableDirectLoadPrefetch.getNumOccurrences() == 0 && + !TTI->isProfitableToDirectPrefetch()) { + LLVM_DEBUG(dbgs() << "Disabling direct load prefetching.\n"); + DisableDirectLoadPrefetch = true; + } + if (RandomAccessPrefetch) { + OuterLoopPrefetch = true; + } + if (DisableDirectLoadPrefetch && !IndirectLoadPrefetch && + !RandomAccessPrefetch) + return MadeChange; for (Loop *I : *LI) for (Loop *L : depth_first(I)) MadeChange |= runOnLoop(L); @@ -274,10 +1241,18 @@ struct Prefetch { bool LoopDataPrefetch::runOnLoop(Loop *L) { bool MadeChange = false; - // Only prefetch in the inner-most loop - if (!L->isInnermost()) + if (L->getLoopDepth() < PrefetchLoopDepth) return MadeChange; + bool IsInnerMost = true; + // Prefetch outer loop if needed. 
+ if (!L->isInnermost()) { + if (OuterLoopPrefetch) + IsInnerMost = false; + else + return MadeChange; + } + SmallPtrSet EphValues; CodeMetrics::collectEphemeralValues(L, AC, EphValues); @@ -323,78 +1298,101 @@ bool LoopDataPrefetch::runOnLoop(Loop *L) { unsigned NumMemAccesses = 0; unsigned NumStridedMemAccesses = 0; SmallVector Prefetches; - for (const auto BB : L->blocks()) - for (auto &I : *BB) { - Value *PtrValue; - Instruction *MemI; - - if (LoadInst *LMemI = dyn_cast(&I)) { - MemI = LMemI; - PtrValue = LMemI->getPointerOperand(); - } else if (StoreInst *SMemI = dyn_cast(&I)) { - if (!doPrefetchWrites()) continue; - MemI = SMemI; - PtrValue = SMemI->getPointerOperand(); - } else continue; - - unsigned PtrAddrSpace = PtrValue->getType()->getPointerAddressSpace(); - if (!TTI->shouldPrefetchAddressSpace(PtrAddrSpace)) - continue; - NumMemAccesses++; - if (L->isLoopInvariant(PtrValue)) - continue; - - const SCEV *LSCEV = SE->getSCEV(PtrValue); - const SCEVAddRecExpr *LSCEVAddRec = dyn_cast(LSCEV); - if (!LSCEVAddRec) - continue; - NumStridedMemAccesses++; - - // We don't want to double prefetch individual cache lines. If this - // access is known to be within one cache line of some other one that - // has already been prefetched, then don't prefetch this one as well. - bool DupPref = false; - for (auto &Pref : Prefetches) { - const SCEV *PtrDiff = SE->getMinusSCEV(LSCEVAddRec, Pref.LSCEVAddRec); - if (const SCEVConstant *ConstPtrDiff = - dyn_cast(PtrDiff)) { - int64_t PD = std::abs(ConstPtrDiff->getValue()->getSExtValue()); - if (PD < (int64_t) TTI->getCacheLineSize()) { - Pref.addInstruction(MemI, DT, PD); - DupPref = true; - break; + if (!DisableDirectLoadPrefetch) { + for (const auto BB : L->blocks()) { + // If this is not the innermost loop, we avoid prefetching in subloops. + for (auto &I : *BB) { + Value *PtrValue = nullptr; + Instruction *MemI; + + if (LoadInst *LMemI = dyn_cast(&I)) { + MemI = LMemI; + PtrValue = LMemI->getPointerOperand(); + } else if (StoreInst *SMemI = dyn_cast(&I)) { + if (!doPrefetchWrites()) + continue; + MemI = SMemI; + PtrValue = SMemI->getPointerOperand(); + } else + continue; + + if (!PtrValue) + continue; + if (getPrefetchDistance() == 0) + continue; + + unsigned PtrAddrSpace = PtrValue->getType()->getPointerAddressSpace(); + if (!TTI->shouldPrefetchAddressSpace(PtrAddrSpace)) + continue; + NumMemAccesses++; + if (L->isLoopInvariant(PtrValue)) + continue; + + const SCEV *LSCEV = SE->getSCEV(PtrValue); + const SCEVAddRecExpr *LSCEVAddRec = dyn_cast(LSCEV); + if (!LSCEVAddRec) + continue; + NumStridedMemAccesses++; + + // For outer loops, we only prefetch memory instructions whose stride + // depends on the current loop. + if (!IsInnerMost && LSCEVAddRec->getLoop() != L) + continue; + + // We don't want to double prefetch individual cache lines. If this + // access is known to be within one cache line of some other one that + // has already been prefetched, then don't prefetch this one as well. + bool DupPref = false; + for (auto &Pref : Prefetches) { + const SCEV *PtrDiff = SE->getMinusSCEV(LSCEVAddRec, Pref.LSCEVAddRec); + if (const SCEVConstant *ConstPtrDiff = - dyn_cast(PtrDiff)) { + dyn_cast(PtrDiff)) { + int64_t PD = std::abs(ConstPtrDiff->getValue()->getSExtValue()); + // Use the CachelineSize value from the compiler option. + int64_t CacheLineSize = CachelineSize.getNumOccurrences() + ? CachelineSize + : TTI->getCacheLineSize(); + // If the TTI CacheLineSize is zero then the default CachelineSize will + // be used. + CacheLineSize = CacheLineSize ?
CacheLineSize : CachelineSize; + if (PD < (int64_t)CacheLineSize) { + Pref.addInstruction(MemI, DT, PD); + DupPref = true; + break; + } } } + if (!DupPref) + Prefetches.push_back(Prefetch(LSCEVAddRec, MemI)); } - if (!DupPref) - Prefetches.push_back(Prefetch(LSCEVAddRec, MemI)); } + } - unsigned TargetMinStride = - getMinPrefetchStride(NumMemAccesses, NumStridedMemAccesses, - Prefetches.size(), HasCall); + unsigned TargetMinStride = getMinPrefetchStride( + NumMemAccesses, NumStridedMemAccesses, Prefetches.size(), HasCall); LLVM_DEBUG(dbgs() << "Prefetching " << ItersAhead - << " iterations ahead (loop size: " << LoopSize << ") in " - << L->getHeader()->getParent()->getName() << ": " << *L); - LLVM_DEBUG(dbgs() << "Loop has: " - << NumMemAccesses << " memory accesses, " - << NumStridedMemAccesses << " strided memory accesses, " - << Prefetches.size() << " potential prefetch(es), " - << "a minimum stride of " << TargetMinStride << ", " - << (HasCall ? "calls" : "no calls") << ".\n"); + << " iterations ahead (loop size: " << LoopSize << ") in " + << L->getHeader()->getParent()->getName() << ": " << *L); + LLVM_DEBUG(dbgs() << "Loop has: " << NumMemAccesses << " memory accesses, " + << NumStridedMemAccesses << " strided memory accesses, " + << Prefetches.size() << " potential prefetch(es), " + << "a minimum stride of " << TargetMinStride << ", " + << (HasCall ? "calls" : "no calls") << ".\n"); for (auto &P : Prefetches) { // Check if the stride of the accesses is large enough to warrant a - // prefetch. + // prefetch. If MinPrefetchStride <= 1, no need to check if any stride + // goes. if (!isStrideLargeEnough(P.LSCEVAddRec, TargetMinStride)) continue; BasicBlock *BB = P.InsertPt->getParent(); SCEVExpander SCEVE(*SE, BB->getModule()->getDataLayout(), "prefaddr"); - const SCEV *NextLSCEV = SE->getAddExpr(P.LSCEVAddRec, SE->getMulExpr( - SE->getConstant(P.LSCEVAddRec->getType(), ItersAhead), - P.LSCEVAddRec->getStepRecurrence(*SE))); + const SCEV *NextLSCEV = SE->getAddExpr( + P.LSCEVAddRec, + SE->getMulExpr(SE->getConstant(P.LSCEVAddRec->getType(), ItersAhead), + P.LSCEVAddRec->getStepRecurrence(*SE))); if (!SCEVE.isSafeToExpand(NextLSCEV)) continue; @@ -405,24 +1403,26 @@ bool LoopDataPrefetch::runOnLoop(Loop *L) { IRBuilder<> Builder(P.InsertPt); Module *M = BB->getParent()->getParent(); Type *I32 = Type::getInt32Ty(BB->getContext()); - Function *PrefetchFunc = Intrinsic::getDeclaration( - M, Intrinsic::prefetch, PrefPtrValue->getType()); - Builder.CreateCall( - PrefetchFunc, - {PrefPtrValue, - ConstantInt::get(I32, P.Writes), - ConstantInt::get(I32, 3), ConstantInt::get(I32, 1)}); + Function *PrefetchFunc = Intrinsic::getDeclaration(M, Intrinsic::prefetch, + PrefPtrValue->getType()); + Builder.CreateCall(PrefetchFunc, + {PrefPtrValue, ConstantInt::get(I32, P.Writes), + ConstantInt::get(I32, IsInnerMost ? 3 : 0), + ConstantInt::get(I32, 1)}); ++NumPrefetches; LLVM_DEBUG(dbgs() << " Access: " - << *P.MemI->getOperand(isa(P.MemI) ? 0 : 1) - << ", SCEV: " << *P.LSCEVAddRec << "\n"); + << *P.MemI->getOperand(isa(P.MemI) ? 
0 : 1) + << ", SCEV: " << *P.LSCEVAddRec << "\n"); ORE->emit([&]() { - return OptimizationRemark(DEBUG_TYPE, "Prefetched", P.MemI) - << "prefetched memory access"; - }); + return OptimizationRemark(DEBUG_TYPE, "Prefetched", P.MemI) + << "prefetched memory access"; + }); MadeChange = true; } + if (IndirectLoadPrefetch || RandomAccessPrefetch) + MadeChange |= doIndirectPrefetch(L, ItersAhead); + return MadeChange; } diff --git a/llvm/test/Transforms/LoopDataPrefetch/AArch64/indirect-load-prefetch_crc.ll b/llvm/test/Transforms/LoopDataPrefetch/AArch64/indirect-load-prefetch_crc.ll new file mode 100644 index 0000000000000000000000000000000000000000..e6a37ae38f0683ae8d56aa4b4b862e6444bd14e7 --- /dev/null +++ b/llvm/test/Transforms/LoopDataPrefetch/AArch64/indirect-load-prefetch_crc.ll @@ -0,0 +1,109 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -passes=loop-data-prefetch --prefetch-distance=1024 --random-access-prefetch=true -disable-direct-prefetch -S | FileCheck %s --check-prefixes=CHECK,CHECK-NON-SVE +; RUN: opt < %s -mcpu=tsv110 -passes=loop-data-prefetch --random-access-prefetch=true -disable-direct-prefetch -S | FileCheck %s --check-prefixes=CHECK,CHECK-NON-SVE +; RUN: opt < %s -mcpu=hip09 -passes=loop-data-prefetch --random-access-prefetch=true --prefetch-with-nonfaulty-load=false -S | FileCheck %s --check-prefixes=CHECK,CHECK-NON-SVE +; RUN: opt < %s -mcpu=hip12 -passes=loop-data-prefetch --random-access-prefetch=true -S | FileCheck %s --check-prefixes=CHECK,CHECK-SVE + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64-unknown-linux-gnu" + +declare i32 @llvm.aarch64.crc32w(i32, i32) + +; Function Attrs: mustprogress nofree nosync nounwind willreturn memory(argmem: read) uwtable +define dso_local noundef i32 @_Z12matchcolumnsPiiS_ii(ptr nocapture noundef readonly %A, i32 noundef %B, ptr nocapture noundef readonly %Key, i32 noundef %index, i32 noundef %count) { +; CHECK-LABEL: @_Z12matchcolumnsPiiS_ii( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret i32 [[ADD:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV22:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT23:%.*]], [[IF_END:%.*]] ] +; CHECK-NEXT: [[SUM_020:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[ADD]], [[IF_END]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDVARS_IV22]], 60 +; CHECK-NEXT: [[TMP1:%.*]] = icmp slt i64 [[TMP0]], 99 +; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i64 [[TMP0]], i64 99 +; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDVARS_IV22]], 120 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDVARS_IV22]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-NON-SVE-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 4 +; CHECK-SVE-NEXT: [[TMP7_1:%.*]] = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 1) +; CHECK-SVE-NEXT: [[TMP7_2:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.ldnf1.nxv4i32(<vscale x 4 x i1> [[TMP7_1]], ptr [[TMP5]]) +; CHECK-SVE-NEXT: [[TMP7:%.*]] = extractelement <vscale x 4 x i32> [[TMP7_2]], i32 0 +; CHECK-NEXT: call void @llvm.prefetch.p0(ptr [[TMP4]], i32 0, i32 3, i32 1) +; CHECK-NEXT: [[TMP8:%.*]] = tail call i32 @llvm.aarch64.crc32w(i32 [[TMP6]], i32 -1) +; CHECK-NEXT: [[TMP9:%.*]] = tail call i32 @llvm.aarch64.crc32w(i32 [[TMP7]], i32 -1) +; CHECK-NEXT: [[AND:%.*]] = and
i32 [[TMP8]], 255 +; CHECK-NEXT: [[TMP10:%.*]] = and i32 [[TMP9]], 255 +; CHECK-NEXT: [[IDXPROM1:%.*]] = zext i32 [[AND]] to i64 +; CHECK-NEXT: [[TMP11:%.*]] = zext i32 [[TMP10]] to i64 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[KEY:%.*]], i64 [[IDXPROM1]] +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[KEY]], i64 [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 +; CHECK-NEXT: call void @llvm.prefetch.p0(ptr [[TMP12]], i32 0, i32 3, i32 1) +; CHECK-NEXT: [[CMP3_NOT:%.*]] = icmp eq i32 [[TMP13]], [[B:%.*]] +; CHECK-NEXT: br i1 [[CMP3_NOT]], label [[IF_END]], label [[DO_BODY_PREHEADER:%.*]] +; CHECK: do.body.preheader: +; CHECK-NEXT: br label [[DO_BODY:%.*]] +; CHECK: do.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[DO_BODY]] ], [ [[IDXPROM1]], [[DO_BODY_PREHEADER]] ] +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[KEY]], i64 [[INDVARS_IV_NEXT]] +; CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4 +; CHECK-NEXT: [[CMP6_NOT:%.*]] = icmp eq i32 [[TMP14]], [[B]] +; CHECK-NEXT: br i1 [[CMP6_NOT]], label [[IF_END_LOOPEXIT:%.*]], label [[DO_BODY]] +; CHECK: if.end.loopexit: +; CHECK-NEXT: [[TMP15:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 +; CHECK-NEXT: br label [[IF_END]] +; CHECK: if.end: +; CHECK-NEXT: [[AKEY_1:%.*]] = phi i32 [ [[AND]], [[FOR_BODY]] ], [ [[TMP15]], [[IF_END_LOOPEXIT]] ] +; CHECK-NEXT: [[IDXPROM7:%.*]] = sext i32 [[AKEY_1]] to i64 +; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IDXPROM7]] +; CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr [[ARRAYIDX8]], align 4 +; CHECK-NEXT: [[ADD]] = add nsw i32 [[TMP16]], [[SUM_020]] +; CHECK-NEXT: [[INDVARS_IV_NEXT23]] = add nuw nsw i64 [[INDVARS_IV22]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT23]], 100 +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]] +; +entry: + br label %for.body + +for.cond.cleanup: + ret i32 %add + +for.body: + %indvars.iv22 = phi i64 [ 0, %entry ], [ %indvars.iv.next23, %if.end ] + %sum.020 = phi i32 [ 0, %entry ], [ %add, %if.end ] + %arrayidx = getelementptr inbounds i32, ptr %A, i64 %indvars.iv22 + %0 = load i32, ptr %arrayidx, align 4 + %1 = tail call i32 @llvm.aarch64.crc32w(i32 %0, i32 -1) + %and = and i32 %1, 255 + %idxprom1 = zext i32 %and to i64 + %arrayidx2 = getelementptr inbounds i32, ptr %Key, i64 %idxprom1 + %2 = load i32, ptr %arrayidx2, align 4 + %cmp3.not = icmp eq i32 %2, %B + br i1 %cmp3.not, label %if.end, label %do.body + +do.body: + %indvars.iv = phi i64 [ %idxprom1, %for.body ], [ %indvars.iv.next, %do.body ] + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %arrayidx5 = getelementptr inbounds i32, ptr %Key, i64 %indvars.iv.next + %3 = load i32, ptr %arrayidx5, align 4 + %cmp6.not = icmp eq i32 %3, %B + br i1 %cmp6.not, label %if.end.loopexit, label %do.body + +if.end.loopexit: + %4 = trunc i64 %indvars.iv.next to i32 + br label %if.end + +if.end: + %AKey.1 = phi i32 [ %and, %for.body ], [ %4, %if.end.loopexit ] + %idxprom7 = sext i32 %AKey.1 to i64 + %arrayidx8 = getelementptr inbounds i32, ptr %A, i64 %idxprom7 + %5 = load i32, ptr %arrayidx8, align 4 + %add = add nsw i32 %5, %sum.020 + %indvars.iv.next23 = add nuw nsw i64 %indvars.iv22, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next23, 100 + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +}
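For illustration, the test above covers the CRC-driven random-access path; below is a minimal sketch (not taken from the patch or its tests) of the plain A[B[i]] pattern that the -indirect-load-prefetch path is aimed at. Assuming an AArch64 target, it could be fed to the pass with the flags added in this patch, for example: opt -passes=loop-data-prefetch -prefetch-distance=1024 -indirect-load-prefetch -S. With the default -indirect-level=2 the pass would attempt to prefetch both the B[i] offset load and the A[B[i]] load; the exact placement depends on the target's PrefetchDistance and MinPrefetchStride tuning.

; Hypothetical input (not part of this patch): sum += A[B[i]] in a
; single-block innermost loop with a preheader, the shape that
; canDoIndirectPrefetch() accepts at the default indirection level.
target triple = "aarch64-unknown-linux-gnu"

define i32 @sum_indirect(ptr %A, ptr %B, i64 %n) {
entry:
  br label %for.body

for.body:
  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
  %sum = phi i32 [ 0, %entry ], [ %sum.next, %for.body ]
  %b.addr = getelementptr inbounds i32, ptr %B, i64 %i
  %idx = load i32, ptr %b.addr, align 4            ; intermediate (offset) load
  %idx.ext = sext i32 %idx to i64
  %a.addr = getelementptr inbounds i32, ptr %A, i64 %idx.ext
  %val = load i32, ptr %a.addr, align 4            ; indirect load A[B[i]]
  %sum.next = add nsw i32 %sum, %val
  %i.next = add nuw nsw i64 %i, 1
  %done = icmp eq i64 %i.next, %n
  br i1 %done, label %exit, label %for.body

exit:
  ret i32 %sum.next
}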