From edf4e21db734846ecc6d8339244c564706d0217b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E5=93=B2=E6=B5=A9?= <2209576006@qq.com> Date: Sun, 28 Sep 2025 13:38:02 +0800 Subject: [PATCH 1/2] [AArch64] Simplifies SVE gather/scatter address svadd chains in loops --- llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp | 388 +++++++++++++ llvm/test/CodeGen/AArch64/O3-pipeline.ll | 1 + .../aarch64-sve-addressing-peephole.ll | 526 ++++++++++++++++++ 3 files changed, 915 insertions(+) create mode 100644 llvm/test/CodeGen/AArch64/aarch64-sve-addressing-peephole.ll diff --git a/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp b/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp index c5a6cb7af405..87939363122c 100644 --- a/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp +++ b/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp @@ -24,6 +24,7 @@ #include "Utils/AArch64BaseInfo.h" #include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/SetVector.h" +#include "llvm/Analysis/LoopInfo.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/IRBuilder.h" @@ -41,9 +42,28 @@ using namespace llvm::PatternMatch; #define DEBUG_TYPE "aarch64-sve-intrinsic-opts" +static cl::opt EnableSVELoopAddressChainOpt( + "aarch64-sve-loop-address-chain-opt", cl::init(false), cl::Hidden, + cl::desc("Enable simplification of SVE address computation chains in loops")); + namespace { struct SVEIntrinsicOpts : public ModulePass { static char ID; // Pass identification, replacement for typeid + + enum class SVEIndexExtension { SIGN, ZERO, NONE }; // NONE for i64 indices + + struct SVEMemoryOpInfo { + unsigned BaseOpIdx; + unsigned IndexOpIdx; + Type *ElemTy; // The type of the data element being loaded/stored. + SVEIndexExtension ExtKind; + }; + + // The key is {OriginalBasePointer, Index} + using InvariantBaseKey = std::pair; + // The cache maps this key to the computed GEP. + using InvariantBaseCache = DenseMap; + SVEIntrinsicOpts() : ModulePass(ID) { initializeSVEIntrinsicOptsPass(*PassRegistry::getPassRegistry()); } @@ -60,6 +80,16 @@ private: bool optimizeInstructions(SmallSetVector &Functions); + std::optional getSVEMemoryOpInfo(const IntrinsicInst *II); + Value *getLoopInvariantSplatValue(Value *V, Loop *L); + Value *getHoistedBaseForIndex(Value *Index, Value *OriBase, Loop *L, + InvariantBaseCache &Cache, + SVEMemoryOpInfo &OpInfo); + bool simplifySVEAddressComputation(IntrinsicInst *II, Loop *L, + InvariantBaseCache &Cache, + SVEMemoryOpInfo &OpInfo); + bool runSVEAddressHoisting(Function &F, LoopInfo &LI); + /// Operates at the function-scope. I.e., optimizations are applied local to /// the functions themselves. bool optimizeFunctions(SmallSetVector &Functions); @@ -68,6 +98,7 @@ private: void SVEIntrinsicOpts::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired(); + AU.addRequired(); AU.setPreservesCFG(); } @@ -75,6 +106,7 @@ char SVEIntrinsicOpts::ID = 0; static const char *name = "SVE intrinsics optimizations"; INITIALIZE_PASS_BEGIN(SVEIntrinsicOpts, DEBUG_TYPE, name, false, false) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass); +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass); INITIALIZE_PASS_END(SVEIntrinsicOpts, DEBUG_TYPE, name, false, false) ModulePass *llvm::createSVEIntrinsicOptsPass() { @@ -428,6 +460,332 @@ bool SVEIntrinsicOpts::optimizeInstructions( return Changed; } +/// Checks if an intrinsic is an SVE gather/scatter memory operation that this +/// optimization can analyze. 
Return the operand information (Base index, Index +/// index, Element Type, and Extension Kind) if supported +std::optional +SVEIntrinsicOpts::getSVEMemoryOpInfo(const IntrinsicInst *II) { + switch (II->getIntrinsicID()) { + // Gather Loads + case Intrinsic::aarch64_sve_ld1_gather_sxtw_index: + case Intrinsic::aarch64_sve_ldff1_gather_sxtw_index: + return {{1, 2, + dyn_cast(II->getType())->getElementType(), + SVEIndexExtension::SIGN}}; // Base=1, Index=2, Ext=SIGN + case Intrinsic::aarch64_sve_ld1_gather_uxtw_index: + case Intrinsic::aarch64_sve_ldff1_gather_uxtw_index: + return {{1, 2, + dyn_cast(II->getType())->getElementType(), + SVEIndexExtension::ZERO}}; // Base=1, Index=2, Ext=ZERO + case Intrinsic::aarch64_sve_ld1_gather_index: + case Intrinsic::aarch64_sve_ldff1_gather_index: + case Intrinsic::aarch64_sve_ldnt1_gather_index: + return {{1, 2, + dyn_cast(II->getType())->getElementType(), + SVEIndexExtension::NONE}}; // Base=1, Index=2, Ext=NONE + + // Prefetches (have no return value, element type is based on name) + case Intrinsic::aarch64_sve_prfd_gather_sxtw_index: + return {{1, 2, Type::getInt64Ty(II->getParent()->getContext()), + SVEIndexExtension::SIGN}}; + case Intrinsic::aarch64_sve_prfh_gather_sxtw_index: + return {{1, 2, Type::getInt16Ty(II->getParent()->getContext()), + SVEIndexExtension::SIGN}}; + case Intrinsic::aarch64_sve_prfw_gather_sxtw_index: + return {{1, 2, Type::getInt32Ty(II->getParent()->getContext()), + SVEIndexExtension::SIGN}}; + case Intrinsic::aarch64_sve_prfd_gather_uxtw_index: + return {{1, 2, Type::getInt64Ty(II->getParent()->getContext()), + SVEIndexExtension::ZERO}}; + case Intrinsic::aarch64_sve_prfh_gather_uxtw_index: + return {{1, 2, Type::getInt16Ty(II->getParent()->getContext()), + SVEIndexExtension::ZERO}}; + case Intrinsic::aarch64_sve_prfw_gather_uxtw_index: + return {{1, 2, Type::getInt32Ty(II->getParent()->getContext()), + SVEIndexExtension::ZERO}}; + case Intrinsic::aarch64_sve_prfd_gather_index: + return {{1, 2, Type::getInt64Ty(II->getParent()->getContext()), + SVEIndexExtension::NONE}}; + case Intrinsic::aarch64_sve_prfh_gather_index: + return {{1, 2, Type::getInt16Ty(II->getParent()->getContext()), + SVEIndexExtension::NONE}}; + case Intrinsic::aarch64_sve_prfw_gather_index: + return {{1, 2, Type::getInt32Ty(II->getParent()->getContext()), + SVEIndexExtension::NONE}}; + + // Scatter Stores (data is operand 0, element type is derived from it) + case Intrinsic::aarch64_sve_st1_scatter_sxtw_index: + return {{2, 3, + dyn_cast(II->getOperand(0)->getType()) + ->getElementType(), + SVEIndexExtension::SIGN}}; // Base=2, Index=3, Ext=SIGN + case Intrinsic::aarch64_sve_st1_scatter_uxtw_index: + return {{2, 3, + dyn_cast(II->getOperand(0)->getType()) + ->getElementType(), + SVEIndexExtension::ZERO}}; // Base=2, Index=3, Ext=ZERO + case Intrinsic::aarch64_sve_st1_scatter_index: + case Intrinsic::aarch64_sve_stnt1_scatter_index: + return {{2, 3, + dyn_cast(II->getOperand(0)->getType()) + ->getElementType(), + SVEIndexExtension::NONE}}; // Base=2, Index=3, Ext=NONE + + default: + return std::nullopt; + } +} + +/// Check if a Value is a splat of a loop-invariant scalar, which is a +/// shufflevector of an insertelement at index 0. If the pattern matches, return +/// the loop scalar value. 
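+/// A typical matched form (a splat of the loop-invariant scalar %inv):
+///   %ins   = insertelement <vscale x 4 x i32> poison, i32 %inv, i64 0
+///   %splat = shufflevector <vscale x 4 x i32> %ins, <vscale x 4 x i32> poison, zeroinitializer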
+Value *SVEIntrinsicOpts::getLoopInvariantSplatValue(Value *V, Loop *L) { + Value *InvariantScalar = nullptr; + Value *InsertElementVal = nullptr; + + if (auto *SV = dyn_cast(V)) { + InsertElementVal = SV->getOperand(0); + } else if (auto *SVC = dyn_cast(V)) { + if (SVC->getOpcode() == Instruction::ShuffleVector) { + InsertElementVal = SVC->getOperand(0); + } + } + + if (!InsertElementVal) + return nullptr; + + // Check if InsertElementVal is an insertelement and get the scalar. + if (auto *IE = dyn_cast(InsertElementVal)) { + if (match(IE->getOperand(2), + m_Zero())) { // Ensure it's inserting at index 0 + InvariantScalar = IE->getOperand(1); + } + } else if (auto *IEC = dyn_cast(InsertElementVal)) { + if (IEC->getOpcode() == Instruction::InsertElement && + match(IEC->getOperand(2), m_Zero())) { + InvariantScalar = IEC->getOperand(1); + } + } + + if (!InvariantScalar || !L->isLoopInvariant(InvariantScalar)) + return nullptr; + + return InvariantScalar; +} + +/// Analyzes an index calculation chain and generates hoistable GEPs. +/// @param Index The starting index Value (from the sve memory op) +/// @param OrigBase The original base pointer from the sve memory op +/// @param L The loop context +/// @param Cache A map to memoize results for `{OrigBase, Index} : NewBase` +/// @param OpInfo Information about the sve memory op +/// @return The final, rewritten base pointer for the memory op +Value *SVEIntrinsicOpts::getHoistedBaseForIndex(Value *Index, Value *OrigBase, + Loop *L, + InvariantBaseCache &Cache, + SVEMemoryOpInfo &OpInfo) { + InvariantBaseKey InitialKey = {OrigBase, Index}; + + // If this entire chain has been processed before, return the final result. + if (Cache.count(InitialKey)) + return Cache.lookup(InitialKey); + + // --- Trace the ADD chain up to the root, collecting nodes --- + SmallVector IndexChain; + Value *CurrentIndex = Index; + Value *RootIndex = nullptr; + + // The `while` loop traces the `sve.add` chain upwards from the `Index` used + // by the memory op, collecting all intermediate indices onto a stack + // (`IndexChain`). The trace stops when it hits a value that is not an `(index + // + invariant)` add, which becomes the `RootIndex`. + while (true) { + IndexChain.push_back(CurrentIndex); + InvariantBaseKey CurrentKey = {OrigBase, CurrentIndex}; + + // If a subchain in the chain is already solved, stop tracing + if (Cache.count(CurrentKey)) { + RootIndex = CurrentIndex; + break; + } + + auto *Add = dyn_cast(CurrentIndex); + // Stop if not a recognized sve.add intrinsic or not defined in the loop + if (!Add || + (Add->getIntrinsicID() != Intrinsic::aarch64_sve_add && + Add->getIntrinsicID() != Intrinsic::aarch64_sve_add_u) || + !L->contains(Add)) { + RootIndex = CurrentIndex; + break; + } + + Value *Op1 = Add->getOperand(1); + Value *Op2 = Add->getOperand(2); + + // Check if one of the operands is an invariant splat + if (getLoopInvariantSplatValue(Op1, L)) { + CurrentIndex = Op2; + if (match(Op2, m_Select(m_Value(), m_Value(), m_Zero()))) + CurrentIndex = dyn_cast(Op2)->getOperand(1); + } else if (getLoopInvariantSplatValue(Op2, L)) { + CurrentIndex = Op1; + if (match(Op1, m_Select(m_Value(), m_Value(), m_Zero()))) + CurrentIndex = dyn_cast(Op1)->getOperand(1); + } else { + RootIndex = CurrentIndex; // Not an (index + invariant) form. 
+ break; + } + } + + // --- Build GEPs back down the chain --- + // The base for the root index is always the original base pointer + Value *CurrentHoistedBase = Cache.lookup({OrigBase, RootIndex}); + if (!CurrentHoistedBase) { + CurrentHoistedBase = OrigBase; + Cache[{OrigBase, RootIndex}] = OrigBase; + } + + // Iterates down the collected chain (in reverse). For each node, it computes + // the new hoisted base by creating a GEP on top of the base of the previous + // node in the chain. + for (Value *IdxNode : reverse(IndexChain)) { + if (IdxNode == RootIndex) + continue; + + InvariantBaseKey CurrentKey = {OrigBase, IdxNode}; + auto *Add = dyn_cast(IdxNode); + Value *Op1 = Add->getOperand(1); + Value *Op2 = Add->getOperand(2); + + Value *InvariantScalar = getLoopInvariantSplatValue(Op1, L); + if (!InvariantScalar) + InvariantScalar = getLoopInvariantSplatValue(Op2, L); + assert(InvariantScalar); + + IRBuilder<> Builder(Add); + Value *GEPIndex = InvariantScalar; + + // Ensure the invariant has the correct integer type for GEP + switch (OpInfo.ExtKind) { + case SVEIndexExtension::SIGN: + GEPIndex = Builder.CreateSExt( + GEPIndex, Type::getInt64Ty(Add->getParent()->getContext()), + "invariant.idx.sext"); + break; + case SVEIndexExtension::ZERO: + GEPIndex = Builder.CreateZExt( + GEPIndex, Type::getInt64Ty(Add->getParent()->getContext()), + "invariant.idx.zext"); + break; + case SVEIndexExtension::NONE: + break; + } + + Value *NewBase = Builder.CreateGEP(OpInfo.ElemTy, CurrentHoistedBase, + GEPIndex, "add.ptr"); + // Cache the result for this node and update the base for the next iteration + Cache[CurrentKey] = NewBase; + CurrentHoistedBase = NewBase; + } + + return Cache.lookup(InitialKey); +} + +/// Get the final rewritten base and root index, and rewrite the memory +/// intrinsic +bool SVEIntrinsicOpts::simplifySVEAddressComputation( + IntrinsicInst *MemIntrinsic, Loop *L, InvariantBaseCache &Cache, + SVEMemoryOpInfo &OpInfo) { + Value *OrigBase = MemIntrinsic->getArgOperand(OpInfo.BaseOpIdx); + Value *OrigIndex = MemIntrinsic->getArgOperand(OpInfo.IndexOpIdx); + + // If the base itself is not loop invariant, skip simplification + if (!L->isLoopInvariant(OrigBase)) + return false; + + // The actual index might be hidden behind a `select(pg, index, zero)` + // Peel this away to get to the core index calculation + Value *IndexToTrace = OrigIndex; + if (match(IndexToTrace, m_Select(m_Value(), m_Value(), m_Zero()))) { + IndexToTrace = dyn_cast(IndexToTrace)->getOperand(1); + } + // This call populates the cache for the entire chain and returns the final + // base + Value *NewBase = + getHoistedBaseForIndex(IndexToTrace, OrigBase, L, Cache, OpInfo); + + // If the base pointer hasn't changed, nothing was optimized. + if (NewBase == OrigBase) + return false; + + // Now that the cache is populated, trace up from the starting index to find + // the root. + Value *RootIndex = IndexToTrace; + while (true) { + InvariantBaseKey CurrentKey = {OrigBase, RootIndex}; + // The root is the node that maps back to the original base in the cache. 
+ if (Cache.count(CurrentKey) && Cache.lookup(CurrentKey) == OrigBase) { + break; + } + + auto *Add = dyn_cast(RootIndex); + Value *NextIndex = nullptr; + if (getLoopInvariantSplatValue(Add->getOperand(1), L)) { + NextIndex = Add->getOperand(2); + } else if (getLoopInvariantSplatValue(Add->getOperand(2), L)) { + NextIndex = Add->getOperand(1); + } else { + break; // Reached a non-optimizable ADD, this is the root + } + + if (match(NextIndex, m_Select(m_Value(), m_Value(), m_Zero()))) { + RootIndex = dyn_cast(NextIndex)->getOperand(1); + } else { + RootIndex = NextIndex; + } + } + + LLVM_DEBUG(dbgs() << "SVE_ADDR_HOIST_IR: Rewriting " << *MemIntrinsic + << "\n"); + + MemIntrinsic->setArgOperand(OpInfo.BaseOpIdx, NewBase); + MemIntrinsic->setArgOperand(OpInfo.IndexOpIdx, RootIndex); + + // Cleanup would be complex. Rely on DCE for now. + + LLVM_DEBUG(dbgs() << "SVE_ADDR_HOIST_IR: To -> " << *MemIntrinsic << "\n"); + + return true; +} + +/// iterates through all basic blocks in a function. For each block +/// that is part of a loop, it creates a fresh cache and then iterates through +/// its instructions in program order, attempting to simplify any SVE memory +/// operations it finds. +bool SVEIntrinsicOpts::runSVEAddressHoisting(Function &F, LoopInfo &LI) { + bool Changed = false; + for (auto &BB : F) { + // We only care about blocks that are inside a loop. + Loop *L = LI.getLoopFor(&BB); + if (!L) + continue; + + // A fresh cache is used for each basic block to ensure correctness. + // Maps {OriginalBasePointer, Index} to the new computed GEP. + InvariantBaseCache Cache; + + // Iterate through instructions in program order (important!) + for (auto &I : BB) { + if (auto *II = dyn_cast(&I)) { + if (auto OpInfo = getSVEMemoryOpInfo(II)) + Changed |= simplifySVEAddressComputation(II, L, Cache, *OpInfo); + } + } + } + + return Changed; +} + bool SVEIntrinsicOpts::optimizeFunctions( SmallSetVector &Functions) { bool Changed = false; @@ -435,6 +793,16 @@ bool SVEIntrinsicOpts::optimizeFunctions( Changed |= optimizePTrueIntrinsicCalls(Functions); Changed |= optimizeInstructions(Functions); + if (EnableSVELoopAddressChainOpt) { + for (Function *F : Functions) { + if (F->isDeclaration()) + continue; + + LoopInfo &LI = getAnalysis(*F).getLoopInfo(); + Changed |= runSVEAddressHoisting(*F, LI); + } + } + return Changed; } @@ -453,6 +821,26 @@ bool SVEIntrinsicOpts::runOnModule(Module &M) { case Intrinsic::vector_extract: case Intrinsic::vector_insert: case Intrinsic::aarch64_sve_ptrue: + case Intrinsic::aarch64_sve_ld1_gather_sxtw_index: + case Intrinsic::aarch64_sve_ld1_gather_uxtw_index: + case Intrinsic::aarch64_sve_ld1_gather_index: + case Intrinsic::aarch64_sve_ldff1_gather_sxtw_index: + case Intrinsic::aarch64_sve_ldff1_gather_uxtw_index: + case Intrinsic::aarch64_sve_ldff1_gather_index: + case Intrinsic::aarch64_sve_ldnt1_gather_index: + case Intrinsic::aarch64_sve_prfd_gather_sxtw_index: + case Intrinsic::aarch64_sve_prfd_gather_uxtw_index: + case Intrinsic::aarch64_sve_prfd_gather_index: + case Intrinsic::aarch64_sve_prfh_gather_sxtw_index: + case Intrinsic::aarch64_sve_prfh_gather_uxtw_index: + case Intrinsic::aarch64_sve_prfh_gather_index: + case Intrinsic::aarch64_sve_prfw_gather_sxtw_index: + case Intrinsic::aarch64_sve_prfw_gather_uxtw_index: + case Intrinsic::aarch64_sve_prfw_gather_index: + case Intrinsic::aarch64_sve_st1_scatter_sxtw_index: + case Intrinsic::aarch64_sve_st1_scatter_uxtw_index: + case Intrinsic::aarch64_sve_st1_scatter_index: + case 
Intrinsic::aarch64_sve_stnt1_scatter_index: for (User *U : F.users()) Functions.insert(cast(U)->getFunction()); break; diff --git a/llvm/test/CodeGen/AArch64/O3-pipeline.ll b/llvm/test/CodeGen/AArch64/O3-pipeline.ll index 3747b2581fa4..4ee13788c2c4 100644 --- a/llvm/test/CodeGen/AArch64/O3-pipeline.ll +++ b/llvm/test/CodeGen/AArch64/O3-pipeline.ll @@ -25,6 +25,7 @@ ; CHECK-NEXT: SVE intrinsics optimizations ; CHECK-NEXT: FunctionPass Manager ; CHECK-NEXT: Dominator Tree Construction +; CHECK-NEXT: Natural Loop Information ; CHECK-NEXT: FunctionPass Manager ; CHECK-NEXT: Simplify the CFG ; CHECK-NEXT: Dominator Tree Construction diff --git a/llvm/test/CodeGen/AArch64/aarch64-sve-addressing-peephole.ll b/llvm/test/CodeGen/AArch64/aarch64-sve-addressing-peephole.ll new file mode 100644 index 000000000000..85d286c165ca --- /dev/null +++ b/llvm/test/CodeGen/AArch64/aarch64-sve-addressing-peephole.ll @@ -0,0 +1,526 @@ +; RUN: llc -mtriple=aarch64-unknown -mcpu=hip09 -aarch64-sve-loop-address-chain-opt -O3 %s -o - | FileCheck %s + +define dso_local void @test_gather_multi_constOffset(i32 noundef %loopTime, ptr noundef %x, float noundef %ipx, float noundef %ipy, float noundef %ipz, ptr nocapture noundef nonnull align 4 dereferenceable(4) %tempx, ptr nocapture noundef nonnull align 4 dereferenceable(4) %tempy, ptr nocapture noundef nonnull align 4 dereferenceable(4) %tempz) local_unnamed_addr #0 { +; CHECK-LABEL: test_gather_multi_constOffset: +; CHECK: .LBB0_2: // %for.body +; CHECK: add x[[NEWBASE1:[0-9]+]], x1, #4 +; CHECK: ld1w { z{{[0-9]+}}.s }, p[[PG:[0-9]+]]/z, [x1, z{{[0-9]+}}.s, sxtw #2] +; CHECK: ld1w { z{{[0-9]+}}.s }, p[[PG]]/z, [x[[NEWBASE1]], z{{[0-9]+}}.s, sxtw #2] +; CHECK: add x[[NEWBASE2:[0-9]+]], x1, #8 +; CHECK: ld1w { z{{[0-9]+}}.s }, p[[PG]]/z, [x[[NEWBASE2]], z{{[0-9]+}}.s, sxtw #2] +entry: + %cmp18 = icmp sgt i32 %loopTime, 0 + br i1 %cmp18, label %for.body.lr.ph, label %for.cond.cleanup + +for.body.lr.ph: ; preds = %entry + %.splatinsert = insertelement poison, float %ipx, i64 0 + %.splat = shufflevector %.splatinsert, poison, zeroinitializer + %.splatinsert2 = insertelement poison, float %ipy, i64 0 + %.splat3 = shufflevector %.splatinsert2, poison, zeroinitializer + %.splatinsert5 = insertelement poison, float %ipz, i64 0 + %.splat6 = shufflevector %.splatinsert5, poison, zeroinitializer + %.tr = tail call i32 @llvm.vscale.i32() + %0 = shl nuw nsw i32 %.tr, 2 + br label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %for.body.lr.ph, %for.body + %jp.019 = phi i32 [ 0, %for.body.lr.ph ], [ %conv10, %for.body ] + %1 = tail call @llvm.aarch64.sve.whilelt.nxv4i1.i32(i32 %jp.019, i32 %loopTime) + %2 = tail call @llvm.aarch64.sve.index.nxv4i32(i32 %jp.019, i32 1) + %3 = select %1, %2, zeroinitializer + %4 = tail call @llvm.aarch64.sve.mul.nxv4i32( %1, %3, shufflevector ( insertelement ( poison, i32 3, i64 0), poison, zeroinitializer)) + %5 = tail call @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv4f32( %1, ptr %x, %4) + %6 = select %1, %5, zeroinitializer + %7 = tail call @llvm.aarch64.sve.fsubr.nxv4f32( %1, %6, %.splat) + %8 = select %1, %7, zeroinitializer + %9 = tail call @llvm.aarch64.sve.fmul.nxv4f32( %1, %8, %7) + %10 = select %1, %4, zeroinitializer + %11 = tail call @llvm.aarch64.sve.add.nxv4i32( %1, %10, shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer)) + %12 = tail call @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv4f32( %1, ptr %x, %11) + %13 = select %1, %12, zeroinitializer + %14 = tail call 
@llvm.aarch64.sve.fsub.nxv4f32( %1, %13, %.splat3) + %15 = select %1, %14, zeroinitializer + %16 = tail call @llvm.aarch64.sve.fmad.nxv4f32( %1, %15, %14, %9) + %17 = select %1, %11, zeroinitializer + %18 = tail call @llvm.aarch64.sve.add.nxv4i32( %1, %17, shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer)) + %19 = tail call @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv4f32( %1, ptr %x, %18) + %20 = select %1, %19, zeroinitializer + %21 = tail call @llvm.aarch64.sve.fsub.nxv4f32( %1, %20, %.splat6) + %22 = select %1, %21, zeroinitializer + %23 = tail call @llvm.aarch64.sve.fmad.nxv4f32( %1, %22, %21, %16) + %24 = tail call @llvm.aarch64.sve.fsqrt.nxv4f32( zeroinitializer, %1, %23) + %25 = tail call @llvm.aarch64.sve.fmul.nxv4f32( %1, %8, %24) + %26 = tail call float @llvm.aarch64.sve.faddv.nxv4f32( %1, %25) + %27 = load float, ptr %tempx, align 4, !tbaa !5 + %add = fadd float %26, %27 + store float %add, ptr %tempx, align 4, !tbaa !5 + %28 = tail call @llvm.aarch64.sve.fmul.nxv4f32( %1, %15, %24) + %29 = tail call float @llvm.aarch64.sve.faddv.nxv4f32( %1, %28) + %30 = load float, ptr %tempy, align 4, !tbaa !5 + %add7 = fadd float %29, %30 + store float %add7, ptr %tempy, align 4, !tbaa !5 + %31 = tail call @llvm.aarch64.sve.fmul.nxv4f32( %1, %22, %24) + %32 = tail call float @llvm.aarch64.sve.faddv.nxv4f32( %1, %31) + %33 = load float, ptr %tempz, align 4, !tbaa !5 + %add8 = fadd float %32, %33 + store float %add8, ptr %tempz, align 4, !tbaa !5 + %conv10 = add i32 %0, %jp.019 + %cmp = icmp slt i32 %conv10, %loopTime + br i1 %cmp, label %for.body, label %for.cond.cleanup, !llvm.loop !9 +} + +define dso_local void @test_scatter_constOffset(i32 noundef %loopTime, ptr noalias noundef %dst, ptr noalias nocapture noundef readonly %tempx, ptr noalias nocapture noundef readonly %tempy, ptr noalias nocapture noundef readonly %tempz) local_unnamed_addr #0 { +; CHECK-LABEL: test_scatter_constOffset: +; CHECK: .LBB1_2: // %for.body +; CHECK: add x[[NEWBASE1:[0-9]+]], x1, #4 +; CHECK: st1w { z{{[0-9]+}}.s }, p[[PG:[0-9]+]], [x1, z{{[0-9]+}}.s, sxtw #2] +; CHECK: st1w { z{{[0-9]+}}.s }, p[[PG]], [x[[NEWBASE1]], z{{[0-9]+}}.s, sxtw #2] +; CHECK: add x[[NEWBASE2:[0-9]+]], x1, #8 +; CHECK: st1w { z{{[0-9]+}}.s }, p[[PG]], [x[[NEWBASE2]], z{{[0-9]+}}.s, sxtw #2] +entry: + %cmp15 = icmp sgt i32 %loopTime, 0 + br i1 %cmp15, label %for.body.lr.ph, label %for.cond.cleanup + +for.body.lr.ph: ; preds = %entry + %.tr = tail call i32 @llvm.vscale.i32() + %0 = shl nuw nsw i32 %.tr, 2 + br label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %for.body.lr.ph, %for.body + %jp.016 = phi i32 [ 0, %for.body.lr.ph ], [ %conv5, %for.body ] + %1 = tail call @llvm.aarch64.sve.whilelt.nxv4i1.i32(i32 %jp.016, i32 %loopTime) + %2 = tail call @llvm.aarch64.sve.index.nxv4i32(i32 %jp.016, i32 1) + %3 = select %1, %2, zeroinitializer + %4 = tail call @llvm.aarch64.sve.mul.nxv4i32( %1, %3, shufflevector ( insertelement ( poison, i32 3, i64 0), poison, zeroinitializer)) + %5 = select %1, %4, zeroinitializer + %6 = tail call @llvm.aarch64.sve.add.nxv4i32( %1, %5, shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer)) + %7 = tail call @llvm.aarch64.sve.add.nxv4i32( %1, %5, shufflevector ( insertelement ( poison, i32 2, i64 0), poison, zeroinitializer)) + %idx.ext = sext i32 %jp.016 to i64 + %add.ptr = getelementptr inbounds float, ptr %tempx, i64 %idx.ext + %8 = tail call @llvm.masked.load.nxv4f32.p0(ptr %add.ptr, i32 1, %1, 
zeroinitializer), !tbaa !5 + %add.ptr2 = getelementptr inbounds float, ptr %tempy, i64 %idx.ext + %9 = tail call @llvm.masked.load.nxv4f32.p0(ptr %add.ptr2, i32 1, %1, zeroinitializer), !tbaa !5 + %add.ptr4 = getelementptr inbounds float, ptr %tempz, i64 %idx.ext + %10 = tail call @llvm.masked.load.nxv4f32.p0(ptr %add.ptr4, i32 1, %1, zeroinitializer), !tbaa !5 + tail call void @llvm.aarch64.sve.st1.scatter.sxtw.index.nxv4f32( %8, %1, ptr %dst, %4) + tail call void @llvm.aarch64.sve.st1.scatter.sxtw.index.nxv4f32( %9, %1, ptr %dst, %6) + tail call void @llvm.aarch64.sve.st1.scatter.sxtw.index.nxv4f32( %10, %1, ptr %dst, %7) + %conv5 = add i32 %0, %jp.016 + %cmp = icmp slt i32 %conv5, %loopTime + br i1 %cmp, label %for.body, label %for.cond.cleanup, !llvm.loop !11 +} + +define dso_local void @test_prefetch_constOffset(i32 noundef %loopTime, ptr nocapture noundef %data) local_unnamed_addr #4 { +; CHECK-LABEL: test_prefetch_constOffset: +; CHECK: // %bb.4: // %if.end +; CHECK: add x[[NEWBASE1:[0-9]+]], x1, #4 +; CHECK: prfw pldl1keep, p[[PG:[0-9]+]], [x1, z{{[0-9]+}}.s, sxtw #2] +; CHECK: prfw pldl1keep, p[[PG]], [x[[NEWBASE1]], z{{[0-9]+}}.s, sxtw #2] +; CHECK: add x[[NEWBASE2:[0-9]+]], x1, #8 +; CHECK: prfw pldl1keep, p[[PG]], [x[[NEWBASE2]], z{{[0-9]+}}.s, sxtw #2] +entry: + %.tr = tail call i32 @llvm.vscale.i32() + %conv = shl nuw nsw i32 %.tr, 2 + %cmp13 = icmp sgt i32 %loopTime, 0 + br i1 %cmp13, label %for.body.lr.ph, label %for.cond.cleanup + +for.body.lr.ph: ; preds = %entry + %0 = tail call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + br label %for.body + +for.cond.cleanup: ; preds = %cleanup, %entry + ret void + +for.body: ; preds = %for.body.lr.ph, %cleanup + %jp.014 = phi i32 [ 0, %for.body.lr.ph ], [ %add, %cleanup ] + %add = add i32 %jp.014, %conv + %1 = tail call @llvm.aarch64.sve.whilelt.nxv4i1.i32(i32 %add, i32 %loopTime) + %2 = tail call i1 @llvm.aarch64.sve.ptest.any.nxv4i1( %0, %1) + br i1 %2, label %if.end, label %cleanup + +if.end: ; preds = %for.body + %3 = tail call @llvm.aarch64.sve.index.nxv4i32(i32 %add, i32 1) + %4 = select %1, %3, zeroinitializer + %5 = tail call @llvm.aarch64.sve.mul.nxv4i32( %1, %4, shufflevector ( insertelement ( poison, i32 3, i64 0), poison, zeroinitializer)) + tail call void @llvm.aarch64.sve.prfw.gather.sxtw.index.nxv4i32( %1, ptr %data, %5, i32 0) + %6 = select %1, %5, zeroinitializer + %7 = tail call @llvm.aarch64.sve.add.nxv4i32( %1, %6, shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer)) + tail call void @llvm.aarch64.sve.prfw.gather.sxtw.index.nxv4i32( %1, ptr %data, %7, i32 0) + %8 = select %1, %7, zeroinitializer + %9 = tail call @llvm.aarch64.sve.add.nxv4i32( %1, %8, shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer)) + tail call void @llvm.aarch64.sve.prfw.gather.sxtw.index.nxv4i32( %1, ptr %data, %9, i32 0) + br label %cleanup + +cleanup: ; preds = %for.body, %if.end + %cmp = icmp slt i32 %add, %loopTime + br i1 %cmp, label %for.body, label %for.cond.cleanup, !llvm.loop !12 +} + +define dso_local void @test_stride_constOffset(i32 noundef %loopTime, ptr noundef %data, ptr nocapture noundef %result) local_unnamed_addr #0 { +; CHECK-LABEL: test_stride_constOffset: +; CHECK: .LBB3_2: // %for.body +; CHECK: add x[[NEWBASE1:[0-9]+]], x1, #8 +; CHECK: ld1w { z{{[0-9]+}}.s }, p[[PG:[0-9]+]]/z, [x1, z{{[0-9]+}}.s, sxtw #2] +; CHECK: ld1w { z{{[0-9]+}}.s }, p[[PG]]/z, [x[[NEWBASE1]], z{{[0-9]+}}.s, sxtw #2] +; CHECK: add x[[NEWBASE2:[0-9]+]], x1, #16 +; CHECK: ld1w { z{{[0-9]+}}.s }, 
p[[PG]]/z, [x[[NEWBASE2]], z{{[0-9]+}}.s, sxtw #2] +entry: + %cmp9 = icmp sgt i32 %loopTime, 0 + br i1 %cmp9, label %for.body.lr.ph, label %for.cond.cleanup + +for.body.lr.ph: ; preds = %entry + %.tr = tail call i32 @llvm.vscale.i32() + %0 = shl nuw nsw i32 %.tr, 2 + br label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %for.body.lr.ph, %for.body + %jp.010 = phi i32 [ 0, %for.body.lr.ph ], [ %conv1, %for.body ] + %1 = tail call @llvm.aarch64.sve.whilelt.nxv4i1.i32(i32 %jp.010, i32 %loopTime) + %2 = tail call @llvm.aarch64.sve.index.nxv4i32(i32 %jp.010, i32 1) + %3 = select %1, %2, zeroinitializer + %4 = tail call @llvm.aarch64.sve.mul.nxv4i32( %1, %3, shufflevector ( insertelement ( poison, i32 2, i64 0), poison, zeroinitializer)) + %5 = select %1, %4, zeroinitializer + %6 = tail call @llvm.aarch64.sve.add.nxv4i32( %1, %5, zeroinitializer) + %7 = tail call @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv4f32( %1, ptr %data, %6) + %8 = tail call @llvm.aarch64.sve.add.nxv4i32( %1, %5, shufflevector ( insertelement ( poison, i32 2, i64 0), poison, zeroinitializer)) + %9 = tail call @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv4f32( %1, ptr %data, %8) + %10 = tail call @llvm.aarch64.sve.add.nxv4i32( %1, %5, shufflevector ( insertelement ( poison, i32 4, i64 0), poison, zeroinitializer)) + %11 = tail call @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv4f32( %1, ptr %data, %10) + %12 = select %1, %7, zeroinitializer + %13 = tail call @llvm.aarch64.sve.fadd.nxv4f32( %1, %12, %9) + %14 = select %1, %13, zeroinitializer + %15 = tail call @llvm.aarch64.sve.fadd.nxv4f32( %1, %14, %11) + %idx.ext = sext i32 %jp.010 to i64 + %add.ptr = getelementptr inbounds float, ptr %result, i64 %idx.ext + tail call void @llvm.masked.store.nxv4f32.p0( %15, ptr %add.ptr, i32 1, %1), !tbaa !5 + %conv1 = add i32 %0, %jp.010 + %cmp = icmp slt i32 %conv1, %loopTime + br i1 %cmp, label %for.body, label %for.cond.cleanup, !llvm.loop !13 +} + +define dso_local void @test_invariantOffset32bit(i32 noundef %N, i32 noundef %M, ptr noundef %matrix, ptr nocapture noundef %result) local_unnamed_addr #0 { +; CHECK-LABEL: test_invariantOffset32bit: +; CHECK: .LBB4_5: // %for.body4 +; CHECK: add x[[NEWBASE1:[0-9]+]], x2, w{{[0-9]+}}, sxtw #2 +; CHECK: ld1w { z{{[0-9]+}}.s }, p[[PG:[0-9]+]]/z, [x[[NEWBASE1]], z{{[0-9]+}}.s, sxtw #2] +; CHECK: add x[[NEWBASE2:[0-9]+]], x2, w{{[0-9]+}}, sxtw #2 +; CHECK: ld1w { z{{[0-9]+}}.s }, p[[PG]]/z, [x[[NEWBASE2]], z{{[0-9]+}}.s, sxtw #2] +; CHECK: add x[[NEWBASE3:[0-9]+]], x2, w{{[0-9]+}}, sxtw #2 +; CHECK: ld1w { z{{[0-9]+}}.s }, p[[PG]]/z, [x[[NEWBASE3]], z{{[0-9]+}}.s, sxtw #2] +entry: + %cmp41 = icmp sgt i32 %N, 2 + br i1 %cmp41, label %for.cond1.preheader.lr.ph, label %for.cond.cleanup + +for.cond1.preheader.lr.ph: ; preds = %entry + %div51 = udiv i32 %N, 3 + %cmp239 = icmp sgt i32 %M, 0 + %0 = sext i32 %M to i64 + %wide.trip.count = zext i32 %div51 to i64 + br label %for.cond1.preheader + +for.cond1.preheader: ; preds = %for.cond1.preheader.lr.ph, %for.cond.cleanup3 + %indvars.iv = phi i64 [ 0, %for.cond1.preheader.lr.ph ], [ %indvars.iv.next, %for.cond.cleanup3 ] + br i1 %cmp239, label %for.body4.lr.ph, label %for.cond.cleanup3 + +for.body4.lr.ph: ; preds = %for.cond1.preheader + %1 = mul nuw nsw i64 %indvars.iv, 3 + %2 = trunc i64 %1 to i32 + %3 = mul i32 %2, %M + %.splatinsert = insertelement poison, i32 %3, i64 0 + %.splat = shufflevector %.splatinsert, poison, zeroinitializer + %4 = trunc i64 %1 to i32 + %5 = add i32 %4, 1 + %6 = mul i32 %5, %M + 
%.splatinsert9 = insertelement poison, i32 %6, i64 0 + %.splat10 = shufflevector %.splatinsert9, poison, zeroinitializer + %7 = trunc i64 %1 to i32 + %8 = add i32 %7, 2 + %9 = mul i32 %8, %M + %.splatinsert14 = insertelement poison, i32 %9, i64 0 + %.splat15 = shufflevector %.splatinsert14, poison, zeroinitializer + %10 = mul nsw i64 %indvars.iv, %0 + %add.ptr = getelementptr inbounds float, ptr %result, i64 %10 + %.tr = tail call i32 @llvm.vscale.i32() + %11 = shl nuw nsw i32 %.tr, 2 + br label %for.body4 + +for.cond.cleanup: ; preds = %for.cond.cleanup3, %entry + ret void + +for.cond.cleanup3: ; preds = %for.body4, %for.cond1.preheader + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.cond.cleanup, label %for.cond1.preheader, !llvm.loop !14 + +for.body4: ; preds = %for.body4.lr.ph, %for.body4 + %jp.040 = phi i32 [ 0, %for.body4.lr.ph ], [ %conv20, %for.body4 ] + %12 = tail call @llvm.aarch64.sve.whilelt.nxv4i1.i32(i32 %jp.040, i32 %M) + %13 = tail call @llvm.aarch64.sve.index.nxv4i32(i32 %jp.040, i32 1) + %14 = select %12, %13, zeroinitializer + %15 = tail call @llvm.aarch64.sve.add.nxv4i32( %12, %14, %.splat) + %16 = tail call @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv4f32( %12, ptr %matrix, %15) + %17 = tail call @llvm.aarch64.sve.add.nxv4i32( %12, %14, %.splat10) + %18 = tail call @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv4f32( %12, ptr %matrix, %17) + %19 = tail call @llvm.aarch64.sve.add.nxv4i32( %12, %14, %.splat15) + %20 = tail call @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv4f32( %12, ptr %matrix, %19) + %21 = select %12, %16, zeroinitializer + %22 = tail call @llvm.aarch64.sve.fadd.nxv4f32( %12, %21, %18) + %23 = select %12, %22, zeroinitializer + %24 = tail call @llvm.aarch64.sve.fadd.nxv4f32( %12, %23, %20) + %25 = select %12, %24, zeroinitializer + %26 = tail call @llvm.aarch64.sve.fdiv.nxv4f32( %12, %25, shufflevector ( insertelement ( poison, float 3.000000e+00, i64 0), poison, zeroinitializer)) + %idx.ext17 = sext i32 %jp.040 to i64 + %add.ptr18 = getelementptr inbounds float, ptr %add.ptr, i64 %idx.ext17 + tail call void @llvm.masked.store.nxv4f32.p0( %26, ptr %add.ptr18, i32 1, %12), !tbaa !5 + %conv20 = add i32 %11, %jp.040 + %cmp2 = icmp slt i32 %conv20, %M + br i1 %cmp2, label %for.body4, label %for.cond.cleanup3, !llvm.loop !15 +} + +define dso_local void @test_invariantOffset64bit(i64 noundef %N, i64 noundef %M, ptr noundef %matrix, ptr nocapture noundef %result) local_unnamed_addr #0 { +; CHECK-LABEL: test_invariantOffset64bit: +; CHECK: .LBB5_6: // %for.body4 +; CHECK: add x[[NEWBASE1:[0-9]+]], x2, x{{[0-9]+}}, lsl #3 +; CHECK: ld1d { z{{[0-9]+}}.d }, p[[PG:[0-9]+]]/z, [x[[NEWBASE1]], z{{[0-9]+}}.d, lsl #3] +; CHECK: add x[[NEWBASE2:[0-9]+]], x2, x{{[0-9]+}}, lsl #3 +; CHECK: ld1d { z{{[0-9]+}}.d }, p[[PG]]/z, [x[[NEWBASE2]], z{{[0-9]+}}.d, lsl #3] +; CHECK: add x[[NEWBASE3:[0-9]+]], x2, x{{[0-9]+}}, lsl #3 +; CHECK: ld1d { z{{[0-9]+}}.d }, p[[PG]]/z, [x[[NEWBASE3]], z{{[0-9]+}}.d, lsl #3] +entry: + %cmp39.not = icmp ult i64 %N, 3 + br i1 %cmp39.not, label %for.cond.cleanup, label %for.cond1.preheader.lr.ph + +for.cond1.preheader.lr.ph: ; preds = %entry + %div = udiv i64 %N, 3 + %cmp237.not = icmp eq i64 %M, 0 + br label %for.cond1.preheader + +for.cond1.preheader: ; preds = %for.cond1.preheader.lr.ph, %for.cond.cleanup3 + %i.040 = phi i64 [ 0, %for.cond1.preheader.lr.ph ], [ %inc, %for.cond.cleanup3 ] + br i1 %cmp237.not, label %for.cond.cleanup3, label 
%for.body4.lr.ph + +for.body4.lr.ph: ; preds = %for.cond1.preheader + %mul = mul nuw i64 %i.040, 3 + %mul5 = mul i64 %mul, %M + %.splatinsert = insertelement poison, i64 %mul5, i64 0 + %.splat = shufflevector %.splatinsert, poison, zeroinitializer + %add7 = add nuw i64 %mul, 1 + %mul8 = mul i64 %add7, %M + %.splatinsert9 = insertelement poison, i64 %mul8, i64 0 + %.splat10 = shufflevector %.splatinsert9, poison, zeroinitializer + %add12 = add nuw i64 %mul, 2 + %mul13 = mul i64 %add12, %M + %.splatinsert14 = insertelement poison, i64 %mul13, i64 0 + %.splat15 = shufflevector %.splatinsert14, poison, zeroinitializer + %mul16 = mul i64 %i.040, %M + %add.ptr = getelementptr inbounds double, ptr %result, i64 %mul16 + %0 = tail call i64 @llvm.vscale.i64() + %1 = shl nuw nsw i64 %0, 1 + br label %for.body4 + +for.cond.cleanup: ; preds = %for.cond.cleanup3, %entry + ret void + +for.cond.cleanup3: ; preds = %for.body4, %for.cond1.preheader + %inc = add nuw nsw i64 %i.040, 1 + %exitcond.not = icmp eq i64 %inc, %div + br i1 %exitcond.not, label %for.cond.cleanup, label %for.cond1.preheader, !llvm.loop !16 + +for.body4: ; preds = %for.body4.lr.ph, %for.body4 + %jp.038 = phi i64 [ 0, %for.body4.lr.ph ], [ %add18, %for.body4 ] + %2 = tail call @llvm.aarch64.sve.whilelo.nxv2i1.i64(i64 %jp.038, i64 %M) + %3 = tail call @llvm.aarch64.sve.index.nxv2i64(i64 %jp.038, i64 1) + %4 = select %2, %3, zeroinitializer + %5 = tail call @llvm.aarch64.sve.add.nxv2i64( %2, %4, %.splat) + %6 = tail call @llvm.aarch64.sve.ld1.gather.index.nxv2f64( %2, ptr %matrix, %5) + %7 = tail call @llvm.aarch64.sve.add.nxv2i64( %2, %4, %.splat10) + %8 = tail call @llvm.aarch64.sve.ld1.gather.index.nxv2f64( %2, ptr %matrix, %7) + %9 = tail call @llvm.aarch64.sve.add.nxv2i64( %2, %4, %.splat15) + %10 = tail call @llvm.aarch64.sve.ld1.gather.index.nxv2f64( %2, ptr %matrix, %9) + %11 = select %2, %6, zeroinitializer + %12 = tail call @llvm.aarch64.sve.fadd.nxv2f64( %2, %11, %8) + %13 = select %2, %12, zeroinitializer + %14 = tail call @llvm.aarch64.sve.fadd.nxv2f64( %2, %13, %10) + %15 = select %2, %14, zeroinitializer + %16 = tail call @llvm.aarch64.sve.fdiv.nxv2f64( %2, %15, shufflevector ( insertelement ( poison, double 3.000000e+00, i64 0), poison, zeroinitializer)) + %add.ptr17 = getelementptr inbounds double, ptr %add.ptr, i64 %jp.038 + tail call void @llvm.masked.store.nxv2f64.p0( %16, ptr %add.ptr17, i32 1, %2), !tbaa !17 + %add18 = add i64 %1, %jp.038 + %cmp2 = icmp ult i64 %add18, %M + br i1 %cmp2, label %for.body4, label %for.cond.cleanup3, !llvm.loop !19 +} + +define dso_local void @test_svaddx_constOffset(ptr noundef %base, %index) local_unnamed_addr #0 { +; CHECK-LABEL: test_svaddx_constOffset: +; CHECK: .LBB6_1: // %for.body +; CHECK: add x[[NEWBASE1:[0-9]+]], x0, #40 +; CHECK: ld1w { z{{[0-9]+}}.s }, p[[PG:[0-9]+]]/z, [x[[NEWBASE1]], z{{[0-9]+}}.s, uxtw #2] +; CHECK: add x[[NEWBASE2:[0-9]+]], x0, #44 +; CHECK: ld1w { z{{[0-9]+}}.s }, p[[PG]]/z, [x[[NEWBASE2]], z{{[0-9]+}}.s, uxtw #2] +entry: + %0 = tail call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret void + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next.1, %for.body ] + %index.addr.05 = phi [ %index, %entry ], [ %8, %for.body ] + %1 = tail call @llvm.aarch64.sve.add.u.nxv4i32( %0, %index.addr.05, shufflevector ( insertelement ( poison, i32 10, i64 0), poison, zeroinitializer)) + %2 = tail call @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv4i32( %0, ptr %base, %1) 
+ %3 = shl nuw nsw i64 %indvars.iv, 4 + %add.ptr = getelementptr inbounds i32, ptr %base, i64 %3 + store %2, ptr %add.ptr, align 16, !tbaa !20 + %4 = tail call @llvm.aarch64.sve.add.u.nxv4i32( %0, %index.addr.05, shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer)) + %5 = tail call @llvm.aarch64.sve.add.u.nxv4i32( %0, %4, shufflevector ( insertelement ( poison, i32 10, i64 0), poison, zeroinitializer)) + %6 = tail call @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv4i32( %0, ptr %base, %5) + %indvars.iv.next = shl i64 %indvars.iv, 4 + %7 = or i64 %indvars.iv.next, 16 + %add.ptr.1 = getelementptr inbounds i32, ptr %base, i64 %7 + store %6, ptr %add.ptr.1, align 16, !tbaa !20 + %8 = tail call @llvm.aarch64.sve.add.u.nxv4i32( %0, %4, shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer)) + %indvars.iv.next.1 = add nuw nsw i64 %indvars.iv, 2 + %exitcond.not.1 = icmp eq i64 %indvars.iv.next.1, 100 + br i1 %exitcond.not.1, label %for.cond.cleanup, label %for.body, !llvm.loop !22 +} + +define dso_local void @_Z26test_loop_invariant_offsetPlu11__SVInt64_tl(ptr noundef %base, %index, i64 noundef %invariant_offset) local_unnamed_addr #6 { +; CHECK-LABEL: _Z26test_loop_invariant_offsetPlu11__SVInt64_tl: +; CHECK: add x[[NEWBASE:[0-9]+]], x0, x1, lsl #3 +; CHECK: st1d { z{{[0-9]+}}.d }, p{{[0-9]+}}, [x[[NEWBASE]], z{{[0-9]+}}.d, lsl #3] +entry: + %0 = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %.splatinsert = insertelement poison, i64 %invariant_offset, i64 0 + %1 = shufflevector %.splatinsert, poison, zeroinitializer + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret void + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %index.addr.05 = phi [ %index, %entry ], [ %4, %for.body ] + %2 = tail call @llvm.aarch64.sve.add.u.nxv2i64( %0, %index.addr.05, %1) + %.splatinsert3 = insertelement poison, i64 %indvars.iv, i64 0 + %3 = shufflevector %.splatinsert3, poison, zeroinitializer + tail call void @llvm.aarch64.sve.st1.scatter.index.nxv2i64( %3, %0, ptr %base, %2) + %4 = tail call @llvm.aarch64.sve.add.u.nxv2i64( %0, %index.addr.05, shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer)) + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, 100 + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !23 +} + +define dso_local void @test_combined_const_and_invariant_offset(ptr noundef %base, %index, i32 noundef %invariant_offset) local_unnamed_addr #0 { +; CHECK-LABEL: test_combined_const_and_invariant_offset: +; CHECK: add x[[NEWBASE_GPR:[0-9]+]], x0, w1, sxtw #2 +; CHECK: add x[[NEWBASE_FINAL:[0-9]+]], x[[NEWBASE_GPR]], #40 +; CHECK: ld1w { z{{[0-9]+}}.s }, p[[PG:[0-9]+]]/z, [x[[NEWBASE_FINAL]], z{{[0-9]+}}.s, sxtw #2] +entry: + %0 = tail call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %.splatinsert = insertelement poison, i32 %invariant_offset, i64 0 + %.splat = shufflevector %.splatinsert, poison, zeroinitializer + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret void + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %index.addr.06 = phi [ %index, %entry ], [ %7, %for.body ] + %1 = select %0, %index.addr.06, zeroinitializer + %2 = tail call @llvm.aarch64.sve.add.u.nxv4i32( %0, %1, shufflevector ( insertelement ( poison, i32 10, i64 0), poison, zeroinitializer)) + %3 = select %0, %2, zeroinitializer + 
%4 = tail call @llvm.aarch64.sve.add.u.nxv4i32( %0, %3, %.splat) + %5 = tail call @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv4i32( %0, ptr %base, %4) + %6 = shl nuw nsw i64 %indvars.iv, 4 + %add.ptr = getelementptr inbounds i32, ptr %base, i64 %6 + store %5, ptr %add.ptr, align 16, !tbaa !20 + %7 = tail call @llvm.aarch64.sve.add.u.nxv4i32( %0, %index.addr.06, shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer)) + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, 100 + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !24 +} + +declare @llvm.aarch64.sve.whilelt.nxv4i1.i32(i32, i32) #1 +declare @llvm.aarch64.sve.index.nxv4i32(i32, i32) #1 +declare @llvm.aarch64.sve.mul.nxv4i32(, , ) #1 +declare @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv4f32(, ptr, ) #2 +declare @llvm.aarch64.sve.fsubr.nxv4f32(, , ) #1 +declare @llvm.aarch64.sve.fmul.nxv4f32(, , ) #1 +declare @llvm.aarch64.sve.add.nxv4i32(, , ) #1 +declare @llvm.aarch64.sve.fsub.nxv4f32(, , ) #1 +declare @llvm.aarch64.sve.fmad.nxv4f32(, , , ) #1 +declare @llvm.aarch64.sve.fsqrt.nxv4f32(, , ) #1 +declare float @llvm.aarch64.sve.faddv.nxv4f32(, ) #1 +declare void @llvm.aarch64.sve.st1.scatter.sxtw.index.nxv4f32(, , ptr, ) #3 +declare @llvm.aarch64.sve.ptrue.nxv4i1(i32 immarg) #1 +declare void @llvm.aarch64.sve.prfw.gather.sxtw.index.nxv4i32(, ptr nocapture, , i32 immarg) #5 +declare @llvm.aarch64.sve.fadd.nxv4f32(, , ) #1 +declare @llvm.aarch64.sve.fdiv.nxv4f32(, , ) #1 +declare @llvm.aarch64.sve.whilelo.nxv2i1.i64(i64, i64) #1 +declare @llvm.aarch64.sve.index.nxv2i64(i64, i64) #1 +declare @llvm.aarch64.sve.add.nxv2i64(, , ) #1 +declare @llvm.aarch64.sve.ld1.gather.index.nxv2f64(, ptr, ) #2 +declare @llvm.aarch64.sve.fadd.nxv2f64(, , ) #1 +declare @llvm.aarch64.sve.fdiv.nxv2f64(, , ) #1 +declare @llvm.aarch64.sve.add.u.nxv4i32(, , ) #1 +declare @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv4i32(, ptr, ) #2 +declare @llvm.aarch64.sve.ptrue.nxv2i1(i32 immarg) #1 +declare @llvm.aarch64.sve.add.u.nxv2i64(, , ) #1 +declare void @llvm.aarch64.sve.st1.scatter.index.nxv2i64(, , ptr, ) #3 +declare @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv4i32(, ptr, ) #2 +declare i64 @llvm.vscale.i64() #7 +declare i32 @llvm.vscale.i32() #7 +declare @llvm.masked.load.nxv4f32.p0(ptr nocapture, i32 immarg, , ) #8 +declare i1 @llvm.aarch64.sve.ptest.any.nxv4i1(, ) #7 +declare void @llvm.masked.store.nxv4f32.p0(, ptr nocapture, i32 immarg, ) #9 +declare void @llvm.masked.store.nxv2f64.p0(, ptr nocapture, i32 immarg, ) #9 + +attributes #0 = { mustprogress nofree nosync nounwind memory(argmem: readwrite) uwtable vscale_range(1,16) "frame-pointer"="non-leaf" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="hip09" "target-features"="+aes,+bf16,+crc,+dotprod,+f32mm,+f64mm,+fp-armv8,+fp16fml,+fullfp16,+i8mm,+lse,+neon,+ras,+rcpc,+rdm,+sha2,+sha3,+sm4,+spe,+sve,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8.5a,+v8a,-fmv" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) } +attributes #2 = { mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: read) } +attributes #3 = { mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: write) } +attributes #4 = { mustprogress nofree nosync nounwind memory(argmem: readwrite, inaccessiblemem: readwrite) uwtable vscale_range(1,16) "frame-pointer"="non-leaf" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="hip09" 
"target-features"="+aes,+bf16,+crc,+dotprod,+f32mm,+f64mm,+fp-armv8,+fp16fml,+fullfp16,+i8mm,+lse,+neon,+ras,+rcpc,+rdm,+sha2,+sha3,+sm4,+spe,+sve,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8.5a,+v8a,-fmv" } +attributes #5 = { mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: readwrite, inaccessiblemem: readwrite) } +attributes #6 = { mustprogress nofree nosync nounwind memory(argmem: write) uwtable vscale_range(1,16) "frame-pointer"="non-leaf" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="hip09" "target-features"="+aes,+bf16,+crc,+dotprod,+f32mm,+f64mm,+fp-armv8,+fp16fml,+fullfp16,+i8mm,+lse,+neon,+ras,+rcpc,+rdm,+sha2,+sha3,+sm4,+spe,+sve,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8.5a,+v8a,-fmv" } +attributes #7 = { nocallback nofree nosync nounwind willreturn memory(none) } +attributes #8 = { nocallback nofree nosync nounwind willreturn memory(argmem: read) } +attributes #9 = { nocallback nofree nosync nounwind willreturn memory(argmem: write) } + +!llvm.module.flags = !{!0, !1, !2, !3, !4} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 8, !"PIC Level", i32 2} +!2 = !{i32 7, !"PIE Level", i32 2} +!3 = !{i32 7, !"uwtable", i32 2} +!4 = !{i32 7, !"frame-pointer", i32 1} +!5 = !{!6, !6, i64 0} +!6 = !{!"float", !7, i64 0} +!7 = !{!"omnipotent char", !8, i64 0} +!8 = !{!"Simple C++ TBAA"} +!9 = distinct !{!9, !10} +!10 = !{!"llvm.loop.mustprogress"} +!11 = distinct !{!11, !10} +!12 = distinct !{!12, !10} +!13 = distinct !{!13, !10} +!14 = distinct !{!14, !10} +!15 = distinct !{!15, !10} +!16 = distinct !{!16, !10} +!17 = !{!18, !18, i64 0} +!18 = !{!"double", !7, i64 0} +!19 = distinct !{!19, !10} +!20 = !{!21, !21, i64 0} +!21 = !{!"int", !7, i64 0} +!22 = distinct !{!22, !10} +!23 = distinct !{!23, !10} +!24 = distinct !{!24, !10} -- Gitee From e0fa4ddfe0f886497f90811a6ec595ed2651e27b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E5=93=B2=E6=B5=A9?= <2209576006@qq.com> Date: Sun, 28 Sep 2025 23:53:43 +0800 Subject: [PATCH 2/2] [AArch64]: Add SVE loop addressing optimizations 1. Simplifies SVE gather/scatter address svadd chains in loops by rewriting them to be hoistable by MachineLICM. 2. Simplifies SVE svindex+svmul patterns in loops via strength reduction. 
--- .../Target/AArch64/AArch64MIPeepholeOpt.cpp | 926 ++++++++++++++++++ llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp | 388 -------- llvm/test/CodeGen/AArch64/O3-pipeline.ll | 1 - .../aarch64-sve-addressing-peephole.ll | 16 +- .../AArch64/aarch64-sve-index-mul-simplify.ll | 48 + 5 files changed, 983 insertions(+), 396 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/aarch64-sve-index-mul-simplify.ll diff --git a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp index 87aa3b98d938..683e491f5724 100644 --- a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp +++ b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp @@ -65,7 +65,9 @@ #include "AArch64ExpandImm.h" #include "AArch64InstrInfo.h" +#include "AArch64Subtarget.h" #include "MCTargetDesc/AArch64AddressingModes.h" +#include "llvm/ADT/SetVector.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineLoopInfo.h" @@ -73,6 +75,15 @@ using namespace llvm; #define DEBUG_TYPE "aarch64-mi-peephole-opt" +static cl::opt EnableSVELoopAddressChainOpt( + "aarch64-sve-loop-address-chain-opt", cl::init(false), cl::Hidden, + cl::desc( + "Enable simplification of SVE address computation chains in loops")); + +static cl::opt EnableSVEIndexMultiplyOpt( + "aarch64-sve-simplify-index-multiply", cl::init(false), cl::Hidden, + cl::desc("Enable simplification of SVE svindex+svmul patterns in loops")); + namespace { struct AArch64MIPeepholeOpt : public MachineFunctionPass { @@ -94,6 +105,12 @@ struct AArch64MIPeepholeOpt : public MachineFunctionPass { using BuildMIFunc = std::function; + using InstAndOffset = std::pair; + using ChainKey = std::tuple; + using ChainMap = DenseMap>; + using ConstOffsetKey = std::pair; + // Define an enum for the SVE offset type. 
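+  // SXTW/UXTW: 32-bit vector offsets sign-/zero-extended to 64 bits;
+  // D64: native 64-bit vector offsets; NOT_APPLICABLE: not a gather/scatter.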
+ enum class SVEOffsetType { NOT_APPLICABLE, SXTW, UXTW, D64 }; /// For instructions where an immediate operand could be split into two /// separate immediate instructions, use the splitTwoPartImm two handle the @@ -127,6 +144,29 @@ struct AArch64MIPeepholeOpt : public MachineFunctionPass { bool visitINSERT(MachineInstr &MI); bool visitINSviGPR(MachineInstr &MI, unsigned Opc); bool visitINSvi64lane(MachineInstr &MI); + bool isLoopInvariant(Register Reg, MachineLoop *L); + bool isConstantVector(Register Reg, int64_t &Value); + bool isInvariantBroadcastGPR(Register VecReg, MachineLoop *L, Register &GPR); + unsigned getElementSizeInBytes(const MachineInstr &MI, + SVEOffsetType *OffsetKind); + void traceIndexChain(Register IndexReg, Register &RootIndex, + int64_t &AccumulatedOffset, Register &InvariantGPROffset, + MachineLoop *L, + SmallVectorImpl &ChainsInsts); + void collectOptimizationCandidates(MachineLoop *L, ChainMap &Chains, + SetVector &CandidateDeadInsts); + Register getGPRBase( + MachineBasicBlock *MBB, Register BaseReg, Register InvariantGPROffset, + MachineInstr &UseMI, + DenseMap &BlockToGPRBaseMap); + Register getFinalBase(MachineInstr &MI, int64_t ElemOffset, + Register BaseForConst, + DenseMap &FinalBaseMap); + bool rewriteConstantAddressComputations(MachineLoop *L, + const ChainMap &Chains); + bool cleanupDeadSVECode(SetVector &CandidateDeadInsts); + bool simplifySVEIndexMultiply(MachineLoop *L); + bool processSVELoopAddressing(MachineLoop *L); bool runOnMachineFunction(MachineFunction &MF) override; StringRef getPassName() const override { @@ -670,6 +710,873 @@ bool AArch64MIPeepholeOpt::visitINSvi64lane(MachineInstr &MI) { return true; } +// Check if Reg is a loop invariant to Loop L +bool AArch64MIPeepholeOpt::isLoopInvariant(Register Reg, MachineLoop *L) { + if (!Reg.isVirtual()) + return false; + MachineInstr *Def = MRI->getVRegDef(Reg); + if (!Def) + return true; + return !L->contains(Def->getParent()); +} + +// Check if a vector register represents a constant value +// and retrieve that constant value if it exists +bool AArch64MIPeepholeOpt::isConstantVector(Register Reg, int64_t &Value) { + if (!Reg.isVirtual()) + return false; + + MachineInstr *Def = MRI->getVRegDef(Reg); + if (!Def) + return false; + + // Match the DUP instruction pattern: %Def = DUP_ZI_S Imm, 0 + // This instruction broadcasts the immediate value to all vector elements + unsigned DupOp = Def->getOpcode(); + if (DupOp == AArch64::DUP_ZI_S || DupOp == AArch64::DUP_ZI_D) { + Value = Def->getOperand(1).getImm(); + return true; + } + return false; +} + +// Checks if a vector register is broadcasted from a loop-invariant GPR +// Matches instruction pattern: %VecReg = DUP_ZR_S/D %GPR +// Where %GPR is loop-invariant to loop L +bool AArch64MIPeepholeOpt::isInvariantBroadcastGPR(Register VecReg, + MachineLoop *L, + Register &GPR) { + if (!VecReg.isVirtual()) + return false; + + MachineInstr *Def = MRI->getVRegDef(VecReg); + if (!Def) + return false; + + unsigned DupOp = Def->getOpcode(); + if (DupOp == AArch64::DUP_ZR_S || DupOp == AArch64::DUP_ZR_D) { + Register SrcGPR = Def->getOperand(1).getReg(); + if (isLoopInvariant(SrcGPR, L)) { + GPR = SrcGPR; + return true; + } + } + return false; +} + +// Returns element size in bytes for gather/scatter instructions +// Returns 0 for non-gather/scatter instructions +unsigned +AArch64MIPeepholeOpt::getElementSizeInBytes(const MachineInstr &MI, + SVEOffsetType *OffsetKind) { + switch (MI.getOpcode()) { + // --- Element Size: 2 Bytes (Half-Word) --- + case 
AArch64::GLD1H_D_SCALED: + case AArch64::GLD1SH_D_SCALED: + case AArch64::GLDFF1H_D_SCALED: + case AArch64::GLDFF1SH_D_SCALED: + case AArch64::LDNT1H_ZZR_D_REAL: + case AArch64::LDNT1SH_ZZR_D_REAL: + case AArch64::SST1H_D_SCALED: + case AArch64::STNT1H_ZZR_D_REAL: + *OffsetKind = SVEOffsetType::D64; + return 2; + case AArch64::GLD1H_S_SXTW_SCALED: + case AArch64::GLD1SH_S_SXTW_SCALED: + case AArch64::GLDFF1H_S_SXTW_SCALED: + case AArch64::GLDFF1SH_S_SXTW_SCALED: + case AArch64::SST1H_S_SXTW_SCALED: + *OffsetKind = SVEOffsetType::SXTW; + return 2; + case AArch64::GLD1H_S_UXTW_SCALED: + case AArch64::GLD1SH_S_UXTW_SCALED: + case AArch64::GLDFF1H_S_UXTW_SCALED: + case AArch64::GLDFF1SH_S_UXTW_SCALED: + case AArch64::SST1H_S_UXTW_SCALED: + *OffsetKind = SVEOffsetType::UXTW; + return 2; + + // --- Element Size: 4 Bytes (Word) --- + case AArch64::GLD1SW_D_SCALED: + case AArch64::GLD1W_D_SCALED: + case AArch64::GLDFF1SW_D_SCALED: + case AArch64::GLDFF1W_D_SCALED: + case AArch64::LDNT1SW_ZZR_D_REAL: + case AArch64::LDNT1W_ZZR_D_REAL: + case AArch64::SST1W_D_SCALED: + case AArch64::STNT1W_ZZR_D_REAL: + *OffsetKind = SVEOffsetType::D64; + return 4; + case AArch64::GLD1W_SXTW_SCALED: + case AArch64::GLDFF1W_SXTW_SCALED: + case AArch64::PRFW_S_SXTW_SCALED: + case AArch64::SST1W_SXTW_SCALED: + *OffsetKind = SVEOffsetType::SXTW; + return 4; + case AArch64::GLD1W_UXTW_SCALED: + case AArch64::GLDFF1W_UXTW_SCALED: + case AArch64::PRFW_S_UXTW_SCALED: + case AArch64::SST1W_UXTW_SCALED: + *OffsetKind = SVEOffsetType::UXTW; + return 4; + + // --- Element Size: 8 Bytes (Double-Word) --- + case AArch64::GLD1D_SCALED: + case AArch64::GLDFF1D_SCALED: + case AArch64::LDNT1D_ZZR_D_REAL: + case AArch64::PRFW_D_SCALED: + case AArch64::SST1D_SCALED: + case AArch64::STNT1D_ZZR_D_REAL: + *OffsetKind = SVEOffsetType::D64; + return 8; + case AArch64::GLD1D_SXTW_SCALED: + case AArch64::SST1D_SXTW_SCALED: + *OffsetKind = SVEOffsetType::SXTW; + return 8; + case AArch64::GLD1D_UXTW_SCALED: + case AArch64::SST1D_UXTW_SCALED: + *OffsetKind = SVEOffsetType::UXTW; + return 8; + default: + StringRef InstName = TII->getName(MI.getOpcode()); + if (InstName.startswith("GLD") || InstName.startswith("SST") || + InstName.startswith("LDNT") || InstName.startswith("STNT") || + InstName.startswith("PRFW")) { + LLVM_DEBUG(dbgs() << "SVELoopAddressHoisting: Unhandled SVE " + "gather/scatter-like instruction found: " + << MI); + } + + *OffsetKind = SVEOffsetType::NOT_APPLICABLE; + return 0; + } +} + +// Traces index chain to discover: +// - Root index register +// - Accumulated constant offset +// - Loop-invariant GPR offset component +// - And collects the chain instructions for potential deletion +void AArch64MIPeepholeOpt::traceIndexChain( + Register IndexReg, Register &RootIndex, int64_t &AccumulatedOffset, + Register &InvariantGPROffset, MachineLoop *L, + SmallVectorImpl &ChainInsts) { + AccumulatedOffset = 0; + InvariantGPROffset = Register(0); + Register CurrentReg = IndexReg; + + while (true) { + if (!CurrentReg.isVirtual()) + break; + + MachineInstr *Def = MRI->getVRegDef(CurrentReg); + // Index must be defined within loop as induction variable + if (!Def || !L->contains(Def->getParent())) + break; + + // Match svadd index increment pattern: + // %index = ADD_ZI_[S/D] %prev_index, %offset, %pg + // %index = ADD_ZZZ_D %prev_index, %offset + // %index = ADD_ZPZZ_[S/D]_ZERO %pg, %prev_index, %offset + unsigned IndexOp = Def->getOpcode(); + if (IndexOp == AArch64::ADD_ZI_S || IndexOp == AArch64::ADD_ZI_D) { + int64_t ConstValue = 
Def->getOperand(2).getImm(); + AccumulatedOffset += ConstValue; + CurrentReg = Def->getOperand(1).getReg(); + ChainInsts.push_back(Def); + continue; + } + + Register Op1, Op2; + if (IndexOp == AArch64::ADD_ZZZ_S || IndexOp == AArch64::ADD_ZZZ_D) { + Op1 = Def->getOperand(1).getReg(); + Op2 = Def->getOperand(2).getReg(); + } else if (IndexOp == AArch64::ADD_ZPZZ_S_ZERO || + IndexOp == AArch64::ADD_ZPZZ_D_ZERO || + IndexOp == AArch64::ADD_ZPmZ_S || + IndexOp == AArch64::ADD_ZPmZ_D) { + Op1 = Def->getOperand(2).getReg(); + Op2 = Def->getOperand(3).getReg(); + } else { + break; + } + + int64_t ConstValue; + Register InvariantGPR; + + // Op2 case 1: Constant vector offset + if (isConstantVector(Op2, ConstValue)) { + AccumulatedOffset += ConstValue; + CurrentReg = Op1; + ChainInsts.push_back(MRI->getVRegDef(Op2)); + ChainInsts.push_back(Def); + continue; + } + + // Op2 case 2: Loop-invariant GPR broadcast offset + if (isInvariantBroadcastGPR(Op2, L, InvariantGPR)) { + if (InvariantGPROffset != 0) { + LLVM_DEBUG( + dbgs() << "Found multiple GPR invariants, aborting trace.\n"); + break; + } + InvariantGPROffset = InvariantGPR; + CurrentReg = Op1; + ChainInsts.push_back(MRI->getVRegDef(Op2)); + ChainInsts.push_back(Def); + continue; + } + + // Op1 case 1: Constant vector offset + if (isConstantVector(Op1, ConstValue)) { + AccumulatedOffset += ConstValue; + CurrentReg = Op2; + ChainInsts.push_back(MRI->getVRegDef(Op1)); + ChainInsts.push_back(Def); + continue; + } + + // Op1 case 2: Loop-invariant GPR broadcast offset + if (isInvariantBroadcastGPR(Op1, L, InvariantGPR)) { + if (InvariantGPROffset != 0) { + LLVM_DEBUG( + dbgs() << "Found multiple GPR invariants, aborting trace.\n"); + break; + } + InvariantGPROffset = InvariantGPR; + CurrentReg = Op2; + ChainInsts.push_back(MRI->getVRegDef(Op1)); + ChainInsts.push_back(Def); + continue; + } + break; + } + + RootIndex = CurrentReg; +} + +// Collects all optimizable gather/scatter instructions +// and groups them into chains. +void AArch64MIPeepholeOpt::collectOptimizationCandidates( + MachineLoop *L, ChainMap &Chains, + SetVector &CandidateDeadInsts) { + for (MachineBasicBlock *MBB : L->getBlocks()) { + for (MachineInstr &MI : *MBB) { + SVEOffsetType OffsetType; + unsigned ElementSize = getElementSizeInBytes(MI, &OffsetType); + if (ElementSize == 0) + continue; + + // Verify instruction format: + // Gather: DstZPR, PredicatePPR, BaseGPR, IndexZPR + // Scatter: SrcZPR, PredicatePPR, BaseGPR, IndexZPR + if (MI.getNumOperands() < 4) + continue; + + Register BaseReg = MI.getOperand(2).getReg(); + Register IndexReg = MI.getOperand(3).getReg(); + // Only optimize loop-invariant base addresses + if (!isLoopInvariant(BaseReg, L)) + continue; + + Register RootIndex, InvariantGPROffset; + int64_t ElemOffset; + SmallVector TmpChainInsts; // Store chain for this MI + + // Trace index computation chain + traceIndexChain(IndexReg, RootIndex, ElemOffset, InvariantGPROffset, L, + TmpChainInsts); + + // If the chain is empty, there's nothing to optimize or delete. 
+ if (TmpChainInsts.empty() && InvariantGPROffset == 0 && ElemOffset == 0) + continue; + + LLVM_DEBUG(dbgs() << "Found candidate instruction: "; MI.dump(); + dbgs() << " BaseReg: " << printReg(BaseReg) + << ", IndexReg: " << printReg(IndexReg) + << " -> RootIndex: " << printReg(RootIndex) + << ", ElemOffset: " << ElemOffset + << ", InvariantGPROffset: " + << printReg(InvariantGPROffset) << "\n"); + + Chains[{BaseReg, RootIndex, InvariantGPROffset}].push_back( + {&MI, ElemOffset}); + + // Add the identified chain instructions to the master set of candidates. + CandidateDeadInsts.insert(TmpChainInsts.begin(), TmpChainInsts.end()); + } + } +} + +// Get or create a shared base register for the (Base + GPR) calculation +// It ensures the calculation is only generated once per block +// +// @param BaseReg The original, loop-invariant base register +// @param InvariantGPROffset The loop-invariant GPR used as an offset +// @param UseMI The memory instruction that will ultimately use this base +// @param BlockToGPRBaseMap The cache mapping a block to its computed GPR-base +// @return A register holding the result of `BaseReg + (InvariantGPROffset << scale)`. +// If no GPR offset exists, it returns the original `BaseReg` +Register AArch64MIPeepholeOpt::getGPRBase( + MachineBasicBlock *MBB, Register BaseReg, Register InvariantGPROffset, + MachineInstr &UseMI, + DenseMap &BlockToGPRBaseMap) { + // If we've already computed the GPR base for this block, return it + if (BlockToGPRBaseMap.count(MBB)) { + return BlockToGPRBaseMap[MBB]; + } + + // If there's no GPR offset, the base is simply the original BaseReg + if (InvariantGPROffset == 0) { + BlockToGPRBaseMap[MBB] = BaseReg; + return BaseReg; + } + + // This is the first time for this block, so we generate the ADD instruction + // Insert the calculation at the beginning of the block + DebugLoc DL = UseMI.getDebugLoc(); + SVEOffsetType OffsetType; + unsigned ElementSize = getElementSizeInBytes(UseMI, &OffsetType); + unsigned ShiftAmt = Log2_64(ElementSize); + unsigned AddOp, ShiftExtender; + + const TargetRegisterClass *RC = MRI->getRegClass(InvariantGPROffset); + if (AArch64::GPR32RegClass.hasSubClassEq(RC)) { + ShiftExtender = + (OffsetType == SVEOffsetType::SXTW) + ? AArch64_AM::getArithExtendImm(AArch64_AM::SXTW, ShiftAmt) + : AArch64_AM::getArithExtendImm(AArch64_AM::UXTW, ShiftAmt); + AddOp = AArch64::ADDXrx; + } else { + ShiftExtender = AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftAmt); + AddOp = AArch64::ADDXrs; + } + + Register GPROffsetBaseReg = + MRI->createVirtualRegister(&AArch64::GPR64RegClass); + BuildMI(*MBB, MBB->getFirstNonPHI(), DL, TII->get(AddOp), GPROffsetBaseReg) + .addReg(BaseReg) + .addReg(InvariantGPROffset) + .addImm(ShiftExtender); + + LLVM_DEBUG(dbgs() << " In BB:" << MBB->getName() + << ", created shared GPR base: " + << printReg(GPROffsetBaseReg) << "\n"); + + // Cache and return the new base + BlockToGPRBaseMap[MBB] = GPROffsetBaseReg; + return GPROffsetBaseReg; +} + +// Get or create the final base register (Base + GPR + Const) +// +// @param MI The memory instruction that will use this final base +// @param ElemOffset The constant element offset extracted from the address chain. +// @param BaseForConst The base register to add the constant offset to. This is +// typically the result from `getGPRBase`. +// @param FinalBaseMap The cache mapping a `{Block, Offset}` key to BaseForConst 's final base +// @return A register holding the result of `BaseForConst + (ElemOffset * scale)`. 
+// If `ElemOffset` is zero, it returns `BaseForConst` directly +Register AArch64MIPeepholeOpt::getFinalBase( + MachineInstr &MI, int64_t ElemOffset, Register BaseForConst, + DenseMap &FinalBaseMap) { + + MachineBasicBlock *MBB = MI.getParent(); + ConstOffsetKey Key = {MBB, ElemOffset}; + + // If we've already computed the final base for this key, return it + if (FinalBaseMap.count(Key)) { + return FinalBaseMap.lookup(Key); + } + + // If there's no constant offset, the final base is the one passed in + if (ElemOffset == 0) { + FinalBaseMap[Key] = BaseForConst; + return BaseForConst; + } + + // This is the first time for this key, generate the ADD instruction + DebugLoc DL = MI.getDebugLoc(); + SVEOffsetType OffsetType; + unsigned ElementSize = getElementSizeInBytes(MI, &OffsetType); + int64_t ByteOffset = ElemOffset * ElementSize; + + Register ConstOffsetBase = + MRI->createVirtualRegister(&AArch64::GPR64RegClass); + BuildMI(*MBB, MI.getIterator(), DL, TII->get(AArch64::ADDXri), + ConstOffsetBase) + .addReg(BaseForConst) + .addImm(ByteOffset) + .addImm(0); + + LLVM_DEBUG(dbgs() << " Created new base for ElemOffset " << ElemOffset + << " (ByteOffset " << ByteOffset << ") into " + << printReg(ConstOffsetBase) << "\n"); + + // Cache and return the new final base. + FinalBaseMap[Key] = ConstOffsetBase; + return ConstOffsetBase; +} + +bool AArch64MIPeepholeOpt::rewriteConstantAddressComputations( + MachineLoop *L, const ChainMap &Chains) { + bool Changed = false; + + for (auto &ChainInfo : Chains) { + auto &Addressings = ChainInfo.second; + // Skip chains without optimizable offsets + if (Addressings.size() < 2 && std::get<2>(ChainInfo.first) == 0 && + Addressings[0].second == 0) + continue; + + Register BaseReg = std::get<0>(ChainInfo.first); + Register RootIndex = std::get<1>(ChainInfo.first); + Register InvariantGPROffset = std::get<2>(ChainInfo.first); + + LLVM_DEBUG(dbgs() << "Optimizing chain with BaseReg: " << printReg(BaseReg) + << ", RootIndex: " << printReg(RootIndex) + << ", InvariantGPROffset: " + << printReg(InvariantGPROffset) << "\n"); + + // Maps a Basic Block to the register holding the (Base + GPR) calculation + // for that block + DenseMap BlockToGPRBaseMap; + + // Maps a (Basic Block, Const Offset) pair to the final base register + // We need the BB in the key to ensure correctness across different blocks + DenseMap ConstOffsetToFinalBaseMap; + + for (auto &AddressInfo : Addressings) { + MachineInstr *MI = AddressInfo.first; + MachineBasicBlock *MBB = MI->getParent(); + int64_t ElemOffset = AddressInfo.second; + + // Get the shared (Base + GPR) base for this instruction's block + Register BaseForConsts = getGPRBase( + MBB, BaseReg, InvariantGPROffset, *MI, BlockToGPRBaseMap); + + // Get the final base, creating the const offset ADD if needed + Register FinalBaseReg = getFinalBase( + *MI, ElemOffset, BaseForConsts, ConstOffsetToFinalBaseMap); + + // Rewrite the memory instruction + MI->getOperand(2).setReg(FinalBaseReg); + MI->getOperand(3).setReg(RootIndex); + + LLVM_DEBUG(dbgs() << " Rewrote instruction: "; MI->dump()); + Changed = true; + } + } + return Changed; +} + +bool AArch64MIPeepholeOpt::cleanupDeadSVECode( + SetVector &CandidateDeadInsts) { + if (CandidateDeadInsts.empty()) + return false; + + bool Changed = false; + LLVM_DEBUG(dbgs() << "--- Cleaning up dead instructions ---\n"); + for (MachineInstr *MI : llvm::reverse(CandidateDeadInsts)) { + bool IsDead = true; + for (const MachineOperand &MO : MI->operands()) { + if (MO.isReg() && MO.isDef() && 
MO.getReg().isVirtual()) { + if (!MRI->use_empty(MO.getReg())) { + IsDead = false; + break; + } + } + } + + if (!IsDead) + continue; + + LLVM_DEBUG(dbgs() << "Deleting dead instruction: "; MI->dump()); + MI->eraseFromParent(); + Changed = true; + } + return Changed; +} + +// Processes a single machine loop to find and rewrite optimizable +// SVE address computation chains for gather/scatter-like instructions. +// +// The core idea is to identify cases where the vector index used by a memory +// instruction is calculated by adding a loop-invariant offset to a base index +// (the root induction variable). Such computations are redundant within the +// loop. This optimization rewrites the address calculation to make the +// invariant part easily hoistable by MachineLICM. +// +// Specifically, it targets the following pattern example: +// +// =============================== BEFORE =============================== +// // In a loop, a complex index `z_idx` is computed before being used. +// // The offset can be a constant, a loop-invariant GPR, or both. +// +// ... +// dup z_offset, invariant_gpr +// add z_idx, z_root_idx, z_offset +// gather z_data, pg, [x_base, z_idx] +// ... +// +// =============================== AFTER ================================ +// // The pass sinks the invariant address calculation to just before the use, +// // exposing it to MachineLICM. The original `add` chain is replaced. +// // MachineLICM will then decide whether hoisting is profitable. +// +// ... +// // --- Instructions created by this pass, to be hoisted by MachineLICM --- +// add x_new_base, x_base, invariant_gpr +// gather z_data, pg, [x_new_base, z_root_idx] +// ... +// +bool AArch64MIPeepholeOpt::processSVELoopAddressing(MachineLoop *L) { + MachineBasicBlock *Preheader = L->getLoopPreheader(); + if (!Preheader) + return false; + + bool Changed = false; + LLVM_DEBUG(dbgs() << "********** Processing Loop in Function: " + << L->getHeader()->getParent()->getName() + << " (Loop Header: " << L->getHeader()->getName() + << ") **********\n"); + + // Collect all candidate instructions and their addressing chains + ChainMap Chains; + SetVector CandidateDeadInsts; + collectOptimizationCandidates(L, Chains, CandidateDeadInsts); + + if (Chains.empty()) + return false; + + // Rewrite the instructions in the loop + Changed |= rewriteConstantAddressComputations(L, Chains); + + // Clean up the original, now-dead, address computation instructions + if (Changed) + Changed |= cleanupDeadSVECode(CandidateDeadInsts); + return Changed; +} + +// This optimization identifies a common pattern where a vector of indices, +// generated from a loop induction variable, is immediately multiplied by a +// constant. This is a computationally expensive operation inside a loop. +// The pass transforms this pattern by replacing the expensive vector multiply +// with a cheaper vector add. It achieves this by creating a new induction +// variable system and a pre-computed offset vector. +// +// =============================== BEFORE =============================== +// The pass targets a MUL instruction whose operands form a specific chain. +// In C SVE intrinsics, this typically looks like: +// +// for (int jp = 0; jp < limit; jp += svcntw()) { +// svbool_t pg = svwhilelt_b32(jp, limit); +// // 1. Index vector is based on induction variable `jp`. +// svuint32_t indices = svindex_u32(jp, IndexStep); +// // 2. Result is a vector multiply by a constant `Multiplier`.
+// svuint32_t final_indices = svmul_z(pg, indices, Multiplier); +// } +// +// This corresponds to the following MachineIR pattern: +// `z_result = MUL (SEL(pg, z_indices, 0), DUP(Multiplier))` +// where `z_indices` is defined by an `INDEX` instruction using a PHI-defined +// induction variable. +// +// =============================== AFTER ================================ +// The transformation is based on the distributive property: +// `(jp + k*IndexStep) * Multiplier = (jp*Multiplier) + +// k*(IndexStep*Multiplier)` +// +// The pass creates a new scalar induction variable `base_iv` to track the +// `(jp * Multiplier)` term, and a constant vector `offset_vec` to represent +// the `k * (IndexStep * Multiplier)` term. The expensive multiply in the loop +// is replaced by a simple vector add. +// +// // --- In Preheader --- +// uint32_t new_iv_step = svcntw() * Multiplier; +// uint32_t new_iv_init = 0 * Multiplier; +// svuint32_t offset_vec = svindex_u32(0, IndexStep * Multiplier); +// +// // --- In Loop --- +// uint32_t base_iv = new_iv_init; // (PHI node) +// for (int jp = 0; jp < limit; jp += svcntw()) { +// // ... +// svuint32_t base_vec = svdup_u32(base_iv); +// svuint32_t final_indices = svadd_z(pg, base_vec, offset_vec); +// // ... +// base_iv += new_iv_step; +// } +// +bool AArch64MIPeepholeOpt::simplifySVEIndexMultiply(MachineLoop *L) { + MachineBasicBlock *Preheader = L->getLoopPreheader(); + if (!Preheader) + return false; + + MachineBasicBlock *Header = L->getHeader(); + MachineBasicBlock *Latch = L->getLoopLatch(); + if (!Header || !Latch) + return false; + + for (MachineBasicBlock *MBB : L->getBlocks()) { + for (MachineInstr &MI : *MBB) { + // --- Start of the SVE MUL_Z Pattern Match --- + if (MI.getOpcode() != AArch64::MUL_ZPmZ_S && + MI.getOpcode() != AArch64::MUL_ZPmZ_D) { + continue; + } + + LLVM_DEBUG(dbgs() << "Found candidate MUL: "; MI.dump()); + + // Set up the specific AArch64 opcodes based on whether we have a 32-bit + // or 64-bit operation. + bool is64Bit = (MI.getOpcode() == AArch64::MUL_ZPmZ_D); + unsigned SelOpc = is64Bit ? AArch64::SEL_ZPZZ_D : AArch64::SEL_ZPZZ_S; + unsigned IndexRiOpc = is64Bit ? AArch64::INDEX_RI_D : AArch64::INDEX_RI_S; + unsigned AddZzzOpc = is64Bit ? AArch64::ADD_ZZZ_D : AArch64::ADD_ZZZ_S; + unsigned DupZrOpc = is64Bit ? AArch64::DUP_ZR_D : AArch64::DUP_ZR_S; + unsigned IndexIiOpc = is64Bit ? AArch64::INDEX_II_D : AArch64::INDEX_II_S; + unsigned DupZiOpc = is64Bit ? AArch64::DUP_ZI_D : AArch64::DUP_ZI_S; + unsigned AddGprOpc = is64Bit ? AArch64::ADDXrr : AArch64::ADDWrr; + unsigned CntOpc = is64Bit ? AArch64::CNTD_XPiI : AArch64::CNTW_XPiI; + unsigned MaddGprOpc = is64Bit ? AArch64::MADDXrrr : AArch64::MADDWrrr; + unsigned MovImmOpc = is64Bit ? AArch64::MOVi64imm : AArch64::MOVi32imm; + unsigned ZeroReg = is64Bit ? AArch64::XZR : AArch64::WZR; + + const TargetRegisterClass *GprRegClass = + is64Bit ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass; + const TargetRegisterClass *GprAllRegClass = + is64Bit ? &AArch64::GPR64allRegClass : &AArch64::GPR32allRegClass; + const TargetRegisterClass *ZprRegClass = &AArch64::ZPRRegClass; + + // Deconstruct the multiply instruction to see if it matches our target + // pattern. The matched pattern is: MUL(SEL(Pred, INDEX(IV, IdxStep), + // Zero), DUP(Multiplier)) + MachineInstr *SelMI = MRI->getVRegDef(MI.getOperand(2).getReg()); + if (!SelMI || (SelMI->getOpcode() != SelOpc)) + continue; + + // The second operand of the select should be an index operation. 
+ MachineInstr *IndexMI = MRI->getVRegDef(SelMI->getOperand(2).getReg()); + if (!IndexMI) + continue; + + Register IVReg; + int64_t IndexStep; + // Detect the two index generated ways + if (IndexMI->getOpcode() == IndexRiOpc) { + // Case 1: INDEX_RI (reg, imm) + IVReg = IndexMI->getOperand(1).getReg(); + if (!IVReg.isVirtual()) + continue; + IndexStep = IndexMI->getOperand(2).getImm(); + } else if (IndexMI->getOpcode() == AddZzzOpc) { + // Case 2: ADD(INDEX_II(0, imm), DUP(reg)) + MachineInstr *Op1 = MRI->getVRegDef(IndexMI->getOperand(1).getReg()); + MachineInstr *Op2 = MRI->getVRegDef(IndexMI->getOperand(2).getReg()); + if (!Op1 || !Op2) + continue; + + auto matchIndexAddPattern = [&](MachineInstr *A, MachineInstr *B) { + return (A->getOpcode() == IndexIiOpc && B->getOpcode() == DupZrOpc && + A->getOperand(1).getImm() == 0); + }; + + if (matchIndexAddPattern(Op1, Op2)) { + IndexStep = Op1->getOperand(2).getImm(); + IVReg = Op2->getOperand(1).getReg(); + } else if (matchIndexAddPattern(Op2, Op1)) { + IndexStep = Op2->getOperand(2).getImm(); + IVReg = Op1->getOperand(1).getReg(); + } else { + continue; + } + } else { + continue; + } + + // The third operand of the multiply should be a duplicated immediate + // value. + MachineInstr *MultiplierMI = MRI->getVRegDef(MI.getOperand(3).getReg()); + if (!MultiplierMI || !isLoopInvariant(MI.getOperand(3).getReg(), L) || + (MultiplierMI->getOpcode() != DupZiOpc)) + continue; + int64_t MultiplierVal = MultiplierMI->getOperand(1).getImm(); + + // Check if the identified register is a basic loop induction variable. + MachineInstr *IVPhi = MRI->getVRegDef(IVReg); + if (!IVPhi || !IVPhi->isPHI() || IVPhi->getParent() != Header) + continue; + + // Find the instruction that updates the induction variable (usually an + // ADD in the latch). + Register IVInitReg = Register(0), IVNextReg = Register(0); + for (unsigned i = 1; i < IVPhi->getNumOperands(); i += 2) { + if (IVPhi->getOperand(i + 1).getMBB() == Preheader) { + IVInitReg = IVPhi->getOperand(i).getReg(); + break; + } + } + for (unsigned i = 1; i < IVPhi->getNumOperands(); i += 2) { + if (IVPhi->getOperand(i + 1).getMBB() == Latch) { + IVNextReg = IVPhi->getOperand(i).getReg(); + break; + } + } + if (!IVInitReg || !IVNextReg) + continue; + + // Get the definition of the next value of the induction variable. + MachineInstr *IVUpdateMI = MRI->getVRegDef(IVNextReg); + if (!IVUpdateMI) + continue; + if (IVUpdateMI->getOpcode() == AArch64::COPY) + IVUpdateMI = MRI->getVRegDef(IVUpdateMI->getOperand(1).getReg()); + if (IVUpdateMI->getOpcode() != AddGprOpc) + continue; + + // Determine the step of the induction variable. + Register IVStepReg; + if (IVUpdateMI->getOperand(1).getReg() == IVReg) + IVStepReg = IVUpdateMI->getOperand(2).getReg(); + else if (IVUpdateMI->getOperand(2).getReg() == IVReg) + IVStepReg = IVUpdateMI->getOperand(1).getReg(); + else + continue; + + LLVM_DEBUG( + dbgs() << "Sve Mul Strength reduction pattern matched for MUL: "; + MI.dump();); + + // --- Start of the Transformation --- + auto PreheaderInsertPt = Preheader->getFirstTerminator(); + DebugLoc DL = MI.getDebugLoc(); + + // In the preheader, create a new offset = index(0, IndexStep * + // MultiplierVal) + Register OffsetVecReg = MRI->createVirtualRegister(ZprRegClass); + BuildMI(*MBB, MI.getIterator(), DL, TII->get(IndexIiOpc), OffsetVecReg) + .addImm(0) + .addImm(IndexStep * MultiplierVal) + .addReg(AArch64::VG, RegState::Implicit); + + // In the preheader, calculate the new step value for our new induction + // variable. 
This is: NewStep = IVStep * MultiplierVal + MachineInstr *IVStepDef = MRI->getVRegDef(IVStepReg); + if (IVStepDef->getOpcode() == AArch64::COPY) + IVStepDef = MRI->getVRegDef(IVStepDef->getOperand(1).getReg()); + + // Check if the original IV step is the vector length (vl). + bool isStepVL = + IVStepDef && IVStepDef->getOpcode() == CntOpc && + IVStepDef->getOperand(1).getImm() == 31 && // Pattern for 'all' + IVStepDef->getOperand(2).getImm() == 1; // Multiplier of 1 + Register NewStepReg = MRI->createVirtualRegister(GprRegClass); + + // If the step is 'vl' and the multiplier is small, we can use a more + // efficient 'cnt' instruction. + if (isStepVL && MultiplierVal <= 15) { + LLVM_DEBUG(dbgs() << "IV Step is vl, using CNT[W/D] for new step.\n"); + // For 64-bit elements CNTD already produces a 64-bit GPR, so define + // NewStepReg directly; for 32-bit elements compute into a 64-bit + // register and copy the low 32 bits into NewStepReg so that NewStepReg + // always has a definition. + if (is64Bit) { + BuildMI(*Preheader, PreheaderInsertPt, DL, TII->get(CntOpc), + NewStepReg) + .addImm(31) // Pattern 'all' for vl + .addImm(MultiplierVal) + .addReg(AArch64::VG, RegState::Implicit); + } else { + Register NewStep64Reg = + MRI->createVirtualRegister(&AArch64::GPR64RegClass); + BuildMI(*Preheader, PreheaderInsertPt, DL, TII->get(CntOpc), + NewStep64Reg) + .addImm(31) // Pattern 'all' for vl + .addImm(MultiplierVal) + .addReg(AArch64::VG, RegState::Implicit); + BuildMI(*Preheader, PreheaderInsertPt, DL, TII->get(AArch64::COPY), + NewStepReg) + .addReg(NewStep64Reg, 0, AArch64::sub_32); + } + } else { + // Otherwise, we use a general multiplication. + LLVM_DEBUG( + dbgs() << "IV Step is not vl, using generic MUL for new step.\n"); + Register MultReg = MRI->createVirtualRegister(GprRegClass); + BuildMI(*Preheader, PreheaderInsertPt, DL, TII->get(MovImmOpc), MultReg) + .addImm(MultiplierVal); + BuildMI(*Preheader, PreheaderInsertPt, DL, TII->get(MaddGprOpc), + NewStepReg) + .addReg(IVStepReg) + .addReg(MultReg) + .addReg(ZeroReg); + } + + // In the preheader, calculate the initial value for the new base IV. + // BaseIVInit = IVInit * MultiplierVal + Register BaseIVInitReg = MRI->createVirtualRegister(GprAllRegClass); + Register MultRegForInit = MRI->createVirtualRegister(GprRegClass); + BuildMI(*Preheader, PreheaderInsertPt, DL, TII->get(MovImmOpc), + MultRegForInit) + .addImm(MultiplierVal); + BuildMI(*Preheader, PreheaderInsertPt, DL, TII->get(MaddGprOpc), + BaseIVInitReg) + .addReg(IVInitReg) + .addReg(MultRegForInit) + .addReg(ZeroReg); + + // Create a new PHI node in the header + // for our new base induction variable. + Register BaseIVReg = MRI->createVirtualRegister(GprAllRegClass); + Register NextBaseIVReg = MRI->createVirtualRegister(GprAllRegClass); + auto BaseIVPhi = BuildMI(*Header, Header->getFirstNonPHI(), DL, + TII->get(AArch64::PHI), BaseIVReg); + BaseIVPhi.addReg(BaseIVInitReg).addMBB(Preheader); + + // In the loop latch, update our new base induction variable + // by adding the new step + BuildMI(*Latch, Latch->getFirstTerminator(), DL, TII->get(AddGprOpc), + NextBaseIVReg) + .addReg(BaseIVReg) + .addReg(NewStepReg); + + BaseIVPhi.addReg(NextBaseIVReg).addMBB(Latch); + + // Now, replace the original multiply operation in the loop body + // with a new add operation + auto BodyInsertPt = MI.getIterator(); + + // Broadcast the new base IV into a vector register.
+ Register BaseVecReg = MRI->createVirtualRegister(ZprRegClass); + BuildMI(*MI.getParent(), BodyInsertPt, DL, TII->get(DupZrOpc), BaseVecReg) + .addReg(BaseIVReg); + + // Perform the vector addition: NewResult = OffsetVector + BaseVector + Register AddTmpReg = MRI->createVirtualRegister(ZprRegClass); + BuildMI(*MI.getParent(), BodyInsertPt, DL, TII->get(AddZzzOpc), AddTmpReg) + .addReg(OffsetVecReg) + .addReg(BaseVecReg); + + // Replace all uses of the original multiplication result + // with our new addition result + MRI->replaceRegWith(MI.getOperand(0).getReg(), AddTmpReg); + + // Clean up the now-dead instructions from the old calculation + MI.eraseFromParent(); + if (MRI->use_empty(SelMI->getOperand(0).getReg())) + SelMI->eraseFromParent(); + if (MRI->use_empty(IndexMI->getOperand(0).getReg())) { + if (IndexMI->getOpcode() == AddZzzOpc) { + MachineInstr *Op1 = MRI->getVRegDef(IndexMI->getOperand(1).getReg()); + MachineInstr *Op2 = MRI->getVRegDef(IndexMI->getOperand(2).getReg()); + if (MRI->use_empty(Op1->getOperand(0).getReg())) + Op1->eraseFromParent(); + if (MRI->use_empty(Op2->getOperand(0).getReg())) + Op2->eraseFromParent(); + } + IndexMI->eraseFromParent(); + } + if (MRI->use_empty(MultiplierMI->getOperand(0).getReg())) + MultiplierMI->eraseFromParent(); + + LLVM_DEBUG(dbgs() << "Successfully applied strength reduction.\n"); + + return true; + } + } + return false; +} + bool AArch64MIPeepholeOpt::runOnMachineFunction(MachineFunction &MF) { if (skipFunction(MF.getFunction())) return false; @@ -752,6 +1659,25 @@ bool AArch64MIPeepholeOpt::runOnMachineFunction(MachineFunction &MF) { } } + if (EnableSVELoopAddressChainOpt && + MF.getSubtarget().hasSVE()) { + for (MachineLoop *L : *MLI) { + for (MachineLoop *SubL : *L) { + Changed |= processSVELoopAddressing(SubL); + } + Changed |= processSVELoopAddressing(L); + } + } + + if (EnableSVEIndexMultiplyOpt && + MF.getSubtarget().hasSVE()) { + for (MachineLoop *L : *MLI) { + for (MachineLoop *SubL : *L) { + Changed |= simplifySVEIndexMultiply(SubL); + } + Changed |= simplifySVEIndexMultiply(L); + } + } return Changed; } diff --git a/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp b/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp index 87939363122c..c5a6cb7af405 100644 --- a/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp +++ b/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp @@ -24,7 +24,6 @@ #include "Utils/AArch64BaseInfo.h" #include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/SetVector.h" -#include "llvm/Analysis/LoopInfo.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/IRBuilder.h" @@ -42,28 +41,9 @@ using namespace llvm::PatternMatch; #define DEBUG_TYPE "aarch64-sve-intrinsic-opts" -static cl::opt EnableSVELoopAddressChainOpt( - "aarch64-sve-loop-address-chain-opt", cl::init(false), cl::Hidden, - cl::desc("Enable simplification of SVE address computation chains in loops")); - namespace { struct SVEIntrinsicOpts : public ModulePass { static char ID; // Pass identification, replacement for typeid - - enum class SVEIndexExtension { SIGN, ZERO, NONE }; // NONE for i64 indices - - struct SVEMemoryOpInfo { - unsigned BaseOpIdx; - unsigned IndexOpIdx; - Type *ElemTy; // The type of the data element being loaded/stored. - SVEIndexExtension ExtKind; - }; - - // The key is {OriginalBasePointer, Index} - using InvariantBaseKey = std::pair; - // The cache maps this key to the computed GEP. 
- using InvariantBaseCache = DenseMap; - SVEIntrinsicOpts() : ModulePass(ID) { initializeSVEIntrinsicOptsPass(*PassRegistry::getPassRegistry()); } @@ -80,16 +60,6 @@ private: bool optimizeInstructions(SmallSetVector &Functions); - std::optional getSVEMemoryOpInfo(const IntrinsicInst *II); - Value *getLoopInvariantSplatValue(Value *V, Loop *L); - Value *getHoistedBaseForIndex(Value *Index, Value *OriBase, Loop *L, - InvariantBaseCache &Cache, - SVEMemoryOpInfo &OpInfo); - bool simplifySVEAddressComputation(IntrinsicInst *II, Loop *L, - InvariantBaseCache &Cache, - SVEMemoryOpInfo &OpInfo); - bool runSVEAddressHoisting(Function &F, LoopInfo &LI); - /// Operates at the function-scope. I.e., optimizations are applied local to /// the functions themselves. bool optimizeFunctions(SmallSetVector &Functions); @@ -98,7 +68,6 @@ private: void SVEIntrinsicOpts::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired(); - AU.addRequired(); AU.setPreservesCFG(); } @@ -106,7 +75,6 @@ char SVEIntrinsicOpts::ID = 0; static const char *name = "SVE intrinsics optimizations"; INITIALIZE_PASS_BEGIN(SVEIntrinsicOpts, DEBUG_TYPE, name, false, false) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass); -INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass); INITIALIZE_PASS_END(SVEIntrinsicOpts, DEBUG_TYPE, name, false, false) ModulePass *llvm::createSVEIntrinsicOptsPass() { @@ -460,332 +428,6 @@ bool SVEIntrinsicOpts::optimizeInstructions( return Changed; } -/// Checks if an intrinsic is an SVE gather/scatter memory operation that this -/// optimization can analyze. Return the operand information (Base index, Index -/// index, Element Type, and Extension Kind) if supported -std::optional -SVEIntrinsicOpts::getSVEMemoryOpInfo(const IntrinsicInst *II) { - switch (II->getIntrinsicID()) { - // Gather Loads - case Intrinsic::aarch64_sve_ld1_gather_sxtw_index: - case Intrinsic::aarch64_sve_ldff1_gather_sxtw_index: - return {{1, 2, - dyn_cast(II->getType())->getElementType(), - SVEIndexExtension::SIGN}}; // Base=1, Index=2, Ext=SIGN - case Intrinsic::aarch64_sve_ld1_gather_uxtw_index: - case Intrinsic::aarch64_sve_ldff1_gather_uxtw_index: - return {{1, 2, - dyn_cast(II->getType())->getElementType(), - SVEIndexExtension::ZERO}}; // Base=1, Index=2, Ext=ZERO - case Intrinsic::aarch64_sve_ld1_gather_index: - case Intrinsic::aarch64_sve_ldff1_gather_index: - case Intrinsic::aarch64_sve_ldnt1_gather_index: - return {{1, 2, - dyn_cast(II->getType())->getElementType(), - SVEIndexExtension::NONE}}; // Base=1, Index=2, Ext=NONE - - // Prefetches (have no return value, element type is based on name) - case Intrinsic::aarch64_sve_prfd_gather_sxtw_index: - return {{1, 2, Type::getInt64Ty(II->getParent()->getContext()), - SVEIndexExtension::SIGN}}; - case Intrinsic::aarch64_sve_prfh_gather_sxtw_index: - return {{1, 2, Type::getInt16Ty(II->getParent()->getContext()), - SVEIndexExtension::SIGN}}; - case Intrinsic::aarch64_sve_prfw_gather_sxtw_index: - return {{1, 2, Type::getInt32Ty(II->getParent()->getContext()), - SVEIndexExtension::SIGN}}; - case Intrinsic::aarch64_sve_prfd_gather_uxtw_index: - return {{1, 2, Type::getInt64Ty(II->getParent()->getContext()), - SVEIndexExtension::ZERO}}; - case Intrinsic::aarch64_sve_prfh_gather_uxtw_index: - return {{1, 2, Type::getInt16Ty(II->getParent()->getContext()), - SVEIndexExtension::ZERO}}; - case Intrinsic::aarch64_sve_prfw_gather_uxtw_index: - return {{1, 2, Type::getInt32Ty(II->getParent()->getContext()), - SVEIndexExtension::ZERO}}; - case Intrinsic::aarch64_sve_prfd_gather_index: - 
return {{1, 2, Type::getInt64Ty(II->getParent()->getContext()), - SVEIndexExtension::NONE}}; - case Intrinsic::aarch64_sve_prfh_gather_index: - return {{1, 2, Type::getInt16Ty(II->getParent()->getContext()), - SVEIndexExtension::NONE}}; - case Intrinsic::aarch64_sve_prfw_gather_index: - return {{1, 2, Type::getInt32Ty(II->getParent()->getContext()), - SVEIndexExtension::NONE}}; - - // Scatter Stores (data is operand 0, element type is derived from it) - case Intrinsic::aarch64_sve_st1_scatter_sxtw_index: - return {{2, 3, - dyn_cast(II->getOperand(0)->getType()) - ->getElementType(), - SVEIndexExtension::SIGN}}; // Base=2, Index=3, Ext=SIGN - case Intrinsic::aarch64_sve_st1_scatter_uxtw_index: - return {{2, 3, - dyn_cast(II->getOperand(0)->getType()) - ->getElementType(), - SVEIndexExtension::ZERO}}; // Base=2, Index=3, Ext=ZERO - case Intrinsic::aarch64_sve_st1_scatter_index: - case Intrinsic::aarch64_sve_stnt1_scatter_index: - return {{2, 3, - dyn_cast(II->getOperand(0)->getType()) - ->getElementType(), - SVEIndexExtension::NONE}}; // Base=2, Index=3, Ext=NONE - - default: - return std::nullopt; - } -} - -/// Check if a Value is a splat of a loop-invariant scalar, which is a -/// shufflevector of an insertelement at index 0. If the pattern matches, return -/// the loop scalar value. -Value *SVEIntrinsicOpts::getLoopInvariantSplatValue(Value *V, Loop *L) { - Value *InvariantScalar = nullptr; - Value *InsertElementVal = nullptr; - - if (auto *SV = dyn_cast(V)) { - InsertElementVal = SV->getOperand(0); - } else if (auto *SVC = dyn_cast(V)) { - if (SVC->getOpcode() == Instruction::ShuffleVector) { - InsertElementVal = SVC->getOperand(0); - } - } - - if (!InsertElementVal) - return nullptr; - - // Check if InsertElementVal is an insertelement and get the scalar. - if (auto *IE = dyn_cast(InsertElementVal)) { - if (match(IE->getOperand(2), - m_Zero())) { // Ensure it's inserting at index 0 - InvariantScalar = IE->getOperand(1); - } - } else if (auto *IEC = dyn_cast(InsertElementVal)) { - if (IEC->getOpcode() == Instruction::InsertElement && - match(IEC->getOperand(2), m_Zero())) { - InvariantScalar = IEC->getOperand(1); - } - } - - if (!InvariantScalar || !L->isLoopInvariant(InvariantScalar)) - return nullptr; - - return InvariantScalar; -} - -/// Analyzes an index calculation chain and generates hoistable GEPs. -/// @param Index The starting index Value (from the sve memory op) -/// @param OrigBase The original base pointer from the sve memory op -/// @param L The loop context -/// @param Cache A map to memoize results for `{OrigBase, Index} : NewBase` -/// @param OpInfo Information about the sve memory op -/// @return The final, rewritten base pointer for the memory op -Value *SVEIntrinsicOpts::getHoistedBaseForIndex(Value *Index, Value *OrigBase, - Loop *L, - InvariantBaseCache &Cache, - SVEMemoryOpInfo &OpInfo) { - InvariantBaseKey InitialKey = {OrigBase, Index}; - - // If this entire chain has been processed before, return the final result. - if (Cache.count(InitialKey)) - return Cache.lookup(InitialKey); - - // --- Trace the ADD chain up to the root, collecting nodes --- - SmallVector IndexChain; - Value *CurrentIndex = Index; - Value *RootIndex = nullptr; - - // The `while` loop traces the `sve.add` chain upwards from the `Index` used - // by the memory op, collecting all intermediate indices onto a stack - // (`IndexChain`). The trace stops when it hits a value that is not an `(index - // + invariant)` add, which becomes the `RootIndex`. 
- while (true) { - IndexChain.push_back(CurrentIndex); - InvariantBaseKey CurrentKey = {OrigBase, CurrentIndex}; - - // If a subchain in the chain is already solved, stop tracing - if (Cache.count(CurrentKey)) { - RootIndex = CurrentIndex; - break; - } - - auto *Add = dyn_cast(CurrentIndex); - // Stop if not a recognized sve.add intrinsic or not defined in the loop - if (!Add || - (Add->getIntrinsicID() != Intrinsic::aarch64_sve_add && - Add->getIntrinsicID() != Intrinsic::aarch64_sve_add_u) || - !L->contains(Add)) { - RootIndex = CurrentIndex; - break; - } - - Value *Op1 = Add->getOperand(1); - Value *Op2 = Add->getOperand(2); - - // Check if one of the operands is an invariant splat - if (getLoopInvariantSplatValue(Op1, L)) { - CurrentIndex = Op2; - if (match(Op2, m_Select(m_Value(), m_Value(), m_Zero()))) - CurrentIndex = dyn_cast(Op2)->getOperand(1); - } else if (getLoopInvariantSplatValue(Op2, L)) { - CurrentIndex = Op1; - if (match(Op1, m_Select(m_Value(), m_Value(), m_Zero()))) - CurrentIndex = dyn_cast(Op1)->getOperand(1); - } else { - RootIndex = CurrentIndex; // Not an (index + invariant) form. - break; - } - } - - // --- Build GEPs back down the chain --- - // The base for the root index is always the original base pointer - Value *CurrentHoistedBase = Cache.lookup({OrigBase, RootIndex}); - if (!CurrentHoistedBase) { - CurrentHoistedBase = OrigBase; - Cache[{OrigBase, RootIndex}] = OrigBase; - } - - // Iterates down the collected chain (in reverse). For each node, it computes - // the new hoisted base by creating a GEP on top of the base of the previous - // node in the chain. - for (Value *IdxNode : reverse(IndexChain)) { - if (IdxNode == RootIndex) - continue; - - InvariantBaseKey CurrentKey = {OrigBase, IdxNode}; - auto *Add = dyn_cast(IdxNode); - Value *Op1 = Add->getOperand(1); - Value *Op2 = Add->getOperand(2); - - Value *InvariantScalar = getLoopInvariantSplatValue(Op1, L); - if (!InvariantScalar) - InvariantScalar = getLoopInvariantSplatValue(Op2, L); - assert(InvariantScalar); - - IRBuilder<> Builder(Add); - Value *GEPIndex = InvariantScalar; - - // Ensure the invariant has the correct integer type for GEP - switch (OpInfo.ExtKind) { - case SVEIndexExtension::SIGN: - GEPIndex = Builder.CreateSExt( - GEPIndex, Type::getInt64Ty(Add->getParent()->getContext()), - "invariant.idx.sext"); - break; - case SVEIndexExtension::ZERO: - GEPIndex = Builder.CreateZExt( - GEPIndex, Type::getInt64Ty(Add->getParent()->getContext()), - "invariant.idx.zext"); - break; - case SVEIndexExtension::NONE: - break; - } - - Value *NewBase = Builder.CreateGEP(OpInfo.ElemTy, CurrentHoistedBase, - GEPIndex, "add.ptr"); - // Cache the result for this node and update the base for the next iteration - Cache[CurrentKey] = NewBase; - CurrentHoistedBase = NewBase; - } - - return Cache.lookup(InitialKey); -} - -/// Get the final rewritten base and root index, and rewrite the memory -/// intrinsic -bool SVEIntrinsicOpts::simplifySVEAddressComputation( - IntrinsicInst *MemIntrinsic, Loop *L, InvariantBaseCache &Cache, - SVEMemoryOpInfo &OpInfo) { - Value *OrigBase = MemIntrinsic->getArgOperand(OpInfo.BaseOpIdx); - Value *OrigIndex = MemIntrinsic->getArgOperand(OpInfo.IndexOpIdx); - - // If the base itself is not loop invariant, skip simplification - if (!L->isLoopInvariant(OrigBase)) - return false; - - // The actual index might be hidden behind a `select(pg, index, zero)` - // Peel this away to get to the core index calculation - Value *IndexToTrace = OrigIndex; - if (match(IndexToTrace, m_Select(m_Value(), 
m_Value(), m_Zero()))) { - IndexToTrace = dyn_cast(IndexToTrace)->getOperand(1); - } - // This call populates the cache for the entire chain and returns the final - // base - Value *NewBase = - getHoistedBaseForIndex(IndexToTrace, OrigBase, L, Cache, OpInfo); - - // If the base pointer hasn't changed, nothing was optimized. - if (NewBase == OrigBase) - return false; - - // Now that the cache is populated, trace up from the starting index to find - // the root. - Value *RootIndex = IndexToTrace; - while (true) { - InvariantBaseKey CurrentKey = {OrigBase, RootIndex}; - // The root is the node that maps back to the original base in the cache. - if (Cache.count(CurrentKey) && Cache.lookup(CurrentKey) == OrigBase) { - break; - } - - auto *Add = dyn_cast(RootIndex); - Value *NextIndex = nullptr; - if (getLoopInvariantSplatValue(Add->getOperand(1), L)) { - NextIndex = Add->getOperand(2); - } else if (getLoopInvariantSplatValue(Add->getOperand(2), L)) { - NextIndex = Add->getOperand(1); - } else { - break; // Reached a non-optimizable ADD, this is the root - } - - if (match(NextIndex, m_Select(m_Value(), m_Value(), m_Zero()))) { - RootIndex = dyn_cast(NextIndex)->getOperand(1); - } else { - RootIndex = NextIndex; - } - } - - LLVM_DEBUG(dbgs() << "SVE_ADDR_HOIST_IR: Rewriting " << *MemIntrinsic - << "\n"); - - MemIntrinsic->setArgOperand(OpInfo.BaseOpIdx, NewBase); - MemIntrinsic->setArgOperand(OpInfo.IndexOpIdx, RootIndex); - - // Cleanup would be complex. Rely on DCE for now. - - LLVM_DEBUG(dbgs() << "SVE_ADDR_HOIST_IR: To -> " << *MemIntrinsic << "\n"); - - return true; -} - -/// iterates through all basic blocks in a function. For each block -/// that is part of a loop, it creates a fresh cache and then iterates through -/// its instructions in program order, attempting to simplify any SVE memory -/// operations it finds. -bool SVEIntrinsicOpts::runSVEAddressHoisting(Function &F, LoopInfo &LI) { - bool Changed = false; - for (auto &BB : F) { - // We only care about blocks that are inside a loop. - Loop *L = LI.getLoopFor(&BB); - if (!L) - continue; - - // A fresh cache is used for each basic block to ensure correctness. - // Maps {OriginalBasePointer, Index} to the new computed GEP. - InvariantBaseCache Cache; - - // Iterate through instructions in program order (important!) 
- for (auto &I : BB) { - if (auto *II = dyn_cast(&I)) { - if (auto OpInfo = getSVEMemoryOpInfo(II)) - Changed |= simplifySVEAddressComputation(II, L, Cache, *OpInfo); - } - } - } - - return Changed; -} - bool SVEIntrinsicOpts::optimizeFunctions( SmallSetVector &Functions) { bool Changed = false; @@ -793,16 +435,6 @@ bool SVEIntrinsicOpts::optimizeFunctions( Changed |= optimizePTrueIntrinsicCalls(Functions); Changed |= optimizeInstructions(Functions); - if (EnableSVELoopAddressChainOpt) { - for (Function *F : Functions) { - if (F->isDeclaration()) - continue; - - LoopInfo &LI = getAnalysis(*F).getLoopInfo(); - Changed |= runSVEAddressHoisting(*F, LI); - } - } - return Changed; } @@ -821,26 +453,6 @@ bool SVEIntrinsicOpts::runOnModule(Module &M) { case Intrinsic::vector_extract: case Intrinsic::vector_insert: case Intrinsic::aarch64_sve_ptrue: - case Intrinsic::aarch64_sve_ld1_gather_sxtw_index: - case Intrinsic::aarch64_sve_ld1_gather_uxtw_index: - case Intrinsic::aarch64_sve_ld1_gather_index: - case Intrinsic::aarch64_sve_ldff1_gather_sxtw_index: - case Intrinsic::aarch64_sve_ldff1_gather_uxtw_index: - case Intrinsic::aarch64_sve_ldff1_gather_index: - case Intrinsic::aarch64_sve_ldnt1_gather_index: - case Intrinsic::aarch64_sve_prfd_gather_sxtw_index: - case Intrinsic::aarch64_sve_prfd_gather_uxtw_index: - case Intrinsic::aarch64_sve_prfd_gather_index: - case Intrinsic::aarch64_sve_prfh_gather_sxtw_index: - case Intrinsic::aarch64_sve_prfh_gather_uxtw_index: - case Intrinsic::aarch64_sve_prfh_gather_index: - case Intrinsic::aarch64_sve_prfw_gather_sxtw_index: - case Intrinsic::aarch64_sve_prfw_gather_uxtw_index: - case Intrinsic::aarch64_sve_prfw_gather_index: - case Intrinsic::aarch64_sve_st1_scatter_sxtw_index: - case Intrinsic::aarch64_sve_st1_scatter_uxtw_index: - case Intrinsic::aarch64_sve_st1_scatter_index: - case Intrinsic::aarch64_sve_stnt1_scatter_index: for (User *U : F.users()) Functions.insert(cast(U)->getFunction()); break; diff --git a/llvm/test/CodeGen/AArch64/O3-pipeline.ll b/llvm/test/CodeGen/AArch64/O3-pipeline.ll index 4ee13788c2c4..3747b2581fa4 100644 --- a/llvm/test/CodeGen/AArch64/O3-pipeline.ll +++ b/llvm/test/CodeGen/AArch64/O3-pipeline.ll @@ -25,7 +25,6 @@ ; CHECK-NEXT: SVE intrinsics optimizations ; CHECK-NEXT: FunctionPass Manager ; CHECK-NEXT: Dominator Tree Construction -; CHECK-NEXT: Natural Loop Information ; CHECK-NEXT: FunctionPass Manager ; CHECK-NEXT: Simplify the CFG ; CHECK-NEXT: Dominator Tree Construction diff --git a/llvm/test/CodeGen/AArch64/aarch64-sve-addressing-peephole.ll b/llvm/test/CodeGen/AArch64/aarch64-sve-addressing-peephole.ll index 85d286c165ca..579cc889a8a8 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-sve-addressing-peephole.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-sve-addressing-peephole.ll @@ -210,11 +210,11 @@ for.body: ; preds = %for.body.lr.ph, %fo define dso_local void @test_invariantOffset32bit(i32 noundef %N, i32 noundef %M, ptr noundef %matrix, ptr nocapture noundef %result) local_unnamed_addr #0 { ; CHECK-LABEL: test_invariantOffset32bit: ; CHECK: .LBB4_5: // %for.body4 +; CHECK: add x[[NEWBASE3:[0-9]+]], x2, w{{[0-9]+}}, sxtw #2 ; CHECK: add x[[NEWBASE1:[0-9]+]], x2, w{{[0-9]+}}, sxtw #2 -; CHECK: ld1w { z{{[0-9]+}}.s }, p[[PG:[0-9]+]]/z, [x[[NEWBASE1]], z{{[0-9]+}}.s, sxtw #2] ; CHECK: add x[[NEWBASE2:[0-9]+]], x2, w{{[0-9]+}}, sxtw #2 +; CHECK: ld1w { z{{[0-9]+}}.s }, p[[PG:[0-9]+]]/z, [x[[NEWBASE1]], z{{[0-9]+}}.s, sxtw #2] ; CHECK: ld1w { z{{[0-9]+}}.s }, p[[PG]]/z, [x[[NEWBASE2]], z{{[0-9]+}}.s, sxtw #2] -; CHECK: 
add x[[NEWBASE3:[0-9]+]], x2, w{{[0-9]+}}, sxtw #2 ; CHECK: ld1w { z{{[0-9]+}}.s }, p[[PG]]/z, [x[[NEWBASE3]], z{{[0-9]+}}.s, sxtw #2] entry: %cmp41 = icmp sgt i32 %N, 2 @@ -289,11 +289,11 @@ for.body4: ; preds = %for.body4.lr.ph, %f define dso_local void @test_invariantOffset64bit(i64 noundef %N, i64 noundef %M, ptr noundef %matrix, ptr nocapture noundef %result) local_unnamed_addr #0 { ; CHECK-LABEL: test_invariantOffset64bit: ; CHECK: .LBB5_6: // %for.body4 +; CHECK: add x[[NEWBASE2:[0-9]+]], x2, x{{[0-9]+}}, lsl #3 ; CHECK: add x[[NEWBASE1:[0-9]+]], x2, x{{[0-9]+}}, lsl #3 +; CHECK: add x[[NEWBASE3:[0-9]+]], x2, x{{[0-9]+}}, lsl #3 ; CHECK: ld1d { z{{[0-9]+}}.d }, p[[PG:[0-9]+]]/z, [x[[NEWBASE1]], z{{[0-9]+}}.d, lsl #3] -; CHECK: add x[[NEWBASE2:[0-9]+]], x2, x{{[0-9]+}}, lsl #3 ; CHECK: ld1d { z{{[0-9]+}}.d }, p[[PG]]/z, [x[[NEWBASE2]], z{{[0-9]+}}.d, lsl #3] -; CHECK: add x[[NEWBASE3:[0-9]+]], x2, x{{[0-9]+}}, lsl #3 ; CHECK: ld1d { z{{[0-9]+}}.d }, p[[PG]]/z, [x[[NEWBASE3]], z{{[0-9]+}}.d, lsl #3] entry: %cmp39.not = icmp ult i64 %N, 3 @@ -396,6 +396,7 @@ for.body: ; preds = %for.body, %entry define dso_local void @_Z26test_loop_invariant_offsetPlu11__SVInt64_tl(ptr noundef %base, %index, i64 noundef %invariant_offset) local_unnamed_addr #6 { ; CHECK-LABEL: _Z26test_loop_invariant_offsetPlu11__SVInt64_tl: +; CHECK: .LBB7_1: // %for.body ; CHECK: add x[[NEWBASE:[0-9]+]], x0, x1, lsl #3 ; CHECK: st1d { z{{[0-9]+}}.d }, p{{[0-9]+}}, [x[[NEWBASE]], z{{[0-9]+}}.d, lsl #3] entry: @@ -422,8 +423,9 @@ for.body: ; preds = %entry, %for.body define dso_local void @test_combined_const_and_invariant_offset(ptr noundef %base, %index, i32 noundef %invariant_offset) local_unnamed_addr #0 { ; CHECK-LABEL: test_combined_const_and_invariant_offset: -; CHECK: add x[[NEWBASE_GPR:[0-9]+]], x0, w1, sxtw #2 -; CHECK: add x[[NEWBASE_FINAL:[0-9]+]], x[[NEWBASE_GPR]], #40 +; CHECK: .LBB8_1: // %for.body +; CHECK-DAG: add x[[NEWBASE_GPR:[0-9]+]], x0, w1, sxtw #2 +; CHECK-DAG: add x[[NEWBASE_FINAL:[0-9]+]], x[[NEWBASE_GPR]], #40 ; CHECK: ld1w { z{{[0-9]+}}.s }, p[[PG:[0-9]+]]/z, [x[[NEWBASE_FINAL]], z{{[0-9]+}}.s, sxtw #2] entry: %0 = tail call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) @@ -523,4 +525,4 @@ attributes #9 = { nocallback nofree nosync nounwind willreturn memory(argmem: wr !21 = !{!"int", !7, i64 0} !22 = distinct !{!22, !10} !23 = distinct !{!23, !10} -!24 = distinct !{!24, !10} +!24 = distinct !{!24, !10} \ No newline at end of file diff --git a/llvm/test/CodeGen/AArch64/aarch64-sve-index-mul-simplify.ll b/llvm/test/CodeGen/AArch64/aarch64-sve-index-mul-simplify.ll new file mode 100644 index 000000000000..1c8550625404 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/aarch64-sve-index-mul-simplify.ll @@ -0,0 +1,48 @@ +; RUN: llc -mtriple=aarch64-unknown -mcpu=hip09 -aarch64-sve-simplify-index-multiply -O3 -o - %s | FileCheck %s + +define dso_local void @index_mul_simplify(i32 %loopTime, ptr %x, %val) { +; CHECK-LABEL: index_mul_simplify: +; CHECK: // %bb.1: // %for.body.lr.ph +; CHECK-DAG: mov w[[MULTIPLIER:[0-9]+]], #3 +; CHECK-DAG: index z[[OFFSET_VEC:[0-9]+]].s, #0, #3 +; CHECK-DAG: cntw x[[IV_STEP:[0-9]+]] +; CHECK-DAG: cntw x[[NEW_IV_STEP:[0-9]+]], all, mul #3 +; CHECK-DAG: mul w[[NEW_IV_INIT:[0-9]+]], wzr, w[[MULTIPLIER]] + +; CHECK: .LBB0_2: // %for.body +; CHECK: mov z[[BASE_VEC:[0-9]+]].s, w[[NEW_IV_CUR:[0-9]+]] +; CHECK: whilelt p[[PG:[0-9]+]].s, w{{[0-9]+}}, w{{[0-9]+}} +; CHECK: add w[[NEW_IV_CUR]], w[[NEW_IV_CUR]], w[[NEW_IV_STEP]] +; CHECK: add z[[FINAL_INDICES:[0-9]+]].s, 
z[[OFFSET_VEC]].s, z[[BASE_VEC]].s +; CHECK-NOT: mul +; CHECK: st1w { z0.s }, p[[PG]], [x1, z[[FINAL_INDICES]].s, sxtw #2] + +entry: + %cmp7 = icmp sgt i32 %loopTime, 0 + br i1 %cmp7, label %for.body.lr.ph, label %for.cond.cleanup + +for.body.lr.ph: ; preds = %entry + %.tr = tail call i32 @llvm.vscale.i32() + %0 = shl nuw nsw i32 %.tr, 2 + br label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %for.body.lr.ph, %for.body + %jp.08 = phi i32 [ 0, %for.body.lr.ph ], [ %conv1, %for.body ] + %1 = tail call @llvm.aarch64.sve.whilelt.nxv4i1.i32(i32 %jp.08, i32 %loopTime) + %2 = tail call @llvm.aarch64.sve.index.nxv4i32(i32 %jp.08, i32 1) + %3 = select %1, %2, zeroinitializer + %4 = tail call @llvm.aarch64.sve.mul.nxv4i32( %1, %3, shufflevector ( insertelement ( poison, i32 3, i64 0), poison, zeroinitializer)) + tail call void @llvm.aarch64.sve.st1.scatter.sxtw.index.nxv4f32( %val, %1, ptr %x, %4) + %conv1 = add i32 %0, %jp.08 + %cmp = icmp slt i32 %conv1, %loopTime + br i1 %cmp, label %for.body, label %for.cond.cleanup +} + +declare @llvm.aarch64.sve.whilelt.nxv4i1.i32(i32, i32) +declare @llvm.aarch64.sve.index.nxv4i32(i32, i32) +declare @llvm.aarch64.sve.mul.nxv4i32(, , ) +declare void @llvm.aarch64.sve.st1.scatter.sxtw.index.nxv4f32(, , ptr, ) +declare i32 @llvm.vscale.i32() \ No newline at end of file -- Gitee
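
For context, a minimal C SVE-intrinsics sketch of the source-level loop shape that processSVELoopAddressing targets. This is illustrative only; the function and parameter names below are invented and are not part of the patch, and actual codegen may differ.

#include <arm_sve.h>

// Inside the loop the gather index is "root index + loop-invariant offset",
// so the vector add (and the dup of inv_off) is re-materialised every
// iteration. The peephole folds the invariant part into the scalar base
// (base + inv_off, scaled by the element size), keeps the root index on the
// gather, and leaves the new scalar add for MachineLICM to hoist.
double sum_with_offset(const double *base, int64_t inv_off, int64_t n) {
  svfloat64_t acc = svdup_f64(0.0);
  for (int64_t i = 0; i < n; i += svcntd()) {
    svbool_t pg = svwhilelt_b64(i, n);
    svint64_t root_idx = svindex_s64(i, 1);
    // Loop-variant index computation the pass wants to remove:
    svint64_t idx = svadd_z(pg, root_idx, svdup_s64(inv_off));
    // Emitted roughly as ld1d { z.d }, pg/z, [x_base, z_idx.d, lsl #3];
    // after the rewrite it is effectively
    //   ld1d { z.d }, pg/z, [x_base + inv_off*8, z_root_idx.d, lsl #3].
    svfloat64_t v = svld1_gather_index(pg, base, idx);
    acc = svadd_m(pg, acc, v);
  }
  return svaddv(svptrue_b64(), acc);
}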