From edf4e21db734846ecc6d8339244c564706d0217b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E5=93=B2=E6=B5=A9?= <2209576006@qq.com> Date: Sun, 28 Sep 2025 13:38:02 +0800 Subject: [PATCH 1/2] [AArch64] Simplifies SVE gather/scatter address svadd chains in loops --- llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp | 388 +++++++++++++ llvm/test/CodeGen/AArch64/O3-pipeline.ll | 1 + .../aarch64-sve-addressing-peephole.ll | 526 ++++++++++++++++++ 3 files changed, 915 insertions(+) create mode 100644 llvm/test/CodeGen/AArch64/aarch64-sve-addressing-peephole.ll diff --git a/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp b/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp index c5a6cb7af405..87939363122c 100644 --- a/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp +++ b/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp @@ -24,6 +24,7 @@ #include "Utils/AArch64BaseInfo.h" #include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/SetVector.h" +#include "llvm/Analysis/LoopInfo.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/IRBuilder.h" @@ -41,9 +42,28 @@ using namespace llvm::PatternMatch; #define DEBUG_TYPE "aarch64-sve-intrinsic-opts" +static cl::opt EnableSVELoopAddressChainOpt( + "aarch64-sve-loop-address-chain-opt", cl::init(false), cl::Hidden, + cl::desc("Enable simplification of SVE address computation chains in loops")); + namespace { struct SVEIntrinsicOpts : public ModulePass { static char ID; // Pass identification, replacement for typeid + + enum class SVEIndexExtension { SIGN, ZERO, NONE }; // NONE for i64 indices + + struct SVEMemoryOpInfo { + unsigned BaseOpIdx; + unsigned IndexOpIdx; + Type *ElemTy; // The type of the data element being loaded/stored. + SVEIndexExtension ExtKind; + }; + + // The key is {OriginalBasePointer, Index} + using InvariantBaseKey = std::pair; + // The cache maps this key to the computed GEP. + using InvariantBaseCache = DenseMap; + SVEIntrinsicOpts() : ModulePass(ID) { initializeSVEIntrinsicOptsPass(*PassRegistry::getPassRegistry()); } @@ -60,6 +80,16 @@ private: bool optimizeInstructions(SmallSetVector &Functions); + std::optional getSVEMemoryOpInfo(const IntrinsicInst *II); + Value *getLoopInvariantSplatValue(Value *V, Loop *L); + Value *getHoistedBaseForIndex(Value *Index, Value *OriBase, Loop *L, + InvariantBaseCache &Cache, + SVEMemoryOpInfo &OpInfo); + bool simplifySVEAddressComputation(IntrinsicInst *II, Loop *L, + InvariantBaseCache &Cache, + SVEMemoryOpInfo &OpInfo); + bool runSVEAddressHoisting(Function &F, LoopInfo &LI); + /// Operates at the function-scope. I.e., optimizations are applied local to /// the functions themselves. bool optimizeFunctions(SmallSetVector &Functions); @@ -68,6 +98,7 @@ private: void SVEIntrinsicOpts::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired(); + AU.addRequired(); AU.setPreservesCFG(); } @@ -75,6 +106,7 @@ char SVEIntrinsicOpts::ID = 0; static const char *name = "SVE intrinsics optimizations"; INITIALIZE_PASS_BEGIN(SVEIntrinsicOpts, DEBUG_TYPE, name, false, false) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass); +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass); INITIALIZE_PASS_END(SVEIntrinsicOpts, DEBUG_TYPE, name, false, false) ModulePass *llvm::createSVEIntrinsicOptsPass() { @@ -428,6 +460,332 @@ bool SVEIntrinsicOpts::optimizeInstructions( return Changed; } +/// Checks if an intrinsic is an SVE gather/scatter memory operation that this +/// optimization can analyze. 
Return the operand information (Base index, Index +/// index, Element Type, and Extension Kind) if supported +std::optional +SVEIntrinsicOpts::getSVEMemoryOpInfo(const IntrinsicInst *II) { + switch (II->getIntrinsicID()) { + // Gather Loads + case Intrinsic::aarch64_sve_ld1_gather_sxtw_index: + case Intrinsic::aarch64_sve_ldff1_gather_sxtw_index: + return {{1, 2, + dyn_cast(II->getType())->getElementType(), + SVEIndexExtension::SIGN}}; // Base=1, Index=2, Ext=SIGN + case Intrinsic::aarch64_sve_ld1_gather_uxtw_index: + case Intrinsic::aarch64_sve_ldff1_gather_uxtw_index: + return {{1, 2, + dyn_cast(II->getType())->getElementType(), + SVEIndexExtension::ZERO}}; // Base=1, Index=2, Ext=ZERO + case Intrinsic::aarch64_sve_ld1_gather_index: + case Intrinsic::aarch64_sve_ldff1_gather_index: + case Intrinsic::aarch64_sve_ldnt1_gather_index: + return {{1, 2, + dyn_cast(II->getType())->getElementType(), + SVEIndexExtension::NONE}}; // Base=1, Index=2, Ext=NONE + + // Prefetches (have no return value, element type is based on name) + case Intrinsic::aarch64_sve_prfd_gather_sxtw_index: + return {{1, 2, Type::getInt64Ty(II->getParent()->getContext()), + SVEIndexExtension::SIGN}}; + case Intrinsic::aarch64_sve_prfh_gather_sxtw_index: + return {{1, 2, Type::getInt16Ty(II->getParent()->getContext()), + SVEIndexExtension::SIGN}}; + case Intrinsic::aarch64_sve_prfw_gather_sxtw_index: + return {{1, 2, Type::getInt32Ty(II->getParent()->getContext()), + SVEIndexExtension::SIGN}}; + case Intrinsic::aarch64_sve_prfd_gather_uxtw_index: + return {{1, 2, Type::getInt64Ty(II->getParent()->getContext()), + SVEIndexExtension::ZERO}}; + case Intrinsic::aarch64_sve_prfh_gather_uxtw_index: + return {{1, 2, Type::getInt16Ty(II->getParent()->getContext()), + SVEIndexExtension::ZERO}}; + case Intrinsic::aarch64_sve_prfw_gather_uxtw_index: + return {{1, 2, Type::getInt32Ty(II->getParent()->getContext()), + SVEIndexExtension::ZERO}}; + case Intrinsic::aarch64_sve_prfd_gather_index: + return {{1, 2, Type::getInt64Ty(II->getParent()->getContext()), + SVEIndexExtension::NONE}}; + case Intrinsic::aarch64_sve_prfh_gather_index: + return {{1, 2, Type::getInt16Ty(II->getParent()->getContext()), + SVEIndexExtension::NONE}}; + case Intrinsic::aarch64_sve_prfw_gather_index: + return {{1, 2, Type::getInt32Ty(II->getParent()->getContext()), + SVEIndexExtension::NONE}}; + + // Scatter Stores (data is operand 0, element type is derived from it) + case Intrinsic::aarch64_sve_st1_scatter_sxtw_index: + return {{2, 3, + dyn_cast(II->getOperand(0)->getType()) + ->getElementType(), + SVEIndexExtension::SIGN}}; // Base=2, Index=3, Ext=SIGN + case Intrinsic::aarch64_sve_st1_scatter_uxtw_index: + return {{2, 3, + dyn_cast(II->getOperand(0)->getType()) + ->getElementType(), + SVEIndexExtension::ZERO}}; // Base=2, Index=3, Ext=ZERO + case Intrinsic::aarch64_sve_st1_scatter_index: + case Intrinsic::aarch64_sve_stnt1_scatter_index: + return {{2, 3, + dyn_cast(II->getOperand(0)->getType()) + ->getElementType(), + SVEIndexExtension::NONE}}; // Base=2, Index=3, Ext=NONE + + default: + return std::nullopt; + } +} + +/// Check if a Value is a splat of a loop-invariant scalar, which is a +/// shufflevector of an insertelement at index 0. If the pattern matches, return +/// the loop scalar value. 
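+/// A typical matched form (a splat of the loop-invariant scalar %inv):
+///   %ins   = insertelement <vscale x 4 x i32> poison, i32 %inv, i64 0
+///   %splat = shufflevector <vscale x 4 x i32> %ins, <vscale x 4 x i32> poison, zeroinitializer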
+Value *SVEIntrinsicOpts::getLoopInvariantSplatValue(Value *V, Loop *L) { + Value *InvariantScalar = nullptr; + Value *InsertElementVal = nullptr; + + if (auto *SV = dyn_cast(V)) { + InsertElementVal = SV->getOperand(0); + } else if (auto *SVC = dyn_cast(V)) { + if (SVC->getOpcode() == Instruction::ShuffleVector) { + InsertElementVal = SVC->getOperand(0); + } + } + + if (!InsertElementVal) + return nullptr; + + // Check if InsertElementVal is an insertelement and get the scalar. + if (auto *IE = dyn_cast(InsertElementVal)) { + if (match(IE->getOperand(2), + m_Zero())) { // Ensure it's inserting at index 0 + InvariantScalar = IE->getOperand(1); + } + } else if (auto *IEC = dyn_cast(InsertElementVal)) { + if (IEC->getOpcode() == Instruction::InsertElement && + match(IEC->getOperand(2), m_Zero())) { + InvariantScalar = IEC->getOperand(1); + } + } + + if (!InvariantScalar || !L->isLoopInvariant(InvariantScalar)) + return nullptr; + + return InvariantScalar; +} + +/// Analyzes an index calculation chain and generates hoistable GEPs. +/// @param Index The starting index Value (from the sve memory op) +/// @param OrigBase The original base pointer from the sve memory op +/// @param L The loop context +/// @param Cache A map to memoize results for `{OrigBase, Index} : NewBase` +/// @param OpInfo Information about the sve memory op +/// @return The final, rewritten base pointer for the memory op +Value *SVEIntrinsicOpts::getHoistedBaseForIndex(Value *Index, Value *OrigBase, + Loop *L, + InvariantBaseCache &Cache, + SVEMemoryOpInfo &OpInfo) { + InvariantBaseKey InitialKey = {OrigBase, Index}; + + // If this entire chain has been processed before, return the final result. + if (Cache.count(InitialKey)) + return Cache.lookup(InitialKey); + + // --- Trace the ADD chain up to the root, collecting nodes --- + SmallVector IndexChain; + Value *CurrentIndex = Index; + Value *RootIndex = nullptr; + + // The `while` loop traces the `sve.add` chain upwards from the `Index` used + // by the memory op, collecting all intermediate indices onto a stack + // (`IndexChain`). The trace stops when it hits a value that is not an `(index + // + invariant)` add, which becomes the `RootIndex`. + while (true) { + IndexChain.push_back(CurrentIndex); + InvariantBaseKey CurrentKey = {OrigBase, CurrentIndex}; + + // If a subchain in the chain is already solved, stop tracing + if (Cache.count(CurrentKey)) { + RootIndex = CurrentIndex; + break; + } + + auto *Add = dyn_cast(CurrentIndex); + // Stop if not a recognized sve.add intrinsic or not defined in the loop + if (!Add || + (Add->getIntrinsicID() != Intrinsic::aarch64_sve_add && + Add->getIntrinsicID() != Intrinsic::aarch64_sve_add_u) || + !L->contains(Add)) { + RootIndex = CurrentIndex; + break; + } + + Value *Op1 = Add->getOperand(1); + Value *Op2 = Add->getOperand(2); + + // Check if one of the operands is an invariant splat + if (getLoopInvariantSplatValue(Op1, L)) { + CurrentIndex = Op2; + if (match(Op2, m_Select(m_Value(), m_Value(), m_Zero()))) + CurrentIndex = dyn_cast(Op2)->getOperand(1); + } else if (getLoopInvariantSplatValue(Op2, L)) { + CurrentIndex = Op1; + if (match(Op1, m_Select(m_Value(), m_Value(), m_Zero()))) + CurrentIndex = dyn_cast(Op1)->getOperand(1); + } else { + RootIndex = CurrentIndex; // Not an (index + invariant) form. 
+ break; + } + } + + // --- Build GEPs back down the chain --- + // The base for the root index is always the original base pointer + Value *CurrentHoistedBase = Cache.lookup({OrigBase, RootIndex}); + if (!CurrentHoistedBase) { + CurrentHoistedBase = OrigBase; + Cache[{OrigBase, RootIndex}] = OrigBase; + } + + // Iterates down the collected chain (in reverse). For each node, it computes + // the new hoisted base by creating a GEP on top of the base of the previous + // node in the chain. + for (Value *IdxNode : reverse(IndexChain)) { + if (IdxNode == RootIndex) + continue; + + InvariantBaseKey CurrentKey = {OrigBase, IdxNode}; + auto *Add = dyn_cast(IdxNode); + Value *Op1 = Add->getOperand(1); + Value *Op2 = Add->getOperand(2); + + Value *InvariantScalar = getLoopInvariantSplatValue(Op1, L); + if (!InvariantScalar) + InvariantScalar = getLoopInvariantSplatValue(Op2, L); + assert(InvariantScalar); + + IRBuilder<> Builder(Add); + Value *GEPIndex = InvariantScalar; + + // Ensure the invariant has the correct integer type for GEP + switch (OpInfo.ExtKind) { + case SVEIndexExtension::SIGN: + GEPIndex = Builder.CreateSExt( + GEPIndex, Type::getInt64Ty(Add->getParent()->getContext()), + "invariant.idx.sext"); + break; + case SVEIndexExtension::ZERO: + GEPIndex = Builder.CreateZExt( + GEPIndex, Type::getInt64Ty(Add->getParent()->getContext()), + "invariant.idx.zext"); + break; + case SVEIndexExtension::NONE: + break; + } + + Value *NewBase = Builder.CreateGEP(OpInfo.ElemTy, CurrentHoistedBase, + GEPIndex, "add.ptr"); + // Cache the result for this node and update the base for the next iteration + Cache[CurrentKey] = NewBase; + CurrentHoistedBase = NewBase; + } + + return Cache.lookup(InitialKey); +} + +/// Get the final rewritten base and root index, and rewrite the memory +/// intrinsic +bool SVEIntrinsicOpts::simplifySVEAddressComputation( + IntrinsicInst *MemIntrinsic, Loop *L, InvariantBaseCache &Cache, + SVEMemoryOpInfo &OpInfo) { + Value *OrigBase = MemIntrinsic->getArgOperand(OpInfo.BaseOpIdx); + Value *OrigIndex = MemIntrinsic->getArgOperand(OpInfo.IndexOpIdx); + + // If the base itself is not loop invariant, skip simplification + if (!L->isLoopInvariant(OrigBase)) + return false; + + // The actual index might be hidden behind a `select(pg, index, zero)` + // Peel this away to get to the core index calculation + Value *IndexToTrace = OrigIndex; + if (match(IndexToTrace, m_Select(m_Value(), m_Value(), m_Zero()))) { + IndexToTrace = dyn_cast(IndexToTrace)->getOperand(1); + } + // This call populates the cache for the entire chain and returns the final + // base + Value *NewBase = + getHoistedBaseForIndex(IndexToTrace, OrigBase, L, Cache, OpInfo); + + // If the base pointer hasn't changed, nothing was optimized. + if (NewBase == OrigBase) + return false; + + // Now that the cache is populated, trace up from the starting index to find + // the root. + Value *RootIndex = IndexToTrace; + while (true) { + InvariantBaseKey CurrentKey = {OrigBase, RootIndex}; + // The root is the node that maps back to the original base in the cache. 
+ if (Cache.count(CurrentKey) && Cache.lookup(CurrentKey) == OrigBase) { + break; + } + + auto *Add = dyn_cast(RootIndex); + Value *NextIndex = nullptr; + if (getLoopInvariantSplatValue(Add->getOperand(1), L)) { + NextIndex = Add->getOperand(2); + } else if (getLoopInvariantSplatValue(Add->getOperand(2), L)) { + NextIndex = Add->getOperand(1); + } else { + break; // Reached a non-optimizable ADD, this is the root + } + + if (match(NextIndex, m_Select(m_Value(), m_Value(), m_Zero()))) { + RootIndex = dyn_cast(NextIndex)->getOperand(1); + } else { + RootIndex = NextIndex; + } + } + + LLVM_DEBUG(dbgs() << "SVE_ADDR_HOIST_IR: Rewriting " << *MemIntrinsic + << "\n"); + + MemIntrinsic->setArgOperand(OpInfo.BaseOpIdx, NewBase); + MemIntrinsic->setArgOperand(OpInfo.IndexOpIdx, RootIndex); + + // Cleanup would be complex. Rely on DCE for now. + + LLVM_DEBUG(dbgs() << "SVE_ADDR_HOIST_IR: To -> " << *MemIntrinsic << "\n"); + + return true; +} + +/// iterates through all basic blocks in a function. For each block +/// that is part of a loop, it creates a fresh cache and then iterates through +/// its instructions in program order, attempting to simplify any SVE memory +/// operations it finds. +bool SVEIntrinsicOpts::runSVEAddressHoisting(Function &F, LoopInfo &LI) { + bool Changed = false; + for (auto &BB : F) { + // We only care about blocks that are inside a loop. + Loop *L = LI.getLoopFor(&BB); + if (!L) + continue; + + // A fresh cache is used for each basic block to ensure correctness. + // Maps {OriginalBasePointer, Index} to the new computed GEP. + InvariantBaseCache Cache; + + // Iterate through instructions in program order (important!) + for (auto &I : BB) { + if (auto *II = dyn_cast(&I)) { + if (auto OpInfo = getSVEMemoryOpInfo(II)) + Changed |= simplifySVEAddressComputation(II, L, Cache, *OpInfo); + } + } + } + + return Changed; +} + bool SVEIntrinsicOpts::optimizeFunctions( SmallSetVector &Functions) { bool Changed = false; @@ -435,6 +793,16 @@ bool SVEIntrinsicOpts::optimizeFunctions( Changed |= optimizePTrueIntrinsicCalls(Functions); Changed |= optimizeInstructions(Functions); + if (EnableSVELoopAddressChainOpt) { + for (Function *F : Functions) { + if (F->isDeclaration()) + continue; + + LoopInfo &LI = getAnalysis(*F).getLoopInfo(); + Changed |= runSVEAddressHoisting(*F, LI); + } + } + return Changed; } @@ -453,6 +821,26 @@ bool SVEIntrinsicOpts::runOnModule(Module &M) { case Intrinsic::vector_extract: case Intrinsic::vector_insert: case Intrinsic::aarch64_sve_ptrue: + case Intrinsic::aarch64_sve_ld1_gather_sxtw_index: + case Intrinsic::aarch64_sve_ld1_gather_uxtw_index: + case Intrinsic::aarch64_sve_ld1_gather_index: + case Intrinsic::aarch64_sve_ldff1_gather_sxtw_index: + case Intrinsic::aarch64_sve_ldff1_gather_uxtw_index: + case Intrinsic::aarch64_sve_ldff1_gather_index: + case Intrinsic::aarch64_sve_ldnt1_gather_index: + case Intrinsic::aarch64_sve_prfd_gather_sxtw_index: + case Intrinsic::aarch64_sve_prfd_gather_uxtw_index: + case Intrinsic::aarch64_sve_prfd_gather_index: + case Intrinsic::aarch64_sve_prfh_gather_sxtw_index: + case Intrinsic::aarch64_sve_prfh_gather_uxtw_index: + case Intrinsic::aarch64_sve_prfh_gather_index: + case Intrinsic::aarch64_sve_prfw_gather_sxtw_index: + case Intrinsic::aarch64_sve_prfw_gather_uxtw_index: + case Intrinsic::aarch64_sve_prfw_gather_index: + case Intrinsic::aarch64_sve_st1_scatter_sxtw_index: + case Intrinsic::aarch64_sve_st1_scatter_uxtw_index: + case Intrinsic::aarch64_sve_st1_scatter_index: + case 
Intrinsic::aarch64_sve_stnt1_scatter_index: for (User *U : F.users()) Functions.insert(cast(U)->getFunction()); break; diff --git a/llvm/test/CodeGen/AArch64/O3-pipeline.ll b/llvm/test/CodeGen/AArch64/O3-pipeline.ll index 3747b2581fa4..4ee13788c2c4 100644 --- a/llvm/test/CodeGen/AArch64/O3-pipeline.ll +++ b/llvm/test/CodeGen/AArch64/O3-pipeline.ll @@ -25,6 +25,7 @@ ; CHECK-NEXT: SVE intrinsics optimizations ; CHECK-NEXT: FunctionPass Manager ; CHECK-NEXT: Dominator Tree Construction +; CHECK-NEXT: Natural Loop Information ; CHECK-NEXT: FunctionPass Manager ; CHECK-NEXT: Simplify the CFG ; CHECK-NEXT: Dominator Tree Construction diff --git a/llvm/test/CodeGen/AArch64/aarch64-sve-addressing-peephole.ll b/llvm/test/CodeGen/AArch64/aarch64-sve-addressing-peephole.ll new file mode 100644 index 000000000000..85d286c165ca --- /dev/null +++ b/llvm/test/CodeGen/AArch64/aarch64-sve-addressing-peephole.ll @@ -0,0 +1,526 @@ +; RUN: llc -mtriple=aarch64-unknown -mcpu=hip09 -aarch64-sve-loop-address-chain-opt -O3 %s -o - | FileCheck %s + +define dso_local void @test_gather_multi_constOffset(i32 noundef %loopTime, ptr noundef %x, float noundef %ipx, float noundef %ipy, float noundef %ipz, ptr nocapture noundef nonnull align 4 dereferenceable(4) %tempx, ptr nocapture noundef nonnull align 4 dereferenceable(4) %tempy, ptr nocapture noundef nonnull align 4 dereferenceable(4) %tempz) local_unnamed_addr #0 { +; CHECK-LABEL: test_gather_multi_constOffset: +; CHECK: .LBB0_2: // %for.body +; CHECK: add x[[NEWBASE1:[0-9]+]], x1, #4 +; CHECK: ld1w { z{{[0-9]+}}.s }, p[[PG:[0-9]+]]/z, [x1, z{{[0-9]+}}.s, sxtw #2] +; CHECK: ld1w { z{{[0-9]+}}.s }, p[[PG]]/z, [x[[NEWBASE1]], z{{[0-9]+}}.s, sxtw #2] +; CHECK: add x[[NEWBASE2:[0-9]+]], x1, #8 +; CHECK: ld1w { z{{[0-9]+}}.s }, p[[PG]]/z, [x[[NEWBASE2]], z{{[0-9]+}}.s, sxtw #2] +entry: + %cmp18 = icmp sgt i32 %loopTime, 0 + br i1 %cmp18, label %for.body.lr.ph, label %for.cond.cleanup + +for.body.lr.ph: ; preds = %entry + %.splatinsert = insertelement poison, float %ipx, i64 0 + %.splat = shufflevector %.splatinsert, poison, zeroinitializer + %.splatinsert2 = insertelement poison, float %ipy, i64 0 + %.splat3 = shufflevector %.splatinsert2, poison, zeroinitializer + %.splatinsert5 = insertelement poison, float %ipz, i64 0 + %.splat6 = shufflevector %.splatinsert5, poison, zeroinitializer + %.tr = tail call i32 @llvm.vscale.i32() + %0 = shl nuw nsw i32 %.tr, 2 + br label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %for.body.lr.ph, %for.body + %jp.019 = phi i32 [ 0, %for.body.lr.ph ], [ %conv10, %for.body ] + %1 = tail call @llvm.aarch64.sve.whilelt.nxv4i1.i32(i32 %jp.019, i32 %loopTime) + %2 = tail call @llvm.aarch64.sve.index.nxv4i32(i32 %jp.019, i32 1) + %3 = select %1, %2, zeroinitializer + %4 = tail call @llvm.aarch64.sve.mul.nxv4i32( %1, %3, shufflevector ( insertelement ( poison, i32 3, i64 0), poison, zeroinitializer)) + %5 = tail call @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv4f32( %1, ptr %x, %4) + %6 = select %1, %5, zeroinitializer + %7 = tail call @llvm.aarch64.sve.fsubr.nxv4f32( %1, %6, %.splat) + %8 = select %1, %7, zeroinitializer + %9 = tail call @llvm.aarch64.sve.fmul.nxv4f32( %1, %8, %7) + %10 = select %1, %4, zeroinitializer + %11 = tail call @llvm.aarch64.sve.add.nxv4i32( %1, %10, shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer)) + %12 = tail call @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv4f32( %1, ptr %x, %11) + %13 = select %1, %12, zeroinitializer + %14 = tail call 
@llvm.aarch64.sve.fsub.nxv4f32( %1, %13, %.splat3) + %15 = select %1, %14, zeroinitializer + %16 = tail call @llvm.aarch64.sve.fmad.nxv4f32( %1, %15, %14, %9) + %17 = select %1, %11, zeroinitializer + %18 = tail call @llvm.aarch64.sve.add.nxv4i32( %1, %17, shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer)) + %19 = tail call @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv4f32( %1, ptr %x, %18) + %20 = select %1, %19, zeroinitializer + %21 = tail call @llvm.aarch64.sve.fsub.nxv4f32( %1, %20, %.splat6) + %22 = select %1, %21, zeroinitializer + %23 = tail call @llvm.aarch64.sve.fmad.nxv4f32( %1, %22, %21, %16) + %24 = tail call @llvm.aarch64.sve.fsqrt.nxv4f32( zeroinitializer, %1, %23) + %25 = tail call @llvm.aarch64.sve.fmul.nxv4f32( %1, %8, %24) + %26 = tail call float @llvm.aarch64.sve.faddv.nxv4f32( %1, %25) + %27 = load float, ptr %tempx, align 4, !tbaa !5 + %add = fadd float %26, %27 + store float %add, ptr %tempx, align 4, !tbaa !5 + %28 = tail call @llvm.aarch64.sve.fmul.nxv4f32( %1, %15, %24) + %29 = tail call float @llvm.aarch64.sve.faddv.nxv4f32( %1, %28) + %30 = load float, ptr %tempy, align 4, !tbaa !5 + %add7 = fadd float %29, %30 + store float %add7, ptr %tempy, align 4, !tbaa !5 + %31 = tail call @llvm.aarch64.sve.fmul.nxv4f32( %1, %22, %24) + %32 = tail call float @llvm.aarch64.sve.faddv.nxv4f32( %1, %31) + %33 = load float, ptr %tempz, align 4, !tbaa !5 + %add8 = fadd float %32, %33 + store float %add8, ptr %tempz, align 4, !tbaa !5 + %conv10 = add i32 %0, %jp.019 + %cmp = icmp slt i32 %conv10, %loopTime + br i1 %cmp, label %for.body, label %for.cond.cleanup, !llvm.loop !9 +} + +define dso_local void @test_scatter_constOffset(i32 noundef %loopTime, ptr noalias noundef %dst, ptr noalias nocapture noundef readonly %tempx, ptr noalias nocapture noundef readonly %tempy, ptr noalias nocapture noundef readonly %tempz) local_unnamed_addr #0 { +; CHECK-LABEL: test_scatter_constOffset: +; CHECK: .LBB1_2: // %for.body +; CHECK: add x[[NEWBASE1:[0-9]+]], x1, #4 +; CHECK: st1w { z{{[0-9]+}}.s }, p[[PG:[0-9]+]], [x1, z{{[0-9]+}}.s, sxtw #2] +; CHECK: st1w { z{{[0-9]+}}.s }, p[[PG]], [x[[NEWBASE1]], z{{[0-9]+}}.s, sxtw #2] +; CHECK: add x[[NEWBASE2:[0-9]+]], x1, #8 +; CHECK: st1w { z{{[0-9]+}}.s }, p[[PG]], [x[[NEWBASE2]], z{{[0-9]+}}.s, sxtw #2] +entry: + %cmp15 = icmp sgt i32 %loopTime, 0 + br i1 %cmp15, label %for.body.lr.ph, label %for.cond.cleanup + +for.body.lr.ph: ; preds = %entry + %.tr = tail call i32 @llvm.vscale.i32() + %0 = shl nuw nsw i32 %.tr, 2 + br label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %for.body.lr.ph, %for.body + %jp.016 = phi i32 [ 0, %for.body.lr.ph ], [ %conv5, %for.body ] + %1 = tail call @llvm.aarch64.sve.whilelt.nxv4i1.i32(i32 %jp.016, i32 %loopTime) + %2 = tail call @llvm.aarch64.sve.index.nxv4i32(i32 %jp.016, i32 1) + %3 = select %1, %2, zeroinitializer + %4 = tail call @llvm.aarch64.sve.mul.nxv4i32( %1, %3, shufflevector ( insertelement ( poison, i32 3, i64 0), poison, zeroinitializer)) + %5 = select %1, %4, zeroinitializer + %6 = tail call @llvm.aarch64.sve.add.nxv4i32( %1, %5, shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer)) + %7 = tail call @llvm.aarch64.sve.add.nxv4i32( %1, %5, shufflevector ( insertelement ( poison, i32 2, i64 0), poison, zeroinitializer)) + %idx.ext = sext i32 %jp.016 to i64 + %add.ptr = getelementptr inbounds float, ptr %tempx, i64 %idx.ext + %8 = tail call @llvm.masked.load.nxv4f32.p0(ptr %add.ptr, i32 1, %1, 
zeroinitializer), !tbaa !5 + %add.ptr2 = getelementptr inbounds float, ptr %tempy, i64 %idx.ext + %9 = tail call @llvm.masked.load.nxv4f32.p0(ptr %add.ptr2, i32 1, %1, zeroinitializer), !tbaa !5 + %add.ptr4 = getelementptr inbounds float, ptr %tempz, i64 %idx.ext + %10 = tail call @llvm.masked.load.nxv4f32.p0(ptr %add.ptr4, i32 1, %1, zeroinitializer), !tbaa !5 + tail call void @llvm.aarch64.sve.st1.scatter.sxtw.index.nxv4f32( %8, %1, ptr %dst, %4) + tail call void @llvm.aarch64.sve.st1.scatter.sxtw.index.nxv4f32( %9, %1, ptr %dst, %6) + tail call void @llvm.aarch64.sve.st1.scatter.sxtw.index.nxv4f32( %10, %1, ptr %dst, %7) + %conv5 = add i32 %0, %jp.016 + %cmp = icmp slt i32 %conv5, %loopTime + br i1 %cmp, label %for.body, label %for.cond.cleanup, !llvm.loop !11 +} + +define dso_local void @test_prefetch_constOffset(i32 noundef %loopTime, ptr nocapture noundef %data) local_unnamed_addr #4 { +; CHECK-LABEL: test_prefetch_constOffset: +; CHECK: // %bb.4: // %if.end +; CHECK: add x[[NEWBASE1:[0-9]+]], x1, #4 +; CHECK: prfw pldl1keep, p[[PG:[0-9]+]], [x1, z{{[0-9]+}}.s, sxtw #2] +; CHECK: prfw pldl1keep, p[[PG]], [x[[NEWBASE1]], z{{[0-9]+}}.s, sxtw #2] +; CHECK: add x[[NEWBASE2:[0-9]+]], x1, #8 +; CHECK: prfw pldl1keep, p[[PG]], [x[[NEWBASE2]], z{{[0-9]+}}.s, sxtw #2] +entry: + %.tr = tail call i32 @llvm.vscale.i32() + %conv = shl nuw nsw i32 %.tr, 2 + %cmp13 = icmp sgt i32 %loopTime, 0 + br i1 %cmp13, label %for.body.lr.ph, label %for.cond.cleanup + +for.body.lr.ph: ; preds = %entry + %0 = tail call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + br label %for.body + +for.cond.cleanup: ; preds = %cleanup, %entry + ret void + +for.body: ; preds = %for.body.lr.ph, %cleanup + %jp.014 = phi i32 [ 0, %for.body.lr.ph ], [ %add, %cleanup ] + %add = add i32 %jp.014, %conv + %1 = tail call @llvm.aarch64.sve.whilelt.nxv4i1.i32(i32 %add, i32 %loopTime) + %2 = tail call i1 @llvm.aarch64.sve.ptest.any.nxv4i1( %0, %1) + br i1 %2, label %if.end, label %cleanup + +if.end: ; preds = %for.body + %3 = tail call @llvm.aarch64.sve.index.nxv4i32(i32 %add, i32 1) + %4 = select %1, %3, zeroinitializer + %5 = tail call @llvm.aarch64.sve.mul.nxv4i32( %1, %4, shufflevector ( insertelement ( poison, i32 3, i64 0), poison, zeroinitializer)) + tail call void @llvm.aarch64.sve.prfw.gather.sxtw.index.nxv4i32( %1, ptr %data, %5, i32 0) + %6 = select %1, %5, zeroinitializer + %7 = tail call @llvm.aarch64.sve.add.nxv4i32( %1, %6, shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer)) + tail call void @llvm.aarch64.sve.prfw.gather.sxtw.index.nxv4i32( %1, ptr %data, %7, i32 0) + %8 = select %1, %7, zeroinitializer + %9 = tail call @llvm.aarch64.sve.add.nxv4i32( %1, %8, shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer)) + tail call void @llvm.aarch64.sve.prfw.gather.sxtw.index.nxv4i32( %1, ptr %data, %9, i32 0) + br label %cleanup + +cleanup: ; preds = %for.body, %if.end + %cmp = icmp slt i32 %add, %loopTime + br i1 %cmp, label %for.body, label %for.cond.cleanup, !llvm.loop !12 +} + +define dso_local void @test_stride_constOffset(i32 noundef %loopTime, ptr noundef %data, ptr nocapture noundef %result) local_unnamed_addr #0 { +; CHECK-LABEL: test_stride_constOffset: +; CHECK: .LBB3_2: // %for.body +; CHECK: add x[[NEWBASE1:[0-9]+]], x1, #8 +; CHECK: ld1w { z{{[0-9]+}}.s }, p[[PG:[0-9]+]]/z, [x1, z{{[0-9]+}}.s, sxtw #2] +; CHECK: ld1w { z{{[0-9]+}}.s }, p[[PG]]/z, [x[[NEWBASE1]], z{{[0-9]+}}.s, sxtw #2] +; CHECK: add x[[NEWBASE2:[0-9]+]], x1, #16 +; CHECK: ld1w { z{{[0-9]+}}.s }, 
p[[PG]]/z, [x[[NEWBASE2]], z{{[0-9]+}}.s, sxtw #2] +entry: + %cmp9 = icmp sgt i32 %loopTime, 0 + br i1 %cmp9, label %for.body.lr.ph, label %for.cond.cleanup + +for.body.lr.ph: ; preds = %entry + %.tr = tail call i32 @llvm.vscale.i32() + %0 = shl nuw nsw i32 %.tr, 2 + br label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %for.body.lr.ph, %for.body + %jp.010 = phi i32 [ 0, %for.body.lr.ph ], [ %conv1, %for.body ] + %1 = tail call @llvm.aarch64.sve.whilelt.nxv4i1.i32(i32 %jp.010, i32 %loopTime) + %2 = tail call @llvm.aarch64.sve.index.nxv4i32(i32 %jp.010, i32 1) + %3 = select %1, %2, zeroinitializer + %4 = tail call @llvm.aarch64.sve.mul.nxv4i32( %1, %3, shufflevector ( insertelement ( poison, i32 2, i64 0), poison, zeroinitializer)) + %5 = select %1, %4, zeroinitializer + %6 = tail call @llvm.aarch64.sve.add.nxv4i32( %1, %5, zeroinitializer) + %7 = tail call @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv4f32( %1, ptr %data, %6) + %8 = tail call @llvm.aarch64.sve.add.nxv4i32( %1, %5, shufflevector ( insertelement ( poison, i32 2, i64 0), poison, zeroinitializer)) + %9 = tail call @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv4f32( %1, ptr %data, %8) + %10 = tail call @llvm.aarch64.sve.add.nxv4i32( %1, %5, shufflevector ( insertelement ( poison, i32 4, i64 0), poison, zeroinitializer)) + %11 = tail call @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv4f32( %1, ptr %data, %10) + %12 = select %1, %7, zeroinitializer + %13 = tail call @llvm.aarch64.sve.fadd.nxv4f32( %1, %12, %9) + %14 = select %1, %13, zeroinitializer + %15 = tail call @llvm.aarch64.sve.fadd.nxv4f32( %1, %14, %11) + %idx.ext = sext i32 %jp.010 to i64 + %add.ptr = getelementptr inbounds float, ptr %result, i64 %idx.ext + tail call void @llvm.masked.store.nxv4f32.p0( %15, ptr %add.ptr, i32 1, %1), !tbaa !5 + %conv1 = add i32 %0, %jp.010 + %cmp = icmp slt i32 %conv1, %loopTime + br i1 %cmp, label %for.body, label %for.cond.cleanup, !llvm.loop !13 +} + +define dso_local void @test_invariantOffset32bit(i32 noundef %N, i32 noundef %M, ptr noundef %matrix, ptr nocapture noundef %result) local_unnamed_addr #0 { +; CHECK-LABEL: test_invariantOffset32bit: +; CHECK: .LBB4_5: // %for.body4 +; CHECK: add x[[NEWBASE1:[0-9]+]], x2, w{{[0-9]+}}, sxtw #2 +; CHECK: ld1w { z{{[0-9]+}}.s }, p[[PG:[0-9]+]]/z, [x[[NEWBASE1]], z{{[0-9]+}}.s, sxtw #2] +; CHECK: add x[[NEWBASE2:[0-9]+]], x2, w{{[0-9]+}}, sxtw #2 +; CHECK: ld1w { z{{[0-9]+}}.s }, p[[PG]]/z, [x[[NEWBASE2]], z{{[0-9]+}}.s, sxtw #2] +; CHECK: add x[[NEWBASE3:[0-9]+]], x2, w{{[0-9]+}}, sxtw #2 +; CHECK: ld1w { z{{[0-9]+}}.s }, p[[PG]]/z, [x[[NEWBASE3]], z{{[0-9]+}}.s, sxtw #2] +entry: + %cmp41 = icmp sgt i32 %N, 2 + br i1 %cmp41, label %for.cond1.preheader.lr.ph, label %for.cond.cleanup + +for.cond1.preheader.lr.ph: ; preds = %entry + %div51 = udiv i32 %N, 3 + %cmp239 = icmp sgt i32 %M, 0 + %0 = sext i32 %M to i64 + %wide.trip.count = zext i32 %div51 to i64 + br label %for.cond1.preheader + +for.cond1.preheader: ; preds = %for.cond1.preheader.lr.ph, %for.cond.cleanup3 + %indvars.iv = phi i64 [ 0, %for.cond1.preheader.lr.ph ], [ %indvars.iv.next, %for.cond.cleanup3 ] + br i1 %cmp239, label %for.body4.lr.ph, label %for.cond.cleanup3 + +for.body4.lr.ph: ; preds = %for.cond1.preheader + %1 = mul nuw nsw i64 %indvars.iv, 3 + %2 = trunc i64 %1 to i32 + %3 = mul i32 %2, %M + %.splatinsert = insertelement poison, i32 %3, i64 0 + %.splat = shufflevector %.splatinsert, poison, zeroinitializer + %4 = trunc i64 %1 to i32 + %5 = add i32 %4, 1 + %6 = mul i32 %5, %M + 
%.splatinsert9 = insertelement poison, i32 %6, i64 0 + %.splat10 = shufflevector %.splatinsert9, poison, zeroinitializer + %7 = trunc i64 %1 to i32 + %8 = add i32 %7, 2 + %9 = mul i32 %8, %M + %.splatinsert14 = insertelement poison, i32 %9, i64 0 + %.splat15 = shufflevector %.splatinsert14, poison, zeroinitializer + %10 = mul nsw i64 %indvars.iv, %0 + %add.ptr = getelementptr inbounds float, ptr %result, i64 %10 + %.tr = tail call i32 @llvm.vscale.i32() + %11 = shl nuw nsw i32 %.tr, 2 + br label %for.body4 + +for.cond.cleanup: ; preds = %for.cond.cleanup3, %entry + ret void + +for.cond.cleanup3: ; preds = %for.body4, %for.cond1.preheader + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.cond.cleanup, label %for.cond1.preheader, !llvm.loop !14 + +for.body4: ; preds = %for.body4.lr.ph, %for.body4 + %jp.040 = phi i32 [ 0, %for.body4.lr.ph ], [ %conv20, %for.body4 ] + %12 = tail call @llvm.aarch64.sve.whilelt.nxv4i1.i32(i32 %jp.040, i32 %M) + %13 = tail call @llvm.aarch64.sve.index.nxv4i32(i32 %jp.040, i32 1) + %14 = select %12, %13, zeroinitializer + %15 = tail call @llvm.aarch64.sve.add.nxv4i32( %12, %14, %.splat) + %16 = tail call @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv4f32( %12, ptr %matrix, %15) + %17 = tail call @llvm.aarch64.sve.add.nxv4i32( %12, %14, %.splat10) + %18 = tail call @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv4f32( %12, ptr %matrix, %17) + %19 = tail call @llvm.aarch64.sve.add.nxv4i32( %12, %14, %.splat15) + %20 = tail call @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv4f32( %12, ptr %matrix, %19) + %21 = select %12, %16, zeroinitializer + %22 = tail call @llvm.aarch64.sve.fadd.nxv4f32( %12, %21, %18) + %23 = select %12, %22, zeroinitializer + %24 = tail call @llvm.aarch64.sve.fadd.nxv4f32( %12, %23, %20) + %25 = select %12, %24, zeroinitializer + %26 = tail call @llvm.aarch64.sve.fdiv.nxv4f32( %12, %25, shufflevector ( insertelement ( poison, float 3.000000e+00, i64 0), poison, zeroinitializer)) + %idx.ext17 = sext i32 %jp.040 to i64 + %add.ptr18 = getelementptr inbounds float, ptr %add.ptr, i64 %idx.ext17 + tail call void @llvm.masked.store.nxv4f32.p0( %26, ptr %add.ptr18, i32 1, %12), !tbaa !5 + %conv20 = add i32 %11, %jp.040 + %cmp2 = icmp slt i32 %conv20, %M + br i1 %cmp2, label %for.body4, label %for.cond.cleanup3, !llvm.loop !15 +} + +define dso_local void @test_invariantOffset64bit(i64 noundef %N, i64 noundef %M, ptr noundef %matrix, ptr nocapture noundef %result) local_unnamed_addr #0 { +; CHECK-LABEL: test_invariantOffset64bit: +; CHECK: .LBB5_6: // %for.body4 +; CHECK: add x[[NEWBASE1:[0-9]+]], x2, x{{[0-9]+}}, lsl #3 +; CHECK: ld1d { z{{[0-9]+}}.d }, p[[PG:[0-9]+]]/z, [x[[NEWBASE1]], z{{[0-9]+}}.d, lsl #3] +; CHECK: add x[[NEWBASE2:[0-9]+]], x2, x{{[0-9]+}}, lsl #3 +; CHECK: ld1d { z{{[0-9]+}}.d }, p[[PG]]/z, [x[[NEWBASE2]], z{{[0-9]+}}.d, lsl #3] +; CHECK: add x[[NEWBASE3:[0-9]+]], x2, x{{[0-9]+}}, lsl #3 +; CHECK: ld1d { z{{[0-9]+}}.d }, p[[PG]]/z, [x[[NEWBASE3]], z{{[0-9]+}}.d, lsl #3] +entry: + %cmp39.not = icmp ult i64 %N, 3 + br i1 %cmp39.not, label %for.cond.cleanup, label %for.cond1.preheader.lr.ph + +for.cond1.preheader.lr.ph: ; preds = %entry + %div = udiv i64 %N, 3 + %cmp237.not = icmp eq i64 %M, 0 + br label %for.cond1.preheader + +for.cond1.preheader: ; preds = %for.cond1.preheader.lr.ph, %for.cond.cleanup3 + %i.040 = phi i64 [ 0, %for.cond1.preheader.lr.ph ], [ %inc, %for.cond.cleanup3 ] + br i1 %cmp237.not, label %for.cond.cleanup3, label 
%for.body4.lr.ph + +for.body4.lr.ph: ; preds = %for.cond1.preheader + %mul = mul nuw i64 %i.040, 3 + %mul5 = mul i64 %mul, %M + %.splatinsert = insertelement poison, i64 %mul5, i64 0 + %.splat = shufflevector %.splatinsert, poison, zeroinitializer + %add7 = add nuw i64 %mul, 1 + %mul8 = mul i64 %add7, %M + %.splatinsert9 = insertelement poison, i64 %mul8, i64 0 + %.splat10 = shufflevector %.splatinsert9, poison, zeroinitializer + %add12 = add nuw i64 %mul, 2 + %mul13 = mul i64 %add12, %M + %.splatinsert14 = insertelement poison, i64 %mul13, i64 0 + %.splat15 = shufflevector %.splatinsert14, poison, zeroinitializer + %mul16 = mul i64 %i.040, %M + %add.ptr = getelementptr inbounds double, ptr %result, i64 %mul16 + %0 = tail call i64 @llvm.vscale.i64() + %1 = shl nuw nsw i64 %0, 1 + br label %for.body4 + +for.cond.cleanup: ; preds = %for.cond.cleanup3, %entry + ret void + +for.cond.cleanup3: ; preds = %for.body4, %for.cond1.preheader + %inc = add nuw nsw i64 %i.040, 1 + %exitcond.not = icmp eq i64 %inc, %div + br i1 %exitcond.not, label %for.cond.cleanup, label %for.cond1.preheader, !llvm.loop !16 + +for.body4: ; preds = %for.body4.lr.ph, %for.body4 + %jp.038 = phi i64 [ 0, %for.body4.lr.ph ], [ %add18, %for.body4 ] + %2 = tail call @llvm.aarch64.sve.whilelo.nxv2i1.i64(i64 %jp.038, i64 %M) + %3 = tail call @llvm.aarch64.sve.index.nxv2i64(i64 %jp.038, i64 1) + %4 = select %2, %3, zeroinitializer + %5 = tail call @llvm.aarch64.sve.add.nxv2i64( %2, %4, %.splat) + %6 = tail call @llvm.aarch64.sve.ld1.gather.index.nxv2f64( %2, ptr %matrix, %5) + %7 = tail call @llvm.aarch64.sve.add.nxv2i64( %2, %4, %.splat10) + %8 = tail call @llvm.aarch64.sve.ld1.gather.index.nxv2f64( %2, ptr %matrix, %7) + %9 = tail call @llvm.aarch64.sve.add.nxv2i64( %2, %4, %.splat15) + %10 = tail call @llvm.aarch64.sve.ld1.gather.index.nxv2f64( %2, ptr %matrix, %9) + %11 = select %2, %6, zeroinitializer + %12 = tail call @llvm.aarch64.sve.fadd.nxv2f64( %2, %11, %8) + %13 = select %2, %12, zeroinitializer + %14 = tail call @llvm.aarch64.sve.fadd.nxv2f64( %2, %13, %10) + %15 = select %2, %14, zeroinitializer + %16 = tail call @llvm.aarch64.sve.fdiv.nxv2f64( %2, %15, shufflevector ( insertelement ( poison, double 3.000000e+00, i64 0), poison, zeroinitializer)) + %add.ptr17 = getelementptr inbounds double, ptr %add.ptr, i64 %jp.038 + tail call void @llvm.masked.store.nxv2f64.p0( %16, ptr %add.ptr17, i32 1, %2), !tbaa !17 + %add18 = add i64 %1, %jp.038 + %cmp2 = icmp ult i64 %add18, %M + br i1 %cmp2, label %for.body4, label %for.cond.cleanup3, !llvm.loop !19 +} + +define dso_local void @test_svaddx_constOffset(ptr noundef %base, %index) local_unnamed_addr #0 { +; CHECK-LABEL: test_svaddx_constOffset: +; CHECK: .LBB6_1: // %for.body +; CHECK: add x[[NEWBASE1:[0-9]+]], x0, #40 +; CHECK: ld1w { z{{[0-9]+}}.s }, p[[PG:[0-9]+]]/z, [x[[NEWBASE1]], z{{[0-9]+}}.s, uxtw #2] +; CHECK: add x[[NEWBASE2:[0-9]+]], x0, #44 +; CHECK: ld1w { z{{[0-9]+}}.s }, p[[PG]]/z, [x[[NEWBASE2]], z{{[0-9]+}}.s, uxtw #2] +entry: + %0 = tail call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret void + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next.1, %for.body ] + %index.addr.05 = phi [ %index, %entry ], [ %8, %for.body ] + %1 = tail call @llvm.aarch64.sve.add.u.nxv4i32( %0, %index.addr.05, shufflevector ( insertelement ( poison, i32 10, i64 0), poison, zeroinitializer)) + %2 = tail call @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv4i32( %0, ptr %base, %1) 
+ %3 = shl nuw nsw i64 %indvars.iv, 4 + %add.ptr = getelementptr inbounds i32, ptr %base, i64 %3 + store %2, ptr %add.ptr, align 16, !tbaa !20 + %4 = tail call @llvm.aarch64.sve.add.u.nxv4i32( %0, %index.addr.05, shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer)) + %5 = tail call @llvm.aarch64.sve.add.u.nxv4i32( %0, %4, shufflevector ( insertelement ( poison, i32 10, i64 0), poison, zeroinitializer)) + %6 = tail call @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv4i32( %0, ptr %base, %5) + %indvars.iv.next = shl i64 %indvars.iv, 4 + %7 = or i64 %indvars.iv.next, 16 + %add.ptr.1 = getelementptr inbounds i32, ptr %base, i64 %7 + store %6, ptr %add.ptr.1, align 16, !tbaa !20 + %8 = tail call @llvm.aarch64.sve.add.u.nxv4i32( %0, %4, shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer)) + %indvars.iv.next.1 = add nuw nsw i64 %indvars.iv, 2 + %exitcond.not.1 = icmp eq i64 %indvars.iv.next.1, 100 + br i1 %exitcond.not.1, label %for.cond.cleanup, label %for.body, !llvm.loop !22 +} + +define dso_local void @_Z26test_loop_invariant_offsetPlu11__SVInt64_tl(ptr noundef %base, %index, i64 noundef %invariant_offset) local_unnamed_addr #6 { +; CHECK-LABEL: _Z26test_loop_invariant_offsetPlu11__SVInt64_tl: +; CHECK: add x[[NEWBASE:[0-9]+]], x0, x1, lsl #3 +; CHECK: st1d { z{{[0-9]+}}.d }, p{{[0-9]+}}, [x[[NEWBASE]], z{{[0-9]+}}.d, lsl #3] +entry: + %0 = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %.splatinsert = insertelement poison, i64 %invariant_offset, i64 0 + %1 = shufflevector %.splatinsert, poison, zeroinitializer + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret void + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %index.addr.05 = phi [ %index, %entry ], [ %4, %for.body ] + %2 = tail call @llvm.aarch64.sve.add.u.nxv2i64( %0, %index.addr.05, %1) + %.splatinsert3 = insertelement poison, i64 %indvars.iv, i64 0 + %3 = shufflevector %.splatinsert3, poison, zeroinitializer + tail call void @llvm.aarch64.sve.st1.scatter.index.nxv2i64( %3, %0, ptr %base, %2) + %4 = tail call @llvm.aarch64.sve.add.u.nxv2i64( %0, %index.addr.05, shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer)) + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, 100 + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !23 +} + +define dso_local void @test_combined_const_and_invariant_offset(ptr noundef %base, %index, i32 noundef %invariant_offset) local_unnamed_addr #0 { +; CHECK-LABEL: test_combined_const_and_invariant_offset: +; CHECK: add x[[NEWBASE_GPR:[0-9]+]], x0, w1, sxtw #2 +; CHECK: add x[[NEWBASE_FINAL:[0-9]+]], x[[NEWBASE_GPR]], #40 +; CHECK: ld1w { z{{[0-9]+}}.s }, p[[PG:[0-9]+]]/z, [x[[NEWBASE_FINAL]], z{{[0-9]+}}.s, sxtw #2] +entry: + %0 = tail call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %.splatinsert = insertelement poison, i32 %invariant_offset, i64 0 + %.splat = shufflevector %.splatinsert, poison, zeroinitializer + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret void + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %index.addr.06 = phi [ %index, %entry ], [ %7, %for.body ] + %1 = select %0, %index.addr.06, zeroinitializer + %2 = tail call @llvm.aarch64.sve.add.u.nxv4i32( %0, %1, shufflevector ( insertelement ( poison, i32 10, i64 0), poison, zeroinitializer)) + %3 = select %0, %2, zeroinitializer + 
%4 = tail call @llvm.aarch64.sve.add.u.nxv4i32( %0, %3, %.splat) + %5 = tail call @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv4i32( %0, ptr %base, %4) + %6 = shl nuw nsw i64 %indvars.iv, 4 + %add.ptr = getelementptr inbounds i32, ptr %base, i64 %6 + store %5, ptr %add.ptr, align 16, !tbaa !20 + %7 = tail call @llvm.aarch64.sve.add.u.nxv4i32( %0, %index.addr.06, shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer)) + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, 100 + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !24 +} + +declare @llvm.aarch64.sve.whilelt.nxv4i1.i32(i32, i32) #1 +declare @llvm.aarch64.sve.index.nxv4i32(i32, i32) #1 +declare @llvm.aarch64.sve.mul.nxv4i32(, , ) #1 +declare @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv4f32(, ptr, ) #2 +declare @llvm.aarch64.sve.fsubr.nxv4f32(, , ) #1 +declare @llvm.aarch64.sve.fmul.nxv4f32(, , ) #1 +declare @llvm.aarch64.sve.add.nxv4i32(, , ) #1 +declare @llvm.aarch64.sve.fsub.nxv4f32(, , ) #1 +declare @llvm.aarch64.sve.fmad.nxv4f32(, , , ) #1 +declare @llvm.aarch64.sve.fsqrt.nxv4f32(, , ) #1 +declare float @llvm.aarch64.sve.faddv.nxv4f32(, ) #1 +declare void @llvm.aarch64.sve.st1.scatter.sxtw.index.nxv4f32(, , ptr, ) #3 +declare @llvm.aarch64.sve.ptrue.nxv4i1(i32 immarg) #1 +declare void @llvm.aarch64.sve.prfw.gather.sxtw.index.nxv4i32(, ptr nocapture, , i32 immarg) #5 +declare @llvm.aarch64.sve.fadd.nxv4f32(, , ) #1 +declare @llvm.aarch64.sve.fdiv.nxv4f32(, , ) #1 +declare @llvm.aarch64.sve.whilelo.nxv2i1.i64(i64, i64) #1 +declare @llvm.aarch64.sve.index.nxv2i64(i64, i64) #1 +declare @llvm.aarch64.sve.add.nxv2i64(, , ) #1 +declare @llvm.aarch64.sve.ld1.gather.index.nxv2f64(, ptr, ) #2 +declare @llvm.aarch64.sve.fadd.nxv2f64(, , ) #1 +declare @llvm.aarch64.sve.fdiv.nxv2f64(, , ) #1 +declare @llvm.aarch64.sve.add.u.nxv4i32(, , ) #1 +declare @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv4i32(, ptr, ) #2 +declare @llvm.aarch64.sve.ptrue.nxv2i1(i32 immarg) #1 +declare @llvm.aarch64.sve.add.u.nxv2i64(, , ) #1 +declare void @llvm.aarch64.sve.st1.scatter.index.nxv2i64(, , ptr, ) #3 +declare @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv4i32(, ptr, ) #2 +declare i64 @llvm.vscale.i64() #7 +declare i32 @llvm.vscale.i32() #7 +declare @llvm.masked.load.nxv4f32.p0(ptr nocapture, i32 immarg, , ) #8 +declare i1 @llvm.aarch64.sve.ptest.any.nxv4i1(, ) #7 +declare void @llvm.masked.store.nxv4f32.p0(, ptr nocapture, i32 immarg, ) #9 +declare void @llvm.masked.store.nxv2f64.p0(, ptr nocapture, i32 immarg, ) #9 + +attributes #0 = { mustprogress nofree nosync nounwind memory(argmem: readwrite) uwtable vscale_range(1,16) "frame-pointer"="non-leaf" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="hip09" "target-features"="+aes,+bf16,+crc,+dotprod,+f32mm,+f64mm,+fp-armv8,+fp16fml,+fullfp16,+i8mm,+lse,+neon,+ras,+rcpc,+rdm,+sha2,+sha3,+sm4,+spe,+sve,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8.5a,+v8a,-fmv" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) } +attributes #2 = { mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: read) } +attributes #3 = { mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: write) } +attributes #4 = { mustprogress nofree nosync nounwind memory(argmem: readwrite, inaccessiblemem: readwrite) uwtable vscale_range(1,16) "frame-pointer"="non-leaf" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="hip09" 
"target-features"="+aes,+bf16,+crc,+dotprod,+f32mm,+f64mm,+fp-armv8,+fp16fml,+fullfp16,+i8mm,+lse,+neon,+ras,+rcpc,+rdm,+sha2,+sha3,+sm4,+spe,+sve,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8.5a,+v8a,-fmv" } +attributes #5 = { mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: readwrite, inaccessiblemem: readwrite) } +attributes #6 = { mustprogress nofree nosync nounwind memory(argmem: write) uwtable vscale_range(1,16) "frame-pointer"="non-leaf" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="hip09" "target-features"="+aes,+bf16,+crc,+dotprod,+f32mm,+f64mm,+fp-armv8,+fp16fml,+fullfp16,+i8mm,+lse,+neon,+ras,+rcpc,+rdm,+sha2,+sha3,+sm4,+spe,+sve,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8.5a,+v8a,-fmv" } +attributes #7 = { nocallback nofree nosync nounwind willreturn memory(none) } +attributes #8 = { nocallback nofree nosync nounwind willreturn memory(argmem: read) } +attributes #9 = { nocallback nofree nosync nounwind willreturn memory(argmem: write) } + +!llvm.module.flags = !{!0, !1, !2, !3, !4} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 8, !"PIC Level", i32 2} +!2 = !{i32 7, !"PIE Level", i32 2} +!3 = !{i32 7, !"uwtable", i32 2} +!4 = !{i32 7, !"frame-pointer", i32 1} +!5 = !{!6, !6, i64 0} +!6 = !{!"float", !7, i64 0} +!7 = !{!"omnipotent char", !8, i64 0} +!8 = !{!"Simple C++ TBAA"} +!9 = distinct !{!9, !10} +!10 = !{!"llvm.loop.mustprogress"} +!11 = distinct !{!11, !10} +!12 = distinct !{!12, !10} +!13 = distinct !{!13, !10} +!14 = distinct !{!14, !10} +!15 = distinct !{!15, !10} +!16 = distinct !{!16, !10} +!17 = !{!18, !18, i64 0} +!18 = !{!"double", !7, i64 0} +!19 = distinct !{!19, !10} +!20 = !{!21, !21, i64 0} +!21 = !{!"int", !7, i64 0} +!22 = distinct !{!22, !10} +!23 = distinct !{!23, !10} +!24 = distinct !{!24, !10} -- Gitee From e0fa4ddfe0f886497f90811a6ec595ed2651e27b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E5=93=B2=E6=B5=A9?= <2209576006@qq.com> Date: Sun, 28 Sep 2025 23:53:43 +0800 Subject: [PATCH 2/2] [AArch64]: Add SVE loop addressing optimizations 1. Simplifies SVE gather/scatter address svadd chains in loops by rewriting them to be hoistable by MachineLICM. 2. Simplifies SVE svindex+svmul patterns in loops via strength reduction. 
--- .../Target/AArch64/AArch64MIPeepholeOpt.cpp | 926 ++++++++++++++++++ llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp | 388 -------- llvm/test/CodeGen/AArch64/O3-pipeline.ll | 1 - .../aarch64-sve-addressing-peephole.ll | 16 +- .../AArch64/aarch64-sve-index-mul-simplify.ll | 48 + 5 files changed, 983 insertions(+), 396 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/aarch64-sve-index-mul-simplify.ll diff --git a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp index 87aa3b98d938..683e491f5724 100644 --- a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp +++ b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp @@ -65,7 +65,9 @@ #include "AArch64ExpandImm.h" #include "AArch64InstrInfo.h" +#include "AArch64Subtarget.h" #include "MCTargetDesc/AArch64AddressingModes.h" +#include "llvm/ADT/SetVector.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineLoopInfo.h" @@ -73,6 +75,15 @@ using namespace llvm; #define DEBUG_TYPE "aarch64-mi-peephole-opt" +static cl::opt EnableSVELoopAddressChainOpt( + "aarch64-sve-loop-address-chain-opt", cl::init(false), cl::Hidden, + cl::desc( + "Enable simplification of SVE address computation chains in loops")); + +static cl::opt EnableSVEIndexMultiplyOpt( + "aarch64-sve-simplify-index-multiply", cl::init(false), cl::Hidden, + cl::desc("Enable simplification of SVE svindex+svmul patterns in loops")); + namespace { struct AArch64MIPeepholeOpt : public MachineFunctionPass { @@ -94,6 +105,12 @@ struct AArch64MIPeepholeOpt : public MachineFunctionPass { using BuildMIFunc = std::function; + using InstAndOffset = std::pair; + using ChainKey = std::tuple; + using ChainMap = DenseMap>; + using ConstOffsetKey = std::pair; + // Define an enum for the SVE offset type. 
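+  // SXTW/UXTW: 32-bit vector offsets sign-/zero-extended to 64 bits;
+  // D64: native 64-bit vector offsets; NOT_APPLICABLE: not a gather/scatter.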
+ enum class SVEOffsetType { NOT_APPLICABLE, SXTW, UXTW, D64 }; /// For instructions where an immediate operand could be split into two /// separate immediate instructions, use the splitTwoPartImm two handle the @@ -127,6 +144,29 @@ struct AArch64MIPeepholeOpt : public MachineFunctionPass { bool visitINSERT(MachineInstr &MI); bool visitINSviGPR(MachineInstr &MI, unsigned Opc); bool visitINSvi64lane(MachineInstr &MI); + bool isLoopInvariant(Register Reg, MachineLoop *L); + bool isConstantVector(Register Reg, int64_t &Value); + bool isInvariantBroadcastGPR(Register VecReg, MachineLoop *L, Register &GPR); + unsigned getElementSizeInBytes(const MachineInstr &MI, + SVEOffsetType *OffsetKind); + void traceIndexChain(Register IndexReg, Register &RootIndex, + int64_t &AccumulatedOffset, Register &InvariantGPROffset, + MachineLoop *L, + SmallVectorImpl &ChainsInsts); + void collectOptimizationCandidates(MachineLoop *L, ChainMap &Chains, + SetVector &CandidateDeadInsts); + Register getGPRBase( + MachineBasicBlock *MBB, Register BaseReg, Register InvariantGPROffset, + MachineInstr &UseMI, + DenseMap &BlockToGPRBaseMap); + Register getFinalBase(MachineInstr &MI, int64_t ElemOffset, + Register BaseForConst, + DenseMap &FinalBaseMap); + bool rewriteConstantAddressComputations(MachineLoop *L, + const ChainMap &Chains); + bool cleanupDeadSVECode(SetVector &CandidateDeadInsts); + bool simplifySVEIndexMultiply(MachineLoop *L); + bool processSVELoopAddressing(MachineLoop *L); bool runOnMachineFunction(MachineFunction &MF) override; StringRef getPassName() const override { @@ -670,6 +710,873 @@ bool AArch64MIPeepholeOpt::visitINSvi64lane(MachineInstr &MI) { return true; } +// Check if Reg is a loop invariant to Loop L +bool AArch64MIPeepholeOpt::isLoopInvariant(Register Reg, MachineLoop *L) { + if (!Reg.isVirtual()) + return false; + MachineInstr *Def = MRI->getVRegDef(Reg); + if (!Def) + return true; + return !L->contains(Def->getParent()); +} + +// Check if a vector register represents a constant value +// and retrieve that constant value if it exists +bool AArch64MIPeepholeOpt::isConstantVector(Register Reg, int64_t &Value) { + if (!Reg.isVirtual()) + return false; + + MachineInstr *Def = MRI->getVRegDef(Reg); + if (!Def) + return false; + + // Match the DUP instruction pattern: %Def = DUP_ZI_S Imm, 0 + // This instruction broadcasts the immediate value to all vector elements + unsigned DupOp = Def->getOpcode(); + if (DupOp == AArch64::DUP_ZI_S || DupOp == AArch64::DUP_ZI_D) { + Value = Def->getOperand(1).getImm(); + return true; + } + return false; +} + +// Checks if a vector register is broadcasted from a loop-invariant GPR +// Matches instruction pattern: %VecReg = DUP_ZR_S/D %GPR +// Where %GPR is loop-invariant to loop L +bool AArch64MIPeepholeOpt::isInvariantBroadcastGPR(Register VecReg, + MachineLoop *L, + Register &GPR) { + if (!VecReg.isVirtual()) + return false; + + MachineInstr *Def = MRI->getVRegDef(VecReg); + if (!Def) + return false; + + unsigned DupOp = Def->getOpcode(); + if (DupOp == AArch64::DUP_ZR_S || DupOp == AArch64::DUP_ZR_D) { + Register SrcGPR = Def->getOperand(1).getReg(); + if (isLoopInvariant(SrcGPR, L)) { + GPR = SrcGPR; + return true; + } + } + return false; +} + +// Returns element size in bytes for gather/scatter instructions +// Returns 0 for non-gather/scatter instructions +unsigned +AArch64MIPeepholeOpt::getElementSizeInBytes(const MachineInstr &MI, + SVEOffsetType *OffsetKind) { + switch (MI.getOpcode()) { + // --- Element Size: 2 Bytes (Half-Word) --- + case 
AArch64::GLD1H_D_SCALED: + case AArch64::GLD1SH_D_SCALED: + case AArch64::GLDFF1H_D_SCALED: + case AArch64::GLDFF1SH_D_SCALED: + case AArch64::LDNT1H_ZZR_D_REAL: + case AArch64::LDNT1SH_ZZR_D_REAL: + case AArch64::SST1H_D_SCALED: + case AArch64::STNT1H_ZZR_D_REAL: + *OffsetKind = SVEOffsetType::D64; + return 2; + case AArch64::GLD1H_S_SXTW_SCALED: + case AArch64::GLD1SH_S_SXTW_SCALED: + case AArch64::GLDFF1H_S_SXTW_SCALED: + case AArch64::GLDFF1SH_S_SXTW_SCALED: + case AArch64::SST1H_S_SXTW_SCALED: + *OffsetKind = SVEOffsetType::SXTW; + return 2; + case AArch64::GLD1H_S_UXTW_SCALED: + case AArch64::GLD1SH_S_UXTW_SCALED: + case AArch64::GLDFF1H_S_UXTW_SCALED: + case AArch64::GLDFF1SH_S_UXTW_SCALED: + case AArch64::SST1H_S_UXTW_SCALED: + *OffsetKind = SVEOffsetType::UXTW; + return 2; + + // --- Element Size: 4 Bytes (Word) --- + case AArch64::GLD1SW_D_SCALED: + case AArch64::GLD1W_D_SCALED: + case AArch64::GLDFF1SW_D_SCALED: + case AArch64::GLDFF1W_D_SCALED: + case AArch64::LDNT1SW_ZZR_D_REAL: + case AArch64::LDNT1W_ZZR_D_REAL: + case AArch64::SST1W_D_SCALED: + case AArch64::STNT1W_ZZR_D_REAL: + *OffsetKind = SVEOffsetType::D64; + return 4; + case AArch64::GLD1W_SXTW_SCALED: + case AArch64::GLDFF1W_SXTW_SCALED: + case AArch64::PRFW_S_SXTW_SCALED: + case AArch64::SST1W_SXTW_SCALED: + *OffsetKind = SVEOffsetType::SXTW; + return 4; + case AArch64::GLD1W_UXTW_SCALED: + case AArch64::GLDFF1W_UXTW_SCALED: + case AArch64::PRFW_S_UXTW_SCALED: + case AArch64::SST1W_UXTW_SCALED: + *OffsetKind = SVEOffsetType::UXTW; + return 4; + + // --- Element Size: 8 Bytes (Double-Word) --- + case AArch64::GLD1D_SCALED: + case AArch64::GLDFF1D_SCALED: + case AArch64::LDNT1D_ZZR_D_REAL: + case AArch64::PRFW_D_SCALED: + case AArch64::SST1D_SCALED: + case AArch64::STNT1D_ZZR_D_REAL: + *OffsetKind = SVEOffsetType::D64; + return 8; + case AArch64::GLD1D_SXTW_SCALED: + case AArch64::SST1D_SXTW_SCALED: + *OffsetKind = SVEOffsetType::SXTW; + return 8; + case AArch64::GLD1D_UXTW_SCALED: + case AArch64::SST1D_UXTW_SCALED: + *OffsetKind = SVEOffsetType::UXTW; + return 8; + default: + StringRef InstName = TII->getName(MI.getOpcode()); + if (InstName.startswith("GLD") || InstName.startswith("SST") || + InstName.startswith("LDNT") || InstName.startswith("STNT") || + InstName.startswith("PRFW")) { + LLVM_DEBUG(dbgs() << "SVELoopAddressHoisting: Unhandled SVE " + "gather/scatter-like instruction found: " + << MI); + } + + *OffsetKind = SVEOffsetType::NOT_APPLICABLE; + return 0; + } +} + +// Traces index chain to discover: +// - Root index register +// - Accumulated constant offset +// - Loop-invariant GPR offset component +// - And collects the chain instructions for potential deletion +void AArch64MIPeepholeOpt::traceIndexChain( + Register IndexReg, Register &RootIndex, int64_t &AccumulatedOffset, + Register &InvariantGPROffset, MachineLoop *L, + SmallVectorImpl &ChainInsts) { + AccumulatedOffset = 0; + InvariantGPROffset = Register(0); + Register CurrentReg = IndexReg; + + while (true) { + if (!CurrentReg.isVirtual()) + break; + + MachineInstr *Def = MRI->getVRegDef(CurrentReg); + // Index must be defined within loop as induction variable + if (!Def || !L->contains(Def->getParent())) + break; + + // Match svadd index increment pattern: + // %index = ADD_ZI_[S/D] %prev_index, %offset, %pg + // %index = ADD_ZZZ_D %prev_index, %offset + // %index = ADD_ZPZZ_[S/D]_ZERO %pg, %prev_index, %offset + unsigned IndexOp = Def->getOpcode(); + if (IndexOp == AArch64::ADD_ZI_S || IndexOp == AArch64::ADD_ZI_D) { + int64_t ConstValue = 
Def->getOperand(2).getImm(); + AccumulatedOffset += ConstValue; + CurrentReg = Def->getOperand(1).getReg(); + ChainInsts.push_back(Def); + continue; + } + + Register Op1, Op2; + if (IndexOp == AArch64::ADD_ZZZ_S || IndexOp == AArch64::ADD_ZZZ_D) { + Op1 = Def->getOperand(1).getReg(); + Op2 = Def->getOperand(2).getReg(); + } else if (IndexOp == AArch64::ADD_ZPZZ_S_ZERO || + IndexOp == AArch64::ADD_ZPZZ_D_ZERO || + IndexOp == AArch64::ADD_ZPmZ_S || + IndexOp == AArch64::ADD_ZPmZ_D) { + Op1 = Def->getOperand(2).getReg(); + Op2 = Def->getOperand(3).getReg(); + } else { + break; + } + + int64_t ConstValue; + Register InvariantGPR; + + // Op2 case 1: Constant vector offset + if (isConstantVector(Op2, ConstValue)) { + AccumulatedOffset += ConstValue; + CurrentReg = Op1; + ChainInsts.push_back(MRI->getVRegDef(Op2)); + ChainInsts.push_back(Def); + continue; + } + + // Op2 case 2: Loop-invariant GPR broadcast offset + if (isInvariantBroadcastGPR(Op2, L, InvariantGPR)) { + if (InvariantGPROffset != 0) { + LLVM_DEBUG( + dbgs() << "Found multiple GPR invariants, aborting trace.\n"); + break; + } + InvariantGPROffset = InvariantGPR; + CurrentReg = Op1; + ChainInsts.push_back(MRI->getVRegDef(Op2)); + ChainInsts.push_back(Def); + continue; + } + + // Op1 case 1: Constant vector offset + if (isConstantVector(Op1, ConstValue)) { + AccumulatedOffset += ConstValue; + CurrentReg = Op2; + ChainInsts.push_back(MRI->getVRegDef(Op1)); + ChainInsts.push_back(Def); + continue; + } + + // Op1 case 2: Loop-invariant GPR broadcast offset + if (isInvariantBroadcastGPR(Op1, L, InvariantGPR)) { + if (InvariantGPROffset != 0) { + LLVM_DEBUG( + dbgs() << "Found multiple GPR invariants, aborting trace.\n"); + break; + } + InvariantGPROffset = InvariantGPR; + CurrentReg = Op2; + ChainInsts.push_back(MRI->getVRegDef(Op1)); + ChainInsts.push_back(Def); + continue; + } + break; + } + + RootIndex = CurrentReg; +} + +// Collects all optimizable gather/scatter instructions +// and groups them into chains. +void AArch64MIPeepholeOpt::collectOptimizationCandidates( + MachineLoop *L, ChainMap &Chains, + SetVector &CandidateDeadInsts) { + for (MachineBasicBlock *MBB : L->getBlocks()) { + for (MachineInstr &MI : *MBB) { + SVEOffsetType OffsetType; + unsigned ElementSize = getElementSizeInBytes(MI, &OffsetType); + if (ElementSize == 0) + continue; + + // Verify instruction format: + // Gather: DstZPR, PredicatePPR, BaseGPR, IndexZPR + // Scatter: SrcZPR, PredicatePPR, BaseGPR, IndexZPR + if (MI.getNumOperands() < 4) + continue; + + Register BaseReg = MI.getOperand(2).getReg(); + Register IndexReg = MI.getOperand(3).getReg(); + // Only optimize loop-invariant base addresses + if (!isLoopInvariant(BaseReg, L)) + continue; + + Register RootIndex, InvariantGPROffset; + int64_t ElemOffset; + SmallVector TmpChainInsts; // Store chain for this MI + + // Trace index computation chain + traceIndexChain(IndexReg, RootIndex, ElemOffset, InvariantGPROffset, L, + TmpChainInsts); + + // If the chain is empty, there's nothing to optimize or delete. 
+ if (TmpChainInsts.empty() && InvariantGPROffset == 0 && ElemOffset == 0) + continue; + + LLVM_DEBUG(dbgs() << "Found candidate instruction: "; MI.dump(); + dbgs() << " BaseReg: " << printReg(BaseReg) + << ", IndexReg: " << printReg(IndexReg) + << " -> RootIndex: " << printReg(RootIndex) + << ", ElemOffset: " << ElemOffset + << ", InvariantGPROffset: " + << printReg(InvariantGPROffset) << "\n"); + + Chains[{BaseReg, RootIndex, InvariantGPROffset}].push_back( + {&MI, ElemOffset}); + + // Add the identified chain instructions to the master set of candidates. + CandidateDeadInsts.insert(TmpChainInsts.begin(), TmpChainInsts.end()); + } + } +} + +// Get or create a shared base register for the (Base + GPR) calculation +// It ensures the calculation is only generated once per block +// +// @param BaseReg The original, loop-invariant base register +// @param InvariantGPROffset The loop-invariant GPR used as an offset +// @param UseMI The memory instruction that will ultimately use this base +// @param BlockToGPRBaseMap The cache mapping a block to its computed GPR-base +// @return A register holding the result of `BaseReg + (InvariantGPROffset << scale)`. +// If no GPR offset exists, it returns the original `BaseReg` +Register AArch64MIPeepholeOpt::getGPRBase( + MachineBasicBlock *MBB, Register BaseReg, Register InvariantGPROffset, + MachineInstr &UseMI, + DenseMap &BlockToGPRBaseMap) { + // If we've already computed the GPR base for this block, return it + if (BlockToGPRBaseMap.count(MBB)) { + return BlockToGPRBaseMap[MBB]; + } + + // If there's no GPR offset, the base is simply the original BaseReg + if (InvariantGPROffset == 0) { + BlockToGPRBaseMap[MBB] = BaseReg; + return BaseReg; + } + + // This is the first time for this block, so we generate the ADD instruction + // Insert the calculation at the beginning of the block + DebugLoc DL = UseMI.getDebugLoc(); + SVEOffsetType OffsetType; + unsigned ElementSize = getElementSizeInBytes(UseMI, &OffsetType); + unsigned ShiftAmt = Log2_64(ElementSize); + unsigned AddOp, ShiftExtender; + + const TargetRegisterClass *RC = MRI->getRegClass(InvariantGPROffset); + if (AArch64::GPR32RegClass.hasSubClassEq(RC)) { + ShiftExtender = + (OffsetType == SVEOffsetType::SXTW) + ? AArch64_AM::getArithExtendImm(AArch64_AM::SXTW, ShiftAmt) + : AArch64_AM::getArithExtendImm(AArch64_AM::UXTW, ShiftAmt); + AddOp = AArch64::ADDXrx; + } else { + ShiftExtender = AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftAmt); + AddOp = AArch64::ADDXrs; + } + + Register GPROffsetBaseReg = + MRI->createVirtualRegister(&AArch64::GPR64RegClass); + BuildMI(*MBB, MBB->getFirstNonPHI(), DL, TII->get(AddOp), GPROffsetBaseReg) + .addReg(BaseReg) + .addReg(InvariantGPROffset) + .addImm(ShiftExtender); + + LLVM_DEBUG(dbgs() << " In BB:" << MBB->getName() + << ", created shared GPR base: " + << printReg(GPROffsetBaseReg) << "\n"); + + // Cache and return the new base + BlockToGPRBaseMap[MBB] = GPROffsetBaseReg; + return GPROffsetBaseReg; +} + +// Get or create the final base register (Base + GPR + Const) +// +// @param MI The memory instruction that will use this final base +// @param ElemOffset The constant element offset extracted from the address chain. +// @param BaseForConst The base register to add the constant offset to. This is +// typically the result from `getGPRBase`. +// @param FinalBaseMap The cache mapping a `{Block, Offset}` key to BaseForConst 's final base +// @return A register holding the result of `BaseForConst + (ElemOffset * scale)`. 
+// If `ElemOffset` is zero, it returns `BaseForConst` directly +Register AArch64MIPeepholeOpt::getFinalBase( + MachineInstr &MI, int64_t ElemOffset, Register BaseForConst, + DenseMap &FinalBaseMap) { + + MachineBasicBlock *MBB = MI.getParent(); + ConstOffsetKey Key = {MBB, ElemOffset}; + + // If we've already computed the final base for this key, return it + if (FinalBaseMap.count(Key)) { + return FinalBaseMap.lookup(Key); + } + + // If there's no constant offset, the final base is the one passed in + if (ElemOffset == 0) { + FinalBaseMap[Key] = BaseForConst; + return BaseForConst; + } + + // This is the first time for this key, generate the ADD instruction + DebugLoc DL = MI.getDebugLoc(); + SVEOffsetType OffsetType; + unsigned ElementSize = getElementSizeInBytes(MI, &OffsetType); + int64_t ByteOffset = ElemOffset * ElementSize; + + Register ConstOffsetBase = + MRI->createVirtualRegister(&AArch64::GPR64RegClass); + BuildMI(*MBB, MI.getIterator(), DL, TII->get(AArch64::ADDXri), + ConstOffsetBase) + .addReg(BaseForConst) + .addImm(ByteOffset) + .addImm(0); + + LLVM_DEBUG(dbgs() << " Created new base for ElemOffset " << ElemOffset + << " (ByteOffset " << ByteOffset << ") into " + << printReg(ConstOffsetBase) << "\n"); + + // Cache and return the new final base. + FinalBaseMap[Key] = ConstOffsetBase; + return ConstOffsetBase; +} + +bool AArch64MIPeepholeOpt::rewriteConstantAddressComputations( + MachineLoop *L, const ChainMap &Chains) { + bool Changed = false; + + for (auto &ChainInfo : Chains) { + auto &Addressings = ChainInfo.second; + // Skip chains without optimizable offsets + if (Addressings.size() < 2 && std::get<2>(ChainInfo.first) == 0 && + Addressings[0].second == 0) + continue; + + Register BaseReg = std::get<0>(ChainInfo.first); + Register RootIndex = std::get<1>(ChainInfo.first); + Register InvariantGPROffset = std::get<2>(ChainInfo.first); + + LLVM_DEBUG(dbgs() << "Optimizing chain with BaseReg: " << printReg(BaseReg) + << ", RootIndex: " << printReg(RootIndex) + << ", InvariantGPROffset: " + << printReg(InvariantGPROffset) << "\n"); + + // Maps a Basic Block to the register holding the (Base + GPR) calculation + // for that block + DenseMap BlockToGPRBaseMap; + + // Maps a (Basic Block, Const Offset) pair to the final base register + // We need the BB in the key to ensure correctness across different blocks + DenseMap ConstOffsetToFinalBaseMap; + + for (auto &AddressInfo : Addressings) { + MachineInstr *MI = AddressInfo.first; + MachineBasicBlock *MBB = MI->getParent(); + int64_t ElemOffset = AddressInfo.second; + + // Get the shared (Base + GPR) base for this instruction's block + Register BaseForConsts = getGPRBase( + MBB, BaseReg, InvariantGPROffset, *MI, BlockToGPRBaseMap); + + // Get the final base, creating the const offset ADD if needed + Register FinalBaseReg = getFinalBase( + *MI, ElemOffset, BaseForConsts, ConstOffsetToFinalBaseMap); + + // Rewrite the memory instruction + MI->getOperand(2).setReg(FinalBaseReg); + MI->getOperand(3).setReg(RootIndex); + + LLVM_DEBUG(dbgs() << " Rewrote instruction: "; MI->dump()); + Changed = true; + } + } + return Changed; +} + +bool AArch64MIPeepholeOpt::cleanupDeadSVECode( + SetVector &CandidateDeadInsts) { + if (CandidateDeadInsts.empty()) + return false; + + bool Changed = false; + LLVM_DEBUG(dbgs() << "--- Cleaning up dead instructions ---\n"); + for (MachineInstr *MI : llvm::reverse(CandidateDeadInsts)) { + bool IsDead = true; + for (const MachineOperand &MO : MI->operands()) { + if (MO.isReg() && MO.isDef() && 
MO.getReg().isVirtual()) { + if (!MRI->use_empty(MO.getReg())) { + IsDead = false; + break; + } + } + } + + if (!IsDead) + continue; + + LLVM_DEBUG(dbgs() << "Deleting dead instruction: "; MI->dump()); + MI->eraseFromParent(); + Changed = true; + } + return Changed; +} + +// Processes a single machine loop to find and rewrite optimizable +// SVE address computation chains for gather/scatter-like instructions. +// +// The core idea is to identify cases where the vector index used by a memory +// instruction is calculated by adding a loop-invariant offset to a base index +// (the root induction variable). Such computations are redundant within the +// loop. This optimization rewrites the address calculation to make the +// invariant part easily hoistable by MachineLICM. +// +// Specifically, it targets the following pattern example: +// +// =============================== BEFORE =============================== +// // In a loop, a complex index `z_idx` is computed before being used. +// // The offset can be a constant, a loop-invariant GPR, or both. +// +// ... +// dup z_offset, invariant_gpr +// add z_idx, z_root_idx, z_offset +// gather z_data, pg, [x_base, z_idx] +// ... +// +// =============================== AFTER ================================ +// // The pass sinks the invariant address calculation to just before the use, +// // exposing it to MachineLICM. The original `add` chain is replaced. +// // MachineLICM will then decide whether hoisting is profitable. +// +// ... +// // --- Instructions created by this pass, to be hoisted by MachineLICM --- +// add x_new_base, x_base, invariant_gpr +// gather z_data, pg, [x_new_base, z_root_idx] +// ... +// +bool AArch64MIPeepholeOpt::processSVELoopAddressing(MachineLoop *L) { + MachineBasicBlock *Preheader = L->getLoopPreheader(); + if (!Preheader) + return false; + + bool Changed = false; + LLVM_DEBUG(dbgs() << "********** Processing Loop in Function: " + << L->getHeader()->getParent()->getName() + << " (Loop Header: " << L->getHeader()->getName() + << ") **********\n"); + + // Collect all candidate instructions and their addressing chains + ChainMap Chains; + SetVector CandidateDeadInsts; + collectOptimizationCandidates(L, Chains, CandidateDeadInsts); + + if (Chains.empty()) + return false; + + // Rewrite the instructions in the loop + Changed |= rewriteConstantAddressComputations(L, Chains); + + // Clean up the original, now-dead, address computation instructions + if (Changed) + Changed |= cleanupDeadSVECode(CandidateDeadInsts); + return Changed; +} + +// This optimization identifies a common pattern where a vector of indices, +// generated from a loop induction variable, is immediately multiplied by a +// constant. This is a computationally expensive operation inside a loop. +// The pass transforms this pattern by replacing the expensive vector multiply +// with a cheaper vector add. It achieves this by creating a new induction +// variable system and a pre-computed offset vector. +// +// =============================== BEFORE =============================== +// The pass targets a MUL instruction whose operands form a specific chain. +// In C SVE intrinsics, this typically looks like: +// +// for (int jp = 0; jp < limit; jp += svcntw()) { +// svbool_t pg = svwhilelt_b32(jp, limit); +// // 1. Index vector is based on induction variable `jp`. +// svuint32_t indices = svindex_u32(jp, IndexStep); +// // 2. Result is a vector multiply by a constant `Multiplier`.
+// svuint32_t final_indices = svmul_z(pg, indices, Multiplier); +// } +// +// This corresponds to the following MachineIR pattern: +// `z_result = MUL (SEL(pg, z_indices, 0), DUP(Multiplier))` +// where `z_indices` is defined by an `INDEX` instruction using a PHI-defined +// induction variable. +// +// =============================== AFTER ================================ +// The transformation is based on the distributive property: +// `(jp + k*IndexStep) * Multiplier = (jp*Multiplier) + +// k*(IndexStep*Multiplier)` +// +// The pass creates a new scalar induction variable `base_iv` to track the +// `(jp * Multiplier)` term, and a constant vector `offset_vec` to represent +// the `k * (IndexStep * Multiplier)` term. The expensive multiply in the loop +// is replaced by a simple vector add. +// +// // --- In Preheader --- +// uint32_t new_iv_step = svcntw() * Multiplier; +// uint32_t new_iv_init = 0 * Multiplier; +// svuint32_t offset_vec = svindex_u32(0, IndexStep * Multiplier); +// +// // --- In Loop --- +// uint32_t base_iv = new_iv_init; // (PHI node) +// for (int jp = 0; jp < limit; jp += svcntw()) { +// // ... +// svuint32_t base_vec = svdup_u32(base_iv); +// svuint32_t final_indices = svadd_z(pg, base_vec, offset_vec); +// // ... +// base_iv += new_iv_step; +// } +// +bool AArch64MIPeepholeOpt::simplifySVEIndexMultiply(MachineLoop *L) { + MachineBasicBlock *Preheader = L->getLoopPreheader(); + if (!Preheader) + return false; + + MachineBasicBlock *Header = L->getHeader(); + MachineBasicBlock *Latch = L->getLoopLatch(); + if (!Header || !Latch) + return false; + + for (MachineBasicBlock *MBB : L->getBlocks()) { + for (MachineInstr &MI : *MBB) { + // --- Start of the SVE MUL_Z Pattern Match --- + if (MI.getOpcode() != AArch64::MUL_ZPmZ_S && + MI.getOpcode() != AArch64::MUL_ZPmZ_D) { + continue; + } + + LLVM_DEBUG(dbgs() << "Found candidate MUL: "; MI.dump()); + + // Set up the specific AArch64 opcodes based on whether we have a 32-bit + // or 64-bit operation. + bool is64Bit = (MI.getOpcode() == AArch64::MUL_ZPmZ_D); + unsigned SelOpc = is64Bit ? AArch64::SEL_ZPZZ_D : AArch64::SEL_ZPZZ_S; + unsigned IndexRiOpc = is64Bit ? AArch64::INDEX_RI_D : AArch64::INDEX_RI_S; + unsigned AddZzzOpc = is64Bit ? AArch64::ADD_ZZZ_D : AArch64::ADD_ZZZ_S; + unsigned DupZrOpc = is64Bit ? AArch64::DUP_ZR_D : AArch64::DUP_ZR_S; + unsigned IndexIiOpc = is64Bit ? AArch64::INDEX_II_D : AArch64::INDEX_II_S; + unsigned DupZiOpc = is64Bit ? AArch64::DUP_ZI_D : AArch64::DUP_ZI_S; + unsigned AddGprOpc = is64Bit ? AArch64::ADDXrr : AArch64::ADDWrr; + unsigned CntOpc = is64Bit ? AArch64::CNTD_XPiI : AArch64::CNTW_XPiI; + unsigned MaddGprOpc = is64Bit ? AArch64::MADDXrrr : AArch64::MADDWrrr; + unsigned MovImmOpc = is64Bit ? AArch64::MOVi64imm : AArch64::MOVi32imm; + unsigned ZeroReg = is64Bit ? AArch64::XZR : AArch64::WZR; + + const TargetRegisterClass *GprRegClass = + is64Bit ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass; + const TargetRegisterClass *GprAllRegClass = + is64Bit ? &AArch64::GPR64allRegClass : &AArch64::GPR32allRegClass; + const TargetRegisterClass *ZprRegClass = &AArch64::ZPRRegClass; + + // Deconstruct the multiply instruction to see if it matches our target + // pattern. The matched pattern is: MUL(SEL(Pred, INDEX(IV, IdxStep), + // Zero), DUP(Multiplier)) + MachineInstr *SelMI = MRI->getVRegDef(MI.getOperand(2).getReg()); + if (!SelMI || (SelMI->getOpcode() != SelOpc)) + continue; + + // The second operand of the select should be an index operation. 
+ MachineInstr *IndexMI = MRI->getVRegDef(SelMI->getOperand(2).getReg()); + if (!IndexMI) + continue; + + Register IVReg; + int64_t IndexStep; + // Detect the two index generated ways + if (IndexMI->getOpcode() == IndexRiOpc) { + // Case 1: INDEX_RI (reg, imm) + IVReg = IndexMI->getOperand(1).getReg(); + if (!IVReg.isVirtual()) + continue; + IndexStep = IndexMI->getOperand(2).getImm(); + } else if (IndexMI->getOpcode() == AddZzzOpc) { + // Case 2: ADD(INDEX_II(0, imm), DUP(reg)) + MachineInstr *Op1 = MRI->getVRegDef(IndexMI->getOperand(1).getReg()); + MachineInstr *Op2 = MRI->getVRegDef(IndexMI->getOperand(2).getReg()); + if (!Op1 || !Op2) + continue; + + auto matchIndexAddPattern = [&](MachineInstr *A, MachineInstr *B) { + return (A->getOpcode() == IndexIiOpc && B->getOpcode() == DupZrOpc && + A->getOperand(1).getImm() == 0); + }; + + if (matchIndexAddPattern(Op1, Op2)) { + IndexStep = Op1->getOperand(2).getImm(); + IVReg = Op2->getOperand(1).getReg(); + } else if (matchIndexAddPattern(Op2, Op1)) { + IndexStep = Op2->getOperand(2).getImm(); + IVReg = Op1->getOperand(1).getReg(); + } else { + continue; + } + } else { + continue; + } + + // The third operand of the multiply should be a duplicated immediate + // value. + MachineInstr *MultiplierMI = MRI->getVRegDef(MI.getOperand(3).getReg()); + if (!MultiplierMI || !isLoopInvariant(MI.getOperand(3).getReg(), L) || + (MultiplierMI->getOpcode() != DupZiOpc)) + continue; + int64_t MultiplierVal = MultiplierMI->getOperand(1).getImm(); + + // Check if the identified register is a basic loop induction variable. + MachineInstr *IVPhi = MRI->getVRegDef(IVReg); + if (!IVPhi || !IVPhi->isPHI() || IVPhi->getParent() != Header) + continue; + + // Find the instruction that updates the induction variable (usually an + // ADD in the latch). + Register IVInitReg = Register(0), IVNextReg = Register(0); + for (unsigned i = 1; i < IVPhi->getNumOperands(); i += 2) { + if (IVPhi->getOperand(i + 1).getMBB() == Preheader) { + IVInitReg = IVPhi->getOperand(i).getReg(); + break; + } + } + for (unsigned i = 1; i < IVPhi->getNumOperands(); i += 2) { + if (IVPhi->getOperand(i + 1).getMBB() == Latch) { + IVNextReg = IVPhi->getOperand(i).getReg(); + break; + } + } + if (!IVInitReg || !IVNextReg) + continue; + + // Get the definition of the next value of the induction variable. + MachineInstr *IVUpdateMI = MRI->getVRegDef(IVNextReg); + if (!IVUpdateMI) + continue; + if (IVUpdateMI->getOpcode() == AArch64::COPY) + IVUpdateMI = MRI->getVRegDef(IVUpdateMI->getOperand(1).getReg()); + if (IVUpdateMI->getOpcode() != AddGprOpc) + continue; + + // Determine the step of the induction variable. + Register IVStepReg; + if (IVUpdateMI->getOperand(1).getReg() == IVReg) + IVStepReg = IVUpdateMI->getOperand(2).getReg(); + else if (IVUpdateMI->getOperand(2).getReg() == IVReg) + IVStepReg = IVUpdateMI->getOperand(1).getReg(); + else + continue; + + LLVM_DEBUG( + dbgs() << "Sve Mul Strength reduction pattern matched for MUL: "; + MI.dump();); + + // --- Start of the Transformation --- + auto PreheaderInsertPt = Preheader->getFirstTerminator(); + DebugLoc DL = MI.getDebugLoc(); + + // In the preheader, create a new offset = index(0, IndexStep * + // MultiplierVal) + Register OffsetVecReg = MRI->createVirtualRegister(ZprRegClass); + BuildMI(*MBB, MI.getIterator(), DL, TII->get(IndexIiOpc), OffsetVecReg) + .addImm(0) + .addImm(IndexStep * MultiplierVal) + .addReg(AArch64::VG, RegState::Implicit); + + // In the preheader, calculate the new step value for our new induction + // variable. 
This is: NewStep = IVStep * MultiplierVal + MachineInstr *IVStepDef = MRI->getVRegDef(IVStepReg); + if (IVStepDef->getOpcode() == AArch64::COPY) + IVStepDef = MRI->getVRegDef(IVStepDef->getOperand(1).getReg()); + + // Check if the original IV step is the vector length (vl). + bool isStepVL = + IVStepDef && IVStepDef->getOpcode() == CntOpc && + IVStepDef->getOperand(1).getImm() == 31 && // Pattern for 'all' + IVStepDef->getOperand(2).getImm() == 1; // Multiplier of 1 + Register NewStepReg = MRI->createVirtualRegister(GprRegClass); + + // If the step is 'vl' and the multiplier is small, we can use a more + // efficient 'cnt' instruction. + if (isStepVL && MultiplierVal <= 15) { + LLVM_DEBUG(dbgs() << "IV Step is vl, using CNT[W/D] for new step.\n"); + // For 64-bit elements CNTD already produces a 64-bit GPR, so define + // NewStepReg directly; for 32-bit elements compute into a 64-bit + // register and copy the low 32 bits into NewStepReg so that NewStepReg + // always has a definition. + if (is64Bit) { + BuildMI(*Preheader, PreheaderInsertPt, DL, TII->get(CntOpc), + NewStepReg) + .addImm(31) // Pattern 'all' for vl + .addImm(MultiplierVal) + .addReg(AArch64::VG, RegState::Implicit); + } else { + Register NewStep64Reg = + MRI->createVirtualRegister(&AArch64::GPR64RegClass); + BuildMI(*Preheader, PreheaderInsertPt, DL, TII->get(CntOpc), + NewStep64Reg) + .addImm(31) // Pattern 'all' for vl + .addImm(MultiplierVal) + .addReg(AArch64::VG, RegState::Implicit); + BuildMI(*Preheader, PreheaderInsertPt, DL, TII->get(AArch64::COPY), + NewStepReg) + .addReg(NewStep64Reg, 0, AArch64::sub_32); + } + } else { + // Otherwise, we use a general multiplication. + LLVM_DEBUG( + dbgs() << "IV Step is not vl, using generic MUL for new step.\n"); + Register MultReg = MRI->createVirtualRegister(GprRegClass); + BuildMI(*Preheader, PreheaderInsertPt, DL, TII->get(MovImmOpc), MultReg) + .addImm(MultiplierVal); + BuildMI(*Preheader, PreheaderInsertPt, DL, TII->get(MaddGprOpc), + NewStepReg) + .addReg(IVStepReg) + .addReg(MultReg) + .addReg(ZeroReg); + } + + // In the preheader, calculate the initial value for the new base IV. + // BaseIVInit = IVInit * MultiplierVal + Register BaseIVInitReg = MRI->createVirtualRegister(GprAllRegClass); + Register MultRegForInit = MRI->createVirtualRegister(GprRegClass); + BuildMI(*Preheader, PreheaderInsertPt, DL, TII->get(MovImmOpc), + MultRegForInit) + .addImm(MultiplierVal); + BuildMI(*Preheader, PreheaderInsertPt, DL, TII->get(MaddGprOpc), + BaseIVInitReg) + .addReg(IVInitReg) + .addReg(MultRegForInit) + .addReg(ZeroReg); + + // Create a new PHI node in the header + // for our new base induction variable. + Register BaseIVReg = MRI->createVirtualRegister(GprAllRegClass); + Register NextBaseIVReg = MRI->createVirtualRegister(GprAllRegClass); + auto BaseIVPhi = BuildMI(*Header, Header->getFirstNonPHI(), DL, + TII->get(AArch64::PHI), BaseIVReg); + BaseIVPhi.addReg(BaseIVInitReg).addMBB(Preheader); + + // In the loop latch, update our new base induction variable + // by adding the new step + BuildMI(*Latch, Latch->getFirstTerminator(), DL, TII->get(AddGprOpc), + NextBaseIVReg) + .addReg(BaseIVReg) + .addReg(NewStepReg); + + BaseIVPhi.addReg(NextBaseIVReg).addMBB(Latch); + + // Now, replace the original multiply operation in the loop body + // with a new add operation + auto BodyInsertPt = MI.getIterator(); + + // Broadcast the new base IV into a vector register.
+ Register BaseVecReg = MRI->createVirtualRegister(ZprRegClass); + BuildMI(*MI.getParent(), BodyInsertPt, DL, TII->get(DupZrOpc), BaseVecReg) + .addReg(BaseIVReg); + + // Perform the vector addition: NewResult = OffsetVector + BaseVector + Register AddTmpReg = MRI->createVirtualRegister(ZprRegClass); + BuildMI(*MI.getParent(), BodyInsertPt, DL, TII->get(AddZzzOpc), AddTmpReg) + .addReg(OffsetVecReg) + .addReg(BaseVecReg); + + // Replace all uses of the original multiplication result + // with our new addition result + MRI->replaceRegWith(MI.getOperand(0).getReg(), AddTmpReg); + + // Clean up the now-dead instructions from the old calculation + MI.eraseFromParent(); + if (MRI->use_empty(SelMI->getOperand(0).getReg())) + SelMI->eraseFromParent(); + if (MRI->use_empty(IndexMI->getOperand(0).getReg())) { + if (IndexMI->getOpcode() == AddZzzOpc) { + MachineInstr *Op1 = MRI->getVRegDef(IndexMI->getOperand(1).getReg()); + MachineInstr *Op2 = MRI->getVRegDef(IndexMI->getOperand(2).getReg()); + if (MRI->use_empty(Op1->getOperand(0).getReg())) + Op1->eraseFromParent(); + if (MRI->use_empty(Op2->getOperand(0).getReg())) + Op2->eraseFromParent(); + } + IndexMI->eraseFromParent(); + } + if (MRI->use_empty(MultiplierMI->getOperand(0).getReg())) + MultiplierMI->eraseFromParent(); + + LLVM_DEBUG(dbgs() << "Successfully applied strength reduction.\n"); + + return true; + } + } + return false; +} + bool AArch64MIPeepholeOpt::runOnMachineFunction(MachineFunction &MF) { if (skipFunction(MF.getFunction())) return false; @@ -752,6 +1659,25 @@ bool AArch64MIPeepholeOpt::runOnMachineFunction(MachineFunction &MF) { } } + if (EnableSVELoopAddressChainOpt && + MF.getSubtarget().hasSVE()) { + for (MachineLoop *L : *MLI) { + for (MachineLoop *SubL : *L) { + Changed |= processSVELoopAddressing(SubL); + } + Changed |= processSVELoopAddressing(L); + } + } + + if (EnableSVEIndexMultiplyOpt && + MF.getSubtarget().hasSVE()) { + for (MachineLoop *L : *MLI) { + for (MachineLoop *SubL : *L) { + Changed |= simplifySVEIndexMultiply(SubL); + } + Changed |= simplifySVEIndexMultiply(L); + } + } return Changed; } diff --git a/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp b/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp index 87939363122c..c5a6cb7af405 100644 --- a/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp +++ b/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp @@ -24,7 +24,6 @@ #include "Utils/AArch64BaseInfo.h" #include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/SetVector.h" -#include "llvm/Analysis/LoopInfo.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/IRBuilder.h" @@ -42,28 +41,9 @@ using namespace llvm::PatternMatch; #define DEBUG_TYPE "aarch64-sve-intrinsic-opts" -static cl::opt EnableSVELoopAddressChainOpt( - "aarch64-sve-loop-address-chain-opt", cl::init(false), cl::Hidden, - cl::desc("Enable simplification of SVE address computation chains in loops")); - namespace { struct SVEIntrinsicOpts : public ModulePass { static char ID; // Pass identification, replacement for typeid - - enum class SVEIndexExtension { SIGN, ZERO, NONE }; // NONE for i64 indices - - struct SVEMemoryOpInfo { - unsigned BaseOpIdx; - unsigned IndexOpIdx; - Type *ElemTy; // The type of the data element being loaded/stored. - SVEIndexExtension ExtKind; - }; - - // The key is {OriginalBasePointer, Index} - using InvariantBaseKey = std::pair; - // The cache maps this key to the computed GEP. 
- using InvariantBaseCache = DenseMap; - SVEIntrinsicOpts() : ModulePass(ID) { initializeSVEIntrinsicOptsPass(*PassRegistry::getPassRegistry()); } @@ -80,16 +60,6 @@ private: bool optimizeInstructions(SmallSetVector &Functions); - std::optional getSVEMemoryOpInfo(const IntrinsicInst *II); - Value *getLoopInvariantSplatValue(Value *V, Loop *L); - Value *getHoistedBaseForIndex(Value *Index, Value *OriBase, Loop *L, - InvariantBaseCache &Cache, - SVEMemoryOpInfo &OpInfo); - bool simplifySVEAddressComputation(IntrinsicInst *II, Loop *L, - InvariantBaseCache &Cache, - SVEMemoryOpInfo &OpInfo); - bool runSVEAddressHoisting(Function &F, LoopInfo &LI); - /// Operates at the function-scope. I.e., optimizations are applied local to /// the functions themselves. bool optimizeFunctions(SmallSetVector &Functions); @@ -98,7 +68,6 @@ private: void SVEIntrinsicOpts::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired(); - AU.addRequired(); AU.setPreservesCFG(); } @@ -106,7 +75,6 @@ char SVEIntrinsicOpts::ID = 0; static const char *name = "SVE intrinsics optimizations"; INITIALIZE_PASS_BEGIN(SVEIntrinsicOpts, DEBUG_TYPE, name, false, false) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass); -INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass); INITIALIZE_PASS_END(SVEIntrinsicOpts, DEBUG_TYPE, name, false, false) ModulePass *llvm::createSVEIntrinsicOptsPass() { @@ -460,332 +428,6 @@ bool SVEIntrinsicOpts::optimizeInstructions( return Changed; } -/// Checks if an intrinsic is an SVE gather/scatter memory operation that this -/// optimization can analyze. Return the operand information (Base index, Index -/// index, Element Type, and Extension Kind) if supported -std::optional -SVEIntrinsicOpts::getSVEMemoryOpInfo(const IntrinsicInst *II) { - switch (II->getIntrinsicID()) { - // Gather Loads - case Intrinsic::aarch64_sve_ld1_gather_sxtw_index: - case Intrinsic::aarch64_sve_ldff1_gather_sxtw_index: - return {{1, 2, - dyn_cast(II->getType())->getElementType(), - SVEIndexExtension::SIGN}}; // Base=1, Index=2, Ext=SIGN - case Intrinsic::aarch64_sve_ld1_gather_uxtw_index: - case Intrinsic::aarch64_sve_ldff1_gather_uxtw_index: - return {{1, 2, - dyn_cast(II->getType())->getElementType(), - SVEIndexExtension::ZERO}}; // Base=1, Index=2, Ext=ZERO - case Intrinsic::aarch64_sve_ld1_gather_index: - case Intrinsic::aarch64_sve_ldff1_gather_index: - case Intrinsic::aarch64_sve_ldnt1_gather_index: - return {{1, 2, - dyn_cast(II->getType())->getElementType(), - SVEIndexExtension::NONE}}; // Base=1, Index=2, Ext=NONE - - // Prefetches (have no return value, element type is based on name) - case Intrinsic::aarch64_sve_prfd_gather_sxtw_index: - return {{1, 2, Type::getInt64Ty(II->getParent()->getContext()), - SVEIndexExtension::SIGN}}; - case Intrinsic::aarch64_sve_prfh_gather_sxtw_index: - return {{1, 2, Type::getInt16Ty(II->getParent()->getContext()), - SVEIndexExtension::SIGN}}; - case Intrinsic::aarch64_sve_prfw_gather_sxtw_index: - return {{1, 2, Type::getInt32Ty(II->getParent()->getContext()), - SVEIndexExtension::SIGN}}; - case Intrinsic::aarch64_sve_prfd_gather_uxtw_index: - return {{1, 2, Type::getInt64Ty(II->getParent()->getContext()), - SVEIndexExtension::ZERO}}; - case Intrinsic::aarch64_sve_prfh_gather_uxtw_index: - return {{1, 2, Type::getInt16Ty(II->getParent()->getContext()), - SVEIndexExtension::ZERO}}; - case Intrinsic::aarch64_sve_prfw_gather_uxtw_index: - return {{1, 2, Type::getInt32Ty(II->getParent()->getContext()), - SVEIndexExtension::ZERO}}; - case Intrinsic::aarch64_sve_prfd_gather_index: - 
return {{1, 2, Type::getInt64Ty(II->getParent()->getContext()), - SVEIndexExtension::NONE}}; - case Intrinsic::aarch64_sve_prfh_gather_index: - return {{1, 2, Type::getInt16Ty(II->getParent()->getContext()), - SVEIndexExtension::NONE}}; - case Intrinsic::aarch64_sve_prfw_gather_index: - return {{1, 2, Type::getInt32Ty(II->getParent()->getContext()), - SVEIndexExtension::NONE}}; - - // Scatter Stores (data is operand 0, element type is derived from it) - case Intrinsic::aarch64_sve_st1_scatter_sxtw_index: - return {{2, 3, - dyn_cast(II->getOperand(0)->getType()) - ->getElementType(), - SVEIndexExtension::SIGN}}; // Base=2, Index=3, Ext=SIGN - case Intrinsic::aarch64_sve_st1_scatter_uxtw_index: - return {{2, 3, - dyn_cast(II->getOperand(0)->getType()) - ->getElementType(), - SVEIndexExtension::ZERO}}; // Base=2, Index=3, Ext=ZERO - case Intrinsic::aarch64_sve_st1_scatter_index: - case Intrinsic::aarch64_sve_stnt1_scatter_index: - return {{2, 3, - dyn_cast(II->getOperand(0)->getType()) - ->getElementType(), - SVEIndexExtension::NONE}}; // Base=2, Index=3, Ext=NONE - - default: - return std::nullopt; - } -} - -/// Check if a Value is a splat of a loop-invariant scalar, which is a -/// shufflevector of an insertelement at index 0. If the pattern matches, return -/// the loop scalar value. -Value *SVEIntrinsicOpts::getLoopInvariantSplatValue(Value *V, Loop *L) { - Value *InvariantScalar = nullptr; - Value *InsertElementVal = nullptr; - - if (auto *SV = dyn_cast(V)) { - InsertElementVal = SV->getOperand(0); - } else if (auto *SVC = dyn_cast(V)) { - if (SVC->getOpcode() == Instruction::ShuffleVector) { - InsertElementVal = SVC->getOperand(0); - } - } - - if (!InsertElementVal) - return nullptr; - - // Check if InsertElementVal is an insertelement and get the scalar. - if (auto *IE = dyn_cast(InsertElementVal)) { - if (match(IE->getOperand(2), - m_Zero())) { // Ensure it's inserting at index 0 - InvariantScalar = IE->getOperand(1); - } - } else if (auto *IEC = dyn_cast(InsertElementVal)) { - if (IEC->getOpcode() == Instruction::InsertElement && - match(IEC->getOperand(2), m_Zero())) { - InvariantScalar = IEC->getOperand(1); - } - } - - if (!InvariantScalar || !L->isLoopInvariant(InvariantScalar)) - return nullptr; - - return InvariantScalar; -} - -/// Analyzes an index calculation chain and generates hoistable GEPs. -/// @param Index The starting index Value (from the sve memory op) -/// @param OrigBase The original base pointer from the sve memory op -/// @param L The loop context -/// @param Cache A map to memoize results for `{OrigBase, Index} : NewBase` -/// @param OpInfo Information about the sve memory op -/// @return The final, rewritten base pointer for the memory op -Value *SVEIntrinsicOpts::getHoistedBaseForIndex(Value *Index, Value *OrigBase, - Loop *L, - InvariantBaseCache &Cache, - SVEMemoryOpInfo &OpInfo) { - InvariantBaseKey InitialKey = {OrigBase, Index}; - - // If this entire chain has been processed before, return the final result. - if (Cache.count(InitialKey)) - return Cache.lookup(InitialKey); - - // --- Trace the ADD chain up to the root, collecting nodes --- - SmallVector IndexChain; - Value *CurrentIndex = Index; - Value *RootIndex = nullptr; - - // The `while` loop traces the `sve.add` chain upwards from the `Index` used - // by the memory op, collecting all intermediate indices onto a stack - // (`IndexChain`). The trace stops when it hits a value that is not an `(index - // + invariant)` add, which becomes the `RootIndex`. 
- while (true) { - IndexChain.push_back(CurrentIndex); - InvariantBaseKey CurrentKey = {OrigBase, CurrentIndex}; - - // If a subchain in the chain is already solved, stop tracing - if (Cache.count(CurrentKey)) { - RootIndex = CurrentIndex; - break; - } - - auto *Add = dyn_cast(CurrentIndex); - // Stop if not a recognized sve.add intrinsic or not defined in the loop - if (!Add || - (Add->getIntrinsicID() != Intrinsic::aarch64_sve_add && - Add->getIntrinsicID() != Intrinsic::aarch64_sve_add_u) || - !L->contains(Add)) { - RootIndex = CurrentIndex; - break; - } - - Value *Op1 = Add->getOperand(1); - Value *Op2 = Add->getOperand(2); - - // Check if one of the operands is an invariant splat - if (getLoopInvariantSplatValue(Op1, L)) { - CurrentIndex = Op2; - if (match(Op2, m_Select(m_Value(), m_Value(), m_Zero()))) - CurrentIndex = dyn_cast(Op2)->getOperand(1); - } else if (getLoopInvariantSplatValue(Op2, L)) { - CurrentIndex = Op1; - if (match(Op1, m_Select(m_Value(), m_Value(), m_Zero()))) - CurrentIndex = dyn_cast(Op1)->getOperand(1); - } else { - RootIndex = CurrentIndex; // Not an (index + invariant) form. - break; - } - } - - // --- Build GEPs back down the chain --- - // The base for the root index is always the original base pointer - Value *CurrentHoistedBase = Cache.lookup({OrigBase, RootIndex}); - if (!CurrentHoistedBase) { - CurrentHoistedBase = OrigBase; - Cache[{OrigBase, RootIndex}] = OrigBase; - } - - // Iterates down the collected chain (in reverse). For each node, it computes - // the new hoisted base by creating a GEP on top of the base of the previous - // node in the chain. - for (Value *IdxNode : reverse(IndexChain)) { - if (IdxNode == RootIndex) - continue; - - InvariantBaseKey CurrentKey = {OrigBase, IdxNode}; - auto *Add = dyn_cast(IdxNode); - Value *Op1 = Add->getOperand(1); - Value *Op2 = Add->getOperand(2); - - Value *InvariantScalar = getLoopInvariantSplatValue(Op1, L); - if (!InvariantScalar) - InvariantScalar = getLoopInvariantSplatValue(Op2, L); - assert(InvariantScalar); - - IRBuilder<> Builder(Add); - Value *GEPIndex = InvariantScalar; - - // Ensure the invariant has the correct integer type for GEP - switch (OpInfo.ExtKind) { - case SVEIndexExtension::SIGN: - GEPIndex = Builder.CreateSExt( - GEPIndex, Type::getInt64Ty(Add->getParent()->getContext()), - "invariant.idx.sext"); - break; - case SVEIndexExtension::ZERO: - GEPIndex = Builder.CreateZExt( - GEPIndex, Type::getInt64Ty(Add->getParent()->getContext()), - "invariant.idx.zext"); - break; - case SVEIndexExtension::NONE: - break; - } - - Value *NewBase = Builder.CreateGEP(OpInfo.ElemTy, CurrentHoistedBase, - GEPIndex, "add.ptr"); - // Cache the result for this node and update the base for the next iteration - Cache[CurrentKey] = NewBase; - CurrentHoistedBase = NewBase; - } - - return Cache.lookup(InitialKey); -} - -/// Get the final rewritten base and root index, and rewrite the memory -/// intrinsic -bool SVEIntrinsicOpts::simplifySVEAddressComputation( - IntrinsicInst *MemIntrinsic, Loop *L, InvariantBaseCache &Cache, - SVEMemoryOpInfo &OpInfo) { - Value *OrigBase = MemIntrinsic->getArgOperand(OpInfo.BaseOpIdx); - Value *OrigIndex = MemIntrinsic->getArgOperand(OpInfo.IndexOpIdx); - - // If the base itself is not loop invariant, skip simplification - if (!L->isLoopInvariant(OrigBase)) - return false; - - // The actual index might be hidden behind a `select(pg, index, zero)` - // Peel this away to get to the core index calculation - Value *IndexToTrace = OrigIndex; - if (match(IndexToTrace, m_Select(m_Value(), 
m_Value(), m_Zero()))) { - IndexToTrace = dyn_cast(IndexToTrace)->getOperand(1); - } - // This call populates the cache for the entire chain and returns the final - // base - Value *NewBase = - getHoistedBaseForIndex(IndexToTrace, OrigBase, L, Cache, OpInfo); - - // If the base pointer hasn't changed, nothing was optimized. - if (NewBase == OrigBase) - return false; - - // Now that the cache is populated, trace up from the starting index to find - // the root. - Value *RootIndex = IndexToTrace; - while (true) { - InvariantBaseKey CurrentKey = {OrigBase, RootIndex}; - // The root is the node that maps back to the original base in the cache. - if (Cache.count(CurrentKey) && Cache.lookup(CurrentKey) == OrigBase) { - break; - } - - auto *Add = dyn_cast(RootIndex); - Value *NextIndex = nullptr; - if (getLoopInvariantSplatValue(Add->getOperand(1), L)) { - NextIndex = Add->getOperand(2); - } else if (getLoopInvariantSplatValue(Add->getOperand(2), L)) { - NextIndex = Add->getOperand(1); - } else { - break; // Reached a non-optimizable ADD, this is the root - } - - if (match(NextIndex, m_Select(m_Value(), m_Value(), m_Zero()))) { - RootIndex = dyn_cast(NextIndex)->getOperand(1); - } else { - RootIndex = NextIndex; - } - } - - LLVM_DEBUG(dbgs() << "SVE_ADDR_HOIST_IR: Rewriting " << *MemIntrinsic - << "\n"); - - MemIntrinsic->setArgOperand(OpInfo.BaseOpIdx, NewBase); - MemIntrinsic->setArgOperand(OpInfo.IndexOpIdx, RootIndex); - - // Cleanup would be complex. Rely on DCE for now. - - LLVM_DEBUG(dbgs() << "SVE_ADDR_HOIST_IR: To -> " << *MemIntrinsic << "\n"); - - return true; -} - -/// iterates through all basic blocks in a function. For each block -/// that is part of a loop, it creates a fresh cache and then iterates through -/// its instructions in program order, attempting to simplify any SVE memory -/// operations it finds. -bool SVEIntrinsicOpts::runSVEAddressHoisting(Function &F, LoopInfo &LI) { - bool Changed = false; - for (auto &BB : F) { - // We only care about blocks that are inside a loop. - Loop *L = LI.getLoopFor(&BB); - if (!L) - continue; - - // A fresh cache is used for each basic block to ensure correctness. - // Maps {OriginalBasePointer, Index} to the new computed GEP. - InvariantBaseCache Cache; - - // Iterate through instructions in program order (important!) 
- for (auto &I : BB) { - if (auto *II = dyn_cast(&I)) { - if (auto OpInfo = getSVEMemoryOpInfo(II)) - Changed |= simplifySVEAddressComputation(II, L, Cache, *OpInfo); - } - } - } - - return Changed; -} - bool SVEIntrinsicOpts::optimizeFunctions( SmallSetVector &Functions) { bool Changed = false; @@ -793,16 +435,6 @@ bool SVEIntrinsicOpts::optimizeFunctions( Changed |= optimizePTrueIntrinsicCalls(Functions); Changed |= optimizeInstructions(Functions); - if (EnableSVELoopAddressChainOpt) { - for (Function *F : Functions) { - if (F->isDeclaration()) - continue; - - LoopInfo &LI = getAnalysis(*F).getLoopInfo(); - Changed |= runSVEAddressHoisting(*F, LI); - } - } - return Changed; } @@ -821,26 +453,6 @@ bool SVEIntrinsicOpts::runOnModule(Module &M) { case Intrinsic::vector_extract: case Intrinsic::vector_insert: case Intrinsic::aarch64_sve_ptrue: - case Intrinsic::aarch64_sve_ld1_gather_sxtw_index: - case Intrinsic::aarch64_sve_ld1_gather_uxtw_index: - case Intrinsic::aarch64_sve_ld1_gather_index: - case Intrinsic::aarch64_sve_ldff1_gather_sxtw_index: - case Intrinsic::aarch64_sve_ldff1_gather_uxtw_index: - case Intrinsic::aarch64_sve_ldff1_gather_index: - case Intrinsic::aarch64_sve_ldnt1_gather_index: - case Intrinsic::aarch64_sve_prfd_gather_sxtw_index: - case Intrinsic::aarch64_sve_prfd_gather_uxtw_index: - case Intrinsic::aarch64_sve_prfd_gather_index: - case Intrinsic::aarch64_sve_prfh_gather_sxtw_index: - case Intrinsic::aarch64_sve_prfh_gather_uxtw_index: - case Intrinsic::aarch64_sve_prfh_gather_index: - case Intrinsic::aarch64_sve_prfw_gather_sxtw_index: - case Intrinsic::aarch64_sve_prfw_gather_uxtw_index: - case Intrinsic::aarch64_sve_prfw_gather_index: - case Intrinsic::aarch64_sve_st1_scatter_sxtw_index: - case Intrinsic::aarch64_sve_st1_scatter_uxtw_index: - case Intrinsic::aarch64_sve_st1_scatter_index: - case Intrinsic::aarch64_sve_stnt1_scatter_index: for (User *U : F.users()) Functions.insert(cast(U)->getFunction()); break; diff --git a/llvm/test/CodeGen/AArch64/O3-pipeline.ll b/llvm/test/CodeGen/AArch64/O3-pipeline.ll index 4ee13788c2c4..3747b2581fa4 100644 --- a/llvm/test/CodeGen/AArch64/O3-pipeline.ll +++ b/llvm/test/CodeGen/AArch64/O3-pipeline.ll @@ -25,7 +25,6 @@ ; CHECK-NEXT: SVE intrinsics optimizations ; CHECK-NEXT: FunctionPass Manager ; CHECK-NEXT: Dominator Tree Construction -; CHECK-NEXT: Natural Loop Information ; CHECK-NEXT: FunctionPass Manager ; CHECK-NEXT: Simplify the CFG ; CHECK-NEXT: Dominator Tree Construction diff --git a/llvm/test/CodeGen/AArch64/aarch64-sve-addressing-peephole.ll b/llvm/test/CodeGen/AArch64/aarch64-sve-addressing-peephole.ll index 85d286c165ca..579cc889a8a8 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-sve-addressing-peephole.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-sve-addressing-peephole.ll @@ -210,11 +210,11 @@ for.body: ; preds = %for.body.lr.ph, %fo define dso_local void @test_invariantOffset32bit(i32 noundef %N, i32 noundef %M, ptr noundef %matrix, ptr nocapture noundef %result) local_unnamed_addr #0 { ; CHECK-LABEL: test_invariantOffset32bit: ; CHECK: .LBB4_5: // %for.body4 +; CHECK: add x[[NEWBASE3:[0-9]+]], x2, w{{[0-9]+}}, sxtw #2 ; CHECK: add x[[NEWBASE1:[0-9]+]], x2, w{{[0-9]+}}, sxtw #2 -; CHECK: ld1w { z{{[0-9]+}}.s }, p[[PG:[0-9]+]]/z, [x[[NEWBASE1]], z{{[0-9]+}}.s, sxtw #2] ; CHECK: add x[[NEWBASE2:[0-9]+]], x2, w{{[0-9]+}}, sxtw #2 +; CHECK: ld1w { z{{[0-9]+}}.s }, p[[PG:[0-9]+]]/z, [x[[NEWBASE1]], z{{[0-9]+}}.s, sxtw #2] ; CHECK: ld1w { z{{[0-9]+}}.s }, p[[PG]]/z, [x[[NEWBASE2]], z{{[0-9]+}}.s, sxtw #2] -; CHECK: 
add x[[NEWBASE3:[0-9]+]], x2, w{{[0-9]+}}, sxtw #2 ; CHECK: ld1w { z{{[0-9]+}}.s }, p[[PG]]/z, [x[[NEWBASE3]], z{{[0-9]+}}.s, sxtw #2] entry: %cmp41 = icmp sgt i32 %N, 2 @@ -289,11 +289,11 @@ for.body4: ; preds = %for.body4.lr.ph, %f define dso_local void @test_invariantOffset64bit(i64 noundef %N, i64 noundef %M, ptr noundef %matrix, ptr nocapture noundef %result) local_unnamed_addr #0 { ; CHECK-LABEL: test_invariantOffset64bit: ; CHECK: .LBB5_6: // %for.body4 +; CHECK: add x[[NEWBASE2:[0-9]+]], x2, x{{[0-9]+}}, lsl #3 ; CHECK: add x[[NEWBASE1:[0-9]+]], x2, x{{[0-9]+}}, lsl #3 +; CHECK: add x[[NEWBASE3:[0-9]+]], x2, x{{[0-9]+}}, lsl #3 ; CHECK: ld1d { z{{[0-9]+}}.d }, p[[PG:[0-9]+]]/z, [x[[NEWBASE1]], z{{[0-9]+}}.d, lsl #3] -; CHECK: add x[[NEWBASE2:[0-9]+]], x2, x{{[0-9]+}}, lsl #3 ; CHECK: ld1d { z{{[0-9]+}}.d }, p[[PG]]/z, [x[[NEWBASE2]], z{{[0-9]+}}.d, lsl #3] -; CHECK: add x[[NEWBASE3:[0-9]+]], x2, x{{[0-9]+}}, lsl #3 ; CHECK: ld1d { z{{[0-9]+}}.d }, p[[PG]]/z, [x[[NEWBASE3]], z{{[0-9]+}}.d, lsl #3] entry: %cmp39.not = icmp ult i64 %N, 3 @@ -396,6 +396,7 @@ for.body: ; preds = %for.body, %entry define dso_local void @_Z26test_loop_invariant_offsetPlu11__SVInt64_tl(ptr noundef %base, %index, i64 noundef %invariant_offset) local_unnamed_addr #6 { ; CHECK-LABEL: _Z26test_loop_invariant_offsetPlu11__SVInt64_tl: +; CHECK: .LBB7_1: // %for.body ; CHECK: add x[[NEWBASE:[0-9]+]], x0, x1, lsl #3 ; CHECK: st1d { z{{[0-9]+}}.d }, p{{[0-9]+}}, [x[[NEWBASE]], z{{[0-9]+}}.d, lsl #3] entry: @@ -422,8 +423,9 @@ for.body: ; preds = %entry, %for.body define dso_local void @test_combined_const_and_invariant_offset(ptr noundef %base, %index, i32 noundef %invariant_offset) local_unnamed_addr #0 { ; CHECK-LABEL: test_combined_const_and_invariant_offset: -; CHECK: add x[[NEWBASE_GPR:[0-9]+]], x0, w1, sxtw #2 -; CHECK: add x[[NEWBASE_FINAL:[0-9]+]], x[[NEWBASE_GPR]], #40 +; CHECK: .LBB8_1: // %for.body +; CHECK-DAG: add x[[NEWBASE_GPR:[0-9]+]], x0, w1, sxtw #2 +; CHECK-DAG: add x[[NEWBASE_FINAL:[0-9]+]], x[[NEWBASE_GPR]], #40 ; CHECK: ld1w { z{{[0-9]+}}.s }, p[[PG:[0-9]+]]/z, [x[[NEWBASE_FINAL]], z{{[0-9]+}}.s, sxtw #2] entry: %0 = tail call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) @@ -523,4 +525,4 @@ attributes #9 = { nocallback nofree nosync nounwind willreturn memory(argmem: wr !21 = !{!"int", !7, i64 0} !22 = distinct !{!22, !10} !23 = distinct !{!23, !10} -!24 = distinct !{!24, !10} +!24 = distinct !{!24, !10} \ No newline at end of file diff --git a/llvm/test/CodeGen/AArch64/aarch64-sve-index-mul-simplify.ll b/llvm/test/CodeGen/AArch64/aarch64-sve-index-mul-simplify.ll new file mode 100644 index 000000000000..1c8550625404 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/aarch64-sve-index-mul-simplify.ll @@ -0,0 +1,48 @@ +; RUN: llc -mtriple=aarch64-unknown -mcpu=hip09 -aarch64-sve-simplify-index-multiply -O3 -o - %s | FileCheck %s + +define dso_local void @index_mul_simplify(i32 %loopTime, ptr %x, %val) { +; CHECK-LABEL: index_mul_simplify: +; CHECK: // %bb.1: // %for.body.lr.ph +; CHECK-DAG: mov w[[MULTIPLIER:[0-9]+]], #3 +; CHECK-DAG: index z[[OFFSET_VEC:[0-9]+]].s, #0, #3 +; CHECK-DAG: cntw x[[IV_STEP:[0-9]+]] +; CHECK-DAG: cntw x[[NEW_IV_STEP:[0-9]+]], all, mul #3 +; CHECK-DAG: mul w[[NEW_IV_INIT:[0-9]+]], wzr, w[[MULTIPLIER]] + +; CHECK: .LBB0_2: // %for.body +; CHECK: mov z[[BASE_VEC:[0-9]+]].s, w[[NEW_IV_CUR:[0-9]+]] +; CHECK: whilelt p[[PG:[0-9]+]].s, w{{[0-9]+}}, w{{[0-9]+}} +; CHECK: add w[[NEW_IV_CUR]], w[[NEW_IV_CUR]], w[[NEW_IV_STEP]] +; CHECK: add z[[FINAL_INDICES:[0-9]+]].s, 
z[[OFFSET_VEC]].s, z[[BASE_VEC]].s +; CHECK-NOT: mul +; CHECK: st1w { z0.s }, p[[PG]], [x1, z[[FINAL_INDICES]].s, sxtw #2] + +entry: + %cmp7 = icmp sgt i32 %loopTime, 0 + br i1 %cmp7, label %for.body.lr.ph, label %for.cond.cleanup + +for.body.lr.ph: ; preds = %entry + %.tr = tail call i32 @llvm.vscale.i32() + %0 = shl nuw nsw i32 %.tr, 2 + br label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %for.body.lr.ph, %for.body + %jp.08 = phi i32 [ 0, %for.body.lr.ph ], [ %conv1, %for.body ] + %1 = tail call @llvm.aarch64.sve.whilelt.nxv4i1.i32(i32 %jp.08, i32 %loopTime) + %2 = tail call @llvm.aarch64.sve.index.nxv4i32(i32 %jp.08, i32 1) + %3 = select %1, %2, zeroinitializer + %4 = tail call @llvm.aarch64.sve.mul.nxv4i32( %1, %3, shufflevector ( insertelement ( poison, i32 3, i64 0), poison, zeroinitializer)) + tail call void @llvm.aarch64.sve.st1.scatter.sxtw.index.nxv4f32( %val, %1, ptr %x, %4) + %conv1 = add i32 %0, %jp.08 + %cmp = icmp slt i32 %conv1, %loopTime + br i1 %cmp, label %for.body, label %for.cond.cleanup +} + +declare @llvm.aarch64.sve.whilelt.nxv4i1.i32(i32, i32) +declare @llvm.aarch64.sve.index.nxv4i32(i32, i32) +declare @llvm.aarch64.sve.mul.nxv4i32(, , ) +declare void @llvm.aarch64.sve.st1.scatter.sxtw.index.nxv4f32(, , ptr, ) +declare i32 @llvm.vscale.i32() \ No newline at end of file -- Gitee
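
For context, a minimal C SVE-intrinsics sketch of the source-level loop shape that processSVELoopAddressing targets. This is illustrative only; the function and parameter names below are invented and are not part of the patch, and actual codegen may differ.

#include <arm_sve.h>

// Inside the loop the gather index is "root index + loop-invariant offset",
// so the vector add (and the dup of inv_off) is re-materialised every
// iteration. The peephole folds the invariant part into the scalar base
// (base + inv_off, scaled by the element size), keeps the root index on the
// gather, and leaves the new scalar add for MachineLICM to hoist.
double sum_with_offset(const double *base, int64_t inv_off, int64_t n) {
  svfloat64_t acc = svdup_f64(0.0);
  for (int64_t i = 0; i < n; i += svcntd()) {
    svbool_t pg = svwhilelt_b64(i, n);
    svint64_t root_idx = svindex_s64(i, 1);
    // Loop-variant index computation the pass wants to remove:
    svint64_t idx = svadd_z(pg, root_idx, svdup_s64(inv_off));
    // Emitted roughly as ld1d { z.d }, pg/z, [x_base, z_idx.d, lsl #3];
    // after the rewrite it is effectively
    //   ld1d { z.d }, pg/z, [x_base + inv_off*8, z_root_idx.d, lsl #3].
    svfloat64_t v = svld1_gather_index(pg, base, idx);
    acc = svadd_m(pg, acc, v);
  }
  return svaddv(svptrue_b64(), acc);
}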