diff --git a/.gitignore b/.gitignore
index b33fbbf932379..29a57901174ad 100644
--- a/.gitignore
+++ b/.gitignore
@@ -68,3 +68,8 @@ pythonenv*
 /clang/utils/analyzer/projects/*/RefScanBuildResults
 # automodapi puts generated documentation files here.
 /lldb/docs/python_api/
+
+
+# exclude installation
+build-llvm/*
+install/*
diff --git a/README.md b/README.md
index 793180dc80b7b..be75cdf8c8569 100644
--- a/README.md
+++ b/README.md
@@ -4,7 +4,7 @@ LLVM 12 with extensions for processors and computer systems of the [PULP platfor
 - [HERO][hero]: mixed-data-model (64-bit + 32-bit) compilation and data sharing; automatic tiling of data structures and insertion of DMA transfers;
 - MemPool: Instruction scheduling model for the MemPool architecture; `Xmempool` extension to allow dynamic instruction tracing;
 - [PULPv2 RISC-V ISA extension (`Xpulpv2`)][hero]: automatic insertion of hardware loops, post-increment memory accesses, and multiply-accumulates; intrinsics, `clang` builtins, and assembly support for all instructions of the extension;
-- [Snitch RISC-V ISA extensions (`Xssr`, `Xfrep`, and `Xdma`)][snitch]: automatic insertion of `frep` hardware loops; intrinsics and `clang` builtins for `Xssr` and `Xdma` extensions; assembly support for all instructions of the extension.
+- [Snitch RISC-V ISA extensions (`Xssr`, `Xfrep`, and `Xdma`)][snitch]: automatic insertion of `frep` hardware loops; intrinsics and `clang` builtins for `Xssr` and `Xdma` extensions; assembly support for all instructions of the extension. NEW: automatic SSR inference.
 
 # HERO and PULPv2 RISC-V ISA Extension Support
@@ -16,6 +16,7 @@ Refer to the [HERO repository](https://github.com/pulp-platform/hero) for build
 Refer to [snitch-toolchain-cd](https://github.com/pulp-platform/snitch-toolchain-cd) for build scripts and continuous deployment of pre-built toolchains.
 
 ## Command-line options
+Note that flags passed to LLVM through `clang` need to be prefixed with `-mllvm` (use `"SHELL:-mllvm <flag>"` in CMake to prevent de-duplication of the repeated `-mllvm`s).
 
 | Flag | Description |
 |---|---|
@@ -23,9 +24,16 @@
 | `--debug-only=riscv-sdma` | Enable the debug output of the DMA pseudo instruction expansion pass |
 | `--debug-only=riscv-ssr` | Enable the debug output of the SSR pseudo instruction expansion pass |
 | `--debug-only=snitch-freploops` | Enable the debug output of the FREP loop inference pass |
-| `--ssr-noregmerge` | Disable the SSR register merging in the SSR pseudo instruction expansion pass. Register merging is enabled by default and can be disabled with this flag. |
+| `--ssr-no-regmerge` | Disable SSR register merging in the SSR pseudo instruction expansion pass. Register merging is enabled by default. |
 | `--snitch-frep-inference` | Globally enable the FREP inference on all loops in the compiled module. |
-| `--enable-misched=false` | Disable the machine instruction scheduler. Instructions in a complex loop with multiple SSR push or pop instructions on the same data mover may not be rescheduled because the order in which the SSR are accessed is important. |
+| `-infer-ssr` | Enable automatic inference of SSR streams. |
+| `-ssr-no-intersect-check` | Do not generate intersection checks (unsafe). Use the `restrict` keyword instead if possible. |
+| `-ssr-no-tcdm-check` | Assume all data of inferred streams is inside the TCDM. |
+| `-ssr-no-bound-check` | Do not generate checks that make sure the inferred stream's access is executed at least once. |
+| `-ssr-conflict-free-only` | Only infer streams if they have no conflicts with other memory accesses. |
+| `-ssr-no-inline` | Prevent functions that contain SSR streams from being inlined. |
+| `-ssr-barrier` | Insert a spinning loop that waits for the stream to be done before it is disabled. |
+| `-ssr-verbose` | Write information about inferred streams to `stderr`. |
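For instance, a kernel of the following shape is what `-infer-ssr` looks for (an illustrative sketch; function and parameter names are made up). The `restrict` qualifiers assert that the two streams cannot overlap, which lets the inference drop the conservative run-time intersection checks without resorting to `-ssr-no-intersect-check`:

```c
// Both accesses are affine in i (constant step, loop-invariant trip
// count), so each can be mapped to an SSR data mover.
void axpy(int n, double a, const double *restrict x, double *restrict y) {
  for (int i = 0; i < n; ++i)
    y[i] += a * x[i];
}
```

Compiled, e.g., with `clang -O2 -mllvm -infer-ssr` on a Snitch target (exact target flags depend on the toolchain setup).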
 
 ## `clang` builtins
 The following `clang` builtins can be used to directly make use of the SSR and DMA extensions.
@@ -189,6 +197,11 @@ void __builtin_ssr_setup_bound_stride_4d(uint32_t DM, uint32_t b, uint32_t s);
 void __builtin_ssr_barrier(uint32_t DM);
 ```
+
+#### SSR Inference Interoperability
+Automatic SSR inference will not infer any streams inside an `ssr_enable` to `ssr_disable` region.
+Note that SSR inference currently treats any inline asm block as if it contained an SSR instruction, so it will not infer streams in any loop nest that contains inline asm.
+
+
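As an illustration of these rules (a sketch assuming the enable/disable builtins are spelled `__builtin_ssr_enable`/`__builtin_ssr_disable`; it only compiles with a toolchain that provides the `__builtin_ssr_*` family):

```c
double mixed(int n, const double *restrict x, double *restrict y) {
  double acc = 0.0;
  __builtin_ssr_enable();            // manually managed region: no inference in here
  for (int i = 0; i < n; ++i)
    acc += __builtin_ssr_pop(0);     // data mover 0 configured beforehand
  __builtin_ssr_disable();
  for (int i = 0; i < n; ++i)        // outside the region: eligible for inference,
    y[i] = 2.0 * x[i];               // as long as no inline asm appears in the nest
  return acc;
}
```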
 ### SDMA
 
 ```c
diff --git a/cmd_out.txt b/cmd_out.txt
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/llvm/include/llvm/Analysis/AffineAccessAnalysis.h b/llvm/include/llvm/Analysis/AffineAccessAnalysis.h
new file mode 100644
index 0000000000000..a50d68af2b193
--- /dev/null
+++ b/llvm/include/llvm/Analysis/AffineAccessAnalysis.h
@@ -0,0 +1,185 @@
+#include "llvm/IR/PassManager.h"
+#include "llvm/Passes/PassBuilder.h"
+#include "llvm/Passes/PassPlugin.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/DenseMap.h"
+
+#include <memory>
+#include <utility>
+#include <vector>
+
+namespace llvm {
+
+class AffineAccess;
+class AffineAccessAnalysis;
+class LoopInfo;
+class ScalarEvolution;
+class MemorySSA;
+class MemoryUseOrDef;
+class MemoryDef;
+struct ExpandedAffAcc;
+class DependenceInfo;
+class LoopAccessInfo;
+
+struct LoopRep {
+private:
+  ScalarEvolution &SE;
+  DominatorTree &DT;
+  const Loop *L;
+  const SCEV *RepSCEV;
+  Value *Rep = nullptr;
+  Value *RepPlusOne = nullptr;
+  SmallVector<const Loop *> containingLoops; //from inner- to outermost
+  unsigned safeExpandBound; //exclusive bound
+
+public:
+  ///constructs the rep for this loop; if the loop is well-formed, isAvailable() will return true
+  LoopRep(const Loop *L, ArrayRef<const Loop *> contLoops, ScalarEvolution &SE, DominatorTree &DT);
+  bool isAvailable() const;
+  bool isOnAllCFPathsOfParentIfExecuted() const;
+  const Loop *getLoop() const;
+  const SCEV *getSCEV() const;
+  const SCEV *getSCEVPlusOne() const;
+  bool isSafeToExpandBefore(const Loop *L) const;
+
+  ///expands LoopRep::RepSCEV at InsertBefore (if nullptr, in the preheader of the loop)
+  Value *expandAt(Type *ty, Instruction *InsertBefore = nullptr);
+  Value *expandLoopGuard(Instruction *InsertBefore = nullptr);
+};
+
+enum AffAccConflict { NoConflict = 0, MustNotIntersect = 1, Bad = 10 };
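To make the terms concrete: an `AffAcc` describes a (square) affine access as a base address plus one (step, rep) pair per loop dimension, where the rep corresponds to the loop's backedge-taken count. A hypothetical 2-D example in C:

```c
// addr(i, j) = A + (i * LDA + j) * sizeof(double)
//   dim 1 (inner loop j): step = sizeof(double),       rep = m - 1
//   dim 2 (outer loop i): step = LDA * sizeof(double), rep = n - 1
void scale2d(int n, int m, int LDA, double *A) {
  for (int i = 0; i < n; ++i)
    for (int j = 0; j < m; ++j)
      A[i * LDA + j] *= 2.0;
}
```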
+
+struct AffAcc {
+private:
+  ScalarEvolution &SE;
+  MemoryUseOrDef *MA;
+  SmallVector<Instruction *> accesses; //the load/store (or call) instructions
+  SmallVector<const SCEV *> baseAddresses; //base addresses depending on loop
+  SmallVector<const SCEV *> steps; //steps per loop (0 if loop-invariant)
+  SmallVector<LoopRep *> reps; //loop reps
+  SmallVector<const Loop *> containingLoops; //from inner- to outermost
+  DenseMap<AffAcc *, std::pair<const Loop *, AffAccConflict>> conflicts;
+  void findSteps(const SCEV *A, const SCEV *Factor, unsigned loop);
+  AffAccConflict fromConflictPair(const detail::DenseMapPair<AffAcc *, std::pair<const Loop *, AffAccConflict>> &p, const Loop *L) const;
+
+public:
+  AffAcc() = delete;
+  ///immediately copies the contents of accesses and containingLoops
+  AffAcc(ArrayRef<Instruction *> accesses, const SCEV *Addr, MemoryUseOrDef *MA, ArrayRef<const Loop *> containingLoops, ScalarEvolution &SE);
+  ArrayRef<Instruction *> getAccesses() const;
+  Value *getAddrValue() const;
+  bool isWrite() const;
+  int getMaxDimension() const;
+  const Loop *getDeepestMalformed() const;
+  bool isWellFormed(unsigned dimension) const;
+  bool isWellFormed(const Loop *L) const;
+  bool canExpandBefore(const Loop *L) const;
+  void dump() const;
+  void dumpInLoop(const Loop *L) const;
+  unsigned loopToDimension(const Loop *L) const;
+  const SCEV *getBaseAddr(unsigned dim) const;
+  const SCEV *getBaseAddr(const Loop *L) const;
+  const SCEV *getStep(unsigned dim) const;
+  const SCEV *getRep(unsigned dim) const;
+  const Loop *getLoop(unsigned dim) const;
+  ArrayRef<const Loop *> getContainingLoops() const;
+  AffAccConflict getConflict(AffAcc *A, const Loop *L) const;
+  std::vector<std::pair<AffAcc *, AffAccConflict>> getConflicts(const Loop *L) const;
+
+  MemoryUseOrDef *getMemoryAccess();
+  void addConflict(AffAcc *A, const Loop *StartL, AffAccConflict kind);
+  bool promote(LoopRep *LR); ///does not check whether it is on all CF-paths for LR->getLoop()
+  ///code gen:
+  Value *expandBaseAddr(unsigned dimension, Type *ty, Instruction *InsertBefore = nullptr);
+  Value *expandStep(unsigned dimension, Type *ty, Instruction *InsertBefore = nullptr);
+  Value *expandRep(unsigned dimension, Type *ty, Instruction *InsertBefore = nullptr);
+  ExpandedAffAcc expandAt(const Loop *L, Instruction *Point, Type *PtrTy, IntegerType *ParamTy);
+};
+
+struct MemDep {
+private:
+  MemorySSA &MSSA;
+  AAResults &AA;
+  bool alias(Value *A, Value *B);
+  bool alias(MemoryUseOrDef *A, MemoryUseOrDef *B);
+
+public:
+  MemDep(MemorySSA &MSSA, AAResults &AA) : MSSA(MSSA), AA(AA) {}
+  DenseSet<MemoryDef *> findClobbers(MemoryUseOrDef *MA);
+  DenseSet<MemoryUseOrDef *> findClobberUsers(MemoryDef *MA);
+};
+
+struct ExpandedAffAcc {
+public:
+  AffAcc *const Access;
+  Value *const Addr;
+  const SmallVector<Value *> Steps;
+  const SmallVector<Value *> Reps;
+  const SmallVector<Value *> Ranges;
+  const SmallVector<Value *> PrefixSumRanges;
+  Value *const LowerBound;
+  Value *const UpperBound;
+  unsigned getDimension() const { return Steps.size(); } //returns the nr of steps/reps/etc... there are
+  ExpandedAffAcc(AffAcc *A, Value *Addr, ArrayRef<Value *> Steps, ArrayRef<Value *> Reps,
+                 ArrayRef<Value *> Ranges, ArrayRef<Value *> PSRanges, Value *LowerBound, Value *UpperBound)
+    : Access(A), Addr(Addr), Steps(Steps.begin(), Steps.end()), Reps(Reps.begin(), Reps.end()),
+      Ranges(Ranges.begin(), Ranges.end()), PrefixSumRanges(PSRanges.begin(), PSRanges.end()),
+      LowerBound(LowerBound), UpperBound(UpperBound) { }
+};
+
+class AffineAccess {
+private:
+  ScalarEvolution &SE;
+  DominatorTree &DT;
+  LoopInfo &LI;
+  MemorySSA &MSSA;
+  AAResults &AA;
+  DependenceInfo &DI;
+  MemDep MD;
+  DenseMap<MemoryUseOrDef *, AffAcc *> access;
+  DenseMap<const Loop *, LoopRep *> reps;
+  DenseMap<const Loop *, SmallVector<AffAcc *>> promotedAccesses;
+  DenseMap<const Loop *, SmallVector<AffAcc *>> expandableAccesses;
+
+  std::unique_ptr<std::vector<AffAcc *>> analyze(Loop *Parent, ArrayRef<const Loop *> loopPath);
+  void addAllConflicts(const std::vector<AffAcc *> &all);
+  AffAccConflict calcRWConflict(AffAcc *Read, AffAcc *Write, const Loop *L) const;
+  std::pair<AffAccConflict, const Loop *> calcConflict(AffAcc *A, AffAcc *B) const;
+
+public:
+  AffineAccess(Function &F, ScalarEvolution &SE, DominatorTree &DT, LoopInfo &LI, MemorySSA &MSSA, AAResults &AA, DependenceInfo &DI);
+  AffineAccess() = delete;
+  bool accessPatternsMatch(const AffAcc *A, const AffAcc *B, const Loop *L) const;
+  bool accessPatternsAndAddressesMatch(const AffAcc *A, const AffAcc *B, const Loop *L) const;
+  ScalarEvolution &getSE() const;
+  DominatorTree &getDT() const;
+  LoopInfo &getLI() const;
+  MemorySSA &getMSSA() const;
+  AAResults &getAA() const;
+  DependenceInfo &getDI() const;
+  SmallVector<Loop *, 4> getLoopsInPreorder() const;
+
+  std::vector<AffAcc *> getExpandableAccesses(const Loop *L, bool conflictFreeOnly = false);
+  std::vector<ExpandedAffAcc> expandAllAt(ArrayRef<AffAcc *> Accs, const Loop *L, Instruction *Point,
+                                          Value *&BoundCheck, Type *PtrTy, IntegerType *ParamTy, bool conflictChecks = true, bool repChecks = false);
+};
+
+class AffineAccessAnalysis : public AnalysisInfoMixin<AffineAccessAnalysis> {
+  friend AnalysisInfoMixin<AffineAccessAnalysis>;
+  static AnalysisKey Key;
+
+public:
+  using Result = AffineAccess;
+  Result run(Function &F, FunctionAnalysisManager &AM);
+};
+
+// This is the analysis pass that will be invocable via opt
+class AffineAccessAnalysisPass : public AnalysisInfoMixin<AffineAccessAnalysisPass> {
+public:
+  PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+};
+
+} // namespace llvm
\ No newline at end of file
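The `BoundCheck` value produced by `expandAllAt` ANDs together per-conflict disjointness tests over the address ranges touched by each stream. In C terms, the check emitted for one `MustNotIntersect` pair looks roughly like this (a sketch of the generated logic, not code from the patch):

```c
#include <stdbool.h>
#include <stdint.h>

// Two streams with touched address ranges [lbA, ubA] and [lbB, ubB] may
// run only if the ranges do not overlap; this mirrors the
// CreateICmpULT/CreateOr pair emitted in expandAllAt.
static bool no_intersect(uintptr_t lbA, uintptr_t ubA,
                         uintptr_t lbB, uintptr_t ubB) {
  return (ubA < lbB) || (ubB < lbA);
}
```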
diff --git a/llvm/include/llvm/IR/IntrinsicsRISCV.td b/llvm/include/llvm/IR/IntrinsicsRISCV.td
index 835a535a9be48..d41c10ff1fe56 100644
--- a/llvm/include/llvm/IR/IntrinsicsRISCV.td
+++ b/llvm/include/llvm/IR/IntrinsicsRISCV.td
@@ -1434,19 +1434,18 @@ let TargetPrefix = "riscv" in {
       RISCVSSRIntrinsic;
 
   // The `Throws` attribute ensures that the push/pop don't get removed from loops
-  // by the LICM pass
-  // TODO: Is there another way to do this?
+  // by the LICM pass ==> not needed; LICM is only a problem if the intrinsic is read-only, so model the pop as read+write (the default)
   def int_riscv_ssr_push
     : GCCBuiltin<"__builtin_ssr_push">,
       Intrinsic<[], [llvm_i32_ty, llvm_double_ty],
-                [IntrWriteMem, IntrHasSideEffects, Throws, ImmArg<ArgIndex<0>>]>,
+                [IntrWriteMem, ImmArg<ArgIndex<0>>]>,
       RISCVSSRIntrinsic;
   def int_riscv_ssr_pop
     : GCCBuiltin<"__builtin_ssr_pop">,
       Intrinsic<[llvm_double_ty], [llvm_i32_ty],
-                [IntrReadMem, IntrHasSideEffects, Throws, ImmArg<ArgIndex<0>>]>,
+                [ImmArg<ArgIndex<0>>, IntrWriteMem]>, //use read+write instead of Throws to avoid LICM
       RISCVSSRIntrinsic;
   def int_riscv_ssr_enable
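The reasoning behind the attribute change: LICM may hoist a call that is marked read-only and non-throwing out of a loop, but every `ssr_pop` must execute once per iteration; modeling the pop as read+write keeps it in place without the `Throws` workaround. A sketch of the pattern that must not be hoisted (builtin semantics as documented in the README):

```c
// Hoisting the pop out of the loop would consume a single stream element
// instead of n of them; the read+write memory attributes forbid that.
double reduce(int n) {
  double acc = 0.0;
  for (int i = 0; i < n; ++i)
    acc += __builtin_ssr_pop(0); // one element per iteration from DM 0
  return acc;
}
```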
diff --git a/llvm/include/llvm/Transforms/SSR/SSRGeneration.h b/llvm/include/llvm/Transforms/SSR/SSRGeneration.h
new file mode 100644
index 0000000000000..20262f3016d32
--- /dev/null
+++ b/llvm/include/llvm/Transforms/SSR/SSRGeneration.h
@@ -0,0 +1,23 @@
+//===-- SSRGeneration.h - Generate SSR streams ------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_SSR_SSRGENERATION_H
+#define LLVM_TRANSFORMS_SSR_SSRGENERATION_H
+
+#include "llvm/IR/PassManager.h"
+
+namespace llvm {
+
+class SSRGenerationPass : public PassInfoMixin<SSRGenerationPass> {
+public:
+  PreservedAnalyses run(Function &F, FunctionAnalysisManager &FAM);
+};
+
+} // namespace llvm
+
+#endif // LLVM_TRANSFORMS_SSR_SSRGENERATION_H
diff --git a/llvm/include/llvm/Transforms/SSR/SSRInference.h b/llvm/include/llvm/Transforms/SSR/SSRInference.h
new file mode 100644
index 0000000000000..3a95c68da2fce
--- /dev/null
+++ b/llvm/include/llvm/Transforms/SSR/SSRInference.h
@@ -0,0 +1,23 @@
+//===-- SSRInference.h - Infer SSR usage ------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_SSR_SSRINFERENCE_H
+#define LLVM_TRANSFORMS_SSR_SSRINFERENCE_H
+
+#include "llvm/IR/PassManager.h"
+
+namespace llvm {
+
+class SSRInferencePass : public PassInfoMixin<SSRInferencePass> {
+public:
+  PreservedAnalyses run(Function &F, FunctionAnalysisManager &FAM);
+};
+
+} // namespace llvm
+
+#endif // LLVM_TRANSFORMS_SSR_SSRINFERENCE_H
diff --git a/llvm/lib/Analysis/AffineAccessAnalysis.cpp b/llvm/lib/Analysis/AffineAccessAnalysis.cpp
new file mode 100644
index 0000000000000..6b5a4220eef30
--- /dev/null
+++ b/llvm/lib/Analysis/AffineAccessAnalysis.cpp
@@ -0,0 +1,1263 @@
+//===-- AffineAccessAnalysis.cpp - find prefetchable square affine accesses ===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/AffineAccessAnalysis.h"
+
+#include "llvm/IR/PassManager.h"
+#include "llvm/Passes/PassBuilder.h"
+#include "llvm/Passes/PassPlugin.h"
+#include "llvm/Support/raw_ostream.h"
+
+#include "llvm/InitializePasses.h"
+
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/IntrinsicsRISCV.h"
+#include "llvm/IR/Dominators.h"
+
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
+#include "llvm/Transforms/Utils/LoopVersioning.h"
+
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/MemorySSA.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AliasAnalysisEvaluator.h"
+#include "llvm/Analysis/AliasSetTracker.h"
+#include "llvm/Analysis/DependenceAnalysis.h"
+#include "llvm/Analysis/LoopAccessAnalysis.h"
+
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/FormatVariadic.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Compiler.h"
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/TinyPtrVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/ilist.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/DenseMap.h"
+
+#include <deque>
+#include <memory>
+#include <set>
+#include <vector>
+
+#define DEBUG_TYPE "ssr"
+
+using namespace llvm;
+
+//================== AffineAccess, helper functions =========================================
+
+namespace {
+
+//collects the set of unknown values in a SCEV
+struct SCEVUnknownSetFinder {
+  DenseSet<Value *> values;
+  // return true to follow this node.
+  bool follow(const SCEV *S) {
+    if (S->getSCEVType() == SCEVTypes::scUnknown) {
+      values.insert(cast<SCEVUnknown>(S)->getValue());
+    }
+    return true; //always true
+  }
+  // return true to terminate the search.
+  bool isDone() { return false; /*continue forever*/ }
+};
+
+//finds whether two SCEVs share unknown values
+bool shareValues(const SCEV *A, const SCEV *B) {
+  SCEVUnknownSetFinder finderA;
+  SCEVTraversal<SCEVUnknownSetFinder> trA(finderA);
+  trA.visitAll(A);
+  SCEVUnknownSetFinder finderB;
+  SCEVTraversal<SCEVUnknownSetFinder> trB(finderB);
+  trB.visitAll(B);
+  bool shareValues = false;
+  for (Value *V : finderA.values) {
+    for (Value *W : finderB.values) {
+      shareValues |= V == W;
+    }
+  }
+  return shareValues;
+}
+
+//checks whether a SCEV contains the SCEVCouldNotCompute expression
+bool SCEVContainsCouldNotCompute(const SCEV *S) {
+  auto pred = [](const SCEV *X) { return !X || X->getSCEVType() == SCEVTypes::scCouldNotCompute || isa<SCEVCouldNotCompute>(X); };
+  return SCEVExprContains(S, std::move(pred));
+}
+
+/// guarantees:
+///  L has 1 preheader and 1 dedicated exit
+///  L has 1 backedge and 1 exiting block
+///  the bt SCEV can be expanded to instructions at the insertion point
+const SCEV *getLoopBTSCEV(const Loop *L, DominatorTree &DT, ScalarEvolution &SE){
+  if (!L->isLCSSAForm(DT)
+      || !L->getLoopPreheader()
+      || !L->getExitingBlock()
+      || !L->getExitBlock()
+      || !L->hasDedicatedExits()
+      || L->getNumBackEdges() != 1) {
+    return nullptr;
+  }
+  if (!SE.hasLoopInvariantBackedgeTakenCount(L)){
+    return nullptr;
+  }
+  const SCEV *bt = SE.getBackedgeTakenCount(L);
+  if(!bt || isa<SCEVCouldNotCompute>(bt) || !SE.isAvailableAtLoopEntry(bt, L) || SCEVContainsCouldNotCompute(bt)){
+    return nullptr;
+  }
+  return bt;
+}
+
+//casts SCEVs to the same type if possible (or always if unsafe = true)
+Optional<std::pair<const SCEV *, const SCEV *>> toSameType(const SCEV *LHS, const SCEV *RHS, ScalarEvolution &SE, bool unsafe = false){
+  assert(LHS && RHS);
+  using PT = std::pair<const SCEV *, const SCEV *>;
+
+  const DataLayout &DL = SE.getDataLayout();
+  LLVMContext &ctxt = SE.getContext();
+
+  Type *LT = LHS->getType(), *RT = RHS->getType();
+  if (LT == RT)
+    return Optional<PT>(std::make_pair(LHS, RHS)); //trivially the same size
+  if (LT->isPointerTy() && RT->isPointerTy()) //if we have pointers to different types
+    //PointerType *LTP = cast<PointerType>(LT); PointerType *RTP = cast<PointerType>(RT);
+    return Optional<PT>(std::make_pair(
+      SE.getPtrToIntExpr(LHS, Type::getIntNTy(ctxt, DL.getMaxPointerSizeInBits())),
+      SE.getPtrToIntExpr(RHS, Type::getIntNTy(ctxt, DL.getMaxPointerSizeInBits()))
+    ));
+
+  if (!LT->isSized() || !RT->isSized()) return None;
+  if (DL.getTypeSizeInBits(LT).isScalable() || DL.getTypeSizeInBits(RT).isScalable()) return None;
+
+  uint64_t ls = DL.getTypeSizeInBits(LT).getValue(), rs = DL.getTypeSizeInBits(RT).getValue();
+
+  if (ls > rs) {
+    if (auto LHSx = dyn_cast<SCEVConstant>(LHS)){
+      if (LHSx->getAPInt().getActiveBits() <= rs)
+        return Optional<PT>(std::make_pair(SE.getConstant(RHS->getType(), LHSx->getAPInt().getLimitedValue()), RHS));
+    }
+    if (auto RHSx = dyn_cast<SCEVConstant>(RHS)){
+      return Optional<PT>(std::make_pair(LHS, SE.getConstant(LHS->getType(), RHSx->getAPInt().getLimitedValue())));
+    }
+    if (auto LHSx = dyn_cast<SCEVZeroExtendExpr>(LHS)) return toSameType(LHSx->getOperand(0), RHS, SE);
+    if (auto LHSx = dyn_cast<SCEVSignExtendExpr>(LHS)) return toSameType(LHSx->getOperand(0), RHS, SE);
+    if (auto RHSx = dyn_cast<SCEVTruncateExpr>(RHS)) return toSameType(LHS, RHSx->getOperand(0), SE);
+    if (unsafe && LT->isIntegerTy() && RT->isIntegerTy()) return Optional<PT>(std::make_pair(SE.getTruncateExpr(LHS, RHS->getType()), RHS));
+    return None;
+  }else if (ls < rs){
+    auto p = toSameType(RHS, LHS, SE); //swap
+    if (!p.hasValue()) return None;
+    return Optional<PT>(std::make_pair(p.getValue().second, p.getValue().first)); //swap back
+  }
+  if (unsafe) return Optional<PT>(std::make_pair(LHS, RHS));
+  return None;
+}
+
+///checks whether LHS == RHS always
holds +bool SCEVEquals(const SCEV *LHS, const SCEV *RHS, ScalarEvolution &SE){ + auto p = toSameType(LHS, RHS, SE); + if (!p.hasValue()) return false; + LHS = p.getValue().first; + RHS = p.getValue().second; + if (LHS == RHS) return true; //trivially the same if this holds (bc const Ptr) + else{ + const SCEVPredicate *Peq = SE.getEqualPredicate(LHS, RHS); + if (Peq->isAlwaysTrue()) return true; //if we arrive at setup addr scev, we are done + } + return false; +} + +/// check whether BB is on all controlflow paths from header to header +// TODO: could also be done with DT +bool isOnAllControlFlowPaths(BasicBlock *BB, const Loop *L, const DominatorTree &DT){ + BasicBlock *End = L->getHeader(); + std::deque> q; + q.push_back(std::make_pair(End, false)); //start with header (false = BB not yet visited) + std::set> vis; //comp here is less> + while (!q.empty()){ + auto p = q.front(); q.pop_front(); + if (vis.find(p) != vis.end()) continue; + vis.insert(p); + for (BasicBlock *B : successors(p.first)){ + q.push_back(std::make_pair(B, p.second || B == BB)); + } + //check here whether End is reached with false (not at start of loop bc we also start with End) + p = q.front(); + if (!p.second && p.first == End) return false; //got to End (header) without ever visiting BB + } + return true; +} + +//return result of Cmp predicated on Rep > 0 if possible. +// i.e. if we can say that Rep > 0 implies that Cmp is always false or true, we return that, o/w return None +Optional predicatedICmpOutcome(ICmpInst *Cmp, const SCEV *Rep, ScalarEvolution &SE){ + switch (Cmp->getPredicate()) + { + case CmpInst::Predicate::ICMP_SGT: + case CmpInst::Predicate::ICMP_UGT: + { + const SCEV *LHS = SE.getSCEV(Cmp->getOperand(0)); + const SCEV *RHS = SE.getSCEV(Cmp->getOperand(1)); + //transform: LHS > RHS <==> LHS - RHS > 0 + const SCEV *LHSmRHS = SE.getMinusSCEV(LHS, RHS); + //then check whether Rep == LHS - RHS in which case we know: Rep > 0 ==> result of Cmp is true + if (SCEVEquals(Rep, LHSmRHS, SE)) return Optional(true); + else return None; + } + case CmpInst::Predicate::ICMP_SLT: + case CmpInst::Predicate::ICMP_ULT: + { + //a < b <==> b > a + const SCEV *LHS = SE.getSCEV(Cmp->getOperand(1)); //b + const SCEV *RHS = SE.getSCEV(Cmp->getOperand(0)); //a + //transform: LHS > RHS <==> LHS - RHS > 0 + const SCEV *LHSmRHS = SE.getMinusSCEV(LHS, RHS); + //then check whether Rep == LHS - RHS in which case we know: Rep > 0 ==> result of Cmp is true + if (SCEVEquals(Rep, LHSmRHS, SE)) return Optional(true); + else return None; + } + case CmpInst::Predicate::ICMP_EQ: + case CmpInst::Predicate::ICMP_NE: + { + //Rep > 0 ==> Rep + x != x + const SCEV *LHS = SE.getSCEV(Cmp->getOperand(0)); //Rep + x (hopefully) + const SCEV *RHS = SE.getSCEV(Cmp->getOperand(1)); //x + const SCEV *LHSmRHS = SE.getMinusSCEV(LHS, RHS); //Rep (hopefully) + if (SCEVEquals(Rep, LHSmRHS, SE)) return Optional(Cmp->getPredicate() == CmpInst::Predicate::ICMP_NE); + else return None; + } + default: + return None; + } +} + +//conservative! 
+//because SCEVComparePredicate is not in this version of LLVM we have to do this manually ==> will not catch all cases (FIXME) +//predicate is that Rep > 0 +bool isOnAllPredicatedControlFlowPaths(BasicBlock *BB, const Loop *L, const DominatorTree &DT, const SCEV *Rep, ScalarEvolution &SE){ + if (isOnAllControlFlowPaths(BB, L, DT)) return true; //is on all paths anyway + DenseSet vis; //visited set + std::deque q(1U, L->getHeader()); //iterative BFS with queue + while (!q.empty()){ + BasicBlock *Current = q.front(); q.pop_front(); + if (Current == BB) continue; //do not continue BFS from BB + if (vis.find(Current) == vis.end()) continue; //already visited this block + vis.insert(Current); + + Instruction *T = Current->getTerminator(); + LLVM_DEBUG(T->dump()); + if (BranchInst *BR = dyn_cast(T)){ + if (BR->isConditional()){ + if (ICmpInst *Cmp = dyn_cast(BR->getCondition())){ //FOR NOW: only works with a single ICmpInst as branch condition operand + LLVM_DEBUG(Cmp->dump()); + auto r = predicatedICmpOutcome(Cmp, Rep, SE); + if (r.hasValue()){ + if (r.getValue()) q.push_back(BR->getSuccessor(0)); + else q.push_back(BR->getSuccessor(1)); + }else{ + q.push_back(BR->getSuccessor(0)); + q.push_back(BR->getSuccessor(1)); + } + } + }else{ + q.push_back(BR->getSuccessor(0)); //add the only successor to queue + } + }else{ + return false; //unknown jump somewhere else ==> BB not on all predicated paths + } + + if (q.front() == L->getHeader()) return false; //bfs arrived at Header (again) with a path that never went through BB + } + return true; +} + +//cast to right integer size, insert instruction at `InsPoint` +Value *castToSize(Value *R, Type *ty, Instruction *InsPoint){ + const DataLayout &DL = InsPoint->getParent()->getModule()->getDataLayout(); + IRBuilder<> builder(InsPoint); + Type *rty = R->getType(); + if (rty == ty) return R; + if (DL.getTypeSizeInBits(rty) > DL.getTypeSizeInBits(ty)) { + return builder.CreateTruncOrBitCast(R, ty, "scev.trunc"); + } + if (DL.getTypeSizeInBits(rty) < DL.getTypeSizeInBits(ty)) { + return builder.CreateSExtOrBitCast(R, ty, "scev.sext"); + } + return builder.CreateBitOrPointerCast(R, ty, "scev.cast"); +} + +// extract the Address Value of MA (nullptr if not available) +Value *getAddress(MemoryUseOrDef *MA) { + assert(MA && "called getAddress on nullptr"); + assert(MA->getMemoryInst()); + Instruction *I = MA->getMemoryInst(); + if (auto *L = dyn_cast(I)) return L->getPointerOperand(); + if (auto *S = dyn_cast(I)) return S->getPointerOperand(); + return nullptr; +} + +//find the first L in loops that contains BB +//loops should be a nesting of loops from inner to outermost +const Loop *findFirstContaining(ArrayRef loops, BasicBlock *BB){ + for (const Loop *L : loops) { + if (L && L->contains(BB)) { + return L; + } + } + return nullptr; +} + +//find out whether MA stands for some load/store (for some reason they don't always do, maybe bc of DCE?) 
+bool hasMemInst(MemoryUseOrDef *MA) { return MA && MA->getMemoryInst(); }
+
+//updates L<-M if M is a descendant of L (or if L is nullptr)
+void updateIfDescendant(const Loop *&L, const Loop *M) {
+  if (!L || (M && L->contains(M))) L = M;
+}
+
+//updates L<-M if L is a descendant of M OR if M is nullptr
+void updateIfAncestor(const Loop *&L, const Loop *M) {
+  if (!M || M->contains(L)) L = M;
+}
+
+void updateOutermostExpandableExcl(const Loop *&outerMostExpandableExl, AffAccConflict kind, const Loop *innermostCommon, const Loop *deepestMalformed) {
+  switch (kind) {
+  case AffAccConflict::NoConflict:
+    break;
+  case AffAccConflict::MustNotIntersect:
+    updateIfAncestor(innermostCommon, deepestMalformed); //updates innermostCommon to deepestMalformed if that one is less "deep"
+    LLVM_FALLTHROUGH;
+  case AffAccConflict::Bad:
+    updateIfDescendant(outerMostExpandableExl, innermostCommon);
+    break;
+  default:
+    llvm_unreachable("unknown conflict type");
+  }
+}
+
+//tries to find the sign of a SCEV with the information given
+Optional<int> findSign(const SCEV *S, ScalarEvolution &SE, std::vector<std::pair<const SCEV *, int>> &known) {
+  if (!S) return None;
+
+  //in case we know
+  for (const auto &p : known) {
+    if (SCEVEquals(S, p.first, SE)) return p.second;
+  }
+
+  //in case SE knows
+  if (SE.isKnownNegative(S)) return -1;
+  if (SE.isKnownPositive(S)) return 1;
+  if (S->isZero()) return 0;
+
+  //do recursively
+  switch (S->getSCEVType())
+  {
+  case SCEVTypes::scConstant:
+    if (S->isZero()) return 0;
+    else if (SE.isKnownPositive(S)) return 1;
+    else if (SE.isKnownNegative(S)) return -1;
+    llvm_unreachable("SE does not know sign of constant value ???");
+
+  case SCEVTypes::scMulExpr: {
+    auto l = findSign(cast<SCEVMulExpr>(S)->getOperand(0), SE, known);
+    auto r = findSign(cast<SCEVMulExpr>(S)->getOperand(1), SE, known);
+    if (!l.hasValue() || !r.hasValue()) return None;
+    return r.getValue() * l.getValue();
+  }
+
+  case SCEVTypes::scAddExpr: {
+    auto l = findSign(cast<SCEVAddExpr>(S)->getOperand(0), SE, known);
+    auto r = findSign(cast<SCEVAddExpr>(S)->getOperand(1), SE, known);
+    if (!l.hasValue() || !r.hasValue()) return None;
+    if (l.getValue() + r.getValue() >= 1) return 1;
+    if (l.getValue() + r.getValue() <= -1) return -1;
+    return None;
+  }
+
+  case SCEVTypes::scPtrToInt:
+  case SCEVTypes::scTruncate:
+    return findSign(cast<SCEVCastExpr>(S)->getOperand(0), SE, known);
+
+  //TODO: could add max/min, etc...
+
+  default:
+    return None;
+  }
+  llvm_unreachable("");
+}
+
+//cast some SCEVs if necessary
+const SCEV *getZExtIfNeeded(const SCEV *S, Type *Ty, ScalarEvolution &SE) {
+  if (SE.getDataLayout().getTypeSizeInBits(S->getType()) < SE.getDataLayout().getTypeSizeInBits(Ty)) {
+    return SE.getZeroExtendExpr(S, Ty);
+  }
+  return S;
+}
+
+//cast some SCEVs if necessary
+const SCEV *getSExtIfNeeded(const SCEV *S, Type *Ty, ScalarEvolution &SE) {
+  if (SE.getDataLayout().getTypeSizeInBits(S->getType()) < SE.getDataLayout().getTypeSizeInBits(Ty)) {
+    return SE.getSignExtendExpr(S, Ty);
+  }
+  return S;
+}
+
+} //end of namespace
+
+//================== ===========================================================
+
+// ==== LoopRep ====
+LoopRep::LoopRep(const Loop *L, ArrayRef<const Loop *> contLoops, ScalarEvolution &SE, DominatorTree &DT)
+  : SE(SE), DT(DT), L(L), containingLoops(contLoops.begin(), contLoops.end()), safeExpandBound(0u)
+{
+  RepSCEV = getLoopBTSCEV(L, DT, SE);
+  if (RepSCEV) LLVM_DEBUG(dbgs()<<"new LoopRep with rep scev: "<<*RepSCEV<<"\n");
+  else LLVM_DEBUG(dbgs()<<"new LoopRep with rep scev: <nullptr>\n");
+
+  if (RepSCEV){
+    while (safeExpandBound < containingLoops.size()
+        && (!containingLoops[safeExpandBound]
+          || isSafeToExpandAt(RepSCEV, containingLoops[safeExpandBound]->getLoopPreheader()->getTerminator(), SE))){
+      safeExpandBound++;
+    }
+  }
+}
+
+bool LoopRep::isAvailable() const { return RepSCEV != nullptr; }
+const Loop *LoopRep::getLoop() const { return L; }
+const SCEV *LoopRep::getSCEV() const {
+  assert(isAvailable() && "SCEV available"); //not necessary, but forces good practice
+  return RepSCEV;
+}
+
+const SCEV *LoopRep::getSCEVPlusOne() const {
+  assert(isAvailable() && "SCEV available");
+  return SE.getAddExpr(getSCEV(), SE.getConstant(getSCEV()->getType(), 1UL));
+}
+
+bool LoopRep::isOnAllCFPathsOfParentIfExecuted() const { //FIXME: maybe cache this result once calculated?
+  assert(isAvailable() && "SCEV available");
+  return isOnAllPredicatedControlFlowPaths(L->getHeader(), L->getParentLoop(), DT, getSCEVPlusOne(), SE);
+}
+
+bool LoopRep::isSafeToExpandBefore(const Loop *L) const {
+  assert(isAvailable() && "SCEV available");
+  if (L == getLoop()) return true;
+  for (unsigned i = 0u; i < safeExpandBound; i++) { //FIXME: linear search -> use a map instead
+    if (L == containingLoops[i]) return true;
+  }
+  return false;
+}
+
+//code generation for the loop rep; caches the Value holding the result after the first call to prevent excessive code-gen
+Value *LoopRep::expandAt(Type *ty, Instruction *InsertBefore){
+  assert(ty);
+  assert(RepSCEV);
+  if (Rep) { //FIXME: currently forces the user to make the first call at a point that dominates all possible uses (improvement: could update the expand point using DT)
+    assert(ty == Rep->getType() && "was already expanded with same type");
+    return Rep;
+  }
+  InsertBefore = InsertBefore ? InsertBefore : L->getLoopPreheader()->getTerminator();
+  const SCEV *RepP1 = getSCEVPlusOne(); //we go over the +1 version here because getSCEV() is usually sth like %n-1, so this becomes just %n
+  assert(isSafeToExpandAt(RepP1, InsertBefore, SE) && "bound not expandable here");
+  SCEVExpander ex(SE, L->getHeader()->getModule()->getDataLayout(), "rep");
+  ex.setInsertPoint(InsertBefore);
+  RepPlusOne = castToSize(ex.expandCodeFor(RepP1), ty, InsertBefore);
+  IRBuilder<> builder(InsertBefore);
+  Rep = builder.CreateSub(RepPlusOne, ConstantInt::get(ty, 1u), "rep");
+  return Rep;
+}
+
+//code-gen for loop guard, ie.
inserts code of rep+1 > 0 +Value *LoopRep::expandLoopGuard(Instruction *InsertBefore) { + assert(RepPlusOne && "expandAt has to be called before this"); + InsertBefore = InsertBefore ? InsertBefore : L->getLoopPreheader()->getTerminator(); + IRBuilder<> builder(InsertBefore); + return builder.CreateICmpSGT(RepPlusOne, ConstantInt::get(Rep->getType(), 0u, true)); //FIXME: this only works for unsigned Rep's that are < 2^30 (for i32) +} + +// ==== AffAcc ==== +AffAcc::AffAcc(ArrayRef accesses, const SCEV *Addr, MemoryUseOrDef *MA, ArrayRef contLoops, ScalarEvolution &SE) + : SE(SE), MA(MA), accesses(accesses.begin(), accesses.end()) +{ + assert(!accesses.empty()); + assert(MA); + + containingLoops.push_back((const Loop *)nullptr); //there is no loop for dim=0 + containingLoops.append(contLoops.begin(), contLoops.end()); //initialize loops + + bool isVolatile = false; + for (Instruction *I : accesses) //check for volatile mem insts, we don't want to touch those + isVolatile |= (isa(I) && cast(I)->isVolatile()) || (isa(I) && cast(I)->isVolatile()); + if (Addr && (SCEVContainsCouldNotCompute(Addr) || isVolatile)) Addr = nullptr; //set to null if contains SCEVCouldNotCompute + baseAddresses.push_back(Addr); + if (!Addr) return; //do not look for steps or addresses if SCEV of address is unknown + steps.push_back((const SCEV *)nullptr); //there is no step for dim=0 + reps.push_back((LoopRep *)nullptr); //there is no rep for dim=0 + findSteps(Addr, (const SCEV *)nullptr, 1u); //find steps + for (unsigned dim = 1; dim < containingLoops.size(); dim++){ + Addr = SE.SplitIntoInitAndPostInc(containingLoops[dim], Addr).first; + baseAddresses.push_back(Addr); + } +} + +//fold over A and collect steps in AddRec expressions +//the found steps might not be valid for square affine access patterns ==> `promote` will check this +void AffAcc::findSteps(const SCEV *A, const SCEV *Factor, unsigned loop){ + assert(A); + assert(baseAddresses.size() == 1 && reps.size() == 1 && "we only know dim=0 so far"); + + if (loop >= containingLoops.size()) return; //we are done + + if (!SE.containsAddRecurrence(A) && loop < containingLoops.size()){ + //A is inv to the rest of the loops + steps.push_back(SE.getConstant(Type::getInt64Ty(this->accesses[0]->getContext()), 0U)); + findSteps(A, Factor, loop + 1u); + } + + switch (A->getSCEVType()) + { + //unary expressions that do not change value + case SCEVTypes::scZeroExtend: //FIXME: this might be unsafe + case SCEVTypes::scSignExtend: + case SCEVTypes::scTruncate: + return findSteps(cast(A)->getOperand(0), Factor, loop); + + // TODO: if we want to allow random adds in between then we would need to add the non-recursive part to the base address + // case SCEVTypes::scAddExpr: { + // const SCEV *L = cast(A)->getOperand(0); + // const SCEV *R = cast(A)->getOperand(1); + // bool l = SE.containsAddRecurrence(L); + // bool r = SE.containsAddRecurrence(R); + // if (l && !r) return findSteps(L, Factor, loop); + // else if(!l && r) return findSteps(R, Factor, loop); + // return; + // } + + //L * R + case SCEVTypes::scMulExpr: { + const SCEV *L = cast(A)->getOperand(0); + const SCEV *R = cast(A)->getOperand(1); + bool l = SE.containsAddRecurrence(L); + bool r = SE.containsAddRecurrence(R); + if (l == r) return; + if (!l && r) std::swap(L, R); + assert(SE.containsAddRecurrence(L) && !SE.containsAddRecurrence(R)); + if (Factor) { + auto p = toSameType(Factor, R, SE, true); + if (!p.hasValue()) return; + Factor = SE.getMulExpr(p.getValue().first, p.getValue().second); + }else Factor = R; + 
return findSteps(L, Factor, loop); + } + + //{,+,Step} + case SCEVTypes::scAddRecExpr: { + const auto *S = cast(A); + const SCEV *Step; + if (S->getLoop() == containingLoops[loop]){ //L == containingLoops[loop] + Step = S->getStepRecurrence(SE); + if (Factor) { + auto p = toSameType(Factor, Step, SE, true); + if (!p.hasValue()) return; + Step = SE.getMulExpr(p.getValue().first, p.getValue().second); + } + steps.push_back(Step); + return findSteps(S->getStart(), Factor, loop+1); + }else{ //A is loop-invariant to containingLoops[loop] + bool occursLater = false; //loop needs to occur later + for (unsigned i = loop+1; i < containingLoops.size(); i++) + occursLater = occursLater || containingLoops[i] == S->getLoop(); + if (!occursLater) return; + steps.push_back(SE.getConstant(Type::getInt64Ty(this->accesses[0]->getContext()), 0U)); + return findSteps(S, Factor, loop+1); + } + } + default: //in all other cases we cannot safely extract more steps and thus just return + return; + } +} + +ArrayRef AffAcc::getAccesses() const { return accesses; } + +bool AffAcc::isWrite() const { return isa(MA); } + +///the nr of times `this` was promoted (-1 means the address is not known) +int AffAcc::getMaxDimension() const { return (int)reps.size() - 1; } + +///return the first (as in deepest) Loop L where this->isWellFormed(L) is false +///returns null if there is no such loop +const Loop *AffAcc::getDeepestMalformed() const { + for (const Loop *L : containingLoops) { + if (L && !isWellFormed(L)) return L; + } + return nullptr; + /*unsigned malformedStart = (unsigned)(getMaxDimension() + 2); //getMaxDimension() >= -1 + if (containingLoops.size() > malformedStart) return containingLoops[malformedStart]; + else return nullptr;*/ +} + +///true if this was successfully promoted to the given dimension (ie. nr of promotions is at least `dimension`) +bool AffAcc::isWellFormed(unsigned dimension) const { + int md = getMaxDimension(); + return md >= 0 && dimension <= (unsigned)md; +} + +///true if this was successfully promoted to the given dimension (ie. 
nr of promotions is `dimension`) +///if true, this means that `this` can be expanded in the preheader of `L` +bool AffAcc::isWellFormed(const Loop *L) const { return isWellFormed(loopToDimension(L)); } + +///returns the dimension that is defined by `L` (starts at 1) +unsigned AffAcc::loopToDimension(const Loop *L) const { + assert(L && "L not nullptr"); + for (unsigned d = 1u; d < containingLoops.size(); d++){ //FIXME: linear search -> improve with a map + if (containingLoops[d] == L) return d; + } + llvm_unreachable("The provided loop does not contain `this`!"); +} + +Value *AffAcc::getAddrValue() const { + assert(getBaseAddr(0u) && "has an address"); + if (isWrite()) { + return cast(accesses[0])->getPointerOperand(); + } else { + return cast(accesses[0])->getPointerOperand(); + } +} + +///SCEV of base Address for the base address at a given dimension +const SCEV *AffAcc::getBaseAddr(unsigned dim) const { assert(dim < baseAddresses.size()); return baseAddresses[dim]; } + +///SCEV of base Address outside of `L` +const SCEV *AffAcc::getBaseAddr(const Loop *L) const { return getBaseAddr(loopToDimension(L)); } + +///SCEV of step for the dimension `dim` (that means there is no step for `dim` = 0) +const SCEV *AffAcc::getStep(unsigned dim) const { assert(dim < steps.size()); return steps[dim]; } + +///SCEV of rep for the dimension `dim` (that means there is no rep for `dim` = 0) +const SCEV *AffAcc::getRep(unsigned dim) const { + assert(dim < reps.size()); + if (!reps[dim] || !reps[dim]->isAvailable()) return nullptr; + return reps[dim]->getSCEV(); +} + +///get Loop for given `dim` (that means there is no Loop for `dim` = 0) +const Loop *AffAcc::getLoop(unsigned dim) const { assert(dim < containingLoops.size()); return containingLoops[dim]; } + +///get containing loops from inner- to outermost +ArrayRef AffAcc::getContainingLoops() const { return ArrayRef(containingLoops); } + +//dump info known for this AffAcc up to some loop L +void AffAcc::dumpInLoop(const Loop *L) const { + errs()<<"Affine Access of \n"; + int dimension = getMaxDimension(); + if (L) dimension = std::min((int)loopToDimension(L), dimension); + for (auto *I : accesses) errs()<<*I<<"\n"; + if (dimension < 0) errs()<<"\t\n"; + for (int dim = 0; dim <= dimension && dim <= getMaxDimension(); dim++){ + const SCEV *s = getStep(dim); + const SCEV *r = getRep(dim); + const SCEV *a = getBaseAddr(dim); + errs()<<"\tdim = "<"; + errs()<<", rep = "; + if (r) errs()<<*r; + else errs()<<""; + errs()<<", well-formed = "<isWellFormed(dim); + errs()<<"\n"; + errs()<<"\taddress = "; + if (a) errs()<<*a; + else errs()<<""; + errs()<<"\n"; + errs()<<"\tloop header = "; + if (getLoop(dim)) errs()<getHeader()->getNameOrAsOperand(); + else errs()<<""; + errs()<<"\n"; + } +} + +//dump all info known about this AffAcc +void AffAcc::dump() const { dumpInLoop(nullptr); } + +//get the actual conflict between this and the AffAcc in the pair for some loop L +AffAccConflict AffAcc::fromConflictPair(const detail::DenseMapPair> &p, const Loop *L) const { + const Loop *S = p.getSecond().first; + if (S == L || L->contains(S)) { //if start is L or more "inner" loop + if (!isWellFormed(L) || !p.first->isWellFormed(L)) return AffAccConflict::Bad; //if either is not well-formed "demote" the conflict to bad (but only if exists) + return p.getSecond().second; + } + return AffAccConflict::NoConflict; +} + +//get the actual conflict between this and A for loop L +AffAccConflict AffAcc::getConflict(AffAcc *A, const Loop *L) const { + auto p = conflicts.find(A); + if (p 
!= conflicts.end()) {
+    return fromConflictPair(*p, L);
+  }
+  return AffAccConflict::NoConflict;
+}
+
+/// returns a vector of (AffAcc *, conflict) pairs containing all the conflicts that `this` has at loop `L`
+/// It is guaranteed that the conflict is never NoConflict
+std::vector<std::pair<AffAcc *, AffAccConflict>> AffAcc::getConflicts(const Loop *L) const {
+  std::vector<std::pair<AffAcc *, AffAccConflict>> res;
+  for (const auto &p : conflicts) {
+    assert(p.first);
+    assert(p.getSecond().first);
+    AffAccConflict kind = fromConflictPair(p, L);
+    if (kind != AffAccConflict::NoConflict) res.push_back(std::make_pair(p.first, kind));
+  }
+  return res;
+}
+
+MemoryUseOrDef *AffAcc::getMemoryAccess() { return MA; }
+
+//add a conflict with A, where StartL is the innermost shared loop, with conflict classification `kind`
+void AffAcc::addConflict(AffAcc *A, const Loop *StartL, AffAccConflict kind){
+  assert(StartL);
+  assert(conflicts.find(A) == conflicts.end() && "no conflict for A yet");
+  assert(kind == AffAccConflict::Bad || (isWellFormed(StartL) && A->isWellFormed(StartL)));
+  conflicts.insert(std::make_pair(A, std::make_pair(StartL, kind)));
+}
+
+//promote `this` if possible.
+//`LR` should be the rep of the next outer loop where this is not (yet) well-formed.
+//if successful, `this` is well-formed for LR->getLoop() afterwards.
+bool AffAcc::promote(LoopRep *LR){
+  if (!LR->isAvailable()) return false;
+  unsigned newDim = (unsigned)(getMaxDimension() + 1); //getMaxDimension() >= -1
+  if (getLoop(newDim) != LR->getLoop()) return false;
+  LLVM_DEBUG(dbgs()<<"promote: (1) loops match, ");
+  bool possible = true;
+  Instruction *Point = LR->getLoop()->getLoopPreheader()->getTerminator();
+  //check all current reps and steps
+  for (unsigned dim = 1; dim < newDim; dim++){
+    possible &= isSafeToExpandAt(getStep(dim), Point, SE);
+    possible &= reps[dim]->isSafeToExpandBefore(LR->getLoop());
+  }
+  if (possible) LLVM_DEBUG(dbgs()<<"can expand (2) current rep & step, ");
+  //check rep and step of the new dimension
+  possible &= steps.size() > newDim && isSafeToExpandAt(getStep(newDim), Point, SE);
+  possible &= LR->isSafeToExpandBefore(LR->getLoop());
+  if (possible) LLVM_DEBUG(dbgs()<<"(3) new rep & step, ");
+  //check base address
+  possible &= !SCEVContainsCouldNotCompute(getBaseAddr(newDim)) && isSafeToExpandAt(getBaseAddr(newDim), Point, SE);
+  if (possible) LLVM_DEBUG(dbgs()<<"and (4) new base addr!");
+  LLVM_DEBUG(dbgs()<<"\n");
+  if (!possible) return false;
+
+  reps.push_back(LR); //changes getMaxDimension()
+  return true;
+}
+
+//Code-gen for the base address
+Value *AffAcc::expandBaseAddr(unsigned dimension, Type *ty, Instruction *InsertBefore){
+  assert(isWellFormed(dimension));
+  InsertBefore = InsertBefore ? InsertBefore : reps[dimension]->getLoop()->getLoopPreheader()->getTerminator();
+  if (!isSafeToExpandAt(getBaseAddr(dimension), InsertBefore, SE)){
+    LLVM_DEBUG(dbgs()<<"data not expandable here (note: only the preheader is guaranteed)\n");
+    LLVM_DEBUG(dbgs()<<"SCEV (dim = "<<dimension<<"): "<<*getBaseAddr(dimension)<<"\n");
+    LLVM_DEBUG(InsertBefore->getParent()->dump());
+    LLVM_DEBUG(dbgs()<<"before inst: "<<*InsertBefore<<"\n");
+    LLVM_DEBUG(this->dump());
+    llvm_unreachable("cannot expand SCEV at desired location");
+  }
+  SCEVExpander ex(SE, reps[dimension]->getLoop()->getHeader()->getModule()->getDataLayout(), "addr");
+  ex.setInsertPoint(InsertBefore);
+  return castToSize(ex.expandCodeFor(getBaseAddr(dimension)), ty, InsertBefore);
+}
+
+//code-gen for step
+Value *AffAcc::expandStep(unsigned dimension, Type *ty, Instruction *InsertBefore){
+  assert(isWellFormed(dimension) && dimension > 0u);
+  InsertBefore = InsertBefore ?
InsertBefore : reps[dimension]->getLoop()->getLoopPreheader()->getTerminator(); + assert(isSafeToExpandAt(getStep(dimension), InsertBefore, SE) && "data not expanable here (note: only preheader guaranteed)"); + SCEVExpander ex(SE, reps[dimension]->getLoop()->getHeader()->getModule()->getDataLayout(), "step"); + ex.setInsertPoint(InsertBefore); + return castToSize(ex.expandCodeFor(getStep(dimension)), ty, InsertBefore); +} + +//code-gen for rep (calls code-gen of the LoopRep) +Value *AffAcc::expandRep(unsigned dimension, Type *ty, Instruction *InsertBefore){ + assert(isWellFormed(dimension) && dimension > 0u); + InsertBefore = InsertBefore ? InsertBefore : reps[dimension]->getLoop()->getLoopPreheader()->getTerminator(); + if (!isSafeToExpandAt(getRep(dimension), InsertBefore, SE)) { + getRep(dimension)->dump(); + InsertBefore->dump(); + InsertBefore->getParent()->dump(); + this->dump(); + } + return reps[dimension]->expandAt(ty, InsertBefore); +} + +//code-gen for all info needed to know the square affine access pattern inside of L +//guaranteed to work if `Point` is the terminator of preheader of L +ExpandedAffAcc AffAcc::expandAt(const Loop *L, Instruction *Point, + Type *PtrTy, IntegerType *ParamTy) +{ + if (!Point) Point = L->getLoopPreheader()->getTerminator(); + IRBuilder<> builder(Point); + assert(isWellFormed(L)); + std::vector reps, steps, ranges, prefixsum_ranges; + const unsigned dim = loopToDimension(L); + Value *Addr = expandBaseAddr(dim, PtrTy, Point); + IntegerType *SizeTy = IntegerType::get(SE.getContext(), (unsigned)SE.getTypeSizeInBits(Addr->getType())); + Value *psum = nullptr; + Value *LowerBound = builder.CreatePtrToInt(Addr, SizeTy, "lb"); + Value *UpperBound = LowerBound; + std::vector> known; + for (int d = 1u; d < getMaxDimension(); d++) { + known.push_back(std::make_pair(this->reps[d]->getSCEVPlusOne(), 1)); + } + for (unsigned i = 1u; i <= dim; i++) { + reps.push_back(expandRep(i, ParamTy, Point)); + steps.push_back(expandStep(i, ParamTy, Point)); + Value *r = reps.back(); + Value *st = steps.back(); + ranges.push_back(builder.CreateMul(r, st, formatv("range.{0}d", i))); + if (psum) psum = builder.CreateAdd(psum, ranges.back(), formatv("prefsum.range.{0}d", i)); + else psum = ranges.back(); + prefixsum_ranges.push_back(psum); + auto sign = findSign(getStep(i), SE, known); + if (sign.hasValue()) { + if (sign.getValue() < 0) LowerBound = builder.CreateAdd(LowerBound, builder.CreateSExtOrTrunc(ranges.back(), SizeTy, "lb.dec")); + else if (sign.getValue() > 0) UpperBound = builder.CreateAdd(UpperBound, builder.CreateZExtOrTrunc(ranges.back(), SizeTy, "ub.inc")); + //else sign == 0: no action needed + } else { //we do not know sign! 
need to test at runtime + Value *Test = builder.CreateICmpSGE(ranges.back(), ConstantInt::get(ParamTy, 0), "test.nonnegative"); //FIXME: does not work for unsigned values > 2^30 + LowerBound = builder.CreateSelect( + builder.CreateNot(Test, formatv("not.test.{0}d", i)), + builder.CreateSExtOrTrunc(ranges.back(), SizeTy, formatv("range.{0}d.sext", i)), + ConstantInt::get(SizeTy, 0) + ); + UpperBound = builder.CreateSelect( + Test, + builder.CreateZExtOrTrunc(ranges.back(), SizeTy, formatv("range.{0}d.zext", i)), + ConstantInt::get(SizeTy, 0) + ); + } + } + ExpandedAffAcc Aexp(this, Addr, steps, reps, ranges, prefixsum_ranges, LowerBound, UpperBound); + return Aexp; +} + + +// ================= MemDep ============== + +bool MemDep::alias(Value *A, Value *B) { return !A || !B || AA.alias(A, B) != AliasResult::NoAlias; } +bool MemDep::alias(MemoryUseOrDef *A, MemoryUseOrDef *B) { + if (!hasMemInst(A) || !hasMemInst(B)) return false; //the memoryUseOrDef does not correspond to an instruction => no problem + else return alias(getAddress(A), getAddress(B)); +} + +//returns all MemoryDefs that might clobber MA +//i.e. we cannot be sure at compile-time that they *don't* clobber MA +DenseSet MemDep::findClobbers(MemoryUseOrDef *MA){ + DenseSet res; + std::deque worklist; + DenseSet vis; + worklist.push_back(MA->getDefiningAccess()); + while (!worklist.empty()) { + MemoryAccess *A = worklist.front(); worklist.pop_front(); + if (!A) continue; + if (vis.find(A) != vis.end()) continue; + if (A == MA) continue; + vis.insert(A); + if (MemoryDef *D = dyn_cast(A)) { + if (alias(D, MA)) { + res.insert(D); + } + worklist.push_back(D); + } else { + MemoryPhi *P = cast(A); + for (unsigned i = 0u; i < P->getNumOperands(); i++) { + worklist.push_back(P->getOperand(i)); + } + } + } + return res; +} + +//find all MemoryUse's or MemoryDef's that might be clobbered by MA (might = must OR we do not know at compile-time) +DenseSet MemDep::findClobberUsers(MemoryDef *MA) { + DenseSet res; + std::deque worklist; + DenseSet vis; + for (auto U = MA->use_begin(); U != MA->use_end(); ++U) { + worklist.push_back(cast(U->getUser())); + } + while (!worklist.empty()){ + MemoryAccess *A = worklist.front(); worklist.pop_front(); + if (!A) continue; + if (vis.find(A) != vis.end()) continue; + vis.insert(A); + if (MemoryUse *U = dyn_cast(A)) { + if (alias(U, MA)) res.insert(U); + } else if (MemoryDef *D = dyn_cast(A)) { + if (alias(D, MA)) { + res.insert(D); + } + worklist.push_back(D); + } else { + assert(isa(A)); + for (auto U = A->use_begin(); U != A->use_end(); ++U) { + worklist.push_back(cast(U->getUser())); + } + } + } + return res; +} + +//================== Affine Access =========================================================== + +//constructor of analysis result, immediately computes all necessary information +AffineAccess::AffineAccess( + Function &F, ScalarEvolution &SE, DominatorTree &DT, + LoopInfo &LI, MemorySSA &MSSA, AAResults &AA, + DependenceInfo &DI + ) : SE(SE), DT(DT), LI(LI), MSSA(MSSA), AA(AA), DI(DI), MD(MSSA, AA) +{ + for (Loop *L : LI.getTopLevelLoops()){ + auto all = analyze(L, ArrayRef()); + addAllConflicts(*all); + all.release(); + } +} + +//DFS over loop tree, constructs an AffAcc for each memory access and tries to promote it as far as possible +std::unique_ptr> AffineAccess::analyze(Loop *Parent, ArrayRef loopPath){ + LLVM_DEBUG(dbgs()<<"analyze: loop : "<getHeader()->getNameOrAsOperand()<<"\n"); + + //LoopRep for Parent + LoopRep *ParentLR = new LoopRep(Parent, loopPath, SE, DT); + 
reps.insert(std::make_pair(Parent, ParentLR)); //add Parent to LoopReps + + //prepare path + std::vector path; + path.push_back(Parent); //add Parent to path + for (auto *L : loopPath) path.push_back(L); + + //prepare results + auto all = std::make_unique>(); + auto &promoted = promotedAccesses.insert(std::make_pair(Parent, SmallVector())).first->getSecond(); + + //promote subloop accesses + for (Loop *L : Parent->getSubLoops()){ + std::unique_ptr> accs = analyze(L, ArrayRef(path)); + all->reserve(accs->size()); + LoopRep *LR = reps.find(L)->second; //guaranteed to exist, no check needed + bool canPromote = LR->isAvailable() && ParentLR->isAvailable() && LR->isOnAllCFPathsOfParentIfExecuted(); + for (AffAcc *A : *accs){ + all->push_back(A); + if (canPromote){ //L is well-formed and on all CF-paths if its rep is >0 at run-time + if (A->promote(ParentLR)){ + promoted.push_back(A); //guaranteed to exist + } + } + } + accs.release(); + } + + //promote accesses from this loop + for (BasicBlock *BB : Parent->getBlocks()){ + if (LI.getLoopFor(BB) != Parent) continue; //skip BB as it was already processed in a subloop + for (Instruction &I : *BB){ + MemoryUseOrDef *MA = MSSA.getMemoryAccess(&I); + if (MA && hasMemInst(MA) && access.find(MA) == access.end()){ //no AffAcc for this memory access yet! + Value *Addr = getAddress(MA); + const SCEV *AddrSCEV = nullptr; + if (Addr) AddrSCEV = SE.getSCEV(Addr); + AffAcc *A = new AffAcc(ArrayRef(&I), AddrSCEV, MA, ArrayRef(path), SE); + all->push_back(A); + access.insert(std::make_pair(MA, A)); + if (ParentLR->isAvailable()){ + bool onAllCFPaths = true; + for (Instruction *I : A->getAccesses()) onAllCFPaths &= isOnAllControlFlowPaths(I->getParent(), Parent, DT); + if (onAllCFPaths && A->promote(ParentLR)){ + promoted.push_back(A); //guaranteed to exist + } + } + } + } + } + + LLVM_DEBUG(dbgs()<<"analyze: done with loop: "<getHeader()->getNameOrAsOperand()<<"\n"); + + return all; +} + +//given the list of all AffAccs in a loop-tree, this finds all the conflicts between them +void AffineAccess::addAllConflicts(const std::vector &all) { + for (AffAcc *A : all) { + assert(A); + const Loop *outerMostExpandableExl = A->getDeepestMalformed(); + DenseSet c; + if (A->isWrite()){ + c = MD.findClobberUsers(cast(A->getMemoryAccess())); + } else { + c = MD.findClobbers(A->getMemoryAccess()); + } + for (MemoryUseOrDef *D : c) { + if (A->getMemoryAccess() == D || !hasMemInst(D)) continue; + auto p = access.find(D); + if (p == access.end()) continue; + AffAcc *B = p->second; + auto r = calcConflict(A, B); + if (r.first != AffAccConflict::NoConflict) A->addConflict(B, r.second, r.first); + updateOutermostExpandableExcl(outerMostExpandableExl, r.first, r.second, B->getDeepestMalformed()); + assert(!outerMostExpandableExl || outerMostExpandableExl->contains(A->getMemoryAccess()->getBlock())); + } + + ArrayRef loops = A->getContainingLoops(); + for (const Loop *L : loops) { + if (!L) continue; + if (L == outerMostExpandableExl) break; + if (!(!L || A->isWellFormed(L))){ + if (L) LLVM_DEBUG(L->dump()); + if (outerMostExpandableExl) LLVM_DEBUG(outerMostExpandableExl->dump()); + LLVM_DEBUG(A->dump()); + llvm_unreachable("this should not happen!"); + } + assert(!L || A->isWellFormed(L)); + auto p = expandableAccesses.find(L); + if (p == expandableAccesses.end()){ + p = expandableAccesses.insert(std::make_pair(L, SmallVector())).first; + } + p->getSecond().push_back(A); + } + } +} + +//classify conflict between Read and Write +AffAccConflict AffineAccess::calcRWConflict(AffAcc 
*Read, AffAcc *Write, const Loop *L) const {
+  assert(!Read->isWrite());
+  assert(Write->isWrite());
+  if (!L->contains(Read->getMemoryAccess()->getBlock()) || !L->contains(Write->getMemoryAccess()->getBlock())) return AffAccConflict::NoConflict;
+  if (!Read->isWellFormed(L) || !Write->isWellFormed(L)) return AffAccConflict::Bad;
+  MemoryUseOrDef *r = Read->getMemoryAccess();
+  MemoryUseOrDef *w = Write->getMemoryAccess();
+  Value *Addr = getAddress(r);
+  Value *DAddr = getAddress(w);
+  bool dominates = MSSA.dominates(r, w);
+  if (Addr && DAddr && AA.alias(Addr, DAddr) == NoAlias) return AffAccConflict::NoConflict;
+  AffAccConflict kind = AffAccConflict::Bad;
+  if (!dominates) { //the read does not dominate the write ==> R maybe after W
+    kind = AffAccConflict::MustNotIntersect;
+  } else { //the read dominates the write ==> W is after R
+    kind = AffAccConflict::MustNotIntersect;
+    //exception: we know that the store always happens to a position already read from, if the store is to the same address as the read (FIXME: CONSERVATIVE)
+    //but the steps need to be != 0 such that there is no dependence from one iteration to the next
+    bool nonzeroSteps = true;
+    unsigned dr = Read->loopToDimension(L);
+    unsigned dw = Write->loopToDimension(L);
+    while (Read->isWellFormed(dr) && Write->isWellFormed(dw)) {
+      nonzeroSteps &= SE.isKnownNonZero(Read->getStep(dr++)) && SE.isKnownNonZero(Write->getStep(dw++));
+    }
+    if ((Addr && DAddr && AA.alias(Addr, DAddr) == MustAlias && nonzeroSteps)
+        || (accessPatternsAndAddressesMatch(Read, Write, L) && nonzeroSteps))
+    {
+      kind = AffAccConflict::NoConflict;
+    }
+  }
+  return kind;
+}
+
+///returns the kind of conflict (and the innermost common loop) that A and B have, assuming there is some memory dependency
+///does not check for the memory dependency itself, for performance
+std::pair<AffAccConflict, const Loop *> AffineAccess::calcConflict(AffAcc *A, AffAcc *B) const {
+  assert((A->isWrite() || B->isWrite()) && "conflict between two reads ???");
+  const Loop *const innermostCommon = findFirstContaining(A->getContainingLoops(), B->getMemoryAccess()->getBlock());
+  if (!innermostCommon) return std::make_pair(AffAccConflict::NoConflict, innermostCommon);
+  if (!A->isWrite()) std::swap(A, B); //we know at least one of them is a write; swap so that it is A
+  AffAccConflict kind = AffAccConflict::Bad; //assume Bad at the beginning
+  if (A->isWellFormed(innermostCommon) && B->isWellFormed(innermostCommon)) {
+    if (B->isWrite()) kind = AffAccConflict::MustNotIntersect; //WaW
+    else kind = calcRWConflict(B, A, innermostCommon); //B is the read and A is the write
+  }
+  //at this point, even if the two only may alias, we assume the chance is high that they do at runtime
+  //if their base addresses share some SCEVUnknowns (ie. some Values) (FIXME: this is CONSERVATIVE)
+  if (kind == AffAccConflict::MustNotIntersect){
+    const Loop *L = innermostCommon->getParentLoop();
+    const Loop *Last = innermostCommon;
+    while (L && A->isWellFormed(L) && B->isWellFormed(L)) { //traverse up the loop-tree to the point where one of them is not well-formed anymore
+      Last = L;
+      L = L->getParentLoop();
+    }
+    if (shareValues(A->getBaseAddr(Last), B->getBaseAddr(Last))) kind = AffAccConflict::Bad;
+  }
+  return std::make_pair(kind, innermostCommon);
+}
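Concretely, the classification distinguishes cases like the following two (hypothetical C; the pass sees them after lowering to IR):

```c
// MustNotIntersect: x and y may overlap, so the streams are guarded by
// the run-time interval check sketched earlier.
void copy(int n, const double *x, double *y) {
  for (int i = 0; i < n; ++i) y[i] = x[i];
}

// NoConflict (the exception above): the read dominates the write, both
// use the same address pattern with a known non-zero step, so iteration i
// only overwrites data it has already read.
void scale(int n, double *a) {
  for (int i = 0; i < n; ++i) a[i] = 2.0 * a[i];
}
```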
some Value's) (FIXME: this is CONSERVATIVE) + if (kind == AffAccConflict::MustNotIntersect){ + const Loop *L = innermostCommon->getParentLoop(); + const Loop *Last = innermostCommon; + while (L && A->isWellFormed(L) && B->isWellFormed(L)) { //traverse up the loop-tree up to the point where one of them is not wellformed anymore + Last = L; + L = L->getParentLoop(); + } + if (shareValues(A->getBaseAddr(Last), B->getBaseAddr(Last))) kind = AffAccConflict::Bad; + } + return std::make_pair(kind, innermostCommon); +} + +//checks whether access patterns (step, rep) match up to some loop L +bool AffineAccess::accessPatternsMatch(const AffAcc *A, const AffAcc *B, const Loop *L) const { + unsigned dimA = A->loopToDimension(L); + unsigned dimB = B->loopToDimension(L); + if (dimA != dimB) return false; + for (unsigned i = 1u; i <= dimA; i++){ + if (A->getLoop(i) != B->getLoop(i)) return false; + if (!SCEVEquals(A->getRep(i), B->getRep(i), SE)) return false; + if (!SCEVEquals(A->getStep(i), B->getStep(i), SE)) return false; + } + return true; +} + +//checks whether step, rep, and base address matches up to some loop L +bool AffineAccess::accessPatternsAndAddressesMatch(const AffAcc *A, const AffAcc *B, const Loop *L) const { + if (!accessPatternsMatch(A, B, L)) return false; + return SCEVEquals(A->getBaseAddr(A->loopToDimension(L)), B->getBaseAddr(B->loopToDimension(L)), SE); +} + +//simple access methods +ScalarEvolution &AffineAccess::getSE() const { return this->SE; } +DominatorTree &AffineAccess::getDT()const { return this->DT; } +LoopInfo &AffineAccess::getLI() const { return this->LI; } +MemorySSA &AffineAccess::getMSSA() const { return this->MSSA; } +AAResults &AffineAccess::getAA() const { return this->AA; } +DependenceInfo &AffineAccess::getDI() const { return this->DI; } +SmallVector AffineAccess::getLoopsInPreorder() const { return this->LI.getLoopsInPreorder(); } + +//get accesses with no bad conflicts for some loop L +//guarantees: +// no bad conflicts with any other memory instruction in L +// is well formed for L +// if conflictFreeOnly: has no conflicts at all (only NoConflict) ==> no run-time checks necessary +std::vector AffineAccess::getExpandableAccesses(const Loop *L, bool conflictFreeOnly) { + auto p = expandableAccesses.find(L); + std::vector res; + if (p == expandableAccesses.end()) return res; + for (AffAcc *A : p->getSecond()){ + if (!conflictFreeOnly || A->getConflicts(L).empty()) res.push_back(A); + } + return res; +} + +// code-gen: calls code-gen for all AffAccs in list, +// generates run-time checks for conflicts, +// generates run-time checks for loop-trip-counts (if repChecks = true) +// ANDs all the rt-checks to a single Value and writes it into BoundCheck +std::vector +AffineAccess::expandAllAt(ArrayRef Accs, const Loop *L, + Instruction *Point, Value *&BoundCheck, + Type *PtrTy, IntegerType *ParamTy, bool conflictChecks, bool repChecks) +{ + assert(Point); + IRBuilder<> builder(Point); + + DenseMap exps; + for (AffAcc *A : Accs) { //expand the requested AffAcc's + exps.insert(std::make_pair(A, std::move(A->expandAt(L, Point, PtrTy, ParamTy)))); + } + + std::vector checks; + if (conflictChecks) { + DenseSet done; //keep track of which were done to not make duplicate checks + for (AffAcc *A : Accs) { + auto conflicts = A->getConflicts(L); //get all AffAcc's with which A conflicts + for (const auto &p : conflicts) { + AffAcc *B = p.first; + if (done.find(B) != done.end()) continue; //this conflict was already handled when A was B (symmetry) + AffAccConflict kind = 
std::max(p.second, B->getConflict(A, L)); //take worse conflict + switch (kind) + { + case AffAccConflict::NoConflict: + break; //nothing to do + case AffAccConflict::MustNotIntersect: { + auto e = exps.find(B); + if (e == exps.end()) { //if B was not yet expanded, do that and update the iterator for the pair in exps + e = exps.insert(std::make_pair(B, std::move(B->expandAt(L, Point, PtrTy, ParamTy)))).first; + } + assert(e->first == B); + ExpandedAffAcc &expB = e->getSecond(); + ExpandedAffAcc &expA = exps.find(A)->getSecond(); //guaranteed to exist + Value *x = builder.CreateICmpULT(expA.UpperBound, expB.LowerBound, "no.inter.ab"); + Value *y = builder.CreateICmpULT(expB.UpperBound, expA.LowerBound, "no.inter.ba"); + checks.push_back(builder.CreateOr(x, y, "no.intersect")); + break; + } + case AffAccConflict::Bad: + llvm_unreachable("cannot expand the given accesses because some of them have a bad conflict in L!"); + break; + default: + llvm_unreachable("unknown conflict type"); + } + } + } + } + + if (repChecks) { + DenseSet loops; //find all relevant loops + for (auto &p : exps) { + AffAcc *A = p.first; + for (unsigned d = 0u; d < A->loopToDimension(L); d++) { + const Loop *x = A->getLoop(d); + if (x) loops.insert(x); + } + } + for (const Loop *M : loops) { //generate checks for the loops + auto p = reps.find(M); + assert(p != reps.end()); + checks.push_back(p->second->expandLoopGuard(Point)); + } + } + + if (checks.empty()) BoundCheck = builder.getTrue(); + else BoundCheck = builder.CreateAnd(checks); + + std::vector res; + for (AffAcc *A : Accs) { + res.push_back(std::move(exps.find(A)->getSecond())); //(can move because exps not needed anymore) + } + return res; +} + +//================== Affine Access Analysis ================================================== + +AnalysisKey AffineAccessAnalysis::Key; + +// run of the analysis pass +AffineAccess AffineAccessAnalysis::run(Function &F, FunctionAnalysisManager &FAM) { + + LLVM_DEBUG(dbgs()<<"running AffineAccessAnalysis on "<(F); + DominatorTree &DT = FAM.getResult(F); + ScalarEvolution &SE = FAM.getResult(F); + auto &MSSAA = FAM.getResult(F); + MemorySSA &MSSA = MSSAA.getMSSA(); + AAResults &AA = FAM.getResult(F); + DependenceInfo &DI = FAM.getResult(F); + + return AffineAccess(F, SE, DT, LI, MSSA, AA, DI); +} + +//================== Affine Acces Analysis Pass for opt ======================================= +PreservedAnalyses AffineAccessAnalysisPass::run(Function &F, FunctionAnalysisManager &FAM) { + AffineAccess AA = FAM.getResult(F); + for (const Loop *L : AA.getLI().getLoopsInPreorder()){ + L->dump(); + for (const AffAcc *A : AA.getExpandableAccesses(L)){ + A->dumpInLoop(L); + } + } + return PreservedAnalyses::all(); +} + diff --git a/llvm/lib/Analysis/CMakeLists.txt b/llvm/lib/Analysis/CMakeLists.txt index f31cf349b09aa..887b2176fe730 100644 --- a/llvm/lib/Analysis/CMakeLists.txt +++ b/llvm/lib/Analysis/CMakeLists.txt @@ -14,6 +14,7 @@ if (DEFINED LLVM_HAVE_TF_AOT OR DEFINED LLVM_HAVE_TF_API) endif() add_llvm_component_library(LLVMAnalysis + AffineAccessAnalysis.cpp AliasAnalysis.cpp AliasAnalysisEvaluator.cpp AliasAnalysisSummary.cpp diff --git a/llvm/lib/CodeGen/PostRASchedulerList.cpp b/llvm/lib/CodeGen/PostRASchedulerList.cpp index b85f00a61eac1..157ee3f183726 100644 --- a/llvm/lib/CodeGen/PostRASchedulerList.cpp +++ b/llvm/lib/CodeGen/PostRASchedulerList.cpp @@ -268,10 +268,12 @@ bool PostRAScheduler::enablePostRAScheduler( TargetSubtargetInfo::RegClassVector &CriticalPathRCs) const { Mode = ST.getAntiDepBreakMode(); 
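/* For reference: the MustNotIntersect case in AffineAccess::expandAllAt above
   boils down to a standard interval-overlap test over the expanded lower and
   upper address bounds of the two accesses. A minimal self-contained sketch
   (the function and value names here are illustrative, not part of this patch):

   // Returns an i1 that is true iff the address ranges [ALo, AHi] and
   // [BLo, BHi] of two affine accesses are disjoint.
   static Value *emitNoIntersectCheck(IRBuilder<> &Builder, Value *ALo,
                                      Value *AHi, Value *BLo, Value *BHi) {
     Value *X = Builder.CreateICmpULT(AHi, BLo, "no.inter.ab"); // A ends below B
     Value *Y = Builder.CreateICmpULT(BHi, ALo, "no.inter.ba"); // B ends below A
     return Builder.CreateOr(X, Y, "no.intersect"); // disjoint in either order
   }
*/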
  ST.getCriticalPathRCs(CriticalPathRCs);
 
-  // Check for explicit enable/disable of post-ra scheduling.
-  if (EnablePostRAScheduler.getPosition() > 0)
+  if (EnablePostRAScheduler.getPosition() > 0) {
     return EnablePostRAScheduler;
+  }
+
+  // return true; //FIXME: Snitch does not enable this by default (and probably should)
 
   return ST.enablePostRAScheduler() &&
          OptLevel >= ST.getOptLevelToEnablePostRAScheduler();
@@ -291,7 +293,6 @@ bool PostRAScheduler::runOnMachineFunction(MachineFunction &Fn) {
   TargetSubtargetInfo::AntiDepBreakMode AntiDepMode =
       TargetSubtargetInfo::ANTIDEP_NONE;
   SmallVector<const TargetRegisterClass *, 4> CriticalPathRCs;
-  // Check that post-RA scheduling is enabled for this target.
   // This may upgrade the AntiDepMode.
   if (!enablePostRAScheduler(Fn.getSubtarget(), PassConfig->getOptLevel(),
@@ -307,11 +308,13 @@ bool PostRAScheduler::runOnMachineFunction(MachineFunction &Fn) {
         : TargetSubtargetInfo::ANTIDEP_NONE);
   }
 
+  // AntiDepMode = TargetSubtargetInfo::ANTIDEP_ALL; //FIXME: Snitch does not enable this by default (and probably should)
+
   LLVM_DEBUG(dbgs() << "PostRAScheduler\n");
 
   SchedulePostRATDList Scheduler(Fn, MLI, AA, RegClassInfo, AntiDepMode,
                                  CriticalPathRCs);
-
+  
   // Loop over all of the basic blocks
   for (auto &MBB : Fn) {
 #ifndef NDEBUG
diff --git a/llvm/lib/Passes/CMakeLists.txt b/llvm/lib/Passes/CMakeLists.txt
index d834c0db4b458..fa0efb387353a 100644
--- a/llvm/lib/Passes/CMakeLists.txt
+++ b/llvm/lib/Passes/CMakeLists.txt
@@ -16,6 +16,7 @@ add_llvm_component_library(LLVMPasses
   Core
   Coroutines
   HelloNew
+  SSR
   IPO
   InstCombine
   ObjCARC
diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp
index 6c1a7c75d30a2..0680b98465a4b 100644
--- a/llvm/lib/Passes/PassBuilder.cpp
+++ b/llvm/lib/Passes/PassBuilder.cpp
@@ -16,6 +16,7 @@
 #include "llvm/Passes/PassBuilder.h"
 #include "llvm/ADT/StringSwitch.h"
+#include "llvm/Analysis/AffineAccessAnalysis.h"
 #include "llvm/Analysis/AliasAnalysisEvaluator.h"
 #include "llvm/Analysis/AliasSetTracker.h"
 #include "llvm/Analysis/AssumptionCache.h"
@@ -85,6 +86,8 @@
 #include "llvm/Transforms/Coroutines/CoroElide.h"
 #include "llvm/Transforms/Coroutines/CoroSplit.h"
 #include "llvm/Transforms/HelloNew/HelloWorld.h"
+#include "llvm/Transforms/SSR/SSRInference.h"
+#include "llvm/Transforms/SSR/SSRGeneration.h"
 #include "llvm/Transforms/IPO/AlwaysInliner.h"
 #include "llvm/Transforms/IPO/Annotation2Metadata.h"
 #include "llvm/Transforms/IPO/ArgumentPromotion.h"
@@ -518,7 +521,6 @@ PassBuilder::buildO1FunctionSimplificationPipeline(OptimizationLevel Level,
                                                    ThinOrFullLTOPhase Phase) {
   FunctionPassManager FPM(DebugLogging);
-
   // Form SSA out of local memory accesses after breaking apart aggregates into
   // scalars.
   FPM.addPass(SROA());
@@ -555,6 +557,7 @@ PassBuilder::buildO1FunctionSimplificationPipeline(OptimizationLevel Level,
   // Simplify the loop body. We do this initially to clean up after other loop
   // passes run, either when iterating on a loop or on inner loops with
   // implications on the outer loop.
+
   LPM1.addPass(LoopInstSimplifyPass());
   LPM1.addPass(LoopSimplifyCFGPass());
 
@@ -593,6 +596,7 @@ PassBuilder::buildO1FunctionSimplificationPipeline(OptimizationLevel Level,
                                               DebugLogging));
   FPM.addPass(SimplifyCFGPass());
   FPM.addPass(InstCombinePass());
+
   if (EnableLoopFlatten) FPM.addPass(LoopFlattenPass());
 
   // The loop passes in LPM2 (LoopFullUnrollPass) do not preserve MemorySSA.
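The hunks below place `SSRInferencePass` in the function simplification pipeline right after `InstCombinePass` and register the new passes in `PassRegistry.def` under the names `infer-ssr`, `generate-ssr`, and `affine-access-pass`. A minimal sketch of driving the inference standalone through the new pass manager (this helper is illustrative and not part of the patch):

```cpp
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Transforms/InstCombine/InstCombine.h"
#include "llvm/Transforms/SSR/SSRInference.h"
using namespace llvm;

// Run "instcombine + infer-ssr" on a single function, mirroring the position
// that buildFunctionSimplificationPipeline gives to SSRInferencePass.
static void runSSRInference(Function &F) {
  PassBuilder PB;
  LoopAnalysisManager LAM;
  FunctionAnalysisManager FAM;
  CGSCCAnalysisManager CGAM;
  ModuleAnalysisManager MAM;
  PB.registerModuleAnalyses(MAM);
  PB.registerCGSCCAnalyses(CGAM);
  PB.registerFunctionAnalyses(FAM); // also registers AffineAccessAnalysis
  PB.registerLoopAnalyses(LAM);
  PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);

  FunctionPassManager FPM;
  FPM.addPass(InstCombinePass());
  FPM.addPass(SSRInferencePass()); // infer SSR streams on the simplified IR
  FPM.run(F, FAM);
}
```

The same pipeline position can be exercised from the command line with `opt -passes='function(instcombine,infer-ssr)'`.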
@@ -643,7 +647,6 @@ FunctionPassManager PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level, ThinOrFullLTOPhase Phase) { assert(Level != OptimizationLevel::O0 && "Must request optimizations!"); - // The O1 pipeline has a separate pipeline creation function to simplify // construction readability. if (Level.getSpeedupLevel() == 1) @@ -755,8 +758,11 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level, FPM.addPass(createFunctionToLoopPassAdaptor( std::move(LPM1), EnableMSSALoopDependency, /*UseBlockFrequencyInfo=*/true, DebugLogging)); + FPM.addPass(SimplifyCFGPass()); FPM.addPass(InstCombinePass()); + FPM.addPass(SSRInferencePass()); + if (EnableLoopFlatten) FPM.addPass(LoopFlattenPass()); // The loop passes in LPM2 (LoopIdiomRecognizePass, IndVarSimplifyPass, @@ -793,7 +799,7 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level, // opportunities opened up by them. FPM.addPass(InstCombinePass()); invokePeepholeEPCallbacks(FPM, Level); - + // Re-consider control flow based optimizations after redundancy elimination, // redo DCE, etc. FPM.addPass(JumpThreadingPass()); @@ -988,6 +994,8 @@ PassBuilder::buildInlinerPipeline(OptimizationLevel Level, MainCGPipeline.addPass(createCGSCCToFunctionPassAdaptor( buildFunctionSimplificationPipeline(Level, Phase))); + //MainCGPipeline.addPass(createCGSCCToFunctionPassAdaptor(SSRInferencePass())); + return MIWP; } diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def index 877cb9ed13b37..cb16781e4ecd2 100644 --- a/llvm/lib/Passes/PassRegistry.def +++ b/llvm/lib/Passes/PassRegistry.def @@ -146,6 +146,7 @@ CGSCC_PASS("no-op-cgscc", NoOpCGSCCPass()) #ifndef FUNCTION_ANALYSIS #define FUNCTION_ANALYSIS(NAME, CREATE_PASS) #endif +FUNCTION_ANALYSIS("affine-access", AffineAccessAnalysis()) FUNCTION_ANALYSIS("aa", AAManager()) FUNCTION_ANALYSIS("assumptions", AssumptionAnalysis()) FUNCTION_ANALYSIS("block-freq", BlockFrequencyAnalysis()) @@ -190,6 +191,9 @@ FUNCTION_ALIAS_ANALYSIS("tbaa", TypeBasedAA()) #ifndef FUNCTION_PASS #define FUNCTION_PASS(NAME, CREATE_PASS) #endif +FUNCTION_PASS("infer-ssr", SSRInferencePass()) +FUNCTION_PASS("generate-ssr", SSRGenerationPass()) +FUNCTION_PASS("affine-access-pass", AffineAccessAnalysisPass()) FUNCTION_PASS("aa-eval", AAEvaluator()) FUNCTION_PASS("adce", ADCEPass()) FUNCTION_PASS("add-discriminators", AddDiscriminatorsPass()) diff --git a/llvm/lib/Target/RISCV/CMakeLists.txt b/llvm/lib/Target/RISCV/CMakeLists.txt index c822929f94770..2dbd3f5e9be12 100644 --- a/llvm/lib/Target/RISCV/CMakeLists.txt +++ b/llvm/lib/Target/RISCV/CMakeLists.txt @@ -27,6 +27,7 @@ add_llvm_target(RISCVCodeGen RISCVExpandAtomicPseudoInsts.cpp RISCVExpandPseudoInsts.cpp RISCVExpandSSRInsts.cpp + RISCVExpandSSRInstsPostRegAlloc.cpp RISCVExpandSDMAInsts.cpp RISCVFrameLowering.cpp RISCVInstrInfo.cpp @@ -42,7 +43,10 @@ add_llvm_target(RISCVCodeGen RISCVTargetMachine.cpp RISCVTargetObjectFile.cpp RISCVTargetTransformInfo.cpp + RISCVSSRReassociate.cpp + RISCVSSRStatistics.cpp Snitch/SNITCHFrepLoops.cpp + Snitch/SNITCHAutoFrep.cpp LINK_COMPONENTS Analysis diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp index 5f8d6e1375187..6ba9c6901c8e8 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp @@ -86,6 +86,7 @@ void RISCVInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, 
                                    raw_ostream &O, const char *Modifier) {
   assert((Modifier == 0 || Modifier[0] == 0) && "No modifiers supported");
+
   const MCOperand &MO = MI->getOperand(OpNo);
 
   if (MO.isReg()) {
diff --git a/llvm/lib/Target/RISCV/RISCV.h b/llvm/lib/Target/RISCV/RISCV.h
index 2cd960d7587d8..f731a4b7c1fbb 100644
--- a/llvm/lib/Target/RISCV/RISCV.h
+++ b/llvm/lib/Target/RISCV/RISCV.h
@@ -45,6 +45,18 @@ void initializeRISCVMergeBaseOffsetOptPass(PassRegistry &);
 FunctionPass *createRISCVExpandPseudoPass();
 void initializeRISCVExpandPseudoPass(PassRegistry &);
 
+FunctionPass *createRISCVExpandSSRPostRegAllocPass();
+void initializeRISCVExpandSSRPostRegAllocPass(PassRegistry &);
+
+FunctionPass *createSNITCHAutoFrepPass();
+void initializeSNITCHAutoFrepPass(PassRegistry &);
+
+FunctionPass *createSSRReassociatePass();
+void initializeSSRReassociatePass(PassRegistry &);
+
+FunctionPass *createSSRStatisticsPass();
+void initializeSSRStatisticsPass(PassRegistry &);
+
 FunctionPass *createRISCVExpandAtomicPseudoPass();
 void initializeRISCVExpandAtomicPseudoPass(PassRegistry &);
 
@@ -54,6 +66,8 @@ void initializeRISCVCleanupVSETVLIPass(PassRegistry &);
 FunctionPass *createRISCVExpandSSRPass();
 void initializeRISCVExpandSSRPass(PassRegistry &);
 
+//TODO: reference the function pass for automatic SSR inference here (+ add it to CMakeLists.txt)
+
 FunctionPass *createRISCVExpandSDMAPass();
 void initializeRISCVExpandSDMAPass(PassRegistry &);
diff --git a/llvm/lib/Target/RISCV/RISCVExpandSSRInsts.cpp b/llvm/lib/Target/RISCV/RISCVExpandSSRInsts.cpp
index a90f199b6b1d6..93156b2315546 100644
--- a/llvm/lib/Target/RISCV/RISCVExpandSSRInsts.cpp
+++ b/llvm/lib/Target/RISCV/RISCVExpandSSRInsts.cpp
@@ -62,11 +62,6 @@ using namespace llvm;
 
 #define DEBUG_TYPE "riscv-ssr"
 
-/// Command line options
-static cl::opt<bool>
-    SSRRegisterMerge("ssr-noregmerge", cl::Hidden,
-                     cl::desc("Disable the merging of SSR registers in other instructions"));
-
 #define RISCV_EXPAND_SSR_NAME "RISCV SSR pseudo instruction expansion pass"
 
 #define NUM_SSR 3
@@ -78,12 +73,6 @@ class RISCVExpandSSR : public MachineFunctionPass {
   const RISCVInstrInfo *TII;
   static char ID;
 
-  /// Parameters for the register merging pass
-  struct RegisterMergingPreferences {
-    /// enable the register merging
-    bool Enable;
-  };
-
   RISCVExpandSSR() : MachineFunctionPass(ID) {
     initializeRISCVExpandSSRPass(*PassRegistry::getPassRegistry());
   }
@@ -96,10 +85,10 @@ class RISCVExpandSSR : public MachineFunctionPass {
   const MachineFunction *MF;
   RISCVMachineFunctionInfo *RVFI;
-  bool Enabled;
+  std::vector<MachineInstr *> MoveLoads;
+  std::vector<MachineInstr *> MoveStores;
 
   bool expandMBB(MachineBasicBlock &MBB);
-  void mergePushPop(MachineBasicBlock &MBB);
   bool expandMI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
                 MachineBasicBlock::iterator &NextMBBI);
   bool expandSSR_Setup(MachineBasicBlock &MBB,
@@ -119,8 +108,7 @@ class RISCVExpandSSR : public MachineFunctionPass {
   bool expandSSR_Barrier(MachineBasicBlock &MBB,
                          MachineBasicBlock::iterator MBBI,
                          MachineBasicBlock::iterator &NextMBBI);
-
-  RISCVExpandSSR::RegisterMergingPreferences gatherRegisterMergingPreferences();
+  void handlePushPops();
 };
 
 char RISCVExpandSSR::ID = 0;
@@ -140,17 +128,14 @@ bool RISCVExpandSSR::runOnMachineFunction(MachineFunction &MF) {
   TII = static_cast<const RISCVInstrInfo *>(MF.getSubtarget().getInstrInfo());
   this->MF = &MF;
   this->RVFI = MF.getInfo<RISCVMachineFunctionInfo>();
-  Enabled = false;
+  this->MoveLoads.clear();
+  this->MoveStores.clear();
 
   bool Modified = false;
   for (auto &MBB : MF) Modified |= expandMBB(MBB);
 
-  // Run over MF again to merge SSR pops/pushes into instruction uses
-  RISCVExpandSSR::RegisterMergingPreferences RMP = gatherRegisterMergingPreferences();
-  if(RMP.Enable && RVFI->getUsedSSR())
-    for (auto &MBB : MF)
-      mergePushPop(MBB);
+  handlePushPops();
 
   /// "Forcefully" add all SSR registers as live-in to all MBB in this MF
   if(Modified) {
@@ -161,6 +146,10 @@ bool RISCVExpandSSR::runOnMachineFunction(MachineFunction &MF) {
     }
   }
 
+  //errs()<<"\n ========================= DUMP MF ========================== \n";
+  //MF.dump();
+  //errs()<<"\n ======================= END DUMP MF ========================== \n";
+
   return Modified;
 }
@@ -211,17 +200,6 @@ bool RISCVExpandSSR::expandMI(MachineBasicBlock &MBB,
     return expandSSR_Barrier(MBB, MBBI, NextMBBI);
   }
 
-  // Prevent excessive live-ins, they pose a problem with multiple SSR regions
-  // in a single function. Adding SSR regs to live ins in push/pop should suffice
-  // for now, but there might be edge cases
-
-  // if(Enabled) {
-  //   // mark the SSR registers reserved in this BB
-  //   unsigned ssrEnabledMask = 0;
-  //   for (unsigned n = 0; n < NUM_SSR; ++n)
-  //     MBB.addLiveIn(getSSRFtReg(n));
-  // }
-
   return false;
 }
@@ -272,30 +250,25 @@ bool RISCVExpandSSR::expandSSR_PushPop(MachineBasicBlock &MBB,
   LLVM_DEBUG(dbgs() << "-- Expanding SSR " << (isPop?"Pop":"Push") << "\n");
   LLVM_DEBUG(dbgs() << "   Using register " << R << " for SSR streamer " << ssr_no << "\n");
 
   if(isPop) {
-    BuildMI(MBB, MBBI, DL, TII->get(RISCV::FSGNJ_D), MBBI->getOperand(ssrValIdx).getReg())
-        .addReg(R, 0)
-        .addReg(R, 0);
-    // BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::COPY), MBBI->getOperand(ssrValIdx).getReg())
-    //     .addReg(R, 0);
+    // Insert a "loading move": like a normal move, but with side effects
+    Register valR = MBBI->getOperand(ssrValIdx).getReg();
+    MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII->get(RISCV::PseudoLoadMove), valR).addReg(R, 0).getInstr();
+    MBBI->eraseFromParent(); // The pseudo instruction is gone now.
+    MI->getOperand(0).setIsDef();
+    this->MoveLoads.push_back(MI);
   } else {
-    // Build a copy instruction that moves the value from the register passed as
-    // argument to the ssr data register (R)
-    BuildMI(MBB, MBBI, DL, TII->get(RISCV::FSGNJ_D), R)
-        .addReg(MBBI->getOperand(ssrValIdx).getReg())
-        .addReg(MBBI->getOperand(ssrValIdx).getReg());
-    // BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::COPY), R)
-    //     .addReg(MBBI->getOperand(ssrValIdx).getReg());
+    Register valR = MBBI->getOperand(ssrValIdx).getReg();
+    // Insert a "storing move": like a normal move, but with side effects
+    MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII->get(RISCV::PseudoStoreMove), R)
+                           .addReg(valR, getRegState(MBBI->getOperand(ssrValIdx)))
+                           .getInstr();
+    MBBI->eraseFromParent(); // The pseudo instruction is gone now.
+    this->MoveStores.push_back(MI);
   }
 
   MBB.addLiveIn(R);
-  MBBI->eraseFromParent(); // The pseudo instruction is gone now.
 
   return true;
 }
@@ -411,7 +384,6 @@ bool RISCVExpandSSR::expandSSR_EnDis(MachineBasicBlock &MBB,
   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
 
   LLVM_DEBUG(dbgs() << "-- Expanding SSR " << (isEnable ? 
"Enable" : "Disable") << "\n"); - Enabled = isEnable; // emit a csrsi/csrci call to the SSR location if(isEnable) { @@ -479,96 +451,9 @@ bool RISCVExpandSSR::expandSSR_Barrier(MachineBasicBlock &MBB, return true; } -void RISCVExpandSSR::mergePushPop(MachineBasicBlock &MBB) { - SmallSet virtRegs[NUM_SSR]; - const TargetRegisterInfo *TRI = MBB.getParent()->getRegInfo().getTargetRegisterInfo(); - bool inSSRRegion = false; - - Register ssr_regs[NUM_SSR]; - for(unsigned ssr_no = 0; ssr_no < NUM_SSR; ++ssr_no) ssr_regs[ssr_no] = getSSRFtReg(ssr_no); - - // First pass: Detect moves to or from SSR registers - for (auto MI = MBB.begin() ; MI != MBB.end() ; ) { - MachineBasicBlock::iterator NMI = std::next(MI); - - LLVM_DEBUG(dbgs()<<"Analyzing: "<<*MI<<"\n"); - - // detect an emitted pop and add assignment (virtual_reg, ssr_read) to list - if(MI->getOpcode() == RISCV::FSGNJ_D) { - LLVM_DEBUG(dbgs()<<"Found FSGNJ_D, Op 0: " << MI->getOperand(1).getReg() << " Op1: " << MI->getOperand(2).getReg() << "\n"); - - // look for each streamer register - for(unsigned ssr_no = 0; ssr_no < NUM_SSR; ++ssr_no) { - // check for pop - if(MI->getOperand(1).getReg() == ssr_regs[ssr_no] && MI->getOperand(2).getReg() == ssr_regs[ssr_no]) { - LLVM_DEBUG(dbgs()<<" pop: both operands from SSR"<< ssr_no <<"\n"); - // append virtual register to list of assigned virtuals - LLVM_DEBUG(dbgs()<<" append: "<< MI->getOperand(0).getReg() <<"\n"); - virtRegs[ssr_no].insert(MI->getOperand(0).getReg()); - // remove operation - MI->eraseFromParent(); - break; - } - // TODO: check for push - else if(MI->getOperand(0).getReg() == ssr_regs[ssr_no]) { - // This is non-trivial because a register might be used elsewhere, therefore the entire MBB - // must be analyzed and a merge can only be made, if the register is written once - // LLVM_DEBUG(dbgs()<<" push: operand 0 from SSR"<< ssr_no <<"\n"); - // // append virtual register to list of assigned virtuals - // LLVM_DEBUG(dbgs()<<" append: "<< MI->getOperand(1).getReg() <<"\n"); - // virtRegs[ssr_no].insert(MI->getOperand(1).getReg()); - // // remove operation - // MI->eraseFromParent(); - break; - } - } - } - MI = NMI; - } - - // DBG - for(unsigned ssr_no = 0; ssr_no < NUM_SSR; ++ssr_no) { - for (auto iter = virtRegs[ssr_no].begin() ; iter != virtRegs[ssr_no].end() ; ++iter) - LLVM_DEBUG(dbgs() << "virtregs["<operands_begin() ; operand != MI->operands_end() ; ++operand) { - if(!operand->isReg()) continue; - // check if operand is in any SSR list - for(unsigned ssr_no = 0; ssr_no < NUM_SSR; ++ssr_no) { - if(virtRegs[ssr_no].contains(operand->getReg())) { - LLVM_DEBUG(dbgs() << "Found use of operand " << operand->getReg() << " ssr: " << ssr_no << " in inst " << MI->getOpcode() << "\n"); - // substitute with SSR register - MI->substituteRegister(operand->getReg(), ssr_regs[ssr_no], 0, *TRI); - // guard this block and add ssr regs to live in - MBB.addLiveIn(ssr_regs[ssr_no]); - } - } - } - MI = NMI; - } - MBB.sortUniqueLiveIns(); -} - -/// Gather parameters for the register merging -RISCVExpandSSR::RegisterMergingPreferences RISCVExpandSSR::gatherRegisterMergingPreferences() { - RISCVExpandSSR::RegisterMergingPreferences RMP; - - // set up defaults - RMP.Enable = true; - - // read user - if (SSRRegisterMerge.getNumOccurrences() > 0) - RMP.Enable = !SSRRegisterMerge; - - LLVM_DEBUG(dbgs() << "RMP Enable "<> bundles; + //pops: + for (MachineInstr *MI : this->MoveLoads) { + if (!MI) continue; + MachineInstr *SingleUser = getUniqueUser(MI, MI->getOperand(0).getReg()); + if (SingleUser && 
SingleUser->getParent() == MI->getParent()) { + MI->moveBefore(SingleUser); //we pray that there was no reordering until now that moved SingleUser after the SSRDisable + auto b = bundles.find(SingleUser); + if (b == bundles.end()) { + b = bundles.insert(std::make_pair(SingleUser, std::make_pair(SingleUser, SingleUser))).first; + } + if (b->getSecond().first == SingleUser) b->getSecond().first = MI; //if begin of bundle was SingleUser, set to MI + } + } + //pushs: FIXME: currently only works if the defining instruction is pred of MoveStore (how to get def from MachineOperand ???) + for (MachineInstr *MI : this->MoveStores) { + Register valR = MI->getOperand(1).getReg(); + MachineInstr *Pred = MI->getPrevNode(); + bool doesDefvalR = false; + for (auto &MOP : Pred->defs()) doesDefvalR |= MOP.isReg() && MOP.getReg() == valR; + if (doesDefvalR && MI == getUniqueUser(Pred, valR)) { + auto b = bundles.find(Pred); + if (b == bundles.end()) { + b = bundles.insert(std::make_pair(Pred, std::make_pair(Pred, Pred))).first; + } + if (b->getSecond().second == Pred) b->getSecond().second = MI; + } + }*/ \ No newline at end of file diff --git a/llvm/lib/Target/RISCV/RISCVExpandSSRInstsPostRegAlloc.cpp b/llvm/lib/Target/RISCV/RISCVExpandSSRInstsPostRegAlloc.cpp new file mode 100644 index 0000000000000..2297f189fc1a3 --- /dev/null +++ b/llvm/lib/Target/RISCV/RISCVExpandSSRInstsPostRegAlloc.cpp @@ -0,0 +1,476 @@ +//===-- RISCVExpandSSRPostRegAllocInsts.cpp - Expand the rest of the SSR pseudo insts ---------===// +// +// ??? +// +//===----------------------------------------------------------------------===// +// +// This file contains a pass that expands the PseudoLoadMove and PseudoStoreMove +// into normal moves and is meant to be run after any scheduling to guarantee +// correctness. 
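+//
+// For example (illustrative, not taken from real compiler output): a pop from
+// data mover 0 that is still abstract after the pre-RA expansion,
+//
+//   $fa0 = PseudoLoadMove $ft0     (read one element from stream 0)
+//
+// becomes an ordinary FP sign-injection move from the SSR data register,
+//
+//   fsgnj.d fa0, ft0, ft0          (equivalent to fmv.d fa0, ft0)
+//
+// and the register merging below (disabled with -ssr-no-regmerge) then tries
+// to fold even that move away by rewriting its unique user to read ft0
+// directly.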
+//
+//===----------------------------------------------------------------------===//
+
+#include "RISCV.h"
+#include "RISCVInstrInfo.h"
+#include "RISCVTargetMachine.h"
+#include "RISCVMachineFunctionInfo.h"
+
+#include "llvm/CodeGen/LivePhysRegs.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/LowLevelType.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/Support/CommandLine.h"
+
+#include "llvm/CodeGen/RegisterClassInfo.h"
+#include "llvm/CodeGen/AntiDepBreaker.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "riscv-ssr"
+
+namespace llvm {
+  /// Command line options
+  cl::opt<bool> SSRNoRegisterMerge("ssr-no-regmerge", cl::init(false),
+    cl::desc("Disable the merging of SSR registers in other instructions"));
+}
+
+#define RISCV_EXPAND_SSR_POST_REG_ALLOC_NAME "RISCV SSR pseudo instruction expansion pass post reg alloc"
+
+#define NUM_SSR 3
+
+namespace {
+
+class RISCVExpandSSRPostRegAlloc : public MachineFunctionPass {
+public:
+  const RISCVInstrInfo *TII;
+  static char ID;
+
+  RISCVExpandSSRPostRegAlloc() : MachineFunctionPass(ID) {
+    initializeRISCVExpandSSRPostRegAllocPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  StringRef getPassName() const override { return RISCV_EXPAND_SSR_POST_REG_ALLOC_NAME; }
+
+private:
+  bool expandMBB(MachineBasicBlock &MBB);
+  bool mergePushPop(MachineBasicBlock &MBB);
+  bool expandMI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+                MachineBasicBlock::iterator &NextMBBI);
+  bool expandSSR_StoreLoadMove(MachineBasicBlock &MBB,
+                               MachineBasicBlock::iterator MBBI);
+};
+
+char RISCVExpandSSRPostRegAlloc::ID = 0;
+
+//from RISCVExpandSSRInsts.cpp
+static Register getSSRFtReg(unsigned streamer) {
+  unsigned AssignedReg = RISCV::F0_D + streamer;
+  // Advance the iterator to the assigned register until the valid
+  // register is found
+  const TargetRegisterClass *RC = &RISCV::FPR64RegClass;
+  TargetRegisterClass::iterator I = RC->begin();
+  for (; *I != AssignedReg; ++I)
+    assert(I != RC->end() && "AssignedReg should be a member of provided RC");
+  return Register(*I);
+}
+
+bool RISCVExpandSSRPostRegAlloc::runOnMachineFunction(MachineFunction &MF) {
+  TII = static_cast<const RISCVInstrInfo *>(MF.getSubtarget().getInstrInfo());
+
+  bool Modified = false;
+  for (auto &MBB : MF) Modified |= expandMBB(MBB);
+
+  if (SSRNoRegisterMerge) LLVM_DEBUG(dbgs()<<"regmerge disabled\n");
+  if (!SSRNoRegisterMerge && Modified){
+    for (auto &MBB : MF) mergePushPop(MBB);
+  }
+
+  return Modified;
+}
+
+bool RISCVExpandSSRPostRegAlloc::expandMBB(MachineBasicBlock &MBB) {
+  bool Modified = false;
+
+  MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
+  while (MBBI != E) {
+    MachineBasicBlock::iterator NMBBI = std::next(MBBI);
+    Modified |= expandMI(MBB, MBBI, NMBBI);
+    MBBI = NMBBI;
+  }
+  MBB.sortUniqueLiveIns();
+
+  return Modified;
+}
+
+bool RISCVExpandSSRPostRegAlloc::expandMI(MachineBasicBlock &MBB,
+                                          MachineBasicBlock::iterator MBBI,
+                                          MachineBasicBlock::iterator &NextMBBI) {
+  switch (MBBI->getOpcode()) {
+  case RISCV::PseudoStoreMove:
+  case RISCV::PseudoLoadMove:
+    return expandSSR_StoreLoadMove(MBB, MBBI);
+  default:
+    return false;
+  }
+}
+
+bool RISCVExpandSSRPostRegAlloc::expandSSR_StoreLoadMove(MachineBasicBlock &MBB,
+                                                         MachineBasicBlock::iterator MBBI) {
+  DebugLoc DL = MBBI->getDebugLoc();
+
+  // Operand 0 is the register the pseudo defines (the move destination);
+  // operand 1 is the register it reads (the move source).
+  Register DstReg = MBBI->getOperand(0).getReg();
+  Register SrcReg = MBBI->getOperand(1).getReg();
+
+  BuildMI(MBB, MBBI, DL, TII->get(RISCV::FSGNJ_D), DstReg)
+      .addReg(SrcReg)
+      .addReg(SrcReg);
+
+  MBBI->eraseFromParent(); // The pseudo instruction is gone now.
+  return true;
+}
+
+static MachineOperand *getUniqueUser(MachineBasicBlock::instr_iterator beg,
+                                     MachineBasicBlock::instr_iterator end,
+                                     Register valR) {
+  if (beg.isEnd()) return nullptr;
+  auto *MBB = beg->getParent();
+  assert(MBB);
+
+  auto realend = MBB->end().getInstrIterator();
+
+  MachineOperand *UseMOP = nullptr;
+  bool isPastEnd = false;
+
+  for (auto MII = beg; MII != realend; ++MII) {
+    isPastEnd |= MII == end;
+    if (MII->isDebugInstr()) continue; //skip debug instructions
+    bool definesValR = false;
+
+    for (auto &MOP : MII->operands()) {
+      if (!MOP.isReg() || MOP.getReg() != valR) continue;
+      //at this point we know MII accesses valR with MOP (and maybe with other operands too)
+      definesValR |= MOP.isDef();
+      if (!isPastEnd && !UseMOP && !MOP.isDef()) {
+        UseMOP = &MOP; //if UseMOP was not yet found and MOP does not redefine valR, then MOP is the first use
+        if (MOP.isKill()) return UseMOP; //if MOP kills valR we can stop looking further and return
+      }
+    }
+
+    if (definesValR) {
+      return UseMOP; //if MII (re-)defines valR we must have found the use already (or there is none, in which case we return null)
+    }
+  }
+
+  if (MBB) {
+    bool avail_in_all = true;
+    MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
+
+    for (auto *Succ : MBB->successors()) {
+      if (!Succ) continue;
+      LivePhysRegs liveness(*MRI.getTargetRegisterInfo());
+      liveness.addLiveIns(*Succ);
+      avail_in_all &= liveness.available(MRI, valR);
+    }
+
+    if (avail_in_all) return UseMOP;
+  }
+
+  return nullptr;
+}
+
+bool RISCVExpandSSRPostRegAlloc::mergePushPop(MachineBasicBlock &MBB) {
+  Register ssr_regs[NUM_SSR];
+  for(unsigned ssr_no = 0; ssr_no < NUM_SSR; ++ssr_no) ssr_regs[ssr_no] = getSSRFtReg(ssr_no);
+
+  bool Modified = false;
+
+  for (auto ssr_reg : ssr_regs){
+    SmallPtrSet<MachineInstr *, 8> modified;
+    for (auto MI = MBB.rbegin().getInstrIterator(); MI != MBB.rend().getInstrIterator(); ){ //go from back to front
+      auto PMI = std::next(MI); //this is the previous instruction, because MI is a reverse iterator
+      if(MI->getOpcode() == RISCV::FSGNJ_D){
+        if (MI->getOperand(1).getReg() == ssr_reg && MI->getOperand(2).getReg() == ssr_reg && MI->getOperand(0).isReg()){ //this was an SSR pop
+          //limit the search range for regmerge if there is an ssr disable
+          MachineBasicBlock::instr_iterator rangeLimit = MI.getReverse();
+          for (; rangeLimit != MBB.end().getInstrIterator(); ++rangeLimit){
+            if (rangeLimit->getOpcode() == RISCV::CSRRCI
+                && rangeLimit->getOperand(1).isImm()
+                && rangeLimit->getOperand(1).getImm() == 0x7C0
+                && rangeLimit->getOperand(2).getImm() == 1)
+            {
+              break;
+            }
+          }
+          Register r = MI->getOperand(0).getReg(); //register to replace
+          MachineOperand *MO = getUniqueUser(std::next(MI.getReverse()), rangeLimit, r);
+          if (!MO) LLVM_DEBUG(dbgs()<<"*** NOT FOUND ***\n");
+          if (MO) { //if a unique user exists
+            MachineInstr *MIUser = MO->getParent();
+            if (MIUser && modified.find(MIUser) == modified.end()){ //if the unique user exists and was not yet modified
+              LLVM_DEBUG(MIUser->dump());
+              for (auto &MOP : MIUser->operands()) {
+                if (MOP.isReg() && !MOP.isDef() && MOP.getReg() == r) {
+                  MOP.setReg(ssr_reg); //replace all non-def uses of r with ssr_reg
+                  MOP.setIsKill(false);
+                  MOP.setIsRenamable(false);
+                }
+              }
+              LLVM_DEBUG(MIUser->dump());
+              MI->eraseFromBundle();
+              modified.insert(MIUser);
+            }
+          }
+        } else if(MI->getOperand(0).getReg() == ssr_reg){
+          if (MI->getOperand(1).isReg()
+              && MI->getOperand(2).isReg()
+              && 
MI->getOperand(1).getReg() == MI->getOperand(2).getReg()) + { //FIXME: use liveness analysis instead of .isKill() + Register R = MI->getOperand(1).getReg(); + MachineInstr *Pred = MI->getPrevNode(); + if (Pred && modified.find(Pred) == modified.end()){ //if Pred exists and is unmodified + bool predDefsR = false; + for (auto &MOP : Pred->defs()) { + predDefsR |= MOP.isReg() && MOP.isDef() && MOP.getReg() == R; + } + if (predDefsR) { //if Pred defines R + auto end = MI->getParent()->end().getInstrIterator(); + MachineOperand *MO = getUniqueUser(Pred->getIterator(), end, R); + if (MO && MO->getParent() == &*MI) { //if MI is unique user of R + LLVM_DEBUG(Pred->dump()); + for (auto &MOP : Pred->operands()) { + if (MOP.isReg() && MOP.isDef() && MOP.getReg() == R) { + MOP.setReg(ssr_reg); //replace all defs of R with ssr_reg + MOP.setIsDef(false); + MOP.setIsKill(false); + MOP.setIsDead(false); + MOP.setIsRenamable(false); + } + } + LLVM_DEBUG(Pred->dump()); + MI->eraseFromBundle(); + modified.insert(Pred); + } + } + } + } + } + } + MI = PMI; + } + } + return Modified; +} + +} // end of anonymous namespace + +INITIALIZE_PASS(RISCVExpandSSRPostRegAlloc, "riscv-expand-ssr-post-reg-alloc", + RISCV_EXPAND_SSR_POST_REG_ALLOC_NAME, false, false) +namespace llvm { + +FunctionPass *createRISCVExpandSSRPostRegAllocPass() { return new RISCVExpandSSRPostRegAlloc(); } + +} // end of namespace llvm + + +///REGMERGE USING LIVENESS, BUT SOMEHOW WORSE +// static std::pair isDefIsUse(MachineInstr &MI, MCRegister R) { +// bool def = false; +// bool use = false; +// for (auto &MOP : MI.operands()) { +// if (MOP.isReg() && MOP.getReg() == R) { +// if (MOP.isDef()) def = true; +// else use = true; +// } +// } +// return std::make_pair(def, use); +// } + +// struct Liveness { +// public: +// Liveness(const TargetRegisterInfo &TRI, const MachineRegisterInfo &MRI, MachineBasicBlock &MBB, bool end) : liveness(TRI), MBB(MBB), MRI(MRI) { +// if (end) { +// liveness.addLiveOuts(MBB); +// LiveIn = MBB.end().getInstrIterator(); +// } else { +// liveness.addLiveIns(MBB); +// LiveIn = MBB.begin().getInstrIterator(); +// } +// } + +// void MoveForward(MachineBasicBlock::instr_iterator Point) { +// if (Point == LiveIn) return; +// SmallVector, 1u> clb; +// while (LiveIn != Point && LiveIn != MBB.end().getInstrIterator()) { +// liveness.stepForward(*LiveIn, clb); +// LiveIn++; +// } +// assert(LiveIn == Point && "moved forward to point"); +// } + +// void MoveBackward(MachineBasicBlock::reverse_instr_iterator Point) { +// assert(Point != MBB.rend().getInstrIterator() && "not rend()"); +// if (Point.getReverse() == LiveIn) return; +// Point++; //in order to get LiveIN for Point we have to move up to and incl. 
Point +// MachineBasicBlock::reverse_instr_iterator LiveInRev = LiveIn.getReverse(); +// LiveInRev++; +// while (LiveInRev != Point && LiveInRev != MBB.rend().getInstrIterator()) { +// liveness.stepBackward(*LiveInRev); +// LiveInRev++; +// } +// LiveIn = std::next(LiveInRev.getReverse()); +// assert(LiveInRev == Point && "moved backward to point"); +// } + +// //move forward up to first use of Reg, make sure Reg is not live anymore afterwards +// MachineBasicBlock::instr_iterator findUniqueUser(MCRegister Reg, MachineBasicBlock::instr_iterator end) { +// while (LiveIn != end) { +// auto ut = isDefIsUse(*LiveIn, Reg); +// if (ut.first && !ut.second) return end; //redefined +// if (ut.first && ut.second) return LiveIn; //first user redefines himself +// MoveForward(std::next(LiveIn)); +// if (ut.second) { +// if (liveness.available(MRI, Reg)) std::prev(LiveIn); +// else { +// for (auto x = LiveIn; x != MBB.end().getInstrIterator(); ++x) { +// auto ut = isDefIsUse(*x, Reg); +// if (ut.first && !ut.second) return std::prev(LiveIn); //found redef. +// else if (ut.second) return end; //another use +// } +// return end; +// } +// } +// } +// return end; +// } + +// MachineBasicBlock::instr_iterator getPoint() const { return LiveIn; } +// const LivePhysRegs &getLiveness() const { return liveness; } +// void addReg(MCRegister R) { liveness.addReg(R); } + +// private: +// MachineBasicBlock::instr_iterator LiveIn; //INV: this always points to the instr for which liveness has live-in info +// LivePhysRegs liveness; +// MachineBasicBlock &MBB; +// const MachineRegisterInfo &MRI; +// }; + +// static bool isSSREn(const MachineInstr &MI) { +// return MI.getOpcode() == RISCV::CSRRSI +// && MI.getOperand(1).isImm() +// && MI.getOperand(1).getImm() == 0x7C0 +// && MI.getOperand(2).isImm() +// && MI.getOperand(2).getImm() == 1; +// } + +// static bool isSSRDis(const MachineInstr &MI) { +// return MI.getOpcode() == RISCV::CSRRCI +// && MI.getOperand(1).isImm() +// && MI.getOperand(1).getImm() == 0x7C0 +// && MI.getOperand(2).isImm() +// && MI.getOperand(2).getImm() == 1; +// } + +// static bool isSSRReg(MCRegister R) { +// for (unsigned s = 0u; s < NUM_SSR; s++) { +// if (getSSRFtReg(s).asMCReg() == R) return true; +// } +// return false; +// } + +// static unsigned getSSRRegIdx(MCRegister R) { +// return R - MCRegister(RISCV::F0_D); +// } + +// bool RISCVExpandSSRPostRegAlloc::mergePushPop(MachineBasicBlock &MBB) { +// bool Modified; + +// MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); +// const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo(); + +// recomputeLiveIns(MBB); +// recomputeLivenessFlags(MBB); + +// SmallSet modifiedInsts[NUM_SSR]; //keep track of which insts were merged into to avoid merging two different moves of same stream into one inst +// MachineBasicBlock::reverse_instr_iterator MII = MBB.rbegin().getInstrIterator(); +// MachineBasicBlock::instr_iterator SearchEnd = MBB.end().getInstrIterator(); +// while (MII != MBB.rend().getInstrIterator()) { +// auto NMII = std::next(MII); + +// if (isSSRDis(*MII)) { +// SearchEnd = MII.getReverse(); +// MII = NMII; +// continue; +// } + +// if (MII->getOpcode() == RISCV::FSGNJ_D) { +// auto &MOP0 = MII->getOperand(0); +// auto &MOP1 = MII->getOperand(1); +// auto &MOP2 = MII->getOperand(2); +// if (MOP0.isReg() && MOP1.isReg() && MOP2.isReg() && MOP1.getReg() == MOP2.getReg()) { +// if (isSSRReg(MOP1.getReg()) && MII != MBB.rbegin().getInstrIterator()) { //this is ssr pop (and there is at least one potential user) +// MCRegister dest 
= MOP0.getReg().asMCReg(); +// MCRegister ssr_reg = MOP1.getReg().asMCReg(); +// unsigned dmid = getSSRRegIdx(ssr_reg); +// //try to find unique user of dest +// Liveness Live(TRI, MRI, MBB, true); +// Live.MoveBackward(std::prev(MII)); //increment liveness to past MII +// auto user = Live.findUniqueUser(dest, SearchEnd); +// if (user != SearchEnd && modifiedInsts[dmid].find(&*user) == modifiedInsts[dmid].end()) { //found user +// user->dump(); +// for (auto &MOP : user->operands()) { +// if (MOP.isReg() && !MOP.isDef() && MOP.getReg() == dest) MOP.setReg(ssr_reg); //replace all non-def uses of r with ssr_reg +// } +// user->dump(); +// MII->eraseFromBundle(); +// modifiedInsts[dmid].insert(&*user); +// Modified = true; +// } +// } else if (isSSRReg(MOP0.getReg())) { +// MCRegister src = MOP1.getReg(); +// MCRegister ssr_reg = MOP0.getReg(); +// unsigned dmid = getSSRRegIdx(ssr_reg); +// MachineBasicBlock::reverse_instr_iterator beginSearch = std::next(MII); +// while (beginSearch != MBB.rend().getInstrIterator()) { +// if (isSSREn(*beginSearch)) break; +// auto ut = isDefIsUse(*beginSearch, src); +// if (ut.first) break; +// beginSearch++; +// } +// if (beginSearch != MBB.rend().getInstrIterator() && !isSSREn(*beginSearch)) { +// assert(isDefIsUse(*beginSearch, src).first && "does define src"); +// Liveness Live(TRI, MRI, MBB, true); +// Live.MoveBackward(std::prev(beginSearch)); +// auto user = Live.findUniqueUser(src, std::next(MII.getReverse())); +// if (user == MII.getReverse() && modifiedInsts[dmid].find(&*beginSearch) == modifiedInsts[dmid].end()) { +// beginSearch->dump(); +// for (auto &MOP : beginSearch->operands()) { +// if (MOP.isReg() && MOP.isDef() && MOP.getReg() == src) { +// MOP.setReg(ssr_reg); //replace all defs of R with ssr_reg +// MOP.setIsDef(false); +// } +// } +// beginSearch->dump(); +// MII->eraseFromBundle(); +// modifiedInsts[dmid].insert(&*beginSearch); +// Modified = true; +// } +// } +// } +// } +// } +// MII = NMII; +// } +// return Modified; +// } \ No newline at end of file diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXssr.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXssr.td index 67f38e03e1fc0..bc1e03d5eedc6 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXssr.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXssr.td @@ -105,10 +105,26 @@ class SPseudoPush: let usesCustomInserter = 0; } +class SPseudoStoreMove: //instead of using these could give isBarrier = 1 to ssr csrrsi/csrrci + Pseudo<(outs FPR64:$ssr), (ins FPR64:$val),[]> { + let mayLoad = 0; + let mayStore = 1; + let hasSideEffects = 1; + let usesCustomInserter = 0; +} + class SPseudoPop: Pseudo<(outs FPR64:$val), (ins uimm5:$ssr),[]> { let mayLoad = 1; - let mayStore = 0; + let mayStore = 1; + let hasSideEffects = 1; + let usesCustomInserter = 0; +} + +class SPseudoLoadMove: + Pseudo<(outs FPR64:$val), (ins FPR64:$ssr),[]> { + let mayLoad = 1; + let mayStore = 1; let hasSideEffects = 1; let usesCustomInserter = 0; } @@ -148,6 +164,8 @@ let Predicates = [HasExtXssr] in { def PseudoSSRSetup_1D_W : SPseudoSetup1D; def PseudoSSRPush : SPseudoPush; def PseudoSSRPop : SPseudoPop; + def PseudoStoreMove : SPseudoStoreMove; + def PseudoLoadMove : SPseudoLoadMove; foreach dim = [1, 2, 3, 4] in { def PseudoSSRSetupBoundStride_#dim#D : SPseudoSetupBoundStride; diff --git a/llvm/lib/Target/RISCV/RISCVSSRReassociate.cpp b/llvm/lib/Target/RISCV/RISCVSSRReassociate.cpp new file mode 100644 index 0000000000000..3fefec22f122c --- /dev/null +++ b/llvm/lib/Target/RISCV/RISCVSSRReassociate.cpp @@ -0,0 +1,528 @@ +//===- 
SSRReassociatePass.cpp - Reassociate Fast FP insts and move SSR push/pop intrinsics ------------------===// +// +// ??? +// +//===----------------------------------------------------------------------===// +// +// FIXME: The reassociation should really be done by the ReassociatePass but it +// for some reason does no reassociate fast FP insts? (maybe because it expects +// a normal out of order processor to vectorize anyway.) +// The reassociation is always done in full an can thus be quite slow when the +// dependency trees are large. Might want to introduce a max height or sth +// like that. +// Bubbling the Pushs/Pops might be better done in the pre RA ssr expand pass +// because we have more control over where they land there. +// This is not really meant to be used yet, so debug msg's are output by errs(). +// +//===----------------------------------------------------------------------===// + +#include "RISCV.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/CodeGen/RuntimeLibcalls.h" +#include "llvm/CodeGen/TargetLowering.h" +#include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/CodeGen/TargetSubtargetInfo.h" +#include "llvm/CodeGen/ValueTypes.h" +#include "llvm/IR/Attributes.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Constant.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/User.h" +#include "llvm/IR/Value.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/IntrinsicsRISCV.h" +#include "llvm/InitializePasses.h" +#include "llvm/Pass.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetMachine.h" +#include +#include +#include + +using namespace llvm; + +#define DEBUG_TYPE "ssr-reassociate" + +namespace llvm { + cl::opt AggressiveReassociate( + "ssr-aggressive-reassociation", + cl::init(false), + cl::desc("Reassociate aggressively and move ssr push/pop out of the way. In particular: reassociate also fast fp-ops") + ); + + cl::opt BubbleStreams( + "ssr-bubble-streams", + cl::init(0), + cl::desc( + "Try to schedule pops earlier and pushs later making \"windows\" holding the given nr. of instructions given." + "This gives more freedom to the scheduler in unrolled loops. 
If window is too large then there are not enough registers which leads to unnecessary spills" + "0 means off (default), negative number means max window size") + ); +} + +namespace { + + class SSRReassociate: public FunctionPass { + const TargetLowering *TLI = nullptr; + + public: + static char ID; // Pass identification, replacement for typeid + + SSRReassociate() : FunctionPass(ID) { + initializeSSRReassociatePass(*PassRegistry::getPassRegistry()); + } + + bool runOnFunction(Function &F) override; + + private: + bool runOnBB(BasicBlock &BB); + }; + +} // end anonymous namespace + +bool SSRReassociate::runOnFunction(Function &F) { + bool Modified = false; + + LLVM_DEBUG(dbgs()<<"SSR Reassociate Pass running on: "<(I) && + (cast(I).getIntrinsicID() == Intrinsic::riscv_ssr_push + || cast(I).getIntrinsicID() == Intrinsic::riscv_ssr_push); +} + +//put pops at top and pushs at bottom +static bool BubbleSSRIntrinsics(BasicBlock::iterator begin, BasicBlock::iterator end) { + bool Modified = false; + auto II = begin; + auto LastInsertedPopSucc = begin; + auto LastInsertedPush = std::prev(end); + auto FirstInsertedPush = end; + while (II != end && II != FirstInsertedPush) { + auto NII = std::next(II); + if (isa(*II)) { + auto &Intr = cast(*II); + if (Intr.getIntrinsicID() == Intrinsic::riscv_ssr_pop) { + Intr.moveBefore(&*LastInsertedPopSucc); + LastInsertedPopSucc = std::next(Intr.getIterator()); + Modified = true; + } else if (Intr.getIntrinsicID() == Intrinsic::riscv_ssr_push) { + Intr.moveAfter(&*LastInsertedPush); + LastInsertedPush = Intr.getIterator(); + Modified = true; + if (FirstInsertedPush == end) FirstInsertedPush = LastInsertedPush; + } + } + II = NII; + } + return Modified; +} + +//genetates the window that the above function uses to bubble +//windows depend are constrained by bubble_count and by ssr enable/disable calls +static bool BubbleSSRIntrinsics(BasicBlock &BB, unsigned bubble_count) { + bool Modified = false; + auto start = BB.getFirstInsertionPt(); + auto finish = start; + while (start != BB.end()) { + //increment finish until it hits an ssr enable / disable + unsigned w = 0; // or until we have bubble_count many instructions (non push/pop instructions) inside the window + while (finish != BB.end() && finish != BB.getTerminator()->getIterator() && w < bubble_count) { + assert(finish != BB.end()); + if (isa(*finish)) { + auto id = cast(*finish).getIntrinsicID(); + if (id == Intrinsic::riscv_ssr_enable || id == Intrinsic::riscv_ssr_disable) { + break; + } + } + if (!isPushPop(*finish) && !finish->isDebugOrPseudoInst()) w++; + finish++; + } + + Modified |= BubbleSSRIntrinsics(start, finish); + + if (finish != BB.getTerminator()->getIterator() && finish != BB.end()) finish++; //move past ssr en/dis + else break; // we are done + start = finish; + } + + return Modified; +} + +//put pops and pushs as close to their def/use as possible +static bool BubbleSSRIntrinsicsBack(BasicBlock &BB) { + bool Modified = false; + auto II = BB.getFirstInsertionPt(); + DenseSet vis; + while (II != BB.end()) { + auto NII = std::next(II); + if (vis.find(&*II) != vis.end()) { + II = NII; + continue; + } + vis.insert(&*II); + if (isa(*II)) { + auto &Intr = cast(*II); + if (Intr.getIntrinsicID() == Intrinsic::riscv_ssr_pop) { + Instruction *UU = nullptr; + for (User *U : Intr.users()) { + if (isa(U) && !UU) UU = cast(U); + else UU = nullptr; + } + if (UU) { + Intr.moveBefore(UU); + Modified = true; + } + } else if (Intr.getIntrinsicID() == Intrinsic::riscv_ssr_push) { + if (Instruction *D = 
dyn_cast(Intr.getOperand(1))) { + Intr.moveAfter(D); + Modified = true; + } + } + } + II = NII; + } + return Modified; +} + +static bool isAssociative(const Value &V) { + if (!isa(V)) return false; + const auto &I = cast(V); + if (I.getType()->isIntegerTy(1u)) return false; //ignore bools + if(I.isAssociative()) return true; + if (isa(I) && I.hasAllowReassoc()) return true; + // if ((I.getType()->isFloatingPointTy() && I.isFast())){ //https://gcc.gnu.org/wiki/FloatingPointMath + // switch (I.getOpcode()) + // { + // case Instruction::BinaryOps::FAdd: + // case Instruction::BinaryOps::FMul: + // return true; + // default: + // return false; + // } + // } + return false; +} + +// a bit redundant, but might allow to be extended +static bool isBinop(const Value &I) { + return isa(I); +} + +static unsigned getAndUpdateHeight(const Value &V, DenseMap &heights); //bc mutual recursion + +//assumes children have the correct height, updates the height of I accordingly +static unsigned updateHeightFromChildren(const BinaryOperator &I, DenseMap &heights) { + unsigned this_height = 1u + std::max( + getAndUpdateHeight(*I.getOperand(0), heights), + getAndUpdateHeight(*I.getOperand(1), heights) + ); + auto p = heights.insert(std::make_pair(&I, this_height)); + if (!p.second) p.first->getSecond() = this_height; //update value + return this_height; +} + +//updates the height of children recursively then uses updateHeightFromChildren +static unsigned getAndUpdateHeight(const Value &V, DenseMap &heights) { + if (!isa(V)) return 0; + const Instruction &I = cast(V); + if (!isBinop(I)) return 0; + auto d = heights.find(&I); + if (d != heights.end()) return d->second; //if height is available it is correct + return updateHeightFromChildren(cast(I), heights); +} + +//moves OP and all users that are between OP and Point to after Point in the same order +static void moveAfterWithAllUsers(BinaryOperator &OP, Instruction &Point) { + assert(OP.getParent() == Point.getParent()); + auto II = std::next(Point.getIterator().getReverse()); //start right before point + auto rend = OP.getIterator().getReverse(); //end just after OP + SmallPtrSet users; //for faster lookup + for (auto *U : OP.users()) { + if (auto *I = dyn_cast(U)) { + users.insert(I); + } + } + while (II != OP.getParent()->rend() && II != rend) { + auto NII = std::next(II); + for (auto *U : II->users()){ + if (auto *I = dyn_cast(U)) + users.insert(I); + } + if (users.contains(&*II)) { + II->moveAfter(&Point); + } + II = NII; + assert(II != OP.getParent()->rend()); + } + OP.moveAfter(&Point); +} + +//we can only rotate if B only depends directly on A without any other def-use path between them +static bool canRotate(const Instruction &A, const Instruction &B) { + SmallPtrSet users; + for (auto *U : A.users()) { + if (auto *I = dyn_cast(U)) users.insert(I); + } + auto II = A.getIterator(); + for (; II != A.getParent()->end() && &*II != &B; II++) { + if (users.contains(&*II)) { + for (auto *U : II->users()) { + if (auto *I = dyn_cast(U)) { + if (I == &B) return false; //additional def-use path + users.insert(I); + } + } + if (!isa(*II) && !isa(*II) && !isa(*II) && !isa(*II)) + return false; //if user (which will need to be moved is not a "simple" instrucion ==> then cannot do it) + } + } + return II != A.getParent()->end() && &*II == &B; //return true if II now points to B +} + +//single rotation counter-clockwise (trees are with root at bottom because thats how they are in LLVM IR) +static BinaryOperator *rotateCC(BinaryOperator &L, BinaryOperator &I, DenseMap 
&heights) { + assert(isAssociative(L) && isAssociative(I) && I.getOperand(0) == &L); + I.setOperand(0, L.getOperand(1)); + I.replaceAllUsesWith(&L); + L.setOperand(1, &I); + L.dropDroppableUses(); + moveAfterWithAllUsers(L, I); + updateHeightFromChildren(I, heights); + updateHeightFromChildren(L, heights); + return &L; +} + +//single rotation clock-wise +static BinaryOperator *rotateCW(BinaryOperator &R, BinaryOperator &I, DenseMap &heights) { + assert(isAssociative(R) && isAssociative(I) && I.getOperand(1) == &R); + I.setOperand(1, R.getOperand(0)); + I.replaceAllUsesWith(&R); + R.setOperand(0, &I); + R.dropDroppableUses(); //remove debug insts that would otherwise not be dominated by R anymore + moveAfterWithAllUsers(R, I); + updateHeightFromChildren(I, heights); + updateHeightFromChildren(R, heights); + assert(cast(*I.user_begin()) == &R && std::next(I.user_begin()) == I.user_end() && "the only user of I is R"); + return &R; +} + +//try to rotate or double rotate if applicable (see AVL trees) +static BinaryOperator *tryRotateL(Value &Left, Value &Root, DenseMap &heights) { + if (isBinop(Left) && isBinop(Root) && isAssociative(Left) && isAssociative(Root)) { + BinaryOperator &L = cast(Left); + BinaryOperator &I = cast(Root); + const unsigned opcode = I.getOpcode(); + if (L.getOpcode() != opcode || L.getParent() != I.getParent()) return nullptr; //cannot do anything + unsigned lh = getAndUpdateHeight(L, heights); + if (lh <= 1u) return nullptr; //nothing to do + auto &L_RChild = *L.getOperand(1); + if (isBinop(L_RChild) && isAssociative(L_RChild) + && getAndUpdateHeight(L_RChild, heights) + 1u == lh) { + auto &LRC = cast(L_RChild); + if (LRC.getOpcode() == opcode && LRC.getParent() == I.getParent() && canRotate(LRC, L) && canRotate(L, I)) { + auto &newL = *rotateCW(LRC, L, heights); + if (canRotate(newL, I)) return rotateCC(newL, I, heights); + else return nullptr; + } + } + if (canRotate(L, I)) return rotateCC(L, I, heights); + } + return nullptr; +} + +//try to rotate or double rotate if applicable (see AVL trees) +static BinaryOperator *tryRotateR(Value &Right, Value &Root, DenseMap &heights) { + if (isBinop(Right) && isBinop(Root) && isAssociative(Right) && isAssociative(Root)) { + BinaryOperator &R = cast(Right); + BinaryOperator &I = cast(Root); + const unsigned opcode = I.getOpcode(); + if (R.getOpcode() != opcode || R.getParent() != I.getParent()) return nullptr; //cannot do anything + unsigned rh = getAndUpdateHeight(R, heights); + if (rh <= 1u) return nullptr; //nothing to do + auto &R_LChild = *R.getOperand(0); + if (isBinop(R_LChild) && isAssociative(R_LChild) + && getAndUpdateHeight(R_LChild, heights) + 1u == rh) { + auto &RLC = cast(R_LChild); + if (RLC.getOpcode() == opcode && RLC.getParent() == I.getParent() && canRotate(RLC, R) && canRotate(R, I)) { + auto &newR = *rotateCC(RLC, R, heights); + if (canRotate(newR, I)) return rotateCW(newR, I, heights); + else return nullptr; + } + } + if (canRotate(R, I)) return rotateCW(R, I, heights); + } + return nullptr; +} + +//needed to check whether we are actually dealing with a tree +static bool subGraphsIntersect(const Value &X, const Value &Y) { + if (!isBinop(X) || !isBinop(Y)) return false; + const auto &A = cast(X); + const auto &B = cast(Y); + DenseSet seen; + std::deque q; + const BasicBlock *BB = A.getParent(); + q.push_back(&A); + while (!q.empty()) { + const auto *I = q.front(); q.pop_front(); + seen.insert(I); + if (auto *X = dyn_cast(I->getOperand(0))) { + if (X && X->getParent() == BB) q.push_back(X); + } + if (auto 
*X = dyn_cast(I->getOperand(1))) { + if (X && X->getParent() == BB) q.push_back(X); + } + } + assert(q.empty()); + q.push_back(&B); + while (!q.empty()) { + const auto *I = q.front(); q.pop_front(); + if (seen.contains(I)) return true; + if (auto *X = dyn_cast(I->getOperand(0))) { + if (X && X->getParent() == BB) q.push_back(X); + } + if (auto *X = dyn_cast(I->getOperand(1))) { + if (X && X->getParent() == BB) q.push_back(X); + } + } + return false; +} + +//print trees for debugging purposes +static void printDep(Value &I, unsigned lvl, DenseMap &heights, DenseSet &vis) { + if (vis.find(&I) != vis.end()) return; + vis.insert(&I); + for (unsigned i = 0; i < lvl; i++) errs()<<"| \t"; + unsigned h = 0; + if (isa(I)) { + auto p = heights.find(&cast(I)); + if (p != heights.end()) h = p->second; + } + errs()<<" h = "<(I); + for (unsigned i = 0; i < X.getNumOperands(); i++) { + auto *V = X.getOperand(i); + if (V) printDep(*V, lvl+1, heights, vis); + } + } +} + +//try to reassociate tree rooted in Inst (if it is a tree!) +//insts might be moved past Inst and Inst might not be the root anymore afterwards +static bool Reassociate(Value &Inst, DenseMap &heights) { + bool Modified = false; + if (isBinop(Inst) && isAssociative(Inst)) { + BinaryOperator *I = cast(&Inst); + unsigned h = updateHeightFromChildren(*I, heights); + if (h <= 2) return false; //nothing todo + if (subGraphsIntersect(*I->getOperand(0), *I->getOperand(1))) { + return false; //Inst is not root of a tree! cannot optimize! + } + bool better = true; + int lminusr = std::numeric_limits::max(); + DenseSet vis; + do { + if (vis.contains(I)) break; + vis.insert(I); + int new_lminusr = + (int)getAndUpdateHeight(*I->getOperand(0), heights) + - (int)getAndUpdateHeight(*I->getOperand(1), heights); + better = std::abs(lminusr) > std::abs(new_lminusr); + lminusr = new_lminusr; + BinaryOperator *NewRoot = nullptr; + if (lminusr >= 2) { + NewRoot = tryRotateL(*I->getOperand(0), *I, heights); //try to fix at this height + } else if (lminusr <= -2) { + NewRoot = tryRotateR(*I->getOperand(1), *I, heights); //try to fix at this height + } + if (NewRoot) { + I = NewRoot; + Modified = true; + better = true; + } else { + better = false; //defenitely do not repeat if we haven't changed anything anymore + } + } while (better); + + bool improved_left = Reassociate(*I->getOperand(0), heights); //fix left subtree + bool improved_right = Reassociate(*I->getOperand(1), heights); //fix right subtree + Modified = Modified || improved_left || improved_right; + + updateHeightFromChildren(*I, heights); + } + return Modified; +} + +//try to reassociate all insts in BB +static bool Reassociate(BasicBlock &BB) { + bool Modified = false; + + DenseMap heights; + + auto RI = BB.rbegin(); + while (RI != BB.rend()) { + if (heights.find(&*RI) == heights.end()) {//only reassociate if this was not part of any tree already + Modified |= Reassociate(*RI, heights); + } + RI++; //yes, this means we miss some instructions, but those are optimized already anyway + } + + // if (Modified) { + // errs()<<"Reassociate in BB: "< vis; + // for (auto RI = BB.rbegin(); RI != BB.rend(); RI++) { + // printDep(*RI, 0, heights, vis); + // } + // } + + return Modified; +} + +//reassociate and then bubble +bool SSRReassociate::runOnBB(BasicBlock &BB) { + bool Modified = false; + + if (AggressiveReassociate) { + Modified |= BubbleSSRIntrinsics(BB, std::numeric_limits::max()); //move pop/pushs out of the way + Modified |= Reassociate(BB); + if (BubbleStreams >= 0) Modified |= 
BubbleSSRIntrinsicsBack(BB); //move them back if needed
+  }
+
+  if (BubbleStreams > 0) {
+    Modified |= BubbleSSRIntrinsics(BB, (unsigned)BubbleStreams); //bubble to form windows
+  }
+
+  return Modified;
+}
+
+
+char SSRReassociate::ID = 0;
+
+INITIALIZE_PASS(SSRReassociate, DEBUG_TYPE, "SSR Reassociate Pass", false, false)
+
+FunctionPass *llvm::createSSRReassociatePass() { return new SSRReassociate(); }
diff --git a/llvm/lib/Target/RISCV/RISCVSSRStatistics.cpp b/llvm/lib/Target/RISCV/RISCVSSRStatistics.cpp
new file mode 100644
index 0000000000000..9c24e799cb09d
--- /dev/null
+++ b/llvm/lib/Target/RISCV/RISCVSSRStatistics.cpp
@@ -0,0 +1,169 @@
+//===- RISCVSSRStatistics.cpp - Count memory accesses and SSR intrinsics per loop ---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Count how many memory accesses there are and at what loop depth.
+//
+//===----------------------------------------------------------------------===//
+
+#include "RISCV.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/RuntimeLibcalls.h"
+#include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/IntrinsicsRISCV.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include
+#include
+#include
+
+using namespace llvm;
+
+namespace {
+
+class SSRStatistics : public FunctionPass {
+  const TargetLowering *TLI = nullptr;
+
+public:
+  static char ID; // Pass identification, replacement for typeid
+
+  SSRStatistics() : FunctionPass(ID) {
+    initializeSSRStatisticsPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnFunction(Function &F) override;
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<LoopInfoWrapperPass>();
+  }
+};
+
+} // end anonymous namespace
+
+bool SSRStatistics::runOnFunction(Function &F) {
+
+  DenseMap<const Loop *, unsigned> ld;
+  DenseMap<const Loop *, unsigned> st;
+  DenseMap<const Loop *, unsigned> push;
+  DenseMap<const Loop *, unsigned> pop;
+
+  const LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+  std::vector<const Loop *> s;
+  for (const auto *L : LI.getTopLevelLoops()) {
+    s.push_back(L);
+  }
+  for (const auto &BB : F) {
+    const Loop *L = LI.getLoopFor(&BB);
+    if (!L) continue;
+    if (ld.find(L) == ld.end()) ld.insert(std::make_pair(L, 0));
+    if (st.find(L) == st.end()) st.insert(std::make_pair(L, 0));
+    if (push.find(L) == push.end()) push.insert(std::make_pair(L, 0));
+    if (pop.find(L) == pop.end()) pop.insert(std::make_pair(L, 0));
+    for (const Instruction &I : BB) {
+      if (isa<LoadInst>(I)) {
+        auto x = ld.find(L);
+        assert(x != ld.end());
x->getSecond() += 1; + } else if (isa(I)) { + auto x = st.find(L); + assert(x != st.end()); + x->getSecond() += 1; + } else if (isa(I)) { + const auto &In = cast(I); + if (In.getIntrinsicID() == Intrinsic::riscv_ssr_pop) { + auto x = pop.find(L); + assert(x != pop.end()); + x->getSecond() += 1; + } else if (In.getIntrinsicID() == Intrinsic::riscv_ssr_push) { + auto x = push.find(L); + assert(x != push.end()); + x->getSecond() += 1; + } + } + } + } + + errs()<<"\""<getHeader()->getNameOrAsOperand()<<"\": {\n"; + errs()<<"\t\t\"depth\": "<getLoopDepth()<<",\n"; + errs()<<"\t\t\"loads\": "<getSecond()<<",\n"; + errs()<<"\t\t\"stores\": "<getSecond()<<",\n"; + errs()<<"\t\t\"pushs\": "<getSecond()<<",\n"; + errs()<<"\t\t\"pops\": "<getSecond()<<"\n"; + errs()<<"\t},\n"; + } + errs()<<"},\n"; + + + return false; +} + +// bool SSRStatistics::runOnFunction(Function &F) { + +// std::vector n_ld, n_st; +// constexpr int max_depth = 5; +// while (n_ld.size() <= max_depth) n_ld.push_back(0); +// while (n_st.size() <= max_depth) n_st.push_back(0); + +// const LoopInfo &LI = getAnalysis().getLoopInfo(); +// for (const auto &BB : F) { +// unsigned depth = LI.getLoopDepth(&BB); +// assert(n_ld.size() > depth); +// assert(n_st.size() > depth); +// for (const Instruction &I : BB) { +// if (isa(I)) { +// n_ld[depth] += 1; +// } else if (isa(I)) { +// n_st[depth] += 1; +// } +// } +// } + +// errs()< debug output done with errs(). +// +//===----------------------------------------------------------------------===// + +#include "RISCV.h" +#include "RISCVInstrInfo.h" +#include "RISCVTargetMachine.h" +#include "RISCVMachineFunctionInfo.h" + +#include "llvm/CodeGen/LivePhysRegs.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/LowLevelType.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/Support/CommandLine.h" + +using namespace llvm; + +#define DEBUG_TYPE "riscv-frep" + +namespace llvm { + cl::opt SnitchAutoFrep( + "snitch-auto-frep", + cl::init(false), + cl::desc("Find repeating fp insts in unrolled loops. 
If a reduction can be found (not good yet) insert frep with stagger.")); +} + +#define SNITCH_AUTO_FREP_NAME "Snitch Auto Frep" + +#define MAX_SEARCH_WINDOW 4 +#define MIN_REP 4 +#define MAX_STAGGER 4 +#define NUM_SSR 3 + +namespace { + +class SNITCHAutoFrep : public MachineFunctionPass { +public: + const RISCVInstrInfo *TII; + static char ID; + + SNITCHAutoFrep() : MachineFunctionPass(ID) { + initializeSNITCHAutoFrepPass(*PassRegistry::getPassRegistry()); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + + StringRef getPassName() const override { return SNITCH_AUTO_FREP_NAME; } + +private: + + const MachineFunction *MF; + RISCVMachineFunctionInfo *RVFI; + DenseSet FPOps; + + bool process(MachineBasicBlock &MBB); + bool isFPInstr(MachineInstr &I); + std::pair findRep( + MachineBasicBlock::instr_iterator window_beg, + MachineBasicBlock::instr_iterator window_end, + MachineBasicBlock::instr_iterator end); +}; + +static Register getSSRFtReg(unsigned streamer) { //taken from RISCVExpandSSRInsts.cpp + unsigned AssignedReg = RISCV::F0_D + streamer; + // Advance the iterator to the assigned register until the valid + // register is found + const TargetRegisterClass *RC = &RISCV::FPR64RegClass; + TargetRegisterClass::iterator I = RC->begin(); + for (; *I != AssignedReg; ++I) + assert(I != RC->end() && "AssignedReg should be a member of provided RC"); + return Register(*I); +} + +char SNITCHAutoFrep::ID = 0; + +static constexpr unsigned fpopcodes[] = {RISCV::FADD_D, RISCV::FMUL_D, RISCV::FMADD_D, RISCV::FSGNJ_D, RISCV::FDIV_D, RISCV::FSUB_D, RISCV::FMSUB_D, RISCV::FMIN_D, RISCV::FMAX_D, RISCV::FSQRT_D}; + +bool SNITCHAutoFrep::runOnMachineFunction(MachineFunction &MF) { + + if (SnitchAutoFrep) { + errs()<<"snitch auto frep on "<(MF.getSubtarget().getInstrInfo()); + this->MF = &MF; + this->RVFI = MF.getInfo(); + for (const unsigned &x : fpopcodes) this->FPOps.insert(x); + + errs()<<"autofrep: running on:"<FPOps.find(I.getOpcode()) != this->FPOps.end(); +} + +//test whether the window [window_beg, window_end) is repeating and how many times it is +std::pair SNITCHAutoFrep::findRep( + MachineBasicBlock::instr_iterator window_beg, + MachineBasicBlock::instr_iterator window_end, + MachineBasicBlock::instr_iterator end) +{ + MachineBasicBlock::instr_iterator wi = window_beg; + MachineBasicBlock::instr_iterator s_end = window_end; + MachineBasicBlock::instr_iterator s_res = window_end; + unsigned rep = 1u; + while (s_end != end && isFPInstr(*s_end) && areTheSame(*s_end, *wi)) { + s_end++; + wi++; + if (wi == window_end) { + wi = window_beg; + rep++; + s_res = s_end; //found rep + } + } + return std::make_pair(s_res, rep); +} + +//used to calculate best possible stagger amount +static unsigned getCycles(unsigned opcode) { + switch (opcode) + { + case RISCV::FADD_D: + return 2; + case RISCV::FMUL_D: + return 3; + case RISCV::FMADD_D: + return 4; + default: + return 1; + } +} + +//return reduction operation +//fmul.d not included because we currently always init the staggered regs with 0 (and mul would need 1) +//min/max might also work, anything associative should work +static Optional getCombineOpcode(unsigned opcode, unsigned src_idx) { + switch (opcode) + { + case RISCV::FADD_D: + if (src_idx == 1 || src_idx == 2) return (unsigned)RISCV::FADD_D; + return None; + case RISCV::FMADD_D: + if (src_idx == 0) return (unsigned)RISCV::FADD_D; + return None; + default: + return None; + } +} + +//combine usages to mask +static unsigned toMask (const std::vector> &deps) { + unsigned mask = 0u; + for 
(const auto &p : deps) mask |= p.second;
+  return mask;
+}
+
+
+//find internal and external dependencies
+static Optional<std::vector<std::pair<MCRegister, unsigned>>> findRepDependenceRegs(
+    MachineBasicBlock::instr_iterator window_begin,
+    MachineBasicBlock::instr_iterator window_end)
+{
+  DenseMap<MCRegister, unsigned> def; //defs that are live going out of the window
+  std::vector<std::pair<MCRegister, unsigned>> internal, external;
+  for (auto MII = window_begin; MII != window_end; MII++) {
+    for (unsigned i = MII->getNumOperands()-1; i < MII->getNumOperands(); i--) {
+      int idx = 3 - (int)i;
+      auto &MOP = MII->getOperand(i);
+      if (!MOP.isReg()) continue;
+      if (idx < 0) return None; //there is an instruction with more than 4 FPRs in the window ==> cannot stagger
+      MCRegister r = MOP.getReg().asMCReg();
+      if (MOP.isDef()) {
+        if (idx != 3) return None; //defining operand not at operand index 0 ==> cannot stagger
+        def.insert(std::make_pair(r, (unsigned)(1 << idx)));
+      } else { //use
+        auto p = def.find(r);
+        if (p != def.end()) internal.push_back(std::make_pair(r, (unsigned)(1 << idx) | p->second));
+        if (MOP.isKill()) def.erase(r);
+      }
+    }
+  }
+  for (auto MII = window_begin; MII != window_end; MII++) {
+    for (unsigned i = MII->getNumOperands()-1; i < MII->getNumOperands(); i--) {
+      int idx = 3 - (int)i;
+      auto &MOP = MII->getOperand(i);
+      if (!MOP.isReg()) continue;
+      assert(idx >= 0);
+      MCRegister r = MOP.getReg().asMCReg();
+      if (MOP.isDef()) {
+        def.erase(r); //redef'ed before use
+      } else {
+        auto p = def.find(r);
+        if (p != def.end()) external.push_back(std::make_pair(r, (unsigned)(1 << idx) | p->second));
+        if (MOP.isKill()) def.erase(r);
+      }
+    }
+  }
+  unsigned internal_mask = toMask(internal);
+  unsigned external_mask = toMask(external);
+  //internal needs to be a subset of external so that we can stagger (FIXME: right?)
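+  //Worked example of the operand-to-bit mapping used above: operand i maps to
+  //bit idx = 3 - i, so for `fmadd.d fd, fs1, fs2, fs3` the def fd contributes
+  //bit 3 and the uses fs1/fs2/fs3 bits 2/1/0; an accumulator read as fs3 and
+  //written back as fd therefore yields the dependence mask 0b1001.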
+  if ((internal_mask & external_mask) ^ internal_mask) return None;
+  return external;
+}
+
+//merge dependency-vector entries that name the same register by ORing their masks
+static void mergeRegisters(std::vector<std::pair<MCRegister, unsigned>> &deps) {
+  unsigned i = 0;
+  while (i < deps.size()) {
+    MCRegister r = deps[i].first;
+    unsigned found = 0u;
+    for (unsigned j = 0; j < i; j++) {
+      if (deps[j].first == r) {
+        deps[j] = std::make_pair(r, deps[j].second | deps[i].second);
+        found++;
+      }
+    }
+    if (found) {
+      assert(found == 1);
+      deps.erase(deps.begin() + i);
+      //no need to increment i
+    } else {
+      i++;
+    }
+  }
+}
+
+//true iff r is one of the FP registers reserved for the SSR data movers
+static bool isSSRReg(MCRegister r) {
+  for (unsigned i = 0; i < NUM_SSR; i++) {
+    if (getSSRFtReg(i) == r) return true;
+  }
+  return false;
+}
+
+//try to find a reduction operation; currently only single ops are allowed
+static Optional<std::vector<unsigned>> findCombineOps(
+    MCRegister DReg,
+    unsigned stagger_mask,
+    MachineBasicBlock::instr_iterator window_begin,
+    MachineBasicBlock::instr_iterator window_end)
+{
+  MachineInstr *Def = nullptr;
+  for (auto MII = std::next(window_end.getReverse()); MII != std::next(window_begin.getReverse()); MII++) {
+    if (MII->getOperand(0).isReg() && MII->getOperand(0).getReg() == DReg) {
+      Def = &*MII;
+    }
+  }
+  if (!Def) return None;
+
+  std::vector<unsigned> ops;
+  MCRegister r = DReg;
+  bool reached_def = false;
+  for (auto MII = window_begin; !reached_def && MII != window_end; MII++) {
+    for (unsigned i = MII->getNumOperands() - 1; !reached_def && i < MII->getNumOperands(); i--) {
+      int idx = 3 - (int)i;
+      if (idx < 0) continue;
+      auto &MOP = MII->getOperand(i);
+      if (MOP.isReg() && MOP.getReg().asMCReg() == r) {
+        if (!MII->getOperand(0).isReg()) return None;
+        r = MII->getOperand(0).getReg().asMCReg();
+        auto op = getCombineOpcode(MII->getOpcode(), (unsigned)idx);
+        if (!op.hasValue()) return None;
+        ops.push_back(op.getValue());
+        reached_def = (&*MII == Def);
+        if (!reached_def) return None; //FIXME: currently only one combine op allowed
+        break; //go to next instruction
+      }
+    }
+  }
+  return ops;
+}
+
+struct StaggerInfo {
+  unsigned count;
+  unsigned mask;
+  std::vector<MCRegister> regs;
+  std::vector<unsigned> combineOps;
+};
+
+//try to find a way to stagger
+static Optional<StaggerInfo> findStagger(
+    MachineBasicBlock::instr_iterator window_begin,
+    MachineBasicBlock::instr_iterator window_end,
+    const LivePhysRegs &liveness,
+    const llvm::MachineRegisterInfo &MRI)
+{
+  errs()<<"trying to find stagger\n";
+  auto depsopt = findRepDependenceRegs(window_begin, window_end);
+  if (!depsopt.hasValue()) return None;
+  errs()<<"found deps\n";
+  auto deps = depsopt.getValue();
+  mergeRegisters(deps);
+  for (const auto &p : deps) errs()<<"reg = "< regs;
+  regs.push_back(DReg);
+  while (max_stagger_count < MAX_STAGGER && liveness.available(MRI, DReg + max_stagger_count + 1)) {
+    max_stagger_count++;
+    regs.push_back(DReg + max_stagger_count);
+  }
+  if (!max_stagger_count) return None; //regs not free (FIXME: rename instead)
+  StaggerInfo info;
+  info.count = max_stagger_count;
+  info.mask = stagger_mask;
+  info.regs = std::move(regs);
+  info.combineOps = std::move(ops.getValue());
+  return info;
+}
+
+static MachineBasicBlock *findBB(MachineInstr &MI) {
+  for (auto &MOP : MI.operands()) {
+    if (MOP.isMBB()) return MOP.getMBB();
+  }
+  return nullptr;
+}
+
+//FIXME: no idea how to make a block a label for sure ==> just search for a branch and take its target
+//       there must be a better way to do this
+//       used for an always "dead" branch in the FPU fence
+static MachineBasicBlock *findBrAbleBB(MachineBasicBlock &MBB) {
+  if (!MBB.empty()) {
+    auto *BB = findBB(*std::prev(MBB.end()));
+    if (BB) return BB;
+  }
+  std::vector<MachineBasicBlock *> s;
+  SmallSet<MachineBasicBlock *, 8> vis;
+  s.push_back(&MBB);
+  while (!s.empty()) {
+    auto *B = s.back(); s.pop_back();
+    if (!B || vis.contains(B)) continue;
+    vis.insert(B);
+    if (!B->empty()) {
+      auto *x = findBB(*std::prev(B->end()));
+      if (x) return x;
+    }
+    for (auto *BB : B->predecessors()) s.push_back(BB);
+    for (auto *BB : B->successors()) s.push_back(BB);
+  }
+  return &MBB;
+}
+
+// work on a single BB: try to find repetitions, then try to find a way to stagger,
+// then generate code if it gives an improvement
+bool SNITCHAutoFrep::process(MachineBasicBlock &MBB) {
+  bool Modified = false;
+
+  recomputeLivenessFlags(MBB); //to be sure
+
+  for (auto II = MBB.begin().getInstrIterator(); II != MBB.end().getInstrIterator(); ) {
+    auto NII = std::next(II);
+    if (II->isDebugInstr()) { //get rid of some dbg instructions (sorry)
+      II->eraseFromParent();
+    }
+    II = NII;
+  }
+
+  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+  const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo();
+  LivePhysRegs liveness(TRI); //use RegScavenger?
+  liveness.addLiveIns(MBB);
+  for (unsigned r = 0; r < NUM_SSR; r++)
+    liveness.addReg(getSSRFtReg(r).asMCReg()); //add SSR regs for good measure (FIXME: conservative)
+
+  MachineBasicBlock::instr_iterator MII = MBB.begin().getInstrIterator();
+  while (MII != MBB.end().getInstrIterator()) {
+    if (!isFPInstr(*MII)) {
+      MII = std::next(MII);
+      continue;
+    }
+
+    std::vector<std::pair<MachineBasicBlock::instr_iterator, unsigned>> search_results;
+    for (auto II = MII; II != MBB.end().getInstrIterator() && search_results.size() < MAX_SEARCH_WINDOW; ++II) {
+      auto wend = std::next(II);
+      auto sr = findRep(MII, wend, MBB.end().getInstrIterator());
+      search_results.push_back(sr);
+    }
+
+    unsigned best = 0u;
+    for (unsigned i = 0u; i < search_results.size(); i++) {
+      best = search_results[best].second < search_results[i].second ? i : best;
+    }
+
+    bool found = false;
+    if (!search_results.empty() && search_results[best].second >= MIN_REP) { //we have found at least MIN_REP repetitions
+      errs()<<"found repeating fp instrs\n";
+      for (auto II = MII; II != search_results[best].first; ++II) II->dump();
+      const TargetRegisterClass *RC = &RISCV::GPRNoX0RegClass;
+      TargetRegisterClass::iterator I = RC->begin();
+      while (I != RC->end() && !liveness.available(MRI, MCPhysReg(*I))) I++;
+      if (I != RC->end()) { //did find a free GPR register
+        errs()<<"found a free GPR reg\n";
+
+        MCPhysReg freeGPR = *I;
+
+        const unsigned window_size = best + 1; //recover window size
+        const unsigned reps = search_results[best].second; //get reps
+
+        auto delete_begin = std::next(MII, window_size); //start of repeating region
+        auto delete_end = search_results[best].first; //end of repeating region (excl.)
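+
+        //The code below asks findStagger for free registers to rotate the
+        //accumulator through and then compares estimated cycle costs: cost[0]
+        //is the baseline reps * window_cycles of the plain repeated code, and
+        //the staggered frep variants add the final combine instructions
+        //(combine_cycles) while avoiding the per-iteration stall of the
+        //window's last instruction (rep_stall); code is only generated when
+        //the estimate improves.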
+ + auto info = findStagger(MII, delete_begin, liveness, MRI); + + if (info.hasValue()) { + errs()<<"found stagger \n"; + + unsigned window_cycles = 0u; + for (auto MI = MII; MI != delete_begin; MI++) window_cycles += getCycles(MI->getOpcode()); + unsigned rep_stall = getCycles(std::prev(delete_begin)->getOpcode()) - 1u; + unsigned combine_cycles = 0u; + for (unsigned &op : info.getValue().combineOps) combine_cycles += getCycles(op); + + std::vector cost; + cost.push_back(reps * window_cycles); //cycles needed with no frep + errs()<<"default = "< 0u) { + errs()<<"frep+stagger is better\n"; + //code generation: + //delete repetitions + MBB.dump(); + found = true; + Modified = true; //we will modify now + + for (auto di = delete_begin; di != delete_end;) { + auto din = std::next(di); + di->eraseFromParentAndMarkDBGValuesForRemoval(); //delete repeated parts + di = din; + } + for (unsigned s = 1; s <= best_stagger; s++) { + //fcvt.d.w stagger, zero (FIXME: only allows additive combine op for now) + BuildMI(MBB, MII, MII->getDebugLoc(), this->TII->get(RISCV::FCVT_D_W), info.getValue().regs[s]) + .addReg(RISCV::X0); + } + //load rep + BuildMI(MBB, MII, MII->getDebugLoc(), this->TII->get(RISCV::ADDI), freeGPR) + .addReg(RISCV::X0) + .addImm(reps-1); + //frep.i + BuildMI(MBB, MII, MII->getDebugLoc(), this->TII->get(RISCV::FREP_O)) + .addReg(freeGPR, RegState::Kill) //reps + .addImm(window_size) //nr of instructions + .addImm(best_stagger) //stagger count + .addImm(info.getValue().mask); //stagger mask + + //combine result + errs()<<"generate combine result\n"; + unsigned step = 1; + while (step < best_stagger + 1u) { + for (unsigned i = 0u; i + step < best_stagger + 1u; i += (step + 1)) { + //FIXME: currently only one combine op allowed, if more: need temp regs here ??? 
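+              //Example of this reduction tree for best_stagger = 3 (regs[0..3]):
+              //step 1 combines regs[0] <- regs[0] op regs[1] and regs[2] <- regs[2] op regs[3],
+              //step 2 combines regs[0] <- regs[0] op regs[2]; the final value
+              //ends up in regs[0] after ceil(log2(best_stagger + 1)) rounds.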
+ errs()<<"src = "<getDebugLoc(), this->TII->get(info.getValue().combineOps.front()), info.getValue().regs[i]) + .addReg(info.getValue().regs[i], RegState::Kill) + .addReg(info.getValue().regs[i + step], RegState::Kill) + .addImm(7); + } + step = step * 2; + } + + //FPU fence (as done in SNITCHFrepLoops.cpp) + BuildMI(MBB, delete_end, delete_end->getDebugLoc(), this->TII->get(RISCV::FMV_X_W), freeGPR) + .addReg(info.getValue().regs[1]); + auto *BB = findBrAbleBB(MBB); + BuildMI(MBB, delete_end, delete_end->getDebugLoc(), this->TII->get(RISCV::BLT)) + .addReg(freeGPR, RegState::Kill) + .addReg(freeGPR, RegState::Kill) + .addMBB(BB); + //advance liveness + for (auto II = MII; II != delete_end; II++) { + SmallVector, 4u> clobbers; + liveness.stepForward(*II, clobbers); + } + + MII = delete_end; //continue from here + } + } + } + } + + if (!found) { + SmallVector, 4u> clobbers; + liveness.stepForward(*MII, clobbers); + MII = std::next(MII); + } + } + + if (Modified) MBB.dump(); + + return Modified; +} + +} // end of anonymous namespace + +INITIALIZE_PASS(SNITCHAutoFrep, "riscv-snitch-auto-frep", + SNITCH_AUTO_FREP_NAME, false, false) +namespace llvm { + +FunctionPass *createSNITCHAutoFrepPass() { return new SNITCHAutoFrep(); } + +} // end of namespace llvm \ No newline at end of file diff --git a/llvm/lib/Transforms/CMakeLists.txt b/llvm/lib/Transforms/CMakeLists.txt index 2a0abebdf19b5..c4cbda13469b3 100644 --- a/llvm/lib/Transforms/CMakeLists.txt +++ b/llvm/lib/Transforms/CMakeLists.txt @@ -7,6 +7,7 @@ add_subdirectory(IPO) add_subdirectory(Vectorize) add_subdirectory(Hello) add_subdirectory(HelloNew) +add_subdirectory(SSR) add_subdirectory(ObjCARC) add_subdirectory(Coroutines) add_subdirectory(CFGuard) diff --git a/llvm/lib/Transforms/SSR/CMakeLists.txt b/llvm/lib/Transforms/SSR/CMakeLists.txt new file mode 100644 index 0000000000000..d6dc690ed6f95 --- /dev/null +++ b/llvm/lib/Transforms/SSR/CMakeLists.txt @@ -0,0 +1,11 @@ +add_llvm_component_library(LLVMSSR + SSRInference.cpp + SSRGeneration.cpp + + DEPENDS + intrinsics_gen + + LINK_COMPONENTS + Core + Support + ) diff --git a/llvm/lib/Transforms/SSR/SSRGeneration.cpp b/llvm/lib/Transforms/SSR/SSRGeneration.cpp new file mode 100644 index 0000000000000..2c5bb14f85d77 --- /dev/null +++ b/llvm/lib/Transforms/SSR/SSRGeneration.cpp @@ -0,0 +1,888 @@ +//===-- SSRGeneration.cpp - Generate SSR --------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/SSR/SSRGeneration.h" +#include "llvm/InitializePasses.h" +#include "llvm/Passes/PassBuilder.h" +#include "llvm/Target/TargetMachine.h" + +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/IRBuilder.h" + +#include "llvm/Transforms/Utils/BasicBlockUtils.h" + +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/ScalarEvolutionExpressions.h" +#include "llvm/Analysis/AffineAccessAnalysis.h" +#include "llvm/Analysis/MemorySSA.h" +#include "llvm/Analysis/MemorySSAUpdater.h" +#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" + +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/FormatVariadic.h" + +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/IntrinsicsRISCV.h" +#include "llvm/IR/InlineAsm.h" + +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/TinyPtrVector.h" +#include "llvm/ADT/ilist.h" + +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#define DEBUG_TYPE "ssr" + +#define NUM_SSR 3U +#define SSR_MAX_DIM 4U + +//both are inclusive! +#define SSR_SCRATCHPAD_BEGIN 0x100000 +#define SSR_SCRATCHPAD_END 0x120000 + +//current state of hw: only allow doubles +#define CHECK_TYPE(T, I) (T == Type::getDoubleTy(I->getParent()->getContext())) + +//for gain estimation +#define EST_LOOP_TC 25 +#define EST_MUL_COST 3 +#define EST_MEMOP_COST 2 + +using namespace llvm; + +namespace llvm { + +cl::opt InferSSR( + "infer-ssr", + cl::init(false), + cl::desc("Enable inference of SSR streams.") +); + +cl::opt SSRNoIntersectCheck( + "ssr-no-intersect-check", + cl::init(false), + cl::desc("Do not generate intersection checks (unsafe). 
Use `restrict` key-word instead if possible.") +); + +cl::opt SSRNoTCDMCheck( + "ssr-no-tcdm-check", + cl::init(false), + cl::desc("Assume all data of inferred streams is inside TCDM.") +); + +cl::opt SSRNoBoundCheck( + "ssr-no-bound-check", + cl::init(false), + cl::desc("Do not generate checks that make sure the inferred stream's access is executed at least once.") +); + +cl::opt SSRConflictFreeOnly( + "ssr-conflict-free-only", + cl::init(false), + cl::desc("Only infer streams if they have no conflicts with other memory accesses.") +); + +cl::opt SSRNoInline( + "ssr-no-inline", + cl::init(false), + cl::desc("prevent functions that contain SSR streams from being inlined.") +); + +cl::opt SSRBarrier( + "ssr-barrier", + cl::init(false), + cl::desc("Enable the insertion of a spinning loop that waits for the stream to be done before it is disabled.") +); + +cl::opt SSRVerbose( + "ssr-verbose", + cl::init(false), + cl::desc("Write information about inferred streams to stderr.") +); + +} //end of namespace llvm + + +static constexpr char SSRFnAttr[] = "SSR"; //used to tag functions that contain SSR streams + +static constexpr Intrinsic::ID riscSSRIntrinsics[] = { + Intrinsic::RISCVIntrinsics::riscv_ssr_barrier, + Intrinsic::RISCVIntrinsics::riscv_ssr_disable, + Intrinsic::RISCVIntrinsics::riscv_ssr_enable, + Intrinsic::RISCVIntrinsics::riscv_ssr_setup_repetition, + Intrinsic::RISCVIntrinsics::riscv_ssr_pop, + Intrinsic::RISCVIntrinsics::riscv_ssr_push, + Intrinsic::RISCVIntrinsics::riscv_ssr_read, + Intrinsic::RISCVIntrinsics::riscv_ssr_read_imm, + Intrinsic::RISCVIntrinsics::riscv_ssr_write, + Intrinsic::RISCVIntrinsics::riscv_ssr_write_imm, + Intrinsic::RISCVIntrinsics::riscv_ssr_setup_1d_r, + Intrinsic::RISCVIntrinsics::riscv_ssr_setup_1d_w, + Intrinsic::RISCVIntrinsics::riscv_ssr_setup_bound_stride_1d, + Intrinsic::RISCVIntrinsics::riscv_ssr_setup_bound_stride_2d, + Intrinsic::RISCVIntrinsics::riscv_ssr_setup_bound_stride_3d, + Intrinsic::RISCVIntrinsics::riscv_ssr_setup_bound_stride_4d, +}; + + +namespace { + +template +struct ConflictTree { + void insertNode(const NodeT *Node, unsigned value, const NodeT *Parent) { + assert((values.find(Node) == values.end() || children.find(Node) == children.end()) && "not yet inserted"); + values.insert(std::make_pair(Node, value)); + children.insert(std::make_pair(Node, std::move(std::vector()))); + if (!Parent) { //this is root + assert(!Root && "Parent = nullptr, but root already exists"); + Root = Node; + } else { + auto p = children.find(Parent); + assert(p != children.end() && "parent cannot be found"); + p->getSecond().push_back(Node); + } + } + + //picks nodes in the tree such that their combined value (conmbineFunc, needs to be associative & commutative) is the highest possible + //prioritizes parent over children + std::vector findBest(const std::function &combineFunc) { + std::vector res; + if (!Root) return res; + findBest(Root, combineFunc, res); + return res; + } + +private: + unsigned findBest(const NodeT *N, const std::function &combineFunc, std::vector &res) { + unsigned size = res.size(); + unsigned val = 0u; + auto &chs = children.find(N)->getSecond(); + if (!chs.empty()) { + for (const NodeT *C : chs) val = combineFunc(val, findBest(C, combineFunc, res)); + } + unsigned nval = values.find(N)->second; + if (val > nval) { + return val; + } else { + while (res.size() > size) res.pop_back(); + res.push_back(N); + return nval; + } + } + + DenseMap values; + DenseMap> children; + const NodeT *Root = nullptr; +}; + +// copy Phi-nodes from 
predecessor Basic Block (BB) +void copyPHIsFromPred(BasicBlock *BB){ + BasicBlock *Pred = nullptr; + for (BasicBlock *B : predecessors(BB)) { + if (!Pred) Pred = B; + assert(Pred == B && "BB has only one predecessor"); + } + assert(Pred && "BB has a Predecessor"); + for (Instruction &I : *Pred){ + if (auto *Phi = dyn_cast(&I)){ + PHINode *PhiC = PHINode::Create(Phi->getType(), 1u, Twine(Phi->getName()).concat(".copy"), BB->getFirstNonPHI()); + //Phi->replaceAllUsesWith(PhiC); + Phi->replaceUsesOutsideBlock(PhiC, Pred); //all users outside of Pred are now using PhiC + PhiC->addIncoming(Phi, Pred); + } + } +} + +///splits block, redirects all predecessor to first half of split, copies phi's +std::pair splitAt(Instruction *X, const Twine &name){ + assert(!isa(X) && "should not split at phi"); + BasicBlock *Two = X->getParent(); + BasicBlock *One = BasicBlock::Create(Two->getContext(), name, Two->getParent(), Two); + Instruction *BR = BranchInst::Create(Two, One); + //DTU.applyUpdates(cfg::Update(cfg::UpdateKind::Insert, One, Two)); + BasicBlock::iterator it = Two->begin(); + while (it != X->getIterator()) { + BasicBlock::iterator it_next = std::next(it); + it->removeFromParent(); + it->insertBefore(BR); + it = it_next; + } + //BasicBlock *One = splitBlockBefore(Two, X, &DTU, nullptr, nullptr, name); + std::vector toChange; + for (auto *BB : predecessors(Two)){ + if (BB == One) continue; + Instruction *T = BB->getTerminator(); + for (unsigned i = 0; i < T->getNumOperands(); i++){ + Value *OP = T->getOperand(i); + if (dyn_cast(OP) == Two){ + toChange.push_back(T); + } + } + } + for (Instruction *T : toChange) { + for (unsigned i = 0; i < T->getNumOperands(); i++){ + Value *OP = T->getOperand(i); + if (dyn_cast(OP) == Two){ + T->setOperand(i, One); //if an operand of the terminator of a predecessor of Two points to Two it should now point to One + /*cfg::Update upd[]{ + cfg::Update(cfg::UpdateKind::Insert, T->getParent(), One), + cfg::Update(cfg::UpdateKind::Delete, T->getParent(), Two), + }; + DTU.applyUpdates(upd);*/ + } + } + } + return std::make_pair(One, Two); +} + +///clones code from BeginWith up to EndBefore +///assumes all cf-paths from begin lead to end (or return) +///assumes there is a phi node for each value defined in the region that will be cloned in the block of EndBefore that is live after EndBefore +///returns the branch that splits region from coloned region and the pair of branches that jump to EndBefore at the end +std::pair> cloneRegion(Instruction *BeginWith, Instruction *EndBefore){ + LLVM_DEBUG(dbgs()<<"cloning from "<<*BeginWith<<" up to "<<*EndBefore<<"\n"); + + auto p = splitAt(BeginWith, "split.before"); + BasicBlock *Head = p.first; + BasicBlock *Begin = p.second; + + p = splitAt(EndBefore, "fuse.prep"); + BranchInst *BRFuse = cast(p.first->getTerminator()); + BasicBlock *End = p.second; + copyPHIsFromPred(End); //copy Phi's from Fuse to End + + std::deque q; //bfs queue + q.push_back(Begin); + DenseSet vis; //bfs visited set + DenseMap clones; //value in orig -> value in clone (INV: orig and clone are of same class) + std::vector> operandsCleanup; //store operands that reference instructions that are not cloned yet + + while (!q.empty()){ + BasicBlock *C = q.front(); q.pop_front(); + if (C == End || vis.find(C) != vis.end()) continue; + vis.insert(C); + BasicBlock *Cc = BasicBlock::Create(C->getContext(), Twine(C->getName()).concat(".clone"), C->getParent(), C); + clones.insert(std::make_pair(C, Cc)); //BasicBlock <: Value, needed for branches + IRBuilder<> 
builder(Cc); + for (Instruction &I : *C){ + Instruction *Ic = I.clone(); + assert(Ic->use_empty() && "no uses of clone"); + if (I.getType()->isVoidTy() || I.getType()->isLabelTy()) Ic = builder.Insert(Ic); //insert without name + else Ic = builder.Insert(Ic, Twine(I.getName()).concat(".clone")); + for (unsigned i = 0; i < Ic->getNumOperands(); i++){ + auto A = clones.find(Ic->getOperand(i)); + if (A != clones.end()){ + Ic->setOperand(i, A->second); //this also updates uses of A->second + //check users update in A->second + bool userUpdate = false; for (User *U : A->second->users()) {userUpdate = userUpdate || U == Ic; } assert(userUpdate && "user is updated on setOperand"); + //if (isa(A->first)) DTU.applyUpdates(cfg::Update(cfg::UpdateKind::Insert, Cc, cast(A->second))); + }else{ + operandsCleanup.push_back(std::make_pair(i, Ic)); + } + } + clones.insert(std::make_pair(&I, Ic)); //add Ic as clone of I + } + auto succs = successors(C); + for (auto S = succs.begin(); S != succs.end(); ++S) { + q.push_back(*S); + } + } + //operandCleanup + for (const auto &p : operandsCleanup){ //p.first = index of operand that needs to be changed to clone in p.second + auto A = clones.find(p.second->getOperand(p.first)); + if (A != clones.end()){ + p.second->setOperand(p.first, A->second); + //if (isa(A->first)) DTU.applyUpdates(cfg::Update(cfg::UpdateKind::Insert, p.second->getParent(), cast(A->second))); + }//else did not find ==> was defined before region + } + //incoming blocks of phi nodes are not operands ==> handle specially + for (const auto &p : clones){ //all clones of phi-nodes appear in here + if (auto *Phi = dyn_cast(p.second)){ + for (auto B = Phi->block_begin(); B != Phi->block_end(); ++B){ + const auto &c = clones.find(*B); + if (c != clones.end()){ + *B = cast(c->second); //overwrite with clone of block if it was cloned + } + } + } + } + //change terminator of Head to be CondBr with TakeOrig as cond + BranchInst *HeadBr = cast(Head->getTerminator()); //always BranchInst because of splitBlockBefore + BasicBlock *HeadSucc = HeadBr->getSuccessor(0); + BasicBlock *HeadSuccClone = cast(clones.find(HeadSucc)->second); + HeadBr->eraseFromParent(); + HeadBr = BranchInst::Create( + HeadSucc, //branch-cond = true -> go to non-clone (here SSR will be inserted) + HeadSuccClone, + ConstantInt::get(Type::getInt1Ty(HeadSucc->getContext()), 0u), + Head + ); + //DTU.applyUpdates(cfg::Update(cfg::UpdateKind::Insert, Head, HeadSuccClone)); + //handle phi nodes in End + for (Instruction &I : *End){ + if (auto *Phi = dyn_cast(&I)){ + for (auto *B : Phi->blocks()){ //yes Phi->blocks() will change during loop ==> does not matter + auto p = clones.find(B); + if (p != clones.end()){ + Value *Bval = Phi->getIncomingValueForBlock(B); + auto v = clones.find(Bval); + if (v != clones.end()){ + Phi->addIncoming(v->second, cast(p->second)); //add clone value & block as input + }else { + //v->first is constant or it is defined before cloned region begins + Phi->addIncoming(Bval, cast(p->second)); + } + } + } + } + } + LLVM_DEBUG(dbgs()<<"done cloning \n"); + + return std::make_pair(HeadBr, std::make_pair(BRFuse, cast(clones.find(BRFuse)->second))); +} + +BasicBlock *getSingleExitBlock(const Loop *L) { + BasicBlock *Ex = L->getExitBlock(); + if (Ex) return Ex; + SmallVector exits; + L->getExitBlocks(exits); + for (BasicBlock *BB : exits){ + if (!Ex) Ex = BB; + if (Ex != BB) return nullptr; + } + return Ex; +} + +void printInfo(ExpandedAffAcc &E) { + errs() + <<(E.Access->isWrite() ? 
"write" : "read ") + <<" stream of dimension " + <getAccesses()[0]->getDebugLoc(); + if (DL.get()) { + errs() + <<" orig. on line " + <getBaseAddr(E.getDimension()) + <<".\n"; +} + +//code for run-time checks for TCDM +Value *GenerateTCDMCheck(ExpandedAffAcc &E, Instruction *Point) { + IRBuilder<> builder(Point); + Value *c1 = builder.CreateICmpULE(ConstantInt::get(E.LowerBound->getType(), SSR_SCRATCHPAD_BEGIN), E.LowerBound, "beg.check"); + Value *c2 = builder.CreateICmpULE(E.UpperBound, ConstantInt::get(E.UpperBound->getType(), SSR_SCRATCHPAD_END), "end.check"); + return builder.CreateAnd(c1, c2, "tcdm.check"); +} + +//generate code for SSR setup +void GenerateSSRSetup(ExpandedAffAcc &E, unsigned dmid, Instruction *Point){ + assert(Point); + Module *mod = Point->getModule(); + IRBuilder<> builder(Point); + Type *i32 = Type::getInt32Ty(Point->getContext()); + unsigned dim = E.getDimension(); + LLVM_DEBUG(dbgs()<<"SSR Setup for stream with dim = "<isWrite(); + + Intrinsic::RISCVIntrinsics functions[] = { + Intrinsic::riscv_ssr_setup_bound_stride_1d, + Intrinsic::riscv_ssr_setup_bound_stride_2d, + Intrinsic::riscv_ssr_setup_bound_stride_3d, + Intrinsic::riscv_ssr_setup_bound_stride_4d + }; + + for (unsigned i = 0u; i < dim; i++) { + Value *Stride = E.Steps[i]; + if (i > 0) Stride = builder.CreateSub(Stride, E.PrefixSumRanges[i-1], formatv("stride.{0}d", i+1)); + Value *Bound = E.Reps[i]; + Function *SSRBoundStrideSetup = Intrinsic::getDeclaration(mod, functions[i]); + std::array bsargs = {DMid, Bound, Stride}; + builder.CreateCall(SSRBoundStrideSetup->getFunctionType(), SSRBoundStrideSetup, ArrayRef(bsargs)); + } + + unsigned n_reps = 0u; + if (isStore){ + Function *SSRPush = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_push); + for (Instruction *I : E.Access->getAccesses()){ + std::array pusharg = {DMid, cast(I)->getValueOperand()}; + builder.SetInsertPoint(I); + builder.CreateCall(SSRPush->getFunctionType(), SSRPush, ArrayRef(pusharg)); + I->eraseFromParent(); + n_reps++; + } + }else{ + Function *SSRPop = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_pop); + std::array poparg = {DMid}; + for (Instruction *I : E.Access->getAccesses()){ + builder.SetInsertPoint(I); + auto *V = builder.CreateCall(SSRPop->getFunctionType(), SSRPop, ArrayRef(poparg), "ssr.pop"); + BasicBlock::iterator ii(I); + ReplaceInstWithValue(I->getParent()->getInstList(), ii, V); + n_reps++; + } + } + + builder.SetInsertPoint(Point); + Constant *Rep = ConstantInt::get(i32, n_reps - 1U); + Function *SSRRepetitionSetup = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_setup_repetition); + std::array repargs = {DMid, Rep}; + builder.CreateCall(SSRRepetitionSetup->getFunctionType(), SSRRepetitionSetup, ArrayRef(repargs)); + + Function *SSRSetup; + if (!isStore){ + SSRSetup = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_read_imm); //can take _imm bc dm and dim are constant + }else{ + SSRSetup = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_write_imm); //can take _imm bc dm and dim are constant + } + std::array args = {DMid, Dim, E.Addr}; + //NOTE: this starts the prefetching ==> always needs to be inserted AFTER bound/stride and repetition setups !!! 
+ builder.CreateCall(SSRSetup->getFunctionType(), SSRSetup, ArrayRef(args)); + + return; +} + +/// generate a SSR Barrier intrinsic call before InsertBefore +void generateSSRBarrier(Instruction *InsertBefore, unsigned dmid) { + IRBuilder<> builder(InsertBefore); + Function *Barrier = Intrinsic::getDeclaration(InsertBefore->getModule(), Intrinsic::riscv_ssr_barrier); + builder.CreateCall(Barrier->getFunctionType(), Barrier, ConstantInt::get(Type::getInt32Ty(builder.getContext()), dmid)); +} + +/// generates SSR enable & disable calls +std::pair generateSSREnDis(Instruction *PhP, Instruction *ExP){ + IRBuilder<> builder(PhP); // ----------- in preheader + Module *mod = PhP->getParent()->getModule(); + Function *SSREnable = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_enable); + Instruction *en = builder.CreateCall(SSREnable->getFunctionType(), SSREnable, ArrayRef()); + + builder.SetInsertPoint(ExP); // ----------- in exit block + //generateFPDependency(builder); + Function *SSRDisable = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_disable); + Instruction *dis = builder.CreateCall(SSRDisable->getFunctionType(), SSRDisable, ArrayRef()); + + LLVM_DEBUG(dbgs()<<"generated ssr_enable and ssr_disable\n"); + + return std::make_pair(en, dis); +} + +//estimate how much it costs to compute the SSR setup data (bounds, strides, base address, etc...) +int getEstExpandCost(AffAcc *A, unsigned dim) { + int cost = 0; + cost += A->getBaseAddr(dim)->getExpressionSize(); + for (unsigned i = 1; i < dim; i++) { + cost += A->getStep(i)->getExpressionSize(); + cost += A->getRep(i)->getExpressionSize(); + cost += EST_MUL_COST; //for range + if (i > 1) cost += 1; //for addition + } + return cost; +} + +//estimate the benefit of turning some AffAccs into streams +int getEstGain(ArrayRef Accs, const Loop *L, AffineAccess &AAA) { + int gain = 0; + DenseSet accs; + for (auto *A : Accs) accs.insert(A); + + DenseSet contLoops; + DenseSet vis; + for (AffAcc *A : Accs) { + vis.insert(A); + unsigned dim = A->loopToDimension(L); + + //cost of expanding A + gain -= getEstExpandCost(A, dim); + + //cost of intersection checks + if (!SSRNoIntersectCheck) { + for (const auto &p : A->getConflicts(L)) { + switch (p.second) + { + case AffAccConflict::NoConflict: + break; //nothing to do + case AffAccConflict::MustNotIntersect: { + AffAcc *B = p.first; + if (vis.find(B) != vis.end()) break; //already handled this conflict when A was B + unsigned dimB = B->loopToDimension(L); + if (accs.find(B) == accs.end()) gain -= getEstExpandCost(B, dimB); + gain -= 4u; //2x ICmpULT, 1 OR, 1 AND + break; + } + case AffAccConflict::Bad: + assert(false && "WARNING: there is a bad conflict for given Accs and L ==> could not expand them here!"); + default: + llvm_unreachable("uknown conflict type"); + } + } + } + + //cost of tcdm checks + if (!SSRNoTCDMCheck) { + gain -= 4u; //2x ICmpULT, 2 AND + } + + int reps = 1; + for (unsigned d = dim; d >= 1u; d--) { //dimensions that are extended + int loopTC = EST_LOOP_TC; + if (A->getRep(d)->getSCEVType() == SCEVTypes::scConstant) + loopTC = cast(A->getRep(d))->getAPInt().getLimitedValue(std::numeric_limits::max()); + reps = std::max(reps * loopTC, reps); //prevent overflows + + //prep for boundcheck cost + contLoops.insert(A->getLoop(d)); + } + gain += EST_MEMOP_COST * reps; //the number of loads/stores that are removed by inserting a stream + + } + + if (!SSRNoBoundCheck) { + gain -= 2 * contLoops.size(); // 1 ICmp, 1 AND per loop + } + + return gain; +} + +///expands AffAcc's in L's preheader 
and inserts TCDM checks, returns ExpandedAffAcc's and writes the final Value* of the checks into Cond +std::vector expandInLoop(const std::vector &accs, const Loop *L, AffineAccess &AAA, Value *&Cond) { + assert(!accs.empty()); + assert(accs.size() <= NUM_SSR); + assert(L); + + LLVM_DEBUG(dbgs()<<"expanding in Loop: "<getHeader()->getNameOrAsOperand()<<" at depth "<getLoopDepth()<<"\n"); + + auto &ctxt = L->getHeader()->getContext(); + IntegerType *i32 = IntegerType::getInt32Ty(ctxt); + Type *i8Ptr = Type::getInt8PtrTy(ctxt); + + Instruction *PhT = L->getLoopPreheader()->getTerminator(); + + //generate Steps, Reps, base addresses, intersect checks, and bound checks + auto exp = AAA.expandAllAt(accs, L, PhT, Cond, i8Ptr, i32, !SSRNoIntersectCheck, !SSRNoBoundCheck); + assert(Cond); + + //TCDM Checks + if (!SSRNoTCDMCheck) { + IRBuilder<> builder(PhT); + for (auto &E : exp) { + Cond = builder.CreateAnd(Cond, GenerateTCDMCheck(E, PhT)); + } + } + + assert(Cond->getType() == Type::getInt1Ty(Cond->getContext()) && "Cond has type bool (i1)"); + + return exp; +} + +///clones from L's preheader to L's exit uses Cond for CBr between clone and non-clone +///then generates the instrinsics for all in exp +void cloneAndSetup(Instruction *PhT, Instruction *ExP, Value *Cond, std::vector &exp) { + assert(exp.size() <= NUM_SSR); + if (exp.size() == 0u) return; + + //generate en/dis range over both loop versions to prevent later runs of this pass to infer streams in the clone version + // ExP = generateSSREnDis(PhT, ExP).second; //TODO: this might be better here + + + if (!isa(Cond)){ //if Cond is not a constant we cannot make the decision at compile time ==> clone whole region for if-else + auto p = cloneRegion(PhT, ExP); + BranchInst *BR = p.first; + ExP = p.second.first; //terminator of exit block that jumps to original ExP + BR->setCondition(Cond); + } else { + //this should never happen, but it means the runtime checks were somehow known at compile time and turned out false: + if(cast(Cond)->getLimitedValue() == 0u) return; + } + + unsigned dmid = 0u; + for (auto &E : exp) { + GenerateSSRSetup(E, dmid++, PhT); + if (SSRBarrier) generateSSRBarrier(ExP, dmid); + } + + generateSSREnDis(PhT, ExP); +} + +//predicate to filter AffAccs +//in accordance with HW limitations, i.e., dimension <= 4, type = double, see #defines used +bool isValid(AffAcc *A, const Loop *L) { + assert(A->isWellFormed(L)); + bool valid = true; + bool write = A->isWrite(); + for (Instruction *I : A->getAccesses()) { + if (write) valid &= CHECK_TYPE(cast(I)->getValueOperand()->getType(), I); + else valid &= CHECK_TYPE(I->getType(), I); + } + valid &= A->loopToDimension(L) <= SSR_MAX_DIM; + return valid; +} + +//should be guaranteed by SimplifyLoops in SSRInferencePass, but the pass says that any guarantees should be rechecked when depended upon. 
+bool isValidLoop(const Loop *L) { + assert(L); + if (!L->getLoopPreheader() || !getSingleExitBlock(L)) return false; + return true; +} + +// collect some information about loop: +// possible streams +// insertion into conflict tree (for mapping to data movers) +bool visitLoop(const Loop *L, DenseMap> &possible, ConflictTree &tree, AffineAccess &AAA, bool isKnownInvalid) { + assert(L); + + //NOTE: cannot return early in this function, as `possible` and `tree` need to be expanded even if L is not suitable for streams + + std::vector accs = AAA.getExpandableAccesses(L, SSRConflictFreeOnly); + + if (isKnownInvalid || !isValidLoop(L)) { + accs.clear(); //make accs empty + isKnownInvalid = true; + } + + std::vector valid; + for (AffAcc *A : accs) { + if (isValid(A, L)) valid.push_back(A); + } + //sort by dimension (with read beeing preferred over write) + auto comp = [L](const AffAcc *A, const AffAcc *B) { + unsigned dimA = A->loopToDimension(L); + unsigned dimB = B->loopToDimension(L); + return dimA < dimB || (dimA == dimB && (!A->isWrite() && B->isWrite())); + }; + std::sort(valid.begin(), valid.end(), comp); + //add possible: + auto &l = possible.insert(std::make_pair(L, std::move(std::vector()))).first->getSecond(); + for (unsigned i = 0u; i < NUM_SSR && i < valid.size(); i++) { + l.push_back(valid[i]); + } + //add to tree: + int gain = getEstGain(l, L, AAA); + LLVM_DEBUG(dbgs()<<"est. gain is "<isOutermost() ? nullptr : L->getParentLoop()); + + if (SSRVerbose) { + for (auto *A : l) { + errs() + <<"potential stream with base addr SCEV " + <<*A->getBaseAddr(L) + <<" of dimension " + <loopToDimension(L) + <<"\n"; + } + if (!l.empty()) errs()<<"With est. gain = "< findLoopsWithSSR(Function &F, LoopInfo &LI) { + DenseSet invalid; + + DenseSet ids; + for (Intrinsic::ID x : riscSSRIntrinsics){ + ids.insert(x); //put intrinsics into set for faster lookup + } + + std::deque> worklist; + DenseSet visUnmarked; + DenseSet visMarked; + worklist.push_back(std::make_pair(&F.getEntryBlock(), false)); + while(!worklist.empty()) { + auto p = worklist.front(); worklist.pop_front(); + BasicBlock *BB = p.first; + bool marked = p.second; + + if (!BB) continue; + if (marked) { + if (visMarked.find(BB) != visMarked.end()) continue; + visMarked.insert(BB); + + //mark all loops containing this Block invalid + const Loop *L = LI.getLoopFor(BB); + while (L) { + invalid.insert(L); + L = L->getParentLoop(); + } + + //go through instructions in block, if there is an ssr_disable() call, remove the marking for the successors of this block + for (Instruction &i : *BB) { + if (isa(i)) { + if (cast(i).getIntrinsicID() == Intrinsic::riscv_ssr_disable) marked = false; + } + if (!marked) break; //early exit + } + + } else { + if (visUnmarked.find(BB) != visUnmarked.end()) continue; + visUnmarked.insert(BB); + + for (Instruction &i : *BB) { + Instruction *I = &i; + if (CallBase *C = dyn_cast(I)) { + if (C->hasFnAttr(SSRFnAttr)) { + LLVM_DEBUG(dbgs()<<"call "<<*C<<" has attribute "< no need to mark the BB + const Loop *L = LI.getLoopFor(BB); + while (L) { + invalid.insert(L); + L = L->getParentLoop(); + } + } + if (IntrinsicInst *II = dyn_cast(C)) { + if (ids.contains(II->getIntrinsicID())) { + LLVM_DEBUG(dbgs()<<"Intrinsic Instr "<<*II<<" calls an SSR intrinsic\n"); + marked = true; //mark this (and thus also all following BBs) + } + } + if (C->isInlineAsm()) { //inline asm may contain ssr setup insts! 
+ LLVM_DEBUG(dbgs()<<"inline asm call "<<*C<<" may contain ssr insts!\n"); + LLVM_DEBUG(C->getType()->dump()); + marked = true; + } + } + } + if (marked) worklist.push_back(std::make_pair(BB, true)); // if now marked, add to queue again + } + + for (BasicBlock *BB2 : successors(BB)) { + worklist.push_back(std::make_pair(BB2, marked)); + } + } + if (!invalid.empty()) LLVM_DEBUG(dbgs()<<"Loops that are invalid bc of SSR\n"); + for (auto l : invalid) { + LLVM_DEBUG(dbgs()<<"header = "<getHeader()->getNameOrAsOperand()<<" at depth = "<getLoopDepth()<<"\n"); + } + + return invalid; +} + +} //end of namespace + +// main "run" of this pass +PreservedAnalyses SSRGenerationPass::run(Function &F, FunctionAnalysisManager &FAM){ + LLVM_DEBUG(dbgs()<<"SSRInference Flags: "); + if (InferSSR) LLVM_DEBUG(dbgs()<<"infer-ssr"); + if (SSRNoIntersectCheck) LLVM_DEBUG(dbgs()<<", ssr-no-intersect-check"); + if (SSRNoBoundCheck) LLVM_DEBUG(dbgs()<<", ssr-no-bound-check"); + if (SSRNoTCDMCheck) LLVM_DEBUG(dbgs()<<", ssr-no-tcdm-check"); + if (SSRBarrier) LLVM_DEBUG(dbgs()<<", ssr-barrier"); + if (SSRNoInline) LLVM_DEBUG(dbgs()<<", ssr-no-inline"); + if (SSRConflictFreeOnly) LLVM_DEBUG(dbgs()<<", ssr-conflict-free-only"); + LLVM_DEBUG(dbgs()<<"\n"); + + if (!InferSSR) return PreservedAnalyses::all(); //if no SSR inference is enabled, we exit early + if (F.hasFnAttribute(SSRFnAttr)) return PreservedAnalyses::all(); //this function already contains streams ==> skip + + AffineAccess &AAA = FAM.getResult(F); //call analysis + + LLVM_DEBUG(dbgs()<<"SSR Generation Pass on function: "<> trees; //keep track of the conflict tree for each top-level loop + DenseMap> bestLoops; //keep track of the best results for each tree + DenseMap> possible; //keep track of the AffAcc's that can be expanded in each loop + DenseMap conds; //keep track of the condition of the run-time check for each loop + DenseMap> exps; //keep track of the expanded AffAcc's for each loop + DenseSet ssrInvalidLoops = findLoopsWithSSR(F, AAA.getLI()); + + for (const Loop *T : toploops){ + ConflictTree &tree = trees.insert(std::make_pair(T, ConflictTree())).first->getSecond(); + + //go through all loops in sub-tree of T to build conflict-tree and find possible expands + std::deque worklist; + worklist.push_back(T); + while (!worklist.empty()) { + const Loop *L = worklist.front(); worklist.pop_front(); + LLVM_DEBUG(dbgs()<<"visiting loop: "<getHeader()->getNameOrAsOperand()<<"\n"); + + visitLoop(L, possible, tree, AAA, ssrInvalidLoops.find(L) != ssrInvalidLoops.end()); + + for (const Loop *x : L->getSubLoops()) worklist.push_back(x); + } + + //find best expands (map best loops to data movers) + auto f = [](unsigned a, unsigned b){ return a + b; }; + std::vector best = tree.findBest(f); + + //expand them + for (const Loop *L : best) { + auto &acc = possible.find(L)->getSecond(); + if (!acc.empty()) { + changed = true; + Value *Cond = nullptr; + auto exp = expandInLoop(acc, L, AAA, Cond); + assert(Cond); + conds.insert(std::make_pair(L, Cond)); + exps.insert(std::make_pair(L, std::move(exp))); + } + } + + bestLoops.insert(std::make_pair(T, std::move(best))); + } + + ///NOTE: as soon as we start cloning (so after this comment), all the analyses are falsified and we do not want to update them + ///because that would falsify the AAA (which we do not want to update because it would find less solutions after the cloning). 
+ ///So all the code that follows does not make use of any of the analyses (except for L->getLoopPreheader & stuff like that which luckily still work) + + for (const Loop *T : toploops) { + std::vector &best = bestLoops.find(T)->getSecond(); + for (const Loop *L : best) { + auto p = conds.find(L); + if (p != conds.end()) { + BasicBlock *Ex = getSingleExitBlock(L); + assert(Ex); + if (SSRVerbose) { + errs() + <<"> Function " + <getHeader()->getParent()->getNameOrAsOperand() + <<": Expanding SSR streams with " + <<(L->getLoopDepth()-1) + <<" containing loops and setup in preheader of loop with header " + <getHeader()->getNameOrAsOperand() + <<"\n"; + } + cloneAndSetup(L->getLoopPreheader()->getTerminator(), &*Ex->getFirstInsertionPt(), p->second, exps.find(L)->getSecond()); + } + } + } + + if (!changed) return PreservedAnalyses::all(); + + F.addFnAttr(StringRef(SSRFnAttr)); //we have inserted a stream, tag accordingly + if (SSRNoInline) F.addFnAttr(Attribute::AttrKind::NoInline); + return PreservedAnalyses::none(); +} \ No newline at end of file diff --git a/llvm/lib/Transforms/SSR/SSRInference.cpp b/llvm/lib/Transforms/SSR/SSRInference.cpp new file mode 100644 index 0000000000000..00f417f2d9c83 --- /dev/null +++ b/llvm/lib/Transforms/SSR/SSRInference.cpp @@ -0,0 +1,77 @@ +//===-- SSRInference.cpp - Infer SSR usage --------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/SSR/SSRInference.h" +#include "llvm/InitializePasses.h" +#include "llvm/Passes/PassBuilder.h" +#include "llvm/Target/TargetMachine.h" + +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/IRBuilder.h" + +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Scalar/ADCE.h" + +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/ScalarEvolutionExpressions.h" +#include "llvm/Analysis/AffineAccessAnalysis.h" +#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" +#include "llvm/Transforms/Utils/FixIrreducible.h" +#include "llvm/Transforms/InstCombine/InstCombine.h" +#include "llvm/Transforms/Scalar/LoopStrengthReduce.h" +#include "llvm/Transforms/Scalar/LoopRotation.h" +#include "llvm/Transforms/Scalar/IndVarSimplify.h" +#include "llvm/Transforms/Scalar/SimplifyCFG.h" +#include "llvm/Transforms/Scalar/LoopFlatten.h" +#include "llvm/Transforms/Scalar/LoopUnrollAndJamPass.h" +#include "llvm/Transforms/Scalar/Reassociate.h" +#include "llvm/Transforms/Scalar/LICM.h" +#include "llvm/Transforms/SSR/SSRGeneration.h" + +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Support/Casting.h" + +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/IntrinsicsRISCV.h" + +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/TinyPtrVector.h" +#include "llvm/ADT/ilist.h" + +#include +#include + +#define DEBUG_TYPE "ssr" + +using namespace llvm; + +PreservedAnalyses SSRInferencePass::run(Function &F, FunctionAnalysisManager &FAM){ + LLVM_DEBUG(dbgs()<<"SSR Inference Pass on function: "<