pulp-platform · ThomasRupf · Mar 27, 2022 · Mar 27, 2022 · Apr 1, 2022 · Apr 5, 2022
diff --git a/.gitignore b/.gitignore
@@ -68,3 +68,8 @@ pythonenv*
 /clang/utils/analyzer/projects/*/RefScanBuildResults
 # automodapi puts generated documentation files here.
 /lldb/docs/python_api/
+
+
+# exclude installation
+build-llvm/*
+install/*
diff --git a/README.md b/README.md
@@ -4,7 +4,7 @@ LLVM 12 with extensions for processors and computer systems of the [PULP platfor
 - [HERO][hero]: mixed-data-model (64-bit + 32-bit) compilation and data sharing; automatic tiling of data structures and insertion of DMA transfers;
 - MemPool: Instruction scheduling model for the MemPool architecture; `Xmempool` extension to allow dynamic instruction tracing;
 - [PULPv2 RISC-V ISA extension (`Xpulpv2`)][hero]: automatic insertion of hardware loops, post-increment memory accesses, and multiply-accumulates; intrinsics, `clang` builtins , and assembly support for all instructions of the extension;
-- [Snitch RISC-V ISA extensions (`Xssr`, `Xfrep`, and `Xdma`)][snitch]: automatic insertion of `frep` hardware loops; intrinsics and `clang` builtins for `Xssr` and `Xdma` extensions; assembly support for all instructions of the extension.
+- [Snitch RISC-V ISA extensions (`Xssr`, `Xfrep`, and `Xdma`)][snitch]: automatic insertion of `frep` hardware loops; intrinsics and `clang` builtins for `Xssr` and `Xdma` extensions; assembly support for all instructions of the extension. NEW: automatic SSR inference.
 
 # HERO and PULPv2 RISC-V ISA Extension Support
 
@@ -16,16 +16,24 @@ Refer to the [HERO repository](https://github.com/pulp-platform/hero) for build
 Refer to [snitch-toolchain-cd](https://github.com/pulp-platform/snitch-toolchain-cd) for build scripts and continuous deployment of pre-built toolchains.
 
 ## Command-line options
+Note that flags that are passed to LLVM through `clang` need to be prefaced with `-mllvm` (use `"SHELL:-mllvm <flag>"` in CMake to prevent removal of repeated `-mllvm`s).
 
 | Flag | Description |
 |---|---|
 | `--mcpu=snitch` | Enables all extensions for Snitch `rv32imafd,xfrep,xssr,xdma` and the Snitch machine model, which is not adapted for Snitch yet |
 | `--debug-only=riscv-sdma` | Enable the debug output of the DMA pseudo instruction expansion pass |
 | `--debug-only=riscv-ssr` | Enable the debug output of the SSR pseudo instruction expansion pass |
 | `--debug-only=snitch-freploops` | Enable the debug output of the FREP loop inference pass |
-| `--ssr-noregmerge` | Disable the SSR register merging in the SSR pseudo instruction expansion pass. Register merging is enabled by default and can be disabled with this flag. |
+| `--ssr-no-regmerge` | Disable the SSR register merging in the SSR pseudo instruction expansion pass. Register merging is enabled by default and can be disabled with this flag. |
 | `--snitch-frep-inference` | Globally enable the FREP inference on all loops in the compiled module. |
-| `--enable-misched=false` | Disable the machine instruction scheduler. Instructions in a complex loop with multiple SSR push or pop instructions on the same data mover may not be rescheduled because the order in which the SSR are accessed is important. |
+| `-infer-ssr` | Enable automatic inference of SSR streams. |
+| `-ssr-no-intersect-check` | Do not generate intersection checks (unsafe). Use the `restrict` keyword instead if possible. |
+| `-ssr-no-tcdm-check` | Assume all data of inferred streams is inside TCDM. |
+| `-ssr-no-bound-check` | Do not generate checks that make sure the inferred stream's access is executed at least once. |
+| `-ssr-conflict-free-only` | Only infer streams if they have no conflicts with other memory accesses. |
+| `-ssr-no-inline` | Prevent functions that contain SSR streams from being inlined. |
+| `-ssr-barrier` | Enable the insertion of a spinning loop that waits for the stream to be done before it is disabled. |
+| `-ssr-verbose` | Write information about inferred streams to `stderr`. |
 
 ## `clang` builtins
 The following `clang` builtins can be used to directly make use of the SSR and DMA extensions.
@@ -189,6 +197,11 @@ void __builtin_ssr_setup_bound_stride_4d(uint32_t DM, uint32_t b, uint32_t s);
 void __builtin_ssr_barrier(uint32_t DM);
 ```
 
+#### SSR Inference Interoperability
+Automatic SSR inference will not infer any streams in an `ssr_enable` to `ssr_disable` region.
+Note that SSR inference currently treats any inline asm block as if it contained an SSR instruction. Thus, it will not infer streams in any loop nest that contains inline asm anywhere.
+
+
 ### SDMA
 
 ```c

diff --git a/cmd_out.txt b/cmd_out.txt
diff --git a/llvm/include/llvm/Analysis/AffineAccessAnalysis.h b/llvm/include/llvm/Analysis/AffineAccessAnalysis.h
@@ -0,0 +1,185 @@
+#include "llvm/IR/PassManager.h"
+#include "llvm/Passes/PassBuilder.h"
+#include "llvm/Passes/PassPlugin.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/DenseMap.h"
+
+#include <iostream>
+#include <vector>
+#include <utility>
+
+namespace llvm {
+
+class AffineAccess;
+class AffineAccessAnalysis;
+class LoopInfo;
+class ScalarEvolution;
+class MemorySSA;
+class MemoryUseOrDef;
+class MemoryDef;
+struct ExpandedAffAcc;
+class DependenceInfo;
+class LoopAccessInfo;
+
+struct LoopRep{ // repetition info of a single loop; NOTE(review): semantics inferred from the declarations only - confirm against the implementation
+private:
+  ScalarEvolution &SE;
+  DominatorTree &DT;
+  const Loop *L;
+  const SCEV *RepSCEV;
+  Value *Rep = nullptr; // presumably caches the expanded RepSCEV - TODO confirm
+  Value *RepPlusOne = nullptr; // presumably caches the expanded RepSCEV + 1 - TODO confirm
+  SmallVector<const Loop *, 4U> containingLoops; // ordered from innermost to outermost
+  unsigned safeExpandBound; // exclusive bound; NOTE(review): presumably an index into containingLoops - confirm
+
+public:
+  /// Constructs the rep for loop \p L; if the loop is well-formed, isAvailable() returns true.
+  LoopRep(const Loop *L, ArrayRef<const Loop *> contLoops, ScalarEvolution &SE, DominatorTree &DT);
+  bool isAvailable() const;
+  bool isOnAllCFPathsOfParentIfExecuted() const;
+  const Loop *getLoop() const;
+  const SCEV *getSCEV() const;
+  const SCEV *getSCEVPlusOne() const;
+  bool isSafeToExpandBefore(const Loop *L) const;
+
+  /// Expands LoopRep::RepSCEV at \p InsertBefore (if nullptr: in the preheader of the loop).
+  Value *expandAt(Type *ty, Instruction *InsertBefore = (Instruction *)nullptr);
+  Value *expandLoopGuard(Instruction *InsertBefore = (Instruction *)nullptr);
+};
+
+enum AffAccConflict { NoConflict = 0, MustNotIntersect = 1, Bad = 10}; // conflict kind recorded between two AffAccs (see AffAcc::addConflict / getConflict); NOTE(review): MustNotIntersect presumably means the two accesses need an intersection check - confirm
+
+struct AffAcc{ // one affine memory access: base address, step and rep per enclosing loop (see the members below)
+private:
+  ScalarEvolution &SE;
+  MemoryUseOrDef *MA;
+  SmallVector<Instruction *, 2U> accesses;       // the load/store (or call) instructions
+  SmallVector<const SCEV *, 4U> baseAddresses;   // base addresses, depending on the loop
+  SmallVector<const SCEV *, 4U> steps;           // step per loop (0 if loop-invariant)
+  SmallVector<LoopRep *, 4U> reps;               // loop reps; NOTE(review): raw pointers, ownership is not visible in this header
+  SmallVector<const Loop *, 4U> containingLoops; // ordered from innermost to outermost
+  DenseMap<AffAcc *, std::pair<const Loop *, AffAccConflict>> conflicts;
+  void findSteps(const SCEV *A, const SCEV *Factor, unsigned loop);
+  AffAccConflict fromConflictPair(const detail::DenseMapPair<AffAcc*, std::pair<const Loop*, AffAccConflict>> &p, const Loop *L) const;
+
+public:
+  AffAcc() = delete;
+  /// Immediately copies the contents of \p accesses and \p containingLoops.
+  AffAcc(ArrayRef<Instruction *> accesses, const SCEV *Addr, MemoryUseOrDef *MA, ArrayRef<const Loop *> containingLoops, ScalarEvolution &SE);
+  ArrayRef<Instruction *> getAccesses() const;
+  Value *getAddrValue() const;
+  bool isWrite() const;
+  int getMaxDimension() const; 
+  const Loop *getDeepestMalformed() const;
+  bool isWellFormed(unsigned dimension) const;
+  bool isWellFormed(const Loop *L) const;
+  bool canExpandBefore(const Loop *L) const;
+  void dump() const;
+  void dumpInLoop(const Loop *L) const;
+  unsigned loopToDimension(const Loop *L) const;
+  const SCEV *getBaseAddr(unsigned dim) const;
+  const SCEV *getBaseAddr(const Loop *L) const;
+  const SCEV *getStep(unsigned dim) const;
+  const SCEV *getRep(unsigned dim) const;
+  const Loop *getLoop(unsigned dim) const;
+  ArrayRef<const Loop *> getContainingLoops() const;
+  AffAccConflict getConflict(AffAcc *A, const Loop *L) const;
+  std::vector<std::pair<AffAcc*, AffAccConflict>> getConflicts(const Loop *L) const;
+
+  MemoryUseOrDef *getMemoryAccess();
+  void addConflict(AffAcc *A, const Loop *StartL, AffAccConflict kind);
+  bool promote(LoopRep *LR); // NOTE: does not check whether this access is on all control-flow (CF) paths of LR->getLoop()
+  // Code generation:
+  Value *expandBaseAddr(unsigned dimension, Type *ty, Instruction *InsertBefore = (Instruction *)nullptr);
+  Value *expandStep(unsigned dimension, Type *ty, Instruction *InsertBefore = (Instruction *)nullptr);
+  Value *expandRep(unsigned dimension, Type *ty, Instruction *InsertBefore = (Instruction *)nullptr);
+  ExpandedAffAcc expandAt(const Loop *L, Instruction *Point, Type *PtrTy, IntegerType *ParamTy); 
+};
+
+struct MemDep { // helper for memory-dependence queries; holds references to MemorySSA and the alias-analysis results
+private:
+  MemorySSA &MSSA;
+  AAResults &AA;
+  bool alias(Value *A, Value *B);
+  bool alias(MemoryUseOrDef *A, MemoryUseOrDef *B);
+
+public:
+  MemDep(MemorySSA &MSSA, AAResults &AA) : MSSA(MSSA), AA(AA) {} // only stores the two references
+  DenseSet<MemoryUseOrDef *> findClobbers(MemoryUseOrDef *MA);
+  DenseSet<MemoryUseOrDef *> findClobberUsers(MemoryDef *MA);
+};
+
+struct ExpandedAffAcc { // immutable bundle of IR values for one AffAcc (every member is const); return type of AffAcc::expandAt
+public:
+  AffAcc *const Access;
+  Value *const Addr;
+  const SmallVector<Value *, 3U> Steps;
+  const SmallVector<Value *, 3U> Reps;
+  const SmallVector<Value *, 3U> Ranges;
+  const SmallVector<Value *, 3U> PrefixSumRanges;
+  Value *const LowerBound;
+  Value *const UpperBound;
+  unsigned getDimension() const { return Steps.size(); } // number of dimensions, taken from Steps.size(); NOTE(review): nothing here enforces that Reps/Ranges/PrefixSumRanges have the same length
+  ExpandedAffAcc (AffAcc *A, Value *Addr, ArrayRef<Value *> Steps, ArrayRef<Value *> Reps, 
+    ArrayRef<Value *> Ranges, ArrayRef<Value *> PSRanges, Value *LowerBound, Value *UpperBound) 
+    : Access(A), Addr(Addr), Steps(Steps.begin(), Steps.end()), Reps(Reps.begin(), Reps.end()), 
+      Ranges(Ranges.begin(), Ranges.end()), PrefixSumRanges(PSRanges.begin(), PSRanges.end()), 
+      LowerBound(LowerBound), UpperBound(UpperBound) { } // copies each ArrayRef into the corresponding SmallVector member
+};
+
+class AffineAccess{ // result type of AffineAccessAnalysis; keeps references to the analyses passed to its constructor
+private:
+  ScalarEvolution &SE;
+  DominatorTree &DT;
+  LoopInfo &LI;
+  MemorySSA &MSSA;
+  AAResults &AA;
+  DependenceInfo &DI;
+  MemDep MD;
+  DenseMap<MemoryUseOrDef *, AffAcc *> access; // NOTE(review): raw AffAcc pointers - ownership/cleanup is not visible in this header; confirm they are freed
+  DenseMap<const Loop *, LoopRep *> reps; // NOTE(review): raw LoopRep pointers - same ownership question as above
+  DenseMap<const Loop *, SmallVector<AffAcc *, 3u>> promotedAccesses;
+  DenseMap<const Loop *, SmallVector<AffAcc *, 2u>> expandableAccesses;
+
+  std::unique_ptr<std::vector<AffAcc *>> analyze(Loop *Parent, ArrayRef<const Loop *> loopPath);
+  void addAllConflicts(const std::vector<AffAcc *> &all);
+  AffAccConflict calcRWConflict(AffAcc *Read, AffAcc *Write, const Loop *L) const;
+  std::pair<AffAccConflict, const Loop*> calcConflict(AffAcc *A, AffAcc *B) const;
+
+public:
+  AffineAccess(Function &F, ScalarEvolution &SE, DominatorTree &DT, LoopInfo &LI, MemorySSA &MSSA, AAResults &AA, DependenceInfo &DI);
+  AffineAccess() = delete;
+  bool accessPatternsMatch(const AffAcc *A, const AffAcc *B, const Loop *L) const;
+  bool accessPatternsAndAddressesMatch(const AffAcc *A, const AffAcc *B, const Loop *L) const;
+  ScalarEvolution &getSE() const;
+  DominatorTree &getDT() const;
+  LoopInfo &getLI() const;
+  MemorySSA &getMSSA() const;
+  AAResults &getAA() const;
+  DependenceInfo &getDI() const;
+  SmallVector<Loop *, 4U> getLoopsInPreorder() const;
+
+  std::vector<AffAcc *> getExpandableAccesses(const Loop *L, bool conflictFreeOnly = false);
+  std::vector<ExpandedAffAcc> expandAllAt(ArrayRef<AffAcc *> Accs, const Loop *L, Instruction *Point, 
+    Value *&BoundCheck, Type *PtrTy, IntegerType *ParamTy, bool conflictChecks = true, bool repChecks = false); // BoundCheck is an in/out reference parameter; NOTE(review): presumably receives the generated run-time check - confirm
+};
+
+class AffineAccessAnalysis : public AnalysisInfoMixin<AffineAccessAnalysis> { // new-pass-manager function analysis whose result type is AffineAccess
+  friend AnalysisInfoMixin<AffineAccessAnalysis>; // lets the mixin reach the private Key below
+  static AnalysisKey Key;
+
+public:
+  using Result = AffineAccess;
+  Result run(Function &F, FunctionAnalysisManager &AM);
+};
+
+// Function pass that will be invocable via opt. It is a regular pass (run() returns PreservedAnalyses and there is no AnalysisKey/Result), so it derives from PassInfoMixin rather than AnalysisInfoMixin.
+class AffineAccessAnalysisPass : public PassInfoMixin<AffineAccessAnalysisPass> {
+public:
+  PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+};
+
+} // namespace llvm
diff --git a/llvm/include/llvm/IR/IntrinsicsRISCV.td b/llvm/include/llvm/IR/IntrinsicsRISCV.td
@@ -1434,19 +1434,18 @@ let TargetPrefix = "riscv" in {
         RISCVSSRIntrinsic;
 
   // The `Throws` attribute ensures that the push/pop don't get removed from loops
-  // by the LICM pass
-  // TODO: Is there another way to do this?
+  // by the LICM pass ==> no longer needed: LICM is only a problem if the intrinsic is read-only, so pop is no longer marked IntrReadMem (NOTE(review): the original note said pop becomes read+write, the default, but it is declared IntrWriteMem - confirm)
   def int_riscv_ssr_push
       : GCCBuiltin<"__builtin_ssr_push">,
         Intrinsic<[],
                   [llvm_i32_ty, llvm_double_ty],
-                  [IntrWriteMem, IntrHasSideEffects, Throws, ImmArg<ArgIndex<0>>]>,
+                  [IntrWriteMem, ImmArg<ArgIndex<0>>]>,
         RISCVSSRIntrinsic;
   def int_riscv_ssr_pop
       : GCCBuiltin<"__builtin_ssr_pop">,
         Intrinsic<[llvm_double_ty],
                   [llvm_i32_ty],
-                  [IntrReadMem, IntrHasSideEffects, Throws, ImmArg<ArgIndex<0>>]>,
+                  [ImmArg<ArgIndex<0>>, IntrWriteMem]>, // marked as writing memory (instead of `Throws`) to avoid LICM; NOTE(review): this comment originally said ReadWrite, but IntrWriteMem declares write-only - confirm intent
         RISCVSSRIntrinsic;
 
   def int_riscv_ssr_enable

diff --git a/llvm/include/llvm/Transforms/SSR/SSRGeneration.h b/llvm/include/llvm/Transforms/SSR/SSRGeneration.h
@@ -0,0 +1,23 @@
+//===-- SSRGeneration.h - SSR generation pass -------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_SSR_SSRGENERATION_H
+#define LLVM_TRANSFORMS_SSR_SSRGENERATION_H
+
+#include "llvm/IR/PassManager.h"
+
+namespace llvm {
+
+class SSRGenerationPass : public PassInfoMixin<SSRGenerationPass>{ // function-level pass for the new pass manager (PassInfoMixin + run(Function &, ...))
+public:
+  PreservedAnalyses run(Function &F, FunctionAnalysisManager &FAM);
+};
+
+} // namespace llvm
+
+#endif // LLVM_TRANSFORMS_SSR_SSRGENERATION_H
diff --git a/llvm/include/llvm/Transforms/SSR/SSRInference.h b/llvm/include/llvm/Transforms/SSR/SSRInference.h
@@ -0,0 +1,23 @@
+//===-- SSRInference.h - Infer SSR usage ------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_SSR_SSRINFERENCE_H
+#define LLVM_TRANSFORMS_SSR_SSRINFERENCE_H
+
+#include "llvm/IR/PassManager.h"
+
+namespace llvm {
+
+class SSRInferencePass : public PassInfoMixin<SSRInferencePass> { // function-level pass for the new pass manager (PassInfoMixin + run(Function &, ...))
+public:
+  PreservedAnalyses run(Function &F, FunctionAnalysisManager &FAM);
+};
+
+} // namespace llvm
+
+#endif // LLVM_TRANSFORMS_SSR_SSRINFERENCE_H