diff --git a/.gitignore b/.gitignore
index b33fbbf932379..29a57901174ad 100644
--- a/.gitignore
+++ b/.gitignore
@@ -68,3 +68,8 @@ pythonenv*
 /clang/utils/analyzer/projects/*/RefScanBuildResults
 # automodapi puts generated documentation files here.
 /lldb/docs/python_api/
+
+
+# exclude installation
+build-llvm/*
+install/*
diff --git a/README.md b/README.md
index 793180dc80b7b..be75cdf8c8569 100644
--- a/README.md
+++ b/README.md
@@ -4,7 +4,7 @@ LLVM 12 with extensions for processors and computer systems of the [PULP platfor
 - [HERO][hero]: mixed-data-model (64-bit + 32-bit) compilation and data sharing; automatic tiling of data structures and insertion of DMA transfers;
 - MemPool: Instruction scheduling model for the MemPool architecture; `Xmempool` extension to allow dynamic instruction tracing;
 - [PULPv2 RISC-V ISA extension (`Xpulpv2`)][hero]: automatic insertion of hardware loops, post-increment memory accesses, and multiply-accumulates; intrinsics, `clang` builtins, and assembly support for all instructions of the extension;
-- [Snitch RISC-V ISA extensions (`Xssr`, `Xfrep`, and `Xdma`)][snitch]: automatic insertion of `frep` hardware loops; intrinsics and `clang` builtins for `Xssr` and `Xdma` extensions; assembly support for all instructions of the extension.
+- [Snitch RISC-V ISA extensions (`Xssr`, `Xfrep`, and `Xdma`)][snitch]: automatic insertion of `frep` hardware loops; intrinsics and `clang` builtins for `Xssr` and `Xdma` extensions; assembly support for all instructions of the extension. NEW: automatic SSR inference.
 
 # HERO and PULPv2 RISC-V ISA Extension Support
@@ -16,6 +16,7 @@ Refer to the [HERO repository](https://github.com/pulp-platform/hero) for build
 Refer to [snitch-toolchain-cd](https://github.com/pulp-platform/snitch-toolchain-cd) for build scripts and continuous deployment of pre-built toolchains.
 
 ## Command-line options
+Note that flags passed to LLVM through `clang` need to be prefixed with `-mllvm` (use `"SHELL:-mllvm <flag>"` in CMake to prevent de-duplication of the repeated `-mllvm`s).
 
 | Flag | Description |
 |---|---|
@@ -23,9 +24,16 @@
 | `--debug-only=riscv-sdma` | Enable the debug output of the DMA pseudo instruction expansion pass |
 | `--debug-only=riscv-ssr` | Enable the debug output of the SSR pseudo instruction expansion pass |
 | `--debug-only=snitch-freploops` | Enable the debug output of the FREP loop inference pass |
-| `--ssr-noregmerge` | Disable the SSR register merging in the SSR pseudo instruction expansion pass. Register merging is enabled by default and can be disabled with this flag. |
+| `--ssr-no-regmerge` | Disable SSR register merging in the SSR pseudo instruction expansion pass. Register merging is enabled by default. |
 | `--snitch-frep-inference` | Globally enable the FREP inference on all loops in the compiled module. |
-| `--enable-misched=false` | Disable the machine instruction scheduler. Instructions in a complex loop with multiple SSR push or pop instructions on the same data mover may not be rescheduled because the order in which the SSR are accessed is important. |
+| `-infer-ssr` | Enable automatic inference of SSR streams. |
+| `-ssr-no-intersect-check` | Do not generate intersection checks (unsafe). Use the `restrict` keyword instead if possible. |
+| `-ssr-no-tcdm-check` | Assume all data of inferred streams is inside the TCDM. |
+| `-ssr-no-bound-check` | Do not generate checks that make sure the inferred stream's access is executed at least once. |
+| `-ssr-conflict-free-only` | Only infer streams if they have no conflicts with other memory accesses. |
+| `-ssr-no-inline` | Prevent functions that contain SSR streams from being inlined. |
+| `-ssr-barrier` | Insert a spinning loop that waits for the stream to be done before it is disabled. |
+| `-ssr-verbose` | Write information about inferred streams to `stderr`. |
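For instance, a kernel of the following shape is what `-infer-ssr` looks for (an illustrative sketch; function and parameter names are made up). The `restrict` qualifiers assert that the two streams cannot overlap, which lets the inference drop the conservative run-time intersection checks without resorting to `-ssr-no-intersect-check`:

```c
// Both accesses are affine in i (constant step, loop-invariant trip
// count), so each can be mapped to an SSR data mover.
void axpy(int n, double a, const double *restrict x, double *restrict y) {
  for (int i = 0; i < n; ++i)
    y[i] += a * x[i];
}
```

Compiled, e.g., with `clang -O2 -mllvm -infer-ssr` on a Snitch target (exact target flags depend on the toolchain setup).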
 
 ## `clang` builtins
 The following `clang` builtins can be used to directly make use of the SSR and DMA extensions.
@@ -189,6 +197,11 @@ void __builtin_ssr_setup_bound_stride_4d(uint32_t DM, uint32_t b, uint32_t s);
 void __builtin_ssr_barrier(uint32_t DM);
 ```
+
+#### SSR Inference Interoperability
+Automatic SSR inference will not infer any streams inside an `ssr_enable` to `ssr_disable` region.
+Note that SSR inference currently treats any inline asm block as if it contained an SSR instruction, so it will not infer streams in any loop nest that contains inline asm.
+
+
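As an illustration of these rules (a sketch assuming the enable/disable builtins are spelled `__builtin_ssr_enable`/`__builtin_ssr_disable`; it only compiles with a toolchain that provides the `__builtin_ssr_*` family):

```c
double mixed(int n, const double *restrict x, double *restrict y) {
  double acc = 0.0;
  __builtin_ssr_enable();            // manually managed region: no inference in here
  for (int i = 0; i < n; ++i)
    acc += __builtin_ssr_pop(0);     // data mover 0 configured beforehand
  __builtin_ssr_disable();
  for (int i = 0; i < n; ++i)        // outside the region: eligible for inference,
    y[i] = 2.0 * x[i];               // as long as no inline asm appears in the nest
  return acc;
}
```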
 ### SDMA
 
 ```c
diff --git a/cmd_out.txt b/cmd_out.txt
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/llvm/include/llvm/Analysis/AffineAccessAnalysis.h b/llvm/include/llvm/Analysis/AffineAccessAnalysis.h
new file mode 100644
index 0000000000000..a50d68af2b193
--- /dev/null
+++ b/llvm/include/llvm/Analysis/AffineAccessAnalysis.h
@@ -0,0 +1,185 @@
+#include "llvm/IR/PassManager.h"
+#include "llvm/Passes/PassBuilder.h"
+#include "llvm/Passes/PassPlugin.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/DenseMap.h"
+
+#include <memory>
+#include <utility>
+#include <vector>
+
+namespace llvm {
+
+class AffineAccess;
+class AffineAccessAnalysis;
+class LoopInfo;
+class ScalarEvolution;
+class MemorySSA;
+class MemoryUseOrDef;
+class MemoryDef;
+struct ExpandedAffAcc;
+class DependenceInfo;
+class LoopAccessInfo;
+
+struct LoopRep {
+private:
+  ScalarEvolution &SE;
+  DominatorTree &DT;
+  const Loop *L;
+  const SCEV *RepSCEV;
+  Value *Rep = nullptr;
+  Value *RepPlusOne = nullptr;
+  SmallVector<const Loop *> containingLoops; //from inner- to outermost
+  unsigned safeExpandBound; //exclusive bound
+
+public:
+  ///constructs the rep for this loop; if the loop is well-formed, isAvailable() will return true
+  LoopRep(const Loop *L, ArrayRef<const Loop *> contLoops, ScalarEvolution &SE, DominatorTree &DT);
+  bool isAvailable() const;
+  bool isOnAllCFPathsOfParentIfExecuted() const;
+  const Loop *getLoop() const;
+  const SCEV *getSCEV() const;
+  const SCEV *getSCEVPlusOne() const;
+  bool isSafeToExpandBefore(const Loop *L) const;
+
+  ///expands LoopRep::RepSCEV at InsertBefore (if nullptr, in the preheader of the loop)
+  Value *expandAt(Type *ty, Instruction *InsertBefore = nullptr);
+  Value *expandLoopGuard(Instruction *InsertBefore = nullptr);
+};
+
+enum AffAccConflict { NoConflict = 0, MustNotIntersect = 1, Bad = 10 };
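To make the terms concrete: an `AffAcc` describes a (square) affine access as a base address plus one (step, rep) pair per loop dimension, where the rep corresponds to the loop's backedge-taken count. A hypothetical 2-D example in C:

```c
// addr(i, j) = A + (i * LDA + j) * sizeof(double)
//   dim 1 (inner loop j): step = sizeof(double),       rep = m - 1
//   dim 2 (outer loop i): step = LDA * sizeof(double), rep = n - 1
void scale2d(int n, int m, int LDA, double *A) {
  for (int i = 0; i < n; ++i)
    for (int j = 0; j < m; ++j)
      A[i * LDA + j] *= 2.0;
}
```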
+
+struct AffAcc {
+private:
+  ScalarEvolution &SE;
+  MemoryUseOrDef *MA;
+  SmallVector<Instruction *> accesses; //the load/store (or call) instructions
+  SmallVector<const SCEV *> baseAddresses; //base addresses depending on loop
+  SmallVector<const SCEV *> steps; //steps per loop (0 if loop-invariant)
+  SmallVector<LoopRep *> reps; //loop reps
+  SmallVector<const Loop *> containingLoops; //from inner- to outermost
+  DenseMap<AffAcc *, std::pair<const Loop *, AffAccConflict>> conflicts;
+  void findSteps(const SCEV *A, const SCEV *Factor, unsigned loop);
+  AffAccConflict fromConflictPair(const detail::DenseMapPair<AffAcc *, std::pair<const Loop *, AffAccConflict>> &p, const Loop *L) const;
+
+public:
+  AffAcc() = delete;
+  ///immediately copies the contents of accesses and containingLoops
+  AffAcc(ArrayRef<Instruction *> accesses, const SCEV *Addr, MemoryUseOrDef *MA, ArrayRef<const Loop *> containingLoops, ScalarEvolution &SE);
+  ArrayRef<Instruction *> getAccesses() const;
+  Value *getAddrValue() const;
+  bool isWrite() const;
+  int getMaxDimension() const;
+  const Loop *getDeepestMalformed() const;
+  bool isWellFormed(unsigned dimension) const;
+  bool isWellFormed(const Loop *L) const;
+  bool canExpandBefore(const Loop *L) const;
+  void dump() const;
+  void dumpInLoop(const Loop *L) const;
+  unsigned loopToDimension(const Loop *L) const;
+  const SCEV *getBaseAddr(unsigned dim) const;
+  const SCEV *getBaseAddr(const Loop *L) const;
+  const SCEV *getStep(unsigned dim) const;
+  const SCEV *getRep(unsigned dim) const;
+  const Loop *getLoop(unsigned dim) const;
+  ArrayRef<const Loop *> getContainingLoops() const;
+  AffAccConflict getConflict(AffAcc *A, const Loop *L) const;
+  std::vector<std::pair<AffAcc *, AffAccConflict>> getConflicts(const Loop *L) const;
+
+  MemoryUseOrDef *getMemoryAccess();
+  void addConflict(AffAcc *A, const Loop *StartL, AffAccConflict kind);
+  bool promote(LoopRep *LR); ///does not check whether it is on all CF-paths for LR->getLoop()
+  ///code gen:
+  Value *expandBaseAddr(unsigned dimension, Type *ty, Instruction *InsertBefore = nullptr);
+  Value *expandStep(unsigned dimension, Type *ty, Instruction *InsertBefore = nullptr);
+  Value *expandRep(unsigned dimension, Type *ty, Instruction *InsertBefore = nullptr);
+  ExpandedAffAcc expandAt(const Loop *L, Instruction *Point, Type *PtrTy, IntegerType *ParamTy);
+};
+
+struct MemDep {
+private:
+  MemorySSA &MSSA;
+  AAResults &AA;
+  bool alias(Value *A, Value *B);
+  bool alias(MemoryUseOrDef *A, MemoryUseOrDef *B);
+
+public:
+  MemDep(MemorySSA &MSSA, AAResults &AA) : MSSA(MSSA), AA(AA) {}
+  DenseSet<MemoryDef *> findClobbers(MemoryUseOrDef *MA);
+  DenseSet<MemoryUseOrDef *> findClobberUsers(MemoryDef *MA);
+};
+
+struct ExpandedAffAcc {
+public:
+  AffAcc *const Access;
+  Value *const Addr;
+  const SmallVector<Value *> Steps;
+  const SmallVector<Value *> Reps;
+  const SmallVector<Value *> Ranges;
+  const SmallVector<Value *> PrefixSumRanges;
+  Value *const LowerBound;
+  Value *const UpperBound;
+  unsigned getDimension() const { return Steps.size(); } //returns the nr of steps/reps/etc... there are
+  ExpandedAffAcc(AffAcc *A, Value *Addr, ArrayRef<Value *> Steps, ArrayRef<Value *> Reps,
+                 ArrayRef<Value *> Ranges, ArrayRef<Value *> PSRanges, Value *LowerBound, Value *UpperBound)
+    : Access(A), Addr(Addr), Steps(Steps.begin(), Steps.end()), Reps(Reps.begin(), Reps.end()),
+      Ranges(Ranges.begin(), Ranges.end()), PrefixSumRanges(PSRanges.begin(), PSRanges.end()),
+      LowerBound(LowerBound), UpperBound(UpperBound) { }
+};
+
+class AffineAccess {
+private:
+  ScalarEvolution &SE;
+  DominatorTree &DT;
+  LoopInfo &LI;
+  MemorySSA &MSSA;
+  AAResults &AA;
+  DependenceInfo &DI;
+  MemDep MD;
+  DenseMap<MemoryUseOrDef *, AffAcc *> access;
+  DenseMap<const Loop *, LoopRep *> reps;
+  DenseMap<const Loop *, SmallVector<AffAcc *>> promotedAccesses;
+  DenseMap<const Loop *, SmallVector<AffAcc *>> expandableAccesses;
+
+  std::unique_ptr<std::vector<AffAcc *>> analyze(Loop *Parent, ArrayRef<const Loop *> loopPath);
+  void addAllConflicts(const std::vector<AffAcc *> &all);
+  AffAccConflict calcRWConflict(AffAcc *Read, AffAcc *Write, const Loop *L) const;
+  std::pair<AffAccConflict, const Loop *> calcConflict(AffAcc *A, AffAcc *B) const;
+
+public:
+  AffineAccess(Function &F, ScalarEvolution &SE, DominatorTree &DT, LoopInfo &LI, MemorySSA &MSSA, AAResults &AA, DependenceInfo &DI);
+  AffineAccess() = delete;
+  bool accessPatternsMatch(const AffAcc *A, const AffAcc *B, const Loop *L) const;
+  bool accessPatternsAndAddressesMatch(const AffAcc *A, const AffAcc *B, const Loop *L) const;
+  ScalarEvolution &getSE() const;
+  DominatorTree &getDT() const;
+  LoopInfo &getLI() const;
+  MemorySSA &getMSSA() const;
+  AAResults &getAA() const;
+  DependenceInfo &getDI() const;
+  SmallVector<Loop *, 4> getLoopsInPreorder() const;
+
+  std::vector<AffAcc *> getExpandableAccesses(const Loop *L, bool conflictFreeOnly = false);
+  std::vector<ExpandedAffAcc> expandAllAt(ArrayRef<AffAcc *> Accs, const Loop *L, Instruction *Point,
+                                          Value *&BoundCheck, Type *PtrTy, IntegerType *ParamTy, bool conflictChecks = true, bool repChecks = false);
+};
+
+class AffineAccessAnalysis : public AnalysisInfoMixin<AffineAccessAnalysis> {
+  friend AnalysisInfoMixin<AffineAccessAnalysis>;
+  static AnalysisKey Key;
+
+public:
+  using Result = AffineAccess;
+  Result run(Function &F, FunctionAnalysisManager &AM);
+};
+
+// This is the analysis pass that will be invocable via opt
+class AffineAccessAnalysisPass : public AnalysisInfoMixin<AffineAccessAnalysisPass> {
+public:
+  PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+};
+
+} // namespace llvm
\ No newline at end of file
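The `BoundCheck` value produced by `expandAllAt` ANDs together per-conflict disjointness tests over the address ranges touched by each stream. In C terms, the check emitted for one `MustNotIntersect` pair looks roughly like this (a sketch of the generated logic, not code from the patch):

```c
#include <stdbool.h>
#include <stdint.h>

// Two streams with touched address ranges [lbA, ubA] and [lbB, ubB] may
// run only if the ranges do not overlap; this mirrors the
// CreateICmpULT/CreateOr pair emitted in expandAllAt.
static bool no_intersect(uintptr_t lbA, uintptr_t ubA,
                         uintptr_t lbB, uintptr_t ubB) {
  return (ubA < lbB) || (ubB < lbA);
}
```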
diff --git a/llvm/include/llvm/IR/IntrinsicsRISCV.td b/llvm/include/llvm/IR/IntrinsicsRISCV.td
index 835a535a9be48..d41c10ff1fe56 100644
--- a/llvm/include/llvm/IR/IntrinsicsRISCV.td
+++ b/llvm/include/llvm/IR/IntrinsicsRISCV.td
@@ -1434,19 +1434,18 @@ let TargetPrefix = "riscv" in {
       RISCVSSRIntrinsic;
 
   // The `Throws` attribute ensures that the push/pop don't get removed from loops
-  // by the LICM pass
-  // TODO: Is there another way to do this?
+  // by the LICM pass ==> not needed; LICM is only a problem if the intrinsic is read-only, so model the pop as read+write (the default)
   def int_riscv_ssr_push
     : GCCBuiltin<"__builtin_ssr_push">,
       Intrinsic<[], [llvm_i32_ty, llvm_double_ty],
-                [IntrWriteMem, IntrHasSideEffects, Throws, ImmArg<ArgIndex<0>>]>,
+                [IntrWriteMem, ImmArg<ArgIndex<0>>]>,
       RISCVSSRIntrinsic;
   def int_riscv_ssr_pop
     : GCCBuiltin<"__builtin_ssr_pop">,
       Intrinsic<[llvm_double_ty], [llvm_i32_ty],
-                [IntrReadMem, IntrHasSideEffects, Throws, ImmArg<ArgIndex<0>>]>,
+                [ImmArg<ArgIndex<0>>, IntrWriteMem]>, //use read+write instead of Throws to avoid LICM
       RISCVSSRIntrinsic;
   def int_riscv_ssr_enable
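The reasoning behind the attribute change: LICM may hoist a call that is marked read-only and non-throwing out of a loop, but every `ssr_pop` must execute once per iteration; modeling the pop as read+write keeps it in place without the `Throws` workaround. A sketch of the pattern that must not be hoisted (builtin semantics as documented in the README):

```c
// Hoisting the pop out of the loop would consume a single stream element
// instead of n of them; the read+write memory attributes forbid that.
double reduce(int n) {
  double acc = 0.0;
  for (int i = 0; i < n; ++i)
    acc += __builtin_ssr_pop(0); // one element per iteration from DM 0
  return acc;
}
```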
diff --git a/llvm/include/llvm/Transforms/SSR/SSRGeneration.h b/llvm/include/llvm/Transforms/SSR/SSRGeneration.h
new file mode 100644
index 0000000000000..20262f3016d32
--- /dev/null
+++ b/llvm/include/llvm/Transforms/SSR/SSRGeneration.h
@@ -0,0 +1,23 @@
+//===-- SSRGeneration.h - Generate SSR streams ------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_SSR_SSRGENERATION_H
+#define LLVM_TRANSFORMS_SSR_SSRGENERATION_H
+
+#include "llvm/IR/PassManager.h"
+
+namespace llvm {
+
+class SSRGenerationPass : public PassInfoMixin<SSRGenerationPass> {
+public:
+  PreservedAnalyses run(Function &F, FunctionAnalysisManager &FAM);
+};
+
+} // namespace llvm
+
+#endif // LLVM_TRANSFORMS_SSR_SSRGENERATION_H
diff --git a/llvm/include/llvm/Transforms/SSR/SSRInference.h b/llvm/include/llvm/Transforms/SSR/SSRInference.h
new file mode 100644
index 0000000000000..3a95c68da2fce
--- /dev/null
+++ b/llvm/include/llvm/Transforms/SSR/SSRInference.h
@@ -0,0 +1,23 @@
+//===-- SSRInference.h - Infer SSR usage ------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_SSR_SSRINFERENCE_H
+#define LLVM_TRANSFORMS_SSR_SSRINFERENCE_H
+
+#include "llvm/IR/PassManager.h"
+
+namespace llvm {
+
+class SSRInferencePass : public PassInfoMixin<SSRInferencePass> {
+public:
+  PreservedAnalyses run(Function &F, FunctionAnalysisManager &FAM);
+};
+
+} // namespace llvm
+
+#endif // LLVM_TRANSFORMS_SSR_SSRINFERENCE_H
diff --git a/llvm/lib/Analysis/AffineAccessAnalysis.cpp b/llvm/lib/Analysis/AffineAccessAnalysis.cpp
new file mode 100644
index 0000000000000..6b5a4220eef30
--- /dev/null
+++ b/llvm/lib/Analysis/AffineAccessAnalysis.cpp
@@ -0,0 +1,1263 @@
+//===-- AffineAccessAnalysis.cpp - find prefetchable square affine accesses ===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/AffineAccessAnalysis.h"
+
+#include "llvm/IR/PassManager.h"
+#include "llvm/Passes/PassBuilder.h"
+#include "llvm/Passes/PassPlugin.h"
+#include "llvm/Support/raw_ostream.h"
+
+#include "llvm/InitializePasses.h"
+
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/IntrinsicsRISCV.h"
+#include "llvm/IR/Dominators.h"
+
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
+#include "llvm/Transforms/Utils/LoopVersioning.h"
+
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/MemorySSA.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AliasAnalysisEvaluator.h"
+#include "llvm/Analysis/AliasSetTracker.h"
+#include "llvm/Analysis/DependenceAnalysis.h"
+#include "llvm/Analysis/LoopAccessAnalysis.h"
+
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/FormatVariadic.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Compiler.h"
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/TinyPtrVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/ilist.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/DenseMap.h"
+
+#include <deque>
+#include <memory>
+#include <set>
+#include <vector>
+
+#define DEBUG_TYPE "ssr"
+
+using namespace llvm;
+
+//================== AffineAccess, helper functions =========================================
+
+namespace {
+
+//collects the set of unknown values in a SCEV
+struct SCEVUnknownSetFinder {
+  DenseSet<Value *> values;
+  // return true to follow this node.
+  bool follow(const SCEV *S) {
+    if (S->getSCEVType() == SCEVTypes::scUnknown) {
+      values.insert(cast<SCEVUnknown>(S)->getValue());
+    }
+    return true; //always true
+  }
+  // return true to terminate the search.
+  bool isDone() { return false; /*continue forever*/ }
+};
+
+//finds whether two SCEVs share unknown values
+bool shareValues(const SCEV *A, const SCEV *B) {
+  SCEVUnknownSetFinder finderA;
+  SCEVTraversal<SCEVUnknownSetFinder> trA(finderA);
+  trA.visitAll(A);
+  SCEVUnknownSetFinder finderB;
+  SCEVTraversal<SCEVUnknownSetFinder> trB(finderB);
+  trB.visitAll(B);
+  bool shareValues = false;
+  for (Value *V : finderA.values) {
+    for (Value *W : finderB.values) {
+      shareValues |= V == W;
+    }
+  }
+  return shareValues;
+}
+
+//checks whether a SCEV contains the SCEVCouldNotCompute expression
+bool SCEVContainsCouldNotCompute(const SCEV *S) {
+  auto pred = [](const SCEV *X) { return !X || X->getSCEVType() == SCEVTypes::scCouldNotCompute || isa<SCEVCouldNotCompute>(X); };
+  return SCEVExprContains(S, std::move(pred));
+}
+
+/// guarantees:
+///  L has 1 preheader and 1 dedicated exit
+///  L has 1 backedge and 1 exiting block
+///  the bt SCEV can be expanded to instructions at the insertion point
+const SCEV *getLoopBTSCEV(const Loop *L, DominatorTree &DT, ScalarEvolution &SE){
+  if (!L->isLCSSAForm(DT)
+      || !L->getLoopPreheader()
+      || !L->getExitingBlock()
+      || !L->getExitBlock()
+      || !L->hasDedicatedExits()
+      || L->getNumBackEdges() != 1) {
+    return nullptr;
+  }
+  if (!SE.hasLoopInvariantBackedgeTakenCount(L)){
+    return nullptr;
+  }
+  const SCEV *bt = SE.getBackedgeTakenCount(L);
+  if(!bt || isa<SCEVCouldNotCompute>(bt) || !SE.isAvailableAtLoopEntry(bt, L) || SCEVContainsCouldNotCompute(bt)){
+    return nullptr;
+  }
+  return bt;
+}
+
+//casts SCEVs to the same type if possible (or always if unsafe = true)
+Optional<std::pair<const SCEV *, const SCEV *>> toSameType(const SCEV *LHS, const SCEV *RHS, ScalarEvolution &SE, bool unsafe = false){
+  assert(LHS && RHS);
+  using PT = std::pair<const SCEV *, const SCEV *>;
+
+  const DataLayout &DL = SE.getDataLayout();
+  LLVMContext &ctxt = SE.getContext();
+
+  Type *LT = LHS->getType(), *RT = RHS->getType();
+  if (LT == RT)
+    return Optional<PT>(std::make_pair(LHS, RHS)); //trivially the same size
+  if (LT->isPointerTy() && RT->isPointerTy()) //if we have pointers to different types
+    //PointerType *LTP = cast<PointerType>(LT); PointerType *RTP = cast<PointerType>(RT);
+    return Optional<PT>(std::make_pair(
+      SE.getPtrToIntExpr(LHS, Type::getIntNTy(ctxt, DL.getMaxPointerSizeInBits())),
+      SE.getPtrToIntExpr(RHS, Type::getIntNTy(ctxt, DL.getMaxPointerSizeInBits()))
+    ));
+
+  if (!LT->isSized() || !RT->isSized()) return None;
+  if (DL.getTypeSizeInBits(LT).isScalable() || DL.getTypeSizeInBits(RT).isScalable()) return None;
+
+  uint64_t ls = DL.getTypeSizeInBits(LT).getValue(), rs = DL.getTypeSizeInBits(RT).getValue();
+
+  if (ls > rs) {
+    if (auto LHSx = dyn_cast<SCEVConstant>(LHS)){
+      if (LHSx->getAPInt().getActiveBits() <= rs)
+        return Optional<PT>(std::make_pair(SE.getConstant(RHS->getType(), LHSx->getAPInt().getLimitedValue()), RHS));
+    }
+    if (auto RHSx = dyn_cast<SCEVConstant>(RHS)){
+      return Optional<PT>(std::make_pair(LHS, SE.getConstant(LHS->getType(), RHSx->getAPInt().getLimitedValue())));
+    }
+    if (auto LHSx = dyn_cast<SCEVZeroExtendExpr>(LHS)) return toSameType(LHSx->getOperand(0), RHS, SE);
+    if (auto LHSx = dyn_cast<SCEVSignExtendExpr>(LHS)) return toSameType(LHSx->getOperand(0), RHS, SE);
+    if (auto RHSx = dyn_cast<SCEVTruncateExpr>(RHS)) return toSameType(LHS, RHSx->getOperand(0), SE);
+    if (unsafe && LT->isIntegerTy() && RT->isIntegerTy()) return Optional<PT>(std::make_pair(SE.getTruncateExpr(LHS, RHS->getType()), RHS));
+    return None;
+  }else if (ls < rs){
+    auto p = toSameType(RHS, LHS, SE); //swap
+    if (!p.hasValue()) return None;
+    return Optional<PT>(std::make_pair(p.getValue().second, p.getValue().first)); //swap back
+  }
+  if (unsafe) return Optional<PT>(std::make_pair(LHS, RHS));
+  return None;
+}
+
+///checks whether LHS == RHS always
holds +bool SCEVEquals(const SCEV *LHS, const SCEV *RHS, ScalarEvolution &SE){ + auto p = toSameType(LHS, RHS, SE); + if (!p.hasValue()) return false; + LHS = p.getValue().first; + RHS = p.getValue().second; + if (LHS == RHS) return true; //trivially the same if this holds (bc const Ptr) + else{ + const SCEVPredicate *Peq = SE.getEqualPredicate(LHS, RHS); + if (Peq->isAlwaysTrue()) return true; //if we arrive at setup addr scev, we are done + } + return false; +} + +/// check whether BB is on all controlflow paths from header to header +// TODO: could also be done with DT +bool isOnAllControlFlowPaths(BasicBlock *BB, const Loop *L, const DominatorTree &DT){ + BasicBlock *End = L->getHeader(); + std::deque> q; + q.push_back(std::make_pair(End, false)); //start with header (false = BB not yet visited) + std::set> vis; //comp here is less> + while (!q.empty()){ + auto p = q.front(); q.pop_front(); + if (vis.find(p) != vis.end()) continue; + vis.insert(p); + for (BasicBlock *B : successors(p.first)){ + q.push_back(std::make_pair(B, p.second || B == BB)); + } + //check here whether End is reached with false (not at start of loop bc we also start with End) + p = q.front(); + if (!p.second && p.first == End) return false; //got to End (header) without ever visiting BB + } + return true; +} + +//return result of Cmp predicated on Rep > 0 if possible. +// i.e. if we can say that Rep > 0 implies that Cmp is always false or true, we return that, o/w return None +Optional predicatedICmpOutcome(ICmpInst *Cmp, const SCEV *Rep, ScalarEvolution &SE){ + switch (Cmp->getPredicate()) + { + case CmpInst::Predicate::ICMP_SGT: + case CmpInst::Predicate::ICMP_UGT: + { + const SCEV *LHS = SE.getSCEV(Cmp->getOperand(0)); + const SCEV *RHS = SE.getSCEV(Cmp->getOperand(1)); + //transform: LHS > RHS <==> LHS - RHS > 0 + const SCEV *LHSmRHS = SE.getMinusSCEV(LHS, RHS); + //then check whether Rep == LHS - RHS in which case we know: Rep > 0 ==> result of Cmp is true + if (SCEVEquals(Rep, LHSmRHS, SE)) return Optional(true); + else return None; + } + case CmpInst::Predicate::ICMP_SLT: + case CmpInst::Predicate::ICMP_ULT: + { + //a < b <==> b > a + const SCEV *LHS = SE.getSCEV(Cmp->getOperand(1)); //b + const SCEV *RHS = SE.getSCEV(Cmp->getOperand(0)); //a + //transform: LHS > RHS <==> LHS - RHS > 0 + const SCEV *LHSmRHS = SE.getMinusSCEV(LHS, RHS); + //then check whether Rep == LHS - RHS in which case we know: Rep > 0 ==> result of Cmp is true + if (SCEVEquals(Rep, LHSmRHS, SE)) return Optional(true); + else return None; + } + case CmpInst::Predicate::ICMP_EQ: + case CmpInst::Predicate::ICMP_NE: + { + //Rep > 0 ==> Rep + x != x + const SCEV *LHS = SE.getSCEV(Cmp->getOperand(0)); //Rep + x (hopefully) + const SCEV *RHS = SE.getSCEV(Cmp->getOperand(1)); //x + const SCEV *LHSmRHS = SE.getMinusSCEV(LHS, RHS); //Rep (hopefully) + if (SCEVEquals(Rep, LHSmRHS, SE)) return Optional(Cmp->getPredicate() == CmpInst::Predicate::ICMP_NE); + else return None; + } + default: + return None; + } +} + +//conservative! 
+//because SCEVComparePredicate is not in this version of LLVM we have to do this manually ==> will not catch all cases (FIXME) +//predicate is that Rep > 0 +bool isOnAllPredicatedControlFlowPaths(BasicBlock *BB, const Loop *L, const DominatorTree &DT, const SCEV *Rep, ScalarEvolution &SE){ + if (isOnAllControlFlowPaths(BB, L, DT)) return true; //is on all paths anyway + DenseSet vis; //visited set + std::deque q(1U, L->getHeader()); //iterative BFS with queue + while (!q.empty()){ + BasicBlock *Current = q.front(); q.pop_front(); + if (Current == BB) continue; //do not continue BFS from BB + if (vis.find(Current) == vis.end()) continue; //already visited this block + vis.insert(Current); + + Instruction *T = Current->getTerminator(); + LLVM_DEBUG(T->dump()); + if (BranchInst *BR = dyn_cast(T)){ + if (BR->isConditional()){ + if (ICmpInst *Cmp = dyn_cast(BR->getCondition())){ //FOR NOW: only works with a single ICmpInst as branch condition operand + LLVM_DEBUG(Cmp->dump()); + auto r = predicatedICmpOutcome(Cmp, Rep, SE); + if (r.hasValue()){ + if (r.getValue()) q.push_back(BR->getSuccessor(0)); + else q.push_back(BR->getSuccessor(1)); + }else{ + q.push_back(BR->getSuccessor(0)); + q.push_back(BR->getSuccessor(1)); + } + } + }else{ + q.push_back(BR->getSuccessor(0)); //add the only successor to queue + } + }else{ + return false; //unknown jump somewhere else ==> BB not on all predicated paths + } + + if (q.front() == L->getHeader()) return false; //bfs arrived at Header (again) with a path that never went through BB + } + return true; +} + +//cast to right integer size, insert instruction at `InsPoint` +Value *castToSize(Value *R, Type *ty, Instruction *InsPoint){ + const DataLayout &DL = InsPoint->getParent()->getModule()->getDataLayout(); + IRBuilder<> builder(InsPoint); + Type *rty = R->getType(); + if (rty == ty) return R; + if (DL.getTypeSizeInBits(rty) > DL.getTypeSizeInBits(ty)) { + return builder.CreateTruncOrBitCast(R, ty, "scev.trunc"); + } + if (DL.getTypeSizeInBits(rty) < DL.getTypeSizeInBits(ty)) { + return builder.CreateSExtOrBitCast(R, ty, "scev.sext"); + } + return builder.CreateBitOrPointerCast(R, ty, "scev.cast"); +} + +// extract the Address Value of MA (nullptr if not available) +Value *getAddress(MemoryUseOrDef *MA) { + assert(MA && "called getAddress on nullptr"); + assert(MA->getMemoryInst()); + Instruction *I = MA->getMemoryInst(); + if (auto *L = dyn_cast(I)) return L->getPointerOperand(); + if (auto *S = dyn_cast(I)) return S->getPointerOperand(); + return nullptr; +} + +//find the first L in loops that contains BB +//loops should be a nesting of loops from inner to outermost +const Loop *findFirstContaining(ArrayRef loops, BasicBlock *BB){ + for (const Loop *L : loops) { + if (L && L->contains(BB)) { + return L; + } + } + return nullptr; +} + +//find out whether MA stands for some load/store (for some reason they don't always do, maybe bc of DCE?) 
+bool hasMemInst(MemoryUseOrDef *MA) { return MA && MA->getMemoryInst(); }
+
+//updates L<-M if M is a descendant of L (or if L is nullptr)
+void updateIfDescendant(const Loop *&L, const Loop *M) {
+  if (!L || (M && L->contains(M))) L = M;
+}
+
+//updates L<-M if L is a descendant of M OR if M is nullptr
+void updateIfAncestor(const Loop *&L, const Loop *M) {
+  if (!M || M->contains(L)) L = M;
+}
+
+void updateOutermostExpandableExcl(const Loop *&outerMostExpandableExl, AffAccConflict kind, const Loop *innermostCommon, const Loop *deepestMalformed) {
+  switch (kind) {
+  case AffAccConflict::NoConflict:
+    break;
+  case AffAccConflict::MustNotIntersect:
+    updateIfAncestor(innermostCommon, deepestMalformed); //updates innermostCommon to deepestMalformed if that one is less "deep"
+    LLVM_FALLTHROUGH;
+  case AffAccConflict::Bad:
+    updateIfDescendant(outerMostExpandableExl, innermostCommon);
+    break;
+  default:
+    llvm_unreachable("unknown conflict type");
+  }
+}
+
+//tries to find the sign of a SCEV with the information given
+Optional<int> findSign(const SCEV *S, ScalarEvolution &SE, std::vector<std::pair<const SCEV *, int>> &known) {
+  if (!S) return None;
+
+  //in case we know
+  for (const auto &p : known) {
+    if (SCEVEquals(S, p.first, SE)) return p.second;
+  }
+
+  //in case SE knows
+  if (SE.isKnownNegative(S)) return -1;
+  if (SE.isKnownPositive(S)) return 1;
+  if (S->isZero()) return 0;
+
+  //do recursively
+  switch (S->getSCEVType())
+  {
+  case SCEVTypes::scConstant:
+    if (S->isZero()) return 0;
+    else if (SE.isKnownPositive(S)) return 1;
+    else if (SE.isKnownNegative(S)) return -1;
+    llvm_unreachable("SE does not know sign of constant value ???");
+
+  case SCEVTypes::scMulExpr: {
+    auto l = findSign(cast<SCEVMulExpr>(S)->getOperand(0), SE, known);
+    auto r = findSign(cast<SCEVMulExpr>(S)->getOperand(1), SE, known);
+    if (!l.hasValue() || !r.hasValue()) return None;
+    return r.getValue() * l.getValue();
+  }
+
+  case SCEVTypes::scAddExpr: {
+    auto l = findSign(cast<SCEVAddExpr>(S)->getOperand(0), SE, known);
+    auto r = findSign(cast<SCEVAddExpr>(S)->getOperand(1), SE, known);
+    if (!l.hasValue() || !r.hasValue()) return None;
+    if (l.getValue() + r.getValue() >= 1) return 1;
+    if (l.getValue() + r.getValue() <= -1) return -1;
+    return None;
+  }
+
+  case SCEVTypes::scPtrToInt:
+  case SCEVTypes::scTruncate:
+    return findSign(cast<SCEVCastExpr>(S)->getOperand(0), SE, known);
+
+  //TODO: could add max/min, etc...
+
+  default:
+    return None;
+  }
+  llvm_unreachable("");
+}
+
+//cast some SCEVs if necessary
+const SCEV *getZExtIfNeeded(const SCEV *S, Type *Ty, ScalarEvolution &SE) {
+  if (SE.getDataLayout().getTypeSizeInBits(S->getType()) < SE.getDataLayout().getTypeSizeInBits(Ty)) {
+    return SE.getZeroExtendExpr(S, Ty);
+  }
+  return S;
+}
+
+//cast some SCEVs if necessary
+const SCEV *getSExtIfNeeded(const SCEV *S, Type *Ty, ScalarEvolution &SE) {
+  if (SE.getDataLayout().getTypeSizeInBits(S->getType()) < SE.getDataLayout().getTypeSizeInBits(Ty)) {
+    return SE.getSignExtendExpr(S, Ty);
+  }
+  return S;
+}
+
+} //end of namespace
+
+//================== ===========================================================
+
+// ==== LoopRep ====
+LoopRep::LoopRep(const Loop *L, ArrayRef<const Loop *> contLoops, ScalarEvolution &SE, DominatorTree &DT)
+  : SE(SE), DT(DT), L(L), containingLoops(contLoops.begin(), contLoops.end()), safeExpandBound(0u)
+{
+  RepSCEV = getLoopBTSCEV(L, DT, SE);
+  if (RepSCEV) LLVM_DEBUG(dbgs()<<"new LoopRep with rep scev: "<<*RepSCEV<<"\n");
+  else LLVM_DEBUG(dbgs()<<"new LoopRep with rep scev: <nullptr>\n");
+
+  if (RepSCEV){
+    while (safeExpandBound < containingLoops.size()
+        && (!containingLoops[safeExpandBound]
+          || isSafeToExpandAt(RepSCEV, containingLoops[safeExpandBound]->getLoopPreheader()->getTerminator(), SE))){
+      safeExpandBound++;
+    }
+  }
+}
+
+bool LoopRep::isAvailable() const { return RepSCEV != nullptr; }
+const Loop *LoopRep::getLoop() const { return L; }
+const SCEV *LoopRep::getSCEV() const {
+  assert(isAvailable() && "SCEV available"); //not necessary, but forces good practice
+  return RepSCEV;
+}
+
+const SCEV *LoopRep::getSCEVPlusOne() const {
+  assert(isAvailable() && "SCEV available");
+  return SE.getAddExpr(getSCEV(), SE.getConstant(getSCEV()->getType(), 1UL));
+}
+
+bool LoopRep::isOnAllCFPathsOfParentIfExecuted() const { //FIXME: maybe cache this result once calculated?
+  assert(isAvailable() && "SCEV available");
+  return isOnAllPredicatedControlFlowPaths(L->getHeader(), L->getParentLoop(), DT, getSCEVPlusOne(), SE);
+}
+
+bool LoopRep::isSafeToExpandBefore(const Loop *L) const {
+  assert(isAvailable() && "SCEV available");
+  if (L == getLoop()) return true;
+  for (unsigned i = 0u; i < safeExpandBound; i++) { //FIXME: linear search -> use a map instead
+    if (L == containingLoops[i]) return true;
+  }
+  return false;
+}
+
+//code generation for the loop rep; caches the Value holding the result after the first call to prevent excessive code-gen
+Value *LoopRep::expandAt(Type *ty, Instruction *InsertBefore){
+  assert(ty);
+  assert(RepSCEV);
+  if (Rep) { //FIXME: currently forces the user to make the first call at a point that dominates all possible uses (improvement: could update the expand point using DT)
+    assert(ty == Rep->getType() && "was already expanded with same type");
+    return Rep;
+  }
+  InsertBefore = InsertBefore ? InsertBefore : L->getLoopPreheader()->getTerminator();
+  const SCEV *RepP1 = getSCEVPlusOne(); //we go over the +1 version here because getSCEV() is usually sth like %n-1, so this becomes just %n
+  assert(isSafeToExpandAt(RepP1, InsertBefore, SE) && "bound not expandable here");
+  SCEVExpander ex(SE, L->getHeader()->getModule()->getDataLayout(), "rep");
+  ex.setInsertPoint(InsertBefore);
+  RepPlusOne = castToSize(ex.expandCodeFor(RepP1), ty, InsertBefore);
+  IRBuilder<> builder(InsertBefore);
+  Rep = builder.CreateSub(RepPlusOne, ConstantInt::get(ty, 1u), "rep");
+  return Rep;
+}
+
+//code-gen for loop guard, ie.
inserts code of rep+1 > 0 +Value *LoopRep::expandLoopGuard(Instruction *InsertBefore) { + assert(RepPlusOne && "expandAt has to be called before this"); + InsertBefore = InsertBefore ? InsertBefore : L->getLoopPreheader()->getTerminator(); + IRBuilder<> builder(InsertBefore); + return builder.CreateICmpSGT(RepPlusOne, ConstantInt::get(Rep->getType(), 0u, true)); //FIXME: this only works for unsigned Rep's that are < 2^30 (for i32) +} + +// ==== AffAcc ==== +AffAcc::AffAcc(ArrayRef accesses, const SCEV *Addr, MemoryUseOrDef *MA, ArrayRef contLoops, ScalarEvolution &SE) + : SE(SE), MA(MA), accesses(accesses.begin(), accesses.end()) +{ + assert(!accesses.empty()); + assert(MA); + + containingLoops.push_back((const Loop *)nullptr); //there is no loop for dim=0 + containingLoops.append(contLoops.begin(), contLoops.end()); //initialize loops + + bool isVolatile = false; + for (Instruction *I : accesses) //check for volatile mem insts, we don't want to touch those + isVolatile |= (isa(I) && cast(I)->isVolatile()) || (isa(I) && cast(I)->isVolatile()); + if (Addr && (SCEVContainsCouldNotCompute(Addr) || isVolatile)) Addr = nullptr; //set to null if contains SCEVCouldNotCompute + baseAddresses.push_back(Addr); + if (!Addr) return; //do not look for steps or addresses if SCEV of address is unknown + steps.push_back((const SCEV *)nullptr); //there is no step for dim=0 + reps.push_back((LoopRep *)nullptr); //there is no rep for dim=0 + findSteps(Addr, (const SCEV *)nullptr, 1u); //find steps + for (unsigned dim = 1; dim < containingLoops.size(); dim++){ + Addr = SE.SplitIntoInitAndPostInc(containingLoops[dim], Addr).first; + baseAddresses.push_back(Addr); + } +} + +//fold over A and collect steps in AddRec expressions +//the found steps might not be valid for square affine access patterns ==> `promote` will check this +void AffAcc::findSteps(const SCEV *A, const SCEV *Factor, unsigned loop){ + assert(A); + assert(baseAddresses.size() == 1 && reps.size() == 1 && "we only know dim=0 so far"); + + if (loop >= containingLoops.size()) return; //we are done + + if (!SE.containsAddRecurrence(A) && loop < containingLoops.size()){ + //A is inv to the rest of the loops + steps.push_back(SE.getConstant(Type::getInt64Ty(this->accesses[0]->getContext()), 0U)); + findSteps(A, Factor, loop + 1u); + } + + switch (A->getSCEVType()) + { + //unary expressions that do not change value + case SCEVTypes::scZeroExtend: //FIXME: this might be unsafe + case SCEVTypes::scSignExtend: + case SCEVTypes::scTruncate: + return findSteps(cast(A)->getOperand(0), Factor, loop); + + // TODO: if we want to allow random adds in between then we would need to add the non-recursive part to the base address + // case SCEVTypes::scAddExpr: { + // const SCEV *L = cast(A)->getOperand(0); + // const SCEV *R = cast(A)->getOperand(1); + // bool l = SE.containsAddRecurrence(L); + // bool r = SE.containsAddRecurrence(R); + // if (l && !r) return findSteps(L, Factor, loop); + // else if(!l && r) return findSteps(R, Factor, loop); + // return; + // } + + //L * R + case SCEVTypes::scMulExpr: { + const SCEV *L = cast(A)->getOperand(0); + const SCEV *R = cast(A)->getOperand(1); + bool l = SE.containsAddRecurrence(L); + bool r = SE.containsAddRecurrence(R); + if (l == r) return; + if (!l && r) std::swap(L, R); + assert(SE.containsAddRecurrence(L) && !SE.containsAddRecurrence(R)); + if (Factor) { + auto p = toSameType(Factor, R, SE, true); + if (!p.hasValue()) return; + Factor = SE.getMulExpr(p.getValue().first, p.getValue().second); + }else Factor = R; + 
return findSteps(L, Factor, loop); + } + + //{,+,Step} + case SCEVTypes::scAddRecExpr: { + const auto *S = cast(A); + const SCEV *Step; + if (S->getLoop() == containingLoops[loop]){ //L == containingLoops[loop] + Step = S->getStepRecurrence(SE); + if (Factor) { + auto p = toSameType(Factor, Step, SE, true); + if (!p.hasValue()) return; + Step = SE.getMulExpr(p.getValue().first, p.getValue().second); + } + steps.push_back(Step); + return findSteps(S->getStart(), Factor, loop+1); + }else{ //A is loop-invariant to containingLoops[loop] + bool occursLater = false; //loop needs to occur later + for (unsigned i = loop+1; i < containingLoops.size(); i++) + occursLater = occursLater || containingLoops[i] == S->getLoop(); + if (!occursLater) return; + steps.push_back(SE.getConstant(Type::getInt64Ty(this->accesses[0]->getContext()), 0U)); + return findSteps(S, Factor, loop+1); + } + } + default: //in all other cases we cannot safely extract more steps and thus just return + return; + } +} + +ArrayRef AffAcc::getAccesses() const { return accesses; } + +bool AffAcc::isWrite() const { return isa(MA); } + +///the nr of times `this` was promoted (-1 means the address is not known) +int AffAcc::getMaxDimension() const { return (int)reps.size() - 1; } + +///return the first (as in deepest) Loop L where this->isWellFormed(L) is false +///returns null if there is no such loop +const Loop *AffAcc::getDeepestMalformed() const { + for (const Loop *L : containingLoops) { + if (L && !isWellFormed(L)) return L; + } + return nullptr; + /*unsigned malformedStart = (unsigned)(getMaxDimension() + 2); //getMaxDimension() >= -1 + if (containingLoops.size() > malformedStart) return containingLoops[malformedStart]; + else return nullptr;*/ +} + +///true if this was successfully promoted to the given dimension (ie. nr of promotions is at least `dimension`) +bool AffAcc::isWellFormed(unsigned dimension) const { + int md = getMaxDimension(); + return md >= 0 && dimension <= (unsigned)md; +} + +///true if this was successfully promoted to the given dimension (ie. 
nr of promotions is `dimension`) +///if true, this means that `this` can be expanded in the preheader of `L` +bool AffAcc::isWellFormed(const Loop *L) const { return isWellFormed(loopToDimension(L)); } + +///returns the dimension that is defined by `L` (starts at 1) +unsigned AffAcc::loopToDimension(const Loop *L) const { + assert(L && "L not nullptr"); + for (unsigned d = 1u; d < containingLoops.size(); d++){ //FIXME: linear search -> improve with a map + if (containingLoops[d] == L) return d; + } + llvm_unreachable("The provided loop does not contain `this`!"); +} + +Value *AffAcc::getAddrValue() const { + assert(getBaseAddr(0u) && "has an address"); + if (isWrite()) { + return cast(accesses[0])->getPointerOperand(); + } else { + return cast(accesses[0])->getPointerOperand(); + } +} + +///SCEV of base Address for the base address at a given dimension +const SCEV *AffAcc::getBaseAddr(unsigned dim) const { assert(dim < baseAddresses.size()); return baseAddresses[dim]; } + +///SCEV of base Address outside of `L` +const SCEV *AffAcc::getBaseAddr(const Loop *L) const { return getBaseAddr(loopToDimension(L)); } + +///SCEV of step for the dimension `dim` (that means there is no step for `dim` = 0) +const SCEV *AffAcc::getStep(unsigned dim) const { assert(dim < steps.size()); return steps[dim]; } + +///SCEV of rep for the dimension `dim` (that means there is no rep for `dim` = 0) +const SCEV *AffAcc::getRep(unsigned dim) const { + assert(dim < reps.size()); + if (!reps[dim] || !reps[dim]->isAvailable()) return nullptr; + return reps[dim]->getSCEV(); +} + +///get Loop for given `dim` (that means there is no Loop for `dim` = 0) +const Loop *AffAcc::getLoop(unsigned dim) const { assert(dim < containingLoops.size()); return containingLoops[dim]; } + +///get containing loops from inner- to outermost +ArrayRef AffAcc::getContainingLoops() const { return ArrayRef(containingLoops); } + +//dump info known for this AffAcc up to some loop L +void AffAcc::dumpInLoop(const Loop *L) const { + errs()<<"Affine Access of \n"; + int dimension = getMaxDimension(); + if (L) dimension = std::min((int)loopToDimension(L), dimension); + for (auto *I : accesses) errs()<<*I<<"\n"; + if (dimension < 0) errs()<<"\t\n"; + for (int dim = 0; dim <= dimension && dim <= getMaxDimension(); dim++){ + const SCEV *s = getStep(dim); + const SCEV *r = getRep(dim); + const SCEV *a = getBaseAddr(dim); + errs()<<"\tdim = "<"; + errs()<<", rep = "; + if (r) errs()<<*r; + else errs()<<""; + errs()<<", well-formed = "<isWellFormed(dim); + errs()<<"\n"; + errs()<<"\taddress = "; + if (a) errs()<<*a; + else errs()<<""; + errs()<<"\n"; + errs()<<"\tloop header = "; + if (getLoop(dim)) errs()<getHeader()->getNameOrAsOperand(); + else errs()<<""; + errs()<<"\n"; + } +} + +//dump all info known about this AffAcc +void AffAcc::dump() const { dumpInLoop(nullptr); } + +//get the actual conflict between this and the AffAcc in the pair for some loop L +AffAccConflict AffAcc::fromConflictPair(const detail::DenseMapPair> &p, const Loop *L) const { + const Loop *S = p.getSecond().first; + if (S == L || L->contains(S)) { //if start is L or more "inner" loop + if (!isWellFormed(L) || !p.first->isWellFormed(L)) return AffAccConflict::Bad; //if either is not well-formed "demote" the conflict to bad (but only if exists) + return p.getSecond().second; + } + return AffAccConflict::NoConflict; +} + +//get the actual conflict between this and A for loop L +AffAccConflict AffAcc::getConflict(AffAcc *A, const Loop *L) const { + auto p = conflicts.find(A); + if (p 
!= conflicts.end()) {
+    return fromConflictPair(*p, L);
+  }
+  return AffAccConflict::NoConflict;
+}
+
+/// returns a vector of (AffAcc *, conflict) pairs containing all the conflicts that `this` has at loop `L`
+/// It is guaranteed that the conflict is never NoConflict
+std::vector<std::pair<AffAcc *, AffAccConflict>> AffAcc::getConflicts(const Loop *L) const {
+  std::vector<std::pair<AffAcc *, AffAccConflict>> res;
+  for (const auto &p : conflicts) {
+    assert(p.first);
+    assert(p.getSecond().first);
+    AffAccConflict kind = fromConflictPair(p, L);
+    if (kind != AffAccConflict::NoConflict) res.push_back(std::make_pair(p.first, kind));
+  }
+  return res;
+}
+
+MemoryUseOrDef *AffAcc::getMemoryAccess() { return MA; }
+
+//add a conflict with A, where StartL is the innermost shared loop, with conflict classification `kind`
+void AffAcc::addConflict(AffAcc *A, const Loop *StartL, AffAccConflict kind){
+  assert(StartL);
+  assert(conflicts.find(A) == conflicts.end() && "no conflict for A yet");
+  assert(kind == AffAccConflict::Bad || (isWellFormed(StartL) && A->isWellFormed(StartL)));
+  conflicts.insert(std::make_pair(A, std::make_pair(StartL, kind)));
+}
+
+//promote `this` if possible.
+//`LR` should be the rep of the next outer loop where this is not (yet) well-formed.
+//if successful, `this` is well-formed for LR->getLoop() afterwards.
+bool AffAcc::promote(LoopRep *LR){
+  if (!LR->isAvailable()) return false;
+  unsigned newDim = (unsigned)(getMaxDimension() + 1); //getMaxDimension() >= -1
+  if (getLoop(newDim) != LR->getLoop()) return false;
+  LLVM_DEBUG(dbgs()<<"promote: (1) loops match, ");
+  bool possible = true;
+  Instruction *Point = LR->getLoop()->getLoopPreheader()->getTerminator();
+  //check all current reps and steps
+  for (unsigned dim = 1; dim < newDim; dim++){
+    possible &= isSafeToExpandAt(getStep(dim), Point, SE);
+    possible &= reps[dim]->isSafeToExpandBefore(LR->getLoop());
+  }
+  if (possible) LLVM_DEBUG(dbgs()<<"can expand (2) current rep & step, ");
+  //check rep and step of the new dimension
+  possible &= steps.size() > newDim && isSafeToExpandAt(getStep(newDim), Point, SE);
+  possible &= LR->isSafeToExpandBefore(LR->getLoop());
+  if (possible) LLVM_DEBUG(dbgs()<<"(3) new rep & step, ");
+  //check base address
+  possible &= !SCEVContainsCouldNotCompute(getBaseAddr(newDim)) && isSafeToExpandAt(getBaseAddr(newDim), Point, SE);
+  if (possible) LLVM_DEBUG(dbgs()<<"and (4) new base addr!");
+  LLVM_DEBUG(dbgs()<<"\n");
+  if (!possible) return false;
+
+  reps.push_back(LR); //changes getMaxDimension()
+  return true;
+}
+
+//Code-gen for the base address
+Value *AffAcc::expandBaseAddr(unsigned dimension, Type *ty, Instruction *InsertBefore){
+  assert(isWellFormed(dimension));
+  InsertBefore = InsertBefore ? InsertBefore : reps[dimension]->getLoop()->getLoopPreheader()->getTerminator();
+  if (!isSafeToExpandAt(getBaseAddr(dimension), InsertBefore, SE)){
+    LLVM_DEBUG(dbgs()<<"data not expandable here (note: only the preheader is guaranteed)\n");
+    LLVM_DEBUG(dbgs()<<"SCEV (dim = "<<dimension<<"): "<<*getBaseAddr(dimension)<<"\n");
+    LLVM_DEBUG(InsertBefore->getParent()->dump());
+    LLVM_DEBUG(dbgs()<<"before inst: "<<*InsertBefore<<"\n");
+    LLVM_DEBUG(this->dump());
+    llvm_unreachable("cannot expand SCEV at desired location");
+  }
+  SCEVExpander ex(SE, reps[dimension]->getLoop()->getHeader()->getModule()->getDataLayout(), "addr");
+  ex.setInsertPoint(InsertBefore);
+  return castToSize(ex.expandCodeFor(getBaseAddr(dimension)), ty, InsertBefore);
+}
+
+//code-gen for step
+Value *AffAcc::expandStep(unsigned dimension, Type *ty, Instruction *InsertBefore){
+  assert(isWellFormed(dimension) && dimension > 0u);
+  InsertBefore = InsertBefore ?
InsertBefore : reps[dimension]->getLoop()->getLoopPreheader()->getTerminator(); + assert(isSafeToExpandAt(getStep(dimension), InsertBefore, SE) && "data not expanable here (note: only preheader guaranteed)"); + SCEVExpander ex(SE, reps[dimension]->getLoop()->getHeader()->getModule()->getDataLayout(), "step"); + ex.setInsertPoint(InsertBefore); + return castToSize(ex.expandCodeFor(getStep(dimension)), ty, InsertBefore); +} + +//code-gen for rep (calls code-gen of the LoopRep) +Value *AffAcc::expandRep(unsigned dimension, Type *ty, Instruction *InsertBefore){ + assert(isWellFormed(dimension) && dimension > 0u); + InsertBefore = InsertBefore ? InsertBefore : reps[dimension]->getLoop()->getLoopPreheader()->getTerminator(); + if (!isSafeToExpandAt(getRep(dimension), InsertBefore, SE)) { + getRep(dimension)->dump(); + InsertBefore->dump(); + InsertBefore->getParent()->dump(); + this->dump(); + } + return reps[dimension]->expandAt(ty, InsertBefore); +} + +//code-gen for all info needed to know the square affine access pattern inside of L +//guaranteed to work if `Point` is the terminator of preheader of L +ExpandedAffAcc AffAcc::expandAt(const Loop *L, Instruction *Point, + Type *PtrTy, IntegerType *ParamTy) +{ + if (!Point) Point = L->getLoopPreheader()->getTerminator(); + IRBuilder<> builder(Point); + assert(isWellFormed(L)); + std::vector reps, steps, ranges, prefixsum_ranges; + const unsigned dim = loopToDimension(L); + Value *Addr = expandBaseAddr(dim, PtrTy, Point); + IntegerType *SizeTy = IntegerType::get(SE.getContext(), (unsigned)SE.getTypeSizeInBits(Addr->getType())); + Value *psum = nullptr; + Value *LowerBound = builder.CreatePtrToInt(Addr, SizeTy, "lb"); + Value *UpperBound = LowerBound; + std::vector> known; + for (int d = 1u; d < getMaxDimension(); d++) { + known.push_back(std::make_pair(this->reps[d]->getSCEVPlusOne(), 1)); + } + for (unsigned i = 1u; i <= dim; i++) { + reps.push_back(expandRep(i, ParamTy, Point)); + steps.push_back(expandStep(i, ParamTy, Point)); + Value *r = reps.back(); + Value *st = steps.back(); + ranges.push_back(builder.CreateMul(r, st, formatv("range.{0}d", i))); + if (psum) psum = builder.CreateAdd(psum, ranges.back(), formatv("prefsum.range.{0}d", i)); + else psum = ranges.back(); + prefixsum_ranges.push_back(psum); + auto sign = findSign(getStep(i), SE, known); + if (sign.hasValue()) { + if (sign.getValue() < 0) LowerBound = builder.CreateAdd(LowerBound, builder.CreateSExtOrTrunc(ranges.back(), SizeTy, "lb.dec")); + else if (sign.getValue() > 0) UpperBound = builder.CreateAdd(UpperBound, builder.CreateZExtOrTrunc(ranges.back(), SizeTy, "ub.inc")); + //else sign == 0: no action needed + } else { //we do not know sign! 
need to test at runtime + Value *Test = builder.CreateICmpSGE(ranges.back(), ConstantInt::get(ParamTy, 0), "test.nonnegative"); //FIXME: does not work for unsigned values > 2^30 + LowerBound = builder.CreateSelect( + builder.CreateNot(Test, formatv("not.test.{0}d", i)), + builder.CreateSExtOrTrunc(ranges.back(), SizeTy, formatv("range.{0}d.sext", i)), + ConstantInt::get(SizeTy, 0) + ); + UpperBound = builder.CreateSelect( + Test, + builder.CreateZExtOrTrunc(ranges.back(), SizeTy, formatv("range.{0}d.zext", i)), + ConstantInt::get(SizeTy, 0) + ); + } + } + ExpandedAffAcc Aexp(this, Addr, steps, reps, ranges, prefixsum_ranges, LowerBound, UpperBound); + return Aexp; +} + + +// ================= MemDep ============== + +bool MemDep::alias(Value *A, Value *B) { return !A || !B || AA.alias(A, B) != AliasResult::NoAlias; } +bool MemDep::alias(MemoryUseOrDef *A, MemoryUseOrDef *B) { + if (!hasMemInst(A) || !hasMemInst(B)) return false; //the memoryUseOrDef does not correspond to an instruction => no problem + else return alias(getAddress(A), getAddress(B)); +} + +//returns all MemoryDefs that might clobber MA +//i.e. we cannot be sure at compile-time that they *don't* clobber MA +DenseSet MemDep::findClobbers(MemoryUseOrDef *MA){ + DenseSet res; + std::deque worklist; + DenseSet vis; + worklist.push_back(MA->getDefiningAccess()); + while (!worklist.empty()) { + MemoryAccess *A = worklist.front(); worklist.pop_front(); + if (!A) continue; + if (vis.find(A) != vis.end()) continue; + if (A == MA) continue; + vis.insert(A); + if (MemoryDef *D = dyn_cast(A)) { + if (alias(D, MA)) { + res.insert(D); + } + worklist.push_back(D); + } else { + MemoryPhi *P = cast(A); + for (unsigned i = 0u; i < P->getNumOperands(); i++) { + worklist.push_back(P->getOperand(i)); + } + } + } + return res; +} + +//find all MemoryUse's or MemoryDef's that might be clobbered by MA (might = must OR we do not know at compile-time) +DenseSet MemDep::findClobberUsers(MemoryDef *MA) { + DenseSet res; + std::deque worklist; + DenseSet vis; + for (auto U = MA->use_begin(); U != MA->use_end(); ++U) { + worklist.push_back(cast(U->getUser())); + } + while (!worklist.empty()){ + MemoryAccess *A = worklist.front(); worklist.pop_front(); + if (!A) continue; + if (vis.find(A) != vis.end()) continue; + vis.insert(A); + if (MemoryUse *U = dyn_cast(A)) { + if (alias(U, MA)) res.insert(U); + } else if (MemoryDef *D = dyn_cast(A)) { + if (alias(D, MA)) { + res.insert(D); + } + worklist.push_back(D); + } else { + assert(isa(A)); + for (auto U = A->use_begin(); U != A->use_end(); ++U) { + worklist.push_back(cast(U->getUser())); + } + } + } + return res; +} + +//================== Affine Access =========================================================== + +//constructor of analysis result, immediately computes all necessary information +AffineAccess::AffineAccess( + Function &F, ScalarEvolution &SE, DominatorTree &DT, + LoopInfo &LI, MemorySSA &MSSA, AAResults &AA, + DependenceInfo &DI + ) : SE(SE), DT(DT), LI(LI), MSSA(MSSA), AA(AA), DI(DI), MD(MSSA, AA) +{ + for (Loop *L : LI.getTopLevelLoops()){ + auto all = analyze(L, ArrayRef()); + addAllConflicts(*all); + all.release(); + } +} + +//DFS over loop tree, constructs an AffAcc for each memory access and tries to promote it as far as possible +std::unique_ptr> AffineAccess::analyze(Loop *Parent, ArrayRef loopPath){ + LLVM_DEBUG(dbgs()<<"analyze: loop : "<getHeader()->getNameOrAsOperand()<<"\n"); + + //LoopRep for Parent + LoopRep *ParentLR = new LoopRep(Parent, loopPath, SE, DT); + 
reps.insert(std::make_pair(Parent, ParentLR)); //add Parent to LoopReps + + //prepare path + std::vector path; + path.push_back(Parent); //add Parent to path + for (auto *L : loopPath) path.push_back(L); + + //prepare results + auto all = std::make_unique>(); + auto &promoted = promotedAccesses.insert(std::make_pair(Parent, SmallVector())).first->getSecond(); + + //promote subloop accesses + for (Loop *L : Parent->getSubLoops()){ + std::unique_ptr> accs = analyze(L, ArrayRef(path)); + all->reserve(accs->size()); + LoopRep *LR = reps.find(L)->second; //guaranteed to exist, no check needed + bool canPromote = LR->isAvailable() && ParentLR->isAvailable() && LR->isOnAllCFPathsOfParentIfExecuted(); + for (AffAcc *A : *accs){ + all->push_back(A); + if (canPromote){ //L is well-formed and on all CF-paths if its rep is >0 at run-time + if (A->promote(ParentLR)){ + promoted.push_back(A); //guaranteed to exist + } + } + } + accs.release(); + } + + //promote accesses from this loop + for (BasicBlock *BB : Parent->getBlocks()){ + if (LI.getLoopFor(BB) != Parent) continue; //skip BB as it was already processed in a subloop + for (Instruction &I : *BB){ + MemoryUseOrDef *MA = MSSA.getMemoryAccess(&I); + if (MA && hasMemInst(MA) && access.find(MA) == access.end()){ //no AffAcc for this memory access yet! + Value *Addr = getAddress(MA); + const SCEV *AddrSCEV = nullptr; + if (Addr) AddrSCEV = SE.getSCEV(Addr); + AffAcc *A = new AffAcc(ArrayRef(&I), AddrSCEV, MA, ArrayRef(path), SE); + all->push_back(A); + access.insert(std::make_pair(MA, A)); + if (ParentLR->isAvailable()){ + bool onAllCFPaths = true; + for (Instruction *I : A->getAccesses()) onAllCFPaths &= isOnAllControlFlowPaths(I->getParent(), Parent, DT); + if (onAllCFPaths && A->promote(ParentLR)){ + promoted.push_back(A); //guaranteed to exist + } + } + } + } + } + + LLVM_DEBUG(dbgs()<<"analyze: done with loop: "<getHeader()->getNameOrAsOperand()<<"\n"); + + return all; +} + +//given the list of all AffAccs in a loop-tree, this finds all the conflicts between them +void AffineAccess::addAllConflicts(const std::vector &all) { + for (AffAcc *A : all) { + assert(A); + const Loop *outerMostExpandableExl = A->getDeepestMalformed(); + DenseSet c; + if (A->isWrite()){ + c = MD.findClobberUsers(cast(A->getMemoryAccess())); + } else { + c = MD.findClobbers(A->getMemoryAccess()); + } + for (MemoryUseOrDef *D : c) { + if (A->getMemoryAccess() == D || !hasMemInst(D)) continue; + auto p = access.find(D); + if (p == access.end()) continue; + AffAcc *B = p->second; + auto r = calcConflict(A, B); + if (r.first != AffAccConflict::NoConflict) A->addConflict(B, r.second, r.first); + updateOutermostExpandableExcl(outerMostExpandableExl, r.first, r.second, B->getDeepestMalformed()); + assert(!outerMostExpandableExl || outerMostExpandableExl->contains(A->getMemoryAccess()->getBlock())); + } + + ArrayRef loops = A->getContainingLoops(); + for (const Loop *L : loops) { + if (!L) continue; + if (L == outerMostExpandableExl) break; + if (!(!L || A->isWellFormed(L))){ + if (L) LLVM_DEBUG(L->dump()); + if (outerMostExpandableExl) LLVM_DEBUG(outerMostExpandableExl->dump()); + LLVM_DEBUG(A->dump()); + llvm_unreachable("this should not happen!"); + } + assert(!L || A->isWellFormed(L)); + auto p = expandableAccesses.find(L); + if (p == expandableAccesses.end()){ + p = expandableAccesses.insert(std::make_pair(L, SmallVector())).first; + } + p->getSecond().push_back(A); + } + } +} + +//classify conflict between Read and Write +AffAccConflict AffineAccess::calcRWConflict(AffAcc 
*Read, AffAcc *Write, const Loop *L) const {
+  assert(!Read->isWrite());
+  assert(Write->isWrite());
+  if (!L->contains(Read->getMemoryAccess()->getBlock()) || !L->contains(Write->getMemoryAccess()->getBlock())) return AffAccConflict::NoConflict;
+  if (!Read->isWellFormed(L) || !Write->isWellFormed(L)) return AffAccConflict::Bad;
+  MemoryUseOrDef *r = Read->getMemoryAccess();
+  MemoryUseOrDef *w = Write->getMemoryAccess();
+  Value *Addr = getAddress(r);
+  Value *DAddr = getAddress(w);
+  bool dominates = MSSA.dominates(r, w);
+  if (Addr && DAddr && AA.alias(Addr, DAddr) == NoAlias) return AffAccConflict::NoConflict;
+  AffAccConflict kind = AffAccConflict::Bad;
+  if (!dominates) { //the read does not dominate the write ==> R maybe after W
+    kind = AffAccConflict::MustNotIntersect;
+  } else { //the read dominates the write ==> W is after R
+    kind = AffAccConflict::MustNotIntersect;
+    //exception: we know that the store always happens to a position already read from, if the store is to the same address as the read (FIXME: CONSERVATIVE)
+    //but the steps need to be != 0 such that there is no dependence from one iteration to the next
+    bool nonzeroSteps = true;
+    unsigned dr = Read->loopToDimension(L);
+    unsigned dw = Write->loopToDimension(L);
+    while (Read->isWellFormed(dr) && Write->isWellFormed(dw)) {
+      nonzeroSteps &= SE.isKnownNonZero(Read->getStep(dr++)) && SE.isKnownNonZero(Write->getStep(dw++));
+    }
+    if ((Addr && DAddr && AA.alias(Addr, DAddr) == MustAlias && nonzeroSteps)
+        || (accessPatternsAndAddressesMatch(Read, Write, L) && nonzeroSteps))
+    {
+      kind = AffAccConflict::NoConflict;
+    }
+  }
+  return kind;
+}
+
+///returns the kind of conflict (and the innermost common loop) that A and B have, assuming there is some memory dependency
+///does not check for the memory dependency itself, for performance
+std::pair<AffAccConflict, const Loop *> AffineAccess::calcConflict(AffAcc *A, AffAcc *B) const {
+  assert((A->isWrite() || B->isWrite()) && "conflict between two reads ???");
+  const Loop *const innermostCommon = findFirstContaining(A->getContainingLoops(), B->getMemoryAccess()->getBlock());
+  if (!innermostCommon) return std::make_pair(AffAccConflict::NoConflict, innermostCommon);
+  if (!A->isWrite()) std::swap(A, B); //we know at least one of them is a write; swap so that it is A
+  AffAccConflict kind = AffAccConflict::Bad; //assume Bad at the beginning
+  if (A->isWellFormed(innermostCommon) && B->isWellFormed(innermostCommon)) {
+    if (B->isWrite()) kind = AffAccConflict::MustNotIntersect; //WaW
+    else kind = calcRWConflict(B, A, innermostCommon); //B is the read and A is the write
+  }
+  //at this point, even if the two only may alias, we assume the chance is high that they do at runtime
+  //if their base addresses share some SCEVUnknowns (ie. some Values) (FIXME: this is CONSERVATIVE)
+  if (kind == AffAccConflict::MustNotIntersect){
+    const Loop *L = innermostCommon->getParentLoop();
+    const Loop *Last = innermostCommon;
+    while (L && A->isWellFormed(L) && B->isWellFormed(L)) { //traverse up the loop-tree to the point where one of them is not well-formed anymore
+      Last = L;
+      L = L->getParentLoop();
+    }
+    if (shareValues(A->getBaseAddr(Last), B->getBaseAddr(Last))) kind = AffAccConflict::Bad;
+  }
+  return std::make_pair(kind, innermostCommon);
+}
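Concretely, the classification distinguishes cases like the following two (hypothetical C; the pass sees them after lowering to IR):

```c
// MustNotIntersect: x and y may overlap, so the streams are guarded by
// the run-time interval check sketched earlier.
void copy(int n, const double *x, double *y) {
  for (int i = 0; i < n; ++i) y[i] = x[i];
}

// NoConflict (the exception above): the read dominates the write, both
// use the same address pattern with a known non-zero step, so iteration i
// only overwrites data it has already read.
void scale(int n, double *a) {
  for (int i = 0; i < n; ++i) a[i] = 2.0 * a[i];
}
```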
some Value's) (FIXME: this is CONSERVATIVE) + if (kind == AffAccConflict::MustNotIntersect){ + const Loop *L = innermostCommon->getParentLoop(); + const Loop *Last = innermostCommon; + while (L && A->isWellFormed(L) && B->isWellFormed(L)) { //traverse up the loop-tree up to the point where one of them is not wellformed anymore + Last = L; + L = L->getParentLoop(); + } + if (shareValues(A->getBaseAddr(Last), B->getBaseAddr(Last))) kind = AffAccConflict::Bad; + } + return std::make_pair(kind, innermostCommon); +} + +//checks whether access patterns (step, rep) match up to some loop L +bool AffineAccess::accessPatternsMatch(const AffAcc *A, const AffAcc *B, const Loop *L) const { + unsigned dimA = A->loopToDimension(L); + unsigned dimB = B->loopToDimension(L); + if (dimA != dimB) return false; + for (unsigned i = 1u; i <= dimA; i++){ + if (A->getLoop(i) != B->getLoop(i)) return false; + if (!SCEVEquals(A->getRep(i), B->getRep(i), SE)) return false; + if (!SCEVEquals(A->getStep(i), B->getStep(i), SE)) return false; + } + return true; +} + +//checks whether step, rep, and base address matches up to some loop L +bool AffineAccess::accessPatternsAndAddressesMatch(const AffAcc *A, const AffAcc *B, const Loop *L) const { + if (!accessPatternsMatch(A, B, L)) return false; + return SCEVEquals(A->getBaseAddr(A->loopToDimension(L)), B->getBaseAddr(B->loopToDimension(L)), SE); +} + +//simple access methods +ScalarEvolution &AffineAccess::getSE() const { return this->SE; } +DominatorTree &AffineAccess::getDT()const { return this->DT; } +LoopInfo &AffineAccess::getLI() const { return this->LI; } +MemorySSA &AffineAccess::getMSSA() const { return this->MSSA; } +AAResults &AffineAccess::getAA() const { return this->AA; } +DependenceInfo &AffineAccess::getDI() const { return this->DI; } +SmallVector AffineAccess::getLoopsInPreorder() const { return this->LI.getLoopsInPreorder(); } + +//get accesses with no bad conflicts for some loop L +//guarantees: +// no bad conflicts with any other memory instruction in L +// is well formed for L +// if conflictFreeOnly: has no conflicts at all (only NoConflict) ==> no run-time checks necessary +std::vector AffineAccess::getExpandableAccesses(const Loop *L, bool conflictFreeOnly) { + auto p = expandableAccesses.find(L); + std::vector res; + if (p == expandableAccesses.end()) return res; + for (AffAcc *A : p->getSecond()){ + if (!conflictFreeOnly || A->getConflicts(L).empty()) res.push_back(A); + } + return res; +} + +// code-gen: calls code-gen for all AffAccs in list, +// generates run-time checks for conflicts, +// generates run-time checks for loop-trip-counts (if repChecks = true) +// ANDs all the rt-checks to a single Value and writes it into BoundCheck +std::vector +AffineAccess::expandAllAt(ArrayRef Accs, const Loop *L, + Instruction *Point, Value *&BoundCheck, + Type *PtrTy, IntegerType *ParamTy, bool conflictChecks, bool repChecks) +{ + assert(Point); + IRBuilder<> builder(Point); + + DenseMap exps; + for (AffAcc *A : Accs) { //expand the requested AffAcc's + exps.insert(std::make_pair(A, std::move(A->expandAt(L, Point, PtrTy, ParamTy)))); + } + + std::vector checks; + if (conflictChecks) { + DenseSet done; //keep track of which were done to not make duplicate checks + for (AffAcc *A : Accs) { + auto conflicts = A->getConflicts(L); //get all AffAcc's with which A conflicts + for (const auto &p : conflicts) { + AffAcc *B = p.first; + if (done.find(B) != done.end()) continue; //this conflict was already handled when A was B (symmetry) + AffAccConflict kind = 
std::max(p.second, B->getConflict(A, L)); //take worse conflict + switch (kind) + { + case AffAccConflict::NoConflict: + break; //nothing to do + case AffAccConflict::MustNotIntersect: { + auto e = exps.find(B); + if (e == exps.end()) { //if B was not yet expanded, do that and update the iterator for the pair in exps + e = exps.insert(std::make_pair(B, std::move(B->expandAt(L, Point, PtrTy, ParamTy)))).first; + } + assert(e->first == B); + ExpandedAffAcc &expB = e->getSecond(); + ExpandedAffAcc &expA = exps.find(A)->getSecond(); //guaranteed to exist + Value *x = builder.CreateICmpULT(expA.UpperBound, expB.LowerBound, "no.inter.ab"); + Value *y = builder.CreateICmpULT(expB.UpperBound, expA.LowerBound, "no.inter.ba"); + checks.push_back(builder.CreateOr(x, y, "no.intersect")); + break; + } + case AffAccConflict::Bad: + llvm_unreachable("cannot expand the given accesses because some of them have a bad conflict in L!"); + break; + default: + llvm_unreachable("unknown conflict type"); + } + } + } + } + + if (repChecks) { + DenseSet loops; //find all relevant loops + for (auto &p : exps) { + AffAcc *A = p.first; + for (unsigned d = 0u; d < A->loopToDimension(L); d++) { + const Loop *x = A->getLoop(d); + if (x) loops.insert(x); + } + } + for (const Loop *M : loops) { //generate checks for the loops + auto p = reps.find(M); + assert(p != reps.end()); + checks.push_back(p->second->expandLoopGuard(Point)); + } + } + + if (checks.empty()) BoundCheck = builder.getTrue(); + else BoundCheck = builder.CreateAnd(checks); + + std::vector res; + for (AffAcc *A : Accs) { + res.push_back(std::move(exps.find(A)->getSecond())); //(can move because exps not needed anymore) + } + return res; +} + +//================== Affine Access Analysis ================================================== + +AnalysisKey AffineAccessAnalysis::Key; + +// run of the analysis pass +AffineAccess AffineAccessAnalysis::run(Function &F, FunctionAnalysisManager &FAM) { + + LLVM_DEBUG(dbgs()<<"running AffineAccessAnalysis on "<(F); + DominatorTree &DT = FAM.getResult(F); + ScalarEvolution &SE = FAM.getResult(F); + auto &MSSAA = FAM.getResult(F); + MemorySSA &MSSA = MSSAA.getMSSA(); + AAResults &AA = FAM.getResult(F); + DependenceInfo &DI = FAM.getResult(F); + + return AffineAccess(F, SE, DT, LI, MSSA, AA, DI); +} + +//================== Affine Acces Analysis Pass for opt ======================================= +PreservedAnalyses AffineAccessAnalysisPass::run(Function &F, FunctionAnalysisManager &FAM) { + AffineAccess AA = FAM.getResult(F); + for (const Loop *L : AA.getLI().getLoopsInPreorder()){ + L->dump(); + for (const AffAcc *A : AA.getExpandableAccesses(L)){ + A->dumpInLoop(L); + } + } + return PreservedAnalyses::all(); +} + diff --git a/llvm/lib/Analysis/CMakeLists.txt b/llvm/lib/Analysis/CMakeLists.txt index f31cf349b09aa..887b2176fe730 100644 --- a/llvm/lib/Analysis/CMakeLists.txt +++ b/llvm/lib/Analysis/CMakeLists.txt @@ -14,6 +14,7 @@ if (DEFINED LLVM_HAVE_TF_AOT OR DEFINED LLVM_HAVE_TF_API) endif() add_llvm_component_library(LLVMAnalysis + AffineAccessAnalysis.cpp AliasAnalysis.cpp AliasAnalysisEvaluator.cpp AliasAnalysisSummary.cpp diff --git a/llvm/lib/CodeGen/PostRASchedulerList.cpp b/llvm/lib/CodeGen/PostRASchedulerList.cpp index b85f00a61eac1..157ee3f183726 100644 --- a/llvm/lib/CodeGen/PostRASchedulerList.cpp +++ b/llvm/lib/CodeGen/PostRASchedulerList.cpp @@ -268,10 +268,12 @@ bool PostRAScheduler::enablePostRAScheduler( TargetSubtargetInfo::RegClassVector &CriticalPathRCs) const { Mode = ST.getAntiDepBreakMode(); 
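/* For reference: the MustNotIntersect case in AffineAccess::expandAllAt above
   boils down to a standard interval-overlap test over the expanded lower and
   upper address bounds of the two accesses. A minimal self-contained sketch
   (the function and value names here are illustrative, not part of this patch):

   // Returns an i1 that is true iff the address ranges [ALo, AHi] and
   // [BLo, BHi] of two affine accesses are disjoint.
   static Value *emitNoIntersectCheck(IRBuilder<> &Builder, Value *ALo,
                                      Value *AHi, Value *BLo, Value *BHi) {
     Value *X = Builder.CreateICmpULT(AHi, BLo, "no.inter.ab"); // A ends below B
     Value *Y = Builder.CreateICmpULT(BHi, ALo, "no.inter.ba"); // B ends below A
     return Builder.CreateOr(X, Y, "no.intersect"); // disjoint in either order
   }
*/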
  ST.getCriticalPathRCs(CriticalPathRCs);
 
-  // Check for explicit enable/disable of post-ra scheduling.
-  if (EnablePostRAScheduler.getPosition() > 0)
+  if (EnablePostRAScheduler.getPosition() > 0) {
     return EnablePostRAScheduler;
+  }
+
+  // return true; //FIXME: Snitch does not enable this by default (and probably should)
 
   return ST.enablePostRAScheduler() &&
          OptLevel >= ST.getOptLevelToEnablePostRAScheduler();
@@ -291,7 +293,6 @@ bool PostRAScheduler::runOnMachineFunction(MachineFunction &Fn) {
   TargetSubtargetInfo::AntiDepBreakMode AntiDepMode =
       TargetSubtargetInfo::ANTIDEP_NONE;
   SmallVector<const TargetRegisterClass *, 4> CriticalPathRCs;
-  // Check that post-RA scheduling is enabled for this target.
   // This may upgrade the AntiDepMode.
   if (!enablePostRAScheduler(Fn.getSubtarget(), PassConfig->getOptLevel(),
@@ -307,11 +308,13 @@ bool PostRAScheduler::runOnMachineFunction(MachineFunction &Fn) {
         : TargetSubtargetInfo::ANTIDEP_NONE);
   }
 
+  // AntiDepMode = TargetSubtargetInfo::ANTIDEP_ALL; //FIXME: Snitch does not enable this by default (and probably should)
+
   LLVM_DEBUG(dbgs() << "PostRAScheduler\n");
 
   SchedulePostRATDList Scheduler(Fn, MLI, AA, RegClassInfo, AntiDepMode,
                                  CriticalPathRCs);
-
+  
   // Loop over all of the basic blocks
   for (auto &MBB : Fn) {
 #ifndef NDEBUG
diff --git a/llvm/lib/Passes/CMakeLists.txt b/llvm/lib/Passes/CMakeLists.txt
index d834c0db4b458..fa0efb387353a 100644
--- a/llvm/lib/Passes/CMakeLists.txt
+++ b/llvm/lib/Passes/CMakeLists.txt
@@ -16,6 +16,7 @@ add_llvm_component_library(LLVMPasses
   Core
   Coroutines
   HelloNew
+  SSR
   IPO
   InstCombine
   ObjCARC
diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp
index 6c1a7c75d30a2..0680b98465a4b 100644
--- a/llvm/lib/Passes/PassBuilder.cpp
+++ b/llvm/lib/Passes/PassBuilder.cpp
@@ -16,6 +16,7 @@
 #include "llvm/Passes/PassBuilder.h"
 #include "llvm/ADT/StringSwitch.h"
+#include "llvm/Analysis/AffineAccessAnalysis.h"
 #include "llvm/Analysis/AliasAnalysisEvaluator.h"
 #include "llvm/Analysis/AliasSetTracker.h"
 #include "llvm/Analysis/AssumptionCache.h"
@@ -85,6 +86,8 @@
 #include "llvm/Transforms/Coroutines/CoroElide.h"
 #include "llvm/Transforms/Coroutines/CoroSplit.h"
 #include "llvm/Transforms/HelloNew/HelloWorld.h"
+#include "llvm/Transforms/SSR/SSRInference.h"
+#include "llvm/Transforms/SSR/SSRGeneration.h"
 #include "llvm/Transforms/IPO/AlwaysInliner.h"
 #include "llvm/Transforms/IPO/Annotation2Metadata.h"
 #include "llvm/Transforms/IPO/ArgumentPromotion.h"
@@ -518,7 +521,6 @@ PassBuilder::buildO1FunctionSimplificationPipeline(OptimizationLevel Level,
                                                    ThinOrFullLTOPhase Phase) {
   FunctionPassManager FPM(DebugLogging);
-
   // Form SSA out of local memory accesses after breaking apart aggregates into
   // scalars.
   FPM.addPass(SROA());
@@ -555,6 +557,7 @@ PassBuilder::buildO1FunctionSimplificationPipeline(OptimizationLevel Level,
   // Simplify the loop body. We do this initially to clean up after other loop
   // passes run, either when iterating on a loop or on inner loops with
   // implications on the outer loop.
+
   LPM1.addPass(LoopInstSimplifyPass());
   LPM1.addPass(LoopSimplifyCFGPass());
 
@@ -593,6 +596,7 @@ PassBuilder::buildO1FunctionSimplificationPipeline(OptimizationLevel Level,
                                               DebugLogging));
   FPM.addPass(SimplifyCFGPass());
   FPM.addPass(InstCombinePass());
+
   if (EnableLoopFlatten) FPM.addPass(LoopFlattenPass());
 
   // The loop passes in LPM2 (LoopFullUnrollPass) do not preserve MemorySSA.
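The hunks below place `SSRInferencePass` in the function simplification pipeline right after `InstCombinePass` and register the new passes in `PassRegistry.def` under the names `infer-ssr`, `generate-ssr`, and `affine-access-pass`. A minimal sketch of driving the inference standalone through the new pass manager (this helper is illustrative and not part of the patch):

```cpp
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Transforms/InstCombine/InstCombine.h"
#include "llvm/Transforms/SSR/SSRInference.h"
using namespace llvm;

// Run "instcombine + infer-ssr" on a single function, mirroring the position
// that buildFunctionSimplificationPipeline gives to SSRInferencePass.
static void runSSRInference(Function &F) {
  PassBuilder PB;
  LoopAnalysisManager LAM;
  FunctionAnalysisManager FAM;
  CGSCCAnalysisManager CGAM;
  ModuleAnalysisManager MAM;
  PB.registerModuleAnalyses(MAM);
  PB.registerCGSCCAnalyses(CGAM);
  PB.registerFunctionAnalyses(FAM); // also registers AffineAccessAnalysis
  PB.registerLoopAnalyses(LAM);
  PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);

  FunctionPassManager FPM;
  FPM.addPass(InstCombinePass());
  FPM.addPass(SSRInferencePass()); // infer SSR streams on the simplified IR
  FPM.run(F, FAM);
}
```

The same pipeline position can be exercised from the command line with `opt -passes='function(instcombine,infer-ssr)'`.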
@@ -643,7 +647,6 @@ FunctionPassManager PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level, ThinOrFullLTOPhase Phase) { assert(Level != OptimizationLevel::O0 && "Must request optimizations!"); - // The O1 pipeline has a separate pipeline creation function to simplify // construction readability. if (Level.getSpeedupLevel() == 1) @@ -755,8 +758,11 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level, FPM.addPass(createFunctionToLoopPassAdaptor( std::move(LPM1), EnableMSSALoopDependency, /*UseBlockFrequencyInfo=*/true, DebugLogging)); + FPM.addPass(SimplifyCFGPass()); FPM.addPass(InstCombinePass()); + FPM.addPass(SSRInferencePass()); + if (EnableLoopFlatten) FPM.addPass(LoopFlattenPass()); // The loop passes in LPM2 (LoopIdiomRecognizePass, IndVarSimplifyPass, @@ -793,7 +799,7 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level, // opportunities opened up by them. FPM.addPass(InstCombinePass()); invokePeepholeEPCallbacks(FPM, Level); - + // Re-consider control flow based optimizations after redundancy elimination, // redo DCE, etc. FPM.addPass(JumpThreadingPass()); @@ -988,6 +994,8 @@ PassBuilder::buildInlinerPipeline(OptimizationLevel Level, MainCGPipeline.addPass(createCGSCCToFunctionPassAdaptor( buildFunctionSimplificationPipeline(Level, Phase))); + //MainCGPipeline.addPass(createCGSCCToFunctionPassAdaptor(SSRInferencePass())); + return MIWP; } diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def index 877cb9ed13b37..cb16781e4ecd2 100644 --- a/llvm/lib/Passes/PassRegistry.def +++ b/llvm/lib/Passes/PassRegistry.def @@ -146,6 +146,7 @@ CGSCC_PASS("no-op-cgscc", NoOpCGSCCPass()) #ifndef FUNCTION_ANALYSIS #define FUNCTION_ANALYSIS(NAME, CREATE_PASS) #endif +FUNCTION_ANALYSIS("affine-access", AffineAccessAnalysis()) FUNCTION_ANALYSIS("aa", AAManager()) FUNCTION_ANALYSIS("assumptions", AssumptionAnalysis()) FUNCTION_ANALYSIS("block-freq", BlockFrequencyAnalysis()) @@ -190,6 +191,9 @@ FUNCTION_ALIAS_ANALYSIS("tbaa", TypeBasedAA()) #ifndef FUNCTION_PASS #define FUNCTION_PASS(NAME, CREATE_PASS) #endif +FUNCTION_PASS("infer-ssr", SSRInferencePass()) +FUNCTION_PASS("generate-ssr", SSRGenerationPass()) +FUNCTION_PASS("affine-access-pass", AffineAccessAnalysisPass()) FUNCTION_PASS("aa-eval", AAEvaluator()) FUNCTION_PASS("adce", ADCEPass()) FUNCTION_PASS("add-discriminators", AddDiscriminatorsPass()) diff --git a/llvm/lib/Target/RISCV/CMakeLists.txt b/llvm/lib/Target/RISCV/CMakeLists.txt index c822929f94770..2dbd3f5e9be12 100644 --- a/llvm/lib/Target/RISCV/CMakeLists.txt +++ b/llvm/lib/Target/RISCV/CMakeLists.txt @@ -27,6 +27,7 @@ add_llvm_target(RISCVCodeGen RISCVExpandAtomicPseudoInsts.cpp RISCVExpandPseudoInsts.cpp RISCVExpandSSRInsts.cpp + RISCVExpandSSRInstsPostRegAlloc.cpp RISCVExpandSDMAInsts.cpp RISCVFrameLowering.cpp RISCVInstrInfo.cpp @@ -42,7 +43,10 @@ add_llvm_target(RISCVCodeGen RISCVTargetMachine.cpp RISCVTargetObjectFile.cpp RISCVTargetTransformInfo.cpp + RISCVSSRReassociate.cpp + RISCVSSRStatistics.cpp Snitch/SNITCHFrepLoops.cpp + Snitch/SNITCHAutoFrep.cpp LINK_COMPONENTS Analysis diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp index 5f8d6e1375187..6ba9c6901c8e8 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp @@ -86,6 +86,7 @@ void RISCVInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, 
                                    raw_ostream &O, const char *Modifier) {
   assert((Modifier == 0 || Modifier[0] == 0) && "No modifiers supported");
+
   const MCOperand &MO = MI->getOperand(OpNo);
 
   if (MO.isReg()) {
diff --git a/llvm/lib/Target/RISCV/RISCV.h b/llvm/lib/Target/RISCV/RISCV.h
index 2cd960d7587d8..f731a4b7c1fbb 100644
--- a/llvm/lib/Target/RISCV/RISCV.h
+++ b/llvm/lib/Target/RISCV/RISCV.h
@@ -45,6 +45,18 @@ void initializeRISCVMergeBaseOffsetOptPass(PassRegistry &);
 FunctionPass *createRISCVExpandPseudoPass();
 void initializeRISCVExpandPseudoPass(PassRegistry &);
 
+FunctionPass *createRISCVExpandSSRPostRegAllocPass();
+void initializeRISCVExpandSSRPostRegAllocPass(PassRegistry &);
+
+FunctionPass *createSNITCHAutoFrepPass();
+void initializeSNITCHAutoFrepPass(PassRegistry &);
+
+FunctionPass *createSSRReassociatePass();
+void initializeSSRReassociatePass(PassRegistry &);
+
+FunctionPass *createSSRStatisticsPass();
+void initializeSSRStatisticsPass(PassRegistry &);
+
 FunctionPass *createRISCVExpandAtomicPseudoPass();
 void initializeRISCVExpandAtomicPseudoPass(PassRegistry &);
 
@@ -54,6 +66,8 @@ void initializeRISCVCleanupVSETVLIPass(PassRegistry &);
 FunctionPass *createRISCVExpandSSRPass();
 void initializeRISCVExpandSSRPass(PassRegistry &);
 
+//TODO: reference the function pass for automatic SSR inference here (+ add it to CMakeLists.txt)
+
 FunctionPass *createRISCVExpandSDMAPass();
 void initializeRISCVExpandSDMAPass(PassRegistry &);
diff --git a/llvm/lib/Target/RISCV/RISCVExpandSSRInsts.cpp b/llvm/lib/Target/RISCV/RISCVExpandSSRInsts.cpp
index a90f199b6b1d6..93156b2315546 100644
--- a/llvm/lib/Target/RISCV/RISCVExpandSSRInsts.cpp
+++ b/llvm/lib/Target/RISCV/RISCVExpandSSRInsts.cpp
@@ -62,11 +62,6 @@ using namespace llvm;
 
 #define DEBUG_TYPE "riscv-ssr"
 
-/// Command line options
-static cl::opt<bool>
-    SSRRegisterMerge("ssr-noregmerge", cl::Hidden,
-                     cl::desc("Disable the merging of SSR registers in other instructions"));
-
 #define RISCV_EXPAND_SSR_NAME "RISCV SSR pseudo instruction expansion pass"
 
 #define NUM_SSR 3
@@ -78,12 +73,6 @@ class RISCVExpandSSR : public MachineFunctionPass {
   const RISCVInstrInfo *TII;
   static char ID;
 
-  /// Parameters for the register merging pass
-  struct RegisterMergingPreferences {
-    /// enable the register merging
-    bool Enable;
-  };
-
   RISCVExpandSSR() : MachineFunctionPass(ID) {
     initializeRISCVExpandSSRPass(*PassRegistry::getPassRegistry());
   }
@@ -96,10 +85,10 @@ class RISCVExpandSSR : public MachineFunctionPass {
   const MachineFunction *MF;
   RISCVMachineFunctionInfo *RVFI;
-  bool Enabled;
+  std::vector<MachineInstr *> MoveLoads;
+  std::vector<MachineInstr *> MoveStores;
 
   bool expandMBB(MachineBasicBlock &MBB);
-  void mergePushPop(MachineBasicBlock &MBB);
   bool expandMI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
                 MachineBasicBlock::iterator &NextMBBI);
   bool expandSSR_Setup(MachineBasicBlock &MBB,
@@ -119,8 +108,7 @@ class RISCVExpandSSR : public MachineFunctionPass {
   bool expandSSR_Barrier(MachineBasicBlock &MBB,
                          MachineBasicBlock::iterator MBBI,
                          MachineBasicBlock::iterator &NextMBBI);
-
-  RISCVExpandSSR::RegisterMergingPreferences gatherRegisterMergingPreferences();
+  void handlePushPops();
 };
 
 char RISCVExpandSSR::ID = 0;
@@ -140,17 +128,14 @@ bool RISCVExpandSSR::runOnMachineFunction(MachineFunction &MF) {
   TII = static_cast<const RISCVInstrInfo *>(MF.getSubtarget().getInstrInfo());
   this->MF = &MF;
   this->RVFI = MF.getInfo<RISCVMachineFunctionInfo>();
-  Enabled = false;
+  this->MoveLoads.clear();
+  this->MoveStores.clear();
 
   bool Modified = false;
   for (auto &MBB : MF) Modified |= expandMBB(MBB);
 
-  // Run over MF again to merge SSR pops/pushes into instruction uses
-  RISCVExpandSSR::RegisterMergingPreferences RMP = gatherRegisterMergingPreferences();
-  if(RMP.Enable && RVFI->getUsedSSR())
-    for (auto &MBB : MF)
-      mergePushPop(MBB);
+  handlePushPops();
 
   /// "Forcefully" add all SSR registers as live-in to all MBB in this MF
   if(Modified) {
@@ -161,6 +146,10 @@ bool RISCVExpandSSR::runOnMachineFunction(MachineFunction &MF) {
     }
   }
 
+  //errs()<<"\n ========================= DUMP MF ========================== \n";
+  //MF.dump();
+  //errs()<<"\n ======================= END DUMP MF ========================== \n";
+
   return Modified;
 }
@@ -211,17 +200,6 @@ bool RISCVExpandSSR::expandMI(MachineBasicBlock &MBB,
     return expandSSR_Barrier(MBB, MBBI, NextMBBI);
   }
 
-  // Prevent excessive live-ins, they pose a problem with multiple SSR regions
-  // in a single function. Adding SSR regs to live ins in push/pop should suffice
-  // for now, but there might be edge cases
-
-  // if(Enabled) {
-  //   // mark the SSR registers reserved in this BB
-  //   unsigned ssrEnabledMask = 0;
-  //   for (unsigned n = 0; n < NUM_SSR; ++n)
-  //     MBB.addLiveIn(getSSRFtReg(n));
-  // }
-
   return false;
 }
@@ -272,30 +250,25 @@ bool RISCVExpandSSR::expandSSR_PushPop(MachineBasicBlock &MBB,
   LLVM_DEBUG(dbgs() << "-- Expanding SSR " << (isPop?"Pop":"Push") << "\n");
   LLVM_DEBUG(dbgs() << "   Using register " << R << " for SSR streamer " << ssr_no << "\n");
 
   if(isPop) {
-    BuildMI(MBB, MBBI, DL, TII->get(RISCV::FSGNJ_D), MBBI->getOperand(ssrValIdx).getReg())
-        .addReg(R, 0)
-        .addReg(R, 0);
-    // BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::COPY), MBBI->getOperand(ssrValIdx).getReg())
-    //     .addReg(R, 0);
+    // Insert a "loading move": like a normal move, but with side effects
+    Register valR = MBBI->getOperand(ssrValIdx).getReg();
+    MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII->get(RISCV::PseudoLoadMove), valR).addReg(R, 0).getInstr();
+    MBBI->eraseFromParent(); // The pseudo instruction is gone now.
+    MI->getOperand(0).setIsDef();
+    this->MoveLoads.push_back(MI);
   } else {
-    // Build a copy instruction that moves the value from the register passed as
-    // argument to the ssr data register (R)
-    BuildMI(MBB, MBBI, DL, TII->get(RISCV::FSGNJ_D), R)
-        .addReg(MBBI->getOperand(ssrValIdx).getReg())
-        .addReg(MBBI->getOperand(ssrValIdx).getReg());
-    // BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::COPY), R)
-    //     .addReg(MBBI->getOperand(ssrValIdx).getReg());
+    Register valR = MBBI->getOperand(ssrValIdx).getReg();
+    // Insert a "storing move": like a normal move, but with side effects
+    MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII->get(RISCV::PseudoStoreMove), R)
+                           .addReg(valR, getRegState(MBBI->getOperand(ssrValIdx)))
+                           .getInstr();
+    MBBI->eraseFromParent(); // The pseudo instruction is gone now.
+    this->MoveStores.push_back(MI);
   }
 
   MBB.addLiveIn(R);
-  MBBI->eraseFromParent(); // The pseudo instruction is gone now.
 
   return true;
 }
@@ -411,7 +384,6 @@ bool RISCVExpandSSR::expandSSR_EnDis(MachineBasicBlock &MBB,
   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
 
   LLVM_DEBUG(dbgs() << "-- Expanding SSR " << (isEnable ? 
"Enable" : "Disable") << "\n"); - Enabled = isEnable; // emit a csrsi/csrci call to the SSR location if(isEnable) { @@ -479,96 +451,9 @@ bool RISCVExpandSSR::expandSSR_Barrier(MachineBasicBlock &MBB, return true; } -void RISCVExpandSSR::mergePushPop(MachineBasicBlock &MBB) { - SmallSet virtRegs[NUM_SSR]; - const TargetRegisterInfo *TRI = MBB.getParent()->getRegInfo().getTargetRegisterInfo(); - bool inSSRRegion = false; - - Register ssr_regs[NUM_SSR]; - for(unsigned ssr_no = 0; ssr_no < NUM_SSR; ++ssr_no) ssr_regs[ssr_no] = getSSRFtReg(ssr_no); - - // First pass: Detect moves to or from SSR registers - for (auto MI = MBB.begin() ; MI != MBB.end() ; ) { - MachineBasicBlock::iterator NMI = std::next(MI); - - LLVM_DEBUG(dbgs()<<"Analyzing: "<<*MI<<"\n"); - - // detect an emitted pop and add assignment (virtual_reg, ssr_read) to list - if(MI->getOpcode() == RISCV::FSGNJ_D) { - LLVM_DEBUG(dbgs()<<"Found FSGNJ_D, Op 0: " << MI->getOperand(1).getReg() << " Op1: " << MI->getOperand(2).getReg() << "\n"); - - // look for each streamer register - for(unsigned ssr_no = 0; ssr_no < NUM_SSR; ++ssr_no) { - // check for pop - if(MI->getOperand(1).getReg() == ssr_regs[ssr_no] && MI->getOperand(2).getReg() == ssr_regs[ssr_no]) { - LLVM_DEBUG(dbgs()<<" pop: both operands from SSR"<< ssr_no <<"\n"); - // append virtual register to list of assigned virtuals - LLVM_DEBUG(dbgs()<<" append: "<< MI->getOperand(0).getReg() <<"\n"); - virtRegs[ssr_no].insert(MI->getOperand(0).getReg()); - // remove operation - MI->eraseFromParent(); - break; - } - // TODO: check for push - else if(MI->getOperand(0).getReg() == ssr_regs[ssr_no]) { - // This is non-trivial because a register might be used elsewhere, therefore the entire MBB - // must be analyzed and a merge can only be made, if the register is written once - // LLVM_DEBUG(dbgs()<<" push: operand 0 from SSR"<< ssr_no <<"\n"); - // // append virtual register to list of assigned virtuals - // LLVM_DEBUG(dbgs()<<" append: "<< MI->getOperand(1).getReg() <<"\n"); - // virtRegs[ssr_no].insert(MI->getOperand(1).getReg()); - // // remove operation - // MI->eraseFromParent(); - break; - } - } - } - MI = NMI; - } - - // DBG - for(unsigned ssr_no = 0; ssr_no < NUM_SSR; ++ssr_no) { - for (auto iter = virtRegs[ssr_no].begin() ; iter != virtRegs[ssr_no].end() ; ++iter) - LLVM_DEBUG(dbgs() << "virtregs["<operands_begin() ; operand != MI->operands_end() ; ++operand) { - if(!operand->isReg()) continue; - // check if operand is in any SSR list - for(unsigned ssr_no = 0; ssr_no < NUM_SSR; ++ssr_no) { - if(virtRegs[ssr_no].contains(operand->getReg())) { - LLVM_DEBUG(dbgs() << "Found use of operand " << operand->getReg() << " ssr: " << ssr_no << " in inst " << MI->getOpcode() << "\n"); - // substitute with SSR register - MI->substituteRegister(operand->getReg(), ssr_regs[ssr_no], 0, *TRI); - // guard this block and add ssr regs to live in - MBB.addLiveIn(ssr_regs[ssr_no]); - } - } - } - MI = NMI; - } - MBB.sortUniqueLiveIns(); -} - -/// Gather parameters for the register merging -RISCVExpandSSR::RegisterMergingPreferences RISCVExpandSSR::gatherRegisterMergingPreferences() { - RISCVExpandSSR::RegisterMergingPreferences RMP; - - // set up defaults - RMP.Enable = true; - - // read user - if (SSRRegisterMerge.getNumOccurrences() > 0) - RMP.Enable = !SSRRegisterMerge; - - LLVM_DEBUG(dbgs() << "RMP Enable "<> bundles; + //pops: + for (MachineInstr *MI : this->MoveLoads) { + if (!MI) continue; + MachineInstr *SingleUser = getUniqueUser(MI, MI->getOperand(0).getReg()); + if (SingleUser && 
SingleUser->getParent() == MI->getParent()) { + MI->moveBefore(SingleUser); //we pray that there was no reordering until now that moved SingleUser after the SSRDisable + auto b = bundles.find(SingleUser); + if (b == bundles.end()) { + b = bundles.insert(std::make_pair(SingleUser, std::make_pair(SingleUser, SingleUser))).first; + } + if (b->getSecond().first == SingleUser) b->getSecond().first = MI; //if begin of bundle was SingleUser, set to MI + } + } + //pushs: FIXME: currently only works if the defining instruction is pred of MoveStore (how to get def from MachineOperand ???) + for (MachineInstr *MI : this->MoveStores) { + Register valR = MI->getOperand(1).getReg(); + MachineInstr *Pred = MI->getPrevNode(); + bool doesDefvalR = false; + for (auto &MOP : Pred->defs()) doesDefvalR |= MOP.isReg() && MOP.getReg() == valR; + if (doesDefvalR && MI == getUniqueUser(Pred, valR)) { + auto b = bundles.find(Pred); + if (b == bundles.end()) { + b = bundles.insert(std::make_pair(Pred, std::make_pair(Pred, Pred))).first; + } + if (b->getSecond().second == Pred) b->getSecond().second = MI; + } + }*/ \ No newline at end of file diff --git a/llvm/lib/Target/RISCV/RISCVExpandSSRInstsPostRegAlloc.cpp b/llvm/lib/Target/RISCV/RISCVExpandSSRInstsPostRegAlloc.cpp new file mode 100644 index 0000000000000..2297f189fc1a3 --- /dev/null +++ b/llvm/lib/Target/RISCV/RISCVExpandSSRInstsPostRegAlloc.cpp @@ -0,0 +1,476 @@ +//===-- RISCVExpandSSRPostRegAllocInsts.cpp - Expand the rest of the SSR pseudo insts ---------===// +// +// ??? +// +//===----------------------------------------------------------------------===// +// +// This file contains a pass that expands the PseudoLoadMove and PseudoStoreMove +// into normal moves and is meant to be run after any scheduling to guarantee +// correctness. 
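+//
+// For example (illustrative, not taken from real compiler output): a pop from
+// data mover 0 that is still abstract after the pre-RA expansion,
+//
+//   $fa0 = PseudoLoadMove $ft0     (read one element from stream 0)
+//
+// becomes an ordinary FP sign-injection move from the SSR data register,
+//
+//   fsgnj.d fa0, ft0, ft0          (equivalent to fmv.d fa0, ft0)
+//
+// and the register merging below (disabled with -ssr-no-regmerge) then tries
+// to fold even that move away by rewriting its unique user to read ft0
+// directly.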
+//
+//===----------------------------------------------------------------------===//
+
+#include "RISCV.h"
+#include "RISCVInstrInfo.h"
+#include "RISCVTargetMachine.h"
+#include "RISCVMachineFunctionInfo.h"
+
+#include "llvm/CodeGen/LivePhysRegs.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/LowLevelType.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/Support/CommandLine.h"
+
+#include "llvm/CodeGen/RegisterClassInfo.h"
+#include "llvm/CodeGen/AntiDepBreaker.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "riscv-ssr"
+
+namespace llvm {
+  /// Command line options
+  cl::opt<bool> SSRNoRegisterMerge("ssr-no-regmerge", cl::init(false),
+    cl::desc("Disable the merging of SSR registers in other instructions"));
+}
+
+#define RISCV_EXPAND_SSR_POST_REG_ALLOC_NAME "RISCV SSR pseudo instruction expansion pass post reg alloc"
+
+#define NUM_SSR 3
+
+namespace {
+
+class RISCVExpandSSRPostRegAlloc : public MachineFunctionPass {
+public:
+  const RISCVInstrInfo *TII;
+  static char ID;
+
+  RISCVExpandSSRPostRegAlloc() : MachineFunctionPass(ID) {
+    initializeRISCVExpandSSRPostRegAllocPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  StringRef getPassName() const override { return RISCV_EXPAND_SSR_POST_REG_ALLOC_NAME; }
+
+private:
+  bool expandMBB(MachineBasicBlock &MBB);
+  bool mergePushPop(MachineBasicBlock &MBB);
+  bool expandMI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+                MachineBasicBlock::iterator &NextMBBI);
+  bool expandSSR_StoreLoadMove(MachineBasicBlock &MBB,
+                               MachineBasicBlock::iterator MBBI);
+};
+
+char RISCVExpandSSRPostRegAlloc::ID = 0;
+
+//from RISCVExpandSSRInsts.cpp
+static Register getSSRFtReg(unsigned streamer) {
+  unsigned AssignedReg = RISCV::F0_D + streamer;
+  // Advance the iterator to the assigned register until the valid
+  // register is found
+  const TargetRegisterClass *RC = &RISCV::FPR64RegClass;
+  TargetRegisterClass::iterator I = RC->begin();
+  for (; *I != AssignedReg; ++I)
+    assert(I != RC->end() && "AssignedReg should be a member of provided RC");
+  return Register(*I);
+}
+
+bool RISCVExpandSSRPostRegAlloc::runOnMachineFunction(MachineFunction &MF) {
+  TII = static_cast<const RISCVInstrInfo *>(MF.getSubtarget().getInstrInfo());
+
+  bool Modified = false;
+  for (auto &MBB : MF) Modified |= expandMBB(MBB);
+
+  if (SSRNoRegisterMerge) LLVM_DEBUG(dbgs()<<"regmerge disabled\n");
+  if (!SSRNoRegisterMerge && Modified){
+    for (auto &MBB : MF) mergePushPop(MBB);
+  }
+
+  return Modified;
+}
+
+bool RISCVExpandSSRPostRegAlloc::expandMBB(MachineBasicBlock &MBB) {
+  bool Modified = false;
+
+  MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
+  while (MBBI != E) {
+    MachineBasicBlock::iterator NMBBI = std::next(MBBI);
+    Modified |= expandMI(MBB, MBBI, NMBBI);
+    MBBI = NMBBI;
+  }
+  MBB.sortUniqueLiveIns();
+
+  return Modified;
+}
+
+bool RISCVExpandSSRPostRegAlloc::expandMI(MachineBasicBlock &MBB,
+                                          MachineBasicBlock::iterator MBBI,
+                                          MachineBasicBlock::iterator &NextMBBI) {
+  switch (MBBI->getOpcode()) {
+  case RISCV::PseudoStoreMove:
+  case RISCV::PseudoLoadMove:
+    return expandSSR_StoreLoadMove(MBB, MBBI);
+  default:
+    return false;
+  }
+}
+
+bool RISCVExpandSSRPostRegAlloc::expandSSR_StoreLoadMove(MachineBasicBlock &MBB,
+                                                         MachineBasicBlock::iterator MBBI) {
+  DebugLoc DL = MBBI->getDebugLoc();
+
+  // Operand 0 is the register the pseudo defines (the move destination);
+  // operand 1 is the register it reads (the move source).
+  Register DstReg = MBBI->getOperand(0).getReg();
+  Register SrcReg = MBBI->getOperand(1).getReg();
+
+  BuildMI(MBB, MBBI, DL, TII->get(RISCV::FSGNJ_D), DstReg)
+      .addReg(SrcReg)
+      .addReg(SrcReg);
+
+  MBBI->eraseFromParent(); // The pseudo instruction is gone now.
+  return true;
+}
+
+static MachineOperand *getUniqueUser(MachineBasicBlock::instr_iterator beg,
+                                     MachineBasicBlock::instr_iterator end,
+                                     Register valR) {
+  if (beg.isEnd()) return nullptr;
+  auto *MBB = beg->getParent();
+  assert(MBB);
+
+  auto realend = MBB->end().getInstrIterator();
+
+  MachineOperand *UseMOP = nullptr;
+  bool isPastEnd = false;
+
+  for (auto MII = beg; MII != realend; ++MII) {
+    isPastEnd |= MII == end;
+    if (MII->isDebugInstr()) continue; //skip debug instructions
+    bool definesValR = false;
+
+    for (auto &MOP : MII->operands()) {
+      if (!MOP.isReg() || MOP.getReg() != valR) continue;
+      //at this point we know MII accesses valR with MOP (and maybe with other operands too)
+      definesValR |= MOP.isDef();
+      if (!isPastEnd && !UseMOP && !MOP.isDef()) {
+        UseMOP = &MOP; //if UseMOP was not yet found and MOP does not redefine valR, then MOP is the first use
+        if (MOP.isKill()) return UseMOP; //if MOP kills valR we can stop looking further and return
+      }
+    }
+
+    if (definesValR) {
+      return UseMOP; //if MII (re-)defines valR we must have found the use already (or there is none, in which case we return null)
+    }
+  }
+
+  if (MBB) {
+    bool avail_in_all = true;
+    MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
+
+    for (auto *Succ : MBB->successors()) {
+      if (!Succ) continue;
+      LivePhysRegs liveness(*MRI.getTargetRegisterInfo());
+      liveness.addLiveIns(*Succ);
+      avail_in_all &= liveness.available(MRI, valR);
+    }
+
+    if (avail_in_all) return UseMOP;
+  }
+
+  return nullptr;
+}
+
+bool RISCVExpandSSRPostRegAlloc::mergePushPop(MachineBasicBlock &MBB) {
+  Register ssr_regs[NUM_SSR];
+  for(unsigned ssr_no = 0; ssr_no < NUM_SSR; ++ssr_no) ssr_regs[ssr_no] = getSSRFtReg(ssr_no);
+
+  bool Modified = false;
+
+  for (auto ssr_reg : ssr_regs){
+    SmallPtrSet<MachineInstr *, 8> modified;
+    for (auto MI = MBB.rbegin().getInstrIterator(); MI != MBB.rend().getInstrIterator(); ){ //go from back to front
+      auto PMI = std::next(MI); //this is the previous instruction, because MI is a reverse iterator
+      if(MI->getOpcode() == RISCV::FSGNJ_D){
+        if (MI->getOperand(1).getReg() == ssr_reg && MI->getOperand(2).getReg() == ssr_reg && MI->getOperand(0).isReg()){ //this was an SSR pop
+          //limit the search range for regmerge if there is an ssr disable
+          MachineBasicBlock::instr_iterator rangeLimit = MI.getReverse();
+          for (; rangeLimit != MBB.end().getInstrIterator(); ++rangeLimit){
+            if (rangeLimit->getOpcode() == RISCV::CSRRCI
+                && rangeLimit->getOperand(1).isImm()
+                && rangeLimit->getOperand(1).getImm() == 0x7C0
+                && rangeLimit->getOperand(2).getImm() == 1)
+            {
+              break;
+            }
+          }
+          Register r = MI->getOperand(0).getReg(); //register to replace
+          MachineOperand *MO = getUniqueUser(std::next(MI.getReverse()), rangeLimit, r);
+          if (!MO) LLVM_DEBUG(dbgs()<<"*** NOT FOUND ***\n");
+          if (MO) { //if a unique user exists
+            MachineInstr *MIUser = MO->getParent();
+            if (MIUser && modified.find(MIUser) == modified.end()){ //if the unique user exists and was not yet modified
+              LLVM_DEBUG(MIUser->dump());
+              for (auto &MOP : MIUser->operands()) {
+                if (MOP.isReg() && !MOP.isDef() && MOP.getReg() == r) {
+                  MOP.setReg(ssr_reg); //replace all non-def uses of r with ssr_reg
+                  MOP.setIsKill(false);
+                  MOP.setIsRenamable(false);
+                }
+              }
+              LLVM_DEBUG(MIUser->dump());
+              MI->eraseFromBundle();
+              modified.insert(MIUser);
+            }
+          }
+        } else if(MI->getOperand(0).getReg() == ssr_reg){
+          if (MI->getOperand(1).isReg()
+              && MI->getOperand(2).isReg()
+              && 
MI->getOperand(1).getReg() == MI->getOperand(2).getReg()) + { //FIXME: use liveness analysis instead of .isKill() + Register R = MI->getOperand(1).getReg(); + MachineInstr *Pred = MI->getPrevNode(); + if (Pred && modified.find(Pred) == modified.end()){ //if Pred exists and is unmodified + bool predDefsR = false; + for (auto &MOP : Pred->defs()) { + predDefsR |= MOP.isReg() && MOP.isDef() && MOP.getReg() == R; + } + if (predDefsR) { //if Pred defines R + auto end = MI->getParent()->end().getInstrIterator(); + MachineOperand *MO = getUniqueUser(Pred->getIterator(), end, R); + if (MO && MO->getParent() == &*MI) { //if MI is unique user of R + LLVM_DEBUG(Pred->dump()); + for (auto &MOP : Pred->operands()) { + if (MOP.isReg() && MOP.isDef() && MOP.getReg() == R) { + MOP.setReg(ssr_reg); //replace all defs of R with ssr_reg + MOP.setIsDef(false); + MOP.setIsKill(false); + MOP.setIsDead(false); + MOP.setIsRenamable(false); + } + } + LLVM_DEBUG(Pred->dump()); + MI->eraseFromBundle(); + modified.insert(Pred); + } + } + } + } + } + } + MI = PMI; + } + } + return Modified; +} + +} // end of anonymous namespace + +INITIALIZE_PASS(RISCVExpandSSRPostRegAlloc, "riscv-expand-ssr-post-reg-alloc", + RISCV_EXPAND_SSR_POST_REG_ALLOC_NAME, false, false) +namespace llvm { + +FunctionPass *createRISCVExpandSSRPostRegAllocPass() { return new RISCVExpandSSRPostRegAlloc(); } + +} // end of namespace llvm + + +///REGMERGE USING LIVENESS, BUT SOMEHOW WORSE +// static std::pair isDefIsUse(MachineInstr &MI, MCRegister R) { +// bool def = false; +// bool use = false; +// for (auto &MOP : MI.operands()) { +// if (MOP.isReg() && MOP.getReg() == R) { +// if (MOP.isDef()) def = true; +// else use = true; +// } +// } +// return std::make_pair(def, use); +// } + +// struct Liveness { +// public: +// Liveness(const TargetRegisterInfo &TRI, const MachineRegisterInfo &MRI, MachineBasicBlock &MBB, bool end) : liveness(TRI), MBB(MBB), MRI(MRI) { +// if (end) { +// liveness.addLiveOuts(MBB); +// LiveIn = MBB.end().getInstrIterator(); +// } else { +// liveness.addLiveIns(MBB); +// LiveIn = MBB.begin().getInstrIterator(); +// } +// } + +// void MoveForward(MachineBasicBlock::instr_iterator Point) { +// if (Point == LiveIn) return; +// SmallVector, 1u> clb; +// while (LiveIn != Point && LiveIn != MBB.end().getInstrIterator()) { +// liveness.stepForward(*LiveIn, clb); +// LiveIn++; +// } +// assert(LiveIn == Point && "moved forward to point"); +// } + +// void MoveBackward(MachineBasicBlock::reverse_instr_iterator Point) { +// assert(Point != MBB.rend().getInstrIterator() && "not rend()"); +// if (Point.getReverse() == LiveIn) return; +// Point++; //in order to get LiveIN for Point we have to move up to and incl. 
Point +// MachineBasicBlock::reverse_instr_iterator LiveInRev = LiveIn.getReverse(); +// LiveInRev++; +// while (LiveInRev != Point && LiveInRev != MBB.rend().getInstrIterator()) { +// liveness.stepBackward(*LiveInRev); +// LiveInRev++; +// } +// LiveIn = std::next(LiveInRev.getReverse()); +// assert(LiveInRev == Point && "moved backward to point"); +// } + +// //move forward up to first use of Reg, make sure Reg is not live anymore afterwards +// MachineBasicBlock::instr_iterator findUniqueUser(MCRegister Reg, MachineBasicBlock::instr_iterator end) { +// while (LiveIn != end) { +// auto ut = isDefIsUse(*LiveIn, Reg); +// if (ut.first && !ut.second) return end; //redefined +// if (ut.first && ut.second) return LiveIn; //first user redefines himself +// MoveForward(std::next(LiveIn)); +// if (ut.second) { +// if (liveness.available(MRI, Reg)) std::prev(LiveIn); +// else { +// for (auto x = LiveIn; x != MBB.end().getInstrIterator(); ++x) { +// auto ut = isDefIsUse(*x, Reg); +// if (ut.first && !ut.second) return std::prev(LiveIn); //found redef. +// else if (ut.second) return end; //another use +// } +// return end; +// } +// } +// } +// return end; +// } + +// MachineBasicBlock::instr_iterator getPoint() const { return LiveIn; } +// const LivePhysRegs &getLiveness() const { return liveness; } +// void addReg(MCRegister R) { liveness.addReg(R); } + +// private: +// MachineBasicBlock::instr_iterator LiveIn; //INV: this always points to the instr for which liveness has live-in info +// LivePhysRegs liveness; +// MachineBasicBlock &MBB; +// const MachineRegisterInfo &MRI; +// }; + +// static bool isSSREn(const MachineInstr &MI) { +// return MI.getOpcode() == RISCV::CSRRSI +// && MI.getOperand(1).isImm() +// && MI.getOperand(1).getImm() == 0x7C0 +// && MI.getOperand(2).isImm() +// && MI.getOperand(2).getImm() == 1; +// } + +// static bool isSSRDis(const MachineInstr &MI) { +// return MI.getOpcode() == RISCV::CSRRCI +// && MI.getOperand(1).isImm() +// && MI.getOperand(1).getImm() == 0x7C0 +// && MI.getOperand(2).isImm() +// && MI.getOperand(2).getImm() == 1; +// } + +// static bool isSSRReg(MCRegister R) { +// for (unsigned s = 0u; s < NUM_SSR; s++) { +// if (getSSRFtReg(s).asMCReg() == R) return true; +// } +// return false; +// } + +// static unsigned getSSRRegIdx(MCRegister R) { +// return R - MCRegister(RISCV::F0_D); +// } + +// bool RISCVExpandSSRPostRegAlloc::mergePushPop(MachineBasicBlock &MBB) { +// bool Modified; + +// MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); +// const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo(); + +// recomputeLiveIns(MBB); +// recomputeLivenessFlags(MBB); + +// SmallSet modifiedInsts[NUM_SSR]; //keep track of which insts were merged into to avoid merging two different moves of same stream into one inst +// MachineBasicBlock::reverse_instr_iterator MII = MBB.rbegin().getInstrIterator(); +// MachineBasicBlock::instr_iterator SearchEnd = MBB.end().getInstrIterator(); +// while (MII != MBB.rend().getInstrIterator()) { +// auto NMII = std::next(MII); + +// if (isSSRDis(*MII)) { +// SearchEnd = MII.getReverse(); +// MII = NMII; +// continue; +// } + +// if (MII->getOpcode() == RISCV::FSGNJ_D) { +// auto &MOP0 = MII->getOperand(0); +// auto &MOP1 = MII->getOperand(1); +// auto &MOP2 = MII->getOperand(2); +// if (MOP0.isReg() && MOP1.isReg() && MOP2.isReg() && MOP1.getReg() == MOP2.getReg()) { +// if (isSSRReg(MOP1.getReg()) && MII != MBB.rbegin().getInstrIterator()) { //this is ssr pop (and there is at least one potential user) +// MCRegister dest 
= MOP0.getReg().asMCReg(); +// MCRegister ssr_reg = MOP1.getReg().asMCReg(); +// unsigned dmid = getSSRRegIdx(ssr_reg); +// //try to find unique user of dest +// Liveness Live(TRI, MRI, MBB, true); +// Live.MoveBackward(std::prev(MII)); //increment liveness to past MII +// auto user = Live.findUniqueUser(dest, SearchEnd); +// if (user != SearchEnd && modifiedInsts[dmid].find(&*user) == modifiedInsts[dmid].end()) { //found user +// user->dump(); +// for (auto &MOP : user->operands()) { +// if (MOP.isReg() && !MOP.isDef() && MOP.getReg() == dest) MOP.setReg(ssr_reg); //replace all non-def uses of r with ssr_reg +// } +// user->dump(); +// MII->eraseFromBundle(); +// modifiedInsts[dmid].insert(&*user); +// Modified = true; +// } +// } else if (isSSRReg(MOP0.getReg())) { +// MCRegister src = MOP1.getReg(); +// MCRegister ssr_reg = MOP0.getReg(); +// unsigned dmid = getSSRRegIdx(ssr_reg); +// MachineBasicBlock::reverse_instr_iterator beginSearch = std::next(MII); +// while (beginSearch != MBB.rend().getInstrIterator()) { +// if (isSSREn(*beginSearch)) break; +// auto ut = isDefIsUse(*beginSearch, src); +// if (ut.first) break; +// beginSearch++; +// } +// if (beginSearch != MBB.rend().getInstrIterator() && !isSSREn(*beginSearch)) { +// assert(isDefIsUse(*beginSearch, src).first && "does define src"); +// Liveness Live(TRI, MRI, MBB, true); +// Live.MoveBackward(std::prev(beginSearch)); +// auto user = Live.findUniqueUser(src, std::next(MII.getReverse())); +// if (user == MII.getReverse() && modifiedInsts[dmid].find(&*beginSearch) == modifiedInsts[dmid].end()) { +// beginSearch->dump(); +// for (auto &MOP : beginSearch->operands()) { +// if (MOP.isReg() && MOP.isDef() && MOP.getReg() == src) { +// MOP.setReg(ssr_reg); //replace all defs of R with ssr_reg +// MOP.setIsDef(false); +// } +// } +// beginSearch->dump(); +// MII->eraseFromBundle(); +// modifiedInsts[dmid].insert(&*beginSearch); +// Modified = true; +// } +// } +// } +// } +// } +// MII = NMII; +// } +// return Modified; +// } \ No newline at end of file diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXssr.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXssr.td index 67f38e03e1fc0..bc1e03d5eedc6 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXssr.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXssr.td @@ -105,10 +105,26 @@ class SPseudoPush: let usesCustomInserter = 0; } +class SPseudoStoreMove: //instead of using these could give isBarrier = 1 to ssr csrrsi/csrrci + Pseudo<(outs FPR64:$ssr), (ins FPR64:$val),[]> { + let mayLoad = 0; + let mayStore = 1; + let hasSideEffects = 1; + let usesCustomInserter = 0; +} + class SPseudoPop: Pseudo<(outs FPR64:$val), (ins uimm5:$ssr),[]> { let mayLoad = 1; - let mayStore = 0; + let mayStore = 1; + let hasSideEffects = 1; + let usesCustomInserter = 0; +} + +class SPseudoLoadMove: + Pseudo<(outs FPR64:$val), (ins FPR64:$ssr),[]> { + let mayLoad = 1; + let mayStore = 1; let hasSideEffects = 1; let usesCustomInserter = 0; } @@ -148,6 +164,8 @@ let Predicates = [HasExtXssr] in { def PseudoSSRSetup_1D_W : SPseudoSetup1D; def PseudoSSRPush : SPseudoPush; def PseudoSSRPop : SPseudoPop; + def PseudoStoreMove : SPseudoStoreMove; + def PseudoLoadMove : SPseudoLoadMove; foreach dim = [1, 2, 3, 4] in { def PseudoSSRSetupBoundStride_#dim#D : SPseudoSetupBoundStride; diff --git a/llvm/lib/Target/RISCV/RISCVSSRReassociate.cpp b/llvm/lib/Target/RISCV/RISCVSSRReassociate.cpp new file mode 100644 index 0000000000000..3fefec22f122c --- /dev/null +++ b/llvm/lib/Target/RISCV/RISCVSSRReassociate.cpp @@ -0,0 +1,528 @@ +//===- 
SSRReassociatePass.cpp - Reassociate Fast FP insts and move SSR push/pop intrinsics ------------------===// +// +// ??? +// +//===----------------------------------------------------------------------===// +// +// FIXME: The reassociation should really be done by the ReassociatePass but it +// for some reason does no reassociate fast FP insts? (maybe because it expects +// a normal out of order processor to vectorize anyway.) +// The reassociation is always done in full an can thus be quite slow when the +// dependency trees are large. Might want to introduce a max height or sth +// like that. +// Bubbling the Pushs/Pops might be better done in the pre RA ssr expand pass +// because we have more control over where they land there. +// This is not really meant to be used yet, so debug msg's are output by errs(). +// +//===----------------------------------------------------------------------===// + +#include "RISCV.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/CodeGen/RuntimeLibcalls.h" +#include "llvm/CodeGen/TargetLowering.h" +#include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/CodeGen/TargetSubtargetInfo.h" +#include "llvm/CodeGen/ValueTypes.h" +#include "llvm/IR/Attributes.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Constant.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/User.h" +#include "llvm/IR/Value.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/IntrinsicsRISCV.h" +#include "llvm/InitializePasses.h" +#include "llvm/Pass.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetMachine.h" +#include +#include +#include + +using namespace llvm; + +#define DEBUG_TYPE "ssr-reassociate" + +namespace llvm { + cl::opt AggressiveReassociate( + "ssr-aggressive-reassociation", + cl::init(false), + cl::desc("Reassociate aggressively and move ssr push/pop out of the way. In particular: reassociate also fast fp-ops") + ); + + cl::opt BubbleStreams( + "ssr-bubble-streams", + cl::init(0), + cl::desc( + "Try to schedule pops earlier and pushs later making \"windows\" holding the given nr. of instructions given." + "This gives more freedom to the scheduler in unrolled loops. 
If window is too large then there are not enough registers which leads to unnecessary spills" + "0 means off (default), negative number means max window size") + ); +} + +namespace { + + class SSRReassociate: public FunctionPass { + const TargetLowering *TLI = nullptr; + + public: + static char ID; // Pass identification, replacement for typeid + + SSRReassociate() : FunctionPass(ID) { + initializeSSRReassociatePass(*PassRegistry::getPassRegistry()); + } + + bool runOnFunction(Function &F) override; + + private: + bool runOnBB(BasicBlock &BB); + }; + +} // end anonymous namespace + +bool SSRReassociate::runOnFunction(Function &F) { + bool Modified = false; + + LLVM_DEBUG(dbgs()<<"SSR Reassociate Pass running on: "<(I) && + (cast(I).getIntrinsicID() == Intrinsic::riscv_ssr_push + || cast(I).getIntrinsicID() == Intrinsic::riscv_ssr_push); +} + +//put pops at top and pushs at bottom +static bool BubbleSSRIntrinsics(BasicBlock::iterator begin, BasicBlock::iterator end) { + bool Modified = false; + auto II = begin; + auto LastInsertedPopSucc = begin; + auto LastInsertedPush = std::prev(end); + auto FirstInsertedPush = end; + while (II != end && II != FirstInsertedPush) { + auto NII = std::next(II); + if (isa(*II)) { + auto &Intr = cast(*II); + if (Intr.getIntrinsicID() == Intrinsic::riscv_ssr_pop) { + Intr.moveBefore(&*LastInsertedPopSucc); + LastInsertedPopSucc = std::next(Intr.getIterator()); + Modified = true; + } else if (Intr.getIntrinsicID() == Intrinsic::riscv_ssr_push) { + Intr.moveAfter(&*LastInsertedPush); + LastInsertedPush = Intr.getIterator(); + Modified = true; + if (FirstInsertedPush == end) FirstInsertedPush = LastInsertedPush; + } + } + II = NII; + } + return Modified; +} + +//genetates the window that the above function uses to bubble +//windows depend are constrained by bubble_count and by ssr enable/disable calls +static bool BubbleSSRIntrinsics(BasicBlock &BB, unsigned bubble_count) { + bool Modified = false; + auto start = BB.getFirstInsertionPt(); + auto finish = start; + while (start != BB.end()) { + //increment finish until it hits an ssr enable / disable + unsigned w = 0; // or until we have bubble_count many instructions (non push/pop instructions) inside the window + while (finish != BB.end() && finish != BB.getTerminator()->getIterator() && w < bubble_count) { + assert(finish != BB.end()); + if (isa(*finish)) { + auto id = cast(*finish).getIntrinsicID(); + if (id == Intrinsic::riscv_ssr_enable || id == Intrinsic::riscv_ssr_disable) { + break; + } + } + if (!isPushPop(*finish) && !finish->isDebugOrPseudoInst()) w++; + finish++; + } + + Modified |= BubbleSSRIntrinsics(start, finish); + + if (finish != BB.getTerminator()->getIterator() && finish != BB.end()) finish++; //move past ssr en/dis + else break; // we are done + start = finish; + } + + return Modified; +} + +//put pops and pushs as close to their def/use as possible +static bool BubbleSSRIntrinsicsBack(BasicBlock &BB) { + bool Modified = false; + auto II = BB.getFirstInsertionPt(); + DenseSet vis; + while (II != BB.end()) { + auto NII = std::next(II); + if (vis.find(&*II) != vis.end()) { + II = NII; + continue; + } + vis.insert(&*II); + if (isa(*II)) { + auto &Intr = cast(*II); + if (Intr.getIntrinsicID() == Intrinsic::riscv_ssr_pop) { + Instruction *UU = nullptr; + for (User *U : Intr.users()) { + if (isa(U) && !UU) UU = cast(U); + else UU = nullptr; + } + if (UU) { + Intr.moveBefore(UU); + Modified = true; + } + } else if (Intr.getIntrinsicID() == Intrinsic::riscv_ssr_push) { + if (Instruction *D = 
dyn_cast(Intr.getOperand(1))) { + Intr.moveAfter(D); + Modified = true; + } + } + } + II = NII; + } + return Modified; +} + +static bool isAssociative(const Value &V) { + if (!isa(V)) return false; + const auto &I = cast(V); + if (I.getType()->isIntegerTy(1u)) return false; //ignore bools + if(I.isAssociative()) return true; + if (isa(I) && I.hasAllowReassoc()) return true; + // if ((I.getType()->isFloatingPointTy() && I.isFast())){ //https://gcc.gnu.org/wiki/FloatingPointMath + // switch (I.getOpcode()) + // { + // case Instruction::BinaryOps::FAdd: + // case Instruction::BinaryOps::FMul: + // return true; + // default: + // return false; + // } + // } + return false; +} + +// a bit redundant, but might allow to be extended +static bool isBinop(const Value &I) { + return isa(I); +} + +static unsigned getAndUpdateHeight(const Value &V, DenseMap &heights); //bc mutual recursion + +//assumes children have the correct height, updates the height of I accordingly +static unsigned updateHeightFromChildren(const BinaryOperator &I, DenseMap &heights) { + unsigned this_height = 1u + std::max( + getAndUpdateHeight(*I.getOperand(0), heights), + getAndUpdateHeight(*I.getOperand(1), heights) + ); + auto p = heights.insert(std::make_pair(&I, this_height)); + if (!p.second) p.first->getSecond() = this_height; //update value + return this_height; +} + +//updates the height of children recursively then uses updateHeightFromChildren +static unsigned getAndUpdateHeight(const Value &V, DenseMap &heights) { + if (!isa(V)) return 0; + const Instruction &I = cast(V); + if (!isBinop(I)) return 0; + auto d = heights.find(&I); + if (d != heights.end()) return d->second; //if height is available it is correct + return updateHeightFromChildren(cast(I), heights); +} + +//moves OP and all users that are between OP and Point to after Point in the same order +static void moveAfterWithAllUsers(BinaryOperator &OP, Instruction &Point) { + assert(OP.getParent() == Point.getParent()); + auto II = std::next(Point.getIterator().getReverse()); //start right before point + auto rend = OP.getIterator().getReverse(); //end just after OP + SmallPtrSet users; //for faster lookup + for (auto *U : OP.users()) { + if (auto *I = dyn_cast(U)) { + users.insert(I); + } + } + while (II != OP.getParent()->rend() && II != rend) { + auto NII = std::next(II); + for (auto *U : II->users()){ + if (auto *I = dyn_cast(U)) + users.insert(I); + } + if (users.contains(&*II)) { + II->moveAfter(&Point); + } + II = NII; + assert(II != OP.getParent()->rend()); + } + OP.moveAfter(&Point); +} + +//we can only rotate if B only depends directly on A without any other def-use path between them +static bool canRotate(const Instruction &A, const Instruction &B) { + SmallPtrSet users; + for (auto *U : A.users()) { + if (auto *I = dyn_cast(U)) users.insert(I); + } + auto II = A.getIterator(); + for (; II != A.getParent()->end() && &*II != &B; II++) { + if (users.contains(&*II)) { + for (auto *U : II->users()) { + if (auto *I = dyn_cast(U)) { + if (I == &B) return false; //additional def-use path + users.insert(I); + } + } + if (!isa(*II) && !isa(*II) && !isa(*II) && !isa(*II)) + return false; //if user (which will need to be moved is not a "simple" instrucion ==> then cannot do it) + } + } + return II != A.getParent()->end() && &*II == &B; //return true if II now points to B +} + +//single rotation counter-clockwise (trees are with root at bottom because thats how they are in LLVM IR) +static BinaryOperator *rotateCC(BinaryOperator &L, BinaryOperator &I, DenseMap 
&heights) { + assert(isAssociative(L) && isAssociative(I) && I.getOperand(0) == &L); + I.setOperand(0, L.getOperand(1)); + I.replaceAllUsesWith(&L); + L.setOperand(1, &I); + L.dropDroppableUses(); + moveAfterWithAllUsers(L, I); + updateHeightFromChildren(I, heights); + updateHeightFromChildren(L, heights); + return &L; +} + +//single rotation clock-wise +static BinaryOperator *rotateCW(BinaryOperator &R, BinaryOperator &I, DenseMap &heights) { + assert(isAssociative(R) && isAssociative(I) && I.getOperand(1) == &R); + I.setOperand(1, R.getOperand(0)); + I.replaceAllUsesWith(&R); + R.setOperand(0, &I); + R.dropDroppableUses(); //remove debug insts that would otherwise not be dominated by R anymore + moveAfterWithAllUsers(R, I); + updateHeightFromChildren(I, heights); + updateHeightFromChildren(R, heights); + assert(cast(*I.user_begin()) == &R && std::next(I.user_begin()) == I.user_end() && "the only user of I is R"); + return &R; +} + +//try to rotate or double rotate if applicable (see AVL trees) +static BinaryOperator *tryRotateL(Value &Left, Value &Root, DenseMap &heights) { + if (isBinop(Left) && isBinop(Root) && isAssociative(Left) && isAssociative(Root)) { + BinaryOperator &L = cast(Left); + BinaryOperator &I = cast(Root); + const unsigned opcode = I.getOpcode(); + if (L.getOpcode() != opcode || L.getParent() != I.getParent()) return nullptr; //cannot do anything + unsigned lh = getAndUpdateHeight(L, heights); + if (lh <= 1u) return nullptr; //nothing to do + auto &L_RChild = *L.getOperand(1); + if (isBinop(L_RChild) && isAssociative(L_RChild) + && getAndUpdateHeight(L_RChild, heights) + 1u == lh) { + auto &LRC = cast(L_RChild); + if (LRC.getOpcode() == opcode && LRC.getParent() == I.getParent() && canRotate(LRC, L) && canRotate(L, I)) { + auto &newL = *rotateCW(LRC, L, heights); + if (canRotate(newL, I)) return rotateCC(newL, I, heights); + else return nullptr; + } + } + if (canRotate(L, I)) return rotateCC(L, I, heights); + } + return nullptr; +} + +//try to rotate or double rotate if applicable (see AVL trees) +static BinaryOperator *tryRotateR(Value &Right, Value &Root, DenseMap &heights) { + if (isBinop(Right) && isBinop(Root) && isAssociative(Right) && isAssociative(Root)) { + BinaryOperator &R = cast(Right); + BinaryOperator &I = cast(Root); + const unsigned opcode = I.getOpcode(); + if (R.getOpcode() != opcode || R.getParent() != I.getParent()) return nullptr; //cannot do anything + unsigned rh = getAndUpdateHeight(R, heights); + if (rh <= 1u) return nullptr; //nothing to do + auto &R_LChild = *R.getOperand(0); + if (isBinop(R_LChild) && isAssociative(R_LChild) + && getAndUpdateHeight(R_LChild, heights) + 1u == rh) { + auto &RLC = cast(R_LChild); + if (RLC.getOpcode() == opcode && RLC.getParent() == I.getParent() && canRotate(RLC, R) && canRotate(R, I)) { + auto &newR = *rotateCC(RLC, R, heights); + if (canRotate(newR, I)) return rotateCW(newR, I, heights); + else return nullptr; + } + } + if (canRotate(R, I)) return rotateCW(R, I, heights); + } + return nullptr; +} + +//needed to check whether we are actually dealing with a tree +static bool subGraphsIntersect(const Value &X, const Value &Y) { + if (!isBinop(X) || !isBinop(Y)) return false; + const auto &A = cast(X); + const auto &B = cast(Y); + DenseSet seen; + std::deque q; + const BasicBlock *BB = A.getParent(); + q.push_back(&A); + while (!q.empty()) { + const auto *I = q.front(); q.pop_front(); + seen.insert(I); + if (auto *X = dyn_cast(I->getOperand(0))) { + if (X && X->getParent() == BB) q.push_back(X); + } + if (auto 
*X = dyn_cast(I->getOperand(1))) { + if (X && X->getParent() == BB) q.push_back(X); + } + } + assert(q.empty()); + q.push_back(&B); + while (!q.empty()) { + const auto *I = q.front(); q.pop_front(); + if (seen.contains(I)) return true; + if (auto *X = dyn_cast(I->getOperand(0))) { + if (X && X->getParent() == BB) q.push_back(X); + } + if (auto *X = dyn_cast(I->getOperand(1))) { + if (X && X->getParent() == BB) q.push_back(X); + } + } + return false; +} + +//print trees for debugging purposes +static void printDep(Value &I, unsigned lvl, DenseMap &heights, DenseSet &vis) { + if (vis.find(&I) != vis.end()) return; + vis.insert(&I); + for (unsigned i = 0; i < lvl; i++) errs()<<"| \t"; + unsigned h = 0; + if (isa(I)) { + auto p = heights.find(&cast(I)); + if (p != heights.end()) h = p->second; + } + errs()<<" h = "<(I); + for (unsigned i = 0; i < X.getNumOperands(); i++) { + auto *V = X.getOperand(i); + if (V) printDep(*V, lvl+1, heights, vis); + } + } +} + +//try to reassociate tree rooted in Inst (if it is a tree!) +//insts might be moved past Inst and Inst might not be the root anymore afterwards +static bool Reassociate(Value &Inst, DenseMap &heights) { + bool Modified = false; + if (isBinop(Inst) && isAssociative(Inst)) { + BinaryOperator *I = cast(&Inst); + unsigned h = updateHeightFromChildren(*I, heights); + if (h <= 2) return false; //nothing todo + if (subGraphsIntersect(*I->getOperand(0), *I->getOperand(1))) { + return false; //Inst is not root of a tree! cannot optimize! + } + bool better = true; + int lminusr = std::numeric_limits::max(); + DenseSet vis; + do { + if (vis.contains(I)) break; + vis.insert(I); + int new_lminusr = + (int)getAndUpdateHeight(*I->getOperand(0), heights) + - (int)getAndUpdateHeight(*I->getOperand(1), heights); + better = std::abs(lminusr) > std::abs(new_lminusr); + lminusr = new_lminusr; + BinaryOperator *NewRoot = nullptr; + if (lminusr >= 2) { + NewRoot = tryRotateL(*I->getOperand(0), *I, heights); //try to fix at this height + } else if (lminusr <= -2) { + NewRoot = tryRotateR(*I->getOperand(1), *I, heights); //try to fix at this height + } + if (NewRoot) { + I = NewRoot; + Modified = true; + better = true; + } else { + better = false; //defenitely do not repeat if we haven't changed anything anymore + } + } while (better); + + bool improved_left = Reassociate(*I->getOperand(0), heights); //fix left subtree + bool improved_right = Reassociate(*I->getOperand(1), heights); //fix right subtree + Modified = Modified || improved_left || improved_right; + + updateHeightFromChildren(*I, heights); + } + return Modified; +} + +//try to reassociate all insts in BB +static bool Reassociate(BasicBlock &BB) { + bool Modified = false; + + DenseMap heights; + + auto RI = BB.rbegin(); + while (RI != BB.rend()) { + if (heights.find(&*RI) == heights.end()) {//only reassociate if this was not part of any tree already + Modified |= Reassociate(*RI, heights); + } + RI++; //yes, this means we miss some instructions, but those are optimized already anyway + } + + // if (Modified) { + // errs()<<"Reassociate in BB: "< vis; + // for (auto RI = BB.rbegin(); RI != BB.rend(); RI++) { + // printDep(*RI, 0, heights, vis); + // } + // } + + return Modified; +} + +//reassociate and then bubble +bool SSRReassociate::runOnBB(BasicBlock &BB) { + bool Modified = false; + + if (AggressiveReassociate) { + Modified |= BubbleSSRIntrinsics(BB, std::numeric_limits::max()); //move pop/pushs out of the way + Modified |= Reassociate(BB); + if (BubbleStreams >= 0) Modified |= 
BubbleSSRIntrinsicsBack(BB); //move them back if needed
+  }
+
+  if (BubbleStreams > 0) {
+    Modified |= BubbleSSRIntrinsics(BB, (unsigned)BubbleStreams); //bubble to form windows
+  }
+
+  return Modified;
+}
+
+
+char SSRReassociate::ID = 0;
+
+INITIALIZE_PASS(SSRReassociate, DEBUG_TYPE, "SSR Reassociate Pass", false, false)
+
+FunctionPass *llvm::createSSRReassociatePass() { return new SSRReassociate(); }
diff --git a/llvm/lib/Target/RISCV/RISCVSSRStatistics.cpp b/llvm/lib/Target/RISCV/RISCVSSRStatistics.cpp
new file mode 100644
index 0000000000000..9c24e799cb09d
--- /dev/null
+++ b/llvm/lib/Target/RISCV/RISCVSSRStatistics.cpp
@@ -0,0 +1,169 @@
+//===- RISCVSSRStatistics.cpp - Count memory accesses and SSR intrinsics per loop ---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Count how many memory accesses there are and at what loop depth.
+//
+//===----------------------------------------------------------------------===//
+
+#include "RISCV.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/RuntimeLibcalls.h"
+#include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/IntrinsicsRISCV.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include
+#include
+#include
+
+using namespace llvm;
+
+namespace {
+
+class SSRStatistics : public FunctionPass {
+  const TargetLowering *TLI = nullptr;
+
+public:
+  static char ID; // Pass identification, replacement for typeid
+
+  SSRStatistics() : FunctionPass(ID) {
+    initializeSSRStatisticsPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnFunction(Function &F) override;
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<LoopInfoWrapperPass>();
+  }
+};
+
+} // end anonymous namespace
+
+bool SSRStatistics::runOnFunction(Function &F) {
+
+  DenseMap<const Loop *, unsigned> ld;
+  DenseMap<const Loop *, unsigned> st;
+  DenseMap<const Loop *, unsigned> push;
+  DenseMap<const Loop *, unsigned> pop;
+
+  const LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+  std::vector<const Loop *> s;
+  for (const auto *L : LI.getTopLevelLoops()) {
+    s.push_back(L);
+  }
+  for (const auto &BB : F) {
+    const Loop *L = LI.getLoopFor(&BB);
+    if (!L) continue;
+    if (ld.find(L) == ld.end()) ld.insert(std::make_pair(L, 0));
+    if (st.find(L) == st.end()) st.insert(std::make_pair(L, 0));
+    if (push.find(L) == push.end()) push.insert(std::make_pair(L, 0));
+    if (pop.find(L) == pop.end()) pop.insert(std::make_pair(L, 0));
+    for (const Instruction &I : BB) {
+      if (isa<LoadInst>(I)) {
+        auto x = ld.find(L);
+        assert(x != ld.end());
x->getSecond() += 1; + } else if (isa(I)) { + auto x = st.find(L); + assert(x != st.end()); + x->getSecond() += 1; + } else if (isa(I)) { + const auto &In = cast(I); + if (In.getIntrinsicID() == Intrinsic::riscv_ssr_pop) { + auto x = pop.find(L); + assert(x != pop.end()); + x->getSecond() += 1; + } else if (In.getIntrinsicID() == Intrinsic::riscv_ssr_push) { + auto x = push.find(L); + assert(x != push.end()); + x->getSecond() += 1; + } + } + } + } + + errs()<<"\""<getHeader()->getNameOrAsOperand()<<"\": {\n"; + errs()<<"\t\t\"depth\": "<getLoopDepth()<<",\n"; + errs()<<"\t\t\"loads\": "<getSecond()<<",\n"; + errs()<<"\t\t\"stores\": "<getSecond()<<",\n"; + errs()<<"\t\t\"pushs\": "<getSecond()<<",\n"; + errs()<<"\t\t\"pops\": "<getSecond()<<"\n"; + errs()<<"\t},\n"; + } + errs()<<"},\n"; + + + return false; +} + +// bool SSRStatistics::runOnFunction(Function &F) { + +// std::vector n_ld, n_st; +// constexpr int max_depth = 5; +// while (n_ld.size() <= max_depth) n_ld.push_back(0); +// while (n_st.size() <= max_depth) n_st.push_back(0); + +// const LoopInfo &LI = getAnalysis().getLoopInfo(); +// for (const auto &BB : F) { +// unsigned depth = LI.getLoopDepth(&BB); +// assert(n_ld.size() > depth); +// assert(n_st.size() > depth); +// for (const Instruction &I : BB) { +// if (isa(I)) { +// n_ld[depth] += 1; +// } else if (isa(I)) { +// n_st[depth] += 1; +// } +// } +// } + +// errs()< debug output done with errs(). +// +//===----------------------------------------------------------------------===// + +#include "RISCV.h" +#include "RISCVInstrInfo.h" +#include "RISCVTargetMachine.h" +#include "RISCVMachineFunctionInfo.h" + +#include "llvm/CodeGen/LivePhysRegs.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/LowLevelType.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/Support/CommandLine.h" + +using namespace llvm; + +#define DEBUG_TYPE "riscv-frep" + +namespace llvm { + cl::opt SnitchAutoFrep( + "snitch-auto-frep", + cl::init(false), + cl::desc("Find repeating fp insts in unrolled loops. 
If a reduction can be found (not good yet) insert frep with stagger.")); +} + +#define SNITCH_AUTO_FREP_NAME "Snitch Auto Frep" + +#define MAX_SEARCH_WINDOW 4 +#define MIN_REP 4 +#define MAX_STAGGER 4 +#define NUM_SSR 3 + +namespace { + +class SNITCHAutoFrep : public MachineFunctionPass { +public: + const RISCVInstrInfo *TII; + static char ID; + + SNITCHAutoFrep() : MachineFunctionPass(ID) { + initializeSNITCHAutoFrepPass(*PassRegistry::getPassRegistry()); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + + StringRef getPassName() const override { return SNITCH_AUTO_FREP_NAME; } + +private: + + const MachineFunction *MF; + RISCVMachineFunctionInfo *RVFI; + DenseSet FPOps; + + bool process(MachineBasicBlock &MBB); + bool isFPInstr(MachineInstr &I); + std::pair findRep( + MachineBasicBlock::instr_iterator window_beg, + MachineBasicBlock::instr_iterator window_end, + MachineBasicBlock::instr_iterator end); +}; + +static Register getSSRFtReg(unsigned streamer) { //taken from RISCVExpandSSRInsts.cpp + unsigned AssignedReg = RISCV::F0_D + streamer; + // Advance the iterator to the assigned register until the valid + // register is found + const TargetRegisterClass *RC = &RISCV::FPR64RegClass; + TargetRegisterClass::iterator I = RC->begin(); + for (; *I != AssignedReg; ++I) + assert(I != RC->end() && "AssignedReg should be a member of provided RC"); + return Register(*I); +} + +char SNITCHAutoFrep::ID = 0; + +static constexpr unsigned fpopcodes[] = {RISCV::FADD_D, RISCV::FMUL_D, RISCV::FMADD_D, RISCV::FSGNJ_D, RISCV::FDIV_D, RISCV::FSUB_D, RISCV::FMSUB_D, RISCV::FMIN_D, RISCV::FMAX_D, RISCV::FSQRT_D}; + +bool SNITCHAutoFrep::runOnMachineFunction(MachineFunction &MF) { + + if (SnitchAutoFrep) { + errs()<<"snitch auto frep on "<(MF.getSubtarget().getInstrInfo()); + this->MF = &MF; + this->RVFI = MF.getInfo(); + for (const unsigned &x : fpopcodes) this->FPOps.insert(x); + + errs()<<"autofrep: running on:"<FPOps.find(I.getOpcode()) != this->FPOps.end(); +} + +//test whether the window [window_beg, window_end) is repeating and how many times it is +std::pair SNITCHAutoFrep::findRep( + MachineBasicBlock::instr_iterator window_beg, + MachineBasicBlock::instr_iterator window_end, + MachineBasicBlock::instr_iterator end) +{ + MachineBasicBlock::instr_iterator wi = window_beg; + MachineBasicBlock::instr_iterator s_end = window_end; + MachineBasicBlock::instr_iterator s_res = window_end; + unsigned rep = 1u; + while (s_end != end && isFPInstr(*s_end) && areTheSame(*s_end, *wi)) { + s_end++; + wi++; + if (wi == window_end) { + wi = window_beg; + rep++; + s_res = s_end; //found rep + } + } + return std::make_pair(s_res, rep); +} + +//used to calculate best possible stagger amount +static unsigned getCycles(unsigned opcode) { + switch (opcode) + { + case RISCV::FADD_D: + return 2; + case RISCV::FMUL_D: + return 3; + case RISCV::FMADD_D: + return 4; + default: + return 1; + } +} + +//return reduction operation +//fmul.d not included because we currently always init the staggered regs with 0 (and mul would need 1) +//min/max might also work, anything associative should work +static Optional getCombineOpcode(unsigned opcode, unsigned src_idx) { + switch (opcode) + { + case RISCV::FADD_D: + if (src_idx == 1 || src_idx == 2) return (unsigned)RISCV::FADD_D; + return None; + case RISCV::FMADD_D: + if (src_idx == 0) return (unsigned)RISCV::FADD_D; + return None; + default: + return None; + } +} + +//combine usages to mask +static unsigned toMask (const std::vector> &deps) { + unsigned mask = 0u; + for 
(const auto &p : deps) mask |= p.second;
+  return mask;
+}
+
+
+//find internal and external dependencies
+static Optional<std::vector<std::pair<MCRegister, unsigned>>> findRepDependenceRegs(
+    MachineBasicBlock::instr_iterator window_begin,
+    MachineBasicBlock::instr_iterator window_end)
+{
+  DenseMap<MCRegister, unsigned> def; //defs that are live going out of the window
+  std::vector<std::pair<MCRegister, unsigned>> internal, external;
+  for (auto MII = window_begin; MII != window_end; MII++) {
+    for (unsigned i = MII->getNumOperands()-1; i < MII->getNumOperands(); i--) {
+      int idx = 3 - (int)i;
+      auto &MOP = MII->getOperand(i);
+      if (!MOP.isReg()) continue;
+      if (idx < 0) return None; //there is an instruction with more than 4 FPRs in the window ==> cannot stagger
+      MCRegister r = MOP.getReg().asMCReg();
+      if (MOP.isDef()) {
+        if (idx != 3) return None; //defining operand not at operand index 0 ==> cannot stagger
+        def.insert(std::make_pair(r, (unsigned)(1 << idx)));
+      } else { //use
+        auto p = def.find(r);
+        if (p != def.end()) internal.push_back(std::make_pair(r, (unsigned)(1 << idx) | p->second));
+        if (MOP.isKill()) def.erase(r);
+      }
+    }
+  }
+  for (auto MII = window_begin; MII != window_end; MII++) {
+    for (unsigned i = MII->getNumOperands()-1; i < MII->getNumOperands(); i--) {
+      int idx = 3 - (int)i;
+      auto &MOP = MII->getOperand(i);
+      if (!MOP.isReg()) continue;
+      assert(idx >= 0);
+      MCRegister r = MOP.getReg().asMCReg();
+      if (MOP.isDef()) {
+        def.erase(r); //redef'ed before use
+      } else {
+        auto p = def.find(r);
+        if (p != def.end()) external.push_back(std::make_pair(r, (unsigned)(1 << idx) | p->second));
+        if (MOP.isKill()) def.erase(r);
+      }
+    }
+  }
+  unsigned internal_mask = toMask(internal);
+  unsigned external_mask = toMask(external);
+  //internal needs to be a subset of external so that we can stagger (FIXME: right?)
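+  //Worked example of the operand-to-bit mapping used above: operand i maps to
+  //bit idx = 3 - i, so for `fmadd.d fd, fs1, fs2, fs3` the def fd contributes
+  //bit 3 and the uses fs1/fs2/fs3 bits 2/1/0; an accumulator read as fs3 and
+  //written back as fd therefore yields the dependence mask 0b1001.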
+  if ((internal_mask & external_mask) ^ internal_mask) return None;
+  return external;
+}
+
+//merge dependency-vector entries that name the same register by ORing their masks
+static void mergeRegisters(std::vector<std::pair<MCRegister, unsigned>> &deps) {
+  unsigned i = 0;
+  while (i < deps.size()) {
+    MCRegister r = deps[i].first;
+    unsigned found = 0u;
+    for (unsigned j = 0; j < i; j++) {
+      if (deps[j].first == r) {
+        deps[j] = std::make_pair(r, deps[j].second | deps[i].second);
+        found++;
+      }
+    }
+    if (found) {
+      assert(found == 1);
+      deps.erase(deps.begin() + i);
+      //no need to increment i
+    } else {
+      i++;
+    }
+  }
+}
+
+//true iff r is one of the FP registers reserved for the SSR data movers
+static bool isSSRReg(MCRegister r) {
+  for (unsigned i = 0; i < NUM_SSR; i++) {
+    if (getSSRFtReg(i) == r) return true;
+  }
+  return false;
+}
+
+//try to find a reduction operation; currently only single ops are allowed
+static Optional<std::vector<unsigned>> findCombineOps(
+    MCRegister DReg,
+    unsigned stagger_mask,
+    MachineBasicBlock::instr_iterator window_begin,
+    MachineBasicBlock::instr_iterator window_end)
+{
+  MachineInstr *Def = nullptr;
+  for (auto MII = std::next(window_end.getReverse()); MII != std::next(window_begin.getReverse()); MII++) {
+    if (MII->getOperand(0).isReg() && MII->getOperand(0).getReg() == DReg) {
+      Def = &*MII;
+    }
+  }
+  if (!Def) return None;
+
+  std::vector<unsigned> ops;
+  MCRegister r = DReg;
+  bool reached_def = false;
+  for (auto MII = window_begin; !reached_def && MII != window_end; MII++) {
+    for (unsigned i = MII->getNumOperands() - 1; !reached_def && i < MII->getNumOperands(); i--) {
+      int idx = 3 - (int)i;
+      if (idx < 0) continue;
+      auto &MOP = MII->getOperand(i);
+      if (MOP.isReg() && MOP.getReg().asMCReg() == r) {
+        if (!MII->getOperand(0).isReg()) return None;
+        r = MII->getOperand(0).getReg().asMCReg();
+        auto op = getCombineOpcode(MII->getOpcode(), (unsigned)idx);
+        if (!op.hasValue()) return None;
+        ops.push_back(op.getValue());
+        reached_def = (&*MII == Def);
+        if (!reached_def) return None; //FIXME: currently only one combine op allowed
+        break; //go to next instruction
+      }
+    }
+  }
+  return ops;
+}
+
+struct StaggerInfo {
+  unsigned count;
+  unsigned mask;
+  std::vector<MCRegister> regs;
+  std::vector<unsigned> combineOps;
+};
+
+//try to find a way to stagger
+static Optional<StaggerInfo> findStagger(
+    MachineBasicBlock::instr_iterator window_begin,
+    MachineBasicBlock::instr_iterator window_end,
+    const LivePhysRegs &liveness,
+    const llvm::MachineRegisterInfo &MRI)
+{
+  errs()<<"trying to find stagger\n";
+  auto depsopt = findRepDependenceRegs(window_begin, window_end);
+  if (!depsopt.hasValue()) return None;
+  errs()<<"found deps\n";
+  auto deps = depsopt.getValue();
+  mergeRegisters(deps);
+  for (const auto &p : deps) errs()<<"reg = "< regs;
+  regs.push_back(DReg);
+  while (max_stagger_count < MAX_STAGGER && liveness.available(MRI, DReg + max_stagger_count + 1)) {
+    max_stagger_count++;
+    regs.push_back(DReg + max_stagger_count);
+  }
+  if (!max_stagger_count) return None; //regs not free (FIXME: rename instead)
+  StaggerInfo info;
+  info.count = max_stagger_count;
+  info.mask = stagger_mask;
+  info.regs = std::move(regs);
+  info.combineOps = std::move(ops.getValue());
+  return info;
+}
+
+static MachineBasicBlock *findBB(MachineInstr &MI) {
+  for (auto &MOP : MI.operands()) {
+    if (MOP.isMBB()) return MOP.getMBB();
+  }
+  return nullptr;
+}
+
+//FIXME: no idea how to make a block a label for sure ==> just search for a branch and take its target
+//       there must be a better way to do this
+//       used for an always "dead" branch in the FPU fence
+static MachineBasicBlock *findBrAbleBB(MachineBasicBlock &MBB) {
+  if (!MBB.empty()) {
+    auto *BB = findBB(*std::prev(MBB.end()));
+    if (BB) return BB;
+  }
+  std::vector<MachineBasicBlock *> s;
+  SmallSet<MachineBasicBlock *, 8> vis;
+  s.push_back(&MBB);
+  while (!s.empty()) {
+    auto *B = s.back(); s.pop_back();
+    if (!B || vis.contains(B)) continue;
+    vis.insert(B);
+    if (!B->empty()) {
+      auto *x = findBB(*std::prev(B->end()));
+      if (x) return x;
+    }
+    for (auto *BB : B->predecessors()) s.push_back(BB);
+    for (auto *BB : B->successors()) s.push_back(BB);
+  }
+  return &MBB;
+}
+
+// work on a single BB: try to find repetitions, then try to find a way to stagger,
+// then generate code if it gives an improvement
+bool SNITCHAutoFrep::process(MachineBasicBlock &MBB) {
+  bool Modified = false;
+
+  recomputeLivenessFlags(MBB); //to be sure
+
+  for (auto II = MBB.begin().getInstrIterator(); II != MBB.end().getInstrIterator(); ) {
+    auto NII = std::next(II);
+    if (II->isDebugInstr()) { //get rid of some dbg instructions (sorry)
+      II->eraseFromParent();
+    }
+    II = NII;
+  }
+
+  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+  const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo();
+  LivePhysRegs liveness(TRI); //use RegScavenger?
+  liveness.addLiveIns(MBB);
+  for (unsigned r = 0; r < NUM_SSR; r++)
+    liveness.addReg(getSSRFtReg(r).asMCReg()); //add SSR regs for good measure (FIXME: conservative)
+
+  MachineBasicBlock::instr_iterator MII = MBB.begin().getInstrIterator();
+  while (MII != MBB.end().getInstrIterator()) {
+    if (!isFPInstr(*MII)) {
+      MII = std::next(MII);
+      continue;
+    }
+
+    std::vector<std::pair<MachineBasicBlock::instr_iterator, unsigned>> search_results;
+    for (auto II = MII; II != MBB.end().getInstrIterator() && search_results.size() < MAX_SEARCH_WINDOW; ++II) {
+      auto wend = std::next(II);
+      auto sr = findRep(MII, wend, MBB.end().getInstrIterator());
+      search_results.push_back(sr);
+    }
+
+    unsigned best = 0u;
+    for (unsigned i = 0u; i < search_results.size(); i++) {
+      best = search_results[best].second < search_results[i].second ? i : best;
+    }
+
+    bool found = false;
+    if (!search_results.empty() && search_results[best].second >= MIN_REP) { //we have found at least MIN_REP repetitions
+      errs()<<"found repeating fp instrs\n";
+      for (auto II = MII; II != search_results[best].first; ++II) II->dump();
+      const TargetRegisterClass *RC = &RISCV::GPRNoX0RegClass;
+      TargetRegisterClass::iterator I = RC->begin();
+      while (I != RC->end() && !liveness.available(MRI, MCPhysReg(*I))) I++;
+      if (I != RC->end()) { //did find a free GPR register
+        errs()<<"found a free GPR reg\n";
+
+        MCPhysReg freeGPR = *I;
+
+        const unsigned window_size = best + 1; //recover window size
+        const unsigned reps = search_results[best].second; //get reps
+
+        auto delete_begin = std::next(MII, window_size); //start of repeating region
+        auto delete_end = search_results[best].first; //end of repeating region (excl.)
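+
+        //The code below asks findStagger for free registers to rotate the
+        //accumulator through and then compares estimated cycle costs: cost[0]
+        //is the baseline reps * window_cycles of the plain repeated code, and
+        //the staggered frep variants add the final combine instructions
+        //(combine_cycles) while avoiding the per-iteration stall of the
+        //window's last instruction (rep_stall); code is only generated when
+        //the estimate improves.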
+ + auto info = findStagger(MII, delete_begin, liveness, MRI); + + if (info.hasValue()) { + errs()<<"found stagger \n"; + + unsigned window_cycles = 0u; + for (auto MI = MII; MI != delete_begin; MI++) window_cycles += getCycles(MI->getOpcode()); + unsigned rep_stall = getCycles(std::prev(delete_begin)->getOpcode()) - 1u; + unsigned combine_cycles = 0u; + for (unsigned &op : info.getValue().combineOps) combine_cycles += getCycles(op); + + std::vector cost; + cost.push_back(reps * window_cycles); //cycles needed with no frep + errs()<<"default = "< 0u) { + errs()<<"frep+stagger is better\n"; + //code generation: + //delete repetitions + MBB.dump(); + found = true; + Modified = true; //we will modify now + + for (auto di = delete_begin; di != delete_end;) { + auto din = std::next(di); + di->eraseFromParentAndMarkDBGValuesForRemoval(); //delete repeated parts + di = din; + } + for (unsigned s = 1; s <= best_stagger; s++) { + //fcvt.d.w stagger, zero (FIXME: only allows additive combine op for now) + BuildMI(MBB, MII, MII->getDebugLoc(), this->TII->get(RISCV::FCVT_D_W), info.getValue().regs[s]) + .addReg(RISCV::X0); + } + //load rep + BuildMI(MBB, MII, MII->getDebugLoc(), this->TII->get(RISCV::ADDI), freeGPR) + .addReg(RISCV::X0) + .addImm(reps-1); + //frep.i + BuildMI(MBB, MII, MII->getDebugLoc(), this->TII->get(RISCV::FREP_O)) + .addReg(freeGPR, RegState::Kill) //reps + .addImm(window_size) //nr of instructions + .addImm(best_stagger) //stagger count + .addImm(info.getValue().mask); //stagger mask + + //combine result + errs()<<"generate combine result\n"; + unsigned step = 1; + while (step < best_stagger + 1u) { + for (unsigned i = 0u; i + step < best_stagger + 1u; i += (step + 1)) { + //FIXME: currently only one combine op allowed, if more: need temp regs here ??? 
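+              //Example of this reduction tree for best_stagger = 3 (regs[0..3]):
+              //step 1 combines regs[0] <- regs[0] op regs[1] and regs[2] <- regs[2] op regs[3],
+              //step 2 combines regs[0] <- regs[0] op regs[2]; the final value
+              //ends up in regs[0] after ceil(log2(best_stagger + 1)) rounds.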
+ errs()<<"src = "<getDebugLoc(), this->TII->get(info.getValue().combineOps.front()), info.getValue().regs[i]) + .addReg(info.getValue().regs[i], RegState::Kill) + .addReg(info.getValue().regs[i + step], RegState::Kill) + .addImm(7); + } + step = step * 2; + } + + //FPU fence (as done in SNITCHFrepLoops.cpp) + BuildMI(MBB, delete_end, delete_end->getDebugLoc(), this->TII->get(RISCV::FMV_X_W), freeGPR) + .addReg(info.getValue().regs[1]); + auto *BB = findBrAbleBB(MBB); + BuildMI(MBB, delete_end, delete_end->getDebugLoc(), this->TII->get(RISCV::BLT)) + .addReg(freeGPR, RegState::Kill) + .addReg(freeGPR, RegState::Kill) + .addMBB(BB); + //advance liveness + for (auto II = MII; II != delete_end; II++) { + SmallVector, 4u> clobbers; + liveness.stepForward(*II, clobbers); + } + + MII = delete_end; //continue from here + } + } + } + } + + if (!found) { + SmallVector, 4u> clobbers; + liveness.stepForward(*MII, clobbers); + MII = std::next(MII); + } + } + + if (Modified) MBB.dump(); + + return Modified; +} + +} // end of anonymous namespace + +INITIALIZE_PASS(SNITCHAutoFrep, "riscv-snitch-auto-frep", + SNITCH_AUTO_FREP_NAME, false, false) +namespace llvm { + +FunctionPass *createSNITCHAutoFrepPass() { return new SNITCHAutoFrep(); } + +} // end of namespace llvm \ No newline at end of file diff --git a/llvm/lib/Transforms/CMakeLists.txt b/llvm/lib/Transforms/CMakeLists.txt index 2a0abebdf19b5..c4cbda13469b3 100644 --- a/llvm/lib/Transforms/CMakeLists.txt +++ b/llvm/lib/Transforms/CMakeLists.txt @@ -7,6 +7,7 @@ add_subdirectory(IPO) add_subdirectory(Vectorize) add_subdirectory(Hello) add_subdirectory(HelloNew) +add_subdirectory(SSR) add_subdirectory(ObjCARC) add_subdirectory(Coroutines) add_subdirectory(CFGuard) diff --git a/llvm/lib/Transforms/SSR/CMakeLists.txt b/llvm/lib/Transforms/SSR/CMakeLists.txt new file mode 100644 index 0000000000000..d6dc690ed6f95 --- /dev/null +++ b/llvm/lib/Transforms/SSR/CMakeLists.txt @@ -0,0 +1,11 @@ +add_llvm_component_library(LLVMSSR + SSRInference.cpp + SSRGeneration.cpp + + DEPENDS + intrinsics_gen + + LINK_COMPONENTS + Core + Support + ) diff --git a/llvm/lib/Transforms/SSR/SSRGeneration.cpp b/llvm/lib/Transforms/SSR/SSRGeneration.cpp new file mode 100644 index 0000000000000..2c5bb14f85d77 --- /dev/null +++ b/llvm/lib/Transforms/SSR/SSRGeneration.cpp @@ -0,0 +1,888 @@ +//===-- SSRGeneration.cpp - Generate SSR --------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/SSR/SSRGeneration.h" +#include "llvm/InitializePasses.h" +#include "llvm/Passes/PassBuilder.h" +#include "llvm/Target/TargetMachine.h" + +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/IRBuilder.h" + +#include "llvm/Transforms/Utils/BasicBlockUtils.h" + +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/ScalarEvolutionExpressions.h" +#include "llvm/Analysis/AffineAccessAnalysis.h" +#include "llvm/Analysis/MemorySSA.h" +#include "llvm/Analysis/MemorySSAUpdater.h" +#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" + +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/FormatVariadic.h" + +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/IntrinsicsRISCV.h" +#include "llvm/IR/InlineAsm.h" + +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/TinyPtrVector.h" +#include "llvm/ADT/ilist.h" + +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#define DEBUG_TYPE "ssr" + +#define NUM_SSR 3U +#define SSR_MAX_DIM 4U + +//both are inclusive! +#define SSR_SCRATCHPAD_BEGIN 0x100000 +#define SSR_SCRATCHPAD_END 0x120000 + +//current state of hw: only allow doubles +#define CHECK_TYPE(T, I) (T == Type::getDoubleTy(I->getParent()->getContext())) + +//for gain estimation +#define EST_LOOP_TC 25 +#define EST_MUL_COST 3 +#define EST_MEMOP_COST 2 + +using namespace llvm; + +namespace llvm { + +cl::opt InferSSR( + "infer-ssr", + cl::init(false), + cl::desc("Enable inference of SSR streams.") +); + +cl::opt SSRNoIntersectCheck( + "ssr-no-intersect-check", + cl::init(false), + cl::desc("Do not generate intersection checks (unsafe). 
Use `restrict` key-word instead if possible.") +); + +cl::opt SSRNoTCDMCheck( + "ssr-no-tcdm-check", + cl::init(false), + cl::desc("Assume all data of inferred streams is inside TCDM.") +); + +cl::opt SSRNoBoundCheck( + "ssr-no-bound-check", + cl::init(false), + cl::desc("Do not generate checks that make sure the inferred stream's access is executed at least once.") +); + +cl::opt SSRConflictFreeOnly( + "ssr-conflict-free-only", + cl::init(false), + cl::desc("Only infer streams if they have no conflicts with other memory accesses.") +); + +cl::opt SSRNoInline( + "ssr-no-inline", + cl::init(false), + cl::desc("prevent functions that contain SSR streams from being inlined.") +); + +cl::opt SSRBarrier( + "ssr-barrier", + cl::init(false), + cl::desc("Enable the insertion of a spinning loop that waits for the stream to be done before it is disabled.") +); + +cl::opt SSRVerbose( + "ssr-verbose", + cl::init(false), + cl::desc("Write information about inferred streams to stderr.") +); + +} //end of namespace llvm + + +static constexpr char SSRFnAttr[] = "SSR"; //used to tag functions that contain SSR streams + +static constexpr Intrinsic::ID riscSSRIntrinsics[] = { + Intrinsic::RISCVIntrinsics::riscv_ssr_barrier, + Intrinsic::RISCVIntrinsics::riscv_ssr_disable, + Intrinsic::RISCVIntrinsics::riscv_ssr_enable, + Intrinsic::RISCVIntrinsics::riscv_ssr_setup_repetition, + Intrinsic::RISCVIntrinsics::riscv_ssr_pop, + Intrinsic::RISCVIntrinsics::riscv_ssr_push, + Intrinsic::RISCVIntrinsics::riscv_ssr_read, + Intrinsic::RISCVIntrinsics::riscv_ssr_read_imm, + Intrinsic::RISCVIntrinsics::riscv_ssr_write, + Intrinsic::RISCVIntrinsics::riscv_ssr_write_imm, + Intrinsic::RISCVIntrinsics::riscv_ssr_setup_1d_r, + Intrinsic::RISCVIntrinsics::riscv_ssr_setup_1d_w, + Intrinsic::RISCVIntrinsics::riscv_ssr_setup_bound_stride_1d, + Intrinsic::RISCVIntrinsics::riscv_ssr_setup_bound_stride_2d, + Intrinsic::RISCVIntrinsics::riscv_ssr_setup_bound_stride_3d, + Intrinsic::RISCVIntrinsics::riscv_ssr_setup_bound_stride_4d, +}; + + +namespace { + +template +struct ConflictTree { + void insertNode(const NodeT *Node, unsigned value, const NodeT *Parent) { + assert((values.find(Node) == values.end() || children.find(Node) == children.end()) && "not yet inserted"); + values.insert(std::make_pair(Node, value)); + children.insert(std::make_pair(Node, std::move(std::vector()))); + if (!Parent) { //this is root + assert(!Root && "Parent = nullptr, but root already exists"); + Root = Node; + } else { + auto p = children.find(Parent); + assert(p != children.end() && "parent cannot be found"); + p->getSecond().push_back(Node); + } + } + + //picks nodes in the tree such that their combined value (conmbineFunc, needs to be associative & commutative) is the highest possible + //prioritizes parent over children + std::vector findBest(const std::function &combineFunc) { + std::vector res; + if (!Root) return res; + findBest(Root, combineFunc, res); + return res; + } + +private: + unsigned findBest(const NodeT *N, const std::function &combineFunc, std::vector &res) { + unsigned size = res.size(); + unsigned val = 0u; + auto &chs = children.find(N)->getSecond(); + if (!chs.empty()) { + for (const NodeT *C : chs) val = combineFunc(val, findBest(C, combineFunc, res)); + } + unsigned nval = values.find(N)->second; + if (val > nval) { + return val; + } else { + while (res.size() > size) res.pop_back(); + res.push_back(N); + return nval; + } + } + + DenseMap values; + DenseMap> children; + const NodeT *Root = nullptr; +}; + +// copy Phi-nodes from 
predecessor Basic Block (BB) +void copyPHIsFromPred(BasicBlock *BB){ + BasicBlock *Pred = nullptr; + for (BasicBlock *B : predecessors(BB)) { + if (!Pred) Pred = B; + assert(Pred == B && "BB has only one predecessor"); + } + assert(Pred && "BB has a Predecessor"); + for (Instruction &I : *Pred){ + if (auto *Phi = dyn_cast(&I)){ + PHINode *PhiC = PHINode::Create(Phi->getType(), 1u, Twine(Phi->getName()).concat(".copy"), BB->getFirstNonPHI()); + //Phi->replaceAllUsesWith(PhiC); + Phi->replaceUsesOutsideBlock(PhiC, Pred); //all users outside of Pred are now using PhiC + PhiC->addIncoming(Phi, Pred); + } + } +} + +///splits block, redirects all predecessor to first half of split, copies phi's +std::pair splitAt(Instruction *X, const Twine &name){ + assert(!isa(X) && "should not split at phi"); + BasicBlock *Two = X->getParent(); + BasicBlock *One = BasicBlock::Create(Two->getContext(), name, Two->getParent(), Two); + Instruction *BR = BranchInst::Create(Two, One); + //DTU.applyUpdates(cfg::Update(cfg::UpdateKind::Insert, One, Two)); + BasicBlock::iterator it = Two->begin(); + while (it != X->getIterator()) { + BasicBlock::iterator it_next = std::next(it); + it->removeFromParent(); + it->insertBefore(BR); + it = it_next; + } + //BasicBlock *One = splitBlockBefore(Two, X, &DTU, nullptr, nullptr, name); + std::vector toChange; + for (auto *BB : predecessors(Two)){ + if (BB == One) continue; + Instruction *T = BB->getTerminator(); + for (unsigned i = 0; i < T->getNumOperands(); i++){ + Value *OP = T->getOperand(i); + if (dyn_cast(OP) == Two){ + toChange.push_back(T); + } + } + } + for (Instruction *T : toChange) { + for (unsigned i = 0; i < T->getNumOperands(); i++){ + Value *OP = T->getOperand(i); + if (dyn_cast(OP) == Two){ + T->setOperand(i, One); //if an operand of the terminator of a predecessor of Two points to Two it should now point to One + /*cfg::Update upd[]{ + cfg::Update(cfg::UpdateKind::Insert, T->getParent(), One), + cfg::Update(cfg::UpdateKind::Delete, T->getParent(), Two), + }; + DTU.applyUpdates(upd);*/ + } + } + } + return std::make_pair(One, Two); +} + +///clones code from BeginWith up to EndBefore +///assumes all cf-paths from begin lead to end (or return) +///assumes there is a phi node for each value defined in the region that will be cloned in the block of EndBefore that is live after EndBefore +///returns the branch that splits region from coloned region and the pair of branches that jump to EndBefore at the end +std::pair> cloneRegion(Instruction *BeginWith, Instruction *EndBefore){ + LLVM_DEBUG(dbgs()<<"cloning from "<<*BeginWith<<" up to "<<*EndBefore<<"\n"); + + auto p = splitAt(BeginWith, "split.before"); + BasicBlock *Head = p.first; + BasicBlock *Begin = p.second; + + p = splitAt(EndBefore, "fuse.prep"); + BranchInst *BRFuse = cast(p.first->getTerminator()); + BasicBlock *End = p.second; + copyPHIsFromPred(End); //copy Phi's from Fuse to End + + std::deque q; //bfs queue + q.push_back(Begin); + DenseSet vis; //bfs visited set + DenseMap clones; //value in orig -> value in clone (INV: orig and clone are of same class) + std::vector> operandsCleanup; //store operands that reference instructions that are not cloned yet + + while (!q.empty()){ + BasicBlock *C = q.front(); q.pop_front(); + if (C == End || vis.find(C) != vis.end()) continue; + vis.insert(C); + BasicBlock *Cc = BasicBlock::Create(C->getContext(), Twine(C->getName()).concat(".clone"), C->getParent(), C); + clones.insert(std::make_pair(C, Cc)); //BasicBlock <: Value, needed for branches + IRBuilder<> 
builder(Cc); + for (Instruction &I : *C){ + Instruction *Ic = I.clone(); + assert(Ic->use_empty() && "no uses of clone"); + if (I.getType()->isVoidTy() || I.getType()->isLabelTy()) Ic = builder.Insert(Ic); //insert without name + else Ic = builder.Insert(Ic, Twine(I.getName()).concat(".clone")); + for (unsigned i = 0; i < Ic->getNumOperands(); i++){ + auto A = clones.find(Ic->getOperand(i)); + if (A != clones.end()){ + Ic->setOperand(i, A->second); //this also updates uses of A->second + //check users update in A->second + bool userUpdate = false; for (User *U : A->second->users()) {userUpdate = userUpdate || U == Ic; } assert(userUpdate && "user is updated on setOperand"); + //if (isa(A->first)) DTU.applyUpdates(cfg::Update(cfg::UpdateKind::Insert, Cc, cast(A->second))); + }else{ + operandsCleanup.push_back(std::make_pair(i, Ic)); + } + } + clones.insert(std::make_pair(&I, Ic)); //add Ic as clone of I + } + auto succs = successors(C); + for (auto S = succs.begin(); S != succs.end(); ++S) { + q.push_back(*S); + } + } + //operandCleanup + for (const auto &p : operandsCleanup){ //p.first = index of operand that needs to be changed to clone in p.second + auto A = clones.find(p.second->getOperand(p.first)); + if (A != clones.end()){ + p.second->setOperand(p.first, A->second); + //if (isa(A->first)) DTU.applyUpdates(cfg::Update(cfg::UpdateKind::Insert, p.second->getParent(), cast(A->second))); + }//else did not find ==> was defined before region + } + //incoming blocks of phi nodes are not operands ==> handle specially + for (const auto &p : clones){ //all clones of phi-nodes appear in here + if (auto *Phi = dyn_cast(p.second)){ + for (auto B = Phi->block_begin(); B != Phi->block_end(); ++B){ + const auto &c = clones.find(*B); + if (c != clones.end()){ + *B = cast(c->second); //overwrite with clone of block if it was cloned + } + } + } + } + //change terminator of Head to be CondBr with TakeOrig as cond + BranchInst *HeadBr = cast(Head->getTerminator()); //always BranchInst because of splitBlockBefore + BasicBlock *HeadSucc = HeadBr->getSuccessor(0); + BasicBlock *HeadSuccClone = cast(clones.find(HeadSucc)->second); + HeadBr->eraseFromParent(); + HeadBr = BranchInst::Create( + HeadSucc, //branch-cond = true -> go to non-clone (here SSR will be inserted) + HeadSuccClone, + ConstantInt::get(Type::getInt1Ty(HeadSucc->getContext()), 0u), + Head + ); + //DTU.applyUpdates(cfg::Update(cfg::UpdateKind::Insert, Head, HeadSuccClone)); + //handle phi nodes in End + for (Instruction &I : *End){ + if (auto *Phi = dyn_cast(&I)){ + for (auto *B : Phi->blocks()){ //yes Phi->blocks() will change during loop ==> does not matter + auto p = clones.find(B); + if (p != clones.end()){ + Value *Bval = Phi->getIncomingValueForBlock(B); + auto v = clones.find(Bval); + if (v != clones.end()){ + Phi->addIncoming(v->second, cast(p->second)); //add clone value & block as input + }else { + //v->first is constant or it is defined before cloned region begins + Phi->addIncoming(Bval, cast(p->second)); + } + } + } + } + } + LLVM_DEBUG(dbgs()<<"done cloning \n"); + + return std::make_pair(HeadBr, std::make_pair(BRFuse, cast(clones.find(BRFuse)->second))); +} + +BasicBlock *getSingleExitBlock(const Loop *L) { + BasicBlock *Ex = L->getExitBlock(); + if (Ex) return Ex; + SmallVector exits; + L->getExitBlocks(exits); + for (BasicBlock *BB : exits){ + if (!Ex) Ex = BB; + if (Ex != BB) return nullptr; + } + return Ex; +} + +void printInfo(ExpandedAffAcc &E) { + errs() + <<(E.Access->isWrite() ? 
"write" : "read ") + <<" stream of dimension " + <getAccesses()[0]->getDebugLoc(); + if (DL.get()) { + errs() + <<" orig. on line " + <getBaseAddr(E.getDimension()) + <<".\n"; +} + +//code for run-time checks for TCDM +Value *GenerateTCDMCheck(ExpandedAffAcc &E, Instruction *Point) { + IRBuilder<> builder(Point); + Value *c1 = builder.CreateICmpULE(ConstantInt::get(E.LowerBound->getType(), SSR_SCRATCHPAD_BEGIN), E.LowerBound, "beg.check"); + Value *c2 = builder.CreateICmpULE(E.UpperBound, ConstantInt::get(E.UpperBound->getType(), SSR_SCRATCHPAD_END), "end.check"); + return builder.CreateAnd(c1, c2, "tcdm.check"); +} + +//generate code for SSR setup +void GenerateSSRSetup(ExpandedAffAcc &E, unsigned dmid, Instruction *Point){ + assert(Point); + Module *mod = Point->getModule(); + IRBuilder<> builder(Point); + Type *i32 = Type::getInt32Ty(Point->getContext()); + unsigned dim = E.getDimension(); + LLVM_DEBUG(dbgs()<<"SSR Setup for stream with dim = "<isWrite(); + + Intrinsic::RISCVIntrinsics functions[] = { + Intrinsic::riscv_ssr_setup_bound_stride_1d, + Intrinsic::riscv_ssr_setup_bound_stride_2d, + Intrinsic::riscv_ssr_setup_bound_stride_3d, + Intrinsic::riscv_ssr_setup_bound_stride_4d + }; + + for (unsigned i = 0u; i < dim; i++) { + Value *Stride = E.Steps[i]; + if (i > 0) Stride = builder.CreateSub(Stride, E.PrefixSumRanges[i-1], formatv("stride.{0}d", i+1)); + Value *Bound = E.Reps[i]; + Function *SSRBoundStrideSetup = Intrinsic::getDeclaration(mod, functions[i]); + std::array bsargs = {DMid, Bound, Stride}; + builder.CreateCall(SSRBoundStrideSetup->getFunctionType(), SSRBoundStrideSetup, ArrayRef(bsargs)); + } + + unsigned n_reps = 0u; + if (isStore){ + Function *SSRPush = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_push); + for (Instruction *I : E.Access->getAccesses()){ + std::array pusharg = {DMid, cast(I)->getValueOperand()}; + builder.SetInsertPoint(I); + builder.CreateCall(SSRPush->getFunctionType(), SSRPush, ArrayRef(pusharg)); + I->eraseFromParent(); + n_reps++; + } + }else{ + Function *SSRPop = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_pop); + std::array poparg = {DMid}; + for (Instruction *I : E.Access->getAccesses()){ + builder.SetInsertPoint(I); + auto *V = builder.CreateCall(SSRPop->getFunctionType(), SSRPop, ArrayRef(poparg), "ssr.pop"); + BasicBlock::iterator ii(I); + ReplaceInstWithValue(I->getParent()->getInstList(), ii, V); + n_reps++; + } + } + + builder.SetInsertPoint(Point); + Constant *Rep = ConstantInt::get(i32, n_reps - 1U); + Function *SSRRepetitionSetup = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_setup_repetition); + std::array repargs = {DMid, Rep}; + builder.CreateCall(SSRRepetitionSetup->getFunctionType(), SSRRepetitionSetup, ArrayRef(repargs)); + + Function *SSRSetup; + if (!isStore){ + SSRSetup = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_read_imm); //can take _imm bc dm and dim are constant + }else{ + SSRSetup = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_write_imm); //can take _imm bc dm and dim are constant + } + std::array args = {DMid, Dim, E.Addr}; + //NOTE: this starts the prefetching ==> always needs to be inserted AFTER bound/stride and repetition setups !!! 
+ builder.CreateCall(SSRSetup->getFunctionType(), SSRSetup, ArrayRef(args)); + + return; +} + +/// generate a SSR Barrier intrinsic call before InsertBefore +void generateSSRBarrier(Instruction *InsertBefore, unsigned dmid) { + IRBuilder<> builder(InsertBefore); + Function *Barrier = Intrinsic::getDeclaration(InsertBefore->getModule(), Intrinsic::riscv_ssr_barrier); + builder.CreateCall(Barrier->getFunctionType(), Barrier, ConstantInt::get(Type::getInt32Ty(builder.getContext()), dmid)); +} + +/// generates SSR enable & disable calls +std::pair generateSSREnDis(Instruction *PhP, Instruction *ExP){ + IRBuilder<> builder(PhP); // ----------- in preheader + Module *mod = PhP->getParent()->getModule(); + Function *SSREnable = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_enable); + Instruction *en = builder.CreateCall(SSREnable->getFunctionType(), SSREnable, ArrayRef()); + + builder.SetInsertPoint(ExP); // ----------- in exit block + //generateFPDependency(builder); + Function *SSRDisable = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_disable); + Instruction *dis = builder.CreateCall(SSRDisable->getFunctionType(), SSRDisable, ArrayRef()); + + LLVM_DEBUG(dbgs()<<"generated ssr_enable and ssr_disable\n"); + + return std::make_pair(en, dis); +} + +//estimate how much it costs to compute the SSR setup data (bounds, strides, base address, etc...) +int getEstExpandCost(AffAcc *A, unsigned dim) { + int cost = 0; + cost += A->getBaseAddr(dim)->getExpressionSize(); + for (unsigned i = 1; i < dim; i++) { + cost += A->getStep(i)->getExpressionSize(); + cost += A->getRep(i)->getExpressionSize(); + cost += EST_MUL_COST; //for range + if (i > 1) cost += 1; //for addition + } + return cost; +} + +//estimate the benefit of turning some AffAccs into streams +int getEstGain(ArrayRef Accs, const Loop *L, AffineAccess &AAA) { + int gain = 0; + DenseSet accs; + for (auto *A : Accs) accs.insert(A); + + DenseSet contLoops; + DenseSet vis; + for (AffAcc *A : Accs) { + vis.insert(A); + unsigned dim = A->loopToDimension(L); + + //cost of expanding A + gain -= getEstExpandCost(A, dim); + + //cost of intersection checks + if (!SSRNoIntersectCheck) { + for (const auto &p : A->getConflicts(L)) { + switch (p.second) + { + case AffAccConflict::NoConflict: + break; //nothing to do + case AffAccConflict::MustNotIntersect: { + AffAcc *B = p.first; + if (vis.find(B) != vis.end()) break; //already handled this conflict when A was B + unsigned dimB = B->loopToDimension(L); + if (accs.find(B) == accs.end()) gain -= getEstExpandCost(B, dimB); + gain -= 4u; //2x ICmpULT, 1 OR, 1 AND + break; + } + case AffAccConflict::Bad: + assert(false && "WARNING: there is a bad conflict for given Accs and L ==> could not expand them here!"); + default: + llvm_unreachable("uknown conflict type"); + } + } + } + + //cost of tcdm checks + if (!SSRNoTCDMCheck) { + gain -= 4u; //2x ICmpULT, 2 AND + } + + int reps = 1; + for (unsigned d = dim; d >= 1u; d--) { //dimensions that are extended + int loopTC = EST_LOOP_TC; + if (A->getRep(d)->getSCEVType() == SCEVTypes::scConstant) + loopTC = cast(A->getRep(d))->getAPInt().getLimitedValue(std::numeric_limits::max()); + reps = std::max(reps * loopTC, reps); //prevent overflows + + //prep for boundcheck cost + contLoops.insert(A->getLoop(d)); + } + gain += EST_MEMOP_COST * reps; //the number of loads/stores that are removed by inserting a stream + + } + + if (!SSRNoBoundCheck) { + gain -= 2 * contLoops.size(); // 1 ICmp, 1 AND per loop + } + + return gain; +} + +///expands AffAcc's in L's preheader 
and inserts TCDM checks, returns ExpandedAffAcc's and writes the final Value* of the checks into Cond +std::vector expandInLoop(const std::vector &accs, const Loop *L, AffineAccess &AAA, Value *&Cond) { + assert(!accs.empty()); + assert(accs.size() <= NUM_SSR); + assert(L); + + LLVM_DEBUG(dbgs()<<"expanding in Loop: "<getHeader()->getNameOrAsOperand()<<" at depth "<getLoopDepth()<<"\n"); + + auto &ctxt = L->getHeader()->getContext(); + IntegerType *i32 = IntegerType::getInt32Ty(ctxt); + Type *i8Ptr = Type::getInt8PtrTy(ctxt); + + Instruction *PhT = L->getLoopPreheader()->getTerminator(); + + //generate Steps, Reps, base addresses, intersect checks, and bound checks + auto exp = AAA.expandAllAt(accs, L, PhT, Cond, i8Ptr, i32, !SSRNoIntersectCheck, !SSRNoBoundCheck); + assert(Cond); + + //TCDM Checks + if (!SSRNoTCDMCheck) { + IRBuilder<> builder(PhT); + for (auto &E : exp) { + Cond = builder.CreateAnd(Cond, GenerateTCDMCheck(E, PhT)); + } + } + + assert(Cond->getType() == Type::getInt1Ty(Cond->getContext()) && "Cond has type bool (i1)"); + + return exp; +} + +///clones from L's preheader to L's exit uses Cond for CBr between clone and non-clone +///then generates the instrinsics for all in exp +void cloneAndSetup(Instruction *PhT, Instruction *ExP, Value *Cond, std::vector &exp) { + assert(exp.size() <= NUM_SSR); + if (exp.size() == 0u) return; + + //generate en/dis range over both loop versions to prevent later runs of this pass to infer streams in the clone version + // ExP = generateSSREnDis(PhT, ExP).second; //TODO: this might be better here + + + if (!isa(Cond)){ //if Cond is not a constant we cannot make the decision at compile time ==> clone whole region for if-else + auto p = cloneRegion(PhT, ExP); + BranchInst *BR = p.first; + ExP = p.second.first; //terminator of exit block that jumps to original ExP + BR->setCondition(Cond); + } else { + //this should never happen, but it means the runtime checks were somehow known at compile time and turned out false: + if(cast(Cond)->getLimitedValue() == 0u) return; + } + + unsigned dmid = 0u; + for (auto &E : exp) { + GenerateSSRSetup(E, dmid++, PhT); + if (SSRBarrier) generateSSRBarrier(ExP, dmid); + } + + generateSSREnDis(PhT, ExP); +} + +//predicate to filter AffAccs +//in accordance with HW limitations, i.e., dimension <= 4, type = double, see #defines used +bool isValid(AffAcc *A, const Loop *L) { + assert(A->isWellFormed(L)); + bool valid = true; + bool write = A->isWrite(); + for (Instruction *I : A->getAccesses()) { + if (write) valid &= CHECK_TYPE(cast(I)->getValueOperand()->getType(), I); + else valid &= CHECK_TYPE(I->getType(), I); + } + valid &= A->loopToDimension(L) <= SSR_MAX_DIM; + return valid; +} + +//should be guaranteed by SimplifyLoops in SSRInferencePass, but the pass says that any guarantees should be rechecked when depended upon. 
+bool isValidLoop(const Loop *L) { + assert(L); + if (!L->getLoopPreheader() || !getSingleExitBlock(L)) return false; + return true; +} + +// collect some information about loop: +// possible streams +// insertion into conflict tree (for mapping to data movers) +bool visitLoop(const Loop *L, DenseMap> &possible, ConflictTree &tree, AffineAccess &AAA, bool isKnownInvalid) { + assert(L); + + //NOTE: cannot return early in this function, as `possible` and `tree` need to be expanded even if L is not suitable for streams + + std::vector accs = AAA.getExpandableAccesses(L, SSRConflictFreeOnly); + + if (isKnownInvalid || !isValidLoop(L)) { + accs.clear(); //make accs empty + isKnownInvalid = true; + } + + std::vector valid; + for (AffAcc *A : accs) { + if (isValid(A, L)) valid.push_back(A); + } + //sort by dimension (with read beeing preferred over write) + auto comp = [L](const AffAcc *A, const AffAcc *B) { + unsigned dimA = A->loopToDimension(L); + unsigned dimB = B->loopToDimension(L); + return dimA < dimB || (dimA == dimB && (!A->isWrite() && B->isWrite())); + }; + std::sort(valid.begin(), valid.end(), comp); + //add possible: + auto &l = possible.insert(std::make_pair(L, std::move(std::vector()))).first->getSecond(); + for (unsigned i = 0u; i < NUM_SSR && i < valid.size(); i++) { + l.push_back(valid[i]); + } + //add to tree: + int gain = getEstGain(l, L, AAA); + LLVM_DEBUG(dbgs()<<"est. gain is "<isOutermost() ? nullptr : L->getParentLoop()); + + if (SSRVerbose) { + for (auto *A : l) { + errs() + <<"potential stream with base addr SCEV " + <<*A->getBaseAddr(L) + <<" of dimension " + <loopToDimension(L) + <<"\n"; + } + if (!l.empty()) errs()<<"With est. gain = "< findLoopsWithSSR(Function &F, LoopInfo &LI) { + DenseSet invalid; + + DenseSet ids; + for (Intrinsic::ID x : riscSSRIntrinsics){ + ids.insert(x); //put intrinsics into set for faster lookup + } + + std::deque> worklist; + DenseSet visUnmarked; + DenseSet visMarked; + worklist.push_back(std::make_pair(&F.getEntryBlock(), false)); + while(!worklist.empty()) { + auto p = worklist.front(); worklist.pop_front(); + BasicBlock *BB = p.first; + bool marked = p.second; + + if (!BB) continue; + if (marked) { + if (visMarked.find(BB) != visMarked.end()) continue; + visMarked.insert(BB); + + //mark all loops containing this Block invalid + const Loop *L = LI.getLoopFor(BB); + while (L) { + invalid.insert(L); + L = L->getParentLoop(); + } + + //go through instructions in block, if there is an ssr_disable() call, remove the marking for the successors of this block + for (Instruction &i : *BB) { + if (isa(i)) { + if (cast(i).getIntrinsicID() == Intrinsic::riscv_ssr_disable) marked = false; + } + if (!marked) break; //early exit + } + + } else { + if (visUnmarked.find(BB) != visUnmarked.end()) continue; + visUnmarked.insert(BB); + + for (Instruction &i : *BB) { + Instruction *I = &i; + if (CallBase *C = dyn_cast(I)) { + if (C->hasFnAttr(SSRFnAttr)) { + LLVM_DEBUG(dbgs()<<"call "<<*C<<" has attribute "< no need to mark the BB + const Loop *L = LI.getLoopFor(BB); + while (L) { + invalid.insert(L); + L = L->getParentLoop(); + } + } + if (IntrinsicInst *II = dyn_cast(C)) { + if (ids.contains(II->getIntrinsicID())) { + LLVM_DEBUG(dbgs()<<"Intrinsic Instr "<<*II<<" calls an SSR intrinsic\n"); + marked = true; //mark this (and thus also all following BBs) + } + } + if (C->isInlineAsm()) { //inline asm may contain ssr setup insts! 
+ LLVM_DEBUG(dbgs()<<"inline asm call "<<*C<<" may contain ssr insts!\n"); + LLVM_DEBUG(C->getType()->dump()); + marked = true; + } + } + } + if (marked) worklist.push_back(std::make_pair(BB, true)); // if now marked, add to queue again + } + + for (BasicBlock *BB2 : successors(BB)) { + worklist.push_back(std::make_pair(BB2, marked)); + } + } + if (!invalid.empty()) LLVM_DEBUG(dbgs()<<"Loops that are invalid bc of SSR\n"); + for (auto l : invalid) { + LLVM_DEBUG(dbgs()<<"header = "<getHeader()->getNameOrAsOperand()<<" at depth = "<getLoopDepth()<<"\n"); + } + + return invalid; +} + +} //end of namespace + +// main "run" of this pass +PreservedAnalyses SSRGenerationPass::run(Function &F, FunctionAnalysisManager &FAM){ + LLVM_DEBUG(dbgs()<<"SSRInference Flags: "); + if (InferSSR) LLVM_DEBUG(dbgs()<<"infer-ssr"); + if (SSRNoIntersectCheck) LLVM_DEBUG(dbgs()<<", ssr-no-intersect-check"); + if (SSRNoBoundCheck) LLVM_DEBUG(dbgs()<<", ssr-no-bound-check"); + if (SSRNoTCDMCheck) LLVM_DEBUG(dbgs()<<", ssr-no-tcdm-check"); + if (SSRBarrier) LLVM_DEBUG(dbgs()<<", ssr-barrier"); + if (SSRNoInline) LLVM_DEBUG(dbgs()<<", ssr-no-inline"); + if (SSRConflictFreeOnly) LLVM_DEBUG(dbgs()<<", ssr-conflict-free-only"); + LLVM_DEBUG(dbgs()<<"\n"); + + if (!InferSSR) return PreservedAnalyses::all(); //if no SSR inference is enabled, we exit early + if (F.hasFnAttribute(SSRFnAttr)) return PreservedAnalyses::all(); //this function already contains streams ==> skip + + AffineAccess &AAA = FAM.getResult(F); //call analysis + + LLVM_DEBUG(dbgs()<<"SSR Generation Pass on function: "<> trees; //keep track of the conflict tree for each top-level loop + DenseMap> bestLoops; //keep track of the best results for each tree + DenseMap> possible; //keep track of the AffAcc's that can be expanded in each loop + DenseMap conds; //keep track of the condition of the run-time check for each loop + DenseMap> exps; //keep track of the expanded AffAcc's for each loop + DenseSet ssrInvalidLoops = findLoopsWithSSR(F, AAA.getLI()); + + for (const Loop *T : toploops){ + ConflictTree &tree = trees.insert(std::make_pair(T, ConflictTree())).first->getSecond(); + + //go through all loops in sub-tree of T to build conflict-tree and find possible expands + std::deque worklist; + worklist.push_back(T); + while (!worklist.empty()) { + const Loop *L = worklist.front(); worklist.pop_front(); + LLVM_DEBUG(dbgs()<<"visiting loop: "<getHeader()->getNameOrAsOperand()<<"\n"); + + visitLoop(L, possible, tree, AAA, ssrInvalidLoops.find(L) != ssrInvalidLoops.end()); + + for (const Loop *x : L->getSubLoops()) worklist.push_back(x); + } + + //find best expands (map best loops to data movers) + auto f = [](unsigned a, unsigned b){ return a + b; }; + std::vector best = tree.findBest(f); + + //expand them + for (const Loop *L : best) { + auto &acc = possible.find(L)->getSecond(); + if (!acc.empty()) { + changed = true; + Value *Cond = nullptr; + auto exp = expandInLoop(acc, L, AAA, Cond); + assert(Cond); + conds.insert(std::make_pair(L, Cond)); + exps.insert(std::make_pair(L, std::move(exp))); + } + } + + bestLoops.insert(std::make_pair(T, std::move(best))); + } + + ///NOTE: as soon as we start cloning (so after this comment), all the analyses are falsified and we do not want to update them + ///because that would falsify the AAA (which we do not want to update because it would find less solutions after the cloning). 
+ ///So all the code that follows does not make use of any of the analyses (except for L->getLoopPreheader & stuff like that which luckily still work) + + for (const Loop *T : toploops) { + std::vector &best = bestLoops.find(T)->getSecond(); + for (const Loop *L : best) { + auto p = conds.find(L); + if (p != conds.end()) { + BasicBlock *Ex = getSingleExitBlock(L); + assert(Ex); + if (SSRVerbose) { + errs() + <<"> Function " + <getHeader()->getParent()->getNameOrAsOperand() + <<": Expanding SSR streams with " + <<(L->getLoopDepth()-1) + <<" containing loops and setup in preheader of loop with header " + <getHeader()->getNameOrAsOperand() + <<"\n"; + } + cloneAndSetup(L->getLoopPreheader()->getTerminator(), &*Ex->getFirstInsertionPt(), p->second, exps.find(L)->getSecond()); + } + } + } + + if (!changed) return PreservedAnalyses::all(); + + F.addFnAttr(StringRef(SSRFnAttr)); //we have inserted a stream, tag accordingly + if (SSRNoInline) F.addFnAttr(Attribute::AttrKind::NoInline); + return PreservedAnalyses::none(); +} \ No newline at end of file diff --git a/llvm/lib/Transforms/SSR/SSRInference.cpp b/llvm/lib/Transforms/SSR/SSRInference.cpp new file mode 100644 index 0000000000000..00f417f2d9c83 --- /dev/null +++ b/llvm/lib/Transforms/SSR/SSRInference.cpp @@ -0,0 +1,77 @@ +//===-- SSRInference.cpp - Infer SSR usage --------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/SSR/SSRInference.h" +#include "llvm/InitializePasses.h" +#include "llvm/Passes/PassBuilder.h" +#include "llvm/Target/TargetMachine.h" + +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/IRBuilder.h" + +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Scalar/ADCE.h" + +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/ScalarEvolutionExpressions.h" +#include "llvm/Analysis/AffineAccessAnalysis.h" +#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" +#include "llvm/Transforms/Utils/FixIrreducible.h" +#include "llvm/Transforms/InstCombine/InstCombine.h" +#include "llvm/Transforms/Scalar/LoopStrengthReduce.h" +#include "llvm/Transforms/Scalar/LoopRotation.h" +#include "llvm/Transforms/Scalar/IndVarSimplify.h" +#include "llvm/Transforms/Scalar/SimplifyCFG.h" +#include "llvm/Transforms/Scalar/LoopFlatten.h" +#include "llvm/Transforms/Scalar/LoopUnrollAndJamPass.h" +#include "llvm/Transforms/Scalar/Reassociate.h" +#include "llvm/Transforms/Scalar/LICM.h" +#include "llvm/Transforms/SSR/SSRGeneration.h" + +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Support/Casting.h" + +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/IntrinsicsRISCV.h" + +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/TinyPtrVector.h" +#include "llvm/ADT/ilist.h" + +#include +#include + +#define DEBUG_TYPE "ssr" + +using namespace llvm; + +PreservedAnalyses SSRInferencePass::run(Function &F, FunctionAnalysisManager &FAM){ + LLVM_DEBUG(dbgs()<<"SSR Inference Pass on function: "<