From d8de7880e849f7be1f761c477e594bdb9c1946ec Mon Sep 17 00:00:00 2001 From: ThomasRupf Date: Sun, 27 Mar 2022 11:43:49 +0200 Subject: [PATCH 01/47] update gitignore --- .gitignore | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.gitignore b/.gitignore index b33fbbf932379..29a57901174ad 100644 --- a/.gitignore +++ b/.gitignore @@ -68,3 +68,8 @@ pythonenv* /clang/utils/analyzer/projects/*/RefScanBuildResults # automodapi puts generated documentation files here. /lldb/docs/python_api/ + + +# exclude installation +build-llvm/* +install/* From b9a747aa794511fd88c18d1ab43318289bf1d2d5 Mon Sep 17 00:00:00 2001 From: ThomasRupf Date: Sun, 27 Mar 2022 11:48:28 +0200 Subject: [PATCH 02/47] add todo's --- llvm/lib/Target/RISCV/RISCV.h | 2 ++ llvm/lib/Target/RISCV/RISCVTargetMachine.cpp | 2 ++ 2 files changed, 4 insertions(+) diff --git a/llvm/lib/Target/RISCV/RISCV.h b/llvm/lib/Target/RISCV/RISCV.h index 2cd960d7587d8..48d0c7f164058 100644 --- a/llvm/lib/Target/RISCV/RISCV.h +++ b/llvm/lib/Target/RISCV/RISCV.h @@ -54,6 +54,8 @@ void initializeRISCVCleanupVSETVLIPass(PassRegistry &); FunctionPass *createRISCVExpandSSRPass(); void initializeRISCVExpandSSRPass(PassRegistry &); +//TODO : reference function pass for auto SSR inference here (+ add to CMakeLists.txt) + FunctionPass *createRISCVExpandSDMAPass(); void initializeRISCVExpandSDMAPass(PassRegistry &); diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp index 523f0cb3dda8b..05b923643be89 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp @@ -221,6 +221,8 @@ void RISCVPassConfig::addPreRegAlloc() { if (TM->getOptLevel() != CodeGenOpt::None) { addPass(createRISCVMergeBaseOffsetOptPass()); addPass(createRISCVCleanupVSETVLIPass()); + //TODO add pass that automatically inserts SSR instructions here + //addPass(createRISCVExpandSSRPass()); addPass(createPULPHardwareLoops()); } } From 
726bae7cf9a89b13a573a22469405ab2c543e0f9 Mon Sep 17 00:00:00 2001 From: ThomasRupf Date: Fri, 1 Apr 2022 13:41:34 +0200 Subject: [PATCH 03/47] comments --- llvm/lib/Target/RISCV/RISCVTargetMachine.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp index 05b923643be89..833ce8b505528 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp @@ -170,6 +170,7 @@ TargetPassConfig *RISCVTargetMachine::createPassConfig(PassManagerBase &PM) { void RISCVPassConfig::addIRPasses() { addPass(createAtomicExpandPass()); + //TODO: add pass for auto SSR Inference here? TargetPassConfig::addIRPasses(); } @@ -221,8 +222,6 @@ void RISCVPassConfig::addPreRegAlloc() { if (TM->getOptLevel() != CodeGenOpt::None) { addPass(createRISCVMergeBaseOffsetOptPass()); addPass(createRISCVCleanupVSETVLIPass()); - //TODO add pass that automatically inserts SSR instructions here - //addPass(createRISCVExpandSSRPass()); addPass(createPULPHardwareLoops()); } } From 05d3d3a236df9b4770d757b03fa688e6180e48a6 Mon Sep 17 00:00:00 2001 From: ThomasRupf Date: Tue, 5 Apr 2022 13:05:02 +0200 Subject: [PATCH 04/47] User problem --- .../llvm/Transforms/SSR/SSRInference.h | 26 +++ llvm/lib/Passes/CMakeLists.txt | 1 + llvm/lib/Passes/PassBuilder.cpp | 1 + llvm/lib/Passes/PassRegistry.def | 1 + llvm/lib/Transforms/CMakeLists.txt | 1 + llvm/lib/Transforms/SSR/CMakeLists.txt | 10 + llvm/lib/Transforms/SSR/SSRInference.cpp | 215 ++++++++++++++++++ 7 files changed, 255 insertions(+) create mode 100644 llvm/include/llvm/Transforms/SSR/SSRInference.h create mode 100644 llvm/lib/Transforms/SSR/CMakeLists.txt create mode 100644 llvm/lib/Transforms/SSR/SSRInference.cpp diff --git a/llvm/include/llvm/Transforms/SSR/SSRInference.h b/llvm/include/llvm/Transforms/SSR/SSRInference.h new file mode 100644 index 0000000000000..b468363c62166 --- /dev/null +++ 
b/llvm/include/llvm/Transforms/SSR/SSRInference.h @@ -0,0 +1,26 @@ +//===-- SSRInference.h - Infer SSR usage ------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_SSR_SSRINFERENCE_H +#define LLVM_TRANSFORMS_SSR_SSRINFERENCE_H + +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/IR/PassManager.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Transforms/Scalar/LoopPassManager.h" + +namespace llvm { + +class SSRInferencePass : public PassInfoMixin { +public: + PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR, LPMUpdater &); +}; + +} // namespace llvm + +#endif // LLVM_TRANSFORMS_SSR_SSRINFERENCE_H diff --git a/llvm/lib/Passes/CMakeLists.txt b/llvm/lib/Passes/CMakeLists.txt index d834c0db4b458..fa0efb387353a 100644 --- a/llvm/lib/Passes/CMakeLists.txt +++ b/llvm/lib/Passes/CMakeLists.txt @@ -16,6 +16,7 @@ add_llvm_component_library(LLVMPasses Core Coroutines HelloNew + SSR IPO InstCombine ObjCARC diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp index 6c1a7c75d30a2..52e322f51b4f0 100644 --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -85,6 +85,7 @@ #include "llvm/Transforms/Coroutines/CoroElide.h" #include "llvm/Transforms/Coroutines/CoroSplit.h" #include "llvm/Transforms/HelloNew/HelloWorld.h" +#include "llvm/Transforms/SSR/SSRInference.h" #include "llvm/Transforms/IPO/AlwaysInliner.h" #include "llvm/Transforms/IPO/Annotation2Metadata.h" #include "llvm/Transforms/IPO/ArgumentPromotion.h" diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def index 877cb9ed13b37..24cdf2b8b6109 100644 --- a/llvm/lib/Passes/PassRegistry.def +++ 
b/llvm/lib/Passes/PassRegistry.def @@ -387,6 +387,7 @@ LOOP_PASS("canon-freeze", CanonicalizeFreezeInLoopsPass()) LOOP_PASS("dot-ddg", DDGDotPrinterPass()) LOOP_PASS("invalidate", InvalidateAllAnalysesPass()) LOOP_PASS("licm", LICMPass()) +LOOP_PASS("infer-ssr", SSRInferencePass()) LOOP_PASS("loop-idiom", LoopIdiomRecognizePass()) LOOP_PASS("loop-instsimplify", LoopInstSimplifyPass()) LOOP_PASS("loop-interchange", LoopInterchangePass()) diff --git a/llvm/lib/Transforms/CMakeLists.txt b/llvm/lib/Transforms/CMakeLists.txt index 2a0abebdf19b5..c4cbda13469b3 100644 --- a/llvm/lib/Transforms/CMakeLists.txt +++ b/llvm/lib/Transforms/CMakeLists.txt @@ -7,6 +7,7 @@ add_subdirectory(IPO) add_subdirectory(Vectorize) add_subdirectory(Hello) add_subdirectory(HelloNew) +add_subdirectory(SSR) add_subdirectory(ObjCARC) add_subdirectory(Coroutines) add_subdirectory(CFGuard) diff --git a/llvm/lib/Transforms/SSR/CMakeLists.txt b/llvm/lib/Transforms/SSR/CMakeLists.txt new file mode 100644 index 0000000000000..1f2d21bcd55a5 --- /dev/null +++ b/llvm/lib/Transforms/SSR/CMakeLists.txt @@ -0,0 +1,10 @@ +add_llvm_component_library(LLVMSSR + SSRInference.cpp + + DEPENDS + intrinsics_gen + + LINK_COMPONENTS + Core + Support + ) diff --git a/llvm/lib/Transforms/SSR/SSRInference.cpp b/llvm/lib/Transforms/SSR/SSRInference.cpp new file mode 100644 index 0000000000000..14942a05e3426 --- /dev/null +++ b/llvm/lib/Transforms/SSR/SSRInference.cpp @@ -0,0 +1,215 @@ +//===-- SSRInference.cpp - Infer SSR usage --------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/SSR/SSRInference.h" +#include "llvm/InitializePasses.h" + +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/InstrTypes.h" + +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/ScalarEvolutionExpressions.h" + +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Support/Casting.h" + +#include + +using namespace llvm; + +namespace{ + +Value *SCEVtoValues(const SCEV *scev, BasicBlock *block){ + assert(scev && block && "arguments should not be null"); + switch (scev->getSCEVType()) + { + case SCEVTypes::scConstant: + { + return cast(scev)->getValue(); + } + case SCEVTypes::scUnknown: + { + return cast(scev)->getValue(); + } + case SCEVTypes::scTruncate: + case SCEVTypes::scZeroExtend: + case SCEVTypes::scSignExtend: + { + const SCEVCastExpr *castSCEV = cast(scev); + Value *v = SCEVtoValues(castSCEV->getOperand(0), block); + if (v){ + Instruction *i; + switch (scev->getSCEVType()) + { + case SCEVTypes::scTruncate: + i = CastInst::CreateTruncOrBitCast(v, castSCEV->getType(), "scev.trunc", block); + break; + case SCEVTypes::scZeroExtend: + i = CastInst::CreateZExtOrBitCast(v, castSCEV->getType(), "scev.zext", block); + break; + case SCEVTypes::scSignExtend: + i = CastInst::CreateSExtOrBitCast(v, castSCEV->getType(), "scev.sext", block); + break; + default: + assert(false && "should not happen!"); + break; + } + return i; + } + return nullptr; + } + case SCEVTypes::scAddExpr: + case SCEVTypes::scMulExpr: + { + const SCEVCommutativeExpr *binopSCEV = cast(scev); + Value *v1 = SCEVtoValues(binopSCEV->getOperand(0), block); + Value *v2 = SCEVtoValues(binopSCEV->getOperand(1), block); + if (v1 && v2){ + Instruction *binop; + if 
(binopSCEV->getSCEVType() == SCEVTypes::scAddExpr) { + binop = BinaryOperator::CreateAdd(v1, v2, "rcev.add", block); + } else { + binop = BinaryOperator::CreateMul(v1, v2, "rcev.mul", block); + } + return binop; + } + return nullptr; + } + default: + { + errs()<<"encountered some weird SCEVType:\n"; + scev->dump(); + return nullptr; + } + } +} + +bool runOnLoop( + Loop *L, AAResults *AA, LoopInfo *LI, DominatorTree *DT, + BlockFrequencyInfo *BFI, TargetLibraryInfo *TLI, TargetTransformInfo *TTI, + ScalarEvolution *SE, MemorySSA *MSSA) { + L->dump(); + + if (!L->isInnermost() || !L->isLCSSAForm(*DT)){ + errs()<<"loop not innermost or not LCSSA\n"; + return false; + } + + BasicBlock *preheader = L->getLoopPreheader(); + if (!preheader){ + //TODO: can alleviate this by adding one if SSR setup needed + errs()<<"loop has no preheader\n"; + return false; + } + + BasicBlock *exit = L->getExitBlock(); + if (!exit){ + errs()<<"loop has no or multiple exits\n"; + return false; + } + + BasicBlock *exiting = L->getExitingBlock(); + if (!exiting){ + errs()<<"no, or multiple exiting blocks \n"; + return false; + } + + if (!L->hasDedicatedExits()){ + //TODO: relatively easily fixable ==> add block before single exit block + errs()<<"exit block is not dedicated \n"; + return false; + } + + if (L->getNumBackEdges() != 1){ + errs()<<"# of back-edges is not 1 \n"; + return false; + } + + if (!SE->hasLoopInvariantBackedgeTakenCount(L)){ + errs()<<"back-edge taken count is not loop-inv\n"; + return false; + } + + const SCEV *bt = SE->getBackedgeTakenCount(L); + errs()<<"backedge taken SCEV is:\n"; + bt->dump(); errs()<<"\n"; + + Value *v = SCEVtoValues(bt, preheader); + if (!v){ + errs()<<"SCEV to Value/Instructions conversion failed\n"; + return false; + } + v->dump(); + + bool Changed = false; + + if (Changed){ + SE->forgetLoop(L); //TODO: maybe use SE->forgetValue instead + } + return Changed; +} + +/* + InductionDescriptor IndDesc; + if(!L->getInductionDescriptor(*SE, IndDesc)){ + 
errs()<<"no loop induction variable found\n"; + return false; + } + if(IndDesc.getKind() != InductionDescriptor::IK_IntInduction){ + //TODO: could allow with addresses too + errs()<<"induction not on integer\n"; + return false; + } + Value *tc = findTripCount(L, &IndDesc); + if (!tc){ + errs()<<"trip count not found\n"; + return false; + } +*/ + +/* +Value *findTripCount(Loop *L, InductionDescriptor *IndDesc){ + ConstantInt *step = IndDesc->getConstIntStepValue(); + if (!step){ + errs()<<"step is not const\n"; + return nullptr; + } + BasicBlock *exiting = L->getExitingBlock(); + if (!exiting){ + errs()<<"no, or multiple latches \n"; + return nullptr; + } + BasicBlock *header = L->getHeader(); + errs()<<"InductionBinOP = "<getInductionBinOp()->getNameOrAsOperand()<<"\n"; + for (User *U : IndDesc->getInductionBinOp()->users()){ //for all users of induction variable + if(ICmpInst *cmp = dyn_cast(U)){ + if(cmp->getParent() != exiting) continue; //looking for integer-comparisons in exiting block + for (User *Ucmp : cmp->users()){ + if(BranchInst *br = dyn_cast(Ucmp)){ + //if (br->isConditional() && ((br->get) || ())) + } + } + } + } + return nullptr; +}*/ + +} //end of namespace + +PreservedAnalyses SSRInferencePass::run(Loop &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR, LPMUpdater &){ + errs()<<"# =============== SSR Inference =============== #\n"; + if(!runOnLoop(&L, &AR.AA, &AR.LI, &AR.DT, AR.BFI, &AR.TLI, &AR.TTI, &AR.SE, AR.MSSA)){ + return PreservedAnalyses::all(); + } + return PreservedAnalyses::none(); +} + From d817676dbe23b2f22d3c9032e25344535eda51a1 Mon Sep 17 00:00:00 2001 From: ThomasRupf Date: Fri, 8 Apr 2022 07:49:30 +0200 Subject: [PATCH 05/47] work on ba --- llvm/lib/Transforms/SSR/SSRInference.cpp | 163 ++++++++++++++++++++++- 1 file changed, 156 insertions(+), 7 deletions(-) diff --git a/llvm/lib/Transforms/SSR/SSRInference.cpp b/llvm/lib/Transforms/SSR/SSRInference.cpp index 14942a05e3426..0839f3f798146 100644 --- 
a/llvm/lib/Transforms/SSR/SSRInference.cpp +++ b/llvm/lib/Transforms/SSR/SSRInference.cpp @@ -13,6 +13,9 @@ #include "llvm/IR/Instructions.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/InstrTypes.h" +#include "llvm/IR/IRBuilder.h" + +#include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" @@ -22,13 +25,40 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/Support/Casting.h" -#include +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/IntrinsicsRISCV.h" + +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/TinyPtrVector.h" +#include "llvm/ADT/ilist.h" + +#include using namespace llvm; namespace{ +struct SSRStream{ + BasicBlock *preheader; + BasicBlock *exit; + + BasicBlock *setupInsts; + SmallVector moveInsts; //likely to be just one or maybe two load/store insts + + bool isStore; + + unsigned dim; + Value *data; + Value *bound; + Value *stride; + + unsigned dm; //"color" + SmallVector conflicts; //"edges" to conflicting SSRStreams +}; + Value *SCEVtoValues(const SCEV *scev, BasicBlock *block){ + //FIXME: ugly because instructions are already inserted! => use IRBuilder or sth. 
assert(scev && block && "arguments should not be null"); switch (scev->getSCEVType()) { @@ -93,6 +123,78 @@ Value *SCEVtoValues(const SCEV *scev, BasicBlock *block){ } } +ConstantInt *SCEVtoConstStep(const SCEV *scev, const SCEV *init, Loop *L){ + //FIXME: lots to do better here + errs()<<"trying to find stepsize\n"; + scev->dump(); + if(const SCEVAddRecExpr *rec = dyn_cast(scev)){ + errs()<<"add-rec-expr at root\n"; + if (rec->getLoop() == L && rec->getOperand(0) == init){ + errs()<<"loop and init match\n"; + if (const SCEVConstant *c = dyn_cast(rec->getOperand(1))){ + return dyn_cast(c->getValue()); + } + } + } + return nullptr; +} + +void GenerateSSRInstructions(const SSRStream &stream){ + Module *mod = stream.preheader->getModule(); //module for function declarations, TODO: is this the correct one? + IntegerType *i32 = IntegerType::getInt32Ty(stream.preheader->getContext()); + + Instruction *point = stream.preheader->getTerminator(); + + //add all setup instructions to preheader + for (Instruction &I : *stream.setupInsts){ + I.removeFromParent(); + I.insertBefore(point); + } + stream.setupInsts->eraseFromParent(); //not needed anymore ==> delete + + IRBuilder<> builder(point); + + ConstantInt *dm = ConstantInt::get(i32, stream.dm); //datamover id, ty=i32 + ConstantInt *dim = ConstantInt::get(i32, stream.dim - 1); //dimension - 1, ty=i32 + // data pointer, ty=i8* + Function *SSRReadSetup = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_read_imm); //can take _imm bc dm and dim are constant + std::array args = {dm, dim, stream.data}; + builder.CreateCall(SSRReadSetup->getFunctionType(), SSRReadSetup, ArrayRef(args), "ssr.read.setup"); + + ConstantInt *rep; //repetition - 1, ty=i32 + rep = ConstantInt::get(i32, stream.moveInsts.size()); + Function *SSRRepetitionSetup = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_setup_repetition); + std::array repargs = {dm, rep}; + builder.CreateCall(SSRRepetitionSetup->getFunctionType(), SSRRepetitionSetup, 
ArrayRef(repargs), "ssr.rep.setup"); + + //bound - 1, ty=i32, relative stride, ty=i32 + Function *SSRBoundStrideSetup1D = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_setup_bound_stride_1d); + std::array bsargs = {dm, stream.bound, stream.stride}; + builder.CreateCall(SSRBoundStrideSetup1D->getFunctionType(), SSRBoundStrideSetup1D, ArrayRef(bsargs), "ssr.bound.stride.setup"); + + std::array emptyargs = {}; + Function *SSREnable = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_enable); + builder.CreateCall(SSREnable->getFunctionType(), SSREnable, ArrayRef(emptyargs), "ssr.enable"); + Function *SSRDisable = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_disable); + builder.SetInsertPoint(&stream.exit->front()); + builder.CreateCall(SSRDisable->getFunctionType(), SSRDisable, ArrayRef(emptyargs), "ssr.disable"); + + if (stream.isStore){ + errs()<<"store not done yet \n"; + }else{ + Function *SSRPop = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_pop); + std::array poparg = {dm}; + for (Instruction *I : stream.moveInsts){ + builder.SetInsertPoint(I); + Value *v = builder.CreateCall(SSRPop->getFunctionType(), SSRPop, ArrayRef(poparg), "ssr.pop"); + BasicBlock::iterator ii(I); + ReplaceInstWithValue(I->getParent()->getInstList(), ii, v); + } + } + + return; +} + bool runOnLoop( Loop *L, AAResults *AA, LoopInfo *LI, DominatorTree *DT, BlockFrequencyInfo *BFI, TargetLibraryInfo *TLI, TargetTransformInfo *TTI, @@ -143,14 +245,61 @@ bool runOnLoop( errs()<<"backedge taken SCEV is:\n"; bt->dump(); errs()<<"\n"; - Value *v = SCEVtoValues(bt, preheader); - if (!v){ - errs()<<"SCEV to Value/Instructions conversion failed\n"; - return false; - } - v->dump(); + BasicBlock *insts = BasicBlock::Create(preheader->getContext(), "insts", preheader->getParent()); + + Value *repcount = SCEVtoValues(bt, insts); + assert(repcount && "scev 'bt' should be convertible to values/instructions!"); bool Changed = false; + + errs()<<"instructions with SSR replacement 
potential\n"; + unsigned dmid = 0; + for (BasicBlock *BB : L->blocks()){ + //FIXME: check whether block is on all paths from header to itself + for (auto &I : *BB){ + if (dmid == 3) break;//TODO: make better + if (LoadInst *load = dyn_cast(&I)){ + if (!load->getType()->isFloatingPointTy()) continue; + Value *addr = load->getOperand(0); + const SCEV *addrScev = SE->getSCEV(addr); + if (SE->hasComputableLoopEvolution(addrScev, L)){ + errs()<<"load instr, addr instr, and scev of address:\n"; + load->dump(); addr->dump(); addrScev->dump(); + + auto split = SE->SplitIntoInitAndPostInc(L, addrScev); + const SCEV *init = split.first; + //const SCEV *step = split.second; + + Value *baseAddr = SCEVtoValues(init, insts); + assert(baseAddr && "some weird SCEV in init SCEV"); + errs()<<"init and it's value:\n"; + init->dump(); baseAddr->dump(); + + ConstantInt *stepsize = SCEVtoConstStep(addrScev, init, L); + if (!stepsize){ + errs()<<"failed to compute stepsize\n"; + return false; + } + errs()<<"step value:\n"; + stepsize->dump(); + + SSRStream s; + s.dm = 0; + s.dim = 1; + s.preheader = preheader; + s.exit = exit; + s.setupInsts = insts; + s.moveInsts.push_back(load); + s.stride = stepsize; + s.data = CastInst::CreatePointerCast(addr, Type::getInt8PtrTy(preheader->getContext()), "data.cast", insts); + Value *bd = BinaryOperator::CreateMul(repcount, stepsize, "ssr.bound", insts); + s.bound = CastInst::CreateIntegerCast(bd, IntegerType::getInt32Ty(preheader->getContext()), false, "bound.cast", insts); + } + }else if(StoreInst *store = dyn_cast(&I)){ + store->dump(); + } + } + } if (Changed){ SE->forgetLoop(L); //TODO: maybe use SE->forgetValue instead From a96349e977c973ea3956089b815c112e2fd619e4 Mon Sep 17 00:00:00 2001 From: ThomasRupf Date: Sat, 9 Apr 2022 13:35:08 +0200 Subject: [PATCH 06/47] deallocation problems --- llvm/lib/Transforms/SSR/SSRInference.cpp | 228 ++++++++++++++--------- 1 file changed, 138 insertions(+), 90 deletions(-) diff --git 
a/llvm/lib/Transforms/SSR/SSRInference.cpp b/llvm/lib/Transforms/SSR/SSRInference.cpp index 0839f3f798146..b497d9f526f11 100644 --- a/llvm/lib/Transforms/SSR/SSRInference.cpp +++ b/llvm/lib/Transforms/SSR/SSRInference.cpp @@ -35,31 +35,114 @@ #include +#define SSR_NUM_DMS 3 + using namespace llvm; namespace{ -struct SSRStream{ - BasicBlock *preheader; - BasicBlock *exit; - - BasicBlock *setupInsts; +class SSRStream{ +public: + SSRStream(Loop *L, BasicBlock *setup, ArrayRef insts, + unsigned dim, Value *data, Value *bound, Value *stride, bool isStore) + : L(L), setup(setup), moveInsts(), isStore(isStore), _isgen(false), + dim(dim), data(data), bound(bound), stride(stride), dm(-1), conflicts() + { + moveInsts.append::iterator>(insts.begin(), insts.end()); + assert(L && setup && "input not null"); + assert(dim > 0 && "correct dimension"); + assert(data->getType() == Type::getInt8PtrTy(L->getHeader()->getContext())); + assert(bound->getType() == Type::getInt32Ty(L->getHeader()->getContext())); + assert(stride->getType() == Type::getInt32Ty(L->getHeader()->getContext())); + } + + int getDM() { return dm; } + void setDM(int dmId) { dm = dmId; } + + void GenerateSSRInstructions(){ + assert(!_isgen && "this stream has not generated its instructions yet"); + this->_isgen = true; + assert(this->dm >= 0 && this->dm < SSR_NUM_DMS && "stream has valid dm id"); + + Module *mod = L->getHeader()->getModule(); //module for function declarations, TODO: is this the correct one? + IntegerType *i32 = IntegerType::getInt32Ty(L->getHeader()->getContext()); + + //TODO: add branch from setup to header? 
+ + Instruction *point = setup->getTerminator(); + + IRBuilder<> builder(point); + + ConstantInt *dm = ConstantInt::get(i32, this->dm); //datamover id, ty=i32 + ConstantInt *dim = ConstantInt::get(i32, this->dim - 1); //dimension - 1, ty=i32 + // data pointer, ty=i8* + Function *SSRReadSetup = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_read_imm); //can take _imm bc dm and dim are constant + std::array args = {dm, dim, data}; + builder.CreateCall(SSRReadSetup->getFunctionType(), SSRReadSetup, ArrayRef(args)); + + errs()<<"generated ssr_read_imm \n"; + + ConstantInt *rep; //repetition - 1, ty=i32 + rep = ConstantInt::get(i32, moveInsts.size()); + Function *SSRRepetitionSetup = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_setup_repetition); + std::array repargs = {dm, rep}; + builder.CreateCall(SSRRepetitionSetup->getFunctionType(), SSRRepetitionSetup, ArrayRef(repargs)); + + errs()<<"generated ssr_setup_repetitions \n"; + + //bound - 1, ty=i32, relative stride, ty=i32 + Function *SSRBoundStrideSetup1D = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_setup_bound_stride_1d); + std::array bsargs = {dm, bound, stride}; + builder.CreateCall(SSRBoundStrideSetup1D->getFunctionType(), SSRBoundStrideSetup1D, ArrayRef(bsargs)); + + errs()<<"generated ssr_setup_bound_stride_1d \n"; + + std::array emptyargs = {}; + Function *SSREnable = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_enable); + builder.CreateCall(SSREnable->getFunctionType(), SSREnable, ArrayRef(emptyargs)); + Function *SSRDisable = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_disable); + builder.SetInsertPoint(&L->getExitBlock()->front()); + builder.CreateCall(SSRDisable->getFunctionType(), SSRDisable, ArrayRef(emptyargs)); + + if (isStore){ + errs()<<"store not done yet \n"; + }else{ + Function *SSRPop = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_pop); + std::array poparg = {dm}; + for (Instruction *I : moveInsts){ + builder.SetInsertPoint(I); + Value *v = 
builder.CreateCall(SSRPop->getFunctionType(), SSRPop, ArrayRef(poparg), "ssr.pop"); + BasicBlock::iterator ii(I); + ReplaceInstWithValue(I->getParent()->getInstList(), ii, v); + } + } + + return; + } + +private: + Loop *L; + BasicBlock *setup; + SmallVector moveInsts; //likely to be just one or maybe two load/store insts bool isStore; + bool _isgen; + unsigned dim; Value *data; Value *bound; Value *stride; - unsigned dm; //"color" - SmallVector conflicts; //"edges" to conflicting SSRStreams + int dm; //"color" + SmallVector conflicts; //"edges" to conflicting SSRStreams }; -Value *SCEVtoValues(const SCEV *scev, BasicBlock *block){ + +Value *SCEVtoValues(const SCEV *scev, ilist *insns){ //FIXME: ugly because instructions are already inserted! => use IRBuilder or sth. - assert(scev && block && "arguments should not be null"); + assert(scev && insns && "arguments should not be null"); switch (scev->getSCEVType()) { case SCEVTypes::scConstant: @@ -75,24 +158,25 @@ Value *SCEVtoValues(const SCEV *scev, BasicBlock *block){ case SCEVTypes::scSignExtend: { const SCEVCastExpr *castSCEV = cast(scev); - Value *v = SCEVtoValues(castSCEV->getOperand(0), block); + Value *v = SCEVtoValues(castSCEV->getOperand(0), insns); if (v){ Instruction *i; switch (scev->getSCEVType()) { case SCEVTypes::scTruncate: - i = CastInst::CreateTruncOrBitCast(v, castSCEV->getType(), "scev.trunc", block); + i = CastInst::CreateTruncOrBitCast(v, castSCEV->getType(), "scev,trunc"); break; case SCEVTypes::scZeroExtend: - i = CastInst::CreateZExtOrBitCast(v, castSCEV->getType(), "scev.zext", block); + i = CastInst::CreateZExtOrBitCast(v, castSCEV->getType(), "scev.zext"); break; case SCEVTypes::scSignExtend: - i = CastInst::CreateSExtOrBitCast(v, castSCEV->getType(), "scev.sext", block); + i = CastInst::CreateSExtOrBitCast(v, castSCEV->getType(), "scev.sext"); break; default: assert(false && "should not happen!"); break; } + insns->push_back(i); return i; } return nullptr; @@ -101,15 +185,16 @@ Value 
*SCEVtoValues(const SCEV *scev, BasicBlock *block){ case SCEVTypes::scMulExpr: { const SCEVCommutativeExpr *binopSCEV = cast(scev); - Value *v1 = SCEVtoValues(binopSCEV->getOperand(0), block); - Value *v2 = SCEVtoValues(binopSCEV->getOperand(1), block); + Value *v1 = SCEVtoValues(binopSCEV->getOperand(0), insns); + Value *v2 = SCEVtoValues(binopSCEV->getOperand(1), insns); if (v1 && v2){ Instruction *binop; if (binopSCEV->getSCEVType() == SCEVTypes::scAddExpr) { - binop = BinaryOperator::CreateAdd(v1, v2, "rcev.add", block); + binop = BinaryOperator::CreateAdd(v1, v2, "rcev.add"); } else { - binop = BinaryOperator::CreateMul(v1, v2, "rcev.mul", block); + binop = BinaryOperator::CreateMul(v1, v2, "rcev.mul"); } + insns->push_back(binop); return binop; } return nullptr; @@ -139,62 +224,6 @@ ConstantInt *SCEVtoConstStep(const SCEV *scev, const SCEV *init, Loop *L){ return nullptr; } -void GenerateSSRInstructions(const SSRStream &stream){ - Module *mod = stream.preheader->getModule(); //module for function declarations, TODO: is this the correct one? 
- IntegerType *i32 = IntegerType::getInt32Ty(stream.preheader->getContext()); - - Instruction *point = stream.preheader->getTerminator(); - - //add all setup instructions to preheader - for (Instruction &I : *stream.setupInsts){ - I.removeFromParent(); - I.insertBefore(point); - } - stream.setupInsts->eraseFromParent(); //not needed anymore ==> delete - - IRBuilder<> builder(point); - - ConstantInt *dm = ConstantInt::get(i32, stream.dm); //datamover id, ty=i32 - ConstantInt *dim = ConstantInt::get(i32, stream.dim - 1); //dimension - 1, ty=i32 - // data pointer, ty=i8* - Function *SSRReadSetup = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_read_imm); //can take _imm bc dm and dim are constant - std::array args = {dm, dim, stream.data}; - builder.CreateCall(SSRReadSetup->getFunctionType(), SSRReadSetup, ArrayRef(args), "ssr.read.setup"); - - ConstantInt *rep; //repetition - 1, ty=i32 - rep = ConstantInt::get(i32, stream.moveInsts.size()); - Function *SSRRepetitionSetup = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_setup_repetition); - std::array repargs = {dm, rep}; - builder.CreateCall(SSRRepetitionSetup->getFunctionType(), SSRRepetitionSetup, ArrayRef(repargs), "ssr.rep.setup"); - - //bound - 1, ty=i32, relative stride, ty=i32 - Function *SSRBoundStrideSetup1D = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_setup_bound_stride_1d); - std::array bsargs = {dm, stream.bound, stream.stride}; - builder.CreateCall(SSRBoundStrideSetup1D->getFunctionType(), SSRBoundStrideSetup1D, ArrayRef(bsargs), "ssr.bound.stride.setup"); - - std::array emptyargs = {}; - Function *SSREnable = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_enable); - builder.CreateCall(SSREnable->getFunctionType(), SSREnable, ArrayRef(emptyargs), "ssr.enable"); - Function *SSRDisable = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_disable); - builder.SetInsertPoint(&stream.exit->front()); - builder.CreateCall(SSRDisable->getFunctionType(), SSRDisable, ArrayRef(emptyargs), 
"ssr.disable"); - - if (stream.isStore){ - errs()<<"store not done yet \n"; - }else{ - Function *SSRPop = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_pop); - std::array poparg = {dm}; - for (Instruction *I : stream.moveInsts){ - builder.SetInsertPoint(I); - Value *v = builder.CreateCall(SSRPop->getFunctionType(), SSRPop, ArrayRef(poparg), "ssr.pop"); - BasicBlock::iterator ii(I); - ReplaceInstWithValue(I->getParent()->getInstList(), ii, v); - } - } - - return; -} - bool runOnLoop( Loop *L, AAResults *AA, LoopInfo *LI, DominatorTree *DT, BlockFrequencyInfo *BFI, TargetLibraryInfo *TLI, TargetTransformInfo *TTI, @@ -245,13 +274,13 @@ bool runOnLoop( errs()<<"backedge taken SCEV is:\n"; bt->dump(); errs()<<"\n"; - BasicBlock *insts = BasicBlock::Create(preheader->getContext(), "insts", preheader->getParent()); + ilist *insns = new iplist(); - Value *repcount = SCEVtoValues(bt, insts); - assert(repcount && "scev 'bt' should be convertible to values/instructions!"); + Value *repcount = SCEVtoValues(bt, insns); bool Changed = false; + /* errs()<<"instructions with SSR replacement potential\n"; unsigned dmid = 0; for (BasicBlock *BB : L->blocks()){ @@ -259,7 +288,7 @@ bool runOnLoop( for (auto &I : *BB){ if (dmid == 3) break;//TODO: make better if (LoadInst *load = dyn_cast(&I)){ - if (!load->getType()->isFloatingPointTy()) continue; + if (load->getType() != Type::getDoubleTy(preheader->getContext())) continue; Value *addr = load->getOperand(0); const SCEV *addrScev = SE->getSCEV(addr); if (SE->hasComputableLoopEvolution(addrScev, L)){ @@ -270,6 +299,8 @@ bool runOnLoop( const SCEV *init = split.first; //const SCEV *step = split.second; + BasicBlock *insts = BasicBlock::Create(preheader->getContext(), "ssr.setup", preheader->getParent()); + Value *baseAddr = SCEVtoValues(init, insts); assert(baseAddr && "some weird SCEV in init SCEV"); errs()<<"init and it's value:\n"; @@ -283,27 +314,44 @@ bool runOnLoop( errs()<<"step value:\n"; stepsize->dump(); - SSRStream s; 
- s.dm = 0; - s.dim = 1; - s.preheader = preheader; - s.exit = exit; - s.setupInsts = insts; - s.moveInsts.push_back(load); - s.stride = stepsize; - s.data = CastInst::CreatePointerCast(addr, Type::getInt8PtrTy(preheader->getContext()), "data.cast", insts); - Value *bd = BinaryOperator::CreateMul(repcount, stepsize, "ssr.bound", insts); - s.bound = CastInst::CreateIntegerCast(bd, IntegerType::getInt32Ty(preheader->getContext()), false, "bound.cast", insts); + Instruction *data = CastInst::CreatePointerCast(addr, Type::getInt8PtrTy(preheader->getContext()), "data.cast", insts); + Instruction *bound = CastInst::CreateIntegerCast(repcount, IntegerType::getInt32Ty(preheader->getContext()), false, "bound.cast", insts); + Instruction *stride = CastInst::CreateIntegerCast(stepsize, IntegerType::getInt32Ty(preheader->getContext()), false, "stride.cast", insts); + SSRStream s(L, insts, ArrayRef(load), 1, data, bound, stride, false); + errs()<<"constructed SSRStream \n"; + + //for now + s.setDM(dmid++); + s.GenerateSSRInstructions(); + Changed = true; + + errs()<<"done \n"; } }else if(StoreInst *store = dyn_cast(&I)){ store->dump(); } } - } + }*/ if (Changed){ - SE->forgetLoop(L); //TODO: maybe use SE->forgetValue instead + //SE->forgetLoop(L); //TODO: maybe use SE->forgetValue instead + errs()<<"inserting insns into preheader:\n"; + Instruction *c = &insns->back(); + while (c) { + Instruction *c_ = insns->getPrevNode(*c); + c->dump(); + c->insertBefore(&*preheader->begin()); + c = c_; + } + }else{ + while(!insns->empty()){ + insns->pop_back(); //delete instructions from back to from to not get live Use when Def is deleted + } } + + errs()<<"done with loop:\n"; + L->dump(); + return Changed; } From 935d4b224efa1287da7eb4c6647af573bdadc471 Mon Sep 17 00:00:00 2001 From: ThomasRupf Date: Sat, 9 Apr 2022 15:15:21 +0200 Subject: [PATCH 07/47] working sp --- .../llvm/Transforms/SSR/SSRInference.h | 35 +++ llvm/lib/Transforms/SSR/SSRInference.cpp | 264 +++++++----------- 2 files 
changed, 140 insertions(+), 159 deletions(-) diff --git a/llvm/include/llvm/Transforms/SSR/SSRInference.h b/llvm/include/llvm/Transforms/SSR/SSRInference.h index b468363c62166..bb32d2e425f42 100644 --- a/llvm/include/llvm/Transforms/SSR/SSRInference.h +++ b/llvm/include/llvm/Transforms/SSR/SSRInference.h @@ -14,6 +14,11 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Transforms/Scalar/LoopPassManager.h" +#include "llvm/IR/Value.h" +#include "llvm/IR/Instruction.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/ilist.h" + namespace llvm { class SSRInferencePass : public PassInfoMixin { @@ -21,6 +26,36 @@ class SSRInferencePass : public PassInfoMixin { PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR, LPMUpdater &); }; +class SSRStream{ +public: + SSRStream(Loop *L, ilist *setup, ArrayRef insts, + unsigned dim, Value *data, Value *bound, Value *stride, bool isStore); + + int getDM(); + void setDM(int dmId); + + void GenerateSSRInstructions(); + +private: + Loop *L; + ilist *setup; + + SmallVector moveInsts; //likely to be just one or maybe two load/store insts + + bool isStore; + + bool _isgen; + + unsigned dim; + Value *data; + Value *bound; + Value *stride; + + int dm; //"color" + SmallVector conflicts; //"edges" to conflicting SSRStreams +}; + + } // namespace llvm #endif // LLVM_TRANSFORMS_SSR_SSRINFERENCE_H diff --git a/llvm/lib/Transforms/SSR/SSRInference.cpp b/llvm/lib/Transforms/SSR/SSRInference.cpp index b497d9f526f11..a99780c6937af 100644 --- a/llvm/lib/Transforms/SSR/SSRInference.cpp +++ b/llvm/lib/Transforms/SSR/SSRInference.cpp @@ -34,16 +34,13 @@ #include "llvm/ADT/ilist.h" #include +#include #define SSR_NUM_DMS 3 using namespace llvm; -namespace{ - -class SSRStream{ -public: - SSRStream(Loop *L, BasicBlock *setup, ArrayRef insts, +SSRStream::SSRStream(Loop *L, ilist *setup, ArrayRef insts, unsigned dim, Value *data, Value *bound, Value *stride, bool isStore) : L(L), setup(setup), moveInsts(), 
isStore(isStore), _isgen(false), dim(dim), data(data), bound(bound), stride(stride), dm(-1), conflicts() @@ -56,89 +53,77 @@ class SSRStream{ assert(stride->getType() == Type::getInt32Ty(L->getHeader()->getContext())); } - int getDM() { return dm; } - void setDM(int dmId) { dm = dmId; } +int SSRStream::getDM() { return dm; } +void SSRStream::setDM(int dmId) { dm = dmId; } - void GenerateSSRInstructions(){ - assert(!_isgen && "this stream has not generated its instructions yet"); - this->_isgen = true; - assert(this->dm >= 0 && this->dm < SSR_NUM_DMS && "stream has valid dm id"); +void SSRStream::GenerateSSRInstructions(){ + assert(!_isgen && "this stream has not generated its instructions yet"); + this->_isgen = true; + assert(this->dm >= 0 && this->dm < SSR_NUM_DMS && "stream has valid dm id"); - Module *mod = L->getHeader()->getModule(); //module for function declarations, TODO: is this the correct one? - IntegerType *i32 = IntegerType::getInt32Ty(L->getHeader()->getContext()); + Module *mod = L->getHeader()->getModule(); //module for function declarations, TODO: is this the correct one? + IntegerType *i32 = IntegerType::getInt32Ty(L->getHeader()->getContext()); - //TODO: add branch from setup to header? 
+ Instruction *point = L->getLoopPreheader()->getTerminator(); - Instruction *point = setup->getTerminator(); + Instruction *i = &setup->front(); + while(i){ + Instruction *iNext = setup->getNextNode(*i); + i->insertBefore(point); + i = iNext; + } - IRBuilder<> builder(point); + IRBuilder<> builder(point); - ConstantInt *dm = ConstantInt::get(i32, this->dm); //datamover id, ty=i32 - ConstantInt *dim = ConstantInt::get(i32, this->dim - 1); //dimension - 1, ty=i32 - // data pointer, ty=i8* - Function *SSRReadSetup = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_read_imm); //can take _imm bc dm and dim are constant - std::array args = {dm, dim, data}; - builder.CreateCall(SSRReadSetup->getFunctionType(), SSRReadSetup, ArrayRef(args)); + ConstantInt *dm = ConstantInt::get(i32, this->dm); //datamover id, ty=i32 + ConstantInt *dim = ConstantInt::get(i32, this->dim - 1); //dimension - 1, ty=i32 + // data pointer, ty=i8* + Function *SSRReadSetup = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_read_imm); //can take _imm bc dm and dim are constant + std::array args = {dm, dim, data}; + builder.CreateCall(SSRReadSetup->getFunctionType(), SSRReadSetup, ArrayRef(args)); - errs()<<"generated ssr_read_imm \n"; + errs()<<"generated ssr_read_imm \n"; - ConstantInt *rep; //repetition - 1, ty=i32 - rep = ConstantInt::get(i32, moveInsts.size()); - Function *SSRRepetitionSetup = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_setup_repetition); - std::array repargs = {dm, rep}; - builder.CreateCall(SSRRepetitionSetup->getFunctionType(), SSRRepetitionSetup, ArrayRef(repargs)); + ConstantInt *rep; //repetition - 1, ty=i32 + rep = ConstantInt::get(i32, moveInsts.size()); + Function *SSRRepetitionSetup = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_setup_repetition); + std::array repargs = {dm, rep}; + builder.CreateCall(SSRRepetitionSetup->getFunctionType(), SSRRepetitionSetup, ArrayRef(repargs)); - errs()<<"generated ssr_setup_repetitions \n"; + errs()<<"generated 
ssr_setup_repetitions \n"; - //bound - 1, ty=i32, relative stride, ty=i32 - Function *SSRBoundStrideSetup1D = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_setup_bound_stride_1d); - std::array bsargs = {dm, bound, stride}; - builder.CreateCall(SSRBoundStrideSetup1D->getFunctionType(), SSRBoundStrideSetup1D, ArrayRef(bsargs)); + //bound - 1, ty=i32, relative stride, ty=i32 + Function *SSRBoundStrideSetup1D = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_setup_bound_stride_1d); + std::array bsargs = {dm, bound, stride}; + builder.CreateCall(SSRBoundStrideSetup1D->getFunctionType(), SSRBoundStrideSetup1D, ArrayRef(bsargs)); - errs()<<"generated ssr_setup_bound_stride_1d \n"; + errs()<<"generated ssr_setup_bound_stride_1d \n"; - std::array emptyargs = {}; - Function *SSREnable = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_enable); - builder.CreateCall(SSREnable->getFunctionType(), SSREnable, ArrayRef(emptyargs)); - Function *SSRDisable = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_disable); - builder.SetInsertPoint(&L->getExitBlock()->front()); - builder.CreateCall(SSRDisable->getFunctionType(), SSRDisable, ArrayRef(emptyargs)); + std::array emptyargs = {}; + Function *SSREnable = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_enable); + builder.CreateCall(SSREnable->getFunctionType(), SSREnable, ArrayRef(emptyargs)); + Function *SSRDisable = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_disable); + builder.SetInsertPoint(L->getExitBlock()->getTerminator()); + builder.CreateCall(SSRDisable->getFunctionType(), SSRDisable, ArrayRef(emptyargs)); - if (isStore){ - errs()<<"store not done yet \n"; - }else{ - Function *SSRPop = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_pop); - std::array poparg = {dm}; - for (Instruction *I : moveInsts){ - builder.SetInsertPoint(I); - Value *v = builder.CreateCall(SSRPop->getFunctionType(), SSRPop, ArrayRef(poparg), "ssr.pop"); - BasicBlock::iterator ii(I); - 
ReplaceInstWithValue(I->getParent()->getInstList(), ii, v); - } + if (isStore){ + errs()<<"store not done yet \n"; + }else{ + Function *SSRPop = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_pop); + std::array poparg = {dm}; + for (Instruction *I : moveInsts){ + builder.SetInsertPoint(I); + Value *v = builder.CreateCall(SSRPop->getFunctionType(), SSRPop, ArrayRef(poparg), "ssr.pop"); + BasicBlock::iterator ii(I); + ReplaceInstWithValue(I->getParent()->getInstList(), ii, v); } - - return; } -private: - Loop *L; - BasicBlock *setup; - - SmallVector moveInsts; //likely to be just one or maybe two load/store insts - - bool isStore; - - bool _isgen; - - unsigned dim; - Value *data; - Value *bound; - Value *stride; + return; +} - int dm; //"color" - SmallVector conflicts; //"edges" to conflicting SSRStreams -}; +namespace{ Value *SCEVtoValues(const SCEV *scev, ilist *insns){ //FIXME: ugly because instructions are already inserted! => use IRBuilder or sth. @@ -271,68 +256,74 @@ bool runOnLoop( } const SCEV *bt = SE->getBackedgeTakenCount(L); - errs()<<"backedge taken SCEV is:\n"; - bt->dump(); errs()<<"\n"; ilist *insns = new iplist(); Value *repcount = SCEVtoValues(bt, insns); - bool Changed = false; - - /* + std::vector streams; + errs()<<"instructions with SSR replacement potential\n"; - unsigned dmid = 0; for (BasicBlock *BB : L->blocks()){ //FIXME: check whether block is on all paths from header to itself for (auto &I : *BB){ - if (dmid == 3) break;//TODO: make better if (LoadInst *load = dyn_cast(&I)){ if (load->getType() != Type::getDoubleTy(preheader->getContext())) continue; Value *addr = load->getOperand(0); const SCEV *addrScev = SE->getSCEV(addr); - if (SE->hasComputableLoopEvolution(addrScev, L)){ - errs()<<"load instr, addr instr, and scev of address:\n"; - load->dump(); addr->dump(); addrScev->dump(); - - auto split = SE->SplitIntoInitAndPostInc(L, addrScev); - const SCEV *init = split.first; - //const SCEV *step = split.second; - - BasicBlock *insts 
= BasicBlock::Create(preheader->getContext(), "ssr.setup", preheader->getParent()); - - Value *baseAddr = SCEVtoValues(init, insts); - assert(baseAddr && "some weird SCEV in init SCEV"); - errs()<<"init and it's value:\n"; - init->dump(); baseAddr->dump(); - - ConstantInt *stepsize = SCEVtoConstStep(addrScev, init, L); - if (!stepsize){ - errs()<<"failed to compute stepsize\n"; - return false; - } - errs()<<"step value:\n"; - stepsize->dump(); - - Instruction *data = CastInst::CreatePointerCast(addr, Type::getInt8PtrTy(preheader->getContext()), "data.cast", insts); - Instruction *bound = CastInst::CreateIntegerCast(repcount, IntegerType::getInt32Ty(preheader->getContext()), false, "bound.cast", insts); - Instruction *stride = CastInst::CreateIntegerCast(stepsize, IntegerType::getInt32Ty(preheader->getContext()), false, "stride.cast", insts); - SSRStream s(L, insts, ArrayRef(load), 1, data, bound, stride, false); - errs()<<"constructed SSRStream \n"; - - //for now - s.setDM(dmid++); - s.GenerateSSRInstructions(); - Changed = true; - - errs()<<"done \n"; + if (!SE->hasComputableLoopEvolution(addrScev, L)) { + errs()<<"addrScev has no computable loop evolution:\n"; + addrScev->dump(); + continue; } + errs()<<"load instr, addr instr, and scev of address:\n"; + load->dump(); addr->dump(); addrScev->dump(); + + auto split = SE->SplitIntoInitAndPostInc(L, addrScev); + const SCEV *init = split.first; + //const SCEV *step = split.second; + + ilist *setup = new iplist(); + + Value *baseAddr = SCEVtoValues(init, setup); + assert(baseAddr && "some weird SCEV in init SCEV"); + errs()<<"init and it's value:\n"; + init->dump(); baseAddr->dump(); + + ConstantInt *stepsize = SCEVtoConstStep(addrScev, init, L); + if (!stepsize){ + errs()<<"failed to compute stepsize\n"; + return false; + } + errs()<<"step value:\n"; + stepsize->dump(); + + Instruction *data = CastInst::CreatePointerCast(baseAddr, Type::getInt8PtrTy(preheader->getContext()), "data.cast"); + setup->push_back(data); + 
Instruction *bound = CastInst::CreateIntegerCast(repcount, IntegerType::getInt32Ty(preheader->getContext()), false, "bound.cast"); + setup->push_back(bound); + Instruction *stride = CastInst::CreateIntegerCast(stepsize, IntegerType::getInt32Ty(preheader->getContext()), false, "stride.cast"); + setup->push_back(stride); + SSRStream *s = new SSRStream(L, setup, ArrayRef(load), 1, data, bound, stride, false); + streams.push_back(s); + errs()<<"constructed SSRStream \n"; }else if(StoreInst *store = dyn_cast(&I)){ store->dump(); } } - }*/ - + } + + bool Changed = false; + + unsigned dmid = 0; + + for (SSRStream *s : streams){ + if (dmid >= SSR_NUM_DMS) break; + s->setDM(dmid++); + s->GenerateSSRInstructions(); + Changed = true; + } + if (Changed){ //SE->forgetLoop(L); //TODO: maybe use SE->forgetValue instead errs()<<"inserting insns into preheader:\n"; @@ -355,51 +346,6 @@ bool runOnLoop( return Changed; } -/* - InductionDescriptor IndDesc; - if(!L->getInductionDescriptor(*SE, IndDesc)){ - errs()<<"no loop induction variable found\n"; - return false; - } - if(IndDesc.getKind() != InductionDescriptor::IK_IntInduction){ - //TODO: could allow with addresses too - errs()<<"induction not on integer\n"; - return false; - } - Value *tc = findTripCount(L, &IndDesc); - if (!tc){ - errs()<<"trip count not found\n"; - return false; - } -*/ - -/* -Value *findTripCount(Loop *L, InductionDescriptor *IndDesc){ - ConstantInt *step = IndDesc->getConstIntStepValue(); - if (!step){ - errs()<<"step is not const\n"; - return nullptr; - } - BasicBlock *exiting = L->getExitingBlock(); - if (!exiting){ - errs()<<"no, or multiple latches \n"; - return nullptr; - } - BasicBlock *header = L->getHeader(); - errs()<<"InductionBinOP = "<getInductionBinOp()->getNameOrAsOperand()<<"\n"; - for (User *U : IndDesc->getInductionBinOp()->users()){ //for all users of induction variable - if(ICmpInst *cmp = dyn_cast(U)){ - if(cmp->getParent() != exiting) continue; //looking for integer-comparisons in 
exiting block - for (User *Ucmp : cmp->users()){ - if(BranchInst *br = dyn_cast(Ucmp)){ - //if (br->isConditional() && ((br->get) || ())) - } - } - } - } - return nullptr; -}*/ - } //end of namespace PreservedAnalyses SSRInferencePass::run(Loop &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR, LPMUpdater &){ From 85445631831178a2b864b8f8ef603cfbb9edb03c Mon Sep 17 00:00:00 2001 From: ThomasRupf Date: Sat, 9 Apr 2022 21:41:10 +0200 Subject: [PATCH 08/47] working pipeline --- llvm/lib/Passes/PassBuilder.cpp | 3 +++ llvm/lib/Transforms/SSR/SSRInference.cpp | 33 ++++++++++++++++-------- 2 files changed, 25 insertions(+), 11 deletions(-) diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp index 52e322f51b4f0..72e92c4403854 100644 --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -556,6 +556,7 @@ PassBuilder::buildO1FunctionSimplificationPipeline(OptimizationLevel Level, // Simplify the loop body. We do this initially to clean up after other loop // passes run, either when iterating on a loop or on inner loops with // implications on the outer loop. 
+ LPM1.addPass(LoopInstSimplifyPass()); LPM1.addPass(LoopSimplifyCFGPass()); @@ -564,6 +565,8 @@ PassBuilder::buildO1FunctionSimplificationPipeline(OptimizationLevel Level, LPM1.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap)); LPM1.addPass(SimpleLoopUnswitchPass()); + LPM1.addPass(SSRInferencePass()); + LPM2.addPass(LoopIdiomRecognizePass()); LPM2.addPass(IndVarSimplifyPass()); diff --git a/llvm/lib/Transforms/SSR/SSRInference.cpp b/llvm/lib/Transforms/SSR/SSRInference.cpp index a99780c6937af..92e072b443175 100644 --- a/llvm/lib/Transforms/SSR/SSRInference.cpp +++ b/llvm/lib/Transforms/SSR/SSRInference.cpp @@ -8,6 +8,7 @@ #include "llvm/Transforms/SSR/SSRInference.h" #include "llvm/InitializePasses.h" +#include "llvm/Passes/PassBuilder.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" @@ -40,6 +41,9 @@ using namespace llvm; +static cl::opt EnableSSRInference("ssr-inference", cl::Hidden, cl::init(false), + cl::desc("inference of SSR intrinsics")); + SSRStream::SSRStream(Loop *L, ilist *setup, ArrayRef insts, unsigned dim, Value *data, Value *bound, Value *stride, bool isStore) : L(L), setup(setup), moveInsts(), isStore(isStore), _isgen(false), @@ -99,13 +103,6 @@ void SSRStream::GenerateSSRInstructions(){ errs()<<"generated ssr_setup_bound_stride_1d \n"; - std::array emptyargs = {}; - Function *SSREnable = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_enable); - builder.CreateCall(SSREnable->getFunctionType(), SSREnable, ArrayRef(emptyargs)); - Function *SSRDisable = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_disable); - builder.SetInsertPoint(L->getExitBlock()->getTerminator()); - builder.CreateCall(SSRDisable->getFunctionType(), SSRDisable, ArrayRef(emptyargs)); - if (isStore){ errs()<<"store not done yet \n"; }else{ @@ -126,8 +123,9 @@ void SSRStream::GenerateSSRInstructions(){ namespace{ Value *SCEVtoValues(const SCEV *scev, ilist *insns){ - //FIXME: ugly because instructions are already inserted! 
=> use IRBuilder or sth. assert(scev && insns && "arguments should not be null"); + errs()<<"\t"; + scev->dump(); switch (scev->getSCEVType()) { case SCEVTypes::scConstant: @@ -282,10 +280,11 @@ bool runOnLoop( auto split = SE->SplitIntoInitAndPostInc(L, addrScev); const SCEV *init = split.first; //const SCEV *step = split.second; - + ilist *setup = new iplist(); - + Value *baseAddr = SCEVtoValues(init, setup); + assert(baseAddr && "some weird SCEV in init SCEV"); errs()<<"init and it's value:\n"; init->dump(); baseAddr->dump(); @@ -334,6 +333,17 @@ bool runOnLoop( c->insertBefore(&*preheader->begin()); c = c_; } + + //add SSRenable and -disable calls in preheader and exit + IRBuilder<> builder(preheader->getTerminator()); + Module *mod = preheader->getModule(); + std::array emptyargs = {}; + Function *SSREnable = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_enable); + builder.CreateCall(SSREnable->getFunctionType(), SSREnable, ArrayRef(emptyargs)); + Function *SSRDisable = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_disable); + builder.SetInsertPoint(L->getExitBlock()->getTerminator()); + builder.CreateCall(SSRDisable->getFunctionType(), SSRDisable, ArrayRef(emptyargs)); + }else{ while(!insns->empty()){ insns->pop_back(); //delete instructions from back to from to not get live Use when Def is deleted @@ -349,9 +359,10 @@ bool runOnLoop( } //end of namespace PreservedAnalyses SSRInferencePass::run(Loop &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR, LPMUpdater &){ + //if (!EnableSSRInference) return PreservedAnalyses::all(); //if flag is not set, skip errs()<<"# =============== SSR Inference =============== #\n"; if(!runOnLoop(&L, &AR.AA, &AR.LI, &AR.DT, AR.BFI, &AR.TLI, &AR.TTI, &AR.SE, AR.MSSA)){ - return PreservedAnalyses::all(); + //return PreservedAnalyses::all(); } return PreservedAnalyses::none(); } From def0e0a6b86f4693165213f2a0e89255cc8c8c54 Mon Sep 17 00:00:00 2001 From: ThomasRupf Date: Mon, 11 Apr 2022 16:53:34 +0200 
Subject: [PATCH 09/47] stuck on step --- llvm/lib/Transforms/SSR/SSRInference.cpp | 158 +++++++++++++---------- 1 file changed, 93 insertions(+), 65 deletions(-) diff --git a/llvm/lib/Transforms/SSR/SSRInference.cpp b/llvm/lib/Transforms/SSR/SSRInference.cpp index 92e072b443175..435b073a7212c 100644 --- a/llvm/lib/Transforms/SSR/SSRInference.cpp +++ b/llvm/lib/Transforms/SSR/SSRInference.cpp @@ -20,6 +20,7 @@ #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" +#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" @@ -122,6 +123,96 @@ void SSRStream::GenerateSSRInstructions(){ namespace{ +/// guarantees: +/// L has 1 preheader and 1 dedicated exit +/// L has 1 backedge and 1 exiting block +/// bt SCEV can be expanded to instructions at insertionsPoint +bool checkLoop(const Loop *L, DominatorTree *DT, ScalarEvolution *SE, Instruction *InsertionPoint){ + if (!L->isLCSSAForm(*DT) || !L->getLoopPreheader() || !L->getExitBlock() + || !L->getExitBlock() || !L->hasDedicatedExits() || L->getNumBackEdges() != 1){ + errs()<<"malformed loop: "; L->dump(); + return false; + } + if (!SE->hasLoopInvariantBackedgeTakenCount(L)){ + errs()<<"cannot calculate backedge taken count\n"; + return false; + } + const SCEV *bt = SE->getBackedgeTakenCount(L); + if(!isSafeToExpandAt(bt, InsertionPoint, *SE) /*|| !SE->isAvailableAtLoopEntry(bt, L)*/){ + errs()<<"cannot expand bt SCEV: "; bt->dump(); + } + errs()<<"loop is well-formed: "; bt->dump(); + return true; +} + +/// check whether BB is on all controlflow paths from header to header +bool isOnAllControlFlowPaths(const BasicBlock *BB, const Loop *L, const DominatorTree *DT){ + return DT->dominates(BB, L->getHeader()); +} + +bool runOnLoop( + const Loop *L, AAResults *AA, LoopInfo *LI, DominatorTree *DT, + BlockFrequencyInfo *BFI, TargetLibraryInfo *TLI, TargetTransformInfo *TTI, + ScalarEvolution *SE, MemorySSA *MSSA) 
{ + L->dump(); + + if (!L->getLoopPreheader() || !checkLoop(L, DT, SE, L->getLoopPreheader()->getTerminator())) return true; + + SmallVector streams; + + for (const auto &BB : L->getBlocks()){ + //TODO: how to allow inner loops? + if (!isOnAllControlFlowPaths(BB, L, DT)) continue; + + for (auto &I : *BB){ + Value *Addr; + if (LoadInst *Load = dyn_cast(&I)){ + Addr = Load->getPointerOperand(); + }else if (StoreInst *Store = dyn_cast(&I)){ + Addr = Store->getPointerOperand(); + }else{ + continue; //cannot do anything with this instruction + } + + const SCEV *AddrSCEV = SE->getSCEVAtScope(Addr, L); + if (!SE->hasComputableLoopEvolution(AddrSCEV, L)) continue; + errs()<<"has computable loop evolution: "; AddrSCEV->dump(); + + auto split = SE->SplitIntoInitAndPostInc(L, AddrSCEV); + const SCEV *SetupAddrSCEV = split.first; + const SCEV *PostIncSCEV = split.second; + if (!isSafeToExpandAt(SetupAddrSCEV, L->getLoopPreheader()->getTerminator(), *SE)) continue; + errs()<<"can expand setup addr scev in preheader: "; SetupAddrSCEV->dump(); + if (!isSafeToExpandAt(PostIncSCEV, L->getLoopPreheader()->getTerminator(), *SE)) continue; + errs()<<"can expand post inc addr scev in preheader: "; PostIncSCEV->dump(); + + + } + } + + return true; +} + +} //end of namespace + +PreservedAnalyses SSRInferencePass::run(Loop &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR, LPMUpdater &){ + //if (!EnableSSRInference) return PreservedAnalyses::all(); //if flag is not set, skip + errs()<<"# =============== SSR Inference =============== #\n"; + runOnLoop(&L, &AR.AA, &AR.LI, &AR.DT, AR.BFI, &AR.TLI, &AR.TTI, &AR.SE, AR.MSSA); + errs()<<"# =============== SSR done =============== #\n"; + return PreservedAnalyses::none(); +} + + + + +/* + const SCEV *bt = SE->getBackedgeTakenCount(L); + SCEVExpander ex(*SE, L->getHeader()->getModule()->getDataLayout(), "backedge.taken"); + ex.setInsertPoint(L->getLoopPreheader()->getTerminator()); + Value *v = ex.expandCodeFor(bt); + v->dump(); + 
Value *SCEVtoValues(const SCEV *scev, ilist *insns){ assert(scev && insns && "arguments should not be null"); errs()<<"\t"; @@ -207,59 +298,8 @@ ConstantInt *SCEVtoConstStep(const SCEV *scev, const SCEV *init, Loop *L){ return nullptr; } -bool runOnLoop( - Loop *L, AAResults *AA, LoopInfo *LI, DominatorTree *DT, - BlockFrequencyInfo *BFI, TargetLibraryInfo *TLI, TargetTransformInfo *TTI, - ScalarEvolution *SE, MemorySSA *MSSA) { - L->dump(); - - if (!L->isInnermost() || !L->isLCSSAForm(*DT)){ - errs()<<"loop not innermost or not LCSSA\n"; - return false; - } - - BasicBlock *preheader = L->getLoopPreheader(); - if (!preheader){ - //TODO: can alleviate this by adding one if SSR setup needed - errs()<<"loop has no preheader\n"; - return false; - } - BasicBlock *exit = L->getExitBlock(); - if (!exit){ - errs()<<"loop has no or multiple exits\n"; - return false; - } - - BasicBlock *exiting = L->getExitingBlock(); - if (!exiting){ - errs()<<"no, or multiple exiting blocks \n"; - return false; - } - - if (!L->hasDedicatedExits()){ - //TODO: relatively easily fixable ==> add block before single exit block - errs()<<"exit block is not dedicated \n"; - return false; - } - - if (L->getNumBackEdges() != 1){ - errs()<<"# of back-edges is not 1 \n"; - return false; - } - - if (!SE->hasLoopInvariantBackedgeTakenCount(L)){ - errs()<<"back-edge taken count is not loop-inv\n"; - return false; - } - - const SCEV *bt = SE->getBackedgeTakenCount(L); - - ilist *insns = new iplist(); - - Value *repcount = SCEVtoValues(bt, insns); - - std::vector streams; +std::vector streams; errs()<<"instructions with SSR replacement potential\n"; for (BasicBlock *BB : L->blocks()){ @@ -353,17 +393,5 @@ bool runOnLoop( errs()<<"done with loop:\n"; L->dump(); - return Changed; -} - -} //end of namespace - -PreservedAnalyses SSRInferencePass::run(Loop &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR, LPMUpdater &){ - //if (!EnableSSRInference) return PreservedAnalyses::all(); //if flag is not 
set, skip - errs()<<"# =============== SSR Inference =============== #\n"; - if(!runOnLoop(&L, &AR.AA, &AR.LI, &AR.DT, AR.BFI, &AR.TLI, &AR.TTI, &AR.SE, AR.MSSA)){ - //return PreservedAnalyses::all(); - } - return PreservedAnalyses::none(); -} +*/ \ No newline at end of file From d98664dba532071047e2b61f94f1bc195c1eda8e Mon Sep 17 00:00:00 2001 From: ThomasRupf Date: Mon, 11 Apr 2022 22:58:00 +0200 Subject: [PATCH 10/47] work on analysis,pass split --- .../llvm/Analysis/AffineAccessAnalysis.h | 56 +++ .../llvm/Transforms/SSR/SSRInference.h | 38 +-- llvm/lib/Analysis/AffineAccessAnalysis.cpp | 179 ++++++++++ llvm/lib/Analysis/CMakeLists.txt | 1 + llvm/lib/Passes/PassBuilder.cpp | 9 +- llvm/lib/Passes/PassRegistry.def | 5 +- llvm/lib/Transforms/SSR/SSRInference.cpp | 323 +++++++++++------- 7 files changed, 447 insertions(+), 164 deletions(-) create mode 100644 llvm/include/llvm/Analysis/AffineAccessAnalysis.h create mode 100644 llvm/lib/Analysis/AffineAccessAnalysis.cpp diff --git a/llvm/include/llvm/Analysis/AffineAccessAnalysis.h b/llvm/include/llvm/Analysis/AffineAccessAnalysis.h new file mode 100644 index 0000000000000..673b3831fb5fa --- /dev/null +++ b/llvm/include/llvm/Analysis/AffineAccessAnalysis.h @@ -0,0 +1,56 @@ +#include "llvm/IR/PassManager.h" +#include "llvm/Passes/PassBuilder.h" +#include "llvm/Passes/PassPlugin.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/ADT/SmallVector.h" + +#include +#include + +namespace llvm { + +class AffineAcc{ +public: + AffineAcc(const Loop *L, ArrayRef instructions, const SCEV *data, const SCEV *bound, const SCEV *stride); + void dump(); + unsigned getDimension(); +private: + void addLoop(const Loop *L, const SCEV *bound, const SCEV *stride); //add dimension + + const SCEV *data; + SmallVector bounds; //from outer- to innermost loop + SmallVector strides; //from outer- to innermost loop + SmallVector instructions; //instructions that are accessing the memory according 
to data, bounds, and strides. + const Loop *L; //outermost loop +}; + +class AffineAccess{ +public: + AffineAccess(ScalarEvolution &SE) :SE(SE) {} + void addAccess(AffineAcc &a); + ArrayRef getAll(); +private: + SmallVector accesses; + ScalarEvolution &SE; +}; + +class AffineAccessAnalysis : public AnalysisInfoMixin { + friend AnalysisInfoMixin; + static AnalysisKey Key; + +public: + using Result = AffineAccess; + Result run(Function &F, FunctionAnalysisManager &AM); +}; + +// This is the analysis pass that will be invocable via opt +class AffineAccessAnalysisPass : public AnalysisInfoMixin { + raw_ostream &OS; + +public: + explicit AffineAccessAnalysisPass(raw_ostream &OS) : OS(OS) {} + PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); +}; + +} // namespace llvm \ No newline at end of file diff --git a/llvm/include/llvm/Transforms/SSR/SSRInference.h b/llvm/include/llvm/Transforms/SSR/SSRInference.h index bb32d2e425f42..250feff7d5d48 100644 --- a/llvm/include/llvm/Transforms/SSR/SSRInference.h +++ b/llvm/include/llvm/Transforms/SSR/SSRInference.h @@ -9,53 +9,21 @@ #ifndef LLVM_TRANSFORMS_SSR_SSRINFERENCE_H #define LLVM_TRANSFORMS_SSR_SSRINFERENCE_H -#include "llvm/Analysis/LoopInfo.h" #include "llvm/IR/PassManager.h" #include "llvm/Support/CommandLine.h" -#include "llvm/Transforms/Scalar/LoopPassManager.h" - -#include "llvm/IR/Value.h" -#include "llvm/IR/Instruction.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/ilist.h" namespace llvm { class SSRInferencePass : public PassInfoMixin { public: - PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR, LPMUpdater &); + PreservedAnalyses run(Function &F, FunctionAnalysisManager &FAM); }; -class SSRStream{ +class SSRGenerationPass : PassInfoMixin{ public: - SSRStream(Loop *L, ilist *setup, ArrayRef insts, - unsigned dim, Value *data, Value *bound, Value *stride, bool isStore); - - int getDM(); - void setDM(int dmId); - - void GenerateSSRInstructions(); - 
-private: - Loop *L; - ilist *setup; - - SmallVector moveInsts; //likely to be just one or maybe two load/store insts - - bool isStore; - - bool _isgen; - - unsigned dim; - Value *data; - Value *bound; - Value *stride; - - int dm; //"color" - SmallVector conflicts; //"edges" to conflicting SSRStreams + PreservedAnalyses run(Function &F, FunctionAnalysisManager &FAM); }; - } // namespace llvm #endif // LLVM_TRANSFORMS_SSR_SSRINFERENCE_H diff --git a/llvm/lib/Analysis/AffineAccessAnalysis.cpp b/llvm/lib/Analysis/AffineAccessAnalysis.cpp new file mode 100644 index 0000000000000..5629a425545e8 --- /dev/null +++ b/llvm/lib/Analysis/AffineAccessAnalysis.cpp @@ -0,0 +1,179 @@ +#include "llvm/Analysis/AffineAccessAnalysis.h" + +#include "llvm/IR/PassManager.h" +#include "llvm/Passes/PassBuilder.h" +#include "llvm/Passes/PassPlugin.h" +#include "llvm/Support/raw_ostream.h" + +#include "llvm/InitializePasses.h" + +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/IntrinsicsRISCV.h" +#include "llvm/IR/Dominators.h" + +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" + +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/ScalarEvolutionExpressions.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/AliasAnalysisEvaluator.h" +#include "llvm/Analysis/AliasSetTracker.h" + +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Support/Casting.h" + +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/TinyPtrVector.h" +#include "llvm/ADT/ilist.h" + +#include +#include +#include + +using namespace llvm; + +AffineAcc::AffineAcc(const Loop *L, ArrayRef instructions, const SCEV *data, + const SCEV *bound, const SCEV 
*stride) : data(data), L(L){ + this->instructions.append(instructions.begin(), instructions.end()); + this->bounds.push_back(bound); + this->strides.push_back(stride); + return; +} + +void AffineAcc::addLoop(const Loop *L, const SCEV *bound, const SCEV *stride){ + this->L = L; + this->bounds.push_back(bound); + this->strides.push_back(stride); + return; +} + +unsigned AffineAcc::getDimension(){ + return this->bounds.size(); +} + +void AffineAcc::dump(){ + errs()<<"strd,bd of "; this->data->dump(); + for (unsigned i = 0; i < this->getDimension(); i++){ + this->strides[i]->dump(); + this->bounds[i]->dump(); + } +} + +//================== AffineAcces, Result of Analysis ========================================= +void AffineAccess::addAccess(AffineAcc &a){ + this->accesses.push_back(a); + return; +} + +ArrayRef AffineAccess::getAll(){ + ArrayRef ar(accesses.begin(), accesses.end()); + return ar; //copy +} + +//================== Affine Acces Analysis ================================================== + +namespace { + +/// guarantees: +/// L has 1 preheader and 1 dedicated exit +/// L has 1 backedge and 1 exiting block +/// bt SCEV can be expanded to instructions at insertionsPoint +bool checkLoop(const Loop *L, DominatorTree &DT, ScalarEvolution &SE, Instruction *InsertionPoint){ + if (!L->isLCSSAForm(DT) || !L->getLoopPreheader() || !L->getExitBlock() + || !L->getExitBlock() || !L->hasDedicatedExits() || L->getNumBackEdges() != 1){ + errs()<<"malformed loop: "; L->dump(); + return false; + } + if (!SE.hasLoopInvariantBackedgeTakenCount(L)){ + errs()<<"cannot calculate backedge taken count\n"; + return false; + } + const SCEV *bt = SE.getBackedgeTakenCount(L); + if(!isSafeToExpandAt(bt, InsertionPoint, SE) /*|| !SE->isAvailableAtLoopEntry(bt, L)*/){ + errs()<<"cannot expand bt SCEV: "; bt->dump(); + } + errs()<<"loop is well-formed: "; bt->dump(); + return true; +} + +/// check whether BB is on all controlflow paths from header to header +bool 
isOnAllControlFlowPaths(const BasicBlock *BB, const Loop *L, const DominatorTree &DT){ + return DT.dominates(BB, L->getHeader()); +} + +AffineAccess &runOnFunction(Function &F, LoopInfo &LI, DominatorTree &DT, ScalarEvolution &SE, AAResults &AA){ + AffineAccess *aa = new AffineAccess(SE); + + for (const Loop *L : LI){ + errs()<<"loop: "; L->dump(); + + if (!L->getLoopPreheader() || !checkLoop(L, DT, SE, L->getLoopPreheader()->getTerminator())) continue; + + for (const auto &BB : L->getBlocks()){ + //TODO: how to allow inner loops? + if (!isOnAllControlFlowPaths(BB, L, DT)) continue; + + for (auto &I : *BB){ + Value *Addr; + if (LoadInst *Load = dyn_cast(&I)){ + Addr = Load->getPointerOperand(); + }else if (StoreInst *Store = dyn_cast(&I)){ + Addr = Store->getPointerOperand(); + }else{ + continue; //cannot do anything with this instruction + } + + const SCEV *AddrSCEV = SE.getSCEVAtScope(Addr, L); + if (!SE.hasComputableLoopEvolution(AddrSCEV, L)) continue; + errs()<<"has computable loop evolution: "; AddrSCEV->dump(); + + auto split = SE.SplitIntoInitAndPostInc(L, AddrSCEV); + const SCEV *SetupAddrSCEV = split.first; + const SCEV *PostIncSCEV = split.second; + if (!isSafeToExpandAt(SetupAddrSCEV, L->getLoopPreheader()->getTerminator(), SE)) continue; + errs()<<"can expand setup addr scev in preheader: "; SetupAddrSCEV->dump(); + + + } + } + + } + + return *aa; +} + +} //end of namespace + +AnalysisKey AffineAccessAnalysis::Key; + +AffineAccess AffineAccessAnalysis::run(Function &F, FunctionAnalysisManager &FAM) { + + errs()<<"running AffineAccessAnalysis on "<(F); + DominatorTree &DT = FAM.getResult(F); + ScalarEvolution &SE = FAM.getResult(F); + AAResults &AA = FAM.getResult(F); + + AffineAccess aa = runOnFunction(F, LI, DT, SE, AA); + return aa; +} + +//================== Affine Acces Analysis Pass for opt ======================================= +PreservedAnalyses AffineAccessAnalysisPass::run(Function &F, FunctionAnalysisManager &FAM) { + this->OS<<"enable 
debugging to see:\n"; + AffineAccess AA = FAM.getResult(F); + for (auto A : AA.getAll()){ + A.dump(); + } + return PreservedAnalyses::all(); +} \ No newline at end of file diff --git a/llvm/lib/Analysis/CMakeLists.txt b/llvm/lib/Analysis/CMakeLists.txt index f31cf349b09aa..887b2176fe730 100644 --- a/llvm/lib/Analysis/CMakeLists.txt +++ b/llvm/lib/Analysis/CMakeLists.txt @@ -14,6 +14,7 @@ if (DEFINED LLVM_HAVE_TF_AOT OR DEFINED LLVM_HAVE_TF_API) endif() add_llvm_component_library(LLVMAnalysis + AffineAccessAnalysis.cpp AliasAnalysis.cpp AliasAnalysisEvaluator.cpp AliasAnalysisSummary.cpp diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp index 72e92c4403854..e3f1422552e34 100644 --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -16,6 +16,7 @@ #include "llvm/Passes/PassBuilder.h" #include "llvm/ADT/StringSwitch.h" +#include "llvm/Analysis/AffineAccessAnalysis.h" #include "llvm/Analysis/AliasAnalysisEvaluator.h" #include "llvm/Analysis/AliasSetTracker.h" #include "llvm/Analysis/AssumptionCache.h" @@ -565,8 +566,6 @@ PassBuilder::buildO1FunctionSimplificationPipeline(OptimizationLevel Level, LPM1.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap)); LPM1.addPass(SimpleLoopUnswitchPass()); - LPM1.addPass(SSRInferencePass()); - LPM2.addPass(LoopIdiomRecognizePass()); LPM2.addPass(IndVarSimplifyPass()); @@ -597,6 +596,9 @@ PassBuilder::buildO1FunctionSimplificationPipeline(OptimizationLevel Level, DebugLogging)); FPM.addPass(SimplifyCFGPass()); FPM.addPass(InstCombinePass()); + + FPM.addPass(SSRInferencePass()); + if (EnableLoopFlatten) FPM.addPass(LoopFlattenPass()); // The loop passes in LPM2 (LoopFullUnrollPass) do not preserve MemorySSA. 
@@ -761,6 +763,9 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level, DebugLogging)); FPM.addPass(SimplifyCFGPass()); FPM.addPass(InstCombinePass()); + + FPM.addPass(SSRInferencePass()); + if (EnableLoopFlatten) FPM.addPass(LoopFlattenPass()); // The loop passes in LPM2 (LoopIdiomRecognizePass, IndVarSimplifyPass, diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def index 24cdf2b8b6109..594e6568fc132 100644 --- a/llvm/lib/Passes/PassRegistry.def +++ b/llvm/lib/Passes/PassRegistry.def @@ -146,6 +146,7 @@ CGSCC_PASS("no-op-cgscc", NoOpCGSCCPass()) #ifndef FUNCTION_ANALYSIS #define FUNCTION_ANALYSIS(NAME, CREATE_PASS) #endif +FUNCTION_ANALYSIS("affine-access", AffineAccessAnalysis()) FUNCTION_ANALYSIS("aa", AAManager()) FUNCTION_ANALYSIS("assumptions", AssumptionAnalysis()) FUNCTION_ANALYSIS("block-freq", BlockFrequencyAnalysis()) @@ -190,6 +191,9 @@ FUNCTION_ALIAS_ANALYSIS("tbaa", TypeBasedAA()) #ifndef FUNCTION_PASS #define FUNCTION_PASS(NAME, CREATE_PASS) #endif +FUNCTION_PASS("affine-access-pass", AffineAccessAnalysisPass(dbgs())) +FUNCTION_PASS("infer-ssr", SSRInferencePass()) +FUNCTION_PASS("generate-ssr-intrinsics", SSRGenerationPass()) FUNCTION_PASS("aa-eval", AAEvaluator()) FUNCTION_PASS("adce", ADCEPass()) FUNCTION_PASS("add-discriminators", AddDiscriminatorsPass()) @@ -387,7 +391,6 @@ LOOP_PASS("canon-freeze", CanonicalizeFreezeInLoopsPass()) LOOP_PASS("dot-ddg", DDGDotPrinterPass()) LOOP_PASS("invalidate", InvalidateAllAnalysesPass()) LOOP_PASS("licm", LICMPass()) -LOOP_PASS("infer-ssr", SSRInferencePass()) LOOP_PASS("loop-idiom", LoopIdiomRecognizePass()) LOOP_PASS("loop-instsimplify", LoopInstSimplifyPass()) LOOP_PASS("loop-interchange", LoopInterchangePass()) diff --git a/llvm/lib/Transforms/SSR/SSRInference.cpp b/llvm/lib/Transforms/SSR/SSRInference.cpp index 92e072b443175..6ca8d6487d5d4 100644 --- a/llvm/lib/Transforms/SSR/SSRInference.cpp +++ b/llvm/lib/Transforms/SSR/SSRInference.cpp @@ -9,6 +9,7 
@@ #include "llvm/Transforms/SSR/SSRInference.h" #include "llvm/InitializePasses.h" #include "llvm/Passes/PassBuilder.h" +#include "llvm/Target/TargetMachine.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" @@ -20,6 +21,8 @@ #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" +#include "llvm/Analysis/AffineAccessAnalysis.h" +#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" @@ -44,83 +47,30 @@ using namespace llvm; static cl::opt EnableSSRInference("ssr-inference", cl::Hidden, cl::init(false), cl::desc("inference of SSR intrinsics")); -SSRStream::SSRStream(Loop *L, ilist *setup, ArrayRef insts, - unsigned dim, Value *data, Value *bound, Value *stride, bool isStore) - : L(L), setup(setup), moveInsts(), isStore(isStore), _isgen(false), - dim(dim), data(data), bound(bound), stride(stride), dm(-1), conflicts() - { - moveInsts.append::iterator>(insts.begin(), insts.end()); - assert(L && setup && "input not null"); - assert(dim > 0 && "correct dimension"); - assert(data->getType() == Type::getInt8PtrTy(L->getHeader()->getContext())); - assert(bound->getType() == Type::getInt32Ty(L->getHeader()->getContext())); - assert(stride->getType() == Type::getInt32Ty(L->getHeader()->getContext())); - } - -int SSRStream::getDM() { return dm; } -void SSRStream::setDM(int dmId) { dm = dmId; } - -void SSRStream::GenerateSSRInstructions(){ - assert(!_isgen && "this stream has not generated its instructions yet"); - this->_isgen = true; - assert(this->dm >= 0 && this->dm < SSR_NUM_DMS && "stream has valid dm id"); - - Module *mod = L->getHeader()->getModule(); //module for function declarations, TODO: is this the correct one? 
- IntegerType *i32 = IntegerType::getInt32Ty(L->getHeader()->getContext()); - - Instruction *point = L->getLoopPreheader()->getTerminator(); +PreservedAnalyses SSRGenerationPass::run(Function &F, FunctionAnalysisManager &FAM){ + AffineAccess &AF = FAM.getResult(F); - Instruction *i = &setup->front(); - while(i){ - Instruction *iNext = setup->getNextNode(*i); - i->insertBefore(point); - i = iNext; + for (auto A : AF.getAll()){ + A.dump(); } - IRBuilder<> builder(point); - - ConstantInt *dm = ConstantInt::get(i32, this->dm); //datamover id, ty=i32 - ConstantInt *dim = ConstantInt::get(i32, this->dim - 1); //dimension - 1, ty=i32 - // data pointer, ty=i8* - Function *SSRReadSetup = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_read_imm); //can take _imm bc dm and dim are constant - std::array args = {dm, dim, data}; - builder.CreateCall(SSRReadSetup->getFunctionType(), SSRReadSetup, ArrayRef(args)); - - errs()<<"generated ssr_read_imm \n"; - - ConstantInt *rep; //repetition - 1, ty=i32 - rep = ConstantInt::get(i32, moveInsts.size()); - Function *SSRRepetitionSetup = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_setup_repetition); - std::array repargs = {dm, rep}; - builder.CreateCall(SSRRepetitionSetup->getFunctionType(), SSRRepetitionSetup, ArrayRef(repargs)); - - errs()<<"generated ssr_setup_repetitions \n"; - - //bound - 1, ty=i32, relative stride, ty=i32 - Function *SSRBoundStrideSetup1D = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_setup_bound_stride_1d); - std::array bsargs = {dm, bound, stride}; - builder.CreateCall(SSRBoundStrideSetup1D->getFunctionType(), SSRBoundStrideSetup1D, ArrayRef(bsargs)); - - errs()<<"generated ssr_setup_bound_stride_1d \n"; - - if (isStore){ - errs()<<"store not done yet \n"; - }else{ - Function *SSRPop = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_pop); - std::array poparg = {dm}; - for (Instruction *I : moveInsts){ - builder.SetInsertPoint(I); - Value *v = builder.CreateCall(SSRPop->getFunctionType(), 
SSRPop, ArrayRef(poparg), "ssr.pop"); - BasicBlock::iterator ii(I); - ReplaceInstWithValue(I->getParent()->getInstList(), ii, v); - } - } + return PreservedAnalyses::none(); +} - return; +PreservedAnalyses SSRInferencePass::run(Function &F, FunctionAnalysisManager &FAM){ + FunctionPassManager FPM(true); + FPM.addPass(LoopSimplifyPass()); //need this before AffineAccessAnalysis in SSRGenerationPass + FPM.addPass(SSRGenerationPass()); + return FPM.run(F, FAM); } -namespace{ +/* + const SCEV *bt = SE->getBackedgeTakenCount(L); + SCEVExpander ex(*SE, L->getHeader()->getModule()->getDataLayout(), "backedge.taken"); + ex.setInsertPoint(L->getLoopPreheader()->getTerminator()); + Value *v = ex.expandCodeFor(bt); + v->dump(); Value *SCEVtoValues(const SCEV *scev, ilist *insns){ assert(scev && insns && "arguments should not be null"); @@ -207,59 +157,8 @@ ConstantInt *SCEVtoConstStep(const SCEV *scev, const SCEV *init, Loop *L){ return nullptr; } -bool runOnLoop( - Loop *L, AAResults *AA, LoopInfo *LI, DominatorTree *DT, - BlockFrequencyInfo *BFI, TargetLibraryInfo *TLI, TargetTransformInfo *TTI, - ScalarEvolution *SE, MemorySSA *MSSA) { - L->dump(); - - if (!L->isInnermost() || !L->isLCSSAForm(*DT)){ - errs()<<"loop not innermost or not LCSSA\n"; - return false; - } - - BasicBlock *preheader = L->getLoopPreheader(); - if (!preheader){ - //TODO: can alleviate this by adding one if SSR setup needed - errs()<<"loop has no preheader\n"; - return false; - } - - BasicBlock *exit = L->getExitBlock(); - if (!exit){ - errs()<<"loop has no or multiple exits\n"; - return false; - } - - BasicBlock *exiting = L->getExitingBlock(); - if (!exiting){ - errs()<<"no, or multiple exiting blocks \n"; - return false; - } - - if (!L->hasDedicatedExits()){ - //TODO: relatively easily fixable ==> add block before single exit block - errs()<<"exit block is not dedicated \n"; - return false; - } - if (L->getNumBackEdges() != 1){ - errs()<<"# of back-edges is not 1 \n"; - return false; - } - - if 
(!SE->hasLoopInvariantBackedgeTakenCount(L)){ - errs()<<"back-edge taken count is not loop-inv\n"; - return false; - } - - const SCEV *bt = SE->getBackedgeTakenCount(L); - - ilist *insns = new iplist(); - - Value *repcount = SCEVtoValues(bt, insns); - - std::vector streams; +std::vector streams; errs()<<"instructions with SSR replacement potential\n"; for (BasicBlock *BB : L->blocks()){ @@ -353,7 +252,179 @@ bool runOnLoop( errs()<<"done with loop:\n"; L->dump(); - return Changed; +class SSRStream{ +public: + SSRStream(Loop *L, ilist *setup, ArrayRef insts, + unsigned dim, Value *data, Value *bound, Value *stride, bool isStore); + + int getDM(); + void setDM(int dmId); + + void GenerateSSRInstructions(); + +private: + Loop *L; + ilist *setup; + + SmallVector moveInsts; //likely to be just one or maybe two load/store insts + + bool isStore; + + bool _isgen; + + unsigned dim; + Value *data; + Value *bound; + Value *stride; + + int dm; //"color" + SmallVector conflicts; //"edges" to conflicting SSRStreams +}; + +SSRStream::SSRStream(Loop *L, ilist *setup, ArrayRef insts, + unsigned dim, Value *data, Value *bound, Value *stride, bool isStore) + : L(L), setup(setup), moveInsts(), isStore(isStore), _isgen(false), + dim(dim), data(data), bound(bound), stride(stride), dm(-1), conflicts() + { + moveInsts.append::iterator>(insts.begin(), insts.end()); + assert(L && setup && "input not null"); + assert(dim > 0 && "correct dimension"); + assert(data->getType() == Type::getInt8PtrTy(L->getHeader()->getContext())); + assert(bound->getType() == Type::getInt32Ty(L->getHeader()->getContext())); + assert(stride->getType() == Type::getInt32Ty(L->getHeader()->getContext())); + } + +int SSRStream::getDM() { return dm; } +void SSRStream::setDM(int dmId) { dm = dmId; } + +void SSRStream::GenerateSSRInstructions(){ + assert(!_isgen && "this stream has not generated its instructions yet"); + this->_isgen = true; + assert(this->dm >= 0 && this->dm < SSR_NUM_DMS && "stream has valid dm id"); 
+ + Module *mod = L->getHeader()->getModule(); //module for function declarations, TODO: is this the correct one? + IntegerType *i32 = IntegerType::getInt32Ty(L->getHeader()->getContext()); + + Instruction *point = L->getLoopPreheader()->getTerminator(); + + Instruction *i = &setup->front(); + while(i){ + Instruction *iNext = setup->getNextNode(*i); + i->insertBefore(point); + i = iNext; + } + + IRBuilder<> builder(point); + + ConstantInt *dm = ConstantInt::get(i32, this->dm); //datamover id, ty=i32 + ConstantInt *dim = ConstantInt::get(i32, this->dim - 1); //dimension - 1, ty=i32 + // data pointer, ty=i8* + Function *SSRReadSetup = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_read_imm); //can take _imm bc dm and dim are constant + std::array args = {dm, dim, data}; + builder.CreateCall(SSRReadSetup->getFunctionType(), SSRReadSetup, ArrayRef(args)); + + errs()<<"generated ssr_read_imm \n"; + + ConstantInt *rep; //repetition - 1, ty=i32 + rep = ConstantInt::get(i32, moveInsts.size()); + Function *SSRRepetitionSetup = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_setup_repetition); + std::array repargs = {dm, rep}; + builder.CreateCall(SSRRepetitionSetup->getFunctionType(), SSRRepetitionSetup, ArrayRef(repargs)); + + errs()<<"generated ssr_setup_repetitions \n"; + + //bound - 1, ty=i32, relative stride, ty=i32 + Function *SSRBoundStrideSetup1D = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_setup_bound_stride_1d); + std::array bsargs = {dm, bound, stride}; + builder.CreateCall(SSRBoundStrideSetup1D->getFunctionType(), SSRBoundStrideSetup1D, ArrayRef(bsargs)); + + errs()<<"generated ssr_setup_bound_stride_1d \n"; + + if (isStore){ + errs()<<"store not done yet \n"; + }else{ + Function *SSRPop = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_pop); + std::array poparg = {dm}; + for (Instruction *I : moveInsts){ + builder.SetInsertPoint(I); + Value *v = builder.CreateCall(SSRPop->getFunctionType(), SSRPop, ArrayRef(poparg), "ssr.pop"); + 
BasicBlock::iterator ii(I); + ReplaceInstWithValue(I->getParent()->getInstList(), ii, v); + } + } + + return; +} + + +namespace{ + +/// guarantees: +/// L has 1 preheader and 1 dedicated exit +/// L has 1 backedge and 1 exiting block +/// bt SCEV can be expanded to instructions at insertionsPoint +bool checkLoop(const Loop *L, DominatorTree *DT, ScalarEvolution *SE, Instruction *InsertionPoint){ + if (!L->isLCSSAForm(*DT) || !L->getLoopPreheader() || !L->getExitBlock() + || !L->getExitBlock() || !L->hasDedicatedExits() || L->getNumBackEdges() != 1){ + errs()<<"malformed loop: "; L->dump(); + return false; + } + if (!SE->hasLoopInvariantBackedgeTakenCount(L)){ + errs()<<"cannot calculate backedge taken count\n"; + return false; + } + const SCEV *bt = SE->getBackedgeTakenCount(L); + if(!isSafeToExpandAt(bt, InsertionPoint, *SE) /|| !SE->isAvailableAtLoopEntry(bt, L)/){ + errs()<<"cannot expand bt SCEV: "; bt->dump(); + } + errs()<<"loop is well-formed: "; bt->dump(); + return true; +} + +/// check whether BB is on all controlflow paths from header to header +bool isOnAllControlFlowPaths(const BasicBlock *BB, const Loop *L, const DominatorTree *DT){ + return DT->dominates(BB, L->getHeader()); +} + +bool runOnLoop( + const Loop *L, AAResults *AA, LoopInfo *LI, DominatorTree *DT, + BlockFrequencyInfo *BFI, TargetLibraryInfo *TLI, TargetTransformInfo *TTI, + ScalarEvolution *SE, MemorySSA *MSSA) { + L->dump(); + + if (!L->getLoopPreheader() || !checkLoop(L, DT, SE, L->getLoopPreheader()->getTerminator())) return true; + + SmallVector streams; + + for (const auto &BB : L->getBlocks()){ + //TODO: how to allow inner loops? 
+ if (!isOnAllControlFlowPaths(BB, L, DT)) continue; + + for (auto &I : *BB){ + Value *Addr; + if (LoadInst *Load = dyn_cast(&I)){ + Addr = Load->getPointerOperand(); + }else if (StoreInst *Store = dyn_cast(&I)){ + Addr = Store->getPointerOperand(); + }else{ + continue; //cannot do anything with this instruction + } + + const SCEV *AddrSCEV = SE->getSCEVAtScope(Addr, L); + if (!SE->hasComputableLoopEvolution(AddrSCEV, L)) continue; + errs()<<"has computable loop evolution: "; AddrSCEV->dump(); + + auto split = SE->SplitIntoInitAndPostInc(L, AddrSCEV); + const SCEV *SetupAddrSCEV = split.first; + const SCEV *PostIncSCEV = split.second; + if (!isSafeToExpandAt(SetupAddrSCEV, L->getLoopPreheader()->getTerminator(), *SE)) continue; + errs()<<"can expand setup addr scev in preheader: "; SetupAddrSCEV->dump(); + + + } + } + + return true; } } //end of namespace @@ -361,9 +432,9 @@ bool runOnLoop( PreservedAnalyses SSRInferencePass::run(Loop &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR, LPMUpdater &){ //if (!EnableSSRInference) return PreservedAnalyses::all(); //if flag is not set, skip errs()<<"# =============== SSR Inference =============== #\n"; - if(!runOnLoop(&L, &AR.AA, &AR.LI, &AR.DT, AR.BFI, &AR.TLI, &AR.TTI, &AR.SE, AR.MSSA)){ - //return PreservedAnalyses::all(); - } + runOnLoop(&L, &AR.AA, &AR.LI, &AR.DT, AR.BFI, &AR.TLI, &AR.TTI, &AR.SE, AR.MSSA); + errs()<<"# =============== SSR done =============== #\n"; return PreservedAnalyses::none(); } +*/ \ No newline at end of file From 91b35ac6fecb67e9b5df3b6cd656a88ba47312b9 Mon Sep 17 00:00:00 2001 From: ThomasRupf Date: Fri, 15 Apr 2022 23:21:43 +0200 Subject: [PATCH 11/47] working 1d ssr-inference --- .../llvm/Analysis/AffineAccessAnalysis.h | 39 +- .../llvm/Transforms/SSR/SSRGeneration.h | 23 + .../llvm/Transforms/SSR/SSRInference.h | 6 - llvm/lib/Analysis/AffineAccessAnalysis.cpp | 168 ++++++-- llvm/lib/Passes/PassBuilder.cpp | 6 +- llvm/lib/Passes/PassRegistry.def | 4 +- 
llvm/lib/Transforms/SSR/CMakeLists.txt | 1 + llvm/lib/Transforms/SSR/SSRGeneration.cpp | 172 ++++++++ llvm/lib/Transforms/SSR/SSRInference.cpp | 401 +----------------- 9 files changed, 372 insertions(+), 448 deletions(-) create mode 100644 llvm/include/llvm/Transforms/SSR/SSRGeneration.h create mode 100644 llvm/lib/Transforms/SSR/SSRGeneration.cpp diff --git a/llvm/include/llvm/Analysis/AffineAccessAnalysis.h b/llvm/include/llvm/Analysis/AffineAccessAnalysis.h index 673b3831fb5fa..b811ca0f39c52 100644 --- a/llvm/include/llvm/Analysis/AffineAccessAnalysis.h +++ b/llvm/include/llvm/Analysis/AffineAccessAnalysis.h @@ -10,29 +10,41 @@ namespace llvm { +class AffineAccess; + class AffineAcc{ -public: - AffineAcc(const Loop *L, ArrayRef instructions, const SCEV *data, const SCEV *bound, const SCEV *stride); - void dump(); - unsigned getDimension(); + friend AffineAccess; private: void addLoop(const Loop *L, const SCEV *bound, const SCEV *stride); //add dimension const SCEV *data; - SmallVector bounds; //from outer- to innermost loop - SmallVector strides; //from outer- to innermost loop - SmallVector instructions; //instructions that are accessing the memory according to data, bounds, and strides. 
+ SmallVector bounds; //from outer- to innermost loop + SmallVector strides; //from outer- to innermost loop + Instruction *Addr; + SmallVector accesses; //load/store instructions that use address (guaranteed to be in same loop) const Loop *L; //outermost loop + +public: + AffineAcc(const Loop *L, Instruction *Addr, ArrayRef accesses, const SCEV *data, const SCEV *bound, const SCEV *stride); + void dump() const; + unsigned getDimension() const; + const Loop *getLoop() const; + Instruction *getAddrIns() const; + const SmallVector &getAccesses() const; }; class AffineAccess{ -public: - AffineAccess(ScalarEvolution &SE) :SE(SE) {} - void addAccess(AffineAcc &a); - ArrayRef getAll(); private: - SmallVector accesses; + SmallVector accesses; ScalarEvolution &SE; +public: + AffineAccess(ScalarEvolution &SE) : SE(SE) {} + void addAccess(const AffineAcc *a); + ArrayRef getAll() const; + + Value *expandData(const AffineAcc *aa, Type *ty = (Type *)nullptr) const; + Value *expandBound(const AffineAcc *aa, unsigned i, Type *ty = (Type *)nullptr) const; + Value *expandStride(const AffineAcc *aa, unsigned i, Type *ty = (Type *)nullptr) const; }; class AffineAccessAnalysis : public AnalysisInfoMixin { @@ -46,10 +58,7 @@ class AffineAccessAnalysis : public AnalysisInfoMixin { // This is the analysis pass that will be invocable via opt class AffineAccessAnalysisPass : public AnalysisInfoMixin { - raw_ostream &OS; - public: - explicit AffineAccessAnalysisPass(raw_ostream &OS) : OS(OS) {} PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); }; diff --git a/llvm/include/llvm/Transforms/SSR/SSRGeneration.h b/llvm/include/llvm/Transforms/SSR/SSRGeneration.h new file mode 100644 index 0000000000000..20262f3016d32 --- /dev/null +++ b/llvm/include/llvm/Transforms/SSR/SSRGeneration.h @@ -0,0 +1,23 @@ +//===-- SSRInference.h - Infer SSR usage ------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_SSR_SSRGENERATION_H +#define LLVM_TRANSFORMS_SSR_SSRGENERATION_H + +#include "llvm/IR/PassManager.h" + +namespace llvm { + +class SSRGenerationPass : public PassInfoMixin{ +public: + PreservedAnalyses run(Function &F, FunctionAnalysisManager &FAM); +}; + +} // namespace llvm + +#endif // LLVM_TRANSFORMS_SSR_SSRGENERATION_H diff --git a/llvm/include/llvm/Transforms/SSR/SSRInference.h b/llvm/include/llvm/Transforms/SSR/SSRInference.h index 250feff7d5d48..3a95c68da2fce 100644 --- a/llvm/include/llvm/Transforms/SSR/SSRInference.h +++ b/llvm/include/llvm/Transforms/SSR/SSRInference.h @@ -10,7 +10,6 @@ #define LLVM_TRANSFORMS_SSR_SSRINFERENCE_H #include "llvm/IR/PassManager.h" -#include "llvm/Support/CommandLine.h" namespace llvm { @@ -19,11 +18,6 @@ class SSRInferencePass : public PassInfoMixin { PreservedAnalyses run(Function &F, FunctionAnalysisManager &FAM); }; -class SSRGenerationPass : PassInfoMixin{ -public: - PreservedAnalyses run(Function &F, FunctionAnalysisManager &FAM); -}; - } // namespace llvm #endif // LLVM_TRANSFORMS_SSR_SSRINFERENCE_H diff --git a/llvm/lib/Analysis/AffineAccessAnalysis.cpp b/llvm/lib/Analysis/AffineAccessAnalysis.cpp index 5629a425545e8..6fd91965a8e3e 100644 --- a/llvm/lib/Analysis/AffineAccessAnalysis.cpp +++ b/llvm/lib/Analysis/AffineAccessAnalysis.cpp @@ -41,14 +41,14 @@ using namespace llvm; -AffineAcc::AffineAcc(const Loop *L, ArrayRef instructions, const SCEV *data, - const SCEV *bound, const SCEV *stride) : data(data), L(L){ - this->instructions.append(instructions.begin(), instructions.end()); - this->bounds.push_back(bound); - this->strides.push_back(stride); +AffineAcc::AffineAcc(const Loop *L, Instruction *Addr, ArrayRef accesses, + const SCEV *data, const SCEV *bound, const SCEV *stride) : 
data(data), Addr(Addr){ + for (Instruction *I : accesses) this->accesses.push_back(I); + addLoop(L, bound, stride); return; } +///note: stride can be 0 if access is inv w.r.t. this loop void AffineAcc::addLoop(const Loop *L, const SCEV *bound, const SCEV *stride){ this->L = L; this->bounds.push_back(bound); @@ -56,29 +56,66 @@ void AffineAcc::addLoop(const Loop *L, const SCEV *bound, const SCEV *stride){ return; } -unsigned AffineAcc::getDimension(){ +unsigned AffineAcc::getDimension() const{ return this->bounds.size(); } -void AffineAcc::dump(){ - errs()<<"strd,bd of "; this->data->dump(); +void AffineAcc::dump() const{ + errs()<<"Affine Access in Loop:\n"; + L->dump(); + errs()<<"With Addr instruction: "; Addr->dump(); + errs()<<"And the following load/store instructions:\n"; + for (Instruction *I : accesses){ + I->dump(); + } + errs()<<"data pointer: "; data->dump(); for (unsigned i = 0; i < this->getDimension(); i++){ - this->strides[i]->dump(); - this->bounds[i]->dump(); + errs()<<"dim "<<(i+1)<<" stride: "; strides[i]->dump(); + errs()<<"dim "<<(i+1)<<" bound: "; bounds[i]->dump(); } } +Instruction *AffineAcc::getAddrIns() const{ + return Addr; +} + +const Loop *AffineAcc::getLoop() const{ + return L; +} + +const SmallVector &AffineAcc::getAccesses() const{ + return this->accesses; +} + //================== AffineAcces, Result of Analysis ========================================= -void AffineAccess::addAccess(AffineAcc &a){ +void AffineAccess::addAccess(const AffineAcc *a){ this->accesses.push_back(a); return; } -ArrayRef AffineAccess::getAll(){ - ArrayRef ar(accesses.begin(), accesses.end()); - return ar; //copy +ArrayRef AffineAccess::getAll() const{ + ArrayRef *ar = new ArrayRef(accesses.begin(), accesses.end()); + return *ar; } + Value *AffineAccess::expandData(const AffineAcc *aa, Type *ty) const{ + SCEVExpander ex(SE, aa->L->getHeader()->getModule()->getDataLayout(), "data"); + ex.setInsertPoint(aa->L->getLoopPreheader()->getTerminator()); + return 
ex.expandCodeFor(aa->data, ty); + } + + Value *AffineAccess::expandBound(const AffineAcc *aa, unsigned i, Type *ty) const{ + SCEVExpander ex(SE, aa->L->getHeader()->getModule()->getDataLayout(), "bound"); + ex.setInsertPoint(aa->L->getLoopPreheader()->getTerminator()); + return ex.expandCodeFor(aa->bounds[i], ty); + } + + Value *AffineAccess::expandStride(const AffineAcc *aa, unsigned i, Type *ty) const{ + SCEVExpander ex(SE, aa->L->getHeader()->getModule()->getDataLayout(), "stride"); + ex.setInsertPoint(aa->L->getLoopPreheader()->getTerminator()); + return ex.expandCodeFor(aa->strides[i], ty); + } + //================== Affine Acces Analysis ================================================== namespace { @@ -88,11 +125,12 @@ namespace { /// L has 1 backedge and 1 exiting block /// bt SCEV can be expanded to instructions at insertionsPoint bool checkLoop(const Loop *L, DominatorTree &DT, ScalarEvolution &SE, Instruction *InsertionPoint){ - if (!L->isLCSSAForm(DT) || !L->getLoopPreheader() || !L->getExitBlock() - || !L->getExitBlock() || !L->hasDedicatedExits() || L->getNumBackEdges() != 1){ - errs()<<"malformed loop: "; L->dump(); - return false; - } + if (!L->isLCSSAForm(DT)) { errs()<<"not LCSSA\n"; return false; } + if (!L->getLoopPreheader()) { errs()<<"no preheader\n"; return false; } + if (!L->getExitBlock()) { errs()<<"nr. exit blocks != 1\n"; return false; } + if (!L->hasDedicatedExits()) { errs()<<"exit is not dedicated\n"; return false; } + if (L->getNumBackEdges() != 1) { errs()<<"nr. back-edges != 1\n"; return false; } + if (!SE.hasLoopInvariantBackedgeTakenCount(L)){ errs()<<"cannot calculate backedge taken count\n"; return false; @@ -110,13 +148,61 @@ bool isOnAllControlFlowPaths(const BasicBlock *BB, const Loop *L, const Dominato return DT.dominates(BB, L->getHeader()); } +/// Fold over AddrSCEV +/// All AddRecSCEVs are dependent on L or loops contained in L (TODO: and on all paths?) 
+/// All steps in ADDRecSCEVs can be calculated in preheader of L +bool canFindStrides(ScalarEvolution &SE, const ArrayRef &loops, const SCEV *AddrSCEV, const SCEV *SetupAddrSCEV){ + errs()<<"finding strides in: "; AddrSCEV->dump(); + if (AddrSCEV == SetupAddrSCEV) return true; //trivially the same if this holds (bc const Ptr) + else{ + const SCEVPredicate *Peq = SE.getEqualPredicate(AddrSCEV, SetupAddrSCEV); + if (Peq->isAlwaysTrue()) return true; //if we arrive at setup addr scev, we are done + } + + if (loops.empty()) { errs()<<"not enough loops\n"; return false; } //need at least one more loop here for SCEVAddRecvExpr + + if (const auto *AR = dyn_cast(AddrSCEV)){ + auto L = loops.begin(); + while (L != loops.end() && AR->getLoop() != *L) ++L; //do header comparison instead? + if (L == loops.end()) { errs()<<"loops of addRecExpr not found\n"; return false; } + + const SCEV *Stride = AR->getStepRecurrence(SE); + const SCEV *Rest = AR->getStart(); + if (isSafeToExpandAt(Stride, (*L)->getLoopPreheader()->getTerminator(), SE)) { //if we can expand stride at loop entry + errs()<<"can expand stride: "; Stride->dump(); + return canFindStrides(SE, ArrayRef(++L, loops.end()), Rest, SetupAddrSCEV); //check Rest recursively + } + } + return false; +} + +SmallVector &getContainingLoops(const Loop *Outermost, Instruction *Ins){ + BasicBlock *BB = Ins->getParent(); + const auto &loops = Outermost->getLoopsInPreorder(); + SmallVector *r = new SmallVector(); + for (auto L = loops.rbegin(); L != loops.rend(); ++L){ //go through loops in reverse order ==> innermost first + if ((*L)->contains(BB)){ + errs()<<"found containing Loop\n"; + r->push_back(*L); + } + } + return *r; +} + AffineAccess &runOnFunction(Function &F, LoopInfo &LI, DominatorTree &DT, ScalarEvolution &SE, AAResults &AA){ AffineAccess *aa = new AffineAccess(SE); - for (const Loop *L : LI){ - errs()<<"loop: "; L->dump(); + auto loops = LI.getLoopsInPreorder(); + errs()<<"contains "<dump(); - if 
(!L->getLoopPreheader() || !checkLoop(L, DT, SE, L->getLoopPreheader()->getTerminator())) continue; + if (!L->isInnermost()) continue; //for now + + if (!L->getLoopPreheader()) { errs()<<"loop has no preheader\n"; continue; } + if (!checkLoop(L, DT, SE, L->getLoopPreheader()->getTerminator())) continue; for (const auto &BB : L->getBlocks()){ //TODO: how to allow inner loops? @@ -131,18 +217,45 @@ AffineAccess &runOnFunction(Function &F, LoopInfo &LI, DominatorTree &DT, Scalar }else{ continue; //cannot do anything with this instruction } + Instruction *AddrIns; + if (!(AddrIns = dyn_cast(Addr))) continue; //if Addr is not instruction ==> constant, or sth else (==> leave for other passes to opt) + errs()<<"looking at: "; AddrIns->dump(); - const SCEV *AddrSCEV = SE.getSCEVAtScope(Addr, L); + const SCEV *AddrSCEV = SE.getSCEV(Addr); if (!SE.hasComputableLoopEvolution(AddrSCEV, L)) continue; errs()<<"has computable loop evolution: "; AddrSCEV->dump(); auto split = SE.SplitIntoInitAndPostInc(L, AddrSCEV); - const SCEV *SetupAddrSCEV = split.first; - const SCEV *PostIncSCEV = split.second; + const SCEV *SetupAddrSCEV = split.first; //const SCEV *PostIncSCEV = split.second; if (!isSafeToExpandAt(SetupAddrSCEV, L->getLoopPreheader()->getTerminator(), SE)) continue; errs()<<"can expand setup addr scev in preheader: "; SetupAddrSCEV->dump(); + const auto &loops = getContainingLoops(L, AddrIns); + if (!canFindStrides(SE, ArrayRef(loops.begin(), loops.end()), AddrSCEV, SetupAddrSCEV)) continue; + errs()<<"can find loop Strides: "; AddrSCEV->dump(); + + const SCEV *TC = SE.getBackedgeTakenCount(L); + const auto *AddrRecSCEV = cast(AddrSCEV); + const SCEV *Str; + if (AddrRecSCEV->getLoop() == L){ + Str = cast(AddrSCEV)->getStepRecurrence(SE); //because 1D for now + }else{ + Str = SE.getConstant(APInt(64U, 0U)); + } + + std::vector accesses; + for (auto U = Addr->use_begin(); U != Addr->use_end(); ++U){ + Instruction *Acc = dyn_cast(U->getUser()); + if (!Acc) Acc = 
dyn_cast(U->getUser()); + + if (!Acc) continue; //both casts failed ==> not a suitable instruction + if (!isOnAllControlFlowPaths(Acc->getParent(), L, DT)) continue; //access does not occur consitently in loop ==> not suitable + + accesses.push_back(Acc); + } + aa->addAccess(new AffineAcc(L, AddrIns, ArrayRef(accesses), SetupAddrSCEV, TC, Str)); + errs()<<"added new AffineAcc\n"; } } @@ -170,10 +283,9 @@ AffineAccess AffineAccessAnalysis::run(Function &F, FunctionAnalysisManager &FAM //================== Affine Acces Analysis Pass for opt ======================================= PreservedAnalyses AffineAccessAnalysisPass::run(Function &F, FunctionAnalysisManager &FAM) { - this->OS<<"enable debugging to see:\n"; AffineAccess AA = FAM.getResult(F); - for (auto A : AA.getAll()){ - A.dump(); + for (const auto *A : AA.getAll()){ + A->dump(); } return PreservedAnalyses::all(); } \ No newline at end of file diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp index e3f1422552e34..36f0765b484c8 100644 --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -87,6 +87,7 @@ #include "llvm/Transforms/Coroutines/CoroSplit.h" #include "llvm/Transforms/HelloNew/HelloWorld.h" #include "llvm/Transforms/SSR/SSRInference.h" +#include "llvm/Transforms/SSR/SSRGeneration.h" #include "llvm/Transforms/IPO/AlwaysInliner.h" #include "llvm/Transforms/IPO/Annotation2Metadata.h" #include "llvm/Transforms/IPO/ArgumentPromotion.h" @@ -594,11 +595,10 @@ PassBuilder::buildO1FunctionSimplificationPipeline(OptimizationLevel Level, FPM.addPass(createFunctionToLoopPassAdaptor( std::move(LPM1), EnableMSSALoopDependency, /*UseBlockFrequencyInfo=*/true, DebugLogging)); + FPM.addPass(SSRInferencePass()); FPM.addPass(SimplifyCFGPass()); FPM.addPass(InstCombinePass()); - FPM.addPass(SSRInferencePass()); - if (EnableLoopFlatten) FPM.addPass(LoopFlattenPass()); // The loop passes in LPM2 (LoopFullUnrollPass) do not preserve MemorySSA. 
@@ -764,8 +764,6 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level, FPM.addPass(SimplifyCFGPass()); FPM.addPass(InstCombinePass()); - FPM.addPass(SSRInferencePass()); - if (EnableLoopFlatten) FPM.addPass(LoopFlattenPass()); // The loop passes in LPM2 (LoopIdiomRecognizePass, IndVarSimplifyPass, diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def index 594e6568fc132..cb16781e4ecd2 100644 --- a/llvm/lib/Passes/PassRegistry.def +++ b/llvm/lib/Passes/PassRegistry.def @@ -191,9 +191,9 @@ FUNCTION_ALIAS_ANALYSIS("tbaa", TypeBasedAA()) #ifndef FUNCTION_PASS #define FUNCTION_PASS(NAME, CREATE_PASS) #endif -FUNCTION_PASS("affine-access-pass", AffineAccessAnalysisPass(dbgs())) FUNCTION_PASS("infer-ssr", SSRInferencePass()) -FUNCTION_PASS("generate-ssr-intrinsics", SSRGenerationPass()) +FUNCTION_PASS("generate-ssr", SSRGenerationPass()) +FUNCTION_PASS("affine-access-pass", AffineAccessAnalysisPass()) FUNCTION_PASS("aa-eval", AAEvaluator()) FUNCTION_PASS("adce", ADCEPass()) FUNCTION_PASS("add-discriminators", AddDiscriminatorsPass()) diff --git a/llvm/lib/Transforms/SSR/CMakeLists.txt b/llvm/lib/Transforms/SSR/CMakeLists.txt index 1f2d21bcd55a5..d6dc690ed6f95 100644 --- a/llvm/lib/Transforms/SSR/CMakeLists.txt +++ b/llvm/lib/Transforms/SSR/CMakeLists.txt @@ -1,5 +1,6 @@ add_llvm_component_library(LLVMSSR SSRInference.cpp + SSRGeneration.cpp DEPENDS intrinsics_gen diff --git a/llvm/lib/Transforms/SSR/SSRGeneration.cpp b/llvm/lib/Transforms/SSR/SSRGeneration.cpp new file mode 100644 index 0000000000000..8db7575459939 --- /dev/null +++ b/llvm/lib/Transforms/SSR/SSRGeneration.cpp @@ -0,0 +1,172 @@ +//===-- SSRGeneration.cpp - Generate SSR --------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/SSR/SSRGeneration.h" +#include "llvm/InitializePasses.h" +#include "llvm/Passes/PassBuilder.h" +#include "llvm/Target/TargetMachine.h" + +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/IRBuilder.h" + +#include "llvm/Transforms/Utils/BasicBlockUtils.h" + +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/ScalarEvolutionExpressions.h" +#include "llvm/Analysis/AffineAccessAnalysis.h" +#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" + +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Support/Casting.h" + +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/IntrinsicsRISCV.h" + +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/TinyPtrVector.h" +#include "llvm/ADT/ilist.h" + +#include +#include + +#define NUM_SSR 3U + +//current state of hw: only allow doubles +#define CHECK_TYPE(I) (I->getType() == Type::getDoubleTy(I->getParent()->getContext())) + +using namespace llvm; + +namespace{ + +void generateSSR(AffineAccess &AA, const AffineAcc *aa, unsigned dmid, unsigned n_insts, bool isStore){ + BasicBlock *LoopPreheader = aa->getLoop()->getLoopPreheader(); + Module *mod = LoopPreheader->getModule(); + LLVMContext &ctxt = LoopPreheader->getContext(); + IntegerType *i32 = IntegerType::getInt32Ty(ctxt); + + IRBuilder<> builder(LoopPreheader->getTerminator()); + + ConstantInt *dm = ConstantInt::get(i32, dmid); //datamover id, ty=i32 + ConstantInt *dim = ConstantInt::get(i32, aa->getDimension() - 1U); //dimension - 1, ty=i32 + Value *data = AA.expandData(aa, Type::getInt8PtrTy(ctxt)); + Function *SSRReadSetup; + if 
(!isStore){ + SSRReadSetup = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_read_imm); //can take _imm bc dm and dim are constant + }else{ + SSRReadSetup = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_write_imm); //can take _imm bc dm and dim are constant + } + std::array args = {dm, dim, data}; + builder.CreateCall(SSRReadSetup->getFunctionType(), SSRReadSetup, ArrayRef(args)); + + errs()<<"generated ssr_read/write_imm \n"; + + ConstantInt *rep = ConstantInt::get(i32, n_insts - 1U); //repetition - 1, ty=i32 + Function *SSRRepetitionSetup = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_setup_repetition); + std::array repargs = {dm, rep}; + builder.CreateCall(SSRRepetitionSetup->getFunctionType(), SSRRepetitionSetup, ArrayRef(repargs)); + + errs()<<"generated ssr_setup_repetitions \n"; + + Intrinsic::RISCVIntrinsics functions[] = { + Intrinsic::riscv_ssr_setup_bound_stride_1d, + Intrinsic::riscv_ssr_setup_bound_stride_2d, + Intrinsic::riscv_ssr_setup_bound_stride_3d, + Intrinsic::riscv_ssr_setup_bound_stride_4d + }; + for (unsigned d = 0U; d < aa->getDimension(); d++){ + Value *bound = AA.expandBound(aa, d, i32); //bound - 1, ty=i32 + Value *stride = AA.expandStride(aa, d, i32); //relative stride, ty=i32 + + Function *SSRBoundStrideSetup = Intrinsic::getDeclaration(mod, functions[d]); + std::array bsargs = {dm, bound, stride}; + builder.CreateCall(SSRBoundStrideSetup->getFunctionType(), SSRBoundStrideSetup, ArrayRef(bsargs)); + + errs()<<"generated ssr_setup_bound_stride_"<<(d+1)<<"d \n"; + } + + if (isStore){ + Function *SSRPush = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_push); + for (Instruction *I : aa->getAccesses()){ + std::array pusharg = {dm, cast(I)->getValueOperand()}; + builder.SetInsertPoint(I); + builder.CreateCall(SSRPush->getFunctionType(), SSRPush, ArrayRef(pusharg)); + I->removeFromParent(); + } + }else{ + Function *SSRPop = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_pop); + std::array poparg = {dm}; + for 
(Instruction *I : aa->getAccesses()){ + builder.SetInsertPoint(I); + Value *v = builder.CreateCall(SSRPop->getFunctionType(), SSRPop, ArrayRef(poparg), "ssr.pop"); + BasicBlock::iterator ii(I); + ReplaceInstWithValue(I->getParent()->getInstList(), ii, v); + } + } + errs()<<"placed push/pop calls\n"; + return; +} + +void generateSSREnDis(const Loop *L){ + IRBuilder<> builder(L->getLoopPreheader()->getTerminator()); + Module *mod = L->getHeader()->getModule(); + Function *SSREnable = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_enable); + builder.CreateCall(SSREnable->getFunctionType(), SSREnable, ArrayRef()); + builder.SetInsertPoint(L->getExitBlock()->getTerminator()); + Function *SSRDisable = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_disable); + builder.CreateCall(SSRDisable->getFunctionType(), SSRDisable, ArrayRef()); + + errs()<<"generated ssr_enable and ssr_disable\n"; + return; +} + +} //end of namespace + +PreservedAnalyses SSRGenerationPass::run(Function &F, FunctionAnalysisManager &FAM){ + errs()<<"SSR Generation Pass on function: "<(F); + + + SmallPtrSet changedLoops; + + unsigned dmid = 0U; + for (const auto *A : AF.getAll()){ + if (dmid >= NUM_SSR) break; + + unsigned n_store = 0U; + unsigned n_load = 0U; + bool valid = true; + for (auto *I : A->getAccesses()){ + valid = valid && CHECK_TYPE(I); + if (dyn_cast(I)) n_load++; + else if(dyn_cast(I)) n_store++; + else assert(false && "non load/store instruction in AffineAcc::accesses ?"); + + if(!valid) break; + } + + errs()<<"current aa is "< 0U); + changedLoops.insert(A->getLoop()); + dmid++; + } + + for (const Loop *L : changedLoops) generateSSREnDis(L); + + return changedLoops.empty() ? 
PreservedAnalyses::all() : PreservedAnalyses::none(); +} \ No newline at end of file diff --git a/llvm/lib/Transforms/SSR/SSRInference.cpp b/llvm/lib/Transforms/SSR/SSRInference.cpp index 6ca8d6487d5d4..5e3477791e1bc 100644 --- a/llvm/lib/Transforms/SSR/SSRInference.cpp +++ b/llvm/lib/Transforms/SSR/SSRInference.cpp @@ -18,11 +18,13 @@ #include "llvm/IR/IRBuilder.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Scalar/ADCE.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/AffineAccessAnalysis.h" #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" +#include "llvm/Transforms/SSR/SSRGeneration.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" @@ -40,401 +42,14 @@ #include #include -#define SSR_NUM_DMS 3 - using namespace llvm; -static cl::opt EnableSSRInference("ssr-inference", cl::Hidden, cl::init(false), - cl::desc("inference of SSR intrinsics")); - -PreservedAnalyses SSRGenerationPass::run(Function &F, FunctionAnalysisManager &FAM){ - AffineAccess &AF = FAM.getResult(F); - - for (auto A : AF.getAll()){ - A.dump(); - } - - return PreservedAnalyses::none(); -} - PreservedAnalyses SSRInferencePass::run(Function &F, FunctionAnalysisManager &FAM){ + errs()<<"SSR Inference Pass on function: "<getBackedgeTakenCount(L); - SCEVExpander ex(*SE, L->getHeader()->getModule()->getDataLayout(), "backedge.taken"); - ex.setInsertPoint(L->getLoopPreheader()->getTerminator()); - Value *v = ex.expandCodeFor(bt); - v->dump(); - -Value *SCEVtoValues(const SCEV *scev, ilist *insns){ - assert(scev && insns && "arguments should not be null"); - errs()<<"\t"; - scev->dump(); - switch (scev->getSCEVType()) - { - case SCEVTypes::scConstant: - { - return cast(scev)->getValue(); - } - case SCEVTypes::scUnknown: - { - return cast(scev)->getValue(); - } - case SCEVTypes::scTruncate: - case SCEVTypes::scZeroExtend: - case SCEVTypes::scSignExtend: - { - const 
SCEVCastExpr *castSCEV = cast(scev); - Value *v = SCEVtoValues(castSCEV->getOperand(0), insns); - if (v){ - Instruction *i; - switch (scev->getSCEVType()) - { - case SCEVTypes::scTruncate: - i = CastInst::CreateTruncOrBitCast(v, castSCEV->getType(), "scev,trunc"); - break; - case SCEVTypes::scZeroExtend: - i = CastInst::CreateZExtOrBitCast(v, castSCEV->getType(), "scev.zext"); - break; - case SCEVTypes::scSignExtend: - i = CastInst::CreateSExtOrBitCast(v, castSCEV->getType(), "scev.sext"); - break; - default: - assert(false && "should not happen!"); - break; - } - insns->push_back(i); - return i; - } - return nullptr; - } - case SCEVTypes::scAddExpr: - case SCEVTypes::scMulExpr: - { - const SCEVCommutativeExpr *binopSCEV = cast(scev); - Value *v1 = SCEVtoValues(binopSCEV->getOperand(0), insns); - Value *v2 = SCEVtoValues(binopSCEV->getOperand(1), insns); - if (v1 && v2){ - Instruction *binop; - if (binopSCEV->getSCEVType() == SCEVTypes::scAddExpr) { - binop = BinaryOperator::CreateAdd(v1, v2, "rcev.add"); - } else { - binop = BinaryOperator::CreateMul(v1, v2, "rcev.mul"); - } - insns->push_back(binop); - return binop; - } - return nullptr; - } - default: - { - errs()<<"encountered some weird SCEVType:\n"; - scev->dump(); - return nullptr; - } - } -} - -ConstantInt *SCEVtoConstStep(const SCEV *scev, const SCEV *init, Loop *L){ - //FIXME: lots to do better here - errs()<<"trying to find stepsize\n"; - scev->dump(); - if(const SCEVAddRecExpr *rec = dyn_cast(scev)){ - errs()<<"add-rec-expr at root\n"; - if (rec->getLoop() == L && rec->getOperand(0) == init){ - errs()<<"loop and init match\n"; - if (const SCEVConstant *c = dyn_cast(rec->getOperand(1))){ - return dyn_cast(c->getValue()); - } - } - } - return nullptr; -} - - -std::vector streams; - - errs()<<"instructions with SSR replacement potential\n"; - for (BasicBlock *BB : L->blocks()){ - //FIXME: check whether block is on all paths from header to itself - for (auto &I : *BB){ - if (LoadInst *load = dyn_cast(&I)){ 
- if (load->getType() != Type::getDoubleTy(preheader->getContext())) continue; - Value *addr = load->getOperand(0); - const SCEV *addrScev = SE->getSCEV(addr); - if (!SE->hasComputableLoopEvolution(addrScev, L)) { - errs()<<"addrScev has no computable loop evolution:\n"; - addrScev->dump(); - continue; - } - errs()<<"load instr, addr instr, and scev of address:\n"; - load->dump(); addr->dump(); addrScev->dump(); - - auto split = SE->SplitIntoInitAndPostInc(L, addrScev); - const SCEV *init = split.first; - //const SCEV *step = split.second; - - ilist *setup = new iplist(); - - Value *baseAddr = SCEVtoValues(init, setup); - - assert(baseAddr && "some weird SCEV in init SCEV"); - errs()<<"init and it's value:\n"; - init->dump(); baseAddr->dump(); - - ConstantInt *stepsize = SCEVtoConstStep(addrScev, init, L); - if (!stepsize){ - errs()<<"failed to compute stepsize\n"; - return false; - } - errs()<<"step value:\n"; - stepsize->dump(); - - Instruction *data = CastInst::CreatePointerCast(baseAddr, Type::getInt8PtrTy(preheader->getContext()), "data.cast"); - setup->push_back(data); - Instruction *bound = CastInst::CreateIntegerCast(repcount, IntegerType::getInt32Ty(preheader->getContext()), false, "bound.cast"); - setup->push_back(bound); - Instruction *stride = CastInst::CreateIntegerCast(stepsize, IntegerType::getInt32Ty(preheader->getContext()), false, "stride.cast"); - setup->push_back(stride); - SSRStream *s = new SSRStream(L, setup, ArrayRef(load), 1, data, bound, stride, false); - streams.push_back(s); - errs()<<"constructed SSRStream \n"; - }else if(StoreInst *store = dyn_cast(&I)){ - store->dump(); - } - } - } - - bool Changed = false; - - unsigned dmid = 0; - - for (SSRStream *s : streams){ - if (dmid >= SSR_NUM_DMS) break; - s->setDM(dmid++); - s->GenerateSSRInstructions(); - Changed = true; - } - - if (Changed){ - //SE->forgetLoop(L); //TODO: maybe use SE->forgetValue instead - errs()<<"inserting insns into preheader:\n"; - Instruction *c = &insns->back(); - 
while (c) { - Instruction *c_ = insns->getPrevNode(*c); - c->dump(); - c->insertBefore(&*preheader->begin()); - c = c_; - } - - //add SSRenable and -disable calls in preheader and exit - IRBuilder<> builder(preheader->getTerminator()); - Module *mod = preheader->getModule(); - std::array emptyargs = {}; - Function *SSREnable = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_enable); - builder.CreateCall(SSREnable->getFunctionType(), SSREnable, ArrayRef(emptyargs)); - Function *SSRDisable = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_disable); - builder.SetInsertPoint(L->getExitBlock()->getTerminator()); - builder.CreateCall(SSRDisable->getFunctionType(), SSRDisable, ArrayRef(emptyargs)); - - }else{ - while(!insns->empty()){ - insns->pop_back(); //delete instructions from back to from to not get live Use when Def is deleted - } - } - - errs()<<"done with loop:\n"; - L->dump(); - -class SSRStream{ -public: - SSRStream(Loop *L, ilist *setup, ArrayRef insts, - unsigned dim, Value *data, Value *bound, Value *stride, bool isStore); - - int getDM(); - void setDM(int dmId); - - void GenerateSSRInstructions(); - -private: - Loop *L; - ilist *setup; - - SmallVector moveInsts; //likely to be just one or maybe two load/store insts - - bool isStore; - - bool _isgen; - - unsigned dim; - Value *data; - Value *bound; - Value *stride; - - int dm; //"color" - SmallVector conflicts; //"edges" to conflicting SSRStreams -}; - -SSRStream::SSRStream(Loop *L, ilist *setup, ArrayRef insts, - unsigned dim, Value *data, Value *bound, Value *stride, bool isStore) - : L(L), setup(setup), moveInsts(), isStore(isStore), _isgen(false), - dim(dim), data(data), bound(bound), stride(stride), dm(-1), conflicts() - { - moveInsts.append::iterator>(insts.begin(), insts.end()); - assert(L && setup && "input not null"); - assert(dim > 0 && "correct dimension"); - assert(data->getType() == Type::getInt8PtrTy(L->getHeader()->getContext())); - assert(bound->getType() == 
Type::getInt32Ty(L->getHeader()->getContext())); - assert(stride->getType() == Type::getInt32Ty(L->getHeader()->getContext())); - } - -int SSRStream::getDM() { return dm; } -void SSRStream::setDM(int dmId) { dm = dmId; } - -void SSRStream::GenerateSSRInstructions(){ - assert(!_isgen && "this stream has not generated its instructions yet"); - this->_isgen = true; - assert(this->dm >= 0 && this->dm < SSR_NUM_DMS && "stream has valid dm id"); - - Module *mod = L->getHeader()->getModule(); //module for function declarations, TODO: is this the correct one? - IntegerType *i32 = IntegerType::getInt32Ty(L->getHeader()->getContext()); - - Instruction *point = L->getLoopPreheader()->getTerminator(); - - Instruction *i = &setup->front(); - while(i){ - Instruction *iNext = setup->getNextNode(*i); - i->insertBefore(point); - i = iNext; - } - - IRBuilder<> builder(point); - - ConstantInt *dm = ConstantInt::get(i32, this->dm); //datamover id, ty=i32 - ConstantInt *dim = ConstantInt::get(i32, this->dim - 1); //dimension - 1, ty=i32 - // data pointer, ty=i8* - Function *SSRReadSetup = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_read_imm); //can take _imm bc dm and dim are constant - std::array args = {dm, dim, data}; - builder.CreateCall(SSRReadSetup->getFunctionType(), SSRReadSetup, ArrayRef(args)); - - errs()<<"generated ssr_read_imm \n"; - - ConstantInt *rep; //repetition - 1, ty=i32 - rep = ConstantInt::get(i32, moveInsts.size()); - Function *SSRRepetitionSetup = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_setup_repetition); - std::array repargs = {dm, rep}; - builder.CreateCall(SSRRepetitionSetup->getFunctionType(), SSRRepetitionSetup, ArrayRef(repargs)); - - errs()<<"generated ssr_setup_repetitions \n"; - - //bound - 1, ty=i32, relative stride, ty=i32 - Function *SSRBoundStrideSetup1D = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_setup_bound_stride_1d); - std::array bsargs = {dm, bound, stride}; - 
builder.CreateCall(SSRBoundStrideSetup1D->getFunctionType(), SSRBoundStrideSetup1D, ArrayRef(bsargs)); - - errs()<<"generated ssr_setup_bound_stride_1d \n"; - - if (isStore){ - errs()<<"store not done yet \n"; - }else{ - Function *SSRPop = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_pop); - std::array poparg = {dm}; - for (Instruction *I : moveInsts){ - builder.SetInsertPoint(I); - Value *v = builder.CreateCall(SSRPop->getFunctionType(), SSRPop, ArrayRef(poparg), "ssr.pop"); - BasicBlock::iterator ii(I); - ReplaceInstWithValue(I->getParent()->getInstList(), ii, v); - } - } - - return; -} - - -namespace{ - -/// guarantees: -/// L has 1 preheader and 1 dedicated exit -/// L has 1 backedge and 1 exiting block -/// bt SCEV can be expanded to instructions at insertionsPoint -bool checkLoop(const Loop *L, DominatorTree *DT, ScalarEvolution *SE, Instruction *InsertionPoint){ - if (!L->isLCSSAForm(*DT) || !L->getLoopPreheader() || !L->getExitBlock() - || !L->getExitBlock() || !L->hasDedicatedExits() || L->getNumBackEdges() != 1){ - errs()<<"malformed loop: "; L->dump(); - return false; - } - if (!SE->hasLoopInvariantBackedgeTakenCount(L)){ - errs()<<"cannot calculate backedge taken count\n"; - return false; - } - const SCEV *bt = SE->getBackedgeTakenCount(L); - if(!isSafeToExpandAt(bt, InsertionPoint, *SE) /|| !SE->isAvailableAtLoopEntry(bt, L)/){ - errs()<<"cannot expand bt SCEV: "; bt->dump(); - } - errs()<<"loop is well-formed: "; bt->dump(); - return true; -} - -/// check whether BB is on all controlflow paths from header to header -bool isOnAllControlFlowPaths(const BasicBlock *BB, const Loop *L, const DominatorTree *DT){ - return DT->dominates(BB, L->getHeader()); -} - -bool runOnLoop( - const Loop *L, AAResults *AA, LoopInfo *LI, DominatorTree *DT, - BlockFrequencyInfo *BFI, TargetLibraryInfo *TLI, TargetTransformInfo *TTI, - ScalarEvolution *SE, MemorySSA *MSSA) { - L->dump(); - - if (!L->getLoopPreheader() || !checkLoop(L, DT, SE, 
L->getLoopPreheader()->getTerminator())) return true; - - SmallVector streams; - - for (const auto &BB : L->getBlocks()){ - //TODO: how to allow inner loops? - if (!isOnAllControlFlowPaths(BB, L, DT)) continue; - - for (auto &I : *BB){ - Value *Addr; - if (LoadInst *Load = dyn_cast(&I)){ - Addr = Load->getPointerOperand(); - }else if (StoreInst *Store = dyn_cast(&I)){ - Addr = Store->getPointerOperand(); - }else{ - continue; //cannot do anything with this instruction - } - - const SCEV *AddrSCEV = SE->getSCEVAtScope(Addr, L); - if (!SE->hasComputableLoopEvolution(AddrSCEV, L)) continue; - errs()<<"has computable loop evolution: "; AddrSCEV->dump(); - - auto split = SE->SplitIntoInitAndPostInc(L, AddrSCEV); - const SCEV *SetupAddrSCEV = split.first; - const SCEV *PostIncSCEV = split.second; - if (!isSafeToExpandAt(SetupAddrSCEV, L->getLoopPreheader()->getTerminator(), *SE)) continue; - errs()<<"can expand setup addr scev in preheader: "; SetupAddrSCEV->dump(); - - - } - } - - return true; -} - -} //end of namespace - -PreservedAnalyses SSRInferencePass::run(Loop &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR, LPMUpdater &){ - //if (!EnableSSRInference) return PreservedAnalyses::all(); //if flag is not set, skip - errs()<<"# =============== SSR Inference =============== #\n"; - runOnLoop(&L, &AR.AA, &AR.LI, &AR.DT, AR.BFI, &AR.TLI, &AR.TTI, &AR.SE, AR.MSSA); - errs()<<"# =============== SSR done =============== #\n"; - return PreservedAnalyses::none(); -} - -*/ \ No newline at end of file +} \ No newline at end of file From 9a6cef2c1a22c6e9b9cddd621d9ea9e044e51d34 Mon Sep 17 00:00:00 2001 From: ThomasRupf Date: Sun, 17 Apr 2022 14:15:45 +0200 Subject: [PATCH 12/47] working 2D sometimes --- .../llvm/Analysis/AffineAccessAnalysis.h | 16 +- llvm/lib/Analysis/AffineAccessAnalysis.cpp | 453 ++++++++++++++---- llvm/lib/Transforms/SSR/SSRGeneration.cpp | 44 +- llvm/lib/Transforms/SSR/SSRInference.cpp | 2 + 4 files changed, 405 insertions(+), 110 deletions(-) 
diff --git a/llvm/include/llvm/Analysis/AffineAccessAnalysis.h b/llvm/include/llvm/Analysis/AffineAccessAnalysis.h index b811ca0f39c52..c4f52446a98a0 100644 --- a/llvm/include/llvm/Analysis/AffineAccessAnalysis.h +++ b/llvm/include/llvm/Analysis/AffineAccessAnalysis.h @@ -11,11 +11,12 @@ namespace llvm { class AffineAccess; +class AffineAccessAnalysis; class AffineAcc{ friend AffineAccess; private: - void addLoop(const Loop *L, const SCEV *bound, const SCEV *stride); //add dimension + AffineAcc(Instruction *Addr, ArrayRef accesses, const SCEV *data); const SCEV *data; SmallVector bounds; //from outer- to innermost loop @@ -25,7 +26,7 @@ class AffineAcc{ const Loop *L; //outermost loop public: - AffineAcc(const Loop *L, Instruction *Addr, ArrayRef accesses, const SCEV *data, const SCEV *bound, const SCEV *stride); + AffineAcc() = delete; void dump() const; unsigned getDimension() const; const Loop *getLoop() const; @@ -36,11 +37,16 @@ class AffineAcc{ class AffineAccess{ private: SmallVector accesses; + DenseMap loopReps; ScalarEvolution &SE; + DominatorTree &DT; + LoopInfo &LI; public: - AffineAccess(ScalarEvolution &SE) : SE(SE) {} - void addAccess(const AffineAcc *a); - ArrayRef getAll() const; + AffineAccess(ScalarEvolution &SE, DominatorTree &DT, LoopInfo &LI); + AffineAccess() = delete; + void addAllAccesses(Instruction *Addr, const Loop *L); + AffineAcc *promoteAccess(const AffineAcc &Acc, const Loop *L, const SCEV *Stride); + ArrayRef getAccesses() const; Value *expandData(const AffineAcc *aa, Type *ty = (Type *)nullptr) const; Value *expandBound(const AffineAcc *aa, unsigned i, Type *ty = (Type *)nullptr) const; diff --git a/llvm/lib/Analysis/AffineAccessAnalysis.cpp b/llvm/lib/Analysis/AffineAccessAnalysis.cpp index 6fd91965a8e3e..44f629e38c120 100644 --- a/llvm/lib/Analysis/AffineAccessAnalysis.cpp +++ b/llvm/lib/Analysis/AffineAccessAnalysis.cpp @@ -34,6 +34,7 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/TinyPtrVector.h" #include 
"llvm/ADT/ilist.h" +#include "llvm/ADT/DenseMap.h" #include #include @@ -41,18 +42,9 @@ using namespace llvm; -AffineAcc::AffineAcc(const Loop *L, Instruction *Addr, ArrayRef accesses, - const SCEV *data, const SCEV *bound, const SCEV *stride) : data(data), Addr(Addr){ +AffineAcc::AffineAcc(Instruction *Addr, ArrayRef accesses, + const SCEV *data) : data(data), Addr(Addr){ for (Instruction *I : accesses) this->accesses.push_back(I); - addLoop(L, bound, stride); - return; -} - -///note: stride can be 0 if access is inv w.r.t. this loop -void AffineAcc::addLoop(const Loop *L, const SCEV *bound, const SCEV *stride){ - this->L = L; - this->bounds.push_back(bound); - this->strides.push_back(stride); return; } @@ -87,36 +79,7 @@ const SmallVector &AffineAcc::getAccesses() const{ return this->accesses; } -//================== AffineAcces, Result of Analysis ========================================= -void AffineAccess::addAccess(const AffineAcc *a){ - this->accesses.push_back(a); - return; -} - -ArrayRef AffineAccess::getAll() const{ - ArrayRef *ar = new ArrayRef(accesses.begin(), accesses.end()); - return *ar; -} - - Value *AffineAccess::expandData(const AffineAcc *aa, Type *ty) const{ - SCEVExpander ex(SE, aa->L->getHeader()->getModule()->getDataLayout(), "data"); - ex.setInsertPoint(aa->L->getLoopPreheader()->getTerminator()); - return ex.expandCodeFor(aa->data, ty); - } - - Value *AffineAccess::expandBound(const AffineAcc *aa, unsigned i, Type *ty) const{ - SCEVExpander ex(SE, aa->L->getHeader()->getModule()->getDataLayout(), "bound"); - ex.setInsertPoint(aa->L->getLoopPreheader()->getTerminator()); - return ex.expandCodeFor(aa->bounds[i], ty); - } - - Value *AffineAccess::expandStride(const AffineAcc *aa, unsigned i, Type *ty) const{ - SCEVExpander ex(SE, aa->L->getHeader()->getModule()->getDataLayout(), "stride"); - ex.setInsertPoint(aa->L->getLoopPreheader()->getTerminator()); - return ex.expandCodeFor(aa->strides[i], ty); - } - -//================== Affine Acces 
Analysis ================================================== +//================== AffineAcces, helper functions ========================================= namespace { @@ -132,7 +95,7 @@ bool checkLoop(const Loop *L, DominatorTree &DT, ScalarEvolution &SE, Instructio if (L->getNumBackEdges() != 1) { errs()<<"nr. back-edges != 1\n"; return false; } if (!SE.hasLoopInvariantBackedgeTakenCount(L)){ - errs()<<"cannot calculate backedge taken count\n"; + errs()<<"checkLoop: cannot calculate backedge taken count\n"; return false; } const SCEV *bt = SE.getBackedgeTakenCount(L); @@ -143,21 +106,314 @@ bool checkLoop(const Loop *L, DominatorTree &DT, ScalarEvolution &SE, Instructio return true; } +///checks whether LHS == RHS always holds +bool SCEVEquals(const SCEV *LHS, const SCEV *RHS, ScalarEvolution &SE){ + errs()<<"SCEVEquals:\n\t"; LHS->dump(); + errs()<<"\t"; RHS->dump(); + if (LHS == RHS) return true; //trivially the same if this holds (bc const Ptr) + else{ + const SCEVPredicate *Peq = SE.getEqualPredicate(LHS, RHS); + if (Peq->isAlwaysTrue()) return true; //if we arrive at setup addr scev, we are done + } + return false; +} + /// check whether BB is on all controlflow paths from header to header bool isOnAllControlFlowPaths(const BasicBlock *BB, const Loop *L, const DominatorTree &DT){ return DT.dominates(BB, L->getHeader()); } +//return result of Cmp predicated on Rep > 0 if possible. 
+Optional predicatedICmpOutcome(ICmpInst *Cmp, const SCEV *Rep, ScalarEvolution &SE){ + switch (Cmp->getPredicate()) + { + case CmpInst::Predicate::ICMP_SGT: + case CmpInst::Predicate::ICMP_UGT: + { + const SCEV *LHS = SE.getSCEV(Cmp->getOperand(0)); + const SCEV *RHS = SE.getSCEV(Cmp->getOperand(1)); + //transform: LHS > RHS <==> LHS - RHS > 0 + const SCEV *LHSmRHS = SE.getMinusSCEV(LHS, RHS); + //then check whether Rep == LHS - RHS in which case we know: Rep > 0 ==> result of Cmp is true + if (SCEVEquals(Rep, LHSmRHS, SE)) return Optional(true); + else return None; + } + case CmpInst::Predicate::ICMP_SLT: + case CmpInst::Predicate::ICMP_ULT: + { + //a < b <==> b > a + const SCEV *LHS = SE.getSCEV(Cmp->getOperand(1)); //b + const SCEV *RHS = SE.getSCEV(Cmp->getOperand(0)); //a + //transform: LHS > RHS <==> LHS - RHS > 0 + const SCEV *LHSmRHS = SE.getMinusSCEV(LHS, RHS); + //then check whether Rep == LHS - RHS in which case we know: Rep > 0 ==> result of Cmp is true + if (SCEVEquals(Rep, LHSmRHS, SE)) return Optional(true); + else return None; + } + //TODO: do for more cases + default: + return None; + } +} + +//conservative! 
+//because SCEVComparePredicate is not in this version of LLVM we have to do this manually ==> will not catch all cases +//predicate is that Rep > 0 +bool isOnAllPredicatedControlFlowPaths(const BasicBlock *BB, const Loop *L, const DominatorTree &DT, const SCEV *Rep, ScalarEvolution &SE){ + if (isOnAllControlFlowPaths(BB, L, DT)) return true; //is on all paths anyway + Rep->dump(); + + std::deque q(1U, L->getHeader()); //iterative BFS with queue + while (!q.empty()){ + BasicBlock *Current = q.front(); q.pop_front(); + if (Current == BB) continue; //do not continue BFS from BB + + Instruction *T = Current->getTerminator(); + T->dump(); + if (BranchInst *BR = dyn_cast(T)){ + if (BR->isConditional()){ + if (ICmpInst *Cmp = dyn_cast(BR->getCondition())){ //FOR NOW: only works with a single ICmpInst as branch condition operand + Cmp->dump(); + auto r = predicatedICmpOutcome(Cmp, Rep, SE); + if (r.hasValue()){ + if (r.getValue()) q.push_back(BR->getSuccessor(0)); + else q.push_back(BR->getSuccessor(1)); + }else{ + q.push_back(BR->getSuccessor(0)); + q.push_back(BR->getSuccessor(1)); + } + } + }else{ + q.push_back(BR->getSuccessor(0)); //add the only successor to queue + } + }else{ + return false; //unknown jump somewhere else ==> BB not on all predicated paths + } + + if (q.front() == L->getHeader()) return false; //bfs arrived at Header (again) with a path that never went through BB + } + + return true; +} + +/// get Loops containing Ins from innermost to outermost +SmallVector &getContainingLoops(ArrayRef loopsPreorder, Instruction *Ins){ + BasicBlock *BB = Ins->getParent(); + SmallVector *r = new SmallVector(); + for (auto L = loopsPreorder.rbegin(); L != loopsPreorder.rend(); ++L){ //go through loops in reverse order ==> innermost first + if ((*L)->contains(BB)){ + r->push_back(*L); + } + } + return *r; +} + +} //end of namespace + +//================== AffineAcces, Result of Analysis ========================================= 
+AffineAccess::AffineAccess(ScalarEvolution &SE, DominatorTree &DT, LoopInfo &LI) : SE(SE), DT(DT), LI(LI){ + auto loops = LI.getLoopsInPreorder(); + unsigned l = 0u; + for (const Loop *L : loops){ + L->dump(); + + if (!L->getLoopPreheader()) continue; + if (!checkLoop(L, DT, SE, L->getLoopPreheader()->getTerminator())) continue; + + loopReps.insert(std::make_pair(L, SE.getBackedgeTakenCount(L))); l++; + } + errs()<<"FOUND "< 0 +/// (3) data ptr can be computed outside of parent loop +/// (4) forall bd : aa.bounds. SE.isLoopInvariant(bd, L) && isSafeToExpandAt(bd, LPreheader->getTerminator(), SE) +/// (5) forall st : aa.strides. SE.isLoopInvariant(st, L) && isSafeToExpandAt(st, LPreheader->getTerminator(), SE) +/// (6) isSafeToExpandAt(Bound / Stride, LPreheader->getTerminator(), SE) +AffineAcc *AffineAccess::promoteAccess(const AffineAcc &aa, const Loop *L, const SCEV *Stride){ + assert((!aa.L) || (aa.L && aa.L->getParentLoop() == L && "can only promote to parent loop")); //(1) + assert(this->loopReps.find(L) != this->loopReps.end() && "L is well formed"); //(1) + + errs()<<"Trying to promote AA of dim="<getSecond(); + const SCEV *Rep = SE.getAddExpr(Bd, SE.getConstant(Bd->getType(), 1U)); + if (!isOnAllPredicatedControlFlowPaths(aa.L->getHeader(), L, this->DT, Rep, this->SE)) return nullptr; //(2.1) + }else{ + for (Instruction *I : aa.accesses){ + if (!isOnAllControlFlowPaths(I->getParent(), L, DT)) return nullptr; //(2.2) + } + } + + errs()<<"passed (2)\n"; + + const SCEV *Bound = this->loopReps.find(L)->getSecond(); + Instruction *InsPoint = L->getLoopPreheader()->getTerminator(); + + if (!SE.hasComputableLoopEvolution(aa.data, L)) return nullptr; //(3.1) + const SCEV *Data = SE.SplitIntoInitAndPostInc(L, aa.data).first; + if (!isSafeToExpandAt(Data, InsPoint, SE)) return nullptr; //(3.2) + + errs()<<"passed (3)\n"; + + for (const SCEV *Bd : aa.bounds){ + if (!(SE.isLoopInvariant(Bd, L) && isSafeToExpandAt(Bd, InsPoint, SE))) return nullptr; //(4) + } + for 
(const SCEV *Str : aa.strides){ + if (!(SE.isLoopInvariant(Str, L) && isSafeToExpandAt(Str, InsPoint, SE))) return nullptr; //(5) + } + + if (!isSafeToExpandAt(Bound, InsPoint, SE) || !isSafeToExpandAt(Stride, InsPoint, SE)) return nullptr; //(6) + + errs()<<"passed (4), (5) & (6)\n"; + + AffineAcc *A = new AffineAcc(aa); + A->data = Data; + A->L = L; + A->bounds.push_back(Bound); + A->strides.push_back(Stride); + return A; +} + +/// adds all affine accesses that use Addr in loop L +void AffineAccess::addAllAccesses(Instruction *Addr, const Loop *L){ + std::vector accesses; + for (auto U = Addr->use_begin(); U != Addr->use_end(); ++U){ + Instruction *Acc = dyn_cast(U->getUser()); + if (!Acc) Acc = dyn_cast(U->getUser()); + if (!Acc) continue; //both casts failed ==> not a suitable instruction + if (!isOnAllControlFlowPaths(Acc->getParent(), L, DT)) continue; //access does not occur consistently in loop ==> not suitable + accesses.push_back(Acc); + } + if (accesses.empty()) return; //Addr not used + + errs()<<"adding Access: "; Addr->dump(); + + const SCEV *AddrS = SE.getSCEV(Addr); + + auto &cloops = getContainingLoops(LI.getLoopsInPreorder(), Addr); + + errs()<<"has "<(accesses), AddrS); //never needed -> alloc in stack + AffineAcc *A = &dim0; + + const SCEV *CS = AddrS; + for (auto L = cloops.begin(); L != cloops.end(); ++L){ + if (loopReps.find(*L) == loopReps.end()) break; //this loop is malformed ==> this and all more outer loops cannot be used + + errs()<<"finding stride in: "; CS->dump(); + const SCEV *Stride; + if (const auto *Rec = dyn_cast(CS)){ + if (Rec->getLoop() == *L) { + CS = Rec->getStart(); + Stride = Rec->getStepRecurrence(SE); + }else{ + bool occurs = false; + for (auto L_ = L; L_ != cloops.end(); ++L_) occurs = occurs && Rec->getLoop() == *L_; + if (!occurs) break; //AddRecExpr references a loop that is not a containing loop ==> cannot guarantee anything + Stride = SE.getConstant(APInt(64U, 0U)); //addrSCEV does not step in this loop ==> stride 
is 0 + } + }else{ + break; //did not manage to compute stride + } + assert(Stride); + errs()<<"found stride: "; Stride->dump(); + + A = promoteAccess(*A, *L, Stride); + if (A){ + errs()<<"found AffineAcc:\n"; A->dump(); + this->accesses.push_back(A); + }else{ + break; //did not manage to promote ==> cannot promote for loops further out + } + } + errs()<<"\n"; + return; +} + +ArrayRef AffineAccess::getAccesses() const{ + ArrayRef *ar = new ArrayRef(accesses.begin(), accesses.end()); + return *ar; +} + +Value *AffineAccess::expandData(const AffineAcc *aa, Type *ty) const{ + SCEVExpander ex(SE, aa->L->getHeader()->getModule()->getDataLayout(), "data"); + ex.setInsertPoint(aa->L->getLoopPreheader()->getTerminator()); + return ex.expandCodeFor(aa->data, ty); +} + +Value *AffineAccess::expandBound(const AffineAcc *aa, unsigned i, Type *ty) const{ + SCEVExpander ex(SE, aa->L->getHeader()->getModule()->getDataLayout(), "bound"); + ex.setInsertPoint(aa->L->getLoopPreheader()->getTerminator()); + return ex.expandCodeFor(aa->bounds[i], ty); +} + +Value *AffineAccess::expandStride(const AffineAcc *aa, unsigned i, Type *ty) const{ + SCEVExpander ex(SE, aa->L->getHeader()->getModule()->getDataLayout(), "stride"); + ex.setInsertPoint(aa->L->getLoopPreheader()->getTerminator()); + return ex.expandCodeFor(aa->strides[i], ty); +} + +//================== Affine Acces Analysis ================================================== + +AnalysisKey AffineAccessAnalysis::Key; + +AffineAccess AffineAccessAnalysis::run(Function &F, FunctionAnalysisManager &FAM) { + + errs()<<"running AffineAccessAnalysis on "<(F); + DominatorTree &DT = FAM.getResult(F); + ScalarEvolution &SE = FAM.getResult(F); + //AAResults &AA = FAM.getResult(F); + + AffineAccess *A = new AffineAccess(SE, DT, LI); + + for (const Loop *L : LI.getLoopsInPreorder()){ + for (BasicBlock *BB : L->getBlocks()){ + if (!isOnAllControlFlowPaths(BB, L, DT)) continue; + for (Instruction &I : *BB){ + Value *Addr; + if (LoadInst *Load = 
dyn_cast(&I)){ + Addr = Load->getPointerOperand(); + }else if (StoreInst *Store = dyn_cast(&I)){ + Addr = Store->getPointerOperand(); + }else{ + continue; //cannot do anything with this instruction + } + Instruction *AddrIns; + if (!(AddrIns = dyn_cast(Addr))) continue; //if Addr is not instruction ==> constant, or sth else (==> leave for other passes to opt) + + A->addAllAccesses(AddrIns, L); + } + } + } + + return *A; +} + +//================== Affine Acces Analysis Pass for opt ======================================= +PreservedAnalyses AffineAccessAnalysisPass::run(Function &F, FunctionAnalysisManager &FAM) { + AffineAccess AA = FAM.getResult(F); + for (const auto *A : AA.getAccesses()){ + A->dump(); + } + return PreservedAnalyses::all(); +} + + +/* /// Fold over AddrSCEV /// All AddRecSCEVs are dependent on L or loops contained in L (TODO: and on all paths?) /// All steps in ADDRecSCEVs can be calculated in preheader of L bool canFindStrides(ScalarEvolution &SE, const ArrayRef &loops, const SCEV *AddrSCEV, const SCEV *SetupAddrSCEV){ errs()<<"finding strides in: "; AddrSCEV->dump(); - if (AddrSCEV == SetupAddrSCEV) return true; //trivially the same if this holds (bc const Ptr) - else{ - const SCEVPredicate *Peq = SE.getEqualPredicate(AddrSCEV, SetupAddrSCEV); - if (Peq->isAlwaysTrue()) return true; //if we arrive at setup addr scev, we are done - } + if (SCEVEquals(AddrSCEV, SetupAddrSCEV, SE)) return true; if (loops.empty()) { errs()<<"not enough loops\n"; return false; } //need at least one more loop here for SCEVAddRecvExpr @@ -175,37 +431,50 @@ bool canFindStrides(ScalarEvolution &SE, const ArrayRef &loops, co } return false; } - -SmallVector &getContainingLoops(const Loop *Outermost, Instruction *Ins){ - BasicBlock *BB = Ins->getParent(); - const auto &loops = Outermost->getLoopsInPreorder(); - SmallVector *r = new SmallVector(); - for (auto L = loops.rbegin(); L != loops.rend(); ++L){ //go through loops in reverse order ==> innermost first - if 
((*L)->contains(BB)){ - errs()<<"found containing Loop\n"; - r->push_back(*L); - } - } - return *r; +*/ + +/* +/// can promote if: + (1) parent loop Outer satisfies checkLoop + (2) child loop Inner is on all paths in Outer where Inner.backedgetakencount +1 > 0 + (3) Stride for Outer can be found + (4) forall bd : aa.bounds. SE.isLoopInvariant(bd, Outer) && isSafeToExpandAt(bd, OuterPreheader->getTerminator(), SE) + (5) forall st : aa.strides. SE.isLoopInvariant(st, Outer) && isSafeToExpandAt(st, OuterPreheader->getTerminator(), SE) +bool promoteAffineAccess(AffineAcc &aa, ScalarEvolution &SE, DominatorTree &DT, DenseMap &LR){ + const Loop *Inner = aa.getLoop(); + const Loop *Outer = Inner->getParentLoop(); + const auto &R = LR.find_as(Outer); + if (R == LR.end()) return false; //Outer violates (1) + const SCEV *Bound = R->getSecond(); + BasicBlock *OuterPreheader = Outer->getLoopPreheader(); + BasicBlock *InnerPreheader = Inner->getLoopPreheader(); + const SCEV *Rep = SE.getAddExpr(SE.getBackedgeTakenCount(Inner), SE.getConstant(APInt(64U, 1U))); //trip count of Inner loop + if (!isOnAllPredicatedControlFlowPaths(InnerPreheader, Outer, DT, Rep, SE)) return false; //violates (2) + } +*/ +/* AffineAccess &runOnFunction(Function &F, LoopInfo &LI, DominatorTree &DT, ScalarEvolution &SE, AAResults &AA){ - AffineAccess *aa = new AffineAccess(SE); + AffineAccess *aa = new AffineAccess(SE, DT, LI); auto loops = LI.getLoopsInPreorder(); errs()<<"contains "< loopReps; + for (const Loop *L : loops){ errs()<<"LOOP:\n"; L->dump(); - if (!L->isInnermost()) continue; //for now - - if (!L->getLoopPreheader()) { errs()<<"loop has no preheader\n"; continue; } + if (!L->getLoopPreheader()) continue; if (!checkLoop(L, DT, SE, L->getLoopPreheader()->getTerminator())) continue; + loopReps.insert(std::make_pair(L, SE.getBackedgeTakenCount(L))); + for (const auto &BB : L->getBlocks()){ - //TODO: how to allow inner loops? 
+ if (!isOnAllControlFlowPaths(BB, L, DT)) continue; for (auto &I : *BB){ @@ -221,27 +490,22 @@ AffineAccess &runOnFunction(Function &F, LoopInfo &LI, DominatorTree &DT, Scalar if (!(AddrIns = dyn_cast(Addr))) continue; //if Addr is not instruction ==> constant, or sth else (==> leave for other passes to opt) errs()<<"looking at: "; AddrIns->dump(); + aa->addAllAccesses(AddrIns); + + //Address SCEV const SCEV *AddrSCEV = SE.getSCEV(Addr); if (!SE.hasComputableLoopEvolution(AddrSCEV, L)) continue; errs()<<"has computable loop evolution: "; AddrSCEV->dump(); + //Base Pointer (=data) SCEV auto split = SE.SplitIntoInitAndPostInc(L, AddrSCEV); const SCEV *SetupAddrSCEV = split.first; //const SCEV *PostIncSCEV = split.second; if (!isSafeToExpandAt(SetupAddrSCEV, L->getLoopPreheader()->getTerminator(), SE)) continue; errs()<<"can expand setup addr scev in preheader: "; SetupAddrSCEV->dump(); - const auto &loops = getContainingLoops(L, AddrIns); - if (!canFindStrides(SE, ArrayRef(loops.begin(), loops.end()), AddrSCEV, SetupAddrSCEV)) continue; - errs()<<"can find loop Strides: "; AddrSCEV->dump(); - - const SCEV *TC = SE.getBackedgeTakenCount(L); - const auto *AddrRecSCEV = cast(AddrSCEV); - const SCEV *Str; - if (AddrRecSCEV->getLoop() == L){ - Str = cast(AddrSCEV)->getStepRecurrence(SE); //because 1D for now - }else{ - Str = SE.getConstant(APInt(64U, 0U)); - } + //Stride Check + if (!canFindStride(L, AddrSCEV, SetupAddrSCEV, SE)) continue; + errs()<<"can find loop Stride: "; AddrSCEV->dump(); std::vector accesses; for (auto U = Addr->use_begin(); U != Addr->use_end(); ++U){ @@ -254,8 +518,19 @@ AffineAccess &runOnFunction(Function &F, LoopInfo &LI, DominatorTree &DT, Scalar accesses.push_back(Acc); } + const SCEV *TC = loopReps.find(L)->getSecond(); + const auto *AddrRecSCEV = cast(AddrSCEV); + const SCEV *Str; + if (AddrRecSCEV->getLoop() == L){ + Str = cast(AddrSCEV)->getStepRecurrence(SE); //because 1D for now + }else{ + Str = SE.getConstant(APInt(64U, 0U)); + } + 
aa->addAccess(new AffineAcc(L, AddrIns, ArrayRef(accesses), SetupAddrSCEV, TC, Str)); errs()<<"added new AffineAcc\n"; + + //TODO: dimension promotion: if preheader has only one predecessor -> if cond for "skipping loop" is bt+1 == 0 -> if parent loop passes checks -> promote } } @@ -264,28 +539,4 @@ AffineAccess &runOnFunction(Function &F, LoopInfo &LI, DominatorTree &DT, Scalar return *aa; } -} //end of namespace - -AnalysisKey AffineAccessAnalysis::Key; - -AffineAccess AffineAccessAnalysis::run(Function &F, FunctionAnalysisManager &FAM) { - - errs()<<"running AffineAccessAnalysis on "<(F); - DominatorTree &DT = FAM.getResult(F); - ScalarEvolution &SE = FAM.getResult(F); - AAResults &AA = FAM.getResult(F); - - AffineAccess aa = runOnFunction(F, LI, DT, SE, AA); - return aa; -} - -//================== Affine Acces Analysis Pass for opt ======================================= -PreservedAnalyses AffineAccessAnalysisPass::run(Function &F, FunctionAnalysisManager &FAM) { - AffineAccess AA = FAM.getResult(F); - for (const auto *A : AA.getAll()){ - A->dump(); - } - return PreservedAnalyses::all(); -} \ No newline at end of file +*/ \ No newline at end of file diff --git a/llvm/lib/Transforms/SSR/SSRGeneration.cpp b/llvm/lib/Transforms/SSR/SSRGeneration.cpp index 8db7575459939..e4bdc21794036 100644 --- a/llvm/lib/Transforms/SSR/SSRGeneration.cpp +++ b/llvm/lib/Transforms/SSR/SSRGeneration.cpp @@ -40,8 +40,9 @@ #include #include +#include -#define NUM_SSR 3U +#define NUM_SSR 10U //current state of hw: only allow doubles #define CHECK_TYPE(I) (I->getType() == Type::getDoubleTy(I->getParent()->getContext())) @@ -50,6 +51,7 @@ using namespace llvm; namespace{ +///generates SSR setup calls void generateSSR(AffineAccess &AA, const AffineAcc *aa, unsigned dmid, unsigned n_insts, bool isStore){ BasicBlock *LoopPreheader = aa->getLoop()->getLoopPreheader(); Module *mod = LoopPreheader->getModule(); @@ -118,6 +120,7 @@ void generateSSR(AffineAccess &AA, const AffineAcc *aa, 
unsigned dmid, unsigned return; } +///generates SSR enable & disable calls void generateSSREnDis(const Loop *L){ IRBuilder<> builder(L->getLoopPreheader()->getTerminator()); Module *mod = L->getHeader()->getModule(); @@ -131,18 +134,49 @@ void generateSSREnDis(const Loop *L){ return; } +bool conflictingAccesses(const AffineAcc *A, const AffineAcc *B){ + if (A->getAddrIns() == B->getAddrIns()) return true; + for (Instruction *IA : A->getAccesses()){ + for (Instruction *IB : B->getAccesses()){ + if (IA == IB) return true; + } + } + return false; +} + } //end of namespace PreservedAnalyses SSRGenerationPass::run(Function &F, FunctionAnalysisManager &FAM){ - errs()<<"SSR Generation Pass on function: "<(F); + errs()<<"SSR Generation Pass on function: "< changedLoops; + auto accs = AF.getAccesses(); + std::vector allaccesses; + for (const AffineAcc *A : accs) allaccesses.push_back(A); + + //sort by dimension + std::sort(allaccesses.begin(), allaccesses.end(), [](const AffineAcc *A, const AffineAcc *B){return A->getDimension() <= B->getDimension();}); + + errs()<<"total of "< accesses; + while (!allaccesses.empty()){ + auto A = allaccesses.back(); allaccesses.pop_back(); + bool conflict = false; + for (auto B : accesses){ + conflict = conflict || conflictingAccesses(A, B); + } + if (!conflict) accesses.push_back(A); + } + + errs()<= NUM_SSR) break; unsigned n_store = 0U; @@ -161,7 +195,9 @@ PreservedAnalyses SSRGenerationPass::run(Function &F, FunctionAnalysisManager &F if(!valid || (n_store == 0U && n_load == 0U)) continue; //all uses are valid load/stores and there is at least one of them + A->dump(); generateSSR(AF, A, dmid, n_store + n_load, n_store > 0U); + errs()<<"done with generation\n\n"; changedLoops.insert(A->getLoop()); dmid++; } diff --git a/llvm/lib/Transforms/SSR/SSRInference.cpp b/llvm/lib/Transforms/SSR/SSRInference.cpp index 5e3477791e1bc..a78ccfcbad13c 100644 --- a/llvm/lib/Transforms/SSR/SSRInference.cpp +++ b/llvm/lib/Transforms/SSR/SSRInference.cpp 
@@ -24,6 +24,7 @@ #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/AffineAccessAnalysis.h" #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" +#include "llvm/Transforms/Scalar/LoopStrengthReduce.h" #include "llvm/Transforms/SSR/SSRGeneration.h" #include "llvm/Support/CommandLine.h" @@ -49,6 +50,7 @@ PreservedAnalyses SSRInferencePass::run(Function &F, FunctionAnalysisManager &FA FunctionPassManager FPM(true); FPM.addPass(LoopSimplifyPass()); //canonicalize loops FPM.addPass(LCSSAPass()); //put loops into LCSSA-form + FPM.addPass(createFunctionToLoopPassAdaptor(LoopStrengthReducePass())); FPM.addPass(SSRGenerationPass());//runs AffineAccess analysis and generates SSR intrinsics FPM.addPass(ADCEPass()); //remove potential dead instructions that result from SSR replacement (dead code elim) return FPM.run(F, FAM); From eeb21f00f53096cd694acffe7d7e766cc4ae216f Mon Sep 17 00:00:00 2001 From: ThomasRupf Date: Sun, 17 Apr 2022 22:16:51 +0200 Subject: [PATCH 13/47] progress --- llvm/lib/Analysis/AffineAccessAnalysis.cpp | 167 ++++++++++++++++++--- 1 file changed, 145 insertions(+), 22 deletions(-) diff --git a/llvm/lib/Analysis/AffineAccessAnalysis.cpp b/llvm/lib/Analysis/AffineAccessAnalysis.cpp index 44f629e38c120..9dfd18cdc8224 100644 --- a/llvm/lib/Analysis/AffineAccessAnalysis.cpp +++ b/llvm/lib/Analysis/AffineAccessAnalysis.cpp @@ -39,6 +39,7 @@ #include #include #include +#include using namespace llvm; @@ -106,8 +107,36 @@ bool checkLoop(const Loop *L, DominatorTree &DT, ScalarEvolution &SE, Instructio return true; } +Optional> toSameSize(const SCEV *LHS, const SCEV *RHS, ScalarEvolution &SE, bool unsafe = false){ + using PT = std::pair; + if (LHS->getType()->getIntegerBitWidth() > RHS->getType()->getIntegerBitWidth()) { + if (auto LHSx = dyn_cast(LHS)){ + if (LHSx->getAPInt().getActiveBits() <= RHS->getType()->getIntegerBitWidth()) {} + return Optional(std::make_pair(SE.getConstant(RHS->getType(), 
LHSx->getAPInt().getLimitedValue()), RHS)); + } + if (auto RHSx = dyn_cast(RHS)){ + if (RHSx->getAPInt().getActiveBits() <= LHS->getType()->getIntegerBitWidth()) + return Optional(std::make_pair(LHS, SE.getConstant(LHS->getType(), RHSx->getAPInt().getLimitedValue()))); + } + if (auto LHSx = dyn_cast(LHS)) return toSameSize(LHSx->getOperand(0), RHS, SE); + if (auto LHSx = dyn_cast(LHS)) return toSameSize(LHSx->getOperand(0), RHS, SE); + if (auto RHSx = dyn_cast(RHS)) return toSameSize(LHS, RHSx->getOperand(0), SE); + if (unsafe) return Optional(std::make_pair(SE.getTruncateExpr(LHS, RHS->getType()), RHS)); + return None; + }else if (LHS->getType()->getIntegerBitWidth() < RHS->getType()->getIntegerBitWidth()){ + auto p = toSameSize(RHS, LHS, SE, unsafe); + if (!p.hasValue()) return None; + return Optional(std::make_pair(p.getValue().second, p.getValue().first)); + } + return Optional(std::make_pair(LHS, RHS)); +} + ///checks whether LHS == RHS always holds bool SCEVEquals(const SCEV *LHS, const SCEV *RHS, ScalarEvolution &SE){ + auto p = toSameSize(LHS, RHS, SE); + if (!p.hasValue()) return false; + LHS = p.getValue().first; + RHS = p.getValue().second; errs()<<"SCEVEquals:\n\t"; LHS->dump(); errs()<<"\t"; RHS->dump(); if (LHS == RHS) return true; //trivially the same if this holds (bc const Ptr) @@ -115,6 +144,7 @@ bool SCEVEquals(const SCEV *LHS, const SCEV *RHS, ScalarEvolution &SE){ const SCEVPredicate *Peq = SE.getEqualPredicate(LHS, RHS); if (Peq->isAlwaysTrue()) return true; //if we arrive at setup addr scev, we are done } + errs()<<"false\n"; return false; } @@ -150,7 +180,16 @@ Optional predicatedICmpOutcome(ICmpInst *Cmp, const SCEV *Rep, ScalarEvolu if (SCEVEquals(Rep, LHSmRHS, SE)) return Optional(true); else return None; } - //TODO: do for more cases + case CmpInst::Predicate::ICMP_EQ: + case CmpInst::Predicate::ICMP_NE: + { + //Rep > 0 ==> Rep + x != x + const SCEV *LHS = SE.getSCEV(Cmp->getOperand(0)); //Rep + x (hopefully) + const SCEV *RHS = 
SE.getSCEV(Cmp->getOperand(1)); //x + const SCEV *LHSmRHS = SE.getMinusSCEV(LHS, RHS); //Rep (hopefully) + if (SCEVEquals(Rep, LHSmRHS, SE)) return Optional(Cmp->getPredicate() == CmpInst::Predicate::ICMP_NE); + else return None; + } default: return None; } @@ -208,6 +247,75 @@ SmallVector &getContainingLoops(ArrayRef loopsPr return *r; } +void findStridesRec(const SCEV *Addr, ArrayRef loops, SmallVector &factors, ScalarEvolution &SE, SmallVector &res){ + errs()<<"finding strides in "; Addr->dump(); + if (loops.empty()) return; + switch (Addr->getSCEVType()) + { + case SCEVTypes::scAddRecExpr: + { + auto AddRec = cast(Addr); + if (AddRec->getLoop() == *loops.begin()){ + const SCEV *S = AddRec->getStepRecurrence(SE); + for (const SCEV *x : factors){ + auto p = toSameSize(S, x, SE, true); + S = SE.getMulExpr(p.getValue().first, p.getValue().second); + } + res.push_back(S); + findStridesRec(AddRec->getStart(), ArrayRef(loops.begin()+1, loops.end()), factors, SE, res); + }else{ + bool occurs = false; + for (const Loop *L : loops) occurs = occurs || AddRec->getLoop() == L; //loops needs to occur further up, o/w invalid + if (!occurs) return; + res.push_back(SE.getConstant(APInt(64U, 0U))); + findStridesRec(AddRec->getStart(), loops, factors, SE, res); + } + return; + } + //case SCEVTypes::scTruncate: TODO: is unsafe here, right? 
+ case SCEVTypes::scSignExtend: + case SCEVTypes::scZeroExtend: + findStridesRec(cast(Addr)->getOperand(0), loops, factors, SE, res); + return; + + case SCEVTypes::scAddExpr: + { + auto S = cast(Addr); + bool lhs = SE.containsAddRecurrence(S->getOperand(0)); + bool rhs = SE.containsAddRecurrence(S->getOperand(1)); + if (lhs && !rhs) findStridesRec(S->getOperand(0), loops, factors, SE, res); + else if (!lhs && rhs) findStridesRec(S->getOperand(1), loops, factors, SE, res); + return; + } + case SCEVTypes::scMulExpr: + { + auto S = cast(Addr); + bool lhs = SE.containsAddRecurrence(S->getOperand(0)); + bool rhs = SE.containsAddRecurrence(S->getOperand(1)); + if (lhs && !rhs) { + factors.push_back(S->getOperand(1)); + findStridesRec(S->getOperand(0), loops, factors, SE, res); + }else if (!lhs && rhs) { + factors.push_back(S->getOperand(0)); + findStridesRec(S->getOperand(1), loops, factors, SE, res); + } + return; + } + + default: + return; + } +} + +SmallVector &findStrides(const SCEV *Addr, ArrayRef loops, ScalarEvolution &SE){ + SmallVector &strides = *(new SmallVector()); + SmallVector factors; + findStridesRec(Addr, loops, factors, SE, strides); + errs()<<"found strides: \n"; + for (const SCEV *S : strides) S->dump(); + return strides; +} + } //end of namespace //================== AffineAcces, Result of Analysis ========================================= @@ -263,13 +371,18 @@ AffineAcc *AffineAccess::promoteAccess(const AffineAcc &aa, const Loop *L, const for (const SCEV *Bd : aa.bounds){ if (!(SE.isLoopInvariant(Bd, L) && isSafeToExpandAt(Bd, InsPoint, SE))) return nullptr; //(4) } + + errs()<<"passed (4)\n"; + for (const SCEV *Str : aa.strides){ if (!(SE.isLoopInvariant(Str, L) && isSafeToExpandAt(Str, InsPoint, SE))) return nullptr; //(5) } + errs()<<"passed (5)\n"; + if (!isSafeToExpandAt(Bound, InsPoint, SE) || !isSafeToExpandAt(Stride, InsPoint, SE)) return nullptr; //(6) - errs()<<"passed (4), (5) & (6)\n"; + errs()<<"passed (6)\n"; AffineAcc *A = new 
AffineAcc(aa); A->data = Data; @@ -299,38 +412,25 @@ void AffineAccess::addAllAccesses(Instruction *Addr, const Loop *L){ errs()<<"has "<(accesses), AddrS); //never needed -> alloc in stack AffineAcc *A = &dim0; - const SCEV *CS = AddrS; for (auto L = cloops.begin(); L != cloops.end(); ++L){ if (loopReps.find(*L) == loopReps.end()) break; //this loop is malformed ==> this and all more outer loops cannot be used + if (Stride == strides.end()) break; //ran out of strides - errs()<<"finding stride in: "; CS->dump(); - const SCEV *Stride; - if (const auto *Rec = dyn_cast(CS)){ - if (Rec->getLoop() == *L) { - CS = Rec->getStart(); - Stride = Rec->getStepRecurrence(SE); - }else{ - bool occurs = false; - for (auto L_ = L; L_ != cloops.end(); ++L_) occurs = occurs && Rec->getLoop() == *L_; - if (!occurs) break; //AddRecExpr references a loop that is not a containing loop ==> cannot guarantee anything - Stride = SE.getConstant(APInt(64U, 0U)); //addrSCEV does not step in this loop ==> stride is 0 - } - }else{ - break; //did not manage to compute stride - } - assert(Stride); - errs()<<"found stride: "; Stride->dump(); - - A = promoteAccess(*A, *L, Stride); + A = promoteAccess(*A, *L, *Stride); if (A){ errs()<<"found AffineAcc:\n"; A->dump(); this->accesses.push_back(A); }else{ break; //did not manage to promote ==> cannot promote for loops further out } + + ++Stride; } errs()<<"\n"; return; @@ -341,6 +441,9 @@ ArrayRef AffineAccess::getAccesses() const{ return *ar; } +//TODO: 2D Stride = (1D Bound + 1) * 1D Stride +//TODO: fix below: do casts manually + Value *AffineAccess::expandData(const AffineAcc *aa, Type *ty) const{ SCEVExpander ex(SE, aa->L->getHeader()->getModule()->getDataLayout(), "data"); ex.setInsertPoint(aa->L->getLoopPreheader()->getTerminator()); @@ -539,4 +642,24 @@ AffineAccess &runOnFunction(Function &F, LoopInfo &LI, DominatorTree &DT, Scalar return *aa; } +*/ + +/* +errs()<<"finding stride in: "; CS->dump(); + const SCEV *Stride; + if (const auto *Rec = 
dyn_cast(CS)){ + if (Rec->getLoop() == *L) { + CS = Rec->getStart(); + Stride = Rec->getStepRecurrence(SE); + }else{ + bool occurs = false; + for (auto L_ = L; L_ != cloops.end(); ++L_) occurs = occurs && Rec->getLoop() == *L_; + if (!occurs) break; //AddRecExpr references a loop that is not a containing loop ==> cannot guarantee anything + Stride = SE.getConstant(APInt(64U, 0U)); //addrSCEV does not step in this loop ==> stride is 0 + } + }else{ + break; //did not manage to compute stride + } + assert(Stride); + errs()<<"found stride: "; Stride->dump(); */ \ No newline at end of file From 908b2a4dfba15258a8688e95140799cb4adf3d89 Mon Sep 17 00:00:00 2001 From: ThomasRupf Date: Tue, 19 Apr 2022 11:39:42 +0200 Subject: [PATCH 14/47] added inference graph and proper ssr en/dis --- .../llvm/Analysis/AffineAccessAnalysis.h | 2 + llvm/lib/Analysis/AffineAccessAnalysis.cpp | 41 ++++- llvm/lib/Transforms/SSR/SSRGeneration.cpp | 164 ++++++++++++++---- 3 files changed, 169 insertions(+), 38 deletions(-) diff --git a/llvm/include/llvm/Analysis/AffineAccessAnalysis.h b/llvm/include/llvm/Analysis/AffineAccessAnalysis.h index c4f52446a98a0..0c9101d02dc1b 100644 --- a/llvm/include/llvm/Analysis/AffineAccessAnalysis.h +++ b/llvm/include/llvm/Analysis/AffineAccessAnalysis.h @@ -32,6 +32,8 @@ class AffineAcc{ const Loop *getLoop() const; Instruction *getAddrIns() const; const SmallVector &getAccesses() const; + unsigned getNStore() const; + unsigned getNLoad() const; }; class AffineAccess{ diff --git a/llvm/lib/Analysis/AffineAccessAnalysis.cpp b/llvm/lib/Analysis/AffineAccessAnalysis.cpp index 9dfd18cdc8224..033f39df60b6c 100644 --- a/llvm/lib/Analysis/AffineAccessAnalysis.cpp +++ b/llvm/lib/Analysis/AffineAccessAnalysis.cpp @@ -80,6 +80,18 @@ const SmallVector &AffineAcc::getAccesses() const{ return this->accesses; } +unsigned AffineAcc::getNStore() const { + unsigned r = 0U; + for (Instruction *I : this->accesses) r += isa(I); + return r; +} + +unsigned AffineAcc::getNLoad() 
const { + unsigned r = 0U; + for (Instruction *I : this->accesses) r += isa(I); + return r; +} + //================== AffineAcces, helper functions ========================================= namespace { @@ -441,25 +453,38 @@ ArrayRef AffineAccess::getAccesses() const{ return *ar; } -//TODO: 2D Stride = (1D Bound + 1) * 1D Stride -//TODO: fix below: do casts manually +Value *castToSize(Value *R, Type *ty, Instruction *InsPoint){ + const DataLayout &DL = InsPoint->getParent()->getModule()->getDataLayout(); + Type *rty = R->getType(); + if (rty == ty) return R; + if (DL.getTypeSizeInBits(rty) > DL.getTypeSizeInBits(ty)) { + return CastInst::CreateTruncOrBitCast(R, ty, "scev.cast", InsPoint); + } + if (DL.getTypeSizeInBits(rty) < DL.getTypeSizeInBits(ty)) { + return CastInst::CreateZExtOrBitCast(R, ty, "scev.cast", InsPoint); + } + return CastInst::CreateBitOrPointerCast(R, ty, "scev.cast", InsPoint); +} Value *AffineAccess::expandData(const AffineAcc *aa, Type *ty) const{ + Instruction *InsPoint = aa->L->getLoopPreheader()->getTerminator(); SCEVExpander ex(SE, aa->L->getHeader()->getModule()->getDataLayout(), "data"); - ex.setInsertPoint(aa->L->getLoopPreheader()->getTerminator()); - return ex.expandCodeFor(aa->data, ty); + ex.setInsertPoint(InsPoint); + return castToSize(ex.expandCodeFor(aa->data), ty, InsPoint); } Value *AffineAccess::expandBound(const AffineAcc *aa, unsigned i, Type *ty) const{ + Instruction *InsPoint = aa->L->getLoopPreheader()->getTerminator(); SCEVExpander ex(SE, aa->L->getHeader()->getModule()->getDataLayout(), "bound"); - ex.setInsertPoint(aa->L->getLoopPreheader()->getTerminator()); - return ex.expandCodeFor(aa->bounds[i], ty); + ex.setInsertPoint(InsPoint); + return castToSize(ex.expandCodeFor(aa->bounds[i]), ty, InsPoint); } Value *AffineAccess::expandStride(const AffineAcc *aa, unsigned i, Type *ty) const{ + Instruction *InsPoint = aa->L->getLoopPreheader()->getTerminator(); SCEVExpander ex(SE, 
aa->L->getHeader()->getModule()->getDataLayout(), "stride"); - ex.setInsertPoint(aa->L->getLoopPreheader()->getTerminator()); - return ex.expandCodeFor(aa->strides[i], ty); + ex.setInsertPoint(InsPoint); + return castToSize(ex.expandCodeFor(aa->strides[i]), ty, InsPoint); } //================== Affine Acces Analysis ================================================== diff --git a/llvm/lib/Transforms/SSR/SSRGeneration.cpp b/llvm/lib/Transforms/SSR/SSRGeneration.cpp index e4bdc21794036..f975dcd491361 100644 --- a/llvm/lib/Transforms/SSR/SSRGeneration.cpp +++ b/llvm/lib/Transforms/SSR/SSRGeneration.cpp @@ -40,9 +40,13 @@ #include #include +#include +#include #include +#include -#define NUM_SSR 10U +#define NUM_SSR 3U +#define SSR_MAX_DIM 4U //current state of hw: only allow doubles #define CHECK_TYPE(I) (I->getType() == Type::getDoubleTy(I->getParent()->getContext())) @@ -134,7 +138,7 @@ void generateSSREnDis(const Loop *L){ return; } -bool conflictingAccesses(const AffineAcc *A, const AffineAcc *B){ +bool shareInsts(const AffineAcc *A, const AffineAcc *B){ if (A->getAddrIns() == B->getAddrIns()) return true; for (Instruction *IA : A->getAccesses()){ for (Instruction *IB : B->getAccesses()){ @@ -144,6 +148,107 @@ bool conflictingAccesses(const AffineAcc *A, const AffineAcc *B){ return false; } +bool isValid(const AffineAcc *A){ + if (A->getDimension() > SSR_MAX_DIM) return false; + unsigned n_store = 0U; + unsigned n_load = 0U; + bool valid = true; + for (auto *I : A->getAccesses()){ + valid = valid && CHECK_TYPE(I); + if (dyn_cast(I)) n_load++; + else if(dyn_cast(I)) n_store++; + else assert(false && "non load/store instruction in AffineAcc::accesses ?"); + if(!valid) break; + } + return valid && ((n_store > 0U && n_load == 0U)|| (n_store == 0U && n_load > 0U)); +} + +bool conflict(const AffineAcc *A, const AffineAcc *B){ + assert(!shareInsts(A, B) && "this AffineAcc's share instructions ==> one of them should be filtered"); + if (A->getNStore() == 0U && 
B->getNStore() == 0U) return false; //can intersect read only streams + //at this point one of them is store + for (BasicBlock *BB : A->getLoop()->getBlocks()) { if (B->getLoop()->contains(BB)) return true; } //loops contain each other + for (BasicBlock *BB : B->getLoop()->getBlocks()) { if (A->getLoop()->contains(BB)) return true; } //loops contain each other + return true; +} + +struct ConflictGraph{ + using NodeT = const AffineAcc *; + + ConflictGraph(ArrayRef accesses) { + for (auto A = accesses.begin(); A != accesses.end(); ++A){ + if (!isValid(*A)) continue; + conflicts.insert(std::make_pair(*A, std::move(std::vector()))); + mutexs.insert(std::make_pair(*A, std::move(std::vector()))); + for (auto B = accesses.begin(); B != A; ++B){ + if (shareInsts(*A, *B)){ + mutexs.find(*A)->second.push_back(*B); + mutexs.find(*B)->second.push_back(*A); + }else if (conflict(*A, *B)){ + conflicts.find(*A)->second.push_back(*B); + conflicts.find(*B)->second.push_back(*A); + } + } + } + } + + ///currently done greedily according to isBetter + std::map> &color(unsigned nColors){ + std::map> &color = *(new std::map>()); + std::vector accs; + for (const auto &A : conflicts) accs.push_back(A.first); + auto isBetter = [](NodeT A, NodeT B){ return A->getDimension() > B->getDimension(); }; + std::sort(accs.begin(), accs.end(), isBetter); + for (const auto &A : accs){ + bool done = false; + for (const auto &M : mutexs.find(A)->second){ + auto c = color.find(M); + if (c != color.end() && c->second.hasValue()) {//one mutex neighbour has color => A cannot get one + color.insert(std::make_pair(A, None)); + done = true; + break; + } + } + if (done) continue; //done with this A ==> go to next + + BitVector cs(nColors); + for (const auto &M : conflicts.find(A)->second){ + auto mc = color.find(M); + if (mc != color.end() && mc->second.hasValue()){ //neighbour has some color mc ==> A cannot get mc + cs[mc->second.getValue()] = true; + } + } + int c = cs.find_first_unset(); + if (c >= 0) 
color.insert(std::make_pair(A, (unsigned)c)); + else color.insert(std::make_pair(A, None)); + } + return color; + } + +private: + std::map> conflicts; //cannot get same color + std::map> mutexs; //if one gets a color the other cannot +}; + +void addChangedLoop(const Loop *NewL, SmallPtrSet &loops){ + //check whether L or any of its predecessors (parents, parents of parents, etc) are already marked for SSRenable & -disable + const Loop *L = NewL; + bool contained = false; + while (L && !contained){ + contained = contained || (loops.find(L) != loops.end()); + L = L->getParentLoop(); + } + if (!contained){ + //check for all loops in loops whether NewL contains them + std::vector dels; //cannot directly delete loops in foreach loops ==> store here first + for (const Loop *L : loops){ + if (NewL->contains(L->getHeader())) dels.push_back(L); + } + for (const Loop *L : dels) loops.erase(L); + loops.insert(NewL); + } +} + } //end of namespace PreservedAnalyses SSRGenerationPass::run(Function &F, FunctionAnalysisManager &FAM){ @@ -152,13 +257,33 @@ PreservedAnalyses SSRGenerationPass::run(Function &F, FunctionAnalysisManager &F errs()<<"SSR Generation Pass on function: "< changedLoops; + SmallPtrSet changedLoops; auto accs = AF.getAccesses(); - std::vector allaccesses; + + ConflictGraph g(accs); + const auto &clr = g.color(NUM_SSR); + + for (const auto &C : clr){ + if (C.second.hasValue()){ //not None + unsigned n_store = C.first->getNStore(), n_load = C.first->getNLoad(); + generateSSR(AF, C.first, C.second.getValue(), n_store + n_load, n_store > 0U); + + addChangedLoop(C.first->getLoop(), changedLoops); + } + } + + for (const Loop *L : changedLoops) generateSSREnDis(L); + + return changedLoops.empty() ? 
PreservedAnalyses::all() : PreservedAnalyses::none(); +} + + +/* +std::vector allaccesses; for (const AffineAcc *A : accs) allaccesses.push_back(A); - //sort by dimension + //sort by dimension ascending std::sort(allaccesses.begin(), allaccesses.end(), [](const AffineAcc *A, const AffineAcc *B){return A->getDimension() <= B->getDimension();}); errs()<<"total of "< accesses; while (!allaccesses.empty()){ auto A = allaccesses.back(); allaccesses.pop_back(); + if (!isValid(A)) continue; bool conflict = false; for (auto B : accesses){ - conflict = conflict || conflictingAccesses(A, B); + conflict = conflict || shareInsts(A, B); } if (!conflict) accesses.push_back(A); } @@ -178,31 +304,9 @@ PreservedAnalyses SSRGenerationPass::run(Function &F, FunctionAnalysisManager &F unsigned dmid = 0U; for (const AffineAcc *A : accesses){ if (dmid >= NUM_SSR) break; - - unsigned n_store = 0U; - unsigned n_load = 0U; - bool valid = true; - for (auto *I : A->getAccesses()){ - valid = valid && CHECK_TYPE(I); - if (dyn_cast(I)) n_load++; - else if(dyn_cast(I)) n_store++; - else assert(false && "non load/store instruction in AffineAcc::accesses ?"); - - if(!valid) break; - } - - errs()<<"current aa is "<dump(); + unsigned n_store = A->getNStore(), n_load = A->getNLoad(); generateSSR(AF, A, dmid, n_store + n_load, n_store > 0U); - errs()<<"done with generation\n\n"; changedLoops.insert(A->getLoop()); dmid++; } - - for (const Loop *L : changedLoops) generateSSREnDis(L); - - return changedLoops.empty() ? 
PreservedAnalyses::all() : PreservedAnalyses::none(); -} \ No newline at end of file + */ \ No newline at end of file From e2b5ad33ce01dac246e2bcceb0f502e9c8fc1fa6 Mon Sep 17 00:00:00 2001 From: ThomasRupf Date: Wed, 20 Apr 2022 11:52:53 +0200 Subject: [PATCH 15/47] current state --- .../llvm/Analysis/AffineAccessAnalysis.h | 17 ++- llvm/lib/Analysis/AffineAccessAnalysis.cpp | 133 +++++++++++++++--- llvm/lib/Passes/PassBuilder.cpp | 1 - llvm/lib/Transforms/SSR/SSRGeneration.cpp | 103 +++++++------- llvm/lib/Transforms/SSR/SSRInference.cpp | 22 ++- 5 files changed, 188 insertions(+), 88 deletions(-) diff --git a/llvm/include/llvm/Analysis/AffineAccessAnalysis.h b/llvm/include/llvm/Analysis/AffineAccessAnalysis.h index 0c9101d02dc1b..7c61a2bf78fc4 100644 --- a/llvm/include/llvm/Analysis/AffineAccessAnalysis.h +++ b/llvm/include/llvm/Analysis/AffineAccessAnalysis.h @@ -7,6 +7,7 @@ #include #include +#include namespace llvm { @@ -19,8 +20,8 @@ class AffineAcc{ AffineAcc(Instruction *Addr, ArrayRef accesses, const SCEV *data); const SCEV *data; - SmallVector bounds; //from outer- to innermost loop - SmallVector strides; //from outer- to innermost loop + SmallVector bounds; //from outer- to innermost loop + SmallVector strides; //from outer- to innermost loop Instruction *Addr; SmallVector accesses; //load/store instructions that use address (guaranteed to be in same loop) const Loop *L; //outermost loop @@ -38,18 +39,24 @@ class AffineAcc{ class AffineAccess{ private: - SmallVector accesses; - DenseMap loopReps; + SmallVector accesses; //accesses + DenseMap loopReps; //wellformed loops & their bt counts + DenseSet addresses; //already checked address instructions ScalarEvolution &SE; DominatorTree &DT; LoopInfo &LI; + public: AffineAccess(ScalarEvolution &SE, DominatorTree &DT, LoopInfo &LI); AffineAccess() = delete; void addAllAccesses(Instruction *Addr, const Loop *L); AffineAcc *promoteAccess(const AffineAcc &Acc, const Loop *L, const SCEV *Stride); + std::pair 
splitLoadStore(const AffineAcc *Acc) const; ArrayRef getAccesses() const; - + bool accessPatternsMatch(const AffineAcc *A, const AffineAcc *B) const; + bool shareInsts(const AffineAcc *A, const AffineAcc *B) const; + bool conflictWWWR(const AffineAcc *A, const AffineAcc *B) const; + const SCEV *wellFormedLoopBTCount(const Loop *L) const; //returns bt count if loop is well-formed Value *expandData(const AffineAcc *aa, Type *ty = (Type *)nullptr) const; Value *expandBound(const AffineAcc *aa, unsigned i, Type *ty = (Type *)nullptr) const; Value *expandStride(const AffineAcc *aa, unsigned i, Type *ty = (Type *)nullptr) const; diff --git a/llvm/lib/Analysis/AffineAccessAnalysis.cpp b/llvm/lib/Analysis/AffineAccessAnalysis.cpp index 033f39df60b6c..402dce74998da 100644 --- a/llvm/lib/Analysis/AffineAccessAnalysis.cpp +++ b/llvm/lib/Analysis/AffineAccessAnalysis.cpp @@ -44,7 +44,7 @@ using namespace llvm; AffineAcc::AffineAcc(Instruction *Addr, ArrayRef accesses, - const SCEV *data) : data(data), Addr(Addr){ + const SCEV *data) : data(data), Addr(Addr), L(nullptr){ for (Instruction *I : accesses) this->accesses.push_back(I); return; } @@ -55,7 +55,8 @@ unsigned AffineAcc::getDimension() const{ void AffineAcc::dump() const{ errs()<<"Affine Access in Loop:\n"; - L->dump(); + if (L) L->dump(); + else errs()<<"nullptr\n"; errs()<<"With Addr instruction: "; Addr->dump(); errs()<<"And the following load/store instructions:\n"; for (Instruction *I : accesses){ @@ -213,11 +214,13 @@ Optional predicatedICmpOutcome(ICmpInst *Cmp, const SCEV *Rep, ScalarEvolu bool isOnAllPredicatedControlFlowPaths(const BasicBlock *BB, const Loop *L, const DominatorTree &DT, const SCEV *Rep, ScalarEvolution &SE){ if (isOnAllControlFlowPaths(BB, L, DT)) return true; //is on all paths anyway Rep->dump(); - + DenseSet vis; //visited set std::deque q(1U, L->getHeader()); //iterative BFS with queue while (!q.empty()){ BasicBlock *Current = q.front(); q.pop_front(); if (Current == BB) continue; //do not 
continue BFS from BB + if (vis.find(Current) == vis.end()) continue; //already visited this block + vis.insert(Current); Instruction *T = Current->getTerminator(); T->dump(); @@ -328,6 +331,19 @@ SmallVector &findStrides(const SCEV *Addr, ArrayRefgetParent()->getModule()->getDataLayout(); + Type *rty = R->getType(); + if (rty == ty) return R; + if (DL.getTypeSizeInBits(rty) > DL.getTypeSizeInBits(ty)) { + return CastInst::CreateTruncOrBitCast(R, ty, "scev.cast", InsPoint); + } + if (DL.getTypeSizeInBits(rty) < DL.getTypeSizeInBits(ty)) { + return CastInst::CreateZExtOrBitCast(R, ty, "scev.cast", InsPoint); + } + return CastInst::CreateBitOrPointerCast(R, ty, "scev.cast", InsPoint); +} + } //end of namespace //================== AffineAcces, Result of Analysis ========================================= @@ -354,6 +370,8 @@ AffineAccess::AffineAccess(ScalarEvolution &SE, DominatorTree &DT, LoopInfo &LI) /// (5) forall st : aa.strides. SE.isLoopInvariant(st, L) && isSafeToExpandAt(st, LPreheader->getTerminator(), SE) /// (6) isSafeToExpandAt(Bound / Stride, LPreheader->getTerminator(), SE) AffineAcc *AffineAccess::promoteAccess(const AffineAcc &aa, const Loop *L, const SCEV *Stride){ + aa.dump(); + assert(!aa.L || (aa.L && !aa.L->isInvalid())); assert((!aa.L) || (aa.L && aa.L->getParentLoop() == L && "can only promote to parent loop")); //(1) assert(this->loopReps.find(L) != this->loopReps.end() && "L is well formed"); //(1) @@ -369,7 +387,7 @@ AffineAcc *AffineAccess::promoteAccess(const AffineAcc &aa, const Loop *L, const } } - errs()<<"passed (2)\n"; + errs()<<"passed (2), "; const SCEV *Bound = this->loopReps.find(L)->getSecond(); Instruction *InsPoint = L->getLoopPreheader()->getTerminator(); @@ -378,23 +396,23 @@ AffineAcc *AffineAccess::promoteAccess(const AffineAcc &aa, const Loop *L, const const SCEV *Data = SE.SplitIntoInitAndPostInc(L, aa.data).first; if (!isSafeToExpandAt(Data, InsPoint, SE)) return nullptr; //(3.2) - errs()<<"passed (3)\n"; + 
errs()<<"passed (3), "; for (const SCEV *Bd : aa.bounds){ if (!(SE.isLoopInvariant(Bd, L) && isSafeToExpandAt(Bd, InsPoint, SE))) return nullptr; //(4) } - errs()<<"passed (4)\n"; + errs()<<"passed (4), "; for (const SCEV *Str : aa.strides){ if (!(SE.isLoopInvariant(Str, L) && isSafeToExpandAt(Str, InsPoint, SE))) return nullptr; //(5) } - errs()<<"passed (5)\n"; + errs()<<"passed (5), "; if (!isSafeToExpandAt(Bound, InsPoint, SE) || !isSafeToExpandAt(Stride, InsPoint, SE)) return nullptr; //(6) - errs()<<"passed (6)\n"; + errs()<<"passed (6)"; AffineAcc *A = new AffineAcc(aa); A->data = Data; @@ -406,10 +424,22 @@ AffineAcc *AffineAccess::promoteAccess(const AffineAcc &aa, const Loop *L, const /// adds all affine accesses that use Addr in loop L void AffineAccess::addAllAccesses(Instruction *Addr, const Loop *L){ + if (addresses.find(Addr) != addresses.end()) return; //already called addAllAccesses on this Addr instruction + addresses.insert(Addr); + + //find all accesses std::vector accesses; for (auto U = Addr->use_begin(); U != Addr->use_end(); ++U){ Instruction *Acc = dyn_cast(U->getUser()); - if (!Acc) Acc = dyn_cast(U->getUser()); + if (Acc){ //load inst + bool unvaildUser = false; + for (auto AU = Acc->use_begin(); AU != Acc->use_end(); ++AU){ + auto *I = dyn_cast(AU->getUser()); + unvaildUser = unvaildUser || !I || !isOnAllControlFlowPaths(I->getParent(), L, DT); + } + if (unvaildUser) continue; //skip this load it has users ouside of loop or not on all control flow paths + } + if (!Acc) Acc = dyn_cast(U->getUser()); //try to cast to store if (!Acc) continue; //both casts failed ==> not a suitable instruction if (!isOnAllControlFlowPaths(Acc->getParent(), L, DT)) continue; //access does not occur consistently in loop ==> not suitable accesses.push_back(Acc); @@ -420,7 +450,9 @@ void AffineAccess::addAllAccesses(Instruction *Addr, const Loop *L){ const SCEV *AddrS = SE.getSCEV(Addr); - auto &cloops = getContainingLoops(LI.getLoopsInPreorder(), Addr); + //we 
are looking at containing loops of all the accesses (guaranteed to be all the same) + //Addr ins might be outside of loop (licm) if 1D stride is 0 + auto &cloops = getContainingLoops(LI.getLoopsInPreorder(), accesses[0]); errs()<<"has "<dump(); this->accesses.push_back(A); @@ -444,26 +477,81 @@ void AffineAccess::addAllAccesses(Instruction *Addr, const Loop *L){ ++Stride; } - errs()<<"\n"; + errs()<<"we now have "<accesses.size()<<" affine accesses\n"; return; } +std::pair AffineAccess::splitLoadStore(const AffineAcc *Acc) const{ + unsigned nLoad = Acc->getNLoad(), nStore = Acc->getNStore(); + if (nLoad > 0U && nStore == 0U) return std::make_pair(Acc, nullptr); + if (nLoad == 0U && nStore > 0U) return std::make_pair(nullptr, Acc); + AffineAcc *L = new AffineAcc(*Acc); //copy + AffineAcc *S = new AffineAcc(*Acc); //copy + L->accesses.clear(); + S->accesses.clear(); + for (Instruction *I : Acc->getAccesses()){ + if (isa(I)) L->accesses.push_back(I); + else if(isa(I)) S->accesses.push_back(I); + } + return std::make_pair(L, S); +} + ArrayRef AffineAccess::getAccesses() const{ ArrayRef *ar = new ArrayRef(accesses.begin(), accesses.end()); return *ar; } -Value *castToSize(Value *R, Type *ty, Instruction *InsPoint){ - const DataLayout &DL = InsPoint->getParent()->getModule()->getDataLayout(); - Type *rty = R->getType(); - if (rty == ty) return R; - if (DL.getTypeSizeInBits(rty) > DL.getTypeSizeInBits(ty)) { - return CastInst::CreateTruncOrBitCast(R, ty, "scev.cast", InsPoint); +bool AffineAccess::accessPatternsMatch(const AffineAcc *A, const AffineAcc *B) const { + if (!SCEVEquals(A->data, B->data, SE)) return false; + if (A->getDimension() != B->getDimension()) return false; + for (unsigned i = 0; i < A->getDimension(); i++){ + if (!SCEVEquals(A->bounds[i], B->bounds[i], SE)) return false; + if (!SCEVEquals(A->strides[i], B->strides[i], SE)) return false; } - if (DL.getTypeSizeInBits(rty) < DL.getTypeSizeInBits(ty)) { - return CastInst::CreateZExtOrBitCast(R, ty, 
"scev.cast", InsPoint); + return true; +} + +bool AffineAccess::shareInsts(const AffineAcc *A, const AffineAcc *B) const{ + for (Instruction *IA : A->getAccesses()){ + for (Instruction *IB : B->getAccesses()){ + if (IA == IB) return true; + } } - return CastInst::CreateBitOrPointerCast(R, ty, "scev.cast", InsPoint); + return false; +} + +bool AffineAccess::conflictWWWR(const AffineAcc *A, const AffineAcc *B) const { + assert(!shareInsts(A, B) && "these AffineAcc's share instructions ==> one of them should be filtered"); + unsigned nstA = A->getNStore(), nstB = B->getNStore(); + if (nstA == 0U && nstB == 0U) return false; //can intersect read only streams + //at this point at least one of them is store + + //special case: no conflict, if + // - exactly one of them is a store + // - they have the same access pattern (AffineAccessAnalysis::accessPatternsMatch) + // - all loads dominate all stores in the loop (ie. read before write) + if ((nstA && !nstB) || (!nstA && nstB)){ + if (accessPatternsMatch(A, B)){ + A = nstA ? A : B; //A is store + B = nstA ? 
B : A; //B is load + bool check = true; + for (Instruction *IL : B->getAccesses()){ + for (Instruction *IS : A->getAccesses()){ + check = check && DT.dominates(IL, IS); + } + } + if (check) return false; + } + } + + if (A->getLoop()->contains(B->getLoop()) || B->getLoop()->contains(A->getLoop())) return true; + return false; +} + +const SCEV *AffineAccess::wellFormedLoopBTCount(const Loop *L) const{ + auto P = loopReps.find(L); + if (P == loopReps.end()) return nullptr; //loop not well-formed; + return P->getSecond(); } Value *AffineAccess::expandData(const AffineAcc *aa, Type *ty) const{ @@ -503,6 +591,9 @@ AffineAccess AffineAccessAnalysis::run(Function &F, FunctionAnalysisManager &FAM AffineAccess *A = new AffineAccess(SE, DT, LI); for (const Loop *L : LI.getLoopsInPreorder()){ + assert(L); + assert(!L->isInvalid()); + if (!A->wellFormedLoopBTCount(L)) continue; //loop not well-formed for (BasicBlock *BB : L->getBlocks()){ if (!isOnAllControlFlowPaths(BB, L, DT)) continue; for (Instruction &I : *BB){ @@ -522,7 +613,7 @@ AffineAccess AffineAccessAnalysis::run(Function &F, FunctionAnalysisManager &FAM } } - return *A; + return std::move(*A); } //================== Affine Acces Analysis Pass for opt ======================================= diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp index 36f0765b484c8..99794b6488c74 100644 --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -595,7 +595,6 @@ PassBuilder::buildO1FunctionSimplificationPipeline(OptimizationLevel Level, FPM.addPass(createFunctionToLoopPassAdaptor( std::move(LPM1), EnableMSSALoopDependency, /*UseBlockFrequencyInfo=*/true, DebugLogging)); - FPM.addPass(SSRInferencePass()); FPM.addPass(SimplifyCFGPass()); FPM.addPass(InstCombinePass()); diff --git a/llvm/lib/Transforms/SSR/SSRGeneration.cpp b/llvm/lib/Transforms/SSR/SSRGeneration.cpp index f975dcd491361..84443724065a3 100644 --- a/llvm/lib/Transforms/SSR/SSRGeneration.cpp +++ 
b/llvm/lib/Transforms/SSR/SSRGeneration.cpp @@ -56,7 +56,7 @@ using namespace llvm; namespace{ ///generates SSR setup calls -void generateSSR(AffineAccess &AA, const AffineAcc *aa, unsigned dmid, unsigned n_insts, bool isStore){ +void generateSSR(AffineAccess &AA, const AffineAcc *aa, unsigned dmid, bool isStore){ BasicBlock *LoopPreheader = aa->getLoop()->getLoopPreheader(); Module *mod = LoopPreheader->getModule(); LLVMContext &ctxt = LoopPreheader->getContext(); @@ -67,23 +67,14 @@ void generateSSR(AffineAccess &AA, const AffineAcc *aa, unsigned dmid, unsigned ConstantInt *dm = ConstantInt::get(i32, dmid); //datamover id, ty=i32 ConstantInt *dim = ConstantInt::get(i32, aa->getDimension() - 1U); //dimension - 1, ty=i32 Value *data = AA.expandData(aa, Type::getInt8PtrTy(ctxt)); - Function *SSRReadSetup; + Function *SSRSetup; if (!isStore){ - SSRReadSetup = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_read_imm); //can take _imm bc dm and dim are constant + SSRSetup = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_read_imm); //can take _imm bc dm and dim are constant }else{ - SSRReadSetup = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_write_imm); //can take _imm bc dm and dim are constant + SSRSetup = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_write_imm); //can take _imm bc dm and dim are constant } std::array args = {dm, dim, data}; - builder.CreateCall(SSRReadSetup->getFunctionType(), SSRReadSetup, ArrayRef(args)); - - errs()<<"generated ssr_read/write_imm \n"; - - ConstantInt *rep = ConstantInt::get(i32, n_insts - 1U); //repetition - 1, ty=i32 - Function *SSRRepetitionSetup = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_setup_repetition); - std::array repargs = {dm, rep}; - builder.CreateCall(SSRRepetitionSetup->getFunctionType(), SSRRepetitionSetup, ArrayRef(repargs)); - - errs()<<"generated ssr_setup_repetitions \n"; + builder.CreateCall(SSRSetup->getFunctionType(), SSRSetup, ArrayRef(args))->dump(); Intrinsic::RISCVIntrinsics 
functions[] = { Intrinsic::riscv_ssr_setup_bound_stride_1d, @@ -97,30 +88,41 @@ void generateSSR(AffineAccess &AA, const AffineAcc *aa, unsigned dmid, unsigned Function *SSRBoundStrideSetup = Intrinsic::getDeclaration(mod, functions[d]); std::array bsargs = {dm, bound, stride}; - builder.CreateCall(SSRBoundStrideSetup->getFunctionType(), SSRBoundStrideSetup, ArrayRef(bsargs)); - - errs()<<"generated ssr_setup_bound_stride_"<<(d+1)<<"d \n"; + auto *C = builder.CreateCall(SSRBoundStrideSetup->getFunctionType(), SSRBoundStrideSetup, ArrayRef(bsargs)); + C->dump(); } + unsigned n_reps = 0U; if (isStore){ Function *SSRPush = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_push); for (Instruction *I : aa->getAccesses()){ std::array pusharg = {dm, cast(I)->getValueOperand()}; builder.SetInsertPoint(I); - builder.CreateCall(SSRPush->getFunctionType(), SSRPush, ArrayRef(pusharg)); + auto *C = builder.CreateCall(SSRPush->getFunctionType(), SSRPush, ArrayRef(pusharg)); + C->dump(); + I->dump(); I->removeFromParent(); + n_reps++; } }else{ Function *SSRPop = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_pop); std::array poparg = {dm}; for (Instruction *I : aa->getAccesses()){ + for (auto U = I->user_begin(); U != I->user_end(); ++U) n_reps++; //reps for load is its nr of uses builder.SetInsertPoint(I); - Value *v = builder.CreateCall(SSRPop->getFunctionType(), SSRPop, ArrayRef(poparg), "ssr.pop"); + Instruction *V = builder.CreateCall(SSRPop->getFunctionType(), SSRPop, ArrayRef(poparg), "ssr.pop"); + V->dump(); + I->dump(); BasicBlock::iterator ii(I); - ReplaceInstWithValue(I->getParent()->getInstList(), ii, v); + ReplaceInstWithValue(I->getParent()->getInstList(), ii, V); } } - errs()<<"placed push/pop calls\n"; + + builder.SetInsertPoint(LoopPreheader->getTerminator()); + ConstantInt *rep = ConstantInt::get(i32, n_reps - 1U); //repetition - 1, ty=i32 + Function *SSRRepetitionSetup = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_setup_repetition); + std::array 
repargs = {dm, rep}; + builder.CreateCall(SSRRepetitionSetup->getFunctionType(), SSRRepetitionSetup, ArrayRef(repargs))->dump(); return; } @@ -138,17 +140,8 @@ void generateSSREnDis(const Loop *L){ return; } -bool shareInsts(const AffineAcc *A, const AffineAcc *B){ - if (A->getAddrIns() == B->getAddrIns()) return true; - for (Instruction *IA : A->getAccesses()){ - for (Instruction *IB : B->getAccesses()){ - if (IA == IB) return true; - } - } - return false; -} - bool isValid(const AffineAcc *A){ + if (!A) return false; if (A->getDimension() > SSR_MAX_DIM) return false; unsigned n_store = 0U; unsigned n_load = 0U; @@ -160,31 +153,24 @@ bool isValid(const AffineAcc *A){ else assert(false && "non load/store instruction in AffineAcc::accesses ?"); if(!valid) break; } - return valid && ((n_store > 0U && n_load == 0U)|| (n_store == 0U && n_load > 0U)); -} - -bool conflict(const AffineAcc *A, const AffineAcc *B){ - assert(!shareInsts(A, B) && "this AffineAcc's share instructions ==> one of them should be filtered"); - if (A->getNStore() == 0U && B->getNStore() == 0U) return false; //can intersect read only streams - //at this point one of them is store - for (BasicBlock *BB : A->getLoop()->getBlocks()) { if (B->getLoop()->contains(BB)) return true; } //loops contain each other - for (BasicBlock *BB : B->getLoop()->getBlocks()) { if (A->getLoop()->contains(BB)) return true; } //loops contain each other - return true; + return valid && ((n_store > 0U && n_load == 0U) || (n_store == 0U && n_load > 0U)); } struct ConflictGraph{ using NodeT = const AffineAcc *; - ConflictGraph(ArrayRef accesses) { - for (auto A = accesses.begin(); A != accesses.end(); ++A){ - if (!isValid(*A)) continue; - conflicts.insert(std::make_pair(*A, std::move(std::vector()))); - mutexs.insert(std::make_pair(*A, std::move(std::vector()))); - for (auto B = accesses.begin(); B != A; ++B){ - if (shareInsts(*A, *B)){ + ///accs assumed to be valid + ConflictGraph(const AffineAccess &AF, ArrayRef accesses) : 
AF(AF){ + for (auto A = accesses.begin(); A != accesses.end(); A++){ + conflicts.insert(std::make_pair(*A, std::vector())); + mutexs.insert(std::make_pair(*A, std::vector())); + for (auto B = accesses.begin(); B != A; B++){ + assert(conflicts.find(*B) != conflicts.end()); + assert(mutexs.find(*B) != mutexs.end()); + if (AF.shareInsts(*A, *B)){ mutexs.find(*A)->second.push_back(*B); mutexs.find(*B)->second.push_back(*A); - }else if (conflict(*A, *B)){ + }else if (AF.conflictWWWR(*A, *B)){ conflicts.find(*A)->second.push_back(*B); conflicts.find(*B)->second.push_back(*A); } @@ -193,7 +179,7 @@ struct ConflictGraph{ } ///currently done greedily according to isBetter - std::map> &color(unsigned nColors){ + std::map> &color(unsigned nColors) { std::map> &color = *(new std::map>()); std::vector accs; for (const auto &A : conflicts) accs.push_back(A.first); @@ -226,8 +212,9 @@ struct ConflictGraph{ } private: + const AffineAccess &AF; std::map> conflicts; //cannot get same color - std::map> mutexs; //if one gets a color the other cannot + std::map> mutexs; //if one gets a color the other cannot get any color }; void addChangedLoop(const Loop *NewL, SmallPtrSet &loops){ @@ -261,13 +248,21 @@ PreservedAnalyses SSRGenerationPass::run(Function &F, FunctionAnalysisManager &F auto accs = AF.getAccesses(); - ConflictGraph g(accs); + std::vector goodAccs; + for (const AffineAcc *A : accs){ + auto p = AF.splitLoadStore(A); + if (p.first && isValid(p.first)) goodAccs.push_back(p.first); + if (p.second && isValid(p.second)) goodAccs.push_back(p.second); + } + + ConflictGraph g(AF, ArrayRef(goodAccs)); const auto &clr = g.color(NUM_SSR); + errs()<<"computed coloring\n"; for (const auto &C : clr){ if (C.second.hasValue()){ //not None - unsigned n_store = C.first->getNStore(), n_load = C.first->getNLoad(); - generateSSR(AF, C.first, C.second.getValue(), n_store + n_load, n_store > 0U); + generateSSR(AF, C.first, C.second.getValue(), C.first->getNStore() > 0U); + errs()<<"generated ssr 
insts \n"; addChangedLoop(C.first->getLoop(), changedLoops); } diff --git a/llvm/lib/Transforms/SSR/SSRInference.cpp b/llvm/lib/Transforms/SSR/SSRInference.cpp index a78ccfcbad13c..3a6244ddee29e 100644 --- a/llvm/lib/Transforms/SSR/SSRInference.cpp +++ b/llvm/lib/Transforms/SSR/SSRInference.cpp @@ -24,7 +24,10 @@ #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/AffineAccessAnalysis.h" #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" +#include "llvm/Transforms/Utils/FixIrreducible.h" +#include "llvm/Transforms/InstCombine/InstCombine.h" #include "llvm/Transforms/Scalar/LoopStrengthReduce.h" +#include "llvm/Transforms/Scalar/SimplifyCFG.h" #include "llvm/Transforms/SSR/SSRGeneration.h" #include "llvm/Support/CommandLine.h" @@ -46,12 +49,17 @@ using namespace llvm; PreservedAnalyses SSRInferencePass::run(Function &F, FunctionAnalysisManager &FAM){ - errs()<<"SSR Inference Pass on function: "< Date: Thu, 21 Apr 2022 14:59:25 +0200 Subject: [PATCH 16/47] current state --- .../llvm/Analysis/AffineAccessAnalysis.h | 5 +-- llvm/lib/Analysis/AffineAccessAnalysis.cpp | 32 ++++++++++--------- llvm/lib/Passes/PassBuilder.cpp | 4 +++ llvm/lib/Transforms/SSR/SSRGeneration.cpp | 13 ++++---- llvm/lib/Transforms/SSR/SSRInference.cpp | 2 +- 5 files changed, 32 insertions(+), 24 deletions(-) diff --git a/llvm/include/llvm/Analysis/AffineAccessAnalysis.h b/llvm/include/llvm/Analysis/AffineAccessAnalysis.h index 7c61a2bf78fc4..bf7b6104e74e4 100644 --- a/llvm/include/llvm/Analysis/AffineAccessAnalysis.h +++ b/llvm/include/llvm/Analysis/AffineAccessAnalysis.h @@ -20,8 +20,8 @@ class AffineAcc{ AffineAcc(Instruction *Addr, ArrayRef accesses, const SCEV *data); const SCEV *data; - SmallVector bounds; //from outer- to innermost loop - SmallVector strides; //from outer- to innermost loop + SmallVector bounds; //from inner- to outermost loop + SmallVector strides; //from inner- to outermost loop Instruction *Addr; SmallVector accesses; //load/store 
instructions that use address (guaranteed to be in same loop) const Loop *L; //outermost loop @@ -56,6 +56,7 @@ class AffineAccess{ bool accessPatternsMatch(const AffineAcc *A, const AffineAcc *B) const; bool shareInsts(const AffineAcc *A, const AffineAcc *B) const; bool conflictWWWR(const AffineAcc *A, const AffineAcc *B) const; + bool shareLoops(const AffineAcc *A, const AffineAcc *B) const; const SCEV *wellFormedLoopBTCount(const Loop *L) const; //returns bt count if loop is well-formed Value *expandData(const AffineAcc *aa, Type *ty = (Type *)nullptr) const; Value *expandBound(const AffineAcc *aa, unsigned i, Type *ty = (Type *)nullptr) const; diff --git a/llvm/lib/Analysis/AffineAccessAnalysis.cpp b/llvm/lib/Analysis/AffineAccessAnalysis.cpp index 402dce74998da..96a812e7ef5e4 100644 --- a/llvm/lib/Analysis/AffineAccessAnalysis.cpp +++ b/llvm/lib/Analysis/AffineAccessAnalysis.cpp @@ -263,8 +263,10 @@ SmallVector &getContainingLoops(ArrayRef loopsPr } void findStridesRec(const SCEV *Addr, ArrayRef loops, SmallVector &factors, ScalarEvolution &SE, SmallVector &res){ + if (!res.empty()) { errs()<<"stride: "; res.back()->dump(); } errs()<<"finding strides in "; Addr->dump(); if (loops.empty()) return; + errs()<<"loop header: "<getHeader()->getNameOrAsOperand()<<"\n"; switch (Addr->getSCEVType()) { case SCEVTypes::scAddRecExpr: @@ -283,7 +285,7 @@ void findStridesRec(const SCEV *Addr, ArrayRef loops, SmallVector< for (const Loop *L : loops) occurs = occurs || AddRec->getLoop() == L; //loops needs to occur further up, o/w invalid if (!occurs) return; res.push_back(SE.getConstant(APInt(64U, 0U))); - findStridesRec(AddRec->getStart(), loops, factors, SE, res); + findStridesRec(AddRec, ArrayRef(loops.begin()+1, loops.end()), factors, SE, res); } return; } @@ -326,8 +328,6 @@ SmallVector &findStrides(const SCEV *Addr, ArrayRef &strides = *(new SmallVector()); SmallVector factors; findStridesRec(Addr, loops, factors, SE, strides); - errs()<<"found strides: \n"; - for 
(const SCEV *S : strides) S->dump(); return strides; } @@ -370,7 +370,6 @@ AffineAccess::AffineAccess(ScalarEvolution &SE, DominatorTree &DT, LoopInfo &LI) /// (5) forall st : aa.strides. SE.isLoopInvariant(st, L) && isSafeToExpandAt(st, LPreheader->getTerminator(), SE) /// (6) isSafeToExpandAt(Bound / Stride, LPreheader->getTerminator(), SE) AffineAcc *AffineAccess::promoteAccess(const AffineAcc &aa, const Loop *L, const SCEV *Stride){ - aa.dump(); assert(!aa.L || (aa.L && !aa.L->isInvalid())); assert((!aa.L) || (aa.L && aa.L->getParentLoop() == L && "can only promote to parent loop")); //(1) assert(this->loopReps.find(L) != this->loopReps.end() && "L is well formed"); //(1) @@ -392,7 +391,7 @@ AffineAcc *AffineAccess::promoteAccess(const AffineAcc &aa, const Loop *L, const const SCEV *Bound = this->loopReps.find(L)->getSecond(); Instruction *InsPoint = L->getLoopPreheader()->getTerminator(); - if (!SE.hasComputableLoopEvolution(aa.data, L)) return nullptr; //(3.1) + if (!SE.hasComputableLoopEvolution(aa.data, L) && !SE.isLoopInvariant(aa.data, L)) return nullptr; //(3.1) const SCEV *Data = SE.SplitIntoInitAndPostInc(L, aa.data).first; if (!isSafeToExpandAt(Data, InsPoint, SE)) return nullptr; //(3.2) @@ -464,9 +463,11 @@ void AffineAccess::addAllAccesses(Instruction *Addr, const Loop *L){ for (auto L = cloops.begin(); L != cloops.end(); ++L){ if (loopReps.find(*L) == loopReps.end()) break; //this loop is malformed ==> this and all more outer loops cannot be used - if (Stride == strides.end()) break; //ran out of strides + const SCEV *Str; + if (Stride != strides.end()) Str = *(Stride++); + else Str = SE.getConstant(IntegerType::getInt32Ty((*L)->getHeader()->getContext()), 0U); //if we run out of strides we can still promote with stride=0 - A = promoteAccess(*A, *L, *Stride); + A = promoteAccess(*A, *L, Str); errs()<<"\n"; if (A){ errs()<<"found AffineAcc:\n"; A->dump(); @@ -474,8 +475,6 @@ void AffineAccess::addAllAccesses(Instruction *Addr, const Loop *L){ 
}else{ break; //did not manage to promote ==> cannot promote for loops further out } - - ++Stride; } errs()<<"we now have "<accesses.size()<<" affine accesses\n"; return; @@ -523,7 +522,7 @@ bool AffineAccess::shareInsts(const AffineAcc *A, const AffineAcc *B) const{ bool AffineAccess::conflictWWWR(const AffineAcc *A, const AffineAcc *B) const { assert(!shareInsts(A, B) && "these AffineAcc's share instructions ==> one of them should be filtered"); unsigned nstA = A->getNStore(), nstB = B->getNStore(); - if (nstA == 0U && nstB == 0U) return false; //can intersect read only streams + if (nstA == 0U && nstB == 0U) return false; //can intersect read streams //at this point at least one of them is store //special case: no conflict, if @@ -548,27 +547,31 @@ bool AffineAccess::conflictWWWR(const AffineAcc *A, const AffineAcc *B) const { return false; } -const SCEV *AffineAccess::wellFormedLoopBTCount(const Loop *L) const{ +bool AffineAccess::shareLoops(const AffineAcc *A, const AffineAcc *B) const { + return A->getLoop() == B->getLoop() || A->getLoop()->contains(B->getLoop()) || B->getLoop()->contains(A->getLoop()); +} + +const SCEV *AffineAccess::wellFormedLoopBTCount(const Loop *L) const { auto P = loopReps.find(L); if (P == loopReps.end()) return nullptr; //loop not well-formed; return P->getSecond(); } -Value *AffineAccess::expandData(const AffineAcc *aa, Type *ty) const{ +Value *AffineAccess::expandData(const AffineAcc *aa, Type *ty) const { Instruction *InsPoint = aa->L->getLoopPreheader()->getTerminator(); SCEVExpander ex(SE, aa->L->getHeader()->getModule()->getDataLayout(), "data"); ex.setInsertPoint(InsPoint); return castToSize(ex.expandCodeFor(aa->data), ty, InsPoint); } -Value *AffineAccess::expandBound(const AffineAcc *aa, unsigned i, Type *ty) const{ +Value *AffineAccess::expandBound(const AffineAcc *aa, unsigned i, Type *ty) const { Instruction *InsPoint = aa->L->getLoopPreheader()->getTerminator(); SCEVExpander ex(SE, 
aa->L->getHeader()->getModule()->getDataLayout(), "bound"); ex.setInsertPoint(InsPoint); return castToSize(ex.expandCodeFor(aa->bounds[i]), ty, InsPoint); } -Value *AffineAccess::expandStride(const AffineAcc *aa, unsigned i, Type *ty) const{ +Value *AffineAccess::expandStride(const AffineAcc *aa, unsigned i, Type *ty) const { Instruction *InsPoint = aa->L->getLoopPreheader()->getTerminator(); SCEVExpander ex(SE, aa->L->getHeader()->getModule()->getDataLayout(), "stride"); ex.setInsertPoint(InsPoint); @@ -607,7 +610,6 @@ AffineAccess AffineAccessAnalysis::run(Function &F, FunctionAnalysisManager &FAM } Instruction *AddrIns; if (!(AddrIns = dyn_cast(Addr))) continue; //if Addr is not instruction ==> constant, or sth else (==> leave for other passes to opt) - A->addAllAccesses(AddrIns, L); } } diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp index 99794b6488c74..8d38d59a21d67 100644 --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -627,6 +627,8 @@ PassBuilder::buildO1FunctionSimplificationPipeline(OptimizationLevel Level, FPM.addPass(InstCombinePass()); invokePeepholeEPCallbacks(FPM, Level); + FPM.addPass(SSRInferencePass()); + if (PTO.Coroutines) FPM.addPass(CoroElidePass()); @@ -800,6 +802,8 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level, FPM.addPass(InstCombinePass()); invokePeepholeEPCallbacks(FPM, Level); + FPM.addPass(SSRInferencePass()); + // Re-consider control flow based optimizations after redundancy elimination, // redo DCE, etc. 
FPM.addPass(JumpThreadingPass()); diff --git a/llvm/lib/Transforms/SSR/SSRGeneration.cpp b/llvm/lib/Transforms/SSR/SSRGeneration.cpp index 84443724065a3..d7e56dfcb4f8e 100644 --- a/llvm/lib/Transforms/SSR/SSRGeneration.cpp +++ b/llvm/lib/Transforms/SSR/SSRGeneration.cpp @@ -95,20 +95,21 @@ void generateSSR(AffineAccess &AA, const AffineAcc *aa, unsigned dmid, bool isSt unsigned n_reps = 0U; if (isStore){ Function *SSRPush = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_push); + std::vector del; for (Instruction *I : aa->getAccesses()){ std::array pusharg = {dm, cast(I)->getValueOperand()}; builder.SetInsertPoint(I); auto *C = builder.CreateCall(SSRPush->getFunctionType(), SSRPush, ArrayRef(pusharg)); C->dump(); I->dump(); - I->removeFromParent(); + del.push_back(I); n_reps++; } + for (Instruction *I : del) I->removeFromParent(); }else{ Function *SSRPop = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_pop); std::array poparg = {dm}; for (Instruction *I : aa->getAccesses()){ - for (auto U = I->user_begin(); U != I->user_end(); ++U) n_reps++; //reps for load is its nr of uses builder.SetInsertPoint(I); Instruction *V = builder.CreateCall(SSRPop->getFunctionType(), SSRPop, ArrayRef(poparg), "ssr.pop"); V->dump(); @@ -167,10 +168,10 @@ struct ConflictGraph{ for (auto B = accesses.begin(); B != A; B++){ assert(conflicts.find(*B) != conflicts.end()); assert(mutexs.find(*B) != mutexs.end()); - if (AF.shareInsts(*A, *B)){ + if (AF.shareInsts(*A, *B) || AF.conflictWWWR(*A, *B)){ mutexs.find(*A)->second.push_back(*B); mutexs.find(*B)->second.push_back(*A); - }else if (AF.conflictWWWR(*A, *B)){ + }else if (AF.shareLoops(*A, *B)){ conflicts.find(*A)->second.push_back(*B); conflicts.find(*B)->second.push_back(*A); } @@ -201,7 +202,7 @@ struct ConflictGraph{ for (const auto &M : conflicts.find(A)->second){ auto mc = color.find(M); if (mc != color.end() && mc->second.hasValue()){ //neighbour has some color mc ==> A cannot get mc - cs[mc->second.getValue()] = true; + 
cs[mc->second.getValue()] = 1u; } } int c = cs.find_first_unset(); @@ -229,7 +230,7 @@ void addChangedLoop(const Loop *NewL, SmallPtrSet &loops){ //check for all loops in loops whether NewL contains them std::vector dels; //cannot directly delete loops in foreach loops ==> store here first for (const Loop *L : loops){ - if (NewL->contains(L->getHeader())) dels.push_back(L); + if (NewL->contains(L)) dels.push_back(L); } for (const Loop *L : dels) loops.erase(L); loops.insert(NewL); diff --git a/llvm/lib/Transforms/SSR/SSRInference.cpp b/llvm/lib/Transforms/SSR/SSRInference.cpp index 3a6244ddee29e..f2eb224fd3b27 100644 --- a/llvm/lib/Transforms/SSR/SSRInference.cpp +++ b/llvm/lib/Transforms/SSR/SSRInference.cpp @@ -54,7 +54,7 @@ PreservedAnalyses SSRInferencePass::run(Function &F, FunctionAnalysisManager &FA FPM.addPass(FixIrreduciblePass());//turn some non-loops into loops FPM.addPass(LoopSimplifyPass()); //canonicalize loops FPM.addPass(LCSSAPass()); //put loops into LCSSA-form - FPM.addPass(createFunctionToLoopPassAdaptor(LoopStrengthReducePass())); //loop strength reduction + //FPM.addPass(createFunctionToLoopPassAdaptor(LoopStrengthReducePass())); //loop strength reduction FPM.addPass(SSRGenerationPass()); //runs AffineAccess analysis and generates SSR intrinsics FPM.addPass(SimplifyCFGPass()); //simplifies CFG again FPM.addPass(InstCombinePass()); //removes phi nodes from LCSSA From 6fe4215d75cc8ed06562d1e676bd569156aaafa5 Mon Sep 17 00:00:00 2001 From: ThomasRupf Date: Sun, 24 Apr 2022 19:01:20 +0200 Subject: [PATCH 17/47] work on cloning --- cmd_out.txt | 0 llvm/lib/Analysis/AffineAccessAnalysis.cpp | 15 ++-- llvm/lib/Transforms/SSR/SSRGeneration.cpp | 94 ++++++++++++++++++---- 3 files changed, 87 insertions(+), 22 deletions(-) create mode 100644 cmd_out.txt diff --git a/cmd_out.txt b/cmd_out.txt new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/llvm/lib/Analysis/AffineAccessAnalysis.cpp b/llvm/lib/Analysis/AffineAccessAnalysis.cpp index 
96a812e7ef5e4..f6f90ac97acf5 100644 --- a/llvm/lib/Analysis/AffineAccessAnalysis.cpp +++ b/llvm/lib/Analysis/AffineAccessAnalysis.cpp @@ -121,7 +121,11 @@ bool checkLoop(const Loop *L, DominatorTree &DT, ScalarEvolution &SE, Instructio } Optional> toSameSize(const SCEV *LHS, const SCEV *RHS, ScalarEvolution &SE, bool unsafe = false){ + assert(LHS && RHS); using PT = std::pair; + if (LHS->getType() == RHS->getType()) return Optional(std::make_pair(LHS, RHS)); //trivially the same size + if (!LHS->getType()->isSized() || !RHS->getType()->isSized()) return None; + //TODO: use datalayout for size instead if (LHS->getType()->getIntegerBitWidth() > RHS->getType()->getIntegerBitWidth()) { if (auto LHSx = dyn_cast(LHS)){ if (LHSx->getAPInt().getActiveBits() <= RHS->getType()->getIntegerBitWidth()) {} @@ -150,14 +154,11 @@ bool SCEVEquals(const SCEV *LHS, const SCEV *RHS, ScalarEvolution &SE){ if (!p.hasValue()) return false; LHS = p.getValue().first; RHS = p.getValue().second; - errs()<<"SCEVEquals:\n\t"; LHS->dump(); - errs()<<"\t"; RHS->dump(); if (LHS == RHS) return true; //trivially the same if this holds (bc const Ptr) else{ const SCEVPredicate *Peq = SE.getEqualPredicate(LHS, RHS); if (Peq->isAlwaysTrue()) return true; //if we arrive at setup addr scev, we are done } - errs()<<"false\n"; return false; } @@ -531,11 +532,11 @@ bool AffineAccess::conflictWWWR(const AffineAcc *A, const AffineAcc *B) const { // - all loads dominate all stores in the loop (ie. read before write) if ((nstA && !nstB) || (!nstA && nstB)){ if (accessPatternsMatch(A, B)){ - A = nstA ? A : B; //A is store - B = nstA ? B : A; //B is load + const AffineAcc *S = nstA ? A : B; //store + const AffineAcc *L = nstA ? 
B : A; //load bool check = true; - for (Instruction *IL : B->getAccesses()){ - for (Instruction *IS : A->getAccesses()){ + for (Instruction *IL : L->getAccesses()){ + for (Instruction *IS : S->getAccesses()){ check = check && DT.dominates(IL, IS); } } diff --git a/llvm/lib/Transforms/SSR/SSRGeneration.cpp b/llvm/lib/Transforms/SSR/SSRGeneration.cpp index d7e56dfcb4f8e..d7520ee37b8de 100644 --- a/llvm/lib/Transforms/SSR/SSRGeneration.cpp +++ b/llvm/lib/Transforms/SSR/SSRGeneration.cpp @@ -22,12 +22,15 @@ #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/AffineAccessAnalysis.h" +#include "llvm/Analysis/MemorySSA.h" +#include "llvm/Analysis/MemorySSAUpdater.h" #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Support/Casting.h" +#include "llvm/Support/FormatVariadic.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/IntrinsicInst.h" @@ -49,12 +52,51 @@ #define SSR_MAX_DIM 4U //current state of hw: only allow doubles -#define CHECK_TYPE(I) (I->getType() == Type::getDoubleTy(I->getParent()->getContext())) +#define CHECK_TYPE(T, I) (T == Type::getDoubleTy(I->getParent()->getContext())) using namespace llvm; namespace{ +BasicBlock *cloneRegion(Instruction *BeginWith, Instruction *EndBefore, DomTreeUpdater *DTU, LoopInfo *LI, MemorySSAUpdater *MSSAU){ + BasicBlock *Begin = BeginWith->getParent(); + BasicBlock *Head = splitBlockBefore(Begin, BeginWith, DTU, LI, MSSAU, "split.head"); + BasicBlock *End = splitBlockBefore(EndBefore->getParent(), EndBefore, DTU, LI, MSSAU, "fuse.again"); + std::deque q; //bfs queue + q.push_back(Begin); + DenseSet vis; //bfs visited set + DenseMap clones; //value in orig -> value in clone + std::vector> operandsCleanup; //store operands that reference instructions that are not cloned yet + + while (!q.empty()){ + BasicBlock *C = q.front(); 
q.pop_front(); + if (C == End || vis.find(C) != vis.end()) continue; + BasicBlock *Cc = BasicBlock::Create(C->getContext(), Twine(C->getName()).concat(".clone"), C->getParent(), C); + IRBuilder<> builder(Cc); + for (Instruction &I : *C){ + Instruction *Ic = I.clone(); + assert(Ic->use_empty() && "no uses of clone"); + builder.Insert(Ic, Twine(I.getName()).concat(".clone")); + for (unsigned i = 0; i < Ic->getNumOperands(); i++){ + auto A = clones.find(Ic->getOperand(i)); + if (A != clones.end()){ + Ic->setOperand(i, A->second); + bool userUpdate = false; + for (User *U : A->second->users()) userUpdate |= U == Ic; + assert(userUpdate && "user is updated on setOperand"); + }else{ + operandsCleanup.push_back(std::make_pair(i, Ic)); + } + } + clones.insert(std::make_pair(&I, Ic)); + } + + } + //TODO: change terminator of Head to be CondBr with TakeOrig as cond + //TODO: operandCleanup + return Head; +} + ///generates SSR setup calls void generateSSR(AffineAccess &AA, const AffineAcc *aa, unsigned dmid, bool isStore){ BasicBlock *LoopPreheader = aa->getLoop()->getLoopPreheader(); @@ -76,15 +118,22 @@ void generateSSR(AffineAccess &AA, const AffineAcc *aa, unsigned dmid, bool isSt std::array args = {dm, dim, data}; builder.CreateCall(SSRSetup->getFunctionType(), SSRSetup, ArrayRef(args))->dump(); + std::vector bds, sts; + for (unsigned d = 0U; d < aa->getDimension(); d++){ + bds.push_back(AA.expandBound(aa, d, i32)); //bound - 1, ty=i32 + sts.push_back(AA.expandStride(aa, d, i32)); //relative stride, ty=i32 + } + Intrinsic::RISCVIntrinsics functions[] = { Intrinsic::riscv_ssr_setup_bound_stride_1d, Intrinsic::riscv_ssr_setup_bound_stride_2d, Intrinsic::riscv_ssr_setup_bound_stride_3d, Intrinsic::riscv_ssr_setup_bound_stride_4d }; - for (unsigned d = 0U; d < aa->getDimension(); d++){ - Value *bound = AA.expandBound(aa, d, i32); //bound - 1, ty=i32 - Value *stride = AA.expandStride(aa, d, i32); //relative stride, ty=i32 + + for (unsigned d = 0U; d < aa->getDimension(); 
d++){ + Value *bound = bds[d]; + Value *stride = sts[d]; Function *SSRBoundStrideSetup = Intrinsic::getDeclaration(mod, functions[d]); std::array bsargs = {dm, bound, stride}; @@ -95,17 +144,15 @@ void generateSSR(AffineAccess &AA, const AffineAcc *aa, unsigned dmid, bool isSt unsigned n_reps = 0U; if (isStore){ Function *SSRPush = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_push); - std::vector del; for (Instruction *I : aa->getAccesses()){ std::array pusharg = {dm, cast(I)->getValueOperand()}; builder.SetInsertPoint(I); auto *C = builder.CreateCall(SSRPush->getFunctionType(), SSRPush, ArrayRef(pusharg)); C->dump(); I->dump(); - del.push_back(I); + I->eraseFromParent(); n_reps++; } - for (Instruction *I : del) I->removeFromParent(); }else{ Function *SSRPop = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_pop); std::array poparg = {dm}; @@ -148,10 +195,13 @@ bool isValid(const AffineAcc *A){ unsigned n_load = 0U; bool valid = true; for (auto *I : A->getAccesses()){ - valid = valid && CHECK_TYPE(I); - if (dyn_cast(I)) n_load++; - else if(dyn_cast(I)) n_store++; - else assert(false && "non load/store instruction in AffineAcc::accesses ?"); + if (dyn_cast(I)) { + n_load++; + valid = valid && CHECK_TYPE(I->getType(), I); + }else if(auto St = dyn_cast(I)) { + n_store++; + valid = valid && CHECK_TYPE(St->getValueOperand()->getType(), I); + }else assert(false && "non load/store instruction in AffineAcc::accesses ?"); if(!valid) break; } return valid && ((n_store > 0U && n_load == 0U) || (n_store == 0U && n_load > 0U)); @@ -162,12 +212,11 @@ struct ConflictGraph{ ///accs assumed to be valid ConflictGraph(const AffineAccess &AF, ArrayRef accesses) : AF(AF){ + errs()<<"conflict graph with "<())); mutexs.insert(std::make_pair(*A, std::vector())); for (auto B = accesses.begin(); B != A; B++){ - assert(conflicts.find(*B) != conflicts.end()); - assert(mutexs.find(*B) != mutexs.end()); if (AF.shareInsts(*A, *B) || AF.conflictWWWR(*A, *B)){ 
mutexs.find(*A)->second.push_back(*B); mutexs.find(*B)->second.push_back(*A); @@ -242,6 +291,11 @@ void addChangedLoop(const Loop *NewL, SmallPtrSet &loops){ PreservedAnalyses SSRGenerationPass::run(Function &F, FunctionAnalysisManager &FAM){ AffineAccess &AF = FAM.getResult(F); + LoopInfo &LI = FAM.getResult(F); + DominatorTree &DT = FAM.getResult(F); + DomTreeUpdater DTU(&DT, DomTreeUpdater::UpdateStrategy::Eager); + auto &MA = FAM.getResult(F); + MemorySSAUpdater MSSAU(&MA.getMSSA()); errs()<<"SSR Generation Pass on function: "< goodAccs; for (const AffineAcc *A : accs){ + A->dump(); auto p = AF.splitLoadStore(A); if (p.first && isValid(p.first)) goodAccs.push_back(p.first); if (p.second && isValid(p.second)) goodAccs.push_back(p.second); @@ -262,16 +317,24 @@ PreservedAnalyses SSRGenerationPass::run(Function &F, FunctionAnalysisManager &F for (const auto &C : clr){ if (C.second.hasValue()){ //not None + cloneRegion(C.first->getLoop()->getLoopPreheader()->getTerminator(), C.first->getLoop()->getExitBlock()->getTerminator(), &DTU, &LI, &MSSAU); generateSSR(AF, C.first, C.second.getValue(), C.first->getNStore() > 0U); errs()<<"generated ssr insts \n"; - + addChangedLoop(C.first->getLoop(), changedLoops); } } for (const Loop *L : changedLoops) generateSSREnDis(L); - return changedLoops.empty() ? PreservedAnalyses::all() : PreservedAnalyses::none(); + for (BasicBlock &BB : F) BB.dump(); + + if (changedLoops.empty()){ + return PreservedAnalyses::all(); + }else{ + F.addFnAttr(Attribute::AttrKind::NoInline); //mark function as no-inline, because there can be intersecting streams if function is inlined! + return PreservedAnalyses::none(); + } } @@ -305,4 +368,5 @@ std::vector allaccesses; changedLoops.insert(A->getLoop()); dmid++; } + */ \ No newline at end of file From 3540f176879ec666c3f3b9008b478f24848b61a9 Mon Sep 17 00:00:00 2001 From: ThomasRupf Date: Mon, 25 Apr 2022 11:39:33 +0200 Subject: [PATCH 18/47] working cloning & SSR guard? 
--- .../llvm/Analysis/AffineAccessAnalysis.h | 6 +- llvm/lib/Analysis/AffineAccessAnalysis.cpp | 27 +- llvm/lib/Transforms/SSR/SSRGeneration.cpp | 426 ++++++++++++++---- 3 files changed, 352 insertions(+), 107 deletions(-) diff --git a/llvm/include/llvm/Analysis/AffineAccessAnalysis.h b/llvm/include/llvm/Analysis/AffineAccessAnalysis.h index bf7b6104e74e4..09d591c4c124a 100644 --- a/llvm/include/llvm/Analysis/AffineAccessAnalysis.h +++ b/llvm/include/llvm/Analysis/AffineAccessAnalysis.h @@ -58,9 +58,9 @@ class AffineAccess{ bool conflictWWWR(const AffineAcc *A, const AffineAcc *B) const; bool shareLoops(const AffineAcc *A, const AffineAcc *B) const; const SCEV *wellFormedLoopBTCount(const Loop *L) const; //returns bt count if loop is well-formed - Value *expandData(const AffineAcc *aa, Type *ty = (Type *)nullptr) const; - Value *expandBound(const AffineAcc *aa, unsigned i, Type *ty = (Type *)nullptr) const; - Value *expandStride(const AffineAcc *aa, unsigned i, Type *ty = (Type *)nullptr) const; + Value *expandData(const AffineAcc *aa, Type *ty = (Type *)nullptr, Instruction *InsertBefore = (Instruction *)nullptr) const; + Value *expandBound(const AffineAcc *aa, unsigned i, Type *ty = (Type *)nullptr, Instruction *InsertBefore = (Instruction *)nullptr) const; + Value *expandStride(const AffineAcc *aa, unsigned i, Type *ty = (Type *)nullptr, Instruction *InsertBefore = (Instruction *)nullptr) const; }; class AffineAccessAnalysis : public AnalysisInfoMixin { diff --git a/llvm/lib/Analysis/AffineAccessAnalysis.cpp b/llvm/lib/Analysis/AffineAccessAnalysis.cpp index f6f90ac97acf5..a22bcde912272 100644 --- a/llvm/lib/Analysis/AffineAccessAnalysis.cpp +++ b/llvm/lib/Analysis/AffineAccessAnalysis.cpp @@ -558,25 +558,28 @@ const SCEV *AffineAccess::wellFormedLoopBTCount(const Loop *L) const { return P->getSecond(); } -Value *AffineAccess::expandData(const AffineAcc *aa, Type *ty) const { - Instruction *InsPoint = aa->L->getLoopPreheader()->getTerminator(); +Value 
*AffineAccess::expandData(const AffineAcc *aa, Type *ty, Instruction *InsertBefore) const { + InsertBefore = InsertBefore ? InsertBefore : aa->L->getLoopPreheader()->getTerminator(); + assert(isSafeToExpandAt(aa->data, InsertBefore, SE) && "data not expanable here (note: only preheader guaranteed)"); SCEVExpander ex(SE, aa->L->getHeader()->getModule()->getDataLayout(), "data"); - ex.setInsertPoint(InsPoint); - return castToSize(ex.expandCodeFor(aa->data), ty, InsPoint); + ex.setInsertPoint(InsertBefore); + return castToSize(ex.expandCodeFor(aa->data), ty, InsertBefore); } -Value *AffineAccess::expandBound(const AffineAcc *aa, unsigned i, Type *ty) const { - Instruction *InsPoint = aa->L->getLoopPreheader()->getTerminator(); +Value *AffineAccess::expandBound(const AffineAcc *aa, unsigned i, Type *ty, Instruction *InsertBefore) const { + InsertBefore = InsertBefore ? InsertBefore : aa->L->getLoopPreheader()->getTerminator(); + assert(isSafeToExpandAt(aa->bounds[i], InsertBefore, SE) && "bound not expanable here (note: only preheader guaranteed)"); SCEVExpander ex(SE, aa->L->getHeader()->getModule()->getDataLayout(), "bound"); - ex.setInsertPoint(InsPoint); - return castToSize(ex.expandCodeFor(aa->bounds[i]), ty, InsPoint); + ex.setInsertPoint(InsertBefore); + return castToSize(ex.expandCodeFor(aa->bounds[i]), ty, InsertBefore); } -Value *AffineAccess::expandStride(const AffineAcc *aa, unsigned i, Type *ty) const { - Instruction *InsPoint = aa->L->getLoopPreheader()->getTerminator(); +Value *AffineAccess::expandStride(const AffineAcc *aa, unsigned i, Type *ty, Instruction *InsertBefore) const { + InsertBefore = InsertBefore ? 
InsertBefore : aa->L->getLoopPreheader()->getTerminator(); + assert(isSafeToExpandAt(aa->strides[i], InsertBefore, SE) && "bound not expanable here (note: only preheader guaranteed)"); SCEVExpander ex(SE, aa->L->getHeader()->getModule()->getDataLayout(), "stride"); - ex.setInsertPoint(InsPoint); - return castToSize(ex.expandCodeFor(aa->strides[i]), ty, InsPoint); + ex.setInsertPoint(InsertBefore); + return castToSize(ex.expandCodeFor(aa->strides[i]), ty, InsertBefore); } //================== Affine Acces Analysis ================================================== diff --git a/llvm/lib/Transforms/SSR/SSRGeneration.cpp b/llvm/lib/Transforms/SSR/SSRGeneration.cpp index d7520ee37b8de..620785ae28c3f 100644 --- a/llvm/lib/Transforms/SSR/SSRGeneration.cpp +++ b/llvm/lib/Transforms/SSR/SSRGeneration.cpp @@ -50,128 +50,249 @@ #define NUM_SSR 3U #define SSR_MAX_DIM 4U - +//both are inclusive! +#define SSR_SCRATCHPAD_BEGIN 1000U +#define SSR_SCRATCHPAD_END 18000U //current state of hw: only allow doubles #define CHECK_TYPE(T, I) (T == Type::getDoubleTy(I->getParent()->getContext())) using namespace llvm; +///Wraps an AffineAcc *Access, expands all its SCEVs in constructor +struct GenSSR{ +private: + Value *Base; + ConstantInt *DMID; + SmallVector, SSR_MAX_DIM> offsets; + //Instruction *AvailableFrom; + +public: + ///AffineAcc which is wrapped by this GenSSR + const AffineAcc *Access; + + ///expand data, bound, and stride + GenSSR(const AffineAcc *A, unsigned dmid, Instruction *ExpandBefore, AffineAccess &AF) : Access(A) { + auto &ctxt = ExpandBefore->getParent()->getContext(); + Type *i32 = IntegerType::getInt32Ty(ctxt); + DMID = cast(ConstantInt::get(i32, dmid)); + Base = AF.expandData(A, Type::getInt8PtrTy(ctxt), ExpandBefore); + for (unsigned i = 0U; i < A->getDimension(); i++){ + offsets.push_back(std::make_pair(AF.expandBound(A, i, i32, ExpandBefore), AF.expandStride(A, i, i32, ExpandBefore))); + } + } + + ///generate comparisons + Value *GenerateSSRGuard(Instruction 
*ExpandBefore){ + auto &ctxt = ExpandBefore->getParent()->getContext(); + Type *i64 = IntegerType::getInt64Ty(ctxt); + IRBuilder<> builder(ExpandBefore); + std::vector checks; + for (unsigned i = 0U; i < Access->getDimension(); i++){ + /// loop has to be taken at least once (>= 1) ==> bound >= 0 + /// SGE also works for unsigned int: if the bound is unsigned and larger than 2^30 it will be too large for the scratchpad anyway + checks.push_back(builder.CreateICmpSGE(getBound(i), ConstantInt::get(Type::getInt32Ty(ExpandBefore->getContext()), 0U))); + } + errs()<<"before ptr to i64 \n"; + errs()<(checks)); + } + + ///generate setup instructions in loop preheader + void GenerateSetup(){ + Instruction *Point = Access->getLoop()->getLoopPreheader()->getTerminator(); + Module *mod = Point->getModule(); + IRBuilder<> builder(Point); + Type *i32 = Type::getInt32Ty(Point->getContext()); + Constant *dim = ConstantInt::get(i32, Access->getDimension() - 1U); //dimension - 1, ty=i32 + bool isStore = Access->getNStore() > 0u; + + Function *SSRSetup; + if (!isStore){ + SSRSetup = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_read_imm); //can take _imm bc dm and dim are constant + }else{ + SSRSetup = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_write_imm); //can take _imm bc dm and dim are constant + } + std::array args = {getDMID(), dim, getBase()}; + builder.CreateCall(SSRSetup->getFunctionType(), SSRSetup, ArrayRef(args))->dump(); + + Intrinsic::RISCVIntrinsics functions[] = { + Intrinsic::riscv_ssr_setup_bound_stride_1d, + Intrinsic::riscv_ssr_setup_bound_stride_2d, + Intrinsic::riscv_ssr_setup_bound_stride_3d, + Intrinsic::riscv_ssr_setup_bound_stride_4d + }; + for (unsigned i = 0u; i < Access->getDimension(); i++){ + Function *SSRBoundStrideSetup = Intrinsic::getDeclaration(mod, functions[i]); + std::array bsargs = {getDMID(), getBound(i), getStride(i)}; + builder.CreateCall(SSRBoundStrideSetup->getFunctionType(), SSRBoundStrideSetup, ArrayRef(bsargs))->dump(); + 
} + + unsigned n_reps = 0U; + if (isStore){ + Function *SSRPush = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_push); + for (Instruction *I : Access->getAccesses()){ + std::array pusharg = {getDMID(), cast(I)->getValueOperand()}; + builder.SetInsertPoint(I); + auto *C = builder.CreateCall(SSRPush->getFunctionType(), SSRPush, ArrayRef(pusharg)); + C->dump(); I->dump(); + I->eraseFromParent(); + n_reps++; + } + }else{ + Function *SSRPop = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_pop); + std::array poparg = {getDMID()}; + for (Instruction *I : Access->getAccesses()){ + builder.SetInsertPoint(I); + Instruction *V = builder.CreateCall(SSRPop->getFunctionType(), SSRPop, ArrayRef(poparg), "ssr.pop"); + V->dump(); I->dump(); + BasicBlock::iterator ii(I); + ReplaceInstWithValue(I->getParent()->getInstList(), ii, V); + n_reps++; + } + } + + builder.SetInsertPoint(Point); + Constant *Rep = ConstantInt::get(i32, n_reps - 1U); //repetition - 1, ty=i32 + Function *SSRRepetitionSetup = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_setup_repetition); + std::array repargs = {getDMID(), Rep}; + builder.CreateCall(SSRRepetitionSetup->getFunctionType(), SSRRepetitionSetup, ArrayRef(repargs))->dump(); + return; + } + + Value *getBase() { return Base; } + Value *getBound(unsigned i) { return offsets[i].first; } + Value *getStride(unsigned i) { return offsets[i].second; } + ConstantInt *getDMID() { return DMID; } +}; + namespace{ -BasicBlock *cloneRegion(Instruction *BeginWith, Instruction *EndBefore, DomTreeUpdater *DTU, LoopInfo *LI, MemorySSAUpdater *MSSAU){ +void copyPHIsFromPred(BasicBlock *BB){ + BasicBlock *Pred = BB->getSinglePredecessor(); + assert(Pred && "only works for blocks with one single predecessor"); + assert(BB->getTerminator() && "need at least one non-phi node in BB"); + for (Instruction &I : *Pred){ + if (auto *Phi = dyn_cast(&I)){ + PHINode *PhiC = PHINode::Create(Phi->getType(), 1u, Twine(Phi->getName()).concat(".copy"), 
BB->getFirstNonPHI()); + Phi->replaceAllUsesWith(PhiC); + PhiC->addIncoming(Phi, Pred); + errs()<<"replaced all uses of "<<*Phi<<" with "<<*PhiC<<"\n"; + } + } +} + +///clones code from BeginWith up to EndBefore +///assumes all cf-paths from begin lead to end (or return) +///assumes there is a phi node for each value defined in the region that will be cloned in the block of EndBefore that is live after EndBefore +BranchInst *cloneRegion(Instruction *BeginWith, Instruction *EndBefore, DominatorTree &DT, DomTreeUpdater *DTU, LoopInfo *LI, MemorySSAUpdater *MSSAU){ + errs()<<"cloning from "<<*BeginWith<<" up to "<<*EndBefore<<"\n"; BasicBlock *Begin = BeginWith->getParent(); BasicBlock *Head = splitBlockBefore(Begin, BeginWith, DTU, LI, MSSAU, "split.head"); - BasicBlock *End = splitBlockBefore(EndBefore->getParent(), EndBefore, DTU, LI, MSSAU, "fuse.again"); + copyPHIsFromPred(Begin); //copy Phi's from Head to Begin + BasicBlock *End = EndBefore->getParent(); + BasicBlock *Fuse = splitBlockBefore(EndBefore->getParent(), EndBefore, DTU, LI, MSSAU, "fuse.prep"); + copyPHIsFromPred(End); std::deque q; //bfs queue q.push_back(Begin); DenseSet vis; //bfs visited set - DenseMap clones; //value in orig -> value in clone + DenseMap clones; //value in orig -> value in clone (INV: orig and clone are of same class) std::vector> operandsCleanup; //store operands that reference instructions that are not cloned yet while (!q.empty()){ BasicBlock *C = q.front(); q.pop_front(); if (C == End || vis.find(C) != vis.end()) continue; + vis.insert(C); BasicBlock *Cc = BasicBlock::Create(C->getContext(), Twine(C->getName()).concat(".clone"), C->getParent(), C); + clones.insert(std::make_pair(C, Cc)); //BasicBlock <: Value, needed for branches IRBuilder<> builder(Cc); for (Instruction &I : *C){ Instruction *Ic = I.clone(); assert(Ic->use_empty() && "no uses of clone"); - builder.Insert(Ic, Twine(I.getName()).concat(".clone")); + if (I.getType()->isVoidTy() || I.getType()->isLabelTy()) Ic = 
builder.Insert(Ic); //insert without name + else Ic = builder.Insert(Ic, Twine(I.getName()).concat(".clone")); for (unsigned i = 0; i < Ic->getNumOperands(); i++){ auto A = clones.find(Ic->getOperand(i)); if (A != clones.end()){ - Ic->setOperand(i, A->second); - bool userUpdate = false; - for (User *U : A->second->users()) userUpdate |= U == Ic; - assert(userUpdate && "user is updated on setOperand"); + Ic->setOperand(i, A->second); //this also updates uses of A->second + //check users update in A->second + bool userUpdate = false; for (User *U : A->second->users()) {userUpdate = userUpdate || U == Ic; } assert(userUpdate && "user is updated on setOperand"); }else{ operandsCleanup.push_back(std::make_pair(i, Ic)); } } - clones.insert(std::make_pair(&I, Ic)); + clones.insert(std::make_pair(&I, Ic)); //add Ic as clone of I + } + auto succs = successors(C); + for (auto S = succs.begin(); S != succs.end(); ++S) { + q.push_back(*S); } - - } - //TODO: change terminator of Head to be CondBr with TakeOrig as cond - //TODO: operandCleanup - return Head; -} - -///generates SSR setup calls -void generateSSR(AffineAccess &AA, const AffineAcc *aa, unsigned dmid, bool isStore){ - BasicBlock *LoopPreheader = aa->getLoop()->getLoopPreheader(); - Module *mod = LoopPreheader->getModule(); - LLVMContext &ctxt = LoopPreheader->getContext(); - IntegerType *i32 = IntegerType::getInt32Ty(ctxt); - - IRBuilder<> builder(LoopPreheader->getTerminator()); - - ConstantInt *dm = ConstantInt::get(i32, dmid); //datamover id, ty=i32 - ConstantInt *dim = ConstantInt::get(i32, aa->getDimension() - 1U); //dimension - 1, ty=i32 - Value *data = AA.expandData(aa, Type::getInt8PtrTy(ctxt)); - Function *SSRSetup; - if (!isStore){ - SSRSetup = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_read_imm); //can take _imm bc dm and dim are constant - }else{ - SSRSetup = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_write_imm); //can take _imm bc dm and dim are constant } - std::array args = {dm, dim, 
data}; - builder.CreateCall(SSRSetup->getFunctionType(), SSRSetup, ArrayRef(args))->dump(); - - std::vector bds, sts; - for (unsigned d = 0U; d < aa->getDimension(); d++){ - bds.push_back(AA.expandBound(aa, d, i32)); //bound - 1, ty=i32 - sts.push_back(AA.expandStride(aa, d, i32)); //relative stride, ty=i32 + //operandCleanup + for (const auto &p : operandsCleanup){ //p.first = index of operand that needs to be changed to clone in p.second + auto A = clones.find(p.second->getOperand(p.first)); + if (A != clones.end()){ + p.second->setOperand(p.first, A->second); + }else{ + errs()<<"cloneRegion: did not find "<getOperand(p.first)->getNameOrAsOperand()<<"\n"; + } } - - Intrinsic::RISCVIntrinsics functions[] = { - Intrinsic::riscv_ssr_setup_bound_stride_1d, - Intrinsic::riscv_ssr_setup_bound_stride_2d, - Intrinsic::riscv_ssr_setup_bound_stride_3d, - Intrinsic::riscv_ssr_setup_bound_stride_4d - }; - - for (unsigned d = 0U; d < aa->getDimension(); d++){ - Value *bound = bds[d]; - Value *stride = sts[d]; - - Function *SSRBoundStrideSetup = Intrinsic::getDeclaration(mod, functions[d]); - std::array bsargs = {dm, bound, stride}; - auto *C = builder.CreateCall(SSRBoundStrideSetup->getFunctionType(), SSRBoundStrideSetup, ArrayRef(bsargs)); - C->dump(); + //incoming blocks of phi nodes are not operands ==> handle specially + for (const auto &p : clones){ //all clones of phi-nodes appear in here + if (auto *Phi = dyn_cast(p.second)){ + for (auto B = Phi->block_begin(); B != Phi->block_end(); ++B){ + const auto &c = clones.find(*B); + if (c != clones.end()){ + *B = cast(c->second); //overwrite with clone of block if it was cloned + } + } + } } - - unsigned n_reps = 0U; - if (isStore){ - Function *SSRPush = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_push); - for (Instruction *I : aa->getAccesses()){ - std::array pusharg = {dm, cast(I)->getValueOperand()}; - builder.SetInsertPoint(I); - auto *C = builder.CreateCall(SSRPush->getFunctionType(), SSRPush, ArrayRef(pusharg)); 
- C->dump(); - I->dump(); - I->eraseFromParent(); - n_reps++; + //change terminator of Head to be CondBr with TakeOrig as cond + BranchInst *HeadBr = cast(Head->getTerminator()); //always BranchInst because of splitBlockBefore + BasicBlock *HeadSucc = HeadBr->getSuccessor(0); + HeadBr->eraseFromParent(); + HeadBr = BranchInst::Create( + HeadSucc, //branch-cond = true -> go to non-clone (here SSR will be inserted) + cast(clones.find(HeadSucc)->second), + ConstantInt::get(Type::getInt1Ty(HeadSucc->getContext()), 0u), + Head + ); + const auto &edge = BasicBlockEdge(std::make_pair(Fuse, End)); + for (auto &p : clones){ + for (User *U : p.first->users()){ + auto *I = dyn_cast(U); + if (I && DT.dominates(edge, I->getParent())){ + errs()<<*I<<" makes use of "<<*p.first<<" after cloned region ==> add phi node at end!\n"; + assert(true && "did not declare phi node for live-out value"); + } } - }else{ - Function *SSRPop = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_pop); - std::array poparg = {dm}; - for (Instruction *I : aa->getAccesses()){ - builder.SetInsertPoint(I); - Instruction *V = builder.CreateCall(SSRPop->getFunctionType(), SSRPop, ArrayRef(poparg), "ssr.pop"); - V->dump(); - I->dump(); - BasicBlock::iterator ii(I); - ReplaceInstWithValue(I->getParent()->getInstList(), ii, V); + } + //handle phi nodes in End + for (Instruction &I : *End){ + if (auto *Phi = dyn_cast(&I)){ + for (auto *B : Phi->blocks()){ //yes Phi->blocks() will change during loop ==> does not matter + auto p = clones.find(B); + if (p != clones.end()){ + Value *Bval = Phi->getIncomingValueForBlock(B); + auto v = clones.find(Bval); + if (v != clones.end()){ + Phi->addIncoming(v->second, cast(p->second)); //add clone value & block as input + }else { + //v->first is constant or it is defined before cloned region begins + Phi->addIncoming(Bval, cast(p->second)); + } + } + } } } - - builder.SetInsertPoint(LoopPreheader->getTerminator()); - ConstantInt *rep = ConstantInt::get(i32, n_reps - 1U); 
//repetition - 1, ty=i32 - Function *SSRRepetitionSetup = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_setup_repetition); - std::array repargs = {dm, rep}; - builder.CreateCall(SSRRepetitionSetup->getFunctionType(), SSRRepetitionSetup, ArrayRef(repargs))->dump(); - return; + errs()<<"done cloning from \n"; + return HeadBr; } ///generates SSR enable & disable calls @@ -188,6 +309,19 @@ void generateSSREnDis(const Loop *L){ return; } +void generateSSRGuard(BranchInst *BR, ArrayRef streams){ + assert(BR->isConditional()); + if (streams.empty()) return; + IRBuilder<> builder(BR); + std::vector checks; + for (auto *G : streams){ + checks.push_back(G->GenerateSSRGuard(BR)); + } + //TODO: cross check streams too + Value *TakeSSR = builder.CreateAnd(checks); + BR->setCondition(TakeSSR); +} + bool isValid(const AffineAcc *A){ if (!A) return false; if (A->getDimension() > SSR_MAX_DIM) return false; @@ -315,18 +449,48 @@ PreservedAnalyses SSRGenerationPass::run(Function &F, FunctionAnalysisManager &F const auto &clr = g.color(NUM_SSR); errs()<<"computed coloring\n"; - for (const auto &C : clr){ + DenseMap> ssrs; + + for (const auto C : clr){ if (C.second.hasValue()){ //not None - cloneRegion(C.first->getLoop()->getLoopPreheader()->getTerminator(), C.first->getLoop()->getExitBlock()->getTerminator(), &DTU, &LI, &MSSAU); - generateSSR(AF, C.first, C.second.getValue(), C.first->getNStore() > 0U); - errs()<<"generated ssr insts \n"; - - addChangedLoop(C.first->getLoop(), changedLoops); + //add to ssrs + auto p = ssrs.find(C.first->getLoop()); + GenSSR *G = new GenSSR(C.first, C.second.getValue(), C.first->getLoop()->getLoopPreheader()->getTerminator(), AF); + if (p != ssrs.end()) p->getSecond().push_back(G); + else ssrs.insert(std::make_pair(C.first->getLoop(), SmallVector(1u, G))); + + addChangedLoop(C.first->getLoop(), changedLoops); //update set of changed loops } } + errs()<<"expanded all SSR bases, bounds, and strides\n"; + + //generate clones + for (const Loop *L : 
LI.getLoopsInPreorder()){ + auto p = ssrs.find(L); + if (p != ssrs.end()){ + BranchInst *BR = cloneRegion(L->getLoopPreheader()->getTerminator(), &*L->getExitBlock()->getFirstInsertionPt(), DT, &DTU, &LI, &MSSAU); + generateSSRGuard(BR, ArrayRef(p->getSecond().begin(), p->getSecond().end())); //generate "SSR guard" + } + } + + errs()<<"generated all SSR guards\n"; + + //generate ssr setups + for (const auto &p : ssrs){ + for (GenSSR *G : p.getSecond()){ + G->GenerateSetup(); + } + } + + errs()<<"generated all SSR setups\n"; + + //generate enable / disable for (const Loop *L : changedLoops) generateSSREnDis(L); + errs()<<"generated all SSR enable & disable \n"; + + errs()<<"printing function: \n"; for (BasicBlock &BB : F) BB.dump(); if (changedLoops.empty()){ @@ -369,4 +533,82 @@ std::vector allaccesses; dmid++; } + void generateSSR(AffineAccess &AA, const AffineAcc *aa, unsigned dmid, bool isStore){ + BasicBlock *LoopPreheader = aa->getLoop()->getLoopPreheader(); + Module *mod = LoopPreheader->getModule(); + LLVMContext &ctxt = LoopPreheader->getContext(); + IntegerType *i32 = IntegerType::getInt32Ty(ctxt); + + IRBuilder<> builder(LoopPreheader->getTerminator()); + + ConstantInt *dm = ConstantInt::get(i32, dmid); //datamover id, ty=i32 + ConstantInt *dim = ConstantInt::get(i32, aa->getDimension() - 1U); //dimension - 1, ty=i32 + Value *data = AA.expandData(aa, Type::getInt8PtrTy(ctxt)); + Function *SSRSetup; + if (!isStore){ + SSRSetup = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_read_imm); //can take _imm bc dm and dim are constant + }else{ + SSRSetup = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_write_imm); //can take _imm bc dm and dim are constant + } + std::array args = {dm, dim, data}; + builder.CreateCall(SSRSetup->getFunctionType(), SSRSetup, ArrayRef(args))->dump(); + + std::vector bds, sts; + for (unsigned d = 0U; d < aa->getDimension(); d++){ + bds.push_back(AA.expandBound(aa, d, i32)); //bound - 1, ty=i32 + 
sts.push_back(AA.expandStride(aa, d, i32)); //relative stride, ty=i32 + } + + Intrinsic::RISCVIntrinsics functions[] = { + Intrinsic::riscv_ssr_setup_bound_stride_1d, + Intrinsic::riscv_ssr_setup_bound_stride_2d, + Intrinsic::riscv_ssr_setup_bound_stride_3d, + Intrinsic::riscv_ssr_setup_bound_stride_4d + }; + + for (unsigned d = 0U; d < aa->getDimension(); d++){ + Value *bound = bds[d]; + Value *stride = sts[d]; + + Function *SSRBoundStrideSetup = Intrinsic::getDeclaration(mod, functions[d]); + std::array bsargs = {dm, bound, stride}; + auto *C = builder.CreateCall(SSRBoundStrideSetup->getFunctionType(), SSRBoundStrideSetup, ArrayRef(bsargs)); + C->dump(); + } + + unsigned n_reps = 0U; + if (isStore){ + Function *SSRPush = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_push); + for (Instruction *I : aa->getAccesses()){ + std::array pusharg = {dm, cast(I)->getValueOperand()}; + builder.SetInsertPoint(I); + auto *C = builder.CreateCall(SSRPush->getFunctionType(), SSRPush, ArrayRef(pusharg)); + C->dump(); + I->dump(); + I->eraseFromParent(); + n_reps++; + } + }else{ + Function *SSRPop = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_pop); + std::array poparg = {dm}; + for (Instruction *I : aa->getAccesses()){ + builder.SetInsertPoint(I); + Instruction *V = builder.CreateCall(SSRPop->getFunctionType(), SSRPop, ArrayRef(poparg), "ssr.pop"); + V->dump(); + I->dump(); + BasicBlock::iterator ii(I); + ReplaceInstWithValue(I->getParent()->getInstList(), ii, V); + n_reps++; + } + } + + builder.SetInsertPoint(LoopPreheader->getTerminator()); + ConstantInt *rep = ConstantInt::get(i32, n_reps - 1U); //repetition - 1, ty=i32 + Function *SSRRepetitionSetup = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_setup_repetition); + std::array repargs = {dm, rep}; + builder.CreateCall(SSRRepetitionSetup->getFunctionType(), SSRRepetitionSetup, ArrayRef(repargs))->dump(); + return; +} + + */ \ No newline at end of file From bb8364dcbe5d317543e4f95af9d9cf554bd43f09 Mon Sep 17 
00:00:00 2001 From: ThomasRupf Date: Mon, 25 Apr 2022 18:14:57 +0200 Subject: [PATCH 19/47] add frep pragma after inference --- llvm/lib/Transforms/SSR/SSRGeneration.cpp | 5 +++++ llvm/lib/Transforms/SSR/SSRInference.cpp | 2 ++ 2 files changed, 7 insertions(+) diff --git a/llvm/lib/Transforms/SSR/SSRGeneration.cpp b/llvm/lib/Transforms/SSR/SSRGeneration.cpp index 620785ae28c3f..0dff232ebd688 100644 --- a/llvm/lib/Transforms/SSR/SSRGeneration.cpp +++ b/llvm/lib/Transforms/SSR/SSRGeneration.cpp @@ -301,6 +301,11 @@ void generateSSREnDis(const Loop *L){ Module *mod = L->getHeader()->getModule(); Function *SSREnable = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_enable); builder.CreateCall(SSREnable->getFunctionType(), SSREnable, ArrayRef()); + + //insert frep pragma + Function *FrepPragma = Intrinsic::getDeclaration(mod, Intrinsic::riscv_frep_infer); + builder.CreateCall(FrepPragma->getFunctionType(), FrepPragma, ArrayRef()); + builder.SetInsertPoint(L->getExitBlock()->getTerminator()); Function *SSRDisable = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_disable); builder.CreateCall(SSRDisable->getFunctionType(), SSRDisable, ArrayRef()); diff --git a/llvm/lib/Transforms/SSR/SSRInference.cpp b/llvm/lib/Transforms/SSR/SSRInference.cpp index f2eb224fd3b27..0eeaefc5792b1 100644 --- a/llvm/lib/Transforms/SSR/SSRInference.cpp +++ b/llvm/lib/Transforms/SSR/SSRInference.cpp @@ -48,6 +48,8 @@ using namespace llvm; +static cl::opt InferSSR("ssr-inference", cl::init(false), cl::Hidden); + PreservedAnalyses SSRInferencePass::run(Function &F, FunctionAnalysisManager &FAM){ errs()<<"SSR Inference Pass on function: "< Date: Wed, 27 Apr 2022 11:31:52 +0200 Subject: [PATCH 20/47] change scratchpad check --- llvm/lib/Analysis/AffineAccessAnalysis.cpp | 2 +- llvm/lib/Transforms/SSR/SSRGeneration.cpp | 16 +++++++++------- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/llvm/lib/Analysis/AffineAccessAnalysis.cpp b/llvm/lib/Analysis/AffineAccessAnalysis.cpp 
index a22bcde912272..430c45ea39701 100644 --- a/llvm/lib/Analysis/AffineAccessAnalysis.cpp +++ b/llvm/lib/Analysis/AffineAccessAnalysis.cpp @@ -593,7 +593,7 @@ AffineAccess AffineAccessAnalysis::run(Function &F, FunctionAnalysisManager &FAM LoopInfo &LI = FAM.getResult(F); DominatorTree &DT = FAM.getResult(F); ScalarEvolution &SE = FAM.getResult(F); - //AAResults &AA = FAM.getResult(F); + AAResults &AA = FAM.getResult(F); AffineAccess *A = new AffineAccess(SE, DT, LI); diff --git a/llvm/lib/Transforms/SSR/SSRGeneration.cpp b/llvm/lib/Transforms/SSR/SSRGeneration.cpp index 0dff232ebd688..a3b785bc3e3bb 100644 --- a/llvm/lib/Transforms/SSR/SSRGeneration.cpp +++ b/llvm/lib/Transforms/SSR/SSRGeneration.cpp @@ -64,7 +64,7 @@ struct GenSSR{ Value *Base; ConstantInt *DMID; SmallVector, SSR_MAX_DIM> offsets; - //Instruction *AvailableFrom; + //Instruction *AvailableFrom; //use this and do everything lazy? public: ///AffineAcc which is wrapped by this GenSSR @@ -92,14 +92,16 @@ struct GenSSR{ /// SGE also works for unsigned int: if the bound is unsigned and larger than 2^30 it will be too large for the scratchpad anyway checks.push_back(builder.CreateICmpSGE(getBound(i), ConstantInt::get(Type::getInt32Ty(ExpandBefore->getContext()), 0U))); } - errs()<<"before ptr to i64 \n"; - errs()<getDimension(); i++){ + auto dim = formatv("{0}d", i+1u); + Value *Range = builder.CreateNUWMul(getBound(i), getStride(i), Twine("range.").concat(dim)); + Value *RangeExt = builder.CreateSExt(Range, i64, Twine("range.sext.").concat(dim)); + EndIncl = builder.CreateAdd(EndIncl, RangeExt, Twine("end.incl.").concat(dim)); + } + checks.push_back(builder.CreateICmpULE(EndIncl, ConstantInt::get(i64, SSR_SCRATCHPAD_END), "scratchpad.end.check")); return builder.CreateAnd(ArrayRef(checks)); } From d0532ce123bedfaa1a371cd10fb9a9b6d6edb78d Mon Sep 17 00:00:00 2001 From: ThomasRupf Date: Thu, 28 Apr 2022 12:07:54 +0200 Subject: [PATCH 21/47] problem with polybench gemm and cloning --- 
llvm/lib/Transforms/SSR/SSRGeneration.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Transforms/SSR/SSRGeneration.cpp b/llvm/lib/Transforms/SSR/SSRGeneration.cpp index a3b785bc3e3bb..1975616a65d06 100644 --- a/llvm/lib/Transforms/SSR/SSRGeneration.cpp +++ b/llvm/lib/Transforms/SSR/SSRGeneration.cpp @@ -271,7 +271,7 @@ BranchInst *cloneRegion(Instruction *BeginWith, Instruction *EndBefore, Dominato auto *I = dyn_cast(U); if (I && DT.dominates(edge, I->getParent())){ errs()<<*I<<" makes use of "<<*p.first<<" after cloned region ==> add phi node at end!\n"; - assert(true && "did not declare phi node for live-out value"); + assert(false && "did not declare phi node for live-out value"); } } } @@ -305,8 +305,8 @@ void generateSSREnDis(const Loop *L){ builder.CreateCall(SSREnable->getFunctionType(), SSREnable, ArrayRef()); //insert frep pragma - Function *FrepPragma = Intrinsic::getDeclaration(mod, Intrinsic::riscv_frep_infer); - builder.CreateCall(FrepPragma->getFunctionType(), FrepPragma, ArrayRef()); + //Function *FrepPragma = Intrinsic::getDeclaration(mod, Intrinsic::riscv_frep_infer); + //builder.CreateCall(FrepPragma->getFunctionType(), FrepPragma, ArrayRef()); builder.SetInsertPoint(L->getExitBlock()->getTerminator()); Function *SSRDisable = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_disable); From 7b0083b362fb794447803ffff340f5af980c56c4 Mon Sep 17 00:00:00 2001 From: ThomasRupf Date: Mon, 2 May 2022 11:45:06 +0200 Subject: [PATCH 22/47] after meeting --- llvm/lib/Transforms/SSR/SSRGeneration.cpp | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Transforms/SSR/SSRGeneration.cpp b/llvm/lib/Transforms/SSR/SSRGeneration.cpp index d7e56dfcb4f8e..a71a69a7cf9c6 100644 --- a/llvm/lib/Transforms/SSR/SSRGeneration.cpp +++ b/llvm/lib/Transforms/SSR/SSRGeneration.cpp @@ -116,6 +116,7 @@ void generateSSR(AffineAccess &AA, const AffineAcc *aa, unsigned dmid, bool isSt I->dump(); 
BasicBlock::iterator ii(I); ReplaceInstWithValue(I->getParent()->getInstList(), ii, V); + n_reps++; } } @@ -271,7 +272,12 @@ PreservedAnalyses SSRGenerationPass::run(Function &F, FunctionAnalysisManager &F for (const Loop *L : changedLoops) generateSSREnDis(L); - return changedLoops.empty() ? PreservedAnalyses::all() : PreservedAnalyses::none(); + if (changedLoops.empty()){ + return PreservedAnalyses::all(); + } else{ + F.addFnAttr(Attribute::NoInline); //if we have streams in this function cannot inline ==> might conflict with other streams + return PreservedAnalyses::none(); + } } From 2d33720c2bc37ff019dc48e96bab67e1293aafdf Mon Sep 17 00:00:00 2001 From: ThomasRupf Date: Thu, 5 May 2022 20:45:23 +0200 Subject: [PATCH 23/47] fixed cloning issues --- llvm/lib/Analysis/AffineAccessAnalysis.cpp | 91 ++++++++--- llvm/lib/Passes/PassBuilder.cpp | 19 ++- llvm/lib/Transforms/SSR/SSRGeneration.cpp | 171 +++++---------------- 3 files changed, 124 insertions(+), 157 deletions(-) diff --git a/llvm/lib/Analysis/AffineAccessAnalysis.cpp b/llvm/lib/Analysis/AffineAccessAnalysis.cpp index 430c45ea39701..4874a9f08f9e7 100644 --- a/llvm/lib/Analysis/AffineAccessAnalysis.cpp +++ b/llvm/lib/Analysis/AffineAccessAnalysis.cpp @@ -120,37 +120,53 @@ bool checkLoop(const Loop *L, DominatorTree &DT, ScalarEvolution &SE, Instructio return true; } -Optional> toSameSize(const SCEV *LHS, const SCEV *RHS, ScalarEvolution &SE, bool unsafe = false){ +Optional> toSameType(const SCEV *LHS, const SCEV *RHS, ScalarEvolution &SE, bool unsafe = false){ assert(LHS && RHS); using PT = std::pair; - if (LHS->getType() == RHS->getType()) return Optional(std::make_pair(LHS, RHS)); //trivially the same size - if (!LHS->getType()->isSized() || !RHS->getType()->isSized()) return None; - //TODO: use datalayout for size instead - if (LHS->getType()->getIntegerBitWidth() > RHS->getType()->getIntegerBitWidth()) { + + const DataLayout &DL = SE.getDataLayout(); + LLVMContext &ctxt = SE.getContext(); + + Type *LT = 
LHS->getType(), *RT = RHS->getType(); + if (LT == RT) + return Optional(std::make_pair(LHS, RHS)); //trivially the same size + if (LT->isPointerTy() && RT->isPointerTy()) //if we have pointers to different types + //PointerType *LTP = cast(LT); PointerType *RTP = cast(RT); + return Optional(std::make_pair( + SE.getPtrToIntExpr(LHS, Type::getIntNTy(ctxt, DL.getMaxPointerSizeInBits())), + SE.getPtrToIntExpr(RHS, Type::getIntNTy(ctxt, DL.getMaxPointerSizeInBits())) + )); + + if (!LT->isSized() || !RT->isSized()) return None; + if (DL.getTypeSizeInBits(LT).isScalable() || DL.getTypeSizeInBits(RT).isScalable()) return None; + + uint64_t ls = DL.getTypeSizeInBits(LT).getValue(), rs = DL.getTypeSizeInBits(RT).getValue(); + + if (ls > rs) { if (auto LHSx = dyn_cast(LHS)){ - if (LHSx->getAPInt().getActiveBits() <= RHS->getType()->getIntegerBitWidth()) {} + if (LHSx->getAPInt().getActiveBits() <= rs) return Optional(std::make_pair(SE.getConstant(RHS->getType(), LHSx->getAPInt().getLimitedValue()), RHS)); } if (auto RHSx = dyn_cast(RHS)){ - if (RHSx->getAPInt().getActiveBits() <= LHS->getType()->getIntegerBitWidth()) - return Optional(std::make_pair(LHS, SE.getConstant(LHS->getType(), RHSx->getAPInt().getLimitedValue()))); + return Optional(std::make_pair(LHS, SE.getConstant(LHS->getType(), RHSx->getAPInt().getLimitedValue()))); } - if (auto LHSx = dyn_cast(LHS)) return toSameSize(LHSx->getOperand(0), RHS, SE); - if (auto LHSx = dyn_cast(LHS)) return toSameSize(LHSx->getOperand(0), RHS, SE); - if (auto RHSx = dyn_cast(RHS)) return toSameSize(LHS, RHSx->getOperand(0), SE); - if (unsafe) return Optional(std::make_pair(SE.getTruncateExpr(LHS, RHS->getType()), RHS)); + if (auto LHSx = dyn_cast(LHS)) return toSameType(LHSx->getOperand(0), RHS, SE); + if (auto LHSx = dyn_cast(LHS)) return toSameType(LHSx->getOperand(0), RHS, SE); + if (auto RHSx = dyn_cast(RHS)) return toSameType(LHS, RHSx->getOperand(0), SE); + if (unsafe && LT->isIntegerTy() && RT->isIntegerTy()) return 
Optional(std::make_pair(SE.getTruncateExpr(LHS, RHS->getType()), RHS)); return None; - }else if (LHS->getType()->getIntegerBitWidth() < RHS->getType()->getIntegerBitWidth()){ - auto p = toSameSize(RHS, LHS, SE, unsafe); + }else if (ls < rs){ + auto p = toSameType(RHS, LHS, SE); //swap if (!p.hasValue()) return None; - return Optional(std::make_pair(p.getValue().second, p.getValue().first)); + return Optional(std::make_pair(p.getValue().second, p.getValue().first)); //swap back } - return Optional(std::make_pair(LHS, RHS)); + if (unsafe) return Optional(std::make_pair(LHS, RHS)); + return None; } ///checks whether LHS == RHS always holds bool SCEVEquals(const SCEV *LHS, const SCEV *RHS, ScalarEvolution &SE){ - auto p = toSameSize(LHS, RHS, SE); + auto p = toSameType(LHS, RHS, SE); if (!p.hasValue()) return false; LHS = p.getValue().first; RHS = p.getValue().second; @@ -276,7 +292,11 @@ void findStridesRec(const SCEV *Addr, ArrayRef loops, SmallVector< if (AddRec->getLoop() == *loops.begin()){ const SCEV *S = AddRec->getStepRecurrence(SE); for (const SCEV *x : factors){ - auto p = toSameSize(S, x, SE, true); + auto p = toSameType(S, x, SE, true); + if (!p.hasValue()) { + assert(false && "unsafe toSameType returned None!"); //TODO: change to errs() + return; + } S = SE.getMulExpr(p.getValue().first, p.getValue().second); } res.push_back(S); @@ -593,7 +613,7 @@ AffineAccess AffineAccessAnalysis::run(Function &F, FunctionAnalysisManager &FAM LoopInfo &LI = FAM.getResult(F); DominatorTree &DT = FAM.getResult(F); ScalarEvolution &SE = FAM.getResult(F); - AAResults &AA = FAM.getResult(F); + //AAResults &AA = FAM.getResult(F); AffineAccess *A = new AffineAccess(SE, DT, LI); @@ -784,4 +804,37 @@ errs()<<"finding stride in: "; CS->dump(); } assert(Stride); errs()<<"found stride: "; Stride->dump(); +*/ + +/* +Optional> toSameSize(const SCEV *LHS, const SCEV *RHS, ScalarEvolution &SE, bool unsafe = false){ + assert(LHS && RHS); + errs()<<"toSameSize: LHS="<<*LHS<<" with 
type"<<*LHS->getType()<<"\n"; + errs()<<"toSameSize: RHS="<<*RHS<<" with type"<<*RHS->getType()<<"\n"; + using PT = std::pair; + if (LHS->getType() == RHS->getType()) return Optional(std::make_pair(LHS, RHS)); //trivially the same size + if (LHS->getType()->isPointerTy() && RHS->getType()->isPointerTy()) return Optional(std::make_pair(LHS, RHS)); + if (!LHS->getType()->isSized() || !RHS->getType()->isSized()) return None; + //TODO: use datalayout for size instead + if (LHS->getType()->getIntegerBitWidth() > RHS->getType()->getIntegerBitWidth()) { + if (auto LHSx = dyn_cast(LHS)){ + if (LHSx->getAPInt().getActiveBits() <= RHS->getType()->getIntegerBitWidth()) {} + return Optional(std::make_pair(SE.getConstant(RHS->getType(), LHSx->getAPInt().getLimitedValue()), RHS)); + } + if (auto RHSx = dyn_cast(RHS)){ + if (RHSx->getAPInt().getActiveBits() <= LHS->getType()->getIntegerBitWidth()) + return Optional(std::make_pair(LHS, SE.getConstant(LHS->getType(), RHSx->getAPInt().getLimitedValue()))); + } + if (auto LHSx = dyn_cast(LHS)) return toSameSize(LHSx->getOperand(0), RHS, SE); + if (auto LHSx = dyn_cast(LHS)) return toSameSize(LHSx->getOperand(0), RHS, SE); + if (auto RHSx = dyn_cast(RHS)) return toSameSize(LHS, RHSx->getOperand(0), SE); + if (unsafe) return Optional(std::make_pair(SE.getTruncateExpr(LHS, RHS->getType()), RHS)); + return None; + }else if (LHS->getType()->getIntegerBitWidth() < RHS->getType()->getIntegerBitWidth()){ + auto p = toSameSize(RHS, LHS, SE, unsafe); + if (!p.hasValue()) return None; + return Optional(std::make_pair(p.getValue().second, p.getValue().first)); + } + return Optional(std::make_pair(LHS, RHS)); +} */ \ No newline at end of file diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp index 8d38d59a21d67..8fcc3695bf270 100644 --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -521,7 +521,7 @@ PassBuilder::buildO1FunctionSimplificationPipeline(OptimizationLevel Level, 
ThinOrFullLTOPhase Phase) { FunctionPassManager FPM(DebugLogging); - + errs()<<"O1SimplificationPipeline\n"; // Form SSA out of local memory accesses after breaking apart aggregates into // scalars. FPM.addPass(SROA()); @@ -627,8 +627,6 @@ PassBuilder::buildO1FunctionSimplificationPipeline(OptimizationLevel Level, FPM.addPass(InstCombinePass()); invokePeepholeEPCallbacks(FPM, Level); - FPM.addPass(SSRInferencePass()); - if (PTO.Coroutines) FPM.addPass(CoroElidePass()); @@ -650,7 +648,7 @@ FunctionPassManager PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level, ThinOrFullLTOPhase Phase) { assert(Level != OptimizationLevel::O0 && "Must request optimizations!"); - + errs()<<"O2/3FunctionSimplificationPipeline\n"; // The O1 pipeline has a separate pipeline creation function to simplify // construction readability. if (Level.getSpeedupLevel() == 1) @@ -801,8 +799,6 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level, // opportunities opened up by them. FPM.addPass(InstCombinePass()); invokePeepholeEPCallbacks(FPM, Level); - - FPM.addPass(SSRInferencePass()); // Re-consider control flow based optimizations after redundancy elimination, // redo DCE, etc. 
@@ -848,6 +844,7 @@ void PassBuilder::addPGOInstrPasses(ModulePassManager &MPM, std::string ProfileFile, std::string ProfileRemappingFile) { assert(Level != OptimizationLevel::O0 && "Not expecting O0 here!"); + errs()<<"addPGOInstrPasses\n"; if (!IsCS && !DisablePreInliner) { InlineParams IP; @@ -941,6 +938,7 @@ ModuleInlinerWrapperPass PassBuilder::buildInlinerPipeline(OptimizationLevel Level, ThinOrFullLTOPhase Phase) { InlineParams IP = getInlineParamsFromOptLevel(Level); + errs()<<"buildInlinerPipeline\n"; if (Phase == ThinOrFullLTOPhase::ThinLTOPreLink && PGOOpt && PGOOpt->Action == PGOOptions::SampleUse) IP.HotCallSiteThreshold = 0; @@ -998,12 +996,15 @@ PassBuilder::buildInlinerPipeline(OptimizationLevel Level, MainCGPipeline.addPass(createCGSCCToFunctionPassAdaptor( buildFunctionSimplificationPipeline(Level, Phase))); + MainCGPipeline.addPass(createCGSCCToFunctionPassAdaptor(SSRInferencePass())); + return MIWP; } ModulePassManager PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level, ThinOrFullLTOPhase Phase) { + errs()<<"ModuleSimplificationPipeline\n"; ModulePassManager MPM(DebugLogging); // Add UniqueInternalLinkageNames Pass which renames internal linkage @@ -1164,6 +1165,7 @@ PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level, ModulePassManager PassBuilder::buildModuleOptimizationPipeline(OptimizationLevel Level, bool LTOPreLink) { + errs()<<"ModuleOptimizationPipeline\n"; ModulePassManager MPM(DebugLogging); // Optimize globals now that the module is fully simplified. 
@@ -1407,6 +1409,7 @@ PassBuilder::buildModuleOptimizationPipeline(OptimizationLevel Level, ModulePassManager PassBuilder::buildPerModuleDefaultPipeline(OptimizationLevel Level, bool LTOPreLink) { + errs()<<"PerModuleDefaultPipeline\n"; assert(Level != OptimizationLevel::O0 && "Must request optimizations for the default pipeline!"); @@ -1447,6 +1450,7 @@ PassBuilder::buildPerModuleDefaultPipeline(OptimizationLevel Level, ModulePassManager PassBuilder::buildThinLTOPreLinkDefaultPipeline(OptimizationLevel Level) { + errs()<<"ThinLTOPreLinkDefaultPipeline\n"; assert(Level != OptimizationLevel::O0 && "Must request optimizations for the default pipeline!"); @@ -1503,6 +1507,7 @@ PassBuilder::buildThinLTOPreLinkDefaultPipeline(OptimizationLevel Level) { ModulePassManager PassBuilder::buildThinLTODefaultPipeline( OptimizationLevel Level, const ModuleSummaryIndex *ImportSummary) { + errs()<<"buildThinLTODefaultPipeline\n"; ModulePassManager MPM(DebugLogging); // Convert @llvm.global.annotations to !annotation metadata. @@ -1549,6 +1554,7 @@ ModulePassManager PassBuilder::buildThinLTODefaultPipeline( ModulePassManager PassBuilder::buildLTOPreLinkDefaultPipeline(OptimizationLevel Level) { + errs()<<"buildLTOPreLinkDefaultPipeline\n"; assert(Level != OptimizationLevel::O0 && "Must request optimizations for the default pipeline!"); // FIXME: We should use a customized pre-link pipeline! @@ -1559,6 +1565,7 @@ PassBuilder::buildLTOPreLinkDefaultPipeline(OptimizationLevel Level) { ModulePassManager PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level, ModuleSummaryIndex *ExportSummary) { + errs()<<"buildLTODefaultPipeline\n"; ModulePassManager MPM(DebugLogging); // Convert @llvm.global.annotations to !annotation metadata. 
diff --git a/llvm/lib/Transforms/SSR/SSRGeneration.cpp b/llvm/lib/Transforms/SSR/SSRGeneration.cpp index a3b785bc3e3bb..f6038b510049a 100644 --- a/llvm/lib/Transforms/SSR/SSRGeneration.cpp +++ b/llvm/lib/Transforms/SSR/SSRGeneration.cpp @@ -51,8 +51,8 @@ #define NUM_SSR 3U #define SSR_MAX_DIM 4U //both are inclusive! -#define SSR_SCRATCHPAD_BEGIN 1000U -#define SSR_SCRATCHPAD_END 18000U +#define SSR_SCRATCHPAD_BEGIN 0U +#define SSR_SCRATCHPAD_END 0xFFFFFFFFFFFFFFFFU //maxint //current state of hw: only allow doubles #define CHECK_TYPE(T, I) (T == Type::getDoubleTy(I->getParent()->getContext())) @@ -177,8 +177,7 @@ namespace{ void copyPHIsFromPred(BasicBlock *BB){ BasicBlock *Pred = BB->getSinglePredecessor(); - assert(Pred && "only works for blocks with one single predecessor"); - assert(BB->getTerminator() && "need at least one non-phi node in BB"); + assert(Pred && "BB has single predecessor"); for (Instruction &I : *Pred){ if (auto *Phi = dyn_cast(&I)){ PHINode *PhiC = PHINode::Create(Phi->getType(), 1u, Twine(Phi->getName()).concat(".copy"), BB->getFirstNonPHI()); @@ -189,17 +188,40 @@ void copyPHIsFromPred(BasicBlock *BB){ } } +///splits block, redirects all predecessor to first half of split, copies phi's +std::pair splitAt(Instruction *X, const Twine &name, DomTreeUpdater *DTU){ + BasicBlock *Two = X->getParent(); + BasicBlock *One = splitBlockBefore(Two, X, DTU, nullptr, nullptr, name); + for (auto *BB : predecessors(Two)){ + if (BB == One) continue; + Instruction *T = BB->getTerminator(); + for (unsigned i = 0; i < T->getNumOperands(); i++){ + Value *OP = T->getOperand(i); + T->dump(); + if (dyn_cast(OP) == Two){ + T->setOperand(i, One); //if an operand of the terminator of a predecessor of Two points to Two it should now point to One + } + } + } + DTU->flush(); + copyPHIsFromPred(Two); //copy Phi's from One to Two + return std::make_pair(One, Two); +} + ///clones code from BeginWith up to EndBefore ///assumes all cf-paths from begin lead to end (or 
return) ///assumes there is a phi node for each value defined in the region that will be cloned in the block of EndBefore that is live after EndBefore BranchInst *cloneRegion(Instruction *BeginWith, Instruction *EndBefore, DominatorTree &DT, DomTreeUpdater *DTU, LoopInfo *LI, MemorySSAUpdater *MSSAU){ errs()<<"cloning from "<<*BeginWith<<" up to "<<*EndBefore<<"\n"; - BasicBlock *Begin = BeginWith->getParent(); - BasicBlock *Head = splitBlockBefore(Begin, BeginWith, DTU, LI, MSSAU, "split.head"); - copyPHIsFromPred(Begin); //copy Phi's from Head to Begin - BasicBlock *End = EndBefore->getParent(); - BasicBlock *Fuse = splitBlockBefore(EndBefore->getParent(), EndBefore, DTU, LI, MSSAU, "fuse.prep"); - copyPHIsFromPred(End); + + auto p = splitAt(BeginWith, "split.before", DTU); + BasicBlock *Head = p.first; + BasicBlock *Begin = p.second; + + p = splitAt(EndBefore, "fuse.prep", DTU); + BasicBlock *Fuse = p.first; + BasicBlock *End = p.second; + std::deque q; //bfs queue q.push_back(Begin); DenseSet vis; //bfs visited set @@ -240,9 +262,7 @@ BranchInst *cloneRegion(Instruction *BeginWith, Instruction *EndBefore, Dominato auto A = clones.find(p.second->getOperand(p.first)); if (A != clones.end()){ p.second->setOperand(p.first, A->second); - }else{ - errs()<<"cloneRegion: did not find "<getOperand(p.first)->getNameOrAsOperand()<<"\n"; - } + }//else did not find ==> was defined before region } //incoming blocks of phi nodes are not operands ==> handle specially for (const auto &p : clones){ //all clones of phi-nodes appear in here @@ -293,7 +313,8 @@ BranchInst *cloneRegion(Instruction *BeginWith, Instruction *EndBefore, Dominato } } } - errs()<<"done cloning from \n"; + errs()<<"done cloning \n"; + return HeadBr; } @@ -304,10 +325,6 @@ void generateSSREnDis(const Loop *L){ Function *SSREnable = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_enable); builder.CreateCall(SSREnable->getFunctionType(), SSREnable, ArrayRef()); - //insert frep pragma - Function 
*FrepPragma = Intrinsic::getDeclaration(mod, Intrinsic::riscv_frep_infer); - builder.CreateCall(FrepPragma->getFunctionType(), FrepPragma, ArrayRef()); - builder.SetInsertPoint(L->getExitBlock()->getTerminator()); Function *SSRDisable = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_disable); builder.CreateCall(SSRDisable->getFunctionType(), SSRDisable, ArrayRef()); @@ -446,12 +463,13 @@ PreservedAnalyses SSRGenerationPass::run(Function &F, FunctionAnalysisManager &F std::vector goodAccs; for (const AffineAcc *A : accs){ - A->dump(); auto p = AF.splitLoadStore(A); if (p.first && isValid(p.first)) goodAccs.push_back(p.first); if (p.second && isValid(p.second)) goodAccs.push_back(p.second); } + if (goodAccs.empty()) return PreservedAnalyses::all(); + ConflictGraph g(AF, ArrayRef(goodAccs)); const auto &clr = g.color(NUM_SSR); errs()<<"computed coloring\n"; @@ -497,8 +515,8 @@ PreservedAnalyses SSRGenerationPass::run(Function &F, FunctionAnalysisManager &F errs()<<"generated all SSR enable & disable \n"; - errs()<<"printing function: \n"; - for (BasicBlock &BB : F) BB.dump(); + //TODO: merge loops + //TODO: frep pragmas if (changedLoops.empty()){ return PreservedAnalyses::all(); @@ -508,114 +526,3 @@ PreservedAnalyses SSRGenerationPass::run(Function &F, FunctionAnalysisManager &F } } - -/* -std::vector allaccesses; - for (const AffineAcc *A : accs) allaccesses.push_back(A); - - //sort by dimension ascending - std::sort(allaccesses.begin(), allaccesses.end(), [](const AffineAcc *A, const AffineAcc *B){return A->getDimension() <= B->getDimension();}); - - errs()<<"total of "< accesses; - while (!allaccesses.empty()){ - auto A = allaccesses.back(); allaccesses.pop_back(); - if (!isValid(A)) continue; - bool conflict = false; - for (auto B : accesses){ - conflict = conflict || shareInsts(A, B); - } - if (!conflict) accesses.push_back(A); - } - - errs()<= NUM_SSR) break; - unsigned n_store = A->getNStore(), n_load = A->getNLoad(); - generateSSR(AF, A, dmid, n_store + 
n_load, n_store > 0U); - changedLoops.insert(A->getLoop()); - dmid++; - } - - void generateSSR(AffineAccess &AA, const AffineAcc *aa, unsigned dmid, bool isStore){ - BasicBlock *LoopPreheader = aa->getLoop()->getLoopPreheader(); - Module *mod = LoopPreheader->getModule(); - LLVMContext &ctxt = LoopPreheader->getContext(); - IntegerType *i32 = IntegerType::getInt32Ty(ctxt); - - IRBuilder<> builder(LoopPreheader->getTerminator()); - - ConstantInt *dm = ConstantInt::get(i32, dmid); //datamover id, ty=i32 - ConstantInt *dim = ConstantInt::get(i32, aa->getDimension() - 1U); //dimension - 1, ty=i32 - Value *data = AA.expandData(aa, Type::getInt8PtrTy(ctxt)); - Function *SSRSetup; - if (!isStore){ - SSRSetup = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_read_imm); //can take _imm bc dm and dim are constant - }else{ - SSRSetup = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_write_imm); //can take _imm bc dm and dim are constant - } - std::array args = {dm, dim, data}; - builder.CreateCall(SSRSetup->getFunctionType(), SSRSetup, ArrayRef(args))->dump(); - - std::vector bds, sts; - for (unsigned d = 0U; d < aa->getDimension(); d++){ - bds.push_back(AA.expandBound(aa, d, i32)); //bound - 1, ty=i32 - sts.push_back(AA.expandStride(aa, d, i32)); //relative stride, ty=i32 - } - - Intrinsic::RISCVIntrinsics functions[] = { - Intrinsic::riscv_ssr_setup_bound_stride_1d, - Intrinsic::riscv_ssr_setup_bound_stride_2d, - Intrinsic::riscv_ssr_setup_bound_stride_3d, - Intrinsic::riscv_ssr_setup_bound_stride_4d - }; - - for (unsigned d = 0U; d < aa->getDimension(); d++){ - Value *bound = bds[d]; - Value *stride = sts[d]; - - Function *SSRBoundStrideSetup = Intrinsic::getDeclaration(mod, functions[d]); - std::array bsargs = {dm, bound, stride}; - auto *C = builder.CreateCall(SSRBoundStrideSetup->getFunctionType(), SSRBoundStrideSetup, ArrayRef(bsargs)); - C->dump(); - } - - unsigned n_reps = 0U; - if (isStore){ - Function *SSRPush = Intrinsic::getDeclaration(mod, 
Intrinsic::riscv_ssr_push); - for (Instruction *I : aa->getAccesses()){ - std::array pusharg = {dm, cast(I)->getValueOperand()}; - builder.SetInsertPoint(I); - auto *C = builder.CreateCall(SSRPush->getFunctionType(), SSRPush, ArrayRef(pusharg)); - C->dump(); - I->dump(); - I->eraseFromParent(); - n_reps++; - } - }else{ - Function *SSRPop = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_pop); - std::array poparg = {dm}; - for (Instruction *I : aa->getAccesses()){ - builder.SetInsertPoint(I); - Instruction *V = builder.CreateCall(SSRPop->getFunctionType(), SSRPop, ArrayRef(poparg), "ssr.pop"); - V->dump(); - I->dump(); - BasicBlock::iterator ii(I); - ReplaceInstWithValue(I->getParent()->getInstList(), ii, V); - n_reps++; - } - } - - builder.SetInsertPoint(LoopPreheader->getTerminator()); - ConstantInt *rep = ConstantInt::get(i32, n_reps - 1U); //repetition - 1, ty=i32 - Function *SSRRepetitionSetup = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_setup_repetition); - std::array repargs = {dm, rep}; - builder.CreateCall(SSRRepetitionSetup->getFunctionType(), SSRRepetitionSetup, ArrayRef(repargs))->dump(); - return; -} - - - */ \ No newline at end of file From 134db67c4129b8aa063a10981dd0181aa43250cf Mon Sep 17 00:00:00 2001 From: ThomasRupf Date: Fri, 6 May 2022 14:44:04 +0200 Subject: [PATCH 24/47] changes --- llvm/lib/Analysis/AffineAccessAnalysis.cpp | 42 ++++++++++------- llvm/lib/Passes/PassBuilder.cpp | 5 ++- llvm/lib/Transforms/SSR/SSRGeneration.cpp | 52 +++++++++++++++++----- llvm/lib/Transforms/SSR/SSRInference.cpp | 6 ++- 4 files changed, 78 insertions(+), 27 deletions(-) diff --git a/llvm/lib/Analysis/AffineAccessAnalysis.cpp b/llvm/lib/Analysis/AffineAccessAnalysis.cpp index 4874a9f08f9e7..c02d5227e1cde 100644 --- a/llvm/lib/Analysis/AffineAccessAnalysis.cpp +++ b/llvm/lib/Analysis/AffineAccessAnalysis.cpp @@ -179,8 +179,24 @@ bool SCEVEquals(const SCEV *LHS, const SCEV *RHS, ScalarEvolution &SE){ } /// check whether BB is on all controlflow 
paths from header to header -bool isOnAllControlFlowPaths(const BasicBlock *BB, const Loop *L, const DominatorTree &DT){ - return DT.dominates(BB, L->getHeader()); +// TODO: could also be done with DT +bool isOnAllControlFlowPaths(BasicBlock *BB, const Loop *L, const DominatorTree &DT){ + BasicBlock *End = L->getHeader(); + std::deque> q; + q.push_back(std::make_pair(End, false)); //start with header (false = BB not yet visited) + std::set> vis; //comp here is less> + while (!q.empty()){ + auto p = q.front(); q.pop_front(); + if (vis.find(p) != vis.end()) continue; + vis.insert(p); + for (BasicBlock *B : successors(p.first)){ + q.push_back(std::make_pair(B, p.second || B == BB)); + } + //check here whether End is reached with false (not at start of loop bc we also start with End) + p = q.front(); + if (!p.second && p.first == End) return false; //got to End (header) without ever visiting BB + } + return true; } //return result of Cmp predicated on Rep > 0 if possible. @@ -228,7 +244,7 @@ Optional predicatedICmpOutcome(ICmpInst *Cmp, const SCEV *Rep, ScalarEvolu //conservative! 
//because SCEVComparePredicate is not in this version of LLVM we have to do this manually ==> will not catch all cases //predicate is that Rep > 0 -bool isOnAllPredicatedControlFlowPaths(const BasicBlock *BB, const Loop *L, const DominatorTree &DT, const SCEV *Rep, ScalarEvolution &SE){ +bool isOnAllPredicatedControlFlowPaths(BasicBlock *BB, const Loop *L, const DominatorTree &DT, const SCEV *Rep, ScalarEvolution &SE){ if (isOnAllControlFlowPaths(BB, L, DT)) return true; //is on all paths anyway Rep->dump(); DenseSet vis; //visited set @@ -447,22 +463,17 @@ void AffineAccess::addAllAccesses(Instruction *Addr, const Loop *L){ if (addresses.find(Addr) != addresses.end()) return; //already called addAllAccesses on this Addr instruction addresses.insert(Addr); + errs()<<"addAllAccesses: start: "<<*Addr<<"\n"; + //find all accesses std::vector accesses; for (auto U = Addr->use_begin(); U != Addr->use_end(); ++U){ - Instruction *Acc = dyn_cast(U->getUser()); - if (Acc){ //load inst - bool unvaildUser = false; - for (auto AU = Acc->use_begin(); AU != Acc->use_end(); ++AU){ - auto *I = dyn_cast(AU->getUser()); - unvaildUser = unvaildUser || !I || !isOnAllControlFlowPaths(I->getParent(), L, DT); - } - if (unvaildUser) continue; //skip this load it has users ouside of loop or not on all control flow paths + Instruction *Acc = dyn_cast(U->getUser()); + if (!Acc) continue; //user is not an instruction + if (isa(Acc) || isa(Acc)){ + if (!isOnAllControlFlowPaths(Acc->getParent(), L, DT)) continue; //access does not occur consistently in loop ==> not suitable + accesses.push_back(Acc); } - if (!Acc) Acc = dyn_cast(U->getUser()); //try to cast to store - if (!Acc) continue; //both casts failed ==> not a suitable instruction - if (!isOnAllControlFlowPaths(Acc->getParent(), L, DT)) continue; //access does not occur consistently in loop ==> not suitable - accesses.push_back(Acc); } if (accesses.empty()) return; //Addr not used @@ -632,6 +643,7 @@ AffineAccess 
AffineAccessAnalysis::run(Function &F, FunctionAnalysisManager &FAM }else{ continue; //cannot do anything with this instruction } + errs()<<"run: "<(Addr))) continue; //if Addr is not instruction ==> constant, or sth else (==> leave for other passes to opt) A->addAllAccesses(AddrIns, L); diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp index 8fcc3695bf270..62c19d3a84ab5 100644 --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -760,6 +760,7 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level, FPM.addPass(createFunctionToLoopPassAdaptor( std::move(LPM1), EnableMSSALoopDependency, /*UseBlockFrequencyInfo=*/true, DebugLogging)); + FPM.addPass(SimplifyCFGPass()); FPM.addPass(InstCombinePass()); @@ -772,6 +773,8 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level, std::move(LPM2), /*UseMemorySSA=*/false, /*UseBlockFrequencyInfo=*/false, DebugLogging)); + FPM.addPass(SSRInferencePass()); + // Delete small array after loop unroll. FPM.addPass(SROA()); @@ -996,7 +999,7 @@ PassBuilder::buildInlinerPipeline(OptimizationLevel Level, MainCGPipeline.addPass(createCGSCCToFunctionPassAdaptor( buildFunctionSimplificationPipeline(Level, Phase))); - MainCGPipeline.addPass(createCGSCCToFunctionPassAdaptor(SSRInferencePass())); + //MainCGPipeline.addPass(createCGSCCToFunctionPassAdaptor(SSRInferencePass())); return MIWP; } diff --git a/llvm/lib/Transforms/SSR/SSRGeneration.cpp b/llvm/lib/Transforms/SSR/SSRGeneration.cpp index f6038b510049a..9be1b3c73579a 100644 --- a/llvm/lib/Transforms/SSR/SSRGeneration.cpp +++ b/llvm/lib/Transforms/SSR/SSRGeneration.cpp @@ -64,6 +64,8 @@ struct GenSSR{ Value *Base; ConstantInt *DMID; SmallVector, SSR_MAX_DIM> offsets; + Value *MemBegin = nullptr; + Value *MemEnd = nullptr; //Instruction *AvailableFrom; //use this and do everything lazy? 
public: @@ -93,6 +95,7 @@ struct GenSSR{ checks.push_back(builder.CreateICmpSGE(getBound(i), ConstantInt::get(Type::getInt32Ty(ExpandBefore->getContext()), 0U))); } Value *BaseInt = builder.CreatePtrToInt(getBase(), i64, "base.to.int"); + MemBegin = BaseInt; checks.push_back(builder.CreateICmpUGE(BaseInt, ConstantInt::get(i64, SSR_SCRATCHPAD_BEGIN), "scratchpad.begin.check")); Value *EndIncl = BaseInt; for (unsigned i = 0U; i < Access->getDimension(); i++){ @@ -101,6 +104,7 @@ struct GenSSR{ Value *RangeExt = builder.CreateSExt(Range, i64, Twine("range.sext.").concat(dim)); EndIncl = builder.CreateAdd(EndIncl, RangeExt, Twine("end.incl.").concat(dim)); } + MemEnd = EndIncl; checks.push_back(builder.CreateICmpULE(EndIncl, ConstantInt::get(i64, SSR_SCRATCHPAD_END), "scratchpad.end.check")); return builder.CreateAnd(ArrayRef(checks)); } @@ -167,10 +171,12 @@ struct GenSSR{ return; } - Value *getBase() { return Base; } - Value *getBound(unsigned i) { return offsets[i].first; } - Value *getStride(unsigned i) { return offsets[i].second; } - ConstantInt *getDMID() { return DMID; } + Value *getBase() const { return Base; } + Value *getBound(unsigned i) const { return offsets[i].first; } + Value *getStride(unsigned i) const { return offsets[i].second; } + ConstantInt *getDMID() const { return DMID; } + Value *getMemBegin() const { return MemBegin; } + Value *getMemEnd() const { return MemEnd; } }; namespace{ @@ -325,6 +331,9 @@ void generateSSREnDis(const Loop *L){ Function *SSREnable = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_enable); builder.CreateCall(SSREnable->getFunctionType(), SSREnable, ArrayRef()); + Function *FREPPragma = Intrinsic::getDeclaration(mod, Intrinsic::riscv_frep_infer); + builder.CreateCall(FREPPragma->getFunctionType(), FREPPragma, ArrayRef()); + builder.SetInsertPoint(L->getExitBlock()->getTerminator()); Function *SSRDisable = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_disable); builder.CreateCall(SSRDisable->getFunctionType(), 
SSRDisable, ArrayRef()); @@ -333,15 +342,36 @@ void generateSSREnDis(const Loop *L){ return; } +Value *generateIntersectCheck(IRBuilder<> &builder, GenSSR *G, GenSSR *H){ + Value *Glo = G->getMemBegin(); + Value *Ghi = G->getMemEnd(); + Value *Hlo = H->getMemBegin(); + Value *Hhi = H->getMemEnd(); + Value *GhiLTHlo = builder.CreateICmpULT(Ghi, Hlo, "1st.memrange.check"); //bounds are inclusive, we assume alignment + Value *HhiLTGlo = builder.CreateICmpULT(Hhi, Glo, "2nd.memrange.check"); + return builder.CreateOr(GhiLTHlo, HhiLTGlo, "or.memrange"); +} + void generateSSRGuard(BranchInst *BR, ArrayRef streams){ assert(BR->isConditional()); if (streams.empty()) return; IRBuilder<> builder(BR); std::vector checks; for (auto *G : streams){ - checks.push_back(G->GenerateSSRGuard(BR)); + checks.push_back(G->GenerateSSRGuard(BR)); //means getMemBegin() and getMemEnd() do not return nullptr } - //TODO: cross check streams too + for (unsigned i = 0; i < streams.size(); i++){ + GenSSR *G = streams[i]; + for (unsigned j = 0; j < streams.size(); j++){ + if (G->Access->getNStore() > 0u){ + GenSSR *H = streams[j]; + if (j < i || (j > i && H->Access->getNStore() == 0u)){ //true if H is before G OR H is after G and a load + checks.push_back(generateIntersectCheck(builder, G, H)); + } + } + } + } + Value *TakeSSR = builder.CreateAnd(checks); BR->setCondition(TakeSSR); } @@ -375,10 +405,10 @@ struct ConflictGraph{ conflicts.insert(std::make_pair(*A, std::vector())); mutexs.insert(std::make_pair(*A, std::vector())); for (auto B = accesses.begin(); B != A; B++){ - if (AF.shareInsts(*A, *B) || AF.conflictWWWR(*A, *B)){ + if (AF.shareInsts(*A, *B)){ //AF.conflictWWWR(*A, *B) mutexs.find(*A)->second.push_back(*B); mutexs.find(*B)->second.push_back(*A); - }else if (AF.shareLoops(*A, *B)){ + }else if (AF.shareLoops(*A, *B)){ //here we assume that the accessed memory region do not intersect and check this at runtime conflicts.find(*A)->second.push_back(*B); 
conflicts.find(*B)->second.push_back(*A); } @@ -455,7 +485,7 @@ PreservedAnalyses SSRGenerationPass::run(Function &F, FunctionAnalysisManager &F auto &MA = FAM.getResult(F); MemorySSAUpdater MSSAU(&MA.getMSSA()); - errs()<<"SSR Generation Pass on function: "< changedLoops; @@ -468,7 +498,9 @@ PreservedAnalyses SSRGenerationPass::run(Function &F, FunctionAnalysisManager &F if (p.second && isValid(p.second)) goodAccs.push_back(p.second); } - if (goodAccs.empty()) return PreservedAnalyses::all(); + F.dump(); + + if (goodAccs.empty()) return PreservedAnalyses::all(); //early exit ConflictGraph g(AF, ArrayRef(goodAccs)); const auto &clr = g.color(NUM_SSR); diff --git a/llvm/lib/Transforms/SSR/SSRInference.cpp b/llvm/lib/Transforms/SSR/SSRInference.cpp index 0eeaefc5792b1..e26df12f629ed 100644 --- a/llvm/lib/Transforms/SSR/SSRInference.cpp +++ b/llvm/lib/Transforms/SSR/SSRInference.cpp @@ -27,7 +27,10 @@ #include "llvm/Transforms/Utils/FixIrreducible.h" #include "llvm/Transforms/InstCombine/InstCombine.h" #include "llvm/Transforms/Scalar/LoopStrengthReduce.h" +#include "llvm/Transforms/Scalar/LoopRotation.h" +#include "llvm/Transforms/Scalar/IndVarSimplify.h" #include "llvm/Transforms/Scalar/SimplifyCFG.h" +#include "llvm/Transforms/Scalar/LoopFlatten.h" #include "llvm/Transforms/SSR/SSRGeneration.h" #include "llvm/Support/CommandLine.h" @@ -55,8 +58,9 @@ PreservedAnalyses SSRInferencePass::run(Function &F, FunctionAnalysisManager &FA FunctionPassManager FPM(true); FPM.addPass(FixIrreduciblePass());//turn some non-loops into loops FPM.addPass(LoopSimplifyPass()); //canonicalize loops + //FPM.addPass(createFunctionToLoopPassAdaptor(LoopRotatePass())); FPM.addPass(LCSSAPass()); //put loops into LCSSA-form - //FPM.addPass(createFunctionToLoopPassAdaptor(LoopStrengthReducePass())); //loop strength reduction + //FPM.addPass(createFunctionToLoopPassAdaptor(IndVarSimplifyPass(false))); FPM.addPass(SSRGenerationPass()); //runs AffineAccess analysis and generates SSR intrinsics 
FPM.addPass(SimplifyCFGPass()); //simplifies CFG again FPM.addPass(InstCombinePass()); //removes phi nodes from LCSSA From d7617f93bc66b5d5d92aa73b2d98d0ef84733b4b Mon Sep 17 00:00:00 2001 From: ThomasRupf Date: Mon, 16 May 2022 15:47:03 +0200 Subject: [PATCH 25/47] passing tests --- .../llvm/Analysis/AffineAccessAnalysis.h | 1 + llvm/lib/Analysis/AffineAccessAnalysis.cpp | 39 ++++--- llvm/lib/Transforms/SSR/SSRGeneration.cpp | 107 +++++++++++++----- 3 files changed, 107 insertions(+), 40 deletions(-) diff --git a/llvm/include/llvm/Analysis/AffineAccessAnalysis.h b/llvm/include/llvm/Analysis/AffineAccessAnalysis.h index 09d591c4c124a..3e8dbcf66e8bd 100644 --- a/llvm/include/llvm/Analysis/AffineAccessAnalysis.h +++ b/llvm/include/llvm/Analysis/AffineAccessAnalysis.h @@ -30,6 +30,7 @@ class AffineAcc{ AffineAcc() = delete; void dump() const; unsigned getDimension() const; + unsigned getUsedDimension() const; const Loop *getLoop() const; Instruction *getAddrIns() const; const SmallVector &getAccesses() const; diff --git a/llvm/lib/Analysis/AffineAccessAnalysis.cpp b/llvm/lib/Analysis/AffineAccessAnalysis.cpp index c02d5227e1cde..ba3b7030ffdc5 100644 --- a/llvm/lib/Analysis/AffineAccessAnalysis.cpp +++ b/llvm/lib/Analysis/AffineAccessAnalysis.cpp @@ -53,6 +53,14 @@ unsigned AffineAcc::getDimension() const{ return this->bounds.size(); } +unsigned AffineAcc::getUsedDimension() const{ + unsigned d = 0u; + for (const SCEV *S : this->strides){ + if (!isa(S) || !cast(S)->isZero()) d++; //only cound a dimension if its stride is non-zero + } + return d; +} + void AffineAcc::dump() const{ errs()<<"Affine Access in Loop:\n"; if (L) L->dump(); @@ -242,7 +250,7 @@ Optional predicatedICmpOutcome(ICmpInst *Cmp, const SCEV *Rep, ScalarEvolu } //conservative! 
-//because SCEVComparePredicate is not in this version of LLVM we have to do this manually ==> will not catch all cases +//because SCEVComparePredicate is not in this version of LLVM we have to do this manually ==> will not catch all cases (FIXME) //predicate is that Rep > 0 bool isOnAllPredicatedControlFlowPaths(BasicBlock *BB, const Loop *L, const DominatorTree &DT, const SCEV *Rep, ScalarEvolution &SE){ if (isOnAllControlFlowPaths(BB, L, DT)) return true; //is on all paths anyway @@ -321,7 +329,7 @@ void findStridesRec(const SCEV *Addr, ArrayRef loops, SmallVector< bool occurs = false; for (const Loop *L : loops) occurs = occurs || AddRec->getLoop() == L; //loops needs to occur further up, o/w invalid if (!occurs) return; - res.push_back(SE.getConstant(APInt(64U, 0U))); + res.push_back(SE.getConstant(APInt(64U, 0U))); //TODO: this leads to ugly casts findStridesRec(AddRec, ArrayRef(loops.begin()+1, loops.end()), factors, SE, res); } return; @@ -335,7 +343,7 @@ void findStridesRec(const SCEV *Addr, ArrayRef loops, SmallVector< case SCEVTypes::scAddExpr: { auto S = cast(Addr); - bool lhs = SE.containsAddRecurrence(S->getOperand(0)); + bool lhs = SE.containsAddRecurrence(S->getOperand(0)); //TODO: this does not catch all cases maybe limit SCEV to outermost loop? bool rhs = SE.containsAddRecurrence(S->getOperand(1)); if (lhs && !rhs) findStridesRec(S->getOperand(0), loops, factors, SE, res); else if (!lhs && rhs) findStridesRec(S->getOperand(1), loops, factors, SE, res); @@ -344,7 +352,7 @@ void findStridesRec(const SCEV *Addr, ArrayRef loops, SmallVector< case SCEVTypes::scMulExpr: { auto S = cast(Addr); - bool lhs = SE.containsAddRecurrence(S->getOperand(0)); + bool lhs = SE.containsAddRecurrence(S->getOperand(0)); //TODO: this does not catch all cases maybe limit SCEV to outermost loop? 
bool rhs = SE.containsAddRecurrence(S->getOperand(1)); if (lhs && !rhs) { factors.push_back(S->getOperand(1)); @@ -361,24 +369,26 @@ void findStridesRec(const SCEV *Addr, ArrayRef loops, SmallVector< } } -SmallVector &findStrides(const SCEV *Addr, ArrayRef loops, ScalarEvolution &SE){ +SmallVector &findStrides(Instruction *Addr, ArrayRef loops, ScalarEvolution &SE){ + const SCEV *AddrS = SE.getSCEV(Addr); //SE.getSCEVAtScope(Addr, loops.back()); //we only look at the scev as contained in outermost loop SmallVector &strides = *(new SmallVector()); SmallVector factors; - findStridesRec(Addr, loops, factors, SE, strides); + findStridesRec(AddrS, loops, factors, SE, strides); return strides; } Value *castToSize(Value *R, Type *ty, Instruction *InsPoint){ const DataLayout &DL = InsPoint->getParent()->getModule()->getDataLayout(); + IRBuilder<> builder(InsPoint); Type *rty = R->getType(); if (rty == ty) return R; if (DL.getTypeSizeInBits(rty) > DL.getTypeSizeInBits(ty)) { - return CastInst::CreateTruncOrBitCast(R, ty, "scev.cast", InsPoint); + return builder.CreateTruncOrBitCast(R, ty, "scev.trunc"); } if (DL.getTypeSizeInBits(rty) < DL.getTypeSizeInBits(ty)) { - return CastInst::CreateZExtOrBitCast(R, ty, "scev.cast", InsPoint); + return builder.CreateSExtOrBitCast(R, ty, "scev.sext"); } - return CastInst::CreateBitOrPointerCast(R, ty, "scev.cast", InsPoint); + return builder.CreateBitOrPointerCast(R, ty, "scev.cast"); } } //end of namespace @@ -479,18 +489,16 @@ void AffineAccess::addAllAccesses(Instruction *Addr, const Loop *L){ errs()<<"adding Access: "; Addr->dump(); - const SCEV *AddrS = SE.getSCEV(Addr); - //we are looking at containing loops of all the accesses (guaranteed to be all the same) //Addr ins might be outside of loop (licm) if 1D stride is 0 auto &cloops = getContainingLoops(LI.getLoopsInPreorder(), accesses[0]); errs()<<"has "<(accesses), AddrS); //never needed -> alloc in stack + AffineAcc dim0(Addr, ArrayRef(accesses), SE.getSCEV(Addr)); //never 
needed -> alloc in stack AffineAcc *A = &dim0; for (auto L = cloops.begin(); L != cloops.end(); ++L){ @@ -594,7 +602,10 @@ Value *AffineAccess::expandData(const AffineAcc *aa, Type *ty, Instruction *Inse assert(isSafeToExpandAt(aa->data, InsertBefore, SE) && "data not expanable here (note: only preheader guaranteed)"); SCEVExpander ex(SE, aa->L->getHeader()->getModule()->getDataLayout(), "data"); ex.setInsertPoint(InsertBefore); - return castToSize(ex.expandCodeFor(aa->data), ty, InsertBefore); + errs()<<"expandData: scev "<<*aa->data<<" has type: "<<*aa->data->getType()<<"\n"; + Value *data = ex.expandCodeFor(aa->data); + errs()<<"expandData: value "<<*data<<" has type: "<<*data->getType()<<"\n"; + return castToSize(data, ty, InsertBefore); } Value *AffineAccess::expandBound(const AffineAcc *aa, unsigned i, Type *ty, Instruction *InsertBefore) const { diff --git a/llvm/lib/Transforms/SSR/SSRGeneration.cpp b/llvm/lib/Transforms/SSR/SSRGeneration.cpp index 9be1b3c73579a..c7dd8f058e5c2 100644 --- a/llvm/lib/Transforms/SSR/SSRGeneration.cpp +++ b/llvm/lib/Transforms/SSR/SSRGeneration.cpp @@ -35,6 +35,7 @@ #include "llvm/IR/Intrinsics.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/IntrinsicsRISCV.h" +#include "llvm/IR/InlineAsm.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/SmallPtrSet.h" @@ -48,16 +49,40 @@ #include #include +#define SSR_INFERENCE true + #define NUM_SSR 3U #define SSR_MAX_DIM 4U //both are inclusive! 
-#define SSR_SCRATCHPAD_BEGIN 0U -#define SSR_SCRATCHPAD_END 0xFFFFFFFFFFFFFFFFU //maxint +#define SSR_SCRATCHPAD_BEGIN 0x100000 +#define SSR_SCRATCHPAD_END 0x120000 //current state of hw: only allow doubles #define CHECK_TYPE(T, I) (T == Type::getDoubleTy(I->getParent()->getContext())) using namespace llvm; +static cl::opt GenerateSSR("generate-ssr", cl::init(true), cl::Hidden); + +namespace{ + +void clobberRegistersAt(ArrayRef regs, Instruction *Before){ + IRBuilder<> builder(Before); + //equivalent to asm volatile ("":::regs); + std::string constraints = "~{dirflag},~{fpsr},~{flags}"; //TODO: what are these doing? + for (const std::string &r : regs){ + constraints = (formatv("~{{{0}},", r) + Twine(constraints)).str(); + } + InlineAsm *IA = InlineAsm::get( + FunctionType::get(Type::getVoidTy(builder.getContext()), false), + "", + constraints, + true + ); + builder.CreateCall(IA)->dump(); +} + +} //end of namespace + ///Wraps an AffineAcc *Access, expands all its SCEVs in constructor struct GenSSR{ private: @@ -118,25 +143,27 @@ struct GenSSR{ Constant *dim = ConstantInt::get(i32, Access->getDimension() - 1U); //dimension - 1, ty=i32 bool isStore = Access->getNStore() > 0u; - Function *SSRSetup; - if (!isStore){ - SSRSetup = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_read_imm); //can take _imm bc dm and dim are constant - }else{ - SSRSetup = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_write_imm); //can take _imm bc dm and dim are constant - } - std::array args = {getDMID(), dim, getBase()}; - builder.CreateCall(SSRSetup->getFunctionType(), SSRSetup, ArrayRef(args))->dump(); - Intrinsic::RISCVIntrinsics functions[] = { Intrinsic::riscv_ssr_setup_bound_stride_1d, Intrinsic::riscv_ssr_setup_bound_stride_2d, Intrinsic::riscv_ssr_setup_bound_stride_3d, Intrinsic::riscv_ssr_setup_bound_stride_4d }; + Value *StrideChange = nullptr; for (unsigned i = 0u; i < Access->getDimension(); i++){ + Value *Str = getStride(i); + Value *Bd = getBound(i); + Value 
*ChSt; + if (StrideChange) ChSt = builder.CreateSub(Str, StrideChange, formatv("stride.{0}d.final", i+1)); + else ChSt = Str; Function *SSRBoundStrideSetup = Intrinsic::getDeclaration(mod, functions[i]); - std::array bsargs = {getDMID(), getBound(i), getStride(i)}; + std::array bsargs = {getDMID(), Bd, ChSt}; builder.CreateCall(SSRBoundStrideSetup->getFunctionType(), SSRBoundStrideSetup, ArrayRef(bsargs))->dump(); + if (i + 1 != Access->getDimension()){ //only calculate stride change if needed + Value *bdXstr = builder.CreateMul(Bd, Str, formatv("bdXstd.{0}d", i+1)); + if (StrideChange) StrideChange = builder.CreateAdd(StrideChange, bdXstr, formatv("str.change.for{0}d", i+2)); + else StrideChange = bdXstr; + } } unsigned n_reps = 0U; @@ -146,6 +173,7 @@ struct GenSSR{ std::array pusharg = {getDMID(), cast(I)->getValueOperand()}; builder.SetInsertPoint(I); auto *C = builder.CreateCall(SSRPush->getFunctionType(), SSRPush, ArrayRef(pusharg)); + clobberRegistersAt(ArrayRef(formatv("ft{0}", (unsigned)DMID->getValue().getLimitedValue(NUM_SSR))), C); C->dump(); I->dump(); I->eraseFromParent(); n_reps++; @@ -156,6 +184,7 @@ struct GenSSR{ for (Instruction *I : Access->getAccesses()){ builder.SetInsertPoint(I); Instruction *V = builder.CreateCall(SSRPop->getFunctionType(), SSRPop, ArrayRef(poparg), "ssr.pop"); + clobberRegistersAt(ArrayRef(formatv("ft{0}", (unsigned)DMID->getValue().getLimitedValue(NUM_SSR))), V); V->dump(); I->dump(); BasicBlock::iterator ii(I); ReplaceInstWithValue(I->getParent()->getInstList(), ii, V); @@ -164,10 +193,26 @@ struct GenSSR{ } builder.SetInsertPoint(Point); - Constant *Rep = ConstantInt::get(i32, n_reps - 1U); //repetition - 1, ty=i32 + Constant *Rep = ConstantInt::get(i32, n_reps - 1U); Function *SSRRepetitionSetup = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_setup_repetition); std::array repargs = {getDMID(), Rep}; builder.CreateCall(SSRRepetitionSetup->getFunctionType(), SSRRepetitionSetup, ArrayRef(repargs))->dump(); + + 
Function *SSRSetup; + if (!isStore){ + SSRSetup = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_read_imm); //can take _imm bc dm and dim are constant + }else{ + SSRSetup = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_write_imm); //can take _imm bc dm and dim are constant + } + std::array args = {getDMID(), dim, getBase()}; + //NOTE: this starts the prefetching ==> always needs to be inserted AFTER bound/stride and repetition setups !!! + builder.CreateCall(SSRSetup->getFunctionType(), SSRSetup, ArrayRef(args))->dump(); + + //create an SSR barrior in exit block. TODO: needed esp. for write streams? + builder.SetInsertPoint(Access->getLoop()->getExitBlock()->getFirstNonPHI()); + Function *SSRBarrier = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_barrier); + std::array barrargs = {getDMID()}; + builder.CreateCall(SSRBarrier->getFunctionType(), SSRBarrier, ArrayRef(barrargs))->dump(); return; } @@ -203,7 +248,6 @@ std::pair splitAt(Instruction *X, const Twine &name, Instruction *T = BB->getTerminator(); for (unsigned i = 0; i < T->getNumOperands(); i++){ Value *OP = T->getOperand(i); - T->dump(); if (dyn_cast(OP) == Two){ T->setOperand(i, One); //if an operand of the terminator of a predecessor of Two points to Two it should now point to One } @@ -326,19 +370,34 @@ BranchInst *cloneRegion(Instruction *BeginWith, Instruction *EndBefore, Dominato ///generates SSR enable & disable calls void generateSSREnDis(const Loop *L){ - IRBuilder<> builder(L->getLoopPreheader()->getTerminator()); + IRBuilder<> builder(L->getLoopPreheader()->getTerminator()); // ----------- in preheader Module *mod = L->getHeader()->getModule(); Function *SSREnable = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_enable); builder.CreateCall(SSREnable->getFunctionType(), SSREnable, ArrayRef()); - Function *FREPPragma = Intrinsic::getDeclaration(mod, Intrinsic::riscv_frep_infer); - builder.CreateCall(FREPPragma->getFunctionType(), FREPPragma, ArrayRef()); + //create inline asm 
that clobbers ft0-2 to make sure none of them are reordered to before ssr enable / after ssr disable + //equivalent to asm volatile ("":::"ft0", "ft1", "ft2"); + InlineAsm *IA = InlineAsm::get( + FunctionType::get(Type::getVoidTy(builder.getContext()), false), + "", + "~{ft0},~{ft1},~{ft2},~{dirflag},~{fpsr},~{flags}", + true + ); + builder.CreateCall(IA); + + //Function *FREPPragma = Intrinsic::getDeclaration(mod, Intrinsic::riscv_frep_infer); + //builder.CreateCall(FREPPragma->getFunctionType(), FREPPragma, ArrayRef()); - builder.SetInsertPoint(L->getExitBlock()->getTerminator()); + builder.SetInsertPoint(L->getExitBlock()->getTerminator()); // ----------- in exit block + builder.CreateCall(IA); //same here Function *SSRDisable = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_disable); builder.CreateCall(SSRDisable->getFunctionType(), SSRDisable, ArrayRef()); errs()<<"generated ssr_enable and ssr_disable\n"; + + L->getLoopPreheader()->getSinglePredecessor()->dump(); + L->getLoopPreheader()->dump(); + return; } @@ -478,6 +537,8 @@ void addChangedLoop(const Loop *NewL, SmallPtrSet &loops){ PreservedAnalyses SSRGenerationPass::run(Function &F, FunctionAnalysisManager &FAM){ + if (!SSR_INFERENCE || !GenerateSSR) return PreservedAnalyses::all(); + AffineAccess &AF = FAM.getResult(F); LoopInfo &LI = FAM.getResult(F); DominatorTree &DT = FAM.getResult(F); @@ -498,8 +559,6 @@ PreservedAnalyses SSRGenerationPass::run(Function &F, FunctionAnalysisManager &F if (p.second && isValid(p.second)) goodAccs.push_back(p.second); } - F.dump(); - if (goodAccs.empty()) return PreservedAnalyses::all(); //early exit ConflictGraph g(AF, ArrayRef(goodAccs)); @@ -550,11 +609,7 @@ PreservedAnalyses SSRGenerationPass::run(Function &F, FunctionAnalysisManager &F //TODO: merge loops //TODO: frep pragmas - if (changedLoops.empty()){ - return PreservedAnalyses::all(); - }else{ - F.addFnAttr(Attribute::AttrKind::NoInline); //mark function as no-inline, because there can be intersecting 
streams if function is inlined! - return PreservedAnalyses::none(); - } + F.addFnAttr(Attribute::AttrKind::NoInline); //mark function as no-inline, because there can be intersecting streams if function is inlined! + return PreservedAnalyses::none(); } From 31f1335ce4609499eef432b095798ca8932b122b Mon Sep 17 00:00:00 2001 From: ThomasRupf Date: Tue, 17 May 2022 15:01:24 +0200 Subject: [PATCH 26/47] problems with regremerge --- llvm/lib/Passes/PassBuilder.cpp | 3 +- llvm/lib/Transforms/SSR/SSRGeneration.cpp | 75 ++++++++++++++--------- 2 files changed, 49 insertions(+), 29 deletions(-) diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp index 62c19d3a84ab5..e54e4c6b1a37d 100644 --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -763,6 +763,7 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level, FPM.addPass(SimplifyCFGPass()); FPM.addPass(InstCombinePass()); + FPM.addPass(SSRInferencePass()); if (EnableLoopFlatten) FPM.addPass(LoopFlattenPass()); @@ -773,7 +774,7 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level, std::move(LPM2), /*UseMemorySSA=*/false, /*UseBlockFrequencyInfo=*/false, DebugLogging)); - FPM.addPass(SSRInferencePass()); + //FPM.addPass(SSRInferencePass()); // Delete small array after loop unroll. FPM.addPass(SROA()); diff --git a/llvm/lib/Transforms/SSR/SSRGeneration.cpp b/llvm/lib/Transforms/SSR/SSRGeneration.cpp index c7dd8f058e5c2..a6a57eea77c83 100644 --- a/llvm/lib/Transforms/SSR/SSRGeneration.cpp +++ b/llvm/lib/Transforms/SSR/SSRGeneration.cpp @@ -51,7 +51,7 @@ #define SSR_INFERENCE true -#define NUM_SSR 3U +#define NUM_SSR 3U //NOTE: if increased too much, might need to change 1st arguments to clobberRegisters(..) #define SSR_MAX_DIM 4U //both are inclusive! 
#define SSR_SCRATCHPAD_BEGIN 0x100000 @@ -65,20 +65,20 @@ static cl::opt GenerateSSR("generate-ssr", cl::init(true), cl::Hidden); namespace{ -void clobberRegistersAt(ArrayRef regs, Instruction *Before){ - IRBuilder<> builder(Before); +void clobberRegisters(ArrayRef regs, IRBuilder<> &builder){ //equivalent to asm volatile ("":::regs); std::string constraints = "~{dirflag},~{fpsr},~{flags}"; //TODO: what are these doing? - for (const std::string &r : regs){ - constraints = (formatv("~{{{0}},", r) + Twine(constraints)).str(); + for (const std::string r : regs){ + constraints = "~{" + r + "}," + constraints; //(formatv("~{{{0}},", r) + Twine(constraints)).str() } + errs()<dump(); + builder.CreateCall(IA); } } //end of namespace @@ -167,13 +167,16 @@ struct GenSSR{ } unsigned n_reps = 0U; + std::string s = formatv("ft{0}", (unsigned)DMID->getValue().getLimitedValue()); + ArrayRef regs(s); if (isStore){ Function *SSRPush = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_push); for (Instruction *I : Access->getAccesses()){ std::array pusharg = {getDMID(), cast(I)->getValueOperand()}; builder.SetInsertPoint(I); + clobberRegisters(regs, builder); auto *C = builder.CreateCall(SSRPush->getFunctionType(), SSRPush, ArrayRef(pusharg)); - clobberRegistersAt(ArrayRef(formatv("ft{0}", (unsigned)DMID->getValue().getLimitedValue(NUM_SSR))), C); + clobberRegisters(regs, builder); C->dump(); I->dump(); I->eraseFromParent(); n_reps++; @@ -183,8 +186,9 @@ struct GenSSR{ std::array poparg = {getDMID()}; for (Instruction *I : Access->getAccesses()){ builder.SetInsertPoint(I); - Instruction *V = builder.CreateCall(SSRPop->getFunctionType(), SSRPop, ArrayRef(poparg), "ssr.pop"); - clobberRegistersAt(ArrayRef(formatv("ft{0}", (unsigned)DMID->getValue().getLimitedValue(NUM_SSR))), V); + clobberRegisters(regs, builder); + auto *V = builder.CreateCall(SSRPop->getFunctionType(), SSRPop, ArrayRef(poparg), "ssr.pop"); + clobberRegisters(regs, builder); V->dump(); I->dump(); 
BasicBlock::iterator ii(I); ReplaceInstWithValue(I->getParent()->getInstList(), ii, V); @@ -208,7 +212,7 @@ struct GenSSR{ //NOTE: this starts the prefetching ==> always needs to be inserted AFTER bound/stride and repetition setups !!! builder.CreateCall(SSRSetup->getFunctionType(), SSRSetup, ArrayRef(args))->dump(); - //create an SSR barrior in exit block. TODO: needed esp. for write streams? + //create an SSR barrier in exit block. TODO: needed esp. for write streams? builder.SetInsertPoint(Access->getLoop()->getExitBlock()->getFirstNonPHI()); Function *SSRBarrier = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_barrier); std::array barrargs = {getDMID()}; @@ -242,7 +246,7 @@ void copyPHIsFromPred(BasicBlock *BB){ ///splits block, redirects all predecessor to first half of split, copies phi's std::pair splitAt(Instruction *X, const Twine &name, DomTreeUpdater *DTU){ BasicBlock *Two = X->getParent(); - BasicBlock *One = splitBlockBefore(Two, X, DTU, nullptr, nullptr, name); + BasicBlock *One = splitBlockBefore(Two, X, nullptr, nullptr, nullptr, name); for (auto *BB : predecessors(Two)){ if (BB == One) continue; Instruction *T = BB->getTerminator(); @@ -375,21 +379,19 @@ void generateSSREnDis(const Loop *L){ Function *SSREnable = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_enable); builder.CreateCall(SSREnable->getFunctionType(), SSREnable, ArrayRef()); + std::vector regs; + for (unsigned r = 0u; r < NUM_SSR; r++){ + regs.push_back(std::string(formatv("ft{0}", r))); + } //create inline asm that clobbers ft0-2 to make sure none of them are reordered to before ssr enable / after ssr disable //equivalent to asm volatile ("":::"ft0", "ft1", "ft2"); - InlineAsm *IA = InlineAsm::get( - FunctionType::get(Type::getVoidTy(builder.getContext()), false), - "", - "~{ft0},~{ft1},~{ft2},~{dirflag},~{fpsr},~{flags}", - true - ); - builder.CreateCall(IA); + clobberRegisters(ArrayRef(regs), builder); //Function *FREPPragma = Intrinsic::getDeclaration(mod, 
Intrinsic::riscv_frep_infer); //builder.CreateCall(FREPPragma->getFunctionType(), FREPPragma, ArrayRef()); builder.SetInsertPoint(L->getExitBlock()->getTerminator()); // ----------- in exit block - builder.CreateCall(IA); //same here + clobberRegisters(ArrayRef(regs), builder); Function *SSRDisable = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_disable); builder.CreateCall(SSRDisable->getFunctionType(), SSRDisable, ArrayRef()); @@ -411,7 +413,7 @@ Value *generateIntersectCheck(IRBuilder<> &builder, GenSSR *G, GenSSR *H){ return builder.CreateOr(GhiLTHlo, HhiLTGlo, "or.memrange"); } -void generateSSRGuard(BranchInst *BR, ArrayRef streams){ +void generateSSRGuard(BranchInst *BR, ArrayRef streams, AffineAccess &AF){ assert(BR->isConditional()); if (streams.empty()) return; IRBuilder<> builder(BR); @@ -421,12 +423,10 @@ void generateSSRGuard(BranchInst *BR, ArrayRef streams){ } for (unsigned i = 0; i < streams.size(); i++){ GenSSR *G = streams[i]; - for (unsigned j = 0; j < streams.size(); j++){ - if (G->Access->getNStore() > 0u){ - GenSSR *H = streams[j]; - if (j < i || (j > i && H->Access->getNStore() == 0u)){ //true if H is before G OR H is after G and a load - checks.push_back(generateIntersectCheck(builder, G, H)); - } + for (unsigned j = 0; j < i; j++){ + GenSSR *H = streams[j]; + if (AF.conflictWWWR(G->Access, H->Access)){ + generateIntersectCheck(builder, G, H); } } } @@ -480,7 +480,11 @@ struct ConflictGraph{ std::map> &color = *(new std::map>()); std::vector accs; for (const auto &A : conflicts) accs.push_back(A.first); - auto isBetter = [](NodeT A, NodeT B){ return A->getDimension() > B->getDimension(); }; + auto isBetter = [](NodeT A, NodeT B){ + unsigned a = A->getLoop()->getLoopDepth() + 2 * A->getDimension() + (A->getNLoad() > 0u); + unsigned b = B->getLoop()->getLoopDepth() + 2 * B->getDimension() + (B->getNLoad() > 0u); + return a > b; + }; std::sort(accs.begin(), accs.end(), isBetter); for (const auto &A : accs){ bool done = false; @@ -586,7 
+590,7 @@ PreservedAnalyses SSRGenerationPass::run(Function &F, FunctionAnalysisManager &F auto p = ssrs.find(L); if (p != ssrs.end()){ BranchInst *BR = cloneRegion(L->getLoopPreheader()->getTerminator(), &*L->getExitBlock()->getFirstInsertionPt(), DT, &DTU, &LI, &MSSAU); - generateSSRGuard(BR, ArrayRef(p->getSecond().begin(), p->getSecond().end())); //generate "SSR guard" + generateSSRGuard(BR, ArrayRef(p->getSecond().begin(), p->getSecond().end()), AF); //generate "SSR guard" } } @@ -613,3 +617,18 @@ PreservedAnalyses SSRGenerationPass::run(Function &F, FunctionAnalysisManager &F return PreservedAnalyses::none(); } + + +/* +for (unsigned i = 0; i < streams.size(); i++){ + GenSSR *G = streams[i]; + for (unsigned j = 0; j < streams.size(); j++){ + if (G->Access->getNStore() > 0u){ + GenSSR *H = streams[j]; + if (j < i || (j > i && H->Access->getNStore() == 0u)){ //true if H is before G OR H is after G and a load + checks.push_back(generateIntersectCheck(builder, G, H)); + } + } + } + } +*/ \ No newline at end of file From b8950663bafe60c369f37ed11d6a4f943cb0914d Mon Sep 17 00:00:00 2001 From: ThomasRupf Date: Tue, 17 May 2022 23:31:30 +0200 Subject: [PATCH 27/47] fix backend bug --- llvm/lib/Target/RISCV/RISCVExpandSSRInsts.cpp | 120 +++++++++++++----- 1 file changed, 89 insertions(+), 31 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVExpandSSRInsts.cpp b/llvm/lib/Target/RISCV/RISCVExpandSSRInsts.cpp index a90f199b6b1d6..e45546904b9a2 100644 --- a/llvm/lib/Target/RISCV/RISCVExpandSSRInsts.cpp +++ b/llvm/lib/Target/RISCV/RISCVExpandSSRInsts.cpp @@ -480,14 +480,99 @@ bool RISCVExpandSSR::expandSSR_Barrier(MachineBasicBlock &MBB, } void RISCVExpandSSR::mergePushPop(MachineBasicBlock &MBB) { - SmallSet virtRegs[NUM_SSR]; + //SmallSet virtRegs[NUM_SSR]; const TargetRegisterInfo *TRI = MBB.getParent()->getRegInfo().getTargetRegisterInfo(); - bool inSSRRegion = false; Register ssr_regs[NUM_SSR]; for(unsigned ssr_no = 0; ssr_no < NUM_SSR; ++ssr_no) ssr_regs[ssr_no] = 
getSSRFtReg(ssr_no); + + for (auto ssr_reg : ssr_regs){ + SmallSet modified; + for (auto MI = MBB.rbegin(); MI != MBB.rend(); ){ //go from back to front + auto PMI = std::next(MI); //this is prev bc reverse iterator + if(MI->getOpcode() == RISCV::FSGNJ_D){ + if (MI->getOperand(1).getReg() == ssr_reg && MI->getOperand(2).getReg() == ssr_reg && MI->getOperand(0).isReg()){ //this was an SSR pop + Register r = MI->getOperand(0).getReg(); //register to replace + bool replacedAll = true; //if there are no uses, can replace too + SmallVector replacements; + for (auto MI2 = MBB.rbegin(); replacedAll && MI2 != MI; MI2++){ + for (auto Op = MI2->operands_begin(); replacedAll && Op != MI2->operands_end(); ++Op){ + if (Op->isReg() && Op->getReg() == r){ + replacedAll = replacedAll && modified.find(&*MI2) == modified.end(); + replacements.push_back(&*MI2); + } + } + } + if (replacedAll) { + MBB.addLiveIn(ssr_reg); + MI->eraseFromParentAndMarkDBGValuesForRemoval(); + for (MachineInstr *I : replacements){ + I->dump(); + I->substituteRegister(r, ssr_reg, 0, *TRI); + I->dump(); + modified.insert(I); + } + } + }else if(MI->getOperand(0).getReg() == ssr_reg){ + auto Op1 = MI->getOperand(1), Op2 = MI->getOperand(2); + if (Op1.isReg() && Op2.isReg() && Op1.getReg() == Op2.getReg() && Op1.isKill() && Op2.isKill()){ //because Op is kill will not be used later + Register r = Op1.getReg(); + MachineOperand *O = nullptr; + bool done = false; + //find the single operand that defines this reg (no other users allowed in between) + for (auto MI2 = std::next(MI)/*this is prev*/; !done && MI2 != MBB.rend(); ++MI2){ + for (auto Op = MI2->operands_begin(); Op != MI2->operands_end(); ++Op){ + if (Op->isReg() && Op->getReg() == r){ + done = true; + if (Op->isDef()) O = &*Op; + break; + } + } + } + if (O){ + O->getParent()->dump(); + O->setReg(ssr_reg); + O->getParent()->dump(); + MI->eraseFromParentAndMarkDBGValuesForRemoval(); + } + } + } + } + MI = PMI; + } + } + MBB.sortUniqueLiveIns(); +} + +/// 
Gather parameters for the register merging +RISCVExpandSSR::RegisterMergingPreferences RISCVExpandSSR::gatherRegisterMergingPreferences() { + RISCVExpandSSR::RegisterMergingPreferences RMP; + + // set up defaults + RMP.Enable = true; + + // read user + if (SSRRegisterMerge.getNumOccurrences() > 0) + RMP.Enable = !SSRRegisterMerge; + + LLVM_DEBUG(dbgs() << "RMP Enable "< 0) - RMP.Enable = !SSRRegisterMerge; - - LLVM_DEBUG(dbgs() << "RMP Enable "< Date: Fri, 27 May 2022 11:53:52 +0200 Subject: [PATCH 28/47] changes --- llvm/lib/Analysis/AffineAccessAnalysis.cpp | 5 +++-- llvm/lib/Target/RISCV/RISCVExpandSSRInsts.cpp | 19 +++++++++++++------ llvm/lib/Transforms/SSR/SSRGeneration.cpp | 6 +++--- 3 files changed, 19 insertions(+), 11 deletions(-) diff --git a/llvm/lib/Analysis/AffineAccessAnalysis.cpp b/llvm/lib/Analysis/AffineAccessAnalysis.cpp index ba3b7030ffdc5..5ee072fb6801a 100644 --- a/llvm/lib/Analysis/AffineAccessAnalysis.cpp +++ b/llvm/lib/Analysis/AffineAccessAnalysis.cpp @@ -541,8 +541,9 @@ ArrayRef AffineAccess::getAccesses() const{ } bool AffineAccess::accessPatternsMatch(const AffineAcc *A, const AffineAcc *B) const { - if (!SCEVEquals(A->data, B->data, SE)) return false; if (A->getDimension() != B->getDimension()) return false; + if (A->getLoop() != B->getLoop()) return false; + if (!SCEVEquals(A->data, B->data, SE)) return false; for (unsigned i = 0; i < A->getDimension(); i++){ if (!SCEVEquals(A->bounds[i], B->bounds[i], SE)) return false; if (!SCEVEquals(A->strides[i], B->strides[i], SE)) return false; @@ -583,7 +584,7 @@ bool AffineAccess::conflictWWWR(const AffineAcc *A, const AffineAcc *B) const { } } - if (A->getLoop()->contains(B->getLoop()) || B->getLoop()->contains(A->getLoop())) return true; + if (A->getLoop() == B->getLoop() || A->getLoop()->contains(B->getLoop()) || B->getLoop()->contains(A->getLoop())) return true; return false; } diff --git a/llvm/lib/Target/RISCV/RISCVExpandSSRInsts.cpp b/llvm/lib/Target/RISCV/RISCVExpandSSRInsts.cpp index 
e45546904b9a2..808576641f365 100644 --- a/llvm/lib/Target/RISCV/RISCVExpandSSRInsts.cpp +++ b/llvm/lib/Target/RISCV/RISCVExpandSSRInsts.cpp @@ -515,19 +515,26 @@ void RISCVExpandSSR::mergePushPop(MachineBasicBlock &MBB) { } }else if(MI->getOperand(0).getReg() == ssr_reg){ auto Op1 = MI->getOperand(1), Op2 = MI->getOperand(2); + //TODO: turns out the following condition is almost never true ==> use live-ness analysis instead of .isKill() ? if (Op1.isReg() && Op2.isReg() && Op1.getReg() == Op2.getReg() && Op1.isKill() && Op2.isKill()){ //because Op is kill will not be used later Register r = Op1.getReg(); MachineOperand *O = nullptr; - bool done = false; - //find the single operand that defines this reg (no other users allowed in between) - for (auto MI2 = std::next(MI)/*this is prev*/; !done && MI2 != MBB.rend(); ++MI2){ - for (auto Op = MI2->operands_begin(); Op != MI2->operands_end(); ++Op){ + //find the most recent operand that sets this reg + for (auto MI2 = std::next(MI); !O && MI2 != MBB.rend(); ++MI2){ + //FIXME: first operand is always dest operand right? otherwise require def (like below) or query llvm which operand is dest (how?) 
+ if (MI2->getNumOperands() == 0u) continue; + MachineOperand *Op = &*MI2->operands_begin(); + MI2->dump(); + if (Op->isReg() && Op->getReg() == r){ + O = Op; + } + /*for (auto Op = MI2->operands_begin(); Op != MI2->operands_end(); ++Op){ if (Op->isReg() && Op->getReg() == r){ done = true; - if (Op->isDef()) O = &*Op; + if (Op->isDef() || SKIP_DEF_CHECK) O = &*Op; break; } - } + }*/ } if (O){ O->getParent()->dump(); diff --git a/llvm/lib/Transforms/SSR/SSRGeneration.cpp b/llvm/lib/Transforms/SSR/SSRGeneration.cpp index a6a57eea77c83..1f88bbd2957c3 100644 --- a/llvm/lib/Transforms/SSR/SSRGeneration.cpp +++ b/llvm/lib/Transforms/SSR/SSRGeneration.cpp @@ -120,7 +120,7 @@ struct GenSSR{ checks.push_back(builder.CreateICmpSGE(getBound(i), ConstantInt::get(Type::getInt32Ty(ExpandBefore->getContext()), 0U))); } Value *BaseInt = builder.CreatePtrToInt(getBase(), i64, "base.to.int"); - MemBegin = BaseInt; + this->MemBegin = BaseInt; checks.push_back(builder.CreateICmpUGE(BaseInt, ConstantInt::get(i64, SSR_SCRATCHPAD_BEGIN), "scratchpad.begin.check")); Value *EndIncl = BaseInt; for (unsigned i = 0U; i < Access->getDimension(); i++){ @@ -129,7 +129,7 @@ struct GenSSR{ Value *RangeExt = builder.CreateSExt(Range, i64, Twine("range.sext.").concat(dim)); EndIncl = builder.CreateAdd(EndIncl, RangeExt, Twine("end.incl.").concat(dim)); } - MemEnd = EndIncl; + this->MemEnd = EndIncl; checks.push_back(builder.CreateICmpULE(EndIncl, ConstantInt::get(i64, SSR_SCRATCHPAD_END), "scratchpad.end.check")); return builder.CreateAnd(ArrayRef(checks)); } @@ -426,7 +426,7 @@ void generateSSRGuard(BranchInst *BR, ArrayRef streams, AffineAccess & for (unsigned j = 0; j < i; j++){ GenSSR *H = streams[j]; if (AF.conflictWWWR(G->Access, H->Access)){ - generateIntersectCheck(builder, G, H); + checks.push_back(generateIntersectCheck(builder, G, H)); } } } From b5ac6204d8c4f5da2bdd6ba8ffb40356b65a1aa4 Mon Sep 17 00:00:00 2001 From: thrupf Date: Wed, 1 Jun 2022 08:49:17 +0200 Subject: [PATCH 29/47] only 
conflicts left --- .../llvm/Analysis/AffineAccessAnalysis.h | 126 ++- llvm/lib/Analysis/AffineAccessAnalysis.cpp | 922 +++++++----------- 2 files changed, 456 insertions(+), 592 deletions(-) diff --git a/llvm/include/llvm/Analysis/AffineAccessAnalysis.h b/llvm/include/llvm/Analysis/AffineAccessAnalysis.h index 3e8dbcf66e8bd..a2c0c7ff099bd 100644 --- a/llvm/include/llvm/Analysis/AffineAccessAnalysis.h +++ b/llvm/include/llvm/Analysis/AffineAccessAnalysis.h @@ -2,8 +2,10 @@ #include "llvm/Passes/PassBuilder.h" #include "llvm/Passes/PassPlugin.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Analysis/ScalarEvolution.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/DenseMap.h" #include #include @@ -13,55 +15,109 @@ namespace llvm { class AffineAccess; class AffineAccessAnalysis; +class LoopInfo; +class ScalarEvolution; +class MemorySSA; +class MemoryUseOrDef; -class AffineAcc{ - friend AffineAccess; +struct LoopRep{ private: - AffineAcc(Instruction *Addr, ArrayRef accesses, const SCEV *data); - - const SCEV *data; - SmallVector bounds; //from inner- to outermost loop - SmallVector strides; //from inner- to outermost loop - Instruction *Addr; - SmallVector accesses; //load/store instructions that use address (guaranteed to be in same loop) - const Loop *L; //outermost loop + ScalarEvolution &SE; + DominatorTree &DT; + const Loop *L; + const SCEV *RepSCEV; + Value *Rep = nullptr; + SmallVector containingLoops; //from inner- to outermost + unsigned safeExpandBound; //exclusive bound public: - AffineAcc() = delete; - void dump() const; - unsigned getDimension() const; - unsigned getUsedDimension() const; + /// construct rep for this loop, if loop well-formed isAvaliable will give true + LoopRep(const Loop *L, ArrayRef contLoops, ScalarEvolution &SE, DominatorTree &DT); + bool isAvailable() const; + bool isOnAllCFPathsOfParentIfExecuted() const; const Loop *getLoop() const; - Instruction 
*getAddrIns() const; - const SmallVector &getAccesses() const; - unsigned getNStore() const; - unsigned getNLoad() const; + const SCEV *getSCEV() const; + const SCEV *getSCEVPlusOne() const; + bool isSafeToExpandBefore(const Loop *L) const; + + ///expands LoopRep::RepSCEV at InsertBefore (if nullptr in preheader of loop) + Value *expandAt(Type *ty, Instruction *InsertBefore = (Instruction *)nullptr); +}; + +enum ConflictKind {NoConflict = 0, MustNotIntersect, MustBeSame, Bad }; + +struct AffAcc{ +private: + ScalarEvolution &SE; + MemoryUseOrDef *MA; + SmallVector accesses; //the load/store (or call) instructions + SmallVector baseAddresses; //base addresses depending on loop + SmallVector steps; //steps per loop (0 if loop-inv) + SmallVector reps; //loop reps + SmallVector containingLoops; //from inner- to outermost + DenseMap> conflicts; //conflicts to other Affine Accesses and starting from which dimension + void findSteps(const SCEV *A, const SCEV *Factor, unsigned loop); + +public: + AffAcc() = delete; + //immediately copies the contens of accesses and containingLoops + AffAcc(ArrayRef accesses, const SCEV *Addr, MemoryUseOrDef *MA, ArrayRef containingLoops, ScalarEvolution &SE); + ArrayRef getAccesses() const; + bool isWrite() const; + unsigned getMaxDimension() const; + bool isWellFormed(unsigned dimension) const; + bool canExpandBefore(const Loop *L) const; + void dump() const; + unsigned loopToDimension(const Loop *L) const; + ConflictKind getConflictFor(const AffAcc *A, unsigned dimension) const; + ConflictKind getConflictInLoop(const AffAcc *A, const Loop *L) const; + const SCEV *getBaseAddr(unsigned dim) const; + const SCEV *getStep(unsigned dim) const; + const SCEV *getRep(unsigned dim) const; + const Loop *getLoop(unsigned dim) const; + + MemoryAccess *getMemoryAccess(); + ///finds all MemoryDefs that clobber this access's memory that prevent it from being prefetched before the loop + ArrayRef getAllClobberingFor(const Loop *L); + void 
addConflict(const AffAcc *A, unsigned startDimension, ConflictKind kind); + void addConflictInLoop(const AffAcc *A, const Loop *StartLoop, ConflictKind kind); + bool promote(LoopRep *LR); ///does not check whether it is on all CF-paths for LR->getLoop() + ///code gen: + Value *expandBaseAddr(unsigned dimension, Type *ty, Instruction *InsertBefore = (Instruction *)nullptr); + Value *expandStep(unsigned dimension, Type *ty, Instruction *InsertBefore = (Instruction *)nullptr); + Value *expandRep(unsigned dimension, Type *ty, Instruction *InsertBefore = (Instruction *)nullptr); }; class AffineAccess{ private: - SmallVector accesses; //accesses - DenseMap loopReps; //wellformed loops & their bt counts - DenseSet addresses; //already checked address instructions ScalarEvolution &SE; DominatorTree &DT; LoopInfo &LI; + MemorySSA &MSSA; + AAResults &AA; + DenseMap access; + DenseMap reps; + DenseMap> expandableAccesses; + + DenseSet analyze(const Loop *Parent, std::vector &loopPath); + void addConflictsForUse(AffAcc *A, const Loop *L); + void addConflictsForDef(AffAcc *A, const Loop *L); + void addConflict(AffAcc *A, AffAcc *B, const Loop *L, ConflictKind kind); public: - AffineAccess(ScalarEvolution &SE, DominatorTree &DT, LoopInfo &LI); + AffineAccess(Function &F, ScalarEvolution &SE, DominatorTree &DT, LoopInfo &LI, MemorySSA &MSSA, AAResults &AA); AffineAccess() = delete; - void addAllAccesses(Instruction *Addr, const Loop *L); - AffineAcc *promoteAccess(const AffineAcc &Acc, const Loop *L, const SCEV *Stride); - std::pair splitLoadStore(const AffineAcc *Acc) const; - ArrayRef getAccesses() const; - bool accessPatternsMatch(const AffineAcc *A, const AffineAcc *B) const; - bool shareInsts(const AffineAcc *A, const AffineAcc *B) const; - bool conflictWWWR(const AffineAcc *A, const AffineAcc *B) const; - bool shareLoops(const AffineAcc *A, const AffineAcc *B) const; - const SCEV *wellFormedLoopBTCount(const Loop *L) const; //returns bt count if loop is well-formed - Value 
*expandData(const AffineAcc *aa, Type *ty = (Type *)nullptr, Instruction *InsertBefore = (Instruction *)nullptr) const; - Value *expandBound(const AffineAcc *aa, unsigned i, Type *ty = (Type *)nullptr, Instruction *InsertBefore = (Instruction *)nullptr) const; - Value *expandStride(const AffineAcc *aa, unsigned i, Type *ty = (Type *)nullptr, Instruction *InsertBefore = (Instruction *)nullptr) const; + bool accessPatternsMatch(const AffAcc *A, const AffAcc *B, const Loop *L) const; + ScalarEvolution &getSE() const; + DominatorTree &getDT() const; + LoopInfo &getLI() const; + MemorySSA &getMSSA() const; + AAResults &getAA() const; + ArrayRef getLoopsInPreorder() const; + ArrayRef getExpandableAccesses(const Loop *L) const; + const AffAcc *getAccess(Instruction *I) const; + + static Value *getAddress(Instruction *I); }; class AffineAccessAnalysis : public AnalysisInfoMixin { diff --git a/llvm/lib/Analysis/AffineAccessAnalysis.cpp b/llvm/lib/Analysis/AffineAccessAnalysis.cpp index 5ee072fb6801a..e2f255a16096e 100644 --- a/llvm/lib/Analysis/AffineAccessAnalysis.cpp +++ b/llvm/lib/Analysis/AffineAccessAnalysis.cpp @@ -22,9 +22,11 @@ #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" +#include "llvm/Analysis/MemorySSA.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/AliasAnalysisEvaluator.h" #include "llvm/Analysis/AliasSetTracker.h" +#include "llvm/Analysis/DependenceAnalysis.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" @@ -33,7 +35,9 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/TinyPtrVector.h" +#include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/ilist.h" +#include "llvm/ADT/DenseSet.h" #include "llvm/ADT/DenseMap.h" #include @@ -43,64 +47,6 @@ using namespace llvm; -AffineAcc::AffineAcc(Instruction *Addr, ArrayRef accesses, - const SCEV *data) : data(data), Addr(Addr), L(nullptr){ - for (Instruction *I : accesses) this->accesses.push_back(I); - return; -} - 
-unsigned AffineAcc::getDimension() const{ - return this->bounds.size(); -} - -unsigned AffineAcc::getUsedDimension() const{ - unsigned d = 0u; - for (const SCEV *S : this->strides){ - if (!isa(S) || !cast(S)->isZero()) d++; //only cound a dimension if its stride is non-zero - } - return d; -} - -void AffineAcc::dump() const{ - errs()<<"Affine Access in Loop:\n"; - if (L) L->dump(); - else errs()<<"nullptr\n"; - errs()<<"With Addr instruction: "; Addr->dump(); - errs()<<"And the following load/store instructions:\n"; - for (Instruction *I : accesses){ - I->dump(); - } - errs()<<"data pointer: "; data->dump(); - for (unsigned i = 0; i < this->getDimension(); i++){ - errs()<<"dim "<<(i+1)<<" stride: "; strides[i]->dump(); - errs()<<"dim "<<(i+1)<<" bound: "; bounds[i]->dump(); - } -} - -Instruction *AffineAcc::getAddrIns() const{ - return Addr; -} - -const Loop *AffineAcc::getLoop() const{ - return L; -} - -const SmallVector &AffineAcc::getAccesses() const{ - return this->accesses; -} - -unsigned AffineAcc::getNStore() const { - unsigned r = 0U; - for (Instruction *I : this->accesses) r += isa(I); - return r; -} - -unsigned AffineAcc::getNLoad() const { - unsigned r = 0U; - for (Instruction *I : this->accesses) r += isa(I); - return r; -} - //================== AffineAcces, helper functions ========================================= namespace { @@ -109,22 +55,22 @@ namespace { /// L has 1 preheader and 1 dedicated exit /// L has 1 backedge and 1 exiting block /// bt SCEV can be expanded to instructions at insertionsPoint -bool checkLoop(const Loop *L, DominatorTree &DT, ScalarEvolution &SE, Instruction *InsertionPoint){ - if (!L->isLCSSAForm(DT)) { errs()<<"not LCSSA\n"; return false; } - if (!L->getLoopPreheader()) { errs()<<"no preheader\n"; return false; } - if (!L->getExitBlock()) { errs()<<"nr. exit blocks != 1\n"; return false; } - if (!L->hasDedicatedExits()) { errs()<<"exit is not dedicated\n"; return false; } - if (L->getNumBackEdges() != 1) { errs()<<"nr. 
back-edges != 1\n"; return false; } - +bool checkLoop(const Loop *L, DominatorTree &DT, ScalarEvolution &SE){ + if (!L->isLCSSAForm(DT) + || !L->getLoopPreheader() + || !L->getExitingBlock() + || !L->getExitBlock() + || !L->hasDedicatedExits() + || L->getNumBackEdges() != 1) { + return false; + } if (!SE.hasLoopInvariantBackedgeTakenCount(L)){ - errs()<<"checkLoop: cannot calculate backedge taken count\n"; return false; } const SCEV *bt = SE.getBackedgeTakenCount(L); - if(!isSafeToExpandAt(bt, InsertionPoint, SE) /*|| !SE->isAvailableAtLoopEntry(bt, L)*/){ - errs()<<"cannot expand bt SCEV: "; bt->dump(); + if(!isa(bt) || !SE.isAvailableAtLoopEntry(bt, L)){ + return false; } - errs()<<"loop is well-formed: "; bt->dump(); return true; } @@ -291,92 +237,6 @@ bool isOnAllPredicatedControlFlowPaths(BasicBlock *BB, const Loop *L, const Domi return true; } -/// get Loops containing Ins from innermost to outermost -SmallVector &getContainingLoops(ArrayRef loopsPreorder, Instruction *Ins){ - BasicBlock *BB = Ins->getParent(); - SmallVector *r = new SmallVector(); - for (auto L = loopsPreorder.rbegin(); L != loopsPreorder.rend(); ++L){ //go through loops in reverse order ==> innermost first - if ((*L)->contains(BB)){ - r->push_back(*L); - } - } - return *r; -} - -void findStridesRec(const SCEV *Addr, ArrayRef loops, SmallVector &factors, ScalarEvolution &SE, SmallVector &res){ - if (!res.empty()) { errs()<<"stride: "; res.back()->dump(); } - errs()<<"finding strides in "; Addr->dump(); - if (loops.empty()) return; - errs()<<"loop header: "<getHeader()->getNameOrAsOperand()<<"\n"; - switch (Addr->getSCEVType()) - { - case SCEVTypes::scAddRecExpr: - { - auto AddRec = cast(Addr); - if (AddRec->getLoop() == *loops.begin()){ - const SCEV *S = AddRec->getStepRecurrence(SE); - for (const SCEV *x : factors){ - auto p = toSameType(S, x, SE, true); - if (!p.hasValue()) { - assert(false && "unsafe toSameType returned None!"); //TODO: change to errs() - return; - } - S = 
SE.getMulExpr(p.getValue().first, p.getValue().second); - } - res.push_back(S); - findStridesRec(AddRec->getStart(), ArrayRef(loops.begin()+1, loops.end()), factors, SE, res); - }else{ - bool occurs = false; - for (const Loop *L : loops) occurs = occurs || AddRec->getLoop() == L; //loops needs to occur further up, o/w invalid - if (!occurs) return; - res.push_back(SE.getConstant(APInt(64U, 0U))); //TODO: this leads to ugly casts - findStridesRec(AddRec, ArrayRef(loops.begin()+1, loops.end()), factors, SE, res); - } - return; - } - //case SCEVTypes::scTruncate: TODO: is unsafe here, right? - case SCEVTypes::scSignExtend: - case SCEVTypes::scZeroExtend: - findStridesRec(cast(Addr)->getOperand(0), loops, factors, SE, res); - return; - - case SCEVTypes::scAddExpr: - { - auto S = cast(Addr); - bool lhs = SE.containsAddRecurrence(S->getOperand(0)); //TODO: this does not catch all cases maybe limit SCEV to outermost loop? - bool rhs = SE.containsAddRecurrence(S->getOperand(1)); - if (lhs && !rhs) findStridesRec(S->getOperand(0), loops, factors, SE, res); - else if (!lhs && rhs) findStridesRec(S->getOperand(1), loops, factors, SE, res); - return; - } - case SCEVTypes::scMulExpr: - { - auto S = cast(Addr); - bool lhs = SE.containsAddRecurrence(S->getOperand(0)); //TODO: this does not catch all cases maybe limit SCEV to outermost loop? 
- bool rhs = SE.containsAddRecurrence(S->getOperand(1)); - if (lhs && !rhs) { - factors.push_back(S->getOperand(1)); - findStridesRec(S->getOperand(0), loops, factors, SE, res); - }else if (!lhs && rhs) { - factors.push_back(S->getOperand(0)); - findStridesRec(S->getOperand(1), loops, factors, SE, res); - } - return; - } - - default: - return; - } -} - -SmallVector &findStrides(Instruction *Addr, ArrayRef loops, ScalarEvolution &SE){ - const SCEV *AddrS = SE.getSCEV(Addr); //SE.getSCEVAtScope(Addr, loops.back()); //we only look at the scev as contained in outermost loop - SmallVector &strides = *(new SmallVector()); - SmallVector factors; - findStridesRec(AddrS, loops, factors, SE, strides); - return strides; -} - Value *castToSize(Value *R, Type *ty, Instruction *InsPoint){ const DataLayout &DL = InsPoint->getParent()->getModule()->getDataLayout(); IRBuilder<> builder(InsPoint); @@ -393,472 +253,420 @@ Value *castToSize(Value *R, Type *ty, Instruction *InsPoint){ } //end of namespace -//================== AffineAcces, Result of Analysis ========================================= -AffineAccess::AffineAccess(ScalarEvolution &SE, DominatorTree &DT, LoopInfo &LI) : SE(SE), DT(DT), LI(LI){ - auto loops = LI.getLoopsInPreorder(); - unsigned l = 0u; - for (const Loop *L : loops){ - L->dump(); - - if (!L->getLoopPreheader()) continue; - if (!checkLoop(L, DT, SE, L->getLoopPreheader()->getTerminator())) continue; - - loopReps.insert(std::make_pair(L, SE.getBackedgeTakenCount(L))); l++; - } - errs()<<"FOUND "< 0 -/// (3) data ptr can be computed outside of parent loop -/// (4) forall bd : aa.bounds. SE.isLoopInvariant(bd, L) && isSafeToExpandAt(bd, LPreheader->getTerminator(), SE) -/// (5) forall st : aa.strides. 
SE.isLoopInvariant(st, L) && isSafeToExpandAt(st, LPreheader->getTerminator(), SE) -/// (6) isSafeToExpandAt(Bound / Stride, LPreheader->getTerminator(), SE) -AffineAcc *AffineAccess::promoteAccess(const AffineAcc &aa, const Loop *L, const SCEV *Stride){ - assert(!aa.L || (aa.L && !aa.L->isInvalid())); - assert((!aa.L) || (aa.L && aa.L->getParentLoop() == L && "can only promote to parent loop")); //(1) - assert(this->loopReps.find(L) != this->loopReps.end() && "L is well formed"); //(1) - - errs()<<"Trying to promote AA of dim="<getSecond(); - const SCEV *Rep = SE.getAddExpr(Bd, SE.getConstant(Bd->getType(), 1U)); - if (!isOnAllPredicatedControlFlowPaths(aa.L->getHeader(), L, this->DT, Rep, this->SE)) return nullptr; //(2.1) +// ==== LoopRep ==== +LoopRep::LoopRep(const Loop *L, ArrayRef contLoops, ScalarEvolution &SE, DominatorTree &DT) + : L(L), containingLoops(contLoops.begin(), contLoops.end()), SE(SE), DT(DT), safeExpandBound(0u) + { + if (checkLoop(L, DT, SE)){ + const SCEV *R = SE.getBackedgeTakenCount(L); + RepSCEV = isa(R) ? 
nullptr : R; }else{ - for (Instruction *I : aa.accesses){ - if (!isOnAllControlFlowPaths(I->getParent(), L, DT)) return nullptr; //(2.2) - } + RepSCEV = nullptr; } - - errs()<<"passed (2), "; - - const SCEV *Bound = this->loopReps.find(L)->getSecond(); - Instruction *InsPoint = L->getLoopPreheader()->getTerminator(); - - if (!SE.hasComputableLoopEvolution(aa.data, L) && !SE.isLoopInvariant(aa.data, L)) return nullptr; //(3.1) - const SCEV *Data = SE.SplitIntoInitAndPostInc(L, aa.data).first; - if (!isSafeToExpandAt(Data, InsPoint, SE)) return nullptr; //(3.2) - - errs()<<"passed (3), "; - - for (const SCEV *Bd : aa.bounds){ - if (!(SE.isLoopInvariant(Bd, L) && isSafeToExpandAt(Bd, InsPoint, SE))) return nullptr; //(4) - } - - errs()<<"passed (4), "; - - for (const SCEV *Str : aa.strides){ - if (!(SE.isLoopInvariant(Str, L) && isSafeToExpandAt(Str, InsPoint, SE))) return nullptr; //(5) + + if (RepSCEV){ + while (safeExpandBound < containingLoops.size() + && isSafeToExpandAt(RepSCEV, containingLoops[safeExpandBound]->getLoopPreheader()->getTerminator(), SE)) + safeExpandBound++; } - - errs()<<"passed (5), "; - - if (!isSafeToExpandAt(Bound, InsPoint, SE) || !isSafeToExpandAt(Stride, InsPoint, SE)) return nullptr; //(6) - - errs()<<"passed (6)"; - - AffineAcc *A = new AffineAcc(aa); - A->data = Data; - A->L = L; - A->bounds.push_back(Bound); - A->strides.push_back(Stride); - return A; } -/// adds all affine accesses that use Addr in loop L -void AffineAccess::addAllAccesses(Instruction *Addr, const Loop *L){ - if (addresses.find(Addr) != addresses.end()) return; //already called addAllAccesses on this Addr instruction - addresses.insert(Addr); - - errs()<<"addAllAccesses: start: "<<*Addr<<"\n"; - - //find all accesses - std::vector accesses; - for (auto U = Addr->use_begin(); U != Addr->use_end(); ++U){ - Instruction *Acc = dyn_cast(U->getUser()); - if (!Acc) continue; //user is not an instruction - if (isa(Acc) || isa(Acc)){ - if 
(!isOnAllControlFlowPaths(Acc->getParent(), L, DT)) continue; //access does not occur consistently in loop ==> not suitable - accesses.push_back(Acc); - } - } - if (accesses.empty()) return; //Addr not used - - errs()<<"adding Access: "; Addr->dump(); - - //we are looking at containing loops of all the accesses (guaranteed to be all the same) - //Addr ins might be outside of loop (licm) if 1D stride is 0 - auto &cloops = getContainingLoops(LI.getLoopsInPreorder(), accesses[0]); +bool LoopRep::isAvailable() const { return RepSCEV != nullptr; } - errs()<<"has "<(accesses), SE.getSCEV(Addr)); //never needed -> alloc in stack - AffineAcc *A = &dim0; - - for (auto L = cloops.begin(); L != cloops.end(); ++L){ - if (loopReps.find(*L) == loopReps.end()) break; //this loop is malformed ==> this and all more outer loops cannot be used - const SCEV *Str; - if (Stride != strides.end()) Str = *(Stride++); - else Str = SE.getConstant(IntegerType::getInt32Ty((*L)->getHeader()->getContext()), 0U); //if we run out of strides we can still promote with stride=0 - - A = promoteAccess(*A, *L, Str); - errs()<<"\n"; - if (A){ - errs()<<"found AffineAcc:\n"; A->dump(); - this->accesses.push_back(A); - }else{ - break; //did not manage to promote ==> cannot promote for loops further out - } - } - errs()<<"we now have "<accesses.size()<<" affine accesses\n"; - return; +const SCEV *LoopRep::getSCEV() const { + assert(isAvailable() && "SCEV available"); //not necessary, but forces good practice + return RepSCEV; } -std::pair AffineAccess::splitLoadStore(const AffineAcc *Acc) const{ - unsigned nLoad = Acc->getNLoad(), nStore = Acc->getNStore(); - if (nLoad > 0U && nStore == 0U) return std::make_pair(Acc, nullptr); - if (nLoad == 0U && nStore > 0U) return std::make_pair(nullptr, Acc); - AffineAcc *L = new AffineAcc(*Acc); //copy - AffineAcc *S = new AffineAcc(*Acc); //copy - L->accesses.clear(); - S->accesses.clear(); - for (Instruction *I : Acc->getAccesses()){ - if (isa(I)) 
L->accesses.push_back(I); - else if(isa(I)) S->accesses.push_back(I); - } - return std::make_pair(L, S); +const SCEV *LoopRep::getSCEVPlusOne() const { + assert(isAvailable() && "SCEV available"); + return SE.getAddExpr(getSCEV(), SE.getConstant(getSCEV()->getType(), 1UL)); } -ArrayRef AffineAccess::getAccesses() const{ - ArrayRef *ar = new ArrayRef(accesses.begin(), accesses.end()); - return *ar; +bool LoopRep::isOnAllCFPathsOfParentIfExecuted() const { //FIXME: maybe cache this result once calculated? + assert(isAvailable() && "SCEV available"); + return isOnAllPredicatedControlFlowPaths(L->getHeader(), L->getParentLoop(), DT, getSCEVPlusOne(), SE); } -bool AffineAccess::accessPatternsMatch(const AffineAcc *A, const AffineAcc *B) const { - if (A->getDimension() != B->getDimension()) return false; - if (A->getLoop() != B->getLoop()) return false; - if (!SCEVEquals(A->data, B->data, SE)) return false; - for (unsigned i = 0; i < A->getDimension(); i++){ - if (!SCEVEquals(A->bounds[i], B->bounds[i], SE)) return false; - if (!SCEVEquals(A->strides[i], B->strides[i], SE)) return false; +bool LoopRep::isSafeToExpandBefore(const Loop *L) const { + assert(isAvailable() && "SCEV available"); + if (L == getLoop()) return true; + for (unsigned i = 0u; i < safeExpandBound; i++) { //FIXME: linear search -> use map instead + if (L == containingLoops[i]) return true; } - return true; + return false; } -bool AffineAccess::shareInsts(const AffineAcc *A, const AffineAcc *B) const{ - for (Instruction *IA : A->getAccesses()){ - for (Instruction *IB : B->getAccesses()){ - if (IA == IB) return true; - } +Value *LoopRep::expandAt(Type *ty, Instruction *InsertBefore){ + assert(ty); + if (Rep) { //FIXME: currently forces user to call first expand at a point that dominates all possible uses (improvement: could update expand point using DT) + assert(ty == Rep->getType() && "was already expanded with same type"); + return Rep; } - return false; + InsertBefore = InsertBefore ? 
InsertBefore : L->getLoopPreheader()->getTerminator(); + assert(isSafeToExpandAt(RepSCEV, InsertBefore, SE) && "bound not expanable here"); + SCEVExpander ex(SE, L->getHeader()->getModule()->getDataLayout(), "rep"); + ex.setInsertPoint(InsertBefore); + return castToSize(ex.expandCodeFor(RepSCEV), ty, InsertBefore); } -bool AffineAccess::conflictWWWR(const AffineAcc *A, const AffineAcc *B) const { - assert(!shareInsts(A, B) && "these AffineAcc's share instructions ==> one of them should be filtered"); - unsigned nstA = A->getNStore(), nstB = B->getNStore(); - if (nstA == 0U && nstB == 0U) return false; //can intersect read streams - //at this point at least one of them is store - - //special case: no conflict, if - // - exactly one of them is a store - // - they have the same access pattern (AffineAccessAnalysis::accessPatternsMatch) - // - all loads dominate all stores in the loop (ie. read before write) - if ((nstA && !nstB) || (!nstA && nstB)){ - if (accessPatternsMatch(A, B)){ - const AffineAcc *S = nstA ? A : B; //store - const AffineAcc *L = nstA ? 
B : A; //load - bool check = true; - for (Instruction *IL : L->getAccesses()){ - for (Instruction *IS : S->getAccesses()){ - check = check && DT.dominates(IL, IS); - } - } - if (check) return false; - } +// ==== AffAcc ==== +AffAcc::AffAcc(ArrayRef accesses, const SCEV *Addr, MemoryUseOrDef *MA, ArrayRef contLoops, ScalarEvolution &SE) + : accesses(accesses.begin(), accesses.end()), MA(MA), SE(SE) +{ + baseAddresses.push_back(Addr); + steps.push_back((const SCEV *)nullptr); //there is no step for dim=0 + reps.push_back((LoopRep *)nullptr); //there is no rep for dim=0 + containingLoops.push_back((const Loop *)nullptr); //there is no loop for dim=0 + for (const Loop *L : contLoops) containingLoops.push_back(L); + findSteps(Addr, (const SCEV *)nullptr, 1u); + for (unsigned dim = 1; dim < containingLoops.size(); dim++){ + baseAddresses.push_back(SE.SplitIntoInitAndPostInc(containingLoops[dim], Addr).first); } - - if (A->getLoop() == B->getLoop() || A->getLoop()->contains(B->getLoop()) || B->getLoop()->contains(A->getLoop())) return true; - return false; } -bool AffineAccess::shareLoops(const AffineAcc *A, const AffineAcc *B) const { - return A->getLoop() == B->getLoop() || A->getLoop()->contains(B->getLoop()) || B->getLoop()->contains(A->getLoop()); +void AffAcc::findSteps(const SCEV *A, const SCEV *Factor, unsigned loop){ + assert(baseAddresses.size() == 1 && reps.size() == 1 && "we only know dim=0 so far"); + if (loop >= containingLoops.size() || !A) return; + switch (A->getSCEVType()) + { + //case SCEVTypes::scZeroExtend: FIXME: this is unsafe, right? 
+ case SCEVTypes::scSignExtend: + case SCEVTypes::scTruncate: + return findSteps(cast(A)->getOperand(0), Factor, loop); + case SCEVTypes::scAddExpr: { + const SCEV *L = cast(A)->getOperand(0); + const SCEV *R = cast(A)->getOperand(1); + bool l = SE.containsAddRecurrence(L); + bool r = SE.containsAddRecurrence(R); + if (l && !r) return findSteps(L, Factor, loop); + else if(!l && r) return findSteps(R, Factor, loop); + return; + } + case SCEVTypes::scMulExpr: { + const SCEV *L = cast(A)->getOperand(0); + const SCEV *R = cast(A)->getOperand(1); + bool l = SE.containsAddRecurrence(L); + bool r = SE.containsAddRecurrence(R); + if (l == r) return; + if (!l && r) std::swap(L, R); + assert(SE.containsAddRecurrence(L) && !SE.containsAddRecurrence(R)); + if (Factor) { + auto p = toSameType(Factor, R, SE, true); + if (!p.hasValue()) return; + Factor = SE.getMulExpr(p.getValue().first, p.getValue().second); + }else Factor = R; + return findSteps(L, Factor, loop); + } + case SCEVTypes::scAddRecExpr: { + const auto *S = cast(A); + const SCEV *Step; + if (S->getLoop() == containingLoops[loop]){ + auto p = toSameType(Factor, S->getStepRecurrence(SE), SE, true); + if (!p.hasValue()) return; + Step = SE.getMulExpr(p.getValue().first, p.getValue().second); + }else{ //A is loop-invariant to containingLoops[loop] + bool occursLater = false; //loop needs to occur later + for (unsigned i = loop+1; i < containingLoops.size(); i++) occursLater = occursLater || containingLoops[i] == S->getLoop(); + if (!occursLater) return; + Step = SE.getConstant(APInt(1u, 0UL, false)); + } + steps.push_back(Step); + return findSteps(S->getStart(), Factor, loop+1); + + } + default: + return; + } } -const SCEV *AffineAccess::wellFormedLoopBTCount(const Loop *L) const { - auto P = loopReps.find(L); - if (P == loopReps.end()) return nullptr; //loop not well-formed; - return P->getSecond(); +ArrayRef AffAcc::getAccesses() const { return ArrayRef (accesses.begin(), accesses.end()); } +bool AffAcc::isWrite() 
const { return isa(MA); } +unsigned AffAcc::getMaxDimension() const { return reps.size() - 1u; } +bool AffAcc::isWellFormed(unsigned dimension) const { return dimension <= getMaxDimension() && baseAddresses[0]; } +unsigned AffAcc::loopToDimension(const Loop *L) const { + assert(L); + for (unsigned d = 1u; d < containingLoops.size(); d++){ //FIXME: linear search -> improve with a map + if (containingLoops[d] == L) return d; + } + llvm_unreachable("The provided loop does not contain `this`!"); +} +bool AffAcc::canExpandBefore(const Loop *L) const { return isWellFormed(loopToDimension(L)); } +ConflictKind AffAcc::getConflictFor(const AffAcc *A, unsigned dimension) const { + auto r = conflicts.find(A); + if (r == conflicts.end() || r->getSecond().first > dimension) return ConflictKind::None; + return r->getSecond().second; +} +ConflictKind AffAcc::getConflictInLoop(const AffAcc *A, const Loop *L) const { + return getConflictFor(A, loopToDimension(L) - 1u); +} +const SCEV *AffAcc::getBaseAddr(unsigned dim) const { assert(dim < baseAddresses.size()); return baseAddresses[dim]; } +const SCEV *AffAcc::getStep(unsigned dim) const { assert(dim < steps.size()); return steps[dim]; } +const SCEV *AffAcc::getRep(unsigned dim) const { assert(dim < reps.size()); return reps[dim]->getSCEV(); } +const Loop *AffAcc::getLoop(unsigned dim) const { assert(dim < containingLoops.size()); return containingLoops[dim]; } +void AffAcc::dump() const { + errs()<<"Affine Access of \n"; + for (auto *I : accesses) errs()<<*I<<"\n"; + for (unsigned dim = 0u; dim <= getMaxDimension(); dim++){ + errs()<<"\tdim = "<getLoop()) return false; + bool possible = true; + Instruction *Point = LR->getLoop()->getLoopPreheader()->getTerminator(); + //check all current reps and steps + for (unsigned dim = 1u; possible && dim < newDim; dim++){ + possible = possible && isSafeToExpandAt(getStep(dim), Point, SE); + possible = possible && reps[dim]->isSafeToExpandBefore(LR->getLoop()); + } + //check rep and step of 
new dimension + possible &= steps.size() > newDim && isSafeToExpandAt(getStep(newDim), Point, SE); + possible &= LR->isSafeToExpandBefore(LR->getLoop()); + //check base address + possible &= isSafeToExpandAt(getBaseAddr(newDim), Point, SE); + if (!possible) return false; + + reps.push_back(LR); //changes getMaxDimension() + return true; } - -Value *AffineAccess::expandData(const AffineAcc *aa, Type *ty, Instruction *InsertBefore) const { - InsertBefore = InsertBefore ? InsertBefore : aa->L->getLoopPreheader()->getTerminator(); - assert(isSafeToExpandAt(aa->data, InsertBefore, SE) && "data not expanable here (note: only preheader guaranteed)"); - SCEVExpander ex(SE, aa->L->getHeader()->getModule()->getDataLayout(), "data"); +Value *AffAcc::expandBaseAddr(unsigned dimension, Type *ty, Instruction *InsertBefore){ + assert(isWellFormed(dimension)); + InsertBefore = InsertBefore ? InsertBefore : reps[dimension]->getLoop()->getLoopPreheader()->getTerminator(); + assert(isSafeToExpandAt(getBaseAddr(dimension), InsertBefore, SE) && "data not expanable here (note: only preheader guaranteed)"); + SCEVExpander ex(SE, reps[dimension]->getLoop()->getHeader()->getModule()->getDataLayout(), "addr"); ex.setInsertPoint(InsertBefore); - errs()<<"expandData: scev "<<*aa->data<<" has type: "<<*aa->data->getType()<<"\n"; - Value *data = ex.expandCodeFor(aa->data); - errs()<<"expandData: value "<<*data<<" has type: "<<*data->getType()<<"\n"; - return castToSize(data, ty, InsertBefore); + return castToSize(ex.expandCodeFor(getBaseAddr(dimension)), ty, InsertBefore); } - -Value *AffineAccess::expandBound(const AffineAcc *aa, unsigned i, Type *ty, Instruction *InsertBefore) const { - InsertBefore = InsertBefore ? 
InsertBefore : aa->L->getLoopPreheader()->getTerminator(); - assert(isSafeToExpandAt(aa->bounds[i], InsertBefore, SE) && "bound not expanable here (note: only preheader guaranteed)"); - SCEVExpander ex(SE, aa->L->getHeader()->getModule()->getDataLayout(), "bound"); +Value *AffAcc::expandStep(unsigned dimension, Type *ty, Instruction *InsertBefore){ + assert(isWellFormed(dimension) && dimension > 0u); + InsertBefore = InsertBefore ? InsertBefore : reps[dimension]->getLoop()->getLoopPreheader()->getTerminator(); + assert(isSafeToExpandAt(getStep(dimension), InsertBefore, SE) && "data not expanable here (note: only preheader guaranteed)"); + SCEVExpander ex(SE, reps[dimension]->getLoop()->getHeader()->getModule()->getDataLayout(), "addr"); ex.setInsertPoint(InsertBefore); - return castToSize(ex.expandCodeFor(aa->bounds[i]), ty, InsertBefore); + return castToSize(ex.expandCodeFor(getStep(dimension)), ty, InsertBefore); } - -Value *AffineAccess::expandStride(const AffineAcc *aa, unsigned i, Type *ty, Instruction *InsertBefore) const { - InsertBefore = InsertBefore ? InsertBefore : aa->L->getLoopPreheader()->getTerminator(); - assert(isSafeToExpandAt(aa->strides[i], InsertBefore, SE) && "bound not expanable here (note: only preheader guaranteed)"); - SCEVExpander ex(SE, aa->L->getHeader()->getModule()->getDataLayout(), "stride"); - ex.setInsertPoint(InsertBefore); - return castToSize(ex.expandCodeFor(aa->strides[i]), ty, InsertBefore); +Value *AffAcc::expandRep(unsigned dimension, Type *ty, Instruction *InsertBefore){ + assert(isWellFormed(dimension) && dimension > 0u); + InsertBefore = InsertBefore ? 
InsertBefore : reps[dimension]->getLoop()->getLoopPreheader()->getTerminator(); + assert(isSafeToExpandAt(getRep(dimension), InsertBefore, SE) && "data not expanable here (note: only preheader guaranteed)"); + return reps[dimension]->expandAt(ty, InsertBefore); } -//================== Affine Acces Analysis ================================================== +//================== Affine Access =========================================================== -AnalysisKey AffineAccessAnalysis::Key; +AffineAccess::AffineAccess(Function &F, ScalarEvolution &SE, DominatorTree &DT, LoopInfo &LI, MemorySSA &MSSA, AAResults &AA) + : SE(SE), DT(DT), LI(LI), MSSA(MSSA), AA(AA){ + for (const Loop *L : LI.getTopLevelLoops()){ + std::vector p; + analyze(L, p); + assert(p.empty()); + } -AffineAccess AffineAccessAnalysis::run(Function &F, FunctionAnalysisManager &FAM) { - - errs()<<"running AffineAccessAnalysis on "<(F); - DominatorTree &DT = FAM.getResult(F); - ScalarEvolution &SE = FAM.getResult(F); - //AAResults &AA = FAM.getResult(F); - - AffineAccess *A = new AffineAccess(SE, DT, LI); - - for (const Loop *L : LI.getLoopsInPreorder()){ - assert(L); - assert(!L->isInvalid()); - if (!A->wellFormedLoopBTCount(L)) continue; //loop not well-formed - for (BasicBlock *BB : L->getBlocks()){ - if (!isOnAllControlFlowPaths(BB, L, DT)) continue; - for (Instruction &I : *BB){ - Value *Addr; - if (LoadInst *Load = dyn_cast(&I)){ - Addr = Load->getPointerOperand(); - }else if (StoreInst *Store = dyn_cast(&I)){ - Addr = Store->getPointerOperand(); - }else{ - continue; //cannot do anything with this instruction +DenseSet AffineAccess::analyze(const Loop *Parent, std::vector &loopPath){ + LoopRep *ParentLR = new LoopRep(Parent, ArrayRef(loopPath), SE, DT); + reps.insert(std::make_pair(Parent, ParentLR)); //add Parent to LoopReps + loopPath.push_back(Parent); //add Parent to path + expandableAccesses.insert(std::make_pair(Parent, SmallVector())); + DenseSet all; + + for (const Loop *L : 
Parent->getSubLoops()){ + DenseSet accs = analyze(L, loopPath); + LoopRep *LR = reps.find(L)->second; //guaranteed to exist, no check needed + if (LR->isAvailable() && LR->isOnAllCFPathsOfParentIfExecuted()){ //L is well-formed and on all CF-paths if its rep is >0 at run-time + for (AffAcc *A : accs){ + all.insert(A); + if (ParentLR->isAvailable() && A->promote(ParentLR)){ + expandableAccesses.find(Parent)->getSecond().push_back(A); //guaranteed to exist } - errs()<<"run: "<(Addr))) continue; //if Addr is not instruction ==> constant, or sth else (==> leave for other passes to opt) - A->addAllAccesses(AddrIns, L); } } } - return std::move(*A); -} - -//================== Affine Acces Analysis Pass for opt ======================================= -PreservedAnalyses AffineAccessAnalysisPass::run(Function &F, FunctionAnalysisManager &FAM) { - AffineAccess AA = FAM.getResult(F); - for (const auto *A : AA.getAccesses()){ - A->dump(); + std::vector toAdd; + for (BasicBlock *BB : Parent->getBlocks()){ + for (Instruction &I : *BB){ + MemoryUseOrDef *MA = MSSA.getMemoryAccess(&I); + AffAcc *A; + if (MA && access.find(MA) == access.end()){ //no AffAcc for this memory access yet! 
+ if (isa(I)){ + A = new AffAcc(ArrayRef(&I), SE.getSCEV(cast(I).getPointerOperand()), MA, ArrayRef(loopPath), SE); + } else if (isa(I)) { + A = new AffAcc(ArrayRef(&I), SE.getSCEV(cast(I).getPointerOperand()), MA, ArrayRef(loopPath), SE); + } else { + //this is probably a call in the loop that modifies memory or sth like that + A = new AffAcc(ArrayRef(&I), nullptr, MA, ArrayRef(loopPath), SE); + } + access.insert(std::make_pair(MA, A)); + toAdd.push_back(A); + if (ParentLR->isAvailable()){ + bool onAllCFPaths = true; + for (Instruction *I : A->getAccesses()) onAllCFPaths &= isOnAllControlFlowPaths(I->getParent(), Parent, DT); + if (onAllCFPaths && A->promote(ParentLR)){ + expandableAccesses.find(Parent)->getSecond().push_back(A); //guaranteed to exist + } + } + } + } } - return PreservedAnalyses::all(); -} + for (AffAcc *A : toAdd){ + if (A->isWrite()) addConflictsForDef(A, Parent); + else addConflictsForUse(A, Parent); + all.insert(A); + } -/* -/// Fold over AddrSCEV -/// All AddRecSCEVs are dependent on L or loops contained in L (TODO: and on all paths?) -/// All steps in ADDRecSCEVs can be calculated in preheader of L -bool canFindStrides(ScalarEvolution &SE, const ArrayRef &loops, const SCEV *AddrSCEV, const SCEV *SetupAddrSCEV){ - errs()<<"finding strides in: "; AddrSCEV->dump(); - if (SCEVEquals(AddrSCEV, SetupAddrSCEV, SE)) return true; + assert(loopPath.back() == Parent); + loopPath.pop_back(); //remove Parent again - if (loops.empty()) { errs()<<"not enough loops\n"; return false; } //need at least one more loop here for SCEVAddRecvExpr - - if (const auto *AR = dyn_cast(AddrSCEV)){ - auto L = loops.begin(); - while (L != loops.end() && AR->getLoop() != *L) ++L; //do header comparison instead? 
- if (L == loops.end()) { errs()<<"loops of addRecExpr not found\n"; return false; } - - const SCEV *Stride = AR->getStepRecurrence(SE); - const SCEV *Rest = AR->getStart(); - if (isSafeToExpandAt(Stride, (*L)->getLoopPreheader()->getTerminator(), SE)) { //if we can expand stride at loop entry - errs()<<"can expand stride: "; Stride->dump(); - return canFindStrides(SE, ArrayRef(++L, loops.end()), Rest, SetupAddrSCEV); //check Rest recursively + return std::move(all); +} + +///we can prefetch a use before the loop iff its MemoryUse only depends on MemoryDefs that dominate the loop +///this adds conflicts between A and all MemoryDefs that stand in the way of that +void AffineAccess::addConflictsForUse(AffAcc *A, const Loop *L){ + Value *AAddr = getAddress(A->getAccesses()[0]); + auto *W = MSSA.getSkipSelfWalker(); + std::deque worklist; + worklist.push_back(W->getClobberingMemoryAccess(A->getMemoryAccess())); + while (!worklist.empty()){ + MemoryAccess *C = worklist.front(); worklist.pop_front(); + if (!C) continue; + if (isa(C)){ + MemoryDef *MA = cast(C); + Value *MAAddr = getAddress(MA->getMemoryInst()); + if (L->contains(cast(MA)->getMemoryInst()) && (!AAddr || !MAAddr || AA.alias(AAddr, MAAddr))) { //we have a conflict inside loop + auto p = access.find(cast(MA)); + assert(p != access.end() && "by this point all accesses in L should have an AffAcc!"); + AffAcc *O = p->second; + //FIXME: should only consider cf-paths where the reps are > 0? + if (!A->isWellFormed(A->loopToDimension(L)) || !O->isWellFormed(O->loopToDimension(L))){ + addConflict(A, O, L, ConflictKind::Bad); //not well formed ==> cannot generate intersection checks + }else if (!MSSA.dominates(A->getMemoryAccess(), MA)){ //O might happen before A! 
+ addConflict(A, O, L, ConflictKind::MustNotIntersect); //RaW + } else { //A always happens before O + bool sameBaseAddrSCEV = SCEVEquals(A->getBaseAddr(A->loopToDimension(L)), O->getBaseAddr(O->loopToDimension(L)), SE); + if (accessPatternsMatch(A, O, L)){ + if (!sameBaseAddrSCEV){ + //TODO: use baseAddrSCEV to catch cases where they are for sure not the same + addConflict(A, O, L, ConflictKind::MustBeSame); //WaR + } + } else { + //if (sameBaseAddrSCEV) addConflict(A, O, L, ConflictKind::Bad); // this might not hold but might be useful + addConflict(A, O, L, ConflictKind::MustNotIntersect); //WaR + } + } + worklist.push_back(W->getClobberingMemoryAccess(C)); + //aliasing is transitive and once an memory def is before loop it will not depend on other defs inside loop + //so we only add more defs inside the `if` + } + } else if (isa(C)){ + MemoryPhi *P = cast(C); + for (unsigned i = 0u; i < P->getNumOperands(); i++){ + worklist.push_back(P->getOperand(i)); //this adds MemoryDefs that do not alias, but will be be removed when pop-ed + } } } - return false; } -*/ - -/* -/// can promote if: - (1) parent loop Outer satisfies checkLoop - (2) child loop Inner is on all paths in Outer where Inner.backedgetakencount +1 > 0 - (3) Stride for Outer can be found - (4) forall bd : aa.bounds. SE.isLoopInvariant(bd, Outer) && isSafeToExpandAt(bd, OuterPreheader->getTerminator(), SE) - (5) forall st : aa.strides. 
SE.isLoopInvariant(st, Outer) && isSafeToExpandAt(st, OuterPreheader->getTerminator(), SE) -bool promoteAffineAccess(AffineAcc &aa, ScalarEvolution &SE, DominatorTree &DT, DenseMap &LR){ - const Loop *Inner = aa.getLoop(); - const Loop *Outer = Inner->getParentLoop(); - const auto &R = LR.find_as(Outer); - if (R == LR.end()) return false; //Outer violates (1) - const SCEV *Bound = R->getSecond(); - BasicBlock *OuterPreheader = Outer->getLoopPreheader(); - BasicBlock *InnerPreheader = Inner->getLoopPreheader(); - const SCEV *Rep = SE.getAddExpr(SE.getBackedgeTakenCount(Inner), SE.getConstant(APInt(64U, 1U))); //trip count of Inner loop - if (!isOnAllPredicatedControlFlowPaths(InnerPreheader, Outer, DT, Rep, SE)) return false; //violates (2) + +//we can delay a store up to after the loop if it is not redefined or used in the loop anymore +void AffineAccess::addConflictsForDef(AffAcc *A, const Loop *L){ } -*/ - -/* -AffineAccess &runOnFunction(Function &F, LoopInfo &LI, DominatorTree &DT, ScalarEvolution &SE, AAResults &AA){ - AffineAccess *aa = new AffineAccess(SE, DT, LI); - auto loops = LI.getLoopsInPreorder(); - errs()<<"contains "<canExpandBefore(L) || !B->canExpandBefore(L)) { + kind = ConflictKind::Bad; + } + A->addConflictInLoop(B, L, kind); + B->addConflictInLoop(A, L, kind); +} - //loops contained in this are guatanteed to have passed checkLoop - DenseMap loopReps; +bool AffineAccess::accessPatternsMatch(const AffAcc *A, const AffAcc *B, const Loop *L) const { + unsigned dimA = A->loopToDimension(L); + unsigned dimB = B->loopToDimension(L); + if (dimA != dimB) return false; + for (unsigned i = 1u; i <= dimA; i++){ + if (A->getLoop(i) != B->getLoop(i)) return false; + if (!SCEVEquals(A->getRep(i), B->getRep(i), SE)) return false; + if (!SCEVEquals(A->getStep(i), B->getStep(i), SE)) return false; + } + return true; +} - for (const Loop *L : loops){ - errs()<<"LOOP:\n"; - L->dump(); +ScalarEvolution &AffineAccess::getSE() const { return this->SE; } - if 
(!L->getLoopPreheader()) continue; - if (!checkLoop(L, DT, SE, L->getLoopPreheader()->getTerminator())) continue; +DominatorTree &AffineAccess::getDT()const { return this->DT; } - loopReps.insert(std::make_pair(L, SE.getBackedgeTakenCount(L))); +LoopInfo &AffineAccess::getLI() const { return this->LI; } - for (const auto &BB : L->getBlocks()){ - - if (!isOnAllControlFlowPaths(BB, L, DT)) continue; +MemorySSA &AffineAccess::getMSSA() const { return this->MSSA; } - for (auto &I : *BB){ - Value *Addr; - if (LoadInst *Load = dyn_cast(&I)){ - Addr = Load->getPointerOperand(); - }else if (StoreInst *Store = dyn_cast(&I)){ - Addr = Store->getPointerOperand(); - }else{ - continue; //cannot do anything with this instruction - } - Instruction *AddrIns; - if (!(AddrIns = dyn_cast(Addr))) continue; //if Addr is not instruction ==> constant, or sth else (==> leave for other passes to opt) - errs()<<"looking at: "; AddrIns->dump(); - - aa->addAllAccesses(AddrIns); - - //Address SCEV - const SCEV *AddrSCEV = SE.getSCEV(Addr); - if (!SE.hasComputableLoopEvolution(AddrSCEV, L)) continue; - errs()<<"has computable loop evolution: "; AddrSCEV->dump(); - - //Base Pointer (=data) SCEV - auto split = SE.SplitIntoInitAndPostInc(L, AddrSCEV); - const SCEV *SetupAddrSCEV = split.first; //const SCEV *PostIncSCEV = split.second; - if (!isSafeToExpandAt(SetupAddrSCEV, L->getLoopPreheader()->getTerminator(), SE)) continue; - errs()<<"can expand setup addr scev in preheader: "; SetupAddrSCEV->dump(); - - //Stride Check - if (!canFindStride(L, AddrSCEV, SetupAddrSCEV, SE)) continue; - errs()<<"can find loop Stride: "; AddrSCEV->dump(); - - std::vector accesses; - for (auto U = Addr->use_begin(); U != Addr->use_end(); ++U){ - Instruction *Acc = dyn_cast(U->getUser()); - if (!Acc) Acc = dyn_cast(U->getUser()); - - if (!Acc) continue; //both casts failed ==> not a suitable instruction - if (!isOnAllControlFlowPaths(Acc->getParent(), L, DT)) continue; //access does not occur consitently in loop ==> 
not suitable - - accesses.push_back(Acc); - } +AAResults &AffineAccess::getAA() const { return this->AA; } - const SCEV *TC = loopReps.find(L)->getSecond(); - const auto *AddrRecSCEV = cast(AddrSCEV); - const SCEV *Str; - if (AddrRecSCEV->getLoop() == L){ - Str = cast(AddrSCEV)->getStepRecurrence(SE); //because 1D for now - }else{ - Str = SE.getConstant(APInt(64U, 0U)); - } +ArrayRef AffineAccess::getLoopsInPreorder() const { return this->LI.getLoopsInPreorder(); } - aa->addAccess(new AffineAcc(L, AddrIns, ArrayRef(accesses), SetupAddrSCEV, TC, Str)); - errs()<<"added new AffineAcc\n"; +Value *AffineAccess::getAddress(Instruction *I) { + if (auto *L = dyn_cast(I)) return L->getPointerOperand(); + if (auto *S = dyn_cast(I)) return S->getPointerOperand(); + return nullptr; +} - //TODO: dimension promotion: if preheader has only one predecessor -> if cond for "skipping loop" is bt+1 == 0 -> if parent loop passes checks -> promote - } - } +ArrayRef AffineAccess::getExpandableAccesses(const Loop *L) const { + return ArrayRef(expandableAccesses.find(L)->getSecond()); +} - } - - return *aa; +const AffAcc *AffineAccess::getAccess(Instruction *I) const { + MemoryUseOrDef *MA = MSSA.getMemoryAccess(I); + if (!MA) return nullptr; + auto p = access.find(MA); + if (p == access.end()) return nullptr; + return p->second; } -*/ +//================== Affine Access Analysis ================================================== -/* -errs()<<"finding stride in: "; CS->dump(); - const SCEV *Stride; - if (const auto *Rec = dyn_cast(CS)){ - if (Rec->getLoop() == *L) { - CS = Rec->getStart(); - Stride = Rec->getStepRecurrence(SE); - }else{ - bool occurs = false; - for (auto L_ = L; L_ != cloops.end(); ++L_) occurs = occurs && Rec->getLoop() == *L_; - if (!occurs) break; //AddRecExpr references a loop that is not a containing loop ==> cannot guarantee anything - Stride = SE.getConstant(APInt(64U, 0U)); //addrSCEV does not step in this loop ==> stride is 0 - } - }else{ - break; //did not 
manage to compute stride - } - assert(Stride); - errs()<<"found stride: "; Stride->dump(); -*/ +AnalysisKey AffineAccessAnalysis::Key; -/* -Optional> toSameSize(const SCEV *LHS, const SCEV *RHS, ScalarEvolution &SE, bool unsafe = false){ - assert(LHS && RHS); - errs()<<"toSameSize: LHS="<<*LHS<<" with type"<<*LHS->getType()<<"\n"; - errs()<<"toSameSize: RHS="<<*RHS<<" with type"<<*RHS->getType()<<"\n"; - using PT = std::pair; - if (LHS->getType() == RHS->getType()) return Optional(std::make_pair(LHS, RHS)); //trivially the same size - if (LHS->getType()->isPointerTy() && RHS->getType()->isPointerTy()) return Optional(std::make_pair(LHS, RHS)); - if (!LHS->getType()->isSized() || !RHS->getType()->isSized()) return None; - //TODO: use datalayout for size instead - if (LHS->getType()->getIntegerBitWidth() > RHS->getType()->getIntegerBitWidth()) { - if (auto LHSx = dyn_cast(LHS)){ - if (LHSx->getAPInt().getActiveBits() <= RHS->getType()->getIntegerBitWidth()) {} - return Optional(std::make_pair(SE.getConstant(RHS->getType(), LHSx->getAPInt().getLimitedValue()), RHS)); - } - if (auto RHSx = dyn_cast(RHS)){ - if (RHSx->getAPInt().getActiveBits() <= LHS->getType()->getIntegerBitWidth()) - return Optional(std::make_pair(LHS, SE.getConstant(LHS->getType(), RHSx->getAPInt().getLimitedValue()))); +AffineAccess AffineAccessAnalysis::run(Function &F, FunctionAnalysisManager &FAM) { + + errs()<<"running AffineAccessAnalysis on "<(F); + DominatorTree &DT = FAM.getResult(F); + ScalarEvolution &SE = FAM.getResult(F); + auto &MSSAA = FAM.getResult(F); + MemorySSA &MSSA = MSSAA.getMSSA(); + AAResults &AA = FAM.getResult(F); + //DependenceInfo &DI = FAM.getResult(F); + + return AffineAccess(F, SE, DT, LI, MSSA, AA); +} + +//================== Affine Acces Analysis Pass for opt ======================================= +PreservedAnalyses AffineAccessAnalysisPass::run(Function &F, FunctionAnalysisManager &FAM) { + AffineAccess AA = FAM.getResult(F); + for (const Loop *L : 
AA.getLI().getLoopsInPreorder()){ + L->dump(); + for (const AffAcc *A : AA.getAccesses(L)){ + A->dump(); } - if (auto LHSx = dyn_cast(LHS)) return toSameSize(LHSx->getOperand(0), RHS, SE); - if (auto LHSx = dyn_cast(LHS)) return toSameSize(LHSx->getOperand(0), RHS, SE); - if (auto RHSx = dyn_cast(RHS)) return toSameSize(LHS, RHSx->getOperand(0), SE); - if (unsafe) return Optional(std::make_pair(SE.getTruncateExpr(LHS, RHS->getType()), RHS)); - return None; - }else if (LHS->getType()->getIntegerBitWidth() < RHS->getType()->getIntegerBitWidth()){ - auto p = toSameSize(RHS, LHS, SE, unsafe); - if (!p.hasValue()) return None; - return Optional(std::make_pair(p.getValue().second, p.getValue().first)); } - return Optional(std::make_pair(LHS, RHS)); + return PreservedAnalyses::all(); } -*/ \ No newline at end of file From 116dfe846bdec7adb643dd38d8a2dd28dc51b959 Mon Sep 17 00:00:00 2001 From: thrupf Date: Sat, 4 Jun 2022 23:25:43 +0200 Subject: [PATCH 30/47] working compilation wit mem conflicts --- .../llvm/Analysis/AffineAccessAnalysis.h | 76 +- llvm/lib/Analysis/AffineAccessAnalysis.cpp | 647 +++++++++++++----- llvm/lib/Transforms/SSR/SSRGeneration.cpp | 587 ++++++---------- 3 files changed, 752 insertions(+), 558 deletions(-) diff --git a/llvm/include/llvm/Analysis/AffineAccessAnalysis.h b/llvm/include/llvm/Analysis/AffineAccessAnalysis.h index a2c0c7ff099bd..b097d35bfbe49 100644 --- a/llvm/include/llvm/Analysis/AffineAccessAnalysis.h +++ b/llvm/include/llvm/Analysis/AffineAccessAnalysis.h @@ -19,6 +19,8 @@ class LoopInfo; class ScalarEvolution; class MemorySSA; class MemoryUseOrDef; +class MemoryDef; +struct ExpandedAffAcc; struct LoopRep{ private: @@ -42,9 +44,10 @@ struct LoopRep{ ///expands LoopRep::RepSCEV at InsertBefore (if nullptr in preheader of loop) Value *expandAt(Type *ty, Instruction *InsertBefore = (Instruction *)nullptr); + Value *expandLoopGuard(Instruction *InsertBefore = (Instruction *)nullptr); }; -enum ConflictKind {NoConflict = 0, 
MustNotIntersect, MustBeSame, Bad }; +enum AffAccConflict { NoConflict = 0, MustNotIntersect = 1, Bad = 10}; struct AffAcc{ private: @@ -55,7 +58,7 @@ struct AffAcc{ SmallVector steps; //steps per loop (0 if loop-inv) SmallVector reps; //loop reps SmallVector containingLoops; //from inner- to outermost - DenseMap> conflicts; //conflicts to other Affine Accesses and starting from which dimension + DenseMap> conflicts; void findSteps(const SCEV *A, const SCEV *Factor, unsigned loop); public: @@ -66,26 +69,61 @@ struct AffAcc{ bool isWrite() const; unsigned getMaxDimension() const; bool isWellFormed(unsigned dimension) const; + bool isWellFormed(const Loop *L) const; bool canExpandBefore(const Loop *L) const; void dump() const; + void dumpInLoop(const Loop *L) const; unsigned loopToDimension(const Loop *L) const; - ConflictKind getConflictFor(const AffAcc *A, unsigned dimension) const; - ConflictKind getConflictInLoop(const AffAcc *A, const Loop *L) const; const SCEV *getBaseAddr(unsigned dim) const; const SCEV *getStep(unsigned dim) const; const SCEV *getRep(unsigned dim) const; const Loop *getLoop(unsigned dim) const; + ArrayRef getContainingLoops() const; + AffAccConflict getConflict(AffAcc *A, const Loop *L) const; - MemoryAccess *getMemoryAccess(); - ///finds all MemoryDefs that clobber this access's memory that prevent it from being prefetched before the loop - ArrayRef getAllClobberingFor(const Loop *L); - void addConflict(const AffAcc *A, unsigned startDimension, ConflictKind kind); - void addConflictInLoop(const AffAcc *A, const Loop *StartLoop, ConflictKind kind); + MemoryUseOrDef *getMemoryAccess(); + void addConflict(AffAcc *A, const Loop *StartL, AffAccConflict kind); bool promote(LoopRep *LR); ///does not check whether it is on all CF-paths for LR->getLoop() ///code gen: Value *expandBaseAddr(unsigned dimension, Type *ty, Instruction *InsertBefore = (Instruction *)nullptr); Value *expandStep(unsigned dimension, Type *ty, Instruction *InsertBefore = 
(Instruction *)nullptr); Value *expandRep(unsigned dimension, Type *ty, Instruction *InsertBefore = (Instruction *)nullptr); + ExpandedAffAcc expandAt(const Loop *L, Instruction *Point, Type *PtrTy, IntegerType *ParamTy, IntegerType *AgParamTy); +}; + +struct MemDep { +private: + DenseMap> clobbers; + DenseMap> clobberUsers; + MemorySSA &MSSA; + AAResults &AA; + bool alias(Value *A, Value *B); + bool alias(MemoryUseOrDef *A, MemoryUseOrDef *B); + +public: + MemDep(MemorySSA &MSSA, AAResults &AA) : MSSA(MSSA), AA(AA) {} + const DenseSet &findClobbers(MemoryUseOrDef *MA); + std::vector findClobbersInLoop(MemoryUseOrDef *MA, const Loop *L); + const DenseSet &findClobberUsers(MemoryDef *MA); + std::vector findClobberUsersInLoop(MemoryDef *MA, const Loop *L); +}; + +struct ExpandedAffAcc { +public: + AffAcc *const Access; + Value *const Addr; + const SmallVector Steps; + const SmallVector Reps; + const SmallVector Ranges; + const SmallVector PrefixSumRanges; + Value *const LowerBound; + Value *const UpperBound; + unsigned getDimension() const { return Steps.size(); } + ExpandedAffAcc (AffAcc *A, Value *Addr, ArrayRef Steps, ArrayRef Reps, + ArrayRef Ranges, ArrayRef PSRanges, Value *LowerBound, Value *UpperBound) + : Access(A), Addr(Addr), Steps(Steps.begin(), Steps.end()), Reps(Reps.begin(), Reps.end()), + Ranges(Ranges.begin(), Ranges.end()), PrefixSumRanges(PSRanges.begin(), PSRanges.end()), + LowerBound(LowerBound), UpperBound(UpperBound) { } }; class AffineAccess{ @@ -95,29 +133,31 @@ class AffineAccess{ LoopInfo &LI; MemorySSA &MSSA; AAResults &AA; + MemDep MD; DenseMap access; DenseMap reps; - DenseMap> expandableAccesses; + DenseMap> wellformedAccesses; + DenseMap> expandableAccesses; - DenseSet analyze(const Loop *Parent, std::vector &loopPath); - void addConflictsForUse(AffAcc *A, const Loop *L); - void addConflictsForDef(AffAcc *A, const Loop *L); - void addConflict(AffAcc *A, AffAcc *B, const Loop *L, ConflictKind kind); + std::vector analyze(const Loop 
*Parent, ArrayRef loopPath); + void addAllConflicts(const std::vector &all); + AffAccConflict getRWConflict(AffAcc *Read, AffAcc *Write, const Loop *L); public: AffineAccess(Function &F, ScalarEvolution &SE, DominatorTree &DT, LoopInfo &LI, MemorySSA &MSSA, AAResults &AA); AffineAccess() = delete; bool accessPatternsMatch(const AffAcc *A, const AffAcc *B, const Loop *L) const; + bool accessPatternsAndAddressesMatch(const AffAcc *A, const AffAcc *B, const Loop *L) const; ScalarEvolution &getSE() const; DominatorTree &getDT() const; LoopInfo &getLI() const; MemorySSA &getMSSA() const; AAResults &getAA() const; - ArrayRef getLoopsInPreorder() const; - ArrayRef getExpandableAccesses(const Loop *L) const; - const AffAcc *getAccess(Instruction *I) const; + SmallVector getLoopsInPreorder() const; - static Value *getAddress(Instruction *I); + ArrayRef getExpandableAccesses(const Loop *L); + std::vector expandAllAt(ArrayRef Accs, const Loop *L, Instruction *Point, + Value *&BoundCheck, Type *PtrTy, IntegerType *ParamTy, IntegerType *AgParamTy); }; class AffineAccessAnalysis : public AnalysisInfoMixin { diff --git a/llvm/lib/Analysis/AffineAccessAnalysis.cpp b/llvm/lib/Analysis/AffineAccessAnalysis.cpp index e2f255a16096e..15d51633e0f8d 100644 --- a/llvm/lib/Analysis/AffineAccessAnalysis.cpp +++ b/llvm/lib/Analysis/AffineAccessAnalysis.cpp @@ -29,9 +29,11 @@ #include "llvm/Analysis/DependenceAnalysis.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/FormatVariadic.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Support/Casting.h" +#include "llvm/Support/Compiler.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/TinyPtrVector.h" @@ -55,23 +57,23 @@ namespace { /// L has 1 preheader and 1 dedicated exit /// L has 1 backedge and 1 exiting block /// bt SCEV can be expanded to instructions at insertionsPoint -bool checkLoop(const Loop *L, DominatorTree &DT, ScalarEvolution &SE){ - if (!L->isLCSSAForm(DT) +const SCEV 
*getLoopBTSCEV(const Loop *L, DominatorTree &DT, ScalarEvolution &SE){ + if (!L->isLCSSAForm(DT) || !L->getLoopPreheader() || !L->getExitingBlock() || !L->getExitBlock() || !L->hasDedicatedExits() || L->getNumBackEdges() != 1) { - return false; + return nullptr; } if (!SE.hasLoopInvariantBackedgeTakenCount(L)){ - return false; + return nullptr; } const SCEV *bt = SE.getBackedgeTakenCount(L); - if(!isa(bt) || !SE.isAvailableAtLoopEntry(bt, L)){ - return false; + if(!bt || isa(bt) || !SE.isAvailableAtLoopEntry(bt, L)){ + return nullptr; } - return true; + return bt; } Optional> toSameType(const SCEV *LHS, const SCEV *RHS, ScalarEvolution &SE, bool unsafe = false){ @@ -251,20 +253,82 @@ Value *castToSize(Value *R, Type *ty, Instruction *InsPoint){ return builder.CreateBitOrPointerCast(R, ty, "scev.cast"); } +Value *getAddress(MemoryUseOrDef *MA) { + assert(MA && "called getAddress on nullptr"); + assert(MA->getMemoryInst()); + Instruction *I = MA->getMemoryInst(); + if (auto *L = dyn_cast(I)) return L->getPointerOperand(); + if (auto *S = dyn_cast(I)) return S->getPointerOperand(); + return nullptr; +} + +const Loop *findFirstContaining(ArrayRef loops, BasicBlock *BB){ + for (const Loop *L : loops) { + if (L && L->contains(BB)) { + return L; + } + } + return nullptr; +} + +Value *goodAnd(IRBuilder<> &builder, ArrayRef bools){ + assert(!bools.empty()); + std::vector b1, b2; + for (Value *b : bools) b1.push_back(b); + while (b1.size() > 1u) { + unsigned i = 0u; + for (; i+1 < b1.size(); i += 2) { + b2.push_back(builder.CreateAnd(b1[i], b1[i+1], "and.tree")); + } + if (i < b1.size()) b2.push_back(b1[i]); //add last element if odd nr in b1 + std::swap(b1, b2); + b2.clear(); + } + return b1[0]; //return the last value +} + +bool hasMemInst(MemoryUseOrDef *MA) { return MA && MA->getMemoryInst(); } + +struct SCEVUknownSetFinder { + DenseSet values; + // return true to follow this node. 
+ bool follow(const SCEV *S) { + if (S->getSCEVType() == SCEVTypes::scUnknown) { + values.insert(cast(S)->getValue()); + } + return true; //always true + } + // return true to terminate the search. + bool isDone() { return false; /*continue forever*/ } +}; + +bool shareValues(const SCEV *A, const SCEV *B) { + SCEVUknownSetFinder finderA; + SCEVTraversal trA(finderA); + trA.visitAll(A); + SCEVUknownSetFinder finderB; + SCEVTraversal trB(finderB); + trB.visitAll(B); + bool shareValues = false; + for (Value *V : finderA.values) { + for (Value *W : finderB.values) { + shareValues |= V == W; + } + } + return shareValues; +} + } //end of namespace //================== =========================================================== // ==== LoopRep ==== LoopRep::LoopRep(const Loop *L, ArrayRef contLoops, ScalarEvolution &SE, DominatorTree &DT) - : L(L), containingLoops(contLoops.begin(), contLoops.end()), SE(SE), DT(DT), safeExpandBound(0u) + : SE(SE), DT(DT), L(L), containingLoops(contLoops.begin(), contLoops.end()), safeExpandBound(0u) { - if (checkLoop(L, DT, SE)){ - const SCEV *R = SE.getBackedgeTakenCount(L); - RepSCEV = isa(R) ? 
nullptr : R; - }else{ - RepSCEV = nullptr; - } + RepSCEV = getLoopBTSCEV(L, DT, SE); + if (RepSCEV) errs()<<"new LoopRep with rep scev: "<<*RepSCEV<<"\n"; + else errs()<<"new LoopRep with rep scev: \n"; if (RepSCEV){ while (safeExpandBound < containingLoops.size() @@ -274,9 +338,7 @@ LoopRep::LoopRep(const Loop *L, ArrayRef contLoops, ScalarEvolutio } bool LoopRep::isAvailable() const { return RepSCEV != nullptr; } - const Loop *LoopRep::getLoop() const { return L; } - const SCEV *LoopRep::getSCEV() const { assert(isAvailable() && "SCEV available"); //not necessary, but forces good practice return RepSCEV; @@ -303,35 +365,54 @@ bool LoopRep::isSafeToExpandBefore(const Loop *L) const { Value *LoopRep::expandAt(Type *ty, Instruction *InsertBefore){ assert(ty); + assert(RepSCEV); if (Rep) { //FIXME: currently forces user to call first expand at a point that dominates all possible uses (improvement: could update expand point using DT) assert(ty == Rep->getType() && "was already expanded with same type"); return Rep; } InsertBefore = InsertBefore ? InsertBefore : L->getLoopPreheader()->getTerminator(); - assert(isSafeToExpandAt(RepSCEV, InsertBefore, SE) && "bound not expanable here"); + const SCEV *RepPlusOne = getSCEVPlusOne(); + assert(isSafeToExpandAt(RepPlusOne, InsertBefore, SE) && "bound not expanable here"); SCEVExpander ex(SE, L->getHeader()->getModule()->getDataLayout(), "rep"); ex.setInsertPoint(InsertBefore); - return castToSize(ex.expandCodeFor(RepSCEV), ty, InsertBefore); + Rep = castToSize(ex.expandCodeFor(RepPlusOne), ty, InsertBefore); + return Rep; +} + +Value *LoopRep::expandLoopGuard(Instruction *InsertBefore) { + assert(Rep && "expandAt has to be called before this"); + InsertBefore = InsertBefore ? 
InsertBefore : L->getLoopPreheader()->getTerminator(); + IRBuilder<> builder(InsertBefore); + return builder.CreateICmpSGT(Rep, ConstantInt::get(Rep->getType(), 0u, true)); //FIXME: this only works for unsigned Rep's that are < 2^30 (for i32) } // ==== AffAcc ==== AffAcc::AffAcc(ArrayRef accesses, const SCEV *Addr, MemoryUseOrDef *MA, ArrayRef contLoops, ScalarEvolution &SE) - : accesses(accesses.begin(), accesses.end()), MA(MA), SE(SE) + : SE(SE), MA(MA), accesses(accesses.begin(), accesses.end()) { + assert(!accesses.empty()); + assert(MA); baseAddresses.push_back(Addr); steps.push_back((const SCEV *)nullptr); //there is no step for dim=0 reps.push_back((LoopRep *)nullptr); //there is no rep for dim=0 containingLoops.push_back((const Loop *)nullptr); //there is no loop for dim=0 - for (const Loop *L : contLoops) containingLoops.push_back(L); - findSteps(Addr, (const SCEV *)nullptr, 1u); + containingLoops.append(contLoops.begin(), contLoops.end()); + if (!Addr) return; //do not look for steps or addresses if SCEV of address is unknown + findSteps(Addr, (const SCEV *)nullptr, 1u); //find steps for (unsigned dim = 1; dim < containingLoops.size(); dim++){ - baseAddresses.push_back(SE.SplitIntoInitAndPostInc(containingLoops[dim], Addr).first); + Addr = SE.SplitIntoInitAndPostInc(containingLoops[dim], Addr).first; + baseAddresses.push_back(Addr); } } void AffAcc::findSteps(const SCEV *A, const SCEV *Factor, unsigned loop){ + assert(A); assert(baseAddresses.size() == 1 && reps.size() == 1 && "we only know dim=0 so far"); - if (loop >= containingLoops.size() || !A) return; + if (loop >= containingLoops.size()) return; + if (!SE.containsAddRecurrence(A) && loop < containingLoops.size()){ //A is inv to the rest of the loops + steps.push_back(SE.getConstant(Type::getInt64Ty(this->accesses[0]->getContext()), 0U)); + findSteps(A, Factor, loop + 1u); + } switch (A->getSCEVType()) { //case SCEVTypes::scZeroExtend: FIXME: this is unsafe, right? 
@@ -366,67 +447,113 @@ void AffAcc::findSteps(const SCEV *A, const SCEV *Factor, unsigned loop){ const auto *S = cast(A); const SCEV *Step; if (S->getLoop() == containingLoops[loop]){ - auto p = toSameType(Factor, S->getStepRecurrence(SE), SE, true); - if (!p.hasValue()) return; - Step = SE.getMulExpr(p.getValue().first, p.getValue().second); + Step = S->getStepRecurrence(SE); + if (Factor) { + auto p = toSameType(Factor, Step, SE, true); + if (!p.hasValue()) return; + Step = SE.getMulExpr(p.getValue().first, p.getValue().second); + } + steps.push_back(Step); + return findSteps(S->getStart(), Factor, loop+1); }else{ //A is loop-invariant to containingLoops[loop] bool occursLater = false; //loop needs to occur later for (unsigned i = loop+1; i < containingLoops.size(); i++) occursLater = occursLater || containingLoops[i] == S->getLoop(); if (!occursLater) return; - Step = SE.getConstant(APInt(1u, 0UL, false)); - } - steps.push_back(Step); - return findSteps(S->getStart(), Factor, loop+1); - + steps.push_back(SE.getConstant(Type::getInt64Ty(this->accesses[0]->getContext()), 0U)); + return findSteps(S, Factor, loop+1); + } } default: return; } } -ArrayRef AffAcc::getAccesses() const { return ArrayRef (accesses.begin(), accesses.end()); } +ArrayRef AffAcc::getAccesses() const { return accesses; } bool AffAcc::isWrite() const { return isa(MA); } unsigned AffAcc::getMaxDimension() const { return reps.size() - 1u; } bool AffAcc::isWellFormed(unsigned dimension) const { return dimension <= getMaxDimension() && baseAddresses[0]; } +bool AffAcc::isWellFormed(const Loop *L) const { return isWellFormed(loopToDimension(L)); } unsigned AffAcc::loopToDimension(const Loop *L) const { - assert(L); + assert(L && "L not nullptr"); for (unsigned d = 1u; d < containingLoops.size(); d++){ //FIXME: linear search -> improve with a map if (containingLoops[d] == L) return d; } - llvm_unreachable("The provided loop does not contain `this`!"); + assert(false && "The provided loop does not 
contain `this`!"); } bool AffAcc::canExpandBefore(const Loop *L) const { return isWellFormed(loopToDimension(L)); } -ConflictKind AffAcc::getConflictFor(const AffAcc *A, unsigned dimension) const { - auto r = conflicts.find(A); - if (r == conflicts.end() || r->getSecond().first > dimension) return ConflictKind::None; - return r->getSecond().second; -} -ConflictKind AffAcc::getConflictInLoop(const AffAcc *A, const Loop *L) const { - return getConflictFor(A, loopToDimension(L) - 1u); -} const SCEV *AffAcc::getBaseAddr(unsigned dim) const { assert(dim < baseAddresses.size()); return baseAddresses[dim]; } const SCEV *AffAcc::getStep(unsigned dim) const { assert(dim < steps.size()); return steps[dim]; } -const SCEV *AffAcc::getRep(unsigned dim) const { assert(dim < reps.size()); return reps[dim]->getSCEV(); } +const SCEV *AffAcc::getRep(unsigned dim) const { + assert(dim < reps.size()); + if (!reps[dim] || !reps[dim]->isAvailable()) return nullptr; + return reps[dim]->getSCEV(); +} const Loop *AffAcc::getLoop(unsigned dim) const { assert(dim < containingLoops.size()); return containingLoops[dim]; } -void AffAcc::dump() const { +ArrayRef AffAcc::getContainingLoops() const { return ArrayRef(containingLoops); } +void AffAcc::dumpInLoop(const Loop *L) const { errs()<<"Affine Access of \n"; + unsigned dimension; + if (L) dimension = loopToDimension(L); + else dimension = getMaxDimension(); for (auto *I : accesses) errs()<<*I<<"\n"; - for (unsigned dim = 0u; dim <= getMaxDimension(); dim++){ - errs()<<"\tdim = "<"; + errs()<<", rep = "; + if (r) errs()<<*r; + else errs()<<""; + errs()<<"\n"; + errs()<<"\taddress = "; + if (a) errs()<<*a; + else errs()<<""; + errs()<<" is well-formed = "<isWellFormed(dim)<<"\n"; + } +} +void AffAcc::dump() const { dumpInLoop(nullptr); } +AffAccConflict AffAcc::getConflict(AffAcc *A, const Loop *L) const { + auto p = conflicts.find(A); + if (p != conflicts.end()) { + const Loop *S = p->getSecond().first; + if (S == L || L->contains(S)) { //if 
start is L or more "inner" loop + if (!isWellFormed(L) || !A->isWellFormed(L)) return AffAccConflict::Bad; //if either is not well-formed "demote" the conflict to bad (but only if exists) + return p->getSecond().second; + } } + return AffAccConflict::NoConflict; } -MemoryAccess *AffAcc::getMemoryAccess() { return MA; } -void AffAcc::addConflict(const AffAcc *A, unsigned startDimension, ConflictKind kind){ - conflicts.insert(std::make_pair(A, std::make_pair(startDimension, kind))); -} -void AffAcc::addConflictInLoop(const AffAcc *A, const Loop *StartLoop, ConflictKind kind){ - return addConflict(A, loopToDimension(StartLoop) - 1u, kind); +MemoryUseOrDef *AffAcc::getMemoryAccess() { return MA; } +void AffAcc::addConflict(AffAcc *A, const Loop *StartL, AffAccConflict kind){ + assert(conflicts.find(A) == conflicts.end() && "no conflict for A yet"); + assert(kind == AffAccConflict::Bad || (isWellFormed(StartL) && A->isWellFormed(StartL))); + conflicts.insert(std::make_pair(A, std::make_pair(StartL, kind))); + errs()<<"conflict for:\n"; dumpInLoop(StartL); errs()<<"with:\n"; A->dumpInLoop(StartL); errs()<<"is ===========>"; + switch (kind) + { + case AffAccConflict::Bad: + errs()<<"Bad"; + break; + case AffAccConflict::MustNotIntersect: + errs()<<"MustNotIntersect"; + break; + case AffAccConflict::NoConflict: + errs()<<"NoConflict"; + break; + default: + break; + } + errs()<<"\n"; } bool AffAcc::promote(LoopRep *LR){ + if (!LR->isAvailable()) return false; unsigned newDim = getMaxDimension() + 1u; + if (!isWellFormed(getMaxDimension())) return false; if (getLoop(newDim) != LR->getLoop()) return false; + errs()<<"promote: (1) loops match, "; bool possible = true; Instruction *Point = LR->getLoop()->getLoopPreheader()->getTerminator(); //check all current reps and steps @@ -434,16 +561,21 @@ bool AffAcc::promote(LoopRep *LR){ possible = possible && isSafeToExpandAt(getStep(dim), Point, SE); possible = possible && reps[dim]->isSafeToExpandBefore(LR->getLoop()); } + if 
(possible) errs()<<"(2) current rep & step can be expanded, "; //check rep and step of new dimension possible &= steps.size() > newDim && isSafeToExpandAt(getStep(newDim), Point, SE); possible &= LR->isSafeToExpandBefore(LR->getLoop()); + if (possible) errs()<<"(3) new rep & step can be expanded, "; //check base address possible &= isSafeToExpandAt(getBaseAddr(newDim), Point, SE); + if (possible) errs()<<"(4) new base addr can be expanded"; + errs()<<"\n"; if (!possible) return false; reps.push_back(LR); //changes getMaxDimension() return true; } + Value *AffAcc::expandBaseAddr(unsigned dimension, Type *ty, Instruction *InsertBefore){ assert(isWellFormed(dimension)); InsertBefore = InsertBefore ? InsertBefore : reps[dimension]->getLoop()->getLoopPreheader()->getTerminator(); @@ -456,7 +588,7 @@ Value *AffAcc::expandStep(unsigned dimension, Type *ty, Instruction *InsertBefor assert(isWellFormed(dimension) && dimension > 0u); InsertBefore = InsertBefore ? InsertBefore : reps[dimension]->getLoop()->getLoopPreheader()->getTerminator(); assert(isSafeToExpandAt(getStep(dimension), InsertBefore, SE) && "data not expanable here (note: only preheader guaranteed)"); - SCEVExpander ex(SE, reps[dimension]->getLoop()->getHeader()->getModule()->getDataLayout(), "addr"); + SCEVExpander ex(SE, reps[dimension]->getLoop()->getHeader()->getModule()->getDataLayout(), "step"); ex.setInsertPoint(InsertBefore); return castToSize(ex.expandCodeFor(getStep(dimension)), ty, InsertBefore); } @@ -466,136 +598,271 @@ Value *AffAcc::expandRep(unsigned dimension, Type *ty, Instruction *InsertBefore assert(isSafeToExpandAt(getRep(dimension), InsertBefore, SE) && "data not expanable here (note: only preheader guaranteed)"); return reps[dimension]->expandAt(ty, InsertBefore); } +ExpandedAffAcc AffAcc::expandAt(const Loop *L, Instruction *Point, + Type *PtrTy, IntegerType *ParamTy, IntegerType *AgParamTy) +{ + if (!Point) Point = L->getLoopPreheader()->getTerminator(); + IRBuilder<> builder(Point); + 
assert(isWellFormed(L)); + std::vector reps, steps, ranges, prefixsum_ranges; + unsigned dim = loopToDimension(L); + Value *Addr = expandBaseAddr(dim, PtrTy, Point); + Value *psum = nullptr; + for (unsigned i = 1u; i <= dim; i++) { + reps.push_back(expandRep(i, ParamTy, Point)); + steps.push_back(expandStep(i, ParamTy, Point)); + Value *r = reps.back(); + Value *st = steps.back(); + ranges.push_back(builder.CreateMul(r, st, formatv("range.{0}d", i))); + if (psum) psum = builder.CreateAdd(psum, ranges.back(), formatv("prefsum.range.{0}d", i)); + else psum = ranges.back(); + prefixsum_ranges.push_back(psum); + } + Value *LowerBound = builder.CreatePtrToInt(Addr, AgParamTy, "lb"); + Value *r = builder.CreateZExtOrTrunc(prefixsum_ranges.back(), AgParamTy, "prefsum.cast"); + Value *UpperBound = builder.CreateAdd(LowerBound, r, "ub"); + ExpandedAffAcc Aexp(this, Addr, steps, reps, ranges, prefixsum_ranges, LowerBound, UpperBound); + return Aexp; +} + +// ================= MemDep ============== + +bool MemDep::alias(Value *A, Value *B) { return !A || !B || AA.alias(A, B) != AliasResult::NoAlias; } +bool MemDep::alias(MemoryUseOrDef *A, MemoryUseOrDef *B) { + if (!hasMemInst(A) || !hasMemInst(B)) return false; //the memoryUseOrDef does not correspond to an instruction => no problem + else return alias(getAddress(A), getAddress(B)); +} + +const DenseSet &MemDep::findClobbers(MemoryUseOrDef *MA){ + if (clobbers.find(MA) == clobbers.end()) { + clobbers.insert(std::make_pair(MA, std::move(DenseSet()))); + auto &res = clobbers.find(MA)->getSecond(); + std::deque worklist; + DenseSet vis; + worklist.push_back(MA->getDefiningAccess()); + while (!worklist.empty()) { + MemoryAccess *A = worklist.front(); worklist.pop_front(); + if (!A) continue; + if (vis.find(A) != vis.end()) continue; + if (A == MA) continue; + vis.insert(A); + if (MemoryDef *D = dyn_cast(A)) { + if (alias(D, MA)) { + auto &s = findClobbers(D); + res.insert(D); + for (auto *A : s) res.insert(A); + } else { + 
worklist.push_back(D); + } + } else { + MemoryPhi *P = cast(A); + for (unsigned i = 0u; i < P->getNumOperands(); i++) { + worklist.push_back(P->getOperand(i)); + } + } + } + } + return clobbers.find(MA)->getSecond(); +} + +std::vector MemDep::findClobbersInLoop(MemoryUseOrDef *MA, const Loop *L) { + auto &s = findClobbers(MA); + std::vector r; + for (auto *A : s) + if (L && L->contains(A->getBlock())) r.push_back(A); + return r; +} + +const DenseSet &MemDep::findClobberUsers(MemoryDef *MA) { + if (clobberUsers.find(MA) == clobberUsers.end()) { + clobberUsers.insert(std::make_pair(MA, std::move(DenseSet()))); + auto &res = clobberUsers.find(MA)->getSecond(); + std::deque worklist; + DenseSet vis; + for (auto U = MA->use_begin(); U != MA->use_end(); ++U) { + worklist.push_back(cast(U->getUser())); + } + while (!worklist.empty()){ + MemoryAccess *A = worklist.front(); worklist.pop_front(); + if (!A) continue; + if (vis.find(A) != vis.end()) continue; + vis.insert(A); + if (MemoryUse *U = dyn_cast(A)) { + if (alias(U, MA)) res.insert(U); + } else if (MemoryDef *D = dyn_cast(A)) { + if (alias(D, MA)) { + auto &s = findClobberUsers(D); + res.insert(D); + for (auto *A : s) res.insert(A); + } else { + worklist.push_back(D); + } + } else { + assert(isa(A)); + for (auto U = A->use_begin(); U != A->use_end(); ++U) { + worklist.push_back(cast(U->getUser())); + } + } + } + } + return clobberUsers.find(MA)->getSecond(); +} + +std::vector MemDep::findClobberUsersInLoop(MemoryDef *MA, const Loop *L) { + auto &s = findClobberUsers(MA); + std::vector r; + for (auto *A : s) + if (L && L->contains(A->getBlock())) r.push_back(A); + return r; +} //================== Affine Access =========================================================== AffineAccess::AffineAccess(Function &F, ScalarEvolution &SE, DominatorTree &DT, LoopInfo &LI, MemorySSA &MSSA, AAResults &AA) - : SE(SE), DT(DT), LI(LI), MSSA(MSSA), AA(AA){ + : SE(SE), DT(DT), LI(LI), MSSA(MSSA), AA(AA), MD(MSSA, AA){ for (const Loop 
*L : LI.getTopLevelLoops()){ - std::vector p; - analyze(L, p); - assert(p.empty()); + std::vector all = analyze(L, ArrayRef()); + addAllConflicts(all); } - } -DenseSet AffineAccess::analyze(const Loop *Parent, std::vector &loopPath){ - LoopRep *ParentLR = new LoopRep(Parent, ArrayRef(loopPath), SE, DT); +std::vector AffineAccess::analyze(const Loop *Parent, ArrayRef loopPath){ + errs()<<"analyze: loop : "<getHeader()->getNameOrAsOperand()<<"\n"; + LoopRep *ParentLR = new LoopRep(Parent, loopPath, SE, DT); reps.insert(std::make_pair(Parent, ParentLR)); //add Parent to LoopReps - loopPath.push_back(Parent); //add Parent to path - expandableAccesses.insert(std::make_pair(Parent, SmallVector())); - DenseSet all; + std::vector path; + path.push_back(Parent); //add Parent to path + for (auto *L : loopPath) path.push_back(L); + wellformedAccesses.insert(std::make_pair(Parent, SmallVector())); + std::vector all; for (const Loop *L : Parent->getSubLoops()){ - DenseSet accs = analyze(L, loopPath); + std::vector accs = analyze(L, ArrayRef(path)); LoopRep *LR = reps.find(L)->second; //guaranteed to exist, no check needed - if (LR->isAvailable() && LR->isOnAllCFPathsOfParentIfExecuted()){ //L is well-formed and on all CF-paths if its rep is >0 at run-time - for (AffAcc *A : accs){ - all.insert(A); - if (ParentLR->isAvailable() && A->promote(ParentLR)){ - expandableAccesses.find(Parent)->getSecond().push_back(A); //guaranteed to exist + bool canPromote = LR->isOnAllCFPathsOfParentIfExecuted() && ParentLR->isAvailable() && LR->isAvailable(); + for (AffAcc *A : accs){ + all.push_back(A); + if (canPromote){ //L is well-formed and on all CF-paths if its rep is >0 at run-time + auto &l = wellformedAccesses.find(Parent)->getSecond(); + if (A->promote(ParentLR)){ + l.push_back(A); //guaranteed to exist } } } } - std::vector toAdd; for (BasicBlock *BB : Parent->getBlocks()){ for (Instruction &I : *BB){ MemoryUseOrDef *MA = MSSA.getMemoryAccess(&I); - AffAcc *A; - if (MA && 
access.find(MA) == access.end()){ //no AffAcc for this memory access yet! - if (isa(I)){ - A = new AffAcc(ArrayRef(&I), SE.getSCEV(cast(I).getPointerOperand()), MA, ArrayRef(loopPath), SE); - } else if (isa(I)) { - A = new AffAcc(ArrayRef(&I), SE.getSCEV(cast(I).getPointerOperand()), MA, ArrayRef(loopPath), SE); - } else { - //this is probably a call in the loop that modifies memory or sth like that - A = new AffAcc(ArrayRef(&I), nullptr, MA, ArrayRef(loopPath), SE); - } + if (MA && hasMemInst(MA) && access.find(MA) == access.end()){ //no AffAcc for this memory access yet! + Value *Addr = getAddress(MA); + const SCEV *AddrSCEV = nullptr; + if (Addr) AddrSCEV = SE.getSCEV(Addr); + AffAcc *A = new AffAcc(ArrayRef(&I), AddrSCEV, MA, ArrayRef(path), SE); + all.push_back(A); access.insert(std::make_pair(MA, A)); - toAdd.push_back(A); if (ParentLR->isAvailable()){ bool onAllCFPaths = true; for (Instruction *I : A->getAccesses()) onAllCFPaths &= isOnAllControlFlowPaths(I->getParent(), Parent, DT); if (onAllCFPaths && A->promote(ParentLR)){ - expandableAccesses.find(Parent)->getSecond().push_back(A); //guaranteed to exist + wellformedAccesses.find(Parent)->getSecond().push_back(A); //guaranteed to exist } } } } } - - for (AffAcc *A : toAdd){ - if (A->isWrite()) addConflictsForDef(A, Parent); - else addConflictsForUse(A, Parent); - all.insert(A); - } - - assert(loopPath.back() == Parent); - loopPath.pop_back(); //remove Parent again - - return std::move(all); -} - -///we can prefetch a use before the loop iff its MemoryUse only depends on MemoryDefs that dominate the loop -///this adds conflicts between A and all MemoryDefs that stand in the way of that -void AffineAccess::addConflictsForUse(AffAcc *A, const Loop *L){ - Value *AAddr = getAddress(A->getAccesses()[0]); - auto *W = MSSA.getSkipSelfWalker(); - std::deque worklist; - worklist.push_back(W->getClobberingMemoryAccess(A->getMemoryAccess())); - while (!worklist.empty()){ - MemoryAccess *C = worklist.front(); 
worklist.pop_front(); - if (!C) continue; - if (isa(C)){ - MemoryDef *MA = cast(C); - Value *MAAddr = getAddress(MA->getMemoryInst()); - if (L->contains(cast(MA)->getMemoryInst()) && (!AAddr || !MAAddr || AA.alias(AAddr, MAAddr))) { //we have a conflict inside loop - auto p = access.find(cast(MA)); - assert(p != access.end() && "by this point all accesses in L should have an AffAcc!"); - AffAcc *O = p->second; - //FIXME: should only consider cf-paths where the reps are > 0? - if (!A->isWellFormed(A->loopToDimension(L)) || !O->isWellFormed(O->loopToDimension(L))){ - addConflict(A, O, L, ConflictKind::Bad); //not well formed ==> cannot generate intersection checks - }else if (!MSSA.dominates(A->getMemoryAccess(), MA)){ //O might happen before A! - addConflict(A, O, L, ConflictKind::MustNotIntersect); //RaW - } else { //A always happens before O - bool sameBaseAddrSCEV = SCEVEquals(A->getBaseAddr(A->loopToDimension(L)), O->getBaseAddr(O->loopToDimension(L)), SE); - if (accessPatternsMatch(A, O, L)){ - if (!sameBaseAddrSCEV){ - //TODO: use baseAddrSCEV to catch cases where they are for sure not the same - addConflict(A, O, L, ConflictKind::MustBeSame); //WaR - } - } else { - //if (sameBaseAddrSCEV) addConflict(A, O, L, ConflictKind::Bad); // this might not hold but might be useful - addConflict(A, O, L, ConflictKind::MustNotIntersect); //WaR - } + errs()<<"analyze: done with loop: "<getHeader()->getNameOrAsOperand()<<"\n"; + return all; +} + +void AffineAccess::addAllConflicts(const std::vector &all) { + for (AffAcc *A : all) { + assert(A); + ArrayRef loops = A->getContainingLoops(); + const Loop *outerMostExpandableExl = nullptr; + if (A->isWrite()){ + MemoryDef *MA = cast(A->getMemoryAccess()); + const DenseSet &cu = MD.findClobberUsers(MA); + for (MemoryUseOrDef *D : cu) { + if (MA == D || !hasMemInst(D)) continue; + const Loop *innermostCommon = findFirstContaining(loops, D->getBlock()); + if (!innermostCommon) continue; + auto p = access.find(D); + if (p == 
access.end()) continue; //no AffAcc for D ==> skip + AffAcc *B = p->second; + AffAccConflict kind = AffAccConflict::Bad; + if (A->isWellFormed(innermostCommon) && B->isWellFormed(innermostCommon)) { + if (B->isWrite()) kind = AffAccConflict::MustNotIntersect; //WaW + else kind = getRWConflict(B, A, innermostCommon); + } + //at this point, even if the two do not alias, we assume the chance is high that they do at runtime + //if their base addresses share some SCEVUnknowns (ie. some Value's) (FIXME: CONSERVATIVE) + if (kind == AffAccConflict::MustNotIntersect){ + if (shareValues(A->getBaseAddr(A->loopToDimension(innermostCommon)), B->getBaseAddr(B->loopToDimension(innermostCommon)))) + kind = AffAccConflict::Bad; } - worklist.push_back(W->getClobberingMemoryAccess(C)); - //aliasing is transitive and once an memory def is before loop it will not depend on other defs inside loop - //so we only add more defs inside the `if` + if (kind != AffAccConflict::NoConflict) A->addConflict(B, innermostCommon, kind); + if (kind == AffAccConflict::Bad) outerMostExpandableExl = innermostCommon; } - } else if (isa(C)){ - MemoryPhi *P = cast(C); - for (unsigned i = 0u; i < P->getNumOperands(); i++){ - worklist.push_back(P->getOperand(i)); //this adds MemoryDefs that do not alias, but will be be removed when pop-ed + } else { + MemoryUseOrDef *MA = A->getMemoryAccess(); + const DenseSet &cs = MD.findClobbers(MA); + for (MemoryDef *D : cs) { + if (MA == D || !hasMemInst(D)) continue; + const Loop *innermostCommon = findFirstContaining(loops, D->getBlock()); + if (!innermostCommon) continue; + auto p = access.find(D); + if (p == access.end()) continue; //no AffAcc for D ==> skip + AffAcc *B = p->second; + AffAccConflict kind = getRWConflict(A, B, innermostCommon); + //at this point, even if the two do not alias, we assume the chance is high that they do at runtime + //if their base addresses share some SCEVUnknowns (ie. 
some Value's) (FIXME: CONSERVATIVE) + if (kind == AffAccConflict::MustNotIntersect){ + if (shareValues(A->getBaseAddr(A->loopToDimension(innermostCommon)), B->getBaseAddr(B->loopToDimension(innermostCommon)))) + kind = AffAccConflict::Bad; + } + if (kind != AffAccConflict::NoConflict) A->addConflict(B, innermostCommon, kind); + if (kind == AffAccConflict::Bad) outerMostExpandableExl = innermostCommon; + } + } + for (const Loop *L : loops) { + if (!L) continue; + if (L == outerMostExpandableExl) break; + if (!A->isWellFormed(L)) break; + auto p = expandableAccesses.find(L); + if (p == expandableAccesses.end()){ + SmallVector l; + l.push_back(A); + expandableAccesses.insert(std::make_pair(L, std::move(l))); + } else { + p->getSecond().push_back(A); } } } } -//we can delay a store up to after the loop if it is not redefined or used in the loop anymore -void AffineAccess::addConflictsForDef(AffAcc *A, const Loop *L){ - -} - -void AffineAccess::addConflict(AffAcc *A, AffAcc *B, const Loop *L, ConflictKind kind){ - if (!A->canExpandBefore(L) || !B->canExpandBefore(L)) { - kind = ConflictKind::Bad; +AffAccConflict AffineAccess::getRWConflict(AffAcc *Read, AffAcc *Write, const Loop *L) { + assert(!Read->isWrite()); + assert(Write->isWrite()); + if (!L->contains(Read->getMemoryAccess()->getBlock()) || !L->contains(Write->getMemoryAccess()->getBlock())) return AffAccConflict::NoConflict; + if (!Read->isWellFormed(L) || !Write->isWellFormed(L)) return AffAccConflict::Bad; + Value *Addr = getAddress(Read->getMemoryAccess()); + Value *DAddr = getAddress(Write->getMemoryAccess()); + if (Addr && DAddr && AA.alias(Addr, DAddr) == NoAlias) return AffAccConflict::NoConflict; + AffAccConflict kind = AffAccConflict::Bad; + if (!MSSA.dominates(Read->getMemoryAccess(), Write->getMemoryAccess())) { //read does not dominate write ==> RaW + kind = AffAccConflict::MustNotIntersect; + } else { //read dominates write ==> WaR + kind = AffAccConflict::MustNotIntersect; + //exception we know 
that the store always happens to a position already written from if the store is to same address as write (FIXME: CONSERVATIVE) + if ((Addr && DAddr && AA.alias(Addr, DAddr) == MustAlias) + || accessPatternsAndAddressesMatch(Read, Write, L)) + { + kind = AffAccConflict::NoConflict; + } } - A->addConflictInLoop(B, L, kind); - B->addConflictInLoop(A, L, kind); + + return kind; } bool AffineAccess::accessPatternsMatch(const AffAcc *A, const AffAcc *B, const Loop *L) const { @@ -610,34 +877,71 @@ bool AffineAccess::accessPatternsMatch(const AffAcc *A, const AffAcc *B, const L return true; } -ScalarEvolution &AffineAccess::getSE() const { return this->SE; } +bool AffineAccess::accessPatternsAndAddressesMatch(const AffAcc *A, const AffAcc *B, const Loop *L) const { + if (!accessPatternsMatch(A, B, L)) return false; + return SCEVEquals(A->getBaseAddr(A->loopToDimension(L)), B->getBaseAddr(B->loopToDimension(L)), SE); +} +ScalarEvolution &AffineAccess::getSE() const { return this->SE; } DominatorTree &AffineAccess::getDT()const { return this->DT; } - LoopInfo &AffineAccess::getLI() const { return this->LI; } - MemorySSA &AffineAccess::getMSSA() const { return this->MSSA; } - AAResults &AffineAccess::getAA() const { return this->AA; } - -ArrayRef AffineAccess::getLoopsInPreorder() const { return this->LI.getLoopsInPreorder(); } - -Value *AffineAccess::getAddress(Instruction *I) { - if (auto *L = dyn_cast(I)) return L->getPointerOperand(); - if (auto *S = dyn_cast(I)) return S->getPointerOperand(); - return nullptr; -} - -ArrayRef AffineAccess::getExpandableAccesses(const Loop *L) const { - return ArrayRef(expandableAccesses.find(L)->getSecond()); -} - -const AffAcc *AffineAccess::getAccess(Instruction *I) const { - MemoryUseOrDef *MA = MSSA.getMemoryAccess(I); - if (!MA) return nullptr; - auto p = access.find(MA); - if (p == access.end()) return nullptr; - return p->second; +SmallVector AffineAccess::getLoopsInPreorder() const { return this->LI.getLoopsInPreorder(); } + 
+ArrayRef AffineAccess::getExpandableAccesses(const Loop *L) { + auto p = expandableAccesses.find(L); + if (p == expandableAccesses.end()) return ArrayRef(); + return ArrayRef(p->getSecond()); +} + + +std::vector +AffineAccess::expandAllAt(ArrayRef Accs, const Loop *L, + Instruction *Point, Value *&BoundCheck, + Type *PtrTy, IntegerType *ParamTy, IntegerType *AgParamTy) { + std::vector res; + IRBuilder<> builder(Point); + for (AffAcc *A : Accs) { + res.push_back(std::move(A->expandAt(L, Point, PtrTy, ParamTy, AgParamTy))); + } + std::vector checks; + for (auto A = res.begin(); A != res.end(); ++A) { + for (auto B = res.begin(); B != A; ++B){ + AffAccConflict kind = std::max(A->Access->getConflict(B->Access, L), B->Access->getConflict(A->Access, L)); + switch (kind) + { + case AffAccConflict::NoConflict: + break; //nothing to add + case AffAccConflict::MustNotIntersect: { + Value *x = builder.CreateICmpULT(A->UpperBound, B->LowerBound, "no.inter.ab"); + Value *y = builder.CreateICmpULT(B->UpperBound, A->LowerBound, "no.inter.ba"); + checks.push_back(builder.CreateOr(x, y, "no.intersect")); + break; + } + case AffAccConflict::Bad: + assert(false && "cannot expand the given access because some of them have a bad conflict!"); + break; + default: + llvm_unreachable("unknown conflict type"); + } + } + } + DenseSet loops; //find all relevant loops + for (AffAcc *A : Accs) { + for (unsigned d = 0u; d < A->loopToDimension(L); d++) { + const Loop *x = A->getLoop(d); + if (x) loops.insert(x); + } + } + for (const Loop *M : loops) { //generate checks for the loops + auto p = reps.find(M); + assert(p != reps.end()); + checks.push_back(p->second->expandLoopGuard(Point)); + } + if (checks.empty()) BoundCheck = builder.getTrue(); + else BoundCheck = builder.CreateAnd(checks); + return res; } //================== Affine Access Analysis ================================================== @@ -664,9 +968,10 @@ PreservedAnalyses AffineAccessAnalysisPass::run(Function &F, 
FunctionAnalysisMan AffineAccess AA = FAM.getResult(F); for (const Loop *L : AA.getLI().getLoopsInPreorder()){ L->dump(); - for (const AffAcc *A : AA.getAccesses(L)){ - A->dump(); + for (const AffAcc *A : AA.getExpandableAccesses(L)){ + A->dumpInLoop(L); } } return PreservedAnalyses::all(); } + diff --git a/llvm/lib/Transforms/SSR/SSRGeneration.cpp b/llvm/lib/Transforms/SSR/SSRGeneration.cpp index d9ca508adc81d..c1efd9ea1634b 100644 --- a/llvm/lib/Transforms/SSR/SSRGeneration.cpp +++ b/llvm/lib/Transforms/SSR/SSRGeneration.cpp @@ -48,6 +48,7 @@ #include #include #include +#include #define SSR_INFERENCE true @@ -61,10 +62,55 @@ using namespace llvm; -static cl::opt GenerateSSR("generate-ssr", cl::init(true), cl::Hidden); - namespace{ +template +struct ConflictTree { + void insertNode(const NodeT *Node, unsigned value, const NodeT *Parent) { + assert((values.find(Node) == values.end() || children.find(Node) == children.end()) && "not yet inserted"); + values.insert(std::make_pair(Node, value)); + children.insert(std::make_pair(Node, std::move(std::vector()))); + if (!Parent) { //this is root + assert(!Root && "Parent = nullptr, but root already exists"); + Root = Node; + } else { + auto p = children.find(Parent); + assert(p != children.end() && "parent cannot be found"); + p->getSecond().push_back(Node); + } + } + + //picks the nodes in the tree such that their combined value (conmbineFunc, needs to be associative & commutative) is the highest possible + std::vector findBest(const std::function &combineFunc) { + std::vector res; + if (!Root) return res; + findBest(Root, combineFunc, res); + return res; + } + +private: + unsigned findBest(const NodeT *N, const std::function &combineFunc, std::vector &res) { + unsigned size = res.size(); + unsigned val = 0u; + auto &chs = children.find(N)->getSecond(); + if (!chs.empty()) { + for (const NodeT *C : chs) val = combineFunc(val, findBest(C, combineFunc, res)); + } + unsigned nval = values.find(N)->second; + if (val > nval) 
{ + return val; + } else { + while (res.size() > size) res.pop_back(); + res.push_back(N); + return nval; + } + } + + DenseMap values; + DenseMap> children; + const NodeT *Root = nullptr; +}; + void clobberRegisters(ArrayRef regs, IRBuilder<> &builder){ //equivalent to asm volatile ("":::regs); std::string constraints = "~{dirflag},~{fpsr},~{flags}"; //TODO: what are these doing? @@ -81,155 +127,6 @@ void clobberRegisters(ArrayRef regs, IRBuilder<> &builder){ builder.CreateCall(IA); } -} //end of namespace - -///Wraps an AffineAcc *Access, expands all its SCEVs in constructor -struct GenSSR{ -private: - Value *Base; - ConstantInt *DMID; - SmallVector, SSR_MAX_DIM> offsets; - Value *MemBegin = nullptr; - Value *MemEnd = nullptr; - //Instruction *AvailableFrom; //use this and do everything lazy? - -public: - ///AffineAcc which is wrapped by this GenSSR - const AffineAcc *Access; - - ///expand data, bound, and stride - GenSSR(const AffineAcc *A, unsigned dmid, Instruction *ExpandBefore, AffineAccess &AF) : Access(A) { - auto &ctxt = ExpandBefore->getParent()->getContext(); - Type *i32 = IntegerType::getInt32Ty(ctxt); - DMID = cast(ConstantInt::get(i32, dmid)); - Base = AF.expandData(A, Type::getInt8PtrTy(ctxt), ExpandBefore); - for (unsigned i = 0U; i < A->getDimension(); i++){ - offsets.push_back(std::make_pair(AF.expandBound(A, i, i32, ExpandBefore), AF.expandStride(A, i, i32, ExpandBefore))); - } - } - - ///generate comparisons - Value *GenerateSSRGuard(Instruction *ExpandBefore){ - auto &ctxt = ExpandBefore->getParent()->getContext(); - Type *i64 = IntegerType::getInt64Ty(ctxt); - IRBuilder<> builder(ExpandBefore); - std::vector checks; - for (unsigned i = 0U; i < Access->getDimension(); i++){ - /// loop has to be taken at least once (>= 1) ==> bound >= 0 - /// SGE also works for unsigned int: if the bound is unsigned and larger than 2^30 it will be too large for the scratchpad anyway - checks.push_back(builder.CreateICmpSGE(getBound(i), 
ConstantInt::get(Type::getInt32Ty(ExpandBefore->getContext()), 0U))); - } - Value *BaseInt = builder.CreatePtrToInt(getBase(), i64, "base.to.int"); - this->MemBegin = BaseInt; - checks.push_back(builder.CreateICmpUGE(BaseInt, ConstantInt::get(i64, SSR_SCRATCHPAD_BEGIN), "scratchpad.begin.check")); - Value *EndIncl = BaseInt; - for (unsigned i = 0U; i < Access->getDimension(); i++){ - auto dim = formatv("{0}d", i+1u); - Value *Range = builder.CreateNUWMul(getBound(i), getStride(i), Twine("range.").concat(dim)); - Value *RangeExt = builder.CreateSExt(Range, i64, Twine("range.sext.").concat(dim)); - EndIncl = builder.CreateAdd(EndIncl, RangeExt, Twine("end.incl.").concat(dim)); - } - this->MemEnd = EndIncl; - checks.push_back(builder.CreateICmpULE(EndIncl, ConstantInt::get(i64, SSR_SCRATCHPAD_END), "scratchpad.end.check")); - return builder.CreateAnd(ArrayRef(checks)); - } - - ///generate setup instructions in loop preheader - void GenerateSetup(){ - Instruction *Point = Access->getLoop()->getLoopPreheader()->getTerminator(); - Module *mod = Point->getModule(); - IRBuilder<> builder(Point); - Type *i32 = Type::getInt32Ty(Point->getContext()); - Constant *dim = ConstantInt::get(i32, Access->getDimension() - 1U); //dimension - 1, ty=i32 - bool isStore = Access->getNStore() > 0u; - - Intrinsic::RISCVIntrinsics functions[] = { - Intrinsic::riscv_ssr_setup_bound_stride_1d, - Intrinsic::riscv_ssr_setup_bound_stride_2d, - Intrinsic::riscv_ssr_setup_bound_stride_3d, - Intrinsic::riscv_ssr_setup_bound_stride_4d - }; - Value *StrideChange = nullptr; - for (unsigned i = 0u; i < Access->getDimension(); i++){ - Value *Str = getStride(i); - Value *Bd = getBound(i); - Value *ChSt; - if (StrideChange) ChSt = builder.CreateSub(Str, StrideChange, formatv("stride.{0}d.final", i+1)); - else ChSt = Str; - Function *SSRBoundStrideSetup = Intrinsic::getDeclaration(mod, functions[i]); - std::array bsargs = {getDMID(), Bd, ChSt}; - builder.CreateCall(SSRBoundStrideSetup->getFunctionType(), 
SSRBoundStrideSetup, ArrayRef(bsargs))->dump(); - if (i + 1 != Access->getDimension()){ //only calculate stride change if needed - Value *bdXstr = builder.CreateMul(Bd, Str, formatv("bdXstd.{0}d", i+1)); - if (StrideChange) StrideChange = builder.CreateAdd(StrideChange, bdXstr, formatv("str.change.for{0}d", i+2)); - else StrideChange = bdXstr; - } - } - - unsigned n_reps = 0U; - std::string s = formatv("ft{0}", (unsigned)DMID->getValue().getLimitedValue()); - ArrayRef regs(s); - if (isStore){ - Function *SSRPush = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_push); - for (Instruction *I : Access->getAccesses()){ - std::array pusharg = {getDMID(), cast(I)->getValueOperand()}; - builder.SetInsertPoint(I); - clobberRegisters(regs, builder); - auto *C = builder.CreateCall(SSRPush->getFunctionType(), SSRPush, ArrayRef(pusharg)); - clobberRegisters(regs, builder); - C->dump(); I->dump(); - I->eraseFromParent(); - n_reps++; - } - }else{ - Function *SSRPop = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_pop); - std::array poparg = {getDMID()}; - for (Instruction *I : Access->getAccesses()){ - builder.SetInsertPoint(I); - clobberRegisters(regs, builder); - auto *V = builder.CreateCall(SSRPop->getFunctionType(), SSRPop, ArrayRef(poparg), "ssr.pop"); - clobberRegisters(regs, builder); - V->dump(); I->dump(); - BasicBlock::iterator ii(I); - ReplaceInstWithValue(I->getParent()->getInstList(), ii, V); - n_reps++; - } - } - - builder.SetInsertPoint(Point); - Constant *Rep = ConstantInt::get(i32, n_reps - 1U); - Function *SSRRepetitionSetup = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_setup_repetition); - std::array repargs = {getDMID(), Rep}; - builder.CreateCall(SSRRepetitionSetup->getFunctionType(), SSRRepetitionSetup, ArrayRef(repargs))->dump(); - - Function *SSRSetup; - if (!isStore){ - SSRSetup = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_read_imm); //can take _imm bc dm and dim are constant - }else{ - SSRSetup = Intrinsic::getDeclaration(mod, 
Intrinsic::riscv_ssr_write_imm); //can take _imm bc dm and dim are constant - } - std::array args = {getDMID(), dim, getBase()}; - //NOTE: this starts the prefetching ==> always needs to be inserted AFTER bound/stride and repetition setups !!! - builder.CreateCall(SSRSetup->getFunctionType(), SSRSetup, ArrayRef(args))->dump(); - - //create an SSR barrier in exit block. TODO: needed esp. for write streams? - builder.SetInsertPoint(Access->getLoop()->getExitBlock()->getFirstNonPHI()); - Function *SSRBarrier = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_barrier); - std::array barrargs = {getDMID()}; - builder.CreateCall(SSRBarrier->getFunctionType(), SSRBarrier, ArrayRef(barrargs))->dump(); - return; - } - - Value *getBase() const { return Base; } - Value *getBound(unsigned i) const { return offsets[i].first; } - Value *getStride(unsigned i) const { return offsets[i].second; } - ConstantInt *getDMID() const { return DMID; } - Value *getMemBegin() const { return MemBegin; } - Value *getMemEnd() const { return MemEnd; } -}; - -namespace{ - void copyPHIsFromPred(BasicBlock *BB){ BasicBlock *Pred = BB->getSinglePredecessor(); assert(Pred && "BB has single predecessor"); @@ -244,7 +141,7 @@ void copyPHIsFromPred(BasicBlock *BB){ } ///splits block, redirects all predecessor to first half of split, copies phi's -std::pair splitAt(Instruction *X, const Twine &name, DomTreeUpdater *DTU){ +std::pair splitAt(Instruction *X, const Twine &name){ BasicBlock *Two = X->getParent(); BasicBlock *One = splitBlockBefore(Two, X, nullptr, nullptr, nullptr, name); for (auto *BB : predecessors(Two)){ @@ -257,7 +154,6 @@ std::pair splitAt(Instruction *X, const Twine &name, } } } - DTU->flush(); copyPHIsFromPred(Two); //copy Phi's from One to Two return std::make_pair(One, Two); } @@ -265,14 +161,14 @@ std::pair splitAt(Instruction *X, const Twine &name, ///clones code from BeginWith up to EndBefore ///assumes all cf-paths from begin lead to end (or return) ///assumes there is a phi 
node for each value defined in the region that will be cloned in the block of EndBefore that is live after EndBefore -BranchInst *cloneRegion(Instruction *BeginWith, Instruction *EndBefore, DominatorTree &DT, DomTreeUpdater *DTU, LoopInfo *LI, MemorySSAUpdater *MSSAU){ +BranchInst *cloneRegion(Instruction *BeginWith, Instruction *EndBefore, DominatorTree &DT){ errs()<<"cloning from "<<*BeginWith<<" up to "<<*EndBefore<<"\n"; - - auto p = splitAt(BeginWith, "split.before", DTU); + + auto p = splitAt(BeginWith, "split.before"); BasicBlock *Head = p.first; BasicBlock *Begin = p.second; - p = splitAt(EndBefore, "fuse.prep", DTU); + p = splitAt(EndBefore, "fuse.prep"); BasicBlock *Fuse = p.first; BasicBlock *End = p.second; @@ -339,13 +235,13 @@ BranchInst *cloneRegion(Instruction *BeginWith, Instruction *EndBefore, Dominato ConstantInt::get(Type::getInt1Ty(HeadSucc->getContext()), 0u), Head ); - const auto &edge = BasicBlockEdge(std::make_pair(Fuse, End)); + const auto edge = BasicBlockEdge(std::make_pair(Fuse, End)); for (auto &p : clones){ for (User *U : p.first->users()){ auto *I = dyn_cast(U); if (I && DT.dominates(edge, I->getParent())){ errs()<<*I<<" makes use of "<<*p.first<<" after cloned region ==> add phi node at end!\n"; - assert(false && "did not declare phi node for live-out value"); + //assert(false && "did not declare phi node for live-out value"); } } } @@ -372,6 +268,93 @@ BranchInst *cloneRegion(Instruction *BeginWith, Instruction *EndBefore, Dominato return HeadBr; } +Value *GenerateTCDMCheck(ExpandedAffAcc &E, Instruction *Point) { + IRBuilder<> builder(Point); + IntegerType *i64 = IntegerType::getInt64Ty(Point->getContext()); + Value *c1 = builder.CreateICmpUGE(ConstantInt::get(i64, SSR_SCRATCHPAD_BEGIN), E.LowerBound, "beg.check"); + Value *c2 = builder.CreateICmpULE(E.UpperBound, ConstantInt::get(i64, SSR_SCRATCHPAD_END), "end.check"); + return builder.CreateAnd(c1, c2, "tcdm.check"); +} + +void GenerateSSRSetup(ExpandedAffAcc &E, unsigned dmid, 
Instruction *Point){ + Module *mod = Point->getModule(); + IRBuilder<> builder(Point); + Type *i32 = Type::getInt32Ty(Point->getContext()); + unsigned dim = E.getDimension(); + Constant *Dim = ConstantInt::get(i32, dim - 1U); //dimension - 1, ty=i32 + Constant *DMid = ConstantInt::get(i32, dmid); //ty=i32 + bool isStore = E.Access->isWrite(); + + Intrinsic::RISCVIntrinsics functions[] = { + Intrinsic::riscv_ssr_setup_bound_stride_1d, + Intrinsic::riscv_ssr_setup_bound_stride_2d, + Intrinsic::riscv_ssr_setup_bound_stride_3d, + Intrinsic::riscv_ssr_setup_bound_stride_4d + }; + + for (unsigned i = 0u; i < dim; i++) { + Value *Stride = E.Steps[i]; + if (i > 0) Stride = builder.CreateSub(Stride, E.PrefixSumRanges[i-1], formatv("stride.{0}d", i+1)); + Value *Bound = builder.CreateSub(E.Reps[i], ConstantInt::get(i32, 1u), formatv("bound.{0}d", i+1)); + Function *SSRBoundStrideSetup = Intrinsic::getDeclaration(mod, functions[i]); + std::array bsargs = {DMid, Bound, Stride}; + builder.CreateCall(SSRBoundStrideSetup->getFunctionType(), SSRBoundStrideSetup, ArrayRef(bsargs))->dump(); + } + + unsigned n_reps = 0U; + std::string s = formatv("ft{0}", dmid); + ArrayRef regs(s); + if (isStore){ + Function *SSRPush = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_push); + for (Instruction *I : E.Access->getAccesses()){ + std::array pusharg = {DMid, cast(I)->getValueOperand()}; + builder.SetInsertPoint(I); + clobberRegisters(regs, builder); + auto *C = builder.CreateCall(SSRPush->getFunctionType(), SSRPush, ArrayRef(pusharg)); + clobberRegisters(regs, builder); + C->dump(); I->dump(); + I->eraseFromParent(); + n_reps++; + } + }else{ + Function *SSRPop = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_pop); + std::array poparg = {DMid}; + for (Instruction *I : E.Access->getAccesses()){ + builder.SetInsertPoint(I); + clobberRegisters(regs, builder); + auto *V = builder.CreateCall(SSRPop->getFunctionType(), SSRPop, ArrayRef(poparg), "ssr.pop"); + clobberRegisters(regs, 
builder); + V->dump(); I->dump(); + BasicBlock::iterator ii(I); + ReplaceInstWithValue(I->getParent()->getInstList(), ii, V); + n_reps++; + } + } + + builder.SetInsertPoint(Point); + Constant *Rep = ConstantInt::get(i32, n_reps - 1U); + Function *SSRRepetitionSetup = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_setup_repetition); + std::array repargs = {DMid, Rep}; + builder.CreateCall(SSRRepetitionSetup->getFunctionType(), SSRRepetitionSetup, ArrayRef(repargs))->dump(); + + Function *SSRSetup; + if (!isStore){ + SSRSetup = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_read_imm); //can take _imm bc dm and dim are constant + }else{ + SSRSetup = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_write_imm); //can take _imm bc dm and dim are constant + } + std::array args = {DMid, Dim, E.Addr}; + //NOTE: this starts the prefetching ==> always needs to be inserted AFTER bound/stride and repetition setups !!! + builder.CreateCall(SSRSetup->getFunctionType(), SSRSetup, ArrayRef(args))->dump(); + + //create an SSR barrier in exit block. TODO: needed esp. for write streams? 
+ //builder.SetInsertPoint(Access->getLoop()->getExitBlock()->getFirstNonPHI()); + //Function *SSRBarrier = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_barrier); + //std::array barrargs = {DMid}; + //builder.CreateCall(SSRBarrier->getFunctionType(), SSRBarrier, ArrayRef(barrargs))->dump(); + return; +} + ///generates SSR enable & disable calls void generateSSREnDis(const Loop *L){ IRBuilder<> builder(L->getLoopPreheader()->getTerminator()); // ----------- in preheader @@ -379,16 +362,6 @@ void generateSSREnDis(const Loop *L){ Function *SSREnable = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_enable); builder.CreateCall(SSREnable->getFunctionType(), SSREnable, ArrayRef()); -<<<<<<< HEAD -<<<<<<< HEAD - //insert frep pragma - //Function *FrepPragma = Intrinsic::getDeclaration(mod, Intrinsic::riscv_frep_infer); - //builder.CreateCall(FrepPragma->getFunctionType(), FrepPragma, ArrayRef()); -======= - Function *FREPPragma = Intrinsic::getDeclaration(mod, Intrinsic::riscv_frep_infer); - builder.CreateCall(FREPPragma->getFunctionType(), FREPPragma, ArrayRef()); ->>>>>>> analysis -======= std::vector regs; for (unsigned r = 0u; r < NUM_SSR; r++){ regs.push_back(std::string(formatv("ft{0}", r))); @@ -399,7 +372,6 @@ void generateSSREnDis(const Loop *L){ //Function *FREPPragma = Intrinsic::getDeclaration(mod, Intrinsic::riscv_frep_infer); //builder.CreateCall(FREPPragma->getFunctionType(), FREPPragma, ArrayRef()); ->>>>>>> analysis builder.SetInsertPoint(L->getExitBlock()->getTerminator()); // ----------- in exit block clobberRegisters(ArrayRef(regs), builder); @@ -414,232 +386,109 @@ void generateSSREnDis(const Loop *L){ return; } -Value *generateIntersectCheck(IRBuilder<> &builder, GenSSR *G, GenSSR *H){ - Value *Glo = G->getMemBegin(); - Value *Ghi = G->getMemEnd(); - Value *Hlo = H->getMemBegin(); - Value *Hhi = H->getMemEnd(); - Value *GhiLTHlo = builder.CreateICmpULT(Ghi, Hlo, "1st.memrange.check"); //bounds are inclusive, we assume alignment - Value 
*HhiLTGlo = builder.CreateICmpULT(Hhi, Glo, "2nd.memrange.check"); - return builder.CreateOr(GhiLTHlo, HhiLTGlo, "or.memrange"); -} +void expandInLoop(const std::vector &accs, const Loop *L, AffineAccess &AAA) { + assert(accs.size() <= NUM_SSR); + assert(L); + + errs()<<"expanding in Loop: "<getHeader()->getNameOrAsOperand()<<" at depth "<getLoopDepth()<<"\n"; + + auto &ctxt = L->getHeader()->getContext(); + IntegerType *i32 = IntegerType::getInt32Ty(ctxt); + IntegerType *i64 = IntegerType::getInt64Ty(ctxt); + Type *i8Ptr = Type::getInt8PtrTy(ctxt); + + BranchInst *BR = cloneRegion(L->getLoopPreheader()->getTerminator(), &*L->getExitBlock()->getFirstInsertionPt(), AAA.getDT()); -void generateSSRGuard(BranchInst *BR, ArrayRef streams, AffineAccess &AF){ - assert(BR->isConditional()); - if (streams.empty()) return; + //generate Stride, Bound, base addresses, and intersect checks + Value *Cond = nullptr; + auto exp = AAA.expandAllAt(accs, L, BR, Cond, i8Ptr, i32, i64); + assert(Cond); + + //TCDM Checks IRBuilder<> builder(BR); - std::vector checks; - for (auto *G : streams){ - checks.push_back(G->GenerateSSRGuard(BR)); //means getMemBegin() and getMemEnd() do not return nullptr + for (auto &E : exp) { + Cond = builder.CreateAnd(Cond, GenerateTCDMCheck(E, BR)); } - for (unsigned i = 0; i < streams.size(); i++){ - GenSSR *G = streams[i]; - for (unsigned j = 0; j < i; j++){ - GenSSR *H = streams[j]; - if (AF.conflictWWWR(G->Access, H->Access)){ - checks.push_back(generateIntersectCheck(builder, G, H)); - } - } + + BR->setCondition(Cond); + + unsigned dmid = 0u; + for (auto &E : exp) { + GenerateSSRSetup(E, dmid++, L->getLoopPreheader()->getTerminator()); } - Value *TakeSSR = builder.CreateAnd(checks); - BR->setCondition(TakeSSR); + generateSSREnDis(L); } -bool isValid(const AffineAcc *A){ - if (!A) return false; - if (A->getDimension() > SSR_MAX_DIM) return false; - unsigned n_store = 0U; - unsigned n_load = 0U; +bool isValid(AffAcc *A) { bool valid = true; - for (auto 
*I : A->getAccesses()){ - if (dyn_cast(I)) { - n_load++; - valid = valid && CHECK_TYPE(I->getType(), I); - }else if(auto St = dyn_cast(I)) { - n_store++; - valid = valid && CHECK_TYPE(St->getValueOperand()->getType(), I); - }else assert(false && "non load/store instruction in AffineAcc::accesses ?"); - if(!valid) break; + bool write = A->isWrite(); + for (Instruction *I : A->getAccesses()) { + if (write) valid &= CHECK_TYPE(cast(I)->getValueOperand()->getType(), I); + else valid &= CHECK_TYPE(I->getType(), I); } - return valid && ((n_store > 0U && n_load == 0U) || (n_store == 0U && n_load > 0U)); + return valid; } -struct ConflictGraph{ - using NodeT = const AffineAcc *; - - ///accs assumed to be valid - ConflictGraph(const AffineAccess &AF, ArrayRef accesses) : AF(AF){ - errs()<<"conflict graph with "<())); - mutexs.insert(std::make_pair(*A, std::vector())); - for (auto B = accesses.begin(); B != A; B++){ - if (AF.shareInsts(*A, *B)){ //AF.conflictWWWR(*A, *B) - mutexs.find(*A)->second.push_back(*B); - mutexs.find(*B)->second.push_back(*A); - }else if (AF.shareLoops(*A, *B)){ //here we assume that the accessed memory region do not intersect and check this at runtime - conflicts.find(*A)->second.push_back(*B); - conflicts.find(*B)->second.push_back(*A); - } - } - } - } - - ///currently done greedily according to isBetter - std::map> &color(unsigned nColors) { - std::map> &color = *(new std::map>()); - std::vector accs; - for (const auto &A : conflicts) accs.push_back(A.first); - auto isBetter = [](NodeT A, NodeT B){ - unsigned a = A->getLoop()->getLoopDepth() + 2 * A->getDimension() + (A->getNLoad() > 0u); - unsigned b = B->getLoop()->getLoopDepth() + 2 * B->getDimension() + (B->getNLoad() > 0u); - return a > b; - }; - std::sort(accs.begin(), accs.end(), isBetter); - for (const auto &A : accs){ - bool done = false; - for (const auto &M : mutexs.find(A)->second){ - auto c = color.find(M); - if (c != color.end() && c->second.hasValue()) {//one mutex neighbour has 
color => A cannot get one - color.insert(std::make_pair(A, None)); - done = true; - break; - } - } - if (done) continue; //done with this A ==> go to next - - BitVector cs(nColors); - for (const auto &M : conflicts.find(A)->second){ - auto mc = color.find(M); - if (mc != color.end() && mc->second.hasValue()){ //neighbour has some color mc ==> A cannot get mc - cs[mc->second.getValue()] = 1u; - } - } - int c = cs.find_first_unset(); - if (c >= 0) color.insert(std::make_pair(A, (unsigned)c)); - else color.insert(std::make_pair(A, None)); - } - return color; +void visitLoop(const Loop *L, DenseMap> &possible, ConflictTree &tree, AffineAccess &AAA) { + assert(L); + ArrayRef accs = AAA.getExpandableAccesses(L); + std::vector valid; + for (AffAcc *A : accs) { + if (isValid(A)) valid.push_back(A); } - -private: - const AffineAccess &AF; - std::map> conflicts; //cannot get same color - std::map> mutexs; //if one gets a color the other cannot get any color -}; - -void addChangedLoop(const Loop *NewL, SmallPtrSet &loops){ - //check whether L or any of its predecessors (parents, parents of parents, etc) are already marked for SSRenable & -disable - const Loop *L = NewL; - bool contained = false; - while (L && !contained){ - contained = contained || (loops.find(L) != loops.end()); - L = L->getParentLoop(); - } - if (!contained){ - //check for all loops in loops whether NewL contains them - std::vector dels; //cannot directly delete loops in foreach loops ==> store here first - for (const Loop *L : loops){ - if (NewL->contains(L)) dels.push_back(L); - } - for (const Loop *L : dels) loops.erase(L); - loops.insert(NewL); + if (valid.empty()) return; + //sort by dimension (with read beeing preferred over write) + auto comp = [L](const AffAcc *A, const AffAcc *B) { + unsigned dimA = A->loopToDimension(L); + unsigned dimB = B->loopToDimension(L); + return dimA < dimB || (dimA == dimB && !(A->isWrite() && !B->isWrite())); + }; + std::sort(valid.begin(), valid.end(), comp); + //add 
possible: + auto &l = possible.insert(std::make_pair(L, std::move(std::vector()))).first->getSecond(); + for (unsigned i = 0u; i < NUM_SSR && i < valid.size(); i++) { + l.push_back(valid[i]); } + //add to tree: + unsigned val = l.size(); //TODO: find more elaborate score model + tree.insertNode(L, val, L->isOutermost() ? nullptr : L->getParentLoop()); } } //end of namespace PreservedAnalyses SSRGenerationPass::run(Function &F, FunctionAnalysisManager &FAM){ - - if (!SSR_INFERENCE || !GenerateSSR) return PreservedAnalyses::all(); - AffineAccess &AF = FAM.getResult(F); - LoopInfo &LI = FAM.getResult(F); - DominatorTree &DT = FAM.getResult(F); - DomTreeUpdater DTU(&DT, DomTreeUpdater::UpdateStrategy::Eager); - auto &MA = FAM.getResult(F); - MemorySSAUpdater MSSAU(&MA.getMSSA()); + AffineAccess &AAA = FAM.getResult(F); errs()<<"SSR Generation Pass on function: "< changedLoops; - - auto accs = AF.getAccesses(); - - std::vector goodAccs; - for (const AffineAcc *A : accs){ - auto p = AF.splitLoadStore(A); - if (p.first && isValid(p.first)) goodAccs.push_back(p.first); - if (p.second && isValid(p.second)) goodAccs.push_back(p.second); - } - - if (goodAccs.empty()) return PreservedAnalyses::all(); //early exit - - ConflictGraph g(AF, ArrayRef(goodAccs)); - const auto &clr = g.color(NUM_SSR); - errs()<<"computed coloring\n"; - - DenseMap> ssrs; - - for (const auto C : clr){ - if (C.second.hasValue()){ //not None - //add to ssrs - auto p = ssrs.find(C.first->getLoop()); - GenSSR *G = new GenSSR(C.first, C.second.getValue(), C.first->getLoop()->getLoopPreheader()->getTerminator(), AF); - if (p != ssrs.end()) p->getSecond().push_back(G); - else ssrs.insert(std::make_pair(C.first->getLoop(), SmallVector(1u, G))); - - addChangedLoop(C.first->getLoop(), changedLoops); //update set of changed loops + bool changed = false; + for (const Loop *T : AAA.getLI().getTopLevelLoops()){ + DenseMap> possible; + ConflictTree tree; + + //go through all loops in this tree to build conflict-tree 
and find possible expands + std::deque worklist; + worklist.push_back(T); + while (!worklist.empty()) { + const Loop *L = worklist.front(); worklist.pop_front(); + visitLoop(L, possible, tree, AAA); + for (const Loop *x : L->getSubLoops()) worklist.push_back(x); } - } - errs()<<"expanded all SSR bases, bounds, and strides\n"; + //find best expands + auto f = [](unsigned a, unsigned b){ return a + b; }; + std::vector best = tree.findBest(f); - //generate clones - for (const Loop *L : LI.getLoopsInPreorder()){ - auto p = ssrs.find(L); - if (p != ssrs.end()){ - BranchInst *BR = cloneRegion(L->getLoopPreheader()->getTerminator(), &*L->getExitBlock()->getFirstInsertionPt(), DT, &DTU, &LI, &MSSAU); - generateSSRGuard(BR, ArrayRef(p->getSecond().begin(), p->getSecond().end()), AF); //generate "SSR guard" + //expand them + for (const Loop *L : best) { + expandInLoop(possible.find(L)->getSecond(), L, AAA); } - } - - errs()<<"generated all SSR guards\n"; - //generate ssr setups - for (const auto &p : ssrs){ - for (GenSSR *G : p.getSecond()){ - G->GenerateSetup(); - } + changed |= !best.empty(); } - errs()<<"generated all SSR setups\n"; - - //generate enable / disable - for (const Loop *L : changedLoops) generateSSREnDis(L); - - errs()<<"generated all SSR enable & disable \n"; - - //TODO: merge loops - //TODO: frep pragmas - - F.addFnAttr(Attribute::AttrKind::NoInline); //mark function as no-inline, because there can be intersecting streams if function is inlined! 
+ if (changed) return PreservedAnalyses::all(); return PreservedAnalyses::none(); -} - - - -/* -for (unsigned i = 0; i < streams.size(); i++){ - GenSSR *G = streams[i]; - for (unsigned j = 0; j < streams.size(); j++){ - if (G->Access->getNStore() > 0u){ - GenSSR *H = streams[j]; - if (j < i || (j > i && H->Access->getNStore() == 0u)){ //true if H is before G OR H is after G and a load - checks.push_back(generateIntersectCheck(builder, G, H)); - } - } - } - } -*/ \ No newline at end of file +} \ No newline at end of file From 77a028613705107e192f69c51bc2aef8b3db7b38 Mon Sep 17 00:00:00 2001 From: thrupf Date: Sun, 5 Jun 2022 00:08:26 +0200 Subject: [PATCH 31/47] performance problems? --- llvm/lib/Analysis/AffineAccessAnalysis.cpp | 76 ++++++++++++---------- 1 file changed, 41 insertions(+), 35 deletions(-) diff --git a/llvm/lib/Analysis/AffineAccessAnalysis.cpp b/llvm/lib/Analysis/AffineAccessAnalysis.cpp index 15d51633e0f8d..5266daeada4db 100644 --- a/llvm/lib/Analysis/AffineAccessAnalysis.cpp +++ b/llvm/lib/Analysis/AffineAccessAnalysis.cpp @@ -53,6 +53,40 @@ using namespace llvm; namespace { +struct SCEVUknownSetFinder { + DenseSet values; + // return true to follow this node. + bool follow(const SCEV *S) { + if (S->getSCEVType() == SCEVTypes::scUnknown) { + values.insert(cast(S)->getValue()); + } + return true; //always true + } + // return true to terminate the search. 
+ bool isDone() { return false; /*continue forever*/ } +}; + +bool shareValues(const SCEV *A, const SCEV *B) { + SCEVUknownSetFinder finderA; + SCEVTraversal trA(finderA); + trA.visitAll(A); + SCEVUknownSetFinder finderB; + SCEVTraversal trB(finderB); + trB.visitAll(B); + bool shareValues = false; + for (Value *V : finderA.values) { + for (Value *W : finderB.values) { + shareValues |= V == W; + } + } + return shareValues; +} + +bool SCEVContainsCouldNotCompute(const SCEV *S) { + auto pred = [](const SCEV *X) { return !X || X->getSCEVType() == SCEVTypes::scCouldNotCompute || isa(X); }; + return SCEVExprContains(S, std::move(pred)); +} + /// guarantees: /// L has 1 preheader and 1 dedicated exit /// L has 1 backedge and 1 exiting block @@ -70,7 +104,7 @@ const SCEV *getLoopBTSCEV(const Loop *L, DominatorTree &DT, ScalarEvolution &SE) return nullptr; } const SCEV *bt = SE.getBackedgeTakenCount(L); - if(!bt || isa(bt) || !SE.isAvailableAtLoopEntry(bt, L)){ + if(!bt || isa(bt) || !SE.isAvailableAtLoopEntry(bt, L) || SCEVContainsCouldNotCompute(bt)){ return nullptr; } return bt; @@ -289,35 +323,6 @@ Value *goodAnd(IRBuilder<> &builder, ArrayRef bools){ bool hasMemInst(MemoryUseOrDef *MA) { return MA && MA->getMemoryInst(); } -struct SCEVUknownSetFinder { - DenseSet values; - // return true to follow this node. - bool follow(const SCEV *S) { - if (S->getSCEVType() == SCEVTypes::scUnknown) { - values.insert(cast(S)->getValue()); - } - return true; //always true - } - // return true to terminate the search. 
- bool isDone() { return false; /*continue forever*/ } -}; - -bool shareValues(const SCEV *A, const SCEV *B) { - SCEVUknownSetFinder finderA; - SCEVTraversal trA(finderA); - trA.visitAll(A); - SCEVUknownSetFinder finderB; - SCEVTraversal trB(finderB); - trB.visitAll(B); - bool shareValues = false; - for (Value *V : finderA.values) { - for (Value *W : finderB.values) { - shareValues |= V == W; - } - } - return shareValues; -} - } //end of namespace //================== =========================================================== @@ -392,6 +397,7 @@ AffAcc::AffAcc(ArrayRef accesses, const SCEV *Addr, MemoryUseOrDe { assert(!accesses.empty()); assert(MA); + if (Addr && SCEVContainsCouldNotCompute(Addr)) Addr = nullptr; //set to null if contains SCEVCouldNotCompute baseAddresses.push_back(Addr); steps.push_back((const SCEV *)nullptr); //there is no step for dim=0 reps.push_back((LoopRep *)nullptr); //there is no rep for dim=0 @@ -478,7 +484,7 @@ unsigned AffAcc::loopToDimension(const Loop *L) const { for (unsigned d = 1u; d < containingLoops.size(); d++){ //FIXME: linear search -> improve with a map if (containingLoops[d] == L) return d; } - assert(false && "The provided loop does not contain `this`!"); + llvm_unreachable("The provided loop does not contain `this`!"); } bool AffAcc::canExpandBefore(const Loop *L) const { return isWellFormed(loopToDimension(L)); } const SCEV *AffAcc::getBaseAddr(unsigned dim) const { assert(dim < baseAddresses.size()); return baseAddresses[dim]; } @@ -531,7 +537,7 @@ void AffAcc::addConflict(AffAcc *A, const Loop *StartL, AffAccConflict kind){ assert(conflicts.find(A) == conflicts.end() && "no conflict for A yet"); assert(kind == AffAccConflict::Bad || (isWellFormed(StartL) && A->isWellFormed(StartL))); conflicts.insert(std::make_pair(A, std::make_pair(StartL, kind))); - errs()<<"conflict for:\n"; dumpInLoop(StartL); errs()<<"with:\n"; A->dumpInLoop(StartL); errs()<<"is ===========>"; + /*errs()<<"conflict for:\n"; dumpInLoop(StartL); 
errs()<<"with:\n"; A->dumpInLoop(StartL); errs()<<"is ===========>"; switch (kind) { case AffAccConflict::Bad: @@ -546,7 +552,7 @@ void AffAcc::addConflict(AffAcc *A, const Loop *StartL, AffAccConflict kind){ default: break; } - errs()<<"\n"; + errs()<<"\n"; */ } bool AffAcc::promote(LoopRep *LR){ if (!LR->isAvailable()) return false; @@ -567,7 +573,7 @@ bool AffAcc::promote(LoopRep *LR){ possible &= LR->isSafeToExpandBefore(LR->getLoop()); if (possible) errs()<<"(3) new rep & step can be expanded, "; //check base address - possible &= isSafeToExpandAt(getBaseAddr(newDim), Point, SE); + possible &= !SCEVContainsCouldNotCompute(getBaseAddr(newDim)) && isSafeToExpandAt(getBaseAddr(newDim), Point, SE); if (possible) errs()<<"(4) new base addr can be expanded"; errs()<<"\n"; if (!possible) return false; @@ -739,7 +745,7 @@ std::vector AffineAccess::analyze(const Loop *Parent, ArrayRefgetSubLoops()){ std::vector accs = analyze(L, ArrayRef(path)); LoopRep *LR = reps.find(L)->second; //guaranteed to exist, no check needed - bool canPromote = LR->isOnAllCFPathsOfParentIfExecuted() && ParentLR->isAvailable() && LR->isAvailable(); + bool canPromote = LR->isAvailable() && ParentLR->isAvailable() && LR->isOnAllCFPathsOfParentIfExecuted(); for (AffAcc *A : accs){ all.push_back(A); if (canPromote){ //L is well-formed and on all CF-paths if its rep is >0 at run-time From 61399ea9b60482c21e730a38c857516cfdb5bbbc Mon Sep 17 00:00:00 2001 From: thrupf Date: Sun, 5 Jun 2022 17:00:41 +0200 Subject: [PATCH 32/47] 4/7 sw tests --- .../llvm/Analysis/AffineAccessAnalysis.h | 8 +- llvm/lib/Analysis/AffineAccessAnalysis.cpp | 141 ++++++++---------- llvm/lib/Transforms/SSR/SSRGeneration.cpp | 104 +++++++++---- 3 files changed, 143 insertions(+), 110 deletions(-) diff --git a/llvm/include/llvm/Analysis/AffineAccessAnalysis.h b/llvm/include/llvm/Analysis/AffineAccessAnalysis.h index b097d35bfbe49..8539569cb198e 100644 --- a/llvm/include/llvm/Analysis/AffineAccessAnalysis.h +++ 
b/llvm/include/llvm/Analysis/AffineAccessAnalysis.h @@ -93,8 +93,6 @@ struct AffAcc{ struct MemDep { private: - DenseMap> clobbers; - DenseMap> clobberUsers; MemorySSA &MSSA; AAResults &AA; bool alias(Value *A, Value *B); @@ -102,10 +100,8 @@ struct MemDep { public: MemDep(MemorySSA &MSSA, AAResults &AA) : MSSA(MSSA), AA(AA) {} - const DenseSet &findClobbers(MemoryUseOrDef *MA); - std::vector findClobbersInLoop(MemoryUseOrDef *MA, const Loop *L); - const DenseSet &findClobberUsers(MemoryDef *MA); - std::vector findClobberUsersInLoop(MemoryDef *MA, const Loop *L); + DenseSet findClobbers(MemoryUseOrDef *MA); + DenseSet findClobberUsers(MemoryDef *MA); }; struct ExpandedAffAcc { diff --git a/llvm/lib/Analysis/AffineAccessAnalysis.cpp b/llvm/lib/Analysis/AffineAccessAnalysis.cpp index 5266daeada4db..5d740a802ffe7 100644 --- a/llvm/lib/Analysis/AffineAccessAnalysis.cpp +++ b/llvm/lib/Analysis/AffineAccessAnalysis.cpp @@ -512,11 +512,16 @@ void AffAcc::dumpInLoop(const Loop *L) const { errs()<<", rep = "; if (r) errs()<<*r; else errs()<<""; + errs()<<", well-formed = "<isWellFormed(dim); errs()<<"\n"; errs()<<"\taddress = "; if (a) errs()<<*a; else errs()<<""; - errs()<<" is well-formed = "<isWellFormed(dim)<<"\n"; + errs()<<"\n"; + errs()<<"\tloop header = "; + if (getLoop(dim)) errs()<getHeader()->getNameOrAsOperand(); + else errs()<<""; + errs()<<"\n"; } } void AffAcc::dump() const { dumpInLoop(nullptr); } @@ -585,7 +590,14 @@ bool AffAcc::promote(LoopRep *LR){ Value *AffAcc::expandBaseAddr(unsigned dimension, Type *ty, Instruction *InsertBefore){ assert(isWellFormed(dimension)); InsertBefore = InsertBefore ? 
InsertBefore : reps[dimension]->getLoop()->getLoopPreheader()->getTerminator(); - assert(isSafeToExpandAt(getBaseAddr(dimension), InsertBefore, SE) && "data not expanable here (note: only preheader guaranteed)"); + if (!isSafeToExpandAt(getBaseAddr(dimension), InsertBefore, SE)){ + errs()<<"data not expanable here (note: only preheader guaranteed)\n"; + errs()<<"SCEV (dim = "<getParent()->dump(); + errs()<<"before inst: "<<*InsertBefore<<"\n"; + this->dump(); + llvm_unreachable("cannot expand SCEV at desired location"); + } SCEVExpander ex(SE, reps[dimension]->getLoop()->getHeader()->getModule()->getDataLayout(), "addr"); ex.setInsertPoint(InsertBefore); return castToSize(ex.expandCodeFor(getBaseAddr(dimension)), ty, InsertBefore); @@ -607,6 +619,7 @@ Value *AffAcc::expandRep(unsigned dimension, Type *ty, Instruction *InsertBefore ExpandedAffAcc AffAcc::expandAt(const Loop *L, Instruction *Point, Type *PtrTy, IntegerType *ParamTy, IntegerType *AgParamTy) { + errs()<<"expanding for Loop with header: "<getHeader()->getNameOrAsOperand()<<"\n"; if (!Point) Point = L->getLoopPreheader()->getTerminator(); IRBuilder<> builder(Point); assert(isWellFormed(L)); @@ -639,87 +652,59 @@ bool MemDep::alias(MemoryUseOrDef *A, MemoryUseOrDef *B) { else return alias(getAddress(A), getAddress(B)); } -const DenseSet &MemDep::findClobbers(MemoryUseOrDef *MA){ - if (clobbers.find(MA) == clobbers.end()) { - clobbers.insert(std::make_pair(MA, std::move(DenseSet()))); - auto &res = clobbers.find(MA)->getSecond(); - std::deque worklist; - DenseSet vis; - worklist.push_back(MA->getDefiningAccess()); - while (!worklist.empty()) { - MemoryAccess *A = worklist.front(); worklist.pop_front(); - if (!A) continue; - if (vis.find(A) != vis.end()) continue; - if (A == MA) continue; - vis.insert(A); - if (MemoryDef *D = dyn_cast(A)) { - if (alias(D, MA)) { - auto &s = findClobbers(D); - res.insert(D); - for (auto *A : s) res.insert(A); - } else { - worklist.push_back(D); - } - } else { - MemoryPhi *P 
= cast(A); - for (unsigned i = 0u; i < P->getNumOperands(); i++) { - worklist.push_back(P->getOperand(i)); - } +DenseSet MemDep::findClobbers(MemoryUseOrDef *MA){ + DenseSet res; + std::deque worklist; + DenseSet vis; + worklist.push_back(MA->getDefiningAccess()); + while (!worklist.empty()) { + MemoryAccess *A = worklist.front(); worklist.pop_front(); + if (!A) continue; + if (vis.find(A) != vis.end()) continue; + if (A == MA) continue; + vis.insert(A); + if (MemoryDef *D = dyn_cast(A)) { + if (alias(D, MA)) { + res.insert(D); + } + worklist.push_back(D); + } else { + MemoryPhi *P = cast(A); + for (unsigned i = 0u; i < P->getNumOperands(); i++) { + worklist.push_back(P->getOperand(i)); } } } - return clobbers.find(MA)->getSecond(); + return res; } - -std::vector MemDep::findClobbersInLoop(MemoryUseOrDef *MA, const Loop *L) { - auto &s = findClobbers(MA); - std::vector r; - for (auto *A : s) - if (L && L->contains(A->getBlock())) r.push_back(A); - return r; -} - -const DenseSet &MemDep::findClobberUsers(MemoryDef *MA) { - if (clobberUsers.find(MA) == clobberUsers.end()) { - clobberUsers.insert(std::make_pair(MA, std::move(DenseSet()))); - auto &res = clobberUsers.find(MA)->getSecond(); - std::deque worklist; - DenseSet vis; - for (auto U = MA->use_begin(); U != MA->use_end(); ++U) { - worklist.push_back(cast(U->getUser())); - } - while (!worklist.empty()){ - MemoryAccess *A = worklist.front(); worklist.pop_front(); - if (!A) continue; - if (vis.find(A) != vis.end()) continue; - vis.insert(A); - if (MemoryUse *U = dyn_cast(A)) { - if (alias(U, MA)) res.insert(U); - } else if (MemoryDef *D = dyn_cast(A)) { - if (alias(D, MA)) { - auto &s = findClobberUsers(D); - res.insert(D); - for (auto *A : s) res.insert(A); - } else { - worklist.push_back(D); - } - } else { - assert(isa(A)); - for (auto U = A->use_begin(); U != A->use_end(); ++U) { - worklist.push_back(cast(U->getUser())); - } + +DenseSet MemDep::findClobberUsers(MemoryDef *MA) { + DenseSet res; + std::deque 
worklist; + DenseSet vis; + for (auto U = MA->use_begin(); U != MA->use_end(); ++U) { + worklist.push_back(cast(U->getUser())); + } + while (!worklist.empty()){ + MemoryAccess *A = worklist.front(); worklist.pop_front(); + if (!A) continue; + if (vis.find(A) != vis.end()) continue; + vis.insert(A); + if (MemoryUse *U = dyn_cast(A)) { + if (alias(U, MA)) res.insert(U); + } else if (MemoryDef *D = dyn_cast(A)) { + if (alias(D, MA)) { + res.insert(D); + } + worklist.push_back(D); + } else { + assert(isa(A)); + for (auto U = A->use_begin(); U != A->use_end(); ++U) { + worklist.push_back(cast(U->getUser())); } } } - return clobberUsers.find(MA)->getSecond(); -} - -std::vector MemDep::findClobberUsersInLoop(MemoryDef *MA, const Loop *L) { - auto &s = findClobberUsers(MA); - std::vector r; - for (auto *A : s) - if (L && L->contains(A->getBlock())) r.push_back(A); - return r; + return res; } //================== Affine Access =========================================================== @@ -905,7 +890,9 @@ ArrayRef AffineAccess::getExpandableAccesses(const Loop *L) { std::vector AffineAccess::expandAllAt(ArrayRef Accs, const Loop *L, Instruction *Point, Value *&BoundCheck, - Type *PtrTy, IntegerType *ParamTy, IntegerType *AgParamTy) { + Type *PtrTy, IntegerType *ParamTy, IntegerType *AgParamTy) +{ + assert(Point); std::vector res; IRBuilder<> builder(Point); for (AffAcc *A : Accs) { diff --git a/llvm/lib/Transforms/SSR/SSRGeneration.cpp b/llvm/lib/Transforms/SSR/SSRGeneration.cpp index c1efd9ea1634b..a075ecacdfa44 100644 --- a/llvm/lib/Transforms/SSR/SSRGeneration.cpp +++ b/llvm/lib/Transforms/SSR/SSRGeneration.cpp @@ -128,29 +128,57 @@ void clobberRegisters(ArrayRef regs, IRBuilder<> &builder){ } void copyPHIsFromPred(BasicBlock *BB){ - BasicBlock *Pred = BB->getSinglePredecessor(); - assert(Pred && "BB has single predecessor"); + BasicBlock *Pred = nullptr; + for (BasicBlock *B : predecessors(BB)) { + if (!Pred) Pred = B; + assert(Pred == B && "BB has only one 
predecessor"); + } + assert(Pred && "BB has a Predecessor"); for (Instruction &I : *Pred){ if (auto *Phi = dyn_cast(&I)){ PHINode *PhiC = PHINode::Create(Phi->getType(), 1u, Twine(Phi->getName()).concat(".copy"), BB->getFirstNonPHI()); Phi->replaceAllUsesWith(PhiC); PhiC->addIncoming(Phi, Pred); - errs()<<"replaced all uses of "<<*Phi<<" with "<<*PhiC<<"\n"; } } } ///splits block, redirects all predecessor to first half of split, copies phi's -std::pair splitAt(Instruction *X, const Twine &name){ +std::pair splitAt(Instruction *X, const Twine &name, DomTreeUpdater &DTU){ + assert(!isa(X) && "should not split at phi"); BasicBlock *Two = X->getParent(); - BasicBlock *One = splitBlockBefore(Two, X, nullptr, nullptr, nullptr, name); + BasicBlock *One = BasicBlock::Create(Two->getContext(), name, Two->getParent(), Two); + Instruction *BR = BranchInst::Create(Two, One); + DTU.applyUpdates(cfg::Update(cfg::UpdateKind::Insert, One, Two)); + BasicBlock::iterator it = Two->begin(); + while (it != X->getIterator()) { + BasicBlock::iterator it_next = std::next(it); + it->removeFromParent(); + it->insertBefore(BR); + it = it_next; + } + //BasicBlock *One = splitBlockBefore(Two, X, &DTU, nullptr, nullptr, name); + std::vector toChange; for (auto *BB : predecessors(Two)){ if (BB == One) continue; Instruction *T = BB->getTerminator(); + for (unsigned i = 0; i < T->getNumOperands(); i++){ + Value *OP = T->getOperand(i); + if (dyn_cast(OP) == Two){ + toChange.push_back(T); + } + } + } + for (Instruction *T : toChange) { for (unsigned i = 0; i < T->getNumOperands(); i++){ Value *OP = T->getOperand(i); if (dyn_cast(OP) == Two){ T->setOperand(i, One); //if an operand of the terminator of a predecessor of Two points to Two it should now point to One + cfg::Update upd[]{ + cfg::Update(cfg::UpdateKind::Insert, T->getParent(), One), + cfg::Update(cfg::UpdateKind::Delete, T->getParent(), Two), + }; + DTU.applyUpdates(upd); } } } @@ -161,14 +189,14 @@ std::pair splitAt(Instruction *X, const 
Twine &name) ///clones code from BeginWith up to EndBefore ///assumes all cf-paths from begin lead to end (or return) ///assumes there is a phi node for each value defined in the region that will be cloned in the block of EndBefore that is live after EndBefore -BranchInst *cloneRegion(Instruction *BeginWith, Instruction *EndBefore, DominatorTree &DT){ +BranchInst *cloneRegion(Instruction *BeginWith, Instruction *EndBefore, DominatorTree &DT, DomTreeUpdater &DTU){ errs()<<"cloning from "<<*BeginWith<<" up to "<<*EndBefore<<"\n"; - - auto p = splitAt(BeginWith, "split.before"); + + auto p = splitAt(BeginWith, "split.before", DTU); BasicBlock *Head = p.first; BasicBlock *Begin = p.second; - p = splitAt(EndBefore, "fuse.prep"); + p = splitAt(EndBefore, "fuse.prep", DTU); BasicBlock *Fuse = p.first; BasicBlock *End = p.second; @@ -196,6 +224,7 @@ BranchInst *cloneRegion(Instruction *BeginWith, Instruction *EndBefore, Dominato Ic->setOperand(i, A->second); //this also updates uses of A->second //check users update in A->second bool userUpdate = false; for (User *U : A->second->users()) {userUpdate = userUpdate || U == Ic; } assert(userUpdate && "user is updated on setOperand"); + if (isa(A->first)) DTU.applyUpdates(cfg::Update(cfg::UpdateKind::Insert, Cc, cast(A->second))); }else{ operandsCleanup.push_back(std::make_pair(i, Ic)); } @@ -212,6 +241,7 @@ BranchInst *cloneRegion(Instruction *BeginWith, Instruction *EndBefore, Dominato auto A = clones.find(p.second->getOperand(p.first)); if (A != clones.end()){ p.second->setOperand(p.first, A->second); + if (isa(A->first)) DTU.applyUpdates(cfg::Update(cfg::UpdateKind::Insert, p.second->getParent(), cast(A->second))); }//else did not find ==> was defined before region } //incoming blocks of phi nodes are not operands ==> handle specially @@ -228,23 +258,15 @@ BranchInst *cloneRegion(Instruction *BeginWith, Instruction *EndBefore, Dominato //change terminator of Head to be CondBr with TakeOrig as cond BranchInst *HeadBr = 
cast(Head->getTerminator()); //always BranchInst because of splitBlockBefore BasicBlock *HeadSucc = HeadBr->getSuccessor(0); + BasicBlock *HeadSuccClone = cast(clones.find(HeadSucc)->second); HeadBr->eraseFromParent(); HeadBr = BranchInst::Create( HeadSucc, //branch-cond = true -> go to non-clone (here SSR will be inserted) - cast(clones.find(HeadSucc)->second), + HeadSuccClone, ConstantInt::get(Type::getInt1Ty(HeadSucc->getContext()), 0u), Head ); - const auto edge = BasicBlockEdge(std::make_pair(Fuse, End)); - for (auto &p : clones){ - for (User *U : p.first->users()){ - auto *I = dyn_cast(U); - if (I && DT.dominates(edge, I->getParent())){ - errs()<<*I<<" makes use of "<<*p.first<<" after cloned region ==> add phi node at end!\n"; - //assert(false && "did not declare phi node for live-out value"); - } - } - } + DTU.applyUpdates(cfg::Update(cfg::UpdateKind::Insert, Head, HeadSuccClone)); //handle phi nodes in End for (Instruction &I : *End){ if (auto *Phi = dyn_cast(&I)){ @@ -263,6 +285,16 @@ BranchInst *cloneRegion(Instruction *BeginWith, Instruction *EndBefore, Dominato } } } + const auto edge = BasicBlockEdge(std::make_pair(Fuse, End)); + for (auto &p : clones){ + for (User *U : p.first->users()){ + auto *I = dyn_cast(U); + if (I && DT.dominates(edge, I->getParent())){ + errs()<<*I<<" makes use of "<<*p.first<<" after cloned region ==> add phi node at end!\n"; + //assert(false && "did not declare phi node for live-out value"); + } + } + } errs()<<"done cloning \n"; return HeadBr; @@ -277,10 +309,12 @@ Value *GenerateTCDMCheck(ExpandedAffAcc &E, Instruction *Point) { } void GenerateSSRSetup(ExpandedAffAcc &E, unsigned dmid, Instruction *Point){ + assert(Point); Module *mod = Point->getModule(); IRBuilder<> builder(Point); Type *i32 = Type::getInt32Ty(Point->getContext()); unsigned dim = E.getDimension(); + assert(dim > 0u); Constant *Dim = ConstantInt::get(i32, dim - 1U); //dimension - 1, ty=i32 Constant *DMid = ConstantInt::get(i32, dmid); //ty=i32 bool 
isStore = E.Access->isWrite(); @@ -293,6 +327,7 @@ void GenerateSSRSetup(ExpandedAffAcc &E, unsigned dmid, Instruction *Point){ }; for (unsigned i = 0u; i < dim; i++) { + errs()<<"for dim = "<<(i+1)<<"\n"; Value *Stride = E.Steps[i]; if (i > 0) Stride = builder.CreateSub(Stride, E.PrefixSumRanges[i-1], formatv("stride.{0}d", i+1)); Value *Bound = builder.CreateSub(E.Reps[i], ConstantInt::get(i32, 1u), formatv("bound.{0}d", i+1)); @@ -387,6 +422,7 @@ void generateSSREnDis(const Loop *L){ } void expandInLoop(const std::vector &accs, const Loop *L, AffineAccess &AAA) { + assert(!accs.empty()); assert(accs.size() <= NUM_SSR); assert(L); @@ -397,7 +433,16 @@ void expandInLoop(const std::vector &accs, const Loop *L, AffineAccess IntegerType *i64 = IntegerType::getInt64Ty(ctxt); Type *i8Ptr = Type::getInt8PtrTy(ctxt); - BranchInst *BR = cloneRegion(L->getLoopPreheader()->getTerminator(), &*L->getExitBlock()->getFirstInsertionPt(), AAA.getDT()); + //for some reason sometimes the loop has multiple exits but they are the same (this is the case if a CondBr has two operands to same block) + SmallVector exits; + L->getExitBlocks(exits); + BasicBlock *Ex = nullptr; + for (BasicBlock *BB : exits){ + if (!Ex) Ex = BB; + assert(Ex == BB); + } + DomTreeUpdater DTU(&AAA.getDT(), DomTreeUpdater::UpdateStrategy::Lazy); + BranchInst *BR = cloneRegion(L->getLoopPreheader()->getTerminator(), &*Ex->getFirstInsertionPt(), AAA.getDT(), DTU); //generate Stride, Bound, base addresses, and intersect checks Value *Cond = nullptr; @@ -418,15 +463,19 @@ void expandInLoop(const std::vector &accs, const Loop *L, AffineAccess } generateSSREnDis(L); + + DTU.flush(); //only change DT after everything } -bool isValid(AffAcc *A) { +bool isValid(AffAcc *A, const Loop *L) { + assert(A->isWellFormed(L)); bool valid = true; bool write = A->isWrite(); for (Instruction *I : A->getAccesses()) { if (write) valid &= CHECK_TYPE(cast(I)->getValueOperand()->getType(), I); else valid &= CHECK_TYPE(I->getType(), I); } 
+ valid &= A->loopToDimension(L) <= SSR_MAX_DIM; return valid; } @@ -435,14 +484,13 @@ void visitLoop(const Loop *L, DenseMap> &pos ArrayRef accs = AAA.getExpandableAccesses(L); std::vector valid; for (AffAcc *A : accs) { - if (isValid(A)) valid.push_back(A); + if (isValid(A, L)) valid.push_back(A); } - if (valid.empty()) return; //sort by dimension (with read beeing preferred over write) auto comp = [L](const AffAcc *A, const AffAcc *B) { unsigned dimA = A->loopToDimension(L); unsigned dimB = B->loopToDimension(L); - return dimA < dimB || (dimA == dimB && !(A->isWrite() && !B->isWrite())); + return dimA < dimB || (dimA == dimB && (!A->isWrite() && B->isWrite())); }; std::sort(valid.begin(), valid.end(), comp); //add possible: @@ -483,12 +531,14 @@ PreservedAnalyses SSRGenerationPass::run(Function &F, FunctionAnalysisManager &F //expand them for (const Loop *L : best) { - expandInLoop(possible.find(L)->getSecond(), L, AAA); + auto &acc = possible.find(L)->getSecond(); + if (!acc.empty()) expandInLoop(acc, L, AAA); } changed |= !best.empty(); } - if (changed) return PreservedAnalyses::all(); + if (!changed) return PreservedAnalyses::all(); + F.addFnAttr(Attribute::AttrKind::NoInline); return PreservedAnalyses::none(); } \ No newline at end of file From cdc33bd2549cf2c556eb8fdc643a63ea8c5637c1 Mon Sep 17 00:00:00 2001 From: thrupf Date: Mon, 6 Jun 2022 11:43:41 +0200 Subject: [PATCH 33/47] 7/7 working --- .../llvm/Analysis/AffineAccessAnalysis.h | 3 ++- llvm/lib/Analysis/AffineAccessAnalysis.cpp | 15 ++++++----- llvm/lib/Transforms/SSR/SSRGeneration.cpp | 27 ++++++++++++------- 3 files changed, 28 insertions(+), 17 deletions(-) diff --git a/llvm/include/llvm/Analysis/AffineAccessAnalysis.h b/llvm/include/llvm/Analysis/AffineAccessAnalysis.h index 8539569cb198e..e807343e982d3 100644 --- a/llvm/include/llvm/Analysis/AffineAccessAnalysis.h +++ b/llvm/include/llvm/Analysis/AffineAccessAnalysis.h @@ -29,6 +29,7 @@ struct LoopRep{ const Loop *L; const SCEV *RepSCEV; Value 
*Rep = nullptr; + Value *RepPlusOne = nullptr; SmallVector containingLoops; //from inner- to outermost unsigned safeExpandBound; //exclusive bound @@ -114,7 +115,7 @@ struct ExpandedAffAcc { const SmallVector PrefixSumRanges; Value *const LowerBound; Value *const UpperBound; - unsigned getDimension() const { return Steps.size(); } + unsigned getDimension() const { return Steps.size(); } //returns the nr of steps/reps/etc... there are ExpandedAffAcc (AffAcc *A, Value *Addr, ArrayRef Steps, ArrayRef Reps, ArrayRef Ranges, ArrayRef PSRanges, Value *LowerBound, Value *UpperBound) : Access(A), Addr(Addr), Steps(Steps.begin(), Steps.end()), Reps(Reps.begin(), Reps.end()), diff --git a/llvm/lib/Analysis/AffineAccessAnalysis.cpp b/llvm/lib/Analysis/AffineAccessAnalysis.cpp index 5d740a802ffe7..427d664708b4c 100644 --- a/llvm/lib/Analysis/AffineAccessAnalysis.cpp +++ b/llvm/lib/Analysis/AffineAccessAnalysis.cpp @@ -376,19 +376,21 @@ Value *LoopRep::expandAt(Type *ty, Instruction *InsertBefore){ return Rep; } InsertBefore = InsertBefore ? 
InsertBefore : L->getLoopPreheader()->getTerminator(); - const SCEV *RepPlusOne = getSCEVPlusOne(); - assert(isSafeToExpandAt(RepPlusOne, InsertBefore, SE) && "bound not expanable here"); + const SCEV *RepP1 = getSCEVPlusOne(); //we go over the +1 version here because getSCEV() is usually sth like %n-1 so then this becomes just %n + assert(isSafeToExpandAt(RepP1, InsertBefore, SE) && "bound not expandable here"); SCEVExpander ex(SE, L->getHeader()->getModule()->getDataLayout(), "rep"); ex.setInsertPoint(InsertBefore); - Rep = castToSize(ex.expandCodeFor(RepPlusOne), ty, InsertBefore); + RepPlusOne = castToSize(ex.expandCodeFor(RepP1), ty, InsertBefore); + IRBuilder<> builder(InsertBefore); + Rep = builder.CreateSub(RepPlusOne, ConstantInt::get(ty, 1u), "rep"); return Rep; } Value *LoopRep::expandLoopGuard(Instruction *InsertBefore) { - assert(Rep && "expandAt has to be called before this"); + assert(RepPlusOne && "expandAt has to be called before this"); InsertBefore = InsertBefore ? InsertBefore : L->getLoopPreheader()->getTerminator(); IRBuilder<> builder(InsertBefore); - return builder.CreateICmpSGT(Rep, ConstantInt::get(Rep->getType(), 0u, true)); //FIXME: this only works for unsigned Rep's that are < 2^30 (for i32) + return builder.CreateICmpSGT(RepPlusOne, ConstantInt::get(Rep->getType(), 0u, true)); //FIXME: this only works for unsigned Rep's that are < 2^30 (for i32) } // ==== AffAcc ==== @@ -619,7 +621,8 @@ Value *AffAcc::expandRep(unsigned dimension, Type *ty, Instruction *InsertBefore ExpandedAffAcc AffAcc::expandAt(const Loop *L, Instruction *Point, Type *PtrTy, IntegerType *ParamTy, IntegerType *AgParamTy) { - errs()<<"expanding for Loop with header: "<getHeader()->getNameOrAsOperand()<<"\n"; + errs()<<"expanding for Loop with header: "<getHeader()->getNameOrAsOperand()<<" the following:\n"; + dumpInLoop(L); if (!Point) Point = L->getLoopPreheader()->getTerminator(); IRBuilder<> builder(Point); assert(isWellFormed(L)); diff --git 
a/llvm/lib/Transforms/SSR/SSRGeneration.cpp b/llvm/lib/Transforms/SSR/SSRGeneration.cpp index a075ecacdfa44..d6c85bd6cf1e9 100644 --- a/llvm/lib/Transforms/SSR/SSRGeneration.cpp +++ b/llvm/lib/Transforms/SSR/SSRGeneration.cpp @@ -314,7 +314,8 @@ void GenerateSSRSetup(ExpandedAffAcc &E, unsigned dmid, Instruction *Point){ IRBuilder<> builder(Point); Type *i32 = Type::getInt32Ty(Point->getContext()); unsigned dim = E.getDimension(); - assert(dim > 0u); + errs()<<"SSR Setup for stream with dim = "<isWrite(); @@ -327,10 +328,9 @@ void GenerateSSRSetup(ExpandedAffAcc &E, unsigned dmid, Instruction *Point){ }; for (unsigned i = 0u; i < dim; i++) { - errs()<<"for dim = "<<(i+1)<<"\n"; Value *Stride = E.Steps[i]; if (i > 0) Stride = builder.CreateSub(Stride, E.PrefixSumRanges[i-1], formatv("stride.{0}d", i+1)); - Value *Bound = builder.CreateSub(E.Reps[i], ConstantInt::get(i32, 1u), formatv("bound.{0}d", i+1)); + Value *Bound = E.Reps[i]; Function *SSRBoundStrideSetup = Intrinsic::getDeclaration(mod, functions[i]); std::array bsargs = {DMid, Bound, Stride}; builder.CreateCall(SSRBoundStrideSetup->getFunctionType(), SSRBoundStrideSetup, ArrayRef(bsargs))->dump(); @@ -415,9 +415,6 @@ void generateSSREnDis(const Loop *L){ errs()<<"generated ssr_enable and ssr_disable\n"; - L->getLoopPreheader()->getSinglePredecessor()->dump(); - L->getLoopPreheader()->dump(); - return; } @@ -433,6 +430,13 @@ void expandInLoop(const std::vector &accs, const Loop *L, AffineAccess IntegerType *i64 = IntegerType::getInt64Ty(ctxt); Type *i8Ptr = Type::getInt8PtrTy(ctxt); + //generate Stride, Bound, base addresses, and intersect checks + Value *Cond = nullptr; + auto exp = AAA.expandAllAt(accs, L, L->getLoopPreheader()->getTerminator(), Cond, i8Ptr, i32, i64); + assert(Cond); + + //errs()<<"expanded All in Loop Preheader:\n"; L->getLoopPreheader()->dump(); + //for some reason sometimes the loop has multiple exits but they are the same (this is the case if a CondBr has two operands to same block) 
SmallVector exits; L->getExitBlocks(exits); @@ -444,10 +448,7 @@ void expandInLoop(const std::vector &accs, const Loop *L, AffineAccess DomTreeUpdater DTU(&AAA.getDT(), DomTreeUpdater::UpdateStrategy::Lazy); BranchInst *BR = cloneRegion(L->getLoopPreheader()->getTerminator(), &*Ex->getFirstInsertionPt(), AAA.getDT(), DTU); - //generate Stride, Bound, base addresses, and intersect checks - Value *Cond = nullptr; - auto exp = AAA.expandAllAt(accs, L, BR, Cond, i8Ptr, i32, i64); - assert(Cond); + //errs()<<"done cloning, Loop Preheader:\n"; L->getLoopPreheader()->dump(); //TCDM Checks IRBuilder<> builder(BR); @@ -538,6 +539,12 @@ PreservedAnalyses SSRGenerationPass::run(Function &F, FunctionAnalysisManager &F changed |= !best.empty(); } + /* + errs()<<"dumping Module ============================\n"; + F.getParent()->dump(); + errs()<<"done dumping Module ============================\n"; + */ + if (!changed) return PreservedAnalyses::all(); F.addFnAttr(Attribute::AttrKind::NoInline); return PreservedAnalyses::none(); From 880928ccbe8a36067d5d36dc38b32a606e17eb78 Mon Sep 17 00:00:00 2001 From: thrupf Date: Sat, 11 Jun 2022 22:05:55 +0200 Subject: [PATCH 34/47] minor refactor --- .../llvm/Analysis/AffineAccessAnalysis.h | 15 +- llvm/lib/Analysis/AffineAccessAnalysis.cpp | 354 ++++++++++++------ llvm/lib/Transforms/SSR/SSRGeneration.cpp | 242 ++++++++---- 3 files changed, 419 insertions(+), 192 deletions(-) diff --git a/llvm/include/llvm/Analysis/AffineAccessAnalysis.h b/llvm/include/llvm/Analysis/AffineAccessAnalysis.h index e807343e982d3..dab3969cbe8f8 100644 --- a/llvm/include/llvm/Analysis/AffineAccessAnalysis.h +++ b/llvm/include/llvm/Analysis/AffineAccessAnalysis.h @@ -61,14 +61,16 @@ struct AffAcc{ SmallVector containingLoops; //from inner- to outermost DenseMap> conflicts; void findSteps(const SCEV *A, const SCEV *Factor, unsigned loop); + AffAccConflict fromConflictPair(const detail::DenseMapPair> &p, const Loop *L) const; public: AffAcc() = delete; - 
//immediately copies the contens of accesses and containingLoops + ///immediately copies the contens of accesses and containingLoops AffAcc(ArrayRef accesses, const SCEV *Addr, MemoryUseOrDef *MA, ArrayRef containingLoops, ScalarEvolution &SE); ArrayRef getAccesses() const; bool isWrite() const; - unsigned getMaxDimension() const; + int getMaxDimension() const; + const Loop *getDeepestMalformed() const; bool isWellFormed(unsigned dimension) const; bool isWellFormed(const Loop *L) const; bool canExpandBefore(const Loop *L) const; @@ -76,11 +78,13 @@ struct AffAcc{ void dumpInLoop(const Loop *L) const; unsigned loopToDimension(const Loop *L) const; const SCEV *getBaseAddr(unsigned dim) const; + const SCEV *getBaseAddr(const Loop *L) const; const SCEV *getStep(unsigned dim) const; const SCEV *getRep(unsigned dim) const; const Loop *getLoop(unsigned dim) const; ArrayRef getContainingLoops() const; AffAccConflict getConflict(AffAcc *A, const Loop *L) const; + std::vector> getConflicts(const Loop *L) const; MemoryUseOrDef *getMemoryAccess(); void addConflict(AffAcc *A, const Loop *StartL, AffAccConflict kind); @@ -101,7 +105,7 @@ struct MemDep { public: MemDep(MemorySSA &MSSA, AAResults &AA) : MSSA(MSSA), AA(AA) {} - DenseSet findClobbers(MemoryUseOrDef *MA); + DenseSet findClobbers(MemoryUseOrDef *MA); DenseSet findClobberUsers(MemoryDef *MA); }; @@ -138,7 +142,8 @@ class AffineAccess{ std::vector analyze(const Loop *Parent, ArrayRef loopPath); void addAllConflicts(const std::vector &all); - AffAccConflict getRWConflict(AffAcc *Read, AffAcc *Write, const Loop *L); + AffAccConflict calcRWConflict(AffAcc *Read, AffAcc *Write, const Loop *L) const; + std::pair calcConflict(AffAcc *A, AffAcc *B) const; public: AffineAccess(Function &F, ScalarEvolution &SE, DominatorTree &DT, LoopInfo &LI, MemorySSA &MSSA, AAResults &AA); @@ -154,7 +159,7 @@ class AffineAccess{ ArrayRef getExpandableAccesses(const Loop *L); std::vector expandAllAt(ArrayRef Accs, const Loop *L, Instruction 
*Point, - Value *&BoundCheck, Type *PtrTy, IntegerType *ParamTy, IntegerType *AgParamTy); + Value *&BoundCheck, Type *PtrTy, IntegerType *ParamTy, IntegerType *AgParamTy, bool conflictChecks = true, bool repChecks = false); }; class AffineAccessAnalysis : public AnalysisInfoMixin { diff --git a/llvm/lib/Analysis/AffineAccessAnalysis.cpp b/llvm/lib/Analysis/AffineAccessAnalysis.cpp index 427d664708b4c..0393a635e68ac 100644 --- a/llvm/lib/Analysis/AffineAccessAnalysis.cpp +++ b/llvm/lib/Analysis/AffineAccessAnalysis.cpp @@ -236,7 +236,6 @@ Optional predicatedICmpOutcome(ICmpInst *Cmp, const SCEV *Rep, ScalarEvolu //predicate is that Rep > 0 bool isOnAllPredicatedControlFlowPaths(BasicBlock *BB, const Loop *L, const DominatorTree &DT, const SCEV *Rep, ScalarEvolution &SE){ if (isOnAllControlFlowPaths(BB, L, DT)) return true; //is on all paths anyway - Rep->dump(); DenseSet vis; //visited set std::deque q(1U, L->getHeader()); //iterative BFS with queue while (!q.empty()){ @@ -305,7 +304,7 @@ const Loop *findFirstContaining(ArrayRef loops, BasicBlock *BB){ return nullptr; } -Value *goodAnd(IRBuilder<> &builder, ArrayRef bools){ +/*Value *goodAnd(IRBuilder<> &builder, ArrayRef bools){ assert(!bools.empty()); std::vector b1, b2; for (Value *b : bools) b1.push_back(b); @@ -319,10 +318,37 @@ Value *goodAnd(IRBuilder<> &builder, ArrayRef bools){ b2.clear(); } return b1[0]; //return the last value -} +}*/ bool hasMemInst(MemoryUseOrDef *MA) { return MA && MA->getMemoryInst(); } +//updates L<-M if M is a descendant of L (or if L is nullptr) +void updateIfDescendant(const Loop *&L, const Loop *M) { + if (!L || (M && L->contains(M))) L = M; +} + +//updates L<-M if L is descendant of M OR if M is nullptr +void updateIfAncestor(const Loop *&L, const Loop *M) { + if (!M || M->contains(L)) L = M; +} + +void updateOutermostExpandableExcl(const Loop *&outerMostExpandableExl, AffAccConflict kind, const Loop *innermostCommon, const Loop *deepestMalformed) { + switch (kind) { + case 
AffAccConflict::NoConflict: + break; + case AffAccConflict::MustNotIntersect: + errs()<<"must-not-intersect\n"; + updateIfAncestor(innermostCommon, deepestMalformed); //updates innermostCommon to deepestMalformed if that one is less "deep" + LLVM_FALLTHROUGH; + case AffAccConflict::Bad: + errs()<<"bad\n"; + updateIfDescendant(outerMostExpandableExl, innermostCommon); + break; + default: + llvm_unreachable("unknown conflict type"); + } +} + } //end of namespace //================== =========================================================== @@ -399,13 +425,18 @@ AffAcc::AffAcc(ArrayRef accesses, const SCEV *Addr, MemoryUseOrDe { assert(!accesses.empty()); assert(MA); - if (Addr && SCEVContainsCouldNotCompute(Addr)) Addr = nullptr; //set to null if contains SCEVCouldNotCompute - baseAddresses.push_back(Addr); - steps.push_back((const SCEV *)nullptr); //there is no step for dim=0 - reps.push_back((LoopRep *)nullptr); //there is no rep for dim=0 + containingLoops.push_back((const Loop *)nullptr); //there is no loop for dim=0 containingLoops.append(contLoops.begin(), contLoops.end()); + + bool isVolatile = false; + for (Instruction *I : accesses) + isVolatile |= (isa(I) && cast(I)->isVolatile()) || (isa(I) && cast(I)->isVolatile()); + if (Addr && (SCEVContainsCouldNotCompute(Addr) || isVolatile)) Addr = nullptr; //set to null if contains SCEVCouldNotCompute + baseAddresses.push_back(Addr); if (!Addr) return; //do not look for steps or addresses if SCEV of address is unknown + steps.push_back((const SCEV *)nullptr); //there is no step for dim=0 + reps.push_back((LoopRep *)nullptr); //there is no rep for dim=0 findSteps(Addr, (const SCEV *)nullptr, 1u); //find steps for (unsigned dim = 1; dim < containingLoops.size(); dim++){ Addr = SE.SplitIntoInitAndPostInc(containingLoops[dim], Addr).first; @@ -477,10 +508,35 @@ void AffAcc::findSteps(const SCEV *A, const SCEV *Factor, unsigned loop){ } ArrayRef AffAcc::getAccesses() const { return accesses; } + bool AffAcc::isWrite() 
const { return isa(MA); } -unsigned AffAcc::getMaxDimension() const { return reps.size() - 1u; } -bool AffAcc::isWellFormed(unsigned dimension) const { return dimension <= getMaxDimension() && baseAddresses[0]; } + +///the nr of times `this` was promoted (-1 means the address is not known) +int AffAcc::getMaxDimension() const { return (int)reps.size() - 1; } + +///return the first (as in deepest) Loop L where this->isWellFormed(L) is false +///returns null if there is no such loop +const Loop *AffAcc::getDeepestMalformed() const { + for (const Loop *L : containingLoops) { + if (L && !isWellFormed(L)) return L; + } + return nullptr; + /*unsigned malformedStart = (unsigned)(getMaxDimension() + 2); //getMaxDimension() >= -1 + if (containingLoops.size() > malformedStart) return containingLoops[malformedStart]; + else return nullptr;*/ +} + +///true if this was successfully promoted to the given dimension (ie. nr of promotions is at least `dimension`) +bool AffAcc::isWellFormed(unsigned dimension) const { + int md = getMaxDimension(); + return md >= 0 && dimension <= (unsigned)md; +} + +///true if this was successfully promoted to the given dimension (ie. 
nr of promotions is `dimension`) +///if true, this means that `this` can be expanded in the preheader of `L` bool AffAcc::isWellFormed(const Loop *L) const { return isWellFormed(loopToDimension(L)); } + +///returns the dimension that is defined by `L` (starts at 1) unsigned AffAcc::loopToDimension(const Loop *L) const { assert(L && "L not nullptr"); for (unsigned d = 1u; d < containingLoops.size(); d++){ //FIXME: linear search -> improve with a map @@ -488,23 +544,36 @@ unsigned AffAcc::loopToDimension(const Loop *L) const { } llvm_unreachable("The provided loop does not contain `this`!"); } -bool AffAcc::canExpandBefore(const Loop *L) const { return isWellFormed(loopToDimension(L)); } + +///SCEV of base Address for the base address at a given dimension const SCEV *AffAcc::getBaseAddr(unsigned dim) const { assert(dim < baseAddresses.size()); return baseAddresses[dim]; } + +///SCEV of base Address outside of `L` +const SCEV *AffAcc::getBaseAddr(const Loop *L) const { return getBaseAddr(loopToDimension(L)); } + +///SCEV of step for the dimension `dim` (that means there is no step for `dim` = 0) const SCEV *AffAcc::getStep(unsigned dim) const { assert(dim < steps.size()); return steps[dim]; } + +///SCEV of rep for the dimension `dim` (that means there is no rep for `dim` = 0) const SCEV *AffAcc::getRep(unsigned dim) const { assert(dim < reps.size()); if (!reps[dim] || !reps[dim]->isAvailable()) return nullptr; return reps[dim]->getSCEV(); } + +///get Loop for given `dim` (that means there is no Loop for `dim` = 0) const Loop *AffAcc::getLoop(unsigned dim) const { assert(dim < containingLoops.size()); return containingLoops[dim]; } + +///get containing loops from inner- to outermost ArrayRef AffAcc::getContainingLoops() const { return ArrayRef(containingLoops); } + void AffAcc::dumpInLoop(const Loop *L) const { errs()<<"Affine Access of \n"; - unsigned dimension; - if (L) dimension = loopToDimension(L); - else dimension = getMaxDimension(); + int dimension = 
getMaxDimension(); + if (L) dimension = std::min((int)loopToDimension(L), dimension); for (auto *I : accesses) errs()<<*I<<"\n"; - for (unsigned dim = 0u; dim <= dimension && dim <= getMaxDimension(); dim++){ + if (dimension < 0) errs()<<"\t\n"; + for (int dim = 0; dim <= dimension && dim <= getMaxDimension(); dim++){ const SCEV *s = getStep(dim); const SCEV *r = getRep(dim); const SCEV *a = getBaseAddr(dim); @@ -526,26 +595,46 @@ void AffAcc::dumpInLoop(const Loop *L) const { errs()<<"\n"; } } + void AffAcc::dump() const { dumpInLoop(nullptr); } + +AffAccConflict AffAcc::fromConflictPair(const detail::DenseMapPair> &p, const Loop *L) const { + const Loop *S = p.getSecond().first; + if (S == L || L->contains(S)) { //if start is L or more "inner" loop + if (!isWellFormed(L) || !p.first->isWellFormed(L)) return AffAccConflict::Bad; //if either is not well-formed "demote" the conflict to bad (but only if exists) + return p.getSecond().second; + } + return AffAccConflict::NoConflict; +} + AffAccConflict AffAcc::getConflict(AffAcc *A, const Loop *L) const { auto p = conflicts.find(A); if (p != conflicts.end()) { - const Loop *S = p->getSecond().first; - if (S == L || L->contains(S)) { //if start is L or more "inner" loop - if (!isWellFormed(L) || !A->isWellFormed(L)) return AffAccConflict::Bad; //if either is not well-formed "demote" the conflict to bad (but only if exists) - return p->getSecond().second; - } + return fromConflictPair(*p, L); } return AffAccConflict::NoConflict; } +std::vector> AffAcc::getConflicts(const Loop *L) const { + std::vector> res; + for (const auto &p : conflicts) { + assert(p.first); + assert(p.getSecond().first); + AffAccConflict kind = fromConflictPair(p, L); + if (kind != AffAccConflict::NoConflict) res.push_back(std::make_pair(p.first, kind)); + } + return res; +} + MemoryUseOrDef *AffAcc::getMemoryAccess() { return MA; } + void AffAcc::addConflict(AffAcc *A, const Loop *StartL, AffAccConflict kind){ + assert(StartL); 
assert(conflicts.find(A) == conflicts.end() && "no conflict for A yet"); assert(kind == AffAccConflict::Bad || (isWellFormed(StartL) && A->isWellFormed(StartL))); conflicts.insert(std::make_pair(A, std::make_pair(StartL, kind))); - /*errs()<<"conflict for:\n"; dumpInLoop(StartL); errs()<<"with:\n"; A->dumpInLoop(StartL); errs()<<"is ===========>"; - switch (kind) + errs()<<"conflict for:\n"; dumpInLoop(StartL); errs()<<"with:\n"; A->dumpInLoop(StartL); errs()<<"is ===> "; + /*switch (kind) { case AffAccConflict::Bad: errs()<<"Bad"; @@ -561,27 +650,27 @@ void AffAcc::addConflict(AffAcc *A, const Loop *StartL, AffAccConflict kind){ } errs()<<"\n"; */ } + bool AffAcc::promote(LoopRep *LR){ if (!LR->isAvailable()) return false; - unsigned newDim = getMaxDimension() + 1u; - if (!isWellFormed(getMaxDimension())) return false; + unsigned newDim = (unsigned)(getMaxDimension() + 1); //getMaxDimension() >= -1 if (getLoop(newDim) != LR->getLoop()) return false; errs()<<"promote: (1) loops match, "; bool possible = true; Instruction *Point = LR->getLoop()->getLoopPreheader()->getTerminator(); //check all current reps and steps - for (unsigned dim = 1u; possible && dim < newDim; dim++){ - possible = possible && isSafeToExpandAt(getStep(dim), Point, SE); - possible = possible && reps[dim]->isSafeToExpandBefore(LR->getLoop()); + for (int dim = 1; possible && dim < getMaxDimension(); dim++){ + possible &= isSafeToExpandAt(getStep(dim), Point, SE); + possible &= reps[dim]->isSafeToExpandBefore(LR->getLoop()); } - if (possible) errs()<<"(2) current rep & step can be expanded, "; + if (possible) errs()<<"can expand (2) current rep & step, "; //check rep and step of new dimension possible &= steps.size() > newDim && isSafeToExpandAt(getStep(newDim), Point, SE); possible &= LR->isSafeToExpandBefore(LR->getLoop()); - if (possible) errs()<<"(3) new rep & step can be expanded, "; + if (possible) errs()<<"(3) new rep & step, "; //check base address possible &= 
!SCEVContainsCouldNotCompute(getBaseAddr(newDim)) && isSafeToExpandAt(getBaseAddr(newDim), Point, SE); - if (possible) errs()<<"(4) new base addr can be expanded"; + if (possible) errs()<<"and (4) new base addr!"; errs()<<"\n"; if (!possible) return false; @@ -604,6 +693,7 @@ Value *AffAcc::expandBaseAddr(unsigned dimension, Type *ty, Instruction *InsertB ex.setInsertPoint(InsertBefore); return castToSize(ex.expandCodeFor(getBaseAddr(dimension)), ty, InsertBefore); } + Value *AffAcc::expandStep(unsigned dimension, Type *ty, Instruction *InsertBefore){ assert(isWellFormed(dimension) && dimension > 0u); InsertBefore = InsertBefore ? InsertBefore : reps[dimension]->getLoop()->getLoopPreheader()->getTerminator(); @@ -612,17 +702,20 @@ Value *AffAcc::expandStep(unsigned dimension, Type *ty, Instruction *InsertBefor ex.setInsertPoint(InsertBefore); return castToSize(ex.expandCodeFor(getStep(dimension)), ty, InsertBefore); } + Value *AffAcc::expandRep(unsigned dimension, Type *ty, Instruction *InsertBefore){ assert(isWellFormed(dimension) && dimension > 0u); InsertBefore = InsertBefore ? 
InsertBefore : reps[dimension]->getLoop()->getLoopPreheader()->getTerminator(); assert(isSafeToExpandAt(getRep(dimension), InsertBefore, SE) && "data not expanable here (note: only preheader guaranteed)"); return reps[dimension]->expandAt(ty, InsertBefore); } + ExpandedAffAcc AffAcc::expandAt(const Loop *L, Instruction *Point, Type *PtrTy, IntegerType *ParamTy, IntegerType *AgParamTy) { errs()<<"expanding for Loop with header: "<getHeader()->getNameOrAsOperand()<<" the following:\n"; dumpInLoop(L); + if (!Point) Point = L->getLoopPreheader()->getTerminator(); IRBuilder<> builder(Point); assert(isWellFormed(L)); @@ -655,8 +748,8 @@ bool MemDep::alias(MemoryUseOrDef *A, MemoryUseOrDef *B) { else return alias(getAddress(A), getAddress(B)); } -DenseSet MemDep::findClobbers(MemoryUseOrDef *MA){ - DenseSet res; +DenseSet MemDep::findClobbers(MemoryUseOrDef *MA){ + DenseSet res; std::deque worklist; DenseSet vis; worklist.push_back(MA->getDefiningAccess()); @@ -772,70 +865,46 @@ std::vector AffineAccess::analyze(const Loop *Parent, ArrayRef &all) { for (AffAcc *A : all) { assert(A); - ArrayRef loops = A->getContainingLoops(); - const Loop *outerMostExpandableExl = nullptr; + const Loop *outerMostExpandableExl = A->getDeepestMalformed(); + DenseSet c; if (A->isWrite()){ - MemoryDef *MA = cast(A->getMemoryAccess()); - const DenseSet &cu = MD.findClobberUsers(MA); - for (MemoryUseOrDef *D : cu) { - if (MA == D || !hasMemInst(D)) continue; - const Loop *innermostCommon = findFirstContaining(loops, D->getBlock()); - if (!innermostCommon) continue; - auto p = access.find(D); - if (p == access.end()) continue; //no AffAcc for D ==> skip - AffAcc *B = p->second; - AffAccConflict kind = AffAccConflict::Bad; - if (A->isWellFormed(innermostCommon) && B->isWellFormed(innermostCommon)) { - if (B->isWrite()) kind = AffAccConflict::MustNotIntersect; //WaW - else kind = getRWConflict(B, A, innermostCommon); - } - //at this point, even if the two do not alias, we assume the chance is high 
that they do at runtime - //if their base addresses share some SCEVUnknowns (ie. some Value's) (FIXME: CONSERVATIVE) - if (kind == AffAccConflict::MustNotIntersect){ - if (shareValues(A->getBaseAddr(A->loopToDimension(innermostCommon)), B->getBaseAddr(B->loopToDimension(innermostCommon)))) - kind = AffAccConflict::Bad; - } - if (kind != AffAccConflict::NoConflict) A->addConflict(B, innermostCommon, kind); - if (kind == AffAccConflict::Bad) outerMostExpandableExl = innermostCommon; - } + c = MD.findClobberUsers(cast(A->getMemoryAccess())); } else { - MemoryUseOrDef *MA = A->getMemoryAccess(); - const DenseSet &cs = MD.findClobbers(MA); - for (MemoryDef *D : cs) { - if (MA == D || !hasMemInst(D)) continue; - const Loop *innermostCommon = findFirstContaining(loops, D->getBlock()); - if (!innermostCommon) continue; - auto p = access.find(D); - if (p == access.end()) continue; //no AffAcc for D ==> skip - AffAcc *B = p->second; - AffAccConflict kind = getRWConflict(A, B, innermostCommon); - //at this point, even if the two do not alias, we assume the chance is high that they do at runtime - //if their base addresses share some SCEVUnknowns (ie. 
some Value's) (FIXME: CONSERVATIVE) - if (kind == AffAccConflict::MustNotIntersect){ - if (shareValues(A->getBaseAddr(A->loopToDimension(innermostCommon)), B->getBaseAddr(B->loopToDimension(innermostCommon)))) - kind = AffAccConflict::Bad; - } - if (kind != AffAccConflict::NoConflict) A->addConflict(B, innermostCommon, kind); - if (kind == AffAccConflict::Bad) outerMostExpandableExl = innermostCommon; - } + c = MD.findClobbers(A->getMemoryAccess()); } + for (MemoryUseOrDef *D : c) { + if (A->getMemoryAccess() == D || !hasMemInst(D)) continue; + auto p = access.find(D); + if (p == access.end()) continue; + AffAcc *B = p->second; + AffAcc *x = A; //copy, TODO: remove this check + auto r = calcConflict(A, B); + assert(A == x && "swap does not affect"); + if (r.first != AffAccConflict::NoConflict) A->addConflict(B, r.second, r.first); + updateOutermostExpandableExcl(outerMostExpandableExl, r.first, r.second, B->getDeepestMalformed()); + assert(!outerMostExpandableExl || outerMostExpandableExl->contains(A->getMemoryAccess()->getBlock())); + } + ArrayRef loops = A->getContainingLoops(); for (const Loop *L : loops) { if (!L) continue; if (L == outerMostExpandableExl) break; - if (!A->isWellFormed(L)) break; + if (!(!L || A->isWellFormed(L))){ + errs()<<"HERE\n"; + if (L) L->dump(); + if (outerMostExpandableExl) outerMostExpandableExl->dump(); + A->dump(); + } + assert(!L || A->isWellFormed(L)); auto p = expandableAccesses.find(L); if (p == expandableAccesses.end()){ - SmallVector l; - l.push_back(A); - expandableAccesses.insert(std::make_pair(L, std::move(l))); - } else { - p->getSecond().push_back(A); - } + p = expandableAccesses.insert(std::make_pair(L, SmallVector())).first; + } + p->getSecond().push_back(A); } } } -AffAccConflict AffineAccess::getRWConflict(AffAcc *Read, AffAcc *Write, const Loop *L) { +AffAccConflict AffineAccess::calcRWConflict(AffAcc *Read, AffAcc *Write, const Loop *L) const { assert(!Read->isWrite()); assert(Write->isWrite()); if 
(!L->contains(Read->getMemoryAccess()->getBlock()) || !L->contains(Write->getMemoryAccess()->getBlock())) return AffAccConflict::NoConflict; @@ -848,17 +917,42 @@ AffAccConflict AffineAccess::getRWConflict(AffAcc *Read, AffAcc *Write, const Lo kind = AffAccConflict::MustNotIntersect; } else { //read dominates write ==> WaR kind = AffAccConflict::MustNotIntersect; - //exception we know that the store always happens to a position already written from if the store is to same address as write (FIXME: CONSERVATIVE) + //exception: we know that the store always happens to a position already written from if the store is to same address as write (FIXME: CONSERVATIVE) if ((Addr && DAddr && AA.alias(Addr, DAddr) == MustAlias) || accessPatternsAndAddressesMatch(Read, Write, L)) { kind = AffAccConflict::NoConflict; } } - return kind; } +///returns the kind of conflict (and innermost common loop) that A and B have assuming there is some memory dependency +///does not check for the memory dependency itself for to peformance +std::pair AffineAccess::calcConflict(AffAcc *A, AffAcc *B) const { + assert((A->isWrite() || B->isWrite()) && "conflict between two reads ???"); + const Loop *const innermostCommon = findFirstContaining(A->getContainingLoops(), B->getMemoryAccess()->getBlock()); + if (!innermostCommon) return std::make_pair(AffAccConflict::NoConflict, innermostCommon); + if (!A->isWrite()) std::swap(A, B); //we know at least one of them is write, swap so that one is A + AffAccConflict kind = AffAccConflict::Bad; //assume Bad at beginning + if (A->isWellFormed(innermostCommon) && B->isWellFormed(innermostCommon)) { + if (B->isWrite()) kind = AffAccConflict::MustNotIntersect; //WaW + else kind = calcRWConflict(B, A, innermostCommon); //B is read and A is write + } + //at this point, even if the two do not alias, we assume the chance is high that they do at runtime + //if their base addresses share some SCEVUnknowns (ie. 
some Value's) (FIXME: this is CONSERVATIVE) + if (kind == AffAccConflict::MustNotIntersect){ + const Loop *L = innermostCommon->getParentLoop(); + const Loop *Last = innermostCommon; + while (L && A->isWellFormed(L) && B->isWellFormed(L)) { //traverse up the loop-tree up to the point where one of them is not wellformed anymore + Last = L; + L = L->getParentLoop(); + } + if (shareValues(A->getBaseAddr(Last), B->getBaseAddr(Last))) kind = AffAccConflict::Bad; + } + return std::make_pair(kind, innermostCommon); +} + bool AffineAccess::accessPatternsMatch(const AffAcc *A, const AffAcc *B, const Loop *L) const { unsigned dimA = A->loopToDimension(L); unsigned dimB = B->loopToDimension(L); @@ -889,54 +983,78 @@ ArrayRef AffineAccess::getExpandableAccesses(const Loop *L) { return ArrayRef(p->getSecond()); } - std::vector AffineAccess::expandAllAt(ArrayRef Accs, const Loop *L, Instruction *Point, Value *&BoundCheck, - Type *PtrTy, IntegerType *ParamTy, IntegerType *AgParamTy) + Type *PtrTy, IntegerType *ParamTy, IntegerType *AgParamTy, bool conflictChecks, bool repChecks) { assert(Point); - std::vector res; IRBuilder<> builder(Point); - for (AffAcc *A : Accs) { - res.push_back(std::move(A->expandAt(L, Point, PtrTy, ParamTy, AgParamTy))); + + DenseMap exps; + for (AffAcc *A : Accs) { //expand the requested AffAcc's + exps.insert(std::make_pair(A, std::move(A->expandAt(L, Point, PtrTy, ParamTy, AgParamTy)))); } + std::vector checks; - for (auto A = res.begin(); A != res.end(); ++A) { - for (auto B = res.begin(); B != A; ++B){ - AffAccConflict kind = std::max(A->Access->getConflict(B->Access, L), B->Access->getConflict(A->Access, L)); - switch (kind) - { - case AffAccConflict::NoConflict: - break; //nothing to add - case AffAccConflict::MustNotIntersect: { - Value *x = builder.CreateICmpULT(A->UpperBound, B->LowerBound, "no.inter.ab"); - Value *y = builder.CreateICmpULT(B->UpperBound, A->LowerBound, "no.inter.ba"); - checks.push_back(builder.CreateOr(x, y, "no.intersect")); - 
break; - } - case AffAccConflict::Bad: - assert(false && "cannot expand the given access because some of them have a bad conflict!"); - break; - default: - llvm_unreachable("unknown conflict type"); + if (conflictChecks) { + DenseSet done; //keep track of which were done to not make duplicate checks + for (AffAcc *A : Accs) { + auto conflicts = A->getConflicts(L); //get all AffAcc's with which A conflicts + for (const auto &p : conflicts) { + AffAcc *B = p.first; + if (done.find(B) != done.end()) continue; //this conflict was already handled when A was B (symmetry) + AffAccConflict kind = std::max(p.second, B->getConflict(A, L)); //take worse conflict + switch (kind) + { + case AffAccConflict::NoConflict: + break; //nothing to do + case AffAccConflict::MustNotIntersect: { + auto e = exps.find(B); + if (e == exps.end()) { //if B was not yet expanded, do that and update the iterator for the pair in exps + e = exps.insert(std::make_pair(B, std::move(B->expandAt(L, Point, PtrTy, ParamTy, AgParamTy)))).first; + } + assert(e->first == B); + ExpandedAffAcc &expB = e->getSecond(); + ExpandedAffAcc &expA = exps.find(A)->getSecond(); //guaranteed to exist + Value *x = builder.CreateICmpULT(expA.UpperBound, expB.LowerBound, "no.inter.ab"); + Value *y = builder.CreateICmpULT(expB.UpperBound, expA.LowerBound, "no.inter.ba"); + checks.push_back(builder.CreateOr(x, y, "no.intersect")); + break; + } + case AffAccConflict::Bad: + llvm_unreachable("cannot expand the given accesses because some of them have a bad conflict in L!"); + break; + default: + llvm_unreachable("unknown conflict type"); + } } } } - DenseSet loops; //find all relevant loops - for (AffAcc *A : Accs) { - for (unsigned d = 0u; d < A->loopToDimension(L); d++) { - const Loop *x = A->getLoop(d); - if (x) loops.insert(x); + + if (repChecks) { + DenseSet loops; //find all relevant loops + for (auto &p : exps) { + AffAcc *A = p.first; + for (unsigned d = 0u; d < A->loopToDimension(L); d++) { + const Loop *x = 
A->getLoop(d); + if (x) loops.insert(x); + } + } + for (const Loop *M : loops) { //generate checks for the loops + auto p = reps.find(M); + assert(p != reps.end()); + checks.push_back(p->second->expandLoopGuard(Point)); } } - for (const Loop *M : loops) { //generate checks for the loops - auto p = reps.find(M); - assert(p != reps.end()); - checks.push_back(p->second->expandLoopGuard(Point)); - } + if (checks.empty()) BoundCheck = builder.getTrue(); else BoundCheck = builder.CreateAnd(checks); + + std::vector res; + for (AffAcc *A : Accs) { + res.push_back(std::move(exps.find(A)->getSecond())); //(can move because exps not needed anymore) + } return res; } diff --git a/llvm/lib/Transforms/SSR/SSRGeneration.cpp b/llvm/lib/Transforms/SSR/SSRGeneration.cpp index d6c85bd6cf1e9..e1bac2f4e940f 100644 --- a/llvm/lib/Transforms/SSR/SSRGeneration.cpp +++ b/llvm/lib/Transforms/SSR/SSRGeneration.cpp @@ -50,18 +50,51 @@ #include #include +//immediately return in pass if false #define SSR_INFERENCE true +//flags for runtime checks (true = disabled) +#define SSR_NO_INTERSECTCHECK false +#define SSR_NO_TCDMCHECK false +#define SSR_NO_BOUNDCHECK false + +//if you feel like there is somehow still some weird reordering going on, enable these: +#define SSR_CLOBBER_REGS_FOR_PUSH false +#define SSR_CLOBBER_REGS_FOR_POP false + #define NUM_SSR 3U //NOTE: if increased too much, might need to change 1st arguments to clobberRegisters(..) #define SSR_MAX_DIM 4U + //both are inclusive! 
#define SSR_SCRATCHPAD_BEGIN 0x100000 #define SSR_SCRATCHPAD_END 0x120000 + //current state of hw: only allow doubles #define CHECK_TYPE(T, I) (T == Type::getDoubleTy(I->getParent()->getContext())) +static constexpr char SSRFnAttr[] = "SSR"; //used to tag functions that contain SSR streams + using namespace llvm; +static constexpr Intrinsic::ID riscSSRIntrinsics[] = { + Intrinsic::RISCVIntrinsics::riscv_ssr_barrier, + Intrinsic::RISCVIntrinsics::riscv_ssr_disable, + Intrinsic::RISCVIntrinsics::riscv_ssr_enable, + Intrinsic::RISCVIntrinsics::riscv_ssr_setup_repetition, + Intrinsic::RISCVIntrinsics::riscv_ssr_pop, + Intrinsic::RISCVIntrinsics::riscv_ssr_push, + Intrinsic::RISCVIntrinsics::riscv_ssr_read, + Intrinsic::RISCVIntrinsics::riscv_ssr_read_imm, + Intrinsic::RISCVIntrinsics::riscv_ssr_write, + Intrinsic::RISCVIntrinsics::riscv_ssr_write_imm, + Intrinsic::RISCVIntrinsics::riscv_ssr_setup_1d_r, + Intrinsic::RISCVIntrinsics::riscv_ssr_setup_1d_w, + Intrinsic::RISCVIntrinsics::riscv_ssr_setup_bound_stride_1d, + Intrinsic::RISCVIntrinsics::riscv_ssr_setup_bound_stride_2d, + Intrinsic::RISCVIntrinsics::riscv_ssr_setup_bound_stride_3d, + Intrinsic::RISCVIntrinsics::riscv_ssr_setup_bound_stride_4d, +}; + namespace{ template @@ -137,19 +170,20 @@ void copyPHIsFromPred(BasicBlock *BB){ for (Instruction &I : *Pred){ if (auto *Phi = dyn_cast(&I)){ PHINode *PhiC = PHINode::Create(Phi->getType(), 1u, Twine(Phi->getName()).concat(".copy"), BB->getFirstNonPHI()); - Phi->replaceAllUsesWith(PhiC); + //Phi->replaceAllUsesWith(PhiC); + Phi->replaceUsesOutsideBlock(PhiC, Pred); //all users outside of Pred are now using PhiC PhiC->addIncoming(Phi, Pred); } } } ///splits block, redirects all predecessor to first half of split, copies phi's -std::pair splitAt(Instruction *X, const Twine &name, DomTreeUpdater &DTU){ +std::pair splitAt(Instruction *X, const Twine &name){ assert(!isa(X) && "should not split at phi"); BasicBlock *Two = X->getParent(); BasicBlock *One = 
BasicBlock::Create(Two->getContext(), name, Two->getParent(), Two); Instruction *BR = BranchInst::Create(Two, One); - DTU.applyUpdates(cfg::Update(cfg::UpdateKind::Insert, One, Two)); + //DTU.applyUpdates(cfg::Update(cfg::UpdateKind::Insert, One, Two)); BasicBlock::iterator it = Two->begin(); while (it != X->getIterator()) { BasicBlock::iterator it_next = std::next(it); @@ -174,31 +208,30 @@ std::pair splitAt(Instruction *X, const Twine &name, Value *OP = T->getOperand(i); if (dyn_cast(OP) == Two){ T->setOperand(i, One); //if an operand of the terminator of a predecessor of Two points to Two it should now point to One - cfg::Update upd[]{ + /*cfg::Update upd[]{ cfg::Update(cfg::UpdateKind::Insert, T->getParent(), One), cfg::Update(cfg::UpdateKind::Delete, T->getParent(), Two), }; - DTU.applyUpdates(upd); + DTU.applyUpdates(upd);*/ } } } - copyPHIsFromPred(Two); //copy Phi's from One to Two return std::make_pair(One, Two); } ///clones code from BeginWith up to EndBefore ///assumes all cf-paths from begin lead to end (or return) ///assumes there is a phi node for each value defined in the region that will be cloned in the block of EndBefore that is live after EndBefore -BranchInst *cloneRegion(Instruction *BeginWith, Instruction *EndBefore, DominatorTree &DT, DomTreeUpdater &DTU){ +BranchInst *cloneRegion(Instruction *BeginWith, Instruction *EndBefore){ errs()<<"cloning from "<<*BeginWith<<" up to "<<*EndBefore<<"\n"; - auto p = splitAt(BeginWith, "split.before", DTU); + auto p = splitAt(BeginWith, "split.before"); BasicBlock *Head = p.first; BasicBlock *Begin = p.second; - p = splitAt(EndBefore, "fuse.prep", DTU); - BasicBlock *Fuse = p.first; + p = splitAt(EndBefore, "fuse.prep"); BasicBlock *End = p.second; + copyPHIsFromPred(End); //copy Phi's from Fuse to End std::deque q; //bfs queue q.push_back(Begin); @@ -224,7 +257,7 @@ BranchInst *cloneRegion(Instruction *BeginWith, Instruction *EndBefore, Dominato Ic->setOperand(i, A->second); //this also updates uses of 
A->second //check users update in A->second bool userUpdate = false; for (User *U : A->second->users()) {userUpdate = userUpdate || U == Ic; } assert(userUpdate && "user is updated on setOperand"); - if (isa(A->first)) DTU.applyUpdates(cfg::Update(cfg::UpdateKind::Insert, Cc, cast(A->second))); + //if (isa(A->first)) DTU.applyUpdates(cfg::Update(cfg::UpdateKind::Insert, Cc, cast(A->second))); }else{ operandsCleanup.push_back(std::make_pair(i, Ic)); } @@ -241,7 +274,7 @@ BranchInst *cloneRegion(Instruction *BeginWith, Instruction *EndBefore, Dominato auto A = clones.find(p.second->getOperand(p.first)); if (A != clones.end()){ p.second->setOperand(p.first, A->second); - if (isa(A->first)) DTU.applyUpdates(cfg::Update(cfg::UpdateKind::Insert, p.second->getParent(), cast(A->second))); + //if (isa(A->first)) DTU.applyUpdates(cfg::Update(cfg::UpdateKind::Insert, p.second->getParent(), cast(A->second))); }//else did not find ==> was defined before region } //incoming blocks of phi nodes are not operands ==> handle specially @@ -266,7 +299,7 @@ BranchInst *cloneRegion(Instruction *BeginWith, Instruction *EndBefore, Dominato ConstantInt::get(Type::getInt1Ty(HeadSucc->getContext()), 0u), Head ); - DTU.applyUpdates(cfg::Update(cfg::UpdateKind::Insert, Head, HeadSuccClone)); + //DTU.applyUpdates(cfg::Update(cfg::UpdateKind::Insert, Head, HeadSuccClone)); //handle phi nodes in End for (Instruction &I : *End){ if (auto *Phi = dyn_cast(&I)){ @@ -285,25 +318,27 @@ BranchInst *cloneRegion(Instruction *BeginWith, Instruction *EndBefore, Dominato } } } - const auto edge = BasicBlockEdge(std::make_pair(Fuse, End)); - for (auto &p : clones){ - for (User *U : p.first->users()){ - auto *I = dyn_cast(U); - if (I && DT.dominates(edge, I->getParent())){ - errs()<<*I<<" makes use of "<<*p.first<<" after cloned region ==> add phi node at end!\n"; - //assert(false && "did not declare phi node for live-out value"); - } - } - } errs()<<"done cloning \n"; return HeadBr; } +BasicBlock 
*getSingleExitBlock(const Loop *L) { + SmallVector exits; + L->getExitBlocks(exits); + BasicBlock *Ex = nullptr; + for (BasicBlock *BB : exits){ + if (!Ex) Ex = BB; + assert(Ex == BB); + } + assert(Ex); + return Ex; +} + Value *GenerateTCDMCheck(ExpandedAffAcc &E, Instruction *Point) { IRBuilder<> builder(Point); IntegerType *i64 = IntegerType::getInt64Ty(Point->getContext()); - Value *c1 = builder.CreateICmpUGE(ConstantInt::get(i64, SSR_SCRATCHPAD_BEGIN), E.LowerBound, "beg.check"); + Value *c1 = builder.CreateICmpULE(ConstantInt::get(i64, SSR_SCRATCHPAD_BEGIN), E.LowerBound, "beg.check"); Value *c2 = builder.CreateICmpULE(E.UpperBound, ConstantInt::get(i64, SSR_SCRATCHPAD_END), "end.check"); return builder.CreateAnd(c1, c2, "tcdm.check"); } @@ -344,9 +379,8 @@ void GenerateSSRSetup(ExpandedAffAcc &E, unsigned dmid, Instruction *Point){ for (Instruction *I : E.Access->getAccesses()){ std::array pusharg = {DMid, cast(I)->getValueOperand()}; builder.SetInsertPoint(I); - clobberRegisters(regs, builder); + if (SSR_CLOBBER_REGS_FOR_PUSH) clobberRegisters(regs, builder); auto *C = builder.CreateCall(SSRPush->getFunctionType(), SSRPush, ArrayRef(pusharg)); - clobberRegisters(regs, builder); C->dump(); I->dump(); I->eraseFromParent(); n_reps++; @@ -356,9 +390,8 @@ void GenerateSSRSetup(ExpandedAffAcc &E, unsigned dmid, Instruction *Point){ std::array poparg = {DMid}; for (Instruction *I : E.Access->getAccesses()){ builder.SetInsertPoint(I); - clobberRegisters(regs, builder); auto *V = builder.CreateCall(SSRPop->getFunctionType(), SSRPop, ArrayRef(poparg), "ssr.pop"); - clobberRegisters(regs, builder); + if (SSR_CLOBBER_REGS_FOR_POP) clobberRegisters(regs, builder); V->dump(); I->dump(); BasicBlock::iterator ii(I); ReplaceInstWithValue(I->getParent()->getInstList(), ii, V); @@ -391,9 +424,9 @@ void GenerateSSRSetup(ExpandedAffAcc &E, unsigned dmid, Instruction *Point){ } ///generates SSR enable & disable calls -void generateSSREnDis(const Loop *L){ - IRBuilder<> 
builder(L->getLoopPreheader()->getTerminator()); // ----------- in preheader - Module *mod = L->getHeader()->getModule(); +void generateSSREnDis(Instruction *PhP, Instruction *ExP){ + IRBuilder<> builder(PhP); // ----------- in preheader + Module *mod = PhP->getParent()->getModule(); Function *SSREnable = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_enable); builder.CreateCall(SSREnable->getFunctionType(), SSREnable, ArrayRef()); @@ -408,7 +441,7 @@ void generateSSREnDis(const Loop *L){ //Function *FREPPragma = Intrinsic::getDeclaration(mod, Intrinsic::riscv_frep_infer); //builder.CreateCall(FREPPragma->getFunctionType(), FREPPragma, ArrayRef()); - builder.SetInsertPoint(L->getExitBlock()->getTerminator()); // ----------- in exit block + builder.SetInsertPoint(ExP); // ----------- in exit block clobberRegisters(ArrayRef(regs), builder); Function *SSRDisable = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_disable); builder.CreateCall(SSRDisable->getFunctionType(), SSRDisable, ArrayRef()); @@ -418,7 +451,8 @@ void generateSSREnDis(const Loop *L){ return; } -void expandInLoop(const std::vector &accs, const Loop *L, AffineAccess &AAA) { +///expands AffAcc's in L's preheader and inserts TCDM checks, returns ExpandedAffAcc's and writes the final Value* of the checks into Cond +std::vector expandInLoop(const std::vector &accs, const Loop *L, AffineAccess &AAA, Value *&Cond) { assert(!accs.empty()); assert(accs.size() <= NUM_SSR); assert(L); @@ -430,42 +464,51 @@ void expandInLoop(const std::vector &accs, const Loop *L, AffineAccess IntegerType *i64 = IntegerType::getInt64Ty(ctxt); Type *i8Ptr = Type::getInt8PtrTy(ctxt); - //generate Stride, Bound, base addresses, and intersect checks - Value *Cond = nullptr; - auto exp = AAA.expandAllAt(accs, L, L->getLoopPreheader()->getTerminator(), Cond, i8Ptr, i32, i64); - assert(Cond); + Instruction *PhT = L->getLoopPreheader()->getTerminator(); - //errs()<<"expanded All in Loop Preheader:\n"; 
L->getLoopPreheader()->dump(); + //generate Steps, Reps, base addresses, intersect checks, and bound checks + auto exp = AAA.expandAllAt(accs, L, PhT, Cond, i8Ptr, i32, i64, !SSR_NO_INTERSECTCHECK, !SSR_NO_BOUNDCHECK); + assert(Cond); - //for some reason sometimes the loop has multiple exits but they are the same (this is the case if a CondBr has two operands to same block) - SmallVector exits; - L->getExitBlocks(exits); - BasicBlock *Ex = nullptr; - for (BasicBlock *BB : exits){ - if (!Ex) Ex = BB; - assert(Ex == BB); + //TCDM Checks + if (!SSR_NO_TCDMCHECK) { + IRBuilder<> builder(PhT); + for (auto &E : exp) { + Cond = builder.CreateAnd(Cond, GenerateTCDMCheck(E, PhT)); + } } - DomTreeUpdater DTU(&AAA.getDT(), DomTreeUpdater::UpdateStrategy::Lazy); - BranchInst *BR = cloneRegion(L->getLoopPreheader()->getTerminator(), &*Ex->getFirstInsertionPt(), AAA.getDT(), DTU); + + assert(Cond->getType() == Type::getInt1Ty(Cond->getContext()) && "Cond has type bool (i1)"); + + return exp; +} - //errs()<<"done cloning, Loop Preheader:\n"; L->getLoopPreheader()->dump(); +///clones from L's preheader to L's exit uses Cond for CBr between clone and non-clone +///then generates the instrinsics for all in exp +void cloneAndSetup(const Loop *L, Value *Cond, std::vector &exp) { + assert(exp.size() <= NUM_SSR); + if (exp.size() == 0u) return; - //TCDM Checks - IRBuilder<> builder(BR); - for (auto &E : exp) { - Cond = builder.CreateAnd(Cond, GenerateTCDMCheck(E, BR)); - } + errs()<<"cloning in "<getHeader()->getNameOrAsOperand()<<"\n"; - BR->setCondition(Cond); + Instruction *PhT = L->getLoopPreheader()->getTerminator(); + if (!isa(Cond)){ //if Cond is not a constant we cannot make the decision at compile time ==> clone whole region for if-else + BranchInst *BR = cloneRegion(L->getLoopPreheader()->getTerminator(), &*getSingleExitBlock(L)->getFirstInsertionPt()); + BR->setCondition(Cond); + } else { + //this should never happen, but it means the runtime checks were somehow known at 
compile time and turned out false: + if(cast(Cond)->getLimitedValue() == 0u) return; + } + + Instruction *ExP = &*getSingleExitBlock(L)->getFirstInsertionPt(); //this changes if clone is executed! + unsigned dmid = 0u; for (auto &E : exp) { - GenerateSSRSetup(E, dmid++, L->getLoopPreheader()->getTerminator()); + GenerateSSRSetup(E, dmid++, PhT); } - generateSSREnDis(L); - - DTU.flush(); //only change DT after everything + generateSSREnDis(PhT, ExP); } bool isValid(AffAcc *A, const Loop *L) { @@ -480,9 +523,45 @@ bool isValid(AffAcc *A, const Loop *L) { return valid; } + +//TODO: could do this faster by precomputing for whole function +bool isValidLoop(const Loop *L) { + + DenseSet ids; + for (Intrinsic::ID x : riscSSRIntrinsics){ + ids.insert(x); //put intrinsics into set for faster lookup + } + + for (BasicBlock *BB : L->getBlocks()){ + for (Instruction &i : *BB) { + Instruction *I = &i; + if (CallBase *C = dyn_cast(I)) { + //if L contains call to function that uses SSR streams ==> cannot have streams itself (potential conflict of streams using the same DMIDs) + if (C->hasFnAttr(SSRFnAttr)) { + errs()<<"Loop "<getHeader()->getNameOrAsOperand()<<" is invalid, because of:\n"<<*C<<"\n"; + return false; + } + if (IntrinsicInst *II = dyn_cast(C)) { + if (ids.contains(II->getIntrinsicID())) { + errs()<<"Loop "<getHeader()->getNameOrAsOperand()<<" is invalid, because it already contains intrinsic:\n"<<*II<<"\n"; + return false; + } + } + } + } + } + return true; +} + void visitLoop(const Loop *L, DenseMap> &possible, ConflictTree &tree, AffineAccess &AAA) { assert(L); + + //NOTE: cannot return early in this function, as `possible` and `tree` need to be expanded even if L is not suitable for streams + ArrayRef accs = AAA.getExpandableAccesses(L); + + if (!isValidLoop(L)) accs = ArrayRef(); //make accs empty + std::vector valid; for (AffAcc *A : accs) { if (isValid(A, L)) valid.push_back(A); @@ -507,17 +586,25 @@ void visitLoop(const Loop *L, DenseMap> &pos } //end of 
namespace PreservedAnalyses SSRGenerationPass::run(Function &F, FunctionAnalysisManager &FAM){ + if (!SSR_INFERENCE) return PreservedAnalyses::all(); + if (F.hasFnAttribute(SSRFnAttr)) return PreservedAnalyses::all(); //this function already contains streams ==> skip AffineAccess &AAA = FAM.getResult(F); errs()<<"SSR Generation Pass on function: "<> possible; - ConflictTree tree; + auto &toploops = AAA.getLI().getTopLevelLoops(); + DenseMap> trees; //keep track of the conflict tree for each top-level loop + DenseMap> bestLoops; //keep track of the best results for each tree + DenseMap> possible; //keep track of the AffAcc's that can be expanded in each loop + DenseMap conds; //keep track of the condition of the run-time check for each loop + DenseMap> exps; //keep track of the expanded AffAcc's for each loop - //go through all loops in this tree to build conflict-tree and find possible expands + for (const Loop *T : toploops){ + ConflictTree &tree = trees.insert(std::make_pair(T, ConflictTree())).first->getSecond(); + + //go through all loops in sub-tree of T to build conflict-tree and find possible expands std::deque worklist; worklist.push_back(T); while (!worklist.empty()) { @@ -533,19 +620,36 @@ PreservedAnalyses SSRGenerationPass::run(Function &F, FunctionAnalysisManager &F //expand them for (const Loop *L : best) { auto &acc = possible.find(L)->getSecond(); - if (!acc.empty()) expandInLoop(acc, L, AAA); + if (!acc.empty()) { + changed = true; + Value *Cond = nullptr; + auto exp = expandInLoop(acc, L, AAA, Cond); + assert(Cond); + conds.insert(std::make_pair(L, Cond)); + exps.insert(std::make_pair(L, std::move(exp))); + } } - changed |= !best.empty(); + bestLoops.insert(std::make_pair(T, std::move(best))); } - /* - errs()<<"dumping Module ============================\n"; - F.getParent()->dump(); - errs()<<"done dumping Module ============================\n"; - */ + ///NOTE: as soon as we start cloning (so after this comment), all the analyses are falsified and 
we do not want to update them + ///because that would falsify the AAA (which we do not want to update because it would find less solutions after the cloning). + ///So all the code that follows only makes use of simple stuff like Loop::getLoopPreheader() which luckily still works + + for (const Loop *T : toploops) { + std::vector &best = bestLoops.find(T)->getSecond(); + for (const Loop *L : best) { + auto p = conds.find(L); + if (p != conds.end()) { + errs()<<"clone and setup in "<getHeader()->getNameOrAsOperand()<<"\n"; + cloneAndSetup(L, p->second, exps.find(L)->getSecond()); + } + } + } if (!changed) return PreservedAnalyses::all(); + F.addFnAttr(StringRef(SSRFnAttr)); F.addFnAttr(Attribute::AttrKind::NoInline); return PreservedAnalyses::none(); } \ No newline at end of file From f2981773a50d698b178a8b079e23a06229d040be Mon Sep 17 00:00:00 2001 From: thrupf Date: Sat, 11 Jun 2022 22:38:18 +0200 Subject: [PATCH 35/47] 2mm working --- llvm/lib/Transforms/SSR/SSRGeneration.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Transforms/SSR/SSRGeneration.cpp b/llvm/lib/Transforms/SSR/SSRGeneration.cpp index e1bac2f4e940f..e6716365e6647 100644 --- a/llvm/lib/Transforms/SSR/SSRGeneration.cpp +++ b/llvm/lib/Transforms/SSR/SSRGeneration.cpp @@ -59,8 +59,8 @@ #define SSR_NO_BOUNDCHECK false //if you feel like there is somehow still some weird reordering going on, enable these: -#define SSR_CLOBBER_REGS_FOR_PUSH false -#define SSR_CLOBBER_REGS_FOR_POP false +#define SSR_CLOBBER_REGS_FOR_PUSH true +#define SSR_CLOBBER_REGS_FOR_POP true #define NUM_SSR 3U //NOTE: if increased too much, might need to change 1st arguments to clobberRegisters(..) 
#define SSR_MAX_DIM 4U From b8816d02e2add727dcdba6d588e624947f111a70 Mon Sep 17 00:00:00 2001 From: thrupf Date: Tue, 21 Jun 2022 09:14:04 +0200 Subject: [PATCH 36/47] asm version --- .../llvm/Analysis/AffineAccessAnalysis.h | 7 +- llvm/lib/Analysis/AffineAccessAnalysis.cpp | 83 +-- llvm/lib/Target/RISCV/RISCVExpandSSRInsts.cpp | 5 + llvm/lib/Transforms/SSR/SSRGeneration.cpp | 472 ++++++++++++++---- llvm/lib/Transforms/SSR/SSRInference.cpp | 2 - 5 files changed, 433 insertions(+), 136 deletions(-) diff --git a/llvm/include/llvm/Analysis/AffineAccessAnalysis.h b/llvm/include/llvm/Analysis/AffineAccessAnalysis.h index dab3969cbe8f8..e2b223c08b97d 100644 --- a/llvm/include/llvm/Analysis/AffineAccessAnalysis.h +++ b/llvm/include/llvm/Analysis/AffineAccessAnalysis.h @@ -21,6 +21,7 @@ class MemorySSA; class MemoryUseOrDef; class MemoryDef; struct ExpandedAffAcc; +class DependenceInfo; struct LoopRep{ private: @@ -134,6 +135,7 @@ class AffineAccess{ LoopInfo &LI; MemorySSA &MSSA; AAResults &AA; + DependenceInfo &DI; MemDep MD; DenseMap access; DenseMap reps; @@ -146,7 +148,7 @@ class AffineAccess{ std::pair calcConflict(AffAcc *A, AffAcc *B) const; public: - AffineAccess(Function &F, ScalarEvolution &SE, DominatorTree &DT, LoopInfo &LI, MemorySSA &MSSA, AAResults &AA); + AffineAccess(Function &F, ScalarEvolution &SE, DominatorTree &DT, LoopInfo &LI, MemorySSA &MSSA, AAResults &AA, DependenceInfo &DI); AffineAccess() = delete; bool accessPatternsMatch(const AffAcc *A, const AffAcc *B, const Loop *L) const; bool accessPatternsAndAddressesMatch(const AffAcc *A, const AffAcc *B, const Loop *L) const; @@ -155,9 +157,10 @@ class AffineAccess{ LoopInfo &getLI() const; MemorySSA &getMSSA() const; AAResults &getAA() const; + DependenceInfo &getDI() const; SmallVector getLoopsInPreorder() const; - ArrayRef getExpandableAccesses(const Loop *L); + std::vector getExpandableAccesses(const Loop *L, bool conflictFreeOnly = false); std::vector expandAllAt(ArrayRef Accs, const Loop *L, 
Instruction *Point, Value *&BoundCheck, Type *PtrTy, IntegerType *ParamTy, IntegerType *AgParamTy, bool conflictChecks = true, bool repChecks = false); }; diff --git a/llvm/lib/Analysis/AffineAccessAnalysis.cpp b/llvm/lib/Analysis/AffineAccessAnalysis.cpp index 0393a635e68ac..cacc9067289e8 100644 --- a/llvm/lib/Analysis/AffineAccessAnalysis.cpp +++ b/llvm/lib/Analysis/AffineAccessAnalysis.cpp @@ -337,11 +337,9 @@ void updateOutermostExpandableExcl(const Loop *&outerMostExpandableExl, AffAccCo case AffAccConflict::NoConflict: break; case AffAccConflict::MustNotIntersect: - errs()<<"must-not-intersect\n"; updateIfAncestor(innermostCommon, deepestMalformed); //updates innermostCommon to deepestMalformed if that one is less "deep" LLVM_FALLTHROUGH; case AffAccConflict::Bad: - errs()<<"bad\n"; updateIfDescendant(outerMostExpandableExl, innermostCommon); break; default: @@ -349,6 +347,24 @@ void updateOutermostExpandableExcl(const Loop *&outerMostExpandableExl, AffAccCo } } +void dumpAffAccConflict(AffAccConflict kind) { + switch (kind) + { + case AffAccConflict::Bad: + errs()<<"Bad"; + break; + case AffAccConflict::MustNotIntersect: + errs()<<"MustNotIntersect"; + break; + case AffAccConflict::NoConflict: + errs()<<"NoConflict"; + break; + default: + break; + } + errs()<<"\n"; +} + } //end of namespace //================== =========================================================== @@ -615,6 +631,8 @@ AffAccConflict AffAcc::getConflict(AffAcc *A, const Loop *L) const { return AffAccConflict::NoConflict; } +/// returns a vector of (AffAcc *, conflict) pairs containing all the conflicts that `this` has at loop `L` +/// It is guaranteed that conflict is never NoConflict std::vector> AffAcc::getConflicts(const Loop *L) const { std::vector> res; for (const auto &p : conflicts) { @@ -633,22 +651,7 @@ void AffAcc::addConflict(AffAcc *A, const Loop *StartL, AffAccConflict kind){ assert(conflicts.find(A) == conflicts.end() && "no conflict for A yet"); assert(kind == 
AffAccConflict::Bad || (isWellFormed(StartL) && A->isWellFormed(StartL))); conflicts.insert(std::make_pair(A, std::make_pair(StartL, kind))); - errs()<<"conflict for:\n"; dumpInLoop(StartL); errs()<<"with:\n"; A->dumpInLoop(StartL); errs()<<"is ===> "; - /*switch (kind) - { - case AffAccConflict::Bad: - errs()<<"Bad"; - break; - case AffAccConflict::MustNotIntersect: - errs()<<"MustNotIntersect"; - break; - case AffAccConflict::NoConflict: - errs()<<"NoConflict"; - break; - default: - break; - } - errs()<<"\n"; */ + //errs()<<"conflict for:\n"; dumpInLoop(StartL); errs()<<"with:\n"; A->dumpInLoop(StartL); errs()<<"is ===> "; } bool AffAcc::promote(LoopRep *LR){ @@ -805,8 +808,8 @@ DenseSet MemDep::findClobberUsers(MemoryDef *MA) { //================== Affine Access =========================================================== -AffineAccess::AffineAccess(Function &F, ScalarEvolution &SE, DominatorTree &DT, LoopInfo &LI, MemorySSA &MSSA, AAResults &AA) - : SE(SE), DT(DT), LI(LI), MSSA(MSSA), AA(AA), MD(MSSA, AA){ +AffineAccess::AffineAccess(Function &F, ScalarEvolution &SE, DominatorTree &DT, LoopInfo &LI, MemorySSA &MSSA, AAResults &AA, DependenceInfo &DI) + : SE(SE), DT(DT), LI(LI), MSSA(MSSA), AA(AA), DI(DI), MD(MSSA, AA){ for (const Loop *L : LI.getTopLevelLoops()){ std::vector all = analyze(L, ArrayRef()); addAllConflicts(all); @@ -877,9 +880,9 @@ void AffineAccess::addAllConflicts(const std::vector &all) { auto p = access.find(D); if (p == access.end()) continue; AffAcc *B = p->second; - AffAcc *x = A; //copy, TODO: remove this check + //A->dump(); B->dump(); auto r = calcConflict(A, B); - assert(A == x && "swap does not affect"); + //dumpAffAccConflict(r.first); if (r.first != AffAccConflict::NoConflict) A->addConflict(B, r.second, r.first); updateOutermostExpandableExcl(outerMostExpandableExl, r.first, r.second, B->getDeepestMalformed()); assert(!outerMostExpandableExl || outerMostExpandableExl->contains(A->getMemoryAccess()->getBlock())); @@ -909,19 +912,29 @@ 
AffAccConflict AffineAccess::calcRWConflict(AffAcc *Read, AffAcc *Write, const L assert(Write->isWrite()); if (!L->contains(Read->getMemoryAccess()->getBlock()) || !L->contains(Write->getMemoryAccess()->getBlock())) return AffAccConflict::NoConflict; if (!Read->isWellFormed(L) || !Write->isWellFormed(L)) return AffAccConflict::Bad; - Value *Addr = getAddress(Read->getMemoryAccess()); - Value *DAddr = getAddress(Write->getMemoryAccess()); + MemoryUseOrDef *r = Read->getMemoryAccess(); + MemoryUseOrDef *w = Write->getMemoryAccess(); + Value *Addr = getAddress(r); + Value *DAddr = getAddress(w); + bool dominates = MSSA.dominates(r, w); + //auto dep = DI.depends(r->getMemoryInst(), w->getMemoryInst(), dominates && L->isInnermost()); if (Addr && DAddr && AA.alias(Addr, DAddr) == NoAlias) return AffAccConflict::NoConflict; AffAccConflict kind = AffAccConflict::Bad; - if (!MSSA.dominates(Read->getMemoryAccess(), Write->getMemoryAccess())) { //read does not dominate write ==> RaW + if (!dominates) { //read does not dominate write ==> RaW kind = AffAccConflict::MustNotIntersect; } else { //read dominates write ==> WaR kind = AffAccConflict::MustNotIntersect; //exception: we know that the store always happens to a position already written from if the store is to same address as write (FIXME: CONSERVATIVE) + //but the steps needs to be != 0 such that there is no dependence from one iteration to the next if ((Addr && DAddr && AA.alias(Addr, DAddr) == MustAlias) || accessPatternsAndAddressesMatch(Read, Write, L)) { - kind = AffAccConflict::NoConflict; + bool nonzeroSteps = true; + unsigned dr = Read->loopToDimension(L); + unsigned dw = Write->loopToDimension(L); + while (Read->isWellFormed(dr) && Write->isWellFormed(dw)) + nonzeroSteps &= SE.isKnownNonZero(Read->getStep(dr++)) && SE.isKnownNonZero(Write->getStep(dw++)); + if (nonzeroSteps) kind = AffAccConflict::NoConflict; } } return kind; @@ -930,6 +943,9 @@ AffAccConflict AffineAccess::calcRWConflict(AffAcc *Read, AffAcc 
*Write, const L ///returns the kind of conflict (and innermost common loop) that A and B have assuming there is some memory dependency ///does not check for the memory dependency itself for to peformance std::pair AffineAccess::calcConflict(AffAcc *A, AffAcc *B) const { + //auto dep = DI.depends(A->getMemoryAccess()->getMemoryInst(), B->getMemoryAccess()->getMemoryInst(), MSSA.dominates(A->getMemoryAccess(), B->getMemoryAccess())); + //dep->dump(errs()); + //errs()<<"confused = "<isConfused()<<", is consistent = "<isConsistent()<<", is anti = "<isAnti()<<", is flow = "<isFlow()<<", is input = "<isInput()<<", is output = "<isOutput()<<"\n"; assert((A->isWrite() || B->isWrite()) && "conflict between two reads ???"); const Loop *const innermostCommon = findFirstContaining(A->getContainingLoops(), B->getMemoryAccess()->getBlock()); if (!innermostCommon) return std::make_pair(AffAccConflict::NoConflict, innermostCommon); @@ -975,12 +991,17 @@ DominatorTree &AffineAccess::getDT()const { return this->DT; } LoopInfo &AffineAccess::getLI() const { return this->LI; } MemorySSA &AffineAccess::getMSSA() const { return this->MSSA; } AAResults &AffineAccess::getAA() const { return this->AA; } +DependenceInfo &AffineAccess::getDI() const { return this->DI; } SmallVector AffineAccess::getLoopsInPreorder() const { return this->LI.getLoopsInPreorder(); } -ArrayRef AffineAccess::getExpandableAccesses(const Loop *L) { +std::vector AffineAccess::getExpandableAccesses(const Loop *L, bool conflictFreeOnly) { auto p = expandableAccesses.find(L); - if (p == expandableAccesses.end()) return ArrayRef(); - return ArrayRef(p->getSecond()); + std::vector res; + if (p == expandableAccesses.end()) return res; + for (AffAcc *A : p->getSecond()){ + if (!conflictFreeOnly || A->getConflicts(L).empty()) res.push_back(A); + } + return res; } std::vector @@ -1072,9 +1093,9 @@ AffineAccess AffineAccessAnalysis::run(Function &F, FunctionAnalysisManager &FAM auto &MSSAA = FAM.getResult(F); MemorySSA &MSSA 
= MSSAA.getMSSA(); AAResults &AA = FAM.getResult(F); - //DependenceInfo &DI = FAM.getResult(F); + DependenceInfo &DI = FAM.getResult(F); - return AffineAccess(F, SE, DT, LI, MSSA, AA); + return AffineAccess(F, SE, DT, LI, MSSA, AA, DI); } //================== Affine Acces Analysis Pass for opt ======================================= diff --git a/llvm/lib/Target/RISCV/RISCVExpandSSRInsts.cpp b/llvm/lib/Target/RISCV/RISCVExpandSSRInsts.cpp index 808576641f365..fe82351286f4c 100644 --- a/llvm/lib/Target/RISCV/RISCVExpandSSRInsts.cpp +++ b/llvm/lib/Target/RISCV/RISCVExpandSSRInsts.cpp @@ -161,6 +161,10 @@ bool RISCVExpandSSR::runOnMachineFunction(MachineFunction &MF) { } } + errs()<<"\n ========================= DUMP MF ========================== \n"; + MF.dump(); + errs()<<"\n ======================= END DUMP MF ========================== \n"; + return Modified; } @@ -537,6 +541,7 @@ void RISCVExpandSSR::mergePushPop(MachineBasicBlock &MBB) { }*/ } if (O){ + errs()<<"push regmerge: \n"; O->getParent()->dump(); O->setReg(ssr_reg); O->getParent()->dump(); diff --git a/llvm/lib/Transforms/SSR/SSRGeneration.cpp b/llvm/lib/Transforms/SSR/SSRGeneration.cpp index e6716365e6647..20060c583b89d 100644 --- a/llvm/lib/Transforms/SSR/SSRGeneration.cpp +++ b/llvm/lib/Transforms/SSR/SSRGeneration.cpp @@ -50,17 +50,9 @@ #include #include -//immediately return in pass if false -#define SSR_INFERENCE true - -//flags for runtime checks (true = disabled) -#define SSR_NO_INTERSECTCHECK false -#define SSR_NO_TCDMCHECK false -#define SSR_NO_BOUNDCHECK false - //if you feel like there is somehow still some weird reordering going on, enable these: -#define SSR_CLOBBER_REGS_FOR_PUSH true -#define SSR_CLOBBER_REGS_FOR_POP true +#define SSR_CLOBBER_REGS_FOR_PUSH false +#define SSR_CLOBBER_REGS_FOR_POP false #define NUM_SSR 3U //NOTE: if increased too much, might need to change 1st arguments to clobberRegisters(..) 
#define SSR_MAX_DIM 4U @@ -72,10 +64,56 @@ //current state of hw: only allow doubles #define CHECK_TYPE(T, I) (T == Type::getDoubleTy(I->getParent()->getContext())) -static constexpr char SSRFnAttr[] = "SSR"; //used to tag functions that contain SSR streams +//for gain estimation +#define EST_LOOP_TC 25 +#define EST_MUL_COST 3 +#define EST_MEMOP_COST 2 using namespace llvm; +namespace llvm { + +cl::opt InferSSR( + "infer-ssr", + cl::init(false), + cl::desc("Enable inference of SSR streams.") +); + +cl::opt SSRNoIntersectCheck( + "ssr-no-intersect-check", + cl::init(false), + cl::desc("Do not generate intersection checks (unsafe). Use `restrict` key-word instead if possible.") +); + +cl::opt SSRNoTCDMCheck( + "ssr-no-tcdm-check", + cl::init(false), + cl::desc("Assume all data of inferred streams is inside TCDM.") +); + +cl::opt SSRNoBoundCheck( + "ssr-no-bound-check", + cl::init(false), + cl::desc("Do not generate checks that make sure the inferred stream's access is executed at least once.") +); + +cl::opt SSRConflictFreeOnly( + "ssr-conflict-free-only", + cl::init(false), + cl::desc("Only infer streams if they have no conflicts with other memory accesses.") +); + +cl::opt SSRInline( + "ssr-inline", + cl::init(false), + cl::desc("allow functions that contain SSR streams to be inlined") +); + +} //end of namespace llvm + + +static constexpr char SSRFnAttr[] = "SSR"; //used to tag functions that contain SSR streams + static constexpr Intrinsic::ID riscSSRIntrinsics[] = { Intrinsic::RISCVIntrinsics::riscv_ssr_barrier, Intrinsic::RISCVIntrinsics::riscv_ssr_disable, @@ -95,7 +133,8 @@ static constexpr Intrinsic::ID riscSSRIntrinsics[] = { Intrinsic::RISCVIntrinsics::riscv_ssr_setup_bound_stride_4d, }; -namespace{ + +namespace { template struct ConflictTree { @@ -113,7 +152,8 @@ struct ConflictTree { } } - //picks the nodes in the tree such that their combined value (conmbineFunc, needs to be associative & commutative) is the highest possible + //picks nodes in the tree 
such that their combined value (conmbineFunc, needs to be associative & commutative) is the highest possible + //prioritizes parent over children std::vector findBest(const std::function &combineFunc) { std::vector res; if (!Root) return res; @@ -145,10 +185,9 @@ struct ConflictTree { }; void clobberRegisters(ArrayRef regs, IRBuilder<> &builder){ - //equivalent to asm volatile ("":::regs); - std::string constraints = "~{dirflag},~{fpsr},~{flags}"; //TODO: what are these doing? + std::string constraints = ""; for (const std::string r : regs){ - constraints = "~{" + r + "}," + constraints; //(formatv("~{{{0}},", r) + Twine(constraints)).str() + constraints = "~{" + r + "}," + constraints; } errs()< splitAt(Instruction *X, const Twine &name) ///clones code from BeginWith up to EndBefore ///assumes all cf-paths from begin lead to end (or return) ///assumes there is a phi node for each value defined in the region that will be cloned in the block of EndBefore that is live after EndBefore -BranchInst *cloneRegion(Instruction *BeginWith, Instruction *EndBefore){ +///returns the branch that splits region from coloned region and the pair of branches that jump to EndBefore at the end +std::pair> cloneRegion(Instruction *BeginWith, Instruction *EndBefore){ errs()<<"cloning from "<<*BeginWith<<" up to "<<*EndBefore<<"\n"; auto p = splitAt(BeginWith, "split.before"); @@ -230,6 +270,7 @@ BranchInst *cloneRegion(Instruction *BeginWith, Instruction *EndBefore){ BasicBlock *Begin = p.second; p = splitAt(EndBefore, "fuse.prep"); + BranchInst *BRFuse = cast(p.first->getTerminator()); BasicBlock *End = p.second; copyPHIsFromPred(End); //copy Phi's from Fuse to End @@ -320,18 +361,18 @@ BranchInst *cloneRegion(Instruction *BeginWith, Instruction *EndBefore){ } errs()<<"done cloning \n"; - return HeadBr; + return std::make_pair(HeadBr, std::make_pair(BRFuse, cast(clones.find(BRFuse)->second))); } BasicBlock *getSingleExitBlock(const Loop *L) { + BasicBlock *Ex = L->getExitBlock(); + if 
(Ex) return Ex; SmallVector exits; L->getExitBlocks(exits); - BasicBlock *Ex = nullptr; for (BasicBlock *BB : exits){ if (!Ex) Ex = BB; - assert(Ex == BB); + if (Ex != BB) return nullptr; } - assert(Ex); return Ex; } @@ -343,6 +384,22 @@ Value *GenerateTCDMCheck(ExpandedAffAcc &E, Instruction *Point) { return builder.CreateAnd(c1, c2, "tcdm.check"); } +Value *generatePopAsm(Instruction *InsertBefore, unsigned dmid) { + IRBuilder<> builder(InsertBefore); + FunctionType *fty = FunctionType::get(Type::getDoubleTy(builder.getContext()), false); + std::string inst = formatv("fmv.d $0, ft{0}\0A", dmid); + InlineAsm *Pop = InlineAsm::get(fty, inst, "=f", true); + return builder.CreateCall(Pop, ArrayRef(), "ssr.pop"); +} + +void generatePushAsm(Instruction *InsertBefore, unsigned dmid, Value *Val){ + IRBuilder<> builder(InsertBefore); + FunctionType *fty = FunctionType::get(Type::getVoidTy(builder.getContext()), ArrayRef(Type::getDoubleTy(builder.getContext())), false); + std::string inst = formatv("fmv.d ft{0}, $0\0A", dmid); + InlineAsm *Push = InlineAsm::get(fty, inst, "f", true); + builder.CreateCall(Push, ArrayRef(Val)); +} + void GenerateSSRSetup(ExpandedAffAcc &E, unsigned dmid, Instruction *Point){ assert(Point); Module *mod = Point->getModule(); @@ -372,26 +429,17 @@ void GenerateSSRSetup(ExpandedAffAcc &E, unsigned dmid, Instruction *Point){ } unsigned n_reps = 0U; - std::string s = formatv("ft{0}", dmid); - ArrayRef regs(s); if (isStore){ - Function *SSRPush = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_push); for (Instruction *I : E.Access->getAccesses()){ - std::array pusharg = {DMid, cast(I)->getValueOperand()}; - builder.SetInsertPoint(I); - if (SSR_CLOBBER_REGS_FOR_PUSH) clobberRegisters(regs, builder); - auto *C = builder.CreateCall(SSRPush->getFunctionType(), SSRPush, ArrayRef(pusharg)); - C->dump(); I->dump(); + generatePushAsm(I, dmid, cast(I)->getValueOperand()); + I->dump(); I->eraseFromParent(); n_reps++; } }else{ - Function *SSRPop = 
Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_pop); - std::array poparg = {DMid}; for (Instruction *I : E.Access->getAccesses()){ builder.SetInsertPoint(I); - auto *V = builder.CreateCall(SSRPop->getFunctionType(), SSRPop, ArrayRef(poparg), "ssr.pop"); - if (SSR_CLOBBER_REGS_FOR_POP) clobberRegisters(regs, builder); + auto *V = generatePopAsm(I, dmid); V->dump(); I->dump(); BasicBlock::iterator ii(I); ReplaceInstWithValue(I->getParent()->getInstList(), ii, V); @@ -423,32 +471,123 @@ void GenerateSSRSetup(ExpandedAffAcc &E, unsigned dmid, Instruction *Point){ return; } -///generates SSR enable & disable calls -void generateSSREnDis(Instruction *PhP, Instruction *ExP){ - IRBuilder<> builder(PhP); // ----------- in preheader - Module *mod = PhP->getParent()->getModule(); - Function *SSREnable = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_enable); - builder.CreateCall(SSREnable->getFunctionType(), SSREnable, ArrayRef()); +///generates SSR enable & disable calls +void generateSSREnDisAsm(Instruction *PhT, Instruction *ExP){ + constexpr unsigned num_ssr = 3u; //FIXME: make use of NUM_SSR + + IRBuilder<> builder(PhT); // ----------- in preheader + auto &ctxt = builder.getContext(); + Type *Double = Type::getDoubleTy(ctxt); + std::vector structTys; + for (unsigned i = 0; i < num_ssr; i++) structTys.push_back(Double); + Type *ArrTy = StructType::get(ctxt, structTys); //auto *ArrTy = ArrayType::get(Double, num_ssr); //VectorType::get(Double, num_ssr, false); + std::vector argtypes; + for (unsigned i = 0u; i < num_ssr; i++) argtypes.push_back(Double); + std::string constraints = "={f0},={f1},={f2},{f0},{f1},{f2},~{memory}"; + FunctionType* fty = FunctionType::get(ArrTy, argtypes, false); + InlineAsm *En = InlineAsm::get(fty, "csrsi 0x7C0, 1\0A", constraints, true); + En->dump(); + std::vector args; + for (unsigned i = 0u; i < num_ssr; i++) args.push_back(UndefValue::get(Double)); + CallInst *Dep = builder.CreateCall(En, args, "ssr.enable.dep"); + Dep->dump(); - 
std::vector regs; - for (unsigned r = 0u; r < NUM_SSR; r++){ - regs.push_back(std::string(formatv("ft{0}", r))); + builder.SetInsertPoint(ExP); // ----------- in exit block + std::vector deps; + for (unsigned i = 0u; i < num_ssr; i++) + deps.push_back(builder.CreateExtractValue(Dep, i, formatv("dep.{0}", i))); + InlineAsm *Dis = InlineAsm::get(fty, "csrci 0x7C0, 1\0A", constraints, true); + builder.CreateCall(Dis, deps, "ssr.disable.dep")->dump(); + + errs()<<"generated ssr_enable and ssr_disable\n"; + + return; +} + +/* +void generateFrepPragma(ArrayRef points) { + if (points.empty()) return; + IRBuilder<> builder(points.front()->getContext()); + for (Instruction *P : points) { + builder.SetInsertPoint(P); + Function *FrepPragma = Intrinsic::getDeclaration(P->getModule(), Intrinsic::riscv_frep_infer); + builder.CreateCall(FrepPragma->getFunctionType(), FrepPragma, ArrayRef())->dump(); } - //create inline asm that clobbers ft0-2 to make sure none of them are reordered to before ssr enable / after ssr disable - //equivalent to asm volatile ("":::"ft0", "ft1", "ft2"); - clobberRegisters(ArrayRef(regs), builder); +}*/ + +int getEstExpandCost(AffAcc *A, unsigned dim) { + int cost = 0; + cost += A->getBaseAddr(dim)->getExpressionSize(); + for (unsigned i = 1; i < dim; i++) { + cost += A->getStep(i)->getExpressionSize(); + cost += A->getRep(i)->getExpressionSize(); + cost += EST_MUL_COST; //for range + if (i > 1) cost += 1; //for addition + } + return cost; +} - //Function *FREPPragma = Intrinsic::getDeclaration(mod, Intrinsic::riscv_frep_infer); - //builder.CreateCall(FREPPragma->getFunctionType(), FREPPragma, ArrayRef()); +int getEstGain(ArrayRef Accs, const Loop *L, AffineAccess &AAA) { + int gain = 0; + DenseSet accs; + for (auto *A : Accs) accs.insert(A); + + DenseSet contLoops; + DenseSet vis; + for (AffAcc *A : Accs) { + vis.insert(A); + unsigned dim = A->loopToDimension(L); + + //cost of expanding A + gain -= getEstExpandCost(A, dim); + + //cost of intersection 
checks + if (!SSRNoIntersectCheck) { + for (const auto &p : A->getConflicts(L)) { + switch (p.second) + { + case AffAccConflict::NoConflict: + break; //nothing to do + case AffAccConflict::MustNotIntersect: { + AffAcc *B = p.first; + if (vis.find(B) != vis.end()) break; //already handled this conflict when A was B + unsigned dimB = B->loopToDimension(L); + if (accs.find(B) == accs.end()) gain -= getEstExpandCost(B, dimB); + gain -= 4u; //2x ICmpULT, 1 OR, 1 AND + break; + } + case AffAccConflict::Bad: + assert(false && "WARNING: there is a bad conflict for given Accs and L ==> could not expand them here!"); + default: + llvm_unreachable("uknown conflict type"); + } + } + } - builder.SetInsertPoint(ExP); // ----------- in exit block - clobberRegisters(ArrayRef(regs), builder); - Function *SSRDisable = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_disable); - builder.CreateCall(SSRDisable->getFunctionType(), SSRDisable, ArrayRef()); + //cost of tcdm checks + if (!SSRNoTCDMCheck) { + gain -= 4u; //2x ICmpULT, 2 AND + } - errs()<<"generated ssr_enable and ssr_disable\n"; + int reps = 1; + for (unsigned d = dim; d >= 1u; d--) { //dimensions that are extended + int loopTC = EST_LOOP_TC; + if (A->getRep(d)->getSCEVType() == SCEVTypes::scConstant) + loopTC = cast(A->getRep(d))->getAPInt().getLimitedValue(std::numeric_limits::max()); + reps = std::max(reps * loopTC, reps); //prevent overflows - return; + //prep for boundcheck cost + contLoops.insert(A->getLoop(d)); + } + gain += EST_MEMOP_COST * reps; //the number of loads/stores that are removed by inserting a stream + + } + + if (!SSRNoBoundCheck) { + gain -= 2 * contLoops.size(); // 1 ICmp, 1 AND per loop + } + + return gain; } ///expands AffAcc's in L's preheader and inserts TCDM checks, returns ExpandedAffAcc's and writes the final Value* of the checks into Cond @@ -467,11 +606,11 @@ std::vector expandInLoop(const std::vector &accs, cons Instruction *PhT = L->getLoopPreheader()->getTerminator(); //generate Steps, 
Reps, base addresses, intersect checks, and bound checks - auto exp = AAA.expandAllAt(accs, L, PhT, Cond, i8Ptr, i32, i64, !SSR_NO_INTERSECTCHECK, !SSR_NO_BOUNDCHECK); + auto exp = AAA.expandAllAt(accs, L, PhT, Cond, i8Ptr, i32, i64, !SSRNoIntersectCheck, !SSRNoBoundCheck); assert(Cond); //TCDM Checks - if (!SSR_NO_TCDMCHECK) { + if (!SSRNoTCDMCheck) { IRBuilder<> builder(PhT); for (auto &E : exp) { Cond = builder.CreateAnd(Cond, GenerateTCDMCheck(E, PhT)); @@ -485,30 +624,28 @@ std::vector expandInLoop(const std::vector &accs, cons ///clones from L's preheader to L's exit uses Cond for CBr between clone and non-clone ///then generates the instrinsics for all in exp -void cloneAndSetup(const Loop *L, Value *Cond, std::vector &exp) { +void cloneAndSetup(Instruction *PhT, Instruction *ExP, Value *Cond, std::vector &exp) { assert(exp.size() <= NUM_SSR); if (exp.size() == 0u) return; - errs()<<"cloning in "<getHeader()->getNameOrAsOperand()<<"\n"; - - Instruction *PhT = L->getLoopPreheader()->getTerminator(); - if (!isa(Cond)){ //if Cond is not a constant we cannot make the decision at compile time ==> clone whole region for if-else - BranchInst *BR = cloneRegion(L->getLoopPreheader()->getTerminator(), &*getSingleExitBlock(L)->getFirstInsertionPt()); + auto p = cloneRegion(PhT, ExP); + BranchInst *BR = p.first; + ExP = p.second.first; //terminator of exit block that jumps to original ExP + //PhT = cast(BR->getOperand(1u))->getTerminator(); BR->setCondition(Cond); } else { //this should never happen, but it means the runtime checks were somehow known at compile time and turned out false: if(cast(Cond)->getLimitedValue() == 0u) return; } - - Instruction *ExP = &*getSingleExitBlock(L)->getFirstInsertionPt(); //this changes if clone is executed! 
unsigned dmid = 0u; for (auto &E : exp) { GenerateSSRSetup(E, dmid++, PhT); } - generateSSREnDis(PhT, ExP); + //generateSSREnDis(PhT, ExP); + generateSSREnDisAsm(PhT, ExP); } bool isValid(AffAcc *A, const Loop *L) { @@ -523,44 +660,23 @@ bool isValid(AffAcc *A, const Loop *L) { return valid; } - -//TODO: could do this faster by precomputing for whole function bool isValidLoop(const Loop *L) { - - DenseSet ids; - for (Intrinsic::ID x : riscSSRIntrinsics){ - ids.insert(x); //put intrinsics into set for faster lookup - } - - for (BasicBlock *BB : L->getBlocks()){ - for (Instruction &i : *BB) { - Instruction *I = &i; - if (CallBase *C = dyn_cast(I)) { - //if L contains call to function that uses SSR streams ==> cannot have streams itself (potential conflict of streams using the same DMIDs) - if (C->hasFnAttr(SSRFnAttr)) { - errs()<<"Loop "<getHeader()->getNameOrAsOperand()<<" is invalid, because of:\n"<<*C<<"\n"; - return false; - } - if (IntrinsicInst *II = dyn_cast(C)) { - if (ids.contains(II->getIntrinsicID())) { - errs()<<"Loop "<getHeader()->getNameOrAsOperand()<<" is invalid, because it already contains intrinsic:\n"<<*II<<"\n"; - return false; - } - } - } - } - } + assert(L); + if (!L->getLoopPreheader() || !getSingleExitBlock(L)) return false; return true; } -void visitLoop(const Loop *L, DenseMap> &possible, ConflictTree &tree, AffineAccess &AAA) { +bool visitLoop(const Loop *L, DenseMap> &possible, ConflictTree &tree, AffineAccess &AAA, bool isKnownInvalid) { assert(L); //NOTE: cannot return early in this function, as `possible` and `tree` need to be expanded even if L is not suitable for streams - ArrayRef accs = AAA.getExpandableAccesses(L); + std::vector accs = AAA.getExpandableAccesses(L, SSRConflictFreeOnly); - if (!isValidLoop(L)) accs = ArrayRef(); //make accs empty + if (isKnownInvalid || !isValidLoop(L)) { + accs.clear(); //make accs empty + isKnownInvalid = true; + } std::vector valid; for (AffAcc *A : accs) { @@ -579,14 +695,110 @@ void 
visitLoop(const Loop *L, DenseMap> &pos l.push_back(valid[i]); } //add to tree: - unsigned val = l.size(); //TODO: find more elaborate score model + int gain = getEstGain(l, L, AAA); + errs()<<"est. gain is "<isOutermost() ? nullptr : L->getParentLoop()); + + return !isKnownInvalid; +} + +///finds loops already affected by SSR +DenseSet findLoopsWithSSR(Function &F, LoopInfo &LI) { + DenseSet invalid; + + DenseSet ids; + for (Intrinsic::ID x : riscSSRIntrinsics){ + ids.insert(x); //put intrinsics into set for faster lookup + } + + std::deque> worklist; + DenseSet visUnmarked; + DenseSet visMarked; + worklist.push_back(std::make_pair(&F.getEntryBlock(), false)); + while(!worklist.empty()) { + auto p = worklist.front(); worklist.pop_front(); + BasicBlock *BB = p.first; + bool marked = p.second; + + if (!BB) continue; + if (marked) { + if (visMarked.find(BB) != visMarked.end()) continue; + visMarked.insert(BB); + + //mark all loops containing this Block invalid + const Loop *L = LI.getLoopFor(BB); + while (L) { + invalid.insert(L); + L = L->getParentLoop(); + } + + //go through instructions in block, if there is an ssr_disable() call, remove the marking for the successors of this block + for (Instruction &i : *BB) { + if (isa(i)) { + if (cast(i).getIntrinsicID() == Intrinsic::riscv_ssr_disable) marked = false; + } + if (!marked) break; //early exit + } + + } else { + if (visUnmarked.find(BB) != visUnmarked.end()) continue; + visUnmarked.insert(BB); + + for (Instruction &i : *BB) { + Instruction *I = &i; + if (CallBase *C = dyn_cast(I)) { + if (C->hasFnAttr(SSRFnAttr)) { + errs()<<"call "<<*C<<" has attribute "< no need to mark the BB + const Loop *L = LI.getLoopFor(BB); + while (L) { + invalid.insert(L); + L = L->getParentLoop(); + } + } + if (IntrinsicInst *II = dyn_cast(C)) { + if (ids.contains(II->getIntrinsicID())) { + errs()<<"Intrinsic Instr "<<*II<<" calls an SSR intrinsic\n"; + marked = true; //mark this (and thus also all following BBs) + } + } + if 
(C->isInlineAsm()) { //inline asm may contain ssr setup insts! + errs()<<"inline asm call "<<*C<<" may contain ssr insts!\n"; + C->getType()->dump(); + marked = true; + } + } + } + if (marked) worklist.push_back(std::make_pair(BB, true)); // if now marked, add to queue again + } + + for (BasicBlock *BB2 : successors(BB)) { + worklist.push_back(std::make_pair(BB2, marked)); + } + } + + if (!invalid.empty()) errs()<<"Loops that are invalid bc of SSR\n"; + for (auto l : invalid) { + errs()<<"header = "<getHeader()->getNameOrAsOperand()<<" at depth = "<getLoopDepth()<<"\n"; + } + + return invalid; } } //end of namespace PreservedAnalyses SSRGenerationPass::run(Function &F, FunctionAnalysisManager &FAM){ - if (!SSR_INFERENCE) return PreservedAnalyses::all(); + errs()<<"SSRInference Flags: "; + if (InferSSR) errs()<<"infer-ssr"; + if (SSRNoIntersectCheck) errs()<<", ssr-no-intersect-check"; + if (SSRNoBoundCheck) errs()<<", ssr-no-bound-check"; + if (SSRNoTCDMCheck) errs()<<", ssr-no-tcdm-check"; + if (SSRConflictFreeOnly) errs()<<", ssr-conflict-free-only"; + //if (SSRInsertFrepPragma) errs()<<", ssr-insert-frep-pragma"; + errs()<<"\n"; + + if (!InferSSR) return PreservedAnalyses::all(); if (F.hasFnAttribute(SSRFnAttr)) return PreservedAnalyses::all(); //this function already contains streams ==> skip AffineAccess &AAA = FAM.getResult(F); @@ -600,6 +812,7 @@ PreservedAnalyses SSRGenerationPass::run(Function &F, FunctionAnalysisManager &F DenseMap> possible; //keep track of the AffAcc's that can be expanded in each loop DenseMap conds; //keep track of the condition of the run-time check for each loop DenseMap> exps; //keep track of the expanded AffAcc's for each loop + DenseSet ssrInvalidLoops = findLoopsWithSSR(F, AAA.getLI()); for (const Loop *T : toploops){ ConflictTree &tree = trees.insert(std::make_pair(T, ConflictTree())).first->getSecond(); @@ -609,7 +822,9 @@ PreservedAnalyses SSRGenerationPass::run(Function &F, FunctionAnalysisManager &F worklist.push_back(T); 
while (!worklist.empty()) { const Loop *L = worklist.front(); worklist.pop_front(); - visitLoop(L, possible, tree, AAA); + errs()<<"visiting loop: "<getHeader()->getNameOrAsOperand()<<"\n"; + visitLoop(L, possible, tree, AAA, ssrInvalidLoops.find(L) != ssrInvalidLoops.end()); + for (const Loop *x : L->getSubLoops()) worklist.push_back(x); } @@ -635,21 +850,76 @@ PreservedAnalyses SSRGenerationPass::run(Function &F, FunctionAnalysisManager &F ///NOTE: as soon as we start cloning (so after this comment), all the analyses are falsified and we do not want to update them ///because that would falsify the AAA (which we do not want to update because it would find less solutions after the cloning). - ///So all the code that follows only makes use of simple stuff like Loop::getLoopPreheader() which luckily still works + ///So all the code that follows does not make use of any of the analyses (except for L->getLoopPreheader & stuff like that which luckily still work) for (const Loop *T : toploops) { std::vector &best = bestLoops.find(T)->getSecond(); for (const Loop *L : best) { auto p = conds.find(L); if (p != conds.end()) { - errs()<<"clone and setup in "<getHeader()->getNameOrAsOperand()<<"\n"; - cloneAndSetup(L, p->second, exps.find(L)->getSecond()); + BasicBlock *Ex = getSingleExitBlock(L); + assert(Ex); + cloneAndSetup(L->getLoopPreheader()->getTerminator(), &*Ex->getFirstInsertionPt(), p->second, exps.find(L)->getSecond()); } } } if (!changed) return PreservedAnalyses::all(); F.addFnAttr(StringRef(SSRFnAttr)); - F.addFnAttr(Attribute::AttrKind::NoInline); + if (!SSRInline) F.addFnAttr(Attribute::AttrKind::NoInline); return PreservedAnalyses::none(); -} \ No newline at end of file +} + +/* +std::string s = formatv("f{0}", dmid); + ArrayRef regs(s); + if (isStore){ + Function *SSRPush = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_push); + for (Instruction *I : E.Access->getAccesses()){ + std::array pusharg = {DMid, cast(I)->getValueOperand()}; + 
builder.SetInsertPoint(I); + if (SSR_CLOBBER_REGS_FOR_PUSH) clobberRegisters(regs, builder); + auto *C = builder.CreateCall(SSRPush->getFunctionType(), SSRPush, ArrayRef(pusharg)); + C->dump(); I->dump(); + I->eraseFromParent(); + n_reps++; + } + }else{ + Function *SSRPop = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_pop); + std::array poparg = {DMid}; + for (Instruction *I : E.Access->getAccesses()){ + builder.SetInsertPoint(I); + auto *V = builder.CreateCall(SSRPop->getFunctionType(), SSRPop, ArrayRef(poparg), "ssr.pop"); + if (SSR_CLOBBER_REGS_FOR_POP) clobberRegisters(regs, builder); + V->dump(); I->dump(); + BasicBlock::iterator ii(I); + ReplaceInstWithValue(I->getParent()->getInstList(), ii, V); + n_reps++; + } + } + + ///generates SSR enable & disable calls +void generateSSREnDis(Instruction *PhP, Instruction *ExP){ + IRBuilder<> builder(PhP); // ----------- in preheader + Module *mod = PhP->getParent()->getModule(); + Function *SSREnable = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_enable); + builder.CreateCall(SSREnable->getFunctionType(), SSREnable, ArrayRef()); + + std::vector regs; + for (unsigned r = 0u; r < NUM_SSR; r++){ + regs.push_back(std::string(formatv("f{0}", r))); + } + //create inline asm that clobbers ft0-2 to make sure none of them are reordered to before ssr enable / after ssr disable + //equivalent to asm volatile ("":::"ft0", "ft1", "ft2"); + clobberRegisters(ArrayRef(regs), builder); + + builder.SetInsertPoint(ExP); // ----------- in exit block + clobberRegisters(ArrayRef(regs), builder); + Function *SSRDisable = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_disable); + builder.CreateCall(SSRDisable->getFunctionType(), SSRDisable, ArrayRef()); + + errs()<<"generated ssr_enable and ssr_disable\n"; + + return; +} + */ \ No newline at end of file diff --git a/llvm/lib/Transforms/SSR/SSRInference.cpp b/llvm/lib/Transforms/SSR/SSRInference.cpp index eaba80bfb801c..ce428f597dd62 100644 --- 
a/llvm/lib/Transforms/SSR/SSRInference.cpp +++ b/llvm/lib/Transforms/SSR/SSRInference.cpp @@ -51,8 +51,6 @@ using namespace llvm; -static cl::opt InferSSR("ssr-inference", cl::init(false), cl::Hidden); - PreservedAnalyses SSRInferencePass::run(Function &F, FunctionAnalysisManager &FAM){ errs()<<"SSR Inference Pass on function: "< Date: Thu, 23 Jun 2022 19:33:53 +0200 Subject: [PATCH 37/47] good version --- llvm/lib/Target/RISCV/RISCVExpandSSRInsts.cpp | 6 +- llvm/lib/Transforms/SSR/SSRGeneration.cpp | 201 +++++++++--------- 2 files changed, 104 insertions(+), 103 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVExpandSSRInsts.cpp b/llvm/lib/Target/RISCV/RISCVExpandSSRInsts.cpp index fe82351286f4c..9800c35be86c5 100644 --- a/llvm/lib/Target/RISCV/RISCVExpandSSRInsts.cpp +++ b/llvm/lib/Target/RISCV/RISCVExpandSSRInsts.cpp @@ -161,9 +161,9 @@ bool RISCVExpandSSR::runOnMachineFunction(MachineFunction &MF) { } } - errs()<<"\n ========================= DUMP MF ========================== \n"; - MF.dump(); - errs()<<"\n ======================= END DUMP MF ========================== \n"; + //errs()<<"\n ========================= DUMP MF ========================== \n"; + //MF.dump(); + //errs()<<"\n ======================= END DUMP MF ========================== \n"; return Modified; } diff --git a/llvm/lib/Transforms/SSR/SSRGeneration.cpp b/llvm/lib/Transforms/SSR/SSRGeneration.cpp index 20060c583b89d..20bc5c23be5e3 100644 --- a/llvm/lib/Transforms/SSR/SSRGeneration.cpp +++ b/llvm/lib/Transforms/SSR/SSRGeneration.cpp @@ -51,8 +51,8 @@ #include //if you feel like there is somehow still some weird reordering going on, enable these: -#define SSR_CLOBBER_REGS_FOR_PUSH false -#define SSR_CLOBBER_REGS_FOR_POP false +#define SSR_CLOBBER_REGS_FOR_PUSH true +#define SSR_CLOBBER_REGS_FOR_POP true #define NUM_SSR 3U //NOTE: if increased too much, might need to change 1st arguments to clobberRegisters(..) 
#define SSR_MAX_DIM 4U @@ -106,7 +106,13 @@ cl::opt SSRConflictFreeOnly( cl::opt SSRInline( "ssr-inline", cl::init(false), - cl::desc("allow functions that contain SSR streams to be inlined") + cl::desc("Allow functions that contain SSR streams to be inlined.") +); + +cl::opt SSRNoBarrier( + "ssr-no-barrier", + cl::init(false), + cl::desc("Disable the insertion of an spinning loop that waits for the stream to be done before it is dissabled (potentially unsafe).") ); } //end of namespace llvm @@ -186,17 +192,19 @@ struct ConflictTree { void clobberRegisters(ArrayRef regs, IRBuilder<> &builder){ std::string constraints = ""; - for (const std::string r : regs){ - constraints = "~{" + r + "}," + constraints; + if (regs.size() > 0u) { + constraints = "~{" + regs[0] + "}"; + for (unsigned i = 1u; i < regs.size(); i++) { + constraints = "~{" + regs[i] + "}," + constraints; + } } - errs()<dump(); } void copyPHIsFromPred(BasicBlock *BB){ @@ -384,22 +392,6 @@ Value *GenerateTCDMCheck(ExpandedAffAcc &E, Instruction *Point) { return builder.CreateAnd(c1, c2, "tcdm.check"); } -Value *generatePopAsm(Instruction *InsertBefore, unsigned dmid) { - IRBuilder<> builder(InsertBefore); - FunctionType *fty = FunctionType::get(Type::getDoubleTy(builder.getContext()), false); - std::string inst = formatv("fmv.d $0, ft{0}\0A", dmid); - InlineAsm *Pop = InlineAsm::get(fty, inst, "=f", true); - return builder.CreateCall(Pop, ArrayRef(), "ssr.pop"); -} - -void generatePushAsm(Instruction *InsertBefore, unsigned dmid, Value *Val){ - IRBuilder<> builder(InsertBefore); - FunctionType *fty = FunctionType::get(Type::getVoidTy(builder.getContext()), ArrayRef(Type::getDoubleTy(builder.getContext())), false); - std::string inst = formatv("fmv.d ft{0}, $0\0A", dmid); - InlineAsm *Push = InlineAsm::get(fty, inst, "f", true); - builder.CreateCall(Push, ArrayRef(Val)); -} - void GenerateSSRSetup(ExpandedAffAcc &E, unsigned dmid, Instruction *Point){ assert(Point); Module *mod = Point->getModule(); @@ 
-428,18 +420,27 @@ void GenerateSSRSetup(ExpandedAffAcc &E, unsigned dmid, Instruction *Point){ builder.CreateCall(SSRBoundStrideSetup->getFunctionType(), SSRBoundStrideSetup, ArrayRef(bsargs))->dump(); } - unsigned n_reps = 0U; + unsigned n_reps = 0u; + std::string s = formatv("f{0}", dmid); + ArrayRef regs(s); if (isStore){ + Function *SSRPush = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_push); for (Instruction *I : E.Access->getAccesses()){ - generatePushAsm(I, dmid, cast(I)->getValueOperand()); - I->dump(); + std::array pusharg = {DMid, cast(I)->getValueOperand()}; + builder.SetInsertPoint(I); + if (SSR_CLOBBER_REGS_FOR_PUSH) clobberRegisters(regs, builder); + auto *C = builder.CreateCall(SSRPush->getFunctionType(), SSRPush, ArrayRef(pusharg)); + C->dump(); I->dump(); I->eraseFromParent(); n_reps++; } }else{ + Function *SSRPop = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_pop); + std::array poparg = {DMid}; for (Instruction *I : E.Access->getAccesses()){ builder.SetInsertPoint(I); - auto *V = generatePopAsm(I, dmid); + auto *V = builder.CreateCall(SSRPop->getFunctionType(), SSRPop, ArrayRef(poparg), "ssr.pop"); + if (SSR_CLOBBER_REGS_FOR_POP) clobberRegisters(regs, builder); V->dump(); I->dump(); BasicBlock::iterator ii(I); ReplaceInstWithValue(I->getParent()->getInstList(), ii, V); @@ -463,58 +464,43 @@ void GenerateSSRSetup(ExpandedAffAcc &E, unsigned dmid, Instruction *Point){ //NOTE: this starts the prefetching ==> always needs to be inserted AFTER bound/stride and repetition setups !!! builder.CreateCall(SSRSetup->getFunctionType(), SSRSetup, ArrayRef(args))->dump(); - //create an SSR barrier in exit block. TODO: needed esp. for write streams? 
- //builder.SetInsertPoint(Access->getLoop()->getExitBlock()->getFirstNonPHI()); - //Function *SSRBarrier = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_barrier); - //std::array barrargs = {DMid}; - //builder.CreateCall(SSRBarrier->getFunctionType(), SSRBarrier, ArrayRef(barrargs))->dump(); return; } -///generates SSR enable & disable calls -void generateSSREnDisAsm(Instruction *PhT, Instruction *ExP){ - constexpr unsigned num_ssr = 3u; //FIXME: make use of NUM_SSR +/// generate a SSR Barrier intrinsic call before InsertBefore +void generateSSRBarrier(Instruction *InsertBefore, unsigned dmid) { + IRBuilder<> builder(InsertBefore); + Function *Barrier = Intrinsic::getDeclaration(InsertBefore->getModule(), Intrinsic::riscv_ssr_barrier); + for (unsigned dmid : activeDMIds) { + builder.CreateCall(Barrier->getFunctionType(), Barrier, ConstantInt::get(Type::getInt32Ty(builder.getContext()), dmid))->dump(); + } +} - IRBuilder<> builder(PhT); // ----------- in preheader - auto &ctxt = builder.getContext(); - Type *Double = Type::getDoubleTy(ctxt); - std::vector structTys; - for (unsigned i = 0; i < num_ssr; i++) structTys.push_back(Double); - Type *ArrTy = StructType::get(ctxt, structTys); //auto *ArrTy = ArrayType::get(Double, num_ssr); //VectorType::get(Double, num_ssr, false); - std::vector argtypes; - for (unsigned i = 0u; i < num_ssr; i++) argtypes.push_back(Double); - std::string constraints = "={f0},={f1},={f2},{f0},{f1},{f2},~{memory}"; - FunctionType* fty = FunctionType::get(ArrTy, argtypes, false); - InlineAsm *En = InlineAsm::get(fty, "csrsi 0x7C0, 1\0A", constraints, true); - En->dump(); - std::vector args; - for (unsigned i = 0u; i < num_ssr; i++) args.push_back(UndefValue::get(Double)); - CallInst *Dep = builder.CreateCall(En, args, "ssr.enable.dep"); - Dep->dump(); +/// generates SSR enable & disable calls +void generateSSREnDis(Instruction *PhP, Instruction *ExP){ + IRBuilder<> builder(PhP); // ----------- in preheader + Module *mod = 
PhP->getParent()->getModule(); + Function *SSREnable = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_enable); + builder.CreateCall(SSREnable->getFunctionType(), SSREnable, ArrayRef()); + std::vector regs; + for (unsigned r = 0u; r < NUM_SSR; r++){ + regs.push_back(std::string(formatv("f{0}", r))); + } + //create inline asm that clobbers ft0-2 to make sure none of them are reordered to before ssr enable / after ssr disable + //equivalent to asm volatile ("":::"ft0", "ft1", "ft2"); + clobberRegisters(ArrayRef(regs), builder); builder.SetInsertPoint(ExP); // ----------- in exit block - std::vector deps; - for (unsigned i = 0u; i < num_ssr; i++) - deps.push_back(builder.CreateExtractValue(Dep, i, formatv("dep.{0}", i))); - InlineAsm *Dis = InlineAsm::get(fty, "csrci 0x7C0, 1\0A", constraints, true); - builder.CreateCall(Dis, deps, "ssr.disable.dep")->dump(); - + clobberRegisters(ArrayRef(regs), builder); + + Function *SSRDisable = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_disable); + builder.CreateCall(SSRDisable->getFunctionType(), SSRDisable, ArrayRef()); + errs()<<"generated ssr_enable and ssr_disable\n"; return; } -/* -void generateFrepPragma(ArrayRef points) { - if (points.empty()) return; - IRBuilder<> builder(points.front()->getContext()); - for (Instruction *P : points) { - builder.SetInsertPoint(P); - Function *FrepPragma = Intrinsic::getDeclaration(P->getModule(), Intrinsic::riscv_frep_infer); - builder.CreateCall(FrepPragma->getFunctionType(), FrepPragma, ArrayRef())->dump(); - } -}*/ - int getEstExpandCost(AffAcc *A, unsigned dim) { int cost = 0; cost += A->getBaseAddr(dim)->getExpressionSize(); @@ -642,10 +628,10 @@ void cloneAndSetup(Instruction *PhT, Instruction *ExP, Value *Cond, std::vector< unsigned dmid = 0u; for (auto &E : exp) { GenerateSSRSetup(E, dmid++, PhT); + if (!SSRNoBarrier) generateSSRBarrier(ExP, dmid); } - //generateSSREnDis(PhT, ExP); - generateSSREnDisAsm(PhT, ExP); + generateSSREnDis(PhT, ExP); } bool isValid(AffAcc 
*A, const Loop *L) { @@ -795,7 +781,6 @@ PreservedAnalyses SSRGenerationPass::run(Function &F, FunctionAnalysisManager &F if (SSRNoBoundCheck) errs()<<", ssr-no-bound-check"; if (SSRNoTCDMCheck) errs()<<", ssr-no-tcdm-check"; if (SSRConflictFreeOnly) errs()<<", ssr-conflict-free-only"; - //if (SSRInsertFrepPragma) errs()<<", ssr-insert-frep-pragma"; errs()<<"\n"; if (!InferSSR) return PreservedAnalyses::all(); @@ -871,26 +856,18 @@ PreservedAnalyses SSRGenerationPass::run(Function &F, FunctionAnalysisManager &F } /* -std::string s = formatv("f{0}", dmid); - ArrayRef regs(s); + unsigned n_reps = 0U; if (isStore){ - Function *SSRPush = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_push); for (Instruction *I : E.Access->getAccesses()){ - std::array pusharg = {DMid, cast(I)->getValueOperand()}; - builder.SetInsertPoint(I); - if (SSR_CLOBBER_REGS_FOR_PUSH) clobberRegisters(regs, builder); - auto *C = builder.CreateCall(SSRPush->getFunctionType(), SSRPush, ArrayRef(pusharg)); - C->dump(); I->dump(); + generatePushAsm(I, dmid, cast(I)->getValueOperand()); + I->dump(); I->eraseFromParent(); n_reps++; } }else{ - Function *SSRPop = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_pop); - std::array poparg = {DMid}; for (Instruction *I : E.Access->getAccesses()){ builder.SetInsertPoint(I); - auto *V = builder.CreateCall(SSRPop->getFunctionType(), SSRPop, ArrayRef(poparg), "ssr.pop"); - if (SSR_CLOBBER_REGS_FOR_POP) clobberRegisters(regs, builder); + auto *V = generatePopAsm(I, dmid); V->dump(); I->dump(); BasicBlock::iterator ii(I); ReplaceInstWithValue(I->getParent()->getInstList(), ii, V); @@ -898,28 +875,52 @@ std::string s = formatv("f{0}", dmid); } } - ///generates SSR enable & disable calls -void generateSSREnDis(Instruction *PhP, Instruction *ExP){ - IRBuilder<> builder(PhP); // ----------- in preheader - Module *mod = PhP->getParent()->getModule(); - Function *SSREnable = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_enable); - 
builder.CreateCall(SSREnable->getFunctionType(), SSREnable, ArrayRef()); + ///generates SSR enable & disable calls +void generateSSREnDisAsm(Instruction *PhT, Instruction *ExP){ + constexpr unsigned num_ssr = 3u; //FIXME: make use of NUM_SSR - std::vector regs; - for (unsigned r = 0u; r < NUM_SSR; r++){ - regs.push_back(std::string(formatv("f{0}", r))); - } - //create inline asm that clobbers ft0-2 to make sure none of them are reordered to before ssr enable / after ssr disable - //equivalent to asm volatile ("":::"ft0", "ft1", "ft2"); - clobberRegisters(ArrayRef(regs), builder); + IRBuilder<> builder(PhT); // ----------- in preheader + auto &ctxt = builder.getContext(); + Type *Double = Type::getDoubleTy(ctxt); + std::vector structTys; + for (unsigned i = 0; i < num_ssr; i++) structTys.push_back(Double); + Type *ArrTy = StructType::get(ctxt, structTys); //auto *ArrTy = ArrayType::get(Double, num_ssr); //VectorType::get(Double, num_ssr, false); + std::vector argtypes; + for (unsigned i = 0u; i < num_ssr; i++) argtypes.push_back(Double); + std::string constraints = "={f0},={f1},={f2},{f0},{f1},{f2},~{memory}"; + FunctionType* fty = FunctionType::get(ArrTy, argtypes, false); + InlineAsm *En = InlineAsm::get(fty, "csrsi 0x7C0, 1\0A", constraints, true); + En->dump(); + std::vector args; + for (unsigned i = 0u; i < num_ssr; i++) args.push_back(UndefValue::get(Double)); + CallInst *Dep = builder.CreateCall(En, args, "ssr.enable.dep"); + Dep->dump(); builder.SetInsertPoint(ExP); // ----------- in exit block - clobberRegisters(ArrayRef(regs), builder); - Function *SSRDisable = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_disable); - builder.CreateCall(SSRDisable->getFunctionType(), SSRDisable, ArrayRef()); - + std::vector deps; + for (unsigned i = 0u; i < num_ssr; i++) + deps.push_back(builder.CreateExtractValue(Dep, i, formatv("dep.{0}", i))); + InlineAsm *Dis = InlineAsm::get(fty, "csrci 0x7C0, 1\0A", constraints, true); + builder.CreateCall(Dis, deps, 
"ssr.disable.dep")->dump(); + errs()<<"generated ssr_enable and ssr_disable\n"; return; } + +Value *generatePopAsm(Instruction *InsertBefore, unsigned dmid) { + IRBuilder<> builder(InsertBefore); + FunctionType *fty = FunctionType::get(Type::getDoubleTy(builder.getContext()), false); + std::string inst = formatv("fmv.d $0, ft{0}\0A", dmid); + InlineAsm *Pop = InlineAsm::get(fty, inst, "=f", true); + return builder.CreateCall(Pop, ArrayRef(), "ssr.pop"); +} + +void generatePushAsm(Instruction *InsertBefore, unsigned dmid, Value *Val){ + IRBuilder<> builder(InsertBefore); + FunctionType *fty = FunctionType::get(Type::getVoidTy(builder.getContext()), ArrayRef(Type::getDoubleTy(builder.getContext())), false); + std::string inst = formatv("fmv.d ft{0}, $0\0A", dmid); + InlineAsm *Push = InlineAsm::get(fty, inst, "f", true); + builder.CreateCall(Push, ArrayRef(Val)); +} */ \ No newline at end of file From e7057932f084766cfa4207072cafd9bca4b3febb Mon Sep 17 00:00:00 2001 From: thrupf Date: Mon, 27 Jun 2022 09:35:35 +0200 Subject: [PATCH 38/47] fixed backend, no inline asm --- llvm/include/llvm/IR/IntrinsicsRISCV.td | 7 +- llvm/lib/Passes/PassBuilder.cpp | 2 + llvm/lib/Target/RISCV/CMakeLists.txt | 1 + llvm/lib/Target/RISCV/RISCV.h | 3 + llvm/lib/Target/RISCV/RISCVExpandSSRInsts.cpp | 279 ++++--------- .../RISCV/RISCVExpandSSRInstsPostRegAlloc.cpp | 371 ++++++++++++++++++ llvm/lib/Target/RISCV/RISCVInstrInfoXssr.td | 18 + llvm/lib/Target/RISCV/RISCVTargetMachine.cpp | 2 + llvm/lib/Transforms/SSR/SSRGeneration.cpp | 64 +-- llvm/lib/Transforms/SSR/SSRInference.cpp | 1 + 10 files changed, 517 insertions(+), 231 deletions(-) create mode 100644 llvm/lib/Target/RISCV/RISCVExpandSSRInstsPostRegAlloc.cpp diff --git a/llvm/include/llvm/IR/IntrinsicsRISCV.td b/llvm/include/llvm/IR/IntrinsicsRISCV.td index 835a535a9be48..f9ea2c11c6089 100644 --- a/llvm/include/llvm/IR/IntrinsicsRISCV.td +++ b/llvm/include/llvm/IR/IntrinsicsRISCV.td @@ -1434,19 +1434,18 @@ let TargetPrefix = "riscv" 
in { RISCVSSRIntrinsic; // The `Throws` attribute ensures that the push/pop don't get removed from loops - // by the LICM pass - // TODO: Is there another way to do this? + // by the LICM pass ==> not needed, LICM is only problem if readonly ==> make pop read and write (which is default) def int_riscv_ssr_push : GCCBuiltin<"__builtin_ssr_push">, Intrinsic<[], [llvm_i32_ty, llvm_double_ty], - [IntrWriteMem, IntrHasSideEffects, Throws, ImmArg>]>, + [IntrWillReturn, IntrWriteMem, IntrHasSideEffects, ImmArg>]>, RISCVSSRIntrinsic; def int_riscv_ssr_pop : GCCBuiltin<"__builtin_ssr_pop">, Intrinsic<[llvm_double_ty], [llvm_i32_ty], - [IntrReadMem, IntrHasSideEffects, Throws, ImmArg>]>, + [IntrWillReturn, ImmArg>]>, //use ReadWrite instead of throw to avoid licm RISCVSSRIntrinsic; def int_riscv_ssr_enable diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp index e54e4c6b1a37d..0e7847d713427 100644 --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -819,6 +819,8 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level, LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap), EnableMSSALoopDependency, /*UseBlockFrequencyInfo=*/true, DebugLogging)); + FPM.addPass(ReassociatePass()); //want to do this again after loop unrolling. FIXME: probably worth it right? 
+ if (PTO.Coroutines) FPM.addPass(CoroElidePass()); diff --git a/llvm/lib/Target/RISCV/CMakeLists.txt b/llvm/lib/Target/RISCV/CMakeLists.txt index c822929f94770..ce443473f15f6 100644 --- a/llvm/lib/Target/RISCV/CMakeLists.txt +++ b/llvm/lib/Target/RISCV/CMakeLists.txt @@ -27,6 +27,7 @@ add_llvm_target(RISCVCodeGen RISCVExpandAtomicPseudoInsts.cpp RISCVExpandPseudoInsts.cpp RISCVExpandSSRInsts.cpp + RISCVExpandSSRInstsPostRegAlloc.cpp RISCVExpandSDMAInsts.cpp RISCVFrameLowering.cpp RISCVInstrInfo.cpp diff --git a/llvm/lib/Target/RISCV/RISCV.h b/llvm/lib/Target/RISCV/RISCV.h index 48d0c7f164058..318975c9eb1c7 100644 --- a/llvm/lib/Target/RISCV/RISCV.h +++ b/llvm/lib/Target/RISCV/RISCV.h @@ -45,6 +45,9 @@ void initializeRISCVMergeBaseOffsetOptPass(PassRegistry &); FunctionPass *createRISCVExpandPseudoPass(); void initializeRISCVExpandPseudoPass(PassRegistry &); +FunctionPass *createRISCVExpandSSRPostRegAllocPass(); +void initializeRISCVExpandSSRPostRegAllocPass(PassRegistry &); + FunctionPass *createRISCVExpandAtomicPseudoPass(); void initializeRISCVExpandAtomicPseudoPass(PassRegistry &); diff --git a/llvm/lib/Target/RISCV/RISCVExpandSSRInsts.cpp b/llvm/lib/Target/RISCV/RISCVExpandSSRInsts.cpp index 9800c35be86c5..3fe6cc662d3fe 100644 --- a/llvm/lib/Target/RISCV/RISCVExpandSSRInsts.cpp +++ b/llvm/lib/Target/RISCV/RISCVExpandSSRInsts.cpp @@ -62,11 +62,6 @@ using namespace llvm; #define DEBUG_TYPE "riscv-ssr" -/// Command line options -static cl::opt - SSRRegisterMerge("ssr-noregmerge", cl::Hidden, - cl::desc("Disable the merging of SSR registers in other instructions")); - #define RISCV_EXPAND_SSR_NAME "RISCV SSR pseudo instruction expansion pass" #define NUM_SSR 3 @@ -78,12 +73,6 @@ class RISCVExpandSSR : public MachineFunctionPass { const RISCVInstrInfo *TII; static char ID; - /// Parameters for the register merging pass - struct RegisterMergingPreferences { - /// enable the register merging - bool Enable; - }; - RISCVExpandSSR() : MachineFunctionPass(ID) { 
initializeRISCVExpandSSRPass(*PassRegistry::getPassRegistry()); } @@ -96,10 +85,10 @@ class RISCVExpandSSR : public MachineFunctionPass { const MachineFunction *MF; RISCVMachineFunctionInfo *RVFI; - bool Enabled; + std::vector MoveLoads; + std::vector MoveStores; bool expandMBB(MachineBasicBlock &MBB); - void mergePushPop(MachineBasicBlock &MBB); bool expandMI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, MachineBasicBlock::iterator &NextMBBI); bool expandSSR_Setup(MachineBasicBlock &MBB, @@ -119,8 +108,7 @@ class RISCVExpandSSR : public MachineFunctionPass { bool expandSSR_Barrier(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, MachineBasicBlock::iterator &NextMBBI); - - RISCVExpandSSR::RegisterMergingPreferences gatherRegisterMergingPreferences(); + void bundlePushPops(); }; char RISCVExpandSSR::ID = 0; @@ -140,17 +128,14 @@ bool RISCVExpandSSR::runOnMachineFunction(MachineFunction &MF) { TII = static_cast(MF.getSubtarget().getInstrInfo()); this->MF = &MF; this->RVFI = MF.getInfo(); - Enabled = false; + this->MoveLoads.empty(); + this->MoveStores.empty(); bool Modified = false; for (auto &MBB : MF) Modified |= expandMBB(MBB); - // Run over MF again to merge SSR pops/pushs into instruction uses - RISCVExpandSSR::RegisterMergingPreferences RMP = gatherRegisterMergingPreferences(); - if(RMP.Enable && RVFI->getUsedSSR()) - for (auto &MBB : MF) - mergePushPop(MBB); + bundlePushPops(); //bundle push/pops with their users /// "Forcefully" add all SSR registers as live-in to all MBB in this MF if(Modified) { @@ -215,17 +200,6 @@ bool RISCVExpandSSR::expandMI(MachineBasicBlock &MBB, return expandSSR_Barrier(MBB, MBBI, NextMBBI); } - // Prevent excessive live-ins, they pose a problem with multiple SSR regions - // in a single function. 
Adding SSR regs to live ins in push/pop should suffice - // for now, but there might be edge cases - - // if(Enabled) { - // // mark the SSR registers reserved in this BB - // unsigned ssrEnabledMask = 0; - // for (unsigned n = 0; n < NUM_SSR; ++n) - // MBB.addLiveIn(getSSRFtReg(n)); - // } - return false; } @@ -276,30 +250,24 @@ bool RISCVExpandSSR::expandSSR_PushPop(MachineBasicBlock &MBB, LLVM_DEBUG(dbgs() << "-- Expanding SSR " << (isPop?"Pop":"Push") << "\n"); LLVM_DEBUG(dbgs() << " Using register " << R << " for SSR streamer "<get(RISCV::FSGNJ_D), MBBI->getOperand(ssrValIdx).getReg()) - .addReg(R, 0) - .addReg(R, 0); - // BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::COPY), MBBI->getOperand(ssrValIdx).getReg()) - // .addReg(R, 0); + // Insert a "loading move" this is like a normal move but has side effects + Register valR = MBBI->getOperand(ssrValIdx).getReg(); + MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII->get(RISCV::PseudoLoadMove), valR).addReg(R, 0).getInstr(); + MBBI->eraseFromParent(); // The pseudo instruction is gone now. + MI->getOperand(0).setIsDef(); + this->MoveLoads.push_back(MI); } else { - // Build a copy instruction that moves the value from the register passed as - // argument to the ssr data register (R) - BuildMI(MBB, MBBI, DL, TII->get(RISCV::FSGNJ_D), R) - .addReg(MBBI->getOperand(ssrValIdx).getReg()) - .addReg(MBBI->getOperand(ssrValIdx).getReg()); - // BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::COPY), R) - // .addReg(MBBI->getOperand(ssrValIdx).getReg()); + Register valR = MBBI->getOperand(ssrValIdx).getReg(); + // Insert a "storing move" this is like a normal move but has side effects + MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII->get(RISCV::PseudoStoreMove), R).addReg(valR).getInstr(); + MBBI->eraseFromParent(); // The pseudo instruction is gone now. + MI->getOperand(0).setIsDef(); + this->MoveStores.push_back(MI); } MBB.addLiveIn(R); - MBBI->eraseFromParent(); // The pseudo instruction is gone now. 
return true; } @@ -415,7 +383,6 @@ bool RISCVExpandSSR::expandSSR_EnDis(MachineBasicBlock &MBB, MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); LLVM_DEBUG(dbgs() << "-- Expanding SSR " << (isEnable ? "Enable" : "Disable") << "\n"); - Enabled = isEnable; // emit a csrsi/csrci call to the SSR location if(isEnable) { @@ -483,93 +450,37 @@ bool RISCVExpandSSR::expandSSR_Barrier(MachineBasicBlock &MBB, return true; } -void RISCVExpandSSR::mergePushPop(MachineBasicBlock &MBB) { - //SmallSet virtRegs[NUM_SSR]; - const TargetRegisterInfo *TRI = MBB.getParent()->getRegInfo().getTargetRegisterInfo(); - - Register ssr_regs[NUM_SSR]; - for(unsigned ssr_no = 0; ssr_no < NUM_SSR; ++ssr_no) ssr_regs[ssr_no] = getSSRFtReg(ssr_no); - - for (auto ssr_reg : ssr_regs){ - SmallSet modified; - for (auto MI = MBB.rbegin(); MI != MBB.rend(); ){ //go from back to front - auto PMI = std::next(MI); //this is prev bc reverse iterator - if(MI->getOpcode() == RISCV::FSGNJ_D){ - if (MI->getOperand(1).getReg() == ssr_reg && MI->getOperand(2).getReg() == ssr_reg && MI->getOperand(0).isReg()){ //this was an SSR pop - Register r = MI->getOperand(0).getReg(); //register to replace - bool replacedAll = true; //if there are no uses, can replace too - SmallVector replacements; - for (auto MI2 = MBB.rbegin(); replacedAll && MI2 != MI; MI2++){ - for (auto Op = MI2->operands_begin(); replacedAll && Op != MI2->operands_end(); ++Op){ - if (Op->isReg() && Op->getReg() == r){ - replacedAll = replacedAll && modified.find(&*MI2) == modified.end(); - replacements.push_back(&*MI2); - } - } - } - if (replacedAll) { - MBB.addLiveIn(ssr_reg); - MI->eraseFromParentAndMarkDBGValuesForRemoval(); - for (MachineInstr *I : replacements){ - I->dump(); - I->substituteRegister(r, ssr_reg, 0, *TRI); - I->dump(); - modified.insert(I); - } - } - }else if(MI->getOperand(0).getReg() == ssr_reg){ - auto Op1 = MI->getOperand(1), Op2 = MI->getOperand(2); - //TODO: turns out the following condition is almost never true ==> 
use live-ness analysis instead of .isKill() ? - if (Op1.isReg() && Op2.isReg() && Op1.getReg() == Op2.getReg() && Op1.isKill() && Op2.isKill()){ //because Op is kill will not be used later - Register r = Op1.getReg(); - MachineOperand *O = nullptr; - //find the most recent operand that sets this reg - for (auto MI2 = std::next(MI); !O && MI2 != MBB.rend(); ++MI2){ - //FIXME: first operand is always dest operand right? otherwise require def (like below) or query llvm which operand is dest (how?) - if (MI2->getNumOperands() == 0u) continue; - MachineOperand *Op = &*MI2->operands_begin(); - MI2->dump(); - if (Op->isReg() && Op->getReg() == r){ - O = Op; - } - /*for (auto Op = MI2->operands_begin(); Op != MI2->operands_end(); ++Op){ - if (Op->isReg() && Op->getReg() == r){ - done = true; - if (Op->isDef() || SKIP_DEF_CHECK) O = &*Op; - break; - } - }*/ - } - if (O){ - errs()<<"push regmerge: \n"; - O->getParent()->dump(); - O->setReg(ssr_reg); - O->getParent()->dump(); - MI->eraseFromParentAndMarkDBGValuesForRemoval(); - } - } - } +void RISCVExpandSSR::bundlePushPops() { + //TODO: bundle what is regmerged after reg-alloc to make sure that the FADD/FMUL/FMUL/etc.. 
do not slip past ssr_disable + /* + DenseMap> bundles; + //pops: + for (MachineInstr *MI : this->MoveLoads) { + if (!MI) continue; + MachineInstr *SingleUser = getUniqueUser(MI, MI->getOperand(0).getReg()); + if (SingleUser && SingleUser->getParent() == MI->getParent()) { + MI->moveBefore(SingleUser); //we pray that there was no reordering until now that moved SingleUser after the SSRDisable + auto b = bundles.find(SingleUser); + if (b == bundles.end()) { + b = bundles.insert(std::make_pair(SingleUser, std::make_pair(SingleUser, SingleUser))).first; } - MI = PMI; + if (b->getSecond().first == SingleUser) b->getSecond().first = MI; //if begin of bundle was SingleUser, set to MI } } - MBB.sortUniqueLiveIns(); -} - -/// Gather parameters for the register merging -RISCVExpandSSR::RegisterMergingPreferences RISCVExpandSSR::gatherRegisterMergingPreferences() { - RISCVExpandSSR::RegisterMergingPreferences RMP; - - // set up defaults - RMP.Enable = true; - - // read user - if (SSRRegisterMerge.getNumOccurrences() > 0) - RMP.Enable = !SSRRegisterMerge; - - LLVM_DEBUG(dbgs() << "RMP Enable "<MoveStores) { + Register valR = MI->getOperand(1).getReg(); + MachineInstr *Pred = MI->getPrevNode(); + bool doesDefvalR = false; + for (auto &MOP : Pred->defs()) doesDefvalR |= MOP.isReg() && MOP.getReg() == valR; + if (doesDefvalR && MI == getUniqueUser(Pred, valR)) { + auto b = bundles.find(Pred); + if (b == bundles.end()) { + b = bundles.insert(std::make_pair(Pred, std::make_pair(Pred, Pred))).first; + } + if (b->getSecond().second == Pred) b->getSecond().second = MI; + } + }*/ } } // end of anonymous namespace @@ -582,71 +493,41 @@ FunctionPass *createRISCVExpandSSRPass() { return new RISCVExpandSSR(); } } // end of namespace llvm - -/* OLD VERSION OF REGMERGE -// First pass: Detect moves to or from SSR registers - for (auto MI = MBB.begin() ; MI != MBB.end() ; ) { - MachineBasicBlock::iterator NMI = std::next(MI); - - LLVM_DEBUG(dbgs()<<"Analyzing: "<<*MI<<"\n"); - - // detect an 
emitted pop and add assignment (virtual_reg, ssr_read) to list - if(MI->getOpcode() == RISCV::FSGNJ_D) { - LLVM_DEBUG(dbgs()<<"Found FSGNJ_D, Op 0: " << MI->getOperand(1).getReg() << " Op1: " << MI->getOperand(2).getReg() << "\n"); - - // look for each streamer register - for(unsigned ssr_no = 0; ssr_no < NUM_SSR; ++ssr_no) { - // check for pop - if(MI->getOperand(1).getReg() == ssr_regs[ssr_no] && MI->getOperand(2).getReg() == ssr_regs[ssr_no]) { - LLVM_DEBUG(dbgs()<<" pop: both operands from SSR"<< ssr_no <<"\n"); - // append virtual register to list of assigned virtuals - LLVM_DEBUG(dbgs()<<" append: "<< MI->getOperand(0).getReg() <<"\n"); - virtRegs[ssr_no].insert(MI->getOperand(0).getReg()); - // remove operation - MI->eraseFromParent(); - break; - } - // TODO: check for push - else if(MI->getOperand(0).getReg() == ssr_regs[ssr_no]) { - // This is non-trivial because a register might be used elsewhere, therefore the entire MBB - // must be analyzed and a merge can only be made, if the register is written once - // LLVM_DEBUG(dbgs()<<" push: operand 0 from SSR"<< ssr_no <<"\n"); - // // append virtual register to list of assigned virtuals - // LLVM_DEBUG(dbgs()<<" append: "<< MI->getOperand(1).getReg() <<"\n"); - // virtRegs[ssr_no].insert(MI->getOperand(1).getReg()); - // // remove operation - // MI->eraseFromParent(); - break; +/* +bundle.second = bundle.second->getNextNode(); //make end of bundle exclusive bound + DenseSet regs; + errs()<<"beg bundle\n"; + for (MachineInstr *MI = bundle.first; MI != bundle.second; MI = MI->getNextNode()) MI->dump(); + errs()<<"end bundle\n"; + MachineInstr &first = *bundle.first; + auto BMI = BuildMI(*first.getParent(), first.getIterator(), first.getDebugLoc(), TII->get(RISCV::BUNDLE)); + for (MachineInstr *MI = bundle.first; MI != bundle.second; MI = MI->getNextNode()) { + MI->dump(); + for (auto &MOP : MI->operands()) { + if (!MOP.isReg()) continue; + Register reg = MOP.getReg(); + if (regs.find(reg) != regs.end()) 
continue; + regs.insert(reg); + bool isInternal = false; + for (auto *MI2 = MI->getNextNode(); !isInternal && MI2 != bundle.second; MI2 = MI2->getNextNode()) { + for (const auto &MOP2 : MI2->operands()) { + isInternal |= MOP2.isReg() && MOP2.getReg() == reg; + } } - } - } - MI = NMI; - } - - // DBG - for(unsigned ssr_no = 0; ssr_no < NUM_SSR; ++ssr_no) { - for (auto iter = virtRegs[ssr_no].begin() ; iter != virtRegs[ssr_no].end() ; ++iter) - LLVM_DEBUG(dbgs() << "virtregs["<operands_begin() ; operand != MI->operands_end() ; ++operand) { - if(!operand->isReg()) continue; - // check if operand is in any SSR list - for(unsigned ssr_no = 0; ssr_no < NUM_SSR; ++ssr_no) { - if(virtRegs[ssr_no].contains(operand->getReg())) { - LLVM_DEBUG(dbgs() << "Found use of operand " << operand->getReg() << " ssr: " << ssr_no << " in inst " << MI->getOpcode() << "\n"); - // substitute with SSR register - MI->substituteRegister(operand->getReg(), ssr_regs[ssr_no], 0, *TRI); - // guard this block and add ssr regs to live in - MBB.addLiveIn(ssr_regs[ssr_no]); + MOP.dump(); + errs()<<"is internal = "<dump(); + */ \ No newline at end of file diff --git a/llvm/lib/Target/RISCV/RISCVExpandSSRInstsPostRegAlloc.cpp b/llvm/lib/Target/RISCV/RISCVExpandSSRInstsPostRegAlloc.cpp new file mode 100644 index 0000000000000..1c1e44b1b0c31 --- /dev/null +++ b/llvm/lib/Target/RISCV/RISCVExpandSSRInstsPostRegAlloc.cpp @@ -0,0 +1,371 @@ +//===-- RISCVExpandSSRPostRegAllocInsts.cpp - Expand SSR pseudo instructions ---------===// +// +// Copyright 2021 ETH Zurich, University of Bologna. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains a pass that expands SSR pseudo instructions into target +// instructions. 
This pass should be run after register allocation +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// The SSRs are configured in a memory-mapped address space accessible through +// the SCFGW(I)/SCFGR(I) instructions. The (I)mmediate instructions take the +// address as an immediate. The Address map is as follows: +// +// | Word| Hex | reg | +// |-----|------|------------| +// | 0 | 0x00 | status | +// | 1 | 0x01 | repeat | +// | 2 | 0x02 | Bound 0 | +// | 3 | 0x03 | Bound 1 | +// | 4 | 0x04 | Bound 2 | +// | 5 | 0x05 | Bound 3 | +// | 6 | 0x06 | Stride 0 | +// | 7 | 0x07 | Stride 1 | +// | 8 | 0x08 | Stride 2 | +// | 9 | 0x09 | Stride 3 | +// | | | _reserved_ | +// | 24 | 0x18 | Rptr 0 | +// | 25 | 0x19 | Rptr 1 | +// | 26 | 0x1a | Rptr 2 | +// | 27 | 0x1b | Rptr 3 | +// | 28 | 0x1c | Wptr 0 | +// | 29 | 0x1d | Wptr 1 | +// | 30 | 0x1e | Wptr 2 | +// | 31 | 0x1f | Wptr 3 | +// +// The data mover is selected in the lower 5 bits, the register offset is encoded +// in the upper 7 bits. 
The value passed to scfgX is therefore +// addr = dm + reg << 5 +// +// scfgw rs1 rs2 # rs1=value rs2=addr +//===----------------------------------------------------------------------===// + +#include "RISCV.h" +#include "RISCVInstrInfo.h" +#include "RISCVTargetMachine.h" +#include "RISCVMachineFunctionInfo.h" + +#include "llvm/CodeGen/LivePhysRegs.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/LowLevelType.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/Support/CommandLine.h" + +using namespace llvm; + +#define DEBUG_TYPE "riscv-ssr" + +namespace llvm { + /// Command line options + cl::opt SSRNoRegisterMerge("ssr-no-regmerge", cl::init(false), + cl::desc("Disable the merging of SSR registers in other instructions")); +} + +#define RISCV_EXPAND_SSR_POST_REG_ALLOC_NAME "RISCV SSR pseudo instruction expansion pass post reg alloc" + +#define NUM_SSR 3 + +namespace { + +class RISCVExpandSSRPostRegAlloc : public MachineFunctionPass { +public: + const RISCVInstrInfo *TII; + static char ID; + + RISCVExpandSSRPostRegAlloc() : MachineFunctionPass(ID) { + initializeRISCVExpandSSRPostRegAllocPass(*PassRegistry::getPassRegistry()); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + + StringRef getPassName() const override { return RISCV_EXPAND_SSR_POST_REG_ALLOC_NAME; } + +private: + + const MachineFunction *MF; + RISCVMachineFunctionInfo *RVFI; + bool Enabled; + + bool expandMBB(MachineBasicBlock &MBB); + bool mergePushPop(MachineBasicBlock &MBB); + bool expandMI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + MachineBasicBlock::iterator &NextMBBI); + bool expandSSR_StoreLoadMove(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI); +}; + +char RISCVExpandSSRPostRegAlloc::ID = 0; + +static Register getSSRFtReg(unsigned streamer) { + unsigned AssignedReg = RISCV::F0_D + streamer; + // Advance the iterator to the assigned register until the valid + // register is 
found + const TargetRegisterClass *RC = &RISCV::FPR64RegClass; + TargetRegisterClass::iterator I = RC->begin(); + for (; *I != AssignedReg; ++I) + assert(I != RC->end() && "AssignedReg should be a member of provided RC"); + return Register(*I); +} + +bool RISCVExpandSSRPostRegAlloc::runOnMachineFunction(MachineFunction &MF) { + TII = static_cast(MF.getSubtarget().getInstrInfo()); + this->MF = &MF; + this->RVFI = MF.getInfo(); + + bool Modified = false; + for (auto &MBB : MF) Modified |= expandMBB(MBB); + + if (SSRNoRegisterMerge) errs()<<"regmerge disabled \n"; + if (!SSRNoRegisterMerge && Modified){ + for (auto &MBB : MF) mergePushPop(MBB); + } + + return Modified; +} + +bool RISCVExpandSSRPostRegAlloc::expandMBB(MachineBasicBlock &MBB) { + bool Modified = false; + + MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); + while (MBBI != E) { + MachineBasicBlock::iterator NMBBI = std::next(MBBI); + Modified |= expandMI(MBB, MBBI, NMBBI); + MBBI = NMBBI; + } + MBB.sortUniqueLiveIns(); + + return Modified; +} + +bool RISCVExpandSSRPostRegAlloc::expandMI(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + MachineBasicBlock::iterator &NextMBBI) { + switch (MBBI->getOpcode()) { + case RISCV::PseudoStoreMove: + case RISCV::PseudoLoadMove: + return expandSSR_StoreLoadMove(MBB, MBBI); + default: + return false; + } +} + +bool RISCVExpandSSRPostRegAlloc::expandSSR_StoreLoadMove(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI) { + DebugLoc DL = MBBI->getDebugLoc(); + + Register src = MBBI->getOperand(0).getReg(); + Register dest = MBBI->getOperand(1).getReg(); + + BuildMI(MBB, MBBI, DL, TII->get(RISCV::FSGNJ_D), src) + .addReg(dest) + .addReg(dest); + + MBBI->eraseFromParent(); // The pseudo instruction is gone now. 
+ return true; +} + +/* +static MachineOperand *getUniqueUser(MachineBasicBlock::instr_iterator beg, MachineBasicBlock::instr_iterator end, Register valR) { + for (auto MII = beg; MII != end; ++MII) { + if (MII->isDebugInstr()) continue; + for (auto &MOP : MII->operands()){ + if (!MOP.isReg() || MOP.getReg() != valR) continue; + if (MOP.isKill()) return &MOP; + else return nullptr; + } + } + return nullptr; //cannot be sure, maybe there is a user in a later block? +} */ + +static MachineOperand *getUniqueUser ( + MachineBasicBlock::instr_iterator beg, + MachineBasicBlock::instr_iterator end, + MachineBasicBlock::instr_iterator realend, + Register valR) + { + MachineOperand *UseMOP = nullptr; + bool isPastEnd = false; + for (auto MII = beg; MII != realend; ++MII) { + isPastEnd |= MII == end; + if (MII->isDebugInstr()) continue; //skip debug instructions + bool definesValR = false; + for (auto &MOP : MII->operands()) { + if (!MOP.isReg() || MOP.getReg() != valR) continue; + //at this point we know MII accesses valR, with MOP, but maybe also other operands + definesValR |= MOP.isDef(); + if (!isPastEnd && !UseMOP && !MOP.isDef()) { + UseMOP = &MOP; //if UseMOP is not yet found and MOP does not redefine valR then MOP is the first Use + if (MOP.isKill()) return UseMOP; //if MOP kills valR then we can stop looking further and return + } + } + if (definesValR) { + return UseMOP; //if MII (re-)defines valR then we must have already found the Use before, (or we haven't in which case we return null) + } + } + //if we arrive at the end and have not found a redefinition or a kill, then we cannot be sure, whether valR is used after the realend ==> have to return nullptr + return nullptr; +} + +bool RISCVExpandSSRPostRegAlloc::mergePushPop(MachineBasicBlock &MBB) { + const TargetRegisterInfo *TRI = MBB.getParent()->getRegInfo().getTargetRegisterInfo(); + + Register ssr_regs[NUM_SSR]; + for(unsigned ssr_no = 0; ssr_no < NUM_SSR; ++ssr_no) ssr_regs[ssr_no] = getSSRFtReg(ssr_no); + 
+ bool Modified = false; + + for (auto ssr_reg : ssr_regs){ + SmallSet modified; + for (auto MI = MBB.rbegin().getInstrIterator(); MI != MBB.rend().getInstrIterator(); ){ //go from back to front + auto PMI = std::next(MI); //this is prev bc reverse iterator + if(MI->getOpcode() == RISCV::FSGNJ_D){ + if (MI->getOperand(1).getReg() == ssr_reg && MI->getOperand(2).getReg() == ssr_reg && MI->getOperand(0).isReg()){ //this was an SSR pop + //limit search range for regmerge if there is an ssr disable + MachineBasicBlock::instr_iterator rangeLimit = MI.getReverse(); + for (; rangeLimit != MBB.end().getInstrIterator(); ++rangeLimit){ + if (rangeLimit->getOpcode() == RISCV::CSRRCI + && rangeLimit->getOperand(1).isImm() + && rangeLimit->getOperand(1).getImm() == 0x7C0 + && rangeLimit->getOperand(2).getImm() == 1) + { + break; + } + } + Register r = MI->getOperand(0).getReg(); //register to replace + MachineOperand *MO = getUniqueUser(std::next(MI.getReverse()), rangeLimit, MI->getParent()->end().getInstrIterator(), r); + if (MO) { //if unique user exists + MachineInstr *MIUser = MO->getParent(); + if (MIUser && modified.find(MIUser) == modified.end()){ //if unique user exists and was not yet modified + MIUser->dump(); + for (auto &MOP : MIUser->operands()) { + if (MOP.isReg() && !MOP.isDef() && MOP.getReg() == r) MOP.setReg(ssr_reg); //replace all non-def uses of r with ssr_reg + } + MIUser->dump(); + MI->eraseFromBundle(); + modified.insert(MIUser); + } + } + }else if(MI->getOperand(0).getReg() == ssr_reg){ + if (MI->getOperand(1).isReg() + && MI->getOperand(2).isReg() + && MI->getOperand(1).getReg() == MI->getOperand(2).getReg()) + { //FIXME: use liveness analysis instead of .isKill() + Register R = MI->getOperand(1).getReg(); + MachineInstr *Pred = MI->getPrevNode(); + if (Pred && modified.find(Pred) == modified.end()){ //if Pred exists and is unmodified + bool predDefsR = false; + for (auto &MOP : Pred->defs()) { + predDefsR |= MOP.isReg() && MOP.isDef() && MOP.getReg() 
== R; + } + if (predDefsR) { //if Pred defines R + auto end = MI->getParent()->end().getInstrIterator(); + MachineOperand *MO = getUniqueUser(Pred->getIterator(), end, end, R); + if (MO && MO->getParent() == &*MI) { //if MI is unique user of R + Pred->dump(); + for (auto &MOP : Pred->operands()) { + if (MOP.isReg() && MOP.isDef() && MOP.getReg() == R) { + MOP.setReg(ssr_reg); //replace all defs of R with ssr_reg + MOP.setIsDef(false); + } + } + Pred->dump(); + MI->eraseFromBundle(); + modified.insert(Pred); + } + } + } + } + } + } + MI = PMI; + } + } + MBB.sortUniqueLiveIns(); + return Modified; +} + +} // end of anonymous namespace + +INITIALIZE_PASS(RISCVExpandSSRPostRegAlloc, "riscv-expand-ssr-post-reg-alloc", + RISCV_EXPAND_SSR_POST_REG_ALLOC_NAME, false, false) +namespace llvm { + +FunctionPass *createRISCVExpandSSRPostRegAllocPass() { return new RISCVExpandSSRPostRegAlloc(); } + +} // end of namespace llvm + + +/* OLD VERSION OF REGMERGE +// First pass: Detect moves to or from SSR registers + for (auto MI = MBB.begin() ; MI != MBB.end() ; ) { + MachineBasicBlock::iterator NMI = std::next(MI); + + LLVM_DEBUG(dbgs()<<"Analyzing: "<<*MI<<"\n"); + + // detect an emitted pop and add assignment (virtual_reg, ssr_read) to list + if(MI->getOpcode() == RISCV::FSGNJ_D) { + LLVM_DEBUG(dbgs()<<"Found FSGNJ_D, Op 0: " << MI->getOperand(1).getReg() << " Op1: " << MI->getOperand(2).getReg() << "\n"); + + // look for each streamer register + for(unsigned ssr_no = 0; ssr_no < NUM_SSR; ++ssr_no) { + // check for pop + if(MI->getOperand(1).getReg() == ssr_regs[ssr_no] && MI->getOperand(2).getReg() == ssr_regs[ssr_no]) { + LLVM_DEBUG(dbgs()<<" pop: both operands from SSR"<< ssr_no <<"\n"); + // append virtual register to list of assigned virtuals + LLVM_DEBUG(dbgs()<<" append: "<< MI->getOperand(0).getReg() <<"\n"); + virtRegs[ssr_no].insert(MI->getOperand(0).getReg()); + // remove operation + MI->eraseFromParent(); + break; + } + // TODO: check for push + else 
if(MI->getOperand(0).getReg() == ssr_regs[ssr_no]) { + // This is non-trivial because a register might be used elsewhere, therefore the entire MBB + // must be analyzed and a merge can only be made, if the register is written once + // LLVM_DEBUG(dbgs()<<" push: operand 0 from SSR"<< ssr_no <<"\n"); + // // append virtual register to list of assigned virtuals + // LLVM_DEBUG(dbgs()<<" append: "<< MI->getOperand(1).getReg() <<"\n"); + // virtRegs[ssr_no].insert(MI->getOperand(1).getReg()); + // // remove operation + // MI->eraseFromParent(); + break; + } + } + } + MI = NMI; + } + + // DBG + for(unsigned ssr_no = 0; ssr_no < NUM_SSR; ++ssr_no) { + for (auto iter = virtRegs[ssr_no].begin() ; iter != virtRegs[ssr_no].end() ; ++iter) + LLVM_DEBUG(dbgs() << "virtregs["<operands_begin() ; operand != MI->operands_end() ; ++operand) { + if(!operand->isReg()) continue; + // check if operand is in any SSR list + for(unsigned ssr_no = 0; ssr_no < NUM_SSR; ++ssr_no) { + if(virtRegs[ssr_no].contains(operand->getReg())) { + LLVM_DEBUG(dbgs() << "Found use of operand " << operand->getReg() << " ssr: " << ssr_no << " in inst " << MI->getOpcode() << "\n"); + // substitute with SSR register + MI->substituteRegister(operand->getReg(), ssr_regs[ssr_no], 0, *TRI); + // guard this block and add ssr regs to live in + MBB.addLiveIn(ssr_regs[ssr_no]); + } + } + } + MI = NMI; + } + */ \ No newline at end of file diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXssr.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXssr.td index 67f38e03e1fc0..6bf48cec791dd 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXssr.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXssr.td @@ -105,9 +105,25 @@ class SPseudoPush: let usesCustomInserter = 0; } +class SPseudoStoreMove: + Pseudo<(outs FPR64:$ssr), (ins FPR64:$val),[]> { + let mayLoad = 0; + let mayStore = 1; + let hasSideEffects = 1; + let usesCustomInserter = 0; +} + class SPseudoPop: Pseudo<(outs FPR64:$val), (ins uimm5:$ssr),[]> { let mayLoad = 1; + let mayStore 
= 1; + let hasSideEffects = 1; + let usesCustomInserter = 0; +} + +class SPseudoLoadMove: + Pseudo<(outs FPR64:$val), (ins FPR64:$ssr),[]> { + let mayLoad = 1; let mayStore = 0; let hasSideEffects = 1; let usesCustomInserter = 0; @@ -148,6 +164,8 @@ let Predicates = [HasExtXssr] in { def PseudoSSRSetup_1D_W : SPseudoSetup1D; def PseudoSSRPush : SPseudoPush; def PseudoSSRPop : SPseudoPop; + def PseudoStoreMove : SPseudoStoreMove; + def PseudoLoadMove : SPseudoLoadMove; foreach dim = [1, 2, 3, 4] in { def PseudoSSRSetupBoundStride_#dim#D : SPseudoSetupBoundStride; diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp index 833ce8b505528..cf86d705768a4 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp @@ -42,6 +42,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeRISCVTarget() { initializeSNITCHFrepLoopsPass(*PR); initializeRISCVExpandSDMAPass(*PR); initializeRISCVExpandPseudoPass(*PR); + initializeRISCVExpandSSRPostRegAllocPass(*PR); initializeRISCVCleanupVSETVLIPass(*PR); } @@ -209,6 +210,7 @@ void RISCVPassConfig::addPreEmitPass() { void RISCVPassConfig::addPreEmitPass2() { addPass(createRISCVExpandPseudoPass()); addPass(createPULPFixupHwLoops()); + addPass(createRISCVExpandSSRPostRegAllocPass()); // Schedule the expansion of AMOs at the last possible moment, avoiding the // possibility for other passes to break the requirements for forward // progress in the LR/SC block. 
diff --git a/llvm/lib/Transforms/SSR/SSRGeneration.cpp b/llvm/lib/Transforms/SSR/SSRGeneration.cpp index a8135767461b4..d07b1f4e6cc21 100644 --- a/llvm/lib/Transforms/SSR/SSRGeneration.cpp +++ b/llvm/lib/Transforms/SSR/SSRGeneration.cpp @@ -50,11 +50,7 @@ #include #include -//if you feel like there is somehow still some weird reordering going on, enable these: -#define SSR_CLOBBER_REGS_FOR_PUSH true -#define SSR_CLOBBER_REGS_FOR_POP true - -#define NUM_SSR 3U //NOTE: if increased too much, might need to change 1st arguments to clobberRegisters(..) +#define NUM_SSR 3U #define SSR_MAX_DIM 4U //both are inclusive! @@ -103,16 +99,16 @@ cl::opt SSRConflictFreeOnly( cl::desc("Only infer streams if they have no conflicts with other memory accesses.") ); -cl::opt SSRInline( - "ssr-inline", +cl::opt SSRNoInline( + "ssr-no-inline", cl::init(false), - cl::desc("Allow functions that contain SSR streams to be inlined.") + cl::desc("prevent functions that contain SSR streams from being inlined.") ); -cl::opt SSRNoBarrier( - "ssr-no-barrier", +cl::opt SSRBarrier( + "ssr-barrier", cl::init(false), - cl::desc("Disable the insertion of an spinning loop that waits for the stream to be done before it is dissabled (potentially unsafe).") + cl::desc("Enable the insertion of a spinning loop that waits for the stream to be done before it is dissabled.") ); } //end of namespace llvm @@ -190,6 +186,7 @@ struct ConflictTree { const NodeT *Root = nullptr; }; +/* void clobberRegisters(ArrayRef regs, IRBuilder<> &builder){ std::string constraints = ""; if (regs.size() > 0u) { @@ -206,6 +203,7 @@ void clobberRegisters(ArrayRef regs, IRBuilder<> &builder){ ); builder.CreateCall(IA)->dump(); } +*/ void copyPHIsFromPred(BasicBlock *BB){ BasicBlock *Pred = nullptr; @@ -421,14 +419,11 @@ void GenerateSSRSetup(ExpandedAffAcc &E, unsigned dmid, Instruction *Point){ } unsigned n_reps = 0u; - std::string s = formatv("f{0}", dmid); - ArrayRef regs(s); if (isStore){ Function *SSRPush = 
Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_push); for (Instruction *I : E.Access->getAccesses()){ std::array pusharg = {DMid, cast(I)->getValueOperand()}; builder.SetInsertPoint(I); - if (SSR_CLOBBER_REGS_FOR_PUSH) clobberRegisters(regs, builder); auto *C = builder.CreateCall(SSRPush->getFunctionType(), SSRPush, ArrayRef(pusharg)); C->dump(); I->dump(); I->eraseFromParent(); @@ -440,7 +435,6 @@ void GenerateSSRSetup(ExpandedAffAcc &E, unsigned dmid, Instruction *Point){ for (Instruction *I : E.Access->getAccesses()){ builder.SetInsertPoint(I); auto *V = builder.CreateCall(SSRPop->getFunctionType(), SSRPop, ArrayRef(poparg), "ssr.pop"); - if (SSR_CLOBBER_REGS_FOR_POP) clobberRegisters(regs, builder); V->dump(); I->dump(); BasicBlock::iterator ii(I); ReplaceInstWithValue(I->getParent()->getInstList(), ii, V); @@ -474,23 +468,37 @@ void generateSSRBarrier(Instruction *InsertBefore, unsigned dmid) { builder.CreateCall(Barrier->getFunctionType(), Barrier, ConstantInt::get(Type::getInt32Ty(builder.getContext()), dmid))->dump(); } +void generateFPDependency(IRBuilder<> &builder){ + constexpr unsigned num_fpr = 32u; + Type *Double = Type::getDoubleTy(builder.getContext()); + std::vector inputs; + std::vector args; + std::string constraints = ""; + for (unsigned i = 0u; i < num_fpr; i++) { + inputs.push_back(Double); + args.push_back(UndefValue::get(Double)); + std::string regname = formatv("f{0}", i); + constraints = "={" + regname + "}" + (i ? 
"," : "") + constraints + ", {" + regname + "}"; + } + Type *rty = StructType::get(builder.getContext(), inputs); + auto *IA = InlineAsm::get( + FunctionType::get(rty, inputs, false), + "", + constraints, + true + ); + builder.CreateCall(IA, args, "fpr.dep"); +} + /// generates SSR enable & disable calls void generateSSREnDis(Instruction *PhP, Instruction *ExP){ IRBuilder<> builder(PhP); // ----------- in preheader Module *mod = PhP->getParent()->getModule(); Function *SSREnable = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_enable); builder.CreateCall(SSREnable->getFunctionType(), SSREnable, ArrayRef()); - std::vector regs; - for (unsigned r = 0u; r < NUM_SSR; r++){ - regs.push_back(std::string(formatv("f{0}", r))); - } - //create inline asm that clobbers ft0-2 to make sure none of them are reordered to before ssr enable / after ssr disable - //equivalent to asm volatile ("":::"ft0", "ft1", "ft2"); - clobberRegisters(ArrayRef(regs), builder); builder.SetInsertPoint(ExP); // ----------- in exit block - clobberRegisters(ArrayRef(regs), builder); - + //generateFPDependency(builder); Function *SSRDisable = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_disable); builder.CreateCall(SSRDisable->getFunctionType(), SSRDisable, ArrayRef()); @@ -626,7 +634,7 @@ void cloneAndSetup(Instruction *PhT, Instruction *ExP, Value *Cond, std::vector< unsigned dmid = 0u; for (auto &E : exp) { GenerateSSRSetup(E, dmid++, PhT); - if (!SSRNoBarrier) generateSSRBarrier(ExP, dmid); + if (SSRBarrier) generateSSRBarrier(ExP, dmid); } generateSSREnDis(PhT, ExP); @@ -777,8 +785,8 @@ PreservedAnalyses SSRGenerationPass::run(Function &F, FunctionAnalysisManager &F if (SSRNoIntersectCheck) errs()<<", ssr-no-intersect-check"; if (SSRNoBoundCheck) errs()<<", ssr-no-bound-check"; if (SSRNoTCDMCheck) errs()<<", ssr-no-tcdm-check"; - if (SSRNoBarrier) errs()<<", ssr-no-barrier"; - if (SSRInline) errs()<<", ssr-inline"; + if (SSRBarrier) errs()<<", ssr-barrier"; + if (SSRNoInline) 
errs()<<", ssr-no-inline"; if (SSRConflictFreeOnly) errs()<<", ssr-conflict-free-only"; errs()<<"\n"; @@ -851,7 +859,7 @@ PreservedAnalyses SSRGenerationPass::run(Function &F, FunctionAnalysisManager &F if (!changed) return PreservedAnalyses::all(); F.addFnAttr(StringRef(SSRFnAttr)); //we have inserted a stream, tag accordingly - if (!SSRInline) F.addFnAttr(Attribute::AttrKind::NoInline); + if (SSRNoInline) F.addFnAttr(Attribute::AttrKind::NoInline); return PreservedAnalyses::none(); } diff --git a/llvm/lib/Transforms/SSR/SSRInference.cpp b/llvm/lib/Transforms/SSR/SSRInference.cpp index ce428f597dd62..6f2375e602d9d 100644 --- a/llvm/lib/Transforms/SSR/SSRInference.cpp +++ b/llvm/lib/Transforms/SSR/SSRInference.cpp @@ -63,6 +63,7 @@ PreservedAnalyses SSRInferencePass::run(Function &F, FunctionAnalysisManager &FA FPM.addPass(SimplifyCFGPass()); //simplifies CFG again FPM.addPass(InstCombinePass()); //removes phi nodes from LCSSA FPM.addPass(ADCEPass()); //remove potential dead instructions that result from SSR replacement + FPM.addPass(LoopSimplifyPass()); //canonicalize loops again auto pa = FPM.run(F, FAM); errs()<<"SSR Inference Pass on function: "< Date: Wed, 29 Jun 2022 18:59:10 +0200 Subject: [PATCH 39/47] keep up to date --- llvm/lib/Target/RISCV/RISCVExpandSSRInsts.cpp | 70 +++++-------------- 1 file changed, 18 insertions(+), 52 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVExpandSSRInsts.cpp b/llvm/lib/Target/RISCV/RISCVExpandSSRInsts.cpp index 3fe6cc662d3fe..caa1161762794 100644 --- a/llvm/lib/Target/RISCV/RISCVExpandSSRInsts.cpp +++ b/llvm/lib/Target/RISCV/RISCVExpandSSRInsts.cpp @@ -108,7 +108,7 @@ class RISCVExpandSSR : public MachineFunctionPass { bool expandSSR_Barrier(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, MachineBasicBlock::iterator &NextMBBI); - void bundlePushPops(); + void handlePushPops(); }; char RISCVExpandSSR::ID = 0; @@ -135,7 +135,7 @@ bool RISCVExpandSSR::runOnMachineFunction(MachineFunction &MF) { for (auto &MBB 
: MF) Modified |= expandMBB(MBB); - bundlePushPops(); //bundle push/pops with their users + handlePushPops(); /// "Forcefully" add all SSR registers as live-in to all MBB in this MF if(Modified) { @@ -450,7 +450,22 @@ bool RISCVExpandSSR::expandSSR_Barrier(MachineBasicBlock &MBB, return true; } -void RISCVExpandSSR::bundlePushPops() { +//additional optimisations for MoveLoad or MoveStore +void RISCVExpandSSR::handlePushPops() { + return; +} + +} // end of anonymous namespace + +INITIALIZE_PASS(RISCVExpandSSR, "riscv-expand-ssr", + RISCV_EXPAND_SSR_NAME, false, false) +namespace llvm { + +FunctionPass *createRISCVExpandSSRPass() { return new RISCVExpandSSR(); } + +} // end of namespace llvm + +/* //TODO: bundle what is regmerged after reg-alloc to make sure that the FADD/FMUL/FMUL/etc.. do not slip past ssr_disable /* DenseMap> bundles; @@ -481,53 +496,4 @@ void RISCVExpandSSR::bundlePushPops() { if (b->getSecond().second == Pred) b->getSecond().second = MI; } }*/ -} - -} // end of anonymous namespace - -INITIALIZE_PASS(RISCVExpandSSR, "riscv-expand-ssr", - RISCV_EXPAND_SSR_NAME, false, false) -namespace llvm { - -FunctionPass *createRISCVExpandSSRPass() { return new RISCVExpandSSR(); } - -} // end of namespace llvm - -/* -bundle.second = bundle.second->getNextNode(); //make end of bundle exclusive bound - DenseSet regs; - errs()<<"beg bundle\n"; - for (MachineInstr *MI = bundle.first; MI != bundle.second; MI = MI->getNextNode()) MI->dump(); - errs()<<"end bundle\n"; - MachineInstr &first = *bundle.first; - auto BMI = BuildMI(*first.getParent(), first.getIterator(), first.getDebugLoc(), TII->get(RISCV::BUNDLE)); - for (MachineInstr *MI = bundle.first; MI != bundle.second; MI = MI->getNextNode()) { - MI->dump(); - for (auto &MOP : MI->operands()) { - if (!MOP.isReg()) continue; - Register reg = MOP.getReg(); - if (regs.find(reg) != regs.end()) continue; - regs.insert(reg); - bool isInternal = false; - for (auto *MI2 = MI->getNextNode(); !isInternal && MI2 != 
bundle.second; MI2 = MI2->getNextNode()) { - for (const auto &MOP2 : MI2->operands()) { - isInternal |= MOP2.isReg() && MOP2.getReg() == reg; - } - } - MOP.dump(); - errs()<<"is internal = "<dump(); */ \ No newline at end of file From 59648b5d00f9abc9e134968eb8a1b644e6ef3df0 Mon Sep 17 00:00:00 2001 From: thrupf Date: Mon, 4 Jul 2022 15:05:08 +0200 Subject: [PATCH 40/47] work on stalling problem --- llvm/include/llvm/IR/IntrinsicsRISCV.td | 4 +- llvm/lib/Passes/PassBuilder.cpp | 2 - llvm/lib/Target/RISCV/CMakeLists.txt | 2 + .../RISCV/MCTargetDesc/RISCVInstPrinter.cpp | 1 + llvm/lib/Target/RISCV/RISCV.h | 6 + llvm/lib/Target/RISCV/RISCVExpandSSRInsts.cpp | 3 +- .../RISCV/RISCVExpandSSRInstsPostRegAlloc.cpp | 302 ++++++--- llvm/lib/Target/RISCV/RISCVInstrInfoXssr.td | 2 +- llvm/lib/Target/RISCV/RISCVSSRReassociate.cpp | 362 +++++++++++ llvm/lib/Target/RISCV/RISCVTargetMachine.cpp | 4 +- .../Target/RISCV/Snitch/SNITCHAutoFrep.cpp | 590 ++++++++++++++++++ llvm/lib/Transforms/SSR/SSRGeneration.cpp | 33 +- llvm/lib/Transforms/SSR/SSRInference.cpp | 9 +- 13 files changed, 1204 insertions(+), 116 deletions(-) create mode 100644 llvm/lib/Target/RISCV/RISCVSSRReassociate.cpp create mode 100644 llvm/lib/Target/RISCV/Snitch/SNITCHAutoFrep.cpp diff --git a/llvm/include/llvm/IR/IntrinsicsRISCV.td b/llvm/include/llvm/IR/IntrinsicsRISCV.td index f9ea2c11c6089..d41c10ff1fe56 100644 --- a/llvm/include/llvm/IR/IntrinsicsRISCV.td +++ b/llvm/include/llvm/IR/IntrinsicsRISCV.td @@ -1439,13 +1439,13 @@ let TargetPrefix = "riscv" in { : GCCBuiltin<"__builtin_ssr_push">, Intrinsic<[], [llvm_i32_ty, llvm_double_ty], - [IntrWillReturn, IntrWriteMem, IntrHasSideEffects, ImmArg>]>, + [IntrWriteMem, ImmArg>]>, RISCVSSRIntrinsic; def int_riscv_ssr_pop : GCCBuiltin<"__builtin_ssr_pop">, Intrinsic<[llvm_double_ty], [llvm_i32_ty], - [IntrWillReturn, ImmArg>]>, //use ReadWrite instead of throw to avoid licm + [ImmArg>, IntrWriteMem]>, //use ReadWrite instead of throw to avoid licm RISCVSSRIntrinsic; 
def int_riscv_ssr_enable diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp index 0e7847d713427..e54e4c6b1a37d 100644 --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -819,8 +819,6 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level, LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap), EnableMSSALoopDependency, /*UseBlockFrequencyInfo=*/true, DebugLogging)); - FPM.addPass(ReassociatePass()); //want to do this again after loop unrolling. FIXME: probably worth it right? - if (PTO.Coroutines) FPM.addPass(CoroElidePass()); diff --git a/llvm/lib/Target/RISCV/CMakeLists.txt b/llvm/lib/Target/RISCV/CMakeLists.txt index ce443473f15f6..e1faae23538d4 100644 --- a/llvm/lib/Target/RISCV/CMakeLists.txt +++ b/llvm/lib/Target/RISCV/CMakeLists.txt @@ -43,7 +43,9 @@ add_llvm_target(RISCVCodeGen RISCVTargetMachine.cpp RISCVTargetObjectFile.cpp RISCVTargetTransformInfo.cpp + RISCVSSRReassociate.cpp Snitch/SNITCHFrepLoops.cpp + Snitch/SNITCHAutoFrep.cpp LINK_COMPONENTS Analysis diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp index 5f8d6e1375187..6ba9c6901c8e8 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp @@ -86,6 +86,7 @@ void RISCVInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O, const char *Modifier) { assert((Modifier == 0 || Modifier[0] == 0) && "No modifiers supported"); + const MCOperand &MO = MI->getOperand(OpNo); if (MO.isReg()) { diff --git a/llvm/lib/Target/RISCV/RISCV.h b/llvm/lib/Target/RISCV/RISCV.h index 318975c9eb1c7..4f6ea25817183 100644 --- a/llvm/lib/Target/RISCV/RISCV.h +++ b/llvm/lib/Target/RISCV/RISCV.h @@ -48,6 +48,12 @@ void initializeRISCVExpandPseudoPass(PassRegistry &); FunctionPass *createRISCVExpandSSRPostRegAllocPass(); void 
initializeRISCVExpandSSRPostRegAllocPass(PassRegistry &); +FunctionPass *createSNITCHAutoFrepPass(); +void initializeSNITCHAutoFrepPass(PassRegistry &); + +FunctionPass *createSSRReassociatePass(); +void initializeSSRReassociatePass(PassRegistry &); + FunctionPass *createRISCVExpandAtomicPseudoPass(); void initializeRISCVExpandAtomicPseudoPass(PassRegistry &); diff --git a/llvm/lib/Target/RISCV/RISCVExpandSSRInsts.cpp b/llvm/lib/Target/RISCV/RISCVExpandSSRInsts.cpp index caa1161762794..1fc4bf919f6e3 100644 --- a/llvm/lib/Target/RISCV/RISCVExpandSSRInsts.cpp +++ b/llvm/lib/Target/RISCV/RISCVExpandSSRInsts.cpp @@ -495,5 +495,4 @@ FunctionPass *createRISCVExpandSSRPass() { return new RISCVExpandSSR(); } } if (b->getSecond().second == Pred) b->getSecond().second = MI; } - }*/ - */ \ No newline at end of file + }*/ \ No newline at end of file diff --git a/llvm/lib/Target/RISCV/RISCVExpandSSRInstsPostRegAlloc.cpp b/llvm/lib/Target/RISCV/RISCVExpandSSRInstsPostRegAlloc.cpp index 1c1e44b1b0c31..0b4cb3d9a0daf 100644 --- a/llvm/lib/Target/RISCV/RISCVExpandSSRInstsPostRegAlloc.cpp +++ b/llvm/lib/Target/RISCV/RISCVExpandSSRInstsPostRegAlloc.cpp @@ -58,6 +58,9 @@ #include "llvm/ADT/SmallSet.h" #include "llvm/Support/CommandLine.h" +#include "llvm/CodeGen/RegisterClassInfo.h" +#include "llvm/CodeGen/AntiDepBreaker.h" + using namespace llvm; #define DEBUG_TYPE "riscv-ssr" @@ -126,6 +129,11 @@ bool RISCVExpandSSRPostRegAlloc::runOnMachineFunction(MachineFunction &MF) { if (!SSRNoRegisterMerge && Modified){ for (auto &MBB : MF) mergePushPop(MBB); } + // auto &MRI = MF.getRegInfo(); + // auto &TRI = *MRI.getTargetRegisterInfo(); + // RegisterClassInfo RCI; + // RCI.runOnMachineFunction(MF); + // auto *ADB = createAggressiveAntiDepBreaker(MF, RCI, ) return Modified; } @@ -171,18 +179,17 @@ bool RISCVExpandSSRPostRegAlloc::expandSSR_StoreLoadMove(MachineBasicBlock &MBB, return true; } -/* -static MachineOperand *getUniqueUser(MachineBasicBlock::instr_iterator beg, 
MachineBasicBlock::instr_iterator end, Register valR) { - for (auto MII = beg; MII != end; ++MII) { - if (MII->isDebugInstr()) continue; - for (auto &MOP : MII->operands()){ - if (!MOP.isReg() || MOP.getReg() != valR) continue; - if (MOP.isKill()) return &MOP; - else return nullptr; +static std::pair isDefIsUse(MachineInstr &MI, MCRegister R) { + bool def = false; + bool use = false; + for (auto &MOP : MI.operands()) { + if (MOP.isReg() && MOP.getReg() == R) { + if (MOP.isDef()) def = true; + else use = true; } } - return nullptr; //cannot be sure, maybe there is a user in a later block? -} */ + return std::make_pair(def, use); +} static MachineOperand *getUniqueUser ( MachineBasicBlock::instr_iterator beg, @@ -195,6 +202,8 @@ static MachineOperand *getUniqueUser ( for (auto MII = beg; MII != realend; ++MII) { isPastEnd |= MII == end; if (MII->isDebugInstr()) continue; //skip debug instructions + errs()<<"looing at: "<<*MII; + if (UseMOP) errs()<<"usemop = "<<*UseMOP<<"\n"; bool definesValR = false; for (auto &MOP : MII->operands()) { if (!MOP.isReg() || MOP.getReg() != valR) continue; @@ -209,7 +218,18 @@ static MachineOperand *getUniqueUser ( return UseMOP; //if MII (re-)defines valR then we must have already found the Use before, (or we haven't in which case we return null) } } - //if we arrive at the end and have not found a redefinition or a kill, then we cannot be sure, whether valR is used after the realend ==> have to return nullptr + auto *MBB = beg->getParent(); + if (MBB) { + bool avail_in_all = true; + MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); + for (auto *Succ : MBB->successors()) { + if (!Succ) continue; + LivePhysRegs liveness(*MRI.getTargetRegisterInfo()); + liveness.addLiveIns(*Succ); + avail_in_all &= liveness.available(MRI, valR); + } + if (avail_in_all) return UseMOP; + } return nullptr; } @@ -239,7 +259,9 @@ bool RISCVExpandSSRPostRegAlloc::mergePushPop(MachineBasicBlock &MBB) { } } Register r = MI->getOperand(0).getReg(); 
//register to replace + errs()<<"looking for "<getOperand(0)<<"\n"; MachineOperand *MO = getUniqueUser(std::next(MI.getReverse()), rangeLimit, MI->getParent()->end().getInstrIterator(), r); + if (!MO) errs()<<"*** NOT FOUND ***\n"; if (MO) { //if unique user exists MachineInstr *MIUser = MO->getParent(); if (MIUser && modified.find(MIUser) == modified.end()){ //if unique user exists and was not yet modified @@ -287,7 +309,6 @@ bool RISCVExpandSSRPostRegAlloc::mergePushPop(MachineBasicBlock &MBB) { MI = PMI; } } - MBB.sortUniqueLiveIns(); return Modified; } @@ -302,70 +323,193 @@ FunctionPass *createRISCVExpandSSRPostRegAllocPass() { return new RISCVExpandSSR } // end of namespace llvm -/* OLD VERSION OF REGMERGE -// First pass: Detect moves to or from SSR registers - for (auto MI = MBB.begin() ; MI != MBB.end() ; ) { - MachineBasicBlock::iterator NMI = std::next(MI); - - LLVM_DEBUG(dbgs()<<"Analyzing: "<<*MI<<"\n"); - - // detect an emitted pop and add assignment (virtual_reg, ssr_read) to list - if(MI->getOpcode() == RISCV::FSGNJ_D) { - LLVM_DEBUG(dbgs()<<"Found FSGNJ_D, Op 0: " << MI->getOperand(1).getReg() << " Op1: " << MI->getOperand(2).getReg() << "\n"); - - // look for each streamer register - for(unsigned ssr_no = 0; ssr_no < NUM_SSR; ++ssr_no) { - // check for pop - if(MI->getOperand(1).getReg() == ssr_regs[ssr_no] && MI->getOperand(2).getReg() == ssr_regs[ssr_no]) { - LLVM_DEBUG(dbgs()<<" pop: both operands from SSR"<< ssr_no <<"\n"); - // append virtual register to list of assigned virtuals - LLVM_DEBUG(dbgs()<<" append: "<< MI->getOperand(0).getReg() <<"\n"); - virtRegs[ssr_no].insert(MI->getOperand(0).getReg()); - // remove operation - MI->eraseFromParent(); - break; - } - // TODO: check for push - else if(MI->getOperand(0).getReg() == ssr_regs[ssr_no]) { - // This is non-trivial because a register might be used elsewhere, therefore the entire MBB - // must be analyzed and a merge can only be made, if the register is written once - // 
LLVM_DEBUG(dbgs()<<" push: operand 0 from SSR"<< ssr_no <<"\n"); - // // append virtual register to list of assigned virtuals - // LLVM_DEBUG(dbgs()<<" append: "<< MI->getOperand(1).getReg() <<"\n"); - // virtRegs[ssr_no].insert(MI->getOperand(1).getReg()); - // // remove operation - // MI->eraseFromParent(); - break; - } - } - } - MI = NMI; - } - - // DBG - for(unsigned ssr_no = 0; ssr_no < NUM_SSR; ++ssr_no) { - for (auto iter = virtRegs[ssr_no].begin() ; iter != virtRegs[ssr_no].end() ; ++iter) - LLVM_DEBUG(dbgs() << "virtregs["<operands_begin() ; operand != MI->operands_end() ; ++operand) { - if(!operand->isReg()) continue; - // check if operand is in any SSR list - for(unsigned ssr_no = 0; ssr_no < NUM_SSR; ++ssr_no) { - if(virtRegs[ssr_no].contains(operand->getReg())) { - LLVM_DEBUG(dbgs() << "Found use of operand " << operand->getReg() << " ssr: " << ssr_no << " in inst " << MI->getOpcode() << "\n"); - // substitute with SSR register - MI->substituteRegister(operand->getReg(), ssr_regs[ssr_no], 0, *TRI); - // guard this block and add ssr regs to live in - MBB.addLiveIn(ssr_regs[ssr_no]); - } - } - } - MI = NMI; - } - */ \ No newline at end of file +///REGMERGE USING LIVENESS, BUT SOMEHOW WORSE +// static std::pair isDefIsUse(MachineInstr &MI, MCRegister R) { +// bool def = false; +// bool use = false; +// for (auto &MOP : MI.operands()) { +// if (MOP.isReg() && MOP.getReg() == R) { +// if (MOP.isDef()) def = true; +// else use = true; +// } +// } +// return std::make_pair(def, use); +// } + +// struct Liveness { +// public: +// Liveness(const TargetRegisterInfo &TRI, const MachineRegisterInfo &MRI, MachineBasicBlock &MBB, bool end) : liveness(TRI), MBB(MBB), MRI(MRI) { +// if (end) { +// liveness.addLiveOuts(MBB); +// LiveIn = MBB.end().getInstrIterator(); +// } else { +// liveness.addLiveIns(MBB); +// LiveIn = MBB.begin().getInstrIterator(); +// } +// } + +// void MoveForward(MachineBasicBlock::instr_iterator Point) { +// if (Point == LiveIn) return; +// 
SmallVector, 1u> clb; +// while (LiveIn != Point && LiveIn != MBB.end().getInstrIterator()) { +// liveness.stepForward(*LiveIn, clb); +// LiveIn++; +// } +// assert(LiveIn == Point && "moved forward to point"); +// } + +// void MoveBackward(MachineBasicBlock::reverse_instr_iterator Point) { +// assert(Point != MBB.rend().getInstrIterator() && "not rend()"); +// if (Point.getReverse() == LiveIn) return; +// Point++; //in order to get LiveIN for Point we have to move up to and incl. Point +// MachineBasicBlock::reverse_instr_iterator LiveInRev = LiveIn.getReverse(); +// LiveInRev++; +// while (LiveInRev != Point && LiveInRev != MBB.rend().getInstrIterator()) { +// liveness.stepBackward(*LiveInRev); +// LiveInRev++; +// } +// LiveIn = std::next(LiveInRev.getReverse()); +// assert(LiveInRev == Point && "moved backward to point"); +// } + +// //move forward up to first use of Reg, make sure Reg is not live anymore afterwards +// MachineBasicBlock::instr_iterator findUniqueUser(MCRegister Reg, MachineBasicBlock::instr_iterator end) { +// while (LiveIn != end) { +// auto ut = isDefIsUse(*LiveIn, Reg); +// if (ut.first && !ut.second) return end; //redefined +// if (ut.first && ut.second) return LiveIn; //first user redefines himself +// MoveForward(std::next(LiveIn)); +// if (ut.second) { +// if (liveness.available(MRI, Reg)) std::prev(LiveIn); +// else { +// for (auto x = LiveIn; x != MBB.end().getInstrIterator(); ++x) { +// auto ut = isDefIsUse(*x, Reg); +// if (ut.first && !ut.second) return std::prev(LiveIn); //found redef. 
+// else if (ut.second) return end; //another use +// } +// return end; +// } +// } +// } +// return end; +// } + +// MachineBasicBlock::instr_iterator getPoint() const { return LiveIn; } +// const LivePhysRegs &getLiveness() const { return liveness; } +// void addReg(MCRegister R) { liveness.addReg(R); } + +// private: +// MachineBasicBlock::instr_iterator LiveIn; //INV: this always points to the instr for which liveness has live-in info +// LivePhysRegs liveness; +// MachineBasicBlock &MBB; +// const MachineRegisterInfo &MRI; +// }; + +// static bool isSSREn(const MachineInstr &MI) { +// return MI.getOpcode() == RISCV::CSRRSI +// && MI.getOperand(1).isImm() +// && MI.getOperand(1).getImm() == 0x7C0 +// && MI.getOperand(2).isImm() +// && MI.getOperand(2).getImm() == 1; +// } + +// static bool isSSRDis(const MachineInstr &MI) { +// return MI.getOpcode() == RISCV::CSRRCI +// && MI.getOperand(1).isImm() +// && MI.getOperand(1).getImm() == 0x7C0 +// && MI.getOperand(2).isImm() +// && MI.getOperand(2).getImm() == 1; +// } + +// static bool isSSRReg(MCRegister R) { +// for (unsigned s = 0u; s < NUM_SSR; s++) { +// if (getSSRFtReg(s).asMCReg() == R) return true; +// } +// return false; +// } + +// static unsigned getSSRRegIdx(MCRegister R) { +// return R - MCRegister(RISCV::F0_D); +// } + +// bool RISCVExpandSSRPostRegAlloc::mergePushPop(MachineBasicBlock &MBB) { +// bool Modified; + +// MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); +// const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo(); + +// recomputeLiveIns(MBB); +// recomputeLivenessFlags(MBB); + +// SmallSet modifiedInsts[NUM_SSR]; //keep track of which insts were merged into to avoid merging two different moves of same stream into one inst +// MachineBasicBlock::reverse_instr_iterator MII = MBB.rbegin().getInstrIterator(); +// MachineBasicBlock::instr_iterator SearchEnd = MBB.end().getInstrIterator(); +// while (MII != MBB.rend().getInstrIterator()) { +// auto NMII = std::next(MII); + +// if 
(isSSRDis(*MII)) { +// SearchEnd = MII.getReverse(); +// MII = NMII; +// continue; +// } + +// if (MII->getOpcode() == RISCV::FSGNJ_D) { +// auto &MOP0 = MII->getOperand(0); +// auto &MOP1 = MII->getOperand(1); +// auto &MOP2 = MII->getOperand(2); +// if (MOP0.isReg() && MOP1.isReg() && MOP2.isReg() && MOP1.getReg() == MOP2.getReg()) { +// if (isSSRReg(MOP1.getReg()) && MII != MBB.rbegin().getInstrIterator()) { //this is ssr pop (and there is at least one potential user) +// MCRegister dest = MOP0.getReg().asMCReg(); +// MCRegister ssr_reg = MOP1.getReg().asMCReg(); +// unsigned dmid = getSSRRegIdx(ssr_reg); +// //try to find unique user of dest +// Liveness Live(TRI, MRI, MBB, true); +// Live.MoveBackward(std::prev(MII)); //increment liveness to past MII +// auto user = Live.findUniqueUser(dest, SearchEnd); +// if (user != SearchEnd && modifiedInsts[dmid].find(&*user) == modifiedInsts[dmid].end()) { //found user +// user->dump(); +// for (auto &MOP : user->operands()) { +// if (MOP.isReg() && !MOP.isDef() && MOP.getReg() == dest) MOP.setReg(ssr_reg); //replace all non-def uses of r with ssr_reg +// } +// user->dump(); +// MII->eraseFromBundle(); +// modifiedInsts[dmid].insert(&*user); +// Modified = true; +// } +// } else if (isSSRReg(MOP0.getReg())) { +// MCRegister src = MOP1.getReg(); +// MCRegister ssr_reg = MOP0.getReg(); +// unsigned dmid = getSSRRegIdx(ssr_reg); +// MachineBasicBlock::reverse_instr_iterator beginSearch = std::next(MII); +// while (beginSearch != MBB.rend().getInstrIterator()) { +// if (isSSREn(*beginSearch)) break; +// auto ut = isDefIsUse(*beginSearch, src); +// if (ut.first) break; +// beginSearch++; +// } +// if (beginSearch != MBB.rend().getInstrIterator() && !isSSREn(*beginSearch)) { +// assert(isDefIsUse(*beginSearch, src).first && "does define src"); +// Liveness Live(TRI, MRI, MBB, true); +// Live.MoveBackward(std::prev(beginSearch)); +// auto user = Live.findUniqueUser(src, std::next(MII.getReverse())); +// if (user == 
MII.getReverse() && modifiedInsts[dmid].find(&*beginSearch) == modifiedInsts[dmid].end()) { +// beginSearch->dump(); +// for (auto &MOP : beginSearch->operands()) { +// if (MOP.isReg() && MOP.isDef() && MOP.getReg() == src) { +// MOP.setReg(ssr_reg); //replace all defs of R with ssr_reg +// MOP.setIsDef(false); +// } +// } +// beginSearch->dump(); +// MII->eraseFromBundle(); +// modifiedInsts[dmid].insert(&*beginSearch); +// Modified = true; +// } +// } +// } +// } +// } +// MII = NMII; +// } +// return Modified; +// } \ No newline at end of file diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXssr.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXssr.td index 6bf48cec791dd..a41df75c1cf4a 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXssr.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXssr.td @@ -105,7 +105,7 @@ class SPseudoPush: let usesCustomInserter = 0; } -class SPseudoStoreMove: +class SPseudoStoreMove: //instead of using these could give isBarrier = 1 to ssr csrrsi/csrrci Pseudo<(outs FPR64:$ssr), (ins FPR64:$val),[]> { let mayLoad = 0; let mayStore = 1; diff --git a/llvm/lib/Target/RISCV/RISCVSSRReassociate.cpp b/llvm/lib/Target/RISCV/RISCVSSRReassociate.cpp new file mode 100644 index 0000000000000..395b8969e4cca --- /dev/null +++ b/llvm/lib/Target/RISCV/RISCVSSRReassociate.cpp @@ -0,0 +1,362 @@ +//===- SSRReassociatePass.cpp - Expand atomic instructions ------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains a pass (at IR level) to replace atomic instructions with +// __atomic_* library calls, or target specific instruction which implement the +// same semantics in a way which better fits the target backend. 
This can +// include the use of (intrinsic-based) load-linked/store-conditional loops, +// AtomicCmpXchg, or type coercions. +// +//===----------------------------------------------------------------------===// + +#include "RISCV.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/CodeGen/RuntimeLibcalls.h" +#include "llvm/CodeGen/TargetLowering.h" +#include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/CodeGen/TargetSubtargetInfo.h" +#include "llvm/CodeGen/ValueTypes.h" +#include "llvm/IR/Attributes.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Constant.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/User.h" +#include "llvm/IR/Value.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/IntrinsicsRISCV.h" +#include "llvm/InitializePasses.h" +#include "llvm/Pass.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetMachine.h" +#include +#include +#include + +using namespace llvm; + +#define DEBUG_TYPE "ssr-inference" + +namespace { + + class SSRReassociate: public FunctionPass { + const TargetLowering *TLI = nullptr; + + public: + static char ID; // Pass identification, replacement for typeid + + SSRReassociate() : FunctionPass(ID) { + initializeSSRReassociatePass(*PassRegistry::getPassRegistry()); + } + + bool runOnFunction(Function &F) override; + + private: + bool runOnBB(BasicBlock &BB); + // void moveAfterWithMetadata + // DominatorTreeWrapperPass DTP; + }; + +} // end anonymous namespace + +bool SSRReassociate::runOnFunction(Function &F) 
{ + bool Modified = false; + + errs()<<"SSR Reassociate Pass running on: "<DTP.runOnFunction(F); + if (!F.hasFnAttribute("SSR")) return false; + + for (auto &BB : F) Modified |= runOnBB(BB); + + return Modified; +} + +// static bool BubbleSSRIntrinsics(BasicBlock &BB) { +// bool Modified = false; +// auto II = BB.getFirstInsertionPt(); +// auto LastInsertedPop = std::prev(II); +// auto LastInsertedPush = BB.getTerminator()->getIterator(); +// while (II != BB.end() && II != LastInsertedPush) { +// auto NII = std::next(II); +// if (isa(*II)) { +// auto &Intr = cast(*II); +// if (Intr.getIntrinsicID() == Intrinsic::riscv_ssr_pop) { +// Intr.removeFromParent(); +// Intr.insertAfter(&*LastInsertedPop); +// LastInsertedPop = Intr.getIterator(); +// Modified = true; +// } else if (Intr.getIntrinsicID() == Intrinsic::riscv_ssr_push) { +// Intr.removeFromParent(); +// Intr.insertBefore(&*LastInsertedPush); +// LastInsertedPush = Intr.getIterator(); +// Modified = true; +// } +// } +// II = NII; +// } +// return Modified; +// } + +static bool BubbleSSRIntrinsics(BasicBlock &BB) { + bool Modified = false; + auto II = BB.getFirstInsertionPt(); + DenseSet vis; + while (II != BB.end()) { + auto NII = std::next(II); + if (vis.find(&*II) != vis.end()) { + II = NII; + continue; + } + vis.insert(&*II); + if (isa(*II)) { + auto &Intr = cast(*II); + if (Intr.getIntrinsicID() == Intrinsic::riscv_ssr_pop) { + Instruction *UU = nullptr; + for (User *U : Intr.users()) { + if (isa(U) && !UU) UU = cast(U); + else UU = nullptr; + } + if (UU) { + Intr.moveBefore(UU); + Modified = true; + } + } else if (Intr.getIntrinsicID() == Intrinsic::riscv_ssr_push) { + if (Instruction *D = dyn_cast(Intr.getOperand(1))) { + Intr.moveAfter(D); + Modified = true; + } + } + } + II = NII; + } + return Modified; +} + +static bool isAssociative(const Value &V) { + if (!isa(V)) return false; + const auto &I = cast(V); + if (I.getType()->isIntegerTy(1u)) return false; //ignore bools + if(I.isAssociative()) return 
true; + if ((I.getType()->isFloatingPointTy() && I.isFast())){ //https://gcc.gnu.org/wiki/FloatingPointMath + switch (I.getOpcode()) + { + case Instruction::BinaryOps::FAdd: + case Instruction::BinaryOps::FMul: + return true; + default: + return false; + } + } + return false; +} + +static bool isBinop(const Value &I) { + return isa(I); +} + +static unsigned getAndUpdateHeight(const Value &V, DenseMap &heights); //bc mutual recursion + +static unsigned updateHeightFromChildren(const BinaryOperator &I, DenseMap &heights) { + unsigned this_height = 1u + std::max( + getAndUpdateHeight(*I.getOperand(0), heights), + getAndUpdateHeight(*I.getOperand(1), heights) + ); + auto p = heights.insert(std::make_pair(&I, this_height)); + if (!p.second) p.first->getSecond() = this_height; //update value + return this_height; +} + +static unsigned getAndUpdateHeight(const Value &V, DenseMap &heights) { + if (!isa(V)) return 0; + const Instruction &I = cast(V); + if (!isBinop(I)) return 0; + auto d = heights.find(&I); + if (d != heights.end()) return d->second; //if height is available it is correct + return updateHeightFromChildren(cast(I), heights); +} + +// static void moveAfterWithMetadata(BinaryOperator &OP, Instruction *Point) { + +// } + +static BinaryOperator *rotateCC(BinaryOperator &L, BinaryOperator &I, DenseMap &heights) { + errs()<<"rotating CC:"< &heights) { + errs()<<"rotating CW:"< &heights) { + if (isBinop(Left) && isBinop(Root) && isAssociative(Left) && isAssociative(Root)) { + BinaryOperator &L = cast(Left); + BinaryOperator &I = cast(Root); + const unsigned opcode = I.getOpcode(); + if (L.getOpcode() != opcode) return nullptr; //cannot do anything + unsigned lh = getAndUpdateHeight(L, heights); + if (lh <= 1u) return nullptr; //nothing to do + auto &L_RChild = *L.getOperand(1); + if (isBinop(L_RChild) && isAssociative(L_RChild) + && getAndUpdateHeight(L_RChild, heights) + 1u == lh) { + auto &LRC = cast(L_RChild); + if (LRC.getOpcode() == opcode) { + auto &newL = 
*rotateCW(LRC, L, heights); + return rotateCC(newL, I, heights); + } + } + return rotateCC(L, I, heights); + } + return nullptr; +} + +static BinaryOperator *tryRotateR(Value &Right, Value &Root, DenseMap &heights) { + if (isBinop(Right) && isBinop(Root) && isAssociative(Right) && isAssociative(Root)) { + BinaryOperator &R = cast(Right); + BinaryOperator &I = cast(Root); + const unsigned opcode = I.getOpcode(); + if (R.getOpcode() != opcode) return nullptr; //cannot do anything + unsigned rh = getAndUpdateHeight(R, heights); + if (rh <= 1u) return nullptr; //nothing to do + auto &R_LChild = *R.getOperand(0); + if (isBinop(R_LChild) && isAssociative(R_LChild) + && getAndUpdateHeight(R_LChild, heights) + 1u == rh) { + auto &RLC = cast(R_LChild); + if (RLC.getOpcode() == opcode) { + auto &newR = *rotateCC(RLC, R, heights); + return rotateCW(newR, I, heights); + } + } + return rotateCW(R, I, heights); + } + return nullptr; +} + +static bool Reassociate(Value &Inst, DenseMap &heights) { + bool Modified = false; + if (isBinop(Inst) && isAssociative(Inst)) { + BinaryOperator *I = cast(&Inst); + bool improved_root = true; + while (improved_root) { + improved_root = false; + int lminusr = + (int)getAndUpdateHeight(*I->getOperand(0), heights) + - (int)getAndUpdateHeight(*I->getOperand(1), heights); + BinaryOperator *NewRoot = nullptr; + if (lminusr >= 2) { + NewRoot = tryRotateL(*I->getOperand(0), *I, heights); //try to fix at this height + } else if (lminusr <= -2) { + NewRoot = tryRotateR(*I->getOperand(1), *I, heights); //try to fix at this height + } + if (NewRoot) { + I = NewRoot; + improved_root = true; + Modified = true; + } + } + + bool improved_left = Reassociate(*I->getOperand(0), heights); //fix left subtree + bool improved_right = Reassociate(*I->getOperand(1), heights); //fix right subtree + Modified = Modified || improved_left || improved_right; + + updateHeightFromChildren(*I, heights); + } + return Modified; +} + +static void printDep(Value &I, unsigned lvl, 
DenseMap &heights, DenseSet &vis) { + if (vis.find(&I) != vis.end()) return; + vis.insert(&I); + for (unsigned i = 0; i < lvl; i++) errs()<<"| \t"; + unsigned h = 0; + if (isa(I)) { + auto p = heights.find(&cast(I)); + if (p != heights.end()) h = p->second; + } + errs()<<" h = "<(I); + for (unsigned i = 0; i < X.getNumOperands(); i++) { + auto *V = X.getOperand(i); + if (V) printDep(*V, lvl+1, heights, vis); + } + } +} + +static bool Reassociate(BasicBlock &BB) { + bool Modified = false; + + DenseMap heights; + + auto RI = BB.rbegin(); + while (RI != BB.rend()) { + Modified |= Reassociate(*RI, heights); + RI++; //yes, this means we miss some instructions, but those are optimized already anyway + } + + if (Modified) { + errs()<<"Reassociate in BB: "< vis; + for (auto RI = BB.rbegin(); RI != BB.rend(); RI++) { + printDep(*RI, 0, heights, vis); + } + BB.dump(); + } + + return Modified; +} + +bool SSRReassociate::runOnBB(BasicBlock &BB) { + bool Modified = false; + + Modified |= Reassociate(BB); + + Modified |= BubbleSSRIntrinsics(BB); + + return Modified; +} + + +char SSRReassociate::ID = 0; + +INITIALIZE_PASS(SSRReassociate, DEBUG_TYPE, "SSR Reassociate Pass", false, false) + +FunctionPass *llvm::createSSRReassociatePass() { return new SSRReassociate(); } diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp index cf86d705768a4..847d693d7551c 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp @@ -170,8 +170,8 @@ TargetPassConfig *RISCVTargetMachine::createPassConfig(PassManagerBase &PM) { } void RISCVPassConfig::addIRPasses() { + //addPass(createSSRReassociatePass()); addPass(createAtomicExpandPass()); - //TODO: add pass for auto SSR Inference here? 
TargetPassConfig::addIRPasses(); } @@ -211,6 +211,8 @@ void RISCVPassConfig::addPreEmitPass2() { addPass(createRISCVExpandPseudoPass()); addPass(createPULPFixupHwLoops()); addPass(createRISCVExpandSSRPostRegAllocPass()); + addPass(createSNITCHAutoFrepPass()); + // Schedule the expansion of AMOs at the last possible moment, avoiding the // possibility for other passes to break the requirements for forward // progress in the LR/SC block. diff --git a/llvm/lib/Target/RISCV/Snitch/SNITCHAutoFrep.cpp b/llvm/lib/Target/RISCV/Snitch/SNITCHAutoFrep.cpp new file mode 100644 index 0000000000000..a1a6be242caac --- /dev/null +++ b/llvm/lib/Target/RISCV/Snitch/SNITCHAutoFrep.cpp @@ -0,0 +1,590 @@ +//===-- SNITCHAutoFrep.cpp - Expand SSR pseudo instructions ---------===// +// +// Copyright 2021 ETH Zurich, University of Bologna. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "RISCV.h" +#include "RISCVInstrInfo.h" +#include "RISCVTargetMachine.h" +#include "RISCVMachineFunctionInfo.h" + +#include "llvm/CodeGen/LivePhysRegs.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/LowLevelType.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/Support/CommandLine.h" + +using namespace llvm; + +#define DEBUG_TYPE "riscv-frep" + +namespace llvm { + /// Command line options + +} + +#define SNITCH_AUTO_FREP_NAME "Snitch Auto Frep" + +#define MAX_SEARCH_WINDOW 4 +#define MIN_REP 4 +#define MAX_STAGGER 4 +#define NUM_SSR 3 + +namespace { + +class SNITCHAutoFrep : public MachineFunctionPass { +public: + const RISCVInstrInfo *TII; + static char ID; + + SNITCHAutoFrep() : MachineFunctionPass(ID) { + initializeSNITCHAutoFrepPass(*PassRegistry::getPassRegistry()); + } + + bool 
runOnMachineFunction(MachineFunction &MF) override; + + StringRef getPassName() const override { return SNITCH_AUTO_FREP_NAME; } + +private: + + const MachineFunction *MF; + RISCVMachineFunctionInfo *RVFI; + DenseSet FPOps; + + bool process(MachineBasicBlock &MBB); + bool isFPInstr(MachineInstr &I); + std::pair findRep( + MachineBasicBlock::instr_iterator window_beg, + MachineBasicBlock::instr_iterator window_end, + MachineBasicBlock::instr_iterator end); +}; + +static Register getSSRFtReg(unsigned streamer) { //taken from RISCVExpandSSRInsts.cpp + unsigned AssignedReg = RISCV::F0_D + streamer; + // Advance the iterator to the assigned register until the valid + // register is found + const TargetRegisterClass *RC = &RISCV::FPR64RegClass; + TargetRegisterClass::iterator I = RC->begin(); + for (; *I != AssignedReg; ++I) + assert(I != RC->end() && "AssignedReg should be a member of provided RC"); + return Register(*I); +} + +char SNITCHAutoFrep::ID = 0; + +static constexpr unsigned fpopcodes[] = {RISCV::FADD_D, RISCV::FMUL_D, RISCV::FMADD_D, RISCV::FSGNJ_D, RISCV::FDIV_D, RISCV::FSUB_D, RISCV::FMSUB_D, RISCV::FMIN_D, RISCV::FMAX_D, RISCV::FSQRT_D}; + +bool SNITCHAutoFrep::runOnMachineFunction(MachineFunction &MF) { + TII = static_cast(MF.getSubtarget().getInstrInfo()); + this->MF = &MF; + this->RVFI = MF.getInfo(); + for (const unsigned &x : fpopcodes) this->FPOps.insert(x); + + errs()<<"autofrep: running on:"<FPOps.find(I.getOpcode()) != this->FPOps.end(); +} + +std::pair SNITCHAutoFrep::findRep( + MachineBasicBlock::instr_iterator window_beg, + MachineBasicBlock::instr_iterator window_end, + MachineBasicBlock::instr_iterator end) +{ + MachineBasicBlock::instr_iterator wi = window_beg; + MachineBasicBlock::instr_iterator s_end = window_end; + MachineBasicBlock::instr_iterator s_res = window_end; + unsigned rep = 1u; + while (s_end != end && isFPInstr(*s_end) && areTheSame(*s_end, *wi)) { + s_end++; + wi++; + if (wi == window_end) { + wi = window_beg; + rep++; + 
s_res = s_end; //found rep + } + } + return std::make_pair(s_res, rep); +} + +static unsigned getCycles(unsigned opcode) { + switch (opcode) + { + case RISCV::FADD_D: + return 2; + case RISCV::FMUL_D: + return 3; + case RISCV::FMADD_D: + return 4; + default: + return 1; + } +} + +static Optional getCombineOpcode(unsigned opcode, unsigned src_idx) { + switch (opcode) + { + case RISCV::FADD_D: + if (src_idx == 1 || src_idx == 2) return (unsigned)RISCV::FADD_D; + return None; + case RISCV::FMADD_D: + if (src_idx == 0) return (unsigned)RISCV::FADD_D; + return None; + default: + return None; + } +} + +static unsigned toMask (const std::vector> &deps) { + unsigned mask = 0u; + for (const auto &p : deps) mask |= p.second; + return mask; +} + +static Optional>> findRepDependenceRegs( + MachineBasicBlock::instr_iterator window_begin, + MachineBasicBlock::instr_iterator window_end) +{ + DenseMap def; //defs that are live going out of window + std::vector> internal, external; + for (auto MII = window_begin; MII != window_end; MII++) { + for (unsigned i = MII->getNumOperands()-1; i < MII->getNumOperands(); i--) { + int idx = 3 - (int)i; + auto &MOP = MII->getOperand(i); + if (!MOP.isReg()) continue; + if (idx < 0) return None; //there is an instruction with more than 4 fpr's in window ==> cannot stagger + MCRegister r = MOP.getReg().asMCReg(); + if (MOP.isDef()) { + if (idx != 3) return None; //defining operand not at idx 0 ==> cannot stagger + def.insert(std::make_pair(r, (unsigned)(1 << idx))); + } else { //use + auto p = def.find(r); + if (p != def.end()) internal.push_back(std::make_pair(r, (unsigned)(1 << idx) | p->second)); + if (MOP.isKill()) def.erase(r); + } + idx--; + } + } + for (auto MII = window_begin; MII != window_end; MII++) { + for (unsigned i = MII->getNumOperands()-1; i < MII->getNumOperands(); i--) { + int idx = 3 - (int)i; + auto &MOP = MII->getOperand(i); + if (!MOP.isReg()) continue; + assert(idx >= 0); + MCRegister r = MOP.getReg().asMCReg(); + if 
(MOP.isDef()) { + def.erase(r); //redef'ed before use + } else { + auto p = def.find(r); + if (p != def.end()) external.push_back(std::make_pair(r, (unsigned)(1 << idx) | p->second)); + if (MOP.isKill()) def.erase(r); + } + } + } + unsigned internal_mask = toMask(internal); + unsigned external_mask = toMask(external); + for (auto &p : external) external_mask |= p.second; + //internal needs to be a subset of external + if ((internal_mask & external_mask) ^ internal_mask) return None; + return external; +} + +static void mergeRegisters(std::vector> &deps) { + unsigned i = 0; + while (i < deps.size()) { + MCRegister r = deps[i].first; + unsigned found = 0u; + for (unsigned j = 0; j < i; j++) { + if (deps[j].first == r) { + deps[j] = std::make_pair(r, deps[j].second | deps[i].first); + found++; + } + } + if (found) { + assert(found == 1); + deps.erase(deps.begin() + i); + //no need to increment i + } else { + i++; + } + } +} + +static bool isSSRReg(MCRegister r) { + for (unsigned i = 0; i < NUM_SSR; i++) { + if (getSSRFtReg(i) == r) return true; + } + return false; +} + +static Optional> findCombineOps( + MCRegister DReg, + unsigned stagger_mask, + MachineBasicBlock::instr_iterator window_begin, + MachineBasicBlock::instr_iterator window_end) +{ + MachineInstr *Def = nullptr; + for (auto MII = std::next(window_end.getReverse()); MII != std::next(window_begin.getReverse()); MII++) { + if (MII->getOperand(0).isReg() && MII->getOperand(0).getReg() == DReg) { + Def = &*MII; + } + } + if (!Def) return None; + + std::vector ops; + MCRegister r = DReg; + bool reached_def = false; + for (auto MII = window_begin; !reached_def && MII != window_end; MII++) { + for (unsigned i = MII->getNumOperands() - 1; !reached_def && i < MII->getNumOperands(); i--) { + int idx = 3 - i; + if (idx < 0) continue; + auto &MOP = MII->getOperand(i); + if (MOP.isReg() && MOP.getReg().asMCReg() == r) { + if (!MII->getOperand(0).isReg()) return None; + r = MII->getOperand(0).getReg().asMCReg(); + auto 
op = getCombineOpcode(MII->getOpcode(), (unsigned)idx); + if (!op.hasValue()) return None; + ops.push_back(op.getValue()); + reached_def = (&*MII == Def); + if (!reached_def) return None; //FIXME: currently only one combineop allowed + break; //go to next instruction + } + } + } + return ops; +} + +struct StaggerInfo { + unsigned count; + unsigned mask; + std::vector regs; + std::vector combineOps; +}; + +static Optional findStagger( + MachineBasicBlock::instr_iterator window_begin, + MachineBasicBlock::instr_iterator window_end, + const LivePhysRegs &liveness, + const llvm::MachineRegisterInfo &MRI) +{ + errs()<<"trying to find stagger\n"; + auto depsopt = findRepDependenceRegs(window_begin, window_end); + if (!depsopt.hasValue()) return None; + errs()<<"found deps\n"; + auto deps = depsopt.getValue(); + mergeRegisters(deps); + for (const auto &p : deps) errs()<<"reg = "< regs; + regs.push_back(DReg); + while (max_stagger_count < MAX_STAGGER && liveness.available(MRI, DReg + max_stagger_count + 1)) { + max_stagger_count++; + regs.push_back(DReg + max_stagger_count); + } + if (!max_stagger_count) return None; //regs not free (FIXME: rename instead) + StaggerInfo info; + info.count = max_stagger_count; + info.mask = stagger_mask; + info.regs = std::move(regs); + info.combineOps = std::move(ops.getValue()); + return info; +} + +static MachineBasicBlock *findBB(MachineInstr &MI) { + for (auto &MOP : MI.operands()) { + if (MOP.isMBB()) return MOP.getMBB(); + } + return nullptr; +} + +//FIXME: no idea how to make a block a label for sure ==> just search for a branch and take its target +static MachineBasicBlock *findBrAbleBB(MachineBasicBlock &MBB) { + if (!MBB.empty()) { + auto *BB = findBB(*std::prev(MBB.end())); + if (BB) return BB; + } + std::vector s; + SmallSet vis; + s.push_back(&MBB); + while (!s.empty()) { + auto *B = s.back(); s.pop_back(); + if (!B || vis.contains(B)) continue; + vis.insert(B); + if (!B->empty()) { + auto *x = findBB(*std::prev(B->end())); + 
if (x) return x; + } + for (auto *BB : B->predecessors()) s.push_back(BB); + for (auto *BB : B->successors()) s.push_back(BB); + } + return &MBB; +} + +bool SNITCHAutoFrep::process(MachineBasicBlock &MBB) { + bool Modified = false; + + recomputeLivenessFlags(MBB); //to be sure + + for (auto II = MBB.begin().getInstrIterator(); II != MBB.end().getInstrIterator(); ) { + auto NII = std::next(II); + if (II->isDebugInstr()) { //get rid of some dbg instructions (sorry) + II->eraseFromParent(); + } + II = NII; + } + + MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo(); + LivePhysRegs liveness(TRI); //use RegScavenger ? + liveness.addLiveIns(MBB); + for (unsigned r = 0; r < NUM_SSR; r++) + liveness.addReg(getSSRFtReg(r).asMCReg()); //add SSR regs for good measure (FIXME: conservative) + + MachineBasicBlock::instr_iterator MII = MBB.begin().getInstrIterator(); + while (MII != MBB.end().getInstrIterator()){ + if (!isFPInstr(*MII)) { + MII = std::next(MII); + continue; + } + + std::vector> search_results; + for (auto II = MII; II != MBB.end().getInstrIterator() && search_results.size() < MAX_SEARCH_WINDOW; ++II) { + auto wend = std::next(II); + auto sr = findRep(MII, wend, MBB.end().getInstrIterator()); + search_results.push_back(sr); + } + + unsigned best = 0u; + for (unsigned i = 0u; i < search_results.size(); i++) { + best = search_results[best].second < search_results[i].second ? 
i : best; + } + + bool found = false; + if (!search_results.empty() && search_results[best].second >= MIN_REP) { //if we have found at least MIN_REP repetitions + errs()<<"found repeting fp instr's\n"; + for (auto II = MII; II != search_results[best].first; ++II) II->dump(); + const TargetRegisterClass *RC = &RISCV::GPRNoX0RegClass; + TargetRegisterClass::iterator I = RC->begin(); + while(I != RC->end() && !liveness.available(MRI, MCPhysReg(*I))) I++; + if (I != RC->end()) { //did find a free GPR register + errs()<<"found a free GPR reg \n"; + + MCPhysReg freeGPR = *I; + + const unsigned window_size = best + 1; //recover window size + const unsigned reps = search_results[best].second; //get reps + + auto delete_begin = std::next(MII, window_size); //start of repeting region + auto delete_end = search_results[best].first; //end of repeting region (excl.) + + auto info = findStagger(MII, delete_begin, liveness, MRI); + + if (info.hasValue()) { + errs()<<"found stagger \n"; + + unsigned window_cycles = 0u; + for (auto MI = MII; MI != delete_begin; MI++) window_cycles += getCycles(MI->getOpcode()); + unsigned rep_stall = getCycles(std::prev(delete_begin)->getOpcode()) - 1u; + unsigned combine_cycles = 0u; + for (unsigned &op : info.getValue().combineOps) combine_cycles += getCycles(op); + + std::vector cost; + cost.push_back(reps * window_cycles); //cycles needed with no frep + errs()<<"default = "< 0u) { + errs()<<"frep+stagger is better\n"; + //code generation: + //delete repetitions + MBB.dump(); + found = true; + Modified = true; //we will modify now + + for (auto di = delete_begin; di != delete_end;) { + auto din = std::next(di); + di->eraseFromParentAndMarkDBGValuesForRemoval(); //delete repeated parts + di = din; + } + for (unsigned s = 1; s <= best_stagger; s++) { + //fcvt.d.w stagger, zero (FIXME: only allows additive combine op for now) + BuildMI(MBB, MII, MII->getDebugLoc(), this->TII->get(RISCV::FCVT_D_W), info.getValue().regs[s]) + .addReg(RISCV::X0); + } 
+ //load rep + BuildMI(MBB, MII, MII->getDebugLoc(), this->TII->get(RISCV::ADDI), freeGPR) + .addReg(RISCV::X0) + .addImm(reps-1); + //frep.i + BuildMI(MBB, MII, MII->getDebugLoc(), this->TII->get(RISCV::FREP_O)) + .addReg(freeGPR, RegState::Kill) //reps + .addImm(window_size) //nr of instructions + .addImm(best_stagger) //stagger count + .addImm(info.getValue().mask); //stagger mask + + //combine result + errs()<<"generate combine result\n"; + unsigned step = 1; + while (step < best_stagger + 1u) { + for (unsigned i = 0u; i + step < best_stagger + 1u; i += (step + 1)) { + //FIXME: currently only one combine op allowed, if more: need temp regs here ??? + errs()<<"src = "<getDebugLoc(), this->TII->get(info.getValue().combineOps.front()), info.getValue().regs[i]) + .addReg(info.getValue().regs[i], RegState::Kill) + .addReg(info.getValue().regs[i + step], RegState::Kill) + .addImm(7); + } + step = step * 2; + } + + //FPU fence (as done in SNITCHFrepLoops.cpp) + BuildMI(MBB, delete_end, delete_end->getDebugLoc(), this->TII->get(RISCV::FMV_X_W), freeGPR) + .addReg(info.getValue().regs[1]); + auto *BB = findBrAbleBB(MBB); + BuildMI(MBB, delete_end, delete_end->getDebugLoc(), this->TII->get(RISCV::BLT)) + .addReg(freeGPR, RegState::Kill) + .addReg(freeGPR, RegState::Kill) + .addMBB(BB); + //advance liveness + for (auto II = MII; II != delete_end; II++) { + SmallVector, 4u> clobbers; + liveness.stepForward(*II, clobbers); + } + + MII = delete_end; //continue from here + } + } + } + } + + if (!found) { + SmallVector, 4u> clobbers; + liveness.stepForward(*MII, clobbers); + MII = std::next(MII); + } + } + + if (Modified) MBB.dump(); + + return Modified; +} + +} // end of anonymous namespace + +INITIALIZE_PASS(SNITCHAutoFrep, "riscv-snitch-auto-frep", + SNITCH_AUTO_FREP_NAME, false, false) +namespace llvm { + +FunctionPass *createSNITCHAutoFrepPass() { return new SNITCHAutoFrep(); } + +} // end of namespace llvm + + +// if (window_size == 1) { +// std::vector defs; +// 
std::vector ins; +// for (auto &MOP : MII->operands()) { +// if (!MOP.isReg()) continue; +// if (MOP.isDef()) defs.push_back(MOP.getReg().asMCReg()); +// else ins.push_back(MOP.getReg().asMCReg()); +// std::vector inter_dep; +// for (auto &d : defs) { +// for (auto &i : ins) { +// if (d == i) inter_dep.push_back(d); +// } +// } +// if (inter_dep.size() == 1) { +// errs()<<"only one interdependence\n"; +// MCRegister stagger_reg = inter_dep[0]; +// unsigned mask_idx = 3u; +// for (unsigned s = 0; s < MII->getNumOperands(); s++) { +// if (MII->getOperand(s).isReg() && MII->getOperand(s).getReg().asMCReg() == stagger_reg) +// stagger_mask |= 1u << mask_idx; +// mask_idx--; +// } +// auto p = getCombineOpcode(MII->getOpcode()); +// if (p.hasValue()) { +// errs()<<"has combine opcode\n"; +// combine_opcode = p.getValue().first; +// unsigned allowed_mask = p.getValue().second; +// if ((stagger_mask | allowed_mask) == allowed_mask) { //allowed +// errs()<<"stagger is allowed\n"; +// while (stagger_count < MAX_STAGGER && liveness.available(MRI, stagger_reg + stagger_count + 1)) +// stagger_count++; +// if (stagger_count && stagger_mask && combine_opcode) { +// errs()<<"can stagger\n"; +// std::vector stagger_regs; +// stagger_regs.push_back(stagger_reg); +// for (unsigned x = 1; x <= stagger_count; x++){ +// BuildMI(MBB, MII, MII->getDebugLoc(), this->TII->get(RISCV::FCVT_D_W), stagger_reg + x) //fcvt.d.w stagger, zero +// .addReg(RISCV::X0); +// stagger_regs.push_back(stagger_reg + x); +// } +// std::vector stagger_regs2; +// while (stagger_regs.size() > 1u) { +// auto builder = BuildMI(MBB, delete_end, delete_end->getDebugLoc(), MII->getDesc()); +// unsigned m_idx = 3u; +// for (auto MOI = MII->operands_begin(); MOI != MII->operands_end(); ++MOI) { + +// } +// } + +// errs()<<"emited stagger insts\n"; +// } +// } +// } +// } +// } +// } \ No newline at end of file diff --git a/llvm/lib/Transforms/SSR/SSRGeneration.cpp b/llvm/lib/Transforms/SSR/SSRGeneration.cpp index 
d07b1f4e6cc21..73ba5a20a9562 100644 --- a/llvm/lib/Transforms/SSR/SSRGeneration.cpp +++ b/llvm/lib/Transforms/SSR/SSRGeneration.cpp @@ -415,7 +415,7 @@ void GenerateSSRSetup(ExpandedAffAcc &E, unsigned dmid, Instruction *Point){ Value *Bound = E.Reps[i]; Function *SSRBoundStrideSetup = Intrinsic::getDeclaration(mod, functions[i]); std::array bsargs = {DMid, Bound, Stride}; - builder.CreateCall(SSRBoundStrideSetup->getFunctionType(), SSRBoundStrideSetup, ArrayRef(bsargs))->dump(); + builder.CreateCall(SSRBoundStrideSetup->getFunctionType(), SSRBoundStrideSetup, ArrayRef(bsargs)); } unsigned n_reps = 0u; @@ -425,7 +425,6 @@ void GenerateSSRSetup(ExpandedAffAcc &E, unsigned dmid, Instruction *Point){ std::array pusharg = {DMid, cast(I)->getValueOperand()}; builder.SetInsertPoint(I); auto *C = builder.CreateCall(SSRPush->getFunctionType(), SSRPush, ArrayRef(pusharg)); - C->dump(); I->dump(); I->eraseFromParent(); n_reps++; } @@ -435,7 +434,6 @@ void GenerateSSRSetup(ExpandedAffAcc &E, unsigned dmid, Instruction *Point){ for (Instruction *I : E.Access->getAccesses()){ builder.SetInsertPoint(I); auto *V = builder.CreateCall(SSRPop->getFunctionType(), SSRPop, ArrayRef(poparg), "ssr.pop"); - V->dump(); I->dump(); BasicBlock::iterator ii(I); ReplaceInstWithValue(I->getParent()->getInstList(), ii, V); n_reps++; @@ -446,7 +444,7 @@ void GenerateSSRSetup(ExpandedAffAcc &E, unsigned dmid, Instruction *Point){ Constant *Rep = ConstantInt::get(i32, n_reps - 1U); Function *SSRRepetitionSetup = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_setup_repetition); std::array repargs = {DMid, Rep}; - builder.CreateCall(SSRRepetitionSetup->getFunctionType(), SSRRepetitionSetup, ArrayRef(repargs))->dump(); + builder.CreateCall(SSRRepetitionSetup->getFunctionType(), SSRRepetitionSetup, ArrayRef(repargs)); Function *SSRSetup; if (!isStore){ @@ -456,7 +454,7 @@ void GenerateSSRSetup(ExpandedAffAcc &E, unsigned dmid, Instruction *Point){ } std::array args = {DMid, Dim, E.Addr}; //NOTE: 
this starts the prefetching ==> always needs to be inserted AFTER bound/stride and repetition setups !!! - builder.CreateCall(SSRSetup->getFunctionType(), SSRSetup, ArrayRef(args))->dump(); + builder.CreateCall(SSRSetup->getFunctionType(), SSRSetup, ArrayRef(args)); return; } @@ -465,29 +463,7 @@ void GenerateSSRSetup(ExpandedAffAcc &E, unsigned dmid, Instruction *Point){ void generateSSRBarrier(Instruction *InsertBefore, unsigned dmid) { IRBuilder<> builder(InsertBefore); Function *Barrier = Intrinsic::getDeclaration(InsertBefore->getModule(), Intrinsic::riscv_ssr_barrier); - builder.CreateCall(Barrier->getFunctionType(), Barrier, ConstantInt::get(Type::getInt32Ty(builder.getContext()), dmid))->dump(); -} - -void generateFPDependency(IRBuilder<> &builder){ - constexpr unsigned num_fpr = 32u; - Type *Double = Type::getDoubleTy(builder.getContext()); - std::vector inputs; - std::vector args; - std::string constraints = ""; - for (unsigned i = 0u; i < num_fpr; i++) { - inputs.push_back(Double); - args.push_back(UndefValue::get(Double)); - std::string regname = formatv("f{0}", i); - constraints = "={" + regname + "}" + (i ? 
"," : "") + constraints + ", {" + regname + "}"; - } - Type *rty = StructType::get(builder.getContext(), inputs); - auto *IA = InlineAsm::get( - FunctionType::get(rty, inputs, false), - "", - constraints, - true - ); - builder.CreateCall(IA, args, "fpr.dep"); + builder.CreateCall(Barrier->getFunctionType(), Barrier, ConstantInt::get(Type::getInt32Ty(builder.getContext()), dmid)); } /// generates SSR enable & disable calls @@ -815,6 +791,7 @@ PreservedAnalyses SSRGenerationPass::run(Function &F, FunctionAnalysisManager &F while (!worklist.empty()) { const Loop *L = worklist.front(); worklist.pop_front(); errs()<<"visiting loop: "<getHeader()->getNameOrAsOperand()<<"\n"; + visitLoop(L, possible, tree, AAA, ssrInvalidLoops.find(L) != ssrInvalidLoops.end()); for (const Loop *x : L->getSubLoops()) worklist.push_back(x); diff --git a/llvm/lib/Transforms/SSR/SSRInference.cpp b/llvm/lib/Transforms/SSR/SSRInference.cpp index 6f2375e602d9d..c565075585ca4 100644 --- a/llvm/lib/Transforms/SSR/SSRInference.cpp +++ b/llvm/lib/Transforms/SSR/SSRInference.cpp @@ -31,6 +31,9 @@ #include "llvm/Transforms/Scalar/IndVarSimplify.h" #include "llvm/Transforms/Scalar/SimplifyCFG.h" #include "llvm/Transforms/Scalar/LoopFlatten.h" +#include "llvm/Transforms/Scalar/LoopUnrollAndJamPass.h" +#include "llvm/Transforms/Scalar/Reassociate.h" +#include "llvm/Transforms/Scalar/LICM.h" #include "llvm/Transforms/SSR/SSRGeneration.h" #include "llvm/Support/CommandLine.h" @@ -59,10 +62,14 @@ PreservedAnalyses SSRInferencePass::run(Function &F, FunctionAnalysisManager &FA //FPM.addPass(createFunctionToLoopPassAdaptor(LoopRotatePass())); FPM.addPass(LCSSAPass()); //put loops into LCSSA-form //FPM.addPass(createFunctionToLoopPassAdaptor(IndVarSimplifyPass(false))); + FPM.addPass(SSRGenerationPass()); //runs AffineAccess analysis and generates SSR intrinsics - FPM.addPass(SimplifyCFGPass()); //simplifies CFG again + + FPM.addPass(LoopSimplifyPass()); //canonicalize loops again 
FPM.addPass(InstCombinePass()); //removes phi nodes from LCSSA FPM.addPass(ADCEPass()); //remove potential dead instructions that result from SSR replacement + FPM.addPass(createFunctionToLoopPassAdaptor(LICMPass())); //LICM of rt-checks maybe + FPM.addPass(SimplifyCFGPass()); //simplifies CFG again FPM.addPass(LoopSimplifyPass()); //canonicalize loops again auto pa = FPM.run(F, FAM); errs()<<"SSR Inference Pass on function: "< Date: Wed, 6 Jul 2022 14:51:32 +0200 Subject: [PATCH 41/47] boundcheck fix --- .../llvm/Analysis/AffineAccessAnalysis.h | 12 +- llvm/lib/Analysis/AffineAccessAnalysis.cpp | 245 ++++++++++++------ llvm/lib/Target/RISCV/RISCVExpandSSRInsts.cpp | 3 +- llvm/lib/Transforms/SSR/SSRGeneration.cpp | 9 +- 4 files changed, 183 insertions(+), 86 deletions(-) diff --git a/llvm/include/llvm/Analysis/AffineAccessAnalysis.h b/llvm/include/llvm/Analysis/AffineAccessAnalysis.h index e2b223c08b97d..a50d68af2b193 100644 --- a/llvm/include/llvm/Analysis/AffineAccessAnalysis.h +++ b/llvm/include/llvm/Analysis/AffineAccessAnalysis.h @@ -22,6 +22,7 @@ class MemoryUseOrDef; class MemoryDef; struct ExpandedAffAcc; class DependenceInfo; +class LoopAccessInfo; struct LoopRep{ private: @@ -69,6 +70,7 @@ struct AffAcc{ ///immediately copies the contens of accesses and containingLoops AffAcc(ArrayRef accesses, const SCEV *Addr, MemoryUseOrDef *MA, ArrayRef containingLoops, ScalarEvolution &SE); ArrayRef getAccesses() const; + Value *getAddrValue() const; bool isWrite() const; int getMaxDimension() const; const Loop *getDeepestMalformed() const; @@ -94,7 +96,7 @@ struct AffAcc{ Value *expandBaseAddr(unsigned dimension, Type *ty, Instruction *InsertBefore = (Instruction *)nullptr); Value *expandStep(unsigned dimension, Type *ty, Instruction *InsertBefore = (Instruction *)nullptr); Value *expandRep(unsigned dimension, Type *ty, Instruction *InsertBefore = (Instruction *)nullptr); - ExpandedAffAcc expandAt(const Loop *L, Instruction *Point, Type *PtrTy, IntegerType *ParamTy, 
IntegerType *AgParamTy); + ExpandedAffAcc expandAt(const Loop *L, Instruction *Point, Type *PtrTy, IntegerType *ParamTy); }; struct MemDep { @@ -139,10 +141,10 @@ class AffineAccess{ MemDep MD; DenseMap access; DenseMap reps; - DenseMap> wellformedAccesses; - DenseMap> expandableAccesses; + DenseMap> promotedAccesses; + DenseMap> expandableAccesses; - std::vector analyze(const Loop *Parent, ArrayRef loopPath); + std::unique_ptr> analyze(Loop *Parent, ArrayRef loopPath); void addAllConflicts(const std::vector &all); AffAccConflict calcRWConflict(AffAcc *Read, AffAcc *Write, const Loop *L) const; std::pair calcConflict(AffAcc *A, AffAcc *B) const; @@ -162,7 +164,7 @@ class AffineAccess{ std::vector getExpandableAccesses(const Loop *L, bool conflictFreeOnly = false); std::vector expandAllAt(ArrayRef Accs, const Loop *L, Instruction *Point, - Value *&BoundCheck, Type *PtrTy, IntegerType *ParamTy, IntegerType *AgParamTy, bool conflictChecks = true, bool repChecks = false); + Value *&BoundCheck, Type *PtrTy, IntegerType *ParamTy, bool conflictChecks = true, bool repChecks = false); }; class AffineAccessAnalysis : public AnalysisInfoMixin { diff --git a/llvm/lib/Analysis/AffineAccessAnalysis.cpp b/llvm/lib/Analysis/AffineAccessAnalysis.cpp index cacc9067289e8..66689a2034359 100644 --- a/llvm/lib/Analysis/AffineAccessAnalysis.cpp +++ b/llvm/lib/Analysis/AffineAccessAnalysis.cpp @@ -19,6 +19,7 @@ #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" +#include "llvm/Transforms/Utils/LoopVersioning.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" @@ -27,6 +28,7 @@ #include "llvm/Analysis/AliasAnalysisEvaluator.h" #include "llvm/Analysis/AliasSetTracker.h" #include "llvm/Analysis/DependenceAnalysis.h" +#include "llvm/Analysis/LoopAccessAnalysis.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/FormatVariadic.h" @@ -268,7 +270,6 @@ bool 
isOnAllPredicatedControlFlowPaths(BasicBlock *BB, const Loop *L, const Domi if (q.front() == L->getHeader()) return false; //bfs arrived at Header (again) with a path that never went through BB } - return true; } @@ -304,22 +305,6 @@ const Loop *findFirstContaining(ArrayRef loops, BasicBlock *BB){ return nullptr; } -/*Value *goodAnd(IRBuilder<> &builder, ArrayRef bools){ - assert(!bools.empty()); - std::vector b1, b2; - for (Value *b : bools) b1.push_back(b); - while (b1.size() > 1u) { - unsigned i = 0u; - for (; i+1 < b1.size(); i += 2) { - b2.push_back(builder.CreateAnd(b1[i], b1[i+1], "and.tree")); - } - if (i < b1.size()) b2.push_back(b1[i]); //add last element if odd nr in b1 - std::swap(b1, b2); - b2.clear(); - } - return b1[0]; //return the last value -}*/ - bool hasMemInst(MemoryUseOrDef *MA) { return MA && MA->getMemoryInst(); } //updates L<-M if M is a descendant of L (or if L is nullptr) @@ -347,22 +332,86 @@ void updateOutermostExpandableExcl(const Loop *&outerMostExpandableExl, AffAccCo } } -void dumpAffAccConflict(AffAccConflict kind) { - switch (kind) +// void dumpAffAccConflict(AffAccConflict kind) { +// switch (kind) +// { +// case AffAccConflict::Bad: +// errs()<<"Bad"; +// break; +// case AffAccConflict::MustNotIntersect: +// errs()<<"MustNotIntersect"; +// break; +// case AffAccConflict::NoConflict: +// errs()<<"NoConflict"; +// break; +// default: +// break; +// } +// errs()<<"\n"; +// } + +Optional findSign(const SCEV *S, ScalarEvolution &SE, std::vector> &known) { + if (!S) return None; + + //in case we know + for (const auto &p : known) { + if (SCEVEquals(S, p.first, SE)) return p.second; + } + + //in case SE knows + if (SE.isKnownNegative(S)) return -1; + if (SE.isKnownPositive(S)) return 1; + if (S->isZero()) return 0; + + //do recursively + switch (S->getSCEVType()) { - case AffAccConflict::Bad: - errs()<<"Bad"; - break; - case AffAccConflict::MustNotIntersect: - errs()<<"MustNotIntersect"; - break; - case AffAccConflict::NoConflict: - 
errs()<<"NoConflict"; - break; + case SCEVTypes::scConstant: + if (S->isZero()) return 0; + else if (SE.isKnownPositive(S)) return 1; + else if (SE.isKnownNegative(S)) return -1; + llvm_unreachable("SE does not know sign of constant value ???"); + + case SCEVTypes::scMulExpr: { + auto l = findSign(cast(S)->getOperand(0), SE, known); + auto r = findSign(cast(S)->getOperand(1), SE, known); + if (!l.hasValue() || !r.hasValue()) return None; + return r.getValue() * l.getValue(); + } + + case SCEVTypes::scAddExpr: { + auto l = findSign(cast(S)->getOperand(0), SE, known); + auto r = findSign(cast(S)->getOperand(1), SE, known); + if (!l.hasValue() || !r.hasValue()) return None; + if (l.getValue() + r.getValue() >= 1) return 1; + if (l.getValue() + r.getValue() <= -1) return -1; + return None; + } + + case SCEVTypes::scPtrToInt: + case SCEVTypes::scTruncate: + return findSign(cast(S)->getOperand(0), SE, known); + + //TODO: could add max/min, etc... + default: - break; + return None; + } + llvm_unreachable(""); +} + +const SCEV *getZExtIfNeeded(const SCEV *S, Type *Ty, ScalarEvolution &SE) { + if (SE.getDataLayout().getTypeSizeInBits(S->getType()) < SE.getDataLayout().getTypeSizeInBits(Ty)) { + return SE.getZeroExtendExpr(S, Ty); } - errs()<<"\n"; + return S; +} + +const SCEV *getSExtIfNeeded(const SCEV *S, Type *Ty, ScalarEvolution &SE) { + if (SE.getDataLayout().getTypeSizeInBits(S->getType()) < SE.getDataLayout().getTypeSizeInBits(Ty)) { + return SE.getSignExtendExpr(S, Ty); + } + return S; } } //end of namespace @@ -561,6 +610,15 @@ unsigned AffAcc::loopToDimension(const Loop *L) const { llvm_unreachable("The provided loop does not contain `this`!"); } +Value *AffAcc::getAddrValue() const { + assert(getBaseAddr(0u) && "has an address"); + if (isWrite()) { + return cast(accesses[0])->getPointerOperand(); + } else { + return cast(accesses[0])->getPointerOperand(); + } +} + ///SCEV of base Address for the base address at a given dimension const SCEV 
*AffAcc::getBaseAddr(unsigned dim) const { assert(dim < baseAddresses.size()); return baseAddresses[dim]; } @@ -714,18 +772,22 @@ Value *AffAcc::expandRep(unsigned dimension, Type *ty, Instruction *InsertBefore } ExpandedAffAcc AffAcc::expandAt(const Loop *L, Instruction *Point, - Type *PtrTy, IntegerType *ParamTy, IntegerType *AgParamTy) + Type *PtrTy, IntegerType *ParamTy) { - errs()<<"expanding for Loop with header: "<getHeader()->getNameOrAsOperand()<<" the following:\n"; - dumpInLoop(L); - if (!Point) Point = L->getLoopPreheader()->getTerminator(); IRBuilder<> builder(Point); assert(isWellFormed(L)); std::vector reps, steps, ranges, prefixsum_ranges; - unsigned dim = loopToDimension(L); + const unsigned dim = loopToDimension(L); Value *Addr = expandBaseAddr(dim, PtrTy, Point); + IntegerType *SizeTy = IntegerType::get(SE.getContext(), (unsigned)SE.getTypeSizeInBits(Addr->getType())); Value *psum = nullptr; + Value *LowerBound = builder.CreatePtrToInt(Addr, SizeTy, "lb"); + Value *UpperBound = LowerBound; + std::vector> known; + for (int d = 1u; d < getMaxDimension(); d++) { + known.push_back(std::make_pair(this->reps[d]->getSCEVPlusOne(), 1)); + } for (unsigned i = 1u; i <= dim; i++) { reps.push_back(expandRep(i, ParamTy, Point)); steps.push_back(expandStep(i, ParamTy, Point)); @@ -735,14 +797,36 @@ ExpandedAffAcc AffAcc::expandAt(const Loop *L, Instruction *Point, if (psum) psum = builder.CreateAdd(psum, ranges.back(), formatv("prefsum.range.{0}d", i)); else psum = ranges.back(); prefixsum_ranges.push_back(psum); + auto sign = findSign(getStep(i), SE, known); + if (sign.hasValue()) { + if (sign.getValue() < 0) LowerBound = builder.CreateAdd(LowerBound, builder.CreateSExtOrTrunc(ranges.back(), SizeTy, "lb.dec")); + else if (sign.getValue() > 0) UpperBound = builder.CreateAdd(UpperBound, builder.CreateZExtOrTrunc(ranges.back(), SizeTy, "ub.inc")); + //else sign == 0: no action needed + } else { //we do not know sign! 
need to test at runtime + Value *Test = builder.CreateICmpSGE(ranges.back(), ConstantInt::get(ParamTy, 0), "test.nonnegative"); //FIXME: does not work for unsigned values > 2^30 + LowerBound = builder.CreateSelect( + builder.CreateNot(Test, formatv("not.test.{0}d", i)), + builder.CreateSExtOrTrunc(ranges.back(), SizeTy, formatv("range.{0}d.sext", i)), + ConstantInt::get(SizeTy, 0) + ); + UpperBound = builder.CreateSelect( + Test, + builder.CreateZExtOrTrunc(ranges.back(), SizeTy, formatv("range.{0}d.zext", i)), + ConstantInt::get(SizeTy, 0) + ); + } } - Value *LowerBound = builder.CreatePtrToInt(Addr, AgParamTy, "lb"); - Value *r = builder.CreateZExtOrTrunc(prefixsum_ranges.back(), AgParamTy, "prefsum.cast"); - Value *UpperBound = builder.CreateAdd(LowerBound, r, "ub"); ExpandedAffAcc Aexp(this, Addr, steps, reps, ranges, prefixsum_ranges, LowerBound, UpperBound); return Aexp; } +// // ================= CustomMultiDRTPointerChecking =================== +// //takes inspiration from RuntimePointerChecking's .insert(...) 
+// void CustomMultiDRTPointerChecking::insert(const AffAcc &A) { + +// } +// Value *generateChecks(Instruction *I, Value *memRangeStart, Value *memRangeEnd); + // ================= MemDep ============== bool MemDep::alias(Value *A, Value *B) { return !A || !B || AA.alias(A, B) != AliasResult::NoAlias; } @@ -808,40 +892,55 @@ DenseSet MemDep::findClobberUsers(MemoryDef *MA) { //================== Affine Access =========================================================== -AffineAccess::AffineAccess(Function &F, ScalarEvolution &SE, DominatorTree &DT, LoopInfo &LI, MemorySSA &MSSA, AAResults &AA, DependenceInfo &DI) - : SE(SE), DT(DT), LI(LI), MSSA(MSSA), AA(AA), DI(DI), MD(MSSA, AA){ - for (const Loop *L : LI.getTopLevelLoops()){ - std::vector all = analyze(L, ArrayRef()); - addAllConflicts(all); +AffineAccess::AffineAccess( + Function &F, ScalarEvolution &SE, DominatorTree &DT, + LoopInfo &LI, MemorySSA &MSSA, AAResults &AA, + DependenceInfo &DI + ) : SE(SE), DT(DT), LI(LI), MSSA(MSSA), AA(AA), DI(DI), MD(MSSA, AA) +{ + for (Loop *L : LI.getTopLevelLoops()){ + auto all = analyze(L, ArrayRef()); + addAllConflicts(*all); + all.release(); } } -std::vector AffineAccess::analyze(const Loop *Parent, ArrayRef loopPath){ +std::unique_ptr> AffineAccess::analyze(Loop *Parent, ArrayRef loopPath){ errs()<<"analyze: loop : "<getHeader()->getNameOrAsOperand()<<"\n"; + + //LoopRep for Parent LoopRep *ParentLR = new LoopRep(Parent, loopPath, SE, DT); reps.insert(std::make_pair(Parent, ParentLR)); //add Parent to LoopReps + + //prepare path std::vector path; path.push_back(Parent); //add Parent to path for (auto *L : loopPath) path.push_back(L); - wellformedAccesses.insert(std::make_pair(Parent, SmallVector())); - std::vector all; - - for (const Loop *L : Parent->getSubLoops()){ - std::vector accs = analyze(L, ArrayRef(path)); + + //prepare results + auto all = std::make_unique>(); + auto &promoted = promotedAccesses.insert(std::make_pair(Parent, SmallVector())).first->getSecond(); 
+ + //promote subloop accesses + for (Loop *L : Parent->getSubLoops()){ + std::unique_ptr> accs = analyze(L, ArrayRef(path)); + all->reserve(accs->size()); LoopRep *LR = reps.find(L)->second; //guaranteed to exist, no check needed bool canPromote = LR->isAvailable() && ParentLR->isAvailable() && LR->isOnAllCFPathsOfParentIfExecuted(); - for (AffAcc *A : accs){ - all.push_back(A); + for (AffAcc *A : *accs){ + all->push_back(A); if (canPromote){ //L is well-formed and on all CF-paths if its rep is >0 at run-time - auto &l = wellformedAccesses.find(Parent)->getSecond(); if (A->promote(ParentLR)){ - l.push_back(A); //guaranteed to exist + promoted.push_back(A); //guaranteed to exist } } } + accs.release(); } + //promote accesses from this loop for (BasicBlock *BB : Parent->getBlocks()){ + if (LI.getLoopFor(BB) != Parent) continue; //skip BB as it was already processed in a subloop for (Instruction &I : *BB){ MemoryUseOrDef *MA = MSSA.getMemoryAccess(&I); if (MA && hasMemInst(MA) && access.find(MA) == access.end()){ //no AffAcc for this memory access yet! 
@@ -849,19 +948,21 @@ std::vector AffineAccess::analyze(const Loop *Parent, ArrayRef(&I), AddrSCEV, MA, ArrayRef(path), SE); - all.push_back(A); + all->push_back(A); access.insert(std::make_pair(MA, A)); if (ParentLR->isAvailable()){ bool onAllCFPaths = true; for (Instruction *I : A->getAccesses()) onAllCFPaths &= isOnAllControlFlowPaths(I->getParent(), Parent, DT); if (onAllCFPaths && A->promote(ParentLR)){ - wellformedAccesses.find(Parent)->getSecond().push_back(A); //guaranteed to exist + promoted.push_back(A); //guaranteed to exist } } } } } + errs()<<"analyze: done with loop: "<getHeader()->getNameOrAsOperand()<<"\n"; + return all; } @@ -880,13 +981,12 @@ void AffineAccess::addAllConflicts(const std::vector &all) { auto p = access.find(D); if (p == access.end()) continue; AffAcc *B = p->second; - //A->dump(); B->dump(); auto r = calcConflict(A, B); - //dumpAffAccConflict(r.first); if (r.first != AffAccConflict::NoConflict) A->addConflict(B, r.second, r.first); updateOutermostExpandableExcl(outerMostExpandableExl, r.first, r.second, B->getDeepestMalformed()); assert(!outerMostExpandableExl || outerMostExpandableExl->contains(A->getMemoryAccess()->getBlock())); } + ArrayRef loops = A->getContainingLoops(); for (const Loop *L : loops) { if (!L) continue; @@ -917,24 +1017,24 @@ AffAccConflict AffineAccess::calcRWConflict(AffAcc *Read, AffAcc *Write, const L Value *Addr = getAddress(r); Value *DAddr = getAddress(w); bool dominates = MSSA.dominates(r, w); - //auto dep = DI.depends(r->getMemoryInst(), w->getMemoryInst(), dominates && L->isInnermost()); if (Addr && DAddr && AA.alias(Addr, DAddr) == NoAlias) return AffAccConflict::NoConflict; AffAccConflict kind = AffAccConflict::Bad; - if (!dominates) { //read does not dominate write ==> RaW + if (!dominates) { //read does not dominate write ==> R maybe after W kind = AffAccConflict::MustNotIntersect; - } else { //read dominates write ==> WaR + } else { //read dominates write ==> W is after R kind = 
AffAccConflict::MustNotIntersect; //exception: we know that the store always happens to a position already written from if the store is to same address as write (FIXME: CONSERVATIVE) //but the steps needs to be != 0 such that there is no dependence from one iteration to the next - if ((Addr && DAddr && AA.alias(Addr, DAddr) == MustAlias) - || accessPatternsAndAddressesMatch(Read, Write, L)) + bool nonzeroSteps = true; + unsigned dr = Read->loopToDimension(L); + unsigned dw = Write->loopToDimension(L); + while (Read->isWellFormed(dr) && Write->isWellFormed(dw)) { + nonzeroSteps &= SE.isKnownNonZero(Read->getStep(dr++)) && SE.isKnownNonZero(Write->getStep(dw++)); + } + if ((Addr && DAddr && AA.alias(Addr, DAddr) == MustAlias && nonzeroSteps) + || (accessPatternsAndAddressesMatch(Read, Write, L) && nonzeroSteps)) { - bool nonzeroSteps = true; - unsigned dr = Read->loopToDimension(L); - unsigned dw = Write->loopToDimension(L); - while (Read->isWellFormed(dr) && Write->isWellFormed(dw)) - nonzeroSteps &= SE.isKnownNonZero(Read->getStep(dr++)) && SE.isKnownNonZero(Write->getStep(dw++)); - if (nonzeroSteps) kind = AffAccConflict::NoConflict; + kind = AffAccConflict::NoConflict; } } return kind; @@ -943,9 +1043,6 @@ AffAccConflict AffineAccess::calcRWConflict(AffAcc *Read, AffAcc *Write, const L ///returns the kind of conflict (and innermost common loop) that A and B have assuming there is some memory dependency ///does not check for the memory dependency itself for to peformance std::pair AffineAccess::calcConflict(AffAcc *A, AffAcc *B) const { - //auto dep = DI.depends(A->getMemoryAccess()->getMemoryInst(), B->getMemoryAccess()->getMemoryInst(), MSSA.dominates(A->getMemoryAccess(), B->getMemoryAccess())); - //dep->dump(errs()); - //errs()<<"confused = "<isConfused()<<", is consistent = "<isConsistent()<<", is anti = "<isAnti()<<", is flow = "<isFlow()<<", is input = "<isInput()<<", is output = "<isOutput()<<"\n"; assert((A->isWrite() || B->isWrite()) && "conflict between 
two reads ???"); const Loop *const innermostCommon = findFirstContaining(A->getContainingLoops(), B->getMemoryAccess()->getBlock()); if (!innermostCommon) return std::make_pair(AffAccConflict::NoConflict, innermostCommon); @@ -1007,14 +1104,14 @@ std::vector AffineAccess::getExpandableAccesses(const Loop *L, bool co std::vector AffineAccess::expandAllAt(ArrayRef Accs, const Loop *L, Instruction *Point, Value *&BoundCheck, - Type *PtrTy, IntegerType *ParamTy, IntegerType *AgParamTy, bool conflictChecks, bool repChecks) + Type *PtrTy, IntegerType *ParamTy, bool conflictChecks, bool repChecks) { assert(Point); IRBuilder<> builder(Point); DenseMap exps; for (AffAcc *A : Accs) { //expand the requested AffAcc's - exps.insert(std::make_pair(A, std::move(A->expandAt(L, Point, PtrTy, ParamTy, AgParamTy)))); + exps.insert(std::make_pair(A, std::move(A->expandAt(L, Point, PtrTy, ParamTy)))); } std::vector checks; @@ -1033,7 +1130,7 @@ AffineAccess::expandAllAt(ArrayRef Accs, const Loop *L, case AffAccConflict::MustNotIntersect: { auto e = exps.find(B); if (e == exps.end()) { //if B was not yet expanded, do that and update the iterator for the pair in exps - e = exps.insert(std::make_pair(B, std::move(B->expandAt(L, Point, PtrTy, ParamTy, AgParamTy)))).first; + e = exps.insert(std::make_pair(B, std::move(B->expandAt(L, Point, PtrTy, ParamTy)))).first; } assert(e->first == B); ExpandedAffAcc &expB = e->getSecond(); diff --git a/llvm/lib/Target/RISCV/RISCVExpandSSRInsts.cpp b/llvm/lib/Target/RISCV/RISCVExpandSSRInsts.cpp index caa1161762794..1fc4bf919f6e3 100644 --- a/llvm/lib/Target/RISCV/RISCVExpandSSRInsts.cpp +++ b/llvm/lib/Target/RISCV/RISCVExpandSSRInsts.cpp @@ -495,5 +495,4 @@ FunctionPass *createRISCVExpandSSRPass() { return new RISCVExpandSSR(); } } if (b->getSecond().second == Pred) b->getSecond().second = MI; } - }*/ - */ \ No newline at end of file + }*/ \ No newline at end of file diff --git a/llvm/lib/Transforms/SSR/SSRGeneration.cpp 
b/llvm/lib/Transforms/SSR/SSRGeneration.cpp index d07b1f4e6cc21..1c24ce3ffd2b8 100644 --- a/llvm/lib/Transforms/SSR/SSRGeneration.cpp +++ b/llvm/lib/Transforms/SSR/SSRGeneration.cpp @@ -384,9 +384,8 @@ BasicBlock *getSingleExitBlock(const Loop *L) { Value *GenerateTCDMCheck(ExpandedAffAcc &E, Instruction *Point) { IRBuilder<> builder(Point); - IntegerType *i64 = IntegerType::getInt64Ty(Point->getContext()); - Value *c1 = builder.CreateICmpULE(ConstantInt::get(i64, SSR_SCRATCHPAD_BEGIN), E.LowerBound, "beg.check"); - Value *c2 = builder.CreateICmpULE(E.UpperBound, ConstantInt::get(i64, SSR_SCRATCHPAD_END), "end.check"); + Value *c1 = builder.CreateICmpULE(ConstantInt::get(E.LowerBound->getType(), SSR_SCRATCHPAD_BEGIN), E.LowerBound, "beg.check"); + Value *c2 = builder.CreateICmpULE(E.UpperBound, ConstantInt::get(E.UpperBound->getType(), SSR_SCRATCHPAD_END), "end.check"); return builder.CreateAnd(c1, c2, "tcdm.check"); } @@ -592,13 +591,12 @@ std::vector expandInLoop(const std::vector &accs, cons auto &ctxt = L->getHeader()->getContext(); IntegerType *i32 = IntegerType::getInt32Ty(ctxt); - IntegerType *i64 = IntegerType::getInt64Ty(ctxt); Type *i8Ptr = Type::getInt8PtrTy(ctxt); Instruction *PhT = L->getLoopPreheader()->getTerminator(); //generate Steps, Reps, base addresses, intersect checks, and bound checks - auto exp = AAA.expandAllAt(accs, L, PhT, Cond, i8Ptr, i32, i64, !SSRNoIntersectCheck, !SSRNoBoundCheck); + auto exp = AAA.expandAllAt(accs, L, PhT, Cond, i8Ptr, i32, !SSRNoIntersectCheck, !SSRNoBoundCheck); assert(Cond); //TCDM Checks @@ -851,6 +849,7 @@ PreservedAnalyses SSRGenerationPass::run(Function &F, FunctionAnalysisManager &F if (p != conds.end()) { BasicBlock *Ex = getSingleExitBlock(L); assert(Ex); + //LoopVersioning LV(LAI, ArrayRef(), Parent, &LI, &DT, &SE);LoopAccessInfo LAI(Parent, &SE, nullptr, &AA, &DT, &LI); cloneAndSetup(L->getLoopPreheader()->getTerminator(), &*Ex->getFirstInsertionPt(), p->second, exps.find(L)->getSecond()); } } From 
0cb8ef80f65ee593d17d5f905dd412a6e2731e5f Mon Sep 17 00:00:00 2001 From: thrupf Date: Wed, 6 Jul 2022 14:55:20 +0200 Subject: [PATCH 42/47] change target machine --- llvm/lib/Target/RISCV/RISCVTargetMachine.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp index 847d693d7551c..64d9028c14217 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp @@ -170,7 +170,7 @@ TargetPassConfig *RISCVTargetMachine::createPassConfig(PassManagerBase &PM) { } void RISCVPassConfig::addIRPasses() { - //addPass(createSSRReassociatePass()); + //addPass(createSSRReassociatePass()); //sadly creates some problems right now addPass(createAtomicExpandPass()); TargetPassConfig::addIRPasses(); } @@ -211,7 +211,7 @@ void RISCVPassConfig::addPreEmitPass2() { addPass(createRISCVExpandPseudoPass()); addPass(createPULPFixupHwLoops()); addPass(createRISCVExpandSSRPostRegAllocPass()); - addPass(createSNITCHAutoFrepPass()); + //addPass(createSNITCHAutoFrepPass()); can have benefits in some cases // Schedule the expansion of AMOs at the last possible moment, avoiding the // possibility for other passes to break the requirements for forward From 126d107329d9170da7660683373a9b1561903e0e Mon Sep 17 00:00:00 2001 From: thrupf Date: Mon, 11 Jul 2022 10:40:07 +0200 Subject: [PATCH 43/47] prepare backend for submission --- llvm/lib/Analysis/AffineAccessAnalysis.cpp | 42 +- llvm/lib/CodeGen/PostRASchedulerList.cpp | 11 +- llvm/lib/Target/RISCV/RISCVExpandSSRInsts.cpp | 5 +- .../RISCV/RISCVExpandSSRInstsPostRegAlloc.cpp | 123 ++---- llvm/lib/Target/RISCV/RISCVInstrInfoXssr.td | 2 +- llvm/lib/Target/RISCV/RISCVSSRReassociate.cpp | 374 +++++++++++++----- llvm/lib/Target/RISCV/RISCVTargetMachine.cpp | 10 +- .../Target/RISCV/Snitch/SNITCHAutoFrep.cpp | 112 +++--- llvm/lib/Transforms/SSR/SSRGeneration.cpp | 2 +- 9 files changed, 404 insertions(+), 
277 deletions(-) diff --git a/llvm/lib/Analysis/AffineAccessAnalysis.cpp b/llvm/lib/Analysis/AffineAccessAnalysis.cpp index 66689a2034359..cf420acd50eab 100644 --- a/llvm/lib/Analysis/AffineAccessAnalysis.cpp +++ b/llvm/lib/Analysis/AffineAccessAnalysis.cpp @@ -512,26 +512,35 @@ AffAcc::AffAcc(ArrayRef accesses, const SCEV *Addr, MemoryUseOrDe void AffAcc::findSteps(const SCEV *A, const SCEV *Factor, unsigned loop){ assert(A); assert(baseAddresses.size() == 1 && reps.size() == 1 && "we only know dim=0 so far"); - if (loop >= containingLoops.size()) return; - if (!SE.containsAddRecurrence(A) && loop < containingLoops.size()){ //A is inv to the rest of the loops + + if (loop >= containingLoops.size()) return; //we are done + + if (!SE.containsAddRecurrence(A) && loop < containingLoops.size()){ + //A is inv to the rest of the loops steps.push_back(SE.getConstant(Type::getInt64Ty(this->accesses[0]->getContext()), 0U)); findSteps(A, Factor, loop + 1u); } + switch (A->getSCEVType()) { - //case SCEVTypes::scZeroExtend: FIXME: this is unsafe, right? 
+ //unary expressions that do not change value + case SCEVTypes::scZeroExtend: //FIXME: this might be unsafe case SCEVTypes::scSignExtend: case SCEVTypes::scTruncate: return findSteps(cast(A)->getOperand(0), Factor, loop); - case SCEVTypes::scAddExpr: { - const SCEV *L = cast(A)->getOperand(0); - const SCEV *R = cast(A)->getOperand(1); - bool l = SE.containsAddRecurrence(L); - bool r = SE.containsAddRecurrence(R); - if (l && !r) return findSteps(L, Factor, loop); - else if(!l && r) return findSteps(R, Factor, loop); - return; - } + + // TODO: if we want to allow random adds in between then we would need to add the non-recursive part to the base address + // case SCEVTypes::scAddExpr: { + // const SCEV *L = cast(A)->getOperand(0); + // const SCEV *R = cast(A)->getOperand(1); + // bool l = SE.containsAddRecurrence(L); + // bool r = SE.containsAddRecurrence(R); + // if (l && !r) return findSteps(L, Factor, loop); + // else if(!l && r) return findSteps(R, Factor, loop); + // return; + // } + + //L * R case SCEVTypes::scMulExpr: { const SCEV *L = cast(A)->getOperand(0); const SCEV *R = cast(A)->getOperand(1); @@ -547,10 +556,12 @@ void AffAcc::findSteps(const SCEV *A, const SCEV *Factor, unsigned loop){ }else Factor = R; return findSteps(L, Factor, loop); } + + //{,+,Step} case SCEVTypes::scAddRecExpr: { const auto *S = cast(A); const SCEV *Step; - if (S->getLoop() == containingLoops[loop]){ + if (S->getLoop() == containingLoops[loop]){ //L == containingLoops[loop] Step = S->getStepRecurrence(SE); if (Factor) { auto p = toSameType(Factor, Step, SE, true); @@ -561,13 +572,14 @@ void AffAcc::findSteps(const SCEV *A, const SCEV *Factor, unsigned loop){ return findSteps(S->getStart(), Factor, loop+1); }else{ //A is loop-invariant to containingLoops[loop] bool occursLater = false; //loop needs to occur later - for (unsigned i = loop+1; i < containingLoops.size(); i++) occursLater = occursLater || containingLoops[i] == S->getLoop(); + for (unsigned i = loop+1; i < 
containingLoops.size(); i++) + occursLater = occursLater || containingLoops[i] == S->getLoop(); if (!occursLater) return; steps.push_back(SE.getConstant(Type::getInt64Ty(this->accesses[0]->getContext()), 0U)); return findSteps(S, Factor, loop+1); } } - default: + default: //in all other cases we cannot safely extract more steps and thus just return return; } } diff --git a/llvm/lib/CodeGen/PostRASchedulerList.cpp b/llvm/lib/CodeGen/PostRASchedulerList.cpp index b85f00a61eac1..157ee3f183726 100644 --- a/llvm/lib/CodeGen/PostRASchedulerList.cpp +++ b/llvm/lib/CodeGen/PostRASchedulerList.cpp @@ -268,10 +268,12 @@ bool PostRAScheduler::enablePostRAScheduler( TargetSubtargetInfo::RegClassVector &CriticalPathRCs) const { Mode = ST.getAntiDepBreakMode(); ST.getCriticalPathRCs(CriticalPathRCs); - // Check for explicit enable/disable of post-ra scheduling. - if (EnablePostRAScheduler.getPosition() > 0) + if (EnablePostRAScheduler.getPosition() > 0) { return EnablePostRAScheduler; + } + + // return true; //FIXME: Snitch does not enable this by default (and should probably) return ST.enablePostRAScheduler() && OptLevel >= ST.getOptLevelToEnablePostRAScheduler(); @@ -291,7 +293,6 @@ bool PostRAScheduler::runOnMachineFunction(MachineFunction &Fn) { TargetSubtargetInfo::AntiDepBreakMode AntiDepMode = TargetSubtargetInfo::ANTIDEP_NONE; SmallVector CriticalPathRCs; - // Check that post-RA scheduling is enabled for this target. // This may upgrade the AntiDepMode. 
if (!enablePostRAScheduler(Fn.getSubtarget(), PassConfig->getOptLevel(), @@ -307,11 +308,13 @@ bool PostRAScheduler::runOnMachineFunction(MachineFunction &Fn) { : TargetSubtargetInfo::ANTIDEP_NONE); } + // AntiDepMode = TargetSubtargetInfo::ANTIDEP_ALL; //FIXME: Snitch does not enable this by default (and probably should) + LLVM_DEBUG(dbgs() << "PostRAScheduler\n"); SchedulePostRATDList Scheduler(Fn, MLI, AA, RegClassInfo, AntiDepMode, CriticalPathRCs); - + // Loop over all of the basic blocks for (auto &MBB : Fn) { #ifndef NDEBUG diff --git a/llvm/lib/Target/RISCV/RISCVExpandSSRInsts.cpp b/llvm/lib/Target/RISCV/RISCVExpandSSRInsts.cpp index 1fc4bf919f6e3..93156b2315546 100644 --- a/llvm/lib/Target/RISCV/RISCVExpandSSRInsts.cpp +++ b/llvm/lib/Target/RISCV/RISCVExpandSSRInsts.cpp @@ -261,9 +261,10 @@ bool RISCVExpandSSR::expandSSR_PushPop(MachineBasicBlock &MBB, else { Register valR = MBBI->getOperand(ssrValIdx).getReg(); // Insert a "storing move" this is like a normal move but has side effects - MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII->get(RISCV::PseudoStoreMove), R).addReg(valR).getInstr(); + MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII->get(RISCV::PseudoStoreMove), R) + .addReg(valR, getRegState(MBBI->getOperand(ssrValIdx))) + .getInstr(); MBBI->eraseFromParent(); // The pseudo instruction is gone now. - MI->getOperand(0).setIsDef(); this->MoveStores.push_back(MI); } diff --git a/llvm/lib/Target/RISCV/RISCVExpandSSRInstsPostRegAlloc.cpp b/llvm/lib/Target/RISCV/RISCVExpandSSRInstsPostRegAlloc.cpp index 0b4cb3d9a0daf..2297f189fc1a3 100644 --- a/llvm/lib/Target/RISCV/RISCVExpandSSRInstsPostRegAlloc.cpp +++ b/llvm/lib/Target/RISCV/RISCVExpandSSRInstsPostRegAlloc.cpp @@ -1,51 +1,15 @@ -//===-- RISCVExpandSSRPostRegAllocInsts.cpp - Expand SSR pseudo instructions ---------===// +//===-- RISCVExpandSSRPostRegAllocInsts.cpp - Expand the rest of the SSR pseudo insts ---------===// // -// Copyright 2021 ETH Zurich, University of Bologna. 
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// ??? // //===----------------------------------------------------------------------===// // -// This file contains a pass that expands SSR pseudo instructions into target -// instructions. This pass should be run before register allocation +// This file contains a pass that expands the PseudoLoadMove and PseudoStoreMove +// into normal moves and is meant to be run after any scheduling to guarantee +// correctness. // //===----------------------------------------------------------------------===// -//===----------------------------------------------------------------------===// -// The SSR are configured in a memory-mapped address space accessible through -// the SCGGW(I)/SCGGR(I) instructions. The (I)mmediate instructions take the -// address as an immediate. The Address map is as follows: -// -// | Word| Hex | reg | -// |-----|------|------------| -// | 0 | 0x00 | status | -// | 1 | 0x01 | repeat | -// | 2 | 0x02 | Bound 0 | -// | 3 | 0x03 | Bound 1 | -// | 4 | 0x04 | Bound 2 | -// | 5 | 0x05 | Bound 3 | -// | 6 | 0x06 | Stride 0 | -// | 7 | 0x07 | Stride 1 | -// | 8 | 0x08 | Stride 2 | -// | 9 | 0x09 | Stride 3 | -// | | | _reserved_ | -// | 24 | 0x18 | Rptr 0 | -// | 25 | 0x19 | Rptr 1 | -// | 26 | 0x1a | Rptr 2 | -// | 27 | 0x1b | Rptr 3 | -// | 28 | 0x1c | Wptr 0 | -// | 29 | 0x1d | Wptr 1 | -// | 30 | 0x1e | Wptr 2 | -// | 31 | 0x1f | Wptr 3 | -// -// The data mover is selected in the lower 5 bits, the register offset is encoded -// in the upper 7 bits. 
The value passed to scfgX is therefore -// addr = dm + reg << 5 -// -// scfgw rs1 rs2 # rs1=value rs2=addr -//===----------------------------------------------------------------------===// - #include "RISCV.h" #include "RISCVInstrInfo.h" #include "RISCVTargetMachine.h" @@ -91,11 +55,6 @@ class RISCVExpandSSRPostRegAlloc : public MachineFunctionPass { StringRef getPassName() const override { return RISCV_EXPAND_SSR_POST_REG_ALLOC_NAME; } private: - - const MachineFunction *MF; - RISCVMachineFunctionInfo *RVFI; - bool Enabled; - bool expandMBB(MachineBasicBlock &MBB); bool mergePushPop(MachineBasicBlock &MBB); bool expandMI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, @@ -106,6 +65,7 @@ class RISCVExpandSSRPostRegAlloc : public MachineFunctionPass { char RISCVExpandSSRPostRegAlloc::ID = 0; +//from RISCVExpandSSRInsts.cpp static Register getSSRFtReg(unsigned streamer) { unsigned AssignedReg = RISCV::F0_D + streamer; // Advance the iterator to the assigned register until the valid @@ -119,21 +79,14 @@ static Register getSSRFtReg(unsigned streamer) { bool RISCVExpandSSRPostRegAlloc::runOnMachineFunction(MachineFunction &MF) { TII = static_cast(MF.getSubtarget().getInstrInfo()); - this->MF = &MF; - this->RVFI = MF.getInfo(); bool Modified = false; for (auto &MBB : MF) Modified |= expandMBB(MBB); - if (SSRNoRegisterMerge) errs()<<"regmerge disabled \n"; + if (SSRNoRegisterMerge) LLVM_DEBUG(dbgs()<<"regmerge disabled\n"); if (!SSRNoRegisterMerge && Modified){ for (auto &MBB : MF) mergePushPop(MBB); } - // auto &MRI = MF.getRegInfo(); - // auto &TRI = *MRI.getTargetRegisterInfo(); - // RegisterClassInfo RCI; - // RCI.runOnMachineFunction(MF); - // auto *ADB = createAggressiveAntiDepBreaker(MF, RCI, ) return Modified; } @@ -179,32 +132,27 @@ bool RISCVExpandSSRPostRegAlloc::expandSSR_StoreLoadMove(MachineBasicBlock &MBB, return true; } -static std::pair isDefIsUse(MachineInstr &MI, MCRegister R) { - bool def = false; - bool use = false; - for (auto &MOP : 
MI.operands()) { - if (MOP.isReg() && MOP.getReg() == R) { - if (MOP.isDef()) def = true; - else use = true; - } - } - return std::make_pair(def, use); -} - static MachineOperand *getUniqueUser ( MachineBasicBlock::instr_iterator beg, MachineBasicBlock::instr_iterator end, - MachineBasicBlock::instr_iterator realend, Register valR) { + + if (beg.isEnd()) return nullptr; + auto *MBB = beg->getParent(); + assert(MBB); + + auto realend = MBB->end().getInstrIterator(); + MachineOperand *UseMOP = nullptr; bool isPastEnd = false; + for (auto MII = beg; MII != realend; ++MII) { + isPastEnd |= MII == end; if (MII->isDebugInstr()) continue; //skip debug instructions - errs()<<"looing at: "<<*MII; - if (UseMOP) errs()<<"usemop = "<<*UseMOP<<"\n"; bool definesValR = false; + for (auto &MOP : MII->operands()) { if (!MOP.isReg() || MOP.getReg() != valR) continue; //at this point we know MII accesses valR, with MOP, but maybe also other operands @@ -214,28 +162,35 @@ static MachineOperand *getUniqueUser ( if (MOP.isKill()) return UseMOP; //if MOP kills valR then we can stop looking further and return } } + if (definesValR) { return UseMOP; //if MII (re-)defines valR then we must have already found the Use before, (or we haven't in which case we return null) } + } - auto *MBB = beg->getParent(); + if (MBB) { + bool avail_in_all = true; MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); + for (auto *Succ : MBB->successors()) { + if (!Succ) continue; + LivePhysRegs liveness(*MRI.getTargetRegisterInfo()); liveness.addLiveIns(*Succ); avail_in_all &= liveness.available(MRI, valR); } + if (avail_in_all) return UseMOP; + } + return nullptr; } bool RISCVExpandSSRPostRegAlloc::mergePushPop(MachineBasicBlock &MBB) { - const TargetRegisterInfo *TRI = MBB.getParent()->getRegInfo().getTargetRegisterInfo(); - Register ssr_regs[NUM_SSR]; for(unsigned ssr_no = 0; ssr_no < NUM_SSR; ++ssr_no) ssr_regs[ssr_no] = getSSRFtReg(ssr_no); @@ -259,17 +214,20 @@ bool 
RISCVExpandSSRPostRegAlloc::mergePushPop(MachineBasicBlock &MBB) { } } Register r = MI->getOperand(0).getReg(); //register to replace - errs()<<"looking for "<getOperand(0)<<"\n"; - MachineOperand *MO = getUniqueUser(std::next(MI.getReverse()), rangeLimit, MI->getParent()->end().getInstrIterator(), r); - if (!MO) errs()<<"*** NOT FOUND ***\n"; + MachineOperand *MO = getUniqueUser(std::next(MI.getReverse()), rangeLimit, r); + if (!MO) LLVM_DEBUG(dbgs()<<"*** NOT FOUND ***\n"); if (MO) { //if unique user exists MachineInstr *MIUser = MO->getParent(); if (MIUser && modified.find(MIUser) == modified.end()){ //if unique user exists and was not yet modified - MIUser->dump(); + LLVM_DEBUG(MIUser->dump()); for (auto &MOP : MIUser->operands()) { - if (MOP.isReg() && !MOP.isDef() && MOP.getReg() == r) MOP.setReg(ssr_reg); //replace all non-def uses of r with ssr_reg + if (MOP.isReg() && !MOP.isDef() && MOP.getReg() == r) { + MOP.setReg(ssr_reg); //replace all non-def uses of r with ssr_reg + MOP.setIsKill(false); + MOP.setIsRenamable(false); + } } - MIUser->dump(); + LLVM_DEBUG(MIUser->dump()); MI->eraseFromBundle(); modified.insert(MIUser); } @@ -288,16 +246,19 @@ bool RISCVExpandSSRPostRegAlloc::mergePushPop(MachineBasicBlock &MBB) { } if (predDefsR) { //if Pred defines R auto end = MI->getParent()->end().getInstrIterator(); - MachineOperand *MO = getUniqueUser(Pred->getIterator(), end, end, R); + MachineOperand *MO = getUniqueUser(Pred->getIterator(), end, R); if (MO && MO->getParent() == &*MI) { //if MI is unique user of R - Pred->dump(); + LLVM_DEBUG(Pred->dump()); for (auto &MOP : Pred->operands()) { if (MOP.isReg() && MOP.isDef() && MOP.getReg() == R) { MOP.setReg(ssr_reg); //replace all defs of R with ssr_reg MOP.setIsDef(false); + MOP.setIsKill(false); + MOP.setIsDead(false); + MOP.setIsRenamable(false); } } - Pred->dump(); + LLVM_DEBUG(Pred->dump()); MI->eraseFromBundle(); modified.insert(Pred); } diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXssr.td 
b/llvm/lib/Target/RISCV/RISCVInstrInfoXssr.td index a41df75c1cf4a..bc1e03d5eedc6 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXssr.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXssr.td @@ -124,7 +124,7 @@ class SPseudoPop: class SPseudoLoadMove: Pseudo<(outs FPR64:$val), (ins FPR64:$ssr),[]> { let mayLoad = 1; - let mayStore = 0; + let mayStore = 1; let hasSideEffects = 1; let usesCustomInserter = 0; } diff --git a/llvm/lib/Target/RISCV/RISCVSSRReassociate.cpp b/llvm/lib/Target/RISCV/RISCVSSRReassociate.cpp index 395b8969e4cca..31320a8d05624 100644 --- a/llvm/lib/Target/RISCV/RISCVSSRReassociate.cpp +++ b/llvm/lib/Target/RISCV/RISCVSSRReassociate.cpp @@ -1,16 +1,18 @@ -//===- SSRReassociatePass.cpp - Expand atomic instructions ------------------===// +//===- SSRReassociatePass.cpp - Reassociate Fast FP insts and move SSR push/pop intrinsics ------------------===// // -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// ??? // //===----------------------------------------------------------------------===// // -// This file contains a pass (at IR level) to replace atomic instructions with -// __atomic_* library calls, or target specific instruction which implement the -// same semantics in a way which better fits the target backend. This can -// include the use of (intrinsic-based) load-linked/store-conditional loops, -// AtomicCmpXchg, or type coercions. +// FIXME: The reassociation should really be done by the ReassociatePass but it +// for some reason does no reassociate fast FP insts? (maybe because it expects +// a normal out of order processor to vectorize anyway.) +// The reassociation is always done in full an can thus be quite slow when the +// dependency trees are large. Might want to introduce a max height or sth +// like that. 
+// Bubbling the Pushs/Pops might be better done in the pre RA ssr expand pass +// because we have more control over where they land there. +// This is not really meant to be used yet, so debug msg's are output by errs(). // //===----------------------------------------------------------------------===// @@ -54,7 +56,24 @@ using namespace llvm; -#define DEBUG_TYPE "ssr-inference" +#define DEBUG_TYPE "ssr-reassociate" + +namespace llvm { + cl::opt AggressiveReassociate( + "ssr-aggressive-reassociation", + cl::init(true), + cl::desc("Reassociate aggressively and move ssr push/pop out of the way. In particular: reassociate also fast fp-ops") + ); + + cl::opt BubbleStreams( + "ssr-bubble-streams", + cl::init(0), + cl::desc( + "Try to schedule pops earlier and pushs later making \"windows\" holding the given nr. of instructions given." + "This gives more freedom to the scheduler in unrolled loops. If window is too large then there are not enough registers which leads to unnecessary spills" + "0 means off (default), negative number means max window size") + ); +} namespace { @@ -72,8 +91,6 @@ namespace { private: bool runOnBB(BasicBlock &BB); - // void moveAfterWithMetadata - // DominatorTreeWrapperPass DTP; }; } // end anonymous namespace @@ -82,42 +99,80 @@ bool SSRReassociate::runOnFunction(Function &F) { bool Modified = false; errs()<<"SSR Reassociate Pass running on: "<DTP.runOnFunction(F); - if (!F.hasFnAttribute("SSR")) return false; + if (BubbleStreams) errs()<<"bubbling streams by "<getIterator(); -// while (II != BB.end() && II != LastInsertedPush) { -// auto NII = std::next(II); -// if (isa(*II)) { -// auto &Intr = cast(*II); -// if (Intr.getIntrinsicID() == Intrinsic::riscv_ssr_pop) { -// Intr.removeFromParent(); -// Intr.insertAfter(&*LastInsertedPop); -// LastInsertedPop = Intr.getIterator(); -// Modified = true; -// } else if (Intr.getIntrinsicID() == Intrinsic::riscv_ssr_push) { -// Intr.removeFromParent(); -// Intr.insertBefore(&*LastInsertedPush); -// 
LastInsertedPush = Intr.getIterator(); -// Modified = true; -// } -// } -// II = NII; -// } -// return Modified; -// } - -static bool BubbleSSRIntrinsics(BasicBlock &BB) { +static bool isPushPop(Instruction &I) { + return isa(I) && + (cast(I).getIntrinsicID() == Intrinsic::riscv_ssr_push + || cast(I).getIntrinsicID() == Intrinsic::riscv_ssr_pop); //true for ssr push as well as ssr pop +} + +//put pops at top and pushs at bottom +static bool BubbleSSRIntrinsics(BasicBlock::iterator begin, BasicBlock::iterator end) { + bool Modified = false; + auto II = begin; + auto LastInsertedPopSucc = begin; + auto LastInsertedPush = std::prev(end); + auto FirstInsertedPush = end; + while (II != end && II != FirstInsertedPush) { + auto NII = std::next(II); + if (isa(*II)) { + auto &Intr = cast(*II); + if (Intr.getIntrinsicID() == Intrinsic::riscv_ssr_pop) { + Intr.moveBefore(&*LastInsertedPopSucc); + LastInsertedPopSucc = std::next(Intr.getIterator()); + Modified = true; + } else if (Intr.getIntrinsicID() == Intrinsic::riscv_ssr_push) { + Intr.moveAfter(&*LastInsertedPush); + LastInsertedPush = Intr.getIterator(); + Modified = true; + if (FirstInsertedPush == end) FirstInsertedPush = LastInsertedPush; + } + } + II = NII; + } + return Modified; +} + +//generates the window that the above function uses to bubble +//windows are constrained by bubble_count and by ssr enable/disable calls +static bool BubbleSSRIntrinsics(BasicBlock &BB, unsigned bubble_count) { + bool Modified = false; + auto start = BB.getFirstInsertionPt(); + auto finish = start; + while (start != BB.end()) { + //increment finish until it hits an ssr enable / disable + unsigned w = 0; // or until we have bubble_count many instructions (non push/pop instructions) inside the window + while (finish != BB.end() && finish != BB.getTerminator()->getIterator() && w < bubble_count) { + assert(finish != BB.end()); + if (isa(*finish)) { + auto id = cast(*finish).getIntrinsicID(); + if (id == Intrinsic::riscv_ssr_enable || id == 
Intrinsic::riscv_ssr_disable) { + break; + } + } + if (!isPushPop(*finish) && !finish->isDebugOrPseudoInst()) w++; + finish++; + } + + Modified |= BubbleSSRIntrinsics(start, finish); + + if (finish != BB.getTerminator()->getIterator() && finish != BB.end()) finish++; //move past ssr en/dis + else break; // we are done + start = finish; + } + + return Modified; +} + +//put pops and pushs as close to their def/use as possible +static bool BubbleSSRIntrinsicsBack(BasicBlock &BB) { bool Modified = false; auto II = BB.getFirstInsertionPt(); DenseSet vis; @@ -157,25 +212,28 @@ static bool isAssociative(const Value &V) { const auto &I = cast(V); if (I.getType()->isIntegerTy(1u)) return false; //ignore bools if(I.isAssociative()) return true; - if ((I.getType()->isFloatingPointTy() && I.isFast())){ //https://gcc.gnu.org/wiki/FloatingPointMath - switch (I.getOpcode()) - { - case Instruction::BinaryOps::FAdd: - case Instruction::BinaryOps::FMul: - return true; - default: - return false; - } - } + if (isa(I) && I.hasAllowReassoc()) return true; + // if ((I.getType()->isFloatingPointTy() && I.isFast())){ //https://gcc.gnu.org/wiki/FloatingPointMath + // switch (I.getOpcode()) + // { + // case Instruction::BinaryOps::FAdd: + // case Instruction::BinaryOps::FMul: + // return true; + // default: + // return false; + // } + // } return false; } +// a bit redundant, but might allow to be extended static bool isBinop(const Value &I) { return isa(I); } static unsigned getAndUpdateHeight(const Value &V, DenseMap &heights); //bc mutual recursion +//assumes children have the correct height, updates the height of I accordingly static unsigned updateHeightFromChildren(const BinaryOperator &I, DenseMap &heights) { unsigned this_height = 1u + std::max( getAndUpdateHeight(*I.getOperand(0), heights), @@ -186,6 +244,7 @@ static unsigned updateHeightFromChildren(const BinaryOperator &I, DenseMap &heights) { if (!isa(V)) return 0; const Instruction &I = cast(V); @@ -195,90 +254,206 @@ static 
unsigned getAndUpdateHeight(const Value &V, DenseMap(I), heights); } -// static void moveAfterWithMetadata(BinaryOperator &OP, Instruction *Point) { - -// } +//moves OP and all users that are between OP and Point to after Point in the same order +static void moveAfterWithAllUsers(BinaryOperator &OP, Instruction &Point) { + assert(OP.getParent() == Point.getParent()); + auto II = std::next(Point.getIterator().getReverse()); //start right before point + auto rend = OP.getIterator().getReverse(); //end just after OP + SmallPtrSet users; //for faster lookup + for (auto *U : OP.users()) { + if (auto *I = dyn_cast(U)) { + users.insert(I); + } + } + while (II != OP.getParent()->rend() && II != rend) { + auto NII = std::next(II); + for (auto *U : II->users()){ + if (auto *I = dyn_cast(U)) + users.insert(I); + } + if (users.contains(&*II)) { + II->moveAfter(&Point); + } + II = NII; + assert(II != OP.getParent()->rend()); + } + OP.moveAfter(&Point); +} +//we can only rotate if B only depends directly on A without any other def-use path between them +static bool canRotate(const Instruction &A, const Instruction &B) { + SmallPtrSet users; + for (auto *U : A.users()) { + if (auto *I = dyn_cast(U)) users.insert(I); + } + auto II = A.getIterator(); + for (; II != A.getParent()->end() && &*II != &B; II++) { + if (users.contains(&*II)) { + for (auto *U : II->users()) { + if (auto *I = dyn_cast(U)) { + if (I == &B) return false; //additional def-use path + users.insert(I); + } + } + if (!isa(*II) && !isa(*II) && !isa(*II) && !isa(*II)) + return false; //if user (which will need to be moved is not a "simple" instrucion ==> then cannot do it) + } + } + return II != A.getParent()->end() && &*II == &B; //return true if II now points to B +} + +//single rotation counter-clockwise (trees are with root at bottom because thats how they are in LLVM IR) static BinaryOperator *rotateCC(BinaryOperator &L, BinaryOperator &I, DenseMap &heights) { - errs()<<"rotating CC:"< &heights) { - 
errs()<<"rotating CW:"<(*I.user_begin()) == &R && std::next(I.user_begin()) == I.user_end() && "the only user of I is R"); return &R; } +//try to rotate or double rotate if applicable (see AVL trees) static BinaryOperator *tryRotateL(Value &Left, Value &Root, DenseMap &heights) { if (isBinop(Left) && isBinop(Root) && isAssociative(Left) && isAssociative(Root)) { BinaryOperator &L = cast(Left); BinaryOperator &I = cast(Root); const unsigned opcode = I.getOpcode(); - if (L.getOpcode() != opcode) return nullptr; //cannot do anything + if (L.getOpcode() != opcode || L.getParent() != I.getParent()) return nullptr; //cannot do anything unsigned lh = getAndUpdateHeight(L, heights); if (lh <= 1u) return nullptr; //nothing to do auto &L_RChild = *L.getOperand(1); if (isBinop(L_RChild) && isAssociative(L_RChild) && getAndUpdateHeight(L_RChild, heights) + 1u == lh) { auto &LRC = cast(L_RChild); - if (LRC.getOpcode() == opcode) { + if (LRC.getOpcode() == opcode && LRC.getParent() == I.getParent() && canRotate(LRC, L) && canRotate(L, I)) { auto &newL = *rotateCW(LRC, L, heights); - return rotateCC(newL, I, heights); + if (canRotate(newL, I)) return rotateCC(newL, I, heights); + else return nullptr; } } - return rotateCC(L, I, heights); + if (canRotate(L, I)) return rotateCC(L, I, heights); } return nullptr; } +//try to rotate or double rotate if applicable (see AVL trees) static BinaryOperator *tryRotateR(Value &Right, Value &Root, DenseMap &heights) { if (isBinop(Right) && isBinop(Root) && isAssociative(Right) && isAssociative(Root)) { BinaryOperator &R = cast(Right); BinaryOperator &I = cast(Root); const unsigned opcode = I.getOpcode(); - if (R.getOpcode() != opcode) return nullptr; //cannot do anything + if (R.getOpcode() != opcode || R.getParent() != I.getParent()) return nullptr; //cannot do anything unsigned rh = getAndUpdateHeight(R, heights); if (rh <= 1u) return nullptr; //nothing to do auto &R_LChild = *R.getOperand(0); if (isBinop(R_LChild) && isAssociative(R_LChild) 
&& getAndUpdateHeight(R_LChild, heights) + 1u == rh) { auto &RLC = cast(R_LChild); - if (RLC.getOpcode() == opcode) { + if (RLC.getOpcode() == opcode && RLC.getParent() == I.getParent() && canRotate(RLC, R) && canRotate(R, I)) { auto &newR = *rotateCC(RLC, R, heights); - return rotateCW(newR, I, heights); + if (canRotate(newR, I)) return rotateCW(newR, I, heights); + else return nullptr; } } - return rotateCW(R, I, heights); + if (canRotate(R, I)) return rotateCW(R, I, heights); } return nullptr; } +//needed to check whether we are actually dealing with a tree +static bool subGraphsIntersect(const Value &X, const Value &Y) { + if (!isBinop(X) || !isBinop(Y)) return false; + const auto &A = cast(X); + const auto &B = cast(Y); + DenseSet seen; + std::deque q; + const BasicBlock *BB = A.getParent(); + q.push_back(&A); + while (!q.empty()) { + const auto *I = q.front(); q.pop_front(); + seen.insert(I); + if (auto *X = dyn_cast(I->getOperand(0))) { + if (X && X->getParent() == BB) q.push_back(X); + } + if (auto *X = dyn_cast(I->getOperand(1))) { + if (X && X->getParent() == BB) q.push_back(X); + } + } + assert(q.empty()); + q.push_back(&B); + while (!q.empty()) { + const auto *I = q.front(); q.pop_front(); + if (seen.contains(I)) return true; + if (auto *X = dyn_cast(I->getOperand(0))) { + if (X && X->getParent() == BB) q.push_back(X); + } + if (auto *X = dyn_cast(I->getOperand(1))) { + if (X && X->getParent() == BB) q.push_back(X); + } + } + return false; +} + +//print trees for debugging purposes +static void printDep(Value &I, unsigned lvl, DenseMap &heights, DenseSet &vis) { + if (vis.find(&I) != vis.end()) return; + vis.insert(&I); + for (unsigned i = 0; i < lvl; i++) errs()<<"| \t"; + unsigned h = 0; + if (isa(I)) { + auto p = heights.find(&cast(I)); + if (p != heights.end()) h = p->second; + } + errs()<<" h = "<(I); + for (unsigned i = 0; i < X.getNumOperands(); i++) { + auto *V = X.getOperand(i); + if (V) printDep(*V, lvl+1, heights, vis); + } + } +} + +//try to 
reassociate tree rooted in Inst (if it is a tree!) +//insts might be moved past Inst and Inst might not be the root anymore afterwards static bool Reassociate(Value &Inst, DenseMap &heights) { bool Modified = false; if (isBinop(Inst) && isAssociative(Inst)) { BinaryOperator *I = cast(&Inst); - bool improved_root = true; - while (improved_root) { - improved_root = false; - int lminusr = + unsigned h = updateHeightFromChildren(*I, heights); + if (h <= 2) return false; //nothing todo + if (subGraphsIntersect(*I->getOperand(0), *I->getOperand(1))) { + return false; //Inst is not root of a tree! cannot optimize! + } + bool better = true; + int lminusr = std::numeric_limits::max(); + DenseSet vis; + do { + if (vis.contains(I)) break; + vis.insert(I); + int new_lminusr = (int)getAndUpdateHeight(*I->getOperand(0), heights) - (int)getAndUpdateHeight(*I->getOperand(1), heights); + better = std::abs(lminusr) > std::abs(new_lminusr); + lminusr = new_lminusr; BinaryOperator *NewRoot = nullptr; if (lminusr >= 2) { NewRoot = tryRotateL(*I->getOperand(0), *I, heights); //try to fix at this height @@ -287,10 +462,12 @@ static bool Reassociate(Value &Inst, DenseMap &he } if (NewRoot) { I = NewRoot; - improved_root = true; Modified = true; + better = true; + } else { + better = false; //defenitely do not repeat if we haven't changed anything anymore } - } + } while (better); bool improved_left = Reassociate(*I->getOperand(0), heights); //fix left subtree bool improved_right = Reassociate(*I->getOperand(1), heights); //fix right subtree @@ -301,26 +478,7 @@ static bool Reassociate(Value &Inst, DenseMap &he return Modified; } -static void printDep(Value &I, unsigned lvl, DenseMap &heights, DenseSet &vis) { - if (vis.find(&I) != vis.end()) return; - vis.insert(&I); - for (unsigned i = 0; i < lvl; i++) errs()<<"| \t"; - unsigned h = 0; - if (isa(I)) { - auto p = heights.find(&cast(I)); - if (p != heights.end()) h = p->second; - } - errs()<<" h = "<(I); - for (unsigned i = 0; i < 
X.getNumOperands(); i++) { - auto *V = X.getOperand(i); - if (V) printDep(*V, lvl+1, heights, vis); - } - } -} - +//try to reassociate all insts in BB static bool Reassociate(BasicBlock &BB) { bool Modified = false; @@ -328,28 +486,36 @@ static bool Reassociate(BasicBlock &BB) { auto RI = BB.rbegin(); while (RI != BB.rend()) { - Modified |= Reassociate(*RI, heights); + if (heights.find(&*RI) == heights.end()) {//only reassociate if this was not part of any tree already + Modified |= Reassociate(*RI, heights); + } RI++; //yes, this means we miss some instructions, but those are optimized already anyway } - if (Modified) { - errs()<<"Reassociate in BB: "< vis; - for (auto RI = BB.rbegin(); RI != BB.rend(); RI++) { - printDep(*RI, 0, heights, vis); - } - BB.dump(); - } + // if (Modified) { + // errs()<<"Reassociate in BB: "< vis; + // for (auto RI = BB.rbegin(); RI != BB.rend(); RI++) { + // printDep(*RI, 0, heights, vis); + // } + // } return Modified; } +//reassociate and then bubble bool SSRReassociate::runOnBB(BasicBlock &BB) { bool Modified = false; - Modified |= Reassociate(BB); + if (AggressiveReassociate) { + Modified |= BubbleSSRIntrinsics(BB, std::numeric_limits::max()); //move pop/pushs out of the way + Modified |= Reassociate(BB); + if (BubbleStreams >= 0) Modified |= BubbleSSRIntrinsicsBack(BB); //move them back if needed + } - Modified |= BubbleSSRIntrinsics(BB); + if (BubbleStreams > 0) { + Modified |= BubbleSSRIntrinsics(BB, (unsigned)BubbleStreams); //bubble to form windows + } return Modified; } diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp index 64d9028c14217..7710f77871fd1 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp @@ -30,6 +30,8 @@ #include "llvm/Support/FormattedStream.h" #include "llvm/Support/TargetRegistry.h" #include "llvm/Target/TargetOptions.h" +#include "llvm/Transforms/Scalar/Reassociate.h" +#include 
"llvm/Transforms/Scalar.h" using namespace llvm; extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeRISCVTarget() { @@ -170,7 +172,8 @@ TargetPassConfig *RISCVTargetMachine::createPassConfig(PassManagerBase &PM) { } void RISCVPassConfig::addIRPasses() { - //addPass(createSSRReassociatePass()); //sadly creates some problems right now + addPass(createSSRReassociatePass()); //slow, see top of file for more info + //addPass(createReassociatePass()); does not reassociate fast fp-ops ??? addPass(createAtomicExpandPass()); TargetPassConfig::addIRPasses(); } @@ -211,7 +214,10 @@ void RISCVPassConfig::addPreEmitPass2() { addPass(createRISCVExpandPseudoPass()); addPass(createPULPFixupHwLoops()); addPass(createRISCVExpandSSRPostRegAllocPass()); - //addPass(createSNITCHAutoFrepPass()); can have benefits in some cases + //FIXME: scheduling the post ra scheduler after ssr expand gives better results but is unsafe + //because it might move insts with ssr regs after ssr-disable (or before enable) or reorder them internally (change in order of stream!) + // addPass(&PostRASchedulerID); + addPass(createSNITCHAutoFrepPass()); // Schedule the expansion of AMOs at the last possible moment, avoiding the // possibility for other passes to break the requirements for forward diff --git a/llvm/lib/Target/RISCV/Snitch/SNITCHAutoFrep.cpp b/llvm/lib/Target/RISCV/Snitch/SNITCHAutoFrep.cpp index a1a6be242caac..5b3e212541877 100644 --- a/llvm/lib/Target/RISCV/Snitch/SNITCHAutoFrep.cpp +++ b/llvm/lib/Target/RISCV/Snitch/SNITCHAutoFrep.cpp @@ -1,9 +1,19 @@ -//===-- SNITCHAutoFrep.cpp - Expand SSR pseudo instructions ---------===// +//===-- SNITCHAutoFrep.cpp - Automatically insert frep for repeating FP insts ---------===// // -// Copyright 2021 ETH Zurich, University of Bologna. -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// ??? 
+// +//===----------------------------------------------------------------------===// +// +// FIXME: combine this with the SNITCHFrepLoop.cpp pass + extend that one to allow +// for the PseudoLoadMove and PsuedoStoreMove pseudo insts. Might need to make two +// passes one pre RA and one post RA. +// +// This pass looks for repeating fp insts and then tries to find a reduction operation. +// If it finds one it will try to use freps stagger. If these can both be applied, the +// pass will calculate what stagger amount is best and then insert a frep inst with it +// as well as insts that reduce the different "critical paths" into one result. +// The current fpu fence is quite strong (a branch) a weaker one might suffice. +// Currently not meant to be used ==> debug output done with errs(). // //===----------------------------------------------------------------------===// @@ -24,8 +34,10 @@ using namespace llvm; #define DEBUG_TYPE "riscv-frep" namespace llvm { - /// Command line options - + cl::opt SnitchAutoFrep( + "snitch-auto-frep", + cl::init(false), + cl::desc("Find repeating fp insts in unrolled loops. 
If a reduction can be found (not good yet) insert frep with stagger.")); } #define SNITCH_AUTO_FREP_NAME "Snitch Auto Frep" @@ -80,6 +92,13 @@ char SNITCHAutoFrep::ID = 0; static constexpr unsigned fpopcodes[] = {RISCV::FADD_D, RISCV::FMUL_D, RISCV::FMADD_D, RISCV::FSGNJ_D, RISCV::FDIV_D, RISCV::FSUB_D, RISCV::FMSUB_D, RISCV::FMIN_D, RISCV::FMAX_D, RISCV::FSQRT_D}; bool SNITCHAutoFrep::runOnMachineFunction(MachineFunction &MF) { + + if (SnitchAutoFrep) { + errs()<<"snitch auto frep on "<(MF.getSubtarget().getInstrInfo()); this->MF = &MF; this->RVFI = MF.getInfo(); @@ -96,6 +115,7 @@ bool SNITCHAutoFrep::runOnMachineFunction(MachineFunction &MF) { } //very conservative +// return true if two insts do the same static bool areTheSame(MachineInstr &A, MachineInstr &B){ if (A.isBundled() || B.isBundled() || A.isDebugInstr() || B.isDebugInstr()) return false; bool same = A.getOpcode() == B.getOpcode(); @@ -108,10 +128,12 @@ static bool areTheSame(MachineInstr &A, MachineInstr &B){ return same; } +//FIXME: surely there is a better way to do this bool SNITCHAutoFrep::isFPInstr(MachineInstr &I) { return this->FPOps.find(I.getOpcode()) != this->FPOps.end(); } +//test whether the window [window_beg, window_end) is repeating and how many times it is std::pair SNITCHAutoFrep::findRep( MachineBasicBlock::instr_iterator window_beg, MachineBasicBlock::instr_iterator window_end, @@ -133,6 +155,7 @@ std::pair SNITCHAutoFrep::findRep( return std::make_pair(s_res, rep); } +//used to calculate best possible stagger amount static unsigned getCycles(unsigned opcode) { switch (opcode) { @@ -147,6 +170,9 @@ static unsigned getCycles(unsigned opcode) { } } +//return reduction operation +//fmul.d not included because we currently always init the staggered regs with 0 (and mul would need 1) +//min/max might also work, anything associative should work static Optional getCombineOpcode(unsigned opcode, unsigned src_idx) { switch (opcode) { @@ -161,12 +187,15 @@ static Optional 
getCombineOpcode(unsigned opcode, unsigned src_idx) { } } +//combine usages to mask static unsigned toMask (const std::vector> &deps) { unsigned mask = 0u; for (const auto &p : deps) mask |= p.second; return mask; } + +//find internal and external dependencies static Optional>> findRepDependenceRegs( MachineBasicBlock::instr_iterator window_begin, MachineBasicBlock::instr_iterator window_end) @@ -210,11 +239,12 @@ static Optional>> findRepDependenceR unsigned internal_mask = toMask(internal); unsigned external_mask = toMask(external); for (auto &p : external) external_mask |= p.second; - //internal needs to be a subset of external + //internal needs to be a subset of external so that we can stagger (FIXME: right?) if ((internal_mask & external_mask) ^ internal_mask) return None; return external; } +//merge dependecy vector static void mergeRegisters(std::vector> &deps) { unsigned i = 0; while (i < deps.size()) { @@ -236,6 +266,7 @@ static void mergeRegisters(std::vector> &deps) { } } +//duh static bool isSSRReg(MCRegister r) { for (unsigned i = 0; i < NUM_SSR; i++) { if (getSSRFtReg(i) == r) return true; @@ -243,6 +274,7 @@ static bool isSSRReg(MCRegister r) { return false; } +//try to find readuction operation, currently only single ops are allowed static Optional> findCombineOps( MCRegister DReg, unsigned stagger_mask, @@ -287,6 +319,7 @@ struct StaggerInfo { std::vector combineOps; }; +//try to find a way to stagger static Optional findStagger( MachineBasicBlock::instr_iterator window_begin, MachineBasicBlock::instr_iterator window_end, @@ -334,6 +367,8 @@ static MachineBasicBlock *findBB(MachineInstr &MI) { } //FIXME: no idea how to make a block a label for sure ==> just search for a branch and take its target +// there must be a better way to do this +// used for an always "dead" branch in the fpu fence static MachineBasicBlock *findBrAbleBB(MachineBasicBlock &MBB) { if (!MBB.empty()) { auto *BB = findBB(*std::prev(MBB.end())); @@ -356,6 +391,7 @@ static 
MachineBasicBlock *findBrAbleBB(MachineBasicBlock &MBB) { return &MBB; } +// work on a single BB, try to find repetitions, then try to find a way to stagger, then generate code if it gives an improvement bool SNITCHAutoFrep::process(MachineBasicBlock &MBB) { bool Modified = false; @@ -529,62 +565,4 @@ namespace llvm { FunctionPass *createSNITCHAutoFrepPass() { return new SNITCHAutoFrep(); } -} // end of namespace llvm - - -// if (window_size == 1) { -// std::vector defs; -// std::vector ins; -// for (auto &MOP : MII->operands()) { -// if (!MOP.isReg()) continue; -// if (MOP.isDef()) defs.push_back(MOP.getReg().asMCReg()); -// else ins.push_back(MOP.getReg().asMCReg()); -// std::vector inter_dep; -// for (auto &d : defs) { -// for (auto &i : ins) { -// if (d == i) inter_dep.push_back(d); -// } -// } -// if (inter_dep.size() == 1) { -// errs()<<"only one interdependence\n"; -// MCRegister stagger_reg = inter_dep[0]; -// unsigned mask_idx = 3u; -// for (unsigned s = 0; s < MII->getNumOperands(); s++) { -// if (MII->getOperand(s).isReg() && MII->getOperand(s).getReg().asMCReg() == stagger_reg) -// stagger_mask |= 1u << mask_idx; -// mask_idx--; -// } -// auto p = getCombineOpcode(MII->getOpcode()); -// if (p.hasValue()) { -// errs()<<"has combine opcode\n"; -// combine_opcode = p.getValue().first; -// unsigned allowed_mask = p.getValue().second; -// if ((stagger_mask | allowed_mask) == allowed_mask) { //allowed -// errs()<<"stagger is allowed\n"; -// while (stagger_count < MAX_STAGGER && liveness.available(MRI, stagger_reg + stagger_count + 1)) -// stagger_count++; -// if (stagger_count && stagger_mask && combine_opcode) { -// errs()<<"can stagger\n"; -// std::vector stagger_regs; -// stagger_regs.push_back(stagger_reg); -// for (unsigned x = 1; x <= stagger_count; x++){ -// BuildMI(MBB, MII, MII->getDebugLoc(), this->TII->get(RISCV::FCVT_D_W), stagger_reg + x) //fcvt.d.w stagger, zero -// .addReg(RISCV::X0); -// stagger_regs.push_back(stagger_reg + x); -// } -// 
std::vector stagger_regs2; -// while (stagger_regs.size() > 1u) { -// auto builder = BuildMI(MBB, delete_end, delete_end->getDebugLoc(), MII->getDesc()); -// unsigned m_idx = 3u; -// for (auto MOI = MII->operands_begin(); MOI != MII->operands_end(); ++MOI) { - -// } -// } - -// errs()<<"emited stagger insts\n"; -// } -// } -// } -// } -// } -// } \ No newline at end of file +} // end of namespace llvm \ No newline at end of file diff --git a/llvm/lib/Transforms/SSR/SSRGeneration.cpp b/llvm/lib/Transforms/SSR/SSRGeneration.cpp index f4cdf16c7c938..12842f3ceb048 100644 --- a/llvm/lib/Transforms/SSR/SSRGeneration.cpp +++ b/llvm/lib/Transforms/SSR/SSRGeneration.cpp @@ -423,7 +423,7 @@ void GenerateSSRSetup(ExpandedAffAcc &E, unsigned dmid, Instruction *Point){ for (Instruction *I : E.Access->getAccesses()){ std::array pusharg = {DMid, cast(I)->getValueOperand()}; builder.SetInsertPoint(I); - auto *C = builder.CreateCall(SSRPush->getFunctionType(), SSRPush, ArrayRef(pusharg)); + builder.CreateCall(SSRPush->getFunctionType(), SSRPush, ArrayRef(pusharg)); I->eraseFromParent(); n_reps++; } From 7fe3a149e807ae503b08f35e3136262c62b8afc7 Mon Sep 17 00:00:00 2001 From: thrupf Date: Sun, 31 Jul 2022 09:31:22 +0200 Subject: [PATCH 44/47] some comments --- llvm/lib/Analysis/AffineAccessAnalysis.cpp | 4 ++-- llvm/lib/Passes/PassBuilder.cpp | 2 -- llvm/lib/Transforms/SSR/SSRGeneration.cpp | 2 +- 3 files changed, 3 insertions(+), 5 deletions(-) diff --git a/llvm/lib/Analysis/AffineAccessAnalysis.cpp b/llvm/lib/Analysis/AffineAccessAnalysis.cpp index cf420acd50eab..1926b94096bde 100644 --- a/llvm/lib/Analysis/AffineAccessAnalysis.cpp +++ b/llvm/lib/Analysis/AffineAccessAnalysis.cpp @@ -1004,10 +1004,10 @@ void AffineAccess::addAllConflicts(const std::vector &all) { if (!L) continue; if (L == outerMostExpandableExl) break; if (!(!L || A->isWellFormed(L))){ - errs()<<"HERE\n"; if (L) L->dump(); if (outerMostExpandableExl) outerMostExpandableExl->dump(); A->dump(); + 
llvm_unreachable("this should not happen!"); } assert(!L || A->isWellFormed(L)); auto p = expandableAccesses.find(L); @@ -1064,7 +1064,7 @@ std::pair AffineAccess::calcConflict(AffAcc *A, Aff if (B->isWrite()) kind = AffAccConflict::MustNotIntersect; //WaW else kind = calcRWConflict(B, A, innermostCommon); //B is read and A is write } - //at this point, even if the two do not alias, we assume the chance is high that they do at runtime + //at this point, even if the two may alias, we assume the chance is high that they do at runtime //if their base addresses share some SCEVUnknowns (ie. some Value's) (FIXME: this is CONSERVATIVE) if (kind == AffAccConflict::MustNotIntersect){ const Loop *L = innermostCommon->getParentLoop(); diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp index e54e4c6b1a37d..40b2068b83cf7 100644 --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -774,8 +774,6 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level, std::move(LPM2), /*UseMemorySSA=*/false, /*UseBlockFrequencyInfo=*/false, DebugLogging)); - //FPM.addPass(SSRInferencePass()); - // Delete small array after loop unroll. 
FPM.addPass(SROA()); diff --git a/llvm/lib/Transforms/SSR/SSRGeneration.cpp b/llvm/lib/Transforms/SSR/SSRGeneration.cpp index 12842f3ceb048..3b173ae8e50ab 100644 --- a/llvm/lib/Transforms/SSR/SSRGeneration.cpp +++ b/llvm/lib/Transforms/SSR/SSRGeneration.cpp @@ -108,7 +108,7 @@ cl::opt SSRNoInline( cl::opt SSRBarrier( "ssr-barrier", cl::init(false), - cl::desc("Enable the insertion of a spinning loop that waits for the stream to be done before it is dissabled.") + cl::desc("Enable the insertion of a spinning loop that waits for the stream to be done before it is disabled.") ); } //end of namespace llvm From abba9f70a90ab722a2069230a08ebe19c2bb50c0 Mon Sep 17 00:00:00 2001 From: thrupf Date: Wed, 17 Aug 2022 13:08:20 +0200 Subject: [PATCH 45/47] clean up errs output --- llvm/lib/Analysis/AffineAccessAnalysis.cpp | 61 +++++++----------- llvm/lib/Passes/PassBuilder.cpp | 11 ---- llvm/lib/Target/RISCV/RISCVSSRReassociate.cpp | 8 +-- llvm/lib/Target/RISCV/RISCVTargetMachine.cpp | 4 +- llvm/lib/Transforms/SSR/SSRGeneration.cpp | 64 ++++++++++++------- llvm/lib/Transforms/SSR/SSRInference.cpp | 8 ++- 6 files changed, 74 insertions(+), 82 deletions(-) diff --git a/llvm/lib/Analysis/AffineAccessAnalysis.cpp b/llvm/lib/Analysis/AffineAccessAnalysis.cpp index 1926b94096bde..20635a3b713e1 100644 --- a/llvm/lib/Analysis/AffineAccessAnalysis.cpp +++ b/llvm/lib/Analysis/AffineAccessAnalysis.cpp @@ -49,6 +49,8 @@ #include #include +#define DEBUG_TYPE "ssr" + using namespace llvm; //================== AffineAcces, helper functions ========================================= @@ -247,11 +249,11 @@ bool isOnAllPredicatedControlFlowPaths(BasicBlock *BB, const Loop *L, const Domi vis.insert(Current); Instruction *T = Current->getTerminator(); - T->dump(); + LLVM_DEBUG(T->dump()); if (BranchInst *BR = dyn_cast(T)){ if (BR->isConditional()){ if (ICmpInst *Cmp = dyn_cast(BR->getCondition())){ //FOR NOW: only works with a single ICmpInst as branch condition operand - Cmp->dump(); + 
LLVM_DEBUG(Cmp->dump()); auto r = predicatedICmpOutcome(Cmp, Rep, SE); if (r.hasValue()){ if (r.getValue()) q.push_back(BR->getSuccessor(0)); @@ -332,24 +334,6 @@ void updateOutermostExpandableExcl(const Loop *&outerMostExpandableExl, AffAccCo } } -// void dumpAffAccConflict(AffAccConflict kind) { -// switch (kind) -// { -// case AffAccConflict::Bad: -// errs()<<"Bad"; -// break; -// case AffAccConflict::MustNotIntersect: -// errs()<<"MustNotIntersect"; -// break; -// case AffAccConflict::NoConflict: -// errs()<<"NoConflict"; -// break; -// default: -// break; -// } -// errs()<<"\n"; -// } - Optional findSign(const SCEV *S, ScalarEvolution &SE, std::vector> &known) { if (!S) return None; @@ -423,8 +407,8 @@ LoopRep::LoopRep(const Loop *L, ArrayRef contLoops, ScalarEvolutio : SE(SE), DT(DT), L(L), containingLoops(contLoops.begin(), contLoops.end()), safeExpandBound(0u) { RepSCEV = getLoopBTSCEV(L, DT, SE); - if (RepSCEV) errs()<<"new LoopRep with rep scev: "<<*RepSCEV<<"\n"; - else errs()<<"new LoopRep with rep scev: \n"; + if (RepSCEV) LLVM_DEBUG(dbgs()<<"new LoopRep with rep scev: "<<*RepSCEV<<"\n"); + else LLVM_DEBUG(dbgs()<<"new LoopRep with rep scev: \n"); if (RepSCEV){ while (safeExpandBound < containingLoops.size() @@ -721,14 +705,13 @@ void AffAcc::addConflict(AffAcc *A, const Loop *StartL, AffAccConflict kind){ assert(conflicts.find(A) == conflicts.end() && "no conflict for A yet"); assert(kind == AffAccConflict::Bad || (isWellFormed(StartL) && A->isWellFormed(StartL))); conflicts.insert(std::make_pair(A, std::make_pair(StartL, kind))); - //errs()<<"conflict for:\n"; dumpInLoop(StartL); errs()<<"with:\n"; A->dumpInLoop(StartL); errs()<<"is ===> "; } bool AffAcc::promote(LoopRep *LR){ if (!LR->isAvailable()) return false; unsigned newDim = (unsigned)(getMaxDimension() + 1); //getMaxDimension() >= -1 if (getLoop(newDim) != LR->getLoop()) return false; - errs()<<"promote: (1) loops match, "; + LLVM_DEBUG(dbgs()<<"promote: (1) loops match, "); bool possible = 
true; Instruction *Point = LR->getLoop()->getLoopPreheader()->getTerminator(); //check all current reps and steps @@ -736,15 +719,15 @@ bool AffAcc::promote(LoopRep *LR){ possible &= isSafeToExpandAt(getStep(dim), Point, SE); possible &= reps[dim]->isSafeToExpandBefore(LR->getLoop()); } - if (possible) errs()<<"can expand (2) current rep & step, "; + if (possible) LLVM_DEBUG(dbgs()<<"can expand (2) current rep & step, "); //check rep and step of new dimension possible &= steps.size() > newDim && isSafeToExpandAt(getStep(newDim), Point, SE); possible &= LR->isSafeToExpandBefore(LR->getLoop()); - if (possible) errs()<<"(3) new rep & step, "; + if (possible) LLVM_DEBUG(dbgs()<<"(3) new rep & step, "); //check base address possible &= !SCEVContainsCouldNotCompute(getBaseAddr(newDim)) && isSafeToExpandAt(getBaseAddr(newDim), Point, SE); - if (possible) errs()<<"and (4) new base addr!"; - errs()<<"\n"; + if (possible) LLVM_DEBUG(dbgs()<<"and (4) new base addr!"); + LLVM_DEBUG(dbgs()<<"\n"); if (!possible) return false; reps.push_back(LR); //changes getMaxDimension() @@ -755,11 +738,11 @@ Value *AffAcc::expandBaseAddr(unsigned dimension, Type *ty, Instruction *InsertB assert(isWellFormed(dimension)); InsertBefore = InsertBefore ? 
InsertBefore : reps[dimension]->getLoop()->getLoopPreheader()->getTerminator(); if (!isSafeToExpandAt(getBaseAddr(dimension), InsertBefore, SE)){ - errs()<<"data not expanable here (note: only preheader guaranteed)\n"; - errs()<<"SCEV (dim = "<getParent()->dump(); - errs()<<"before inst: "<<*InsertBefore<<"\n"; - this->dump(); + LLVM_DEBUG(dbgs()<<"data not expanable here (note: only preheader guaranteed)\n"); + LLVM_DEBUG(dbgs()<<"SCEV (dim = "<getParent()->dump()); + LLVM_DEBUG(dbgs()<<"before inst: "<<*InsertBefore<<"\n"); + LLVM_DEBUG(this->dump()); llvm_unreachable("cannot expand SCEV at desired location"); } SCEVExpander ex(SE, reps[dimension]->getLoop()->getHeader()->getModule()->getDataLayout(), "addr"); @@ -918,7 +901,7 @@ AffineAccess::AffineAccess( } std::unique_ptr> AffineAccess::analyze(Loop *Parent, ArrayRef loopPath){ - errs()<<"analyze: loop : "<getHeader()->getNameOrAsOperand()<<"\n"; + LLVM_DEBUG(dbgs()<<"analyze: loop : "<getHeader()->getNameOrAsOperand()<<"\n"); //LoopRep for Parent LoopRep *ParentLR = new LoopRep(Parent, loopPath, SE, DT); @@ -973,7 +956,7 @@ std::unique_ptr> AffineAccess::analyze(Loop *Parent, Array } } - errs()<<"analyze: done with loop: "<getHeader()->getNameOrAsOperand()<<"\n"; + LLVM_DEBUG(dbgs()<<"analyze: done with loop: "<getHeader()->getNameOrAsOperand()<<"\n"); return all; } @@ -1004,9 +987,9 @@ void AffineAccess::addAllConflicts(const std::vector &all) { if (!L) continue; if (L == outerMostExpandableExl) break; if (!(!L || A->isWellFormed(L))){ - if (L) L->dump(); - if (outerMostExpandableExl) outerMostExpandableExl->dump(); - A->dump(); + if (L) LLVM_DEBUG(L->dump()); + if (outerMostExpandableExl) LLVM_DEBUG(outerMostExpandableExl->dump()); + LLVM_DEBUG(A->dump()); llvm_unreachable("this should not happen!"); } assert(!L || A->isWellFormed(L)); @@ -1194,7 +1177,7 @@ AnalysisKey AffineAccessAnalysis::Key; AffineAccess AffineAccessAnalysis::run(Function &F, FunctionAnalysisManager &FAM) { - errs()<<"running 
AffineAccessAnalysis on "<(F); DominatorTree &DT = FAM.getResult(F); diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp index 40b2068b83cf7..0680b98465a4b 100644 --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -521,7 +521,6 @@ PassBuilder::buildO1FunctionSimplificationPipeline(OptimizationLevel Level, ThinOrFullLTOPhase Phase) { FunctionPassManager FPM(DebugLogging); - errs()<<"O1SimplificationPipeline\n"; // Form SSA out of local memory accesses after breaking apart aggregates into // scalars. FPM.addPass(SROA()); @@ -648,7 +647,6 @@ FunctionPassManager PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level, ThinOrFullLTOPhase Phase) { assert(Level != OptimizationLevel::O0 && "Must request optimizations!"); - errs()<<"O2/3FunctionSimplificationPipeline\n"; // The O1 pipeline has a separate pipeline creation function to simplify // construction readability. if (Level.getSpeedupLevel() == 1) @@ -846,7 +844,6 @@ void PassBuilder::addPGOInstrPasses(ModulePassManager &MPM, std::string ProfileFile, std::string ProfileRemappingFile) { assert(Level != OptimizationLevel::O0 && "Not expecting O0 here!"); - errs()<<"addPGOInstrPasses\n"; if (!IsCS && !DisablePreInliner) { InlineParams IP; @@ -940,7 +937,6 @@ ModuleInlinerWrapperPass PassBuilder::buildInlinerPipeline(OptimizationLevel Level, ThinOrFullLTOPhase Phase) { InlineParams IP = getInlineParamsFromOptLevel(Level); - errs()<<"buildInlinerPipeline\n"; if (Phase == ThinOrFullLTOPhase::ThinLTOPreLink && PGOOpt && PGOOpt->Action == PGOOptions::SampleUse) IP.HotCallSiteThreshold = 0; @@ -1006,7 +1002,6 @@ PassBuilder::buildInlinerPipeline(OptimizationLevel Level, ModulePassManager PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level, ThinOrFullLTOPhase Phase) { - errs()<<"ModuleSimplificationPipeline\n"; ModulePassManager MPM(DebugLogging); // Add UniqueInternalLinkageNames Pass which renames internal linkage @@ -1167,7 +1162,6 @@ 
PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level, ModulePassManager PassBuilder::buildModuleOptimizationPipeline(OptimizationLevel Level, bool LTOPreLink) { - errs()<<"ModuleOptimizationPipeline\n"; ModulePassManager MPM(DebugLogging); // Optimize globals now that the module is fully simplified. @@ -1411,7 +1405,6 @@ PassBuilder::buildModuleOptimizationPipeline(OptimizationLevel Level, ModulePassManager PassBuilder::buildPerModuleDefaultPipeline(OptimizationLevel Level, bool LTOPreLink) { - errs()<<"PerModuleDefaultPipeline\n"; assert(Level != OptimizationLevel::O0 && "Must request optimizations for the default pipeline!"); @@ -1452,7 +1445,6 @@ PassBuilder::buildPerModuleDefaultPipeline(OptimizationLevel Level, ModulePassManager PassBuilder::buildThinLTOPreLinkDefaultPipeline(OptimizationLevel Level) { - errs()<<"ThinLTOPreLinkDefaultPipeline\n"; assert(Level != OptimizationLevel::O0 && "Must request optimizations for the default pipeline!"); @@ -1509,7 +1501,6 @@ PassBuilder::buildThinLTOPreLinkDefaultPipeline(OptimizationLevel Level) { ModulePassManager PassBuilder::buildThinLTODefaultPipeline( OptimizationLevel Level, const ModuleSummaryIndex *ImportSummary) { - errs()<<"buildThinLTODefaultPipeline\n"; ModulePassManager MPM(DebugLogging); // Convert @llvm.global.annotations to !annotation metadata. @@ -1556,7 +1547,6 @@ ModulePassManager PassBuilder::buildThinLTODefaultPipeline( ModulePassManager PassBuilder::buildLTOPreLinkDefaultPipeline(OptimizationLevel Level) { - errs()<<"buildLTOPreLinkDefaultPipeline\n"; assert(Level != OptimizationLevel::O0 && "Must request optimizations for the default pipeline!"); // FIXME: We should use a customized pre-link pipeline! 
@@ -1567,7 +1557,6 @@ PassBuilder::buildLTOPreLinkDefaultPipeline(OptimizationLevel Level) { ModulePassManager PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level, ModuleSummaryIndex *ExportSummary) { - errs()<<"buildLTODefaultPipeline\n"; ModulePassManager MPM(DebugLogging); // Convert @llvm.global.annotations to !annotation metadata. diff --git a/llvm/lib/Target/RISCV/RISCVSSRReassociate.cpp b/llvm/lib/Target/RISCV/RISCVSSRReassociate.cpp index 31320a8d05624..3fefec22f122c 100644 --- a/llvm/lib/Target/RISCV/RISCVSSRReassociate.cpp +++ b/llvm/lib/Target/RISCV/RISCVSSRReassociate.cpp @@ -61,7 +61,7 @@ using namespace llvm; namespace llvm { cl::opt AggressiveReassociate( "ssr-aggressive-reassociation", - cl::init(true), + cl::init(false), cl::desc("Reassociate aggressively and move ssr push/pop out of the way. In particular: reassociate also fast fp-ops") ); @@ -98,9 +98,9 @@ namespace { bool SSRReassociate::runOnFunction(Function &F) { bool Modified = false; - errs()<<"SSR Reassociate Pass running on: "< #include +#define DEBUG_TYPE "ssr" + #define NUM_SSR 3U #define SSR_MAX_DIM 4U @@ -111,6 +113,12 @@ cl::opt SSRBarrier( cl::desc("Enable the insertion of a spinning loop that waits for the stream to be done before it is disabled.") ); +cl::opt SSRVerbose( + "ssr-verbose", + cl::init(true), + cl::desc("Write information about inferred streams to stderr.") +); + } //end of namespace llvm @@ -269,7 +277,7 @@ std::pair splitAt(Instruction *X, const Twine &name) ///assumes there is a phi node for each value defined in the region that will be cloned in the block of EndBefore that is live after EndBefore ///returns the branch that splits region from coloned region and the pair of branches that jump to EndBefore at the end std::pair> cloneRegion(Instruction *BeginWith, Instruction *EndBefore){ - errs()<<"cloning from "<<*BeginWith<<" up to "<<*EndBefore<<"\n"; + LLVM_DEBUG(dbgs()<<"cloning from "<<*BeginWith<<" up to "<<*EndBefore<<"\n"); auto p = 
splitAt(BeginWith, "split.before"); BasicBlock *Head = p.first; @@ -365,7 +373,7 @@ std::pair> cloneRegion(Instr } } } - errs()<<"done cloning \n"; + LLVM_DEBUG(dbgs()<<"done cloning \n"); return std::make_pair(HeadBr, std::make_pair(BRFuse, cast(clones.find(BRFuse)->second))); } @@ -395,7 +403,17 @@ void GenerateSSRSetup(ExpandedAffAcc &E, unsigned dmid, Instruction *Point){ IRBuilder<> builder(Point); Type *i32 = Type::getInt32Ty(Point->getContext()); unsigned dim = E.getDimension(); - errs()<<"SSR Setup for stream with dim = "<isWrite() ? "write" : "read") + <<" stream with base address SCEV " + <<*E.Access->getBaseAddr(E.getDimension()) + <<" of dimension " + <getFunctionType(), SSRDisable, ArrayRef()); - errs()<<"generated ssr_enable and ssr_disable\n"; + LLVM_DEBUG(dbgs()<<"generated ssr_enable and ssr_disable\n"); return; } @@ -563,7 +581,7 @@ std::vector expandInLoop(const std::vector &accs, cons assert(accs.size() <= NUM_SSR); assert(L); - errs()<<"expanding in Loop: "<getHeader()->getNameOrAsOperand()<<" at depth "<getLoopDepth()<<"\n"; + LLVM_DEBUG(dbgs()<<"expanding in Loop: "<getHeader()->getNameOrAsOperand()<<" at depth "<getLoopDepth()<<"\n"); auto &ctxt = L->getHeader()->getContext(); IntegerType *i32 = IntegerType::getInt32Ty(ctxt); @@ -662,7 +680,7 @@ bool visitLoop(const Loop *L, DenseMap> &pos } //add to tree: int gain = getEstGain(l, L, AAA); - errs()<<"est. gain is "<isOutermost() ? 
nullptr : L->getParentLoop()); @@ -715,7 +733,7 @@ DenseSet findLoopsWithSSR(Function &F, LoopInfo &LI) { Instruction *I = &i; if (CallBase *C = dyn_cast(I)) { if (C->hasFnAttr(SSRFnAttr)) { - errs()<<"call "<<*C<<" has attribute "< no need to mark the BB const Loop *L = LI.getLoopFor(BB); while (L) { @@ -725,13 +743,13 @@ DenseSet findLoopsWithSSR(Function &F, LoopInfo &LI) { } if (IntrinsicInst *II = dyn_cast(C)) { if (ids.contains(II->getIntrinsicID())) { - errs()<<"Intrinsic Instr "<<*II<<" calls an SSR intrinsic\n"; + LLVM_DEBUG(dbgs()<<"Intrinsic Instr "<<*II<<" calls an SSR intrinsic\n"); marked = true; //mark this (and thus also all following BBs) } } if (C->isInlineAsm()) { //inline asm may contain ssr setup insts! - errs()<<"inline asm call "<<*C<<" may contain ssr insts!\n"; - C->getType()->dump(); + LLVM_DEBUG(dbgs()<<"inline asm call "<<*C<<" may contain ssr insts!\n"); + LLVM_DEBUG(C->getType()->dump()); marked = true; } } @@ -743,9 +761,9 @@ DenseSet findLoopsWithSSR(Function &F, LoopInfo &LI) { worklist.push_back(std::make_pair(BB2, marked)); } } - if (!invalid.empty()) errs()<<"Loops that are invalid bc of SSR\n"; + if (!invalid.empty()) LLVM_DEBUG(dbgs()<<"Loops that are invalid bc of SSR\n"); for (auto l : invalid) { - errs()<<"header = "<getHeader()->getNameOrAsOperand()<<" at depth = "<getLoopDepth()<<"\n"; + LLVM_DEBUG(dbgs()<<"header = "<getHeader()->getNameOrAsOperand()<<" at depth = "<getLoopDepth()<<"\n"); } return invalid; @@ -754,22 +772,22 @@ DenseSet findLoopsWithSSR(Function &F, LoopInfo &LI) { } //end of namespace PreservedAnalyses SSRGenerationPass::run(Function &F, FunctionAnalysisManager &FAM){ - errs()<<"SSRInference Flags: "; - if (InferSSR) errs()<<"infer-ssr"; - if (SSRNoIntersectCheck) errs()<<", ssr-no-intersect-check"; - if (SSRNoBoundCheck) errs()<<", ssr-no-bound-check"; - if (SSRNoTCDMCheck) errs()<<", ssr-no-tcdm-check"; - if (SSRBarrier) errs()<<", ssr-barrier"; - if (SSRNoInline) errs()<<", ssr-no-inline"; - if 
(SSRConflictFreeOnly) errs()<<", ssr-conflict-free-only"; - errs()<<"\n"; + LLVM_DEBUG(dbgs()<<"SSRInference Flags: "); + if (InferSSR) LLVM_DEBUG(dbgs()<<"infer-ssr"); + if (SSRNoIntersectCheck) LLVM_DEBUG(dbgs()<<", ssr-no-intersect-check"); + if (SSRNoBoundCheck) LLVM_DEBUG(dbgs()<<", ssr-no-bound-check"); + if (SSRNoTCDMCheck) LLVM_DEBUG(dbgs()<<", ssr-no-tcdm-check"); + if (SSRBarrier) LLVM_DEBUG(dbgs()<<", ssr-barrier"); + if (SSRNoInline) LLVM_DEBUG(dbgs()<<", ssr-no-inline"); + if (SSRConflictFreeOnly) LLVM_DEBUG(dbgs()<<", ssr-conflict-free-only"); + LLVM_DEBUG(dbgs()<<"\n"); if (!InferSSR) return PreservedAnalyses::all(); if (F.hasFnAttribute(SSRFnAttr)) return PreservedAnalyses::all(); //this function already contains streams ==> skip AffineAccess &AAA = FAM.getResult(F); - errs()<<"SSR Generation Pass on function: "<getHeader()->getNameOrAsOperand()<<"\n"; + LLVM_DEBUG(dbgs()<<"visiting loop: "<getHeader()->getNameOrAsOperand()<<"\n"); visitLoop(L, possible, tree, AAA, ssrInvalidLoops.find(L) != ssrInvalidLoops.end()); diff --git a/llvm/lib/Transforms/SSR/SSRInference.cpp b/llvm/lib/Transforms/SSR/SSRInference.cpp index c565075585ca4..bd3e27a75c4cc 100644 --- a/llvm/lib/Transforms/SSR/SSRInference.cpp +++ b/llvm/lib/Transforms/SSR/SSRInference.cpp @@ -52,11 +52,13 @@ #include #include +#define DEBUG_TYPE "ssr" + using namespace llvm; PreservedAnalyses SSRInferencePass::run(Function &F, FunctionAnalysisManager &FAM){ - errs()<<"SSR Inference Pass on function: "< Date: Thu, 25 Aug 2022 11:41:13 +0200 Subject: [PATCH 46/47] update readme --- README.md | 19 +- llvm/lib/Analysis/AffineAccessAnalysis.cpp | 13 +- llvm/lib/Target/RISCV/CMakeLists.txt | 1 + llvm/lib/Target/RISCV/RISCV.h | 3 + llvm/lib/Target/RISCV/RISCVSSRStatistics.cpp | 169 ++++++++++++++++++ llvm/lib/Target/RISCV/RISCVTargetMachine.cpp | 5 +- llvm/lib/Transforms/SSR/SSRGeneration.cpp | 176 +++++++------------ 7 files changed, 270 insertions(+), 116 deletions(-) create mode 100644 
llvm/lib/Target/RISCV/RISCVSSRStatistics.cpp diff --git a/README.md b/README.md index 793180dc80b7b..be75cdf8c8569 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ LLVM 12 with extensions for processors and computer systems of the [PULP platfor - [HERO][hero]: mixed-data-model (64-bit + 32-bit) compilation and data sharing; automatic tiling of data structures and insertion of DMA transfers; - MemPool: Instruction scheduling model for the MemPool architecture; `Xmempool` extension to allow dynamic instruction tracing; - [PULPv2 RISC-V ISA extension (`Xpulpv2`)][hero]: automatic insertion of hardware loops, post-increment memory accesses, and multiply-accumulates; intrinsics, `clang` builtins , and assembly support for all instructions of the extension; -- [Snitch RISC-V ISA extensions (`Xssr`, `Xfrep`, and `Xdma`)][snitch]: automatic insertion of `frep` hardware loops; intrinsics and `clang` builtins for `Xssr` and `Xdma` extensions; assembly support for all instructions of the extension. +- [Snitch RISC-V ISA extensions (`Xssr`, `Xfrep`, and `Xdma`)][snitch]: automatic insertion of `frep` hardware loops; intrinsics and `clang` builtins for `Xssr` and `Xdma` extensions; assembly support for all instructions of the extension. NEW: automatic SSR inference. # HERO and PULPv2 RISC-V ISA Extension Support @@ -16,6 +16,7 @@ Refer to the [HERO repository](https://github.com/pulp-platform/hero) for build Refer to [snitch-toolchain-cd](https://github.com/pulp-platform/snitch-toolchain-cd) for build scripts and continuous deployment of pre-built toolchains. ## Command-line options +Note that flags that are passed to LLVM through `clang` need to be prefaced with `-mllvm` (use `"SHELL:-mllvm "` in CMake to prevent removal of repeated `-mllvm`s). 
| Flag | Description | |---|---| @@ -23,9 +24,16 @@ Refer to [snitch-toolchain-cd](https://github.com/pulp-platform/snitch-toolchain | `--debug-only=riscv-sdma` | Enable the debug output of the DMA pseudo instruction expansion pass | | `--debug-only=riscv-ssr` | Enable the debug output of the SSR pseudo instruction expansion pass | | `--debug-only=snitch-freploops` | Enable the debug output of the FREP loop inference pass | -| `--ssr-noregmerge` | Disable the SSR register merging in the SSR pseudo instruction expansion pass. Register merging is enabled by default and can be disabled with this flag. | +| `--ssr-no-regmerge` | Disable the SSR register merging in the SSR pseudo instruction expansion pass. Register merging is enabled by default and can be disabled with this flag. | | `--snitch-frep-inference` | Globally enable the FREP inference on all loops in the compiled module. | -| `--enable-misched=false` | Disable the machine instruction scheduler. Instructions in a complex loop with multiple SSR push or pop instructions on the same data mover may not be rescheduled because the order in which the SSR are accessed is important. | +| `-infer-ssr` | Enable automatic inference of SSR streams. | +| `-ssr-no-intersect-check` | Do not generate intersection checks (unsafe). Use `restrict` key-word instead if possible. | +| `-ssr-no-tcdm-check` | Assume all data of inferred streams is inside TCDM. | +| `-ssr-no-bound-check` | Do not generate checks that make sure the inferred stream's access is executed at least once. | +| `-ssr-conflict-free-only` | Only infer streams if they have no conflicts with other memory accesses. | +| `-ssr-no-inline` | Prevent functions that contain SSR streams from being inlined | +| `-ssr-barrier` | Enable the insertion of a spinning loop that waits for the stream to be done before it is disabled. | +| `-ssr-verbose` | Write information about inferred streams to `stderr`. 
| ## `clang` builtins The following `clang` builtins can be used to directly make use of the SSR and DMA extensions. @@ -189,6 +197,11 @@ void __builtin_ssr_setup_bound_stride_4d(uint32_t DM, uint32_t b, uint32_t s); void __builtin_ssr_barrier(uint32_t DM); ``` +#### SSR Inference Interoperability +Automatic SSR infernce will not infer any streams in an `ssr_enable` to `ssr_disable` region. +Note that SSR inference currently treats any inline asm block as if it would contain an SSR instruction. Thus it will not infer streams in any loop nests that contain inline asm somewhere. + + ### SDMA ```c diff --git a/llvm/lib/Analysis/AffineAccessAnalysis.cpp b/llvm/lib/Analysis/AffineAccessAnalysis.cpp index 20635a3b713e1..170f26e9ca87e 100644 --- a/llvm/lib/Analysis/AffineAccessAnalysis.cpp +++ b/llvm/lib/Analysis/AffineAccessAnalysis.cpp @@ -412,8 +412,10 @@ LoopRep::LoopRep(const Loop *L, ArrayRef contLoops, ScalarEvolutio if (RepSCEV){ while (safeExpandBound < containingLoops.size() - && isSafeToExpandAt(RepSCEV, containingLoops[safeExpandBound]->getLoopPreheader()->getTerminator(), SE)) + && (!containingLoops[safeExpandBound] + || isSafeToExpandAt(RepSCEV, containingLoops[safeExpandBound]->getLoopPreheader()->getTerminator(), SE))){ safeExpandBound++; + } } } @@ -715,7 +717,7 @@ bool AffAcc::promote(LoopRep *LR){ bool possible = true; Instruction *Point = LR->getLoop()->getLoopPreheader()->getTerminator(); //check all current reps and steps - for (int dim = 1; possible && dim < getMaxDimension(); dim++){ + for (unsigned dim = 1; dim < newDim; dim++){ possible &= isSafeToExpandAt(getStep(dim), Point, SE); possible &= reps[dim]->isSafeToExpandBefore(LR->getLoop()); } @@ -762,7 +764,12 @@ Value *AffAcc::expandStep(unsigned dimension, Type *ty, Instruction *InsertBefor Value *AffAcc::expandRep(unsigned dimension, Type *ty, Instruction *InsertBefore){ assert(isWellFormed(dimension) && dimension > 0u); InsertBefore = InsertBefore ? 
InsertBefore : reps[dimension]->getLoop()->getLoopPreheader()->getTerminator(); - assert(isSafeToExpandAt(getRep(dimension), InsertBefore, SE) && "data not expanable here (note: only preheader guaranteed)"); + if (!isSafeToExpandAt(getRep(dimension), InsertBefore, SE)) { + getRep(dimension)->dump(); + InsertBefore->dump(); + InsertBefore->getParent()->dump(); + this->dump(); + } return reps[dimension]->expandAt(ty, InsertBefore); } diff --git a/llvm/lib/Target/RISCV/CMakeLists.txt b/llvm/lib/Target/RISCV/CMakeLists.txt index e1faae23538d4..2dbd3f5e9be12 100644 --- a/llvm/lib/Target/RISCV/CMakeLists.txt +++ b/llvm/lib/Target/RISCV/CMakeLists.txt @@ -44,6 +44,7 @@ add_llvm_target(RISCVCodeGen RISCVTargetObjectFile.cpp RISCVTargetTransformInfo.cpp RISCVSSRReassociate.cpp + RISCVSSRStatistics.cpp Snitch/SNITCHFrepLoops.cpp Snitch/SNITCHAutoFrep.cpp diff --git a/llvm/lib/Target/RISCV/RISCV.h b/llvm/lib/Target/RISCV/RISCV.h index 4f6ea25817183..f731a4b7c1fbb 100644 --- a/llvm/lib/Target/RISCV/RISCV.h +++ b/llvm/lib/Target/RISCV/RISCV.h @@ -54,6 +54,9 @@ void initializeSNITCHAutoFrepPass(PassRegistry &); FunctionPass *createSSRReassociatePass(); void initializeSSRReassociatePass(PassRegistry &); +FunctionPass *createSSRStatisticsPass(); +void initializeSSRStatisticsPass(PassRegistry &); + FunctionPass *createRISCVExpandAtomicPseudoPass(); void initializeRISCVExpandAtomicPseudoPass(PassRegistry &); diff --git a/llvm/lib/Target/RISCV/RISCVSSRStatistics.cpp b/llvm/lib/Target/RISCV/RISCVSSRStatistics.cpp new file mode 100644 index 0000000000000..9c24e799cb09d --- /dev/null +++ b/llvm/lib/Target/RISCV/RISCVSSRStatistics.cpp @@ -0,0 +1,169 @@ +//===- RISCVSSRStatistics.cpp - Reassociate Fast FP insts and move SSR push/pop intrinsics ------------------===// +// +// ??? 
+// +//===----------------------------------------------------------------------===// +// +// count how many memory accesses there are and at what loop depth +// +//===----------------------------------------------------------------------===// + +#include "RISCV.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/CodeGen/RuntimeLibcalls.h" +#include "llvm/CodeGen/TargetLowering.h" +#include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/CodeGen/TargetSubtargetInfo.h" +#include "llvm/CodeGen/ValueTypes.h" +#include "llvm/IR/Attributes.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Constant.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/User.h" +#include "llvm/IR/Value.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/IntrinsicsRISCV.h" +#include "llvm/InitializePasses.h" +#include "llvm/Pass.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/IntrinsicsRISCV.h" +#include +#include +#include + +using namespace llvm; + +namespace { + + class SSRStatistics: public FunctionPass { + const TargetLowering *TLI = nullptr; + + public: + static char ID; // Pass identification, replacement for typeid + + SSRStatistics() : FunctionPass(ID) { + initializeSSRStatisticsPass(*PassRegistry::getPassRegistry()); + } + + bool runOnFunction(Function &F) override; + + virtual void getAnalysisUsage(AnalysisUsage& AU) const 
override { + AU.addRequired(); + } + }; + +} // end anonymous namespace + +bool SSRStatistics::runOnFunction(Function &F) { + + DenseMap ld; + DenseMap st; + DenseMap push; + DenseMap pop; + + const LoopInfo &LI = getAnalysis().getLoopInfo(); + std::vector s; + for (const auto *L : LI.getTopLevelLoops()) { + s.push_back(L); + } + for (const auto &BB : F) { + const Loop *L = LI.getLoopFor(&BB); + if (!L) continue; + if (ld.find(L) == ld.end()) ld.insert(std::make_pair(L, 0)); + if (st.find(L) == st.end()) st.insert(std::make_pair(L, 0)); + if (push.find(L) == push.end()) push.insert(std::make_pair(L, 0)); + if (pop.find(L) == pop.end()) pop.insert(std::make_pair(L, 0)); + for (const Instruction &I : BB) { + if (isa(I)) { + auto x = ld.find(L); + assert(x != ld.end()); + x->getSecond() += 1; + } else if (isa(I)) { + auto x = st.find(L); + assert(x != st.end()); + x->getSecond() += 1; + } else if (isa(I)) { + const auto &In = cast(I); + if (In.getIntrinsicID() == Intrinsic::riscv_ssr_pop) { + auto x = pop.find(L); + assert(x != pop.end()); + x->getSecond() += 1; + } else if (In.getIntrinsicID() == Intrinsic::riscv_ssr_push) { + auto x = push.find(L); + assert(x != push.end()); + x->getSecond() += 1; + } + } + } + } + + errs()<<"\""<getHeader()->getNameOrAsOperand()<<"\": {\n"; + errs()<<"\t\t\"depth\": "<getLoopDepth()<<",\n"; + errs()<<"\t\t\"loads\": "<getSecond()<<",\n"; + errs()<<"\t\t\"stores\": "<getSecond()<<",\n"; + errs()<<"\t\t\"pushs\": "<getSecond()<<",\n"; + errs()<<"\t\t\"pops\": "<getSecond()<<"\n"; + errs()<<"\t},\n"; + } + errs()<<"},\n"; + + + return false; +} + +// bool SSRStatistics::runOnFunction(Function &F) { + +// std::vector n_ld, n_st; +// constexpr int max_depth = 5; +// while (n_ld.size() <= max_depth) n_ld.push_back(0); +// while (n_st.size() <= max_depth) n_st.push_back(0); + +// const LoopInfo &LI = getAnalysis().getLoopInfo(); +// for (const auto &BB : F) { +// unsigned depth = LI.getLoopDepth(&BB); +// assert(n_ld.size() > depth); +// 
assert(n_st.size() > depth); +// for (const Instruction &I : BB) { +// if (isa(I)) { +// n_ld[depth] += 1; +// } else if (isa(I)) { +// n_st[depth] += 1; +// } +// } +// } + +// errs()< +#include + #include #include #include @@ -115,7 +118,7 @@ cl::opt SSRBarrier( cl::opt SSRVerbose( "ssr-verbose", - cl::init(true), + cl::init(false), cl::desc("Write information about inferred streams to stderr.") ); @@ -194,25 +197,7 @@ struct ConflictTree { const NodeT *Root = nullptr; }; -/* -void clobberRegisters(ArrayRef regs, IRBuilder<> &builder){ - std::string constraints = ""; - if (regs.size() > 0u) { - constraints = "~{" + regs[0] + "}"; - for (unsigned i = 1u; i < regs.size(); i++) { - constraints = "~{" + regs[i] + "}," + constraints; - } - } - InlineAsm *IA = InlineAsm::get( - FunctionType::get(Type::getVoidTy(builder.getContext()), false), - "", - constraints, - true - ); - builder.CreateCall(IA)->dump(); -} -*/ - +// copy Phi-nodes from predecessor Basic Block (BB) void copyPHIsFromPred(BasicBlock *BB){ BasicBlock *Pred = nullptr; for (BasicBlock *B : predecessors(BB)) { @@ -390,6 +375,24 @@ BasicBlock *getSingleExitBlock(const Loop *L) { return Ex; } +void printInfo(ExpandedAffAcc &E) { + errs() + <<(E.Access->isWrite() ? "write" : "read ") + <<" stream of dimension " + <getAccesses()[0]->getDebugLoc(); + if (DL.get()) { + errs() + <<" orig. 
on line " + <getBaseAddr(E.getDimension()) + <<".\n"; +} + +//code for run-time checks for TCDM Value *GenerateTCDMCheck(ExpandedAffAcc &E, Instruction *Point) { IRBuilder<> builder(Point); Value *c1 = builder.CreateICmpULE(ConstantInt::get(E.LowerBound->getType(), SSR_SCRATCHPAD_BEGIN), E.LowerBound, "beg.check"); @@ -397,6 +400,7 @@ Value *GenerateTCDMCheck(ExpandedAffAcc &E, Instruction *Point) { return builder.CreateAnd(c1, c2, "tcdm.check"); } +//generate code for SSR setup void GenerateSSRSetup(ExpandedAffAcc &E, unsigned dmid, Instruction *Point){ assert(Point); Module *mod = Point->getModule(); @@ -405,14 +409,7 @@ void GenerateSSRSetup(ExpandedAffAcc &E, unsigned dmid, Instruction *Point){ unsigned dim = E.getDimension(); LLVM_DEBUG(dbgs()<<"SSR Setup for stream with dim = "<isWrite() ? "write" : "read") - <<" stream with base address SCEV " - <<*E.Access->getBaseAddr(E.getDimension()) - <<" of dimension " - < generateSSREnDis(Instruction *PhP, Instruction *ExP){ IRBuilder<> builder(PhP); // ----------- in preheader Module *mod = PhP->getParent()->getModule(); Function *SSREnable = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_enable); - builder.CreateCall(SSREnable->getFunctionType(), SSREnable, ArrayRef()); + Instruction *en = builder.CreateCall(SSREnable->getFunctionType(), SSREnable, ArrayRef()); builder.SetInsertPoint(ExP); // ----------- in exit block //generateFPDependency(builder); Function *SSRDisable = Intrinsic::getDeclaration(mod, Intrinsic::riscv_ssr_disable); - builder.CreateCall(SSRDisable->getFunctionType(), SSRDisable, ArrayRef()); + Instruction *dis = builder.CreateCall(SSRDisable->getFunctionType(), SSRDisable, ArrayRef()); LLVM_DEBUG(dbgs()<<"generated ssr_enable and ssr_disable\n"); - return; + return std::make_pair(en, dis); } +//estimate how much it costs to compute the SSR setup data (bounds, strides, base address, etc...) 
int getEstExpandCost(AffAcc *A, unsigned dim) { int cost = 0; cost += A->getBaseAddr(dim)->getExpressionSize(); @@ -512,6 +510,7 @@ int getEstExpandCost(AffAcc *A, unsigned dim) { return cost; } +//estimate the benefit of turning some AffAccs into streams int getEstGain(ArrayRef Accs, const Loop *L, AffineAccess &AAA) { int gain = 0; DenseSet accs; @@ -612,11 +611,14 @@ void cloneAndSetup(Instruction *PhT, Instruction *ExP, Value *Cond, std::vector< assert(exp.size() <= NUM_SSR); if (exp.size() == 0u) return; + //generate en/dis range over both loop versions to prevent later runs of this pass to infer streams in the clone version + // ExP = generateSSREnDis(PhT, ExP).second; //TODO: this might be better here + + if (!isa(Cond)){ //if Cond is not a constant we cannot make the decision at compile time ==> clone whole region for if-else auto p = cloneRegion(PhT, ExP); BranchInst *BR = p.first; ExP = p.second.first; //terminator of exit block that jumps to original ExP - //PhT = cast(BR->getOperand(1u))->getTerminator(); BR->setCondition(Cond); } else { //this should never happen, but it means the runtime checks were somehow known at compile time and turned out false: @@ -632,6 +634,8 @@ void cloneAndSetup(Instruction *PhT, Instruction *ExP, Value *Cond, std::vector< generateSSREnDis(PhT, ExP); } +//predicate to filter AffAccs +//in accordance with HW limitations, i.e., dimension <= 4, type = double, see #defines used bool isValid(AffAcc *A, const Loop *L) { assert(A->isWellFormed(L)); bool valid = true; @@ -644,12 +648,16 @@ bool isValid(AffAcc *A, const Loop *L) { return valid; } +//should be guaranteed by SimplifyLoops in SSRInferencePass, but the pass says that any guarantees should be rechecked when depended upon. 
bool isValidLoop(const Loop *L) { assert(L); if (!L->getLoopPreheader() || !getSingleExitBlock(L)) return false; return true; } +// collect some information about loop: +// possible streams +// insertion into conflict tree (for mapping to data movers) bool visitLoop(const Loop *L, DenseMap> &possible, ConflictTree &tree, AffineAccess &AAA, bool isKnownInvalid) { assert(L); @@ -684,6 +692,18 @@ bool visitLoop(const Loop *L, DenseMap> &pos unsigned val = (unsigned)std::max(0, gain); tree.insertNode(L, val, L->isOutermost() ? nullptr : L->getParentLoop()); + if (SSRVerbose) { + for (auto *A : l) { + errs() + <<"potential stream with base addr SCEV " + <<*A->getBaseAddr(L) + <<" of dimension " + <loopToDimension(L) + <<"\n"; + } + if (!l.empty()) errs()<<"With est. gain = "< findLoopsWithSSR(Function &F, LoopInfo &LI) { } //end of namespace +// main "run" of this pass PreservedAnalyses SSRGenerationPass::run(Function &F, FunctionAnalysisManager &FAM){ LLVM_DEBUG(dbgs()<<"SSRInference Flags: "); if (InferSSR) LLVM_DEBUG(dbgs()<<"infer-ssr"); @@ -782,10 +803,10 @@ PreservedAnalyses SSRGenerationPass::run(Function &F, FunctionAnalysisManager &F if (SSRConflictFreeOnly) LLVM_DEBUG(dbgs()<<", ssr-conflict-free-only"); LLVM_DEBUG(dbgs()<<"\n"); - if (!InferSSR) return PreservedAnalyses::all(); + if (!InferSSR) return PreservedAnalyses::all(); //if no SSR inference is enabled, we exit early if (F.hasFnAttribute(SSRFnAttr)) return PreservedAnalyses::all(); //this function already contains streams ==> skip - AffineAccess &AAA = FAM.getResult(F); + AffineAccess &AAA = FAM.getResult(F); //call analysis LLVM_DEBUG(dbgs()<<"SSR Generation Pass on function: "<getSubLoops()) worklist.push_back(x); } - //find best expands + //find best expands (map best loops to data movers) auto f = [](unsigned a, unsigned b){ return a + b; }; std::vector best = tree.findBest(f); @@ -844,7 +865,16 @@ PreservedAnalyses SSRGenerationPass::run(Function &F, FunctionAnalysisManager &F if (p != 
conds.end()) { BasicBlock *Ex = getSingleExitBlock(L); assert(Ex); - //LoopVersioning LV(LAI, ArrayRef(), Parent, &LI, &DT, &SE);LoopAccessInfo LAI(Parent, &SE, nullptr, &AA, &DT, &LI); + if (SSRVerbose) { + errs() + <<"> Function " + <getHeader()->getParent()->getNameOrAsOperand() + <<": Expanding SSR streams with " + <<(L->getLoopDepth()-1) + <<" containing loops and setup in preheader of loop with header " + <getHeader()->getNameOrAsOperand() + <<"\n"; + } cloneAndSetup(L->getLoopPreheader()->getTerminator(), &*Ex->getFirstInsertionPt(), p->second, exps.find(L)->getSecond()); } } @@ -855,74 +885,4 @@ PreservedAnalyses SSRGenerationPass::run(Function &F, FunctionAnalysisManager &F F.addFnAttr(StringRef(SSRFnAttr)); //we have inserted a stream, tag accordingly if (SSRNoInline) F.addFnAttr(Attribute::AttrKind::NoInline); return PreservedAnalyses::none(); -} - -/* - unsigned n_reps = 0U; - if (isStore){ - for (Instruction *I : E.Access->getAccesses()){ - generatePushAsm(I, dmid, cast(I)->getValueOperand()); - I->dump(); - I->eraseFromParent(); - n_reps++; - } - }else{ - for (Instruction *I : E.Access->getAccesses()){ - builder.SetInsertPoint(I); - auto *V = generatePopAsm(I, dmid); - V->dump(); I->dump(); - BasicBlock::iterator ii(I); - ReplaceInstWithValue(I->getParent()->getInstList(), ii, V); - n_reps++; - } - } - - ///generates SSR enable & disable calls -void generateSSREnDisAsm(Instruction *PhT, Instruction *ExP){ - constexpr unsigned num_ssr = 3u; //FIXME: make use of NUM_SSR - - IRBuilder<> builder(PhT); // ----------- in preheader - auto &ctxt = builder.getContext(); - Type *Double = Type::getDoubleTy(ctxt); - std::vector structTys; - for (unsigned i = 0; i < num_ssr; i++) structTys.push_back(Double); - Type *ArrTy = StructType::get(ctxt, structTys); //auto *ArrTy = ArrayType::get(Double, num_ssr); //VectorType::get(Double, num_ssr, false); - std::vector argtypes; - for (unsigned i = 0u; i < num_ssr; i++) argtypes.push_back(Double); - std::string 
constraints = "={f0},={f1},={f2},{f0},{f1},{f2},~{memory}"; - FunctionType* fty = FunctionType::get(ArrTy, argtypes, false); - InlineAsm *En = InlineAsm::get(fty, "csrsi 0x7C0, 1\0A", constraints, true); - En->dump(); - std::vector args; - for (unsigned i = 0u; i < num_ssr; i++) args.push_back(UndefValue::get(Double)); - CallInst *Dep = builder.CreateCall(En, args, "ssr.enable.dep"); - Dep->dump(); - - builder.SetInsertPoint(ExP); // ----------- in exit block - std::vector deps; - for (unsigned i = 0u; i < num_ssr; i++) - deps.push_back(builder.CreateExtractValue(Dep, i, formatv("dep.{0}", i))); - InlineAsm *Dis = InlineAsm::get(fty, "csrci 0x7C0, 1\0A", constraints, true); - builder.CreateCall(Dis, deps, "ssr.disable.dep")->dump(); - - errs()<<"generated ssr_enable and ssr_disable\n"; - - return; -} - -Value *generatePopAsm(Instruction *InsertBefore, unsigned dmid) { - IRBuilder<> builder(InsertBefore); - FunctionType *fty = FunctionType::get(Type::getDoubleTy(builder.getContext()), false); - std::string inst = formatv("fmv.d $0, ft{0}\0A", dmid); - InlineAsm *Pop = InlineAsm::get(fty, inst, "=f", true); - return builder.CreateCall(Pop, ArrayRef(), "ssr.pop"); -} - -void generatePushAsm(Instruction *InsertBefore, unsigned dmid, Value *Val){ - IRBuilder<> builder(InsertBefore); - FunctionType *fty = FunctionType::get(Type::getVoidTy(builder.getContext()), ArrayRef(Type::getDoubleTy(builder.getContext())), false); - std::string inst = formatv("fmv.d ft{0}, $0\0A", dmid); - InlineAsm *Push = InlineAsm::get(fty, inst, "f", true); - builder.CreateCall(Push, ArrayRef(Val)); -} - */ \ No newline at end of file +} \ No newline at end of file From 049135ed4316074235702ec85811a92ce0075bbe Mon Sep 17 00:00:00 2001 From: thrupf Date: Thu, 25 Aug 2022 12:08:54 +0200 Subject: [PATCH 47/47] comments --- llvm/lib/Analysis/AffineAccessAnalysis.cpp | 70 +++++++++++++++++++--- llvm/lib/Transforms/SSR/SSRInference.cpp | 4 +- 2 files changed, 62 insertions(+), 12 deletions(-) diff 
--git a/llvm/lib/Analysis/AffineAccessAnalysis.cpp b/llvm/lib/Analysis/AffineAccessAnalysis.cpp index 170f26e9ca87e..6b5a4220eef30 100644 --- a/llvm/lib/Analysis/AffineAccessAnalysis.cpp +++ b/llvm/lib/Analysis/AffineAccessAnalysis.cpp @@ -1,3 +1,11 @@ +//===-- SSRGeneration.cpp - find prefetchable square affine accesses --------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + #include "llvm/Analysis/AffineAccessAnalysis.h" #include "llvm/IR/PassManager.h" @@ -57,6 +65,7 @@ using namespace llvm; namespace { +//collects the set of unknown values in SCEV struct SCEVUknownSetFinder { DenseSet values; // return true to follow this node. @@ -70,6 +79,7 @@ struct SCEVUknownSetFinder { bool isDone() { return false; /*continue forever*/ } }; +//finds whether two SCEVs share unknown values bool shareValues(const SCEV *A, const SCEV *B) { SCEVUknownSetFinder finderA; SCEVTraversal trA(finderA); @@ -86,6 +96,7 @@ bool shareValues(const SCEV *A, const SCEV *B) { return shareValues; } +//checks whether SCEV contains the SCEVCouldNotCompute expression bool SCEVContainsCouldNotCompute(const SCEV *S) { auto pred = [](const SCEV *X) { return !X || X->getSCEVType() == SCEVTypes::scCouldNotCompute || isa(X); }; return SCEVExprContains(S, std::move(pred)); @@ -114,6 +125,7 @@ const SCEV *getLoopBTSCEV(const Loop *L, DominatorTree &DT, ScalarEvolution &SE) return bt; } +//casts SCEVs to same type if possible (or always if unsafe = true) Optional> toSameType(const SCEV *LHS, const SCEV *RHS, ScalarEvolution &SE, bool unsafe = false){ assert(LHS && RHS); using PT = std::pair; @@ -194,6 +206,7 @@ bool isOnAllControlFlowPaths(BasicBlock *BB, const Loop *L, const DominatorTree } //return result of Cmp predicated on Rep > 0 
if possible. +// i.e. if we can say that Rep > 0 implies that Cmp is always false or true, we return that, o/w return None Optional predicatedICmpOutcome(ICmpInst *Cmp, const SCEV *Rep, ScalarEvolution &SE){ switch (Cmp->getPredicate()) { @@ -275,6 +288,7 @@ bool isOnAllPredicatedControlFlowPaths(BasicBlock *BB, const Loop *L, const Domi return true; } +//cast to right integer size, insert instruction at `InsPoint` Value *castToSize(Value *R, Type *ty, Instruction *InsPoint){ const DataLayout &DL = InsPoint->getParent()->getModule()->getDataLayout(); IRBuilder<> builder(InsPoint); @@ -289,6 +303,7 @@ Value *castToSize(Value *R, Type *ty, Instruction *InsPoint){ return builder.CreateBitOrPointerCast(R, ty, "scev.cast"); } +// extract the Address Value of MA (nullptr if not available) Value *getAddress(MemoryUseOrDef *MA) { assert(MA && "called getAddress on nullptr"); assert(MA->getMemoryInst()); @@ -298,6 +313,8 @@ Value *getAddress(MemoryUseOrDef *MA) { return nullptr; } +//find the first L in loops that contains BB +//loops should be a nesting of loops from inner to outermost const Loop *findFirstContaining(ArrayRef loops, BasicBlock *BB){ for (const Loop *L : loops) { if (L && L->contains(BB)) { @@ -307,6 +324,7 @@ const Loop *findFirstContaining(ArrayRef loops, BasicBlock *BB){ return nullptr; } +//find out whether MA stands for some load/store (for some reason they don't always do, maybe bc of DCE?) 
bool hasMemInst(MemoryUseOrDef *MA) { return MA && MA->getMemoryInst(); } //updates L<-M if M is a descendant of L (or if L is nullptr) @@ -334,6 +352,7 @@ void updateOutermostExpandableExcl(const Loop *&outerMostExpandableExl, AffAccCo } } +//tries to find the sign of SCEV with the information given Optional findSign(const SCEV *S, ScalarEvolution &SE, std::vector> &known) { if (!S) return None; @@ -384,6 +403,7 @@ Optional findSign(const SCEV *S, ScalarEvolution &SE, std::vectorgetType()) < SE.getDataLayout().getTypeSizeInBits(Ty)) { return SE.getZeroExtendExpr(S, Ty); @@ -391,6 +411,7 @@ const SCEV *getZExtIfNeeded(const SCEV *S, Type *Ty, ScalarEvolution &SE) { return S; } +//cast some SCEVs if necessary const SCEV *getSExtIfNeeded(const SCEV *S, Type *Ty, ScalarEvolution &SE) { if (SE.getDataLayout().getTypeSizeInBits(S->getType()) < SE.getDataLayout().getTypeSizeInBits(Ty)) { return SE.getSignExtendExpr(S, Ty); @@ -445,6 +466,7 @@ bool LoopRep::isSafeToExpandBefore(const Loop *L) const { return false; } +//code generation for loop rep, will cache the Value holding the results after calling for the first time to prevent excessive code-gen Value *LoopRep::expandAt(Type *ty, Instruction *InsertBefore){ assert(ty); assert(RepSCEV); @@ -463,6 +485,7 @@ Value *LoopRep::expandAt(Type *ty, Instruction *InsertBefore){ return Rep; } +//code-gen for loop guard, i.e. inserts code of rep+1 > 0 Value *LoopRep::expandLoopGuard(Instruction *InsertBefore) { assert(RepPlusOne && "expandAt has to be called before this"); InsertBefore = InsertBefore ? 
InsertBefore : L->getLoopPreheader()->getTerminator(); @@ -478,10 +501,10 @@ AffAcc::AffAcc(ArrayRef accesses, const SCEV *Addr, MemoryUseOrDe assert(MA); containingLoops.push_back((const Loop *)nullptr); //there is no loop for dim=0 - containingLoops.append(contLoops.begin(), contLoops.end()); + containingLoops.append(contLoops.begin(), contLoops.end()); //initialize loops bool isVolatile = false; - for (Instruction *I : accesses) + for (Instruction *I : accesses) //check for volatile mem insts, we don't want to touch those isVolatile |= (isa(I) && cast(I)->isVolatile()) || (isa(I) && cast(I)->isVolatile()); if (Addr && (SCEVContainsCouldNotCompute(Addr) || isVolatile)) Addr = nullptr; //set to null if contains SCEVCouldNotCompute baseAddresses.push_back(Addr); @@ -495,6 +518,8 @@ AffAcc::AffAcc(ArrayRef accesses, const SCEV *Addr, MemoryUseOrDe } } +//fold over A and collect steps in AddRec expressions +//the found steps might not be valid for square affine access patterns ==> `promote` will check this void AffAcc::findSteps(const SCEV *A, const SCEV *Factor, unsigned loop){ assert(A); assert(baseAddresses.size() == 1 && reps.size() == 1 && "we only know dim=0 so far"); @@ -602,7 +627,7 @@ bool AffAcc::isWellFormed(const Loop *L) const { return isWellFormed(loopToDimen ///returns the dimension that is defined by `L` (starts at 1) unsigned AffAcc::loopToDimension(const Loop *L) const { assert(L && "L not nullptr"); - for (unsigned d = 1u; d < containingLoops.size(); d++){ //FIXME: linear search -> improve with a map + for (unsigned d = 1u; d < containingLoops.size(); d++){ //FIXME: linear search -> improve with a map if (containingLoops[d] == L) return d; } llvm_unreachable("The provided loop does not contain `this`!"); @@ -639,6 +664,7 @@ const Loop *AffAcc::getLoop(unsigned dim) const { assert(dim < containingLoops.s ///get containing loops from inner- to outermost ArrayRef AffAcc::getContainingLoops() const { return ArrayRef(containingLoops); } +//dump info 
known for this AffAcc up to some loop L void AffAcc::dumpInLoop(const Loop *L) const { errs()<<"Affine Access of \n"; int dimension = getMaxDimension(); @@ -668,8 +694,10 @@ void AffAcc::dumpInLoop(const Loop *L) const { } } +//dump all info known about this AffAcc void AffAcc::dump() const { dumpInLoop(nullptr); } +//get the actual conflict between this and the AffAcc in the pair for some loop L AffAccConflict AffAcc::fromConflictPair(const detail::DenseMapPair> &p, const Loop *L) const { const Loop *S = p.getSecond().first; if (S == L || L->contains(S)) { //if start is L or more "inner" loop @@ -679,6 +707,7 @@ AffAccConflict AffAcc::fromConflictPair(const detail::DenseMapPair> AffAcc::getConflicts(const Loop MemoryUseOrDef *AffAcc::getMemoryAccess() { return MA; } +//add conflict with A, where StartL is innermost shared loop, with conflict classification `kind` void AffAcc::addConflict(AffAcc *A, const Loop *StartL, AffAccConflict kind){ assert(StartL); assert(conflicts.find(A) == conflicts.end() && "no conflict for A yet"); @@ -709,6 +739,9 @@ void AffAcc::addConflict(AffAcc *A, const Loop *StartL, AffAccConflict kind){ conflicts.insert(std::make_pair(A, std::make_pair(StartL, kind))); } +//promote `this` if possible. +//`LR` should be the rep of the next outer loop where this is not (yet) well-formed +// if successful, `this` is well-formed for LR->getLoop() afterwards. bool AffAcc::promote(LoopRep *LR){ if (!LR->isAvailable()) return false; unsigned newDim = (unsigned)(getMaxDimension() + 1); //getMaxDimension() >= -1 @@ -736,6 +769,7 @@ bool AffAcc::promote(LoopRep *LR){ return true; } +//Code-gen for base address Value *AffAcc::expandBaseAddr(unsigned dimension, Type *ty, Instruction *InsertBefore){ assert(isWellFormed(dimension)); InsertBefore = InsertBefore ? 
InsertBefore : reps[dimension]->getLoop()->getLoopPreheader()->getTerminator(); @@ -752,6 +786,7 @@ Value *AffAcc::expandBaseAddr(unsigned dimension, Type *ty, Instruction *InsertB return castToSize(ex.expandCodeFor(getBaseAddr(dimension)), ty, InsertBefore); } +//code-gen for step Value *AffAcc::expandStep(unsigned dimension, Type *ty, Instruction *InsertBefore){ assert(isWellFormed(dimension) && dimension > 0u); InsertBefore = InsertBefore ? InsertBefore : reps[dimension]->getLoop()->getLoopPreheader()->getTerminator(); @@ -761,6 +796,7 @@ Value *AffAcc::expandStep(unsigned dimension, Type *ty, Instruction *InsertBefor return castToSize(ex.expandCodeFor(getStep(dimension)), ty, InsertBefore); } +//code-gen for rep (calls code-gen of the LoopRep) Value *AffAcc::expandRep(unsigned dimension, Type *ty, Instruction *InsertBefore){ assert(isWellFormed(dimension) && dimension > 0u); InsertBefore = InsertBefore ? InsertBefore : reps[dimension]->getLoop()->getLoopPreheader()->getTerminator(); @@ -773,6 +809,8 @@ Value *AffAcc::expandRep(unsigned dimension, Type *ty, Instruction *InsertBefore return reps[dimension]->expandAt(ty, InsertBefore); } +//code-gen for all info needed to know the square affine access pattern inside of L +//guaranteed to work if `Point` is the terminator of preheader of L ExpandedAffAcc AffAcc::expandAt(const Loop *L, Instruction *Point, Type *PtrTy, IntegerType *ParamTy) { @@ -822,12 +860,6 @@ ExpandedAffAcc AffAcc::expandAt(const Loop *L, Instruction *Point, return Aexp; } -// // ================= CustomMultiDRTPointerChecking =================== -// //takes inspiration from RuntimePointerChecking's .insert(...) 
-// void CustomMultiDRTPointerChecking::insert(const AffAcc &A) { - -// } -// Value *generateChecks(Instruction *I, Value *memRangeStart, Value *memRangeEnd); // ================= MemDep ============== @@ -837,6 +869,8 @@ bool MemDep::alias(MemoryUseOrDef *A, MemoryUseOrDef *B) { else return alias(getAddress(A), getAddress(B)); } +//returns all MemoryDefs that might clobber MA +//i.e. we cannot be sure at compile-time that they *don't* clobber MA DenseSet MemDep::findClobbers(MemoryUseOrDef *MA){ DenseSet res; std::deque worklist; @@ -863,6 +897,7 @@ DenseSet MemDep::findClobbers(MemoryUseOrDef *MA){ return res; } +//find all MemoryUse's or MemoryDef's that might be clobbered by MA (might = must OR we do not know at compile-time) DenseSet MemDep::findClobberUsers(MemoryDef *MA) { DenseSet res; std::deque worklist; @@ -894,6 +929,7 @@ DenseSet MemDep::findClobberUsers(MemoryDef *MA) { //================== Affine Access =========================================================== +//constructor of analysis result, immediately computes all necessary information AffineAccess::AffineAccess( Function &F, ScalarEvolution &SE, DominatorTree &DT, LoopInfo &LI, MemorySSA &MSSA, AAResults &AA, @@ -907,6 +943,7 @@ AffineAccess::AffineAccess( } } +//DFS over loop tree, constructs an AffAcc for each memory access and tries to promote it as far as possible std::unique_ptr> AffineAccess::analyze(Loop *Parent, ArrayRef loopPath){ LLVM_DEBUG(dbgs()<<"analyze: loop : "<getHeader()->getNameOrAsOperand()<<"\n"); @@ -968,6 +1005,7 @@ std::unique_ptr> AffineAccess::analyze(Loop *Parent, Array return all; } +//given the list of all AffAccs in a loop-tree, this finds all the conflicts between them void AffineAccess::addAllConflicts(const std::vector &all) { for (AffAcc *A : all) { assert(A); @@ -1009,6 +1047,7 @@ void AffineAccess::addAllConflicts(const std::vector &all) { } } +//classify conflict between Read and Write AffAccConflict AffineAccess::calcRWConflict(AffAcc *Read, AffAcc 
*Write, const Loop *L) const { assert(!Read->isWrite()); assert(Write->isWrite()); @@ -1068,6 +1107,7 @@ std::pair AffineAccess::calcConflict(AffAcc *A, Aff return std::make_pair(kind, innermostCommon); } +//checks whether access patterns (step, rep) match up to some loop L bool AffineAccess::accessPatternsMatch(const AffAcc *A, const AffAcc *B, const Loop *L) const { unsigned dimA = A->loopToDimension(L); unsigned dimB = B->loopToDimension(L); @@ -1080,11 +1120,13 @@ bool AffineAccess::accessPatternsMatch(const AffAcc *A, const AffAcc *B, const L return true; } +//checks whether step, rep, and base address matches up to some loop L bool AffineAccess::accessPatternsAndAddressesMatch(const AffAcc *A, const AffAcc *B, const Loop *L) const { if (!accessPatternsMatch(A, B, L)) return false; return SCEVEquals(A->getBaseAddr(A->loopToDimension(L)), B->getBaseAddr(B->loopToDimension(L)), SE); } +//simple access methods ScalarEvolution &AffineAccess::getSE() const { return this->SE; } DominatorTree &AffineAccess::getDT()const { return this->DT; } LoopInfo &AffineAccess::getLI() const { return this->LI; } @@ -1093,6 +1135,11 @@ AAResults &AffineAccess::getAA() const { return this->AA; } DependenceInfo &AffineAccess::getDI() const { return this->DI; } SmallVector AffineAccess::getLoopsInPreorder() const { return this->LI.getLoopsInPreorder(); } +//get accesses with no bad conflicts for some loop L +//guarantees: +// no bad conflicts with any other memory instruction in L +// is well formed for L +// if conflictFreeOnly: has no conflicts at all (only NoConflict) ==> no run-time checks necessary std::vector AffineAccess::getExpandableAccesses(const Loop *L, bool conflictFreeOnly) { auto p = expandableAccesses.find(L); std::vector res; @@ -1103,6 +1150,10 @@ std::vector AffineAccess::getExpandableAccesses(const Loop *L, bool co return res; } +// code-gen: calls code-gen for all AffAccs in list, +// generates run-time checks for conflicts, +// generates run-time checks for 
loop-trip-counts (if repChecks = true) +// ANDs all the rt-checks to a single Value and writes it into BoundCheck std::vector AffineAccess::expandAllAt(ArrayRef Accs, const Loop *L, Instruction *Point, Value *&BoundCheck, @@ -1182,6 +1233,7 @@ AffineAccess::expandAllAt(ArrayRef Accs, const Loop *L, AnalysisKey AffineAccessAnalysis::Key; +// run of the analysis pass AffineAccess AffineAccessAnalysis::run(Function &F, FunctionAnalysisManager &FAM) { LLVM_DEBUG(dbgs()<<"running AffineAccessAnalysis on "<