Skip to content

Commit 3f5ac7d

Browse files
committed
[VPlan] Hoist loads with invariant addresses using noalias metadata. (llvm#166247)
This patch implements a transform to hoist single-scalar replicated loads with invariant addresses out of the vector loop to the preheader when scoped noalias metadata proves they cannot alias with any stores in the loop. This enables hoisting of loads we can prove do not alias any stores in the loop due to memory runtime checks added during vectorization. PR: llvm#166247 (cherry picked from commit 7c34848)
1 parent 32ade50 commit 3f5ac7d

22 files changed

+337
-236
lines changed

llvm/include/llvm/Analysis/ScopedNoAliasAA.h

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -46,12 +46,12 @@ class ScopedNoAliasAAResult : public AAResultBase {
4646
LLVM_ABI ModRefInfo getModRefInfo(const CallBase *Call1,
4747
const CallBase *Call2, AAQueryInfo &AAQI);
4848

49-
LLVM_ABI void
49+
LLVM_ABI static void
5050
collectScopedDomains(const MDNode *NoAlias,
51-
SmallPtrSetImpl<const MDNode *> &Domains) const;
51+
SmallPtrSetImpl<const MDNode *> &Domains);
5252

53-
private:
54-
bool mayAliasInScopes(const MDNode *Scopes, const MDNode *NoAlias) const;
53+
LLVM_ABI static bool mayAliasInScopes(const MDNode *Scopes,
54+
const MDNode *NoAlias);
5555
};
5656

5757
/// Analysis pass providing a never-invalidated alias analysis result.

llvm/lib/Analysis/ScopedNoAliasAA.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -116,7 +116,7 @@ static void collectMDInDomain(const MDNode *List, const MDNode *Domain,
116116

117117
/// Collect the set of scoped domains relevant to the noalias scopes.
118118
void ScopedNoAliasAAResult::collectScopedDomains(
119-
const MDNode *NoAlias, SmallPtrSetImpl<const MDNode *> &Domains) const {
119+
const MDNode *NoAlias, SmallPtrSetImpl<const MDNode *> &Domains) {
120120
if (!NoAlias)
121121
return;
122122
assert(Domains.empty() && "Domains should be empty");
@@ -127,7 +127,7 @@ void ScopedNoAliasAAResult::collectScopedDomains(
127127
}
128128

129129
bool ScopedNoAliasAAResult::mayAliasInScopes(const MDNode *Scopes,
130-
const MDNode *NoAlias) const {
130+
const MDNode *NoAlias) {
131131
if (!Scopes || !NoAlias)
132132
return true;
133133

llvm/lib/Transforms/Vectorize/VPlan.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@
3434
#include "llvm/ADT/ilist.h"
3535
#include "llvm/ADT/ilist_node.h"
3636
#include "llvm/Analysis/IVDescriptors.h"
37+
#include "llvm/Analysis/MemoryLocation.h"
3738
#include "llvm/Analysis/VectorUtils.h"
3839
#include "llvm/IR/DebugLoc.h"
3940
#include "llvm/IR/FMF.h"
@@ -947,6 +948,13 @@ class VPIRMetadata {
947948
/// Intersect this VPIRMetadata object with \p MD, keeping only metadata
948949
/// nodes that are common to both.
949950
void intersect(const VPIRMetadata &MD);
951+
952+
/// Get metadata of kind \p Kind. Returns nullptr if not found.
953+
MDNode *getMetadata(unsigned Kind) const {
954+
auto It =
955+
find_if(Metadata, [Kind](const auto &P) { return P.first == Kind; });
956+
return It != Metadata.end() ? It->second : nullptr;
957+
}
950958
};
951959

952960
/// This is a concrete Recipe that models a single VPlan-level instruction.

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -156,8 +156,13 @@ bool VPRecipeBase::mayHaveSideEffects() const {
156156
case VPPredInstPHISC:
157157
case VPVectorEndPointerSC:
158158
return false;
159-
case VPInstructionSC:
159+
case VPInstructionSC: {
160+
auto *VPI = cast<VPInstruction>(this);
161+
if (VPI->getOpcode() == VPInstruction::BranchOnCond ||
162+
VPI->getOpcode() == VPInstruction::BranchOnCount)
163+
return true;
160164
return mayWriteToMemory();
165+
}
161166
case VPWidenCallSC: {
162167
Function *Fn = cast<VPWidenCallRecipe>(this)->getCalledScalarFunction();
163168
return mayWriteToMemory() || !Fn->doesNotThrow() || !Fn->willReturn();
@@ -1038,6 +1043,8 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const {
10381043
case Instruction::ICmp:
10391044
case Instruction::Select:
10401045
case VPInstruction::AnyOf:
1046+
case VPInstruction::BranchOnCond:
1047+
case VPInstruction::BranchOnCount:
10411048
case VPInstruction::BuildStructVector:
10421049
case VPInstruction::BuildVector:
10431050
case VPInstruction::CalculateTripCountMinusVF:

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,9 @@
2424
#include "llvm/ADT/APInt.h"
2525
#include "llvm/ADT/PostOrderIterator.h"
2626
#include "llvm/ADT/STLExtras.h"
27+
#include "llvm/ADT/SetOperations.h"
2728
#include "llvm/ADT/SetVector.h"
29+
#include "llvm/ADT/SmallPtrSet.h"
2830
#include "llvm/ADT/TypeSwitch.h"
2931
#include "llvm/Analysis/IVDescriptors.h"
3032
#include "llvm/Analysis/InstSimplifyFolder.h"
@@ -33,6 +35,13 @@
3335
#include "llvm/IR/Intrinsics.h"
3436
#include "llvm/IR/MDBuilder.h"
3537
#include "llvm/IR/PatternMatch.h"
38+
#include "llvm/Analysis/MemoryLocation.h"
39+
#include "llvm/Analysis/ScalarEvolutionPatternMatch.h"
40+
#include "llvm/Analysis/ScopedNoAliasAA.h"
41+
#include "llvm/Analysis/VectorUtils.h"
42+
#include "llvm/IR/Intrinsics.h"
43+
#include "llvm/IR/MDBuilder.h"
44+
#include "llvm/IR/Metadata.h"
3645
#include "llvm/Support/Casting.h"
3746
#include "llvm/Support/TypeSize.h"
3847

@@ -1891,6 +1900,7 @@ static void removeBranchOnConst(VPlan &Plan) {
18911900
vp_depth_first_shallow(Plan.getEntry()))) {
18921901
VPValue *Cond;
18931902
if (VPBB->getNumSuccessors() != 2 || VPBB == Plan.getEntry() ||
1903+
VPBB->empty() ||
18941904
!match(&VPBB->back(), m_BranchOnCond(m_VPValue(Cond))))
18951905
continue;
18961906

@@ -1936,6 +1946,7 @@ void VPlanTransforms::optimize(VPlan &Plan) {
19361946
runPass(removeDeadRecipes, Plan);
19371947

19381948
runPass(createAndOptimizeReplicateRegions, Plan);
1949+
runPass(hoistInvariantLoads, Plan);
19391950
runPass(mergeBlocksIntoPredecessors, Plan);
19401951
runPass(licm, Plan);
19411952
}
@@ -3091,6 +3102,55 @@ void VPlanTransforms::materializeBroadcasts(VPlan &Plan) {
30913102
}
30923103
}
30933104

3105+
/// Hoist single-scalar replicated loads with loop-invariant addresses from the
/// vector loop region of \p Plan to the vector preheader, when scoped noalias
/// metadata shows they cannot alias any memory-writing recipe visited in the
/// loop region. Does nothing if \p Plan has no vector loop region, or if any
/// visited memory-writing recipe lacks a memory location or alias.scope /
/// noalias metadata.
void VPlanTransforms::hoistInvariantLoads(VPlan &Plan) {
  VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
  if (!LoopRegion)
    return;

  // Collect candidate loads with invariant addresses and noalias scopes
  // metadata and memory-writing recipes with noalias metadata.
  SmallVector<std::pair<VPRecipeBase *, MemoryLocation>> CandidateLoads;
  SmallVector<MemoryLocation> Stores;
  // NOTE(review): the traversal is shallow; confirm whether memory-writing
  // recipes inside nested regions of the loop region need to be collected
  // here as well.
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
           vp_depth_first_shallow(LoopRegion->getEntry()))) {
    for (VPRecipeBase &R : *VPBB) {
      // Only unpredicated, single-scalar replicated loads whose address is
      // defined outside the loop regions are hoisting candidates. Recipes that
      // are not candidates must still fall through to the write check below;
      // skipping them early would leave replicated stores out of Stores and
      // allow hoisting loads past writes that may alias them.
      if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
          RepR && !RepR->isPredicated() && RepR->isSingleScalar() &&
          RepR->getOpcode() == Instruction::Load &&
          RepR->getOperand(0)->isDefinedOutsideLoopRegions()) {
        auto OptLoc = vputils::getMemoryLocation(*RepR);
        // Without alias.scope metadata nothing can be proven about the load.
        if (OptLoc && OptLoc->AATags.Scope)
          CandidateLoads.push_back({RepR, *OptLoc});
      }

      if (R.mayWriteToMemory()) {
        auto Loc = vputils::getMemoryLocation(R);
        // A write that cannot be described with scoped noalias metadata may
        // alias any candidate, so conservatively give up on the whole plan.
        if (!Loc || !Loc->AATags.Scope || !Loc->AATags.NoAlias)
          return;
        Stores.push_back(*Loc);
      }
    }
  }

  VPBasicBlock *Preheader = Plan.getVectorPreheader();
  for (auto &[LoadRecipe, LoadLoc] : CandidateLoads) {
    // Hoist the load to the preheader if it doesn't alias with any stores
    // according to the noalias metadata. Other loads should have been hoisted
    // by other passes.
    const AAMDNodes &LoadAA = LoadLoc.AATags;
    if (all_of(Stores, [&](const MemoryLocation &StoreLoc) {
          return !ScopedNoAliasAAResult::mayAliasInScopes(
              LoadAA.Scope, StoreLoc.AATags.NoAlias);
        })) {
      LoadRecipe->moveBefore(*Preheader, Preheader->getFirstNonPhi());
    }
  }
}
30943154
/// Returns true if \p V is VPWidenLoadRecipe or VPInterleaveRecipe that can be
30953155
/// converted to a narrower recipe. \p V is used by a wide recipe that feeds a
30963156
/// store interleave group at index \p Idx, \p WideMember0 is the recipe feeding

llvm/lib/Transforms/Vectorize/VPlanTransforms.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -230,6 +230,11 @@ struct VPlanTransforms {
230230
/// Add explicit broadcasts for live-ins and VPValues defined in \p Plan's entry block if they are used as vectors.
231231
static void materializeBroadcasts(VPlan &Plan);
232232

233+
/// Hoist single-scalar loads with invariant addresses out of the vector loop
234+
/// to the preheader, if they are proven not to alias with any stores in the
235+
/// plan using noalias metadata.
236+
static void hoistInvariantLoads(VPlan &Plan);
237+
233238
/// Try to convert a plan with interleave groups with VF elements to a plan
234239
/// with the interleave groups replaced by wide loads and stores processing VF
235240
/// elements, if all transformed interleave groups access the full vector

llvm/lib/Transforms/Vectorize/VPlanUtils.cpp

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
#include "VPlanCFG.h"
1111
#include "VPlanPatternMatch.h"
1212
#include "llvm/ADT/TypeSwitch.h"
13+
#include "llvm/Analysis/MemoryLocation.h"
1314
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
1415

1516
using namespace llvm;
@@ -135,3 +136,20 @@ VPBasicBlock *vputils::getFirstLoopHeader(VPlan &Plan, VPDominatorTree &VPDT) {
135136
});
136137
return I == DepthFirst.end() ? nullptr : cast<VPBasicBlock>(*I);
137138
}
139+
140+
std::optional<MemoryLocation>
141+
vputils::getMemoryLocation(const VPRecipeBase &R) {
142+
return TypeSwitch<const VPRecipeBase *, std::optional<MemoryLocation>>(&R)
143+
.Case<VPWidenMemoryRecipe, VPInterleaveRecipe, VPReplicateRecipe>(
144+
[](auto *S) {
145+
MemoryLocation Loc;
146+
// Populate noalias metadata from VPIRMetadata.
147+
if (MDNode *NoAliasMD = S->getMetadata(LLVMContext::MD_noalias))
148+
Loc.AATags.NoAlias = NoAliasMD;
149+
if (MDNode *AliasScopeMD =
150+
S->getMetadata(LLVMContext::MD_alias_scope))
151+
Loc.AATags.Scope = AliasScopeMD;
152+
return Loc;
153+
})
154+
.Default([](auto *) { return std::nullopt; });
155+
}

llvm/lib/Transforms/Vectorize/VPlanUtils.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
#include "VPlan.h"
1313

1414
namespace llvm {
15+
class MemoryLocation;
1516
class ScalarEvolution;
1617
class SCEV;
1718
} // namespace llvm
@@ -97,6 +98,12 @@ bool isUniformAcrossVFsAndUFs(VPValue *V);
9798
/// Returns the header block of the first, top-level loop, or null if none
9899
/// exist.
99100
VPBasicBlock *getFirstLoopHeader(VPlan &Plan, VPDominatorTree &VPDT);
101+
102+
/// Return a MemoryLocation for \p R with noalias metadata populated from
103+
/// \p R, if the recipe is supported and std::nullopt otherwise. The pointer of
104+
/// the location is conservatively set to nullptr.
105+
std::optional<MemoryLocation> getMemoryLocation(const VPRecipeBase &R);
106+
100107
} // namespace vputils
101108

102109
//===----------------------------------------------------------------------===//

llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll

Lines changed: 15 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -491,7 +491,7 @@ define i32 @header_mask_and_invariant_compare(ptr %A, ptr %B, ptr %C, ptr %D, pt
491491
; DEFAULT-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], ptr [[D:%.*]], ptr [[E:%.*]], i64 [[N:%.*]]) #[[ATTR1:[0-9]+]] {
492492
; DEFAULT-NEXT: [[ENTRY:.*]]:
493493
; DEFAULT-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1
494-
; DEFAULT-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 60
494+
; DEFAULT-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 28
495495
; DEFAULT-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
496496
; DEFAULT: [[VECTOR_MEMCHECK]]:
497497
; DEFAULT-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[E]], i64 4
@@ -532,26 +532,26 @@ define i32 @header_mask_and_invariant_compare(ptr %A, ptr %B, ptr %C, ptr %D, pt
532532
; DEFAULT: [[VECTOR_PH]]:
533533
; DEFAULT-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 4
534534
; DEFAULT-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
535-
; DEFAULT-NEXT: br label %[[VECTOR_BODY:.*]]
536-
; DEFAULT: [[VECTOR_BODY]]:
537-
; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE37:.*]] ]
538-
; DEFAULT-NEXT: [[TMP9:%.*]] = load i32, ptr [[A]], align 4, !alias.scope [[META7:![0-9]+]]
539-
; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT28:%.*]] = insertelement <4 x i32> poison, i32 [[TMP9]], i64 0
535+
; DEFAULT-NEXT: [[TMP3:%.*]] = load i32, ptr [[C]], align 4, !alias.scope [[META7:![0-9]+]]
536+
; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT28:%.*]] = insertelement <4 x i32> poison, i32 [[TMP3]], i64 0
540537
; DEFAULT-NEXT: [[BROADCAST_SPLAT29:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT28]], <4 x i32> poison, <4 x i32> zeroinitializer
541-
; DEFAULT-NEXT: [[TMP19:%.*]] = load i32, ptr [[B]], align 4, !alias.scope [[META10:![0-9]+]]
542-
; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP19]], i64 0
538+
; DEFAULT-NEXT: [[TMP4:%.*]] = load i32, ptr [[B]], align 4, !alias.scope [[META10:![0-9]+]]
539+
; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0
543540
; DEFAULT-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
544-
; DEFAULT-NEXT: [[TMP6:%.*]] = or <4 x i32> [[BROADCAST_SPLAT]], [[BROADCAST_SPLAT29]]
545-
; DEFAULT-NEXT: [[TMP7:%.*]] = load i32, ptr [[C]], align 4, !alias.scope [[META12:![0-9]+]]
541+
; DEFAULT-NEXT: [[TMP7:%.*]] = load i32, ptr [[A]], align 4, !alias.scope [[META12:![0-9]+]]
546542
; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT30:%.*]] = insertelement <4 x i32> poison, i32 [[TMP7]], i64 0
547543
; DEFAULT-NEXT: [[BROADCAST_SPLAT31:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT30]], <4 x i32> poison, <4 x i32> zeroinitializer
548-
; DEFAULT-NEXT: [[TMP8:%.*]] = icmp ugt <4 x i32> [[BROADCAST_SPLAT31]], [[TMP6]]
544+
; DEFAULT-NEXT: [[TMP6:%.*]] = or <4 x i32> [[BROADCAST_SPLAT]], [[BROADCAST_SPLAT31]]
545+
; DEFAULT-NEXT: [[TMP8:%.*]] = icmp ugt <4 x i32> [[BROADCAST_SPLAT29]], [[TMP6]]
546+
; DEFAULT-NEXT: br label %[[VECTOR_BODY:.*]]
547+
; DEFAULT: [[VECTOR_BODY]]:
548+
; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE37:.*]] ]
549549
; DEFAULT-NEXT: [[TMP16:%.*]] = getelementptr i32, ptr [[D]], i64 [[INDEX]]
550-
; DEFAULT-NEXT: [[TMP20:%.*]] = extractelement <4 x i1> [[TMP8]], i32 0
551-
; DEFAULT-NEXT: br i1 [[TMP20]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
550+
; DEFAULT-NEXT: [[TMP9:%.*]] = extractelement <4 x i1> [[TMP8]], i32 0
551+
; DEFAULT-NEXT: br i1 [[TMP9]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
552552
; DEFAULT: [[PRED_STORE_IF]]:
553-
; DEFAULT-NEXT: [[TMP11:%.*]] = extractelement <4 x i32> [[TMP6]], i32 0
554-
; DEFAULT-NEXT: store i32 [[TMP11]], ptr [[E]], align 4, !alias.scope [[META14:![0-9]+]], !noalias [[META16:![0-9]+]]
553+
; DEFAULT-NEXT: [[TMP10:%.*]] = extractelement <4 x i32> [[TMP6]], i32 0
554+
; DEFAULT-NEXT: store i32 [[TMP10]], ptr [[E]], align 4, !alias.scope [[META14:![0-9]+]], !noalias [[META16:![0-9]+]]
555555
; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE]]
556556
; DEFAULT: [[PRED_STORE_CONTINUE]]:
557557
; DEFAULT-NEXT: [[TMP12:%.*]] = extractelement <4 x i1> [[TMP8]], i32 1

0 commit comments

Comments
 (0)