Skip to content

Commit d2bd353

Browse files
committed
Use the __tgt_target_kernel interface
- Avoid dummy kernel argument for robust codegen
1 parent a551e82 commit d2bd353

1 file changed

Lines changed: 43 additions & 49 deletions

File tree

src/numba/openmp/libs/pass/CGIntrinsicsOpenMP.cpp

Lines changed: 43 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
#include "CGIntrinsicsOpenMP.h"
22
#include "DebugOpenMP.h"
33

4+
#include <llvm/ADT/StringExtras.h>
45
#include <llvm/Frontend/OpenMP/OMP.h.inc>
56
#include <llvm/Frontend/OpenMP/OMPConstants.h>
67
#include <llvm/Frontend/OpenMP/OMPIRBuilder.h>
@@ -1739,8 +1740,8 @@ void CGIntrinsicsOpenMP::emitOMPOffloadingMappings(
17391740
OffloadMapNames.push_back(OMPBuilder.getOrCreateSrcLocStr(
17401741
BasePtr->getName(), "", 0, 0, SrcLocStrSize));
17411742
DEBUG_ENABLE(dbgs() << "Emit mapping entry BasePtr " << *BasePtr << " Ptr "
1742-
<< *Ptr << " Size " << *Size << " MapType " << MapType
1743-
<< "\n");
1743+
<< *Ptr << " Size " << *Size << " MapType 0x"
1744+
<< toHex(MapType) << "\n");
17441745
MapperInfos.push_back({BasePtr, Ptr, Size});
17451746
};
17461747

@@ -1805,14 +1806,6 @@ void CGIntrinsicsOpenMP::emitOMPOffloadingMappings(
18051806
return MapType;
18061807
};
18071808

1808-
// TODO: This is a dummy entry to work around the runtime issue that it expects a
1809-
// kernel launch environment parameter. It will be removed once we move to the
1810-
// __tgt_target_kernel interface.
1811-
auto *DummyAlloca = OMPBuilder.Builder.CreateAlloca(
1812-
OMPBuilder.Int8, ConstantInt::get(OMPBuilder.Int64, 8), "dummy.alloca");
1813-
EmitMappingEntry(ConstantInt::get(OMPBuilder.SizeTy, 8),
1814-
OMP_TGT_MAPTYPE_TARGET_PARAM, DummyAlloca, DummyAlloca);
1815-
18161809
// Keep track of argument position, needed for struct mappings.
18171810
for (auto &It : DSAValueMap) {
18181811
Value *V = It.first;
@@ -2259,17 +2252,6 @@ void CGIntrinsicsOpenMP::emitOMPTargetHost(
22592252
Constant *SrcLocStr = OMPBuilder.getOrCreateSrcLocStr(Loc, SrcLocStrSize);
22602253
Value *Ident = OMPBuilder.getOrCreateIdent(SrcLocStr, SrcLocStrSize);
22612254

2262-
// TODO: should we use target_mapper without teams or the more general
2263-
// target_teams_mapper? Does the former buy us anything (less overhead)?
2264-
// FunctionCallee TargetMapper =
2265-
// OMPBuilder.getOrCreateRuntimeFunction(M, OMPRTL___tgt_target_mapper);
2266-
// TODO: For nowait we need to enclose the host code in a task for async
2267-
// execution.
2268-
FunctionCallee TargetMapper =
2269-
(TargetInfo.NoWait ? OMPBuilder.getOrCreateRuntimeFunction(
2270-
M, OMPRTL___tgt_target_teams_nowait_mapper)
2271-
: OMPBuilder.getOrCreateRuntimeFunction(
2272-
M, OMPRTL___tgt_target_teams_mapper));
22732255
OMPBuilder.Builder.SetInsertPoint(EntryBB->getTerminator());
22742256

22752257
// Emit mappings.
@@ -2279,19 +2261,13 @@ void CGIntrinsicsOpenMP::emitOMPTargetHost(
22792261
emitOMPOffloadingMappings(AllocaIP, DSAValueMap, StructMappingInfoMap,
22802262
OffloadingMappingArgs, /* isTargetRegion */ true);
22812263

2282-
// Push the tripcount.
2264+
// Set the tripcount, if available.
2265+
Value *TripCount = nullptr;
22832266
if (OMPLoopInfo) {
2284-
FunctionCallee TripcountMapper = OMPBuilder.getOrCreateRuntimeFunction(
2285-
M,
2286-
llvm::omp::RuntimeFunction::OMPRTL___kmpc_push_target_tripcount_mapper);
22872267
Value *Load =
22882268
OMPBuilder.Builder.CreateLoad(OMPBuilder.Int64, OMPLoopInfo->UB);
2289-
Value *Tripcount = OMPBuilder.Builder.CreateAdd(
2269+
TripCount = OMPBuilder.Builder.CreateAdd(
22902270
Load, ConstantInt::get(OMPBuilder.Int64, 1));
2291-
auto *CI = checkCreateCall(
2292-
OMPBuilder.Builder, TripcountMapper,
2293-
{Ident, ConstantInt::get(OMPBuilder.Int64, -1), Tripcount});
2294-
assert(CI && "Expected valid call");
22952271
}
22962272

22972273
Value *NumTeams = createScalarCast(TargetInfo.NumTeams, OMPBuilder.Int32);
@@ -2301,24 +2277,6 @@ void CGIntrinsicsOpenMP::emitOMPTargetHost(
23012277
assert(NumTeams && "Expected non-null NumTeams");
23022278
assert(ThreadLimit && "Expected non-null ThreadLimit");
23032279

2304-
SmallVector<Value *, 16> Args = {
2305-
Ident, ConstantInt::get(OMPBuilder.Int64, -1),
2306-
ConstantExpr::getBitCast(OMPRegionId, OMPBuilder.VoidPtr),
2307-
ConstantInt::get(OMPBuilder.Int32, OffloadingMappingArgs.Size),
2308-
OffloadingMappingArgs.BasePtrs, OffloadingMappingArgs.Ptrs,
2309-
OffloadingMappingArgs.Sizes, OffloadingMappingArgs.MapTypes,
2310-
OffloadingMappingArgs.MapNames,
2311-
// TODO: offload_mappers is null for now.
2312-
Constant::getNullValue(OMPBuilder.VoidPtrPtr), NumTeams, ThreadLimit};
2313-
2314-
if (TargetInfo.NoWait) {
2315-
// Add extra dependency information (unused for now).
2316-
Args.push_back(Constant::getNullValue(OMPBuilder.Int32));
2317-
Args.push_back(Constant::getNullValue(OMPBuilder.Int8Ptr));
2318-
Args.push_back(Constant::getNullValue(OMPBuilder.Int32));
2319-
Args.push_back(Constant::getNullValue(OMPBuilder.Int8Ptr));
2320-
}
2321-
23222280
if (!isOpenMPDeviceRuntime()) {
23232281
FunctionCallee KmpcSetThreadLimit = OMPBuilder.getOrCreateRuntimeFunction(
23242282
M, OMPRTL___kmpc_set_thread_limit);
@@ -2328,7 +2286,43 @@ void CGIntrinsicsOpenMP::emitOMPTargetHost(
23282286
{Ident, ThreadID, ThreadLimit});
23292287
}
23302288

2331-
auto *OffloadResult = checkCreateCall(OMPBuilder.Builder, TargetMapper, Args);
2289+
SmallVector<Value *> ArgsVector;
2290+
2291+
auto UnqualPtrTy = PointerType::getUnqual(M.getContext());
2292+
OpenMPIRBuilder::TargetDataRTArgs RTArgs{
2293+
OffloadingMappingArgs.BasePtrs,
2294+
OffloadingMappingArgs.Ptrs,
2295+
OffloadingMappingArgs.Sizes,
2296+
OffloadingMappingArgs.MapTypes,
2297+
ConstantPointerNull::get(UnqualPtrTy),
2298+
ConstantPointerNull::get(UnqualPtrTy),
2299+
OffloadingMappingArgs.MapNames,
2300+
};
2301+
// Avoid initializer-list temporaries for ArrayRef fields. Use stable
2302+
// SmallVector storage so ArrayRef in TargetKernelArgs refers to valid
2303+
// memory.
2304+
SmallVector<Value *, 1> KernelNumTeams;
2305+
KernelNumTeams.push_back(NumTeams);
2306+
SmallVector<Value *, 1> KernelNumThreads;
2307+
KernelNumThreads.push_back(ThreadLimit);
2308+
2309+
// TODO: Implement nowait: we need to enclose the host code in a task for
2310+
// async execution. OpenMPIRBuilder may support that now.
2311+
OpenMPIRBuilder::TargetKernelArgs Args{
2312+
static_cast<unsigned int>(OffloadingMappingArgs.Size),
2313+
RTArgs,
2314+
(TripCount ? TripCount : OMPBuilder.Builder.getInt64(0)),
2315+
KernelNumTeams,
2316+
KernelNumThreads,
2317+
Constant::getNullValue(OMPBuilder.VoidPtr),
2318+
/*TargetInfo.NoWait*/ false};
2319+
OpenMPIRBuilder::getKernelArgsVector(Args, OMPBuilder.Builder, ArgsVector);
2320+
2321+
Value *DeviceID = ConstantInt::get(OMPBuilder.Int64, -1);
2322+
Value *OffloadResult = nullptr;
2323+
OMPBuilder.emitTargetKernel(Loc, AllocaIP, OffloadResult, Ident, DeviceID,
2324+
NumTeams, ThreadLimit, OMPRegionId, ArgsVector);
2325+
23322326
assert(OffloadResult && "Expected non-null call inst from code generation");
23332327
auto *Failed = OMPBuilder.Builder.CreateIsNotNull(OffloadResult);
23342328
OMPBuilder.Builder.CreateCondBr(Failed, StartBB, EndBB);

0 commit comments

Comments
 (0)