Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions intercept/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ set(CLINTERCEPT_RESOURCE_FILES
kernels/precompiled_kernels.cl
resource/clIntercept.rc
resource/clIntercept_resource.h
scripts/run.py
"${CMAKE_CURRENT_BINARY_DIR}/git_version.rc2"
)
source_group(Resources FILES
Expand Down
80 changes: 49 additions & 31 deletions intercept/scripts/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,58 +45,62 @@ def sampler_from_string(ctx, sampler_descr):
cl.filter_mode.NEAREST
return cl.Sampler(ctx, False, addressing_mode, filter_mode)

def replay(repetitions):
def replay(repetitions = 1, use_svm = False):
# Read the enqueue number from the file
with open('./enqueueNumber.txt') as file:
enqueue_number = file.read().splitlines()[0]

padded_enqueue_num = str(enqueue_number).rjust(4, "0")

arguments = {}
argument_files = gl.glob("./Argument*.bin")
for argument in argument_files:
idx = int(re.findall(r'\d+', argument)[0])
arguments[idx] = np.fromfile(argument, dtype='uint8').tobytes()
for fileName in gl.glob("./Argument*.bin"):
idx = int(re.findall(r'\d+', fileName)[0])
arguments[idx] = np.fromfile(fileName, dtype='uint8').tobytes()

svm_arg_offsets = {}
for fileName in gl.glob("./SVM_Arg_Offset*.txt"):
idx = int(re.findall(r'\d+', fileName)[0])
with open(fileName) as file:
svm_arg_offsets[idx] = int(file.read())
if use_svm is False and svm_arg_offsets[idx] != 0:
print("Non-zero SVM arg offset found, forcing SVM replay.")
use_svm = True

buffer_idx = []
input_buffers = {}
output_buffers = {}
buffer_files = gl.glob("./Pre/Enqueue_" + padded_enqueue_num + "*.bin")
input_buffer_ptrs = defaultdict(list)
for buffer in buffer_files:
start = buffer.find("_Arg_")
idx = int(re.findall(r'\d+', buffer[start:])[0])
for fileName in gl.glob("./Pre/Enqueue_" + padded_enqueue_num + "*.bin"):
start = fileName.find("_Arg_")
idx = int(re.findall(r'\d+', fileName[start:])[0])
buffer_idx.append(idx)
input_buffers[idx] = np.fromfile(buffer, dtype='uint8').tobytes()
input_buffers[idx] = np.fromfile(fileName, dtype='uint8')
input_buffer_ptrs[arguments[idx]].append(idx)
output_buffers[idx] = np.empty_like(input_buffers[idx])

image_idx = []
input_images = {}
output_images = {}
image_files = gl.glob("./Pre/Enqueue_" + padded_enqueue_num + "*.raw")
input_images_ptrs = defaultdict(list)
for image in image_files:
start = image.find("_Arg_")
idx = int(re.findall(r'\d+', image[start:])[0])
for fileName in gl.glob("./Pre/Enqueue_" + padded_enqueue_num + "*.raw"):
start = fileName.find("_Arg_")
idx = int(re.findall(r'\d+', fileName[start:])[0])
image_idx.append(idx)
input_images[idx] = np.fromfile(image, dtype='uint8').tobytes()
input_images[idx] = np.fromfile(fileName, dtype='uint8')
input_images_ptrs[arguments[idx]].append(idx)
output_images[idx] = np.empty_like(input_images[idx])

local_sizes = {}
local_files = gl.glob("./Local*.txt")
for local in local_files:
with open(local) as file:
for fileName in gl.glob("./Local*.txt"):
idx = int(re.findall(r'\d+', fileName)[0])
with open(fileName) as file:
size = int(file.read())
local_sizes[int(re.findall(r'\d+', local)[0])] = size
local_sizes[idx] = size

# Check if we have pointer aliasing for the buffers
tmp_args = []
for idx in buffer_idx:
tmp_args.append(arguments[idx])

# Check if all input pointer addresses are unique
if len(tmp_args) != len(set(tmp_args)):
print("Some of the buffers are aliasing, we will replicate this behavior.")

Expand All @@ -123,16 +127,22 @@ def replay(repetitions):
for idx in list(samplers):
del arguments[idx]

mf = cl.mem_flags

gpu_buffers = {}
gpu_svm = {}
for idxs in input_buffer_ptrs.values():
gpu_buffers[tuple(idxs)] = cl.Buffer(ctx, mf.COPY_HOST_PTR, hostbuf=input_buffers[idxs[0]])
if use_svm:
svm = cl.SVM(cl.csvm_empty_like(ctx, input_buffers[idxs[0]]))
cl.enqueue_copy(queue, svm, input_buffers[idxs[0]])
gpu_svm[tuple(idxs)] = svm
else:
buf = cl.Buffer(ctx, cl.mem_flags.COPY_HOST_PTR, hostbuf=input_buffers[idxs[0]].tobytes())
gpu_buffers[tuple(idxs)] = buf

gpu_images = {}
for idx in image_idx:
format, shape = get_image_metadata(idx)
gpu_images[idx] = cl.Image(ctx, mf.COPY_HOST_PTR, format, shape, hostbuf=input_images[idx])
img = cl.Image(ctx, cl.mem_flags.COPY_HOST_PTR, format, shape, hostbuf=input_images[idx].tobytes())
gpu_images[idx] = img

with open("buildOptions.txt", 'r') as file:
options = [line.rstrip() for line in file]
Expand Down Expand Up @@ -176,6 +186,11 @@ def replay(repetitions):
for pos, buffer in gpu_buffers.items():
for idx in pos:
kernel.set_arg(idx, buffer)

for pos, svm in gpu_svm.items():
for idx in pos:
offset = svm_arg_offsets[idx]
kernel.set_arg(idx, cl.SVM(svm.mem[offset:]))

for pos, image in gpu_images.items():
kernel.set_arg(pos, image)
Expand Down Expand Up @@ -208,11 +223,12 @@ def replay(repetitions):
cl.enqueue_nd_range_kernel(queue, kernel, gws, lws, gwo)

for pos in gpu_buffers.keys():
if len(pos) == 1:
cl.enqueue_copy(queue, output_buffers[pos[0]], gpu_buffers[pos])
else:
for idx in range(len(pos)):
cl.enqueue_copy(queue, output_buffers[pos[idx]], gpu_buffers[pos])
for idx in range(len(pos)):
cl.enqueue_copy(queue, output_buffers[pos[idx]], gpu_buffers[pos])

for pos in gpu_svm.keys():
for idx in range(len(pos)):
cl.enqueue_copy(queue, output_buffers[pos[idx]], gpu_svm[pos])

for pos in gpu_images.keys():
cl.enqueue_copy(queue, output_images[pos], gpu_images[pos], region=shape, origin=(0,0,0))
Expand Down Expand Up @@ -296,6 +312,8 @@ def validate():
parser = argparse.ArgumentParser(description='Script to replay and validate captured kernels')
parser.add_argument('-r', '--repetitions', type=int, dest='repetitions', default=1,
help='How often the kernel should be enqueued')
parser.add_argument('-m', '--svm', action='store_true', dest='svm', default=False,
help='Use SVM when replaying the captured kernel')
parser.add_argument('-s', '--skipreplay', action='store_true', dest='skip', default=False,
help='Skip replaying the captured kernel and do not dump data')
parser.add_argument('-v', '--validate', action='store_true', dest='validate', default=False,
Expand All @@ -305,7 +323,7 @@ def validate():
if args.skip:
print("Skipping replay of the captured kernel.")
else:
replay(args.repetitions)
replay(args.repetitions, args.svm)

if args.validate:
validate()
Expand Down
45 changes: 23 additions & 22 deletions intercept/src/intercept.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7900,7 +7900,8 @@ void CLIntercept::setKernelArg(
if( m_MemAllocNumberMap.find(mem) != m_MemAllocNumberMap.end() )
{
CArgMemMap& argMemMap = m_KernelArgMemMap[ kernel ];
argMemMap[ arg_index ] = mem;
SMemInfo& memInfo = argMemMap[ arg_index ];
memInfo.Value = mem;
}
}

Expand Down Expand Up @@ -7954,15 +7955,12 @@ void CLIntercept::setKernelArgSVMPointer(
if( arg >= startPtr && arg < endPtr )
{
CArgMemMap& argMemMap = m_KernelArgMemMap[ kernel ];
argMemMap[ arg_index ] = startPtr;
}
SMemInfo& memInfo = argMemMap[ arg_index ];
memInfo.Value = startPtr;
memInfo.Offset = (const char*)arg - (const char*)startPtr;

// Currently, only pointers to the start of an SVM allocation are supported for
// capture and replay.
if( arg == startPtr )
{
CArgDataMap& argDataMap = m_KernelArgDataMap[kernel];
const uint8_t* pRawArgData = reinterpret_cast<const uint8_t*>(&arg);
const uint8_t* pRawArgData = reinterpret_cast<const uint8_t*>(&startPtr);
argDataMap[ arg_index ] = std::vector<uint8_t>(
pRawArgData, pRawArgData + sizeof(void*) );
}
Expand Down Expand Up @@ -7994,15 +7992,12 @@ void CLIntercept::setKernelArgUSMPointer(
if( arg >= startPtr && arg < endPtr )
{
CArgMemMap& argMemMap = m_KernelArgMemMap[ kernel ];
argMemMap[ arg_index ] = startPtr;
}
SMemInfo& memInfo = argMemMap[ arg_index ];
memInfo.Value = startPtr;
memInfo.Offset = (const char*)arg - (const char*)startPtr;

// Currently, only pointers to the start of an SVM allocation are supported for
// capture and replay.
if( arg == startPtr )
{
CArgDataMap& argDataMap = m_KernelArgDataMap[kernel];
const uint8_t* pRawArgData = reinterpret_cast<const uint8_t*>(&arg);
const uint8_t* pRawArgData = reinterpret_cast<const uint8_t*>(&startPtr);
argDataMap[ arg_index ] = std::vector<uint8_t>(
pRawArgData, pRawArgData + sizeof(void*) );
}
Expand Down Expand Up @@ -8197,7 +8192,7 @@ void CLIntercept::dumpCaptureReplayKernelArguments(
for( const auto& arg : argMemMap )
{
const auto index = arg.first;
const auto value = (cl_mem)arg.second;
const auto value = (cl_mem)arg.second.Value;
if( m_ImageInfoMap.find( value ) != m_ImageInfoMap.end() )
{
const SImageInfo& info = m_ImageInfoMap[ value ];
Expand All @@ -8213,6 +8208,12 @@ void CLIntercept::dumpCaptureReplayKernelArguments(
<< info.Format.image_channel_order << '\n'
<< static_cast<int>(info.ImageType);
}
else
{
std::string fileName{dumpDirectory + "SVM_Arg_Offset_" + std::to_string(index) + ".txt"};
std::ofstream out{fileName};
out << arg.second.Offset << '\n';
}
}

const auto& samplerValues = m_KernelArgSamplerMap[kernel];
Expand Down Expand Up @@ -8282,7 +8283,7 @@ void CLIntercept::dumpBuffersForKernel(
CLI_C_ASSERT( sizeof(void*) == sizeof(cl_mem) );

cl_uint arg_index = (*i).first;
void* allocation = (void*)(*i).second;
const void* allocation = (*i).second.Value;
cl_mem memobj = (cl_mem)allocation;

++i;
Expand Down Expand Up @@ -8392,7 +8393,7 @@ void CLIntercept::dumpBuffersForKernel(
command_queue,
CL_TRUE,
CL_MAP_READ,
allocation,
(void*)allocation,
size,
0,
NULL,
Expand Down Expand Up @@ -8420,7 +8421,7 @@ void CLIntercept::dumpBuffersForKernel(

dispatch().clEnqueueSVMUnmap(
command_queue,
allocation,
(void*)allocation,
0,
NULL,
NULL );
Expand Down Expand Up @@ -8530,7 +8531,7 @@ void CLIntercept::dumpImagesForKernel(
CLI_C_ASSERT( sizeof(void*) == sizeof(cl_mem) );

cl_uint arg_index = (*i).first;
cl_mem memobj = (cl_mem)(*i).second;
cl_mem memobj = (cl_mem)(*i).second.Value;

++i;

Expand Down Expand Up @@ -8666,7 +8667,7 @@ void CLIntercept::injectBuffersForKernel(
CLI_C_ASSERT( sizeof(void*) == sizeof(cl_mem) );

cl_uint arg_index = (*i).first;
void* allocation = (void*)(*i).second;
void* allocation = (void*)(*i).second.Value;
cl_mem memobj = (cl_mem)allocation;

++i;
Expand Down Expand Up @@ -8861,7 +8862,7 @@ void CLIntercept::injectImagesForKernel(
CLI_C_ASSERT( sizeof(void*) == sizeof(cl_mem) );

cl_uint arg_index = (*i).first;
cl_mem memobj = (cl_mem)(*i).second;
cl_mem memobj = (cl_mem)(*i).second.Value;

++i;

Expand Down
9 changes: 8 additions & 1 deletion intercept/src/intercept.h
Original file line number Diff line number Diff line change
Expand Up @@ -1314,7 +1314,14 @@ class CLIntercept
typedef std::map< cl_mem, SImageInfo > CImageInfoMap;
CImageInfoMap m_ImageInfoMap;

typedef std::map< cl_uint, const void* > CArgMemMap;
struct SMemInfo
{
// Note: "Value" is either a cl_mem or the base of an SVM/USM allocation.
const void* Value = NULL;
size_t Offset = 0;
};

typedef std::map< cl_uint, SMemInfo > CArgMemMap;
typedef std::map< cl_kernel, CArgMemMap > CKernelArgMemMap;
CKernelArgMemMap m_KernelArgMemMap;

Expand Down