Skip to content

Commit 3127481

Browse files
authored
[feat] ucmtrans: Unify API for Device-Host Memory Transfers (#379)
## Description
This PR introduces a unified API for transferring data between device and host memory. It offers a straightforward interface while encapsulating the device-specific logic.
## Implementation
- Native API that uses the copy engine (CE) for transfers.
- Kernel-based transfers that run on the streaming multiprocessors (SM).
1 parent 16ed5da commit 3127481

36 files changed

+1982
-20
lines changed

CMakeLists.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,8 @@ set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
99
option(BUILD_UCM_STORE "build ucm store module." ON)
1010
option(BUILD_UCM_SPARSE "build ucm sparse module." ON)
1111
option(BUILD_UNIT_TESTS "build all unit test suits." OFF)
12-
option(BUILD_NUMA "build numactl library" OFF)
12+
option(BUILD_NUMA "build numactl library." OFF)
13+
option(DOWNLOAD_DEPENDENCE "download dependence by cmake." ON)
1314
set(RUNTIME_ENVIRONMENT "simu" CACHE STRING "runtime: simu, ascend, musa or cuda.")
1415

1516
execute_process(COMMAND git rev-parse HEAD OUTPUT_VARIABLE UCM_COMMIT_ID OUTPUT_STRIP_TRAILING_WHITESPACE)

ucm/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
add_subdirectory(shared)
12
if(BUILD_UCM_STORE)
23
add_subdirectory(store)
34
endif()

ucm/shared/CMakeLists.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
include_directories(${CMAKE_CURRENT_SOURCE_DIR})
2+
add_subdirectory(vendor)
3+
add_subdirectory(trans)
4+
add_subdirectory(test)

ucm/shared/__init__.py

Whitespace-only changes.

ucm/shared/test/CMakeLists.txt

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
if(BUILD_UNIT_TESTS)
2+
include(GoogleTest)
3+
file(GLOB_RECURSE UCMSHARED_TEST_SOURCE_FILES "./case/*.cc")
4+
add_executable(ucmshared.test ${UCMSHARED_TEST_SOURCE_FILES})
5+
target_include_directories(ucmshared.test PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/case)
6+
target_link_libraries(ucmshared.test PRIVATE
7+
trans
8+
gtest_main gtest mockcpp
9+
)
10+
gtest_discover_tests(ucmshared.test)
11+
endif()
Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,93 @@
1+
/**
2+
* MIT License
3+
*
4+
* Copyright (c) 2025 Huawei Technologies Co., Ltd. All rights reserved.
5+
*
6+
* Permission is hereby granted, free of charge, to any person obtaining a copy
7+
* of this software and associated documentation files (the "Software"), to deal
8+
* in the Software without restriction, including without limitation the rights
9+
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10+
* copies of the Software, and to permit persons to whom the Software is
11+
* furnished to do so, subject to the following conditions:
12+
*
13+
* The above copyright notice and this permission notice shall be included in all
14+
* copies or substantial portions of the Software.
15+
*
16+
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17+
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18+
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19+
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20+
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21+
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22+
* SOFTWARE.
23+
* */
24+
#include <gtest/gtest.h>
25+
#include "trans/device.h"
26+
27+
class UCTransUnitTest : public ::testing::Test {};
28+
29+
TEST_F(UCTransUnitTest, CopyDataWithCE)
30+
{
31+
const auto ok = UC::Trans::Status::OK();
32+
constexpr int32_t deviceId = 0;
33+
constexpr size_t size = 36 * 1024;
34+
constexpr size_t number = 64 * 61;
35+
UC::Trans::Device device;
36+
ASSERT_EQ(device.Setup(deviceId), ok);
37+
auto buffer = device.MakeBuffer();
38+
auto stream = device.MakeStream();
39+
auto hPtr1 = buffer->MakeHostBuffer(size * number);
40+
ASSERT_NE(hPtr1, nullptr);
41+
ASSERT_EQ(buffer->MakeDeviceBuffers(size, number), ok);
42+
std::vector<std::shared_ptr<void>> ptrHolder;
43+
ptrHolder.reserve(number);
44+
void* dPtrArr[number];
45+
for (size_t i = 0; i < number; i++) {
46+
*(size_t*)(((char*)hPtr1.get()) + size * i) = i;
47+
auto ptr = buffer->GetDeviceBuffer(size);
48+
dPtrArr[i] = ptr.get();
49+
ptrHolder.emplace_back(ptr);
50+
}
51+
auto hPtr2 = buffer->MakeHostBuffer(size * number);
52+
ASSERT_NE(hPtr2, nullptr);
53+
ASSERT_EQ(stream->HostToDeviceAsync(hPtr1.get(), dPtrArr, size, number), ok);
54+
ASSERT_EQ(stream->DeviceToHostAsync(dPtrArr, hPtr2.get(), size, number), ok);
55+
ASSERT_EQ(stream->Synchronized(), ok);
56+
for (size_t i = 0; i < number; i++) {
57+
ASSERT_EQ(*(size_t*)(((char*)hPtr2.get()) + size * i), i);
58+
}
59+
}
60+
61+
/**
 * Round trip through a stream created by Device::MakeSMStream().
 *
 * Same data pattern as the MakeStream() test, but the table of device block
 * addresses is first copied into a device buffer and the stream is given the
 * address of that device-resident table.
 */
TEST_F(UCTransUnitTest, CopyDataWithSM)
{
    const auto ok = UC::Trans::Status::OK();
    constexpr int32_t deviceId = 0;
    constexpr size_t size = 36 * 1024;
    constexpr size_t number = 64 * 61;
    UC::Trans::Device device;
    ASSERT_EQ(device.Setup(deviceId), ok);
    // Both factory results are dereferenced below; fail the test cleanly instead
    // of crashing the whole test binary when the backend could not create them.
    auto buffer = device.MakeBuffer();
    ASSERT_NE(buffer, nullptr);
    auto stream = device.MakeSMStream();
    ASSERT_NE(stream, nullptr);
    auto hPtr1 = buffer->MakeHostBuffer(size * number);
    ASSERT_NE(hPtr1, nullptr);
    ASSERT_EQ(buffer->MakeDeviceBuffers(size, number), ok);
    // Keeps every device block alive until the end of the test; dPtrArr only
    // stores the raw addresses.
    std::vector<std::shared_ptr<void>> ptrHolder;
    ptrHolder.reserve(number);
    void* dPtrArr[number];
    auto* srcBytes = static_cast<char*>(hPtr1.get());
    for (size_t i = 0; i < number; i++) {
        // Tag block i with its own index so the copy can be verified afterwards.
        *reinterpret_cast<size_t*>(srcBytes + size * i) = i;
        auto ptr = buffer->GetDeviceBuffer(size);
        ASSERT_NE(ptr, nullptr);
        dPtrArr[i] = ptr.get();
        ptrHolder.emplace_back(ptr);
    }
    // Upload the address table itself before it is used by the transfers.
    auto dPtrArrOnDev = buffer->MakeDeviceBuffer(sizeof(dPtrArr));
    ASSERT_NE(dPtrArrOnDev, nullptr);
    ASSERT_EQ(stream->HostToDevice(static_cast<void*>(dPtrArr), dPtrArrOnDev.get(), sizeof(dPtrArr)),
              ok);
    auto hPtr2 = buffer->MakeHostBuffer(size * number);
    ASSERT_NE(hPtr2, nullptr);
    auto** devTable = static_cast<void**>(dPtrArrOnDev.get());
    ASSERT_EQ(stream->HostToDeviceAsync(hPtr1.get(), devTable, size, number), ok);
    ASSERT_EQ(stream->DeviceToHostAsync(devTable, hPtr2.get(), size, number), ok);
    // Both copies were only queued; wait before reading the destination buffer.
    ASSERT_EQ(stream->Synchronized(), ok);
    const auto* dstBytes = static_cast<const char*>(hPtr2.get());
    for (size_t i = 0; i < number; i++) {
        ASSERT_EQ(*reinterpret_cast<const size_t*>(dstBytes + size * i), i);
    }
}
Lines changed: 149 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,149 @@
1+
# -*- coding: utf-8 -*-
2+
#
3+
# MIT License
4+
#
5+
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All rights reserved.
6+
#
7+
# Permission is hereby granted, free of charge, to any person obtaining a copy
8+
# of this software and associated documentation files (the "Software"), to deal
9+
# in the Software without restriction, including without limitation the rights
10+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11+
# copies of the Software, and to permit persons to whom the Software is
12+
# furnished to do so, subject to the following conditions:
13+
#
14+
# The above copyright notice and this permission notice shall be included in all
15+
# copies or substantial portions of the Software.
16+
#
17+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23+
# SOFTWARE.
24+
#
25+
import time
26+
from functools import wraps
27+
28+
import cupy
29+
import numpy as np
30+
31+
from ucm.shared.trans import ucmtrans
32+
33+
34+
def test_wrap(func):
35+
@wraps(func)
36+
def wrapper(*args, **kwargs):
37+
print(f"========>> Running in {func.__name__}:")
38+
result = func(*args, **kwargs)
39+
print()
40+
return result
41+
42+
return wrapper
43+
44+
45+
def make_host_memory(size, number, dtype, fill=False):
    """Allocate a pinned host buffer of ``size * number`` bytes with known contents.

    Args:
        size: Bytes per block.
        number: Number of blocks.
        dtype: NumPy dtype used to view the buffer for filling and printing.
        fill: When true, the leading elements are set to 0, 1, 2, ...; at most
            ``min(1024, number)`` of them, and never more than the view holds.

    Returns:
        The allocation returned by ``cupy.cuda.alloc_pinned_memory``.
    """
    host = cupy.cuda.alloc_pinned_memory(size * number)
    host_np = np.frombuffer(host, dtype=dtype)
    # Pinned allocations are served from CuPy's pinned-memory pool, so a "new"
    # buffer may be a recycled block that still holds the payload of an earlier
    # run. Clearing it keeps a transfer that never happened from comparing equal
    # by accident, and keeps stray NaN bit patterns out of the comparison.
    host_np.fill(0)
    if fill:
        # Clamp to the view length so tiny buffers do not raise on assignment.
        fixed_len = min(1024, number, host_np.size)
        host_np[:fixed_len] = np.arange(fixed_len, dtype=dtype)
    print("make:", host_np.shape, host_np.itemsize, host_np)
    return host
53+
54+
55+
def compare(host1, host2, dtype):
    """Return True when both host buffers hold exactly the same bytes.

    Args:
        host1: First buffer-protocol object.
        host2: Second buffer-protocol object.
        dtype: NumPy dtype used only for the diagnostic printout.

    Returns:
        bool: True if the buffers have the same length and identical content.
    """
    host1_np = np.frombuffer(host1, dtype=dtype)
    host2_np = np.frombuffer(host2, dtype=dtype)
    print("compare[1]:", host1_np.shape, host1_np.itemsize, host1_np)
    print("compare[2]:", host2_np.shape, host2_np.itemsize, host2_np)
    # Compare raw bytes rather than dtype elements: with a floating dtype, NaN
    # never equals itself, so a bit-exact copy that happens to contain NaN
    # patterns would otherwise be reported as a mismatch.
    return np.array_equal(
        np.frombuffer(host1, dtype=np.uint8),
        np.frombuffer(host2, dtype=np.uint8),
    )
61+
62+
63+
@test_wrap
def trans_with_ce(d, size, number, dtype):
    """Blocking scatter/gather round trip on a stream from ``d.MakeStream()``.

    One host buffer is scattered into ``number`` device arrays of ``size`` bytes
    with ``HostToDeviceScatter`` and gathered into a second host buffer with
    ``DeviceToHostGather``. The device addresses are passed as a NumPy uint64
    array. The elapsed time of both calls is printed, along with a bandwidth
    figure computed from ``size * number`` bytes over that time, and the two
    host buffers are asserted equal.
    """
    stream = d.MakeStream()
    src = make_host_memory(size, number, dtype, True)
    blocks = []
    for _ in range(number):
        blocks.append(cupy.empty(size, dtype=np.uint8))
    block_addrs = np.array([blk.data.ptr for blk in blocks], dtype=np.uint64)
    dst = make_host_memory(size, number, dtype)
    started = time.perf_counter()
    stream.HostToDeviceScatter(src.ptr, block_addrs, size, number)
    stream.DeviceToHostGather(block_addrs, dst.ptr, size, number)
    cost = time.perf_counter() - started
    print(f"cost: {cost}s")
    print(f"bandwidth: {size * number / cost / 1e9}GB/s")
    assert compare(src, dst, dtype)
77+
78+
79+
@test_wrap
def trans_with_sm(d, size, number, dtype):
    """Blocking scatter/gather round trip on a stream from ``d.MakeSMStream()``.

    Differs from the ``MakeStream`` variant in how the block addresses are
    passed: the uint64 address table is copied into a CuPy array and the calls
    receive the device address of that table. Prints the elapsed time and a
    bandwidth figure based on ``size * number`` bytes, then asserts that the
    source and destination host buffers match.
    """
    stream = d.MakeSMStream()
    src = make_host_memory(size, number, dtype, True)
    blocks = [cupy.empty(size, dtype=np.uint8) for _ in range(number)]
    addr_table_host = np.array([blk.data.ptr for blk in blocks], dtype=np.uint64)
    # Device-resident copy of the address table.
    addr_table_dev = cupy.empty(number, dtype=np.uint64)
    addr_table_dev.set(addr_table_host)
    dst = make_host_memory(size, number, dtype)
    started = time.perf_counter()
    stream.HostToDeviceScatter(src.ptr, addr_table_dev.data.ptr, size, number)
    stream.DeviceToHostGather(addr_table_dev.data.ptr, dst.ptr, size, number)
    cost = time.perf_counter() - started
    print(f"cost: {cost}s")
    print(f"bandwidth: {size * number / cost / 1e9}GB/s")
    assert compare(src, dst, dtype)
95+
96+
97+
@test_wrap
def trans_with_ce_async(d, size, number, dtype):
    """Async scatter/gather round trip on a stream from ``d.MakeStream()``.

    Issues ``HostToDeviceScatterAsync`` and ``DeviceToHostGatherAsync`` back to
    back and then calls ``Synchronized()``; the measured interval covers all
    three calls. The device addresses are passed as a NumPy uint64 array.
    Prints the elapsed time and a bandwidth figure based on ``size * number``
    bytes, then asserts that both host buffers match.
    """
    stream = d.MakeStream()
    src = make_host_memory(size, number, dtype, True)
    blocks = [cupy.empty(size, dtype=np.uint8) for _ in range(number)]
    block_addrs = np.array(
        [blk.data.ptr for blk in blocks],
        dtype=np.uint64,
    )
    dst = make_host_memory(size, number, dtype)
    started = time.perf_counter()
    stream.HostToDeviceScatterAsync(src.ptr, block_addrs, size, number)
    stream.DeviceToHostGatherAsync(block_addrs, dst.ptr, size, number)
    stream.Synchronized()
    cost = time.perf_counter() - started
    print(f"cost: {cost}s")
    print(f"bandwidth: {size * number / cost / 1e9}GB/s")
    assert compare(src, dst, dtype)
112+
113+
114+
@test_wrap
def trans_with_sm_async(d, size, number, dtype):
    """Async scatter/gather round trip on a stream from ``d.MakeSMStream()``.

    The uint64 table of block addresses is copied into a CuPy array and its
    device address is handed to ``HostToDeviceScatterAsync`` and
    ``DeviceToHostGatherAsync``; ``Synchronized()`` is called before the timer
    stops. Prints the elapsed time and a bandwidth figure based on
    ``size * number`` bytes, then asserts that both host buffers match.
    """
    stream = d.MakeSMStream()
    src = make_host_memory(size, number, dtype, True)
    blocks = [cupy.empty(size, dtype=np.uint8) for _ in range(number)]
    addr_table_host = np.array([blk.data.ptr for blk in blocks], dtype=np.uint64)
    # Device-resident copy of the address table.
    addr_table_dev = cupy.empty(number, dtype=np.uint64)
    addr_table_dev.set(addr_table_host)
    dst = make_host_memory(size, number, dtype)
    started = time.perf_counter()
    stream.HostToDeviceScatterAsync(src.ptr, addr_table_dev.data.ptr, size, number)
    stream.DeviceToHostGatherAsync(addr_table_dev.data.ptr, dst.ptr, size, number)
    stream.Synchronized()
    cost = time.perf_counter() - started
    print(f"cost: {cost}s")
    print(f"bandwidth: {size * number / cost / 1e9}GB/s")
    assert compare(src, dst, dtype)
131+
132+
133+
def main():
    """Run the four transfer scenarios once on device 0.

    Prints the ``ucmtrans`` commit id and build type, selects the CuPy device,
    sets up a ``ucmtrans.Device`` and then runs the scenarios in a fixed order
    with 36 KiB blocks, 61 * 64 blocks and a float16 view of the host buffers.
    """
    device_id = 0
    size = 36 * 1024
    number = 61 * 64
    dtype = np.float16
    print(f"ucmtrans: {ucmtrans.commit_id}-{ucmtrans.build_type}")
    cupy.cuda.Device(device_id).use()
    d = ucmtrans.Device()
    d.Setup(device_id)
    scenarios = (
        trans_with_ce,
        trans_with_sm,
        trans_with_ce_async,
        trans_with_sm_async,
    )
    for scenario in scenarios:
        scenario(d, size, number, dtype)


if __name__ == "__main__":
    main()

ucm/shared/trans/CMakeLists.txt

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
# Pick the backend directory for the configured runtime. Each backend is
# presumably what defines the `trans` target linked below -- TODO confirm.
if(RUNTIME_ENVIRONMENT STREQUAL "ascend")
    add_subdirectory(ascend)
elseif(RUNTIME_ENVIRONMENT STREQUAL "cuda")
    add_subdirectory(cuda)
elseif(RUNTIME_ENVIRONMENT STREQUAL "simu")
    add_subdirectory(simu)
else()
    # The top-level option also advertises "musa", which has no backend here.
    # Without a backend the module below would link against a `trans` target
    # that was never created, so skip it with a visible warning instead.
    message(WARNING "ucmtrans: no transfer backend for RUNTIME_ENVIRONMENT='${RUNTIME_ENVIRONMENT}', skipping the ucmtrans module.")
    return()
endif()

# Python extension built from the binding sources under ./cpy; the resulting
# library is written next to this CMakeLists.txt.
file(GLOB_RECURSE UCMTRANS_CPY_SOURCE_FILES "./cpy/*.cc")
pybind11_add_module(ucmtrans ${UCMTRANS_CPY_SOURCE_FILES})
target_link_libraries(ucmtrans PRIVATE trans)
set_target_properties(ucmtrans PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})

ucm/shared/trans/__init__.py

Whitespace-only changes.

ucm/shared/trans/buffer.h

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
/**
2+
* MIT License
3+
*
4+
* Copyright (c) 2025 Huawei Technologies Co., Ltd. All rights reserved.
5+
*
6+
* Permission is hereby granted, free of charge, to any person obtaining a copy
7+
* of this software and associated documentation files (the "Software"), to deal
8+
* in the Software without restriction, including without limitation the rights
9+
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10+
* copies of the Software, and to permit persons to whom the Software is
11+
* furnished to do so, subject to the following conditions:
12+
*
13+
* The above copyright notice and this permission notice shall be included in all
14+
* copies or substantial portions of the Software.
15+
*
16+
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17+
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18+
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19+
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20+
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21+
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22+
* SOFTWARE.
23+
* */
24+
#ifndef UNIFIEDCACHE_TRANS_BUFFER_H
25+
#define UNIFIEDCACHE_TRANS_BUFFER_H
26+
27+
#include <memory>
28+
#include "status.h"
29+
30+
namespace UC::Trans {
31+
32+
/**
 * Abstract interface for obtaining device and host buffers.
 *
 * Every member except the destructor is pure virtual; the concrete behaviour is
 * supplied by an implementation that is not part of this header.
 * NOTE(review): the per-method notes below are inferred from names and from how
 * the unit test calls them -- confirm against the backend implementations.
 */
class Buffer {
33+
public:
34+
// Virtual so implementations can be destroyed through a Buffer pointer.
virtual ~Buffer() = default;
35+
36+
// Returns a shared handle for a device buffer; `size` is presumably in bytes.
virtual std::shared_ptr<void> MakeDeviceBuffer(size_t size) = 0;
37+
// The unit test calls this once with (block size, block count) before it
// fetches blocks with GetDeviceBuffer -- presumably it prepares them up front.
virtual Status MakeDeviceBuffers(size_t size, size_t number) = 0;
38+
// The unit test calls this once per block after MakeDeviceBuffers and keeps the
// returned shared_ptr alive for as long as the raw pointer is in use.
virtual std::shared_ptr<void> GetDeviceBuffer(size_t size) = 0;
39+
40+
// Host-side counterpart of MakeDeviceBuffer; the unit test treats a null result
// as failure.
virtual std::shared_ptr<void> MakeHostBuffer(size_t size) = 0;
41+
// Host-side counterpart of MakeDeviceBuffers (same parameter list).
virtual Status MakeHostBuffers(size_t size, size_t number) = 0;
42+
// Host-side counterpart of GetDeviceBuffer (same parameter list).
virtual std::shared_ptr<void> GetHostBuffer(size_t size) = 0;
43+
44+
// Takes caller-provided memory [ptr, ptr + size) -- presumably registers it
// with the device runtime; TODO confirm. Reports the outcome as a Status.
virtual Status RegisterHostBuffer(void* ptr, size_t size) = 0;
45+
// Presumably undoes RegisterHostBuffer for `ptr`; no status is reported.
virtual void UnregisterHostBuffer(void* ptr) = 0;
46+
// Presumably maps a registered host pointer to the address the device should
// use for it -- TODO confirm what is returned for an unregistered pointer.
virtual void* GetHostPtrOnDevice(void* ptr) = 0;
47+
};
48+
49+
} // namespace UC::Trans
50+
51+
#endif

0 commit comments

Comments
 (0)