Commit 0e13ae6

GPU Device Caching for Encoder Output in CUDA Backend
Add CUDA GPU caching functionality for encoder outputs to improve performance in ASR applications by avoiding redundant computation. Key changes:

- Add GPU caching mechanism in cuda_backend.cpp with RAII management
- Add clear_stored_tensor option for cache control
- Add encoder output caching support in ASR runner
1 parent 6cca6e6 commit 0e13ae6

File tree

2 files changed: +337 -11 lines changed


backends/cuda/runtime/cuda_backend.cpp

Lines changed: 295 additions & 11 deletions
@@ -8,6 +8,7 @@
 
 #include <cuda_runtime.h>
 #include <executorch/runtime/backend/interface.h>
+#include <executorch/runtime/backend/options.h>
 #include <executorch/runtime/core/error.h>
 #include <executorch/runtime/core/evalue.h>
 #include <executorch/runtime/core/exec_aten/util/tensor_util.h>
@@ -16,6 +17,7 @@
 #include <filesystem>
 #include <fstream>
 #include <string>
+#include <unordered_map>
 #include <vector>
 
 // Include our shim layer headers
@@ -46,9 +48,88 @@ using executorch::runtime::Result;
 using executorch::runtime::Span;
 using executorch::runtime::etensor::Tensor;
 
+// Structure to hold a reference to a GPU tensor for "keep on device"
+// optimization. Owns the tensor handle - must be deleted when no longer needed.
+struct GpuTensorRef {
+  AOTITensorHandle handle; // Tensor handle (owned, for later deletion)
+  void* data_ptr; // GPU memory pointer (for D2D copy)
+  size_t size_bytes; // Total size in bytes
+};
+
 class ET_EXPERIMENTAL CudaBackend final
     : public ::executorch::runtime::BackendInterface {
  private:
+  // ==========================================================================
+  // GPU Tensor Storage for D2D Copy Optimization
+  // ==========================================================================
+  //
+  // This backend supports storing GPU tensors between execute() calls to enable
+  // device-to-device (D2D) copies instead of slower host-to-device (H2D)
+  // copies. This is useful for encoder-decoder models where the encoder output
+  // is reused across many decoder iterations.
+  //
+  // SUPPORTED OPTIONS (via set_option):
+  //
+  // "store_output" (string): Store the output tensor under this name after
+  //   the next execute() call. The tensor remains on GPU until cleared.
+  //   Only supports single-output methods.
+  //   Example: opts.set_option("store_output", "encoder_output");
+  //
+  // "use_stored_input" (string): For inputs matching the stored tensor's
+  //   size, use D2D copy from the stored tensor instead of H2D copy from CPU.
+  //   This setting persists across execute() calls until reset.
+  //   Example: opts.set_option("use_stored_input", "encoder_output");
+  //
+  // "reset_stored_input" (bool): Clear the use_stored_input setting.
+  //   Does NOT delete the stored tensor - only stops using it for D2D.
+  //   Example: opts.set_option("reset_stored_input", true);
+  //
+  // "clear_stored_tensor" (string): Delete the named tensor from storage,
+  //   freeing GPU memory. Use after decoder loop completes.
+  //   Example: opts.set_option("clear_stored_tensor", "encoder_output");
+  //
+  // TYPICAL USAGE PATTERN (encoder-decoder model):
+  //
+  // 1. Before encoder: set_option("store_output", "encoder_output")
+  // 2. Execute encoder (output is stored on GPU)
+  // 3. Before decoder loop: set_option("use_stored_input", "encoder_output")
+  // 4. Execute decoder N times (D2D copies for encoder output input)
+  // 5. After decoder loop:
+  //      set_option("reset_stored_input", true)
+  //      set_option("clear_stored_tensor", "encoder_output")
+  //
+  // ==========================================================================
+
+  // Storage control options (set via set_option before execute)
+  mutable std::string
+      store_output_name_; // Name to store output under (empty = none)
+  mutable std::string
+      use_stored_input_name_; // Name of stored tensor to use (empty = none)
+
+  // Per-instance map of named GPU tensor references.
+  // Mutable because execute() is const but needs to modify this.
+  //
+  // LIFETIME CONTRACT:
+  // - Stored tensors are valid until overwritten or destroy() is called.
+  // - Caller must ensure the producing execute() call (e.g., encoder) completes
+  //   before any consuming execute() call (e.g., decoder) begins.
+  // - Caller must not call destroy() while execute() is in progress.
+  // - Overwriting a tensor (same name) deletes the old tensor immediately,
+  //   so caller must ensure no concurrent execute() is using it.
+  mutable std::unordered_map<std::string, GpuTensorRef> gpu_tensors_;
+
+  // Helper to clear stored GPU tensors and free their memory.
+  // Only call when no execute() is in progress.
+  void clear_gpu_tensors() const {
+    for (auto& pair : gpu_tensors_) {
+      if (pair.second.handle != nullptr) {
+        aoti_torch_delete_tensor_object(pair.second.handle);
+      }
+    }
+    gpu_tensors_.clear();
+  }
+
   Error load_function_pointers_into_handle(
       void* so_handle,
       AOTIDelegateHandle* handle) const {
@@ -91,6 +172,70 @@ class ET_EXPERIMENTAL CudaBackend final
     return 1;
   }
 
+  Error set_option(
+      __ET_UNUSED executorch::runtime::BackendOptionContext& context,
+      const executorch::runtime::Span<executorch::runtime::BackendOption>&
+          backend_options) override {
+    for (size_t i = 0; i < backend_options.size(); i++) {
+      const auto& option = backend_options[i];
+      // Handle store_output: expects a string name (e.g., "encoder_output")
+      if (strcmp(option.key, "store_output") == 0) {
+        if (auto* arr = std::get_if<
+                std::array<char, executorch::runtime::kMaxOptionValueLength>>(
+                &option.value)) {
+          store_output_name_ = std::string(arr->data());
+        } else {
+          ET_LOG(Error, "store_output option expects a string value");
+          return Error::InvalidArgument;
+        }
+      }
+      // Handle use_stored_input: expects a string name (e.g., "encoder_output")
+      else if (strcmp(option.key, "use_stored_input") == 0) {
+        if (auto* arr = std::get_if<
+                std::array<char, executorch::runtime::kMaxOptionValueLength>>(
+                &option.value)) {
+          use_stored_input_name_ = std::string(arr->data());
+        } else {
+          ET_LOG(Error, "use_stored_input option expects a string value");
+          return Error::InvalidArgument;
+        }
+      }
+      // Handle reset_stored_input: expects a boolean value
+      // Note: This only resets the name setting. The stored GPU tensor
+      // remains in memory until overwritten or destroy() is called.
+      else if (strcmp(option.key, "reset_stored_input") == 0) {
+        if (auto* val = std::get_if<bool>(&option.value)) {
+          if (*val) {
+            use_stored_input_name_.clear();
+          }
+        } else {
+          ET_LOG(Error, "reset_stored_input option expects a boolean value");
+          return Error::InvalidArgument;
+        }
+      }
+      // Handle clear_stored_tensor: expects a string name
+      // Deletes the named GPU tensor from storage, freeing GPU memory.
+      else if (strcmp(option.key, "clear_stored_tensor") == 0) {
+        if (auto* arr = std::get_if<
+                std::array<char, executorch::runtime::kMaxOptionValueLength>>(
+                &option.value)) {
+          std::string name(arr->data());
+          auto it = gpu_tensors_.find(name);
+          if (it != gpu_tensors_.end()) {
+            if (it->second.handle != nullptr) {
+              aoti_torch_delete_tensor_object(it->second.handle);
+            }
+            gpu_tensors_.erase(it);
+          }
+        } else {
+          ET_LOG(Error, "clear_stored_tensor option expects a string value");
+          return Error::InvalidArgument;
+        }
+      }
+    }
+    return Error::Ok;
+  }
+
   // Once per loaded binary blob
   Result<DelegateHandle*> init(
       BackendInitContext& context,
@@ -222,15 +367,52 @@ class ET_EXPERIMENTAL CudaBackend final
     std::vector<AOTITensorHandle> gpu_outputs(
         n_outputs); // GPU tensors for kernel output
 
+    // RAII helper to ensure GPU tensors are cleaned up on all exit paths.
+    // Prevents memory leaks when errors occur during execute().
+    struct TensorCleanup {
+      std::vector<AOTITensorHandle>& inputs;
+      std::vector<AOTITensorHandle>& outputs;
+      const std::unordered_map<std::string, GpuTensorRef>& stored_tensors;
+
+      ~TensorCleanup() {
+        // Clean up input tensors
+        for (auto* handle : inputs) {
+          if (handle != nullptr) {
+            aoti_torch_delete_tensor_object(handle);
+          }
+        }
+        // Clean up output tensors, except those that are stored
+        for (auto* handle : outputs) {
+          if (handle != nullptr) {
+            bool is_stored = false;
+            for (const auto& pair : stored_tensors) {
+              if (pair.second.handle == handle) {
+                is_stored = true;
+                break;
+              }
+            }
+            if (!is_stored) {
+              aoti_torch_delete_tensor_object(handle);
+            }
+          }
+        }
+      }
+    };
+    TensorCleanup cleanup{gpu_inputs, gpu_outputs, gpu_tensors_};
+
+    // Track which input index was matched for D2D copy (for duplicate
+    // detection)
+    ssize_t matched_input_idx = -1;
+
     // Process input tensors: ExecuTorch provides CPU tensors, create GPU
-    // copies
-    for (int i = 0; i < n_inputs; i++) {
+    // copies. For stored inputs, use GPU-to-GPU copy instead of CPU-to-GPU.
+    for (size_t i = 0; i < n_inputs; i++) {
       // Get tensor dimensions and properties from ExecuTorch CPU tensor
       auto cpu_tensor = &(args[i]->toTensor());
       auto sizes = cpu_tensor->sizes();
       auto scalar_type = cpu_tensor->scalar_type();
 
-      // Create GPU tensor with same shape
+      // Create GPU tensor with same shape (always needed for AOTI format)
       std::vector<int64_t> sizes_vec(sizes.begin(), sizes.end());
 
       AOTITensorHandle gpu_input_handle;
@@ -246,21 +428,75 @@ class ET_EXPERIMENTAL CudaBackend final
       ET_CHECK_OR_RETURN_ERROR(
           create_err == Error::Ok,
           Internal,
-          "Failed to create GPU tensor for input %d",
+          "Failed to create GPU tensor for input %zu",
           i);
 
       gpu_inputs[i] = gpu_input_handle;
 
-      // Copy data from CPU to GPU
+      // Check if this input matches a stored GPU tensor (by size).
+      if (!use_stored_input_name_.empty()) {
+        auto it = gpu_tensors_.find(use_stored_input_name_);
+        if (it != gpu_tensors_.end()) {
+          const GpuTensorRef& ref = it->second;
+          size_t numel = gpu_inputs[i]->numel();
+          size_t elem_size = gpu_inputs[i]->element_size();
+          size_t copy_bytes = numel * elem_size;
+
+          // Match by size: use stored tensor if sizes match
+          if (copy_bytes == ref.size_bytes) {
+            if (matched_input_idx >= 0) {
+              // Another input already matched - warn about ambiguity
+              ET_LOG(
+                  Error,
+                  "Multiple inputs match stored tensor '%s' size (%zu bytes): "
+                  "input %zd was used, input %zu also matches. "
+                  "Consider using unique tensor sizes or a different matching strategy.",
+                  use_stored_input_name_.c_str(),
+                  copy_bytes,
+                  matched_input_idx,
+                  i);
+            } else {
+              // First match - perform D2D copy
+              matched_input_idx = static_cast<ssize_t>(i);
+
+              ET_LOG(
+                  Debug,
+                  "Using stored tensor '%s' for input %zu (%zu bytes, D2D copy)",
+                  use_stored_input_name_.c_str(),
+                  i,
+                  copy_bytes);
+
+              // GPU-to-GPU copy: fast DMA transfer, normalizes tensor format
+              cudaError_t cuda_err = cudaMemcpy(
+                  gpu_inputs[i]->data_ptr(),
+                  ref.data_ptr,
+                  copy_bytes,
+                  cudaMemcpyDeviceToDevice);
+
+              ET_CHECK_OR_RETURN_ERROR(
+                  cuda_err == cudaSuccess,
+                  Internal,
+                  "Failed GPU-to-GPU copy for input %zu: %s",
+                  i,
+                  cudaGetErrorString(cuda_err));
+
+              // Skip the CPU-to-GPU copy below
+              continue;
+            }
+          }
+        }
+      }
+
+      // Copy data from CPU to GPU (normal path)
       ET_CHECK_OR_RETURN_ERROR(
           aoti_torch_copy_(gpu_inputs[i], cpu_tensor, 0) == Error::Ok,
           Internal,
-          "Failed to copy input %d from CPU to GPU",
+          "Failed to copy input %zu from CPU to GPU",
           i);
     }
     // Process output tensors: create GPU counterparts for ExecuTorch CPU
     // tensors
-    for (int i = 0; i < n_outputs; i++) {
+    for (size_t i = 0; i < n_outputs; i++) {
       // Get output tensor dimensions from ExecuTorch CPU tensor
       auto cpu_output_tensor = &(args[i + n_inputs]->toTensor());
       auto sizes = cpu_output_tensor->sizes();
@@ -282,7 +518,7 @@ class ET_EXPERIMENTAL CudaBackend final
       ET_CHECK_OR_RETURN_ERROR(
           create_err == Error::Ok,
           Internal,
-          "Failed to create GPU tensor for output %d",
+          "Failed to create GPU tensor for output %zu",
           i);
 
       gpu_outputs[i] = gpu_output_handle;
@@ -303,20 +539,65 @@ class ET_EXPERIMENTAL CudaBackend final
         "AOTInductorModelContainerRun failed with error code %d",
         error);
 
+    // Store reference to output GPU tensor if requested.
+    // The tensor will be kept alive for later D2D copy to decoder inputs.
+    if (!store_output_name_.empty()) {
+      ET_CHECK_OR_RETURN_ERROR(
+          n_outputs == 1,
+          InvalidArgument,
+          "store_output only supports single-output methods, got %zu outputs",
+          n_outputs);
+
+      auto* gpu_tensor = gpu_outputs[0];
+      size_t numel = gpu_tensor->numel();
+      size_t elem_size = gpu_tensor->element_size();
+      size_t size_bytes = numel * elem_size;
+
+      // Delete old tensor if overwriting (erase first to prevent double-free)
+      auto old_it = gpu_tensors_.find(store_output_name_);
+      if (old_it != gpu_tensors_.end()) {
+        AOTITensorHandle old_handle = old_it->second.handle;
+        gpu_tensors_.erase(old_it); // Remove from map before deleting
+        if (old_handle != nullptr) {
+          aoti_torch_delete_tensor_object(old_handle);
+        }
+      }
+
+      // Store tensor reference (we now own this tensor)
+      GpuTensorRef ref;
+      ref.handle = gpu_tensor;
+      ref.data_ptr = gpu_tensor->data_ptr();
+      ref.size_bytes = size_bytes;
+      gpu_tensors_[store_output_name_] = ref;
+
+      // Reset store_output name after storing
+      store_output_name_.clear();
+    }
+
     // Copy GPU output results back to CPU output tensors
-    for (int i = 0; i < n_outputs; i++) {
+    for (size_t i = 0; i < n_outputs; i++) {
       auto cpu_output_tensor = &(args[i + n_inputs]->toTensor());
       // For DYNAMIC_BOUND tensors we try to resize
       ET_CHECK_OK_OR_RETURN_ERROR(
           resize_tensor(*cpu_output_tensor, gpu_outputs[i]->sizes()),
-          "Error resizing tensor at output index %d",
+          "Error resizing tensor at output index %zu",
           i);
       ET_CHECK_OK_OR_RETURN_ERROR(
           aoti_torch_copy_(cpu_output_tensor, gpu_outputs[i], 0),
-          "Failed to copy GPU output %d back to CPU",
+          "Failed to copy GPU output %zu back to CPU",
           i);
     }
 
+    // Memory management notes:
+    // - GPU tensor cleanup is handled by TensorCleanup RAII guard above.
+    // - use_stored_input setting persists across execute() calls to support
+    //   decoder loops that reuse the stored encoder output.
+    // - Stored GPU tensors (in gpu_tensors_) remain in memory until:
+    //   (a) overwritten by a new tensor with the same name, or
+    //   (b) destroy() is called, which frees all stored tensors.
+    // - The "reset_stored_input" option only resets the input name setting,
+    //   NOT the stored GPU tensors themselves.
+
     return Error::Ok;
   }
 
@@ -326,6 +607,9 @@ class ET_EXPERIMENTAL CudaBackend final
     }
     AOTIDelegateHandle* handle = (AOTIDelegateHandle*)handle_;
 
+    // Delete stored GPU tensors
+    clear_gpu_tensors();
+
     // Destroy the CUDA stream if it exists
     if (handle->cuda_stream != nullptr) {
       cudaStream_t cuda_stream = static_cast<cudaStream_t>(handle->cuda_stream);
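
Note: the option sequence documented inside cuda_backend.cpp maps to a small caller-side flow. The sketch below is illustrative only and is not part of this commit; the option keys, value types, and ordering come from the inline documentation above, while opts (a BackendOptions-style container, as in the inline opts.set_option(...) examples), apply_backend_options(), and run_method() are hypothetical stand-ins for whatever the ASR runner actually uses to deliver options to the CUDA backend and invoke the encoder and decoder methods.

    // Caller-side sketch (illustrative; the helpers below are hypothetical
    // stand-ins, only the option keys and values come from this diff).

    // 1. Ask the backend to cache the next output under "encoder_output".
    opts.set_option("store_output", "encoder_output");
    apply_backend_options("CudaBackend", opts);   // hypothetical delivery call

    // 2. Run the encoder once; its output tensor stays resident on the GPU.
    run_method("encoder", encoder_inputs);        // hypothetical runner call

    // 3. Ask the backend to satisfy size-matching inputs with a D2D copy.
    opts.set_option("use_stored_input", "encoder_output");
    apply_backend_options("CudaBackend", opts);

    // 4. Decoder loop: the encoder-output input is copied device-to-device.
    for (int step = 0; step < num_decoder_steps; ++step) {
      run_method("decoder", decoder_inputs);
    }

    // 5. Stop matching inputs and free the cached GPU tensor.
    opts.set_option("reset_stored_input", true);
    opts.set_option("clear_stored_tensor", "encoder_output");
    apply_backend_options("CudaBackend", opts);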
