
#include <cuda_runtime.h>
#include <executorch/runtime/backend/interface.h>
+ #include <executorch/runtime/backend/options.h>
#include <executorch/runtime/core/error.h>
#include <executorch/runtime/core/evalue.h>
#include <executorch/runtime/core/exec_aten/util/tensor_util.h>
#include <filesystem>
#include <fstream>
#include <string>
+ #include <unordered_map>
#include <vector>

// Include our shim layer headers
@@ -46,9 +48,88 @@ using executorch::runtime::Result;
using executorch::runtime::Span;
using executorch::runtime::etensor::Tensor;

+ // Structure to hold a reference to a GPU tensor for the "keep on device"
+ // optimization. Owns the tensor handle; the handle must be deleted when it
+ // is no longer needed.
+ struct GpuTensorRef {
+   AOTITensorHandle handle; // Tensor handle (owned, for later deletion)
+   void* data_ptr; // GPU memory pointer (for D2D copy)
+   size_t size_bytes; // Total size in bytes
+ };
+
class ET_EXPERIMENTAL CudaBackend final
    : public ::executorch::runtime::BackendInterface {
 private:
+   // =========================================================================
+   // GPU Tensor Storage for D2D Copy Optimization
+   // =========================================================================
+   //
+   // This backend supports storing GPU tensors between execute() calls to
+   // enable device-to-device (D2D) copies instead of slower host-to-device
+   // (H2D) copies. This is useful for encoder-decoder models where the
+   // encoder output is reused across many decoder iterations.
+   //
+   // SUPPORTED OPTIONS (via set_option):
+   //
+   // "store_output" (string): Store the output tensor under this name after
+   //   the next execute() call. The tensor remains on GPU until cleared.
+   //   Only supports single-output methods.
+   //   Example: opts.set_option("store_output", "encoder_output");
+   //
+   // "use_stored_input" (string): For inputs matching the stored tensor's
+   //   size, use a D2D copy from the stored tensor instead of an H2D copy
+   //   from CPU. This setting persists across execute() calls until reset.
+   //   Example: opts.set_option("use_stored_input", "encoder_output");
+   //
+   // "reset_stored_input" (bool): Clear the use_stored_input setting.
+   //   Does NOT delete the stored tensor; it only stops using it for D2D.
+   //   Example: opts.set_option("reset_stored_input", true);
+   //
+   // "clear_stored_tensor" (string): Delete the named tensor from storage,
+   //   freeing GPU memory. Use after the decoder loop completes.
+   //   Example: opts.set_option("clear_stored_tensor", "encoder_output");
+   //
+   // TYPICAL USAGE PATTERN (encoder-decoder model):
+   //
+   // 1. Before encoder: set_option("store_output", "encoder_output")
+   // 2. Execute encoder (output is stored on GPU)
+   // 3. Before decoder loop: set_option("use_stored_input", "encoder_output")
+   // 4. Execute decoder N times (D2D copies for the encoder-output input)
+   // 5. After decoder loop:
+   //      set_option("reset_stored_input", true)
+   //      set_option("clear_stored_tensor", "encoder_output")
+   //
+   // =========================================================================
+
+   // Storage control options (set via set_option before execute)
+   mutable std::string
+       store_output_name_; // Name to store output under (empty = none)
+   mutable std::string
+       use_stored_input_name_; // Name of stored tensor to use (empty = none)
+
+   // Per-instance map of named GPU tensor references.
+   // Mutable because execute() is const but needs to modify this.
+   //
+   // LIFETIME CONTRACT:
+   // - Stored tensors are valid until overwritten or destroy() is called.
+   // - The caller must ensure the producing execute() call (e.g., encoder)
+   //   completes before any consuming execute() call (e.g., decoder) begins.
+   // - The caller must not call destroy() while execute() is in progress.
+   // - Overwriting a tensor (same name) deletes the old tensor immediately,
+   //   so the caller must ensure no concurrent execute() is using it.
+   mutable std::unordered_map<std::string, GpuTensorRef> gpu_tensors_;
+
+   // Helper to clear stored GPU tensors and free their memory.
+   // Only call when no execute() is in progress.
+   void clear_gpu_tensors() const {
+     for (auto& pair : gpu_tensors_) {
+       if (pair.second.handle != nullptr) {
+         aoti_torch_delete_tensor_object(pair.second.handle);
+       }
+     }
+     gpu_tensors_.clear();
+   }
+
  Error load_function_pointers_into_handle(
      void* so_handle,
      AOTIDelegateHandle* handle) const {
@@ -91,6 +172,70 @@ class ET_EXPERIMENTAL CudaBackend final
    return 1;
  }

+   Error set_option(
+       __ET_UNUSED executorch::runtime::BackendOptionContext& context,
+       const executorch::runtime::Span<executorch::runtime::BackendOption>&
+           backend_options) override {
+     for (size_t i = 0; i < backend_options.size(); i++) {
+       const auto& option = backend_options[i];
+       // Handle store_output: expects a string name (e.g., "encoder_output")
+       if (strcmp(option.key, "store_output") == 0) {
+         if (auto* arr = std::get_if<
+                 std::array<char, executorch::runtime::kMaxOptionValueLength>>(
+                 &option.value)) {
+           store_output_name_ = std::string(arr->data());
+         } else {
+           ET_LOG(Error, "store_output option expects a string value");
+           return Error::InvalidArgument;
+         }
+       }
+       // Handle use_stored_input: expects a string name (e.g.,
+       // "encoder_output")
+       else if (strcmp(option.key, "use_stored_input") == 0) {
+         if (auto* arr = std::get_if<
+                 std::array<char, executorch::runtime::kMaxOptionValueLength>>(
+                 &option.value)) {
+           use_stored_input_name_ = std::string(arr->data());
+         } else {
+           ET_LOG(Error, "use_stored_input option expects a string value");
+           return Error::InvalidArgument;
+         }
+       }
+       // Handle reset_stored_input: expects a boolean value.
+       // Note: This only resets the name setting. The stored GPU tensor
+       // remains in memory until overwritten or destroy() is called.
+       else if (strcmp(option.key, "reset_stored_input") == 0) {
+         if (auto* val = std::get_if<bool>(&option.value)) {
+           if (*val) {
+             use_stored_input_name_.clear();
+           }
+         } else {
+           ET_LOG(Error, "reset_stored_input option expects a boolean value");
+           return Error::InvalidArgument;
+         }
+       }
+       // Handle clear_stored_tensor: expects a string name.
+       // Deletes the named GPU tensor from storage, freeing GPU memory.
+       else if (strcmp(option.key, "clear_stored_tensor") == 0) {
+         if (auto* arr = std::get_if<
+                 std::array<char, executorch::runtime::kMaxOptionValueLength>>(
+                 &option.value)) {
+           std::string name(arr->data());
+           auto it = gpu_tensors_.find(name);
+           if (it != gpu_tensors_.end()) {
+             if (it->second.handle != nullptr) {
+               aoti_torch_delete_tensor_object(it->second.handle);
+             }
+             gpu_tensors_.erase(it);
+           }
+         } else {
+           ET_LOG(Error, "clear_stored_tensor option expects a string value");
+           return Error::InvalidArgument;
+         }
+       }
+     }
+     return Error::Ok;
+   }
+
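Taken together, set_option() gives the host application full control of the stored-tensor lifecycle. A minimal caller-side sketch of the encoder-decoder pattern documented above (assuming the BackendOptions helper from executorch/runtime/backend/options.h, which this change includes; how the options are routed to this backend instance, and the run_encoder()/run_decoder_step() helpers, are hypothetical placeholders):

    // Sketch only: the option keys are the ones handled above; option
    // routing and the run_* helpers are illustrative, not real APIs.
    executorch::runtime::BackendOptions<4> opts;

    // Steps 1-2: keep the encoder output on the GPU after the next execute().
    opts.set_option("store_output", "encoder_output");
    // ... deliver opts to the runtime so CudaBackend::set_option() runs,
    // then execute the encoder once ...
    run_encoder();

    // Steps 3-4: decoder iterations get the stored tensor via D2D copies.
    opts.set_option("use_stored_input", "encoder_output");
    for (int step = 0; step < num_decode_steps; ++step) {
      run_decoder_step(step);
    }

    // Step 5: stop matching and free the stored GPU memory.
    opts.set_option("reset_stored_input", true);
    opts.set_option("clear_stored_tensor", "encoder_output");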
  // Once per loaded binary blob
  Result<DelegateHandle*> init(
      BackendInitContext& context,
@@ -222,15 +367,52 @@ class ET_EXPERIMENTAL CudaBackend final
    std::vector<AOTITensorHandle> gpu_outputs(
        n_outputs); // GPU tensors for kernel output

+     // RAII helper to ensure GPU tensors are cleaned up on all exit paths.
+     // Prevents memory leaks when errors occur during execute().
+     struct TensorCleanup {
+       std::vector<AOTITensorHandle>& inputs;
+       std::vector<AOTITensorHandle>& outputs;
+       const std::unordered_map<std::string, GpuTensorRef>& stored_tensors;
+
+       ~TensorCleanup() {
+         // Clean up input tensors
+         for (auto* handle : inputs) {
+           if (handle != nullptr) {
+             aoti_torch_delete_tensor_object(handle);
+           }
+         }
+         // Clean up output tensors, except those that are stored
+         for (auto* handle : outputs) {
+           if (handle != nullptr) {
+             bool is_stored = false;
+             for (const auto& pair : stored_tensors) {
+               if (pair.second.handle == handle) {
+                 is_stored = true;
+                 break;
+               }
+             }
+             if (!is_stored) {
+               aoti_torch_delete_tensor_object(handle);
+             }
+           }
+         }
+       }
+     };
+     TensorCleanup cleanup{gpu_inputs, gpu_outputs, gpu_tensors_};
+
+     // Track which input index was matched for D2D copy (for duplicate
+     // detection)
+     ssize_t matched_input_idx = -1;
+
    // Process input tensors: ExecuTorch provides CPU tensors, create GPU
-     // copies
-     for (int i = 0; i < n_inputs; i++) {
+     // copies. For stored inputs, use GPU-to-GPU copy instead of CPU-to-GPU.
+     for (size_t i = 0; i < n_inputs; i++) {
      // Get tensor dimensions and properties from ExecuTorch CPU tensor
      auto cpu_tensor = &(args[i]->toTensor());
      auto sizes = cpu_tensor->sizes();
      auto scalar_type = cpu_tensor->scalar_type();

-       // Create GPU tensor with same shape
+       // Create GPU tensor with same shape (always needed for AOTI format)
      std::vector<int64_t> sizes_vec(sizes.begin(), sizes.end());

      AOTITensorHandle gpu_input_handle;
@@ -246,21 +428,75 @@ class ET_EXPERIMENTAL CudaBackend final
      ET_CHECK_OR_RETURN_ERROR(
          create_err == Error::Ok,
          Internal,
-           "Failed to create GPU tensor for input %d",
+           "Failed to create GPU tensor for input %zu",
          i);

      gpu_inputs[i] = gpu_input_handle;

-       // Copy data from CPU to GPU
+       // Check if this input matches a stored GPU tensor (by size).
+       if (!use_stored_input_name_.empty()) {
+         auto it = gpu_tensors_.find(use_stored_input_name_);
+         if (it != gpu_tensors_.end()) {
+           const GpuTensorRef& ref = it->second;
+           size_t numel = gpu_inputs[i]->numel();
+           size_t elem_size = gpu_inputs[i]->element_size();
+           size_t copy_bytes = numel * elem_size;
+
+           // Match by size: use stored tensor if sizes match
+           if (copy_bytes == ref.size_bytes) {
+             if (matched_input_idx >= 0) {
+               // Another input already matched - warn about ambiguity
+               ET_LOG(
+                   Error,
+                   "Multiple inputs match stored tensor '%s' size (%zu bytes): "
+                   "input %zd was used, input %zu also matches. "
+                   "Consider using unique tensor sizes or a different "
+                   "matching strategy.",
+                   use_stored_input_name_.c_str(),
+                   copy_bytes,
+                   matched_input_idx,
+                   i);
+             } else {
+               // First match - perform D2D copy
+               matched_input_idx = static_cast<ssize_t>(i);
+
+               ET_LOG(
+                   Debug,
+                   "Using stored tensor '%s' for input %zu (%zu bytes, D2D copy)",
+                   use_stored_input_name_.c_str(),
+                   i,
+                   copy_bytes);
+
+               // GPU-to-GPU copy: fast DMA transfer, normalizes tensor format
+               cudaError_t cuda_err = cudaMemcpy(
+                   gpu_inputs[i]->data_ptr(),
+                   ref.data_ptr,
+                   copy_bytes,
+                   cudaMemcpyDeviceToDevice);
+
+               ET_CHECK_OR_RETURN_ERROR(
+                   cuda_err == cudaSuccess,
+                   Internal,
+                   "Failed GPU-to-GPU copy for input %zu: %s",
+                   i,
+                   cudaGetErrorString(cuda_err));
+
+               // Skip the CPU-to-GPU copy below
+               continue;
+             }
+           }
+         }
+       }
+
+       // Copy data from CPU to GPU (normal path)
      ET_CHECK_OR_RETURN_ERROR(
          aoti_torch_copy_(gpu_inputs[i], cpu_tensor, 0) == Error::Ok,
          Internal,
-           "Failed to copy input %d from CPU to GPU",
+           "Failed to copy input %zu from CPU to GPU",
          i);
    }
    // Process output tensors: create GPU counterparts for ExecuTorch CPU
    // tensors
-     for (int i = 0; i < n_outputs; i++) {
+     for (size_t i = 0; i < n_outputs; i++) {
      // Get output tensor dimensions from ExecuTorch CPU tensor
      auto cpu_output_tensor = &(args[i + n_inputs]->toTensor());
      auto sizes = cpu_output_tensor->sizes();
@@ -282,7 +518,7 @@ class ET_EXPERIMENTAL CudaBackend final
      ET_CHECK_OR_RETURN_ERROR(
          create_err == Error::Ok,
          Internal,
-           "Failed to create GPU tensor for output %d",
+           "Failed to create GPU tensor for output %zu",
          i);

      gpu_outputs[i] = gpu_output_handle;
@@ -303,20 +539,65 @@ class ET_EXPERIMENTAL CudaBackend final
        "AOTInductorModelContainerRun failed with error code %d",
        error);

+     // Store a reference to the output GPU tensor if requested.
+     // The tensor is kept alive for later D2D copies to decoder inputs.
+     if (!store_output_name_.empty()) {
+       ET_CHECK_OR_RETURN_ERROR(
+           n_outputs == 1,
+           InvalidArgument,
+           "store_output only supports single-output methods, got %zu outputs",
+           n_outputs);
+
+       auto* gpu_tensor = gpu_outputs[0];
+       size_t numel = gpu_tensor->numel();
+       size_t elem_size = gpu_tensor->element_size();
+       size_t size_bytes = numel * elem_size;
+
+       // Delete the old tensor if overwriting (erase first to prevent a
+       // double free)
+       auto old_it = gpu_tensors_.find(store_output_name_);
+       if (old_it != gpu_tensors_.end()) {
+         AOTITensorHandle old_handle = old_it->second.handle;
+         gpu_tensors_.erase(old_it); // Remove from map before deleting
+         if (old_handle != nullptr) {
+           aoti_torch_delete_tensor_object(old_handle);
+         }
+       }
+
+       // Store the tensor reference (we now own this tensor)
+       GpuTensorRef ref;
+       ref.handle = gpu_tensor;
+       ref.data_ptr = gpu_tensor->data_ptr();
+       ref.size_bytes = size_bytes;
+       gpu_tensors_[store_output_name_] = ref;
+
+       // Reset the store_output name after storing
+       store_output_name_.clear();
+     }
+
    // Copy GPU output results back to CPU output tensors
-     for (int i = 0; i < n_outputs; i++) {
+     for (size_t i = 0; i < n_outputs; i++) {
      auto cpu_output_tensor = &(args[i + n_inputs]->toTensor());
      // For DYNAMIC_BOUND tensors we try to resize
      ET_CHECK_OK_OR_RETURN_ERROR(
          resize_tensor(*cpu_output_tensor, gpu_outputs[i]->sizes()),
-           "Error resizing tensor at output index %d",
+           "Error resizing tensor at output index %zu",
          i);
      ET_CHECK_OK_OR_RETURN_ERROR(
          aoti_torch_copy_(cpu_output_tensor, gpu_outputs[i], 0),
-           "Failed to copy GPU output %d back to CPU",
+           "Failed to copy GPU output %zu back to CPU",
          i);
    }

+     // Memory management notes:
+     // - GPU tensor cleanup is handled by the TensorCleanup RAII guard above.
+     // - The use_stored_input setting persists across execute() calls to
+     //   support decoder loops that reuse the stored encoder output.
+     // - Stored GPU tensors (in gpu_tensors_) remain in memory until:
+     //   (a) overwritten by a new tensor with the same name, or
+     //   (b) destroy() is called, which frees all stored tensors.
+     // - The "reset_stored_input" option only resets the input name setting,
+     //   NOT the stored GPU tensors themselves.
+
    return Error::Ok;
  }

@@ -326,6 +607,9 @@ class ET_EXPERIMENTAL CudaBackend final
    }
    AOTIDelegateHandle* handle = (AOTIDelegateHandle*)handle_;

+     // Delete stored GPU tensors
+     clear_gpu_tensors();
+
    // Destroy the CUDA stream if it exists
    if (handle->cuda_stream != nullptr) {
      cudaStream_t cuda_stream = static_cast<cudaStream_t>(handle->cuda_stream);
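
For context on why the D2D path is worth the bookkeeping: a device-to-device cudaMemcpy stays on the GPU's own memory bus, while the host-to-device path the backend otherwise takes must cross PCIe. A standalone sketch (not part of this change; the 16 MiB size and pageable host buffer are arbitrary illustrations, and error checks are omitted for brevity) that times both paths with the same CUDA runtime calls the backend uses:

    #include <cstdio>
    #include <cstdlib>
    #include <cuda_runtime.h>

    int main() {
      const size_t bytes = 16 * 1024 * 1024; // e.g. an encoder-output-sized blob
      void* host = malloc(bytes);
      void *src = nullptr, *dst = nullptr;
      cudaMalloc(&src, bytes);
      cudaMalloc(&dst, bytes);

      cudaEvent_t start, stop;
      cudaEventCreate(&start);
      cudaEventCreate(&stop);
      float h2d_ms = 0.f, d2d_ms = 0.f;

      // Host-to-device: crosses PCIe (the backend's normal input path).
      cudaEventRecord(start);
      cudaMemcpy(dst, host, bytes, cudaMemcpyHostToDevice);
      cudaEventRecord(stop);
      cudaEventSynchronize(stop);
      cudaEventElapsedTime(&h2d_ms, start, stop);

      // Device-to-device: on-GPU DMA (the stored-tensor fast path).
      cudaEventRecord(start);
      cudaMemcpy(dst, src, bytes, cudaMemcpyDeviceToDevice);
      cudaEventRecord(stop);
      cudaEventSynchronize(stop);
      cudaEventElapsedTime(&d2d_ms, start, stop);

      printf("H2D: %.3f ms  D2D: %.3f ms\n", h2d_ms, d2d_ms);

      cudaFree(src);
      cudaFree(dst);
      free(host);
      cudaEventDestroy(start);
      cudaEventDestroy(stop);
      return 0;
    }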