diff --git a/CMakeLists.txt b/CMakeLists.txt index 9233898..9a1bb3c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -64,7 +64,7 @@ set_ifndef(CUDA_DIR /usr/local/cuda-${CUDA_CTK_VERSION}) if(NOT DEFINED AARCH64_BUILD) set(CMAKE_CUDA_ARCHITECTURES 80;86;89) if(CUDA_CTK_VERSION VERSION_GREATER_EQUAL 12.8) - list(APPEND CMAKE_CUDA_ARCHITECTURES 100 120) + list(APPEND CMAKE_CUDA_ARCHITECTURES 100 110 120) endif() endif() diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index eab3baf..451c1b5 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -98,7 +98,7 @@ add_library( ${RUNTIME_CPP_SRCS} ${PROFILER_CPP_SRCS}) target_include_directories(edgellmCore PRIVATE ${COMMON_INCLUDE_DIRS}) -target_link_libraries(edgellmCore PRIVATE dl) +target_link_libraries(edgellmCore PRIVATE dl curand) # Apply FMHA SM exclusion definitions target_compile_definitions(edgellmCore PRIVATE ${FMHA_EXCLUDE_DEFINITIONS}) # Enable separable compilation for MoE Marlin kernel templates diff --git a/cpp/common/inputLimits.h b/cpp/common/inputLimits.h index 5b72c60..c245228 100644 --- a/cpp/common/inputLimits.h +++ b/cpp/common/inputLimits.h @@ -42,7 +42,7 @@ constexpr int kReasonableMaxBatchSize = 16; // Validation limits for message parsing. constexpr size_t kMaxMessageContentSizeBytes = 128 * 1024; // 128KB per content item constexpr size_t kMaxMessagesPerRequest = 64; -constexpr size_t kMaxContentItemsPerMessage = 16; +constexpr size_t kMaxContentItemsPerMessage = 64; // Alpamayo uses 4 cameras x 4 frames = 16 images + text items } // namespace security diff --git a/cpp/kernels/alpamayoExpertKernels/alpamayoExpertKernels.cu b/cpp/kernels/alpamayoExpertKernels/alpamayoExpertKernels.cu new file mode 100644 index 0000000..1e3bc63 --- /dev/null +++ b/cpp/kernels/alpamayoExpertKernels/alpamayoExpertKernels.cu @@ -0,0 +1,118 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * CUDA kernels for AlpamayoExpertRunner. 
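+ * Covers the device-side helpers used by the diffusion expert: KV-cache reshape/broadcast to the candidate batch, position-id construction, timestep fill, and the Euler update used for flow-matching integration.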
+ */ + +#include "alpamayoExpertKernels.h" + +namespace trt_edgellm +{ +namespace kernel +{ + +namespace +{ + +__global__ void kvCacheReshapeRepeatKernel(float* __restrict__ dst, half const* __restrict__ src, int32_t numLayers, + int32_t numKVHeads, int32_t seqLen, int32_t headDim, int32_t numCandidates, int64_t totalElements) +{ + int64_t const idx = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + if (idx >= totalElements) + return; + + // Decompose: dst[layer2, cand, head, s, d] + int64_t rem = idx; + int32_t const d = rem % headDim; + rem /= headDim; + int32_t const s = rem % seqLen; + rem /= seqLen; + int32_t const head = rem % numKVHeads; + rem /= numKVHeads; + rem /= numCandidates; // skip cand dimension (broadcast) + int32_t const layer2 = rem; + + int32_t const layer = layer2 / 2; + int32_t const kv = layer2 % 2; + + // src layout: [layer, kv, head, s, d] contiguous + int64_t const hsd = static_cast(numKVHeads) * seqLen * headDim; + int64_t const srcIdx = (static_cast(layer) * 2 + kv) * hsd + static_cast(head) * seqLen * headDim + + static_cast(s) * headDim + d; + + dst[idx] = __half2float(src[srcIdx]); +} + +__global__ void buildPositionIdsKernel(int64_t* __restrict__ posIds, int32_t total, int32_t numTokens, int64_t basePos) +{ + int32_t const idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= total) + return; + posIds[idx] = basePos + (idx % numTokens); +} + +__global__ void fillTimestepKernel(float* __restrict__ dst, int32_t n, float val) +{ + int32_t const idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < n) + dst[idx] = val; +} + +__global__ void eulerUpdateKernel(float* __restrict__ x, float const* __restrict__ v, float dt, int32_t n) +{ + int32_t const idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < n) + x[idx] += v[idx] * dt; +} + +} // anonymous namespace + +void kvCacheReshapeRepeat(float* dst, half const* src, int32_t numLayers, int32_t numKVHeads, int32_t seqLen, + int32_t headDim, int32_t numCandidates, cudaStream_t stream) +{ + int64_t const totalElements = static_cast(numLayers) * 2 * numCandidates * numKVHeads * seqLen * headDim; + int32_t const blockSize = 256; + int32_t const numBlocks = static_cast((totalElements + blockSize - 1) / blockSize); + kvCacheReshapeRepeatKernel<<>>( + dst, src, numLayers, numKVHeads, seqLen, headDim, numCandidates, totalElements); +} + +void buildPositionIds(int64_t* posIds, int32_t numCandidates, int32_t numTokens, int64_t basePos, cudaStream_t stream) +{ + int32_t const total = 3 * numCandidates * numTokens; + int32_t const blockSize = 256; + int32_t const numBlocks = (total + blockSize - 1) / blockSize; + buildPositionIdsKernel<<>>(posIds, total, numTokens, basePos); +} + +void fillTimestep(float* dst, int32_t numCandidates, float tVal, cudaStream_t stream) +{ + int32_t const blockSize = 256; + int32_t const numBlocks = (numCandidates + blockSize - 1) / blockSize; + fillTimestepKernel<<>>(dst, numCandidates, tVal); +} + +void eulerUpdate(float* x, float const* v, float dt, int32_t n, cudaStream_t stream) +{ + int32_t const blockSize = 256; + int32_t const numBlocks = (n + blockSize - 1) / blockSize; + eulerUpdateKernel<<>>(x, v, dt, n); +} + +} // namespace kernel +} // namespace trt_edgellm diff --git a/cpp/kernels/alpamayoExpertKernels/alpamayoExpertKernels.h b/cpp/kernels/alpamayoExpertKernels/alpamayoExpertKernels.h new file mode 100644 index 0000000..53e5995 --- /dev/null +++ b/cpp/kernels/alpamayoExpertKernels/alpamayoExpertKernels.h @@ -0,0 +1,43 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 
2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * CUDA kernels for AlpamayoExpertRunner. + */ + +#pragma once + +#include +#include +#include + +namespace trt_edgellm +{ +namespace kernel +{ + +void kvCacheReshapeRepeat(float* dst, half const* src, int32_t numLayers, int32_t numKVHeads, int32_t seqLen, + int32_t headDim, int32_t numCandidates, cudaStream_t stream); + +void buildPositionIds(int64_t* posIds, int32_t numCandidates, int32_t numTokens, int64_t basePos, cudaStream_t stream); + +void fillTimestep(float* dst, int32_t numCandidates, float tVal, cudaStream_t stream); + +void eulerUpdate(float* x, float const* v, float dt, int32_t n, cudaStream_t stream); + +} // namespace kernel +} // namespace trt_edgellm diff --git a/cpp/multimodal/multimodalRunner.cpp b/cpp/multimodal/multimodalRunner.cpp index 8bf9efa..2e699fe 100644 --- a/cpp/multimodal/multimodalRunner.cpp +++ b/cpp/multimodal/multimodalRunner.cpp @@ -133,5 +133,15 @@ bool MultimodalRunner::preprocessSystemPrompt([[maybe_unused]] std::string const return true; } +bool MultimodalRunner::preprocessPreparedVisual([[maybe_unused]] rt::LLMGenerationRequest const& request, + [[maybe_unused]] std::vector>& batchedInputIds, + [[maybe_unused]] tokenizer::Tokenizer const* tokenizer, [[maybe_unused]] rt::Tensor& ropeRotaryCosSinDevice, + [[maybe_unused]] rt::Tensor const& pixelValues, [[maybe_unused]] rt::Tensor const& imageGridTHW, + [[maybe_unused]] cudaStream_t stream) +{ + LOG_ERROR("preprocessPreparedVisual is not implemented for this multimodal runner"); + return false; +} + } // namespace rt } // namespace trt_edgellm diff --git a/cpp/multimodal/multimodalRunner.h b/cpp/multimodal/multimodalRunner.h index 111906a..bff81a5 100644 --- a/cpp/multimodal/multimodalRunner.h +++ b/cpp/multimodal/multimodalRunner.h @@ -90,6 +90,26 @@ class MultimodalRunner tokenizer::Tokenizer const* tokenizer, [[maybe_unused]] rt::Tensor& ropeRotaryCosSinDevice, cudaStream_t stream) = 0; + /*! + * @brief Preprocess using already prepared visual inputs from an external pipeline. + * + * This bypasses image decoding / resize / patchification in TRT runtime and instead + * consumes tensors equivalent to processor outputs such as `pixel_values` and `image_grid_thw`. 
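+ * Expected layouts (as validated by the Qwen ViT implementation): `pixel_values` is a 2D FP16 tensor [num_patches, input_dim]; `image_grid_thw` is an int64 tensor [num_images, 3] holding (t, h, w) per image.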
+ * + * @param request Generation request with text/messages + * @param batchedInputIds Output batched input token IDs + * @param tokenizer Tokenizer instance + * @param ropeRotaryCosSinDevice RoPE cache tensor + * @param pixelValues Preprocessed visual input tensor [num_patches, input_dim] + * @param imageGridTHW Image grid tensor [num_images, 3] + * @param stream CUDA stream + * @return True on success, false on failure + */ + virtual bool preprocessPreparedVisual(rt::LLMGenerationRequest const& request, + std::vector>& batchedInputIds, tokenizer::Tokenizer const* tokenizer, + [[maybe_unused]] rt::Tensor& ropeRotaryCosSinDevice, [[maybe_unused]] rt::Tensor const& pixelValues, + [[maybe_unused]] rt::Tensor const& imageGridTHW, cudaStream_t stream); + /*! * @brief Used for KVCache saving where we need to conduct the tokenization of the system prompt and generate * ND-Rope parameters for the system prompt. diff --git a/cpp/multimodal/qwenViTRunner.cpp b/cpp/multimodal/qwenViTRunner.cpp index 261813d..0a3b4f3 100644 --- a/cpp/multimodal/qwenViTRunner.cpp +++ b/cpp/multimodal/qwenViTRunner.cpp @@ -768,6 +768,136 @@ bool QwenViTRunner::preprocess(rt::LLMGenerationRequest const& request, return true; } +bool QwenViTRunner::preprocessPreparedVisual(rt::LLMGenerationRequest const& request, + std::vector>& batchedInputIds, tokenizer::Tokenizer const* tokenizer, + rt::Tensor& ropeRotaryCosSinDevice, rt::Tensor const& pixelValues, rt::Tensor const& imageGridTHW, + cudaStream_t stream) +{ + try + { + check::check(pixelValues.getShape().getNumDims() == 2, "pixelValues must be 2D [num_patches, input_dim]"); + check::check(pixelValues.getDataType() == nvinfer1::DataType::kHALF, "pixelValues must be FP16"); + check::check( + imageGridTHW.getShape().getNumDims() == 2 && imageGridTHW.getShape()[1] == 3, "imageGridTHW must be [N, 3]"); + + int64_t const numImages = imageGridTHW.getShape()[0]; + std::vector imageGridTHWHostVec(numImages * 3); + CUDA_CHECK(cudaMemcpyAsync(imageGridTHWHostVec.data(), imageGridTHW.rawPointer(), + sizeof(int64_t) * numImages * 3, cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + + std::vector> imageGridTHWs; + imageGridTHWs.reserve(numImages); + std::vector imageTokenLengths; + imageTokenLengths.reserve(numImages); + std::vector numImagesPerRequest; + numImagesPerRequest.reserve(request.requests.size()); + + for (int64_t i = 0; i < numImages; ++i) + { + int64_t t = imageGridTHWHostVec[i * 3 + 0]; + int64_t h = imageGridTHWHostVec[i * 3 + 1]; + int64_t w = imageGridTHWHostVec[i * 3 + 2]; + imageGridTHWs.push_back({t, h, w}); + imageTokenLengths.push_back(t * h * w / (mConfig.mergeSize * mConfig.mergeSize)); + } + + // Count image items per request from message contents. + // For prepared_visual_input, the request may intentionally omit raw image content and + // provide a text-only prompt plus externally prepared pixel_values/image_grid_thw. + // In that case, infer the per-request image count from imageGridTHW for the single + // request we currently support. 
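+ // (For example, the Alpamayo 4-camera x 4-frame setup yields numImages == 16 alongside a text-only prompt.)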
+ int64_t totalImageCount = 0; + for (auto const& req : request.requests) + { + int64_t requestImageCount = 0; + for (auto const& msg : req.messages) + { + for (auto const& content : msg.contents) + { + if (content.type == "image") + { + requestImageCount++; + } + } + } + numImagesPerRequest.push_back(requestImageCount); + totalImageCount += requestImageCount; + } + if (totalImageCount == 0) + { + check::check(request.requests.size() == 1, + "prepared_visual_input without raw image messages currently supports a single request only"); + numImagesPerRequest[0] = numImages; + totalImageCount = numImages; + } + check::check(totalImageCount == numImages, "imageGridTHW image count does not match request image count"); + + // Populate mVitInput directly from the prepared processor output. + int64_t const totalSeqLength = pixelValues.getShape()[0]; + check::check(pixelValues.getShape()[1] == mConfig.inputDim, "pixelValues input dim mismatch"); + check::check(mVitInput.reshape({totalSeqLength, mConfig.inputDim}), "Tensor reshape failed"); + CUDA_CHECK(cudaMemcpyAsync(mVitInput.rawPointer(), pixelValues.rawPointer(), + totalSeqLength * mConfig.inputDim * sizeof(half), cudaMemcpyDeviceToDevice, stream)); + + // Build cu_seqlens and auxiliary tensors exactly like imagePreprocess tail. + int32_t* cuSeqlensData = mCuSeqlensHost.dataPointer(); + cuSeqlensData[0] = 0; + int64_t cuSeqlensSize = 1; + int64_t maxSeqLen = 0; + for (auto const& grid : imageGridTHWs) + { + int64_t curSeqLength = grid[0] * grid[1] * grid[2]; + int32_t prevCuSeqlen = cuSeqlensData[cuSeqlensSize - 1]; + cuSeqlensData[cuSeqlensSize++] = static_cast(prevCuSeqlen + curSeqLength); + maxSeqLen = std::max(maxSeqLen, curSeqLength); + } + + int64_t const totalImageTokens = totalSeqLength / (mConfig.mergeSize * mConfig.mergeSize); + check::check(mOutputEmbedding.reshape({totalImageTokens, mConfig.outHiddenSize}), "Tensor reshape failed"); + check::check(mMaxSeqLenCarrier.reshape({maxSeqLen}), "Tensor reshape failed"); + + check::check(mCuSeqlens.reshape({cuSeqlensSize}), "Tensor reshape failed"); + CUDA_CHECK(cudaMemcpyAsync(mCuSeqlens.rawPointer(), mCuSeqlensHost.rawPointer(), + cuSeqlensSize * sizeof(int32_t), cudaMemcpyHostToDevice, stream)); + + check::check(mRotaryPosEmb.reshape({totalSeqLength, mConfig.vitPosEmbDim}), "Tensor reshape failed"); + for (size_t i = 0; i < imageGridTHWs.size(); ++i) + { + kernel::initRotaryPosEmbQwenViT( + mRotaryPosEmb, imageGridTHWs[i], mConfig.mergeSize, cuSeqlensData[i], 10000.0f, 1.0f, stream); + } + + if (mModelType == multimodal::ModelType::QWEN3_VL + || mModelType == multimodal::ModelType::QWEN3_OMNI_VISION_ENCODER) + { + check::check(mFastPosEmbIdx.reshape({4, totalSeqLength}), "Tensor reshape failed"); + check::check(mFastPosEmbWeight.reshape({4, totalSeqLength}), "Tensor reshape failed"); + for (size_t i = 0; i < imageGridTHWs.size(); ++i) + { + kernel::initFastPosEmbedQwenViT(mFastPosEmbIdx, mFastPosEmbWeight, imageGridTHWs[i], mConfig.mergeSize, + mConfig.numGridPerSide, cuSeqlensData[i], stream); + } + for (int64_t i = 0; i < mConfig.numDeepstackFeatures; ++i) + { + check::check( + mDeepstackFeatures[i].reshape({totalImageTokens, mConfig.outHiddenSize}), "Tensor reshape failed"); + } + } + mLastImageGridTHWs = imageGridTHWs; + + textPreprocess(request, batchedInputIds, numImagesPerRequest, imageTokenLengths, tokenizer); + generateMropeParams(batchedInputIds, imageGridTHWs, ropeRotaryCosSinDevice, stream); + } + catch (std::exception const& e) + { + 
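// Report validation or CUDA failures as a soft failure instead of propagating the exception to the caller.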
LOG_ERROR("QwenViTRunner::preprocessPreparedVisual() failed: %s", e.what()); + return false; + } + + return true; +} + bool QwenViTRunner::preprocessSystemPrompt(std::string const& systemPrompt, tokenizer::Tokenizer const* tokenizer, rt::Tensor& ropeRotaryCosSinDevice, cudaStream_t stream) { diff --git a/cpp/multimodal/qwenViTRunner.h b/cpp/multimodal/qwenViTRunner.h index 7e59ced..708389a 100644 --- a/cpp/multimodal/qwenViTRunner.h +++ b/cpp/multimodal/qwenViTRunner.h @@ -84,6 +84,11 @@ class QwenViTRunner : public MultimodalRunner bool preprocess(rt::LLMGenerationRequest const& request, std::vector>& batchedInputIds, tokenizer::Tokenizer const* tokenizer, rt::Tensor& ropeRotaryCosSinDevice, cudaStream_t stream) override; + bool preprocessPreparedVisual(rt::LLMGenerationRequest const& request, + std::vector>& batchedInputIds, tokenizer::Tokenizer const* tokenizer, + rt::Tensor& ropeRotaryCosSinDevice, rt::Tensor const& pixelValues, rt::Tensor const& imageGridTHW, + cudaStream_t stream) override; + //! \brief Encode the system prompt and generate ND-RoPE parameters for the system prompt for KVCache saving. //! \param[in] systemPrompt System prompt string //! \param[in] tokenizer Tokenizer for text processing diff --git a/cpp/runtime/alpamayoExpertRunner.cpp b/cpp/runtime/alpamayoExpertRunner.cpp new file mode 100644 index 0000000..4aebb51 --- /dev/null +++ b/cpp/runtime/alpamayoExpertRunner.cpp @@ -0,0 +1,222 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * AlpamayoExpertRunner: runs TRT Diffusion Expert inline, KV cache stays on GPU. 
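+ * The engine and device buffers are created once in the constructor; runDiffusion() then reshapes the VLM KV cache for the candidate batch, seeds Gaussian noise via cuRAND, and integrates the predicted velocity with fixed-step Euler updates.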
+ */ + +#include "alpamayoExpertRunner.h" + +#include "kernels/alpamayoExpertKernels/alpamayoExpertKernels.h" + +#include +#include + +namespace trt_edgellm +{ +namespace rt +{ + +AlpamayoExpertRunner::AlpamayoExpertRunner( + std::string const& enginePath, AlpamayoExpertConfig const& config, cudaStream_t stream) + : mConfig(config) +{ + loadEngine(enginePath, stream); + + auto maxShape = mEngine->getProfileShape("kv_cache", 0, nvinfer1::OptProfileSelector::kMAX); + int32_t maxKVSeqLen = maxShape.d[3]; + + allocateBuffers(maxKVSeqLen, stream); + + curandCreateGenerator(&mRNG, CURAND_RNG_PSEUDO_DEFAULT); + curandSetPseudoRandomGeneratorSeed(mRNG, mConfig.seed); + curandSetStream(mRNG, stream); + + LOG_INFO("AlpamayoExpertRunner: %d candidates, %d steps, maxKVSeq=%d, engine=%s", mConfig.numCandidates, + mConfig.numDiffusionSteps, maxKVSeqLen, enginePath.c_str()); +} + +AlpamayoExpertRunner::~AlpamayoExpertRunner() noexcept +{ + if (mRNG) + { + curandDestroyGenerator(mRNG); + } +} + +void AlpamayoExpertRunner::loadEngine(std::string const& path, cudaStream_t stream) +{ + mRuntime = std::unique_ptr(nvinfer1::createInferRuntime(gLogger)); + + std::ifstream file(path, std::ios::binary | std::ios::ate); + if (!file.good()) + { + throw std::runtime_error("Cannot open Expert engine: " + path); + } + size_t const fileSize = file.tellg(); + file.seekg(0, std::ios::beg); + std::vector buf(fileSize); + file.read(buf.data(), fileSize); + + mEngine = std::unique_ptr(mRuntime->deserializeCudaEngine(buf.data(), fileSize)); + if (!mEngine) + { + throw std::runtime_error("Failed to deserialize Expert engine"); + } + + int64_t const execBytes = mEngine->getDeviceMemorySizeV2(); + mExecContextMemory + = rt::Tensor({execBytes}, rt::DeviceType::kGPU, nvinfer1::DataType::kUINT8, "ExpertRunner::execMem"); + + mContext = std::unique_ptr( + mEngine->createExecutionContext(nvinfer1::ExecutionContextAllocationStrategy::kUSER_MANAGED)); + mContext->setDeviceMemoryV2(mExecContextMemory.rawPointer(), execBytes); + + LOG_INFO("Expert engine loaded: exec ctx %zu MB", execBytes / (1024 * 1024)); +} + +void AlpamayoExpertRunner::allocateBuffers(int32_t maxKVSeqLen, cudaStream_t stream) +{ + int32_t const B = mConfig.numCandidates; + int32_t const T = mConfig.numDiffusionTokens; + int32_t const A = mConfig.actionDim; + int64_t const L2 = mConfig.numLayers * 2; + int32_t const H = mConfig.numKVHeads; + int32_t const D = mConfig.headDim; + + mKVCacheReshaped = rt::Tensor({L2, B, H, static_cast(maxKVSeqLen), D}, rt::DeviceType::kGPU, + nvinfer1::DataType::kFLOAT, "ExpertRunner::kv"); + mNoisyAction = rt::Tensor({B, T, A}, rt::DeviceType::kGPU, nvinfer1::DataType::kFLOAT, "ExpertRunner::x"); + mTimestep = rt::Tensor({B, 1, 1}, rt::DeviceType::kGPU, nvinfer1::DataType::kFLOAT, "ExpertRunner::t"); + mPositionIds = rt::Tensor({3, static_cast(B), static_cast(T)}, rt::DeviceType::kGPU, + nvinfer1::DataType::kINT64, "ExpertRunner::posIds"); + mAttentionMask = rt::Tensor({B, 1, T, static_cast(maxKVSeqLen + T)}, rt::DeviceType::kGPU, + nvinfer1::DataType::kFLOAT, "ExpertRunner::mask"); + mPredVelocity = rt::Tensor({B, T, A}, rt::DeviceType::kGPU, nvinfer1::DataType::kFLOAT, "ExpertRunner::v"); +} + +int32_t AlpamayoExpertRunner::findTrajFutureStart( + std::vector const& outputIds, int32_t prefillSeqLen) const noexcept +{ + for (size_t i = 0; i < outputIds.size(); ++i) + { + if (outputIds[i] == mConfig.trajFutureStartTokenId) + { + return prefillSeqLen + static_cast(i) + 1; + } + } + return -1; +} + +void 
AlpamayoExpertRunner::runDiffusion( + rt::Tensor const& vlmKVCache, int32_t kvSeqLen, int32_t ropeDelta, rt::Tensor& outputActions, cudaStream_t stream) +{ + int32_t const B = mConfig.numCandidates; + int32_t const T = mConfig.numDiffusionTokens; + int32_t const A = mConfig.actionDim; + int32_t const L = mConfig.numLayers; + int32_t const H = mConfig.numKVHeads; + int32_t const D = mConfig.headDim; + int32_t const L2 = L * 2; + int32_t const nSteps = mConfig.numDiffusionSteps; + int32_t const actionElements = B * T * A; + int32_t const totalMaskLen = kvSeqLen + T; + + // 1. KV cache reshape + repeat: (L,2,H,S,D) fp16 → (L*2,B,H,S,D) fp32 + check::check(mKVCacheReshaped.reshape({L2, B, H, static_cast(kvSeqLen), D}), "KV reshape failed"); + kernel::kvCacheReshapeRepeat(mKVCacheReshaped.dataPointer(), + reinterpret_cast(vlmKVCache.rawPointer()), L, H, kvSeqLen, D, B, stream); + + // 2. Position IDs: basePos = kvSeqLen + ropeDelta + int64_t basePos = static_cast(kvSeqLen) + ropeDelta; + kernel::buildPositionIds(mPositionIds.dataPointer(), B, T, basePos, stream); + + // 3. Attention mask: all zeros (non-causal expert) + check::check(mAttentionMask.reshape({B, 1, T, static_cast(totalMaskLen)}), "Mask reshape failed"); + CUDA_CHECK(cudaMemsetAsync( + mAttentionMask.rawPointer(), 0, static_cast(B) * T * totalMaskLen * sizeof(float), stream)); + + // 4. Initial noise x₀ ~ N(0, I) + check::check(actionElements % 2 == 0, "curandGenerateNormal requires even element count"); + curandGenerateNormal(mRNG, mNoisyAction.dataPointer(), actionElements, 0.0f, 1.0f); + + // 5. Set TRT input shapes + nvinfer1::Dims dims; + + dims.nbDims = 3; + dims.d[0] = B; + dims.d[1] = T; + dims.d[2] = A; + mContext->setInputShape("noisy_action", dims); + + dims.d[0] = B; + dims.d[1] = 1; + dims.d[2] = 1; + mContext->setInputShape("timestep", dims); + + dims.d[0] = 3; + dims.d[1] = B; + dims.d[2] = T; + mContext->setInputShape("position_ids", dims); + + dims.nbDims = 4; + dims.d[0] = B; + dims.d[1] = 1; + dims.d[2] = T; + dims.d[3] = totalMaskLen; + mContext->setInputShape("attention_mask", dims); + + dims.nbDims = 5; + dims.d[0] = L2; + dims.d[1] = B; + dims.d[2] = H; + dims.d[3] = kvSeqLen; + dims.d[4] = D; + mContext->setInputShape("kv_cache", dims); + + mContext->setTensorAddress("noisy_action", mNoisyAction.rawPointer()); + mContext->setTensorAddress("timestep", mTimestep.rawPointer()); + mContext->setTensorAddress("position_ids", mPositionIds.rawPointer()); + mContext->setTensorAddress("attention_mask", mAttentionMask.rawPointer()); + mContext->setTensorAddress("kv_cache", mKVCacheReshaped.rawPointer()); + mContext->setTensorAddress("pred_velocity", mPredVelocity.rawPointer()); + + // 6. Flow matching: Euler integration (no CUDA Graph — kvSeqLen varies between requests) + float const dt = 1.0f / static_cast(nSteps); + + for (int32_t step = 0; step < nSteps; ++step) + { + float const tVal = static_cast(step) * dt; + kernel::fillTimestep(mTimestep.dataPointer(), B, tVal, stream); + + auto ok = mContext->enqueueV3(stream); + check::check(ok, "Expert enqueueV3 failed at diffusion step"); + + kernel::eulerUpdate( + mNoisyAction.dataPointer(), mPredVelocity.dataPointer(), dt, actionElements, stream); + } + + // 7. 
Copy result to output + CUDA_CHECK(cudaMemcpyAsync(outputActions.rawPointer(), mNoisyAction.rawPointer(), actionElements * sizeof(float), + cudaMemcpyDeviceToDevice, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + + LOG_INFO("Expert diffusion done: %d steps, %d candidates, kvSeq=%d", nSteps, B, kvSeqLen); +} + +} // namespace rt +} // namespace trt_edgellm diff --git a/cpp/runtime/alpamayoExpertRunner.h b/cpp/runtime/alpamayoExpertRunner.h new file mode 100644 index 0000000..abc250f --- /dev/null +++ b/cpp/runtime/alpamayoExpertRunner.h @@ -0,0 +1,114 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * AlpamayoExpertRunner: runs TRT Diffusion Expert inline after VLM decode. + * KV cache stays on GPU — no disk I/O, no Python. + */ + +#pragma once + +#include "common/checkMacros.h" +#include "common/cudaUtils.h" +#include "common/logger.h" +#include "common/tensor.h" + +#include +#include +#include +#include +#include + +namespace trt_edgellm +{ +namespace rt +{ + +struct AlpamayoExpertConfig +{ + int32_t numCandidates{6}; + int32_t numDiffusionSteps{10}; + int32_t numDiffusionTokens{64}; + int32_t actionDim{2}; + int32_t numLayers{36}; + int32_t numKVHeads{8}; + int32_t headDim{128}; + int32_t trajFutureStartTokenId{155681}; + int32_t ropeDelta{-2592}; + int64_t seed{42}; + bool multiSeq{false}; +}; + +class AlpamayoExpertRunner +{ +public: + AlpamayoExpertRunner(std::string const& enginePath, AlpamayoExpertConfig const& config, cudaStream_t stream); + ~AlpamayoExpertRunner() noexcept; + + /*! Run 10-step flow matching diffusion using VLM's KV cache. + * + * @param vlmKVCache Contiguous GPU tensor: (numLayers, 2, numKVHeads, seqLen, headDim) fp16 + * from kernel::saveKVCacheIntoTensor() + * @param kvSeqLen Sequence length of the truncated KV cache (after traj_future_start) + * @param ropeDelta RoPE delta (typically -2592 for 4-camera 4-frame setup) + * @param outputActions Output GPU tensor: (numCandidates, numDiffusionTokens, actionDim) fp32 + * @param stream CUDA stream + */ + void runDiffusion(rt::Tensor const& vlmKVCache, int32_t kvSeqLen, int32_t ropeDelta, rt::Tensor& outputActions, + cudaStream_t stream); + + /*! Find in decoded output IDs and return the truncation point. + * Returns prefillLen + idx + 1 where idx is the index in outputIds. + * Returns -1 if not found. 
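+ * Searches for AlpamayoExpertConfig::trajFutureStartTokenId (the traj_future_start marker).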
+ */ + int32_t findTrajFutureStart(std::vector const& outputIds, int32_t prefillSeqLen) const noexcept; + + AlpamayoExpertConfig const& getConfig() const noexcept + { + return mConfig; + } + nvinfer1::IExecutionContext* getContext() noexcept + { + return mContext.get(); + } + +private: + void loadEngine(std::string const& path, cudaStream_t stream); + void allocateBuffers(int32_t maxKVSeqLen, cudaStream_t stream); + + AlpamayoExpertConfig mConfig; + + // TRT engine + std::unique_ptr mRuntime; + std::unique_ptr mEngine; + std::unique_ptr mContext; + rt::Tensor mExecContextMemory; + + // Pre-allocated GPU buffers + rt::Tensor mKVCacheReshaped; // (72, B, 8, maxS, 128) float32 + rt::Tensor mNoisyAction; // (B, 64, 2) float32 + rt::Tensor mTimestep; // (B, 1, 1) float32 + rt::Tensor mPositionIds; // (3, B, 64) int64 + rt::Tensor mAttentionMask; // (B, 1, 64, maxS+64) float32 + rt::Tensor mPredVelocity; // (B, 64, 2) float32 + + // cuRAND + curandGenerator_t mRNG{nullptr}; +}; + +} // namespace rt +} // namespace trt_edgellm diff --git a/cpp/runtime/llmInferenceRuntime.cpp b/cpp/runtime/llmInferenceRuntime.cpp index cb2c091..dd887ba 100644 --- a/cpp/runtime/llmInferenceRuntime.cpp +++ b/cpp/runtime/llmInferenceRuntime.cpp @@ -27,6 +27,7 @@ #include "common/safetensorsUtils.h" #include "kernels/embeddingKernels/embeddingKernels.h" #include "kernels/kvCacheUtilKernels/kvCacheUtilsKernels.h" +#include "kernels/alpamayoExpertKernels/alpamayoExpertKernels.h" #include "multimodal/multimodalRunner.h" #include "profiling/metrics.h" #include "profiling/nvtx_wrapper.h" @@ -525,8 +526,10 @@ bool LLMInferenceRuntime::handleRequest( request.requests.begin(), request.requests.end(), [](auto const& req) { return !req.audioBuffers.empty(); }); bool hasVision = std::any_of( request.requests.begin(), request.requests.end(), [](auto const& req) { return !req.imageBuffers.empty(); }); + bool hasPreparedVision = std::any_of(request.requests.begin(), request.requests.end(), + [](auto const& req) { return !req.preparedVisualInputPath.empty(); }); - if ((hasAudio && mAudioRunner) || (hasVision && mVisionRunner)) + if ((hasAudio && mAudioRunner) || ((hasVision || hasPreparedVision) && mVisionRunner)) { // Mark multimodal preprocessing and inference for NVTX profiling NVTX_SCOPED_RANGE(nvtx_multimodal, "MULTIMODAL_PROCESSING", nvtx_colors::ORANGE); @@ -550,11 +553,48 @@ bool LLMInferenceRuntime::handleRequest( } // Process vision inputs (if present) - if (hasVision && mVisionRunner) + if ((hasVision || hasPreparedVision) && mVisionRunner) { LOG_INFO("Processing vision inputs"); - if (!mVisionRunner->preprocess( - request, batchedInputIds, mTokenizer.get(), mLLMEngineRunner->getRopeCosSinCacheTensor(), stream)) + bool visionStatus = false; + if (hasPreparedVision) + { + check::check(activeBatchSize == 1, "prepared_visual_input currently supports batch_size == 1 only"); + check::check(!hasVision, "prepared_visual_input cannot be mixed with raw image inputs in the same request"); + std::vector preparedTensors; + if (!safetensors::loadSafetensors(request.requests[0].preparedVisualInputPath, preparedTensors, stream)) + { + LOG_ERROR("Failed to load prepared visual input from: %s", + request.requests[0].preparedVisualInputPath.c_str()); + return false; + } + + rt::Tensor const* pixelValues = nullptr; + rt::Tensor const* imageGridTHW = nullptr; + for (auto const& tensor : preparedTensors) + { + if (tensor.getName() == "pixel_values") + { + pixelValues = &tensor; + } + else if (tensor.getName() == "image_grid_thw") + { + 
imageGridTHW = &tensor; + } + } + check::check(pixelValues != nullptr, "prepared_visual_input is missing tensor 'pixel_values'"); + check::check(imageGridTHW != nullptr, "prepared_visual_input is missing tensor 'image_grid_thw'"); + + visionStatus = mVisionRunner->preprocessPreparedVisual(request, batchedInputIds, mTokenizer.get(), + mLLMEngineRunner->getRopeCosSinCacheTensor(), *pixelValues, *imageGridTHW, stream); + } + else + { + visionStatus = mVisionRunner->preprocess( + request, batchedInputIds, mTokenizer.get(), mLLMEngineRunner->getRopeCosSinCacheTensor(), stream); + } + + if (!visionStatus) { LOG_ERROR("LLMInferenceRuntime(): Vision preprocessing failed. This request cannot be handled."); return false; @@ -634,6 +674,17 @@ bool LLMInferenceRuntime::handleRequest( { outputIds[i].push_back(hostSelectedTokenIdsData[i]); finishedStates[i] = hostSelectedTokenIdsData[i] == mTokenizer->getEosId(); + // Early stop AFTER when Expert runner is active. + // We need traj_future_start to be processed by the decoder (one more iteration) + // so its KV cache entry exists. Stop on the token AFTER it. + if (mExpertRunner && outputIds[i].size() >= 2) + { + int32_t prevToken = outputIds[i][outputIds[i].size() - 2]; + if (prevToken == mExpertRunner->getConfig().trajFutureStartTokenId) + { + finishedStates[i] = true; + } + } if (finishedStates[i]) { unFinishedBatchNum--; @@ -748,6 +799,35 @@ bool LLMInferenceRuntime::handleRequest( // Record prefill metrics mPrefillMetrics.recordRun(tokenCount.totalReusedTokens, tokenCount.totalComputedTokens); + bool const hasPrefillDumpRequest = std::any_of(request.requests.begin(), request.requests.end(), + [](auto const& req) { return !req.dumpPrefillKVCachePath.empty(); }); + bool const hasPrefillOnlyRequest + = std::any_of(request.requests.begin(), request.requests.end(), [](auto const& req) { return req.prefillOnly; }); + + if (hasPrefillDumpRequest || hasPrefillOnlyRequest) + { + check::check(activeBatchSize == 1, + "dump_prefill_kv_cache / prefill_only currently supports batch_size == 1 only"); + + if (!request.requests[0].dumpPrefillKVCachePath.empty()) + { + int32_t const sequenceLength = mHostContextLengths.dataPointer()[0]; + if (!dumpCurrentPrefillKVCache(request.requests[0].dumpPrefillKVCachePath, sequenceLength, stream)) + { + LOG_ERROR("LLMInferenceRuntime(): Failed to dump request prefill KV cache to %s", + request.requests[0].dumpPrefillKVCachePath.c_str()); + return false; + } + } + + if (request.requests[0].prefillOnly) + { + response.outputIds.assign(activeBatchSize, {}); + response.outputTexts.assign(activeBatchSize, ""); + return true; + } + } + // Reshape for decoding step check::check(mInputsEmbeds.reshape({activeBatchSize, 1, mEngineConfig.hiddenSize}), "Tensor reshape failed"); @@ -800,6 +880,336 @@ bool LLMInferenceRuntime::handleRequest( mGenerationMetrics.recordRun(totalGeneratedTokens); } + // ===== KV cache dump after decode (Path A) ===== + if (!mKVCacheDumpPath.empty()) + { + auto& linearKVCache = mLLMEngineRunner->getLinearKVCache(); + auto cacheConfig = linearKVCache.getConfig(); + auto kvCacheBuffer = linearKVCache.getKVCacheBuffer(); + + int32_t totalGenLen = static_cast(outputIds[0].size()); + int32_t totalSeqLen = prefillSequenceLength + totalGenLen - 1; + + rt::Coords dumpShape{cacheConfig.numAttentionLayers, 2, cacheConfig.numKVHeads, + static_cast(totalSeqLen), cacheConfig.headDim}; + rt::Tensor dumpTensor(dumpShape, rt::DeviceType::kGPU, + cacheConfig.kvCacheTypeTRT, "LLMInferenceRuntime::dumpKVCache"); + 
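// Gather batch index 0 of the linear KV cache into the contiguous [layers, 2, kv_heads, total_seq_len, head_dim] dump tensor before staging it to the host.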
kernel::saveKVCacheIntoTensor(dumpTensor, kvCacheBuffer, 0, stream); + CUDA_CHECK(cudaStreamSynchronize(stream)); + + int64_t numBytes = dumpTensor.getMemoryCapacity(); + std::vector hostBuf(numBytes); + CUDA_CHECK(cudaMemcpy(hostBuf.data(), dumpTensor.rawPointer(), numBytes, cudaMemcpyDeviceToHost)); + + std::string binPath = mKVCacheDumpPath + ".req" + std::to_string(mKVCacheDumpReqIdx) + ".bin"; + std::string metaPath = mKVCacheDumpPath + ".req" + std::to_string(mKVCacheDumpReqIdx) + ".shape.json"; + { + std::ofstream binOut(binPath, std::ios::binary); + binOut.write(reinterpret_cast(hostBuf.data()), numBytes); + } + { + std::ofstream metaOut(metaPath); + metaOut << "{\n"; + metaOut << " \"shape\": [" << cacheConfig.numAttentionLayers << ", 2, " + << cacheConfig.numKVHeads << ", " << totalSeqLen << ", " + << cacheConfig.headDim << "],\n"; + metaOut << " \"dtype\": " << static_cast(cacheConfig.kvCacheTypeTRT) << ",\n"; + metaOut << " \"prefill_seq_len\": " << prefillSequenceLength << ",\n"; + metaOut << " \"total_seq_len\": " << totalSeqLen << ",\n"; + metaOut << " \"num_generated\": " << (totalGenLen - 1) << ",\n"; + metaOut << " \"num_bytes\": " << numBytes << ",\n"; + metaOut << " \"req_idx\": " << mKVCacheDumpReqIdx << ",\n"; + std::vector hostInputIds(prefillSequenceLength); + CUDA_CHECK(cudaMemcpy(hostInputIds.data(), mInputIds.rawPointer(), + prefillSequenceLength * sizeof(int32_t), cudaMemcpyDeviceToHost)); + metaOut << " \"input_ids\": ["; + for (int32_t i = 0; i < prefillSequenceLength; ++i) { + if (i > 0) metaOut << ", "; + metaOut << hostInputIds[i]; + } + metaOut << "],\n"; + metaOut << " \"output_ids\": ["; + for (size_t i = 0; i < outputIds[0].size(); ++i) { + if (i > 0) metaOut << ", "; + metaOut << outputIds[0][i]; + } + metaOut << "]\n"; + metaOut << "}\n"; + } + LOG_INFO("Dumped KV cache (post-decode): %s (%ld bytes, total_seq_len=%d, generated=%d)", + binPath.c_str(), numBytes, totalSeqLen, totalGenLen - 1); + mKVCacheDumpReqIdx++; + } + // ===== end KV cache dump ===== + // ===== Run Expert diffusion with multi-sequence VLM decode ===== + if (mExpertRunner) + { + auto& linearKVCache = mLLMEngineRunner->getLinearKVCache(); + auto cacheConfig = linearKVCache.getConfig(); + auto const& expertCfg = mExpertRunner->getConfig(); + int32_t const numCandidates = expertCfg.numCandidates; + int32_t const L = cacheConfig.numAttentionLayers; + int32_t const H = cacheConfig.numKVHeads; + int32_t const D = cacheConfig.headDim; + + int32_t truncSeqLen = mExpertRunner->findTrajFutureStart(outputIds[0], prefillSequenceLength); + if (truncSeqLen < 0) + { + LOG_WARNING("ExpertRunner: not found, skipping expert"); + } + else if (!expertCfg.multiSeq) + { + // ── Single-sequence mode ── + LOG_INFO("ExpertRunner: single-seq, truncKV=%d", truncSeqLen); + auto kvBuf = linearKVCache.getKVCacheBuffer(); + rt::Tensor vlmKV({L, 2, H, static_cast(truncSeqLen), D}, + rt::DeviceType::kGPU, cacheConfig.kvCacheTypeTRT, "vlmKV"); + kernel::saveKVCacheIntoTensor(vlmKV, kvBuf, 0, stream); + + rt::Tensor actions({expertCfg.numCandidates, expertCfg.numDiffusionTokens, expertCfg.actionDim}, + rt::DeviceType::kGPU, nvinfer1::DataType::kFLOAT, "actions"); + mExpertRunner->runDiffusion(vlmKV, truncSeqLen, expertCfg.ropeDelta, actions, stream); + + std::vector hostAct(expertCfg.numCandidates * expertCfg.numDiffusionTokens * expertCfg.actionDim); + CUDA_CHECK(cudaMemcpy(hostAct.data(), actions.rawPointer(), + hostAct.size() * sizeof(float), cudaMemcpyDeviceToHost)); + float minV = 
*std::min_element(hostAct.begin(), hostAct.end()); + float maxV = *std::max_element(hostAct.begin(), hostAct.end()); + LOG_INFO("ExpertRunner: actions [%.4f, %.4f]", minV, maxV); + if (!mKVCacheDumpPath.empty()) + { + std::string p = mKVCacheDumpPath + ".req" + std::to_string(mKVCacheDumpReqIdx - 1) + ".actions.bin"; + std::ofstream o(p, std::ios::binary); + o.write(reinterpret_cast(hostAct.data()), hostAct.size() * sizeof(float)); + LOG_INFO("ExpertRunner: dumped %s", p.c_str()); + } + } + else + { + // ── Multi-sequence mode: decode N times from prefill checkpoint ── + LOG_INFO("ExpertRunner: multi-seq mode, %d candidates", numCandidates); + + // Candidate 0: already decoded, extract its KV + auto kvBuf0 = linearKVCache.getKVCacheBuffer(); + rt::Tensor cand0KV({L, 2, H, static_cast(truncSeqLen), D}, + rt::DeviceType::kGPU, cacheConfig.kvCacheTypeTRT, "cand0KV"); + kernel::saveKVCacheIntoTensor(cand0KV, kvBuf0, 0, stream); + CUDA_CHECK(cudaStreamSynchronize(stream)); + LOG_INFO("ExpertRunner: cand 0 done (truncLen=%d)", truncSeqLen); + + // Save the full KV buffer state (for restoration) + size_t const kvBufBytes = kvBuf0.getMemoryCapacity(); + rt::Tensor savedFullKV({static_cast(kvBufBytes)}, rt::DeviceType::kGPU, + nvinfer1::DataType::kUINT8, "savedFullKV"); + CUDA_CHECK(cudaMemcpyAsync(savedFullKV.rawPointer(), kvBuf0.rawPointer(), + kvBufBytes, cudaMemcpyDeviceToDevice, stream)); + + // Also save KV lengths + auto& kvLengths = linearKVCache.getKVCacheLengths(); + rt::Tensor savedKVLens({kvLengths.getShape()[0]}, rt::DeviceType::kGPU, + nvinfer1::DataType::kINT32, "savedKVLens"); + CUDA_CHECK(cudaMemcpyAsync(savedKVLens.rawPointer(), kvLengths.rawPointer(), + kvLengths.getMemoryCapacity(), cudaMemcpyDeviceToDevice, stream)); + + // Create prefill-only checkpoint: reset KV lengths to prefillSequenceLength + int32_t prefillLenHost = prefillSequenceLength; + rt::Tensor savedPrefillKV({static_cast(kvBufBytes)}, rt::DeviceType::kGPU, + nvinfer1::DataType::kUINT8, "savedPrefillKV"); + CUDA_CHECK(cudaMemcpyAsync(savedPrefillKV.rawPointer(), kvBuf0.rawPointer(), + kvBufBytes, cudaMemcpyDeviceToDevice, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + + // Collect all candidate KVs and truncation lengths + std::vector candKVs; + candKVs.push_back(std::move(cand0KV)); + std::vector candTruncLens; + candTruncLens.push_back(truncSeqLen); + + // Decode candidates 1..N-1 + for (int32_t ci = 1; ci < numCandidates; ++ci) + { + // Restore prefill KV state + CUDA_CHECK(cudaMemcpyAsync(linearKVCache.getKVCacheBuffer().rawPointer(), + savedPrefillKV.rawPointer(), kvBufBytes, cudaMemcpyDeviceToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(kvLengths.rawPointer(), &prefillLenHost, + sizeof(int32_t), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + + // Decode loop + std::vector candIds; + candIds.push_back(outputIds[0][0]); // first token same for all (from prefill) + bool candDone = false; + int32_t candIter = 1; + + check::check(mInputsEmbeds.reshape({1, 1, mEngineConfig.hiddenSize}), "reshape failed"); + + while (!candDone && candIter < maxGenerationLength) + { + int32_t lastTok = candIds.back(); + CUDA_CHECK(cudaMemcpyAsync(mSelectedIndices.rawPointer(), &lastTok, + sizeof(int32_t), cudaMemcpyHostToDevice, stream)); + kernel::embeddingLookup(mSelectedIndices, mEmbeddingTable, mInputsEmbeds, stream); + + bool ok = mLLMEngineRunner->executeVanillaDecodingStep( + mInputsEmbeds, mOutputLogits, rt::OptionalOutputTensor{std::nullopt}, stream); + if (!ok) break; + + 
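// Re-sample the next token with the request's temperature/top-k/top-p so stochastic sampling lets each candidate diverge after the shared prefill.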
SamplingParams params(1, mEngineConfig.outputVocabSize, + request.temperature, request.topK, request.topP); + trt_edgellm::topKtopPSamplingFromLogits( + mOutputLogits, mSelectedIndices, params, mSamplingWorkspace, stream); + if (mEngineConfig.reducedVocabSize > 0) + trt_edgellm::mapReducedVocabToFullVocab(mSelectedIndices, mVocabMappingTable, stream); + + int32_t sampled; + CUDA_CHECK(cudaMemcpyAsync(&sampled, mSelectedIndices.rawPointer(), + sizeof(int32_t), cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + candIds.push_back(sampled); + + if (candIds.size() >= 2) + { + int32_t prev = candIds[candIds.size() - 2]; + if (prev == expertCfg.trajFutureStartTokenId) + candDone = true; + } + if (sampled == mTokenizer->getEosId()) + candDone = true; + + ++candIter; + } + + int32_t candTrunc = mExpertRunner->findTrajFutureStart(candIds, prefillSequenceLength); + if (candTrunc < 0) candTrunc = truncSeqLen; + + auto candBuf = linearKVCache.getKVCacheBuffer(); + rt::Tensor candKV({L, 2, H, static_cast(candTrunc), D}, + rt::DeviceType::kGPU, cacheConfig.kvCacheTypeTRT, + ("cand" + std::to_string(ci) + "KV").c_str()); + kernel::saveKVCacheIntoTensor(candKV, candBuf, 0, stream); + CUDA_CHECK(cudaStreamSynchronize(stream)); + + candKVs.push_back(std::move(candKV)); + candTruncLens.push_back(candTrunc); + LOG_INFO("ExpertRunner: cand %d done (truncLen=%d, decoded %d)", ci, candTrunc, candIter); + } + + // Use minimum truncation length across candidates + int32_t minTrunc = *std::min_element(candTruncLens.begin(), candTruncLens.end()); + + // Stack KV caches: per-candidate (L,2,H,S,D) fp16 → combined (L*2,N,H,S,D) fp32 + int64_t const L2 = L * 2; + int32_t const T = expertCfg.numDiffusionTokens; + int32_t const A = expertCfg.actionDim; + + rt::Tensor stackedKV({L2, static_cast(numCandidates), H, + static_cast(minTrunc), D}, + rt::DeviceType::kGPU, nvinfer1::DataType::kFLOAT, "stackedKV"); + + int64_t const sliceElems = static_cast(H) * minTrunc * D; + for (int32_t c = 0; c < numCandidates; ++c) + { + // Reshape single candidate to (L2, 1, H, minTrunc, D) fp32 + rt::Tensor tmp({L2, 1, H, static_cast(minTrunc), D}, + rt::DeviceType::kGPU, nvinfer1::DataType::kFLOAT, "tmp"); + kernel::kvCacheReshapeRepeat(tmp.dataPointer(), + reinterpret_cast(candKVs[c].rawPointer()), + L, H, minTrunc, D, 1, stream); + + // Copy into stacked[:, c, :, :, :] + for (int64_t l = 0; l < L2; ++l) + { + float* dst = stackedKV.dataPointer() + l * numCandidates * sliceElems + c * sliceElems; + float const* src2 = tmp.dataPointer() + l * sliceElems; + CUDA_CHECK(cudaMemcpyAsync(dst, src2, sliceElems * sizeof(float), + cudaMemcpyDeviceToDevice, stream)); + } + } + CUDA_CHECK(cudaStreamSynchronize(stream)); + + // Build Expert inputs and run diffusion + int32_t const totalMaskLen = minTrunc + T; + int64_t basePos = static_cast(minTrunc) + expertCfg.ropeDelta; + + rt::Tensor posIds({3, static_cast(numCandidates), static_cast(T)}, + rt::DeviceType::kGPU, nvinfer1::DataType::kINT64, "posIds"); + kernel::buildPositionIds(posIds.dataPointer(), numCandidates, T, basePos, stream); + + rt::Tensor mask({numCandidates, 1, T, static_cast(totalMaskLen)}, + rt::DeviceType::kGPU, nvinfer1::DataType::kFLOAT, "mask"); + CUDA_CHECK(cudaMemsetAsync(mask.rawPointer(), 0, + static_cast(numCandidates) * T * totalMaskLen * sizeof(float), stream)); + + rt::Tensor noisyAct({numCandidates, T, A}, + rt::DeviceType::kGPU, nvinfer1::DataType::kFLOAT, "noisyAct"); + curandGenerator_t rng; + curandCreateGenerator(&rng, 
CURAND_RNG_PSEUDO_DEFAULT); + curandSetPseudoRandomGeneratorSeed(rng, expertCfg.seed); + curandSetStream(rng, stream); + check::check((numCandidates * T * A) % 2 == 0, "curandGenerateNormal requires even element count"); + curandGenerateNormal(rng, noisyAct.dataPointer(), numCandidates * T * A, 0.0f, 1.0f); + + rt::Tensor predVel({numCandidates, T, A}, + rt::DeviceType::kGPU, nvinfer1::DataType::kFLOAT, "predVel"); + rt::Tensor ts({numCandidates, 1, 1}, rt::DeviceType::kGPU, nvinfer1::DataType::kFLOAT, "ts"); + + auto* ctx = mExpertRunner->getContext(); + nvinfer1::Dims dims; + dims.nbDims = 3; dims.d[0] = numCandidates; dims.d[1] = T; dims.d[2] = A; + ctx->setInputShape("noisy_action", dims); + dims.d[0] = numCandidates; dims.d[1] = 1; dims.d[2] = 1; + ctx->setInputShape("timestep", dims); + dims.d[0] = 3; dims.d[1] = numCandidates; dims.d[2] = T; + ctx->setInputShape("position_ids", dims); + dims.nbDims = 4; dims.d[0] = numCandidates; dims.d[1] = 1; dims.d[2] = T; dims.d[3] = totalMaskLen; + ctx->setInputShape("attention_mask", dims); + dims.nbDims = 5; dims.d[0] = L2; dims.d[1] = numCandidates; dims.d[2] = H; + dims.d[3] = minTrunc; dims.d[4] = D; + ctx->setInputShape("kv_cache", dims); + + ctx->setTensorAddress("noisy_action", noisyAct.rawPointer()); + ctx->setTensorAddress("timestep", ts.rawPointer()); + ctx->setTensorAddress("position_ids", posIds.rawPointer()); + ctx->setTensorAddress("attention_mask", mask.rawPointer()); + ctx->setTensorAddress("kv_cache", stackedKV.rawPointer()); + ctx->setTensorAddress("pred_velocity", predVel.rawPointer()); + + int32_t const nSteps = expertCfg.numDiffusionSteps; + float const dt = 1.0f / static_cast(nSteps); + int32_t const actElems = numCandidates * T * A; + for (int32_t step = 0; step < nSteps; ++step) + { + kernel::fillTimestep(ts.dataPointer(), numCandidates, step * dt, stream); + auto ok = ctx->enqueueV3(stream); + check::check(ok, "Expert enqueueV3 failed in multi-seq diffusion step"); + kernel::eulerUpdate(noisyAct.dataPointer(), predVel.dataPointer(), dt, actElems, stream); + } + CUDA_CHECK(cudaStreamSynchronize(stream)); + curandDestroyGenerator(rng); + + std::vector hostAct(actElems); + CUDA_CHECK(cudaMemcpy(hostAct.data(), noisyAct.rawPointer(), actElems * sizeof(float), cudaMemcpyDeviceToHost)); + float minV = *std::min_element(hostAct.begin(), hostAct.end()); + float maxV = *std::max_element(hostAct.begin(), hostAct.end()); + LOG_INFO("ExpertRunner: multi-seq done, %d cands, actions [%.4f, %.4f]", numCandidates, minV, maxV); + + if (!mKVCacheDumpPath.empty()) + { + std::string p = mKVCacheDumpPath + ".req" + std::to_string(mKVCacheDumpReqIdx - 1) + ".actions.bin"; + std::ofstream o(p, std::ios::binary); + o.write(reinterpret_cast(hostAct.data()), hostAct.size() * sizeof(float)); + LOG_INFO("ExpertRunner: dumped %s", p.c_str()); + } + + // Restore original KV state + CUDA_CHECK(cudaMemcpyAsync(linearKVCache.getKVCacheBuffer().rawPointer(), + savedFullKV.rawPointer(), kvBufBytes, cudaMemcpyDeviceToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(kvLengths.rawPointer(), savedKVLens.rawPointer(), + kvLengths.getMemoryCapacity(), cudaMemcpyDeviceToDevice, stream)); + } + } + // ===== end Expert diffusion ===== + // Clean the response field and fill the generated outputIds and decoded texts. 
response.outputIds.clear(); response.outputTexts.clear(); @@ -1042,5 +1452,41 @@ bool LLMInferenceRuntime::genAndSaveSystemPromptKVCache( return true; } +bool LLMInferenceRuntime::dumpCurrentPrefillKVCache( + std::string const& outputPath, int32_t sequenceLength, cudaStream_t stream) +{ + check::check(sequenceLength > 0, "sequenceLength must be positive for prefill KV cache dump"); + + auto& linearKVCache = mLLMEngineRunner->getLinearKVCache(); + auto const cacheConfig = linearKVCache.getConfig(); + auto kvCacheBuffer = linearKVCache.getKVCacheBuffer(); + + rt::Tensor kvCacheContent({cacheConfig.numAttentionLayers, 2, cacheConfig.numKVHeads, sequenceLength, + cacheConfig.headDim}, + rt::DeviceType::kGPU, cacheConfig.kvCacheTypeTRT, "kv_cache"); + kernel::saveKVCacheIntoTensor(kvCacheContent, kvCacheBuffer, /*batchIdx=*/0, stream); + + rt::Tensor sequenceLengthTensor({1}, rt::DeviceType::kCPU, nvinfer1::DataType::kINT32, "sequence_length"); + sequenceLengthTensor.dataPointer()[0] = sequenceLength; + + rt::Tensor inputIdsTensor({sequenceLength}, rt::DeviceType::kCPU, nvinfer1::DataType::kINT32, "input_ids"); + CUDA_CHECK(cudaMemcpyAsync(inputIdsTensor.rawPointer(), mInputIds.rawPointer(), sequenceLength * sizeof(int32_t), + cudaMemcpyDeviceToHost, stream)); + + std::vector tensors; + tensors.push_back(std::move(kvCacheContent)); + tensors.push_back(std::move(sequenceLengthTensor)); + tensors.push_back(std::move(inputIdsTensor)); + + if (!safetensors::saveSafetensors(outputPath, tensors, stream)) + { + LOG_ERROR("LLMInferenceRuntime(): Failed to save safetensors to %s", outputPath.c_str()); + return false; + } + CUDA_CHECK(cudaStreamSynchronize(stream)); + LOG_INFO("LLMInferenceRuntime(): Prefill KV cache dumped to %s", outputPath.c_str()); + return true; +} + } // namespace rt } // namespace trt_edgellm diff --git a/cpp/runtime/llmInferenceRuntime.h b/cpp/runtime/llmInferenceRuntime.h index dc790c6..b2f0272 100644 --- a/cpp/runtime/llmInferenceRuntime.h +++ b/cpp/runtime/llmInferenceRuntime.h @@ -18,9 +18,11 @@ #pragma once #include "common/hashUtils.h" +#include "common/safetensorsUtils.h" #include "multimodal/multimodalRunner.h" #include "profiling/metrics.h" #include "profiling/timer.h" +#include "runtime/alpamayoExpertRunner.h" #include "runtime/llmEngineRunner.h" #include "runtime/llmRuntimeUtils.h" #include "tokenizer/tokenizer.h" @@ -117,6 +119,20 @@ class LLMInferenceRuntime : metrics::MultimodalMetrics{}; } + /*! \brief Enable dumping the KV cache to disk after each prefill+decode. + * When non-empty, every successful request in handleRequest() writes the + * KV cache (and shape metadata) to .req.bin / .req.shape.json. + */ + void setKVCacheDumpPath(std::string const& path) noexcept { mKVCacheDumpPath = path; } + + /*! \brief Initialize the Expert runner for inline diffusion after VLM decode. + * When set, handleRequest() runs Expert diffusion instead of (or in addition to) dumping KV cache. + */ + void initExpertRunner(std::string const& expertEnginePath, AlpamayoExpertConfig const& config, cudaStream_t stream) + { + mExpertRunner = std::make_unique(expertEnginePath, config, stream); + } + private: /*! \brief Helper structure to hold token counting results */ @@ -176,6 +192,17 @@ class LLMInferenceRuntime //! \throws std::runtime_error if system prompt is malformed, or a CUDA operation fails bool setUpForPrefillExecution(std::vector> const& batchedInputIds, std::vector const& systemPrompts, std::string const& loraWeightsName, cudaStream_t stream); + + //! 
Dump the current prefill KV cache for a single-batch request. + //! Saves a safetensors file containing: + //! - kv_cache: [num_layers, 2, num_kv_heads, seq_len, head_dim] + //! - sequence_length: [1] int32 + //! - input_ids: [seq_len] int32 + bool dumpCurrentPrefillKVCache(std::string const& outputPath, int32_t sequenceLength, cudaStream_t stream); + + std::unique_ptr mExpertRunner{nullptr}; + std::string mKVCacheDumpPath{}; //!< Optional path prefix to dump KV cache after decode (Path A) + int64_t mKVCacheDumpReqIdx{0}; //!< Counter for dump file naming }; } // namespace rt } // namespace trt_edgellm diff --git a/cpp/runtime/llmRuntimeUtils.h b/cpp/runtime/llmRuntimeUtils.h index 3ea037b..0cde4d7 100644 --- a/cpp/runtime/llmRuntimeUtils.h +++ b/cpp/runtime/llmRuntimeUtils.h @@ -69,6 +69,9 @@ struct LLMGenerationRequest std::vector messages; //!< Structured messages (required - use chat template format) std::vector imageBuffers; //!< Optional image data for multimodal inputs std::vector audioBuffers; //!< Optional audio data for multimodal inputs (Qwen3-Omni) + std::string preparedVisualInputPath; //!< Optional path to preprocessed visual inputs (pixel_values + image_grid_thw) + std::string dumpPrefillKVCachePath; //!< Optional output path to dump prefill KV cache as safetensors + bool prefillOnly{false}; //!< If true, stop after prefill (useful with dumpPrefillKVCachePath) mutable FormattedRequest formatted; //!< Formatted request (populated by tokenizer or user-provided) }; diff --git a/examples/llm/llm_inference.cpp b/examples/llm/llm_inference.cpp index 5631aac..b025f46 100644 --- a/examples/llm/llm_inference.cpp +++ b/examples/llm/llm_inference.cpp @@ -60,7 +60,12 @@ enum LLMInferenceOptionId : int EAGLE_DRAFT_STEP = 912, EAGLE_VERIFY_TREE_SIZE = 913, BATCH_SIZE = 914, - MAX_GENERATE_LENGTH = 915 + MAX_GENERATE_LENGTH = 915, + DUMP_KV_CACHE = 916, + EXPERT_ENGINE = 917, + NUM_CANDIDATES = 918, + MULTI_SEQ = 919, + DIFFUSION_STEPS = 920 }; // Struct to hold Eagle-specific arguments for speculative decoding @@ -98,6 +103,11 @@ struct LLMInferenceArgs // For other sampling parameters (temperature, top_p, top_k), please specify them in the input JSON file int32_t batchSize{-1}; // -1 means use value from input file int64_t maxGenerateLength{-1}; // -1 means use value from input file + std::string dumpKVCache{""}; + std::string expertEngine{""}; // Path to Expert TRT engine for diffusion (empty = disabled) + int32_t numCandidates{6}; + bool multiSeq{false}; + int32_t numDiffusionSteps{10}; EagleArgs eagleArgs; }; @@ -152,7 +162,12 @@ bool parseLLMInferenceArgs(LLMInferenceArgs& args, int argc, char* argv[]) {"eagleDraftStep", required_argument, 0, LLMInferenceOptionId::EAGLE_DRAFT_STEP}, {"eagleVerifyTreeSize", required_argument, 0, LLMInferenceOptionId::EAGLE_VERIFY_TREE_SIZE}, {"batchSize", required_argument, 0, LLMInferenceOptionId::BATCH_SIZE}, - {"maxGenerateLength", required_argument, 0, LLMInferenceOptionId::MAX_GENERATE_LENGTH}, {0, 0, 0, 0}}; + {"maxGenerateLength", required_argument, 0, LLMInferenceOptionId::MAX_GENERATE_LENGTH}, + {"dumpKVCache", required_argument, 0, LLMInferenceOptionId::DUMP_KV_CACHE}, + {"expertEngine", required_argument, 0, LLMInferenceOptionId::EXPERT_ENGINE}, + {"numCandidates", required_argument, 0, LLMInferenceOptionId::NUM_CANDIDATES}, + {"multiSeq", no_argument, 0, LLMInferenceOptionId::MULTI_SEQ}, + {"numDiffusionSteps", required_argument, 0, LLMInferenceOptionId::DIFFUSION_STEPS}, {0, 0, 0, 0}}; int opt; while ((opt = getopt_long(argc, argv, "", 
inferenceOptions, nullptr)) != -1) @@ -265,6 +280,11 @@ bool parseLLMInferenceArgs(LLMInferenceArgs& args, int argc, char* argv[]) return false; } break; + case LLMInferenceOptionId::DUMP_KV_CACHE: args.dumpKVCache = optarg; break; + case LLMInferenceOptionId::EXPERT_ENGINE: args.expertEngine = optarg; break; + case LLMInferenceOptionId::NUM_CANDIDATES: args.numCandidates = std::stoi(optarg); break; + case LLMInferenceOptionId::MULTI_SEQ: args.multiSeq = true; break; + case LLMInferenceOptionId::DIFFUSION_STEPS: args.numDiffusionSteps = std::stoi(optarg); break; default: return false; } } @@ -447,6 +467,9 @@ std::pair, std::vector(); @@ -456,6 +479,14 @@ std::pair, std::vector(); + } + if (requestItem.contains("dump_prefill_kv_cache") && !requestItem["dump_prefill_kv_cache"].is_null()) + { + dumpPrefillKVCachePath = requestItem["dump_prefill_kv_cache"].get(); + } // Validate that all requests in this batch use the same LoRA weights if (firstInBatch) @@ -546,11 +577,14 @@ std::pair, std::vector(); - // TODO: Need to consider multi-turn conversation, and whether to load all images. - auto image = rt::imageUtils::loadImageFromFile(msgContent.content); - if (image.buffer != nullptr) + if (preparedVisualInputPath.empty()) { - imageBuffers.push_back(std::move(image)); + // TODO: Need to consider multi-turn conversation, and whether to load all images. + auto image = rt::imageUtils::loadImageFromFile(msgContent.content); + if (image.buffer != nullptr) + { + imageBuffers.push_back(std::move(image)); + } } } else if (msgContent.type == "audio") @@ -609,6 +643,9 @@ std::pair, std::vectorinitExpertRunner(args.expertEngine, expertCfg, stream); + LOG_INFO("Expert runner initialized: %s", args.expertEngine.c_str()); + } + if (!args.dumpKVCache.empty()) + { + llmInferenceRuntime->setKVCacheDumpPath(args.dumpKVCache); + LOG_INFO("KV cache dump enabled, prefix: %s", args.dumpKVCache.c_str()); + } } // Perform warmup runs if requested @@ -817,6 +868,7 @@ int main(int argc, char* argv[]) // Validate UTF-8 for output text (inputs are always valid) // If invalid UTF-8 detected, error message is returned and original text is logged responseJson["output_text"] = sanitizeUtf8ForJson(outputText); + responseJson["output_ids"] = requestStatus ? nlohmann::json(response.outputIds[batchIdx]) : nlohmann::json::array(); responseJson["request_idx"] = requestIdx; responseJson["batch_idx"] = batchIdx; // Store messages for reference
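Reviewer note: a minimal wiring sketch (not part of the diff) of how the new runtime hooks are meant to be combined; `examples/llm/llm_inference.cpp` above does the equivalent from the new CLI flags. The engine path, dump prefix, and literal values below are placeholders.

```cpp
// Hypothetical setup mirroring the flag handling added in examples/llm/llm_inference.cpp.
trt_edgellm::rt::AlpamayoExpertConfig expertCfg;
expertCfg.numCandidates = 6;       // --numCandidates
expertCfg.numDiffusionSteps = 10;  // --numDiffusionSteps
expertCfg.multiSeq = false;        // --multiSeq selects the multi-sequence decode path

// Optional: dump the post-decode KV cache as <prefix>.reqN.bin / <prefix>.reqN.shape.json.
llmInferenceRuntime->setKVCacheDumpPath("/tmp/kv_dump");

// Enable inline Expert diffusion after the VLM decode (engine path is a placeholder).
llmInferenceRuntime->initExpertRunner("/models/alpamayo_expert.engine", expertCfg, stream);
```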