diff --git a/CMakeLists.txt b/CMakeLists.txt index 9233898..9a1bb3c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -64,7 +64,7 @@ set_ifndef(CUDA_DIR /usr/local/cuda-${CUDA_CTK_VERSION}) if(NOT DEFINED AARCH64_BUILD) set(CMAKE_CUDA_ARCHITECTURES 80;86;89) if(CUDA_CTK_VERSION VERSION_GREATER_EQUAL 12.8) - list(APPEND CMAKE_CUDA_ARCHITECTURES 100 120) + list(APPEND CMAKE_CUDA_ARCHITECTURES 100 110 120) endif() endif() diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index eab3baf..451c1b5 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -98,7 +98,7 @@ add_library( ${RUNTIME_CPP_SRCS} ${PROFILER_CPP_SRCS}) target_include_directories(edgellmCore PRIVATE ${COMMON_INCLUDE_DIRS}) -target_link_libraries(edgellmCore PRIVATE dl) +target_link_libraries(edgellmCore PRIVATE dl curand) # Apply FMHA SM exclusion definitions target_compile_definitions(edgellmCore PRIVATE ${FMHA_EXCLUDE_DEFINITIONS}) # Enable separable compilation for MoE Marlin kernel templates diff --git a/cpp/common/inputLimits.h b/cpp/common/inputLimits.h index 5b72c60..c245228 100644 --- a/cpp/common/inputLimits.h +++ b/cpp/common/inputLimits.h @@ -42,7 +42,7 @@ constexpr int kReasonableMaxBatchSize = 16; // Validation limits for message parsing. constexpr size_t kMaxMessageContentSizeBytes = 128 * 1024; // 128KB per content item constexpr size_t kMaxMessagesPerRequest = 64; -constexpr size_t kMaxContentItemsPerMessage = 16; +constexpr size_t kMaxContentItemsPerMessage = 64; // Alpamayo uses 4 cameras x 4 frames = 16 images + text items } // namespace security diff --git a/cpp/kernels/alpamayoExpertKernels/alpamayoExpertKernels.cu b/cpp/kernels/alpamayoExpertKernels/alpamayoExpertKernels.cu new file mode 100644 index 0000000..1e3bc63 --- /dev/null +++ b/cpp/kernels/alpamayoExpertKernels/alpamayoExpertKernels.cu @@ -0,0 +1,118 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * CUDA kernels for AlpamayoExpertRunner. 
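+ * Covers the device-side helpers used by the diffusion expert: KV-cache reshape/broadcast to the candidate batch, position-id construction, timestep fill, and the Euler update used for flow-matching integration.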
+ */ + +#include "alpamayoExpertKernels.h" + +namespace trt_edgellm +{ +namespace kernel +{ + +namespace +{ + +__global__ void kvCacheReshapeRepeatKernel(float* __restrict__ dst, half const* __restrict__ src, int32_t numLayers, + int32_t numKVHeads, int32_t seqLen, int32_t headDim, int32_t numCandidates, int64_t totalElements) +{ + int64_t const idx = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + if (idx >= totalElements) + return; + + // Decompose: dst[layer2, cand, head, s, d] + int64_t rem = idx; + int32_t const d = rem % headDim; + rem /= headDim; + int32_t const s = rem % seqLen; + rem /= seqLen; + int32_t const head = rem % numKVHeads; + rem /= numKVHeads; + rem /= numCandidates; // skip cand dimension (broadcast) + int32_t const layer2 = rem; + + int32_t const layer = layer2 / 2; + int32_t const kv = layer2 % 2; + + // src layout: [layer, kv, head, s, d] contiguous + int64_t const hsd = static_cast(numKVHeads) * seqLen * headDim; + int64_t const srcIdx = (static_cast(layer) * 2 + kv) * hsd + static_cast(head) * seqLen * headDim + + static_cast(s) * headDim + d; + + dst[idx] = __half2float(src[srcIdx]); +} + +__global__ void buildPositionIdsKernel(int64_t* __restrict__ posIds, int32_t total, int32_t numTokens, int64_t basePos) +{ + int32_t const idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= total) + return; + posIds[idx] = basePos + (idx % numTokens); +} + +__global__ void fillTimestepKernel(float* __restrict__ dst, int32_t n, float val) +{ + int32_t const idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < n) + dst[idx] = val; +} + +__global__ void eulerUpdateKernel(float* __restrict__ x, float const* __restrict__ v, float dt, int32_t n) +{ + int32_t const idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < n) + x[idx] += v[idx] * dt; +} + +} // anonymous namespace + +void kvCacheReshapeRepeat(float* dst, half const* src, int32_t numLayers, int32_t numKVHeads, int32_t seqLen, + int32_t headDim, int32_t numCandidates, cudaStream_t stream) +{ + int64_t const totalElements = static_cast(numLayers) * 2 * numCandidates * numKVHeads * seqLen * headDim; + int32_t const blockSize = 256; + int32_t const numBlocks = static_cast((totalElements + blockSize - 1) / blockSize); + kvCacheReshapeRepeatKernel<<>>( + dst, src, numLayers, numKVHeads, seqLen, headDim, numCandidates, totalElements); +} + +void buildPositionIds(int64_t* posIds, int32_t numCandidates, int32_t numTokens, int64_t basePos, cudaStream_t stream) +{ + int32_t const total = 3 * numCandidates * numTokens; + int32_t const blockSize = 256; + int32_t const numBlocks = (total + blockSize - 1) / blockSize; + buildPositionIdsKernel<<>>(posIds, total, numTokens, basePos); +} + +void fillTimestep(float* dst, int32_t numCandidates, float tVal, cudaStream_t stream) +{ + int32_t const blockSize = 256; + int32_t const numBlocks = (numCandidates + blockSize - 1) / blockSize; + fillTimestepKernel<<>>(dst, numCandidates, tVal); +} + +void eulerUpdate(float* x, float const* v, float dt, int32_t n, cudaStream_t stream) +{ + int32_t const blockSize = 256; + int32_t const numBlocks = (n + blockSize - 1) / blockSize; + eulerUpdateKernel<<>>(x, v, dt, n); +} + +} // namespace kernel +} // namespace trt_edgellm diff --git a/cpp/kernels/alpamayoExpertKernels/alpamayoExpertKernels.h b/cpp/kernels/alpamayoExpertKernels/alpamayoExpertKernels.h new file mode 100644 index 0000000..53e5995 --- /dev/null +++ b/cpp/kernels/alpamayoExpertKernels/alpamayoExpertKernels.h @@ -0,0 +1,43 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 
2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * CUDA kernels for AlpamayoExpertRunner. + */ + +#pragma once + +#include +#include +#include + +namespace trt_edgellm +{ +namespace kernel +{ + +void kvCacheReshapeRepeat(float* dst, half const* src, int32_t numLayers, int32_t numKVHeads, int32_t seqLen, + int32_t headDim, int32_t numCandidates, cudaStream_t stream); + +void buildPositionIds(int64_t* posIds, int32_t numCandidates, int32_t numTokens, int64_t basePos, cudaStream_t stream); + +void fillTimestep(float* dst, int32_t numCandidates, float tVal, cudaStream_t stream); + +void eulerUpdate(float* x, float const* v, float dt, int32_t n, cudaStream_t stream); + +} // namespace kernel +} // namespace trt_edgellm diff --git a/cpp/multimodal/multimodalRunner.cpp b/cpp/multimodal/multimodalRunner.cpp index 8bf9efa..2e699fe 100644 --- a/cpp/multimodal/multimodalRunner.cpp +++ b/cpp/multimodal/multimodalRunner.cpp @@ -133,5 +133,15 @@ bool MultimodalRunner::preprocessSystemPrompt([[maybe_unused]] std::string const return true; } +bool MultimodalRunner::preprocessPreparedVisual([[maybe_unused]] rt::LLMGenerationRequest const& request, + [[maybe_unused]] std::vector>& batchedInputIds, + [[maybe_unused]] tokenizer::Tokenizer const* tokenizer, [[maybe_unused]] rt::Tensor& ropeRotaryCosSinDevice, + [[maybe_unused]] rt::Tensor const& pixelValues, [[maybe_unused]] rt::Tensor const& imageGridTHW, + [[maybe_unused]] cudaStream_t stream) +{ + LOG_ERROR("preprocessPreparedVisual is not implemented for this multimodal runner"); + return false; +} + } // namespace rt } // namespace trt_edgellm diff --git a/cpp/multimodal/multimodalRunner.h b/cpp/multimodal/multimodalRunner.h index 111906a..bff81a5 100644 --- a/cpp/multimodal/multimodalRunner.h +++ b/cpp/multimodal/multimodalRunner.h @@ -90,6 +90,26 @@ class MultimodalRunner tokenizer::Tokenizer const* tokenizer, [[maybe_unused]] rt::Tensor& ropeRotaryCosSinDevice, cudaStream_t stream) = 0; + /*! + * @brief Preprocess using already prepared visual inputs from an external pipeline. + * + * This bypasses image decoding / resize / patchification in TRT runtime and instead + * consumes tensors equivalent to processor outputs such as `pixel_values` and `image_grid_thw`. 
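+ * Expected layouts (as validated by the Qwen ViT implementation): `pixel_values` is a 2D FP16 tensor [num_patches, input_dim]; `image_grid_thw` is an int64 tensor [num_images, 3] holding (t, h, w) per image.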
+ * + * @param request Generation request with text/messages + * @param batchedInputIds Output batched input token IDs + * @param tokenizer Tokenizer instance + * @param ropeRotaryCosSinDevice RoPE cache tensor + * @param pixelValues Preprocessed visual input tensor [num_patches, input_dim] + * @param imageGridTHW Image grid tensor [num_images, 3] + * @param stream CUDA stream + * @return True on success, false on failure + */ + virtual bool preprocessPreparedVisual(rt::LLMGenerationRequest const& request, + std::vector>& batchedInputIds, tokenizer::Tokenizer const* tokenizer, + [[maybe_unused]] rt::Tensor& ropeRotaryCosSinDevice, [[maybe_unused]] rt::Tensor const& pixelValues, + [[maybe_unused]] rt::Tensor const& imageGridTHW, cudaStream_t stream); + /*! * @brief Used for KVCache saving where we need to conduct the tokenization of the system prompt and generate * ND-Rope parameters for the system prompt. diff --git a/cpp/multimodal/qwenViTRunner.cpp b/cpp/multimodal/qwenViTRunner.cpp index 261813d..0a3b4f3 100644 --- a/cpp/multimodal/qwenViTRunner.cpp +++ b/cpp/multimodal/qwenViTRunner.cpp @@ -768,6 +768,136 @@ bool QwenViTRunner::preprocess(rt::LLMGenerationRequest const& request, return true; } +bool QwenViTRunner::preprocessPreparedVisual(rt::LLMGenerationRequest const& request, + std::vector>& batchedInputIds, tokenizer::Tokenizer const* tokenizer, + rt::Tensor& ropeRotaryCosSinDevice, rt::Tensor const& pixelValues, rt::Tensor const& imageGridTHW, + cudaStream_t stream) +{ + try + { + check::check(pixelValues.getShape().getNumDims() == 2, "pixelValues must be 2D [num_patches, input_dim]"); + check::check(pixelValues.getDataType() == nvinfer1::DataType::kHALF, "pixelValues must be FP16"); + check::check( + imageGridTHW.getShape().getNumDims() == 2 && imageGridTHW.getShape()[1] == 3, "imageGridTHW must be [N, 3]"); + + int64_t const numImages = imageGridTHW.getShape()[0]; + std::vector imageGridTHWHostVec(numImages * 3); + CUDA_CHECK(cudaMemcpyAsync(imageGridTHWHostVec.data(), imageGridTHW.rawPointer(), + sizeof(int64_t) * numImages * 3, cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + + std::vector> imageGridTHWs; + imageGridTHWs.reserve(numImages); + std::vector imageTokenLengths; + imageTokenLengths.reserve(numImages); + std::vector numImagesPerRequest; + numImagesPerRequest.reserve(request.requests.size()); + + for (int64_t i = 0; i < numImages; ++i) + { + int64_t t = imageGridTHWHostVec[i * 3 + 0]; + int64_t h = imageGridTHWHostVec[i * 3 + 1]; + int64_t w = imageGridTHWHostVec[i * 3 + 2]; + imageGridTHWs.push_back({t, h, w}); + imageTokenLengths.push_back(t * h * w / (mConfig.mergeSize * mConfig.mergeSize)); + } + + // Count image items per request from message contents. + // For prepared_visual_input, the request may intentionally omit raw image content and + // provide a text-only prompt plus externally prepared pixel_values/image_grid_thw. + // In that case, infer the per-request image count from imageGridTHW for the single + // request we currently support. 
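+ // (For example, the Alpamayo 4-camera x 4-frame setup yields numImages == 16 alongside a text-only prompt.)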
+ int64_t totalImageCount = 0; + for (auto const& req : request.requests) + { + int64_t requestImageCount = 0; + for (auto const& msg : req.messages) + { + for (auto const& content : msg.contents) + { + if (content.type == "image") + { + requestImageCount++; + } + } + } + numImagesPerRequest.push_back(requestImageCount); + totalImageCount += requestImageCount; + } + if (totalImageCount == 0) + { + check::check(request.requests.size() == 1, + "prepared_visual_input without raw image messages currently supports a single request only"); + numImagesPerRequest[0] = numImages; + totalImageCount = numImages; + } + check::check(totalImageCount == numImages, "imageGridTHW image count does not match request image count"); + + // Populate mVitInput directly from the prepared processor output. + int64_t const totalSeqLength = pixelValues.getShape()[0]; + check::check(pixelValues.getShape()[1] == mConfig.inputDim, "pixelValues input dim mismatch"); + check::check(mVitInput.reshape({totalSeqLength, mConfig.inputDim}), "Tensor reshape failed"); + CUDA_CHECK(cudaMemcpyAsync(mVitInput.rawPointer(), pixelValues.rawPointer(), + totalSeqLength * mConfig.inputDim * sizeof(half), cudaMemcpyDeviceToDevice, stream)); + + // Build cu_seqlens and auxiliary tensors exactly like imagePreprocess tail. + int32_t* cuSeqlensData = mCuSeqlensHost.dataPointer(); + cuSeqlensData[0] = 0; + int64_t cuSeqlensSize = 1; + int64_t maxSeqLen = 0; + for (auto const& grid : imageGridTHWs) + { + int64_t curSeqLength = grid[0] * grid[1] * grid[2]; + int32_t prevCuSeqlen = cuSeqlensData[cuSeqlensSize - 1]; + cuSeqlensData[cuSeqlensSize++] = static_cast(prevCuSeqlen + curSeqLength); + maxSeqLen = std::max(maxSeqLen, curSeqLength); + } + + int64_t const totalImageTokens = totalSeqLength / (mConfig.mergeSize * mConfig.mergeSize); + check::check(mOutputEmbedding.reshape({totalImageTokens, mConfig.outHiddenSize}), "Tensor reshape failed"); + check::check(mMaxSeqLenCarrier.reshape({maxSeqLen}), "Tensor reshape failed"); + + check::check(mCuSeqlens.reshape({cuSeqlensSize}), "Tensor reshape failed"); + CUDA_CHECK(cudaMemcpyAsync(mCuSeqlens.rawPointer(), mCuSeqlensHost.rawPointer(), + cuSeqlensSize * sizeof(int32_t), cudaMemcpyHostToDevice, stream)); + + check::check(mRotaryPosEmb.reshape({totalSeqLength, mConfig.vitPosEmbDim}), "Tensor reshape failed"); + for (size_t i = 0; i < imageGridTHWs.size(); ++i) + { + kernel::initRotaryPosEmbQwenViT( + mRotaryPosEmb, imageGridTHWs[i], mConfig.mergeSize, cuSeqlensData[i], 10000.0f, 1.0f, stream); + } + + if (mModelType == multimodal::ModelType::QWEN3_VL + || mModelType == multimodal::ModelType::QWEN3_OMNI_VISION_ENCODER) + { + check::check(mFastPosEmbIdx.reshape({4, totalSeqLength}), "Tensor reshape failed"); + check::check(mFastPosEmbWeight.reshape({4, totalSeqLength}), "Tensor reshape failed"); + for (size_t i = 0; i < imageGridTHWs.size(); ++i) + { + kernel::initFastPosEmbedQwenViT(mFastPosEmbIdx, mFastPosEmbWeight, imageGridTHWs[i], mConfig.mergeSize, + mConfig.numGridPerSide, cuSeqlensData[i], stream); + } + for (int64_t i = 0; i < mConfig.numDeepstackFeatures; ++i) + { + check::check( + mDeepstackFeatures[i].reshape({totalImageTokens, mConfig.outHiddenSize}), "Tensor reshape failed"); + } + } + mLastImageGridTHWs = imageGridTHWs; + + textPreprocess(request, batchedInputIds, numImagesPerRequest, imageTokenLengths, tokenizer); + generateMropeParams(batchedInputIds, imageGridTHWs, ropeRotaryCosSinDevice, stream); + } + catch (std::exception const& e) + { + 
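// Report validation or CUDA failures as a soft failure instead of propagating the exception to the caller.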
LOG_ERROR("QwenViTRunner::preprocessPreparedVisual() failed: %s", e.what()); + return false; + } + + return true; +} + bool QwenViTRunner::preprocessSystemPrompt(std::string const& systemPrompt, tokenizer::Tokenizer const* tokenizer, rt::Tensor& ropeRotaryCosSinDevice, cudaStream_t stream) { diff --git a/cpp/multimodal/qwenViTRunner.h b/cpp/multimodal/qwenViTRunner.h index 7e59ced..708389a 100644 --- a/cpp/multimodal/qwenViTRunner.h +++ b/cpp/multimodal/qwenViTRunner.h @@ -84,6 +84,11 @@ class QwenViTRunner : public MultimodalRunner bool preprocess(rt::LLMGenerationRequest const& request, std::vector>& batchedInputIds, tokenizer::Tokenizer const* tokenizer, rt::Tensor& ropeRotaryCosSinDevice, cudaStream_t stream) override; + bool preprocessPreparedVisual(rt::LLMGenerationRequest const& request, + std::vector>& batchedInputIds, tokenizer::Tokenizer const* tokenizer, + rt::Tensor& ropeRotaryCosSinDevice, rt::Tensor const& pixelValues, rt::Tensor const& imageGridTHW, + cudaStream_t stream) override; + //! \brief Encode the system prompt and generate ND-RoPE parameters for the system prompt for KVCache saving. //! \param[in] systemPrompt System prompt string //! \param[in] tokenizer Tokenizer for text processing diff --git a/cpp/runtime/alpamayoExpertRunner.cpp b/cpp/runtime/alpamayoExpertRunner.cpp new file mode 100644 index 0000000..4aebb51 --- /dev/null +++ b/cpp/runtime/alpamayoExpertRunner.cpp @@ -0,0 +1,222 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * AlpamayoExpertRunner: runs TRT Diffusion Expert inline, KV cache stays on GPU. 
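+ * The engine and device buffers are created once in the constructor; runDiffusion() then reshapes the VLM KV cache for the candidate batch, seeds Gaussian noise via cuRAND, and integrates the predicted velocity with fixed-step Euler updates.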
+ */ + +#include "alpamayoExpertRunner.h" + +#include "kernels/alpamayoExpertKernels/alpamayoExpertKernels.h" + +#include +#include + +namespace trt_edgellm +{ +namespace rt +{ + +AlpamayoExpertRunner::AlpamayoExpertRunner( + std::string const& enginePath, AlpamayoExpertConfig const& config, cudaStream_t stream) + : mConfig(config) +{ + loadEngine(enginePath, stream); + + auto maxShape = mEngine->getProfileShape("kv_cache", 0, nvinfer1::OptProfileSelector::kMAX); + int32_t maxKVSeqLen = maxShape.d[3]; + + allocateBuffers(maxKVSeqLen, stream); + + curandCreateGenerator(&mRNG, CURAND_RNG_PSEUDO_DEFAULT); + curandSetPseudoRandomGeneratorSeed(mRNG, mConfig.seed); + curandSetStream(mRNG, stream); + + LOG_INFO("AlpamayoExpertRunner: %d candidates, %d steps, maxKVSeq=%d, engine=%s", mConfig.numCandidates, + mConfig.numDiffusionSteps, maxKVSeqLen, enginePath.c_str()); +} + +AlpamayoExpertRunner::~AlpamayoExpertRunner() noexcept +{ + if (mRNG) + { + curandDestroyGenerator(mRNG); + } +} + +void AlpamayoExpertRunner::loadEngine(std::string const& path, cudaStream_t stream) +{ + mRuntime = std::unique_ptr(nvinfer1::createInferRuntime(gLogger)); + + std::ifstream file(path, std::ios::binary | std::ios::ate); + if (!file.good()) + { + throw std::runtime_error("Cannot open Expert engine: " + path); + } + size_t const fileSize = file.tellg(); + file.seekg(0, std::ios::beg); + std::vector buf(fileSize); + file.read(buf.data(), fileSize); + + mEngine = std::unique_ptr(mRuntime->deserializeCudaEngine(buf.data(), fileSize)); + if (!mEngine) + { + throw std::runtime_error("Failed to deserialize Expert engine"); + } + + int64_t const execBytes = mEngine->getDeviceMemorySizeV2(); + mExecContextMemory + = rt::Tensor({execBytes}, rt::DeviceType::kGPU, nvinfer1::DataType::kUINT8, "ExpertRunner::execMem"); + + mContext = std::unique_ptr( + mEngine->createExecutionContext(nvinfer1::ExecutionContextAllocationStrategy::kUSER_MANAGED)); + mContext->setDeviceMemoryV2(mExecContextMemory.rawPointer(), execBytes); + + LOG_INFO("Expert engine loaded: exec ctx %zu MB", execBytes / (1024 * 1024)); +} + +void AlpamayoExpertRunner::allocateBuffers(int32_t maxKVSeqLen, cudaStream_t stream) +{ + int32_t const B = mConfig.numCandidates; + int32_t const T = mConfig.numDiffusionTokens; + int32_t const A = mConfig.actionDim; + int64_t const L2 = mConfig.numLayers * 2; + int32_t const H = mConfig.numKVHeads; + int32_t const D = mConfig.headDim; + + mKVCacheReshaped = rt::Tensor({L2, B, H, static_cast(maxKVSeqLen), D}, rt::DeviceType::kGPU, + nvinfer1::DataType::kFLOAT, "ExpertRunner::kv"); + mNoisyAction = rt::Tensor({B, T, A}, rt::DeviceType::kGPU, nvinfer1::DataType::kFLOAT, "ExpertRunner::x"); + mTimestep = rt::Tensor({B, 1, 1}, rt::DeviceType::kGPU, nvinfer1::DataType::kFLOAT, "ExpertRunner::t"); + mPositionIds = rt::Tensor({3, static_cast(B), static_cast(T)}, rt::DeviceType::kGPU, + nvinfer1::DataType::kINT64, "ExpertRunner::posIds"); + mAttentionMask = rt::Tensor({B, 1, T, static_cast(maxKVSeqLen + T)}, rt::DeviceType::kGPU, + nvinfer1::DataType::kFLOAT, "ExpertRunner::mask"); + mPredVelocity = rt::Tensor({B, T, A}, rt::DeviceType::kGPU, nvinfer1::DataType::kFLOAT, "ExpertRunner::v"); +} + +int32_t AlpamayoExpertRunner::findTrajFutureStart( + std::vector const& outputIds, int32_t prefillSeqLen) const noexcept +{ + for (size_t i = 0; i < outputIds.size(); ++i) + { + if (outputIds[i] == mConfig.trajFutureStartTokenId) + { + return prefillSeqLen + static_cast(i) + 1; + } + } + return -1; +} + +void 
AlpamayoExpertRunner::runDiffusion( + rt::Tensor const& vlmKVCache, int32_t kvSeqLen, int32_t ropeDelta, rt::Tensor& outputActions, cudaStream_t stream) +{ + int32_t const B = mConfig.numCandidates; + int32_t const T = mConfig.numDiffusionTokens; + int32_t const A = mConfig.actionDim; + int32_t const L = mConfig.numLayers; + int32_t const H = mConfig.numKVHeads; + int32_t const D = mConfig.headDim; + int32_t const L2 = L * 2; + int32_t const nSteps = mConfig.numDiffusionSteps; + int32_t const actionElements = B * T * A; + int32_t const totalMaskLen = kvSeqLen + T; + + // 1. KV cache reshape + repeat: (L,2,H,S,D) fp16 → (L*2,B,H,S,D) fp32 + check::check(mKVCacheReshaped.reshape({L2, B, H, static_cast(kvSeqLen), D}), "KV reshape failed"); + kernel::kvCacheReshapeRepeat(mKVCacheReshaped.dataPointer(), + reinterpret_cast(vlmKVCache.rawPointer()), L, H, kvSeqLen, D, B, stream); + + // 2. Position IDs: basePos = kvSeqLen + ropeDelta + int64_t basePos = static_cast(kvSeqLen) + ropeDelta; + kernel::buildPositionIds(mPositionIds.dataPointer(), B, T, basePos, stream); + + // 3. Attention mask: all zeros (non-causal expert) + check::check(mAttentionMask.reshape({B, 1, T, static_cast(totalMaskLen)}), "Mask reshape failed"); + CUDA_CHECK(cudaMemsetAsync( + mAttentionMask.rawPointer(), 0, static_cast(B) * T * totalMaskLen * sizeof(float), stream)); + + // 4. Initial noise x₀ ~ N(0, I) + check::check(actionElements % 2 == 0, "curandGenerateNormal requires even element count"); + curandGenerateNormal(mRNG, mNoisyAction.dataPointer(), actionElements, 0.0f, 1.0f); + + // 5. Set TRT input shapes + nvinfer1::Dims dims; + + dims.nbDims = 3; + dims.d[0] = B; + dims.d[1] = T; + dims.d[2] = A; + mContext->setInputShape("noisy_action", dims); + + dims.d[0] = B; + dims.d[1] = 1; + dims.d[2] = 1; + mContext->setInputShape("timestep", dims); + + dims.d[0] = 3; + dims.d[1] = B; + dims.d[2] = T; + mContext->setInputShape("position_ids", dims); + + dims.nbDims = 4; + dims.d[0] = B; + dims.d[1] = 1; + dims.d[2] = T; + dims.d[3] = totalMaskLen; + mContext->setInputShape("attention_mask", dims); + + dims.nbDims = 5; + dims.d[0] = L2; + dims.d[1] = B; + dims.d[2] = H; + dims.d[3] = kvSeqLen; + dims.d[4] = D; + mContext->setInputShape("kv_cache", dims); + + mContext->setTensorAddress("noisy_action", mNoisyAction.rawPointer()); + mContext->setTensorAddress("timestep", mTimestep.rawPointer()); + mContext->setTensorAddress("position_ids", mPositionIds.rawPointer()); + mContext->setTensorAddress("attention_mask", mAttentionMask.rawPointer()); + mContext->setTensorAddress("kv_cache", mKVCacheReshaped.rawPointer()); + mContext->setTensorAddress("pred_velocity", mPredVelocity.rawPointer()); + + // 6. Flow matching: Euler integration (no CUDA Graph — kvSeqLen varies between requests) + float const dt = 1.0f / static_cast(nSteps); + + for (int32_t step = 0; step < nSteps; ++step) + { + float const tVal = static_cast(step) * dt; + kernel::fillTimestep(mTimestep.dataPointer(), B, tVal, stream); + + auto ok = mContext->enqueueV3(stream); + check::check(ok, "Expert enqueueV3 failed at diffusion step"); + + kernel::eulerUpdate( + mNoisyAction.dataPointer(), mPredVelocity.dataPointer(), dt, actionElements, stream); + } + + // 7. 
Copy result to output + CUDA_CHECK(cudaMemcpyAsync(outputActions.rawPointer(), mNoisyAction.rawPointer(), actionElements * sizeof(float), + cudaMemcpyDeviceToDevice, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + + LOG_INFO("Expert diffusion done: %d steps, %d candidates, kvSeq=%d", nSteps, B, kvSeqLen); +} + +} // namespace rt +} // namespace trt_edgellm diff --git a/cpp/runtime/alpamayoExpertRunner.h b/cpp/runtime/alpamayoExpertRunner.h new file mode 100644 index 0000000..abc250f --- /dev/null +++ b/cpp/runtime/alpamayoExpertRunner.h @@ -0,0 +1,114 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * AlpamayoExpertRunner: runs TRT Diffusion Expert inline after VLM decode. + * KV cache stays on GPU — no disk I/O, no Python. + */ + +#pragma once + +#include "common/checkMacros.h" +#include "common/cudaUtils.h" +#include "common/logger.h" +#include "common/tensor.h" + +#include +#include +#include +#include +#include + +namespace trt_edgellm +{ +namespace rt +{ + +struct AlpamayoExpertConfig +{ + int32_t numCandidates{6}; + int32_t numDiffusionSteps{10}; + int32_t numDiffusionTokens{64}; + int32_t actionDim{2}; + int32_t numLayers{36}; + int32_t numKVHeads{8}; + int32_t headDim{128}; + int32_t trajFutureStartTokenId{155681}; + int32_t ropeDelta{-2592}; + int64_t seed{42}; + bool multiSeq{false}; +}; + +class AlpamayoExpertRunner +{ +public: + AlpamayoExpertRunner(std::string const& enginePath, AlpamayoExpertConfig const& config, cudaStream_t stream); + ~AlpamayoExpertRunner() noexcept; + + /*! Run 10-step flow matching diffusion using VLM's KV cache. + * + * @param vlmKVCache Contiguous GPU tensor: (numLayers, 2, numKVHeads, seqLen, headDim) fp16 + * from kernel::saveKVCacheIntoTensor() + * @param kvSeqLen Sequence length of the truncated KV cache (after traj_future_start) + * @param ropeDelta RoPE delta (typically -2592 for 4-camera 4-frame setup) + * @param outputActions Output GPU tensor: (numCandidates, numDiffusionTokens, actionDim) fp32 + * @param stream CUDA stream + */ + void runDiffusion(rt::Tensor const& vlmKVCache, int32_t kvSeqLen, int32_t ropeDelta, rt::Tensor& outputActions, + cudaStream_t stream); + + /*! Find in decoded output IDs and return the truncation point. + * Returns prefillLen + idx + 1 where idx is the index in outputIds. + * Returns -1 if not found. 
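+ * Searches for AlpamayoExpertConfig::trajFutureStartTokenId (the traj_future_start marker).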
+ */ + int32_t findTrajFutureStart(std::vector const& outputIds, int32_t prefillSeqLen) const noexcept; + + AlpamayoExpertConfig const& getConfig() const noexcept + { + return mConfig; + } + nvinfer1::IExecutionContext* getContext() noexcept + { + return mContext.get(); + } + +private: + void loadEngine(std::string const& path, cudaStream_t stream); + void allocateBuffers(int32_t maxKVSeqLen, cudaStream_t stream); + + AlpamayoExpertConfig mConfig; + + // TRT engine + std::unique_ptr mRuntime; + std::unique_ptr mEngine; + std::unique_ptr mContext; + rt::Tensor mExecContextMemory; + + // Pre-allocated GPU buffers + rt::Tensor mKVCacheReshaped; // (72, B, 8, maxS, 128) float32 + rt::Tensor mNoisyAction; // (B, 64, 2) float32 + rt::Tensor mTimestep; // (B, 1, 1) float32 + rt::Tensor mPositionIds; // (3, B, 64) int64 + rt::Tensor mAttentionMask; // (B, 1, 64, maxS+64) float32 + rt::Tensor mPredVelocity; // (B, 64, 2) float32 + + // cuRAND + curandGenerator_t mRNG{nullptr}; +}; + +} // namespace rt +} // namespace trt_edgellm diff --git a/cpp/runtime/llmInferenceRuntime.cpp b/cpp/runtime/llmInferenceRuntime.cpp index cb2c091..dd887ba 100644 --- a/cpp/runtime/llmInferenceRuntime.cpp +++ b/cpp/runtime/llmInferenceRuntime.cpp @@ -27,6 +27,7 @@ #include "common/safetensorsUtils.h" #include "kernels/embeddingKernels/embeddingKernels.h" #include "kernels/kvCacheUtilKernels/kvCacheUtilsKernels.h" +#include "kernels/alpamayoExpertKernels/alpamayoExpertKernels.h" #include "multimodal/multimodalRunner.h" #include "profiling/metrics.h" #include "profiling/nvtx_wrapper.h" @@ -525,8 +526,10 @@ bool LLMInferenceRuntime::handleRequest( request.requests.begin(), request.requests.end(), [](auto const& req) { return !req.audioBuffers.empty(); }); bool hasVision = std::any_of( request.requests.begin(), request.requests.end(), [](auto const& req) { return !req.imageBuffers.empty(); }); + bool hasPreparedVision = std::any_of(request.requests.begin(), request.requests.end(), + [](auto const& req) { return !req.preparedVisualInputPath.empty(); }); - if ((hasAudio && mAudioRunner) || (hasVision && mVisionRunner)) + if ((hasAudio && mAudioRunner) || ((hasVision || hasPreparedVision) && mVisionRunner)) { // Mark multimodal preprocessing and inference for NVTX profiling NVTX_SCOPED_RANGE(nvtx_multimodal, "MULTIMODAL_PROCESSING", nvtx_colors::ORANGE); @@ -550,11 +553,48 @@ bool LLMInferenceRuntime::handleRequest( } // Process vision inputs (if present) - if (hasVision && mVisionRunner) + if ((hasVision || hasPreparedVision) && mVisionRunner) { LOG_INFO("Processing vision inputs"); - if (!mVisionRunner->preprocess( - request, batchedInputIds, mTokenizer.get(), mLLMEngineRunner->getRopeCosSinCacheTensor(), stream)) + bool visionStatus = false; + if (hasPreparedVision) + { + check::check(activeBatchSize == 1, "prepared_visual_input currently supports batch_size == 1 only"); + check::check(!hasVision, "prepared_visual_input cannot be mixed with raw image inputs in the same request"); + std::vector preparedTensors; + if (!safetensors::loadSafetensors(request.requests[0].preparedVisualInputPath, preparedTensors, stream)) + { + LOG_ERROR("Failed to load prepared visual input from: %s", + request.requests[0].preparedVisualInputPath.c_str()); + return false; + } + + rt::Tensor const* pixelValues = nullptr; + rt::Tensor const* imageGridTHW = nullptr; + for (auto const& tensor : preparedTensors) + { + if (tensor.getName() == "pixel_values") + { + pixelValues = &tensor; + } + else if (tensor.getName() == "image_grid_thw") + { + 
imageGridTHW = &tensor; + } + } + check::check(pixelValues != nullptr, "prepared_visual_input is missing tensor 'pixel_values'"); + check::check(imageGridTHW != nullptr, "prepared_visual_input is missing tensor 'image_grid_thw'"); + + visionStatus = mVisionRunner->preprocessPreparedVisual(request, batchedInputIds, mTokenizer.get(), + mLLMEngineRunner->getRopeCosSinCacheTensor(), *pixelValues, *imageGridTHW, stream); + } + else + { + visionStatus = mVisionRunner->preprocess( + request, batchedInputIds, mTokenizer.get(), mLLMEngineRunner->getRopeCosSinCacheTensor(), stream); + } + + if (!visionStatus) { LOG_ERROR("LLMInferenceRuntime(): Vision preprocessing failed. This request cannot be handled."); return false; @@ -634,6 +674,17 @@ bool LLMInferenceRuntime::handleRequest( { outputIds[i].push_back(hostSelectedTokenIdsData[i]); finishedStates[i] = hostSelectedTokenIdsData[i] == mTokenizer->getEosId(); + // Early stop AFTER when Expert runner is active. + // We need traj_future_start to be processed by the decoder (one more iteration) + // so its KV cache entry exists. Stop on the token AFTER it. + if (mExpertRunner && outputIds[i].size() >= 2) + { + int32_t prevToken = outputIds[i][outputIds[i].size() - 2]; + if (prevToken == mExpertRunner->getConfig().trajFutureStartTokenId) + { + finishedStates[i] = true; + } + } if (finishedStates[i]) { unFinishedBatchNum--; @@ -748,6 +799,35 @@ bool LLMInferenceRuntime::handleRequest( // Record prefill metrics mPrefillMetrics.recordRun(tokenCount.totalReusedTokens, tokenCount.totalComputedTokens); + bool const hasPrefillDumpRequest = std::any_of(request.requests.begin(), request.requests.end(), + [](auto const& req) { return !req.dumpPrefillKVCachePath.empty(); }); + bool const hasPrefillOnlyRequest + = std::any_of(request.requests.begin(), request.requests.end(), [](auto const& req) { return req.prefillOnly; }); + + if (hasPrefillDumpRequest || hasPrefillOnlyRequest) + { + check::check(activeBatchSize == 1, + "dump_prefill_kv_cache / prefill_only currently supports batch_size == 1 only"); + + if (!request.requests[0].dumpPrefillKVCachePath.empty()) + { + int32_t const sequenceLength = mHostContextLengths.dataPointer()[0]; + if (!dumpCurrentPrefillKVCache(request.requests[0].dumpPrefillKVCachePath, sequenceLength, stream)) + { + LOG_ERROR("LLMInferenceRuntime(): Failed to dump request prefill KV cache to %s", + request.requests[0].dumpPrefillKVCachePath.c_str()); + return false; + } + } + + if (request.requests[0].prefillOnly) + { + response.outputIds.assign(activeBatchSize, {}); + response.outputTexts.assign(activeBatchSize, ""); + return true; + } + } + // Reshape for decoding step check::check(mInputsEmbeds.reshape({activeBatchSize, 1, mEngineConfig.hiddenSize}), "Tensor reshape failed"); @@ -800,6 +880,336 @@ bool LLMInferenceRuntime::handleRequest( mGenerationMetrics.recordRun(totalGeneratedTokens); } + // ===== KV cache dump after decode (Path A) ===== + if (!mKVCacheDumpPath.empty()) + { + auto& linearKVCache = mLLMEngineRunner->getLinearKVCache(); + auto cacheConfig = linearKVCache.getConfig(); + auto kvCacheBuffer = linearKVCache.getKVCacheBuffer(); + + int32_t totalGenLen = static_cast(outputIds[0].size()); + int32_t totalSeqLen = prefillSequenceLength + totalGenLen - 1; + + rt::Coords dumpShape{cacheConfig.numAttentionLayers, 2, cacheConfig.numKVHeads, + static_cast(totalSeqLen), cacheConfig.headDim}; + rt::Tensor dumpTensor(dumpShape, rt::DeviceType::kGPU, + cacheConfig.kvCacheTypeTRT, "LLMInferenceRuntime::dumpKVCache"); + 
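// Gather batch index 0 of the linear KV cache into the contiguous [layers, 2, kv_heads, total_seq_len, head_dim] dump tensor before staging it to the host.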
kernel::saveKVCacheIntoTensor(dumpTensor, kvCacheBuffer, 0, stream); + CUDA_CHECK(cudaStreamSynchronize(stream)); + + int64_t numBytes = dumpTensor.getMemoryCapacity(); + std::vector hostBuf(numBytes); + CUDA_CHECK(cudaMemcpy(hostBuf.data(), dumpTensor.rawPointer(), numBytes, cudaMemcpyDeviceToHost)); + + std::string binPath = mKVCacheDumpPath + ".req" + std::to_string(mKVCacheDumpReqIdx) + ".bin"; + std::string metaPath = mKVCacheDumpPath + ".req" + std::to_string(mKVCacheDumpReqIdx) + ".shape.json"; + { + std::ofstream binOut(binPath, std::ios::binary); + binOut.write(reinterpret_cast(hostBuf.data()), numBytes); + } + { + std::ofstream metaOut(metaPath); + metaOut << "{\n"; + metaOut << " \"shape\": [" << cacheConfig.numAttentionLayers << ", 2, " + << cacheConfig.numKVHeads << ", " << totalSeqLen << ", " + << cacheConfig.headDim << "],\n"; + metaOut << " \"dtype\": " << static_cast(cacheConfig.kvCacheTypeTRT) << ",\n"; + metaOut << " \"prefill_seq_len\": " << prefillSequenceLength << ",\n"; + metaOut << " \"total_seq_len\": " << totalSeqLen << ",\n"; + metaOut << " \"num_generated\": " << (totalGenLen - 1) << ",\n"; + metaOut << " \"num_bytes\": " << numBytes << ",\n"; + metaOut << " \"req_idx\": " << mKVCacheDumpReqIdx << ",\n"; + std::vector hostInputIds(prefillSequenceLength); + CUDA_CHECK(cudaMemcpy(hostInputIds.data(), mInputIds.rawPointer(), + prefillSequenceLength * sizeof(int32_t), cudaMemcpyDeviceToHost)); + metaOut << " \"input_ids\": ["; + for (int32_t i = 0; i < prefillSequenceLength; ++i) { + if (i > 0) metaOut << ", "; + metaOut << hostInputIds[i]; + } + metaOut << "],\n"; + metaOut << " \"output_ids\": ["; + for (size_t i = 0; i < outputIds[0].size(); ++i) { + if (i > 0) metaOut << ", "; + metaOut << outputIds[0][i]; + } + metaOut << "]\n"; + metaOut << "}\n"; + } + LOG_INFO("Dumped KV cache (post-decode): %s (%ld bytes, total_seq_len=%d, generated=%d)", + binPath.c_str(), numBytes, totalSeqLen, totalGenLen - 1); + mKVCacheDumpReqIdx++; + } + // ===== end KV cache dump ===== + // ===== Run Expert diffusion with multi-sequence VLM decode ===== + if (mExpertRunner) + { + auto& linearKVCache = mLLMEngineRunner->getLinearKVCache(); + auto cacheConfig = linearKVCache.getConfig(); + auto const& expertCfg = mExpertRunner->getConfig(); + int32_t const numCandidates = expertCfg.numCandidates; + int32_t const L = cacheConfig.numAttentionLayers; + int32_t const H = cacheConfig.numKVHeads; + int32_t const D = cacheConfig.headDim; + + int32_t truncSeqLen = mExpertRunner->findTrajFutureStart(outputIds[0], prefillSequenceLength); + if (truncSeqLen < 0) + { + LOG_WARNING("ExpertRunner: not found, skipping expert"); + } + else if (!expertCfg.multiSeq) + { + // ── Single-sequence mode ── + LOG_INFO("ExpertRunner: single-seq, truncKV=%d", truncSeqLen); + auto kvBuf = linearKVCache.getKVCacheBuffer(); + rt::Tensor vlmKV({L, 2, H, static_cast(truncSeqLen), D}, + rt::DeviceType::kGPU, cacheConfig.kvCacheTypeTRT, "vlmKV"); + kernel::saveKVCacheIntoTensor(vlmKV, kvBuf, 0, stream); + + rt::Tensor actions({expertCfg.numCandidates, expertCfg.numDiffusionTokens, expertCfg.actionDim}, + rt::DeviceType::kGPU, nvinfer1::DataType::kFLOAT, "actions"); + mExpertRunner->runDiffusion(vlmKV, truncSeqLen, expertCfg.ropeDelta, actions, stream); + + std::vector hostAct(expertCfg.numCandidates * expertCfg.numDiffusionTokens * expertCfg.actionDim); + CUDA_CHECK(cudaMemcpy(hostAct.data(), actions.rawPointer(), + hostAct.size() * sizeof(float), cudaMemcpyDeviceToHost)); + float minV = 
*std::min_element(hostAct.begin(), hostAct.end()); + float maxV = *std::max_element(hostAct.begin(), hostAct.end()); + LOG_INFO("ExpertRunner: actions [%.4f, %.4f]", minV, maxV); + if (!mKVCacheDumpPath.empty()) + { + std::string p = mKVCacheDumpPath + ".req" + std::to_string(mKVCacheDumpReqIdx - 1) + ".actions.bin"; + std::ofstream o(p, std::ios::binary); + o.write(reinterpret_cast(hostAct.data()), hostAct.size() * sizeof(float)); + LOG_INFO("ExpertRunner: dumped %s", p.c_str()); + } + } + else + { + // ── Multi-sequence mode: decode N times from prefill checkpoint ── + LOG_INFO("ExpertRunner: multi-seq mode, %d candidates", numCandidates); + + // Candidate 0: already decoded, extract its KV + auto kvBuf0 = linearKVCache.getKVCacheBuffer(); + rt::Tensor cand0KV({L, 2, H, static_cast(truncSeqLen), D}, + rt::DeviceType::kGPU, cacheConfig.kvCacheTypeTRT, "cand0KV"); + kernel::saveKVCacheIntoTensor(cand0KV, kvBuf0, 0, stream); + CUDA_CHECK(cudaStreamSynchronize(stream)); + LOG_INFO("ExpertRunner: cand 0 done (truncLen=%d)", truncSeqLen); + + // Save the full KV buffer state (for restoration) + size_t const kvBufBytes = kvBuf0.getMemoryCapacity(); + rt::Tensor savedFullKV({static_cast(kvBufBytes)}, rt::DeviceType::kGPU, + nvinfer1::DataType::kUINT8, "savedFullKV"); + CUDA_CHECK(cudaMemcpyAsync(savedFullKV.rawPointer(), kvBuf0.rawPointer(), + kvBufBytes, cudaMemcpyDeviceToDevice, stream)); + + // Also save KV lengths + auto& kvLengths = linearKVCache.getKVCacheLengths(); + rt::Tensor savedKVLens({kvLengths.getShape()[0]}, rt::DeviceType::kGPU, + nvinfer1::DataType::kINT32, "savedKVLens"); + CUDA_CHECK(cudaMemcpyAsync(savedKVLens.rawPointer(), kvLengths.rawPointer(), + kvLengths.getMemoryCapacity(), cudaMemcpyDeviceToDevice, stream)); + + // Create prefill-only checkpoint: reset KV lengths to prefillSequenceLength + int32_t prefillLenHost = prefillSequenceLength; + rt::Tensor savedPrefillKV({static_cast(kvBufBytes)}, rt::DeviceType::kGPU, + nvinfer1::DataType::kUINT8, "savedPrefillKV"); + CUDA_CHECK(cudaMemcpyAsync(savedPrefillKV.rawPointer(), kvBuf0.rawPointer(), + kvBufBytes, cudaMemcpyDeviceToDevice, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + + // Collect all candidate KVs and truncation lengths + std::vector candKVs; + candKVs.push_back(std::move(cand0KV)); + std::vector candTruncLens; + candTruncLens.push_back(truncSeqLen); + + // Decode candidates 1..N-1 + for (int32_t ci = 1; ci < numCandidates; ++ci) + { + // Restore prefill KV state + CUDA_CHECK(cudaMemcpyAsync(linearKVCache.getKVCacheBuffer().rawPointer(), + savedPrefillKV.rawPointer(), kvBufBytes, cudaMemcpyDeviceToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(kvLengths.rawPointer(), &prefillLenHost, + sizeof(int32_t), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + + // Decode loop + std::vector candIds; + candIds.push_back(outputIds[0][0]); // first token same for all (from prefill) + bool candDone = false; + int32_t candIter = 1; + + check::check(mInputsEmbeds.reshape({1, 1, mEngineConfig.hiddenSize}), "reshape failed"); + + while (!candDone && candIter < maxGenerationLength) + { + int32_t lastTok = candIds.back(); + CUDA_CHECK(cudaMemcpyAsync(mSelectedIndices.rawPointer(), &lastTok, + sizeof(int32_t), cudaMemcpyHostToDevice, stream)); + kernel::embeddingLookup(mSelectedIndices, mEmbeddingTable, mInputsEmbeds, stream); + + bool ok = mLLMEngineRunner->executeVanillaDecodingStep( + mInputsEmbeds, mOutputLogits, rt::OptionalOutputTensor{std::nullopt}, stream); + if (!ok) break; + + 
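// Re-sample the next token with the request's temperature/top-k/top-p so stochastic sampling lets each candidate diverge after the shared prefill.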
SamplingParams params(1, mEngineConfig.outputVocabSize, + request.temperature, request.topK, request.topP); + trt_edgellm::topKtopPSamplingFromLogits( + mOutputLogits, mSelectedIndices, params, mSamplingWorkspace, stream); + if (mEngineConfig.reducedVocabSize > 0) + trt_edgellm::mapReducedVocabToFullVocab(mSelectedIndices, mVocabMappingTable, stream); + + int32_t sampled; + CUDA_CHECK(cudaMemcpyAsync(&sampled, mSelectedIndices.rawPointer(), + sizeof(int32_t), cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + candIds.push_back(sampled); + + if (candIds.size() >= 2) + { + int32_t prev = candIds[candIds.size() - 2]; + if (prev == expertCfg.trajFutureStartTokenId) + candDone = true; + } + if (sampled == mTokenizer->getEosId()) + candDone = true; + + ++candIter; + } + + int32_t candTrunc = mExpertRunner->findTrajFutureStart(candIds, prefillSequenceLength); + if (candTrunc < 0) candTrunc = truncSeqLen; + + auto candBuf = linearKVCache.getKVCacheBuffer(); + rt::Tensor candKV({L, 2, H, static_cast(candTrunc), D}, + rt::DeviceType::kGPU, cacheConfig.kvCacheTypeTRT, + ("cand" + std::to_string(ci) + "KV").c_str()); + kernel::saveKVCacheIntoTensor(candKV, candBuf, 0, stream); + CUDA_CHECK(cudaStreamSynchronize(stream)); + + candKVs.push_back(std::move(candKV)); + candTruncLens.push_back(candTrunc); + LOG_INFO("ExpertRunner: cand %d done (truncLen=%d, decoded %d)", ci, candTrunc, candIter); + } + + // Use minimum truncation length across candidates + int32_t minTrunc = *std::min_element(candTruncLens.begin(), candTruncLens.end()); + + // Stack KV caches: per-candidate (L,2,H,S,D) fp16 → combined (L*2,N,H,S,D) fp32 + int64_t const L2 = L * 2; + int32_t const T = expertCfg.numDiffusionTokens; + int32_t const A = expertCfg.actionDim; + + rt::Tensor stackedKV({L2, static_cast(numCandidates), H, + static_cast(minTrunc), D}, + rt::DeviceType::kGPU, nvinfer1::DataType::kFLOAT, "stackedKV"); + + int64_t const sliceElems = static_cast(H) * minTrunc * D; + for (int32_t c = 0; c < numCandidates; ++c) + { + // Reshape single candidate to (L2, 1, H, minTrunc, D) fp32 + rt::Tensor tmp({L2, 1, H, static_cast(minTrunc), D}, + rt::DeviceType::kGPU, nvinfer1::DataType::kFLOAT, "tmp"); + kernel::kvCacheReshapeRepeat(tmp.dataPointer(), + reinterpret_cast(candKVs[c].rawPointer()), + L, H, minTrunc, D, 1, stream); + + // Copy into stacked[:, c, :, :, :] + for (int64_t l = 0; l < L2; ++l) + { + float* dst = stackedKV.dataPointer() + l * numCandidates * sliceElems + c * sliceElems; + float const* src2 = tmp.dataPointer() + l * sliceElems; + CUDA_CHECK(cudaMemcpyAsync(dst, src2, sliceElems * sizeof(float), + cudaMemcpyDeviceToDevice, stream)); + } + } + CUDA_CHECK(cudaStreamSynchronize(stream)); + + // Build Expert inputs and run diffusion + int32_t const totalMaskLen = minTrunc + T; + int64_t basePos = static_cast(minTrunc) + expertCfg.ropeDelta; + + rt::Tensor posIds({3, static_cast(numCandidates), static_cast(T)}, + rt::DeviceType::kGPU, nvinfer1::DataType::kINT64, "posIds"); + kernel::buildPositionIds(posIds.dataPointer(), numCandidates, T, basePos, stream); + + rt::Tensor mask({numCandidates, 1, T, static_cast(totalMaskLen)}, + rt::DeviceType::kGPU, nvinfer1::DataType::kFLOAT, "mask"); + CUDA_CHECK(cudaMemsetAsync(mask.rawPointer(), 0, + static_cast(numCandidates) * T * totalMaskLen * sizeof(float), stream)); + + rt::Tensor noisyAct({numCandidates, T, A}, + rt::DeviceType::kGPU, nvinfer1::DataType::kFLOAT, "noisyAct"); + curandGenerator_t rng; + curandCreateGenerator(&rng, 
CURAND_RNG_PSEUDO_DEFAULT); + curandSetPseudoRandomGeneratorSeed(rng, expertCfg.seed); + curandSetStream(rng, stream); + check::check((numCandidates * T * A) % 2 == 0, "curandGenerateNormal requires even element count"); + curandGenerateNormal(rng, noisyAct.dataPointer(), numCandidates * T * A, 0.0f, 1.0f); + + rt::Tensor predVel({numCandidates, T, A}, + rt::DeviceType::kGPU, nvinfer1::DataType::kFLOAT, "predVel"); + rt::Tensor ts({numCandidates, 1, 1}, rt::DeviceType::kGPU, nvinfer1::DataType::kFLOAT, "ts"); + + auto* ctx = mExpertRunner->getContext(); + nvinfer1::Dims dims; + dims.nbDims = 3; dims.d[0] = numCandidates; dims.d[1] = T; dims.d[2] = A; + ctx->setInputShape("noisy_action", dims); + dims.d[0] = numCandidates; dims.d[1] = 1; dims.d[2] = 1; + ctx->setInputShape("timestep", dims); + dims.d[0] = 3; dims.d[1] = numCandidates; dims.d[2] = T; + ctx->setInputShape("position_ids", dims); + dims.nbDims = 4; dims.d[0] = numCandidates; dims.d[1] = 1; dims.d[2] = T; dims.d[3] = totalMaskLen; + ctx->setInputShape("attention_mask", dims); + dims.nbDims = 5; dims.d[0] = L2; dims.d[1] = numCandidates; dims.d[2] = H; + dims.d[3] = minTrunc; dims.d[4] = D; + ctx->setInputShape("kv_cache", dims); + + ctx->setTensorAddress("noisy_action", noisyAct.rawPointer()); + ctx->setTensorAddress("timestep", ts.rawPointer()); + ctx->setTensorAddress("position_ids", posIds.rawPointer()); + ctx->setTensorAddress("attention_mask", mask.rawPointer()); + ctx->setTensorAddress("kv_cache", stackedKV.rawPointer()); + ctx->setTensorAddress("pred_velocity", predVel.rawPointer()); + + int32_t const nSteps = expertCfg.numDiffusionSteps; + float const dt = 1.0f / static_cast(nSteps); + int32_t const actElems = numCandidates * T * A; + for (int32_t step = 0; step < nSteps; ++step) + { + kernel::fillTimestep(ts.dataPointer(), numCandidates, step * dt, stream); + auto ok = ctx->enqueueV3(stream); + check::check(ok, "Expert enqueueV3 failed in multi-seq diffusion step"); + kernel::eulerUpdate(noisyAct.dataPointer(), predVel.dataPointer(), dt, actElems, stream); + } + CUDA_CHECK(cudaStreamSynchronize(stream)); + curandDestroyGenerator(rng); + + std::vector hostAct(actElems); + CUDA_CHECK(cudaMemcpy(hostAct.data(), noisyAct.rawPointer(), actElems * sizeof(float), cudaMemcpyDeviceToHost)); + float minV = *std::min_element(hostAct.begin(), hostAct.end()); + float maxV = *std::max_element(hostAct.begin(), hostAct.end()); + LOG_INFO("ExpertRunner: multi-seq done, %d cands, actions [%.4f, %.4f]", numCandidates, minV, maxV); + + if (!mKVCacheDumpPath.empty()) + { + std::string p = mKVCacheDumpPath + ".req" + std::to_string(mKVCacheDumpReqIdx - 1) + ".actions.bin"; + std::ofstream o(p, std::ios::binary); + o.write(reinterpret_cast(hostAct.data()), hostAct.size() * sizeof(float)); + LOG_INFO("ExpertRunner: dumped %s", p.c_str()); + } + + // Restore original KV state + CUDA_CHECK(cudaMemcpyAsync(linearKVCache.getKVCacheBuffer().rawPointer(), + savedFullKV.rawPointer(), kvBufBytes, cudaMemcpyDeviceToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(kvLengths.rawPointer(), savedKVLens.rawPointer(), + kvLengths.getMemoryCapacity(), cudaMemcpyDeviceToDevice, stream)); + } + } + // ===== end Expert diffusion ===== + // Clean the response field and fill the generated outputIds and decoded texts. 
response.outputIds.clear(); response.outputTexts.clear(); @@ -1042,5 +1452,41 @@ bool LLMInferenceRuntime::genAndSaveSystemPromptKVCache( return true; } +bool LLMInferenceRuntime::dumpCurrentPrefillKVCache( + std::string const& outputPath, int32_t sequenceLength, cudaStream_t stream) +{ + check::check(sequenceLength > 0, "sequenceLength must be positive for prefill KV cache dump"); + + auto& linearKVCache = mLLMEngineRunner->getLinearKVCache(); + auto const cacheConfig = linearKVCache.getConfig(); + auto kvCacheBuffer = linearKVCache.getKVCacheBuffer(); + + rt::Tensor kvCacheContent({cacheConfig.numAttentionLayers, 2, cacheConfig.numKVHeads, sequenceLength, + cacheConfig.headDim}, + rt::DeviceType::kGPU, cacheConfig.kvCacheTypeTRT, "kv_cache"); + kernel::saveKVCacheIntoTensor(kvCacheContent, kvCacheBuffer, /*batchIdx=*/0, stream); + + rt::Tensor sequenceLengthTensor({1}, rt::DeviceType::kCPU, nvinfer1::DataType::kINT32, "sequence_length"); + sequenceLengthTensor.dataPointer()[0] = sequenceLength; + + rt::Tensor inputIdsTensor({sequenceLength}, rt::DeviceType::kCPU, nvinfer1::DataType::kINT32, "input_ids"); + CUDA_CHECK(cudaMemcpyAsync(inputIdsTensor.rawPointer(), mInputIds.rawPointer(), sequenceLength * sizeof(int32_t), + cudaMemcpyDeviceToHost, stream)); + + std::vector tensors; + tensors.push_back(std::move(kvCacheContent)); + tensors.push_back(std::move(sequenceLengthTensor)); + tensors.push_back(std::move(inputIdsTensor)); + + if (!safetensors::saveSafetensors(outputPath, tensors, stream)) + { + LOG_ERROR("LLMInferenceRuntime(): Failed to save safetensors to %s", outputPath.c_str()); + return false; + } + CUDA_CHECK(cudaStreamSynchronize(stream)); + LOG_INFO("LLMInferenceRuntime(): Prefill KV cache dumped to %s", outputPath.c_str()); + return true; +} + } // namespace rt } // namespace trt_edgellm diff --git a/cpp/runtime/llmInferenceRuntime.h b/cpp/runtime/llmInferenceRuntime.h index dc790c6..b2f0272 100644 --- a/cpp/runtime/llmInferenceRuntime.h +++ b/cpp/runtime/llmInferenceRuntime.h @@ -18,9 +18,11 @@ #pragma once #include "common/hashUtils.h" +#include "common/safetensorsUtils.h" #include "multimodal/multimodalRunner.h" #include "profiling/metrics.h" #include "profiling/timer.h" +#include "runtime/alpamayoExpertRunner.h" #include "runtime/llmEngineRunner.h" #include "runtime/llmRuntimeUtils.h" #include "tokenizer/tokenizer.h" @@ -117,6 +119,20 @@ class LLMInferenceRuntime : metrics::MultimodalMetrics{}; } + /*! \brief Enable dumping the KV cache to disk after each prefill+decode. + * When non-empty, every successful request in handleRequest() writes the + * KV cache (and shape metadata) to .req.bin / .req.shape.json. + */ + void setKVCacheDumpPath(std::string const& path) noexcept { mKVCacheDumpPath = path; } + + /*! \brief Initialize the Expert runner for inline diffusion after VLM decode. + * When set, handleRequest() runs Expert diffusion instead of (or in addition to) dumping KV cache. + */ + void initExpertRunner(std::string const& expertEnginePath, AlpamayoExpertConfig const& config, cudaStream_t stream) + { + mExpertRunner = std::make_unique(expertEnginePath, config, stream); + } + private: /*! \brief Helper structure to hold token counting results */ @@ -176,6 +192,17 @@ class LLMInferenceRuntime //! \throws std::runtime_error if system prompt is malformed, or a CUDA operation fails bool setUpForPrefillExecution(std::vector> const& batchedInputIds, std::vector const& systemPrompts, std::string const& loraWeightsName, cudaStream_t stream); + + //! 
Dump the current prefill KV cache for a single-batch request. + //! Saves a safetensors file containing: + //! - kv_cache: [num_layers, 2, num_kv_heads, seq_len, head_dim] + //! - sequence_length: [1] int32 + //! - input_ids: [seq_len] int32 + bool dumpCurrentPrefillKVCache(std::string const& outputPath, int32_t sequenceLength, cudaStream_t stream); + + std::unique_ptr mExpertRunner{nullptr}; + std::string mKVCacheDumpPath{}; //!< Optional path prefix to dump KV cache after decode (Path A) + int64_t mKVCacheDumpReqIdx{0}; //!< Counter for dump file naming }; } // namespace rt } // namespace trt_edgellm diff --git a/cpp/runtime/llmRuntimeUtils.h b/cpp/runtime/llmRuntimeUtils.h index 3ea037b..0cde4d7 100644 --- a/cpp/runtime/llmRuntimeUtils.h +++ b/cpp/runtime/llmRuntimeUtils.h @@ -69,6 +69,9 @@ struct LLMGenerationRequest std::vector messages; //!< Structured messages (required - use chat template format) std::vector imageBuffers; //!< Optional image data for multimodal inputs std::vector audioBuffers; //!< Optional audio data for multimodal inputs (Qwen3-Omni) + std::string preparedVisualInputPath; //!< Optional path to preprocessed visual inputs (pixel_values + image_grid_thw) + std::string dumpPrefillKVCachePath; //!< Optional output path to dump prefill KV cache as safetensors + bool prefillOnly{false}; //!< If true, stop after prefill (useful with dumpPrefillKVCachePath) mutable FormattedRequest formatted; //!< Formatted request (populated by tokenizer or user-provided) }; diff --git a/examples/llm/llm_inference.cpp b/examples/llm/llm_inference.cpp index 5631aac..b025f46 100644 --- a/examples/llm/llm_inference.cpp +++ b/examples/llm/llm_inference.cpp @@ -60,7 +60,12 @@ enum LLMInferenceOptionId : int EAGLE_DRAFT_STEP = 912, EAGLE_VERIFY_TREE_SIZE = 913, BATCH_SIZE = 914, - MAX_GENERATE_LENGTH = 915 + MAX_GENERATE_LENGTH = 915, + DUMP_KV_CACHE = 916, + EXPERT_ENGINE = 917, + NUM_CANDIDATES = 918, + MULTI_SEQ = 919, + DIFFUSION_STEPS = 920 }; // Struct to hold Eagle-specific arguments for speculative decoding @@ -98,6 +103,11 @@ struct LLMInferenceArgs // For other sampling parameters (temperature, top_p, top_k), please specify them in the input JSON file int32_t batchSize{-1}; // -1 means use value from input file int64_t maxGenerateLength{-1}; // -1 means use value from input file + std::string dumpKVCache{""}; + std::string expertEngine{""}; // Path to Expert TRT engine for diffusion (empty = disabled) + int32_t numCandidates{6}; + bool multiSeq{false}; + int32_t numDiffusionSteps{10}; EagleArgs eagleArgs; }; @@ -152,7 +162,12 @@ bool parseLLMInferenceArgs(LLMInferenceArgs& args, int argc, char* argv[]) {"eagleDraftStep", required_argument, 0, LLMInferenceOptionId::EAGLE_DRAFT_STEP}, {"eagleVerifyTreeSize", required_argument, 0, LLMInferenceOptionId::EAGLE_VERIFY_TREE_SIZE}, {"batchSize", required_argument, 0, LLMInferenceOptionId::BATCH_SIZE}, - {"maxGenerateLength", required_argument, 0, LLMInferenceOptionId::MAX_GENERATE_LENGTH}, {0, 0, 0, 0}}; + {"maxGenerateLength", required_argument, 0, LLMInferenceOptionId::MAX_GENERATE_LENGTH}, + {"dumpKVCache", required_argument, 0, LLMInferenceOptionId::DUMP_KV_CACHE}, + {"expertEngine", required_argument, 0, LLMInferenceOptionId::EXPERT_ENGINE}, + {"numCandidates", required_argument, 0, LLMInferenceOptionId::NUM_CANDIDATES}, + {"multiSeq", no_argument, 0, LLMInferenceOptionId::MULTI_SEQ}, + {"numDiffusionSteps", required_argument, 0, LLMInferenceOptionId::DIFFUSION_STEPS}, {0, 0, 0, 0}}; int opt; while ((opt = getopt_long(argc, argv, "", 
inferenceOptions, nullptr)) != -1) @@ -265,6 +280,11 @@ bool parseLLMInferenceArgs(LLMInferenceArgs& args, int argc, char* argv[]) return false; } break; + case LLMInferenceOptionId::DUMP_KV_CACHE: args.dumpKVCache = optarg; break; + case LLMInferenceOptionId::EXPERT_ENGINE: args.expertEngine = optarg; break; + case LLMInferenceOptionId::NUM_CANDIDATES: args.numCandidates = std::stoi(optarg); break; + case LLMInferenceOptionId::MULTI_SEQ: args.multiSeq = true; break; + case LLMInferenceOptionId::DIFFUSION_STEPS: args.numDiffusionSteps = std::stoi(optarg); break; default: return false; } } @@ -447,6 +467,9 @@ std::pair, std::vector(); @@ -456,6 +479,14 @@ std::pair, std::vector(); + } + if (requestItem.contains("dump_prefill_kv_cache") && !requestItem["dump_prefill_kv_cache"].is_null()) + { + dumpPrefillKVCachePath = requestItem["dump_prefill_kv_cache"].get(); + } // Validate that all requests in this batch use the same LoRA weights if (firstInBatch) @@ -546,11 +577,14 @@ std::pair, std::vector(); - // TODO: Need to consider multi-turn conversation, and whether to load all images. - auto image = rt::imageUtils::loadImageFromFile(msgContent.content); - if (image.buffer != nullptr) + if (preparedVisualInputPath.empty()) { - imageBuffers.push_back(std::move(image)); + // TODO: Need to consider multi-turn conversation, and whether to load all images. + auto image = rt::imageUtils::loadImageFromFile(msgContent.content); + if (image.buffer != nullptr) + { + imageBuffers.push_back(std::move(image)); + } } } else if (msgContent.type == "audio") @@ -609,6 +643,9 @@ std::pair, std::vectorinitExpertRunner(args.expertEngine, expertCfg, stream); + LOG_INFO("Expert runner initialized: %s", args.expertEngine.c_str()); + } + if (!args.dumpKVCache.empty()) + { + llmInferenceRuntime->setKVCacheDumpPath(args.dumpKVCache); + LOG_INFO("KV cache dump enabled, prefix: %s", args.dumpKVCache.c_str()); + } } // Perform warmup runs if requested @@ -817,6 +868,7 @@ int main(int argc, char* argv[]) // Validate UTF-8 for output text (inputs are always valid) // If invalid UTF-8 detected, error message is returned and original text is logged responseJson["output_text"] = sanitizeUtf8ForJson(outputText); + responseJson["output_ids"] = requestStatus ? nlohmann::json(response.outputIds[batchIdx]) : nlohmann::json::array(); responseJson["request_idx"] = requestIdx; responseJson["batch_idx"] = batchIdx; // Store messages for reference
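Reviewer note: a minimal wiring sketch (not part of the diff) of how the new runtime hooks are meant to be combined; `examples/llm/llm_inference.cpp` above does the equivalent from the new CLI flags. The engine path, dump prefix, and literal values below are placeholders.

```cpp
// Hypothetical setup mirroring the flag handling added in examples/llm/llm_inference.cpp.
trt_edgellm::rt::AlpamayoExpertConfig expertCfg;
expertCfg.numCandidates = 6;       // --numCandidates
expertCfg.numDiffusionSteps = 10;  // --numDiffusionSteps
expertCfg.multiSeq = false;        // --multiSeq selects the multi-sequence decode path

// Optional: dump the post-decode KV cache as <prefix>.reqN.bin / <prefix>.reqN.shape.json.
llmInferenceRuntime->setKVCacheDumpPath("/tmp/kv_dump");

// Enable inline Expert diffusion after the VLM decode (engine path is a placeholder).
llmInferenceRuntime->initExpertRunner("/models/alpamayo_expert.engine", expertCfg, stream);
```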