diff --git a/cpp/tensorrt_llm/kernels/quantization.cu b/cpp/tensorrt_llm/kernels/quantization.cu index 3941277dfa0..4d54e6fcbd0 100644 --- a/cpp/tensorrt_llm/kernels/quantization.cu +++ b/cpp/tensorrt_llm/kernels/quantization.cu @@ -178,7 +178,7 @@ void invokeFP4Quantization(int b, int m, int n, T const* input, float const* SFS config.stream = stream; cudaLaunchAttribute attrs[1]; attrs[0].id = cudaLaunchAttributeProgrammaticStreamSerialization; - attrs[0].val.programmaticStreamSerializationAllowed = tensorrt_llm::common::getEnvEnablePDL(); + attrs[0].val.programmaticStreamSerializationAllowed = false; config.numAttrs = 1; config.attrs = attrs; cudaLaunchKernelEx(&config, kernel_instance, b, m, n, n, input, SFScale, reinterpret_cast(output), @@ -213,7 +213,7 @@ void invokeMxFP8Quantization(int b, int m, int n, int padded_n, T const* input, config.stream = stream; cudaLaunchAttribute attrs[1]; attrs[0].id = cudaLaunchAttributeProgrammaticStreamSerialization; - attrs[0].val.programmaticStreamSerializationAllowed = tensorrt_llm::common::getEnvEnablePDL(); + attrs[0].val.programmaticStreamSerializationAllowed = false; config.numAttrs = 1; config.attrs = attrs; cudaLaunchKernelEx(&config, @@ -388,7 +388,7 @@ void computePerTokenGlobalScaleForFP4Quantization(int b, int m, int n, T const* config.stream = stream; cudaLaunchAttribute attrs[1]; attrs[0].id = cudaLaunchAttributeProgrammaticStreamSerialization; - attrs[0].val.programmaticStreamSerializationAllowed = tensorrt_llm::common::getEnvEnablePDL(); + attrs[0].val.programmaticStreamSerializationAllowed = false; config.numAttrs = 1; config.attrs = attrs; TLLM_CUDA_CHECK(cudaLaunchKernelEx(