From 6216bc3784633574a285591538cbe6a414d73973 Mon Sep 17 00:00:00 2001
From: "wangyang (wysaid)" <wysaid@gmail.com>
Date: Sat, 28 Mar 2026 23:18:26 +0800
Subject: [PATCH 1/2] fix: critical zero-copy and format conversion bugs

Fixes the critical and medium-priority bugs listed below, plus low-priority documentation issues, across backends:

CRITICAL (HIGH):
- Fix double CVPixelBufferUnlockBaseAddress in Apple camera
- Fix double CVPixelBufferUnlockBaseAddress in Apple file reader
- Fix use-after-free in Windows file reader

DANGLING POINTER (MEDIUM):
- Fix dangling nativeHandle in Apple camera after conversion
- Fix dangling nativeHandle in Apple file reader after conversion
- Fix dangling nativeHandle in DirectShow backend

FORMAT CONVERSION (MEDIUM):
- Guard shouldConvert against Unknown output format in DShow/MSMF
- Log warnings for unsupported YUV-to-different-YUV conversion
- Log warnings for unsupported RGB-to-YUV conversion

DOCUMENTATION (LOW):
- Fix kPixelFormatBGRBit comment typo
- Remove misleading @refitem from I420 docs
- Clarify zero-copy requirements in PixelFormatOutput
- Document Apple YUV subtype behavior

All 907 functional tests pass with ASAN enabled. No regressions.
---
 include/ccap_def.h               | 8 ++++----
 src/ccap_convert_frame.cpp       | 8 ++++++++
 src/ccap_file_reader_apple.mm    | 5 ++++-
 src/ccap_file_reader_windows.cpp | 2 +-
 src/ccap_imp_apple.mm            | 7 ++++++-
 src/ccap_imp_linux.h             | 4 ++--
 src/ccap_imp_windows.cpp         | 8 +++++---
 src/ccap_imp_windows.h           | 2 +-
 src/ccap_imp_windows_msmf.cpp    | 3 ++-
 9 files changed, 33 insertions(+), 14 deletions(-)
diff --git a/include/ccap_def.h b/include/ccap_def.h
index 107cad3c..3c64208b 100644
--- a/include/ccap_def.h
+++ b/include/ccap_def.h
@@ -35,7 +35,7 @@ namespace ccap {
 enum PixelFormatConstants : uint32_t {
     /// `kPixelFormatRGBBit` indicates that the pixel format is RGB or RGBA.
     kPixelFormatRGBBit = 1 << 3,
-    /// `kPixelFormatRGBBit` indicates that the pixel format is BGR or BGRA.
+    /// `kPixelFormatBGRBit` indicates that the pixel format is BGR or BGRA.
     kPixelFormatBGRBit = 1 << 4,
 
     /// Color Bit Mask
@@ -82,7 +82,6 @@ enum class PixelFormat : uint32_t {
      *    In software design, you can implement a toggle option to allow users to choose whether
      *    the received Frame is FullRange or VideoRange based on what they observe.
      * @note This format is also known by other names, such as YUV420P or IYUV.
-     * @refitem #NV12
      */
     I420 = 1 << 2 | kPixelFormatYUVColorBit,
 
@@ -191,10 +190,11 @@ enum class PropertyName {
 
     /**
      * @brief The output pixel format of ccap. Can be different from PixelFormatInternal.
-     * @note If PixelFormatInternal is RGB(A), PixelFormatOutput cannot be set to a YUV format.
+     * @note If PixelFormatInternal is RGB(A), PixelFormatOutput cannot be set to a YUV format (the conversion will fail).
      *       If PixelFormatInternal is YUV and PixelFormatOutput is RGB(A), BT.601 will be used for conversion.
      *       For other cases, there are no issues.
-     *       If PixelFormatInternal and PixelFormatOutput are the same format, data conversion will be skipped and the original data will be used directly.
+     *       If PixelFormatInternal and PixelFormatOutput are the same format AND the camera natively supports
+     *       PixelFormatInternal, data conversion will be skipped and the original data will be used directly.
      *       In general, setting both PixelFormatInternal and PixelFormatOutput to YUV formats can achieve better performance.
      */
     PixelFormatOutput = 0x30002,
diff --git a/src/ccap_convert_frame.cpp b/src/ccap_convert_frame.cpp
index c9d9f789..dbf4c8f9 100644
--- a/src/ccap_convert_frame.cpp
+++ b/src/ccap_convert_frame.cpp
@@ -10,6 +10,7 @@
 
 #include "ccap_convert.h"
 #include "ccap_imp.h"
+#include "ccap_utils.h"
 
 #include <cassert>
 #include <cstring>
@@ -229,8 +230,15 @@ inline bool inplaceConvertFrameImp(VideoFrame* frame, PixelFormat toFormat, bool
             return inplaceConvertFrameYUV2YUV(frame, toFormat, verticalFlip);
 #endif
 
+        if (isInputYUV && isOutputYUV) {
+            CCAP_LOG_W("ccap: YUV to different YUV subtype conversion is not supported without libyuv, skipping conversion\n");
+            return false;
+        }
+
         if (isInputYUV) // yuv -> BGR
             return inplaceConvertFrameYUV2RGBColor(frame, toFormat, verticalFlip);
+
+        CCAP_LOG_W("ccap: RGB to YUV conversion is not supported, skipping conversion\n");
         return false; // no rgb -> yuv
     }
 
diff --git a/src/ccap_file_reader_apple.mm b/src/ccap_file_reader_apple.mm
index 6e03f34a..196da8d9 100644
--- a/src/ccap_file_reader_apple.mm
+++ b/src/ccap_file_reader_apple.mm
@@ -398,7 +398,10 @@ - (void)processFrame:(CMSampleBufferRef)sampleBuffer {
         }
         
         zeroCopy = !inplaceConvertFrame(newFrame.get(), prop.outputPixelFormat, shouldFlip);
-        CVPixelBufferUnlockBaseAddress(imageBuffer, kCVPixelBufferLock_ReadOnly);
+        if (!zeroCopy) {
+            CVPixelBufferUnlockBaseAddress(imageBuffer, kCVPixelBufferLock_ReadOnly);
+            newFrame->nativeHandle = nullptr;
+        }
     }
     
     if (zeroCopy) {
diff --git a/src/ccap_file_reader_windows.cpp b/src/ccap_file_reader_windows.cpp
index 7424d82c..51e57b34 100644
--- a/src/ccap_file_reader_windows.cpp
+++ b/src/ccap_file_reader_windows.cpp
@@ -462,7 +462,7 @@ void FileReaderWindows::readLoop() {
                         auto&& f = m_provider->getAllocatorFactory();
                         newFrame->allocator = f ? f() : std::make_shared<DefaultAllocator>();
                     }
-                    inplaceConvertFrame(newFrame.get(), prop.outputPixelFormat, shouldFlip);
+                    zeroCopy = !inplaceConvertFrame(newFrame.get(), prop.outputPixelFormat, shouldFlip);
                 }
 
                 newFrame->frameIndex = m_currentFrameIndex;
diff --git a/src/ccap_imp_apple.mm b/src/ccap_imp_apple.mm
index a77dc8d4..171347ae 100644
--- a/src/ccap_imp_apple.mm
+++ b/src/ccap_imp_apple.mm
@@ -905,6 +905,8 @@ - (void)captureOutput:(AVCaptureOutput*)output
     }
 
     /// iOS/macOS does not support i420, and we do not intend to support nv12 to i420 conversion here.
+    /// When both internal and output formats are YUV, zeroCopy is used regardless of subtype differences
+    /// (e.g., NV12 vs I420). The frame will carry the actual camera format, not the requested output format.
     bool zeroCopy = ((internalFormat & kPixelFormatYUVColorBit) && (outputFormat & kPixelFormatYUVColorBit)) ||
         (internalFormat == outputFormat && _provider->frameOrientation() == kDefaultFrameOrientation);
 
@@ -924,7 +926,10 @@ - (void)captureOutput:(AVCaptureOutput*)output
 
         zeroCopy = !inplaceConvertFrame(newFrame.get(), outputFormat, (int)(newFrame->orientation != kDefaultFrameOrientation));
 
-        CVPixelBufferUnlockBaseAddress(imageBuffer, kCVPixelBufferLock_ReadOnly);
+        if (!zeroCopy) {
+            CVPixelBufferUnlockBaseAddress(imageBuffer, kCVPixelBufferLock_ReadOnly);
+            newFrame->nativeHandle = nullptr;
+        }
 
         if (verboseLogEnabled()) {
 #ifdef DEBUG
diff --git a/src/ccap_imp_linux.h b/src/ccap_imp_linux.h
index ec548de2..8b4cb9f6 100644
--- a/src/ccap_imp_linux.h
+++ b/src/ccap_imp_linux.h
@@ -101,12 +101,12 @@ class ProviderV4L2 : public ProviderImp {
     bool m_isStreaming = false;
 
     // V4L2 device capabilities
-    struct v4l2_capability m_caps{};
+    struct v4l2_capability m_caps {};
     std::vector<V4L2Format> m_supportedFormats;
     std::vector<DeviceInfo::Resolution> m_supportedResolutions;
 
     // Current format
-    struct v4l2_format m_currentFormat{};
+    struct v4l2_format m_currentFormat {};
 
     // Buffer management
     std::vector<V4L2Buffer> m_buffers;
diff --git a/src/ccap_imp_windows.cpp b/src/ccap_imp_windows.cpp
index 07e23b16..7d4ca2fe 100644
--- a/src/ccap_imp_windows.cpp
+++ b/src/ccap_imp_windows.cpp
@@ -849,10 +849,11 @@ HRESULT STDMETHODCALLTYPE ProviderDirectShow::SampleCB(double sampleTime, IMedia
     newFrame->width = m_frameProp.width;
     newFrame->height = m_frameProp.height;
     newFrame->orientation = isOutputYUV ? FrameOrientation::TopToBottom : m_frameOrientation;
-    newFrame->nativeHandle = mediaSample;
+    newFrame->nativeHandle = nullptr;
 
     bool shouldFlip = newFrame->orientation != m_inputOrientation && !isOutputYUV;
-    bool shouldConvert = m_frameProp.cameraPixelFormat != m_frameProp.outputPixelFormat;
+    bool shouldConvert = m_frameProp.outputPixelFormat != PixelFormat::Unknown &&
+        m_frameProp.cameraPixelFormat != m_frameProp.outputPixelFormat;
     bool zeroCopy = !shouldConvert && !shouldFlip;
 
     if (isInputYUV) {
@@ -949,6 +950,7 @@ HRESULT STDMETHODCALLTYPE ProviderDirectShow::SampleCB(double sampleTime, IMedia
         // Conversion may fail. If conversion fails, fall back to zero-copy mode.
         // In this case, the returned format is the original camera input format.
         newFrame->sizeInBytes = bufferLen;
+        newFrame->nativeHandle = mediaSample;
 
         mediaSample->AddRef(); // Ensure data lifecycle
         auto manager = std::make_shared<FakeFrame>([newFrame, mediaSample]() mutable {
@@ -1001,7 +1003,7 @@ HRESULT STDMETHODCALLTYPE ProviderDirectShow::BufferCB(double SampleTime, BYTE*
     return S_OK;
 }
 
-HRESULT STDMETHODCALLTYPE ProviderDirectShow::QueryInterface(REFIID riid, _COM_Outptr_ void __RPC_FAR * __RPC_FAR * ppvObject) {
+HRESULT STDMETHODCALLTYPE ProviderDirectShow::QueryInterface(REFIID riid, _COM_Outptr_ void __RPC_FAR* __RPC_FAR* ppvObject) {
     static constexpr const IID IID_ISampleGrabberCB = { 0x0579154A, 0x2B53, 0x4994, { 0xB0, 0xD0, 0xE7, 0x73, 0x14, 0x8E, 0xFF, 0x85 } };
 
     if (riid == IID_IUnknown) {
diff --git a/src/ccap_imp_windows.h b/src/ccap_imp_windows.h
index fe45ab6d..6e4dd02f 100644
--- a/src/ccap_imp_windows.h
+++ b/src/ccap_imp_windows.h
@@ -93,7 +93,7 @@ class ProviderDirectShow : public ProviderImp, public ISampleGrabberCB {
     inline FrameOrientation frameOrientation() const { return m_frameOrientation; }
 
 private:
-    HRESULT STDMETHODCALLTYPE QueryInterface(REFIID riid, _COM_Outptr_ void __RPC_FAR * __RPC_FAR * ppvObject) override;
+    HRESULT STDMETHODCALLTYPE QueryInterface(REFIID riid, _COM_Outptr_ void __RPC_FAR* __RPC_FAR* ppvObject) override;
     ULONG STDMETHODCALLTYPE AddRef(void) override;
     ULONG STDMETHODCALLTYPE Release(void) override;
 
diff --git a/src/ccap_imp_windows_msmf.cpp b/src/ccap_imp_windows_msmf.cpp
index cec5ab80..0cfe58a5 100644
--- a/src/ccap_imp_windows_msmf.cpp
+++ b/src/ccap_imp_windows_msmf.cpp
@@ -801,7 +801,8 @@ void ProviderMSMF::readLoop() {
         }
 
         bool shouldFlip = !isOutputYUV && targetOrientation != m_inputOrientation;
-        bool shouldConvert = newFrame->pixelFormat != m_frameProp.outputPixelFormat;
+        bool shouldConvert = m_frameProp.outputPixelFormat != PixelFormat::Unknown &&
+            newFrame->pixelFormat != m_frameProp.outputPixelFormat;
         bool zeroCopy = !shouldConvert && !shouldFlip;
 
         if (!zeroCopy) {

From 52ec4a3385be7452a04122777f6c261a736bfd01 Mon Sep 17 00:00:00 2001
From: "wangyang (wysaid)" <wysaid@gmail.com>
Date: Sun, 29 Mar 2026 20:43:24 +0800
Subject: [PATCH 2/2] fix: handle Unknown output format and reduce per-frame
 log noise

- Derive effectiveOutputFormat to avoid passing PixelFormat::Unknown to
  inplaceConvertFrame across all providers (DirectShow, MSMF, V4L2,
  AVFoundation) and file readers (Windows MF, Apple AVAssetReader).
  When outputPixelFormat is Unknown, treat it as 'keep input format'
  so isOutputYUV, shouldConvert, shouldFlip, and the conversion call
  all use the camera/input format instead.
- Use log-once pattern for unsupported YUV-to-YUV (without libyuv) and
  RGB-to-YUV conversion warnings to prevent flooding stderr in
  long-running captures.
- Update PixelFormatOutput documentation to clarify Unknown semantics
  and unsupported conversion limitations.
---
 include/ccap_def.h               |  7 +++++--
 src/ccap_convert_frame.cpp       | 12 ++++++++++--
 src/ccap_file_reader_apple.mm    |  7 ++++---
 src/ccap_file_reader_windows.cpp |  7 ++++---
 src/ccap_imp_apple.mm            |  3 +++
 src/ccap_imp_linux.cpp           | 12 ++++++------
 src/ccap_imp_windows.cpp         | 14 +++++++-------
 src/ccap_imp_windows_msmf.cpp    |  8 ++++----
 8 files changed, 43 insertions(+), 27 deletions(-)

diff --git a/include/ccap_def.h b/include/ccap_def.h
index 3c64208b..8f0288d9 100644
--- a/include/ccap_def.h
+++ b/include/ccap_def.h
@@ -190,9 +190,12 @@ enum class PropertyName {
 
     /**
      * @brief The output pixel format of ccap. Can be different from PixelFormatInternal.
-     * @note If PixelFormatInternal is RGB(A), PixelFormatOutput cannot be set to a YUV format (the conversion will fail).
+     * @note If PixelFormatInternal is RGB(A), PixelFormatOutput cannot be set to a YUV format (RGB->YUV conversion is not supported).
+     *       If PixelFormatInternal is YUV and PixelFormatOutput is a different YUV subtype, conversion requires libyuv;
+     *       without it the frame will keep the camera format and no conversion is performed.
      *       If PixelFormatInternal is YUV and PixelFormatOutput is RGB(A), BT.601 will be used for conversion.
-     *       For other cases, there are no issues.
+     *       If PixelFormatOutput is set to PixelFormat::Unknown (or not set), the camera's native format is used as-is
+     *       and no conversion is performed.
      *       If PixelFormatInternal and PixelFormatOutput are the same format AND the camera natively supports
      *       PixelFormatInternal, data conversion will be skipped and the original data will be used directly.
      *       In general, setting both PixelFormatInternal and PixelFormatOutput to YUV formats can achieve better performance.
diff --git a/src/ccap_convert_frame.cpp b/src/ccap_convert_frame.cpp
index dbf4c8f9..73c8a331 100644
--- a/src/ccap_convert_frame.cpp
+++ b/src/ccap_convert_frame.cpp
@@ -231,14 +231,22 @@ inline bool inplaceConvertFrameImp(VideoFrame* frame, PixelFormat toFormat, bool
 #endif
 
         if (isInputYUV && isOutputYUV) {
-            CCAP_LOG_W("ccap: YUV to different YUV subtype conversion is not supported without libyuv, skipping conversion\n");
+            static bool sLoggedYuv2YuvUnsupported = false;
+            if (!sLoggedYuv2YuvUnsupported) {
+                CCAP_LOG_W("ccap: YUV to different YUV subtype conversion is not supported without libyuv, skipping conversion\n");
+                sLoggedYuv2YuvUnsupported = true;
+            }
             return false;
         }
 
         if (isInputYUV) // yuv -> BGR
             return inplaceConvertFrameYUV2RGBColor(frame, toFormat, verticalFlip);
 
-        CCAP_LOG_W("ccap: RGB to YUV conversion is not supported, skipping conversion\n");
+        static bool sLoggedRgbToYuvUnsupported = false;
+        if (!sLoggedRgbToYuvUnsupported) {
+            CCAP_LOG_W("ccap: RGB to YUV conversion is not supported, skipping conversion\n");
+            sLoggedRgbToYuvUnsupported = true;
+        }
         return false; // no rgb -> yuv
     }
 
diff --git a/src/ccap_file_reader_apple.mm b/src/ccap_file_reader_apple.mm
index 196da8d9..42664928 100644
--- a/src/ccap_file_reader_apple.mm
+++ b/src/ccap_file_reader_apple.mm
@@ -382,10 +382,11 @@ - (void)processFrame:(CMSampleBufferRef)sampleBuffer {
     
     // Check if conversion or flip is needed
     auto& prop = _provider->getFrameProperty();
-    bool isOutputYUV = (newFrame->pixelFormat & kPixelFormatYUVColorBit) != 0;
+    PixelFormat effectiveOutputFormat = (prop.outputPixelFormat == PixelFormat::Unknown) ? newFrame->pixelFormat : prop.outputPixelFormat;
+    bool isOutputYUV = (effectiveOutputFormat & kPixelFormatYUVColorBit) != 0;
     FrameOrientation targetOrientation = isOutputYUV ? FrameOrientation::TopToBottom : _provider->frameOrientation();
     bool shouldFlip = !isOutputYUV && (inputOrientation != targetOrientation);
-    bool shouldConvert = newFrame->pixelFormat != prop.outputPixelFormat;
+    bool shouldConvert = newFrame->pixelFormat != effectiveOutputFormat;
     
     newFrame->orientation = targetOrientation;
     
@@ -397,7 +398,7 @@ - (void)processFrame:(CMSampleBufferRef)sampleBuffer {
             newFrame->allocator = f ? f() : std::make_shared<DefaultAllocator>();
         }
         
-        zeroCopy = !inplaceConvertFrame(newFrame.get(), prop.outputPixelFormat, shouldFlip);
+        zeroCopy = !inplaceConvertFrame(newFrame.get(), effectiveOutputFormat, shouldFlip);
         if (!zeroCopy) {
             CVPixelBufferUnlockBaseAddress(imageBuffer, kCVPixelBufferLock_ReadOnly);
             newFrame->nativeHandle = nullptr;
diff --git a/src/ccap_file_reader_windows.cpp b/src/ccap_file_reader_windows.cpp
index 51e57b34..78c28a50 100644
--- a/src/ccap_file_reader_windows.cpp
+++ b/src/ccap_file_reader_windows.cpp
@@ -448,10 +448,11 @@ void FileReaderWindows::readLoop() {
 
                 // Check if conversion or flip is needed
                 auto& prop = m_provider->getFrameProperty();
-                bool isOutputYUV = (prop.outputPixelFormat & kPixelFormatYUVColorBit) != 0;
+                PixelFormat effectiveOutputFormat = (prop.outputPixelFormat == PixelFormat::Unknown) ? newFrame->pixelFormat : prop.outputPixelFormat;
+                bool isOutputYUV = (effectiveOutputFormat & kPixelFormatYUVColorBit) != 0;
                 FrameOrientation targetOrientation = isOutputYUV ? FrameOrientation::TopToBottom : m_provider->frameOrientation();
                 bool shouldFlip = !isOutputYUV && (inputOrientation != targetOrientation);
-                bool shouldConvert = newFrame->pixelFormat != prop.outputPixelFormat;
+                bool shouldConvert = newFrame->pixelFormat != effectiveOutputFormat;
 
                 newFrame->orientation = targetOrientation;
 
@@ -462,7 +463,7 @@ void FileReaderWindows::readLoop() {
                         auto&& f = m_provider->getAllocatorFactory();
                         newFrame->allocator = f ? f() : std::make_shared<DefaultAllocator>();
                     }
-                    zeroCopy = !inplaceConvertFrame(newFrame.get(), prop.outputPixelFormat, shouldFlip);
+                    zeroCopy = !inplaceConvertFrame(newFrame.get(), effectiveOutputFormat, shouldFlip);
                 }
 
                 newFrame->frameIndex = m_currentFrameIndex;
diff --git a/src/ccap_imp_apple.mm b/src/ccap_imp_apple.mm
index 171347ae..e0efdaad 100644
--- a/src/ccap_imp_apple.mm
+++ b/src/ccap_imp_apple.mm
@@ -873,6 +873,9 @@ - (void)captureOutput:(AVCaptureOutput*)output
     CMTime timestamp = CMSampleBufferGetPresentationTimeStamp(sampleBuffer);
     auto internalFormat = _provider->getFrameProperty().cameraPixelFormat;
     auto outputFormat = _provider->getFrameProperty().outputPixelFormat;
+    if (outputFormat == PixelFormat::Unknown) {
+        outputFormat = internalFormat;
+    }
 
     newFrame->timestamp = (uint64_t)(CMTimeGetSeconds(timestamp) * 1e9);
     newFrame->width = (uint32_t)CVPixelBufferGetWidth(imageBuffer);
diff --git a/src/ccap_imp_linux.cpp b/src/ccap_imp_linux.cpp
index 96713b82..4727b819 100644
--- a/src/ccap_imp_linux.cpp
+++ b/src/ccap_imp_linux.cpp
@@ -549,7 +549,8 @@ bool ProviderV4L2::readFrame() {
 
     // Check input/output format types and orientations
     bool isInputYUV = (frame->pixelFormat & kPixelFormatYUVColorBit) != 0;
-    bool isOutputYUV = (m_frameProp.outputPixelFormat & kPixelFormatYUVColorBit) != 0;
+    PixelFormat effectiveOutputFormat = (m_frameProp.outputPixelFormat == PixelFormat::Unknown) ? frame->pixelFormat : m_frameProp.outputPixelFormat;
+    bool isOutputYUV = (effectiveOutputFormat & kPixelFormatYUVColorBit) != 0;
     auto inputOrientation = FrameOrientation::TopToBottom; // V4L2 always provides TopToBottom
 
     // Set output orientation based on format type
@@ -557,8 +558,7 @@ bool ProviderV4L2::readFrame() {
 
     // Check if we need conversion or flipping
     bool shouldFlip = frame->orientation != inputOrientation && !isOutputYUV;
-    bool shouldConvert = (m_frameProp.outputPixelFormat != PixelFormat::Unknown &&
-                          m_frameProp.outputPixelFormat != frame->pixelFormat);
+    bool shouldConvert = (effectiveOutputFormat != frame->pixelFormat);
     bool zeroCopy = !shouldConvert && !shouldFlip;
 
     uint8_t* bufferData = static_cast<uint8_t*>(m_buffers[buf.index].start);
@@ -614,7 +614,7 @@ bool ProviderV4L2::readFrame() {
 
             std::chrono::steady_clock::time_point startTime = std::chrono::steady_clock::now();
 
-            zeroCopy = !inplaceConvertFrame(frame.get(), m_frameProp.outputPixelFormat, shouldFlip);
+            zeroCopy = !inplaceConvertFrame(frame.get(), effectiveOutputFormat, shouldFlip);
 
             double durInMs = (std::chrono::steady_clock::now() - startTime).count() / 1.e6;
             static double s_allCostTime = 0;
@@ -630,10 +630,10 @@ bool ProviderV4L2::readFrame() {
 
             CCAP_LOG_V(
                 "ccap: inplaceConvertFrame requested pixel format: %s, actual pixel format: %s, flip: %s, cost time %s: (cur %g ms, avg %g ms)\n",
-                pixelFormatToString(m_frameProp.outputPixelFormat).data(), pixelFormatToString(m_frameProp.cameraPixelFormat).data(),
+                pixelFormatToString(effectiveOutputFormat).data(), pixelFormatToString(m_frameProp.cameraPixelFormat).data(),
                 shouldFlip ? "YES" : "NO", mode, durInMs, s_allCostTime / s_frames);
         } else {
-            zeroCopy = !inplaceConvertFrame(frame.get(), m_frameProp.outputPixelFormat, shouldFlip);
+            zeroCopy = !inplaceConvertFrame(frame.get(), effectiveOutputFormat, shouldFlip);
         }
     }
 
diff --git a/src/ccap_imp_windows.cpp b/src/ccap_imp_windows.cpp
index 7d4ca2fe..ab4efe7e 100644
--- a/src/ccap_imp_windows.cpp
+++ b/src/ccap_imp_windows.cpp
@@ -843,7 +843,8 @@ HRESULT STDMETHODCALLTYPE ProviderDirectShow::SampleCB(double sampleTime, IMedia
 
     uint32_t bufferLen = mediaSample->GetActualDataLength();
     bool isInputYUV = (m_frameProp.cameraPixelFormat & kPixelFormatYUVColorBit);
-    bool isOutputYUV = (m_frameProp.outputPixelFormat & kPixelFormatYUVColorBit);
+    PixelFormat effectiveOutputFormat = (m_frameProp.outputPixelFormat == PixelFormat::Unknown) ? m_frameProp.cameraPixelFormat : m_frameProp.outputPixelFormat;
+    bool isOutputYUV = (effectiveOutputFormat & kPixelFormatYUVColorBit);
 
     newFrame->pixelFormat = m_frameProp.cameraPixelFormat;
     newFrame->width = m_frameProp.width;
@@ -852,8 +853,7 @@ HRESULT STDMETHODCALLTYPE ProviderDirectShow::SampleCB(double sampleTime, IMedia
     newFrame->nativeHandle = nullptr;
 
     bool shouldFlip = newFrame->orientation != m_inputOrientation && !isOutputYUV;
-    bool shouldConvert = m_frameProp.outputPixelFormat != PixelFormat::Unknown &&
-        m_frameProp.cameraPixelFormat != m_frameProp.outputPixelFormat;
+    bool shouldConvert = m_frameProp.cameraPixelFormat != effectiveOutputFormat;
     bool zeroCopy = !shouldConvert && !shouldFlip;
 
     if (isInputYUV) {
@@ -921,7 +921,7 @@ HRESULT STDMETHODCALLTYPE ProviderDirectShow::SampleCB(double sampleTime, IMedia
 
             std::chrono::steady_clock::time_point startTime = std::chrono::steady_clock::now();
 
-            zeroCopy = !inplaceConvertFrame(newFrame.get(), m_frameProp.outputPixelFormat, shouldFlip);
+            zeroCopy = !inplaceConvertFrame(newFrame.get(), effectiveOutputFormat, shouldFlip);
 
             double durInMs = (std::chrono::steady_clock::now() - startTime).count() / 1.e6;
             static double s_allCostTime = 0;
@@ -937,10 +937,10 @@ HRESULT STDMETHODCALLTYPE ProviderDirectShow::SampleCB(double sampleTime, IMedia
 
             CCAP_LOG_V(
                 "ccap: inplaceConvertFrame requested pixel format: %s, actual pixel format: %s, flip: %s, cost time %s: (cur %g ms, avg %g ms)\n",
-                pixelFormatToString(m_frameProp.outputPixelFormat).data(), pixelFormatToString(m_frameProp.cameraPixelFormat).data(),
+                pixelFormatToString(effectiveOutputFormat).data(), pixelFormatToString(m_frameProp.cameraPixelFormat).data(),
                 shouldFlip ? "YES" : "NO", mode, durInMs, s_allCostTime / s_frames);
         } else {
-            zeroCopy = !inplaceConvertFrame(newFrame.get(), m_frameProp.outputPixelFormat, shouldFlip);
+            zeroCopy = !inplaceConvertFrame(newFrame.get(), effectiveOutputFormat, shouldFlip);
         }
 
         newFrame->sizeInBytes = newFrame->stride[0] * newFrame->height + (newFrame->stride[1] + newFrame->stride[2]) * newFrame->height / 2;
@@ -1168,7 +1168,7 @@ void ProviderDirectShow::close() {
 bool ProviderDirectShow::start() {
     if (!m_isOpened) return false;
 
     // File mode
 #ifdef CCAP_ENABLE_FILE_PLAYBACK
     if (m_isFileMode && m_fileReader) {
         return m_fileReader->start();
diff --git a/src/ccap_imp_windows_msmf.cpp b/src/ccap_imp_windows_msmf.cpp
index 0cfe58a5..e20abdff 100644
--- a/src/ccap_imp_windows_msmf.cpp
+++ b/src/ccap_imp_windows_msmf.cpp
@@ -747,7 +747,8 @@ void ProviderMSMF::readLoop() {
         newFrame->height = m_activeHeight;
         newFrame->nativeHandle = nullptr;
 
-        bool isOutputYUV = (m_frameProp.outputPixelFormat & kPixelFormatYUVColorBit) != 0;
+        PixelFormat effectiveOutputFormat = (m_frameProp.outputPixelFormat == PixelFormat::Unknown) ? m_activePixelFormat : m_frameProp.outputPixelFormat;
+        bool isOutputYUV = (effectiveOutputFormat & kPixelFormatYUVColorBit) != 0;
         FrameOrientation targetOrientation = isOutputYUV ? FrameOrientation::TopToBottom : m_frameOrientation;
         newFrame->orientation = targetOrientation;
 
@@ -801,8 +802,7 @@ void ProviderMSMF::readLoop() {
         }
 
         bool shouldFlip = !isOutputYUV && targetOrientation != m_inputOrientation;
-        bool shouldConvert = m_frameProp.outputPixelFormat != PixelFormat::Unknown &&
-            newFrame->pixelFormat != m_frameProp.outputPixelFormat;
+        bool shouldConvert = newFrame->pixelFormat != effectiveOutputFormat;
         bool zeroCopy = !shouldConvert && !shouldFlip;
 
         if (!zeroCopy) {
@@ -810,7 +810,7 @@ void ProviderMSMF::readLoop() {
                 newFrame->allocator = m_allocatorFactory ? m_allocatorFactory() : std::make_shared<DefaultAllocator>();
             }
 
-            zeroCopy = !inplaceConvertFrame(newFrame.get(), m_frameProp.outputPixelFormat, shouldFlip);
+            zeroCopy = !inplaceConvertFrame(newFrame.get(), effectiveOutputFormat, shouldFlip);
             newFrame->sizeInBytes = newFrame->stride[0] * newFrame->height +
                 (newFrame->stride[1] + newFrame->stride[2]) * newFrame->height / 2;
         }