From 41137178bbab42419d70c0007e0f70a3cc19df30 Mon Sep 17 00:00:00 2001 From: David Chen Date: Mon, 22 Jun 2026 21:45:01 -0700 Subject: [PATCH 01/24] adding initial livekit-capture crate --- .changeset/livekit-capture-preencoded.md | 8 + Cargo.lock | 8 + Cargo.toml | 2 + libwebrtc/src/native/rtp_sender.rs | 3 + libwebrtc/src/native/video_source.rs | 33 +- libwebrtc/src/rtp_sender.rs | 2 + libwebrtc/src/video_frame.rs | 67 +++ libwebrtc/src/video_source.rs | 7 +- livekit-capture/Cargo.toml | 12 + livekit-capture/README.md | 4 + livekit-capture/src/lib.rs | 537 ++++++++++++++++++ webrtc-sys/build.rs | 2 + .../livekit/encoded_video_frame_buffer.h | 75 +++ .../livekit/passthrough_video_encoder.h | 47 ++ webrtc-sys/include/livekit/video_track.h | 5 + webrtc-sys/src/encoded_video_frame_buffer.cpp | 74 +++ webrtc-sys/src/passthrough_video_encoder.cpp | 244 ++++++++ webrtc-sys/src/rtp_sender.cpp | 5 + webrtc-sys/src/video_encoder_factory.cpp | 12 + webrtc-sys/src/video_track.cpp | 48 ++ webrtc-sys/src/video_track.rs | 32 ++ webrtc-sys/src/webrtc.rs | 1 + 22 files changed, 1226 insertions(+), 2 deletions(-) create mode 100644 .changeset/livekit-capture-preencoded.md create mode 100644 livekit-capture/Cargo.toml create mode 100644 livekit-capture/README.md create mode 100644 livekit-capture/src/lib.rs create mode 100644 webrtc-sys/include/livekit/encoded_video_frame_buffer.h create mode 100644 webrtc-sys/include/livekit/passthrough_video_encoder.h create mode 100644 webrtc-sys/src/encoded_video_frame_buffer.cpp create mode 100644 webrtc-sys/src/passthrough_video_encoder.cpp diff --git a/.changeset/livekit-capture-preencoded.md b/.changeset/livekit-capture-preencoded.md new file mode 100644 index 000000000..ec53f333c --- /dev/null +++ b/.changeset/livekit-capture-preencoded.md @@ -0,0 +1,8 @@ +--- +"livekit-capture": minor +"livekit": patch +"libwebrtc": patch +"webrtc-sys": patch +--- + +Add a `livekit-capture` crate with codec-neutral capture types and pre-encoded H264/H265 
passthrough support. diff --git a/Cargo.lock b/Cargo.lock index aa163b90c..656972ec0 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3882,6 +3882,14 @@ dependencies = [ "url", ] +[[package]] +name = "livekit-capture" +version = "0.1.0" +dependencies = [ + "livekit", + "thiserror 2.0.18", +] + [[package]] name = "livekit-datatrack" version = "0.1.8" diff --git a/Cargo.toml b/Cargo.toml index 9fb63c366..c60cc7050 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -8,6 +8,7 @@ members = [ "livekit-uniffi", "livekit-datatrack", "livekit-ffi-node-bindings", + "livekit-capture", "livekit-runtime", "livekit-wakeword", "libwebrtc", @@ -49,6 +50,7 @@ imgproc = { version = "0.3.19", path = "imgproc" } libwebrtc = { version = "0.3.37", path = "libwebrtc" } livekit = { version = "0.7.47", path = "livekit" } livekit-api = { version = "0.5.2", path = "livekit-api" } +livekit-capture = { version = "0.1.0", path = "livekit-capture" } livekit-ffi = { version = "0.12.65", path = "livekit-ffi" } livekit-datatrack = { version = "0.1.8", path = "livekit-datatrack" } livekit-protocol = { version = "0.7.8", path = "livekit-protocol" } diff --git a/libwebrtc/src/native/rtp_sender.rs b/libwebrtc/src/native/rtp_sender.rs index f58b08484..e90b43a75 100644 --- a/libwebrtc/src/native/rtp_sender.rs +++ b/libwebrtc/src/native/rtp_sender.rs @@ -95,6 +95,7 @@ impl From for sys_webrtc::ffi::VideoEncoderBackend { VideoEncoderBackend::Nvenc => Self::Nvenc, VideoEncoderBackend::Vaapi => Self::Vaapi, VideoEncoderBackend::VideoToolbox => Self::VideoToolbox, + VideoEncoderBackend::PreEncoded => Self::PreEncoded, } } } @@ -108,6 +109,7 @@ impl From for VideoEncoderBackend { sys_webrtc::ffi::VideoEncoderBackend::Nvenc => Self::Nvenc, sys_webrtc::ffi::VideoEncoderBackend::Vaapi => Self::Vaapi, sys_webrtc::ffi::VideoEncoderBackend::VideoToolbox => Self::VideoToolbox, + sys_webrtc::ffi::VideoEncoderBackend::PreEncoded => Self::PreEncoded, _ => panic!("unknown VideoEncoderBackend"), } } @@ -130,6 +132,7 @@ mod tests { 
(VideoEncoderBackend::Nvenc, sys_webrtc::ffi::VideoEncoderBackend::Nvenc), (VideoEncoderBackend::Vaapi, sys_webrtc::ffi::VideoEncoderBackend::Vaapi), (VideoEncoderBackend::VideoToolbox, sys_webrtc::ffi::VideoEncoderBackend::VideoToolbox), + (VideoEncoderBackend::PreEncoded, sys_webrtc::ffi::VideoEncoderBackend::PreEncoded), ]; for (backend, expected) in cases { diff --git a/libwebrtc/src/native/video_source.rs b/libwebrtc/src/native/video_source.rs index 4bbe47fcf..78e55c2a8 100644 --- a/libwebrtc/src/native/video_source.rs +++ b/libwebrtc/src/native/video_source.rs @@ -26,7 +26,7 @@ use webrtc_sys::{video_frame as vf_sys, video_frame::ffi::VideoRotation, video_t use crate::video_frame::FrameMetadata; use crate::{ native::packet_trailer::PacketTrailerHandler, - video_frame::{I420Buffer, VideoBuffer, VideoFrame}, + video_frame::{EncodedVideoFrame, I420Buffer, VideoBuffer, VideoFrame}, video_source::VideoResolution, }; @@ -132,6 +132,37 @@ impl NativeVideoSource { ); } + pub fn capture_encoded_frame(&self, frame: &EncodedVideoFrame<'_>) -> bool { + let (has_trailer, user_ts, fid) = match frame.frame_metadata { + Some(meta) => (true, meta.user_timestamp.unwrap_or(0), meta.frame_id.unwrap_or(0)), + None => (false, 0, 0), + }; + + let capture_ts = if frame.timestamp_us == 0 { + let now = SystemTime::now().duration_since(UNIX_EPOCH).unwrap(); + now.as_micros() as i64 + } else { + frame.timestamp_us + }; + + self.inner.lock().captured_frames += 1; + self.sys_handle.capture_encoded_frame( + frame.width as i32, + frame.height as i32, + &vt_sys::ffi::EncodedVideoFrameData { + codec: frame.codec.into(), + frame_type: frame.frame_type.into(), + payload: frame.payload.to_vec(), + timestamp_us: capture_ts, + }, + &vt_sys::ffi::FrameMetadata { + has_packet_trailer: has_trailer, + user_timestamp: user_ts, + frame_id: fid, + }, + ) + } + /// Captures a Jetson DMA-buffer backed video frame. /// /// `pixel_format` is `0` for NV12 and `1` for YUV420M. 
diff --git a/libwebrtc/src/rtp_sender.rs b/libwebrtc/src/rtp_sender.rs index b16c008c2..1bdf96add 100644 --- a/libwebrtc/src/rtp_sender.rs +++ b/libwebrtc/src/rtp_sender.rs @@ -36,6 +36,8 @@ pub enum VideoEncoderBackend { Vaapi, /// Prefer VideoToolbox on Apple platforms when available. VideoToolbox, + /// Pass pre-encoded frames through without encoding raw video frames. + PreEncoded, } impl VideoEncoderBackend { diff --git a/libwebrtc/src/video_frame.rs b/libwebrtc/src/video_frame.rs index 3f90768a7..608130195 100644 --- a/libwebrtc/src/video_frame.rs +++ b/libwebrtc/src/video_frame.rs @@ -64,6 +64,73 @@ pub struct FrameMetadata { pub frame_id: Option, } +/// Codec carried by a pre-encoded video access unit. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +#[non_exhaustive] +pub enum EncodedVideoCodec { + /// H.264/AVC video. + H264, + /// H.265/HEVC video. + H265, + /// VP8 video. + VP8, + /// VP9 video. + VP9, + /// AV1 video. + AV1, +} + +/// Frame type of a pre-encoded video access unit. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum EncodedFrameType { + /// A key frame. + Key, + /// A delta frame. + Delta, +} + +/// A pre-encoded video access unit ready for passthrough publishing. +#[derive(Debug, Clone)] +pub struct EncodedVideoFrame<'a> { + /// Encoded video codec. + pub codec: EncodedVideoCodec, + /// Encoded access-unit payload. + pub payload: &'a [u8], + /// Capture timestamp in microseconds. + pub timestamp_us: i64, + /// Encoded frame type. + pub frame_type: EncodedFrameType, + /// Encoded frame width in pixels. + pub width: u32, + /// Encoded frame height in pixels. + pub height: u32, + /// Optional metadata to attach through packet trailers. 
+ pub frame_metadata: Option, +} + +#[cfg(not(target_arch = "wasm32"))] +impl From for webrtc_sys::video_track::ffi::EncodedVideoCodec { + fn from(value: EncodedVideoCodec) -> Self { + match value { + EncodedVideoCodec::H264 => Self::H264, + EncodedVideoCodec::H265 => Self::H265, + EncodedVideoCodec::VP8 => Self::VP8, + EncodedVideoCodec::VP9 => Self::VP9, + EncodedVideoCodec::AV1 => Self::AV1, + } + } +} + +#[cfg(not(target_arch = "wasm32"))] +impl From for webrtc_sys::video_track::ffi::EncodedFrameType { + fn from(value: EncodedFrameType) -> Self { + match value { + EncodedFrameType::Key => Self::Key, + EncodedFrameType::Delta => Self::Delta, + } + } +} + #[derive(Debug)] pub struct VideoFrame where diff --git a/libwebrtc/src/video_source.rs b/libwebrtc/src/video_source.rs index e88fd73de..23f7ae62f 100644 --- a/libwebrtc/src/video_source.rs +++ b/libwebrtc/src/video_source.rs @@ -51,7 +51,7 @@ pub mod native { use crate::native::packet_trailer::PacketTrailerHandler; #[cfg(target_os = "linux")] use crate::video_frame::FrameMetadata; - use crate::video_frame::{VideoBuffer, VideoFrame}; + use crate::video_frame::{EncodedVideoFrame, VideoBuffer, VideoFrame}; #[derive(Clone)] pub struct NativeVideoSource { @@ -79,6 +79,11 @@ pub mod native { self.handle.capture_frame(frame) } + /// Captures one pre-encoded video access unit. + pub fn capture_encoded_frame(&self, frame: &EncodedVideoFrame<'_>) -> bool { + self.handle.capture_encoded_frame(frame) + } + /// Captures a Jetson DMA-buffer backed video frame. /// /// `pixel_format` is `0` for NV12 and `1` for YUV420M. 
diff --git a/livekit-capture/Cargo.toml b/livekit-capture/Cargo.toml new file mode 100644 index 000000000..a1c18c19a --- /dev/null +++ b/livekit-capture/Cargo.toml @@ -0,0 +1,12 @@ +[package] +name = "livekit-capture" +description = "Capture sources and pre-encoded video publishing helpers for LiveKit" +version = "0.1.0" +readme = "README.md" +license.workspace = true +edition.workspace = true +repository.workspace = true + +[dependencies] +livekit = { workspace = true } +thiserror = { workspace = true } diff --git a/livekit-capture/README.md b/livekit-capture/README.md new file mode 100644 index 000000000..4fa7a8722 --- /dev/null +++ b/livekit-capture/README.md @@ -0,0 +1,4 @@ +# livekit-capture + +Capture helpers for publishing decoded, DMA-BUF, and pre-encoded video frames +with the LiveKit Rust SDK. diff --git a/livekit-capture/src/lib.rs b/livekit-capture/src/lib.rs new file mode 100644 index 000000000..5863e4eaf --- /dev/null +++ b/livekit-capture/src/lib.rs @@ -0,0 +1,537 @@ +// Copyright 2026 LiveKit, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use livekit::{ + options::{TrackPublishOptions, VideoCodec, VideoEncoderBackend}, + prelude::LocalVideoTrack, + webrtc::{ + video_frame::{ + EncodedFrameType as RtcEncodedFrameType, EncodedVideoCodec as RtcEncodedVideoCodec, + EncodedVideoFrame, FrameMetadata, VideoBuffer, VideoFrame, + }, + video_source::{native::NativeVideoSource, RtcVideoSource, VideoResolution}, + }, +}; +use thiserror::Error; + +const ANNEX_B_START_CODE: [u8; 4] = [0, 0, 0, 1]; + +/// Encoded video codec carried by an [`EncodedAccessUnit`]. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +#[non_exhaustive] +pub enum EncodedVideoCodec { + /// H.264/AVC video. + H264, + /// H.265/HEVC video. + H265, + /// VP8 video. + VP8, + /// VP9 video. + VP9, + /// AV1 video. + AV1, +} + +/// Encoded video frame type. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum EncodedFrameType { + /// A key frame. + Key, + /// A delta frame. + Delta, +} + +/// Layer identifiers associated with an encoded frame. +#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)] +pub struct EncodedLayerInfo { + /// Spatial layer index, when present. + pub spatial_id: Option, + /// Temporal layer index, when present. + pub temporal_id: Option, +} + +/// Packet-trailer metadata associated with an encoded frame. +#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)] +pub struct EncodedFrameMetadata { + /// Wall-clock capture timestamp in microseconds. + pub user_timestamp: Option, + /// Monotonically increasing frame identifier. + pub frame_id: Option, +} + +/// H.264 packetization mode for passthrough metadata. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum H264PacketizationMode { + /// Non-interleaved packetization mode. + NonInterleaved, +} + +/// Codec-specific metadata for encoded passthrough. +#[derive(Debug, Clone, PartialEq, Eq)] +#[non_exhaustive] +pub enum CodecSpecific { + /// No codec-specific metadata. + None, + /// H.264-specific metadata. + H264 { + /// H.264 RTP packetization mode. 
+ packetization_mode: H264PacketizationMode, + }, + /// H.265-specific metadata. + H265, + /// VP8-specific metadata. + VP8 { + /// Temporal layer index, when present. + temporal_id: Option, + /// Whether this frame synchronizes a temporal layer. + layer_sync: bool, + }, + /// VP9-specific metadata. + VP9 { + /// Temporal layer index, when present. + temporal_id: Option, + /// Spatial layer index, when present. + spatial_id: Option, + /// Whether this frame depends on an inter-layer reference. + inter_layer_predicted: Option, + }, + /// AV1-specific metadata. + AV1 { + /// RTP scalability mode, such as `L1T1`. + scalability_mode: Option, + /// Encoded dependency descriptor bytes, when supplied by the caller. + dependency_descriptor: Option>, + }, +} + +impl Default for CodecSpecific { + fn default() -> Self { + Self::None + } +} + +/// Borrowed encoded payload fragment. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct EncodedFragment<'a> { + /// Encoded fragment bytes. + pub bytes: &'a [u8], +} + +/// Encoded access-unit payload. +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum EncodedPayload<'a> { + /// One contiguous payload buffer. + Contiguous(&'a [u8]), + /// Multiple payload fragments. + Fragments(&'a [EncodedFragment<'a>]), + /// Owned payload bytes. 
+ Owned(Vec), +} + +impl EncodedPayload<'_> { + fn is_empty(&self) -> bool { + match self { + Self::Contiguous(bytes) => bytes.is_empty(), + Self::Fragments(fragments) => { + fragments.is_empty() || fragments.iter().any(|fragment| fragment.bytes.is_empty()) + } + Self::Owned(bytes) => bytes.is_empty(), + } + } + + fn to_vec(&self) -> Vec { + match self { + Self::Contiguous(bytes) => bytes.to_vec(), + Self::Fragments(fragments) => { + let len = fragments.iter().map(|fragment| fragment.bytes.len()).sum(); + let mut payload = Vec::with_capacity(len); + for fragment in *fragments { + payload.extend_from_slice(fragment.bytes); + } + payload + } + Self::Owned(bytes) => bytes.clone(), + } + } +} + +/// One encoded video access unit. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct EncodedAccessUnit<'a> { + /// Encoded codec. + pub codec: EncodedVideoCodec, + /// Encoded payload. + pub payload: EncodedPayload<'a>, + /// Capture timestamp in microseconds. + pub timestamp_us: i64, + /// Encoded frame type. + pub frame_type: EncodedFrameType, + /// Encoded frame width in pixels. + pub width: u32, + /// Encoded frame height in pixels. + pub height: u32, + /// Optional layer identifiers. + pub layers: EncodedLayerInfo, + /// Optional codec-specific metadata. + pub codec_specific: CodecSpecific, + /// Optional packet-trailer metadata. + pub metadata: EncodedFrameMetadata, +} + +impl<'a> EncodedAccessUnit<'a> { + /// Creates an access unit from one contiguous payload. + pub fn contiguous( + codec: EncodedVideoCodec, + payload: &'a [u8], + timestamp_us: i64, + frame_type: EncodedFrameType, + width: u32, + height: u32, + ) -> Self { + Self { + codec, + payload: EncodedPayload::Contiguous(payload), + timestamp_us, + frame_type, + width, + height, + layers: EncodedLayerInfo::default(), + codec_specific: CodecSpecific::None, + metadata: EncodedFrameMetadata::default(), + } + } + + /// Creates an H.264 access unit from raw NAL-unit payloads. 
+ pub fn from_h264_nalus( + nal_units: &[&[u8]], + timestamp_us: i64, + width: u32, + height: u32, + ) -> Result, CaptureError> { + let mut is_key = false; + for nal in nal_units { + let nal_type = h264_nal_type(nal)?; + if nal_type == 5 { + is_key = true; + } + } + + Ok(EncodedAccessUnit { + codec: EncodedVideoCodec::H264, + payload: EncodedPayload::Owned(annex_b_payload(nal_units)?), + timestamp_us, + frame_type: if is_key { EncodedFrameType::Key } else { EncodedFrameType::Delta }, + width, + height, + layers: EncodedLayerInfo::default(), + codec_specific: CodecSpecific::H264 { + packetization_mode: H264PacketizationMode::NonInterleaved, + }, + metadata: EncodedFrameMetadata::default(), + }) + } + + /// Creates an H.265 access unit from raw NAL-unit payloads. + pub fn from_h265_nalus( + nal_units: &[&[u8]], + timestamp_us: i64, + width: u32, + height: u32, + ) -> Result, CaptureError> { + let mut is_key = false; + for nal in nal_units { + let nal_type = h265_nal_type(nal)?; + if (16..=21).contains(&nal_type) { + is_key = true; + } + } + + Ok(EncodedAccessUnit { + codec: EncodedVideoCodec::H265, + payload: EncodedPayload::Owned(annex_b_payload(nal_units)?), + timestamp_us, + frame_type: if is_key { EncodedFrameType::Key } else { EncodedFrameType::Delta }, + width, + height, + layers: EncodedLayerInfo::default(), + codec_specific: CodecSpecific::H265, + metadata: EncodedFrameMetadata::default(), + }) + } +} + +/// DMA-BUF pixel format. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum DmaBufPixelFormat { + /// NV12 biplanar format. + Nv12, + /// YUV420M multiplanar format. + Yuv420M, +} + +impl DmaBufPixelFormat { + #[cfg(target_os = "linux")] + fn as_native(self) -> i32 { + match self { + Self::Nv12 => 0, + Self::Yuv420M => 1, + } + } +} + +/// One DMA-BUF plane descriptor. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct DmaBufPlane { + /// DMA-BUF file descriptor. + pub fd: i32, + /// Plane byte offset. + pub offset: u32, + /// Plane byte stride. 
+ pub stride: u32, +} + +/// One DMA-BUF backed captured frame. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct DmaBufFrame { + /// Frame width in pixels. + pub width: u32, + /// Frame height in pixels. + pub height: u32, + /// Pixel format. + pub pixel_format: DmaBufPixelFormat, + /// DMA-BUF planes. + pub planes: Vec, + /// Optional DRM format modifier. + pub modifier: Option, + /// Capture timestamp in microseconds. + pub timestamp_us: i64, + /// Optional packet-trailer metadata. + pub metadata: EncodedFrameMetadata, +} + +/// Capture source backed by a LiveKit local video track. +#[derive(Debug, Clone)] +pub struct VideoCaptureTrack { + source: NativeVideoSource, + track: LocalVideoTrack, +} + +impl VideoCaptureTrack { + /// Creates a capture track with the supplied resolution. + pub fn new(name: &str, resolution: VideoResolution, is_screencast: bool) -> Self { + let source = NativeVideoSource::new(resolution, is_screencast); + let track = + LocalVideoTrack::create_video_track(name, RtcVideoSource::Native(source.clone())); + Self { source, track } + } + + /// Returns the publishable local video track. + pub fn track(&self) -> LocalVideoTrack { + self.track.clone() + } + + /// Captures one decoded video frame. + pub fn capture_frame>(&self, frame: &VideoFrame) { + self.source.capture_frame(frame); + } + + /// Captures one DMA-BUF backed frame. + #[cfg(target_os = "linux")] + pub fn capture_dmabuf(&self, frame: &DmaBufFrame) -> Result<(), CaptureError> { + let plane = frame.planes.first().ok_or(CaptureError::MissingDmaBufPlane)?; + let ok = self.source.capture_dmabuf_frame_with_metadata( + plane.fd, + frame.width, + frame.height, + frame.pixel_format.as_native(), + frame.timestamp_us, + frame.metadata.into_frame_metadata(), + ); + ok.then_some(()).ok_or(CaptureError::CaptureFailed) + } + + /// Captures one encoded video access unit. 
+ pub fn capture_encoded(&self, access_unit: &EncodedAccessUnit<'_>) -> Result<(), CaptureError> { + match access_unit.codec { + EncodedVideoCodec::H264 | EncodedVideoCodec::H265 => {} + EncodedVideoCodec::VP8 | EncodedVideoCodec::VP9 | EncodedVideoCodec::AV1 => { + return Err(CaptureError::UnsupportedCodec(access_unit.codec)); + } + } + if access_unit.payload.is_empty() { + return Err(CaptureError::EmptyPayload); + } + + let payload = access_unit.payload.to_vec(); + let frame = EncodedVideoFrame { + codec: access_unit.codec.into(), + payload: &payload, + timestamp_us: access_unit.timestamp_us, + frame_type: access_unit.frame_type.into(), + width: access_unit.width, + height: access_unit.height, + frame_metadata: access_unit.metadata.into_frame_metadata(), + }; + self.source.capture_encoded_frame(&frame).then_some(()).ok_or(CaptureError::CaptureFailed) + } + + /// Returns publish options appropriate for encoded passthrough. + pub fn encoded_publish_options(codec: EncodedVideoCodec) -> TrackPublishOptions { + TrackPublishOptions { + video_codec: codec.into(), + video_encoder: VideoEncoderBackend::PreEncoded, + simulcast: false, + ..Default::default() + } + } +} + +/// Error returned by capture helpers. +#[derive(Debug, Error, PartialEq, Eq)] +pub enum CaptureError { + /// Encoded payload is empty. + #[error("encoded payload is empty")] + EmptyPayload, + /// H.265 NAL unit is too short to contain its header. + #[error("H.265 NAL unit is too short")] + H265NalTooShort, + /// DMA-BUF frame did not include any planes. + #[error("DMA-BUF frame did not include any planes")] + MissingDmaBufPlane, + /// Codec is represented by the API but not yet supported by native passthrough. + #[error("encoded passthrough does not support {0:?} yet")] + UnsupportedCodec(EncodedVideoCodec), + /// The underlying source rejected the frame. 
+ #[error("capture source rejected the frame")] + CaptureFailed, +} + +impl EncodedFrameMetadata { + fn into_frame_metadata(self) -> Option { + (self.user_timestamp.is_some() || self.frame_id.is_some()).then_some(FrameMetadata { + user_timestamp: self.user_timestamp, + frame_id: self.frame_id, + }) + } +} + +impl From for VideoCodec { + fn from(value: EncodedVideoCodec) -> Self { + match value { + EncodedVideoCodec::H264 => Self::H264, + EncodedVideoCodec::H265 => Self::H265, + EncodedVideoCodec::VP8 => Self::VP8, + EncodedVideoCodec::VP9 => Self::VP9, + EncodedVideoCodec::AV1 => Self::AV1, + } + } +} + +impl From for RtcEncodedVideoCodec { + fn from(value: EncodedVideoCodec) -> Self { + match value { + EncodedVideoCodec::H264 => Self::H264, + EncodedVideoCodec::H265 => Self::H265, + EncodedVideoCodec::VP8 => Self::VP8, + EncodedVideoCodec::VP9 => Self::VP9, + EncodedVideoCodec::AV1 => Self::AV1, + } + } +} + +impl From for RtcEncodedFrameType { + fn from(value: EncodedFrameType) -> Self { + match value { + EncodedFrameType::Key => Self::Key, + EncodedFrameType::Delta => Self::Delta, + } + } +} + +fn h264_nal_type(nal: &[u8]) -> Result { + let header = nal.first().ok_or(CaptureError::EmptyPayload)?; + Ok(header & 0x1f) +} + +fn h265_nal_type(nal: &[u8]) -> Result { + if nal.is_empty() { + return Err(CaptureError::EmptyPayload); + } + if nal.len() < 2 { + return Err(CaptureError::H265NalTooShort); + } + Ok((nal[0] >> 1) & 0x3f) +} + +fn annex_b_payload(nal_units: &[&[u8]]) -> Result, CaptureError> { + if nal_units.is_empty() { + return Err(CaptureError::EmptyPayload); + } + let len = nal_units.iter().try_fold(0usize, |len, nal| { + if nal.is_empty() { + Err(CaptureError::EmptyPayload) + } else { + Ok(len + ANNEX_B_START_CODE.len() + nal.len()) + } + })?; + + let mut payload = Vec::with_capacity(len); + for nal in nal_units { + payload.extend_from_slice(&ANNEX_B_START_CODE); + payload.extend_from_slice(nal); + } + Ok(payload) +} + +#[cfg(test)] +mod tests { + use 
super::*; + + #[test] + fn h264_nal_helper_assembles_annex_b_and_detects_keyframe() { + let sps = [0x67, 1, 2, 3]; + let idr = [0x65, 4, 5, 6]; + let au = EncodedAccessUnit::from_h264_nalus(&[&sps, &idr], 10, 640, 480).unwrap(); + + assert_eq!(au.codec, EncodedVideoCodec::H264); + assert_eq!(au.frame_type, EncodedFrameType::Key); + assert_eq!( + au.payload, + EncodedPayload::Owned(vec![0, 0, 0, 1, 0x67, 1, 2, 3, 0, 0, 0, 1, 0x65, 4, 5, 6]) + ); + } + + #[test] + fn h265_nal_helper_detects_irap_keyframe() { + let vps = [0x40, 1, 2]; + let idr_w_radl = [19 << 1, 1, 3]; + let au = EncodedAccessUnit::from_h265_nalus(&[&vps, &idr_w_radl], 10, 640, 480).unwrap(); + + assert_eq!(au.codec, EncodedVideoCodec::H265); + assert_eq!(au.frame_type, EncodedFrameType::Key); + } + + #[test] + fn h265_rejects_too_short_nal_header() { + let err = EncodedAccessUnit::from_h265_nalus(&[&[0x26]], 10, 640, 480).unwrap_err(); + assert_eq!(err, CaptureError::H265NalTooShort); + } + + #[test] + fn fragments_reject_empty_fragment() { + let fragments = [EncodedFragment { bytes: &[1] }, EncodedFragment { bytes: &[] }]; + let payload = EncodedPayload::Fragments(&fragments); + assert!(payload.is_empty()); + } +} diff --git a/webrtc-sys/build.rs b/webrtc-sys/build.rs index 6115c231d..d4d559db1 100644 --- a/webrtc-sys/build.rs +++ b/webrtc-sys/build.rs @@ -84,7 +84,9 @@ fn main() { "src/video_frame.cpp", "src/video_frame_buffer.cpp", "src/dmabuf_video_frame_buffer.cpp", + "src/encoded_video_frame_buffer.cpp", "src/video_encoder_factory.cpp", + "src/passthrough_video_encoder.cpp", "src/video_decoder_factory.cpp", "src/synthetic_audio_device.cpp", "src/adm_proxy.cpp", diff --git a/webrtc-sys/include/livekit/encoded_video_frame_buffer.h b/webrtc-sys/include/livekit/encoded_video_frame_buffer.h new file mode 100644 index 000000000..f5bec6665 --- /dev/null +++ b/webrtc-sys/include/livekit/encoded_video_frame_buffer.h @@ -0,0 +1,75 @@ +/* + * Copyright 2026 LiveKit, Inc. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +#include "api/video/video_frame_buffer.h" + +namespace livekit { + +enum class EncodedVideoCodec { + kH264, + kH265, + kVP8, + kVP9, + kAV1, +}; + +enum class EncodedFrameType { + kKey, + kDelta, +}; + +// A native WebRTC frame buffer carrying one encoded video access unit. +class EncodedVideoFrameBuffer : public webrtc::VideoFrameBuffer { + public: + EncodedVideoFrameBuffer(int width, + int height, + EncodedVideoCodec codec, + EncodedFrameType frame_type, + std::vector payload); + ~EncodedVideoFrameBuffer() override = default; + + Type type() const override; + int width() const override; + int height() const override; + webrtc::scoped_refptr ToI420() override; + webrtc::scoped_refptr CropAndScale( + int offset_x, + int offset_y, + int crop_width, + int crop_height, + int scaled_width, + int scaled_height) override; + + EncodedVideoCodec codec() const { return codec_; } + EncodedFrameType frame_type() const { return frame_type_; } + const std::vector& payload() const { return payload_; } + + static EncodedVideoFrameBuffer* FromNative(webrtc::VideoFrameBuffer* buffer); + + private: + int width_; + int height_; + EncodedVideoCodec codec_; + EncodedFrameType frame_type_; + std::vector payload_; +}; + +} // namespace livekit diff --git a/webrtc-sys/include/livekit/passthrough_video_encoder.h b/webrtc-sys/include/livekit/passthrough_video_encoder.h new file mode 100644 
index 000000000..a50f9f63a --- /dev/null +++ b/webrtc-sys/include/livekit/passthrough_video_encoder.h @@ -0,0 +1,47 @@ +/* + * Copyright 2026 LiveKit, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +#include "api/environment/environment.h" +#include "api/video_codecs/sdp_video_format.h" +#include "api/video_codecs/video_encoder.h" +#include "api/video_codecs/video_encoder_factory.h" + +namespace livekit_ffi { + +class PassthroughVideoEncoderFactory : public webrtc::VideoEncoderFactory { + public: + PassthroughVideoEncoderFactory(); + ~PassthroughVideoEncoderFactory() override = default; + + std::vector GetSupportedFormats() const override; + std::vector GetImplementations() const override; + CodecSupport QueryCodecSupport( + const webrtc::SdpVideoFormat& format, + std::optional scalability_mode) const override; + std::unique_ptr Create( + const webrtc::Environment& env, + const webrtc::SdpVideoFormat& format) override; + + private: + std::vector supported_formats_; +}; + +} // namespace livekit_ffi diff --git a/webrtc-sys/include/livekit/video_track.h b/webrtc-sys/include/livekit/video_track.h index 5520cf2ec..fc1d614fd 100644 --- a/webrtc-sys/include/livekit/video_track.h +++ b/webrtc-sys/include/livekit/video_track.h @@ -132,6 +132,11 @@ class VideoTrackSource { int64_t timestamp_us, const FrameMetadata& frame_metadata) const; + bool capture_encoded_frame(int width, + int height, + const 
EncodedVideoFrameData& frame, + const FrameMetadata& frame_metadata) const; + void set_packet_trailer_handler( std::shared_ptr handler) const; diff --git a/webrtc-sys/src/encoded_video_frame_buffer.cpp b/webrtc-sys/src/encoded_video_frame_buffer.cpp new file mode 100644 index 000000000..c5d321fd0 --- /dev/null +++ b/webrtc-sys/src/encoded_video_frame_buffer.cpp @@ -0,0 +1,74 @@ +/* + * Copyright 2026 LiveKit, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "livekit/encoded_video_frame_buffer.h" + +#include + +#include "rtc_base/logging.h" + +namespace livekit { + +EncodedVideoFrameBuffer::EncodedVideoFrameBuffer( + int width, + int height, + EncodedVideoCodec codec, + EncodedFrameType frame_type, + std::vector payload) + : width_(width), + height_(height), + codec_(codec), + frame_type_(frame_type), + payload_(std::move(payload)) {} + +webrtc::VideoFrameBuffer::Type EncodedVideoFrameBuffer::type() const { + return Type::kNative; +} + +int EncodedVideoFrameBuffer::width() const { + return width_; +} + +int EncodedVideoFrameBuffer::height() const { + return height_; +} + +webrtc::scoped_refptr +EncodedVideoFrameBuffer::ToI420() { + RTC_LOG(LS_ERROR) << "EncodedVideoFrameBuffer::ToI420 is unsupported"; + return nullptr; +} + +webrtc::scoped_refptr +EncodedVideoFrameBuffer::CropAndScale(int /* offset_x */, + int /* offset_y */, + int /* crop_width */, + int /* crop_height */, + int /* scaled_width */, + int /* scaled_height */) { + 
RTC_LOG(LS_ERROR) << "EncodedVideoFrameBuffer::CropAndScale is unsupported"; + return nullptr; +} + +EncodedVideoFrameBuffer* EncodedVideoFrameBuffer::FromNative( + webrtc::VideoFrameBuffer* buffer) { + if (!buffer || buffer->type() != webrtc::VideoFrameBuffer::Type::kNative) { + return nullptr; + } + return dynamic_cast(buffer); +} + +} // namespace livekit diff --git a/webrtc-sys/src/passthrough_video_encoder.cpp b/webrtc-sys/src/passthrough_video_encoder.cpp new file mode 100644 index 000000000..117539a48 --- /dev/null +++ b/webrtc-sys/src/passthrough_video_encoder.cpp @@ -0,0 +1,244 @@ +/* + * Copyright 2026 LiveKit, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "livekit/passthrough_video_encoder.h" + +#include +#include +#include +#include +#include +#include + +#include "api/video/encoded_image.h" +#include "api/video/video_frame.h" +#include "api/video_codecs/video_encoder.h" +#include "common_video/h264/h264_common.h" +#include "livekit/encoded_video_frame_buffer.h" +#include "media/base/media_constants.h" +#include "modules/video_coding/include/video_codec_interface.h" +#include "modules/video_coding/include/video_error_codes.h" +#include "rtc_base/logging.h" + +namespace livekit_ffi { +namespace { + +using livekit::EncodedVideoFrameBuffer; +using webrtc::CodecSpecificInfo; +using webrtc::EncodedImage; +using webrtc::EncodedImageBuffer; +using webrtc::EncodedImageCallback; +using webrtc::Environment; +using webrtc::H264PacketizationMode; +using webrtc::SdpVideoFormat; +using webrtc::VideoCodec; +using webrtc::VideoCodecType; +using webrtc::VideoEncoder; +using webrtc::VideoFrame; +using webrtc::VideoFrameBuffer; +using webrtc::VideoFrameType; + +VideoCodecType CodecTypeFromFormat(const SdpVideoFormat& format) { + if (format.name == "H264") { + return webrtc::kVideoCodecH264; + } + if (format.name == "H265" || format.name == "HEVC") { + return webrtc::kVideoCodecH265; + } + return webrtc::kVideoCodecGeneric; +} + +VideoCodecType CodecTypeFromBuffer(livekit::EncodedVideoCodec codec) { + switch (codec) { + case livekit::EncodedVideoCodec::kH264: + return webrtc::kVideoCodecH264; + case livekit::EncodedVideoCodec::kH265: + return webrtc::kVideoCodecH265; + case livekit::EncodedVideoCodec::kVP8: + return webrtc::kVideoCodecVP8; + case livekit::EncodedVideoCodec::kVP9: + return webrtc::kVideoCodecVP9; + case livekit::EncodedVideoCodec::kAV1: + return webrtc::kVideoCodecAV1; + } +} + +VideoFrameType FrameTypeFromBuffer(livekit::EncodedFrameType frame_type) { + switch (frame_type) { + case livekit::EncodedFrameType::kKey: + return VideoFrameType::kVideoFrameKey; + case livekit::EncodedFrameType::kDelta: + 
return VideoFrameType::kVideoFrameDelta; + } +} + +class PassthroughVideoEncoder final : public VideoEncoder { + public: + PassthroughVideoEncoder(const Environment& env, const SdpVideoFormat& format) + : env_(env), format_(format), codec_type_(CodecTypeFromFormat(format)) {} + + int32_t InitEncode(const VideoCodec* codec_settings, + const Settings& /* settings */) override { + if (!codec_settings || codec_settings->codecType != codec_type_) { + return WEBRTC_VIDEO_CODEC_ERR_PARAMETER; + } + codec_ = *codec_settings; + return WEBRTC_VIDEO_CODEC_OK; + } + + int32_t RegisterEncodeCompleteCallback( + EncodedImageCallback* callback) override { + encoded_image_callback_ = callback; + return WEBRTC_VIDEO_CODEC_OK; + } + + int32_t Release() override { + encoded_image_callback_ = nullptr; + return WEBRTC_VIDEO_CODEC_OK; + } + + int32_t Encode(const VideoFrame& frame, + const std::vector* /* frame_types */) override { + if (!encoded_image_callback_) { + RTC_LOG(LS_ERROR) + << "PassthroughVideoEncoder callback is not registered"; + return WEBRTC_VIDEO_CODEC_UNINITIALIZED; + } + + webrtc::scoped_refptr frame_buffer = + frame.video_frame_buffer(); + EncodedVideoFrameBuffer* encoded_buffer = + EncodedVideoFrameBuffer::FromNative(frame_buffer.get()); + if (!encoded_buffer) { + RTC_LOG(LS_ERROR) + << "PassthroughVideoEncoder received a non-encoded frame buffer"; + return WEBRTC_VIDEO_CODEC_ERR_PARAMETER; + } + + if (CodecTypeFromBuffer(encoded_buffer->codec()) != codec_type_) { + RTC_LOG(LS_ERROR) + << "PassthroughVideoEncoder frame codec does not match sender codec"; + return WEBRTC_VIDEO_CODEC_ERR_PARAMETER; + } + + const std::vector& payload = encoded_buffer->payload(); + if (payload.empty()) { + RTC_LOG(LS_ERROR) << "PassthroughVideoEncoder received an empty frame"; + return WEBRTC_VIDEO_CODEC_ERR_PARAMETER; + } + + EncodedImage encoded_image; + encoded_image._encodedWidth = encoded_buffer->width(); + encoded_image._encodedHeight = encoded_buffer->height(); + 
encoded_image.SetRtpTimestamp(frame.rtp_timestamp()); + encoded_image.SetSimulcastIndex(0); + encoded_image.ntp_time_ms_ = frame.ntp_time_ms(); + encoded_image.capture_time_ms_ = frame.render_time_ms(); + encoded_image.rotation_ = frame.rotation(); + encoded_image.content_type_ = webrtc::VideoContentType::UNSPECIFIED; + encoded_image.timing_.flags = webrtc::VideoSendTiming::kInvalid; + encoded_image._frameType = FrameTypeFromBuffer(encoded_buffer->frame_type()); + encoded_image.SetColorSpace(frame.color_space()); + encoded_image.SetEncodedData( + EncodedImageBuffer::Create(payload.data(), payload.size())); + encoded_image.set_size(payload.size()); + encoded_image.qp_ = -1; + + CodecSpecificInfo codec_info; + codec_info.codecType = codec_type_; + if (codec_type_ == webrtc::kVideoCodecH264) { + codec_info.codecSpecific.H264.packetization_mode = + H264PacketizationMode::NonInterleaved; + } + + const auto result = + encoded_image_callback_->OnEncodedImage(encoded_image, &codec_info); + if (result.error != EncodedImageCallback::Result::OK) { + RTC_LOG(LS_ERROR) << "PassthroughVideoEncoder callback failed " + << result.error; + return WEBRTC_VIDEO_CODEC_ERROR; + } + return WEBRTC_VIDEO_CODEC_OK; + } + + void SetRates(const RateControlParameters& /* parameters */) override {} + + EncoderInfo GetEncoderInfo() const override { + EncoderInfo info; + info.supports_native_handle = true; + info.implementation_name = "LiveKit pre-encoded passthrough"; + info.scaling_settings = VideoEncoder::ScalingSettings::kOff; + info.is_hardware_accelerated = false; + info.supports_simulcast = false; + info.preferred_pixel_formats = {VideoFrameBuffer::Type::kNative}; + return info; + } + + private: + Environment env_; + SdpVideoFormat format_; + VideoCodecType codec_type_; + VideoCodec codec_; + EncodedImageCallback* encoded_image_callback_ = nullptr; +}; + +} // namespace + +PassthroughVideoEncoderFactory::PassthroughVideoEncoderFactory() { + std::map h264_parameters = { + 
{"profile-level-id", "42e01f"}, + {"level-asymmetry-allowed", "1"}, + {"packetization-mode", "1"}, + }; + supported_formats_.push_back(SdpVideoFormat("H264", h264_parameters)); + supported_formats_.push_back(SdpVideoFormat("H265")); + supported_formats_.push_back(SdpVideoFormat("HEVC")); +} + +std::vector +PassthroughVideoEncoderFactory::GetSupportedFormats() const { + return supported_formats_; +} + +std::vector +PassthroughVideoEncoderFactory::GetImplementations() const { + return supported_formats_; +} + +PassthroughVideoEncoderFactory::CodecSupport +PassthroughVideoEncoderFactory::QueryCodecSupport( + const SdpVideoFormat& format, + std::optional /* scalability_mode */) const { + for (const auto& supported_format : supported_formats_) { + if (format.IsSameCodec(supported_format)) { + return {.is_supported = true, .is_power_efficient = true}; + } + } + return {.is_supported = false, .is_power_efficient = false}; +} + +std::unique_ptr PassthroughVideoEncoderFactory::Create( + const Environment& env, + const SdpVideoFormat& format) { + for (const auto& supported_format : supported_formats_) { + if (format.IsSameCodec(supported_format)) { + return std::make_unique(env, supported_format); + } + } + return nullptr; +} + +} // namespace livekit_ffi diff --git a/webrtc-sys/src/rtp_sender.cpp b/webrtc-sys/src/rtp_sender.cpp index cb7809d4a..1351c83dc 100644 --- a/webrtc-sys/src/rtp_sender.cpp +++ b/webrtc-sys/src/rtp_sender.cpp @@ -46,6 +46,8 @@ const char* BackendName(VideoEncoderBackend backend) { return "vaapi"; case VideoEncoderBackend::VideoToolbox: return "videotoolbox"; + case VideoEncoderBackend::PreEncoded: + return "preencoded"; } } @@ -71,6 +73,9 @@ std::optional BackendFromFormat( if (it->second == BackendName(VideoEncoderBackend::VideoToolbox)) { return VideoEncoderBackend::VideoToolbox; } + if (it->second == BackendName(VideoEncoderBackend::PreEncoded)) { + return VideoEncoderBackend::PreEncoded; + } return std::nullopt; } diff --git 
a/webrtc-sys/src/video_encoder_factory.cpp b/webrtc-sys/src/video_encoder_factory.cpp index 98505813e..4d3f4ca49 100644 --- a/webrtc-sys/src/video_encoder_factory.cpp +++ b/webrtc-sys/src/video_encoder_factory.cpp @@ -26,6 +26,7 @@ #include "api/video_codecs/video_encoder.h" #include "api/video_codecs/video_encoder_factory_template.h" #include "livekit/objc_video_factory.h" +#include "livekit/passthrough_video_encoder.h" #include "livekit/webrtc.h" #include "media/base/media_constants.h" #include "media/engine/simulcast_encoder_adapter.h" @@ -107,6 +108,8 @@ const char* BackendName(VideoEncoderBackend backend) { return "vaapi"; case VideoEncoderBackend::VideoToolbox: return "videotoolbox"; + case VideoEncoderBackend::PreEncoded: + return "preencoded"; } } @@ -132,6 +135,9 @@ std::optional BackendFromFormat( if (it->second == BackendName(VideoEncoderBackend::VideoToolbox)) { return VideoEncoderBackend::VideoToolbox; } + if (it->second == BackendName(VideoEncoderBackend::PreEncoded)) { + return VideoEncoderBackend::PreEncoded; + } return std::nullopt; } @@ -256,6 +262,7 @@ rust::Vec video_encoder_backend_list() { rust::Vec backends; backends.push_back(VideoEncoderBackend::Auto); backends.push_back(VideoEncoderBackend::Software); + backends.push_back(VideoEncoderBackend::PreEncoded); bool has_hardware_backend = false; bool hardware_backend_listed = false; @@ -299,6 +306,11 @@ rust::Vec video_encoder_backend_list() { } VideoEncoderFactory::InternalFactory::InternalFactory() { + AddBackendFactory( + factories_, + VideoEncoderBackend::PreEncoded, + std::make_unique()); + #ifdef __APPLE__ AddBackendFactory( factories_, diff --git a/webrtc-sys/src/video_track.cpp b/webrtc-sys/src/video_track.cpp index 4a25ab1ef..af8866b40 100644 --- a/webrtc-sys/src/video_track.cpp +++ b/webrtc-sys/src/video_track.cpp @@ -27,6 +27,7 @@ #include "audio/remix_resample.h" #include "common_audio/include/audio_util.h" #include "livekit/dmabuf_video_frame_buffer.h" +#include 
"livekit/encoded_video_frame_buffer.h" #include "livekit/media_stream.h" #include "livekit/packet_trailer.h" #include "livekit/video_track.h" @@ -38,6 +39,33 @@ #include "webrtc-sys/src/video_track.rs.h" namespace livekit_ffi { +namespace { + +livekit::EncodedVideoCodec ToNativeEncodedCodec(EncodedVideoCodec codec) { + switch (codec) { + case EncodedVideoCodec::H264: + return livekit::EncodedVideoCodec::kH264; + case EncodedVideoCodec::H265: + return livekit::EncodedVideoCodec::kH265; + case EncodedVideoCodec::VP8: + return livekit::EncodedVideoCodec::kVP8; + case EncodedVideoCodec::VP9: + return livekit::EncodedVideoCodec::kVP9; + case EncodedVideoCodec::AV1: + return livekit::EncodedVideoCodec::kAV1; + } +} + +livekit::EncodedFrameType ToNativeEncodedFrameType(EncodedFrameType frame_type) { + switch (frame_type) { + case EncodedFrameType::Key: + return livekit::EncodedFrameType::kKey; + case EncodedFrameType::Delta: + return livekit::EncodedFrameType::kDelta; + } +} + +} // namespace VideoTrack::VideoTrack(std::shared_ptr rtc_runtime, webrtc::scoped_refptr track) @@ -244,6 +272,26 @@ bool VideoTrackSource::capture_dmabuf_frame(int dmabuf_fd, return source_->on_captured_frame(frame, frame_metadata); } +bool VideoTrackSource::capture_encoded_frame( + int width, + int height, + const EncodedVideoFrameData& encoded_frame, + const FrameMetadata& frame_metadata) const { + auto buffer = webrtc::make_ref_counted( + width, height, ToNativeEncodedCodec(encoded_frame.codec), + ToNativeEncodedFrameType(encoded_frame.frame_type), + std::vector(encoded_frame.payload.begin(), + encoded_frame.payload.end())); + + auto frame = webrtc::VideoFrame::Builder() + .set_video_frame_buffer(std::move(buffer)) + .set_rotation(webrtc::kVideoRotation_0) + .set_timestamp_us(encoded_frame.timestamp_us) + .build(); + + return source_->on_captured_frame(frame, frame_metadata); +} + void VideoTrackSource::set_packet_trailer_handler( std::shared_ptr handler) const { 
source_->set_packet_trailer_handler(std::move(handler)); diff --git a/webrtc-sys/src/video_track.rs b/webrtc-sys/src/video_track.rs index 9453d5766..5a82839c2 100644 --- a/webrtc-sys/src/video_track.rs +++ b/webrtc-sys/src/video_track.rs @@ -49,6 +49,31 @@ pub mod ffi { pub frame_id: u32, } + #[derive(Debug, Clone, Copy, PartialEq, Eq)] + #[repr(i32)] + pub enum EncodedVideoCodec { + H264, + H265, + VP8, + VP9, + AV1, + } + + #[derive(Debug, Clone, Copy, PartialEq, Eq)] + #[repr(i32)] + pub enum EncodedFrameType { + Key, + Delta, + } + + #[derive(Debug)] + pub struct EncodedVideoFrameData { + pub codec: EncodedVideoCodec, + pub frame_type: EncodedFrameType, + pub payload: Vec, + pub timestamp_us: i64, + } + extern "C++" { include!("livekit/video_frame.h"); include!("livekit/media_stream_track.h"); @@ -93,6 +118,13 @@ pub mod ffi { timestamp_us: i64, frame_metadata: &FrameMetadata, ) -> bool; + fn capture_encoded_frame( + self: &VideoTrackSource, + width: i32, + height: i32, + frame: &EncodedVideoFrameData, + frame_metadata: &FrameMetadata, + ) -> bool; fn set_packet_trailer_handler( self: &VideoTrackSource, handler: SharedPtr, diff --git a/webrtc-sys/src/webrtc.rs b/webrtc-sys/src/webrtc.rs index b7878341c..fc7b7d099 100644 --- a/webrtc-sys/src/webrtc.rs +++ b/webrtc-sys/src/webrtc.rs @@ -63,6 +63,7 @@ pub mod ffi { Nvenc, Vaapi, VideoToolbox, + PreEncoded, } unsafe extern "C++" { From fda0671316f513607a109f8ee691005d8dc880ff Mon Sep 17 00:00:00 2001 From: David Chen Date: Mon, 22 Jun 2026 22:52:39 -0700 Subject: [PATCH 02/24] refactor livekit-capture --- livekit-capture/src/lib.rs | 537 ++----------------------------------- 1 file changed, 15 insertions(+), 522 deletions(-) diff --git a/livekit-capture/src/lib.rs b/livekit-capture/src/lib.rs index 5863e4eaf..2485806a6 100644 --- a/livekit-capture/src/lib.rs +++ b/livekit-capture/src/lib.rs @@ -12,526 +12,19 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-use livekit::{ - options::{TrackPublishOptions, VideoCodec, VideoEncoderBackend}, - prelude::LocalVideoTrack, - webrtc::{ - video_frame::{ - EncodedFrameType as RtcEncodedFrameType, EncodedVideoCodec as RtcEncodedVideoCodec, - EncodedVideoFrame, FrameMetadata, VideoBuffer, VideoFrame, - }, - video_source::{native::NativeVideoSource, RtcVideoSource, VideoResolution}, - }, +//! Capture helpers for publishing decoded, DMA-BUF, and encoded video with LiveKit. + +pub mod dmabuf; +pub mod encoded; +mod error; +pub mod metadata; +pub mod track; + +pub use dmabuf::{DmaBufFrame, DmaBufPixelFormat, DmaBufPlane}; +pub use encoded::{ + CodecSpecific, EncodedAccessUnit, EncodedFragment, EncodedFrameType, EncodedLayerInfo, + EncodedPayload, EncodedVideoCodec, H264PacketizationMode, }; -use thiserror::Error; - -const ANNEX_B_START_CODE: [u8; 4] = [0, 0, 0, 1]; - -/// Encoded video codec carried by an [`EncodedAccessUnit`]. -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -#[non_exhaustive] -pub enum EncodedVideoCodec { - /// H.264/AVC video. - H264, - /// H.265/HEVC video. - H265, - /// VP8 video. - VP8, - /// VP9 video. - VP9, - /// AV1 video. - AV1, -} - -/// Encoded video frame type. -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub enum EncodedFrameType { - /// A key frame. - Key, - /// A delta frame. - Delta, -} - -/// Layer identifiers associated with an encoded frame. -#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)] -pub struct EncodedLayerInfo { - /// Spatial layer index, when present. - pub spatial_id: Option, - /// Temporal layer index, when present. - pub temporal_id: Option, -} - -/// Packet-trailer metadata associated with an encoded frame. -#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)] -pub struct EncodedFrameMetadata { - /// Wall-clock capture timestamp in microseconds. - pub user_timestamp: Option, - /// Monotonically increasing frame identifier. - pub frame_id: Option, -} - -/// H.264 packetization mode for passthrough metadata. 
-#[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub enum H264PacketizationMode { - /// Non-interleaved packetization mode. - NonInterleaved, -} - -/// Codec-specific metadata for encoded passthrough. -#[derive(Debug, Clone, PartialEq, Eq)] -#[non_exhaustive] -pub enum CodecSpecific { - /// No codec-specific metadata. - None, - /// H.264-specific metadata. - H264 { - /// H.264 RTP packetization mode. - packetization_mode: H264PacketizationMode, - }, - /// H.265-specific metadata. - H265, - /// VP8-specific metadata. - VP8 { - /// Temporal layer index, when present. - temporal_id: Option, - /// Whether this frame synchronizes a temporal layer. - layer_sync: bool, - }, - /// VP9-specific metadata. - VP9 { - /// Temporal layer index, when present. - temporal_id: Option, - /// Spatial layer index, when present. - spatial_id: Option, - /// Whether this frame depends on an inter-layer reference. - inter_layer_predicted: Option, - }, - /// AV1-specific metadata. - AV1 { - /// RTP scalability mode, such as `L1T1`. - scalability_mode: Option, - /// Encoded dependency descriptor bytes, when supplied by the caller. - dependency_descriptor: Option>, - }, -} - -impl Default for CodecSpecific { - fn default() -> Self { - Self::None - } -} - -/// Borrowed encoded payload fragment. -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub struct EncodedFragment<'a> { - /// Encoded fragment bytes. - pub bytes: &'a [u8], -} - -/// Encoded access-unit payload. -#[derive(Debug, Clone, PartialEq, Eq)] -pub enum EncodedPayload<'a> { - /// One contiguous payload buffer. - Contiguous(&'a [u8]), - /// Multiple payload fragments. - Fragments(&'a [EncodedFragment<'a>]), - /// Owned payload bytes. 
- Owned(Vec), -} - -impl EncodedPayload<'_> { - fn is_empty(&self) -> bool { - match self { - Self::Contiguous(bytes) => bytes.is_empty(), - Self::Fragments(fragments) => { - fragments.is_empty() || fragments.iter().any(|fragment| fragment.bytes.is_empty()) - } - Self::Owned(bytes) => bytes.is_empty(), - } - } - - fn to_vec(&self) -> Vec { - match self { - Self::Contiguous(bytes) => bytes.to_vec(), - Self::Fragments(fragments) => { - let len = fragments.iter().map(|fragment| fragment.bytes.len()).sum(); - let mut payload = Vec::with_capacity(len); - for fragment in *fragments { - payload.extend_from_slice(fragment.bytes); - } - payload - } - Self::Owned(bytes) => bytes.clone(), - } - } -} - -/// One encoded video access unit. -#[derive(Debug, Clone, PartialEq, Eq)] -pub struct EncodedAccessUnit<'a> { - /// Encoded codec. - pub codec: EncodedVideoCodec, - /// Encoded payload. - pub payload: EncodedPayload<'a>, - /// Capture timestamp in microseconds. - pub timestamp_us: i64, - /// Encoded frame type. - pub frame_type: EncodedFrameType, - /// Encoded frame width in pixels. - pub width: u32, - /// Encoded frame height in pixels. - pub height: u32, - /// Optional layer identifiers. - pub layers: EncodedLayerInfo, - /// Optional codec-specific metadata. - pub codec_specific: CodecSpecific, - /// Optional packet-trailer metadata. - pub metadata: EncodedFrameMetadata, -} - -impl<'a> EncodedAccessUnit<'a> { - /// Creates an access unit from one contiguous payload. - pub fn contiguous( - codec: EncodedVideoCodec, - payload: &'a [u8], - timestamp_us: i64, - frame_type: EncodedFrameType, - width: u32, - height: u32, - ) -> Self { - Self { - codec, - payload: EncodedPayload::Contiguous(payload), - timestamp_us, - frame_type, - width, - height, - layers: EncodedLayerInfo::default(), - codec_specific: CodecSpecific::None, - metadata: EncodedFrameMetadata::default(), - } - } - - /// Creates an H.264 access unit from raw NAL-unit payloads. 
- pub fn from_h264_nalus( - nal_units: &[&[u8]], - timestamp_us: i64, - width: u32, - height: u32, - ) -> Result, CaptureError> { - let mut is_key = false; - for nal in nal_units { - let nal_type = h264_nal_type(nal)?; - if nal_type == 5 { - is_key = true; - } - } - - Ok(EncodedAccessUnit { - codec: EncodedVideoCodec::H264, - payload: EncodedPayload::Owned(annex_b_payload(nal_units)?), - timestamp_us, - frame_type: if is_key { EncodedFrameType::Key } else { EncodedFrameType::Delta }, - width, - height, - layers: EncodedLayerInfo::default(), - codec_specific: CodecSpecific::H264 { - packetization_mode: H264PacketizationMode::NonInterleaved, - }, - metadata: EncodedFrameMetadata::default(), - }) - } - - /// Creates an H.265 access unit from raw NAL-unit payloads. - pub fn from_h265_nalus( - nal_units: &[&[u8]], - timestamp_us: i64, - width: u32, - height: u32, - ) -> Result, CaptureError> { - let mut is_key = false; - for nal in nal_units { - let nal_type = h265_nal_type(nal)?; - if (16..=21).contains(&nal_type) { - is_key = true; - } - } - - Ok(EncodedAccessUnit { - codec: EncodedVideoCodec::H265, - payload: EncodedPayload::Owned(annex_b_payload(nal_units)?), - timestamp_us, - frame_type: if is_key { EncodedFrameType::Key } else { EncodedFrameType::Delta }, - width, - height, - layers: EncodedLayerInfo::default(), - codec_specific: CodecSpecific::H265, - metadata: EncodedFrameMetadata::default(), - }) - } -} - -/// DMA-BUF pixel format. -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub enum DmaBufPixelFormat { - /// NV12 biplanar format. - Nv12, - /// YUV420M multiplanar format. - Yuv420M, -} - -impl DmaBufPixelFormat { - #[cfg(target_os = "linux")] - fn as_native(self) -> i32 { - match self { - Self::Nv12 => 0, - Self::Yuv420M => 1, - } - } -} - -/// One DMA-BUF plane descriptor. -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub struct DmaBufPlane { - /// DMA-BUF file descriptor. - pub fd: i32, - /// Plane byte offset. - pub offset: u32, - /// Plane byte stride. 
- pub stride: u32, -} - -/// One DMA-BUF backed captured frame. -#[derive(Debug, Clone, PartialEq, Eq)] -pub struct DmaBufFrame { - /// Frame width in pixels. - pub width: u32, - /// Frame height in pixels. - pub height: u32, - /// Pixel format. - pub pixel_format: DmaBufPixelFormat, - /// DMA-BUF planes. - pub planes: Vec, - /// Optional DRM format modifier. - pub modifier: Option, - /// Capture timestamp in microseconds. - pub timestamp_us: i64, - /// Optional packet-trailer metadata. - pub metadata: EncodedFrameMetadata, -} - -/// Capture source backed by a LiveKit local video track. -#[derive(Debug, Clone)] -pub struct VideoCaptureTrack { - source: NativeVideoSource, - track: LocalVideoTrack, -} - -impl VideoCaptureTrack { - /// Creates a capture track with the supplied resolution. - pub fn new(name: &str, resolution: VideoResolution, is_screencast: bool) -> Self { - let source = NativeVideoSource::new(resolution, is_screencast); - let track = - LocalVideoTrack::create_video_track(name, RtcVideoSource::Native(source.clone())); - Self { source, track } - } - - /// Returns the publishable local video track. - pub fn track(&self) -> LocalVideoTrack { - self.track.clone() - } - - /// Captures one decoded video frame. - pub fn capture_frame>(&self, frame: &VideoFrame) { - self.source.capture_frame(frame); - } - - /// Captures one DMA-BUF backed frame. - #[cfg(target_os = "linux")] - pub fn capture_dmabuf(&self, frame: &DmaBufFrame) -> Result<(), CaptureError> { - let plane = frame.planes.first().ok_or(CaptureError::MissingDmaBufPlane)?; - let ok = self.source.capture_dmabuf_frame_with_metadata( - plane.fd, - frame.width, - frame.height, - frame.pixel_format.as_native(), - frame.timestamp_us, - frame.metadata.into_frame_metadata(), - ); - ok.then_some(()).ok_or(CaptureError::CaptureFailed) - } - - /// Captures one encoded video access unit. 
- pub fn capture_encoded(&self, access_unit: &EncodedAccessUnit<'_>) -> Result<(), CaptureError> { - match access_unit.codec { - EncodedVideoCodec::H264 | EncodedVideoCodec::H265 => {} - EncodedVideoCodec::VP8 | EncodedVideoCodec::VP9 | EncodedVideoCodec::AV1 => { - return Err(CaptureError::UnsupportedCodec(access_unit.codec)); - } - } - if access_unit.payload.is_empty() { - return Err(CaptureError::EmptyPayload); - } - - let payload = access_unit.payload.to_vec(); - let frame = EncodedVideoFrame { - codec: access_unit.codec.into(), - payload: &payload, - timestamp_us: access_unit.timestamp_us, - frame_type: access_unit.frame_type.into(), - width: access_unit.width, - height: access_unit.height, - frame_metadata: access_unit.metadata.into_frame_metadata(), - }; - self.source.capture_encoded_frame(&frame).then_some(()).ok_or(CaptureError::CaptureFailed) - } - - /// Returns publish options appropriate for encoded passthrough. - pub fn encoded_publish_options(codec: EncodedVideoCodec) -> TrackPublishOptions { - TrackPublishOptions { - video_codec: codec.into(), - video_encoder: VideoEncoderBackend::PreEncoded, - simulcast: false, - ..Default::default() - } - } -} - -/// Error returned by capture helpers. -#[derive(Debug, Error, PartialEq, Eq)] -pub enum CaptureError { - /// Encoded payload is empty. - #[error("encoded payload is empty")] - EmptyPayload, - /// H.265 NAL unit is too short to contain its header. - #[error("H.265 NAL unit is too short")] - H265NalTooShort, - /// DMA-BUF frame did not include any planes. - #[error("DMA-BUF frame did not include any planes")] - MissingDmaBufPlane, - /// Codec is represented by the API but not yet supported by native passthrough. - #[error("encoded passthrough does not support {0:?} yet")] - UnsupportedCodec(EncodedVideoCodec), - /// The underlying source rejected the frame. 
- #[error("capture source rejected the frame")] - CaptureFailed, -} - -impl EncodedFrameMetadata { - fn into_frame_metadata(self) -> Option { - (self.user_timestamp.is_some() || self.frame_id.is_some()).then_some(FrameMetadata { - user_timestamp: self.user_timestamp, - frame_id: self.frame_id, - }) - } -} - -impl From for VideoCodec { - fn from(value: EncodedVideoCodec) -> Self { - match value { - EncodedVideoCodec::H264 => Self::H264, - EncodedVideoCodec::H265 => Self::H265, - EncodedVideoCodec::VP8 => Self::VP8, - EncodedVideoCodec::VP9 => Self::VP9, - EncodedVideoCodec::AV1 => Self::AV1, - } - } -} - -impl From for RtcEncodedVideoCodec { - fn from(value: EncodedVideoCodec) -> Self { - match value { - EncodedVideoCodec::H264 => Self::H264, - EncodedVideoCodec::H265 => Self::H265, - EncodedVideoCodec::VP8 => Self::VP8, - EncodedVideoCodec::VP9 => Self::VP9, - EncodedVideoCodec::AV1 => Self::AV1, - } - } -} - -impl From for RtcEncodedFrameType { - fn from(value: EncodedFrameType) -> Self { - match value { - EncodedFrameType::Key => Self::Key, - EncodedFrameType::Delta => Self::Delta, - } - } -} - -fn h264_nal_type(nal: &[u8]) -> Result { - let header = nal.first().ok_or(CaptureError::EmptyPayload)?; - Ok(header & 0x1f) -} - -fn h265_nal_type(nal: &[u8]) -> Result { - if nal.is_empty() { - return Err(CaptureError::EmptyPayload); - } - if nal.len() < 2 { - return Err(CaptureError::H265NalTooShort); - } - Ok((nal[0] >> 1) & 0x3f) -} - -fn annex_b_payload(nal_units: &[&[u8]]) -> Result, CaptureError> { - if nal_units.is_empty() { - return Err(CaptureError::EmptyPayload); - } - let len = nal_units.iter().try_fold(0usize, |len, nal| { - if nal.is_empty() { - Err(CaptureError::EmptyPayload) - } else { - Ok(len + ANNEX_B_START_CODE.len() + nal.len()) - } - })?; - - let mut payload = Vec::with_capacity(len); - for nal in nal_units { - payload.extend_from_slice(&ANNEX_B_START_CODE); - payload.extend_from_slice(nal); - } - Ok(payload) -} - -#[cfg(test)] -mod tests { - use 
super::*; - - #[test] - fn h264_nal_helper_assembles_annex_b_and_detects_keyframe() { - let sps = [0x67, 1, 2, 3]; - let idr = [0x65, 4, 5, 6]; - let au = EncodedAccessUnit::from_h264_nalus(&[&sps, &idr], 10, 640, 480).unwrap(); - - assert_eq!(au.codec, EncodedVideoCodec::H264); - assert_eq!(au.frame_type, EncodedFrameType::Key); - assert_eq!( - au.payload, - EncodedPayload::Owned(vec![0, 0, 0, 1, 0x67, 1, 2, 3, 0, 0, 0, 1, 0x65, 4, 5, 6]) - ); - } - - #[test] - fn h265_nal_helper_detects_irap_keyframe() { - let vps = [0x40, 1, 2]; - let idr_w_radl = [19 << 1, 1, 3]; - let au = EncodedAccessUnit::from_h265_nalus(&[&vps, &idr_w_radl], 10, 640, 480).unwrap(); - - assert_eq!(au.codec, EncodedVideoCodec::H265); - assert_eq!(au.frame_type, EncodedFrameType::Key); - } - - #[test] - fn h265_rejects_too_short_nal_header() { - let err = EncodedAccessUnit::from_h265_nalus(&[&[0x26]], 10, 640, 480).unwrap_err(); - assert_eq!(err, CaptureError::H265NalTooShort); - } - - #[test] - fn fragments_reject_empty_fragment() { - let fragments = [EncodedFragment { bytes: &[1] }, EncodedFragment { bytes: &[] }]; - let payload = EncodedPayload::Fragments(&fragments); - assert!(payload.is_empty()); - } -} +pub use error::CaptureError; +pub use metadata::FrameMetadata; +pub use track::{CapturePath, VideoCaptureTrack}; From 1c2fb9dacbcaae9936a1456fddc542ffa2de9684 Mon Sep 17 00:00:00 2001 From: David Chen Date: Mon, 22 Jun 2026 23:40:09 -0700 Subject: [PATCH 03/24] add avfoundation & pre-encoded support --- .changeset/livekit-capture-preencoded.md | 2 +- Cargo.lock | 75 ++- livekit-capture/Cargo.toml | 47 ++ livekit-capture/src/device.rs | 130 +++++ livekit-capture/src/dmabuf.rs | 64 +++ livekit-capture/src/encoded.rs | 487 ++++++++++++++++++ livekit-capture/src/encoded/h26x.rs | 317 ++++++++++++ livekit-capture/src/encoded/ingress.rs | 118 +++++ livekit-capture/src/encoded/rtp.rs | 504 +++++++++++++++++++ livekit-capture/src/error.rs | 46 ++ livekit-capture/src/lib.rs | 11 +- 
livekit-capture/src/metadata.rs | 33 ++ livekit-capture/src/platform/avfoundation.rs | 171 +++++++ livekit-capture/src/platform/mod.rs | 18 + livekit-capture/src/sources/gstreamer.rs | 57 +++ livekit-capture/src/sources/mod.rs | 22 + livekit-capture/src/sources/rtsp.rs | 200 ++++++++ livekit-capture/src/sources/tcp.rs | 306 +++++++++++ livekit-capture/src/track.rs | 119 +++++ 19 files changed, 2723 insertions(+), 4 deletions(-) create mode 100644 livekit-capture/src/device.rs create mode 100644 livekit-capture/src/dmabuf.rs create mode 100644 livekit-capture/src/encoded.rs create mode 100644 livekit-capture/src/encoded/h26x.rs create mode 100644 livekit-capture/src/encoded/ingress.rs create mode 100644 livekit-capture/src/encoded/rtp.rs create mode 100644 livekit-capture/src/error.rs create mode 100644 livekit-capture/src/metadata.rs create mode 100644 livekit-capture/src/platform/avfoundation.rs create mode 100644 livekit-capture/src/platform/mod.rs create mode 100644 livekit-capture/src/sources/gstreamer.rs create mode 100644 livekit-capture/src/sources/mod.rs create mode 100644 livekit-capture/src/sources/rtsp.rs create mode 100644 livekit-capture/src/sources/tcp.rs create mode 100644 livekit-capture/src/track.rs diff --git a/.changeset/livekit-capture-preencoded.md b/.changeset/livekit-capture-preencoded.md index ec53f333c..3548b0662 100644 --- a/.changeset/livekit-capture-preencoded.md +++ b/.changeset/livekit-capture-preencoded.md @@ -5,4 +5,4 @@ "webrtc-sys": patch --- -Add a `livekit-capture` crate with codec-neutral capture types and pre-encoded H264/H265 passthrough support. +Add a `livekit-capture` crate with codec-neutral capture types, H264/H265 passthrough support, common encoded ingress helpers, and feature-gated source/platform scaffolding for TCP, RTSP, GStreamer appsink, and AVFoundation capture. 
diff --git a/Cargo.lock b/Cargo.lock index 656972ec0..3cc5ed018 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3886,7 +3886,16 @@ dependencies = [ name = "livekit-capture" version = "0.1.0" dependencies = [ + "block2 0.6.2", + "bytes", + "dispatch2", + "imgproc", "livekit", + "objc2 0.6.4", + "objc2-av-foundation", + "objc2-core-media", + "objc2-core-video", + "objc2-foundation 0.3.2", "thiserror 2.0.18", ] @@ -4746,6 +4755,18 @@ dependencies = [ "objc2-foundation 0.3.2", ] +[[package]] +name = "objc2-av-foundation" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "478ae33fcac9df0a18db8302387c666b8ef08a3e2d62b510ca4fc278a384b6c0" +dependencies = [ + "bitflags 2.11.0", + "objc2 0.6.4", + "objc2-core-media", + "objc2-foundation 0.3.2", +] + [[package]] name = "objc2-cloud-kit" version = "0.2.2" @@ -4781,6 +4802,28 @@ dependencies = [ "objc2-foundation 0.2.2", ] +[[package]] +name = "objc2-core-audio" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e1eebcea8b0dbff5f7c8504f3107c68fc061a3eb44932051c8cf8a68d969c3b2" +dependencies = [ + "dispatch2", + "objc2 0.6.4", + "objc2-core-audio-types", + "objc2-core-foundation", +] + +[[package]] +name = "objc2-core-audio-types" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a89f2ec274a0cf4a32642b2991e8b351a404d290da87bb6a9a9d8632490bd1c" +dependencies = [ + "bitflags 2.11.0", + "objc2 0.6.4", +] + [[package]] name = "objc2-core-data" version = "0.2.2" @@ -4871,6 +4914,21 @@ dependencies = [ "objc2-foundation 0.3.2", ] +[[package]] +name = "objc2-core-media" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "05ec576860167a15dd9fce7fbee7512beb4e31f532159d3482d1f9c6caedf31d" +dependencies = [ + "bitflags 2.11.0", + "dispatch2", + "objc2 0.6.4", + "objc2-core-audio", + "objc2-core-audio-types", + "objc2-core-foundation", + "objc2-core-video", 
+] + [[package]] name = "objc2-core-text" version = "0.3.2" @@ -4883,6 +4941,19 @@ dependencies = [ "objc2-core-graphics", ] +[[package]] +name = "objc2-core-video" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d425caf1df73233f29fd8a5c3e5edbc30d2d4307870f802d18f00d83dc5141a6" +dependencies = [ + "bitflags 2.11.0", + "objc2 0.6.4", + "objc2-core-foundation", + "objc2-core-graphics", + "objc2-io-surface", +] + [[package]] name = "objc2-encode" version = "4.1.0" @@ -5655,7 +5726,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "22505a5c94da8e3b7c2996394d1c933236c4d743e81a410bcca4e6989fc066a4" dependencies = [ "bytes", - "heck 0.4.1", + "heck 0.5.0", "itertools 0.12.1", "log", "multimap", @@ -5675,7 +5746,7 @@ version = "0.14.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "343d3bd7056eda839b03204e68deff7d1b13aba7af2b2fd16890697274262ee7" dependencies = [ - "heck 0.4.1", + "heck 0.5.0", "itertools 0.14.0", "log", "multimap", diff --git a/livekit-capture/Cargo.toml b/livekit-capture/Cargo.toml index a1c18c19a..72caf30b9 100644 --- a/livekit-capture/Cargo.toml +++ b/livekit-capture/Cargo.toml @@ -8,5 +8,52 @@ edition.workspace = true repository.workspace = true [dependencies] +bytes = { workspace = true } +imgproc = { workspace = true, optional = true } livekit = { workspace = true } thiserror = { workspace = true } + +[features] +default = [] +avfoundation = [ + "dep:block2", + "dep:dispatch2", + "dep:imgproc", + "dep:objc2", + "dep:objc2-av-foundation", + "dep:objc2-core-media", + "dep:objc2-core-video", + "dep:objc2-foundation", + "objc2-av-foundation/AVCaptureDevice", + "objc2-av-foundation/AVCaptureInput", + "objc2-av-foundation/AVCaptureOutputBase", + "objc2-av-foundation/AVCaptureSession", + "objc2-av-foundation/AVCaptureVideoDataOutput", + "objc2-av-foundation/AVMediaFormat", + "objc2-av-foundation/AVVideoSettings", + 
"objc2-av-foundation/objc2-core-media", + "objc2-core-media/CMTime", + "objc2-core-media/CMSampleBuffer", + "objc2-core-media/objc2-core-video", + "objc2-core-video/CVBase", + "objc2-core-video/CVBuffer", + "objc2-core-video/CVImageBuffer", + "objc2-core-video/CVPixelBuffer", + "objc2-core-video/CVReturn", + "objc2-foundation/NSArray", + "objc2-foundation/NSDictionary", + "objc2-foundation/NSObject", + "objc2-foundation/NSString", +] +gstreamer = [] +rtsp = [] +tcp-source = [] + +[target.'cfg(target_os = "macos")'.dependencies] +block2 = { version = "0.6.2", default-features = false, optional = true } +dispatch2 = { version = "0.3.1", default-features = false, features = ["std"], optional = true } +objc2 = { version = "0.6.4", default-features = false, features = ["std"], optional = true } +objc2-av-foundation = { version = "0.3.2", default-features = false, optional = true } +objc2-core-media = { version = "0.3.2", default-features = false, optional = true } +objc2-core-video = { version = "0.3.2", default-features = false, optional = true } +objc2-foundation = { version = "0.3.2", default-features = false, features = ["std"], optional = true } diff --git a/livekit-capture/src/device.rs b/livekit-capture/src/device.rs new file mode 100644 index 000000000..8dd8d0a9c --- /dev/null +++ b/livekit-capture/src/device.rs @@ -0,0 +1,130 @@ +// Copyright 2026 LiveKit, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use livekit::webrtc::video_source::VideoResolution; + +/// Capture device discovered by a platform backend. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct CaptureDeviceInfo { + /// Backend-stable device identifier. + pub id: String, + /// Human-readable device name. + pub name: String, + /// Device model identifier, when available. + pub model_id: Option, + /// Device manufacturer, when available. + pub manufacturer: Option, + /// Capture formats reported by the backend. + pub formats: Vec, +} + +/// Device selector used by capture backends. +#[derive(Debug, Clone, PartialEq, Eq)] +#[non_exhaustive] +pub enum CaptureDeviceSelector { + /// Use the backend default video device. + Default, + /// Use the device at the backend enumeration index. + Index(usize), + /// Use a backend-stable device identifier. + Id(String), +} + +/// Pixel format used by a decoded-frame capture backend. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +#[non_exhaustive] +pub enum CapturePixelFormat { + /// Planar I420/YUV420P. + I420, + /// Biplanar NV12. + Nv12, + /// Packed BGRA. + Bgra, + /// Packed RGB24. + Rgb24, + /// Packed YUYV/YUY2. + Yuyv, + /// Encoded MJPEG frames. + Mjpeg, +} + +/// Pixel dimensions for a capture format. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct CaptureResolution { + /// Frame width in pixels. + pub width: u32, + /// Frame height in pixels. + pub height: u32, +} + +impl CaptureResolution { + /// Creates a capture resolution. + pub const fn new(width: u32, height: u32) -> Self { + Self { width, height } + } +} + +impl From for VideoResolution { + fn from(value: CaptureResolution) -> Self { + Self { width: value.width, height: value.height } + } +} + +/// Decoded-frame capture format. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct CaptureFormat { + /// Frame dimensions. + pub resolution: CaptureResolution, + /// Frame rate in frames per second. + pub frame_rate: u32, + /// Pixel format. 
+ pub pixel_format: CapturePixelFormat, +} + +impl CaptureFormat { + /// Creates a decoded-frame capture format. + pub const fn new( + resolution: CaptureResolution, + frame_rate: u32, + pixel_format: CapturePixelFormat, + ) -> Self { + Self { resolution, frame_rate, pixel_format } + } +} + +/// Format selection requested from a capture backend. +#[derive(Debug, Clone, PartialEq, Eq)] +#[non_exhaustive] +pub enum CaptureFormatRequest { + /// Let the backend choose its default format. + Default, + /// Require an exact format match. + Exact(CaptureFormat), + /// Use the backend's closest supported format. + Closest(CaptureFormat), + /// Prefer the highest frame rate, optionally constrained by resolution and pixel format. + HighestFrameRate { + /// Optional resolution constraint. + resolution: Option, + /// Optional pixel format constraint. + pixel_format: Option, + }, + /// Prefer the highest resolution, optionally constrained by frame rate and pixel format. + HighestResolution { + /// Optional frame-rate constraint. + frame_rate: Option, + /// Optional pixel format constraint. + pixel_format: Option, + }, +} diff --git a/livekit-capture/src/dmabuf.rs b/livekit-capture/src/dmabuf.rs new file mode 100644 index 000000000..8410582f0 --- /dev/null +++ b/livekit-capture/src/dmabuf.rs @@ -0,0 +1,64 @@ +// Copyright 2026 LiveKit, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use crate::metadata::FrameMetadata; + +/// DMA-BUF pixel format. 
+#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum DmaBufPixelFormat { + /// NV12 biplanar format. + Nv12, + /// YUV420M multiplanar format. + Yuv420M, +} + +impl DmaBufPixelFormat { + #[cfg(target_os = "linux")] + pub(crate) fn as_native(self) -> i32 { + match self { + Self::Nv12 => 0, + Self::Yuv420M => 1, + } + } +} + +/// One DMA-BUF plane descriptor. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct DmaBufPlane { + /// DMA-BUF file descriptor. + pub fd: i32, + /// Plane byte offset. + pub offset: u32, + /// Plane byte stride. + pub stride: u32, +} + +/// One DMA-BUF backed captured frame. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct DmaBufFrame { + /// Frame width in pixels. + pub width: u32, + /// Frame height in pixels. + pub height: u32, + /// Pixel format. + pub pixel_format: DmaBufPixelFormat, + /// DMA-BUF planes. + pub planes: Vec, + /// Optional DRM format modifier. + pub modifier: Option, + /// Capture timestamp in microseconds. + pub timestamp_us: i64, + /// Optional packet-trailer metadata. + pub metadata: FrameMetadata, +} diff --git a/livekit-capture/src/encoded.rs b/livekit-capture/src/encoded.rs new file mode 100644 index 000000000..cf7214150 --- /dev/null +++ b/livekit-capture/src/encoded.rs @@ -0,0 +1,487 @@ +// Copyright 2026 LiveKit, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +pub mod h26x; +pub mod ingress; +pub mod rtp; + +use bytes::Bytes; +use livekit::{ + options::VideoCodec, + webrtc::video_frame::{ + EncodedFrameType as RtcEncodedFrameType, EncodedVideoCodec as RtcEncodedVideoCodec, + }, +}; + +use crate::{error::CaptureError, metadata::FrameMetadata}; + +const ANNEX_B_START_CODE: [u8; 4] = [0, 0, 0, 1]; + +/// Encoded byte-stream framing used by encoded source backends. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +#[non_exhaustive] +pub enum EncodedWireFormat { + /// H.264 Annex-B byte stream. + H264AnnexB, + /// H.265 Annex-B byte stream. + H265AnnexB, + /// RTP packets for the supplied codec and RTP clock rate. + Rtp { + /// RTP payload codec. + codec: EncodedVideoCodec, + /// RTP timestamp clock rate. + clock_rate: u32, + }, + /// MPEG transport stream carrying encoded video. + MpegTs, +} + +/// Encoded video codec carried by an [`EncodedAccessUnit`]. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +#[non_exhaustive] +pub enum EncodedVideoCodec { + /// H.264/AVC video. + H264, + /// H.265/HEVC video. + H265, + /// VP8 video. + VP8, + /// VP9 video. + VP9, + /// AV1 video. + AV1, +} + +/// Encoded video frame type. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum EncodedFrameType { + /// A key frame. + Key, + /// A delta frame. + Delta, +} + +/// Layer identifiers associated with an encoded frame. +#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)] +pub struct EncodedLayerInfo { + /// Spatial layer index, when present. + pub spatial_id: Option, + /// Temporal layer index, when present. + pub temporal_id: Option, +} + +/// H.264 packetization mode for passthrough metadata. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum H264PacketizationMode { + /// Non-interleaved packetization mode. + NonInterleaved, +} + +/// Codec-specific metadata for encoded passthrough. +#[derive(Debug, Clone, PartialEq, Eq)] +#[non_exhaustive] +pub enum CodecSpecific { + /// No codec-specific metadata. 
+ None, + /// H.264-specific metadata. + H264 { + /// H.264 RTP packetization mode. + packetization_mode: H264PacketizationMode, + }, + /// H.265-specific metadata. + H265, + /// VP8-specific metadata. + VP8 { + /// Temporal layer index, when present. + temporal_id: Option, + /// Whether this frame synchronizes a temporal layer. + layer_sync: bool, + }, + /// VP9-specific metadata. + VP9 { + /// Temporal layer index, when present. + temporal_id: Option, + /// Spatial layer index, when present. + spatial_id: Option, + /// Whether this frame depends on an inter-layer reference. + inter_layer_predicted: Option, + }, + /// AV1-specific metadata. + AV1 { + /// RTP scalability mode, such as `L1T1`. + scalability_mode: Option, + /// Encoded dependency descriptor bytes, when supplied by the caller. + dependency_descriptor: Option>, + }, +} + +impl Default for CodecSpecific { + fn default() -> Self { + Self::None + } +} + +/// Borrowed encoded payload fragment. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct EncodedFragment<'a> { + /// Encoded fragment bytes. + pub bytes: &'a [u8], +} + +/// Encoded access-unit payload. +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum EncodedPayload<'a> { + /// One contiguous payload buffer. + Contiguous(&'a [u8]), + /// Multiple payload fragments. + Fragments(&'a [EncodedFragment<'a>]), + /// Owned payload bytes. 
+ Owned(Vec), +} + +impl EncodedPayload<'_> { + pub(crate) fn is_empty(&self) -> bool { + match self { + Self::Contiguous(bytes) => bytes.is_empty(), + Self::Fragments(fragments) => { + fragments.is_empty() || fragments.iter().any(|fragment| fragment.bytes.is_empty()) + } + Self::Owned(bytes) => bytes.is_empty(), + } + } + + pub(crate) fn to_vec(&self) -> Vec { + match self { + Self::Contiguous(bytes) => bytes.to_vec(), + Self::Fragments(fragments) => { + let len = fragments.iter().map(|fragment| fragment.bytes.len()).sum(); + let mut payload = Vec::with_capacity(len); + for fragment in *fragments { + payload.extend_from_slice(fragment.bytes); + } + payload + } + Self::Owned(bytes) => bytes.clone(), + } + } +} + +/// One encoded video access unit. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct EncodedAccessUnit<'a> { + /// Encoded codec. + pub codec: EncodedVideoCodec, + /// Encoded payload. + pub payload: EncodedPayload<'a>, + /// Capture timestamp in microseconds. + pub timestamp_us: i64, + /// Encoded frame type. + pub frame_type: EncodedFrameType, + /// Encoded frame width in pixels. + pub width: u32, + /// Encoded frame height in pixels. + pub height: u32, + /// Optional layer identifiers. + pub layers: EncodedLayerInfo, + /// Optional codec-specific metadata. + pub codec_specific: CodecSpecific, + /// Optional packet-trailer metadata. + pub metadata: FrameMetadata, +} + +/// Owned encoded video access unit. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct OwnedEncodedAccessUnit { + /// Encoded codec. + pub codec: EncodedVideoCodec, + /// Encoded payload bytes. + pub payload: Bytes, + /// Capture timestamp in microseconds. + pub timestamp_us: i64, + /// Encoded frame type. + pub frame_type: EncodedFrameType, + /// Encoded frame width in pixels. + pub width: u32, + /// Encoded frame height in pixels. + pub height: u32, + /// Optional layer identifiers. + pub layers: EncodedLayerInfo, + /// Optional codec-specific metadata. 
+ pub codec_specific: CodecSpecific, + /// Optional packet-trailer metadata. + pub metadata: FrameMetadata, +} + +impl OwnedEncodedAccessUnit { + /// Creates an owned encoded access unit from contiguous bytes. + pub fn new( + codec: EncodedVideoCodec, + payload: impl Into, + timestamp_us: i64, + frame_type: EncodedFrameType, + width: u32, + height: u32, + ) -> Self { + Self { + codec, + payload: payload.into(), + timestamp_us, + frame_type, + width, + height, + layers: EncodedLayerInfo::default(), + codec_specific: CodecSpecific::None, + metadata: FrameMetadata::default(), + } + } + + /// Borrows this owned access unit as an [`EncodedAccessUnit`]. + pub fn as_access_unit(&self) -> EncodedAccessUnit<'_> { + EncodedAccessUnit { + codec: self.codec, + payload: EncodedPayload::Contiguous(&self.payload), + timestamp_us: self.timestamp_us, + frame_type: self.frame_type, + width: self.width, + height: self.height, + layers: self.layers, + codec_specific: self.codec_specific.clone(), + metadata: self.metadata, + } + } + + /// Creates an owned access unit by copying a borrowed access unit. + pub fn copy_from(access_unit: &EncodedAccessUnit<'_>) -> Self { + Self { + codec: access_unit.codec, + payload: Bytes::from(access_unit.payload.to_vec()), + timestamp_us: access_unit.timestamp_us, + frame_type: access_unit.frame_type, + width: access_unit.width, + height: access_unit.height, + layers: access_unit.layers, + codec_specific: access_unit.codec_specific.clone(), + metadata: access_unit.metadata, + } + } +} + +impl<'a> EncodedAccessUnit<'a> { + /// Creates an access unit from one contiguous payload. 
+    pub fn contiguous(
+        codec: EncodedVideoCodec,
+        payload: &'a [u8],
+        timestamp_us: i64,
+        frame_type: EncodedFrameType,
+        width: u32,
+        height: u32,
+    ) -> Self {
+        // The payload is borrowed as-is; layer info, codec-specific data and
+        // metadata start from their defaults and can be set on the returned value.
+        Self {
+            codec,
+            payload: EncodedPayload::Contiguous(payload),
+            timestamp_us,
+            frame_type,
+            width,
+            height,
+            layers: EncodedLayerInfo::default(),
+            codec_specific: CodecSpecific::None,
+            metadata: FrameMetadata::default(),
+        }
+    }
+
+    /// Creates an H.264 access unit from raw NAL-unit payloads.
+    ///
+    /// Each element of `nal_units` is one NAL unit without a start code. The
+    /// units are copied into a single owned Annex-B buffer (every unit prefixed
+    /// with a 4-byte start code), so the returned access unit does not borrow
+    /// from `nal_units`. The frame is reported as [`EncodedFrameType::Key`] when
+    /// at least one unit has NAL type 5, and as [`EncodedFrameType::Delta`]
+    /// otherwise.
+    ///
+    /// # Errors
+    ///
+    /// Returns [`CaptureError::EmptyPayload`] when `nal_units` is empty or when
+    /// any individual NAL unit is empty.
+    pub fn from_h264_nalus(
+        nal_units: &[&[u8]],
+        timestamp_us: i64,
+        width: u32,
+        height: u32,
+    ) -> Result<EncodedAccessUnit<'static>, CaptureError> {
+        // Every NAL header is inspected (no short-circuit) so that an empty NAL
+        // anywhere in the list is rejected, not only those before the first IDR.
+        let mut is_key = false;
+        for nal in nal_units {
+            is_key |= h264_nal_type(nal)? == 5;
+        }
+
+        Ok(EncodedAccessUnit {
+            codec: EncodedVideoCodec::H264,
+            payload: EncodedPayload::Owned(annex_b_payload(nal_units)?),
+            timestamp_us,
+            frame_type: if is_key { EncodedFrameType::Key } else { EncodedFrameType::Delta },
+            width,
+            height,
+            layers: EncodedLayerInfo::default(),
+            codec_specific: CodecSpecific::H264 {
+                packetization_mode: H264PacketizationMode::NonInterleaved,
+            },
+            metadata: FrameMetadata::default(),
+        })
+    }
+
+    /// Creates an H.265 access unit from raw NAL-unit payloads.
+ pub fn from_h265_nalus( + nal_units: &[&[u8]], + timestamp_us: i64, + width: u32, + height: u32, + ) -> Result, CaptureError> { + let mut is_key = false; + for nal in nal_units { + let nal_type = h265_nal_type(nal)?; + if (16..=21).contains(&nal_type) { + is_key = true; + } + } + + Ok(EncodedAccessUnit { + codec: EncodedVideoCodec::H265, + payload: EncodedPayload::Owned(annex_b_payload(nal_units)?), + timestamp_us, + frame_type: if is_key { EncodedFrameType::Key } else { EncodedFrameType::Delta }, + width, + height, + layers: EncodedLayerInfo::default(), + codec_specific: CodecSpecific::H265, + metadata: FrameMetadata::default(), + }) + } +} + +impl From for VideoCodec { + fn from(value: EncodedVideoCodec) -> Self { + match value { + EncodedVideoCodec::H264 => Self::H264, + EncodedVideoCodec::H265 => Self::H265, + EncodedVideoCodec::VP8 => Self::VP8, + EncodedVideoCodec::VP9 => Self::VP9, + EncodedVideoCodec::AV1 => Self::AV1, + } + } +} + +impl From for RtcEncodedVideoCodec { + fn from(value: EncodedVideoCodec) -> Self { + match value { + EncodedVideoCodec::H264 => Self::H264, + EncodedVideoCodec::H265 => Self::H265, + EncodedVideoCodec::VP8 => Self::VP8, + EncodedVideoCodec::VP9 => Self::VP9, + EncodedVideoCodec::AV1 => Self::AV1, + } + } +} + +impl From for RtcEncodedFrameType { + fn from(value: EncodedFrameType) -> Self { + match value { + EncodedFrameType::Key => Self::Key, + EncodedFrameType::Delta => Self::Delta, + } + } +} + +pub(crate) fn h264_nal_type(nal: &[u8]) -> Result { + let header = nal.first().ok_or(CaptureError::EmptyPayload)?; + Ok(header & 0x1f) +} + +pub(crate) fn h265_nal_type(nal: &[u8]) -> Result { + if nal.is_empty() { + return Err(CaptureError::EmptyPayload); + } + if nal.len() < 2 { + return Err(CaptureError::H265NalTooShort); + } + Ok((nal[0] >> 1) & 0x3f) +} + +pub(crate) fn annex_b_payload(nal_units: &[&[u8]]) -> Result, CaptureError> { + if nal_units.is_empty() { + return Err(CaptureError::EmptyPayload); + } + let len = 
nal_units.iter().try_fold(0usize, |len, nal| { + if nal.is_empty() { + Err(CaptureError::EmptyPayload) + } else { + Ok(len + ANNEX_B_START_CODE.len() + nal.len()) + } + })?; + + let mut payload = Vec::with_capacity(len); + for nal in nal_units { + payload.extend_from_slice(&ANNEX_B_START_CODE); + payload.extend_from_slice(nal); + } + Ok(payload) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn h264_nal_helper_assembles_annex_b_and_detects_keyframe() { + let sps = [0x67, 1, 2, 3]; + let idr = [0x65, 4, 5, 6]; + let au = EncodedAccessUnit::from_h264_nalus(&[&sps, &idr], 10, 640, 480).unwrap(); + + assert_eq!(au.codec, EncodedVideoCodec::H264); + assert_eq!(au.frame_type, EncodedFrameType::Key); + assert_eq!( + au.payload, + EncodedPayload::Owned(vec![0, 0, 0, 1, 0x67, 1, 2, 3, 0, 0, 0, 1, 0x65, 4, 5, 6]) + ); + } + + #[test] + fn h265_nal_helper_detects_irap_keyframe() { + let vps = [0x40, 1, 2]; + let idr_w_radl = [19 << 1, 1, 3]; + let au = EncodedAccessUnit::from_h265_nalus(&[&vps, &idr_w_radl], 10, 640, 480).unwrap(); + + assert_eq!(au.codec, EncodedVideoCodec::H265); + assert_eq!(au.frame_type, EncodedFrameType::Key); + } + + #[test] + fn h265_rejects_too_short_nal_header() { + let err = EncodedAccessUnit::from_h265_nalus(&[&[0x26]], 10, 640, 480).unwrap_err(); + assert_eq!(err, CaptureError::H265NalTooShort); + } + + #[test] + fn fragments_reject_empty_fragment() { + let fragments = [EncodedFragment { bytes: &[1] }, EncodedFragment { bytes: &[] }]; + let payload = EncodedPayload::Fragments(&fragments); + assert!(payload.is_empty()); + } + + #[test] + fn owned_access_unit_borrows_without_copying_payload() { + let owned = OwnedEncodedAccessUnit::new( + EncodedVideoCodec::H264, + Bytes::from_static(&[1, 2, 3]), + 10, + EncodedFrameType::Delta, + 640, + 480, + ); + + let borrowed = owned.as_access_unit(); + assert_eq!(borrowed.codec, EncodedVideoCodec::H264); + assert_eq!(borrowed.payload, EncodedPayload::Contiguous(&[1, 2, 3])); + 
assert_eq!(borrowed.timestamp_us, 10); + } +} diff --git a/livekit-capture/src/encoded/h26x.rs b/livekit-capture/src/encoded/h26x.rs new file mode 100644 index 000000000..1bb3716a0 --- /dev/null +++ b/livekit-capture/src/encoded/h26x.rs @@ -0,0 +1,317 @@ +// Copyright 2026 LiveKit, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::ops::Range; + +use bytes::Bytes; + +use crate::{ + encoded::{ + annex_b_payload, h264_nal_type, h265_nal_type, CodecSpecific, EncodedFrameType, + EncodedVideoCodec, H264PacketizationMode, OwnedEncodedAccessUnit, + }, + error::CaptureError, +}; + +/// H26x Annex-B parser state. +#[derive(Debug, Clone)] +pub struct AnnexBAccessUnitParser { + codec: EncodedVideoCodec, + pending: Vec, + next_timestamp_us: i64, + frame_interval_us: i64, + width: u32, + height: u32, +} + +impl AnnexBAccessUnitParser { + /// Creates a parser for H.264 or H.265 Annex-B byte streams. 
+    pub fn new(
+        codec: EncodedVideoCodec,
+        start_timestamp_us: i64,
+        frame_interval_us: i64,
+        width: u32,
+        height: u32,
+    ) -> Result<Self, CaptureError> {
+        // Only the two Annex-B codecs are accepted. The match stays exhaustive so
+        // that adding a codec variant forces a decision here.
+        match codec {
+            EncodedVideoCodec::H264 | EncodedVideoCodec::H265 => {}
+            EncodedVideoCodec::VP8 | EncodedVideoCodec::VP9 | EncodedVideoCodec::AV1 => {
+                return Err(CaptureError::UnsupportedCodec(codec));
+            }
+        }
+
+        Ok(Self {
+            codec,
+            pending: Vec::new(),
+            next_timestamp_us: start_timestamp_us,
+            frame_interval_us,
+            width,
+            height,
+        })
+    }
+
+    /// Pushes encoded bytes and returns the next complete access unit if one is found.
+    ///
+    /// At most one access unit is returned per call. Bytes that are not part of
+    /// the returned unit stay buffered; if they already contain further complete
+    /// units, those are returned by subsequent calls (an empty slice may be
+    /// pushed to poll for them).
+    ///
+    /// # Errors
+    ///
+    /// Propagates any [`CaptureError`] raised while inspecting NAL headers or
+    /// building the access unit.
+    pub fn push(&mut self, bytes: &[u8]) -> Result<Option<OwnedEncodedAccessUnit>, CaptureError> {
+        self.pending.extend_from_slice(bytes);
+        self.drain_next(false)
+    }
+
+    /// Flushes the pending bytes as the final access unit.
+    ///
+    /// If the buffer still holds more than one access unit, only the first is
+    /// returned; call `flush` repeatedly until it yields `Ok(None)` to drain
+    /// everything at end of stream.
+    ///
+    /// # Errors
+    ///
+    /// Propagates any [`CaptureError`] raised while inspecting NAL headers or
+    /// building the access unit.
+    pub fn flush(&mut self) -> Result<Option<OwnedEncodedAccessUnit>, CaptureError> {
+        self.drain_next(true)
+    }
+
+    /// Extracts the next access unit from the pending buffer.
+    ///
+    /// When no split point is found, the whole buffer is emitted only if
+    /// `at_eof` is set; otherwise the parser keeps waiting for more bytes.
+    fn drain_next(
+        &mut self,
+        at_eof: bool,
+    ) -> Result<Option<OwnedEncodedAccessUnit>, CaptureError> {
+        let ranges = annex_b_nal_ranges(&self.pending);
+        if ranges.is_empty() {
+            // No NAL payload buffered yet (no start code, or nothing after it).
+            return Ok(None);
+        }
+
+        match access_unit_split_index(self.codec, &self.pending, &ranges)? {
+            Some(split_at) => self.take_access_unit(split_at),
+            None if at_eof => self.take_access_unit(self.pending.len()),
+            None => Ok(None),
+        }
+    }
+
+    /// Removes the first `byte_len` pending bytes and wraps them in an access unit.
+    ///
+    /// The unit is stamped with the current synthetic timestamp, which is then
+    /// advanced by the frame interval (saturating on overflow).
+    fn take_access_unit(
+        &mut self,
+        byte_len: usize,
+    ) -> Result<Option<OwnedEncodedAccessUnit>, CaptureError> {
+        if byte_len == 0 {
+            return Ok(None);
+        }
+
+        // One pass: move the prefix out of the buffer instead of copying it and
+        // then draining it separately.
+        let access_unit: Vec<u8> = self.pending.drain(..byte_len).collect();
+        let timestamp_us = self.next_timestamp_us;
+        self.next_timestamp_us = self.next_timestamp_us.saturating_add(self.frame_interval_us);
+        access_unit_from_annex_b(
+            self.codec,
+            Bytes::from(access_unit),
+            timestamp_us,
+            self.width,
+            self.height,
+        )
+        .map(Some)
+    }
+}
+
+/// Returns NAL-unit byte ranges for an Annex-B access unit or stream chunk.
+pub fn annex_b_nal_ranges(bytes: &[u8]) -> Vec<Range<usize>> {
+    // Each range covers the bytes between one start code and the next (or the
+    // end of `bytes`), start codes excluded. Zero-length ranges are never
+    // pushed, and bytes before the first start code are ignored.
+    let mut ranges = Vec::new();
+    let mut cursor = 0;
+    let mut current_start = None;
+
+    while let Some((prefix_start, prefix_len)) = find_start_code(&bytes[cursor..]) {
+        // `find_start_code` reports offsets relative to the sub-slice.
+        let prefix_start = cursor + prefix_start;
+        let nal_start = prefix_start + prefix_len;
+        // Finding a new start code closes the NAL opened by the previous one.
+        if let Some(start) = current_start.replace(nal_start) {
+            if start < prefix_start {
+                ranges.push(start..prefix_start);
+            }
+        }
+        cursor = nal_start;
+    }
+
+    // The last NAL runs to the end of the buffer.
+    if let Some(start) = current_start {
+        if start < bytes.len() {
+            ranges.push(start..bytes.len());
+        }
+    }
+
+    ranges
+}
+
+/// Returns borrowed NAL units from an Annex-B buffer.
+///
+/// The returned slices borrow from `bytes` and exclude the start codes. The
+/// current implementation never fails; the `Result` return type is kept for
+/// interface stability.
+pub fn annex_b_nalus(bytes: &[u8]) -> Result<Vec<&[u8]>, CaptureError> {
+    // `annex_b_nal_ranges` only yields non-empty ranges, so no extra filtering
+    // of empty slices is needed here.
+    Ok(annex_b_nal_ranges(bytes).into_iter().map(|range| &bytes[range]).collect())
+}
+
+/// Creates an access unit from an Annex-B buffer.
+///
+/// The frame type is derived from the NAL units found in `payload`, and the
+/// codec-specific metadata is filled in for H.264 and H.265.
+///
+/// # Errors
+///
+/// Returns [`CaptureError::EmptyPayload`] when `payload` is empty,
+/// [`CaptureError::UnsupportedCodec`] for codecs other than H.264/H.265, and
+/// propagates NAL-header errors from key-frame detection.
+pub fn access_unit_from_annex_b(
+    codec: EncodedVideoCodec,
+    payload: Bytes,
+    timestamp_us: i64,
+    width: u32,
+    height: u32,
+) -> Result<OwnedEncodedAccessUnit, CaptureError> {
+    if payload.is_empty() {
+        return Err(CaptureError::EmptyPayload);
+    }
+
+    // Resolve the codec first so unsupported codecs are rejected before the
+    // payload is scanned for NAL units.
+    let codec_specific = match codec {
+        EncodedVideoCodec::H264 => {
+            CodecSpecific::H264 { packetization_mode: H264PacketizationMode::NonInterleaved }
+        }
+        EncodedVideoCodec::H265 => CodecSpecific::H265,
+        EncodedVideoCodec::VP8 | EncodedVideoCodec::VP9 | EncodedVideoCodec::AV1 => {
+            return Err(CaptureError::UnsupportedCodec(codec));
+        }
+    };
+
+    let frame_type = if is_keyframe_annex_b(codec, &payload)? {
+        EncodedFrameType::Key
+    } else {
+        EncodedFrameType::Delta
+    };
+
+    let mut access_unit =
+        OwnedEncodedAccessUnit::new(codec, payload, timestamp_us, frame_type, width, height);
+    access_unit.codec_specific = codec_specific;
+    Ok(access_unit)
+}
+
+/// Creates an Annex-B access unit from raw NAL units.
+pub fn access_unit_from_nalus( + codec: EncodedVideoCodec, + nal_units: &[&[u8]], + timestamp_us: i64, + width: u32, + height: u32, +) -> Result { + let payload = Bytes::from(annex_b_payload(nal_units)?); + access_unit_from_annex_b(codec, payload, timestamp_us, width, height) +} + +/// Returns true when an Annex-B access unit contains an intra/key picture. +pub fn is_keyframe_annex_b(codec: EncodedVideoCodec, bytes: &[u8]) -> Result { + let nals = annex_b_nalus(bytes)?; + match codec { + EncodedVideoCodec::H264 => { + nals.iter().try_fold(false, |is_key, nal| Ok(is_key || h264_nal_type(nal)? == 5)) + } + EncodedVideoCodec::H265 => nals.iter().try_fold(false, |is_key, nal| { + let nal_type = h265_nal_type(nal)?; + Ok(is_key || (16..=21).contains(&nal_type)) + }), + EncodedVideoCodec::VP8 | EncodedVideoCodec::VP9 | EncodedVideoCodec::AV1 => { + Err(CaptureError::UnsupportedCodec(codec)) + } + } +} + +fn access_unit_split_index( + codec: EncodedVideoCodec, + bytes: &[u8], + ranges: &[Range], +) -> Result, CaptureError> { + if ranges.len() < 2 { + return Ok(None); + } + + let first_nal = &bytes[ranges[0].clone()]; + let mut seen_vcl = is_vcl_nal(codec, first_nal)?; + for range in ranges.iter().skip(1) { + let nal = &bytes[range.clone()]; + if is_access_unit_delimiter(codec, nal)? && seen_vcl { + return split_start_code_index(bytes, range.start).map(Some); + } + seen_vcl |= is_vcl_nal(codec, nal)?; + } + Ok(None) +} + +fn split_start_code_index(bytes: &[u8], nal_start: usize) -> Result { + if nal_start >= 4 && bytes[nal_start - 4..nal_start] == [0, 0, 0, 1] { + return Ok(nal_start - 4); + } + if nal_start >= 3 && bytes[nal_start - 3..nal_start] == [0, 0, 1] { + return Ok(nal_start - 3); + } + Err(CaptureError::InvalidEncodedData("missing Annex-B start code")) +} + +fn is_access_unit_delimiter(codec: EncodedVideoCodec, nal: &[u8]) -> Result { + Ok(match codec { + EncodedVideoCodec::H264 => h264_nal_type(nal)? == 9, + EncodedVideoCodec::H265 => h265_nal_type(nal)? 
== 35, + EncodedVideoCodec::VP8 | EncodedVideoCodec::VP9 | EncodedVideoCodec::AV1 => { + return Err(CaptureError::UnsupportedCodec(codec)); + } + }) +} + +fn is_vcl_nal(codec: EncodedVideoCodec, nal: &[u8]) -> Result { + Ok(match codec { + EncodedVideoCodec::H264 => (1..=5).contains(&h264_nal_type(nal)?), + EncodedVideoCodec::H265 => h265_nal_type(nal)? <= 31, + EncodedVideoCodec::VP8 | EncodedVideoCodec::VP9 | EncodedVideoCodec::AV1 => { + return Err(CaptureError::UnsupportedCodec(codec)); + } + }) +} + +fn find_start_code(bytes: &[u8]) -> Option<(usize, usize)> { + let mut idx = 0; + while idx + 3 <= bytes.len() { + if bytes[idx..].starts_with(&[0, 0, 1]) { + return Some((idx, 3)); + } + if idx + 4 <= bytes.len() && bytes[idx..].starts_with(&[0, 0, 0, 1]) { + return Some((idx, 4)); + } + idx += 1; + } + None +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn splits_annex_b_nals_with_three_and_four_byte_prefixes() { + let bytes = [0, 0, 1, 0x67, 1, 0, 0, 0, 1, 0x65, 2, 3]; + let nals = annex_b_nalus(&bytes).unwrap(); + assert_eq!(nals, vec![&[0x67, 1][..], &[0x65, 2, 3][..]]); + } + + #[test] + fn detects_h264_keyframe_from_annex_b() { + let bytes = [0, 0, 0, 1, 0x61, 1, 0, 0, 0, 1, 0x65, 2]; + assert!(is_keyframe_annex_b(EncodedVideoCodec::H264, &bytes).unwrap()); + } + + #[test] + fn parser_flushes_final_access_unit() { + let mut parser = + AnnexBAccessUnitParser::new(EncodedVideoCodec::H264, 100, 33_333, 640, 480).unwrap(); + assert!(parser.push(&[0, 0, 1, 0x65, 1, 2]).unwrap().is_none()); + let au = parser.flush().unwrap().unwrap(); + assert_eq!(au.timestamp_us, 100); + assert_eq!(au.frame_type, EncodedFrameType::Key); + } + + #[test] + fn parser_splits_at_next_access_unit_delimiter() { + let mut parser = + AnnexBAccessUnitParser::new(EncodedVideoCodec::H264, 100, 33_333, 640, 480).unwrap(); + let stream = + [0, 0, 1, 0x09, 0x10, 0, 0, 1, 0x65, 1, 2, 0, 0, 1, 0x09, 0x10, 0, 0, 1, 0x41, 3]; + + let au = parser.push(&stream).unwrap().unwrap(); + 
assert_eq!(au.timestamp_us, 100); + assert_eq!(au.payload.as_ref(), &[0, 0, 1, 0x09, 0x10, 0, 0, 1, 0x65, 1, 2]); + + let au = parser.flush().unwrap().unwrap(); + assert_eq!(au.timestamp_us, 33_433); + assert_eq!(au.payload.as_ref(), &[0, 0, 1, 0x09, 0x10, 0, 0, 1, 0x41, 3]); + } +} diff --git a/livekit-capture/src/encoded/ingress.rs b/livekit-capture/src/encoded/ingress.rs new file mode 100644 index 000000000..0032e0d98 --- /dev/null +++ b/livekit-capture/src/encoded/ingress.rs @@ -0,0 +1,118 @@ +// Copyright 2026 LiveKit, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::{error::Error, fmt}; + +use crate::{encoded::OwnedEncodedAccessUnit, error::CaptureError, track::VideoCaptureTrack}; + +/// Source of owned encoded access units. +pub trait EncodedAccessUnitSource { + /// Error returned by the source. + type Error: Error + Send + Sync + 'static; + + /// Returns the next encoded access unit, or `Ok(None)` when the source reaches EOF. + fn next_access_unit(&mut self) -> Result, Self::Error>; +} + +/// Error returned while forwarding encoded access units into a track. +#[derive(Debug)] +pub enum EncodedIngressError { + /// The encoded source failed. + Source(E), + /// The capture track rejected an access unit. 
+ Capture(CaptureError), +} + +impl fmt::Display for EncodedIngressError { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Self::Source(err) => write!(f, "encoded source failed: {err}"), + Self::Capture(err) => write!(f, "encoded capture failed: {err}"), + } + } +} + +impl Error for EncodedIngressError +where + E: Error + 'static, +{ + fn source(&self) -> Option<&(dyn Error + 'static)> { + match self { + Self::Source(err) => Some(err), + Self::Capture(err) => Some(err), + } + } +} + +/// Pulls encoded access units from a source and forwards them into a video track. +#[derive(Debug)] +pub struct EncodedIngress { + track: VideoCaptureTrack, + source: S, +} + +impl EncodedIngress { + /// Creates an encoded ingress runner. + pub fn new(track: VideoCaptureTrack, source: S) -> Self { + Self { track, source } + } + + /// Returns the capture track used by this runner. + pub fn track(&self) -> &VideoCaptureTrack { + &self.track + } + + /// Returns the underlying encoded source. + pub fn source(&self) -> &S { + &self.source + } + + /// Returns the underlying encoded source mutably. + pub fn source_mut(&mut self) -> &mut S { + &mut self.source + } + + /// Consumes this runner and returns its parts. + pub fn into_parts(self) -> (VideoCaptureTrack, S) { + (self.track, self.source) + } +} + +impl EncodedIngress +where + S: EncodedAccessUnitSource, +{ + /// Captures the next access unit and returns `false` after source EOF. + pub fn capture_next(&mut self) -> Result> { + let Some(access_unit) = + self.source.next_access_unit().map_err(EncodedIngressError::Source)? + else { + return Ok(false); + }; + + self.track + .capture_encoded(&access_unit.as_access_unit()) + .map_err(EncodedIngressError::Capture)?; + Ok(true) + } + + /// Captures access units until the source reaches EOF. + pub fn run_until_end(&mut self) -> Result> { + let mut captured = 0; + while self.capture_next()? 
{ + captured += 1; + } + Ok(captured) + } +} diff --git a/livekit-capture/src/encoded/rtp.rs b/livekit-capture/src/encoded/rtp.rs new file mode 100644 index 000000000..2accdc7e9 --- /dev/null +++ b/livekit-capture/src/encoded/rtp.rs @@ -0,0 +1,504 @@ +// Copyright 2026 LiveKit, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use thiserror::Error; + +use crate::{ + encoded::{h26x::access_unit_from_nalus, EncodedVideoCodec, OwnedEncodedAccessUnit}, + error::CaptureError, +}; + +/// Parsed RTP packet header and payload. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct RtpPacket<'a> { + /// RTP marker bit. + pub marker: bool, + /// RTP payload type. + pub payload_type: u8, + /// RTP sequence number. + pub sequence_number: u16, + /// RTP timestamp. + pub timestamp: u32, + /// RTP SSRC. + pub ssrc: u32, + /// RTP payload bytes. + pub payload: &'a [u8], +} + +impl<'a> RtpPacket<'a> { + /// Parses a single RTP packet. 
+ pub fn parse(bytes: &'a [u8]) -> Result { + if bytes.len() < 12 { + return Err(RtpDepacketizerError::PacketTooShort); + } + if bytes[0] >> 6 != 2 { + return Err(RtpDepacketizerError::UnsupportedVersion(bytes[0] >> 6)); + } + + let has_padding = (bytes[0] & 0x20) != 0; + let has_extension = (bytes[0] & 0x10) != 0; + let csrc_count = (bytes[0] & 0x0f) as usize; + let mut payload_start = 12 + csrc_count * 4; + if bytes.len() < payload_start { + return Err(RtpDepacketizerError::PacketTooShort); + } + + if has_extension { + if bytes.len() < payload_start + 4 { + return Err(RtpDepacketizerError::PacketTooShort); + } + let extension_words = + u16::from_be_bytes([bytes[payload_start + 2], bytes[payload_start + 3]]) as usize; + payload_start += 4 + extension_words * 4; + if bytes.len() < payload_start { + return Err(RtpDepacketizerError::PacketTooShort); + } + } + + let payload_end = if has_padding { + let Some(padding) = bytes.last().copied() else { + return Err(RtpDepacketizerError::PacketTooShort); + }; + let padding = padding as usize; + if padding == 0 || bytes.len() < payload_start + padding { + return Err(RtpDepacketizerError::PacketTooShort); + } + bytes.len() - padding + } else { + bytes.len() + }; + + Ok(Self { + marker: (bytes[1] & 0x80) != 0, + payload_type: bytes[1] & 0x7f, + sequence_number: u16::from_be_bytes([bytes[2], bytes[3]]), + timestamp: u32::from_be_bytes([bytes[4], bytes[5], bytes[6], bytes[7]]), + ssrc: u32::from_be_bytes([bytes[8], bytes[9], bytes[10], bytes[11]]), + payload: &bytes[payload_start..payload_end], + }) + } +} + +/// Maps RTP timestamps to capture timestamps in microseconds. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct RtpTimestampMapper { + clock_rate: u32, + base_rtp_timestamp: Option, + base_timestamp_us: i64, +} + +impl RtpTimestampMapper { + /// Creates an RTP timestamp mapper. 
+ pub fn new(clock_rate: u32, base_timestamp_us: i64) -> Self { + Self { clock_rate, base_rtp_timestamp: None, base_timestamp_us } + } + + /// Maps an RTP timestamp to microseconds, handling `u32` RTP timestamp rollover. + pub fn map(&mut self, rtp_timestamp: u32) -> Result { + if self.clock_rate == 0 { + return Err(RtpDepacketizerError::InvalidClockRate); + } + + let base = *self.base_rtp_timestamp.get_or_insert(rtp_timestamp); + let delta = rtp_timestamp.wrapping_sub(base) as u64; + let delta_us = delta.saturating_mul(1_000_000) / u64::from(self.clock_rate); + Ok(self.base_timestamp_us.saturating_add(delta_us as i64)) + } +} + +/// Error returned by RTP depayloading and access-unit assembly. +#[derive(Debug, Error, PartialEq, Eq)] +pub enum RtpDepacketizerError { + /// RTP packet is shorter than its declared header. + #[error("RTP packet is too short")] + PacketTooShort, + /// RTP version is not supported. + #[error("unsupported RTP version {0}")] + UnsupportedVersion(u8), + /// RTP clock rate must be non-zero. + #[error("RTP clock rate must be non-zero")] + InvalidClockRate, + /// RTP sequence number gap was detected. + #[error("RTP sequence gap: expected {expected}, got {actual}")] + SequenceGap { + /// Expected RTP sequence number. + expected: u16, + /// Actual RTP sequence number. + actual: u16, + }, + /// RTP payload format is unsupported or malformed. + #[error("unsupported or malformed RTP payload")] + UnsupportedPayload, + /// RTP fragmentation state was invalid. + #[error("invalid RTP fragmentation sequence")] + InvalidFragment, + /// Codec is not supported by this RTP assembler. + #[error("RTP assembler does not support {0:?}")] + UnsupportedCodec(EncodedVideoCodec), + /// Capture data could not be converted into an access unit. + #[error(transparent)] + Capture(#[from] CaptureError), +} + +/// Reassembles RTP packets into encoded access units. 
+#[derive(Debug, Clone)] +pub struct RtpAccessUnitAssembler { + codec: EncodedVideoCodec, + width: u32, + height: u32, + timestamp_mapper: RtpTimestampMapper, + expected_sequence_number: Option, + current: Option, + fragment: Option, +} + +#[derive(Debug, Clone)] +struct PartialAccessUnit { + rtp_timestamp: u32, + timestamp_us: i64, + nal_units: Vec>, +} + +#[derive(Debug, Clone)] +struct FragmentState { + rtp_timestamp: u32, + nal_unit: Vec, +} + +impl RtpAccessUnitAssembler { + /// Creates an RTP access-unit assembler for H.264 or H.265 payloads. + pub fn new( + codec: EncodedVideoCodec, + clock_rate: u32, + start_timestamp_us: i64, + width: u32, + height: u32, + ) -> Result { + match codec { + EncodedVideoCodec::H264 | EncodedVideoCodec::H265 => {} + EncodedVideoCodec::VP8 | EncodedVideoCodec::VP9 | EncodedVideoCodec::AV1 => { + return Err(RtpDepacketizerError::UnsupportedCodec(codec)); + } + } + if clock_rate == 0 { + return Err(RtpDepacketizerError::InvalidClockRate); + } + + Ok(Self { + codec, + width, + height, + timestamp_mapper: RtpTimestampMapper::new(clock_rate, start_timestamp_us), + expected_sequence_number: None, + current: None, + fragment: None, + }) + } + + /// Pushes one encoded RTP packet and returns an access unit when a marker closes a frame. + pub fn push( + &mut self, + bytes: &[u8], + ) -> Result, RtpDepacketizerError> { + let packet = RtpPacket::parse(bytes)?; + self.push_packet(packet) + } + + /// Pushes one parsed RTP packet and returns an access unit when a marker closes a frame. 
+ pub fn push_packet( + &mut self, + packet: RtpPacket<'_>, + ) -> Result, RtpDepacketizerError> { + self.check_sequence(packet.sequence_number)?; + + match self.codec { + EncodedVideoCodec::H264 => self.push_h264_payload(&packet)?, + EncodedVideoCodec::H265 => self.push_h265_payload(&packet)?, + EncodedVideoCodec::VP8 | EncodedVideoCodec::VP9 | EncodedVideoCodec::AV1 => { + return Err(RtpDepacketizerError::UnsupportedCodec(self.codec)); + } + } + + if packet.marker { + return self.finish_current(); + } + Ok(None) + } + + fn check_sequence(&mut self, sequence_number: u16) -> Result<(), RtpDepacketizerError> { + let Some(expected) = self.expected_sequence_number.replace(sequence_number.wrapping_add(1)) + else { + return Ok(()); + }; + if sequence_number == expected { + return Ok(()); + } + + self.current = None; + self.fragment = None; + Err(RtpDepacketizerError::SequenceGap { expected, actual: sequence_number }) + } + + fn current_mut( + &mut self, + rtp_timestamp: u32, + ) -> Result<&mut PartialAccessUnit, RtpDepacketizerError> { + if self.current.as_ref().is_some_and(|current| current.rtp_timestamp != rtp_timestamp) { + self.current = None; + self.fragment = None; + } + + if self.current.is_none() { + let timestamp_us = self.timestamp_mapper.map(rtp_timestamp)?; + self.current = + Some(PartialAccessUnit { rtp_timestamp, timestamp_us, nal_units: Vec::new() }); + } + + self.current.as_mut().ok_or(RtpDepacketizerError::InvalidFragment) + } + + fn push_h264_payload(&mut self, packet: &RtpPacket<'_>) -> Result<(), RtpDepacketizerError> { + let payload = packet.payload; + let Some(&header) = payload.first() else { + return Err(RtpDepacketizerError::UnsupportedPayload); + }; + let nal_type = header & 0x1f; + + match nal_type { + 1..=23 => self.current_mut(packet.timestamp)?.nal_units.push(payload.to_vec()), + 24 => self.push_h264_stap_a(packet.timestamp, &payload[1..])?, + 28 => self.push_h264_fu_a(packet.timestamp, payload)?, + _ => return 
Err(RtpDepacketizerError::UnsupportedPayload), + } + + Ok(()) + } + + fn push_h264_stap_a( + &mut self, + rtp_timestamp: u32, + payload: &[u8], + ) -> Result<(), RtpDepacketizerError> { + let mut cursor = 0; + while cursor < payload.len() { + if payload.len() < cursor + 2 { + return Err(RtpDepacketizerError::UnsupportedPayload); + } + let len = u16::from_be_bytes([payload[cursor], payload[cursor + 1]]) as usize; + cursor += 2; + if len == 0 || payload.len() < cursor + len { + return Err(RtpDepacketizerError::UnsupportedPayload); + } + self.current_mut(rtp_timestamp)?.nal_units.push(payload[cursor..cursor + len].to_vec()); + cursor += len; + } + Ok(()) + } + + fn push_h264_fu_a( + &mut self, + rtp_timestamp: u32, + payload: &[u8], + ) -> Result<(), RtpDepacketizerError> { + if payload.len() < 2 { + return Err(RtpDepacketizerError::UnsupportedPayload); + } + + let indicator = payload[0]; + let header = payload[1]; + let start = (header & 0x80) != 0; + let end = (header & 0x40) != 0; + let nal_type = header & 0x1f; + if nal_type == 0 || nal_type > 23 { + return Err(RtpDepacketizerError::UnsupportedPayload); + } + + if start { + let mut nal_unit = Vec::with_capacity(1 + payload.len().saturating_sub(2)); + nal_unit.push((indicator & 0xe0) | nal_type); + nal_unit.extend_from_slice(&payload[2..]); + self.fragment = Some(FragmentState { rtp_timestamp, nal_unit }); + return Ok(()); + } + + let fragment = self + .fragment + .as_mut() + .filter(|fragment| fragment.rtp_timestamp == rtp_timestamp) + .ok_or(RtpDepacketizerError::InvalidFragment)?; + fragment.nal_unit.extend_from_slice(&payload[2..]); + + if end { + let nal_unit = + self.fragment.take().ok_or(RtpDepacketizerError::InvalidFragment)?.nal_unit; + self.current_mut(rtp_timestamp)?.nal_units.push(nal_unit); + } + Ok(()) + } + + fn push_h265_payload(&mut self, packet: &RtpPacket<'_>) -> Result<(), RtpDepacketizerError> { + let payload = packet.payload; + if payload.len() < 2 { + return 
Err(RtpDepacketizerError::UnsupportedPayload); + } + let nal_type = (payload[0] >> 1) & 0x3f; + + match nal_type { + 0..=47 => self.current_mut(packet.timestamp)?.nal_units.push(payload.to_vec()), + 48 => self.push_h265_aggregation(packet.timestamp, &payload[2..])?, + 49 => self.push_h265_fragment(packet.timestamp, payload)?, + _ => return Err(RtpDepacketizerError::UnsupportedPayload), + } + + Ok(()) + } + + fn push_h265_aggregation( + &mut self, + rtp_timestamp: u32, + payload: &[u8], + ) -> Result<(), RtpDepacketizerError> { + let mut cursor = 0; + while cursor < payload.len() { + if payload.len() < cursor + 2 { + return Err(RtpDepacketizerError::UnsupportedPayload); + } + let len = u16::from_be_bytes([payload[cursor], payload[cursor + 1]]) as usize; + cursor += 2; + if len == 0 || payload.len() < cursor + len { + return Err(RtpDepacketizerError::UnsupportedPayload); + } + self.current_mut(rtp_timestamp)?.nal_units.push(payload[cursor..cursor + len].to_vec()); + cursor += len; + } + Ok(()) + } + + fn push_h265_fragment( + &mut self, + rtp_timestamp: u32, + payload: &[u8], + ) -> Result<(), RtpDepacketizerError> { + if payload.len() < 3 { + return Err(RtpDepacketizerError::UnsupportedPayload); + } + + let fu_header = payload[2]; + let start = (fu_header & 0x80) != 0; + let end = (fu_header & 0x40) != 0; + let nal_type = fu_header & 0x3f; + if nal_type > 47 { + return Err(RtpDepacketizerError::UnsupportedPayload); + } + + if start { + let mut nal_unit = Vec::with_capacity(2 + payload.len().saturating_sub(3)); + nal_unit.push((payload[0] & 0x81) | (nal_type << 1)); + nal_unit.push(payload[1]); + nal_unit.extend_from_slice(&payload[3..]); + self.fragment = Some(FragmentState { rtp_timestamp, nal_unit }); + return Ok(()); + } + + let fragment = self + .fragment + .as_mut() + .filter(|fragment| fragment.rtp_timestamp == rtp_timestamp) + .ok_or(RtpDepacketizerError::InvalidFragment)?; + fragment.nal_unit.extend_from_slice(&payload[3..]); + + if end { + let nal_unit = + 
self.fragment.take().ok_or(RtpDepacketizerError::InvalidFragment)?.nal_unit; + self.current_mut(rtp_timestamp)?.nal_units.push(nal_unit); + } + Ok(()) + } + + fn finish_current(&mut self) -> Result, RtpDepacketizerError> { + let Some(current) = self.current.take() else { + return Ok(None); + }; + if current.nal_units.is_empty() { + return Ok(None); + } + + let nal_units = current.nal_units.iter().map(Vec::as_slice).collect::>(); + Ok(Some(access_unit_from_nalus( + self.codec, + &nal_units, + current.timestamp_us, + self.width, + self.height, + )?)) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + fn rtp_packet(sequence_number: u16, timestamp: u32, marker: bool, payload: &[u8]) -> Vec { + let mut packet = Vec::with_capacity(12 + payload.len()); + packet.push(0x80); + packet.push(if marker { 0x80 | 96 } else { 96 }); + packet.extend_from_slice(&sequence_number.to_be_bytes()); + packet.extend_from_slice(×tamp.to_be_bytes()); + packet.extend_from_slice(&0x1122_3344_u32.to_be_bytes()); + packet.extend_from_slice(payload); + packet + } + + #[test] + fn parses_rtp_packet_header() { + let bytes = rtp_packet(7, 90_000, true, &[0x65, 1, 2]); + let packet = RtpPacket::parse(&bytes).unwrap(); + assert!(packet.marker); + assert_eq!(packet.payload_type, 96); + assert_eq!(packet.sequence_number, 7); + assert_eq!(packet.timestamp, 90_000); + assert_eq!(packet.payload, &[0x65, 1, 2]); + } + + #[test] + fn maps_rtp_timestamp_rollover() { + let mut mapper = RtpTimestampMapper::new(90_000, 1_000); + assert_eq!(mapper.map(u32::MAX - 89).unwrap(), 1_000); + assert_eq!(mapper.map(0).unwrap(), 2_000); + } + + #[test] + fn assembles_h264_fu_a() { + let mut assembler = + RtpAccessUnitAssembler::new(EncodedVideoCodec::H264, 90_000, 0, 640, 480).unwrap(); + let start = rtp_packet(10, 12_000, false, &[0x7c, 0x85, 1, 2]); + let end = rtp_packet(11, 12_000, true, &[0x7c, 0x45, 3, 4]); + + assert!(assembler.push(&start).unwrap().is_none()); + let access_unit = 
assembler.push(&end).unwrap().unwrap(); + assert_eq!(access_unit.payload.as_ref(), &[0, 0, 0, 1, 0x65, 1, 2, 3, 4]); + } + + #[test] + fn sequence_gap_clears_current_frame() { + let mut assembler = + RtpAccessUnitAssembler::new(EncodedVideoCodec::H264, 90_000, 0, 640, 480).unwrap(); + let start = rtp_packet(10, 12_000, false, &[0x7c, 0x85, 1, 2]); + let end = rtp_packet(12, 12_000, true, &[0x7c, 0x45, 3, 4]); + + assert!(assembler.push(&start).unwrap().is_none()); + let err = assembler.push(&end).unwrap_err(); + assert_eq!(err, RtpDepacketizerError::SequenceGap { expected: 11, actual: 12 }); + } +} diff --git a/livekit-capture/src/error.rs b/livekit-capture/src/error.rs new file mode 100644 index 000000000..c3714ec54 --- /dev/null +++ b/livekit-capture/src/error.rs @@ -0,0 +1,46 @@ +// Copyright 2026 LiveKit, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use thiserror::Error; + +use crate::encoded::{EncodedVideoCodec, EncodedWireFormat}; + +/// Error returned by capture helpers. +#[derive(Debug, Error, PartialEq, Eq)] +pub enum CaptureError { + /// Encoded payload is empty. + #[error("encoded payload is empty")] + EmptyPayload, + /// H.265 NAL unit is too short to contain its header. + #[error("H.265 NAL unit is too short")] + H265NalTooShort, + /// DMA-BUF frame did not include any planes. + #[error("DMA-BUF frame did not include any planes")] + MissingDmaBufPlane, + /// Codec is represented by the API but not yet supported by native passthrough. 
+ #[error("encoded passthrough does not support {0:?} yet")] + UnsupportedCodec(EncodedVideoCodec), + /// Encoded payload or transport data is malformed. + #[error("invalid encoded data: {0}")] + InvalidEncodedData(&'static str), + /// Wire format is represented by the API but not supported by this source. + #[error("encoded wire format is not supported by this source: {0:?}")] + UnsupportedWireFormat(EncodedWireFormat), + /// Capture backend is not available on this platform. + #[error("{0} is not supported on this platform")] + UnsupportedPlatform(&'static str), + /// The underlying source rejected the frame. + #[error("capture source rejected the frame")] + CaptureFailed, +} diff --git a/livekit-capture/src/lib.rs b/livekit-capture/src/lib.rs index 2485806a6..1dee1586f 100644 --- a/livekit-capture/src/lib.rs +++ b/livekit-capture/src/lib.rs @@ -14,16 +14,25 @@ //! Capture helpers for publishing decoded, DMA-BUF, and encoded video with LiveKit. +pub mod device; pub mod dmabuf; pub mod encoded; mod error; pub mod metadata; +pub mod platform; +pub mod sources; pub mod track; +pub use device::{ + CaptureDeviceInfo, CaptureDeviceSelector, CaptureFormat, CaptureFormatRequest, + CapturePixelFormat, CaptureResolution, +}; pub use dmabuf::{DmaBufFrame, DmaBufPixelFormat, DmaBufPlane}; pub use encoded::{ + ingress::{EncodedAccessUnitSource, EncodedIngress, EncodedIngressError}, CodecSpecific, EncodedAccessUnit, EncodedFragment, EncodedFrameType, EncodedLayerInfo, - EncodedPayload, EncodedVideoCodec, H264PacketizationMode, + EncodedPayload, EncodedVideoCodec, EncodedWireFormat, H264PacketizationMode, + OwnedEncodedAccessUnit, }; pub use error::CaptureError; pub use metadata::FrameMetadata; diff --git a/livekit-capture/src/metadata.rs b/livekit-capture/src/metadata.rs new file mode 100644 index 000000000..6eac32db8 --- /dev/null +++ b/livekit-capture/src/metadata.rs @@ -0,0 +1,33 @@ +// Copyright 2026 LiveKit, Inc. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/// Packet-trailer metadata associated with a captured frame. +#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)] +pub struct FrameMetadata { + /// Wall-clock capture timestamp in microseconds. + pub user_timestamp: Option, + /// Monotonically increasing frame identifier. + pub frame_id: Option, +} + +impl FrameMetadata { + pub(crate) fn into_rtc(self) -> Option { + (self.user_timestamp.is_some() || self.frame_id.is_some()).then_some( + livekit::webrtc::video_frame::FrameMetadata { + user_timestamp: self.user_timestamp, + frame_id: self.frame_id, + }, + ) + } +} diff --git a/livekit-capture/src/platform/avfoundation.rs b/livekit-capture/src/platform/avfoundation.rs new file mode 100644 index 000000000..c4a64b2c3 --- /dev/null +++ b/livekit-capture/src/platform/avfoundation.rs @@ -0,0 +1,171 @@ +// Copyright 2026 LiveKit, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use thiserror::Error; + +use crate::{ + device::{CaptureDeviceInfo, CaptureDeviceSelector, CaptureFormatRequest}, + error::CaptureError, + track::VideoCaptureTrack, +}; + +/// Options used to create an AVFoundation capture session. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct AvFoundationCaptureOptions { + /// Device to use for capture. + pub device: CaptureDeviceSelector, + /// Format requested from the device. + pub format: CaptureFormatRequest, + /// Whether the resulting track should be marked as a screencast. + pub is_screencast: bool, +} + +impl Default for AvFoundationCaptureOptions { + fn default() -> Self { + Self { + device: CaptureDeviceSelector::Default, + format: CaptureFormatRequest::Default, + is_screencast: false, + } + } +} + +/// AVFoundation decoded-frame capture session. +#[derive(Debug)] +pub struct AvFoundationCapture { + track: VideoCaptureTrack, + options: AvFoundationCaptureOptions, +} + +impl AvFoundationCapture { + /// Creates an AVFoundation capture session wrapper for a capture track. + pub fn new( + track: VideoCaptureTrack, + options: AvFoundationCaptureOptions, + ) -> Result { + ensure_platform_available()?; + Ok(Self { track, options }) + } + + /// Returns the capture track that receives decoded frames. + pub fn track(&self) -> &VideoCaptureTrack { + &self.track + } + + /// Returns the configured capture options. + pub fn options(&self) -> &AvFoundationCaptureOptions { + &self.options + } + + /// Starts AVFoundation capture. + pub fn start(&mut self) -> Result<(), AvFoundationError> { + start_capture(self) + } + + /// Stops AVFoundation capture. + pub fn stop(&mut self) -> Result<(), AvFoundationError> { + stop_capture(self) + } +} + +/// Lists AVFoundation video capture devices. +pub fn devices() -> Result, AvFoundationError> { + list_devices() +} + +/// Error returned by AVFoundation capture. +#[derive(Debug, Error)] +pub enum AvFoundationError { + /// AVFoundation capture is only available on macOS. 
+ #[error("AVFoundation capture is only available on macOS")] + UnsupportedPlatform, + /// The requested device was not found. + #[error("AVFoundation capture device was not found")] + DeviceNotFound, + /// The requested operation is represented by the API but not implemented yet. + #[error("{0}")] + NotImplemented(&'static str), + /// The shared capture track rejected a frame. + #[error(transparent)] + Capture(#[from] CaptureError), +} + +#[cfg(target_os = "macos")] +fn ensure_platform_available() -> Result<(), AvFoundationError> { + Ok(()) +} + +#[cfg(not(target_os = "macos"))] +fn ensure_platform_available() -> Result<(), AvFoundationError> { + Err(AvFoundationError::UnsupportedPlatform) +} + +#[cfg(target_os = "macos")] +fn list_devices() -> Result, AvFoundationError> { + use objc2_av_foundation::{AVCaptureDevice, AVMediaTypeVideo}; + + // SAFETY: AVMediaTypeVideo is a framework-provided immutable NSString + // constant. We only borrow it to ask AVFoundation for video devices. + let media_type = unsafe { AVMediaTypeVideo }.ok_or(AvFoundationError::DeviceNotFound)?; + // SAFETY: AVFoundation returns an immutable NSArray of currently available + // AVCaptureDevice instances. We only retain/copy string properties from it. + #[allow(deprecated)] + let devices = unsafe { AVCaptureDevice::devicesWithMediaType(media_type) }; + + let mut results = Vec::with_capacity(devices.len()); + for device in devices.iter() { + // SAFETY: These Objective-C property getters return retained NSStrings + // for a live AVCaptureDevice from the immutable devices array. 
+ let id = unsafe { device.uniqueID() }.to_string(); + let name = unsafe { device.localizedName() }.to_string(); + let model_id = non_empty_string(unsafe { device.modelID() }.to_string()); + let manufacturer = non_empty_string(unsafe { device.manufacturer() }.to_string()); + + results.push(CaptureDeviceInfo { id, name, model_id, manufacturer, formats: Vec::new() }); + } + + Ok(results) +} + +#[cfg(not(target_os = "macos"))] +fn list_devices() -> Result, AvFoundationError> { + Err(AvFoundationError::UnsupportedPlatform) +} + +#[cfg(target_os = "macos")] +fn non_empty_string(value: String) -> Option { + (!value.is_empty()).then_some(value) +} + +#[cfg(target_os = "macos")] +fn start_capture(_capture: &mut AvFoundationCapture) -> Result<(), AvFoundationError> { + Err(AvFoundationError::NotImplemented( + "AVFoundation decoded-frame delegate capture is not wired yet", + )) +} + +#[cfg(not(target_os = "macos"))] +fn start_capture(_capture: &mut AvFoundationCapture) -> Result<(), AvFoundationError> { + Err(AvFoundationError::UnsupportedPlatform) +} + +#[cfg(target_os = "macos")] +fn stop_capture(_capture: &mut AvFoundationCapture) -> Result<(), AvFoundationError> { + Ok(()) +} + +#[cfg(not(target_os = "macos"))] +fn stop_capture(_capture: &mut AvFoundationCapture) -> Result<(), AvFoundationError> { + Err(AvFoundationError::UnsupportedPlatform) +} diff --git a/livekit-capture/src/platform/mod.rs b/livekit-capture/src/platform/mod.rs new file mode 100644 index 000000000..739bbda39 --- /dev/null +++ b/livekit-capture/src/platform/mod.rs @@ -0,0 +1,18 @@ +// Copyright 2026 LiveKit, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Platform capture backends. + +#[cfg(feature = "avfoundation")] +pub mod avfoundation; diff --git a/livekit-capture/src/sources/gstreamer.rs b/livekit-capture/src/sources/gstreamer.rs new file mode 100644 index 000000000..09456f713 --- /dev/null +++ b/livekit-capture/src/sources/gstreamer.rs @@ -0,0 +1,57 @@ +// Copyright 2026 LiveKit, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::error::Error; + +use crate::encoded::{ingress::EncodedAccessUnitSource, OwnedEncodedAccessUnit}; + +/// Callback-backed encoded source for GStreamer appsink integrations. +#[derive(Debug)] +pub struct GStreamerAppSinkSource { + next_access_unit: F, +} + +impl GStreamerAppSinkSource { + /// Creates a source from a callback that pulls the next encoded appsink sample. + pub fn new(next_access_unit: F) -> Self { + Self { next_access_unit } + } + + /// Returns the wrapped callback. + pub fn callback(&self) -> &F { + &self.next_access_unit + } + + /// Returns the wrapped callback mutably. 
+ pub fn callback_mut(&mut self) -> &mut F { + &mut self.next_access_unit + } + + /// Consumes this source and returns the wrapped callback. + pub fn into_callback(self) -> F { + self.next_access_unit + } +} + +impl EncodedAccessUnitSource for GStreamerAppSinkSource +where + F: FnMut() -> Result, E>, + E: Error + Send + Sync + 'static, +{ + type Error = E; + + fn next_access_unit(&mut self) -> Result, Self::Error> { + (self.next_access_unit)() + } +} diff --git a/livekit-capture/src/sources/mod.rs b/livekit-capture/src/sources/mod.rs new file mode 100644 index 000000000..fbae7ca1b --- /dev/null +++ b/livekit-capture/src/sources/mod.rs @@ -0,0 +1,22 @@ +// Copyright 2026 LiveKit, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Optional capture sources that feed the shared capture paths. + +#[cfg(feature = "gstreamer")] +pub mod gstreamer; +#[cfg(feature = "rtsp")] +pub mod rtsp; +#[cfg(feature = "tcp-source")] +pub mod tcp; diff --git a/livekit-capture/src/sources/rtsp.rs b/livekit-capture/src/sources/rtsp.rs new file mode 100644 index 000000000..f735b2382 --- /dev/null +++ b/livekit-capture/src/sources/rtsp.rs @@ -0,0 +1,200 @@ +// Copyright 2026 LiveKit, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::io::{self, Read}; + +use thiserror::Error; + +use crate::encoded::{ + ingress::EncodedAccessUnitSource, + rtp::{RtpAccessUnitAssembler, RtpDepacketizerError}, + EncodedVideoCodec, OwnedEncodedAccessUnit, +}; + +/// Configuration for RTSP interleaved RTP media. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct RtspInterleavedSourceConfig { + /// RTP payload codec. + pub codec: EncodedVideoCodec, + /// RTP timestamp clock rate. + pub clock_rate: u32, + /// RTSP interleaved channel carrying video RTP packets. + pub video_channel: u8, + /// Timestamp assigned to the first emitted access unit. + pub start_timestamp_us: i64, + /// Encoded frame width in pixels. + pub width: u32, + /// Encoded frame height in pixels. + pub height: u32, +} + +/// Encoded source for RTSP interleaved RTP streams. +#[derive(Debug)] +pub struct RtspInterleavedRtpSource { + reader: R, + config: RtspInterleavedSourceConfig, + assembler: RtpAccessUnitAssembler, + eof: bool, +} + +impl RtspInterleavedRtpSource +where + R: Read, +{ + /// Creates a source for an RTSP stream that is already in interleaved RTP mode. + pub fn new(reader: R, config: RtspInterleavedSourceConfig) -> Result { + let assembler = RtpAccessUnitAssembler::new( + config.codec, + config.clock_rate, + config.start_timestamp_us, + config.width, + config.height, + )?; + Ok(Self { reader, config, assembler, eof: false }) + } + + /// Returns the wrapped reader. + pub fn reader(&self) -> &R { + &self.reader + } + + /// Returns the wrapped reader mutably. 
+ pub fn reader_mut(&mut self) -> &mut R { + &mut self.reader + } + + /// Consumes this source and returns its reader. + pub fn into_reader(self) -> R { + self.reader + } + + fn read_next_interleaved_frame(&mut self) -> Result<Option<(u8, Vec<u8>)>, RtspSourceError> { + while !self.eof { + let mut magic = [0u8; 1]; + if !read_exact_or_clean_eof(&mut self.reader, &mut magic) + .map_err(RtspSourceError::Io)? + { + self.eof = true; + return Ok(None); + } + + if magic[0] != b'$' { + return Err(RtspSourceError::UnexpectedData); + } + + let mut header = [0u8; 3]; + self.reader.read_exact(&mut header).map_err(RtspSourceError::Io)?; + let channel = header[0]; + let len = u16::from_be_bytes([header[1], header[2]]) as usize; + let mut payload = vec![0; len]; + self.reader.read_exact(&mut payload).map_err(RtspSourceError::Io)?; + return Ok(Some((channel, payload))); + } + + Ok(None) + } +} + +impl<R> EncodedAccessUnitSource for RtspInterleavedRtpSource<R> +where + R: Read + Send + Sync + 'static, +{ + type Error = RtspSourceError; + + fn next_access_unit(&mut self) -> Result<Option<OwnedEncodedAccessUnit>, Self::Error> { + loop { + let Some((channel, payload)) = self.read_next_interleaved_frame()? else { + return Ok(None); + }; + if channel != self.config.video_channel { + continue; + } + if let Some(access_unit) = self.assembler.push(&payload)? { + return Ok(Some(access_unit)); + } + } + } +} + +/// Error returned by RTSP encoded sources. +#[derive(Debug, Error)] +pub enum RtspSourceError { + /// I/O failed while reading RTSP interleaved data. + #[error("RTSP read failed: {0}")] + Io(io::Error), + /// Interleaved RTP was malformed or a non-interleaved byte was encountered. + #[error("unexpected RTSP interleaved data")] + UnexpectedData, + /// RTP depayloading failed. 
+ #[error(transparent)] + Rtp(#[from] RtpDepacketizerError), +} + +fn read_exact_or_clean_eof(reader: &mut impl Read, buf: &mut [u8]) -> io::Result<bool> { + let mut offset = 0; + while offset < buf.len() { + match reader.read(&mut buf[offset..])? 
{ + 0 if offset == 0 => return Ok(false), + 0 => return Err(io::Error::from(io::ErrorKind::UnexpectedEof)), + read => offset += read, + } + } + Ok(true) +} + +#[cfg(test)] +mod tests { + use std::io::Cursor; + + use super::*; + + fn rtp_packet(sequence_number: u16, timestamp: u32, marker: bool, payload: &[u8]) -> Vec<u8> { + let mut packet = Vec::with_capacity(12 + payload.len()); + packet.push(0x80); + packet.push(if marker { 0x80 | 96 } else { 96 }); + packet.extend_from_slice(&sequence_number.to_be_bytes()); + packet.extend_from_slice(&timestamp.to_be_bytes()); + packet.extend_from_slice(&0x1122_3344_u32.to_be_bytes()); + packet.extend_from_slice(payload); + packet + } + + fn interleaved(channel: u8, payload: &[u8]) -> Vec<u8> { + let mut frame = Vec::with_capacity(4 + payload.len()); + frame.push(b'$'); + frame.push(channel); + frame.extend_from_slice(&(payload.len() as u16).to_be_bytes()); + frame.extend_from_slice(payload); + frame + } + + #[test] + fn reads_rtsp_interleaved_rtp_access_unit() { + let packet = rtp_packet(10, 12_000, true, &[0x65, 1, 2]); + let stream = interleaved(0, &packet); + let config = RtspInterleavedSourceConfig { + codec: EncodedVideoCodec::H264, + clock_rate: 90_000, + video_channel: 0, + start_timestamp_us: 0, + width: 640, + height: 480, + }; + let mut source = RtspInterleavedRtpSource::new(Cursor::new(stream), config).unwrap(); + + let access_unit = source.next_access_unit().unwrap().unwrap(); + assert_eq!(access_unit.payload.as_ref(), &[0, 0, 0, 1, 0x65, 1, 2]); + assert!(source.next_access_unit().unwrap().is_none()); + } +} diff --git a/livekit-capture/src/sources/tcp.rs b/livekit-capture/src/sources/tcp.rs new file mode 100644 index 000000000..173a007ba --- /dev/null +++ b/livekit-capture/src/sources/tcp.rs @@ -0,0 +1,306 @@ +// Copyright 2026 LiveKit, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::{ + io::{self, Read}, + net::TcpStream, +}; + +use thiserror::Error; + +use crate::{ + encoded::{ + h26x::AnnexBAccessUnitParser, + ingress::EncodedAccessUnitSource, + rtp::{RtpAccessUnitAssembler, RtpDepacketizerError}, + EncodedVideoCodec, EncodedWireFormat, OwnedEncodedAccessUnit, + }, + error::CaptureError, +}; + +const DEFAULT_CHUNK_SIZE: usize = 4096; + +/// Configuration for a byte-stream encoded source. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct ByteStreamSourceConfig { + /// Declared stream wire format. + pub wire_format: EncodedWireFormat, + /// Timestamp assigned to the first emitted access unit. + pub start_timestamp_us: i64, + /// Frame interval used for Annex-B byte streams. + pub frame_interval_us: i64, + /// Encoded frame width in pixels. + pub width: u32, + /// Encoded frame height in pixels. + pub height: u32, + /// Read chunk size for Annex-B byte streams. + pub read_chunk_size: usize, +} + +impl ByteStreamSourceConfig { + /// Creates byte-stream source configuration with a 4096-byte read chunk. + pub fn new( + wire_format: EncodedWireFormat, + start_timestamp_us: i64, + frame_interval_us: i64, + width: u32, + height: u32, + ) -> Self { + Self { + wire_format, + start_timestamp_us, + frame_interval_us, + width, + height, + read_chunk_size: DEFAULT_CHUNK_SIZE, + } + } +} + +/// Encoded source backed by any blocking byte stream. 
+#[derive(Debug)] +pub struct ByteStreamEncodedSource<R> { + reader: R, + parser: ByteStreamParser, + read_chunk: Vec<u8>, + eof: bool, +} + +/// TCP encoded source using the same parser as other byte streams. +pub type TcpEncodedSource = ByteStreamEncodedSource<TcpStream>; + +#[derive(Debug)] +enum ByteStreamParser { + H26x(AnnexBAccessUnitParser), + Rtp(RtpAccessUnitAssembler), +} + +impl<R> ByteStreamEncodedSource<R> +where + R: Read, +{ + /// Creates an encoded source for a declared byte-stream wire format. + pub fn new(reader: R, config: ByteStreamSourceConfig) -> Result<Self, TcpSourceError> { + let parser = match config.wire_format { + EncodedWireFormat::H264AnnexB => ByteStreamParser::H26x( + AnnexBAccessUnitParser::new( + EncodedVideoCodec::H264, + config.start_timestamp_us, + config.frame_interval_us, + config.width, + config.height, + ) + .map_err(TcpSourceError::Capture)?, + ), + EncodedWireFormat::H265AnnexB => ByteStreamParser::H26x( + AnnexBAccessUnitParser::new( + EncodedVideoCodec::H265, + config.start_timestamp_us, + config.frame_interval_us, + config.width, + config.height, + ) + .map_err(TcpSourceError::Capture)?, + ), + EncodedWireFormat::Rtp { codec, clock_rate } => { + ByteStreamParser::Rtp(RtpAccessUnitAssembler::new( + codec, + clock_rate, + config.start_timestamp_us, + config.width, + config.height, + )?) + } + EncodedWireFormat::MpegTs => { + return Err(TcpSourceError::UnsupportedWireFormat(config.wire_format)); + } + }; + + Ok(Self { reader, parser, read_chunk: vec![0; config.read_chunk_size.max(1)], eof: false }) + } + + /// Returns the wrapped reader. + pub fn reader(&self) -> &R { + &self.reader + } + + /// Returns the wrapped reader mutably. + pub fn reader_mut(&mut self) -> &mut R { + &mut self.reader + } + + /// Consumes this source and returns its reader. 
+ pub fn into_reader(self) -> R { + self.reader + } + + fn next_annex_b( + reader: &mut R, + read_chunk: &mut [u8], + parser: &mut AnnexBAccessUnitParser, + eof: &mut bool, + ) -> Result<Option<OwnedEncodedAccessUnit>, TcpSourceError> { + loop { + if let Some(access_unit) = parser.push(&[]).map_err(TcpSourceError::Capture)? { + return Ok(Some(access_unit)); + } + if *eof { + return parser.flush().map_err(TcpSourceError::Capture); + } + + let read = reader.read(read_chunk).map_err(TcpSourceError::Io)?; + if read == 0 { + *eof = true; + continue; + } + if let Some(access_unit) = + parser.push(&read_chunk[..read]).map_err(TcpSourceError::Capture)? + { + return Ok(Some(access_unit)); + } + } + } + + fn next_rtp( + reader: &mut R, + assembler: &mut RtpAccessUnitAssembler, + eof: &mut bool, + ) -> Result<Option<OwnedEncodedAccessUnit>, TcpSourceError> { + while !*eof { + let mut len = [0u8; 2]; + if !read_exact_or_clean_eof(reader, &mut len).map_err(TcpSourceError::Io)? { + *eof = true; + return Ok(None); + } + + let packet_len = u16::from_be_bytes(len) as usize; + if packet_len == 0 { + continue; + } + + let mut packet = vec![0; packet_len]; + reader.read_exact(&mut packet).map_err(TcpSourceError::Io)?; + if let Some(access_unit) = assembler.push(&packet)? { + return Ok(Some(access_unit)); + } + } + + Ok(None) + } +} + +impl<R> EncodedAccessUnitSource for ByteStreamEncodedSource<R> +where + R: Read + Send + Sync + 'static, +{ + type Error = TcpSourceError; + + fn next_access_unit(&mut self) -> Result<Option<OwnedEncodedAccessUnit>, Self::Error> { + match &mut self.parser { + ByteStreamParser::H26x(parser) => { + Self::next_annex_b(&mut self.reader, &mut self.read_chunk, parser, &mut self.eof) + } + ByteStreamParser::Rtp(assembler) => { + Self::next_rtp(&mut self.reader, assembler, &mut self.eof) + } + } + } +} + +/// Error returned by byte-stream encoded sources. +#[derive(Debug, Error)] +pub enum TcpSourceError { + /// I/O failed while reading the byte stream. 
+ #[error("byte-stream read failed: {0}")] + Io(io::Error), + /// The declared wire format is not supported by this source. + #[error("unsupported byte-stream wire format: {0:?}")] + UnsupportedWireFormat(EncodedWireFormat), + /// RTP depayloading failed. + #[error(transparent)] + Rtp(#[from] RtpDepacketizerError), + /// Access-unit construction failed. + #[error(transparent)] + Capture(CaptureError), +} + +fn read_exact_or_clean_eof(reader: &mut impl Read, buf: &mut [u8]) -> io::Result<bool> { + let mut offset = 0; + while offset < buf.len() { + match reader.read(&mut buf[offset..])? { + 0 if offset == 0 => return Ok(false), + 0 => return Err(io::Error::from(io::ErrorKind::UnexpectedEof)), + read => offset += read, + } + } + Ok(true) +} + +#[cfg(test)] +mod tests { + use std::io::Cursor; + + use super::*; + + fn rtp_packet(sequence_number: u16, timestamp: u32, marker: bool, payload: &[u8]) -> Vec<u8> { + let mut packet = Vec::with_capacity(12 + payload.len()); + packet.push(0x80); + packet.push(if marker { 0x80 | 96 } else { 96 }); + packet.extend_from_slice(&sequence_number.to_be_bytes()); + packet.extend_from_slice(&timestamp.to_be_bytes()); + packet.extend_from_slice(&0x1122_3344_u32.to_be_bytes()); + packet.extend_from_slice(payload); + packet + } + + fn rfc4571(packet: &[u8]) -> Vec<u8> { + let mut bytes = Vec::with_capacity(2 + packet.len()); + bytes.extend_from_slice(&(packet.len() as u16).to_be_bytes()); + bytes.extend_from_slice(packet); + bytes + } + + #[test] + fn reads_annex_b_access_units() { + let stream = + [0, 0, 1, 0x09, 0x10, 0, 0, 1, 0x65, 1, 2, 0, 0, 1, 0x09, 0x10, 0, 0, 1, 0x41, 3]; + let config = + ByteStreamSourceConfig::new(EncodedWireFormat::H264AnnexB, 0, 33_333, 640, 480); + let mut source = ByteStreamEncodedSource::new(Cursor::new(stream), config).unwrap(); + + let first = source.next_access_unit().unwrap().unwrap(); + assert_eq!(first.payload.as_ref(), &[0, 0, 1, 0x09, 0x10, 0, 0, 1, 0x65, 1, 2]); + let second = 
source.next_access_unit().unwrap().unwrap(); + assert_eq!(second.payload.as_ref(), &[0, 0, 1, 0x09, 0x10, 0, 0, 1, 0x41, 3]); + assert!(source.next_access_unit().unwrap().is_none()); + } + + #[test] + fn reads_rfc4571_rtp_access_unit() { + let packet = rtp_packet(10, 12_000, true, &[0x65, 1, 2]); + let stream = rfc4571(&packet); + let config = ByteStreamSourceConfig::new( + EncodedWireFormat::Rtp { codec: EncodedVideoCodec::H264, clock_rate: 90_000 }, + 0, + 33_333, + 640, + 480, + ); + let mut source = ByteStreamEncodedSource::new(Cursor::new(stream), config).unwrap(); + + let access_unit = source.next_access_unit().unwrap().unwrap(); + assert_eq!(access_unit.payload.as_ref(), &[0, 0, 0, 1, 0x65, 1, 2]); + assert!(source.next_access_unit().unwrap().is_none()); + } +} diff --git a/livekit-capture/src/track.rs b/livekit-capture/src/track.rs new file mode 100644 index 000000000..fb80d1364 --- /dev/null +++ b/livekit-capture/src/track.rs @@ -0,0 +1,119 @@ +// Copyright 2026 LiveKit, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use livekit::{ + options::{TrackPublishOptions, VideoEncoderBackend}, + prelude::LocalVideoTrack, + webrtc::{ + video_frame::{EncodedVideoFrame, VideoBuffer, VideoFrame}, + video_source::{native::NativeVideoSource, RtcVideoSource, VideoResolution}, + }, +}; + +use crate::{ + encoded::{EncodedAccessUnit, EncodedVideoCodec}, + error::CaptureError, +}; + +#[cfg(target_os = "linux")] +use crate::dmabuf::DmaBufFrame; + +/// Capture path used by a source implementation. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +#[non_exhaustive] +pub enum CapturePath { + /// Decoded CPU or native frame buffers. + FrameBuffer, + /// Linux DMA-BUF backed frames. + DmaBuf, + /// Pre-encoded compressed access units. + Encoded, +} + +/// Capture source backed by a LiveKit local video track. +#[derive(Debug, Clone)] +pub struct VideoCaptureTrack { + source: NativeVideoSource, + track: LocalVideoTrack, +} + +impl VideoCaptureTrack { + /// Creates a capture track with the supplied resolution. + pub fn new(name: &str, resolution: VideoResolution, is_screencast: bool) -> Self { + let source = NativeVideoSource::new(resolution, is_screencast); + let track = + LocalVideoTrack::create_video_track(name, RtcVideoSource::Native(source.clone())); + Self { source, track } + } + + /// Returns the publishable local video track. + pub fn track(&self) -> LocalVideoTrack { + self.track.clone() + } + + /// Captures one decoded video frame. + pub fn capture_frame<T: AsRef<dyn VideoBuffer>>(&self, frame: &VideoFrame<T>) { + self.source.capture_frame(frame); + } + + /// Captures one DMA-BUF backed frame. 
+ #[cfg(target_os = "linux")] + pub fn capture_dmabuf(&self, frame: &DmaBufFrame) -> Result<(), CaptureError> { + let plane = frame.planes.first().ok_or(CaptureError::MissingDmaBufPlane)?; + let ok = self.source.capture_dmabuf_frame_with_metadata( + plane.fd, + frame.width, + frame.height, + frame.pixel_format.as_native(), + frame.timestamp_us, + frame.metadata.into_rtc(), + ); + ok.then_some(()).ok_or(CaptureError::CaptureFailed) + } + + /// Captures one encoded video access unit. + pub fn capture_encoded(&self, access_unit: &EncodedAccessUnit<'_>) -> Result<(), CaptureError> { + match access_unit.codec { + EncodedVideoCodec::H264 | EncodedVideoCodec::H265 => {} + EncodedVideoCodec::VP8 | EncodedVideoCodec::VP9 | EncodedVideoCodec::AV1 => { + return Err(CaptureError::UnsupportedCodec(access_unit.codec)); + } + } + if access_unit.payload.is_empty() { + return Err(CaptureError::EmptyPayload); + } + + let payload = access_unit.payload.to_vec(); + let frame = EncodedVideoFrame { + codec: access_unit.codec.into(), + payload: &payload, + timestamp_us: access_unit.timestamp_us, + frame_type: access_unit.frame_type.into(), + width: access_unit.width, + height: access_unit.height, + frame_metadata: access_unit.metadata.into_rtc(), + }; + self.source.capture_encoded_frame(&frame).then_some(()).ok_or(CaptureError::CaptureFailed) + } + + /// Returns publish options appropriate for encoded passthrough. 
+ pub fn encoded_publish_options(codec: EncodedVideoCodec) -> TrackPublishOptions { + TrackPublishOptions { + video_codec: codec.into(), + video_encoder: VideoEncoderBackend::PreEncoded, + simulcast: false, + ..Default::default() + } + } +} From c32c7cb0ba490deab042dbfe70a61101f450be3a Mon Sep 17 00:00:00 2001 From: David Chen Date: Tue, 23 Jun 2026 11:11:25 -0700 Subject: [PATCH 04/24] adding v4l support --- .changeset/livekit-capture-preencoded.md | 2 +- Cargo.lock | 4 + livekit-capture/Cargo.toml | 10 + livekit-capture/README.md | 3 + livekit-capture/build.rs | 65 ++ livekit-capture/src/device.rs | 4 + livekit-capture/src/sources/argus.rs | 383 ++++++++++ livekit-capture/src/sources/lk_argus.cpp | 618 ++++++++++++++++ livekit-capture/src/sources/mod.rs | 4 + livekit-capture/src/sources/v4l.rs | 895 +++++++++++++++++++++++ 10 files changed, 1987 insertions(+), 1 deletion(-) create mode 100644 livekit-capture/build.rs create mode 100644 livekit-capture/src/sources/argus.rs create mode 100644 livekit-capture/src/sources/lk_argus.cpp create mode 100644 livekit-capture/src/sources/v4l.rs diff --git a/.changeset/livekit-capture-preencoded.md b/.changeset/livekit-capture-preencoded.md index 3548b0662..d0b7cb263 100644 --- a/.changeset/livekit-capture-preencoded.md +++ b/.changeset/livekit-capture-preencoded.md @@ -5,4 +5,4 @@ "webrtc-sys": patch --- -Add a `livekit-capture` crate with codec-neutral capture types, H264/H265 passthrough support, common encoded ingress helpers, and feature-gated source/platform scaffolding for TCP, RTSP, GStreamer appsink, and AVFoundation capture. +Add a `livekit-capture` crate with codec-neutral capture types, H264/H265 passthrough support, common encoded ingress helpers, and feature-gated source/platform scaffolding for TCP, RTSP, GStreamer appsink, AVFoundation, Linux V4L, and Jetson libargus capture. 
diff --git a/Cargo.lock b/Cargo.lock index 3cc5ed018..0439042e2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3888,15 +3888,19 @@ version = "0.1.0" dependencies = [ "block2 0.6.2", "bytes", + "cc", "dispatch2", + "image", "imgproc", "livekit", + "nokhwa", "objc2 0.6.4", "objc2-av-foundation", "objc2-core-media", "objc2-core-video", "objc2-foundation 0.3.2", "thiserror 2.0.18", + "yuv-sys", ] [[package]] diff --git a/livekit-capture/Cargo.toml b/livekit-capture/Cargo.toml index 72caf30b9..e5eaf2920 100644 --- a/livekit-capture/Cargo.toml +++ b/livekit-capture/Cargo.toml @@ -9,9 +9,11 @@ repository.workspace = true [dependencies] bytes = { workspace = true } +image = { workspace = true, optional = true } imgproc = { workspace = true, optional = true } livekit = { workspace = true } thiserror = { workspace = true } +yuv-sys = { workspace = true, features = ["jpeg"], optional = true } [features] default = [] @@ -46,8 +48,13 @@ avfoundation = [ "objc2-foundation/NSString", ] gstreamer = [] +libargus = [] rtsp = [] tcp-source = [] +v4l = ["dep:image", "dep:nokhwa", "dep:yuv-sys"] + +[build-dependencies] +cc = { workspace = true } [target.'cfg(target_os = "macos")'.dependencies] block2 = { version = "0.6.2", default-features = false, optional = true } @@ -57,3 +64,6 @@ objc2-av-foundation = { version = "0.3.2", default-features = false, optional = objc2-core-media = { version = "0.3.2", default-features = false, optional = true } objc2-core-video = { version = "0.3.2", default-features = false, optional = true } objc2-foundation = { version = "0.3.2", default-features = false, features = ["std"], optional = true } + +[target.'cfg(target_os = "linux")'.dependencies] +nokhwa = { git = "https://github.com/l1npengtul/nokhwa", rev = "4923ecab7cf26f9dba83867a15a9d8662d021296", default-features = false, features = ["input-v4l"], optional = true } diff --git a/livekit-capture/README.md b/livekit-capture/README.md index 4fa7a8722..21e3e7cb9 100644 --- a/livekit-capture/README.md 
+++ b/livekit-capture/README.md @@ -2,3 +2,6 @@ Capture helpers for publishing decoded, DMA-BUF, and pre-encoded video frames with the LiveKit Rust SDK. + +Optional source features include `avfoundation`, `libargus`, `v4l`, +`tcp-source`, `rtsp`, and `gstreamer`. diff --git a/livekit-capture/build.rs b/livekit-capture/build.rs new file mode 100644 index 000000000..296eb6d04 --- /dev/null +++ b/livekit-capture/build.rs @@ -0,0 +1,65 @@ +// Copyright 2026 LiveKit, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::path::PathBuf; + +fn main() { + println!("cargo:rustc-check-cfg=cfg(livekit_capture_argus)"); + println!("cargo:rerun-if-env-changed=JETSON_MULTIMEDIA_API_DIR"); + + if std::env::var_os("CARGO_FEATURE_LIBARGUS").is_none() { + return; + } + + let target_os = std::env::var("CARGO_CFG_TARGET_OS").unwrap_or_default(); + let target_arch = std::env::var("CARGO_CFG_TARGET_ARCH").unwrap_or_default(); + if target_os != "linux" || target_arch != "aarch64" { + return; + } + + let mmapi_root = std::env::var_os("JETSON_MULTIMEDIA_API_DIR") + .map(PathBuf::from) + .unwrap_or_else(|| PathBuf::from("/usr/src/jetson_multimedia_api")); + let argus_include = mmapi_root.join("argus/include"); + let mmapi_include = mmapi_root.join("include"); + + if !argus_include.exists() || !mmapi_include.exists() { + println!( + "cargo:warning=Argus headers not found under {}; skipping libargus capture shim", + mmapi_root.display() + ); + return; + } + + println!("cargo:rerun-if-changed=src/sources/lk_argus.cpp"); + + cc::Build::new() + .cpp(true) + .file("src/sources/lk_argus.cpp") + .include(&argus_include) + .include(&mmapi_include) + .flag("-std=c++14") + .flag("-Wno-deprecated-declarations") + .compile("lk_argus"); + + println!("cargo:rustc-cfg=livekit_capture_argus"); + println!("cargo:rustc-link-lib=dylib=nvargus_socketclient"); + println!("cargo:rustc-link-lib=dylib=nvbufsurface"); + + let tegra_lib_dir = PathBuf::from("/usr/lib/aarch64-linux-gnu/tegra"); + if tegra_lib_dir.exists() { + println!("cargo:rustc-link-search=native={}", tegra_lib_dir.display()); + } + println!("cargo:rustc-link-search=native=/usr/lib/aarch64-linux-gnu"); +} diff --git a/livekit-capture/src/device.rs b/livekit-capture/src/device.rs index 8dd8d0a9c..fbffcd503 100644 --- a/livekit-capture/src/device.rs +++ b/livekit-capture/src/device.rs @@ -53,8 +53,12 @@ pub enum CapturePixelFormat { Bgra, /// Packed RGB24. Rgb24, + /// Packed BGR24. + Bgr24, /// Packed YUYV/YUY2. Yuyv, + /// Single-plane 8-bit luma. 
+ Gray, /// Encoded MJPEG frames. Mjpeg, } diff --git a/livekit-capture/src/sources/argus.rs b/livekit-capture/src/sources/argus.rs new file mode 100644 index 000000000..ae294bb11 --- /dev/null +++ b/livekit-capture/src/sources/argus.rs @@ -0,0 +1,383 @@ +// Copyright 2026 LiveKit, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! NVIDIA Argus/libargus capture for Jetson MIPI CSI cameras. + +use thiserror::Error; + +use crate::{ + device::{CaptureFormat, CapturePixelFormat, CaptureResolution}, + dmabuf::DmaBufFrame, +}; + +#[cfg(livekit_capture_argus)] +use crate::{ + dmabuf::{DmaBufPixelFormat, DmaBufPlane}, + metadata::FrameMetadata, +}; +#[cfg(livekit_capture_argus)] +use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH}; +#[cfg(livekit_capture_argus)] +use std::{ffi::c_int, ffi::c_void}; + +#[cfg(livekit_capture_argus)] +extern "C" { + fn lk_argus_create_session( + sensor_index: c_int, + width: c_int, + height: c_int, + fps: c_int, + ) -> *mut c_void; + + fn lk_argus_destroy_session(session: *mut c_void); + + fn lk_argus_acquire_frame_with_metadata( + session: *mut c_void, + sensor_timestamp_ns: *mut u64, + acquire_wait_ns: *mut u64, + blit_ns: *mut u64, + ) -> c_int; + + fn lk_argus_release_frame(session: *mut c_void); +} + +/// Options used to open a Jetson Argus capture session. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct ArgusCaptureOptions { + /// MIPI CSI sensor index. + pub sensor_index: u32, + /// Requested capture format. 
+ pub format: CaptureFormat, + /// Attach the Argus sensor timestamp as [`crate::FrameMetadata::user_timestamp`] when available. + pub attach_sensor_timestamp: bool, + /// Attach a monotonically increasing frame id as [`crate::FrameMetadata::frame_id`]. + pub attach_frame_id: bool, +} + +impl ArgusCaptureOptions { + /// Creates options for NV12 DMA-BUF capture from a Jetson MIPI CSI sensor. + pub const fn new(sensor_index: u32, resolution: CaptureResolution, frame_rate: u32) -> Self { + Self { + sensor_index, + format: CaptureFormat::new(resolution, frame_rate, CapturePixelFormat::Nv12), + attach_sensor_timestamp: false, + attach_frame_id: false, + } + } +} + +impl Default for ArgusCaptureOptions { + fn default() -> Self { + Self::new(0, CaptureResolution::new(1280, 720), 30) + } +} + +/// Error returned by the Argus capture backend. +#[derive(Debug, Error, PartialEq, Eq)] +pub enum ArgusError { + /// Argus capture is not available for this target or build. + #[error("libargus capture is not available on this target or build")] + Unsupported, + /// Argus only publishes NV12 DMA-BUF frames in this backend. + #[error("libargus capture only supports NV12 DMA-BUF frames, got {0:?}")] + UnsupportedPixelFormat(CapturePixelFormat), + /// The requested format contains an invalid value. + #[error("invalid Argus capture option: {0}")] + InvalidOption(&'static str), + /// A numeric option could not be represented by the C shim. + #[error("Argus capture option is out of range for the C shim: {0}")] + OptionOutOfRange(&'static str), + /// The C shim failed to create an Argus capture session. + #[error("failed to create Argus capture session")] + CreateSessionFailed, + /// The C shim failed to acquire a frame. + #[error("Argus frame acquisition failed")] + AcquireFrameFailed, +} + +/// One Argus frame backed by an NV12 DMA-BUF. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct ArgusFrame { + /// DMA-BUF frame suitable for [`crate::VideoCaptureTrack::capture_dmabuf`]. 
+ pub dmabuf: DmaBufFrame, + /// Argus sensor start timestamp in nanoseconds, when available. + pub sensor_timestamp_ns: Option<u64>, + /// Time spent waiting for `FrameConsumer::acquireFrame` to return. + pub acquire_wait_ns: u64, + /// Time spent copying the acquired EGLStream frame into the DMA buffer. + pub blit_ns: u64, +} + +impl ArgusFrame { + /// Returns the DMA-BUF frame descriptor. + pub fn dmabuf_frame(&self) -> &DmaBufFrame { + &self.dmabuf + } +} + +/// Jetson Argus capture session that emits NV12 DMA-BUF frames. +#[derive(Debug)] +pub struct ArgusCaptureSession { + #[cfg(livekit_capture_argus)] + handle: *mut c_void, + options: ArgusCaptureOptions, + #[cfg(livekit_capture_argus)] + started_at: Instant, + #[cfg(livekit_capture_argus)] + next_frame_id: u32, +} + +// SAFETY: The C++ Argus session is driven by one mutable Rust owner at a time. +unsafe impl Send for ArgusCaptureSession {} + +impl ArgusCaptureSession { + /// Opens an Argus capture session. + pub fn new(options: ArgusCaptureOptions) -> Result<Self, ArgusError> { + validate_options(&options)?; + Self::open(options) + } + + /// Acquires the next captured frame as an NV12 DMA-BUF. + /// + /// The returned DMA-BUF file descriptor is owned by the Argus session's + /// internal buffer ring. It remains valid until the session is dropped, but + /// callers should publish frames promptly so the ring can be reused. + pub fn acquire_frame(&mut self) -> Result<ArgusFrame, ArgusError> { + self.acquire_frame_inner() + } + + /// Releases the currently held Argus frame, when one is held by the shim. + pub fn release_frame(&mut self) { + self.release_frame_inner(); + } + + /// Returns the configured frame width. + pub fn width(&self) -> u32 { + self.options.format.resolution.width + } + + /// Returns the configured frame height. + pub fn height(&self) -> u32 { + self.options.format.resolution.height + } + + /// Returns the requested capture format. 
+ pub fn format(&self) -> CaptureFormat { + self.options.format + } + + #[cfg(livekit_capture_argus)] + fn open(options: ArgusCaptureOptions) -> Result<Self, ArgusError> { + let sensor_index = c_int_from_u32(options.sensor_index, "sensor_index")?; + let width = c_int_from_u32(options.format.resolution.width, "width")?; + let height = c_int_from_u32(options.format.resolution.height, "height")?; + let frame_rate = c_int_from_u32(options.format.frame_rate, "frame_rate")?; + + let handle = unsafe { + // SAFETY: The C shim expects plain integer values and returns either + // a valid opaque session pointer or null on failure. + lk_argus_create_session(sensor_index, width, height, frame_rate) + }; + if handle.is_null() { + return Err(ArgusError::CreateSessionFailed); + } + + Ok(Self { handle, options, started_at: Instant::now(), next_frame_id: 1 }) + } + + #[cfg(not(livekit_capture_argus))] + fn open(_options: ArgusCaptureOptions) -> Result<Self, ArgusError> { + Err(ArgusError::Unsupported) + } + + #[cfg(livekit_capture_argus)] + fn acquire_frame_inner(&mut self) -> Result<ArgusFrame, ArgusError> { + let mut sensor_timestamp_ns = 0; + let mut acquire_wait_ns = 0; + let mut blit_ns = 0; + let fd = unsafe { + // SAFETY: `self.handle` is created by `lk_argus_create_session` and + // remains valid until `Drop`; the out-pointers are valid for the call. 
+ lk_argus_acquire_frame_with_metadata( + self.handle, + &mut sensor_timestamp_ns, + &mut acquire_wait_ns, + &mut blit_ns, + ) + }; + if fd < 0 { + return Err(ArgusError::AcquireFrameFailed); + } + + let sensor_timestamp_ns = (sensor_timestamp_ns > 0).then_some(sensor_timestamp_ns); + let metadata = self.frame_metadata(sensor_timestamp_ns); + let resolution = self.options.format.resolution; + let dmabuf = DmaBufFrame { + width: resolution.width, + height: resolution.height, + pixel_format: DmaBufPixelFormat::Nv12, + planes: vec![DmaBufPlane { fd, offset: 0, stride: resolution.width }], + modifier: None, + timestamp_us: elapsed_us(self.started_at.elapsed()), + metadata, + }; + + Ok(ArgusFrame { dmabuf, sensor_timestamp_ns, acquire_wait_ns, blit_ns }) + } + + #[cfg(not(livekit_capture_argus))] + fn acquire_frame_inner(&mut self) -> Result<ArgusFrame, ArgusError> { + Err(ArgusError::Unsupported) + } + + #[cfg(livekit_capture_argus)] + fn release_frame_inner(&mut self) { + unsafe { + // SAFETY: `self.handle` is owned by this session and valid until `Drop`. + lk_argus_release_frame(self.handle); + } + } + + #[cfg(not(livekit_capture_argus))] + fn release_frame_inner(&mut self) {} + + #[cfg(livekit_capture_argus)] + fn frame_metadata(&mut self, sensor_timestamp_ns: Option<u64>) -> FrameMetadata { + let user_timestamp = self + .options + .attach_sensor_timestamp + .then(|| sensor_timestamp_ns.and_then(sensor_wall_time_us).or_else(unix_time_us_now)) + .flatten(); + let frame_id = self.options.attach_frame_id.then(|| { + let frame_id = self.next_frame_id; + self.next_frame_id = self.next_frame_id.wrapping_add(1); + frame_id + }); + FrameMetadata { user_timestamp, frame_id } + } +} + +impl Drop for ArgusCaptureSession { + fn drop(&mut self) { + #[cfg(livekit_capture_argus)] + if !self.handle.is_null() { + unsafe { + // SAFETY: `self.handle` is owned by this session and is destroyed once here. 
+ lk_argus_destroy_session(self.handle); + } + self.handle = std::ptr::null_mut(); + } + } +} + +fn validate_options(options: &ArgusCaptureOptions) -> Result<(), ArgusError> { + if options.format.pixel_format != CapturePixelFormat::Nv12 { + return Err(ArgusError::UnsupportedPixelFormat(options.format.pixel_format)); + } + if options.format.resolution.width == 0 { + return Err(ArgusError::InvalidOption("width must be non-zero")); + } + if options.format.resolution.height == 0 { + return Err(ArgusError::InvalidOption("height must be non-zero")); + } + if options.format.frame_rate == 0 { + return Err(ArgusError::InvalidOption("frame_rate must be non-zero")); + } + Ok(()) +} + +#[cfg(livekit_capture_argus)] +fn c_int_from_u32(value: u32, field: &'static str) -> Result<c_int, ArgusError> { + c_int::try_from(value).map_err(|_| ArgusError::OptionOutOfRange(field)) +} + +#[cfg(livekit_capture_argus)] +fn elapsed_us(duration: Duration) -> i64 { + i64::try_from(duration.as_micros()).unwrap_or(i64::MAX) +} + +#[cfg(livekit_capture_argus)] +fn sensor_wall_time_us(sensor_timestamp_ns: u64) -> Option<u64> { + let wall_time_us = unix_time_us_now()?; + sensor_monotonic_ns_to_unix_us(sensor_timestamp_ns, wall_time_us) +} + +/// Converts an Argus `CLOCK_MONOTONIC` timestamp into a UNIX-epoch microsecond timestamp. 
/// Reads `CLOCK_MONOTONIC` and returns it in nanoseconds.
///
/// Returns `None` when `clock_gettime` fails, when either component is negative,
/// or when the nanosecond total does not fit in a `u64`.
#[cfg(target_os = "linux")]
fn monotonic_time_ns_now() -> Option<u64> {
    use std::os::raw::c_int;

    // The plain `clock_gettime` symbol fills a `struct timespec` whose fields are
    // C `long`-sized on Linux, so hard-coding `i64` misreads the value on 32-bit
    // targets. The x32 ABI is the exception: 32-bit `long`, 64-bit time fields.
    #[cfg(all(target_arch = "x86_64", target_pointer_width = "32"))]
    type TimeField = i64;
    #[cfg(not(all(target_arch = "x86_64", target_pointer_width = "32")))]
    type TimeField = std::os::raw::c_long;

    #[repr(C)]
    struct Timespec {
        tv_sec: TimeField,
        tv_nsec: TimeField,
    }

    extern "C" {
        fn clock_gettime(clk_id: c_int, tp: *mut Timespec) -> c_int;
    }

    // Linux value of CLOCK_MONOTONIC.
    const CLOCK_MONOTONIC: c_int = 1;

    let mut ts = Timespec { tv_sec: 0, tv_nsec: 0 };
    let ret = unsafe {
        // SAFETY: `ts` is a valid writable `Timespec` for the duration of the call.
        clock_gettime(CLOCK_MONOTONIC, &mut ts)
    };
    if ret != 0 {
        return None;
    }

    // `try_from` rejects negative components, so no separate sign check is needed.
    let seconds = u64::try_from(ts.tv_sec).ok()?;
    let nanos = u64::try_from(ts.tv_nsec).ok()?;
    seconds.checked_mul(1_000_000_000)?.checked_add(nanos)
}

/// Monotonic clock access is only implemented for Linux; other targets report `None`.
#[cfg(not(target_os = "linux"))]
fn monotonic_time_ns_now() -> Option<u64> {
    None
}
ArgusError::InvalidOption("frame_rate must be non-zero")); + } +} diff --git a/livekit-capture/src/sources/lk_argus.cpp b/livekit-capture/src/sources/lk_argus.cpp new file mode 100644 index 000000000..5ae3931f3 --- /dev/null +++ b/livekit-capture/src/sources/lk_argus.cpp @@ -0,0 +1,618 @@ +// Copyright 2026 LiveKit, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// C shim around NVIDIA libargus for MIPI CSI camera capture on Jetson. +// +// Exposes a simple C API for the Rust FFI in argus.rs: +// lk_argus_create_session - open sensor, configure ISP, start repeating capture +// lk_argus_acquire_frame - dequeue next frame, return NvBufSurface DMA fd +// lk_argus_release_frame - release frame back to Argus buffer pool +// lk_argus_destroy_session - tear down everything + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include "NvBufSurface.h" + +// Ring buffer size for persistent NvBufSurface DMA allocations. +// The encoder may hold 1-2 buffers while encoding, and the blit writes to +// another. 4 buffers gives comfortable headroom to avoid the "Wrong buffer +// index" errors that occur when the capture loop laps the encoder. 
+static constexpr int kNumDmaBufs = 4; + +struct LkArgusSession { + Argus::UniqueObj provider; + Argus::UniqueObj session; + Argus::UniqueObj stream_settings; + Argus::UniqueObj stream; + Argus::UniqueObj request; + Argus::UniqueObj event_queue; + Argus::UniqueObj consumer; + + // Most recently acquired frame (kept alive until release/next acquire). + Argus::UniqueObj current_frame; + + // Ring of DMA fds so the encoder can hold one buffer while we blit the + // next frame into a different one. Avoids the "Wrong buffer index" + // errors caused by the encoder and Argus racing on a single buffer. + int dmabuf_fds[kNumDmaBufs]; + NvBufSurface* dmabuf_surfaces[kNumDmaBufs]; // original surface ptrs for cleanup + int dmabuf_write_idx; // next buffer to blit into + int width; + int height; + bool metadata_enabled; + bool event_metadata_enabled; +}; + +static const uint64_t kAcquireTimeoutNs = 1000000000ULL; // 1 second + +enum class SensorTimestampStatus { + Available, + InvalidArgs, + NoEventQueue, + EventWaitFailed, + NoCaptureCompleteEvent, + CaptureCompleteFailed, + NoEventMetadata, + NoOutputStream, + MetadataCreateFailed, + NoCaptureMetadata, + ZeroTimestamp, +}; + +static const char* sensor_timestamp_status_name(SensorTimestampStatus status) { + switch (status) { + case SensorTimestampStatus::Available: + return "available"; + case SensorTimestampStatus::InvalidArgs: + return "invalid args"; + case SensorTimestampStatus::NoEventQueue: + return "no capture-complete event queue"; + case SensorTimestampStatus::EventWaitFailed: + return "capture-complete event wait failed"; + case SensorTimestampStatus::NoCaptureCompleteEvent: + return "no capture-complete event"; + case SensorTimestampStatus::CaptureCompleteFailed: + return "capture-complete event failed"; + case SensorTimestampStatus::NoEventMetadata: + return "no capture-complete metadata"; + case SensorTimestampStatus::NoOutputStream: + return "no EGL output stream"; + case 
SensorTimestampStatus::MetadataCreateFailed: + return "metadata container create failed"; + case SensorTimestampStatus::NoCaptureMetadata: + return "no capture metadata interface"; + case SensorTimestampStatus::ZeroTimestamp: + return "zero sensor timestamp"; + } + return "unknown"; +} + +static SensorTimestampStatus read_sensor_timestamp_ns_from_event( + LkArgusSession* s, + uint64_t* sensor_timestamp_ns, + Argus::Status* metadata_status) { + if (metadata_status) *metadata_status = Argus::STATUS_OK; + if (!s || !sensor_timestamp_ns) return SensorTimestampStatus::InvalidArgs; + *sensor_timestamp_ns = 0; + + auto* i_event_provider = Argus::interface_cast(s->session); + auto* i_event_queue = Argus::interface_cast(s->event_queue); + if (!i_event_provider || !i_event_queue) { + return SensorTimestampStatus::NoEventQueue; + } + + Argus::Status status = i_event_provider->waitForEvents(s->event_queue.get(), 1000000); + if (metadata_status) *metadata_status = status; + if (status != Argus::STATUS_OK) { + return SensorTimestampStatus::EventWaitFailed; + } + + const Argus::Event* newest_capture_complete = nullptr; + for (uint32_t i = 0; i < i_event_queue->getSize(); i++) { + const Argus::Event* event = i_event_queue->getEvent(i); + auto* i_event = Argus::interface_cast(event); + if (i_event && i_event->getEventType() == Argus::EVENT_TYPE_CAPTURE_COMPLETE) { + newest_capture_complete = event; + } + } + if (!newest_capture_complete) { + return SensorTimestampStatus::NoCaptureCompleteEvent; + } + + auto* i_capture_complete = + Argus::interface_cast(newest_capture_complete); + if (!i_capture_complete) { + return SensorTimestampStatus::NoCaptureCompleteEvent; + } + status = i_capture_complete->getStatus(); + if (metadata_status) *metadata_status = status; + if (status != Argus::STATUS_OK) { + return SensorTimestampStatus::CaptureCompleteFailed; + } + + const Argus::CaptureMetadata* metadata = i_capture_complete->getMetadata(); + if (!metadata) { + return 
// Reads the sensor timestamp from the metadata embedded in the session's
// EGLStream.
//
// On return `*sensor_timestamp_ns` holds the timestamp, or 0 on any failure.
// `metadata_status` (optional) receives the Argus status reported while creating
// the metadata container. The result says which step failed, or `Available`.
static SensorTimestampStatus read_sensor_timestamp_ns_from_egl_metadata(
    LkArgusSession* s,
    uint64_t* sensor_timestamp_ns,
    Argus::Status* metadata_status) {
    if (metadata_status) *metadata_status = Argus::STATUS_OK;
    if (!s || !sensor_timestamp_ns) return SensorTimestampStatus::InvalidArgs;
    *sensor_timestamp_ns = 0;

    auto* i_stream = Argus::interface_cast<Argus::IEGLOutputStream>(s->stream);
    if (!i_stream) return SensorTimestampStatus::NoOutputStream;

    // Initialised so a create() that fails without writing the status is not
    // read as indeterminate.
    Argus::Status status = Argus::STATUS_OK;
    EGLStream::MetadataContainer* metadata = EGLStream::MetadataContainer::create(
        i_stream->getEGLDisplay(),
        i_stream->getEGLStream(),
        EGLStream::MetadataContainer::CONSUMER,
        &status);
    if (metadata_status) *metadata_status = status;
    if (status != Argus::STATUS_OK || !metadata) {
        // A container may be returned together with a failure status; it still
        // has to be destroyed or it leaks on every frame.
        if (metadata) metadata->destroy();
        return SensorTimestampStatus::MetadataCreateFailed;
    }

    // NOTE(review): assumes the container exposes ICaptureMetadata directly -
    // confirm against the Argus headers in use.
    auto* i_metadata = Argus::interface_cast<Argus::ICaptureMetadata>(metadata);
    if (!i_metadata) {
        metadata->destroy();
        return SensorTimestampStatus::NoCaptureMetadata;
    }

    *sensor_timestamp_ns = i_metadata->getSensorTimestamp();
    metadata->destroy();
    if (*sensor_timestamp_ns == 0) {
        return SensorTimestampStatus::ZeroTimestamp;
    }
    return SensorTimestampStatus::Available;
}
unavailable. Event queues are session-scoped, so they can lag or lead + // the exact frame returned by FrameConsumer::acquireFrame(). + SensorTimestampStatus egl_status = status; + Argus::Status egl_metadata_status = + metadata_status ? *metadata_status : Argus::STATUS_OK; + + SensorTimestampStatus event_status = + read_sensor_timestamp_ns_from_event(s, sensor_timestamp_ns, metadata_status); + if (event_status == SensorTimestampStatus::Available) { + return event_status; + } + + if (metadata_status) *metadata_status = egl_metadata_status; + return egl_status; +} + +extern "C" { + +void* lk_argus_create_session(int sensor_index, int width, int height, int fps) { + auto* s = new LkArgusSession(); + for (int i = 0; i < kNumDmaBufs; i++) { + s->dmabuf_fds[i] = -1; + s->dmabuf_surfaces[i] = nullptr; + } + s->dmabuf_write_idx = 0; + s->width = width; + s->height = height; + s->metadata_enabled = false; + s->event_metadata_enabled = false; + + // Create CameraProvider + s->provider = Argus::UniqueObj( + Argus::CameraProvider::create()); + auto* i_provider = Argus::interface_cast(s->provider); + if (!i_provider) { + fprintf(stderr, "[lk_argus] Failed to create CameraProvider\n"); + delete s; + return nullptr; + } + + // Enumerate camera devices + std::vector devices; + i_provider->getCameraDevices(&devices); + if (devices.empty() || sensor_index >= static_cast(devices.size())) { + fprintf(stderr, "[lk_argus] No camera device at index %d (found %zu)\n", + sensor_index, devices.size()); + delete s; + return nullptr; + } + + // Create CaptureSession + Argus::Status status; + s->session = Argus::UniqueObj( + i_provider->createCaptureSession(devices[sensor_index], &status)); + if (status != Argus::STATUS_OK) { + fprintf(stderr, "[lk_argus] Failed to create CaptureSession: %d\n", + static_cast(status)); + delete s; + return nullptr; + } + auto* i_session = Argus::interface_cast(s->session); + auto* i_event_provider = Argus::interface_cast(s->session); + if (i_event_provider) { + 
std::vector event_types; + event_types.push_back(Argus::EVENT_TYPE_CAPTURE_COMPLETE); + s->event_queue = Argus::UniqueObj( + i_event_provider->createEventQueue(event_types, &status)); + if (status != Argus::STATUS_OK || !s->event_queue) { + fprintf(stderr, + "[lk_argus] WARNING: failed to create capture-complete event queue: %d\n", + static_cast(status)); + } else { + s->event_metadata_enabled = true; + fprintf(stderr, "[lk_argus] Capture-complete metadata events enabled: yes\n"); + } + } else { + fprintf(stderr, "[lk_argus] WARNING: capture session has no event provider interface\n"); + } + + // Create OutputStream (EGLStream-backed) + s->stream_settings = Argus::UniqueObj( + i_session->createOutputStreamSettings(Argus::STREAM_TYPE_EGL, &status)); + auto* i_stream_settings = + Argus::interface_cast(s->stream_settings); + if (!i_stream_settings) { + fprintf(stderr, "[lk_argus] Failed to get IEGLOutputStreamSettings\n"); + delete s; + return nullptr; + } + i_stream_settings->setPixelFormat(Argus::PIXEL_FMT_YCbCr_420_888); + i_stream_settings->setResolution(Argus::Size2D(width, height)); + status = i_stream_settings->setMode(Argus::EGL_STREAM_MODE_MAILBOX); + if (status != Argus::STATUS_OK) { + fprintf(stderr, "[lk_argus] WARNING: failed to set EGLStream mailbox mode: %d\n", + static_cast(status)); + } + status = i_stream_settings->setFifoLength(1); + if (status != Argus::STATUS_OK) { + fprintf(stderr, "[lk_argus] WARNING: failed to set EGLStream FIFO length: %d\n", + static_cast(status)); + } + fprintf(stderr, "[lk_argus] EGLStream mode: mailbox, fifo length: %u\n", + i_stream_settings->getFifoLength()); + status = i_stream_settings->setMetadataEnable(true); + if (status != Argus::STATUS_OK) { + fprintf(stderr, "[lk_argus] WARNING: failed to enable EGLStream metadata: %d\n", + static_cast(status)); + } + s->metadata_enabled = i_stream_settings->getMetadataEnable(); + fprintf(stderr, "[lk_argus] EGLStream metadata enabled: %s\n", + s->metadata_enabled ? 
"yes" : "no"); + + s->stream = Argus::UniqueObj( + i_session->createOutputStream(s->stream_settings.get(), &status)); + if (status != Argus::STATUS_OK) { + fprintf(stderr, "[lk_argus] Failed to create OutputStream: %d\n", + static_cast(status)); + delete s; + return nullptr; + } + + // Create FrameConsumer + s->consumer = Argus::UniqueObj( + EGLStream::FrameConsumer::create(s->stream.get())); + auto* i_consumer = + Argus::interface_cast(s->consumer); + if (!i_consumer) { + fprintf(stderr, "[lk_argus] Failed to create FrameConsumer\n"); + delete s; + return nullptr; + } + + // Create capture Request + s->request = Argus::UniqueObj( + i_session->createRequest(Argus::CAPTURE_INTENT_VIDEO_RECORD, &status)); + if (status != Argus::STATUS_OK) { + fprintf(stderr, "[lk_argus] Failed to create Request: %d\n", + static_cast(status)); + delete s; + return nullptr; + } + auto* i_request = + Argus::interface_cast(s->request); + i_request->enableOutputStream(s->stream.get()); + + // --- Sensor mode selection --- + // Argus auto-selects a sensor mode, but often picks the highest-resolution + // mode and downscales, running at that mode's (lower) framerate. We + // explicitly pick the smallest mode that covers the requested resolution + // and supports the requested framerate. 
+ auto* i_props = Argus::interface_cast( + devices[sensor_index]); + if (i_props) { + std::vector modes; + i_props->getAllSensorModes(&modes); + fprintf(stderr, "[lk_argus] %zu sensor modes available:\n", modes.size()); + + Argus::SensorMode* best_mode = nullptr; + uint64_t best_pixels = UINT64_MAX; + uint64_t requested_dur_ns = 1000000000ULL / fps; + + for (size_t i = 0; i < modes.size(); i++) { + auto* i_mode = Argus::interface_cast(modes[i]); + if (!i_mode) continue; + auto res = i_mode->getResolution(); + auto dur = i_mode->getFrameDurationRange(); + double min_fps_mode = 1e9 / static_cast(dur.max()); + double max_fps_mode = 1e9 / static_cast(dur.min()); + fprintf(stderr, " [%zu] %ux%u fps %.1f-%.1f duration %lu-%lu ns\n", + i, res.width(), res.height(), + min_fps_mode, max_fps_mode, + dur.min(), dur.max()); + + // Compare frame durations instead of floating-point fps. + // Sensor durations are in nanoseconds and often off by 1 ns + // from the ideal value (e.g., 33333334 vs 33333333 for 30fps). + // A 1ms tolerance handles this rounding. 
+ if (static_cast(res.width()) >= width && + static_cast(res.height()) >= height && + dur.min() <= requested_dur_ns + 1000000) { + uint64_t pixels = static_cast(res.width()) * res.height(); + if (pixels < best_pixels) { + best_pixels = pixels; + best_mode = modes[i]; + } + } + } + + auto* i_source = Argus::interface_cast( + i_request->getSourceSettings()); + + if (best_mode) { + auto* i_best = Argus::interface_cast(best_mode); + auto res = i_best->getResolution(); + auto dur = i_best->getFrameDurationRange(); + fprintf(stderr, "[lk_argus] Selected sensor mode: %ux%u fps %.1f-%.1f\n", + res.width(), res.height(), + 1e9 / static_cast(dur.max()), + 1e9 / static_cast(dur.min())); + if (i_source) { + i_source->setSensorMode(best_mode); + } + } else { + fprintf(stderr, "[lk_argus] WARNING: no sensor mode found for %dx%d @ %d fps, " + "using Argus default (may be slower)\n", width, height, fps); + } + + if (i_source) { + uint64_t frame_dur_ns = 1000000000ULL / fps; + i_source->setFrameDurationRange( + Argus::Range(frame_dur_ns, frame_dur_ns)); + i_source->setExposureTimeRange( + Argus::Range(0, frame_dur_ns)); + fprintf(stderr, "[lk_argus] Frame duration: %lu ns, max exposure: %lu ns\n", + frame_dur_ns, frame_dur_ns); + } + } else { + fprintf(stderr, "[lk_argus] WARNING: could not query sensor modes\n"); + auto* i_source = Argus::interface_cast( + i_request->getSourceSettings()); + if (i_source) { + i_source->setFrameDurationRange( + Argus::Range(1000000000ULL / fps, 1000000000ULL / fps)); + } + } + + // Allocate a ring of persistent NvBufSurface buffers so the encoder can + // hold one while we blit the next frame into a different one. 
+ for (int i = 0; i < kNumDmaBufs; i++) { + NvBufSurfaceCreateParams create_params = {}; + create_params.gpuId = 0; + create_params.width = static_cast(width); + create_params.height = static_cast(height); + create_params.size = 0; + create_params.colorFormat = NVBUF_COLOR_FORMAT_NV12; + create_params.layout = NVBUF_LAYOUT_PITCH; + create_params.memType = NVBUF_MEM_SURFACE_ARRAY; + + NvBufSurface* surface = nullptr; + if (NvBufSurfaceCreate(&surface, 1, &create_params) != 0 || !surface) { + fprintf(stderr, "[lk_argus] Failed to create NvBufSurface[%d]\n", i); + delete s; + return nullptr; + } + s->dmabuf_fds[i] = surface->surfaceList[0].bufferDesc; + s->dmabuf_surfaces[i] = surface; + } + + // Start repeating capture + status = i_session->repeat(s->request.get()); + if (status != Argus::STATUS_OK) { + fprintf(stderr, "[lk_argus] Failed to start repeating capture: %d\n", + static_cast(status)); + delete s; + return nullptr; + } + + fprintf(stderr, "[lk_argus] Session created: %dx%d @ %d fps, sensor %d, %d DMA buffers (fds:", + width, height, fps, sensor_index, kNumDmaBufs); + for (int i = 0; i < kNumDmaBufs; i++) fprintf(stderr, " %d", s->dmabuf_fds[i]); + fprintf(stderr, ")\n"); + return s; +} + +int lk_argus_acquire_frame_with_metadata( + void* handle, + uint64_t* sensor_timestamp_ns, + uint64_t* acquire_wait_ns, + uint64_t* blit_ns) { + using Clock = std::chrono::steady_clock; + + auto* s = static_cast(handle); + if (!s) return -1; + if (sensor_timestamp_ns) *sensor_timestamp_ns = 0; + if (acquire_wait_ns) *acquire_wait_ns = 0; + if (blit_ns) *blit_ns = 0; + + auto* i_consumer = + Argus::interface_cast(s->consumer); + if (!i_consumer) return -1; + + // Release any previously held frame + s->current_frame.reset(); + + auto t0 = Clock::now(); + + Argus::Status status; + s->current_frame = Argus::UniqueObj( + i_consumer->acquireFrame(kAcquireTimeoutNs, &status)); + if (status != Argus::STATUS_OK || !s->current_frame) { + return -1; + } + + auto t1 = Clock::now(); + 
+ auto* i_frame = + Argus::interface_cast(s->current_frame); + if (!i_frame) return -1; + + Argus::Status metadata_status = Argus::STATUS_OK; + SensorTimestampStatus sensor_timestamp_status = + read_sensor_timestamp_ns(s, sensor_timestamp_ns, &metadata_status); + bool has_sensor_timestamp = + sensor_timestamp_status == SensorTimestampStatus::Available; + static SensorTimestampStatus last_logged_sensor_timestamp_status = + SensorTimestampStatus::Available; + if (!has_sensor_timestamp && + sensor_timestamp_status != last_logged_sensor_timestamp_status) { + fprintf(stderr, + "[lk_argus] Sensor timestamp unavailable: %s " + "(event metadata enabled=%s, EGL metadata enabled=%s, status=%d)\n", + sensor_timestamp_status_name(sensor_timestamp_status), + s->event_metadata_enabled ? "yes" : "no", + s->metadata_enabled ? "yes" : "no", + static_cast(metadata_status)); + last_logged_sensor_timestamp_status = sensor_timestamp_status; + } else if (has_sensor_timestamp && + last_logged_sensor_timestamp_status != SensorTimestampStatus::Available) { + fprintf(stderr, "[lk_argus] Sensor timestamp available\n"); + last_logged_sensor_timestamp_status = SensorTimestampStatus::Available; + } + + auto* image = i_frame->getImage(); + if (!image) return -1; + + // Get the NativeBuffer interface to extract the DMA fd + auto* i_native = + Argus::interface_cast(image); + if (!i_native) { + fprintf(stderr, "[lk_argus] Image does not support IImageNativeBuffer\n"); + return -1; + } + + // Pick the next buffer in the ring so we don't overwrite a buffer the + // encoder may still be reading from. + int idx = s->dmabuf_write_idx; + s->dmabuf_write_idx = (s->dmabuf_write_idx + 1) % kNumDmaBufs; + int fd = s->dmabuf_fds[idx]; + + // Copy (blit) the acquired frame into the selected NvBufSurface. 
+ status = i_native->copyToNvBuffer(fd); + + auto t2 = Clock::now(); + auto acquire_duration_ns = + std::chrono::duration_cast(t1 - t0).count(); + auto blit_duration_ns = + std::chrono::duration_cast(t2 - t1).count(); + if (acquire_wait_ns) *acquire_wait_ns = static_cast(acquire_duration_ns); + if (blit_ns) *blit_ns = static_cast(blit_duration_ns); + + // Release the Argus frame immediately - the pixel data has been blitted + // into our persistent NvBufSurface so we no longer need the EGLStream frame. + s->current_frame.reset(); + + if (status != Argus::STATUS_OK) { + fprintf(stderr, "[lk_argus] copyToNvBuffer failed: %d\n", + static_cast(status)); + return -1; + } + + return fd; +} + +int lk_argus_acquire_frame(void* handle) { + return lk_argus_acquire_frame_with_metadata(handle, nullptr, nullptr, nullptr); +} + +void lk_argus_release_frame(void* handle) { + auto* s = static_cast(handle); + if (!s) return; + s->current_frame.reset(); +} + +void lk_argus_destroy_session(void* handle) { + auto* s = static_cast(handle); + if (!s) return; + + // Stop repeating capture + auto* i_session = Argus::interface_cast(s->session); + if (i_session) { + i_session->stopRepeat(); + i_session->waitForIdle(); + } + + s->current_frame.reset(); + + // Free all persistent NvBufSurface buffers using the original pointers. + for (int i = 0; i < kNumDmaBufs; i++) { + if (s->dmabuf_surfaces[i]) { + NvBufSurfaceDestroy(s->dmabuf_surfaces[i]); + s->dmabuf_surfaces[i] = nullptr; + } + s->dmabuf_fds[i] = -1; + } + + delete s; + fprintf(stderr, "[lk_argus] Session destroyed\n"); +} + +} // extern "C" diff --git a/livekit-capture/src/sources/mod.rs b/livekit-capture/src/sources/mod.rs index fbae7ca1b..3b33ec675 100644 --- a/livekit-capture/src/sources/mod.rs +++ b/livekit-capture/src/sources/mod.rs @@ -14,9 +14,13 @@ //! Optional capture sources that feed the shared capture paths. 
+#[cfg(feature = "libargus")] +pub mod argus; #[cfg(feature = "gstreamer")] pub mod gstreamer; #[cfg(feature = "rtsp")] pub mod rtsp; #[cfg(feature = "tcp-source")] pub mod tcp; +#[cfg(feature = "v4l")] +pub mod v4l; diff --git a/livekit-capture/src/sources/v4l.rs b/livekit-capture/src/sources/v4l.rs new file mode 100644 index 000000000..a0725463c --- /dev/null +++ b/livekit-capture/src/sources/v4l.rs @@ -0,0 +1,895 @@ +// Copyright 2026 LiveKit, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Linux V4L2 capture using Nokhwa's V4L backend. + +use std::time::Duration; +#[cfg(target_os = "linux")] +use std::time::{Instant, SystemTime, UNIX_EPOCH}; + +#[cfg(target_os = "linux")] +use livekit::webrtc::video_frame::VideoRotation; +use livekit::webrtc::video_frame::{I420Buffer, VideoFrame}; +#[cfg(target_os = "linux")] +use nokhwa::{ + pixel_format::RgbFormat, + utils::{ + ApiBackend, CameraFormat, CameraIndex, FrameFormat, RequestedFormat, RequestedFormatType, + Resolution, + }, + Camera, +}; +use thiserror::Error; + +use crate::device::{ + CaptureDeviceInfo, CaptureDeviceSelector, CaptureFormat, CaptureFormatRequest, + CapturePixelFormat, CaptureResolution, +}; +#[cfg(target_os = "linux")] +use crate::metadata::FrameMetadata; + +#[cfg(any(target_os = "linux", test))] +const MAX_BACKEND_CAPTURE_TIMESTAMP_AGE_US: u64 = 5_000_000; + +/// Options used to open a Linux V4L2 capture session. 
+#[derive(Debug, Clone, PartialEq, Eq)] +pub struct V4lCaptureOptions { + /// Device to open. + pub device: CaptureDeviceSelector, + /// Requested format policy. + pub format: CaptureFormatRequest, + /// Ordered source pixel formats to try. + pub pixel_formats: Vec, + /// Attach a wall-clock capture timestamp as [`crate::FrameMetadata::user_timestamp`]. + pub attach_capture_timestamp: bool, + /// Attach a monotonically increasing frame id as [`crate::FrameMetadata::frame_id`]. + pub attach_frame_id: bool, +} + +impl V4lCaptureOptions { + /// Creates options that try YUYV, MJPEG, grayscale, RGB24, and NV12 at the requested format. + pub fn new( + device: CaptureDeviceSelector, + resolution: CaptureResolution, + frame_rate: u32, + ) -> Self { + Self { + device, + format: CaptureFormatRequest::Exact(CaptureFormat::new( + resolution, + frame_rate, + CapturePixelFormat::Yuyv, + )), + pixel_formats: default_pixel_formats(), + attach_capture_timestamp: false, + attach_frame_id: false, + } + } +} + +impl Default for V4lCaptureOptions { + fn default() -> Self { + Self::new(CaptureDeviceSelector::Default, CaptureResolution::new(1280, 720), 30) + } +} + +/// Error returned by the V4L capture backend. +#[derive(Debug, Error)] +pub enum V4lError { + /// V4L capture is only available on Linux. + #[error("V4L capture is not supported on this platform")] + UnsupportedPlatform, + /// The requested pixel format is not supported by this backend. + #[error("V4L capture does not support pixel format {0:?}")] + UnsupportedPixelFormat(CapturePixelFormat), + /// The requested option is invalid. + #[error("invalid V4L capture option: {0}")] + InvalidOption(&'static str), + /// A numeric option could not be represented by Nokhwa. + #[error("V4L capture option is out of range: {0}")] + OptionOutOfRange(&'static str), + /// The camera backend returned an error. + #[error("V4L camera error: {0}")] + Camera(String), + /// Captured frame bytes did not match the negotiated format. 
+ #[error("invalid V4L frame buffer: {0}")] + InvalidFrame(&'static str), + /// Pixel conversion failed. + #[error("failed to convert V4L frame to I420: {0}")] + Convert(&'static str), + /// MJPEG fallback decoding failed. + #[error("failed to decode MJPEG frame: {0}")] + Decode(String), +} + +/// One V4L frame converted to I420. +#[derive(Debug)] +pub struct V4lFrame { + /// Decoded I420 frame suitable for [`crate::VideoCaptureTrack::capture_frame`]. + pub frame: VideoFrame, + /// Source pixel format delivered by the camera backend. + pub source_pixel_format: CapturePixelFormat, + /// Backend-provided capture timestamp, when available. + pub backend_capture_timestamp: Option, + /// Whether compressed image decoding was needed. + pub used_decode_path: bool, +} + +impl V4lFrame { + /// Returns the decoded video frame. + pub fn video_frame(&self) -> &VideoFrame { + &self.frame + } +} + +/// Linux V4L2 capture session that emits decoded I420 frames. +pub struct V4lCaptureSession { + #[cfg(target_os = "linux")] + camera: Camera, + format: CaptureFormat, + #[cfg(target_os = "linux")] + options: V4lCaptureOptions, + #[cfg(target_os = "linux")] + started_at: Instant, + #[cfg(target_os = "linux")] + next_frame_id: u32, +} + +impl std::fmt::Debug for V4lCaptureSession { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let mut debug = f.debug_struct("V4lCaptureSession"); + debug.field("format", &self.format); + #[cfg(target_os = "linux")] + debug.field("options", &self.options); + debug.finish() + } +} + +impl V4lCaptureSession { + /// Opens a Linux V4L2 capture session. + pub fn new(options: V4lCaptureOptions) -> Result { + validate_options(&options)?; + Self::open(options) + } + + /// Captures the next frame and converts it to I420. + pub fn capture_frame(&mut self) -> Result { + self.capture_frame_inner() + } + + /// Returns the negotiated capture format. 
+ pub fn format(&self) -> CaptureFormat { + self.format + } + + #[cfg(target_os = "linux")] + fn open(options: V4lCaptureOptions) -> Result { + let frame_formats = frame_formats_for_request(&options)?; + let requested = RequestedFormat::with_formats(RequestedFormatType::None, &frame_formats); + let mut camera = Camera::with_backend( + camera_index(&options.device)?, + requested, + ApiBackend::Video4Linux, + ) + .map_err(nokhwa_error)?; + + apply_format_request(&mut camera, &options, &frame_formats)?; + + camera.open_stream().map_err(nokhwa_error)?; + let format = capture_format_from_nokhwa(camera.camera_format())?; + Ok(Self { camera, format, options, started_at: Instant::now(), next_frame_id: 1 }) + } + + #[cfg(not(target_os = "linux"))] + fn open(_options: V4lCaptureOptions) -> Result { + Err(V4lError::UnsupportedPlatform) + } + + #[cfg(target_os = "linux")] + fn capture_frame_inner(&mut self) -> Result { + let fallback_wall_time_us = unix_time_us_now().unwrap_or_default(); + let buffer = self.camera.frame().map_err(nokhwa_error)?; + let read_wall_time_us = unix_time_us_now().unwrap_or(fallback_wall_time_us); + let backend_capture_timestamp = buffer.capture_timestamp(); + let capture_wall_time_us = select_capture_wall_time_us( + backend_capture_timestamp, + fallback_wall_time_us, + read_wall_time_us, + ); + + let format = self.camera.camera_format(); + let width = format.width(); + let height = format.height(); + let mut frame = VideoFrame { + rotation: VideoRotation::VideoRotation0, + timestamp_us: elapsed_us(self.started_at.elapsed()), + frame_metadata: self.frame_metadata(capture_wall_time_us).into_rtc(), + buffer: I420Buffer::new(width, height), + }; + let used_decode_path = convert_to_i420( + buffer.source_frame_format(), + buffer.buffer(), + width, + height, + &mut frame.buffer, + )?; + let source_pixel_format = capture_pixel_format_from_nokhwa(buffer.source_frame_format())?; + + Ok(V4lFrame { frame, source_pixel_format, backend_capture_timestamp, 
used_decode_path }) + } + + #[cfg(not(target_os = "linux"))] + fn capture_frame_inner(&mut self) -> Result { + Err(V4lError::UnsupportedPlatform) + } + + #[cfg(target_os = "linux")] + fn frame_metadata(&mut self, capture_wall_time_us: u64) -> FrameMetadata { + let user_timestamp = self.options.attach_capture_timestamp.then_some(capture_wall_time_us); + let frame_id = self.options.attach_frame_id.then(|| { + let frame_id = self.next_frame_id; + self.next_frame_id = self.next_frame_id.wrapping_add(1); + frame_id + }); + FrameMetadata { user_timestamp, frame_id } + } +} + +/// Returns Linux V4L2 capture devices. +#[cfg(target_os = "linux")] +pub fn devices() -> Result, V4lError> { + nokhwa::query(ApiBackend::Video4Linux) + .map_err(nokhwa_error)? + .into_iter() + .map(|info| { + let formats = enumerate_formats(&info).unwrap_or_default(); + Ok(CaptureDeviceInfo { + id: info.index().as_string(), + name: info.human_name(), + model_id: Some(info.description().to_string()).filter(|value| !value.is_empty()), + manufacturer: None, + formats, + }) + }) + .collect() +} + +/// Returns Linux V4L2 capture devices. 
+#[cfg(not(target_os = "linux"))] +pub fn devices() -> Result, V4lError> { + Err(V4lError::UnsupportedPlatform) +} + +fn default_pixel_formats() -> Vec { + vec![ + CapturePixelFormat::Yuyv, + CapturePixelFormat::Mjpeg, + CapturePixelFormat::Gray, + CapturePixelFormat::Rgb24, + CapturePixelFormat::Nv12, + ] +} + +fn validate_options(options: &V4lCaptureOptions) -> Result<(), V4lError> { + match &options.device { + CaptureDeviceSelector::Default => {} + CaptureDeviceSelector::Index(index) => { + u32::try_from(*index).map_err(|_| V4lError::OptionOutOfRange("device index"))?; + } + CaptureDeviceSelector::Id(id) => { + if id.is_empty() { + return Err(V4lError::InvalidOption("device id must be non-empty")); + } + } + } + + if options.pixel_formats.is_empty() { + return Err(V4lError::InvalidOption("pixel_formats must include at least one format")); + } + for pixel_format in &options.pixel_formats { + if nokhwa_frame_format(*pixel_format).is_none() { + return Err(V4lError::UnsupportedPixelFormat(*pixel_format)); + } + } + + validate_format_request(&options.format) +} + +fn validate_format_request(format: &CaptureFormatRequest) -> Result<(), V4lError> { + let validate_format = |format: &CaptureFormat| { + if format.resolution.width == 0 { + return Err(V4lError::InvalidOption("width must be non-zero")); + } + if format.resolution.height == 0 { + return Err(V4lError::InvalidOption("height must be non-zero")); + } + if format.frame_rate == 0 { + return Err(V4lError::InvalidOption("frame_rate must be non-zero")); + } + if nokhwa_frame_format(format.pixel_format).is_none() { + return Err(V4lError::UnsupportedPixelFormat(format.pixel_format)); + } + Ok(()) + }; + + match format { + CaptureFormatRequest::Default => Ok(()), + CaptureFormatRequest::Exact(format) | CaptureFormatRequest::Closest(format) => { + validate_format(format) + } + CaptureFormatRequest::HighestFrameRate { resolution, pixel_format } => { + if let Some(resolution) = resolution { + 
validate_resolution(*resolution)?; + } + if let Some(pixel_format) = pixel_format { + if nokhwa_frame_format(*pixel_format).is_none() { + return Err(V4lError::UnsupportedPixelFormat(*pixel_format)); + } + } + Ok(()) + } + CaptureFormatRequest::HighestResolution { frame_rate, pixel_format } => { + if matches!(frame_rate, Some(0)) { + return Err(V4lError::InvalidOption("frame_rate must be non-zero")); + } + if let Some(pixel_format) = pixel_format { + if nokhwa_frame_format(*pixel_format).is_none() { + return Err(V4lError::UnsupportedPixelFormat(*pixel_format)); + } + } + Ok(()) + } + } +} + +fn validate_resolution(resolution: CaptureResolution) -> Result<(), V4lError> { + if resolution.width == 0 { + return Err(V4lError::InvalidOption("width must be non-zero")); + } + if resolution.height == 0 { + return Err(V4lError::InvalidOption("height must be non-zero")); + } + Ok(()) +} + +#[cfg(target_os = "linux")] +fn camera_index(selector: &CaptureDeviceSelector) -> Result { + match selector { + CaptureDeviceSelector::Default => Ok(CameraIndex::Index(0)), + CaptureDeviceSelector::Index(index) => Ok(CameraIndex::Index( + u32::try_from(*index).map_err(|_| V4lError::OptionOutOfRange("device index"))?, + )), + CaptureDeviceSelector::Id(id) => Ok(CameraIndex::String(id.clone())), + } +} + +#[cfg(target_os = "linux")] +fn frame_formats_for_request(options: &V4lCaptureOptions) -> Result, V4lError> { + let mut formats = match &options.format { + CaptureFormatRequest::Exact(format) | CaptureFormatRequest::Closest(format) => { + ordered_formats_with_first(&options.pixel_formats, format.pixel_format) + } + CaptureFormatRequest::HighestFrameRate { pixel_format: Some(pixel_format), .. } + | CaptureFormatRequest::HighestResolution { pixel_format: Some(pixel_format), .. } => { + vec![*pixel_format] + } + CaptureFormatRequest::Default + | CaptureFormatRequest::HighestFrameRate { pixel_format: None, .. } + | CaptureFormatRequest::HighestResolution { pixel_format: None, .. 
} => { + options.pixel_formats.clone() + } + }; + formats.dedup(); + formats + .into_iter() + .map(|format| nokhwa_frame_format(format).ok_or(V4lError::UnsupportedPixelFormat(format))) + .collect() +} + +#[cfg(target_os = "linux")] +fn ordered_formats_with_first( + pixel_formats: &[CapturePixelFormat], + first: CapturePixelFormat, +) -> Vec { + std::iter::once(first) + .chain(pixel_formats.iter().copied().filter(|format| *format != first)) + .collect() +} + +#[cfg(target_os = "linux")] +fn requested_format<'a>( + request: &CaptureFormatRequest, + frame_formats: &'a [FrameFormat], + override_format: Option, +) -> Result, V4lError> { + let request_type = match request { + CaptureFormatRequest::Default => RequestedFormatType::None, + CaptureFormatRequest::Exact(format) => { + RequestedFormatType::Exact(nokhwa_camera_format(*format, override_format)?) + } + CaptureFormatRequest::Closest(format) => { + RequestedFormatType::Closest(nokhwa_camera_format(*format, override_format)?) + } + CaptureFormatRequest::HighestFrameRate { resolution: Some(resolution), .. } => { + RequestedFormatType::HighestResolution(nokhwa_resolution(*resolution)) + } + CaptureFormatRequest::HighestFrameRate { resolution: None, .. } => { + RequestedFormatType::AbsoluteHighestFrameRate + } + CaptureFormatRequest::HighestResolution { frame_rate: Some(frame_rate), .. } => { + RequestedFormatType::HighestFrameRate(*frame_rate) + } + CaptureFormatRequest::HighestResolution { frame_rate: None, .. 
} => { + RequestedFormatType::AbsoluteHighestResolution + } + }; + Ok(RequestedFormat::with_formats(request_type, frame_formats)) +} + +#[cfg(target_os = "linux")] +fn apply_format_request( + camera: &mut Camera, + options: &V4lCaptureOptions, + frame_formats: &[FrameFormat], +) -> Result<(), V4lError> { + match options.format { + CaptureFormatRequest::Default => Ok(()), + CaptureFormatRequest::Exact(_) | CaptureFormatRequest::Closest(_) => { + apply_ordered_format_request(camera, options, frame_formats) + } + CaptureFormatRequest::HighestFrameRate { .. } + | CaptureFormatRequest::HighestResolution { .. } => { + let selected = select_highest_format( + &options.format, + frame_formats, + &camera.compatible_camera_formats().map_err(nokhwa_error)?, + )?; + camera + .set_camera_requset(RequestedFormat::with_formats( + RequestedFormatType::Exact(selected), + &[selected.format()], + )) + .map(|_| ()) + .map_err(nokhwa_error) + } + } +} + +#[cfg(target_os = "linux")] +fn apply_ordered_format_request( + camera: &mut Camera, + options: &V4lCaptureOptions, + frame_formats: &[FrameFormat], +) -> Result<(), V4lError> { + let mut last_error = None; + for frame_format in frame_formats { + let requested = requested_format(&options.format, frame_formats, Some(*frame_format))?; + match camera.set_camera_requset(requested) { + Ok(_) => return Ok(()), + Err(error) => last_error = Some(error), + } + } + + Err(last_error + .map(nokhwa_error) + .unwrap_or(V4lError::InvalidOption("no V4L pixel formats were requested"))) +} + +#[cfg(target_os = "linux")] +fn select_highest_format( + request: &CaptureFormatRequest, + frame_formats: &[FrameFormat], + all_formats: &[CameraFormat], +) -> Result { + let candidates = all_formats + .iter() + .copied() + .filter(|format| frame_formats.contains(&format.format())) + .filter(|format| match request { + CaptureFormatRequest::HighestFrameRate { resolution, .. 
} => resolution + .map(|resolution| format.resolution() == nokhwa_resolution(resolution)) + .unwrap_or(true), + CaptureFormatRequest::HighestResolution { frame_rate, .. } => { + frame_rate.map(|frame_rate| format.frame_rate() == frame_rate).unwrap_or(true) + } + CaptureFormatRequest::Default + | CaptureFormatRequest::Exact(_) + | CaptureFormatRequest::Closest(_) => false, + }); + + let selected = match request { + CaptureFormatRequest::HighestFrameRate { .. } => candidates.max_by(|a, b| { + a.frame_rate() + .cmp(&b.frame_rate()) + .then_with(|| a.resolution().cmp(&b.resolution())) + .then_with(|| compare_format_preference(a.format(), b.format(), frame_formats)) + }), + CaptureFormatRequest::HighestResolution { .. } => candidates.max_by(|a, b| { + a.resolution() + .cmp(&b.resolution()) + .then_with(|| a.frame_rate().cmp(&b.frame_rate())) + .then_with(|| compare_format_preference(a.format(), b.format(), frame_formats)) + }), + CaptureFormatRequest::Default + | CaptureFormatRequest::Exact(_) + | CaptureFormatRequest::Closest(_) => None, + }; + + selected.ok_or_else(|| V4lError::Camera("CameraFormat: Failed to Fufill".to_string())) +} + +#[cfg(target_os = "linux")] +fn compare_format_preference( + left: FrameFormat, + right: FrameFormat, + frame_formats: &[FrameFormat], +) -> std::cmp::Ordering { + let left_index = frame_formats.iter().position(|format| *format == left).unwrap_or(usize::MAX); + let right_index = + frame_formats.iter().position(|format| *format == right).unwrap_or(usize::MAX); + right_index.cmp(&left_index) +} + +#[cfg(target_os = "linux")] +fn nokhwa_camera_format( + format: CaptureFormat, + override_format: Option, +) -> Result { + let frame_format = match override_format { + Some(format) => format, + None => nokhwa_frame_format(format.pixel_format) + .ok_or(V4lError::UnsupportedPixelFormat(format.pixel_format))?, + }; + Ok(CameraFormat::new(nokhwa_resolution(format.resolution), frame_format, format.frame_rate)) +} + +#[cfg(target_os = "linux")] +fn 
nokhwa_resolution(resolution: CaptureResolution) -> Resolution { + Resolution::new(resolution.width, resolution.height) +} + +#[cfg(target_os = "linux")] +fn nokhwa_frame_format(pixel_format: CapturePixelFormat) -> Option { + match pixel_format { + CapturePixelFormat::Nv12 => Some(FrameFormat::NV12), + CapturePixelFormat::Rgb24 => Some(FrameFormat::RAWRGB), + CapturePixelFormat::Bgr24 => Some(FrameFormat::RAWBGR), + CapturePixelFormat::Yuyv => Some(FrameFormat::YUYV), + CapturePixelFormat::Gray => Some(FrameFormat::GRAY), + CapturePixelFormat::Mjpeg => Some(FrameFormat::MJPEG), + CapturePixelFormat::I420 | CapturePixelFormat::Bgra => None, + } +} + +#[cfg(not(target_os = "linux"))] +fn nokhwa_frame_format(pixel_format: CapturePixelFormat) -> Option<()> { + match pixel_format { + CapturePixelFormat::Nv12 + | CapturePixelFormat::Rgb24 + | CapturePixelFormat::Bgr24 + | CapturePixelFormat::Yuyv + | CapturePixelFormat::Gray + | CapturePixelFormat::Mjpeg => Some(()), + CapturePixelFormat::I420 | CapturePixelFormat::Bgra => None, + } +} + +#[cfg(target_os = "linux")] +fn capture_format_from_nokhwa(format: CameraFormat) -> Result { + Ok(CaptureFormat::new( + CaptureResolution::new(format.width(), format.height()), + format.frame_rate(), + capture_pixel_format_from_nokhwa(format.format())?, + )) +} + +#[cfg(target_os = "linux")] +fn capture_pixel_format_from_nokhwa(format: FrameFormat) -> Result { + match format { + FrameFormat::MJPEG => Ok(CapturePixelFormat::Mjpeg), + FrameFormat::YUYV => Ok(CapturePixelFormat::Yuyv), + FrameFormat::NV12 => Ok(CapturePixelFormat::Nv12), + FrameFormat::GRAY => Ok(CapturePixelFormat::Gray), + FrameFormat::RAWRGB => Ok(CapturePixelFormat::Rgb24), + FrameFormat::RAWBGR => Ok(CapturePixelFormat::Bgr24), + } +} + +#[cfg(target_os = "linux")] +fn enumerate_formats(info: &nokhwa::utils::CameraInfo) -> Result, V4lError> { + let requested = RequestedFormat::new::(RequestedFormatType::None); + let mut camera = 
Camera::with_backend(info.index().clone(), requested, ApiBackend::Video4Linux) + .map_err(nokhwa_error)?; + + Ok(camera + .compatible_camera_formats() + .map_err(nokhwa_error)? + .into_iter() + .filter_map(|format| capture_format_from_nokhwa(format).ok()) + .collect()) +} + +#[cfg(target_os = "linux")] +fn convert_to_i420( + source_format: FrameFormat, + source: &[u8], + width: u32, + height: u32, + destination: &mut I420Buffer, +) -> Result { + let (stride_y, stride_u, stride_v) = destination.strides(); + let (dst_y, dst_u, dst_v) = destination.data_mut(); + let width_i32 = i32_from_u32(width, "width")?; + let height_i32 = i32_from_u32(height, "height")?; + + let ret = match source_format { + FrameFormat::YUYV => { + validate_len(source, width as usize * height as usize * 2, "YUYV frame")?; + unsafe { + // SAFETY: Source and destination slices are valid for the dimensions and strides. + yuv_sys::rs_YUY2ToI420( + source.as_ptr(), + width_i32 * 2, + dst_y.as_mut_ptr(), + stride_y as i32, + dst_u.as_mut_ptr(), + stride_u as i32, + dst_v.as_mut_ptr(), + stride_v as i32, + width_i32, + height_i32, + ) + } + } + FrameFormat::RAWRGB => { + validate_len(source, width as usize * height as usize * 3, "RGB24 frame")?; + unsafe { + // SAFETY: Source and destination slices are valid for the dimensions and strides. + yuv_sys::rs_RGB24ToI420( + source.as_ptr(), + width_i32 * 3, + dst_y.as_mut_ptr(), + stride_y as i32, + dst_u.as_mut_ptr(), + stride_u as i32, + dst_v.as_mut_ptr(), + stride_v as i32, + width_i32, + height_i32, + ) + } + } + FrameFormat::RAWBGR => { + validate_len(source, width as usize * height as usize * 3, "BGR24 frame")?; + unsafe { + // SAFETY: Source and destination slices are valid for the dimensions and strides. 
+ yuv_sys::rs_RAWToI420( + source.as_ptr(), + width_i32 * 3, + dst_y.as_mut_ptr(), + stride_y as i32, + dst_u.as_mut_ptr(), + stride_u as i32, + dst_v.as_mut_ptr(), + stride_v as i32, + width_i32, + height_i32, + ) + } + } + FrameFormat::GRAY => { + validate_len(source, width as usize * height as usize, "GRAY frame")?; + unsafe { + // SAFETY: Source and destination slices are valid for the dimensions and strides. + yuv_sys::rs_I400ToI420( + source.as_ptr(), + width_i32, + dst_y.as_mut_ptr(), + stride_y as i32, + dst_u.as_mut_ptr(), + stride_u as i32, + dst_v.as_mut_ptr(), + stride_v as i32, + width_i32, + height_i32, + ) + } + } + FrameFormat::NV12 => { + let y_size = width as usize * height as usize; + validate_len(source, y_size + y_size / 2, "NV12 frame")?; + unsafe { + // SAFETY: Source and destination slices are valid for the dimensions and strides. + yuv_sys::rs_NV12ToI420( + source.as_ptr(), + width_i32, + source[y_size..].as_ptr(), + width_i32, + dst_y.as_mut_ptr(), + stride_y as i32, + dst_u.as_mut_ptr(), + stride_u as i32, + dst_v.as_mut_ptr(), + stride_v as i32, + width_i32, + height_i32, + ) + } + } + FrameFormat::MJPEG => { + return convert_mjpeg_to_i420(source, width, height, destination).map(|()| true); + } + }; + + if ret == 0 { + Ok(false) + } else { + Err(V4lError::Convert("libyuv conversion failed")) + } +} + +#[cfg(target_os = "linux")] +fn convert_mjpeg_to_i420( + source: &[u8], + width: u32, + height: u32, + destination: &mut I420Buffer, +) -> Result<(), V4lError> { + let (stride_y, stride_u, stride_v) = destination.strides(); + let (dst_y, dst_u, dst_v) = destination.data_mut(); + let width_i32 = i32_from_u32(width, "width")?; + let height_i32 = i32_from_u32(height, "height")?; + + let ret = unsafe { + // SAFETY: Source and destination slices are valid for the dimensions and strides. 
+ yuv_sys::rs_MJPGToI420( + source.as_ptr(), + source.len(), + dst_y.as_mut_ptr(), + stride_y as i32, + dst_u.as_mut_ptr(), + stride_u as i32, + dst_v.as_mut_ptr(), + stride_v as i32, + width_i32, + height_i32, + width_i32, + height_i32, + ) + }; + if ret == 0 { + return Ok(()); + } + + let rgb = image::load_from_memory(source) + .map_err(|error| V4lError::Decode(error.to_string()))? + .to_rgb8(); + if rgb.width() != width || rgb.height() != height { + return Err(V4lError::InvalidFrame("decoded MJPEG dimensions changed")); + } + let ret = unsafe { + // SAFETY: Source and destination slices are valid for the dimensions and strides. + yuv_sys::rs_RGB24ToI420( + rgb.as_raw().as_ptr(), + width_i32 * 3, + dst_y.as_mut_ptr(), + stride_y as i32, + dst_u.as_mut_ptr(), + stride_u as i32, + dst_v.as_mut_ptr(), + stride_v as i32, + width_i32, + height_i32, + ) + }; + if ret == 0 { + Ok(()) + } else { + Err(V4lError::Convert("RGB24 fallback conversion failed")) + } +} + +#[cfg(target_os = "linux")] +fn validate_len(source: &[u8], expected: usize, label: &'static str) -> Result<(), V4lError> { + if source.len() < expected { + return Err(V4lError::InvalidFrame(label)); + } + Ok(()) +} + +#[cfg(any(target_os = "linux", test))] +fn select_capture_wall_time_us( + backend_capture_timestamp: Option, + fallback_wall_time_us: u64, + read_wall_time_us: u64, +) -> u64 { + backend_capture_timestamp + .and_then(|timestamp| validate_backend_capture_timestamp_us(timestamp, read_wall_time_us)) + .unwrap_or(fallback_wall_time_us) +} + +#[cfg(any(target_os = "linux", test))] +fn validate_backend_capture_timestamp_us( + capture_timestamp: Duration, + read_wall_time_us: u64, +) -> Option { + let capture_timestamp_us = u64::try_from(capture_timestamp.as_micros()).ok()?; + if capture_timestamp_us == 0 || capture_timestamp_us > read_wall_time_us { + return None; + } + if read_wall_time_us - capture_timestamp_us > MAX_BACKEND_CAPTURE_TIMESTAMP_AGE_US { + return None; + } + Some(capture_timestamp_us) 
+} + +#[cfg(target_os = "linux")] +fn unix_time_us_now() -> Option { + let elapsed = SystemTime::now().duration_since(UNIX_EPOCH).ok()?; + u64::try_from(elapsed.as_micros()).ok() +} + +#[cfg(target_os = "linux")] +fn elapsed_us(duration: Duration) -> i64 { + i64::try_from(duration.as_micros()).unwrap_or(i64::MAX) +} + +#[cfg(target_os = "linux")] +fn i32_from_u32(value: u32, field: &'static str) -> Result { + i32::try_from(value).map_err(|_| V4lError::OptionOutOfRange(field)) +} + +#[cfg(target_os = "linux")] +fn nokhwa_error(error: nokhwa::NokhwaError) -> V4lError { + V4lError::Camera(error.to_string()) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn rejects_empty_pixel_format_preferences() { + let mut options = V4lCaptureOptions::default(); + options.pixel_formats.clear(); + let err = V4lCaptureSession::new(options).expect_err("empty formats must be rejected"); + assert!(matches!(err, V4lError::InvalidOption(_))); + } + + #[test] + fn rejects_unsupported_i420_source_format() { + let mut options = V4lCaptureOptions::default(); + options.pixel_formats = vec![CapturePixelFormat::I420]; + let err = V4lCaptureSession::new(options).expect_err("I420 source must be rejected"); + assert!(matches!(err, V4lError::UnsupportedPixelFormat(CapturePixelFormat::I420))); + } + + #[test] + fn rejects_zero_frame_rate() { + let options = V4lCaptureOptions::new( + CaptureDeviceSelector::Default, + CaptureResolution::new(640, 480), + 0, + ); + let err = V4lCaptureSession::new(options).expect_err("zero fps must be rejected"); + assert!(matches!(err, V4lError::InvalidOption(_))); + } + + #[test] + fn ignores_stream_relative_capture_timestamp() { + let selected = + select_capture_wall_time_us(Some(Duration::from_micros(10)), 10_000_000, 10_000_000); + assert_eq!(selected, 10_000_000); + } +} From a4c78d498d0ca599f684c6f6970396feda190277 Mon Sep 17 00:00:00 2001 From: David Chen Date: Tue, 23 Jun 2026 14:39:51 -0700 Subject: [PATCH 05/24] avfoundation implementation --- 
.changeset/livekit-capture-preencoded.md | 2 +- Cargo.lock | 244 ++--- examples/local_video/Cargo.toml | 13 +- examples/local_video/src/list_devices.rs | 128 +-- examples/local_video/src/publisher.rs | 539 ++++------- livekit-capture/Cargo.toml | 8 +- livekit-capture/src/platform/avfoundation.rs | 892 ++++++++++++++++++- livekit-capture/src/sources/v4l.rs | 13 +- 8 files changed, 1184 insertions(+), 655 deletions(-) diff --git a/.changeset/livekit-capture-preencoded.md b/.changeset/livekit-capture-preencoded.md index d0b7cb263..79b8409d6 100644 --- a/.changeset/livekit-capture-preencoded.md +++ b/.changeset/livekit-capture-preencoded.md @@ -5,4 +5,4 @@ "webrtc-sys": patch --- -Add a `livekit-capture` crate with codec-neutral capture types, H264/H265 passthrough support, common encoded ingress helpers, and feature-gated source/platform scaffolding for TCP, RTSP, GStreamer appsink, AVFoundation, Linux V4L, and Jetson libargus capture. +Add a `livekit-capture` crate with codec-neutral capture types, H264/H265 passthrough support, common encoded ingress helpers, macOS AVFoundation decoded-frame capture, Linux V4L capture, and Jetson libargus capture hooks. The `local_video` examples now open platform camera capture through `livekit-capture` instead of depending on Nokhwa directly. 
diff --git a/Cargo.lock b/Cargo.lock index 0439042e2..34b434b0b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -61,7 +61,7 @@ version = "0.8.12" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5a15f179cd60c4584b8a8c596927aadc462e27f2ca70c04e0071964a73ba7a75" dependencies = [ - "cfg-if 1.0.4", + "cfg-if", "getrandom 0.3.4", "once_cell", "serde", @@ -110,7 +110,7 @@ checksum = "ed7572b7ba83a31e20d1b48970ee402d2e3e0537dcfe0a3ff4d6eb7508617d43" dependencies = [ "alsa-sys", "bitflags 2.11.0", - "cfg-if 1.0.4", + "cfg-if", "libc", ] @@ -434,7 +434,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "456b8a8feb6f42d237746d4b3e9a178494627745c3c56c6ea55d92ba50d026fc" dependencies = [ "autocfg", - "cfg-if 1.0.4", + "cfg-if", "concurrent-queue", "futures-io", "futures-lite 2.6.1", @@ -636,7 +636,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bb531853791a215d7c62a30daf0dde835f381ab5de4589cfe7c649d2cbe92bd6" dependencies = [ "addr2line", - "cfg-if 1.0.4", + "cfg-if", "libc", "miniz_oxide", "object", @@ -1047,12 +1047,6 @@ dependencies = [ "target-lexicon", ] -[[package]] -name = "cfg-if" -version = "0.1.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4785bdd1c96b2a846b2bd7cc02e86b6b3dbf14e7e53446c4f54c92a361040822" - [[package]] name = "cfg-if" version = "1.0.4" @@ -1139,34 +1133,6 @@ dependencies = [ "error-code", ] -[[package]] -name = "cocoa" -version = "0.20.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0c49e86fc36d5704151f5996b7b3795385f50ce09e3be0f47a0cfde869681cf8" -dependencies = [ - "bitflags 1.3.2", - "block", - "core-foundation 0.7.0", - "core-graphics 0.19.2", - "foreign-types 0.3.2", - "libc", - "objc", -] - -[[package]] -name = "cocoa-foundation" -version = "0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"81411967c50ee9a1fc11365f8c585f863a22a9697c89239c452292c40ba79b0d" -dependencies = [ - "bitflags 2.11.0", - "block", - "core-foundation 0.10.1", - "core-graphics-types 0.2.0", - "objc", -] - [[package]] name = "codespan-reporting" version = "0.12.0" @@ -1275,23 +1241,13 @@ dependencies = [ "unicode-segmentation", ] -[[package]] -name = "core-foundation" -version = "0.7.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "57d24c7a13c43e870e37c1556b74555437870a04514f7685f5b354e090567171" -dependencies = [ - "core-foundation-sys 0.7.0", - "libc", -] - [[package]] name = "core-foundation" version = "0.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "91e195e091a93c46f7102ec7818a2aa394e1e1771c3ab4825963fa03e45afb8f" dependencies = [ - "core-foundation-sys 0.8.7", + "core-foundation-sys", "libc", ] @@ -1301,34 +1257,16 @@ version = "0.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b2a6cd9ae233e7f62ba4e9353e81a88df7fc8a5987b8d445b4d90c879bd156f6" dependencies = [ - "core-foundation-sys 0.8.7", + "core-foundation-sys", "libc", ] -[[package]] -name = "core-foundation-sys" -version = "0.7.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b3a71ab494c0b5b860bdc8407ae08978052417070c2ced38573a9157ad75b8ac" - [[package]] name = "core-foundation-sys" version = "0.8.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b" -[[package]] -name = "core-graphics" -version = "0.19.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b3889374e6ea6ab25dba90bb5d96202f61108058361f6dc72e8b03e6f8bbe923" -dependencies = [ - "bitflags 1.3.2", - "core-foundation 0.7.0", - "foreign-types 0.3.2", - "libc", -] - [[package]] name = "core-graphics" version = "0.23.2" @@ -1364,31 +1302,6 @@ dependencies = [ "libc", ] -[[package]] -name = "core-media-sys" 
-version = "0.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "273bf3fc5bf51fd06a7766a84788c1540b6527130a0bce39e00567d6ab9f31f1" -dependencies = [ - "cfg-if 0.1.10", - "core-foundation-sys 0.7.0", - "libc", -] - -[[package]] -name = "core-video-sys" -version = "0.1.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "34ecad23610ad9757664d644e369246edde1803fcb43ed72876565098a5d3828" -dependencies = [ - "cfg-if 0.1.10", - "core-foundation-sys 0.7.0", - "core-graphics 0.19.2", - "libc", - "metal 0.18.0", - "objc", -] - [[package]] name = "core2" version = "0.4.0" @@ -1405,7 +1318,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "321077172d79c662f64f5071a03120748d5bb652f5231570141be24cfcd2bace" dependencies = [ "bitflags 1.3.2", - "core-foundation-sys 0.8.7", + "core-foundation-sys", "coreaudio-sys", ] @@ -1425,7 +1338,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "873dab07c8f743075e57f524c583985fbaf745602acbe916a01539364369a779" dependencies = [ "alsa", - "core-foundation-sys 0.8.7", + "core-foundation-sys", "coreaudio-rs", "dasp_sample", "jni 0.21.1", @@ -1456,7 +1369,7 @@ version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9481c1c90cbf2ac953f07c8d4a58aa3945c425b7185c9154d67a65e4230da511" dependencies = [ - "cfg-if 1.0.4", + "cfg-if", ] [[package]] @@ -1719,7 +1632,7 @@ version = "6.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e6361d5c062261c78a176addb82d4c821ae42bed6089de0e12603cd25de2059c" dependencies = [ - "cfg-if 1.0.4", + "cfg-if", "crossbeam-utils", "hashbrown 0.14.5", "lock_api", @@ -1819,6 +1732,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1e0e367e4e7da84520dedcac1901e4da967309406d1e51017ae1abfb97adbd38" dependencies = [ "bitflags 2.11.0", + "block2 0.6.2", "objc2 0.6.4", ] @@ -2033,7 +1947,7 @@ version = "0.8.35" 
source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "75030f3c4f45dafd7586dd6780965a8c7e8e285a5ecb86713e63a79c5b2766f3" dependencies = [ - "cfg-if 1.0.4", + "cfg-if", ] [[package]] @@ -2263,7 +2177,7 @@ version = "0.2.27" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f98844151eee8917efc50bd9e8318cb963ae8b297431495d3f758616ea5c57db" dependencies = [ - "cfg-if 1.0.4", + "cfg-if", "libc", "libredox", ] @@ -2559,7 +2473,7 @@ version = "0.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ff2abc00be7fca6ebc474524697ae276ad847ad0a6b3faa4bcb027e9a4614ad0" dependencies = [ - "cfg-if 1.0.4", + "cfg-if", "js-sys", "libc", "wasi", @@ -2572,7 +2486,7 @@ version = "0.3.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "899def5c37c4fd7b2664648c28120ecec138e4d395b459e5ca34f9cce2dd77fd" dependencies = [ - "cfg-if 1.0.4", + "cfg-if", "js-sys", "libc", "r-efi 5.3.0", @@ -2586,7 +2500,7 @@ version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0de51e6874e94e7bf76d726fc5d13ba782deca734ff60d5bb2fb2607c7406555" dependencies = [ - "cfg-if 1.0.4", + "cfg-if", "libc", "r-efi 6.0.0", "wasip2", @@ -2828,7 +2742,7 @@ version = "2.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6ea2d84b969582b4b1864a92dc5d27cd2b77b622a8d79306834f1be5ba20d84b" dependencies = [ - "cfg-if 1.0.4", + "cfg-if", "crunchy", "num-traits", "zerocopy", @@ -3124,7 +3038,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e31bc9ad994ba00e440a8aa5c9ef0ec67d5cb5e5cb0cc7f8b744a35b389cc470" dependencies = [ "android_system_properties", - "core-foundation-sys 0.8.7", + "core-foundation-sys", "iana-time-zone-haiku", "js-sys", "log", @@ -3330,7 +3244,7 @@ version = "0.1.13" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e0242819d153cba4b4b05a5a8f2a7e9bbf97b6055b2a002b395c96b5ff3c0222" 
dependencies = [ - "cfg-if 1.0.4", + "cfg-if", ] [[package]] @@ -3477,7 +3391,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1a87aa2bb7d2af34197c04845522473242e1aa17c12f4935d5856491a7fb8c97" dependencies = [ "cesu8", - "cfg-if 1.0.4", + "cfg-if", "combine", "jni-sys 0.3.1", "log", @@ -3492,7 +3406,7 @@ version = "0.22.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5efd9a482cf3a427f00d6b35f14332adc7902ce91efb778580e180ff90fa3498" dependencies = [ - "cfg-if 1.0.4", + "cfg-if", "combine", "jni-macros", "jni-sys 0.4.1", @@ -3660,7 +3574,7 @@ version = "0.8.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d7c4b02199fee7c5d21a5ae7d8cfa79a6ef5bb2fc834d6e9058e89c825efdc55" dependencies = [ - "cfg-if 1.0.4", + "cfg-if", "windows-link 0.2.1", ] @@ -3670,7 +3584,7 @@ version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "754ca22de805bb5744484a5b151a9e1a8e837d5dc232c2d7d8c2e3492edc8b60" dependencies = [ - "cfg-if 1.0.4", + "cfg-if", "windows-link 0.2.1", ] @@ -3886,12 +3800,10 @@ dependencies = [ name = "livekit-capture" version = "0.1.0" dependencies = [ - "block2 0.6.2", "bytes", "cc", "dispatch2", "image", - "imgproc", "livekit", "nokhwa", "objc2 0.6.4", @@ -4029,19 +3941,16 @@ dependencies = [ "egui-wgpu", "env_logger 0.11.10", "futures", - "image", "livekit", "livekit-api", + "livekit-capture", "log", "metal 0.32.0", - "nokhwa", - "objc2 0.6.4", "parking_lot", "tokio", "tokio-stream", "wgpu 28.0.0", "winit", - "yuv-sys", ] [[package]] @@ -4132,7 +4041,7 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8ea1f30cedd69f0a2954655f7188c6a834246d2bcf1e315e2ac40c4b24dc9519" dependencies = [ - "cfg-if 1.0.4", + "cfg-if", "rayon", ] @@ -4151,21 +4060,6 @@ dependencies = [ "libc", ] -[[package]] -name = "metal" -version = "0.18.0" -source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "e198a0ee42bdbe9ef2c09d0b9426f3b2b47d90d93a4a9b0395c4cea605e92dc0" -dependencies = [ - "bitflags 1.3.2", - "block", - "cocoa", - "core-graphics 0.19.2", - "foreign-types 0.3.2", - "log", - "objc", -] - [[package]] name = "metal" version = "0.32.0" @@ -4269,7 +4163,7 @@ dependencies = [ "arrayvec", "bit-set 0.8.0", "bitflags 2.11.0", - "cfg-if 1.0.4", + "cfg-if", "cfg_aliases", "codespan-reporting 0.12.0", "half", @@ -4295,7 +4189,7 @@ dependencies = [ "arrayvec", "bit-set 0.8.0", "bitflags 2.11.0", - "cfg-if 1.0.4", + "cfg-if", "cfg_aliases", "codespan-reporting 0.12.0", "half", @@ -4492,7 +4386,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "74523f3a35e05aba87a1d978330aef40f67b0304ac79c1c00b294c9830543db6" dependencies = [ "bitflags 2.11.0", - "cfg-if 1.0.4", + "cfg-if", "cfg_aliases", "libc", ] @@ -4511,10 +4405,7 @@ dependencies = [ "flume", "image", "nokhwa-bindings-linux", - "nokhwa-bindings-macos", - "nokhwa-bindings-windows", "nokhwa-core", - "parking_lot", "paste", "thiserror 2.0.18", ] @@ -4529,32 +4420,6 @@ dependencies = [ "v4l", ] -[[package]] -name = "nokhwa-bindings-macos" -version = "0.2.3" -source = "git+https://github.com/l1npengtul/nokhwa?rev=4923ecab7cf26f9dba83867a15a9d8662d021296#4923ecab7cf26f9dba83867a15a9d8662d021296" -dependencies = [ - "block", - "cocoa-foundation", - "core-foundation 0.10.1", - "core-media-sys", - "core-video-sys", - "flume", - "nokhwa-core", - "objc", - "once_cell", -] - -[[package]] -name = "nokhwa-bindings-windows" -version = "0.4.5" -source = "git+https://github.com/l1npengtul/nokhwa?rev=4923ecab7cf26f9dba83867a15a9d8662d021296#4923ecab7cf26f9dba83867a15a9d8662d021296" -dependencies = [ - "nokhwa-core", - "once_cell", - "windows 0.62.2", -] - [[package]] name = "nokhwa-core" version = "0.1.8" @@ -4703,7 +4568,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "915b1b472bc21c53464d6c8461c9d3af805ba1ef837e1cac254428f4a77177b1" dependencies = [ 
"malloc_buf", - "objc_exception", ] [[package]] @@ -4766,6 +4630,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "478ae33fcac9df0a18db8302387c666b8ef08a3e2d62b510ca4fc278a384b6c0" dependencies = [ "bitflags 2.11.0", + "dispatch2", "objc2 0.6.4", "objc2-core-media", "objc2-foundation 0.3.2", @@ -5136,15 +5001,6 @@ dependencies = [ "objc2-foundation 0.3.2", ] -[[package]] -name = "objc_exception" -version = "0.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ad970fb455818ad6cba4c122ad012fae53ae8b4795f86378bce65e4f6bab2ca4" -dependencies = [ - "cc", -] - [[package]] name = "object" version = "0.37.3" @@ -5196,7 +5052,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a45fa2aa886c42762255da344f0a0d313e254066c46aad76f300c3d3da62d967" dependencies = [ "bitflags 2.11.0", - "cfg-if 1.0.4", + "cfg-if", "foreign-types 0.3.2", "libc", "openssl-macros", @@ -5344,7 +5200,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2621685985a2ebf1c516881c026032ac7deafcda1a2c9b7850dc81e3dfcb64c1" dependencies = [ "backtrace", - "cfg-if 1.0.4", + "cfg-if", "libc", "petgraph 0.6.5", "redox_syscall 0.5.18", @@ -5564,7 +5420,7 @@ checksum = "4b2d323e8ca7996b3e23126511a523f7e62924d93ecd5ae73b333815b0eb3dce" dependencies = [ "autocfg", "bitflags 1.3.2", - "cfg-if 1.0.4", + "cfg-if", "concurrent-queue", "libc", "log", @@ -5578,7 +5434,7 @@ version = "3.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5d0e4f59085d47d8241c88ead0f274e8a0cb551f3625263c05eb8dd897c34218" dependencies = [ - "cfg-if 1.0.4", + "cfg-if", "concurrent-queue", "hermit-abi", "pin-project-lite", @@ -6024,7 +5880,7 @@ dependencies = [ "av1-grain", "bitstream-io", "built", - "cfg-if 1.0.4", + "cfg-if", "interpolate_name", "itertools 0.14.0", "libc", @@ -6217,7 +6073,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"a4689e6c2294d81e88dc6261c768b63bc4fcdb852be6d1352498b114f61383b7" dependencies = [ "cc", - "cfg-if 1.0.4", + "cfg-if", "getrandom 0.2.17", "libc", "untrusted", @@ -6530,7 +6386,7 @@ checksum = "b7f4bc775c73d9a02cde8bf7b2ec4c9d12743edf609006c7facc23998404cd1d" dependencies = [ "bitflags 2.11.0", "core-foundation 0.10.1", - "core-foundation-sys 0.8.7", + "core-foundation-sys", "libc", "security-framework-sys", ] @@ -6541,7 +6397,7 @@ version = "2.17.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6ce2691df843ecc5d231c0b14ece2acc3efb62c0a398c7e1d875f3983ce020e3" dependencies = [ - "core-foundation-sys 0.8.7", + "core-foundation-sys", "libc", ] @@ -6664,7 +6520,7 @@ version = "0.10.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e3bf829a2d51ab4a5ddf1352d8470c140cadc8301b2ae1789db023f01cedd6ba" dependencies = [ - "cfg-if 1.0.4", + "cfg-if", "cpufeatures", "digest", ] @@ -6675,7 +6531,7 @@ version = "0.10.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a7507d819769d01a365ab707794a4084392c824f54a7a6a7862f8c3d0892b283" dependencies = [ - "cfg-if 1.0.4", + "cfg-if", "cpufeatures", "digest", ] @@ -6937,7 +6793,7 @@ version = "0.15.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "07f9fdfdd31a0ff38b59deb401be81b73913d76c9cc5b1aed4e1330a223420b9" dependencies = [ - "cfg-if 1.0.4", + "cfg-if", "hashbrown 0.14.5", "serde", ] @@ -7075,7 +6931,7 @@ version = "3.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "adcb7fd841cd518e279be3d5a3eb0636409487998a4aff22f3de87b81e88384f" dependencies = [ - "cfg-if 1.0.4", + "cfg-if", "proc-macro2", "quote", "syn 2.0.117", @@ -7170,7 +7026,7 @@ version = "1.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f60246a4944f24f6e018aa17cdeffb7818b76356965d03b07d6a9886e8962185" dependencies = [ - "cfg-if 1.0.4", + "cfg-if", ] [[package]] @@ -7227,7 +7083,7 @@ 
dependencies = [ "arrayref", "arrayvec", "bytemuck", - "cfg-if 1.0.4", + "cfg-if", "log", "tiny-skia-path", ] @@ -8139,7 +7995,7 @@ version = "0.2.114" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6532f9a5c1ece3798cb1c2cfdba640b9b3ba884f5db45973a6f442510a87d38e" dependencies = [ - "cfg-if 1.0.4", + "cfg-if", "once_cell", "rustversion", "wasm-bindgen-macro", @@ -8152,7 +8008,7 @@ version = "0.4.64" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e9c5522b3a28661442748e09d40924dfb9ca614b21c00d3fd135720e48b67db8" dependencies = [ - "cfg-if 1.0.4", + "cfg-if", "futures-util", "js-sys", "once_cell", @@ -8474,7 +8330,7 @@ checksum = "bfe68bac7cde125de7a731c3400723cadaaf1703795ad3f4805f187459cd7a77" dependencies = [ "arrayvec", "bitflags 2.11.0", - "cfg-if 1.0.4", + "cfg-if", "cfg_aliases", "document-features", "hashbrown 0.16.1", @@ -8504,7 +8360,7 @@ dependencies = [ "arrayvec", "bitflags 2.11.0", "bytemuck", - "cfg-if 1.0.4", + "cfg-if", "cfg_aliases", "document-features", "hashbrown 0.16.1", @@ -8656,7 +8512,7 @@ dependencies = [ "bitflags 2.11.0", "block", "bytemuck", - "cfg-if 1.0.4", + "cfg-if", "cfg_aliases", "core-graphics-types 0.2.0", "glow", @@ -8705,7 +8561,7 @@ dependencies = [ "bitflags 2.11.0", "block", "bytemuck", - "cfg-if 1.0.4", + "cfg-if", "cfg_aliases", "core-graphics-types 0.2.0", "glow", @@ -9352,7 +9208,7 @@ dependencies = [ "cfg_aliases", "concurrent-queue", "core-foundation 0.9.4", - "core-graphics 0.23.2", + "core-graphics", "cursor-icon", "dpi", "js-sys", diff --git a/examples/local_video/Cargo.toml b/examples/local_video/Cargo.toml index acc58a5c1..4daefd61a 100644 --- a/examples/local_video/Cargo.toml +++ b/examples/local_video/Cargo.toml @@ -32,7 +32,7 @@ tokio = { workspace = true, features = ["full", "parking_lot"] } tokio-stream = { workspace = true } livekit = { workspace = true, features = ["rustls-tls-native-roots"] } livekit-api = { workspace = true } -yuv-sys = { workspace 
= true, features = ["jpeg"] } +livekit-capture = { workspace = true } futures = { workspace = true } clap = { workspace = true, features = ["derive"] } log = { workspace = true } @@ -40,7 +40,6 @@ env_logger = { workspace = true } eframe = { workspace = true, features = ["default_fonts", "wgpu", "persistence"] } egui = { workspace = true } egui-wgpu = { workspace = true } -image = { workspace = true } wgpu = { workspace = true } winit = { workspace = true, features = [ "android-native-activity" ] } parking_lot = { workspace = true, features = ["deadlock_detection"] } @@ -48,18 +47,12 @@ anyhow = { workspace = true } chrono = "0.4" bytemuck = { version = "1.16", features = ["derive"] } -nokhwa = { git = "https://github.com/l1npengtul/nokhwa", rev = "4923ecab7cf26f9dba83867a15a9d8662d021296", default-features = false, features = ["output-threaded"] } - [target.'cfg(target_os = "macos")'.dependencies] -nokhwa = { git = "https://github.com/l1npengtul/nokhwa", rev = "4923ecab7cf26f9dba83867a15a9d8662d021296", default-features = false, features = ["input-avfoundation"] } +livekit-capture = { workspace = true, features = ["avfoundation"] } metal = "0.32" -objc2 = { version = "0.6.0", features = ["relax-sign-encoding"] } [target.'cfg(target_os = "linux")'.dependencies] -nokhwa = { git = "https://github.com/l1npengtul/nokhwa", rev = "4923ecab7cf26f9dba83867a15a9d8662d021296", default-features = false, features = ["input-v4l"] } - -[target.'cfg(target_os = "windows")'.dependencies] -nokhwa = { git = "https://github.com/l1npengtul/nokhwa", rev = "4923ecab7cf26f9dba83867a15a9d8662d021296", default-features = false, features = ["input-msmf"] } +livekit-capture = { workspace = true, features = ["libargus", "v4l"] } [build-dependencies] cc = { workspace = true } diff --git a/examples/local_video/src/list_devices.rs b/examples/local_video/src/list_devices.rs index 194aedfa3..d738a8fa1 100644 --- a/examples/local_video/src/list_devices.rs +++ 
b/examples/local_video/src/list_devices.rs @@ -1,14 +1,8 @@ use anyhow::Result; -use nokhwa::pixel_format::RgbFormat; -use nokhwa::utils::{ - ApiBackend, CameraFormat, CameraInfo, FrameFormat, RequestedFormat, RequestedFormatType, - Resolution, -}; -use nokhwa::Camera; -use std::collections::BTreeMap; +use livekit_capture::device::{CaptureDeviceInfo, CaptureFormat}; fn main() -> Result<()> { - let cameras = nokhwa::query(ApiBackend::Auto)?; + let cameras = platform_devices()?; if cameras.is_empty() { println!("No cameras detected."); return Ok(()); @@ -17,96 +11,66 @@ fn main() -> Result<()> { println!("Available cameras and capabilities:"); for (idx, info) in cameras.iter().enumerate() { println!(); - println!("{}. {}", idx, info.human_name()); - match enumerate_capabilities(info) { - Ok(formats) => print_capabilities(&formats), - Err(err) => println!(" Capabilities: unavailable ({})", err), - } + println!("{}. {}", idx, info.name); + print_device_details(info); } Ok(()) } -/// Enumerate camera capabilities using only Nokhwa public APIs. -/// -/// This avoids any direct dependency on platform-specific bindings crates like -/// `nokhwa_bindings_macos`, making the example portable across targets. -fn enumerate_capabilities( - info: &CameraInfo, -) -> Result>>> { - // We don't need to actually capture frames; we just want to query supported formats. - // Using "None" requested format keeps it flexible. - let requested = RequestedFormat::new::(RequestedFormatType::None); - - // `CameraInfo::index()` is what Nokhwa uses to open the device. Depending on Nokhwa - // version, this may be Copy/Clone; clone defensively. - let mut camera = Camera::new(info.index().clone(), requested)?; - - // Prefer FourCC-based queries if available; otherwise fall back to camera formats. 
- let mut capabilities = BTreeMap::new(); - - if let Ok(mut fourccs) = camera.compatible_fourcc() { - fourccs.sort(); - for fourcc in fourccs { - // Returns a map: Resolution -> Vec - let mut res_map = camera.compatible_list_by_resolution(fourcc)?; - let mut res_sorted = BTreeMap::new(); - - for (res, mut fps_list) in res_map.drain() { - fps_list.sort(); - fps_list.dedup(); - res_sorted.insert(res, fps_list); - } +#[cfg(target_os = "macos")] +fn platform_devices() -> Result> { + Ok(livekit_capture::platform::avfoundation::devices()?) +} - capabilities.insert(fourcc, res_sorted); - } - } else { - // Some backends don’t support FourCC enumeration; use generic formats instead. - let formats = camera.compatible_camera_formats()?; - capabilities = capabilities_from_formats(formats); - } +#[cfg(target_os = "linux")] +fn platform_devices() -> Result> { + Ok(livekit_capture::sources::v4l::devices()?) +} - Ok(capabilities) +#[cfg(not(any(target_os = "macos", target_os = "linux")))] +fn platform_devices() -> Result> { + anyhow::bail!( + "camera listing is not supported on {}; local_video supports macOS AVFoundation and Linux V4L2", + std::env::consts::OS + ); } -fn capabilities_from_formats( - formats: Vec, -) -> BTreeMap>> { - let mut capabilities = BTreeMap::new(); - for fmt in formats { - let res_map = capabilities.entry(fmt.format()).or_insert_with(BTreeMap::new); - let fps_list = res_map.entry(fmt.resolution()).or_insert_with(Vec::new); - fps_list.push(fmt.frame_rate()); +fn print_device_details(info: &CaptureDeviceInfo) { + println!(" ID: {}", info.id); + if let Some(model_id) = info.model_id.as_deref() { + println!(" Model: {}", model_id); } - for res_map in capabilities.values_mut() { - for fps_list in res_map.values_mut() { - fps_list.sort(); - fps_list.dedup(); - } + if let Some(manufacturer) = info.manufacturer.as_deref() { + println!(" Manufacturer: {}", manufacturer); } - capabilities + print_capabilities(&info.formats); } -fn print_capabilities(capabilities: 
&BTreeMap>>) { - if capabilities.is_empty() { - println!(" Capabilities: none reported"); +fn print_capabilities(formats: &[CaptureFormat]) { + if formats.is_empty() { + println!(" Capabilities: none reported by backend"); return; } + let mut formats = formats.to_vec(); + formats.sort_by_key(|format| { + ( + format!("{:?}", format.pixel_format), + format.resolution.width, + format.resolution.height, + format.frame_rate, + ) + }); + println!(" Capabilities:"); - for (format, resolutions) in capabilities { - println!(" - Format: {}", format); - if resolutions.is_empty() { - println!(" (no resolutions reported)"); - continue; - } - for (resolution, fps_list) in resolutions { - let fps_text = if fps_list.is_empty() { - "unknown".to_string() - } else { - fps_list.iter().map(|fps| fps.to_string()).collect::>().join(", ") - }; - println!(" {} @ {} fps", resolution, fps_text); - } + for format in formats { + println!( + " - {:?}: {}x{} @ {} fps", + format.pixel_format, + format.resolution.width, + format.resolution.height, + format.frame_rate + ); } } diff --git a/examples/local_video/src/publisher.rs b/examples/local_video/src/publisher.rs index c9cbb93c2..42c2a9687 100644 --- a/examples/local_video/src/publisher.rs +++ b/examples/local_video/src/publisher.rs @@ -12,13 +12,17 @@ use livekit::webrtc::video_source::{RtcVideoSource, VideoResolution}; use livekit_api::access_token; use livekit_api::services::room::{CreateRoomOptions, RoomClient}; use livekit_api::services::{ServiceError, TwirpError, TwirpErrorCode}; -use log::{debug, info}; -use nokhwa::pixel_format::RgbFormat; -use nokhwa::utils::{ - ApiBackend, CameraFormat, CameraIndex, FrameFormat, RequestedFormat, RequestedFormatType, - Resolution, +use livekit_capture::device::{ + CaptureDeviceSelector, CaptureFormat as LkCaptureFormat, CaptureFormatRequest, + CapturePixelFormat, CaptureResolution, +}; +#[cfg(target_os = "macos")] +use livekit_capture::platform::avfoundation::{ + self, AvFoundationCaptureOptions, 
AvFoundationCaptureSession, }; -use nokhwa::Camera; +#[cfg(target_os = "linux")] +use livekit_capture::sources::v4l::{self, V4lCaptureOptions, V4lCaptureSession}; +use log::{debug, info}; use parking_lot::Mutex; use std::collections::{HashMap, VecDeque}; use std::env; @@ -27,7 +31,6 @@ use std::sync::{ Arc, }; use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH}; -use yuv_sys; #[cfg(all(target_os = "linux", target_arch = "aarch64"))] mod argus; @@ -65,7 +68,7 @@ impl From for VideoCodec { /// Selects the camera backend used by the publisher. #[derive(Copy, Clone, Debug, PartialEq, Eq, ValueEnum)] enum SourceKind { - /// USB / V4L2 camera via the `nokhwa` crate (default). + /// Platform camera via livekit-capture (AVFoundation on macOS, V4L2 on Linux). Uvc, /// NVIDIA Jetson MIPI CSI camera via libargus (Jetson-only). Argus, @@ -83,11 +86,12 @@ enum CaptureFormat { } impl CaptureFormat { - fn frame_formats(self) -> &'static [FrameFormat] { + #[cfg(target_os = "linux")] + fn pixel_formats(self) -> &'static [CapturePixelFormat] { match self { - Self::Auto => &[FrameFormat::YUYV, FrameFormat::MJPEG], - Self::Yuv => &[FrameFormat::YUYV], - Self::Mjpeg => &[FrameFormat::MJPEG], + Self::Auto => &[CapturePixelFormat::Yuyv, CapturePixelFormat::Mjpeg], + Self::Yuv => &[CapturePixelFormat::Yuyv], + Self::Mjpeg => &[CapturePixelFormat::Mjpeg], } } } @@ -166,7 +170,7 @@ struct Args { #[arg(long, default_value_t = 0)] camera_index: usize, - /// Camera backend: `uvc` (default, V4L2/USB via nokhwa) or `argus` (Jetson MIPI CSI). + /// Camera backend: `uvc` (default platform camera) or `argus` (Jetson MIPI CSI). 
#[arg(long, value_enum, default_value_t = SourceKind::Uvc)] source: SourceKind, @@ -267,72 +271,6 @@ fn unix_time_us_now() -> u64 { SystemTime::now().duration_since(UNIX_EPOCH).unwrap().as_micros() as u64 } -const MAX_BACKEND_CAPTURE_TIMESTAMP_AGE_US: u64 = 5_000_000; - -#[derive(Default)] -struct CaptureTimestampLogState { - logged_source: bool, - logged_missing: bool, - logged_invalid: bool, -} - -fn validate_backend_capture_timestamp_us( - capture_timestamp: Duration, - read_wall_time_us: u64, -) -> Result { - let capture_timestamp_us = - u64::try_from(capture_timestamp.as_micros()).map_err(|_| "overflows u64")?; - if capture_timestamp_us == 0 { - return Err("is zero"); - } - if capture_timestamp_us > read_wall_time_us { - return Err("is in the future"); - } - if read_wall_time_us - capture_timestamp_us > MAX_BACKEND_CAPTURE_TIMESTAMP_AGE_US { - return Err("is too old"); - } - Ok(capture_timestamp_us) -} - -fn select_capture_wall_time_us( - backend_capture_timestamp: Option, - fallback_wall_time_us: u64, - read_wall_time_us: u64, - log_state: &mut CaptureTimestampLogState, -) -> u64 { - match backend_capture_timestamp { - Some(capture_timestamp) => { - match validate_backend_capture_timestamp_us(capture_timestamp, read_wall_time_us) { - Ok(capture_timestamp_us) => { - if !log_state.logged_source { - info!("Using camera capture_timestamp for user_timestamp"); - log_state.logged_source = true; - } - capture_timestamp_us - } - Err(reason) => { - if !log_state.logged_invalid { - log::warn!( - "Ignoring camera capture_timestamp because it {reason}; falling back to system wall clock" - ); - log_state.logged_invalid = true; - } - fallback_wall_time_us - } - } - } - None => { - if !log_state.logged_missing { - log::warn!( - "Buffer::capture_timestamp() not available; falling back to system wall clock" - ); - log_state.logged_missing = true; - } - fallback_wall_time_us - } - } -} - fn is_twirp_not_found(err: &ServiceError) -> bool { matches!( err, @@ -759,53 +697,35 @@ 
mod tests { current.sensor_exposure_timestamp_us ); } - - #[test] - fn capture_timestamp_validation_rejects_future_timestamp() { - assert_eq!( - validate_backend_capture_timestamp_us(Duration::from_micros(1_001), 1_000), - Err("is in the future") - ); - } - - #[test] - fn capture_timestamp_selection_falls_back_for_invalid_backend_timestamp() { - let mut log_state = CaptureTimestampLogState::default(); - - let selected = select_capture_wall_time_us( - Some(Duration::from_micros(1_001)), - 900, - 1_000, - &mut log_state, - ); - - assert_eq!(selected, 900); - } - - #[test] - fn capture_timestamp_selection_uses_valid_backend_timestamp() { - let mut log_state = CaptureTimestampLogState::default(); - - let selected = select_capture_wall_time_us( - Some(Duration::from_micros(950)), - 900, - 1_000, - &mut log_state, - ); - - assert_eq!(selected, 950); - } } fn list_cameras() -> Result<()> { - let cams = nokhwa::query(ApiBackend::Auto)?; + let cams = platform_devices()?; println!("Available cameras:"); for (i, cam) in cams.iter().enumerate() { - println!("{}. {}", i, cam.human_name()); + println!("{}. {}", i, cam.name); } Ok(()) } +#[cfg(target_os = "macos")] +fn platform_devices() -> Result> { + Ok(avfoundation::devices()?) +} + +#[cfg(target_os = "linux")] +fn platform_devices() -> Result> { + Ok(v4l::devices()?) 
+} + +#[cfg(not(any(target_os = "macos", target_os = "linux")))] +fn platform_devices() -> Result> { + anyhow::bail!( + "camera capture is not supported on {}; local_video supports macOS AVFoundation and Linux V4L2", + std::env::consts::OS + ); +} + fn list_encoders() { println!("Available video encoder backends:"); for backend in VideoEncoderBackend::list_available() { @@ -815,14 +735,52 @@ fn list_encoders() { enum VideoInput { TestPattern(TestPattern), - Camera { - camera: Camera, - is_yuyv: bool, - }, + Camera(PlatformCamera), #[cfg(all(target_os = "linux", target_arch = "aarch64"))] Argus(argus::ArgusCaptureSession), } +enum PlatformCamera { + #[cfg(target_os = "macos")] + AvFoundation(AvFoundationCaptureSession), + #[cfg(target_os = "linux")] + V4l(V4lCaptureSession), +} + +struct PlatformCameraFrame { + frame: VideoFrame, + capture_wall_time_us: u64, + read_wall_time_us: u64, + used_decode_path: bool, +} + +impl PlatformCamera { + fn capture_frame(&mut self) -> Result { + match self { + #[cfg(target_os = "macos")] + Self::AvFoundation(session) => { + let frame = session.capture_frame()?; + Ok(PlatformCameraFrame { + frame: frame.frame, + capture_wall_time_us: frame.capture_wall_time_us, + read_wall_time_us: frame.read_wall_time_us, + used_decode_path: frame.used_decode_path, + }) + } + #[cfg(target_os = "linux")] + Self::V4l(session) => { + let frame = session.capture_frame()?; + Ok(PlatformCameraFrame { + frame: frame.frame, + capture_wall_time_us: frame.capture_wall_time_us, + read_wall_time_us: frame.read_wall_time_us, + used_decode_path: frame.used_decode_path, + }) + } + } + } +} + #[derive(Clone, Copy)] struct CaptureConfig { fps: u32, @@ -847,6 +805,85 @@ fn create_i420_buffer(width: u32, height: u32, align_for_display: bool) -> I420B } } +fn open_platform_camera(args: &Args) -> Result<(u32, u32, VideoInput)> { + #[cfg(target_os = "macos")] + { + if args.format != CaptureFormat::Auto { + log::warn!( + "--format={} is ignored for AVFoundation decoded 
capture; AVFoundation supplies decoded CVPixelBuffers", + args.format + ); + } + let requested = LkCaptureFormat::new( + CaptureResolution::new(args.width, args.height), + args.fps, + CapturePixelFormat::Nv12, + ); + let session = AvFoundationCaptureSession::new(AvFoundationCaptureOptions { + device: CaptureDeviceSelector::Index(args.camera_index), + format: CaptureFormatRequest::Closest(requested), + is_screencast: false, + })?; + let format = session.format(); + info!( + "Camera opened with AVFoundation: {}x{} @ {} fps (source format: {:?}, camera {})", + format.resolution.width, + format.resolution.height, + format.frame_rate, + format.pixel_format, + args.camera_index, + ); + Ok(( + format.resolution.width, + format.resolution.height, + VideoInput::Camera(PlatformCamera::AvFoundation(session)), + )) + } + + #[cfg(target_os = "linux")] + { + let requested = LkCaptureFormat::new( + CaptureResolution::new(args.width, args.height), + args.fps, + args.format.pixel_formats()[0], + ); + let mut options = V4lCaptureOptions::new( + CaptureDeviceSelector::Index(args.camera_index), + requested.resolution, + requested.frame_rate, + ); + options.format = if args.format == CaptureFormat::Auto { + CaptureFormatRequest::Closest(requested) + } else { + CaptureFormatRequest::Exact(requested) + }; + options.pixel_formats = args.format.pixel_formats().to_vec(); + let session = V4lCaptureSession::new(options)?; + let format = session.format(); + info!( + "Camera opened with V4L2: {}x{} @ {} fps (format: {:?}, requested: {})", + format.resolution.width, + format.resolution.height, + format.frame_rate, + format.pixel_format, + args.format, + ); + Ok(( + format.resolution.width, + format.resolution.height, + VideoInput::Camera(PlatformCamera::V4l(session)), + )) + } + + #[cfg(not(any(target_os = "macos", target_os = "linux")))] + { + anyhow::bail!( + "camera capture is not supported on {}; local_video supports macOS AVFoundation and Linux V4L2", + std::env::consts::OS + ); + } +} + 
#[tokio::main] async fn main() -> Result<()> { env_logger::init(); @@ -877,14 +914,17 @@ async fn run(args: Args, ctrl_c_received: Arc) -> Result<()> { // LiveKit connection details let url = args .url + .clone() .or_else(|| env::var("LIVEKIT_URL").ok()) .expect("LIVEKIT_URL must be provided via --url or env"); let api_key = args .api_key + .clone() .or_else(|| env::var("LIVEKIT_API_KEY").ok()) .expect("LIVEKIT_API_KEY must be provided via --api-key or env"); let api_secret = args .api_secret + .clone() .or_else(|| env::var("LIVEKIT_API_SECRET").ok()) .expect("LIVEKIT_API_SECRET must be provided via --api-secret or env"); @@ -1014,85 +1054,7 @@ async fn run(args: Args, ctrl_c_received: Arc) -> Result<()> { ); (width, height, VideoInput::TestPattern(TestPattern::new(width, height))) } else { - // Setup camera - let index = CameraIndex::Index(args.camera_index as u32); - let requested = RequestedFormat::new::( - RequestedFormatType::AbsoluteHighestFrameRate, - ); - let mut camera = Camera::new(index, requested)?; - - let mut requested_camera_format = None; - let mut last_request_error = None; - for frame_format in args.format.frame_formats() { - let wanted = CameraFormat::new( - Resolution::new(args.width, args.height), - *frame_format, - args.fps, - ); - match camera.set_camera_requset(RequestedFormat::new::( - RequestedFormatType::Exact(wanted), - )) { - Ok(format) => { - requested_camera_format = Some(format); - break; - } - Err(err) => { - last_request_error = Some(err); - } - } - } - if let Some(requested_camera_format) = requested_camera_format { - debug!("Requested nokhwa CameraFormat: {:?}", requested_camera_format); - } else if args.format == CaptureFormat::Auto { - if let Some(err) = last_request_error { - log::warn!( - "Failed to request YUYV or MJPEG at {}x{} @ {} fps; using backend-selected camera format: {}", - args.width, - args.height, - args.fps, - err - ); - } - } else { - let formats = args - .format - .frame_formats() - .iter() - 
.map(ToString::to_string) - .collect::>() - .join(" or "); - return Err(match last_request_error { - Some(err) => anyhow::anyhow!( - "failed to request camera format {} at {}x{} @ {} fps: {}", - formats, - args.width, - args.height, - args.fps, - err - ), - None => anyhow::anyhow!("no camera capture formats were requested"), - }); - } - camera.open_stream()?; - let fmt = camera.camera_format(); - let width = fmt.width(); - let height = fmt.height(); - let fps = fmt.frame_rate(); - let is_yuyv = fmt.format() == FrameFormat::YUYV; - info!( - "Camera opened: {}x{} @ {} fps (format: {}, requested: {})", - width, - height, - fps, - fmt.format(), - args.format - ); - debug!("Negotiated nokhwa CameraFormat: {:?}", fmt); - info!( - "Selected conversion path: {}", - if is_yuyv { "YUYV->I420 (libyuv)" } else { "Auto (RGB24 or MJPEG)" } - ); - (width, height, VideoInput::Camera { camera, is_yuyv }) + open_platform_camera(&args)? } } }; @@ -1331,8 +1293,6 @@ async fn run_capture_loop( // Timing accumulators (ms) for rolling stats let mut timings = PublisherTimingSummary::default(); - let mut logged_mjpeg_fallback = false; - let mut capture_timestamp_log_state = CaptureTimestampLogState::default(); let mut frame_counter: u32 = 1; let mut timestamp_overlay = (config.attach_timestamp && config.burn_timestamp) .then(|| TimestampOverlay::new(width, height)); @@ -1347,22 +1307,10 @@ async fn run_capture_loop( ticker.tick().await; let paced_wait_finished_at = Instant::now(); - // WebRTC may queue the frame and hardware encoders may upload it asynchronously. - // Give each submitted frame unique backing storage so later captures cannot - // overwrite buffers that are still in-flight. 
- let mut frame = VideoFrame { - rotation: VideoRotation::VideoRotation0, - timestamp_us: 0, - frame_metadata: None, - buffer: create_i420_buffer(width, height, align_buffers_for_display), - }; - let (stride_y, stride_u, stride_v) = frame.buffer.strides(); - let stride_y_usize = stride_y as usize; - let source_frame_started_at = Instant::now(); let frame_wall_time_us = unix_time_us_now(); - let (data_y, data_u, data_v) = frame.buffer.data_mut(); let ( + mut frame, capture_wall_time_us, read_wall_time_us, source_frame_acquired_at, @@ -1372,6 +1320,17 @@ async fn run_capture_loop( record_convert_timing, ) = match &mut video_input { VideoInput::TestPattern(pattern) => { + // WebRTC may queue the frame and hardware encoders may upload it asynchronously. + // Give each submitted frame unique backing storage so later captures cannot + // overwrite buffers that are still in-flight. + let mut frame = VideoFrame { + rotation: VideoRotation::VideoRotation0, + timestamp_us: 0, + frame_metadata: None, + buffer: create_i420_buffer(width, height, align_buffers_for_display), + }; + let (stride_y, stride_u, stride_v) = frame.buffer.strides(); + let (data_y, data_u, data_v) = frame.buffer.data_mut(); pattern.render( data_y, stride_y as i32, @@ -1382,6 +1341,7 @@ async fn run_capture_loop( ); let frame_acquired_at = Instant::now(); ( + frame, frame_wall_time_us, unix_time_us_now(), frame_acquired_at, @@ -1391,146 +1351,20 @@ async fn run_capture_loop( false, ) } - VideoInput::Camera { camera, is_yuyv } => { - // Capture the frame as early as possible so the attached timestamp is - // close to the camera acquisition point. - let frame_buf = camera.frame()?; - let read_wall_time_us = unix_time_us_now(); + VideoInput::Camera(camera) => { + let mut captured = camera.capture_frame()?; let camera_frame_acquired_at = Instant::now(); - - // Prefer backend capture timestamps only when they are plausible Unix - // wall-clock times. 
Some camera APIs expose stream-relative or future - // presentation timestamps; attaching those makes latency appear negative. - let capture_wall_time_us = select_capture_wall_time_us( - frame_buf.capture_timestamp(), - frame_wall_time_us, - read_wall_time_us, - &mut capture_timestamp_log_state, - ); - - let (decode_finished_at, convert_finished_at, used_decode_path) = if *is_yuyv { - // Fast path for YUYV: convert directly to I420 via libyuv - let src = frame_buf.buffer(); - let src_bytes = src.as_ref(); - let src_stride = (width * 2) as i32; // YUYV packed 4:2:2 - unsafe { - // returns 0 on success - let _ = yuv_sys::rs_YUY2ToI420( - src_bytes.as_ptr(), - src_stride, - data_y.as_mut_ptr(), - stride_y as i32, - data_u.as_mut_ptr(), - stride_u as i32, - data_v.as_mut_ptr(), - stride_v as i32, - width as i32, - height as i32, - ); - } - (camera_frame_acquired_at, Instant::now(), false) - } else { - // Auto path (either RGB24 already or compressed MJPEG) - let src = frame_buf.buffer(); - if src.len() == (width as usize * height as usize * 3) { - // Already RGB24 from backend; convert directly - unsafe { - let _ = yuv_sys::rs_RGB24ToI420( - src.as_ref().as_ptr(), - (width * 3) as i32, - data_y.as_mut_ptr(), - stride_y as i32, - data_u.as_mut_ptr(), - stride_u as i32, - data_v.as_mut_ptr(), - stride_v as i32, - width as i32, - height as i32, - ); - } - (camera_frame_acquired_at, Instant::now(), false) - } else { - // Try fast MJPEG->I420 via libyuv if available; fallback to image crate - let mut used_fast_mjpeg = false; - let fast_mjpeg_buffer_ready_at = unsafe { - // rs_MJPGToI420 returns 0 on success - let ret = yuv_sys::rs_MJPGToI420( - src.as_ref().as_ptr(), - src.len(), - data_y.as_mut_ptr(), - stride_y as i32, - data_u.as_mut_ptr(), - stride_u as i32, - data_v.as_mut_ptr(), - stride_v as i32, - width as i32, - height as i32, - width as i32, - height as i32, - ); - if ret == 0 { - used_fast_mjpeg = true; - Instant::now() - } else { - camera_frame_acquired_at - } - 
}; - if used_fast_mjpeg { - (fast_mjpeg_buffer_ready_at, fast_mjpeg_buffer_ready_at, true) - } else { - // Fallback: decode MJPEG using image crate then RGB24->I420 - match image::load_from_memory(src.as_ref()) { - Ok(img_dyn) => { - let rgb8 = img_dyn.to_rgb8(); - let decode_finished_at = Instant::now(); - let dec_w = rgb8.width() as u32; - let dec_h = rgb8.height() as u32; - if dec_w != width || dec_h != height { - log::warn!( - "Decoded MJPEG size {}x{} differs from requested {}x{}; dropping frame", - dec_w, dec_h, width, height - ); - continue; - } - unsafe { - let _ = yuv_sys::rs_RGB24ToI420( - rgb8.as_raw().as_ptr(), - (dec_w * 3) as i32, - data_y.as_mut_ptr(), - stride_y as i32, - data_u.as_mut_ptr(), - stride_u as i32, - data_v.as_mut_ptr(), - stride_v as i32, - width as i32, - height as i32, - ); - } - (decode_finished_at, Instant::now(), true) - } - Err(e2) => { - if !logged_mjpeg_fallback { - log::error!( - "MJPEG decode failed; buffer not RGB24 and image decode failed: {}", - e2 - ); - logged_mjpeg_fallback = true; - } - continue; - } - } - } - } - }; + captured.frame.rotation = VideoRotation::VideoRotation0; ( - capture_wall_time_us, - read_wall_time_us, + captured.frame, + captured.capture_wall_time_us, + captured.read_wall_time_us, camera_frame_acquired_at, - decode_finished_at, - convert_finished_at, - used_decode_path, - true, + camera_frame_acquired_at, + camera_frame_acquired_at, + captured.used_decode_path, + false, ) } #[cfg(all(target_os = "linux", target_arch = "aarch64"))] @@ -1541,6 +1375,8 @@ async fn run_capture_loop( unreachable!("argus video input must be driven by run_argus_capture_loop") } }; + let (stride_y, _, _) = frame.buffer.strides(); + let stride_y_usize = stride_y as usize; let fid = if config.attach_frame_id { let id = frame_counter; @@ -1557,6 +1393,7 @@ async fn run_capture_loop( let mut burned_timestamp_us = None; if let Some(overlay) = timestamp_overlay.as_mut() { let overlay_started_at = Instant::now(); + let (data_y, _, 
_) = frame.buffer.data_mut(); overlay.draw(data_y, stride_y_usize, capture_wall_time_us, fid); burned_timestamp_us = Some(capture_wall_time_us); let overlay_finished_at = Instant::now(); diff --git a/livekit-capture/Cargo.toml b/livekit-capture/Cargo.toml index e5eaf2920..b0e699fa3 100644 --- a/livekit-capture/Cargo.toml +++ b/livekit-capture/Cargo.toml @@ -10,7 +10,6 @@ repository.workspace = true [dependencies] bytes = { workspace = true } image = { workspace = true, optional = true } -imgproc = { workspace = true, optional = true } livekit = { workspace = true } thiserror = { workspace = true } yuv-sys = { workspace = true, features = ["jpeg"], optional = true } @@ -18,21 +17,22 @@ yuv-sys = { workspace = true, features = ["jpeg"], optional = true } [features] default = [] avfoundation = [ - "dep:block2", "dep:dispatch2", - "dep:imgproc", "dep:objc2", "dep:objc2-av-foundation", "dep:objc2-core-media", "dep:objc2-core-video", "dep:objc2-foundation", + "dep:yuv-sys", "objc2-av-foundation/AVCaptureDevice", "objc2-av-foundation/AVCaptureInput", "objc2-av-foundation/AVCaptureOutputBase", "objc2-av-foundation/AVCaptureSession", + "objc2-av-foundation/AVCaptureSessionPreset", "objc2-av-foundation/AVCaptureVideoDataOutput", "objc2-av-foundation/AVMediaFormat", "objc2-av-foundation/AVVideoSettings", + "objc2-av-foundation/dispatch2", "objc2-av-foundation/objc2-core-media", "objc2-core-media/CMTime", "objc2-core-media/CMSampleBuffer", @@ -44,6 +44,7 @@ avfoundation = [ "objc2-core-video/CVReturn", "objc2-foundation/NSArray", "objc2-foundation/NSDictionary", + "objc2-foundation/NSError", "objc2-foundation/NSObject", "objc2-foundation/NSString", ] @@ -57,7 +58,6 @@ v4l = ["dep:image", "dep:nokhwa", "dep:yuv-sys"] cc = { workspace = true } [target.'cfg(target_os = "macos")'.dependencies] -block2 = { version = "0.6.2", default-features = false, optional = true } dispatch2 = { version = "0.3.1", default-features = false, features = ["std"], optional = true } objc2 = { version 
= "0.6.4", default-features = false, features = ["std"], optional = true } objc2-av-foundation = { version = "0.3.2", default-features = false, optional = true } diff --git a/livekit-capture/src/platform/avfoundation.rs b/livekit-capture/src/platform/avfoundation.rs index c4a64b2c3..5ac7efe3c 100644 --- a/livekit-capture/src/platform/avfoundation.rs +++ b/livekit-capture/src/platform/avfoundation.rs @@ -12,14 +12,27 @@ // See the License for the specific language governing permissions and // limitations under the License. +use std::sync::{ + atomic::{AtomicBool, Ordering}, + Arc, +}; +use std::thread::JoinHandle; + +use livekit::webrtc::video_frame::{I420Buffer, VideoFrame}; use thiserror::Error; use crate::{ - device::{CaptureDeviceInfo, CaptureDeviceSelector, CaptureFormatRequest}, + device::{ + CaptureDeviceInfo, CaptureDeviceSelector, CaptureFormat, CaptureFormatRequest, + CapturePixelFormat, CaptureResolution, + }, error::CaptureError, track::VideoCaptureTrack, }; +#[cfg(target_os = "macos")] +const FIRST_FRAME_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(5); + /// Options used to create an AVFoundation capture session. #[derive(Debug, Clone, PartialEq, Eq)] pub struct AvFoundationCaptureOptions { @@ -41,11 +54,103 @@ impl Default for AvFoundationCaptureOptions { } } -/// AVFoundation decoded-frame capture session. +/// One AVFoundation frame converted to I420. #[derive(Debug)] +pub struct AvFoundationFrame { + /// Decoded I420 frame suitable for [`crate::VideoCaptureTrack::capture_frame`]. + pub frame: VideoFrame, + /// Source pixel format delivered by AVFoundation. + pub source_pixel_format: CapturePixelFormat, + /// Wall-clock timestamp selected for metadata and timing correlation. + pub capture_wall_time_us: u64, + /// Wall-clock timestamp recorded after the frame was read from AVFoundation. + pub read_wall_time_us: u64, + /// Whether compressed image decoding was needed. 
+ pub used_decode_path: bool, +} + +impl AvFoundationFrame { + /// Returns the decoded video frame. + pub fn video_frame(&self) -> &VideoFrame { + &self.frame + } +} + +/// AVFoundation decoded-frame capture session that emits I420 frames. +pub struct AvFoundationCaptureSession { + format: CaptureFormat, + #[cfg(target_os = "macos")] + inner: macos::SessionInner, +} + +impl std::fmt::Debug for AvFoundationCaptureSession { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("AvFoundationCaptureSession").field("format", &self.format).finish() + } +} + +// SAFETY: `AvFoundationCaptureSession` owns AVFoundation objects and only exposes +// `&mut self` frame capture plus `Drop`; moving ownership to another thread does +// not create concurrent access to those Objective-C objects. +#[cfg(target_os = "macos")] +unsafe impl Send for AvFoundationCaptureSession {} + +impl AvFoundationCaptureSession { + /// Opens an AVFoundation decoded-frame capture session. + pub fn new(options: AvFoundationCaptureOptions) -> Result { + validate_options(&options)?; + Self::open(options) + } + + /// Captures the next decoded frame and converts it to I420. + pub fn capture_frame(&mut self) -> Result { + self.capture_frame_inner() + } + + /// Returns the negotiated capture format. 
+ pub fn format(&self) -> CaptureFormat { + self.format + } + + #[cfg(target_os = "macos")] + fn open(options: AvFoundationCaptureOptions) -> Result { + let inner = macos::SessionInner::new(&options)?; + let mut format = inner.wait_for_format(FIRST_FRAME_TIMEOUT)?; + format.frame_rate = requested_frame_rate_hint(&options.format).unwrap_or(30); + Ok(Self { format, inner }) + } + + #[cfg(not(target_os = "macos"))] + fn open(_options: AvFoundationCaptureOptions) -> Result { + Err(AvFoundationError::UnsupportedPlatform) + } + + #[cfg(target_os = "macos")] + fn capture_frame_inner(&mut self) -> Result { + self.inner.capture_frame() + } + + #[cfg(not(target_os = "macos"))] + fn capture_frame_inner(&mut self) -> Result { + Err(AvFoundationError::UnsupportedPlatform) + } +} + +/// AVFoundation decoded-frame capture session that forwards frames into a track. pub struct AvFoundationCapture { track: VideoCaptureTrack, options: AvFoundationCaptureOptions, + runner: Option, +} + +impl std::fmt::Debug for AvFoundationCapture { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("AvFoundationCapture") + .field("track", &self.track) + .field("options", &self.options) + .field("running", &self.runner.is_some()) + .finish() + } } impl AvFoundationCapture { @@ -55,7 +160,7 @@ impl AvFoundationCapture { options: AvFoundationCaptureOptions, ) -> Result { ensure_platform_available()?; - Ok(Self { track, options }) + Ok(Self { track, options, runner: None }) } /// Returns the capture track that receives decoded frames. @@ -68,7 +173,7 @@ impl AvFoundationCapture { &self.options } - /// Starts AVFoundation capture. + /// Starts AVFoundation capture on a background thread. 
pub fn start(&mut self) -> Result<(), AvFoundationError> { start_capture(self) } @@ -79,6 +184,18 @@ impl AvFoundationCapture { } } +impl Drop for AvFoundationCapture { + fn drop(&mut self) { + let _ = self.stop(); + } +} + +#[derive(Debug)] +struct CaptureRunner { + stop: Arc, + handle: JoinHandle<()>, +} + /// Lists AVFoundation video capture devices. pub fn devices() -> Result, AvFoundationError> { list_devices() @@ -86,6 +203,7 @@ pub fn devices() -> Result, AvFoundationError> { /// Error returned by AVFoundation capture. #[derive(Debug, Error)] +#[non_exhaustive] pub enum AvFoundationError { /// AVFoundation capture is only available on macOS. #[error("AVFoundation capture is only available on macOS")] @@ -93,14 +211,126 @@ pub enum AvFoundationError { /// The requested device was not found. #[error("AVFoundation capture device was not found")] DeviceNotFound, - /// The requested operation is represented by the API but not implemented yet. - #[error("{0}")] - NotImplemented(&'static str), + /// The requested option is invalid. + #[error("invalid AVFoundation capture option: {0}")] + InvalidOption(&'static str), + /// The requested capture format is not supported by this backend. + #[error("AVFoundation capture does not support pixel format {0:?}")] + UnsupportedPixelFormat(CapturePixelFormat), + /// AVFoundation could not configure the capture session. + #[error("AVFoundation session setup failed: {0}")] + SessionSetup(String), + /// Timed out waiting for AVFoundation to deliver a frame. + #[error("timed out waiting for AVFoundation frame")] + FrameTimeout, + /// The capture session is already running. + #[error("AVFoundation capture is already running")] + AlreadyRunning, + /// The capture session is not running. + #[error("AVFoundation capture is not running")] + NotRunning, + /// Captured frame bytes did not match the negotiated format. 
+ #[error("invalid AVFoundation frame buffer: {0}")] + InvalidFrame(&'static str), + /// AVFoundation produced a pixel format this backend cannot convert yet. + #[error("unsupported AVFoundation pixel format 0x{0:08x}")] + UnsupportedCoreVideoPixelFormat(u32), + /// Pixel conversion failed. + #[error("failed to convert AVFoundation frame to I420: {0}")] + Convert(&'static str), + /// AVFoundation reported a runtime capture error. + #[error("AVFoundation runtime error: {0}")] + Runtime(String), /// The shared capture track rejected a frame. #[error(transparent)] Capture(#[from] CaptureError), } +fn validate_options(options: &AvFoundationCaptureOptions) -> Result<(), AvFoundationError> { + match &options.device { + CaptureDeviceSelector::Default | CaptureDeviceSelector::Index(_) => {} + CaptureDeviceSelector::Id(id) => { + if id.is_empty() { + return Err(AvFoundationError::InvalidOption("device id must be non-empty")); + } + } + } + + validate_format_request(&options.format) +} + +fn validate_format_request(format: &CaptureFormatRequest) -> Result<(), AvFoundationError> { + let validate_format = |format: &CaptureFormat| { + if format.resolution.width == 0 { + return Err(AvFoundationError::InvalidOption("width must be non-zero")); + } + if format.resolution.height == 0 { + return Err(AvFoundationError::InvalidOption("height must be non-zero")); + } + if format.frame_rate == 0 { + return Err(AvFoundationError::InvalidOption("frame_rate must be non-zero")); + } + validate_pixel_format(format.pixel_format)?; + Ok(()) + }; + + match format { + CaptureFormatRequest::Default => Ok(()), + CaptureFormatRequest::Exact(format) | CaptureFormatRequest::Closest(format) => { + validate_format(format) + } + CaptureFormatRequest::HighestFrameRate { resolution, pixel_format } => { + if let Some(resolution) = resolution { + validate_resolution(*resolution)?; + } + if let Some(pixel_format) = pixel_format { + validate_pixel_format(*pixel_format)?; + } + Ok(()) + } + 
CaptureFormatRequest::HighestResolution { frame_rate, pixel_format } => { + if matches!(frame_rate, Some(0)) { + return Err(AvFoundationError::InvalidOption("frame_rate must be non-zero")); + } + if let Some(pixel_format) = pixel_format { + validate_pixel_format(*pixel_format)?; + } + Ok(()) + } + } +} + +fn validate_pixel_format(pixel_format: CapturePixelFormat) -> Result<(), AvFoundationError> { + if !matches!( + pixel_format, + CapturePixelFormat::Nv12 | CapturePixelFormat::Bgra | CapturePixelFormat::I420 + ) { + return Err(AvFoundationError::UnsupportedPixelFormat(pixel_format)); + } + Ok(()) +} + +fn requested_frame_rate_hint(format: &CaptureFormatRequest) -> Option { + match format { + CaptureFormatRequest::Default => None, + CaptureFormatRequest::Exact(format) | CaptureFormatRequest::Closest(format) => { + Some(format.frame_rate) + } + CaptureFormatRequest::HighestFrameRate { .. } => None, + CaptureFormatRequest::HighestResolution { frame_rate, .. } => *frame_rate, + } +} + +fn validate_resolution(resolution: CaptureResolution) -> Result<(), AvFoundationError> { + if resolution.width == 0 { + return Err(AvFoundationError::InvalidOption("width must be non-zero")); + } + if resolution.height == 0 { + return Err(AvFoundationError::InvalidOption("height must be non-zero")); + } + Ok(()) +} + #[cfg(target_os = "macos")] fn ensure_platform_available() -> Result<(), AvFoundationError> { Ok(()) @@ -149,10 +379,29 @@ fn non_empty_string(value: String) -> Option { } #[cfg(target_os = "macos")] -fn start_capture(_capture: &mut AvFoundationCapture) -> Result<(), AvFoundationError> { - Err(AvFoundationError::NotImplemented( - "AVFoundation decoded-frame delegate capture is not wired yet", - )) +fn start_capture(capture: &mut AvFoundationCapture) -> Result<(), AvFoundationError> { + if capture.runner.is_some() { + return Err(AvFoundationError::AlreadyRunning); + } + + let track = capture.track.clone(); + let mut session = 
AvFoundationCaptureSession::new(capture.options.clone())?; + let stop = Arc::new(AtomicBool::new(false)); + let stop_for_thread = stop.clone(); + let handle = std::thread::Builder::new() + .name("avfoundation-capture".into()) + .spawn(move || { + while !stop_for_thread.load(Ordering::Acquire) { + match session.capture_frame() { + Ok(frame) => track.capture_frame(&frame.frame), + Err(_) => break, + } + } + }) + .map_err(|err| AvFoundationError::SessionSetup(err.to_string()))?; + + capture.runner = Some(CaptureRunner { stop, handle }); + Ok(()) } #[cfg(not(target_os = "macos"))] @@ -161,7 +410,15 @@ fn start_capture(_capture: &mut AvFoundationCapture) -> Result<(), AvFoundationE } #[cfg(target_os = "macos")] -fn stop_capture(_capture: &mut AvFoundationCapture) -> Result<(), AvFoundationError> { +fn stop_capture(capture: &mut AvFoundationCapture) -> Result<(), AvFoundationError> { + let Some(runner) = capture.runner.take() else { + return Ok(()); + }; + + runner.stop.store(true, Ordering::Release); + runner.handle.join().map_err(|_| { + AvFoundationError::Runtime("AVFoundation capture thread panicked".to_string()) + })?; Ok(()) } @@ -169,3 +426,614 @@ fn stop_capture(_capture: &mut AvFoundationCapture) -> Result<(), AvFoundationEr fn stop_capture(_capture: &mut AvFoundationCapture) -> Result<(), AvFoundationError> { Err(AvFoundationError::UnsupportedPlatform) } + +#[cfg(target_os = "macos")] +mod macos { + use std::sync::{Arc, Condvar, Mutex}; + use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH}; + + use dispatch2::{DispatchQueue, DispatchRetained}; + use livekit::webrtc::video_frame::{I420Buffer, VideoBuffer, VideoFrame, VideoRotation}; + use objc2::rc::Retained; + use objc2::runtime::ProtocolObject; + use objc2::{define_class, msg_send, AnyThread, DefinedClass, Message}; + use objc2_av_foundation::{ + AVCaptureDevice, AVCaptureDeviceInput, AVCaptureOutput, AVCaptureSession, + AVCaptureSessionPreset1280x720, AVCaptureSessionPreset1920x1080, + 
AVCaptureSessionPreset640x480, AVCaptureSessionPresetHigh, AVCaptureSessionPresetMedium, + AVCaptureVideoDataOutput, AVCaptureVideoDataOutputSampleBufferDelegate, AVMediaTypeVideo, + }; + use objc2_core_media::{CMSampleBuffer, CMTime}; + use objc2_core_video::{ + kCVPixelFormatType_32BGRA, kCVPixelFormatType_420YpCbCr8BiPlanarFullRange, + kCVPixelFormatType_420YpCbCr8BiPlanarVideoRange, kCVPixelFormatType_420YpCbCr8Planar, + kCVPixelFormatType_420YpCbCr8PlanarFullRange, kCVReturnSuccess, CVImageBuffer, + CVPixelBuffer, CVPixelBufferGetBaseAddress, CVPixelBufferGetBaseAddressOfPlane, + CVPixelBufferGetBytesPerRow, CVPixelBufferGetBytesPerRowOfPlane, CVPixelBufferGetHeight, + CVPixelBufferGetHeightOfPlane, CVPixelBufferGetPixelFormatType, CVPixelBufferGetPlaneCount, + CVPixelBufferGetWidth, CVPixelBufferGetWidthOfPlane, CVPixelBufferLockBaseAddress, + CVPixelBufferLockFlags, CVPixelBufferUnlockBaseAddress, + }; + use objc2_foundation::{NSObject, NSObjectProtocol, NSString}; + + use super::{AvFoundationCaptureOptions, AvFoundationError, AvFoundationFrame}; + use crate::device::{ + CaptureDeviceSelector, CaptureFormat, CaptureFormatRequest, CapturePixelFormat, + CaptureResolution, + }; + use crate::metadata::FrameMetadata; + + pub(super) struct SessionInner { + session: Retained, + _input: Retained, + output: Retained, + _delegate: Retained, + _queue: DispatchRetained, + shared: Arc, + } + + impl std::fmt::Debug for SessionInner { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("SessionInner").finish_non_exhaustive() + } + } + + impl Drop for SessionInner { + fn drop(&mut self) { + self.shared.stop(); + // SAFETY: The output and session are owned by this wrapper. Clearing + // the delegate before stopping prevents callbacks from racing with + // the delegate being released during teardown. 
+ unsafe { + self.output.setSampleBufferDelegate_queue(None, None); + self.session.stopRunning(); + } + } + } + + impl SessionInner { + pub(super) fn new(options: &AvFoundationCaptureOptions) -> Result { + let device = select_device(&options.device)?; + let session = unsafe { AVCaptureSession::new() }; + let input = unsafe { AVCaptureDeviceInput::deviceInputWithDevice_error(&device) } + .map_err(|err| { + AvFoundationError::SessionSetup(err.localizedDescription().to_string()) + })?; + let output = unsafe { AVCaptureVideoDataOutput::new() }; + let shared = Arc::new(FrameQueue::default()); + let delegate = CaptureDelegate::new(shared.clone()); + let queue = DispatchQueue::new("io.livekit.capture.avfoundation", None); + + // SAFETY: The session is newly created and not running. We add a + // camera input and video data output only after canAdd* checks. + unsafe { + session.beginConfiguration(); + if let Some(preset) = session_preset(&options.format) { + session.setSessionPreset(preset); + } + if !session.canAddInput(&input) { + session.commitConfiguration(); + return Err(AvFoundationError::SessionSetup( + "capture device input could not be added".to_string(), + )); + } + session.addInput(&input); + + output.setAlwaysDiscardsLateVideoFrames(true); + output.setSampleBufferDelegate_queue( + Some(ProtocolObject::from_ref(&*delegate)), + Some(&queue), + ); + if !session.canAddOutput(&output) { + session.commitConfiguration(); + return Err(AvFoundationError::SessionSetup( + "video data output could not be added".to_string(), + )); + } + session.addOutput(&output); + session.commitConfiguration(); + } + + configure_device(&device, &options.format)?; + + // SAFETY: Configuration has been committed and the session is ready + // to synchronously start delivering video samples. 
+ unsafe { + session.startRunning(); + } + + Ok(Self { session, _input: input, output, _delegate: delegate, _queue: queue, shared }) + } + + pub(super) fn wait_for_format( + &self, + timeout: Duration, + ) -> Result { + self.shared.wait_for_format(timeout) + } + + pub(super) fn capture_frame(&mut self) -> Result { + self.shared.take_frame() + } + } + + #[derive(Debug)] + struct CaptureDelegateIvars { + shared: Arc, + } + + define_class!( + // SAFETY: + // - The superclass NSObject does not have subclassing requirements. + // - CaptureDelegate does not implement Drop; retained Rust state lives in ivars. + #[unsafe(super = NSObject)] + #[thread_kind = AnyThread] + #[ivars = CaptureDelegateIvars] + struct CaptureDelegate; + + // SAFETY: `NSObjectProtocol` has no additional safety requirements. + unsafe impl NSObjectProtocol for CaptureDelegate {} + + // SAFETY: The selector signatures match the generated AVFoundation protocol. + unsafe impl AVCaptureVideoDataOutputSampleBufferDelegate for CaptureDelegate { + #[unsafe(method(captureOutput:didOutputSampleBuffer:fromConnection:))] + #[allow(non_snake_case)] + unsafe fn captureOutput_didOutputSampleBuffer_fromConnection( + &self, + _output: &AVCaptureOutput, + sample_buffer: &CMSampleBuffer, + _connection: &objc2_av_foundation::AVCaptureConnection, + ) { + if let Err(err) = process_sample_buffer(sample_buffer, &self.ivars().shared) { + self.ivars().shared.set_error(err.to_string()); + } + } + } + ); + + impl CaptureDelegate { + fn new(shared: Arc) -> Retained { + let this = Self::alloc().set_ivars(CaptureDelegateIvars { shared }); + // SAFETY: `this` is freshly allocated and initialized exactly once + // using NSObject's designated initializer. 
+ unsafe { msg_send![super(this), init] } + } + } + + #[derive(Debug)] + struct FrameQueue { + state: Mutex, + ready: Condvar, + started_at: Instant, + } + + impl Default for FrameQueue { + fn default() -> Self { + Self { + state: Mutex::new(FrameQueueState::default()), + ready: Condvar::new(), + started_at: Instant::now(), + } + } + } + + #[derive(Debug, Default)] + struct FrameQueueState { + latest: Option, + stopped: bool, + error: Option, + } + + impl FrameQueue { + fn push_frame(&self, frame: AvFoundationFrame) { + let mut state = self.state.lock().expect("AVFoundation frame queue poisoned"); + if state.stopped { + return; + } + state.latest = Some(frame); + self.ready.notify_one(); + } + + fn set_error(&self, error: String) { + let mut state = self.state.lock().expect("AVFoundation frame queue poisoned"); + state.error = Some(error); + self.ready.notify_all(); + } + + fn stop(&self) { + let mut state = self.state.lock().expect("AVFoundation frame queue poisoned"); + state.stopped = true; + self.ready.notify_all(); + } + + fn wait_for_format(&self, timeout: Duration) -> Result { + let mut state = self.state.lock().expect("AVFoundation frame queue poisoned"); + loop { + if let Some(frame) = state.latest.as_ref() { + let buffer = &frame.frame.buffer; + return Ok(CaptureFormat::new( + CaptureResolution::new(buffer.width(), buffer.height()), + 0, + frame.source_pixel_format, + )); + } + if let Some(error) = state.error.take() { + return Err(AvFoundationError::Runtime(error)); + } + if state.stopped { + return Err(AvFoundationError::NotRunning); + } + + let (next_state, wait_result) = self + .ready + .wait_timeout(state, timeout) + .expect("AVFoundation frame queue poisoned"); + if wait_result.timed_out() { + return Err(AvFoundationError::FrameTimeout); + } + state = next_state; + } + } + + fn take_frame(&self) -> Result { + let mut state = self.state.lock().expect("AVFoundation frame queue poisoned"); + loop { + if let Some(frame) = state.latest.take() { + return 
Ok(frame); + } + if let Some(error) = state.error.take() { + return Err(AvFoundationError::Runtime(error)); + } + if state.stopped { + return Err(AvFoundationError::NotRunning); + } + state = self.ready.wait(state).expect("AVFoundation frame queue poisoned"); + } + } + + fn timestamp_us(&self) -> i64 { + elapsed_us(self.started_at.elapsed()) + } + } + + fn select_device( + selector: &CaptureDeviceSelector, + ) -> Result, AvFoundationError> { + let media_type = unsafe { AVMediaTypeVideo }.ok_or(AvFoundationError::DeviceNotFound)?; + match selector { + CaptureDeviceSelector::Default => { + unsafe { AVCaptureDevice::defaultDeviceWithMediaType(media_type) } + .ok_or(AvFoundationError::DeviceNotFound) + } + CaptureDeviceSelector::Index(index) => { + #[allow(deprecated)] + let devices = unsafe { AVCaptureDevice::devicesWithMediaType(media_type) }; + devices + .iter() + .nth(*index) + .map(|device| device.retain()) + .ok_or(AvFoundationError::DeviceNotFound) + } + CaptureDeviceSelector::Id(id) => { + let id = NSString::from_str(id); + unsafe { AVCaptureDevice::deviceWithUniqueID(&id) } + .ok_or(AvFoundationError::DeviceNotFound) + } + } + } + + fn configure_device( + device: &AVCaptureDevice, + request: &CaptureFormatRequest, + ) -> Result<(), AvFoundationError> { + let Some(frame_rate) = requested_frame_rate(request) else { + return Ok(()); + }; + if frame_rate == 0 { + return Ok(()); + } + + unsafe { device.lockForConfiguration() }.map_err(|err| { + AvFoundationError::SessionSetup(err.localizedDescription().to_string()) + })?; + let duration = unsafe { CMTime::with_seconds(1.0 / frame_rate as f64, 600) }; + // SAFETY: The device is locked for configuration and the CMTime value is finite. 
+ unsafe { + device.setActiveVideoMinFrameDuration(duration); + device.setActiveVideoMaxFrameDuration(duration); + device.unlockForConfiguration(); + } + Ok(()) + } + + fn requested_frame_rate(request: &CaptureFormatRequest) -> Option { + match request { + CaptureFormatRequest::Default => None, + CaptureFormatRequest::Exact(format) | CaptureFormatRequest::Closest(format) => { + Some(format.frame_rate) + } + CaptureFormatRequest::HighestFrameRate { .. } => None, + CaptureFormatRequest::HighestResolution { frame_rate, .. } => *frame_rate, + } + } + + fn session_preset( + request: &CaptureFormatRequest, + ) -> Option<&'static objc2_av_foundation::AVCaptureSessionPreset> { + let resolution = match request { + CaptureFormatRequest::Exact(format) | CaptureFormatRequest::Closest(format) => { + Some(format.resolution) + } + CaptureFormatRequest::HighestFrameRate { resolution, .. } => *resolution, + CaptureFormatRequest::Default + | CaptureFormatRequest::HighestResolution { frame_rate: _, pixel_format: _ } => None, + }?; + + match (resolution.width, resolution.height) { + (1920, 1080) => Some(unsafe { AVCaptureSessionPreset1920x1080 }), + (1280, 720) => Some(unsafe { AVCaptureSessionPreset1280x720 }), + (640, 480) => Some(unsafe { AVCaptureSessionPreset640x480 }), + (w, h) if w <= 640 && h <= 480 => Some(unsafe { AVCaptureSessionPresetMedium }), + _ => Some(unsafe { AVCaptureSessionPresetHigh }), + } + } + + fn process_sample_buffer( + sample_buffer: &CMSampleBuffer, + shared: &FrameQueue, + ) -> Result<(), AvFoundationError> { + let read_wall_time_us = unix_time_us_now().unwrap_or_default(); + let image_buffer = unsafe { sample_buffer.image_buffer() } + .ok_or(AvFoundationError::InvalidFrame("sample buffer has no image buffer"))?; + let image_buffer_ref: &CVImageBuffer = &image_buffer; + // SAFETY: Video data output sample buffers deliver CVPixelBuffer-backed + // CVImageBuffer objects. 
The retained image buffer keeps the object alive + // for the duration of this conversion. + let pixel_buffer = + unsafe { &*(image_buffer_ref as *const CVImageBuffer as *const CVPixelBuffer) }; + let (buffer, source_pixel_format) = convert_pixel_buffer(pixel_buffer)?; + + let capture_wall_time_us = read_wall_time_us; + let frame = VideoFrame { + rotation: VideoRotation::VideoRotation0, + timestamp_us: shared.timestamp_us(), + frame_metadata: FrameMetadata { + user_timestamp: Some(capture_wall_time_us), + frame_id: None, + } + .into_rtc(), + buffer, + }; + + shared.push_frame(AvFoundationFrame { + frame, + source_pixel_format, + capture_wall_time_us, + read_wall_time_us, + used_decode_path: false, + }); + Ok(()) + } + + fn convert_pixel_buffer( + pixel_buffer: &CVPixelBuffer, + ) -> Result<(I420Buffer, CapturePixelFormat), AvFoundationError> { + let lock_flags = CVPixelBufferLockFlags::ReadOnly; + let lock_result = unsafe { CVPixelBufferLockBaseAddress(pixel_buffer, lock_flags) }; + if lock_result != kCVReturnSuccess { + return Err(AvFoundationError::InvalidFrame("CVPixelBuffer lock failed")); + } + + let result = convert_locked_pixel_buffer(pixel_buffer); + + // SAFETY: The pixel buffer was locked above with the same flags. 
+ let unlock_result = unsafe { CVPixelBufferUnlockBaseAddress(pixel_buffer, lock_flags) }; + if unlock_result != kCVReturnSuccess { + return Err(AvFoundationError::InvalidFrame("CVPixelBuffer unlock failed")); + } + + result + } + + fn convert_locked_pixel_buffer( + pixel_buffer: &CVPixelBuffer, + ) -> Result<(I420Buffer, CapturePixelFormat), AvFoundationError> { + let width = u32::try_from(CVPixelBufferGetWidth(pixel_buffer)) + .map_err(|_| AvFoundationError::InvalidFrame("width is out of range"))?; + let height = u32::try_from(CVPixelBufferGetHeight(pixel_buffer)) + .map_err(|_| AvFoundationError::InvalidFrame("height is out of range"))?; + let pixel_format = CVPixelBufferGetPixelFormatType(pixel_buffer); + + match pixel_format { + format + if format == kCVPixelFormatType_420YpCbCr8BiPlanarVideoRange + || format == kCVPixelFormatType_420YpCbCr8BiPlanarFullRange => + { + convert_nv12(pixel_buffer, width, height) + .map(|buffer| (buffer, CapturePixelFormat::Nv12)) + } + format if format == kCVPixelFormatType_32BGRA => { + convert_bgra(pixel_buffer, width, height) + .map(|buffer| (buffer, CapturePixelFormat::Bgra)) + } + format + if format == kCVPixelFormatType_420YpCbCr8Planar + || format == kCVPixelFormatType_420YpCbCr8PlanarFullRange => + { + convert_i420(pixel_buffer, width, height) + .map(|buffer| (buffer, CapturePixelFormat::I420)) + } + other => Err(AvFoundationError::UnsupportedCoreVideoPixelFormat(other)), + } + } + + fn convert_nv12( + pixel_buffer: &CVPixelBuffer, + width: u32, + height: u32, + ) -> Result { + if CVPixelBufferGetPlaneCount(pixel_buffer) < 2 { + return Err(AvFoundationError::InvalidFrame("NV12 buffer has fewer than two planes")); + } + + let y = plane(pixel_buffer, 0)?; + let uv = plane(pixel_buffer, 1)?; + let mut buffer = I420Buffer::new(width, height); + let (stride_y, stride_u, stride_v) = buffer.strides(); + let (dst_y, dst_u, dst_v) = buffer.data_mut(); + let ret = unsafe { + yuv_sys::rs_NV12ToI420( + y.data.as_ptr(), + y.stride as 
i32, + uv.data.as_ptr(), + uv.stride as i32, + dst_y.as_mut_ptr(), + stride_y as i32, + dst_u.as_mut_ptr(), + stride_u as i32, + dst_v.as_mut_ptr(), + stride_v as i32, + width as i32, + height as i32, + ) + }; + if ret != 0 { + return Err(AvFoundationError::Convert("NV12ToI420 failed")); + } + Ok(buffer) + } + + fn convert_bgra( + pixel_buffer: &CVPixelBuffer, + width: u32, + height: u32, + ) -> Result { + let bgra = packed_plane(pixel_buffer, 4)?; + let mut buffer = I420Buffer::new(width, height); + let (stride_y, stride_u, stride_v) = buffer.strides(); + let (dst_y, dst_u, dst_v) = buffer.data_mut(); + let ret = unsafe { + yuv_sys::rs_BGRAToI420( + bgra.data.as_ptr(), + bgra.stride as i32, + dst_y.as_mut_ptr(), + stride_y as i32, + dst_u.as_mut_ptr(), + stride_u as i32, + dst_v.as_mut_ptr(), + stride_v as i32, + width as i32, + height as i32, + ) + }; + if ret != 0 { + return Err(AvFoundationError::Convert("BGRAToI420 failed")); + } + Ok(buffer) + } + + fn convert_i420( + pixel_buffer: &CVPixelBuffer, + width: u32, + height: u32, + ) -> Result { + if CVPixelBufferGetPlaneCount(pixel_buffer) < 3 { + return Err(AvFoundationError::InvalidFrame("I420 buffer has fewer than three planes")); + } + + let y = plane(pixel_buffer, 0)?; + let u = plane(pixel_buffer, 1)?; + let v = plane(pixel_buffer, 2)?; + let mut buffer = I420Buffer::new(width, height); + let (stride_y, stride_u, stride_v) = buffer.strides(); + let (dst_y, dst_u, dst_v) = buffer.data_mut(); + let ret = unsafe { + yuv_sys::rs_I420Copy( + y.data.as_ptr(), + y.stride as i32, + u.data.as_ptr(), + u.stride as i32, + v.data.as_ptr(), + v.stride as i32, + dst_y.as_mut_ptr(), + stride_y as i32, + dst_u.as_mut_ptr(), + stride_u as i32, + dst_v.as_mut_ptr(), + stride_v as i32, + width as i32, + height as i32, + ) + }; + if ret != 0 { + return Err(AvFoundationError::Convert("I420Copy failed")); + } + Ok(buffer) + } + + struct Plane<'a> { + data: &'a [u8], + stride: usize, + } + + fn plane(pixel_buffer: 
&CVPixelBuffer, index: usize) -> Result, AvFoundationError> { + let plane_count = CVPixelBufferGetPlaneCount(pixel_buffer); + if index >= plane_count { + return Err(AvFoundationError::InvalidFrame("plane index is out of range")); + } + + let base = CVPixelBufferGetBaseAddressOfPlane(pixel_buffer, index); + if base.is_null() { + return Err(AvFoundationError::InvalidFrame("pixel plane has no base address")); + } + let stride = CVPixelBufferGetBytesPerRowOfPlane(pixel_buffer, index); + let height = CVPixelBufferGetHeightOfPlane(pixel_buffer, index); + let width = CVPixelBufferGetWidthOfPlane(pixel_buffer, index); + let min_len = stride + .checked_mul(height.saturating_sub(1)) + .and_then(|value| value.checked_add(width)) + .ok_or(AvFoundationError::InvalidFrame("pixel plane size overflow"))?; + + // SAFETY: The CVPixelBuffer is locked for read-only access, the plane + // base address is non-null, and CoreVideo reports the minimum readable + // extent for this plane. + let data = unsafe { std::slice::from_raw_parts(base.cast::(), min_len) }; + Ok(Plane { data, stride }) + } + + fn packed_plane( + pixel_buffer: &CVPixelBuffer, + bytes_per_pixel: usize, + ) -> Result, AvFoundationError> { + let base = CVPixelBufferGetBaseAddress(pixel_buffer); + if base.is_null() { + return Err(AvFoundationError::InvalidFrame("pixel buffer has no base address")); + } + let stride = CVPixelBufferGetBytesPerRow(pixel_buffer); + let height = CVPixelBufferGetHeight(pixel_buffer); + let width = CVPixelBufferGetWidth(pixel_buffer) + .checked_mul(bytes_per_pixel) + .ok_or(AvFoundationError::InvalidFrame("packed pixel row size overflow"))?; + let min_len = stride + .checked_mul(height.saturating_sub(1)) + .and_then(|value| value.checked_add(width)) + .ok_or(AvFoundationError::InvalidFrame("packed pixel buffer size overflow"))?; + + // SAFETY: The CVPixelBuffer is locked for read-only access, the base + // address is non-null, and CoreVideo reports the minimum readable extent + // for this packed 
buffer. + let data = unsafe { std::slice::from_raw_parts(base.cast::(), min_len) }; + Ok(Plane { data, stride }) + } + + fn elapsed_us(duration: Duration) -> i64 { + i64::try_from(duration.as_micros()).unwrap_or(i64::MAX) + } + + fn unix_time_us_now() -> Option { + SystemTime::now() + .duration_since(UNIX_EPOCH) + .ok() + .and_then(|duration| u64::try_from(duration.as_micros()).ok()) + } +} diff --git a/livekit-capture/src/sources/v4l.rs b/livekit-capture/src/sources/v4l.rs index a0725463c..3df2fad03 100644 --- a/livekit-capture/src/sources/v4l.rs +++ b/livekit-capture/src/sources/v4l.rs @@ -122,6 +122,10 @@ pub struct V4lFrame { pub source_pixel_format: CapturePixelFormat, /// Backend-provided capture timestamp, when available. pub backend_capture_timestamp: Option, + /// Wall-clock timestamp selected for metadata and timing correlation. + pub capture_wall_time_us: u64, + /// Wall-clock timestamp recorded after the frame was read from the camera backend. + pub read_wall_time_us: u64, /// Whether compressed image decoding was needed. 
pub used_decode_path: bool, } @@ -226,7 +230,14 @@ impl V4lCaptureSession { )?; let source_pixel_format = capture_pixel_format_from_nokhwa(buffer.source_frame_format())?; - Ok(V4lFrame { frame, source_pixel_format, backend_capture_timestamp, used_decode_path }) + Ok(V4lFrame { + frame, + source_pixel_format, + backend_capture_timestamp, + capture_wall_time_us, + read_wall_time_us, + used_decode_path, + }) } #[cfg(not(target_os = "linux"))] From 6e9973f5f6f88c9229130ec54b8aca82016723f2 Mon Sep 17 00:00:00 2001 From: David Chen Date: Tue, 23 Jun 2026 21:00:33 -0700 Subject: [PATCH 06/24] fix avfoundation --- examples/local_video/src/publisher.rs | 1 + livekit-capture/Cargo.toml | 3 + livekit-capture/src/device.rs | 2 + livekit-capture/src/platform/avfoundation.rs | 392 +++++++++++++++++-- livekit-capture/src/sources/v4l.rs | 4 +- webrtc-sys/src/video_encoder_factory.cpp | 17 +- 6 files changed, 374 insertions(+), 45 deletions(-) diff --git a/examples/local_video/src/publisher.rs b/examples/local_video/src/publisher.rs index 42c2a9687..87ff2e93e 100644 --- a/examples/local_video/src/publisher.rs +++ b/examples/local_video/src/publisher.rs @@ -151,6 +151,7 @@ fn video_encoder_backend_name(backend: VideoEncoderBackend) -> &'static str { VideoEncoderBackend::Nvenc => "nvenc", VideoEncoderBackend::Vaapi => "vaapi", VideoEncoderBackend::VideoToolbox => "videotoolbox", + VideoEncoderBackend::PreEncoded => "preencoded", _ => "unknown", } } diff --git a/livekit-capture/Cargo.toml b/livekit-capture/Cargo.toml index b0e699fa3..5cc486c8b 100644 --- a/livekit-capture/Cargo.toml +++ b/livekit-capture/Cargo.toml @@ -34,6 +34,7 @@ avfoundation = [ "objc2-av-foundation/AVVideoSettings", "objc2-av-foundation/dispatch2", "objc2-av-foundation/objc2-core-media", + "objc2-core-media/CMFormatDescription", "objc2-core-media/CMTime", "objc2-core-media/CMSampleBuffer", "objc2-core-media/objc2-core-video", @@ -46,7 +47,9 @@ avfoundation = [ "objc2-foundation/NSDictionary", 
"objc2-foundation/NSError", "objc2-foundation/NSObject", + "objc2-foundation/NSValue", "objc2-foundation/NSString", + "objc2-foundation/objc2-core-foundation", ] gstreamer = [] libargus = [] diff --git a/livekit-capture/src/device.rs b/livekit-capture/src/device.rs index fbffcd503..9baab00c0 100644 --- a/livekit-capture/src/device.rs +++ b/livekit-capture/src/device.rs @@ -57,6 +57,8 @@ pub enum CapturePixelFormat { Bgr24, /// Packed YUYV/YUY2. Yuyv, + /// Packed UYVY. + Uyvy, /// Single-plane 8-bit luma. Gray, /// Encoded MJPEG frames. diff --git a/livekit-capture/src/platform/avfoundation.rs b/livekit-capture/src/platform/avfoundation.rs index 5ac7efe3c..392cbfc24 100644 --- a/livekit-capture/src/platform/avfoundation.rs +++ b/livekit-capture/src/platform/avfoundation.rs @@ -217,6 +217,9 @@ pub enum AvFoundationError { /// The requested capture format is not supported by this backend. #[error("AVFoundation capture does not support pixel format {0:?}")] UnsupportedPixelFormat(CapturePixelFormat), + /// The requested capture format is not available on the selected device. + #[error("AVFoundation capture format is not available: {0:?}")] + UnsupportedFormat(CaptureFormat), /// AVFoundation could not configure the capture session. 
#[error("AVFoundation session setup failed: {0}")] SessionSetup(String), @@ -435,26 +438,29 @@ mod macos { use dispatch2::{DispatchQueue, DispatchRetained}; use livekit::webrtc::video_frame::{I420Buffer, VideoBuffer, VideoFrame, VideoRotation}; use objc2::rc::Retained; - use objc2::runtime::ProtocolObject; + use objc2::runtime::{AnyObject, ProtocolObject}; use objc2::{define_class, msg_send, AnyThread, DefinedClass, Message}; use objc2_av_foundation::{ - AVCaptureDevice, AVCaptureDeviceInput, AVCaptureOutput, AVCaptureSession, - AVCaptureSessionPreset1280x720, AVCaptureSessionPreset1920x1080, + AVCaptureDevice, AVCaptureDeviceFormat, AVCaptureDeviceInput, AVCaptureOutput, + AVCaptureSession, AVCaptureSessionPreset1280x720, AVCaptureSessionPreset1920x1080, AVCaptureSessionPreset640x480, AVCaptureSessionPresetHigh, AVCaptureSessionPresetMedium, AVCaptureVideoDataOutput, AVCaptureVideoDataOutputSampleBufferDelegate, AVMediaTypeVideo, }; - use objc2_core_media::{CMSampleBuffer, CMTime}; + use objc2_core_media::{CMSampleBuffer, CMTime, CMVideoFormatDescriptionGetDimensions}; use objc2_core_video::{ - kCVPixelFormatType_32BGRA, kCVPixelFormatType_420YpCbCr8BiPlanarFullRange, + kCVPixelBufferPixelFormatTypeKey, kCVPixelFormatType_32BGRA, + kCVPixelFormatType_420YpCbCr8BiPlanarFullRange, kCVPixelFormatType_420YpCbCr8BiPlanarVideoRange, kCVPixelFormatType_420YpCbCr8Planar, - kCVPixelFormatType_420YpCbCr8PlanarFullRange, kCVReturnSuccess, CVImageBuffer, - CVPixelBuffer, CVPixelBufferGetBaseAddress, CVPixelBufferGetBaseAddressOfPlane, - CVPixelBufferGetBytesPerRow, CVPixelBufferGetBytesPerRowOfPlane, CVPixelBufferGetHeight, - CVPixelBufferGetHeightOfPlane, CVPixelBufferGetPixelFormatType, CVPixelBufferGetPlaneCount, - CVPixelBufferGetWidth, CVPixelBufferGetWidthOfPlane, CVPixelBufferLockBaseAddress, - CVPixelBufferLockFlags, CVPixelBufferUnlockBaseAddress, + kCVPixelFormatType_420YpCbCr8PlanarFullRange, kCVPixelFormatType_422YpCbCr8, + kCVPixelFormatType_422YpCbCr8FullRange, 
kCVPixelFormatType_422YpCbCr8_yuvs, + kCVReturnSuccess, CVImageBuffer, CVPixelBuffer, CVPixelBufferGetBaseAddress, + CVPixelBufferGetBaseAddressOfPlane, CVPixelBufferGetBytesPerRow, + CVPixelBufferGetBytesPerRowOfPlane, CVPixelBufferGetHeight, CVPixelBufferGetHeightOfPlane, + CVPixelBufferGetPixelFormatType, CVPixelBufferGetPlaneCount, CVPixelBufferGetWidth, + CVPixelBufferGetWidthOfPlane, CVPixelBufferLockBaseAddress, CVPixelBufferLockFlags, + CVPixelBufferUnlockBaseAddress, }; - use objc2_foundation::{NSObject, NSObjectProtocol, NSString}; + use objc2_foundation::{NSDictionary, NSNumber, NSObject, NSObjectProtocol, NSString}; use super::{AvFoundationCaptureOptions, AvFoundationError, AvFoundationFrame}; use crate::device::{ @@ -503,39 +509,47 @@ mod macos { let shared = Arc::new(FrameQueue::default()); let delegate = CaptureDelegate::new(shared.clone()); let queue = DispatchQueue::new("io.livekit.capture.avfoundation", None); + let active_format = select_active_format(&device, &options.format)?; // SAFETY: The session is newly created and not running. We add a // camera input and video data output only after canAdd* checks. 
unsafe { session.beginConfiguration(); - if let Some(preset) = session_preset(&options.format) { - session.setSessionPreset(preset); + if active_format.is_none() { + if let Some(preset) = session_preset(&options.format) { + session.setSessionPreset(preset); + } } - if !session.canAddInput(&input) { - session.commitConfiguration(); - return Err(AvFoundationError::SessionSetup( - "capture device input could not be added".to_string(), - )); - } - session.addInput(&input); - - output.setAlwaysDiscardsLateVideoFrames(true); - output.setSampleBufferDelegate_queue( - Some(ProtocolObject::from_ref(&*delegate)), - Some(&queue), - ); - if !session.canAddOutput(&output) { - session.commitConfiguration(); - return Err(AvFoundationError::SessionSetup( - "video data output could not be added".to_string(), - )); - } - session.addOutput(&output); + let config_result = (|| { + if !session.canAddInput(&input) { + return Err(AvFoundationError::SessionSetup( + "capture device input could not be added".to_string(), + )); + } + session.addInput(&input); + + configure_device(&device, &options.format, active_format.as_deref())?; + + if let Some(video_settings) = preferred_video_settings(&output) { + output.setVideoSettings(Some(&video_settings)); + } + output.setAlwaysDiscardsLateVideoFrames(true); + output.setSampleBufferDelegate_queue( + Some(ProtocolObject::from_ref(&*delegate)), + Some(&queue), + ); + if !session.canAddOutput(&output) { + return Err(AvFoundationError::SessionSetup( + "video data output could not be added".to_string(), + )); + } + session.addOutput(&output); + Ok(()) + })(); session.commitConfiguration(); + config_result?; } - configure_device(&device, &options.format)?; - // SAFETY: Configuration has been committed and the session is ready // to synchronously start delivering video samples. 
unsafe { @@ -557,6 +571,28 @@ mod macos { } } + fn preferred_video_settings( + output: &AVCaptureVideoDataOutput, + ) -> Option>> { + let preferred = kCVPixelFormatType_420YpCbCr8BiPlanarVideoRange; + // SAFETY: `output` is a live AVCaptureVideoDataOutput owned by the session setup path, and + // querying advertised CV pixel formats does not mutate Rust-managed memory. + let supported = unsafe { output.availableVideoCVPixelFormatTypes() } + .iter() + .any(|format| format.as_u32() == preferred); + if !supported { + return None; + } + + let pixel_format = NSNumber::new_u32(preferred); + // SAFETY: `kCVPixelBufferPixelFormatTypeKey` is a CoreVideo-provided + // immutable CFString constant. `CFString` and `NSString` are toll-free + // bridged, which objc2-foundation exposes through `AsRef`. + let key: &NSString = unsafe { kCVPixelBufferPixelFormatTypeKey }.as_ref(); + let value: &AnyObject = pixel_format.as_ref(); + Some(NSDictionary::from_slices(&[key], &[value])) + } + #[derive(Debug)] struct CaptureDelegateIvars { shared: Arc, @@ -722,26 +758,225 @@ mod macos { } } + fn select_active_format( + device: &AVCaptureDevice, + request: &CaptureFormatRequest, + ) -> Result>, AvFoundationError> { + match request { + CaptureFormatRequest::Default => Ok(None), + CaptureFormatRequest::Exact(format) => { + let selected = best_device_format( + device, + Some(format.resolution), + Some(format.frame_rate), + SelectionMode::Exact, + ); + selected.map(Some).ok_or(AvFoundationError::UnsupportedFormat(*format)) + } + CaptureFormatRequest::Closest(format) => Ok(best_device_format( + device, + Some(format.resolution), + Some(format.frame_rate), + SelectionMode::Closest, + )), + CaptureFormatRequest::HighestFrameRate { resolution, .. } => { + Ok(best_device_format(device, *resolution, None, SelectionMode::HighestFrameRate)) + } + CaptureFormatRequest::HighestResolution { frame_rate, .. 
} => { + Ok(best_device_format(device, None, *frame_rate, SelectionMode::HighestResolution)) + } + } + } + + #[derive(Debug, Clone, Copy, PartialEq, Eq)] + enum SelectionMode { + Exact, + Closest, + HighestFrameRate, + HighestResolution, + } + + #[derive(Debug)] + struct DeviceFormatCandidate { + format: Retained, + resolution: CaptureResolution, + frame_rate_supported: bool, + max_frame_rate: u32, + } + + fn best_device_format( + device: &AVCaptureDevice, + resolution: Option, + frame_rate: Option, + mode: SelectionMode, + ) -> Option> { + // SAFETY: The AVCaptureDevice is retained for the session setup path; querying the + // immutable list of supported formats does not mutate Rust-managed memory. + let formats = unsafe { device.formats() }; + let mut candidates = formats + .iter() + .filter_map(|format| { + let candidate_resolution = device_format_resolution(&format)?; + let frame_rate_supported = frame_rate + .map(|frame_rate| device_format_supports_frame_rate(&format, frame_rate)) + .unwrap_or(true); + Some(DeviceFormatCandidate { + format: format.retain(), + resolution: candidate_resolution, + frame_rate_supported, + max_frame_rate: device_format_max_frame_rate(&format), + }) + }) + .collect::>(); + + if let Some(resolution) = resolution { + if mode == SelectionMode::Exact { + return candidates + .into_iter() + .find(|candidate| { + candidate.resolution == resolution && candidate.frame_rate_supported + }) + .map(|candidate| candidate.format); + } + } + + if frame_rate.is_some() && candidates.iter().any(|candidate| candidate.frame_rate_supported) + { + candidates.retain(|candidate| candidate.frame_rate_supported); + } + + match mode { + SelectionMode::Exact => None, + SelectionMode::Closest => { + let resolution = resolution?; + candidates + .into_iter() + .min_by_key(|candidate| resolution_distance(candidate.resolution, resolution)) + .map(|candidate| candidate.format) + } + SelectionMode::HighestFrameRate => candidates + .into_iter() + .filter(|candidate| { 
+ resolution.map(|resolution| candidate.resolution == resolution).unwrap_or(true) + }) + .max_by_key(|candidate| { + ( + candidate.max_frame_rate, + candidate.resolution.width as u64 * candidate.resolution.height as u64, + ) + }) + .map(|candidate| candidate.format), + SelectionMode::HighestResolution => candidates + .into_iter() + .max_by_key(|candidate| { + ( + candidate.resolution.width as u64 * candidate.resolution.height as u64, + candidate.max_frame_rate, + ) + }) + .map(|candidate| candidate.format), + } + } + + fn device_format_resolution(format: &AVCaptureDeviceFormat) -> Option { + // SAFETY: `format` is an AVCaptureDeviceFormat from the device's immutable formats array. + // Its format description is a valid CMVideoFormatDescription for video capture formats. + let description = unsafe { format.formatDescription() }; + // SAFETY: `description` is the video format description returned by AVFoundation. + let dimensions = unsafe { CMVideoFormatDescriptionGetDimensions(&description) }; + if dimensions.width <= 0 || dimensions.height <= 0 { + return None; + } + Some(CaptureResolution::new(dimensions.width as u32, dimensions.height as u32)) + } + + fn device_format_supports_frame_rate(format: &AVCaptureDeviceFormat, frame_rate: u32) -> bool { + let requested = frame_rate as f64; + // SAFETY: `format` is an AVCaptureDeviceFormat from the device's immutable formats array. + // The returned frame-rate ranges are immutable AVFoundation objects. + unsafe { format.videoSupportedFrameRateRanges() }.iter().any(|range| { + // SAFETY: AVFrameRateRange values are immutable for the lifetime of the object. + let min = unsafe { range.minFrameRate() }; + // SAFETY: AVFrameRateRange values are immutable for the lifetime of the object. 
+ let max = unsafe { range.maxFrameRate() }; + requested >= min.floor() && requested <= max.ceil() + }) + } + + fn device_format_max_frame_rate(format: &AVCaptureDeviceFormat) -> u32 { + // SAFETY: `format` is an AVCaptureDeviceFormat from the device's immutable formats array. + // The returned frame-rate ranges are immutable AVFoundation objects. + unsafe { format.videoSupportedFrameRateRanges() } + .iter() + .map(|range| { + // SAFETY: AVFrameRateRange values are immutable for the lifetime of the object. + unsafe { range.maxFrameRate() }.floor().max(0.0) as u32 + }) + .max() + .unwrap_or_default() + } + + fn resolution_distance(actual: CaptureResolution, requested: CaptureResolution) -> u64 { + let width_delta = actual.width.abs_diff(requested.width) as u64; + let height_delta = actual.height.abs_diff(requested.height) as u64; + let pixel_delta = (actual.width as u64 * actual.height as u64) + .abs_diff(requested.width as u64 * requested.height as u64); + pixel_delta + width_delta * width_delta + height_delta * height_delta + } + fn configure_device( device: &AVCaptureDevice, request: &CaptureFormatRequest, + active_format: Option<&AVCaptureDeviceFormat>, ) -> Result<(), AvFoundationError> { - let Some(frame_rate) = requested_frame_rate(request) else { - return Ok(()); - }; - if frame_rate == 0 { + let frame_rate = requested_frame_rate(request); + if active_format.is_none() && frame_rate.is_none() { return Ok(()); } unsafe { device.lockForConfiguration() }.map_err(|err| { AvFoundationError::SessionSetup(err.localizedDescription().to_string()) })?; + + let configure_result = configure_locked_device(device, active_format, frame_rate); + // SAFETY: The device was successfully locked above and must be unlocked exactly once. 
+ unsafe { + device.unlockForConfiguration(); + } + configure_result + } + + fn configure_locked_device( + device: &AVCaptureDevice, + active_format: Option<&AVCaptureDeviceFormat>, + frame_rate: Option, + ) -> Result<(), AvFoundationError> { + // SAFETY: The caller holds the AVCaptureDevice configuration lock, and `active_format` + // was selected from this device's formats array. + unsafe { + if let Some(active_format) = active_format { + device.setActiveFormat(active_format); + } + } + + let Some(frame_rate) = frame_rate.filter(|frame_rate| *frame_rate > 0) else { + return Ok(()); + }; + + let active_format = match active_format { + Some(active_format) => active_format.retain(), + // SAFETY: The caller holds the configuration lock, and reading activeFormat is valid. + None => unsafe { device.activeFormat() }, + }; + if !device_format_supports_frame_rate(&active_format, frame_rate) { + return Ok(()); + } + let duration = unsafe { CMTime::with_seconds(1.0 / frame_rate as f64, 600) }; // SAFETY: The device is locked for configuration and the CMTime value is finite. 
unsafe { device.setActiveVideoMinFrameDuration(duration); device.setActiveVideoMaxFrameDuration(duration); - device.unlockForConfiguration(); } Ok(()) } @@ -863,6 +1098,17 @@ mod macos { convert_i420(pixel_buffer, width, height) .map(|buffer| (buffer, CapturePixelFormat::I420)) } + format if format == kCVPixelFormatType_422YpCbCr8 => { + convert_uyvy(pixel_buffer, width, height) + .map(|buffer| (buffer, CapturePixelFormat::Uyvy)) + } + format + if format == kCVPixelFormatType_422YpCbCr8_yuvs + || format == kCVPixelFormatType_422YpCbCr8FullRange => + { + convert_yuy2(pixel_buffer, width, height) + .map(|buffer| (buffer, CapturePixelFormat::Yuyv)) + } other => Err(AvFoundationError::UnsupportedCoreVideoPixelFormat(other)), } } @@ -932,6 +1178,70 @@ mod macos { Ok(buffer) } + fn convert_uyvy( + pixel_buffer: &CVPixelBuffer, + width: u32, + height: u32, + ) -> Result { + let uyvy = packed_plane(pixel_buffer, 2)?; + let mut buffer = I420Buffer::new(width, height); + let (stride_y, stride_u, stride_v) = buffer.strides(); + let (dst_y, dst_u, dst_v) = buffer.data_mut(); + // SAFETY: The source slice covers the locked CVPixelBuffer plane for the duration of this + // call, and the destination planes come from a freshly allocated I420Buffer with matching + // width, height, and strides. 
+ let ret = unsafe { + yuv_sys::rs_UYVYToI420( + uyvy.data.as_ptr(), + uyvy.stride as i32, + dst_y.as_mut_ptr(), + stride_y as i32, + dst_u.as_mut_ptr(), + stride_u as i32, + dst_v.as_mut_ptr(), + stride_v as i32, + width as i32, + height as i32, + ) + }; + if ret != 0 { + return Err(AvFoundationError::Convert("UYVYToI420 failed")); + } + Ok(buffer) + } + + fn convert_yuy2( + pixel_buffer: &CVPixelBuffer, + width: u32, + height: u32, + ) -> Result { + let yuy2 = packed_plane(pixel_buffer, 2)?; + let mut buffer = I420Buffer::new(width, height); + let (stride_y, stride_u, stride_v) = buffer.strides(); + let (dst_y, dst_u, dst_v) = buffer.data_mut(); + // SAFETY: The source slice covers the locked CVPixelBuffer plane for the duration of this + // call, and the destination planes come from a freshly allocated I420Buffer with matching + // width, height, and strides. + let ret = unsafe { + yuv_sys::rs_YUY2ToI420( + yuy2.data.as_ptr(), + yuy2.stride as i32, + dst_y.as_mut_ptr(), + stride_y as i32, + dst_u.as_mut_ptr(), + stride_u as i32, + dst_v.as_mut_ptr(), + stride_v as i32, + width as i32, + height as i32, + ) + }; + if ret != 0 { + return Err(AvFoundationError::Convert("YUY2ToI420 failed")); + } + Ok(buffer) + } + fn convert_i420( pixel_buffer: &CVPixelBuffer, width: u32, diff --git a/livekit-capture/src/sources/v4l.rs b/livekit-capture/src/sources/v4l.rs index 3df2fad03..ae8634491 100644 --- a/livekit-capture/src/sources/v4l.rs +++ b/livekit-capture/src/sources/v4l.rs @@ -579,7 +579,7 @@ fn nokhwa_frame_format(pixel_format: CapturePixelFormat) -> Option CapturePixelFormat::Yuyv => Some(FrameFormat::YUYV), CapturePixelFormat::Gray => Some(FrameFormat::GRAY), CapturePixelFormat::Mjpeg => Some(FrameFormat::MJPEG), - CapturePixelFormat::I420 | CapturePixelFormat::Bgra => None, + CapturePixelFormat::I420 | CapturePixelFormat::Bgra | CapturePixelFormat::Uyvy => None, } } @@ -592,7 +592,7 @@ fn nokhwa_frame_format(pixel_format: CapturePixelFormat) -> Option<()> { | 
CapturePixelFormat::Yuyv | CapturePixelFormat::Gray | CapturePixelFormat::Mjpeg => Some(()), - CapturePixelFormat::I420 | CapturePixelFormat::Bgra => None, + CapturePixelFormat::I420 | CapturePixelFormat::Bgra | CapturePixelFormat::Uyvy => None, } } diff --git a/webrtc-sys/src/video_encoder_factory.cpp b/webrtc-sys/src/video_encoder_factory.cpp index 4d3f4ca49..8ec524a15 100644 --- a/webrtc-sys/src/video_encoder_factory.cpp +++ b/webrtc-sys/src/video_encoder_factory.cpp @@ -166,8 +166,12 @@ bool IsSpecificHardwareBackend(VideoEncoderBackend backend) { bool BackendMatches(VideoEncoderBackend requested, VideoEncoderBackend actual) { return requested == actual || (requested == VideoEncoderBackend::Hardware && - actual != VideoEncoderBackend::Software && - actual != VideoEncoderBackend::Auto); + (actual == VideoEncoderBackend::Hardware || + IsSpecificHardwareBackend(actual))); +} + +bool IsAutomaticFallbackBackend(VideoEncoderBackend backend) { + return backend != VideoEncoderBackend::PreEncoded; } void AddBackendFactory( @@ -354,6 +358,9 @@ std::vector VideoEncoderFactory::InternalFactory::GetImplementations() const { std::vector formats; for (const auto& backend_factory : factories_) { + if (backend_factory.backend == VideoEncoderBackend::PreEncoded) { + continue; + } for (const auto& format : backend_factory.factory->GetImplementations()) { formats.push_back(WithBackend(format, backend_factory.backend)); if (IsSpecificHardwareBackend(backend_factory.backend)) { @@ -418,6 +425,9 @@ VideoEncoderFactory::InternalFactory::QueryCodecSupport( } for (const auto& backend_factory : factories_) { + if (!IsAutomaticFallbackBackend(backend_factory.backend)) { + continue; + } for (const auto& supported_format : backend_factory.factory->GetSupportedFormats()) { if (stripped_format.IsSameCodec(supported_format)) { @@ -480,6 +490,9 @@ VideoEncoderFactory::InternalFactory::Create( } for (const auto& backend_factory : factories_) { + if 
(!IsAutomaticFallbackBackend(backend_factory.backend)) { + continue; + } for (const auto& supported_format : backend_factory.factory->GetSupportedFormats()) { if (supported_format.IsSameCodec(stripped_format)) From a6cca8a8ff672437430e88cab64d37b6974cea79 Mon Sep 17 00:00:00 2001 From: David Chen Date: Wed, 24 Jun 2026 10:43:47 -0700 Subject: [PATCH 07/24] rename for clarity --- examples/local_video/src/publisher.rs | 26 +-- livekit-capture/src/device.rs | 201 +++++++++++++++++-- livekit-capture/src/lib.rs | 14 +- livekit-capture/src/platform/avfoundation.rs | 96 +++++---- livekit-capture/src/sources/argus.rs | 60 +++++- livekit-capture/src/sources/mod.rs | 6 + livekit-capture/src/sources/v4l.rs | 191 ++++++++++-------- livekit-capture/src/track.rs | 13 +- 8 files changed, 438 insertions(+), 169 deletions(-) diff --git a/examples/local_video/src/publisher.rs b/examples/local_video/src/publisher.rs index 87ff2e93e..248b08d63 100644 --- a/examples/local_video/src/publisher.rs +++ b/examples/local_video/src/publisher.rs @@ -14,10 +14,10 @@ use livekit_api::services::room::{CreateRoomOptions, RoomClient}; use livekit_api::services::{ServiceError, TwirpError, TwirpErrorCode}; use livekit_capture::device::{ CaptureDeviceSelector, CaptureFormat as LkCaptureFormat, CaptureFormatRequest, - CapturePixelFormat, CaptureResolution, + CaptureFrameFormat, CaptureResolution, }; #[cfg(target_os = "macos")] -use livekit_capture::platform::avfoundation::{ +use livekit_capture::sources::avfoundation::{ self, AvFoundationCaptureOptions, AvFoundationCaptureSession, }; #[cfg(target_os = "linux")] @@ -74,7 +74,7 @@ enum SourceKind { Argus, } -/// Selects the UVC camera capture pixel format. +/// Selects the UVC camera capture frame format. #[derive(Copy, Clone, Debug, PartialEq, Eq, ValueEnum)] enum CaptureFormat { /// Try YUYV first and fall back to MJPEG. 
@@ -87,11 +87,11 @@ enum CaptureFormat { impl CaptureFormat { #[cfg(target_os = "linux")] - fn pixel_formats(self) -> &'static [CapturePixelFormat] { + fn frame_formats(self) -> &'static [CaptureFrameFormat] { match self { - Self::Auto => &[CapturePixelFormat::Yuyv, CapturePixelFormat::Mjpeg], - Self::Yuv => &[CapturePixelFormat::Yuyv], - Self::Mjpeg => &[CapturePixelFormat::Mjpeg], + Self::Auto => &[CaptureFrameFormat::Yuyv, CaptureFrameFormat::Mjpeg], + Self::Yuv => &[CaptureFrameFormat::Yuyv], + Self::Mjpeg => &[CaptureFrameFormat::Mjpeg], } } } @@ -765,7 +765,7 @@ impl PlatformCamera { frame: frame.frame, capture_wall_time_us: frame.capture_wall_time_us, read_wall_time_us: frame.read_wall_time_us, - used_decode_path: frame.used_decode_path, + used_decode_path: false, }) } #[cfg(target_os = "linux")] @@ -818,7 +818,7 @@ fn open_platform_camera(args: &Args) -> Result<(u32, u32, VideoInput)> { let requested = LkCaptureFormat::new( CaptureResolution::new(args.width, args.height), args.fps, - CapturePixelFormat::Nv12, + CaptureFrameFormat::Nv12, ); let session = AvFoundationCaptureSession::new(AvFoundationCaptureOptions { device: CaptureDeviceSelector::Index(args.camera_index), @@ -831,7 +831,7 @@ fn open_platform_camera(args: &Args) -> Result<(u32, u32, VideoInput)> { format.resolution.width, format.resolution.height, format.frame_rate, - format.pixel_format, + format.frame_format, args.camera_index, ); Ok(( @@ -846,7 +846,7 @@ fn open_platform_camera(args: &Args) -> Result<(u32, u32, VideoInput)> { let requested = LkCaptureFormat::new( CaptureResolution::new(args.width, args.height), args.fps, - args.format.pixel_formats()[0], + args.format.frame_formats()[0], ); let mut options = V4lCaptureOptions::new( CaptureDeviceSelector::Index(args.camera_index), @@ -858,7 +858,7 @@ fn open_platform_camera(args: &Args) -> Result<(u32, u32, VideoInput)> { } else { CaptureFormatRequest::Exact(requested) }; - options.pixel_formats = args.format.pixel_formats().to_vec(); + 
options.frame_formats = args.format.frame_formats().to_vec(); let session = V4lCaptureSession::new(options)?; let format = session.format(); info!( @@ -866,7 +866,7 @@ fn open_platform_camera(args: &Args) -> Result<(u32, u32, VideoInput)> { format.resolution.width, format.resolution.height, format.frame_rate, - format.pixel_format, + format.frame_format, args.format, ); Ok(( diff --git a/livekit-capture/src/device.rs b/livekit-capture/src/device.rs index 9baab00c0..106821bff 100644 --- a/livekit-capture/src/device.rs +++ b/livekit-capture/src/device.rs @@ -12,21 +12,102 @@ // See the License for the specific language governing permissions and // limitations under the License. +use std::fmt; + use livekit::webrtc::video_source::VideoResolution; +use thiserror::Error; + +/// Capture backend used by a source implementation. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +#[non_exhaustive] +pub enum CaptureBackend { + /// Let `livekit-capture` choose the platform default backend. + Auto, + /// macOS AVFoundation camera capture. + AvFoundation, + /// Linux Video4Linux2 camera capture. + V4l2, + /// NVIDIA Jetson libargus camera capture. + LibArgus, + /// RTSP encoded ingress. + Rtsp, + /// TCP byte-stream encoded ingress. + Tcp, + /// GStreamer appsink encoded ingress. + Gstreamer, +} + +impl CaptureBackend { + /// Returns a stable backend name. + pub const fn as_str(self) -> &'static str { + match self { + Self::Auto => "auto", + Self::AvFoundation => "avfoundation", + Self::V4l2 => "v4l2", + Self::LibArgus => "libargus", + Self::Rtsp => "rtsp", + Self::Tcp => "tcp", + Self::Gstreamer => "gstreamer", + } + } +} + +impl fmt::Display for CaptureBackend { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.write_str(self.as_str()) + } +} + +/// Capture path used by a source implementation. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +#[non_exhaustive] +pub enum CapturePath { + /// Uncompressed CPU-accessible frame buffers. 
+ Raw, + /// Linux DMA-BUF backed frames. + DmaBuf, + /// Compressed encoded access units. + Encoded, +} + +/// Error returned while querying capture devices. +#[derive(Debug, Error)] +#[non_exhaustive] +pub enum CaptureDeviceQueryError { + /// The backend does not support device enumeration on this target or build. + #[error("capture backend {0} does not support device enumeration")] + UnsupportedBackend(CaptureBackend), + /// The backend failed while querying devices. + #[error("capture backend {backend} device query failed: {message}")] + Backend { + /// Backend that failed. + backend: CaptureBackend, + /// Backend error message. + message: String, + }, +} /// Capture device discovered by a platform backend. #[derive(Debug, Clone, PartialEq, Eq)] pub struct CaptureDeviceInfo { + /// Backend that reported this device. + pub backend: CaptureBackend, /// Backend-stable device identifier. pub id: String, + /// Preferred selector that reopens this exact device. + pub selector: CaptureDeviceSelector, /// Human-readable device name. pub name: String, /// Device model identifier, when available. pub model_id: Option, /// Device manufacturer, when available. pub manufacturer: Option, + /// Capture paths supported by this device. + pub paths: Vec, /// Capture formats reported by the backend. pub formats: Vec, + /// Whether [`CaptureDeviceInfo::formats`] is a complete backend-reported list. + pub formats_complete: bool, } /// Device selector used by capture backends. @@ -41,10 +122,10 @@ pub enum CaptureDeviceSelector { Id(String), } -/// Pixel format used by a decoded-frame capture backend. +/// Frame format used by a raw-frame capture backend. #[derive(Debug, Clone, Copy, PartialEq, Eq)] #[non_exhaustive] -pub enum CapturePixelFormat { +pub enum CaptureFrameFormat { /// Planar I420/YUV420P. I420, /// Biplanar NV12. @@ -65,6 +146,57 @@ pub enum CapturePixelFormat { Mjpeg, } +impl CaptureFrameFormat { + /// Returns a stable lower-case frame-format name. 
+ pub const fn as_str(self) -> &'static str { + match self { + Self::I420 => "i420", + Self::Nv12 => "nv12", + Self::Bgra => "bgra", + Self::Rgb24 => "rgb24", + Self::Bgr24 => "bgr24", + Self::Yuyv => "yuyv", + Self::Uyvy => "uyvy", + Self::Gray => "gray", + Self::Mjpeg => "mjpeg", + } + } +} + +impl fmt::Display for CaptureFrameFormat { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.write_str(self.as_str()) + } +} + +impl std::str::FromStr for CaptureFrameFormat { + type Err = CaptureFrameFormatParseError; + + fn from_str(value: &str) -> Result { + match value.to_ascii_lowercase().as_str() { + "i420" | "yuv420p" => Ok(Self::I420), + "nv12" => Ok(Self::Nv12), + "bgra" => Ok(Self::Bgra), + "rgb24" | "rgb" => Ok(Self::Rgb24), + "bgr24" | "bgr" => Ok(Self::Bgr24), + "yuyv" | "yuy2" => Ok(Self::Yuyv), + "uyvy" => Ok(Self::Uyvy), + "gray" | "grey" | "greyscale" | "grayscale" => Ok(Self::Gray), + "mjpeg" | "mjpg" => Ok(Self::Mjpeg), + _ => Err(CaptureFrameFormatParseError), + } + } +} + +/// Error returned when parsing a [`CaptureFrameFormat`] from a string. +#[derive(Debug, Clone, Copy, Error, PartialEq, Eq)] +#[error("unknown capture frame format")] +pub struct CaptureFrameFormatParseError; + +/// Deprecated alias for [`CaptureFrameFormat`]. +#[deprecated(note = "use CaptureFrameFormat")] +pub type CapturePixelFormat = CaptureFrameFormat; + /// Pixel dimensions for a capture format. #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub struct CaptureResolution { @@ -87,25 +219,25 @@ impl From for VideoResolution { } } -/// Decoded-frame capture format. +/// Raw-frame capture format. #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub struct CaptureFormat { /// Frame dimensions. pub resolution: CaptureResolution, /// Frame rate in frames per second. pub frame_rate: u32, - /// Pixel format. - pub pixel_format: CapturePixelFormat, + /// Frame format. + pub frame_format: CaptureFrameFormat, } impl CaptureFormat { - /// Creates a decoded-frame capture format. 
+ /// Creates a raw-frame capture format. pub const fn new( resolution: CaptureResolution, frame_rate: u32, - pixel_format: CapturePixelFormat, + frame_format: CaptureFrameFormat, ) -> Self { - Self { resolution, frame_rate, pixel_format } + Self { resolution, frame_rate, frame_format } } } @@ -119,18 +251,59 @@ pub enum CaptureFormatRequest { Exact(CaptureFormat), /// Use the backend's closest supported format. Closest(CaptureFormat), - /// Prefer the highest frame rate, optionally constrained by resolution and pixel format. + /// Prefer the highest frame rate, optionally constrained by resolution and frame format. HighestFrameRate { /// Optional resolution constraint. resolution: Option, - /// Optional pixel format constraint. - pixel_format: Option, + /// Optional frame format constraint. + frame_format: Option, }, - /// Prefer the highest resolution, optionally constrained by frame rate and pixel format. + /// Prefer the highest resolution, optionally constrained by frame rate and frame format. HighestResolution { /// Optional frame-rate constraint. frame_rate: Option, - /// Optional pixel format constraint. - pixel_format: Option, + /// Optional frame format constraint. 
+ frame_format: Option, }, } + +#[cfg(test)] +mod tests { + use super::*; + use std::str::FromStr; + + #[test] + fn capture_frame_format_parses_common_names() { + assert_eq!(CaptureFrameFormat::from_str("MJPEG"), Ok(CaptureFrameFormat::Mjpeg)); + assert_eq!(CaptureFrameFormat::from_str("mjpg"), Ok(CaptureFrameFormat::Mjpeg)); + assert_eq!(CaptureFrameFormat::from_str("gray"), Ok(CaptureFrameFormat::Gray)); + assert_eq!(CaptureFrameFormat::from_str("GREY"), Ok(CaptureFrameFormat::Gray)); + assert_eq!(CaptureFrameFormat::from_str("yuy2"), Ok(CaptureFrameFormat::Yuyv)); + } + + #[test] + fn capture_frame_format_displays_canonical_names() { + assert_eq!(CaptureFrameFormat::Mjpeg.to_string(), "mjpeg"); + assert_eq!(CaptureFrameFormat::Gray.to_string(), "gray"); + } + + #[test] + fn device_info_can_report_incomplete_format_lists() { + let info = CaptureDeviceInfo { + backend: CaptureBackend::AvFoundation, + id: "camera-0".to_string(), + selector: CaptureDeviceSelector::Id("camera-0".to_string()), + name: "Camera".to_string(), + model_id: None, + manufacturer: None, + paths: vec![CapturePath::Raw], + formats: Vec::new(), + formats_complete: false, + }; + + assert_eq!(info.backend, CaptureBackend::AvFoundation); + assert_eq!(info.selector, CaptureDeviceSelector::Id("camera-0".to_string())); + assert_eq!(info.paths, vec![CapturePath::Raw]); + assert!(!info.formats_complete); + } +} diff --git a/livekit-capture/src/lib.rs b/livekit-capture/src/lib.rs index 1dee1586f..dfcc9ef83 100644 --- a/livekit-capture/src/lib.rs +++ b/livekit-capture/src/lib.rs @@ -20,12 +20,15 @@ pub mod encoded; mod error; pub mod metadata; pub mod platform; +pub mod source; pub mod sources; pub mod track; +#[allow(deprecated)] +pub use device::CapturePixelFormat; pub use device::{ - CaptureDeviceInfo, CaptureDeviceSelector, CaptureFormat, CaptureFormatRequest, - CapturePixelFormat, CaptureResolution, + CaptureBackend, CaptureDeviceInfo, CaptureDeviceQueryError, CaptureDeviceSelector, + CaptureFormat, 
CaptureFormatRequest, CaptureFrameFormat, CapturePath, CaptureResolution, }; pub use dmabuf::{DmaBufFrame, DmaBufPixelFormat, DmaBufPlane}; pub use encoded::{ @@ -36,4 +39,9 @@ pub use encoded::{ }; pub use error::CaptureError; pub use metadata::FrameMetadata; -pub use track::{CapturePath, VideoCaptureTrack}; +pub use source::{ + CaptureFrame, CaptureFrameSource, CaptureMetadataOptions, CaptureSourceError, + CaptureSourceOptions, CaptureTimestampSource, EncodedCaptureFrameSource, + EncodedFrameSourceError, RawVideoFrame, VideoCaptureSource, +}; +pub use track::VideoCaptureTrack; diff --git a/livekit-capture/src/platform/avfoundation.rs b/livekit-capture/src/platform/avfoundation.rs index 392cbfc24..202ba137a 100644 --- a/livekit-capture/src/platform/avfoundation.rs +++ b/livekit-capture/src/platform/avfoundation.rs @@ -23,8 +23,8 @@ use thiserror::Error; use crate::{ device::{ - CaptureDeviceInfo, CaptureDeviceSelector, CaptureFormat, CaptureFormatRequest, - CapturePixelFormat, CaptureResolution, + CaptureBackend, CaptureDeviceInfo, CaptureDeviceSelector, CaptureFormat, + CaptureFormatRequest, CaptureFrameFormat, CapturePath, CaptureResolution, }, error::CaptureError, track::VideoCaptureTrack, @@ -59,14 +59,14 @@ impl Default for AvFoundationCaptureOptions { pub struct AvFoundationFrame { /// Decoded I420 frame suitable for [`crate::VideoCaptureTrack::capture_frame`]. pub frame: VideoFrame, - /// Source pixel format delivered by AVFoundation. - pub source_pixel_format: CapturePixelFormat, + /// Source frame format delivered by AVFoundation. + pub source_format: CaptureFrameFormat, /// Wall-clock timestamp selected for metadata and timing correlation. pub capture_wall_time_us: u64, /// Wall-clock timestamp recorded after the frame was read from AVFoundation. pub read_wall_time_us: u64, - /// Whether compressed image decoding was needed. - pub used_decode_path: bool, + /// Whether conversion from the source format to I420 was needed. 
+ pub used_conversion: bool, } impl AvFoundationFrame { @@ -79,13 +79,17 @@ impl AvFoundationFrame { /// AVFoundation decoded-frame capture session that emits I420 frames. pub struct AvFoundationCaptureSession { format: CaptureFormat, + options: AvFoundationCaptureOptions, #[cfg(target_os = "macos")] inner: macos::SessionInner, } impl std::fmt::Debug for AvFoundationCaptureSession { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.debug_struct("AvFoundationCaptureSession").field("format", &self.format).finish() + f.debug_struct("AvFoundationCaptureSession") + .field("format", &self.format) + .field("options", &self.options) + .finish() } } @@ -112,12 +116,22 @@ impl AvFoundationCaptureSession { self.format } + /// Returns the configured capture options. + pub fn options(&self) -> &AvFoundationCaptureOptions { + &self.options + } + + /// Returns the capture path produced by this session. + pub fn capture_path(&self) -> CapturePath { + CapturePath::Raw + } + #[cfg(target_os = "macos")] fn open(options: AvFoundationCaptureOptions) -> Result { let inner = macos::SessionInner::new(&options)?; let mut format = inner.wait_for_format(FIRST_FRAME_TIMEOUT)?; format.frame_rate = requested_frame_rate_hint(&options.format).unwrap_or(30); - Ok(Self { format, inner }) + Ok(Self { format, options, inner }) } #[cfg(not(target_os = "macos"))] @@ -214,9 +228,9 @@ pub enum AvFoundationError { /// The requested option is invalid. #[error("invalid AVFoundation capture option: {0}")] InvalidOption(&'static str), - /// The requested capture format is not supported by this backend. - #[error("AVFoundation capture does not support pixel format {0:?}")] - UnsupportedPixelFormat(CapturePixelFormat), + /// The requested capture frame format is not supported by this backend. + #[error("AVFoundation capture does not support frame format {0:?}")] + UnsupportedFrameFormat(CaptureFrameFormat), /// The requested capture format is not available on the selected device. 
#[error("AVFoundation capture format is not available: {0:?}")] UnsupportedFormat(CaptureFormat), @@ -273,7 +287,7 @@ fn validate_format_request(format: &CaptureFormatRequest) -> Result<(), AvFounda if format.frame_rate == 0 { return Err(AvFoundationError::InvalidOption("frame_rate must be non-zero")); } - validate_pixel_format(format.pixel_format)?; + validate_frame_format(format.frame_format)?; Ok(()) }; @@ -282,33 +296,33 @@ fn validate_format_request(format: &CaptureFormatRequest) -> Result<(), AvFounda CaptureFormatRequest::Exact(format) | CaptureFormatRequest::Closest(format) => { validate_format(format) } - CaptureFormatRequest::HighestFrameRate { resolution, pixel_format } => { + CaptureFormatRequest::HighestFrameRate { resolution, frame_format } => { if let Some(resolution) = resolution { validate_resolution(*resolution)?; } - if let Some(pixel_format) = pixel_format { - validate_pixel_format(*pixel_format)?; + if let Some(frame_format) = frame_format { + validate_frame_format(*frame_format)?; } Ok(()) } - CaptureFormatRequest::HighestResolution { frame_rate, pixel_format } => { + CaptureFormatRequest::HighestResolution { frame_rate, frame_format } => { if matches!(frame_rate, Some(0)) { return Err(AvFoundationError::InvalidOption("frame_rate must be non-zero")); } - if let Some(pixel_format) = pixel_format { - validate_pixel_format(*pixel_format)?; + if let Some(frame_format) = frame_format { + validate_frame_format(*frame_format)?; } Ok(()) } } } -fn validate_pixel_format(pixel_format: CapturePixelFormat) -> Result<(), AvFoundationError> { +fn validate_frame_format(frame_format: CaptureFrameFormat) -> Result<(), AvFoundationError> { if !matches!( - pixel_format, - CapturePixelFormat::Nv12 | CapturePixelFormat::Bgra | CapturePixelFormat::I420 + frame_format, + CaptureFrameFormat::Nv12 | CaptureFrameFormat::Bgra | CaptureFrameFormat::I420 ) { - return Err(AvFoundationError::UnsupportedPixelFormat(pixel_format)); + return 
Err(AvFoundationError::UnsupportedFrameFormat(frame_format)); } Ok(()) } @@ -365,7 +379,17 @@ fn list_devices() -> Result, AvFoundationError> { let model_id = non_empty_string(unsafe { device.modelID() }.to_string()); let manufacturer = non_empty_string(unsafe { device.manufacturer() }.to_string()); - results.push(CaptureDeviceInfo { id, name, model_id, manufacturer, formats: Vec::new() }); + results.push(CaptureDeviceInfo { + backend: CaptureBackend::AvFoundation, + id: id.clone(), + selector: CaptureDeviceSelector::Id(id), + name, + model_id, + manufacturer, + paths: vec![CapturePath::Raw], + formats: Vec::new(), + formats_complete: false, + }); } Ok(results) @@ -464,7 +488,7 @@ mod macos { use super::{AvFoundationCaptureOptions, AvFoundationError, AvFoundationFrame}; use crate::device::{ - CaptureDeviceSelector, CaptureFormat, CaptureFormatRequest, CapturePixelFormat, + CaptureDeviceSelector, CaptureFormat, CaptureFormatRequest, CaptureFrameFormat, CaptureResolution, }; use crate::metadata::FrameMetadata; @@ -690,7 +714,7 @@ mod macos { return Ok(CaptureFormat::new( CaptureResolution::new(buffer.width(), buffer.height()), 0, - frame.source_pixel_format, + frame.source_format, )); } if let Some(error) = state.error.take() { @@ -1001,7 +1025,7 @@ mod macos { } CaptureFormatRequest::HighestFrameRate { resolution, .. } => *resolution, CaptureFormatRequest::Default - | CaptureFormatRequest::HighestResolution { frame_rate: _, pixel_format: _ } => None, + | CaptureFormatRequest::HighestResolution { frame_rate: _, frame_format: _ } => None, }?; match (resolution.width, resolution.height) { @@ -1026,7 +1050,7 @@ mod macos { // for the duration of this conversion. 
let pixel_buffer = unsafe { &*(image_buffer_ref as *const CVImageBuffer as *const CVPixelBuffer) }; - let (buffer, source_pixel_format) = convert_pixel_buffer(pixel_buffer)?; + let (buffer, source_format) = convert_pixel_buffer(pixel_buffer)?; let capture_wall_time_us = read_wall_time_us; let frame = VideoFrame { @@ -1042,17 +1066,17 @@ mod macos { shared.push_frame(AvFoundationFrame { frame, - source_pixel_format, + source_format, capture_wall_time_us, read_wall_time_us, - used_decode_path: false, + used_conversion: source_format != CaptureFrameFormat::I420, }); Ok(()) } fn convert_pixel_buffer( pixel_buffer: &CVPixelBuffer, - ) -> Result<(I420Buffer, CapturePixelFormat), AvFoundationError> { + ) -> Result<(I420Buffer, CaptureFrameFormat), AvFoundationError> { let lock_flags = CVPixelBufferLockFlags::ReadOnly; let lock_result = unsafe { CVPixelBufferLockBaseAddress(pixel_buffer, lock_flags) }; if lock_result != kCVReturnSuccess { @@ -1072,7 +1096,7 @@ mod macos { fn convert_locked_pixel_buffer( pixel_buffer: &CVPixelBuffer, - ) -> Result<(I420Buffer, CapturePixelFormat), AvFoundationError> { + ) -> Result<(I420Buffer, CaptureFrameFormat), AvFoundationError> { let width = u32::try_from(CVPixelBufferGetWidth(pixel_buffer)) .map_err(|_| AvFoundationError::InvalidFrame("width is out of range"))?; let height = u32::try_from(CVPixelBufferGetHeight(pixel_buffer)) @@ -1085,29 +1109,29 @@ mod macos { || format == kCVPixelFormatType_420YpCbCr8BiPlanarFullRange => { convert_nv12(pixel_buffer, width, height) - .map(|buffer| (buffer, CapturePixelFormat::Nv12)) + .map(|buffer| (buffer, CaptureFrameFormat::Nv12)) } format if format == kCVPixelFormatType_32BGRA => { convert_bgra(pixel_buffer, width, height) - .map(|buffer| (buffer, CapturePixelFormat::Bgra)) + .map(|buffer| (buffer, CaptureFrameFormat::Bgra)) } format if format == kCVPixelFormatType_420YpCbCr8Planar || format == kCVPixelFormatType_420YpCbCr8PlanarFullRange => { convert_i420(pixel_buffer, width, height) - 
.map(|buffer| (buffer, CapturePixelFormat::I420)) + .map(|buffer| (buffer, CaptureFrameFormat::I420)) } format if format == kCVPixelFormatType_422YpCbCr8 => { convert_uyvy(pixel_buffer, width, height) - .map(|buffer| (buffer, CapturePixelFormat::Uyvy)) + .map(|buffer| (buffer, CaptureFrameFormat::Uyvy)) } format if format == kCVPixelFormatType_422YpCbCr8_yuvs || format == kCVPixelFormatType_422YpCbCr8FullRange => { convert_yuy2(pixel_buffer, width, height) - .map(|buffer| (buffer, CapturePixelFormat::Yuyv)) + .map(|buffer| (buffer, CaptureFrameFormat::Yuyv)) } other => Err(AvFoundationError::UnsupportedCoreVideoPixelFormat(other)), } diff --git a/livekit-capture/src/sources/argus.rs b/livekit-capture/src/sources/argus.rs index ae294bb11..18df839e9 100644 --- a/livekit-capture/src/sources/argus.rs +++ b/livekit-capture/src/sources/argus.rs @@ -16,8 +16,12 @@ use thiserror::Error; +#[cfg(livekit_capture_argus)] +use crate::device::{CaptureBackend, CaptureDeviceSelector}; use crate::{ - device::{CaptureFormat, CapturePixelFormat, CaptureResolution}, + device::{ + CaptureDeviceInfo, CaptureFormat, CaptureFrameFormat, CapturePath, CaptureResolution, + }, dmabuf::DmaBufFrame, }; @@ -70,7 +74,7 @@ impl ArgusCaptureOptions { pub const fn new(sensor_index: u32, resolution: CaptureResolution, frame_rate: u32) -> Self { Self { sensor_index, - format: CaptureFormat::new(resolution, frame_rate, CapturePixelFormat::Nv12), + format: CaptureFormat::new(resolution, frame_rate, CaptureFrameFormat::Nv12), attach_sensor_timestamp: false, attach_frame_id: false, } @@ -91,7 +95,7 @@ pub enum ArgusError { Unsupported, /// Argus only publishes NV12 DMA-BUF frames in this backend. #[error("libargus capture only supports NV12 DMA-BUF frames, got {0:?}")] - UnsupportedPixelFormat(CapturePixelFormat), + UnsupportedFrameFormat(CaptureFrameFormat), /// The requested format contains an invalid value. 
#[error("invalid Argus capture option: {0}")] InvalidOption(&'static str), @@ -148,15 +152,21 @@ impl ArgusCaptureSession { Self::open(options) } - /// Acquires the next captured frame as an NV12 DMA-BUF. + /// Captures the next frame as an NV12 DMA-BUF. /// /// The returned DMA-BUF file descriptor is owned by the Argus session's /// internal buffer ring. It remains valid until the session is dropped, but /// callers should publish frames promptly so the ring can be reused. - pub fn acquire_frame(&mut self) -> Result { + pub fn capture_frame(&mut self) -> Result { self.acquire_frame_inner() } + /// Acquires the next captured frame as an NV12 DMA-BUF. + #[deprecated(note = "use capture_frame")] + pub fn acquire_frame(&mut self) -> Result { + self.capture_frame() + } + /// Releases the currently held Argus frame, when one is held by the shim. pub fn release_frame(&mut self) { self.release_frame_inner(); @@ -177,6 +187,16 @@ impl ArgusCaptureSession { self.options.format } + /// Returns the configured capture options. + pub fn options(&self) -> &ArgusCaptureOptions { + &self.options + } + + /// Returns the capture path produced by this session. 
+ pub fn capture_path(&self) -> CapturePath { + CapturePath::DmaBuf + } + #[cfg(livekit_capture_argus)] fn open(options: ArgusCaptureOptions) -> Result { let sensor_index = c_int_from_u32(options.sensor_index, "sensor_index")?; @@ -282,8 +302,8 @@ impl Drop for ArgusCaptureSession { } fn validate_options(options: &ArgusCaptureOptions) -> Result<(), ArgusError> { - if options.format.pixel_format != CapturePixelFormat::Nv12 { - return Err(ArgusError::UnsupportedPixelFormat(options.format.pixel_format)); + if options.format.frame_format != CaptureFrameFormat::Nv12 { + return Err(ArgusError::UnsupportedFrameFormat(options.format.frame_format)); } if options.format.resolution.width == 0 { return Err(ArgusError::InvalidOption("width must be non-zero")); @@ -297,6 +317,28 @@ fn validate_options(options: &ArgusCaptureOptions) -> Result<(), ArgusError> { Ok(()) } +/// Returns Jetson Argus capture devices. +pub fn devices() -> Result, ArgusError> { + #[cfg(livekit_capture_argus)] + { + return Ok(vec![CaptureDeviceInfo { + backend: CaptureBackend::LibArgus, + id: "0".to_string(), + selector: CaptureDeviceSelector::Index(0), + name: "Jetson Argus sensor 0".to_string(), + model_id: None, + manufacturer: Some("NVIDIA".to_string()), + paths: vec![CapturePath::DmaBuf], + formats: vec![ArgusCaptureOptions::default().format], + formats_complete: false, + }]); + } + #[cfg(not(livekit_capture_argus))] + { + Err(ArgusError::Unsupported) + } +} + #[cfg(livekit_capture_argus)] fn c_int_from_u32(value: u32, field: &'static str) -> Result { c_int::try_from(value).map_err(|_| ArgusError::OptionOutOfRange(field)) @@ -369,9 +411,9 @@ mod tests { #[test] fn validates_nv12_only() { let mut options = ArgusCaptureOptions::default(); - options.format.pixel_format = CapturePixelFormat::I420; + options.format.frame_format = CaptureFrameFormat::I420; let err = ArgusCaptureSession::new(options).expect_err("I420 must be rejected"); - assert_eq!(err, 
ArgusError::UnsupportedPixelFormat(CapturePixelFormat::I420)); + assert_eq!(err, ArgusError::UnsupportedFrameFormat(CaptureFrameFormat::I420)); } #[test] diff --git a/livekit-capture/src/sources/mod.rs b/livekit-capture/src/sources/mod.rs index 3b33ec675..9441ccb3f 100644 --- a/livekit-capture/src/sources/mod.rs +++ b/livekit-capture/src/sources/mod.rs @@ -14,6 +14,12 @@ //! Optional capture sources that feed the shared capture paths. +#[cfg(feature = "avfoundation")] +pub mod avfoundation { + //! macOS AVFoundation decoded-frame capture. + + pub use crate::platform::avfoundation::*; +} #[cfg(feature = "libargus")] pub mod argus; #[cfg(feature = "gstreamer")] diff --git a/livekit-capture/src/sources/v4l.rs b/livekit-capture/src/sources/v4l.rs index ae8634491..b6311842a 100644 --- a/livekit-capture/src/sources/v4l.rs +++ b/livekit-capture/src/sources/v4l.rs @@ -32,9 +32,11 @@ use nokhwa::{ }; use thiserror::Error; +#[cfg(target_os = "linux")] +use crate::device::CaptureBackend; use crate::device::{ CaptureDeviceInfo, CaptureDeviceSelector, CaptureFormat, CaptureFormatRequest, - CapturePixelFormat, CaptureResolution, + CaptureFrameFormat, CapturePath, CaptureResolution, }; #[cfg(target_os = "linux")] use crate::metadata::FrameMetadata; @@ -49,8 +51,8 @@ pub struct V4lCaptureOptions { pub device: CaptureDeviceSelector, /// Requested format policy. pub format: CaptureFormatRequest, - /// Ordered source pixel formats to try. - pub pixel_formats: Vec, + /// Ordered source frame formats to try. + pub frame_formats: Vec, /// Attach a wall-clock capture timestamp as [`crate::FrameMetadata::user_timestamp`]. pub attach_capture_timestamp: bool, /// Attach a monotonically increasing frame id as [`crate::FrameMetadata::frame_id`]. 
@@ -69,9 +71,9 @@ impl V4lCaptureOptions { format: CaptureFormatRequest::Exact(CaptureFormat::new( resolution, frame_rate, - CapturePixelFormat::Yuyv, + CaptureFrameFormat::Yuyv, )), - pixel_formats: default_pixel_formats(), + frame_formats: default_frame_formats(), attach_capture_timestamp: false, attach_frame_id: false, } @@ -90,9 +92,9 @@ pub enum V4lError { /// V4L capture is only available on Linux. #[error("V4L capture is not supported on this platform")] UnsupportedPlatform, - /// The requested pixel format is not supported by this backend. - #[error("V4L capture does not support pixel format {0:?}")] - UnsupportedPixelFormat(CapturePixelFormat), + /// The requested frame format is not supported by this backend. + #[error("V4L capture does not support frame format {0:?}")] + UnsupportedFrameFormat(CaptureFrameFormat), /// The requested option is invalid. #[error("invalid V4L capture option: {0}")] InvalidOption(&'static str), @@ -118,15 +120,17 @@ pub enum V4lError { pub struct V4lFrame { /// Decoded I420 frame suitable for [`crate::VideoCaptureTrack::capture_frame`]. pub frame: VideoFrame, - /// Source pixel format delivered by the camera backend. - pub source_pixel_format: CapturePixelFormat, + /// Source frame format delivered by the camera backend. + pub source_format: CaptureFrameFormat, /// Backend-provided capture timestamp, when available. pub backend_capture_timestamp: Option, /// Wall-clock timestamp selected for metadata and timing correlation. pub capture_wall_time_us: u64, /// Wall-clock timestamp recorded after the frame was read from the camera backend. pub read_wall_time_us: u64, - /// Whether compressed image decoding was needed. + /// Whether conversion from the source format to I420 was needed. + pub used_conversion: bool, + /// Whether compressed image decoding was needed before conversion. 
pub used_decode_path: bool, } @@ -142,7 +146,6 @@ pub struct V4lCaptureSession { #[cfg(target_os = "linux")] camera: Camera, format: CaptureFormat, - #[cfg(target_os = "linux")] options: V4lCaptureOptions, #[cfg(target_os = "linux")] started_at: Instant, @@ -154,7 +157,6 @@ impl std::fmt::Debug for V4lCaptureSession { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { let mut debug = f.debug_struct("V4lCaptureSession"); debug.field("format", &self.format); - #[cfg(target_os = "linux")] debug.field("options", &self.options); debug.finish() } @@ -177,6 +179,16 @@ impl V4lCaptureSession { self.format } + /// Returns the configured capture options. + pub fn options(&self) -> &V4lCaptureOptions { + &self.options + } + + /// Returns the capture path produced by this session. + pub fn capture_path(&self) -> CapturePath { + CapturePath::Raw + } + #[cfg(target_os = "linux")] fn open(options: V4lCaptureOptions) -> Result { let frame_formats = frame_formats_for_request(&options)?; @@ -228,14 +240,15 @@ impl V4lCaptureSession { height, &mut frame.buffer, )?; - let source_pixel_format = capture_pixel_format_from_nokhwa(buffer.source_frame_format())?; + let source_format = capture_frame_format_from_nokhwa(buffer.source_frame_format())?; Ok(V4lFrame { frame, - source_pixel_format, + source_format, backend_capture_timestamp, capture_wall_time_us, read_wall_time_us, + used_conversion: source_format != CaptureFrameFormat::I420, used_decode_path, }) } @@ -264,13 +277,22 @@ pub fn devices() -> Result, V4lError> { .map_err(nokhwa_error)? 
.into_iter() .map(|info| { - let formats = enumerate_formats(&info).unwrap_or_default(); + let formats = enumerate_formats(&info); + let (formats, formats_complete) = match formats { + Ok(formats) => (formats, true), + Err(_) => (Vec::new(), false), + }; + let id = info.index().as_string(); Ok(CaptureDeviceInfo { - id: info.index().as_string(), + backend: CaptureBackend::V4l2, + id: id.clone(), + selector: CaptureDeviceSelector::Id(id), name: info.human_name(), model_id: Some(info.description().to_string()).filter(|value| !value.is_empty()), manufacturer: None, + paths: vec![CapturePath::Raw], formats, + formats_complete, }) }) .collect() @@ -282,16 +304,22 @@ pub fn devices() -> Result, V4lError> { Err(V4lError::UnsupportedPlatform) } -fn default_pixel_formats() -> Vec { +/// Returns the default ordered V4L source frame formats. +pub fn default_frame_formats() -> Vec { vec![ - CapturePixelFormat::Yuyv, - CapturePixelFormat::Mjpeg, - CapturePixelFormat::Gray, - CapturePixelFormat::Rgb24, - CapturePixelFormat::Nv12, + CaptureFrameFormat::Yuyv, + CaptureFrameFormat::Mjpeg, + CaptureFrameFormat::Gray, + CaptureFrameFormat::Rgb24, + CaptureFrameFormat::Nv12, ] } +/// Returns default V4L source frame formats with `first` preferred. 
+pub fn ordered_frame_formats_with_first(first: CaptureFrameFormat) -> Vec { + ordered_formats_with_first(&default_frame_formats(), first) +} + fn validate_options(options: &V4lCaptureOptions) -> Result<(), V4lError> { match &options.device { CaptureDeviceSelector::Default => {} @@ -305,12 +333,12 @@ fn validate_options(options: &V4lCaptureOptions) -> Result<(), V4lError> { } } - if options.pixel_formats.is_empty() { - return Err(V4lError::InvalidOption("pixel_formats must include at least one format")); + if options.frame_formats.is_empty() { + return Err(V4lError::InvalidOption("frame_formats must include at least one format")); } - for pixel_format in &options.pixel_formats { - if nokhwa_frame_format(*pixel_format).is_none() { - return Err(V4lError::UnsupportedPixelFormat(*pixel_format)); + for frame_format in &options.frame_formats { + if nokhwa_frame_format(*frame_format).is_none() { + return Err(V4lError::UnsupportedFrameFormat(*frame_format)); } } @@ -328,8 +356,8 @@ fn validate_format_request(format: &CaptureFormatRequest) -> Result<(), V4lError if format.frame_rate == 0 { return Err(V4lError::InvalidOption("frame_rate must be non-zero")); } - if nokhwa_frame_format(format.pixel_format).is_none() { - return Err(V4lError::UnsupportedPixelFormat(format.pixel_format)); + if nokhwa_frame_format(format.frame_format).is_none() { + return Err(V4lError::UnsupportedFrameFormat(format.frame_format)); } Ok(()) }; @@ -339,24 +367,24 @@ fn validate_format_request(format: &CaptureFormatRequest) -> Result<(), V4lError CaptureFormatRequest::Exact(format) | CaptureFormatRequest::Closest(format) => { validate_format(format) } - CaptureFormatRequest::HighestFrameRate { resolution, pixel_format } => { + CaptureFormatRequest::HighestFrameRate { resolution, frame_format } => { if let Some(resolution) = resolution { validate_resolution(*resolution)?; } - if let Some(pixel_format) = pixel_format { - if nokhwa_frame_format(*pixel_format).is_none() { - return 
Err(V4lError::UnsupportedPixelFormat(*pixel_format)); + if let Some(frame_format) = frame_format { + if nokhwa_frame_format(*frame_format).is_none() { + return Err(V4lError::UnsupportedFrameFormat(*frame_format)); } } Ok(()) } - CaptureFormatRequest::HighestResolution { frame_rate, pixel_format } => { + CaptureFormatRequest::HighestResolution { frame_rate, frame_format } => { if matches!(frame_rate, Some(0)) { return Err(V4lError::InvalidOption("frame_rate must be non-zero")); } - if let Some(pixel_format) = pixel_format { - if nokhwa_frame_format(*pixel_format).is_none() { - return Err(V4lError::UnsupportedPixelFormat(*pixel_format)); + if let Some(frame_format) = frame_format { + if nokhwa_frame_format(*frame_format).is_none() { + return Err(V4lError::UnsupportedFrameFormat(*frame_format)); } } Ok(()) @@ -389,32 +417,31 @@ fn camera_index(selector: &CaptureDeviceSelector) -> Result Result, V4lError> { let mut formats = match &options.format { CaptureFormatRequest::Exact(format) | CaptureFormatRequest::Closest(format) => { - ordered_formats_with_first(&options.pixel_formats, format.pixel_format) + ordered_formats_with_first(&options.frame_formats, format.frame_format) } - CaptureFormatRequest::HighestFrameRate { pixel_format: Some(pixel_format), .. } - | CaptureFormatRequest::HighestResolution { pixel_format: Some(pixel_format), .. } => { - vec![*pixel_format] + CaptureFormatRequest::HighestFrameRate { frame_format: Some(frame_format), .. } + | CaptureFormatRequest::HighestResolution { frame_format: Some(frame_format), .. } => { + vec![*frame_format] } CaptureFormatRequest::Default - | CaptureFormatRequest::HighestFrameRate { pixel_format: None, .. } - | CaptureFormatRequest::HighestResolution { pixel_format: None, .. } => { - options.pixel_formats.clone() + | CaptureFormatRequest::HighestFrameRate { frame_format: None, .. } + | CaptureFormatRequest::HighestResolution { frame_format: None, .. 
} => { + options.frame_formats.clone() } }; formats.dedup(); formats .into_iter() - .map(|format| nokhwa_frame_format(format).ok_or(V4lError::UnsupportedPixelFormat(format))) + .map(|format| nokhwa_frame_format(format).ok_or(V4lError::UnsupportedFrameFormat(format))) .collect() } -#[cfg(target_os = "linux")] fn ordered_formats_with_first( - pixel_formats: &[CapturePixelFormat], - first: CapturePixelFormat, -) -> Vec { + frame_formats: &[CaptureFrameFormat], + first: CaptureFrameFormat, +) -> Vec { std::iter::once(first) - .chain(pixel_formats.iter().copied().filter(|format| *format != first)) + .chain(frame_formats.iter().copied().filter(|format| *format != first)) .collect() } @@ -494,7 +521,7 @@ fn apply_ordered_format_request( Err(last_error .map(nokhwa_error) - .unwrap_or(V4lError::InvalidOption("no V4L pixel formats were requested"))) + .unwrap_or(V4lError::InvalidOption("no V4L frame formats were requested"))) } #[cfg(target_os = "linux")] @@ -559,8 +586,8 @@ fn nokhwa_camera_format( ) -> Result { let frame_format = match override_format { Some(format) => format, - None => nokhwa_frame_format(format.pixel_format) - .ok_or(V4lError::UnsupportedPixelFormat(format.pixel_format))?, + None => nokhwa_frame_format(format.frame_format) + .ok_or(V4lError::UnsupportedFrameFormat(format.frame_format))?, }; Ok(CameraFormat::new(nokhwa_resolution(format.resolution), frame_format, format.frame_rate)) } @@ -571,28 +598,28 @@ fn nokhwa_resolution(resolution: CaptureResolution) -> Resolution { } #[cfg(target_os = "linux")] -fn nokhwa_frame_format(pixel_format: CapturePixelFormat) -> Option { +fn nokhwa_frame_format(pixel_format: CaptureFrameFormat) -> Option { match pixel_format { - CapturePixelFormat::Nv12 => Some(FrameFormat::NV12), - CapturePixelFormat::Rgb24 => Some(FrameFormat::RAWRGB), - CapturePixelFormat::Bgr24 => Some(FrameFormat::RAWBGR), - CapturePixelFormat::Yuyv => Some(FrameFormat::YUYV), - CapturePixelFormat::Gray => Some(FrameFormat::GRAY), - 
CapturePixelFormat::Mjpeg => Some(FrameFormat::MJPEG), - CapturePixelFormat::I420 | CapturePixelFormat::Bgra | CapturePixelFormat::Uyvy => None, + CaptureFrameFormat::Nv12 => Some(FrameFormat::NV12), + CaptureFrameFormat::Rgb24 => Some(FrameFormat::RAWRGB), + CaptureFrameFormat::Bgr24 => Some(FrameFormat::RAWBGR), + CaptureFrameFormat::Yuyv => Some(FrameFormat::YUYV), + CaptureFrameFormat::Gray => Some(FrameFormat::GRAY), + CaptureFrameFormat::Mjpeg => Some(FrameFormat::MJPEG), + CaptureFrameFormat::I420 | CaptureFrameFormat::Bgra | CaptureFrameFormat::Uyvy => None, } } #[cfg(not(target_os = "linux"))] -fn nokhwa_frame_format(pixel_format: CapturePixelFormat) -> Option<()> { +fn nokhwa_frame_format(pixel_format: CaptureFrameFormat) -> Option<()> { match pixel_format { - CapturePixelFormat::Nv12 - | CapturePixelFormat::Rgb24 - | CapturePixelFormat::Bgr24 - | CapturePixelFormat::Yuyv - | CapturePixelFormat::Gray - | CapturePixelFormat::Mjpeg => Some(()), - CapturePixelFormat::I420 | CapturePixelFormat::Bgra | CapturePixelFormat::Uyvy => None, + CaptureFrameFormat::Nv12 + | CaptureFrameFormat::Rgb24 + | CaptureFrameFormat::Bgr24 + | CaptureFrameFormat::Yuyv + | CaptureFrameFormat::Gray + | CaptureFrameFormat::Mjpeg => Some(()), + CaptureFrameFormat::I420 | CaptureFrameFormat::Bgra | CaptureFrameFormat::Uyvy => None, } } @@ -601,19 +628,19 @@ fn capture_format_from_nokhwa(format: CameraFormat) -> Result Result { +fn capture_frame_format_from_nokhwa(format: FrameFormat) -> Result { match format { - FrameFormat::MJPEG => Ok(CapturePixelFormat::Mjpeg), - FrameFormat::YUYV => Ok(CapturePixelFormat::Yuyv), - FrameFormat::NV12 => Ok(CapturePixelFormat::Nv12), - FrameFormat::GRAY => Ok(CapturePixelFormat::Gray), - FrameFormat::RAWRGB => Ok(CapturePixelFormat::Rgb24), - FrameFormat::RAWBGR => Ok(CapturePixelFormat::Bgr24), + FrameFormat::MJPEG => Ok(CaptureFrameFormat::Mjpeg), + FrameFormat::YUYV => Ok(CaptureFrameFormat::Yuyv), + FrameFormat::NV12 => 
Ok(CaptureFrameFormat::Nv12), + FrameFormat::GRAY => Ok(CaptureFrameFormat::Gray), + FrameFormat::RAWRGB => Ok(CaptureFrameFormat::Rgb24), + FrameFormat::RAWBGR => Ok(CaptureFrameFormat::Bgr24), } } @@ -871,9 +898,9 @@ mod tests { use super::*; #[test] - fn rejects_empty_pixel_format_preferences() { + fn rejects_empty_frame_format_preferences() { let mut options = V4lCaptureOptions::default(); - options.pixel_formats.clear(); + options.frame_formats.clear(); let err = V4lCaptureSession::new(options).expect_err("empty formats must be rejected"); assert!(matches!(err, V4lError::InvalidOption(_))); } @@ -881,9 +908,9 @@ mod tests { #[test] fn rejects_unsupported_i420_source_format() { let mut options = V4lCaptureOptions::default(); - options.pixel_formats = vec![CapturePixelFormat::I420]; + options.frame_formats = vec![CaptureFrameFormat::I420]; let err = V4lCaptureSession::new(options).expect_err("I420 source must be rejected"); - assert!(matches!(err, V4lError::UnsupportedPixelFormat(CapturePixelFormat::I420))); + assert!(matches!(err, V4lError::UnsupportedFrameFormat(CaptureFrameFormat::I420))); } #[test] diff --git a/livekit-capture/src/track.rs b/livekit-capture/src/track.rs index fb80d1364..0e9d45806 100644 --- a/livekit-capture/src/track.rs +++ b/livekit-capture/src/track.rs @@ -26,21 +26,10 @@ use crate::{ error::CaptureError, }; +pub use crate::device::CapturePath; #[cfg(target_os = "linux")] use crate::dmabuf::DmaBufFrame; -/// Capture path used by a source implementation. -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -#[non_exhaustive] -pub enum CapturePath { - /// Decoded CPU or native frame buffers. - FrameBuffer, - /// Linux DMA-BUF backed frames. - DmaBuf, - /// Pre-encoded compressed access units. - Encoded, -} - /// Capture source backed by a LiveKit local video track. 
#[derive(Debug, Clone)] pub struct VideoCaptureTrack { From 26bb266debb60a6d4e9dc39fbe677f3e1380560c Mon Sep 17 00:00:00 2001 From: David Chen Date: Wed, 24 Jun 2026 13:15:29 -0700 Subject: [PATCH 08/24] implement rtsp & tcp ingest path --- .changeset/livekit-capture-preencoded.md | 2 +- Cargo.lock | 237 ++++- Cargo.toml | 5 + examples/local_video/src/subscriber.rs | 16 + examples/preencode_publish/Cargo.toml | 15 + examples/preencode_publish/src/main.rs | 687 ++++++++++++ livekit-capture/Cargo.toml | 6 +- livekit-capture/src/source.rs | 772 ++++++++++++++ livekit-capture/src/sources/gstreamer.rs | 285 ++++- livekit-capture/src/sources/rtsp.rs | 1213 +++++++++++++++++++++- livekit-capture/src/sources/tcp.rs | 156 ++- webrtc-sys/src/video_track.cpp | 3 +- 12 files changed, 3370 insertions(+), 27 deletions(-) create mode 100644 examples/preencode_publish/Cargo.toml create mode 100644 examples/preencode_publish/src/main.rs create mode 100644 livekit-capture/src/source.rs diff --git a/.changeset/livekit-capture-preencoded.md b/.changeset/livekit-capture-preencoded.md index 79b8409d6..c852c1bb4 100644 --- a/.changeset/livekit-capture-preencoded.md +++ b/.changeset/livekit-capture-preencoded.md @@ -5,4 +5,4 @@ "webrtc-sys": patch --- -Add a `livekit-capture` crate with codec-neutral capture types, H264/H265 passthrough support, common encoded ingress helpers, macOS AVFoundation decoded-frame capture, Linux V4L capture, and Jetson libargus capture hooks. The `local_video` examples now open platform camera capture through `livekit-capture` instead of depending on Nokhwa directly. +Add a `livekit-capture` crate with codec-neutral capture types, H264/H265 passthrough support, common encoded ingress helpers, TCP byte-stream encoded ingress, RTSP-over-TCP encoded ingress, GStreamer appsink encoded ingress, macOS AVFoundation decoded-frame capture, Linux V4L capture, and Jetson libargus capture hooks. 
The `local_video` examples now open platform camera capture through `livekit-capture` instead of depending on Nokhwa directly, and a `preencode_publish` example demonstrates publishing H264/H265 Annex-B TCP or RTSP streams as pre-encoded video tracks. diff --git a/Cargo.lock b/Cargo.lock index 34b434b0b..74d0dcc27 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -535,6 +535,12 @@ version = "1.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1505bd5d3d116872e7271a6d4e16d81d0c8570876c8de68093a09ac269d8aac0" +[[package]] +name = "atomic_refcell" +version = "0.1.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "21e4227379beff4205943696e6c3e0cd809bacdf3f0edd6e3dd153e2269571a4" + [[package]] name = "autocfg" version = "1.5.0" @@ -554,7 +560,7 @@ dependencies = [ "log", "num-rational", "num-traits", - "pastey", + "pastey 0.1.1", "rayon", "thiserror 2.0.18", "v_frame", @@ -2529,8 +2535,21 @@ version = "0.21.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0071fe88dba8e40086c8ff9bbb62622999f49628344b1d1bf490a48a29d80f22" dependencies = [ - "glib-sys", - "gobject-sys", + "glib-sys 0.21.5", + "gobject-sys 0.21.5", + "libc", + "system-deps", + "windows-sys 0.61.2", +] + +[[package]] +name = "gio-sys" +version = "0.22.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "64729ba2772c080448f9f966dba8f4456beeb100d8c28a865ef8a0f2ef4987e1" +dependencies = [ + "glib-sys 0.22.6", + "gobject-sys 0.22.6", "libc", "system-deps", "windows-sys 0.61.2", @@ -2559,10 +2578,31 @@ dependencies = [ "futures-executor", "futures-task", "futures-util", - "gio-sys", - "glib-macros", - "glib-sys", - "gobject-sys", + "gio-sys 0.21.5", + "glib-macros 0.21.5", + "glib-sys 0.21.5", + "gobject-sys 0.21.5", + "libc", + "memchr", + "smallvec", +] + +[[package]] +name = "glib" +version = "0.22.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"c207e04e51605dcf7b2924c41591b3a10e1438eaac5bcf448fb91f325381104a" +dependencies = [ + "bitflags 2.11.0", + "futures-channel", + "futures-core", + "futures-executor", + "futures-task", + "futures-util", + "gio-sys 0.22.0", + "glib-macros 0.22.6", + "glib-sys 0.22.6", + "gobject-sys 0.22.6", "libc", "memchr", "smallvec", @@ -2581,6 +2621,18 @@ dependencies = [ "syn 2.0.117", ] +[[package]] +name = "glib-macros" +version = "0.22.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "506d23499707c7142898429757e8d9a3871d965239a2cb66dfa05052be6d6f19" +dependencies = [ + "heck 0.5.0", + "proc-macro2", + "quote", + "syn 2.0.117", +] + [[package]] name = "glib-sys" version = "0.21.5" @@ -2591,6 +2643,16 @@ dependencies = [ "system-deps", ] +[[package]] +name = "glib-sys" +version = "0.22.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5f7fbac234ed5bc2a28359b7bde8e1b9cdf1441cc2d7f068e4824672d7db9445" +dependencies = [ + "libc", + "system-deps", +] + [[package]] name = "glob" version = "0.3.3" @@ -2636,7 +2698,18 @@ version = "0.21.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2dca35da0d19a18f4575f3cb99fe1c9e029a2941af5662f326f738a21edaf294" dependencies = [ - "glib-sys", + "glib-sys 0.21.5", + "libc", + "system-deps", +] + +[[package]] +name = "gobject-sys" +version = "0.22.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22a861859b887a79cf461359c192c97a57d8fb0229dd291232e57aa11f6fa72c" +dependencies = [ + "glib-sys 0.22.6", "libc", "system-deps", ] @@ -2717,6 +2790,99 @@ dependencies = [ "bitflags 2.11.0", ] +[[package]] +name = "gstreamer" +version = "0.25.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "28ca0c594cac4e86f5444aaa767c7bb810340c0710667a6467d3ead248e35e84" +dependencies = [ + "cfg-if", + "futures-channel", + "futures-core", + "futures-util", + "glib 0.22.7", + "gstreamer-sys", + "itertools 0.14.0", + 
"kstring", + "libc", + "muldiv", + "num-integer", + "num-rational", + "option-operations", + "pastey 0.2.3", + "pin-project-lite", + "smallvec", + "thiserror 2.0.18", +] + +[[package]] +name = "gstreamer-app" +version = "0.25.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "97f8ae9238c2352398dcc084de28df3f7099af216ac6c160b52318d23f25c010" +dependencies = [ + "futures-core", + "futures-sink", + "glib 0.22.7", + "gstreamer", + "gstreamer-app-sys", + "gstreamer-base", + "libc", +] + +[[package]] +name = "gstreamer-app-sys" +version = "0.25.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a74a8211e5d7df2f45b612c284ddf56b92bdf4e879e8ed72e7c46dd0842e158" +dependencies = [ + "glib-sys 0.22.6", + "gstreamer-base-sys", + "gstreamer-sys", + "libc", + "system-deps", +] + +[[package]] +name = "gstreamer-base" +version = "0.25.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c279df2918be97fb9570e589a32ade33598f643b0c4f0c92c17f06be6940574e" +dependencies = [ + "atomic_refcell", + "cfg-if", + "glib 0.22.7", + "gstreamer", + "gstreamer-base-sys", + "libc", +] + +[[package]] +name = "gstreamer-base-sys" +version = "0.25.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6569606feeb89cfcf95a6476a64a0f0aec83fadcef0e91c24e576f7851ceac3a" +dependencies = [ + "glib-sys 0.22.6", + "gobject-sys 0.22.6", + "gstreamer-sys", + "libc", + "system-deps", +] + +[[package]] +name = "gstreamer-sys" +version = "0.25.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "533fa8d28fc830eafccbcfcfddb390563ea5d3a351af2c3aab99e197e5f5b1ba" +dependencies = [ + "cfg-if", + "glib-sys 0.22.6", + "gobject-sys 0.22.6", + "libc", + "system-deps", +] + [[package]] name = "h2" version = "0.3.27" @@ -3612,7 +3778,7 @@ version = "0.3.37" dependencies = [ "cxx", "env_logger 0.11.10", - "glib", + "glib 0.21.5", "jni 0.21.1", "js-sys", "lazy_static", @@ -3800,11 
+3966,15 @@ dependencies = [ name = "livekit-capture" version = "0.1.0" dependencies = [ + "base64 0.22.1", "bytes", "cc", "dispatch2", + "gstreamer", + "gstreamer-app", "image", "livekit", + "md-5", "nokhwa", "objc2 0.6.4", "objc2-av-foundation", @@ -4045,6 +4215,16 @@ dependencies = [ "rayon", ] +[[package]] +name = "md-5" +version = "0.10.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d89e7ee0cfbedfc4da3340218492196241d89eefb6dab27de5df917a6d2e78cf" +dependencies = [ + "cfg-if", + "digest", +] + [[package]] name = "memchr" version = "2.8.0" @@ -4148,6 +4328,12 @@ dependencies = [ "pxfm", ] +[[package]] +name = "muldiv" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "956787520e75e9bd233246045d19f42fb73242759cc57fba9611d940ae96d4b0" + [[package]] name = "multimap" version = "0.10.1" @@ -5104,6 +5290,15 @@ dependencies = [ "vcpkg", ] +[[package]] +name = "option-operations" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aca39cf52b03268400c16eeb9b56382ea3c3353409309b63f5c8f0b1faf42754" +dependencies = [ + "pastey 0.2.3", +] + [[package]] name = "orbclient" version = "0.3.51" @@ -5220,6 +5415,12 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "35fb2e5f958ec131621fdd531e9fc186ed768cbe395337403ae56c17a74c68ec" +[[package]] +name = "pastey" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2ee67f1008b1ba2321834326597b8e186293b049a023cdef258527550b9935b4" + [[package]] name = "pbjson" version = "0.6.0" @@ -5487,6 +5688,20 @@ dependencies = [ "zerocopy", ] +[[package]] +name = "preencode_publish" +version = "0.1.0" +dependencies = [ + "anyhow", + "clap", + "env_logger 0.11.10", + "livekit", + "livekit-api", + "livekit-capture", + "log", + "tokio", +] + [[package]] name = "presser" version = "0.3.1" @@ -7471,7 +7686,7 @@ dependencies = [ 
"num-complex", "num-integer", "num-traits", - "pastey", + "pastey 0.1.1", "rustfft", "smallvec", "tract-data", @@ -7534,7 +7749,7 @@ dependencies = [ "liquid-derive", "log", "num-traits", - "pastey", + "pastey 0.1.1", "scan_fmt", "smallvec", "time", diff --git a/Cargo.toml b/Cargo.toml index c60cc7050..592b75564 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -30,6 +30,7 @@ members = [ "examples/local_video", "examples/mobile", "examples/play_from_disk", + "examples/preencode_publish", "examples/rpc", "examples/save_to_disk", "examples/screensharing", @@ -61,6 +62,7 @@ webrtc-sys-build = { version = "0.3.18", path = "webrtc-sys/build" } yuv-sys = { version = "0.3.14", path = "yuv-sys" } anyhow = "1.0" +base64 = "0.22" bytes = "1.10" clap = "4.5" console-subscriber = "0.1" @@ -69,8 +71,11 @@ from_variants = "1.0.2" futures = "0.3" futures-core = "0.3" futures-util = { version = "0.3", default-features = false } +gstreamer = "0.25.2" +gstreamer-app = "0.25.2" lazy_static = "1.4" log = "0.4" +md-5 = "0.10" parking_lot = "0.12" prost = "0.14" prost-build = "0.14" diff --git a/examples/local_video/src/subscriber.rs b/examples/local_video/src/subscriber.rs index 54f4fff5e..5fd12e628 100644 --- a/examples/local_video/src/subscriber.rs +++ b/examples/local_video/src/subscriber.rs @@ -693,6 +693,22 @@ fn log_video_decode_health(stats: &[livekit::webrtc::stats::RtcStats]) { inbound.inbound.total_decode_time, inbound.inbound.decoder_implementation, ); + info!( + "RTP receive health: packets={}, lost={}, discarded={}, jitter={:.1}ms, nacks={}, plis={}, firs={}, rtx_packets={}, rtx_bytes={}, freezes={} ({:.3}s), pauses={} ({:.3}s)", + inbound.received.packets_received, + inbound.received.packets_lost, + inbound.inbound.packets_discarded, + inbound.received.jitter * 1000.0, + inbound.inbound.nack_count, + inbound.inbound.pli_count, + inbound.inbound.fir_count, + inbound.inbound.retransmitted_packets_received, + inbound.inbound.retransmitted_bytes_received, + 
inbound.inbound.freeze_count, + inbound.inbound.total_freeze_duration, + inbound.inbound.pause_count, + inbound.inbound.total_pause_duration, + ); if inbound.inbound.frames_received > 0 && inbound.inbound.frames_decoded == 0 { log::warn!( diff --git a/examples/preencode_publish/Cargo.toml b/examples/preencode_publish/Cargo.toml new file mode 100644 index 000000000..7ad348e09 --- /dev/null +++ b/examples/preencode_publish/Cargo.toml @@ -0,0 +1,15 @@ +[package] +name = "preencode_publish" +version = "0.1.0" +edition.workspace = true +publish = false + +[dependencies] +anyhow = { workspace = true } +clap = { workspace = true, features = ["derive", "env"] } +env_logger = { workspace = true } +livekit = { workspace = true, features = ["rustls-tls-native-roots"] } +livekit-api = { workspace = true, features = ["rustls-tls-native-roots"] } +livekit-capture = { workspace = true, features = ["rtsp", "tcp-source"] } +log = { workspace = true } +tokio = { workspace = true, features = ["full"] } diff --git a/examples/preencode_publish/src/main.rs b/examples/preencode_publish/src/main.rs new file mode 100644 index 000000000..61da68bc1 --- /dev/null +++ b/examples/preencode_publish/src/main.rs @@ -0,0 +1,687 @@ +use std::{ + net::{Shutdown, TcpStream}, + sync::{ + atomic::{AtomicBool, Ordering}, + Arc, + }, + time::{Duration, Instant, SystemTime, UNIX_EPOCH}, +}; + +use anyhow::{bail, Context, Result}; +use clap::{Parser, ValueEnum}; +use livekit::{prelude::*, webrtc::video_source::VideoResolution}; +use livekit_api::access_token; +use livekit_capture::{ + encoded::h26x::annex_b_nal_ranges, + sources::{ + rtsp::{RtspEncodedSource, RtspSourceOptions}, + tcp::{ByteStreamSourceConfig, TcpEncodedSource}, + }, + CaptureError, EncodedAccessUnitSource, EncodedFrameType, EncodedVideoCodec, EncodedWireFormat, + OwnedEncodedAccessUnit, VideoCaptureTrack, +}; + +const DIAGNOSTIC_REPORT_INTERVAL: Duration = Duration::from_secs(1); +const SOURCE_STALL_THRESHOLD: Duration = 
Duration::from_millis(250); +const BURST_WALL_DELTA_THRESHOLD: Duration = Duration::from_millis(5); +const KEYFRAME_GAP_THRESHOLD: Duration = Duration::from_secs(5); + +/// Publish a pre-encoded video stream into a LiveKit room. +#[derive(Parser, Debug)] +#[command(author, version, about, long_about = None)] +struct Args { + /// Encoded stream source. + #[arg(long, value_enum, default_value_t = SourceKind::Tcp)] + source: SourceKind, + + /// Encoded video codec. Required with --source tcp; optional validation with --source rtsp. + #[arg(long, value_enum)] + codec: Option, + + /// TCP server address as host:port. Required with --source tcp. + #[arg(long)] + host: Option, + + /// RTSP URL. Required with --source rtsp. + #[arg(long)] + rtsp_url: Option, + + /// LiveKit server URL. + #[arg(long, env = "LIVEKIT_URL")] + url: String, + + /// LiveKit API key. + #[arg(long, env = "LIVEKIT_API_KEY")] + api_key: String, + + /// LiveKit API secret. + #[arg(long, env = "LIVEKIT_API_SECRET")] + api_secret: String, + + /// Room name to join. + #[arg(long)] + room_name: String, + + /// Participant identity to publish as. + #[arg(long)] + identity: String, + + /// Encoded frame width in pixels. + #[arg(long, default_value_t = 1920)] + width: u32, + + /// Encoded frame height in pixels. + #[arg(long, default_value_t = 1080)] + height: u32, + + /// Frame rate used to timestamp TCP Annex-B access units. + #[arg(long, default_value_t = 30)] + fps: u32, + + /// Log access-unit timing, keyframe, and H26x NAL diagnostics. 
+ #[arg(long)] + diagnostics: bool, +} + +#[derive(Debug, Clone, Copy, ValueEnum)] +enum SourceKind { + Tcp, + Rtsp, +} + +#[derive(Debug, Clone, Copy, ValueEnum)] +enum CodecArg { + H264, + H265, +} + +impl CodecArg { + fn encoded_codec(self) -> EncodedVideoCodec { + match self { + Self::H264 => EncodedVideoCodec::H264, + Self::H265 => EncodedVideoCodec::H265, + } + } + + fn wire_format(self) -> EncodedWireFormat { + match self { + Self::H264 => EncodedWireFormat::H264AnnexB, + Self::H265 => EncodedWireFormat::H265AnnexB, + } + } +} + +#[tokio::main] +async fn main() -> Result<()> { + env_logger::init(); + run(Args::parse()).await +} + +async fn run(args: Args) -> Result<()> { + validate_dimensions(args.width, args.height)?; + + match args.source { + SourceKind::Tcp => { + let frame_interval_us = frame_interval_us(args.fps)?; + run_tcp_source(args, frame_interval_us).await + } + SourceKind::Rtsp => run_rtsp_source(args).await, + } +} + +async fn run_tcp_source(args: Args, frame_interval_us: i64) -> Result<()> { + let codec_arg = args.codec.context("--codec is required with --source tcp")?; + let codec = codec_arg.encoded_codec(); + let host = args.host.clone().context("--host is required with --source tcp")?; + let config = ByteStreamSourceConfig::new( + codec_arg.wire_format(), + current_time_us(), + frame_interval_us, + args.width, + args.height, + ); + + log::info!("Connecting to TCP encoded stream at {host}"); + let stream = TcpStream::connect(&host) + .with_context(|| format!("failed to connect to TCP source at {host}"))?; + let shutdown_stream = stream.try_clone().context("failed to clone TCP stream")?; + let source = TcpEncodedSource::from_tcp_stream(stream, config)?; + + publish_encoded_source(args, codec, "TCP", source, shutdown_stream, Some(frame_interval_us)) + .await +} + +async fn run_rtsp_source(args: Args) -> Result<()> { + let rtsp_url = args.rtsp_url.clone().context("--rtsp-url is required with --source rtsp")?; + let mut options = + 
RtspSourceOptions::new(args.width, args.height).with_start_timestamp_us(current_time_us()); + if let Some(codec) = args.codec { + options = options.with_expected_codec(codec.encoded_codec()); + } + + log::info!("Connecting to RTSP encoded stream at {rtsp_url}"); + let source = RtspEncodedSource::connect(&rtsp_url, options) + .with_context(|| format!("failed to connect to RTSP source at {rtsp_url}"))?; + let shutdown_stream = source.try_clone_stream().context("failed to clone RTSP TCP stream")?; + let codec = source.session_info().codec; + log::info!( + "RTSP setup selected {:?} payload type {} on interleaved channel {}", + codec, + source.session_info().payload_type, + source.session_info().video_channel + ); + + publish_encoded_source(args, codec, "RTSP", source, shutdown_stream, None).await +} + +async fn publish_encoded_source( + args: Args, + codec: EncodedVideoCodec, + source_label: &'static str, + source: S, + shutdown_stream: TcpStream, + expected_frame_interval_us: Option, +) -> Result<()> +where + S: EncodedAccessUnitSource + Send + 'static, +{ + let diagnostics_enabled = args.diagnostics; + let token = access_token::AccessToken::with_api_key(&args.api_key, &args.api_secret) + .with_identity(&args.identity) + .with_name(&args.identity) + .with_grants(access_token::VideoGrants { + room_join: true, + room: args.room_name.clone(), + can_publish: true, + can_subscribe: false, + ..Default::default() + }) + .to_jwt()?; + + log::info!("Connecting to LiveKit room '{}' as '{}'", args.room_name, args.identity); + let (room, _) = Room::connect(&args.url, &token, RoomOptions::default()) + .await + .context("failed to connect to LiveKit room")?; + + let capture_track = VideoCaptureTrack::new( + "preencoded", + VideoResolution { width: args.width, height: args.height }, + false, + ); + let mut publish_options = VideoCaptureTrack::encoded_publish_options(codec); + publish_options.source = TrackSource::Camera; + + room.local_participant() + 
.publish_track(LocalTrack::Video(capture_track.track()), publish_options) + .await + .context("failed to publish pre-encoded video track")?; + log::info!( + "Published pre-encoded {:?} track at {}x{}; forwarding {} access units", + codec, + args.width, + args.height, + source_label + ); + + let stop = Arc::new(AtomicBool::new(false)); + let signal_task = tokio::spawn({ + let stop = stop.clone(); + async move { + let _ = tokio::signal::ctrl_c().await; + stop.store(true, Ordering::Release); + let _ = shutdown_stream.shutdown(Shutdown::Both); + } + }); + + let capture_task = tokio::task::spawn_blocking({ + let stop = stop.clone(); + move || { + let diagnostics = AccessUnitDiagnostics::new( + diagnostics_enabled, + source_label, + expected_frame_interval_us, + ); + forward_access_units(source, capture_track, stop, diagnostics) + } + }); + let captured = capture_task.await.context("capture task failed to join")??; + signal_task.abort(); + room.close().await.context("failed to close LiveKit room")?; + + log::info!("Stopped after publishing {captured} encoded access units"); + Ok(()) +} + +fn forward_access_units( + mut source: S, + track: VideoCaptureTrack, + stop: Arc, + mut diagnostics: AccessUnitDiagnostics, +) -> Result +where + S: EncodedAccessUnitSource, +{ + let mut captured = 0; + let mut dropped = 0; + while !stop.load(Ordering::Acquire) { + let read_started = Instant::now(); + let access_unit = match source.next_access_unit() { + Ok(Some(access_unit)) => access_unit, + Ok(None) => break, + Err(err) if stop.load(Ordering::Acquire) => { + log::debug!("encoded source stopped after shutdown: {err}"); + break; + } + Err(err) => return Err(err.into()), + }; + diagnostics.observe_source_wait(read_started.elapsed()); + diagnostics.observe_access_unit(&access_unit); + + match track.capture_encoded(&access_unit.as_access_unit()) { + Ok(()) => {} + Err(CaptureError::CaptureFailed) => { + dropped += 1; + if dropped == 1 || dropped % 300 == 0 { + log::info!("Dropped 
{dropped} encoded access units before capture"); + } + continue; + } + Err(err) => return Err(err.into()), + } + captured += 1; + if captured % 300 == 0 { + log::info!("Published {captured} encoded access units"); + } + } + diagnostics.finish(); + + Ok(captured) +} + +#[derive(Debug)] +struct AccessUnitDiagnostics { + enabled: bool, + source_label: &'static str, + expected_frame_interval_us: Option, + last_report: Instant, + last_wall_time: Option, + last_timestamp_us: Option, + last_keyframe_wall_time: Option, + last_keyframe_warning: Option, + total_frames: u64, + total_keyframes: u64, + report_frames: u64, + report_keyframes: u64, + report_bytes: u64, + report_max_bytes: usize, + report_max_source_wait: Duration, + report_max_wall_gap: Duration, + report_max_timestamp_gap_us: i64, + report_stalls: u64, + report_bursts: u64, + report_missing_parameter_keyframes: u64, +} + +impl AccessUnitDiagnostics { + fn new( + enabled: bool, + source_label: &'static str, + expected_frame_interval_us: Option, + ) -> Self { + let now = Instant::now(); + if enabled { + match expected_frame_interval_us { + Some(interval_us) => log::info!( + "{source_label} diagnostics enabled; expected frame interval {:.2}ms", + interval_us as f64 / 1000.0 + ), + None => log::info!("{source_label} diagnostics enabled"), + } + } + + Self { + enabled, + source_label, + expected_frame_interval_us, + last_report: now, + last_wall_time: None, + last_timestamp_us: None, + last_keyframe_wall_time: None, + last_keyframe_warning: None, + total_frames: 0, + total_keyframes: 0, + report_frames: 0, + report_keyframes: 0, + report_bytes: 0, + report_max_bytes: 0, + report_max_source_wait: Duration::ZERO, + report_max_wall_gap: Duration::ZERO, + report_max_timestamp_gap_us: 0, + report_stalls: 0, + report_bursts: 0, + report_missing_parameter_keyframes: 0, + } + } + + fn observe_source_wait(&mut self, wait: Duration) { + if !self.enabled { + return; + } + + self.report_max_source_wait = 
self.report_max_source_wait.max(wait); + if wait > SOURCE_STALL_THRESHOLD { + self.report_stalls += 1; + log::warn!( + "{} source wait {:.1}ms before next access unit", + self.source_label, + wait.as_secs_f64() * 1000.0 + ); + } + } + + fn observe_access_unit(&mut self, access_unit: &OwnedEncodedAccessUnit) { + if !self.enabled { + return; + } + + let now = Instant::now(); + let payload = access_unit.payload.as_ref(); + let payload_len = payload.len(); + let nal_summary = NalSummary::from_annex_b(access_unit.codec, payload); + let is_keyframe = access_unit.frame_type == EncodedFrameType::Key; + let timestamp_gap_us = + self.last_timestamp_us.map(|last| access_unit.timestamp_us.saturating_sub(last)); + + self.total_frames += 1; + self.report_frames += 1; + self.report_bytes = self.report_bytes.saturating_add(payload_len as u64); + self.report_max_bytes = self.report_max_bytes.max(payload_len); + if is_keyframe { + self.total_keyframes += 1; + self.report_keyframes += 1; + self.last_keyframe_wall_time = Some(now); + self.last_keyframe_warning = None; + } + + if let Some(last_wall_time) = self.last_wall_time { + let wall_gap = now.saturating_duration_since(last_wall_time); + self.report_max_wall_gap = self.report_max_wall_gap.max(wall_gap); + if wall_gap > SOURCE_STALL_THRESHOLD { + self.report_stalls += 1; + log::warn!( + "{} publish gap {:.1}ms before frame {}", + self.source_label, + wall_gap.as_secs_f64() * 1000.0, + self.total_frames + ); + } + if wall_gap < BURST_WALL_DELTA_THRESHOLD { + if let Some(timestamp_gap_us) = timestamp_gap_us { + if timestamp_gap_us > BURST_WALL_DELTA_THRESHOLD.as_micros() as i64 { + self.report_bursts += 1; + } + } + } + } + + if let Some(timestamp_gap_us) = timestamp_gap_us { + self.report_max_timestamp_gap_us = + self.report_max_timestamp_gap_us.max(timestamp_gap_us); + self.observe_timestamp_gap(timestamp_gap_us); + } + + if is_keyframe { + if nal_summary.missing_recovery_parameter_set() { + self.report_missing_parameter_keyframes 
+= 1; + log::warn!( + "{} keyframe {} missing recovery parameter sets: {}", + self.source_label, + self.total_frames, + nal_summary.describe(access_unit.codec) + ); + } else { + log::info!( + "{} keyframe {} ts={} size={} {}", + self.source_label, + self.total_frames, + access_unit.timestamp_us, + payload_len, + nal_summary.describe(access_unit.codec) + ); + } + } else if nal_summary.contains_key_picture { + log::warn!( + "{} access unit {} contains a key picture but is marked delta: {}", + self.source_label, + self.total_frames, + nal_summary.describe(access_unit.codec) + ); + } + + self.warn_if_keyframe_gap(now); + self.last_wall_time = Some(now); + self.last_timestamp_us = Some(access_unit.timestamp_us); + self.report_if_due(now); + } + + fn observe_timestamp_gap(&mut self, timestamp_gap_us: i64) { + let Some(expected_us) = self.expected_frame_interval_us else { + return; + }; + let tolerance_us = (expected_us / 2).max(10_000); + let deviation_us = (timestamp_gap_us - expected_us).abs(); + if deviation_us > tolerance_us { + log::warn!( + "{} timestamp gap {:.2}ms differs from expected {:.2}ms", + self.source_label, + timestamp_gap_us as f64 / 1000.0, + expected_us as f64 / 1000.0 + ); + } + } + + fn warn_if_keyframe_gap(&mut self, now: Instant) { + let Some(last_keyframe_wall_time) = self.last_keyframe_wall_time else { + if self.total_frames > 1 + && self.last_keyframe_warning.is_none_or(|last| { + now.saturating_duration_since(last) >= KEYFRAME_GAP_THRESHOLD + }) + { + self.last_keyframe_warning = Some(now); + log::warn!( + "{} has not seen a keyframe after {} access units", + self.source_label, + self.total_frames + ); + } + return; + }; + + let keyframe_gap = now.saturating_duration_since(last_keyframe_wall_time); + if keyframe_gap >= KEYFRAME_GAP_THRESHOLD + && self + .last_keyframe_warning + .is_none_or(|last| now.saturating_duration_since(last) >= KEYFRAME_GAP_THRESHOLD) + { + self.last_keyframe_warning = Some(now); + log::warn!( + "{} no keyframe for 
{:.1}s; passthrough cannot satisfy PLI without upstream IDR", + self.source_label, + keyframe_gap.as_secs_f64() + ); + } + } + + fn report_if_due(&mut self, now: Instant) { + let elapsed = now.saturating_duration_since(self.last_report); + if elapsed < DIAGNOSTIC_REPORT_INTERVAL { + return; + } + + let avg_size = + if self.report_frames == 0 { 0 } else { self.report_bytes / self.report_frames }; + let fps = self.report_frames as f64 / elapsed.as_secs_f64(); + log::info!( + "{} diagnostics: frames={} fps={:.1} keys={} avg_size={} max_size={} \ + max_source_wait={:.1}ms max_publish_gap={:.1}ms max_ts_gap={:.1}ms stalls={} \ + bursts={} missing_param_keys={}", + self.source_label, + self.report_frames, + fps, + self.report_keyframes, + avg_size, + self.report_max_bytes, + self.report_max_source_wait.as_secs_f64() * 1000.0, + self.report_max_wall_gap.as_secs_f64() * 1000.0, + self.report_max_timestamp_gap_us as f64 / 1000.0, + self.report_stalls, + self.report_bursts, + self.report_missing_parameter_keyframes + ); + self.reset_report(now); + } + + fn reset_report(&mut self, now: Instant) { + self.last_report = now; + self.report_frames = 0; + self.report_keyframes = 0; + self.report_bytes = 0; + self.report_max_bytes = 0; + self.report_max_source_wait = Duration::ZERO; + self.report_max_wall_gap = Duration::ZERO; + self.report_max_timestamp_gap_us = 0; + self.report_stalls = 0; + self.report_bursts = 0; + self.report_missing_parameter_keyframes = 0; + } + + fn finish(&mut self) { + if !self.enabled { + return; + } + + log::info!( + "{} diagnostics finished: frames={} keyframes={}", + self.source_label, + self.total_frames, + self.total_keyframes + ); + } +} + +#[derive(Debug, Default)] +struct NalSummary { + nal_count: usize, + vcl_count: usize, + aud_count: usize, + vps_count: usize, + sps_count: usize, + pps_count: usize, + contains_key_picture: bool, +} + +impl NalSummary { + fn from_annex_b(codec: EncodedVideoCodec, payload: &[u8]) -> Self { + let mut summary = 
Self::default(); + for range in annex_b_nal_ranges(payload) { + let nal = &payload[range]; + if nal.is_empty() { + continue; + } + + match codec { + EncodedVideoCodec::H264 => summary.observe_h264(nal[0] & 0x1f), + EncodedVideoCodec::H265 => { + if nal.len() >= 2 { + summary.observe_h265((nal[0] >> 1) & 0x3f); + } + } + EncodedVideoCodec::VP8 | EncodedVideoCodec::VP9 | EncodedVideoCodec::AV1 => {} + _ => {} + } + } + summary + } + + fn observe_h264(&mut self, nal_type: u8) { + self.nal_count += 1; + if (1..=5).contains(&nal_type) { + self.vcl_count += 1; + } + match nal_type { + 5 => self.contains_key_picture = true, + 7 => self.sps_count += 1, + 8 => self.pps_count += 1, + 9 => self.aud_count += 1, + _ => {} + } + } + + fn observe_h265(&mut self, nal_type: u8) { + self.nal_count += 1; + if nal_type <= 31 { + self.vcl_count += 1; + } + match nal_type { + 16..=21 => self.contains_key_picture = true, + 32 => self.vps_count += 1, + 33 => self.sps_count += 1, + 34 => self.pps_count += 1, + 35 => self.aud_count += 1, + _ => {} + } + } + + fn missing_recovery_parameter_set(&self) -> bool { + self.sps_count == 0 || self.pps_count == 0 + } + + fn describe(&self, codec: EncodedVideoCodec) -> String { + match codec { + EncodedVideoCodec::H264 => format!( + "nals={} vcl={} aud={} sps={} pps={} key_picture={}", + self.nal_count, + self.vcl_count, + self.aud_count, + self.sps_count, + self.pps_count, + self.contains_key_picture + ), + EncodedVideoCodec::H265 => format!( + "nals={} vcl={} aud={} vps={} sps={} pps={} key_picture={}", + self.nal_count, + self.vcl_count, + self.aud_count, + self.vps_count, + self.sps_count, + self.pps_count, + self.contains_key_picture + ), + EncodedVideoCodec::VP8 | EncodedVideoCodec::VP9 | EncodedVideoCodec::AV1 => { + "non-H26x payload".to_string() + } + _ => "unknown encoded payload".to_string(), + } + } +} + +fn validate_dimensions(width: u32, height: u32) -> Result<()> { + if width == 0 || height == 0 { + bail!("--width and --height must be 
/// Returns the current wall-clock time as microseconds since the Unix epoch.
///
/// Yields `0` when the system clock reports a time before the epoch, and
/// saturates at `i64::MAX` if the microsecond count does not fit in an `i64`.
fn current_time_us() -> i64 {
    match SystemTime::now().duration_since(UNIX_EPOCH) {
        Ok(since_epoch) => i64::try_from(since_epoch.as_micros()).unwrap_or(i64::MAX),
        Err(_) => 0,
    }
}
/// Capture timestamp metadata attached to frames by high-level source options.
///
/// Selected through [`CaptureMetadataOptions::timestamp`]. The default is
/// [`CaptureTimestampSource::None`].
///
/// NOTE(review): the V4L2 and libargus option conversions in this file only test
/// `timestamp != CaptureTimestampSource::None`, so `WallClock` and `Backend`
/// currently enable the same backend flag there — confirm that is intended.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
#[non_exhaustive]
pub enum CaptureTimestampSource {
    /// Do not attach a user timestamp.
    #[default]
    None,
    /// Attach the wall-clock timestamp observed by the source wrapper.
    WallClock,
    /// Attach the backend-provided sensor/capture timestamp when available.
    Backend,
}
+ pub is_screencast: bool, +} + +impl Default for CaptureSourceOptions { + fn default() -> Self { + Self { + backend: CaptureBackend::Auto, + device: CaptureDeviceSelector::Default, + format: CaptureFormatRequest::Default, + metadata: CaptureMetadataOptions::default(), + is_screencast: false, + } + } +} + +/// Uncompressed CPU-accessible video frame buffer produced by a capture source. +#[derive(Debug)] +pub struct RawVideoFrame { + /// I420 video frame suitable for [`VideoCaptureTrack::capture_frame`]. + pub frame: VideoFrame, + /// Source format delivered by the capture backend before conversion to I420. + pub source_format: CaptureFrameFormat, + /// Wall-clock capture timestamp in microseconds. + pub capture_wall_time_us: u64, + /// Wall-clock timestamp recorded after the frame was read, in microseconds. + pub read_wall_time_us: u64, + /// Whether the backend converted the source buffer before publishing. + pub used_conversion: bool, +} + +impl RawVideoFrame { + /// Returns the decoded I420 video frame. + pub fn video_frame(&self) -> &VideoFrame { + &self.frame + } +} + +/// Frame produced by a capture source. +#[derive(Debug)] +#[non_exhaustive] +pub enum CaptureFrame { + /// Uncompressed CPU-accessible frame. + Raw(RawVideoFrame), + /// Linux DMA-BUF backed frame. + DmaBuf(DmaBufFrame), + /// Encoded video access unit. + Encoded(OwnedEncodedAccessUnit), +} + +impl CaptureFrame { + /// Returns the capture path used by this frame. + pub fn capture_path(&self) -> CapturePath { + match self { + Self::Raw(_) => CapturePath::Raw, + Self::DmaBuf(_) => CapturePath::DmaBuf, + Self::Encoded(_) => CapturePath::Encoded, + } + } + + /// Publishes this frame into a LiveKit capture track. 
+ pub fn publish_to(&self, track: &VideoCaptureTrack) -> Result<(), CaptureError> { + match self { + Self::Raw(frame) => { + track.capture_frame(&frame.frame); + Ok(()) + } + #[cfg(target_os = "linux")] + Self::DmaBuf(frame) => track.capture_dmabuf(frame), + #[cfg(not(target_os = "linux"))] + Self::DmaBuf(_) => Err(CaptureError::UnsupportedPlatform("DMA-BUF capture")), + Self::Encoded(access_unit) => track.capture_encoded(&access_unit.as_access_unit()), + } + } +} + +/// Source that produces one of the common capture frame paths. +pub trait CaptureFrameSource { + /// Error returned by the source. + type Error: Error + Send + Sync + 'static; + + /// Returns the capture path produced by this source. + fn capture_path(&self) -> CapturePath; + + /// Returns the negotiated capture format when the source has one. + fn format(&self) -> Option; + + /// Captures the next frame. + fn next_frame(&mut self) -> Result; +} + +/// Adapts an [`EncodedAccessUnitSource`] into the common frame-source model. +#[derive(Debug)] +pub struct EncodedCaptureFrameSource { + source: S, +} + +impl EncodedCaptureFrameSource { + /// Creates a frame-source adapter for an encoded access-unit source. + pub fn new(source: S) -> Self { + Self { source } + } + + /// Returns the underlying encoded source. + pub fn source(&self) -> &S { + &self.source + } + + /// Returns the underlying encoded source mutably. + pub fn source_mut(&mut self) -> &mut S { + &mut self.source + } + + /// Consumes this adapter and returns the underlying encoded source. + pub fn into_inner(self) -> S { + self.source + } +} + +impl CaptureFrameSource for EncodedCaptureFrameSource +where + S: EncodedAccessUnitSource, +{ + type Error = EncodedFrameSourceError; + + fn capture_path(&self) -> CapturePath { + CapturePath::Encoded + } + + fn format(&self) -> Option { + None + } + + fn next_frame(&mut self) -> Result { + let Some(access_unit) = + self.source.next_access_unit().map_err(EncodedFrameSourceError::Source)? 
/// Error returned by [`EncodedCaptureFrameSource`].
///
/// `E` is the error type of the wrapped encoded access-unit source.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum EncodedFrameSourceError<E> {
    /// The encoded source reached EOF.
    EndOfStream,
    /// The encoded source failed.
    Source(E),
}

impl<E: fmt::Display> fmt::Display for EncodedFrameSourceError<E> {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            Self::EndOfStream => f.write_str("encoded source reached end of stream"),
            Self::Source(err) => write!(f, "encoded source failed: {err}"),
        }
    }
}

impl<E> Error for EncodedFrameSourceError<E>
where
    E: Error + 'static,
{
    /// Exposes the wrapped source error as the cause; end of stream has none.
    fn source(&self) -> Option<&(dyn Error + 'static)> {
        match self {
            Self::EndOfStream => None,
            Self::Source(err) => Some(err),
        }
    }
}
+ #[cfg(feature = "libargus")] + LibArgus(crate::sources::argus::ArgusCaptureSession), +} + +impl VideoCaptureSource { + /// Lists capture devices for a backend. + pub fn list_devices( + backend: CaptureBackend, + ) -> Result, CaptureDeviceQueryError> { + match backend { + CaptureBackend::Auto => list_auto_devices(), + CaptureBackend::AvFoundation => list_avfoundation_devices(), + CaptureBackend::V4l2 => list_v4l_devices(), + CaptureBackend::LibArgus => list_argus_devices(), + CaptureBackend::Rtsp | CaptureBackend::Tcp | CaptureBackend::Gstreamer => { + Err(CaptureDeviceQueryError::UnsupportedBackend(backend)) + } + } + } + + /// Opens a capture source. + pub fn open(options: CaptureSourceOptions) -> Result { + match options.backend { + CaptureBackend::Auto => open_auto_source(options), + CaptureBackend::AvFoundation => open_avfoundation_source(options), + CaptureBackend::V4l2 => open_v4l_source(options), + CaptureBackend::LibArgus => open_argus_source(options), + CaptureBackend::Rtsp | CaptureBackend::Tcp | CaptureBackend::Gstreamer => { + Err(CaptureSourceError::UnsupportedBackend(options.backend)) + } + } + } + + /// Returns the capture path produced by this source. + pub fn capture_path(&self) -> CapturePath { + match self { + #[cfg(feature = "avfoundation")] + Self::AvFoundation(source) => source.capture_path(), + #[cfg(feature = "v4l")] + Self::V4l(source) => source.capture_path(), + #[cfg(feature = "libargus")] + Self::LibArgus(source) => source.capture_path(), + #[allow(unreachable_patterns)] + _ => unreachable!("VideoCaptureSource has no enabled backend variants"), + } + } + + /// Returns the negotiated capture format when the source has one. 
+ pub fn format(&self) -> Option { + match self { + #[cfg(feature = "avfoundation")] + Self::AvFoundation(source) => Some(source.format()), + #[cfg(feature = "v4l")] + Self::V4l(source) => Some(source.format()), + #[cfg(feature = "libargus")] + Self::LibArgus(source) => Some(source.format()), + #[allow(unreachable_patterns)] + _ => unreachable!("VideoCaptureSource has no enabled backend variants"), + } + } + + /// Captures the next frame. + pub fn next_frame(&mut self) -> Result { + match self { + #[cfg(feature = "avfoundation")] + Self::AvFoundation(source) => source + .next_frame() + .map_err(|err| backend_source_error(CaptureBackend::AvFoundation, err)), + #[cfg(feature = "v4l")] + Self::V4l(source) => { + source.next_frame().map_err(|err| backend_source_error(CaptureBackend::V4l2, err)) + } + #[cfg(feature = "libargus")] + Self::LibArgus(source) => source + .next_frame() + .map_err(|err| backend_source_error(CaptureBackend::LibArgus, err)), + #[allow(unreachable_patterns)] + _ => unreachable!("VideoCaptureSource has no enabled backend variants"), + } + } + + /// Captures and publishes the next frame. 
+ pub fn publish_next(&mut self, track: &VideoCaptureTrack) -> Result { + let frame = self.next_frame()?; + frame.publish_to(track)?; + Ok(true) + } +} + +#[cfg(feature = "avfoundation")] +impl CaptureFrameSource for crate::sources::avfoundation::AvFoundationCaptureSession { + type Error = crate::sources::avfoundation::AvFoundationError; + + fn capture_path(&self) -> CapturePath { + self.capture_path() + } + + fn format(&self) -> Option { + Some(self.format()) + } + + fn next_frame(&mut self) -> Result { + self.capture_frame().map(|frame| CaptureFrame::Raw(frame.into())) + } +} + +#[cfg(feature = "avfoundation")] +impl From for RawVideoFrame { + fn from(frame: crate::sources::avfoundation::AvFoundationFrame) -> Self { + Self { + frame: frame.frame, + source_format: frame.source_format, + capture_wall_time_us: frame.capture_wall_time_us, + read_wall_time_us: frame.read_wall_time_us, + used_conversion: frame.used_conversion, + } + } +} + +#[cfg(feature = "v4l")] +impl CaptureFrameSource for crate::sources::v4l::V4lCaptureSession { + type Error = crate::sources::v4l::V4lError; + + fn capture_path(&self) -> CapturePath { + self.capture_path() + } + + fn format(&self) -> Option { + Some(self.format()) + } + + fn next_frame(&mut self) -> Result { + self.capture_frame().map(|frame| CaptureFrame::Raw(frame.into())) + } +} + +#[cfg(feature = "v4l")] +impl From for RawVideoFrame { + fn from(frame: crate::sources::v4l::V4lFrame) -> Self { + Self { + used_conversion: frame.used_conversion, + frame: frame.frame, + source_format: frame.source_format, + capture_wall_time_us: frame.capture_wall_time_us, + read_wall_time_us: frame.read_wall_time_us, + } + } +} + +#[cfg(feature = "libargus")] +impl CaptureFrameSource for crate::sources::argus::ArgusCaptureSession { + type Error = crate::sources::argus::ArgusError; + + fn capture_path(&self) -> CapturePath { + self.capture_path() + } + + fn format(&self) -> Option { + Some(self.format()) + } + + fn next_frame(&mut self) -> Result { + 
self.capture_frame().map(|frame| CaptureFrame::DmaBuf(frame.dmabuf)) + } +} + +#[allow(dead_code)] +fn backend_source_error( + backend: CaptureBackend, + error: impl Error + Send + Sync + 'static, +) -> CaptureSourceError { + CaptureSourceError::Backend { backend, message: error.to_string() } +} + +#[allow(dead_code)] +fn backend_query_error( + backend: CaptureBackend, + error: impl Error + Send + Sync + 'static, +) -> CaptureDeviceQueryError { + CaptureDeviceQueryError::Backend { backend, message: error.to_string() } +} + +fn list_auto_devices() -> Result, CaptureDeviceQueryError> { + #[cfg(all(target_os = "macos", feature = "avfoundation"))] + { + return list_avfoundation_devices(); + } + #[cfg(all(target_os = "linux", feature = "v4l"))] + { + return list_v4l_devices(); + } + #[allow(unreachable_code)] + Err(CaptureDeviceQueryError::UnsupportedBackend(CaptureBackend::Auto)) +} + +fn open_auto_source( + options: CaptureSourceOptions, +) -> Result { + let _ = &options; + #[cfg(all(target_os = "macos", feature = "avfoundation"))] + { + let mut options = options; + options.backend = CaptureBackend::AvFoundation; + return open_avfoundation_source(options); + } + #[cfg(all(target_os = "linux", feature = "v4l"))] + { + let mut options = options; + options.backend = CaptureBackend::V4l2; + return open_v4l_source(options); + } + #[allow(unreachable_code)] + Err(CaptureSourceError::UnsupportedBackend(CaptureBackend::Auto)) +} + +#[cfg(feature = "avfoundation")] +fn list_avfoundation_devices() -> Result, CaptureDeviceQueryError> { + crate::sources::avfoundation::devices().map_err(|err| match err { + crate::sources::avfoundation::AvFoundationError::UnsupportedPlatform => { + CaptureDeviceQueryError::UnsupportedBackend(CaptureBackend::AvFoundation) + } + other => backend_query_error(CaptureBackend::AvFoundation, other), + }) +} + +#[cfg(not(feature = "avfoundation"))] +fn list_avfoundation_devices() -> Result, CaptureDeviceQueryError> { + 
#[cfg(feature = "avfoundation")]
impl From<CaptureSourceOptions> for crate::sources::avfoundation::AvFoundationCaptureOptions {
    /// Carries over the device selector, format request and screencast flag.
    ///
    /// The `backend` and `metadata` options have no counterpart in the
    /// AVFoundation options built here and are dropped.
    fn from(options: CaptureSourceOptions) -> Self {
        // Exhaustive destructuring makes the ignored fields explicit and forces
        // a decision here whenever `CaptureSourceOptions` grows a field.
        let CaptureSourceOptions { device, format, is_screencast, backend: _, metadata: _ } =
            options;
        Self { device, format, is_screencast }
    }
}
#[cfg(feature = "v4l")]
impl From<CaptureSourceOptions> for crate::sources::v4l::V4lCaptureOptions {
    /// Translates the high-level options into V4L2 capture options.
    ///
    /// For an `Exact` or `Closest` format request the requested frame format
    /// is tried first; any other request uses the default ordering. Capture
    /// timestamps are attached for any timestamp source other than
    /// [`CaptureTimestampSource::None`].
    fn from(options: CaptureSourceOptions) -> Self {
        // Choose the list up front so the default is not built and then discarded.
        let frame_formats = if let CaptureFormatRequest::Exact(requested)
        | CaptureFormatRequest::Closest(requested) = options.format
        {
            crate::sources::v4l::ordered_frame_formats_with_first(requested.frame_format)
        } else {
            crate::sources::v4l::default_frame_formats()
        };

        Self {
            device: options.device,
            format: options.format,
            frame_formats,
            attach_capture_timestamp: options.metadata.timestamp != CaptureTimestampSource::None,
            attach_frame_id: options.metadata.frame_id,
        }
    }
}
#[cfg(feature = "libargus")]
impl TryFrom<CaptureSourceOptions> for crate::sources::argus::ArgusCaptureOptions {
    type Error = CaptureSourceError;

    /// Translates the high-level options into libargus capture options.
    ///
    /// The default device maps to sensor index `0`. A `Default` format request
    /// uses the format of `ArgusCaptureOptions::default()`.
    ///
    /// # Errors
    ///
    /// Returns [`CaptureSourceError::Backend`] tagged with
    /// [`CaptureBackend::LibArgus`] when the device index does not fit in a
    /// `u32`, when the device is selected by string id, or when the format
    /// request is `HighestFrameRate` or `HighestResolution`.
    fn try_from(options: CaptureSourceOptions) -> Result<Self, Self::Error> {
        // All rejections share the same backend tag; only the message differs.
        let rejected = |message: &str| CaptureSourceError::Backend {
            backend: CaptureBackend::LibArgus,
            message: message.to_string(),
        };

        let sensor_index = match options.device {
            CaptureDeviceSelector::Default => 0,
            CaptureDeviceSelector::Index(index) => {
                u32::try_from(index).map_err(|_| rejected("device index is out of range"))?
            }
            CaptureDeviceSelector::Id(_) => {
                return Err(rejected("libargus does not support string device selectors"));
            }
        };

        let format = match options.format {
            CaptureFormatRequest::Exact(format) | CaptureFormatRequest::Closest(format) => format,
            CaptureFormatRequest::Default => {
                crate::sources::argus::ArgusCaptureOptions::default().format
            }
            CaptureFormatRequest::HighestFrameRate { .. }
            | CaptureFormatRequest::HighestResolution { .. } => {
                return Err(rejected("libargus requires an exact or closest format"));
            }
        };

        Ok(Self {
            sensor_index,
            format,
            attach_sensor_timestamp: options.metadata.timestamp != CaptureTimestampSource::None,
            attach_frame_id: options.metadata.frame_id,
        })
    }
}
VideoFrame { + rotation: VideoRotation::VideoRotation0, + timestamp_us: 0, + frame_metadata: None, + buffer: I420Buffer::new(2, 2), + }, + source_format: CaptureFrameFormat::I420, + capture_wall_time_us: 1, + read_wall_time_us: 2, + used_conversion: false, + }); + assert_eq!(raw.capture_path(), CapturePath::Raw); + + let dmabuf = CaptureFrame::DmaBuf(DmaBufFrame { + width: 2, + height: 2, + pixel_format: DmaBufPixelFormat::Nv12, + planes: vec![DmaBufPlane { fd: -1, offset: 0, stride: 2 }], + modifier: None, + timestamp_us: 0, + metadata: FrameMetadata::default(), + }); + assert_eq!(dmabuf.capture_path(), CapturePath::DmaBuf); + + let encoded = CaptureFrame::Encoded(OwnedEncodedAccessUnit::new( + EncodedVideoCodec::H264, + vec![0, 0, 0, 1, 0x65], + 0, + EncodedFrameType::Key, + 2, + 2, + )); + assert_eq!(encoded.capture_path(), CapturePath::Encoded); + } + + #[cfg(feature = "avfoundation")] + #[test] + fn avfoundation_canonical_and_compatibility_imports_compile() { + let _ = std::any::TypeId::of::(); + let _ = std::any::TypeId::of::(); + } + + #[cfg(feature = "v4l")] + #[test] + fn v4l_canonical_import_compiles() { + let _ = std::any::TypeId::of::(); + } +} diff --git a/livekit-capture/src/sources/gstreamer.rs b/livekit-capture/src/sources/gstreamer.rs index 09456f713..5678bd20c 100644 --- a/livekit-capture/src/sources/gstreamer.rs +++ b/livekit-capture/src/sources/gstreamer.rs @@ -12,9 +12,176 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-use std::error::Error; +use std::error::Error as StdError; -use crate::encoded::{ingress::EncodedAccessUnitSource, OwnedEncodedAccessUnit}; +use bytes::Bytes; +use thiserror::Error; + +use ::gstreamer as gst; +use ::gstreamer_app as gst_app; + +use crate::{ + encoded::{ + h26x::access_unit_from_annex_b, ingress::EncodedAccessUnitSource, CodecSpecific, + EncodedFrameType, EncodedVideoCodec, H264PacketizationMode, OwnedEncodedAccessUnit, + }, + error::CaptureError, +}; + +/// Encoded sample format expected from a GStreamer appsink. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +#[non_exhaustive] +pub enum GStreamerSampleFormat { + /// H.264 Annex-B access units, usually from `h264parse` with byte-stream caps. + H264AnnexB, + /// H.265 Annex-B access units, usually from `h265parse` with byte-stream caps. + H265AnnexB, + /// One already-delimited encoded access unit per appsink sample. + AccessUnit { + /// Codec carried by each appsink sample. + codec: EncodedVideoCodec, + }, +} + +impl GStreamerSampleFormat { + /// Returns the encoded codec carried by this sample format. + pub fn codec(self) -> EncodedVideoCodec { + match self { + Self::H264AnnexB => EncodedVideoCodec::H264, + Self::H265AnnexB => EncodedVideoCodec::H265, + Self::AccessUnit { codec } => codec, + } + } +} + +/// Configuration for a GStreamer appsink encoded source. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct GStreamerAppSinkConfig { + /// Format of encoded buffers pulled from appsink. + pub sample_format: GStreamerSampleFormat, + /// Timestamp added to the first buffer timestamp, or used directly as fallback. + pub start_timestamp_us: i64, + /// Fallback frame interval when a GStreamer buffer has no PTS or DTS. + pub frame_interval_us: i64, + /// Encoded frame width in pixels. + pub width: u32, + /// Encoded frame height in pixels. + pub height: u32, +} + +impl GStreamerAppSinkConfig { + /// Creates GStreamer appsink source configuration. 
+ pub fn new( + sample_format: GStreamerSampleFormat, + start_timestamp_us: i64, + frame_interval_us: i64, + width: u32, + height: u32, + ) -> Self { + Self { sample_format, start_timestamp_us, frame_interval_us, width, height } + } +} + +/// Encoded source backed by a GStreamer appsink. +#[derive(Debug)] +pub struct GStreamerAppSinkEncodedSource { + appsink: gst_app::AppSink, + config: GStreamerAppSinkConfig, + next_fallback_timestamp_us: i64, +} + +impl GStreamerAppSinkEncodedSource { + /// Creates an encoded source from an existing GStreamer appsink. + pub fn new(appsink: gst_app::AppSink, config: GStreamerAppSinkConfig) -> Self { + Self { appsink, config, next_fallback_timestamp_us: config.start_timestamp_us } + } + + /// Returns the wrapped appsink. + pub fn appsink(&self) -> &gst_app::AppSink { + &self.appsink + } + + /// Returns the source configuration. + pub fn config(&self) -> GStreamerAppSinkConfig { + self.config + } + + /// Consumes this source and returns the wrapped appsink. 
+ pub fn into_appsink(self) -> gst_app::AppSink { + self.appsink + } + + fn access_unit_from_sample( + &mut self, + sample: &gst::Sample, + ) -> Result { + let buffer = sample.buffer().ok_or(GStreamerSourceError::MissingBuffer)?; + let timestamp_us = self.timestamp_us(buffer); + let frame_type = if buffer.flags().contains(gst::BufferFlags::DELTA_UNIT) { + EncodedFrameType::Delta + } else { + EncodedFrameType::Key + }; + + let map = buffer + .map_readable() + .map_err(|err| GStreamerSourceError::MapReadable(err.to_string()))?; + let payload = map.as_ref(); + access_unit_from_sample_payload( + self.config.sample_format, + payload, + timestamp_us, + frame_type, + self.config.width, + self.config.height, + ) + .map_err(GStreamerSourceError::Capture) + } + + fn timestamp_us(&mut self, buffer: &gst::BufferRef) -> i64 { + if let Some(timestamp) = buffer.pts().or_else(|| buffer.dts()) { + let timestamp_us = + clock_time_to_timestamp_us(self.config.start_timestamp_us, timestamp); + self.next_fallback_timestamp_us = + timestamp_us.saturating_add(self.config.frame_interval_us); + return timestamp_us; + } + + let timestamp_us = self.next_fallback_timestamp_us; + self.next_fallback_timestamp_us = + self.next_fallback_timestamp_us.saturating_add(self.config.frame_interval_us); + timestamp_us + } +} + +impl EncodedAccessUnitSource for GStreamerAppSinkEncodedSource { + type Error = GStreamerSourceError; + + fn next_access_unit(&mut self) -> Result, Self::Error> { + match self.appsink.pull_sample() { + Ok(sample) => self.access_unit_from_sample(&sample).map(Some), + Err(_err) if self.appsink.is_eos() => Ok(None), + Err(err) => Err(GStreamerSourceError::PullSample(err.to_string())), + } + } +} + +/// Error returned by GStreamer appsink encoded sources. +#[derive(Debug, Error)] +pub enum GStreamerSourceError { + /// The appsink failed to produce a sample. 
+ #[error("failed to pull GStreamer appsink sample: {0}")] + PullSample(String), + /// The sample did not contain an encoded buffer. + #[error("GStreamer sample did not contain a buffer")] + MissingBuffer, + /// The sample buffer could not be mapped for reading. + #[error("failed to map GStreamer buffer for reading: {0}")] + MapReadable(String), + /// Access-unit construction failed. + #[error(transparent)] + Capture(CaptureError), +} /// Callback-backed encoded source for GStreamer appsink integrations. #[derive(Debug)] @@ -47,7 +214,7 @@ impl GStreamerAppSinkSource { impl EncodedAccessUnitSource for GStreamerAppSinkSource where F: FnMut() -> Result, E>, - E: Error + Send + Sync + 'static, + E: StdError + Send + Sync + 'static, { type Error = E; @@ -55,3 +222,115 @@ where (self.next_access_unit)() } } + +fn access_unit_from_sample_payload( + sample_format: GStreamerSampleFormat, + payload: &[u8], + timestamp_us: i64, + frame_type: EncodedFrameType, + width: u32, + height: u32, +) -> Result { + match sample_format { + GStreamerSampleFormat::H264AnnexB => access_unit_from_annex_b( + EncodedVideoCodec::H264, + Bytes::copy_from_slice(payload), + timestamp_us, + width, + height, + ), + GStreamerSampleFormat::H265AnnexB => access_unit_from_annex_b( + EncodedVideoCodec::H265, + Bytes::copy_from_slice(payload), + timestamp_us, + width, + height, + ), + GStreamerSampleFormat::AccessUnit { codec } => { + if payload.is_empty() { + return Err(CaptureError::EmptyPayload); + } + + let mut access_unit = OwnedEncodedAccessUnit::new( + codec, + Bytes::copy_from_slice(payload), + timestamp_us, + frame_type, + width, + height, + ); + access_unit.codec_specific = codec_specific_for(codec); + Ok(access_unit) + } + } +} + +fn codec_specific_for(codec: EncodedVideoCodec) -> CodecSpecific { + match codec { + EncodedVideoCodec::H264 => { + CodecSpecific::H264 { packetization_mode: H264PacketizationMode::NonInterleaved } + } + EncodedVideoCodec::H265 => CodecSpecific::H265, + 
EncodedVideoCodec::VP8 => CodecSpecific::VP8 { temporal_id: None, layer_sync: false }, + EncodedVideoCodec::VP9 => { + CodecSpecific::VP9 { temporal_id: None, spatial_id: None, inter_layer_predicted: None } + } + EncodedVideoCodec::AV1 => CodecSpecific::AV1 { + scalability_mode: Some("L1T1".to_string()), + dependency_descriptor: None, + }, + } +} + +fn clock_time_to_timestamp_us(start_timestamp_us: i64, timestamp: gst::ClockTime) -> i64 { + let timestamp_us = timestamp.useconds().min(i64::MAX as u64) as i64; + start_timestamp_us.saturating_add(timestamp_us) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn sample_payload_h264_annex_b_detects_keyframe() { + let access_unit = access_unit_from_sample_payload( + GStreamerSampleFormat::H264AnnexB, + &[0, 0, 1, 0x65, 1, 2], + 1_000, + EncodedFrameType::Delta, + 640, + 480, + ) + .unwrap(); + + assert_eq!(access_unit.codec, EncodedVideoCodec::H264); + assert_eq!(access_unit.frame_type, EncodedFrameType::Key); + assert_eq!(access_unit.timestamp_us, 1_000); + } + + #[test] + fn sample_payload_access_unit_uses_buffer_delta_flag() { + let access_unit = access_unit_from_sample_payload( + GStreamerSampleFormat::AccessUnit { codec: EncodedVideoCodec::VP8 }, + &[1, 2, 3], + 2_000, + EncodedFrameType::Delta, + 640, + 480, + ) + .unwrap(); + + assert_eq!(access_unit.codec, EncodedVideoCodec::VP8); + assert_eq!(access_unit.frame_type, EncodedFrameType::Delta); + assert_eq!( + access_unit.codec_specific, + CodecSpecific::VP8 { temporal_id: None, layer_sync: false } + ); + } + + #[test] + fn clock_time_is_offset_from_start_timestamp() { + let timestamp = clock_time_to_timestamp_us(10_000, gst::ClockTime::from_useconds(1_234)); + assert_eq!(timestamp, 11_234); + } +} diff --git a/livekit-capture/src/sources/rtsp.rs b/livekit-capture/src/sources/rtsp.rs index f735b2382..663e4850e 100644 --- a/livekit-capture/src/sources/rtsp.rs +++ b/livekit-capture/src/sources/rtsp.rs @@ -12,8 +12,15 @@ // See the License for the 
specific language governing permissions and // limitations under the License. -use std::io::{self, Read}; +use std::{ + io::{self, Read, Write}, + net::TcpStream, + str, + time::{Duration, Instant, SystemTime, UNIX_EPOCH}, +}; +use base64::{engine::general_purpose, Engine as _}; +use md5::{Digest, Md5}; use thiserror::Error; use crate::encoded::{ @@ -22,6 +29,218 @@ use crate::encoded::{ EncodedVideoCodec, OwnedEncodedAccessUnit, }; +const DEFAULT_RTSP_CLOCK_RATE: u32 = 90_000; +const MAX_RTSP_HEADER_BYTES: usize = 64 * 1024; + +/// Options used to open an RTSP encoded video source. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct RtspSourceOptions { + /// Expected video codec, when the caller wants to reject mismatched SDP. + pub expected_codec: Option, + /// Timestamp assigned to the first emitted access unit. + pub start_timestamp_us: i64, + /// Encoded frame width in pixels. + pub width: u32, + /// Encoded frame height in pixels. + pub height: u32, +} + +impl RtspSourceOptions { + /// Creates RTSP source options for encoded frames with the supplied dimensions. + pub fn new(width: u32, height: u32) -> Self { + Self { expected_codec: None, start_timestamp_us: 0, width, height } + } + + /// Requires the SDP video track to use the supplied codec. + pub fn with_expected_codec(mut self, codec: EncodedVideoCodec) -> Self { + self.expected_codec = Some(codec); + self + } + + /// Sets the timestamp assigned to the first emitted access unit. + pub fn with_start_timestamp_us(mut self, start_timestamp_us: i64) -> Self { + self.start_timestamp_us = start_timestamp_us; + self + } +} + +/// RTSP session details discovered while opening a source. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct RtspSessionInfo { + /// RTP payload codec selected from SDP. + pub codec: EncodedVideoCodec, + /// RTP payload type selected from SDP. + pub payload_type: u8, + /// RTP timestamp clock rate. + pub clock_rate: u32, + /// RTSP interleaved channel carrying video RTP packets. 
+ pub video_channel: u8, + /// RTSP media control URL used for SETUP. + pub control_url: String, + /// RTSP session identifier returned by SETUP. + pub session_id: String, +} + +/// Encoded RTSP source that performs DESCRIBE, SETUP, and PLAY over TCP. +#[derive(Debug)] +pub struct RtspEncodedSource { + source: RtspInterleavedRtpSource, + session_info: RtspSessionInfo, + keepalive: RtspKeepalive, +} + +impl RtspEncodedSource { + /// Connects to an RTSP URL and starts TCP-interleaved RTP playback. + pub fn connect(url: &str, options: RtspSourceOptions) -> Result { + let rtsp_url = RtspUrl::parse(url)?; + let mut stream = TcpStream::connect((rtsp_url.connect_host.as_str(), rtsp_url.port)) + .map_err(RtspSourceError::Io)?; + let _ = stream.set_nodelay(true); + let mut auth = RtspAuthContext::new(rtsp_url.credentials.clone()); + let mut cseq = 1; + + let describe = send_authenticated_rtsp_request( + &mut stream, + "DESCRIBE", + &rtsp_url.original, + &mut cseq, + &[("Host", rtsp_url.host_header.as_str()), ("Accept", "application/sdp")], + &mut auth, + )?; + let sdp = str::from_utf8(&describe.body).map_err(|_| RtspSourceError::InvalidSdp)?; + let media = parse_sdp_video_track(&rtsp_url, sdp, options.expected_codec)?; + + let setup = send_authenticated_rtsp_request( + &mut stream, + "SETUP", + &media.control_url, + &mut cseq, + &[ + ("Host", rtsp_url.host_header.as_str()), + ("Transport", "RTP/AVP/TCP;unicast;interleaved=0-1"), + ], + &mut auth, + )?; + let session_header = + setup.header("session").ok_or(RtspSourceError::MissingHeader("Session"))?; + let session_id = parse_session_id(session_header)?; + let session_timeout_secs = parse_session_timeout_secs(session_header); + let video_channel = parse_interleaved_channel(setup.header("transport")); + + send_authenticated_rtsp_request( + &mut stream, + "PLAY", + &rtsp_url.original, + &mut cseq, + &[ + ("Host", rtsp_url.host_header.as_str()), + ("Session", session_id.as_str()), + ("Range", "npt=0.000-"), + ], + &mut auth, + 
)?; + + let session_info = RtspSessionInfo { + codec: media.codec, + payload_type: media.payload_type, + clock_rate: media.clock_rate, + video_channel, + control_url: media.control_url, + session_id, + }; + let config = RtspInterleavedSourceConfig { + codec: session_info.codec, + clock_rate: session_info.clock_rate, + video_channel: session_info.video_channel, + start_timestamp_us: options.start_timestamp_us, + width: options.width, + height: options.height, + }; + let source = RtspInterleavedRtpSource::new(stream, config)?; + let keepalive = RtspKeepalive::new( + rtsp_url.original, + rtsp_url.host_header, + session_info.session_id.clone(), + cseq, + auth, + session_timeout_secs, + ); + + Ok(Self { source, session_info, keepalive }) + } + + /// Returns RTSP session details discovered during setup. + pub fn session_info(&self) -> &RtspSessionInfo { + &self.session_info + } + + /// Attempts to clone the underlying TCP stream. + pub fn try_clone_stream(&self) -> io::Result { + self.source.reader().try_clone() + } +} + +impl EncodedAccessUnitSource for RtspEncodedSource { + type Error = RtspSourceError; + + fn next_access_unit(&mut self) -> Result, Self::Error> { + self.keepalive.maybe_send(self.source.reader_mut())?; + self.source.next_access_unit() + } +} + +#[derive(Debug)] +struct RtspKeepalive { + request_uri: String, + host_header: String, + session_id: String, + cseq: u32, + auth: RtspAuthContext, + interval: Duration, + next_due: Instant, +} + +impl RtspKeepalive { + fn new( + request_uri: String, + host_header: String, + session_id: String, + cseq: u32, + auth: RtspAuthContext, + session_timeout_secs: Option, + ) -> Self { + let interval_secs = session_timeout_secs.map(|timeout| (timeout / 2).max(1)).unwrap_or(30); + let interval = Duration::from_secs(interval_secs); + Self { + request_uri, + host_header, + session_id, + cseq, + auth, + interval, + next_due: Instant::now() + interval, + } + } + + fn maybe_send(&mut self, stream: &mut TcpStream) -> Result<(), 
RtspSourceError> { + if Instant::now() < self.next_due { + return Ok(()); + } + + let authorization = self.auth.header("OPTIONS", &self.request_uri)?; + write_rtsp_request( + stream, + "OPTIONS", + &self.request_uri, + next_cseq(&mut self.cseq), + &[("Host", self.host_header.as_str()), ("Session", self.session_id.as_str())], + authorization, + )?; + self.next_due = Instant::now() + self.interval; + Ok(()) + } +} + /// Configuration for RTSP interleaved RTP media. #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub struct RtspInterleavedSourceConfig { @@ -89,6 +308,10 @@ where return Ok(None); } + if magic[0] == b'R' { + let _ = read_rtsp_response_with_initial_byte(&mut self.reader, magic[0])?; + continue; + } if magic[0] != b'$' { return Err(RtspSourceError::UnexpectedData); } @@ -131,8 +354,48 @@ where #[derive(Debug, Error)] pub enum RtspSourceError { /// I/O failed while reading RTSP interleaved data. - #[error("RTSP read failed: {0}")] + #[error("RTSP I/O failed: {0}")] Io(io::Error), + /// RTSP URL was invalid or unsupported. + #[error("invalid RTSP URL: {0}")] + InvalidUrl(&'static str), + /// RTSP server returned a non-success status. + #[error("RTSP request failed with status {code} {reason}")] + RtspStatus { + /// RTSP status code. + code: u16, + /// RTSP status reason. + reason: String, + }, + /// RTSP response was malformed. + #[error("invalid RTSP response: {0}")] + InvalidResponse(&'static str), + /// RTSP response was missing a required header. + #[error("RTSP response missing {0} header")] + MissingHeader(&'static str), + /// RTSP server requested authentication but no URL credentials were supplied. + #[error("RTSP authentication required but the URL does not contain credentials")] + MissingCredentials, + /// RTSP authentication challenge was malformed. + #[error("invalid RTSP authentication challenge")] + InvalidAuthChallenge, + /// RTSP authentication scheme is not supported. 
+ #[error("unsupported RTSP authentication scheme: {0}")] + UnsupportedAuthScheme(String), + /// SDP was missing a supported video track. + #[error("RTSP SDP does not contain a supported H264/H265 video track")] + MissingVideoTrack, + /// SDP selected a codec different from the requested codec. + #[error("RTSP SDP codec mismatch: expected {expected:?}, got {actual:?}")] + CodecMismatch { + /// Codec requested by the caller. + expected: EncodedVideoCodec, + /// Codec selected from SDP. + actual: EncodedVideoCodec, + }, + /// SDP body was malformed or not valid UTF-8. + #[error("invalid RTSP SDP")] + InvalidSdp, /// Interleaved RTP was malformed or a non-interleaved byte was encountered. #[error("unexpected RTSP interleaved data")] UnexpectedData, @@ -153,9 +416,668 @@ fn read_exact_or_clean_eof(reader: &mut impl Read, buf: &mut [u8]) -> io::Result Ok(true) } +#[derive(Debug, Clone, PartialEq, Eq)] +struct RtspUrl { + original: String, + authority: String, + connect_host: String, + host_header: String, + port: u16, + credentials: Option, +} + +#[derive(Debug, Clone, PartialEq, Eq)] +struct RtspCredentials { + username: String, + password: String, +} + +impl RtspUrl { + fn parse(url: &str) -> Result { + let Some(rest) = url.strip_prefix("rtsp://") else { + return Err(RtspSourceError::InvalidUrl("expected rtsp:// scheme")); + }; + let (authority, path_suffix) = match rest.find('/') { + Some(path_start) => (&rest[..path_start], &rest[path_start..]), + None => (rest, ""), + }; + if authority.is_empty() { + return Err(RtspSourceError::InvalidUrl("missing host")); + } + + let (credentials, host_port) = match authority.rsplit_once('@') { + Some((userinfo, host_port)) => (Some(parse_userinfo(userinfo)?), host_port), + None => (None, authority), + }; + if host_port.is_empty() { + return Err(RtspSourceError::InvalidUrl("missing host")); + } + let (connect_host, port) = parse_host_port(host_port)?; + let host_header = if host_port.contains(':') { + host_port.to_owned() + } else 
{ + format!("{host_port}:{port}") + }; + + Ok(Self { + original: format!("rtsp://{host_port}{path_suffix}"), + authority: host_port.to_owned(), + connect_host, + host_header, + port, + credentials, + }) + } +} + +fn parse_userinfo(userinfo: &str) -> Result { + let (username, password) = userinfo.split_once(':').unwrap_or((userinfo, "")); + if username.is_empty() { + return Err(RtspSourceError::InvalidUrl("missing username")); + } + Ok(RtspCredentials { username: username.to_owned(), password: password.to_owned() }) +} + +fn parse_host_port(host_port: &str) -> Result<(String, u16), RtspSourceError> { + if let Some(rest) = host_port.strip_prefix('[') { + let Some((host, after_host)) = rest.split_once(']') else { + return Err(RtspSourceError::InvalidUrl("malformed IPv6 host")); + }; + let port = after_host.strip_prefix(':').map(parse_port).transpose()?.unwrap_or(554); + return Ok((host.to_owned(), port)); + } + + if let Some((host, port)) = host_port.rsplit_once(':') { + if !host.contains(':') { + return Ok((host.to_owned(), parse_port(port)?)); + } + } + + Ok((host_port.to_owned(), 554)) +} + +fn parse_port(port: &str) -> Result { + port.parse().map_err(|_| RtspSourceError::InvalidUrl("invalid port")) +} + +#[derive(Debug, Clone)] +struct RtspResponse { + status_code: u16, + reason: String, + headers: Vec<(String, String)>, + body: Vec, +} + +impl RtspResponse { + fn header(&self, name: &str) -> Option<&str> { + self.headers + .iter() + .find(|(header_name, _)| header_name.eq_ignore_ascii_case(name)) + .map(|(_, value)| value.as_str()) + } + + fn headers<'a>(&'a self, name: &'a str) -> impl Iterator + 'a { + self.headers + .iter() + .filter(move |(header_name, _)| header_name.eq_ignore_ascii_case(name)) + .map(|(_, value)| value.as_str()) + } +} + +fn send_authenticated_rtsp_request( + stream: &mut TcpStream, + method: &str, + uri: &str, + cseq: &mut u32, + headers: &[(&str, &str)], + auth: &mut RtspAuthContext, +) -> Result { + let mut response = send_rtsp_request( 
+ stream, + method, + uri, + next_cseq(cseq), + headers, + auth.header(method, uri)?, + )?; + if response.status_code == 401 { + auth.update_from_unauthorized(&response)?; + response = send_rtsp_request( + stream, + method, + uri, + next_cseq(cseq), + headers, + auth.header(method, uri)?, + )?; + } + + if !(200..300).contains(&response.status_code) { + return Err(RtspSourceError::RtspStatus { + code: response.status_code, + reason: response.reason, + }); + } + Ok(response) +} + +fn next_cseq(cseq: &mut u32) -> u32 { + let current = *cseq; + *cseq = cseq.saturating_add(1); + current +} + +fn send_rtsp_request( + stream: &mut TcpStream, + method: &str, + uri: &str, + cseq: u32, + headers: &[(&str, &str)], + authorization: Option, +) -> Result { + write_rtsp_request(stream, method, uri, cseq, headers, authorization)?; + read_rtsp_response(stream) +} + +fn write_rtsp_request( + stream: &mut TcpStream, + method: &str, + uri: &str, + cseq: u32, + headers: &[(&str, &str)], + authorization: Option, +) -> Result<(), RtspSourceError> { + write!(stream, "{method} {uri} RTSP/1.0\r\n").map_err(RtspSourceError::Io)?; + write!(stream, "CSeq: {cseq}\r\n").map_err(RtspSourceError::Io)?; + write!(stream, "User-Agent: livekit-capture/0.1\r\n").map_err(RtspSourceError::Io)?; + if let Some(authorization) = authorization { + write!(stream, "Authorization: {authorization}\r\n").map_err(RtspSourceError::Io)?; + } + for (name, value) in headers { + write!(stream, "{name}: {value}\r\n").map_err(RtspSourceError::Io)?; + } + write!(stream, "\r\n").map_err(RtspSourceError::Io)?; + stream.flush().map_err(RtspSourceError::Io)?; + Ok(()) +} + +#[derive(Debug, Clone)] +struct RtspAuthContext { + credentials: Option, + challenge: Option, + nonce_count: u32, + cnonce: String, +} + +impl RtspAuthContext { + fn new(credentials: Option) -> Self { + Self { credentials, challenge: None, nonce_count: 0, cnonce: make_cnonce() } + } + + fn header(&mut self, method: &str, uri: &str) -> Result, 
RtspSourceError> { + let Some(challenge) = self.challenge.clone() else { + return Ok(None); + }; + let credentials = self.credentials.as_ref().ok_or(RtspSourceError::MissingCredentials)?; + match challenge { + RtspAuthChallenge::Basic => { + let token = general_purpose::STANDARD + .encode(format!("{}:{}", credentials.username, credentials.password)); + Ok(Some(format!("Basic {token}"))) + } + RtspAuthChallenge::Digest(challenge) => { + self.nonce_count = self.nonce_count.saturating_add(1); + Ok(Some(build_digest_authorization( + credentials, + &challenge, + method, + uri, + self.nonce_count, + &self.cnonce, + ))) + } + } + } + + fn update_from_unauthorized(&mut self, response: &RtspResponse) -> Result<(), RtspSourceError> { + if self.credentials.is_none() { + return Err(RtspSourceError::MissingCredentials); + } + self.challenge = Some(parse_authenticate_header( + response.headers("www-authenticate").collect::>().as_slice(), + )?); + self.nonce_count = 0; + Ok(()) + } +} + +#[derive(Debug, Clone, PartialEq, Eq)] +enum RtspAuthChallenge { + Basic, + Digest(DigestAuthChallenge), +} + +#[derive(Debug, Clone, PartialEq, Eq)] +struct DigestAuthChallenge { + realm: String, + nonce: String, + opaque: Option, + qop: Option, +} + +fn parse_authenticate_header(headers: &[&str]) -> Result { + for header in headers { + if strip_auth_scheme(header, "Digest").is_some() { + return parse_digest_challenge(header); + } + } + for header in headers { + if strip_auth_scheme(header, "Basic").is_some() { + return Ok(RtspAuthChallenge::Basic); + } + } + Err(RtspSourceError::UnsupportedAuthScheme( + headers.first().copied().unwrap_or_default().to_owned(), + )) +} + +fn parse_digest_challenge(header: &str) -> Result { + let params = parse_auth_params( + strip_auth_scheme(header, "Digest").ok_or(RtspSourceError::InvalidAuthChallenge)?, + ); + let realm = params + .iter() + .find(|(name, _)| name.eq_ignore_ascii_case("realm")) + .map(|(_, value)| value.to_owned()) + 
.ok_or(RtspSourceError::InvalidAuthChallenge)?; + let nonce = params + .iter() + .find(|(name, _)| name.eq_ignore_ascii_case("nonce")) + .map(|(_, value)| value.to_owned()) + .ok_or(RtspSourceError::InvalidAuthChallenge)?; + if let Some((_, algorithm)) = + params.iter().find(|(name, _)| name.eq_ignore_ascii_case("algorithm")) + { + if !algorithm.eq_ignore_ascii_case("MD5") { + return Err(RtspSourceError::UnsupportedAuthScheme(format!( + "Digest algorithm={algorithm}" + ))); + } + } + let qop = params + .iter() + .find(|(name, _)| name.eq_ignore_ascii_case("qop")) + .and_then(|(_, value)| select_digest_qop(value)); + let opaque = params + .iter() + .find(|(name, _)| name.eq_ignore_ascii_case("opaque")) + .map(|(_, value)| value.to_owned()); + + Ok(RtspAuthChallenge::Digest(DigestAuthChallenge { realm, nonce, opaque, qop })) +} + +fn strip_auth_scheme<'a>(header: &'a str, scheme: &str) -> Option<&'a str> { + let header = header.trim_start(); + let rest = header.get(scheme.len()..)?; + if !header[..scheme.len()].eq_ignore_ascii_case(scheme) { + return None; + } + if rest.is_empty() { + return Some(rest); + } + rest.strip_prefix(' ') +} + +fn parse_auth_params(params: &str) -> Vec<(String, String)> { + let mut parsed = Vec::new(); + let mut current = String::new(); + let mut in_quotes = false; + let mut escaped = false; + for ch in params.chars() { + if escaped { + current.push(ch); + escaped = false; + continue; + } + match ch { + '\\' if in_quotes => { + escaped = true; + current.push(ch); + } + '"' => { + in_quotes = !in_quotes; + current.push(ch); + } + ',' if !in_quotes => { + push_auth_param(&mut parsed, ¤t); + current.clear(); + } + _ => current.push(ch), + } + } + push_auth_param(&mut parsed, ¤t); + parsed +} + +fn push_auth_param(parsed: &mut Vec<(String, String)>, param: &str) { + let Some((name, value)) = param.trim().split_once('=') else { + return; + }; + parsed.push((name.trim().to_owned(), unquote_auth_value(value.trim()))); +} + +fn 
unquote_auth_value(value: &str) -> String { + let Some(value) = value.strip_prefix('"').and_then(|value| value.strip_suffix('"')) else { + return value.to_owned(); + }; + let mut unquoted = String::new(); + let mut escaped = false; + for ch in value.chars() { + if escaped { + unquoted.push(ch); + escaped = false; + } else if ch == '\\' { + escaped = true; + } else { + unquoted.push(ch); + } + } + unquoted +} + +fn select_digest_qop(value: &str) -> Option { + value.split(',').map(str::trim).find(|qop| qop.eq_ignore_ascii_case("auth")).map(str::to_owned) +} + +fn build_digest_authorization( + credentials: &RtspCredentials, + challenge: &DigestAuthChallenge, + method: &str, + uri: &str, + nonce_count: u32, + cnonce: &str, +) -> String { + let ha1 = + md5_hex(format!("{}:{}:{}", credentials.username, challenge.realm, credentials.password)); + let ha2 = md5_hex(format!("{method}:{uri}")); + let response = if let Some(qop) = &challenge.qop { + md5_hex(format!("{ha1}:{}:{nonce_count:08x}:{cnonce}:{qop}:{ha2}", challenge.nonce)) + } else { + md5_hex(format!("{ha1}:{}:{ha2}", challenge.nonce)) + }; + + let mut header = format!( + "Digest username=\"{}\", realm=\"{}\", nonce=\"{}\", uri=\"{}\", response=\"{}\"", + quote_auth_value(&credentials.username), + quote_auth_value(&challenge.realm), + quote_auth_value(&challenge.nonce), + quote_auth_value(uri), + response + ); + if let Some(qop) = &challenge.qop { + header.push_str(&format!( + ", qop={}, nc={nonce_count:08x}, cnonce=\"{}\"", + quote_auth_value(qop), + quote_auth_value(cnonce) + )); + } + if let Some(opaque) = &challenge.opaque { + header.push_str(&format!(", opaque=\"{}\"", quote_auth_value(opaque))); + } + header +} + +fn quote_auth_value(value: &str) -> String { + value.replace('\\', "\\\\").replace('"', "\\\"") +} + +fn md5_hex(input: impl AsRef<[u8]>) -> String { + format!("{:x}", Md5::digest(input)) +} + +fn make_cnonce() -> String { + let nanos = SystemTime::now() + .duration_since(UNIX_EPOCH) + 
.map(|duration| duration.as_nanos()) + .unwrap_or_default(); + format!("{nanos:032x}") +} + +fn read_rtsp_response(reader: &mut impl Read) -> Result { + read_rtsp_response_with_header_prefix(reader, Vec::new()) +} + +fn read_rtsp_response_with_initial_byte( + reader: &mut impl Read, + initial_byte: u8, +) -> Result { + read_rtsp_response_with_header_prefix(reader, vec![initial_byte]) +} + +fn read_rtsp_response_with_header_prefix( + reader: &mut impl Read, + mut header: Vec, +) -> Result { + let mut byte = [0u8; 1]; + loop { + reader.read_exact(&mut byte).map_err(RtspSourceError::Io)?; + header.push(byte[0]); + if header.ends_with(b"\r\n\r\n") { + break; + } + if header.len() > MAX_RTSP_HEADER_BYTES { + return Err(RtspSourceError::InvalidResponse("header too large")); + } + } + + let header_text = + str::from_utf8(&header).map_err(|_| RtspSourceError::InvalidResponse("header UTF-8"))?; + let mut lines = header_text.trim_end_matches("\r\n\r\n").split("\r\n"); + let status_line = + lines.next().ok_or(RtspSourceError::InvalidResponse("missing status line"))?; + let mut status_parts = status_line.splitn(3, ' '); + if status_parts.next() != Some("RTSP/1.0") { + return Err(RtspSourceError::InvalidResponse("unsupported version")); + } + let status_code = status_parts + .next() + .ok_or(RtspSourceError::InvalidResponse("missing status code"))? 
+ .parse() + .map_err(|_| RtspSourceError::InvalidResponse("invalid status code"))?; + let reason = status_parts.next().unwrap_or_default().to_owned(); + + let mut headers = Vec::new(); + for line in lines { + let Some((name, value)) = line.split_once(':') else { + return Err(RtspSourceError::InvalidResponse("malformed header")); + }; + headers.push((name.trim().to_owned(), value.trim().to_owned())); + } + + let content_length = headers + .iter() + .find(|(name, _)| name.eq_ignore_ascii_case("content-length")) + .map(|(_, value)| value.parse::()) + .transpose() + .map_err(|_| RtspSourceError::InvalidResponse("invalid content length"))? + .unwrap_or(0); + let mut body = vec![0; content_length]; + if content_length > 0 { + reader.read_exact(&mut body).map_err(RtspSourceError::Io)?; + } + + Ok(RtspResponse { status_code, reason, headers, body }) +} + +#[derive(Debug, Clone, PartialEq, Eq)] +struct SdpVideoTrack { + codec: EncodedVideoCodec, + payload_type: u8, + clock_rate: u32, + control_url: String, +} + +#[derive(Debug, Clone, Default)] +struct PartialSdpVideoTrack { + payload_types: Vec, + rtp_maps: Vec, + control: Option, +} + +#[derive(Debug, Clone, PartialEq, Eq)] +struct SdpRtpMap { + payload_type: u8, + codec: EncodedVideoCodec, + clock_rate: u32, +} + +fn parse_sdp_video_track( + base_url: &RtspUrl, + sdp: &str, + expected_codec: Option, +) -> Result { + let mut tracks = Vec::new(); + let mut current = None; + + for line in sdp.lines().map(str::trim).filter(|line| !line.is_empty()) { + if let Some(media) = line.strip_prefix("m=") { + if let Some(track) = current.take() { + tracks.push(track); + } + if let Some(video) = media.strip_prefix("video ") { + current = Some(parse_video_media(video)); + } + continue; + } + + let Some(track) = current.as_mut() else { + continue; + }; + if let Some(control) = line.strip_prefix("a=control:") { + track.control = Some(control.trim().to_owned()); + } else if let Some(rtpmap) = line.strip_prefix("a=rtpmap:") { + if let 
Some(rtp_map) = parse_rtpmap(rtpmap) { + track.rtp_maps.push(rtp_map); + } + } + } + if let Some(track) = current { + tracks.push(track); + } + + for track in tracks { + for payload_type in &track.payload_types { + let Some(rtp_map) = track.rtp_maps.iter().find(|map| map.payload_type == *payload_type) + else { + continue; + }; + if let Some(expected) = expected_codec { + if rtp_map.codec != expected { + return Err(RtspSourceError::CodecMismatch { expected, actual: rtp_map.codec }); + } + } + + return Ok(SdpVideoTrack { + codec: rtp_map.codec, + payload_type: *payload_type, + clock_rate: rtp_map.clock_rate, + control_url: resolve_control_url(base_url, track.control.as_deref()), + }); + } + } + + Err(RtspSourceError::MissingVideoTrack) +} + +fn parse_video_media(media: &str) -> PartialSdpVideoTrack { + let payload_types = media + .split_whitespace() + .skip(2) + .filter_map(|payload_type| payload_type.parse().ok()) + .collect(); + PartialSdpVideoTrack { payload_types, ..Default::default() } +} + +fn parse_rtpmap(rtpmap: &str) -> Option { + let (payload_type, encoding) = rtpmap.trim().split_once(' ')?; + let payload_type = payload_type.parse().ok()?; + let mut encoding_parts = encoding.split('/'); + let codec_name = encoding_parts.next()?; + let codec = parse_sdp_codec(codec_name)?; + let clock_rate = encoding_parts + .next() + .and_then(|clock_rate| clock_rate.parse().ok()) + .unwrap_or(DEFAULT_RTSP_CLOCK_RATE); + Some(SdpRtpMap { payload_type, codec, clock_rate }) +} + +fn parse_sdp_codec(codec_name: &str) -> Option { + if codec_name.eq_ignore_ascii_case("H264") { + Some(EncodedVideoCodec::H264) + } else if codec_name.eq_ignore_ascii_case("H265") || codec_name.eq_ignore_ascii_case("HEVC") { + Some(EncodedVideoCodec::H265) + } else { + None + } +} + +fn resolve_control_url(base_url: &RtspUrl, control: Option<&str>) -> String { + let Some(control) = control.map(str::trim).filter(|control| !control.is_empty()) else { + return base_url.original.clone(); + }; + if 
control == "*" { + return base_url.original.clone(); + } + if control.starts_with("rtsp://") { + return control.to_owned(); + } + if control.starts_with('/') { + return format!("rtsp://{}{}", base_url.authority, control); + } + format!("{}/{}", base_url.original.trim_end_matches('/'), control) +} + +fn parse_session_id(session_header: &str) -> Result { + let session_id = session_header.split(';').next().unwrap_or_default().trim(); + if session_id.is_empty() { + return Err(RtspSourceError::InvalidResponse("empty session id")); + } + Ok(session_id.to_owned()) +} + +fn parse_session_timeout_secs(session_header: &str) -> Option { + session_header.split(';').skip(1).find_map(|part| { + let (name, value) = part.trim().split_once('=')?; + if name.trim().eq_ignore_ascii_case("timeout") { + value.trim().parse().ok() + } else { + None + } + }) +} + +fn parse_interleaved_channel(transport_header: Option<&str>) -> u8 { + let Some(transport_header) = transport_header else { + return 0; + }; + for part in transport_header.split(';') { + let Some(value) = part.trim().strip_prefix("interleaved=") else { + continue; + }; + if let Some(first) = value.split('-').next().and_then(|channel| channel.parse().ok()) { + return first; + } + } + 0 +} + #[cfg(test)] mod tests { - use std::io::Cursor; + use std::{ + io::{Cursor, Write}, + net::TcpListener, + thread, + }; use super::*; @@ -197,4 +1119,289 @@ mod tests { assert_eq!(access_unit.payload.as_ref(), &[0, 0, 0, 1, 0x65, 1, 2]); assert!(source.next_access_unit().unwrap().is_none()); } + + #[test] + fn skips_rtsp_response_between_interleaved_frames() { + let packet = rtp_packet(10, 12_000, true, &[0x65, 1, 2]); + let mut stream = Vec::new(); + write_status_response(&mut stream, 4, &[], &[], 200, "OK"); + stream.extend_from_slice(&interleaved(0, &packet)); + let config = RtspInterleavedSourceConfig { + codec: EncodedVideoCodec::H264, + clock_rate: 90_000, + video_channel: 0, + start_timestamp_us: 0, + width: 640, + height: 480, + }; + let 
mut source = RtspInterleavedRtpSource::new(Cursor::new(stream), config).unwrap(); + + let access_unit = source.next_access_unit().unwrap().unwrap(); + + assert_eq!(access_unit.payload.as_ref(), &[0, 0, 0, 1, 0x65, 1, 2]); + assert!(source.next_access_unit().unwrap().is_none()); + } + + #[test] + fn parses_sdp_video_track() { + let base_url = RtspUrl::parse("rtsp://camera.example/live").unwrap(); + let sdp = "\ +v=0\r\n\ +m=video 0 RTP/AVP 96\r\n\ +a=control:trackID=1\r\n\ +a=rtpmap:96 H264/90000\r\n"; + + let track = parse_sdp_video_track(&base_url, sdp, Some(EncodedVideoCodec::H264)).unwrap(); + + assert_eq!(track.codec, EncodedVideoCodec::H264); + assert_eq!(track.payload_type, 96); + assert_eq!(track.clock_rate, 90_000); + assert_eq!(track.control_url, "rtsp://camera.example/live/trackID=1"); + } + + #[test] + fn resolves_absolute_path_control_url() { + let base_url = RtspUrl::parse("rtsp://camera.example/live").unwrap(); + assert_eq!( + resolve_control_url(&base_url, Some("/stream/trackID=1")), + "rtsp://camera.example/stream/trackID=1" + ); + } + + #[test] + fn parses_session_timeout() { + assert_eq!(parse_session_timeout_secs("abc123;timeout=60"), Some(60)); + assert_eq!(parse_session_timeout_secs("abc123; Timeout = 30"), Some(30)); + assert_eq!(parse_session_timeout_secs("abc123"), None); + } + + #[test] + fn parses_credentials_but_strips_them_from_request_url() { + let url = RtspUrl::parse("rtsp://admin:secret@camera.example:554/live").unwrap(); + + assert_eq!(url.original, "rtsp://camera.example:554/live"); + assert_eq!(url.authority, "camera.example:554"); + assert_eq!( + url.credentials, + Some(RtspCredentials { username: "admin".to_owned(), password: "secret".to_owned() }) + ); + } + + #[test] + fn builds_digest_authorization_with_qop_auth() { + let credentials = RtspCredentials { + username: "Mufasa".to_owned(), + password: "Circle Of Life".to_owned(), + }; + let challenge = DigestAuthChallenge { + realm: "testrealm@host.com".to_owned(), + nonce: 
"dcd98b7102dd2f0e8b11d0f600bfb0c093".to_owned(), + opaque: Some("5ccc069c403ebaf9f0171e9517f40e41".to_owned()), + qop: Some("auth".to_owned()), + }; + + let authorization = build_digest_authorization( + &credentials, + &challenge, + "GET", + "/dir/index.html", + 1, + "0a4f113b", + ); + + assert!(authorization.contains("response=\"6629fae49393a05397450978507c4ef1\"")); + assert!(authorization.contains("qop=auth")); + assert!(authorization.contains("nc=00000001")); + } + + #[test] + fn sends_rtsp_keepalive_when_due() { + let listener = TcpListener::bind("127.0.0.1:0").unwrap(); + let addr = listener.local_addr().unwrap(); + let server = thread::spawn(move || { + let (mut stream, _) = listener.accept().unwrap(); + read_request(&mut stream) + }); + let mut client = std::net::TcpStream::connect(addr).unwrap(); + let mut keepalive = RtspKeepalive::new( + "rtsp://camera.example/live".to_owned(), + "camera.example:554".to_owned(), + "abc123".to_owned(), + 4, + RtspAuthContext::new(None), + Some(2), + ); + keepalive.next_due = Instant::now() - Duration::from_secs(1); + + keepalive.maybe_send(&mut client).unwrap(); + let request = server.join().unwrap(); + + assert!(request.starts_with("OPTIONS rtsp://camera.example/live RTSP/1.0")); + assert!(request.contains("CSeq: 4")); + assert!(request.contains("Session: abc123")); + } + + #[test] + fn connects_and_reads_rtsp_access_unit() { + let listener = TcpListener::bind("127.0.0.1:0").unwrap(); + let addr = listener.local_addr().unwrap(); + let server = thread::spawn(move || { + let (mut stream, _) = listener.accept().unwrap(); + let describe = read_request(&mut stream); + assert!(describe.starts_with("DESCRIBE rtsp://")); + let sdp = "\ +v=0\r\n\ +m=video 0 RTP/AVP 96\r\n\ +a=control:trackID=0\r\n\ +a=rtpmap:96 H264/90000\r\n"; + write_response( + &mut stream, + 1, + &[("Content-Type", "application/sdp"), ("Content-Length", &sdp.len().to_string())], + sdp.as_bytes(), + ); + + let setup = read_request(&mut stream); + 
assert!(setup.starts_with("SETUP rtsp://")); + assert!(setup.contains("Transport: RTP/AVP/TCP;unicast;interleaved=0-1")); + write_response( + &mut stream, + 2, + &[ + ("Session", "abc123;timeout=60"), + ("Transport", "RTP/AVP/TCP;unicast;interleaved=2-3"), + ], + &[], + ); + + let play = read_request(&mut stream); + assert!(play.starts_with("PLAY rtsp://")); + assert!(play.contains("Session: abc123")); + write_response(&mut stream, 3, &[], &[]); + + let packet = rtp_packet(10, 12_000, true, &[0x65, 1, 2]); + stream.write_all(&interleaved(2, &packet)).unwrap(); + }); + + let options = RtspSourceOptions::new(640, 480) + .with_expected_codec(EncodedVideoCodec::H264) + .with_start_timestamp_us(0); + let mut source = + RtspEncodedSource::connect(&format!("rtsp://{addr}/camera"), options).unwrap(); + assert_eq!(source.session_info().codec, EncodedVideoCodec::H264); + assert_eq!(source.session_info().video_channel, 2); + assert_eq!(source.session_info().session_id, "abc123"); + + let access_unit = source.next_access_unit().unwrap().unwrap(); + assert_eq!(access_unit.payload.as_ref(), &[0, 0, 0, 1, 0x65, 1, 2]); + server.join().unwrap(); + } + + #[test] + fn connects_with_rtsp_digest_auth() { + let listener = TcpListener::bind("127.0.0.1:0").unwrap(); + let addr = listener.local_addr().unwrap(); + let server = thread::spawn(move || { + let (mut stream, _) = listener.accept().unwrap(); + let first_describe = read_request(&mut stream); + assert!(first_describe.starts_with(&format!("DESCRIBE rtsp://{addr}/camera"))); + assert!(!first_describe.contains("Authorization:")); + write_status_response( + &mut stream, + 1, + &[("WWW-Authenticate", "Digest realm=\"camera\", nonce=\"abcdef\", qop=\"auth\"")], + &[], + 401, + "Unauthorized", + ); + + let second_describe = read_request(&mut stream); + assert!(second_describe.starts_with(&format!("DESCRIBE rtsp://{addr}/camera"))); + assert!(!second_describe.contains("admin:secret@")); + assert!(second_describe.contains("Authorization: 
Digest username=\"admin\"")); + assert!(second_describe.contains(&format!("uri=\"rtsp://{addr}/camera\""))); + assert!(second_describe.contains("qop=auth")); + let sdp = "\ +v=0\r\n\ +m=video 0 RTP/AVP 96\r\n\ +a=control:trackID=0\r\n\ +a=rtpmap:96 H264/90000\r\n"; + write_status_response( + &mut stream, + 2, + &[("Content-Type", "application/sdp"), ("Content-Length", &sdp.len().to_string())], + sdp.as_bytes(), + 200, + "OK", + ); + + let setup = read_request(&mut stream); + assert!(setup.contains("Authorization: Digest username=\"admin\"")); + write_status_response( + &mut stream, + 3, + &[ + ("Session", "abc123;timeout=60"), + ("Transport", "RTP/AVP/TCP;unicast;interleaved=0-1"), + ], + &[], + 200, + "OK", + ); + + let play = read_request(&mut stream); + assert!(play.contains("Authorization: Digest username=\"admin\"")); + write_status_response(&mut stream, 4, &[], &[], 200, "OK"); + + let packet = rtp_packet(10, 12_000, true, &[0x65, 1, 2]); + stream.write_all(&interleaved(0, &packet)).unwrap(); + }); + + let options = RtspSourceOptions::new(640, 480) + .with_expected_codec(EncodedVideoCodec::H264) + .with_start_timestamp_us(0); + let mut source = + RtspEncodedSource::connect(&format!("rtsp://admin:secret@{addr}/camera"), options) + .unwrap(); + + let access_unit = source.next_access_unit().unwrap().unwrap(); + assert_eq!(access_unit.payload.as_ref(), &[0, 0, 0, 1, 0x65, 1, 2]); + server.join().unwrap(); + } + + fn read_request(stream: &mut impl Read) -> String { + let mut request = Vec::new(); + let mut byte = [0u8; 1]; + loop { + stream.read_exact(&mut byte).unwrap(); + request.push(byte[0]); + if request.ends_with(b"\r\n\r\n") { + break; + } + } + String::from_utf8(request).unwrap() + } + + fn write_response(stream: &mut impl Write, cseq: u32, headers: &[(&str, &str)], body: &[u8]) { + write_status_response(stream, cseq, headers, body, 200, "OK"); + } + + fn write_status_response( + stream: &mut impl Write, + cseq: u32, + headers: &[(&str, &str)], + body: 
&[u8], + status_code: u16, + reason: &str, + ) { + write!(stream, "RTSP/1.0 {status_code} {reason}\r\nCSeq: {cseq}\r\n").unwrap(); + for (name, value) in headers { + write!(stream, "{name}: {value}\r\n").unwrap(); + } + write!(stream, "\r\n").unwrap(); + if !body.is_empty() { + stream.write_all(body).unwrap(); + } + stream.flush().unwrap(); + } } diff --git a/livekit-capture/src/sources/tcp.rs b/livekit-capture/src/sources/tcp.rs index 173a007ba..962f73658 100644 --- a/livekit-capture/src/sources/tcp.rs +++ b/livekit-capture/src/sources/tcp.rs @@ -14,7 +14,7 @@ use std::{ io::{self, Read}, - net::TcpStream, + net::{SocketAddr, TcpListener, TcpStream, ToSocketAddrs}, }; use thiserror::Error; @@ -66,12 +66,19 @@ impl ByteStreamSourceConfig { read_chunk_size: DEFAULT_CHUNK_SIZE, } } + + /// Sets the read chunk size used for Annex-B byte streams. + pub fn with_read_chunk_size(mut self, read_chunk_size: usize) -> Self { + self.read_chunk_size = read_chunk_size.max(1); + self + } } /// Encoded source backed by any blocking byte stream. #[derive(Debug)] pub struct ByteStreamEncodedSource { reader: R, + config: ByteStreamSourceConfig, parser: ByteStreamParser, read_chunk: Vec, eof: bool, @@ -127,7 +134,18 @@ where } }; - Ok(Self { reader, parser, read_chunk: vec![0; config.read_chunk_size.max(1)], eof: false }) + Ok(Self { + reader, + config, + parser, + read_chunk: vec![0; config.read_chunk_size.max(1)], + eof: false, + }) + } + + /// Returns the source configuration. + pub fn config(&self) -> ByteStreamSourceConfig { + self.config } /// Returns the wrapped reader. @@ -200,6 +218,79 @@ where } } +impl ByteStreamEncodedSource { + /// Connects to a TCP producer and parses the declared encoded wire format. + pub fn connect( + addr: A, + config: ByteStreamSourceConfig, + ) -> Result { + let stream = TcpStream::connect(addr).map_err(TcpSourceError::Io)?; + Self::new(stream, config) + } + + /// Creates a TCP encoded source from an already connected stream. 
+ pub fn from_tcp_stream( + stream: TcpStream, + config: ByteStreamSourceConfig, + ) -> Result { + Self::new(stream, config) + } +} + +/// TCP listener for producer-initiated encoded byte streams. +#[derive(Debug)] +pub struct TcpEncodedListener { + listener: TcpListener, + config: ByteStreamSourceConfig, +} + +impl TcpEncodedListener { + /// Binds a TCP listener for encoded byte-stream producers. + pub fn bind( + addr: A, + config: ByteStreamSourceConfig, + ) -> Result { + let listener = TcpListener::bind(addr).map_err(TcpSourceError::Io)?; + Ok(Self { listener, config }) + } + + /// Creates an encoded listener from an existing [`TcpListener`]. + pub fn from_listener(listener: TcpListener, config: ByteStreamSourceConfig) -> Self { + Self { listener, config } + } + + /// Returns the listener configuration. + pub fn config(&self) -> ByteStreamSourceConfig { + self.config + } + + /// Returns the bound local socket address. + pub fn local_addr(&self) -> Result { + self.listener.local_addr().map_err(TcpSourceError::Io) + } + + /// Returns the wrapped TCP listener. + pub fn listener(&self) -> &TcpListener { + &self.listener + } + + /// Returns the wrapped TCP listener mutably. + pub fn listener_mut(&mut self) -> &mut TcpListener { + &mut self.listener + } + + /// Accepts one producer connection and returns it as a TCP encoded source. + pub fn accept(&self) -> Result { + self.accept_with_peer().map(|(source, _peer)| source) + } + + /// Accepts one producer connection and returns the source plus peer address. 
+ pub fn accept_with_peer(&self) -> Result<(TcpEncodedSource, SocketAddr), TcpSourceError> { + let (stream, peer) = self.listener.accept().map_err(TcpSourceError::Io)?; + Ok((TcpEncodedSource::from_tcp_stream(stream, self.config)?, peer)) + } +} + impl EncodedAccessUnitSource for ByteStreamEncodedSource where R: Read + Send + Sync + 'static, @@ -249,7 +340,11 @@ fn read_exact_or_clean_eof(reader: &mut impl Read, buf: &mut [u8]) -> io::Result #[cfg(test)] mod tests { - use std::io::Cursor; + use std::{ + io::{Cursor, Write}, + net::{Shutdown, TcpListener as StdTcpListener, TcpStream as StdTcpStream}, + thread, + }; use super::*; @@ -271,12 +366,18 @@ mod tests { bytes } + fn annex_b_stream() -> Vec { + vec![0, 0, 1, 0x09, 0x10, 0, 0, 1, 0x65, 1, 2, 0, 0, 1, 0x09, 0x10, 0, 0, 1, 0x41, 3] + } + + fn annex_b_config() -> ByteStreamSourceConfig { + ByteStreamSourceConfig::new(EncodedWireFormat::H264AnnexB, 0, 33_333, 640, 480) + } + #[test] fn reads_annex_b_access_units() { - let stream = - [0, 0, 1, 0x09, 0x10, 0, 0, 1, 0x65, 1, 2, 0, 0, 1, 0x09, 0x10, 0, 0, 1, 0x41, 3]; - let config = - ByteStreamSourceConfig::new(EncodedWireFormat::H264AnnexB, 0, 33_333, 640, 480); + let stream = annex_b_stream(); + let config = annex_b_config(); let mut source = ByteStreamEncodedSource::new(Cursor::new(stream), config).unwrap(); let first = source.next_access_unit().unwrap().unwrap(); @@ -286,6 +387,47 @@ mod tests { assert!(source.next_access_unit().unwrap().is_none()); } + #[test] + fn tcp_connect_reads_annex_b_access_units() { + let listener = StdTcpListener::bind("127.0.0.1:0").unwrap(); + let addr = listener.local_addr().unwrap(); + let writer = thread::spawn(move || { + let (mut stream, _) = listener.accept().unwrap(); + stream.write_all(&annex_b_stream()).unwrap(); + stream.shutdown(Shutdown::Write).unwrap(); + }); + + let mut source = TcpEncodedSource::connect(addr, annex_b_config()).unwrap(); + let first = source.next_access_unit().unwrap().unwrap(); + 
assert_eq!(first.payload.as_ref(), &[0, 0, 1, 0x09, 0x10, 0, 0, 1, 0x65, 1, 2]); + let second = source.next_access_unit().unwrap().unwrap(); + assert_eq!(second.payload.as_ref(), &[0, 0, 1, 0x09, 0x10, 0, 0, 1, 0x41, 3]); + assert!(source.next_access_unit().unwrap().is_none()); + writer.join().unwrap(); + } + + #[test] + fn tcp_listener_accepts_annex_b_source() { + let listener = TcpEncodedListener::bind("127.0.0.1:0", annex_b_config()).unwrap(); + let addr = listener.local_addr().unwrap(); + let writer = thread::spawn(move || { + let mut stream = StdTcpStream::connect(addr).unwrap(); + stream.write_all(&annex_b_stream()).unwrap(); + stream.shutdown(Shutdown::Write).unwrap(); + }); + + let (mut source, peer) = listener.accept_with_peer().unwrap(); + assert_eq!(peer.ip(), addr.ip()); + assert_eq!(source.config(), annex_b_config()); + + let first = source.next_access_unit().unwrap().unwrap(); + assert_eq!(first.payload.as_ref(), &[0, 0, 1, 0x09, 0x10, 0, 0, 1, 0x65, 1, 2]); + let second = source.next_access_unit().unwrap().unwrap(); + assert_eq!(second.payload.as_ref(), &[0, 0, 1, 0x09, 0x10, 0, 0, 1, 0x41, 3]); + assert!(source.next_access_unit().unwrap().is_none()); + writer.join().unwrap(); + } + #[test] fn reads_rfc4571_rtp_access_unit() { let packet = rtp_packet(10, 12_000, true, &[0x65, 1, 2]); diff --git a/webrtc-sys/src/video_track.cpp b/webrtc-sys/src/video_track.cpp index af8866b40..757d4268a 100644 --- a/webrtc-sys/src/video_track.cpp +++ b/webrtc-sys/src/video_track.cpp @@ -197,7 +197,8 @@ bool VideoTrackSource::InternalSource::on_captured_frame( return false; } - if (adapted_width != frame.width() || adapted_height != frame.height()) { + if ((adapted_width != frame.width() || adapted_height != frame.height()) && + buffer->type() != webrtc::VideoFrameBuffer::Type::kNative) { buffer = buffer->CropAndScale(crop_x, crop_y, crop_width, crop_height, adapted_width, adapted_height); } From 22d567e4106c75ef67e7fc192ea775ced429f14f Mon Sep 17 00:00:00 2001 From: 
David Chen Date: Wed, 24 Jun 2026 14:42:55 -0700 Subject: [PATCH 09/24] don't touch frame metadata from capture crate --- .changeset/livekit-capture-preencoded.md | 2 +- livekit-capture/src/dmabuf.rs | 6 +-- livekit-capture/src/encoded.rs | 12 +---- livekit-capture/src/lib.rs | 7 +-- livekit-capture/src/metadata.rs | 33 -------------- livekit-capture/src/platform/avfoundation.rs | 11 ++--- livekit-capture/src/source.rs | 46 ++++---------------- livekit-capture/src/sources/argus.rs | 44 ++++++------------- livekit-capture/src/sources/v4l.rs | 28 +++--------- livekit-capture/src/track.rs | 5 +-- 10 files changed, 38 insertions(+), 156 deletions(-) delete mode 100644 livekit-capture/src/metadata.rs diff --git a/.changeset/livekit-capture-preencoded.md b/.changeset/livekit-capture-preencoded.md index c852c1bb4..809b9b152 100644 --- a/.changeset/livekit-capture-preencoded.md +++ b/.changeset/livekit-capture-preencoded.md @@ -5,4 +5,4 @@ "webrtc-sys": patch --- -Add a `livekit-capture` crate with codec-neutral capture types, H264/H265 passthrough support, common encoded ingress helpers, TCP byte-stream encoded ingress, RTSP-over-TCP encoded ingress, GStreamer appsink encoded ingress, macOS AVFoundation decoded-frame capture, Linux V4L capture, and Jetson libargus capture hooks. The `local_video` examples now open platform camera capture through `livekit-capture` instead of depending on Nokhwa directly, and a `preencode_publish` example demonstrates publishing H264/H265 Annex-B TCP or RTSP streams as pre-encoded video tracks. +Add a `livekit-capture` crate with codec-neutral capture types, H264/H265 passthrough support, common encoded ingress helpers, TCP byte-stream encoded ingress, RTSP-over-TCP encoded ingress, GStreamer appsink encoded ingress, macOS AVFoundation decoded-frame capture, Linux V4L capture, and Jetson libargus capture hooks.
The capture crate reports capture-origin timing such as optional sensor timestamps, while packet-trailer frame metadata remains a publishing concern. The `local_video` examples now open platform camera capture through `livekit-capture` instead of depending on Nokhwa directly, and a `preencode_publish` example demonstrates publishing H264/H265 Annex-B TCP or RTSP streams as pre-encoded video tracks. diff --git a/livekit-capture/src/dmabuf.rs b/livekit-capture/src/dmabuf.rs index 8410582f0..041f2c545 100644 --- a/livekit-capture/src/dmabuf.rs +++ b/livekit-capture/src/dmabuf.rs @@ -12,8 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -use crate::metadata::FrameMetadata; - /// DMA-BUF pixel format. #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum DmaBufPixelFormat { @@ -59,6 +57,6 @@ pub struct DmaBufFrame { pub modifier: Option, /// Capture timestamp in microseconds. pub timestamp_us: i64, - /// Optional packet-trailer metadata. - pub metadata: FrameMetadata, + /// Sensor timestamp translated to UNIX-epoch microseconds, when available. + pub sensor_timestamp_us: Option, } diff --git a/livekit-capture/src/encoded.rs b/livekit-capture/src/encoded.rs index cf7214150..465fdb493 100644 --- a/livekit-capture/src/encoded.rs +++ b/livekit-capture/src/encoded.rs @@ -24,7 +24,7 @@ use livekit::{ }, }; -use crate::{error::CaptureError, metadata::FrameMetadata}; +use crate::error::CaptureError; const ANNEX_B_START_CODE: [u8; 4] = [0, 0, 0, 1]; @@ -196,8 +196,6 @@ pub struct EncodedAccessUnit<'a> { pub layers: EncodedLayerInfo, /// Optional codec-specific metadata. pub codec_specific: CodecSpecific, - /// Optional packet-trailer metadata. - pub metadata: FrameMetadata, } /// Owned encoded video access unit. @@ -219,8 +217,6 @@ pub struct OwnedEncodedAccessUnit { pub layers: EncodedLayerInfo, /// Optional codec-specific metadata. pub codec_specific: CodecSpecific, - /// Optional packet-trailer metadata. 
- pub metadata: FrameMetadata, } impl OwnedEncodedAccessUnit { @@ -242,7 +238,6 @@ impl OwnedEncodedAccessUnit { height, layers: EncodedLayerInfo::default(), codec_specific: CodecSpecific::None, - metadata: FrameMetadata::default(), } } @@ -257,7 +252,6 @@ impl OwnedEncodedAccessUnit { height: self.height, layers: self.layers, codec_specific: self.codec_specific.clone(), - metadata: self.metadata, } } @@ -272,7 +266,6 @@ impl OwnedEncodedAccessUnit { height: access_unit.height, layers: access_unit.layers, codec_specific: access_unit.codec_specific.clone(), - metadata: access_unit.metadata, } } } @@ -296,7 +289,6 @@ impl<'a> EncodedAccessUnit<'a> { height, layers: EncodedLayerInfo::default(), codec_specific: CodecSpecific::None, - metadata: FrameMetadata::default(), } } @@ -326,7 +318,6 @@ impl<'a> EncodedAccessUnit<'a> { codec_specific: CodecSpecific::H264 { packetization_mode: H264PacketizationMode::NonInterleaved, }, - metadata: FrameMetadata::default(), }) } @@ -354,7 +345,6 @@ impl<'a> EncodedAccessUnit<'a> { height, layers: EncodedLayerInfo::default(), codec_specific: CodecSpecific::H265, - metadata: FrameMetadata::default(), }) } } diff --git a/livekit-capture/src/lib.rs b/livekit-capture/src/lib.rs index dfcc9ef83..d83d9d4fa 100644 --- a/livekit-capture/src/lib.rs +++ b/livekit-capture/src/lib.rs @@ -18,7 +18,6 @@ pub mod device; pub mod dmabuf; pub mod encoded; mod error; -pub mod metadata; pub mod platform; pub mod source; pub mod sources; @@ -38,10 +37,8 @@ pub use encoded::{ OwnedEncodedAccessUnit, }; pub use error::CaptureError; -pub use metadata::FrameMetadata; pub use source::{ - CaptureFrame, CaptureFrameSource, CaptureMetadataOptions, CaptureSourceError, - CaptureSourceOptions, CaptureTimestampSource, EncodedCaptureFrameSource, - EncodedFrameSourceError, RawVideoFrame, VideoCaptureSource, + CaptureFrame, CaptureFrameSource, CaptureSourceError, CaptureSourceOptions, + EncodedCaptureFrameSource, EncodedFrameSourceError, RawVideoFrame, 
VideoCaptureSource, }; pub use track::VideoCaptureTrack; diff --git a/livekit-capture/src/metadata.rs b/livekit-capture/src/metadata.rs deleted file mode 100644 index 6eac32db8..000000000 --- a/livekit-capture/src/metadata.rs +++ /dev/null @@ -1,33 +0,0 @@ -// Copyright 2026 LiveKit, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -/// Packet-trailer metadata associated with a captured frame. -#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)] -pub struct FrameMetadata { - /// Wall-clock capture timestamp in microseconds. - pub user_timestamp: Option, - /// Monotonically increasing frame identifier. - pub frame_id: Option, -} - -impl FrameMetadata { - pub(crate) fn into_rtc(self) -> Option { - (self.user_timestamp.is_some() || self.frame_id.is_some()).then_some( - livekit::webrtc::video_frame::FrameMetadata { - user_timestamp: self.user_timestamp, - frame_id: self.frame_id, - }, - ) - } -} diff --git a/livekit-capture/src/platform/avfoundation.rs b/livekit-capture/src/platform/avfoundation.rs index 202ba137a..92b595e71 100644 --- a/livekit-capture/src/platform/avfoundation.rs +++ b/livekit-capture/src/platform/avfoundation.rs @@ -65,6 +65,8 @@ pub struct AvFoundationFrame { pub capture_wall_time_us: u64, /// Wall-clock timestamp recorded after the frame was read from AVFoundation. pub read_wall_time_us: u64, + /// Sensor timestamp translated to UNIX-epoch microseconds, when available. 
+ pub sensor_timestamp_us: Option, /// Whether conversion from the source format to I420 was needed. pub used_conversion: bool, } @@ -491,8 +493,6 @@ mod macos { CaptureDeviceSelector, CaptureFormat, CaptureFormatRequest, CaptureFrameFormat, CaptureResolution, }; - use crate::metadata::FrameMetadata; - pub(super) struct SessionInner { session: Retained, _input: Retained, @@ -1056,11 +1056,7 @@ mod macos { let frame = VideoFrame { rotation: VideoRotation::VideoRotation0, timestamp_us: shared.timestamp_us(), - frame_metadata: FrameMetadata { - user_timestamp: Some(capture_wall_time_us), - frame_id: None, - } - .into_rtc(), + frame_metadata: None, buffer, }; @@ -1069,6 +1065,7 @@ mod macos { source_format, capture_wall_time_us, read_wall_time_us, + sensor_timestamp_us: None, used_conversion: source_format != CaptureFrameFormat::I420, }); Ok(()) diff --git a/livekit-capture/src/source.rs b/livekit-capture/src/source.rs index 284b664fe..f2b006c72 100644 --- a/livekit-capture/src/source.rs +++ b/livekit-capture/src/source.rs @@ -28,28 +28,6 @@ use crate::{ track::VideoCaptureTrack, }; -/// Capture timestamp metadata attached to frames by high-level source options. -#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)] -#[non_exhaustive] -pub enum CaptureTimestampSource { - /// Do not attach a user timestamp. - #[default] - None, - /// Attach the wall-clock timestamp observed by the source wrapper. - WallClock, - /// Attach the backend-provided sensor/capture timestamp when available. - Backend, -} - -/// Metadata options shared by high-level capture sources. -#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)] -pub struct CaptureMetadataOptions { - /// Timestamp source to attach as [`crate::FrameMetadata::user_timestamp`]. - pub timestamp: CaptureTimestampSource, - /// Whether to attach a monotonically increasing frame id. - pub frame_id: bool, -} - /// Options used by [`VideoCaptureSource::open`]. 
#[derive(Debug, Clone, PartialEq, Eq)] pub struct CaptureSourceOptions { @@ -59,8 +37,6 @@ pub struct CaptureSourceOptions { pub device: CaptureDeviceSelector, /// Format requested from the backend. pub format: CaptureFormatRequest, - /// Metadata to attach when the backend supports it. - pub metadata: CaptureMetadataOptions, /// Whether the resulting track should be marked as a screencast. pub is_screencast: bool, } @@ -71,7 +47,6 @@ impl Default for CaptureSourceOptions { backend: CaptureBackend::Auto, device: CaptureDeviceSelector::Default, format: CaptureFormatRequest::Default, - metadata: CaptureMetadataOptions::default(), is_screencast: false, } } @@ -88,6 +63,8 @@ pub struct RawVideoFrame { pub capture_wall_time_us: u64, /// Wall-clock timestamp recorded after the frame was read, in microseconds. pub read_wall_time_us: u64, + /// Sensor timestamp translated to UNIX-epoch microseconds, when available. + pub sensor_timestamp_us: Option, /// Whether the backend converted the source buffer before publishing. 
pub used_conversion: bool, } @@ -379,6 +356,7 @@ impl From for RawVideoFrame { source_format: frame.source_format, capture_wall_time_us: frame.capture_wall_time_us, read_wall_time_us: frame.read_wall_time_us, + sensor_timestamp_us: frame.sensor_timestamp_us, used_conversion: frame.used_conversion, } } @@ -410,6 +388,7 @@ impl From for RawVideoFrame { source_format: frame.source_format, capture_wall_time_us: frame.capture_wall_time_us, read_wall_time_us: frame.read_wall_time_us, + sensor_timestamp_us: frame.sensor_timestamp_us, } } } @@ -570,8 +549,6 @@ impl From for crate::sources::v4l::V4lCaptureOptions { device: options.device, format: options.format, frame_formats: crate::sources::v4l::default_frame_formats(), - attach_capture_timestamp: options.metadata.timestamp != CaptureTimestampSource::None, - attach_frame_id: options.metadata.frame_id, }; if let CaptureFormatRequest::Exact(format) | CaptureFormatRequest::Closest(format) = source_options.format @@ -654,23 +631,15 @@ impl TryFrom for crate::sources::argus::ArgusCaptureOption }); } }; - Ok(Self { - sensor_index, - format, - attach_sensor_timestamp: options.metadata.timestamp != CaptureTimestampSource::None, - attach_frame_id: options.metadata.frame_id, - }) + Ok(Self { sensor_index, format }) } } #[cfg(test)] mod tests { use super::*; + use crate::dmabuf::{DmaBufPixelFormat, DmaBufPlane}; use crate::encoded::{EncodedFrameType, EncodedVideoCodec}; - use crate::{ - dmabuf::{DmaBufPixelFormat, DmaBufPlane}, - metadata::FrameMetadata, - }; use livekit::webrtc::video_frame::VideoRotation; #[derive(Debug, Error)] @@ -731,6 +700,7 @@ mod tests { source_format: CaptureFrameFormat::I420, capture_wall_time_us: 1, read_wall_time_us: 2, + sensor_timestamp_us: None, used_conversion: false, }); assert_eq!(raw.capture_path(), CapturePath::Raw); @@ -742,7 +712,7 @@ mod tests { planes: vec![DmaBufPlane { fd: -1, offset: 0, stride: 2 }], modifier: None, timestamp_us: 0, - metadata: FrameMetadata::default(), + 
sensor_timestamp_us: None, }); assert_eq!(dmabuf.capture_path(), CapturePath::DmaBuf); diff --git a/livekit-capture/src/sources/argus.rs b/livekit-capture/src/sources/argus.rs index 18df839e9..ba365b714 100644 --- a/livekit-capture/src/sources/argus.rs +++ b/livekit-capture/src/sources/argus.rs @@ -26,10 +26,7 @@ use crate::{ }; #[cfg(livekit_capture_argus)] -use crate::{ - dmabuf::{DmaBufPixelFormat, DmaBufPlane}, - metadata::FrameMetadata, -}; +use crate::dmabuf::{DmaBufPixelFormat, DmaBufPlane}; #[cfg(livekit_capture_argus)] use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH}; #[cfg(livekit_capture_argus)] @@ -63,10 +60,6 @@ pub struct ArgusCaptureOptions { pub sensor_index: u32, /// Requested capture format. pub format: CaptureFormat, - /// Attach the Argus sensor timestamp as [`crate::FrameMetadata::user_timestamp`] when available. - pub attach_sensor_timestamp: bool, - /// Attach a monotonically increasing frame id as [`crate::FrameMetadata::frame_id`]. - pub attach_frame_id: bool, } impl ArgusCaptureOptions { @@ -75,8 +68,6 @@ impl ArgusCaptureOptions { Self { sensor_index, format: CaptureFormat::new(resolution, frame_rate, CaptureFrameFormat::Nv12), - attach_sensor_timestamp: false, - attach_frame_id: false, } } } @@ -117,6 +108,8 @@ pub struct ArgusFrame { pub dmabuf: DmaBufFrame, /// Argus sensor start timestamp in nanoseconds, when available. pub sensor_timestamp_ns: Option, + /// Argus sensor start timestamp translated to UNIX-epoch microseconds, when available. + pub sensor_timestamp_us: Option, /// Time spent waiting for `FrameConsumer::acquireFrame` to return. pub acquire_wait_ns: u64, /// Time spent copying the acquired EGLStream frame into the DMA buffer. @@ -138,8 +131,6 @@ pub struct ArgusCaptureSession { options: ArgusCaptureOptions, #[cfg(livekit_capture_argus)] started_at: Instant, - #[cfg(livekit_capture_argus)] - next_frame_id: u32, } // SAFETY: The C++ Argus session is driven by one mutable Rust owner at a time. 
@@ -213,7 +204,7 @@ impl ArgusCaptureSession { return Err(ArgusError::CreateSessionFailed); } - Ok(Self { handle, options, started_at: Instant::now(), next_frame_id: 1 }) + Ok(Self { handle, options, started_at: Instant::now() }) } #[cfg(not(livekit_capture_argus))] @@ -241,7 +232,7 @@ impl ArgusCaptureSession { } let sensor_timestamp_ns = (sensor_timestamp_ns > 0).then_some(sensor_timestamp_ns); - let metadata = self.frame_metadata(sensor_timestamp_ns); + let sensor_timestamp_us = sensor_timestamp_ns.and_then(sensor_wall_time_us); let resolution = self.options.format.resolution; let dmabuf = DmaBufFrame { width: resolution.width, @@ -250,10 +241,16 @@ impl ArgusCaptureSession { planes: vec![DmaBufPlane { fd, offset: 0, stride: resolution.width }], modifier: None, timestamp_us: elapsed_us(self.started_at.elapsed()), - metadata, + sensor_timestamp_us, }; - Ok(ArgusFrame { dmabuf, sensor_timestamp_ns, acquire_wait_ns, blit_ns }) + Ok(ArgusFrame { + dmabuf, + sensor_timestamp_ns, + sensor_timestamp_us, + acquire_wait_ns, + blit_ns, + }) } #[cfg(not(livekit_capture_argus))] @@ -271,21 +268,6 @@ impl ArgusCaptureSession { #[cfg(not(livekit_capture_argus))] fn release_frame_inner(&mut self) {} - - #[cfg(livekit_capture_argus)] - fn frame_metadata(&mut self, sensor_timestamp_ns: Option) -> FrameMetadata { - let user_timestamp = self - .options - .attach_sensor_timestamp - .then(|| sensor_timestamp_ns.and_then(sensor_wall_time_us).or_else(unix_time_us_now)) - .flatten(); - let frame_id = self.options.attach_frame_id.then(|| { - let frame_id = self.next_frame_id; - self.next_frame_id = self.next_frame_id.wrapping_add(1); - frame_id - }); - FrameMetadata { user_timestamp, frame_id } - } } impl Drop for ArgusCaptureSession { diff --git a/livekit-capture/src/sources/v4l.rs b/livekit-capture/src/sources/v4l.rs index b6311842a..9fb6c495f 100644 --- a/livekit-capture/src/sources/v4l.rs +++ b/livekit-capture/src/sources/v4l.rs @@ -38,8 +38,6 @@ use crate::device::{ 
CaptureDeviceInfo, CaptureDeviceSelector, CaptureFormat, CaptureFormatRequest, CaptureFrameFormat, CapturePath, CaptureResolution, }; -#[cfg(target_os = "linux")] -use crate::metadata::FrameMetadata; #[cfg(any(target_os = "linux", test))] const MAX_BACKEND_CAPTURE_TIMESTAMP_AGE_US: u64 = 5_000_000; @@ -53,10 +51,6 @@ pub struct V4lCaptureOptions { pub format: CaptureFormatRequest, /// Ordered source frame formats to try. pub frame_formats: Vec, - /// Attach a wall-clock capture timestamp as [`crate::FrameMetadata::user_timestamp`]. - pub attach_capture_timestamp: bool, - /// Attach a monotonically increasing frame id as [`crate::FrameMetadata::frame_id`]. - pub attach_frame_id: bool, } impl V4lCaptureOptions { @@ -74,8 +68,6 @@ impl V4lCaptureOptions { CaptureFrameFormat::Yuyv, )), frame_formats: default_frame_formats(), - attach_capture_timestamp: false, - attach_frame_id: false, } } } @@ -128,6 +120,8 @@ pub struct V4lFrame { pub capture_wall_time_us: u64, /// Wall-clock timestamp recorded after the frame was read from the camera backend. pub read_wall_time_us: u64, + /// Sensor timestamp translated to UNIX-epoch microseconds, when available. + pub sensor_timestamp_us: Option, /// Whether conversion from the source format to I420 was needed. pub used_conversion: bool, /// Whether compressed image decoding was needed before conversion. 
@@ -149,8 +143,6 @@ pub struct V4lCaptureSession { options: V4lCaptureOptions, #[cfg(target_os = "linux")] started_at: Instant, - #[cfg(target_os = "linux")] - next_frame_id: u32, } impl std::fmt::Debug for V4lCaptureSession { @@ -204,7 +196,7 @@ impl V4lCaptureSession { camera.open_stream().map_err(nokhwa_error)?; let format = capture_format_from_nokhwa(camera.camera_format())?; - Ok(Self { camera, format, options, started_at: Instant::now(), next_frame_id: 1 }) + Ok(Self { camera, format, options, started_at: Instant::now() }) } #[cfg(not(target_os = "linux"))] @@ -230,7 +222,7 @@ impl V4lCaptureSession { let mut frame = VideoFrame { rotation: VideoRotation::VideoRotation0, timestamp_us: elapsed_us(self.started_at.elapsed()), - frame_metadata: self.frame_metadata(capture_wall_time_us).into_rtc(), + frame_metadata: None, buffer: I420Buffer::new(width, height), }; let used_decode_path = convert_to_i420( @@ -248,6 +240,7 @@ impl V4lCaptureSession { backend_capture_timestamp, capture_wall_time_us, read_wall_time_us, + sensor_timestamp_us: None, used_conversion: source_format != CaptureFrameFormat::I420, used_decode_path, }) @@ -257,17 +250,6 @@ impl V4lCaptureSession { fn capture_frame_inner(&mut self) -> Result { Err(V4lError::UnsupportedPlatform) } - - #[cfg(target_os = "linux")] - fn frame_metadata(&mut self, capture_wall_time_us: u64) -> FrameMetadata { - let user_timestamp = self.options.attach_capture_timestamp.then_some(capture_wall_time_us); - let frame_id = self.options.attach_frame_id.then(|| { - let frame_id = self.next_frame_id; - self.next_frame_id = self.next_frame_id.wrapping_add(1); - frame_id - }); - FrameMetadata { user_timestamp, frame_id } - } } /// Returns Linux V4L2 capture devices. 
diff --git a/livekit-capture/src/track.rs b/livekit-capture/src/track.rs index 0e9d45806..30cf3ce75 100644 --- a/livekit-capture/src/track.rs +++ b/livekit-capture/src/track.rs @@ -60,13 +60,12 @@ impl VideoCaptureTrack { #[cfg(target_os = "linux")] pub fn capture_dmabuf(&self, frame: &DmaBufFrame) -> Result<(), CaptureError> { let plane = frame.planes.first().ok_or(CaptureError::MissingDmaBufPlane)?; - let ok = self.source.capture_dmabuf_frame_with_metadata( + let ok = self.source.capture_dmabuf_frame( plane.fd, frame.width, frame.height, frame.pixel_format.as_native(), frame.timestamp_us, - frame.metadata.into_rtc(), ); ok.then_some(()).ok_or(CaptureError::CaptureFailed) } @@ -91,7 +90,7 @@ impl VideoCaptureTrack { frame_type: access_unit.frame_type.into(), width: access_unit.width, height: access_unit.height, - frame_metadata: access_unit.metadata.into_rtc(), + frame_metadata: None, }; self.source.capture_encoded_frame(&frame).then_some(()).ok_or(CaptureError::CaptureFailed) } From e06449033cb6abf6fb082947f3d4694984fd54a6 Mon Sep 17 00:00:00 2001 From: David Chen Date: Sat, 27 Jun 2026 17:47:00 -0700 Subject: [PATCH 10/24] reduce getStat calls --- examples/local_video/src/subscriber.rs | 8 +-- examples/local_video/src/video_display.rs | 2 - examples/local_video/src/viewport_aspect.rs | 2 +- livekit-capture/src/platform/avfoundation.rs | 53 ++++++++++++++++++-- 4 files changed, 55 insertions(+), 10 deletions(-) diff --git a/examples/local_video/src/subscriber.rs b/examples/local_video/src/subscriber.rs index faeb7bf32..76f5c7175 100644 --- a/examples/local_video/src/subscriber.rs +++ b/examples/local_video/src/subscriber.rs @@ -783,6 +783,10 @@ fn update_receive_bitrate_from_stats( } } +fn stats_poll_interval() -> Duration { + Duration::from_secs(10) +} + struct TimestampAnchor { unix_timestamp_us: u64, instant: Instant, @@ -1042,7 +1046,7 @@ async fn handle_track_subscribed( let mut receive_bitrate_snapshot = None; let mut last_jitter_buffer_log = 
Instant::now().checked_sub(Duration::from_secs(5)).unwrap_or_else(Instant::now); - let mut interval = tokio::time::interval(Duration::from_secs(1)); + let mut interval = tokio::time::interval(stats_poll_interval()); interval.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip); loop { @@ -1345,8 +1349,6 @@ impl eframe::App for VideoApp { ); egui::CentralPanel::default().frame(egui::Frame::NONE).show(root_ui, |ui| { - ui.ctx().request_repaint(); - // Let the native window follow live resize, and letterbox the video instead of // programmatically resizing the window while the user is dragging it. let size = diff --git a/examples/local_video/src/video_display.rs b/examples/local_video/src/video_display.rs index 4ce1b6ae2..55c937c1f 100644 --- a/examples/local_video/src/video_display.rs +++ b/examples/local_video/src/video_display.rs @@ -661,8 +661,6 @@ impl eframe::App for VideoApp { let channel_values = self.channels.as_ref().map(|targets| drive_channels(&ctx, targets)); egui::CentralPanel::default().frame(egui::Frame::NONE).show(root_ui, |ui| { - ui.ctx().request_repaint(); - let size = viewport_aspect::fitted_video_size(ui.available_size(), self.viewport.aspect()); diff --git a/examples/local_video/src/viewport_aspect.rs b/examples/local_video/src/viewport_aspect.rs index 98551e655..67544de29 100644 --- a/examples/local_video/src/viewport_aspect.rs +++ b/examples/local_video/src/viewport_aspect.rs @@ -53,7 +53,7 @@ pub(crate) fn native_options(initial_aspect: Option) -> eframe::NativeOptio let mut wgpu_options = egui_wgpu_backend::WgpuConfiguration::default(); #[cfg(target_os = "macos")] { - wgpu_options.surface.present_mode = wgpu::PresentMode::Immediate; + wgpu_options.surface.present_mode = wgpu::PresentMode::AutoVsync; } #[cfg(not(target_os = "macos"))] { diff --git a/livekit-capture/src/platform/avfoundation.rs b/livekit-capture/src/platform/avfoundation.rs index 92b595e71..81d907d15 100644 --- a/livekit-capture/src/platform/avfoundation.rs +++ 
b/livekit-capture/src/platform/avfoundation.rs @@ -18,7 +18,7 @@ use std::sync::{ }; use std::thread::JoinHandle; -use livekit::webrtc::video_frame::{I420Buffer, VideoFrame}; +use livekit::webrtc::video_frame::{I420Buffer, VideoBuffer, VideoFrame}; use thiserror::Error; use crate::{ @@ -82,6 +82,7 @@ impl AvFoundationFrame { pub struct AvFoundationCaptureSession { format: CaptureFormat, options: AvFoundationCaptureOptions, + target_resolution: Option, #[cfg(target_os = "macos")] inner: macos::SessionInner, } @@ -133,7 +134,11 @@ impl AvFoundationCaptureSession { let inner = macos::SessionInner::new(&options)?; let mut format = inner.wait_for_format(FIRST_FRAME_TIMEOUT)?; format.frame_rate = requested_frame_rate_hint(&options.format).unwrap_or(30); - Ok(Self { format, options, inner }) + let target_resolution = requested_output_resolution(&options.format, format.resolution); + if let Some(resolution) = target_resolution { + format.resolution = resolution; + } + Ok(Self { format, options, target_resolution, inner }) } #[cfg(not(target_os = "macos"))] @@ -143,7 +148,19 @@ impl AvFoundationCaptureSession { #[cfg(target_os = "macos")] fn capture_frame_inner(&mut self) -> Result { - self.inner.capture_frame() + let mut frame = self.inner.capture_frame()?; + if let Some(resolution) = self.target_resolution { + if frame.frame.buffer.width() != resolution.width + || frame.frame.buffer.height() != resolution.height + { + let width = i32::try_from(resolution.width) + .map_err(|_| AvFoundationError::InvalidFrame("scaled width exceeds i32"))?; + let height = i32::try_from(resolution.height) + .map_err(|_| AvFoundationError::InvalidFrame("scaled height exceeds i32"))?; + frame.frame.buffer = frame.frame.buffer.scale(width, height); + } + } + Ok(frame) } #[cfg(not(target_os = "macos"))] @@ -340,6 +357,23 @@ fn requested_frame_rate_hint(format: &CaptureFormatRequest) -> Option { } } +fn requested_output_resolution( + request: &CaptureFormatRequest, + delivered: CaptureResolution, 
+) -> Option { + let CaptureFormatRequest::Closest(format) = request else { + return None; + }; + if format.resolution == delivered { + return None; + } + (resolution_area(format.resolution) <= resolution_area(delivered)).then_some(format.resolution) +} + +fn resolution_area(resolution: CaptureResolution) -> u64 { + resolution.width as u64 * resolution.height as u64 +} + fn validate_resolution(resolution: CaptureResolution) -> Result<(), AvFoundationError> { if resolution.width == 0 { return Err(AvFoundationError::InvalidOption("width must be non-zero")); @@ -797,6 +831,11 @@ mod macos { ); selected.map(Some).ok_or(AvFoundationError::UnsupportedFormat(*format)) } + CaptureFormatRequest::Closest(format) + if exact_session_preset(format.resolution).is_some() => + { + Ok(None) + } CaptureFormatRequest::Closest(format) => Ok(best_device_format( device, Some(format.resolution), @@ -1028,12 +1067,18 @@ mod macos { | CaptureFormatRequest::HighestResolution { frame_rate: _, frame_format: _ } => None, }?; + exact_session_preset(resolution).or(Some(unsafe { AVCaptureSessionPresetHigh })) + } + + fn exact_session_preset( + resolution: CaptureResolution, + ) -> Option<&'static objc2_av_foundation::AVCaptureSessionPreset> { match (resolution.width, resolution.height) { (1920, 1080) => Some(unsafe { AVCaptureSessionPreset1920x1080 }), (1280, 720) => Some(unsafe { AVCaptureSessionPreset1280x720 }), (640, 480) => Some(unsafe { AVCaptureSessionPreset640x480 }), (w, h) if w <= 640 && h <= 480 => Some(unsafe { AVCaptureSessionPresetMedium }), - _ => Some(unsafe { AVCaptureSessionPresetHigh }), + _ => None, } } From 86e407c368ca4c66110832809e2675d0804baa1f Mon Sep 17 00:00:00 2001 From: David Chen Date: Sat, 27 Jun 2026 22:56:47 -0700 Subject: [PATCH 11/24] add animated test pattern --- examples/local_video/README.md | 10 +- examples/local_video/src/publisher.rs | 82 ++++- examples/local_video/src/test_pattern.rs | 414 +++++++++++++++++++++-- 3 files changed, 468 insertions(+), 38 
deletions(-) diff --git a/examples/local_video/README.md b/examples/local_video/README.md index edb608802..0a2c331a6 100644 --- a/examples/local_video/README.md +++ b/examples/local_video/README.md @@ -83,7 +83,13 @@ Publisher usage: # publish a static SMPTE color-bar test pattern (no camera required) cargo run -p local_video -F desktop --bin publisher -- \ - --test-pattern \ + --test-pattern 0 \ + --room-name demo \ + --identity test-1 + + # publish an animated encoder exercise test pattern (no camera required) + cargo run -p local_video -F desktop --bin publisher -- \ + --test-pattern 1 \ --room-name demo \ --identity test-1 @@ -124,7 +130,7 @@ Publisher flags (in addition to the common connection flags above): - `--camera-index `: Camera index to use (default: `0`). Use `--list-cameras` to see available indices. - `--source `: Camera backend to use (default: `uvc`). `argus` uses NVIDIA libargus for MIPI CSI cameras and is available only on Linux aarch64 Jetson builds. - `--format `: UVC camera capture format (default: `auto`). `auto` tries uncompressed YUYV first and falls back to MJPEG; `mjpeg` can reduce USB bandwidth when running multiple cameras. -- `--test-pattern`: Generate a standard SMPTE 75% color-bar test pattern instead of capturing from a camera. `--camera-index` is ignored when this is set; `--width`, `--height`, and `--fps` still control the output resolution and frame rate. +- `--test-pattern [0|1]`: Generate a test pattern instead of capturing from a camera. `0` is a static SMPTE 75% color-bar pattern and `1` is an animated encoder exercise graphic. Omitting the value defaults to `0`. `--camera-index` is ignored when this is set; `--width`, `--height`, and `--fps` still control the output resolution and frame rate. - `--width `: Desired capture width (default: `1280`). - `--height `: Desired capture height (default: `720`). - `--fps `: Desired capture framerate (default: `30`). 
diff --git a/examples/local_video/src/publisher.rs b/examples/local_video/src/publisher.rs index 83943cb37..2bc2c4c98 100644 --- a/examples/local_video/src/publisher.rs +++ b/examples/local_video/src/publisher.rs @@ -41,7 +41,7 @@ mod user_data; mod video_display; mod viewport_aspect; -use test_pattern::TestPattern; +use test_pattern::{TestPattern, TestPatternKind}; use timestamp_burn::TimestampOverlay; use video_display::{align_up, PublisherTimingSample, SharedYuv}; @@ -180,9 +180,16 @@ struct Args { #[arg(long, value_enum, default_value_t = CaptureFormat::Auto)] format: CaptureFormat, - /// Generate a standard SMPTE color-bar test pattern instead of using a camera - #[arg(long, default_value_t = false, conflicts_with_all = ["list_cameras", "list_encoders"])] - test_pattern: bool, + /// Generate a numeric test pattern instead of using a camera: 0 = static bars, 1 = animated + #[arg( + long, + value_name = "N", + num_args = 0..=1, + default_missing_value = "0", + value_parser = parse_test_pattern_kind, + conflicts_with_all = ["list_cameras", "list_encoders"] + )] + test_pattern: Option, /// Desired width #[arg(long, default_value_t = 1280)] @@ -300,6 +307,13 @@ fn requested_playout_delay( } } +fn parse_test_pattern_kind(value: &str) -> Result { + let numeric = + value.parse::().map_err(|_| format!("test pattern must be 0 or 1, got `{value}`"))?; + TestPatternKind::try_from(numeric) + .map_err(|_| format!("test pattern must be 0 or 1, got `{value}`")) +} + fn normalize_twirp_host(url: &str) -> String { if let Some(rest) = url.strip_prefix("wss://") { return format!("https://{}", rest.trim_end_matches("/rtc")); @@ -706,6 +720,46 @@ mod tests { current.sensor_exposure_timestamp_us ); } + + #[test] + fn test_pattern_is_absent_by_default() { + let args = Args::try_parse_from(["publisher"]).expect("default args should parse"); + + assert_eq!(args.test_pattern, None); + } + + #[test] + fn test_pattern_without_value_defaults_to_static_bars() { + let args = + 
Args::try_parse_from(["publisher", "--test-pattern"]).expect("args should parse"); + + assert_eq!(args.test_pattern, Some(TestPatternKind::StaticColorBars)); + } + + #[test] + fn test_pattern_without_value_allows_following_option() { + let args = Args::try_parse_from(["publisher", "--test-pattern", "--room-name", "demo"]) + .expect("args should parse"); + + assert_eq!(args.test_pattern, Some(TestPatternKind::StaticColorBars)); + assert_eq!(args.room_name, "demo"); + } + + #[test] + fn test_pattern_accepts_numeric_mode() { + let args = + Args::try_parse_from(["publisher", "--test-pattern", "1"]).expect("args should parse"); + + assert_eq!(args.test_pattern, Some(TestPatternKind::AnimatedGraphic)); + } + + #[test] + fn test_pattern_rejects_unknown_numeric_mode() { + let err = + Args::try_parse_from(["publisher", "--test-pattern", "2"]).expect_err("2 is invalid"); + + assert!(err.to_string().contains("test pattern must be 0 or 1")); + } } fn list_cameras() -> Result<()> { @@ -1017,7 +1071,7 @@ async fn run(args: Args, ctrl_c_received: Arc) -> Result<()> { SourceKind::Argus => { #[cfg(all(target_os = "linux", target_arch = "aarch64"))] { - if args.test_pattern { + if args.test_pattern.is_some() { anyhow::bail!("--test-pattern is not supported with --source argus"); } if args.display_video { @@ -1053,15 +1107,22 @@ async fn run(args: Args, ctrl_c_received: Arc) -> Result<()> { } } SourceKind::Uvc => { - if args.test_pattern { + if let Some(test_pattern) = args.test_pattern { let width = args.width; let height = args.height; let fps = args.fps; info!( - "Test pattern enabled: SMPTE 75% color bars at {}x{} @ {} fps", - width, height, fps + "Test pattern enabled: {} at {}x{} @ {} fps", + test_pattern.label(), + width, + height, + fps ); - (width, height, VideoInput::TestPattern(TestPattern::new(width, height))) + ( + width, + height, + VideoInput::TestPattern(TestPattern::new(width, height, test_pattern)), + ) } else { open_platform_camera(&args)? 
} @@ -1314,6 +1375,7 @@ async fn run_capture_loop( // Timing accumulators (ms) for rolling stats let mut timings = PublisherTimingSummary::default(); let mut frame_counter: u32 = 1; + let mut test_pattern_frame_index: u64 = 0; let mut timestamp_overlay = (config.attach_timestamp && config.burn_timestamp) .then(|| TimestampOverlay::new(width, height)); let align_buffers_for_display = display_shared.is_some(); @@ -1358,7 +1420,9 @@ async fn run_capture_loop( stride_u as i32, data_v, stride_v as i32, + test_pattern_frame_index, ); + test_pattern_frame_index = test_pattern_frame_index.wrapping_add(1); let frame_acquired_at = Instant::now(); ( frame, diff --git a/examples/local_video/src/test_pattern.rs b/examples/local_video/src/test_pattern.rs index d9c689cda..34a64e4d9 100644 --- a/examples/local_video/src/test_pattern.rs +++ b/examples/local_video/src/test_pattern.rs @@ -1,9 +1,54 @@ -/// Generates a static SMPTE-style 75% color-bar pattern in I420 format. -pub struct TestPattern { +/// Selects the generated test pattern. +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub(super) enum TestPatternKind { + /// Static SMPTE-style 75% color bars. + StaticColorBars, + /// Animated motion graphic for exercising video encoders. + AnimatedGraphic, +} + +/// Returned when a numeric test pattern selector is unsupported. +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub(super) struct UnsupportedTestPatternKind; + +impl TryFrom for TestPatternKind { + type Error = UnsupportedTestPatternKind; + + fn try_from(value: u8) -> Result { + match value { + 0 => Ok(Self::StaticColorBars), + 1 => Ok(Self::AnimatedGraphic), + _ => Err(UnsupportedTestPatternKind), + } + } +} + +impl TestPatternKind { + /// Returns a short label for logs and help text. + pub(super) fn label(self) -> &'static str { + match self { + Self::StaticColorBars => "SMPTE 75% color bars", + Self::AnimatedGraphic => "animated encoder exercise graphic", + } + } +} + +/// Generates a test pattern in I420 format. 
+pub(super) struct TestPattern { width: usize, height: usize, chroma_width: usize, chroma_height: usize, + frames: TestPatternFrames, +} + +enum TestPatternFrames { + Static(I420Frame), + AnimatedCached(Vec), + AnimatedDynamic, +} + +struct I420Frame { y_plane: Vec, u_plane: Vec, v_plane: Vec, @@ -16,6 +61,10 @@ struct I420Color { v: u8, } +const ANIMATED_CACHE_TARGET_FRAMES: usize = 60; +const ANIMATED_CACHE_MIN_FRAMES: usize = 2; +const ANIMATED_CACHE_MAX_BYTES: usize = 128 * 1024 * 1024; + const BARS: [I420Color; 7] = [ rgb_to_i420(191, 191, 191), // white rgb_to_i420(191, 191, 0), // yellow @@ -26,38 +75,52 @@ const BARS: [I420Color; 7] = [ rgb_to_i420(0, 0, 191), // blue ]; +const ANIMATED_PALETTE: [I420Color; 6] = [ + rgb_to_i420(235, 64, 32), + rgb_to_i420(64, 224, 72), + rgb_to_i420(48, 128, 255), + rgb_to_i420(245, 220, 64), + rgb_to_i420(224, 72, 220), + rgb_to_i420(64, 224, 224), +]; + impl TestPattern { - /// Precompute a static SMPTE-style 75% color-bar pattern for the requested resolution. - pub fn new(width: u32, height: u32) -> Self { + /// Precompute the reusable planes for the requested pattern and resolution. 
+ pub(super) fn new(width: u32, height: u32, kind: TestPatternKind) -> Self { let width = width as usize; let height = height as usize; let chroma_width = width.div_ceil(2); let chroma_height = height.div_ceil(2); - let mut y_plane = vec![0; width * height]; - let mut u_plane = vec![128; chroma_width * chroma_height]; - let mut v_plane = vec![128; chroma_width * chroma_height]; - - for row in 0..height { - let row_start = row * width; - for col in 0..width { - y_plane[row_start + col] = color_for_luma_column(col, width).y; - } - } - for row in 0..chroma_height { - let row_start = row * chroma_width; - for col in 0..chroma_width { - let color = color_for_luma_column(col * 2, width); - u_plane[row_start + col] = color.u; - v_plane[row_start + col] = color.v; + let frames = match kind { + TestPatternKind::StaticColorBars => TestPatternFrames::Static(color_bars_frame( + width, + height, + chroma_width, + chroma_height, + )), + TestPatternKind::AnimatedGraphic => { + if let Some(frame_count) = + cached_animation_frame_count(width, height, chroma_width, chroma_height) + { + TestPatternFrames::AnimatedCached(animated_frames( + width, + height, + chroma_width, + chroma_height, + frame_count, + )) + } else { + TestPatternFrames::AnimatedDynamic + } } - } + }; - Self { width, height, chroma_width, chroma_height, y_plane, u_plane, v_plane } + Self { width, height, chroma_width, chroma_height, frames } } - /// Copy the precomputed pattern into the provided I420 destination planes. - pub fn render( + /// Render the selected pattern into the provided I420 destination planes. 
+ pub(super) fn render( &self, data_y: &mut [u8], stride_y: i32, @@ -65,11 +128,160 @@ impl TestPattern { stride_u: i32, data_v: &mut [u8], stride_v: i32, + frame_index: u64, ) { - copy_plane(data_y, stride_y as usize, &self.y_plane, self.width, self.height); - copy_plane(data_u, stride_u as usize, &self.u_plane, self.chroma_width, self.chroma_height); - copy_plane(data_v, stride_v as usize, &self.v_plane, self.chroma_width, self.chroma_height); + match &self.frames { + TestPatternFrames::Static(frame) => { + frame.copy_to( + data_y, + stride_y as usize, + data_u, + stride_u as usize, + data_v, + stride_v as usize, + self.width, + self.height, + self.chroma_width, + self.chroma_height, + ); + } + TestPatternFrames::AnimatedCached(frames) => { + let frame = &frames[(frame_index % frames.len() as u64) as usize]; + frame.copy_to( + data_y, + stride_y as usize, + data_u, + stride_u as usize, + data_v, + stride_v as usize, + self.width, + self.height, + self.chroma_width, + self.chroma_height, + ); + } + TestPatternFrames::AnimatedDynamic => { + render_animated_pattern( + data_y, + stride_y as usize, + data_u, + stride_u as usize, + data_v, + stride_v as usize, + self.width, + self.height, + self.chroma_width, + self.chroma_height, + frame_index, + ); + } + } + } +} + +impl I420Frame { + fn new(width: usize, height: usize, chroma_width: usize, chroma_height: usize) -> Self { + Self { + y_plane: vec![0; width * height], + u_plane: vec![128; chroma_width * chroma_height], + v_plane: vec![128; chroma_width * chroma_height], + } + } + + fn copy_to( + &self, + data_y: &mut [u8], + stride_y: usize, + data_u: &mut [u8], + stride_u: usize, + data_v: &mut [u8], + stride_v: usize, + width: usize, + height: usize, + chroma_width: usize, + chroma_height: usize, + ) { + copy_plane(data_y, stride_y, &self.y_plane, width, height); + copy_plane(data_u, stride_u, &self.u_plane, chroma_width, chroma_height); + copy_plane(data_v, stride_v, &self.v_plane, chroma_width, chroma_height); + } 
+} + +fn color_bars_frame( + width: usize, + height: usize, + chroma_width: usize, + chroma_height: usize, +) -> I420Frame { + let mut frame = I420Frame::new(width, height, chroma_width, chroma_height); + + for row in 0..height { + let row_start = row * width; + for col in 0..width { + frame.y_plane[row_start + col] = color_for_luma_column(col, width).y; + } + } + + for row in 0..chroma_height { + let row_start = row * chroma_width; + for col in 0..chroma_width { + let color = color_for_luma_column(col * 2, width); + frame.u_plane[row_start + col] = color.u; + frame.v_plane[row_start + col] = color.v; + } + } + + frame +} + +fn cached_animation_frame_count( + width: usize, + height: usize, + chroma_width: usize, + chroma_height: usize, +) -> Option { + let bytes_per_frame = i420_frame_len(width, height, chroma_width, chroma_height); + if bytes_per_frame == 0 { + return Some(1); } + + let max_frames = ANIMATED_CACHE_MAX_BYTES / bytes_per_frame; + (max_frames >= ANIMATED_CACHE_MIN_FRAMES) + .then_some(max_frames.min(ANIMATED_CACHE_TARGET_FRAMES)) +} + +fn animated_frames( + width: usize, + height: usize, + chroma_width: usize, + chroma_height: usize, + frame_count: usize, +) -> Vec { + (0..frame_count) + .map(|frame_index| { + let mut frame = I420Frame::new(width, height, chroma_width, chroma_height); + render_animated_pattern( + &mut frame.y_plane, + width, + &mut frame.u_plane, + chroma_width, + &mut frame.v_plane, + chroma_width, + width, + height, + chroma_width, + chroma_height, + frame_index as u64, + ); + frame + }) + .collect() +} + +fn i420_frame_len(width: usize, height: usize, chroma_width: usize, chroma_height: usize) -> usize { + width + .saturating_mul(height) + .saturating_add(chroma_width.saturating_mul(chroma_height).saturating_mul(2)) } const fn rgb_to_i420(r: u8, g: u8, b: u8) -> I420Color { @@ -102,6 +314,92 @@ fn color_for_luma_column(col: usize, width: usize) -> I420Color { BARS[bar.min(BARS.len() - 1)] } +fn render_animated_pattern( + data_y: 
&mut [u8], + stride_y: usize, + data_u: &mut [u8], + stride_u: usize, + data_v: &mut [u8], + stride_v: usize, + width: usize, + height: usize, + chroma_width: usize, + chroma_height: usize, + frame_index: u64, +) { + if width == 0 || height == 0 { + return; + } + + let frame = frame_index as usize; + let tile = (width.min(height) / 10).clamp(16, 96); + let sweep_x = frame.wrapping_mul(7) % width; + let sweep_y = frame.wrapping_mul(5) % height; + let box_w = (width / 5).clamp(32, 256).min(width); + let box_h = (height / 4).clamp(24, 192).min(height); + let box_x = bouncing_offset(frame.wrapping_mul(9), width.saturating_sub(box_w)); + let box_y = bouncing_offset(frame.wrapping_mul(5), height.saturating_sub(box_h)); + + for row in 0..height { + let dst_start = row * stride_y; + for col in 0..width { + let shifted_x = col.wrapping_add(sweep_x); + let shifted_y = row.wrapping_add(sweep_y); + let checker = ((shifted_x / tile) ^ (shifted_y / tile)) & 1; + let ramp = if width > 1 { (col * 144) / (width - 1) } else { 0 }; + let diagonal = (col.wrapping_add(row).wrapping_add(frame.wrapping_mul(11)) % tile) < 3; + let mut luma = 42 + ramp as i32 + (checker as i32 * 42); + + if diagonal { + luma += 64; + } + if in_box(col, row, box_x, box_y, box_w, box_h) { + luma = if ((col / 8) ^ (row / 8) ^ (frame / 2)) & 1 == 0 { 235 } else { 24 }; + } + + data_y[dst_start + col] = clamp_to_u8(luma); + } + } + + for row in 0..chroma_height { + let dst_u_start = row * stride_u; + let dst_v_start = row * stride_v; + for col in 0..chroma_width { + let luma_col = col * 2; + let luma_row = row * 2; + let color = if in_box(luma_col, luma_row, box_x, box_y, box_w, box_h) { + ANIMATED_PALETTE[(frame / 4) % ANIMATED_PALETTE.len()] + } else { + let palette_index = ((luma_col.wrapping_add(sweep_x) / tile) + + (luma_row.wrapping_add(sweep_y) / tile) + + (frame / 12)) + % ANIMATED_PALETTE.len(); + ANIMATED_PALETTE[palette_index] + }; + data_u[dst_u_start + col] = color.u; + data_v[dst_v_start + col] = 
color.v; + } + } +} + +fn bouncing_offset(position: usize, travel: usize) -> usize { + if travel == 0 { + return 0; + } + + let period = travel.saturating_mul(2); + let phase = position % period; + if phase <= travel { + phase + } else { + period - phase + } +} + +fn in_box(col: usize, row: usize, box_x: usize, box_y: usize, box_w: usize, box_h: usize) -> bool { + (box_x..box_x + box_w).contains(&col) && (box_y..box_y + box_h).contains(&row) +} + fn copy_plane(dst: &mut [u8], dst_stride: usize, src: &[u8], width: usize, height: usize) { if width == 0 || height == 0 { return; @@ -119,3 +417,65 @@ fn copy_plane(dst: &mut [u8], dst_stride: usize, src: &[u8], width: usize, heigh dst[dst_start..dst_start + width].copy_from_slice(&src[src_start..src_start + width]); } } + +#[cfg(test)] +mod tests { + use super::*; + + fn render_frame(kind: TestPatternKind, frame_index: u64) -> (Vec, Vec, Vec) { + let pattern = TestPattern::new(64, 36, kind); + let mut y = vec![0; 64 * 36]; + let mut u = vec![0; 32 * 18]; + let mut v = vec![0; 32 * 18]; + pattern.render(&mut y, 64, &mut u, 32, &mut v, 32, frame_index); + (y, u, v) + } + + #[test] + fn test_pattern_kind_accepts_supported_numeric_selectors() { + assert_eq!(TestPatternKind::try_from(0), Ok(TestPatternKind::StaticColorBars)); + assert_eq!(TestPatternKind::try_from(1), Ok(TestPatternKind::AnimatedGraphic)); + assert_eq!(TestPatternKind::try_from(2), Err(UnsupportedTestPatternKind)); + } + + #[test] + fn animated_graphic_uses_cached_frames_when_memory_allows() { + let pattern = TestPattern::new(64, 36, TestPatternKind::AnimatedGraphic); + + let TestPatternFrames::AnimatedCached(frames) = pattern.frames else { + panic!("small animated pattern should use cached frames"); + }; + assert_eq!(frames.len(), ANIMATED_CACHE_TARGET_FRAMES); + } + + #[test] + fn animated_cache_is_bounded_by_memory_budget() { + let frame_count = cached_animation_frame_count(1920, 1080, 960, 540) + .expect("1080p should still cache multiple frames"); + + 
assert!(frame_count >= ANIMATED_CACHE_MIN_FRAMES); + assert!(frame_count < ANIMATED_CACHE_TARGET_FRAMES); + assert!(frame_count * i420_frame_len(1920, 1080, 960, 540) <= ANIMATED_CACHE_MAX_BYTES); + } + + #[test] + fn very_large_animated_patterns_fall_back_to_dynamic_rendering() { + assert_eq!(cached_animation_frame_count(16_384, 9_216, 8_192, 4_608), None); + } + + #[test] + fn static_color_bars_do_not_change_between_frames() { + assert_eq!( + render_frame(TestPatternKind::StaticColorBars, 0), + render_frame(TestPatternKind::StaticColorBars, 24) + ); + } + + #[test] + fn animated_graphic_changes_between_frames() { + assert_ne!( + render_frame(TestPatternKind::AnimatedGraphic, 0), + render_frame(TestPatternKind::AnimatedGraphic, 24) + ); + } +} From fde64ee51039f39c02ca69f887c78ce83075095d Mon Sep 17 00:00:00 2001 From: David Chen Date: Sun, 28 Jun 2026 09:34:13 -0700 Subject: [PATCH 12/24] implement zero copy encode path for avfoundation --- examples/local_video/src/list_devices.rs | 4 +- examples/local_video/src/publisher.rs | 412 +++++++++++-- livekit-capture/Cargo.toml | 1 + livekit-capture/README.md | 4 +- livekit-capture/src/device.rs | 6 +- livekit-capture/src/lib.rs | 3 +- livekit-capture/src/platform/avfoundation.rs | 598 ++++++++++++++++--- livekit-capture/src/source.rs | 50 +- 8 files changed, 929 insertions(+), 149 deletions(-) diff --git a/examples/local_video/src/list_devices.rs b/examples/local_video/src/list_devices.rs index d738a8fa1..4e83c72bf 100644 --- a/examples/local_video/src/list_devices.rs +++ b/examples/local_video/src/list_devices.rs @@ -56,7 +56,7 @@ fn print_capabilities(formats: &[CaptureFormat]) { let mut formats = formats.to_vec(); formats.sort_by_key(|format| { ( - format!("{:?}", format.pixel_format), + format!("{:?}", format.frame_format), format.resolution.width, format.resolution.height, format.frame_rate, @@ -67,7 +67,7 @@ fn print_capabilities(formats: &[CaptureFormat]) { for format in formats { println!( " - {:?}: {}x{} @ {} 
fps", - format.pixel_format, + format.frame_format, format.resolution.width, format.resolution.height, format.frame_rate diff --git a/examples/local_video/src/publisher.rs b/examples/local_video/src/publisher.rs index 2bc2c4c98..c65cf6629 100644 --- a/examples/local_video/src/publisher.rs +++ b/examples/local_video/src/publisher.rs @@ -6,7 +6,10 @@ use livekit::options::{ VideoEncoderBackend, VideoEncoding, VideoPreset, }; use livekit::prelude::*; -use livekit::webrtc::video_frame::{FrameMetadata, I420Buffer, VideoFrame, VideoRotation}; +use livekit::webrtc::video_frame::{ + native::{NativeBuffer, VideoFrameBufferExt}, + FrameMetadata, I420Buffer, VideoFrame, VideoRotation, +}; use livekit::webrtc::video_source::native::NativeVideoSource; use livekit::webrtc::video_source::{RtcVideoSource, VideoResolution}; use livekit_api::access_token; @@ -14,7 +17,7 @@ use livekit_api::services::room::{CreateRoomOptions, RoomClient}; use livekit_api::services::{ServiceError, TwirpError, TwirpErrorCode}; use livekit_capture::device::{ CaptureDeviceSelector, CaptureFormat as LkCaptureFormat, CaptureFormatRequest, - CaptureFrameFormat, CaptureResolution, + CaptureFrameFormat, CapturePath as LkCapturePath, CaptureResolution, }; #[cfg(target_os = "macos")] use livekit_capture::sources::avfoundation::{ @@ -324,6 +327,16 @@ fn normalize_twirp_host(url: &str) -> String { url.trim_end_matches("/rtc").to_string() } +fn capture_path_name(path: LkCapturePath) -> &'static str { + match path { + LkCapturePath::Native => "native platform buffer", + LkCapturePath::Raw => "CPU I420", + LkCapturePath::DmaBuf => "DMA-BUF", + LkCapturePath::Encoded => "pre-encoded", + _ => "unknown", + } +} + #[derive(Default)] struct RollingMs { total_ms: f64, @@ -349,6 +362,8 @@ impl RollingMs { struct PublisherTimingSummary { paced_wait_ms: RollingMs, camera_frame_read_ms: RollingMs, + capture_timestamp_age_ms: RollingMs, + capture_timestamp_to_webrtc_ms: RollingMs, decode_mjpeg_ms: RollingMs, buffer_convert_ms: 
RollingMs, frame_draw_ms: RollingMs, @@ -425,9 +440,45 @@ fn log_publisher_outbound_health(stats: &[livekit::webrtc::stats::RtcStats]) { } } -async fn update_publisher_video_stats(track: LocalVideoTrack, ctrl_c_received: Arc) { +fn maybe_request_native_capture_fallback( + outbound: &livekit::webrtc::stats::OutboundRtpStats, + first_starved_at: &mut Option, + native_capture_fallback: &AtomicBool, +) { + if native_capture_fallback.load(Ordering::Acquire) { + return; + } + if outbound.outbound.frames_encoded > 0 || outbound.outbound.key_frames_encoded > 0 { + *first_starved_at = None; + return; + } + if outbound.outbound.pli_count == 0 && outbound.outbound.fir_count == 0 { + return; + } + + let starved_at = first_starved_at.get_or_insert_with(Instant::now); + if starved_at.elapsed() < Duration::from_secs(3) + && outbound.outbound.pli_count < 3 + && outbound.outbound.fir_count == 0 + { + return; + } + + native_capture_fallback.store(true, Ordering::Release); + log::warn!( + "Native AVFoundation CVPixelBuffer publish produced no encoded frames; falling back to CPU I420 capture" + ); +} + +async fn update_publisher_video_stats( + track: LocalVideoTrack, + ctrl_c_received: Arc, + native_capture_fallback: Option>, +) { let mut last_log = Instant::now().checked_sub(Duration::from_secs(2)).unwrap_or_else(Instant::now); + let mut last_encoder_implementation = String::new(); + let mut native_capture_starved_at = None; let mut interval = tokio::time::interval(Duration::from_secs(1)); interval.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip); @@ -437,6 +488,21 @@ async fn update_publisher_video_stats(track: LocalVideoTrack, ctrl_c_received: A } if let Ok(stats) = track.get_stats().await { + if let Some(implementation) = find_video_outbound_encoder(&stats) { + if implementation != last_encoder_implementation { + info!("Publisher encode path: WebRTC encoder implementation={implementation}"); + last_encoder_implementation = implementation.to_string(); + } + } + if 
let (Some(outbound), Some(fallback)) = + (find_video_outbound_stats(&stats), native_capture_fallback.as_ref()) + { + maybe_request_native_capture_fallback( + &outbound, + &mut native_capture_starved_at, + fallback, + ); + } if last_log.elapsed() >= Duration::from_secs(2) { log_publisher_outbound_health(&stats); last_log = Instant::now(); @@ -453,7 +519,6 @@ async fn update_publisher_encoder_overlay( ctrl_c_received: Arc, ) { let mut logged_initial = false; - let mut last_implementation = String::new(); let mut interval = tokio::time::interval(Duration::from_secs(1)); interval.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip); @@ -465,11 +530,6 @@ async fn update_publisher_encoder_overlay( match track.get_stats().await { Ok(stats) => { if let Some(implementation) = find_video_outbound_encoder(&stats) { - if implementation != last_implementation { - info!("Publisher video encoder implementation: {implementation}"); - last_implementation = implementation.to_string(); - } - let mut shared = shared.lock(); shared.codec_implementation = implementation.to_string(); } @@ -490,6 +550,8 @@ impl PublisherTimingSummary { fn reset(&mut self) { self.paced_wait_ms.reset(); self.camera_frame_read_ms.reset(); + self.capture_timestamp_age_ms.reset(); + self.capture_timestamp_to_webrtc_ms.reset(); self.decode_mjpeg_ms.reset(); self.buffer_convert_ms.reset(); self.frame_draw_ms.reset(); @@ -505,6 +567,14 @@ fn format_timing_line(timings: &PublisherTimingSummary) -> String { "camera_frame_read {:.2}", timings.camera_frame_read_ms.average().unwrap_or_default() ), + format!( + "capture_ts_age {:.2}", + timings.capture_timestamp_age_ms.average().unwrap_or_default() + ), + format!( + "capture_ts_to_webrtc {:.2}", + timings.capture_timestamp_to_webrtc_ms.average().unwrap_or_default() + ), ]; let mut line_two = Vec::new(); @@ -810,33 +880,124 @@ enum PlatformCamera { V4l(V4lCaptureSession), } +fn publisher_capture_path_label(video_input: &VideoInput, burn_timestamp: bool) -> 
String { + match video_input { + VideoInput::TestPattern(_) => "test-pattern CPU I420".to_string(), + VideoInput::Camera(camera) => match camera { + #[cfg(target_os = "macos")] + PlatformCamera::AvFoundation(session) => { + let source_format = session.format().frame_format; + let core_video_format = core_video_fourcc(session.core_video_pixel_format()); + if burn_timestamp { + format!( + "AVFoundation CPU I420 fallback from {source_format}/{core_video_format} (timestamp burn)" + ) + } else { + match session.capture_path() { + LkCapturePath::Native => { + format!( + "AVFoundation native IOSurface CVPixelBuffer {core_video_format} from {source_format}" + ) + } + path => format!( + "AVFoundation {} fallback from {source_format}/{core_video_format}", + capture_path_name(path), + ), + } + } + } + #[cfg(target_os = "linux")] + PlatformCamera::V4l(session) => { + let format = session.format(); + let decode_suffix = if format.frame_format == CaptureFrameFormat::Mjpeg { + " with MJPEG decode" + } else { + "" + }; + format!( + "V4L2 {} from {}{}", + capture_path_name(session.capture_path()), + format.frame_format, + decode_suffix + ) + } + }, + #[cfg(all(target_os = "linux", target_arch = "aarch64"))] + VideoInput::Argus(_) => "libargus NV12 DMA-BUF".to_string(), + } +} + +#[cfg(target_os = "macos")] +fn core_video_fourcc(pixel_format: u32) -> String { + let bytes = pixel_format.to_be_bytes(); + if bytes.iter().all(|byte| byte.is_ascii_graphic() || *byte == b' ') { + String::from_utf8_lossy(&bytes).into_owned() + } else { + format!("0x{pixel_format:08x}") + } +} + +fn publisher_uses_native_camera_capture(video_input: &VideoInput, burn_timestamp: bool) -> bool { + if burn_timestamp { + return false; + } + + match video_input { + #[cfg(target_os = "macos")] + VideoInput::Camera(PlatformCamera::AvFoundation(session)) => { + session.capture_path() == LkCapturePath::Native + } + _ => false, + } +} + struct PlatformCameraFrame { - frame: VideoFrame, + buffer: CapturedFrameBuffer, 
capture_wall_time_us: u64, read_wall_time_us: u64, + sensor_timestamp_us: Option, used_decode_path: bool, } +enum CapturedFrameBuffer { + I420(VideoFrame), + #[cfg(target_os = "macos")] + Native(VideoFrame), +} + impl PlatformCamera { - fn capture_frame(&mut self) -> Result { + fn capture_frame(&mut self, prefer_native: bool) -> Result { match self { #[cfg(target_os = "macos")] Self::AvFoundation(session) => { - let frame = session.capture_frame()?; - Ok(PlatformCameraFrame { - frame: frame.frame, - capture_wall_time_us: frame.capture_wall_time_us, - read_wall_time_us: frame.read_wall_time_us, - used_decode_path: false, - }) + if prefer_native && session.capture_path() == LkCapturePath::Native { + let frame = session.capture_native_frame()?; + Ok(PlatformCameraFrame { + buffer: CapturedFrameBuffer::Native(frame.frame), + capture_wall_time_us: frame.capture_wall_time_us, + read_wall_time_us: frame.read_wall_time_us, + sensor_timestamp_us: frame.sensor_timestamp_us, + used_decode_path: false, + }) + } else { + let frame = session.capture_frame()?; + Ok(PlatformCameraFrame { + buffer: CapturedFrameBuffer::I420(frame.frame), + capture_wall_time_us: frame.capture_wall_time_us, + read_wall_time_us: frame.read_wall_time_us, + sensor_timestamp_us: frame.sensor_timestamp_us, + used_decode_path: false, + }) + } } #[cfg(target_os = "linux")] Self::V4l(session) => { let frame = session.capture_frame()?; Ok(PlatformCameraFrame { - frame: frame.frame, + buffer: CapturedFrameBuffer::I420(frame.frame), capture_wall_time_us: frame.capture_wall_time_us, read_wall_time_us: frame.read_wall_time_us, + sensor_timestamp_us: frame.sensor_timestamp_us, used_decode_path: frame.used_decode_path, }) } @@ -1246,6 +1407,15 @@ async fn run(args: Args, ctrl_c_received: Arc) -> Result<()> { info!("Published camera track"); requested_codec }; + info!( + "Publisher media path: capture={}, encode=requested codec {} via {}", + publisher_capture_path_label(&video_input, args.burn_timestamp), + 
actual_codec.as_str(), + video_encoder_backend_name(requested_encoder), + ); + let native_capture_fallback = + publisher_uses_native_camera_capture(&video_input, args.burn_timestamp) + .then(|| Arc::new(AtomicBool::new(false))); let capture_config = CaptureConfig { fps: args.fps, @@ -1260,8 +1430,11 @@ async fn run(args: Args, ctrl_c_received: Arc) -> Result<()> { let user_data_channels = args.attach_user_data.then(|| Arc::new(Mutex::new([0.0f32; user_data::NUM_CHANNELS]))); - let publish_stats_task = - tokio::spawn(update_publisher_video_stats(track.clone(), ctrl_c_received.clone())); + let publish_stats_task = tokio::spawn(update_publisher_video_stats( + track.clone(), + ctrl_c_received.clone(), + native_capture_fallback.clone(), + )); match video_input { #[cfg(all(target_os = "linux", target_arch = "aarch64"))] @@ -1304,6 +1477,7 @@ async fn run(args: Args, ctrl_c_received: Arc) -> Result<()> { Some(shared.clone()), publish_timing_state.clone(), user_data_channels.clone(), + native_capture_fallback.clone(), )); let display_result = video_display::run_display( @@ -1331,6 +1505,7 @@ async fn run(args: Args, ctrl_c_received: Arc) -> Result<()> { None, publish_timing_state.clone(), user_data_channels.clone(), + native_capture_fallback.clone(), ) .await; let _ = publish_stats_task.await; @@ -1353,14 +1528,23 @@ async fn run_capture_loop( display_shared: Option>>, publish_timing_state: Option>>, user_data_channels: Option>>, + native_capture_fallback: Option>, ) -> Result<()> { - // Pace publishing at the requested FPS (not the camera-reported FPS) to hit desired cadence let pace_fps = config.fps as f64; - // Accurate pacing using absolute schedule (no drift) - let mut ticker = tokio::time::interval(Duration::from_secs_f64(1.0 / pace_fps)); - ticker.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip); - // Align the first tick to now - ticker.tick().await; + #[cfg(target_os = "macos")] + let camera_driven_pacing = + matches!(&video_input, 
VideoInput::Camera(PlatformCamera::AvFoundation(_))); + #[cfg(not(target_os = "macos"))] + let camera_driven_pacing = false; + let mut ticker = if camera_driven_pacing { + None + } else { + let mut ticker = tokio::time::interval(Duration::from_secs_f64(1.0 / pace_fps)); + ticker.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip); + // Align the first tick to now. + ticker.tick().await; + Some(ticker) + }; let start_ts = Instant::now(); // Capture loop @@ -1371,6 +1555,11 @@ async fn run_capture_loop( let mut fps_smoothed: f32 = 0.0; let target = Duration::from_secs_f64(1.0 / pace_fps); info!("Target frame interval: {:.2} ms", target.as_secs_f64() * 1000.0); + if camera_driven_pacing { + info!("Capture pacing: camera frame-arrival driven"); + } else { + info!("Capture pacing: application timer driven"); + } // Timing accumulators (ms) for rolling stats let mut timings = PublisherTimingSummary::default(); @@ -1379,26 +1568,32 @@ async fn run_capture_loop( let mut timestamp_overlay = (config.attach_timestamp && config.burn_timestamp) .then(|| TimestampOverlay::new(width, height)); let align_buffers_for_display = display_shared.is_some(); + let mut logged_camera_timestamp_source = false; + let mut logged_camera_timestamp_fallback = false; + let mut logged_native_capture_fallback = false; loop { if ctrl_c_received.load(Ordering::Acquire) { break; } - // Wait until the scheduled next frame time let paced_wait_started_at = Instant::now(); - ticker.tick().await; + if let Some(ticker) = ticker.as_mut() { + ticker.tick().await; + } let paced_wait_finished_at = Instant::now(); - let source_frame_started_at = Instant::now(); + let source_frame_read_started_at = Instant::now(); let frame_wall_time_us = unix_time_us_now(); let ( - mut frame, + mut captured_frame, capture_wall_time_us, read_wall_time_us, source_frame_acquired_at, + frame_pipeline_started_at, decode_finished_at, convert_finished_at, used_decode_path, + has_capture_timestamp, record_convert_timing, ) = 
match &mut video_input { VideoInput::TestPattern(pattern) => { @@ -1425,29 +1620,70 @@ async fn run_capture_loop( test_pattern_frame_index = test_pattern_frame_index.wrapping_add(1); let frame_acquired_at = Instant::now(); ( - frame, + CapturedFrameBuffer::I420(frame), frame_wall_time_us, unix_time_us_now(), frame_acquired_at, + source_frame_read_started_at, frame_acquired_at, frame_acquired_at, false, false, + false, ) } VideoInput::Camera(camera) => { - let mut captured = camera.capture_frame()?; + let force_i420_after_native_failure = native_capture_fallback + .as_ref() + .is_some_and(|fallback| fallback.load(Ordering::Acquire)); + if force_i420_after_native_failure && !logged_native_capture_fallback { + log::warn!( + "Publisher media path changed: capture=AVFoundation CPU I420 fallback after native encode starvation" + ); + logged_native_capture_fallback = true; + } + let prefer_native = !config.burn_timestamp && !force_i420_after_native_failure; + let mut captured = camera.capture_frame(prefer_native)?; let camera_frame_acquired_at = Instant::now(); - captured.frame.rotation = VideoRotation::VideoRotation0; + match &mut captured.buffer { + CapturedFrameBuffer::I420(frame) => { + frame.rotation = VideoRotation::VideoRotation0; + } + #[cfg(target_os = "macos")] + CapturedFrameBuffer::Native(frame) => { + frame.rotation = VideoRotation::VideoRotation0; + } + } + if captured.sensor_timestamp_us.is_some() { + if !logged_camera_timestamp_source { + let capture_timestamp_age_ms = captured + .read_wall_time_us + .saturating_sub(captured.capture_wall_time_us) + as f64 + / 1000.0; + info!( + "Using camera-provided capture timestamp (age at frame read {:.2} ms)", + capture_timestamp_age_ms + ); + logged_camera_timestamp_source = true; + } + } else if !logged_camera_timestamp_fallback { + log::warn!( + "Camera-provided capture timestamp unavailable or implausible; using frame read wall clock" + ); + logged_camera_timestamp_fallback = true; + } ( - captured.frame, + 
captured.buffer, captured.capture_wall_time_us, captured.read_wall_time_us, camera_frame_acquired_at, camera_frame_acquired_at, camera_frame_acquired_at, + camera_frame_acquired_at, captured.used_decode_path, + captured.sensor_timestamp_us.is_some(), false, ) } @@ -1459,8 +1695,6 @@ async fn run_capture_loop( unreachable!("argus video input must be driven by run_argus_capture_loop") } }; - let (stride_y, _, _) = frame.buffer.strides(); - let stride_y_usize = stride_y as usize; let fid = if config.attach_frame_id { let id = frame_counter; @@ -1477,8 +1711,17 @@ async fn run_capture_loop( let mut burned_timestamp_us = None; if let Some(overlay) = timestamp_overlay.as_mut() { let overlay_started_at = Instant::now(); - let (data_y, _, _) = frame.buffer.data_mut(); - overlay.draw(data_y, stride_y_usize, capture_wall_time_us, fid); + match &mut captured_frame { + CapturedFrameBuffer::I420(frame) => { + let (stride_y, _, _) = frame.buffer.strides(); + let (data_y, _, _) = frame.buffer.data_mut(); + overlay.draw(data_y, stride_y as usize, capture_wall_time_us, fid); + } + #[cfg(target_os = "macos")] + CapturedFrameBuffer::Native(_) => { + anyhow::bail!("timestamp burning requires an I420 capture frame"); + } + } burned_timestamp_us = Some(capture_wall_time_us); let overlay_finished_at = Instant::now(); frame_draw_ms = Some((overlay_finished_at - overlay_started_at).as_secs_f64() * 1000.0); @@ -1496,18 +1739,29 @@ async fn run_capture_loop( } let user_data = user_data_channels.as_ref().map(|targets| user_data::encode(&targets.lock())); - frame.frame_metadata = if user_ts.is_some() || fid.is_some() || user_data.is_some() { + let frame_metadata = if user_ts.is_some() || fid.is_some() || user_data.is_some() { Some(FrameMetadata { user_timestamp: user_ts, frame_id: fid, user_data }) } else { None }; // Monotonic, microseconds since start. 
- frame.timestamp_us = start_ts.elapsed().as_micros() as i64; - rtc_source.capture_frame(&frame); + let timestamp_us = start_ts.elapsed().as_micros() as i64; + match &mut captured_frame { + CapturedFrameBuffer::I420(frame) => { + frame.frame_metadata = frame_metadata; + frame.timestamp_us = timestamp_us; + rtc_source.capture_frame(frame); + } + #[cfg(target_os = "macos")] + CapturedFrameBuffer::Native(frame) => { + frame.frame_metadata = frame_metadata; + frame.timestamp_us = timestamp_us; + rtc_source.capture_frame(frame); + } + } let webrtc_capture_finished_at = Instant::now(); + let webrtc_capture_finished_wall_time_us = unix_time_us_now(); if let Some(shared) = display_shared.as_ref() { - let (stride_y, stride_u, stride_v) = frame.buffer.strides(); - let (data_y, data_u, data_v) = frame.buffer.data(); let timing_sample = if config.display_timing { publish_timing_state .as_ref() @@ -1515,18 +1769,42 @@ async fn run_capture_loop( } else { None }; - video_display::pack_i420_into_shared( - shared, - width, - height, - data_y, - stride_y as u32, - data_u, - stride_u as u32, - data_v, - stride_v as u32, - timing_sample, - ); + match &captured_frame { + CapturedFrameBuffer::I420(frame) => { + let (stride_y, stride_u, stride_v) = frame.buffer.strides(); + let (data_y, data_u, data_v) = frame.buffer.data(); + video_display::pack_i420_into_shared( + shared, + width, + height, + data_y, + stride_y as u32, + data_u, + stride_u as u32, + data_v, + stride_v as u32, + timing_sample, + ); + } + #[cfg(target_os = "macos")] + CapturedFrameBuffer::Native(frame) => { + let i420 = frame.buffer.to_i420(); + let (stride_y, stride_u, stride_v) = i420.strides(); + let (data_y, data_u, data_v) = i420.data(); + video_display::pack_i420_into_shared( + shared, + width, + height, + data_y, + stride_y as u32, + data_u, + stride_u as u32, + data_v, + stride_v as u32, + timing_sample, + ); + } + } } frames += 1; @@ -1550,9 +1828,19 @@ async fn run_capture_loop( timings .paced_wait_ms 
.record((paced_wait_finished_at - paced_wait_started_at).as_secs_f64() * 1000.0); - timings - .camera_frame_read_ms - .record((source_frame_acquired_at - source_frame_started_at).as_secs_f64() * 1000.0); + timings.camera_frame_read_ms.record( + (source_frame_acquired_at - source_frame_read_started_at).as_secs_f64() * 1000.0, + ); + if has_capture_timestamp && read_wall_time_us >= capture_wall_time_us { + timings + .capture_timestamp_age_ms + .record((read_wall_time_us - capture_wall_time_us) as f64 / 1000.0); + } + if has_capture_timestamp && webrtc_capture_finished_wall_time_us >= capture_wall_time_us { + timings.capture_timestamp_to_webrtc_ms.record( + (webrtc_capture_finished_wall_time_us - capture_wall_time_us) as f64 / 1000.0, + ); + } if used_decode_path { timings .decode_mjpeg_ms @@ -1569,9 +1857,9 @@ async fn run_capture_loop( timings .submit_to_webrtc_ms .record((webrtc_capture_finished_at - buffer_ready_at).as_secs_f64() * 1000.0); - timings - .capture_to_webrtc_total_ms - .record((webrtc_capture_finished_at - source_frame_started_at).as_secs_f64() * 1000.0); + timings.capture_to_webrtc_total_ms.record( + (webrtc_capture_finished_at - frame_pipeline_started_at).as_secs_f64() * 1000.0, + ); if last_fps_log.elapsed() >= std::time::Duration::from_secs(2) { let secs = last_fps_log.elapsed().as_secs_f64(); diff --git a/livekit-capture/Cargo.toml b/livekit-capture/Cargo.toml index b3218de14..0c997e201 100644 --- a/livekit-capture/Cargo.toml +++ b/livekit-capture/Cargo.toml @@ -39,6 +39,7 @@ avfoundation = [ "objc2-av-foundation/dispatch2", "objc2-av-foundation/objc2-core-media", "objc2-core-media/CMFormatDescription", + "objc2-core-media/CMSync", "objc2-core-media/CMTime", "objc2-core-media/CMSampleBuffer", "objc2-core-media/objc2-core-video", diff --git a/livekit-capture/README.md b/livekit-capture/README.md index 21e3e7cb9..5d6b5935f 100644 --- a/livekit-capture/README.md +++ b/livekit-capture/README.md @@ -1,7 +1,7 @@ # livekit-capture -Capture helpers for 
publishing decoded, DMA-BUF, and pre-encoded video frames -with the LiveKit Rust SDK. +Capture helpers for publishing decoded, native platform, DMA-BUF, and +pre-encoded video frames with the LiveKit Rust SDK. Optional source features include `avfoundation`, `libargus`, `v4l`, `tcp-source`, `rtsp`, and `gstreamer`. diff --git a/livekit-capture/src/device.rs b/livekit-capture/src/device.rs index 106821bff..355c05d99 100644 --- a/livekit-capture/src/device.rs +++ b/livekit-capture/src/device.rs @@ -62,6 +62,8 @@ impl fmt::Display for CaptureBackend { #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] #[non_exhaustive] pub enum CapturePath { + /// Platform-native uncompressed frame buffers. + Native, /// Uncompressed CPU-accessible frame buffers. Raw, /// Linux DMA-BUF backed frames. @@ -296,14 +298,14 @@ mod tests { name: "Camera".to_string(), model_id: None, manufacturer: None, - paths: vec![CapturePath::Raw], + paths: vec![CapturePath::Native, CapturePath::Raw], formats: Vec::new(), formats_complete: false, }; assert_eq!(info.backend, CaptureBackend::AvFoundation); assert_eq!(info.selector, CaptureDeviceSelector::Id("camera-0".to_string())); - assert_eq!(info.paths, vec![CapturePath::Raw]); + assert_eq!(info.paths, vec![CapturePath::Native, CapturePath::Raw]); assert!(!info.formats_complete); } } diff --git a/livekit-capture/src/lib.rs b/livekit-capture/src/lib.rs index d83d9d4fa..a50ffb4c5 100644 --- a/livekit-capture/src/lib.rs +++ b/livekit-capture/src/lib.rs @@ -39,6 +39,7 @@ pub use encoded::{ pub use error::CaptureError; pub use source::{ CaptureFrame, CaptureFrameSource, CaptureSourceError, CaptureSourceOptions, - EncodedCaptureFrameSource, EncodedFrameSourceError, RawVideoFrame, VideoCaptureSource, + EncodedCaptureFrameSource, EncodedFrameSourceError, NativeVideoFrame, RawVideoFrame, + VideoCaptureSource, }; pub use track::VideoCaptureTrack; diff --git a/livekit-capture/src/platform/avfoundation.rs b/livekit-capture/src/platform/avfoundation.rs index 
81d907d15..62dc56947 100644 --- a/livekit-capture/src/platform/avfoundation.rs +++ b/livekit-capture/src/platform/avfoundation.rs @@ -18,7 +18,7 @@ use std::sync::{ }; use std::thread::JoinHandle; -use livekit::webrtc::video_frame::{I420Buffer, VideoBuffer, VideoFrame}; +use livekit::webrtc::video_frame::{native::NativeBuffer, I420Buffer, VideoBuffer, VideoFrame}; use thiserror::Error; use crate::{ @@ -32,6 +32,8 @@ use crate::{ #[cfg(target_os = "macos")] const FIRST_FRAME_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(5); +#[cfg(target_os = "macos")] +const MAX_CAPTURE_TIMESTAMP_AGE_US: u64 = 5_000_000; /// Options used to create an AVFoundation capture session. #[derive(Debug, Clone, PartialEq, Eq)] @@ -78,11 +80,36 @@ impl AvFoundationFrame { } } -/// AVFoundation decoded-frame capture session that emits I420 frames. +/// One AVFoundation frame backed by a native IOSurface-backed `CVPixelBuffer`. +#[derive(Debug)] +pub struct AvFoundationNativeFrame { + /// Native frame suitable for [`crate::VideoCaptureTrack::capture_frame`]. + pub frame: VideoFrame, + /// Source frame format delivered by AVFoundation. + pub source_format: CaptureFrameFormat, + /// Wall-clock timestamp selected for metadata and timing correlation. + pub capture_wall_time_us: u64, + /// Wall-clock timestamp recorded after the frame was read from AVFoundation. + pub read_wall_time_us: u64, + /// Sensor timestamp translated to UNIX-epoch microseconds, when available. + pub sensor_timestamp_us: Option, +} + +impl AvFoundationNativeFrame { + /// Returns the native video frame. + pub fn video_frame(&self) -> &VideoFrame { + &self.frame + } +} + +/// AVFoundation capture session that emits I420 frames or native `CVPixelBuffer`s. 
pub struct AvFoundationCaptureSession { format: CaptureFormat, options: AvFoundationCaptureOptions, target_resolution: Option, + native_frame_supported: bool, + #[cfg(target_os = "macos")] + core_video_pixel_format: u32, #[cfg(target_os = "macos")] inner: macos::SessionInner, } @@ -114,6 +141,11 @@ impl AvFoundationCaptureSession { self.capture_frame_inner() } + /// Captures the next frame as a native `CVPixelBuffer`. + pub fn capture_native_frame(&mut self) -> Result { + self.capture_native_frame_inner() + } + /// Returns the negotiated capture format. pub fn format(&self) -> CaptureFormat { self.format @@ -126,19 +158,44 @@ impl AvFoundationCaptureSession { /// Returns the capture path produced by this session. pub fn capture_path(&self) -> CapturePath { - CapturePath::Raw + if self.native_capture_supported() { + CapturePath::Native + } else { + CapturePath::Raw + } + } + + /// Returns the CoreVideo pixel format type delivered by AVFoundation. + #[cfg(target_os = "macos")] + pub fn core_video_pixel_format(&self) -> u32 { + self.core_video_pixel_format + } + + pub(crate) fn native_capture_supported(&self) -> bool { + self.native_frame_supported + && self.target_resolution.is_none() + && self.format.frame_format == CaptureFrameFormat::Nv12 } #[cfg(target_os = "macos")] fn open(options: AvFoundationCaptureOptions) -> Result { let inner = macos::SessionInner::new(&options)?; - let mut format = inner.wait_for_format(FIRST_FRAME_TIMEOUT)?; + let initial_frame = inner.wait_for_format(FIRST_FRAME_TIMEOUT)?; + inner.discard_pending_frame(); + let mut format = initial_frame.format; format.frame_rate = requested_frame_rate_hint(&options.format).unwrap_or(30); let target_resolution = requested_output_resolution(&options.format, format.resolution); if let Some(resolution) = target_resolution { format.resolution = resolution; } - Ok(Self { format, options, target_resolution, inner }) + Ok(Self { + format, + options, + target_resolution, + native_frame_supported: 
initial_frame.native_frame_supported, + core_video_pixel_format: initial_frame.core_video_pixel_format, + inner, + }) } #[cfg(not(target_os = "macos"))] @@ -167,6 +224,22 @@ impl AvFoundationCaptureSession { fn capture_frame_inner(&mut self) -> Result { Err(AvFoundationError::UnsupportedPlatform) } + + #[cfg(target_os = "macos")] + fn capture_native_frame_inner(&mut self) -> Result { + if self.target_resolution.is_some() { + return Err(AvFoundationError::NativeCaptureUnavailable); + } + if self.format.frame_format != CaptureFrameFormat::Nv12 { + return Err(AvFoundationError::UnsupportedFrameFormat(self.format.frame_format)); + } + self.inner.capture_native_frame() + } + + #[cfg(not(target_os = "macos"))] + fn capture_native_frame_inner(&mut self) -> Result { + Err(AvFoundationError::UnsupportedPlatform) + } } /// AVFoundation decoded-frame capture session that forwards frames into a track. @@ -271,6 +344,9 @@ pub enum AvFoundationError { /// AVFoundation produced a pixel format this backend cannot convert yet. #[error("unsupported AVFoundation pixel format 0x{0:08x}")] UnsupportedCoreVideoPixelFormat(u32), + /// Native capture cannot be used for the negotiated session. + #[error("AVFoundation native capture requires NV12 without software scaling")] + NativeCaptureUnavailable, /// Pixel conversion failed. 
#[error("failed to convert AVFoundation frame to I420: {0}")] Convert(&'static str), @@ -422,7 +498,7 @@ fn list_devices() -> Result, AvFoundationError> { name, model_id, manufacturer, - paths: vec![CapturePath::Raw], + paths: vec![CapturePath::Native, CapturePath::Raw], formats: Vec::new(), formats_complete: false, }); @@ -449,15 +525,23 @@ fn start_capture(capture: &mut AvFoundationCapture) -> Result<(), AvFoundationEr let track = capture.track.clone(); let mut session = AvFoundationCaptureSession::new(capture.options.clone())?; + let capture_native = session.native_capture_supported(); let stop = Arc::new(AtomicBool::new(false)); let stop_for_thread = stop.clone(); let handle = std::thread::Builder::new() .name("avfoundation-capture".into()) .spawn(move || { while !stop_for_thread.load(Ordering::Acquire) { - match session.capture_frame() { - Ok(frame) => track.capture_frame(&frame.frame), - Err(_) => break, + if capture_native { + match session.capture_native_frame() { + Ok(frame) => track.capture_frame(&frame.frame), + Err(_) => break, + } + } else { + match session.capture_frame() { + Ok(frame) => track.capture_frame(&frame.frame), + Err(_) => break, + } } } }) @@ -492,22 +576,32 @@ fn stop_capture(_capture: &mut AvFoundationCapture) -> Result<(), AvFoundationEr #[cfg(target_os = "macos")] mod macos { + use std::ffi::c_void; + use std::ops::Deref; + use std::ptr::NonNull; use std::sync::{Arc, Condvar, Mutex}; use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH}; use dispatch2::{DispatchQueue, DispatchRetained}; - use livekit::webrtc::video_frame::{I420Buffer, VideoBuffer, VideoFrame, VideoRotation}; + use livekit::webrtc::video_frame::{ + native::NativeBuffer, I420Buffer, VideoFrame, VideoRotation, + }; use objc2::rc::Retained; use objc2::runtime::{AnyObject, ProtocolObject}; use objc2::{define_class, msg_send, AnyThread, DefinedClass, Message}; use objc2_av_foundation::{ AVCaptureDevice, AVCaptureDeviceFormat, AVCaptureDeviceInput, AVCaptureOutput, 
AVCaptureSession, AVCaptureSessionPreset1280x720, AVCaptureSessionPreset1920x1080, - AVCaptureSessionPreset640x480, AVCaptureSessionPresetHigh, AVCaptureSessionPresetMedium, - AVCaptureVideoDataOutput, AVCaptureVideoDataOutputSampleBufferDelegate, AVMediaTypeVideo, + AVCaptureSessionPreset640x480, AVCaptureSessionPresetHigh, + AVCaptureSessionPresetInputPriority, AVCaptureSessionPresetMedium, + AVCaptureVideoDataOutput, AVCaptureVideoDataOutputSampleBufferDelegate, + AVCaptureVideoStabilizationMode, AVMediaTypeVideo, + }; + use objc2_core_media::{ + CMClock, CMSampleBuffer, CMTime, CMTimeFlags, CMVideoFormatDescriptionGetDimensions, }; - use objc2_core_media::{CMSampleBuffer, CMTime, CMVideoFormatDescriptionGetDimensions}; use objc2_core_video::{ + kCVPixelBufferIOSurfacePropertiesKey, kCVPixelBufferMetalCompatibilityKey, kCVPixelBufferPixelFormatTypeKey, kCVPixelFormatType_32BGRA, kCVPixelFormatType_420YpCbCr8BiPlanarFullRange, kCVPixelFormatType_420YpCbCr8BiPlanarVideoRange, kCVPixelFormatType_420YpCbCr8Planar, @@ -522,11 +616,19 @@ mod macos { }; use objc2_foundation::{NSDictionary, NSNumber, NSObject, NSObjectProtocol, NSString}; - use super::{AvFoundationCaptureOptions, AvFoundationError, AvFoundationFrame}; + use super::{ + AvFoundationCaptureOptions, AvFoundationError, AvFoundationFrame, AvFoundationNativeFrame, + MAX_CAPTURE_TIMESTAMP_AGE_US, + }; use crate::device::{ CaptureDeviceSelector, CaptureFormat, CaptureFormatRequest, CaptureFrameFormat, CaptureResolution, }; + + unsafe extern "C" { + fn CFRelease(cf: *const c_void); + fn CVPixelBufferGetIOSurface(pixel_buffer: *const CVPixelBuffer) -> *const c_void; + } pub(super) struct SessionInner { session: Retained, _input: Retained, @@ -573,6 +675,7 @@ mod macos { // camera input and video data output only after canAdd* checks. 
unsafe { session.beginConfiguration(); + session.setAutomaticallyConfiguresCaptureDeviceForWideColor(false); if active_format.is_none() { if let Some(preset) = session_preset(&options.format) { session.setSessionPreset(preset); @@ -587,6 +690,12 @@ mod macos { session.addInput(&input); configure_device(&device, &options.format, active_format.as_deref())?; + if active_format.is_some() + && session.canSetSessionPreset(AVCaptureSessionPresetInputPriority) + { + session.setSessionPreset(AVCaptureSessionPresetInputPriority); + } + configure_input_frame_duration(&input, &device, &options.format); if let Some(video_settings) = preferred_video_settings(&output) { output.setVideoSettings(Some(&video_settings)); @@ -602,6 +711,7 @@ mod macos { )); } session.addOutput(&output); + configure_output_connection(&output)?; Ok(()) })(); session.commitConfiguration(); @@ -620,35 +730,117 @@ mod macos { pub(super) fn wait_for_format( &self, timeout: Duration, - ) -> Result { + ) -> Result { self.shared.wait_for_format(timeout) } pub(super) fn capture_frame(&mut self) -> Result { self.shared.take_frame() } + + pub(super) fn capture_native_frame( + &mut self, + ) -> Result { + self.shared.take_native_frame() + } + + pub(super) fn discard_pending_frame(&self) { + self.shared.discard_latest(); + } } fn preferred_video_settings( output: &AVCaptureVideoDataOutput, ) -> Option>> { - let preferred = kCVPixelFormatType_420YpCbCr8BiPlanarVideoRange; + let preferred = [ + // WebRTC's VideoToolbox H.264 encoder allocates full-range NV12 + // buffers for its CPU upload path. Prefer the same CoreVideo + // format for direct CVPixelBuffer input so the native path does + // not have to reset VideoToolbox into a separate video-range pool. 
+ kCVPixelFormatType_420YpCbCr8BiPlanarFullRange, + kCVPixelFormatType_420YpCbCr8BiPlanarVideoRange, + ]; // SAFETY: `output` is a live AVCaptureVideoDataOutput owned by the session setup path, and // querying advertised CV pixel formats does not mutate Rust-managed memory. - let supported = unsafe { output.availableVideoCVPixelFormatTypes() } - .iter() - .any(|format| format.as_u32() == preferred); - if !supported { - return None; + let supported_formats = unsafe { output.availableVideoCVPixelFormatTypes() }; + let pixel_format_type = preferred.into_iter().find(|preferred| { + supported_formats.iter().any(|format| format.as_u32() == *preferred) + })?; + + let pixel_format = NSNumber::new_u32(pixel_format_type); + let metal_compatible = NSNumber::new_bool(true); + let iosurface_properties = NSDictionary::::new(); + // SAFETY: The CoreVideo constants are immutable CFString keys. + // `CFString` and `NSString` are toll-free bridged, which + // objc2-foundation exposes through `AsRef`. + let pixel_format_key: &NSString = unsafe { kCVPixelBufferPixelFormatTypeKey }.as_ref(); + // SAFETY: Same as above. + let iosurface_key: &NSString = unsafe { kCVPixelBufferIOSurfacePropertiesKey }.as_ref(); + // SAFETY: Same as above. + let metal_key: &NSString = unsafe { kCVPixelBufferMetalCompatibilityKey }.as_ref(); + Some(NSDictionary::from_slices( + &[pixel_format_key, iosurface_key, metal_key], + &[pixel_format.as_ref(), iosurface_properties.as_ref(), metal_compatible.as_ref()], + )) + } + + fn configure_input_frame_duration( + input: &AVCaptureDeviceInput, + device: &AVCaptureDevice, + request: &CaptureFormatRequest, + ) { + let Some(frame_rate) = requested_frame_rate(request).filter(|frame_rate| *frame_rate > 0) + else { + return; + }; + // SAFETY: `input` is the live input just added to the session. The + // support predicate is checked before setting the locked duration. 
+ if !unsafe { input.isLockedVideoFrameDurationSupported() } { + return; } - let pixel_format = NSNumber::new_u32(preferred); - // SAFETY: `kCVPixelBufferPixelFormatTypeKey` is a CoreVideo-provided - // immutable CFString constant. `CFString` and `NSString` are toll-free - // bridged, which objc2-foundation exposes through `AsRef`. - let key: &NSString = unsafe { kCVPixelBufferPixelFormatTypeKey }.as_ref(); - let value: &AnyObject = pixel_format.as_ref(); - Some(NSDictionary::from_slices(&[key], &[value])) + let duration = unsafe { CMTime::with_seconds(1.0 / frame_rate as f64, 600) }; + // SAFETY: `device` and `input` belong to the same session setup path. + // The requested rate has already been checked against the active format + // before the device frame durations are set, and `input` reports locked + // frame duration support. + unsafe { + if device_format_supports_frame_rate(&device.activeFormat(), frame_rate) { + input.setActiveLockedVideoFrameDuration(duration); + } + } + } + + fn configure_output_connection( + output: &AVCaptureVideoDataOutput, + ) -> Result<(), AvFoundationError> { + let media_type = unsafe { AVMediaTypeVideo }.ok_or(AvFoundationError::DeviceNotFound)?; + // SAFETY: `output` has just been added to a configured session. Querying + // its video connection does not mutate Rust-managed memory. + let Some(connection) = (unsafe { output.connectionWithMediaType(media_type) }) else { + return Err(AvFoundationError::SessionSetup( + "video data output connection was not created".to_string(), + )); + }; + + // Keep frame-duration control on the device/input path. The deprecated + // output connection frame-duration setters can change whether macOS + // delivers IOSurface-backed pixel buffers. + // SAFETY: The connection is the video data output connection. Each + // setter is guarded by the corresponding support/configuration checks + // required by AVFoundation's API contract. 
+ unsafe { + if connection.isVideoStabilizationSupported() { + connection.setPreferredVideoStabilizationMode(AVCaptureVideoStabilizationMode::Off); + } + if connection.automaticallyAdjustsVideoMirroring() { + connection.setAutomaticallyAdjustsVideoMirroring(false); + } + if connection.isVideoMirroringSupported() && connection.isVideoMirrored() { + connection.setVideoMirrored(false); + } + } + Ok(()) } #[derive(Debug)] @@ -713,13 +905,20 @@ mod macos { #[derive(Debug, Default)] struct FrameQueueState { - latest: Option, + latest: Option, stopped: bool, error: Option, } + #[derive(Debug)] + pub(super) struct InitialFrameInfo { + pub(super) format: CaptureFormat, + pub(super) native_frame_supported: bool, + pub(super) core_video_pixel_format: u32, + } + impl FrameQueue { - fn push_frame(&self, frame: AvFoundationFrame) { + fn push_frame(&self, frame: QueuedAvFoundationFrame) { let mut state = self.state.lock().expect("AVFoundation frame queue poisoned"); if state.stopped { return; @@ -740,16 +939,27 @@ mod macos { self.ready.notify_all(); } - fn wait_for_format(&self, timeout: Duration) -> Result { + fn discard_latest(&self) { + let mut state = self.state.lock().expect("AVFoundation frame queue poisoned"); + state.latest = None; + } + + fn wait_for_format( + &self, + timeout: Duration, + ) -> Result { let mut state = self.state.lock().expect("AVFoundation frame queue poisoned"); loop { if let Some(frame) = state.latest.as_ref() { - let buffer = &frame.frame.buffer; - return Ok(CaptureFormat::new( - CaptureResolution::new(buffer.width(), buffer.height()), - 0, - frame.source_format, - )); + return Ok(InitialFrameInfo { + format: CaptureFormat::new( + CaptureResolution::new(frame.width, frame.height), + 0, + frame.source_format, + ), + native_frame_supported: frame.native_frame_supported(), + core_video_pixel_format: frame.core_video_pixel_format, + }); } if let Some(error) = state.error.take() { return Err(AvFoundationError::Runtime(error)); @@ -773,7 +983,23 @@ mod 
macos { let mut state = self.state.lock().expect("AVFoundation frame queue poisoned"); loop { if let Some(frame) = state.latest.take() { - return Ok(frame); + return frame.into_i420_frame(); + } + if let Some(error) = state.error.take() { + return Err(AvFoundationError::Runtime(error)); + } + if state.stopped { + return Err(AvFoundationError::NotRunning); + } + state = self.ready.wait(state).expect("AVFoundation frame queue poisoned"); + } + } + + fn take_native_frame(&self) -> Result { + let mut state = self.state.lock().expect("AVFoundation frame queue poisoned"); + loop { + if let Some(frame) = state.latest.take() { + return frame.into_native_frame(); } if let Some(error) = state.error.take() { return Err(AvFoundationError::Runtime(error)); @@ -790,6 +1016,131 @@ mod macos { } } + #[derive(Debug)] + struct QueuedAvFoundationFrame { + pixel_buffer: RetainedPixelBuffer, + width: u32, + height: u32, + source_format: CaptureFrameFormat, + core_video_pixel_format: u32, + capture_wall_time_us: u64, + read_wall_time_us: u64, + sensor_timestamp_us: Option, + timestamp_us: i64, + is_iosurface_backed: bool, + } + + impl QueuedAvFoundationFrame { + fn into_i420_frame(self) -> Result { + let (buffer, source_format) = convert_pixel_buffer(self.pixel_buffer.as_ref())?; + let frame = VideoFrame { + rotation: VideoRotation::VideoRotation0, + timestamp_us: self.timestamp_us, + frame_metadata: None, + buffer, + }; + + Ok(AvFoundationFrame { + frame, + source_format, + capture_wall_time_us: self.capture_wall_time_us, + read_wall_time_us: self.read_wall_time_us, + sensor_timestamp_us: self.sensor_timestamp_us, + used_conversion: source_format != CaptureFrameFormat::I420, + }) + } + + fn into_native_frame(self) -> Result { + if self.source_format != CaptureFrameFormat::Nv12 { + return Err(AvFoundationError::UnsupportedFrameFormat(self.source_format)); + } + if self.core_video_pixel_format != kCVPixelFormatType_420YpCbCr8BiPlanarFullRange { + return 
Err(AvFoundationError::NativeCaptureUnavailable); + } + if !self.is_iosurface_backed { + return Err(AvFoundationError::NativeCaptureUnavailable); + } + + let buffer = self.pixel_buffer.into_native_buffer(); + let frame = VideoFrame { + rotation: VideoRotation::VideoRotation0, + timestamp_us: self.timestamp_us, + frame_metadata: None, + buffer, + }; + + Ok(AvFoundationNativeFrame { + frame, + source_format: self.source_format, + capture_wall_time_us: self.capture_wall_time_us, + read_wall_time_us: self.read_wall_time_us, + sensor_timestamp_us: self.sensor_timestamp_us, + }) + } + + fn native_frame_supported(&self) -> bool { + self.source_format == CaptureFrameFormat::Nv12 + && self.core_video_pixel_format == kCVPixelFormatType_420YpCbCr8BiPlanarFullRange + && self.is_iosurface_backed + } + } + + fn pixel_buffer_has_iosurface(pixel_buffer: &CVPixelBuffer) -> bool { + // SAFETY: `pixel_buffer` is a valid CVPixelBufferRef. CoreVideo returns + // an unretained IOSurfaceRef; this code only checks for null and does + // not store or release the returned pointer. + !unsafe { CVPixelBufferGetIOSurface(pixel_buffer) }.is_null() + } + + #[derive(Debug)] + struct RetainedPixelBuffer { + ptr: NonNull, + } + + // SAFETY: `RetainedPixelBuffer` owns a +1 CoreFoundation reference to a + // CVPixelBuffer. CoreFoundation retain/release and CoreVideo pixel-buffer + // inspection are thread-safe for this usage, and mutable pixel access still + // goes through CoreVideo's lock/unlock API. + unsafe impl Send for RetainedPixelBuffer {} + // SAFETY: The wrapper exposes only shared access to the pixel buffer and + // releases its retained reference on drop. 
+ unsafe impl Sync for RetainedPixelBuffer {} + + impl RetainedPixelBuffer { + fn from_image_buffer(image_buffer: T) -> Self + where + T: Deref, + { + let ptr = NonNull::from(&*image_buffer).cast::(); + std::mem::forget(image_buffer); + Self { ptr } + } + + fn as_ref(&self) -> &CVPixelBuffer { + // SAFETY: `ptr` was created from a retained CVImageBuffer returned + // by CMSampleBufferGetImageBuffer and remains valid until this + // wrapper drops or transfers that retain. + unsafe { self.ptr.as_ref() } + } + + fn into_native_buffer(self) -> NativeBuffer { + let ptr = self.ptr.as_ptr().cast::(); + std::mem::forget(self); + // SAFETY: `ptr` is a valid retained CVPixelBufferRef. The WebRTC + // bridge wraps it in RTCCVPixelBuffer and then releases the +1 + // retain we transfer here, so Rust must not release it afterwards. + unsafe { NativeBuffer::from_cv_pixel_buffer(ptr) } + } + } + + impl Drop for RetainedPixelBuffer { + fn drop(&mut self) { + // SAFETY: `ptr` owns one CoreFoundation retain unless ownership was + // transferred by `into_native_buffer`, which forgets `self`. + unsafe { CFRelease(self.ptr.as_ptr().cast::()) }; + } + } + fn select_device( selector: &CaptureDeviceSelector, ) -> Result, AvFoundationError> { @@ -831,11 +1182,6 @@ mod macos { ); selected.map(Some).ok_or(AvFoundationError::UnsupportedFormat(*format)) } - CaptureFormatRequest::Closest(format) - if exact_session_preset(format.resolution).is_some() => - { - Ok(None) - } CaptureFormatRequest::Closest(format) => Ok(best_device_format( device, Some(format.resolution), @@ -1021,6 +1367,7 @@ mod macos { device.setActiveFormat(active_format); } } + configure_low_latency_device_processing(device); let Some(frame_rate) = frame_rate.filter(|frame_rate| *frame_rate > 0) else { return Ok(()); @@ -1044,6 +1391,28 @@ mod macos { Ok(()) } + fn configure_low_latency_device_processing(device: &AVCaptureDevice) { + // SAFETY: The caller holds the AVCaptureDevice configuration lock. 
+ // Setters are guarded by their support/current-state predicates where + // AVFoundation requires that. + unsafe { + if device.automaticallyAdjustsVideoHDREnabled() { + device.setAutomaticallyAdjustsVideoHDREnabled(false); + } + if device.isVideoHDREnabled() { + device.setVideoHDREnabled(false); + } + if device.isLowLightBoostSupported() + && device.automaticallyEnablesLowLightBoostWhenAvailable() + { + device.setAutomaticallyEnablesLowLightBoostWhenAvailable(false); + } + if device.isSmoothAutoFocusSupported() && device.isSmoothAutoFocusEnabled() { + device.setSmoothAutoFocusEnabled(false); + } + } + } + fn requested_frame_rate(request: &CaptureFormatRequest) -> Option { match request { CaptureFormatRequest::Default => None, @@ -1087,35 +1456,98 @@ mod macos { shared: &FrameQueue, ) -> Result<(), AvFoundationError> { let read_wall_time_us = unix_time_us_now().unwrap_or_default(); + let sensor_timestamp_us = + sample_buffer_capture_wall_time_us(sample_buffer, read_wall_time_us); let image_buffer = unsafe { sample_buffer.image_buffer() } .ok_or(AvFoundationError::InvalidFrame("sample buffer has no image buffer"))?; - let image_buffer_ref: &CVImageBuffer = &image_buffer; - // SAFETY: Video data output sample buffers deliver CVPixelBuffer-backed - // CVImageBuffer objects. The retained image buffer keeps the object alive - // for the duration of this conversion. 
- let pixel_buffer = - unsafe { &*(image_buffer_ref as *const CVImageBuffer as *const CVPixelBuffer) }; - let (buffer, source_format) = convert_pixel_buffer(pixel_buffer)?; - - let capture_wall_time_us = read_wall_time_us; - let frame = VideoFrame { - rotation: VideoRotation::VideoRotation0, - timestamp_us: shared.timestamp_us(), - frame_metadata: None, - buffer, - }; - - shared.push_frame(AvFoundationFrame { - frame, + let pixel_buffer = RetainedPixelBuffer::from_image_buffer(image_buffer); + let pixel_buffer_ref = pixel_buffer.as_ref(); + let width = u32::try_from(CVPixelBufferGetWidth(pixel_buffer_ref)) + .map_err(|_| AvFoundationError::InvalidFrame("width is out of range"))?; + let height = u32::try_from(CVPixelBufferGetHeight(pixel_buffer_ref)) + .map_err(|_| AvFoundationError::InvalidFrame("height is out of range"))?; + let source_format = capture_frame_format_from_core_video(CVPixelBufferGetPixelFormatType( + pixel_buffer_ref, + ))?; + let core_video_pixel_format = CVPixelBufferGetPixelFormatType(pixel_buffer_ref); + let is_iosurface_backed = pixel_buffer_has_iosurface(pixel_buffer_ref); + + let capture_wall_time_us = sensor_timestamp_us.unwrap_or(read_wall_time_us); + shared.push_frame(QueuedAvFoundationFrame { + pixel_buffer, + width, + height, source_format, + core_video_pixel_format, capture_wall_time_us, read_wall_time_us, - sensor_timestamp_us: None, - used_conversion: source_format != CaptureFrameFormat::I420, + sensor_timestamp_us, + timestamp_us: shared.timestamp_us(), + is_iosurface_backed, }); Ok(()) } + fn sample_buffer_capture_wall_time_us( + sample_buffer: &CMSampleBuffer, + read_wall_time_us: u64, + ) -> Option { + let sample_time = unsafe { sample_buffer.presentation_time_stamp() }; + + let timestamp_us = cm_time_to_us(sample_time)?; + if validate_capture_timestamp_us(timestamp_us, read_wall_time_us).is_some() { + return Some(timestamp_us); + } + + let host_now_us = current_host_time_us()?; + let age_us = 
host_now_us.checked_sub(timestamp_us)?; + if age_us > MAX_CAPTURE_TIMESTAMP_AGE_US { + return None; + } + read_wall_time_us.checked_sub(age_us) + } + + fn current_host_time_us() -> Option { + // SAFETY: The CoreMedia host time clock is a process-wide singleton and + // reading it does not mutate Rust-managed memory. + let host_clock = unsafe { CMClock::host_time_clock() }; + // SAFETY: `host_clock` is a valid retained CoreMedia clock. + let host_time = unsafe { host_clock.time() }; + cm_time_to_us(host_time) + } + + fn cm_time_to_us(time: CMTime) -> Option { + let flags = time.flags; + if !flags.contains(CMTimeFlags::Valid) + || flags.intersects(CMTimeFlags::ImpliedValueFlagsMask) + { + return None; + } + + // SAFETY: `time` is a valid CMTime value returned by CoreMedia. Invalid + // and indefinite values were filtered above. + let seconds = unsafe { time.seconds() }; + if !seconds.is_finite() || seconds < 0.0 { + return None; + } + + let micros = seconds * 1_000_000.0; + (micros <= u64::MAX as f64).then_some(micros.round() as u64) + } + + fn validate_capture_timestamp_us( + capture_timestamp_us: u64, + read_wall_time_us: u64, + ) -> Option { + if capture_timestamp_us == 0 || capture_timestamp_us > read_wall_time_us { + return None; + } + if read_wall_time_us - capture_timestamp_us > MAX_CAPTURE_TIMESTAMP_AGE_US { + return None; + } + Some(capture_timestamp_us) + } + fn convert_pixel_buffer( pixel_buffer: &CVPixelBuffer, ) -> Result<(I420Buffer, CaptureFrameFormat), AvFoundationError> { @@ -1143,37 +1575,47 @@ mod macos { .map_err(|_| AvFoundationError::InvalidFrame("width is out of range"))?; let height = u32::try_from(CVPixelBufferGetHeight(pixel_buffer)) .map_err(|_| AvFoundationError::InvalidFrame("height is out of range"))?; - let pixel_format = CVPixelBufferGetPixelFormatType(pixel_buffer); + let source_format = + capture_frame_format_from_core_video(CVPixelBufferGetPixelFormatType(pixel_buffer))?; + + match source_format { + CaptureFrameFormat::Nv12 => 
convert_nv12(pixel_buffer, width, height) + .map(|buffer| (buffer, CaptureFrameFormat::Nv12)), + CaptureFrameFormat::Bgra => convert_bgra(pixel_buffer, width, height) + .map(|buffer| (buffer, CaptureFrameFormat::Bgra)), + CaptureFrameFormat::I420 => convert_i420(pixel_buffer, width, height) + .map(|buffer| (buffer, CaptureFrameFormat::I420)), + CaptureFrameFormat::Uyvy => convert_uyvy(pixel_buffer, width, height) + .map(|buffer| (buffer, CaptureFrameFormat::Uyvy)), + CaptureFrameFormat::Yuyv => convert_yuy2(pixel_buffer, width, height) + .map(|buffer| (buffer, CaptureFrameFormat::Yuyv)), + other => Err(AvFoundationError::UnsupportedFrameFormat(other)), + } + } + fn capture_frame_format_from_core_video( + pixel_format: u32, + ) -> Result { match pixel_format { format if format == kCVPixelFormatType_420YpCbCr8BiPlanarVideoRange || format == kCVPixelFormatType_420YpCbCr8BiPlanarFullRange => { - convert_nv12(pixel_buffer, width, height) - .map(|buffer| (buffer, CaptureFrameFormat::Nv12)) - } - format if format == kCVPixelFormatType_32BGRA => { - convert_bgra(pixel_buffer, width, height) - .map(|buffer| (buffer, CaptureFrameFormat::Bgra)) + Ok(CaptureFrameFormat::Nv12) } + format if format == kCVPixelFormatType_32BGRA => Ok(CaptureFrameFormat::Bgra), format if format == kCVPixelFormatType_420YpCbCr8Planar || format == kCVPixelFormatType_420YpCbCr8PlanarFullRange => { - convert_i420(pixel_buffer, width, height) - .map(|buffer| (buffer, CaptureFrameFormat::I420)) - } - format if format == kCVPixelFormatType_422YpCbCr8 => { - convert_uyvy(pixel_buffer, width, height) - .map(|buffer| (buffer, CaptureFrameFormat::Uyvy)) + Ok(CaptureFrameFormat::I420) } + format if format == kCVPixelFormatType_422YpCbCr8 => Ok(CaptureFrameFormat::Uyvy), format if format == kCVPixelFormatType_422YpCbCr8_yuvs || format == kCVPixelFormatType_422YpCbCr8FullRange => { - convert_yuy2(pixel_buffer, width, height) - .map(|buffer| (buffer, CaptureFrameFormat::Yuyv)) + Ok(CaptureFrameFormat::Yuyv) } 
other => Err(AvFoundationError::UnsupportedCoreVideoPixelFormat(other)), } diff --git a/livekit-capture/src/source.rs b/livekit-capture/src/source.rs index f2b006c72..41fc6cc0f 100644 --- a/livekit-capture/src/source.rs +++ b/livekit-capture/src/source.rs @@ -14,7 +14,7 @@ use std::{error::Error, fmt}; -use livekit::webrtc::video_frame::{I420Buffer, VideoFrame}; +use livekit::webrtc::video_frame::{native::NativeBuffer, I420Buffer, VideoFrame}; use thiserror::Error; use crate::{ @@ -76,10 +76,34 @@ impl RawVideoFrame { } } +/// Platform-native uncompressed video frame buffer produced by a capture source. +#[derive(Debug)] +pub struct NativeVideoFrame { + /// Native video frame suitable for [`VideoCaptureTrack::capture_frame`]. + pub frame: VideoFrame, + /// Source format delivered by the capture backend. + pub source_format: CaptureFrameFormat, + /// Wall-clock capture timestamp in microseconds. + pub capture_wall_time_us: u64, + /// Wall-clock timestamp recorded after the frame was read, in microseconds. + pub read_wall_time_us: u64, + /// Sensor timestamp translated to UNIX-epoch microseconds, when available. + pub sensor_timestamp_us: Option, +} + +impl NativeVideoFrame { + /// Returns the native video frame. + pub fn video_frame(&self) -> &VideoFrame { + &self.frame + } +} + /// Frame produced by a capture source. #[derive(Debug)] #[non_exhaustive] pub enum CaptureFrame { + /// Platform-native uncompressed frame. + Native(NativeVideoFrame), /// Uncompressed CPU-accessible frame. Raw(RawVideoFrame), /// Linux DMA-BUF backed frame. @@ -92,6 +116,7 @@ impl CaptureFrame { /// Returns the capture path used by this frame. pub fn capture_path(&self) -> CapturePath { match self { + Self::Native(_) => CapturePath::Native, Self::Raw(_) => CapturePath::Raw, Self::DmaBuf(_) => CapturePath::DmaBuf, Self::Encoded(_) => CapturePath::Encoded, @@ -101,6 +126,10 @@ impl CaptureFrame { /// Publishes this frame into a LiveKit capture track. 
pub fn publish_to(&self, track: &VideoCaptureTrack) -> Result<(), CaptureError> { match self { + Self::Native(frame) => { + track.capture_frame(&frame.frame); + Ok(()) + } Self::Raw(frame) => { track.capture_frame(&frame.frame); Ok(()) @@ -344,7 +373,24 @@ impl CaptureFrameSource for crate::sources::avfoundation::AvFoundationCaptureSes } fn next_frame(&mut self) -> Result { - self.capture_frame().map(|frame| CaptureFrame::Raw(frame.into())) + if self.native_capture_supported() { + self.capture_native_frame().map(|frame| CaptureFrame::Native(frame.into())) + } else { + self.capture_frame().map(|frame| CaptureFrame::Raw(frame.into())) + } + } +} + +#[cfg(feature = "avfoundation")] +impl From for NativeVideoFrame { + fn from(frame: crate::sources::avfoundation::AvFoundationNativeFrame) -> Self { + Self { + frame: frame.frame, + source_format: frame.source_format, + capture_wall_time_us: frame.capture_wall_time_us, + read_wall_time_us: frame.read_wall_time_us, + sensor_timestamp_us: frame.sensor_timestamp_us, + } } } From 80fae4ae3f77cbb13193c3b65675cb4f20f5b016 Mon Sep 17 00:00:00 2001 From: David Chen Date: Mon, 29 Jun 2026 20:24:35 -0700 Subject: [PATCH 13/24] remove nokhwa dep --- Cargo.lock | 66 +-- livekit-capture/Cargo.toml | 5 +- livekit-capture/src/device.rs | 12 +- livekit-capture/src/sources/v4l.rs | 716 ++++++++++++++++++++--------- 4 files changed, 503 insertions(+), 296 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 90b439707..283f35d31 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2238,18 +2238,6 @@ dependencies = [ "miniz_oxide", ] -[[package]] -name = "flume" -version = "0.11.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da0e4dd2a88388a1f4ccc7c9ce104604dab68d9f408dc34cd45823d5a9069095" -dependencies = [ - "futures-core", - "futures-sink", - "nanorand", - "spin", -] - [[package]] name = "fnv" version = "1.0.7" @@ -4039,15 +4027,16 @@ dependencies = [ "gstreamer", "gstreamer-app", "image", + "libc", "livekit", 
"md-5", - "nokhwa", "objc2 0.6.4", "objc2-av-foundation", "objc2-core-media", "objc2-core-video", "objc2-foundation 0.3.2", "thiserror 2.0.18", + "v4l", "yuv-sys", ] @@ -4419,15 +4408,6 @@ dependencies = [ "unicode-ident", ] -[[package]] -name = "nanorand" -version = "0.7.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6a51313c5820b0b02bd422f4b44776fbf47961755c74ce64afc73bfad10226c3" -dependencies = [ - "getrandom 0.2.17", -] - [[package]] name = "napi" version = "3.8.3" @@ -4610,39 +4590,6 @@ version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2bf50223579dc7cdcfb3bfcacf7069ff68243f8c363f62ffa99cf000a6b9c451" -[[package]] -name = "nokhwa" -version = "0.10.10" -source = "git+https://github.com/l1npengtul/nokhwa?rev=4923ecab7cf26f9dba83867a15a9d8662d021296#4923ecab7cf26f9dba83867a15a9d8662d021296" -dependencies = [ - "flume", - "image", - "nokhwa-bindings-linux", - "nokhwa-core", - "paste", - "thiserror 2.0.18", -] - -[[package]] -name = "nokhwa-bindings-linux" -version = "0.1.3" -source = "git+https://github.com/l1npengtul/nokhwa?rev=4923ecab7cf26f9dba83867a15a9d8662d021296#4923ecab7cf26f9dba83867a15a9d8662d021296" -dependencies = [ - "libc", - "nokhwa-core", - "v4l", -] - -[[package]] -name = "nokhwa-core" -version = "0.1.8" -source = "git+https://github.com/l1npengtul/nokhwa?rev=4923ecab7cf26f9dba83867a15a9d8662d021296#4923ecab7cf26f9dba83867a15a9d8662d021296" -dependencies = [ - "bytes", - "image", - "thiserror 2.0.18", -] - [[package]] name = "nom" version = "7.1.3" @@ -7066,15 +7013,6 @@ dependencies = [ "hound", ] -[[package]] -name = "spin" -version = "0.9.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67" -dependencies = [ - "lock_api", -] - [[package]] name = "spirv" version = "0.4.0+sdk-1.4.341.0" diff --git a/livekit-capture/Cargo.toml b/livekit-capture/Cargo.toml index 
0c997e201..22f7eaff3 100644 --- a/livekit-capture/Cargo.toml +++ b/livekit-capture/Cargo.toml @@ -60,7 +60,7 @@ gstreamer = ["dep:gstreamer", "dep:gstreamer-app"] libargus = [] rtsp = [] tcp-source = [] -v4l = ["dep:image", "dep:nokhwa", "dep:yuv-sys"] +v4l = ["dep:image", "dep:libc", "dep:v4l", "dep:yuv-sys"] [build-dependencies] cc = { workspace = true } @@ -74,4 +74,5 @@ objc2-core-video = { version = "0.3.2", default-features = false, optional = tru objc2-foundation = { version = "0.3.2", default-features = false, features = ["std"], optional = true } [target.'cfg(target_os = "linux")'.dependencies] -nokhwa = { git = "https://github.com/l1npengtul/nokhwa", rev = "4923ecab7cf26f9dba83867a15a9d8662d021296", default-features = false, features = ["input-v4l"], optional = true } +libc = { version = "0.2", optional = true } +v4l = { version = "0.14", default-features = false, features = ["v4l2"], optional = true } diff --git a/livekit-capture/src/device.rs b/livekit-capture/src/device.rs index 355c05d99..6907a7123 100644 --- a/livekit-capture/src/device.rs +++ b/livekit-capture/src/device.rs @@ -143,7 +143,7 @@ pub enum CaptureFrameFormat { /// Packed UYVY. Uyvy, /// Single-plane 8-bit luma. - Gray, + Grey, /// Encoded MJPEG frames. 
Mjpeg, } @@ -159,7 +159,7 @@ impl CaptureFrameFormat { Self::Bgr24 => "bgr24", Self::Yuyv => "yuyv", Self::Uyvy => "uyvy", - Self::Gray => "gray", + Self::Grey => "grey", Self::Mjpeg => "mjpeg", } } @@ -183,7 +183,7 @@ impl std::str::FromStr for CaptureFrameFormat { "bgr24" | "bgr" => Ok(Self::Bgr24), "yuyv" | "yuy2" => Ok(Self::Yuyv), "uyvy" => Ok(Self::Uyvy), - "gray" | "grey" | "greyscale" | "grayscale" => Ok(Self::Gray), + "grey" | "greyscale" => Ok(Self::Grey), "mjpeg" | "mjpg" => Ok(Self::Mjpeg), _ => Err(CaptureFrameFormatParseError), } @@ -278,15 +278,15 @@ mod tests { fn capture_frame_format_parses_common_names() { assert_eq!(CaptureFrameFormat::from_str("MJPEG"), Ok(CaptureFrameFormat::Mjpeg)); assert_eq!(CaptureFrameFormat::from_str("mjpg"), Ok(CaptureFrameFormat::Mjpeg)); - assert_eq!(CaptureFrameFormat::from_str("gray"), Ok(CaptureFrameFormat::Gray)); - assert_eq!(CaptureFrameFormat::from_str("GREY"), Ok(CaptureFrameFormat::Gray)); + assert_eq!(CaptureFrameFormat::from_str("grey"), Ok(CaptureFrameFormat::Grey)); + assert_eq!(CaptureFrameFormat::from_str("GREY"), Ok(CaptureFrameFormat::Grey)); assert_eq!(CaptureFrameFormat::from_str("yuy2"), Ok(CaptureFrameFormat::Yuyv)); } #[test] fn capture_frame_format_displays_canonical_names() { assert_eq!(CaptureFrameFormat::Mjpeg.to_string(), "mjpeg"); - assert_eq!(CaptureFrameFormat::Gray.to_string(), "gray"); + assert_eq!(CaptureFrameFormat::Grey.to_string(), "grey"); } #[test] diff --git a/livekit-capture/src/sources/v4l.rs b/livekit-capture/src/sources/v4l.rs index 9fb6c495f..899282f4e 100644 --- a/livekit-capture/src/sources/v4l.rs +++ b/livekit-capture/src/sources/v4l.rs @@ -12,25 +12,31 @@ // See the License for the specific language governing permissions and // limitations under the License. -//! Linux V4L2 capture using Nokhwa's V4L backend. +//! Linux V4L2 capture using direct V4L2 access. 
use std::time::Duration; #[cfg(target_os = "linux")] -use std::time::{Instant, SystemTime, UNIX_EPOCH}; +use std::{ + path::Path, + time::{Instant, SystemTime, UNIX_EPOCH}, +}; #[cfg(target_os = "linux")] use livekit::webrtc::video_frame::VideoRotation; use livekit::webrtc::video_frame::{I420Buffer, VideoFrame}; +use thiserror::Error; #[cfg(target_os = "linux")] -use nokhwa::{ - pixel_format::RgbFormat, - utils::{ - ApiBackend, CameraFormat, CameraIndex, FrameFormat, RequestedFormat, RequestedFormatType, - Resolution, - }, - Camera, +use v4l::{ + buffer::Type as V4lBufferType, + capability::Flags as V4lCapabilityFlags, + context, + format::{Format as V4lFormat, FourCC}, + frameinterval::FrameIntervalEnum, + framesize::FrameSizeEnum, + io::{mmap::Stream as MmapStream, traits::CaptureStream}, + video::{capture::Parameters as V4lCaptureParameters, Capture}, + Device, }; -use thiserror::Error; #[cfg(target_os = "linux")] use crate::device::CaptureBackend; @@ -54,7 +60,7 @@ pub struct V4lCaptureOptions { } impl V4lCaptureOptions { - /// Creates options that try YUYV, MJPEG, grayscale, RGB24, and NV12 at the requested format. + /// Creates options that try YUYV, MJPEG, greyscale, RGB24, and NV12 at the requested format. pub fn new( device: CaptureDeviceSelector, resolution: CaptureResolution, @@ -90,7 +96,7 @@ pub enum V4lError { /// The requested option is invalid. #[error("invalid V4L capture option: {0}")] InvalidOption(&'static str), - /// A numeric option could not be represented by Nokhwa. + /// A numeric option could not be represented by the V4L backend. #[error("V4L capture option is out of range: {0}")] OptionOutOfRange(&'static str), /// The camera backend returned an error. @@ -138,7 +144,7 @@ impl V4lFrame { /// Linux V4L2 capture session that emits decoded I420 frames. 
pub struct V4lCaptureSession { #[cfg(target_os = "linux")] - camera: Camera, + stream: MmapStream<'static>, format: CaptureFormat, options: V4lCaptureOptions, #[cfg(target_os = "linux")] @@ -184,19 +190,12 @@ impl V4lCaptureSession { #[cfg(target_os = "linux")] fn open(options: V4lCaptureOptions) -> Result { let frame_formats = frame_formats_for_request(&options)?; - let requested = RequestedFormat::with_formats(RequestedFormatType::None, &frame_formats); - let mut camera = Camera::with_backend( - camera_index(&options.device)?, - requested, - ApiBackend::Video4Linux, - ) - .map_err(nokhwa_error)?; - - apply_format_request(&mut camera, &options, &frame_formats)?; - - camera.open_stream().map_err(nokhwa_error)?; - let format = capture_format_from_nokhwa(camera.camera_format())?; - Ok(Self { camera, format, options, started_at: Instant::now() }) + let device = open_device(&options.device)?; + let all_formats = enumerate_device_formats(&device)?; + let format = apply_format_request(&device, &options, &frame_formats, &all_formats)?; + let stream = + MmapStream::with_buffers(&device, V4lBufferType::VideoCapture, 4).map_err(v4l_error)?; + Ok(Self { stream, format, options, started_at: Instant::now() }) } #[cfg(not(target_os = "linux"))] @@ -207,41 +206,36 @@ impl V4lCaptureSession { #[cfg(target_os = "linux")] fn capture_frame_inner(&mut self) -> Result { let fallback_wall_time_us = unix_time_us_now().unwrap_or_default(); - let buffer = self.camera.frame().map_err(nokhwa_error)?; + let format = self.format; + let (buffer, metadata) = self.stream.next().map_err(v4l_error)?; let read_wall_time_us = unix_time_us_now().unwrap_or(fallback_wall_time_us); - let backend_capture_timestamp = buffer.capture_timestamp(); + let backend_capture_timestamp = monotonic_to_wallclock(metadata.timestamp); let capture_wall_time_us = select_capture_wall_time_us( backend_capture_timestamp, fallback_wall_time_us, read_wall_time_us, ); - let format = self.camera.camera_format(); - let width = 
format.width(); - let height = format.height(); + let width = format.resolution.width; + let height = format.resolution.height; let mut frame = VideoFrame { rotation: VideoRotation::VideoRotation0, timestamp_us: elapsed_us(self.started_at.elapsed()), frame_metadata: None, buffer: I420Buffer::new(width, height), }; - let used_decode_path = convert_to_i420( - buffer.source_frame_format(), - buffer.buffer(), - width, - height, - &mut frame.buffer, - )?; - let source_format = capture_frame_format_from_nokhwa(buffer.source_frame_format())?; + let source = frame_bytes(buffer, metadata.bytesused); + let used_decode_path = + convert_to_i420(format.frame_format, source, width, height, &mut frame.buffer)?; Ok(V4lFrame { frame, - source_format, + source_format: format.frame_format, backend_capture_timestamp, capture_wall_time_us, read_wall_time_us, sensor_timestamp_us: None, - used_conversion: source_format != CaptureFrameFormat::I420, + used_conversion: format.frame_format != CaptureFrameFormat::I420, used_decode_path, }) } @@ -255,27 +249,47 @@ impl V4lCaptureSession { /// Returns Linux V4L2 capture devices. #[cfg(target_os = "linux")] pub fn devices() -> Result, V4lError> { - nokhwa::query(ApiBackend::Video4Linux) - .map_err(nokhwa_error)? 
+ context::enum_devices() .into_iter() - .map(|info| { - let formats = enumerate_formats(&info); - let (formats, formats_complete) = match formats { - Ok(formats) => (formats, true), - Err(_) => (Vec::new(), false), + .filter_map(|node| { + let id = node.index().to_string(); + let fallback_name = + node.name().unwrap_or_else(|| node.path().to_string_lossy().into_owned()); + let mut name = fallback_name; + let mut model_id = None; + let mut manufacturer = None; + let mut formats = Vec::new(); + let mut formats_complete = false; + + if let Ok(device) = Device::with_path(node.path()) { + if let Ok(capabilities) = device.query_caps() { + if !capabilities.capabilities.contains(V4lCapabilityFlags::VIDEO_CAPTURE) { + return None; + } + if !capabilities.card.is_empty() { + name = capabilities.card; + } + model_id = Some(capabilities.bus).filter(|value| !value.is_empty()); + manufacturer = Some(capabilities.driver).filter(|value| !value.is_empty()); + } + + if let Ok(device_formats) = enumerate_device_formats(&device) { + formats = device_formats; + formats_complete = true; + } }; - let id = info.index().as_string(); - Ok(CaptureDeviceInfo { + + Some(Ok(CaptureDeviceInfo { backend: CaptureBackend::V4l2, id: id.clone(), selector: CaptureDeviceSelector::Id(id), - name: info.human_name(), - model_id: Some(info.description().to_string()).filter(|value| !value.is_empty()), - manufacturer: None, + name, + model_id, + manufacturer, paths: vec![CapturePath::Raw], formats, formats_complete, - }) + })) }) .collect() } @@ -291,7 +305,7 @@ pub fn default_frame_formats() -> Vec { vec![ CaptureFrameFormat::Yuyv, CaptureFrameFormat::Mjpeg, - CaptureFrameFormat::Gray, + CaptureFrameFormat::Grey, CaptureFrameFormat::Rgb24, CaptureFrameFormat::Nv12, ] @@ -319,7 +333,7 @@ fn validate_options(options: &V4lCaptureOptions) -> Result<(), V4lError> { return Err(V4lError::InvalidOption("frame_formats must include at least one format")); } for frame_format in &options.frame_formats { - if 
nokhwa_frame_format(*frame_format).is_none() { + if !is_supported_source_format(*frame_format) { return Err(V4lError::UnsupportedFrameFormat(*frame_format)); } } @@ -338,7 +352,7 @@ fn validate_format_request(format: &CaptureFormatRequest) -> Result<(), V4lError if format.frame_rate == 0 { return Err(V4lError::InvalidOption("frame_rate must be non-zero")); } - if nokhwa_frame_format(format.frame_format).is_none() { + if !is_supported_source_format(format.frame_format) { return Err(V4lError::UnsupportedFrameFormat(format.frame_format)); } Ok(()) @@ -354,7 +368,7 @@ fn validate_format_request(format: &CaptureFormatRequest) -> Result<(), V4lError validate_resolution(*resolution)?; } if let Some(frame_format) = frame_format { - if nokhwa_frame_format(*frame_format).is_none() { + if !is_supported_source_format(*frame_format) { return Err(V4lError::UnsupportedFrameFormat(*frame_format)); } } @@ -365,7 +379,7 @@ fn validate_format_request(format: &CaptureFormatRequest) -> Result<(), V4lError return Err(V4lError::InvalidOption("frame_rate must be non-zero")); } if let Some(frame_format) = frame_format { - if nokhwa_frame_format(*frame_format).is_none() { + if !is_supported_source_format(*frame_format) { return Err(V4lError::UnsupportedFrameFormat(*frame_format)); } } @@ -385,18 +399,27 @@ fn validate_resolution(resolution: CaptureResolution) -> Result<(), V4lError> { } #[cfg(target_os = "linux")] -fn camera_index(selector: &CaptureDeviceSelector) -> Result { +fn open_device(selector: &CaptureDeviceSelector) -> Result { match selector { - CaptureDeviceSelector::Default => Ok(CameraIndex::Index(0)), - CaptureDeviceSelector::Index(index) => Ok(CameraIndex::Index( - u32::try_from(*index).map_err(|_| V4lError::OptionOutOfRange("device index"))?, - )), - CaptureDeviceSelector::Id(id) => Ok(CameraIndex::String(id.clone())), + CaptureDeviceSelector::Default => Device::new(0).map_err(v4l_error), + CaptureDeviceSelector::Index(index) => Device::new(*index).map_err(v4l_error), + 
CaptureDeviceSelector::Id(id) => open_device_id(id), } } #[cfg(target_os = "linux")] -fn frame_formats_for_request(options: &V4lCaptureOptions) -> Result, V4lError> { +fn open_device_id(id: &str) -> Result { + if let Ok(index) = id.parse::() { + return Device::new(index).map_err(v4l_error); + } + + Device::with_path(Path::new(id)).map_err(v4l_error) +} + +#[cfg(target_os = "linux")] +fn frame_formats_for_request( + options: &V4lCaptureOptions, +) -> Result, V4lError> { let mut formats = match &options.format { CaptureFormatRequest::Exact(format) | CaptureFormatRequest::Closest(format) => { ordered_formats_with_first(&options.frame_formats, format.frame_format) @@ -412,10 +435,12 @@ fn frame_formats_for_request(options: &V4lCaptureOptions) -> Result( - request: &CaptureFormatRequest, - frame_formats: &'a [FrameFormat], - override_format: Option, -) -> Result, V4lError> { - let request_type = match request { - CaptureFormatRequest::Default => RequestedFormatType::None, - CaptureFormatRequest::Exact(format) => { - RequestedFormatType::Exact(nokhwa_camera_format(*format, override_format)?) - } - CaptureFormatRequest::Closest(format) => { - RequestedFormatType::Closest(nokhwa_camera_format(*format, override_format)?) - } - CaptureFormatRequest::HighestFrameRate { resolution: Some(resolution), .. } => { - RequestedFormatType::HighestResolution(nokhwa_resolution(*resolution)) - } - CaptureFormatRequest::HighestFrameRate { resolution: None, .. } => { - RequestedFormatType::AbsoluteHighestFrameRate - } - CaptureFormatRequest::HighestResolution { frame_rate: Some(frame_rate), .. } => { - RequestedFormatType::HighestFrameRate(*frame_rate) - } - CaptureFormatRequest::HighestResolution { frame_rate: None, .. 
} => { - RequestedFormatType::AbsoluteHighestResolution - } - }; - Ok(RequestedFormat::with_formats(request_type, frame_formats)) -} - #[cfg(target_os = "linux")] fn apply_format_request( - camera: &mut Camera, + device: &Device, options: &V4lCaptureOptions, - frame_formats: &[FrameFormat], -) -> Result<(), V4lError> { + frame_formats: &[CaptureFrameFormat], + all_formats: &[CaptureFormat], +) -> Result { match options.format { - CaptureFormatRequest::Default => Ok(()), + CaptureFormatRequest::Default => { + let selected = select_format_for_request(&options.format, frame_formats, all_formats)?; + set_device_format(device, selected) + } CaptureFormatRequest::Exact(_) | CaptureFormatRequest::Closest(_) => { - apply_ordered_format_request(camera, options, frame_formats) + apply_ordered_format_request(device, options, frame_formats, all_formats) } CaptureFormatRequest::HighestFrameRate { .. } | CaptureFormatRequest::HighestResolution { .. } => { - let selected = select_highest_format( - &options.format, - frame_formats, - &camera.compatible_camera_formats().map_err(nokhwa_error)?, - )?; - camera - .set_camera_requset(RequestedFormat::with_formats( - RequestedFormatType::Exact(selected), - &[selected.format()], - )) - .map(|_| ()) - .map_err(nokhwa_error) + let selected = select_format_for_request(&options.format, frame_formats, all_formats)?; + set_device_format(device, selected) } } } #[cfg(target_os = "linux")] fn apply_ordered_format_request( - camera: &mut Camera, + device: &Device, options: &V4lCaptureOptions, - frame_formats: &[FrameFormat], -) -> Result<(), V4lError> { + frame_formats: &[CaptureFrameFormat], + all_formats: &[CaptureFormat], +) -> Result { let mut last_error = None; for frame_format in frame_formats { - let requested = requested_format(&options.format, frame_formats, Some(*frame_format))?; - match camera.set_camera_requset(requested) { - Ok(_) => return Ok(()), + let request = format_request_with_frame_format(&options.format, *frame_format); + 
let selected = match select_format_for_request(&request, &[*frame_format], all_formats) { + Ok(selected) => selected, + Err(error) => { + last_error = Some(error); + continue; + } + }; + + match set_device_format(device, selected) { + Ok(format) => return Ok(format), Err(error) => last_error = Some(error), } } - Err(last_error - .map(nokhwa_error) - .unwrap_or(V4lError::InvalidOption("no V4L frame formats were requested"))) + Err(last_error.unwrap_or(V4lError::InvalidOption("no V4L frame formats were requested"))) } #[cfg(target_os = "linux")] -fn select_highest_format( +fn format_request_with_frame_format( request: &CaptureFormatRequest, - frame_formats: &[FrameFormat], - all_formats: &[CameraFormat], -) -> Result { - let candidates = all_formats - .iter() - .copied() - .filter(|format| frame_formats.contains(&format.format())) - .filter(|format| match request { - CaptureFormatRequest::HighestFrameRate { resolution, .. } => resolution - .map(|resolution| format.resolution() == nokhwa_resolution(resolution)) - .unwrap_or(true), - CaptureFormatRequest::HighestResolution { frame_rate, .. } => { - frame_rate.map(|frame_rate| format.frame_rate() == frame_rate).unwrap_or(true) + frame_format: CaptureFrameFormat, +) -> CaptureFormatRequest { + match request { + CaptureFormatRequest::Exact(format) => CaptureFormatRequest::Exact(CaptureFormat::new( + format.resolution, + format.frame_rate, + frame_format, + )), + CaptureFormatRequest::Closest(format) => CaptureFormatRequest::Closest(CaptureFormat::new( + format.resolution, + format.frame_rate, + frame_format, + )), + CaptureFormatRequest::Default => CaptureFormatRequest::Default, + CaptureFormatRequest::HighestFrameRate { resolution, .. 
} => { + CaptureFormatRequest::HighestFrameRate { + resolution: *resolution, + frame_format: Some(frame_format), } - CaptureFormatRequest::Default - | CaptureFormatRequest::Exact(_) - | CaptureFormatRequest::Closest(_) => false, - }); + } + CaptureFormatRequest::HighestResolution { frame_rate, .. } => { + CaptureFormatRequest::HighestResolution { + frame_rate: *frame_rate, + frame_format: Some(frame_format), + } + } + } +} +#[cfg(target_os = "linux")] +fn select_format_for_request( + request: &CaptureFormatRequest, + frame_formats: &[CaptureFrameFormat], + all_formats: &[CaptureFormat], +) -> Result { let selected = match request { - CaptureFormatRequest::HighestFrameRate { .. } => candidates.max_by(|a, b| { - a.frame_rate() - .cmp(&b.frame_rate()) - .then_with(|| a.resolution().cmp(&b.resolution())) - .then_with(|| compare_format_preference(a.format(), b.format(), frame_formats)) - }), - CaptureFormatRequest::HighestResolution { .. } => candidates.max_by(|a, b| { - a.resolution() - .cmp(&b.resolution()) - .then_with(|| a.frame_rate().cmp(&b.frame_rate())) - .then_with(|| compare_format_preference(a.format(), b.format(), frame_formats)) - }), - CaptureFormatRequest::Default - | CaptureFormatRequest::Exact(_) - | CaptureFormatRequest::Closest(_) => None, + CaptureFormatRequest::Default => { + all_formats.iter().find(|format| frame_formats.contains(&format.frame_format)).copied() + } + CaptureFormatRequest::Exact(format) => { + if frame_formats.contains(&format.frame_format) { + Some(*format) + } else { + None + } + } + CaptureFormatRequest::Closest(format) => { + select_closest_format(*format, frame_formats, all_formats) + } + CaptureFormatRequest::HighestFrameRate { .. } => { + select_highest_frame_rate_format(request, frame_formats, all_formats) + } + CaptureFormatRequest::HighestResolution { .. 
} => { + select_highest_resolution_format(request, frame_formats, all_formats) + } }; selected.ok_or_else(|| V4lError::Camera("CameraFormat: Failed to Fufill".to_string())) } +#[cfg(target_os = "linux")] +fn select_closest_format( + requested: CaptureFormat, + frame_formats: &[CaptureFrameFormat], + all_formats: &[CaptureFormat], +) -> Option { + if !frame_formats.contains(&requested.frame_format) { + return None; + } + + let resolution = all_formats + .iter() + .copied() + .filter(|format| format.frame_format == requested.frame_format) + .min_by_key(|format| resolution_distance(format.resolution, requested.resolution))? + .resolution; + + let frame_rate = all_formats + .iter() + .copied() + .filter(|format| { + format.frame_format == requested.frame_format && format.resolution == resolution + }) + .min_by_key(|format| format.frame_rate.abs_diff(requested.frame_rate))? + .frame_rate; + + Some(CaptureFormat::new(resolution, frame_rate, requested.frame_format)) +} + +#[cfg(target_os = "linux")] +fn select_highest_frame_rate_format( + request: &CaptureFormatRequest, + frame_formats: &[CaptureFrameFormat], + all_formats: &[CaptureFormat], +) -> Option { + all_formats + .iter() + .copied() + .filter(|format| frame_formats.contains(&format.frame_format)) + .filter(|format| match request { + CaptureFormatRequest::HighestFrameRate { resolution, frame_format } => { + resolution.map(|resolution| format.resolution == resolution).unwrap_or(true) + && frame_format + .map(|frame_format| format.frame_format == frame_format) + .unwrap_or(true) + } + _ => false, + }) + .max_by(|left, right| { + left.frame_rate + .cmp(&right.frame_rate) + .then_with(|| compare_resolution(left.resolution, right.resolution)) + .then_with(|| { + compare_format_preference(left.frame_format, right.frame_format, frame_formats) + }) + }) +} + +#[cfg(target_os = "linux")] +fn select_highest_resolution_format( + request: &CaptureFormatRequest, + frame_formats: &[CaptureFrameFormat], + all_formats: 
&[CaptureFormat], +) -> Option { + all_formats + .iter() + .copied() + .filter(|format| frame_formats.contains(&format.frame_format)) + .filter(|format| match request { + CaptureFormatRequest::HighestResolution { frame_rate, frame_format } => { + frame_rate.map(|frame_rate| format.frame_rate == frame_rate).unwrap_or(true) + && frame_format + .map(|frame_format| format.frame_format == frame_format) + .unwrap_or(true) + } + _ => false, + }) + .max_by(|left, right| { + compare_resolution(left.resolution, right.resolution) + .then_with(|| left.frame_rate.cmp(&right.frame_rate)) + .then_with(|| { + compare_format_preference(left.frame_format, right.frame_format, frame_formats) + }) + }) +} + +#[cfg(target_os = "linux")] +fn compare_resolution(left: CaptureResolution, right: CaptureResolution) -> std::cmp::Ordering { + frame_area(left) + .cmp(&frame_area(right)) + .then_with(|| left.width.cmp(&right.width)) + .then_with(|| left.height.cmp(&right.height)) +} + +#[cfg(target_os = "linux")] +fn resolution_distance(left: CaptureResolution, right: CaptureResolution) -> u64 { + let width = i64::from(left.width) - i64::from(right.width); + let height = i64::from(left.height) - i64::from(right.height); + width.unsigned_abs().pow(2) + height.unsigned_abs().pow(2) +} + +#[cfg(target_os = "linux")] +fn frame_area(resolution: CaptureResolution) -> u64 { + u64::from(resolution.width) * u64::from(resolution.height) +} + #[cfg(target_os = "linux")] fn compare_format_preference( - left: FrameFormat, - right: FrameFormat, - frame_formats: &[FrameFormat], + left: CaptureFrameFormat, + right: CaptureFrameFormat, + frame_formats: &[CaptureFrameFormat], ) -> std::cmp::Ordering { let left_index = frame_formats.iter().position(|format| *format == left).unwrap_or(usize::MAX); let right_index = @@ -562,87 +684,203 @@ fn compare_format_preference( } #[cfg(target_os = "linux")] -fn nokhwa_camera_format( - format: CaptureFormat, - override_format: Option, -) -> Result { - let frame_format = match 
override_format { - Some(format) => format, - None => nokhwa_frame_format(format.frame_format) - .ok_or(V4lError::UnsupportedFrameFormat(format.frame_format))?, - }; - Ok(CameraFormat::new(nokhwa_resolution(format.resolution), frame_format, format.frame_rate)) +fn set_device_format(device: &Device, selected: CaptureFormat) -> Result { + let current = device_capture_format(device)?; + let format_changed = + current.resolution != selected.resolution || current.frame_format != selected.frame_format; + if format_changed { + device + .set_format(&V4lFormat::new( + selected.resolution.width, + selected.resolution.height, + fourcc_for_frame_format(selected.frame_format) + .ok_or(V4lError::UnsupportedFrameFormat(selected.frame_format))?, + )) + .map_err(v4l_error)?; + } + if format_changed || current.frame_rate != selected.frame_rate { + device + .set_params(&V4lCaptureParameters::with_fps(selected.frame_rate)) + .map_err(v4l_error)?; + } + + let actual = device_capture_format(device)?; + if actual != selected { + return Err(V4lError::Camera(format!( + "CameraFormat rejected: requested {:?}, got {:?}", + selected, actual + ))); + } + Ok(actual) } #[cfg(target_os = "linux")] -fn nokhwa_resolution(resolution: CaptureResolution) -> Resolution { - Resolution::new(resolution.width, resolution.height) +fn device_capture_format(device: &Device) -> Result { + let format = device.format().map_err(v4l_error)?; + let params = device.params().map_err(v4l_error)?; + let frame_rate = frame_rate_from_fraction(params.interval) + .ok_or(V4lError::InvalidOption("V4L frame interval must be a whole frame rate"))?; + Ok(CaptureFormat::new( + CaptureResolution::new(format.width, format.height), + frame_rate, + capture_frame_format_from_fourcc(format.fourcc) + .ok_or_else(|| V4lError::Camera(format!("unsupported V4L fourcc {}", format.fourcc)))?, + )) } #[cfg(target_os = "linux")] -fn nokhwa_frame_format(pixel_format: CaptureFrameFormat) -> Option { - match pixel_format { - 
CaptureFrameFormat::Nv12 => Some(FrameFormat::NV12), - CaptureFrameFormat::Rgb24 => Some(FrameFormat::RAWRGB), - CaptureFrameFormat::Bgr24 => Some(FrameFormat::RAWBGR), - CaptureFrameFormat::Yuyv => Some(FrameFormat::YUYV), - CaptureFrameFormat::Gray => Some(FrameFormat::GRAY), - CaptureFrameFormat::Mjpeg => Some(FrameFormat::MJPEG), - CaptureFrameFormat::I420 | CaptureFrameFormat::Bgra | CaptureFrameFormat::Uyvy => None, +fn enumerate_device_formats(device: &Device) -> Result, V4lError> { + let mut formats = Vec::new(); + let fourccs = device + .enum_formats() + .map_err(v4l_error)? + .into_iter() + .filter_map(|format| capture_frame_format_from_fourcc(format.fourcc).map(|_| format.fourcc)) + .collect::>(); + + for fourcc in dedup_fourccs(fourccs) { + let Some(frame_format) = capture_frame_format_from_fourcc(fourcc) else { + continue; + }; + let frame_sizes = device.enum_framesizes(fourcc).map_err(v4l_error)?; + for resolution in frame_sizes.into_iter().flat_map(resolutions_from_frame_size) { + let intervals = device + .enum_frameintervals(fourcc, resolution.width, resolution.height) + .unwrap_or_default(); + for frame_rate in intervals.into_iter().flat_map(frame_rates_from_interval) { + formats.push(CaptureFormat::new(resolution, frame_rate, frame_format)); + } + } } + + Ok(formats) } -#[cfg(not(target_os = "linux"))] -fn nokhwa_frame_format(pixel_format: CaptureFrameFormat) -> Option<()> { - match pixel_format { +fn is_supported_source_format(frame_format: CaptureFrameFormat) -> bool { + matches!( + frame_format, CaptureFrameFormat::Nv12 - | CaptureFrameFormat::Rgb24 - | CaptureFrameFormat::Bgr24 - | CaptureFrameFormat::Yuyv - | CaptureFrameFormat::Gray - | CaptureFrameFormat::Mjpeg => Some(()), + | CaptureFrameFormat::Rgb24 + | CaptureFrameFormat::Bgr24 + | CaptureFrameFormat::Yuyv + | CaptureFrameFormat::Grey + | CaptureFrameFormat::Mjpeg + ) +} + +#[cfg(target_os = "linux")] +fn fourcc_for_frame_format(frame_format: CaptureFrameFormat) -> Option { + match 
frame_format { + CaptureFrameFormat::Nv12 => Some(FourCC::new(b"NV12")), + CaptureFrameFormat::Rgb24 => Some(FourCC::new(b"RGB3")), + CaptureFrameFormat::Bgr24 => Some(FourCC::new(b"BGR3")), + CaptureFrameFormat::Yuyv => Some(FourCC::new(b"YUYV")), + CaptureFrameFormat::Grey => Some(FourCC::new(b"GREY")), + CaptureFrameFormat::Mjpeg => Some(FourCC::new(b"MJPG")), CaptureFrameFormat::I420 | CaptureFrameFormat::Bgra | CaptureFrameFormat::Uyvy => None, } } #[cfg(target_os = "linux")] -fn capture_format_from_nokhwa(format: CameraFormat) -> Result { - Ok(CaptureFormat::new( - CaptureResolution::new(format.width(), format.height()), - format.frame_rate(), - capture_frame_format_from_nokhwa(format.format())?, - )) +fn capture_frame_format_from_fourcc(fourcc: FourCC) -> Option { + match fourcc.str().ok()? { + "NV12" => Some(CaptureFrameFormat::Nv12), + "RGB3" => Some(CaptureFrameFormat::Rgb24), + "BGR3" => Some(CaptureFrameFormat::Bgr24), + "YUYV" | "YUY2" => Some(CaptureFrameFormat::Yuyv), + "GREY" => Some(CaptureFrameFormat::Grey), + "MJPG" | "JPEG" => Some(CaptureFrameFormat::Mjpeg), + _ => None, + } } #[cfg(target_os = "linux")] -fn capture_frame_format_from_nokhwa(format: FrameFormat) -> Result { - match format { - FrameFormat::MJPEG => Ok(CaptureFrameFormat::Mjpeg), - FrameFormat::YUYV => Ok(CaptureFrameFormat::Yuyv), - FrameFormat::NV12 => Ok(CaptureFrameFormat::Nv12), - FrameFormat::GRAY => Ok(CaptureFrameFormat::Gray), - FrameFormat::RAWRGB => Ok(CaptureFrameFormat::Rgb24), - FrameFormat::RAWBGR => Ok(CaptureFrameFormat::Bgr24), +fn dedup_fourccs(fourccs: Vec) -> Vec { + let mut deduped = Vec::new(); + for fourcc in fourccs { + if !deduped.contains(&fourcc) { + deduped.push(fourcc); + } } + deduped } #[cfg(target_os = "linux")] -fn enumerate_formats(info: &nokhwa::utils::CameraInfo) -> Result, V4lError> { - let requested = RequestedFormat::new::(RequestedFormatType::None); - let mut camera = Camera::with_backend(info.index().clone(), requested, 
ApiBackend::Video4Linux) - .map_err(nokhwa_error)?; +fn resolutions_from_frame_size(size: v4l::FrameSize) -> Vec { + match size.size { + FrameSizeEnum::Discrete(discrete) => { + vec![CaptureResolution::new(discrete.width, discrete.height)] + } + FrameSizeEnum::Stepwise(stepwise) => { + let mut resolutions = Vec::new(); + push_stepwise_resolution( + &mut resolutions, + CaptureResolution::new(stepwise.min_width, stepwise.min_height), + ); + push_stepwise_resolution( + &mut resolutions, + CaptureResolution::new(stepwise.max_width, stepwise.max_height), + ); + resolutions + } + } +} - Ok(camera - .compatible_camera_formats() - .map_err(nokhwa_error)? - .into_iter() - .filter_map(|format| capture_format_from_nokhwa(format).ok()) - .collect()) +#[cfg(target_os = "linux")] +fn push_stepwise_resolution( + resolutions: &mut Vec, + resolution: CaptureResolution, +) { + if resolution.width != 0 && resolution.height != 0 && !resolutions.contains(&resolution) { + resolutions.push(resolution); + } +} + +#[cfg(target_os = "linux")] +fn frame_rates_from_interval(interval: v4l::FrameInterval) -> Vec { + match interval.interval { + FrameIntervalEnum::Discrete(fraction) => { + frame_rate_from_fraction(fraction).into_iter().collect() + } + FrameIntervalEnum::Stepwise(stepwise) => { + let mut frame_rates = Vec::new(); + if let Some(frame_rate) = frame_rate_from_fraction(stepwise.min) { + frame_rates.push(frame_rate); + } + if let Some(frame_rate) = frame_rate_from_fraction(stepwise.max) { + if !frame_rates.contains(&frame_rate) { + frame_rates.push(frame_rate); + } + } + frame_rates + } + } +} + +#[cfg(target_os = "linux")] +fn frame_rate_from_fraction(fraction: v4l::Fraction) -> Option { + if fraction.numerator == 0 || fraction.denominator == 0 { + return None; + } + if fraction.denominator % fraction.numerator != 0 { + return None; + } + Some(fraction.denominator / fraction.numerator) +} + +#[cfg(target_os = "linux")] +fn frame_bytes(buffer: &[u8], bytes_used: u32) -> &[u8] { + let 
bytes_used = usize::try_from(bytes_used).unwrap_or(buffer.len()).min(buffer.len()); + if bytes_used == 0 { + buffer + } else { + &buffer[..bytes_used] + } } #[cfg(target_os = "linux")] fn convert_to_i420( - source_format: FrameFormat, + source_format: CaptureFrameFormat, source: &[u8], width: u32, height: u32, @@ -654,7 +892,7 @@ fn convert_to_i420( let height_i32 = i32_from_u32(height, "height")?; let ret = match source_format { - FrameFormat::YUYV => { + CaptureFrameFormat::Yuyv => { validate_len(source, width as usize * height as usize * 2, "YUYV frame")?; unsafe { // SAFETY: Source and destination slices are valid for the dimensions and strides. @@ -672,7 +910,7 @@ fn convert_to_i420( ) } } - FrameFormat::RAWRGB => { + CaptureFrameFormat::Rgb24 => { validate_len(source, width as usize * height as usize * 3, "RGB24 frame")?; unsafe { // SAFETY: Source and destination slices are valid for the dimensions and strides. @@ -690,7 +928,7 @@ fn convert_to_i420( ) } } - FrameFormat::RAWBGR => { + CaptureFrameFormat::Bgr24 => { validate_len(source, width as usize * height as usize * 3, "BGR24 frame")?; unsafe { // SAFETY: Source and destination slices are valid for the dimensions and strides. @@ -708,8 +946,8 @@ fn convert_to_i420( ) } } - FrameFormat::GRAY => { - validate_len(source, width as usize * height as usize, "GRAY frame")?; + CaptureFrameFormat::Grey => { + validate_len(source, width as usize * height as usize, "GREY frame")?; unsafe { // SAFETY: Source and destination slices are valid for the dimensions and strides. 
yuv_sys::rs_I400ToI420( @@ -726,7 +964,7 @@ fn convert_to_i420( ) } } - FrameFormat::NV12 => { + CaptureFrameFormat::Nv12 => { let y_size = width as usize * height as usize; validate_len(source, y_size + y_size / 2, "NV12 frame")?; unsafe { @@ -747,9 +985,12 @@ fn convert_to_i420( ) } } - FrameFormat::MJPEG => { + CaptureFrameFormat::Mjpeg => { return convert_mjpeg_to_i420(source, width, height, destination).map(|()| true); } + CaptureFrameFormat::I420 | CaptureFrameFormat::Bgra | CaptureFrameFormat::Uyvy => { + return Err(V4lError::UnsupportedFrameFormat(source_format)); + } }; if ret == 0 { @@ -871,10 +1112,37 @@ fn i32_from_u32(value: u32, field: &'static str) -> Result { } #[cfg(target_os = "linux")] -fn nokhwa_error(error: nokhwa::NokhwaError) -> V4lError { +fn v4l_error(error: std::io::Error) -> V4lError { V4lError::Camera(error.to_string()) } +#[cfg(target_os = "linux")] +fn monotonic_to_wallclock(timestamp: v4l::Timestamp) -> Option { + let frame_monotonic = Duration::from(timestamp); + if frame_monotonic.is_zero() { + return None; + } + + let monotonic_now = clock_time(libc::CLOCK_MONOTONIC)?; + let wall_now = clock_time(libc::CLOCK_REALTIME)?; + let frame_age = monotonic_now.checked_sub(frame_monotonic)?; + wall_now.checked_sub(frame_age) +} + +#[cfg(target_os = "linux")] +fn clock_time(clock_id: libc::clockid_t) -> Option { + let mut time = libc::timespec { tv_sec: 0, tv_nsec: 0 }; + let ret = unsafe { + // SAFETY: `time` is a valid out pointer and `clock_id` is supplied by libc constants. 
+ libc::clock_gettime(clock_id, &mut time) + }; + if ret != 0 || time.tv_sec < 0 || time.tv_nsec < 0 { + return None; + } + + Some(Duration::new(time.tv_sec as u64, time.tv_nsec as u32)) +} + #[cfg(test)] mod tests { use super::*; From d6281011a136ce64cfc731f9d379ef088d446602 Mon Sep 17 00:00:00 2001 From: David Chen Date: Mon, 29 Jun 2026 22:51:41 -0700 Subject: [PATCH 14/24] add support for GREY cameras --- examples/local_video/src/publisher.rs | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/examples/local_video/src/publisher.rs b/examples/local_video/src/publisher.rs index c65cf6629..c6ec95a4a 100644 --- a/examples/local_video/src/publisher.rs +++ b/examples/local_video/src/publisher.rs @@ -81,21 +81,26 @@ enum SourceKind { /// Selects the UVC camera capture frame format. #[derive(Copy, Clone, Debug, PartialEq, Eq, ValueEnum)] enum CaptureFormat { - /// Try YUYV first and fall back to MJPEG. + /// Try YUYV first, then MJPEG, then GREY. Auto, /// Request uncompressed YUYV capture. Yuv, /// Request compressed MJPEG capture. Mjpeg, + /// Request uncompressed GREY capture. + Grey, } impl CaptureFormat { #[cfg(target_os = "linux")] fn frame_formats(self) -> &'static [CaptureFrameFormat] { match self { - Self::Auto => &[CaptureFrameFormat::Yuyv, CaptureFrameFormat::Mjpeg], + Self::Auto => { + &[CaptureFrameFormat::Yuyv, CaptureFrameFormat::Mjpeg, CaptureFrameFormat::Grey] + } Self::Yuv => &[CaptureFrameFormat::Yuyv], Self::Mjpeg => &[CaptureFrameFormat::Mjpeg], + Self::Grey => &[CaptureFrameFormat::Grey], } } } @@ -106,6 +111,7 @@ impl std::fmt::Display for CaptureFormat { Self::Auto => write!(f, "auto"), Self::Yuv => write!(f, "yuv"), Self::Mjpeg => write!(f, "mjpeg"), + Self::Grey => write!(f, "grey"), } } } @@ -179,7 +185,7 @@ struct Args { #[arg(long, value_enum, default_value_t = SourceKind::Uvc)] source: SourceKind, - /// UVC camera capture format: `auto` tries YUYV then MJPEG; `mjpeg` uses less USB bandwidth. 
+ /// UVC camera capture format: `auto` tries YUYV, MJPEG, then GREY. #[arg(long, value_enum, default_value_t = CaptureFormat::Auto)] format: CaptureFormat, @@ -823,6 +829,14 @@ mod tests { assert_eq!(args.test_pattern, Some(TestPatternKind::AnimatedGraphic)); } + #[test] + fn capture_format_accepts_grey() { + let args = + Args::try_parse_from(["publisher", "--format", "grey"]).expect("args should parse"); + + assert_eq!(args.format, CaptureFormat::Grey); + } + #[test] fn test_pattern_rejects_unknown_numeric_mode() { let err = From 4dc8f5452e1729c90e0032370631feb474a3a6c1 Mon Sep 17 00:00:00 2001 From: David Chen Date: Mon, 29 Jun 2026 23:42:59 -0700 Subject: [PATCH 15/24] add gstreamer app sink --- Cargo.lock | 2 + examples/preencode_publish/Cargo.toml | 6 + examples/preencode_publish/src/main.rs | 585 ++++++++++++++++++++++++- 3 files changed, 584 insertions(+), 9 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 283f35d31..47f253d92 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5706,6 +5706,8 @@ dependencies = [ "anyhow", "clap", "env_logger 0.11.10", + "gstreamer", + "gstreamer-app", "livekit", "livekit-api", "livekit-capture", diff --git a/examples/preencode_publish/Cargo.toml b/examples/preencode_publish/Cargo.toml index 7ad348e09..17686c1a6 100644 --- a/examples/preencode_publish/Cargo.toml +++ b/examples/preencode_publish/Cargo.toml @@ -4,10 +4,16 @@ version = "0.1.0" edition.workspace = true publish = false +[features] +default = [] +gstreamer = ["dep:gstreamer", "dep:gstreamer-app", "livekit-capture/gstreamer"] + [dependencies] anyhow = { workspace = true } clap = { workspace = true, features = ["derive", "env"] } env_logger = { workspace = true } +gstreamer = { workspace = true, optional = true } +gstreamer-app = { workspace = true, optional = true } livekit = { workspace = true, features = ["rustls-tls-native-roots"] } livekit-api = { workspace = true, features = ["rustls-tls-native-roots"] } livekit-capture = { workspace = true, features = 
["rtsp", "tcp-source"] } diff --git a/examples/preencode_publish/src/main.rs b/examples/preencode_publish/src/main.rs index 61da68bc1..4ee24c0d0 100644 --- a/examples/preencode_publish/src/main.rs +++ b/examples/preencode_publish/src/main.rs @@ -9,8 +9,18 @@ use std::{ use anyhow::{bail, Context, Result}; use clap::{Parser, ValueEnum}; +#[cfg(feature = "gstreamer")] +use gstreamer as gst; +#[cfg(feature = "gstreamer")] +use gstreamer::prelude::*; +#[cfg(feature = "gstreamer")] +use gstreamer_app as gst_app; use livekit::{prelude::*, webrtc::video_source::VideoResolution}; use livekit_api::access_token; +#[cfg(feature = "gstreamer")] +use livekit_capture::sources::gstreamer::{ + GStreamerAppSinkConfig, GStreamerAppSinkEncodedSource, GStreamerSampleFormat, +}; use livekit_capture::{ encoded::h26x::annex_b_nal_ranges, sources::{ @@ -25,6 +35,8 @@ const DIAGNOSTIC_REPORT_INTERVAL: Duration = Duration::from_secs(1); const SOURCE_STALL_THRESHOLD: Duration = Duration::from_millis(250); const BURST_WALL_DELTA_THRESHOLD: Duration = Duration::from_millis(5); const KEYFRAME_GAP_THRESHOLD: Duration = Duration::from_secs(5); +#[cfg(feature = "gstreamer")] +const GSTREAMER_APPSINK_NAME: &str = "lk_appsink"; /// Publish a pre-encoded video stream into a LiveKit room. #[derive(Parser, Debug)] @@ -35,6 +47,8 @@ struct Args { source: SourceKind, /// Encoded video codec. Required with --source tcp; optional validation with --source rtsp. + /// Optional with --source gstappsink; omitted custom GStreamer pipelines infer H.264/H.265 + /// from their unlinked encoded output when possible. #[arg(long, value_enum)] codec: Option, @@ -74,22 +88,31 @@ struct Args { #[arg(long, default_value_t = 1080)] height: u32, - /// Frame rate used to timestamp TCP Annex-B access units. + /// Frame rate used for generated video and fallback timestamps. #[arg(long, default_value_t = 30)] fps: u32, /// Log access-unit timing, keyframe, and H26x NAL diagnostics. 
#[arg(long)] diagnostics: bool, + + /// GStreamer launch pipeline used with --source gstappsink. If the pipeline does not include + /// appsink name=lk_appsink, an H.264/H.265 parser and appsink are attached to its unlinked + /// output. + #[cfg(feature = "gstreamer")] + #[arg(last = true, value_name = "PIPELINE")] + gstreamer_pipeline: Vec, } -#[derive(Debug, Clone, Copy, ValueEnum)] +#[derive(Debug, Clone, Copy, PartialEq, Eq, ValueEnum)] enum SourceKind { Tcp, Rtsp, + #[cfg(feature = "gstreamer")] + Gstappsink, } -#[derive(Debug, Clone, Copy, ValueEnum)] +#[derive(Debug, Clone, Copy, PartialEq, Eq, ValueEnum)] enum CodecArg { H264, H265, @@ -119,6 +142,8 @@ async fn main() -> Result<()> { async fn run(args: Args) -> Result<()> { validate_dimensions(args.width, args.height)?; + #[cfg(feature = "gstreamer")] + validate_gstreamer_args(&args)?; match args.source { SourceKind::Tcp => { @@ -126,9 +151,22 @@ async fn run(args: Args) -> Result<()> { run_tcp_source(args, frame_interval_us).await } SourceKind::Rtsp => run_rtsp_source(args).await, + #[cfg(feature = "gstreamer")] + SourceKind::Gstappsink => { + let frame_interval_us = frame_interval_us(args.fps)?; + run_gstreamer_source(args, frame_interval_us).await + } } } +#[cfg(feature = "gstreamer")] +fn validate_gstreamer_args(args: &Args) -> Result<()> { + if args.source != SourceKind::Gstappsink && !args.gstreamer_pipeline.is_empty() { + bail!("trailing GStreamer pipeline arguments are only valid with --source gstappsink"); + } + Ok(()) +} + async fn run_tcp_source(args: Args, frame_interval_us: i64) -> Result<()> { let codec_arg = args.codec.context("--codec is required with --source tcp")?; let codec = codec_arg.encoded_codec(); @@ -147,8 +185,17 @@ async fn run_tcp_source(args: Args, frame_interval_us: i64) -> Result<()> { let shutdown_stream = stream.try_clone().context("failed to clone TCP stream")?; let source = TcpEncodedSource::from_tcp_stream(stream, config)?; - publish_encoded_source(args, codec, "TCP", 
source, shutdown_stream, Some(frame_interval_us)) - .await + publish_encoded_source( + args, + codec, + "TCP", + source, + move || { + let _ = shutdown_stream.shutdown(Shutdown::Both); + }, + Some(frame_interval_us), + ) + .await } async fn run_rtsp_source(args: Args) -> Result<()> { @@ -171,19 +218,337 @@ async fn run_rtsp_source(args: Args) -> Result<()> { source.session_info().video_channel ); - publish_encoded_source(args, codec, "RTSP", source, shutdown_stream, None).await + publish_encoded_source( + args, + codec, + "RTSP", + source, + move || { + let _ = shutdown_stream.shutdown(Shutdown::Both); + }, + None, + ) + .await +} + +#[cfg(feature = "gstreamer")] +async fn run_gstreamer_source(args: Args, frame_interval_us: i64) -> Result<()> { + let source = GStreamerTestSource::start( + args.width, + args.height, + args.fps, + current_time_us(), + frame_interval_us, + args.codec.map(CodecArg::encoded_codec), + &args.gstreamer_pipeline, + )?; + let codec = source.codec(); + let shutdown_pipeline = source.shutdown_pipeline(); + log::info!("Started GStreamer {:?} pipeline: {}", codec, source.pipeline_description()); + + publish_encoded_source( + args, + codec, + "GStreamer", + source, + move || { + let _ = shutdown_pipeline.set_state(gst::State::Null); + }, + Some(frame_interval_us), + ) + .await +} + +#[cfg(feature = "gstreamer")] +#[derive(Debug)] +struct GStreamerTestSource { + pipeline: gst::Pipeline, + source: GStreamerAppSinkEncodedSource, + pipeline_description: String, +} + +#[cfg(feature = "gstreamer")] +impl GStreamerTestSource { + fn start( + width: u32, + height: u32, + fps: u32, + start_timestamp_us: i64, + frame_interval_us: i64, + requested_codec: Option, + pipeline_args: &[String], + ) -> Result { + gst::init().context("failed to initialize GStreamer")?; + + let generated_codec = requested_codec.unwrap_or(EncodedVideoCodec::H264); + let pipeline_description = + gstreamer_pipeline_description(width, height, fps, generated_codec, pipeline_args); + let 
element = gst::parse::launch(&pipeline_description).with_context(|| { + format!("failed to create GStreamer pipeline: {pipeline_description}") + })?; + let Ok(pipeline) = element.downcast::() else { + bail!("GStreamer description did not create a pipeline"); + }; + let requested_codec = + if pipeline_args.is_empty() { Some(generated_codec) } else { requested_codec }; + let (appsink, sample_format) = ensure_encoded_appsink(&pipeline, requested_codec)?; + let Ok(appsink) = appsink.downcast::() else { + bail!("GStreamer element {GSTREAMER_APPSINK_NAME} was not an appsink"); + }; + + let config = GStreamerAppSinkConfig::new( + sample_format, + start_timestamp_us, + frame_interval_us, + width, + height, + ); + pipeline + .set_state(gst::State::Playing) + .context("failed to start GStreamer test pipeline")?; + + Ok(Self { + pipeline, + source: GStreamerAppSinkEncodedSource::new(appsink, config), + pipeline_description, + }) + } + + fn pipeline_description(&self) -> &str { + &self.pipeline_description + } + + fn codec(&self) -> EncodedVideoCodec { + self.source.config().sample_format.codec() + } + + fn shutdown_pipeline(&self) -> gst::Pipeline { + self.pipeline.clone() + } +} + +#[cfg(feature = "gstreamer")] +impl EncodedAccessUnitSource for GStreamerTestSource { + type Error = ::Error; + + fn next_access_unit(&mut self) -> Result, Self::Error> { + self.source.next_access_unit() + } +} + +#[cfg(feature = "gstreamer")] +impl Drop for GStreamerTestSource { + fn drop(&mut self) { + let _ = self.pipeline.set_state(gst::State::Null); + } +} + +#[cfg(feature = "gstreamer")] +fn gstreamer_pipeline_description( + width: u32, + height: u32, + fps: u32, + codec: EncodedVideoCodec, + pipeline_args: &[String], +) -> String { + if pipeline_args.is_empty() { + return gstreamer_test_pipeline_description(width, height, fps, codec); + } + + pipeline_args.join(" ") +} + +#[cfg(feature = "gstreamer")] +fn gstreamer_test_pipeline_description( + width: u32, + height: u32, + fps: u32, + codec: 
EncodedVideoCodec, +) -> String { + let key_int_max = fps.max(1); + let (encoder, parser, caps) = match codec { + EncodedVideoCodec::H264 => ( + format!( + "x264enc tune=zerolatency speed-preset=ultrafast key-int-max={key_int_max} \ + bitrate=2500 byte-stream=true aud=true" + ), + "h264parse config-interval=-1", + "video/x-h264,stream-format=byte-stream,alignment=au", + ), + EncodedVideoCodec::H265 => ( + format!( + "x265enc tune=zerolatency speed-preset=ultrafast key-int-max={key_int_max} \ + bitrate=2500" + ), + "h265parse config-interval=-1", + "video/x-h265,stream-format=byte-stream,alignment=au", + ), + EncodedVideoCodec::VP8 | EncodedVideoCodec::VP9 | EncodedVideoCodec::AV1 => { + unreachable!("GStreamer generated test pipeline only supports H.264/H.265") + } + _ => unreachable!("unknown generated GStreamer codec"), + }; + + format!( + "videotestsrc is-live=true do-timestamp=true pattern=smpte ! \ + video/x-raw,width={width},height={height},framerate={fps}/1 ! \ + timeoverlay halignment=right valignment=bottom shaded-background=true ! \ + videoconvert ! \ + {encoder} ! \ + {parser} ! \ + {caps} ! 
\ + appsink name={GSTREAMER_APPSINK_NAME} sync=false max-buffers=8 drop=true" + ) +} + +#[cfg(feature = "gstreamer")] +fn ensure_encoded_appsink( + pipeline: &gst::Pipeline, + requested_codec: Option, +) -> Result<(gst::Element, GStreamerSampleFormat)> { + if let Some(appsink) = pipeline.by_name(GSTREAMER_APPSINK_NAME) { + let codec = requested_codec + .or_else(|| codec_from_element_sink_caps(&appsink)) + .unwrap_or(EncodedVideoCodec::H264); + let sample_format = h26x_sample_format(codec)?; + return Ok((appsink, sample_format)); + } + + let src_pad = pipeline.find_unlinked_pad(gst::PadDirection::Src).with_context(|| { + format!("GStreamer pipeline must include appsink name={GSTREAMER_APPSINK_NAME} or leave one H.264/H.265 source pad unlinked") + })?; + let inferred_codec = codec_from_pad_caps(&src_pad).with_context(|| { + format!( + "unlinked GStreamer pad '{}' does not advertise video/x-h264 or video/x-h265 caps", + src_pad.name() + ) + })?; + let codec = match requested_codec { + Some(requested_codec) if requested_codec != inferred_codec => bail!( + "GStreamer codec mismatch: --codec requested {:?}, but unlinked pad '{}' advertises {:?}", + requested_codec, + src_pad.name(), + inferred_codec + ), + Some(requested_codec) => requested_codec, + None => inferred_codec, + }; + let sample_format = h26x_sample_format(codec)?; + let Some(src_element) = src_pad.parent_element() else { + bail!("unlinked GStreamer encoded pad has no parent element"); + }; + + let parser = gst::ElementFactory::make(h26x_parser_name(codec)?) 
+ .property("config-interval", -1i32) + .build() + .with_context(|| format!("failed to create {}", h26x_parser_name(codec).unwrap()))?; + let codec_caps = h26x_appsink_caps(codec)?; + let capsfilter = gst::ElementFactory::make("capsfilter") + .property("caps", codec_caps) + .build() + .with_context(|| format!("failed to create {:?} capsfilter", codec))?; + let appsink = gst::ElementFactory::make("appsink") + .name(GSTREAMER_APPSINK_NAME) + .property("sync", false) + .property("max-buffers", 8u32) + .property("drop", true) + .build() + .context("failed to create appsink")?; + + pipeline + .add(&parser) + .with_context(|| format!("failed to add {} to GStreamer pipeline", parser.name()))?; + pipeline.add(&capsfilter).context("failed to add capsfilter to GStreamer pipeline")?; + pipeline.add(&appsink).context("failed to add appsink to GStreamer pipeline")?; + gst::Element::link_many([&parser, &capsfilter, &appsink]) + .with_context(|| format!("failed to link {} to appsink", parser.name()))?; + let sink_pad = parser + .static_pad("sink") + .with_context(|| format!("{} did not expose a sink pad", parser.name()))?; + src_pad + .link(&sink_pad) + .with_context(|| format!("failed to link '{}' to {}", src_element.name(), parser.name()))?; + + Ok((appsink, sample_format)) +} + +#[cfg(feature = "gstreamer")] +fn h26x_sample_format(codec: EncodedVideoCodec) -> Result { + match codec { + EncodedVideoCodec::H264 => Ok(GStreamerSampleFormat::H264AnnexB), + EncodedVideoCodec::H265 => Ok(GStreamerSampleFormat::H265AnnexB), + EncodedVideoCodec::VP8 | EncodedVideoCodec::VP9 | EncodedVideoCodec::AV1 => bail!( + "GStreamer passthrough currently supports H.264/H.265 Annex-B; {:?} needs an explicit access-unit source path", + codec + ), + _ => bail!("unsupported GStreamer codec: {:?}", codec), + } +} + +#[cfg(feature = "gstreamer")] +fn h26x_parser_name(codec: EncodedVideoCodec) -> Result<&'static str> { + match codec { + EncodedVideoCodec::H264 => Ok("h264parse"), + 
EncodedVideoCodec::H265 => Ok("h265parse"), + EncodedVideoCodec::VP8 | EncodedVideoCodec::VP9 | EncodedVideoCodec::AV1 => { + bail!("no H26x parser for {:?}", codec) + } + _ => bail!("unsupported GStreamer codec: {:?}", codec), + } +} + +#[cfg(feature = "gstreamer")] +fn h26x_caps_name(codec: EncodedVideoCodec) -> Result<&'static str> { + match codec { + EncodedVideoCodec::H264 => Ok("video/x-h264"), + EncodedVideoCodec::H265 => Ok("video/x-h265"), + EncodedVideoCodec::VP8 | EncodedVideoCodec::VP9 | EncodedVideoCodec::AV1 => { + bail!("GStreamer passthrough currently supports H.264/H.265 Annex-B") + } + _ => bail!("unsupported GStreamer codec: {:?}", codec), + } +} + +#[cfg(feature = "gstreamer")] +fn h26x_appsink_caps(codec: EncodedVideoCodec) -> Result { + Ok(gst::Caps::builder(h26x_caps_name(codec)?) + .field("stream-format", "byte-stream") + .field("alignment", "au") + .build()) +} + +#[cfg(feature = "gstreamer")] +fn codec_from_element_sink_caps(element: &gst::Element) -> Option { + let sink_pad = element.static_pad("sink")?; + codec_from_pad_caps(&sink_pad) } -async fn publish_encoded_source( +#[cfg(feature = "gstreamer")] +fn codec_from_pad_caps(pad: &gst::Pad) -> Option { + let caps = pad.current_caps().unwrap_or_else(|| pad.query_caps(None)); + caps.iter().find_map(|structure| codec_from_caps_name(structure.name())) +} + +#[cfg(feature = "gstreamer")] +fn codec_from_caps_name(name: &str) -> Option { + match name { + "video/x-h264" => Some(EncodedVideoCodec::H264), + "video/x-h265" => Some(EncodedVideoCodec::H265), + _ => None, + } +} + +async fn publish_encoded_source( args: Args, codec: EncodedVideoCodec, source_label: &'static str, source: S, - shutdown_stream: TcpStream, + shutdown_source: ShutdownSource, expected_frame_interval_us: Option, ) -> Result<()> where S: EncodedAccessUnitSource + Send + 'static, + ShutdownSource: FnOnce() + Send + 'static, { let diagnostics_enabled = args.diagnostics; let token = 
access_token::AccessToken::with_api_key(&args.api_key, &args.api_secret) @@ -229,7 +594,7 @@ where async move { let _ = tokio::signal::ctrl_c().await; stop.store(true, Ordering::Release); - let _ = shutdown_stream.shutdown(Shutdown::Both); + shutdown_source(); } }); @@ -685,3 +1050,205 @@ fn current_time_us() -> i64 { }; duration.as_micros().min(i64::MAX as u128) as i64 } + +#[cfg(all(test, feature = "gstreamer"))] +mod tests { + use super::*; + + #[test] + fn gstreamer_pipeline_description_routes_test_source_to_h264_appsink() { + let description = + gstreamer_test_pipeline_description(320, 180, 30, EncodedVideoCodec::H264); + + assert!(description.contains("videotestsrc is-live=true do-timestamp=true")); + assert!(description.contains("timeoverlay")); + assert!(description.contains("x264enc")); + assert!(description.contains("video/x-h264,stream-format=byte-stream,alignment=au")); + assert!(description.contains(&format!("appsink name={GSTREAMER_APPSINK_NAME}"))); + } + + #[test] + fn gstreamer_pipeline_description_routes_test_source_to_h265_appsink() { + let description = + gstreamer_test_pipeline_description(320, 180, 30, EncodedVideoCodec::H265); + + assert!(description.contains("videotestsrc is-live=true do-timestamp=true")); + assert!(description.contains("timeoverlay")); + assert!(description.contains("x265enc")); + assert!(description.contains("h265parse config-interval=-1")); + assert!(description.contains("video/x-h265,stream-format=byte-stream,alignment=au")); + assert!(description.contains(&format!("appsink name={GSTREAMER_APPSINK_NAME}"))); + } + + #[test] + fn gstreamer_pipeline_description_uses_trailing_pipeline_args() { + let pipeline = [ + "videotestsrc".to_string(), + "is-live=true".to_string(), + "!".to_string(), + "x264enc".to_string(), + ]; + + assert_eq!( + gstreamer_pipeline_description(320, 180, 30, EncodedVideoCodec::H265, &pipeline), + "videotestsrc is-live=true ! 
x264enc" + ); + } + + #[test] + fn gstreamer_test_source_pulls_h264_access_units_when_plugins_are_available() { + let frame_interval_us = frame_interval_us(30).unwrap(); + let mut source = match GStreamerTestSource::start( + 320, + 180, + 30, + 10_000, + frame_interval_us, + Some(EncodedVideoCodec::H264), + &[], + ) { + Ok(source) => source, + Err(err) => { + eprintln!("skipping GStreamer appsink smoke test: {err:#}"); + return; + } + }; + + assert_h264_access_units(&mut source); + } + + #[test] + fn gstreamer_test_source_pulls_h265_access_units_when_plugins_are_available() { + let frame_interval_us = frame_interval_us(30).unwrap(); + let mut source = match GStreamerTestSource::start( + 320, + 180, + 30, + 10_000, + frame_interval_us, + Some(EncodedVideoCodec::H265), + &[], + ) { + Ok(source) => source, + Err(err) => { + eprintln!("skipping GStreamer H.265 appsink smoke test: {err:#}"); + return; + } + }; + + assert_h265_access_units(&mut source); + } + + #[test] + fn gstreamer_test_source_attaches_appsink_to_trailing_h264_pipeline() { + let frame_interval_us = frame_interval_us(30).unwrap(); + let pipeline = [ + "videotestsrc".to_string(), + "is-live=true".to_string(), + "do-timestamp=true".to_string(), + "pattern=smpte".to_string(), + "!".to_string(), + "video/x-raw,width=320,height=180,framerate=30/1".to_string(), + "!".to_string(), + "videoconvert".to_string(), + "!".to_string(), + "x264enc".to_string(), + "tune=zerolatency".to_string(), + "speed-preset=ultrafast".to_string(), + "key-int-max=30".to_string(), + "byte-stream=true".to_string(), + "aud=true".to_string(), + ]; + let mut source = match GStreamerTestSource::start( + 320, + 180, + 30, + 10_000, + frame_interval_us, + None, + &pipeline, + ) { + Ok(source) => source, + Err(err) => { + eprintln!("skipping custom GStreamer pipeline smoke test: {err:#}"); + return; + } + }; + + assert_h264_access_units(&mut source); + } + + #[test] + fn gstreamer_test_source_attaches_appsink_to_trailing_h265_pipeline() { + 
let frame_interval_us = frame_interval_us(30).unwrap(); + let pipeline = [ + "videotestsrc".to_string(), + "is-live=true".to_string(), + "do-timestamp=true".to_string(), + "pattern=smpte".to_string(), + "!".to_string(), + "video/x-raw,width=320,height=180,framerate=30/1".to_string(), + "!".to_string(), + "videoconvert".to_string(), + "!".to_string(), + "x265enc".to_string(), + "tune=zerolatency".to_string(), + "speed-preset=ultrafast".to_string(), + "key-int-max=30".to_string(), + "bitrate=2500".to_string(), + ]; + let mut source = match GStreamerTestSource::start( + 320, + 180, + 30, + 10_000, + frame_interval_us, + None, + &pipeline, + ) { + Ok(source) => source, + Err(err) => { + eprintln!("skipping custom GStreamer H.265 pipeline smoke test: {err:#}"); + return; + } + }; + + assert_h265_access_units(&mut source); + } + + fn assert_h264_access_units(source: &mut GStreamerTestSource) { + let first = source + .next_access_unit() + .expect("GStreamer appsink source should read the first sample") + .expect("GStreamer appsink should produce a first access unit"); + let second = source + .next_access_unit() + .expect("GStreamer appsink source should read the second sample") + .expect("GStreamer appsink should produce a second access unit"); + + assert_eq!(first.codec, EncodedVideoCodec::H264); + assert_eq!(first.width, 320); + assert_eq!(first.height, 180); + assert!(!first.payload.is_empty()); + assert!(first.timestamp_us >= 10_000); + assert!(second.timestamp_us > first.timestamp_us); + } + + fn assert_h265_access_units(source: &mut GStreamerTestSource) { + let first = source + .next_access_unit() + .expect("GStreamer appsink source should read the first sample") + .expect("GStreamer appsink should produce a first access unit"); + let second = source + .next_access_unit() + .expect("GStreamer appsink source should read the second sample") + .expect("GStreamer appsink should produce a second access unit"); + + assert_eq!(first.codec, EncodedVideoCodec::H265); + 
assert_eq!(first.width, 320); + assert_eq!(first.height, 180); + assert!(!first.payload.is_empty()); + assert!(first.timestamp_us >= 10_000); + assert!(second.timestamp_us > first.timestamp_us); + } +} From 3e04e3b0f69b0d6656886b431316a92595080830 Mon Sep 17 00:00:00 2001 From: David Chen Date: Tue, 30 Jun 2026 00:00:47 -0700 Subject: [PATCH 16/24] libargus capture --- Cargo.lock | 1 - examples/local_video/Cargo.toml | 3 - examples/local_video/README.md | 13 +- examples/local_video/build.rs | 46 -- examples/local_video/src/argus.rs | 169 ------- examples/local_video/src/lk_argus.cpp | 604 ----------------------- examples/local_video/src/publisher.rs | 182 ++++--- livekit-capture/src/sources/argus.rs | 128 +++++ livekit-capture/src/sources/lk_argus.cpp | 121 +++++ 9 files changed, 387 insertions(+), 880 deletions(-) delete mode 100644 examples/local_video/build.rs delete mode 100644 examples/local_video/src/argus.rs delete mode 100644 examples/local_video/src/lk_argus.cpp diff --git a/Cargo.lock b/Cargo.lock index 47f253d92..1039be26a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4158,7 +4158,6 @@ version = "0.2.0" dependencies = [ "anyhow", "bytemuck", - "cc", "chrono", "clap", "eframe", diff --git a/examples/local_video/Cargo.toml b/examples/local_video/Cargo.toml index 9d5dfa82d..d4c36ade0 100644 --- a/examples/local_video/Cargo.toml +++ b/examples/local_video/Cargo.toml @@ -55,6 +55,3 @@ objc2-metal = "0.3.2" [target.'cfg(target_os = "linux")'.dependencies] livekit-capture = { workspace = true, features = ["libargus", "v4l"] } - -[build-dependencies] -cc = { workspace = true } diff --git a/examples/local_video/README.md b/examples/local_video/README.md index 0a2c331a6..10137e66b 100644 --- a/examples/local_video/README.md +++ b/examples/local_video/README.md @@ -73,6 +73,15 @@ Publisher usage: --room-name demo \ --identity jetson-cam-1 + # publish from Argus with a visible burned timestamp (uses CPU I420 copy) + cargo run -p local_video -F desktop --bin publisher 
-- \ + --source argus \ + --camera-index 0 \ + --attach-timestamp \ + --burn-timestamp \ + --room-name demo \ + --identity jetson-cam-1 + # publish AV1 through the Jetson hardware encoder (Orin only) cargo run -p local_video -F desktop --bin publisher -- \ --source argus \ @@ -138,7 +147,7 @@ Publisher flags (in addition to the common connection flags above): - `--simulcast`: Publish simulcast video (multiple layers when the resolution is large enough). - `--max-bitrate `: Max video bitrate for the main (highest) layer in bits per second (e.g. `1500000`). - `--attach-timestamp`: Attach the current wall-clock time (microseconds since UNIX epoch) as the user timestamp on each published frame. The subscriber can display this to measure end-to-end latency. -- `--burn-timestamp`: Burn the attached timestamp into the video frame as a visible overlay. Has no effect unless `--attach-timestamp` is also set. +- `--burn-timestamp`: Burn the attached timestamp into the video frame as a visible overlay. Has no effect unless `--attach-timestamp` is also set. With `--source argus`, this maps the NV12 DMA-BUF and copies it to CPU I420 before publishing. - `--attach-frame-id`: Attach a monotonically increasing frame ID to each published frame via the packet trailer. The subscriber displays this in the timestamp overlay when `--display-timestamp` is used. - `--display-video`: Open a window that displays the video frames being published. - `--display-timing`: Burn publisher timing metrics into the local preview window. Requires `--display-video`. @@ -185,6 +194,6 @@ Notes: - If the active video track is unsubscribed or unpublished, the app clears its state and will automatically attach to the next matching video track when it appears. - For E2EE to work, both publisher and subscriber must specify the same `--e2ee-key` value. If the keys don't match, the subscriber will not be able to decode the video. 
- The timestamp overlay updates at ~2 Hz so the latency value is readable rather than flickering every frame. -- On Jetson, `--source argus` requires the Jetson Multimedia API headers under `/usr/src/jetson_multimedia_api`. It publishes NV12 DMA buffers through the Jetson hardware encoder; local publisher preview and burned timestamps are not supported on that path. +- On Jetson, `--source argus` requires the Jetson Multimedia API headers under `/usr/src/jetson_multimedia_api`. By default it publishes NV12 DMA buffers through the Jetson hardware encoder. `--attach-timestamp --burn-timestamp` intentionally switches that source to a CPU I420 copy so the timestamp can be drawn into the frame. - Jetson AV1 hardware encoding requires an Orin-class device (e.g. Orin NX or AGX Orin on JetPack 5+); the encoder is probed at startup and on devices without AV1 support (e.g. Xavier) `--codec av1` automatically falls back to the software libaom encoder. The Jetson AV1 encoder produces a single L1T1 stream (no SVC). - On Linux, preview windows use the Vulkan `wgpu` backend by default to avoid GLES/EGL conflicts on Jetson desktops. Set `WGPU_BACKEND=gl` or another supported `wgpu` backend to override this. diff --git a/examples/local_video/build.rs b/examples/local_video/build.rs deleted file mode 100644 index 1edde6e87..000000000 --- a/examples/local_video/build.rs +++ /dev/null @@ -1,46 +0,0 @@ -use std::path::PathBuf; - -fn main() { - let target_os = std::env::var("CARGO_CFG_TARGET_OS").unwrap_or_default(); - let target_arch = std::env::var("CARGO_CFG_TARGET_ARCH").unwrap_or_default(); - - // Only compile the Argus shim on aarch64 Linux (Jetson). 
- if target_os != "linux" || target_arch != "aarch64" { - return; - } - - let argus_include = PathBuf::from("/usr/src/jetson_multimedia_api/argus/include"); - let mmapi_include = PathBuf::from("/usr/src/jetson_multimedia_api/include"); - - if !argus_include.exists() { - println!( - "cargo:warning=Argus headers not found at {}; skipping lk_argus build", - argus_include.display() - ); - return; - } - - println!("cargo:rerun-if-changed=src/lk_argus.cpp"); - - cc::Build::new() - .cpp(true) - .file("src/lk_argus.cpp") - .include(&argus_include) - .include(&mmapi_include) - .flag("-std=c++14") - .flag("-Wno-deprecated-declarations") - .compile("lk_argus"); - - // Link Argus client library (talks to nvargus-daemon) and NvBufSurface - println!("cargo:rustc-link-lib=dylib=nvargus_socketclient"); - println!("cargo:rustc-link-lib=dylib=nvbufsurface"); - - // Tegra library path - let tegra_lib_dir = PathBuf::from("/usr/lib/aarch64-linux-gnu/tegra"); - if tegra_lib_dir.exists() { - println!("cargo:rustc-link-search=native={}", tegra_lib_dir.display()); - } - - // Standard aarch64 library path - println!("cargo:rustc-link-search=native=/usr/lib/aarch64-linux-gnu"); -} diff --git a/examples/local_video/src/argus.rs b/examples/local_video/src/argus.rs deleted file mode 100644 index 0fadbc9d3..000000000 --- a/examples/local_video/src/argus.rs +++ /dev/null @@ -1,169 +0,0 @@ -//! Thin FFI wrapper around NVIDIA Argus/libargus for MIPI CSI camera capture. -//! -//! This module provides DMA-buffer frame acquisition from MIPI cameras on Jetson -//! platforms. Frames are blitted from Argus' EGLStream frame into NvBufSurface -//! DMA file descriptors that can be passed to the hardware encoder without -//! CPU-side pixel copies. -//! -//! The Argus API is C++, so we use a small C shim (linked via build.rs on -//! Jetson) to expose the capture session lifecycle. - -use std::ffi::c_int; -use std::io; - -/// Opaque handle to an Argus capture session. 
-pub struct ArgusCaptureSession { - handle: *mut std::ffi::c_void, - width: u32, - height: u32, -} - -/// A captured Argus frame backed by a DMA buffer. -pub struct ArgusFrame { - /// DMA buffer file descriptor containing an NV12 frame. - pub dmabuf_fd: i32, - /// Argus sensor start timestamp in nanoseconds, when available. - pub sensor_timestamp_ns: Option, - /// Time spent waiting for `FrameConsumer::acquireFrame` to return. - pub acquire_wait_ns: u64, - /// Time spent copying the acquired EGLStream frame into the DMA buffer. - pub blit_ns: u64, -} - -// The C++ session is single-threaded but we move it across the tokio runtime. -unsafe impl Send for ArgusCaptureSession {} - -extern "C" { - fn lk_argus_create_session( - sensor_index: c_int, - width: c_int, - height: c_int, - fps: c_int, - ) -> *mut std::ffi::c_void; - - fn lk_argus_destroy_session(session: *mut std::ffi::c_void); - - /// Acquire the next frame and optionally return the Argus sensor timestamp. - /// Returns the NvBufSurface DMA fd, or -1 on error. - /// The fd is valid until the next acquire call or `lk_argus_release_frame`. - fn lk_argus_acquire_frame_with_metadata( - session: *mut std::ffi::c_void, - sensor_timestamp_ns: *mut u64, - acquire_wait_ns: *mut u64, - blit_ns: *mut u64, - ) -> c_int; - - /// Release the most recently acquired frame back to the Argus buffer pool. - fn lk_argus_release_frame(session: *mut std::ffi::c_void); -} - -impl ArgusCaptureSession { - /// Open an Argus capture session on the given MIPI CSI sensor. - /// - /// `sensor_index` selects the camera (0 for the first CSI camera). - /// The session negotiates the given resolution and framerate with the ISP. 
- pub fn new(sensor_index: u32, width: u32, height: u32, fps: u32) -> io::Result { - let handle = unsafe { - lk_argus_create_session( - sensor_index as c_int, - width as c_int, - height as c_int, - fps as c_int, - ) - }; - if handle.is_null() { - return Err(io::Error::new( - io::ErrorKind::Other, - "Failed to create Argus capture session", - )); - } - Ok(Self { handle, width, height }) - } - - /// Acquire the next captured frame as a DMA buffer. - /// - /// The returned fd refers to an NvBufSurface in NV12 format. It remains - /// valid until [`release_frame`](Self::release_frame) is called or the - /// next `acquire_frame` implicitly releases the previous one. - pub fn acquire_frame(&mut self) -> io::Result { - let mut sensor_timestamp_ns = 0; - let mut acquire_wait_ns = 0; - let mut blit_ns = 0; - let fd = unsafe { - lk_argus_acquire_frame_with_metadata( - self.handle, - &mut sensor_timestamp_ns, - &mut acquire_wait_ns, - &mut blit_ns, - ) - }; - if fd < 0 { - return Err(io::Error::new(io::ErrorKind::Other, "Argus frame acquisition failed")); - } - Ok(ArgusFrame { - dmabuf_fd: fd, - sensor_timestamp_ns: (sensor_timestamp_ns > 0).then_some(sensor_timestamp_ns), - acquire_wait_ns, - blit_ns, - }) - } - - /// Release the most recently acquired frame back to the buffer pool. - pub fn release_frame(&mut self) { - unsafe { lk_argus_release_frame(self.handle) }; - } - - pub fn width(&self) -> u32 { - self.width - } - - pub fn height(&self) -> u32 { - self.height - } -} - -impl Drop for ArgusCaptureSession { - fn drop(&mut self) { - if !self.handle.is_null() { - unsafe { lk_argus_destroy_session(self.handle) }; - self.handle = std::ptr::null_mut(); - } - } -} - -/// Convert an Argus `CLOCK_MONOTONIC` sensor timestamp into a UNIX-epoch microsecond value -/// by computing the offset between the current monotonic clock and the supplied wall time. 
-pub fn sensor_monotonic_ns_to_unix_us(sensor_timestamp_ns: u64, wall_time_us: u64) -> Option { - let monotonic_now_ns = monotonic_time_ns_now()?; - let monotonic_delta_us = monotonic_now_ns.abs_diff(sensor_timestamp_ns) / 1_000; - if sensor_timestamp_ns <= monotonic_now_ns { - Some(wall_time_us.saturating_sub(monotonic_delta_us)) - } else { - Some(wall_time_us.saturating_add(monotonic_delta_us)) - } -} - -/// Current `CLOCK_MONOTONIC` value in nanoseconds, used to translate Argus sensor -/// timestamps into wall time. -fn monotonic_time_ns_now() -> Option { - #[repr(C)] - struct Timespec { - tv_sec: i64, - tv_nsec: i64, - } - - extern "C" { - fn clock_gettime(clk_id: i32, tp: *mut Timespec) -> i32; - } - - const CLOCK_MONOTONIC: i32 = 1; - let mut ts = Timespec { tv_sec: 0, tv_nsec: 0 }; - let ret = unsafe { - // SAFETY: `ts` is a valid, writable `Timespec` for the duration of the call. - clock_gettime(CLOCK_MONOTONIC, &mut ts) - }; - if ret != 0 || ts.tv_sec < 0 || ts.tv_nsec < 0 { - return None; - } - Some(ts.tv_sec as u64 * 1_000_000_000 + ts.tv_nsec as u64) -} diff --git a/examples/local_video/src/lk_argus.cpp b/examples/local_video/src/lk_argus.cpp deleted file mode 100644 index 8deb5e1fc..000000000 --- a/examples/local_video/src/lk_argus.cpp +++ /dev/null @@ -1,604 +0,0 @@ -// C shim around NVIDIA libargus for MIPI CSI camera capture on Jetson. -// -// Exposes a simple C API for the Rust FFI in argus.rs: -// lk_argus_create_session – open sensor, configure ISP, start repeating capture -// lk_argus_acquire_frame – dequeue next frame, return NvBufSurface DMA fd -// lk_argus_release_frame – release frame back to Argus buffer pool -// lk_argus_destroy_session – tear down everything - -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include "NvBufSurface.h" - -// Ring buffer size for persistent NvBufSurface DMA allocations. 
-// The encoder may hold 1-2 buffers while encoding, and the blit writes to -// another. 4 buffers gives comfortable headroom to avoid the "Wrong buffer -// index" errors that occur when the capture loop laps the encoder. -static constexpr int kNumDmaBufs = 4; - -struct LkArgusSession { - Argus::UniqueObj provider; - Argus::UniqueObj session; - Argus::UniqueObj stream_settings; - Argus::UniqueObj stream; - Argus::UniqueObj request; - Argus::UniqueObj event_queue; - Argus::UniqueObj consumer; - - // Most recently acquired frame (kept alive until release/next acquire). - Argus::UniqueObj current_frame; - - // Ring of DMA fds so the encoder can hold one buffer while we blit the - // next frame into a different one. Avoids the "Wrong buffer index" - // errors caused by the encoder and Argus racing on a single buffer. - int dmabuf_fds[kNumDmaBufs]; - NvBufSurface* dmabuf_surfaces[kNumDmaBufs]; // original surface ptrs for cleanup - int dmabuf_write_idx; // next buffer to blit into - int width; - int height; - bool metadata_enabled; - bool event_metadata_enabled; -}; - -static const uint64_t kAcquireTimeoutNs = 1000000000ULL; // 1 second - -enum class SensorTimestampStatus { - Available, - InvalidArgs, - NoEventQueue, - EventWaitFailed, - NoCaptureCompleteEvent, - CaptureCompleteFailed, - NoEventMetadata, - NoOutputStream, - MetadataCreateFailed, - NoCaptureMetadata, - ZeroTimestamp, -}; - -static const char* sensor_timestamp_status_name(SensorTimestampStatus status) { - switch (status) { - case SensorTimestampStatus::Available: - return "available"; - case SensorTimestampStatus::InvalidArgs: - return "invalid args"; - case SensorTimestampStatus::NoEventQueue: - return "no capture-complete event queue"; - case SensorTimestampStatus::EventWaitFailed: - return "capture-complete event wait failed"; - case SensorTimestampStatus::NoCaptureCompleteEvent: - return "no capture-complete event"; - case SensorTimestampStatus::CaptureCompleteFailed: - return "capture-complete event 
failed"; - case SensorTimestampStatus::NoEventMetadata: - return "no capture-complete metadata"; - case SensorTimestampStatus::NoOutputStream: - return "no EGL output stream"; - case SensorTimestampStatus::MetadataCreateFailed: - return "metadata container create failed"; - case SensorTimestampStatus::NoCaptureMetadata: - return "no capture metadata interface"; - case SensorTimestampStatus::ZeroTimestamp: - return "zero sensor timestamp"; - } - return "unknown"; -} - -static SensorTimestampStatus read_sensor_timestamp_ns_from_event( - LkArgusSession* s, - uint64_t* sensor_timestamp_ns, - Argus::Status* metadata_status) { - if (metadata_status) *metadata_status = Argus::STATUS_OK; - if (!s || !sensor_timestamp_ns) return SensorTimestampStatus::InvalidArgs; - *sensor_timestamp_ns = 0; - - auto* i_event_provider = Argus::interface_cast(s->session); - auto* i_event_queue = Argus::interface_cast(s->event_queue); - if (!i_event_provider || !i_event_queue) { - return SensorTimestampStatus::NoEventQueue; - } - - Argus::Status status = i_event_provider->waitForEvents(s->event_queue.get(), 1000000); - if (metadata_status) *metadata_status = status; - if (status != Argus::STATUS_OK) { - return SensorTimestampStatus::EventWaitFailed; - } - - const Argus::Event* newest_capture_complete = nullptr; - for (uint32_t i = 0; i < i_event_queue->getSize(); i++) { - const Argus::Event* event = i_event_queue->getEvent(i); - auto* i_event = Argus::interface_cast(event); - if (i_event && i_event->getEventType() == Argus::EVENT_TYPE_CAPTURE_COMPLETE) { - newest_capture_complete = event; - } - } - if (!newest_capture_complete) { - return SensorTimestampStatus::NoCaptureCompleteEvent; - } - - auto* i_capture_complete = - Argus::interface_cast(newest_capture_complete); - if (!i_capture_complete) { - return SensorTimestampStatus::NoCaptureCompleteEvent; - } - status = i_capture_complete->getStatus(); - if (metadata_status) *metadata_status = status; - if (status != Argus::STATUS_OK) { - return 
SensorTimestampStatus::CaptureCompleteFailed; - } - - const Argus::CaptureMetadata* metadata = i_capture_complete->getMetadata(); - if (!metadata) { - return SensorTimestampStatus::NoEventMetadata; - } - - auto* i_metadata = Argus::interface_cast(metadata); - if (!i_metadata) { - return SensorTimestampStatus::NoCaptureMetadata; - } - - *sensor_timestamp_ns = i_metadata->getSensorTimestamp(); - if (*sensor_timestamp_ns == 0) { - return SensorTimestampStatus::ZeroTimestamp; - } - return SensorTimestampStatus::Available; -} - -static SensorTimestampStatus read_sensor_timestamp_ns_from_egl_metadata( - LkArgusSession* s, - uint64_t* sensor_timestamp_ns, - Argus::Status* metadata_status) { - if (metadata_status) *metadata_status = Argus::STATUS_OK; - if (!s || !sensor_timestamp_ns) return SensorTimestampStatus::InvalidArgs; - *sensor_timestamp_ns = 0; - - auto* i_stream = Argus::interface_cast(s->stream); - if (!i_stream) return SensorTimestampStatus::NoOutputStream; - - Argus::Status status; - EGLStream::MetadataContainer* metadata = EGLStream::MetadataContainer::create( - i_stream->getEGLDisplay(), - i_stream->getEGLStream(), - EGLStream::MetadataContainer::CONSUMER, - &status); - if (metadata_status) *metadata_status = status; - if (status != Argus::STATUS_OK || !metadata) { - return SensorTimestampStatus::MetadataCreateFailed; - } - - auto* i_metadata = Argus::interface_cast(metadata); - if (!i_metadata) { - metadata->destroy(); - return SensorTimestampStatus::NoCaptureMetadata; - } - - *sensor_timestamp_ns = i_metadata->getSensorTimestamp(); - metadata->destroy(); - if (*sensor_timestamp_ns == 0) { - return SensorTimestampStatus::ZeroTimestamp; - } - return SensorTimestampStatus::Available; -} - -static SensorTimestampStatus read_sensor_timestamp_ns( - LkArgusSession* s, - uint64_t* sensor_timestamp_ns, - Argus::Status* metadata_status) { - SensorTimestampStatus status = - read_sensor_timestamp_ns_from_egl_metadata(s, sensor_timestamp_ns, metadata_status); - if 
(status == SensorTimestampStatus::Available) { - return status; - } - - // Fall back to capture-complete events only when embedded EGLStream metadata - // is unavailable. Event queues are session-scoped, so they can lag or lead - // the exact frame returned by FrameConsumer::acquireFrame(). - SensorTimestampStatus egl_status = status; - Argus::Status egl_metadata_status = - metadata_status ? *metadata_status : Argus::STATUS_OK; - - SensorTimestampStatus event_status = - read_sensor_timestamp_ns_from_event(s, sensor_timestamp_ns, metadata_status); - if (event_status == SensorTimestampStatus::Available) { - return event_status; - } - - if (metadata_status) *metadata_status = egl_metadata_status; - return egl_status; -} - -extern "C" { - -void* lk_argus_create_session(int sensor_index, int width, int height, int fps) { - auto* s = new LkArgusSession(); - for (int i = 0; i < kNumDmaBufs; i++) { - s->dmabuf_fds[i] = -1; - s->dmabuf_surfaces[i] = nullptr; - } - s->dmabuf_write_idx = 0; - s->width = width; - s->height = height; - s->metadata_enabled = false; - s->event_metadata_enabled = false; - - // Create CameraProvider - s->provider = Argus::UniqueObj( - Argus::CameraProvider::create()); - auto* i_provider = Argus::interface_cast(s->provider); - if (!i_provider) { - fprintf(stderr, "[lk_argus] Failed to create CameraProvider\n"); - delete s; - return nullptr; - } - - // Enumerate camera devices - std::vector devices; - i_provider->getCameraDevices(&devices); - if (devices.empty() || sensor_index >= static_cast(devices.size())) { - fprintf(stderr, "[lk_argus] No camera device at index %d (found %zu)\n", - sensor_index, devices.size()); - delete s; - return nullptr; - } - - // Create CaptureSession - Argus::Status status; - s->session = Argus::UniqueObj( - i_provider->createCaptureSession(devices[sensor_index], &status)); - if (status != Argus::STATUS_OK) { - fprintf(stderr, "[lk_argus] Failed to create CaptureSession: %d\n", - static_cast(status)); - delete s; - return 
nullptr; - } - auto* i_session = Argus::interface_cast(s->session); - auto* i_event_provider = Argus::interface_cast(s->session); - if (i_event_provider) { - std::vector event_types; - event_types.push_back(Argus::EVENT_TYPE_CAPTURE_COMPLETE); - s->event_queue = Argus::UniqueObj( - i_event_provider->createEventQueue(event_types, &status)); - if (status != Argus::STATUS_OK || !s->event_queue) { - fprintf(stderr, - "[lk_argus] WARNING: failed to create capture-complete event queue: %d\n", - static_cast(status)); - } else { - s->event_metadata_enabled = true; - fprintf(stderr, "[lk_argus] Capture-complete metadata events enabled: yes\n"); - } - } else { - fprintf(stderr, "[lk_argus] WARNING: capture session has no event provider interface\n"); - } - - // Create OutputStream (EGLStream-backed) - s->stream_settings = Argus::UniqueObj( - i_session->createOutputStreamSettings(Argus::STREAM_TYPE_EGL, &status)); - auto* i_stream_settings = - Argus::interface_cast(s->stream_settings); - if (!i_stream_settings) { - fprintf(stderr, "[lk_argus] Failed to get IEGLOutputStreamSettings\n"); - delete s; - return nullptr; - } - i_stream_settings->setPixelFormat(Argus::PIXEL_FMT_YCbCr_420_888); - i_stream_settings->setResolution(Argus::Size2D(width, height)); - status = i_stream_settings->setMode(Argus::EGL_STREAM_MODE_MAILBOX); - if (status != Argus::STATUS_OK) { - fprintf(stderr, "[lk_argus] WARNING: failed to set EGLStream mailbox mode: %d\n", - static_cast(status)); - } - status = i_stream_settings->setFifoLength(1); - if (status != Argus::STATUS_OK) { - fprintf(stderr, "[lk_argus] WARNING: failed to set EGLStream FIFO length: %d\n", - static_cast(status)); - } - fprintf(stderr, "[lk_argus] EGLStream mode: mailbox, fifo length: %u\n", - i_stream_settings->getFifoLength()); - status = i_stream_settings->setMetadataEnable(true); - if (status != Argus::STATUS_OK) { - fprintf(stderr, "[lk_argus] WARNING: failed to enable EGLStream metadata: %d\n", - static_cast(status)); - } - 
s->metadata_enabled = i_stream_settings->getMetadataEnable(); - fprintf(stderr, "[lk_argus] EGLStream metadata enabled: %s\n", - s->metadata_enabled ? "yes" : "no"); - - s->stream = Argus::UniqueObj( - i_session->createOutputStream(s->stream_settings.get(), &status)); - if (status != Argus::STATUS_OK) { - fprintf(stderr, "[lk_argus] Failed to create OutputStream: %d\n", - static_cast(status)); - delete s; - return nullptr; - } - - // Create FrameConsumer - s->consumer = Argus::UniqueObj( - EGLStream::FrameConsumer::create(s->stream.get())); - auto* i_consumer = - Argus::interface_cast(s->consumer); - if (!i_consumer) { - fprintf(stderr, "[lk_argus] Failed to create FrameConsumer\n"); - delete s; - return nullptr; - } - - // Create capture Request - s->request = Argus::UniqueObj( - i_session->createRequest(Argus::CAPTURE_INTENT_VIDEO_RECORD, &status)); - if (status != Argus::STATUS_OK) { - fprintf(stderr, "[lk_argus] Failed to create Request: %d\n", - static_cast(status)); - delete s; - return nullptr; - } - auto* i_request = - Argus::interface_cast(s->request); - i_request->enableOutputStream(s->stream.get()); - - // --- Sensor mode selection --- - // Argus auto-selects a sensor mode, but often picks the highest-resolution - // mode and downscales, running at that mode's (lower) framerate. We - // explicitly pick the smallest mode that covers the requested resolution - // and supports the requested framerate. 
- auto* i_props = Argus::interface_cast( - devices[sensor_index]); - if (i_props) { - std::vector modes; - i_props->getAllSensorModes(&modes); - fprintf(stderr, "[lk_argus] %zu sensor modes available:\n", modes.size()); - - Argus::SensorMode* best_mode = nullptr; - uint64_t best_pixels = UINT64_MAX; - uint64_t requested_dur_ns = 1000000000ULL / fps; - - for (size_t i = 0; i < modes.size(); i++) { - auto* i_mode = Argus::interface_cast(modes[i]); - if (!i_mode) continue; - auto res = i_mode->getResolution(); - auto dur = i_mode->getFrameDurationRange(); - double min_fps_mode = 1e9 / static_cast(dur.max()); - double max_fps_mode = 1e9 / static_cast(dur.min()); - fprintf(stderr, " [%zu] %ux%u fps %.1f-%.1f duration %lu-%lu ns\n", - i, res.width(), res.height(), - min_fps_mode, max_fps_mode, - dur.min(), dur.max()); - - // Compare frame durations instead of floating-point fps. - // Sensor durations are in nanoseconds and often off by 1 ns - // from the ideal value (e.g., 33333334 vs 33333333 for 30fps). - // A 1ms tolerance handles this rounding. 
- if (static_cast(res.width()) >= width && - static_cast(res.height()) >= height && - dur.min() <= requested_dur_ns + 1000000) { - uint64_t pixels = static_cast(res.width()) * res.height(); - if (pixels < best_pixels) { - best_pixels = pixels; - best_mode = modes[i]; - } - } - } - - auto* i_source = Argus::interface_cast( - i_request->getSourceSettings()); - - if (best_mode) { - auto* i_best = Argus::interface_cast(best_mode); - auto res = i_best->getResolution(); - auto dur = i_best->getFrameDurationRange(); - fprintf(stderr, "[lk_argus] Selected sensor mode: %ux%u fps %.1f-%.1f\n", - res.width(), res.height(), - 1e9 / static_cast(dur.max()), - 1e9 / static_cast(dur.min())); - if (i_source) { - i_source->setSensorMode(best_mode); - } - } else { - fprintf(stderr, "[lk_argus] WARNING: no sensor mode found for %dx%d @ %d fps, " - "using Argus default (may be slower)\n", width, height, fps); - } - - if (i_source) { - uint64_t frame_dur_ns = 1000000000ULL / fps; - i_source->setFrameDurationRange( - Argus::Range(frame_dur_ns, frame_dur_ns)); - i_source->setExposureTimeRange( - Argus::Range(0, frame_dur_ns)); - fprintf(stderr, "[lk_argus] Frame duration: %lu ns, max exposure: %lu ns\n", - frame_dur_ns, frame_dur_ns); - } - } else { - fprintf(stderr, "[lk_argus] WARNING: could not query sensor modes\n"); - auto* i_source = Argus::interface_cast( - i_request->getSourceSettings()); - if (i_source) { - i_source->setFrameDurationRange( - Argus::Range(1000000000ULL / fps, 1000000000ULL / fps)); - } - } - - // Allocate a ring of persistent NvBufSurface buffers so the encoder can - // hold one while we blit the next frame into a different one. 
- for (int i = 0; i < kNumDmaBufs; i++) { - NvBufSurfaceCreateParams create_params = {}; - create_params.gpuId = 0; - create_params.width = static_cast(width); - create_params.height = static_cast(height); - create_params.size = 0; - create_params.colorFormat = NVBUF_COLOR_FORMAT_NV12; - create_params.layout = NVBUF_LAYOUT_PITCH; - create_params.memType = NVBUF_MEM_SURFACE_ARRAY; - - NvBufSurface* surface = nullptr; - if (NvBufSurfaceCreate(&surface, 1, &create_params) != 0 || !surface) { - fprintf(stderr, "[lk_argus] Failed to create NvBufSurface[%d]\n", i); - delete s; - return nullptr; - } - s->dmabuf_fds[i] = surface->surfaceList[0].bufferDesc; - s->dmabuf_surfaces[i] = surface; - } - - // Start repeating capture - status = i_session->repeat(s->request.get()); - if (status != Argus::STATUS_OK) { - fprintf(stderr, "[lk_argus] Failed to start repeating capture: %d\n", - static_cast(status)); - delete s; - return nullptr; - } - - fprintf(stderr, "[lk_argus] Session created: %dx%d @ %d fps, sensor %d, %d DMA buffers (fds:", - width, height, fps, sensor_index, kNumDmaBufs); - for (int i = 0; i < kNumDmaBufs; i++) fprintf(stderr, " %d", s->dmabuf_fds[i]); - fprintf(stderr, ")\n"); - return s; -} - -int lk_argus_acquire_frame_with_metadata( - void* handle, - uint64_t* sensor_timestamp_ns, - uint64_t* acquire_wait_ns, - uint64_t* blit_ns) { - using Clock = std::chrono::steady_clock; - - auto* s = static_cast(handle); - if (!s) return -1; - if (sensor_timestamp_ns) *sensor_timestamp_ns = 0; - if (acquire_wait_ns) *acquire_wait_ns = 0; - if (blit_ns) *blit_ns = 0; - - auto* i_consumer = - Argus::interface_cast(s->consumer); - if (!i_consumer) return -1; - - // Release any previously held frame - s->current_frame.reset(); - - auto t0 = Clock::now(); - - Argus::Status status; - s->current_frame = Argus::UniqueObj( - i_consumer->acquireFrame(kAcquireTimeoutNs, &status)); - if (status != Argus::STATUS_OK || !s->current_frame) { - return -1; - } - - auto t1 = Clock::now(); - 
- auto* i_frame = - Argus::interface_cast(s->current_frame); - if (!i_frame) return -1; - - Argus::Status metadata_status = Argus::STATUS_OK; - SensorTimestampStatus sensor_timestamp_status = - read_sensor_timestamp_ns(s, sensor_timestamp_ns, &metadata_status); - bool has_sensor_timestamp = - sensor_timestamp_status == SensorTimestampStatus::Available; - static SensorTimestampStatus last_logged_sensor_timestamp_status = - SensorTimestampStatus::Available; - if (!has_sensor_timestamp && - sensor_timestamp_status != last_logged_sensor_timestamp_status) { - fprintf(stderr, - "[lk_argus] Sensor timestamp unavailable: %s " - "(event metadata enabled=%s, EGL metadata enabled=%s, status=%d)\n", - sensor_timestamp_status_name(sensor_timestamp_status), - s->event_metadata_enabled ? "yes" : "no", - s->metadata_enabled ? "yes" : "no", - static_cast(metadata_status)); - last_logged_sensor_timestamp_status = sensor_timestamp_status; - } else if (has_sensor_timestamp && - last_logged_sensor_timestamp_status != SensorTimestampStatus::Available) { - fprintf(stderr, "[lk_argus] Sensor timestamp available\n"); - last_logged_sensor_timestamp_status = SensorTimestampStatus::Available; - } - - auto* image = i_frame->getImage(); - if (!image) return -1; - - // Get the NativeBuffer interface to extract the DMA fd - auto* i_native = - Argus::interface_cast(image); - if (!i_native) { - fprintf(stderr, "[lk_argus] Image does not support IImageNativeBuffer\n"); - return -1; - } - - // Pick the next buffer in the ring so we don't overwrite a buffer the - // encoder may still be reading from. - int idx = s->dmabuf_write_idx; - s->dmabuf_write_idx = (s->dmabuf_write_idx + 1) % kNumDmaBufs; - int fd = s->dmabuf_fds[idx]; - - // Copy (blit) the acquired frame into the selected NvBufSurface. 
- status = i_native->copyToNvBuffer(fd); - - auto t2 = Clock::now(); - auto acquire_duration_ns = - std::chrono::duration_cast(t1 - t0).count(); - auto blit_duration_ns = - std::chrono::duration_cast(t2 - t1).count(); - if (acquire_wait_ns) *acquire_wait_ns = static_cast(acquire_duration_ns); - if (blit_ns) *blit_ns = static_cast(blit_duration_ns); - - // Release the Argus frame immediately – the pixel data has been blitted - // into our persistent NvBufSurface so we no longer need the EGLStream frame. - s->current_frame.reset(); - - if (status != Argus::STATUS_OK) { - fprintf(stderr, "[lk_argus] copyToNvBuffer failed: %d\n", - static_cast(status)); - return -1; - } - - return fd; -} - -int lk_argus_acquire_frame(void* handle) { - return lk_argus_acquire_frame_with_metadata(handle, nullptr, nullptr, nullptr); -} - -void lk_argus_release_frame(void* handle) { - auto* s = static_cast(handle); - if (!s) return; - s->current_frame.reset(); -} - -void lk_argus_destroy_session(void* handle) { - auto* s = static_cast(handle); - if (!s) return; - - // Stop repeating capture - auto* i_session = Argus::interface_cast(s->session); - if (i_session) { - i_session->stopRepeat(); - i_session->waitForIdle(); - } - - s->current_frame.reset(); - - // Free all persistent NvBufSurface buffers using the original pointers. 
- for (int i = 0; i < kNumDmaBufs; i++) { - if (s->dmabuf_surfaces[i]) { - NvBufSurfaceDestroy(s->dmabuf_surfaces[i]); - s->dmabuf_surfaces[i] = nullptr; - } - s->dmabuf_fds[i] = -1; - } - - delete s; - fprintf(stderr, "[lk_argus] Session destroyed\n"); -} - -} // extern "C" diff --git a/examples/local_video/src/publisher.rs b/examples/local_video/src/publisher.rs index c6ec95a4a..e5ee53ea2 100644 --- a/examples/local_video/src/publisher.rs +++ b/examples/local_video/src/publisher.rs @@ -19,6 +19,8 @@ use livekit_capture::device::{ CaptureDeviceSelector, CaptureFormat as LkCaptureFormat, CaptureFormatRequest, CaptureFrameFormat, CapturePath as LkCapturePath, CaptureResolution, }; +#[cfg(all(target_os = "linux", target_arch = "aarch64"))] +use livekit_capture::sources::argus::{self, ArgusCaptureOptions, ArgusCaptureSession}; #[cfg(target_os = "macos")] use livekit_capture::sources::avfoundation::{ self, AvFoundationCaptureOptions, AvFoundationCaptureSession, @@ -35,8 +37,6 @@ use std::sync::{ }; use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH}; -#[cfg(all(target_os = "linux", target_arch = "aarch64"))] -mod argus; mod codec_display; mod test_pattern; mod timestamp_burn; @@ -884,7 +884,7 @@ enum VideoInput { TestPattern(TestPattern), Camera(PlatformCamera), #[cfg(all(target_os = "linux", target_arch = "aarch64"))] - Argus(argus::ArgusCaptureSession), + Argus(ArgusCaptureSession), } enum PlatformCamera { @@ -937,7 +937,13 @@ fn publisher_capture_path_label(video_input: &VideoInput, burn_timestamp: bool) } }, #[cfg(all(target_os = "linux", target_arch = "aarch64"))] - VideoInput::Argus(_) => "libargus NV12 DMA-BUF".to_string(), + VideoInput::Argus(_) => { + if burn_timestamp { + "libargus CPU I420 from NV12 DMA-BUF (timestamp burn)".to_string() + } else { + "libargus NV12 DMA-BUF".to_string() + } + } } } @@ -1252,17 +1258,11 @@ async fn run(args: Args, ctrl_c_received: Arc) -> Result<()> { if args.display_video { anyhow::bail!("--display-video is not 
supported with --source argus"); } - if args.burn_timestamp { - log::warn!( - "--burn-timestamp is ignored with --source argus (DMA buffers are not CPU-mapped on the publish path)" - ); - } - let session = argus::ArgusCaptureSession::new( + let session = ArgusCaptureSession::new(ArgusCaptureOptions::new( args.camera_index as u32, - args.width, - args.height, + CaptureResolution::new(args.width, args.height), args.fps, - )?; + ))?; info!( "Argus MIPI capture session opened: {}x{} @ {} fps (camera {})", session.width(), @@ -1421,14 +1421,15 @@ async fn run(args: Args, ctrl_c_received: Arc) -> Result<()> { info!("Published camera track"); requested_codec }; + let burn_timestamp_enabled = args.attach_timestamp && args.burn_timestamp; info!( "Publisher media path: capture={}, encode=requested codec {} via {}", - publisher_capture_path_label(&video_input, args.burn_timestamp), + publisher_capture_path_label(&video_input, burn_timestamp_enabled), actual_codec.as_str(), video_encoder_backend_name(requested_encoder), ); let native_capture_fallback = - publisher_uses_native_camera_capture(&video_input, args.burn_timestamp) + publisher_uses_native_camera_capture(&video_input, burn_timestamp_enabled) .then(|| Arc::new(AtomicBool::new(false))); let capture_config = CaptureConfig { @@ -1568,6 +1569,7 @@ async fn run_capture_loop( let mut fps_window_start = Instant::now(); let mut fps_smoothed: f32 = 0.0; let target = Duration::from_secs_f64(1.0 / pace_fps); + let burn_timestamp_enabled = config.attach_timestamp && config.burn_timestamp; info!("Target frame interval: {:.2} ms", target.as_secs_f64() * 1000.0); if camera_driven_pacing { info!("Capture pacing: camera frame-arrival driven"); @@ -1579,8 +1581,8 @@ async fn run_capture_loop( let mut timings = PublisherTimingSummary::default(); let mut frame_counter: u32 = 1; let mut test_pattern_frame_index: u64 = 0; - let mut timestamp_overlay = (config.attach_timestamp && config.burn_timestamp) - .then(|| TimestampOverlay::new(width, 
height)); + let mut timestamp_overlay = + burn_timestamp_enabled.then(|| TimestampOverlay::new(width, height)); let align_buffers_for_display = display_shared.is_some(); let mut logged_camera_timestamp_source = false; let mut logged_camera_timestamp_fallback = false; @@ -1656,7 +1658,7 @@ async fn run_capture_loop( ); logged_native_capture_fallback = true; } - let prefer_native = !config.burn_timestamp && !force_i420_after_native_failure; + let prefer_native = !burn_timestamp_enabled && !force_i420_after_native_failure; let mut captured = camera.capture_frame(prefer_native)?; let camera_frame_acquired_at = Instant::now(); match &mut captured.buffer { @@ -1916,15 +1918,15 @@ async fn run_capture_loop( /// Capture loop dedicated to Jetson MIPI capture via libargus. /// /// Argus blocks inside `acquireFrame`, pacing capture itself, so this loop runs in a -/// dedicated OS thread and pushes NV12 DMA-buffer fds straight into `NativeVideoSource` -/// via [`NativeVideoSource::capture_dmabuf_frame_with_metadata`] for zero-copy hand-off -/// to the Jetson hardware encoder. +/// dedicated OS thread. The normal path pushes NV12 DMA-buffer fds straight into +/// [`NativeVideoSource::capture_dmabuf_frame_with_metadata`] for zero-copy hand-off +/// to the Jetson hardware encoder; timestamp burn explicitly copies to CPU I420. 
#[cfg(all(target_os = "linux", target_arch = "aarch64"))] async fn run_argus_capture_loop( config: CaptureConfig, ctrl_c_received: Arc, rtc_source: NativeVideoSource, - session: argus::ArgusCaptureSession, + session: ArgusCaptureSession, width: u32, height: u32, user_data_channels: Option>>, @@ -1932,13 +1934,22 @@ async fn run_argus_capture_loop( let capture_handle = std::thread::Builder::new() .name("mipi-capture".into()) .spawn(move || -> Result<()> { + enum CapturedArgusFrame { + DmaBuf(argus::ArgusFrame), + I420(argus::ArgusI420Frame), + } + let mut session = session; - let start_ts = Instant::now(); + let burn_timestamp_enabled = config.attach_timestamp && config.burn_timestamp; + let mut timestamp_overlay = + burn_timestamp_enabled.then(|| TimestampOverlay::new(width, height)); let mut frames: u64 = 0; let mut last_fps_log = Instant::now(); let mut sum_acquire_ms = 0.0; let mut sum_argus_wait_ms = 0.0; let mut sum_argus_blit_ms = 0.0; + let mut sum_argus_i420_copy_ms = 0.0; + let mut sum_timestamp_burn_ms = 0.0; let mut sum_capture_ms = 0.0; let mut sum_iter_ms = 0.0; let mut consecutive_failures: u32 = 0; @@ -1950,6 +1961,11 @@ async fn run_argus_capture_loop( let mut backup_timestamp_frames: u64 = 0; let mut sum_sensor_to_acquire_ms = 0.0; let mut sum_sensor_to_argus_acquire_ms = 0.0; + if burn_timestamp_enabled { + info!( + "Argus timestamp burn enabled: copying NV12 DMA-BUF frames to CPU I420 before publish" + ); + } loop { if ctrl_c_received.load(Ordering::Acquire) { @@ -1958,7 +1974,12 @@ async fn run_argus_capture_loop( let iter_start = Instant::now(); let acquire_started_at = Instant::now(); - let argus_frame = match session.acquire_frame() { + let capture_result = if burn_timestamp_enabled { + session.capture_i420_frame().map(CapturedArgusFrame::I420) + } else { + session.capture_frame().map(CapturedArgusFrame::DmaBuf) + }; + let captured_frame = match capture_result { Ok(frame) => { consecutive_failures = 0; frame @@ -1979,6 +2000,12 @@ async fn 
run_argus_capture_loop( } }; let acquire_finished_at = Instant::now(); + let argus_frame = match &captured_frame { + CapturedArgusFrame::DmaBuf(frame) => frame, + CapturedArgusFrame::I420(frame) => &frame.dmabuf, + }; + let argus_wait_ms = argus_frame.acquire_wait_ns as f64 / 1_000_000.0; + let argus_blit_ms = argus_frame.blit_ns as f64 / 1_000_000.0; let fallback_wall_time_us = if config.attach_timestamp { unix_time_us_now() } else { 0 }; @@ -2023,14 +2050,12 @@ async fn run_argus_capture_loop( if config.attach_timestamp { if timestamp_from_sensor { sensor_timestamp_frames += 1; - let sensor_to_acquire_ms = fallback_wall_time_us - .saturating_sub(capture_wall_time_us) - as f64 - / 1_000.0; - let blit_ms = argus_frame.blit_ns as f64 / 1_000_000.0; + let sensor_to_acquire_ms = + fallback_wall_time_us.saturating_sub(capture_wall_time_us) as f64 + / 1_000.0; sum_sensor_to_acquire_ms += sensor_to_acquire_ms; sum_sensor_to_argus_acquire_ms += - (sensor_to_acquire_ms - blit_ms).max(0.0); + (sensor_to_acquire_ms - argus_blit_ms).max(0.0); } else { backup_timestamp_frames += 1; } @@ -2052,20 +2077,45 @@ async fn run_argus_capture_loop( None }; - rtc_source.capture_dmabuf_frame_with_metadata( - argus_frame.dmabuf_fd, - width, - height, - 0, // NV12 - start_ts.elapsed().as_micros() as i64, - frame_metadata, - ); + match captured_frame { + CapturedArgusFrame::DmaBuf(argus_frame) => { + let plane = argus_frame + .dmabuf + .planes + .first() + .ok_or_else(|| anyhow::anyhow!("Argus DMA-BUF frame missing plane"))?; + rtc_source.capture_dmabuf_frame_with_metadata( + plane.fd, + argus_frame.dmabuf.width, + argus_frame.dmabuf.height, + 0, // NV12 + argus_frame.dmabuf.timestamp_us, + frame_metadata, + ); + } + CapturedArgusFrame::I420(mut argus_i420_frame) => { + if let Some(overlay) = timestamp_overlay.as_mut() { + let overlay_started_at = Instant::now(); + let (stride_y, _, _) = argus_i420_frame.frame.buffer.strides(); + let (data_y, _, _) = 
argus_i420_frame.frame.buffer.data_mut(); + overlay.draw(data_y, stride_y as usize, capture_wall_time_us, fid); + sum_timestamp_burn_ms += + overlay_started_at.elapsed().as_secs_f64() * 1000.0; + } + sum_argus_i420_copy_ms += + argus_i420_frame.copy_to_i420_ns as f64 / 1_000_000.0; + argus_i420_frame.frame.frame_metadata = frame_metadata; + argus_i420_frame.frame.timestamp_us = + argus_i420_frame.dmabuf.dmabuf.timestamp_us; + rtc_source.capture_frame(&argus_i420_frame.frame); + } + } let capture_finished_at = Instant::now(); frames += 1; sum_acquire_ms += (acquire_finished_at - acquire_started_at).as_secs_f64() * 1000.0; - sum_argus_wait_ms += argus_frame.acquire_wait_ns as f64 / 1_000_000.0; - sum_argus_blit_ms += argus_frame.blit_ns as f64 / 1_000_000.0; + sum_argus_wait_ms += argus_wait_ms; + sum_argus_blit_ms += argus_blit_ms; sum_capture_ms += (capture_finished_at - acquire_finished_at).as_secs_f64() * 1000.0; sum_iter_ms += (Instant::now() - iter_start).as_secs_f64() * 1000.0; @@ -2085,21 +2135,41 @@ async fn run_argus_capture_loop( } else { 0.0 }; - info!( - "MIPI publishing: {}x{}, ~{:.1} fps | packet trailer timestamp source: sensor {} frames, backup system {} frames | avg ms: sensor_to_argus_acquire {:.2}, argus_wait {:.2}, argus_blit {:.2}, sensor_to_acquire {:.2}, acquire {:.2}, capture {:.2}, iter {:.2}", - width, - height, - fps_est, - sensor_timestamp_frames, - backup_timestamp_frames, - sensor_to_argus_acquire_ms, - sum_argus_wait_ms / n, - sum_argus_blit_ms / n, - sensor_age_ms, - sum_acquire_ms / n, - sum_capture_ms / n, - sum_iter_ms / n, - ); + if burn_timestamp_enabled { + info!( + "MIPI publishing: {}x{}, ~{:.1} fps | packet trailer timestamp source: sensor {} frames, backup system {} frames | avg ms: sensor_to_argus_acquire {:.2}, argus_wait {:.2}, argus_blit {:.2}, argus_i420_copy {:.2}, timestamp_burn {:.2}, sensor_to_acquire {:.2}, acquire {:.2}, capture {:.2}, iter {:.2}", + width, + height, + fps_est, + sensor_timestamp_frames, + 
backup_timestamp_frames, + sensor_to_argus_acquire_ms, + sum_argus_wait_ms / n, + sum_argus_blit_ms / n, + sum_argus_i420_copy_ms / n, + sum_timestamp_burn_ms / n, + sensor_age_ms, + sum_acquire_ms / n, + sum_capture_ms / n, + sum_iter_ms / n, + ); + } else { + info!( + "MIPI publishing: {}x{}, ~{:.1} fps | packet trailer timestamp source: sensor {} frames, backup system {} frames | avg ms: sensor_to_argus_acquire {:.2}, argus_wait {:.2}, argus_blit {:.2}, sensor_to_acquire {:.2}, acquire {:.2}, capture {:.2}, iter {:.2}", + width, + height, + fps_est, + sensor_timestamp_frames, + backup_timestamp_frames, + sensor_to_argus_acquire_ms, + sum_argus_wait_ms / n, + sum_argus_blit_ms / n, + sensor_age_ms, + sum_acquire_ms / n, + sum_capture_ms / n, + sum_iter_ms / n, + ); + } } else { info!( "MIPI publishing: {}x{}, ~{:.1} fps | packet trailer timestamp: disabled | avg ms: argus_wait {:.2}, argus_blit {:.2}, acquire {:.2}, capture {:.2}, iter {:.2}", @@ -2119,6 +2189,8 @@ async fn run_argus_capture_loop( sum_acquire_ms = 0.0; sum_argus_wait_ms = 0.0; sum_argus_blit_ms = 0.0; + sum_argus_i420_copy_ms = 0.0; + sum_timestamp_burn_ms = 0.0; sum_capture_ms = 0.0; sum_iter_ms = 0.0; sum_sensor_to_acquire_ms = 0.0; diff --git a/livekit-capture/src/sources/argus.rs b/livekit-capture/src/sources/argus.rs index ba365b714..f522d944e 100644 --- a/livekit-capture/src/sources/argus.rs +++ b/livekit-capture/src/sources/argus.rs @@ -14,6 +14,7 @@ //! NVIDIA Argus/libargus capture for Jetson MIPI CSI cameras. 
+use livekit::webrtc::video_frame::{I420Buffer, VideoFrame, VideoRotation}; use thiserror::Error; #[cfg(livekit_capture_argus)] @@ -50,6 +51,18 @@ extern "C" { blit_ns: *mut u64, ) -> c_int; + fn lk_argus_copy_frame_to_i420( + session: *mut c_void, + dmabuf_fd: c_int, + dst_y: *mut u8, + dst_stride_y: c_int, + dst_u: *mut u8, + dst_stride_u: c_int, + dst_v: *mut u8, + dst_stride_v: c_int, + copy_to_i420_ns: *mut u64, + ) -> c_int; + fn lk_argus_release_frame(session: *mut c_void); } @@ -99,6 +112,53 @@ pub enum ArgusError { /// The C shim failed to acquire a frame. #[error("Argus frame acquisition failed")] AcquireFrameFailed, + /// The captured DMA-BUF frame did not include a plane descriptor. + #[error("Argus frame did not include a DMA-BUF plane")] + MissingDmaBufPlane, + /// The C shim failed to copy the captured frame to I420. + #[error("failed to copy Argus frame to I420: {0}")] + CopyToI420Failed(ArgusI420CopyError), +} + +/// Error returned while copying an Argus DMA-BUF frame to CPU I420. +#[derive(Debug, Clone, Copy, Error, PartialEq, Eq)] +pub enum ArgusI420CopyError { + /// The C shim received invalid arguments. + #[error("invalid argument")] + InvalidArgument, + /// The DMA-BUF fd was not found in the active Argus buffer ring. + #[error("DMA-BUF surface not found")] + SurfaceNotFound, + /// Mapping the DMA-BUF surface for CPU readback failed. + #[error("failed to map DMA-BUF surface for CPU readback: {0}")] + MapFailed(i32), + /// Synchronizing the DMA-BUF surface for CPU readback failed. + #[error("failed to synchronize DMA-BUF surface for CPU readback: {0}")] + SyncForCpuFailed(i32), + /// The mapped surface did not expose valid NV12 planes. + #[error("invalid mapped NV12 surface")] + InvalidSurface, + /// Unmapping the DMA-BUF surface failed. + #[error("failed to unmap DMA-BUF surface: {0}")] + UnmapFailed(i32), + /// The C shim returned an unknown error code. 
+ #[error("unknown error code {0}")] + Unknown(i32), +} + +#[cfg(livekit_capture_argus)] +impl ArgusI420CopyError { + fn from_status(status: i32) -> Self { + match status { + -1 => Self::InvalidArgument, + -2 => Self::SurfaceNotFound, + -4 => Self::InvalidSurface, + code if code <= -2000 => Self::SyncForCpuFailed(-2000 - code), + code if code <= -1000 => Self::MapFailed(-1000 - code), + code if code <= -100 => Self::UnmapFailed(-100 - code), + code => Self::Unknown(code), + } + } } /// One Argus frame backed by an NV12 DMA-BUF. @@ -123,6 +183,17 @@ impl ArgusFrame { } } +/// One Argus frame copied to CPU-accessible I420. +#[derive(Debug)] +pub struct ArgusI420Frame { + /// I420 frame suitable for timestamp burning or other CPU-side mutation. + pub frame: VideoFrame, + /// Original Argus DMA-BUF frame descriptor. + pub dmabuf: ArgusFrame, + /// Time spent copying NV12 DMA-BUF data into the I420 frame. + pub copy_to_i420_ns: u64, +} + /// Jetson Argus capture session that emits NV12 DMA-BUF frames. #[derive(Debug)] pub struct ArgusCaptureSession { @@ -152,6 +223,22 @@ impl ArgusCaptureSession { self.acquire_frame_inner() } + /// Captures the next frame and copies it to CPU-accessible I420. + /// + /// This intentionally maps the DMA-BUF for CPU readback and should be used + /// only when the caller needs to mutate pixels before publishing. + pub fn capture_i420_frame(&mut self) -> Result { + let dmabuf = self.capture_frame()?; + let mut frame = VideoFrame { + rotation: VideoRotation::VideoRotation0, + timestamp_us: dmabuf.dmabuf.timestamp_us, + frame_metadata: None, + buffer: I420Buffer::new(dmabuf.dmabuf.width, dmabuf.dmabuf.height), + }; + let copy_to_i420_ns = self.copy_frame_to_i420(&dmabuf.dmabuf, &mut frame.buffer)?; + Ok(ArgusI420Frame { frame, dmabuf, copy_to_i420_ns }) + } + /// Acquires the next captured frame as an NV12 DMA-BUF. 
#[deprecated(note = "use capture_frame")] pub fn acquire_frame(&mut self) -> Result { @@ -258,6 +345,47 @@ impl ArgusCaptureSession { Err(ArgusError::Unsupported) } + #[cfg(livekit_capture_argus)] + fn copy_frame_to_i420( + &self, + dmabuf: &DmaBufFrame, + destination: &mut I420Buffer, + ) -> Result { + let plane = dmabuf.planes.first().ok_or(ArgusError::MissingDmaBufPlane)?; + let (stride_y, stride_u, stride_v) = destination.strides(); + let (dst_y, dst_u, dst_v) = destination.data_mut(); + let mut copy_to_i420_ns = 0; + let status = unsafe { + // SAFETY: `self.handle` owns the Argus session; destination slices + // come from a mutable I420 buffer and remain valid for this call. + lk_argus_copy_frame_to_i420( + self.handle, + plane.fd, + dst_y.as_mut_ptr(), + c_int_from_u32(stride_y, "stride_y")?, + dst_u.as_mut_ptr(), + c_int_from_u32(stride_u, "stride_u")?, + dst_v.as_mut_ptr(), + c_int_from_u32(stride_v, "stride_v")?, + &mut copy_to_i420_ns, + ) + }; + if status == 0 { + Ok(copy_to_i420_ns) + } else { + Err(ArgusError::CopyToI420Failed(ArgusI420CopyError::from_status(status))) + } + } + + #[cfg(not(livekit_capture_argus))] + fn copy_frame_to_i420( + &self, + _dmabuf: &DmaBufFrame, + _destination: &mut I420Buffer, + ) -> Result { + Err(ArgusError::Unsupported) + } + #[cfg(livekit_capture_argus)] fn release_frame_inner(&mut self) { unsafe { diff --git a/livekit-capture/src/sources/lk_argus.cpp b/livekit-capture/src/sources/lk_argus.cpp index 5ae3931f3..18b521550 100644 --- a/livekit-capture/src/sources/lk_argus.cpp +++ b/livekit-capture/src/sources/lk_argus.cpp @@ -22,6 +22,7 @@ #include #include +#include #include #include @@ -67,6 +68,26 @@ struct LkArgusSession { static const uint64_t kAcquireTimeoutNs = 1000000000ULL; // 1 second +static constexpr int kCopyI420InvalidArgument = -1; +static constexpr int kCopyI420SurfaceNotFound = -2; +static constexpr int kCopyI420InvalidSurface = -4; + +static int copy_i420_error_code(int ret) { + return ret < 0 ? 
-ret : ret; +} + +static int copy_i420_map_error(int ret) { + return -1000 - copy_i420_error_code(ret); +} + +static int copy_i420_sync_error(int ret) { + return -2000 - copy_i420_error_code(ret); +} + +static int copy_i420_unmap_error(int ret) { + return -100 - copy_i420_error_code(ret); +} + enum class SensorTimestampStatus { Available, InvalidArgs, @@ -583,6 +604,106 @@ int lk_argus_acquire_frame(void* handle) { return lk_argus_acquire_frame_with_metadata(handle, nullptr, nullptr, nullptr); } +int lk_argus_copy_frame_to_i420( + void* handle, + int dmabuf_fd, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + uint64_t* copy_to_i420_ns) { + using Clock = std::chrono::steady_clock; + + auto* s = static_cast(handle); + if (!s || dmabuf_fd < 0 || !dst_y || !dst_u || !dst_v) { + return kCopyI420InvalidArgument; + } + + const int width = s->width; + const int height = s->height; + const int chroma_width = (width + 1) / 2; + const int chroma_height = (height + 1) / 2; + if (width <= 0 || height <= 0 || + dst_stride_y < width || + dst_stride_u < chroma_width || + dst_stride_v < chroma_width) { + return kCopyI420InvalidArgument; + } + + NvBufSurface* surface = nullptr; + for (int i = 0; i < kNumDmaBufs; i++) { + if (s->dmabuf_fds[i] == dmabuf_fd) { + surface = s->dmabuf_surfaces[i]; + break; + } + } + if (!surface || surface->batchSize < 1) { + return kCopyI420SurfaceNotFound; + } + + auto t0 = Clock::now(); + int ret = NvBufSurfaceMap(surface, 0, -1, NVBUF_MAP_READ); + if (ret != 0) { + return copy_i420_map_error(ret); + } + + ret = NvBufSurfaceSyncForCpu(surface, 0, -1); + if (ret != 0) { + int unmap_ret = NvBufSurfaceUnMap(surface, 0, -1); + if (unmap_ret != 0) { + return copy_i420_unmap_error(unmap_ret); + } + return copy_i420_sync_error(ret); + } + + const NvBufSurfaceParams& params = surface->surfaceList[0]; + const uint8_t* src_y = + static_cast(params.mappedAddr.addr[0]); + const uint8_t* src_uv = + 
static_cast(params.mappedAddr.addr[1]); + const int src_stride_y = static_cast(params.planeParams.pitch[0]); + const int src_stride_uv = static_cast(params.planeParams.pitch[1]); + + if (!src_y || !src_uv || + src_stride_y < width || + src_stride_uv < chroma_width * 2) { + ret = NvBufSurfaceUnMap(surface, 0, -1); + if (ret != 0) { + return copy_i420_unmap_error(ret); + } + return kCopyI420InvalidSurface; + } + + for (int row = 0; row < height; row++) { + std::memcpy(dst_y + row * dst_stride_y, + src_y + row * src_stride_y, + static_cast(width)); + } + + for (int row = 0; row < chroma_height; row++) { + const uint8_t* src_row = src_uv + row * src_stride_uv; + uint8_t* dst_u_row = dst_u + row * dst_stride_u; + uint8_t* dst_v_row = dst_v + row * dst_stride_v; + for (int col = 0; col < chroma_width; col++) { + dst_u_row[col] = src_row[col * 2]; + dst_v_row[col] = src_row[col * 2 + 1]; + } + } + + ret = NvBufSurfaceUnMap(surface, 0, -1); + auto t1 = Clock::now(); + if (copy_to_i420_ns) { + *copy_to_i420_ns = static_cast( + std::chrono::duration_cast(t1 - t0).count()); + } + if (ret != 0) { + return copy_i420_unmap_error(ret); + } + return 0; +} + void lk_argus_release_frame(void* handle) { auto* s = static_cast(handle); if (!s) return; From 6f278bc6a1ae51524f309f23b0f38cac47aa776d Mon Sep 17 00:00:00 2001 From: David Chen Date: Tue, 30 Jun 2026 00:25:53 -0700 Subject: [PATCH 17/24] support h264 avc --- examples/preencode_publish/src/main.rs | 152 +++++++++++++-- livekit-capture/src/encoded.rs | 8 + livekit-capture/src/encoded/h26x.rs | 230 +++++++++++++++++++++++ livekit-capture/src/sources/gstreamer.rs | 32 +++- livekit-capture/src/sources/tcp.rs | 73 ++++++- 5 files changed, 480 insertions(+), 15 deletions(-) diff --git a/examples/preencode_publish/src/main.rs b/examples/preencode_publish/src/main.rs index 4ee24c0d0..a533f02c9 100644 --- a/examples/preencode_publish/src/main.rs +++ b/examples/preencode_publish/src/main.rs @@ -92,6 +92,14 @@ struct Args { #[arg(long, 
default_value_t = 30)] fps: u32, + /// H.264 TCP byte-stream format. + #[arg(long, value_enum, default_value_t = H264FormatArg::AnnexB)] + h264_format: H264FormatArg, + + /// Length-prefix size in bytes for --h264-format avc. + #[arg(long, default_value_t = 4)] + avc_nal_length_size: u8, + /// Log access-unit timing, keyframe, and H26x NAL diagnostics. #[arg(long)] diagnostics: bool, @@ -118,6 +126,12 @@ enum CodecArg { H265, } +#[derive(Debug, Clone, Copy, PartialEq, Eq, ValueEnum)] +enum H264FormatArg { + AnnexB, + Avc, +} + impl CodecArg { fn encoded_codec(self) -> EncodedVideoCodec { match self { @@ -126,9 +140,14 @@ impl CodecArg { } } - fn wire_format(self) -> EncodedWireFormat { + fn wire_format(self, h264_format: H264FormatArg, avc_nal_length_size: u8) -> EncodedWireFormat { match self { - Self::H264 => EncodedWireFormat::H264AnnexB, + Self::H264 => match h264_format { + H264FormatArg::AnnexB => EncodedWireFormat::H264AnnexB, + H264FormatArg::Avc => { + EncodedWireFormat::H264Avc { nal_length_size: avc_nal_length_size } + } + }, Self::H265 => EncodedWireFormat::H265AnnexB, } } @@ -142,6 +161,7 @@ async fn main() -> Result<()> { async fn run(args: Args) -> Result<()> { validate_dimensions(args.width, args.height)?; + validate_h264_format_args(&args)?; #[cfg(feature = "gstreamer")] validate_gstreamer_args(&args)?; @@ -167,19 +187,35 @@ fn validate_gstreamer_args(args: &Args) -> Result<()> { Ok(()) } +fn validate_h264_format_args(args: &Args) -> Result<()> { + if !(1..=4).contains(&args.avc_nal_length_size) { + bail!("--avc-nal-length-size must be between 1 and 4 bytes"); + } + if args.h264_format == H264FormatArg::Avc { + if args.source != SourceKind::Tcp { + bail!("--h264-format avc is only valid with --source tcp"); + } + if args.codec != Some(CodecArg::H264) { + bail!("--h264-format avc requires --codec h264"); + } + } + Ok(()) +} + async fn run_tcp_source(args: Args, frame_interval_us: i64) -> Result<()> { let codec_arg = args.codec.context("--codec is 
required with --source tcp")?; let codec = codec_arg.encoded_codec(); let host = args.host.clone().context("--host is required with --source tcp")?; + let wire_format = codec_arg.wire_format(args.h264_format, args.avc_nal_length_size); let config = ByteStreamSourceConfig::new( - codec_arg.wire_format(), + wire_format, current_time_us(), frame_interval_us, args.width, args.height, ); - log::info!("Connecting to TCP encoded stream at {host}"); + log::info!("Connecting to TCP {wire_format:?} encoded stream at {host}"); let stream = TcpStream::connect(&host) .with_context(|| format!("failed to connect to TCP source at {host}"))?; let shutdown_stream = stream.try_clone().context("failed to clone TCP stream")?; @@ -407,10 +443,22 @@ fn ensure_encoded_appsink( requested_codec: Option, ) -> Result<(gst::Element, GStreamerSampleFormat)> { if let Some(appsink) = pipeline.by_name(GSTREAMER_APPSINK_NAME) { - let codec = requested_codec - .or_else(|| codec_from_element_sink_caps(&appsink)) - .unwrap_or(EncodedVideoCodec::H264); - let sample_format = h26x_sample_format(codec)?; + let sample_format = match sample_format_from_element_sink_caps(&appsink)? 
{ + Some(sample_format) => { + if let Some(requested_codec) = requested_codec { + if requested_codec != sample_format.codec() { + bail!( + "GStreamer codec mismatch: --codec requested {:?}, but appsink '{}' advertises {:?}", + requested_codec, + GSTREAMER_APPSINK_NAME, + sample_format.codec() + ); + } + } + sample_format + } + None => h26x_sample_format(requested_codec.unwrap_or(EncodedVideoCodec::H264))?, + }; return Ok((appsink, sample_format)); } @@ -478,7 +526,7 @@ fn h26x_sample_format(codec: EncodedVideoCodec) -> Result EncodedVideoCodec::H264 => Ok(GStreamerSampleFormat::H264AnnexB), EncodedVideoCodec::H265 => Ok(GStreamerSampleFormat::H265AnnexB), EncodedVideoCodec::VP8 | EncodedVideoCodec::VP9 | EncodedVideoCodec::AV1 => bail!( - "GStreamer passthrough currently supports H.264/H.265 Annex-B; {:?} needs an explicit access-unit source path", + "GStreamer passthrough currently supports H.264/H.265 access units; {:?} needs an explicit access-unit source path", codec ), _ => bail!("unsupported GStreamer codec: {:?}", codec), @@ -518,9 +566,68 @@ fn h26x_appsink_caps(codec: EncodedVideoCodec) -> Result { } #[cfg(feature = "gstreamer")] -fn codec_from_element_sink_caps(element: &gst::Element) -> Option { - let sink_pad = element.static_pad("sink")?; - codec_from_pad_caps(&sink_pad) +fn sample_format_from_element_sink_caps( + element: &gst::Element, +) -> Result> { + let Some(sink_pad) = element.static_pad("sink") else { + return Ok(None); + }; + sample_format_from_pad_caps(&sink_pad) +} + +#[cfg(feature = "gstreamer")] +fn sample_format_from_pad_caps(pad: &gst::Pad) -> Result> { + let caps = pad.current_caps().unwrap_or_else(|| pad.query_caps(None)); + for structure in caps.iter() { + if let Some(sample_format) = sample_format_from_caps_structure(structure)? 
{ + return Ok(Some(sample_format)); + } + } + Ok(None) +} + +#[cfg(feature = "gstreamer")] +fn sample_format_from_caps_structure( + structure: &gst::StructureRef, +) -> Result> { + let Some(codec) = codec_from_caps_name(structure.name()) else { + return Ok(None); + }; + + match codec { + EncodedVideoCodec::H264 => { + let stream_format = structure.get::("stream-format").ok(); + match stream_format.as_deref() { + Some("avc") | Some("avc3") => Ok(Some(GStreamerSampleFormat::H264Avc { + nal_length_size: h264_avc_nal_length_size_from_caps(structure), + })), + Some("byte-stream") | None => Ok(Some(GStreamerSampleFormat::H264AnnexB)), + Some(stream_format) => bail!( + "unsupported GStreamer H.264 stream-format '{stream_format}'; expected byte-stream or avc" + ), + } + } + EncodedVideoCodec::H265 => Ok(Some(GStreamerSampleFormat::H265AnnexB)), + EncodedVideoCodec::VP8 | EncodedVideoCodec::VP9 | EncodedVideoCodec::AV1 => Ok(None), + _ => Ok(None), + } +} + +#[cfg(feature = "gstreamer")] +fn h264_avc_nal_length_size_from_caps(structure: &gst::StructureRef) -> u8 { + let Ok(codec_data) = structure.get::("codec_data") else { + return 4; + }; + let Ok(codec_data) = codec_data.map_readable() else { + return 4; + }; + h264_avc_nal_length_size_from_codec_data(codec_data.as_ref()).unwrap_or(4) +} + +#[cfg(feature = "gstreamer")] +fn h264_avc_nal_length_size_from_codec_data(codec_data: &[u8]) -> Option { + let length_size = (codec_data.get(4)? 
& 0x03) + 1; + (1..=4).contains(&length_size).then_some(length_size) } #[cfg(feature = "gstreamer")] @@ -1095,6 +1202,27 @@ mod tests { ); } + #[test] + fn gstreamer_caps_detect_h264_avc_sample_format() { + let caps = gst::Caps::builder("video/x-h264") + .field("stream-format", "avc") + .field("alignment", "au") + .build(); + let structure = caps.iter().next().unwrap(); + + assert_eq!( + sample_format_from_caps_structure(structure).unwrap(), + Some(GStreamerSampleFormat::H264Avc { nal_length_size: 4 }) + ); + } + + #[test] + fn gstreamer_avc_codec_data_sets_nal_length_size() { + assert_eq!(h264_avc_nal_length_size_from_codec_data(&[1, 0, 0, 0, 0xfc]), Some(1)); + assert_eq!(h264_avc_nal_length_size_from_codec_data(&[1, 0, 0, 0, 0xfd]), Some(2)); + assert_eq!(h264_avc_nal_length_size_from_codec_data(&[1, 0, 0, 0, 0xff]), Some(4)); + } + #[test] fn gstreamer_test_source_pulls_h264_access_units_when_plugins_are_available() { let frame_interval_us = frame_interval_us(30).unwrap(); diff --git a/livekit-capture/src/encoded.rs b/livekit-capture/src/encoded.rs index 465fdb493..d2e2059c2 100644 --- a/livekit-capture/src/encoded.rs +++ b/livekit-capture/src/encoded.rs @@ -34,6 +34,14 @@ const ANNEX_B_START_CODE: [u8; 4] = [0, 0, 0, 1]; pub enum EncodedWireFormat { /// H.264 Annex-B byte stream. H264AnnexB, + /// H.264/AVC byte stream with length-prefixed NAL units. + /// + /// `nal_length_size` is the number of big-endian length bytes before each NAL unit. Values + /// from 1 through 4 are accepted; 4 is the common AVC configuration. + H264Avc { + /// Length-prefix size in bytes. + nal_length_size: u8, + }, /// H.265 Annex-B byte stream. H265AnnexB, /// RTP packets for the supplied codec and RTP clock rate. 
diff --git a/livekit-capture/src/encoded/h26x.rs b/livekit-capture/src/encoded/h26x.rs index 1bb3716a0..d151eb1c2 100644 --- a/livekit-capture/src/encoded/h26x.rs +++ b/livekit-capture/src/encoded/h26x.rs @@ -35,6 +35,18 @@ pub struct AnnexBAccessUnitParser { height: u32, } +/// H.264/AVC length-prefixed parser state. +#[cfg(any(feature = "tcp-source", test))] +#[derive(Debug, Clone)] +pub(crate) struct AvcAccessUnitParser { + pending: Vec, + nal_length_size: u8, + next_timestamp_us: i64, + frame_interval_us: i64, + width: u32, + height: u32, +} + impl AnnexBAccessUnitParser { /// Creates a parser for H.264 or H.265 Annex-B byte streams. pub fn new( @@ -111,6 +123,83 @@ impl AnnexBAccessUnitParser { } } +#[cfg(any(feature = "tcp-source", test))] +impl AvcAccessUnitParser { + /// Creates a parser for H.264/AVC length-prefixed byte streams. + pub(crate) fn new( + nal_length_size: u8, + start_timestamp_us: i64, + frame_interval_us: i64, + width: u32, + height: u32, + ) -> Result { + validate_avc_nal_length_size(nal_length_size)?; + + Ok(Self { + pending: Vec::new(), + nal_length_size, + next_timestamp_us: start_timestamp_us, + frame_interval_us, + width, + height, + }) + } + + /// Pushes encoded bytes and returns the next complete access unit if one is found. + pub(crate) fn push( + &mut self, + bytes: &[u8], + ) -> Result, CaptureError> { + self.pending.extend_from_slice(bytes); + self.drain_next(false) + } + + /// Flushes the pending bytes as the final access unit. + pub(crate) fn flush(&mut self) -> Result, CaptureError> { + self.drain_next(true) + } + + fn drain_next(&mut self, at_eof: bool) -> Result, CaptureError> { + let ranges = avc_nal_ranges(&self.pending, self.nal_length_size, at_eof)?; + if ranges.is_empty() { + return Ok(None); + } + + let Some(split_at) = + avc_access_unit_split_index(&self.pending, &ranges, self.nal_length_size as usize)? 
+ else { + if at_eof { + return self.take_access_unit(self.pending.len()); + } + return Ok(None); + }; + + self.take_access_unit(split_at) + } + + fn take_access_unit( + &mut self, + byte_len: usize, + ) -> Result, CaptureError> { + if byte_len == 0 { + return Ok(None); + } + + let access_unit = self.pending[..byte_len].to_vec(); + self.pending.drain(..byte_len); + let timestamp_us = self.next_timestamp_us; + self.next_timestamp_us = self.next_timestamp_us.saturating_add(self.frame_interval_us); + access_unit_from_h264_avc( + &access_unit, + self.nal_length_size, + timestamp_us, + self.width, + self.height, + ) + .map(Some) + } +} + /// Returns NAL-unit byte ranges for an Annex-B access unit or stream chunk. pub fn annex_b_nal_ranges(bytes: &[u8]) -> Vec> { let mut ranges = Vec::new(); @@ -147,6 +236,18 @@ pub fn annex_b_nalus(bytes: &[u8]) -> Result, CaptureError> { Ok(nals) } +/// Creates an Annex-B access unit from H.264/AVC length-prefixed NAL units. +pub(crate) fn access_unit_from_h264_avc( + payload: &[u8], + nal_length_size: u8, + timestamp_us: i64, + width: u32, + height: u32, +) -> Result { + let nals = avc_nalus(payload, nal_length_size)?; + access_unit_from_nalus(EncodedVideoCodec::H264, &nals, timestamp_us, width, height) +} + /// Creates an access unit from an Annex-B buffer. pub fn access_unit_from_annex_b( codec: EncodedVideoCodec, @@ -228,6 +329,32 @@ fn access_unit_split_index( Ok(None) } +#[cfg(any(feature = "tcp-source", test))] +fn avc_access_unit_split_index( + bytes: &[u8], + ranges: &[Range], + nal_length_size: usize, +) -> Result, CaptureError> { + if ranges.len() < 2 { + return Ok(None); + } + + let first_nal = &bytes[ranges[0].clone()]; + let mut seen_vcl = is_vcl_nal(EncodedVideoCodec::H264, first_nal)?; + for range in ranges.iter().skip(1) { + let nal = &bytes[range.clone()]; + if is_access_unit_delimiter(EncodedVideoCodec::H264, nal)? 
&& seen_vcl { + return range + .start + .checked_sub(nal_length_size) + .ok_or(CaptureError::InvalidEncodedData("missing AVC NAL length")) + .map(Some); + } + seen_vcl |= is_vcl_nal(EncodedVideoCodec::H264, nal)?; + } + Ok(None) +} + fn split_start_code_index(bytes: &[u8], nal_start: usize) -> Result { if nal_start >= 4 && bytes[nal_start - 4..nal_start] == [0, 0, 0, 1] { return Ok(nal_start - 4); @@ -272,6 +399,66 @@ fn find_start_code(bytes: &[u8]) -> Option<(usize, usize)> { None } +fn avc_nalus(payload: &[u8], nal_length_size: u8) -> Result, CaptureError> { + let ranges = avc_nal_ranges(payload, nal_length_size, true)?; + if ranges.is_empty() { + return Err(CaptureError::EmptyPayload); + } + Ok(ranges.into_iter().map(|range| &payload[range]).collect()) +} + +fn avc_nal_ranges( + bytes: &[u8], + nal_length_size: u8, + at_eof: bool, +) -> Result>, CaptureError> { + validate_avc_nal_length_size(nal_length_size)?; + + let nal_length_size = nal_length_size as usize; + let mut ranges = Vec::new(); + let mut cursor = 0; + while cursor < bytes.len() { + if bytes.len() - cursor < nal_length_size { + if at_eof { + return Err(CaptureError::InvalidEncodedData("truncated AVC NAL length")); + } + break; + } + + let nal_len = read_avc_nal_length(&bytes[cursor..cursor + nal_length_size]); + cursor += nal_length_size; + if nal_len == 0 { + return Err(CaptureError::InvalidEncodedData("empty AVC NAL unit")); + } + + let Some(nal_end) = cursor.checked_add(nal_len) else { + return Err(CaptureError::InvalidEncodedData("AVC NAL unit length overflow")); + }; + if nal_end > bytes.len() { + if at_eof { + return Err(CaptureError::InvalidEncodedData("truncated AVC NAL unit")); + } + break; + } + + ranges.push(cursor..nal_end); + cursor = nal_end; + } + + Ok(ranges) +} + +fn read_avc_nal_length(bytes: &[u8]) -> usize { + bytes.iter().fold(0usize, |len, byte| (len << 8) | usize::from(*byte)) +} + +fn validate_avc_nal_length_size(nal_length_size: u8) -> Result<(), CaptureError> { + if 
(1..=4).contains(&nal_length_size) { + return Ok(()); + } + Err(CaptureError::InvalidEncodedData("invalid AVC NAL length size")) +} + #[cfg(test)] mod tests { use super::*; @@ -289,6 +476,32 @@ mod tests { assert!(is_keyframe_annex_b(EncodedVideoCodec::H264, &bytes).unwrap()); } + #[test] + fn access_unit_from_avc_converts_length_prefixed_nals() { + let bytes = [0, 0, 0, 4, 0x67, 1, 2, 3, 0, 0, 0, 3, 0x65, 4, 5]; + let au = access_unit_from_h264_avc(&bytes, 4, 10, 640, 480).unwrap(); + + assert_eq!(au.codec, EncodedVideoCodec::H264); + assert_eq!(au.frame_type, EncodedFrameType::Key); + assert_eq!(au.payload.as_ref(), &[0, 0, 0, 1, 0x67, 1, 2, 3, 0, 0, 0, 1, 0x65, 4, 5]); + } + + #[test] + fn access_unit_from_avc_supports_two_byte_lengths() { + let bytes = [0, 2, 0x61, 1]; + let au = access_unit_from_h264_avc(&bytes, 2, 10, 640, 480).unwrap(); + + assert_eq!(au.frame_type, EncodedFrameType::Delta); + assert_eq!(au.payload.as_ref(), &[0, 0, 0, 1, 0x61, 1]); + } + + #[test] + fn access_unit_from_avc_rejects_truncated_nal() { + let err = access_unit_from_h264_avc(&[0, 0, 0, 3, 0x65], 4, 10, 640, 480).unwrap_err(); + + assert_eq!(err, CaptureError::InvalidEncodedData("truncated AVC NAL unit")); + } + #[test] fn parser_flushes_final_access_unit() { let mut parser = @@ -314,4 +527,21 @@ mod tests { assert_eq!(au.timestamp_us, 33_433); assert_eq!(au.payload.as_ref(), &[0, 0, 1, 0x09, 0x10, 0, 0, 1, 0x41, 3]); } + + #[test] + fn avc_parser_splits_at_next_access_unit_delimiter() { + let mut parser = AvcAccessUnitParser::new(4, 100, 33_333, 640, 480).unwrap(); + let stream = [ + 0, 0, 0, 2, 0x09, 0x10, 0, 0, 0, 3, 0x65, 1, 2, 0, 0, 0, 2, 0x09, 0x10, 0, 0, 0, 2, + 0x41, 3, + ]; + + let au = parser.push(&stream).unwrap().unwrap(); + assert_eq!(au.timestamp_us, 100); + assert_eq!(au.payload.as_ref(), &[0, 0, 0, 1, 0x09, 0x10, 0, 0, 0, 1, 0x65, 1, 2]); + + let au = parser.flush().unwrap().unwrap(); + assert_eq!(au.timestamp_us, 33_433); + assert_eq!(au.payload.as_ref(), &[0, 0, 
0, 1, 0x09, 0x10, 0, 0, 0, 1, 0x41, 3]); + } } diff --git a/livekit-capture/src/sources/gstreamer.rs b/livekit-capture/src/sources/gstreamer.rs index 5678bd20c..d5b07cbd8 100644 --- a/livekit-capture/src/sources/gstreamer.rs +++ b/livekit-capture/src/sources/gstreamer.rs @@ -22,8 +22,10 @@ use ::gstreamer_app as gst_app; use crate::{ encoded::{ - h26x::access_unit_from_annex_b, ingress::EncodedAccessUnitSource, CodecSpecific, - EncodedFrameType, EncodedVideoCodec, H264PacketizationMode, OwnedEncodedAccessUnit, + h26x::{access_unit_from_annex_b, access_unit_from_h264_avc}, + ingress::EncodedAccessUnitSource, + CodecSpecific, EncodedFrameType, EncodedVideoCodec, H264PacketizationMode, + OwnedEncodedAccessUnit, }, error::CaptureError, }; @@ -34,6 +36,11 @@ use crate::{ pub enum GStreamerSampleFormat { /// H.264 Annex-B access units, usually from `h264parse` with byte-stream caps. H264AnnexB, + /// H.264 access units with AVC length-prefixed NAL units. + H264Avc { + /// Length-prefix size in bytes. + nal_length_size: u8, + }, /// H.265 Annex-B access units, usually from `h265parse` with byte-stream caps. H265AnnexB, /// One already-delimited encoded access unit per appsink sample. @@ -48,6 +55,7 @@ impl GStreamerSampleFormat { pub fn codec(self) -> EncodedVideoCodec { match self { Self::H264AnnexB => EncodedVideoCodec::H264, + Self::H264Avc { .. 
} => EncodedVideoCodec::H264, Self::H265AnnexB => EncodedVideoCodec::H265, Self::AccessUnit { codec } => codec, } @@ -239,6 +247,9 @@ fn access_unit_from_sample_payload( width, height, ), + GStreamerSampleFormat::H264Avc { nal_length_size } => { + access_unit_from_h264_avc(payload, nal_length_size, timestamp_us, width, height) + } GStreamerSampleFormat::H265AnnexB => access_unit_from_annex_b( EncodedVideoCodec::H265, Bytes::copy_from_slice(payload), @@ -308,6 +319,23 @@ mod tests { assert_eq!(access_unit.timestamp_us, 1_000); } + #[test] + fn sample_payload_h264_avc_converts_to_annex_b_and_detects_keyframe() { + let access_unit = access_unit_from_sample_payload( + GStreamerSampleFormat::H264Avc { nal_length_size: 4 }, + &[0, 0, 0, 3, 0x65, 1, 2], + 1_000, + EncodedFrameType::Delta, + 640, + 480, + ) + .unwrap(); + + assert_eq!(access_unit.codec, EncodedVideoCodec::H264); + assert_eq!(access_unit.frame_type, EncodedFrameType::Key); + assert_eq!(access_unit.payload.as_ref(), &[0, 0, 0, 1, 0x65, 1, 2]); + } + #[test] fn sample_payload_access_unit_uses_buffer_delta_flag() { let access_unit = access_unit_from_sample_payload( diff --git a/livekit-capture/src/sources/tcp.rs b/livekit-capture/src/sources/tcp.rs index 962f73658..d2c96f544 100644 --- a/livekit-capture/src/sources/tcp.rs +++ b/livekit-capture/src/sources/tcp.rs @@ -21,7 +21,7 @@ use thiserror::Error; use crate::{ encoded::{ - h26x::AnnexBAccessUnitParser, + h26x::{AnnexBAccessUnitParser, AvcAccessUnitParser}, ingress::EncodedAccessUnitSource, rtp::{RtpAccessUnitAssembler, RtpDepacketizerError}, EncodedVideoCodec, EncodedWireFormat, OwnedEncodedAccessUnit, @@ -90,6 +90,7 @@ pub type TcpEncodedSource = ByteStreamEncodedSource; #[derive(Debug)] enum ByteStreamParser { H26x(AnnexBAccessUnitParser), + H264Avc(AvcAccessUnitParser), Rtp(RtpAccessUnitAssembler), } @@ -110,6 +111,16 @@ where ) .map_err(TcpSourceError::Capture)?, ), + EncodedWireFormat::H264Avc { nal_length_size } => ByteStreamParser::H264Avc( + 
AvcAccessUnitParser::new( + nal_length_size, + config.start_timestamp_us, + config.frame_interval_us, + config.width, + config.height, + ) + .map_err(TcpSourceError::Capture)?, + ), EncodedWireFormat::H265AnnexB => ByteStreamParser::H26x( AnnexBAccessUnitParser::new( EncodedVideoCodec::H265, @@ -190,6 +201,33 @@ where } } + fn next_avc( + reader: &mut R, + read_chunk: &mut [u8], + parser: &mut AvcAccessUnitParser, + eof: &mut bool, + ) -> Result, TcpSourceError> { + loop { + if let Some(access_unit) = parser.push(&[]).map_err(TcpSourceError::Capture)? { + return Ok(Some(access_unit)); + } + if *eof { + return parser.flush().map_err(TcpSourceError::Capture); + } + + let read = reader.read(read_chunk).map_err(TcpSourceError::Io)?; + if read == 0 { + *eof = true; + continue; + } + if let Some(access_unit) = + parser.push(&read_chunk[..read]).map_err(TcpSourceError::Capture)? + { + return Ok(Some(access_unit)); + } + } + } + fn next_rtp( reader: &mut R, assembler: &mut RtpAccessUnitAssembler, @@ -302,6 +340,9 @@ where ByteStreamParser::H26x(parser) => { Self::next_annex_b(&mut self.reader, &mut self.read_chunk, parser, &mut self.eof) } + ByteStreamParser::H264Avc(parser) => { + Self::next_avc(&mut self.reader, &mut self.read_chunk, parser, &mut self.eof) + } ByteStreamParser::Rtp(assembler) => { Self::next_rtp(&mut self.reader, assembler, &mut self.eof) } @@ -374,6 +415,23 @@ mod tests { ByteStreamSourceConfig::new(EncodedWireFormat::H264AnnexB, 0, 33_333, 640, 480) } + fn avc_stream() -> Vec { + vec![ + 0, 0, 0, 2, 0x09, 0x10, 0, 0, 0, 3, 0x65, 1, 2, 0, 0, 0, 2, 0x09, 0x10, 0, 0, 0, 2, + 0x41, 3, + ] + } + + fn avc_config() -> ByteStreamSourceConfig { + ByteStreamSourceConfig::new( + EncodedWireFormat::H264Avc { nal_length_size: 4 }, + 0, + 33_333, + 640, + 480, + ) + } + #[test] fn reads_annex_b_access_units() { let stream = annex_b_stream(); @@ -387,6 +445,19 @@ mod tests { assert!(source.next_access_unit().unwrap().is_none()); } + #[test] + fn 
reads_h264_avc_access_units_as_annex_b() { + let stream = avc_stream(); + let config = avc_config(); + let mut source = ByteStreamEncodedSource::new(Cursor::new(stream), config).unwrap(); + + let first = source.next_access_unit().unwrap().unwrap(); + assert_eq!(first.payload.as_ref(), &[0, 0, 0, 1, 0x09, 0x10, 0, 0, 0, 1, 0x65, 1, 2]); + let second = source.next_access_unit().unwrap().unwrap(); + assert_eq!(second.payload.as_ref(), &[0, 0, 0, 1, 0x09, 0x10, 0, 0, 0, 1, 0x41, 3]); + assert!(source.next_access_unit().unwrap().is_none()); + } + #[test] fn tcp_connect_reads_annex_b_access_units() { let listener = StdTcpListener::bind("127.0.0.1:0").unwrap(); From 9d6dcab09e888f4bf7b3e53bcc61080b1a599d28 Mon Sep 17 00:00:00 2001 From: David Chen Date: Tue, 30 Jun 2026 14:22:27 -0700 Subject: [PATCH 18/24] fix: add numFilled to argus session --- livekit-capture/src/sources/lk_argus.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/livekit-capture/src/sources/lk_argus.cpp b/livekit-capture/src/sources/lk_argus.cpp index 18b521550..abf740d84 100644 --- a/livekit-capture/src/sources/lk_argus.cpp +++ b/livekit-capture/src/sources/lk_argus.cpp @@ -481,6 +481,7 @@ void* lk_argus_create_session(int sensor_index, int width, int height, int fps) delete s; return nullptr; } + surface->numFilled = 1; s->dmabuf_fds[i] = surface->surfaceList[0].bufferDesc; s->dmabuf_surfaces[i] = surface; } From 6e792d7906a0b45c76e50e98d6f856569157b5f1 Mon Sep 17 00:00:00 2001 From: David Chen Date: Tue, 30 Jun 2026 16:13:42 -0700 Subject: [PATCH 19/24] prefer zero copy on platforms that support it, warn when using w/ --burn-timestamp --- examples/local_video/README.md | 6 +- examples/local_video/src/publisher.rs | 249 ++++++++++++++++++-------- 2 files changed, 178 insertions(+), 77 deletions(-) diff --git a/examples/local_video/README.md b/examples/local_video/README.md index 10137e66b..0d367eb52 100644 --- a/examples/local_video/README.md +++ b/examples/local_video/README.md @@ -69,6 +69,7 @@ 
Publisher usage: cargo run -p local_video -F desktop --bin publisher -- \ --source argus \ --camera-index 0 \ + --zero-copy \ --codec h265 \ --room-name demo \ --identity jetson-cam-1 @@ -139,6 +140,7 @@ Publisher flags (in addition to the common connection flags above): - `--camera-index `: Camera index to use (default: `0`). Use `--list-cameras` to see available indices. - `--source `: Camera backend to use (default: `uvc`). `argus` uses NVIDIA libargus for MIPI CSI cameras and is available only on Linux aarch64 Jetson builds. - `--format `: UVC camera capture format (default: `auto`). `auto` tries uncompressed YUYV first and falls back to MJPEG; `mjpeg` can reduce USB bandwidth when running multiple cameras. +- `--zero-copy`: Use a platform zero-copy capture/encode path when available, such as AVFoundation IOSurface-backed CVPixelBuffers on macOS or Argus DMA-BUF frames on Jetson. If the selected source does not support zero-copy, the publisher logs a warning and uses CPU I420 capture. - `--test-pattern [0|1]`: Generate a test pattern instead of capturing from a camera. `0` is a static SMPTE 75% color-bar pattern and `1` is an animated encoder exercise graphic. Omitting the value defaults to `0`. `--camera-index` is ignored when this is set; `--width`, `--height`, and `--fps` still control the output resolution and frame rate. - `--width `: Desired capture width (default: `1280`). - `--height `: Desired capture height (default: `720`). @@ -147,7 +149,7 @@ Publisher flags (in addition to the common connection flags above): - `--simulcast`: Publish simulcast video (multiple layers when the resolution is large enough). - `--max-bitrate `: Max video bitrate for the main (highest) layer in bits per second (e.g. `1500000`). - `--attach-timestamp`: Attach the current wall-clock time (microseconds since UNIX epoch) as the user timestamp on each published frame. The subscriber can display this to measure end-to-end latency. 
-- `--burn-timestamp`: Burn the attached timestamp into the video frame as a visible overlay. Has no effect unless `--attach-timestamp` is also set. With `--source argus`, this maps the NV12 DMA-BUF and copies it to CPU I420 before publishing. +- `--burn-timestamp`: Burn the attached timestamp into the video frame as a visible overlay. Has no effect unless `--attach-timestamp` is also set. With `--zero-copy`, frames stay out of CPU memory, so the publisher logs a warning and skips the visible burn while still attaching timestamp metadata. - `--attach-frame-id`: Attach a monotonically increasing frame ID to each published frame via the packet trailer. The subscriber displays this in the timestamp overlay when `--display-timestamp` is used. - `--display-video`: Open a window that displays the video frames being published. - `--display-timing`: Burn publisher timing metrics into the local preview window. Requires `--display-video`. @@ -194,6 +196,6 @@ Notes: - If the active video track is unsubscribed or unpublished, the app clears its state and will automatically attach to the next matching video track when it appears. - For E2EE to work, both publisher and subscriber must specify the same `--e2ee-key` value. If the keys don't match, the subscriber will not be able to decode the video. - The timestamp overlay updates at ~2 Hz so the latency value is readable rather than flickering every frame. -- On Jetson, `--source argus` requires the Jetson Multimedia API headers under `/usr/src/jetson_multimedia_api`. By default it publishes NV12 DMA buffers through the Jetson hardware encoder. `--attach-timestamp --burn-timestamp` intentionally switches that source to a CPU I420 copy so the timestamp can be drawn into the frame. +- On Jetson, `--source argus` requires the Jetson Multimedia API headers under `/usr/src/jetson_multimedia_api`. Use `--zero-copy` to publish NV12 DMA-BUF frames through the Jetson hardware encoder. 
Without `--zero-copy`, Argus frames are copied to CPU I420 before publish so `--attach-timestamp --burn-timestamp` can draw the timestamp into the frame. - Jetson AV1 hardware encoding requires an Orin-class device (e.g. Orin NX or AGX Orin on JetPack 5+); the encoder is probed at startup and on devices without AV1 support (e.g. Xavier) `--codec av1` automatically falls back to the software libaom encoder. The Jetson AV1 encoder produces a single L1T1 stream (no SVC). - On Linux, preview windows use the Vulkan `wgpu` backend by default to avoid GLES/EGL conflicts on Jetson desktops. Set `WGPU_BACKEND=gl` or another supported `wgpu` backend to override this. diff --git a/examples/local_video/src/publisher.rs b/examples/local_video/src/publisher.rs index e5ee53ea2..3d9880e1d 100644 --- a/examples/local_video/src/publisher.rs +++ b/examples/local_video/src/publisher.rs @@ -189,6 +189,10 @@ struct Args { #[arg(long, value_enum, default_value_t = CaptureFormat::Auto)] format: CaptureFormat, + /// Use zero-copy platform camera buffers when available. 
+ #[arg(long, default_value_t = false)] + zero_copy: bool, + /// Generate a numeric test pattern instead of using a camera: 0 = static bars, 1 = animated #[arg( long, @@ -446,12 +450,12 @@ fn log_publisher_outbound_health(stats: &[livekit::webrtc::stats::RtcStats]) { } } -fn maybe_request_native_capture_fallback( +fn maybe_request_zero_copy_fallback( outbound: &livekit::webrtc::stats::OutboundRtpStats, first_starved_at: &mut Option, - native_capture_fallback: &AtomicBool, + zero_copy_fallback: &AtomicBool, ) { - if native_capture_fallback.load(Ordering::Acquire) { + if zero_copy_fallback.load(Ordering::Acquire) { return; } if outbound.outbound.frames_encoded > 0 || outbound.outbound.key_frames_encoded > 0 { @@ -470,21 +474,21 @@ fn maybe_request_native_capture_fallback( return; } - native_capture_fallback.store(true, Ordering::Release); + zero_copy_fallback.store(true, Ordering::Release); log::warn!( - "Native AVFoundation CVPixelBuffer publish produced no encoded frames; falling back to CPU I420 capture" + "Zero-copy AVFoundation CVPixelBuffer publish produced no encoded frames; falling back to CPU I420 capture" ); } async fn update_publisher_video_stats( track: LocalVideoTrack, ctrl_c_received: Arc, - native_capture_fallback: Option>, + zero_copy_fallback: Option>, ) { let mut last_log = Instant::now().checked_sub(Duration::from_secs(2)).unwrap_or_else(Instant::now); let mut last_encoder_implementation = String::new(); - let mut native_capture_starved_at = None; + let mut zero_copy_starved_at = None; let mut interval = tokio::time::interval(Duration::from_secs(1)); interval.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip); @@ -501,13 +505,9 @@ async fn update_publisher_video_stats( } } if let (Some(outbound), Some(fallback)) = - (find_video_outbound_stats(&stats), native_capture_fallback.as_ref()) + (find_video_outbound_stats(&stats), zero_copy_fallback.as_ref()) { - maybe_request_native_capture_fallback( - &outbound, - &mut 
native_capture_starved_at, - fallback, - ); + maybe_request_zero_copy_fallback(&outbound, &mut zero_copy_starved_at, fallback); } if last_log.elapsed() >= Duration::from_secs(2) { log_publisher_outbound_health(&stats); @@ -804,6 +804,20 @@ mod tests { assert_eq!(args.test_pattern, None); } + #[test] + fn zero_copy_is_disabled_by_default() { + let args = Args::try_parse_from(["publisher"]).expect("default args should parse"); + + assert!(!args.zero_copy); + } + + #[test] + fn zero_copy_flag_enables_zero_copy() { + let args = Args::try_parse_from(["publisher", "--zero-copy"]).expect("args should parse"); + + assert!(args.zero_copy); + } + #[test] fn test_pattern_without_value_defaults_to_static_bars() { let args = @@ -894,7 +908,11 @@ enum PlatformCamera { V4l(V4lCaptureSession), } -fn publisher_capture_path_label(video_input: &VideoInput, burn_timestamp: bool) -> String { +fn publisher_capture_path_label( + video_input: &VideoInput, + burn_timestamp: bool, + zero_copy: bool, +) -> String { match video_input { VideoInput::TestPattern(_) => "test-pattern CPU I420".to_string(), VideoInput::Camera(camera) => match camera { @@ -902,22 +920,36 @@ fn publisher_capture_path_label(video_input: &VideoInput, burn_timestamp: bool) PlatformCamera::AvFoundation(session) => { let source_format = session.format().frame_format; let core_video_format = core_video_fourcc(session.core_video_pixel_format()); - if burn_timestamp { - format!( - "AVFoundation CPU I420 fallback from {source_format}/{core_video_format} (timestamp burn)" - ) - } else { + if zero_copy { match session.capture_path() { + LkCapturePath::Native if burn_timestamp => { + format!( + "AVFoundation zero-copy IOSurface CVPixelBuffer {core_video_format} from {source_format} (timestamp burn disabled)" + ) + } LkCapturePath::Native => { format!( - "AVFoundation native IOSurface CVPixelBuffer {core_video_format} from {source_format}" + "AVFoundation zero-copy IOSurface CVPixelBuffer {core_video_format} from {source_format}" 
+ ) + } + path => { + let suffix = if burn_timestamp { + "zero-copy unsupported, timestamp burn" + } else { + "zero-copy unsupported" + }; + format!( + "AVFoundation {} fallback from {source_format}/{core_video_format} ({suffix})", + capture_path_name(path), ) } - path => format!( - "AVFoundation {} fallback from {source_format}/{core_video_format}", - capture_path_name(path), - ), } + } else if burn_timestamp { + format!( + "AVFoundation CPU I420 from {source_format}/{core_video_format} (timestamp burn)" + ) + } else { + format!("AVFoundation CPU I420 from {source_format}/{core_video_format}") } } #[cfg(target_os = "linux")] @@ -928,20 +960,38 @@ fn publisher_capture_path_label(video_input: &VideoInput, burn_timestamp: bool) } else { "" }; - format!( - "V4L2 {} from {}{}", - capture_path_name(session.capture_path()), - format.frame_format, - decode_suffix - ) + if zero_copy { + let suffix = if burn_timestamp { + "zero-copy unsupported, timestamp burn" + } else { + "zero-copy unsupported" + }; + format!( + "V4L2 {} fallback from {}{} ({suffix})", + capture_path_name(session.capture_path()), + format.frame_format, + decode_suffix + ) + } else { + format!( + "V4L2 {} from {}{}", + capture_path_name(session.capture_path()), + format.frame_format, + decode_suffix + ) + } } }, #[cfg(all(target_os = "linux", target_arch = "aarch64"))] VideoInput::Argus(_) => { - if burn_timestamp { + if zero_copy && burn_timestamp { + "libargus NV12 DMA-BUF zero-copy (timestamp burn disabled)".to_string() + } else if zero_copy { + "libargus NV12 DMA-BUF zero-copy".to_string() + } else if burn_timestamp { "libargus CPU I420 from NV12 DMA-BUF (timestamp burn)".to_string() } else { - "libargus NV12 DMA-BUF".to_string() + "libargus CPU I420 from NV12 DMA-BUF".to_string() } } } @@ -957,8 +1007,34 @@ fn core_video_fourcc(pixel_format: u32) -> String { } } -fn publisher_uses_native_camera_capture(video_input: &VideoInput, burn_timestamp: bool) -> bool { - if burn_timestamp { +fn 
publisher_zero_copy_unsupported_reason(video_input: &VideoInput) -> Option<&'static str> { + match video_input { + VideoInput::TestPattern(_) => Some("test pattern frames are generated in CPU I420 memory"), + VideoInput::Camera(camera) => match camera { + #[cfg(target_os = "macos")] + PlatformCamera::AvFoundation(session) => { + if session.capture_path() == LkCapturePath::Native { + None + } else { + Some("the selected AVFoundation format is not IOSurface-backed NV12") + } + } + #[cfg(target_os = "linux")] + PlatformCamera::V4l(_) => { + Some("V4L2 UVC capture does not expose a zero-copy capture/encode path here") + } + }, + #[cfg(all(target_os = "linux", target_arch = "aarch64"))] + VideoInput::Argus(_) => None, + } +} + +fn publisher_zero_copy_supported(video_input: &VideoInput) -> bool { + publisher_zero_copy_unsupported_reason(video_input).is_none() +} + +fn publisher_uses_zero_copy_camera_capture(video_input: &VideoInput, zero_copy: bool) -> bool { + if !zero_copy { return false; } @@ -1028,6 +1104,7 @@ impl PlatformCamera { #[derive(Clone, Copy)] struct CaptureConfig { fps: u32, + zero_copy: bool, attach_timestamp: bool, burn_timestamp: bool, attach_frame_id: bool, @@ -1421,19 +1498,32 @@ async fn run(args: Args, ctrl_c_received: Arc) -> Result<()> { info!("Published camera track"); requested_codec }; - let burn_timestamp_enabled = args.attach_timestamp && args.burn_timestamp; + let burn_timestamp_requested = args.attach_timestamp && args.burn_timestamp; + let zero_copy_supported = publisher_zero_copy_supported(&video_input); + let zero_copy_active = args.zero_copy && zero_copy_supported; + if args.zero_copy { + if let Some(reason) = publisher_zero_copy_unsupported_reason(&video_input) { + log::warn!("--zero-copy requested, but {reason}; using CPU I420 capture"); + } + } + if zero_copy_active && burn_timestamp_requested { + log::warn!( + "--zero-copy keeps frames out of CPU memory; --burn-timestamp will not draw an overlay" + ); + } info!( "Publisher media 
path: capture={}, encode=requested codec {} via {}", - publisher_capture_path_label(&video_input, burn_timestamp_enabled), + publisher_capture_path_label(&video_input, burn_timestamp_requested, args.zero_copy), actual_codec.as_str(), video_encoder_backend_name(requested_encoder), ); - let native_capture_fallback = - publisher_uses_native_camera_capture(&video_input, burn_timestamp_enabled) + let zero_copy_fallback = + publisher_uses_zero_copy_camera_capture(&video_input, zero_copy_active) .then(|| Arc::new(AtomicBool::new(false))); let capture_config = CaptureConfig { fps: args.fps, + zero_copy: zero_copy_active, attach_timestamp: args.attach_timestamp, burn_timestamp: args.burn_timestamp, attach_frame_id: args.attach_frame_id, @@ -1448,7 +1538,7 @@ async fn run(args: Args, ctrl_c_received: Arc) -> Result<()> { let publish_stats_task = tokio::spawn(update_publisher_video_stats( track.clone(), ctrl_c_received.clone(), - native_capture_fallback.clone(), + zero_copy_fallback.clone(), )); match video_input { @@ -1492,7 +1582,7 @@ async fn run(args: Args, ctrl_c_received: Arc) -> Result<()> { Some(shared.clone()), publish_timing_state.clone(), user_data_channels.clone(), - native_capture_fallback.clone(), + zero_copy_fallback.clone(), )); let display_result = video_display::run_display( @@ -1520,7 +1610,7 @@ async fn run(args: Args, ctrl_c_received: Arc) -> Result<()> { None, publish_timing_state.clone(), user_data_channels.clone(), - native_capture_fallback.clone(), + zero_copy_fallback.clone(), ) .await; let _ = publish_stats_task.await; @@ -1543,7 +1633,7 @@ async fn run_capture_loop( display_shared: Option>>, publish_timing_state: Option>>, user_data_channels: Option>>, - native_capture_fallback: Option>, + zero_copy_fallback: Option>, ) -> Result<()> { let pace_fps = config.fps as f64; #[cfg(target_os = "macos")] @@ -1569,7 +1659,7 @@ async fn run_capture_loop( let mut fps_window_start = Instant::now(); let mut fps_smoothed: f32 = 0.0; let target = 
Duration::from_secs_f64(1.0 / pace_fps); - let burn_timestamp_enabled = config.attach_timestamp && config.burn_timestamp; + let burn_timestamp_requested = config.attach_timestamp && config.burn_timestamp; info!("Target frame interval: {:.2} ms", target.as_secs_f64() * 1000.0); if camera_driven_pacing { info!("Capture pacing: camera frame-arrival driven"); @@ -1582,11 +1672,11 @@ async fn run_capture_loop( let mut frame_counter: u32 = 1; let mut test_pattern_frame_index: u64 = 0; let mut timestamp_overlay = - burn_timestamp_enabled.then(|| TimestampOverlay::new(width, height)); + burn_timestamp_requested.then(|| TimestampOverlay::new(width, height)); let align_buffers_for_display = display_shared.is_some(); let mut logged_camera_timestamp_source = false; let mut logged_camera_timestamp_fallback = false; - let mut logged_native_capture_fallback = false; + let mut logged_zero_copy_fallback = false; loop { if ctrl_c_received.load(Ordering::Acquire) { @@ -1649,16 +1739,16 @@ async fn run_capture_loop( ) } VideoInput::Camera(camera) => { - let force_i420_after_native_failure = native_capture_fallback + let force_i420_after_zero_copy_failure = zero_copy_fallback .as_ref() .is_some_and(|fallback| fallback.load(Ordering::Acquire)); - if force_i420_after_native_failure && !logged_native_capture_fallback { + if force_i420_after_zero_copy_failure && !logged_zero_copy_fallback { log::warn!( - "Publisher media path changed: capture=AVFoundation CPU I420 fallback after native encode starvation" + "Publisher media path changed: capture=AVFoundation CPU I420 fallback after zero-copy encode starvation" ); - logged_native_capture_fallback = true; + logged_zero_copy_fallback = true; } - let prefer_native = !burn_timestamp_enabled && !force_i420_after_native_failure; + let prefer_native = config.zero_copy && !force_i420_after_zero_copy_failure; let mut captured = camera.capture_frame(prefer_native)?; let camera_frame_acquired_at = Instant::now(); match &mut captured.buffer { @@ 
-1725,23 +1815,31 @@ async fn run_capture_loop( let mut buffer_ready_at = convert_finished_at; let mut frame_draw_ms = None; let mut burned_timestamp_us = None; - if let Some(overlay) = timestamp_overlay.as_mut() { - let overlay_started_at = Instant::now(); - match &mut captured_frame { - CapturedFrameBuffer::I420(frame) => { - let (stride_y, _, _) = frame.buffer.strides(); - let (data_y, _, _) = frame.buffer.data_mut(); - overlay.draw(data_y, stride_y as usize, capture_wall_time_us, fid); - } - #[cfg(target_os = "macos")] - CapturedFrameBuffer::Native(_) => { - anyhow::bail!("timestamp burning requires an I420 capture frame"); + let frame_uses_zero_copy = match &captured_frame { + #[cfg(target_os = "macos")] + CapturedFrameBuffer::Native(_) => true, + _ => false, + }; + if !frame_uses_zero_copy { + if let Some(overlay) = timestamp_overlay.as_mut() { + let overlay_started_at = Instant::now(); + match &mut captured_frame { + CapturedFrameBuffer::I420(frame) => { + let (stride_y, _, _) = frame.buffer.strides(); + let (data_y, _, _) = frame.buffer.data_mut(); + overlay.draw(data_y, stride_y as usize, capture_wall_time_us, fid); + } + #[cfg(target_os = "macos")] + CapturedFrameBuffer::Native(_) => { + unreachable!("native frame was classified as zero-copy"); + } } + burned_timestamp_us = Some(capture_wall_time_us); + let overlay_finished_at = Instant::now(); + frame_draw_ms = + Some((overlay_finished_at - overlay_started_at).as_secs_f64() * 1000.0); + buffer_ready_at = overlay_finished_at; } - burned_timestamp_us = Some(capture_wall_time_us); - let overlay_finished_at = Instant::now(); - frame_draw_ms = Some((overlay_finished_at - overlay_started_at).as_secs_f64() * 1000.0); - buffer_ready_at = overlay_finished_at; } // Build frame metadata from enabled packet trailer features and local timing correlation. @@ -1918,9 +2016,9 @@ async fn run_capture_loop( /// Capture loop dedicated to Jetson MIPI capture via libargus. 
/// /// Argus blocks inside `acquireFrame`, pacing capture itself, so this loop runs in a -/// dedicated OS thread. The normal path pushes NV12 DMA-buffer fds straight into -/// [`NativeVideoSource::capture_dmabuf_frame_with_metadata`] for zero-copy hand-off -/// to the Jetson hardware encoder; timestamp burn explicitly copies to CPU I420. +/// dedicated OS thread. With `--zero-copy`, the path pushes NV12 DMA-buffer fds +/// straight into [`NativeVideoSource::capture_dmabuf_frame_with_metadata`] for +/// hand-off to the Jetson hardware encoder; otherwise it copies to CPU I420. #[cfg(all(target_os = "linux", target_arch = "aarch64"))] async fn run_argus_capture_loop( config: CaptureConfig, @@ -1940,9 +2038,10 @@ async fn run_argus_capture_loop( } let mut session = session; - let burn_timestamp_enabled = config.attach_timestamp && config.burn_timestamp; + let burn_timestamp_requested = config.attach_timestamp && config.burn_timestamp; + let burn_timestamp_active = burn_timestamp_requested && !config.zero_copy; let mut timestamp_overlay = - burn_timestamp_enabled.then(|| TimestampOverlay::new(width, height)); + burn_timestamp_active.then(|| TimestampOverlay::new(width, height)); let mut frames: u64 = 0; let mut last_fps_log = Instant::now(); let mut sum_acquire_ms = 0.0; @@ -1961,7 +2060,7 @@ async fn run_argus_capture_loop( let mut backup_timestamp_frames: u64 = 0; let mut sum_sensor_to_acquire_ms = 0.0; let mut sum_sensor_to_argus_acquire_ms = 0.0; - if burn_timestamp_enabled { + if burn_timestamp_active { info!( "Argus timestamp burn enabled: copying NV12 DMA-BUF frames to CPU I420 before publish" ); @@ -1974,10 +2073,10 @@ async fn run_argus_capture_loop( let iter_start = Instant::now(); let acquire_started_at = Instant::now(); - let capture_result = if burn_timestamp_enabled { - session.capture_i420_frame().map(CapturedArgusFrame::I420) - } else { + let capture_result = if config.zero_copy { session.capture_frame().map(CapturedArgusFrame::DmaBuf) + } else { + 
session.capture_i420_frame().map(CapturedArgusFrame::I420) }; let captured_frame = match capture_result { Ok(frame) => { @@ -2135,7 +2234,7 @@ async fn run_argus_capture_loop( } else { 0.0 }; - if burn_timestamp_enabled { + if burn_timestamp_active { info!( "MIPI publishing: {}x{}, ~{:.1} fps | packet trailer timestamp source: sensor {} frames, backup system {} frames | avg ms: sensor_to_argus_acquire {:.2}, argus_wait {:.2}, argus_blit {:.2}, argus_i420_copy {:.2}, timestamp_burn {:.2}, sensor_to_acquire {:.2}, acquire {:.2}, capture {:.2}, iter {:.2}", width, From e451820206ddd49c6fea443bab018f6be29d0e36 Mon Sep 17 00:00:00 2001 From: David Chen Date: Tue, 30 Jun 2026 23:44:17 -0700 Subject: [PATCH 20/24] add vp8 vp9 av1 passthru mode --- examples/preencode_publish/Cargo.toml | 2 +- examples/preencode_publish/src/main.rs | 567 +++++++++++--- libwebrtc/src/native/video_frame.rs | 4 +- livekit-capture/Cargo.toml | 2 +- livekit-capture/README.md | 283 ++++++- livekit-capture/src/encoded/h26x.rs | 6 +- livekit-capture/src/encoded/rtp.rs | 704 +++++++++++++++++- livekit-capture/src/sources/gstreamer.rs | 25 + livekit-capture/src/sources/mod.rs | 2 +- livekit-capture/src/sources/rtsp.rs | 53 +- livekit-capture/src/track.rs | 53 +- webrtc-sys/build.rs | 2 +- webrtc-sys/src/jetson/av1_encoder_impl.cpp | 2 + .../src/jetson/jetson_av1_bitstream.cpp | 37 + webrtc-sys/src/jetson/jetson_av1_bitstream.h | 3 + webrtc-sys/src/passthrough_video_encoder.cpp | 145 +++- 16 files changed, 1752 insertions(+), 138 deletions(-) diff --git a/examples/preencode_publish/Cargo.toml b/examples/preencode_publish/Cargo.toml index 17686c1a6..b6b64c1cc 100644 --- a/examples/preencode_publish/Cargo.toml +++ b/examples/preencode_publish/Cargo.toml @@ -16,6 +16,6 @@ gstreamer = { workspace = true, optional = true } gstreamer-app = { workspace = true, optional = true } livekit = { workspace = true, features = ["rustls-tls-native-roots"] } livekit-api = { workspace = true, features = 
["rustls-tls-native-roots"] } -livekit-capture = { workspace = true, features = ["rtsp", "tcp-source"] } +livekit-capture = { workspace = true, features = ["rtsp", "tcpsink"] } log = { workspace = true } tokio = { workspace = true, features = ["full"] } diff --git a/examples/preencode_publish/src/main.rs b/examples/preencode_publish/src/main.rs index a533f02c9..9638206e2 100644 --- a/examples/preencode_publish/src/main.rs +++ b/examples/preencode_publish/src/main.rs @@ -15,7 +15,11 @@ use gstreamer as gst; use gstreamer::prelude::*; #[cfg(feature = "gstreamer")] use gstreamer_app as gst_app; -use livekit::{prelude::*, webrtc::video_source::VideoResolution}; +use livekit::{ + options::{self, VideoEncoding}, + prelude::*, + webrtc::video_source::VideoResolution, +}; use livekit_api::access_token; #[cfg(feature = "gstreamer")] use livekit_capture::sources::gstreamer::{ @@ -43,16 +47,16 @@ const GSTREAMER_APPSINK_NAME: &str = "lk_appsink"; #[command(author, version, about, long_about = None)] struct Args { /// Encoded stream source. - #[arg(long, value_enum, default_value_t = SourceKind::Tcp)] + #[arg(long, value_enum, default_value_t = SourceKind::Tcpsink)] source: SourceKind, - /// Encoded video codec. Required with --source tcp; optional validation with --source rtsp. - /// Optional with --source gstappsink; omitted custom GStreamer pipelines infer H.264/H.265 - /// from their unlinked encoded output when possible. + /// Encoded video codec. Required with --source tcpsink and --source shmsink; optional + /// validation with --source rtsp. Optional with --source gstappsink; omitted custom + /// GStreamer pipelines infer the codec from their unlinked encoded output when possible. #[arg(long, value_enum)] codec: Option, - /// TCP server address as host:port. Required with --source tcp. + /// TCP server address as host:port. Required with --source tcpsink. 
#[arg(long)] host: Option, @@ -92,6 +96,12 @@ struct Args { #[arg(long, default_value_t = 30)] fps: u32, + /// Maximum publish bitrate in bits per second. Generated GStreamer test + /// sources use the same target bitrate so local smoke tests do not overrun + /// the advertised send cap. + #[arg(long)] + max_bitrate: Option, + /// H.264 TCP byte-stream format. #[arg(long, value_enum, default_value_t = H264FormatArg::AnnexB)] h264_format: H264FormatArg, @@ -100,13 +110,26 @@ struct Args { #[arg(long, default_value_t = 4)] avc_nal_length_size: u8, + /// TCP transport framing. + #[arg(long, value_enum, default_value_t = TcpFormatArg::Auto)] + tcp_format: TcpFormatArg, + + /// RTP timestamp clock rate used with --tcp-format rtp. + #[arg(long, default_value_t = 90_000)] + rtp_clock_rate: u32, + /// Log access-unit timing, keyframe, and H26x NAL diagnostics. #[arg(long)] diagnostics: bool, + /// GStreamer shmsink socket path. Used with --source shmsink. + #[cfg(feature = "gstreamer")] + #[arg(long, default_value = "/tmp/livekit-preencode-test.shm")] + shmsink_socket_path: String, + /// GStreamer launch pipeline used with --source gstappsink. If the pipeline does not include - /// appsink name=lk_appsink, an H.264/H.265 parser and appsink are attached to its unlinked - /// output. + /// appsink name=lk_appsink, codec-specific normalization and an appsink are attached to its + /// unlinked output. 
#[cfg(feature = "gstreamer")] #[arg(last = true, value_name = "PIPELINE")] gstreamer_pipeline: Vec, @@ -114,16 +137,21 @@ struct Args { #[derive(Debug, Clone, Copy, PartialEq, Eq, ValueEnum)] enum SourceKind { - Tcp, + Tcpsink, Rtsp, #[cfg(feature = "gstreamer")] Gstappsink, + #[cfg(feature = "gstreamer")] + Shmsink, } #[derive(Debug, Clone, Copy, PartialEq, Eq, ValueEnum)] enum CodecArg { H264, H265, + Vp8, + Vp9, + Av1, } #[derive(Debug, Clone, Copy, PartialEq, Eq, ValueEnum)] @@ -132,23 +160,68 @@ enum H264FormatArg { Avc, } +#[derive(Debug, Clone, Copy, PartialEq, Eq, ValueEnum)] +enum TcpFormatArg { + Auto, + ByteStream, + Rtp, +} + impl CodecArg { fn encoded_codec(self) -> EncodedVideoCodec { match self { Self::H264 => EncodedVideoCodec::H264, Self::H265 => EncodedVideoCodec::H265, + Self::Vp8 => EncodedVideoCodec::VP8, + Self::Vp9 => EncodedVideoCodec::VP9, + Self::Av1 => EncodedVideoCodec::AV1, } } - fn wire_format(self, h264_format: H264FormatArg, avc_nal_length_size: u8) -> EncodedWireFormat { - match self { - Self::H264 => match h264_format { - H264FormatArg::AnnexB => EncodedWireFormat::H264AnnexB, - H264FormatArg::Avc => { - EncodedWireFormat::H264Avc { nal_length_size: avc_nal_length_size } + fn tcp_wire_format( + self, + tcp_format: TcpFormatArg, + h264_format: H264FormatArg, + avc_nal_length_size: u8, + rtp_clock_rate: u32, + ) -> Result { + match tcp_format.resolve(self) { + ResolvedTcpFormat::ByteStream => match self { + Self::H264 => match h264_format { + H264FormatArg::AnnexB => Ok(EncodedWireFormat::H264AnnexB), + H264FormatArg::Avc => { + Ok(EncodedWireFormat::H264Avc { nal_length_size: avc_nal_length_size }) + } } + Self::H265 => Ok(EncodedWireFormat::H265AnnexB), + Self::Vp8 | Self::Vp9 | Self::Av1 => bail!( + "--tcp-format byte-stream is only supported for H.264/H.265; use --tcp-format rtp for {:?}", + self.encoded_codec() + ), + }, + ResolvedTcpFormat::Rtp => Ok(EncodedWireFormat::Rtp { + codec: self.encoded_codec(), + clock_rate: 
rtp_clock_rate, + }), + } + } +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +enum ResolvedTcpFormat { + ByteStream, + Rtp, +} + +impl TcpFormatArg { + fn resolve(self, codec: CodecArg) -> ResolvedTcpFormat { + match self { + Self::Auto => match codec { + CodecArg::H264 | CodecArg::H265 => ResolvedTcpFormat::ByteStream, + CodecArg::Vp8 | CodecArg::Vp9 | CodecArg::Av1 => ResolvedTcpFormat::Rtp, }, - Self::H265 => EncodedWireFormat::H265AnnexB, + Self::ByteStream => ResolvedTcpFormat::ByteStream, + Self::Rtp => ResolvedTcpFormat::Rtp, } } } @@ -161,12 +234,13 @@ async fn main() -> Result<()> { async fn run(args: Args) -> Result<()> { validate_dimensions(args.width, args.height)?; + validate_max_bitrate(args.max_bitrate)?; validate_h264_format_args(&args)?; #[cfg(feature = "gstreamer")] validate_gstreamer_args(&args)?; match args.source { - SourceKind::Tcp => { + SourceKind::Tcpsink => { let frame_interval_us = frame_interval_us(args.fps)?; run_tcp_source(args, frame_interval_us).await } @@ -176,6 +250,11 @@ async fn run(args: Args) -> Result<()> { let frame_interval_us = frame_interval_us(args.fps)?; run_gstreamer_source(args, frame_interval_us).await } + #[cfg(feature = "gstreamer")] + SourceKind::Shmsink => { + let frame_interval_us = frame_interval_us(args.fps)?; + run_shmsink_source(args, frame_interval_us).await + } } } @@ -191,9 +270,24 @@ fn validate_h264_format_args(args: &Args) -> Result<()> { if !(1..=4).contains(&args.avc_nal_length_size) { bail!("--avc-nal-length-size must be between 1 and 4 bytes"); } + if args.rtp_clock_rate == 0 { + bail!("--rtp-clock-rate must be greater than zero"); + } + if args.source == SourceKind::Tcpsink { + if let Some(codec) = args.codec { + if args.tcp_format.resolve(codec) == ResolvedTcpFormat::ByteStream + && matches!(codec, CodecArg::Vp8 | CodecArg::Vp9 | CodecArg::Av1) + { + bail!("--tcp-format byte-stream is only supported for H.264/H.265"); + } + } + } if args.h264_format == H264FormatArg::Avc { - if args.source != 
SourceKind::Tcp { - bail!("--h264-format avc is only valid with --source tcp"); + if args.source != SourceKind::Tcpsink { + bail!("--h264-format avc is only valid with --source tcpsink"); + } + if args.tcp_format == TcpFormatArg::Rtp { + bail!("--h264-format avc is only valid with TCP byte-stream input"); } if args.codec != Some(CodecArg::H264) { bail!("--h264-format avc requires --codec h264"); @@ -203,10 +297,15 @@ fn validate_h264_format_args(args: &Args) -> Result<()> { } async fn run_tcp_source(args: Args, frame_interval_us: i64) -> Result<()> { - let codec_arg = args.codec.context("--codec is required with --source tcp")?; + let codec_arg = args.codec.context("--codec is required with --source tcpsink")?; let codec = codec_arg.encoded_codec(); - let host = args.host.clone().context("--host is required with --source tcp")?; - let wire_format = codec_arg.wire_format(args.h264_format, args.avc_nal_length_size); + let host = args.host.clone().context("--host is required with --source tcpsink")?; + let wire_format = codec_arg.tcp_wire_format( + args.tcp_format, + args.h264_format, + args.avc_nal_length_size, + args.rtp_clock_rate, + )?; let config = ByteStreamSourceConfig::new( wire_format, current_time_us(), @@ -277,6 +376,7 @@ async fn run_gstreamer_source(args: Args, frame_interval_us: i64) -> Result<()> frame_interval_us, args.codec.map(CodecArg::encoded_codec), &args.gstreamer_pipeline, + args.max_bitrate, )?; let codec = source.codec(); let shutdown_pipeline = source.shutdown_pipeline(); @@ -295,6 +395,43 @@ async fn run_gstreamer_source(args: Args, frame_interval_us: i64) -> Result<()> .await } +#[cfg(feature = "gstreamer")] +async fn run_shmsink_source(args: Args, frame_interval_us: i64) -> Result<()> { + let codec_arg = args.codec.context("--codec is required with --source shmsink")?; + let codec = codec_arg.encoded_codec(); + let socket_path = args.shmsink_socket_path.clone(); + let pipeline_args = 
vec![gstreamer_shmsink_pipeline_description(&socket_path, codec)?]; + let source = GStreamerTestSource::start( + args.width, + args.height, + args.fps, + current_time_us(), + frame_interval_us, + Some(codec), + &pipeline_args, + args.max_bitrate, + )?; + let shutdown_pipeline = source.shutdown_pipeline(); + log::info!( + "Started GStreamer {:?} shmsink reader for {}: {}", + codec, + socket_path, + source.pipeline_description() + ); + + publish_encoded_source( + args, + codec, + "GStreamer shmsink", + source, + move || { + let _ = shutdown_pipeline.set_state(gst::State::Null); + }, + Some(frame_interval_us), + ) + .await +} + #[cfg(feature = "gstreamer")] #[derive(Debug)] struct GStreamerTestSource { @@ -313,12 +450,19 @@ impl GStreamerTestSource { frame_interval_us: i64, requested_codec: Option, pipeline_args: &[String], + max_bitrate: Option, ) -> Result { gst::init().context("failed to initialize GStreamer")?; let generated_codec = requested_codec.unwrap_or(EncodedVideoCodec::H264); - let pipeline_description = - gstreamer_pipeline_description(width, height, fps, generated_codec, pipeline_args); + let pipeline_description = gstreamer_pipeline_description( + width, + height, + fps, + generated_codec, + pipeline_args, + max_bitrate, + ); let element = gst::parse::launch(&pipeline_description).with_context(|| { format!("failed to create GStreamer pipeline: {pipeline_description}") })?; @@ -386,9 +530,10 @@ fn gstreamer_pipeline_description( fps: u32, codec: EncodedVideoCodec, pipeline_args: &[String], + max_bitrate: Option, ) -> String { if pipeline_args.is_empty() { - return gstreamer_test_pipeline_description(width, height, fps, codec); + return gstreamer_test_pipeline_description(width, height, fps, codec, max_bitrate); } pipeline_args.join(" ") @@ -400,43 +545,89 @@ fn gstreamer_test_pipeline_description( height: u32, fps: u32, codec: EncodedVideoCodec, + max_bitrate: Option, ) -> String { - let key_int_max = fps.max(1); - let (encoder, parser, caps) = match 
codec { - EncodedVideoCodec::H264 => ( - format!( - "x264enc tune=zerolatency speed-preset=ultrafast key-int-max={key_int_max} \ - bitrate=2500 byte-stream=true aud=true" - ), - "h264parse config-interval=-1", - "video/x-h264,stream-format=byte-stream,alignment=au", - ), - EncodedVideoCodec::H265 => ( - format!( - "x265enc tune=zerolatency speed-preset=ultrafast key-int-max={key_int_max} \ - bitrate=2500" - ), - "h265parse config-interval=-1", - "video/x-h265,stream-format=byte-stream,alignment=au", - ), - EncodedVideoCodec::VP8 | EncodedVideoCodec::VP9 | EncodedVideoCodec::AV1 => { - unreachable!("GStreamer generated test pipeline only supports H.264/H.265") - } - _ => unreachable!("unknown generated GStreamer codec"), - }; + let bitrate = publish_video_encoding(max_bitrate, width, height, fps, codec).max_bitrate; + let codec_pipeline = gstreamer_test_encode_pipeline(fps, codec, bitrate); format!( - "videotestsrc is-live=true do-timestamp=true pattern=smpte ! \ + "videotestsrc is-live=true do-timestamp=true pattern=ball motion=wavy animation-mode=frames ! \ video/x-raw,width={width},height={height},framerate={fps}/1 ! \ timeoverlay halignment=right valignment=bottom shaded-background=true ! \ videoconvert ! \ - {encoder} ! \ - {parser} ! \ - {caps} ! \ + video/x-raw,format=I420 ! \ + {codec_pipeline} ! \ appsink name={GSTREAMER_APPSINK_NAME} sync=false max-buffers=8 drop=true" ) } +#[cfg(feature = "gstreamer")] +fn gstreamer_test_encode_pipeline(fps: u32, codec: EncodedVideoCodec, bitrate: u64) -> String { + let key_int_max = fps.max(1); + let bitrate_kbps = u64::max(1, bitrate / 1000); + match codec { + EncodedVideoCodec::H264 => format!( + "x264enc tune=zerolatency speed-preset=ultrafast key-int-max={key_int_max} \ + bitrate={bitrate_kbps} byte-stream=true aud=true ! h264parse config-interval=-1 ! 
\ + video/x-h264,stream-format=byte-stream,alignment=au" + ), + EncodedVideoCodec::H265 => format!( + "x265enc tune=zerolatency speed-preset=ultrafast key-int-max={key_int_max} \ + bitrate={bitrate_kbps} ! h265parse config-interval=-1 ! \ + video/x-h265,stream-format=byte-stream,alignment=au" + ), + EncodedVideoCodec::VP8 => format!( + "vp8enc deadline=1 cpu-used=8 keyframe-max-dist={key_int_max} lag-in-frames=0 \ + target-bitrate={bitrate} ! video/x-vp8" + ), + EncodedVideoCodec::VP9 => format!( + "vp9enc deadline=1 cpu-used=8 keyframe-max-dist={key_int_max} lag-in-frames=0 \ + target-bitrate={bitrate} ! video/x-vp9,profile=(string)0" + ), + EncodedVideoCodec::AV1 => format!( + "av1enc cpu-used=8 usage-profile=realtime keyframe-max-dist={key_int_max} \ + lag-in-frames=0 target-bitrate={bitrate_kbps} ! av1parse ! \ + video/x-av1,stream-format=obu-stream,alignment=tu" + ), + _ => unreachable!("unknown generated GStreamer codec"), + } +} + +#[cfg(feature = "gstreamer")] +fn gstreamer_shmsink_pipeline_description( + socket_path: &str, + codec: EncodedVideoCodec, +) -> Result { + let socket_path = gstreamer_launch_string_value(socket_path); + let caps = gstreamer_launch_caps(codec)?; + + Ok(format!( + "shmsrc socket-path={socket_path} is-live=true do-timestamp=true ! 
capsfilter caps={caps}" + )) +} + +#[cfg(feature = "gstreamer")] +fn gstreamer_launch_caps(codec: EncodedVideoCodec) -> Result<&'static str> { + match codec { + EncodedVideoCodec::H264 => Ok("video/x-h264,stream-format=byte-stream,alignment=au"), + EncodedVideoCodec::H265 => Ok("video/x-h265,stream-format=byte-stream,alignment=au"), + EncodedVideoCodec::VP8 => Ok("video/x-vp8"), + EncodedVideoCodec::VP9 => Ok("video/x-vp9,profile=(string)0"), + EncodedVideoCodec::AV1 => Ok("video/x-av1,stream-format=obu-stream,alignment=tu"), + _ => bail!("unsupported GStreamer codec: {:?}", codec), + } +} + +#[cfg(feature = "gstreamer")] +fn gstreamer_launch_string_value(value: &str) -> String { + if value.chars().all(|c| c.is_ascii_alphanumeric() || matches!(c, '/' | '_' | '-' | '.' | ':')) + { + return value.to_string(); + } + + format!("\"{}\"", value.replace('\\', "\\\\").replace('"', "\\\"")) +} + #[cfg(feature = "gstreamer")] fn ensure_encoded_appsink( pipeline: &gst::Pipeline, @@ -457,17 +648,17 @@ fn ensure_encoded_appsink( } sample_format } - None => h26x_sample_format(requested_codec.unwrap_or(EncodedVideoCodec::H264))?, + None => sample_format_for_codec(requested_codec.unwrap_or(EncodedVideoCodec::H264))?, }; return Ok((appsink, sample_format)); } let src_pad = pipeline.find_unlinked_pad(gst::PadDirection::Src).with_context(|| { - format!("GStreamer pipeline must include appsink name={GSTREAMER_APPSINK_NAME} or leave one H.264/H.265 source pad unlinked") + format!("GStreamer pipeline must include appsink name={GSTREAMER_APPSINK_NAME} or leave one encoded video source pad unlinked") })?; let inferred_codec = codec_from_pad_caps(&src_pad).with_context(|| { format!( - "unlinked GStreamer pad '{}' does not advertise video/x-h264 or video/x-h265 caps", + "unlinked GStreamer pad '{}' does not advertise supported encoded video caps", src_pad.name() ) })?; @@ -481,16 +672,13 @@ fn ensure_encoded_appsink( Some(requested_codec) => requested_codec, None => inferred_codec, }; - let 
sample_format = h26x_sample_format(codec)?; + let sample_format = sample_format_for_codec(codec)?; let Some(src_element) = src_pad.parent_element() else { bail!("unlinked GStreamer encoded pad has no parent element"); }; - let parser = gst::ElementFactory::make(h26x_parser_name(codec)?) - .property("config-interval", -1i32) - .build() - .with_context(|| format!("failed to create {}", h26x_parser_name(codec).unwrap()))?; - let codec_caps = h26x_appsink_caps(codec)?; + let parser = parser_element_for_codec(codec)?; + let codec_caps = appsink_caps(codec)?; let capsfilter = gst::ElementFactory::make("capsfilter") .property("caps", codec_caps) .build() @@ -503,66 +691,88 @@ fn ensure_encoded_appsink( .build() .context("failed to create appsink")?; - pipeline - .add(&parser) - .with_context(|| format!("failed to add {} to GStreamer pipeline", parser.name()))?; + if let Some(parser) = &parser { + pipeline + .add(parser) + .with_context(|| format!("failed to add {} to GStreamer pipeline", parser.name()))?; + } pipeline.add(&capsfilter).context("failed to add capsfilter to GStreamer pipeline")?; pipeline.add(&appsink).context("failed to add appsink to GStreamer pipeline")?; - gst::Element::link_many([&parser, &capsfilter, &appsink]) - .with_context(|| format!("failed to link {} to appsink", parser.name()))?; - let sink_pad = parser + if let Some(parser) = &parser { + gst::Element::link_many([parser, &capsfilter, &appsink]) + .with_context(|| format!("failed to link {} to appsink", parser.name()))?; + } else { + gst::Element::link_many([&capsfilter, &appsink]) + .context("failed to link capsfilter to appsink")?; + } + let link_target = parser.as_ref().unwrap_or(&capsfilter); + let sink_pad = link_target .static_pad("sink") - .with_context(|| format!("{} did not expose a sink pad", parser.name()))?; - src_pad - .link(&sink_pad) - .with_context(|| format!("failed to link '{}' to {}", src_element.name(), parser.name()))?; + .with_context(|| format!("{} did not expose a sink 
pad", link_target.name()))?; + src_pad.link(&sink_pad).with_context(|| { + format!("failed to link '{}' to {}", src_element.name(), link_target.name()) + })?; Ok((appsink, sample_format)) } #[cfg(feature = "gstreamer")] -fn h26x_sample_format(codec: EncodedVideoCodec) -> Result { +fn sample_format_for_codec(codec: EncodedVideoCodec) -> Result { match codec { EncodedVideoCodec::H264 => Ok(GStreamerSampleFormat::H264AnnexB), EncodedVideoCodec::H265 => Ok(GStreamerSampleFormat::H265AnnexB), - EncodedVideoCodec::VP8 | EncodedVideoCodec::VP9 | EncodedVideoCodec::AV1 => bail!( - "GStreamer passthrough currently supports H.264/H.265 access units; {:?} needs an explicit access-unit source path", - codec - ), + EncodedVideoCodec::VP8 | EncodedVideoCodec::VP9 | EncodedVideoCodec::AV1 => { + Ok(GStreamerSampleFormat::AccessUnit { codec }) + } _ => bail!("unsupported GStreamer codec: {:?}", codec), } } #[cfg(feature = "gstreamer")] -fn h26x_parser_name(codec: EncodedVideoCodec) -> Result<&'static str> { - match codec { - EncodedVideoCodec::H264 => Ok("h264parse"), - EncodedVideoCodec::H265 => Ok("h265parse"), - EncodedVideoCodec::VP8 | EncodedVideoCodec::VP9 | EncodedVideoCodec::AV1 => { - bail!("no H26x parser for {:?}", codec) - } - _ => bail!("unsupported GStreamer codec: {:?}", codec), +fn parser_element_for_codec(codec: EncodedVideoCodec) -> Result> { + let Some(name) = parser_name(codec)? 
else { + return Ok(None); + }; + let mut builder = gst::ElementFactory::make(name); + if matches!(codec, EncodedVideoCodec::H264 | EncodedVideoCodec::H265) { + builder = builder.property("config-interval", -1i32); } + builder.build().map(Some).with_context(|| format!("failed to create {name}")) } #[cfg(feature = "gstreamer")] -fn h26x_caps_name(codec: EncodedVideoCodec) -> Result<&'static str> { +fn parser_name(codec: EncodedVideoCodec) -> Result> { match codec { - EncodedVideoCodec::H264 => Ok("video/x-h264"), - EncodedVideoCodec::H265 => Ok("video/x-h265"), - EncodedVideoCodec::VP8 | EncodedVideoCodec::VP9 | EncodedVideoCodec::AV1 => { - bail!("GStreamer passthrough currently supports H.264/H.265 Annex-B") - } + EncodedVideoCodec::H264 => Ok(Some("h264parse")), + EncodedVideoCodec::H265 => Ok(Some("h265parse")), + EncodedVideoCodec::VP8 | EncodedVideoCodec::VP9 => Ok(None), + EncodedVideoCodec::AV1 => Ok(Some("av1parse")), _ => bail!("unsupported GStreamer codec: {:?}", codec), } } #[cfg(feature = "gstreamer")] -fn h26x_appsink_caps(codec: EncodedVideoCodec) -> Result { - Ok(gst::Caps::builder(h26x_caps_name(codec)?) 
- .field("stream-format", "byte-stream") - .field("alignment", "au") - .build()) +fn appsink_caps(codec: EncodedVideoCodec) -> Result { + match codec { + EncodedVideoCodec::H264 => Ok(gst::Caps::builder("video/x-h264") + .field("stream-format", "byte-stream") + .field("alignment", "au") + .build()), + EncodedVideoCodec::H265 => Ok(gst::Caps::builder("video/x-h265") + .field("stream-format", "byte-stream") + .field("alignment", "au") + .build()), + EncodedVideoCodec::VP8 => Ok(gst::Caps::builder("video/x-vp8").build()), + EncodedVideoCodec::VP9 => { + Ok(gst::Caps::builder("video/x-vp9").field("profile", "0").build()) + } + EncodedVideoCodec::AV1 => Ok(gst::Caps::builder("video/x-av1") + .field("parsed", true) + .field("stream-format", "obu-stream") + .field("alignment", "tu") + .build()), + _ => bail!("unsupported GStreamer codec: {:?}", codec), + } } #[cfg(feature = "gstreamer")] @@ -608,7 +818,25 @@ fn sample_format_from_caps_structure( } } EncodedVideoCodec::H265 => Ok(Some(GStreamerSampleFormat::H265AnnexB)), - EncodedVideoCodec::VP8 | EncodedVideoCodec::VP9 | EncodedVideoCodec::AV1 => Ok(None), + EncodedVideoCodec::VP8 => Ok(Some(GStreamerSampleFormat::AccessUnit { codec })), + EncodedVideoCodec::VP9 => { + let profile = structure.get::("profile").ok(); + match profile.as_deref() { + Some("0") | None => Ok(Some(GStreamerSampleFormat::AccessUnit { codec })), + Some(profile) => { + bail!("unsupported GStreamer VP9 profile '{profile}'; expected profile 0") + } + } + } + EncodedVideoCodec::AV1 => { + let stream_format = structure.get::("stream-format").ok(); + match stream_format.as_deref() { + Some("obu-stream") | None => Ok(Some(GStreamerSampleFormat::AccessUnit { codec })), + Some(stream_format) => bail!( + "unsupported GStreamer AV1 stream-format '{stream_format}'; expected obu-stream" + ), + } + } _ => Ok(None), } } @@ -641,6 +869,9 @@ fn codec_from_caps_name(name: &str) -> Option { match name { "video/x-h264" => Some(EncodedVideoCodec::H264), "video/x-h265" 
=> Some(EncodedVideoCodec::H265), + "video/x-vp8" => Some(EncodedVideoCodec::VP8), + "video/x-vp9" => Some(EncodedVideoCodec::VP9), + "video/x-av1" => Some(EncodedVideoCodec::AV1), _ => None, } } @@ -681,6 +912,9 @@ where false, ); let mut publish_options = VideoCaptureTrack::encoded_publish_options(codec); + let video_encoding = + publish_video_encoding(args.max_bitrate, args.width, args.height, args.fps, codec); + publish_options.video_encoding = Some(video_encoding.clone()); publish_options.source = TrackSource::Camera; room.local_participant() @@ -688,10 +922,12 @@ where .await .context("failed to publish pre-encoded video track")?; log::info!( - "Published pre-encoded {:?} track at {}x{}; forwarding {} access units", + "Published pre-encoded {:?} track at {}x{} (max_bitrate={}bps max_framerate={:.1}); forwarding {} access units", codec, args.width, args.height, + video_encoding.max_bitrate, + video_encoding.max_framerate, source_label ); @@ -903,7 +1139,9 @@ impl AccessUnitDiagnostics { } if is_keyframe { - if nal_summary.missing_recovery_parameter_set() { + if matches!(access_unit.codec, EncodedVideoCodec::H264 | EncodedVideoCodec::H265) + && nal_summary.missing_recovery_parameter_set() + { self.report_missing_parameter_keyframes += 1; log::warn!( "{} keyframe {} missing recovery parameter sets: {}", @@ -1144,6 +1382,13 @@ fn validate_dimensions(width: u32, height: u32) -> Result<()> { Ok(()) } +fn validate_max_bitrate(max_bitrate: Option) -> Result<()> { + if max_bitrate == Some(0) { + bail!("--max-bitrate must be greater than zero"); + } + Ok(()) +} + fn frame_interval_us(fps: u32) -> Result { if fps == 0 { bail!("--fps must be greater than zero"); @@ -1151,6 +1396,21 @@ fn frame_interval_us(fps: u32) -> Result { Ok(1_000_000_i64 / i64::from(fps)) } +fn publish_video_encoding( + max_bitrate: Option, + width: u32, + height: u32, + fps: u32, + codec: EncodedVideoCodec, +) -> VideoEncoding { + let mut encoding = options::compute_appropriate_encoding(false, 
width, height, codec.into()); + if let Some(max_bitrate) = max_bitrate { + encoding.max_bitrate = max_bitrate; + } + encoding.max_framerate = f64::from(fps); + encoding +} + fn current_time_us() -> i64 { let Ok(duration) = SystemTime::now().duration_since(UNIX_EPOCH) else { return 0; @@ -1162,13 +1422,19 @@ fn current_time_us() -> i64 { mod tests { use super::*; + fn init_gstreamer_for_test() { + gst::init().expect("failed to initialize GStreamer"); + } + #[test] fn gstreamer_pipeline_description_routes_test_source_to_h264_appsink() { let description = - gstreamer_test_pipeline_description(320, 180, 30, EncodedVideoCodec::H264); + gstreamer_test_pipeline_description(320, 180, 30, EncodedVideoCodec::H264, None); assert!(description.contains("videotestsrc is-live=true do-timestamp=true")); + assert!(description.contains("pattern=ball motion=wavy animation-mode=frames")); assert!(description.contains("timeoverlay")); + assert!(description.contains("video/x-raw,format=I420")); assert!(description.contains("x264enc")); assert!(description.contains("video/x-h264,stream-format=byte-stream,alignment=au")); assert!(description.contains(&format!("appsink name={GSTREAMER_APPSINK_NAME}"))); @@ -1177,16 +1443,39 @@ mod tests { #[test] fn gstreamer_pipeline_description_routes_test_source_to_h265_appsink() { let description = - gstreamer_test_pipeline_description(320, 180, 30, EncodedVideoCodec::H265); + gstreamer_test_pipeline_description(320, 180, 30, EncodedVideoCodec::H265, None); assert!(description.contains("videotestsrc is-live=true do-timestamp=true")); assert!(description.contains("timeoverlay")); + assert!(description.contains("video/x-raw,format=I420")); assert!(description.contains("x265enc")); assert!(description.contains("h265parse config-interval=-1")); assert!(description.contains("video/x-h265,stream-format=byte-stream,alignment=au")); assert!(description.contains(&format!("appsink name={GSTREAMER_APPSINK_NAME}"))); } + #[test] + fn 
gstreamer_pipeline_description_routes_test_source_to_vp8_vp9_and_av1_appsink() { + let vp8 = gstreamer_test_pipeline_description(320, 180, 30, EncodedVideoCodec::VP8, None); + assert!(vp8.contains("video/x-raw,format=I420")); + assert!(vp8.contains("vp8enc")); + assert!(vp8.contains("video/x-vp8")); + assert!(vp8.contains(&format!("appsink name={GSTREAMER_APPSINK_NAME}"))); + + let vp9 = gstreamer_test_pipeline_description(320, 180, 30, EncodedVideoCodec::VP9, None); + assert!(vp9.contains("video/x-raw,format=I420")); + assert!(vp9.contains("vp9enc")); + assert!(vp9.contains("video/x-vp9,profile=(string)0")); + assert!(vp9.contains(&format!("appsink name={GSTREAMER_APPSINK_NAME}"))); + + let av1 = gstreamer_test_pipeline_description(320, 180, 30, EncodedVideoCodec::AV1, None); + assert!(av1.contains("video/x-raw,format=I420")); + assert!(av1.contains("av1enc")); + assert!(av1.contains("av1parse")); + assert!(av1.contains("video/x-av1,stream-format=obu-stream,alignment=tu")); + assert!(av1.contains(&format!("appsink name={GSTREAMER_APPSINK_NAME}"))); + } + #[test] fn gstreamer_pipeline_description_uses_trailing_pipeline_args() { let pipeline = [ @@ -1197,13 +1486,43 @@ mod tests { ]; assert_eq!( - gstreamer_pipeline_description(320, 180, 30, EncodedVideoCodec::H265, &pipeline), + gstreamer_pipeline_description(320, 180, 30, EncodedVideoCodec::H265, &pipeline, None), "videotestsrc is-live=true ! 
x264enc" ); } + #[test] + fn gstreamer_shmsink_pipeline_description_uses_socket_path_and_codec_caps() { + let h264 = gstreamer_shmsink_pipeline_description( + "/tmp/livekit h264.shm", + EncodedVideoCodec::H264, + ) + .unwrap(); + assert!(h264.contains("shmsrc socket-path=\"/tmp/livekit h264.shm\"")); + assert!(h264.contains("is-live=true do-timestamp=true")); + assert!(h264.contains("capsfilter caps=")); + assert!(h264.contains("video/x-h264,stream-format=byte-stream,alignment=au")); + + let vp8 = + gstreamer_shmsink_pipeline_description("/tmp/livekit-vp8.shm", EncodedVideoCodec::VP8) + .unwrap(); + assert!(vp8.contains("shmsrc socket-path=/tmp/livekit-vp8.shm")); + assert!(vp8.contains("video/x-vp8")); + + let vp9 = + gstreamer_shmsink_pipeline_description("/tmp/livekit-vp9.shm", EncodedVideoCodec::VP9) + .unwrap(); + assert!(vp9.contains("video/x-vp9,profile=(string)0")); + + let av1 = + gstreamer_shmsink_pipeline_description("/tmp/livekit-av1.shm", EncodedVideoCodec::AV1) + .unwrap(); + assert!(av1.contains("video/x-av1,stream-format=obu-stream,alignment=tu")); + } + #[test] fn gstreamer_caps_detect_h264_avc_sample_format() { + init_gstreamer_for_test(); let caps = gst::Caps::builder("video/x-h264") .field("stream-format", "avc") .field("alignment", "au") @@ -1216,6 +1535,48 @@ mod tests { ); } + #[test] + fn gstreamer_caps_detect_vp8_vp9_and_av1_sample_formats() { + init_gstreamer_for_test(); + for (caps_name, codec) in [ + ("video/x-vp8", EncodedVideoCodec::VP8), + ("video/x-vp9", EncodedVideoCodec::VP9), + ("video/x-av1", EncodedVideoCodec::AV1), + ] { + let caps = if codec == EncodedVideoCodec::AV1 { + gst::Caps::builder(caps_name).field("stream-format", "obu-stream").build() + } else { + gst::Caps::builder(caps_name).build() + }; + let structure = caps.iter().next().unwrap(); + + assert_eq!( + sample_format_from_caps_structure(structure).unwrap(), + Some(GStreamerSampleFormat::AccessUnit { codec }) + ); + } + } + + #[test] + fn 
gstreamer_caps_reject_av1_annexb_for_appsink_passthrough() { + init_gstreamer_for_test(); + let caps = gst::Caps::builder("video/x-av1").field("stream-format", "annexb").build(); + let structure = caps.iter().next().unwrap(); + + let err = sample_format_from_caps_structure(structure).unwrap_err(); + assert!(err.to_string().contains("unsupported GStreamer AV1 stream-format")); + } + + #[test] + fn gstreamer_caps_reject_nonzero_vp9_profile_for_appsink_passthrough() { + init_gstreamer_for_test(); + let caps = gst::Caps::builder("video/x-vp9").field("profile", "1").build(); + let structure = caps.iter().next().unwrap(); + + let err = sample_format_from_caps_structure(structure).unwrap_err(); + assert!(err.to_string().contains("unsupported GStreamer VP9 profile")); + } + #[test] fn gstreamer_avc_codec_data_sets_nal_length_size() { assert_eq!(h264_avc_nal_length_size_from_codec_data(&[1, 0, 0, 0, 0xfc]), Some(1)); @@ -1234,6 +1595,7 @@ mod tests { frame_interval_us, Some(EncodedVideoCodec::H264), &[], + None, ) { Ok(source) => source, Err(err) => { @@ -1256,6 +1618,7 @@ mod tests { frame_interval_us, Some(EncodedVideoCodec::H265), &[], + None, ) { Ok(source) => source, Err(err) => { @@ -1295,6 +1658,7 @@ mod tests { frame_interval_us, None, &pipeline, + None, ) { Ok(source) => source, Err(err) => { @@ -1333,6 +1697,7 @@ mod tests { frame_interval_us, None, &pipeline, + None, ) { Ok(source) => source, Err(err) => { diff --git a/libwebrtc/src/native/video_frame.rs b/libwebrtc/src/native/video_frame.rs index aaa5491ff..dc3d5fbb8 100644 --- a/libwebrtc/src/native/video_frame.rs +++ b/libwebrtc/src/native/video_frame.rs @@ -51,7 +51,9 @@ pub fn new_video_frame_buffer( vfb_sys::ffi::VideoFrameBufferType::NV12 => Box::new(vf::NV12Buffer { handle: NV12Buffer { sys_handle: sys_handle.pin_mut().get_nv12() }, }), - _ => unreachable!(), + _ => { + Box::new(vf::I420Buffer { handle: I420Buffer { sys_handle: sys_handle.to_i420() } }) + } } } } diff --git a/livekit-capture/Cargo.toml 
b/livekit-capture/Cargo.toml index 22f7eaff3..edfb2cc7f 100644 --- a/livekit-capture/Cargo.toml +++ b/livekit-capture/Cargo.toml @@ -59,7 +59,7 @@ avfoundation = [ gstreamer = ["dep:gstreamer", "dep:gstreamer-app"] libargus = [] rtsp = [] -tcp-source = [] +tcpsink = [] v4l = ["dep:image", "dep:libc", "dep:v4l", "dep:yuv-sys"] [build-dependencies] diff --git a/livekit-capture/README.md b/livekit-capture/README.md index 5d6b5935f..a881c63f9 100644 --- a/livekit-capture/README.md +++ b/livekit-capture/README.md @@ -4,4 +4,285 @@ Capture helpers for publishing decoded, native platform, DMA-BUF, and pre-encoded video frames with the LiveKit Rust SDK. Optional source features include `avfoundation`, `libargus`, `v4l`, -`tcp-source`, `rtsp`, and `gstreamer`. +`tcpsink`, `rtsp`, and `gstreamer`. + +## Pre-encoded source modes + +The `preencode_publish` example can publish H.264, H.265, VP8, VP9, and AV1 +access units from these sources: + +| Source | Feature | Input shape | +| --- | --- | --- | +| `gstappsink` | `gstreamer` | Generated or custom GStreamer pipeline ending in `appsink` or one unlinked encoded pad | +| `tcpsink` | `tcpsink` | TCP connection to an encoded byte-stream or RFC4571 RTP producer | +| `shmsink` | `gstreamer` | GStreamer `shmsink` producer read through `shmsrc` | +| `rtsp` | `rtsp` | RTSP over TCP with interleaved RTP video | + +H.264/H.265 TCP defaults remain Annex-B byte streams. VP8, VP9, and AV1 use RTP +framing over TCP because those codecs need explicit frame boundaries. + +## Pre-encoded test sources + +The `preencode_publish` example includes GStreamer fixture scripts for testing +the H.264, H.265, VP8, VP9, and AV1 pre-encoded capture paths with an animated +`videotestsrc` source at `1280x720@30fps`. +The generated encoder pipelines force 8-bit I420 input; VP9 fixture caps are +pinned to profile 0 to match the WebRTC passthrough profile. 
+ +Before running a publisher command, provide LiveKit credentials through the +environment or command-line flags: + +```sh +export LIVEKIT_URL=wss://example.livekit.cloud +export LIVEKIT_API_KEY=devkey +export LIVEKIT_API_SECRET=secret +``` + +All scripts require `--codec h264|h265|vp8|vp9|av1`. They also accept `--width`, +`--height`, `--fps`, `--bitrate-kbps`, and `--print`; the defaults match the +test profile above. + +### Runtime status + +The unit and fixture coverage exercises H.264, H.265, VP8, VP9, and AV1 ingest +through GStreamer appsink, TCP RTP, shared-memory shmsink, and RTSP RTP. +H.264/H.265 TCP byte-stream ingest remains the compatibility default. + +Local-SFU smoke testing has verified subscriber decode for H.264, H.265, VP8, +VP9, and AV1 through GStreamer appsink, TCP RTP, shared-memory shmsink, and +RTSP RTP sources. The generated fixture uses a low-motion animated test pattern +so the encoded source stays near the advertised publish cap; high-entropy custom +pipelines may need an explicit `--max-bitrate` large enough for the frames they +produce. 
+ +### Local SFU smoke + +With a local LiveKit server running in dev mode: + +```sh +livekit-server --dev --bind 127.0.0.1 +``` + +Use the dev credentials in the publisher examples: + +```sh +export LIVEKIT_URL=ws://127.0.0.1:7880 +export LIVEKIT_API_KEY=devkey +export LIVEKIT_API_SECRET=secret +``` + +Run a subscriber in another terminal to verify negotiated codec and decoder +health: + +```sh +cargo run -p local_video --features desktop --bin subscriber -- \ + --url "$LIVEKIT_URL" \ + --api-key "$LIVEKIT_API_KEY" \ + --api-secret "$LIVEKIT_API_SECRET" \ + --room-name video-room \ + --identity sub-vp8 \ + --participant gst-vp8-pub \ + --display-timestamp +``` + +Then publish a pre-encoded GStreamer fixture: + +```sh +cargo run -p preencode_publish --features gstreamer -- \ + --source gstappsink \ + --codec vp8 \ + --url "$LIVEKIT_URL" \ + --api-key "$LIVEKIT_API_KEY" \ + --api-secret "$LIVEKIT_API_SECRET" \ + --room-name video-room \ + --identity gst-vp8-pub \ + --width 1280 \ + --height 720 \ + --fps 30 \ + --diagnostics +``` + +Expected publisher signs are a successful room connection, a +`Published pre-encoded ... track at 1280x720` log line, and diagnostics near +30 access units per second. Expected subscriber signs for healthy codecs are a +matching `Subscribed to video track` codec and rising decoded-frame counts with +low loss and no repeated PLI loop. + +### GStreamer `gstappsink` Source + +This exercises: + +`GStreamer videotestsrc -> encoder -> appsink -> GStreamerAppSinkEncodedSource -> VideoCaptureTrack` + +Publish the generated GStreamer source: + +```sh +cargo run -p preencode_publish --features gstreamer -- \ + --source gstappsink \ + --codec h264 \ + --width 1280 \ + --height 720 \ + --fps 30 \ + --room-name video-room \ + --identity gst-h264-pub \ + --diagnostics +``` + +For H.265, VP8, VP9, or AV1, change `--codec` to `h265`, `vp8`, `vp9`, or +`av1`. 
The generated AV1 path inserts `av1parse` and requests +`stream-format=obu-stream,alignment=tu` before appsink. + +Custom GStreamer launch fragments can be passed after `--`. If the pipeline +does not include `appsink name=lk_appsink`, it must leave exactly one encoded +video source pad unlinked so the example can attach codec-specific parsing, +caps, and appsink: + +```sh +cargo run -p preencode_publish --features gstreamer -- \ + --source gstappsink \ + --codec h264 \ + --width 1280 \ + --height 720 \ + --fps 30 \ + --room-name video-room \ + --identity custom-gst-h264-pub \ + --diagnostics \ + -- \ + 'videotestsrc is-live=true do-timestamp=true ! video/x-raw,width=1280,height=720,framerate=30/1 ! videoconvert ! x264enc tune=zerolatency speed-preset=ultrafast key-int-max=30 byte-stream=true aud=true' +``` + +### TCP `tcpsink` Source + +This exercises: + +`GStreamer videotestsrc -> encoder -> tcpserversink -> TcpEncodedSource -> VideoCaptureTrack` + +The `preencode_publish` CLI source is `tcpsink`; it connects to a TCP producer +such as the fixture script's GStreamer `tcpserversink`. + +Start the producer: + +```sh +examples/preencode_publish/scripts/run-tcp-test-source.sh --codec h264 --port 5000 +``` + +Publish the TCP source: + +```sh +cargo run -p preencode_publish -- \ + --source tcpsink \ + --host 127.0.0.1:5000 \ + --codec h264 \ + --width 1280 \ + --height 720 \ + --fps 30 \ + --room-name video-room \ + --identity tcp-h264-pub \ + --diagnostics +``` + +For H.265, use `--codec h265` in both commands. 
+ +For VP8, VP9, or AV1, use the same script with `--codec vp8`, `--codec vp9`, or +`--codec av1`; `preencode_publish --tcp-format auto` selects RTP automatically: + +```sh +cargo run -p preencode_publish -- \ + --source tcpsink \ + --host 127.0.0.1:5000 \ + --codec vp8 \ + --tcp-format auto \ + --width 1280 \ + --height 720 \ + --fps 30 \ + --room-name video-room \ + --identity tcp-vp8-pub \ + --diagnostics +``` + +### Shared-Memory `shmsink` Source + +This exercises: + +`GStreamer videotestsrc -> encoder -> shmsink -> shmsrc -> GStreamerAppSinkEncodedSource -> VideoCaptureTrack` + +Start the producer: + +```sh +examples/preencode_publish/scripts/run-shm-test-source.sh \ + --codec h264 \ + --socket-path /tmp/livekit-preencode-h264.shm +``` + +Publish by connecting the first-class `shmsink` source to that socket: + +```sh +cargo run -p preencode_publish --features gstreamer -- \ + --source shmsink \ + --codec h264 \ + --shmsink-socket-path /tmp/livekit-preencode-h264.shm \ + --width 1280 \ + --height 720 \ + --fps 30 \ + --room-name video-room \ + --identity shm-h264-pub \ + --diagnostics +``` + +For H.265, use `--codec h265`, a different socket path if desired, and +the same `--source shmsink` command shape. + +For VP8/VP9, use `--codec vp8` or `--codec vp9`. For AV1, the producer script +parses to low-overhead temporal units before `shmsink`, and the `shmsink` +source adds the matching AV1 appsink caps: + +```sh +cargo run -p preencode_publish --features gstreamer -- \ + --source shmsink \ + --codec av1 \ + --shmsink-socket-path /tmp/livekit-preencode-av1.shm \ + --width 1280 \ + --height 720 \ + --fps 30 \ + --room-name video-room \ + --identity shm-av1-pub \ + --diagnostics +``` + +### RTSP source + +This exercises: + +`GStreamer videotestsrc -> encoder -> RTP payloader -> gst-rtsp-server -> RtspEncodedSource -> VideoCaptureTrack` + +Start the RTSP server. 
The script uses the `test-launch` tool from +`gst-rtsp-server` and serves `/test`: + +```sh +examples/preencode_publish/scripts/run-rtsp-test-source.sh --codec h264 --port 8555 +``` + +Publish the RTSP source: + +```sh +cargo run -p preencode_publish -- \ + --source rtsp \ + --rtsp-url rtsp://127.0.0.1:8555/test \ + --codec h264 \ + --width 1280 \ + --height 720 \ + --fps 30 \ + --room-name video-room \ + --identity rtsp-h264-pub \ + --diagnostics +``` + +For H.265, use `--codec h265` in both commands. + +For VP8, VP9, or AV1, use `--codec vp8`, `--codec vp9`, or `--codec av1` in +both commands. The RTSP fixture switches to `rtpvp8pay`, `rtpvp9pay`, or +`rtpav1pay` automatically. + +Publisher-side success signs are a successful room connection, a +`Published pre-encoded ... track at 1280x720` log line, and diagnostics near +30 access units per second. diff --git a/livekit-capture/src/encoded/h26x.rs b/livekit-capture/src/encoded/h26x.rs index d151eb1c2..866c6dd18 100644 --- a/livekit-capture/src/encoded/h26x.rs +++ b/livekit-capture/src/encoded/h26x.rs @@ -36,7 +36,7 @@ pub struct AnnexBAccessUnitParser { } /// H.264/AVC length-prefixed parser state. -#[cfg(any(feature = "tcp-source", test))] +#[cfg(any(feature = "tcpsink", test))] #[derive(Debug, Clone)] pub(crate) struct AvcAccessUnitParser { pending: Vec, @@ -123,7 +123,7 @@ impl AnnexBAccessUnitParser { } } -#[cfg(any(feature = "tcp-source", test))] +#[cfg(any(feature = "tcpsink", test))] impl AvcAccessUnitParser { /// Creates a parser for H.264/AVC length-prefixed byte streams. 
pub(crate) fn new( @@ -329,7 +329,7 @@ fn access_unit_split_index( Ok(None) } -#[cfg(any(feature = "tcp-source", test))] +#[cfg(any(feature = "tcpsink", test))] fn avc_access_unit_split_index( bytes: &[u8], ranges: &[Range], diff --git a/livekit-capture/src/encoded/rtp.rs b/livekit-capture/src/encoded/rtp.rs index 2accdc7e9..8911dd61a 100644 --- a/livekit-capture/src/encoded/rtp.rs +++ b/livekit-capture/src/encoded/rtp.rs @@ -12,10 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. +use bytes::Bytes; use thiserror::Error; use crate::{ - encoded::{h26x::access_unit_from_nalus, EncodedVideoCodec, OwnedEncodedAccessUnit}, + encoded::{ + h26x::access_unit_from_nalus, CodecSpecific, EncodedFrameType, EncodedVideoCodec, + OwnedEncodedAccessUnit, + }, error::CaptureError, }; @@ -143,6 +147,9 @@ pub enum RtpDepacketizerError { /// RTP fragmentation state was invalid. #[error("invalid RTP fragmentation sequence")] InvalidFragment, + /// The payload descriptor is unsupported by the single-layer depacketizer. + #[error("unsupported RTP payload descriptor")] + UnsupportedPayloadDescriptor, /// Codec is not supported by this RTP assembler. #[error("RTP assembler does not support {0:?}")] UnsupportedCodec(EncodedVideoCodec), @@ -161,6 +168,8 @@ pub struct RtpAccessUnitAssembler { expected_sequence_number: Option, current: Option, fragment: Option, + current_frame: Option, + av1_fragment: Option, } #[derive(Debug, Clone)] @@ -176,8 +185,22 @@ struct FragmentState { nal_unit: Vec, } +#[derive(Debug, Clone)] +struct PartialFrame { + rtp_timestamp: u32, + timestamp_us: i64, + payload: Vec, + frame_type: Option, +} + +#[derive(Debug, Clone)] +struct Av1FragmentState { + rtp_timestamp: u32, + obu: Vec, +} + impl RtpAccessUnitAssembler { - /// Creates an RTP access-unit assembler for H.264 or H.265 payloads. + /// Creates an RTP access-unit assembler for supported video payloads. 
pub fn new( codec: EncodedVideoCodec, clock_rate: u32, @@ -185,12 +208,6 @@ impl RtpAccessUnitAssembler { width: u32, height: u32, ) -> Result { - match codec { - EncodedVideoCodec::H264 | EncodedVideoCodec::H265 => {} - EncodedVideoCodec::VP8 | EncodedVideoCodec::VP9 | EncodedVideoCodec::AV1 => { - return Err(RtpDepacketizerError::UnsupportedCodec(codec)); - } - } if clock_rate == 0 { return Err(RtpDepacketizerError::InvalidClockRate); } @@ -203,6 +220,8 @@ impl RtpAccessUnitAssembler { expected_sequence_number: None, current: None, fragment: None, + current_frame: None, + av1_fragment: None, }) } @@ -225,12 +244,23 @@ impl RtpAccessUnitAssembler { match self.codec { EncodedVideoCodec::H264 => self.push_h264_payload(&packet)?, EncodedVideoCodec::H265 => self.push_h265_payload(&packet)?, - EncodedVideoCodec::VP8 | EncodedVideoCodec::VP9 | EncodedVideoCodec::AV1 => { - return Err(RtpDepacketizerError::UnsupportedCodec(self.codec)); - } + EncodedVideoCodec::VP8 => self.push_vp8_payload(&packet)?, + EncodedVideoCodec::VP9 => self.push_vp9_payload(&packet)?, + EncodedVideoCodec::AV1 => self.push_av1_payload(&packet)?, } if packet.marker { + if self.codec == EncodedVideoCodec::AV1 && self.av1_fragment.is_some() { + self.current_frame = None; + self.av1_fragment = None; + return Err(RtpDepacketizerError::InvalidFragment); + } + if matches!( + self.codec, + EncodedVideoCodec::VP8 | EncodedVideoCodec::VP9 | EncodedVideoCodec::AV1 + ) { + return self.finish_current_frame(); + } return self.finish_current(); } Ok(None) @@ -247,6 +277,8 @@ impl RtpAccessUnitAssembler { self.current = None; self.fragment = None; + self.current_frame = None; + self.av1_fragment = None; Err(RtpDepacketizerError::SequenceGap { expected, actual: sequence_number }) } @@ -268,6 +300,29 @@ impl RtpAccessUnitAssembler { self.current.as_mut().ok_or(RtpDepacketizerError::InvalidFragment) } + fn current_frame_mut( + &mut self, + rtp_timestamp: u32, + ) -> Result<&mut PartialFrame, RtpDepacketizerError> { 
+ if self.current_frame.as_ref().is_some_and(|current| current.rtp_timestamp != rtp_timestamp) + { + self.current_frame = None; + self.av1_fragment = None; + } + + if self.current_frame.is_none() { + let timestamp_us = self.timestamp_mapper.map(rtp_timestamp)?; + self.current_frame = Some(PartialFrame { + rtp_timestamp, + timestamp_us, + payload: Vec::new(), + frame_type: None, + }); + } + + self.current_frame.as_mut().ok_or(RtpDepacketizerError::InvalidFragment) + } + fn push_h264_payload(&mut self, packet: &RtpPacket<'_>) -> Result<(), RtpDepacketizerError> { let payload = packet.payload; let Some(&header) = payload.first() else { @@ -426,6 +481,107 @@ impl RtpAccessUnitAssembler { Ok(()) } + fn push_vp8_payload(&mut self, packet: &RtpPacket<'_>) -> Result<(), RtpDepacketizerError> { + let descriptor = parse_vp8_payload_descriptor(packet.payload)?; + if descriptor.payload.is_empty() { + return Err(RtpDepacketizerError::UnsupportedPayload); + } + + let frame = self.current_frame_mut(packet.timestamp)?; + if frame.payload.is_empty() { + if !descriptor.start_of_partition || descriptor.partition_id != 0 { + self.current_frame = None; + return Err(RtpDepacketizerError::InvalidFragment); + } + frame.frame_type = Some(if is_vp8_keyframe(descriptor.payload) { + EncodedFrameType::Key + } else { + EncodedFrameType::Delta + }); + } + frame.payload.extend_from_slice(descriptor.payload); + Ok(()) + } + + fn push_vp9_payload(&mut self, packet: &RtpPacket<'_>) -> Result<(), RtpDepacketizerError> { + let descriptor = parse_vp9_payload_descriptor(packet.payload)?; + if descriptor.payload.is_empty() { + return Err(RtpDepacketizerError::UnsupportedPayload); + } + if descriptor.spatial_id.unwrap_or(0) != 0 + || descriptor.inter_layer_predicted.unwrap_or(false) + { + return Err(RtpDepacketizerError::UnsupportedPayloadDescriptor); + } + + let frame = self.current_frame_mut(packet.timestamp)?; + if frame.payload.is_empty() { + if !descriptor.beginning_of_frame { + self.current_frame = 
None; + return Err(RtpDepacketizerError::InvalidFragment); + } + frame.frame_type = Some( + if !descriptor.inter_picture_predicted || is_vp9_keyframe(descriptor.payload) { + EncodedFrameType::Key + } else { + EncodedFrameType::Delta + }, + ); + } + frame.payload.extend_from_slice(descriptor.payload); + Ok(()) + } + + fn push_av1_payload(&mut self, packet: &RtpPacket<'_>) -> Result<(), RtpDepacketizerError> { + let descriptor = parse_av1_payload_descriptor(packet.payload)?; + if descriptor.elements.is_empty() { + return Err(RtpDepacketizerError::UnsupportedPayload); + } + + let mut saw_sequence_header = descriptor.new_sequence; + let last_index = descriptor.elements.len() - 1; + for (index, element) in descriptor.elements.iter().enumerate() { + if element.is_empty() { + return Err(RtpDepacketizerError::UnsupportedPayload); + } + + let obu = if index == 0 && descriptor.starts_fragment { + let mut fragment = self + .av1_fragment + .take() + .filter(|fragment| fragment.rtp_timestamp == packet.timestamp) + .ok_or(RtpDepacketizerError::InvalidFragment)? 
+ .obu; + fragment.extend_from_slice(element); + fragment + } else { + if index == 0 && self.av1_fragment.is_some() { + return Err(RtpDepacketizerError::InvalidFragment); + } + element.to_vec() + }; + + if index == last_index && descriptor.ends_fragment { + self.av1_fragment = Some(Av1FragmentState { rtp_timestamp: packet.timestamp, obu }); + return Ok(()); + } + + let mut obu = av1_obu_from_rtp_element(&obu)?; + saw_sequence_header |= av1_obu_type(&obu) == Some(1); + let frame = self.current_frame_mut(packet.timestamp)?; + if frame.payload.is_empty() || saw_sequence_header { + frame.frame_type = Some(if saw_sequence_header { + EncodedFrameType::Key + } else { + EncodedFrameType::Delta + }); + } + frame.payload.append(&mut obu); + } + + Ok(()) + } + fn finish_current(&mut self) -> Result, RtpDepacketizerError> { let Some(current) = self.current.take() else { return Ok(None); @@ -443,6 +599,347 @@ impl RtpAccessUnitAssembler { self.height, )?)) } + + fn finish_current_frame( + &mut self, + ) -> Result, RtpDepacketizerError> { + let Some(current) = self.current_frame.take() else { + return Ok(None); + }; + if current.payload.is_empty() { + return Ok(None); + } + + let mut access_unit = OwnedEncodedAccessUnit::new( + self.codec, + Bytes::from(current.payload), + current.timestamp_us, + current.frame_type.unwrap_or(EncodedFrameType::Delta), + self.width, + self.height, + ); + access_unit.codec_specific = match self.codec { + EncodedVideoCodec::VP8 => CodecSpecific::VP8 { temporal_id: None, layer_sync: false }, + EncodedVideoCodec::VP9 => CodecSpecific::VP9 { + temporal_id: None, + spatial_id: None, + inter_layer_predicted: None, + }, + EncodedVideoCodec::AV1 => CodecSpecific::AV1 { + scalability_mode: Some("L1T1".to_string()), + dependency_descriptor: None, + }, + EncodedVideoCodec::H264 | EncodedVideoCodec::H265 => CodecSpecific::None, + }; + Ok(Some(access_unit)) + } +} + +#[derive(Debug, Clone, Copy)] +struct Vp8PayloadDescriptor<'a> { + start_of_partition: bool, + 
partition_id: u8, + payload: &'a [u8], +} + +#[derive(Debug, Clone, Copy)] +struct Vp9PayloadDescriptor<'a> { + beginning_of_frame: bool, + inter_picture_predicted: bool, + spatial_id: Option, + inter_layer_predicted: Option, + payload: &'a [u8], +} + +#[derive(Debug, Clone)] +struct Av1PayloadDescriptor<'a> { + starts_fragment: bool, + ends_fragment: bool, + new_sequence: bool, + elements: Vec<&'a [u8]>, +} + +fn parse_vp8_payload_descriptor( + payload: &[u8], +) -> Result, RtpDepacketizerError> { + let Some(&descriptor) = payload.first() else { + return Err(RtpDepacketizerError::UnsupportedPayload); + }; + let start_of_partition = descriptor & 0x10 != 0; + let partition_id = descriptor & 0x0f; + let mut cursor = 1; + if descriptor & 0x80 != 0 { + let Some(&extension) = payload.get(cursor) else { + return Err(RtpDepacketizerError::UnsupportedPayload); + }; + cursor += 1; + if extension & 0x80 != 0 { + let Some(&picture_id) = payload.get(cursor) else { + return Err(RtpDepacketizerError::UnsupportedPayload); + }; + cursor += if picture_id & 0x80 != 0 { 2 } else { 1 }; + } + if extension & 0x40 != 0 { + cursor += 1; + } + if extension & 0x20 != 0 || extension & 0x10 != 0 { + cursor += 1; + } + } + if cursor > payload.len() { + return Err(RtpDepacketizerError::UnsupportedPayload); + } + Ok(Vp8PayloadDescriptor { start_of_partition, partition_id, payload: &payload[cursor..] 
}) +} + +fn parse_vp9_payload_descriptor( + payload: &[u8], +) -> Result, RtpDepacketizerError> { + let Some(&descriptor) = payload.first() else { + return Err(RtpDepacketizerError::UnsupportedPayload); + }; + if descriptor & 0x10 != 0 { + return Err(RtpDepacketizerError::UnsupportedPayloadDescriptor); + } + + let beginning_of_frame = descriptor & 0x08 != 0; + let inter_picture_predicted = descriptor & 0x40 != 0; + let mut cursor = 1; + if descriptor & 0x80 != 0 { + let Some(&picture_id) = payload.get(cursor) else { + return Err(RtpDepacketizerError::UnsupportedPayload); + }; + cursor += if picture_id & 0x80 != 0 { 2 } else { 1 }; + } + + let mut spatial_id = None; + let mut inter_layer_predicted = None; + if descriptor & 0x20 != 0 { + let Some(&layer_info) = payload.get(cursor) else { + return Err(RtpDepacketizerError::UnsupportedPayload); + }; + cursor += 1; + spatial_id = Some((layer_info >> 1) & 0x07); + inter_layer_predicted = Some(layer_info & 0x01 != 0); + cursor += 1; // TL0PICIDX is present in non-flexible mode. 
+ } + + if descriptor & 0x02 != 0 { + skip_vp9_scalability_structure(payload, &mut cursor)?; + } + + if cursor > payload.len() { + return Err(RtpDepacketizerError::UnsupportedPayload); + } + Ok(Vp9PayloadDescriptor { + beginning_of_frame, + inter_picture_predicted, + spatial_id, + inter_layer_predicted, + payload: &payload[cursor..], + }) +} + +fn skip_vp9_scalability_structure( + payload: &[u8], + cursor: &mut usize, +) -> Result<(), RtpDepacketizerError> { + let Some(&structure) = payload.get(*cursor) else { + return Err(RtpDepacketizerError::UnsupportedPayload); + }; + *cursor += 1; + + let spatial_layers = ((structure >> 5) & 0x07) + 1; + if spatial_layers != 1 { + return Err(RtpDepacketizerError::UnsupportedPayloadDescriptor); + } + + if structure & 0x10 != 0 { + let bytes = usize::from(spatial_layers) * 4; + skip_bytes(payload, cursor, bytes)?; + } + + if structure & 0x08 != 0 { + let Some(&group_count) = payload.get(*cursor) else { + return Err(RtpDepacketizerError::UnsupportedPayload); + }; + *cursor += 1; + for _ in 0..group_count { + let Some(&group) = payload.get(*cursor) else { + return Err(RtpDepacketizerError::UnsupportedPayload); + }; + *cursor += 1; + skip_bytes(payload, cursor, usize::from((group >> 2) & 0x03))?; + } + } + + Ok(()) +} + +fn skip_bytes( + payload: &[u8], + cursor: &mut usize, + bytes: usize, +) -> Result<(), RtpDepacketizerError> { + let Some(next) = cursor.checked_add(bytes) else { + return Err(RtpDepacketizerError::UnsupportedPayload); + }; + if next > payload.len() { + return Err(RtpDepacketizerError::UnsupportedPayload); + } + *cursor = next; + Ok(()) +} + +fn parse_av1_payload_descriptor( + payload: &[u8], +) -> Result, RtpDepacketizerError> { + let Some(&header) = payload.first() else { + return Err(RtpDepacketizerError::UnsupportedPayload); + }; + let starts_fragment = header & 0x80 != 0; + let ends_fragment = header & 0x40 != 0; + let element_count = (header >> 4) & 0x03; + let new_sequence = header & 0x08 != 0; + + let mut 
cursor = 1; + let mut elements = Vec::new(); + if element_count == 0 { + while cursor < payload.len() { + let len = read_leb128(payload, &mut cursor)?; + let Some(end) = cursor.checked_add(len) else { + return Err(RtpDepacketizerError::UnsupportedPayload); + }; + if end > payload.len() { + return Err(RtpDepacketizerError::UnsupportedPayload); + } + elements.push(&payload[cursor..end]); + cursor = end; + } + } else { + for index in 0..usize::from(element_count) { + let len = if index + 1 == usize::from(element_count) { + payload.len().saturating_sub(cursor) + } else { + read_leb128(payload, &mut cursor)? + }; + let Some(end) = cursor.checked_add(len) else { + return Err(RtpDepacketizerError::UnsupportedPayload); + }; + if end > payload.len() { + return Err(RtpDepacketizerError::UnsupportedPayload); + } + elements.push(&payload[cursor..end]); + cursor = end; + } + } + + Ok(Av1PayloadDescriptor { starts_fragment, ends_fragment, new_sequence, elements }) +} + +fn read_leb128(bytes: &[u8], cursor: &mut usize) -> Result { + let mut value = 0usize; + let mut shift = 0usize; + loop { + let Some(&byte) = bytes.get(*cursor) else { + return Err(RtpDepacketizerError::UnsupportedPayload); + }; + *cursor += 1; + value |= usize::from(byte & 0x7f) << shift; + if byte & 0x80 == 0 { + return Ok(value); + } + shift += 7; + if shift >= usize::BITS as usize { + return Err(RtpDepacketizerError::UnsupportedPayload); + } + } +} + +fn write_leb128(mut value: usize, out: &mut Vec) { + loop { + let mut byte = (value & 0x7f) as u8; + value >>= 7; + if value != 0 { + byte |= 0x80; + } + out.push(byte); + if value == 0 { + break; + } + } +} + +fn av1_obu_from_rtp_element(element: &[u8]) -> Result, RtpDepacketizerError> { + let Some(&header) = element.first() else { + return Err(RtpDepacketizerError::UnsupportedPayload); + }; + if header & 0x80 != 0 { + return Err(RtpDepacketizerError::UnsupportedPayload); + } + + if header & 0x02 != 0 { + let mut cursor = if header & 0x04 != 0 { 2 } else { 1 }; 
+ if cursor > element.len() { + return Err(RtpDepacketizerError::UnsupportedPayload); + } + let payload_size = read_leb128(element, &mut cursor)?; + if payload_size != element.len().saturating_sub(cursor) { + return Err(RtpDepacketizerError::UnsupportedPayload); + } + return Ok(element.to_vec()); + } + + let payload_offset = if header & 0x04 != 0 { 2 } else { 1 }; + if payload_offset > element.len() { + return Err(RtpDepacketizerError::UnsupportedPayload); + } + + let payload_size = element.len() - payload_offset; + let mut obu = Vec::with_capacity(element.len() + 8); + obu.push(header | 0x02); + if header & 0x04 != 0 { + obu.push(element[1]); + } + write_leb128(payload_size, &mut obu); + obu.extend_from_slice(&element[payload_offset..]); + Ok(obu) +} + +fn is_vp8_keyframe(payload: &[u8]) -> bool { + payload.first().is_some_and(|header| header & 0x01 == 0) +} + +fn is_vp9_keyframe(payload: &[u8]) -> bool { + let Some(&first_byte) = payload.first() else { + return false; + }; + if first_byte & 0x03 != 0x02 { + return false; + } + + let mut bit_offset = 2usize; + let profile_low = read_bit(first_byte, bit_offset); + bit_offset += 1; + let profile_high = read_bit(first_byte, bit_offset); + bit_offset += 1; + let profile = profile_low | (profile_high << 1); + if profile == 3 { + bit_offset += 1; + } + if read_bit(first_byte, bit_offset) != 0 { + return false; + } + bit_offset += 1; + read_bit(first_byte, bit_offset) == 0 +} + +fn read_bit(byte: u8, bit_offset: usize) -> u8 { + (byte >> bit_offset) & 0x01 +} + +fn av1_obu_type(obu: &[u8]) -> Option { + obu.first().map(|header| (header & 0x78) >> 3) } #[cfg(test)] @@ -501,4 +998,189 @@ mod tests { let err = assembler.push(&end).unwrap_err(); assert_eq!(err, RtpDepacketizerError::SequenceGap { expected: 11, actual: 12 }); } + + #[test] + fn assembles_vp8_fragments() { + let mut assembler = + RtpAccessUnitAssembler::new(EncodedVideoCodec::VP8, 90_000, 0, 640, 480).unwrap(); + let start = rtp_packet(10, 12_000, false, 
&[0x10, 0x00, 1, 2]); + let end = rtp_packet(11, 12_000, true, &[0x00, 3, 4]); + + assert!(assembler.push(&start).unwrap().is_none()); + let access_unit = assembler.push(&end).unwrap().unwrap(); + assert_eq!(access_unit.codec, EncodedVideoCodec::VP8); + assert_eq!(access_unit.frame_type, EncodedFrameType::Key); + assert_eq!(access_unit.payload.as_ref(), &[0x00, 1, 2, 3, 4]); + } + + #[test] + fn rejects_vp8_mid_frame_start() { + let mut assembler = + RtpAccessUnitAssembler::new(EncodedVideoCodec::VP8, 90_000, 0, 640, 480).unwrap(); + let packet = rtp_packet(10, 12_000, true, &[0x00, 1, 2]); + + let err = assembler.push(&packet).unwrap_err(); + assert_eq!(err, RtpDepacketizerError::InvalidFragment); + } + + #[test] + fn assembles_vp9_single_layer_frame() { + let mut assembler = + RtpAccessUnitAssembler::new(EncodedVideoCodec::VP9, 90_000, 0, 640, 480).unwrap(); + let packet = rtp_packet(10, 12_000, true, &[0x0c, 0x82, 1, 2]); + + let access_unit = assembler.push(&packet).unwrap().unwrap(); + assert_eq!(access_unit.codec, EncodedVideoCodec::VP9); + assert_eq!(access_unit.frame_type, EncodedFrameType::Key); + assert_eq!(access_unit.payload.as_ref(), &[0x82, 1, 2]); + } + + #[test] + fn assembles_vp9_non_flexible_layer_descriptor() { + let mut assembler = + RtpAccessUnitAssembler::new(EncodedVideoCodec::VP9, 90_000, 0, 640, 480).unwrap(); + let packet = rtp_packet(10, 12_000, true, &[0x2c, 0x10, 7, 0x82, 1, 2]); + + let access_unit = assembler.push(&packet).unwrap().unwrap(); + assert_eq!(access_unit.codec, EncodedVideoCodec::VP9); + assert_eq!(access_unit.payload.as_ref(), &[0x82, 1, 2]); + } + + #[test] + fn assembles_vp9_single_layer_scalability_structure() { + let mut assembler = + RtpAccessUnitAssembler::new(EncodedVideoCodec::VP9, 90_000, 0, 640, 480).unwrap(); + let packet = rtp_packet( + 10, + 12_000, + true, + &[ + 0x0e, // B, E, V + 0x18, // one spatial layer, resolution present, picture group present + 0x01, 0x40, 0x00, 0xb4, // 320x180 + 0x01, // one 
picture group + 0x04, // one reference index + 0x01, // P_DIFF + 0x82, 1, 2, + ], + ); + + let access_unit = assembler.push(&packet).unwrap().unwrap(); + assert_eq!(access_unit.codec, EncodedVideoCodec::VP9); + assert_eq!(access_unit.frame_type, EncodedFrameType::Key); + assert_eq!(access_unit.payload.as_ref(), &[0x82, 1, 2]); + } + + #[test] + fn assembles_vp9_descriptor_keyframe_from_prediction_bit() { + let mut assembler = + RtpAccessUnitAssembler::new(EncodedVideoCodec::VP9, 90_000, 0, 640, 480).unwrap(); + let packet = rtp_packet( + 10, + 12_000, + true, + &[ + 0x0e, // B, E, V; P is clear, so this is not inter-picture predicted. + 0x18, // one spatial layer, resolution present, picture group present + 0x02, 0x80, 0x01, 0x68, // 640x360 + 0x01, // one picture group + 0x04, // one reference index + 0x01, // P_DIFF + 0xb1, 1, 2, + ], + ); + + let access_unit = assembler.push(&packet).unwrap().unwrap(); + assert_eq!(access_unit.frame_type, EncodedFrameType::Key); + assert_eq!(access_unit.payload.as_ref(), &[0xb1, 1, 2]); + } + + #[test] + fn assembles_vp9_predicted_frame_as_delta() { + let mut assembler = + RtpAccessUnitAssembler::new(EncodedVideoCodec::VP9, 90_000, 0, 640, 480).unwrap(); + let packet = rtp_packet(10, 12_000, true, &[0x4c, 0x83, 1, 2]); + + let access_unit = assembler.push(&packet).unwrap().unwrap(); + assert_eq!(access_unit.frame_type, EncodedFrameType::Delta); + assert_eq!(access_unit.payload.as_ref(), &[0x83, 1, 2]); + } + + #[test] + fn rejects_vp9_multi_layer_scalability_structure() { + let mut assembler = + RtpAccessUnitAssembler::new(EncodedVideoCodec::VP9, 90_000, 0, 640, 480).unwrap(); + let packet = rtp_packet(10, 12_000, true, &[0x0e, 0x20, 0x82, 1, 2]); + + let err = assembler.push(&packet).unwrap_err(); + assert_eq!(err, RtpDepacketizerError::UnsupportedPayloadDescriptor); + } + + #[test] + fn rejects_vp9_mid_frame_start() { + let mut assembler = + RtpAccessUnitAssembler::new(EncodedVideoCodec::VP9, 90_000, 0, 640, 480).unwrap(); + 
let packet = rtp_packet(10, 12_000, true, &[0x04, 0x82, 1, 2]); + + let err = assembler.push(&packet).unwrap_err(); + assert_eq!(err, RtpDepacketizerError::InvalidFragment); + } + + #[test] + fn rejects_vp9_flexible_mode() { + let mut assembler = + RtpAccessUnitAssembler::new(EncodedVideoCodec::VP9, 90_000, 0, 640, 480).unwrap(); + let packet = rtp_packet(10, 12_000, true, &[0x1c, 0xa2, 1, 2]); + + let err = assembler.push(&packet).unwrap_err(); + assert_eq!(err, RtpDepacketizerError::UnsupportedPayloadDescriptor); + } + + #[test] + fn assembles_av1_temporal_unit() { + let mut assembler = + RtpAccessUnitAssembler::new(EncodedVideoCodec::AV1, 90_000, 0, 640, 480).unwrap(); + let packet = rtp_packet(10, 12_000, true, &[0x18, 0x08]); + + let access_unit = assembler.push(&packet).unwrap().unwrap(); + assert_eq!(access_unit.codec, EncodedVideoCodec::AV1); + assert_eq!(access_unit.frame_type, EncodedFrameType::Key); + assert_eq!(access_unit.payload.as_ref(), &[0x0a, 0x00]); + } + + #[test] + fn assembles_fragmented_av1_obu() { + let mut assembler = + RtpAccessUnitAssembler::new(EncodedVideoCodec::AV1, 90_000, 0, 640, 480).unwrap(); + let start = rtp_packet(10, 12_000, false, &[0x50, 0x30, 1]); + let end = rtp_packet(11, 12_000, true, &[0x90, 2, 3]); + + assert!(assembler.push(&start).unwrap().is_none()); + let access_unit = assembler.push(&end).unwrap().unwrap(); + assert_eq!(access_unit.frame_type, EncodedFrameType::Delta); + assert_eq!(access_unit.payload.as_ref(), &[0x32, 0x03, 1, 2, 3]); + } + + #[test] + fn assembles_av1_obu_payload_with_size_field() { + let mut assembler = + RtpAccessUnitAssembler::new(EncodedVideoCodec::AV1, 90_000, 0, 640, 480).unwrap(); + let packet = rtp_packet(10, 12_000, true, &[0x10, 0x30, 1, 2, 3]); + + let access_unit = assembler.push(&packet).unwrap().unwrap(); + assert_eq!(access_unit.frame_type, EncodedFrameType::Delta); + assert_eq!(access_unit.payload.as_ref(), &[0x32, 0x03, 1, 2, 3]); + } + + #[test] + fn 
sequence_gap_clears_vp8_frame() { + let mut assembler = + RtpAccessUnitAssembler::new(EncodedVideoCodec::VP8, 90_000, 0, 640, 480).unwrap(); + let start = rtp_packet(10, 12_000, false, &[0x10, 0x00, 1, 2]); + let end = rtp_packet(12, 12_000, true, &[0x00, 3, 4]); + + assert!(assembler.push(&start).unwrap().is_none()); + let err = assembler.push(&end).unwrap_err(); + assert_eq!(err, RtpDepacketizerError::SequenceGap { expected: 11, actual: 12 }); + } } diff --git a/livekit-capture/src/sources/gstreamer.rs b/livekit-capture/src/sources/gstreamer.rs index d5b07cbd8..75499af53 100644 --- a/livekit-capture/src/sources/gstreamer.rs +++ b/livekit-capture/src/sources/gstreamer.rs @@ -356,6 +356,31 @@ mod tests { ); } + #[test] + fn sample_payload_access_unit_sets_vp9_and_av1_specifics() { + let vp9 = access_unit_from_sample_payload( + GStreamerSampleFormat::AccessUnit { codec: EncodedVideoCodec::VP9 }, + &[1, 2, 3], + 2_000, + EncodedFrameType::Key, + 640, + 480, + ) + .unwrap(); + assert_eq!(vp9.codec_specific, codec_specific_for(EncodedVideoCodec::VP9)); + + let av1 = access_unit_from_sample_payload( + GStreamerSampleFormat::AccessUnit { codec: EncodedVideoCodec::AV1 }, + &[1, 2, 3], + 2_000, + EncodedFrameType::Key, + 640, + 480, + ) + .unwrap(); + assert_eq!(av1.codec_specific, codec_specific_for(EncodedVideoCodec::AV1)); + } + #[test] fn clock_time_is_offset_from_start_timestamp() { let timestamp = clock_time_to_timestamp_us(10_000, gst::ClockTime::from_useconds(1_234)); diff --git a/livekit-capture/src/sources/mod.rs b/livekit-capture/src/sources/mod.rs index 9441ccb3f..4eaf64992 100644 --- a/livekit-capture/src/sources/mod.rs +++ b/livekit-capture/src/sources/mod.rs @@ -26,7 +26,7 @@ pub mod argus; pub mod gstreamer; #[cfg(feature = "rtsp")] pub mod rtsp; -#[cfg(feature = "tcp-source")] +#[cfg(feature = "tcpsink")] pub mod tcp; #[cfg(feature = "v4l")] pub mod v4l; diff --git a/livekit-capture/src/sources/rtsp.rs b/livekit-capture/src/sources/rtsp.rs index 
663e4850e..bbca3ca3a 100644 --- a/livekit-capture/src/sources/rtsp.rs +++ b/livekit-capture/src/sources/rtsp.rs @@ -383,7 +383,7 @@ pub enum RtspSourceError { #[error("unsupported RTSP authentication scheme: {0}")] UnsupportedAuthScheme(String), /// SDP was missing a supported video track. - #[error("RTSP SDP does not contain a supported H264/H265 video track")] + #[error("RTSP SDP does not contain a supported video track")] MissingVideoTrack, /// SDP selected a codec different from the requested codec. #[error("RTSP SDP codec mismatch: expected {expected:?}, got {actual:?}")] @@ -1016,6 +1016,12 @@ fn parse_sdp_codec(codec_name: &str) -> Option { Some(EncodedVideoCodec::H264) } else if codec_name.eq_ignore_ascii_case("H265") || codec_name.eq_ignore_ascii_case("HEVC") { Some(EncodedVideoCodec::H265) + } else if codec_name.eq_ignore_ascii_case("VP8") { + Some(EncodedVideoCodec::VP8) + } else if codec_name.eq_ignore_ascii_case("VP9") { + Some(EncodedVideoCodec::VP9) + } else if codec_name.eq_ignore_ascii_case("AV1") { + Some(EncodedVideoCodec::AV1) } else { None } @@ -1159,6 +1165,51 @@ a=rtpmap:96 H264/90000\r\n"; assert_eq!(track.control_url, "rtsp://camera.example/live/trackID=1"); } + #[test] + fn parses_vp8_vp9_and_av1_sdp_video_tracks() { + let base_url = RtspUrl::parse("rtsp://camera.example/live").unwrap(); + + for (rtpmap, codec) in [ + ("VP8/90000", EncodedVideoCodec::VP8), + ("VP9/90000", EncodedVideoCodec::VP9), + ("AV1/90000", EncodedVideoCodec::AV1), + ] { + let sdp = format!( + "\ +v=0\r\n\ +m=video 0 RTP/AVP 96\r\n\ +a=control:trackID=1\r\n\ +a=rtpmap:96 {rtpmap}\r\n" + ); + + let track = parse_sdp_video_track(&base_url, &sdp, Some(codec)).unwrap(); + + assert_eq!(track.codec, codec); + assert_eq!(track.payload_type, 96); + assert_eq!(track.clock_rate, 90_000); + } + } + + #[test] + fn rejects_sdp_codec_mismatch_for_vpx_av1() { + let base_url = RtspUrl::parse("rtsp://camera.example/live").unwrap(); + let sdp = "\ +v=0\r\n\ +m=video 0 RTP/AVP 96\r\n\ 
+a=control:trackID=1\r\n\ +a=rtpmap:96 VP9/90000\r\n"; + + let err = parse_sdp_video_track(&base_url, sdp, Some(EncodedVideoCodec::AV1)).unwrap_err(); + + assert!(matches!( + err, + RtspSourceError::CodecMismatch { + expected: EncodedVideoCodec::AV1, + actual: EncodedVideoCodec::VP9 + } + )); + } + #[test] fn resolves_absolute_path_control_url() { let base_url = RtspUrl::parse("rtsp://camera.example/live").unwrap(); diff --git a/livekit-capture/src/track.rs b/livekit-capture/src/track.rs index 30cf3ce75..c9bfdf26b 100644 --- a/livekit-capture/src/track.rs +++ b/livekit-capture/src/track.rs @@ -72,15 +72,7 @@ impl VideoCaptureTrack { /// Captures one encoded video access unit. pub fn capture_encoded(&self, access_unit: &EncodedAccessUnit<'_>) -> Result<(), CaptureError> { - match access_unit.codec { - EncodedVideoCodec::H264 | EncodedVideoCodec::H265 => {} - EncodedVideoCodec::VP8 | EncodedVideoCodec::VP9 | EncodedVideoCodec::AV1 => { - return Err(CaptureError::UnsupportedCodec(access_unit.codec)); - } - } - if access_unit.payload.is_empty() { - return Err(CaptureError::EmptyPayload); - } + validate_encoded_access_unit(access_unit)?; let payload = access_unit.payload.to_vec(); let frame = EncodedVideoFrame { @@ -105,3 +97,46 @@ impl VideoCaptureTrack { } } } + +fn validate_encoded_access_unit(access_unit: &EncodedAccessUnit<'_>) -> Result<(), CaptureError> { + if access_unit.payload.is_empty() { + return Err(CaptureError::EmptyPayload); + } + Ok(()) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::encoded::EncodedFrameType; + + #[test] + fn accepts_vp8_vp9_and_av1_access_units() { + for codec in [EncodedVideoCodec::VP8, EncodedVideoCodec::VP9, EncodedVideoCodec::AV1] { + let access_unit = EncodedAccessUnit::contiguous( + codec, + &[1, 2, 3], + 0, + EncodedFrameType::Key, + 640, + 480, + ); + + assert!(validate_encoded_access_unit(&access_unit).is_ok()); + } + } + + #[test] + fn rejects_empty_encoded_access_units() { + let access_unit = 
EncodedAccessUnit::contiguous( + EncodedVideoCodec::VP8, + &[], + 0, + EncodedFrameType::Key, + 640, + 480, + ); + + assert_eq!(validate_encoded_access_unit(&access_unit), Err(CaptureError::EmptyPayload)); + } +} diff --git a/webrtc-sys/build.rs b/webrtc-sys/build.rs index bdc9a783a..b69c5a877 100644 --- a/webrtc-sys/build.rs +++ b/webrtc-sys/build.rs @@ -98,6 +98,7 @@ fn main() { "src/audio_mixer.cpp", "src/packet_trailer.cpp", "src/packet_trailer_av1.cpp", + "src/jetson/jetson_av1_bitstream.cpp", ]); if is_desktop { @@ -233,7 +234,6 @@ fn main() { .file("src/jetson/h264_encoder_impl.cpp") .file("src/jetson/h265_encoder_impl.cpp") .file("src/jetson/av1_encoder_impl.cpp") - .file("src/jetson/jetson_av1_bitstream.cpp") .file("src/jetson/jetson_encoder_factory.cpp") .flag("-DUSE_JETSON_VIDEO_CODEC=1"); diff --git a/webrtc-sys/src/jetson/av1_encoder_impl.cpp b/webrtc-sys/src/jetson/av1_encoder_impl.cpp index 9626de4b4..959ac3185 100644 --- a/webrtc-sys/src/jetson/av1_encoder_impl.cpp +++ b/webrtc-sys/src/jetson/av1_encoder_impl.cpp @@ -290,6 +290,7 @@ int32_t JetsonAV1EncoderImpl::Encode( return WEBRTC_VIDEO_CODEC_NO_OUTPUT; } livekit::av1::ConvertAnnexBToLowOverheadIfPresent(&packet); + livekit::av1::StripNonTransferObusIfPresent(&packet); std::vector sequence_header; if (livekit::av1::ExtractSequenceHeaderObu(packet.data(), packet.size(), @@ -353,6 +354,7 @@ int32_t JetsonAV1EncoderImpl::ProcessEncodedFrame( encoded_image_.qp_ = -1; CodecSpecificInfo codecInfo; + codecInfo.codecSpecific = {}; codecInfo.codecType = kVideoCodecAV1; codecInfo.end_of_picture = true; codecInfo.scalability_mode = ScalabilityMode::kL1T1; diff --git a/webrtc-sys/src/jetson/jetson_av1_bitstream.cpp b/webrtc-sys/src/jetson/jetson_av1_bitstream.cpp index bc3ab2206..0c24672f5 100644 --- a/webrtc-sys/src/jetson/jetson_av1_bitstream.cpp +++ b/webrtc-sys/src/jetson/jetson_av1_bitstream.cpp @@ -200,6 +200,39 @@ bool ConvertAnnexBToLowOverhead(std::vector* packet) { return true; } +bool 
StripNonTransferObus(std::vector* packet) { + if (!packet || packet->empty()) { + return false; + } + + const std::vector obus = ParseObus(packet->data(), packet->size()); + if (obus.empty()) { + return false; + } + + size_t transfer_size = 0; + bool already_contiguous = true; + size_t next_offset = 0; + for (const ObuSpan& obu : obus) { + transfer_size += obu.total_size; + already_contiguous = already_contiguous && obu.offset == next_offset; + next_offset = obu.offset + obu.total_size; + } + + if (transfer_size == packet->size() && already_contiguous) { + return false; + } + + std::vector filtered; + filtered.reserve(transfer_size); + for (const ObuSpan& obu : obus) { + filtered.insert(filtered.end(), packet->begin() + obu.offset, + packet->begin() + obu.offset + obu.total_size); + } + packet->swap(filtered); + return true; +} + } // namespace std::vector ParseObus(const uint8_t* data, size_t len) { @@ -308,6 +341,10 @@ void ConvertAnnexBToLowOverheadIfPresent(std::vector* packet) { ConvertAnnexBToLowOverhead(packet); } +void StripNonTransferObusIfPresent(std::vector* packet) { + StripNonTransferObus(packet); +} + bool IsWebRtcParseable(const uint8_t* data, size_t len) { if (!data || len == 0) { return false; diff --git a/webrtc-sys/src/jetson/jetson_av1_bitstream.h b/webrtc-sys/src/jetson/jetson_av1_bitstream.h index 0af9614e0..f8d9b46fe 100644 --- a/webrtc-sys/src/jetson/jetson_av1_bitstream.h +++ b/webrtc-sys/src/jetson/jetson_av1_bitstream.h @@ -55,6 +55,9 @@ void StripIvfFrameHeaderIfPresent(std::vector* packet); /// present. void ConvertAnnexBToLowOverheadIfPresent(std::vector* packet); +/// Strip OBUs that should not be transferred in WebRTC RTP payloads when present. +void StripNonTransferObusIfPresent(std::vector* packet); + /// Basic validation that WebRTC's AV1 packetizer can parse the bitstream. 
bool IsWebRtcParseable(const uint8_t* data, size_t len); diff --git a/webrtc-sys/src/passthrough_video_encoder.cpp b/webrtc-sys/src/passthrough_video_encoder.cpp index 117539a48..1a1014bbe 100644 --- a/webrtc-sys/src/passthrough_video_encoder.cpp +++ b/webrtc-sys/src/passthrough_video_encoder.cpp @@ -23,14 +23,19 @@ #include #include +#include "absl/container/inlined_vector.h" #include "api/video/encoded_image.h" #include "api/video/video_frame.h" +#include "api/video/video_codec_constants.h" +#include "api/video_codecs/scalability_mode.h" #include "api/video_codecs/video_encoder.h" #include "common_video/h264/h264_common.h" +#include "jetson/jetson_av1_bitstream.h" #include "livekit/encoded_video_frame_buffer.h" #include "media/base/media_constants.h" #include "modules/video_coding/include/video_codec_interface.h" #include "modules/video_coding/include/video_error_codes.h" +#include "modules/video_coding/svc/scalable_video_controller_no_layering.h" #include "rtc_base/logging.h" namespace livekit_ffi { @@ -43,6 +48,9 @@ using webrtc::EncodedImageBuffer; using webrtc::EncodedImageCallback; using webrtc::Environment; using webrtc::H264PacketizationMode; +using webrtc::ScalabilityMode; +using webrtc::ScalableVideoController; +using webrtc::ScalableVideoControllerNoLayering; using webrtc::SdpVideoFormat; using webrtc::VideoCodec; using webrtc::VideoCodecType; @@ -58,6 +66,15 @@ VideoCodecType CodecTypeFromFormat(const SdpVideoFormat& format) { if (format.name == "H265" || format.name == "HEVC") { return webrtc::kVideoCodecH265; } + if (format.name == "VP8") { + return webrtc::kVideoCodecVP8; + } + if (format.name == "VP9") { + return webrtc::kVideoCodecVP9; + } + if (format.name == "AV1") { + return webrtc::kVideoCodecAV1; + } return webrtc::kVideoCodecGeneric; } @@ -85,6 +102,86 @@ VideoFrameType FrameTypeFromBuffer(livekit::EncodedFrameType frame_type) { } } +bool IsAv1Codec(VideoCodecType codec_type) { + return codec_type == webrtc::kVideoCodecAV1; +} + +bool 
IsKeyframe(livekit::EncodedFrameType frame_type) { + return frame_type == livekit::EncodedFrameType::kKey; +} + +std::vector NormalizedPayloadForEncode( + const livekit::EncodedVideoFrameBuffer& encoded_buffer) { + std::vector payload = encoded_buffer.payload(); + if (encoded_buffer.codec() == livekit::EncodedVideoCodec::kAV1) { + livekit::av1::StripIvfFrameHeaderIfPresent(&payload); + livekit::av1::ConvertAnnexBToLowOverheadIfPresent(&payload); + livekit::av1::StripNonTransferObusIfPresent(&payload); + } + return payload; +} + +void FillSingleLayerCodecSpecific( + CodecSpecificInfo* codec_info, + VideoCodecType codec_type, + int width, + int height, + bool keyframe, + ScalableVideoControllerNoLayering* av1_svc_controller) { + codec_info->codecType = codec_type; + codec_info->end_of_picture = true; + + switch (codec_type) { + case webrtc::kVideoCodecH264: + codec_info->codecSpecific.H264.packetization_mode = + H264PacketizationMode::NonInterleaved; + break; + case webrtc::kVideoCodecVP8: + codec_info->codecSpecific.VP8.nonReference = false; + codec_info->codecSpecific.VP8.temporalIdx = 0; + codec_info->codecSpecific.VP8.layerSync = false; + codec_info->codecSpecific.VP8.keyIdx = -1; + break; + case webrtc::kVideoCodecVP9: + codec_info->codecSpecific.VP9.first_frame_in_picture = true; + codec_info->codecSpecific.VP9.inter_pic_predicted = !keyframe; + codec_info->codecSpecific.VP9.flexible_mode = false; + codec_info->codecSpecific.VP9.ss_data_available = keyframe; + codec_info->codecSpecific.VP9.temporal_idx = 0; + codec_info->codecSpecific.VP9.temporal_up_switch = true; + codec_info->codecSpecific.VP9.inter_layer_predicted = false; + codec_info->codecSpecific.VP9.gof_idx = 0; + codec_info->codecSpecific.VP9.num_spatial_layers = 1; + codec_info->codecSpecific.VP9.first_active_layer = 0; + codec_info->codecSpecific.VP9.spatial_layer_resolution_present = keyframe; + codec_info->codecSpecific.VP9.width[0] = width; + codec_info->codecSpecific.VP9.height[0] = height; + 
codec_info->codecSpecific.VP9.gof.SetGofInfoVP9( + webrtc::kTemporalStructureMode1); + codec_info->codecSpecific.VP9.num_ref_pics = keyframe ? 0 : 1; + codec_info->codecSpecific.VP9.p_diff[0] = 1; + break; + case webrtc::kVideoCodecAV1: { + codec_info->scalability_mode = ScalabilityMode::kL1T1; + std::vector layer_frames = + av1_svc_controller->NextFrameConfig(/*restart=*/keyframe); + if (!layer_frames.empty()) { + const ScalableVideoController::LayerFrameConfig& layer_frame = + layer_frames.front(); + codec_info->generic_frame_info = + av1_svc_controller->OnEncodeDone(layer_frame); + if (layer_frame.IsKeyframe()) { + codec_info->template_structure = + av1_svc_controller->DependencyStructure(); + } + } + break; + } + default: + break; + } +} + class PassthroughVideoEncoder final : public VideoEncoder { public: PassthroughVideoEncoder(const Environment& env, const SdpVideoFormat& format) @@ -96,6 +193,11 @@ class PassthroughVideoEncoder final : public VideoEncoder { return WEBRTC_VIDEO_CODEC_ERR_PARAMETER; } codec_ = *codec_settings; + cached_sequence_header_obu_.clear(); + av1_svc_controller_ = ScalableVideoControllerNoLayering(); + if (IsAv1Codec(codec_type_) && !codec_.GetScalabilityMode().has_value()) { + codec_.SetScalabilityMode(ScalabilityMode::kL1T1); + } return WEBRTC_VIDEO_CODEC_OK; } @@ -134,11 +236,28 @@ class PassthroughVideoEncoder final : public VideoEncoder { return WEBRTC_VIDEO_CODEC_ERR_PARAMETER; } - const std::vector& payload = encoded_buffer->payload(); + std::vector payload = NormalizedPayloadForEncode(*encoded_buffer); if (payload.empty()) { RTC_LOG(LS_ERROR) << "PassthroughVideoEncoder received an empty frame"; return WEBRTC_VIDEO_CODEC_ERR_PARAMETER; } + const bool is_keyframe = IsKeyframe(encoded_buffer->frame_type()); + if (IsAv1Codec(codec_type_)) { + std::vector sequence_header; + if (livekit::av1::ExtractSequenceHeaderObu( + payload.data(), payload.size(), &sequence_header)) { + cached_sequence_header_obu_ = std::move(sequence_header); 
+ } else if (is_keyframe && !cached_sequence_header_obu_.empty()) { + livekit::av1::EnsureSequenceHeaderOnKeyframe( + &payload, cached_sequence_header_obu_); + } + if (!livekit::av1::IsWebRtcParseable(payload.data(), payload.size())) { + RTC_LOG(LS_ERROR) + << "PassthroughVideoEncoder received an AV1 frame that WebRTC " + "cannot packetize"; + return WEBRTC_VIDEO_CODEC_ERR_PARAMETER; + } + } EncodedImage encoded_image; encoded_image._encodedWidth = encoded_buffer->width(); @@ -158,11 +277,10 @@ class PassthroughVideoEncoder final : public VideoEncoder { encoded_image.qp_ = -1; CodecSpecificInfo codec_info; - codec_info.codecType = codec_type_; - if (codec_type_ == webrtc::kVideoCodecH264) { - codec_info.codecSpecific.H264.packetization_mode = - H264PacketizationMode::NonInterleaved; - } + codec_info.codecSpecific = {}; + FillSingleLayerCodecSpecific(&codec_info, codec_type_, encoded_buffer->width(), + encoded_buffer->height(), is_keyframe, + &av1_svc_controller_); const auto result = encoded_image_callback_->OnEncodedImage(encoded_image, &codec_info); @@ -193,6 +311,8 @@ class PassthroughVideoEncoder final : public VideoEncoder { VideoCodecType codec_type_; VideoCodec codec_; EncodedImageCallback* encoded_image_callback_ = nullptr; + ScalableVideoControllerNoLayering av1_svc_controller_; + std::vector cached_sequence_header_obu_; }; } // namespace @@ -203,6 +323,13 @@ PassthroughVideoEncoderFactory::PassthroughVideoEncoderFactory() { {"level-asymmetry-allowed", "1"}, {"packetization-mode", "1"}, }; + absl::InlinedVector + scalability_modes; + scalability_modes.push_back(ScalabilityMode::kL1T1); + supported_formats_.push_back(SdpVideoFormat::VP8()); + supported_formats_.push_back(SdpVideoFormat::VP9Profile0()); + supported_formats_.push_back( + SdpVideoFormat(SdpVideoFormat::AV1Profile0(), scalability_modes)); supported_formats_.push_back(SdpVideoFormat("H264", h264_parameters)); supported_formats_.push_back(SdpVideoFormat("H265")); 
supported_formats_.push_back(SdpVideoFormat("HEVC")); @@ -221,9 +348,13 @@ PassthroughVideoEncoderFactory::GetImplementations() const { PassthroughVideoEncoderFactory::CodecSupport PassthroughVideoEncoderFactory::QueryCodecSupport( const SdpVideoFormat& format, - std::optional /* scalability_mode */) const { + std::optional scalability_mode) const { for (const auto& supported_format : supported_formats_) { if (format.IsSameCodec(supported_format)) { + if (format.name == "AV1" && scalability_mode.has_value() && + *scalability_mode != "L1T1") { + return {.is_supported = false, .is_power_efficient = false}; + } return {.is_supported = true, .is_power_efficient = true}; } } From 97afedffb0016bdba3b2e37eef4edb6297b46cbe Mon Sep 17 00:00:00 2001 From: David Chen Date: Tue, 30 Jun 2026 23:44:53 -0700 Subject: [PATCH 21/24] add scripts to run gstreamer pipelines for testing --- .../scripts/gst-test-source-common.sh | 194 ++++++++++++++++++ .../scripts/run-rtsp-test-source.sh | 68 ++++++ .../scripts/run-shm-test-source.sh | 78 +++++++ .../scripts/run-tcp-test-source.sh | 88 ++++++++ 4 files changed, 428 insertions(+) create mode 100755 examples/preencode_publish/scripts/gst-test-source-common.sh create mode 100755 examples/preencode_publish/scripts/run-rtsp-test-source.sh create mode 100755 examples/preencode_publish/scripts/run-shm-test-source.sh create mode 100755 examples/preencode_publish/scripts/run-tcp-test-source.sh diff --git a/examples/preencode_publish/scripts/gst-test-source-common.sh b/examples/preencode_publish/scripts/gst-test-source-common.sh new file mode 100755 index 000000000..b651f8a0c --- /dev/null +++ b/examples/preencode_publish/scripts/gst-test-source-common.sh @@ -0,0 +1,194 @@ +#!/usr/bin/env bash + +set -euo pipefail + +GST_LAUNCH=${GST_LAUNCH:-gst-launch-1.0} +GST_RTSP_TEST_LAUNCH=${GST_RTSP_TEST_LAUNCH:-test-launch} + +WIDTH=1280 +HEIGHT=720 +FPS=30 +BITRATE_KBPS=2500 +PRINT_ONLY=0 +GST_COMMON_SHIFT=0 + +gst_error() { + echo "error: $*" >&2 + 
exit 1 +} + +gst_require_command() { + if ! command -v "$1" >/dev/null 2>&1; then + gst_error "required command not found: $1" + fi +} + +gst_validate_positive_int() { + local name=$1 + local value=$2 + + case "$value" in + ''|*[!0-9]*) + gst_error "$name must be a positive integer, got '$value'" + ;; + esac + + if [ "$value" -eq 0 ]; then + gst_error "$name must be greater than zero" + fi +} + +gst_normalize_codec() { + local codec + codec=$(printf '%s' "$1" | tr '[:upper:]' '[:lower:]') + + case "$codec" in + h264|h265|vp8|vp9|av1) + printf '%s\n' "$codec" + ;; + *) + return 1 + ;; + esac +} + +gst_parse_common_option() { + GST_COMMON_SHIFT=0 + + case "$1" in + --width) + [ "$#" -ge 2 ] || gst_error "--width requires a value" + WIDTH=$2 + GST_COMMON_SHIFT=2 + ;; + --height) + [ "$#" -ge 2 ] || gst_error "--height requires a value" + HEIGHT=$2 + GST_COMMON_SHIFT=2 + ;; + --fps) + [ "$#" -ge 2 ] || gst_error "--fps requires a value" + FPS=$2 + GST_COMMON_SHIFT=2 + ;; + --bitrate-kbps) + [ "$#" -ge 2 ] || gst_error "--bitrate-kbps requires a value" + BITRATE_KBPS=$2 + GST_COMMON_SHIFT=2 + ;; + --print) + PRINT_ONLY=1 + GST_COMMON_SHIFT=1 + ;; + *) + gst_error "unknown common option: $1" + ;; + esac +} + +gst_validate_common_options() { + gst_validate_positive_int "--width" "$WIDTH" + gst_validate_positive_int "--height" "$HEIGHT" + gst_validate_positive_int "--fps" "$FPS" + gst_validate_positive_int "--bitrate-kbps" "$BITRATE_KBPS" +} + +gst_animated_video_source() { + printf 'videotestsrc is-live=true do-timestamp=true pattern=ball motion=wavy animation-mode=frames ! video/x-raw,width=%s,height=%s,framerate=%s/1 ! timeoverlay halignment=right valignment=bottom shaded-background=true ! videoconvert ! video/x-raw,format=I420 ! 
queue' \ + "$WIDTH" "$HEIGHT" "$FPS" +} + +gst_encoded_access_unit_pipeline() { + local codec=$1 + local key_int_max=$FPS + + case "$codec" in + h264|h265) + gst_h26x_annex_b_pipeline "$codec" + ;; + vp8) + printf 'vp8enc deadline=1 cpu-used=8 keyframe-max-dist=%s lag-in-frames=0 target-bitrate=%s000 ! video/x-vp8' \ + "$key_int_max" "$BITRATE_KBPS" + ;; + vp9) + printf 'vp9enc deadline=1 cpu-used=8 keyframe-max-dist=%s lag-in-frames=0 target-bitrate=%s000 ! video/x-vp9,profile=(string)0' \ + "$key_int_max" "$BITRATE_KBPS" + ;; + av1) + printf 'av1enc cpu-used=8 usage-profile=realtime keyframe-max-dist=%s lag-in-frames=0 target-bitrate=%s ! av1parse ! video/x-av1,stream-format=obu-stream,alignment=tu' \ + "$key_int_max" "$BITRATE_KBPS" + ;; + *) + gst_error "unsupported codec: $codec" + ;; + esac +} + +gst_h26x_annex_b_pipeline() { + local codec=$1 + local key_int_max=$FPS + + case "$codec" in + h264) + printf 'x264enc tune=zerolatency speed-preset=ultrafast key-int-max=%s bitrate=%s byte-stream=true aud=true ! h264parse config-interval=-1 ! video/x-h264,stream-format=byte-stream,alignment=au' \ + "$key_int_max" "$BITRATE_KBPS" + ;; + h265) + printf 'x265enc tune=zerolatency speed-preset=ultrafast key-int-max=%s bitrate=%s ! h265parse config-interval=-1 ! 
video/x-h265,stream-format=byte-stream,alignment=au' \ + "$key_int_max" "$BITRATE_KBPS" + ;; + *) + gst_error "unsupported codec: $codec" + ;; + esac +} + +gst_rtp_payloader_pipeline() { + case "$1" in + h264) + printf 'rtph264pay name=pay0 pt=96 config-interval=1' + ;; + h265) + printf 'rtph265pay name=pay0 pt=96 config-interval=1' + ;; + vp8) + printf 'rtpvp8pay name=pay0 pt=96' + ;; + vp9) + printf 'rtpvp9pay name=pay0 pt=96' + ;; + av1) + printf 'rtpav1pay name=pay0 pt=96' + ;; + *) + gst_error "unsupported codec: $1" + ;; + esac +} + +gst_run_launch_line() { + local pipeline=$1 + + if [ "$PRINT_ONLY" -eq 1 ]; then + printf 'pipeline=%q\n%q -e $pipeline\n' "$pipeline" "$GST_LAUNCH" + return + fi + + gst_require_command "$GST_LAUNCH" + # Intentionally split the launch line into gst-launch arguments. + # The line is assembled from validated flags and fixed pipeline fragments. + exec "$GST_LAUNCH" -e $pipeline +} + +gst_run_rtsp_launch_line() { + local port=$1 + local pipeline=$2 + + if [ "$PRINT_ONLY" -eq 1 ]; then + printf '%q -p %q %q\n' "$GST_RTSP_TEST_LAUNCH" "$port" "$pipeline" + return + fi + + gst_require_command "$GST_RTSP_TEST_LAUNCH" + exec "$GST_RTSP_TEST_LAUNCH" -p "$port" "$pipeline" +} diff --git a/examples/preencode_publish/scripts/run-rtsp-test-source.sh b/examples/preencode_publish/scripts/run-rtsp-test-source.sh new file mode 100755 index 000000000..82c3568ed --- /dev/null +++ b/examples/preencode_publish/scripts/run-rtsp-test-source.sh @@ -0,0 +1,68 @@ +#!/usr/bin/env bash + +set -euo pipefail + +SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=examples/preencode_publish/scripts/gst-test-source-common.sh +source "$SCRIPT_DIR/gst-test-source-common.sh" + +PORT=8554 +CODEC= + +usage() { + cat <<'USAGE' +Usage: run-rtsp-test-source.sh --codec h264|h265|vp8|vp9|av1 [options] + +Starts a gst-rtsp-server test-launch server that serves an animated +test-pattern stream at rtsp://127.0.0.1:PORT/test. 
+ +Options: + --codec CODEC Required encoded codec. + --port PORT RTSP server port. Default: 8554. + --width PIXELS Source width. Default: 1280. + --height PIXELS Source height. Default: 720. + --fps FPS Source frame rate. Default: 30. + --bitrate-kbps KBPS Encoder bitrate. Default: 2500. + --print Print the test-launch command instead of running it. + -h, --help Show this help. +USAGE +} + +while [ "$#" -gt 0 ]; do + case "$1" in + --codec) + [ "$#" -ge 2 ] || gst_error "--codec requires h264, h265, vp8, vp9, or av1" + CODEC=$2 + shift 2 + ;; + --port) + [ "$#" -ge 2 ] || gst_error "--port requires a value" + PORT=$2 + shift 2 + ;; + --width|--height|--fps|--bitrate-kbps|--print) + gst_parse_common_option "$@" + shift "$GST_COMMON_SHIFT" + ;; + -h|--help) + usage + exit 0 + ;; + *) + gst_error "unknown option: $1" + ;; + esac +done + +[ -n "$CODEC" ] || gst_error "--codec is required" +if ! CODEC=$(gst_normalize_codec "$CODEC"); then + gst_error "--codec must be h264, h265, vp8, vp9, or av1" +fi + +gst_validate_common_options +gst_validate_positive_int "--port" "$PORT" + +PIPELINE="( $(gst_animated_video_source) ! $(gst_encoded_access_unit_pipeline "$CODEC") ! 
$(gst_rtp_payloader_pipeline "$CODEC") )" + +echo "Serving $CODEC RTSP test pattern at rtsp://127.0.0.1:$PORT/test" >&2 +gst_run_rtsp_launch_line "$PORT" "$PIPELINE" diff --git a/examples/preencode_publish/scripts/run-shm-test-source.sh b/examples/preencode_publish/scripts/run-shm-test-source.sh new file mode 100755 index 000000000..29a2cb10e --- /dev/null +++ b/examples/preencode_publish/scripts/run-shm-test-source.sh @@ -0,0 +1,78 @@ +#!/usr/bin/env bash + +set -euo pipefail + +SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=examples/preencode_publish/scripts/gst-test-source-common.sh +source "$SCRIPT_DIR/gst-test-source-common.sh" + +SOCKET_PATH=/tmp/livekit-preencode-test.shm +SHM_SIZE=67108864 +CODEC= + +usage() { + cat <<'USAGE' +Usage: run-shm-test-source.sh --codec h264|h265|vp8|vp9|av1 [options] + +Starts a GStreamer animated test-pattern encoder that writes encoded access +units to shmsink. + +Options: + --codec CODEC Required encoded codec. + --socket-path PATH shmsink control socket. Default: /tmp/livekit-preencode-test.shm. + --shm-size BYTES Shared-memory buffer size. Default: 67108864. + --width PIXELS Source width. Default: 1280. + --height PIXELS Source height. Default: 720. + --fps FPS Source frame rate. Default: 30. + --bitrate-kbps KBPS Encoder bitrate. Default: 2500. + --print Print the gst-launch command instead of running it. + -h, --help Show this help. 
+USAGE +} + +while [ "$#" -gt 0 ]; do + case "$1" in + --codec) + [ "$#" -ge 2 ] || gst_error "--codec requires h264, h265, vp8, vp9, or av1" + CODEC=$2 + shift 2 + ;; + --socket-path) + [ "$#" -ge 2 ] || gst_error "--socket-path requires a value" + SOCKET_PATH=$2 + shift 2 + ;; + --shm-size) + [ "$#" -ge 2 ] || gst_error "--shm-size requires a value" + SHM_SIZE=$2 + shift 2 + ;; + --width|--height|--fps|--bitrate-kbps|--print) + gst_parse_common_option "$@" + shift "$GST_COMMON_SHIFT" + ;; + -h|--help) + usage + exit 0 + ;; + *) + gst_error "unknown option: $1" + ;; + esac +done + +[ -n "$CODEC" ] || gst_error "--codec is required" +if ! CODEC=$(gst_normalize_codec "$CODEC"); then + gst_error "--codec must be h264, h265, vp8, vp9, or av1" +fi + +gst_validate_common_options +gst_validate_positive_int "--shm-size" "$SHM_SIZE" + +if [ "$PRINT_ONLY" -eq 0 ]; then + rm -f "$SOCKET_PATH" +fi +PIPELINE="$(gst_animated_video_source) ! $(gst_encoded_access_unit_pipeline "$CODEC") ! shmsink socket-path=$SOCKET_PATH shm-size=$SHM_SIZE wait-for-connection=true sync=true" + +echo "Writing $CODEC test pattern to shmsink socket $SOCKET_PATH" >&2 +gst_run_launch_line "$PIPELINE" diff --git a/examples/preencode_publish/scripts/run-tcp-test-source.sh b/examples/preencode_publish/scripts/run-tcp-test-source.sh new file mode 100755 index 000000000..1c0144bc3 --- /dev/null +++ b/examples/preencode_publish/scripts/run-tcp-test-source.sh @@ -0,0 +1,88 @@ +#!/usr/bin/env bash + +set -euo pipefail + +SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=examples/preencode_publish/scripts/gst-test-source-common.sh +source "$SCRIPT_DIR/gst-test-source-common.sh" + +HOST=127.0.0.1 +PORT=5000 +CODEC= + +usage() { + cat <<'USAGE' +Usage: run-tcp-test-source.sh --codec h264|h265|vp8|vp9|av1 [options] + +Starts a GStreamer animated test-pattern encoder from tcpserversink. +H.264/H.265 are served as Annex-B byte streams. 
VP8/VP9/AV1 are served as +RFC4571-style length-prefixed RTP packets. + +Options: + --codec CODEC Required encoded codec. + --host HOST Address to listen on. Default: 127.0.0.1. + --port PORT TCP port to listen on. Default: 5000. + --width PIXELS Source width. Default: 1280. + --height PIXELS Source height. Default: 720. + --fps FPS Source frame rate. Default: 30. + --bitrate-kbps KBPS Encoder bitrate. Default: 2500. + --print Print the gst-launch command instead of running it. + -h, --help Show this help. +USAGE +} + +while [ "$#" -gt 0 ]; do + case "$1" in + --codec) + [ "$#" -ge 2 ] || gst_error "--codec requires h264, h265, vp8, vp9, or av1" + CODEC=$2 + shift 2 + ;; + --host) + [ "$#" -ge 2 ] || gst_error "--host requires a value" + HOST=$2 + shift 2 + ;; + --port) + [ "$#" -ge 2 ] || gst_error "--port requires a value" + PORT=$2 + shift 2 + ;; + --width|--height|--fps|--bitrate-kbps|--print) + gst_parse_common_option "$@" + shift "$GST_COMMON_SHIFT" + ;; + -h|--help) + usage + exit 0 + ;; + *) + gst_error "unknown option: $1" + ;; + esac +done + +[ -n "$CODEC" ] || gst_error "--codec is required" +if ! CODEC=$(gst_normalize_codec "$CODEC"); then + gst_error "--codec must be h264, h265, vp8, vp9, or av1" +fi + +gst_validate_common_options +gst_validate_positive_int "--port" "$PORT" + +case "$CODEC" in + h264|h265) + PIPELINE="$(gst_animated_video_source) ! $(gst_h26x_annex_b_pipeline "$CODEC") ! tcpserversink host=$HOST port=$PORT sync-method=next-keyframe recover-policy=keyframe" + FORMAT="Annex-B" + ;; + vp8|vp9|av1) + PIPELINE="$(gst_animated_video_source) ! $(gst_encoded_access_unit_pipeline "$CODEC") ! $(gst_rtp_payloader_pipeline "$CODEC") ! rtpstreampay ! 
tcpserversink host=$HOST port=$PORT sync-method=next-keyframe recover-policy=keyframe" + FORMAT="RTP" + ;; + *) + gst_error "unsupported codec: $CODEC" + ;; +esac + +echo "Serving $CODEC $FORMAT test pattern on tcp://$HOST:$PORT" >&2 +gst_run_launch_line "$PIPELINE" From ba28354cfd2cf55ade794bea42bd9b99e3f00c1d Mon Sep 17 00:00:00 2001 From: David Chen Date: Wed, 1 Jul 2026 11:03:07 -0700 Subject: [PATCH 22/24] update readme --- .changeset/livekit-capture-preencoded.md | 2 +- livekit-capture/README.md | 132 ++++++++--------------- 2 files changed, 48 insertions(+), 86 deletions(-) diff --git a/.changeset/livekit-capture-preencoded.md b/.changeset/livekit-capture-preencoded.md index 809b9b152..53738f0a6 100644 --- a/.changeset/livekit-capture-preencoded.md +++ b/.changeset/livekit-capture-preencoded.md @@ -5,4 +5,4 @@ "webrtc-sys": patch --- -Add a `livekit-capture` crate with codec-neutral capture types, H264/H265 passthrough support, common encoded ingress helpers, TCP byte-stream encoded ingress, RTSP-over-TCP encoded ingress, GStreamer appsink encoded ingress, macOS AVFoundation decoded-frame capture, Linux V4L capture, and Jetson libargus capture hooks. The capture crate reports capture-origin timing such as optional sensor timestamps, while packet-trailer frame metadata remains a publishing concern. The `local_video` examples now open platform camera capture through `livekit-capture` instead of depending on Nokhwa directly, and a `preencode_publish` example demonstrates publishing H264/H265 Annex-B TCP or RTSP streams as pre-encoded video tracks. +Add a `livekit-capture` crate with codec-neutral capture types, H264/H265/VP8/VP9/AV1 passthrough support, common encoded ingress helpers, TCP byte-stream encoded ingress, RTSP-over-TCP encoded ingress, GStreamer appsink encoded ingress, macOS AVFoundation decoded-frame capture, Linux V4L capture, and Jetson libargus capture hooks. 
The capture crate reports capture-origin timing such as optional sensor timestamps, while packet-trailer frame metadata remains a publishing concern. The `local_video` examples now open platform camera capture through `livekit-capture`, and a `preencode_publish` example demonstrates publishing H264/H265 Annex-B TCP or RTSP streams as pre-encoded video tracks. diff --git a/livekit-capture/README.md b/livekit-capture/README.md index a881c63f9..86ca61a43 100644 --- a/livekit-capture/README.md +++ b/livekit-capture/README.md @@ -1,14 +1,12 @@ # livekit-capture Capture helpers for publishing decoded, native platform, DMA-BUF, and -pre-encoded video frames with the LiveKit Rust SDK. - -Optional source features include `avfoundation`, `libargus`, `v4l`, -`tcpsink`, `rtsp`, and `gstreamer`. +pre-encoded video frames with the LiveKit Rust SDK. Optional source features +include `avfoundation`, `libargus`, `v4l`, `tcpsink`, `rtsp`, and `gstreamer`. ## Pre-encoded source modes -The `preencode_publish` example can publish H.264, H.265, VP8, VP9, and AV1 +The `preencode_publish` example publishes H.264, H.265, VP8, VP9, and AV1 access units from these sources: | Source | Feature | Input shape | @@ -18,19 +16,18 @@ access units from these sources: | `shmsink` | `gstreamer` | GStreamer `shmsink` producer read through `shmsrc` | | `rtsp` | `rtsp` | RTSP over TCP with interleaved RTP video | -H.264/H.265 TCP defaults remain Annex-B byte streams. VP8, VP9, and AV1 use RTP -framing over TCP because those codecs need explicit frame boundaries. +H.264/H.265 TCP defaults to Annex-B byte streams, while VP8, VP9, and AV1 use +RTP framing over TCP because those codecs need explicit frame boundaries. ## Pre-encoded test sources -The `preencode_publish` example includes GStreamer fixture scripts for testing -the H.264, H.265, VP8, VP9, and AV1 pre-encoded capture paths with an animated -`videotestsrc` source at `1280x720@30fps`. 
-The generated encoder pipelines force 8-bit I420 input; VP9 fixture caps are -pinned to profile 0 to match the WebRTC passthrough profile. +The example ships GStreamer fixture scripts that exercise the H.264, H.265, +VP8, VP9, and AV1 capture paths with an animated `videotestsrc` at +`1280x720@30fps`. Generated encoder pipelines force 8-bit I420 input, and VP9 +fixture caps are pinned to profile 0 to match the WebRTC passthrough profile. -Before running a publisher command, provide LiveKit credentials through the -environment or command-line flags: +Before running a publisher, provide LiveKit credentials through the environment +or command-line flags: ```sh export LIVEKIT_URL=wss://example.livekit.cloud @@ -38,40 +35,26 @@ export LIVEKIT_API_KEY=devkey export LIVEKIT_API_SECRET=secret ``` -All scripts require `--codec h264|h265|vp8|vp9|av1`. They also accept `--width`, +All scripts require `--codec h264|h265|vp8|vp9|av1` and also accept `--width`, `--height`, `--fps`, `--bitrate-kbps`, and `--print`; the defaults match the test profile above. -### Runtime status - -The unit and fixture coverage exercises H.264, H.265, VP8, VP9, and AV1 ingest -through GStreamer appsink, TCP RTP, shared-memory shmsink, and RTSP RTP. -H.264/H.265 TCP byte-stream ingest remains the compatibility default. - -Local-SFU smoke testing has verified subscriber decode for H.264, H.265, VP8, -VP9, and AV1 through GStreamer appsink, TCP RTP, shared-memory shmsink, and -RTSP RTP sources. The generated fixture uses a low-motion animated test pattern -so the encoded source stays near the advertised publish cap; high-entropy custom -pipelines may need an explicit `--max-bitrate` large enough for the frames they -produce. 
+### Local SFU example -### Local SFU smoke - -With a local LiveKit server running in dev mode: +Run a local LiveKit server in dev mode and use its dev credentials in the +publisher examples: ```sh -livekit-server --dev --bind 127.0.0.1 +livekit-server --dev --bind 0.0.0.0 ``` -Use the dev credentials in the publisher examples: - ```sh export LIVEKIT_URL=ws://127.0.0.1:7880 export LIVEKIT_API_KEY=devkey export LIVEKIT_API_SECRET=secret ``` -Run a subscriber in another terminal to verify negotiated codec and decoder +Run a subscriber in another terminal to verify the negotiated codec and decoder health: ```sh @@ -104,17 +87,14 @@ cargo run -p preencode_publish --features gstreamer -- \ Expected publisher signs are a successful room connection, a `Published pre-encoded ... track at 1280x720` log line, and diagnostics near -30 access units per second. Expected subscriber signs for healthy codecs are a -matching `Subscribed to video track` codec and rising decoded-frame counts with -low loss and no repeated PLI loop. +30 access units per second. A healthy subscriber shows a matching +`Subscribed to video track` codec and rising decoded-frame counts with low loss +and no repeated PLI loop. -### GStreamer `gstappsink` Source +### GStreamer `gstappsink` source -This exercises: - -`GStreamer videotestsrc -> encoder -> appsink -> GStreamerAppSinkEncodedSource -> VideoCaptureTrack` - -Publish the generated GStreamer source: +Exercises +`GStreamer videotestsrc -> encoder -> appsink -> GStreamerAppSinkEncodedSource -> VideoCaptureTrack`. ```sh cargo run -p preencode_publish --features gstreamer -- \ @@ -128,9 +108,7 @@ cargo run -p preencode_publish --features gstreamer -- \ --diagnostics ``` -For H.265, VP8, VP9, or AV1, change `--codec` to `h265`, `vp8`, `vp9`, or -`av1`. The generated AV1 path inserts `av1parse` and requests -`stream-format=obu-stream,alignment=tu` before appsink. +For H.265, VP8, VP9, or AV1, change `--codec` accordingly. 
Custom GStreamer launch fragments can be passed after `--`. If the pipeline does not include `appsink name=lk_appsink`, it must leave exactly one encoded @@ -151,23 +129,19 @@ cargo run -p preencode_publish --features gstreamer -- \ 'videotestsrc is-live=true do-timestamp=true ! video/x-raw,width=1280,height=720,framerate=30/1 ! videoconvert ! x264enc tune=zerolatency speed-preset=ultrafast key-int-max=30 byte-stream=true aud=true' ``` -### TCP `tcpsink` Source - -This exercises: - -`GStreamer videotestsrc -> encoder -> tcpserversink -> TcpEncodedSource -> VideoCaptureTrack` +### TCP `tcpsink` source -The `preencode_publish` CLI source is `tcpsink`; it connects to a TCP producer -such as the fixture script's GStreamer `tcpserversink`. +Exercises +`GStreamer videotestsrc -> encoder -> tcpserversink -> TcpEncodedSource -> VideoCaptureTrack`. +The `tcpsink` source connects to a TCP producer such as the fixture script's +GStreamer `tcpserversink`. -Start the producer: +Start the producer, then publish: ```sh examples/preencode_publish/scripts/run-tcp-test-source.sh --codec h264 --port 5000 ``` -Publish the TCP source: - ```sh cargo run -p preencode_publish -- \ --source tcpsink \ @@ -181,10 +155,9 @@ cargo run -p preencode_publish -- \ --diagnostics ``` -For H.265, use `--codec h265` in both commands. - -For VP8, VP9, or AV1, use the same script with `--codec vp8`, `--codec vp9`, or -`--codec av1`; `preencode_publish --tcp-format auto` selects RTP automatically: +For H.265, use `--codec h265` in both commands. 
For VP8, VP9, or AV1, use the +same script with the matching `--codec` and add `--tcp-format auto` to the +publisher, which selects RTP automatically: ```sh cargo run -p preencode_publish -- \ @@ -200,13 +173,13 @@ cargo run -p preencode_publish -- \ --diagnostics ``` -### Shared-Memory `shmsink` Source +### Shared-memory `shmsink` source -This exercises: +Exercises +`GStreamer videotestsrc -> encoder -> shmsink -> shmsrc -> GStreamerAppSinkEncodedSource -> VideoCaptureTrack`. -`GStreamer videotestsrc -> encoder -> shmsink -> shmsrc -> GStreamerAppSinkEncodedSource -> VideoCaptureTrack` - -Start the producer: +Start the producer, then publish by connecting the `shmsink` source to that +socket: ```sh examples/preencode_publish/scripts/run-shm-test-source.sh \ @@ -214,8 +187,6 @@ examples/preencode_publish/scripts/run-shm-test-source.sh \ --socket-path /tmp/livekit-preencode-h264.shm ``` -Publish by connecting the first-class `shmsink` source to that socket: - ```sh cargo run -p preencode_publish --features gstreamer -- \ --source shmsink \ @@ -229,12 +200,8 @@ cargo run -p preencode_publish --features gstreamer -- \ --diagnostics ``` -For H.265, use `--codec h265`, a different socket path if desired, and -the same `--source shmsink` command shape. - -For VP8/VP9, use `--codec vp8` or `--codec vp9`. For AV1, the producer script -parses to low-overhead temporal units before `shmsink`, and the `shmsink` -source adds the matching AV1 appsink caps: +For H.265, VP8, or VP9, use the same command shape with the matching `--codec` +(and a different socket path if desired). 
```sh cargo run -p preencode_publish --features gstreamer -- \ @@ -251,19 +218,16 @@ cargo run -p preencode_publish --features gstreamer -- \ ### RTSP source -This exercises: - -`GStreamer videotestsrc -> encoder -> RTP payloader -> gst-rtsp-server -> RtspEncodedSource -> VideoCaptureTrack` +Exercises +`GStreamer videotestsrc -> encoder -> RTP payloader -> gst-rtsp-server -> RtspEncodedSource -> VideoCaptureTrack`. -Start the RTSP server. The script uses the `test-launch` tool from -`gst-rtsp-server` and serves `/test`: +Start the RTSP server (the script uses the `test-launch` tool from +`gst-rtsp-server` and serves `/test`), then publish: ```sh examples/preencode_publish/scripts/run-rtsp-test-source.sh --codec h264 --port 8555 ``` -Publish the RTSP source: - ```sh cargo run -p preencode_publish -- \ --source rtsp \ @@ -277,11 +241,9 @@ cargo run -p preencode_publish -- \ --diagnostics ``` -For H.265, use `--codec h265` in both commands. - -For VP8, VP9, or AV1, use `--codec vp8`, `--codec vp9`, or `--codec av1` in -both commands. The RTSP fixture switches to `rtpvp8pay`, `rtpvp9pay`, or -`rtpav1pay` automatically. +For H.265, use `--codec h265` in both commands. For VP8, VP9, or AV1, use the +matching `--codec` in both commands; the RTSP fixture switches to `rtpvp8pay`, +`rtpvp9pay`, or `rtpav1pay` automatically. Publisher-side success signs are a successful room connection, a `Published pre-encoded ... 
track at 1280x720` log line, and diagnostics near From 566b9cc6b74b5a291bea278fdd6ced1129ac5a47 Mon Sep 17 00:00:00 2001 From: David Chen Date: Thu, 2 Jul 2026 10:13:53 -0700 Subject: [PATCH 23/24] cleanup --- Cargo.lock | 1 - examples/local_video/README.md | 5 +- examples/local_video/src/list_devices.rs | 2 +- examples/local_video/src/publisher.rs | 584 ++++++++------- examples/preencode_publish/Cargo.toml | 3 +- examples/preencode_publish/src/main.rs | 679 ++++-------------- libwebrtc/src/native/video_source.rs | 87 ++- libwebrtc/src/video_source.rs | 12 + livekit-capture/Cargo.toml | 6 +- livekit-capture/README.md | 23 + livekit-capture/src/device.rs | 4 - livekit-capture/src/encoded.rs | 81 ++- livekit-capture/src/encoded/h26x.rs | 529 ++++++++++++-- livekit-capture/src/encoded/ingress.rs | 87 ++- livekit-capture/src/encoded/rtp.rs | 380 +++++++--- livekit-capture/src/error.rs | 6 + livekit-capture/src/lib.rs | 13 +- livekit-capture/src/platform/mod.rs | 18 - livekit-capture/src/source.rs | 311 +++++++- livekit-capture/src/sources/argus.rs | 15 +- .../src/{platform => sources}/avfoundation.rs | 206 ++++-- livekit-capture/src/sources/gstreamer.rs | 345 ++++++++- livekit-capture/src/sources/lk_argus.cpp | 23 +- livekit-capture/src/sources/mod.rs | 10 +- livekit-capture/src/sources/rtsp.rs | 531 +++++++++++--- livekit-capture/src/sources/tcp.rs | 88 +-- livekit-capture/src/sources/v4l.rs | 326 +++++++-- livekit-capture/src/track.rs | 125 +++- .../livekit/encoded_video_frame_buffer.h | 35 +- webrtc-sys/include/livekit/video_track.h | 15 + webrtc-sys/src/encoded_video_frame_buffer.cpp | 36 +- webrtc-sys/src/jetson/av1_encoder_impl.cpp | 8 +- .../src/jetson/jetson_av1_bitstream.cpp | 6 + webrtc-sys/src/jetson/jetson_av1_bitstream.h | 6 + webrtc-sys/src/passthrough_video_encoder.cpp | 69 +- webrtc-sys/src/rtp_sender.cpp | 11 +- webrtc-sys/src/video_encoder_factory.cpp | 77 +- webrtc-sys/src/video_track.cpp | 35 +- webrtc-sys/src/video_track.rs | 3 +- 39 files changed, 
3376 insertions(+), 1425 deletions(-) delete mode 100644 livekit-capture/src/platform/mod.rs rename livekit-capture/src/{platform => sources}/avfoundation.rs (90%) diff --git a/Cargo.lock b/Cargo.lock index 1039be26a..99d81c806 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5706,7 +5706,6 @@ dependencies = [ "clap", "env_logger 0.11.10", "gstreamer", - "gstreamer-app", "livekit", "livekit-api", "livekit-capture", diff --git a/examples/local_video/README.md b/examples/local_video/README.md index 0d367eb52..70806a2dd 100644 --- a/examples/local_video/README.md +++ b/examples/local_video/README.md @@ -2,7 +2,8 @@ Examples demonstrating capturing frames from a local camera video and publishing to LiveKit, listing camera capabilities, subscribing to render video in a window, and showing a low-latency clock for measurement. -**Note:** These examples are intended for **desktop platforms only** (macOS, Linux, Windows). +**Note:** These examples are intended for **macOS and Linux** (including NVIDIA Jetson). +Windows camera capture is not currently supported; the test-pattern publisher (`--test-pattern`), subscriber, and clock still work there. You must enable the `desktop` feature when building or running them. For smoother local rendering, especially above 720p, run the publisher/subscriber with `cargo run --release`. @@ -139,7 +140,7 @@ The clock draws a 3x9 grid below the time. The top row fills from `0` to `9` for Publisher flags (in addition to the common connection flags above): - `--camera-index `: Camera index to use (default: `0`). Use `--list-cameras` to see available indices. - `--source `: Camera backend to use (default: `uvc`). `argus` uses NVIDIA libargus for MIPI CSI cameras and is available only on Linux aarch64 Jetson builds. -- `--format `: UVC camera capture format (default: `auto`). `auto` tries uncompressed YUYV first and falls back to MJPEG; `mjpeg` can reduce USB bandwidth when running multiple cameras. 
+- `--format `: UVC camera capture format (default: `auto`). `auto` prefers uncompressed YUYV and falls back to the camera's other supported formats; `mjpeg` can reduce USB bandwidth when running multiple cameras. If an explicitly requested format is unavailable, the publisher logs a warning and continues with the negotiated format. - `--zero-copy`: Use a platform zero-copy capture/encode path when available, such as AVFoundation IOSurface-backed CVPixelBuffers on macOS or Argus DMA-BUF frames on Jetson. If the selected source does not support zero-copy, the publisher logs a warning and uses CPU I420 capture. - `--test-pattern [0|1]`: Generate a test pattern instead of capturing from a camera. `0` is a static SMPTE 75% color-bar pattern and `1` is an animated encoder exercise graphic. Omitting the value defaults to `0`. `--camera-index` is ignored when this is set; `--width`, `--height`, and `--fps` still control the output resolution and frame rate. - `--width `: Desired capture width (default: `1280`). diff --git a/examples/local_video/src/list_devices.rs b/examples/local_video/src/list_devices.rs index 4e83c72bf..59c90d2ab 100644 --- a/examples/local_video/src/list_devices.rs +++ b/examples/local_video/src/list_devices.rs @@ -20,7 +20,7 @@ fn main() -> Result<()> { #[cfg(target_os = "macos")] fn platform_devices() -> Result> { - Ok(livekit_capture::platform::avfoundation::devices()?) + Ok(livekit_capture::sources::avfoundation::devices()?) 
} #[cfg(target_os = "linux")] diff --git a/examples/local_video/src/publisher.rs b/examples/local_video/src/publisher.rs index 3d9880e1d..68c5d2807 100644 --- a/examples/local_video/src/publisher.rs +++ b/examples/local_video/src/publisher.rs @@ -16,17 +16,14 @@ use livekit_api::access_token; use livekit_api::services::room::{CreateRoomOptions, RoomClient}; use livekit_api::services::{ServiceError, TwirpError, TwirpErrorCode}; use livekit_capture::device::{ - CaptureDeviceSelector, CaptureFormat as LkCaptureFormat, CaptureFormatRequest, + CaptureBackend, CaptureDeviceSelector, CaptureFormat as LkCaptureFormat, CaptureFormatRequest, CaptureFrameFormat, CapturePath as LkCapturePath, CaptureResolution, }; +use livekit_capture::source::{CaptureFrame, CaptureSourceOptions, VideoCaptureSource}; #[cfg(all(target_os = "linux", target_arch = "aarch64"))] use livekit_capture::sources::argus::{self, ArgusCaptureOptions, ArgusCaptureSession}; #[cfg(target_os = "macos")] -use livekit_capture::sources::avfoundation::{ - self, AvFoundationCaptureOptions, AvFoundationCaptureSession, -}; -#[cfg(target_os = "linux")] -use livekit_capture::sources::v4l::{self, V4lCaptureOptions, V4lCaptureSession}; +use livekit_capture::sources::avfoundation::AvFoundationStopHandle; use log::{debug, info}; use parking_lot::Mutex; use std::collections::{HashMap, VecDeque}; @@ -81,7 +78,7 @@ enum SourceKind { /// Selects the UVC camera capture frame format. #[derive(Copy, Clone, Debug, PartialEq, Eq, ValueEnum)] enum CaptureFormat { - /// Try YUYV first, then MJPEG, then GREY. + /// Prefer YUYV, falling back to other formats supported by the camera. Auto, /// Request uncompressed YUYV capture. Yuv, @@ -92,15 +89,15 @@ enum CaptureFormat { } impl CaptureFormat { + /// Preferred source frame format used for V4L2 format negotiation; the + /// capture facade falls back to the camera's other supported formats when + /// the preferred one is unavailable. 
#[cfg(target_os = "linux")] - fn frame_formats(self) -> &'static [CaptureFrameFormat] { + fn preferred_frame_format(self) -> CaptureFrameFormat { match self { - Self::Auto => { - &[CaptureFrameFormat::Yuyv, CaptureFrameFormat::Mjpeg, CaptureFrameFormat::Grey] - } - Self::Yuv => &[CaptureFrameFormat::Yuyv], - Self::Mjpeg => &[CaptureFrameFormat::Mjpeg], - Self::Grey => &[CaptureFrameFormat::Grey], + Self::Auto | Self::Yuv => CaptureFrameFormat::Yuyv, + Self::Mjpeg => CaptureFrameFormat::Mjpeg, + Self::Grey => CaptureFrameFormat::Grey, } } } @@ -185,7 +182,7 @@ struct Args { #[arg(long, value_enum, default_value_t = SourceKind::Uvc)] source: SourceKind, - /// UVC camera capture format: `auto` tries YUYV, MJPEG, then GREY. + /// UVC camera capture format: `auto` prefers YUYV and falls back to other supported formats. #[arg(long, value_enum, default_value_t = CaptureFormat::Auto)] format: CaptureFormat, @@ -374,8 +371,6 @@ struct PublisherTimingSummary { camera_frame_read_ms: RollingMs, capture_timestamp_age_ms: RollingMs, capture_timestamp_to_webrtc_ms: RollingMs, - decode_mjpeg_ms: RollingMs, - buffer_convert_ms: RollingMs, frame_draw_ms: RollingMs, submit_to_webrtc_ms: RollingMs, capture_to_webrtc_total_ms: RollingMs, @@ -558,8 +553,6 @@ impl PublisherTimingSummary { self.camera_frame_read_ms.reset(); self.capture_timestamp_age_ms.reset(); self.capture_timestamp_to_webrtc_ms.reset(); - self.decode_mjpeg_ms.reset(); - self.buffer_convert_ms.reset(); self.frame_draw_ms.reset(); self.submit_to_webrtc_ms.reset(); self.capture_to_webrtc_total_ms.reset(); @@ -584,14 +577,6 @@ fn format_timing_line(timings: &PublisherTimingSummary) -> String { ]; let mut line_two = Vec::new(); - if let Some(decode_ms) = timings.decode_mjpeg_ms.average() { - line_two.push(format!("decode_mjpeg {:.2}", decode_ms)); - } - - line_two.push(format!( - "convert_to_i420 {:.2}", - timings.buffer_convert_ms.average().unwrap_or_default() - )); if let Some(frame_draw_ms) = 
timings.frame_draw_ms.average() { line_two.push(format!("frame_draw {:.2}", frame_draw_ms)); } @@ -861,7 +846,7 @@ mod tests { } fn list_cameras() -> Result<()> { - let cams = platform_devices()?; + let cams = VideoCaptureSource::list_devices(CaptureBackend::Auto)?; println!("Available cameras:"); for (i, cam) in cams.iter().enumerate() { println!("{}. {}", i, cam.name); @@ -869,24 +854,6 @@ fn list_cameras() -> Result<()> { Ok(()) } -#[cfg(target_os = "macos")] -fn platform_devices() -> Result> { - Ok(avfoundation::devices()?) -} - -#[cfg(target_os = "linux")] -fn platform_devices() -> Result> { - Ok(v4l::devices()?) -} - -#[cfg(not(any(target_os = "macos", target_os = "linux")))] -fn platform_devices() -> Result> { - anyhow::bail!( - "camera capture is not supported on {}; local_video supports macOS AVFoundation and Linux V4L2", - std::env::consts::OS - ); -} - fn list_encoders() { println!("Available video encoder backends:"); for backend in VideoEncoderBackend::list_available() { @@ -896,16 +863,24 @@ fn list_encoders() { enum VideoInput { TestPattern(TestPattern), - Camera(PlatformCamera), + /// Platform camera opened through the `livekit-capture` facade + /// (AVFoundation on macOS, V4L2 on Linux). + Camera(VideoCaptureSource), + /// Jetson MIPI CSI camera driven directly so the `--zero-copy` CPU/DMA + /// toggle stays available; see [`run_argus_capture_loop`]. #[cfg(all(target_os = "linux", target_arch = "aarch64"))] Argus(ArgusCaptureSession), } -enum PlatformCamera { - #[cfg(target_os = "macos")] - AvFoundation(AvFoundationCaptureSession), - #[cfg(target_os = "linux")] - V4l(V4lCaptureSession), +/// Human-readable name of the backend behind a facade camera source. +fn camera_backend_name(source: &VideoCaptureSource) -> &'static str { + match source { + #[cfg(target_os = "macos")] + VideoCaptureSource::AvFoundation { .. 
} => "AVFoundation", + #[cfg(target_os = "linux")] + VideoCaptureSource::V4l(_) => "V4L2", + _ => "livekit-capture", + } } fn publisher_capture_path_label( @@ -915,13 +890,13 @@ fn publisher_capture_path_label( ) -> String { match video_input { VideoInput::TestPattern(_) => "test-pattern CPU I420".to_string(), - VideoInput::Camera(camera) => match camera { + VideoInput::Camera(source) => match source { #[cfg(target_os = "macos")] - PlatformCamera::AvFoundation(session) => { + VideoCaptureSource::AvFoundation { session, .. } => { let source_format = session.format().frame_format; let core_video_format = core_video_fourcc(session.core_video_pixel_format()); if zero_copy { - match session.capture_path() { + match source.capture_path() { LkCapturePath::Native if burn_timestamp => { format!( "AVFoundation zero-copy IOSurface CVPixelBuffer {core_video_format} from {source_format} (timestamp burn disabled)" @@ -953,7 +928,7 @@ fn publisher_capture_path_label( } } #[cfg(target_os = "linux")] - PlatformCamera::V4l(session) => { + VideoCaptureSource::V4l(session) => { let format = session.format(); let decode_suffix = if format.frame_format == CaptureFrameFormat::Mjpeg { " with MJPEG decode" @@ -981,6 +956,13 @@ fn publisher_capture_path_label( ) } } + other => { + format!( + "{} {} capture", + camera_backend_name(other), + capture_path_name(other.capture_path()) + ) + } }, #[cfg(all(target_os = "linux", target_arch = "aarch64"))] VideoInput::Argus(_) => { @@ -1010,19 +992,22 @@ fn core_video_fourcc(pixel_format: u32) -> String { fn publisher_zero_copy_unsupported_reason(video_input: &VideoInput) -> Option<&'static str> { match video_input { VideoInput::TestPattern(_) => Some("test pattern frames are generated in CPU I420 memory"), - VideoInput::Camera(camera) => match camera { + VideoInput::Camera(source) => match source { #[cfg(target_os = "macos")] - PlatformCamera::AvFoundation(session) => { - if session.capture_path() == LkCapturePath::Native { + 
VideoCaptureSource::AvFoundation { .. } => { + if source.capture_path() == LkCapturePath::Native { None } else { Some("the selected AVFoundation format is not IOSurface-backed NV12") } } #[cfg(target_os = "linux")] - PlatformCamera::V4l(_) => { + VideoCaptureSource::V4l(_) => { Some("V4L2 UVC capture does not expose a zero-copy capture/encode path here") } + _ => Some( + "the selected capture backend does not expose a zero-copy capture/encode path here", + ), }, #[cfg(all(target_os = "linux", target_arch = "aarch64"))] VideoInput::Argus(_) => None, @@ -1039,64 +1024,89 @@ fn publisher_uses_zero_copy_camera_capture(video_input: &VideoInput, zero_copy: } match video_input { - #[cfg(target_os = "macos")] - VideoInput::Camera(PlatformCamera::AvFoundation(session)) => { - session.capture_path() == LkCapturePath::Native - } + VideoInput::Camera(source) => source.capture_path() == LkCapturePath::Native, _ => false, } } -struct PlatformCameraFrame { +enum CapturedFrameBuffer { + I420(VideoFrame), + #[cfg(target_os = "macos")] + Native(VideoFrame), +} + +/// One frame obtained from the active video input, together with the timing +/// context the publish pipeline records. +struct SourcedFrame { buffer: CapturedFrameBuffer, + /// Wall-clock capture timestamp in microseconds (camera-provided when available). capture_wall_time_us: u64, + /// Wall-clock time the frame was read from the source, in microseconds. read_wall_time_us: u64, - sensor_timestamp_us: Option, - used_decode_path: bool, + /// When the frame buffer became available to the publish pipeline. + acquired_at: Instant, + /// When work on this frame began; `capture_to_webrtc_total` is measured from here. + pipeline_started_at: Instant, + /// Whether `capture_wall_time_us` came from a camera-provided timestamp. 
+ has_camera_timestamp: bool, } -enum CapturedFrameBuffer { - I420(VideoFrame), +fn sourced_frame_from_capture(frame: CaptureFrame) -> Result { + let acquired_at = Instant::now(); + match frame { + CaptureFrame::Raw(raw) => Ok(SourcedFrame { + has_camera_timestamp: raw.sensor_timestamp_us.is_some(), + capture_wall_time_us: raw.capture_wall_time_us, + read_wall_time_us: raw.read_wall_time_us, + buffer: CapturedFrameBuffer::I420(raw.frame), + acquired_at, + pipeline_started_at: acquired_at, + }), + #[cfg(target_os = "macos")] + CaptureFrame::Native(native) => Ok(SourcedFrame { + has_camera_timestamp: native.sensor_timestamp_us.is_some(), + capture_wall_time_us: native.capture_wall_time_us, + read_wall_time_us: native.read_wall_time_us, + buffer: CapturedFrameBuffer::Native(native.frame), + acquired_at, + pipeline_started_at: acquired_at, + }), + other => anyhow::bail!( + "camera capture returned an unsupported {} frame", + capture_path_name(other.capture_path()) + ), + } +} + +/// Cross-thread stop signal for a capture input blocked inside +/// [`VideoCaptureSource::next_frame`]. +#[derive(Clone)] +enum CaptureStopHandle { + /// AVFoundation wakes a blocked capture call via its stop handle. #[cfg(target_os = "macos")] - Native(VideoFrame), + AvFoundation(AvFoundationStopHandle), + /// The input either never blocks (test pattern) or returns at the next + /// frame boundary, where the loop observes the shutdown flag. 
+ FrameBoundary, } -impl PlatformCamera { - fn capture_frame(&mut self, prefer_native: bool) -> Result { - match self { +impl CaptureStopHandle { + fn for_input(video_input: &VideoInput) -> Self { + match video_input { #[cfg(target_os = "macos")] - Self::AvFoundation(session) => { - if prefer_native && session.capture_path() == LkCapturePath::Native { - let frame = session.capture_native_frame()?; - Ok(PlatformCameraFrame { - buffer: CapturedFrameBuffer::Native(frame.frame), - capture_wall_time_us: frame.capture_wall_time_us, - read_wall_time_us: frame.read_wall_time_us, - sensor_timestamp_us: frame.sensor_timestamp_us, - used_decode_path: false, - }) - } else { - let frame = session.capture_frame()?; - Ok(PlatformCameraFrame { - buffer: CapturedFrameBuffer::I420(frame.frame), - capture_wall_time_us: frame.capture_wall_time_us, - read_wall_time_us: frame.read_wall_time_us, - sensor_timestamp_us: frame.sensor_timestamp_us, - used_decode_path: false, - }) - } - } - #[cfg(target_os = "linux")] - Self::V4l(session) => { - let frame = session.capture_frame()?; - Ok(PlatformCameraFrame { - buffer: CapturedFrameBuffer::I420(frame.frame), - capture_wall_time_us: frame.capture_wall_time_us, - read_wall_time_us: frame.read_wall_time_us, - sensor_timestamp_us: frame.sensor_timestamp_us, - used_decode_path: frame.used_decode_path, - }) + VideoInput::Camera(VideoCaptureSource::AvFoundation { session, .. }) => { + Self::AvFoundation(session.stop_handle()) } + _ => Self::FrameBoundary, + } + } + + /// Interrupts a blocked `next_frame` when the backend supports it. 
+ fn stop(&self) { + match self { + #[cfg(target_os = "macos")] + Self::AvFoundation(handle) => handle.stop(), + Self::FrameBoundary => {} } } } @@ -1104,6 +1114,10 @@ impl PlatformCamera { #[derive(Clone, Copy)] struct CaptureConfig { fps: u32, + /// Read by the Argus capture loop to pick DMA-BUF vs CPU I420 publish; the + /// facade camera path bakes the zero-copy preference into the source when + /// it is opened instead. + #[cfg_attr(not(all(target_os = "linux", target_arch = "aarch64")), allow(dead_code))] zero_copy: bool, attach_timestamp: bool, burn_timestamp: bool, @@ -1126,74 +1140,73 @@ fn create_i420_buffer(width: u32, height: u32, align_for_display: bool) -> I420B } } -fn open_platform_camera(args: &Args) -> Result<(u32, u32, VideoInput)> { - #[cfg(target_os = "macos")] +/// Opens the platform camera through the `livekit-capture` facade +/// (AVFoundation on macOS, V4L2 on Linux). +fn open_camera_source(args: &Args) -> Result<(u32, u32, VideoInput)> { + #[cfg(any(target_os = "macos", target_os = "linux"))] { - if args.format != CaptureFormat::Auto { - log::warn!( - "--format={} is ignored for AVFoundation decoded capture; AVFoundation supplies decoded CVPixelBuffers", - args.format + #[cfg(target_os = "macos")] + let format_request = { + if args.format != CaptureFormat::Auto { + log::warn!( + "--format={} is ignored for AVFoundation decoded capture; AVFoundation supplies decoded CVPixelBuffers", + args.format + ); + } + CaptureFormatRequest::Closest(LkCaptureFormat::new( + CaptureResolution::new(args.width, args.height), + args.fps, + CaptureFrameFormat::Nv12, + )) + }; + #[cfg(target_os = "linux")] + let format_request = { + let requested = LkCaptureFormat::new( + CaptureResolution::new(args.width, args.height), + args.fps, + args.format.preferred_frame_format(), ); - } - let requested = LkCaptureFormat::new( - CaptureResolution::new(args.width, args.height), - args.fps, - CaptureFrameFormat::Nv12, - ); - let session = 
AvFoundationCaptureSession::new(AvFoundationCaptureOptions { + if args.format == CaptureFormat::Auto { + CaptureFormatRequest::Closest(requested) + } else { + CaptureFormatRequest::Exact(requested) + } + }; + + // Without --zero-copy, ask for CPU-accessible frames so pixel work + // (e.g. the --burn-timestamp overlay) is possible; with --zero-copy, + // let AVFoundation deliver native platform buffers when supported. + let source = VideoCaptureSource::open(CaptureSourceOptions { + backend: CaptureBackend::Auto, device: CaptureDeviceSelector::Index(args.camera_index), - format: CaptureFormatRequest::Closest(requested), - is_screencast: false, + format: format_request, + prefer_raw_frames: !args.zero_copy, + ..Default::default() })?; - let format = session.format(); - info!( - "Camera opened with AVFoundation: {}x{} @ {} fps (source format: {:?}, camera {})", - format.resolution.width, - format.resolution.height, - format.frame_rate, - format.frame_format, - args.camera_index, - ); - Ok(( - format.resolution.width, - format.resolution.height, - VideoInput::Camera(PlatformCamera::AvFoundation(session)), - )) - } - - #[cfg(target_os = "linux")] - { - let requested = LkCaptureFormat::new( - CaptureResolution::new(args.width, args.height), - args.fps, - args.format.frame_formats()[0], - ); - let mut options = V4lCaptureOptions::new( - CaptureDeviceSelector::Index(args.camera_index), - requested.resolution, - requested.frame_rate, - ); - options.format = if args.format == CaptureFormat::Auto { - CaptureFormatRequest::Closest(requested) - } else { - CaptureFormatRequest::Exact(requested) - }; - options.frame_formats = args.format.frame_formats().to_vec(); - let session = V4lCaptureSession::new(options)?; - let format = session.format(); + let format = source + .format() + .ok_or_else(|| anyhow::anyhow!("camera source did not report a negotiated format"))?; info!( - "Camera opened with V4L2: {}x{} @ {} fps (format: {:?}, requested: {})", + "Camera opened with {}: {}x{} @ {} 
fps (source format: {}, requested: {}, camera {})", + camera_backend_name(&source), format.resolution.width, format.resolution.height, format.frame_rate, format.frame_format, args.format, + args.camera_index, ); - Ok(( - format.resolution.width, - format.resolution.height, - VideoInput::Camera(PlatformCamera::V4l(session)), - )) + #[cfg(target_os = "linux")] + if args.format != CaptureFormat::Auto + && format.frame_format != args.format.preferred_frame_format() + { + log::warn!( + "--format={} was requested but the camera negotiated {}; continuing with the negotiated format", + args.format, + format.frame_format, + ); + } + Ok((format.resolution.width, format.resolution.height, VideoInput::Camera(source))) } #[cfg(not(any(target_os = "macos", target_os = "linux")))] @@ -1376,7 +1389,7 @@ async fn run(args: Args, ctrl_c_received: Arc) -> Result<()> { VideoInput::TestPattern(TestPattern::new(width, height, test_pattern)), ) } else { - open_platform_camera(&args)? + open_camera_source(&args)? } } }; @@ -1622,7 +1635,71 @@ async fn run(args: Args, ctrl_c_received: Arc) -> Result<()> { Ok(()) } +/// Maximum number of back-to-back camera capture/convert failures tolerated +/// before the publish is aborted; isolated failures (e.g. one corrupt MJPEG +/// frame) are logged and skipped. +const MAX_CONSECUTIVE_CAPTURE_FAILURES: u32 = 30; + +/// Runs the test-pattern/camera capture loop. +/// +/// Camera backends block inside [`VideoCaptureSource::next_frame`] until a +/// frame arrives (AVFoundation parks on a condvar), so the loop body runs on a +/// dedicated blocking thread, mirroring [`run_argus_capture_loop`]. A watcher +/// task turns the shutdown flag (Ctrl-C or preview window close) into a +/// [`CaptureStopHandle::stop`] call so a blocked `next_frame` returns promptly +/// instead of hanging the process. 
async fn run_capture_loop( + config: CaptureConfig, + ctrl_c_received: Arc, + track: LocalVideoTrack, + rtc_source: NativeVideoSource, + video_input: VideoInput, + width: u32, + height: u32, + display_shared: Option>>, + publish_timing_state: Option>>, + user_data_channels: Option>>, + zero_copy_fallback: Option>, +) -> Result<()> { + let stop_handle = CaptureStopHandle::for_input(&video_input); + let stop_watcher = tokio::spawn({ + let ctrl_c_received = ctrl_c_received.clone(); + let stop_handle = stop_handle.clone(); + async move { + while !ctrl_c_received.load(Ordering::Acquire) { + tokio::time::sleep(Duration::from_millis(100)).await; + } + stop_handle.stop(); + } + }); + + let capture_result = tokio::task::spawn_blocking({ + let ctrl_c_received = ctrl_c_received.clone(); + move || { + run_capture_loop_blocking( + config, + ctrl_c_received, + track, + rtc_source, + video_input, + width, + height, + display_shared, + publish_timing_state, + user_data_channels, + zero_copy_fallback, + ) + } + }) + .await; + stop_watcher.abort(); + // Unblock the stats/overlay/display tasks when the loop exits on its own + // (e.g. after repeated capture failures) rather than via the shutdown flag. + ctrl_c_received.store(true, Ordering::Release); + capture_result? +} + +fn run_capture_loop_blocking( config: CaptureConfig, ctrl_c_received: Arc, track: LocalVideoTrack, @@ -1638,18 +1715,13 @@ async fn run_capture_loop( let pace_fps = config.fps as f64; #[cfg(target_os = "macos")] let camera_driven_pacing = - matches!(&video_input, VideoInput::Camera(PlatformCamera::AvFoundation(_))); + matches!(&video_input, VideoInput::Camera(VideoCaptureSource::AvFoundation { .. 
})); #[cfg(not(target_os = "macos"))] let camera_driven_pacing = false; - let mut ticker = if camera_driven_pacing { - None - } else { - let mut ticker = tokio::time::interval(Duration::from_secs_f64(1.0 / pace_fps)); - ticker.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip); - // Align the first tick to now. - ticker.tick().await; - Some(ticker) - }; + let target = Duration::from_secs_f64(1.0 / pace_fps); + // Deadline-based pacing with skipped missed intervals, equivalent to the + // previous tokio interval with `MissedTickBehavior::Skip`. + let mut next_frame_deadline = Instant::now() + target; let start_ts = Instant::now(); // Capture loop @@ -1658,7 +1730,6 @@ async fn run_capture_loop( let mut fps_window_frames: u64 = 0; let mut fps_window_start = Instant::now(); let mut fps_smoothed: f32 = 0.0; - let target = Duration::from_secs_f64(1.0 / pace_fps); let burn_timestamp_requested = config.attach_timestamp && config.burn_timestamp; info!("Target frame interval: {:.2} ms", target.as_secs_f64() * 1000.0); if camera_driven_pacing { @@ -1677,32 +1748,29 @@ async fn run_capture_loop( let mut logged_camera_timestamp_source = false; let mut logged_camera_timestamp_fallback = false; let mut logged_zero_copy_fallback = false; + let mut consecutive_capture_failures: u32 = 0; loop { if ctrl_c_received.load(Ordering::Acquire) { break; } let paced_wait_started_at = Instant::now(); - if let Some(ticker) = ticker.as_mut() { - ticker.tick().await; + if !camera_driven_pacing { + if let Some(wait) = next_frame_deadline.checked_duration_since(paced_wait_started_at) { + std::thread::sleep(wait); + } + let now = Instant::now(); + next_frame_deadline += target; + while next_frame_deadline <= now { + next_frame_deadline += target; + } } let paced_wait_finished_at = Instant::now(); let source_frame_read_started_at = Instant::now(); - let frame_wall_time_us = unix_time_us_now(); - let ( - mut captured_frame, - capture_wall_time_us, - read_wall_time_us, - 
source_frame_acquired_at, - frame_pipeline_started_at, - decode_finished_at, - convert_finished_at, - used_decode_path, - has_capture_timestamp, - record_convert_timing, - ) = match &mut video_input { + let mut sourced = match &mut video_input { VideoInput::TestPattern(pattern) => { + let frame_wall_time_us = unix_time_us_now(); // WebRTC may queue the frame and hardware encoders may upload it asynchronously. // Give each submitted frame unique backing storage so later captures cannot // overwrite buffers that are still in-flight. @@ -1725,33 +1793,57 @@ async fn run_capture_loop( ); test_pattern_frame_index = test_pattern_frame_index.wrapping_add(1); let frame_acquired_at = Instant::now(); - ( - CapturedFrameBuffer::I420(frame), - frame_wall_time_us, - unix_time_us_now(), - frame_acquired_at, - source_frame_read_started_at, - frame_acquired_at, - frame_acquired_at, - false, - false, - false, - ) + SourcedFrame { + buffer: CapturedFrameBuffer::I420(frame), + capture_wall_time_us: frame_wall_time_us, + read_wall_time_us: unix_time_us_now(), + acquired_at: frame_acquired_at, + pipeline_started_at: source_frame_read_started_at, + has_camera_timestamp: false, + } } - VideoInput::Camera(camera) => { - let force_i420_after_zero_copy_failure = zero_copy_fallback + VideoInput::Camera(source) => { + let force_raw_after_zero_copy_failure = zero_copy_fallback .as_ref() .is_some_and(|fallback| fallback.load(Ordering::Acquire)); - if force_i420_after_zero_copy_failure && !logged_zero_copy_fallback { + if force_raw_after_zero_copy_failure && !logged_zero_copy_fallback { log::warn!( "Publisher media path changed: capture=AVFoundation CPU I420 fallback after zero-copy encode starvation" ); logged_zero_copy_fallback = true; + // Switch the facade to CPU-accessible frames for the rest of the run. + #[cfg(target_os = "macos")] + if let VideoCaptureSource::AvFoundation { prefer_raw_frames, .. 
} = source { + *prefer_raw_frames = true; + } } - let prefer_native = config.zero_copy && !force_i420_after_zero_copy_failure; - let mut captured = camera.capture_frame(prefer_native)?; - let camera_frame_acquired_at = Instant::now(); - match &mut captured.buffer { + let captured = match source.next_frame() { + Ok(frame) => { + consecutive_capture_failures = 0; + frame + } + Err(err) => { + if ctrl_c_received.load(Ordering::Acquire) { + // `stop()` interrupted a blocked `next_frame` during shutdown. + break; + } + consecutive_capture_failures += 1; + log::warn!( + "Camera frame capture failed ({consecutive_capture_failures} consecutive): {err}" + ); + if consecutive_capture_failures >= MAX_CONSECUTIVE_CAPTURE_FAILURES { + return Err(anyhow::Error::new(err).context(format!( + "camera capture failed {MAX_CONSECUTIVE_CAPTURE_FAILURES} times in a row" + ))); + } + std::thread::sleep(Duration::from_millis( + 5 * u64::from(consecutive_capture_failures.min(20)), + )); + continue; + } + }; + let mut sourced = sourced_frame_from_capture(captured)?; + match &mut sourced.buffer { CapturedFrameBuffer::I420(frame) => { frame.rotation = VideoRotation::VideoRotation0; } @@ -1760,13 +1852,12 @@ async fn run_capture_loop( frame.rotation = VideoRotation::VideoRotation0; } } - if captured.sensor_timestamp_us.is_some() { + if sourced.has_camera_timestamp { if !logged_camera_timestamp_source { - let capture_timestamp_age_ms = captured - .read_wall_time_us - .saturating_sub(captured.capture_wall_time_us) - as f64 - / 1000.0; + let capture_timestamp_age_ms = + sourced.read_wall_time_us.saturating_sub(sourced.capture_wall_time_us) + as f64 + / 1000.0; info!( "Using camera-provided capture timestamp (age at frame read {:.2} ms)", capture_timestamp_age_ms @@ -1780,18 +1871,7 @@ async fn run_capture_loop( logged_camera_timestamp_fallback = true; } - ( - captured.buffer, - captured.capture_wall_time_us, - captured.read_wall_time_us, - camera_frame_acquired_at, - camera_frame_acquired_at, - 
camera_frame_acquired_at, - camera_frame_acquired_at, - captured.used_decode_path, - captured.sensor_timestamp_us.is_some(), - false, - ) + sourced } #[cfg(all(target_os = "linux", target_arch = "aarch64"))] VideoInput::Argus(_) => { @@ -1810,12 +1890,16 @@ async fn run_capture_loop( None }; if let Some(timing_state) = publish_timing_state.as_ref() { - timing_state.lock().record_frame_buffer(capture_wall_time_us, read_wall_time_us, fid); + timing_state.lock().record_frame_buffer( + sourced.capture_wall_time_us, + sourced.read_wall_time_us, + fid, + ); } - let mut buffer_ready_at = convert_finished_at; + let mut buffer_ready_at = sourced.acquired_at; let mut frame_draw_ms = None; let mut burned_timestamp_us = None; - let frame_uses_zero_copy = match &captured_frame { + let frame_uses_zero_copy = match &sourced.buffer { #[cfg(target_os = "macos")] CapturedFrameBuffer::Native(_) => true, _ => false, @@ -1823,18 +1907,18 @@ async fn run_capture_loop( if !frame_uses_zero_copy { if let Some(overlay) = timestamp_overlay.as_mut() { let overlay_started_at = Instant::now(); - match &mut captured_frame { + match &mut sourced.buffer { CapturedFrameBuffer::I420(frame) => { let (stride_y, _, _) = frame.buffer.strides(); let (data_y, _, _) = frame.buffer.data_mut(); - overlay.draw(data_y, stride_y as usize, capture_wall_time_us, fid); + overlay.draw(data_y, stride_y as usize, sourced.capture_wall_time_us, fid); } #[cfg(target_os = "macos")] CapturedFrameBuffer::Native(_) => { unreachable!("native frame was classified as zero-copy"); } } - burned_timestamp_us = Some(capture_wall_time_us); + burned_timestamp_us = Some(sourced.capture_wall_time_us); let overlay_finished_at = Instant::now(); frame_draw_ms = Some((overlay_finished_at - overlay_started_at).as_secs_f64() * 1000.0); @@ -1844,12 +1928,12 @@ async fn run_capture_loop( // Build frame metadata from enabled packet trailer features and local timing correlation. 
let user_ts = if config.attach_timestamp || config.display_timing { - Some(capture_wall_time_us) + Some(sourced.capture_wall_time_us) } else { None }; if burned_timestamp_us.is_some() { - debug_assert_eq!(burned_timestamp_us, Some(capture_wall_time_us)); + debug_assert_eq!(burned_timestamp_us, Some(sourced.capture_wall_time_us)); } let user_data = user_data_channels.as_ref().map(|targets| user_data::encode(&targets.lock())); @@ -1860,7 +1944,7 @@ async fn run_capture_loop( }; // Monotonic, microseconds since start. let timestamp_us = start_ts.elapsed().as_micros() as i64; - match &mut captured_frame { + match &mut sourced.buffer { CapturedFrameBuffer::I420(frame) => { frame.frame_metadata = frame_metadata; frame.timestamp_us = timestamp_us; @@ -1883,7 +1967,7 @@ async fn run_capture_loop( } else { None }; - match &captured_frame { + match &sourced.buffer { CapturedFrameBuffer::I420(frame) => { let (stride_y, stride_u, stride_v) = frame.buffer.strides(); let (data_y, data_u, data_v) = frame.buffer.data(); @@ -1942,29 +2026,23 @@ async fn run_capture_loop( timings .paced_wait_ms .record((paced_wait_finished_at - paced_wait_started_at).as_secs_f64() * 1000.0); - timings.camera_frame_read_ms.record( - (source_frame_acquired_at - source_frame_read_started_at).as_secs_f64() * 1000.0, - ); - if has_capture_timestamp && read_wall_time_us >= capture_wall_time_us { + timings + .camera_frame_read_ms + .record((sourced.acquired_at - source_frame_read_started_at).as_secs_f64() * 1000.0); + if sourced.has_camera_timestamp && sourced.read_wall_time_us >= sourced.capture_wall_time_us + { timings .capture_timestamp_age_ms - .record((read_wall_time_us - capture_wall_time_us) as f64 / 1000.0); + .record((sourced.read_wall_time_us - sourced.capture_wall_time_us) as f64 / 1000.0); } - if has_capture_timestamp && webrtc_capture_finished_wall_time_us >= capture_wall_time_us { + if sourced.has_camera_timestamp + && webrtc_capture_finished_wall_time_us >= sourced.capture_wall_time_us + { 
timings.capture_timestamp_to_webrtc_ms.record( - (webrtc_capture_finished_wall_time_us - capture_wall_time_us) as f64 / 1000.0, + (webrtc_capture_finished_wall_time_us - sourced.capture_wall_time_us) as f64 + / 1000.0, ); } - if used_decode_path { - timings - .decode_mjpeg_ms - .record((decode_finished_at - source_frame_acquired_at).as_secs_f64() * 1000.0); - } - if record_convert_timing { - timings - .buffer_convert_ms - .record((convert_finished_at - decode_finished_at).as_secs_f64() * 1000.0); - } if let Some(frame_draw_ms) = frame_draw_ms { timings.frame_draw_ms.record(frame_draw_ms); } @@ -1972,7 +2050,7 @@ async fn run_capture_loop( .submit_to_webrtc_ms .record((webrtc_capture_finished_at - buffer_ready_at).as_secs_f64() * 1000.0); timings.capture_to_webrtc_total_ms.record( - (webrtc_capture_finished_at - frame_pipeline_started_at).as_secs_f64() * 1000.0, + (webrtc_capture_finished_at - sourced.pipeline_started_at).as_secs_f64() * 1000.0, ); if last_fps_log.elapsed() >= std::time::Duration::from_secs(2) { diff --git a/examples/preencode_publish/Cargo.toml b/examples/preencode_publish/Cargo.toml index b6b64c1cc..af740f6ed 100644 --- a/examples/preencode_publish/Cargo.toml +++ b/examples/preencode_publish/Cargo.toml @@ -6,14 +6,13 @@ publish = false [features] default = [] -gstreamer = ["dep:gstreamer", "dep:gstreamer-app", "livekit-capture/gstreamer"] +gstreamer = ["dep:gstreamer", "livekit-capture/gstreamer"] [dependencies] anyhow = { workspace = true } clap = { workspace = true, features = ["derive", "env"] } env_logger = { workspace = true } gstreamer = { workspace = true, optional = true } -gstreamer-app = { workspace = true, optional = true } livekit = { workspace = true, features = ["rustls-tls-native-roots"] } livekit-api = { workspace = true, features = ["rustls-tls-native-roots"] } livekit-capture = { workspace = true, features = ["rtsp", "tcpsink"] } diff --git a/examples/preencode_publish/src/main.rs b/examples/preencode_publish/src/main.rs index 
9638206e2..5fd6c1318 100644 --- a/examples/preencode_publish/src/main.rs +++ b/examples/preencode_publish/src/main.rs @@ -1,7 +1,17 @@ +//! Publish a pre-encoded video stream into a LiveKit room. +//! +//! Encoded access units are pulled from a TCP, RTSP, or GStreamer source and +//! pumped into a passthrough `VideoCaptureTrack` by +//! `livekit_capture::EncodedIngress`, which also forwards downstream keyframe +//! requests (PLI/FIR from the SFU) back to the source. The higher-level +//! `livekit_capture::VideoCaptureSource` facade covers the same encoded +//! endpoints via `CaptureSourceOptions::encoded`; this example drives +//! `EncodedIngress` directly to keep its per-access-unit diagnostics. + use std::{ net::{Shutdown, TcpStream}, sync::{ - atomic::{AtomicBool, Ordering}, + atomic::{AtomicU64, Ordering}, Arc, }, time::{Duration, Instant, SystemTime, UNIX_EPOCH}, @@ -13,8 +23,6 @@ use clap::{Parser, ValueEnum}; use gstreamer as gst; #[cfg(feature = "gstreamer")] use gstreamer::prelude::*; -#[cfg(feature = "gstreamer")] -use gstreamer_app as gst_app; use livekit::{ options::{self, VideoEncoding}, prelude::*, @@ -23,24 +31,23 @@ use livekit::{ use livekit_api::access_token; #[cfg(feature = "gstreamer")] use livekit_capture::sources::gstreamer::{ - GStreamerAppSinkConfig, GStreamerAppSinkEncodedSource, GStreamerSampleFormat, + encoded_caps_string, ensure_encoded_appsink, GStreamerAppSinkConfig, + GStreamerAppSinkEncodedSource, ENCODED_APPSINK_NAME, }; use livekit_capture::{ - encoded::h26x::annex_b_nal_ranges, sources::{ rtsp::{RtspEncodedSource, RtspSourceOptions}, tcp::{ByteStreamSourceConfig, TcpEncodedSource}, }, - CaptureError, EncodedAccessUnitSource, EncodedFrameType, EncodedVideoCodec, EncodedWireFormat, - OwnedEncodedAccessUnit, VideoCaptureTrack, + CaptureError, EncodedAccessUnitSource, EncodedFrameType, EncodedIngress, EncodedIngressCapture, + EncodedIngressError, EncodedVideoCodec, EncodedWireFormat, OwnedEncodedAccessUnit, + VideoCaptureTrack, }; 
const DIAGNOSTIC_REPORT_INTERVAL: Duration = Duration::from_secs(1); const SOURCE_STALL_THRESHOLD: Duration = Duration::from_millis(250); const BURST_WALL_DELTA_THRESHOLD: Duration = Duration::from_millis(5); const KEYFRAME_GAP_THRESHOLD: Duration = Duration::from_secs(5); -#[cfg(feature = "gstreamer")] -const GSTREAMER_APPSINK_NAME: &str = "lk_appsink"; /// Publish a pre-encoded video stream into a LiveKit room. #[derive(Parser, Debug)] @@ -118,7 +125,7 @@ struct Args { #[arg(long, default_value_t = 90_000)] rtp_clock_rate: u32, - /// Log access-unit timing, keyframe, and H26x NAL diagnostics. + /// Log access-unit timing, keyframe, and keyframe-request diagnostics. #[arg(long)] diagnostics: bool, @@ -400,7 +407,7 @@ async fn run_shmsink_source(args: Args, frame_interval_us: i64) -> Result<()> { let codec_arg = args.codec.context("--codec is required with --source shmsink")?; let codec = codec_arg.encoded_codec(); let socket_path = args.shmsink_socket_path.clone(); - let pipeline_args = vec![gstreamer_shmsink_pipeline_description(&socket_path, codec)?]; + let pipeline_args = vec![gstreamer_shmsink_pipeline_description(&socket_path, codec)]; let source = GStreamerTestSource::start( args.width, args.height, @@ -471,10 +478,8 @@ impl GStreamerTestSource { }; let requested_codec = if pipeline_args.is_empty() { Some(generated_codec) } else { requested_codec }; - let (appsink, sample_format) = ensure_encoded_appsink(&pipeline, requested_codec)?; - let Ok(appsink) = appsink.downcast::() else { - bail!("GStreamer element {GSTREAMER_APPSINK_NAME} was not an appsink"); - }; + let (appsink, sample_format) = ensure_encoded_appsink(&pipeline, requested_codec) + .context("failed to prepare GStreamer encoded appsink")?; let config = GStreamerAppSinkConfig::new( sample_format, @@ -514,6 +519,12 @@ impl EncodedAccessUnitSource for GStreamerTestSource { fn next_access_unit(&mut self) -> Result, Self::Error> { self.source.next_access_unit() } + + fn request_keyframe(&mut self) { + 
// Forward downstream PLI/FIR to the appsink source, which raises a + // GstForceKeyUnit event so the upstream encoder emits an IDR. + self.source.request_keyframe(); + } } #[cfg(feature = "gstreamer")] @@ -557,7 +568,7 @@ fn gstreamer_test_pipeline_description( videoconvert ! \ video/x-raw,format=I420 ! \ {codec_pipeline} ! \ - appsink name={GSTREAMER_APPSINK_NAME} sync=false max-buffers=8 drop=true" + appsink name={ENCODED_APPSINK_NAME} sync=false max-buffers=8 drop=true" ) } @@ -565,57 +576,44 @@ fn gstreamer_test_pipeline_description( fn gstreamer_test_encode_pipeline(fps: u32, codec: EncodedVideoCodec, bitrate: u64) -> String { let key_int_max = fps.max(1); let bitrate_kbps = u64::max(1, bitrate / 1000); + // The trailing capsfilter is the appsink contract, so it comes from the + // crate's caps table; encoder-specific settings before the parser stay + // inline because they configure the encoder, not the appsink. + let caps = encoded_caps_string(codec); match codec { EncodedVideoCodec::H264 => format!( "x264enc tune=zerolatency speed-preset=ultrafast key-int-max={key_int_max} \ bitrate={bitrate_kbps} byte-stream=true aud=true ! h264parse config-interval=-1 ! \ - video/x-h264,stream-format=byte-stream,alignment=au" + {caps}" ), EncodedVideoCodec::H265 => format!( "x265enc tune=zerolatency speed-preset=ultrafast key-int-max={key_int_max} \ - bitrate={bitrate_kbps} ! h265parse config-interval=-1 ! \ - video/x-h265,stream-format=byte-stream,alignment=au" + bitrate={bitrate_kbps} ! h265parse config-interval=-1 ! {caps}" ), EncodedVideoCodec::VP8 => format!( "vp8enc deadline=1 cpu-used=8 keyframe-max-dist={key_int_max} lag-in-frames=0 \ - target-bitrate={bitrate} ! video/x-vp8" + target-bitrate={bitrate} ! {caps}" ), EncodedVideoCodec::VP9 => format!( "vp9enc deadline=1 cpu-used=8 keyframe-max-dist={key_int_max} lag-in-frames=0 \ - target-bitrate={bitrate} ! video/x-vp9,profile=(string)0" + target-bitrate={bitrate} ! 
{caps}" ), EncodedVideoCodec::AV1 => format!( "av1enc cpu-used=8 usage-profile=realtime keyframe-max-dist={key_int_max} \ - lag-in-frames=0 target-bitrate={bitrate_kbps} ! av1parse ! \ - video/x-av1,stream-format=obu-stream,alignment=tu" + lag-in-frames=0 target-bitrate={bitrate_kbps} ! av1parse ! {caps}" ), _ => unreachable!("unknown generated GStreamer codec"), } } #[cfg(feature = "gstreamer")] -fn gstreamer_shmsink_pipeline_description( - socket_path: &str, - codec: EncodedVideoCodec, -) -> Result { +fn gstreamer_shmsink_pipeline_description(socket_path: &str, codec: EncodedVideoCodec) -> String { let socket_path = gstreamer_launch_string_value(socket_path); - let caps = gstreamer_launch_caps(codec)?; + let caps = encoded_caps_string(codec); - Ok(format!( + format!( "shmsrc socket-path={socket_path} is-live=true do-timestamp=true ! capsfilter caps={caps}" - )) -} - -#[cfg(feature = "gstreamer")] -fn gstreamer_launch_caps(codec: EncodedVideoCodec) -> Result<&'static str> { - match codec { - EncodedVideoCodec::H264 => Ok("video/x-h264,stream-format=byte-stream,alignment=au"), - EncodedVideoCodec::H265 => Ok("video/x-h265,stream-format=byte-stream,alignment=au"), - EncodedVideoCodec::VP8 => Ok("video/x-vp8"), - EncodedVideoCodec::VP9 => Ok("video/x-vp9,profile=(string)0"), - EncodedVideoCodec::AV1 => Ok("video/x-av1,stream-format=obu-stream,alignment=tu"), - _ => bail!("unsupported GStreamer codec: {:?}", codec), - } + ) } #[cfg(feature = "gstreamer")] @@ -628,254 +626,6 @@ fn gstreamer_launch_string_value(value: &str) -> String { format!("\"{}\"", value.replace('\\', "\\\\").replace('"', "\\\"")) } -#[cfg(feature = "gstreamer")] -fn ensure_encoded_appsink( - pipeline: &gst::Pipeline, - requested_codec: Option, -) -> Result<(gst::Element, GStreamerSampleFormat)> { - if let Some(appsink) = pipeline.by_name(GSTREAMER_APPSINK_NAME) { - let sample_format = match sample_format_from_element_sink_caps(&appsink)? 
{ - Some(sample_format) => { - if let Some(requested_codec) = requested_codec { - if requested_codec != sample_format.codec() { - bail!( - "GStreamer codec mismatch: --codec requested {:?}, but appsink '{}' advertises {:?}", - requested_codec, - GSTREAMER_APPSINK_NAME, - sample_format.codec() - ); - } - } - sample_format - } - None => sample_format_for_codec(requested_codec.unwrap_or(EncodedVideoCodec::H264))?, - }; - return Ok((appsink, sample_format)); - } - - let src_pad = pipeline.find_unlinked_pad(gst::PadDirection::Src).with_context(|| { - format!("GStreamer pipeline must include appsink name={GSTREAMER_APPSINK_NAME} or leave one encoded video source pad unlinked") - })?; - let inferred_codec = codec_from_pad_caps(&src_pad).with_context(|| { - format!( - "unlinked GStreamer pad '{}' does not advertise supported encoded video caps", - src_pad.name() - ) - })?; - let codec = match requested_codec { - Some(requested_codec) if requested_codec != inferred_codec => bail!( - "GStreamer codec mismatch: --codec requested {:?}, but unlinked pad '{}' advertises {:?}", - requested_codec, - src_pad.name(), - inferred_codec - ), - Some(requested_codec) => requested_codec, - None => inferred_codec, - }; - let sample_format = sample_format_for_codec(codec)?; - let Some(src_element) = src_pad.parent_element() else { - bail!("unlinked GStreamer encoded pad has no parent element"); - }; - - let parser = parser_element_for_codec(codec)?; - let codec_caps = appsink_caps(codec)?; - let capsfilter = gst::ElementFactory::make("capsfilter") - .property("caps", codec_caps) - .build() - .with_context(|| format!("failed to create {:?} capsfilter", codec))?; - let appsink = gst::ElementFactory::make("appsink") - .name(GSTREAMER_APPSINK_NAME) - .property("sync", false) - .property("max-buffers", 8u32) - .property("drop", true) - .build() - .context("failed to create appsink")?; - - if let Some(parser) = &parser { - pipeline - .add(parser) - .with_context(|| format!("failed to add {} to 
GStreamer pipeline", parser.name()))?; - } - pipeline.add(&capsfilter).context("failed to add capsfilter to GStreamer pipeline")?; - pipeline.add(&appsink).context("failed to add appsink to GStreamer pipeline")?; - if let Some(parser) = &parser { - gst::Element::link_many([parser, &capsfilter, &appsink]) - .with_context(|| format!("failed to link {} to appsink", parser.name()))?; - } else { - gst::Element::link_many([&capsfilter, &appsink]) - .context("failed to link capsfilter to appsink")?; - } - let link_target = parser.as_ref().unwrap_or(&capsfilter); - let sink_pad = link_target - .static_pad("sink") - .with_context(|| format!("{} did not expose a sink pad", link_target.name()))?; - src_pad.link(&sink_pad).with_context(|| { - format!("failed to link '{}' to {}", src_element.name(), link_target.name()) - })?; - - Ok((appsink, sample_format)) -} - -#[cfg(feature = "gstreamer")] -fn sample_format_for_codec(codec: EncodedVideoCodec) -> Result { - match codec { - EncodedVideoCodec::H264 => Ok(GStreamerSampleFormat::H264AnnexB), - EncodedVideoCodec::H265 => Ok(GStreamerSampleFormat::H265AnnexB), - EncodedVideoCodec::VP8 | EncodedVideoCodec::VP9 | EncodedVideoCodec::AV1 => { - Ok(GStreamerSampleFormat::AccessUnit { codec }) - } - _ => bail!("unsupported GStreamer codec: {:?}", codec), - } -} - -#[cfg(feature = "gstreamer")] -fn parser_element_for_codec(codec: EncodedVideoCodec) -> Result> { - let Some(name) = parser_name(codec)? 
else { - return Ok(None); - }; - let mut builder = gst::ElementFactory::make(name); - if matches!(codec, EncodedVideoCodec::H264 | EncodedVideoCodec::H265) { - builder = builder.property("config-interval", -1i32); - } - builder.build().map(Some).with_context(|| format!("failed to create {name}")) -} - -#[cfg(feature = "gstreamer")] -fn parser_name(codec: EncodedVideoCodec) -> Result> { - match codec { - EncodedVideoCodec::H264 => Ok(Some("h264parse")), - EncodedVideoCodec::H265 => Ok(Some("h265parse")), - EncodedVideoCodec::VP8 | EncodedVideoCodec::VP9 => Ok(None), - EncodedVideoCodec::AV1 => Ok(Some("av1parse")), - _ => bail!("unsupported GStreamer codec: {:?}", codec), - } -} - -#[cfg(feature = "gstreamer")] -fn appsink_caps(codec: EncodedVideoCodec) -> Result { - match codec { - EncodedVideoCodec::H264 => Ok(gst::Caps::builder("video/x-h264") - .field("stream-format", "byte-stream") - .field("alignment", "au") - .build()), - EncodedVideoCodec::H265 => Ok(gst::Caps::builder("video/x-h265") - .field("stream-format", "byte-stream") - .field("alignment", "au") - .build()), - EncodedVideoCodec::VP8 => Ok(gst::Caps::builder("video/x-vp8").build()), - EncodedVideoCodec::VP9 => { - Ok(gst::Caps::builder("video/x-vp9").field("profile", "0").build()) - } - EncodedVideoCodec::AV1 => Ok(gst::Caps::builder("video/x-av1") - .field("parsed", true) - .field("stream-format", "obu-stream") - .field("alignment", "tu") - .build()), - _ => bail!("unsupported GStreamer codec: {:?}", codec), - } -} - -#[cfg(feature = "gstreamer")] -fn sample_format_from_element_sink_caps( - element: &gst::Element, -) -> Result> { - let Some(sink_pad) = element.static_pad("sink") else { - return Ok(None); - }; - sample_format_from_pad_caps(&sink_pad) -} - -#[cfg(feature = "gstreamer")] -fn sample_format_from_pad_caps(pad: &gst::Pad) -> Result> { - let caps = pad.current_caps().unwrap_or_else(|| pad.query_caps(None)); - for structure in caps.iter() { - if let Some(sample_format) = 
sample_format_from_caps_structure(structure)? { - return Ok(Some(sample_format)); - } - } - Ok(None) -} - -#[cfg(feature = "gstreamer")] -fn sample_format_from_caps_structure( - structure: &gst::StructureRef, -) -> Result> { - let Some(codec) = codec_from_caps_name(structure.name()) else { - return Ok(None); - }; - - match codec { - EncodedVideoCodec::H264 => { - let stream_format = structure.get::("stream-format").ok(); - match stream_format.as_deref() { - Some("avc") | Some("avc3") => Ok(Some(GStreamerSampleFormat::H264Avc { - nal_length_size: h264_avc_nal_length_size_from_caps(structure), - })), - Some("byte-stream") | None => Ok(Some(GStreamerSampleFormat::H264AnnexB)), - Some(stream_format) => bail!( - "unsupported GStreamer H.264 stream-format '{stream_format}'; expected byte-stream or avc" - ), - } - } - EncodedVideoCodec::H265 => Ok(Some(GStreamerSampleFormat::H265AnnexB)), - EncodedVideoCodec::VP8 => Ok(Some(GStreamerSampleFormat::AccessUnit { codec })), - EncodedVideoCodec::VP9 => { - let profile = structure.get::("profile").ok(); - match profile.as_deref() { - Some("0") | None => Ok(Some(GStreamerSampleFormat::AccessUnit { codec })), - Some(profile) => { - bail!("unsupported GStreamer VP9 profile '{profile}'; expected profile 0") - } - } - } - EncodedVideoCodec::AV1 => { - let stream_format = structure.get::("stream-format").ok(); - match stream_format.as_deref() { - Some("obu-stream") | None => Ok(Some(GStreamerSampleFormat::AccessUnit { codec })), - Some(stream_format) => bail!( - "unsupported GStreamer AV1 stream-format '{stream_format}'; expected obu-stream" - ), - } - } - _ => Ok(None), - } -} - -#[cfg(feature = "gstreamer")] -fn h264_avc_nal_length_size_from_caps(structure: &gst::StructureRef) -> u8 { - let Ok(codec_data) = structure.get::("codec_data") else { - return 4; - }; - let Ok(codec_data) = codec_data.map_readable() else { - return 4; - }; - h264_avc_nal_length_size_from_codec_data(codec_data.as_ref()).unwrap_or(4) -} - -#[cfg(feature = 
"gstreamer")] -fn h264_avc_nal_length_size_from_codec_data(codec_data: &[u8]) -> Option { - let length_size = (codec_data.get(4)? & 0x03) + 1; - (1..=4).contains(&length_size).then_some(length_size) -} - -#[cfg(feature = "gstreamer")] -fn codec_from_pad_caps(pad: &gst::Pad) -> Option { - let caps = pad.current_caps().unwrap_or_else(|| pad.query_caps(None)); - caps.iter().find_map(|structure| codec_from_caps_name(structure.name())) -} - -#[cfg(feature = "gstreamer")] -fn codec_from_caps_name(name: &str) -> Option { - match name { - "video/x-h264" => Some(EncodedVideoCodec::H264), - "video/x-h265" => Some(EncodedVideoCodec::H265), - "video/x-vp8" => Some(EncodedVideoCodec::VP8), - "video/x-vp9" => Some(EncodedVideoCodec::VP9), - "video/x-av1" => Some(EncodedVideoCodec::AV1), - _ => None, - } -} - async fn publish_encoded_source( args: Args, codec: EncodedVideoCodec, @@ -906,10 +656,9 @@ where .await .context("failed to connect to LiveKit room")?; - let capture_track = VideoCaptureTrack::new( + let capture_track = VideoCaptureTrack::new_encoded( "preencoded", VideoResolution { width: args.width, height: args.height }, - false, ); let mut publish_options = VideoCaptureTrack::encoded_publish_options(codec); let video_encoding = @@ -931,26 +680,26 @@ where source_label ); - let stop = Arc::new(AtomicBool::new(false)); - let signal_task = tokio::spawn({ - let stop = stop.clone(); - async move { - let _ = tokio::signal::ctrl_c().await; - stop.store(true, Ordering::Release); - shutdown_source(); - } + let keyframe_requests_forwarded = Arc::new(AtomicU64::new(0)); + let ingress = EncodedIngress::new( + capture_track, + KeyframeRequestLogger::new(source, source_label, keyframe_requests_forwarded.clone()), + ); + let stop = ingress.stop_handle(); + let signal_task = tokio::spawn(async move { + let _ = tokio::signal::ctrl_c().await; + stop.stop(); + shutdown_source(); }); - let capture_task = tokio::task::spawn_blocking({ - let stop = stop.clone(); - move || { - let diagnostics 
= AccessUnitDiagnostics::new( - diagnostics_enabled, - source_label, - expected_frame_interval_us, - ); - forward_access_units(source, capture_track, stop, diagnostics) - } + let capture_task = tokio::task::spawn_blocking(move || { + let diagnostics = AccessUnitDiagnostics::new( + diagnostics_enabled, + source_label, + expected_frame_interval_us, + keyframe_requests_forwarded, + ); + forward_access_units(ingress, diagnostics) }); let captured = capture_task.await.context("capture task failed to join")??; signal_task.abort(); @@ -960,42 +709,38 @@ where Ok(()) } +/// Drives [`EncodedIngress::capture_next`] until EOF or shutdown, feeding the +/// example's per-access-unit diagnostics from each capture. fn forward_access_units( - mut source: S, - track: VideoCaptureTrack, - stop: Arc, + mut ingress: EncodedIngress, mut diagnostics: AccessUnitDiagnostics, ) -> Result where S: EncodedAccessUnitSource, { + let stop = ingress.stop_handle(); let mut captured = 0; let mut dropped = 0; - while !stop.load(Ordering::Acquire) { + while !stop.is_stopped() { let read_started = Instant::now(); - let access_unit = match source.next_access_unit() { - Ok(Some(access_unit)) => access_unit, + let capture = match ingress.capture_next() { + Ok(Some(capture)) => capture, Ok(None) => break, - Err(err) if stop.load(Ordering::Acquire) => { - log::debug!("encoded source stopped after shutdown: {err}"); - break; - } - Err(err) => return Err(err.into()), - }; - diagnostics.observe_source_wait(read_started.elapsed()); - diagnostics.observe_access_unit(&access_unit); - - match track.capture_encoded(&access_unit.as_access_unit()) { - Ok(()) => {} - Err(CaptureError::CaptureFailed) => { + Err(EncodedIngressError::Capture(CaptureError::CaptureFailed)) => { dropped += 1; if dropped == 1 || dropped % 300 == 0 { log::info!("Dropped {dropped} encoded access units before capture"); } continue; } + Err(EncodedIngressError::Source(err)) if stop.is_stopped() => { + log::debug!("encoded source stopped after 
shutdown: {err}"); + break; + } Err(err) => return Err(err.into()), - } + }; + diagnostics.observe_source_wait(read_started.elapsed()); + diagnostics.observe_capture(&capture); captured += 1; if captured % 300 == 0 { log::info!("Published {captured} encoded access units"); @@ -1006,11 +751,46 @@ where Ok(captured) } +/// Wraps an encoded source to count and log the downstream keyframe requests +/// (PLI/FIR polled by [`EncodedIngress::capture_next`]) forwarded to it. +struct KeyframeRequestLogger { + source: S, + source_label: &'static str, + forwarded: Arc, +} + +impl KeyframeRequestLogger { + fn new(source: S, source_label: &'static str, forwarded: Arc) -> Self { + Self { source, source_label, forwarded } + } +} + +impl EncodedAccessUnitSource for KeyframeRequestLogger +where + S: EncodedAccessUnitSource, +{ + type Error = S::Error; + + fn next_access_unit(&mut self) -> Result, Self::Error> { + self.source.next_access_unit() + } + + fn request_keyframe(&mut self) { + let forwarded = self.forwarded.fetch_add(1, Ordering::Relaxed) + 1; + log::info!( + "{} forwarding downstream keyframe request {forwarded} to the encoded source", + self.source_label + ); + self.source.request_keyframe(); + } +} + #[derive(Debug)] struct AccessUnitDiagnostics { enabled: bool, source_label: &'static str, expected_frame_interval_us: Option, + keyframe_requests_forwarded: Arc, last_report: Instant, last_wall_time: Option, last_timestamp_us: Option, @@ -1027,7 +807,6 @@ struct AccessUnitDiagnostics { report_max_timestamp_gap_us: i64, report_stalls: u64, report_bursts: u64, - report_missing_parameter_keyframes: u64, } impl AccessUnitDiagnostics { @@ -1035,6 +814,7 @@ impl AccessUnitDiagnostics { enabled: bool, source_label: &'static str, expected_frame_interval_us: Option, + keyframe_requests_forwarded: Arc, ) -> Self { let now = Instant::now(); if enabled { @@ -1051,6 +831,7 @@ impl AccessUnitDiagnostics { enabled, source_label, expected_frame_interval_us, + keyframe_requests_forwarded, 
last_report: now, last_wall_time: None, last_timestamp_us: None, @@ -1067,7 +848,6 @@ impl AccessUnitDiagnostics { report_max_timestamp_gap_us: 0, report_stalls: 0, report_bursts: 0, - report_missing_parameter_keyframes: 0, } } @@ -1087,18 +867,16 @@ impl AccessUnitDiagnostics { } } - fn observe_access_unit(&mut self, access_unit: &OwnedEncodedAccessUnit) { + fn observe_capture(&mut self, capture: &EncodedIngressCapture) { if !self.enabled { return; } let now = Instant::now(); - let payload = access_unit.payload.as_ref(); - let payload_len = payload.len(); - let nal_summary = NalSummary::from_annex_b(access_unit.codec, payload); - let is_keyframe = access_unit.frame_type == EncodedFrameType::Key; + let payload_len = capture.payload_len; + let is_keyframe = capture.frame_type == EncodedFrameType::Key; let timestamp_gap_us = - self.last_timestamp_us.map(|last| access_unit.timestamp_us.saturating_sub(last)); + self.last_timestamp_us.map(|last| capture.timestamp_us.saturating_sub(last)); self.total_frames += 1; self.report_frames += 1; @@ -1139,38 +917,18 @@ impl AccessUnitDiagnostics { } if is_keyframe { - if matches!(access_unit.codec, EncodedVideoCodec::H264 | EncodedVideoCodec::H265) - && nal_summary.missing_recovery_parameter_set() - { - self.report_missing_parameter_keyframes += 1; - log::warn!( - "{} keyframe {} missing recovery parameter sets: {}", - self.source_label, - self.total_frames, - nal_summary.describe(access_unit.codec) - ); - } else { - log::info!( - "{} keyframe {} ts={} size={} {}", - self.source_label, - self.total_frames, - access_unit.timestamp_us, - payload_len, - nal_summary.describe(access_unit.codec) - ); - } - } else if nal_summary.contains_key_picture { - log::warn!( - "{} access unit {} contains a key picture but is marked delta: {}", + log::info!( + "{} keyframe {} ts={} size={}", self.source_label, self.total_frames, - nal_summary.describe(access_unit.codec) + capture.timestamp_us, + payload_len ); } self.warn_if_keyframe_gap(now); 
self.last_wall_time = Some(now); - self.last_timestamp_us = Some(access_unit.timestamp_us); + self.last_timestamp_us = Some(capture.timestamp_us); self.report_if_due(now); } @@ -1215,9 +973,11 @@ impl AccessUnitDiagnostics { { self.last_keyframe_warning = Some(now); log::warn!( - "{} no keyframe for {:.1}s; passthrough cannot satisfy PLI without upstream IDR", + "{} no keyframe for {:.1}s; {} downstream keyframe request(s) forwarded to the \ + source so far", self.source_label, - keyframe_gap.as_secs_f64() + keyframe_gap.as_secs_f64(), + self.keyframe_requests_forwarded.load(Ordering::Relaxed) ); } } @@ -1234,7 +994,7 @@ impl AccessUnitDiagnostics { log::info!( "{} diagnostics: frames={} fps={:.1} keys={} avg_size={} max_size={} \ max_source_wait={:.1}ms max_publish_gap={:.1}ms max_ts_gap={:.1}ms stalls={} \ - bursts={} missing_param_keys={}", + bursts={} keyframe_requests={}", self.source_label, self.report_frames, fps, @@ -1246,7 +1006,7 @@ impl AccessUnitDiagnostics { self.report_max_timestamp_gap_us as f64 / 1000.0, self.report_stalls, self.report_bursts, - self.report_missing_parameter_keyframes + self.keyframe_requests_forwarded.load(Ordering::Relaxed) ); self.reset_report(now); } @@ -1262,7 +1022,6 @@ impl AccessUnitDiagnostics { self.report_max_timestamp_gap_us = 0; self.report_stalls = 0; self.report_bursts = 0; - self.report_missing_parameter_keyframes = 0; } fn finish(&mut self) { @@ -1279,102 +1038,6 @@ impl AccessUnitDiagnostics { } } -#[derive(Debug, Default)] -struct NalSummary { - nal_count: usize, - vcl_count: usize, - aud_count: usize, - vps_count: usize, - sps_count: usize, - pps_count: usize, - contains_key_picture: bool, -} - -impl NalSummary { - fn from_annex_b(codec: EncodedVideoCodec, payload: &[u8]) -> Self { - let mut summary = Self::default(); - for range in annex_b_nal_ranges(payload) { - let nal = &payload[range]; - if nal.is_empty() { - continue; - } - - match codec { - EncodedVideoCodec::H264 => summary.observe_h264(nal[0] & 0x1f), - 
EncodedVideoCodec::H265 => { - if nal.len() >= 2 { - summary.observe_h265((nal[0] >> 1) & 0x3f); - } - } - EncodedVideoCodec::VP8 | EncodedVideoCodec::VP9 | EncodedVideoCodec::AV1 => {} - _ => {} - } - } - summary - } - - fn observe_h264(&mut self, nal_type: u8) { - self.nal_count += 1; - if (1..=5).contains(&nal_type) { - self.vcl_count += 1; - } - match nal_type { - 5 => self.contains_key_picture = true, - 7 => self.sps_count += 1, - 8 => self.pps_count += 1, - 9 => self.aud_count += 1, - _ => {} - } - } - - fn observe_h265(&mut self, nal_type: u8) { - self.nal_count += 1; - if nal_type <= 31 { - self.vcl_count += 1; - } - match nal_type { - 16..=21 => self.contains_key_picture = true, - 32 => self.vps_count += 1, - 33 => self.sps_count += 1, - 34 => self.pps_count += 1, - 35 => self.aud_count += 1, - _ => {} - } - } - - fn missing_recovery_parameter_set(&self) -> bool { - self.sps_count == 0 || self.pps_count == 0 - } - - fn describe(&self, codec: EncodedVideoCodec) -> String { - match codec { - EncodedVideoCodec::H264 => format!( - "nals={} vcl={} aud={} sps={} pps={} key_picture={}", - self.nal_count, - self.vcl_count, - self.aud_count, - self.sps_count, - self.pps_count, - self.contains_key_picture - ), - EncodedVideoCodec::H265 => format!( - "nals={} vcl={} aud={} vps={} sps={} pps={} key_picture={}", - self.nal_count, - self.vcl_count, - self.aud_count, - self.vps_count, - self.sps_count, - self.pps_count, - self.contains_key_picture - ), - EncodedVideoCodec::VP8 | EncodedVideoCodec::VP9 | EncodedVideoCodec::AV1 => { - "non-H26x payload".to_string() - } - _ => "unknown encoded payload".to_string(), - } - } -} - fn validate_dimensions(width: u32, height: u32) -> Result<()> { if width == 0 || height == 0 { bail!("--width and --height must be greater than zero"); @@ -1422,10 +1085,6 @@ fn current_time_us() -> i64 { mod tests { use super::*; - fn init_gstreamer_for_test() { - gst::init().expect("failed to initialize GStreamer"); - } - #[test] fn 
gstreamer_pipeline_description_routes_test_source_to_h264_appsink() { let description = @@ -1437,7 +1096,7 @@ mod tests { assert!(description.contains("video/x-raw,format=I420")); assert!(description.contains("x264enc")); assert!(description.contains("video/x-h264,stream-format=byte-stream,alignment=au")); - assert!(description.contains(&format!("appsink name={GSTREAMER_APPSINK_NAME}"))); + assert!(description.contains(&format!("appsink name={ENCODED_APPSINK_NAME}"))); } #[test] @@ -1451,7 +1110,7 @@ mod tests { assert!(description.contains("x265enc")); assert!(description.contains("h265parse config-interval=-1")); assert!(description.contains("video/x-h265,stream-format=byte-stream,alignment=au")); - assert!(description.contains(&format!("appsink name={GSTREAMER_APPSINK_NAME}"))); + assert!(description.contains(&format!("appsink name={ENCODED_APPSINK_NAME}"))); } #[test] @@ -1460,20 +1119,20 @@ mod tests { assert!(vp8.contains("video/x-raw,format=I420")); assert!(vp8.contains("vp8enc")); assert!(vp8.contains("video/x-vp8")); - assert!(vp8.contains(&format!("appsink name={GSTREAMER_APPSINK_NAME}"))); + assert!(vp8.contains(&format!("appsink name={ENCODED_APPSINK_NAME}"))); let vp9 = gstreamer_test_pipeline_description(320, 180, 30, EncodedVideoCodec::VP9, None); assert!(vp9.contains("video/x-raw,format=I420")); assert!(vp9.contains("vp9enc")); assert!(vp9.contains("video/x-vp9,profile=(string)0")); - assert!(vp9.contains(&format!("appsink name={GSTREAMER_APPSINK_NAME}"))); + assert!(vp9.contains(&format!("appsink name={ENCODED_APPSINK_NAME}"))); let av1 = gstreamer_test_pipeline_description(320, 180, 30, EncodedVideoCodec::AV1, None); assert!(av1.contains("video/x-raw,format=I420")); assert!(av1.contains("av1enc")); assert!(av1.contains("av1parse")); assert!(av1.contains("video/x-av1,stream-format=obu-stream,alignment=tu")); - assert!(av1.contains(&format!("appsink name={GSTREAMER_APPSINK_NAME}"))); + assert!(av1.contains(&format!("appsink 
name={ENCODED_APPSINK_NAME}"))); } #[test] @@ -1496,94 +1155,26 @@ mod tests { let h264 = gstreamer_shmsink_pipeline_description( "/tmp/livekit h264.shm", EncodedVideoCodec::H264, - ) - .unwrap(); + ); assert!(h264.contains("shmsrc socket-path=\"/tmp/livekit h264.shm\"")); assert!(h264.contains("is-live=true do-timestamp=true")); assert!(h264.contains("capsfilter caps=")); assert!(h264.contains("video/x-h264,stream-format=byte-stream,alignment=au")); let vp8 = - gstreamer_shmsink_pipeline_description("/tmp/livekit-vp8.shm", EncodedVideoCodec::VP8) - .unwrap(); + gstreamer_shmsink_pipeline_description("/tmp/livekit-vp8.shm", EncodedVideoCodec::VP8); assert!(vp8.contains("shmsrc socket-path=/tmp/livekit-vp8.shm")); assert!(vp8.contains("video/x-vp8")); let vp9 = - gstreamer_shmsink_pipeline_description("/tmp/livekit-vp9.shm", EncodedVideoCodec::VP9) - .unwrap(); + gstreamer_shmsink_pipeline_description("/tmp/livekit-vp9.shm", EncodedVideoCodec::VP9); assert!(vp9.contains("video/x-vp9,profile=(string)0")); let av1 = - gstreamer_shmsink_pipeline_description("/tmp/livekit-av1.shm", EncodedVideoCodec::AV1) - .unwrap(); + gstreamer_shmsink_pipeline_description("/tmp/livekit-av1.shm", EncodedVideoCodec::AV1); assert!(av1.contains("video/x-av1,stream-format=obu-stream,alignment=tu")); } - #[test] - fn gstreamer_caps_detect_h264_avc_sample_format() { - init_gstreamer_for_test(); - let caps = gst::Caps::builder("video/x-h264") - .field("stream-format", "avc") - .field("alignment", "au") - .build(); - let structure = caps.iter().next().unwrap(); - - assert_eq!( - sample_format_from_caps_structure(structure).unwrap(), - Some(GStreamerSampleFormat::H264Avc { nal_length_size: 4 }) - ); - } - - #[test] - fn gstreamer_caps_detect_vp8_vp9_and_av1_sample_formats() { - init_gstreamer_for_test(); - for (caps_name, codec) in [ - ("video/x-vp8", EncodedVideoCodec::VP8), - ("video/x-vp9", EncodedVideoCodec::VP9), - ("video/x-av1", EncodedVideoCodec::AV1), - ] { - let caps = if codec == 
EncodedVideoCodec::AV1 { - gst::Caps::builder(caps_name).field("stream-format", "obu-stream").build() - } else { - gst::Caps::builder(caps_name).build() - }; - let structure = caps.iter().next().unwrap(); - - assert_eq!( - sample_format_from_caps_structure(structure).unwrap(), - Some(GStreamerSampleFormat::AccessUnit { codec }) - ); - } - } - - #[test] - fn gstreamer_caps_reject_av1_annexb_for_appsink_passthrough() { - init_gstreamer_for_test(); - let caps = gst::Caps::builder("video/x-av1").field("stream-format", "annexb").build(); - let structure = caps.iter().next().unwrap(); - - let err = sample_format_from_caps_structure(structure).unwrap_err(); - assert!(err.to_string().contains("unsupported GStreamer AV1 stream-format")); - } - - #[test] - fn gstreamer_caps_reject_nonzero_vp9_profile_for_appsink_passthrough() { - init_gstreamer_for_test(); - let caps = gst::Caps::builder("video/x-vp9").field("profile", "1").build(); - let structure = caps.iter().next().unwrap(); - - let err = sample_format_from_caps_structure(structure).unwrap_err(); - assert!(err.to_string().contains("unsupported GStreamer VP9 profile")); - } - - #[test] - fn gstreamer_avc_codec_data_sets_nal_length_size() { - assert_eq!(h264_avc_nal_length_size_from_codec_data(&[1, 0, 0, 0, 0xfc]), Some(1)); - assert_eq!(h264_avc_nal_length_size_from_codec_data(&[1, 0, 0, 0, 0xfd]), Some(2)); - assert_eq!(h264_avc_nal_length_size_from_codec_data(&[1, 0, 0, 0, 0xff]), Some(4)); - } - #[test] fn gstreamer_test_source_pulls_h264_access_units_when_plugins_are_available() { let frame_interval_us = frame_interval_us(30).unwrap(); diff --git a/libwebrtc/src/native/video_source.rs b/libwebrtc/src/native/video_source.rs index f0dee2f4d..446214fdc 100644 --- a/libwebrtc/src/native/video_source.rs +++ b/libwebrtc/src/native/video_source.rs @@ -54,6 +54,24 @@ struct VideoSourceInner { impl NativeVideoSource { pub fn new(resolution: VideoResolution, is_screencast: bool) -> NativeVideoSource { + 
Self::new_inner(resolution, is_screencast, true) + } + + /// Creates a source for pre-encoded access units. + /// + /// Unlike [`NativeVideoSource::new`], no raw black-frame keepalive is + /// injected before the first capture: raw frames would start a real + /// encoder on a sender meant for the pass-through encoder and corrupt + /// the encoded stream. + pub fn new_encoded(resolution: VideoResolution) -> NativeVideoSource { + Self::new_inner(resolution, false, false) + } + + fn new_inner( + resolution: VideoResolution, + is_screencast: bool, + raw_keepalive: bool, + ) -> NativeVideoSource { let source = Self { sys_handle: vt_sys::ffi::new_video_track_source( &vt_sys::ffi::VideoResolution::from(resolution.clone()), @@ -62,39 +80,41 @@ impl NativeVideoSource { inner: Arc::new(Mutex::new(VideoSourceInner { captured_frames: 0 })), }; - livekit_runtime::spawn({ - let source = source.clone(); - let i420 = I420Buffer::new(resolution.width, resolution.height); - async move { - let mut interval = interval(Duration::from_millis(100)); // 10 fps + if raw_keepalive { + livekit_runtime::spawn({ + let source = source.clone(); + let i420 = I420Buffer::new(resolution.width, resolution.height); + async move { + let mut interval = interval(Duration::from_millis(100)); // 10 fps - loop { - interval.tick().await; + loop { + interval.tick().await; - let inner = source.inner.lock(); - if inner.captured_frames > 0 { - break; - } + let inner = source.inner.lock(); + if inner.captured_frames > 0 { + break; + } - let mut builder = vf_sys::ffi::new_video_frame_builder(); - builder.pin_mut().set_rotation(VideoRotation::VideoRotation0); - builder.pin_mut().set_video_frame_buffer(i420.as_ref().sys_handle()); - - let now = SystemTime::now().duration_since(UNIX_EPOCH).unwrap(); - builder.pin_mut().set_timestamp_us(now.as_micros() as i64); - - source.sys_handle.on_captured_frame( - &builder.pin_mut().build(), - &vt_sys::ffi::FrameMetadata { - has_packet_trailer: false, - user_timestamp: 0, - 
frame_id: 0, - user_data: Vec::new(), - }, - ); + let mut builder = vf_sys::ffi::new_video_frame_builder(); + builder.pin_mut().set_rotation(VideoRotation::VideoRotation0); + builder.pin_mut().set_video_frame_buffer(i420.as_ref().sys_handle()); + + let now = SystemTime::now().duration_since(UNIX_EPOCH).unwrap(); + builder.pin_mut().set_timestamp_us(now.as_micros() as i64); + + source.sys_handle.on_captured_frame( + &builder.pin_mut().build(), + &vt_sys::ffi::FrameMetadata { + has_packet_trailer: false, + user_timestamp: 0, + frame_id: 0, + user_data: Vec::new(), + }, + ); + } } - } - }); + }); + } source } @@ -164,9 +184,9 @@ impl NativeVideoSource { &vt_sys::ffi::EncodedVideoFrameData { codec: frame.codec.into(), frame_type: frame.frame_type.into(), - payload: frame.payload.to_vec(), timestamp_us: capture_ts, }, + frame.payload, &vt_sys::ffi::FrameMetadata { has_packet_trailer: has_trailer, user_timestamp: user_ts, @@ -176,6 +196,13 @@ impl NativeVideoSource { ) } + /// Returns and clears the pending keyframe request raised by the + /// pass-through encoder (PLI/FIR or reconfiguration). Poll from the + /// capture loop and forward the request to the upstream encoder. + pub fn take_keyframe_request(&self) -> bool { + self.sys_handle.take_keyframe_request() + } + /// Captures a Jetson DMA-buffer backed video frame. /// /// `pixel_format` is `0` for NV12 and `1` for YUV420M. diff --git a/libwebrtc/src/video_source.rs b/libwebrtc/src/video_source.rs index 23f7ae62f..e9d374b36 100644 --- a/libwebrtc/src/video_source.rs +++ b/libwebrtc/src/video_source.rs @@ -75,6 +75,12 @@ pub mod native { Self { handle: vs_imp::NativeVideoSource::new(resolution, is_screencast) } } + /// Creates a source for pre-encoded access units: no raw black-frame + /// keepalive is injected before the first capture. 
+ pub fn new_encoded(resolution: VideoResolution) -> Self { + Self { handle: vs_imp::NativeVideoSource::new_encoded(resolution) } + } + pub fn capture_frame>(&self, frame: &VideoFrame) { self.handle.capture_frame(frame) } @@ -84,6 +90,12 @@ pub mod native { self.handle.capture_encoded_frame(frame) } + /// Returns and clears the pending keyframe request raised by the + /// pass-through encoder (PLI/FIR or reconfiguration). + pub fn take_keyframe_request(&self) -> bool { + self.handle.take_keyframe_request() + } + /// Captures a Jetson DMA-buffer backed video frame. /// /// `pixel_format` is `0` for NV12 and `1` for YUV420M. diff --git a/livekit-capture/Cargo.toml b/livekit-capture/Cargo.toml index edfb2cc7f..e4d78d499 100644 --- a/livekit-capture/Cargo.toml +++ b/livekit-capture/Cargo.toml @@ -8,13 +8,13 @@ edition.workspace = true repository.workspace = true [dependencies] -base64 = { workspace = true } +base64 = { workspace = true, optional = true } bytes = { workspace = true } gstreamer = { workspace = true, optional = true } gstreamer-app = { workspace = true, optional = true } image = { workspace = true, optional = true } livekit = { workspace = true } -md-5 = { workspace = true } +md-5 = { workspace = true, optional = true } thiserror = { workspace = true } yuv-sys = { workspace = true, features = ["jpeg"], optional = true } @@ -58,7 +58,7 @@ avfoundation = [ ] gstreamer = ["dep:gstreamer", "dep:gstreamer-app"] libargus = [] -rtsp = [] +rtsp = ["dep:base64", "dep:md-5"] tcpsink = [] v4l = ["dep:image", "dep:libc", "dep:v4l", "dep:yuv-sys"] diff --git a/livekit-capture/README.md b/livekit-capture/README.md index 86ca61a43..6d81f3bc9 100644 --- a/livekit-capture/README.md +++ b/livekit-capture/README.md @@ -4,6 +4,29 @@ Capture helpers for publishing decoded, native platform, DMA-BUF, and pre-encoded video frames with the LiveKit Rust SDK. Optional source features include `avfoundation`, `libargus`, `v4l`, `tcpsink`, `rtsp`, and `gstreamer`. 
+## Library entry points + +- `VideoCaptureSource::open(CaptureSourceOptions)` — one facade over every + backend. Camera backends (`AvFoundation`, `V4l2`, `LibArgus`, `Auto`) are + selected with device/format options; the encoded ingest backends (`Rtsp`, + `Tcp`, `Gstreamer`) take an `EncodedEndpoint` describing the URL, socket, or + `gst-launch` description. `publish_next(&track)` pumps one frame and returns + `Ok(false)` at end of stream; `stop()` interrupts a blocked capture. +- `VideoCaptureTrack::new` for decoded/native/DMA-BUF publishing and + `VideoCaptureTrack::new_encoded` for pre-encoded passthrough (no raw + keepalive frames, so the sender starts directly on the passthrough encoder). +- `EncodedIngress` — the lower-level pre-encoded pump used when the caller + manages its own source: `capture_next()` reports each published access unit, + `stop_handle()` cancels from any thread, and downstream keyframe requests + (PLI/FIR) are forwarded to the source automatically. The GStreamer source + answers them with a `GstForceKeyUnit` upstream event; passthrough is + single-layer (`L1T1`), and access units carrying other layering metadata are + rejected. +- `sources::gstreamer::ensure_encoded_appsink` and friends turn an arbitrary + pipeline (containing `appsink name=lk_appsink` or one unlinked encoded pad) + into an encoded source; `encoded_caps_string` is the single per-codec caps + table. + ## Pre-encoded source modes The `preencode_publish` example publishes H.264, H.265, VP8, VP9, and AV1 diff --git a/livekit-capture/src/device.rs b/livekit-capture/src/device.rs index 6907a7123..734f85834 100644 --- a/livekit-capture/src/device.rs +++ b/livekit-capture/src/device.rs @@ -195,10 +195,6 @@ impl std::str::FromStr for CaptureFrameFormat { #[error("unknown capture frame format")] pub struct CaptureFrameFormatParseError; -/// Deprecated alias for [`CaptureFrameFormat`]. 
-#[deprecated(note = "use CaptureFrameFormat")] -pub type CapturePixelFormat = CaptureFrameFormat; - /// Pixel dimensions for a capture format. #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub struct CaptureResolution { diff --git a/livekit-capture/src/encoded.rs b/livekit-capture/src/encoded.rs index d2e2059c2..11843551a 100644 --- a/livekit-capture/src/encoded.rs +++ b/livekit-capture/src/encoded.rs @@ -140,6 +140,26 @@ impl Default for CodecSpecific { } } +impl CodecSpecific { + /// Returns the single-layer default metadata for a codec, matching what + /// the passthrough encoder synthesizes on the wire. + pub fn default_for(codec: EncodedVideoCodec) -> Self { + match codec { + EncodedVideoCodec::H264 => { + Self::H264 { packetization_mode: H264PacketizationMode::NonInterleaved } + } + EncodedVideoCodec::H265 => Self::H265, + EncodedVideoCodec::VP8 => Self::VP8 { temporal_id: None, layer_sync: false }, + EncodedVideoCodec::VP9 => { + Self::VP9 { temporal_id: None, spatial_id: None, inter_layer_predicted: None } + } + EncodedVideoCodec::AV1 => { + Self::AV1 { scalability_mode: Some("L1T1".to_owned()), dependency_descriptor: None } + } + } + } +} + /// Borrowed encoded payload fragment. 
#[derive(Debug, Clone, Copy, PartialEq, Eq)] pub struct EncodedFragment<'a> { @@ -307,26 +327,7 @@ impl<'a> EncodedAccessUnit<'a> { width: u32, height: u32, ) -> Result, CaptureError> { - let mut is_key = false; - for nal in nal_units { - let nal_type = h264_nal_type(nal)?; - if nal_type == 5 { - is_key = true; - } - } - - Ok(EncodedAccessUnit { - codec: EncodedVideoCodec::H264, - payload: EncodedPayload::Owned(annex_b_payload(nal_units)?), - timestamp_us, - frame_type: if is_key { EncodedFrameType::Key } else { EncodedFrameType::Delta }, - width, - height, - layers: EncodedLayerInfo::default(), - codec_specific: CodecSpecific::H264 { - packetization_mode: H264PacketizationMode::NonInterleaved, - }, - }) + Self::from_nalus(EncodedVideoCodec::H264, nal_units, timestamp_us, width, height) } /// Creates an H.265 access unit from raw NAL-unit payloads. @@ -336,27 +337,49 @@ impl<'a> EncodedAccessUnit<'a> { width: u32, height: u32, ) -> Result, CaptureError> { - let mut is_key = false; - for nal in nal_units { - let nal_type = h265_nal_type(nal)?; - if (16..=21).contains(&nal_type) { - is_key = true; - } - } + Self::from_nalus(EncodedVideoCodec::H265, nal_units, timestamp_us, width, height) + } + fn from_nalus( + codec: EncodedVideoCodec, + nal_units: &[&[u8]], + timestamp_us: i64, + width: u32, + height: u32, + ) -> Result, CaptureError> { + let is_key = is_keyframe_nalus(codec, nal_units)?; Ok(EncodedAccessUnit { - codec: EncodedVideoCodec::H265, + codec, payload: EncodedPayload::Owned(annex_b_payload(nal_units)?), timestamp_us, frame_type: if is_key { EncodedFrameType::Key } else { EncodedFrameType::Delta }, width, height, layers: EncodedLayerInfo::default(), - codec_specific: CodecSpecific::H265, + codec_specific: CodecSpecific::default_for(codec), }) } } +/// Returns true when any NAL unit in the slice is an intra/key picture. 
+pub(crate) fn is_keyframe_nalus( + codec: EncodedVideoCodec, + nal_units: &[&[u8]], +) -> Result { + match codec { + EncodedVideoCodec::H264 => { + nal_units.iter().try_fold(false, |is_key, nal| Ok(is_key || h264_nal_type(nal)? == 5)) + } + EncodedVideoCodec::H265 => nal_units.iter().try_fold(false, |is_key, nal| { + let nal_type = h265_nal_type(nal)?; + Ok(is_key || (16..=21).contains(&nal_type)) + }), + EncodedVideoCodec::VP8 | EncodedVideoCodec::VP9 | EncodedVideoCodec::AV1 => { + Err(CaptureError::UnsupportedCodec(codec)) + } + } +} + impl From for VideoCodec { fn from(value: EncodedVideoCodec) -> Self { match value { diff --git a/livekit-capture/src/encoded/h26x.rs b/livekit-capture/src/encoded/h26x.rs index 866c6dd18..1b3e96970 100644 --- a/livekit-capture/src/encoded/h26x.rs +++ b/livekit-capture/src/encoded/h26x.rs @@ -18,17 +18,43 @@ use bytes::Bytes; use crate::{ encoded::{ - annex_b_payload, h264_nal_type, h265_nal_type, CodecSpecific, EncodedFrameType, - EncodedVideoCodec, H264PacketizationMode, OwnedEncodedAccessUnit, + annex_b_payload, h264_nal_type, h265_nal_type, is_keyframe_nalus, CodecSpecific, + EncodedFrameType, EncodedVideoCodec, OwnedEncodedAccessUnit, }, error::CaptureError, }; +/// Upper bound on bytes buffered while waiting for an access-unit boundary. +const MAX_PENDING_ACCESS_UNIT_BYTES: usize = 32 * 1024 * 1024; + +/// Byte-stream access-unit parser shared by the encoded ingest sources. +/// +/// `push` appends bytes and returns at most one completed access unit; call +/// `drain` repeatedly to pull further access units already buffered, and +/// `flush` once at end of stream to emit the final pending access unit. +pub(crate) trait AccessUnitParser { + /// Appends bytes and returns the next complete access unit, if any. + fn push(&mut self, bytes: &[u8]) -> Result, CaptureError>; + + /// Returns the next complete access unit from already-buffered bytes. 
+ fn drain(&mut self) -> Result, CaptureError> { + self.push(&[]) + } + + /// Flushes remaining buffered bytes as the final access unit. + fn flush(&mut self) -> Result, CaptureError>; +} + /// H26x Annex-B parser state. #[derive(Debug, Clone)] pub struct AnnexBAccessUnitParser { codec: EncodedVideoCodec, pending: Vec, + /// NAL ranges found in `pending`; the last range's end is provisional + /// until the next start code (or flush) confirms it. + nal_ranges: Vec>, + /// Offset up to which `pending` has been scanned for start codes. + scan_cursor: usize, next_timestamp_us: i64, frame_interval_us: i64, width: u32, @@ -40,6 +66,10 @@ pub struct AnnexBAccessUnitParser { #[derive(Debug, Clone)] pub(crate) struct AvcAccessUnitParser { pending: Vec, + /// Complete NAL ranges found in `pending`. + nal_ranges: Vec>, + /// Offset of the first unparsed length prefix or incomplete NAL in `pending`. + scan_cursor: usize, nal_length_size: u8, next_timestamp_us: i64, frame_interval_us: i64, @@ -66,6 +96,8 @@ impl AnnexBAccessUnitParser { Ok(Self { codec, pending: Vec::new(), + nal_ranges: Vec::new(), + scan_cursor: 0, next_timestamp_us: start_timestamp_us, frame_interval_us, width, @@ -85,19 +117,49 @@ impl AnnexBAccessUnitParser { } fn drain_next(&mut self, at_eof: bool) -> Result, CaptureError> { - let ranges = annex_b_nal_ranges(&self.pending); - if ranges.is_empty() { - return Ok(None); + self.scan_pending(); + + if let Some(split_at) = + access_unit_split_index(self.codec, &self.pending, &self.nal_ranges)? + { + return self.take_access_unit(split_at); + } + if at_eof && self.nal_ranges.iter().any(|range| range.start < range.end) { + return self.take_access_unit(self.pending.len()); + } + if !at_eof && self.pending.len() > MAX_PENDING_ACCESS_UNIT_BYTES { + return Err(CaptureError::InvalidEncodedData( + "access unit exceeds maximum buffered size", + )); } + Ok(None) + } - let Some(split_at) = access_unit_split_index(self.codec, &self.pending, &ranges)? 
else { - if at_eof { - return self.take_access_unit(self.pending.len()); + /// Scans bytes appended since the previous call, extending the cached NAL ranges. + fn scan_pending(&mut self) { + // Resume behind the previous scan end so a start code straddling the + // boundary is found, but never before the last NAL start so an + // already-found start code is not rediscovered. + let mut cursor = self.scan_cursor.saturating_sub(3); + if let Some(last) = self.nal_ranges.last() { + cursor = cursor.max(last.start); + } + while let Some((offset, prefix_len)) = find_start_code(&self.pending[cursor..]) { + let prefix_start = cursor + offset; + let nal_start = prefix_start + prefix_len; + if let Some(last) = self.nal_ranges.last_mut() { + last.end = prefix_start; + if last.start >= prefix_start { + self.nal_ranges.pop(); + } } - return Ok(None); - }; - - self.take_access_unit(split_at) + self.nal_ranges.push(nal_start..nal_start); + cursor = nal_start; + } + if let Some(last) = self.nal_ranges.last_mut() { + last.end = self.pending.len(); + } + self.scan_cursor = self.pending.len(); } fn take_access_unit( @@ -110,6 +172,15 @@ impl AnnexBAccessUnitParser { let access_unit = self.pending[..byte_len].to_vec(); self.pending.drain(..byte_len); + self.nal_ranges.retain_mut(|range| { + if range.end <= byte_len { + return false; + } + range.start -= byte_len; + range.end -= byte_len; + true + }); + self.scan_cursor -= byte_len; let timestamp_us = self.next_timestamp_us; self.next_timestamp_us = self.next_timestamp_us.saturating_add(self.frame_interval_us); access_unit_from_annex_b( @@ -123,6 +194,16 @@ impl AnnexBAccessUnitParser { } } +impl AccessUnitParser for AnnexBAccessUnitParser { + fn push(&mut self, bytes: &[u8]) -> Result, CaptureError> { + AnnexBAccessUnitParser::push(self, bytes) + } + + fn flush(&mut self) -> Result, CaptureError> { + AnnexBAccessUnitParser::flush(self) + } +} + #[cfg(any(feature = "tcpsink", test))] impl AvcAccessUnitParser { /// Creates a parser for 
H.264/AVC length-prefixed byte streams. @@ -137,6 +218,8 @@ impl AvcAccessUnitParser { Ok(Self { pending: Vec::new(), + nal_ranges: Vec::new(), + scan_cursor: 0, nal_length_size, next_timestamp_us: start_timestamp_us, frame_interval_us, @@ -160,21 +243,57 @@ impl AvcAccessUnitParser { } fn drain_next(&mut self, at_eof: bool) -> Result, CaptureError> { - let ranges = avc_nal_ranges(&self.pending, self.nal_length_size, at_eof)?; - if ranges.is_empty() { - return Ok(None); + self.scan_pending(at_eof)?; + + if let Some(split_at) = avc_access_unit_split_index( + &self.pending, + &self.nal_ranges, + self.nal_length_size as usize, + )? { + return self.take_access_unit(split_at); } + if at_eof && !self.nal_ranges.is_empty() { + return self.take_access_unit(self.pending.len()); + } + if !at_eof && self.pending.len() > MAX_PENDING_ACCESS_UNIT_BYTES { + return Err(CaptureError::InvalidEncodedData( + "access unit exceeds maximum buffered size", + )); + } + Ok(None) + } + + /// Parses length-prefixed NAL units appended since the previous call. + fn scan_pending(&mut self, at_eof: bool) -> Result<(), CaptureError> { + let nal_length_size = self.nal_length_size as usize; + while self.scan_cursor < self.pending.len() { + if self.pending.len() - self.scan_cursor < nal_length_size { + if at_eof { + return Err(CaptureError::InvalidEncodedData("truncated AVC NAL length")); + } + break; + } - let Some(split_at) = - avc_access_unit_split_index(&self.pending, &ranges, self.nal_length_size as usize)? 
- else { - if at_eof { - return self.take_access_unit(self.pending.len()); + let nal_start = self.scan_cursor + nal_length_size; + let nal_len = read_avc_nal_length(&self.pending[self.scan_cursor..nal_start]); + if nal_len == 0 { + return Err(CaptureError::InvalidEncodedData("empty AVC NAL unit")); + } + + let Some(nal_end) = nal_start.checked_add(nal_len) else { + return Err(CaptureError::InvalidEncodedData("AVC NAL unit length overflow")); + }; + if nal_end > self.pending.len() { + if at_eof { + return Err(CaptureError::InvalidEncodedData("truncated AVC NAL unit")); + } + break; } - return Ok(None); - }; - self.take_access_unit(split_at) + self.nal_ranges.push(nal_start..nal_end); + self.scan_cursor = nal_end; + } + Ok(()) } fn take_access_unit( @@ -187,6 +306,15 @@ impl AvcAccessUnitParser { let access_unit = self.pending[..byte_len].to_vec(); self.pending.drain(..byte_len); + self.nal_ranges.retain_mut(|range| { + if range.end <= byte_len { + return false; + } + range.start -= byte_len; + range.end -= byte_len; + true + }); + self.scan_cursor -= byte_len; let timestamp_us = self.next_timestamp_us; self.next_timestamp_us = self.next_timestamp_us.saturating_add(self.frame_interval_us); access_unit_from_h264_avc( @@ -200,6 +328,17 @@ impl AvcAccessUnitParser { } } +#[cfg(any(feature = "tcpsink", test))] +impl AccessUnitParser for AvcAccessUnitParser { + fn push(&mut self, bytes: &[u8]) -> Result, CaptureError> { + AvcAccessUnitParser::push(self, bytes) + } + + fn flush(&mut self) -> Result, CaptureError> { + AvcAccessUnitParser::flush(self) + } +} + /// Returns NAL-unit byte ranges for an Annex-B access unit or stream chunk. 
pub fn annex_b_nal_ranges(bytes: &[u8]) -> Vec> { let mut ranges = Vec::new(); @@ -267,15 +406,7 @@ pub fn access_unit_from_annex_b( }; let mut access_unit = OwnedEncodedAccessUnit::new(codec, payload, timestamp_us, frame_type, width, height); - access_unit.codec_specific = match codec { - EncodedVideoCodec::H264 => { - CodecSpecific::H264 { packetization_mode: H264PacketizationMode::NonInterleaved } - } - EncodedVideoCodec::H265 => CodecSpecific::H265, - EncodedVideoCodec::VP8 | EncodedVideoCodec::VP9 | EncodedVideoCodec::AV1 => { - return Err(CaptureError::UnsupportedCodec(codec)); - } - }; + access_unit.codec_specific = CodecSpecific::default_for(codec); Ok(access_unit) } @@ -294,18 +425,7 @@ pub fn access_unit_from_nalus( /// Returns true when an Annex-B access unit contains an intra/key picture. pub fn is_keyframe_annex_b(codec: EncodedVideoCodec, bytes: &[u8]) -> Result { let nals = annex_b_nalus(bytes)?; - match codec { - EncodedVideoCodec::H264 => { - nals.iter().try_fold(false, |is_key, nal| Ok(is_key || h264_nal_type(nal)? == 5)) - } - EncodedVideoCodec::H265 => nals.iter().try_fold(false, |is_key, nal| { - let nal_type = h265_nal_type(nal)?; - Ok(is_key || (16..=21).contains(&nal_type)) - }), - EncodedVideoCodec::VP8 | EncodedVideoCodec::VP9 | EncodedVideoCodec::AV1 => { - Err(CaptureError::UnsupportedCodec(codec)) - } - } + is_keyframe_nalus(codec, &nals) } fn access_unit_split_index( @@ -313,20 +433,10 @@ fn access_unit_split_index( bytes: &[u8], ranges: &[Range], ) -> Result, CaptureError> { - if ranges.len() < 2 { - return Ok(None); - } - - let first_nal = &bytes[ranges[0].clone()]; - let mut seen_vcl = is_vcl_nal(codec, first_nal)?; - for range in ranges.iter().skip(1) { - let nal = &bytes[range.clone()]; - if is_access_unit_delimiter(codec, nal)? && seen_vcl { - return split_start_code_index(bytes, range.start).map(Some); - } - seen_vcl |= is_vcl_nal(codec, nal)?; + match access_unit_boundary_nal(codec, bytes, ranges)? 
{ + Some(index) => split_start_code_index(bytes, ranges[index].start).map(Some), + None => Ok(None), } - Ok(None) } #[cfg(any(feature = "tcpsink", test))] @@ -335,46 +445,84 @@ fn avc_access_unit_split_index( ranges: &[Range], nal_length_size: usize, ) -> Result, CaptureError> { - if ranges.len() < 2 { - return Ok(None); + match access_unit_boundary_nal(EncodedVideoCodec::H264, bytes, ranges)? { + Some(index) => ranges[index] + .start + .checked_sub(nal_length_size) + .ok_or(CaptureError::InvalidEncodedData("missing AVC NAL length")) + .map(Some), + None => Ok(None), } +} - let first_nal = &bytes[ranges[0].clone()]; - let mut seen_vcl = is_vcl_nal(EncodedVideoCodec::H264, first_nal)?; - for range in ranges.iter().skip(1) { +/// Returns the index of the first NAL that starts a new access unit, once at +/// least one VCL NAL has been seen in the current one. +fn access_unit_boundary_nal( + codec: EncodedVideoCodec, + bytes: &[u8], + ranges: &[Range], +) -> Result, CaptureError> { + let mut seen_vcl = false; + for (index, range) in ranges.iter().enumerate() { let nal = &bytes[range.clone()]; - if is_access_unit_delimiter(EncodedVideoCodec::H264, nal)? && seen_vcl { - return range - .start - .checked_sub(nal_length_size) - .ok_or(CaptureError::InvalidEncodedData("missing AVC NAL length")) - .map(Some); + // The final NAL may still be streaming in; wait for its header. + if index + 1 == ranges.len() && nal.len() < min_nal_header_len(codec) { + return Ok(None); } - seen_vcl |= is_vcl_nal(EncodedVideoCodec::H264, nal)?; + if seen_vcl && starts_new_access_unit(codec, nal)? 
{ + return Ok(Some(index)); + } + seen_vcl |= is_vcl_nal(codec, nal)?; } Ok(None) } -fn split_start_code_index(bytes: &[u8], nal_start: usize) -> Result { - if nal_start >= 4 && bytes[nal_start - 4..nal_start] == [0, 0, 0, 1] { - return Ok(nal_start - 4); - } - if nal_start >= 3 && bytes[nal_start - 3..nal_start] == [0, 0, 1] { - return Ok(nal_start - 3); +fn min_nal_header_len(codec: EncodedVideoCodec) -> usize { + match codec { + EncodedVideoCodec::H265 => 2, + _ => 1, } - Err(CaptureError::InvalidEncodedData("missing Annex-B start code")) } -fn is_access_unit_delimiter(codec: EncodedVideoCodec, nal: &[u8]) -> Result { +fn starts_new_access_unit(codec: EncodedVideoCodec, nal: &[u8]) -> Result { Ok(match codec { - EncodedVideoCodec::H264 => h264_nal_type(nal)? == 9, - EncodedVideoCodec::H265 => h265_nal_type(nal)? == 35, + EncodedVideoCodec::H264 => match h264_nal_type(nal)? { + // Prefix SEI(6), SPS(7), PPS(8), and AUD(9) open a new access unit. + 6..=9 => true, + // A VCL NAL opens a new picture when first_mb_in_slice == 0: + // ue(v) == 0 is a lone 1 bit, so the first RBSP bit after the + // header is set. The header byte is nonzero, so the next byte + // cannot be an emulation-prevention byte. + 1..=5 => nal.len() >= 2 && nal[1] & 0x80 != 0, + _ => false, + }, + EncodedVideoCodec::H265 => match h265_nal_type(nal)? { + // VPS(32), SPS(33), PPS(34), AUD(35), and prefix SEI(39). + 32..=35 | 39 => true, + // A VCL NAL opens a new picture when + // first_slice_segment_in_pic_flag (the bit after the 2-byte + // header) is set. nuh_temporal_id_plus1 makes the second header + // byte nonzero, so the next byte cannot be an + // emulation-prevention byte. 
+ 0..=31 => nal.len() >= 3 && nal[2] & 0x80 != 0, + _ => false, + }, EncodedVideoCodec::VP8 | EncodedVideoCodec::VP9 | EncodedVideoCodec::AV1 => { return Err(CaptureError::UnsupportedCodec(codec)); } }) } +fn split_start_code_index(bytes: &[u8], nal_start: usize) -> Result { + if nal_start >= 4 && bytes[nal_start - 4..nal_start] == [0, 0, 0, 1] { + return Ok(nal_start - 4); + } + if nal_start >= 3 && bytes[nal_start - 3..nal_start] == [0, 0, 1] { + return Ok(nal_start - 3); + } + Err(CaptureError::InvalidEncodedData("missing Annex-B start code")) +} + fn is_vcl_nal(codec: EncodedVideoCodec, nal: &[u8]) -> Result { Ok(match codec { EncodedVideoCodec::H264 => (1..=5).contains(&h264_nal_type(nal)?), @@ -544,4 +692,227 @@ mod tests { assert_eq!(au.timestamp_us, 33_433); assert_eq!(au.payload.as_ref(), &[0, 0, 0, 1, 0x09, 0x10, 0, 0, 0, 1, 0x41, 3]); } + + #[test] + fn splits_aud_less_h264_stream_per_frame() { + let mut parser = + AnnexBAccessUnitParser::new(EncodedVideoCodec::H264, 0, 33_333, 640, 480).unwrap(); + let stream = [ + 0, 0, 0, 1, 0x67, 0x42, 0x00, 0x1e, // SPS + 0, 0, 0, 1, 0x68, 0xce, // PPS + 0, 0, 1, 0x65, 0x88, 0x84, 0x21, // IDR slice, first_mb_in_slice == 0 + 0, 0, 1, 0x41, 0x9a, 0x22, // P slice, first_mb_in_slice == 0 + 0, 0, 1, 0x41, 0x9a, 0x33, // P slice, first_mb_in_slice == 0 + ]; + + let au = parser.push(&stream).unwrap().unwrap(); + assert_eq!(au.timestamp_us, 0); + assert_eq!(au.frame_type, EncodedFrameType::Key); + assert_eq!(au.payload.as_ref(), &stream[..21]); + + let au = parser.drain().unwrap().unwrap(); + assert_eq!(au.timestamp_us, 33_333); + assert_eq!(au.frame_type, EncodedFrameType::Delta); + assert_eq!(au.payload.as_ref(), &stream[21..27]); + + let au = parser.flush().unwrap().unwrap(); + assert_eq!(au.timestamp_us, 66_666); + assert_eq!(au.payload.as_ref(), &stream[27..]); + } + + #[test] + fn keeps_multi_slice_h264_access_unit_together() { + let mut parser = + AnnexBAccessUnitParser::new(EncodedVideoCodec::H264, 0, 33_333, 640, 
480).unwrap(); + let stream = [ + 0, 0, 1, 0x65, 0x88, 0x11, // IDR slice, first_mb_in_slice == 0 + 0, 0, 1, 0x65, 0x21, 0x22, // IDR slice, first_mb_in_slice != 0 + 0, 0, 1, 0x41, 0x9a, 0x33, // next picture + ]; + + let au = parser.push(&stream).unwrap().unwrap(); + assert_eq!(au.timestamp_us, 0); + assert_eq!(au.frame_type, EncodedFrameType::Key); + assert_eq!(au.payload.as_ref(), &stream[..12]); + + let au = parser.flush().unwrap().unwrap(); + assert_eq!(au.timestamp_us, 33_333); + assert_eq!(au.payload.as_ref(), &stream[12..]); + } + + #[test] + fn splits_aud_less_h265_stream_per_frame() { + let mut parser = + AnnexBAccessUnitParser::new(EncodedVideoCodec::H265, 0, 33_333, 640, 480).unwrap(); + let stream = [ + 0, 0, 0, 1, 0x40, 0x01, 0x0c, // VPS + 0, 0, 0, 1, 0x42, 0x01, 0x02, // SPS + 0, 0, 0, 1, 0x44, 0x01, 0x03, // PPS + 0, 0, 1, 0x26, 0x01, 0xaf, + 0x04, // IDR_W_RADL, first_slice_segment_in_pic_flag == 1 + 0, 0, 1, 0x02, 0x01, 0xd0, 0x05, // TRAIL_R, first_slice_segment_in_pic_flag == 1 + ]; + + let au = parser.push(&stream).unwrap().unwrap(); + assert_eq!(au.timestamp_us, 0); + assert_eq!(au.frame_type, EncodedFrameType::Key); + assert_eq!(au.payload.as_ref(), &stream[..28]); + + let au = parser.flush().unwrap().unwrap(); + assert_eq!(au.timestamp_us, 33_333); + assert_eq!(au.frame_type, EncodedFrameType::Delta); + assert_eq!(au.payload.as_ref(), &stream[28..]); + } + + #[test] + fn keeps_multi_slice_h265_access_unit_together() { + let mut parser = + AnnexBAccessUnitParser::new(EncodedVideoCodec::H265, 0, 33_333, 640, 480).unwrap(); + let stream = [ + 0, 0, 1, 0x26, 0x01, 0xaf, + 0x11, // IDR slice, first_slice_segment_in_pic_flag == 1 + 0, 0, 1, 0x26, 0x01, 0x40, + 0x22, // IDR slice, first_slice_segment_in_pic_flag == 0 + 0, 0, 1, 0x02, 0x01, 0xd0, 0x33, // next picture + ]; + + let au = parser.push(&stream).unwrap().unwrap(); + assert_eq!(au.timestamp_us, 0); + assert_eq!(au.frame_type, EncodedFrameType::Key); + assert_eq!(au.payload.as_ref(), 
&stream[..14]); + + let au = parser.flush().unwrap().unwrap(); + assert_eq!(au.timestamp_us, 33_333); + assert_eq!(au.payload.as_ref(), &stream[14..]); + } + + #[test] + fn groups_parameter_sets_with_following_frame() { + let mut parser = + AnnexBAccessUnitParser::new(EncodedVideoCodec::H264, 0, 33_333, 640, 480).unwrap(); + let stream = [ + 0, 0, 1, 0x67, 0x42, 0x1e, // SPS + 0, 0, 1, 0x68, 0xce, // PPS + 0, 0, 1, 0x65, 0x88, 0x11, // IDR + 0, 0, 1, 0x67, 0x42, 0x1e, // SPS + 0, 0, 1, 0x68, 0xce, // PPS + 0, 0, 1, 0x65, 0x88, 0x22, // IDR + ]; + + let au = parser.push(&stream).unwrap().unwrap(); + assert_eq!(au.timestamp_us, 0); + assert_eq!(au.frame_type, EncodedFrameType::Key); + assert_eq!(au.payload.as_ref(), &stream[..17]); + + let au = parser.flush().unwrap().unwrap(); + assert_eq!(au.timestamp_us, 33_333); + assert_eq!(au.frame_type, EncodedFrameType::Key); + assert_eq!(au.payload.as_ref(), &stream[17..]); + } + + fn collect_units( + parser: &mut impl AccessUnitParser, + stream: &[u8], + chunk_size: usize, + ) -> Vec<(Vec, i64, EncodedFrameType)> { + let mut units = Vec::new(); + for chunk in stream.chunks(chunk_size) { + let mut unit = parser.push(chunk).unwrap(); + while let Some(au) = unit { + units.push((au.payload.to_vec(), au.timestamp_us, au.frame_type)); + unit = parser.drain().unwrap(); + } + } + let mut unit = parser.flush().unwrap(); + while let Some(au) = unit { + units.push((au.payload.to_vec(), au.timestamp_us, au.frame_type)); + unit = parser.flush().unwrap(); + } + units + } + + fn assert_chunked_matches_one_shot( + make_parser: impl Fn() -> P, + stream: &[u8], + expected_units: usize, + ) { + let baseline = collect_units(&mut make_parser(), stream, stream.len()); + assert_eq!(baseline.len(), expected_units); + for chunk_size in [1, 7] { + assert_eq!(collect_units(&mut make_parser(), stream, chunk_size), baseline); + } + } + + #[test] + fn chunked_pushes_match_one_shot_parsing() { + let h264_annex_b = [ + 0, 0, 0, 1, 0x67, 0x64, 0x00, 0x1e, 
// SPS + 0, 0, 0, 1, 0x68, 0xce, 0x3c, 0x80, // PPS + 0, 0, 1, 0x65, 0x88, 0x84, 0x00, 0x01, // IDR, first_mb_in_slice == 0 + 0, 0, 1, 0x41, 0x9a, 0x02, // P, first_mb_in_slice == 0 + 0, 0, 1, 0x09, 0x10, // AUD + 0, 0, 1, 0x41, 0x9a, 0x03, // P + 0, 0, 0, 1, 0x41, 0x9a, 0x04, 0x00, // P, first_mb_in_slice == 0 + ]; + assert_chunked_matches_one_shot( + || AnnexBAccessUnitParser::new(EncodedVideoCodec::H264, 0, 33_333, 640, 480).unwrap(), + &h264_annex_b, + 4, + ); + + let h265_annex_b = [ + 0, 0, 0, 1, 0x40, 0x01, 0x0c, // VPS + 0, 0, 0, 1, 0x42, 0x01, 0x02, // SPS + 0, 0, 0, 1, 0x44, 0x01, 0x03, // PPS + 0, 0, 1, 0x26, 0x01, 0xaf, 0x08, // IDR_W_RADL + 0, 0, 1, 0x02, 0x01, 0xd0, 0x09, // TRAIL_R + 0, 0, 1, 0x46, 0x01, 0x50, // AUD + 0, 0, 1, 0x02, 0x01, 0xd0, 0x0a, // TRAIL_R + ]; + assert_chunked_matches_one_shot( + || AnnexBAccessUnitParser::new(EncodedVideoCodec::H265, 0, 33_333, 640, 480).unwrap(), + &h265_annex_b, + 3, + ); + + let h264_avc = [ + 0, 0, 0, 4, 0x67, 0x64, 0x00, 0x1e, // SPS + 0, 0, 0, 2, 0x68, 0xce, // PPS + 0, 0, 0, 4, 0x65, 0x88, 0x84, 0x00, // IDR, first_mb_in_slice == 0 + 0, 0, 0, 3, 0x41, 0x9a, 0x02, // P, first_mb_in_slice == 0 + 0, 0, 0, 2, 0x09, 0x10, // AUD + 0, 0, 0, 3, 0x41, 0x9a, 0x03, // P + ]; + assert_chunked_matches_one_shot( + || AvcAccessUnitParser::new(4, 0, 33_333, 640, 480).unwrap(), + &h264_avc, + 3, + ); + } + + #[test] + fn rejects_pending_access_unit_over_size_cap() { + let mut parser = + AnnexBAccessUnitParser::new(EncodedVideoCodec::H264, 0, 33_333, 640, 480).unwrap(); + assert!(parser.push(&[0, 0, 1, 0x65, 0x88]).unwrap().is_none()); + + let err = parser.push(&vec![0xff; MAX_PENDING_ACCESS_UNIT_BYTES]).unwrap_err(); + assert_eq!( + err, + CaptureError::InvalidEncodedData("access unit exceeds maximum buffered size") + ); + } + + #[test] + fn avc_rejects_pending_access_unit_over_size_cap() { + let mut parser = AvcAccessUnitParser::new(4, 0, 33_333, 640, 480).unwrap(); + let nal_len = (MAX_PENDING_ACCESS_UNIT_BYTES + 1) 
as u32; + assert!(parser.push(&nal_len.to_be_bytes()).unwrap().is_none()); + + let err = parser.push(&vec![0x41; MAX_PENDING_ACCESS_UNIT_BYTES]).unwrap_err(); + assert_eq!( + err, + CaptureError::InvalidEncodedData("access unit exceeds maximum buffered size") + ); + } } diff --git a/livekit-capture/src/encoded/ingress.rs b/livekit-capture/src/encoded/ingress.rs index 0032e0d98..0eac3f15d 100644 --- a/livekit-capture/src/encoded/ingress.rs +++ b/livekit-capture/src/encoded/ingress.rs @@ -12,7 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. -use std::{error::Error, fmt}; +use std::{ + error::Error, + fmt, + sync::{ + atomic::{AtomicBool, Ordering}, + Arc, + }, +}; use crate::{encoded::OwnedEncodedAccessUnit, error::CaptureError, track::VideoCaptureTrack}; @@ -23,6 +30,13 @@ pub trait EncodedAccessUnitSource { /// Returns the next encoded access unit, or `Ok(None)` when the source reaches EOF. fn next_access_unit(&mut self) -> Result, Self::Error>; + + /// Forwards a downstream keyframe request (PLI/FIR, late subscriber) to + /// the producer so it can emit an IDR. + /// + /// The default implementation does nothing, for transports that cannot + /// influence the upstream encoder. + fn request_keyframe(&mut self) {} } /// Error returned while forwarding encoded access units into a track. @@ -55,17 +69,48 @@ where } } +/// Cancellation handle for [`EncodedIngress::run_until_end`]. +/// +/// Cheap to clone; wire it to a shutdown signal (e.g. Ctrl-C) and call +/// [`EncodedIngressStop::stop`] from any thread to make the ingest loop +/// return after the access unit in flight. +#[derive(Debug, Clone, Default)] +pub struct EncodedIngressStop(Arc); + +impl EncodedIngressStop { + /// Creates an un-stopped handle. + pub fn new() -> Self { + Self::default() + } + + /// Signals the ingest loop to stop. 
+ pub fn stop(&self) { + self.0.store(true, Ordering::Release); + } + + /// Returns true once [`EncodedIngressStop::stop`] has been called. + pub fn is_stopped(&self) -> bool { + self.0.load(Ordering::Acquire) + } +} + /// Pulls encoded access units from a source and forwards them into a video track. #[derive(Debug)] pub struct EncodedIngress { track: VideoCaptureTrack, source: S, + stop: EncodedIngressStop, } impl EncodedIngress { /// Creates an encoded ingress runner. pub fn new(track: VideoCaptureTrack, source: S) -> Self { - Self { track, source } + Self { track, source, stop: EncodedIngressStop::new() } + } + + /// Returns a cancellation handle for this runner. + pub fn stop_handle(&self) -> EncodedIngressStop { + self.stop.clone() } /// Returns the capture track used by this runner. @@ -89,28 +134,54 @@ impl EncodedIngress { } } +/// Details of one access unit captured by [`EncodedIngress::capture_next`]. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct EncodedIngressCapture { + /// Capture timestamp of the access unit in microseconds. + pub timestamp_us: i64, + /// Frame type of the access unit. + pub frame_type: crate::encoded::EncodedFrameType, + /// Payload size in bytes. + pub payload_len: usize, +} + impl EncodedIngress where S: EncodedAccessUnitSource, { - /// Captures the next access unit and returns `false` after source EOF. - pub fn capture_next(&mut self) -> Result> { + /// Captures the next access unit, returning `None` after source EOF. + /// + /// Downstream keyframe requests (PLI/FIR raised by the passthrough + /// encoder) are polled on every call and forwarded to the source via + /// [`EncodedAccessUnitSource::request_keyframe`]. + pub fn capture_next( + &mut self, + ) -> Result, EncodedIngressError> { + if self.track.take_keyframe_request() { + self.source.request_keyframe(); + } + let Some(access_unit) = self.source.next_access_unit().map_err(EncodedIngressError::Source)? 
else { - return Ok(false); + return Ok(None); }; self.track .capture_encoded(&access_unit.as_access_unit()) .map_err(EncodedIngressError::Capture)?; - Ok(true) + Ok(Some(EncodedIngressCapture { + timestamp_us: access_unit.timestamp_us, + frame_type: access_unit.frame_type, + payload_len: access_unit.payload.len(), + })) } - /// Captures access units until the source reaches EOF. + /// Captures access units until the source reaches EOF or the stop + /// handle fires, returning the number of captured access units. pub fn run_until_end(&mut self) -> Result> { let mut captured = 0; - while self.capture_next()? { + while !self.stop.is_stopped() && self.capture_next()?.is_some() { captured += 1; } Ok(captured) diff --git a/livekit-capture/src/encoded/rtp.rs b/livekit-capture/src/encoded/rtp.rs index 8911dd61a..1f4feddf0 100644 --- a/livekit-capture/src/encoded/rtp.rs +++ b/livekit-capture/src/encoded/rtp.rs @@ -98,26 +98,35 @@ impl<'a> RtpPacket<'a> { #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub struct RtpTimestampMapper { clock_rate: u32, - base_rtp_timestamp: Option, + last_rtp_timestamp: Option, + extended_ticks: i64, base_timestamp_us: i64, } impl RtpTimestampMapper { /// Creates an RTP timestamp mapper. pub fn new(clock_rate: u32, base_timestamp_us: i64) -> Self { - Self { clock_rate, base_rtp_timestamp: None, base_timestamp_us } + Self { clock_rate, last_rtp_timestamp: None, extended_ticks: 0, base_timestamp_us } } - /// Maps an RTP timestamp to microseconds, handling `u32` RTP timestamp rollover. + /// Maps an RTP timestamp to microseconds, unwrapping `u32` RTP timestamp + /// rollover so mapped timestamps stay monotonic across any number of wraps. 
pub fn map(&mut self, rtp_timestamp: u32) -> Result { if self.clock_rate == 0 { return Err(RtpDepacketizerError::InvalidClockRate); } - let base = *self.base_rtp_timestamp.get_or_insert(rtp_timestamp); - let delta = rtp_timestamp.wrapping_sub(base) as u64; - let delta_us = delta.saturating_mul(1_000_000) / u64::from(self.clock_rate); - Ok(self.base_timestamp_us.saturating_add(delta_us as i64)) + let last = *self.last_rtp_timestamp.get_or_insert(rtp_timestamp); + self.last_rtp_timestamp = Some(rtp_timestamp); + // Reinterpreting the wrapped u32 delta as i32 picks the nearest extended + // timestamp, which unwraps rollover while tolerating small backwards + // jumps from reordered packets. + let delta_ticks = i64::from(rtp_timestamp.wrapping_sub(last) as i32); + self.extended_ticks = self.extended_ticks.saturating_add(delta_ticks); + + let extended_us = i128::from(self.extended_ticks) * 1_000_000 / i128::from(self.clock_rate); + let extended_us = extended_us.clamp(i128::from(i64::MIN), i128::from(i64::MAX)) as i64; + Ok(self.base_timestamp_us.saturating_add(extended_us)) } } @@ -133,14 +142,6 @@ pub enum RtpDepacketizerError { /// RTP clock rate must be non-zero. #[error("RTP clock rate must be non-zero")] InvalidClockRate, - /// RTP sequence number gap was detected. - #[error("RTP sequence gap: expected {expected}, got {actual}")] - SequenceGap { - /// Expected RTP sequence number. - expected: u16, - /// Actual RTP sequence number. - actual: u16, - }, /// RTP payload format is unsupported or malformed. #[error("unsupported or malformed RTP payload")] UnsupportedPayload, @@ -158,6 +159,17 @@ pub enum RtpDepacketizerError { Capture(#[from] CaptureError), } +/// Packet-loss recovery counters for an [`RtpAccessUnitAssembler`]. +#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)] +pub struct RtpDepacketizerStats { + /// Number of RTP sequence-number gaps detected. + pub sequence_gaps: u64, + /// Number of access units dropped while recovering from packet loss. 
+ pub dropped_access_units: u64, + /// Whether output is gated until the next keyframe completes. + pub awaiting_keyframe: bool, +} + /// Reassembles RTP packets into encoded access units. #[derive(Debug, Clone)] pub struct RtpAccessUnitAssembler { @@ -170,6 +182,9 @@ pub struct RtpAccessUnitAssembler { fragment: Option, current_frame: Option, av1_fragment: Option, + awaiting_keyframe: bool, + sequence_gaps: u64, + dropped_access_units: u64, } #[derive(Debug, Clone)] @@ -222,9 +237,21 @@ impl RtpAccessUnitAssembler { fragment: None, current_frame: None, av1_fragment: None, + awaiting_keyframe: false, + sequence_gaps: 0, + dropped_access_units: 0, }) } + /// Returns packet-loss recovery counters. + pub fn stats(&self) -> RtpDepacketizerStats { + RtpDepacketizerStats { + sequence_gaps: self.sequence_gaps, + dropped_access_units: self.dropped_access_units, + awaiting_keyframe: self.awaiting_keyframe, + } + } + /// Pushes one encoded RTP packet and returns an access unit when a marker closes a frame. pub fn push( &mut self, @@ -235,11 +262,15 @@ impl RtpAccessUnitAssembler { } /// Pushes one parsed RTP packet and returns an access unit when a marker closes a frame. + /// + /// Packet loss is recovered internally: gaps and truncated fragments drop the + /// interrupted access unit and gate output on the next keyframe instead of + /// returning an error; see [`Self::stats`]. 
pub fn push_packet( &mut self, packet: RtpPacket<'_>, ) -> Result, RtpDepacketizerError> { - self.check_sequence(packet.sequence_number)?; + self.check_sequence(packet.sequence_number); match self.codec { EncodedVideoCodec::H264 => self.push_h264_payload(&packet)?, @@ -250,10 +281,12 @@ impl RtpAccessUnitAssembler { } if packet.marker { - if self.codec == EncodedVideoCodec::AV1 && self.av1_fragment.is_some() { - self.current_frame = None; - self.av1_fragment = None; - return Err(RtpDepacketizerError::InvalidFragment); + if self.fragment.is_some() || self.av1_fragment.is_some() { + // The marker closed the access unit before the open fragment's + // end arrived, so its tail packets were lost. + self.discard_in_progress(); + self.dropped_access_units += 1; + return Ok(None); } if matches!( self.codec, @@ -266,20 +299,41 @@ impl RtpAccessUnitAssembler { Ok(None) } - fn check_sequence(&mut self, sequence_number: u16) -> Result<(), RtpDepacketizerError> { + fn check_sequence(&mut self, sequence_number: u16) { let Some(expected) = self.expected_sequence_number.replace(sequence_number.wrapping_add(1)) else { - return Ok(()); + return; }; if sequence_number == expected { - return Ok(()); + return; } + self.sequence_gaps += 1; + self.discard_in_progress(); + } + + /// Discards all partially assembled state and gates output on the next keyframe. + fn discard_in_progress(&mut self) { self.current = None; self.fragment = None; self.current_frame = None; self.av1_fragment = None; - Err(RtpDepacketizerError::SequenceGap { expected, actual: sequence_number }) + self.awaiting_keyframe = true; + } + + /// Drops completed access units until a keyframe ends loss recovery. 
+ fn gate_on_keyframe( + &mut self, + access_unit: OwnedEncodedAccessUnit, + ) -> Option { + if self.awaiting_keyframe { + if access_unit.frame_type != EncodedFrameType::Key { + self.dropped_access_units += 1; + return None; + } + self.awaiting_keyframe = false; + } + Some(access_unit) } fn current_mut( @@ -387,11 +441,13 @@ impl RtpAccessUnitAssembler { return Ok(()); } - let fragment = self - .fragment - .as_mut() - .filter(|fragment| fragment.rtp_timestamp == rtp_timestamp) - .ok_or(RtpDepacketizerError::InvalidFragment)?; + let Some(fragment) = + self.fragment.as_mut().filter(|fragment| fragment.rtp_timestamp == rtp_timestamp) + else { + // A continuation without its start means the preceding packets were lost. + self.discard_in_progress(); + return Ok(()); + }; fragment.nal_unit.extend_from_slice(&payload[2..]); if end { @@ -466,11 +522,13 @@ impl RtpAccessUnitAssembler { return Ok(()); } - let fragment = self - .fragment - .as_mut() - .filter(|fragment| fragment.rtp_timestamp == rtp_timestamp) - .ok_or(RtpDepacketizerError::InvalidFragment)?; + let Some(fragment) = + self.fragment.as_mut().filter(|fragment| fragment.rtp_timestamp == rtp_timestamp) + else { + // A continuation without its start means the preceding packets were lost. + self.discard_in_progress(); + return Ok(()); + }; fragment.nal_unit.extend_from_slice(&payload[3..]); if end { @@ -490,8 +548,9 @@ impl RtpAccessUnitAssembler { let frame = self.current_frame_mut(packet.timestamp)?; if frame.payload.is_empty() { if !descriptor.start_of_partition || descriptor.partition_id != 0 { - self.current_frame = None; - return Err(RtpDepacketizerError::InvalidFragment); + // The beginning of this frame was lost. 
+ self.discard_in_progress(); + return Ok(()); } frame.frame_type = Some(if is_vp8_keyframe(descriptor.payload) { EncodedFrameType::Key @@ -517,8 +576,9 @@ impl RtpAccessUnitAssembler { let frame = self.current_frame_mut(packet.timestamp)?; if frame.payload.is_empty() { if !descriptor.beginning_of_frame { - self.current_frame = None; - return Err(RtpDepacketizerError::InvalidFragment); + // The beginning of this frame was lost. + self.discard_in_progress(); + return Ok(()); } frame.frame_type = Some( if !descriptor.inter_picture_predicted || is_vp9_keyframe(descriptor.payload) { @@ -546,14 +606,18 @@ impl RtpAccessUnitAssembler { } let obu = if index == 0 && descriptor.starts_fragment { - let mut fragment = self + let Some(fragment) = self .av1_fragment .take() .filter(|fragment| fragment.rtp_timestamp == packet.timestamp) - .ok_or(RtpDepacketizerError::InvalidFragment)? - .obu; - fragment.extend_from_slice(element); - fragment + else { + // A continuation without its start means the preceding packets were lost. 
+ self.discard_in_progress(); + return Ok(()); + }; + let mut obu = fragment.obu; + obu.extend_from_slice(element); + obu } else { if index == 0 && self.av1_fragment.is_some() { return Err(RtpDepacketizerError::InvalidFragment); @@ -591,13 +655,14 @@ impl RtpAccessUnitAssembler { } let nal_units = current.nal_units.iter().map(Vec::as_slice).collect::>(); - Ok(Some(access_unit_from_nalus( + let access_unit = access_unit_from_nalus( self.codec, &nal_units, current.timestamp_us, self.width, self.height, - )?)) + )?; + Ok(self.gate_on_keyframe(access_unit)) } fn finish_current_frame( @@ -618,20 +683,8 @@ impl RtpAccessUnitAssembler { self.width, self.height, ); - access_unit.codec_specific = match self.codec { - EncodedVideoCodec::VP8 => CodecSpecific::VP8 { temporal_id: None, layer_sync: false }, - EncodedVideoCodec::VP9 => CodecSpecific::VP9 { - temporal_id: None, - spatial_id: None, - inter_layer_predicted: None, - }, - EncodedVideoCodec::AV1 => CodecSpecific::AV1 { - scalability_mode: Some("L1T1".to_string()), - dependency_descriptor: None, - }, - EncodedVideoCodec::H264 | EncodedVideoCodec::H265 => CodecSpecific::None, - }; - Ok(Some(access_unit)) + access_unit.codec_specific = CodecSpecific::default_for(self.codec); + Ok(self.gate_on_keyframe(access_unit)) } } @@ -910,11 +963,14 @@ fn is_vp8_keyframe(payload: &[u8]) -> bool { payload.first().is_some_and(|header| header & 0x01 == 0) } +/// Parses the start of a VP9 uncompressed frame header, whose `f(n)` fields +/// are MSB-first, and reports whether it begins a keyframe. fn is_vp9_keyframe(payload: &[u8]) -> bool { let Some(&first_byte) = payload.first() else { return false; }; - if first_byte & 0x03 != 0x02 { + // frame_marker: f(2), must be 0b10. 
+ if first_byte >> 6 != 0b10 { return false; } @@ -925,17 +981,20 @@ fn is_vp9_keyframe(payload: &[u8]) -> bool { bit_offset += 1; let profile = profile_low | (profile_high << 1); if profile == 3 { - bit_offset += 1; + bit_offset += 1; // reserved_zero } + // show_existing_frame: a repeated frame is never a keyframe. if read_bit(first_byte, bit_offset) != 0 { return false; } bit_offset += 1; + // frame_type: 0 is KEY_FRAME. read_bit(first_byte, bit_offset) == 0 } +/// Reads bit `bit_offset` of `byte`, counting from the most significant bit. fn read_bit(byte: u8, bit_offset: usize) -> u8 { - (byte >> bit_offset) & 0x01 + (byte >> (7 - bit_offset)) & 0x01 } fn av1_obu_type(obu: &[u8]) -> Option { @@ -975,6 +1034,31 @@ mod tests { assert_eq!(mapper.map(0).unwrap(), 2_000); } + #[test] + fn maps_rtp_timestamps_across_multiple_rollovers() { + let mut mapper = RtpTimestampMapper::new(90_000, 0); + let step = 1u32 << 30; + let mut rtp_timestamp = 0u32; + let mut last_us = mapper.map(rtp_timestamp).unwrap(); + for _ in 0..20 { + rtp_timestamp = rtp_timestamp.wrapping_add(step); + let mapped_us = mapper.map(rtp_timestamp).unwrap(); + assert!(mapped_us > last_us, "mapped timestamps must stay monotonic"); + last_us = mapped_us; + } + assert_eq!(last_us, (20i64 << 30) * 1_000_000 / 90_000); + } + + #[test] + fn maps_reordered_rtp_timestamps() { + let mut mapper = RtpTimestampMapper::new(90_000, 1_000); + assert_eq!(mapper.map(9_000).unwrap(), 1_000); + assert_eq!(mapper.map(18_000).unwrap(), 101_000); + // A late packet maps behind the stream without disturbing what follows. 
+ assert_eq!(mapper.map(15_000).unwrap(), 67_666); + assert_eq!(mapper.map(27_000).unwrap(), 201_000); + } + #[test] fn assembles_h264_fu_a() { let mut assembler = @@ -988,15 +1072,64 @@ mod tests { } #[test] - fn sequence_gap_clears_current_frame() { + fn sequence_gap_recovers_h264_at_next_keyframe() { + let mut assembler = + RtpAccessUnitAssembler::new(EncodedVideoCodec::H264, 90_000, 0, 640, 480).unwrap(); + let start = rtp_packet(10, 12_000, false, &[0x7c, 0x85, 1, 2]); + let delta = rtp_packet(12, 15_000, true, &[0x41, 1, 2]); + let key = rtp_packet(13, 18_000, true, &[0x65, 3, 4]); + + assert!(assembler.push(&start).unwrap().is_none()); + // The gap dropped the fragment; the delta frame after it is withheld. + assert!(assembler.push(&delta).unwrap().is_none()); + let stats = assembler.stats(); + assert_eq!(stats.sequence_gaps, 1); + assert_eq!(stats.dropped_access_units, 1); + assert!(stats.awaiting_keyframe); + + let access_unit = assembler.push(&key).unwrap().unwrap(); + assert_eq!(access_unit.frame_type, EncodedFrameType::Key); + assert_eq!(access_unit.payload.as_ref(), &[0, 0, 0, 1, 0x65, 3, 4]); + let stats = assembler.stats(); + assert_eq!(stats.dropped_access_units, 1); + assert!(!stats.awaiting_keyframe); + } + + #[test] + fn marker_with_open_h264_fragment_drops_access_unit() { let mut assembler = RtpAccessUnitAssembler::new(EncodedVideoCodec::H264, 90_000, 0, 640, 480).unwrap(); let start = rtp_packet(10, 12_000, false, &[0x7c, 0x85, 1, 2]); - let end = rtp_packet(12, 12_000, true, &[0x7c, 0x45, 3, 4]); + let truncated = rtp_packet(11, 12_000, true, &[0x7c, 0x05, 3, 4]); + let key = rtp_packet(12, 15_000, true, &[0x65, 5, 6]); assert!(assembler.push(&start).unwrap().is_none()); - let err = assembler.push(&end).unwrap_err(); - assert_eq!(err, RtpDepacketizerError::SequenceGap { expected: 11, actual: 12 }); + // The marker arrived without the FU end bit: the fragment is truncated. 
+ assert!(assembler.push(&truncated).unwrap().is_none()); + let stats = assembler.stats(); + assert_eq!(stats.sequence_gaps, 0); + assert_eq!(stats.dropped_access_units, 1); + assert!(stats.awaiting_keyframe); + + let access_unit = assembler.push(&key).unwrap().unwrap(); + assert_eq!(access_unit.frame_type, EncodedFrameType::Key); + assert_eq!(access_unit.payload.as_ref(), &[0, 0, 0, 1, 0x65, 5, 6]); + assert!(!assembler.stats().awaiting_keyframe); + } + + #[test] + fn drops_h264_fu_continuation_without_start() { + let mut assembler = + RtpAccessUnitAssembler::new(EncodedVideoCodec::H264, 90_000, 0, 640, 480).unwrap(); + let continuation = rtp_packet(10, 12_000, false, &[0x7c, 0x05, 1, 2]); + let key = rtp_packet(11, 15_000, true, &[0x65, 3, 4]); + + assert!(assembler.push(&continuation).unwrap().is_none()); + assert!(assembler.stats().awaiting_keyframe); + + let access_unit = assembler.push(&key).unwrap().unwrap(); + assert_eq!(access_unit.frame_type, EncodedFrameType::Key); + assert_eq!(access_unit.payload.as_ref(), &[0, 0, 0, 1, 0x65, 3, 4]); } #[test] @@ -1014,13 +1147,18 @@ mod tests { } #[test] - fn rejects_vp8_mid_frame_start() { + fn drops_vp8_mid_frame_start() { let mut assembler = RtpAccessUnitAssembler::new(EncodedVideoCodec::VP8, 90_000, 0, 640, 480).unwrap(); - let packet = rtp_packet(10, 12_000, true, &[0x00, 1, 2]); + let mid_frame = rtp_packet(10, 12_000, true, &[0x00, 1, 2]); + let key = rtp_packet(11, 15_000, true, &[0x10, 0x00, 3, 4]); - let err = assembler.push(&packet).unwrap_err(); - assert_eq!(err, RtpDepacketizerError::InvalidFragment); + assert!(assembler.push(&mid_frame).unwrap().is_none()); + assert!(assembler.stats().awaiting_keyframe); + + let access_unit = assembler.push(&key).unwrap().unwrap(); + assert_eq!(access_unit.frame_type, EncodedFrameType::Key); + assert_eq!(access_unit.payload.as_ref(), &[0x00, 3, 4]); } #[test] @@ -1099,11 +1237,43 @@ mod tests { fn assembles_vp9_predicted_frame_as_delta() { let mut assembler = 
RtpAccessUnitAssembler::new(EncodedVideoCodec::VP9, 90_000, 0, 640, 480).unwrap(); - let packet = rtp_packet(10, 12_000, true, &[0x4c, 0x83, 1, 2]); + // P is set and the payload is an inter frame: must not classify as Key. + let packet = rtp_packet(10, 12_000, true, &[0x4c, 0x86, 1, 2]); let access_unit = assembler.push(&packet).unwrap().unwrap(); assert_eq!(access_unit.frame_type, EncodedFrameType::Delta); - assert_eq!(access_unit.payload.as_ref(), &[0x83, 1, 2]); + assert_eq!(access_unit.payload.as_ref(), &[0x86, 1, 2]); + } + + #[test] + fn vp9_bitstream_keyframe_overrides_predicted_bit() { + let mut assembler = + RtpAccessUnitAssembler::new(EncodedVideoCodec::VP9, 90_000, 0, 640, 480).unwrap(); + // P is set but the uncompressed header says KEY_FRAME. + let packet = rtp_packet(10, 12_000, true, &[0x4c, 0x82, 1, 2]); + + let access_unit = assembler.push(&packet).unwrap().unwrap(); + assert_eq!(access_unit.frame_type, EncodedFrameType::Key); + assert_eq!(access_unit.payload.as_ref(), &[0x82, 1, 2]); + } + + #[test] + fn classifies_vp9_uncompressed_header_frame_types() { + // 0b1000_0010: marker, profile 0, show_existing=0, KEY_FRAME, show_frame=1. + assert!(is_vp9_keyframe(&[0x82])); + // 0b1000_0011: keyframe with error_resilient_mode set. + assert!(is_vp9_keyframe(&[0x83])); + // 0b1011_0000: profile 3 keyframe. + assert!(is_vp9_keyframe(&[0xb0])); + // 0b1000_0110: frame_type=1, an inter frame. + assert!(!is_vp9_keyframe(&[0x86])); + // 0b1011_0010: profile 3 inter frame. + assert!(!is_vp9_keyframe(&[0xb2])); + // 0b1000_1000: show_existing_frame repeats a decoded frame. + assert!(!is_vp9_keyframe(&[0x88])); + // 0b0000_0010: invalid frame_marker. 
+ assert!(!is_vp9_keyframe(&[0x02])); + assert!(!is_vp9_keyframe(&[])); } #[test] @@ -1117,13 +1287,18 @@ mod tests { } #[test] - fn rejects_vp9_mid_frame_start() { + fn drops_vp9_mid_frame_start() { let mut assembler = RtpAccessUnitAssembler::new(EncodedVideoCodec::VP9, 90_000, 0, 640, 480).unwrap(); - let packet = rtp_packet(10, 12_000, true, &[0x04, 0x82, 1, 2]); + let mid_frame = rtp_packet(10, 12_000, true, &[0x04, 0x82, 1, 2]); + let key = rtp_packet(11, 15_000, true, &[0x0c, 0x82, 3, 4]); - let err = assembler.push(&packet).unwrap_err(); - assert_eq!(err, RtpDepacketizerError::InvalidFragment); + assert!(assembler.push(&mid_frame).unwrap().is_none()); + assert!(assembler.stats().awaiting_keyframe); + + let access_unit = assembler.push(&key).unwrap().unwrap(); + assert_eq!(access_unit.frame_type, EncodedFrameType::Key); + assert_eq!(access_unit.payload.as_ref(), &[0x82, 3, 4]); } #[test] @@ -1173,14 +1348,59 @@ mod tests { } #[test] - fn sequence_gap_clears_vp8_frame() { + fn marker_with_open_av1_fragment_drops_frame() { + let mut assembler = + RtpAccessUnitAssembler::new(EncodedVideoCodec::AV1, 90_000, 0, 640, 480).unwrap(); + // Y is set, so the OBU fragment is unterminated when the marker closes it. 
+ let truncated = rtp_packet(10, 12_000, true, &[0x50, 0x30, 1]); + let key = rtp_packet(11, 15_000, true, &[0x18, 0x08]); + + assert!(assembler.push(&truncated).unwrap().is_none()); + let stats = assembler.stats(); + assert_eq!(stats.dropped_access_units, 1); + assert!(stats.awaiting_keyframe); + + let access_unit = assembler.push(&key).unwrap().unwrap(); + assert_eq!(access_unit.frame_type, EncodedFrameType::Key); + assert_eq!(access_unit.payload.as_ref(), &[0x0a, 0x00]); + assert!(!assembler.stats().awaiting_keyframe); + } + + #[test] + fn drops_av1_fragment_continuation_without_start() { + let mut assembler = + RtpAccessUnitAssembler::new(EncodedVideoCodec::AV1, 90_000, 0, 640, 480).unwrap(); + // Z is set: this continues an OBU whose start was never received. + let continuation = rtp_packet(10, 12_000, true, &[0x90, 2, 3]); + let key = rtp_packet(11, 15_000, true, &[0x18, 0x08]); + + assert!(assembler.push(&continuation).unwrap().is_none()); + assert!(assembler.stats().awaiting_keyframe); + + let access_unit = assembler.push(&key).unwrap().unwrap(); + assert_eq!(access_unit.frame_type, EncodedFrameType::Key); + assert_eq!(access_unit.payload.as_ref(), &[0x0a, 0x00]); + } + + #[test] + fn sequence_gap_recovers_vp8_at_next_keyframe() { let mut assembler = RtpAccessUnitAssembler::new(EncodedVideoCodec::VP8, 90_000, 0, 640, 480).unwrap(); let start = rtp_packet(10, 12_000, false, &[0x10, 0x00, 1, 2]); - let end = rtp_packet(12, 12_000, true, &[0x00, 3, 4]); + let delta = rtp_packet(12, 15_000, true, &[0x10, 0x01, 3, 4]); + let key = rtp_packet(13, 18_000, true, &[0x10, 0x00, 5, 6]); assert!(assembler.push(&start).unwrap().is_none()); - let err = assembler.push(&end).unwrap_err(); - assert_eq!(err, RtpDepacketizerError::SequenceGap { expected: 11, actual: 12 }); + // The gap dropped the fragment; the delta frame after it is withheld. 
+ assert!(assembler.push(&delta).unwrap().is_none()); + let stats = assembler.stats(); + assert_eq!(stats.sequence_gaps, 1); + assert_eq!(stats.dropped_access_units, 1); + assert!(stats.awaiting_keyframe); + + let access_unit = assembler.push(&key).unwrap().unwrap(); + assert_eq!(access_unit.frame_type, EncodedFrameType::Key); + assert_eq!(access_unit.payload.as_ref(), &[0x00, 5, 6]); + assert!(!assembler.stats().awaiting_keyframe); } } diff --git a/livekit-capture/src/error.rs b/livekit-capture/src/error.rs index c3714ec54..77d794dd8 100644 --- a/livekit-capture/src/error.rs +++ b/livekit-capture/src/error.rs @@ -28,6 +28,12 @@ pub enum CaptureError { /// DMA-BUF frame did not include any planes. #[error("DMA-BUF frame did not include any planes")] MissingDmaBufPlane, + /// DMA-BUF frame layout cannot be represented by the native capture path. + #[error("unsupported DMA-BUF layout: {0}")] + UnsupportedDmaBufLayout(&'static str), + /// Access unit carries layering metadata the passthrough cannot forward. + #[error("unsupported layered encoding: {0}")] + UnsupportedLayeredEncoding(&'static str), /// Codec is represented by the API but not yet supported by native passthrough. 
#[error("encoded passthrough does not support {0:?} yet")] UnsupportedCodec(EncodedVideoCodec), diff --git a/livekit-capture/src/lib.rs b/livekit-capture/src/lib.rs index a50ffb4c5..af8b8ffb1 100644 --- a/livekit-capture/src/lib.rs +++ b/livekit-capture/src/lib.rs @@ -18,20 +18,21 @@ pub mod device; pub mod dmabuf; pub mod encoded; mod error; -pub mod platform; pub mod source; pub mod sources; +pub(crate) mod time; pub mod track; -#[allow(deprecated)] -pub use device::CapturePixelFormat; pub use device::{ CaptureBackend, CaptureDeviceInfo, CaptureDeviceQueryError, CaptureDeviceSelector, CaptureFormat, CaptureFormatRequest, CaptureFrameFormat, CapturePath, CaptureResolution, }; pub use dmabuf::{DmaBufFrame, DmaBufPixelFormat, DmaBufPlane}; pub use encoded::{ - ingress::{EncodedAccessUnitSource, EncodedIngress, EncodedIngressError}, + ingress::{ + EncodedAccessUnitSource, EncodedIngress, EncodedIngressCapture, EncodedIngressError, + EncodedIngressStop, + }, CodecSpecific, EncodedAccessUnit, EncodedFragment, EncodedFrameType, EncodedLayerInfo, EncodedPayload, EncodedVideoCodec, EncodedWireFormat, H264PacketizationMode, OwnedEncodedAccessUnit, @@ -39,7 +40,7 @@ pub use encoded::{ pub use error::CaptureError; pub use source::{ CaptureFrame, CaptureFrameSource, CaptureSourceError, CaptureSourceOptions, - EncodedCaptureFrameSource, EncodedFrameSourceError, NativeVideoFrame, RawVideoFrame, - VideoCaptureSource, + EncodedCaptureFrameSource, EncodedEndpoint, EncodedFrameSourceError, NativeVideoFrame, + RawVideoFrame, VideoCaptureSource, }; pub use track::VideoCaptureTrack; diff --git a/livekit-capture/src/platform/mod.rs b/livekit-capture/src/platform/mod.rs deleted file mode 100644 index 739bbda39..000000000 --- a/livekit-capture/src/platform/mod.rs +++ /dev/null @@ -1,18 +0,0 @@ -// Copyright 2026 LiveKit, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -//! Platform capture backends. - -#[cfg(feature = "avfoundation")] -pub mod avfoundation; diff --git a/livekit-capture/src/source.rs b/livekit-capture/src/source.rs index 41fc6cc0f..be0243b0f 100644 --- a/livekit-capture/src/source.rs +++ b/livekit-capture/src/source.rs @@ -39,6 +39,11 @@ pub struct CaptureSourceOptions { pub format: CaptureFormatRequest, /// Whether the resulting track should be marked as a screencast. pub is_screencast: bool, + /// Prefer CPU-accessible frames over zero-copy native buffers, for + /// callers that modify pixels before publishing. + pub prefer_raw_frames: bool, + /// Endpoint for the encoded ingest backends (RTSP/TCP/GStreamer). + pub encoded: Option, } impl Default for CaptureSourceOptions { @@ -48,10 +53,51 @@ impl Default for CaptureSourceOptions { device: CaptureDeviceSelector::Default, format: CaptureFormatRequest::Default, is_screencast: false, + prefer_raw_frames: false, + encoded: None, } } } +/// Endpoint configuration for the encoded ingest backends. +#[derive(Debug, Clone, PartialEq, Eq)] +#[non_exhaustive] +pub enum EncodedEndpoint { + /// RTSP camera URL ingested over TCP-interleaved RTP. + #[cfg(feature = "rtsp")] + Rtsp { + /// RTSP URL, e.g. `rtsp://user:pass@camera/stream`. + url: String, + /// RTSP source options (codec expectation, dimensions, timeouts). + options: crate::sources::rtsp::RtspSourceOptions, + }, + /// TCP byte-stream endpoint to connect to. + #[cfg(feature = "tcpsink")] + TcpConnect { + /// `host:port` to connect to. 
+ address: String, + /// Byte-stream configuration (wire format, dimensions, timing). + config: crate::sources::tcp::ByteStreamSourceConfig, + }, + /// GStreamer launch description that contains or feeds an encoded appsink. + /// + /// The pipeline must either contain `appsink name=lk_appsink` or leave + /// one encoded video source pad unlinked (a parser, capsfilter, and + /// appsink are attached automatically). + #[cfg(feature = "gstreamer")] + GstreamerLaunch { + /// `gst-launch`-style pipeline description. + launch: String, + /// Expected codec; inferred from the pipeline caps when `None`. + codec: Option, + /// Appsink source configuration (dimensions and timestamp fallbacks). + /// + /// The `sample_format` field is overridden by what the pipeline caps + /// advertise. + config: crate::sources::gstreamer::GStreamerAppSinkConfig, + }, +} + /// Uncompressed CPU-accessible video frame buffer produced by a capture source. #[derive(Debug)] pub struct RawVideoFrame { @@ -247,6 +293,12 @@ pub enum CaptureSourceError { /// The requested backend cannot be used by this façade on this target or build. #[error("capture backend {0} is not supported by VideoCaptureSource on this target or build")] UnsupportedBackend(CaptureBackend), + /// The backend requires an [`EncodedEndpoint`] in [`CaptureSourceOptions::encoded`]. + #[error("capture backend {0} requires a matching CaptureSourceOptions::encoded endpoint")] + MissingEncodedEndpoint(CaptureBackend), + /// The encoded source reached end of stream. + #[error("capture source reached end of stream")] + EndOfStream, /// The backend source failed. #[error("capture backend {backend} failed: {message}")] Backend { @@ -260,23 +312,75 @@ pub enum CaptureSourceError { Capture(#[from] CaptureError), } -/// High-level capture source façade for common camera capture use cases. +/// GStreamer pipeline plus the encoded appsink source reading from it. +/// +/// Stops the pipeline when dropped. 
+#[cfg(feature = "gstreamer")] +#[derive(Debug)] +pub struct GStreamerCaptureSource { + pipeline: ::gstreamer::Pipeline, + source: EncodedCaptureFrameSource, +} + +#[cfg(feature = "gstreamer")] +impl GStreamerCaptureSource { + /// Returns the running pipeline. + pub fn pipeline(&self) -> &::gstreamer::Pipeline { + &self.pipeline + } + + /// Returns the encoded appsink source. + pub fn source_mut( + &mut self, + ) -> &mut EncodedCaptureFrameSource + { + &mut self.source + } +} + +#[cfg(feature = "gstreamer")] +impl Drop for GStreamerCaptureSource { + fn drop(&mut self) { + use ::gstreamer::prelude::ElementExt; + let _ = self.pipeline.set_state(::gstreamer::State::Null); + } +} + +/// High-level capture source façade for the crate's capture backends. #[derive(Debug)] #[non_exhaustive] pub enum VideoCaptureSource { /// AVFoundation decoded-frame source. #[cfg(feature = "avfoundation")] - AvFoundation(crate::sources::avfoundation::AvFoundationCaptureSession), + AvFoundation { + /// Underlying capture session. + session: crate::sources::avfoundation::AvFoundationCaptureSession, + /// Prefer CPU-accessible frames over zero-copy native buffers. + prefer_raw_frames: bool, + }, /// Linux V4L2 decoded-frame source. #[cfg(feature = "v4l")] V4l(crate::sources::v4l::V4lCaptureSession), /// Jetson libargus DMA-BUF source. #[cfg(feature = "libargus")] LibArgus(crate::sources::argus::ArgusCaptureSession), + /// RTSP encoded ingest source. + #[cfg(feature = "rtsp")] + Rtsp(EncodedCaptureFrameSource), + /// TCP byte-stream encoded ingest source. + #[cfg(feature = "tcpsink")] + Tcp(EncodedCaptureFrameSource), + /// GStreamer pipeline encoded ingest source. + #[cfg(feature = "gstreamer")] + Gstreamer(GStreamerCaptureSource), } impl VideoCaptureSource { /// Lists capture devices for a backend. + /// + /// The encoded ingest backends (RTSP/TCP/GStreamer) address network + /// endpoints rather than enumerable devices, so they report + /// [`CaptureDeviceQueryError::UnsupportedBackend`]. 
pub fn list_devices( backend: CaptureBackend, ) -> Result, CaptureDeviceQueryError> { @@ -298,9 +402,9 @@ impl VideoCaptureSource { CaptureBackend::AvFoundation => open_avfoundation_source(options), CaptureBackend::V4l2 => open_v4l_source(options), CaptureBackend::LibArgus => open_argus_source(options), - CaptureBackend::Rtsp | CaptureBackend::Tcp | CaptureBackend::Gstreamer => { - Err(CaptureSourceError::UnsupportedBackend(options.backend)) - } + CaptureBackend::Rtsp => open_rtsp_source(options), + CaptureBackend::Tcp => open_tcp_source(options), + CaptureBackend::Gstreamer => open_gstreamer_source(options), } } @@ -308,11 +412,23 @@ impl VideoCaptureSource { pub fn capture_path(&self) -> CapturePath { match self { #[cfg(feature = "avfoundation")] - Self::AvFoundation(source) => source.capture_path(), + Self::AvFoundation { session, prefer_raw_frames } => { + if session.native_capture_supported() && !prefer_raw_frames { + CapturePath::Native + } else { + CapturePath::Raw + } + } #[cfg(feature = "v4l")] Self::V4l(source) => source.capture_path(), #[cfg(feature = "libargus")] Self::LibArgus(source) => source.capture_path(), + #[cfg(feature = "rtsp")] + Self::Rtsp(_) => CapturePath::Encoded, + #[cfg(feature = "tcpsink")] + Self::Tcp(_) => CapturePath::Encoded, + #[cfg(feature = "gstreamer")] + Self::Gstreamer(_) => CapturePath::Encoded, #[allow(unreachable_patterns)] _ => unreachable!("VideoCaptureSource has no enabled backend variants"), } @@ -322,23 +438,38 @@ impl VideoCaptureSource { pub fn format(&self) -> Option { match self { #[cfg(feature = "avfoundation")] - Self::AvFoundation(source) => Some(source.format()), + Self::AvFoundation { session, .. 
} => Some(session.format()), #[cfg(feature = "v4l")] Self::V4l(source) => Some(source.format()), #[cfg(feature = "libargus")] Self::LibArgus(source) => Some(source.format()), + #[cfg(feature = "rtsp")] + Self::Rtsp(_) => None, + #[cfg(feature = "tcpsink")] + Self::Tcp(_) => None, + #[cfg(feature = "gstreamer")] + Self::Gstreamer(_) => None, #[allow(unreachable_patterns)] _ => unreachable!("VideoCaptureSource has no enabled backend variants"), } } /// Captures the next frame. + /// + /// The encoded ingest backends return + /// [`CaptureSourceError::EndOfStream`] when the stream terminates + /// normally. pub fn next_frame(&mut self) -> Result { match self { #[cfg(feature = "avfoundation")] - Self::AvFoundation(source) => source - .next_frame() - .map_err(|err| backend_source_error(CaptureBackend::AvFoundation, err)), + Self::AvFoundation { session, prefer_raw_frames } => { + let frame = if session.native_capture_supported() && !*prefer_raw_frames { + session.capture_native_frame().map(|frame| CaptureFrame::Native(frame.into())) + } else { + session.capture_frame().map(|frame| CaptureFrame::Raw(frame.into())) + }; + frame.map_err(|err| backend_source_error(CaptureBackend::AvFoundation, err)) + } #[cfg(feature = "v4l")] Self::V4l(source) => { source.next_frame().map_err(|err| backend_source_error(CaptureBackend::V4l2, err)) @@ -347,14 +478,68 @@ impl VideoCaptureSource { Self::LibArgus(source) => source .next_frame() .map_err(|err| backend_source_error(CaptureBackend::LibArgus, err)), + #[cfg(feature = "rtsp")] + Self::Rtsp(source) => { + source.next_frame().map_err(|err| encoded_source_error(CaptureBackend::Rtsp, err)) + } + #[cfg(feature = "tcpsink")] + Self::Tcp(source) => { + source.next_frame().map_err(|err| encoded_source_error(CaptureBackend::Tcp, err)) + } + #[cfg(feature = "gstreamer")] + Self::Gstreamer(source) => source + .source + .next_frame() + .map_err(|err| encoded_source_error(CaptureBackend::Gstreamer, err)), #[allow(unreachable_patterns)] _ => 
unreachable!("VideoCaptureSource has no enabled backend variants"), } } - /// Captures and publishes the next frame. + /// Signals the source to stop, interrupting a blocked + /// [`VideoCaptureSource::next_frame`] where the backend supports it + /// (AVFoundation today); other backends return at the next frame + /// boundary. + pub fn stop(&self) { + match self { + #[cfg(feature = "avfoundation")] + Self::AvFoundation { session, .. } => session.stop(), + #[allow(unreachable_patterns)] + _ => {} + } + } + + /// Forwards a downstream keyframe request to the source's producer. + /// + /// No-op for the decoded camera backends, which have no upstream + /// encoder. + pub fn request_keyframe(&mut self) { + match self { + #[cfg(feature = "rtsp")] + Self::Rtsp(source) => source.source_mut().request_keyframe(), + #[cfg(feature = "tcpsink")] + Self::Tcp(source) => source.source_mut().request_keyframe(), + #[cfg(feature = "gstreamer")] + Self::Gstreamer(source) => source.source.source_mut().request_keyframe(), + #[allow(unreachable_patterns)] + _ => {} + } + } + + /// Captures and publishes the next frame, returning `false` once an + /// encoded source reaches end of stream. + /// + /// Keyframe requests raised by the passthrough encoder are polled from + /// the track and forwarded to the source before each capture. 
pub fn publish_next(&mut self, track: &VideoCaptureTrack) -> Result { - let frame = self.next_frame()?; + if track.take_keyframe_request() { + self.request_keyframe(); + } + let frame = match self.next_frame() { + Ok(frame) => frame, + Err(CaptureSourceError::EndOfStream) => return Ok(false), + Err(err) => return Err(err), + }; frame.publish_to(track)?; Ok(true) } @@ -472,6 +657,102 @@ fn backend_query_error( CaptureDeviceQueryError::Backend { backend, message: error.to_string() } } +#[allow(dead_code)] +fn encoded_source_error( + backend: CaptureBackend, + error: EncodedFrameSourceError, +) -> CaptureSourceError { + match error { + EncodedFrameSourceError::EndOfStream => CaptureSourceError::EndOfStream, + EncodedFrameSourceError::Source(err) => backend_source_error(backend, err), + } +} + +#[cfg(feature = "rtsp")] +fn open_rtsp_source( + options: CaptureSourceOptions, +) -> Result { + let Some(EncodedEndpoint::Rtsp { url, options: rtsp_options }) = options.encoded else { + return Err(CaptureSourceError::MissingEncodedEndpoint(CaptureBackend::Rtsp)); + }; + let source = crate::sources::rtsp::RtspEncodedSource::connect(&url, rtsp_options) + .map_err(|err| backend_source_error(CaptureBackend::Rtsp, err))?; + Ok(VideoCaptureSource::Rtsp(EncodedCaptureFrameSource::new(source))) +} + +#[cfg(not(feature = "rtsp"))] +fn open_rtsp_source( + _options: CaptureSourceOptions, +) -> Result { + Err(CaptureSourceError::UnsupportedBackend(CaptureBackend::Rtsp)) +} + +#[cfg(feature = "tcpsink")] +fn open_tcp_source( + options: CaptureSourceOptions, +) -> Result { + let Some(EncodedEndpoint::TcpConnect { address, config }) = options.encoded else { + return Err(CaptureSourceError::MissingEncodedEndpoint(CaptureBackend::Tcp)); + }; + let source = crate::sources::tcp::TcpEncodedSource::connect(address.as_str(), config) + .map_err(|err| backend_source_error(CaptureBackend::Tcp, err))?; + Ok(VideoCaptureSource::Tcp(EncodedCaptureFrameSource::new(source))) +} + +#[cfg(not(feature = 
"tcpsink"))] +fn open_tcp_source( + _options: CaptureSourceOptions, +) -> Result { + Err(CaptureSourceError::UnsupportedBackend(CaptureBackend::Tcp)) +} + +#[cfg(feature = "gstreamer")] +fn open_gstreamer_source( + options: CaptureSourceOptions, +) -> Result { + use ::gstreamer::prelude::*; + + let Some(EncodedEndpoint::GstreamerLaunch { launch, codec, mut config }) = options.encoded + else { + return Err(CaptureSourceError::MissingEncodedEndpoint(CaptureBackend::Gstreamer)); + }; + + let gst_error = |err: &dyn std::fmt::Display| CaptureSourceError::Backend { + backend: CaptureBackend::Gstreamer, + message: err.to_string(), + }; + + ::gstreamer::init().map_err(|err| gst_error(&err))?; + let pipeline = ::gstreamer::parse::launch(&launch) + .map_err(|err| gst_error(&err))? + .downcast::<::gstreamer::Pipeline>() + .map_err(|element| CaptureSourceError::Backend { + backend: CaptureBackend::Gstreamer, + message: format!( + "launch description did not produce a pipeline (got {})", + element.name() + ), + })?; + let (appsink, sample_format) = + crate::sources::gstreamer::ensure_encoded_appsink(&pipeline, codec) + .map_err(|err| gst_error(&err))?; + config.sample_format = sample_format; + pipeline.set_state(::gstreamer::State::Playing).map_err(|err| gst_error(&err))?; + + let source = crate::sources::gstreamer::GStreamerAppSinkEncodedSource::new(appsink, config); + Ok(VideoCaptureSource::Gstreamer(GStreamerCaptureSource { + pipeline, + source: EncodedCaptureFrameSource::new(source), + })) +} + +#[cfg(not(feature = "gstreamer"))] +fn open_gstreamer_source( + _options: CaptureSourceOptions, +) -> Result { + Err(CaptureSourceError::UnsupportedBackend(CaptureBackend::Gstreamer)) +} + fn list_auto_devices() -> Result, CaptureDeviceQueryError> { #[cfg(all(target_os = "macos", feature = "avfoundation"))] { @@ -524,6 +805,7 @@ fn list_avfoundation_devices() -> Result, CaptureDeviceQu fn open_avfoundation_source( options: CaptureSourceOptions, ) -> Result { + let prefer_raw_frames 
= options.prefer_raw_frames; let source = crate::sources::avfoundation::AvFoundationCaptureSession::new(options.into()) .map_err(|err| match err { crate::sources::avfoundation::AvFoundationError::UnsupportedPlatform => { @@ -531,7 +813,7 @@ fn open_avfoundation_source( } other => backend_source_error(CaptureBackend::AvFoundation, other), })?; - Ok(VideoCaptureSource::AvFoundation(source)) + Ok(VideoCaptureSource::AvFoundation { session: source, prefer_raw_frames }) } #[cfg(not(feature = "avfoundation"))] @@ -775,9 +1057,8 @@ mod tests { #[cfg(feature = "avfoundation")] #[test] - fn avfoundation_canonical_and_compatibility_imports_compile() { + fn avfoundation_canonical_import_compiles() { let _ = std::any::TypeId::of::(); - let _ = std::any::TypeId::of::(); } #[cfg(feature = "v4l")] diff --git a/livekit-capture/src/sources/argus.rs b/livekit-capture/src/sources/argus.rs index f522d944e..28363a92f 100644 --- a/livekit-capture/src/sources/argus.rs +++ b/livekit-capture/src/sources/argus.rs @@ -29,7 +29,9 @@ use crate::{ #[cfg(livekit_capture_argus)] use crate::dmabuf::{DmaBufPixelFormat, DmaBufPlane}; #[cfg(livekit_capture_argus)] -use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH}; +use crate::time::{elapsed_us, unix_time_us_now}; +#[cfg(livekit_capture_argus)] +use std::time::Instant; #[cfg(livekit_capture_argus)] use std::{ffi::c_int, ffi::c_void}; @@ -454,11 +456,6 @@ fn c_int_from_u32(value: u32, field: &'static str) -> Result c_int::try_from(value).map_err(|_| ArgusError::OptionOutOfRange(field)) } -#[cfg(livekit_capture_argus)] -fn elapsed_us(duration: Duration) -> i64 { - i64::try_from(duration.as_micros()).unwrap_or(i64::MAX) -} - #[cfg(livekit_capture_argus)] fn sensor_wall_time_us(sensor_timestamp_ns: u64) -> Option { let wall_time_us = unix_time_us_now()?; @@ -476,12 +473,6 @@ pub fn sensor_monotonic_ns_to_unix_us(sensor_timestamp_ns: u64, wall_time_us: u6 } } -#[cfg(livekit_capture_argus)] -fn unix_time_us_now() -> Option { - let elapsed = 
SystemTime::now().duration_since(UNIX_EPOCH).ok()?; - u64::try_from(elapsed.as_micros()).ok() -} - #[cfg(target_os = "linux")] fn monotonic_time_ns_now() -> Option { #[repr(C)] diff --git a/livekit-capture/src/platform/avfoundation.rs b/livekit-capture/src/sources/avfoundation.rs similarity index 90% rename from livekit-capture/src/platform/avfoundation.rs rename to livekit-capture/src/sources/avfoundation.rs index 62dc56947..1519014f5 100644 --- a/livekit-capture/src/platform/avfoundation.rs +++ b/livekit-capture/src/sources/avfoundation.rs @@ -32,8 +32,6 @@ use crate::{ #[cfg(target_os = "macos")] const FIRST_FRAME_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(5); -#[cfg(target_os = "macos")] -const MAX_CAPTURE_TIMESTAMP_AGE_US: u64 = 5_000_000; /// Options used to create an AVFoundation capture session. #[derive(Debug, Clone, PartialEq, Eq)] @@ -137,15 +135,39 @@ impl AvFoundationCaptureSession { } /// Captures the next decoded frame and converts it to I420. + /// + /// Blocks until AVFoundation delivers a frame. Fails with + /// [`AvFoundationError::NotRunning`] once the session has been stopped via + /// [`Self::stop`] or an [`AvFoundationStopHandle`]. pub fn capture_frame(&mut self) -> Result { self.capture_frame_inner() } /// Captures the next frame as a native `CVPixelBuffer`. + /// + /// Blocks until AVFoundation delivers a frame. Fails with + /// [`AvFoundationError::NotRunning`] once the session has been stopped via + /// [`Self::stop`] or an [`AvFoundationStopHandle`]. pub fn capture_native_frame(&mut self) -> Result { self.capture_native_frame_inner() } + /// Returns a cheaply cloneable handle that stops this session from another + /// thread. See [`AvFoundationStopHandle::stop`]. 
+ pub fn stop_handle(&self) -> AvFoundationStopHandle { + AvFoundationStopHandle { + #[cfg(target_os = "macos")] + shared: self.inner.frame_queue(), + } + } + + /// Stops frame delivery, waking any thread blocked in + /// [`Self::capture_frame`] or [`Self::capture_native_frame`]. See + /// [`AvFoundationStopHandle::stop`] for the exact contract. + pub fn stop(&self) { + self.stop_handle().stop(); + } + /// Returns the negotiated capture format. pub fn format(&self) -> CaptureFormat { self.format @@ -242,6 +264,37 @@ impl AvFoundationCaptureSession { } } +/// Cheaply cloneable handle that stops an [`AvFoundationCaptureSession`] from +/// another thread. +/// +/// The thread that owns the session is typically blocked inside +/// [`AvFoundationCaptureSession::capture_frame`] waiting for the camera, so it +/// cannot stop itself if the device stalls without delivering an error +/// (unplug, sleep, exclusive use by another app). Obtaining this handle before +/// handing the session to that thread gives the rest of the process a way to +/// abort the wait. +#[derive(Clone, Debug)] +pub struct AvFoundationStopHandle { + #[cfg(target_os = "macos")] + shared: Arc, +} + +impl AvFoundationStopHandle { + /// Stops frame delivery for the associated session and wakes all blocked + /// capture calls. + /// + /// Stopping is idempotent. Once stopped, + /// [`AvFoundationCaptureSession::capture_frame`] and + /// [`AvFoundationCaptureSession::capture_native_frame`] fail with + /// [`AvFoundationError::NotRunning`]; a frame that was already queued may + /// still be returned before the first error. The underlying AVFoundation + /// session is torn down when the session value itself is dropped. + pub fn stop(&self) { + #[cfg(target_os = "macos")] + self.shared.stop(); + } +} + /// AVFoundation decoded-frame capture session that forwards frames into a track. 
pub struct AvFoundationCapture { track: VideoCaptureTrack, @@ -299,6 +352,9 @@ impl Drop for AvFoundationCapture { #[derive(Debug)] struct CaptureRunner { stop: Arc, + /// Wakes the capture thread out of a blocking frame wait so `stop_capture` + /// can join it even when the camera has stalled. + stop_handle: AvFoundationStopHandle, handle: JoinHandle<()>, } @@ -526,6 +582,9 @@ fn start_capture(capture: &mut AvFoundationCapture) -> Result<(), AvFoundationEr let track = capture.track.clone(); let mut session = AvFoundationCaptureSession::new(capture.options.clone())?; let capture_native = session.native_capture_supported(); + // Keep a stop handle outside the capture thread: once the session moves + // into the thread, this is the only way to wake a blocked frame wait. + let stop_handle = session.stop_handle(); let stop = Arc::new(AtomicBool::new(false)); let stop_for_thread = stop.clone(); let handle = std::thread::Builder::new() @@ -547,7 +606,7 @@ fn start_capture(capture: &mut AvFoundationCapture) -> Result<(), AvFoundationEr }) .map_err(|err| AvFoundationError::SessionSetup(err.to_string()))?; - capture.runner = Some(CaptureRunner { stop, handle }); + capture.runner = Some(CaptureRunner { stop, stop_handle, handle }); Ok(()) } @@ -563,6 +622,11 @@ fn stop_capture(capture: &mut AvFoundationCapture) -> Result<(), AvFoundationErr }; runner.stop.store(true, Ordering::Release); + // Wake the capture thread if it is blocked waiting for the next frame so a + // stalled camera cannot keep the join below from completing. The woken + // wait fails with `NotRunning`, and the thread exits via the loop's error + // path or the stop flag. 
+ runner.stop_handle.stop(); runner.handle.join().map_err(|_| { AvFoundationError::Runtime("AVFoundation capture thread panicked".to_string()) })?; @@ -580,7 +644,7 @@ mod macos { use std::ops::Deref; use std::ptr::NonNull; use std::sync::{Arc, Condvar, Mutex}; - use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH}; + use std::time::{Duration, Instant}; use dispatch2::{DispatchQueue, DispatchRetained}; use livekit::webrtc::video_frame::{ @@ -618,12 +682,14 @@ mod macos { use super::{ AvFoundationCaptureOptions, AvFoundationError, AvFoundationFrame, AvFoundationNativeFrame, - MAX_CAPTURE_TIMESTAMP_AGE_US, }; use crate::device::{ CaptureDeviceSelector, CaptureFormat, CaptureFormatRequest, CaptureFrameFormat, CaptureResolution, }; + use crate::time::{ + elapsed_us, unix_time_us_now, validate_capture_timestamp_us, MAX_CAPTURE_TIMESTAMP_AGE_US, + }; unsafe extern "C" { fn CFRelease(cf: *const c_void); @@ -747,6 +813,12 @@ mod macos { pub(super) fn discard_pending_frame(&self) { self.shared.discard_latest(); } + + /// Returns the shared frame queue so callers outside the session-owning + /// thread can stop a blocked frame wait. + pub(super) fn frame_queue(&self) -> Arc { + Arc::clone(&self.shared) + } } fn preferred_video_settings( @@ -886,8 +958,11 @@ mod macos { } } + /// Latest-frame mailbox shared between the AVFoundation delegate queue and + /// the capturing thread. `pub(super)` so [`super::AvFoundationStopHandle`] + /// can hold it and unit tests can exercise the stop path without a camera. #[derive(Debug)] - struct FrameQueue { + pub(super) struct FrameQueue { state: Mutex, ready: Condvar, started_at: Instant, @@ -933,7 +1008,13 @@ mod macos { self.ready.notify_all(); } - fn stop(&self) { + /// Signals shutdown and wakes every blocked capture wait. + /// + /// Stopping is idempotent. 
`push_frame` discards frames delivered after + /// this point, and `take_frame`/`take_native_frame` fail with + /// [`AvFoundationError::NotRunning`] once any already-queued frame has + /// been drained. + pub(super) fn stop(&self) { let mut state = self.state.lock().expect("AVFoundation frame queue poisoned"); state.stopped = true; self.ready.notify_all(); @@ -979,27 +1060,33 @@ mod macos { } } - fn take_frame(&self) -> Result { - let mut state = self.state.lock().expect("AVFoundation frame queue poisoned"); - loop { - if let Some(frame) = state.latest.take() { - return frame.into_i420_frame(); - } - if let Some(error) = state.error.take() { - return Err(AvFoundationError::Runtime(error)); - } - if state.stopped { - return Err(AvFoundationError::NotRunning); - } - state = self.ready.wait(state).expect("AVFoundation frame queue poisoned"); - } + pub(super) fn take_frame(&self) -> Result { + // Convert only after `wait_take_queued_frame` has released the + // state mutex: the conversion locks the pixel buffer and runs a + // full-frame libyuv copy, and holding the mutex through that would + // block `push_frame` on the AVFoundation delegate queue, which + // drops camera frames while stalled + // (`setAlwaysDiscardsLateVideoFrames(true)`). + self.wait_take_queued_frame()?.into_i420_frame() + } + + pub(super) fn take_native_frame( + &self, + ) -> Result { + // See `take_frame` for why conversion happens outside the mutex. + self.wait_take_queued_frame()?.into_native_frame() } - fn take_native_frame(&self) -> Result { + /// Blocks until a frame, a delegate error, or a stop signal arrives and + /// moves the frame out of the shared state. The state mutex guard is + /// dropped when this returns, so callers convert the fully owned frame + /// without holding the lock. Fails with + /// [`AvFoundationError::NotRunning`] once the queue has been stopped. 
+ fn wait_take_queued_frame(&self) -> Result { let mut state = self.state.lock().expect("AVFoundation frame queue poisoned"); loop { if let Some(frame) = state.latest.take() { - return frame.into_native_frame(); + return Ok(frame); } if let Some(error) = state.error.take() { return Err(AvFoundationError::Runtime(error)); @@ -1535,19 +1622,6 @@ mod macos { (micros <= u64::MAX as f64).then_some(micros.round() as u64) } - fn validate_capture_timestamp_us( - capture_timestamp_us: u64, - read_wall_time_us: u64, - ) -> Option { - if capture_timestamp_us == 0 || capture_timestamp_us > read_wall_time_us { - return None; - } - if read_wall_time_us - capture_timestamp_us > MAX_CAPTURE_TIMESTAMP_AGE_US { - return None; - } - Some(capture_timestamp_us) - } - fn convert_pixel_buffer( pixel_buffer: &CVPixelBuffer, ) -> Result<(I420Buffer, CaptureFrameFormat), AvFoundationError> { @@ -1843,15 +1917,59 @@ mod macos { let data = unsafe { std::slice::from_raw_parts(base.cast::(), min_len) }; Ok(Plane { data, stride }) } +} - fn elapsed_us(duration: Duration) -> i64 { - i64::try_from(duration.as_micros()).unwrap_or(i64::MAX) - } +#[cfg(all(test, target_os = "macos"))] +mod tests { + use std::sync::{mpsc, Arc}; + use std::time::Duration; + + use super::{macos::FrameQueue, AvFoundationError, AvFoundationStopHandle}; + + /// Upper bound on how long a woken capture wait may take to return before + /// the test declares the stop path broken. + const STOP_WAIT_TIMEOUT: Duration = Duration::from_secs(5); + + // `FrameQueue` is pure Rust state, so these tests run on macOS CI hosts + // without camera hardware or AVFoundation involvement. 
+ + #[test] + fn stop_handle_unblocks_take_frame() { + let queue = Arc::new(FrameQueue::default()); + let stop_handle = AvFoundationStopHandle { shared: Arc::clone(&queue) }; + + let (done_tx, done_rx) = mpsc::channel(); + let waiter = std::thread::spawn(move || { + let result = queue.take_frame(); + let _ = done_tx.send(()); + result + }); - fn unix_time_us_now() -> Option { - SystemTime::now() - .duration_since(UNIX_EPOCH) - .ok() - .and_then(|duration| u64::try_from(duration.as_micros()).ok()) + // Give the waiter time to block on the condvar. There is no race if + // the stop lands first: the wait loop re-checks `stopped` before every + // wait. + std::thread::sleep(Duration::from_millis(50)); + stop_handle.stop(); + + done_rx + .recv_timeout(STOP_WAIT_TIMEOUT) + .expect("take_frame did not return after the stop handle fired"); + let result = waiter.join().expect("take_frame thread panicked"); + assert!( + matches!(result, Err(AvFoundationError::NotRunning)), + "unexpected take_frame result: {result:?}" + ); + } + + #[test] + fn capture_waits_fail_fast_once_stopped() { + let queue = Arc::new(FrameQueue::default()); + let stop_handle = AvFoundationStopHandle { shared: Arc::clone(&queue) }; + stop_handle.stop(); + // Stopping is idempotent. 
+ stop_handle.stop(); + + assert!(matches!(queue.take_frame(), Err(AvFoundationError::NotRunning))); + assert!(matches!(queue.take_native_frame(), Err(AvFoundationError::NotRunning))); } } diff --git a/livekit-capture/src/sources/gstreamer.rs b/livekit-capture/src/sources/gstreamer.rs index 75499af53..58bf0906f 100644 --- a/livekit-capture/src/sources/gstreamer.rs +++ b/livekit-capture/src/sources/gstreamer.rs @@ -19,13 +19,13 @@ use thiserror::Error; use ::gstreamer as gst; use ::gstreamer_app as gst_app; +use gst::prelude::*; use crate::{ encoded::{ h26x::{access_unit_from_annex_b, access_unit_from_h264_avc}, ingress::EncodedAccessUnitSource, - CodecSpecific, EncodedFrameType, EncodedVideoCodec, H264PacketizationMode, - OwnedEncodedAccessUnit, + CodecSpecific, EncodedFrameType, EncodedVideoCodec, OwnedEncodedAccessUnit, }, error::CaptureError, }; @@ -172,6 +172,15 @@ impl EncodedAccessUnitSource for GStreamerAppSinkEncodedSource { Err(err) => Err(GStreamerSourceError::PullSample(err.to_string())), } } + + fn request_keyframe(&mut self) { + // The `GstForceKeyUnit` custom upstream event is understood by every + // GStreamer video encoder (it is what gst-video's force-key-unit + // helper builds), so downstream PLI/FIR reaches the producer. + let structure = + gst::Structure::builder("GstForceKeyUnit").field("all-headers", true).build(); + let _ = self.appsink.send_event(gst::event::CustomUpstream::new(structure)); + } } /// Error returned by GStreamer appsink encoded sources. 
@@ -270,32 +279,334 @@ fn access_unit_from_sample_payload( width, height, ); - access_unit.codec_specific = codec_specific_for(codec); + access_unit.codec_specific = CodecSpecific::default_for(codec); Ok(access_unit) } } } -fn codec_specific_for(codec: EncodedVideoCodec) -> CodecSpecific { +fn clock_time_to_timestamp_us(start_timestamp_us: i64, timestamp: gst::ClockTime) -> i64 { + let timestamp_us = timestamp.useconds().min(i64::MAX as u64) as i64; + start_timestamp_us.saturating_add(timestamp_us) +} + +/// Name of the appsink element the pipeline helpers look up or create. +pub const ENCODED_APPSINK_NAME: &str = "lk_appsink"; + +/// Error returned by the GStreamer pipeline helpers. +#[derive(Debug, Error)] +#[non_exhaustive] +pub enum GStreamerPipelineError { + /// The requested codec does not match what the pipeline advertises. + #[error( + "GStreamer codec mismatch: requested {requested:?}, but {location} advertises {advertised:?}" + )] + CodecMismatch { + /// Codec requested by the caller. + requested: EncodedVideoCodec, + /// Codec advertised by the pipeline. + advertised: EncodedVideoCodec, + /// Pipeline location that advertised the codec. + location: String, + }, + /// The pipeline has no usable appsink and no unlinked encoded pad. + #[error( + "GStreamer pipeline must include `appsink name={ENCODED_APPSINK_NAME}` or leave one \ + encoded video source pad unlinked" + )] + MissingAppSink, + /// The named element exists but is not an appsink. + #[error("GStreamer element {ENCODED_APPSINK_NAME} is not an appsink")] + NotAnAppSink, + /// Pad caps advertise no supported encoded video codec. + #[error("unlinked GStreamer pad '{0}' does not advertise supported encoded video caps")] + UnsupportedPadCaps(String), + /// Caps advertise a stream layout the encoded sources cannot consume. + #[error("unsupported GStreamer caps: {0}")] + UnsupportedCaps(String), + /// Element creation or linking failed. 
+ #[error("{0}")] + Pipeline(String), +} + +/// Returns the appsink caps for a codec as a launch-string fragment. +/// +/// This is the single per-codec caps table: [`encoded_caps`] and pipeline +/// descriptions embedding a capsfilter should all derive from it. +pub fn encoded_caps_string(codec: EncodedVideoCodec) -> &'static str { + match codec { + EncodedVideoCodec::H264 => "video/x-h264,stream-format=byte-stream,alignment=au", + EncodedVideoCodec::H265 => "video/x-h265,stream-format=byte-stream,alignment=au", + EncodedVideoCodec::VP8 => "video/x-vp8", + EncodedVideoCodec::VP9 => "video/x-vp9,profile=(string)0", + EncodedVideoCodec::AV1 => "video/x-av1,stream-format=obu-stream,alignment=tu", + } +} + +/// Returns the appsink caps for a codec. +pub fn encoded_caps(codec: EncodedVideoCodec) -> Result { + encoded_caps_string(codec) + .parse::() + .map_err(|err| GStreamerPipelineError::Pipeline(format!("invalid encoded caps: {err}"))) +} + +/// Returns the appsink sample format used to ingest a codec. +pub fn sample_format_for_codec(codec: EncodedVideoCodec) -> GStreamerSampleFormat { + match codec { + EncodedVideoCodec::H264 => GStreamerSampleFormat::H264AnnexB, + EncodedVideoCodec::H265 => GStreamerSampleFormat::H265AnnexB, + EncodedVideoCodec::VP8 | EncodedVideoCodec::VP9 | EncodedVideoCodec::AV1 => { + GStreamerSampleFormat::AccessUnit { codec } + } + } +} + +/// Returns the parser element name used to normalize a codec, when one is needed. +pub fn parser_name(codec: EncodedVideoCodec) -> Option<&'static str> { + match codec { + EncodedVideoCodec::H264 => Some("h264parse"), + EncodedVideoCodec::H265 => Some("h265parse"), + EncodedVideoCodec::VP8 | EncodedVideoCodec::VP9 => None, + EncodedVideoCodec::AV1 => Some("av1parse"), + } +} + +/// Finds or builds the encoded appsink in a pipeline. +/// +/// When the pipeline already contains `appsink name=lk_appsink`, it is used +/// as-is (its sink caps decide the sample format). 
Otherwise the pipeline +/// must leave one encoded video source pad unlinked; the codec parser, a +/// capsfilter, and an appsink are created and linked to it. +pub fn ensure_encoded_appsink( + pipeline: &gst::Pipeline, + requested_codec: Option, +) -> Result<(gst_app::AppSink, GStreamerSampleFormat), GStreamerPipelineError> { + if let Some(appsink) = pipeline.by_name(ENCODED_APPSINK_NAME) { + let sample_format = match sample_format_from_element_sink_caps(&appsink)? { + Some(sample_format) => { + if let Some(requested_codec) = requested_codec { + if requested_codec != sample_format.codec() { + return Err(GStreamerPipelineError::CodecMismatch { + requested: requested_codec, + advertised: sample_format.codec(), + location: format!("appsink '{ENCODED_APPSINK_NAME}'"), + }); + } + } + sample_format + } + None => sample_format_for_codec(requested_codec.unwrap_or(EncodedVideoCodec::H264)), + }; + let appsink = appsink + .downcast::() + .map_err(|_| GStreamerPipelineError::NotAnAppSink)?; + return Ok((appsink, sample_format)); + } + + let src_pad = pipeline + .find_unlinked_pad(gst::PadDirection::Src) + .ok_or(GStreamerPipelineError::MissingAppSink)?; + let inferred_codec = codec_from_pad_caps(&src_pad) + .ok_or_else(|| GStreamerPipelineError::UnsupportedPadCaps(src_pad.name().to_string()))?; + let codec = match requested_codec { + Some(requested_codec) if requested_codec != inferred_codec => { + return Err(GStreamerPipelineError::CodecMismatch { + requested: requested_codec, + advertised: inferred_codec, + location: format!("unlinked pad '{}'", src_pad.name()), + }); + } + Some(requested_codec) => requested_codec, + None => inferred_codec, + }; + let sample_format = sample_format_for_codec(codec); + let src_element = src_pad.parent_element().ok_or_else(|| { + GStreamerPipelineError::Pipeline( + "unlinked GStreamer encoded pad has no parent element".to_owned(), + ) + })?; + + let parser = parser_element_for_codec(codec)?; + let codec_caps = encoded_caps(codec)?; + let 
capsfilter = gst::ElementFactory::make("capsfilter") + .property("caps", codec_caps) + .build() + .map_err(|err| { + GStreamerPipelineError::Pipeline(format!("failed to create {codec:?} capsfilter: {err}")) + })?; + let appsink = gst::ElementFactory::make("appsink") + .name(ENCODED_APPSINK_NAME) + .property("sync", false) + .property("max-buffers", 8u32) + .property("drop", true) + .build() + .map_err(|err| { + GStreamerPipelineError::Pipeline(format!("failed to create appsink: {err}")) + })?; + + if let Some(parser) = &parser { + pipeline.add(parser).map_err(|err| { + GStreamerPipelineError::Pipeline(format!( + "failed to add {} to GStreamer pipeline: {err}", + parser.name() + )) + })?; + } + pipeline.add(&capsfilter).map_err(|err| { + GStreamerPipelineError::Pipeline(format!( + "failed to add capsfilter to GStreamer pipeline: {err}" + )) + })?; + pipeline.add(&appsink).map_err(|err| { + GStreamerPipelineError::Pipeline(format!( + "failed to add appsink to GStreamer pipeline: {err}" + )) + })?; + if let Some(parser) = &parser { + gst::Element::link_many([parser, &capsfilter, &appsink]).map_err(|err| { + GStreamerPipelineError::Pipeline(format!( + "failed to link {} to appsink: {err}", + parser.name() + )) + })?; + } else { + gst::Element::link_many([&capsfilter, &appsink]).map_err(|err| { + GStreamerPipelineError::Pipeline(format!("failed to link capsfilter to appsink: {err}")) + })?; + } + let link_target = parser.as_ref().unwrap_or(&capsfilter); + let sink_pad = link_target.static_pad("sink").ok_or_else(|| { + GStreamerPipelineError::Pipeline(format!( + "{} did not expose a sink pad", + link_target.name() + )) + })?; + src_pad.link(&sink_pad).map_err(|err| { + GStreamerPipelineError::Pipeline(format!( + "failed to link '{}' to {}: {err}", + src_element.name(), + link_target.name() + )) + })?; + + let appsink = + appsink.downcast::().map_err(|_| GStreamerPipelineError::NotAnAppSink)?; + Ok((appsink, sample_format)) +} + +fn parser_element_for_codec( + codec: 
EncodedVideoCodec, +) -> Result, GStreamerPipelineError> { + let Some(name) = parser_name(codec) else { + return Ok(None); + }; + let mut builder = gst::ElementFactory::make(name); + if matches!(codec, EncodedVideoCodec::H264 | EncodedVideoCodec::H265) { + builder = builder.property("config-interval", -1i32); + } + builder + .build() + .map(Some) + .map_err(|err| GStreamerPipelineError::Pipeline(format!("failed to create {name}: {err}"))) +} + +fn sample_format_from_element_sink_caps( + element: &gst::Element, +) -> Result, GStreamerPipelineError> { + let Some(sink_pad) = element.static_pad("sink") else { + return Ok(None); + }; + sample_format_from_pad_caps(&sink_pad) +} + +fn sample_format_from_pad_caps( + pad: &gst::Pad, +) -> Result, GStreamerPipelineError> { + let caps = pad.current_caps().unwrap_or_else(|| pad.query_caps(None)); + for structure in caps.iter() { + if let Some(sample_format) = sample_format_from_caps_structure(structure)? { + return Ok(Some(sample_format)); + } + } + Ok(None) +} + +/// Infers the appsink sample format from a caps structure. 
+pub fn sample_format_from_caps_structure( + structure: &gst::StructureRef, +) -> Result, GStreamerPipelineError> { + let Some(codec) = codec_from_caps_name(structure.name()) else { + return Ok(None); + }; + match codec { EncodedVideoCodec::H264 => { - CodecSpecific::H264 { packetization_mode: H264PacketizationMode::NonInterleaved } + let stream_format = structure.get::("stream-format").ok(); + match stream_format.as_deref() { + Some("avc") | Some("avc3") => Ok(Some(GStreamerSampleFormat::H264Avc { + nal_length_size: h264_avc_nal_length_size_from_caps(structure), + })), + Some("byte-stream") | None => Ok(Some(GStreamerSampleFormat::H264AnnexB)), + Some(stream_format) => Err(GStreamerPipelineError::UnsupportedCaps(format!( + "H.264 stream-format '{stream_format}'; expected byte-stream or avc" + ))), + } } - EncodedVideoCodec::H265 => CodecSpecific::H265, - EncodedVideoCodec::VP8 => CodecSpecific::VP8 { temporal_id: None, layer_sync: false }, + EncodedVideoCodec::H265 => Ok(Some(GStreamerSampleFormat::H265AnnexB)), + EncodedVideoCodec::VP8 => Ok(Some(GStreamerSampleFormat::AccessUnit { codec })), EncodedVideoCodec::VP9 => { - CodecSpecific::VP9 { temporal_id: None, spatial_id: None, inter_layer_predicted: None } + let profile = structure.get::("profile").ok(); + match profile.as_deref() { + Some("0") | None => Ok(Some(GStreamerSampleFormat::AccessUnit { codec })), + Some(profile) => Err(GStreamerPipelineError::UnsupportedCaps(format!( + "VP9 profile '{profile}'; expected profile 0" + ))), + } + } + EncodedVideoCodec::AV1 => { + let stream_format = structure.get::("stream-format").ok(); + match stream_format.as_deref() { + Some("obu-stream") | None => Ok(Some(GStreamerSampleFormat::AccessUnit { codec })), + Some(stream_format) => Err(GStreamerPipelineError::UnsupportedCaps(format!( + "AV1 stream-format '{stream_format}'; expected obu-stream" + ))), + } } - EncodedVideoCodec::AV1 => CodecSpecific::AV1 { - scalability_mode: Some("L1T1".to_string()), - 
dependency_descriptor: None, - }, } } -fn clock_time_to_timestamp_us(start_timestamp_us: i64, timestamp: gst::ClockTime) -> i64 { - let timestamp_us = timestamp.useconds().min(i64::MAX as u64) as i64; - start_timestamp_us.saturating_add(timestamp_us) +fn h264_avc_nal_length_size_from_caps(structure: &gst::StructureRef) -> u8 { + let Ok(codec_data) = structure.get::("codec_data") else { + return 4; + }; + let Ok(codec_data) = codec_data.map_readable() else { + return 4; + }; + h264_avc_nal_length_size_from_codec_data(codec_data.as_ref()).unwrap_or(4) +} + +/// Reads the AVC NAL length-prefix size from `avcC` codec data. +pub fn h264_avc_nal_length_size_from_codec_data(codec_data: &[u8]) -> Option { + let length_size = (codec_data.get(4)? & 0x03) + 1; + (1..=4).contains(&length_size).then_some(length_size) +} + +/// Infers the encoded codec advertised by a pad's caps. +pub fn codec_from_pad_caps(pad: &gst::Pad) -> Option { + let caps = pad.current_caps().unwrap_or_else(|| pad.query_caps(None)); + caps.iter().find_map(|structure| codec_from_caps_name(structure.name())) +} + +/// Maps a caps media-type name to an encoded codec. 
+pub fn codec_from_caps_name(name: &str) -> Option { + match name { + "video/x-h264" => Some(EncodedVideoCodec::H264), + "video/x-h265" => Some(EncodedVideoCodec::H265), + "video/x-vp8" => Some(EncodedVideoCodec::VP8), + "video/x-vp9" => Some(EncodedVideoCodec::VP9), + "video/x-av1" => Some(EncodedVideoCodec::AV1), + _ => None, + } } #[cfg(test)] @@ -367,7 +678,7 @@ mod tests { 480, ) .unwrap(); - assert_eq!(vp9.codec_specific, codec_specific_for(EncodedVideoCodec::VP9)); + assert_eq!(vp9.codec_specific, CodecSpecific::default_for(EncodedVideoCodec::VP9)); let av1 = access_unit_from_sample_payload( GStreamerSampleFormat::AccessUnit { codec: EncodedVideoCodec::AV1 }, @@ -378,7 +689,7 @@ mod tests { 480, ) .unwrap(); - assert_eq!(av1.codec_specific, codec_specific_for(EncodedVideoCodec::AV1)); + assert_eq!(av1.codec_specific, CodecSpecific::default_for(EncodedVideoCodec::AV1)); } #[test] diff --git a/livekit-capture/src/sources/lk_argus.cpp b/livekit-capture/src/sources/lk_argus.cpp index abf740d84..1337fb729 100644 --- a/livekit-capture/src/sources/lk_argus.cpp +++ b/livekit-capture/src/sources/lk_argus.cpp @@ -253,6 +253,19 @@ static SensorTimestampStatus read_sensor_timestamp_ns( return egl_status; } +// Destroys the persistent NvBufSurface ring entries [0, count), releasing +// their DMA-BUF fds. Entries that were never created (nullptr) are skipped, +// so this is safe on a partially-initialized session. 
+static void destroy_dmabuf_surfaces(LkArgusSession* s, int count) { + for (int i = 0; i < count; i++) { + if (s->dmabuf_surfaces[i]) { + NvBufSurfaceDestroy(s->dmabuf_surfaces[i]); + s->dmabuf_surfaces[i] = nullptr; + } + s->dmabuf_fds[i] = -1; + } +} + extern "C" { void* lk_argus_create_session(int sensor_index, int width, int height, int fps) { @@ -478,6 +491,7 @@ void* lk_argus_create_session(int sensor_index, int width, int height, int fps) NvBufSurface* surface = nullptr; if (NvBufSurfaceCreate(&surface, 1, &create_params) != 0 || !surface) { fprintf(stderr, "[lk_argus] Failed to create NvBufSurface[%d]\n", i); + destroy_dmabuf_surfaces(s, i); delete s; return nullptr; } @@ -491,6 +505,7 @@ void* lk_argus_create_session(int sensor_index, int width, int height, int fps) if (status != Argus::STATUS_OK) { fprintf(stderr, "[lk_argus] Failed to start repeating capture: %d\n", static_cast(status)); + destroy_dmabuf_surfaces(s, kNumDmaBufs); delete s; return nullptr; } @@ -725,13 +740,7 @@ void lk_argus_destroy_session(void* handle) { s->current_frame.reset(); // Free all persistent NvBufSurface buffers using the original pointers. - for (int i = 0; i < kNumDmaBufs; i++) { - if (s->dmabuf_surfaces[i]) { - NvBufSurfaceDestroy(s->dmabuf_surfaces[i]); - s->dmabuf_surfaces[i] = nullptr; - } - s->dmabuf_fds[i] = -1; - } + destroy_dmabuf_surfaces(s, kNumDmaBufs); delete s; fprintf(stderr, "[lk_argus] Session destroyed\n"); diff --git a/livekit-capture/src/sources/mod.rs b/livekit-capture/src/sources/mod.rs index 4eaf64992..8f13320a2 100644 --- a/livekit-capture/src/sources/mod.rs +++ b/livekit-capture/src/sources/mod.rs @@ -14,16 +14,14 @@ //! Optional capture sources that feed the shared capture paths. -#[cfg(feature = "avfoundation")] -pub mod avfoundation { - //! macOS AVFoundation decoded-frame capture. 
- - pub use crate::platform::avfoundation::*; -} #[cfg(feature = "libargus")] pub mod argus; +#[cfg(feature = "avfoundation")] +pub mod avfoundation; #[cfg(feature = "gstreamer")] pub mod gstreamer; +#[cfg(feature = "tcpsink")] +pub(crate) mod io; #[cfg(feature = "rtsp")] pub mod rtsp; #[cfg(feature = "tcpsink")] diff --git a/livekit-capture/src/sources/rtsp.rs b/livekit-capture/src/sources/rtsp.rs index bbca3ca3a..316d88935 100644 --- a/livekit-capture/src/sources/rtsp.rs +++ b/livekit-capture/src/sources/rtsp.rs @@ -15,6 +15,7 @@ use std::{ io::{self, Read, Write}, net::TcpStream, + ops::Range, str, time::{Duration, Instant, SystemTime, UNIX_EPOCH}, }; @@ -31,6 +32,9 @@ use crate::encoded::{ const DEFAULT_RTSP_CLOCK_RATE: u32 = 90_000; const MAX_RTSP_HEADER_BYTES: usize = 64 * 1024; +const RTSP_STREAM_READ_CHUNK_BYTES: usize = 8 * 1024; +const DEFAULT_RTSP_READ_TIMEOUT: Duration = Duration::from_secs(10); +const DEFAULT_RTSP_IDLE_TIMEOUT: Duration = Duration::from_secs(30); /// Options used to open an RTSP encoded video source. #[derive(Debug, Clone, Copy, PartialEq, Eq)] @@ -43,12 +47,28 @@ pub struct RtspSourceOptions { pub width: u32, /// Encoded frame height in pixels. pub height: u32, + /// Non-zero socket read timeout applied to the RTSP TCP stream (default 10s). + /// + /// Handshake reads that exceed it fail with [`RtspSourceError::Timeout`]. + /// Streaming reads treat it as the retry granularity instead, so session + /// keepalives keep flowing while the stream is silent. + pub read_timeout: Duration, + /// Maximum stream silence tolerated before [`RtspSourceError::Timeout`] + /// (default 30s). Receiving any interleaved bytes resets the limit. + pub idle_timeout: Duration, } impl RtspSourceOptions { /// Creates RTSP source options for encoded frames with the supplied dimensions. 
pub fn new(width: u32, height: u32) -> Self { - Self { expected_codec: None, start_timestamp_us: 0, width, height } + Self { + expected_codec: None, + start_timestamp_us: 0, + width, + height, + read_timeout: DEFAULT_RTSP_READ_TIMEOUT, + idle_timeout: DEFAULT_RTSP_IDLE_TIMEOUT, + } } /// Requires the SDP video track to use the supplied codec. @@ -62,6 +82,18 @@ impl RtspSourceOptions { self.start_timestamp_us = start_timestamp_us; self } + + /// Sets the socket read timeout. + pub fn with_read_timeout(mut self, read_timeout: Duration) -> Self { + self.read_timeout = read_timeout; + self + } + + /// Sets the maximum stream silence tolerated before a timeout error. + pub fn with_idle_timeout(mut self, idle_timeout: Duration) -> Self { + self.idle_timeout = idle_timeout; + self + } } /// RTSP session details discovered while opening a source. @@ -96,6 +128,7 @@ impl RtspEncodedSource { let mut stream = TcpStream::connect((rtsp_url.connect_host.as_str(), rtsp_url.port)) .map_err(RtspSourceError::Io)?; let _ = stream.set_nodelay(true); + stream.set_read_timeout(Some(options.read_timeout)).map_err(RtspSourceError::Io)?; let mut auth = RtspAuthContext::new(rtsp_url.credentials.clone()); let mut cseq = 1; @@ -155,6 +188,7 @@ impl RtspEncodedSource { start_timestamp_us: options.start_timestamp_us, width: options.width, height: options.height, + idle_timeout: options.idle_timeout, }; let source = RtspInterleavedRtpSource::new(stream, config)?; let keepalive = RtspKeepalive::new( @@ -184,8 +218,16 @@ impl EncodedAccessUnitSource for RtspEncodedSource { type Error = RtspSourceError; fn next_access_unit(&mut self) -> Result, Self::Error> { - self.keepalive.maybe_send(self.source.reader_mut())?; - self.source.next_access_unit() + loop { + self.keepalive.maybe_send(self.source.reader_mut())?; + match self.source.poll_access_unit()? 
{ + AccessUnitPoll::AccessUnit(access_unit) => return Ok(Some(access_unit)), + AccessUnitPoll::EndOfStream => return Ok(None), + // A stream read timed out; loop so a due keepalive can be + // sent even while the interleaved stream is silent. + AccessUnitPoll::TimedOut => {} + } + } } } @@ -256,6 +298,9 @@ pub struct RtspInterleavedSourceConfig { pub width: u32, /// Encoded frame height in pixels. pub height: u32, + /// Maximum stream silence tolerated before timed-out reads become a hard + /// [`RtspSourceError::Timeout`]. Receiving any bytes resets the limit. + pub idle_timeout: Duration, } /// Encoded source for RTSP interleaved RTP streams. @@ -264,9 +309,35 @@ pub struct RtspInterleavedRtpSource { reader: R, config: RtspInterleavedSourceConfig, assembler: RtpAccessUnitAssembler, + /// Unconsumed stream bytes; may end with a partial unit that is kept + /// across timed-out reads so framing survives read timeouts. + stream_buf: Vec, + /// Consumed prefix of `stream_buf`, compacted before each fill. + stream_pos: usize, + /// When the last stream bytes were received, for the idle limit. + last_read_at: Instant, eof: bool, } +/// Progress from polling the interleaved stream for one access unit. +#[derive(Debug)] +enum AccessUnitPoll { + /// A complete access unit was assembled. + AccessUnit(OwnedEncodedAccessUnit), + /// The stream ended cleanly at a unit boundary. + EndOfStream, + /// A read timed out mid-stream; retry after running periodic work. + TimedOut, +} + +/// Result of one attempt to read more interleaved stream bytes. +#[derive(Debug)] +enum StreamFill { + Filled, + Eof, + TimedOut, +} + impl RtspInterleavedRtpSource where R: Read, @@ -280,7 +351,15 @@ where config.width, config.height, )?; - Ok(Self { reader, config, assembler, eof: false }) + Ok(Self { + reader, + config, + assembler, + stream_buf: Vec::new(), + stream_pos: 0, + last_read_at: Instant::now(), + eof: false, + }) } /// Returns the wrapped reader. 
@@ -298,34 +377,86 @@ where self.reader } - fn read_next_interleaved_frame(&mut self) -> Result)>, RtspSourceError> { - while !self.eof { - let mut magic = [0u8; 1]; - if !read_exact_or_clean_eof(&mut self.reader, &mut magic) - .map_err(RtspSourceError::Io)? - { - self.eof = true; - return Ok(None); + /// Advances the stream until an access unit completes, the stream ends, + /// or a read times out with framing state preserved for the next poll. + fn poll_access_unit(&mut self) -> Result { + loop { + if self.eof { + return Ok(AccessUnitPoll::EndOfStream); } - if magic[0] == b'R' { - let _ = read_rtsp_response_with_initial_byte(&mut self.reader, magic[0])?; - continue; - } - if magic[0] != b'$' { - return Err(RtspSourceError::UnexpectedData); + while let Some(unit) = parse_interleaved_unit(&self.stream_buf[self.stream_pos..])? { + let unit_start = self.stream_pos; + match unit { + ParsedInterleavedUnit::Frame { channel, payload, len } => { + self.stream_pos = unit_start + len; + if channel != self.config.video_channel { + continue; + } + let payload = + &self.stream_buf[unit_start + payload.start..unit_start + payload.end]; + if let Some(access_unit) = self.assembler.push(payload)? { + return Ok(AccessUnitPoll::AccessUnit(access_unit)); + } + } + ParsedInterleavedUnit::RtspResponse { len } => { + self.stream_pos = unit_start + len; + } + } } - let mut header = [0u8; 3]; - self.reader.read_exact(&mut header).map_err(RtspSourceError::Io)?; - let channel = header[0]; - let len = u16::from_be_bytes([header[1], header[2]]) as usize; - let mut payload = vec![0; len]; - self.reader.read_exact(&mut payload).map_err(RtspSourceError::Io)?; - return Ok(Some((channel, payload))); + match self.fill_stream_buf()? 
{ + StreamFill::Filled => {} + StreamFill::Eof => { + self.eof = true; + return Ok(AccessUnitPoll::EndOfStream); + } + StreamFill::TimedOut => return Ok(AccessUnitPoll::TimedOut), + } } + } - Ok(None) + /// Reads more stream bytes into `stream_buf`, compacting consumed data first. + fn fill_stream_buf(&mut self) -> Result { + if self.stream_pos > 0 { + self.stream_buf.drain(..self.stream_pos); + self.stream_pos = 0; + } + let filled = self.stream_buf.len(); + self.stream_buf.resize(filled + RTSP_STREAM_READ_CHUNK_BYTES, 0); + loop { + match self.reader.read(&mut self.stream_buf[filled..]) { + Ok(0) => { + self.stream_buf.truncate(filled); + return if filled == 0 { + Ok(StreamFill::Eof) + } else { + // The stream ended inside an interleaved unit. + Err(RtspSourceError::Io(io::Error::from(io::ErrorKind::UnexpectedEof))) + }; + } + Ok(read) => { + self.stream_buf.truncate(filled + read); + self.last_read_at = Instant::now(); + return Ok(StreamFill::Filled); + } + Err(err) if err.kind() == io::ErrorKind::Interrupted => {} + Err(err) if is_timeout_io_error(&err) => { + self.stream_buf.truncate(filled); + return if self.last_read_at.elapsed() >= self.config.idle_timeout { + Err(RtspSourceError::Timeout { + phase: "interleaved stream data".to_owned(), + }) + } else { + Ok(StreamFill::TimedOut) + }; + } + Err(err) => { + self.stream_buf.truncate(filled); + return Err(RtspSourceError::Io(err)); + } + } + } } } @@ -337,16 +468,57 @@ where fn next_access_unit(&mut self) -> Result, Self::Error> { loop { - let Some((channel, payload)) = self.read_next_interleaved_frame()? else { + match self.poll_access_unit()? { + AccessUnitPoll::AccessUnit(access_unit) => return Ok(Some(access_unit)), + AccessUnitPoll::EndOfStream => return Ok(None), + // Keep waiting until the configured idle limit turns + // timed-out reads into a hard error. + AccessUnitPoll::TimedOut => {} + } + } + } +} + +/// One unit parsed from the front of the interleaved stream buffer. 
+#[derive(Debug)] +enum ParsedInterleavedUnit { + /// Interleaved binary frame with its payload range and total length. + Frame { channel: u8, payload: Range, len: usize }, + /// In-stream RTSP response (for example a keepalive reply) to skip. + RtspResponse { len: usize }, +} + +/// Parses one interleaved unit from the front of `buf`, returning `Ok(None)` +/// when more bytes are needed. +fn parse_interleaved_unit(buf: &[u8]) -> Result, RtspSourceError> { + let Some(&magic) = buf.first() else { + return Ok(None); + }; + match magic { + b'$' => { + if buf.len() < 4 { return Ok(None); - }; - if channel != self.config.video_channel { - continue; } - if let Some(access_unit) = self.assembler.push(&payload)? { - return Ok(Some(access_unit)); + let channel = buf[1]; + let len = 4 + u16::from_be_bytes([buf[2], buf[3]]) as usize; + if buf.len() < len { + return Ok(None); + } + Ok(Some(ParsedInterleavedUnit::Frame { channel, payload: 4..len, len })) + } + b'R' => { + let mut remaining = buf; + match read_rtsp_response(&mut remaining) { + Ok(_response) => Ok(Some(ParsedInterleavedUnit::RtspResponse { + len: buf.len() - remaining.len(), + })), + Err(RtspSourceError::Io(err)) if err.kind() == io::ErrorKind::UnexpectedEof => { + Ok(None) + } + Err(err) => Err(err), } } + _ => Err(RtspSourceError::UnexpectedData), } } @@ -356,6 +528,12 @@ pub enum RtspSourceError { /// I/O failed while reading RTSP interleaved data. #[error("RTSP I/O failed: {0}")] Io(io::Error), + /// An RTSP read exceeded the configured timeout. + #[error("RTSP timed out waiting for {phase}")] + Timeout { + /// Protocol phase or data the client was waiting for. + phase: String, + }, /// RTSP URL was invalid or unsupported. #[error("invalid RTSP URL: {0}")] InvalidUrl(&'static str), @@ -385,13 +563,13 @@ pub enum RtspSourceError { /// SDP was missing a supported video track. 
#[error("RTSP SDP does not contain a supported video track")] MissingVideoTrack, - /// SDP selected a codec different from the requested codec. - #[error("RTSP SDP codec mismatch: expected {expected:?}, got {actual:?}")] + /// SDP did not offer the requested codec on any video track. + #[error("RTSP SDP codec mismatch: expected {expected:?}, offered {actual:?}")] CodecMismatch { /// Codec requested by the caller. expected: EncodedVideoCodec, - /// Codec selected from SDP. - actual: EncodedVideoCodec, + /// Supported codecs offered by the SDP video tracks. + actual: Vec, }, /// SDP body was malformed or not valid UTF-8. #[error("invalid RTSP SDP")] @@ -404,16 +582,8 @@ pub enum RtspSourceError { Rtp(#[from] RtpDepacketizerError), } -fn read_exact_or_clean_eof(reader: &mut impl Read, buf: &mut [u8]) -> io::Result { - let mut offset = 0; - while offset < buf.len() { - match reader.read(&mut buf[offset..])? { - 0 if offset == 0 => return Ok(false), - 0 => return Err(io::Error::from(io::ErrorKind::UnexpectedEof)), - read => offset += read, - } - } - Ok(true) +fn is_timeout_io_error(err: &io::Error) -> bool { + matches!(err.kind(), io::ErrorKind::WouldBlock | io::ErrorKind::TimedOut) } #[derive(Debug, Clone, PartialEq, Eq)] @@ -576,7 +746,13 @@ fn send_rtsp_request( authorization: Option, ) -> Result { write_rtsp_request(stream, method, uri, cseq, headers, authorization)?; - read_rtsp_response(stream) + read_rtsp_response(stream).map_err(|err| match err { + // Handshake reads must complete within the socket read timeout. 
+ RtspSourceError::Io(io_err) if is_timeout_io_error(&io_err) => { + RtspSourceError::Timeout { phase: format!("{method} response") } + } + err => err, + }) } fn write_rtsp_request( @@ -844,20 +1020,7 @@ fn make_cnonce() -> String { } fn read_rtsp_response(reader: &mut impl Read) -> Result { - read_rtsp_response_with_header_prefix(reader, Vec::new()) -} - -fn read_rtsp_response_with_initial_byte( - reader: &mut impl Read, - initial_byte: u8, -) -> Result { - read_rtsp_response_with_header_prefix(reader, vec![initial_byte]) -} - -fn read_rtsp_response_with_header_prefix( - reader: &mut impl Read, - mut header: Vec, -) -> Result { + let mut header = Vec::new(); let mut byte = [0u8; 1]; loop { reader.read_exact(&mut byte).map_err(RtspSourceError::Io)?; @@ -965,6 +1128,7 @@ fn parse_sdp_video_track( tracks.push(track); } + let mut offered = Vec::new(); for track in tracks { for payload_type in &track.payload_types { let Some(rtp_map) = track.rtp_maps.iter().find(|map| map.payload_type == *payload_type) @@ -973,7 +1137,10 @@ fn parse_sdp_video_track( }; if let Some(expected) = expected_codec { if rtp_map.codec != expected { - return Err(RtspSourceError::CodecMismatch { expected, actual: rtp_map.codec }); + if !offered.contains(&rtp_map.codec) { + offered.push(rtp_map.codec); + } + continue; } } @@ -986,7 +1153,12 @@ fn parse_sdp_video_track( } } - Err(RtspSourceError::MissingVideoTrack) + match expected_codec { + Some(expected) if !offered.is_empty() => { + Err(RtspSourceError::CodecMismatch { expected, actual: offered }) + } + _ => Err(RtspSourceError::MissingVideoTrack), + } } fn parse_video_media(media: &str) -> PartialSdpVideoTrack { @@ -1107,19 +1279,24 @@ mod tests { frame } - #[test] - fn reads_rtsp_interleaved_rtp_access_unit() { - let packet = rtp_packet(10, 12_000, true, &[0x65, 1, 2]); - let stream = interleaved(0, &packet); - let config = RtspInterleavedSourceConfig { + fn interleaved_config(video_channel: u8) -> RtspInterleavedSourceConfig { + 
RtspInterleavedSourceConfig { codec: EncodedVideoCodec::H264, clock_rate: 90_000, - video_channel: 0, + video_channel, start_timestamp_us: 0, width: 640, height: 480, - }; - let mut source = RtspInterleavedRtpSource::new(Cursor::new(stream), config).unwrap(); + idle_timeout: Duration::from_secs(30), + } + } + + #[test] + fn reads_rtsp_interleaved_rtp_access_unit() { + let packet = rtp_packet(10, 12_000, true, &[0x65, 1, 2]); + let stream = interleaved(0, &packet); + let mut source = + RtspInterleavedRtpSource::new(Cursor::new(stream), interleaved_config(0)).unwrap(); let access_unit = source.next_access_unit().unwrap().unwrap(); assert_eq!(access_unit.payload.as_ref(), &[0, 0, 0, 1, 0x65, 1, 2]); @@ -1132,15 +1309,8 @@ mod tests { let mut stream = Vec::new(); write_status_response(&mut stream, 4, &[], &[], 200, "OK"); stream.extend_from_slice(&interleaved(0, &packet)); - let config = RtspInterleavedSourceConfig { - codec: EncodedVideoCodec::H264, - clock_rate: 90_000, - video_channel: 0, - start_timestamp_us: 0, - width: 640, - height: 480, - }; - let mut source = RtspInterleavedRtpSource::new(Cursor::new(stream), config).unwrap(); + let mut source = + RtspInterleavedRtpSource::new(Cursor::new(stream), interleaved_config(0)).unwrap(); let access_unit = source.next_access_unit().unwrap().unwrap(); @@ -1148,6 +1318,59 @@ mod tests { assert!(source.next_access_unit().unwrap().is_none()); } + #[test] + fn recovers_interleaved_framing_across_read_timeouts() { + let listener = TcpListener::bind("127.0.0.1:0").unwrap(); + let addr = listener.local_addr().unwrap(); + let server = thread::spawn(move || { + let (mut stream, _) = listener.accept().unwrap(); + let packet = rtp_packet(10, 12_000, true, &[0x65, 1, 2]); + let frame = interleaved(0, &packet); + // Split inside the 4-byte interleaved header and pause long + // enough for several client read timeouts in between. 
+ let (head, tail) = frame.split_at(2); + stream.write_all(head).unwrap(); + stream.flush().unwrap(); + thread::sleep(Duration::from_millis(150)); + stream.write_all(tail).unwrap(); + stream.flush().unwrap(); + }); + + let client = std::net::TcpStream::connect(addr).unwrap(); + client.set_read_timeout(Some(Duration::from_millis(25))).unwrap(); + let mut source = RtspInterleavedRtpSource::new(client, interleaved_config(0)).unwrap(); + + let access_unit = source.next_access_unit().unwrap().unwrap(); + + assert_eq!(access_unit.payload.as_ref(), &[0, 0, 0, 1, 0x65, 1, 2]); + server.join().unwrap(); + } + + #[test] + fn interleaved_stream_times_out_after_idle_limit() { + let listener = TcpListener::bind("127.0.0.1:0").unwrap(); + let addr = listener.local_addr().unwrap(); + let server = thread::spawn(move || { + let (stream, _) = listener.accept().unwrap(); + // Stay silent past the client's idle limit before closing. + thread::sleep(Duration::from_millis(500)); + drop(stream); + }); + + let client = std::net::TcpStream::connect(addr).unwrap(); + client.set_read_timeout(Some(Duration::from_millis(20))).unwrap(); + let config = RtspInterleavedSourceConfig { + idle_timeout: Duration::from_millis(80), + ..interleaved_config(0) + }; + let mut source = RtspInterleavedRtpSource::new(client, config).unwrap(); + + let err = source.next_access_unit().unwrap_err(); + + assert!(matches!(err, RtspSourceError::Timeout { .. 
})); + server.join().unwrap(); + } + #[test] fn parses_sdp_video_track() { let base_url = RtspUrl::parse("rtsp://camera.example/live").unwrap(); @@ -1201,13 +1424,71 @@ a=rtpmap:96 VP9/90000\r\n"; let err = parse_sdp_video_track(&base_url, sdp, Some(EncodedVideoCodec::AV1)).unwrap_err(); - assert!(matches!( - err, - RtspSourceError::CodecMismatch { - expected: EncodedVideoCodec::AV1, - actual: EncodedVideoCodec::VP9 + match err { + RtspSourceError::CodecMismatch { expected, actual } => { + assert_eq!(expected, EncodedVideoCodec::AV1); + assert_eq!(actual, vec![EncodedVideoCodec::VP9]); } - )); + other => panic!("expected codec mismatch, got {other:?}"), + } + } + + #[test] + fn selects_expected_codec_among_multiple_payload_types() { + let base_url = RtspUrl::parse("rtsp://camera.example/live").unwrap(); + let sdp = "\ +v=0\r\n\ +m=video 0 RTP/AVP 98 96\r\n\ +a=control:trackID=1\r\n\ +a=rtpmap:98 H265/90000\r\n\ +a=rtpmap:96 H264/90000\r\n"; + + let track = parse_sdp_video_track(&base_url, sdp, Some(EncodedVideoCodec::H264)).unwrap(); + + assert_eq!(track.codec, EncodedVideoCodec::H264); + assert_eq!(track.payload_type, 96); + assert_eq!(track.clock_rate, 90_000); + assert_eq!(track.control_url, "rtsp://camera.example/live/trackID=1"); + } + + #[test] + fn selects_expected_codec_from_later_video_section() { + let base_url = RtspUrl::parse("rtsp://camera.example/live").unwrap(); + let sdp = "\ +v=0\r\n\ +m=video 0 RTP/AVP 98\r\n\ +a=control:trackID=1\r\n\ +a=rtpmap:98 H265/90000\r\n\ +m=video 0 RTP/AVP 96\r\n\ +a=control:trackID=2\r\n\ +a=rtpmap:96 H264/90000\r\n"; + + let track = parse_sdp_video_track(&base_url, sdp, Some(EncodedVideoCodec::H264)).unwrap(); + + assert_eq!(track.codec, EncodedVideoCodec::H264); + assert_eq!(track.payload_type, 96); + assert_eq!(track.control_url, "rtsp://camera.example/live/trackID=2"); + } + + #[test] + fn rejects_sdp_listing_all_offered_codecs_when_none_match() { + let base_url = 
RtspUrl::parse("rtsp://camera.example/live").unwrap(); + let sdp = "\ +v=0\r\n\ +m=video 0 RTP/AVP 98 96\r\n\ +a=control:trackID=1\r\n\ +a=rtpmap:98 H265/90000\r\n\ +a=rtpmap:96 H264/90000\r\n"; + + let err = parse_sdp_video_track(&base_url, sdp, Some(EncodedVideoCodec::VP8)).unwrap_err(); + + match err { + RtspSourceError::CodecMismatch { expected, actual } => { + assert_eq!(expected, EncodedVideoCodec::VP8); + assert_eq!(actual, vec![EncodedVideoCodec::H265, EncodedVideoCodec::H264]); + } + other => panic!("expected codec mismatch, got {other:?}"), + } } #[test] @@ -1420,6 +1701,84 @@ a=rtpmap:96 H264/90000\r\n"; server.join().unwrap(); } + #[test] + fn sends_keepalive_during_stream_silence() { + let listener = TcpListener::bind("127.0.0.1:0").unwrap(); + let addr = listener.local_addr().unwrap(); + let server = thread::spawn(move || { + let (mut stream, _) = listener.accept().unwrap(); + let _describe = read_request(&mut stream); + let sdp = "\ +v=0\r\n\ +m=video 0 RTP/AVP 96\r\n\ +a=control:trackID=0\r\n\ +a=rtpmap:96 H264/90000\r\n"; + write_response( + &mut stream, + 1, + &[("Content-Type", "application/sdp"), ("Content-Length", &sdp.len().to_string())], + sdp.as_bytes(), + ); + let _setup = read_request(&mut stream); + write_response( + &mut stream, + 2, + &[ + ("Session", "abc123;timeout=60"), + ("Transport", "RTP/AVP/TCP;unicast;interleaved=0-1"), + ], + &[], + ); + let _play = read_request(&mut stream); + write_response(&mut stream, 3, &[], &[]); + + // Send no interleaved data; the keepalive must arrive during the + // silence. Only then reply and send the first video frame. 
+ let keepalive = read_request(&mut stream); + write_response(&mut stream, 4, &[], &[]); + let packet = rtp_packet(10, 12_000, true, &[0x65, 1, 2]); + stream.write_all(&interleaved(0, &packet)).unwrap(); + keepalive + }); + + let options = RtspSourceOptions::new(640, 480) + .with_expected_codec(EncodedVideoCodec::H264) + .with_read_timeout(Duration::from_millis(100)) + .with_idle_timeout(Duration::from_secs(5)); + let mut source = + RtspEncodedSource::connect(&format!("rtsp://{addr}/camera"), options).unwrap(); + source.keepalive.next_due = Instant::now() + Duration::from_millis(250); + + let access_unit = source.next_access_unit().unwrap().unwrap(); + + assert_eq!(access_unit.payload.as_ref(), &[0, 0, 0, 1, 0x65, 1, 2]); + let keepalive = server.join().unwrap(); + assert!(keepalive.starts_with("OPTIONS rtsp://")); + assert!(keepalive.contains("Session: abc123")); + } + + #[test] + fn handshake_read_timeout_is_hard_error() { + let listener = TcpListener::bind("127.0.0.1:0").unwrap(); + let addr = listener.local_addr().unwrap(); + let server = thread::spawn(move || { + let (mut stream, _) = listener.accept().unwrap(); + let _describe = read_request(&mut stream); + // Never respond; hold the connection open past the read timeout. 
+ thread::sleep(Duration::from_millis(300)); + }); + + let options = RtspSourceOptions::new(640, 480).with_read_timeout(Duration::from_millis(50)); + let err = + RtspEncodedSource::connect(&format!("rtsp://{addr}/camera"), options).unwrap_err(); + + assert!( + matches!(&err, RtspSourceError::Timeout { phase } if phase.contains("DESCRIBE")), + "expected DESCRIBE timeout, got {err:?}" + ); + server.join().unwrap(); + } + fn read_request(stream: &mut impl Read) -> String { let mut request = Vec::new(); let mut byte = [0u8; 1]; diff --git a/livekit-capture/src/sources/tcp.rs b/livekit-capture/src/sources/tcp.rs index d2c96f544..d84715f35 100644 --- a/livekit-capture/src/sources/tcp.rs +++ b/livekit-capture/src/sources/tcp.rs @@ -21,12 +21,13 @@ use thiserror::Error; use crate::{ encoded::{ - h26x::{AnnexBAccessUnitParser, AvcAccessUnitParser}, + h26x::{AccessUnitParser, AnnexBAccessUnitParser, AvcAccessUnitParser}, ingress::EncodedAccessUnitSource, rtp::{RtpAccessUnitAssembler, RtpDepacketizerError}, EncodedVideoCodec, EncodedWireFormat, OwnedEncodedAccessUnit, }, error::CaptureError, + sources::io::read_exact_or_clean_eof, }; const DEFAULT_CHUNK_SIZE: usize = 4096; @@ -82,6 +83,9 @@ pub struct ByteStreamEncodedSource { parser: ByteStreamParser, read_chunk: Vec, eof: bool, + /// Whether the parser may still hold complete access units from the last + /// push, which must be drained before reading more from the stream. + drain_pending: bool, } /// TCP encoded source using the same parser as other byte streams. @@ -151,6 +155,7 @@ where parser, read_chunk: vec![0; config.read_chunk_size.max(1)], eof: false, + drain_pending: false, }) } @@ -174,42 +179,19 @@ where self.reader } - fn next_annex_b( + fn next_from_parser( reader: &mut R, read_chunk: &mut [u8], - parser: &mut AnnexBAccessUnitParser, + parser: &mut P, eof: &mut bool, + drain_pending: &mut bool, ) -> Result, TcpSourceError> { loop { - if let Some(access_unit) = parser.push(&[]).map_err(TcpSourceError::Capture)? 
{ - return Ok(Some(access_unit)); - } - if *eof { - return parser.flush().map_err(TcpSourceError::Capture); - } - - let read = reader.read(read_chunk).map_err(TcpSourceError::Io)?; - if read == 0 { - *eof = true; - continue; - } - if let Some(access_unit) = - parser.push(&read_chunk[..read]).map_err(TcpSourceError::Capture)? - { - return Ok(Some(access_unit)); - } - } - } - - fn next_avc( - reader: &mut R, - read_chunk: &mut [u8], - parser: &mut AvcAccessUnitParser, - eof: &mut bool, - ) -> Result, TcpSourceError> { - loop { - if let Some(access_unit) = parser.push(&[]).map_err(TcpSourceError::Capture)? { - return Ok(Some(access_unit)); + if *drain_pending { + if let Some(access_unit) = parser.drain().map_err(TcpSourceError::Capture)? { + return Ok(Some(access_unit)); + } + *drain_pending = false; } if *eof { return parser.flush().map_err(TcpSourceError::Capture); @@ -223,6 +205,7 @@ where if let Some(access_unit) = parser.push(&read_chunk[..read]).map_err(TcpSourceError::Capture)? { + *drain_pending = true; return Ok(Some(access_unit)); } } @@ -230,6 +213,7 @@ where fn next_rtp( reader: &mut R, + packet: &mut Vec, assembler: &mut RtpAccessUnitAssembler, eof: &mut bool, ) -> Result, TcpSourceError> { @@ -245,9 +229,9 @@ where continue; } - let mut packet = vec![0; packet_len]; - reader.read_exact(&mut packet).map_err(TcpSourceError::Io)?; - if let Some(access_unit) = assembler.push(&packet)? { + packet.resize(packet_len, 0); + reader.read_exact(packet).map_err(TcpSourceError::Io)?; + if let Some(access_unit) = assembler.push(packet)? 
{ return Ok(Some(access_unit)); } } @@ -337,14 +321,22 @@ where fn next_access_unit(&mut self) -> Result, Self::Error> { match &mut self.parser { - ByteStreamParser::H26x(parser) => { - Self::next_annex_b(&mut self.reader, &mut self.read_chunk, parser, &mut self.eof) - } - ByteStreamParser::H264Avc(parser) => { - Self::next_avc(&mut self.reader, &mut self.read_chunk, parser, &mut self.eof) - } + ByteStreamParser::H26x(parser) => Self::next_from_parser( + &mut self.reader, + &mut self.read_chunk, + parser, + &mut self.eof, + &mut self.drain_pending, + ), + ByteStreamParser::H264Avc(parser) => Self::next_from_parser( + &mut self.reader, + &mut self.read_chunk, + parser, + &mut self.eof, + &mut self.drain_pending, + ), ByteStreamParser::Rtp(assembler) => { - Self::next_rtp(&mut self.reader, assembler, &mut self.eof) + Self::next_rtp(&mut self.reader, &mut self.read_chunk, assembler, &mut self.eof) } } } @@ -367,18 +359,6 @@ pub enum TcpSourceError { Capture(CaptureError), } -fn read_exact_or_clean_eof(reader: &mut impl Read, buf: &mut [u8]) -> io::Result { - let mut offset = 0; - while offset < buf.len() { - match reader.read(&mut buf[offset..])? 
{ - 0 if offset == 0 => return Ok(false), - 0 => return Err(io::Error::from(io::ErrorKind::UnexpectedEof)), - read => offset += read, - } - } - Ok(true) -} - #[cfg(test)] mod tests { use std::{ diff --git a/livekit-capture/src/sources/v4l.rs b/livekit-capture/src/sources/v4l.rs index 899282f4e..6c203c880 100644 --- a/livekit-capture/src/sources/v4l.rs +++ b/livekit-capture/src/sources/v4l.rs @@ -16,10 +16,7 @@ use std::time::Duration; #[cfg(target_os = "linux")] -use std::{ - path::Path, - time::{Instant, SystemTime, UNIX_EPOCH}, -}; +use std::{path::Path, time::Instant}; #[cfg(target_os = "linux")] use livekit::webrtc::video_frame::VideoRotation; @@ -44,9 +41,10 @@ use crate::device::{ CaptureDeviceInfo, CaptureDeviceSelector, CaptureFormat, CaptureFormatRequest, CaptureFrameFormat, CapturePath, CaptureResolution, }; - #[cfg(any(target_os = "linux", test))] -const MAX_BACKEND_CAPTURE_TIMESTAMP_AGE_US: u64 = 5_000_000; +use crate::time::validate_capture_timestamp_us; +#[cfg(target_os = "linux")] +use crate::time::{elapsed_us, unix_time_us_now}; /// Options used to open a Linux V4L2 capture session. #[derive(Debug, Clone, PartialEq, Eq)] @@ -146,6 +144,9 @@ pub struct V4lCaptureSession { #[cfg(target_os = "linux")] stream: MmapStream<'static>, format: CaptureFormat, + /// Driver-reported row stride in bytes (V4L2 `bytesperline`). 
+ #[cfg(target_os = "linux")] + stride: u32, options: V4lCaptureOptions, #[cfg(target_os = "linux")] started_at: Instant, @@ -192,10 +193,11 @@ impl V4lCaptureSession { let frame_formats = frame_formats_for_request(&options)?; let device = open_device(&options.device)?; let all_formats = enumerate_device_formats(&device)?; - let format = apply_format_request(&device, &options, &frame_formats, &all_formats)?; + let (format, stride) = + apply_format_request(&device, &options, &frame_formats, &all_formats)?; let stream = MmapStream::with_buffers(&device, V4lBufferType::VideoCapture, 4).map_err(v4l_error)?; - Ok(Self { stream, format, options, started_at: Instant::now() }) + Ok(Self { stream, format, stride, options, started_at: Instant::now() }) } #[cfg(not(target_os = "linux"))] @@ -225,8 +227,14 @@ impl V4lCaptureSession { buffer: I420Buffer::new(width, height), }; let source = frame_bytes(buffer, metadata.bytesused); - let used_decode_path = - convert_to_i420(format.frame_format, source, width, height, &mut frame.buffer)?; + let used_decode_path = convert_to_i420( + format.frame_format, + source, + width, + height, + self.stride, + &mut frame.buffer, + )?; Ok(V4lFrame { frame, @@ -458,7 +466,7 @@ fn apply_format_request( options: &V4lCaptureOptions, frame_formats: &[CaptureFrameFormat], all_formats: &[CaptureFormat], -) -> Result { +) -> Result<(CaptureFormat, u32), V4lError> { match options.format { CaptureFormatRequest::Default => { let selected = select_format_for_request(&options.format, frame_formats, all_formats)?; @@ -481,7 +489,7 @@ fn apply_ordered_format_request( options: &V4lCaptureOptions, frame_formats: &[CaptureFrameFormat], all_formats: &[CaptureFormat], -) -> Result { +) -> Result<(CaptureFormat, u32), V4lError> { let mut last_error = None; for frame_format in frame_formats { let request = format_request_with_frame_format(&options.format, *frame_format); @@ -684,8 +692,11 @@ fn compare_format_preference( } #[cfg(target_os = "linux")] -fn 
set_device_format(device: &Device, selected: CaptureFormat) -> Result { - let current = device_capture_format(device)?; +fn set_device_format( + device: &Device, + selected: CaptureFormat, +) -> Result<(CaptureFormat, u32), V4lError> { + let (current, _) = device_capture_format(device)?; let format_changed = current.resolution != selected.resolution || current.frame_format != selected.frame_format; if format_changed { @@ -704,28 +715,32 @@ fn set_device_format(device: &Device, selected: CaptureFormat) -> Result Result { +fn device_capture_format(device: &Device) -> Result<(CaptureFormat, u32), V4lError> { let format = device.format().map_err(v4l_error)?; let params = device.params().map_err(v4l_error)?; - let frame_rate = frame_rate_from_fraction(params.interval) - .ok_or(V4lError::InvalidOption("V4L frame interval must be a whole frame rate"))?; - Ok(CaptureFormat::new( + let frame_rate = + frame_rate_from_fraction(params.interval.numerator, params.interval.denominator) + .ok_or(V4lError::InvalidOption("V4L frame interval must be non-zero"))?; + let capture_format = CaptureFormat::new( CaptureResolution::new(format.width, format.height), frame_rate, capture_frame_format_from_fourcc(format.fourcc) .ok_or_else(|| V4lError::Camera(format!("unsupported V4L fourcc {}", format.fourcc)))?, - )) + ); + Ok((capture_format, format.stride)) } #[cfg(target_os = "linux")] @@ -840,16 +855,17 @@ fn push_stepwise_resolution( fn frame_rates_from_interval(interval: v4l::FrameInterval) -> Vec { match interval.interval { FrameIntervalEnum::Discrete(fraction) => { - frame_rate_from_fraction(fraction).into_iter().collect() + frame_rate_from_fraction(fraction.numerator, fraction.denominator).into_iter().collect() } FrameIntervalEnum::Stepwise(stepwise) => { let mut frame_rates = Vec::new(); - if let Some(frame_rate) = frame_rate_from_fraction(stepwise.min) { - frame_rates.push(frame_rate); - } - if let Some(frame_rate) = frame_rate_from_fraction(stepwise.max) { - if 
!frame_rates.contains(&frame_rate) { - frame_rates.push(frame_rate); + for fraction in [stepwise.min, stepwise.max] { + if let Some(frame_rate) = + frame_rate_from_fraction(fraction.numerator, fraction.denominator) + { + if !frame_rates.contains(&frame_rate) { + frame_rates.push(frame_rate); + } } } frame_rates @@ -857,15 +873,20 @@ fn frame_rates_from_interval(interval: v4l::FrameInterval) -> Vec { } } -#[cfg(target_os = "linux")] -fn frame_rate_from_fraction(fraction: v4l::Fraction) -> Option { - if fraction.numerator == 0 || fraction.denominator == 0 { +/// Converts a V4L2 frame interval (seconds per frame) to frames per second. +/// +/// Non-integer rates (e.g. the NTSC interval 1001/30000 = 29.97fps) round to +/// the nearest whole rate, never below 1. +#[cfg(any(target_os = "linux", test))] +fn frame_rate_from_fraction(numerator: u32, denominator: u32) -> Option { + if numerator == 0 || denominator == 0 { return None; } - if fraction.denominator % fraction.numerator != 0 { - return None; + if denominator % numerator == 0 { + return Some(denominator / numerator); } - Some(fraction.denominator / fraction.numerator) + let rounded = (u64::from(denominator) + u64::from(numerator) / 2) / u64::from(numerator); + Some(u32::try_from(rounded).unwrap_or(u32::MAX).max(1)) } #[cfg(target_os = "linux")] @@ -878,12 +899,13 @@ fn frame_bytes(buffer: &[u8], bytes_used: u32) -> &[u8] { } } -#[cfg(target_os = "linux")] +#[cfg(any(target_os = "linux", test))] fn convert_to_i420( source_format: CaptureFrameFormat, source: &[u8], width: u32, height: u32, + source_stride: u32, destination: &mut I420Buffer, ) -> Result { let (stride_y, stride_u, stride_v) = destination.strides(); @@ -893,12 +915,14 @@ fn convert_to_i420( let ret = match source_format { CaptureFrameFormat::Yuyv => { - validate_len(source, width as usize * height as usize * 2, "YUYV frame")?; + let stride = source_row_stride(source_stride, width as usize * 2); + validate_len(source, stride * height as usize, "YUYV 
frame")?; + let stride_i32 = i32_from_usize(stride, "stride")?; unsafe { // SAFETY: Source and destination slices are valid for the dimensions and strides. yuv_sys::rs_YUY2ToI420( source.as_ptr(), - width_i32 * 2, + stride_i32, dst_y.as_mut_ptr(), stride_y as i32, dst_u.as_mut_ptr(), @@ -911,12 +935,14 @@ fn convert_to_i420( } } CaptureFrameFormat::Rgb24 => { - validate_len(source, width as usize * height as usize * 3, "RGB24 frame")?; + let stride = source_row_stride(source_stride, width as usize * 3); + validate_len(source, stride * height as usize, "RGB24 frame")?; + let stride_i32 = i32_from_usize(stride, "stride")?; unsafe { // SAFETY: Source and destination slices are valid for the dimensions and strides. yuv_sys::rs_RGB24ToI420( source.as_ptr(), - width_i32 * 3, + stride_i32, dst_y.as_mut_ptr(), stride_y as i32, dst_u.as_mut_ptr(), @@ -929,12 +955,14 @@ fn convert_to_i420( } } CaptureFrameFormat::Bgr24 => { - validate_len(source, width as usize * height as usize * 3, "BGR24 frame")?; + let stride = source_row_stride(source_stride, width as usize * 3); + validate_len(source, stride * height as usize, "BGR24 frame")?; + let stride_i32 = i32_from_usize(stride, "stride")?; unsafe { // SAFETY: Source and destination slices are valid for the dimensions and strides. yuv_sys::rs_RAWToI420( source.as_ptr(), - width_i32 * 3, + stride_i32, dst_y.as_mut_ptr(), stride_y as i32, dst_u.as_mut_ptr(), @@ -947,12 +975,14 @@ fn convert_to_i420( } } CaptureFrameFormat::Grey => { - validate_len(source, width as usize * height as usize, "GREY frame")?; + let stride = source_row_stride(source_stride, width as usize); + validate_len(source, stride * height as usize, "GREY frame")?; + let stride_i32 = i32_from_usize(stride, "stride")?; unsafe { // SAFETY: Source and destination slices are valid for the dimensions and strides. 
yuv_sys::rs_I400ToI420( source.as_ptr(), - width_i32, + stride_i32, dst_y.as_mut_ptr(), stride_y as i32, dst_u.as_mut_ptr(), @@ -965,15 +995,19 @@ fn convert_to_i420( } } CaptureFrameFormat::Nv12 => { - let y_size = width as usize * height as usize; + // Single-planar V4L2 NV12: the interleaved chroma plane follows the + // luma plane at `stride * height` and shares the luma stride. + let stride = source_row_stride(source_stride, width as usize); + let y_size = stride * height as usize; validate_len(source, y_size + y_size / 2, "NV12 frame")?; + let stride_i32 = i32_from_usize(stride, "stride")?; unsafe { // SAFETY: Source and destination slices are valid for the dimensions and strides. yuv_sys::rs_NV12ToI420( source.as_ptr(), - width_i32, + stride_i32, source[y_size..].as_ptr(), - width_i32, + stride_i32, dst_y.as_mut_ptr(), stride_y as i32, dst_u.as_mut_ptr(), @@ -1000,7 +1034,15 @@ fn convert_to_i420( } } -#[cfg(target_os = "linux")] +/// Returns the effective source row stride in bytes, falling back to the +/// packed width-derived stride when the driver reports `bytesperline` as zero +/// or smaller than one packed row. 
+#[cfg(any(target_os = "linux", test))] +fn source_row_stride(reported_stride: u32, packed_stride: usize) -> usize { + (reported_stride as usize).max(packed_stride) +} + +#[cfg(any(target_os = "linux", test))] fn convert_mjpeg_to_i420( source: &[u8], width: u32, @@ -1061,7 +1103,7 @@ fn convert_mjpeg_to_i420( } } -#[cfg(target_os = "linux")] +#[cfg(any(target_os = "linux", test))] fn validate_len(source: &[u8], expected: usize, label: &'static str) -> Result<(), V4lError> { if source.len() < expected { return Err(V4lError::InvalidFrame(label)); @@ -1076,38 +1118,18 @@ fn select_capture_wall_time_us( read_wall_time_us: u64, ) -> u64 { backend_capture_timestamp - .and_then(|timestamp| validate_backend_capture_timestamp_us(timestamp, read_wall_time_us)) + .and_then(|timestamp| u64::try_from(timestamp.as_micros()).ok()) + .and_then(|timestamp_us| validate_capture_timestamp_us(timestamp_us, read_wall_time_us)) .unwrap_or(fallback_wall_time_us) } #[cfg(any(target_os = "linux", test))] -fn validate_backend_capture_timestamp_us( - capture_timestamp: Duration, - read_wall_time_us: u64, -) -> Option { - let capture_timestamp_us = u64::try_from(capture_timestamp.as_micros()).ok()?; - if capture_timestamp_us == 0 || capture_timestamp_us > read_wall_time_us { - return None; - } - if read_wall_time_us - capture_timestamp_us > MAX_BACKEND_CAPTURE_TIMESTAMP_AGE_US { - return None; - } - Some(capture_timestamp_us) -} - -#[cfg(target_os = "linux")] -fn unix_time_us_now() -> Option { - let elapsed = SystemTime::now().duration_since(UNIX_EPOCH).ok()?; - u64::try_from(elapsed.as_micros()).ok() -} - -#[cfg(target_os = "linux")] -fn elapsed_us(duration: Duration) -> i64 { - i64::try_from(duration.as_micros()).unwrap_or(i64::MAX) +fn i32_from_u32(value: u32, field: &'static str) -> Result { + i32::try_from(value).map_err(|_| V4lError::OptionOutOfRange(field)) } -#[cfg(target_os = "linux")] -fn i32_from_u32(value: u32, field: &'static str) -> Result { +#[cfg(any(target_os = "linux", 
test))] +fn i32_from_usize(value: usize, field: &'static str) -> Result { i32::try_from(value).map_err(|_| V4lError::OptionOutOfRange(field)) } @@ -1146,6 +1168,7 @@ fn clock_time(clock_id: libc::clockid_t) -> Option { #[cfg(test)] mod tests { use super::*; + use crate::time::MAX_CAPTURE_TIMESTAMP_AGE_US; #[test] fn rejects_empty_frame_format_preferences() { @@ -1180,4 +1203,159 @@ mod tests { select_capture_wall_time_us(Some(Duration::from_micros(10)), 10_000_000, 10_000_000); assert_eq!(selected, 10_000_000); } + + #[test] + fn accepts_recent_backend_capture_timestamp() { + let read_us = 20_000_000; + let recent = Duration::from_micros(read_us - 1_000); + assert_eq!(select_capture_wall_time_us(Some(recent), 42, read_us), read_us - 1_000); + } + + #[test] + fn ignores_backend_capture_timestamp_older_than_max_age() { + let read_us = 20_000_000; + let stale = Duration::from_micros(read_us - MAX_CAPTURE_TIMESTAMP_AGE_US - 1); + assert_eq!(select_capture_wall_time_us(Some(stale), 42, read_us), 42); + } + + #[test] + fn frame_rate_from_fraction_rounds_fractional_intervals() { + assert_eq!(frame_rate_from_fraction(1, 30), Some(30)); + assert_eq!(frame_rate_from_fraction(1001, 30000), Some(30)); + assert_eq!(frame_rate_from_fraction(1001, 60000), Some(60)); + assert_eq!(frame_rate_from_fraction(3, 1), Some(1)); + } + + #[test] + fn frame_rate_from_fraction_rejects_zero_terms() { + assert_eq!(frame_rate_from_fraction(0, 30000), None); + assert_eq!(frame_rate_from_fraction(1001, 0), None); + } + + #[test] + fn converts_padded_stride_nv12_frame() { + let width = 6u32; + let height = 4u32; + let stride = 8usize; + let y_size = stride * height as usize; + // Padding bytes past each 6-pixel row must never reach the output. 
+ let mut source = vec![0xEE; y_size + y_size / 2]; + for row in 0..height as usize { + for col in 0..width as usize { + source[row * stride + col] = (100 + row * 10 + col) as u8; + } + } + for row in 0..height as usize / 2 { + for pair in 0..width as usize / 2 { + source[y_size + row * stride + pair * 2] = (50 + row * 10 + pair) as u8; + source[y_size + row * stride + pair * 2 + 1] = (150 + row * 10 + pair) as u8; + } + } + + let mut destination = I420Buffer::new(width, height); + let used_decode_path = convert_to_i420( + CaptureFrameFormat::Nv12, + &source, + width, + height, + stride as u32, + &mut destination, + ) + .expect("padded NV12 frame must convert"); + assert!(!used_decode_path); + + let (stride_y, stride_u, stride_v) = destination.strides(); + let (dst_y, dst_u, dst_v) = destination.data(); + for row in 0..height as usize { + for col in 0..width as usize { + assert_eq!( + dst_y[row * stride_y as usize + col], + (100 + row * 10 + col) as u8, + "Y({row},{col})" + ); + } + } + for row in 0..height as usize / 2 { + for pair in 0..width as usize / 2 { + assert_eq!(dst_u[row * stride_u as usize + pair], (50 + row * 10 + pair) as u8); + assert_eq!(dst_v[row * stride_v as usize + pair], (150 + row * 10 + pair) as u8); + } + } + } + + #[test] + fn converts_padded_stride_yuyv_frame() { + let width = 6u32; + let height = 2u32; + let stride = 16usize; + // Padding bytes past each 12-byte packed row must never reach the output. 
+ let mut source = vec![0xEE; stride * height as usize]; + for row in 0..height as usize { + for col in 0..width as usize { + source[row * stride + col * 2] = (40 + row * 10 + col) as u8; + source[row * stride + col * 2 + 1] = 128; + } + } + + let mut destination = I420Buffer::new(width, height); + convert_to_i420( + CaptureFrameFormat::Yuyv, + &source, + width, + height, + stride as u32, + &mut destination, + ) + .expect("padded YUYV frame must convert"); + + let (stride_y, _, _) = destination.strides(); + let (dst_y, _, _) = destination.data(); + for row in 0..height as usize { + for col in 0..width as usize { + assert_eq!( + dst_y[row * stride_y as usize + col], + (40 + row * 10 + col) as u8, + "Y({row},{col})" + ); + } + } + } + + #[test] + fn rejects_nv12_frame_shorter_than_padded_stride_size() { + let width = 6u32; + let height = 4u32; + let packed = vec![0u8; (width * height) as usize * 3 / 2]; + let mut destination = I420Buffer::new(width, height); + let err = + convert_to_i420(CaptureFrameFormat::Nv12, &packed, width, height, 8, &mut destination) + .expect_err("packed-size buffer must fail the stride-aware length check"); + assert!(matches!(err, V4lError::InvalidFrame("NV12 frame"))); + } + + #[test] + fn falls_back_to_packed_stride_when_driver_reports_zero() { + let width = 4u32; + let height = 2u32; + let y_size = (width * height) as usize; + let mut source = vec![128u8; y_size + y_size / 2]; + for (index, value) in source.iter_mut().take(y_size).enumerate() { + *value = index as u8; + } + + let mut destination = I420Buffer::new(width, height); + convert_to_i420(CaptureFrameFormat::Nv12, &source, width, height, 0, &mut destination) + .expect("packed NV12 frame with zero reported stride must convert"); + + let (stride_y, _, _) = destination.strides(); + let (dst_y, _, _) = destination.data(); + for row in 0..height as usize { + for col in 0..width as usize { + assert_eq!( + dst_y[row * stride_y as usize + col], + (row * width as usize + col) as u8 + ); + 
} + } + } } diff --git a/livekit-capture/src/track.rs b/livekit-capture/src/track.rs index c9bfdf26b..0576001bc 100644 --- a/livekit-capture/src/track.rs +++ b/livekit-capture/src/track.rs @@ -22,7 +22,9 @@ use livekit::{ }; use crate::{ - encoded::{EncodedAccessUnit, EncodedVideoCodec}, + encoded::{ + CodecSpecific, EncodedAccessUnit, EncodedLayerInfo, EncodedPayload, EncodedVideoCodec, + }, error::CaptureError, }; @@ -46,6 +48,18 @@ impl VideoCaptureTrack { Self { source, track } } + /// Creates a capture track for pre-encoded access units. + /// + /// Unlike [`VideoCaptureTrack::new`], no raw keepalive frames are + /// injected before the first capture, so the sender starts directly on + /// the passthrough encoder instead of briefly encoding black frames. + pub fn new_encoded(name: &str, resolution: VideoResolution) -> Self { + let source = NativeVideoSource::new_encoded(resolution); + let track = + LocalVideoTrack::create_video_track(name, RtcVideoSource::Native(source.clone())); + Self { source, track } + } + /// Returns the publishable local video track. pub fn track(&self) -> LocalVideoTrack { self.track.clone() @@ -57,9 +71,26 @@ impl VideoCaptureTrack { } /// Captures one DMA-BUF backed frame. + /// + /// The native capture path hands a single file descriptor to the driver + /// and derives the plane layout from the underlying buffer itself + /// (NvBufSurface); per-plane offsets, strides, and DRM modifiers in + /// [`DmaBufFrame`] are informational and must describe that derived + /// layout. Frames whose planes span multiple file descriptors or start + /// at a nonzero offset are rejected rather than silently truncated. 
#[cfg(target_os = "linux")] pub fn capture_dmabuf(&self, frame: &DmaBufFrame) -> Result<(), CaptureError> { let plane = frame.planes.first().ok_or(CaptureError::MissingDmaBufPlane)?; + if frame.planes.iter().any(|other| other.fd != plane.fd) { + return Err(CaptureError::UnsupportedDmaBufLayout( + "planes must share one DMA-BUF file descriptor", + )); + } + if plane.offset != 0 { + return Err(CaptureError::UnsupportedDmaBufLayout( + "first plane must start at offset 0", + )); + } let ok = self.source.capture_dmabuf_frame( plane.fd, frame.width, @@ -71,13 +102,26 @@ impl VideoCaptureTrack { } /// Captures one encoded video access unit. + /// + /// The passthrough path forwards single-layer streams: access units + /// carrying temporal/spatial layer ids, an AV1 dependency descriptor, or + /// a non-`L1T1` scalability mode are rejected so callers are not misled + /// into thinking that metadata reaches the wire. pub fn capture_encoded(&self, access_unit: &EncodedAccessUnit<'_>) -> Result<(), CaptureError> { validate_encoded_access_unit(access_unit)?; - let payload = access_unit.payload.to_vec(); + let mut scratch = Vec::new(); + let payload: &[u8] = match &access_unit.payload { + EncodedPayload::Contiguous(bytes) => bytes, + EncodedPayload::Owned(bytes) => bytes, + EncodedPayload::Fragments(_) => { + scratch = access_unit.payload.to_vec(); + &scratch + } + }; let frame = EncodedVideoFrame { codec: access_unit.codec.into(), - payload: &payload, + payload, timestamp_us: access_unit.timestamp_us, frame_type: access_unit.frame_type.into(), width: access_unit.width, @@ -87,6 +131,17 @@ impl VideoCaptureTrack { self.source.capture_encoded_frame(&frame).then_some(()).ok_or(CaptureError::CaptureFailed) } + /// Returns and clears the pending keyframe request raised by the + /// passthrough encoder (PLI/FIR from the SFU, late subscriber join, or + /// sender reconfiguration). 
+ /// + /// Poll this from the capture loop and forward the request to the + /// upstream encoder so it produces an IDR; until one arrives, new + /// subscribers cannot render the track. + pub fn take_keyframe_request(&self) -> bool { + self.source.take_keyframe_request() + } + /// Returns publish options appropriate for encoded passthrough. pub fn encoded_publish_options(codec: EncodedVideoCodec) -> TrackPublishOptions { TrackPublishOptions { @@ -102,6 +157,19 @@ fn validate_encoded_access_unit(access_unit: &EncodedAccessUnit<'_>) -> Result<( if access_unit.payload.is_empty() { return Err(CaptureError::EmptyPayload); } + if access_unit.layers != EncodedLayerInfo::default() { + return Err(CaptureError::UnsupportedLayeredEncoding( + "temporal/spatial layer ids are not forwarded by the passthrough encoder", + )); + } + let default_specific = CodecSpecific::default_for(access_unit.codec); + if access_unit.codec_specific != CodecSpecific::None + && access_unit.codec_specific != default_specific + { + return Err(CaptureError::UnsupportedLayeredEncoding( + "codec-specific layering metadata is not forwarded by the passthrough encoder", + )); + } Ok(()) } @@ -139,4 +207,55 @@ mod tests { assert_eq!(validate_encoded_access_unit(&access_unit), Err(CaptureError::EmptyPayload)); } + + #[test] + fn accepts_default_codec_specific_metadata() { + let mut access_unit = EncodedAccessUnit::contiguous( + EncodedVideoCodec::AV1, + &[1, 2, 3], + 0, + EncodedFrameType::Key, + 640, + 480, + ); + access_unit.codec_specific = CodecSpecific::default_for(EncodedVideoCodec::AV1); + + assert!(validate_encoded_access_unit(&access_unit).is_ok()); + } + + #[test] + fn rejects_layered_access_units() { + let mut access_unit = EncodedAccessUnit::contiguous( + EncodedVideoCodec::VP9, + &[1, 2, 3], + 0, + EncodedFrameType::Key, + 640, + 480, + ); + access_unit.layers = EncodedLayerInfo { spatial_id: None, temporal_id: Some(1) }; + + assert!(matches!( + validate_encoded_access_unit(&access_unit), + 
Err(CaptureError::UnsupportedLayeredEncoding(_)) + )); + } + + #[test] + fn rejects_non_default_codec_specific_metadata() { + let mut access_unit = EncodedAccessUnit::contiguous( + EncodedVideoCodec::VP8, + &[1, 2, 3], + 0, + EncodedFrameType::Key, + 640, + 480, + ); + access_unit.codec_specific = CodecSpecific::VP8 { temporal_id: Some(1), layer_sync: true }; + + assert!(matches!( + validate_encoded_access_unit(&access_unit), + Err(CaptureError::UnsupportedLayeredEncoding(_)) + )); + } } diff --git a/webrtc-sys/include/livekit/encoded_video_frame_buffer.h b/webrtc-sys/include/livekit/encoded_video_frame_buffer.h index f5bec6665..a5be935f0 100644 --- a/webrtc-sys/include/livekit/encoded_video_frame_buffer.h +++ b/webrtc-sys/include/livekit/encoded_video_frame_buffer.h @@ -16,9 +16,11 @@ #pragma once +#include #include -#include +#include +#include "api/video/encoded_image.h" #include "api/video/video_frame_buffer.h" namespace livekit { @@ -39,11 +41,17 @@ enum class EncodedFrameType { // A native WebRTC frame buffer carrying one encoded video access unit. class EncodedVideoFrameBuffer : public webrtc::VideoFrameBuffer { public: - EncodedVideoFrameBuffer(int width, - int height, - EncodedVideoCodec codec, - EncodedFrameType frame_type, - std::vector payload); + // `keyframe_request_flag` is shared with the owning video source: the + // pass-through encoder sets it when the RTP layer asks for a keyframe the + // pending frame cannot satisfy, and the capture side polls it to forward + // the request upstream. 
+ EncodedVideoFrameBuffer( + int width, + int height, + EncodedVideoCodec codec, + EncodedFrameType frame_type, + webrtc::scoped_refptr payload, + std::shared_ptr> keyframe_request_flag = nullptr); ~EncodedVideoFrameBuffer() override = default; Type type() const override; @@ -60,7 +68,17 @@ class EncodedVideoFrameBuffer : public webrtc::VideoFrameBuffer { EncodedVideoCodec codec() const { return codec_; } EncodedFrameType frame_type() const { return frame_type_; } - const std::vector& payload() const { return payload_; } + + // The encoded access unit. Shared with the pass-through encoder so the + // payload is not copied again on the send path. + webrtc::scoped_refptr encoded_data() const { + return payload_; + } + const uint8_t* payload_data() const { return payload_->data(); } + size_t payload_size() const { return payload_->size(); } + + // Asks the capture side to produce a keyframe (e.g. on PLI/FIR). + void request_keyframe() const; static EncodedVideoFrameBuffer* FromNative(webrtc::VideoFrameBuffer* buffer); @@ -69,7 +87,8 @@ class EncodedVideoFrameBuffer : public webrtc::VideoFrameBuffer { int height_; EncodedVideoCodec codec_; EncodedFrameType frame_type_; - std::vector payload_; + webrtc::scoped_refptr payload_; + std::shared_ptr> keyframe_request_flag_; }; } // namespace livekit diff --git a/webrtc-sys/include/livekit/video_track.h b/webrtc-sys/include/livekit/video_track.h index fc1d614fd..8b35774bf 100644 --- a/webrtc-sys/include/livekit/video_track.h +++ b/webrtc-sys/include/livekit/video_track.h @@ -16,6 +16,7 @@ #pragma once +#include #include #include "api/media_stream_interface.h" @@ -105,11 +106,19 @@ class VideoTrackSource { void set_packet_trailer_handler( std::shared_ptr handler); + // Shared with every EncodedVideoFrameBuffer this source emits; the + // pass-through encoder raises it on unsatisfied keyframe requests. 
+ std::shared_ptr> keyframe_request_flag() const { + return keyframe_request_flag_; + } + private: mutable webrtc::Mutex mutex_; webrtc::TimestampAligner timestamp_aligner_; VideoResolution resolution_; std::shared_ptr packet_trailer_handler_; + std::shared_ptr> keyframe_request_flag_ = + std::make_shared>(false); bool is_screencast_; }; @@ -135,8 +144,14 @@ class VideoTrackSource { bool capture_encoded_frame(int width, int height, const EncodedVideoFrameData& frame, + rust::Slice payload, const FrameMetadata& frame_metadata) const; + // Returns and clears the pending upstream keyframe request raised by the + // pass-through encoder (PLI/FIR or post-reconfigure). Poll from the + // capture loop. + bool take_keyframe_request() const; + void set_packet_trailer_handler( std::shared_ptr handler) const; diff --git a/webrtc-sys/src/encoded_video_frame_buffer.cpp b/webrtc-sys/src/encoded_video_frame_buffer.cpp index c5d321fd0..62a474d6f 100644 --- a/webrtc-sys/src/encoded_video_frame_buffer.cpp +++ b/webrtc-sys/src/encoded_video_frame_buffer.cpp @@ -18,6 +18,7 @@ #include +#include "api/video/i420_buffer.h" #include "rtc_base/logging.h" namespace livekit { @@ -27,12 +28,14 @@ EncodedVideoFrameBuffer::EncodedVideoFrameBuffer( int height, EncodedVideoCodec codec, EncodedFrameType frame_type, - std::vector payload) + webrtc::scoped_refptr payload, + std::shared_ptr> keyframe_request_flag) : width_(width), height_(height), codec_(codec), frame_type_(frame_type), - payload_(std::move(payload)) {} + payload_(std::move(payload)), + keyframe_request_flag_(std::move(keyframe_request_flag)) {} webrtc::VideoFrameBuffer::Type EncodedVideoFrameBuffer::type() const { return Type::kNative; @@ -48,8 +51,19 @@ int EncodedVideoFrameBuffer::height() const { webrtc::scoped_refptr EncodedVideoFrameBuffer::ToI420() { - RTC_LOG(LS_ERROR) << "EncodedVideoFrameBuffer::ToI420 is unsupported"; - return nullptr; + // Sinks attached to a pre-encoded track (local preview, FFI color + // conversion) 
convert whatever buffer they receive; the encoded payload + // cannot be decoded here, so hand back a black frame instead of a null + // buffer that would crash the caller. + static std::atomic logged{false}; + if (!logged.exchange(true)) { + RTC_LOG(LS_WARNING) << "EncodedVideoFrameBuffer::ToI420 cannot decode an " + "encoded access unit; returning black frames"; + } + webrtc::scoped_refptr buffer = + webrtc::I420Buffer::Create(width_, height_); + webrtc::I420Buffer::SetBlack(buffer.get()); + return buffer; } webrtc::scoped_refptr @@ -59,8 +73,18 @@ EncodedVideoFrameBuffer::CropAndScale(int /* offset_x */, int /* crop_height */, int /* scaled_width */, int /* scaled_height */) { - RTC_LOG(LS_ERROR) << "EncodedVideoFrameBuffer::CropAndScale is unsupported"; - return nullptr; + // Encoded payloads cannot be rescaled; returning the buffer unchanged + // keeps misbehaving callers alive (the capture path never scales encoded + // frames). + RTC_LOG(LS_WARNING) << "EncodedVideoFrameBuffer::CropAndScale is " + "unsupported; returning the frame unscaled"; + return webrtc::scoped_refptr(this); +} + +void EncodedVideoFrameBuffer::request_keyframe() const { + if (keyframe_request_flag_) { + keyframe_request_flag_->store(true, std::memory_order_relaxed); + } } EncodedVideoFrameBuffer* EncodedVideoFrameBuffer::FromNative( diff --git a/webrtc-sys/src/jetson/av1_encoder_impl.cpp b/webrtc-sys/src/jetson/av1_encoder_impl.cpp index 959ac3185..679328d64 100644 --- a/webrtc-sys/src/jetson/av1_encoder_impl.cpp +++ b/webrtc-sys/src/jetson/av1_encoder_impl.cpp @@ -283,14 +283,12 @@ int32_t JetsonAV1EncoderImpl::Encode( return WEBRTC_VIDEO_CODEC_NO_OUTPUT; } - livekit::av1::StripIvfFrameHeaderIfPresent(&packet); + livekit::av1::NormalizeForRtp(&packet); if (packet.empty()) { - RTC_LOG(LS_ERROR) - << "Jetson MMAPI AV1 packet contained only IVF framing; skipping."; + RTC_LOG(LS_ERROR) << "Jetson MMAPI AV1 packet contained no transferable " + "OBUs after RTP normalization; skipping."; 
return WEBRTC_VIDEO_CODEC_NO_OUTPUT; } - livekit::av1::ConvertAnnexBToLowOverheadIfPresent(&packet); - livekit::av1::StripNonTransferObusIfPresent(&packet); std::vector sequence_header; if (livekit::av1::ExtractSequenceHeaderObu(packet.data(), packet.size(), diff --git a/webrtc-sys/src/jetson/jetson_av1_bitstream.cpp b/webrtc-sys/src/jetson/jetson_av1_bitstream.cpp index 0c24672f5..2792ac36a 100644 --- a/webrtc-sys/src/jetson/jetson_av1_bitstream.cpp +++ b/webrtc-sys/src/jetson/jetson_av1_bitstream.cpp @@ -345,6 +345,12 @@ void StripNonTransferObusIfPresent(std::vector* packet) { StripNonTransferObus(packet); } +void NormalizeForRtp(std::vector* packet) { + StripIvfFrameHeaderIfPresent(packet); + ConvertAnnexBToLowOverheadIfPresent(packet); + StripNonTransferObusIfPresent(packet); +} + bool IsWebRtcParseable(const uint8_t* data, size_t len) { if (!data || len == 0) { return false; diff --git a/webrtc-sys/src/jetson/jetson_av1_bitstream.h b/webrtc-sys/src/jetson/jetson_av1_bitstream.h index f8d9b46fe..2a7605661 100644 --- a/webrtc-sys/src/jetson/jetson_av1_bitstream.h +++ b/webrtc-sys/src/jetson/jetson_av1_bitstream.h @@ -58,6 +58,12 @@ void ConvertAnnexBToLowOverheadIfPresent(std::vector* packet); /// Strip OBUs that should not be transferred in WebRTC RTP payloads when present. void StripNonTransferObusIfPresent(std::vector* packet); +/// Normalizes an AV1 temporal unit for WebRTC RTP packetization: strips IVF +/// framing, converts Annex-B units to low-overhead OBUs, and strips +/// non-transfer OBUs. Shared by every encoder that emits AV1 into the RTP +/// pipeline so the steps cannot drift apart. +void NormalizeForRtp(std::vector* packet); + /// Basic validation that WebRTC's AV1 packetizer can parse the bitstream. 
bool IsWebRtcParseable(const uint8_t* data, size_t len); diff --git a/webrtc-sys/src/passthrough_video_encoder.cpp b/webrtc-sys/src/passthrough_video_encoder.cpp index 1a1014bbe..f10a8aaf4 100644 --- a/webrtc-sys/src/passthrough_video_encoder.cpp +++ b/webrtc-sys/src/passthrough_video_encoder.cpp @@ -16,6 +16,7 @@ #include "livekit/passthrough_video_encoder.h" +#include #include #include #include @@ -110,15 +111,13 @@ bool IsKeyframe(livekit::EncodedFrameType frame_type) { return frame_type == livekit::EncodedFrameType::kKey; } -std::vector NormalizedPayloadForEncode( - const livekit::EncodedVideoFrameBuffer& encoded_buffer) { - std::vector payload = encoded_buffer.payload(); - if (encoded_buffer.codec() == livekit::EncodedVideoCodec::kAV1) { - livekit::av1::StripIvfFrameHeaderIfPresent(&payload); - livekit::av1::ConvertAnnexBToLowOverheadIfPresent(&payload); - livekit::av1::StripNonTransferObusIfPresent(&payload); - } - return payload; +// SDP profile parameters constrain real encoders, not a pass-through: the +// forwarded bytes are whatever the upstream encoder produced. Match formats +// by codec only (H265/HEVC are aliases via CodecTypeFromFormat). 
+bool IsSameCodecType(const SdpVideoFormat& a, const SdpVideoFormat& b) { + VideoCodecType type_a = CodecTypeFromFormat(a); + return type_a != webrtc::kVideoCodecGeneric && + type_a == CodecTypeFromFormat(b); } void FillSingleLayerCodecSpecific( @@ -213,7 +212,7 @@ class PassthroughVideoEncoder final : public VideoEncoder { } int32_t Encode(const VideoFrame& frame, - const std::vector* /* frame_types */) override { + const std::vector* frame_types) override { if (!encoded_image_callback_) { RTC_LOG(LS_ERROR) << "PassthroughVideoEncoder callback is not registered"; @@ -236,13 +235,36 @@ class PassthroughVideoEncoder final : public VideoEncoder { return WEBRTC_VIDEO_CODEC_ERR_PARAMETER; } - std::vector payload = NormalizedPayloadForEncode(*encoded_buffer); - if (payload.empty()) { + const bool is_keyframe = IsKeyframe(encoded_buffer->frame_type()); + + // A pass-through cannot synthesize the keyframe the RTP layer wants + // (PLI/FIR, late subscriber, reconfiguration); forward the request to + // the capture source so the upstream encoder can produce an IDR. + const bool keyframe_requested = + frame_types != nullptr && + std::any_of(frame_types->begin(), frame_types->end(), + [](VideoFrameType type) { + return type == VideoFrameType::kVideoFrameKey; + }); + if (keyframe_requested && !is_keyframe) { + encoded_buffer->request_keyframe(); + } + + if (encoded_buffer->payload_size() == 0) { RTC_LOG(LS_ERROR) << "PassthroughVideoEncoder received an empty frame"; return WEBRTC_VIDEO_CODEC_ERR_PARAMETER; } - const bool is_keyframe = IsKeyframe(encoded_buffer->frame_type()); + + // Non-AV1 payloads are forwarded without copying: the buffer already + // owns a webrtc::EncodedImageBuffer. AV1 needs RTP normalization, which + // may rewrite the bytes, so it works on a copy. 
+ webrtc::scoped_refptr encoded_data; if (IsAv1Codec(codec_type_)) { + std::vector payload( + encoded_buffer->payload_data(), + encoded_buffer->payload_data() + encoded_buffer->payload_size()); + livekit::av1::NormalizeForRtp(&payload); + std::vector sequence_header; if (livekit::av1::ExtractSequenceHeaderObu( payload.data(), payload.size(), &sequence_header)) { @@ -251,12 +273,16 @@ class PassthroughVideoEncoder final : public VideoEncoder { livekit::av1::EnsureSequenceHeaderOnKeyframe( &payload, cached_sequence_header_obu_); } - if (!livekit::av1::IsWebRtcParseable(payload.data(), payload.size())) { + if (payload.empty() || + !livekit::av1::IsWebRtcParseable(payload.data(), payload.size())) { RTC_LOG(LS_ERROR) << "PassthroughVideoEncoder received an AV1 frame that WebRTC " "cannot packetize"; return WEBRTC_VIDEO_CODEC_ERR_PARAMETER; } + encoded_data = EncodedImageBuffer::Create(payload.data(), payload.size()); + } else { + encoded_data = encoded_buffer->encoded_data(); } EncodedImage encoded_image; @@ -271,9 +297,9 @@ class PassthroughVideoEncoder final : public VideoEncoder { encoded_image.timing_.flags = webrtc::VideoSendTiming::kInvalid; encoded_image._frameType = FrameTypeFromBuffer(encoded_buffer->frame_type()); encoded_image.SetColorSpace(frame.color_space()); - encoded_image.SetEncodedData( - EncodedImageBuffer::Create(payload.data(), payload.size())); - encoded_image.set_size(payload.size()); + const size_t encoded_size = encoded_data->size(); + encoded_image.SetEncodedData(std::move(encoded_data)); + encoded_image.set_size(encoded_size); encoded_image.qp_ = -1; CodecSpecificInfo codec_info; @@ -350,7 +376,7 @@ PassthroughVideoEncoderFactory::QueryCodecSupport( const SdpVideoFormat& format, std::optional scalability_mode) const { for (const auto& supported_format : supported_formats_) { - if (format.IsSameCodec(supported_format)) { + if (IsSameCodecType(format, supported_format)) { if (format.name == "AV1" && scalability_mode.has_value() && 
*scalability_mode != "L1T1") { return {.is_supported = false, .is_power_efficient = false}; @@ -364,9 +390,12 @@ PassthroughVideoEncoderFactory::QueryCodecSupport( std::unique_ptr PassthroughVideoEncoderFactory::Create( const Environment& env, const SdpVideoFormat& format) { + // Match by codec, not by exact profile: rejecting e.g. a High-profile + // H264 negotiation here would hand the session to a real encoder that + // cannot consume pre-encoded frames. for (const auto& supported_format : supported_formats_) { - if (format.IsSameCodec(supported_format)) { - return std::make_unique(env, supported_format); + if (IsSameCodecType(format, supported_format)) { + return std::make_unique(env, format); } } return nullptr; diff --git a/webrtc-sys/src/rtp_sender.cpp b/webrtc-sys/src/rtp_sender.cpp index 1351c83dc..e97bed2d6 100644 --- a/webrtc-sys/src/rtp_sender.cpp +++ b/webrtc-sys/src/rtp_sender.cpp @@ -110,7 +110,16 @@ class FixedVideoEncoderSelector final } std::optional OnEncoderBroken() override { - return std::nullopt; + // The preferred backend is a hard requirement for this sender (e.g. + // pre-encoded pass-through). When the active encoder breaks — including + // when the initial untagged encoder could not even be created — request + // the preferred backend explicitly instead of giving up, so the sender + // recovers onto the right encoder. 
+ if (!current_encoder_) { + return std::nullopt; + } + requested_ = true; + return WithBackend(*current_encoder_, backend_); } private: diff --git a/webrtc-sys/src/video_encoder_factory.cpp b/webrtc-sys/src/video_encoder_factory.cpp index 8ec524a15..d8b7fd454 100644 --- a/webrtc-sys/src/video_encoder_factory.cpp +++ b/webrtc-sys/src/video_encoder_factory.cpp @@ -16,6 +16,8 @@ #include "livekit/video_encoder_factory.h" +#include +#include #include #include #include @@ -174,6 +176,37 @@ bool IsAutomaticFallbackBackend(VideoEncoderBackend backend) { return backend != VideoEncoderBackend::PreEncoded; } +bool EqualsIgnoreAsciiCase(std::string_view a, std::string_view b) { + return a.size() == b.size() && + std::equal(a.begin(), a.end(), b.begin(), [](char x, char y) { + return std::tolower(static_cast(x)) == + std::tolower(static_cast(y)); + }); +} + +bool IsSameCodecName(std::string_view a, std::string_view b) { + if (EqualsIgnoreAsciiCase(a, b)) { + return true; + } + auto is_h265 = [](std::string_view name) { + return EqualsIgnoreAsciiCase(name, "H265") || + EqualsIgnoreAsciiCase(name, "HEVC"); + }; + return is_h265(a) && is_h265(b); +} + +// The pass-through backend forwards pre-encoded bytes, so SDP profile +// parameters do not constrain it: match it by codec name only. Real +// encoder backends keep exact profile matching. 
+bool FormatSupportedByBackendFactory(VideoEncoderBackend backend, + const webrtc::SdpVideoFormat& supported, + const webrtc::SdpVideoFormat& requested) { + if (backend == VideoEncoderBackend::PreEncoded) { + return IsSameCodecName(supported.name, requested.name); + } + return supported.IsSameCodec(requested); +} + void AddBackendFactory( std::vector& factories, VideoEncoderBackend backend, @@ -347,10 +380,35 @@ VideoEncoderFactory::InternalFactory::GetSupportedFormats() const { std::vector formats = Factory().GetSupportedFormats(); for (const auto& backend_factory : factories_) { + if (backend_factory.backend == VideoEncoderBackend::PreEncoded) { + continue; + } auto supported_formats = backend_factory.factory->GetSupportedFormats(); formats.insert(formats.end(), supported_formats.begin(), supported_formats.end()); } + + // The pass-through factory would otherwise advertise codecs no real + // encoder implements (e.g. H265 on desktops); a normal session + // negotiating such a codec would end up with a sender that cannot create + // an encoder. Only advertise pass-through formats for codecs some real + // encoder already supports. 
+ const size_t real_format_count = formats.size(); + for (const auto& backend_factory : factories_) { + if (backend_factory.backend != VideoEncoderBackend::PreEncoded) { + continue; + } + for (const auto& format : backend_factory.factory->GetSupportedFormats()) { + const bool codec_available = std::any_of( + formats.begin(), formats.begin() + real_format_count, + [&](const webrtc::SdpVideoFormat& existing) { + return IsSameCodecName(existing.name, format.name); + }); + if (codec_available) { + formats.push_back(format); + } + } + } return formats; } @@ -403,7 +461,9 @@ VideoEncoderFactory::InternalFactory::QueryCodecSupport( for (const auto& supported_format : backend_factory.factory->GetSupportedFormats()) { - if (stripped_format.IsSameCodec(supported_format)) { + if (FormatSupportedByBackendFactory(backend_factory.backend, + supported_format, + stripped_format)) { return webrtc::VideoEncoderFactory::CodecSupport{ .is_supported = true, .is_power_efficient = true, @@ -470,7 +530,9 @@ VideoEncoderFactory::InternalFactory::Create( for (const auto& supported_format : backend_factory.factory->GetSupportedFormats()) { - if (supported_format.IsSameCodec(stripped_format)) { + if (FormatSupportedByBackendFactory(backend_factory.backend, + supported_format, + stripped_format)) { auto encoder = backend_factory.factory->Create(env, stripped_format); if (encoder) { return encoder; @@ -479,6 +541,17 @@ VideoEncoderFactory::InternalFactory::Create( } } + // A real encoder cannot consume the pre-encoded native frame buffers + // this session produces, so falling back would yield a silently broken + // sender. Fail loudly instead. 
+ if (*requested_backend == VideoEncoderBackend::PreEncoded) { + RTC_LOG(LS_ERROR) + << "Pre-encoded pass-through encoder is unavailable for " + << stripped_format.name + << "; refusing to fall back to a real encoder."; + return nullptr; + } + requested_backend_unavailable = true; } diff --git a/webrtc-sys/src/video_track.cpp b/webrtc-sys/src/video_track.cpp index 622179c9b..8a08aee0c 100644 --- a/webrtc-sys/src/video_track.cpp +++ b/webrtc-sys/src/video_track.cpp @@ -192,6 +192,25 @@ bool VideoTrackSource::InternalSource::on_captured_frame( static_cast(buffer->height())}; } + // Pre-encoded access units bypass the adapter entirely: frame-rate and + // resolution adaptation operate on raw frames, and dropping or scaling an + // encoded delta frame would corrupt the bitstream for every receiver. + if (livekit::EncodedVideoFrameBuffer::FromNative(buffer.get())) { + if (packet_trailer_handler_) { + packet_trailer_handler_->emit_publish_timing( + VideoPublishTimingStage::EncoderUpload, + frame_metadata.has_packet_trailer ? frame_metadata.user_timestamp + : 0, + frame_metadata.has_packet_trailer ? 
frame_metadata.frame_id : 0); + } + OnFrame(webrtc::VideoFrame::Builder() + .set_video_frame_buffer(buffer) + .set_rotation(frame.rotation()) + .set_timestamp_us(aligned_timestamp_us) + .build()); + return true; + } + int adapted_width, adapted_height, crop_width, crop_height, crop_x, crop_y; if (!AdaptFrame(buffer->width(), buffer->height(), aligned_timestamp_us, &adapted_width, &adapted_height, &crop_width, &crop_height, @@ -199,8 +218,7 @@ bool VideoTrackSource::InternalSource::on_captured_frame( return false; } - if ((adapted_width != frame.width() || adapted_height != frame.height()) && - buffer->type() != webrtc::VideoFrameBuffer::Type::kNative) { + if (adapted_width != frame.width() || adapted_height != frame.height()) { buffer = buffer->CropAndScale(crop_x, crop_y, crop_width, crop_height, adapted_width, adapted_height); } @@ -279,12 +297,16 @@ bool VideoTrackSource::capture_encoded_frame( int width, int height, const EncodedVideoFrameData& encoded_frame, + rust::Slice payload, const FrameMetadata& frame_metadata) const { + // The single unavoidable copy on this path: the Rust payload only lives + // for the duration of this call, while the EncodedImageBuffer is shared + // (uncopied) with the pass-through encoder downstream. 
auto buffer = webrtc::make_ref_counted( width, height, ToNativeEncodedCodec(encoded_frame.codec), ToNativeEncodedFrameType(encoded_frame.frame_type), - std::vector(encoded_frame.payload.begin(), - encoded_frame.payload.end())); + webrtc::EncodedImageBuffer::Create(payload.data(), payload.size()), + source_->keyframe_request_flag()); auto frame = webrtc::VideoFrame::Builder() .set_video_frame_buffer(std::move(buffer)) @@ -295,6 +317,11 @@ bool VideoTrackSource::capture_encoded_frame( return source_->on_captured_frame(frame, frame_metadata); } +bool VideoTrackSource::take_keyframe_request() const { + return source_->keyframe_request_flag()->exchange(false, + std::memory_order_relaxed); +} + void VideoTrackSource::set_packet_trailer_handler( std::shared_ptr handler) const { source_->set_packet_trailer_handler(std::move(handler)); diff --git a/webrtc-sys/src/video_track.rs b/webrtc-sys/src/video_track.rs index 8e12786b2..f7c8b354c 100644 --- a/webrtc-sys/src/video_track.rs +++ b/webrtc-sys/src/video_track.rs @@ -71,7 +71,6 @@ pub mod ffi { pub struct EncodedVideoFrameData { pub codec: EncodedVideoCodec, pub frame_type: EncodedFrameType, - pub payload: Vec, pub timestamp_us: i64, } @@ -124,8 +123,10 @@ pub mod ffi { width: i32, height: i32, frame: &EncodedVideoFrameData, + payload: &[u8], frame_metadata: &FrameMetadata, ) -> bool; + fn take_keyframe_request(self: &VideoTrackSource) -> bool; fn set_packet_trailer_handler( self: &VideoTrackSource, handler: SharedPtr, From 7708627d10c7d8d2ac94e24a21926ce37bee2088 Mon Sep 17 00:00:00 2001 From: David Chen Date: Thu, 2 Jul 2026 10:14:02 -0700 Subject: [PATCH 24/24] add missing files --- livekit-capture/src/sources/io.rs | 32 +++++++++++++++ livekit-capture/src/time.rs | 65 +++++++++++++++++++++++++++++++ 2 files changed, 97 insertions(+) create mode 100644 livekit-capture/src/sources/io.rs create mode 100644 livekit-capture/src/time.rs diff --git a/livekit-capture/src/sources/io.rs b/livekit-capture/src/sources/io.rs new 
file mode 100644 index 000000000..e56473c22 --- /dev/null +++ b/livekit-capture/src/sources/io.rs @@ -0,0 +1,32 @@ +// Copyright 2026 LiveKit, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Shared blocking-I/O helpers for the encoded ingest sources. + +use std::io::{self, Read}; + +/// Reads exactly `buf.len()` bytes, returning `Ok(false)` when the stream +/// ends cleanly before the first byte and `UnexpectedEof` when it ends +/// mid-buffer. +pub(crate) fn read_exact_or_clean_eof(reader: &mut impl Read, buf: &mut [u8]) -> io::Result { + let mut offset = 0; + while offset < buf.len() { + match reader.read(&mut buf[offset..])? { + 0 if offset == 0 => return Ok(false), + 0 => return Err(io::Error::from(io::ErrorKind::UnexpectedEof)), + read => offset += read, + } + } + Ok(true) +} diff --git a/livekit-capture/src/time.rs b/livekit-capture/src/time.rs new file mode 100644 index 000000000..e9733149f --- /dev/null +++ b/livekit-capture/src/time.rs @@ -0,0 +1,65 @@ +// Copyright 2026 LiveKit, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +//! Shared capture-timestamp helpers used by the capture backends. + +use std::time::{Duration, SystemTime, UNIX_EPOCH}; + +/// Maximum age a backend-reported capture timestamp may have, relative to the +/// wall-clock read time, before it is considered stale and discarded. +pub(crate) const MAX_CAPTURE_TIMESTAMP_AGE_US: u64 = 5_000_000; + +/// Returns the current UNIX wall-clock time in microseconds. +pub(crate) fn unix_time_us_now() -> Option { + let elapsed = SystemTime::now().duration_since(UNIX_EPOCH).ok()?; + u64::try_from(elapsed.as_micros()).ok() +} + +/// Converts a duration to whole microseconds, saturating at `i64::MAX`. +pub(crate) fn elapsed_us(duration: Duration) -> i64 { + i64::try_from(duration.as_micros()).unwrap_or(i64::MAX) +} + +/// Validates a backend-reported capture timestamp against the wall-clock read +/// time: zero, future, and stale (older than +/// [`MAX_CAPTURE_TIMESTAMP_AGE_US`]) timestamps are rejected. +pub(crate) fn validate_capture_timestamp_us( + capture_timestamp_us: u64, + read_wall_time_us: u64, +) -> Option { + if capture_timestamp_us == 0 || capture_timestamp_us > read_wall_time_us { + return None; + } + if read_wall_time_us - capture_timestamp_us > MAX_CAPTURE_TIMESTAMP_AGE_US { + return None; + } + Some(capture_timestamp_us) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn validate_rejects_zero_future_and_stale_timestamps() { + let now = 10_000_000; + assert_eq!(validate_capture_timestamp_us(0, now), None); + assert_eq!(validate_capture_timestamp_us(now + 1, now), None); + assert_eq!( + validate_capture_timestamp_us(now - MAX_CAPTURE_TIMESTAMP_AGE_US - 1, now), + None + ); + assert_eq!(validate_capture_timestamp_us(now - 1, now), Some(now - 1)); + } +}