diff --git a/.changeset/livekit-capture-preencoded.md b/.changeset/livekit-capture-preencoded.md new file mode 100644 index 000000000..53738f0a6 --- /dev/null +++ b/.changeset/livekit-capture-preencoded.md @@ -0,0 +1,8 @@ +--- +"livekit-capture": minor +"livekit": patch +"libwebrtc": patch +"webrtc-sys": patch +--- + +Add a `livekit-capture` crate with codec-neutral capture types, H264/H265/VP8/VP9/AV1 passthrough support, common encoded ingress helpers, TCP byte-stream encoded ingress, RTSP-over-TCP encoded ingress, GStreamer appsink encoded ingress, macOS AVFoundation decoded-frame capture, Linux V4L capture, and Jetson libargus capture hooks. The capture crate reports capture-origin timing such as optional sensor timestamps, while packet-trailer frame metadata remains a publishing concern. The `local_video` examples now open platform camera capture through `livekit-capture`, and a `preencode_publish` example demonstrates publishing H264/H265 Annex-B TCP or RTSP streams as pre-encoded video tracks. 
diff --git a/Cargo.lock b/Cargo.lock index 19abebd05..99d81c806 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -62,7 +62,7 @@ version = "0.8.12" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5a15f179cd60c4584b8a8c596927aadc462e27f2ca70c04e0071964a73ba7a75" dependencies = [ - "cfg-if 1.0.4", + "cfg-if", "getrandom 0.3.4", "once_cell", "serde", @@ -111,7 +111,7 @@ checksum = "ed7572b7ba83a31e20d1b48970ee402d2e3e0537dcfe0a3ff4d6eb7508617d43" dependencies = [ "alsa-sys", "bitflags 2.13.0", - "cfg-if 1.0.4", + "cfg-if", "libc", ] @@ -435,7 +435,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "456b8a8feb6f42d237746d4b3e9a178494627745c3c56c6ea55d92ba50d026fc" dependencies = [ "autocfg", - "cfg-if 1.0.4", + "cfg-if", "concurrent-queue", "futures-io", "futures-lite 2.6.1", @@ -536,6 +536,12 @@ version = "1.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1505bd5d3d116872e7271a6d4e16d81d0c8570876c8de68093a09ac269d8aac0" +[[package]] +name = "atomic_refcell" +version = "0.1.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "21e4227379beff4205943696e6c3e0cd809bacdf3f0edd6e3dd153e2269571a4" + [[package]] name = "autocfg" version = "1.5.0" @@ -555,7 +561,7 @@ dependencies = [ "log", "num-rational", "num-traits", - "pastey", + "pastey 0.1.1", "rayon", "thiserror 2.0.18", "v_frame", @@ -637,7 +643,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bb531853791a215d7c62a30daf0dde835f381ab5de4589cfe7c649d2cbe92bd6" dependencies = [ "addr2line", - "cfg-if 1.0.4", + "cfg-if", "libc", "miniz_oxide", "object", @@ -1048,12 +1054,6 @@ dependencies = [ "target-lexicon", ] -[[package]] -name = "cfg-if" -version = "0.1.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4785bdd1c96b2a846b2bd7cc02e86b6b3dbf14e7e53446c4f54c92a361040822" - [[package]] name = "cfg-if" version = "1.0.4" @@ -1140,34 +1140,6 @@ 
dependencies = [ "error-code", ] -[[package]] -name = "cocoa" -version = "0.20.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0c49e86fc36d5704151f5996b7b3795385f50ce09e3be0f47a0cfde869681cf8" -dependencies = [ - "bitflags 1.3.2", - "block", - "core-foundation 0.7.0", - "core-graphics 0.19.2", - "foreign-types 0.3.2", - "libc", - "objc", -] - -[[package]] -name = "cocoa-foundation" -version = "0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "81411967c50ee9a1fc11365f8c585f863a22a9697c89239c452292c40ba79b0d" -dependencies = [ - "bitflags 2.13.0", - "block", - "core-foundation 0.10.1", - "core-graphics-types 0.2.0", - "objc", -] - [[package]] name = "codespan-reporting" version = "0.13.1" @@ -1274,23 +1246,13 @@ dependencies = [ "unicode-segmentation", ] -[[package]] -name = "core-foundation" -version = "0.7.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "57d24c7a13c43e870e37c1556b74555437870a04514f7685f5b354e090567171" -dependencies = [ - "core-foundation-sys 0.7.0", - "libc", -] - [[package]] name = "core-foundation" version = "0.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "91e195e091a93c46f7102ec7818a2aa394e1e1771c3ab4825963fa03e45afb8f" dependencies = [ - "core-foundation-sys 0.8.7", + "core-foundation-sys", "libc", ] @@ -1300,34 +1262,16 @@ version = "0.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b2a6cd9ae233e7f62ba4e9353e81a88df7fc8a5987b8d445b4d90c879bd156f6" dependencies = [ - "core-foundation-sys 0.8.7", + "core-foundation-sys", "libc", ] -[[package]] -name = "core-foundation-sys" -version = "0.7.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b3a71ab494c0b5b860bdc8407ae08978052417070c2ced38573a9157ad75b8ac" - [[package]] name = "core-foundation-sys" version = "0.8.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b" -[[package]] -name = "core-graphics" -version = "0.19.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b3889374e6ea6ab25dba90bb5d96202f61108058361f6dc72e8b03e6f8bbe923" -dependencies = [ - "bitflags 1.3.2", - "core-foundation 0.7.0", - "foreign-types 0.3.2", - "libc", -] - [[package]] name = "core-graphics" version = "0.23.2" @@ -1363,31 +1307,6 @@ dependencies = [ "libc", ] -[[package]] -name = "core-media-sys" -version = "0.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "273bf3fc5bf51fd06a7766a84788c1540b6527130a0bce39e00567d6ab9f31f1" -dependencies = [ - "cfg-if 0.1.10", - "core-foundation-sys 0.7.0", - "libc", -] - -[[package]] -name = "core-video-sys" -version = "0.1.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "34ecad23610ad9757664d644e369246edde1803fcb43ed72876565098a5d3828" -dependencies = [ - "cfg-if 0.1.10", - "core-foundation-sys 0.7.0", - "core-graphics 0.19.2", - "libc", - "metal 0.18.0", - "objc", -] - [[package]] name = "core2" version = "0.4.0" @@ -1404,7 +1323,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "321077172d79c662f64f5071a03120748d5bb652f5231570141be24cfcd2bace" dependencies = [ "bitflags 1.3.2", - "core-foundation-sys 0.8.7", + "core-foundation-sys", "coreaudio-sys", ] @@ -1424,7 +1343,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "873dab07c8f743075e57f524c583985fbaf745602acbe916a01539364369a779" dependencies = [ "alsa", - "core-foundation-sys 0.8.7", + "core-foundation-sys", "coreaudio-rs", "dasp_sample", "jni 0.21.1", @@ -1455,7 +1374,7 @@ version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9481c1c90cbf2ac953f07c8d4a58aa3945c425b7185c9154d67a65e4230da511" dependencies = [ - "cfg-if 1.0.4", + "cfg-if", ] [[package]] @@ -1718,7 +1637,7 @@ version = "6.2.1" 
source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e6361d5c062261c78a176addb82d4c821ae42bed6089de0e12603cd25de2059c" dependencies = [ - "cfg-if 1.0.4", + "cfg-if", "crossbeam-utils", "hashbrown 0.14.5", "lock_api", @@ -1818,6 +1737,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1e0e367e4e7da84520dedcac1901e4da967309406d1e51017ae1abfb97adbd38" dependencies = [ "bitflags 2.13.0", + "block2 0.6.2", "objc2 0.6.4", ] @@ -2033,7 +1953,7 @@ version = "0.8.35" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "75030f3c4f45dafd7586dd6780965a8c7e8e285a5ecb86713e63a79c5b2766f3" dependencies = [ - "cfg-if 1.0.4", + "cfg-if", ] [[package]] @@ -2285,7 +2205,7 @@ version = "0.2.27" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f98844151eee8917efc50bd9e8318cb963ae8b297431495d3f758616ea5c57db" dependencies = [ - "cfg-if 1.0.4", + "cfg-if", "libc", "libredox", ] @@ -2318,18 +2238,6 @@ dependencies = [ "miniz_oxide", ] -[[package]] -name = "flume" -version = "0.11.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da0e4dd2a88388a1f4ccc7c9ce104604dab68d9f408dc34cd45823d5a9069095" -dependencies = [ - "futures-core", - "futures-sink", - "nanorand", - "spin", -] - [[package]] name = "fnv" version = "1.0.7" @@ -2591,7 +2499,7 @@ version = "0.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ff2abc00be7fca6ebc474524697ae276ad847ad0a6b3faa4bcb027e9a4614ad0" dependencies = [ - "cfg-if 1.0.4", + "cfg-if", "js-sys", "libc", "wasi", @@ -2604,7 +2512,7 @@ version = "0.3.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "899def5c37c4fd7b2664648c28120ecec138e4d395b459e5ca34f9cce2dd77fd" dependencies = [ - "cfg-if 1.0.4", + "cfg-if", "js-sys", "libc", "r-efi 5.3.0", @@ -2618,7 +2526,7 @@ version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"0de51e6874e94e7bf76d726fc5d13ba782deca734ff60d5bb2fb2607c7406555" dependencies = [ - "cfg-if 1.0.4", + "cfg-if", "libc", "r-efi 6.0.0", "wasip2", @@ -2647,8 +2555,21 @@ version = "0.21.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0071fe88dba8e40086c8ff9bbb62622999f49628344b1d1bf490a48a29d80f22" dependencies = [ - "glib-sys", - "gobject-sys", + "glib-sys 0.21.5", + "gobject-sys 0.21.5", + "libc", + "system-deps", + "windows-sys 0.61.2", +] + +[[package]] +name = "gio-sys" +version = "0.22.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "64729ba2772c080448f9f966dba8f4456beeb100d8c28a865ef8a0f2ef4987e1" +dependencies = [ + "glib-sys 0.22.6", + "gobject-sys 0.22.6", "libc", "system-deps", "windows-sys 0.61.2", @@ -2677,10 +2598,31 @@ dependencies = [ "futures-executor", "futures-task", "futures-util", - "gio-sys", - "glib-macros", - "glib-sys", - "gobject-sys", + "gio-sys 0.21.5", + "glib-macros 0.21.5", + "glib-sys 0.21.5", + "gobject-sys 0.21.5", + "libc", + "memchr", + "smallvec", +] + +[[package]] +name = "glib" +version = "0.22.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c207e04e51605dcf7b2924c41591b3a10e1438eaac5bcf448fb91f325381104a" +dependencies = [ + "bitflags 2.13.0", + "futures-channel", + "futures-core", + "futures-executor", + "futures-task", + "futures-util", + "gio-sys 0.22.0", + "glib-macros 0.22.6", + "glib-sys 0.22.6", + "gobject-sys 0.22.6", "libc", "memchr", "smallvec", @@ -2699,6 +2641,18 @@ dependencies = [ "syn 2.0.117", ] +[[package]] +name = "glib-macros" +version = "0.22.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "506d23499707c7142898429757e8d9a3871d965239a2cb66dfa05052be6d6f19" +dependencies = [ + "heck 0.5.0", + "proc-macro2", + "quote", + "syn 2.0.117", +] + [[package]] name = "glib-sys" version = "0.21.5" @@ -2709,6 +2663,16 @@ dependencies = [ "system-deps", ] +[[package]] +name = "glib-sys" 
+version = "0.22.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5f7fbac234ed5bc2a28359b7bde8e1b9cdf1441cc2d7f068e4824672d7db9445" +dependencies = [ + "libc", + "system-deps", +] + [[package]] name = "glifo" version = "0.1.1" @@ -2770,7 +2734,18 @@ version = "0.21.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2dca35da0d19a18f4575f3cb99fe1c9e029a2941af5662f326f738a21edaf294" dependencies = [ - "glib-sys", + "glib-sys 0.21.5", + "libc", + "system-deps", +] + +[[package]] +name = "gobject-sys" +version = "0.22.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22a861859b887a79cf461359c192c97a57d8fb0229dd291232e57aa11f6fa72c" +dependencies = [ + "glib-sys 0.22.6", "libc", "system-deps", ] @@ -2820,6 +2795,99 @@ dependencies = [ "bitflags 2.13.0", ] +[[package]] +name = "gstreamer" +version = "0.25.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "28ca0c594cac4e86f5444aaa767c7bb810340c0710667a6467d3ead248e35e84" +dependencies = [ + "cfg-if", + "futures-channel", + "futures-core", + "futures-util", + "glib 0.22.7", + "gstreamer-sys", + "itertools 0.14.0", + "kstring", + "libc", + "muldiv", + "num-integer", + "num-rational", + "option-operations", + "pastey 0.2.3", + "pin-project-lite", + "smallvec", + "thiserror 2.0.18", +] + +[[package]] +name = "gstreamer-app" +version = "0.25.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "97f8ae9238c2352398dcc084de28df3f7099af216ac6c160b52318d23f25c010" +dependencies = [ + "futures-core", + "futures-sink", + "glib 0.22.7", + "gstreamer", + "gstreamer-app-sys", + "gstreamer-base", + "libc", +] + +[[package]] +name = "gstreamer-app-sys" +version = "0.25.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a74a8211e5d7df2f45b612c284ddf56b92bdf4e879e8ed72e7c46dd0842e158" +dependencies = [ + "glib-sys 0.22.6", + "gstreamer-base-sys", + 
"gstreamer-sys", + "libc", + "system-deps", +] + +[[package]] +name = "gstreamer-base" +version = "0.25.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c279df2918be97fb9570e589a32ade33598f643b0c4f0c92c17f06be6940574e" +dependencies = [ + "atomic_refcell", + "cfg-if", + "glib 0.22.7", + "gstreamer", + "gstreamer-base-sys", + "libc", +] + +[[package]] +name = "gstreamer-base-sys" +version = "0.25.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6569606feeb89cfcf95a6476a64a0f0aec83fadcef0e91c24e576f7851ceac3a" +dependencies = [ + "glib-sys 0.22.6", + "gobject-sys 0.22.6", + "gstreamer-sys", + "libc", + "system-deps", +] + +[[package]] +name = "gstreamer-sys" +version = "0.25.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "533fa8d28fc830eafccbcfcfddb390563ea5d3a351af2c3aab99e197e5f5b1ba" +dependencies = [ + "cfg-if", + "glib-sys 0.22.6", + "gobject-sys 0.22.6", + "libc", + "system-deps", +] + [[package]] name = "guillotiere" version = "0.7.0" @@ -2854,7 +2922,7 @@ version = "2.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6ea2d84b969582b4b1864a92dc5d27cd2b77b622a8d79306834f1be5ba20d84b" dependencies = [ - "cfg-if 1.0.4", + "cfg-if", "crunchy", "num-traits", "zerocopy", @@ -3171,7 +3239,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e31bc9ad994ba00e440a8aa5c9ef0ec67d5cb5e5cb0cc7f8b744a35b389cc470" dependencies = [ "android_system_properties", - "core-foundation-sys 0.8.7", + "core-foundation-sys", "iana-time-zone-haiku", "js-sys", "log", @@ -3377,7 +3445,7 @@ version = "0.1.13" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e0242819d153cba4b4b05a5a8f2a7e9bbf97b6055b2a002b395c96b5ff3c0222" dependencies = [ - "cfg-if 1.0.4", + "cfg-if", ] [[package]] @@ -3524,7 +3592,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"1a87aa2bb7d2af34197c04845522473242e1aa17c12f4935d5856491a7fb8c97" dependencies = [ "cesu8", - "cfg-if 1.0.4", + "cfg-if", "combine", "jni-sys 0.3.1", "log", @@ -3539,7 +3607,7 @@ version = "0.22.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5efd9a482cf3a427f00d6b35f14332adc7902ce91efb778580e180ff90fa3498" dependencies = [ - "cfg-if 1.0.4", + "cfg-if", "combine", "jni-macros", "jni-sys 0.4.1", @@ -3719,7 +3787,7 @@ version = "0.8.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d7c4b02199fee7c5d21a5ae7d8cfa79a6ef5bb2fc834d6e9058e89c825efdc55" dependencies = [ - "cfg-if 1.0.4", + "cfg-if", "windows-link 0.2.1", ] @@ -3729,7 +3797,7 @@ version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "754ca22de805bb5744484a5b151a9e1a8e837d5dc232c2d7d8c2e3492edc8b60" dependencies = [ - "cfg-if 1.0.4", + "cfg-if", "windows-link 0.2.1", ] @@ -3757,7 +3825,7 @@ version = "0.3.38" dependencies = [ "cxx", "env_logger 0.11.10", - "glib", + "glib 0.21.5", "jni 0.21.1", "js-sys", "lazy_static", @@ -3948,6 +4016,30 @@ dependencies = [ "url", ] +[[package]] +name = "livekit-capture" +version = "0.1.0" +dependencies = [ + "base64 0.22.1", + "bytes", + "cc", + "dispatch2", + "gstreamer", + "gstreamer-app", + "image", + "libc", + "livekit", + "md-5", + "objc2 0.6.4", + "objc2-av-foundation", + "objc2-core-media", + "objc2-core-video", + "objc2-foundation 0.3.2", + "thiserror 2.0.18", + "v4l", + "yuv-sys", +] + [[package]] name = "livekit-datatrack" version = "0.1.9" @@ -4066,7 +4158,6 @@ version = "0.2.0" dependencies = [ "anyhow", "bytemuck", - "cc", "chrono", "clap", "eframe", @@ -4074,12 +4165,11 @@ dependencies = [ "egui-wgpu", "env_logger 0.11.10", "futures", - "image", "livekit", "livekit-api", + "livekit-capture", "log", - "metal 0.32.0", - "nokhwa", + "metal", "objc2 0.6.4", "objc2-metal 0.3.2", "parking_lot", @@ -4087,7 +4177,6 @@ dependencies = [ "tokio-stream", "wgpu", "winit", 
- "yuv-sys", ] [[package]] @@ -4178,10 +4267,20 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8ea1f30cedd69f0a2954655f7188c6a834246d2bcf1e315e2ac40c4b24dc9519" dependencies = [ - "cfg-if 1.0.4", + "cfg-if", "rayon", ] +[[package]] +name = "md-5" +version = "0.10.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d89e7ee0cfbedfc4da3340218492196241d89eefb6dab27de5df917a6d2e78cf" +dependencies = [ + "cfg-if", + "digest", +] + [[package]] name = "memchr" version = "2.8.0" @@ -4197,21 +4296,6 @@ dependencies = [ "libc", ] -[[package]] -name = "metal" -version = "0.18.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e198a0ee42bdbe9ef2c09d0b9426f3b2b47d90d93a4a9b0395c4cea605e92dc0" -dependencies = [ - "bitflags 1.3.2", - "block", - "cocoa", - "core-graphics 0.19.2", - "foreign-types 0.3.2", - "log", - "objc", -] - [[package]] name = "metal" version = "0.32.0" @@ -4285,6 +4369,12 @@ dependencies = [ "pxfm", ] +[[package]] +name = "muldiv" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "956787520e75e9bd233246045d19f42fb73242759cc57fba9611d940ae96d4b0" + [[package]] name = "multimap" version = "0.10.1" @@ -4300,7 +4390,7 @@ dependencies = [ "arrayvec", "bit-set 0.9.1", "bitflags 2.13.0", - "cfg-if 1.0.4", + "cfg-if", "cfg_aliases", "codespan-reporting", "half", @@ -4317,15 +4407,6 @@ dependencies = [ "unicode-ident", ] -[[package]] -name = "nanorand" -version = "0.7.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6a51313c5820b0b02bd422f4b44776fbf47961755c74ce64afc73bfad10226c3" -dependencies = [ - "getrandom 0.2.17", -] - [[package]] name = "napi" version = "3.8.3" @@ -4497,7 +4578,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "74523f3a35e05aba87a1d978330aef40f67b0304ac79c1c00b294c9830543db6" dependencies = [ "bitflags 2.13.0", - "cfg-if 
1.0.4", + "cfg-if", "cfg_aliases", "libc", ] @@ -4508,68 +4589,6 @@ version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2bf50223579dc7cdcfb3bfcacf7069ff68243f8c363f62ffa99cf000a6b9c451" -[[package]] -name = "nokhwa" -version = "0.10.10" -source = "git+https://github.com/l1npengtul/nokhwa?rev=4923ecab7cf26f9dba83867a15a9d8662d021296#4923ecab7cf26f9dba83867a15a9d8662d021296" -dependencies = [ - "flume", - "image", - "nokhwa-bindings-linux", - "nokhwa-bindings-macos", - "nokhwa-bindings-windows", - "nokhwa-core", - "parking_lot", - "paste", - "thiserror 2.0.18", -] - -[[package]] -name = "nokhwa-bindings-linux" -version = "0.1.3" -source = "git+https://github.com/l1npengtul/nokhwa?rev=4923ecab7cf26f9dba83867a15a9d8662d021296#4923ecab7cf26f9dba83867a15a9d8662d021296" -dependencies = [ - "libc", - "nokhwa-core", - "v4l", -] - -[[package]] -name = "nokhwa-bindings-macos" -version = "0.2.3" -source = "git+https://github.com/l1npengtul/nokhwa?rev=4923ecab7cf26f9dba83867a15a9d8662d021296#4923ecab7cf26f9dba83867a15a9d8662d021296" -dependencies = [ - "block", - "cocoa-foundation", - "core-foundation 0.10.1", - "core-media-sys", - "core-video-sys", - "flume", - "nokhwa-core", - "objc", - "once_cell", -] - -[[package]] -name = "nokhwa-bindings-windows" -version = "0.4.5" -source = "git+https://github.com/l1npengtul/nokhwa?rev=4923ecab7cf26f9dba83867a15a9d8662d021296#4923ecab7cf26f9dba83867a15a9d8662d021296" -dependencies = [ - "nokhwa-core", - "once_cell", - "windows 0.62.2", -] - -[[package]] -name = "nokhwa-core" -version = "0.1.8" -source = "git+https://github.com/l1npengtul/nokhwa?rev=4923ecab7cf26f9dba83867a15a9d8662d021296#4923ecab7cf26f9dba83867a15a9d8662d021296" -dependencies = [ - "bytes", - "image", - "thiserror 2.0.18", -] - [[package]] name = "nom" version = "7.1.3" @@ -4708,7 +4727,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"915b1b472bc21c53464d6c8461c9d3af805ba1ef837e1cac254428f4a77177b1" dependencies = [ "malloc_buf", - "objc_exception", ] [[package]] @@ -4764,6 +4782,19 @@ dependencies = [ "objc2-foundation 0.3.2", ] +[[package]] +name = "objc2-av-foundation" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "478ae33fcac9df0a18db8302387c666b8ef08a3e2d62b510ca4fc278a384b6c0" +dependencies = [ + "bitflags 2.13.0", + "dispatch2", + "objc2 0.6.4", + "objc2-core-media", + "objc2-foundation 0.3.2", +] + [[package]] name = "objc2-cloud-kit" version = "0.2.2" @@ -4799,6 +4830,28 @@ dependencies = [ "objc2-foundation 0.2.2", ] +[[package]] +name = "objc2-core-audio" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e1eebcea8b0dbff5f7c8504f3107c68fc061a3eb44932051c8cf8a68d969c3b2" +dependencies = [ + "dispatch2", + "objc2 0.6.4", + "objc2-core-audio-types", + "objc2-core-foundation", +] + +[[package]] +name = "objc2-core-audio-types" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a89f2ec274a0cf4a32642b2991e8b351a404d290da87bb6a9a9d8632490bd1c" +dependencies = [ + "bitflags 2.13.0", + "objc2 0.6.4", +] + [[package]] name = "objc2-core-data" version = "0.2.2" @@ -4889,6 +4942,21 @@ dependencies = [ "objc2-foundation 0.3.2", ] +[[package]] +name = "objc2-core-media" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "05ec576860167a15dd9fce7fbee7512beb4e31f532159d3482d1f9c6caedf31d" +dependencies = [ + "bitflags 2.13.0", + "dispatch2", + "objc2 0.6.4", + "objc2-core-audio", + "objc2-core-audio-types", + "objc2-core-foundation", + "objc2-core-video", +] + [[package]] name = "objc2-core-text" version = "0.3.2" @@ -4901,6 +4969,19 @@ dependencies = [ "objc2-core-graphics", ] +[[package]] +name = "objc2-core-video" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"d425caf1df73233f29fd8a5c3e5edbc30d2d4307870f802d18f00d83dc5141a6" +dependencies = [ + "bitflags 2.13.0", + "objc2 0.6.4", + "objc2-core-foundation", + "objc2-core-graphics", + "objc2-io-surface", +] + [[package]] name = "objc2-encode" version = "4.1.0" @@ -5094,15 +5175,6 @@ dependencies = [ "objc2-foundation 0.3.2", ] -[[package]] -name = "objc_exception" -version = "0.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ad970fb455818ad6cba4c122ad012fae53ae8b4795f86378bce65e4f6bab2ca4" -dependencies = [ - "cc", -] - [[package]] name = "object" version = "0.37.3" @@ -5154,7 +5226,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a45fa2aa886c42762255da344f0a0d313e254066c46aad76f300c3d3da62d967" dependencies = [ "bitflags 2.13.0", - "cfg-if 1.0.4", + "cfg-if", "foreign-types 0.3.2", "libc", "openssl-macros", @@ -5206,6 +5278,15 @@ dependencies = [ "vcpkg", ] +[[package]] +name = "option-operations" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aca39cf52b03268400c16eeb9b56382ea3c3353409309b63f5c8f0b1faf42754" +dependencies = [ + "pastey 0.2.3", +] + [[package]] name = "orbclient" version = "0.3.51" @@ -5302,7 +5383,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2621685985a2ebf1c516881c026032ac7deafcda1a2c9b7850dc81e3dfcb64c1" dependencies = [ "backtrace", - "cfg-if 1.0.4", + "cfg-if", "libc", "petgraph 0.6.5", "redox_syscall 0.5.18", @@ -5322,6 +5403,12 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "35fb2e5f958ec131621fdd531e9fc186ed768cbe395337403ae56c17a74c68ec" +[[package]] +name = "pastey" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2ee67f1008b1ba2321834326597b8e186293b049a023cdef258527550b9935b4" + [[package]] name = "pbjson" version = "0.6.0" @@ -5535,7 +5622,7 @@ checksum = 
"4b2d323e8ca7996b3e23126511a523f7e62924d93ecd5ae73b333815b0eb3dce" dependencies = [ "autocfg", "bitflags 1.3.2", - "cfg-if 1.0.4", + "cfg-if", "concurrent-queue", "libc", "log", @@ -5549,7 +5636,7 @@ version = "3.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5d0e4f59085d47d8241c88ead0f274e8a0cb551f3625263c05eb8dd897c34218" dependencies = [ - "cfg-if 1.0.4", + "cfg-if", "concurrent-queue", "hermit-abi", "pin-project-lite", @@ -5611,6 +5698,21 @@ dependencies = [ "zerocopy", ] +[[package]] +name = "preencode_publish" +version = "0.1.0" +dependencies = [ + "anyhow", + "clap", + "env_logger 0.11.10", + "gstreamer", + "livekit", + "livekit-api", + "livekit-capture", + "log", + "tokio", +] + [[package]] name = "presser" version = "0.3.1" @@ -6004,7 +6106,7 @@ dependencies = [ "av1-grain", "bitstream-io", "built", - "cfg-if 1.0.4", + "cfg-if", "interpolate_name", "itertools 0.14.0", "libc", @@ -6219,7 +6321,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a4689e6c2294d81e88dc6261c768b63bc4fcdb852be6d1352498b114f61383b7" dependencies = [ "cc", - "cfg-if 1.0.4", + "cfg-if", "getrandom 0.2.17", "libc", "untrusted", @@ -6533,7 +6635,7 @@ checksum = "b7f4bc775c73d9a02cde8bf7b2ec4c9d12743edf609006c7facc23998404cd1d" dependencies = [ "bitflags 2.13.0", "core-foundation 0.10.1", - "core-foundation-sys 0.8.7", + "core-foundation-sys", "libc", "security-framework-sys", ] @@ -6544,7 +6646,7 @@ version = "2.17.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6ce2691df843ecc5d231c0b14ece2acc3efb62c0a398c7e1d875f3983ce020e3" dependencies = [ - "core-foundation-sys 0.8.7", + "core-foundation-sys", "libc", ] @@ -6673,7 +6775,7 @@ version = "0.10.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e3bf829a2d51ab4a5ddf1352d8470c140cadc8301b2ae1789db023f01cedd6ba" dependencies = [ - "cfg-if 1.0.4", + "cfg-if", "cpufeatures", "digest", ] @@ -6684,7 +6786,7 @@ 
version = "0.10.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a7507d819769d01a365ab707794a4084392c824f54a7a6a7862f8c3d0892b283" dependencies = [ - "cfg-if 1.0.4", + "cfg-if", "cpufeatures", "digest", ] @@ -6911,15 +7013,6 @@ dependencies = [ "hound", ] -[[package]] -name = "spin" -version = "0.9.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67" -dependencies = [ - "lock_api", -] - [[package]] name = "spirv" version = "0.4.0+sdk-1.4.341.0" @@ -6959,7 +7052,7 @@ version = "0.15.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "07f9fdfdd31a0ff38b59deb401be81b73913d76c9cc5b1aed4e1330a223420b9" dependencies = [ - "cfg-if 1.0.4", + "cfg-if", "hashbrown 0.14.5", "serde", ] @@ -7097,7 +7190,7 @@ version = "3.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "adcb7fd841cd518e279be3d5a3eb0636409487998a4aff22f3de87b81e88384f" dependencies = [ - "cfg-if 1.0.4", + "cfg-if", "proc-macro2", "quote", "syn 2.0.117", @@ -7192,7 +7285,7 @@ version = "1.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f60246a4944f24f6e018aa17cdeffb7818b76356965d03b07d6a9886e8962185" dependencies = [ - "cfg-if 1.0.4", + "cfg-if", ] [[package]] @@ -7249,7 +7342,7 @@ dependencies = [ "arrayref", "arrayvec", "bytemuck", - "cfg-if 1.0.4", + "cfg-if", "log", "tiny-skia-path", ] @@ -7637,7 +7730,7 @@ dependencies = [ "num-complex", "num-integer", "num-traits", - "pastey", + "pastey 0.1.1", "rustfft", "smallvec", "tract-data", @@ -7700,7 +7793,7 @@ dependencies = [ "liquid-derive", "log", "num-traits", - "pastey", + "pastey 0.1.1", "scan_fmt", "smallvec", "time", @@ -8210,7 +8303,7 @@ version = "0.2.114" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6532f9a5c1ece3798cb1c2cfdba640b9b3ba884f5db45973a6f442510a87d38e" dependencies = [ - "cfg-if 1.0.4", + 
"cfg-if", "once_cell", "rustversion", "wasm-bindgen-macro", @@ -8223,7 +8316,7 @@ version = "0.4.64" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e9c5522b3a28661442748e09d40924dfb9ca614b21c00d3fd135720e48b67db8" dependencies = [ - "cfg-if 1.0.4", + "cfg-if", "futures-util", "js-sys", "once_cell", @@ -8546,7 +8639,7 @@ dependencies = [ "arrayvec", "bitflags 2.13.0", "bytemuck", - "cfg-if 1.0.4", + "cfg-if", "cfg_aliases", "document-features", "hashbrown 0.16.1", @@ -8650,7 +8743,7 @@ dependencies = [ "bitflags 2.13.0", "block2 0.6.2", "bytemuck", - "cfg-if 1.0.4", + "cfg-if", "cfg_aliases", "glow", "glutin_wgl_sys", @@ -9236,7 +9329,7 @@ dependencies = [ "cfg_aliases", "concurrent-queue", "core-foundation 0.9.4", - "core-graphics 0.23.2", + "core-graphics", "cursor-icon", "dpi", "js-sys", diff --git a/Cargo.toml b/Cargo.toml index bac58cbef..c76c818d5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -8,6 +8,7 @@ members = [ "livekit-uniffi", "livekit-datatrack", "livekit-ffi-node-bindings", + "livekit-capture", "livekit-runtime", "livekit-wakeword", "libwebrtc", @@ -29,6 +30,7 @@ members = [ "examples/local_video", "examples/mobile", "examples/play_from_disk", + "examples/preencode_publish", "examples/rpc", "examples/save_to_disk", "examples/screensharing", @@ -49,6 +51,7 @@ imgproc = { version = "0.3.19", path = "imgproc" } libwebrtc = { version = "0.3.38", path = "libwebrtc" } livekit = { version = "0.7.49", path = "livekit" } livekit-api = { version = "0.5.4", path = "livekit-api" } +livekit-capture = { version = "0.1.0", path = "livekit-capture" } livekit-ffi = { version = "0.12.67", path = "livekit-ffi" } livekit-datatrack = { version = "0.1.9", path = "livekit-datatrack" } livekit-protocol = { version = "0.7.9", path = "livekit-protocol" } @@ -59,6 +62,7 @@ webrtc-sys-build = { version = "0.3.18", path = "webrtc-sys/build" } yuv-sys = { version = "0.3.14", path = "yuv-sys" } anyhow = "1.0" +base64 = "0.22" bytes = "1.10" clap = "4.5" 
console-subscriber = "0.1" @@ -67,8 +71,11 @@ from_variants = "1.0.2" futures = "0.3" futures-core = "0.3" futures-util = { version = "0.3", default-features = false } +gstreamer = "0.25.2" +gstreamer-app = "0.25.2" lazy_static = "1.4" log = "0.4" +md-5 = "0.10" parking_lot = "0.12" prost = "0.14" prost-build = "0.14" diff --git a/examples/local_video/Cargo.toml b/examples/local_video/Cargo.toml index 2f51d50d5..d4c36ade0 100644 --- a/examples/local_video/Cargo.toml +++ b/examples/local_video/Cargo.toml @@ -32,7 +32,7 @@ tokio = { workspace = true, features = ["full", "parking_lot"] } tokio-stream = { workspace = true } livekit = { workspace = true, features = ["rustls-tls-native-roots"] } livekit-api = { workspace = true } -yuv-sys = { workspace = true, features = ["jpeg"] } +livekit-capture = { workspace = true } futures = { workspace = true } clap = { workspace = true, features = ["derive"] } log = { workspace = true } @@ -40,7 +40,6 @@ env_logger = { workspace = true } eframe = { workspace = true, features = ["default_fonts", "wgpu", "persistence"] } egui = { workspace = true } egui-wgpu = { workspace = true } -image = { workspace = true } wgpu = { workspace = true } winit = { workspace = true, features = [ "android-native-activity" ] } parking_lot = { workspace = true, features = ["deadlock_detection"] } @@ -48,19 +47,11 @@ anyhow = { workspace = true } chrono = "0.4" bytemuck = { version = "1.16", features = ["derive"] } -nokhwa = { git = "https://github.com/l1npengtul/nokhwa", rev = "4923ecab7cf26f9dba83867a15a9d8662d021296", default-features = false, features = ["output-threaded"] } - [target.'cfg(target_os = "macos")'.dependencies] -nokhwa = { git = "https://github.com/l1npengtul/nokhwa", rev = "4923ecab7cf26f9dba83867a15a9d8662d021296", default-features = false, features = ["input-avfoundation"] } +livekit-capture = { workspace = true, features = ["avfoundation"] } metal = "0.32" objc2 = { version = "0.6.0", features = ["relax-sign-encoding"] } 
objc2-metal = "0.3.2" [target.'cfg(target_os = "linux")'.dependencies] -nokhwa = { git = "https://github.com/l1npengtul/nokhwa", rev = "4923ecab7cf26f9dba83867a15a9d8662d021296", default-features = false, features = ["input-v4l"] } - -[target.'cfg(target_os = "windows")'.dependencies] -nokhwa = { git = "https://github.com/l1npengtul/nokhwa", rev = "4923ecab7cf26f9dba83867a15a9d8662d021296", default-features = false, features = ["input-msmf"] } - -[build-dependencies] -cc = { workspace = true } +livekit-capture = { workspace = true, features = ["libargus", "v4l"] } diff --git a/examples/local_video/README.md b/examples/local_video/README.md index edb608802..70806a2dd 100644 --- a/examples/local_video/README.md +++ b/examples/local_video/README.md @@ -2,7 +2,8 @@ Examples demonstrating capturing frames from a local camera video and publishing to LiveKit, listing camera capabilities, subscribing to render video in a window, and showing a low-latency clock for measurement. -**Note:** These examples are intended for **desktop platforms only** (macOS, Linux, Windows). +**Note:** These examples are intended for **macOS and Linux** (including NVIDIA Jetson). +Windows camera capture is not currently supported; the test-pattern publisher (`--test-pattern`), subscriber, and clock still work there. You must enable the `desktop` feature when building or running them. For smoother local rendering, especially above 720p, run the publisher/subscriber with `cargo run --release`. 
@@ -69,10 +70,20 @@ Publisher usage: cargo run -p local_video -F desktop --bin publisher -- \ --source argus \ --camera-index 0 \ + --zero-copy \ --codec h265 \ --room-name demo \ --identity jetson-cam-1 + # publish from Argus with a visible burned timestamp (uses CPU I420 copy) + cargo run -p local_video -F desktop --bin publisher -- \ + --source argus \ + --camera-index 0 \ + --attach-timestamp \ + --burn-timestamp \ + --room-name demo \ + --identity jetson-cam-1 + # publish AV1 through the Jetson hardware encoder (Orin only) cargo run -p local_video -F desktop --bin publisher -- \ --source argus \ @@ -83,7 +94,13 @@ Publisher usage: # publish a static SMPTE color-bar test pattern (no camera required) cargo run -p local_video -F desktop --bin publisher -- \ - --test-pattern \ + --test-pattern 0 \ + --room-name demo \ + --identity test-1 + + # publish an animated encoder exercise test pattern (no camera required) + cargo run -p local_video -F desktop --bin publisher -- \ + --test-pattern 1 \ --room-name demo \ --identity test-1 @@ -123,8 +140,9 @@ The clock draws a 3x9 grid below the time. The top row fills from `0` to `9` for Publisher flags (in addition to the common connection flags above): - `--camera-index `: Camera index to use (default: `0`). Use `--list-cameras` to see available indices. - `--source `: Camera backend to use (default: `uvc`). `argus` uses NVIDIA libargus for MIPI CSI cameras and is available only on Linux aarch64 Jetson builds. -- `--format `: UVC camera capture format (default: `auto`). `auto` tries uncompressed YUYV first and falls back to MJPEG; `mjpeg` can reduce USB bandwidth when running multiple cameras. -- `--test-pattern`: Generate a standard SMPTE 75% color-bar test pattern instead of capturing from a camera. `--camera-index` is ignored when this is set; `--width`, `--height`, and `--fps` still control the output resolution and frame rate. +- `--format `: UVC camera capture format (default: `auto`). 
`auto` prefers uncompressed YUYV and falls back to the camera's other supported formats; `mjpeg` can reduce USB bandwidth when running multiple cameras. If an explicitly requested format is unavailable, the publisher logs a warning and continues with the negotiated format. +- `--zero-copy`: Use a platform zero-copy capture/encode path when available, such as AVFoundation IOSurface-backed CVPixelBuffers on macOS or Argus DMA-BUF frames on Jetson. If the selected source does not support zero-copy, the publisher logs a warning and uses CPU I420 capture. +- `--test-pattern [0|1]`: Generate a test pattern instead of capturing from a camera. `0` is a static SMPTE 75% color-bar pattern and `1` is an animated encoder exercise graphic. Omitting the value defaults to `0`. `--camera-index` is ignored when this is set; `--width`, `--height`, and `--fps` still control the output resolution and frame rate. - `--width `: Desired capture width (default: `1280`). - `--height `: Desired capture height (default: `720`). - `--fps `: Desired capture framerate (default: `30`). @@ -132,7 +150,7 @@ Publisher flags (in addition to the common connection flags above): - `--simulcast`: Publish simulcast video (multiple layers when the resolution is large enough). - `--max-bitrate `: Max video bitrate for the main (highest) layer in bits per second (e.g. `1500000`). - `--attach-timestamp`: Attach the current wall-clock time (microseconds since UNIX epoch) as the user timestamp on each published frame. The subscriber can display this to measure end-to-end latency. -- `--burn-timestamp`: Burn the attached timestamp into the video frame as a visible overlay. Has no effect unless `--attach-timestamp` is also set. +- `--burn-timestamp`: Burn the attached timestamp into the video frame as a visible overlay. Has no effect unless `--attach-timestamp` is also set. 
With `--zero-copy`, frames stay out of CPU memory, so the publisher logs a warning and skips the visible burn while still attaching timestamp metadata. - `--attach-frame-id`: Attach a monotonically increasing frame ID to each published frame via the packet trailer. The subscriber displays this in the timestamp overlay when `--display-timestamp` is used. - `--display-video`: Open a window that displays the video frames being published. - `--display-timing`: Burn publisher timing metrics into the local preview window. Requires `--display-video`. @@ -179,6 +197,6 @@ Notes: - If the active video track is unsubscribed or unpublished, the app clears its state and will automatically attach to the next matching video track when it appears. - For E2EE to work, both publisher and subscriber must specify the same `--e2ee-key` value. If the keys don't match, the subscriber will not be able to decode the video. - The timestamp overlay updates at ~2 Hz so the latency value is readable rather than flickering every frame. -- On Jetson, `--source argus` requires the Jetson Multimedia API headers under `/usr/src/jetson_multimedia_api`. It publishes NV12 DMA buffers through the Jetson hardware encoder; local publisher preview and burned timestamps are not supported on that path. +- On Jetson, `--source argus` requires the Jetson Multimedia API headers under `/usr/src/jetson_multimedia_api`. Use `--zero-copy` to publish NV12 DMA-BUF frames through the Jetson hardware encoder. Without `--zero-copy`, Argus frames are copied to CPU I420 before publish so `--attach-timestamp --burn-timestamp` can draw the timestamp into the frame. - Jetson AV1 hardware encoding requires an Orin-class device (e.g. Orin NX or AGX Orin on JetPack 5+); the encoder is probed at startup and on devices without AV1 support (e.g. Xavier) `--codec av1` automatically falls back to the software libaom encoder. The Jetson AV1 encoder produces a single L1T1 stream (no SVC). 
- On Linux, preview windows use the Vulkan `wgpu` backend by default to avoid GLES/EGL conflicts on Jetson desktops. Set `WGPU_BACKEND=gl` or another supported `wgpu` backend to override this. diff --git a/examples/local_video/build.rs b/examples/local_video/build.rs deleted file mode 100644 index 1edde6e87..000000000 --- a/examples/local_video/build.rs +++ /dev/null @@ -1,46 +0,0 @@ -use std::path::PathBuf; - -fn main() { - let target_os = std::env::var("CARGO_CFG_TARGET_OS").unwrap_or_default(); - let target_arch = std::env::var("CARGO_CFG_TARGET_ARCH").unwrap_or_default(); - - // Only compile the Argus shim on aarch64 Linux (Jetson). - if target_os != "linux" || target_arch != "aarch64" { - return; - } - - let argus_include = PathBuf::from("/usr/src/jetson_multimedia_api/argus/include"); - let mmapi_include = PathBuf::from("/usr/src/jetson_multimedia_api/include"); - - if !argus_include.exists() { - println!( - "cargo:warning=Argus headers not found at {}; skipping lk_argus build", - argus_include.display() - ); - return; - } - - println!("cargo:rerun-if-changed=src/lk_argus.cpp"); - - cc::Build::new() - .cpp(true) - .file("src/lk_argus.cpp") - .include(&argus_include) - .include(&mmapi_include) - .flag("-std=c++14") - .flag("-Wno-deprecated-declarations") - .compile("lk_argus"); - - // Link Argus client library (talks to nvargus-daemon) and NvBufSurface - println!("cargo:rustc-link-lib=dylib=nvargus_socketclient"); - println!("cargo:rustc-link-lib=dylib=nvbufsurface"); - - // Tegra library path - let tegra_lib_dir = PathBuf::from("/usr/lib/aarch64-linux-gnu/tegra"); - if tegra_lib_dir.exists() { - println!("cargo:rustc-link-search=native={}", tegra_lib_dir.display()); - } - - // Standard aarch64 library path - println!("cargo:rustc-link-search=native=/usr/lib/aarch64-linux-gnu"); -} diff --git a/examples/local_video/src/argus.rs b/examples/local_video/src/argus.rs deleted file mode 100644 index 0fadbc9d3..000000000 --- a/examples/local_video/src/argus.rs +++ 
/dev/null @@ -1,169 +0,0 @@ -//! Thin FFI wrapper around NVIDIA Argus/libargus for MIPI CSI camera capture. -//! -//! This module provides DMA-buffer frame acquisition from MIPI cameras on Jetson -//! platforms. Frames are blitted from Argus' EGLStream frame into NvBufSurface -//! DMA file descriptors that can be passed to the hardware encoder without -//! CPU-side pixel copies. -//! -//! The Argus API is C++, so we use a small C shim (linked via build.rs on -//! Jetson) to expose the capture session lifecycle. - -use std::ffi::c_int; -use std::io; - -/// Opaque handle to an Argus capture session. -pub struct ArgusCaptureSession { - handle: *mut std::ffi::c_void, - width: u32, - height: u32, -} - -/// A captured Argus frame backed by a DMA buffer. -pub struct ArgusFrame { - /// DMA buffer file descriptor containing an NV12 frame. - pub dmabuf_fd: i32, - /// Argus sensor start timestamp in nanoseconds, when available. - pub sensor_timestamp_ns: Option, - /// Time spent waiting for `FrameConsumer::acquireFrame` to return. - pub acquire_wait_ns: u64, - /// Time spent copying the acquired EGLStream frame into the DMA buffer. - pub blit_ns: u64, -} - -// The C++ session is single-threaded but we move it across the tokio runtime. -unsafe impl Send for ArgusCaptureSession {} - -extern "C" { - fn lk_argus_create_session( - sensor_index: c_int, - width: c_int, - height: c_int, - fps: c_int, - ) -> *mut std::ffi::c_void; - - fn lk_argus_destroy_session(session: *mut std::ffi::c_void); - - /// Acquire the next frame and optionally return the Argus sensor timestamp. - /// Returns the NvBufSurface DMA fd, or -1 on error. - /// The fd is valid until the next acquire call or `lk_argus_release_frame`. - fn lk_argus_acquire_frame_with_metadata( - session: *mut std::ffi::c_void, - sensor_timestamp_ns: *mut u64, - acquire_wait_ns: *mut u64, - blit_ns: *mut u64, - ) -> c_int; - - /// Release the most recently acquired frame back to the Argus buffer pool. 
- fn lk_argus_release_frame(session: *mut std::ffi::c_void); -} - -impl ArgusCaptureSession { - /// Open an Argus capture session on the given MIPI CSI sensor. - /// - /// `sensor_index` selects the camera (0 for the first CSI camera). - /// The session negotiates the given resolution and framerate with the ISP. - pub fn new(sensor_index: u32, width: u32, height: u32, fps: u32) -> io::Result { - let handle = unsafe { - lk_argus_create_session( - sensor_index as c_int, - width as c_int, - height as c_int, - fps as c_int, - ) - }; - if handle.is_null() { - return Err(io::Error::new( - io::ErrorKind::Other, - "Failed to create Argus capture session", - )); - } - Ok(Self { handle, width, height }) - } - - /// Acquire the next captured frame as a DMA buffer. - /// - /// The returned fd refers to an NvBufSurface in NV12 format. It remains - /// valid until [`release_frame`](Self::release_frame) is called or the - /// next `acquire_frame` implicitly releases the previous one. - pub fn acquire_frame(&mut self) -> io::Result { - let mut sensor_timestamp_ns = 0; - let mut acquire_wait_ns = 0; - let mut blit_ns = 0; - let fd = unsafe { - lk_argus_acquire_frame_with_metadata( - self.handle, - &mut sensor_timestamp_ns, - &mut acquire_wait_ns, - &mut blit_ns, - ) - }; - if fd < 0 { - return Err(io::Error::new(io::ErrorKind::Other, "Argus frame acquisition failed")); - } - Ok(ArgusFrame { - dmabuf_fd: fd, - sensor_timestamp_ns: (sensor_timestamp_ns > 0).then_some(sensor_timestamp_ns), - acquire_wait_ns, - blit_ns, - }) - } - - /// Release the most recently acquired frame back to the buffer pool. 
- pub fn release_frame(&mut self) { - unsafe { lk_argus_release_frame(self.handle) }; - } - - pub fn width(&self) -> u32 { - self.width - } - - pub fn height(&self) -> u32 { - self.height - } -} - -impl Drop for ArgusCaptureSession { - fn drop(&mut self) { - if !self.handle.is_null() { - unsafe { lk_argus_destroy_session(self.handle) }; - self.handle = std::ptr::null_mut(); - } - } -} - -/// Convert an Argus `CLOCK_MONOTONIC` sensor timestamp into a UNIX-epoch microsecond value -/// by computing the offset between the current monotonic clock and the supplied wall time. -pub fn sensor_monotonic_ns_to_unix_us(sensor_timestamp_ns: u64, wall_time_us: u64) -> Option { - let monotonic_now_ns = monotonic_time_ns_now()?; - let monotonic_delta_us = monotonic_now_ns.abs_diff(sensor_timestamp_ns) / 1_000; - if sensor_timestamp_ns <= monotonic_now_ns { - Some(wall_time_us.saturating_sub(monotonic_delta_us)) - } else { - Some(wall_time_us.saturating_add(monotonic_delta_us)) - } -} - -/// Current `CLOCK_MONOTONIC` value in nanoseconds, used to translate Argus sensor -/// timestamps into wall time. -fn monotonic_time_ns_now() -> Option { - #[repr(C)] - struct Timespec { - tv_sec: i64, - tv_nsec: i64, - } - - extern "C" { - fn clock_gettime(clk_id: i32, tp: *mut Timespec) -> i32; - } - - const CLOCK_MONOTONIC: i32 = 1; - let mut ts = Timespec { tv_sec: 0, tv_nsec: 0 }; - let ret = unsafe { - // SAFETY: `ts` is a valid, writable `Timespec` for the duration of the call. 
- clock_gettime(CLOCK_MONOTONIC, &mut ts) - }; - if ret != 0 || ts.tv_sec < 0 || ts.tv_nsec < 0 { - return None; - } - Some(ts.tv_sec as u64 * 1_000_000_000 + ts.tv_nsec as u64) -} diff --git a/examples/local_video/src/list_devices.rs b/examples/local_video/src/list_devices.rs index 194aedfa3..59c90d2ab 100644 --- a/examples/local_video/src/list_devices.rs +++ b/examples/local_video/src/list_devices.rs @@ -1,14 +1,8 @@ use anyhow::Result; -use nokhwa::pixel_format::RgbFormat; -use nokhwa::utils::{ - ApiBackend, CameraFormat, CameraInfo, FrameFormat, RequestedFormat, RequestedFormatType, - Resolution, -}; -use nokhwa::Camera; -use std::collections::BTreeMap; +use livekit_capture::device::{CaptureDeviceInfo, CaptureFormat}; fn main() -> Result<()> { - let cameras = nokhwa::query(ApiBackend::Auto)?; + let cameras = platform_devices()?; if cameras.is_empty() { println!("No cameras detected."); return Ok(()); @@ -17,96 +11,66 @@ fn main() -> Result<()> { println!("Available cameras and capabilities:"); for (idx, info) in cameras.iter().enumerate() { println!(); - println!("{}. {}", idx, info.human_name()); - match enumerate_capabilities(info) { - Ok(formats) => print_capabilities(&formats), - Err(err) => println!(" Capabilities: unavailable ({})", err), - } + println!("{}. {}", idx, info.name); + print_device_details(info); } Ok(()) } -/// Enumerate camera capabilities using only Nokhwa public APIs. -/// -/// This avoids any direct dependency on platform-specific bindings crates like -/// `nokhwa_bindings_macos`, making the example portable across targets. -fn enumerate_capabilities( - info: &CameraInfo, -) -> Result>>> { - // We don't need to actually capture frames; we just want to query supported formats. - // Using "None" requested format keeps it flexible. - let requested = RequestedFormat::new::(RequestedFormatType::None); - - // `CameraInfo::index()` is what Nokhwa uses to open the device. 
Depending on Nokhwa - // version, this may be Copy/Clone; clone defensively. - let mut camera = Camera::new(info.index().clone(), requested)?; - - // Prefer FourCC-based queries if available; otherwise fall back to camera formats. - let mut capabilities = BTreeMap::new(); - - if let Ok(mut fourccs) = camera.compatible_fourcc() { - fourccs.sort(); - for fourcc in fourccs { - // Returns a map: Resolution -> Vec - let mut res_map = camera.compatible_list_by_resolution(fourcc)?; - let mut res_sorted = BTreeMap::new(); - - for (res, mut fps_list) in res_map.drain() { - fps_list.sort(); - fps_list.dedup(); - res_sorted.insert(res, fps_list); - } +#[cfg(target_os = "macos")] +fn platform_devices() -> Result> { + Ok(livekit_capture::sources::avfoundation::devices()?) +} - capabilities.insert(fourcc, res_sorted); - } - } else { - // Some backends don’t support FourCC enumeration; use generic formats instead. - let formats = camera.compatible_camera_formats()?; - capabilities = capabilities_from_formats(formats); - } +#[cfg(target_os = "linux")] +fn platform_devices() -> Result> { + Ok(livekit_capture::sources::v4l::devices()?) 
+} - Ok(capabilities) +#[cfg(not(any(target_os = "macos", target_os = "linux")))] +fn platform_devices() -> Result> { + anyhow::bail!( + "camera listing is not supported on {}; local_video supports macOS AVFoundation and Linux V4L2", + std::env::consts::OS + ); } -fn capabilities_from_formats( - formats: Vec, -) -> BTreeMap>> { - let mut capabilities = BTreeMap::new(); - for fmt in formats { - let res_map = capabilities.entry(fmt.format()).or_insert_with(BTreeMap::new); - let fps_list = res_map.entry(fmt.resolution()).or_insert_with(Vec::new); - fps_list.push(fmt.frame_rate()); +fn print_device_details(info: &CaptureDeviceInfo) { + println!(" ID: {}", info.id); + if let Some(model_id) = info.model_id.as_deref() { + println!(" Model: {}", model_id); } - for res_map in capabilities.values_mut() { - for fps_list in res_map.values_mut() { - fps_list.sort(); - fps_list.dedup(); - } + if let Some(manufacturer) = info.manufacturer.as_deref() { + println!(" Manufacturer: {}", manufacturer); } - capabilities + print_capabilities(&info.formats); } -fn print_capabilities(capabilities: &BTreeMap>>) { - if capabilities.is_empty() { - println!(" Capabilities: none reported"); +fn print_capabilities(formats: &[CaptureFormat]) { + if formats.is_empty() { + println!(" Capabilities: none reported by backend"); return; } + let mut formats = formats.to_vec(); + formats.sort_by_key(|format| { + ( + format!("{:?}", format.frame_format), + format.resolution.width, + format.resolution.height, + format.frame_rate, + ) + }); + println!(" Capabilities:"); - for (format, resolutions) in capabilities { - println!(" - Format: {}", format); - if resolutions.is_empty() { - println!(" (no resolutions reported)"); - continue; - } - for (resolution, fps_list) in resolutions { - let fps_text = if fps_list.is_empty() { - "unknown".to_string() - } else { - fps_list.iter().map(|fps| fps.to_string()).collect::>().join(", ") - }; - println!(" {} @ {} fps", resolution, fps_text); - } + for format in 
formats { + println!( + " - {:?}: {}x{} @ {} fps", + format.frame_format, + format.resolution.width, + format.resolution.height, + format.frame_rate + ); } } diff --git a/examples/local_video/src/publisher.rs b/examples/local_video/src/publisher.rs index 28932c5b9..68c5d2807 100644 --- a/examples/local_video/src/publisher.rs +++ b/examples/local_video/src/publisher.rs @@ -6,19 +6,25 @@ use livekit::options::{ VideoEncoderBackend, VideoEncoding, VideoPreset, }; use livekit::prelude::*; -use livekit::webrtc::video_frame::{FrameMetadata, I420Buffer, VideoFrame, VideoRotation}; +use livekit::webrtc::video_frame::{ + native::{NativeBuffer, VideoFrameBufferExt}, + FrameMetadata, I420Buffer, VideoFrame, VideoRotation, +}; use livekit::webrtc::video_source::native::NativeVideoSource; use livekit::webrtc::video_source::{RtcVideoSource, VideoResolution}; use livekit_api::access_token; use livekit_api::services::room::{CreateRoomOptions, RoomClient}; use livekit_api::services::{ServiceError, TwirpError, TwirpErrorCode}; -use log::{debug, info}; -use nokhwa::pixel_format::RgbFormat; -use nokhwa::utils::{ - ApiBackend, CameraFormat, CameraIndex, FrameFormat, RequestedFormat, RequestedFormatType, - Resolution, +use livekit_capture::device::{ + CaptureBackend, CaptureDeviceSelector, CaptureFormat as LkCaptureFormat, CaptureFormatRequest, + CaptureFrameFormat, CapturePath as LkCapturePath, CaptureResolution, }; -use nokhwa::Camera; +use livekit_capture::source::{CaptureFrame, CaptureSourceOptions, VideoCaptureSource}; +#[cfg(all(target_os = "linux", target_arch = "aarch64"))] +use livekit_capture::sources::argus::{self, ArgusCaptureOptions, ArgusCaptureSession}; +#[cfg(target_os = "macos")] +use livekit_capture::sources::avfoundation::AvFoundationStopHandle; +use log::{debug, info}; use parking_lot::Mutex; use std::collections::{HashMap, VecDeque}; use std::env; @@ -27,10 +33,7 @@ use std::sync::{ Arc, }; use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH}; -use yuv_sys; 
-#[cfg(all(target_os = "linux", target_arch = "aarch64"))] -mod argus; mod codec_display; mod test_pattern; mod timestamp_burn; @@ -38,7 +41,7 @@ mod user_data; mod video_display; mod viewport_aspect; -use test_pattern::TestPattern; +use test_pattern::{TestPattern, TestPatternKind}; use timestamp_burn::TimestampOverlay; use video_display::{align_up, PublisherTimingSample, SharedYuv}; @@ -66,29 +69,35 @@ impl From for VideoCodec { /// Selects the camera backend used by the publisher. #[derive(Copy, Clone, Debug, PartialEq, Eq, ValueEnum)] enum SourceKind { - /// USB / V4L2 camera via the `nokhwa` crate (default). + /// Platform camera via livekit-capture (AVFoundation on macOS, V4L2 on Linux). Uvc, /// NVIDIA Jetson MIPI CSI camera via libargus (Jetson-only). Argus, } -/// Selects the UVC camera capture pixel format. +/// Selects the UVC camera capture frame format. #[derive(Copy, Clone, Debug, PartialEq, Eq, ValueEnum)] enum CaptureFormat { - /// Try YUYV first and fall back to MJPEG. + /// Prefer YUYV, falling back to other formats supported by the camera. Auto, /// Request uncompressed YUYV capture. Yuv, /// Request compressed MJPEG capture. Mjpeg, + /// Request uncompressed GREY capture. + Grey, } impl CaptureFormat { - fn frame_formats(self) -> &'static [FrameFormat] { + /// Preferred source frame format used for V4L2 format negotiation; the + /// capture facade falls back to the camera's other supported formats when + /// the preferred one is unavailable. 
+ #[cfg(target_os = "linux")] + fn preferred_frame_format(self) -> CaptureFrameFormat { match self { - Self::Auto => &[FrameFormat::YUYV, FrameFormat::MJPEG], - Self::Yuv => &[FrameFormat::YUYV], - Self::Mjpeg => &[FrameFormat::MJPEG], + Self::Auto | Self::Yuv => CaptureFrameFormat::Yuyv, + Self::Mjpeg => CaptureFrameFormat::Mjpeg, + Self::Grey => CaptureFrameFormat::Grey, } } } @@ -99,6 +108,7 @@ impl std::fmt::Display for CaptureFormat { Self::Auto => write!(f, "auto"), Self::Yuv => write!(f, "yuv"), Self::Mjpeg => write!(f, "mjpeg"), + Self::Grey => write!(f, "grey"), } } } @@ -148,6 +158,7 @@ fn video_encoder_backend_name(backend: VideoEncoderBackend) -> &'static str { VideoEncoderBackend::Nvenc => "nvenc", VideoEncoderBackend::Vaapi => "vaapi", VideoEncoderBackend::VideoToolbox => "videotoolbox", + VideoEncoderBackend::PreEncoded => "preencoded", _ => "unknown", } } @@ -167,17 +178,28 @@ struct Args { #[arg(long, default_value_t = 0)] camera_index: usize, - /// Camera backend: `uvc` (default, V4L2/USB via nokhwa) or `argus` (Jetson MIPI CSI). + /// Camera backend: `uvc` (default platform camera) or `argus` (Jetson MIPI CSI). #[arg(long, value_enum, default_value_t = SourceKind::Uvc)] source: SourceKind, - /// UVC camera capture format: `auto` tries YUYV then MJPEG; `mjpeg` uses less USB bandwidth. + /// UVC camera capture format: `auto` prefers YUYV and falls back to other supported formats. #[arg(long, value_enum, default_value_t = CaptureFormat::Auto)] format: CaptureFormat, - /// Generate a standard SMPTE color-bar test pattern instead of using a camera - #[arg(long, default_value_t = false, conflicts_with_all = ["list_cameras", "list_encoders"])] - test_pattern: bool, + /// Use zero-copy platform camera buffers when available. 
+ #[arg(long, default_value_t = false)] + zero_copy: bool, + + /// Generate a numeric test pattern instead of using a camera: 0 = static bars, 1 = animated + #[arg( + long, + value_name = "N", + num_args = 0..=1, + default_missing_value = "0", + value_parser = parse_test_pattern_kind, + conflicts_with_all = ["list_cameras", "list_encoders"] + )] + test_pattern: Option, /// Desired width #[arg(long, default_value_t = 1280)] @@ -275,72 +297,6 @@ fn unix_time_us_now() -> u64 { SystemTime::now().duration_since(UNIX_EPOCH).unwrap().as_micros() as u64 } -const MAX_BACKEND_CAPTURE_TIMESTAMP_AGE_US: u64 = 5_000_000; - -#[derive(Default)] -struct CaptureTimestampLogState { - logged_source: bool, - logged_missing: bool, - logged_invalid: bool, -} - -fn validate_backend_capture_timestamp_us( - capture_timestamp: Duration, - read_wall_time_us: u64, -) -> Result { - let capture_timestamp_us = - u64::try_from(capture_timestamp.as_micros()).map_err(|_| "overflows u64")?; - if capture_timestamp_us == 0 { - return Err("is zero"); - } - if capture_timestamp_us > read_wall_time_us { - return Err("is in the future"); - } - if read_wall_time_us - capture_timestamp_us > MAX_BACKEND_CAPTURE_TIMESTAMP_AGE_US { - return Err("is too old"); - } - Ok(capture_timestamp_us) -} - -fn select_capture_wall_time_us( - backend_capture_timestamp: Option, - fallback_wall_time_us: u64, - read_wall_time_us: u64, - log_state: &mut CaptureTimestampLogState, -) -> u64 { - match backend_capture_timestamp { - Some(capture_timestamp) => { - match validate_backend_capture_timestamp_us(capture_timestamp, read_wall_time_us) { - Ok(capture_timestamp_us) => { - if !log_state.logged_source { - info!("Using camera capture_timestamp for user_timestamp"); - log_state.logged_source = true; - } - capture_timestamp_us - } - Err(reason) => { - if !log_state.logged_invalid { - log::warn!( - "Ignoring camera capture_timestamp because it {reason}; falling back to system wall clock" - ); - log_state.logged_invalid = true; - } 
- fallback_wall_time_us - } - } - } - None => { - if !log_state.logged_missing { - log::warn!( - "Buffer::capture_timestamp() not available; falling back to system wall clock" - ); - log_state.logged_missing = true; - } - fallback_wall_time_us - } - } -} - fn is_twirp_not_found(err: &ServiceError) -> bool { matches!( err, @@ -361,6 +317,13 @@ fn requested_playout_delay( } } +fn parse_test_pattern_kind(value: &str) -> Result { + let numeric = + value.parse::().map_err(|_| format!("test pattern must be 0 or 1, got `{value}`"))?; + TestPatternKind::try_from(numeric) + .map_err(|_| format!("test pattern must be 0 or 1, got `{value}`")) +} + fn normalize_twirp_host(url: &str) -> String { if let Some(rest) = url.strip_prefix("wss://") { return format!("https://{}", rest.trim_end_matches("/rtc")); @@ -371,6 +334,16 @@ fn normalize_twirp_host(url: &str) -> String { url.trim_end_matches("/rtc").to_string() } +fn capture_path_name(path: LkCapturePath) -> &'static str { + match path { + LkCapturePath::Native => "native platform buffer", + LkCapturePath::Raw => "CPU I420", + LkCapturePath::DmaBuf => "DMA-BUF", + LkCapturePath::Encoded => "pre-encoded", + _ => "unknown", + } +} + #[derive(Default)] struct RollingMs { total_ms: f64, @@ -396,8 +369,8 @@ impl RollingMs { struct PublisherTimingSummary { paced_wait_ms: RollingMs, camera_frame_read_ms: RollingMs, - decode_mjpeg_ms: RollingMs, - buffer_convert_ms: RollingMs, + capture_timestamp_age_ms: RollingMs, + capture_timestamp_to_webrtc_ms: RollingMs, frame_draw_ms: RollingMs, submit_to_webrtc_ms: RollingMs, capture_to_webrtc_total_ms: RollingMs, @@ -472,9 +445,45 @@ fn log_publisher_outbound_health(stats: &[livekit::webrtc::stats::RtcStats]) { } } -async fn update_publisher_video_stats(track: LocalVideoTrack, ctrl_c_received: Arc) { +fn maybe_request_zero_copy_fallback( + outbound: &livekit::webrtc::stats::OutboundRtpStats, + first_starved_at: &mut Option, + zero_copy_fallback: &AtomicBool, +) { + if 
zero_copy_fallback.load(Ordering::Acquire) { + return; + } + if outbound.outbound.frames_encoded > 0 || outbound.outbound.key_frames_encoded > 0 { + *first_starved_at = None; + return; + } + if outbound.outbound.pli_count == 0 && outbound.outbound.fir_count == 0 { + return; + } + + let starved_at = first_starved_at.get_or_insert_with(Instant::now); + if starved_at.elapsed() < Duration::from_secs(3) + && outbound.outbound.pli_count < 3 + && outbound.outbound.fir_count == 0 + { + return; + } + + zero_copy_fallback.store(true, Ordering::Release); + log::warn!( + "Zero-copy AVFoundation CVPixelBuffer publish produced no encoded frames; falling back to CPU I420 capture" + ); +} + +async fn update_publisher_video_stats( + track: LocalVideoTrack, + ctrl_c_received: Arc, + zero_copy_fallback: Option>, +) { let mut last_log = Instant::now().checked_sub(Duration::from_secs(2)).unwrap_or_else(Instant::now); + let mut last_encoder_implementation = String::new(); + let mut zero_copy_starved_at = None; let mut interval = tokio::time::interval(Duration::from_secs(1)); interval.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip); @@ -484,6 +493,17 @@ async fn update_publisher_video_stats(track: LocalVideoTrack, ctrl_c_received: A } if let Ok(stats) = track.get_stats().await { + if let Some(implementation) = find_video_outbound_encoder(&stats) { + if implementation != last_encoder_implementation { + info!("Publisher encode path: WebRTC encoder implementation={implementation}"); + last_encoder_implementation = implementation.to_string(); + } + } + if let (Some(outbound), Some(fallback)) = + (find_video_outbound_stats(&stats), zero_copy_fallback.as_ref()) + { + maybe_request_zero_copy_fallback(&outbound, &mut zero_copy_starved_at, fallback); + } if last_log.elapsed() >= Duration::from_secs(2) { log_publisher_outbound_health(&stats); last_log = Instant::now(); @@ -500,7 +520,6 @@ async fn update_publisher_encoder_overlay( ctrl_c_received: Arc, ) { let mut logged_initial = 
false; - let mut last_implementation = String::new(); let mut interval = tokio::time::interval(Duration::from_secs(1)); interval.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip); @@ -512,11 +531,6 @@ async fn update_publisher_encoder_overlay( match track.get_stats().await { Ok(stats) => { if let Some(implementation) = find_video_outbound_encoder(&stats) { - if implementation != last_implementation { - info!("Publisher video encoder implementation: {implementation}"); - last_implementation = implementation.to_string(); - } - let mut shared = shared.lock(); shared.codec_implementation = implementation.to_string(); } @@ -537,8 +551,8 @@ impl PublisherTimingSummary { fn reset(&mut self) { self.paced_wait_ms.reset(); self.camera_frame_read_ms.reset(); - self.decode_mjpeg_ms.reset(); - self.buffer_convert_ms.reset(); + self.capture_timestamp_age_ms.reset(); + self.capture_timestamp_to_webrtc_ms.reset(); self.frame_draw_ms.reset(); self.submit_to_webrtc_ms.reset(); self.capture_to_webrtc_total_ms.reset(); @@ -552,17 +566,17 @@ fn format_timing_line(timings: &PublisherTimingSummary) -> String { "camera_frame_read {:.2}", timings.camera_frame_read_ms.average().unwrap_or_default() ), + format!( + "capture_ts_age {:.2}", + timings.capture_timestamp_age_ms.average().unwrap_or_default() + ), + format!( + "capture_ts_to_webrtc {:.2}", + timings.capture_timestamp_to_webrtc_ms.average().unwrap_or_default() + ), ]; let mut line_two = Vec::new(); - if let Some(decode_ms) = timings.decode_mjpeg_ms.average() { - line_two.push(format!("decode_mjpeg {:.2}", decode_ms)); - } - - line_two.push(format!( - "convert_to_i420 {:.2}", - timings.buffer_convert_ms.average().unwrap_or_default() - )); if let Some(frame_draw_ms) = timings.frame_draw_ms.average() { line_two.push(format!("frame_draw {:.2}", frame_draw_ms)); } @@ -769,47 +783,73 @@ mod tests { } #[test] - fn capture_timestamp_validation_rejects_future_timestamp() { - assert_eq!( - 
validate_backend_capture_timestamp_us(Duration::from_micros(1_001), 1_000), - Err("is in the future") - ); + fn test_pattern_is_absent_by_default() { + let args = Args::try_parse_from(["publisher"]).expect("default args should parse"); + + assert_eq!(args.test_pattern, None); } #[test] - fn capture_timestamp_selection_falls_back_for_invalid_backend_timestamp() { - let mut log_state = CaptureTimestampLogState::default(); - - let selected = select_capture_wall_time_us( - Some(Duration::from_micros(1_001)), - 900, - 1_000, - &mut log_state, - ); + fn zero_copy_is_disabled_by_default() { + let args = Args::try_parse_from(["publisher"]).expect("default args should parse"); - assert_eq!(selected, 900); + assert!(!args.zero_copy); } #[test] - fn capture_timestamp_selection_uses_valid_backend_timestamp() { - let mut log_state = CaptureTimestampLogState::default(); - - let selected = select_capture_wall_time_us( - Some(Duration::from_micros(950)), - 900, - 1_000, - &mut log_state, - ); + fn zero_copy_flag_enables_zero_copy() { + let args = Args::try_parse_from(["publisher", "--zero-copy"]).expect("args should parse"); + + assert!(args.zero_copy); + } - assert_eq!(selected, 950); + #[test] + fn test_pattern_without_value_defaults_to_static_bars() { + let args = + Args::try_parse_from(["publisher", "--test-pattern"]).expect("args should parse"); + + assert_eq!(args.test_pattern, Some(TestPatternKind::StaticColorBars)); + } + + #[test] + fn test_pattern_without_value_allows_following_option() { + let args = Args::try_parse_from(["publisher", "--test-pattern", "--room-name", "demo"]) + .expect("args should parse"); + + assert_eq!(args.test_pattern, Some(TestPatternKind::StaticColorBars)); + assert_eq!(args.room_name, "demo"); + } + + #[test] + fn test_pattern_accepts_numeric_mode() { + let args = + Args::try_parse_from(["publisher", "--test-pattern", "1"]).expect("args should parse"); + + assert_eq!(args.test_pattern, Some(TestPatternKind::AnimatedGraphic)); + } + + #[test] + 
fn capture_format_accepts_grey() { + let args = + Args::try_parse_from(["publisher", "--format", "grey"]).expect("args should parse"); + + assert_eq!(args.format, CaptureFormat::Grey); + } + + #[test] + fn test_pattern_rejects_unknown_numeric_mode() { + let err = + Args::try_parse_from(["publisher", "--test-pattern", "2"]).expect_err("2 is invalid"); + + assert!(err.to_string().contains("test pattern must be 0 or 1")); } } fn list_cameras() -> Result<()> { - let cams = nokhwa::query(ApiBackend::Auto)?; + let cams = VideoCaptureSource::list_devices(CaptureBackend::Auto)?; println!("Available cameras:"); for (i, cam) in cams.iter().enumerate() { - println!("{}. {}", i, cam.human_name()); + println!("{}. {}", i, cam.name); } Ok(()) } @@ -823,17 +863,262 @@ fn list_encoders() { enum VideoInput { TestPattern(TestPattern), - Camera { - camera: Camera, - is_yuyv: bool, - }, + /// Platform camera opened through the `livekit-capture` facade + /// (AVFoundation on macOS, V4L2 on Linux). + Camera(VideoCaptureSource), + /// Jetson MIPI CSI camera driven directly so the `--zero-copy` CPU/DMA + /// toggle stays available; see [`run_argus_capture_loop`]. #[cfg(all(target_os = "linux", target_arch = "aarch64"))] - Argus(argus::ArgusCaptureSession), + Argus(ArgusCaptureSession), +} + +/// Human-readable name of the backend behind a facade camera source. +fn camera_backend_name(source: &VideoCaptureSource) -> &'static str { + match source { + #[cfg(target_os = "macos")] + VideoCaptureSource::AvFoundation { .. } => "AVFoundation", + #[cfg(target_os = "linux")] + VideoCaptureSource::V4l(_) => "V4L2", + _ => "livekit-capture", + } +} + +fn publisher_capture_path_label( + video_input: &VideoInput, + burn_timestamp: bool, + zero_copy: bool, +) -> String { + match video_input { + VideoInput::TestPattern(_) => "test-pattern CPU I420".to_string(), + VideoInput::Camera(source) => match source { + #[cfg(target_os = "macos")] + VideoCaptureSource::AvFoundation { session, .. 
} => { + let source_format = session.format().frame_format; + let core_video_format = core_video_fourcc(session.core_video_pixel_format()); + if zero_copy { + match source.capture_path() { + LkCapturePath::Native if burn_timestamp => { + format!( + "AVFoundation zero-copy IOSurface CVPixelBuffer {core_video_format} from {source_format} (timestamp burn disabled)" + ) + } + LkCapturePath::Native => { + format!( + "AVFoundation zero-copy IOSurface CVPixelBuffer {core_video_format} from {source_format}" + ) + } + path => { + let suffix = if burn_timestamp { + "zero-copy unsupported, timestamp burn" + } else { + "zero-copy unsupported" + }; + format!( + "AVFoundation {} fallback from {source_format}/{core_video_format} ({suffix})", + capture_path_name(path), + ) + } + } + } else if burn_timestamp { + format!( + "AVFoundation CPU I420 from {source_format}/{core_video_format} (timestamp burn)" + ) + } else { + format!("AVFoundation CPU I420 from {source_format}/{core_video_format}") + } + } + #[cfg(target_os = "linux")] + VideoCaptureSource::V4l(session) => { + let format = session.format(); + let decode_suffix = if format.frame_format == CaptureFrameFormat::Mjpeg { + " with MJPEG decode" + } else { + "" + }; + if zero_copy { + let suffix = if burn_timestamp { + "zero-copy unsupported, timestamp burn" + } else { + "zero-copy unsupported" + }; + format!( + "V4L2 {} fallback from {}{} ({suffix})", + capture_path_name(session.capture_path()), + format.frame_format, + decode_suffix + ) + } else { + format!( + "V4L2 {} from {}{}", + capture_path_name(session.capture_path()), + format.frame_format, + decode_suffix + ) + } + } + other => { + format!( + "{} {} capture", + camera_backend_name(other), + capture_path_name(other.capture_path()) + ) + } + }, + #[cfg(all(target_os = "linux", target_arch = "aarch64"))] + VideoInput::Argus(_) => { + if zero_copy && burn_timestamp { + "libargus NV12 DMA-BUF zero-copy (timestamp burn disabled)".to_string() + } else if zero_copy { + 
"libargus NV12 DMA-BUF zero-copy".to_string() + } else if burn_timestamp { + "libargus CPU I420 from NV12 DMA-BUF (timestamp burn)".to_string() + } else { + "libargus CPU I420 from NV12 DMA-BUF".to_string() + } + } + } +} + +#[cfg(target_os = "macos")] +fn core_video_fourcc(pixel_format: u32) -> String { + let bytes = pixel_format.to_be_bytes(); + if bytes.iter().all(|byte| byte.is_ascii_graphic() || *byte == b' ') { + String::from_utf8_lossy(&bytes).into_owned() + } else { + format!("0x{pixel_format:08x}") + } +} + +fn publisher_zero_copy_unsupported_reason(video_input: &VideoInput) -> Option<&'static str> { + match video_input { + VideoInput::TestPattern(_) => Some("test pattern frames are generated in CPU I420 memory"), + VideoInput::Camera(source) => match source { + #[cfg(target_os = "macos")] + VideoCaptureSource::AvFoundation { .. } => { + if source.capture_path() == LkCapturePath::Native { + None + } else { + Some("the selected AVFoundation format is not IOSurface-backed NV12") + } + } + #[cfg(target_os = "linux")] + VideoCaptureSource::V4l(_) => { + Some("V4L2 UVC capture does not expose a zero-copy capture/encode path here") + } + _ => Some( + "the selected capture backend does not expose a zero-copy capture/encode path here", + ), + }, + #[cfg(all(target_os = "linux", target_arch = "aarch64"))] + VideoInput::Argus(_) => None, + } +} + +fn publisher_zero_copy_supported(video_input: &VideoInput) -> bool { + publisher_zero_copy_unsupported_reason(video_input).is_none() +} + +fn publisher_uses_zero_copy_camera_capture(video_input: &VideoInput, zero_copy: bool) -> bool { + if !zero_copy { + return false; + } + + match video_input { + VideoInput::Camera(source) => source.capture_path() == LkCapturePath::Native, + _ => false, + } +} + +enum CapturedFrameBuffer { + I420(VideoFrame), + #[cfg(target_os = "macos")] + Native(VideoFrame), +} + +/// One frame obtained from the active video input, together with the timing +/// context the publish pipeline records. 
+struct SourcedFrame { + buffer: CapturedFrameBuffer, + /// Wall-clock capture timestamp in microseconds (camera-provided when available). + capture_wall_time_us: u64, + /// Wall-clock time the frame was read from the source, in microseconds. + read_wall_time_us: u64, + /// When the frame buffer became available to the publish pipeline. + acquired_at: Instant, + /// When work on this frame began; `capture_to_webrtc_total` is measured from here. + pipeline_started_at: Instant, + /// Whether `capture_wall_time_us` came from a camera-provided timestamp. + has_camera_timestamp: bool, +} + +fn sourced_frame_from_capture(frame: CaptureFrame) -> Result { + let acquired_at = Instant::now(); + match frame { + CaptureFrame::Raw(raw) => Ok(SourcedFrame { + has_camera_timestamp: raw.sensor_timestamp_us.is_some(), + capture_wall_time_us: raw.capture_wall_time_us, + read_wall_time_us: raw.read_wall_time_us, + buffer: CapturedFrameBuffer::I420(raw.frame), + acquired_at, + pipeline_started_at: acquired_at, + }), + #[cfg(target_os = "macos")] + CaptureFrame::Native(native) => Ok(SourcedFrame { + has_camera_timestamp: native.sensor_timestamp_us.is_some(), + capture_wall_time_us: native.capture_wall_time_us, + read_wall_time_us: native.read_wall_time_us, + buffer: CapturedFrameBuffer::Native(native.frame), + acquired_at, + pipeline_started_at: acquired_at, + }), + other => anyhow::bail!( + "camera capture returned an unsupported {} frame", + capture_path_name(other.capture_path()) + ), + } +} + +/// Cross-thread stop signal for a capture input blocked inside +/// [`VideoCaptureSource::next_frame`]. +#[derive(Clone)] +enum CaptureStopHandle { + /// AVFoundation wakes a blocked capture call via its stop handle. + #[cfg(target_os = "macos")] + AvFoundation(AvFoundationStopHandle), + /// The input either never blocks (test pattern) or returns at the next + /// frame boundary, where the loop observes the shutdown flag. 
+ FrameBoundary, +} + +impl CaptureStopHandle { + fn for_input(video_input: &VideoInput) -> Self { + match video_input { + #[cfg(target_os = "macos")] + VideoInput::Camera(VideoCaptureSource::AvFoundation { session, .. }) => { + Self::AvFoundation(session.stop_handle()) + } + _ => Self::FrameBoundary, + } + } + + /// Interrupts a blocked `next_frame` when the backend supports it. + fn stop(&self) { + match self { + #[cfg(target_os = "macos")] + Self::AvFoundation(handle) => handle.stop(), + Self::FrameBoundary => {} + } + } } #[derive(Clone, Copy)] struct CaptureConfig { fps: u32, + /// Read by the Argus capture loop to pick DMA-BUF vs CPU I420 publish; the + /// facade camera path bakes the zero-copy preference into the source when + /// it is opened instead. + #[cfg_attr(not(all(target_os = "linux", target_arch = "aarch64")), allow(dead_code))] + zero_copy: bool, attach_timestamp: bool, burn_timestamp: bool, attach_frame_id: bool, @@ -855,6 +1140,84 @@ fn create_i420_buffer(width: u32, height: u32, align_for_display: bool) -> I420B } } +/// Opens the platform camera through the `livekit-capture` facade +/// (AVFoundation on macOS, V4L2 on Linux). 
+fn open_camera_source(args: &Args) -> Result<(u32, u32, VideoInput)> { + #[cfg(any(target_os = "macos", target_os = "linux"))] + { + #[cfg(target_os = "macos")] + let format_request = { + if args.format != CaptureFormat::Auto { + log::warn!( + "--format={} is ignored for AVFoundation decoded capture; AVFoundation supplies decoded CVPixelBuffers", + args.format + ); + } + CaptureFormatRequest::Closest(LkCaptureFormat::new( + CaptureResolution::new(args.width, args.height), + args.fps, + CaptureFrameFormat::Nv12, + )) + }; + #[cfg(target_os = "linux")] + let format_request = { + let requested = LkCaptureFormat::new( + CaptureResolution::new(args.width, args.height), + args.fps, + args.format.preferred_frame_format(), + ); + if args.format == CaptureFormat::Auto { + CaptureFormatRequest::Closest(requested) + } else { + CaptureFormatRequest::Exact(requested) + } + }; + + // Without --zero-copy, ask for CPU-accessible frames so pixel work + // (e.g. the --burn-timestamp overlay) is possible; with --zero-copy, + // let AVFoundation deliver native platform buffers when supported. 
+ let source = VideoCaptureSource::open(CaptureSourceOptions { + backend: CaptureBackend::Auto, + device: CaptureDeviceSelector::Index(args.camera_index), + format: format_request, + prefer_raw_frames: !args.zero_copy, + ..Default::default() + })?; + let format = source + .format() + .ok_or_else(|| anyhow::anyhow!("camera source did not report a negotiated format"))?; + info!( + "Camera opened with {}: {}x{} @ {} fps (source format: {}, requested: {}, camera {})", + camera_backend_name(&source), + format.resolution.width, + format.resolution.height, + format.frame_rate, + format.frame_format, + args.format, + args.camera_index, + ); + #[cfg(target_os = "linux")] + if args.format != CaptureFormat::Auto + && format.frame_format != args.format.preferred_frame_format() + { + log::warn!( + "--format={} was requested but the camera negotiated {}; continuing with the negotiated format", + args.format, + format.frame_format, + ); + } + Ok((format.resolution.width, format.resolution.height, VideoInput::Camera(source))) + } + + #[cfg(not(any(target_os = "macos", target_os = "linux")))] + { + anyhow::bail!( + "camera capture is not supported on {}; local_video supports macOS AVFoundation and Linux V4L2", + std::env::consts::OS + ); + } +} + #[tokio::main] async fn main() -> Result<()> { env_logger::init(); @@ -885,14 +1248,17 @@ async fn run(args: Args, ctrl_c_received: Arc) -> Result<()> { // LiveKit connection details let url = args .url + .clone() .or_else(|| env::var("LIVEKIT_URL").ok()) .expect("LIVEKIT_URL must be provided via --url or env"); let api_key = args .api_key + .clone() .or_else(|| env::var("LIVEKIT_API_KEY").ok()) .expect("LIVEKIT_API_KEY must be provided via --api-key or env"); let api_secret = args .api_secret + .clone() .or_else(|| env::var("LIVEKIT_API_SECRET").ok()) .expect("LIVEKIT_API_SECRET must be provided via --api-secret or env"); @@ -976,23 +1342,17 @@ async fn run(args: Args, ctrl_c_received: Arc) -> Result<()> { SourceKind::Argus => { 
#[cfg(all(target_os = "linux", target_arch = "aarch64"))] { - if args.test_pattern { + if args.test_pattern.is_some() { anyhow::bail!("--test-pattern is not supported with --source argus"); } if args.display_video { anyhow::bail!("--display-video is not supported with --source argus"); } - if args.burn_timestamp { - log::warn!( - "--burn-timestamp is ignored with --source argus (DMA buffers are not CPU-mapped on the publish path)" - ); - } - let session = argus::ArgusCaptureSession::new( + let session = ArgusCaptureSession::new(ArgusCaptureOptions::new( args.camera_index as u32, - args.width, - args.height, + CaptureResolution::new(args.width, args.height), args.fps, - )?; + ))?; info!( "Argus MIPI capture session opened: {}x{} @ {} fps (camera {})", session.width(), @@ -1012,95 +1372,24 @@ async fn run(args: Args, ctrl_c_received: Arc) -> Result<()> { } } SourceKind::Uvc => { - if args.test_pattern { + if let Some(test_pattern) = args.test_pattern { let width = args.width; let height = args.height; let fps = args.fps; info!( - "Test pattern enabled: SMPTE 75% color bars at {}x{} @ {} fps", - width, height, fps - ); - (width, height, VideoInput::TestPattern(TestPattern::new(width, height))) - } else { - // Setup camera - let index = CameraIndex::Index(args.camera_index as u32); - let requested = RequestedFormat::new::( - RequestedFormatType::AbsoluteHighestFrameRate, - ); - let mut camera = Camera::new(index, requested)?; - - let mut requested_camera_format = None; - let mut last_request_error = None; - for frame_format in args.format.frame_formats() { - let wanted = CameraFormat::new( - Resolution::new(args.width, args.height), - *frame_format, - args.fps, - ); - match camera.set_camera_requset(RequestedFormat::new::( - RequestedFormatType::Exact(wanted), - )) { - Ok(format) => { - requested_camera_format = Some(format); - break; - } - Err(err) => { - last_request_error = Some(err); - } - } - } - if let Some(requested_camera_format) = requested_camera_format { - 
debug!("Requested nokhwa CameraFormat: {:?}", requested_camera_format); - } else if args.format == CaptureFormat::Auto { - if let Some(err) = last_request_error { - log::warn!( - "Failed to request YUYV or MJPEG at {}x{} @ {} fps; using backend-selected camera format: {}", - args.width, - args.height, - args.fps, - err - ); - } - } else { - let formats = args - .format - .frame_formats() - .iter() - .map(ToString::to_string) - .collect::>() - .join(" or "); - return Err(match last_request_error { - Some(err) => anyhow::anyhow!( - "failed to request camera format {} at {}x{} @ {} fps: {}", - formats, - args.width, - args.height, - args.fps, - err - ), - None => anyhow::anyhow!("no camera capture formats were requested"), - }); - } - camera.open_stream()?; - let fmt = camera.camera_format(); - let width = fmt.width(); - let height = fmt.height(); - let fps = fmt.frame_rate(); - let is_yuyv = fmt.format() == FrameFormat::YUYV; - info!( - "Camera opened: {}x{} @ {} fps (format: {}, requested: {})", + "Test pattern enabled: {} at {}x{} @ {} fps", + test_pattern.label(), width, height, - fps, - fmt.format(), - args.format + fps ); - debug!("Negotiated nokhwa CameraFormat: {:?}", fmt); - info!( - "Selected conversion path: {}", - if is_yuyv { "YUYV->I420 (libyuv)" } else { "Auto (RGB24 or MJPEG)" } - ); - (width, height, VideoInput::Camera { camera, is_yuyv }) + ( + width, + height, + VideoInput::TestPattern(TestPattern::new(width, height, test_pattern)), + ) + } else { + open_camera_source(&args)? 
} } }; @@ -1222,9 +1511,32 @@ async fn run(args: Args, ctrl_c_received: Arc) -> Result<()> { info!("Published camera track"); requested_codec }; + let burn_timestamp_requested = args.attach_timestamp && args.burn_timestamp; + let zero_copy_supported = publisher_zero_copy_supported(&video_input); + let zero_copy_active = args.zero_copy && zero_copy_supported; + if args.zero_copy { + if let Some(reason) = publisher_zero_copy_unsupported_reason(&video_input) { + log::warn!("--zero-copy requested, but {reason}; using CPU I420 capture"); + } + } + if zero_copy_active && burn_timestamp_requested { + log::warn!( + "--zero-copy keeps frames out of CPU memory; --burn-timestamp will not draw an overlay" + ); + } + info!( + "Publisher media path: capture={}, encode=requested codec {} via {}", + publisher_capture_path_label(&video_input, burn_timestamp_requested, args.zero_copy), + actual_codec.as_str(), + video_encoder_backend_name(requested_encoder), + ); + let zero_copy_fallback = + publisher_uses_zero_copy_camera_capture(&video_input, zero_copy_active) + .then(|| Arc::new(AtomicBool::new(false))); let capture_config = CaptureConfig { fps: args.fps, + zero_copy: zero_copy_active, attach_timestamp: args.attach_timestamp, burn_timestamp: args.burn_timestamp, attach_frame_id: args.attach_frame_id, @@ -1236,8 +1548,11 @@ async fn run(args: Args, ctrl_c_received: Arc) -> Result<()> { let user_data_channels = args.attach_user_data.then(|| Arc::new(Mutex::new([0.0f32; user_data::NUM_CHANNELS]))); - let publish_stats_task = - tokio::spawn(update_publisher_video_stats(track.clone(), ctrl_c_received.clone())); + let publish_stats_task = tokio::spawn(update_publisher_video_stats( + track.clone(), + ctrl_c_received.clone(), + zero_copy_fallback.clone(), + )); match video_input { #[cfg(all(target_os = "linux", target_arch = "aarch64"))] @@ -1280,6 +1595,7 @@ async fn run(args: Args, ctrl_c_received: Arc) -> Result<()> { Some(shared.clone()), publish_timing_state.clone(), 
user_data_channels.clone(), + zero_copy_fallback.clone(), )); let display_result = video_display::run_display( @@ -1307,6 +1623,7 @@ async fn run(args: Args, ctrl_c_received: Arc) -> Result<()> { None, publish_timing_state.clone(), user_data_channels.clone(), + zero_copy_fallback.clone(), ) .await; let _ = publish_stats_task.await; @@ -1318,7 +1635,71 @@ async fn run(args: Args, ctrl_c_received: Arc) -> Result<()> { Ok(()) } +/// Maximum number of back-to-back camera capture/convert failures tolerated +/// before the publish is aborted; isolated failures (e.g. one corrupt MJPEG +/// frame) are logged and skipped. +const MAX_CONSECUTIVE_CAPTURE_FAILURES: u32 = 30; + +/// Runs the test-pattern/camera capture loop. +/// +/// Camera backends block inside [`VideoCaptureSource::next_frame`] until a +/// frame arrives (AVFoundation parks on a condvar), so the loop body runs on a +/// dedicated blocking thread, mirroring [`run_argus_capture_loop`]. A watcher +/// task turns the shutdown flag (Ctrl-C or preview window close) into a +/// [`CaptureStopHandle::stop`] call so a blocked `next_frame` returns promptly +/// instead of hanging the process. 
async fn run_capture_loop( + config: CaptureConfig, + ctrl_c_received: Arc, + track: LocalVideoTrack, + rtc_source: NativeVideoSource, + video_input: VideoInput, + width: u32, + height: u32, + display_shared: Option>>, + publish_timing_state: Option>>, + user_data_channels: Option>>, + zero_copy_fallback: Option>, +) -> Result<()> { + let stop_handle = CaptureStopHandle::for_input(&video_input); + let stop_watcher = tokio::spawn({ + let ctrl_c_received = ctrl_c_received.clone(); + let stop_handle = stop_handle.clone(); + async move { + while !ctrl_c_received.load(Ordering::Acquire) { + tokio::time::sleep(Duration::from_millis(100)).await; + } + stop_handle.stop(); + } + }); + + let capture_result = tokio::task::spawn_blocking({ + let ctrl_c_received = ctrl_c_received.clone(); + move || { + run_capture_loop_blocking( + config, + ctrl_c_received, + track, + rtc_source, + video_input, + width, + height, + display_shared, + publish_timing_state, + user_data_channels, + zero_copy_fallback, + ) + } + }) + .await; + stop_watcher.abort(); + // Unblock the stats/overlay/display tasks when the loop exits on its own + // (e.g. after repeated capture failures) rather than via the shutdown flag. + ctrl_c_received.store(true, Ordering::Release); + capture_result? 
+} + +fn run_capture_loop_blocking( config: CaptureConfig, ctrl_c_received: Arc, track: LocalVideoTrack, @@ -1329,14 +1710,18 @@ async fn run_capture_loop( display_shared: Option>>, publish_timing_state: Option>>, user_data_channels: Option>>, + zero_copy_fallback: Option>, ) -> Result<()> { - // Pace publishing at the requested FPS (not the camera-reported FPS) to hit desired cadence let pace_fps = config.fps as f64; - // Accurate pacing using absolute schedule (no drift) - let mut ticker = tokio::time::interval(Duration::from_secs_f64(1.0 / pace_fps)); - ticker.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip); - // Align the first tick to now - ticker.tick().await; + #[cfg(target_os = "macos")] + let camera_driven_pacing = + matches!(&video_input, VideoInput::Camera(VideoCaptureSource::AvFoundation { .. })); + #[cfg(not(target_os = "macos"))] + let camera_driven_pacing = false; + let target = Duration::from_secs_f64(1.0 / pace_fps); + // Deadline-based pacing with skipped missed intervals, equivalent to the + // previous tokio interval with `MissedTickBehavior::Skip`. 
+ let mut next_frame_deadline = Instant::now() + target; let start_ts = Instant::now(); // Capture loop @@ -1345,52 +1730,58 @@ async fn run_capture_loop( let mut fps_window_frames: u64 = 0; let mut fps_window_start = Instant::now(); let mut fps_smoothed: f32 = 0.0; - let target = Duration::from_secs_f64(1.0 / pace_fps); + let burn_timestamp_requested = config.attach_timestamp && config.burn_timestamp; info!("Target frame interval: {:.2} ms", target.as_secs_f64() * 1000.0); + if camera_driven_pacing { + info!("Capture pacing: camera frame-arrival driven"); + } else { + info!("Capture pacing: application timer driven"); + } // Timing accumulators (ms) for rolling stats let mut timings = PublisherTimingSummary::default(); - let mut logged_mjpeg_fallback = false; - let mut capture_timestamp_log_state = CaptureTimestampLogState::default(); let mut frame_counter: u32 = 1; - let mut timestamp_overlay = (config.attach_timestamp && config.burn_timestamp) - .then(|| TimestampOverlay::new(width, height)); + let mut test_pattern_frame_index: u64 = 0; + let mut timestamp_overlay = + burn_timestamp_requested.then(|| TimestampOverlay::new(width, height)); let align_buffers_for_display = display_shared.is_some(); + let mut logged_camera_timestamp_source = false; + let mut logged_camera_timestamp_fallback = false; + let mut logged_zero_copy_fallback = false; + let mut consecutive_capture_failures: u32 = 0; loop { if ctrl_c_received.load(Ordering::Acquire) { break; } - // Wait until the scheduled next frame time let paced_wait_started_at = Instant::now(); - ticker.tick().await; + if !camera_driven_pacing { + if let Some(wait) = next_frame_deadline.checked_duration_since(paced_wait_started_at) { + std::thread::sleep(wait); + } + let now = Instant::now(); + next_frame_deadline += target; + while next_frame_deadline <= now { + next_frame_deadline += target; + } + } let paced_wait_finished_at = Instant::now(); - // WebRTC may queue the frame and hardware encoders may upload it 
asynchronously. - // Give each submitted frame unique backing storage so later captures cannot - // overwrite buffers that are still in-flight. - let mut frame = VideoFrame { - rotation: VideoRotation::VideoRotation0, - timestamp_us: 0, - frame_metadata: None, - buffer: create_i420_buffer(width, height, align_buffers_for_display), - }; - let (stride_y, stride_u, stride_v) = frame.buffer.strides(); - let stride_y_usize = stride_y as usize; - - let source_frame_started_at = Instant::now(); - let frame_wall_time_us = unix_time_us_now(); - let (data_y, data_u, data_v) = frame.buffer.data_mut(); - let ( - capture_wall_time_us, - read_wall_time_us, - source_frame_acquired_at, - decode_finished_at, - convert_finished_at, - used_decode_path, - record_convert_timing, - ) = match &mut video_input { + let source_frame_read_started_at = Instant::now(); + let mut sourced = match &mut video_input { VideoInput::TestPattern(pattern) => { + let frame_wall_time_us = unix_time_us_now(); + // WebRTC may queue the frame and hardware encoders may upload it asynchronously. + // Give each submitted frame unique backing storage so later captures cannot + // overwrite buffers that are still in-flight. 
+ let mut frame = VideoFrame { + rotation: VideoRotation::VideoRotation0, + timestamp_us: 0, + frame_metadata: None, + buffer: create_i420_buffer(width, height, align_buffers_for_display), + }; + let (stride_y, stride_u, stride_v) = frame.buffer.strides(); + let (data_y, data_u, data_v) = frame.buffer.data_mut(); pattern.render( data_y, stride_y as i32, @@ -1398,159 +1789,89 @@ async fn run_capture_loop( stride_u as i32, data_v, stride_v as i32, + test_pattern_frame_index, ); + test_pattern_frame_index = test_pattern_frame_index.wrapping_add(1); let frame_acquired_at = Instant::now(); - ( - frame_wall_time_us, - unix_time_us_now(), - frame_acquired_at, - frame_acquired_at, - frame_acquired_at, - false, - false, - ) + SourcedFrame { + buffer: CapturedFrameBuffer::I420(frame), + capture_wall_time_us: frame_wall_time_us, + read_wall_time_us: unix_time_us_now(), + acquired_at: frame_acquired_at, + pipeline_started_at: source_frame_read_started_at, + has_camera_timestamp: false, + } } - VideoInput::Camera { camera, is_yuyv } => { - // Capture the frame as early as possible so the attached timestamp is - // close to the camera acquisition point. - let frame_buf = camera.frame()?; - let read_wall_time_us = unix_time_us_now(); - let camera_frame_acquired_at = Instant::now(); - - // Prefer backend capture timestamps only when they are plausible Unix - // wall-clock times. Some camera APIs expose stream-relative or future - // presentation timestamps; attaching those makes latency appear negative. 
- let capture_wall_time_us = select_capture_wall_time_us( - frame_buf.capture_timestamp(), - frame_wall_time_us, - read_wall_time_us, - &mut capture_timestamp_log_state, - ); - - let (decode_finished_at, convert_finished_at, used_decode_path) = if *is_yuyv { - // Fast path for YUYV: convert directly to I420 via libyuv - let src = frame_buf.buffer(); - let src_bytes = src.as_ref(); - let src_stride = (width * 2) as i32; // YUYV packed 4:2:2 - unsafe { - // returns 0 on success - let _ = yuv_sys::rs_YUY2ToI420( - src_bytes.as_ptr(), - src_stride, - data_y.as_mut_ptr(), - stride_y as i32, - data_u.as_mut_ptr(), - stride_u as i32, - data_v.as_mut_ptr(), - stride_v as i32, - width as i32, - height as i32, - ); + VideoInput::Camera(source) => { + let force_raw_after_zero_copy_failure = zero_copy_fallback + .as_ref() + .is_some_and(|fallback| fallback.load(Ordering::Acquire)); + if force_raw_after_zero_copy_failure && !logged_zero_copy_fallback { + log::warn!( + "Publisher media path changed: capture=AVFoundation CPU I420 fallback after zero-copy encode starvation" + ); + logged_zero_copy_fallback = true; + // Switch the facade to CPU-accessible frames for the rest of the run. + #[cfg(target_os = "macos")] + if let VideoCaptureSource::AvFoundation { prefer_raw_frames, .. 
} = source { + *prefer_raw_frames = true; } - (camera_frame_acquired_at, Instant::now(), false) - } else { - // Auto path (either RGB24 already or compressed MJPEG) - let src = frame_buf.buffer(); - if src.len() == (width as usize * height as usize * 3) { - // Already RGB24 from backend; convert directly - unsafe { - let _ = yuv_sys::rs_RGB24ToI420( - src.as_ref().as_ptr(), - (width * 3) as i32, - data_y.as_mut_ptr(), - stride_y as i32, - data_u.as_mut_ptr(), - stride_u as i32, - data_v.as_mut_ptr(), - stride_v as i32, - width as i32, - height as i32, - ); + } + let captured = match source.next_frame() { + Ok(frame) => { + consecutive_capture_failures = 0; + frame + } + Err(err) => { + if ctrl_c_received.load(Ordering::Acquire) { + // `stop()` interrupted a blocked `next_frame` during shutdown. + break; } - (camera_frame_acquired_at, Instant::now(), false) - } else { - // Try fast MJPEG->I420 via libyuv if available; fallback to image crate - let mut used_fast_mjpeg = false; - let fast_mjpeg_buffer_ready_at = unsafe { - // rs_MJPGToI420 returns 0 on success - let ret = yuv_sys::rs_MJPGToI420( - src.as_ref().as_ptr(), - src.len(), - data_y.as_mut_ptr(), - stride_y as i32, - data_u.as_mut_ptr(), - stride_u as i32, - data_v.as_mut_ptr(), - stride_v as i32, - width as i32, - height as i32, - width as i32, - height as i32, - ); - if ret == 0 { - used_fast_mjpeg = true; - Instant::now() - } else { - camera_frame_acquired_at - } - }; - if used_fast_mjpeg { - (fast_mjpeg_buffer_ready_at, fast_mjpeg_buffer_ready_at, true) - } else { - // Fallback: decode MJPEG using image crate then RGB24->I420 - match image::load_from_memory(src.as_ref()) { - Ok(img_dyn) => { - let rgb8 = img_dyn.to_rgb8(); - let decode_finished_at = Instant::now(); - let dec_w = rgb8.width() as u32; - let dec_h = rgb8.height() as u32; - if dec_w != width || dec_h != height { - log::warn!( - "Decoded MJPEG size {}x{} differs from requested {}x{}; dropping frame", - dec_w, dec_h, width, height - ); - 
continue; - } - unsafe { - let _ = yuv_sys::rs_RGB24ToI420( - rgb8.as_raw().as_ptr(), - (dec_w * 3) as i32, - data_y.as_mut_ptr(), - stride_y as i32, - data_u.as_mut_ptr(), - stride_u as i32, - data_v.as_mut_ptr(), - stride_v as i32, - width as i32, - height as i32, - ); - } - (decode_finished_at, Instant::now(), true) - } - Err(e2) => { - if !logged_mjpeg_fallback { - log::error!( - "MJPEG decode failed; buffer not RGB24 and image decode failed: {}", - e2 - ); - logged_mjpeg_fallback = true; - } - continue; - } - } + consecutive_capture_failures += 1; + log::warn!( + "Camera frame capture failed ({consecutive_capture_failures} consecutive): {err}" + ); + if consecutive_capture_failures >= MAX_CONSECUTIVE_CAPTURE_FAILURES { + return Err(anyhow::Error::new(err).context(format!( + "camera capture failed {MAX_CONSECUTIVE_CAPTURE_FAILURES} times in a row" + ))); } + std::thread::sleep(Duration::from_millis( + 5 * u64::from(consecutive_capture_failures.min(20)), + )); + continue; } }; + let mut sourced = sourced_frame_from_capture(captured)?; + match &mut sourced.buffer { + CapturedFrameBuffer::I420(frame) => { + frame.rotation = VideoRotation::VideoRotation0; + } + #[cfg(target_os = "macos")] + CapturedFrameBuffer::Native(frame) => { + frame.rotation = VideoRotation::VideoRotation0; + } + } + if sourced.has_camera_timestamp { + if !logged_camera_timestamp_source { + let capture_timestamp_age_ms = + sourced.read_wall_time_us.saturating_sub(sourced.capture_wall_time_us) + as f64 + / 1000.0; + info!( + "Using camera-provided capture timestamp (age at frame read {:.2} ms)", + capture_timestamp_age_ms + ); + logged_camera_timestamp_source = true; + } + } else if !logged_camera_timestamp_fallback { + log::warn!( + "Camera-provided capture timestamp unavailable or implausible; using frame read wall clock" + ); + logged_camera_timestamp_fallback = true; + } - ( - capture_wall_time_us, - read_wall_time_us, - camera_frame_acquired_at, - decode_finished_at, - convert_finished_at, 
- used_decode_path, - true, - ) + sourced } #[cfg(all(target_os = "linux", target_arch = "aarch64"))] VideoInput::Argus(_) => { @@ -1569,43 +1890,76 @@ async fn run_capture_loop( None }; if let Some(timing_state) = publish_timing_state.as_ref() { - timing_state.lock().record_frame_buffer(capture_wall_time_us, read_wall_time_us, fid); + timing_state.lock().record_frame_buffer( + sourced.capture_wall_time_us, + sourced.read_wall_time_us, + fid, + ); } - let mut buffer_ready_at = convert_finished_at; + let mut buffer_ready_at = sourced.acquired_at; let mut frame_draw_ms = None; let mut burned_timestamp_us = None; - if let Some(overlay) = timestamp_overlay.as_mut() { - let overlay_started_at = Instant::now(); - overlay.draw(data_y, stride_y_usize, capture_wall_time_us, fid); - burned_timestamp_us = Some(capture_wall_time_us); - let overlay_finished_at = Instant::now(); - frame_draw_ms = Some((overlay_finished_at - overlay_started_at).as_secs_f64() * 1000.0); - buffer_ready_at = overlay_finished_at; + let frame_uses_zero_copy = match &sourced.buffer { + #[cfg(target_os = "macos")] + CapturedFrameBuffer::Native(_) => true, + _ => false, + }; + if !frame_uses_zero_copy { + if let Some(overlay) = timestamp_overlay.as_mut() { + let overlay_started_at = Instant::now(); + match &mut sourced.buffer { + CapturedFrameBuffer::I420(frame) => { + let (stride_y, _, _) = frame.buffer.strides(); + let (data_y, _, _) = frame.buffer.data_mut(); + overlay.draw(data_y, stride_y as usize, sourced.capture_wall_time_us, fid); + } + #[cfg(target_os = "macos")] + CapturedFrameBuffer::Native(_) => { + unreachable!("native frame was classified as zero-copy"); + } + } + burned_timestamp_us = Some(sourced.capture_wall_time_us); + let overlay_finished_at = Instant::now(); + frame_draw_ms = + Some((overlay_finished_at - overlay_started_at).as_secs_f64() * 1000.0); + buffer_ready_at = overlay_finished_at; + } } // Build frame metadata from enabled packet trailer features and local timing correlation. 
let user_ts = if config.attach_timestamp || config.display_timing { - Some(capture_wall_time_us) + Some(sourced.capture_wall_time_us) } else { None }; if burned_timestamp_us.is_some() { - debug_assert_eq!(burned_timestamp_us, Some(capture_wall_time_us)); + debug_assert_eq!(burned_timestamp_us, Some(sourced.capture_wall_time_us)); } let user_data = user_data_channels.as_ref().map(|targets| user_data::encode(&targets.lock())); - frame.frame_metadata = if user_ts.is_some() || fid.is_some() || user_data.is_some() { + let frame_metadata = if user_ts.is_some() || fid.is_some() || user_data.is_some() { Some(FrameMetadata { user_timestamp: user_ts, frame_id: fid, user_data }) } else { None }; // Monotonic, microseconds since start. - frame.timestamp_us = start_ts.elapsed().as_micros() as i64; - rtc_source.capture_frame(&frame); + let timestamp_us = start_ts.elapsed().as_micros() as i64; + match &mut sourced.buffer { + CapturedFrameBuffer::I420(frame) => { + frame.frame_metadata = frame_metadata; + frame.timestamp_us = timestamp_us; + rtc_source.capture_frame(frame); + } + #[cfg(target_os = "macos")] + CapturedFrameBuffer::Native(frame) => { + frame.frame_metadata = frame_metadata; + frame.timestamp_us = timestamp_us; + rtc_source.capture_frame(frame); + } + } let webrtc_capture_finished_at = Instant::now(); + let webrtc_capture_finished_wall_time_us = unix_time_us_now(); if let Some(shared) = display_shared.as_ref() { - let (stride_y, stride_u, stride_v) = frame.buffer.strides(); - let (data_y, data_u, data_v) = frame.buffer.data(); let timing_sample = if config.display_timing { publish_timing_state .as_ref() @@ -1613,18 +1967,42 @@ async fn run_capture_loop( } else { None }; - video_display::pack_i420_into_shared( - shared, - width, - height, - data_y, - stride_y as u32, - data_u, - stride_u as u32, - data_v, - stride_v as u32, - timing_sample, - ); + match &sourced.buffer { + CapturedFrameBuffer::I420(frame) => { + let (stride_y, stride_u, stride_v) = 
frame.buffer.strides(); + let (data_y, data_u, data_v) = frame.buffer.data(); + video_display::pack_i420_into_shared( + shared, + width, + height, + data_y, + stride_y as u32, + data_u, + stride_u as u32, + data_v, + stride_v as u32, + timing_sample, + ); + } + #[cfg(target_os = "macos")] + CapturedFrameBuffer::Native(frame) => { + let i420 = frame.buffer.to_i420(); + let (stride_y, stride_u, stride_v) = i420.strides(); + let (data_y, data_u, data_v) = i420.data(); + video_display::pack_i420_into_shared( + shared, + width, + height, + data_y, + stride_y as u32, + data_u, + stride_u as u32, + data_v, + stride_v as u32, + timing_sample, + ); + } + } } frames += 1; @@ -1650,16 +2028,20 @@ async fn run_capture_loop( .record((paced_wait_finished_at - paced_wait_started_at).as_secs_f64() * 1000.0); timings .camera_frame_read_ms - .record((source_frame_acquired_at - source_frame_started_at).as_secs_f64() * 1000.0); - if used_decode_path { + .record((sourced.acquired_at - source_frame_read_started_at).as_secs_f64() * 1000.0); + if sourced.has_camera_timestamp && sourced.read_wall_time_us >= sourced.capture_wall_time_us + { timings - .decode_mjpeg_ms - .record((decode_finished_at - source_frame_acquired_at).as_secs_f64() * 1000.0); + .capture_timestamp_age_ms + .record((sourced.read_wall_time_us - sourced.capture_wall_time_us) as f64 / 1000.0); } - if record_convert_timing { - timings - .buffer_convert_ms - .record((convert_finished_at - decode_finished_at).as_secs_f64() * 1000.0); + if sourced.has_camera_timestamp + && webrtc_capture_finished_wall_time_us >= sourced.capture_wall_time_us + { + timings.capture_timestamp_to_webrtc_ms.record( + (webrtc_capture_finished_wall_time_us - sourced.capture_wall_time_us) as f64 + / 1000.0, + ); } if let Some(frame_draw_ms) = frame_draw_ms { timings.frame_draw_ms.record(frame_draw_ms); @@ -1667,9 +2049,9 @@ async fn run_capture_loop( timings .submit_to_webrtc_ms .record((webrtc_capture_finished_at - buffer_ready_at).as_secs_f64() * 
1000.0); - timings - .capture_to_webrtc_total_ms - .record((webrtc_capture_finished_at - source_frame_started_at).as_secs_f64() * 1000.0); + timings.capture_to_webrtc_total_ms.record( + (webrtc_capture_finished_at - sourced.pipeline_started_at).as_secs_f64() * 1000.0, + ); if last_fps_log.elapsed() >= std::time::Duration::from_secs(2) { let secs = last_fps_log.elapsed().as_secs_f64(); @@ -1712,15 +2094,15 @@ async fn run_capture_loop( /// Capture loop dedicated to Jetson MIPI capture via libargus. /// /// Argus blocks inside `acquireFrame`, pacing capture itself, so this loop runs in a -/// dedicated OS thread and pushes NV12 DMA-buffer fds straight into `NativeVideoSource` -/// via [`NativeVideoSource::capture_dmabuf_frame_with_metadata`] for zero-copy hand-off -/// to the Jetson hardware encoder. +/// dedicated OS thread. With `--zero-copy`, the path pushes NV12 DMA-buffer fds +/// straight into [`NativeVideoSource::capture_dmabuf_frame_with_metadata`] for +/// hand-off to the Jetson hardware encoder; otherwise it copies to CPU I420. 
#[cfg(all(target_os = "linux", target_arch = "aarch64"))] async fn run_argus_capture_loop( config: CaptureConfig, ctrl_c_received: Arc, rtc_source: NativeVideoSource, - session: argus::ArgusCaptureSession, + session: ArgusCaptureSession, width: u32, height: u32, user_data_channels: Option>>, @@ -1728,13 +2110,23 @@ async fn run_argus_capture_loop( let capture_handle = std::thread::Builder::new() .name("mipi-capture".into()) .spawn(move || -> Result<()> { + enum CapturedArgusFrame { + DmaBuf(argus::ArgusFrame), + I420(argus::ArgusI420Frame), + } + let mut session = session; - let start_ts = Instant::now(); + let burn_timestamp_requested = config.attach_timestamp && config.burn_timestamp; + let burn_timestamp_active = burn_timestamp_requested && !config.zero_copy; + let mut timestamp_overlay = + burn_timestamp_active.then(|| TimestampOverlay::new(width, height)); let mut frames: u64 = 0; let mut last_fps_log = Instant::now(); let mut sum_acquire_ms = 0.0; let mut sum_argus_wait_ms = 0.0; let mut sum_argus_blit_ms = 0.0; + let mut sum_argus_i420_copy_ms = 0.0; + let mut sum_timestamp_burn_ms = 0.0; let mut sum_capture_ms = 0.0; let mut sum_iter_ms = 0.0; let mut consecutive_failures: u32 = 0; @@ -1746,6 +2138,11 @@ async fn run_argus_capture_loop( let mut backup_timestamp_frames: u64 = 0; let mut sum_sensor_to_acquire_ms = 0.0; let mut sum_sensor_to_argus_acquire_ms = 0.0; + if burn_timestamp_active { + info!( + "Argus timestamp burn enabled: copying NV12 DMA-BUF frames to CPU I420 before publish" + ); + } loop { if ctrl_c_received.load(Ordering::Acquire) { @@ -1754,7 +2151,12 @@ async fn run_argus_capture_loop( let iter_start = Instant::now(); let acquire_started_at = Instant::now(); - let argus_frame = match session.acquire_frame() { + let capture_result = if config.zero_copy { + session.capture_frame().map(CapturedArgusFrame::DmaBuf) + } else { + session.capture_i420_frame().map(CapturedArgusFrame::I420) + }; + let captured_frame = match capture_result { Ok(frame) 
=> { consecutive_failures = 0; frame @@ -1775,6 +2177,12 @@ async fn run_argus_capture_loop( } }; let acquire_finished_at = Instant::now(); + let argus_frame = match &captured_frame { + CapturedArgusFrame::DmaBuf(frame) => frame, + CapturedArgusFrame::I420(frame) => &frame.dmabuf, + }; + let argus_wait_ms = argus_frame.acquire_wait_ns as f64 / 1_000_000.0; + let argus_blit_ms = argus_frame.blit_ns as f64 / 1_000_000.0; let fallback_wall_time_us = if config.attach_timestamp { unix_time_us_now() } else { 0 }; @@ -1819,14 +2227,12 @@ async fn run_argus_capture_loop( if config.attach_timestamp { if timestamp_from_sensor { sensor_timestamp_frames += 1; - let sensor_to_acquire_ms = fallback_wall_time_us - .saturating_sub(capture_wall_time_us) - as f64 - / 1_000.0; - let blit_ms = argus_frame.blit_ns as f64 / 1_000_000.0; + let sensor_to_acquire_ms = + fallback_wall_time_us.saturating_sub(capture_wall_time_us) as f64 + / 1_000.0; sum_sensor_to_acquire_ms += sensor_to_acquire_ms; sum_sensor_to_argus_acquire_ms += - (sensor_to_acquire_ms - blit_ms).max(0.0); + (sensor_to_acquire_ms - argus_blit_ms).max(0.0); } else { backup_timestamp_frames += 1; } @@ -1848,20 +2254,45 @@ async fn run_argus_capture_loop( None }; - rtc_source.capture_dmabuf_frame_with_metadata( - argus_frame.dmabuf_fd, - width, - height, - 0, // NV12 - start_ts.elapsed().as_micros() as i64, - frame_metadata, - ); + match captured_frame { + CapturedArgusFrame::DmaBuf(argus_frame) => { + let plane = argus_frame + .dmabuf + .planes + .first() + .ok_or_else(|| anyhow::anyhow!("Argus DMA-BUF frame missing plane"))?; + rtc_source.capture_dmabuf_frame_with_metadata( + plane.fd, + argus_frame.dmabuf.width, + argus_frame.dmabuf.height, + 0, // NV12 + argus_frame.dmabuf.timestamp_us, + frame_metadata, + ); + } + CapturedArgusFrame::I420(mut argus_i420_frame) => { + if let Some(overlay) = timestamp_overlay.as_mut() { + let overlay_started_at = Instant::now(); + let (stride_y, _, _) = 
argus_i420_frame.frame.buffer.strides(); + let (data_y, _, _) = argus_i420_frame.frame.buffer.data_mut(); + overlay.draw(data_y, stride_y as usize, capture_wall_time_us, fid); + sum_timestamp_burn_ms += + overlay_started_at.elapsed().as_secs_f64() * 1000.0; + } + sum_argus_i420_copy_ms += + argus_i420_frame.copy_to_i420_ns as f64 / 1_000_000.0; + argus_i420_frame.frame.frame_metadata = frame_metadata; + argus_i420_frame.frame.timestamp_us = + argus_i420_frame.dmabuf.dmabuf.timestamp_us; + rtc_source.capture_frame(&argus_i420_frame.frame); + } + } let capture_finished_at = Instant::now(); frames += 1; sum_acquire_ms += (acquire_finished_at - acquire_started_at).as_secs_f64() * 1000.0; - sum_argus_wait_ms += argus_frame.acquire_wait_ns as f64 / 1_000_000.0; - sum_argus_blit_ms += argus_frame.blit_ns as f64 / 1_000_000.0; + sum_argus_wait_ms += argus_wait_ms; + sum_argus_blit_ms += argus_blit_ms; sum_capture_ms += (capture_finished_at - acquire_finished_at).as_secs_f64() * 1000.0; sum_iter_ms += (Instant::now() - iter_start).as_secs_f64() * 1000.0; @@ -1881,21 +2312,41 @@ async fn run_argus_capture_loop( } else { 0.0 }; - info!( - "MIPI publishing: {}x{}, ~{:.1} fps | packet trailer timestamp source: sensor {} frames, backup system {} frames | avg ms: sensor_to_argus_acquire {:.2}, argus_wait {:.2}, argus_blit {:.2}, sensor_to_acquire {:.2}, acquire {:.2}, capture {:.2}, iter {:.2}", - width, - height, - fps_est, - sensor_timestamp_frames, - backup_timestamp_frames, - sensor_to_argus_acquire_ms, - sum_argus_wait_ms / n, - sum_argus_blit_ms / n, - sensor_age_ms, - sum_acquire_ms / n, - sum_capture_ms / n, - sum_iter_ms / n, - ); + if burn_timestamp_active { + info!( + "MIPI publishing: {}x{}, ~{:.1} fps | packet trailer timestamp source: sensor {} frames, backup system {} frames | avg ms: sensor_to_argus_acquire {:.2}, argus_wait {:.2}, argus_blit {:.2}, argus_i420_copy {:.2}, timestamp_burn {:.2}, sensor_to_acquire {:.2}, acquire {:.2}, capture {:.2}, iter {:.2}", + 
width, + height, + fps_est, + sensor_timestamp_frames, + backup_timestamp_frames, + sensor_to_argus_acquire_ms, + sum_argus_wait_ms / n, + sum_argus_blit_ms / n, + sum_argus_i420_copy_ms / n, + sum_timestamp_burn_ms / n, + sensor_age_ms, + sum_acquire_ms / n, + sum_capture_ms / n, + sum_iter_ms / n, + ); + } else { + info!( + "MIPI publishing: {}x{}, ~{:.1} fps | packet trailer timestamp source: sensor {} frames, backup system {} frames | avg ms: sensor_to_argus_acquire {:.2}, argus_wait {:.2}, argus_blit {:.2}, sensor_to_acquire {:.2}, acquire {:.2}, capture {:.2}, iter {:.2}", + width, + height, + fps_est, + sensor_timestamp_frames, + backup_timestamp_frames, + sensor_to_argus_acquire_ms, + sum_argus_wait_ms / n, + sum_argus_blit_ms / n, + sensor_age_ms, + sum_acquire_ms / n, + sum_capture_ms / n, + sum_iter_ms / n, + ); + } } else { info!( "MIPI publishing: {}x{}, ~{:.1} fps | packet trailer timestamp: disabled | avg ms: argus_wait {:.2}, argus_blit {:.2}, acquire {:.2}, capture {:.2}, iter {:.2}", @@ -1915,6 +2366,8 @@ async fn run_argus_capture_loop( sum_acquire_ms = 0.0; sum_argus_wait_ms = 0.0; sum_argus_blit_ms = 0.0; + sum_argus_i420_copy_ms = 0.0; + sum_timestamp_burn_ms = 0.0; sum_capture_ms = 0.0; sum_iter_ms = 0.0; sum_sensor_to_acquire_ms = 0.0; diff --git a/examples/local_video/src/subscriber.rs b/examples/local_video/src/subscriber.rs index 2aa102464..76f5c7175 100644 --- a/examples/local_video/src/subscriber.rs +++ b/examples/local_video/src/subscriber.rs @@ -698,6 +698,22 @@ fn log_video_decode_health(stats: &[livekit::webrtc::stats::RtcStats]) { inbound.inbound.total_decode_time, inbound.inbound.decoder_implementation, ); + info!( + "RTP receive health: packets={}, lost={}, discarded={}, jitter={:.1}ms, nacks={}, plis={}, firs={}, rtx_packets={}, rtx_bytes={}, freezes={} ({:.3}s), pauses={} ({:.3}s)", + inbound.received.packets_received, + inbound.received.packets_lost, + inbound.inbound.packets_discarded, + inbound.received.jitter * 1000.0, + 
inbound.inbound.nack_count, + inbound.inbound.pli_count, + inbound.inbound.fir_count, + inbound.inbound.retransmitted_packets_received, + inbound.inbound.retransmitted_bytes_received, + inbound.inbound.freeze_count, + inbound.inbound.total_freeze_duration, + inbound.inbound.pause_count, + inbound.inbound.total_pause_duration, + ); if inbound.inbound.frames_received > 0 && inbound.inbound.frames_decoded == 0 { log::warn!( @@ -767,6 +783,10 @@ fn update_receive_bitrate_from_stats( } } +fn stats_poll_interval() -> Duration { + Duration::from_secs(10) +} + struct TimestampAnchor { unix_timestamp_us: u64, instant: Instant, @@ -1026,7 +1046,7 @@ async fn handle_track_subscribed( let mut receive_bitrate_snapshot = None; let mut last_jitter_buffer_log = Instant::now().checked_sub(Duration::from_secs(5)).unwrap_or_else(Instant::now); - let mut interval = tokio::time::interval(Duration::from_secs(1)); + let mut interval = tokio::time::interval(stats_poll_interval()); interval.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip); loop { @@ -1329,8 +1349,6 @@ impl eframe::App for VideoApp { ); egui::CentralPanel::default().frame(egui::Frame::NONE).show(root_ui, |ui| { - ui.ctx().request_repaint(); - // Let the native window follow live resize, and letterbox the video instead of // programmatically resizing the window while the user is dragging it. let size = diff --git a/examples/local_video/src/test_pattern.rs b/examples/local_video/src/test_pattern.rs index d9c689cda..34a64e4d9 100644 --- a/examples/local_video/src/test_pattern.rs +++ b/examples/local_video/src/test_pattern.rs @@ -1,9 +1,54 @@ -/// Generates a static SMPTE-style 75% color-bar pattern in I420 format. -pub struct TestPattern { +/// Selects the generated test pattern. +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub(super) enum TestPatternKind { + /// Static SMPTE-style 75% color bars. + StaticColorBars, + /// Animated motion graphic for exercising video encoders. 
+ AnimatedGraphic, +} + +/// Returned when a numeric test pattern selector is unsupported. +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub(super) struct UnsupportedTestPatternKind; + +impl TryFrom for TestPatternKind { + type Error = UnsupportedTestPatternKind; + + fn try_from(value: u8) -> Result { + match value { + 0 => Ok(Self::StaticColorBars), + 1 => Ok(Self::AnimatedGraphic), + _ => Err(UnsupportedTestPatternKind), + } + } +} + +impl TestPatternKind { + /// Returns a short label for logs and help text. + pub(super) fn label(self) -> &'static str { + match self { + Self::StaticColorBars => "SMPTE 75% color bars", + Self::AnimatedGraphic => "animated encoder exercise graphic", + } + } +} + +/// Generates a test pattern in I420 format. +pub(super) struct TestPattern { width: usize, height: usize, chroma_width: usize, chroma_height: usize, + frames: TestPatternFrames, +} + +enum TestPatternFrames { + Static(I420Frame), + AnimatedCached(Vec), + AnimatedDynamic, +} + +struct I420Frame { y_plane: Vec, u_plane: Vec, v_plane: Vec, @@ -16,6 +61,10 @@ struct I420Color { v: u8, } +const ANIMATED_CACHE_TARGET_FRAMES: usize = 60; +const ANIMATED_CACHE_MIN_FRAMES: usize = 2; +const ANIMATED_CACHE_MAX_BYTES: usize = 128 * 1024 * 1024; + const BARS: [I420Color; 7] = [ rgb_to_i420(191, 191, 191), // white rgb_to_i420(191, 191, 0), // yellow @@ -26,38 +75,52 @@ const BARS: [I420Color; 7] = [ rgb_to_i420(0, 0, 191), // blue ]; +const ANIMATED_PALETTE: [I420Color; 6] = [ + rgb_to_i420(235, 64, 32), + rgb_to_i420(64, 224, 72), + rgb_to_i420(48, 128, 255), + rgb_to_i420(245, 220, 64), + rgb_to_i420(224, 72, 220), + rgb_to_i420(64, 224, 224), +]; + impl TestPattern { - /// Precompute a static SMPTE-style 75% color-bar pattern for the requested resolution. - pub fn new(width: u32, height: u32) -> Self { + /// Precompute the reusable planes for the requested pattern and resolution. 
+ pub(super) fn new(width: u32, height: u32, kind: TestPatternKind) -> Self { let width = width as usize; let height = height as usize; let chroma_width = width.div_ceil(2); let chroma_height = height.div_ceil(2); - let mut y_plane = vec![0; width * height]; - let mut u_plane = vec![128; chroma_width * chroma_height]; - let mut v_plane = vec![128; chroma_width * chroma_height]; - - for row in 0..height { - let row_start = row * width; - for col in 0..width { - y_plane[row_start + col] = color_for_luma_column(col, width).y; - } - } - for row in 0..chroma_height { - let row_start = row * chroma_width; - for col in 0..chroma_width { - let color = color_for_luma_column(col * 2, width); - u_plane[row_start + col] = color.u; - v_plane[row_start + col] = color.v; + let frames = match kind { + TestPatternKind::StaticColorBars => TestPatternFrames::Static(color_bars_frame( + width, + height, + chroma_width, + chroma_height, + )), + TestPatternKind::AnimatedGraphic => { + if let Some(frame_count) = + cached_animation_frame_count(width, height, chroma_width, chroma_height) + { + TestPatternFrames::AnimatedCached(animated_frames( + width, + height, + chroma_width, + chroma_height, + frame_count, + )) + } else { + TestPatternFrames::AnimatedDynamic + } } - } + }; - Self { width, height, chroma_width, chroma_height, y_plane, u_plane, v_plane } + Self { width, height, chroma_width, chroma_height, frames } } - /// Copy the precomputed pattern into the provided I420 destination planes. - pub fn render( + /// Render the selected pattern into the provided I420 destination planes. 
+ pub(super) fn render( &self, data_y: &mut [u8], stride_y: i32, @@ -65,11 +128,160 @@ impl TestPattern { stride_u: i32, data_v: &mut [u8], stride_v: i32, + frame_index: u64, ) { - copy_plane(data_y, stride_y as usize, &self.y_plane, self.width, self.height); - copy_plane(data_u, stride_u as usize, &self.u_plane, self.chroma_width, self.chroma_height); - copy_plane(data_v, stride_v as usize, &self.v_plane, self.chroma_width, self.chroma_height); + match &self.frames { + TestPatternFrames::Static(frame) => { + frame.copy_to( + data_y, + stride_y as usize, + data_u, + stride_u as usize, + data_v, + stride_v as usize, + self.width, + self.height, + self.chroma_width, + self.chroma_height, + ); + } + TestPatternFrames::AnimatedCached(frames) => { + let frame = &frames[(frame_index % frames.len() as u64) as usize]; + frame.copy_to( + data_y, + stride_y as usize, + data_u, + stride_u as usize, + data_v, + stride_v as usize, + self.width, + self.height, + self.chroma_width, + self.chroma_height, + ); + } + TestPatternFrames::AnimatedDynamic => { + render_animated_pattern( + data_y, + stride_y as usize, + data_u, + stride_u as usize, + data_v, + stride_v as usize, + self.width, + self.height, + self.chroma_width, + self.chroma_height, + frame_index, + ); + } + } + } +} + +impl I420Frame { + fn new(width: usize, height: usize, chroma_width: usize, chroma_height: usize) -> Self { + Self { + y_plane: vec![0; width * height], + u_plane: vec![128; chroma_width * chroma_height], + v_plane: vec![128; chroma_width * chroma_height], + } + } + + fn copy_to( + &self, + data_y: &mut [u8], + stride_y: usize, + data_u: &mut [u8], + stride_u: usize, + data_v: &mut [u8], + stride_v: usize, + width: usize, + height: usize, + chroma_width: usize, + chroma_height: usize, + ) { + copy_plane(data_y, stride_y, &self.y_plane, width, height); + copy_plane(data_u, stride_u, &self.u_plane, chroma_width, chroma_height); + copy_plane(data_v, stride_v, &self.v_plane, chroma_width, chroma_height); + } 
+} + +fn color_bars_frame( + width: usize, + height: usize, + chroma_width: usize, + chroma_height: usize, +) -> I420Frame { + let mut frame = I420Frame::new(width, height, chroma_width, chroma_height); + + for row in 0..height { + let row_start = row * width; + for col in 0..width { + frame.y_plane[row_start + col] = color_for_luma_column(col, width).y; + } + } + + for row in 0..chroma_height { + let row_start = row * chroma_width; + for col in 0..chroma_width { + let color = color_for_luma_column(col * 2, width); + frame.u_plane[row_start + col] = color.u; + frame.v_plane[row_start + col] = color.v; + } + } + + frame +} + +fn cached_animation_frame_count( + width: usize, + height: usize, + chroma_width: usize, + chroma_height: usize, +) -> Option { + let bytes_per_frame = i420_frame_len(width, height, chroma_width, chroma_height); + if bytes_per_frame == 0 { + return Some(1); } + + let max_frames = ANIMATED_CACHE_MAX_BYTES / bytes_per_frame; + (max_frames >= ANIMATED_CACHE_MIN_FRAMES) + .then_some(max_frames.min(ANIMATED_CACHE_TARGET_FRAMES)) +} + +fn animated_frames( + width: usize, + height: usize, + chroma_width: usize, + chroma_height: usize, + frame_count: usize, +) -> Vec { + (0..frame_count) + .map(|frame_index| { + let mut frame = I420Frame::new(width, height, chroma_width, chroma_height); + render_animated_pattern( + &mut frame.y_plane, + width, + &mut frame.u_plane, + chroma_width, + &mut frame.v_plane, + chroma_width, + width, + height, + chroma_width, + chroma_height, + frame_index as u64, + ); + frame + }) + .collect() +} + +fn i420_frame_len(width: usize, height: usize, chroma_width: usize, chroma_height: usize) -> usize { + width + .saturating_mul(height) + .saturating_add(chroma_width.saturating_mul(chroma_height).saturating_mul(2)) } const fn rgb_to_i420(r: u8, g: u8, b: u8) -> I420Color { @@ -102,6 +314,92 @@ fn color_for_luma_column(col: usize, width: usize) -> I420Color { BARS[bar.min(BARS.len() - 1)] } +fn render_animated_pattern( + data_y: 
&mut [u8], + stride_y: usize, + data_u: &mut [u8], + stride_u: usize, + data_v: &mut [u8], + stride_v: usize, + width: usize, + height: usize, + chroma_width: usize, + chroma_height: usize, + frame_index: u64, +) { + if width == 0 || height == 0 { + return; + } + + let frame = frame_index as usize; + let tile = (width.min(height) / 10).clamp(16, 96); + let sweep_x = frame.wrapping_mul(7) % width; + let sweep_y = frame.wrapping_mul(5) % height; + let box_w = (width / 5).clamp(32, 256).min(width); + let box_h = (height / 4).clamp(24, 192).min(height); + let box_x = bouncing_offset(frame.wrapping_mul(9), width.saturating_sub(box_w)); + let box_y = bouncing_offset(frame.wrapping_mul(5), height.saturating_sub(box_h)); + + for row in 0..height { + let dst_start = row * stride_y; + for col in 0..width { + let shifted_x = col.wrapping_add(sweep_x); + let shifted_y = row.wrapping_add(sweep_y); + let checker = ((shifted_x / tile) ^ (shifted_y / tile)) & 1; + let ramp = if width > 1 { (col * 144) / (width - 1) } else { 0 }; + let diagonal = (col.wrapping_add(row).wrapping_add(frame.wrapping_mul(11)) % tile) < 3; + let mut luma = 42 + ramp as i32 + (checker as i32 * 42); + + if diagonal { + luma += 64; + } + if in_box(col, row, box_x, box_y, box_w, box_h) { + luma = if ((col / 8) ^ (row / 8) ^ (frame / 2)) & 1 == 0 { 235 } else { 24 }; + } + + data_y[dst_start + col] = clamp_to_u8(luma); + } + } + + for row in 0..chroma_height { + let dst_u_start = row * stride_u; + let dst_v_start = row * stride_v; + for col in 0..chroma_width { + let luma_col = col * 2; + let luma_row = row * 2; + let color = if in_box(luma_col, luma_row, box_x, box_y, box_w, box_h) { + ANIMATED_PALETTE[(frame / 4) % ANIMATED_PALETTE.len()] + } else { + let palette_index = ((luma_col.wrapping_add(sweep_x) / tile) + + (luma_row.wrapping_add(sweep_y) / tile) + + (frame / 12)) + % ANIMATED_PALETTE.len(); + ANIMATED_PALETTE[palette_index] + }; + data_u[dst_u_start + col] = color.u; + data_v[dst_v_start + col] = 
color.v; + } + } +} + +fn bouncing_offset(position: usize, travel: usize) -> usize { + if travel == 0 { + return 0; + } + + let period = travel.saturating_mul(2); + let phase = position % period; + if phase <= travel { + phase + } else { + period - phase + } +} + +fn in_box(col: usize, row: usize, box_x: usize, box_y: usize, box_w: usize, box_h: usize) -> bool { + (box_x..box_x + box_w).contains(&col) && (box_y..box_y + box_h).contains(&row) +} + fn copy_plane(dst: &mut [u8], dst_stride: usize, src: &[u8], width: usize, height: usize) { if width == 0 || height == 0 { return; @@ -119,3 +417,65 @@ fn copy_plane(dst: &mut [u8], dst_stride: usize, src: &[u8], width: usize, heigh dst[dst_start..dst_start + width].copy_from_slice(&src[src_start..src_start + width]); } } + +#[cfg(test)] +mod tests { + use super::*; + + fn render_frame(kind: TestPatternKind, frame_index: u64) -> (Vec, Vec, Vec) { + let pattern = TestPattern::new(64, 36, kind); + let mut y = vec![0; 64 * 36]; + let mut u = vec![0; 32 * 18]; + let mut v = vec![0; 32 * 18]; + pattern.render(&mut y, 64, &mut u, 32, &mut v, 32, frame_index); + (y, u, v) + } + + #[test] + fn test_pattern_kind_accepts_supported_numeric_selectors() { + assert_eq!(TestPatternKind::try_from(0), Ok(TestPatternKind::StaticColorBars)); + assert_eq!(TestPatternKind::try_from(1), Ok(TestPatternKind::AnimatedGraphic)); + assert_eq!(TestPatternKind::try_from(2), Err(UnsupportedTestPatternKind)); + } + + #[test] + fn animated_graphic_uses_cached_frames_when_memory_allows() { + let pattern = TestPattern::new(64, 36, TestPatternKind::AnimatedGraphic); + + let TestPatternFrames::AnimatedCached(frames) = pattern.frames else { + panic!("small animated pattern should use cached frames"); + }; + assert_eq!(frames.len(), ANIMATED_CACHE_TARGET_FRAMES); + } + + #[test] + fn animated_cache_is_bounded_by_memory_budget() { + let frame_count = cached_animation_frame_count(1920, 1080, 960, 540) + .expect("1080p should still cache multiple frames"); + + 
assert!(frame_count >= ANIMATED_CACHE_MIN_FRAMES); + assert!(frame_count < ANIMATED_CACHE_TARGET_FRAMES); + assert!(frame_count * i420_frame_len(1920, 1080, 960, 540) <= ANIMATED_CACHE_MAX_BYTES); + } + + #[test] + fn very_large_animated_patterns_fall_back_to_dynamic_rendering() { + assert_eq!(cached_animation_frame_count(16_384, 9_216, 8_192, 4_608), None); + } + + #[test] + fn static_color_bars_do_not_change_between_frames() { + assert_eq!( + render_frame(TestPatternKind::StaticColorBars, 0), + render_frame(TestPatternKind::StaticColorBars, 24) + ); + } + + #[test] + fn animated_graphic_changes_between_frames() { + assert_ne!( + render_frame(TestPatternKind::AnimatedGraphic, 0), + render_frame(TestPatternKind::AnimatedGraphic, 24) + ); + } +} diff --git a/examples/local_video/src/video_display.rs b/examples/local_video/src/video_display.rs index 4ce1b6ae2..55c937c1f 100644 --- a/examples/local_video/src/video_display.rs +++ b/examples/local_video/src/video_display.rs @@ -661,8 +661,6 @@ impl eframe::App for VideoApp { let channel_values = self.channels.as_ref().map(|targets| drive_channels(&ctx, targets)); egui::CentralPanel::default().frame(egui::Frame::NONE).show(root_ui, |ui| { - ui.ctx().request_repaint(); - let size = viewport_aspect::fitted_video_size(ui.available_size(), self.viewport.aspect()); diff --git a/examples/local_video/src/viewport_aspect.rs b/examples/local_video/src/viewport_aspect.rs index 98551e655..67544de29 100644 --- a/examples/local_video/src/viewport_aspect.rs +++ b/examples/local_video/src/viewport_aspect.rs @@ -53,7 +53,7 @@ pub(crate) fn native_options(initial_aspect: Option) -> eframe::NativeOptio let mut wgpu_options = egui_wgpu_backend::WgpuConfiguration::default(); #[cfg(target_os = "macos")] { - wgpu_options.surface.present_mode = wgpu::PresentMode::Immediate; + wgpu_options.surface.present_mode = wgpu::PresentMode::AutoVsync; } #[cfg(not(target_os = "macos"))] { diff --git a/examples/preencode_publish/Cargo.toml 
b/examples/preencode_publish/Cargo.toml new file mode 100644 index 000000000..af740f6ed --- /dev/null +++ b/examples/preencode_publish/Cargo.toml @@ -0,0 +1,20 @@ +[package] +name = "preencode_publish" +version = "0.1.0" +edition.workspace = true +publish = false + +[features] +default = [] +gstreamer = ["dep:gstreamer", "livekit-capture/gstreamer"] + +[dependencies] +anyhow = { workspace = true } +clap = { workspace = true, features = ["derive", "env"] } +env_logger = { workspace = true } +gstreamer = { workspace = true, optional = true } +livekit = { workspace = true, features = ["rustls-tls-native-roots"] } +livekit-api = { workspace = true, features = ["rustls-tls-native-roots"] } +livekit-capture = { workspace = true, features = ["rtsp", "tcpsink"] } +log = { workspace = true } +tokio = { workspace = true, features = ["full"] } diff --git a/examples/preencode_publish/scripts/gst-test-source-common.sh b/examples/preencode_publish/scripts/gst-test-source-common.sh new file mode 100755 index 000000000..b651f8a0c --- /dev/null +++ b/examples/preencode_publish/scripts/gst-test-source-common.sh @@ -0,0 +1,194 @@ +#!/usr/bin/env bash + +set -euo pipefail + +GST_LAUNCH=${GST_LAUNCH:-gst-launch-1.0} +GST_RTSP_TEST_LAUNCH=${GST_RTSP_TEST_LAUNCH:-test-launch} + +WIDTH=1280 +HEIGHT=720 +FPS=30 +BITRATE_KBPS=2500 +PRINT_ONLY=0 +GST_COMMON_SHIFT=0 + +gst_error() { + echo "error: $*" >&2 + exit 1 +} + +gst_require_command() { + if ! 
command -v "$1" >/dev/null 2>&1; then + gst_error "required command not found: $1" + fi +} + +gst_validate_positive_int() { + local name=$1 + local value=$2 + + case "$value" in + ''|*[!0-9]*) + gst_error "$name must be a positive integer, got '$value'" + ;; + esac + + if [ "$value" -eq 0 ]; then + gst_error "$name must be greater than zero" + fi +} + +gst_normalize_codec() { + local codec + codec=$(printf '%s' "$1" | tr '[:upper:]' '[:lower:]') + + case "$codec" in + h264|h265|vp8|vp9|av1) + printf '%s\n' "$codec" + ;; + *) + return 1 + ;; + esac +} + +gst_parse_common_option() { + GST_COMMON_SHIFT=0 + + case "$1" in + --width) + [ "$#" -ge 2 ] || gst_error "--width requires a value" + WIDTH=$2 + GST_COMMON_SHIFT=2 + ;; + --height) + [ "$#" -ge 2 ] || gst_error "--height requires a value" + HEIGHT=$2 + GST_COMMON_SHIFT=2 + ;; + --fps) + [ "$#" -ge 2 ] || gst_error "--fps requires a value" + FPS=$2 + GST_COMMON_SHIFT=2 + ;; + --bitrate-kbps) + [ "$#" -ge 2 ] || gst_error "--bitrate-kbps requires a value" + BITRATE_KBPS=$2 + GST_COMMON_SHIFT=2 + ;; + --print) + PRINT_ONLY=1 + GST_COMMON_SHIFT=1 + ;; + *) + gst_error "unknown common option: $1" + ;; + esac +} + +gst_validate_common_options() { + gst_validate_positive_int "--width" "$WIDTH" + gst_validate_positive_int "--height" "$HEIGHT" + gst_validate_positive_int "--fps" "$FPS" + gst_validate_positive_int "--bitrate-kbps" "$BITRATE_KBPS" +} + +gst_animated_video_source() { + printf 'videotestsrc is-live=true do-timestamp=true pattern=ball motion=wavy animation-mode=frames ! video/x-raw,width=%s,height=%s,framerate=%s/1 ! timeoverlay halignment=right valignment=bottom shaded-background=true ! videoconvert ! video/x-raw,format=I420 ! 
queue' \ + "$WIDTH" "$HEIGHT" "$FPS" +} + +gst_encoded_access_unit_pipeline() { + local codec=$1 + local key_int_max=$FPS + + case "$codec" in + h264|h265) + gst_h26x_annex_b_pipeline "$codec" + ;; + vp8) + printf 'vp8enc deadline=1 cpu-used=8 keyframe-max-dist=%s lag-in-frames=0 target-bitrate=%s000 ! video/x-vp8' \ + "$key_int_max" "$BITRATE_KBPS" + ;; + vp9) + printf 'vp9enc deadline=1 cpu-used=8 keyframe-max-dist=%s lag-in-frames=0 target-bitrate=%s000 ! video/x-vp9,profile=(string)0' \ + "$key_int_max" "$BITRATE_KBPS" + ;; + av1) + printf 'av1enc cpu-used=8 usage-profile=realtime keyframe-max-dist=%s lag-in-frames=0 target-bitrate=%s ! av1parse ! video/x-av1,stream-format=obu-stream,alignment=tu' \ + "$key_int_max" "$BITRATE_KBPS" + ;; + *) + gst_error "unsupported codec: $codec" + ;; + esac +} + +gst_h26x_annex_b_pipeline() { + local codec=$1 + local key_int_max=$FPS + + case "$codec" in + h264) + printf 'x264enc tune=zerolatency speed-preset=ultrafast key-int-max=%s bitrate=%s byte-stream=true aud=true ! h264parse config-interval=-1 ! video/x-h264,stream-format=byte-stream,alignment=au' \ + "$key_int_max" "$BITRATE_KBPS" + ;; + h265) + printf 'x265enc tune=zerolatency speed-preset=ultrafast key-int-max=%s bitrate=%s ! h265parse config-interval=-1 ! 
video/x-h265,stream-format=byte-stream,alignment=au' \ + "$key_int_max" "$BITRATE_KBPS" + ;; + *) + gst_error "unsupported codec: $codec" + ;; + esac +} + +gst_rtp_payloader_pipeline() { + case "$1" in + h264) + printf 'rtph264pay name=pay0 pt=96 config-interval=1' + ;; + h265) + printf 'rtph265pay name=pay0 pt=96 config-interval=1' + ;; + vp8) + printf 'rtpvp8pay name=pay0 pt=96' + ;; + vp9) + printf 'rtpvp9pay name=pay0 pt=96' + ;; + av1) + printf 'rtpav1pay name=pay0 pt=96' + ;; + *) + gst_error "unsupported codec: $1" + ;; + esac +} + +gst_run_launch_line() { + local pipeline=$1 + + if [ "$PRINT_ONLY" -eq 1 ]; then + printf 'pipeline=%q\n%q -e $pipeline\n' "$pipeline" "$GST_LAUNCH" + return + fi + + gst_require_command "$GST_LAUNCH" + # Intentionally split the launch line into gst-launch arguments. + # The line is assembled from validated flags and fixed pipeline fragments. + exec "$GST_LAUNCH" -e $pipeline +} + +gst_run_rtsp_launch_line() { + local port=$1 + local pipeline=$2 + + if [ "$PRINT_ONLY" -eq 1 ]; then + printf '%q -p %q %q\n' "$GST_RTSP_TEST_LAUNCH" "$port" "$pipeline" + return + fi + + gst_require_command "$GST_RTSP_TEST_LAUNCH" + exec "$GST_RTSP_TEST_LAUNCH" -p "$port" "$pipeline" +} diff --git a/examples/preencode_publish/scripts/run-rtsp-test-source.sh b/examples/preencode_publish/scripts/run-rtsp-test-source.sh new file mode 100755 index 000000000..82c3568ed --- /dev/null +++ b/examples/preencode_publish/scripts/run-rtsp-test-source.sh @@ -0,0 +1,68 @@ +#!/usr/bin/env bash + +set -euo pipefail + +SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=examples/preencode_publish/scripts/gst-test-source-common.sh +source "$SCRIPT_DIR/gst-test-source-common.sh" + +PORT=8554 +CODEC= + +usage() { + cat <<'USAGE' +Usage: run-rtsp-test-source.sh --codec h264|h265|vp8|vp9|av1 [options] + +Starts a gst-rtsp-server test-launch server that serves an animated +test-pattern stream at rtsp://127.0.0.1:PORT/test. 
+ +Options: + --codec CODEC Required encoded codec. + --port PORT RTSP server port. Default: 8554. + --width PIXELS Source width. Default: 1280. + --height PIXELS Source height. Default: 720. + --fps FPS Source frame rate. Default: 30. + --bitrate-kbps KBPS Encoder bitrate. Default: 2500. + --print Print the test-launch command instead of running it. + -h, --help Show this help. +USAGE +} + +while [ "$#" -gt 0 ]; do + case "$1" in + --codec) + [ "$#" -ge 2 ] || gst_error "--codec requires h264, h265, vp8, vp9, or av1" + CODEC=$2 + shift 2 + ;; + --port) + [ "$#" -ge 2 ] || gst_error "--port requires a value" + PORT=$2 + shift 2 + ;; + --width|--height|--fps|--bitrate-kbps|--print) + gst_parse_common_option "$@" + shift "$GST_COMMON_SHIFT" + ;; + -h|--help) + usage + exit 0 + ;; + *) + gst_error "unknown option: $1" + ;; + esac +done + +[ -n "$CODEC" ] || gst_error "--codec is required" +if ! CODEC=$(gst_normalize_codec "$CODEC"); then + gst_error "--codec must be h264, h265, vp8, vp9, or av1" +fi + +gst_validate_common_options +gst_validate_positive_int "--port" "$PORT" + +PIPELINE="( $(gst_animated_video_source) ! $(gst_encoded_access_unit_pipeline "$CODEC") ! 
$(gst_rtp_payloader_pipeline "$CODEC") )" + +echo "Serving $CODEC RTSP test pattern at rtsp://127.0.0.1:$PORT/test" >&2 +gst_run_rtsp_launch_line "$PORT" "$PIPELINE" diff --git a/examples/preencode_publish/scripts/run-shm-test-source.sh b/examples/preencode_publish/scripts/run-shm-test-source.sh new file mode 100755 index 000000000..29a2cb10e --- /dev/null +++ b/examples/preencode_publish/scripts/run-shm-test-source.sh @@ -0,0 +1,78 @@ +#!/usr/bin/env bash + +set -euo pipefail + +SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=examples/preencode_publish/scripts/gst-test-source-common.sh +source "$SCRIPT_DIR/gst-test-source-common.sh" + +SOCKET_PATH=/tmp/livekit-preencode-test.shm +SHM_SIZE=67108864 +CODEC= + +usage() { + cat <<'USAGE' +Usage: run-shm-test-source.sh --codec h264|h265|vp8|vp9|av1 [options] + +Starts a GStreamer animated test-pattern encoder that writes encoded access +units to shmsink. + +Options: + --codec CODEC Required encoded codec. + --socket-path PATH shmsink control socket. Default: /tmp/livekit-preencode-test.shm. + --shm-size BYTES Shared-memory buffer size. Default: 67108864. + --width PIXELS Source width. Default: 1280. + --height PIXELS Source height. Default: 720. + --fps FPS Source frame rate. Default: 30. + --bitrate-kbps KBPS Encoder bitrate. Default: 2500. + --print Print the gst-launch command instead of running it. + -h, --help Show this help. 
+USAGE +} + +while [ "$#" -gt 0 ]; do + case "$1" in + --codec) + [ "$#" -ge 2 ] || gst_error "--codec requires h264, h265, vp8, vp9, or av1" + CODEC=$2 + shift 2 + ;; + --socket-path) + [ "$#" -ge 2 ] || gst_error "--socket-path requires a value" + SOCKET_PATH=$2 + shift 2 + ;; + --shm-size) + [ "$#" -ge 2 ] || gst_error "--shm-size requires a value" + SHM_SIZE=$2 + shift 2 + ;; + --width|--height|--fps|--bitrate-kbps|--print) + gst_parse_common_option "$@" + shift "$GST_COMMON_SHIFT" + ;; + -h|--help) + usage + exit 0 + ;; + *) + gst_error "unknown option: $1" + ;; + esac +done + +[ -n "$CODEC" ] || gst_error "--codec is required" +if ! CODEC=$(gst_normalize_codec "$CODEC"); then + gst_error "--codec must be h264, h265, vp8, vp9, or av1" +fi + +gst_validate_common_options +gst_validate_positive_int "--shm-size" "$SHM_SIZE" + +if [ "$PRINT_ONLY" -eq 0 ]; then + rm -f "$SOCKET_PATH" +fi +PIPELINE="$(gst_animated_video_source) ! $(gst_encoded_access_unit_pipeline "$CODEC") ! shmsink socket-path=$SOCKET_PATH shm-size=$SHM_SIZE wait-for-connection=true sync=true" + +echo "Writing $CODEC test pattern to shmsink socket $SOCKET_PATH" >&2 +gst_run_launch_line "$PIPELINE" diff --git a/examples/preencode_publish/scripts/run-tcp-test-source.sh b/examples/preencode_publish/scripts/run-tcp-test-source.sh new file mode 100755 index 000000000..1c0144bc3 --- /dev/null +++ b/examples/preencode_publish/scripts/run-tcp-test-source.sh @@ -0,0 +1,88 @@ +#!/usr/bin/env bash + +set -euo pipefail + +SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=examples/preencode_publish/scripts/gst-test-source-common.sh +source "$SCRIPT_DIR/gst-test-source-common.sh" + +HOST=127.0.0.1 +PORT=5000 +CODEC= + +usage() { + cat <<'USAGE' +Usage: run-tcp-test-source.sh --codec h264|h265|vp8|vp9|av1 [options] + +Starts a GStreamer animated test-pattern encoder from tcpserversink. +H.264/H.265 are served as Annex-B byte streams. 
VP8/VP9/AV1 are served as +RFC4571-style length-prefixed RTP packets. + +Options: + --codec CODEC Required encoded codec. + --host HOST Address to listen on. Default: 127.0.0.1. + --port PORT TCP port to listen on. Default: 5000. + --width PIXELS Source width. Default: 1280. + --height PIXELS Source height. Default: 720. + --fps FPS Source frame rate. Default: 30. + --bitrate-kbps KBPS Encoder bitrate. Default: 2500. + --print Print the gst-launch command instead of running it. + -h, --help Show this help. +USAGE +} + +while [ "$#" -gt 0 ]; do + case "$1" in + --codec) + [ "$#" -ge 2 ] || gst_error "--codec requires h264, h265, vp8, vp9, or av1" + CODEC=$2 + shift 2 + ;; + --host) + [ "$#" -ge 2 ] || gst_error "--host requires a value" + HOST=$2 + shift 2 + ;; + --port) + [ "$#" -ge 2 ] || gst_error "--port requires a value" + PORT=$2 + shift 2 + ;; + --width|--height|--fps|--bitrate-kbps|--print) + gst_parse_common_option "$@" + shift "$GST_COMMON_SHIFT" + ;; + -h|--help) + usage + exit 0 + ;; + *) + gst_error "unknown option: $1" + ;; + esac +done + +[ -n "$CODEC" ] || gst_error "--codec is required" +if ! CODEC=$(gst_normalize_codec "$CODEC"); then + gst_error "--codec must be h264, h265, vp8, vp9, or av1" +fi + +gst_validate_common_options +gst_validate_positive_int "--port" "$PORT" + +case "$CODEC" in + h264|h265) + PIPELINE="$(gst_animated_video_source) ! $(gst_h26x_annex_b_pipeline "$CODEC") ! tcpserversink host=$HOST port=$PORT sync-method=next-keyframe recover-policy=keyframe" + FORMAT="Annex-B" + ;; + vp8|vp9|av1) + PIPELINE="$(gst_animated_video_source) ! $(gst_encoded_access_unit_pipeline "$CODEC") ! $(gst_rtp_payloader_pipeline "$CODEC") ! rtpstreampay ! 
tcpserversink host=$HOST port=$PORT sync-method=next-keyframe recover-policy=keyframe" + FORMAT="RTP" + ;; + *) + gst_error "unsupported codec: $CODEC" + ;; +esac + +echo "Serving $CODEC $FORMAT test pattern on tcp://$HOST:$PORT" >&2 +gst_run_launch_line "$PIPELINE" diff --git a/examples/preencode_publish/src/main.rs b/examples/preencode_publish/src/main.rs new file mode 100644 index 000000000..5fd6c1318 --- /dev/null +++ b/examples/preencode_publish/src/main.rs @@ -0,0 +1,1338 @@ +//! Publish a pre-encoded video stream into a LiveKit room. +//! +//! Encoded access units are pulled from a TCP, RTSP, or GStreamer source and +//! pumped into a passthrough `VideoCaptureTrack` by +//! `livekit_capture::EncodedIngress`, which also forwards downstream keyframe +//! requests (PLI/FIR from the SFU) back to the source. The higher-level +//! `livekit_capture::VideoCaptureSource` facade covers the same encoded +//! endpoints via `CaptureSourceOptions::encoded`; this example drives +//! `EncodedIngress` directly to keep its per-access-unit diagnostics. 
+ +use std::{ + net::{Shutdown, TcpStream}, + sync::{ + atomic::{AtomicU64, Ordering}, + Arc, + }, + time::{Duration, Instant, SystemTime, UNIX_EPOCH}, +}; + +use anyhow::{bail, Context, Result}; +use clap::{Parser, ValueEnum}; +#[cfg(feature = "gstreamer")] +use gstreamer as gst; +#[cfg(feature = "gstreamer")] +use gstreamer::prelude::*; +use livekit::{ + options::{self, VideoEncoding}, + prelude::*, + webrtc::video_source::VideoResolution, +}; +use livekit_api::access_token; +#[cfg(feature = "gstreamer")] +use livekit_capture::sources::gstreamer::{ + encoded_caps_string, ensure_encoded_appsink, GStreamerAppSinkConfig, + GStreamerAppSinkEncodedSource, ENCODED_APPSINK_NAME, +}; +use livekit_capture::{ + sources::{ + rtsp::{RtspEncodedSource, RtspSourceOptions}, + tcp::{ByteStreamSourceConfig, TcpEncodedSource}, + }, + CaptureError, EncodedAccessUnitSource, EncodedFrameType, EncodedIngress, EncodedIngressCapture, + EncodedIngressError, EncodedVideoCodec, EncodedWireFormat, OwnedEncodedAccessUnit, + VideoCaptureTrack, +}; + +const DIAGNOSTIC_REPORT_INTERVAL: Duration = Duration::from_secs(1); +const SOURCE_STALL_THRESHOLD: Duration = Duration::from_millis(250); +const BURST_WALL_DELTA_THRESHOLD: Duration = Duration::from_millis(5); +const KEYFRAME_GAP_THRESHOLD: Duration = Duration::from_secs(5); + +/// Publish a pre-encoded video stream into a LiveKit room. +#[derive(Parser, Debug)] +#[command(author, version, about, long_about = None)] +struct Args { + /// Encoded stream source. + #[arg(long, value_enum, default_value_t = SourceKind::Tcpsink)] + source: SourceKind, + + /// Encoded video codec. Required with --source tcpsink and --source shmsink; optional + /// validation with --source rtsp. Optional with --source gstappsink; omitted custom + /// GStreamer pipelines infer the codec from their unlinked encoded output when possible. + #[arg(long, value_enum)] + codec: Option, + + /// TCP server address as host:port. Required with --source tcpsink. 
+ #[arg(long)] + host: Option, + + /// RTSP URL. Required with --source rtsp. + #[arg(long)] + rtsp_url: Option, + + /// LiveKit server URL. + #[arg(long, env = "LIVEKIT_URL")] + url: String, + + /// LiveKit API key. + #[arg(long, env = "LIVEKIT_API_KEY")] + api_key: String, + + /// LiveKit API secret. + #[arg(long, env = "LIVEKIT_API_SECRET")] + api_secret: String, + + /// Room name to join. + #[arg(long)] + room_name: String, + + /// Participant identity to publish as. + #[arg(long)] + identity: String, + + /// Encoded frame width in pixels. + #[arg(long, default_value_t = 1920)] + width: u32, + + /// Encoded frame height in pixels. + #[arg(long, default_value_t = 1080)] + height: u32, + + /// Frame rate used for generated video and fallback timestamps. + #[arg(long, default_value_t = 30)] + fps: u32, + + /// Maximum publish bitrate in bits per second. Generated GStreamer test + /// sources use the same target bitrate so local smoke tests do not overrun + /// the advertised send cap. + #[arg(long)] + max_bitrate: Option, + + /// H.264 TCP byte-stream format. + #[arg(long, value_enum, default_value_t = H264FormatArg::AnnexB)] + h264_format: H264FormatArg, + + /// Length-prefix size in bytes for --h264-format avc. + #[arg(long, default_value_t = 4)] + avc_nal_length_size: u8, + + /// TCP transport framing. + #[arg(long, value_enum, default_value_t = TcpFormatArg::Auto)] + tcp_format: TcpFormatArg, + + /// RTP timestamp clock rate used with --tcp-format rtp. + #[arg(long, default_value_t = 90_000)] + rtp_clock_rate: u32, + + /// Log access-unit timing, keyframe, and keyframe-request diagnostics. + #[arg(long)] + diagnostics: bool, + + /// GStreamer shmsink socket path. Used with --source shmsink. + #[cfg(feature = "gstreamer")] + #[arg(long, default_value = "/tmp/livekit-preencode-test.shm")] + shmsink_socket_path: String, + + /// GStreamer launch pipeline used with --source gstappsink. 
If the pipeline does not include + /// appsink name=lk_appsink, codec-specific normalization and an appsink are attached to its + /// unlinked output. + #[cfg(feature = "gstreamer")] + #[arg(last = true, value_name = "PIPELINE")] + gstreamer_pipeline: Vec, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq, ValueEnum)] +enum SourceKind { + Tcpsink, + Rtsp, + #[cfg(feature = "gstreamer")] + Gstappsink, + #[cfg(feature = "gstreamer")] + Shmsink, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq, ValueEnum)] +enum CodecArg { + H264, + H265, + Vp8, + Vp9, + Av1, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq, ValueEnum)] +enum H264FormatArg { + AnnexB, + Avc, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq, ValueEnum)] +enum TcpFormatArg { + Auto, + ByteStream, + Rtp, +} + +impl CodecArg { + fn encoded_codec(self) -> EncodedVideoCodec { + match self { + Self::H264 => EncodedVideoCodec::H264, + Self::H265 => EncodedVideoCodec::H265, + Self::Vp8 => EncodedVideoCodec::VP8, + Self::Vp9 => EncodedVideoCodec::VP9, + Self::Av1 => EncodedVideoCodec::AV1, + } + } + + fn tcp_wire_format( + self, + tcp_format: TcpFormatArg, + h264_format: H264FormatArg, + avc_nal_length_size: u8, + rtp_clock_rate: u32, + ) -> Result { + match tcp_format.resolve(self) { + ResolvedTcpFormat::ByteStream => match self { + Self::H264 => match h264_format { + H264FormatArg::AnnexB => Ok(EncodedWireFormat::H264AnnexB), + H264FormatArg::Avc => { + Ok(EncodedWireFormat::H264Avc { nal_length_size: avc_nal_length_size }) + } + } + Self::H265 => Ok(EncodedWireFormat::H265AnnexB), + Self::Vp8 | Self::Vp9 | Self::Av1 => bail!( + "--tcp-format byte-stream is only supported for H.264/H.265; use --tcp-format rtp for {:?}", + self.encoded_codec() + ), + }, + ResolvedTcpFormat::Rtp => Ok(EncodedWireFormat::Rtp { + codec: self.encoded_codec(), + clock_rate: rtp_clock_rate, + }), + } + } +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +enum ResolvedTcpFormat { + ByteStream, + Rtp, +} + +impl TcpFormatArg { + fn 
resolve(self, codec: CodecArg) -> ResolvedTcpFormat { + match self { + Self::Auto => match codec { + CodecArg::H264 | CodecArg::H265 => ResolvedTcpFormat::ByteStream, + CodecArg::Vp8 | CodecArg::Vp9 | CodecArg::Av1 => ResolvedTcpFormat::Rtp, + }, + Self::ByteStream => ResolvedTcpFormat::ByteStream, + Self::Rtp => ResolvedTcpFormat::Rtp, + } + } +} + +#[tokio::main] +async fn main() -> Result<()> { + env_logger::init(); + run(Args::parse()).await +} + +async fn run(args: Args) -> Result<()> { + validate_dimensions(args.width, args.height)?; + validate_max_bitrate(args.max_bitrate)?; + validate_h264_format_args(&args)?; + #[cfg(feature = "gstreamer")] + validate_gstreamer_args(&args)?; + + match args.source { + SourceKind::Tcpsink => { + let frame_interval_us = frame_interval_us(args.fps)?; + run_tcp_source(args, frame_interval_us).await + } + SourceKind::Rtsp => run_rtsp_source(args).await, + #[cfg(feature = "gstreamer")] + SourceKind::Gstappsink => { + let frame_interval_us = frame_interval_us(args.fps)?; + run_gstreamer_source(args, frame_interval_us).await + } + #[cfg(feature = "gstreamer")] + SourceKind::Shmsink => { + let frame_interval_us = frame_interval_us(args.fps)?; + run_shmsink_source(args, frame_interval_us).await + } + } +} + +#[cfg(feature = "gstreamer")] +fn validate_gstreamer_args(args: &Args) -> Result<()> { + if args.source != SourceKind::Gstappsink && !args.gstreamer_pipeline.is_empty() { + bail!("trailing GStreamer pipeline arguments are only valid with --source gstappsink"); + } + Ok(()) +} + +fn validate_h264_format_args(args: &Args) -> Result<()> { + if !(1..=4).contains(&args.avc_nal_length_size) { + bail!("--avc-nal-length-size must be between 1 and 4 bytes"); + } + if args.rtp_clock_rate == 0 { + bail!("--rtp-clock-rate must be greater than zero"); + } + if args.source == SourceKind::Tcpsink { + if let Some(codec) = args.codec { + if args.tcp_format.resolve(codec) == ResolvedTcpFormat::ByteStream + && matches!(codec, CodecArg::Vp8 | 
CodecArg::Vp9 | CodecArg::Av1) + { + bail!("--tcp-format byte-stream is only supported for H.264/H.265"); + } + } + } + if args.h264_format == H264FormatArg::Avc { + if args.source != SourceKind::Tcpsink { + bail!("--h264-format avc is only valid with --source tcpsink"); + } + if args.tcp_format == TcpFormatArg::Rtp { + bail!("--h264-format avc is only valid with TCP byte-stream input"); + } + if args.codec != Some(CodecArg::H264) { + bail!("--h264-format avc requires --codec h264"); + } + } + Ok(()) +} + +async fn run_tcp_source(args: Args, frame_interval_us: i64) -> Result<()> { + let codec_arg = args.codec.context("--codec is required with --source tcpsink")?; + let codec = codec_arg.encoded_codec(); + let host = args.host.clone().context("--host is required with --source tcpsink")?; + let wire_format = codec_arg.tcp_wire_format( + args.tcp_format, + args.h264_format, + args.avc_nal_length_size, + args.rtp_clock_rate, + )?; + let config = ByteStreamSourceConfig::new( + wire_format, + current_time_us(), + frame_interval_us, + args.width, + args.height, + ); + + log::info!("Connecting to TCP {wire_format:?} encoded stream at {host}"); + let stream = TcpStream::connect(&host) + .with_context(|| format!("failed to connect to TCP source at {host}"))?; + let shutdown_stream = stream.try_clone().context("failed to clone TCP stream")?; + let source = TcpEncodedSource::from_tcp_stream(stream, config)?; + + publish_encoded_source( + args, + codec, + "TCP", + source, + move || { + let _ = shutdown_stream.shutdown(Shutdown::Both); + }, + Some(frame_interval_us), + ) + .await +} + +async fn run_rtsp_source(args: Args) -> Result<()> { + let rtsp_url = args.rtsp_url.clone().context("--rtsp-url is required with --source rtsp")?; + let mut options = + RtspSourceOptions::new(args.width, args.height).with_start_timestamp_us(current_time_us()); + if let Some(codec) = args.codec { + options = options.with_expected_codec(codec.encoded_codec()); + } + + log::info!("Connecting to RTSP 
encoded stream at {rtsp_url}"); + let source = RtspEncodedSource::connect(&rtsp_url, options) + .with_context(|| format!("failed to connect to RTSP source at {rtsp_url}"))?; + let shutdown_stream = source.try_clone_stream().context("failed to clone RTSP TCP stream")?; + let codec = source.session_info().codec; + log::info!( + "RTSP setup selected {:?} payload type {} on interleaved channel {}", + codec, + source.session_info().payload_type, + source.session_info().video_channel + ); + + publish_encoded_source( + args, + codec, + "RTSP", + source, + move || { + let _ = shutdown_stream.shutdown(Shutdown::Both); + }, + None, + ) + .await +} + +#[cfg(feature = "gstreamer")] +async fn run_gstreamer_source(args: Args, frame_interval_us: i64) -> Result<()> { + let source = GStreamerTestSource::start( + args.width, + args.height, + args.fps, + current_time_us(), + frame_interval_us, + args.codec.map(CodecArg::encoded_codec), + &args.gstreamer_pipeline, + args.max_bitrate, + )?; + let codec = source.codec(); + let shutdown_pipeline = source.shutdown_pipeline(); + log::info!("Started GStreamer {:?} pipeline: {}", codec, source.pipeline_description()); + + publish_encoded_source( + args, + codec, + "GStreamer", + source, + move || { + let _ = shutdown_pipeline.set_state(gst::State::Null); + }, + Some(frame_interval_us), + ) + .await +} + +#[cfg(feature = "gstreamer")] +async fn run_shmsink_source(args: Args, frame_interval_us: i64) -> Result<()> { + let codec_arg = args.codec.context("--codec is required with --source shmsink")?; + let codec = codec_arg.encoded_codec(); + let socket_path = args.shmsink_socket_path.clone(); + let pipeline_args = vec![gstreamer_shmsink_pipeline_description(&socket_path, codec)]; + let source = GStreamerTestSource::start( + args.width, + args.height, + args.fps, + current_time_us(), + frame_interval_us, + Some(codec), + &pipeline_args, + args.max_bitrate, + )?; + let shutdown_pipeline = source.shutdown_pipeline(); + log::info!( + "Started 
GStreamer {:?} shmsink reader for {}: {}", + codec, + socket_path, + source.pipeline_description() + ); + + publish_encoded_source( + args, + codec, + "GStreamer shmsink", + source, + move || { + let _ = shutdown_pipeline.set_state(gst::State::Null); + }, + Some(frame_interval_us), + ) + .await +} + +#[cfg(feature = "gstreamer")] +#[derive(Debug)] +struct GStreamerTestSource { + pipeline: gst::Pipeline, + source: GStreamerAppSinkEncodedSource, + pipeline_description: String, +} + +#[cfg(feature = "gstreamer")] +impl GStreamerTestSource { + fn start( + width: u32, + height: u32, + fps: u32, + start_timestamp_us: i64, + frame_interval_us: i64, + requested_codec: Option, + pipeline_args: &[String], + max_bitrate: Option, + ) -> Result { + gst::init().context("failed to initialize GStreamer")?; + + let generated_codec = requested_codec.unwrap_or(EncodedVideoCodec::H264); + let pipeline_description = gstreamer_pipeline_description( + width, + height, + fps, + generated_codec, + pipeline_args, + max_bitrate, + ); + let element = gst::parse::launch(&pipeline_description).with_context(|| { + format!("failed to create GStreamer pipeline: {pipeline_description}") + })?; + let Ok(pipeline) = element.downcast::() else { + bail!("GStreamer description did not create a pipeline"); + }; + let requested_codec = + if pipeline_args.is_empty() { Some(generated_codec) } else { requested_codec }; + let (appsink, sample_format) = ensure_encoded_appsink(&pipeline, requested_codec) + .context("failed to prepare GStreamer encoded appsink")?; + + let config = GStreamerAppSinkConfig::new( + sample_format, + start_timestamp_us, + frame_interval_us, + width, + height, + ); + pipeline + .set_state(gst::State::Playing) + .context("failed to start GStreamer test pipeline")?; + + Ok(Self { + pipeline, + source: GStreamerAppSinkEncodedSource::new(appsink, config), + pipeline_description, + }) + } + + fn pipeline_description(&self) -> &str { + &self.pipeline_description + } + + fn codec(&self) -> 
EncodedVideoCodec { + self.source.config().sample_format.codec() + } + + fn shutdown_pipeline(&self) -> gst::Pipeline { + self.pipeline.clone() + } +} + +#[cfg(feature = "gstreamer")] +impl EncodedAccessUnitSource for GStreamerTestSource { + type Error = ::Error; + + fn next_access_unit(&mut self) -> Result, Self::Error> { + self.source.next_access_unit() + } + + fn request_keyframe(&mut self) { + // Forward downstream PLI/FIR to the appsink source, which raises a + // GstForceKeyUnit event so the upstream encoder emits an IDR. + self.source.request_keyframe(); + } +} + +#[cfg(feature = "gstreamer")] +impl Drop for GStreamerTestSource { + fn drop(&mut self) { + let _ = self.pipeline.set_state(gst::State::Null); + } +} + +#[cfg(feature = "gstreamer")] +fn gstreamer_pipeline_description( + width: u32, + height: u32, + fps: u32, + codec: EncodedVideoCodec, + pipeline_args: &[String], + max_bitrate: Option, +) -> String { + if pipeline_args.is_empty() { + return gstreamer_test_pipeline_description(width, height, fps, codec, max_bitrate); + } + + pipeline_args.join(" ") +} + +#[cfg(feature = "gstreamer")] +fn gstreamer_test_pipeline_description( + width: u32, + height: u32, + fps: u32, + codec: EncodedVideoCodec, + max_bitrate: Option, +) -> String { + let bitrate = publish_video_encoding(max_bitrate, width, height, fps, codec).max_bitrate; + let codec_pipeline = gstreamer_test_encode_pipeline(fps, codec, bitrate); + + format!( + "videotestsrc is-live=true do-timestamp=true pattern=ball motion=wavy animation-mode=frames ! \ + video/x-raw,width={width},height={height},framerate={fps}/1 ! \ + timeoverlay halignment=right valignment=bottom shaded-background=true ! \ + videoconvert ! \ + video/x-raw,format=I420 ! \ + {codec_pipeline} ! 
\ + appsink name={ENCODED_APPSINK_NAME} sync=false max-buffers=8 drop=true" + ) +} + +#[cfg(feature = "gstreamer")] +fn gstreamer_test_encode_pipeline(fps: u32, codec: EncodedVideoCodec, bitrate: u64) -> String { + let key_int_max = fps.max(1); + let bitrate_kbps = u64::max(1, bitrate / 1000); + // The trailing capsfilter is the appsink contract, so it comes from the + // crate's caps table; encoder-specific settings before the parser stay + // inline because they configure the encoder, not the appsink. + let caps = encoded_caps_string(codec); + match codec { + EncodedVideoCodec::H264 => format!( + "x264enc tune=zerolatency speed-preset=ultrafast key-int-max={key_int_max} \ + bitrate={bitrate_kbps} byte-stream=true aud=true ! h264parse config-interval=-1 ! \ + {caps}" + ), + EncodedVideoCodec::H265 => format!( + "x265enc tune=zerolatency speed-preset=ultrafast key-int-max={key_int_max} \ + bitrate={bitrate_kbps} ! h265parse config-interval=-1 ! {caps}" + ), + EncodedVideoCodec::VP8 => format!( + "vp8enc deadline=1 cpu-used=8 keyframe-max-dist={key_int_max} lag-in-frames=0 \ + target-bitrate={bitrate} ! {caps}" + ), + EncodedVideoCodec::VP9 => format!( + "vp9enc deadline=1 cpu-used=8 keyframe-max-dist={key_int_max} lag-in-frames=0 \ + target-bitrate={bitrate} ! {caps}" + ), + EncodedVideoCodec::AV1 => format!( + "av1enc cpu-used=8 usage-profile=realtime keyframe-max-dist={key_int_max} \ + lag-in-frames=0 target-bitrate={bitrate_kbps} ! av1parse ! {caps}" + ), + _ => unreachable!("unknown generated GStreamer codec"), + } +} + +#[cfg(feature = "gstreamer")] +fn gstreamer_shmsink_pipeline_description(socket_path: &str, codec: EncodedVideoCodec) -> String { + let socket_path = gstreamer_launch_string_value(socket_path); + let caps = encoded_caps_string(codec); + + format!( + "shmsrc socket-path={socket_path} is-live=true do-timestamp=true ! 
capsfilter caps={caps}" + ) +} + +#[cfg(feature = "gstreamer")] +fn gstreamer_launch_string_value(value: &str) -> String { + if value.chars().all(|c| c.is_ascii_alphanumeric() || matches!(c, '/' | '_' | '-' | '.' | ':')) + { + return value.to_string(); + } + + format!("\"{}\"", value.replace('\\', "\\\\").replace('"', "\\\"")) +} + +async fn publish_encoded_source( + args: Args, + codec: EncodedVideoCodec, + source_label: &'static str, + source: S, + shutdown_source: ShutdownSource, + expected_frame_interval_us: Option, +) -> Result<()> +where + S: EncodedAccessUnitSource + Send + 'static, + ShutdownSource: FnOnce() + Send + 'static, +{ + let diagnostics_enabled = args.diagnostics; + let token = access_token::AccessToken::with_api_key(&args.api_key, &args.api_secret) + .with_identity(&args.identity) + .with_name(&args.identity) + .with_grants(access_token::VideoGrants { + room_join: true, + room: args.room_name.clone(), + can_publish: true, + can_subscribe: false, + ..Default::default() + }) + .to_jwt()?; + + log::info!("Connecting to LiveKit room '{}' as '{}'", args.room_name, args.identity); + let (room, _) = Room::connect(&args.url, &token, RoomOptions::default()) + .await + .context("failed to connect to LiveKit room")?; + + let capture_track = VideoCaptureTrack::new_encoded( + "preencoded", + VideoResolution { width: args.width, height: args.height }, + ); + let mut publish_options = VideoCaptureTrack::encoded_publish_options(codec); + let video_encoding = + publish_video_encoding(args.max_bitrate, args.width, args.height, args.fps, codec); + publish_options.video_encoding = Some(video_encoding.clone()); + publish_options.source = TrackSource::Camera; + + room.local_participant() + .publish_track(LocalTrack::Video(capture_track.track()), publish_options) + .await + .context("failed to publish pre-encoded video track")?; + log::info!( + "Published pre-encoded {:?} track at {}x{} (max_bitrate={}bps max_framerate={:.1}); forwarding {} access units", + codec, + 
args.width, + args.height, + video_encoding.max_bitrate, + video_encoding.max_framerate, + source_label + ); + + let keyframe_requests_forwarded = Arc::new(AtomicU64::new(0)); + let ingress = EncodedIngress::new( + capture_track, + KeyframeRequestLogger::new(source, source_label, keyframe_requests_forwarded.clone()), + ); + let stop = ingress.stop_handle(); + let signal_task = tokio::spawn(async move { + let _ = tokio::signal::ctrl_c().await; + stop.stop(); + shutdown_source(); + }); + + let capture_task = tokio::task::spawn_blocking(move || { + let diagnostics = AccessUnitDiagnostics::new( + diagnostics_enabled, + source_label, + expected_frame_interval_us, + keyframe_requests_forwarded, + ); + forward_access_units(ingress, diagnostics) + }); + let captured = capture_task.await.context("capture task failed to join")??; + signal_task.abort(); + room.close().await.context("failed to close LiveKit room")?; + + log::info!("Stopped after publishing {captured} encoded access units"); + Ok(()) +} + +/// Drives [`EncodedIngress::capture_next`] until EOF or shutdown, feeding the +/// example's per-access-unit diagnostics from each capture. 
+fn forward_access_units( + mut ingress: EncodedIngress, + mut diagnostics: AccessUnitDiagnostics, +) -> Result +where + S: EncodedAccessUnitSource, +{ + let stop = ingress.stop_handle(); + let mut captured = 0; + let mut dropped = 0; + while !stop.is_stopped() { + let read_started = Instant::now(); + let capture = match ingress.capture_next() { + Ok(Some(capture)) => capture, + Ok(None) => break, + Err(EncodedIngressError::Capture(CaptureError::CaptureFailed)) => { + dropped += 1; + if dropped == 1 || dropped % 300 == 0 { + log::info!("Dropped {dropped} encoded access units before capture"); + } + continue; + } + Err(EncodedIngressError::Source(err)) if stop.is_stopped() => { + log::debug!("encoded source stopped after shutdown: {err}"); + break; + } + Err(err) => return Err(err.into()), + }; + diagnostics.observe_source_wait(read_started.elapsed()); + diagnostics.observe_capture(&capture); + captured += 1; + if captured % 300 == 0 { + log::info!("Published {captured} encoded access units"); + } + } + diagnostics.finish(); + + Ok(captured) +} + +/// Wraps an encoded source to count and log the downstream keyframe requests +/// (PLI/FIR polled by [`EncodedIngress::capture_next`]) forwarded to it. 
+struct KeyframeRequestLogger { + source: S, + source_label: &'static str, + forwarded: Arc, +} + +impl KeyframeRequestLogger { + fn new(source: S, source_label: &'static str, forwarded: Arc) -> Self { + Self { source, source_label, forwarded } + } +} + +impl EncodedAccessUnitSource for KeyframeRequestLogger +where + S: EncodedAccessUnitSource, +{ + type Error = S::Error; + + fn next_access_unit(&mut self) -> Result, Self::Error> { + self.source.next_access_unit() + } + + fn request_keyframe(&mut self) { + let forwarded = self.forwarded.fetch_add(1, Ordering::Relaxed) + 1; + log::info!( + "{} forwarding downstream keyframe request {forwarded} to the encoded source", + self.source_label + ); + self.source.request_keyframe(); + } +} + +#[derive(Debug)] +struct AccessUnitDiagnostics { + enabled: bool, + source_label: &'static str, + expected_frame_interval_us: Option, + keyframe_requests_forwarded: Arc, + last_report: Instant, + last_wall_time: Option, + last_timestamp_us: Option, + last_keyframe_wall_time: Option, + last_keyframe_warning: Option, + total_frames: u64, + total_keyframes: u64, + report_frames: u64, + report_keyframes: u64, + report_bytes: u64, + report_max_bytes: usize, + report_max_source_wait: Duration, + report_max_wall_gap: Duration, + report_max_timestamp_gap_us: i64, + report_stalls: u64, + report_bursts: u64, +} + +impl AccessUnitDiagnostics { + fn new( + enabled: bool, + source_label: &'static str, + expected_frame_interval_us: Option, + keyframe_requests_forwarded: Arc, + ) -> Self { + let now = Instant::now(); + if enabled { + match expected_frame_interval_us { + Some(interval_us) => log::info!( + "{source_label} diagnostics enabled; expected frame interval {:.2}ms", + interval_us as f64 / 1000.0 + ), + None => log::info!("{source_label} diagnostics enabled"), + } + } + + Self { + enabled, + source_label, + expected_frame_interval_us, + keyframe_requests_forwarded, + last_report: now, + last_wall_time: None, + last_timestamp_us: None, + 
last_keyframe_wall_time: None, + last_keyframe_warning: None, + total_frames: 0, + total_keyframes: 0, + report_frames: 0, + report_keyframes: 0, + report_bytes: 0, + report_max_bytes: 0, + report_max_source_wait: Duration::ZERO, + report_max_wall_gap: Duration::ZERO, + report_max_timestamp_gap_us: 0, + report_stalls: 0, + report_bursts: 0, + } + } + + fn observe_source_wait(&mut self, wait: Duration) { + if !self.enabled { + return; + } + + self.report_max_source_wait = self.report_max_source_wait.max(wait); + if wait > SOURCE_STALL_THRESHOLD { + self.report_stalls += 1; + log::warn!( + "{} source wait {:.1}ms before next access unit", + self.source_label, + wait.as_secs_f64() * 1000.0 + ); + } + } + + fn observe_capture(&mut self, capture: &EncodedIngressCapture) { + if !self.enabled { + return; + } + + let now = Instant::now(); + let payload_len = capture.payload_len; + let is_keyframe = capture.frame_type == EncodedFrameType::Key; + let timestamp_gap_us = + self.last_timestamp_us.map(|last| capture.timestamp_us.saturating_sub(last)); + + self.total_frames += 1; + self.report_frames += 1; + self.report_bytes = self.report_bytes.saturating_add(payload_len as u64); + self.report_max_bytes = self.report_max_bytes.max(payload_len); + if is_keyframe { + self.total_keyframes += 1; + self.report_keyframes += 1; + self.last_keyframe_wall_time = Some(now); + self.last_keyframe_warning = None; + } + + if let Some(last_wall_time) = self.last_wall_time { + let wall_gap = now.saturating_duration_since(last_wall_time); + self.report_max_wall_gap = self.report_max_wall_gap.max(wall_gap); + if wall_gap > SOURCE_STALL_THRESHOLD { + self.report_stalls += 1; + log::warn!( + "{} publish gap {:.1}ms before frame {}", + self.source_label, + wall_gap.as_secs_f64() * 1000.0, + self.total_frames + ); + } + if wall_gap < BURST_WALL_DELTA_THRESHOLD { + if let Some(timestamp_gap_us) = timestamp_gap_us { + if timestamp_gap_us > BURST_WALL_DELTA_THRESHOLD.as_micros() as i64 { + 
self.report_bursts += 1; + } + } + } + } + + if let Some(timestamp_gap_us) = timestamp_gap_us { + self.report_max_timestamp_gap_us = + self.report_max_timestamp_gap_us.max(timestamp_gap_us); + self.observe_timestamp_gap(timestamp_gap_us); + } + + if is_keyframe { + log::info!( + "{} keyframe {} ts={} size={}", + self.source_label, + self.total_frames, + capture.timestamp_us, + payload_len + ); + } + + self.warn_if_keyframe_gap(now); + self.last_wall_time = Some(now); + self.last_timestamp_us = Some(capture.timestamp_us); + self.report_if_due(now); + } + + fn observe_timestamp_gap(&mut self, timestamp_gap_us: i64) { + let Some(expected_us) = self.expected_frame_interval_us else { + return; + }; + let tolerance_us = (expected_us / 2).max(10_000); + let deviation_us = (timestamp_gap_us - expected_us).abs(); + if deviation_us > tolerance_us { + log::warn!( + "{} timestamp gap {:.2}ms differs from expected {:.2}ms", + self.source_label, + timestamp_gap_us as f64 / 1000.0, + expected_us as f64 / 1000.0 + ); + } + } + + fn warn_if_keyframe_gap(&mut self, now: Instant) { + let Some(last_keyframe_wall_time) = self.last_keyframe_wall_time else { + if self.total_frames > 1 + && self.last_keyframe_warning.is_none_or(|last| { + now.saturating_duration_since(last) >= KEYFRAME_GAP_THRESHOLD + }) + { + self.last_keyframe_warning = Some(now); + log::warn!( + "{} has not seen a keyframe after {} access units", + self.source_label, + self.total_frames + ); + } + return; + }; + + let keyframe_gap = now.saturating_duration_since(last_keyframe_wall_time); + if keyframe_gap >= KEYFRAME_GAP_THRESHOLD + && self + .last_keyframe_warning + .is_none_or(|last| now.saturating_duration_since(last) >= KEYFRAME_GAP_THRESHOLD) + { + self.last_keyframe_warning = Some(now); + log::warn!( + "{} no keyframe for {:.1}s; {} downstream keyframe request(s) forwarded to the \ + source so far", + self.source_label, + keyframe_gap.as_secs_f64(), + self.keyframe_requests_forwarded.load(Ordering::Relaxed) + ); 
+ } + } + + fn report_if_due(&mut self, now: Instant) { + let elapsed = now.saturating_duration_since(self.last_report); + if elapsed < DIAGNOSTIC_REPORT_INTERVAL { + return; + } + + let avg_size = + if self.report_frames == 0 { 0 } else { self.report_bytes / self.report_frames }; + let fps = self.report_frames as f64 / elapsed.as_secs_f64(); + log::info!( + "{} diagnostics: frames={} fps={:.1} keys={} avg_size={} max_size={} \ + max_source_wait={:.1}ms max_publish_gap={:.1}ms max_ts_gap={:.1}ms stalls={} \ + bursts={} keyframe_requests={}", + self.source_label, + self.report_frames, + fps, + self.report_keyframes, + avg_size, + self.report_max_bytes, + self.report_max_source_wait.as_secs_f64() * 1000.0, + self.report_max_wall_gap.as_secs_f64() * 1000.0, + self.report_max_timestamp_gap_us as f64 / 1000.0, + self.report_stalls, + self.report_bursts, + self.keyframe_requests_forwarded.load(Ordering::Relaxed) + ); + self.reset_report(now); + } + + fn reset_report(&mut self, now: Instant) { + self.last_report = now; + self.report_frames = 0; + self.report_keyframes = 0; + self.report_bytes = 0; + self.report_max_bytes = 0; + self.report_max_source_wait = Duration::ZERO; + self.report_max_wall_gap = Duration::ZERO; + self.report_max_timestamp_gap_us = 0; + self.report_stalls = 0; + self.report_bursts = 0; + } + + fn finish(&mut self) { + if !self.enabled { + return; + } + + log::info!( + "{} diagnostics finished: frames={} keyframes={}", + self.source_label, + self.total_frames, + self.total_keyframes + ); + } +} + +fn validate_dimensions(width: u32, height: u32) -> Result<()> { + if width == 0 || height == 0 { + bail!("--width and --height must be greater than zero"); + } + Ok(()) +} + +fn validate_max_bitrate(max_bitrate: Option) -> Result<()> { + if max_bitrate == Some(0) { + bail!("--max-bitrate must be greater than zero"); + } + Ok(()) +} + +fn frame_interval_us(fps: u32) -> Result { + if fps == 0 { + bail!("--fps must be greater than zero"); + } + Ok(1_000_000_i64 
/ i64::from(fps)) +} + +fn publish_video_encoding( + max_bitrate: Option, + width: u32, + height: u32, + fps: u32, + codec: EncodedVideoCodec, +) -> VideoEncoding { + let mut encoding = options::compute_appropriate_encoding(false, width, height, codec.into()); + if let Some(max_bitrate) = max_bitrate { + encoding.max_bitrate = max_bitrate; + } + encoding.max_framerate = f64::from(fps); + encoding +} + +fn current_time_us() -> i64 { + let Ok(duration) = SystemTime::now().duration_since(UNIX_EPOCH) else { + return 0; + }; + duration.as_micros().min(i64::MAX as u128) as i64 +} + +#[cfg(all(test, feature = "gstreamer"))] +mod tests { + use super::*; + + #[test] + fn gstreamer_pipeline_description_routes_test_source_to_h264_appsink() { + let description = + gstreamer_test_pipeline_description(320, 180, 30, EncodedVideoCodec::H264, None); + + assert!(description.contains("videotestsrc is-live=true do-timestamp=true")); + assert!(description.contains("pattern=ball motion=wavy animation-mode=frames")); + assert!(description.contains("timeoverlay")); + assert!(description.contains("video/x-raw,format=I420")); + assert!(description.contains("x264enc")); + assert!(description.contains("video/x-h264,stream-format=byte-stream,alignment=au")); + assert!(description.contains(&format!("appsink name={ENCODED_APPSINK_NAME}"))); + } + + #[test] + fn gstreamer_pipeline_description_routes_test_source_to_h265_appsink() { + let description = + gstreamer_test_pipeline_description(320, 180, 30, EncodedVideoCodec::H265, None); + + assert!(description.contains("videotestsrc is-live=true do-timestamp=true")); + assert!(description.contains("timeoverlay")); + assert!(description.contains("video/x-raw,format=I420")); + assert!(description.contains("x265enc")); + assert!(description.contains("h265parse config-interval=-1")); + assert!(description.contains("video/x-h265,stream-format=byte-stream,alignment=au")); + assert!(description.contains(&format!("appsink name={ENCODED_APPSINK_NAME}"))); + } 
+ + #[test] + fn gstreamer_pipeline_description_routes_test_source_to_vp8_vp9_and_av1_appsink() { + let vp8 = gstreamer_test_pipeline_description(320, 180, 30, EncodedVideoCodec::VP8, None); + assert!(vp8.contains("video/x-raw,format=I420")); + assert!(vp8.contains("vp8enc")); + assert!(vp8.contains("video/x-vp8")); + assert!(vp8.contains(&format!("appsink name={ENCODED_APPSINK_NAME}"))); + + let vp9 = gstreamer_test_pipeline_description(320, 180, 30, EncodedVideoCodec::VP9, None); + assert!(vp9.contains("video/x-raw,format=I420")); + assert!(vp9.contains("vp9enc")); + assert!(vp9.contains("video/x-vp9,profile=(string)0")); + assert!(vp9.contains(&format!("appsink name={ENCODED_APPSINK_NAME}"))); + + let av1 = gstreamer_test_pipeline_description(320, 180, 30, EncodedVideoCodec::AV1, None); + assert!(av1.contains("video/x-raw,format=I420")); + assert!(av1.contains("av1enc")); + assert!(av1.contains("av1parse")); + assert!(av1.contains("video/x-av1,stream-format=obu-stream,alignment=tu")); + assert!(av1.contains(&format!("appsink name={ENCODED_APPSINK_NAME}"))); + } + + #[test] + fn gstreamer_pipeline_description_uses_trailing_pipeline_args() { + let pipeline = [ + "videotestsrc".to_string(), + "is-live=true".to_string(), + "!".to_string(), + "x264enc".to_string(), + ]; + + assert_eq!( + gstreamer_pipeline_description(320, 180, 30, EncodedVideoCodec::H265, &pipeline, None), + "videotestsrc is-live=true ! 
x264enc" + ); + } + + #[test] + fn gstreamer_shmsink_pipeline_description_uses_socket_path_and_codec_caps() { + let h264 = gstreamer_shmsink_pipeline_description( + "/tmp/livekit h264.shm", + EncodedVideoCodec::H264, + ); + assert!(h264.contains("shmsrc socket-path=\"/tmp/livekit h264.shm\"")); + assert!(h264.contains("is-live=true do-timestamp=true")); + assert!(h264.contains("capsfilter caps=")); + assert!(h264.contains("video/x-h264,stream-format=byte-stream,alignment=au")); + + let vp8 = + gstreamer_shmsink_pipeline_description("/tmp/livekit-vp8.shm", EncodedVideoCodec::VP8); + assert!(vp8.contains("shmsrc socket-path=/tmp/livekit-vp8.shm")); + assert!(vp8.contains("video/x-vp8")); + + let vp9 = + gstreamer_shmsink_pipeline_description("/tmp/livekit-vp9.shm", EncodedVideoCodec::VP9); + assert!(vp9.contains("video/x-vp9,profile=(string)0")); + + let av1 = + gstreamer_shmsink_pipeline_description("/tmp/livekit-av1.shm", EncodedVideoCodec::AV1); + assert!(av1.contains("video/x-av1,stream-format=obu-stream,alignment=tu")); + } + + #[test] + fn gstreamer_test_source_pulls_h264_access_units_when_plugins_are_available() { + let frame_interval_us = frame_interval_us(30).unwrap(); + let mut source = match GStreamerTestSource::start( + 320, + 180, + 30, + 10_000, + frame_interval_us, + Some(EncodedVideoCodec::H264), + &[], + None, + ) { + Ok(source) => source, + Err(err) => { + eprintln!("skipping GStreamer appsink smoke test: {err:#}"); + return; + } + }; + + assert_h264_access_units(&mut source); + } + + #[test] + fn gstreamer_test_source_pulls_h265_access_units_when_plugins_are_available() { + let frame_interval_us = frame_interval_us(30).unwrap(); + let mut source = match GStreamerTestSource::start( + 320, + 180, + 30, + 10_000, + frame_interval_us, + Some(EncodedVideoCodec::H265), + &[], + None, + ) { + Ok(source) => source, + Err(err) => { + eprintln!("skipping GStreamer H.265 appsink smoke test: {err:#}"); + return; + } + }; + + assert_h265_access_units(&mut 
source); + } + + #[test] + fn gstreamer_test_source_attaches_appsink_to_trailing_h264_pipeline() { + let frame_interval_us = frame_interval_us(30).unwrap(); + let pipeline = [ + "videotestsrc".to_string(), + "is-live=true".to_string(), + "do-timestamp=true".to_string(), + "pattern=smpte".to_string(), + "!".to_string(), + "video/x-raw,width=320,height=180,framerate=30/1".to_string(), + "!".to_string(), + "videoconvert".to_string(), + "!".to_string(), + "x264enc".to_string(), + "tune=zerolatency".to_string(), + "speed-preset=ultrafast".to_string(), + "key-int-max=30".to_string(), + "byte-stream=true".to_string(), + "aud=true".to_string(), + ]; + let mut source = match GStreamerTestSource::start( + 320, + 180, + 30, + 10_000, + frame_interval_us, + None, + &pipeline, + None, + ) { + Ok(source) => source, + Err(err) => { + eprintln!("skipping custom GStreamer pipeline smoke test: {err:#}"); + return; + } + }; + + assert_h264_access_units(&mut source); + } + + #[test] + fn gstreamer_test_source_attaches_appsink_to_trailing_h265_pipeline() { + let frame_interval_us = frame_interval_us(30).unwrap(); + let pipeline = [ + "videotestsrc".to_string(), + "is-live=true".to_string(), + "do-timestamp=true".to_string(), + "pattern=smpte".to_string(), + "!".to_string(), + "video/x-raw,width=320,height=180,framerate=30/1".to_string(), + "!".to_string(), + "videoconvert".to_string(), + "!".to_string(), + "x265enc".to_string(), + "tune=zerolatency".to_string(), + "speed-preset=ultrafast".to_string(), + "key-int-max=30".to_string(), + "bitrate=2500".to_string(), + ]; + let mut source = match GStreamerTestSource::start( + 320, + 180, + 30, + 10_000, + frame_interval_us, + None, + &pipeline, + None, + ) { + Ok(source) => source, + Err(err) => { + eprintln!("skipping custom GStreamer H.265 pipeline smoke test: {err:#}"); + return; + } + }; + + assert_h265_access_units(&mut source); + } + + fn assert_h264_access_units(source: &mut GStreamerTestSource) { + let first = source + 
.next_access_unit() + .expect("GStreamer appsink source should read the first sample") + .expect("GStreamer appsink should produce a first access unit"); + let second = source + .next_access_unit() + .expect("GStreamer appsink source should read the second sample") + .expect("GStreamer appsink should produce a second access unit"); + + assert_eq!(first.codec, EncodedVideoCodec::H264); + assert_eq!(first.width, 320); + assert_eq!(first.height, 180); + assert!(!first.payload.is_empty()); + assert!(first.timestamp_us >= 10_000); + assert!(second.timestamp_us > first.timestamp_us); + } + + fn assert_h265_access_units(source: &mut GStreamerTestSource) { + let first = source + .next_access_unit() + .expect("GStreamer appsink source should read the first sample") + .expect("GStreamer appsink should produce a first access unit"); + let second = source + .next_access_unit() + .expect("GStreamer appsink source should read the second sample") + .expect("GStreamer appsink should produce a second access unit"); + + assert_eq!(first.codec, EncodedVideoCodec::H265); + assert_eq!(first.width, 320); + assert_eq!(first.height, 180); + assert!(!first.payload.is_empty()); + assert!(first.timestamp_us >= 10_000); + assert!(second.timestamp_us > first.timestamp_us); + } +} diff --git a/libwebrtc/src/native/rtp_sender.rs b/libwebrtc/src/native/rtp_sender.rs index f58b08484..e90b43a75 100644 --- a/libwebrtc/src/native/rtp_sender.rs +++ b/libwebrtc/src/native/rtp_sender.rs @@ -95,6 +95,7 @@ impl From for sys_webrtc::ffi::VideoEncoderBackend { VideoEncoderBackend::Nvenc => Self::Nvenc, VideoEncoderBackend::Vaapi => Self::Vaapi, VideoEncoderBackend::VideoToolbox => Self::VideoToolbox, + VideoEncoderBackend::PreEncoded => Self::PreEncoded, } } } @@ -108,6 +109,7 @@ impl From for VideoEncoderBackend { sys_webrtc::ffi::VideoEncoderBackend::Nvenc => Self::Nvenc, sys_webrtc::ffi::VideoEncoderBackend::Vaapi => Self::Vaapi, sys_webrtc::ffi::VideoEncoderBackend::VideoToolbox => Self::VideoToolbox, + 
sys_webrtc::ffi::VideoEncoderBackend::PreEncoded => Self::PreEncoded, _ => panic!("unknown VideoEncoderBackend"), } } @@ -130,6 +132,7 @@ mod tests { (VideoEncoderBackend::Nvenc, sys_webrtc::ffi::VideoEncoderBackend::Nvenc), (VideoEncoderBackend::Vaapi, sys_webrtc::ffi::VideoEncoderBackend::Vaapi), (VideoEncoderBackend::VideoToolbox, sys_webrtc::ffi::VideoEncoderBackend::VideoToolbox), + (VideoEncoderBackend::PreEncoded, sys_webrtc::ffi::VideoEncoderBackend::PreEncoded), ]; for (backend, expected) in cases { diff --git a/libwebrtc/src/native/video_frame.rs b/libwebrtc/src/native/video_frame.rs index aaa5491ff..dc3d5fbb8 100644 --- a/libwebrtc/src/native/video_frame.rs +++ b/libwebrtc/src/native/video_frame.rs @@ -51,7 +51,9 @@ pub fn new_video_frame_buffer( vfb_sys::ffi::VideoFrameBufferType::NV12 => Box::new(vf::NV12Buffer { handle: NV12Buffer { sys_handle: sys_handle.pin_mut().get_nv12() }, }), - _ => unreachable!(), + _ => { + Box::new(vf::I420Buffer { handle: I420Buffer { sys_handle: sys_handle.to_i420() } }) + } } } } diff --git a/libwebrtc/src/native/video_source.rs b/libwebrtc/src/native/video_source.rs index 323e61c88..446214fdc 100644 --- a/libwebrtc/src/native/video_source.rs +++ b/libwebrtc/src/native/video_source.rs @@ -26,7 +26,7 @@ use webrtc_sys::{video_frame as vf_sys, video_frame::ffi::VideoRotation, video_t use crate::video_frame::FrameMetadata; use crate::{ native::packet_trailer::PacketTrailerHandler, - video_frame::{I420Buffer, VideoBuffer, VideoFrame}, + video_frame::{EncodedVideoFrame, I420Buffer, VideoBuffer, VideoFrame}, video_source::VideoResolution, }; @@ -54,6 +54,24 @@ struct VideoSourceInner { impl NativeVideoSource { pub fn new(resolution: VideoResolution, is_screencast: bool) -> NativeVideoSource { + Self::new_inner(resolution, is_screencast, true) + } + + /// Creates a source for pre-encoded access units. 
+ /// + /// Unlike [`NativeVideoSource::new`], no raw black-frame keepalive is + /// injected before the first capture: raw frames would start a real + /// encoder on a sender meant for the pass-through encoder and corrupt + /// the encoded stream. + pub fn new_encoded(resolution: VideoResolution) -> NativeVideoSource { + Self::new_inner(resolution, false, false) + } + + fn new_inner( + resolution: VideoResolution, + is_screencast: bool, + raw_keepalive: bool, + ) -> NativeVideoSource { let source = Self { sys_handle: vt_sys::ffi::new_video_track_source( &vt_sys::ffi::VideoResolution::from(resolution.clone()), @@ -62,39 +80,41 @@ impl NativeVideoSource { inner: Arc::new(Mutex::new(VideoSourceInner { captured_frames: 0 })), }; - livekit_runtime::spawn({ - let source = source.clone(); - let i420 = I420Buffer::new(resolution.width, resolution.height); - async move { - let mut interval = interval(Duration::from_millis(100)); // 10 fps + if raw_keepalive { + livekit_runtime::spawn({ + let source = source.clone(); + let i420 = I420Buffer::new(resolution.width, resolution.height); + async move { + let mut interval = interval(Duration::from_millis(100)); // 10 fps - loop { - interval.tick().await; + loop { + interval.tick().await; - let inner = source.inner.lock(); - if inner.captured_frames > 0 { - break; - } + let inner = source.inner.lock(); + if inner.captured_frames > 0 { + break; + } + + let mut builder = vf_sys::ffi::new_video_frame_builder(); + builder.pin_mut().set_rotation(VideoRotation::VideoRotation0); + builder.pin_mut().set_video_frame_buffer(i420.as_ref().sys_handle()); + + let now = SystemTime::now().duration_since(UNIX_EPOCH).unwrap(); + builder.pin_mut().set_timestamp_us(now.as_micros() as i64); - let mut builder = vf_sys::ffi::new_video_frame_builder(); - builder.pin_mut().set_rotation(VideoRotation::VideoRotation0); - builder.pin_mut().set_video_frame_buffer(i420.as_ref().sys_handle()); - - let now = 
SystemTime::now().duration_since(UNIX_EPOCH).unwrap(); - builder.pin_mut().set_timestamp_us(now.as_micros() as i64); - - source.sys_handle.on_captured_frame( - &builder.pin_mut().build(), - &vt_sys::ffi::FrameMetadata { - has_packet_trailer: false, - user_timestamp: 0, - frame_id: 0, - user_data: Vec::new(), - }, - ); + source.sys_handle.on_captured_frame( + &builder.pin_mut().build(), + &vt_sys::ffi::FrameMetadata { + has_packet_trailer: false, + user_timestamp: 0, + frame_id: 0, + user_data: Vec::new(), + }, + ); + } } - } - }); + }); + } source } @@ -139,6 +159,50 @@ impl NativeVideoSource { ); } + pub fn capture_encoded_frame(&self, frame: &EncodedVideoFrame<'_>) -> bool { + let (has_trailer, user_ts, fid, user_data) = match &frame.frame_metadata { + Some(meta) => ( + true, + meta.user_timestamp.unwrap_or(0), + meta.frame_id.unwrap_or(0), + meta.user_data.clone().unwrap_or_default(), + ), + None => (false, 0, 0, Vec::new()), + }; + + let capture_ts = if frame.timestamp_us == 0 { + let now = SystemTime::now().duration_since(UNIX_EPOCH).unwrap(); + now.as_micros() as i64 + } else { + frame.timestamp_us + }; + + self.inner.lock().captured_frames += 1; + self.sys_handle.capture_encoded_frame( + frame.width as i32, + frame.height as i32, + &vt_sys::ffi::EncodedVideoFrameData { + codec: frame.codec.into(), + frame_type: frame.frame_type.into(), + timestamp_us: capture_ts, + }, + frame.payload, + &vt_sys::ffi::FrameMetadata { + has_packet_trailer: has_trailer, + user_timestamp: user_ts, + frame_id: fid, + user_data, + }, + ) + } + + /// Returns and clears the pending keyframe request raised by the + /// pass-through encoder (PLI/FIR or reconfiguration). Poll from the + /// capture loop and forward the request to the upstream encoder. + pub fn take_keyframe_request(&self) -> bool { + self.sys_handle.take_keyframe_request() + } + /// Captures a Jetson DMA-buffer backed video frame. /// /// `pixel_format` is `0` for NV12 and `1` for YUV420M. 
diff --git a/libwebrtc/src/rtp_sender.rs b/libwebrtc/src/rtp_sender.rs index b16c008c2..1bdf96add 100644 --- a/libwebrtc/src/rtp_sender.rs +++ b/libwebrtc/src/rtp_sender.rs @@ -36,6 +36,8 @@ pub enum VideoEncoderBackend { Vaapi, /// Prefer VideoToolbox on Apple platforms when available. VideoToolbox, + /// Pass pre-encoded frames through without encoding raw video frames. + PreEncoded, } impl VideoEncoderBackend { diff --git a/libwebrtc/src/video_frame.rs b/libwebrtc/src/video_frame.rs index 1c728c3e9..d89095f13 100644 --- a/libwebrtc/src/video_frame.rs +++ b/libwebrtc/src/video_frame.rs @@ -71,6 +71,73 @@ pub struct FrameMetadata { pub user_data: Option>, } +/// Codec carried by a pre-encoded video access unit. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +#[non_exhaustive] +pub enum EncodedVideoCodec { + /// H.264/AVC video. + H264, + /// H.265/HEVC video. + H265, + /// VP8 video. + VP8, + /// VP9 video. + VP9, + /// AV1 video. + AV1, +} + +/// Frame type of a pre-encoded video access unit. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum EncodedFrameType { + /// A key frame. + Key, + /// A delta frame. + Delta, +} + +/// A pre-encoded video access unit ready for passthrough publishing. +#[derive(Debug, Clone)] +pub struct EncodedVideoFrame<'a> { + /// Encoded video codec. + pub codec: EncodedVideoCodec, + /// Encoded access-unit payload. + pub payload: &'a [u8], + /// Capture timestamp in microseconds. + pub timestamp_us: i64, + /// Encoded frame type. + pub frame_type: EncodedFrameType, + /// Encoded frame width in pixels. + pub width: u32, + /// Encoded frame height in pixels. + pub height: u32, + /// Optional metadata to attach through packet trailers. 
+ pub frame_metadata: Option, +} + +#[cfg(not(target_arch = "wasm32"))] +impl From for webrtc_sys::video_track::ffi::EncodedVideoCodec { + fn from(value: EncodedVideoCodec) -> Self { + match value { + EncodedVideoCodec::H264 => Self::H264, + EncodedVideoCodec::H265 => Self::H265, + EncodedVideoCodec::VP8 => Self::VP8, + EncodedVideoCodec::VP9 => Self::VP9, + EncodedVideoCodec::AV1 => Self::AV1, + } + } +} + +#[cfg(not(target_arch = "wasm32"))] +impl From for webrtc_sys::video_track::ffi::EncodedFrameType { + fn from(value: EncodedFrameType) -> Self { + match value { + EncodedFrameType::Key => Self::Key, + EncodedFrameType::Delta => Self::Delta, + } + } +} + #[derive(Debug)] pub struct VideoFrame where diff --git a/libwebrtc/src/video_source.rs b/libwebrtc/src/video_source.rs index e88fd73de..e9d374b36 100644 --- a/libwebrtc/src/video_source.rs +++ b/libwebrtc/src/video_source.rs @@ -51,7 +51,7 @@ pub mod native { use crate::native::packet_trailer::PacketTrailerHandler; #[cfg(target_os = "linux")] use crate::video_frame::FrameMetadata; - use crate::video_frame::{VideoBuffer, VideoFrame}; + use crate::video_frame::{EncodedVideoFrame, VideoBuffer, VideoFrame}; #[derive(Clone)] pub struct NativeVideoSource { @@ -75,10 +75,27 @@ pub mod native { Self { handle: vs_imp::NativeVideoSource::new(resolution, is_screencast) } } + /// Creates a source for pre-encoded access units: no raw black-frame + /// keepalive is injected before the first capture. + pub fn new_encoded(resolution: VideoResolution) -> Self { + Self { handle: vs_imp::NativeVideoSource::new_encoded(resolution) } + } + pub fn capture_frame>(&self, frame: &VideoFrame) { self.handle.capture_frame(frame) } + /// Captures one pre-encoded video access unit. + pub fn capture_encoded_frame(&self, frame: &EncodedVideoFrame<'_>) -> bool { + self.handle.capture_encoded_frame(frame) + } + + /// Returns and clears the pending keyframe request raised by the + /// pass-through encoder (PLI/FIR or reconfiguration). 
+ pub fn take_keyframe_request(&self) -> bool { + self.handle.take_keyframe_request() + } + /// Captures a Jetson DMA-buffer backed video frame. /// /// `pixel_format` is `0` for NV12 and `1` for YUV420M. diff --git a/livekit-capture/Cargo.toml b/livekit-capture/Cargo.toml new file mode 100644 index 000000000..e4d78d499 --- /dev/null +++ b/livekit-capture/Cargo.toml @@ -0,0 +1,78 @@ +[package] +name = "livekit-capture" +description = "Capture sources and pre-encoded video publishing helpers for LiveKit" +version = "0.1.0" +readme = "README.md" +license.workspace = true +edition.workspace = true +repository.workspace = true + +[dependencies] +base64 = { workspace = true, optional = true } +bytes = { workspace = true } +gstreamer = { workspace = true, optional = true } +gstreamer-app = { workspace = true, optional = true } +image = { workspace = true, optional = true } +livekit = { workspace = true } +md-5 = { workspace = true, optional = true } +thiserror = { workspace = true } +yuv-sys = { workspace = true, features = ["jpeg"], optional = true } + +[features] +default = [] +avfoundation = [ + "dep:dispatch2", + "dep:objc2", + "dep:objc2-av-foundation", + "dep:objc2-core-media", + "dep:objc2-core-video", + "dep:objc2-foundation", + "dep:yuv-sys", + "objc2-av-foundation/AVCaptureDevice", + "objc2-av-foundation/AVCaptureInput", + "objc2-av-foundation/AVCaptureOutputBase", + "objc2-av-foundation/AVCaptureSession", + "objc2-av-foundation/AVCaptureSessionPreset", + "objc2-av-foundation/AVCaptureVideoDataOutput", + "objc2-av-foundation/AVMediaFormat", + "objc2-av-foundation/AVVideoSettings", + "objc2-av-foundation/dispatch2", + "objc2-av-foundation/objc2-core-media", + "objc2-core-media/CMFormatDescription", + "objc2-core-media/CMSync", + "objc2-core-media/CMTime", + "objc2-core-media/CMSampleBuffer", + "objc2-core-media/objc2-core-video", + "objc2-core-video/CVBase", + "objc2-core-video/CVBuffer", + "objc2-core-video/CVImageBuffer", + "objc2-core-video/CVPixelBuffer", + 
"objc2-core-video/CVReturn", + "objc2-foundation/NSArray", + "objc2-foundation/NSDictionary", + "objc2-foundation/NSError", + "objc2-foundation/NSObject", + "objc2-foundation/NSValue", + "objc2-foundation/NSString", + "objc2-foundation/objc2-core-foundation", +] +gstreamer = ["dep:gstreamer", "dep:gstreamer-app"] +libargus = [] +rtsp = ["dep:base64", "dep:md-5"] +tcpsink = [] +v4l = ["dep:image", "dep:libc", "dep:v4l", "dep:yuv-sys"] + +[build-dependencies] +cc = { workspace = true } + +[target.'cfg(target_os = "macos")'.dependencies] +dispatch2 = { version = "0.3.1", default-features = false, features = ["std"], optional = true } +objc2 = { version = "0.6.4", default-features = false, features = ["std"], optional = true } +objc2-av-foundation = { version = "0.3.2", default-features = false, optional = true } +objc2-core-media = { version = "0.3.2", default-features = false, optional = true } +objc2-core-video = { version = "0.3.2", default-features = false, optional = true } +objc2-foundation = { version = "0.3.2", default-features = false, features = ["std"], optional = true } + +[target.'cfg(target_os = "linux")'.dependencies] +libc = { version = "0.2", optional = true } +v4l = { version = "0.14", default-features = false, features = ["v4l2"], optional = true } diff --git a/livekit-capture/README.md b/livekit-capture/README.md new file mode 100644 index 000000000..6d81f3bc9 --- /dev/null +++ b/livekit-capture/README.md @@ -0,0 +1,273 @@ +# livekit-capture + +Capture helpers for publishing decoded, native platform, DMA-BUF, and +pre-encoded video frames with the LiveKit Rust SDK. Optional source features +include `avfoundation`, `libargus`, `v4l`, `tcpsink`, `rtsp`, and `gstreamer`. + +## Library entry points + +- `VideoCaptureSource::open(CaptureSourceOptions)` — one facade over every + backend. 
Camera backends (`AvFoundation`, `V4l2`, `LibArgus`, `Auto`) are + selected with device/format options; the encoded ingest backends (`Rtsp`, + `Tcp`, `Gstreamer`) take an `EncodedEndpoint` describing the URL, socket, or + `gst-launch` description. `publish_next(&track)` pumps one frame and returns + `Ok(false)` at end of stream; `stop()` interrupts a blocked capture. +- `VideoCaptureTrack::new` for decoded/native/DMA-BUF publishing and + `VideoCaptureTrack::new_encoded` for pre-encoded passthrough (no raw + keepalive frames, so the sender starts directly on the passthrough encoder). +- `EncodedIngress` — the lower-level pre-encoded pump used when the caller + manages its own source: `capture_next()` reports each published access unit, + `stop_handle()` cancels from any thread, and downstream keyframe requests + (PLI/FIR) are forwarded to the source automatically. The GStreamer source + answers them with a `GstForceKeyUnit` upstream event; passthrough is + single-layer (`L1T1`), and access units carrying other layering metadata are + rejected. +- `sources::gstreamer::ensure_encoded_appsink` and friends turn an arbitrary + pipeline (containing `appsink name=lk_appsink` or one unlinked encoded pad) + into an encoded source; `encoded_caps_string` is the single per-codec caps + table. 
+ +## Pre-encoded source modes + +The `preencode_publish` example publishes H.264, H.265, VP8, VP9, and AV1 +access units from these sources: + +| Source | Feature | Input shape | +| --- | --- | --- | +| `gstappsink` | `gstreamer` | Generated or custom GStreamer pipeline ending in `appsink` or one unlinked encoded pad | +| `tcpsink` | `tcpsink` | TCP connection to an encoded byte-stream or RFC4571 RTP producer | +| `shmsink` | `gstreamer` | GStreamer `shmsink` producer read through `shmsrc` | +| `rtsp` | `rtsp` | RTSP over TCP with interleaved RTP video | + +H.264/H.265 TCP defaults to Annex-B byte streams, while VP8, VP9, and AV1 use +RTP framing over TCP because those codecs need explicit frame boundaries. + +## Pre-encoded test sources + +The example ships GStreamer fixture scripts that exercise the H.264, H.265, +VP8, VP9, and AV1 capture paths with an animated `videotestsrc` at +`1280x720@30fps`. Generated encoder pipelines force 8-bit I420 input, and VP9 +fixture caps are pinned to profile 0 to match the WebRTC passthrough profile. + +Before running a publisher, provide LiveKit credentials through the environment +or command-line flags: + +```sh +export LIVEKIT_URL=wss://example.livekit.cloud +export LIVEKIT_API_KEY=devkey +export LIVEKIT_API_SECRET=secret +``` + +All scripts require `--codec h264|h265|vp8|vp9|av1` and also accept `--width`, +`--height`, `--fps`, `--bitrate-kbps`, and `--print`; the defaults match the +test profile above. 
+ +### Local SFU example + +Run a local LiveKit server in dev mode and use its dev credentials in the +publisher examples: + +```sh +livekit-server --dev --bind 0.0.0.0 +``` + +```sh +export LIVEKIT_URL=ws://127.0.0.1:7880 +export LIVEKIT_API_KEY=devkey +export LIVEKIT_API_SECRET=secret +``` + +Run a subscriber in another terminal to verify the negotiated codec and decoder +health: + +```sh +cargo run -p local_video --features desktop --bin subscriber -- \ + --url "$LIVEKIT_URL" \ + --api-key "$LIVEKIT_API_KEY" \ + --api-secret "$LIVEKIT_API_SECRET" \ + --room-name video-room \ + --identity sub-vp8 \ + --participant gst-vp8-pub \ + --display-timestamp +``` + +Then publish a pre-encoded GStreamer fixture: + +```sh +cargo run -p preencode_publish --features gstreamer -- \ + --source gstappsink \ + --codec vp8 \ + --url "$LIVEKIT_URL" \ + --api-key "$LIVEKIT_API_KEY" \ + --api-secret "$LIVEKIT_API_SECRET" \ + --room-name video-room \ + --identity gst-vp8-pub \ + --width 1280 \ + --height 720 \ + --fps 30 \ + --diagnostics +``` + +Expected publisher signs are a successful room connection, a +`Published pre-encoded ... track at 1280x720` log line, and diagnostics near +30 access units per second. A healthy subscriber shows a matching +`Subscribed to video track` codec and rising decoded-frame counts with low loss +and no repeated PLI loop. + +### GStreamer `gstappsink` source + +Exercises +`GStreamer videotestsrc -> encoder -> appsink -> GStreamerAppSinkEncodedSource -> VideoCaptureTrack`. + +```sh +cargo run -p preencode_publish --features gstreamer -- \ + --source gstappsink \ + --codec h264 \ + --width 1280 \ + --height 720 \ + --fps 30 \ + --room-name video-room \ + --identity gst-h264-pub \ + --diagnostics +``` + +For H.265, VP8, VP9, or AV1, change `--codec` accordingly. + +Custom GStreamer launch fragments can be passed after `--`. 
If the pipeline +does not include `appsink name=lk_appsink`, it must leave exactly one encoded +video source pad unlinked so the example can attach codec-specific parsing, +caps, and appsink: + +```sh +cargo run -p preencode_publish --features gstreamer -- \ + --source gstappsink \ + --codec h264 \ + --width 1280 \ + --height 720 \ + --fps 30 \ + --room-name video-room \ + --identity custom-gst-h264-pub \ + --diagnostics \ + -- \ + 'videotestsrc is-live=true do-timestamp=true ! video/x-raw,width=1280,height=720,framerate=30/1 ! videoconvert ! x264enc tune=zerolatency speed-preset=ultrafast key-int-max=30 byte-stream=true aud=true' +``` + +### TCP `tcpsink` source + +Exercises +`GStreamer videotestsrc -> encoder -> tcpserversink -> TcpEncodedSource -> VideoCaptureTrack`. +The `tcpsink` source connects to a TCP producer such as the fixture script's +GStreamer `tcpserversink`. + +Start the producer, then publish: + +```sh +examples/preencode_publish/scripts/run-tcp-test-source.sh --codec h264 --port 5000 +``` + +```sh +cargo run -p preencode_publish -- \ + --source tcpsink \ + --host 127.0.0.1:5000 \ + --codec h264 \ + --width 1280 \ + --height 720 \ + --fps 30 \ + --room-name video-room \ + --identity tcp-h264-pub \ + --diagnostics +``` + +For H.265, use `--codec h265` in both commands. For VP8, VP9, or AV1, use the +same script with the matching `--codec` and add `--tcp-format auto` to the +publisher, which selects RTP automatically: + +```sh +cargo run -p preencode_publish -- \ + --source tcpsink \ + --host 127.0.0.1:5000 \ + --codec vp8 \ + --tcp-format auto \ + --width 1280 \ + --height 720 \ + --fps 30 \ + --room-name video-room \ + --identity tcp-vp8-pub \ + --diagnostics +``` + +### Shared-memory `shmsink` source + +Exercises +`GStreamer videotestsrc -> encoder -> shmsink -> shmsrc -> GStreamerAppSinkEncodedSource -> VideoCaptureTrack`. 
+ +Start the producer, then publish by connecting the `shmsink` source to that +socket: + +```sh +examples/preencode_publish/scripts/run-shm-test-source.sh \ + --codec h264 \ + --socket-path /tmp/livekit-preencode-h264.shm +``` + +```sh +cargo run -p preencode_publish --features gstreamer -- \ + --source shmsink \ + --codec h264 \ + --shmsink-socket-path /tmp/livekit-preencode-h264.shm \ + --width 1280 \ + --height 720 \ + --fps 30 \ + --room-name video-room \ + --identity shm-h264-pub \ + --diagnostics +``` + +For H.265, VP8, or VP9, use the same command shape with the matching `--codec` +(and a different socket path if desired). + +```sh +cargo run -p preencode_publish --features gstreamer -- \ + --source shmsink \ + --codec av1 \ + --shmsink-socket-path /tmp/livekit-preencode-av1.shm \ + --width 1280 \ + --height 720 \ + --fps 30 \ + --room-name video-room \ + --identity shm-av1-pub \ + --diagnostics +``` + +### RTSP source + +Exercises +`GStreamer videotestsrc -> encoder -> RTP payloader -> gst-rtsp-server -> RtspEncodedSource -> VideoCaptureTrack`. + +Start the RTSP server (the script uses the `test-launch` tool from +`gst-rtsp-server` and serves `/test`), then publish: + +```sh +examples/preencode_publish/scripts/run-rtsp-test-source.sh --codec h264 --port 8555 +``` + +```sh +cargo run -p preencode_publish -- \ + --source rtsp \ + --rtsp-url rtsp://127.0.0.1:8555/test \ + --codec h264 \ + --width 1280 \ + --height 720 \ + --fps 30 \ + --room-name video-room \ + --identity rtsp-h264-pub \ + --diagnostics +``` + +For H.265, use `--codec h265` in both commands. For VP8, VP9, or AV1, use the +matching `--codec` in both commands; the RTSP fixture switches to `rtpvp8pay`, +`rtpvp9pay`, or `rtpav1pay` automatically. + +Publisher-side success signs are a successful room connection, a +`Published pre-encoded ... track at 1280x720` log line, and diagnostics near +30 access units per second. 
diff --git a/livekit-capture/build.rs b/livekit-capture/build.rs new file mode 100644 index 000000000..296eb6d04 --- /dev/null +++ b/livekit-capture/build.rs @@ -0,0 +1,65 @@ +// Copyright 2026 LiveKit, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::path::PathBuf; + +fn main() { + println!("cargo:rustc-check-cfg=cfg(livekit_capture_argus)"); + println!("cargo:rerun-if-env-changed=JETSON_MULTIMEDIA_API_DIR"); + + if std::env::var_os("CARGO_FEATURE_LIBARGUS").is_none() { + return; + } + + let target_os = std::env::var("CARGO_CFG_TARGET_OS").unwrap_or_default(); + let target_arch = std::env::var("CARGO_CFG_TARGET_ARCH").unwrap_or_default(); + if target_os != "linux" || target_arch != "aarch64" { + return; + } + + let mmapi_root = std::env::var_os("JETSON_MULTIMEDIA_API_DIR") + .map(PathBuf::from) + .unwrap_or_else(|| PathBuf::from("/usr/src/jetson_multimedia_api")); + let argus_include = mmapi_root.join("argus/include"); + let mmapi_include = mmapi_root.join("include"); + + if !argus_include.exists() || !mmapi_include.exists() { + println!( + "cargo:warning=Argus headers not found under {}; skipping libargus capture shim", + mmapi_root.display() + ); + return; + } + + println!("cargo:rerun-if-changed=src/sources/lk_argus.cpp"); + + cc::Build::new() + .cpp(true) + .file("src/sources/lk_argus.cpp") + .include(&argus_include) + .include(&mmapi_include) + .flag("-std=c++14") + .flag("-Wno-deprecated-declarations") + .compile("lk_argus"); + + 
println!("cargo:rustc-cfg=livekit_capture_argus"); + println!("cargo:rustc-link-lib=dylib=nvargus_socketclient"); + println!("cargo:rustc-link-lib=dylib=nvbufsurface"); + + let tegra_lib_dir = PathBuf::from("/usr/lib/aarch64-linux-gnu/tegra"); + if tegra_lib_dir.exists() { + println!("cargo:rustc-link-search=native={}", tegra_lib_dir.display()); + } + println!("cargo:rustc-link-search=native=/usr/lib/aarch64-linux-gnu"); +} diff --git a/livekit-capture/src/device.rs b/livekit-capture/src/device.rs new file mode 100644 index 000000000..734f85834 --- /dev/null +++ b/livekit-capture/src/device.rs @@ -0,0 +1,307 @@ +// Copyright 2026 LiveKit, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::fmt; + +use livekit::webrtc::video_source::VideoResolution; +use thiserror::Error; + +/// Capture backend used by a source implementation. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +#[non_exhaustive] +pub enum CaptureBackend { + /// Let `livekit-capture` choose the platform default backend. + Auto, + /// macOS AVFoundation camera capture. + AvFoundation, + /// Linux Video4Linux2 camera capture. + V4l2, + /// NVIDIA Jetson libargus camera capture. + LibArgus, + /// RTSP encoded ingress. + Rtsp, + /// TCP byte-stream encoded ingress. + Tcp, + /// GStreamer appsink encoded ingress. + Gstreamer, +} + +impl CaptureBackend { + /// Returns a stable backend name. 
+ pub const fn as_str(self) -> &'static str { + match self { + Self::Auto => "auto", + Self::AvFoundation => "avfoundation", + Self::V4l2 => "v4l2", + Self::LibArgus => "libargus", + Self::Rtsp => "rtsp", + Self::Tcp => "tcp", + Self::Gstreamer => "gstreamer", + } + } +} + +impl fmt::Display for CaptureBackend { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.write_str(self.as_str()) + } +} + +/// Capture path used by a source implementation. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +#[non_exhaustive] +pub enum CapturePath { + /// Platform-native uncompressed frame buffers. + Native, + /// Uncompressed CPU-accessible frame buffers. + Raw, + /// Linux DMA-BUF backed frames. + DmaBuf, + /// Compressed encoded access units. + Encoded, +} + +/// Error returned while querying capture devices. +#[derive(Debug, Error)] +#[non_exhaustive] +pub enum CaptureDeviceQueryError { + /// The backend does not support device enumeration on this target or build. + #[error("capture backend {0} does not support device enumeration")] + UnsupportedBackend(CaptureBackend), + /// The backend failed while querying devices. + #[error("capture backend {backend} device query failed: {message}")] + Backend { + /// Backend that failed. + backend: CaptureBackend, + /// Backend error message. + message: String, + }, +} + +/// Capture device discovered by a platform backend. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct CaptureDeviceInfo { + /// Backend that reported this device. + pub backend: CaptureBackend, + /// Backend-stable device identifier. + pub id: String, + /// Preferred selector that reopens this exact device. + pub selector: CaptureDeviceSelector, + /// Human-readable device name. + pub name: String, + /// Device model identifier, when available. + pub model_id: Option, + /// Device manufacturer, when available. + pub manufacturer: Option, + /// Capture paths supported by this device. + pub paths: Vec, + /// Capture formats reported by the backend. 
+ pub formats: Vec, + /// Whether [`CaptureDeviceInfo::formats`] is a complete backend-reported list. + pub formats_complete: bool, +} + +/// Device selector used by capture backends. +#[derive(Debug, Clone, PartialEq, Eq)] +#[non_exhaustive] +pub enum CaptureDeviceSelector { + /// Use the backend default video device. + Default, + /// Use the device at the backend enumeration index. + Index(usize), + /// Use a backend-stable device identifier. + Id(String), +} + +/// Frame format used by a raw-frame capture backend. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +#[non_exhaustive] +pub enum CaptureFrameFormat { + /// Planar I420/YUV420P. + I420, + /// Biplanar NV12. + Nv12, + /// Packed BGRA. + Bgra, + /// Packed RGB24. + Rgb24, + /// Packed BGR24. + Bgr24, + /// Packed YUYV/YUY2. + Yuyv, + /// Packed UYVY. + Uyvy, + /// Single-plane 8-bit luma. + Grey, + /// Encoded MJPEG frames. + Mjpeg, +} + +impl CaptureFrameFormat { + /// Returns a stable lower-case frame-format name. + pub const fn as_str(self) -> &'static str { + match self { + Self::I420 => "i420", + Self::Nv12 => "nv12", + Self::Bgra => "bgra", + Self::Rgb24 => "rgb24", + Self::Bgr24 => "bgr24", + Self::Yuyv => "yuyv", + Self::Uyvy => "uyvy", + Self::Grey => "grey", + Self::Mjpeg => "mjpeg", + } + } +} + +impl fmt::Display for CaptureFrameFormat { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.write_str(self.as_str()) + } +} + +impl std::str::FromStr for CaptureFrameFormat { + type Err = CaptureFrameFormatParseError; + + fn from_str(value: &str) -> Result { + match value.to_ascii_lowercase().as_str() { + "i420" | "yuv420p" => Ok(Self::I420), + "nv12" => Ok(Self::Nv12), + "bgra" => Ok(Self::Bgra), + "rgb24" | "rgb" => Ok(Self::Rgb24), + "bgr24" | "bgr" => Ok(Self::Bgr24), + "yuyv" | "yuy2" => Ok(Self::Yuyv), + "uyvy" => Ok(Self::Uyvy), + "grey" | "greyscale" => Ok(Self::Grey), + "mjpeg" | "mjpg" => Ok(Self::Mjpeg), + _ => Err(CaptureFrameFormatParseError), + } + } +} + +/// Error returned 
when parsing a [`CaptureFrameFormat`] from a string. +#[derive(Debug, Clone, Copy, Error, PartialEq, Eq)] +#[error("unknown capture frame format")] +pub struct CaptureFrameFormatParseError; + +/// Pixel dimensions for a capture format. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct CaptureResolution { + /// Frame width in pixels. + pub width: u32, + /// Frame height in pixels. + pub height: u32, +} + +impl CaptureResolution { + /// Creates a capture resolution. + pub const fn new(width: u32, height: u32) -> Self { + Self { width, height } + } +} + +impl From for VideoResolution { + fn from(value: CaptureResolution) -> Self { + Self { width: value.width, height: value.height } + } +} + +/// Raw-frame capture format. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct CaptureFormat { + /// Frame dimensions. + pub resolution: CaptureResolution, + /// Frame rate in frames per second. + pub frame_rate: u32, + /// Frame format. + pub frame_format: CaptureFrameFormat, +} + +impl CaptureFormat { + /// Creates a raw-frame capture format. + pub const fn new( + resolution: CaptureResolution, + frame_rate: u32, + frame_format: CaptureFrameFormat, + ) -> Self { + Self { resolution, frame_rate, frame_format } + } +} + +/// Format selection requested from a capture backend. +#[derive(Debug, Clone, PartialEq, Eq)] +#[non_exhaustive] +pub enum CaptureFormatRequest { + /// Let the backend choose its default format. + Default, + /// Require an exact format match. + Exact(CaptureFormat), + /// Use the backend's closest supported format. + Closest(CaptureFormat), + /// Prefer the highest frame rate, optionally constrained by resolution and frame format. + HighestFrameRate { + /// Optional resolution constraint. + resolution: Option, + /// Optional frame format constraint. + frame_format: Option, + }, + /// Prefer the highest resolution, optionally constrained by frame rate and frame format. + HighestResolution { + /// Optional frame-rate constraint. 
+ frame_rate: Option, + /// Optional frame format constraint. + frame_format: Option, + }, +} + +#[cfg(test)] +mod tests { + use super::*; + use std::str::FromStr; + + #[test] + fn capture_frame_format_parses_common_names() { + assert_eq!(CaptureFrameFormat::from_str("MJPEG"), Ok(CaptureFrameFormat::Mjpeg)); + assert_eq!(CaptureFrameFormat::from_str("mjpg"), Ok(CaptureFrameFormat::Mjpeg)); + assert_eq!(CaptureFrameFormat::from_str("grey"), Ok(CaptureFrameFormat::Grey)); + assert_eq!(CaptureFrameFormat::from_str("GREY"), Ok(CaptureFrameFormat::Grey)); + assert_eq!(CaptureFrameFormat::from_str("yuy2"), Ok(CaptureFrameFormat::Yuyv)); + } + + #[test] + fn capture_frame_format_displays_canonical_names() { + assert_eq!(CaptureFrameFormat::Mjpeg.to_string(), "mjpeg"); + assert_eq!(CaptureFrameFormat::Grey.to_string(), "grey"); + } + + #[test] + fn device_info_can_report_incomplete_format_lists() { + let info = CaptureDeviceInfo { + backend: CaptureBackend::AvFoundation, + id: "camera-0".to_string(), + selector: CaptureDeviceSelector::Id("camera-0".to_string()), + name: "Camera".to_string(), + model_id: None, + manufacturer: None, + paths: vec![CapturePath::Native, CapturePath::Raw], + formats: Vec::new(), + formats_complete: false, + }; + + assert_eq!(info.backend, CaptureBackend::AvFoundation); + assert_eq!(info.selector, CaptureDeviceSelector::Id("camera-0".to_string())); + assert_eq!(info.paths, vec![CapturePath::Native, CapturePath::Raw]); + assert!(!info.formats_complete); + } +} diff --git a/livekit-capture/src/dmabuf.rs b/livekit-capture/src/dmabuf.rs new file mode 100644 index 000000000..041f2c545 --- /dev/null +++ b/livekit-capture/src/dmabuf.rs @@ -0,0 +1,62 @@ +// Copyright 2026 LiveKit, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/// DMA-BUF pixel format. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum DmaBufPixelFormat { + /// NV12 biplanar format. + Nv12, + /// YUV420M multiplanar format. + Yuv420M, +} + +impl DmaBufPixelFormat { + #[cfg(target_os = "linux")] + pub(crate) fn as_native(self) -> i32 { + match self { + Self::Nv12 => 0, + Self::Yuv420M => 1, + } + } +} + +/// One DMA-BUF plane descriptor. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct DmaBufPlane { + /// DMA-BUF file descriptor. + pub fd: i32, + /// Plane byte offset. + pub offset: u32, + /// Plane byte stride. + pub stride: u32, +} + +/// One DMA-BUF backed captured frame. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct DmaBufFrame { + /// Frame width in pixels. + pub width: u32, + /// Frame height in pixels. + pub height: u32, + /// Pixel format. + pub pixel_format: DmaBufPixelFormat, + /// DMA-BUF planes. + pub planes: Vec, + /// Optional DRM format modifier. + pub modifier: Option, + /// Capture timestamp in microseconds. + pub timestamp_us: i64, + /// Sensor timestamp translated to UNIX-epoch microseconds, when available. + pub sensor_timestamp_us: Option, +} diff --git a/livekit-capture/src/encoded.rs b/livekit-capture/src/encoded.rs new file mode 100644 index 000000000..11843551a --- /dev/null +++ b/livekit-capture/src/encoded.rs @@ -0,0 +1,508 @@ +// Copyright 2026 LiveKit, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +pub mod h26x; +pub mod ingress; +pub mod rtp; + +use bytes::Bytes; +use livekit::{ + options::VideoCodec, + webrtc::video_frame::{ + EncodedFrameType as RtcEncodedFrameType, EncodedVideoCodec as RtcEncodedVideoCodec, + }, +}; + +use crate::error::CaptureError; + +const ANNEX_B_START_CODE: [u8; 4] = [0, 0, 0, 1]; + +/// Encoded byte-stream framing used by encoded source backends. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +#[non_exhaustive] +pub enum EncodedWireFormat { + /// H.264 Annex-B byte stream. + H264AnnexB, + /// H.264/AVC byte stream with length-prefixed NAL units. + /// + /// `nal_length_size` is the number of big-endian length bytes before each NAL unit. Values + /// from 1 through 4 are accepted; 4 is the common AVC configuration. + H264Avc { + /// Length-prefix size in bytes. + nal_length_size: u8, + }, + /// H.265 Annex-B byte stream. + H265AnnexB, + /// RTP packets for the supplied codec and RTP clock rate. + Rtp { + /// RTP payload codec. + codec: EncodedVideoCodec, + /// RTP timestamp clock rate. + clock_rate: u32, + }, + /// MPEG transport stream carrying encoded video. + MpegTs, +} + +/// Encoded video codec carried by an [`EncodedAccessUnit`]. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +#[non_exhaustive] +pub enum EncodedVideoCodec { + /// H.264/AVC video. + H264, + /// H.265/HEVC video. + H265, + /// VP8 video. + VP8, + /// VP9 video. + VP9, + /// AV1 video. + AV1, +} + +/// Encoded video frame type. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum EncodedFrameType { + /// A key frame. 
+ Key, + /// A delta frame. + Delta, +} + +/// Layer identifiers associated with an encoded frame. +#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)] +pub struct EncodedLayerInfo { + /// Spatial layer index, when present. + pub spatial_id: Option, + /// Temporal layer index, when present. + pub temporal_id: Option, +} + +/// H.264 packetization mode for passthrough metadata. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum H264PacketizationMode { + /// Non-interleaved packetization mode. + NonInterleaved, +} + +/// Codec-specific metadata for encoded passthrough. +#[derive(Debug, Clone, PartialEq, Eq)] +#[non_exhaustive] +pub enum CodecSpecific { + /// No codec-specific metadata. + None, + /// H.264-specific metadata. + H264 { + /// H.264 RTP packetization mode. + packetization_mode: H264PacketizationMode, + }, + /// H.265-specific metadata. + H265, + /// VP8-specific metadata. + VP8 { + /// Temporal layer index, when present. + temporal_id: Option, + /// Whether this frame synchronizes a temporal layer. + layer_sync: bool, + }, + /// VP9-specific metadata. + VP9 { + /// Temporal layer index, when present. + temporal_id: Option, + /// Spatial layer index, when present. + spatial_id: Option, + /// Whether this frame depends on an inter-layer reference. + inter_layer_predicted: Option, + }, + /// AV1-specific metadata. + AV1 { + /// RTP scalability mode, such as `L1T1`. + scalability_mode: Option, + /// Encoded dependency descriptor bytes, when supplied by the caller. + dependency_descriptor: Option>, + }, +} + +impl Default for CodecSpecific { + fn default() -> Self { + Self::None + } +} + +impl CodecSpecific { + /// Returns the single-layer default metadata for a codec, matching what + /// the passthrough encoder synthesizes on the wire. 
+ pub fn default_for(codec: EncodedVideoCodec) -> Self { + match codec { + EncodedVideoCodec::H264 => { + Self::H264 { packetization_mode: H264PacketizationMode::NonInterleaved } + } + EncodedVideoCodec::H265 => Self::H265, + EncodedVideoCodec::VP8 => Self::VP8 { temporal_id: None, layer_sync: false }, + EncodedVideoCodec::VP9 => { + Self::VP9 { temporal_id: None, spatial_id: None, inter_layer_predicted: None } + } + EncodedVideoCodec::AV1 => { + Self::AV1 { scalability_mode: Some("L1T1".to_owned()), dependency_descriptor: None } + } + } + } +} + +/// Borrowed encoded payload fragment. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct EncodedFragment<'a> { + /// Encoded fragment bytes. + pub bytes: &'a [u8], +} + +/// Encoded access-unit payload. +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum EncodedPayload<'a> { + /// One contiguous payload buffer. + Contiguous(&'a [u8]), + /// Multiple payload fragments. + Fragments(&'a [EncodedFragment<'a>]), + /// Owned payload bytes. + Owned(Vec), +} + +impl EncodedPayload<'_> { + pub(crate) fn is_empty(&self) -> bool { + match self { + Self::Contiguous(bytes) => bytes.is_empty(), + Self::Fragments(fragments) => { + fragments.is_empty() || fragments.iter().any(|fragment| fragment.bytes.is_empty()) + } + Self::Owned(bytes) => bytes.is_empty(), + } + } + + pub(crate) fn to_vec(&self) -> Vec { + match self { + Self::Contiguous(bytes) => bytes.to_vec(), + Self::Fragments(fragments) => { + let len = fragments.iter().map(|fragment| fragment.bytes.len()).sum(); + let mut payload = Vec::with_capacity(len); + for fragment in *fragments { + payload.extend_from_slice(fragment.bytes); + } + payload + } + Self::Owned(bytes) => bytes.clone(), + } + } +} + +/// One encoded video access unit. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct EncodedAccessUnit<'a> { + /// Encoded codec. + pub codec: EncodedVideoCodec, + /// Encoded payload. + pub payload: EncodedPayload<'a>, + /// Capture timestamp in microseconds. 
+ pub timestamp_us: i64, + /// Encoded frame type. + pub frame_type: EncodedFrameType, + /// Encoded frame width in pixels. + pub width: u32, + /// Encoded frame height in pixels. + pub height: u32, + /// Optional layer identifiers. + pub layers: EncodedLayerInfo, + /// Optional codec-specific metadata. + pub codec_specific: CodecSpecific, +} + +/// Owned encoded video access unit. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct OwnedEncodedAccessUnit { + /// Encoded codec. + pub codec: EncodedVideoCodec, + /// Encoded payload bytes. + pub payload: Bytes, + /// Capture timestamp in microseconds. + pub timestamp_us: i64, + /// Encoded frame type. + pub frame_type: EncodedFrameType, + /// Encoded frame width in pixels. + pub width: u32, + /// Encoded frame height in pixels. + pub height: u32, + /// Optional layer identifiers. + pub layers: EncodedLayerInfo, + /// Optional codec-specific metadata. + pub codec_specific: CodecSpecific, +} + +impl OwnedEncodedAccessUnit { + /// Creates an owned encoded access unit from contiguous bytes. + pub fn new( + codec: EncodedVideoCodec, + payload: impl Into, + timestamp_us: i64, + frame_type: EncodedFrameType, + width: u32, + height: u32, + ) -> Self { + Self { + codec, + payload: payload.into(), + timestamp_us, + frame_type, + width, + height, + layers: EncodedLayerInfo::default(), + codec_specific: CodecSpecific::None, + } + } + + /// Borrows this owned access unit as an [`EncodedAccessUnit`]. + pub fn as_access_unit(&self) -> EncodedAccessUnit<'_> { + EncodedAccessUnit { + codec: self.codec, + payload: EncodedPayload::Contiguous(&self.payload), + timestamp_us: self.timestamp_us, + frame_type: self.frame_type, + width: self.width, + height: self.height, + layers: self.layers, + codec_specific: self.codec_specific.clone(), + } + } + + /// Creates an owned access unit by copying a borrowed access unit. 
+ pub fn copy_from(access_unit: &EncodedAccessUnit<'_>) -> Self { + Self { + codec: access_unit.codec, + payload: Bytes::from(access_unit.payload.to_vec()), + timestamp_us: access_unit.timestamp_us, + frame_type: access_unit.frame_type, + width: access_unit.width, + height: access_unit.height, + layers: access_unit.layers, + codec_specific: access_unit.codec_specific.clone(), + } + } +} + +impl<'a> EncodedAccessUnit<'a> { + /// Creates an access unit from one contiguous payload. + pub fn contiguous( + codec: EncodedVideoCodec, + payload: &'a [u8], + timestamp_us: i64, + frame_type: EncodedFrameType, + width: u32, + height: u32, + ) -> Self { + Self { + codec, + payload: EncodedPayload::Contiguous(payload), + timestamp_us, + frame_type, + width, + height, + layers: EncodedLayerInfo::default(), + codec_specific: CodecSpecific::None, + } + } + + /// Creates an H.264 access unit from raw NAL-unit payloads. + pub fn from_h264_nalus( + nal_units: &[&[u8]], + timestamp_us: i64, + width: u32, + height: u32, + ) -> Result, CaptureError> { + Self::from_nalus(EncodedVideoCodec::H264, nal_units, timestamp_us, width, height) + } + + /// Creates an H.265 access unit from raw NAL-unit payloads. 
+ pub fn from_h265_nalus( + nal_units: &[&[u8]], + timestamp_us: i64, + width: u32, + height: u32, + ) -> Result, CaptureError> { + Self::from_nalus(EncodedVideoCodec::H265, nal_units, timestamp_us, width, height) + } + + fn from_nalus( + codec: EncodedVideoCodec, + nal_units: &[&[u8]], + timestamp_us: i64, + width: u32, + height: u32, + ) -> Result, CaptureError> { + let is_key = is_keyframe_nalus(codec, nal_units)?; + Ok(EncodedAccessUnit { + codec, + payload: EncodedPayload::Owned(annex_b_payload(nal_units)?), + timestamp_us, + frame_type: if is_key { EncodedFrameType::Key } else { EncodedFrameType::Delta }, + width, + height, + layers: EncodedLayerInfo::default(), + codec_specific: CodecSpecific::default_for(codec), + }) + } +} + +/// Returns true when any NAL unit in the slice is an intra/key picture. +pub(crate) fn is_keyframe_nalus( + codec: EncodedVideoCodec, + nal_units: &[&[u8]], +) -> Result { + match codec { + EncodedVideoCodec::H264 => { + nal_units.iter().try_fold(false, |is_key, nal| Ok(is_key || h264_nal_type(nal)? 
== 5)) + } + EncodedVideoCodec::H265 => nal_units.iter().try_fold(false, |is_key, nal| { + let nal_type = h265_nal_type(nal)?; + Ok(is_key || (16..=21).contains(&nal_type)) + }), + EncodedVideoCodec::VP8 | EncodedVideoCodec::VP9 | EncodedVideoCodec::AV1 => { + Err(CaptureError::UnsupportedCodec(codec)) + } + } +} + +impl From for VideoCodec { + fn from(value: EncodedVideoCodec) -> Self { + match value { + EncodedVideoCodec::H264 => Self::H264, + EncodedVideoCodec::H265 => Self::H265, + EncodedVideoCodec::VP8 => Self::VP8, + EncodedVideoCodec::VP9 => Self::VP9, + EncodedVideoCodec::AV1 => Self::AV1, + } + } +} + +impl From for RtcEncodedVideoCodec { + fn from(value: EncodedVideoCodec) -> Self { + match value { + EncodedVideoCodec::H264 => Self::H264, + EncodedVideoCodec::H265 => Self::H265, + EncodedVideoCodec::VP8 => Self::VP8, + EncodedVideoCodec::VP9 => Self::VP9, + EncodedVideoCodec::AV1 => Self::AV1, + } + } +} + +impl From for RtcEncodedFrameType { + fn from(value: EncodedFrameType) -> Self { + match value { + EncodedFrameType::Key => Self::Key, + EncodedFrameType::Delta => Self::Delta, + } + } +} + +pub(crate) fn h264_nal_type(nal: &[u8]) -> Result { + let header = nal.first().ok_or(CaptureError::EmptyPayload)?; + Ok(header & 0x1f) +} + +pub(crate) fn h265_nal_type(nal: &[u8]) -> Result { + if nal.is_empty() { + return Err(CaptureError::EmptyPayload); + } + if nal.len() < 2 { + return Err(CaptureError::H265NalTooShort); + } + Ok((nal[0] >> 1) & 0x3f) +} + +pub(crate) fn annex_b_payload(nal_units: &[&[u8]]) -> Result, CaptureError> { + if nal_units.is_empty() { + return Err(CaptureError::EmptyPayload); + } + let len = nal_units.iter().try_fold(0usize, |len, nal| { + if nal.is_empty() { + Err(CaptureError::EmptyPayload) + } else { + Ok(len + ANNEX_B_START_CODE.len() + nal.len()) + } + })?; + + let mut payload = Vec::with_capacity(len); + for nal in nal_units { + payload.extend_from_slice(&ANNEX_B_START_CODE); + payload.extend_from_slice(nal); + } + Ok(payload) 
+} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn h264_nal_helper_assembles_annex_b_and_detects_keyframe() { + let sps = [0x67, 1, 2, 3]; + let idr = [0x65, 4, 5, 6]; + let au = EncodedAccessUnit::from_h264_nalus(&[&sps, &idr], 10, 640, 480).unwrap(); + + assert_eq!(au.codec, EncodedVideoCodec::H264); + assert_eq!(au.frame_type, EncodedFrameType::Key); + assert_eq!( + au.payload, + EncodedPayload::Owned(vec![0, 0, 0, 1, 0x67, 1, 2, 3, 0, 0, 0, 1, 0x65, 4, 5, 6]) + ); + } + + #[test] + fn h265_nal_helper_detects_irap_keyframe() { + let vps = [0x40, 1, 2]; + let idr_w_radl = [19 << 1, 1, 3]; + let au = EncodedAccessUnit::from_h265_nalus(&[&vps, &idr_w_radl], 10, 640, 480).unwrap(); + + assert_eq!(au.codec, EncodedVideoCodec::H265); + assert_eq!(au.frame_type, EncodedFrameType::Key); + } + + #[test] + fn h265_rejects_too_short_nal_header() { + let err = EncodedAccessUnit::from_h265_nalus(&[&[0x26]], 10, 640, 480).unwrap_err(); + assert_eq!(err, CaptureError::H265NalTooShort); + } + + #[test] + fn fragments_reject_empty_fragment() { + let fragments = [EncodedFragment { bytes: &[1] }, EncodedFragment { bytes: &[] }]; + let payload = EncodedPayload::Fragments(&fragments); + assert!(payload.is_empty()); + } + + #[test] + fn owned_access_unit_borrows_without_copying_payload() { + let owned = OwnedEncodedAccessUnit::new( + EncodedVideoCodec::H264, + Bytes::from_static(&[1, 2, 3]), + 10, + EncodedFrameType::Delta, + 640, + 480, + ); + + let borrowed = owned.as_access_unit(); + assert_eq!(borrowed.codec, EncodedVideoCodec::H264); + assert_eq!(borrowed.payload, EncodedPayload::Contiguous(&[1, 2, 3])); + assert_eq!(borrowed.timestamp_us, 10); + } +} diff --git a/livekit-capture/src/encoded/h26x.rs b/livekit-capture/src/encoded/h26x.rs new file mode 100644 index 000000000..1b3e96970 --- /dev/null +++ b/livekit-capture/src/encoded/h26x.rs @@ -0,0 +1,918 @@ +// Copyright 2026 LiveKit, Inc. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::ops::Range; + +use bytes::Bytes; + +use crate::{ + encoded::{ + annex_b_payload, h264_nal_type, h265_nal_type, is_keyframe_nalus, CodecSpecific, + EncodedFrameType, EncodedVideoCodec, OwnedEncodedAccessUnit, + }, + error::CaptureError, +}; + +/// Upper bound on bytes buffered while waiting for an access-unit boundary. +const MAX_PENDING_ACCESS_UNIT_BYTES: usize = 32 * 1024 * 1024; + +/// Byte-stream access-unit parser shared by the encoded ingest sources. +/// +/// `push` appends bytes and returns at most one completed access unit; call +/// `drain` repeatedly to pull further access units already buffered, and +/// `flush` once at end of stream to emit the final pending access unit. +pub(crate) trait AccessUnitParser { + /// Appends bytes and returns the next complete access unit, if any. + fn push(&mut self, bytes: &[u8]) -> Result, CaptureError>; + + /// Returns the next complete access unit from already-buffered bytes. + fn drain(&mut self) -> Result, CaptureError> { + self.push(&[]) + } + + /// Flushes remaining buffered bytes as the final access unit. + fn flush(&mut self) -> Result, CaptureError>; +} + +/// H26x Annex-B parser state. +#[derive(Debug, Clone)] +pub struct AnnexBAccessUnitParser { + codec: EncodedVideoCodec, + pending: Vec, + /// NAL ranges found in `pending`; the last range's end is provisional + /// until the next start code (or flush) confirms it. 
+ nal_ranges: Vec>, + /// Offset up to which `pending` has been scanned for start codes. + scan_cursor: usize, + next_timestamp_us: i64, + frame_interval_us: i64, + width: u32, + height: u32, +} + +/// H.264/AVC length-prefixed parser state. +#[cfg(any(feature = "tcpsink", test))] +#[derive(Debug, Clone)] +pub(crate) struct AvcAccessUnitParser { + pending: Vec, + /// Complete NAL ranges found in `pending`. + nal_ranges: Vec>, + /// Offset of the first unparsed length prefix or incomplete NAL in `pending`. + scan_cursor: usize, + nal_length_size: u8, + next_timestamp_us: i64, + frame_interval_us: i64, + width: u32, + height: u32, +} + +impl AnnexBAccessUnitParser { + /// Creates a parser for H.264 or H.265 Annex-B byte streams. + pub fn new( + codec: EncodedVideoCodec, + start_timestamp_us: i64, + frame_interval_us: i64, + width: u32, + height: u32, + ) -> Result { + match codec { + EncodedVideoCodec::H264 | EncodedVideoCodec::H265 => {} + EncodedVideoCodec::VP8 | EncodedVideoCodec::VP9 | EncodedVideoCodec::AV1 => { + return Err(CaptureError::UnsupportedCodec(codec)); + } + } + + Ok(Self { + codec, + pending: Vec::new(), + nal_ranges: Vec::new(), + scan_cursor: 0, + next_timestamp_us: start_timestamp_us, + frame_interval_us, + width, + height, + }) + } + + /// Pushes encoded bytes and returns the next complete access unit if one is found. + pub fn push(&mut self, bytes: &[u8]) -> Result, CaptureError> { + self.pending.extend_from_slice(bytes); + self.drain_next(false) + } + + /// Flushes the pending bytes as the final access unit. + pub fn flush(&mut self) -> Result, CaptureError> { + self.drain_next(true) + } + + fn drain_next(&mut self, at_eof: bool) -> Result, CaptureError> { + self.scan_pending(); + + if let Some(split_at) = + access_unit_split_index(self.codec, &self.pending, &self.nal_ranges)? 
+ { + return self.take_access_unit(split_at); + } + if at_eof && self.nal_ranges.iter().any(|range| range.start < range.end) { + return self.take_access_unit(self.pending.len()); + } + if !at_eof && self.pending.len() > MAX_PENDING_ACCESS_UNIT_BYTES { + return Err(CaptureError::InvalidEncodedData( + "access unit exceeds maximum buffered size", + )); + } + Ok(None) + } + + /// Scans bytes appended since the previous call, extending the cached NAL ranges. + fn scan_pending(&mut self) { + // Resume behind the previous scan end so a start code straddling the + // boundary is found, but never before the last NAL start so an + // already-found start code is not rediscovered. + let mut cursor = self.scan_cursor.saturating_sub(3); + if let Some(last) = self.nal_ranges.last() { + cursor = cursor.max(last.start); + } + while let Some((offset, prefix_len)) = find_start_code(&self.pending[cursor..]) { + let prefix_start = cursor + offset; + let nal_start = prefix_start + prefix_len; + if let Some(last) = self.nal_ranges.last_mut() { + last.end = prefix_start; + if last.start >= prefix_start { + self.nal_ranges.pop(); + } + } + self.nal_ranges.push(nal_start..nal_start); + cursor = nal_start; + } + if let Some(last) = self.nal_ranges.last_mut() { + last.end = self.pending.len(); + } + self.scan_cursor = self.pending.len(); + } + + fn take_access_unit( + &mut self, + byte_len: usize, + ) -> Result, CaptureError> { + if byte_len == 0 { + return Ok(None); + } + + let access_unit = self.pending[..byte_len].to_vec(); + self.pending.drain(..byte_len); + self.nal_ranges.retain_mut(|range| { + if range.end <= byte_len { + return false; + } + range.start -= byte_len; + range.end -= byte_len; + true + }); + self.scan_cursor -= byte_len; + let timestamp_us = self.next_timestamp_us; + self.next_timestamp_us = self.next_timestamp_us.saturating_add(self.frame_interval_us); + access_unit_from_annex_b( + self.codec, + Bytes::from(access_unit), + timestamp_us, + self.width, + self.height, + ) + 
.map(Some) + } +} + +impl AccessUnitParser for AnnexBAccessUnitParser { + fn push(&mut self, bytes: &[u8]) -> Result, CaptureError> { + AnnexBAccessUnitParser::push(self, bytes) + } + + fn flush(&mut self) -> Result, CaptureError> { + AnnexBAccessUnitParser::flush(self) + } +} + +#[cfg(any(feature = "tcpsink", test))] +impl AvcAccessUnitParser { + /// Creates a parser for H.264/AVC length-prefixed byte streams. + pub(crate) fn new( + nal_length_size: u8, + start_timestamp_us: i64, + frame_interval_us: i64, + width: u32, + height: u32, + ) -> Result { + validate_avc_nal_length_size(nal_length_size)?; + + Ok(Self { + pending: Vec::new(), + nal_ranges: Vec::new(), + scan_cursor: 0, + nal_length_size, + next_timestamp_us: start_timestamp_us, + frame_interval_us, + width, + height, + }) + } + + /// Pushes encoded bytes and returns the next complete access unit if one is found. + pub(crate) fn push( + &mut self, + bytes: &[u8], + ) -> Result, CaptureError> { + self.pending.extend_from_slice(bytes); + self.drain_next(false) + } + + /// Flushes the pending bytes as the final access unit. + pub(crate) fn flush(&mut self) -> Result, CaptureError> { + self.drain_next(true) + } + + fn drain_next(&mut self, at_eof: bool) -> Result, CaptureError> { + self.scan_pending(at_eof)?; + + if let Some(split_at) = avc_access_unit_split_index( + &self.pending, + &self.nal_ranges, + self.nal_length_size as usize, + )? { + return self.take_access_unit(split_at); + } + if at_eof && !self.nal_ranges.is_empty() { + return self.take_access_unit(self.pending.len()); + } + if !at_eof && self.pending.len() > MAX_PENDING_ACCESS_UNIT_BYTES { + return Err(CaptureError::InvalidEncodedData( + "access unit exceeds maximum buffered size", + )); + } + Ok(None) + } + + /// Parses length-prefixed NAL units appended since the previous call. 
+ fn scan_pending(&mut self, at_eof: bool) -> Result<(), CaptureError> { + let nal_length_size = self.nal_length_size as usize; + while self.scan_cursor < self.pending.len() { + if self.pending.len() - self.scan_cursor < nal_length_size { + if at_eof { + return Err(CaptureError::InvalidEncodedData("truncated AVC NAL length")); + } + break; + } + + let nal_start = self.scan_cursor + nal_length_size; + let nal_len = read_avc_nal_length(&self.pending[self.scan_cursor..nal_start]); + if nal_len == 0 { + return Err(CaptureError::InvalidEncodedData("empty AVC NAL unit")); + } + + let Some(nal_end) = nal_start.checked_add(nal_len) else { + return Err(CaptureError::InvalidEncodedData("AVC NAL unit length overflow")); + }; + if nal_end > self.pending.len() { + if at_eof { + return Err(CaptureError::InvalidEncodedData("truncated AVC NAL unit")); + } + break; + } + + self.nal_ranges.push(nal_start..nal_end); + self.scan_cursor = nal_end; + } + Ok(()) + } + + fn take_access_unit( + &mut self, + byte_len: usize, + ) -> Result, CaptureError> { + if byte_len == 0 { + return Ok(None); + } + + let access_unit = self.pending[..byte_len].to_vec(); + self.pending.drain(..byte_len); + self.nal_ranges.retain_mut(|range| { + if range.end <= byte_len { + return false; + } + range.start -= byte_len; + range.end -= byte_len; + true + }); + self.scan_cursor -= byte_len; + let timestamp_us = self.next_timestamp_us; + self.next_timestamp_us = self.next_timestamp_us.saturating_add(self.frame_interval_us); + access_unit_from_h264_avc( + &access_unit, + self.nal_length_size, + timestamp_us, + self.width, + self.height, + ) + .map(Some) + } +} + +#[cfg(any(feature = "tcpsink", test))] +impl AccessUnitParser for AvcAccessUnitParser { + fn push(&mut self, bytes: &[u8]) -> Result, CaptureError> { + AvcAccessUnitParser::push(self, bytes) + } + + fn flush(&mut self) -> Result, CaptureError> { + AvcAccessUnitParser::flush(self) + } +} + +/// Returns NAL-unit byte ranges for an Annex-B access unit or 
stream chunk.
+///
+/// Each range covers one NAL unit without its 3- or 4-byte start code. Bytes
+/// that precede the first start code are ignored, and empty NAL units (two
+/// adjacent start codes, or a start code at the very end) are not reported.
+pub fn annex_b_nal_ranges(bytes: &[u8]) -> Vec<Range<usize>> {
+    let mut ranges = Vec::new();
+    let mut cursor = 0;
+    // Start offset of the NAL whose end has not been located yet.
+    let mut current_start = None;
+
+    while let Some((prefix_start, prefix_len)) = find_start_code(&bytes[cursor..]) {
+        // `find_start_code` reports offsets relative to the slice it was given.
+        let prefix_start = cursor + prefix_start;
+        let nal_start = prefix_start + prefix_len;
+        // The start code just found terminates the previously opened NAL.
+        if let Some(start) = current_start.replace(nal_start) {
+            if start < prefix_start {
+                ranges.push(start..prefix_start);
+            }
+        }
+        cursor = nal_start;
+    }
+
+    // The last NAL runs to the end of the buffer.
+    if let Some(start) = current_start {
+        if start < bytes.len() {
+            ranges.push(start..bytes.len());
+        }
+    }
+
+    ranges
+}
+
+/// Returns borrowed NAL units from an Annex-B buffer.
+///
+/// The returned slices borrow from `bytes` and exclude the start codes. This
+/// function currently always returns `Ok`; the `Result` is part of its
+/// signature only.
+pub fn annex_b_nalus(bytes: &[u8]) -> Result<Vec<&[u8]>, CaptureError> {
+    let nals = annex_b_nal_ranges(bytes)
+        .into_iter()
+        .map(|range| &bytes[range])
+        // Defensive: `annex_b_nal_ranges` already skips empty ranges.
+        .filter(|nal| !nal.is_empty())
+        .collect::<Vec<_>>();
+    Ok(nals)
+}
+
+/// Creates an Annex-B access unit from H.264/AVC length-prefixed NAL units.
+///
+/// `nal_length_size` is the width in bytes of each big-endian NAL length
+/// prefix in `payload`.
+///
+/// # Errors
+///
+/// Propagates the error from `avc_nalus` when `payload` cannot be split into
+/// NAL units, and any error from [`access_unit_from_nalus`].
+pub(crate) fn access_unit_from_h264_avc(
+    payload: &[u8],
+    nal_length_size: u8,
+    timestamp_us: i64,
+    width: u32,
+    height: u32,
+) -> Result<OwnedEncodedAccessUnit, CaptureError> {
+    let nals = avc_nalus(payload, nal_length_size)?;
+    access_unit_from_nalus(EncodedVideoCodec::H264, &nals, timestamp_us, width, height)
+}
+
+/// Creates an access unit from an Annex-B buffer.
+///
+/// The frame type is [`EncodedFrameType::Key`] when [`is_keyframe_annex_b`]
+/// reports a key picture and [`EncodedFrameType::Delta`] otherwise. The
+/// codec-specific data is set to the default for `codec`.
+///
+/// # Errors
+///
+/// Returns [`CaptureError::EmptyPayload`] when `payload` is empty, and
+/// propagates any error from [`is_keyframe_annex_b`].
+pub fn access_unit_from_annex_b(
+    codec: EncodedVideoCodec,
+    payload: Bytes,
+    timestamp_us: i64,
+    width: u32,
+    height: u32,
+) -> Result<OwnedEncodedAccessUnit, CaptureError> {
+    if payload.is_empty() {
+        return Err(CaptureError::EmptyPayload);
+    }
+
+    let frame_type = if is_keyframe_annex_b(codec, &payload)? {
+        EncodedFrameType::Key
+    } else {
+        EncodedFrameType::Delta
+    };
+    let mut access_unit =
+        OwnedEncodedAccessUnit::new(codec, payload, timestamp_us, frame_type, width, height);
+    access_unit.codec_specific = CodecSpecific::default_for(codec);
+    Ok(access_unit)
+}
+
+/// Creates an Annex-B access unit from raw NAL units.
+pub fn access_unit_from_nalus( + codec: EncodedVideoCodec, + nal_units: &[&[u8]], + timestamp_us: i64, + width: u32, + height: u32, +) -> Result { + let payload = Bytes::from(annex_b_payload(nal_units)?); + access_unit_from_annex_b(codec, payload, timestamp_us, width, height) +} + +/// Returns true when an Annex-B access unit contains an intra/key picture. +pub fn is_keyframe_annex_b(codec: EncodedVideoCodec, bytes: &[u8]) -> Result { + let nals = annex_b_nalus(bytes)?; + is_keyframe_nalus(codec, &nals) +} + +fn access_unit_split_index( + codec: EncodedVideoCodec, + bytes: &[u8], + ranges: &[Range], +) -> Result, CaptureError> { + match access_unit_boundary_nal(codec, bytes, ranges)? { + Some(index) => split_start_code_index(bytes, ranges[index].start).map(Some), + None => Ok(None), + } +} + +#[cfg(any(feature = "tcpsink", test))] +fn avc_access_unit_split_index( + bytes: &[u8], + ranges: &[Range], + nal_length_size: usize, +) -> Result, CaptureError> { + match access_unit_boundary_nal(EncodedVideoCodec::H264, bytes, ranges)? { + Some(index) => ranges[index] + .start + .checked_sub(nal_length_size) + .ok_or(CaptureError::InvalidEncodedData("missing AVC NAL length")) + .map(Some), + None => Ok(None), + } +} + +/// Returns the index of the first NAL that starts a new access unit, once at +/// least one VCL NAL has been seen in the current one. +fn access_unit_boundary_nal( + codec: EncodedVideoCodec, + bytes: &[u8], + ranges: &[Range], +) -> Result, CaptureError> { + let mut seen_vcl = false; + for (index, range) in ranges.iter().enumerate() { + let nal = &bytes[range.clone()]; + // The final NAL may still be streaming in; wait for its header. + if index + 1 == ranges.len() && nal.len() < min_nal_header_len(codec) { + return Ok(None); + } + if seen_vcl && starts_new_access_unit(codec, nal)? 
{ + return Ok(Some(index)); + } + seen_vcl |= is_vcl_nal(codec, nal)?; + } + Ok(None) +} + +fn min_nal_header_len(codec: EncodedVideoCodec) -> usize { + match codec { + EncodedVideoCodec::H265 => 2, + _ => 1, + } +} + +fn starts_new_access_unit(codec: EncodedVideoCodec, nal: &[u8]) -> Result { + Ok(match codec { + EncodedVideoCodec::H264 => match h264_nal_type(nal)? { + // Prefix SEI(6), SPS(7), PPS(8), and AUD(9) open a new access unit. + 6..=9 => true, + // A VCL NAL opens a new picture when first_mb_in_slice == 0: + // ue(v) == 0 is a lone 1 bit, so the first RBSP bit after the + // header is set. The header byte is nonzero, so the next byte + // cannot be an emulation-prevention byte. + 1..=5 => nal.len() >= 2 && nal[1] & 0x80 != 0, + _ => false, + }, + EncodedVideoCodec::H265 => match h265_nal_type(nal)? { + // VPS(32), SPS(33), PPS(34), AUD(35), and prefix SEI(39). + 32..=35 | 39 => true, + // A VCL NAL opens a new picture when + // first_slice_segment_in_pic_flag (the bit after the 2-byte + // header) is set. nuh_temporal_id_plus1 makes the second header + // byte nonzero, so the next byte cannot be an + // emulation-prevention byte. + 0..=31 => nal.len() >= 3 && nal[2] & 0x80 != 0, + _ => false, + }, + EncodedVideoCodec::VP8 | EncodedVideoCodec::VP9 | EncodedVideoCodec::AV1 => { + return Err(CaptureError::UnsupportedCodec(codec)); + } + }) +} + +fn split_start_code_index(bytes: &[u8], nal_start: usize) -> Result { + if nal_start >= 4 && bytes[nal_start - 4..nal_start] == [0, 0, 0, 1] { + return Ok(nal_start - 4); + } + if nal_start >= 3 && bytes[nal_start - 3..nal_start] == [0, 0, 1] { + return Ok(nal_start - 3); + } + Err(CaptureError::InvalidEncodedData("missing Annex-B start code")) +} + +fn is_vcl_nal(codec: EncodedVideoCodec, nal: &[u8]) -> Result { + Ok(match codec { + EncodedVideoCodec::H264 => (1..=5).contains(&h264_nal_type(nal)?), + EncodedVideoCodec::H265 => h265_nal_type(nal)? 
<= 31, + EncodedVideoCodec::VP8 | EncodedVideoCodec::VP9 | EncodedVideoCodec::AV1 => { + return Err(CaptureError::UnsupportedCodec(codec)); + } + }) +} + +fn find_start_code(bytes: &[u8]) -> Option<(usize, usize)> { + let mut idx = 0; + while idx + 3 <= bytes.len() { + if bytes[idx..].starts_with(&[0, 0, 1]) { + return Some((idx, 3)); + } + if idx + 4 <= bytes.len() && bytes[idx..].starts_with(&[0, 0, 0, 1]) { + return Some((idx, 4)); + } + idx += 1; + } + None +} + +fn avc_nalus(payload: &[u8], nal_length_size: u8) -> Result, CaptureError> { + let ranges = avc_nal_ranges(payload, nal_length_size, true)?; + if ranges.is_empty() { + return Err(CaptureError::EmptyPayload); + } + Ok(ranges.into_iter().map(|range| &payload[range]).collect()) +} + +fn avc_nal_ranges( + bytes: &[u8], + nal_length_size: u8, + at_eof: bool, +) -> Result>, CaptureError> { + validate_avc_nal_length_size(nal_length_size)?; + + let nal_length_size = nal_length_size as usize; + let mut ranges = Vec::new(); + let mut cursor = 0; + while cursor < bytes.len() { + if bytes.len() - cursor < nal_length_size { + if at_eof { + return Err(CaptureError::InvalidEncodedData("truncated AVC NAL length")); + } + break; + } + + let nal_len = read_avc_nal_length(&bytes[cursor..cursor + nal_length_size]); + cursor += nal_length_size; + if nal_len == 0 { + return Err(CaptureError::InvalidEncodedData("empty AVC NAL unit")); + } + + let Some(nal_end) = cursor.checked_add(nal_len) else { + return Err(CaptureError::InvalidEncodedData("AVC NAL unit length overflow")); + }; + if nal_end > bytes.len() { + if at_eof { + return Err(CaptureError::InvalidEncodedData("truncated AVC NAL unit")); + } + break; + } + + ranges.push(cursor..nal_end); + cursor = nal_end; + } + + Ok(ranges) +} + +fn read_avc_nal_length(bytes: &[u8]) -> usize { + bytes.iter().fold(0usize, |len, byte| (len << 8) | usize::from(*byte)) +} + +fn validate_avc_nal_length_size(nal_length_size: u8) -> Result<(), CaptureError> { + if 
(1..=4).contains(&nal_length_size) { + return Ok(()); + } + Err(CaptureError::InvalidEncodedData("invalid AVC NAL length size")) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn splits_annex_b_nals_with_three_and_four_byte_prefixes() { + let bytes = [0, 0, 1, 0x67, 1, 0, 0, 0, 1, 0x65, 2, 3]; + let nals = annex_b_nalus(&bytes).unwrap(); + assert_eq!(nals, vec![&[0x67, 1][..], &[0x65, 2, 3][..]]); + } + + #[test] + fn detects_h264_keyframe_from_annex_b() { + let bytes = [0, 0, 0, 1, 0x61, 1, 0, 0, 0, 1, 0x65, 2]; + assert!(is_keyframe_annex_b(EncodedVideoCodec::H264, &bytes).unwrap()); + } + + #[test] + fn access_unit_from_avc_converts_length_prefixed_nals() { + let bytes = [0, 0, 0, 4, 0x67, 1, 2, 3, 0, 0, 0, 3, 0x65, 4, 5]; + let au = access_unit_from_h264_avc(&bytes, 4, 10, 640, 480).unwrap(); + + assert_eq!(au.codec, EncodedVideoCodec::H264); + assert_eq!(au.frame_type, EncodedFrameType::Key); + assert_eq!(au.payload.as_ref(), &[0, 0, 0, 1, 0x67, 1, 2, 3, 0, 0, 0, 1, 0x65, 4, 5]); + } + + #[test] + fn access_unit_from_avc_supports_two_byte_lengths() { + let bytes = [0, 2, 0x61, 1]; + let au = access_unit_from_h264_avc(&bytes, 2, 10, 640, 480).unwrap(); + + assert_eq!(au.frame_type, EncodedFrameType::Delta); + assert_eq!(au.payload.as_ref(), &[0, 0, 0, 1, 0x61, 1]); + } + + #[test] + fn access_unit_from_avc_rejects_truncated_nal() { + let err = access_unit_from_h264_avc(&[0, 0, 0, 3, 0x65], 4, 10, 640, 480).unwrap_err(); + + assert_eq!(err, CaptureError::InvalidEncodedData("truncated AVC NAL unit")); + } + + #[test] + fn parser_flushes_final_access_unit() { + let mut parser = + AnnexBAccessUnitParser::new(EncodedVideoCodec::H264, 100, 33_333, 640, 480).unwrap(); + assert!(parser.push(&[0, 0, 1, 0x65, 1, 2]).unwrap().is_none()); + let au = parser.flush().unwrap().unwrap(); + assert_eq!(au.timestamp_us, 100); + assert_eq!(au.frame_type, EncodedFrameType::Key); + } + + #[test] + fn parser_splits_at_next_access_unit_delimiter() { + let mut parser = + 
AnnexBAccessUnitParser::new(EncodedVideoCodec::H264, 100, 33_333, 640, 480).unwrap(); + let stream = + [0, 0, 1, 0x09, 0x10, 0, 0, 1, 0x65, 1, 2, 0, 0, 1, 0x09, 0x10, 0, 0, 1, 0x41, 3]; + + let au = parser.push(&stream).unwrap().unwrap(); + assert_eq!(au.timestamp_us, 100); + assert_eq!(au.payload.as_ref(), &[0, 0, 1, 0x09, 0x10, 0, 0, 1, 0x65, 1, 2]); + + let au = parser.flush().unwrap().unwrap(); + assert_eq!(au.timestamp_us, 33_433); + assert_eq!(au.payload.as_ref(), &[0, 0, 1, 0x09, 0x10, 0, 0, 1, 0x41, 3]); + } + + #[test] + fn avc_parser_splits_at_next_access_unit_delimiter() { + let mut parser = AvcAccessUnitParser::new(4, 100, 33_333, 640, 480).unwrap(); + let stream = [ + 0, 0, 0, 2, 0x09, 0x10, 0, 0, 0, 3, 0x65, 1, 2, 0, 0, 0, 2, 0x09, 0x10, 0, 0, 0, 2, + 0x41, 3, + ]; + + let au = parser.push(&stream).unwrap().unwrap(); + assert_eq!(au.timestamp_us, 100); + assert_eq!(au.payload.as_ref(), &[0, 0, 0, 1, 0x09, 0x10, 0, 0, 0, 1, 0x65, 1, 2]); + + let au = parser.flush().unwrap().unwrap(); + assert_eq!(au.timestamp_us, 33_433); + assert_eq!(au.payload.as_ref(), &[0, 0, 0, 1, 0x09, 0x10, 0, 0, 0, 1, 0x41, 3]); + } + + #[test] + fn splits_aud_less_h264_stream_per_frame() { + let mut parser = + AnnexBAccessUnitParser::new(EncodedVideoCodec::H264, 0, 33_333, 640, 480).unwrap(); + let stream = [ + 0, 0, 0, 1, 0x67, 0x42, 0x00, 0x1e, // SPS + 0, 0, 0, 1, 0x68, 0xce, // PPS + 0, 0, 1, 0x65, 0x88, 0x84, 0x21, // IDR slice, first_mb_in_slice == 0 + 0, 0, 1, 0x41, 0x9a, 0x22, // P slice, first_mb_in_slice == 0 + 0, 0, 1, 0x41, 0x9a, 0x33, // P slice, first_mb_in_slice == 0 + ]; + + let au = parser.push(&stream).unwrap().unwrap(); + assert_eq!(au.timestamp_us, 0); + assert_eq!(au.frame_type, EncodedFrameType::Key); + assert_eq!(au.payload.as_ref(), &stream[..21]); + + let au = parser.drain().unwrap().unwrap(); + assert_eq!(au.timestamp_us, 33_333); + assert_eq!(au.frame_type, EncodedFrameType::Delta); + assert_eq!(au.payload.as_ref(), &stream[21..27]); + + let au = 
parser.flush().unwrap().unwrap(); + assert_eq!(au.timestamp_us, 66_666); + assert_eq!(au.payload.as_ref(), &stream[27..]); + } + + #[test] + fn keeps_multi_slice_h264_access_unit_together() { + let mut parser = + AnnexBAccessUnitParser::new(EncodedVideoCodec::H264, 0, 33_333, 640, 480).unwrap(); + let stream = [ + 0, 0, 1, 0x65, 0x88, 0x11, // IDR slice, first_mb_in_slice == 0 + 0, 0, 1, 0x65, 0x21, 0x22, // IDR slice, first_mb_in_slice != 0 + 0, 0, 1, 0x41, 0x9a, 0x33, // next picture + ]; + + let au = parser.push(&stream).unwrap().unwrap(); + assert_eq!(au.timestamp_us, 0); + assert_eq!(au.frame_type, EncodedFrameType::Key); + assert_eq!(au.payload.as_ref(), &stream[..12]); + + let au = parser.flush().unwrap().unwrap(); + assert_eq!(au.timestamp_us, 33_333); + assert_eq!(au.payload.as_ref(), &stream[12..]); + } + + #[test] + fn splits_aud_less_h265_stream_per_frame() { + let mut parser = + AnnexBAccessUnitParser::new(EncodedVideoCodec::H265, 0, 33_333, 640, 480).unwrap(); + let stream = [ + 0, 0, 0, 1, 0x40, 0x01, 0x0c, // VPS + 0, 0, 0, 1, 0x42, 0x01, 0x02, // SPS + 0, 0, 0, 1, 0x44, 0x01, 0x03, // PPS + 0, 0, 1, 0x26, 0x01, 0xaf, + 0x04, // IDR_W_RADL, first_slice_segment_in_pic_flag == 1 + 0, 0, 1, 0x02, 0x01, 0xd0, 0x05, // TRAIL_R, first_slice_segment_in_pic_flag == 1 + ]; + + let au = parser.push(&stream).unwrap().unwrap(); + assert_eq!(au.timestamp_us, 0); + assert_eq!(au.frame_type, EncodedFrameType::Key); + assert_eq!(au.payload.as_ref(), &stream[..28]); + + let au = parser.flush().unwrap().unwrap(); + assert_eq!(au.timestamp_us, 33_333); + assert_eq!(au.frame_type, EncodedFrameType::Delta); + assert_eq!(au.payload.as_ref(), &stream[28..]); + } + + #[test] + fn keeps_multi_slice_h265_access_unit_together() { + let mut parser = + AnnexBAccessUnitParser::new(EncodedVideoCodec::H265, 0, 33_333, 640, 480).unwrap(); + let stream = [ + 0, 0, 1, 0x26, 0x01, 0xaf, + 0x11, // IDR slice, first_slice_segment_in_pic_flag == 1 + 0, 0, 1, 0x26, 0x01, 0x40, + 0x22, // 
IDR slice, first_slice_segment_in_pic_flag == 0 + 0, 0, 1, 0x02, 0x01, 0xd0, 0x33, // next picture + ]; + + let au = parser.push(&stream).unwrap().unwrap(); + assert_eq!(au.timestamp_us, 0); + assert_eq!(au.frame_type, EncodedFrameType::Key); + assert_eq!(au.payload.as_ref(), &stream[..14]); + + let au = parser.flush().unwrap().unwrap(); + assert_eq!(au.timestamp_us, 33_333); + assert_eq!(au.payload.as_ref(), &stream[14..]); + } + + #[test] + fn groups_parameter_sets_with_following_frame() { + let mut parser = + AnnexBAccessUnitParser::new(EncodedVideoCodec::H264, 0, 33_333, 640, 480).unwrap(); + let stream = [ + 0, 0, 1, 0x67, 0x42, 0x1e, // SPS + 0, 0, 1, 0x68, 0xce, // PPS + 0, 0, 1, 0x65, 0x88, 0x11, // IDR + 0, 0, 1, 0x67, 0x42, 0x1e, // SPS + 0, 0, 1, 0x68, 0xce, // PPS + 0, 0, 1, 0x65, 0x88, 0x22, // IDR + ]; + + let au = parser.push(&stream).unwrap().unwrap(); + assert_eq!(au.timestamp_us, 0); + assert_eq!(au.frame_type, EncodedFrameType::Key); + assert_eq!(au.payload.as_ref(), &stream[..17]); + + let au = parser.flush().unwrap().unwrap(); + assert_eq!(au.timestamp_us, 33_333); + assert_eq!(au.frame_type, EncodedFrameType::Key); + assert_eq!(au.payload.as_ref(), &stream[17..]); + } + + fn collect_units( + parser: &mut impl AccessUnitParser, + stream: &[u8], + chunk_size: usize, + ) -> Vec<(Vec, i64, EncodedFrameType)> { + let mut units = Vec::new(); + for chunk in stream.chunks(chunk_size) { + let mut unit = parser.push(chunk).unwrap(); + while let Some(au) = unit { + units.push((au.payload.to_vec(), au.timestamp_us, au.frame_type)); + unit = parser.drain().unwrap(); + } + } + let mut unit = parser.flush().unwrap(); + while let Some(au) = unit { + units.push((au.payload.to_vec(), au.timestamp_us, au.frame_type)); + unit = parser.flush().unwrap(); + } + units + } + + fn assert_chunked_matches_one_shot( + make_parser: impl Fn() -> P, + stream: &[u8], + expected_units: usize, + ) { + let baseline = collect_units(&mut make_parser(), stream, stream.len()); + 
assert_eq!(baseline.len(), expected_units); + for chunk_size in [1, 7] { + assert_eq!(collect_units(&mut make_parser(), stream, chunk_size), baseline); + } + } + + #[test] + fn chunked_pushes_match_one_shot_parsing() { + let h264_annex_b = [ + 0, 0, 0, 1, 0x67, 0x64, 0x00, 0x1e, // SPS + 0, 0, 0, 1, 0x68, 0xce, 0x3c, 0x80, // PPS + 0, 0, 1, 0x65, 0x88, 0x84, 0x00, 0x01, // IDR, first_mb_in_slice == 0 + 0, 0, 1, 0x41, 0x9a, 0x02, // P, first_mb_in_slice == 0 + 0, 0, 1, 0x09, 0x10, // AUD + 0, 0, 1, 0x41, 0x9a, 0x03, // P + 0, 0, 0, 1, 0x41, 0x9a, 0x04, 0x00, // P, first_mb_in_slice == 0 + ]; + assert_chunked_matches_one_shot( + || AnnexBAccessUnitParser::new(EncodedVideoCodec::H264, 0, 33_333, 640, 480).unwrap(), + &h264_annex_b, + 4, + ); + + let h265_annex_b = [ + 0, 0, 0, 1, 0x40, 0x01, 0x0c, // VPS + 0, 0, 0, 1, 0x42, 0x01, 0x02, // SPS + 0, 0, 0, 1, 0x44, 0x01, 0x03, // PPS + 0, 0, 1, 0x26, 0x01, 0xaf, 0x08, // IDR_W_RADL + 0, 0, 1, 0x02, 0x01, 0xd0, 0x09, // TRAIL_R + 0, 0, 1, 0x46, 0x01, 0x50, // AUD + 0, 0, 1, 0x02, 0x01, 0xd0, 0x0a, // TRAIL_R + ]; + assert_chunked_matches_one_shot( + || AnnexBAccessUnitParser::new(EncodedVideoCodec::H265, 0, 33_333, 640, 480).unwrap(), + &h265_annex_b, + 3, + ); + + let h264_avc = [ + 0, 0, 0, 4, 0x67, 0x64, 0x00, 0x1e, // SPS + 0, 0, 0, 2, 0x68, 0xce, // PPS + 0, 0, 0, 4, 0x65, 0x88, 0x84, 0x00, // IDR, first_mb_in_slice == 0 + 0, 0, 0, 3, 0x41, 0x9a, 0x02, // P, first_mb_in_slice == 0 + 0, 0, 0, 2, 0x09, 0x10, // AUD + 0, 0, 0, 3, 0x41, 0x9a, 0x03, // P + ]; + assert_chunked_matches_one_shot( + || AvcAccessUnitParser::new(4, 0, 33_333, 640, 480).unwrap(), + &h264_avc, + 3, + ); + } + + #[test] + fn rejects_pending_access_unit_over_size_cap() { + let mut parser = + AnnexBAccessUnitParser::new(EncodedVideoCodec::H264, 0, 33_333, 640, 480).unwrap(); + assert!(parser.push(&[0, 0, 1, 0x65, 0x88]).unwrap().is_none()); + + let err = parser.push(&vec![0xff; MAX_PENDING_ACCESS_UNIT_BYTES]).unwrap_err(); + assert_eq!( + err, + 
CaptureError::InvalidEncodedData("access unit exceeds maximum buffered size") + ); + } + + #[test] + fn avc_rejects_pending_access_unit_over_size_cap() { + let mut parser = AvcAccessUnitParser::new(4, 0, 33_333, 640, 480).unwrap(); + let nal_len = (MAX_PENDING_ACCESS_UNIT_BYTES + 1) as u32; + assert!(parser.push(&nal_len.to_be_bytes()).unwrap().is_none()); + + let err = parser.push(&vec![0x41; MAX_PENDING_ACCESS_UNIT_BYTES]).unwrap_err(); + assert_eq!( + err, + CaptureError::InvalidEncodedData("access unit exceeds maximum buffered size") + ); + } +} diff --git a/livekit-capture/src/encoded/ingress.rs b/livekit-capture/src/encoded/ingress.rs new file mode 100644 index 000000000..0eac3f15d --- /dev/null +++ b/livekit-capture/src/encoded/ingress.rs @@ -0,0 +1,189 @@ +// Copyright 2026 LiveKit, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::{ + error::Error, + fmt, + sync::{ + atomic::{AtomicBool, Ordering}, + Arc, + }, +}; + +use crate::{encoded::OwnedEncodedAccessUnit, error::CaptureError, track::VideoCaptureTrack}; + +/// Source of owned encoded access units. +pub trait EncodedAccessUnitSource { + /// Error returned by the source. + type Error: Error + Send + Sync + 'static; + + /// Returns the next encoded access unit, or `Ok(None)` when the source reaches EOF. + fn next_access_unit(&mut self) -> Result, Self::Error>; + + /// Forwards a downstream keyframe request (PLI/FIR, late subscriber) to + /// the producer so it can emit an IDR. 
+ /// + /// The default implementation does nothing, for transports that cannot + /// influence the upstream encoder. + fn request_keyframe(&mut self) {} +} + +/// Error returned while forwarding encoded access units into a track. +#[derive(Debug)] +pub enum EncodedIngressError { + /// The encoded source failed. + Source(E), + /// The capture track rejected an access unit. + Capture(CaptureError), +} + +impl fmt::Display for EncodedIngressError { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Self::Source(err) => write!(f, "encoded source failed: {err}"), + Self::Capture(err) => write!(f, "encoded capture failed: {err}"), + } + } +} + +impl Error for EncodedIngressError +where + E: Error + 'static, +{ + fn source(&self) -> Option<&(dyn Error + 'static)> { + match self { + Self::Source(err) => Some(err), + Self::Capture(err) => Some(err), + } + } +} + +/// Cancellation handle for [`EncodedIngress::run_until_end`]. +/// +/// Cheap to clone; wire it to a shutdown signal (e.g. Ctrl-C) and call +/// [`EncodedIngressStop::stop`] from any thread to make the ingest loop +/// return after the access unit in flight. +#[derive(Debug, Clone, Default)] +pub struct EncodedIngressStop(Arc); + +impl EncodedIngressStop { + /// Creates an un-stopped handle. + pub fn new() -> Self { + Self::default() + } + + /// Signals the ingest loop to stop. + pub fn stop(&self) { + self.0.store(true, Ordering::Release); + } + + /// Returns true once [`EncodedIngressStop::stop`] has been called. + pub fn is_stopped(&self) -> bool { + self.0.load(Ordering::Acquire) + } +} + +/// Pulls encoded access units from a source and forwards them into a video track. +#[derive(Debug)] +pub struct EncodedIngress { + track: VideoCaptureTrack, + source: S, + stop: EncodedIngressStop, +} + +impl EncodedIngress { + /// Creates an encoded ingress runner. 
+ pub fn new(track: VideoCaptureTrack, source: S) -> Self { + Self { track, source, stop: EncodedIngressStop::new() } + } + + /// Returns a cancellation handle for this runner. + pub fn stop_handle(&self) -> EncodedIngressStop { + self.stop.clone() + } + + /// Returns the capture track used by this runner. + pub fn track(&self) -> &VideoCaptureTrack { + &self.track + } + + /// Returns the underlying encoded source. + pub fn source(&self) -> &S { + &self.source + } + + /// Returns the underlying encoded source mutably. + pub fn source_mut(&mut self) -> &mut S { + &mut self.source + } + + /// Consumes this runner and returns its parts. + pub fn into_parts(self) -> (VideoCaptureTrack, S) { + (self.track, self.source) + } +} + +/// Details of one access unit captured by [`EncodedIngress::capture_next`]. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct EncodedIngressCapture { + /// Capture timestamp of the access unit in microseconds. + pub timestamp_us: i64, + /// Frame type of the access unit. + pub frame_type: crate::encoded::EncodedFrameType, + /// Payload size in bytes. + pub payload_len: usize, +} + +impl EncodedIngress +where + S: EncodedAccessUnitSource, +{ + /// Captures the next access unit, returning `None` after source EOF. + /// + /// Downstream keyframe requests (PLI/FIR raised by the passthrough + /// encoder) are polled on every call and forwarded to the source via + /// [`EncodedAccessUnitSource::request_keyframe`]. + pub fn capture_next( + &mut self, + ) -> Result, EncodedIngressError> { + if self.track.take_keyframe_request() { + self.source.request_keyframe(); + } + + let Some(access_unit) = + self.source.next_access_unit().map_err(EncodedIngressError::Source)? 
+ else { + return Ok(None); + }; + + self.track + .capture_encoded(&access_unit.as_access_unit()) + .map_err(EncodedIngressError::Capture)?; + Ok(Some(EncodedIngressCapture { + timestamp_us: access_unit.timestamp_us, + frame_type: access_unit.frame_type, + payload_len: access_unit.payload.len(), + })) + } + + /// Captures access units until the source reaches EOF or the stop + /// handle fires, returning the number of captured access units. + pub fn run_until_end(&mut self) -> Result> { + let mut captured = 0; + while !self.stop.is_stopped() && self.capture_next()?.is_some() { + captured += 1; + } + Ok(captured) + } +} diff --git a/livekit-capture/src/encoded/rtp.rs b/livekit-capture/src/encoded/rtp.rs new file mode 100644 index 000000000..1f4feddf0 --- /dev/null +++ b/livekit-capture/src/encoded/rtp.rs @@ -0,0 +1,1406 @@ +// Copyright 2026 LiveKit, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use bytes::Bytes; +use thiserror::Error; + +use crate::{ + encoded::{ + h26x::access_unit_from_nalus, CodecSpecific, EncodedFrameType, EncodedVideoCodec, + OwnedEncodedAccessUnit, + }, + error::CaptureError, +}; + +/// Parsed RTP packet header and payload. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct RtpPacket<'a> { + /// RTP marker bit. + pub marker: bool, + /// RTP payload type. + pub payload_type: u8, + /// RTP sequence number. + pub sequence_number: u16, + /// RTP timestamp. + pub timestamp: u32, + /// RTP SSRC. 
+ pub ssrc: u32, + /// RTP payload bytes. + pub payload: &'a [u8], +} + +impl<'a> RtpPacket<'a> { + /// Parses a single RTP packet. + pub fn parse(bytes: &'a [u8]) -> Result { + if bytes.len() < 12 { + return Err(RtpDepacketizerError::PacketTooShort); + } + if bytes[0] >> 6 != 2 { + return Err(RtpDepacketizerError::UnsupportedVersion(bytes[0] >> 6)); + } + + let has_padding = (bytes[0] & 0x20) != 0; + let has_extension = (bytes[0] & 0x10) != 0; + let csrc_count = (bytes[0] & 0x0f) as usize; + let mut payload_start = 12 + csrc_count * 4; + if bytes.len() < payload_start { + return Err(RtpDepacketizerError::PacketTooShort); + } + + if has_extension { + if bytes.len() < payload_start + 4 { + return Err(RtpDepacketizerError::PacketTooShort); + } + let extension_words = + u16::from_be_bytes([bytes[payload_start + 2], bytes[payload_start + 3]]) as usize; + payload_start += 4 + extension_words * 4; + if bytes.len() < payload_start { + return Err(RtpDepacketizerError::PacketTooShort); + } + } + + let payload_end = if has_padding { + let Some(padding) = bytes.last().copied() else { + return Err(RtpDepacketizerError::PacketTooShort); + }; + let padding = padding as usize; + if padding == 0 || bytes.len() < payload_start + padding { + return Err(RtpDepacketizerError::PacketTooShort); + } + bytes.len() - padding + } else { + bytes.len() + }; + + Ok(Self { + marker: (bytes[1] & 0x80) != 0, + payload_type: bytes[1] & 0x7f, + sequence_number: u16::from_be_bytes([bytes[2], bytes[3]]), + timestamp: u32::from_be_bytes([bytes[4], bytes[5], bytes[6], bytes[7]]), + ssrc: u32::from_be_bytes([bytes[8], bytes[9], bytes[10], bytes[11]]), + payload: &bytes[payload_start..payload_end], + }) + } +} + +/// Maps RTP timestamps to capture timestamps in microseconds. 
+#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct RtpTimestampMapper { + clock_rate: u32, + last_rtp_timestamp: Option, + extended_ticks: i64, + base_timestamp_us: i64, +} + +impl RtpTimestampMapper { + /// Creates an RTP timestamp mapper. + pub fn new(clock_rate: u32, base_timestamp_us: i64) -> Self { + Self { clock_rate, last_rtp_timestamp: None, extended_ticks: 0, base_timestamp_us } + } + + /// Maps an RTP timestamp to microseconds, unwrapping `u32` RTP timestamp + /// rollover so mapped timestamps stay monotonic across any number of wraps. + pub fn map(&mut self, rtp_timestamp: u32) -> Result { + if self.clock_rate == 0 { + return Err(RtpDepacketizerError::InvalidClockRate); + } + + let last = *self.last_rtp_timestamp.get_or_insert(rtp_timestamp); + self.last_rtp_timestamp = Some(rtp_timestamp); + // Reinterpreting the wrapped u32 delta as i32 picks the nearest extended + // timestamp, which unwraps rollover while tolerating small backwards + // jumps from reordered packets. + let delta_ticks = i64::from(rtp_timestamp.wrapping_sub(last) as i32); + self.extended_ticks = self.extended_ticks.saturating_add(delta_ticks); + + let extended_us = i128::from(self.extended_ticks) * 1_000_000 / i128::from(self.clock_rate); + let extended_us = extended_us.clamp(i128::from(i64::MIN), i128::from(i64::MAX)) as i64; + Ok(self.base_timestamp_us.saturating_add(extended_us)) + } +} + +/// Error returned by RTP depayloading and access-unit assembly. +#[derive(Debug, Error, PartialEq, Eq)] +pub enum RtpDepacketizerError { + /// RTP packet is shorter than its declared header. + #[error("RTP packet is too short")] + PacketTooShort, + /// RTP version is not supported. + #[error("unsupported RTP version {0}")] + UnsupportedVersion(u8), + /// RTP clock rate must be non-zero. + #[error("RTP clock rate must be non-zero")] + InvalidClockRate, + /// RTP payload format is unsupported or malformed. 
+ #[error("unsupported or malformed RTP payload")] + UnsupportedPayload, + /// RTP fragmentation state was invalid. + #[error("invalid RTP fragmentation sequence")] + InvalidFragment, + /// The payload descriptor is unsupported by the single-layer depacketizer. + #[error("unsupported RTP payload descriptor")] + UnsupportedPayloadDescriptor, + /// Codec is not supported by this RTP assembler. + #[error("RTP assembler does not support {0:?}")] + UnsupportedCodec(EncodedVideoCodec), + /// Capture data could not be converted into an access unit. + #[error(transparent)] + Capture(#[from] CaptureError), +} + +/// Packet-loss recovery counters for an [`RtpAccessUnitAssembler`]. +#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)] +pub struct RtpDepacketizerStats { + /// Number of RTP sequence-number gaps detected. + pub sequence_gaps: u64, + /// Number of access units dropped while recovering from packet loss. + pub dropped_access_units: u64, + /// Whether output is gated until the next keyframe completes. + pub awaiting_keyframe: bool, +} + +/// Reassembles RTP packets into encoded access units. 
+#[derive(Debug, Clone)] +pub struct RtpAccessUnitAssembler { + codec: EncodedVideoCodec, + width: u32, + height: u32, + timestamp_mapper: RtpTimestampMapper, + expected_sequence_number: Option, + current: Option, + fragment: Option, + current_frame: Option, + av1_fragment: Option, + awaiting_keyframe: bool, + sequence_gaps: u64, + dropped_access_units: u64, +} + +#[derive(Debug, Clone)] +struct PartialAccessUnit { + rtp_timestamp: u32, + timestamp_us: i64, + nal_units: Vec>, +} + +#[derive(Debug, Clone)] +struct FragmentState { + rtp_timestamp: u32, + nal_unit: Vec, +} + +#[derive(Debug, Clone)] +struct PartialFrame { + rtp_timestamp: u32, + timestamp_us: i64, + payload: Vec, + frame_type: Option, +} + +#[derive(Debug, Clone)] +struct Av1FragmentState { + rtp_timestamp: u32, + obu: Vec, +} + +impl RtpAccessUnitAssembler { + /// Creates an RTP access-unit assembler for supported video payloads. + pub fn new( + codec: EncodedVideoCodec, + clock_rate: u32, + start_timestamp_us: i64, + width: u32, + height: u32, + ) -> Result { + if clock_rate == 0 { + return Err(RtpDepacketizerError::InvalidClockRate); + } + + Ok(Self { + codec, + width, + height, + timestamp_mapper: RtpTimestampMapper::new(clock_rate, start_timestamp_us), + expected_sequence_number: None, + current: None, + fragment: None, + current_frame: None, + av1_fragment: None, + awaiting_keyframe: false, + sequence_gaps: 0, + dropped_access_units: 0, + }) + } + + /// Returns packet-loss recovery counters. + pub fn stats(&self) -> RtpDepacketizerStats { + RtpDepacketizerStats { + sequence_gaps: self.sequence_gaps, + dropped_access_units: self.dropped_access_units, + awaiting_keyframe: self.awaiting_keyframe, + } + } + + /// Pushes one encoded RTP packet and returns an access unit when a marker closes a frame. 
+ pub fn push( + &mut self, + bytes: &[u8], + ) -> Result, RtpDepacketizerError> { + let packet = RtpPacket::parse(bytes)?; + self.push_packet(packet) + } + + /// Pushes one parsed RTP packet and returns an access unit when a marker closes a frame. + /// + /// Packet loss is recovered internally: gaps and truncated fragments drop the + /// interrupted access unit and gate output on the next keyframe instead of + /// returning an error; see [`Self::stats`]. + pub fn push_packet( + &mut self, + packet: RtpPacket<'_>, + ) -> Result, RtpDepacketizerError> { + self.check_sequence(packet.sequence_number); + + match self.codec { + EncodedVideoCodec::H264 => self.push_h264_payload(&packet)?, + EncodedVideoCodec::H265 => self.push_h265_payload(&packet)?, + EncodedVideoCodec::VP8 => self.push_vp8_payload(&packet)?, + EncodedVideoCodec::VP9 => self.push_vp9_payload(&packet)?, + EncodedVideoCodec::AV1 => self.push_av1_payload(&packet)?, + } + + if packet.marker { + if self.fragment.is_some() || self.av1_fragment.is_some() { + // The marker closed the access unit before the open fragment's + // end arrived, so its tail packets were lost. + self.discard_in_progress(); + self.dropped_access_units += 1; + return Ok(None); + } + if matches!( + self.codec, + EncodedVideoCodec::VP8 | EncodedVideoCodec::VP9 | EncodedVideoCodec::AV1 + ) { + return self.finish_current_frame(); + } + return self.finish_current(); + } + Ok(None) + } + + fn check_sequence(&mut self, sequence_number: u16) { + let Some(expected) = self.expected_sequence_number.replace(sequence_number.wrapping_add(1)) + else { + return; + }; + if sequence_number == expected { + return; + } + + self.sequence_gaps += 1; + self.discard_in_progress(); + } + + /// Discards all partially assembled state and gates output on the next keyframe. 
+ fn discard_in_progress(&mut self) { + self.current = None; + self.fragment = None; + self.current_frame = None; + self.av1_fragment = None; + self.awaiting_keyframe = true; + } + + /// Drops completed access units until a keyframe ends loss recovery. + fn gate_on_keyframe( + &mut self, + access_unit: OwnedEncodedAccessUnit, + ) -> Option { + if self.awaiting_keyframe { + if access_unit.frame_type != EncodedFrameType::Key { + self.dropped_access_units += 1; + return None; + } + self.awaiting_keyframe = false; + } + Some(access_unit) + } + + fn current_mut( + &mut self, + rtp_timestamp: u32, + ) -> Result<&mut PartialAccessUnit, RtpDepacketizerError> { + if self.current.as_ref().is_some_and(|current| current.rtp_timestamp != rtp_timestamp) { + self.current = None; + self.fragment = None; + } + + if self.current.is_none() { + let timestamp_us = self.timestamp_mapper.map(rtp_timestamp)?; + self.current = + Some(PartialAccessUnit { rtp_timestamp, timestamp_us, nal_units: Vec::new() }); + } + + self.current.as_mut().ok_or(RtpDepacketizerError::InvalidFragment) + } + + fn current_frame_mut( + &mut self, + rtp_timestamp: u32, + ) -> Result<&mut PartialFrame, RtpDepacketizerError> { + if self.current_frame.as_ref().is_some_and(|current| current.rtp_timestamp != rtp_timestamp) + { + self.current_frame = None; + self.av1_fragment = None; + } + + if self.current_frame.is_none() { + let timestamp_us = self.timestamp_mapper.map(rtp_timestamp)?; + self.current_frame = Some(PartialFrame { + rtp_timestamp, + timestamp_us, + payload: Vec::new(), + frame_type: None, + }); + } + + self.current_frame.as_mut().ok_or(RtpDepacketizerError::InvalidFragment) + } + + fn push_h264_payload(&mut self, packet: &RtpPacket<'_>) -> Result<(), RtpDepacketizerError> { + let payload = packet.payload; + let Some(&header) = payload.first() else { + return Err(RtpDepacketizerError::UnsupportedPayload); + }; + let nal_type = header & 0x1f; + + match nal_type { + 1..=23 => 
self.current_mut(packet.timestamp)?.nal_units.push(payload.to_vec()), + 24 => self.push_h264_stap_a(packet.timestamp, &payload[1..])?, + 28 => self.push_h264_fu_a(packet.timestamp, payload)?, + _ => return Err(RtpDepacketizerError::UnsupportedPayload), + } + + Ok(()) + } + + fn push_h264_stap_a( + &mut self, + rtp_timestamp: u32, + payload: &[u8], + ) -> Result<(), RtpDepacketizerError> { + let mut cursor = 0; + while cursor < payload.len() { + if payload.len() < cursor + 2 { + return Err(RtpDepacketizerError::UnsupportedPayload); + } + let len = u16::from_be_bytes([payload[cursor], payload[cursor + 1]]) as usize; + cursor += 2; + if len == 0 || payload.len() < cursor + len { + return Err(RtpDepacketizerError::UnsupportedPayload); + } + self.current_mut(rtp_timestamp)?.nal_units.push(payload[cursor..cursor + len].to_vec()); + cursor += len; + } + Ok(()) + } + + fn push_h264_fu_a( + &mut self, + rtp_timestamp: u32, + payload: &[u8], + ) -> Result<(), RtpDepacketizerError> { + if payload.len() < 2 { + return Err(RtpDepacketizerError::UnsupportedPayload); + } + + let indicator = payload[0]; + let header = payload[1]; + let start = (header & 0x80) != 0; + let end = (header & 0x40) != 0; + let nal_type = header & 0x1f; + if nal_type == 0 || nal_type > 23 { + return Err(RtpDepacketizerError::UnsupportedPayload); + } + + if start { + let mut nal_unit = Vec::with_capacity(1 + payload.len().saturating_sub(2)); + nal_unit.push((indicator & 0xe0) | nal_type); + nal_unit.extend_from_slice(&payload[2..]); + self.fragment = Some(FragmentState { rtp_timestamp, nal_unit }); + return Ok(()); + } + + let Some(fragment) = + self.fragment.as_mut().filter(|fragment| fragment.rtp_timestamp == rtp_timestamp) + else { + // A continuation without its start means the preceding packets were lost. 
+ self.discard_in_progress(); + return Ok(()); + }; + fragment.nal_unit.extend_from_slice(&payload[2..]); + + if end { + let nal_unit = + self.fragment.take().ok_or(RtpDepacketizerError::InvalidFragment)?.nal_unit; + self.current_mut(rtp_timestamp)?.nal_units.push(nal_unit); + } + Ok(()) + } + + fn push_h265_payload(&mut self, packet: &RtpPacket<'_>) -> Result<(), RtpDepacketizerError> { + let payload = packet.payload; + if payload.len() < 2 { + return Err(RtpDepacketizerError::UnsupportedPayload); + } + let nal_type = (payload[0] >> 1) & 0x3f; + + match nal_type { + 0..=47 => self.current_mut(packet.timestamp)?.nal_units.push(payload.to_vec()), + 48 => self.push_h265_aggregation(packet.timestamp, &payload[2..])?, + 49 => self.push_h265_fragment(packet.timestamp, payload)?, + _ => return Err(RtpDepacketizerError::UnsupportedPayload), + } + + Ok(()) + } + + fn push_h265_aggregation( + &mut self, + rtp_timestamp: u32, + payload: &[u8], + ) -> Result<(), RtpDepacketizerError> { + let mut cursor = 0; + while cursor < payload.len() { + if payload.len() < cursor + 2 { + return Err(RtpDepacketizerError::UnsupportedPayload); + } + let len = u16::from_be_bytes([payload[cursor], payload[cursor + 1]]) as usize; + cursor += 2; + if len == 0 || payload.len() < cursor + len { + return Err(RtpDepacketizerError::UnsupportedPayload); + } + self.current_mut(rtp_timestamp)?.nal_units.push(payload[cursor..cursor + len].to_vec()); + cursor += len; + } + Ok(()) + } + + fn push_h265_fragment( + &mut self, + rtp_timestamp: u32, + payload: &[u8], + ) -> Result<(), RtpDepacketizerError> { + if payload.len() < 3 { + return Err(RtpDepacketizerError::UnsupportedPayload); + } + + let fu_header = payload[2]; + let start = (fu_header & 0x80) != 0; + let end = (fu_header & 0x40) != 0; + let nal_type = fu_header & 0x3f; + if nal_type > 47 { + return Err(RtpDepacketizerError::UnsupportedPayload); + } + + if start { + let mut nal_unit = Vec::with_capacity(2 + payload.len().saturating_sub(3)); + 
nal_unit.push((payload[0] & 0x81) | (nal_type << 1)); + nal_unit.push(payload[1]); + nal_unit.extend_from_slice(&payload[3..]); + self.fragment = Some(FragmentState { rtp_timestamp, nal_unit }); + return Ok(()); + } + + let Some(fragment) = + self.fragment.as_mut().filter(|fragment| fragment.rtp_timestamp == rtp_timestamp) + else { + // A continuation without its start means the preceding packets were lost. + self.discard_in_progress(); + return Ok(()); + }; + fragment.nal_unit.extend_from_slice(&payload[3..]); + + if end { + let nal_unit = + self.fragment.take().ok_or(RtpDepacketizerError::InvalidFragment)?.nal_unit; + self.current_mut(rtp_timestamp)?.nal_units.push(nal_unit); + } + Ok(()) + } + + fn push_vp8_payload(&mut self, packet: &RtpPacket<'_>) -> Result<(), RtpDepacketizerError> { + let descriptor = parse_vp8_payload_descriptor(packet.payload)?; + if descriptor.payload.is_empty() { + return Err(RtpDepacketizerError::UnsupportedPayload); + } + + let frame = self.current_frame_mut(packet.timestamp)?; + if frame.payload.is_empty() { + if !descriptor.start_of_partition || descriptor.partition_id != 0 { + // The beginning of this frame was lost. 
+ self.discard_in_progress(); + return Ok(()); + } + frame.frame_type = Some(if is_vp8_keyframe(descriptor.payload) { + EncodedFrameType::Key + } else { + EncodedFrameType::Delta + }); + } + frame.payload.extend_from_slice(descriptor.payload); + Ok(()) + } + + fn push_vp9_payload(&mut self, packet: &RtpPacket<'_>) -> Result<(), RtpDepacketizerError> { + let descriptor = parse_vp9_payload_descriptor(packet.payload)?; + if descriptor.payload.is_empty() { + return Err(RtpDepacketizerError::UnsupportedPayload); + } + if descriptor.spatial_id.unwrap_or(0) != 0 + || descriptor.inter_layer_predicted.unwrap_or(false) + { + return Err(RtpDepacketizerError::UnsupportedPayloadDescriptor); + } + + let frame = self.current_frame_mut(packet.timestamp)?; + if frame.payload.is_empty() { + if !descriptor.beginning_of_frame { + // The beginning of this frame was lost. + self.discard_in_progress(); + return Ok(()); + } + frame.frame_type = Some( + if !descriptor.inter_picture_predicted || is_vp9_keyframe(descriptor.payload) { + EncodedFrameType::Key + } else { + EncodedFrameType::Delta + }, + ); + } + frame.payload.extend_from_slice(descriptor.payload); + Ok(()) + } + + fn push_av1_payload(&mut self, packet: &RtpPacket<'_>) -> Result<(), RtpDepacketizerError> { + let descriptor = parse_av1_payload_descriptor(packet.payload)?; + if descriptor.elements.is_empty() { + return Err(RtpDepacketizerError::UnsupportedPayload); + } + + let mut saw_sequence_header = descriptor.new_sequence; + let last_index = descriptor.elements.len() - 1; + for (index, element) in descriptor.elements.iter().enumerate() { + if element.is_empty() { + return Err(RtpDepacketizerError::UnsupportedPayload); + } + + let obu = if index == 0 && descriptor.starts_fragment { + let Some(fragment) = self + .av1_fragment + .take() + .filter(|fragment| fragment.rtp_timestamp == packet.timestamp) + else { + // A continuation without its start means the preceding packets were lost. 
+ self.discard_in_progress(); + return Ok(()); + }; + let mut obu = fragment.obu; + obu.extend_from_slice(element); + obu + } else { + if index == 0 && self.av1_fragment.is_some() { + return Err(RtpDepacketizerError::InvalidFragment); + } + element.to_vec() + }; + + if index == last_index && descriptor.ends_fragment { + self.av1_fragment = Some(Av1FragmentState { rtp_timestamp: packet.timestamp, obu }); + return Ok(()); + } + + let mut obu = av1_obu_from_rtp_element(&obu)?; + saw_sequence_header |= av1_obu_type(&obu) == Some(1); + let frame = self.current_frame_mut(packet.timestamp)?; + if frame.payload.is_empty() || saw_sequence_header { + frame.frame_type = Some(if saw_sequence_header { + EncodedFrameType::Key + } else { + EncodedFrameType::Delta + }); + } + frame.payload.append(&mut obu); + } + + Ok(()) + } + + fn finish_current(&mut self) -> Result, RtpDepacketizerError> { + let Some(current) = self.current.take() else { + return Ok(None); + }; + if current.nal_units.is_empty() { + return Ok(None); + } + + let nal_units = current.nal_units.iter().map(Vec::as_slice).collect::>(); + let access_unit = access_unit_from_nalus( + self.codec, + &nal_units, + current.timestamp_us, + self.width, + self.height, + )?; + Ok(self.gate_on_keyframe(access_unit)) + } + + fn finish_current_frame( + &mut self, + ) -> Result, RtpDepacketizerError> { + let Some(current) = self.current_frame.take() else { + return Ok(None); + }; + if current.payload.is_empty() { + return Ok(None); + } + + let mut access_unit = OwnedEncodedAccessUnit::new( + self.codec, + Bytes::from(current.payload), + current.timestamp_us, + current.frame_type.unwrap_or(EncodedFrameType::Delta), + self.width, + self.height, + ); + access_unit.codec_specific = CodecSpecific::default_for(self.codec); + Ok(self.gate_on_keyframe(access_unit)) + } +} + +#[derive(Debug, Clone, Copy)] +struct Vp8PayloadDescriptor<'a> { + start_of_partition: bool, + partition_id: u8, + payload: &'a [u8], +} + +#[derive(Debug, Clone, 
Copy)] +struct Vp9PayloadDescriptor<'a> { + beginning_of_frame: bool, + inter_picture_predicted: bool, + spatial_id: Option, + inter_layer_predicted: Option, + payload: &'a [u8], +} + +#[derive(Debug, Clone)] +struct Av1PayloadDescriptor<'a> { + starts_fragment: bool, + ends_fragment: bool, + new_sequence: bool, + elements: Vec<&'a [u8]>, +} + +fn parse_vp8_payload_descriptor( + payload: &[u8], +) -> Result, RtpDepacketizerError> { + let Some(&descriptor) = payload.first() else { + return Err(RtpDepacketizerError::UnsupportedPayload); + }; + let start_of_partition = descriptor & 0x10 != 0; + let partition_id = descriptor & 0x0f; + let mut cursor = 1; + if descriptor & 0x80 != 0 { + let Some(&extension) = payload.get(cursor) else { + return Err(RtpDepacketizerError::UnsupportedPayload); + }; + cursor += 1; + if extension & 0x80 != 0 { + let Some(&picture_id) = payload.get(cursor) else { + return Err(RtpDepacketizerError::UnsupportedPayload); + }; + cursor += if picture_id & 0x80 != 0 { 2 } else { 1 }; + } + if extension & 0x40 != 0 { + cursor += 1; + } + if extension & 0x20 != 0 || extension & 0x10 != 0 { + cursor += 1; + } + } + if cursor > payload.len() { + return Err(RtpDepacketizerError::UnsupportedPayload); + } + Ok(Vp8PayloadDescriptor { start_of_partition, partition_id, payload: &payload[cursor..] 
}) +} + +fn parse_vp9_payload_descriptor( + payload: &[u8], +) -> Result, RtpDepacketizerError> { + let Some(&descriptor) = payload.first() else { + return Err(RtpDepacketizerError::UnsupportedPayload); + }; + if descriptor & 0x10 != 0 { + return Err(RtpDepacketizerError::UnsupportedPayloadDescriptor); + } + + let beginning_of_frame = descriptor & 0x08 != 0; + let inter_picture_predicted = descriptor & 0x40 != 0; + let mut cursor = 1; + if descriptor & 0x80 != 0 { + let Some(&picture_id) = payload.get(cursor) else { + return Err(RtpDepacketizerError::UnsupportedPayload); + }; + cursor += if picture_id & 0x80 != 0 { 2 } else { 1 }; + } + + let mut spatial_id = None; + let mut inter_layer_predicted = None; + if descriptor & 0x20 != 0 { + let Some(&layer_info) = payload.get(cursor) else { + return Err(RtpDepacketizerError::UnsupportedPayload); + }; + cursor += 1; + spatial_id = Some((layer_info >> 1) & 0x07); + inter_layer_predicted = Some(layer_info & 0x01 != 0); + cursor += 1; // TL0PICIDX is present in non-flexible mode. 
+ } + + if descriptor & 0x02 != 0 { + skip_vp9_scalability_structure(payload, &mut cursor)?; + } + + if cursor > payload.len() { + return Err(RtpDepacketizerError::UnsupportedPayload); + } + Ok(Vp9PayloadDescriptor { + beginning_of_frame, + inter_picture_predicted, + spatial_id, + inter_layer_predicted, + payload: &payload[cursor..], + }) +} + +fn skip_vp9_scalability_structure( + payload: &[u8], + cursor: &mut usize, +) -> Result<(), RtpDepacketizerError> { + let Some(&structure) = payload.get(*cursor) else { + return Err(RtpDepacketizerError::UnsupportedPayload); + }; + *cursor += 1; + + let spatial_layers = ((structure >> 5) & 0x07) + 1; + if spatial_layers != 1 { + return Err(RtpDepacketizerError::UnsupportedPayloadDescriptor); + } + + if structure & 0x10 != 0 { + let bytes = usize::from(spatial_layers) * 4; + skip_bytes(payload, cursor, bytes)?; + } + + if structure & 0x08 != 0 { + let Some(&group_count) = payload.get(*cursor) else { + return Err(RtpDepacketizerError::UnsupportedPayload); + }; + *cursor += 1; + for _ in 0..group_count { + let Some(&group) = payload.get(*cursor) else { + return Err(RtpDepacketizerError::UnsupportedPayload); + }; + *cursor += 1; + skip_bytes(payload, cursor, usize::from((group >> 2) & 0x03))?; + } + } + + Ok(()) +} + +fn skip_bytes( + payload: &[u8], + cursor: &mut usize, + bytes: usize, +) -> Result<(), RtpDepacketizerError> { + let Some(next) = cursor.checked_add(bytes) else { + return Err(RtpDepacketizerError::UnsupportedPayload); + }; + if next > payload.len() { + return Err(RtpDepacketizerError::UnsupportedPayload); + } + *cursor = next; + Ok(()) +} + +fn parse_av1_payload_descriptor( + payload: &[u8], +) -> Result, RtpDepacketizerError> { + let Some(&header) = payload.first() else { + return Err(RtpDepacketizerError::UnsupportedPayload); + }; + let starts_fragment = header & 0x80 != 0; + let ends_fragment = header & 0x40 != 0; + let element_count = (header >> 4) & 0x03; + let new_sequence = header & 0x08 != 0; + + let mut 
cursor = 1; + let mut elements = Vec::new(); + if element_count == 0 { + while cursor < payload.len() { + let len = read_leb128(payload, &mut cursor)?; + let Some(end) = cursor.checked_add(len) else { + return Err(RtpDepacketizerError::UnsupportedPayload); + }; + if end > payload.len() { + return Err(RtpDepacketizerError::UnsupportedPayload); + } + elements.push(&payload[cursor..end]); + cursor = end; + } + } else { + for index in 0..usize::from(element_count) { + let len = if index + 1 == usize::from(element_count) { + payload.len().saturating_sub(cursor) + } else { + read_leb128(payload, &mut cursor)? + }; + let Some(end) = cursor.checked_add(len) else { + return Err(RtpDepacketizerError::UnsupportedPayload); + }; + if end > payload.len() { + return Err(RtpDepacketizerError::UnsupportedPayload); + } + elements.push(&payload[cursor..end]); + cursor = end; + } + } + + Ok(Av1PayloadDescriptor { starts_fragment, ends_fragment, new_sequence, elements }) +} + +fn read_leb128(bytes: &[u8], cursor: &mut usize) -> Result { + let mut value = 0usize; + let mut shift = 0usize; + loop { + let Some(&byte) = bytes.get(*cursor) else { + return Err(RtpDepacketizerError::UnsupportedPayload); + }; + *cursor += 1; + value |= usize::from(byte & 0x7f) << shift; + if byte & 0x80 == 0 { + return Ok(value); + } + shift += 7; + if shift >= usize::BITS as usize { + return Err(RtpDepacketizerError::UnsupportedPayload); + } + } +} + +fn write_leb128(mut value: usize, out: &mut Vec) { + loop { + let mut byte = (value & 0x7f) as u8; + value >>= 7; + if value != 0 { + byte |= 0x80; + } + out.push(byte); + if value == 0 { + break; + } + } +} + +fn av1_obu_from_rtp_element(element: &[u8]) -> Result, RtpDepacketizerError> { + let Some(&header) = element.first() else { + return Err(RtpDepacketizerError::UnsupportedPayload); + }; + if header & 0x80 != 0 { + return Err(RtpDepacketizerError::UnsupportedPayload); + } + + if header & 0x02 != 0 { + let mut cursor = if header & 0x04 != 0 { 2 } else { 1 }; 
+ if cursor > element.len() { + return Err(RtpDepacketizerError::UnsupportedPayload); + } + let payload_size = read_leb128(element, &mut cursor)?; + if payload_size != element.len().saturating_sub(cursor) { + return Err(RtpDepacketizerError::UnsupportedPayload); + } + return Ok(element.to_vec()); + } + + let payload_offset = if header & 0x04 != 0 { 2 } else { 1 }; + if payload_offset > element.len() { + return Err(RtpDepacketizerError::UnsupportedPayload); + } + + let payload_size = element.len() - payload_offset; + let mut obu = Vec::with_capacity(element.len() + 8); + obu.push(header | 0x02); + if header & 0x04 != 0 { + obu.push(element[1]); + } + write_leb128(payload_size, &mut obu); + obu.extend_from_slice(&element[payload_offset..]); + Ok(obu) +} + +fn is_vp8_keyframe(payload: &[u8]) -> bool { + payload.first().is_some_and(|header| header & 0x01 == 0) +} + +/// Parses the start of a VP9 uncompressed frame header, whose `f(n)` fields +/// are MSB-first, and reports whether it begins a keyframe. +fn is_vp9_keyframe(payload: &[u8]) -> bool { + let Some(&first_byte) = payload.first() else { + return false; + }; + // frame_marker: f(2), must be 0b10. + if first_byte >> 6 != 0b10 { + return false; + } + + let mut bit_offset = 2usize; + let profile_low = read_bit(first_byte, bit_offset); + bit_offset += 1; + let profile_high = read_bit(first_byte, bit_offset); + bit_offset += 1; + let profile = profile_low | (profile_high << 1); + if profile == 3 { + bit_offset += 1; // reserved_zero + } + // show_existing_frame: a repeated frame is never a keyframe. + if read_bit(first_byte, bit_offset) != 0 { + return false; + } + bit_offset += 1; + // frame_type: 0 is KEY_FRAME. + read_bit(first_byte, bit_offset) == 0 +} + +/// Reads bit `bit_offset` of `byte`, counting from the most significant bit. 
+fn read_bit(byte: u8, bit_offset: usize) -> u8 { + (byte >> (7 - bit_offset)) & 0x01 +} + +fn av1_obu_type(obu: &[u8]) -> Option { + obu.first().map(|header| (header & 0x78) >> 3) +} + +#[cfg(test)] +mod tests { + use super::*; + + fn rtp_packet(sequence_number: u16, timestamp: u32, marker: bool, payload: &[u8]) -> Vec { + let mut packet = Vec::with_capacity(12 + payload.len()); + packet.push(0x80); + packet.push(if marker { 0x80 | 96 } else { 96 }); + packet.extend_from_slice(&sequence_number.to_be_bytes()); + packet.extend_from_slice(×tamp.to_be_bytes()); + packet.extend_from_slice(&0x1122_3344_u32.to_be_bytes()); + packet.extend_from_slice(payload); + packet + } + + #[test] + fn parses_rtp_packet_header() { + let bytes = rtp_packet(7, 90_000, true, &[0x65, 1, 2]); + let packet = RtpPacket::parse(&bytes).unwrap(); + assert!(packet.marker); + assert_eq!(packet.payload_type, 96); + assert_eq!(packet.sequence_number, 7); + assert_eq!(packet.timestamp, 90_000); + assert_eq!(packet.payload, &[0x65, 1, 2]); + } + + #[test] + fn maps_rtp_timestamp_rollover() { + let mut mapper = RtpTimestampMapper::new(90_000, 1_000); + assert_eq!(mapper.map(u32::MAX - 89).unwrap(), 1_000); + assert_eq!(mapper.map(0).unwrap(), 2_000); + } + + #[test] + fn maps_rtp_timestamps_across_multiple_rollovers() { + let mut mapper = RtpTimestampMapper::new(90_000, 0); + let step = 1u32 << 30; + let mut rtp_timestamp = 0u32; + let mut last_us = mapper.map(rtp_timestamp).unwrap(); + for _ in 0..20 { + rtp_timestamp = rtp_timestamp.wrapping_add(step); + let mapped_us = mapper.map(rtp_timestamp).unwrap(); + assert!(mapped_us > last_us, "mapped timestamps must stay monotonic"); + last_us = mapped_us; + } + assert_eq!(last_us, (20i64 << 30) * 1_000_000 / 90_000); + } + + #[test] + fn maps_reordered_rtp_timestamps() { + let mut mapper = RtpTimestampMapper::new(90_000, 1_000); + assert_eq!(mapper.map(9_000).unwrap(), 1_000); + assert_eq!(mapper.map(18_000).unwrap(), 101_000); + // A late packet maps 
behind the stream without disturbing what follows. + assert_eq!(mapper.map(15_000).unwrap(), 67_666); + assert_eq!(mapper.map(27_000).unwrap(), 201_000); + } + + #[test] + fn assembles_h264_fu_a() { + let mut assembler = + RtpAccessUnitAssembler::new(EncodedVideoCodec::H264, 90_000, 0, 640, 480).unwrap(); + let start = rtp_packet(10, 12_000, false, &[0x7c, 0x85, 1, 2]); + let end = rtp_packet(11, 12_000, true, &[0x7c, 0x45, 3, 4]); + + assert!(assembler.push(&start).unwrap().is_none()); + let access_unit = assembler.push(&end).unwrap().unwrap(); + assert_eq!(access_unit.payload.as_ref(), &[0, 0, 0, 1, 0x65, 1, 2, 3, 4]); + } + + #[test] + fn sequence_gap_recovers_h264_at_next_keyframe() { + let mut assembler = + RtpAccessUnitAssembler::new(EncodedVideoCodec::H264, 90_000, 0, 640, 480).unwrap(); + let start = rtp_packet(10, 12_000, false, &[0x7c, 0x85, 1, 2]); + let delta = rtp_packet(12, 15_000, true, &[0x41, 1, 2]); + let key = rtp_packet(13, 18_000, true, &[0x65, 3, 4]); + + assert!(assembler.push(&start).unwrap().is_none()); + // The gap dropped the fragment; the delta frame after it is withheld. 
+ assert!(assembler.push(&delta).unwrap().is_none()); + let stats = assembler.stats(); + assert_eq!(stats.sequence_gaps, 1); + assert_eq!(stats.dropped_access_units, 1); + assert!(stats.awaiting_keyframe); + + let access_unit = assembler.push(&key).unwrap().unwrap(); + assert_eq!(access_unit.frame_type, EncodedFrameType::Key); + assert_eq!(access_unit.payload.as_ref(), &[0, 0, 0, 1, 0x65, 3, 4]); + let stats = assembler.stats(); + assert_eq!(stats.dropped_access_units, 1); + assert!(!stats.awaiting_keyframe); + } + + #[test] + fn marker_with_open_h264_fragment_drops_access_unit() { + let mut assembler = + RtpAccessUnitAssembler::new(EncodedVideoCodec::H264, 90_000, 0, 640, 480).unwrap(); + let start = rtp_packet(10, 12_000, false, &[0x7c, 0x85, 1, 2]); + let truncated = rtp_packet(11, 12_000, true, &[0x7c, 0x05, 3, 4]); + let key = rtp_packet(12, 15_000, true, &[0x65, 5, 6]); + + assert!(assembler.push(&start).unwrap().is_none()); + // The marker arrived without the FU end bit: the fragment is truncated. 
+ assert!(assembler.push(&truncated).unwrap().is_none()); + let stats = assembler.stats(); + assert_eq!(stats.sequence_gaps, 0); + assert_eq!(stats.dropped_access_units, 1); + assert!(stats.awaiting_keyframe); + + let access_unit = assembler.push(&key).unwrap().unwrap(); + assert_eq!(access_unit.frame_type, EncodedFrameType::Key); + assert_eq!(access_unit.payload.as_ref(), &[0, 0, 0, 1, 0x65, 5, 6]); + assert!(!assembler.stats().awaiting_keyframe); + } + + #[test] + fn drops_h264_fu_continuation_without_start() { + let mut assembler = + RtpAccessUnitAssembler::new(EncodedVideoCodec::H264, 90_000, 0, 640, 480).unwrap(); + let continuation = rtp_packet(10, 12_000, false, &[0x7c, 0x05, 1, 2]); + let key = rtp_packet(11, 15_000, true, &[0x65, 3, 4]); + + assert!(assembler.push(&continuation).unwrap().is_none()); + assert!(assembler.stats().awaiting_keyframe); + + let access_unit = assembler.push(&key).unwrap().unwrap(); + assert_eq!(access_unit.frame_type, EncodedFrameType::Key); + assert_eq!(access_unit.payload.as_ref(), &[0, 0, 0, 1, 0x65, 3, 4]); + } + + #[test] + fn assembles_vp8_fragments() { + let mut assembler = + RtpAccessUnitAssembler::new(EncodedVideoCodec::VP8, 90_000, 0, 640, 480).unwrap(); + let start = rtp_packet(10, 12_000, false, &[0x10, 0x00, 1, 2]); + let end = rtp_packet(11, 12_000, true, &[0x00, 3, 4]); + + assert!(assembler.push(&start).unwrap().is_none()); + let access_unit = assembler.push(&end).unwrap().unwrap(); + assert_eq!(access_unit.codec, EncodedVideoCodec::VP8); + assert_eq!(access_unit.frame_type, EncodedFrameType::Key); + assert_eq!(access_unit.payload.as_ref(), &[0x00, 1, 2, 3, 4]); + } + + #[test] + fn drops_vp8_mid_frame_start() { + let mut assembler = + RtpAccessUnitAssembler::new(EncodedVideoCodec::VP8, 90_000, 0, 640, 480).unwrap(); + let mid_frame = rtp_packet(10, 12_000, true, &[0x00, 1, 2]); + let key = rtp_packet(11, 15_000, true, &[0x10, 0x00, 3, 4]); + + assert!(assembler.push(&mid_frame).unwrap().is_none()); + 
assert!(assembler.stats().awaiting_keyframe); + + let access_unit = assembler.push(&key).unwrap().unwrap(); + assert_eq!(access_unit.frame_type, EncodedFrameType::Key); + assert_eq!(access_unit.payload.as_ref(), &[0x00, 3, 4]); + } + + #[test] + fn assembles_vp9_single_layer_frame() { + let mut assembler = + RtpAccessUnitAssembler::new(EncodedVideoCodec::VP9, 90_000, 0, 640, 480).unwrap(); + let packet = rtp_packet(10, 12_000, true, &[0x0c, 0x82, 1, 2]); + + let access_unit = assembler.push(&packet).unwrap().unwrap(); + assert_eq!(access_unit.codec, EncodedVideoCodec::VP9); + assert_eq!(access_unit.frame_type, EncodedFrameType::Key); + assert_eq!(access_unit.payload.as_ref(), &[0x82, 1, 2]); + } + + #[test] + fn assembles_vp9_non_flexible_layer_descriptor() { + let mut assembler = + RtpAccessUnitAssembler::new(EncodedVideoCodec::VP9, 90_000, 0, 640, 480).unwrap(); + let packet = rtp_packet(10, 12_000, true, &[0x2c, 0x10, 7, 0x82, 1, 2]); + + let access_unit = assembler.push(&packet).unwrap().unwrap(); + assert_eq!(access_unit.codec, EncodedVideoCodec::VP9); + assert_eq!(access_unit.payload.as_ref(), &[0x82, 1, 2]); + } + + #[test] + fn assembles_vp9_single_layer_scalability_structure() { + let mut assembler = + RtpAccessUnitAssembler::new(EncodedVideoCodec::VP9, 90_000, 0, 640, 480).unwrap(); + let packet = rtp_packet( + 10, + 12_000, + true, + &[ + 0x0e, // B, E, V + 0x18, // one spatial layer, resolution present, picture group present + 0x01, 0x40, 0x00, 0xb4, // 320x180 + 0x01, // one picture group + 0x04, // one reference index + 0x01, // P_DIFF + 0x82, 1, 2, + ], + ); + + let access_unit = assembler.push(&packet).unwrap().unwrap(); + assert_eq!(access_unit.codec, EncodedVideoCodec::VP9); + assert_eq!(access_unit.frame_type, EncodedFrameType::Key); + assert_eq!(access_unit.payload.as_ref(), &[0x82, 1, 2]); + } + + #[test] + fn assembles_vp9_descriptor_keyframe_from_prediction_bit() { + let mut assembler = + RtpAccessUnitAssembler::new(EncodedVideoCodec::VP9, 
90_000, 0, 640, 480).unwrap(); + let packet = rtp_packet( + 10, + 12_000, + true, + &[ + 0x0e, // B, E, V; P is clear, so this is not inter-picture predicted. + 0x18, // one spatial layer, resolution present, picture group present + 0x02, 0x80, 0x01, 0x68, // 640x360 + 0x01, // one picture group + 0x04, // one reference index + 0x01, // P_DIFF + 0xb1, 1, 2, + ], + ); + + let access_unit = assembler.push(&packet).unwrap().unwrap(); + assert_eq!(access_unit.frame_type, EncodedFrameType::Key); + assert_eq!(access_unit.payload.as_ref(), &[0xb1, 1, 2]); + } + + #[test] + fn assembles_vp9_predicted_frame_as_delta() { + let mut assembler = + RtpAccessUnitAssembler::new(EncodedVideoCodec::VP9, 90_000, 0, 640, 480).unwrap(); + // P is set and the payload is an inter frame: must not classify as Key. + let packet = rtp_packet(10, 12_000, true, &[0x4c, 0x86, 1, 2]); + + let access_unit = assembler.push(&packet).unwrap().unwrap(); + assert_eq!(access_unit.frame_type, EncodedFrameType::Delta); + assert_eq!(access_unit.payload.as_ref(), &[0x86, 1, 2]); + } + + #[test] + fn vp9_bitstream_keyframe_overrides_predicted_bit() { + let mut assembler = + RtpAccessUnitAssembler::new(EncodedVideoCodec::VP9, 90_000, 0, 640, 480).unwrap(); + // P is set but the uncompressed header says KEY_FRAME. + let packet = rtp_packet(10, 12_000, true, &[0x4c, 0x82, 1, 2]); + + let access_unit = assembler.push(&packet).unwrap().unwrap(); + assert_eq!(access_unit.frame_type, EncodedFrameType::Key); + assert_eq!(access_unit.payload.as_ref(), &[0x82, 1, 2]); + } + + #[test] + fn classifies_vp9_uncompressed_header_frame_types() { + // 0b1000_0010: marker, profile 0, show_existing=0, KEY_FRAME, show_frame=1. + assert!(is_vp9_keyframe(&[0x82])); + // 0b1000_0011: keyframe with error_resilient_mode set. + assert!(is_vp9_keyframe(&[0x83])); + // 0b1011_0000: profile 3 keyframe. + assert!(is_vp9_keyframe(&[0xb0])); + // 0b1000_0110: frame_type=1, an inter frame. 
+ assert!(!is_vp9_keyframe(&[0x86])); + // 0b1011_0010: profile 3 inter frame. + assert!(!is_vp9_keyframe(&[0xb2])); + // 0b1000_1000: show_existing_frame repeats a decoded frame. + assert!(!is_vp9_keyframe(&[0x88])); + // 0b0000_0010: invalid frame_marker. + assert!(!is_vp9_keyframe(&[0x02])); + assert!(!is_vp9_keyframe(&[])); + } + + #[test] + fn rejects_vp9_multi_layer_scalability_structure() { + let mut assembler = + RtpAccessUnitAssembler::new(EncodedVideoCodec::VP9, 90_000, 0, 640, 480).unwrap(); + let packet = rtp_packet(10, 12_000, true, &[0x0e, 0x20, 0x82, 1, 2]); + + let err = assembler.push(&packet).unwrap_err(); + assert_eq!(err, RtpDepacketizerError::UnsupportedPayloadDescriptor); + } + + #[test] + fn drops_vp9_mid_frame_start() { + let mut assembler = + RtpAccessUnitAssembler::new(EncodedVideoCodec::VP9, 90_000, 0, 640, 480).unwrap(); + let mid_frame = rtp_packet(10, 12_000, true, &[0x04, 0x82, 1, 2]); + let key = rtp_packet(11, 15_000, true, &[0x0c, 0x82, 3, 4]); + + assert!(assembler.push(&mid_frame).unwrap().is_none()); + assert!(assembler.stats().awaiting_keyframe); + + let access_unit = assembler.push(&key).unwrap().unwrap(); + assert_eq!(access_unit.frame_type, EncodedFrameType::Key); + assert_eq!(access_unit.payload.as_ref(), &[0x82, 3, 4]); + } + + #[test] + fn rejects_vp9_flexible_mode() { + let mut assembler = + RtpAccessUnitAssembler::new(EncodedVideoCodec::VP9, 90_000, 0, 640, 480).unwrap(); + let packet = rtp_packet(10, 12_000, true, &[0x1c, 0xa2, 1, 2]); + + let err = assembler.push(&packet).unwrap_err(); + assert_eq!(err, RtpDepacketizerError::UnsupportedPayloadDescriptor); + } + + #[test] + fn assembles_av1_temporal_unit() { + let mut assembler = + RtpAccessUnitAssembler::new(EncodedVideoCodec::AV1, 90_000, 0, 640, 480).unwrap(); + let packet = rtp_packet(10, 12_000, true, &[0x18, 0x08]); + + let access_unit = assembler.push(&packet).unwrap().unwrap(); + assert_eq!(access_unit.codec, EncodedVideoCodec::AV1); + 
assert_eq!(access_unit.frame_type, EncodedFrameType::Key); + assert_eq!(access_unit.payload.as_ref(), &[0x0a, 0x00]); + } + + #[test] + fn assembles_fragmented_av1_obu() { + let mut assembler = + RtpAccessUnitAssembler::new(EncodedVideoCodec::AV1, 90_000, 0, 640, 480).unwrap(); + let start = rtp_packet(10, 12_000, false, &[0x50, 0x30, 1]); + let end = rtp_packet(11, 12_000, true, &[0x90, 2, 3]); + + assert!(assembler.push(&start).unwrap().is_none()); + let access_unit = assembler.push(&end).unwrap().unwrap(); + assert_eq!(access_unit.frame_type, EncodedFrameType::Delta); + assert_eq!(access_unit.payload.as_ref(), &[0x32, 0x03, 1, 2, 3]); + } + + #[test] + fn assembles_av1_obu_payload_with_size_field() { + let mut assembler = + RtpAccessUnitAssembler::new(EncodedVideoCodec::AV1, 90_000, 0, 640, 480).unwrap(); + let packet = rtp_packet(10, 12_000, true, &[0x10, 0x30, 1, 2, 3]); + + let access_unit = assembler.push(&packet).unwrap().unwrap(); + assert_eq!(access_unit.frame_type, EncodedFrameType::Delta); + assert_eq!(access_unit.payload.as_ref(), &[0x32, 0x03, 1, 2, 3]); + } + + #[test] + fn marker_with_open_av1_fragment_drops_frame() { + let mut assembler = + RtpAccessUnitAssembler::new(EncodedVideoCodec::AV1, 90_000, 0, 640, 480).unwrap(); + // Y is set, so the OBU fragment is unterminated when the marker closes it. 
+ let truncated = rtp_packet(10, 12_000, true, &[0x50, 0x30, 1]); + let key = rtp_packet(11, 15_000, true, &[0x18, 0x08]); + + assert!(assembler.push(&truncated).unwrap().is_none()); + let stats = assembler.stats(); + assert_eq!(stats.dropped_access_units, 1); + assert!(stats.awaiting_keyframe); + + let access_unit = assembler.push(&key).unwrap().unwrap(); + assert_eq!(access_unit.frame_type, EncodedFrameType::Key); + assert_eq!(access_unit.payload.as_ref(), &[0x0a, 0x00]); + assert!(!assembler.stats().awaiting_keyframe); + } + + #[test] + fn drops_av1_fragment_continuation_without_start() { + let mut assembler = + RtpAccessUnitAssembler::new(EncodedVideoCodec::AV1, 90_000, 0, 640, 480).unwrap(); + // Z is set: this continues an OBU whose start was never received. + let continuation = rtp_packet(10, 12_000, true, &[0x90, 2, 3]); + let key = rtp_packet(11, 15_000, true, &[0x18, 0x08]); + + assert!(assembler.push(&continuation).unwrap().is_none()); + assert!(assembler.stats().awaiting_keyframe); + + let access_unit = assembler.push(&key).unwrap().unwrap(); + assert_eq!(access_unit.frame_type, EncodedFrameType::Key); + assert_eq!(access_unit.payload.as_ref(), &[0x0a, 0x00]); + } + + #[test] + fn sequence_gap_recovers_vp8_at_next_keyframe() { + let mut assembler = + RtpAccessUnitAssembler::new(EncodedVideoCodec::VP8, 90_000, 0, 640, 480).unwrap(); + let start = rtp_packet(10, 12_000, false, &[0x10, 0x00, 1, 2]); + let delta = rtp_packet(12, 15_000, true, &[0x10, 0x01, 3, 4]); + let key = rtp_packet(13, 18_000, true, &[0x10, 0x00, 5, 6]); + + assert!(assembler.push(&start).unwrap().is_none()); + // The gap dropped the fragment; the delta frame after it is withheld. 
+ assert!(assembler.push(&delta).unwrap().is_none()); + let stats = assembler.stats(); + assert_eq!(stats.sequence_gaps, 1); + assert_eq!(stats.dropped_access_units, 1); + assert!(stats.awaiting_keyframe); + + let access_unit = assembler.push(&key).unwrap().unwrap(); + assert_eq!(access_unit.frame_type, EncodedFrameType::Key); + assert_eq!(access_unit.payload.as_ref(), &[0x00, 5, 6]); + assert!(!assembler.stats().awaiting_keyframe); + } +} diff --git a/livekit-capture/src/error.rs b/livekit-capture/src/error.rs new file mode 100644 index 000000000..77d794dd8 --- /dev/null +++ b/livekit-capture/src/error.rs @@ -0,0 +1,52 @@ +// Copyright 2026 LiveKit, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use thiserror::Error; + +use crate::encoded::{EncodedVideoCodec, EncodedWireFormat}; + +/// Error returned by capture helpers. +#[derive(Debug, Error, PartialEq, Eq)] +pub enum CaptureError { + /// Encoded payload is empty. + #[error("encoded payload is empty")] + EmptyPayload, + /// H.265 NAL unit is too short to contain its header. + #[error("H.265 NAL unit is too short")] + H265NalTooShort, + /// DMA-BUF frame did not include any planes. + #[error("DMA-BUF frame did not include any planes")] + MissingDmaBufPlane, + /// DMA-BUF frame layout cannot be represented by the native capture path. + #[error("unsupported DMA-BUF layout: {0}")] + UnsupportedDmaBufLayout(&'static str), + /// Access unit carries layering metadata the passthrough cannot forward. 
+ #[error("unsupported layered encoding: {0}")] + UnsupportedLayeredEncoding(&'static str), + /// Codec is represented by the API but not yet supported by native passthrough. + #[error("encoded passthrough does not support {0:?} yet")] + UnsupportedCodec(EncodedVideoCodec), + /// Encoded payload or transport data is malformed. + #[error("invalid encoded data: {0}")] + InvalidEncodedData(&'static str), + /// Wire format is represented by the API but not supported by this source. + #[error("encoded wire format is not supported by this source: {0:?}")] + UnsupportedWireFormat(EncodedWireFormat), + /// Capture backend is not available on this platform. + #[error("{0} is not supported on this platform")] + UnsupportedPlatform(&'static str), + /// The underlying source rejected the frame. + #[error("capture source rejected the frame")] + CaptureFailed, +} diff --git a/livekit-capture/src/lib.rs b/livekit-capture/src/lib.rs new file mode 100644 index 000000000..af8b8ffb1 --- /dev/null +++ b/livekit-capture/src/lib.rs @@ -0,0 +1,46 @@ +// Copyright 2026 LiveKit, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Capture helpers for publishing decoded, DMA-BUF, and encoded video with LiveKit. 
+ +pub mod device; +pub mod dmabuf; +pub mod encoded; +mod error; +pub mod source; +pub mod sources; +pub(crate) mod time; +pub mod track; + +pub use device::{ + CaptureBackend, CaptureDeviceInfo, CaptureDeviceQueryError, CaptureDeviceSelector, + CaptureFormat, CaptureFormatRequest, CaptureFrameFormat, CapturePath, CaptureResolution, +}; +pub use dmabuf::{DmaBufFrame, DmaBufPixelFormat, DmaBufPlane}; +pub use encoded::{ + ingress::{ + EncodedAccessUnitSource, EncodedIngress, EncodedIngressCapture, EncodedIngressError, + EncodedIngressStop, + }, + CodecSpecific, EncodedAccessUnit, EncodedFragment, EncodedFrameType, EncodedLayerInfo, + EncodedPayload, EncodedVideoCodec, EncodedWireFormat, H264PacketizationMode, + OwnedEncodedAccessUnit, +}; +pub use error::CaptureError; +pub use source::{ + CaptureFrame, CaptureFrameSource, CaptureSourceError, CaptureSourceOptions, + EncodedCaptureFrameSource, EncodedEndpoint, EncodedFrameSourceError, NativeVideoFrame, + RawVideoFrame, VideoCaptureSource, +}; +pub use track::VideoCaptureTrack; diff --git a/livekit-capture/src/source.rs b/livekit-capture/src/source.rs new file mode 100644 index 000000000..be0243b0f --- /dev/null +++ b/livekit-capture/src/source.rs @@ -0,0 +1,1069 @@ +// Copyright 2026 LiveKit, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::{error::Error, fmt}; + +use livekit::webrtc::video_frame::{native::NativeBuffer, I420Buffer, VideoFrame}; +use thiserror::Error; + +use crate::{ + device::{ + CaptureBackend, CaptureDeviceInfo, CaptureDeviceQueryError, CaptureDeviceSelector, + CaptureFormat, CaptureFormatRequest, CaptureFrameFormat, CapturePath, + }, + dmabuf::DmaBufFrame, + encoded::{ingress::EncodedAccessUnitSource, OwnedEncodedAccessUnit}, + error::CaptureError, + track::VideoCaptureTrack, +}; + +/// Options used by [`VideoCaptureSource::open`]. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct CaptureSourceOptions { + /// Backend to open. + pub backend: CaptureBackend, + /// Device to open. + pub device: CaptureDeviceSelector, + /// Format requested from the backend. + pub format: CaptureFormatRequest, + /// Whether the resulting track should be marked as a screencast. + pub is_screencast: bool, + /// Prefer CPU-accessible frames over zero-copy native buffers, for + /// callers that modify pixels before publishing. + pub prefer_raw_frames: bool, + /// Endpoint for the encoded ingest backends (RTSP/TCP/GStreamer). + pub encoded: Option, +} + +impl Default for CaptureSourceOptions { + fn default() -> Self { + Self { + backend: CaptureBackend::Auto, + device: CaptureDeviceSelector::Default, + format: CaptureFormatRequest::Default, + is_screencast: false, + prefer_raw_frames: false, + encoded: None, + } + } +} + +/// Endpoint configuration for the encoded ingest backends. +#[derive(Debug, Clone, PartialEq, Eq)] +#[non_exhaustive] +pub enum EncodedEndpoint { + /// RTSP camera URL ingested over TCP-interleaved RTP. + #[cfg(feature = "rtsp")] + Rtsp { + /// RTSP URL, e.g. `rtsp://user:pass@camera/stream`. + url: String, + /// RTSP source options (codec expectation, dimensions, timeouts). + options: crate::sources::rtsp::RtspSourceOptions, + }, + /// TCP byte-stream endpoint to connect to. + #[cfg(feature = "tcpsink")] + TcpConnect { + /// `host:port` to connect to. 
+ address: String, + /// Byte-stream configuration (wire format, dimensions, timing). + config: crate::sources::tcp::ByteStreamSourceConfig, + }, + /// GStreamer launch description that contains or feeds an encoded appsink. + /// + /// The pipeline must either contain `appsink name=lk_appsink` or leave + /// one encoded video source pad unlinked (a parser, capsfilter, and + /// appsink are attached automatically). + #[cfg(feature = "gstreamer")] + GstreamerLaunch { + /// `gst-launch`-style pipeline description. + launch: String, + /// Expected codec; inferred from the pipeline caps when `None`. + codec: Option, + /// Appsink source configuration (dimensions and timestamp fallbacks). + /// + /// The `sample_format` field is overridden by what the pipeline caps + /// advertise. + config: crate::sources::gstreamer::GStreamerAppSinkConfig, + }, +} + +/// Uncompressed CPU-accessible video frame buffer produced by a capture source. +#[derive(Debug)] +pub struct RawVideoFrame { + /// I420 video frame suitable for [`VideoCaptureTrack::capture_frame`]. + pub frame: VideoFrame, + /// Source format delivered by the capture backend before conversion to I420. + pub source_format: CaptureFrameFormat, + /// Wall-clock capture timestamp in microseconds. + pub capture_wall_time_us: u64, + /// Wall-clock timestamp recorded after the frame was read, in microseconds. + pub read_wall_time_us: u64, + /// Sensor timestamp translated to UNIX-epoch microseconds, when available. + pub sensor_timestamp_us: Option, + /// Whether the backend converted the source buffer before publishing. + pub used_conversion: bool, +} + +impl RawVideoFrame { + /// Returns the decoded I420 video frame. + pub fn video_frame(&self) -> &VideoFrame { + &self.frame + } +} + +/// Platform-native uncompressed video frame buffer produced by a capture source. +#[derive(Debug)] +pub struct NativeVideoFrame { + /// Native video frame suitable for [`VideoCaptureTrack::capture_frame`]. 
+ pub frame: VideoFrame, + /// Source format delivered by the capture backend. + pub source_format: CaptureFrameFormat, + /// Wall-clock capture timestamp in microseconds. + pub capture_wall_time_us: u64, + /// Wall-clock timestamp recorded after the frame was read, in microseconds. + pub read_wall_time_us: u64, + /// Sensor timestamp translated to UNIX-epoch microseconds, when available. + pub sensor_timestamp_us: Option, +} + +impl NativeVideoFrame { + /// Returns the native video frame. + pub fn video_frame(&self) -> &VideoFrame { + &self.frame + } +} + +/// Frame produced by a capture source. +#[derive(Debug)] +#[non_exhaustive] +pub enum CaptureFrame { + /// Platform-native uncompressed frame. + Native(NativeVideoFrame), + /// Uncompressed CPU-accessible frame. + Raw(RawVideoFrame), + /// Linux DMA-BUF backed frame. + DmaBuf(DmaBufFrame), + /// Encoded video access unit. + Encoded(OwnedEncodedAccessUnit), +} + +impl CaptureFrame { + /// Returns the capture path used by this frame. + pub fn capture_path(&self) -> CapturePath { + match self { + Self::Native(_) => CapturePath::Native, + Self::Raw(_) => CapturePath::Raw, + Self::DmaBuf(_) => CapturePath::DmaBuf, + Self::Encoded(_) => CapturePath::Encoded, + } + } + + /// Publishes this frame into a LiveKit capture track. + pub fn publish_to(&self, track: &VideoCaptureTrack) -> Result<(), CaptureError> { + match self { + Self::Native(frame) => { + track.capture_frame(&frame.frame); + Ok(()) + } + Self::Raw(frame) => { + track.capture_frame(&frame.frame); + Ok(()) + } + #[cfg(target_os = "linux")] + Self::DmaBuf(frame) => track.capture_dmabuf(frame), + #[cfg(not(target_os = "linux"))] + Self::DmaBuf(_) => Err(CaptureError::UnsupportedPlatform("DMA-BUF capture")), + Self::Encoded(access_unit) => track.capture_encoded(&access_unit.as_access_unit()), + } + } +} + +/// Source that produces one of the common capture frame paths. +pub trait CaptureFrameSource { + /// Error returned by the source. 
+ type Error: Error + Send + Sync + 'static; + + /// Returns the capture path produced by this source. + fn capture_path(&self) -> CapturePath; + + /// Returns the negotiated capture format when the source has one. + fn format(&self) -> Option; + + /// Captures the next frame. + fn next_frame(&mut self) -> Result; +} + +/// Adapts an [`EncodedAccessUnitSource`] into the common frame-source model. +#[derive(Debug)] +pub struct EncodedCaptureFrameSource { + source: S, +} + +impl EncodedCaptureFrameSource { + /// Creates a frame-source adapter for an encoded access-unit source. + pub fn new(source: S) -> Self { + Self { source } + } + + /// Returns the underlying encoded source. + pub fn source(&self) -> &S { + &self.source + } + + /// Returns the underlying encoded source mutably. + pub fn source_mut(&mut self) -> &mut S { + &mut self.source + } + + /// Consumes this adapter and returns the underlying encoded source. + pub fn into_inner(self) -> S { + self.source + } +} + +impl CaptureFrameSource for EncodedCaptureFrameSource +where + S: EncodedAccessUnitSource, +{ + type Error = EncodedFrameSourceError; + + fn capture_path(&self) -> CapturePath { + CapturePath::Encoded + } + + fn format(&self) -> Option { + None + } + + fn next_frame(&mut self) -> Result { + let Some(access_unit) = + self.source.next_access_unit().map_err(EncodedFrameSourceError::Source)? + else { + return Err(EncodedFrameSourceError::EndOfStream); + }; + Ok(CaptureFrame::Encoded(access_unit)) + } +} + +/// Error returned by [`EncodedCaptureFrameSource`]. +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum EncodedFrameSourceError { + /// The encoded source reached EOF. + EndOfStream, + /// The encoded source failed. 
+ Source(E), +} + +impl fmt::Display for EncodedFrameSourceError { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Self::EndOfStream => f.write_str("encoded source reached end of stream"), + Self::Source(err) => write!(f, "encoded source failed: {err}"), + } + } +} + +impl Error for EncodedFrameSourceError +where + E: Error + 'static, +{ + fn source(&self) -> Option<&(dyn Error + 'static)> { + match self { + Self::EndOfStream => None, + Self::Source(err) => Some(err), + } + } +} + +/// Error returned by the high-level capture source façade. +#[derive(Debug, Error)] +#[non_exhaustive] +pub enum CaptureSourceError { + /// The requested backend cannot be used by this façade on this target or build. + #[error("capture backend {0} is not supported by VideoCaptureSource on this target or build")] + UnsupportedBackend(CaptureBackend), + /// The backend requires an [`EncodedEndpoint`] in [`CaptureSourceOptions::encoded`]. + #[error("capture backend {0} requires a matching CaptureSourceOptions::encoded endpoint")] + MissingEncodedEndpoint(CaptureBackend), + /// The encoded source reached end of stream. + #[error("capture source reached end of stream")] + EndOfStream, + /// The backend source failed. + #[error("capture backend {backend} failed: {message}")] + Backend { + /// Backend that failed. + backend: CaptureBackend, + /// Backend error message. + message: String, + }, + /// The capture track rejected the frame. + #[error(transparent)] + Capture(#[from] CaptureError), +} + +/// GStreamer pipeline plus the encoded appsink source reading from it. +/// +/// Stops the pipeline when dropped. +#[cfg(feature = "gstreamer")] +#[derive(Debug)] +pub struct GStreamerCaptureSource { + pipeline: ::gstreamer::Pipeline, + source: EncodedCaptureFrameSource, +} + +#[cfg(feature = "gstreamer")] +impl GStreamerCaptureSource { + /// Returns the running pipeline. 
+ pub fn pipeline(&self) -> &::gstreamer::Pipeline { + &self.pipeline + } + + /// Returns the encoded appsink source. + pub fn source_mut( + &mut self, + ) -> &mut EncodedCaptureFrameSource + { + &mut self.source + } +} + +#[cfg(feature = "gstreamer")] +impl Drop for GStreamerCaptureSource { + fn drop(&mut self) { + use ::gstreamer::prelude::ElementExt; + let _ = self.pipeline.set_state(::gstreamer::State::Null); + } +} + +/// High-level capture source façade for the crate's capture backends. +#[derive(Debug)] +#[non_exhaustive] +pub enum VideoCaptureSource { + /// AVFoundation decoded-frame source. + #[cfg(feature = "avfoundation")] + AvFoundation { + /// Underlying capture session. + session: crate::sources::avfoundation::AvFoundationCaptureSession, + /// Prefer CPU-accessible frames over zero-copy native buffers. + prefer_raw_frames: bool, + }, + /// Linux V4L2 decoded-frame source. + #[cfg(feature = "v4l")] + V4l(crate::sources::v4l::V4lCaptureSession), + /// Jetson libargus DMA-BUF source. + #[cfg(feature = "libargus")] + LibArgus(crate::sources::argus::ArgusCaptureSession), + /// RTSP encoded ingest source. + #[cfg(feature = "rtsp")] + Rtsp(EncodedCaptureFrameSource), + /// TCP byte-stream encoded ingest source. + #[cfg(feature = "tcpsink")] + Tcp(EncodedCaptureFrameSource), + /// GStreamer pipeline encoded ingest source. + #[cfg(feature = "gstreamer")] + Gstreamer(GStreamerCaptureSource), +} + +impl VideoCaptureSource { + /// Lists capture devices for a backend. + /// + /// The encoded ingest backends (RTSP/TCP/GStreamer) address network + /// endpoints rather than enumerable devices, so they report + /// [`CaptureDeviceQueryError::UnsupportedBackend`]. 
+ pub fn list_devices( + backend: CaptureBackend, + ) -> Result, CaptureDeviceQueryError> { + match backend { + CaptureBackend::Auto => list_auto_devices(), + CaptureBackend::AvFoundation => list_avfoundation_devices(), + CaptureBackend::V4l2 => list_v4l_devices(), + CaptureBackend::LibArgus => list_argus_devices(), + CaptureBackend::Rtsp | CaptureBackend::Tcp | CaptureBackend::Gstreamer => { + Err(CaptureDeviceQueryError::UnsupportedBackend(backend)) + } + } + } + + /// Opens a capture source. + pub fn open(options: CaptureSourceOptions) -> Result { + match options.backend { + CaptureBackend::Auto => open_auto_source(options), + CaptureBackend::AvFoundation => open_avfoundation_source(options), + CaptureBackend::V4l2 => open_v4l_source(options), + CaptureBackend::LibArgus => open_argus_source(options), + CaptureBackend::Rtsp => open_rtsp_source(options), + CaptureBackend::Tcp => open_tcp_source(options), + CaptureBackend::Gstreamer => open_gstreamer_source(options), + } + } + + /// Returns the capture path produced by this source. + pub fn capture_path(&self) -> CapturePath { + match self { + #[cfg(feature = "avfoundation")] + Self::AvFoundation { session, prefer_raw_frames } => { + if session.native_capture_supported() && !prefer_raw_frames { + CapturePath::Native + } else { + CapturePath::Raw + } + } + #[cfg(feature = "v4l")] + Self::V4l(source) => source.capture_path(), + #[cfg(feature = "libargus")] + Self::LibArgus(source) => source.capture_path(), + #[cfg(feature = "rtsp")] + Self::Rtsp(_) => CapturePath::Encoded, + #[cfg(feature = "tcpsink")] + Self::Tcp(_) => CapturePath::Encoded, + #[cfg(feature = "gstreamer")] + Self::Gstreamer(_) => CapturePath::Encoded, + #[allow(unreachable_patterns)] + _ => unreachable!("VideoCaptureSource has no enabled backend variants"), + } + } + + /// Returns the negotiated capture format when the source has one. 
+ pub fn format(&self) -> Option { + match self { + #[cfg(feature = "avfoundation")] + Self::AvFoundation { session, .. } => Some(session.format()), + #[cfg(feature = "v4l")] + Self::V4l(source) => Some(source.format()), + #[cfg(feature = "libargus")] + Self::LibArgus(source) => Some(source.format()), + #[cfg(feature = "rtsp")] + Self::Rtsp(_) => None, + #[cfg(feature = "tcpsink")] + Self::Tcp(_) => None, + #[cfg(feature = "gstreamer")] + Self::Gstreamer(_) => None, + #[allow(unreachable_patterns)] + _ => unreachable!("VideoCaptureSource has no enabled backend variants"), + } + } + + /// Captures the next frame. + /// + /// The encoded ingest backends return + /// [`CaptureSourceError::EndOfStream`] when the stream terminates + /// normally. + pub fn next_frame(&mut self) -> Result { + match self { + #[cfg(feature = "avfoundation")] + Self::AvFoundation { session, prefer_raw_frames } => { + let frame = if session.native_capture_supported() && !*prefer_raw_frames { + session.capture_native_frame().map(|frame| CaptureFrame::Native(frame.into())) + } else { + session.capture_frame().map(|frame| CaptureFrame::Raw(frame.into())) + }; + frame.map_err(|err| backend_source_error(CaptureBackend::AvFoundation, err)) + } + #[cfg(feature = "v4l")] + Self::V4l(source) => { + source.next_frame().map_err(|err| backend_source_error(CaptureBackend::V4l2, err)) + } + #[cfg(feature = "libargus")] + Self::LibArgus(source) => source + .next_frame() + .map_err(|err| backend_source_error(CaptureBackend::LibArgus, err)), + #[cfg(feature = "rtsp")] + Self::Rtsp(source) => { + source.next_frame().map_err(|err| encoded_source_error(CaptureBackend::Rtsp, err)) + } + #[cfg(feature = "tcpsink")] + Self::Tcp(source) => { + source.next_frame().map_err(|err| encoded_source_error(CaptureBackend::Tcp, err)) + } + #[cfg(feature = "gstreamer")] + Self::Gstreamer(source) => source + .source + .next_frame() + .map_err(|err| encoded_source_error(CaptureBackend::Gstreamer, err)), + 
#[allow(unreachable_patterns)] + _ => unreachable!("VideoCaptureSource has no enabled backend variants"), + } + } + + /// Signals the source to stop, interrupting a blocked + /// [`VideoCaptureSource::next_frame`] where the backend supports it + /// (AVFoundation today); other backends return at the next frame + /// boundary. + pub fn stop(&self) { + match self { + #[cfg(feature = "avfoundation")] + Self::AvFoundation { session, .. } => session.stop(), + #[allow(unreachable_patterns)] + _ => {} + } + } + + /// Forwards a downstream keyframe request to the source's producer. + /// + /// No-op for the decoded camera backends, which have no upstream + /// encoder. + pub fn request_keyframe(&mut self) { + match self { + #[cfg(feature = "rtsp")] + Self::Rtsp(source) => source.source_mut().request_keyframe(), + #[cfg(feature = "tcpsink")] + Self::Tcp(source) => source.source_mut().request_keyframe(), + #[cfg(feature = "gstreamer")] + Self::Gstreamer(source) => source.source.source_mut().request_keyframe(), + #[allow(unreachable_patterns)] + _ => {} + } + } + + /// Captures and publishes the next frame, returning `false` once an + /// encoded source reaches end of stream. + /// + /// Keyframe requests raised by the passthrough encoder are polled from + /// the track and forwarded to the source before each capture. 
+ pub fn publish_next(&mut self, track: &VideoCaptureTrack) -> Result { + if track.take_keyframe_request() { + self.request_keyframe(); + } + let frame = match self.next_frame() { + Ok(frame) => frame, + Err(CaptureSourceError::EndOfStream) => return Ok(false), + Err(err) => return Err(err), + }; + frame.publish_to(track)?; + Ok(true) + } +} + +#[cfg(feature = "avfoundation")] +impl CaptureFrameSource for crate::sources::avfoundation::AvFoundationCaptureSession { + type Error = crate::sources::avfoundation::AvFoundationError; + + fn capture_path(&self) -> CapturePath { + self.capture_path() + } + + fn format(&self) -> Option { + Some(self.format()) + } + + fn next_frame(&mut self) -> Result { + if self.native_capture_supported() { + self.capture_native_frame().map(|frame| CaptureFrame::Native(frame.into())) + } else { + self.capture_frame().map(|frame| CaptureFrame::Raw(frame.into())) + } + } +} + +#[cfg(feature = "avfoundation")] +impl From for NativeVideoFrame { + fn from(frame: crate::sources::avfoundation::AvFoundationNativeFrame) -> Self { + Self { + frame: frame.frame, + source_format: frame.source_format, + capture_wall_time_us: frame.capture_wall_time_us, + read_wall_time_us: frame.read_wall_time_us, + sensor_timestamp_us: frame.sensor_timestamp_us, + } + } +} + +#[cfg(feature = "avfoundation")] +impl From for RawVideoFrame { + fn from(frame: crate::sources::avfoundation::AvFoundationFrame) -> Self { + Self { + frame: frame.frame, + source_format: frame.source_format, + capture_wall_time_us: frame.capture_wall_time_us, + read_wall_time_us: frame.read_wall_time_us, + sensor_timestamp_us: frame.sensor_timestamp_us, + used_conversion: frame.used_conversion, + } + } +} + +#[cfg(feature = "v4l")] +impl CaptureFrameSource for crate::sources::v4l::V4lCaptureSession { + type Error = crate::sources::v4l::V4lError; + + fn capture_path(&self) -> CapturePath { + self.capture_path() + } + + fn format(&self) -> Option { + Some(self.format()) + } + + fn next_frame(&mut 
self) -> Result { + self.capture_frame().map(|frame| CaptureFrame::Raw(frame.into())) + } +} + +#[cfg(feature = "v4l")] +impl From for RawVideoFrame { + fn from(frame: crate::sources::v4l::V4lFrame) -> Self { + Self { + used_conversion: frame.used_conversion, + frame: frame.frame, + source_format: frame.source_format, + capture_wall_time_us: frame.capture_wall_time_us, + read_wall_time_us: frame.read_wall_time_us, + sensor_timestamp_us: frame.sensor_timestamp_us, + } + } +} + +#[cfg(feature = "libargus")] +impl CaptureFrameSource for crate::sources::argus::ArgusCaptureSession { + type Error = crate::sources::argus::ArgusError; + + fn capture_path(&self) -> CapturePath { + self.capture_path() + } + + fn format(&self) -> Option { + Some(self.format()) + } + + fn next_frame(&mut self) -> Result { + self.capture_frame().map(|frame| CaptureFrame::DmaBuf(frame.dmabuf)) + } +} + +#[allow(dead_code)] +fn backend_source_error( + backend: CaptureBackend, + error: impl Error + Send + Sync + 'static, +) -> CaptureSourceError { + CaptureSourceError::Backend { backend, message: error.to_string() } +} + +#[allow(dead_code)] +fn backend_query_error( + backend: CaptureBackend, + error: impl Error + Send + Sync + 'static, +) -> CaptureDeviceQueryError { + CaptureDeviceQueryError::Backend { backend, message: error.to_string() } +} + +#[allow(dead_code)] +fn encoded_source_error( + backend: CaptureBackend, + error: EncodedFrameSourceError, +) -> CaptureSourceError { + match error { + EncodedFrameSourceError::EndOfStream => CaptureSourceError::EndOfStream, + EncodedFrameSourceError::Source(err) => backend_source_error(backend, err), + } +} + +#[cfg(feature = "rtsp")] +fn open_rtsp_source( + options: CaptureSourceOptions, +) -> Result { + let Some(EncodedEndpoint::Rtsp { url, options: rtsp_options }) = options.encoded else { + return Err(CaptureSourceError::MissingEncodedEndpoint(CaptureBackend::Rtsp)); + }; + let source = crate::sources::rtsp::RtspEncodedSource::connect(&url, 
rtsp_options) + .map_err(|err| backend_source_error(CaptureBackend::Rtsp, err))?; + Ok(VideoCaptureSource::Rtsp(EncodedCaptureFrameSource::new(source))) +} + +#[cfg(not(feature = "rtsp"))] +fn open_rtsp_source( + _options: CaptureSourceOptions, +) -> Result { + Err(CaptureSourceError::UnsupportedBackend(CaptureBackend::Rtsp)) +} + +#[cfg(feature = "tcpsink")] +fn open_tcp_source( + options: CaptureSourceOptions, +) -> Result { + let Some(EncodedEndpoint::TcpConnect { address, config }) = options.encoded else { + return Err(CaptureSourceError::MissingEncodedEndpoint(CaptureBackend::Tcp)); + }; + let source = crate::sources::tcp::TcpEncodedSource::connect(address.as_str(), config) + .map_err(|err| backend_source_error(CaptureBackend::Tcp, err))?; + Ok(VideoCaptureSource::Tcp(EncodedCaptureFrameSource::new(source))) +} + +#[cfg(not(feature = "tcpsink"))] +fn open_tcp_source( + _options: CaptureSourceOptions, +) -> Result { + Err(CaptureSourceError::UnsupportedBackend(CaptureBackend::Tcp)) +} + +#[cfg(feature = "gstreamer")] +fn open_gstreamer_source( + options: CaptureSourceOptions, +) -> Result { + use ::gstreamer::prelude::*; + + let Some(EncodedEndpoint::GstreamerLaunch { launch, codec, mut config }) = options.encoded + else { + return Err(CaptureSourceError::MissingEncodedEndpoint(CaptureBackend::Gstreamer)); + }; + + let gst_error = |err: &dyn std::fmt::Display| CaptureSourceError::Backend { + backend: CaptureBackend::Gstreamer, + message: err.to_string(), + }; + + ::gstreamer::init().map_err(|err| gst_error(&err))?; + let pipeline = ::gstreamer::parse::launch(&launch) + .map_err(|err| gst_error(&err))? 
+ .downcast::<::gstreamer::Pipeline>() + .map_err(|element| CaptureSourceError::Backend { + backend: CaptureBackend::Gstreamer, + message: format!( + "launch description did not produce a pipeline (got {})", + element.name() + ), + })?; + let (appsink, sample_format) = + crate::sources::gstreamer::ensure_encoded_appsink(&pipeline, codec) + .map_err(|err| gst_error(&err))?; + config.sample_format = sample_format; + pipeline.set_state(::gstreamer::State::Playing).map_err(|err| gst_error(&err))?; + + let source = crate::sources::gstreamer::GStreamerAppSinkEncodedSource::new(appsink, config); + Ok(VideoCaptureSource::Gstreamer(GStreamerCaptureSource { + pipeline, + source: EncodedCaptureFrameSource::new(source), + })) +} + +#[cfg(not(feature = "gstreamer"))] +fn open_gstreamer_source( + _options: CaptureSourceOptions, +) -> Result { + Err(CaptureSourceError::UnsupportedBackend(CaptureBackend::Gstreamer)) +} + +fn list_auto_devices() -> Result, CaptureDeviceQueryError> { + #[cfg(all(target_os = "macos", feature = "avfoundation"))] + { + return list_avfoundation_devices(); + } + #[cfg(all(target_os = "linux", feature = "v4l"))] + { + return list_v4l_devices(); + } + #[allow(unreachable_code)] + Err(CaptureDeviceQueryError::UnsupportedBackend(CaptureBackend::Auto)) +} + +fn open_auto_source( + options: CaptureSourceOptions, +) -> Result { + let _ = &options; + #[cfg(all(target_os = "macos", feature = "avfoundation"))] + { + let mut options = options; + options.backend = CaptureBackend::AvFoundation; + return open_avfoundation_source(options); + } + #[cfg(all(target_os = "linux", feature = "v4l"))] + { + let mut options = options; + options.backend = CaptureBackend::V4l2; + return open_v4l_source(options); + } + #[allow(unreachable_code)] + Err(CaptureSourceError::UnsupportedBackend(CaptureBackend::Auto)) +} + +#[cfg(feature = "avfoundation")] +fn list_avfoundation_devices() -> Result, CaptureDeviceQueryError> { + crate::sources::avfoundation::devices().map_err(|err| match 
err { + crate::sources::avfoundation::AvFoundationError::UnsupportedPlatform => { + CaptureDeviceQueryError::UnsupportedBackend(CaptureBackend::AvFoundation) + } + other => backend_query_error(CaptureBackend::AvFoundation, other), + }) +} + +#[cfg(not(feature = "avfoundation"))] +fn list_avfoundation_devices() -> Result, CaptureDeviceQueryError> { + Err(CaptureDeviceQueryError::UnsupportedBackend(CaptureBackend::AvFoundation)) +} + +#[cfg(feature = "avfoundation")] +fn open_avfoundation_source( + options: CaptureSourceOptions, +) -> Result { + let prefer_raw_frames = options.prefer_raw_frames; + let source = crate::sources::avfoundation::AvFoundationCaptureSession::new(options.into()) + .map_err(|err| match err { + crate::sources::avfoundation::AvFoundationError::UnsupportedPlatform => { + CaptureSourceError::UnsupportedBackend(CaptureBackend::AvFoundation) + } + other => backend_source_error(CaptureBackend::AvFoundation, other), + })?; + Ok(VideoCaptureSource::AvFoundation { session: source, prefer_raw_frames }) +} + +#[cfg(not(feature = "avfoundation"))] +fn open_avfoundation_source( + _options: CaptureSourceOptions, +) -> Result { + Err(CaptureSourceError::UnsupportedBackend(CaptureBackend::AvFoundation)) +} + +#[cfg(feature = "avfoundation")] +impl From for crate::sources::avfoundation::AvFoundationCaptureOptions { + fn from(options: CaptureSourceOptions) -> Self { + Self { + device: options.device, + format: options.format, + is_screencast: options.is_screencast, + } + } +} + +#[cfg(feature = "v4l")] +fn list_v4l_devices() -> Result, CaptureDeviceQueryError> { + crate::sources::v4l::devices().map_err(|err| match err { + crate::sources::v4l::V4lError::UnsupportedPlatform => { + CaptureDeviceQueryError::UnsupportedBackend(CaptureBackend::V4l2) + } + other => backend_query_error(CaptureBackend::V4l2, other), + }) +} + +#[cfg(not(feature = "v4l"))] +fn list_v4l_devices() -> Result, CaptureDeviceQueryError> { + 
Err(CaptureDeviceQueryError::UnsupportedBackend(CaptureBackend::V4l2)) +} + +#[cfg(feature = "v4l")] +fn open_v4l_source( + options: CaptureSourceOptions, +) -> Result { + let source = + crate::sources::v4l::V4lCaptureSession::new(options.into()).map_err(|err| match err { + crate::sources::v4l::V4lError::UnsupportedPlatform => { + CaptureSourceError::UnsupportedBackend(CaptureBackend::V4l2) + } + other => backend_source_error(CaptureBackend::V4l2, other), + })?; + Ok(VideoCaptureSource::V4l(source)) +} + +#[cfg(not(feature = "v4l"))] +fn open_v4l_source( + _options: CaptureSourceOptions, +) -> Result { + Err(CaptureSourceError::UnsupportedBackend(CaptureBackend::V4l2)) +} + +#[cfg(feature = "v4l")] +impl From for crate::sources::v4l::V4lCaptureOptions { + fn from(options: CaptureSourceOptions) -> Self { + let mut source_options = Self { + device: options.device, + format: options.format, + frame_formats: crate::sources::v4l::default_frame_formats(), + }; + if let CaptureFormatRequest::Exact(format) | CaptureFormatRequest::Closest(format) = + source_options.format + { + source_options.frame_formats = + crate::sources::v4l::ordered_frame_formats_with_first(format.frame_format); + } + source_options + } +} + +#[cfg(feature = "libargus")] +fn list_argus_devices() -> Result, CaptureDeviceQueryError> { + crate::sources::argus::devices().map_err(|err| match err { + crate::sources::argus::ArgusError::Unsupported => { + CaptureDeviceQueryError::UnsupportedBackend(CaptureBackend::LibArgus) + } + other => backend_query_error(CaptureBackend::LibArgus, other), + }) +} + +#[cfg(not(feature = "libargus"))] +fn list_argus_devices() -> Result, CaptureDeviceQueryError> { + Err(CaptureDeviceQueryError::UnsupportedBackend(CaptureBackend::LibArgus)) +} + +#[cfg(feature = "libargus")] +fn open_argus_source( + options: CaptureSourceOptions, +) -> Result { + let source = + crate::sources::argus::ArgusCaptureSession::new(options.try_into()?).map_err(|err| { + match err { + 
crate::sources::argus::ArgusError::Unsupported => { + CaptureSourceError::UnsupportedBackend(CaptureBackend::LibArgus) + } + other => backend_source_error(CaptureBackend::LibArgus, other), + } + })?; + Ok(VideoCaptureSource::LibArgus(source)) +} + +#[cfg(not(feature = "libargus"))] +fn open_argus_source( + _options: CaptureSourceOptions, +) -> Result { + Err(CaptureSourceError::UnsupportedBackend(CaptureBackend::LibArgus)) +} + +#[cfg(feature = "libargus")] +impl TryFrom for crate::sources::argus::ArgusCaptureOptions { + type Error = CaptureSourceError; + + fn try_from(options: CaptureSourceOptions) -> Result { + let sensor_index = match options.device { + CaptureDeviceSelector::Default => 0, + CaptureDeviceSelector::Index(index) => { + u32::try_from(index).map_err(|_| CaptureSourceError::Backend { + backend: CaptureBackend::LibArgus, + message: "device index is out of range".to_string(), + })? + } + CaptureDeviceSelector::Id(_) => { + return Err(CaptureSourceError::Backend { + backend: CaptureBackend::LibArgus, + message: "libargus does not support string device selectors".to_string(), + }); + } + }; + let format = match options.format { + CaptureFormatRequest::Exact(format) | CaptureFormatRequest::Closest(format) => format, + CaptureFormatRequest::Default => { + crate::sources::argus::ArgusCaptureOptions::default().format + } + CaptureFormatRequest::HighestFrameRate { .. } + | CaptureFormatRequest::HighestResolution { .. 
} => { + return Err(CaptureSourceError::Backend { + backend: CaptureBackend::LibArgus, + message: "libargus requires an exact or closest format".to_string(), + }); + } + }; + Ok(Self { sensor_index, format }) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::dmabuf::{DmaBufPixelFormat, DmaBufPlane}; + use crate::encoded::{EncodedFrameType, EncodedVideoCodec}; + use livekit::webrtc::video_frame::VideoRotation; + + #[derive(Debug, Error)] + #[error("fake source failed")] + struct FakeSourceError; + + #[derive(Debug)] + struct FakeEncodedSource { + next: Option, + } + + impl EncodedAccessUnitSource for FakeEncodedSource { + type Error = FakeSourceError; + + fn next_access_unit(&mut self) -> Result, Self::Error> { + Ok(self.next.take()) + } + } + + #[test] + fn encoded_source_adapts_to_capture_frame_source() { + let access_unit = OwnedEncodedAccessUnit::new( + EncodedVideoCodec::H264, + vec![0, 0, 0, 1, 0x65], + 10, + EncodedFrameType::Key, + 640, + 480, + ); + let mut source = + EncodedCaptureFrameSource::new(FakeEncodedSource { next: Some(access_unit.clone()) }); + + assert_eq!(source.capture_path(), CapturePath::Encoded); + let frame = source.next_frame().expect("encoded frame should be returned"); + assert_eq!(frame.capture_path(), CapturePath::Encoded); + let CaptureFrame::Encoded(returned) = frame else { + panic!("expected encoded frame"); + }; + assert_eq!(returned, access_unit); + } + + #[test] + fn encoded_source_reports_end_of_stream() { + let mut source = EncodedCaptureFrameSource::new(FakeEncodedSource { next: None }); + let err = source.next_frame().expect_err("EOF should be reported"); + assert!(matches!(err, EncodedFrameSourceError::EndOfStream)); + } + + #[test] + fn capture_frame_reports_common_paths() { + let raw = CaptureFrame::Raw(RawVideoFrame { + frame: VideoFrame { + rotation: VideoRotation::VideoRotation0, + timestamp_us: 0, + frame_metadata: None, + buffer: I420Buffer::new(2, 2), + }, + source_format: CaptureFrameFormat::I420, + 
capture_wall_time_us: 1, + read_wall_time_us: 2, + sensor_timestamp_us: None, + used_conversion: false, + }); + assert_eq!(raw.capture_path(), CapturePath::Raw); + + let dmabuf = CaptureFrame::DmaBuf(DmaBufFrame { + width: 2, + height: 2, + pixel_format: DmaBufPixelFormat::Nv12, + planes: vec![DmaBufPlane { fd: -1, offset: 0, stride: 2 }], + modifier: None, + timestamp_us: 0, + sensor_timestamp_us: None, + }); + assert_eq!(dmabuf.capture_path(), CapturePath::DmaBuf); + + let encoded = CaptureFrame::Encoded(OwnedEncodedAccessUnit::new( + EncodedVideoCodec::H264, + vec![0, 0, 0, 1, 0x65], + 0, + EncodedFrameType::Key, + 2, + 2, + )); + assert_eq!(encoded.capture_path(), CapturePath::Encoded); + } + + #[cfg(feature = "avfoundation")] + #[test] + fn avfoundation_canonical_import_compiles() { + let _ = std::any::TypeId::of::(); + } + + #[cfg(feature = "v4l")] + #[test] + fn v4l_canonical_import_compiles() { + let _ = std::any::TypeId::of::(); + } +} diff --git a/livekit-capture/src/sources/argus.rs b/livekit-capture/src/sources/argus.rs new file mode 100644 index 000000000..28363a92f --- /dev/null +++ b/livekit-capture/src/sources/argus.rs @@ -0,0 +1,526 @@ +// Copyright 2026 LiveKit, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! NVIDIA Argus/libargus capture for Jetson MIPI CSI cameras. 
+ +use livekit::webrtc::video_frame::{I420Buffer, VideoFrame, VideoRotation}; +use thiserror::Error; + +#[cfg(livekit_capture_argus)] +use crate::device::{CaptureBackend, CaptureDeviceSelector}; +use crate::{ + device::{ + CaptureDeviceInfo, CaptureFormat, CaptureFrameFormat, CapturePath, CaptureResolution, + }, + dmabuf::DmaBufFrame, +}; + +#[cfg(livekit_capture_argus)] +use crate::dmabuf::{DmaBufPixelFormat, DmaBufPlane}; +#[cfg(livekit_capture_argus)] +use crate::time::{elapsed_us, unix_time_us_now}; +#[cfg(livekit_capture_argus)] +use std::time::Instant; +#[cfg(livekit_capture_argus)] +use std::{ffi::c_int, ffi::c_void}; + +#[cfg(livekit_capture_argus)] +extern "C" { + fn lk_argus_create_session( + sensor_index: c_int, + width: c_int, + height: c_int, + fps: c_int, + ) -> *mut c_void; + + fn lk_argus_destroy_session(session: *mut c_void); + + fn lk_argus_acquire_frame_with_metadata( + session: *mut c_void, + sensor_timestamp_ns: *mut u64, + acquire_wait_ns: *mut u64, + blit_ns: *mut u64, + ) -> c_int; + + fn lk_argus_copy_frame_to_i420( + session: *mut c_void, + dmabuf_fd: c_int, + dst_y: *mut u8, + dst_stride_y: c_int, + dst_u: *mut u8, + dst_stride_u: c_int, + dst_v: *mut u8, + dst_stride_v: c_int, + copy_to_i420_ns: *mut u64, + ) -> c_int; + + fn lk_argus_release_frame(session: *mut c_void); +} + +/// Options used to open a Jetson Argus capture session. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct ArgusCaptureOptions { + /// MIPI CSI sensor index. + pub sensor_index: u32, + /// Requested capture format. + pub format: CaptureFormat, +} + +impl ArgusCaptureOptions { + /// Creates options for NV12 DMA-BUF capture from a Jetson MIPI CSI sensor. 
+ pub const fn new(sensor_index: u32, resolution: CaptureResolution, frame_rate: u32) -> Self { + Self { + sensor_index, + format: CaptureFormat::new(resolution, frame_rate, CaptureFrameFormat::Nv12), + } + } +} + +impl Default for ArgusCaptureOptions { + fn default() -> Self { + Self::new(0, CaptureResolution::new(1280, 720), 30) + } +} + +/// Error returned by the Argus capture backend. +#[derive(Debug, Error, PartialEq, Eq)] +pub enum ArgusError { + /// Argus capture is not available for this target or build. + #[error("libargus capture is not available on this target or build")] + Unsupported, + /// Argus only publishes NV12 DMA-BUF frames in this backend. + #[error("libargus capture only supports NV12 DMA-BUF frames, got {0:?}")] + UnsupportedFrameFormat(CaptureFrameFormat), + /// The requested format contains an invalid value. + #[error("invalid Argus capture option: {0}")] + InvalidOption(&'static str), + /// A numeric option could not be represented by the C shim. + #[error("Argus capture option is out of range for the C shim: {0}")] + OptionOutOfRange(&'static str), + /// The C shim failed to create an Argus capture session. + #[error("failed to create Argus capture session")] + CreateSessionFailed, + /// The C shim failed to acquire a frame. + #[error("Argus frame acquisition failed")] + AcquireFrameFailed, + /// The captured DMA-BUF frame did not include a plane descriptor. + #[error("Argus frame did not include a DMA-BUF plane")] + MissingDmaBufPlane, + /// The C shim failed to copy the captured frame to I420. + #[error("failed to copy Argus frame to I420: {0}")] + CopyToI420Failed(ArgusI420CopyError), +} + +/// Error returned while copying an Argus DMA-BUF frame to CPU I420. +#[derive(Debug, Clone, Copy, Error, PartialEq, Eq)] +pub enum ArgusI420CopyError { + /// The C shim received invalid arguments. + #[error("invalid argument")] + InvalidArgument, + /// The DMA-BUF fd was not found in the active Argus buffer ring. 
+ #[error("DMA-BUF surface not found")] + SurfaceNotFound, + /// Mapping the DMA-BUF surface for CPU readback failed. + #[error("failed to map DMA-BUF surface for CPU readback: {0}")] + MapFailed(i32), + /// Synchronizing the DMA-BUF surface for CPU readback failed. + #[error("failed to synchronize DMA-BUF surface for CPU readback: {0}")] + SyncForCpuFailed(i32), + /// The mapped surface did not expose valid NV12 planes. + #[error("invalid mapped NV12 surface")] + InvalidSurface, + /// Unmapping the DMA-BUF surface failed. + #[error("failed to unmap DMA-BUF surface: {0}")] + UnmapFailed(i32), + /// The C shim returned an unknown error code. + #[error("unknown error code {0}")] + Unknown(i32), +} + +#[cfg(livekit_capture_argus)] +impl ArgusI420CopyError { + fn from_status(status: i32) -> Self { + match status { + -1 => Self::InvalidArgument, + -2 => Self::SurfaceNotFound, + -4 => Self::InvalidSurface, + code if code <= -2000 => Self::SyncForCpuFailed(-2000 - code), + code if code <= -1000 => Self::MapFailed(-1000 - code), + code if code <= -100 => Self::UnmapFailed(-100 - code), + code => Self::Unknown(code), + } + } +} + +/// One Argus frame backed by an NV12 DMA-BUF. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct ArgusFrame { + /// DMA-BUF frame suitable for [`crate::VideoCaptureTrack::capture_dmabuf`]. + pub dmabuf: DmaBufFrame, + /// Argus sensor start timestamp in nanoseconds, when available. + pub sensor_timestamp_ns: Option, + /// Argus sensor start timestamp translated to UNIX-epoch microseconds, when available. + pub sensor_timestamp_us: Option, + /// Time spent waiting for `FrameConsumer::acquireFrame` to return. + pub acquire_wait_ns: u64, + /// Time spent copying the acquired EGLStream frame into the DMA buffer. + pub blit_ns: u64, +} + +impl ArgusFrame { + /// Returns the DMA-BUF frame descriptor. + pub fn dmabuf_frame(&self) -> &DmaBufFrame { + &self.dmabuf + } +} + +/// One Argus frame copied to CPU-accessible I420. 
+#[derive(Debug)] +pub struct ArgusI420Frame { + /// I420 frame suitable for timestamp burning or other CPU-side mutation. + pub frame: VideoFrame, + /// Original Argus DMA-BUF frame descriptor. + pub dmabuf: ArgusFrame, + /// Time spent copying NV12 DMA-BUF data into the I420 frame. + pub copy_to_i420_ns: u64, +} + +/// Jetson Argus capture session that emits NV12 DMA-BUF frames. +#[derive(Debug)] +pub struct ArgusCaptureSession { + #[cfg(livekit_capture_argus)] + handle: *mut c_void, + options: ArgusCaptureOptions, + #[cfg(livekit_capture_argus)] + started_at: Instant, +} + +// SAFETY: The C++ Argus session is driven by one mutable Rust owner at a time. +unsafe impl Send for ArgusCaptureSession {} + +impl ArgusCaptureSession { + /// Opens an Argus capture session. + pub fn new(options: ArgusCaptureOptions) -> Result { + validate_options(&options)?; + Self::open(options) + } + + /// Captures the next frame as an NV12 DMA-BUF. + /// + /// The returned DMA-BUF file descriptor is owned by the Argus session's + /// internal buffer ring. It remains valid until the session is dropped, but + /// callers should publish frames promptly so the ring can be reused. + pub fn capture_frame(&mut self) -> Result { + self.acquire_frame_inner() + } + + /// Captures the next frame and copies it to CPU-accessible I420. + /// + /// This intentionally maps the DMA-BUF for CPU readback and should be used + /// only when the caller needs to mutate pixels before publishing. + pub fn capture_i420_frame(&mut self) -> Result { + let dmabuf = self.capture_frame()?; + let mut frame = VideoFrame { + rotation: VideoRotation::VideoRotation0, + timestamp_us: dmabuf.dmabuf.timestamp_us, + frame_metadata: None, + buffer: I420Buffer::new(dmabuf.dmabuf.width, dmabuf.dmabuf.height), + }; + let copy_to_i420_ns = self.copy_frame_to_i420(&dmabuf.dmabuf, &mut frame.buffer)?; + Ok(ArgusI420Frame { frame, dmabuf, copy_to_i420_ns }) + } + + /// Acquires the next captured frame as an NV12 DMA-BUF. 
+ #[deprecated(note = "use capture_frame")] + pub fn acquire_frame(&mut self) -> Result { + self.capture_frame() + } + + /// Releases the currently held Argus frame, when one is held by the shim. + pub fn release_frame(&mut self) { + self.release_frame_inner(); + } + + /// Returns the configured frame width. + pub fn width(&self) -> u32 { + self.options.format.resolution.width + } + + /// Returns the configured frame height. + pub fn height(&self) -> u32 { + self.options.format.resolution.height + } + + /// Returns the requested capture format. + pub fn format(&self) -> CaptureFormat { + self.options.format + } + + /// Returns the configured capture options. + pub fn options(&self) -> &ArgusCaptureOptions { + &self.options + } + + /// Returns the capture path produced by this session. + pub fn capture_path(&self) -> CapturePath { + CapturePath::DmaBuf + } + + #[cfg(livekit_capture_argus)] + fn open(options: ArgusCaptureOptions) -> Result { + let sensor_index = c_int_from_u32(options.sensor_index, "sensor_index")?; + let width = c_int_from_u32(options.format.resolution.width, "width")?; + let height = c_int_from_u32(options.format.resolution.height, "height")?; + let frame_rate = c_int_from_u32(options.format.frame_rate, "frame_rate")?; + + let handle = unsafe { + // SAFETY: The C shim expects plain integer values and returns either + // a valid opaque session pointer or null on failure. 
+ lk_argus_create_session(sensor_index, width, height, frame_rate) + }; + if handle.is_null() { + return Err(ArgusError::CreateSessionFailed); + } + + Ok(Self { handle, options, started_at: Instant::now() }) + } + + #[cfg(not(livekit_capture_argus))] + fn open(_options: ArgusCaptureOptions) -> Result { + Err(ArgusError::Unsupported) + } + + #[cfg(livekit_capture_argus)] + fn acquire_frame_inner(&mut self) -> Result { + let mut sensor_timestamp_ns = 0; + let mut acquire_wait_ns = 0; + let mut blit_ns = 0; + let fd = unsafe { + // SAFETY: `self.handle` is created by `lk_argus_create_session` and + // remains valid until `Drop`; the out-pointers are valid for the call. + lk_argus_acquire_frame_with_metadata( + self.handle, + &mut sensor_timestamp_ns, + &mut acquire_wait_ns, + &mut blit_ns, + ) + }; + if fd < 0 { + return Err(ArgusError::AcquireFrameFailed); + } + + let sensor_timestamp_ns = (sensor_timestamp_ns > 0).then_some(sensor_timestamp_ns); + let sensor_timestamp_us = sensor_timestamp_ns.and_then(sensor_wall_time_us); + let resolution = self.options.format.resolution; + let dmabuf = DmaBufFrame { + width: resolution.width, + height: resolution.height, + pixel_format: DmaBufPixelFormat::Nv12, + planes: vec![DmaBufPlane { fd, offset: 0, stride: resolution.width }], + modifier: None, + timestamp_us: elapsed_us(self.started_at.elapsed()), + sensor_timestamp_us, + }; + + Ok(ArgusFrame { + dmabuf, + sensor_timestamp_ns, + sensor_timestamp_us, + acquire_wait_ns, + blit_ns, + }) + } + + #[cfg(not(livekit_capture_argus))] + fn acquire_frame_inner(&mut self) -> Result { + Err(ArgusError::Unsupported) + } + + #[cfg(livekit_capture_argus)] + fn copy_frame_to_i420( + &self, + dmabuf: &DmaBufFrame, + destination: &mut I420Buffer, + ) -> Result { + let plane = dmabuf.planes.first().ok_or(ArgusError::MissingDmaBufPlane)?; + let (stride_y, stride_u, stride_v) = destination.strides(); + let (dst_y, dst_u, dst_v) = destination.data_mut(); + let mut copy_to_i420_ns = 0; + let 
status = unsafe { + // SAFETY: `self.handle` owns the Argus session; destination slices + // come from a mutable I420 buffer and remain valid for this call. + lk_argus_copy_frame_to_i420( + self.handle, + plane.fd, + dst_y.as_mut_ptr(), + c_int_from_u32(stride_y, "stride_y")?, + dst_u.as_mut_ptr(), + c_int_from_u32(stride_u, "stride_u")?, + dst_v.as_mut_ptr(), + c_int_from_u32(stride_v, "stride_v")?, + &mut copy_to_i420_ns, + ) + }; + if status == 0 { + Ok(copy_to_i420_ns) + } else { + Err(ArgusError::CopyToI420Failed(ArgusI420CopyError::from_status(status))) + } + } + + #[cfg(not(livekit_capture_argus))] + fn copy_frame_to_i420( + &self, + _dmabuf: &DmaBufFrame, + _destination: &mut I420Buffer, + ) -> Result { + Err(ArgusError::Unsupported) + } + + #[cfg(livekit_capture_argus)] + fn release_frame_inner(&mut self) { + unsafe { + // SAFETY: `self.handle` is owned by this session and valid until `Drop`. + lk_argus_release_frame(self.handle); + } + } + + #[cfg(not(livekit_capture_argus))] + fn release_frame_inner(&mut self) {} +} + +impl Drop for ArgusCaptureSession { + fn drop(&mut self) { + #[cfg(livekit_capture_argus)] + if !self.handle.is_null() { + unsafe { + // SAFETY: `self.handle` is owned by this session and is destroyed once here. + lk_argus_destroy_session(self.handle); + } + self.handle = std::ptr::null_mut(); + } + } +} + +fn validate_options(options: &ArgusCaptureOptions) -> Result<(), ArgusError> { + if options.format.frame_format != CaptureFrameFormat::Nv12 { + return Err(ArgusError::UnsupportedFrameFormat(options.format.frame_format)); + } + if options.format.resolution.width == 0 { + return Err(ArgusError::InvalidOption("width must be non-zero")); + } + if options.format.resolution.height == 0 { + return Err(ArgusError::InvalidOption("height must be non-zero")); + } + if options.format.frame_rate == 0 { + return Err(ArgusError::InvalidOption("frame_rate must be non-zero")); + } + Ok(()) +} + +/// Returns Jetson Argus capture devices. 
+pub fn devices() -> Result, ArgusError> { + #[cfg(livekit_capture_argus)] + { + return Ok(vec![CaptureDeviceInfo { + backend: CaptureBackend::LibArgus, + id: "0".to_string(), + selector: CaptureDeviceSelector::Index(0), + name: "Jetson Argus sensor 0".to_string(), + model_id: None, + manufacturer: Some("NVIDIA".to_string()), + paths: vec![CapturePath::DmaBuf], + formats: vec![ArgusCaptureOptions::default().format], + formats_complete: false, + }]); + } + #[cfg(not(livekit_capture_argus))] + { + Err(ArgusError::Unsupported) + } +} + +#[cfg(livekit_capture_argus)] +fn c_int_from_u32(value: u32, field: &'static str) -> Result { + c_int::try_from(value).map_err(|_| ArgusError::OptionOutOfRange(field)) +} + +#[cfg(livekit_capture_argus)] +fn sensor_wall_time_us(sensor_timestamp_ns: u64) -> Option { + let wall_time_us = unix_time_us_now()?; + sensor_monotonic_ns_to_unix_us(sensor_timestamp_ns, wall_time_us) +} + +/// Converts an Argus `CLOCK_MONOTONIC` timestamp into a UNIX-epoch microsecond timestamp. +pub fn sensor_monotonic_ns_to_unix_us(sensor_timestamp_ns: u64, wall_time_us: u64) -> Option { + let monotonic_now_ns = monotonic_time_ns_now()?; + let monotonic_delta_us = monotonic_now_ns.abs_diff(sensor_timestamp_ns) / 1_000; + if sensor_timestamp_ns <= monotonic_now_ns { + Some(wall_time_us.saturating_sub(monotonic_delta_us)) + } else { + Some(wall_time_us.saturating_add(monotonic_delta_us)) + } +} + +#[cfg(target_os = "linux")] +fn monotonic_time_ns_now() -> Option { + #[repr(C)] + struct Timespec { + tv_sec: i64, + tv_nsec: i64, + } + + extern "C" { + fn clock_gettime(clk_id: i32, tp: *mut Timespec) -> i32; + } + + const CLOCK_MONOTONIC: i32 = 1; + let mut ts = Timespec { tv_sec: 0, tv_nsec: 0 }; + let ret = unsafe { + // SAFETY: `ts` is a valid writable `Timespec` for the duration of the call. 
+ clock_gettime(CLOCK_MONOTONIC, &mut ts) + }; + if ret != 0 || ts.tv_sec < 0 || ts.tv_nsec < 0 { + return None; + } + + let seconds = u64::try_from(ts.tv_sec).ok()?; + let nanos = u64::try_from(ts.tv_nsec).ok()?; + seconds.checked_mul(1_000_000_000)?.checked_add(nanos) +} + +#[cfg(not(target_os = "linux"))] +fn monotonic_time_ns_now() -> Option { + None +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn validates_nv12_only() { + let mut options = ArgusCaptureOptions::default(); + options.format.frame_format = CaptureFrameFormat::I420; + let err = ArgusCaptureSession::new(options).expect_err("I420 must be rejected"); + assert_eq!(err, ArgusError::UnsupportedFrameFormat(CaptureFrameFormat::I420)); + } + + #[test] + fn validates_non_zero_frame_rate() { + let options = ArgusCaptureOptions::new(0, CaptureResolution::new(1280, 720), 0); + let err = ArgusCaptureSession::new(options).expect_err("zero frame rate must be rejected"); + assert_eq!(err, ArgusError::InvalidOption("frame_rate must be non-zero")); + } +} diff --git a/livekit-capture/src/sources/avfoundation.rs b/livekit-capture/src/sources/avfoundation.rs new file mode 100644 index 000000000..1519014f5 --- /dev/null +++ b/livekit-capture/src/sources/avfoundation.rs @@ -0,0 +1,1975 @@ +// Copyright 2026 LiveKit, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::sync::{ + atomic::{AtomicBool, Ordering}, + Arc, +}; +use std::thread::JoinHandle; + +use livekit::webrtc::video_frame::{native::NativeBuffer, I420Buffer, VideoBuffer, VideoFrame}; +use thiserror::Error; + +use crate::{ + device::{ + CaptureBackend, CaptureDeviceInfo, CaptureDeviceSelector, CaptureFormat, + CaptureFormatRequest, CaptureFrameFormat, CapturePath, CaptureResolution, + }, + error::CaptureError, + track::VideoCaptureTrack, +}; + +#[cfg(target_os = "macos")] +const FIRST_FRAME_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(5); + +/// Options used to create an AVFoundation capture session. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct AvFoundationCaptureOptions { + /// Device to use for capture. + pub device: CaptureDeviceSelector, + /// Format requested from the device. + pub format: CaptureFormatRequest, + /// Whether the resulting track should be marked as a screencast. + pub is_screencast: bool, +} + +impl Default for AvFoundationCaptureOptions { + fn default() -> Self { + Self { + device: CaptureDeviceSelector::Default, + format: CaptureFormatRequest::Default, + is_screencast: false, + } + } +} + +/// One AVFoundation frame converted to I420. +#[derive(Debug)] +pub struct AvFoundationFrame { + /// Decoded I420 frame suitable for [`crate::VideoCaptureTrack::capture_frame`]. + pub frame: VideoFrame, + /// Source frame format delivered by AVFoundation. + pub source_format: CaptureFrameFormat, + /// Wall-clock timestamp selected for metadata and timing correlation. + pub capture_wall_time_us: u64, + /// Wall-clock timestamp recorded after the frame was read from AVFoundation. + pub read_wall_time_us: u64, + /// Sensor timestamp translated to UNIX-epoch microseconds, when available. + pub sensor_timestamp_us: Option, + /// Whether conversion from the source format to I420 was needed. + pub used_conversion: bool, +} + +impl AvFoundationFrame { + /// Returns the decoded video frame. 
+ pub fn video_frame(&self) -> &VideoFrame { + &self.frame + } +} + +/// One AVFoundation frame backed by a native IOSurface-backed `CVPixelBuffer`. +#[derive(Debug)] +pub struct AvFoundationNativeFrame { + /// Native frame suitable for [`crate::VideoCaptureTrack::capture_frame`]. + pub frame: VideoFrame, + /// Source frame format delivered by AVFoundation. + pub source_format: CaptureFrameFormat, + /// Wall-clock timestamp selected for metadata and timing correlation. + pub capture_wall_time_us: u64, + /// Wall-clock timestamp recorded after the frame was read from AVFoundation. + pub read_wall_time_us: u64, + /// Sensor timestamp translated to UNIX-epoch microseconds, when available. + pub sensor_timestamp_us: Option, +} + +impl AvFoundationNativeFrame { + /// Returns the native video frame. + pub fn video_frame(&self) -> &VideoFrame { + &self.frame + } +} + +/// AVFoundation capture session that emits I420 frames or native `CVPixelBuffer`s. +pub struct AvFoundationCaptureSession { + format: CaptureFormat, + options: AvFoundationCaptureOptions, + target_resolution: Option, + native_frame_supported: bool, + #[cfg(target_os = "macos")] + core_video_pixel_format: u32, + #[cfg(target_os = "macos")] + inner: macos::SessionInner, +} + +impl std::fmt::Debug for AvFoundationCaptureSession { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("AvFoundationCaptureSession") + .field("format", &self.format) + .field("options", &self.options) + .finish() + } +} + +// SAFETY: `AvFoundationCaptureSession` owns AVFoundation objects and only exposes +// `&mut self` frame capture plus `Drop`; moving ownership to another thread does +// not create concurrent access to those Objective-C objects. +#[cfg(target_os = "macos")] +unsafe impl Send for AvFoundationCaptureSession {} + +impl AvFoundationCaptureSession { + /// Opens an AVFoundation decoded-frame capture session. 
+ pub fn new(options: AvFoundationCaptureOptions) -> Result { + validate_options(&options)?; + Self::open(options) + } + + /// Captures the next decoded frame and converts it to I420. + /// + /// Blocks until AVFoundation delivers a frame. Fails with + /// [`AvFoundationError::NotRunning`] once the session has been stopped via + /// [`Self::stop`] or an [`AvFoundationStopHandle`]. + pub fn capture_frame(&mut self) -> Result { + self.capture_frame_inner() + } + + /// Captures the next frame as a native `CVPixelBuffer`. + /// + /// Blocks until AVFoundation delivers a frame. Fails with + /// [`AvFoundationError::NotRunning`] once the session has been stopped via + /// [`Self::stop`] or an [`AvFoundationStopHandle`]. + pub fn capture_native_frame(&mut self) -> Result { + self.capture_native_frame_inner() + } + + /// Returns a cheaply cloneable handle that stops this session from another + /// thread. See [`AvFoundationStopHandle::stop`]. + pub fn stop_handle(&self) -> AvFoundationStopHandle { + AvFoundationStopHandle { + #[cfg(target_os = "macos")] + shared: self.inner.frame_queue(), + } + } + + /// Stops frame delivery, waking any thread blocked in + /// [`Self::capture_frame`] or [`Self::capture_native_frame`]. See + /// [`AvFoundationStopHandle::stop`] for the exact contract. + pub fn stop(&self) { + self.stop_handle().stop(); + } + + /// Returns the negotiated capture format. + pub fn format(&self) -> CaptureFormat { + self.format + } + + /// Returns the configured capture options. + pub fn options(&self) -> &AvFoundationCaptureOptions { + &self.options + } + + /// Returns the capture path produced by this session. + pub fn capture_path(&self) -> CapturePath { + if self.native_capture_supported() { + CapturePath::Native + } else { + CapturePath::Raw + } + } + + /// Returns the CoreVideo pixel format type delivered by AVFoundation. 
+ #[cfg(target_os = "macos")] + pub fn core_video_pixel_format(&self) -> u32 { + self.core_video_pixel_format + } + + pub(crate) fn native_capture_supported(&self) -> bool { + self.native_frame_supported + && self.target_resolution.is_none() + && self.format.frame_format == CaptureFrameFormat::Nv12 + } + + #[cfg(target_os = "macos")] + fn open(options: AvFoundationCaptureOptions) -> Result { + let inner = macos::SessionInner::new(&options)?; + let initial_frame = inner.wait_for_format(FIRST_FRAME_TIMEOUT)?; + inner.discard_pending_frame(); + let mut format = initial_frame.format; + format.frame_rate = requested_frame_rate_hint(&options.format).unwrap_or(30); + let target_resolution = requested_output_resolution(&options.format, format.resolution); + if let Some(resolution) = target_resolution { + format.resolution = resolution; + } + Ok(Self { + format, + options, + target_resolution, + native_frame_supported: initial_frame.native_frame_supported, + core_video_pixel_format: initial_frame.core_video_pixel_format, + inner, + }) + } + + #[cfg(not(target_os = "macos"))] + fn open(_options: AvFoundationCaptureOptions) -> Result { + Err(AvFoundationError::UnsupportedPlatform) + } + + #[cfg(target_os = "macos")] + fn capture_frame_inner(&mut self) -> Result { + let mut frame = self.inner.capture_frame()?; + if let Some(resolution) = self.target_resolution { + if frame.frame.buffer.width() != resolution.width + || frame.frame.buffer.height() != resolution.height + { + let width = i32::try_from(resolution.width) + .map_err(|_| AvFoundationError::InvalidFrame("scaled width exceeds i32"))?; + let height = i32::try_from(resolution.height) + .map_err(|_| AvFoundationError::InvalidFrame("scaled height exceeds i32"))?; + frame.frame.buffer = frame.frame.buffer.scale(width, height); + } + } + Ok(frame) + } + + #[cfg(not(target_os = "macos"))] + fn capture_frame_inner(&mut self) -> Result { + Err(AvFoundationError::UnsupportedPlatform) + } + + #[cfg(target_os = "macos")] + fn 
capture_native_frame_inner(&mut self) -> Result { + if self.target_resolution.is_some() { + return Err(AvFoundationError::NativeCaptureUnavailable); + } + if self.format.frame_format != CaptureFrameFormat::Nv12 { + return Err(AvFoundationError::UnsupportedFrameFormat(self.format.frame_format)); + } + self.inner.capture_native_frame() + } + + #[cfg(not(target_os = "macos"))] + fn capture_native_frame_inner(&mut self) -> Result { + Err(AvFoundationError::UnsupportedPlatform) + } +} + +/// Cheaply cloneable handle that stops an [`AvFoundationCaptureSession`] from +/// another thread. +/// +/// The thread that owns the session is typically blocked inside +/// [`AvFoundationCaptureSession::capture_frame`] waiting for the camera, so it +/// cannot stop itself if the device stalls without delivering an error +/// (unplug, sleep, exclusive use by another app). Obtaining this handle before +/// handing the session to that thread gives the rest of the process a way to +/// abort the wait. +#[derive(Clone, Debug)] +pub struct AvFoundationStopHandle { + #[cfg(target_os = "macos")] + shared: Arc, +} + +impl AvFoundationStopHandle { + /// Stops frame delivery for the associated session and wakes all blocked + /// capture calls. + /// + /// Stopping is idempotent. Once stopped, + /// [`AvFoundationCaptureSession::capture_frame`] and + /// [`AvFoundationCaptureSession::capture_native_frame`] fail with + /// [`AvFoundationError::NotRunning`]; a frame that was already queued may + /// still be returned before the first error. The underlying AVFoundation + /// session is torn down when the session value itself is dropped. + pub fn stop(&self) { + #[cfg(target_os = "macos")] + self.shared.stop(); + } +} + +/// AVFoundation decoded-frame capture session that forwards frames into a track. 
+pub struct AvFoundationCapture { + track: VideoCaptureTrack, + options: AvFoundationCaptureOptions, + runner: Option, +} + +impl std::fmt::Debug for AvFoundationCapture { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("AvFoundationCapture") + .field("track", &self.track) + .field("options", &self.options) + .field("running", &self.runner.is_some()) + .finish() + } +} + +impl AvFoundationCapture { + /// Creates an AVFoundation capture session wrapper for a capture track. + pub fn new( + track: VideoCaptureTrack, + options: AvFoundationCaptureOptions, + ) -> Result { + ensure_platform_available()?; + Ok(Self { track, options, runner: None }) + } + + /// Returns the capture track that receives decoded frames. + pub fn track(&self) -> &VideoCaptureTrack { + &self.track + } + + /// Returns the configured capture options. + pub fn options(&self) -> &AvFoundationCaptureOptions { + &self.options + } + + /// Starts AVFoundation capture on a background thread. + pub fn start(&mut self) -> Result<(), AvFoundationError> { + start_capture(self) + } + + /// Stops AVFoundation capture. + pub fn stop(&mut self) -> Result<(), AvFoundationError> { + stop_capture(self) + } +} + +impl Drop for AvFoundationCapture { + fn drop(&mut self) { + let _ = self.stop(); + } +} + +#[derive(Debug)] +struct CaptureRunner { + stop: Arc, + /// Wakes the capture thread out of a blocking frame wait so `stop_capture` + /// can join it even when the camera has stalled. + stop_handle: AvFoundationStopHandle, + handle: JoinHandle<()>, +} + +/// Lists AVFoundation video capture devices. +pub fn devices() -> Result, AvFoundationError> { + list_devices() +} + +/// Error returned by AVFoundation capture. +#[derive(Debug, Error)] +#[non_exhaustive] +pub enum AvFoundationError { + /// AVFoundation capture is only available on macOS. + #[error("AVFoundation capture is only available on macOS")] + UnsupportedPlatform, + /// The requested device was not found. 
+ #[error("AVFoundation capture device was not found")] + DeviceNotFound, + /// The requested option is invalid. + #[error("invalid AVFoundation capture option: {0}")] + InvalidOption(&'static str), + /// The requested capture frame format is not supported by this backend. + #[error("AVFoundation capture does not support frame format {0:?}")] + UnsupportedFrameFormat(CaptureFrameFormat), + /// The requested capture format is not available on the selected device. + #[error("AVFoundation capture format is not available: {0:?}")] + UnsupportedFormat(CaptureFormat), + /// AVFoundation could not configure the capture session. + #[error("AVFoundation session setup failed: {0}")] + SessionSetup(String), + /// Timed out waiting for AVFoundation to deliver a frame. + #[error("timed out waiting for AVFoundation frame")] + FrameTimeout, + /// The capture session is already running. + #[error("AVFoundation capture is already running")] + AlreadyRunning, + /// The capture session is not running. + #[error("AVFoundation capture is not running")] + NotRunning, + /// Captured frame bytes did not match the negotiated format. + #[error("invalid AVFoundation frame buffer: {0}")] + InvalidFrame(&'static str), + /// AVFoundation produced a pixel format this backend cannot convert yet. + #[error("unsupported AVFoundation pixel format 0x{0:08x}")] + UnsupportedCoreVideoPixelFormat(u32), + /// Native capture cannot be used for the negotiated session. + #[error("AVFoundation native capture requires NV12 without software scaling")] + NativeCaptureUnavailable, + /// Pixel conversion failed. + #[error("failed to convert AVFoundation frame to I420: {0}")] + Convert(&'static str), + /// AVFoundation reported a runtime capture error. + #[error("AVFoundation runtime error: {0}")] + Runtime(String), + /// The shared capture track rejected a frame. 
+ #[error(transparent)] + Capture(#[from] CaptureError), +} + +fn validate_options(options: &AvFoundationCaptureOptions) -> Result<(), AvFoundationError> { + match &options.device { + CaptureDeviceSelector::Default | CaptureDeviceSelector::Index(_) => {} + CaptureDeviceSelector::Id(id) => { + if id.is_empty() { + return Err(AvFoundationError::InvalidOption("device id must be non-empty")); + } + } + } + + validate_format_request(&options.format) +} + +fn validate_format_request(format: &CaptureFormatRequest) -> Result<(), AvFoundationError> { + let validate_format = |format: &CaptureFormat| { + if format.resolution.width == 0 { + return Err(AvFoundationError::InvalidOption("width must be non-zero")); + } + if format.resolution.height == 0 { + return Err(AvFoundationError::InvalidOption("height must be non-zero")); + } + if format.frame_rate == 0 { + return Err(AvFoundationError::InvalidOption("frame_rate must be non-zero")); + } + validate_frame_format(format.frame_format)?; + Ok(()) + }; + + match format { + CaptureFormatRequest::Default => Ok(()), + CaptureFormatRequest::Exact(format) | CaptureFormatRequest::Closest(format) => { + validate_format(format) + } + CaptureFormatRequest::HighestFrameRate { resolution, frame_format } => { + if let Some(resolution) = resolution { + validate_resolution(*resolution)?; + } + if let Some(frame_format) = frame_format { + validate_frame_format(*frame_format)?; + } + Ok(()) + } + CaptureFormatRequest::HighestResolution { frame_rate, frame_format } => { + if matches!(frame_rate, Some(0)) { + return Err(AvFoundationError::InvalidOption("frame_rate must be non-zero")); + } + if let Some(frame_format) = frame_format { + validate_frame_format(*frame_format)?; + } + Ok(()) + } + } +} + +fn validate_frame_format(frame_format: CaptureFrameFormat) -> Result<(), AvFoundationError> { + if !matches!( + frame_format, + CaptureFrameFormat::Nv12 | CaptureFrameFormat::Bgra | CaptureFrameFormat::I420 + ) { + return 
Err(AvFoundationError::UnsupportedFrameFormat(frame_format)); + } + Ok(()) +} + +fn requested_frame_rate_hint(format: &CaptureFormatRequest) -> Option { + match format { + CaptureFormatRequest::Default => None, + CaptureFormatRequest::Exact(format) | CaptureFormatRequest::Closest(format) => { + Some(format.frame_rate) + } + CaptureFormatRequest::HighestFrameRate { .. } => None, + CaptureFormatRequest::HighestResolution { frame_rate, .. } => *frame_rate, + } +} + +fn requested_output_resolution( + request: &CaptureFormatRequest, + delivered: CaptureResolution, +) -> Option { + let CaptureFormatRequest::Closest(format) = request else { + return None; + }; + if format.resolution == delivered { + return None; + } + (resolution_area(format.resolution) <= resolution_area(delivered)).then_some(format.resolution) +} + +fn resolution_area(resolution: CaptureResolution) -> u64 { + resolution.width as u64 * resolution.height as u64 +} + +fn validate_resolution(resolution: CaptureResolution) -> Result<(), AvFoundationError> { + if resolution.width == 0 { + return Err(AvFoundationError::InvalidOption("width must be non-zero")); + } + if resolution.height == 0 { + return Err(AvFoundationError::InvalidOption("height must be non-zero")); + } + Ok(()) +} + +#[cfg(target_os = "macos")] +fn ensure_platform_available() -> Result<(), AvFoundationError> { + Ok(()) +} + +#[cfg(not(target_os = "macos"))] +fn ensure_platform_available() -> Result<(), AvFoundationError> { + Err(AvFoundationError::UnsupportedPlatform) +} + +#[cfg(target_os = "macos")] +fn list_devices() -> Result, AvFoundationError> { + use objc2_av_foundation::{AVCaptureDevice, AVMediaTypeVideo}; + + // SAFETY: AVMediaTypeVideo is a framework-provided immutable NSString + // constant. We only borrow it to ask AVFoundation for video devices. 
+ let media_type = unsafe { AVMediaTypeVideo }.ok_or(AvFoundationError::DeviceNotFound)?; + // SAFETY: AVFoundation returns an immutable NSArray of currently available + // AVCaptureDevice instances. We only retain/copy string properties from it. + #[allow(deprecated)] + let devices = unsafe { AVCaptureDevice::devicesWithMediaType(media_type) }; + + let mut results = Vec::with_capacity(devices.len()); + for device in devices.iter() { + // SAFETY: These Objective-C property getters return retained NSStrings + // for a live AVCaptureDevice from the immutable devices array. + let id = unsafe { device.uniqueID() }.to_string(); + let name = unsafe { device.localizedName() }.to_string(); + let model_id = non_empty_string(unsafe { device.modelID() }.to_string()); + let manufacturer = non_empty_string(unsafe { device.manufacturer() }.to_string()); + + results.push(CaptureDeviceInfo { + backend: CaptureBackend::AvFoundation, + id: id.clone(), + selector: CaptureDeviceSelector::Id(id), + name, + model_id, + manufacturer, + paths: vec![CapturePath::Native, CapturePath::Raw], + formats: Vec::new(), + formats_complete: false, + }); + } + + Ok(results) +} + +#[cfg(not(target_os = "macos"))] +fn list_devices() -> Result, AvFoundationError> { + Err(AvFoundationError::UnsupportedPlatform) +} + +#[cfg(target_os = "macos")] +fn non_empty_string(value: String) -> Option { + (!value.is_empty()).then_some(value) +} + +#[cfg(target_os = "macos")] +fn start_capture(capture: &mut AvFoundationCapture) -> Result<(), AvFoundationError> { + if capture.runner.is_some() { + return Err(AvFoundationError::AlreadyRunning); + } + + let track = capture.track.clone(); + let mut session = AvFoundationCaptureSession::new(capture.options.clone())?; + let capture_native = session.native_capture_supported(); + // Keep a stop handle outside the capture thread: once the session moves + // into the thread, this is the only way to wake a blocked frame wait. 
+ let stop_handle = session.stop_handle(); + let stop = Arc::new(AtomicBool::new(false)); + let stop_for_thread = stop.clone(); + let handle = std::thread::Builder::new() + .name("avfoundation-capture".into()) + .spawn(move || { + while !stop_for_thread.load(Ordering::Acquire) { + if capture_native { + match session.capture_native_frame() { + Ok(frame) => track.capture_frame(&frame.frame), + Err(_) => break, + } + } else { + match session.capture_frame() { + Ok(frame) => track.capture_frame(&frame.frame), + Err(_) => break, + } + } + } + }) + .map_err(|err| AvFoundationError::SessionSetup(err.to_string()))?; + + capture.runner = Some(CaptureRunner { stop, stop_handle, handle }); + Ok(()) +} + +#[cfg(not(target_os = "macos"))] +fn start_capture(_capture: &mut AvFoundationCapture) -> Result<(), AvFoundationError> { + Err(AvFoundationError::UnsupportedPlatform) +} + +#[cfg(target_os = "macos")] +fn stop_capture(capture: &mut AvFoundationCapture) -> Result<(), AvFoundationError> { + let Some(runner) = capture.runner.take() else { + return Ok(()); + }; + + runner.stop.store(true, Ordering::Release); + // Wake the capture thread if it is blocked waiting for the next frame so a + // stalled camera cannot keep the join below from completing. The woken + // wait fails with `NotRunning`, and the thread exits via the loop's error + // path or the stop flag. 
+ runner.stop_handle.stop(); + runner.handle.join().map_err(|_| { + AvFoundationError::Runtime("AVFoundation capture thread panicked".to_string()) + })?; + Ok(()) +} + +#[cfg(not(target_os = "macos"))] +fn stop_capture(_capture: &mut AvFoundationCapture) -> Result<(), AvFoundationError> { + Err(AvFoundationError::UnsupportedPlatform) +} + +#[cfg(target_os = "macos")] +mod macos { + use std::ffi::c_void; + use std::ops::Deref; + use std::ptr::NonNull; + use std::sync::{Arc, Condvar, Mutex}; + use std::time::{Duration, Instant}; + + use dispatch2::{DispatchQueue, DispatchRetained}; + use livekit::webrtc::video_frame::{ + native::NativeBuffer, I420Buffer, VideoFrame, VideoRotation, + }; + use objc2::rc::Retained; + use objc2::runtime::{AnyObject, ProtocolObject}; + use objc2::{define_class, msg_send, AnyThread, DefinedClass, Message}; + use objc2_av_foundation::{ + AVCaptureDevice, AVCaptureDeviceFormat, AVCaptureDeviceInput, AVCaptureOutput, + AVCaptureSession, AVCaptureSessionPreset1280x720, AVCaptureSessionPreset1920x1080, + AVCaptureSessionPreset640x480, AVCaptureSessionPresetHigh, + AVCaptureSessionPresetInputPriority, AVCaptureSessionPresetMedium, + AVCaptureVideoDataOutput, AVCaptureVideoDataOutputSampleBufferDelegate, + AVCaptureVideoStabilizationMode, AVMediaTypeVideo, + }; + use objc2_core_media::{ + CMClock, CMSampleBuffer, CMTime, CMTimeFlags, CMVideoFormatDescriptionGetDimensions, + }; + use objc2_core_video::{ + kCVPixelBufferIOSurfacePropertiesKey, kCVPixelBufferMetalCompatibilityKey, + kCVPixelBufferPixelFormatTypeKey, kCVPixelFormatType_32BGRA, + kCVPixelFormatType_420YpCbCr8BiPlanarFullRange, + kCVPixelFormatType_420YpCbCr8BiPlanarVideoRange, kCVPixelFormatType_420YpCbCr8Planar, + kCVPixelFormatType_420YpCbCr8PlanarFullRange, kCVPixelFormatType_422YpCbCr8, + kCVPixelFormatType_422YpCbCr8FullRange, kCVPixelFormatType_422YpCbCr8_yuvs, + kCVReturnSuccess, CVImageBuffer, CVPixelBuffer, CVPixelBufferGetBaseAddress, + CVPixelBufferGetBaseAddressOfPlane, 
CVPixelBufferGetBytesPerRow, + CVPixelBufferGetBytesPerRowOfPlane, CVPixelBufferGetHeight, CVPixelBufferGetHeightOfPlane, + CVPixelBufferGetPixelFormatType, CVPixelBufferGetPlaneCount, CVPixelBufferGetWidth, + CVPixelBufferGetWidthOfPlane, CVPixelBufferLockBaseAddress, CVPixelBufferLockFlags, + CVPixelBufferUnlockBaseAddress, + }; + use objc2_foundation::{NSDictionary, NSNumber, NSObject, NSObjectProtocol, NSString}; + + use super::{ + AvFoundationCaptureOptions, AvFoundationError, AvFoundationFrame, AvFoundationNativeFrame, + }; + use crate::device::{ + CaptureDeviceSelector, CaptureFormat, CaptureFormatRequest, CaptureFrameFormat, + CaptureResolution, + }; + use crate::time::{ + elapsed_us, unix_time_us_now, validate_capture_timestamp_us, MAX_CAPTURE_TIMESTAMP_AGE_US, + }; + + unsafe extern "C" { + fn CFRelease(cf: *const c_void); + fn CVPixelBufferGetIOSurface(pixel_buffer: *const CVPixelBuffer) -> *const c_void; + } + pub(super) struct SessionInner { + session: Retained, + _input: Retained, + output: Retained, + _delegate: Retained, + _queue: DispatchRetained, + shared: Arc, + } + + impl std::fmt::Debug for SessionInner { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("SessionInner").finish_non_exhaustive() + } + } + + impl Drop for SessionInner { + fn drop(&mut self) { + self.shared.stop(); + // SAFETY: The output and session are owned by this wrapper. Clearing + // the delegate before stopping prevents callbacks from racing with + // the delegate being released during teardown. 
+ unsafe { + self.output.setSampleBufferDelegate_queue(None, None); + self.session.stopRunning(); + } + } + } + + impl SessionInner { + pub(super) fn new(options: &AvFoundationCaptureOptions) -> Result { + let device = select_device(&options.device)?; + let session = unsafe { AVCaptureSession::new() }; + let input = unsafe { AVCaptureDeviceInput::deviceInputWithDevice_error(&device) } + .map_err(|err| { + AvFoundationError::SessionSetup(err.localizedDescription().to_string()) + })?; + let output = unsafe { AVCaptureVideoDataOutput::new() }; + let shared = Arc::new(FrameQueue::default()); + let delegate = CaptureDelegate::new(shared.clone()); + let queue = DispatchQueue::new("io.livekit.capture.avfoundation", None); + let active_format = select_active_format(&device, &options.format)?; + + // SAFETY: The session is newly created and not running. We add a + // camera input and video data output only after canAdd* checks. + unsafe { + session.beginConfiguration(); + session.setAutomaticallyConfiguresCaptureDeviceForWideColor(false); + if active_format.is_none() { + if let Some(preset) = session_preset(&options.format) { + session.setSessionPreset(preset); + } + } + let config_result = (|| { + if !session.canAddInput(&input) { + return Err(AvFoundationError::SessionSetup( + "capture device input could not be added".to_string(), + )); + } + session.addInput(&input); + + configure_device(&device, &options.format, active_format.as_deref())?; + if active_format.is_some() + && session.canSetSessionPreset(AVCaptureSessionPresetInputPriority) + { + session.setSessionPreset(AVCaptureSessionPresetInputPriority); + } + configure_input_frame_duration(&input, &device, &options.format); + + if let Some(video_settings) = preferred_video_settings(&output) { + output.setVideoSettings(Some(&video_settings)); + } + output.setAlwaysDiscardsLateVideoFrames(true); + output.setSampleBufferDelegate_queue( + Some(ProtocolObject::from_ref(&*delegate)), + Some(&queue), + ); + if 
!session.canAddOutput(&output) { + return Err(AvFoundationError::SessionSetup( + "video data output could not be added".to_string(), + )); + } + session.addOutput(&output); + configure_output_connection(&output)?; + Ok(()) + })(); + session.commitConfiguration(); + config_result?; + } + + // SAFETY: Configuration has been committed and the session is ready + // to synchronously start delivering video samples. + unsafe { + session.startRunning(); + } + + Ok(Self { session, _input: input, output, _delegate: delegate, _queue: queue, shared }) + } + + pub(super) fn wait_for_format( + &self, + timeout: Duration, + ) -> Result { + self.shared.wait_for_format(timeout) + } + + pub(super) fn capture_frame(&mut self) -> Result { + self.shared.take_frame() + } + + pub(super) fn capture_native_frame( + &mut self, + ) -> Result { + self.shared.take_native_frame() + } + + pub(super) fn discard_pending_frame(&self) { + self.shared.discard_latest(); + } + + /// Returns the shared frame queue so callers outside the session-owning + /// thread can stop a blocked frame wait. + pub(super) fn frame_queue(&self) -> Arc { + Arc::clone(&self.shared) + } + } + + fn preferred_video_settings( + output: &AVCaptureVideoDataOutput, + ) -> Option>> { + let preferred = [ + // WebRTC's VideoToolbox H.264 encoder allocates full-range NV12 + // buffers for its CPU upload path. Prefer the same CoreVideo + // format for direct CVPixelBuffer input so the native path does + // not have to reset VideoToolbox into a separate video-range pool. + kCVPixelFormatType_420YpCbCr8BiPlanarFullRange, + kCVPixelFormatType_420YpCbCr8BiPlanarVideoRange, + ]; + // SAFETY: `output` is a live AVCaptureVideoDataOutput owned by the session setup path, and + // querying advertised CV pixel formats does not mutate Rust-managed memory. 
+ let supported_formats = unsafe { output.availableVideoCVPixelFormatTypes() }; + let pixel_format_type = preferred.into_iter().find(|preferred| { + supported_formats.iter().any(|format| format.as_u32() == *preferred) + })?; + + let pixel_format = NSNumber::new_u32(pixel_format_type); + let metal_compatible = NSNumber::new_bool(true); + let iosurface_properties = NSDictionary::::new(); + // SAFETY: The CoreVideo constants are immutable CFString keys. + // `CFString` and `NSString` are toll-free bridged, which + // objc2-foundation exposes through `AsRef`. + let pixel_format_key: &NSString = unsafe { kCVPixelBufferPixelFormatTypeKey }.as_ref(); + // SAFETY: Same as above. + let iosurface_key: &NSString = unsafe { kCVPixelBufferIOSurfacePropertiesKey }.as_ref(); + // SAFETY: Same as above. + let metal_key: &NSString = unsafe { kCVPixelBufferMetalCompatibilityKey }.as_ref(); + Some(NSDictionary::from_slices( + &[pixel_format_key, iosurface_key, metal_key], + &[pixel_format.as_ref(), iosurface_properties.as_ref(), metal_compatible.as_ref()], + )) + } + + fn configure_input_frame_duration( + input: &AVCaptureDeviceInput, + device: &AVCaptureDevice, + request: &CaptureFormatRequest, + ) { + let Some(frame_rate) = requested_frame_rate(request).filter(|frame_rate| *frame_rate > 0) + else { + return; + }; + // SAFETY: `input` is the live input just added to the session. The + // support predicate is checked before setting the locked duration. + if !unsafe { input.isLockedVideoFrameDurationSupported() } { + return; + } + + let duration = unsafe { CMTime::with_seconds(1.0 / frame_rate as f64, 600) }; + // SAFETY: `device` and `input` belong to the same session setup path. + // The requested rate has already been checked against the active format + // before the device frame durations are set, and `input` reports locked + // frame duration support. 
+ unsafe { + if device_format_supports_frame_rate(&device.activeFormat(), frame_rate) { + input.setActiveLockedVideoFrameDuration(duration); + } + } + } + + fn configure_output_connection( + output: &AVCaptureVideoDataOutput, + ) -> Result<(), AvFoundationError> { + let media_type = unsafe { AVMediaTypeVideo }.ok_or(AvFoundationError::DeviceNotFound)?; + // SAFETY: `output` has just been added to a configured session. Querying + // its video connection does not mutate Rust-managed memory. + let Some(connection) = (unsafe { output.connectionWithMediaType(media_type) }) else { + return Err(AvFoundationError::SessionSetup( + "video data output connection was not created".to_string(), + )); + }; + + // Keep frame-duration control on the device/input path. The deprecated + // output connection frame-duration setters can change whether macOS + // delivers IOSurface-backed pixel buffers. + // SAFETY: The connection is the video data output connection. Each + // setter is guarded by the corresponding support/configuration checks + // required by AVFoundation's API contract. + unsafe { + if connection.isVideoStabilizationSupported() { + connection.setPreferredVideoStabilizationMode(AVCaptureVideoStabilizationMode::Off); + } + if connection.automaticallyAdjustsVideoMirroring() { + connection.setAutomaticallyAdjustsVideoMirroring(false); + } + if connection.isVideoMirroringSupported() && connection.isVideoMirrored() { + connection.setVideoMirrored(false); + } + } + Ok(()) + } + + #[derive(Debug)] + struct CaptureDelegateIvars { + shared: Arc, + } + + define_class!( + // SAFETY: + // - The superclass NSObject does not have subclassing requirements. + // - CaptureDelegate does not implement Drop; retained Rust state lives in ivars. + #[unsafe(super = NSObject)] + #[thread_kind = AnyThread] + #[ivars = CaptureDelegateIvars] + struct CaptureDelegate; + + // SAFETY: `NSObjectProtocol` has no additional safety requirements. 
+ unsafe impl NSObjectProtocol for CaptureDelegate {} + + // SAFETY: The selector signatures match the generated AVFoundation protocol. + unsafe impl AVCaptureVideoDataOutputSampleBufferDelegate for CaptureDelegate { + #[unsafe(method(captureOutput:didOutputSampleBuffer:fromConnection:))] + #[allow(non_snake_case)] + unsafe fn captureOutput_didOutputSampleBuffer_fromConnection( + &self, + _output: &AVCaptureOutput, + sample_buffer: &CMSampleBuffer, + _connection: &objc2_av_foundation::AVCaptureConnection, + ) { + if let Err(err) = process_sample_buffer(sample_buffer, &self.ivars().shared) { + self.ivars().shared.set_error(err.to_string()); + } + } + } + ); + + impl CaptureDelegate { + fn new(shared: Arc) -> Retained { + let this = Self::alloc().set_ivars(CaptureDelegateIvars { shared }); + // SAFETY: `this` is freshly allocated and initialized exactly once + // using NSObject's designated initializer. + unsafe { msg_send![super(this), init] } + } + } + + /// Latest-frame mailbox shared between the AVFoundation delegate queue and + /// the capturing thread. `pub(super)` so [`super::AvFoundationStopHandle`] + /// can hold it and unit tests can exercise the stop path without a camera. 
+ #[derive(Debug)] + pub(super) struct FrameQueue { + state: Mutex, + ready: Condvar, + started_at: Instant, + } + + impl Default for FrameQueue { + fn default() -> Self { + Self { + state: Mutex::new(FrameQueueState::default()), + ready: Condvar::new(), + started_at: Instant::now(), + } + } + } + + #[derive(Debug, Default)] + struct FrameQueueState { + latest: Option, + stopped: bool, + error: Option, + } + + #[derive(Debug)] + pub(super) struct InitialFrameInfo { + pub(super) format: CaptureFormat, + pub(super) native_frame_supported: bool, + pub(super) core_video_pixel_format: u32, + } + + impl FrameQueue { + fn push_frame(&self, frame: QueuedAvFoundationFrame) { + let mut state = self.state.lock().expect("AVFoundation frame queue poisoned"); + if state.stopped { + return; + } + state.latest = Some(frame); + self.ready.notify_one(); + } + + fn set_error(&self, error: String) { + let mut state = self.state.lock().expect("AVFoundation frame queue poisoned"); + state.error = Some(error); + self.ready.notify_all(); + } + + /// Signals shutdown and wakes every blocked capture wait. + /// + /// Stopping is idempotent. `push_frame` discards frames delivered after + /// this point, and `take_frame`/`take_native_frame` fail with + /// [`AvFoundationError::NotRunning`] once any already-queued frame has + /// been drained. 
+ pub(super) fn stop(&self) { + let mut state = self.state.lock().expect("AVFoundation frame queue poisoned"); + state.stopped = true; + self.ready.notify_all(); + } + + fn discard_latest(&self) { + let mut state = self.state.lock().expect("AVFoundation frame queue poisoned"); + state.latest = None; + } + + fn wait_for_format( + &self, + timeout: Duration, + ) -> Result { + let mut state = self.state.lock().expect("AVFoundation frame queue poisoned"); + loop { + if let Some(frame) = state.latest.as_ref() { + return Ok(InitialFrameInfo { + format: CaptureFormat::new( + CaptureResolution::new(frame.width, frame.height), + 0, + frame.source_format, + ), + native_frame_supported: frame.native_frame_supported(), + core_video_pixel_format: frame.core_video_pixel_format, + }); + } + if let Some(error) = state.error.take() { + return Err(AvFoundationError::Runtime(error)); + } + if state.stopped { + return Err(AvFoundationError::NotRunning); + } + + let (next_state, wait_result) = self + .ready + .wait_timeout(state, timeout) + .expect("AVFoundation frame queue poisoned"); + if wait_result.timed_out() { + return Err(AvFoundationError::FrameTimeout); + } + state = next_state; + } + } + + pub(super) fn take_frame(&self) -> Result { + // Convert only after `wait_take_queued_frame` has released the + // state mutex: the conversion locks the pixel buffer and runs a + // full-frame libyuv copy, and holding the mutex through that would + // block `push_frame` on the AVFoundation delegate queue, which + // drops camera frames while stalled + // (`setAlwaysDiscardsLateVideoFrames(true)`). + self.wait_take_queued_frame()?.into_i420_frame() + } + + pub(super) fn take_native_frame( + &self, + ) -> Result { + // See `take_frame` for why conversion happens outside the mutex. + self.wait_take_queued_frame()?.into_native_frame() + } + + /// Blocks until a frame, a delegate error, or a stop signal arrives and + /// moves the frame out of the shared state. 
The state mutex guard is + /// dropped when this returns, so callers convert the fully owned frame + /// without holding the lock. Fails with + /// [`AvFoundationError::NotRunning`] once the queue has been stopped. + fn wait_take_queued_frame(&self) -> Result { + let mut state = self.state.lock().expect("AVFoundation frame queue poisoned"); + loop { + if let Some(frame) = state.latest.take() { + return Ok(frame); + } + if let Some(error) = state.error.take() { + return Err(AvFoundationError::Runtime(error)); + } + if state.stopped { + return Err(AvFoundationError::NotRunning); + } + state = self.ready.wait(state).expect("AVFoundation frame queue poisoned"); + } + } + + fn timestamp_us(&self) -> i64 { + elapsed_us(self.started_at.elapsed()) + } + } + + #[derive(Debug)] + struct QueuedAvFoundationFrame { + pixel_buffer: RetainedPixelBuffer, + width: u32, + height: u32, + source_format: CaptureFrameFormat, + core_video_pixel_format: u32, + capture_wall_time_us: u64, + read_wall_time_us: u64, + sensor_timestamp_us: Option, + timestamp_us: i64, + is_iosurface_backed: bool, + } + + impl QueuedAvFoundationFrame { + fn into_i420_frame(self) -> Result { + let (buffer, source_format) = convert_pixel_buffer(self.pixel_buffer.as_ref())?; + let frame = VideoFrame { + rotation: VideoRotation::VideoRotation0, + timestamp_us: self.timestamp_us, + frame_metadata: None, + buffer, + }; + + Ok(AvFoundationFrame { + frame, + source_format, + capture_wall_time_us: self.capture_wall_time_us, + read_wall_time_us: self.read_wall_time_us, + sensor_timestamp_us: self.sensor_timestamp_us, + used_conversion: source_format != CaptureFrameFormat::I420, + }) + } + + fn into_native_frame(self) -> Result { + if self.source_format != CaptureFrameFormat::Nv12 { + return Err(AvFoundationError::UnsupportedFrameFormat(self.source_format)); + } + if self.core_video_pixel_format != kCVPixelFormatType_420YpCbCr8BiPlanarFullRange { + return Err(AvFoundationError::NativeCaptureUnavailable); + } + if 
!self.is_iosurface_backed { + return Err(AvFoundationError::NativeCaptureUnavailable); + } + + let buffer = self.pixel_buffer.into_native_buffer(); + let frame = VideoFrame { + rotation: VideoRotation::VideoRotation0, + timestamp_us: self.timestamp_us, + frame_metadata: None, + buffer, + }; + + Ok(AvFoundationNativeFrame { + frame, + source_format: self.source_format, + capture_wall_time_us: self.capture_wall_time_us, + read_wall_time_us: self.read_wall_time_us, + sensor_timestamp_us: self.sensor_timestamp_us, + }) + } + + fn native_frame_supported(&self) -> bool { + self.source_format == CaptureFrameFormat::Nv12 + && self.core_video_pixel_format == kCVPixelFormatType_420YpCbCr8BiPlanarFullRange + && self.is_iosurface_backed + } + } + + fn pixel_buffer_has_iosurface(pixel_buffer: &CVPixelBuffer) -> bool { + // SAFETY: `pixel_buffer` is a valid CVPixelBufferRef. CoreVideo returns + // an unretained IOSurfaceRef; this code only checks for null and does + // not store or release the returned pointer. + !unsafe { CVPixelBufferGetIOSurface(pixel_buffer) }.is_null() + } + + #[derive(Debug)] + struct RetainedPixelBuffer { + ptr: NonNull, + } + + // SAFETY: `RetainedPixelBuffer` owns a +1 CoreFoundation reference to a + // CVPixelBuffer. CoreFoundation retain/release and CoreVideo pixel-buffer + // inspection are thread-safe for this usage, and mutable pixel access still + // goes through CoreVideo's lock/unlock API. + unsafe impl Send for RetainedPixelBuffer {} + // SAFETY: The wrapper exposes only shared access to the pixel buffer and + // releases its retained reference on drop. 
+ unsafe impl Sync for RetainedPixelBuffer {} + + impl RetainedPixelBuffer { + fn from_image_buffer(image_buffer: T) -> Self + where + T: Deref, + { + let ptr = NonNull::from(&*image_buffer).cast::(); + std::mem::forget(image_buffer); + Self { ptr } + } + + fn as_ref(&self) -> &CVPixelBuffer { + // SAFETY: `ptr` was created from a retained CVImageBuffer returned + // by CMSampleBufferGetImageBuffer and remains valid until this + // wrapper drops or transfers that retain. + unsafe { self.ptr.as_ref() } + } + + fn into_native_buffer(self) -> NativeBuffer { + let ptr = self.ptr.as_ptr().cast::(); + std::mem::forget(self); + // SAFETY: `ptr` is a valid retained CVPixelBufferRef. The WebRTC + // bridge wraps it in RTCCVPixelBuffer and then releases the +1 + // retain we transfer here, so Rust must not release it afterwards. + unsafe { NativeBuffer::from_cv_pixel_buffer(ptr) } + } + } + + impl Drop for RetainedPixelBuffer { + fn drop(&mut self) { + // SAFETY: `ptr` owns one CoreFoundation retain unless ownership was + // transferred by `into_native_buffer`, which forgets `self`. 
+ unsafe { CFRelease(self.ptr.as_ptr().cast::()) }; + } + } + + fn select_device( + selector: &CaptureDeviceSelector, + ) -> Result, AvFoundationError> { + let media_type = unsafe { AVMediaTypeVideo }.ok_or(AvFoundationError::DeviceNotFound)?; + match selector { + CaptureDeviceSelector::Default => { + unsafe { AVCaptureDevice::defaultDeviceWithMediaType(media_type) } + .ok_or(AvFoundationError::DeviceNotFound) + } + CaptureDeviceSelector::Index(index) => { + #[allow(deprecated)] + let devices = unsafe { AVCaptureDevice::devicesWithMediaType(media_type) }; + devices + .iter() + .nth(*index) + .map(|device| device.retain()) + .ok_or(AvFoundationError::DeviceNotFound) + } + CaptureDeviceSelector::Id(id) => { + let id = NSString::from_str(id); + unsafe { AVCaptureDevice::deviceWithUniqueID(&id) } + .ok_or(AvFoundationError::DeviceNotFound) + } + } + } + + fn select_active_format( + device: &AVCaptureDevice, + request: &CaptureFormatRequest, + ) -> Result>, AvFoundationError> { + match request { + CaptureFormatRequest::Default => Ok(None), + CaptureFormatRequest::Exact(format) => { + let selected = best_device_format( + device, + Some(format.resolution), + Some(format.frame_rate), + SelectionMode::Exact, + ); + selected.map(Some).ok_or(AvFoundationError::UnsupportedFormat(*format)) + } + CaptureFormatRequest::Closest(format) => Ok(best_device_format( + device, + Some(format.resolution), + Some(format.frame_rate), + SelectionMode::Closest, + )), + CaptureFormatRequest::HighestFrameRate { resolution, .. } => { + Ok(best_device_format(device, *resolution, None, SelectionMode::HighestFrameRate)) + } + CaptureFormatRequest::HighestResolution { frame_rate, .. 
} => { + Ok(best_device_format(device, None, *frame_rate, SelectionMode::HighestResolution)) + } + } + } + + #[derive(Debug, Clone, Copy, PartialEq, Eq)] + enum SelectionMode { + Exact, + Closest, + HighestFrameRate, + HighestResolution, + } + + #[derive(Debug)] + struct DeviceFormatCandidate { + format: Retained, + resolution: CaptureResolution, + frame_rate_supported: bool, + max_frame_rate: u32, + } + + fn best_device_format( + device: &AVCaptureDevice, + resolution: Option, + frame_rate: Option, + mode: SelectionMode, + ) -> Option> { + // SAFETY: The AVCaptureDevice is retained for the session setup path; querying the + // immutable list of supported formats does not mutate Rust-managed memory. + let formats = unsafe { device.formats() }; + let mut candidates = formats + .iter() + .filter_map(|format| { + let candidate_resolution = device_format_resolution(&format)?; + let frame_rate_supported = frame_rate + .map(|frame_rate| device_format_supports_frame_rate(&format, frame_rate)) + .unwrap_or(true); + Some(DeviceFormatCandidate { + format: format.retain(), + resolution: candidate_resolution, + frame_rate_supported, + max_frame_rate: device_format_max_frame_rate(&format), + }) + }) + .collect::>(); + + if let Some(resolution) = resolution { + if mode == SelectionMode::Exact { + return candidates + .into_iter() + .find(|candidate| { + candidate.resolution == resolution && candidate.frame_rate_supported + }) + .map(|candidate| candidate.format); + } + } + + if frame_rate.is_some() && candidates.iter().any(|candidate| candidate.frame_rate_supported) + { + candidates.retain(|candidate| candidate.frame_rate_supported); + } + + match mode { + SelectionMode::Exact => None, + SelectionMode::Closest => { + let resolution = resolution?; + candidates + .into_iter() + .min_by_key(|candidate| resolution_distance(candidate.resolution, resolution)) + .map(|candidate| candidate.format) + } + SelectionMode::HighestFrameRate => candidates + .into_iter() + .filter(|candidate| { 
+ resolution.map(|resolution| candidate.resolution == resolution).unwrap_or(true) + }) + .max_by_key(|candidate| { + ( + candidate.max_frame_rate, + candidate.resolution.width as u64 * candidate.resolution.height as u64, + ) + }) + .map(|candidate| candidate.format), + SelectionMode::HighestResolution => candidates + .into_iter() + .max_by_key(|candidate| { + ( + candidate.resolution.width as u64 * candidate.resolution.height as u64, + candidate.max_frame_rate, + ) + }) + .map(|candidate| candidate.format), + } + } + + fn device_format_resolution(format: &AVCaptureDeviceFormat) -> Option { + // SAFETY: `format` is an AVCaptureDeviceFormat from the device's immutable formats array. + // Its format description is a valid CMVideoFormatDescription for video capture formats. + let description = unsafe { format.formatDescription() }; + // SAFETY: `description` is the video format description returned by AVFoundation. + let dimensions = unsafe { CMVideoFormatDescriptionGetDimensions(&description) }; + if dimensions.width <= 0 || dimensions.height <= 0 { + return None; + } + Some(CaptureResolution::new(dimensions.width as u32, dimensions.height as u32)) + } + + fn device_format_supports_frame_rate(format: &AVCaptureDeviceFormat, frame_rate: u32) -> bool { + let requested = frame_rate as f64; + // SAFETY: `format` is an AVCaptureDeviceFormat from the device's immutable formats array. + // The returned frame-rate ranges are immutable AVFoundation objects. + unsafe { format.videoSupportedFrameRateRanges() }.iter().any(|range| { + // SAFETY: AVFrameRateRange values are immutable for the lifetime of the object. + let min = unsafe { range.minFrameRate() }; + // SAFETY: AVFrameRateRange values are immutable for the lifetime of the object. 
+ let max = unsafe { range.maxFrameRate() }; + requested >= min.floor() && requested <= max.ceil() + }) + } + + fn device_format_max_frame_rate(format: &AVCaptureDeviceFormat) -> u32 { + // SAFETY: `format` is an AVCaptureDeviceFormat from the device's immutable formats array. + // The returned frame-rate ranges are immutable AVFoundation objects. + unsafe { format.videoSupportedFrameRateRanges() } + .iter() + .map(|range| { + // SAFETY: AVFrameRateRange values are immutable for the lifetime of the object. + unsafe { range.maxFrameRate() }.floor().max(0.0) as u32 + }) + .max() + .unwrap_or_default() + } + + fn resolution_distance(actual: CaptureResolution, requested: CaptureResolution) -> u64 { + let width_delta = actual.width.abs_diff(requested.width) as u64; + let height_delta = actual.height.abs_diff(requested.height) as u64; + let pixel_delta = (actual.width as u64 * actual.height as u64) + .abs_diff(requested.width as u64 * requested.height as u64); + pixel_delta + width_delta * width_delta + height_delta * height_delta + } + + fn configure_device( + device: &AVCaptureDevice, + request: &CaptureFormatRequest, + active_format: Option<&AVCaptureDeviceFormat>, + ) -> Result<(), AvFoundationError> { + let frame_rate = requested_frame_rate(request); + if active_format.is_none() && frame_rate.is_none() { + return Ok(()); + } + + unsafe { device.lockForConfiguration() }.map_err(|err| { + AvFoundationError::SessionSetup(err.localizedDescription().to_string()) + })?; + + let configure_result = configure_locked_device(device, active_format, frame_rate); + // SAFETY: The device was successfully locked above and must be unlocked exactly once. 
+ unsafe { + device.unlockForConfiguration(); + } + configure_result + } + + fn configure_locked_device( + device: &AVCaptureDevice, + active_format: Option<&AVCaptureDeviceFormat>, + frame_rate: Option, + ) -> Result<(), AvFoundationError> { + // SAFETY: The caller holds the AVCaptureDevice configuration lock, and `active_format` + // was selected from this device's formats array. + unsafe { + if let Some(active_format) = active_format { + device.setActiveFormat(active_format); + } + } + configure_low_latency_device_processing(device); + + let Some(frame_rate) = frame_rate.filter(|frame_rate| *frame_rate > 0) else { + return Ok(()); + }; + + let active_format = match active_format { + Some(active_format) => active_format.retain(), + // SAFETY: The caller holds the configuration lock, and reading activeFormat is valid. + None => unsafe { device.activeFormat() }, + }; + if !device_format_supports_frame_rate(&active_format, frame_rate) { + return Ok(()); + } + + let duration = unsafe { CMTime::with_seconds(1.0 / frame_rate as f64, 600) }; + // SAFETY: The device is locked for configuration and the CMTime value is finite. + unsafe { + device.setActiveVideoMinFrameDuration(duration); + device.setActiveVideoMaxFrameDuration(duration); + } + Ok(()) + } + + fn configure_low_latency_device_processing(device: &AVCaptureDevice) { + // SAFETY: The caller holds the AVCaptureDevice configuration lock. + // Setters are guarded by their support/current-state predicates where + // AVFoundation requires that. 
+ unsafe { + if device.automaticallyAdjustsVideoHDREnabled() { + device.setAutomaticallyAdjustsVideoHDREnabled(false); + } + if device.isVideoHDREnabled() { + device.setVideoHDREnabled(false); + } + if device.isLowLightBoostSupported() + && device.automaticallyEnablesLowLightBoostWhenAvailable() + { + device.setAutomaticallyEnablesLowLightBoostWhenAvailable(false); + } + if device.isSmoothAutoFocusSupported() && device.isSmoothAutoFocusEnabled() { + device.setSmoothAutoFocusEnabled(false); + } + } + } + + fn requested_frame_rate(request: &CaptureFormatRequest) -> Option { + match request { + CaptureFormatRequest::Default => None, + CaptureFormatRequest::Exact(format) | CaptureFormatRequest::Closest(format) => { + Some(format.frame_rate) + } + CaptureFormatRequest::HighestFrameRate { .. } => None, + CaptureFormatRequest::HighestResolution { frame_rate, .. } => *frame_rate, + } + } + + fn session_preset( + request: &CaptureFormatRequest, + ) -> Option<&'static objc2_av_foundation::AVCaptureSessionPreset> { + let resolution = match request { + CaptureFormatRequest::Exact(format) | CaptureFormatRequest::Closest(format) => { + Some(format.resolution) + } + CaptureFormatRequest::HighestFrameRate { resolution, .. 
} => *resolution, + CaptureFormatRequest::Default + | CaptureFormatRequest::HighestResolution { frame_rate: _, frame_format: _ } => None, + }?; + + exact_session_preset(resolution).or(Some(unsafe { AVCaptureSessionPresetHigh })) + } + + fn exact_session_preset( + resolution: CaptureResolution, + ) -> Option<&'static objc2_av_foundation::AVCaptureSessionPreset> { + match (resolution.width, resolution.height) { + (1920, 1080) => Some(unsafe { AVCaptureSessionPreset1920x1080 }), + (1280, 720) => Some(unsafe { AVCaptureSessionPreset1280x720 }), + (640, 480) => Some(unsafe { AVCaptureSessionPreset640x480 }), + (w, h) if w <= 640 && h <= 480 => Some(unsafe { AVCaptureSessionPresetMedium }), + _ => None, + } + } + + fn process_sample_buffer( + sample_buffer: &CMSampleBuffer, + shared: &FrameQueue, + ) -> Result<(), AvFoundationError> { + let read_wall_time_us = unix_time_us_now().unwrap_or_default(); + let sensor_timestamp_us = + sample_buffer_capture_wall_time_us(sample_buffer, read_wall_time_us); + let image_buffer = unsafe { sample_buffer.image_buffer() } + .ok_or(AvFoundationError::InvalidFrame("sample buffer has no image buffer"))?; + let pixel_buffer = RetainedPixelBuffer::from_image_buffer(image_buffer); + let pixel_buffer_ref = pixel_buffer.as_ref(); + let width = u32::try_from(CVPixelBufferGetWidth(pixel_buffer_ref)) + .map_err(|_| AvFoundationError::InvalidFrame("width is out of range"))?; + let height = u32::try_from(CVPixelBufferGetHeight(pixel_buffer_ref)) + .map_err(|_| AvFoundationError::InvalidFrame("height is out of range"))?; + let source_format = capture_frame_format_from_core_video(CVPixelBufferGetPixelFormatType( + pixel_buffer_ref, + ))?; + let core_video_pixel_format = CVPixelBufferGetPixelFormatType(pixel_buffer_ref); + let is_iosurface_backed = pixel_buffer_has_iosurface(pixel_buffer_ref); + + let capture_wall_time_us = sensor_timestamp_us.unwrap_or(read_wall_time_us); + shared.push_frame(QueuedAvFoundationFrame { + pixel_buffer, + width, + 
height, + source_format, + core_video_pixel_format, + capture_wall_time_us, + read_wall_time_us, + sensor_timestamp_us, + timestamp_us: shared.timestamp_us(), + is_iosurface_backed, + }); + Ok(()) + } + + fn sample_buffer_capture_wall_time_us( + sample_buffer: &CMSampleBuffer, + read_wall_time_us: u64, + ) -> Option { + let sample_time = unsafe { sample_buffer.presentation_time_stamp() }; + + let timestamp_us = cm_time_to_us(sample_time)?; + if validate_capture_timestamp_us(timestamp_us, read_wall_time_us).is_some() { + return Some(timestamp_us); + } + + let host_now_us = current_host_time_us()?; + let age_us = host_now_us.checked_sub(timestamp_us)?; + if age_us > MAX_CAPTURE_TIMESTAMP_AGE_US { + return None; + } + read_wall_time_us.checked_sub(age_us) + } + + fn current_host_time_us() -> Option { + // SAFETY: The CoreMedia host time clock is a process-wide singleton and + // reading it does not mutate Rust-managed memory. + let host_clock = unsafe { CMClock::host_time_clock() }; + // SAFETY: `host_clock` is a valid retained CoreMedia clock. + let host_time = unsafe { host_clock.time() }; + cm_time_to_us(host_time) + } + + fn cm_time_to_us(time: CMTime) -> Option { + let flags = time.flags; + if !flags.contains(CMTimeFlags::Valid) + || flags.intersects(CMTimeFlags::ImpliedValueFlagsMask) + { + return None; + } + + // SAFETY: `time` is a valid CMTime value returned by CoreMedia. Invalid + // and indefinite values were filtered above. 
/// Converts `pixel_buffer` to a freshly allocated I420 buffer and reports the
/// capture format the pixels were read from.
///
/// The buffer is locked read-only for the duration of the conversion and
/// unlocked afterwards even when the conversion fails.
///
/// # Errors
///
/// [`AvFoundationError::InvalidFrame`] when locking or unlocking fails,
/// otherwise the result of `convert_locked_pixel_buffer`. An unlock failure
/// takes precedence over the conversion result.
fn convert_pixel_buffer(
    pixel_buffer: &CVPixelBuffer,
) -> Result<(I420Buffer, CaptureFrameFormat), AvFoundationError> {
    let lock_flags = CVPixelBufferLockFlags::ReadOnly;
    // SAFETY: `pixel_buffer` is a live reference for the whole call, and every
    // path after a successful lock reaches the matching unlock below.
    let lock_result = unsafe { CVPixelBufferLockBaseAddress(pixel_buffer, lock_flags) };
    if lock_result != kCVReturnSuccess {
        return Err(AvFoundationError::InvalidFrame("CVPixelBuffer lock failed"));
    }

    // Keep the result instead of using `?` so the unlock below always runs.
    let result = convert_locked_pixel_buffer(pixel_buffer);

    // SAFETY: The pixel buffer was locked above with the same flags.
    let unlock_result = unsafe { CVPixelBufferUnlockBaseAddress(pixel_buffer, lock_flags) };
    if unlock_result != kCVReturnSuccess {
        return Err(AvFoundationError::InvalidFrame("CVPixelBuffer unlock failed"));
    }

    result
}

/// Dispatches to the converter matching the buffer's pixel format.
///
/// The caller must hold the base-address lock on `pixel_buffer`.
fn convert_locked_pixel_buffer(
    pixel_buffer: &CVPixelBuffer,
) -> Result<(I420Buffer, CaptureFrameFormat), AvFoundationError> {
    let width = u32::try_from(CVPixelBufferGetWidth(pixel_buffer))
        .map_err(|_| AvFoundationError::InvalidFrame("width is out of range"))?;
    let height = u32::try_from(CVPixelBufferGetHeight(pixel_buffer))
        .map_err(|_| AvFoundationError::InvalidFrame("height is out of range"))?;
    let source_format =
        capture_frame_format_from_core_video(CVPixelBufferGetPixelFormatType(pixel_buffer))?;

    match source_format {
        CaptureFrameFormat::Nv12 => convert_nv12(pixel_buffer, width, height)
            .map(|buffer| (buffer, CaptureFrameFormat::Nv12)),
        CaptureFrameFormat::Bgra => convert_bgra(pixel_buffer, width, height)
            .map(|buffer| (buffer, CaptureFrameFormat::Bgra)),
        CaptureFrameFormat::I420 => convert_i420(pixel_buffer, width, height)
            .map(|buffer| (buffer, CaptureFrameFormat::I420)),
        CaptureFrameFormat::Uyvy => convert_uyvy(pixel_buffer, width, height)
            .map(|buffer| (buffer, CaptureFrameFormat::Uyvy)),
        CaptureFrameFormat::Yuyv => convert_yuy2(pixel_buffer, width, height)
            .map(|buffer| (buffer, CaptureFrameFormat::Yuyv)),
        other => Err(AvFoundationError::UnsupportedFrameFormat(other)),
    }
}

/// Maps a CoreVideo pixel format code to the capture frame format used by
/// this crate. Video-range and full-range variants map to the same format.
///
/// # Errors
///
/// [`AvFoundationError::UnsupportedCoreVideoPixelFormat`] for any other code.
fn capture_frame_format_from_core_video(
    pixel_format: u32,
) -> Result<CaptureFrameFormat, AvFoundationError> {
    match pixel_format {
        format
            if format == kCVPixelFormatType_420YpCbCr8BiPlanarVideoRange
                || format == kCVPixelFormatType_420YpCbCr8BiPlanarFullRange =>
        {
            Ok(CaptureFrameFormat::Nv12)
        }
        format if format == kCVPixelFormatType_32BGRA => Ok(CaptureFrameFormat::Bgra),
        format
            if format == kCVPixelFormatType_420YpCbCr8Planar
                || format == kCVPixelFormatType_420YpCbCr8PlanarFullRange =>
        {
            Ok(CaptureFrameFormat::I420)
        }
        format if format == kCVPixelFormatType_422YpCbCr8 => Ok(CaptureFrameFormat::Uyvy),
        format
            if format == kCVPixelFormatType_422YpCbCr8_yuvs
                || format == kCVPixelFormatType_422YpCbCr8FullRange =>
        {
            Ok(CaptureFrameFormat::Yuyv)
        }
        other => Err(AvFoundationError::UnsupportedCoreVideoPixelFormat(other)),
    }
}

/// Converts a locked bi-planar NV12 buffer to I420.
// NOTE(review): strides and dimensions are narrowed to `i32` with `as` in all
// converters below; presumably real camera frames never approach that limit —
// confirm, or switch to checked conversions.
fn convert_nv12(
    pixel_buffer: &CVPixelBuffer,
    width: u32,
    height: u32,
) -> Result<I420Buffer, AvFoundationError> {
    if CVPixelBufferGetPlaneCount(pixel_buffer) < 2 {
        return Err(AvFoundationError::InvalidFrame("NV12 buffer has fewer than two planes"));
    }

    let y = plane(pixel_buffer, 0)?;
    let uv = plane(pixel_buffer, 1)?;
    let mut buffer = I420Buffer::new(width, height);
    let (stride_y, stride_u, stride_v) = buffer.strides();
    let (dst_y, dst_u, dst_v) = buffer.data_mut();
    // SAFETY: The source pointers come from `plane`, which rejects null base
    // addresses, and the destination planes belong to the I420Buffer allocated
    // above for the same width and height.
    let ret = unsafe {
        yuv_sys::rs_NV12ToI420(
            y.data.as_ptr(),
            y.stride as i32,
            uv.data.as_ptr(),
            uv.stride as i32,
            dst_y.as_mut_ptr(),
            stride_y as i32,
            dst_u.as_mut_ptr(),
            stride_u as i32,
            dst_v.as_mut_ptr(),
            stride_v as i32,
            width as i32,
            height as i32,
        )
    };
    if ret != 0 {
        return Err(AvFoundationError::Convert("NV12ToI420 failed"));
    }
    Ok(buffer)
}

/// Converts a locked packed BGRA buffer (4 bytes per pixel) to I420.
fn convert_bgra(
    pixel_buffer: &CVPixelBuffer,
    width: u32,
    height: u32,
) -> Result<I420Buffer, AvFoundationError> {
    let bgra = packed_plane(pixel_buffer, 4)?;
    let mut buffer = I420Buffer::new(width, height);
    let (stride_y, stride_u, stride_v) = buffer.strides();
    let (dst_y, dst_u, dst_v) = buffer.data_mut();
    // SAFETY: The source pointer comes from `packed_plane`, which rejects a
    // null base address, and the destination planes belong to the I420Buffer
    // allocated above for the same width and height.
    let ret = unsafe {
        yuv_sys::rs_BGRAToI420(
            bgra.data.as_ptr(),
            bgra.stride as i32,
            dst_y.as_mut_ptr(),
            stride_y as i32,
            dst_u.as_mut_ptr(),
            stride_u as i32,
            dst_v.as_mut_ptr(),
            stride_v as i32,
            width as i32,
            height as i32,
        )
    };
    if ret != 0 {
        return Err(AvFoundationError::Convert("BGRAToI420 failed"));
    }
    Ok(buffer)
}

/// Converts a locked packed UYVY buffer (2 bytes per pixel) to I420.
fn convert_uyvy(
    pixel_buffer: &CVPixelBuffer,
    width: u32,
    height: u32,
) -> Result<I420Buffer, AvFoundationError> {
    let uyvy = packed_plane(pixel_buffer, 2)?;
    let mut buffer = I420Buffer::new(width, height);
    let (stride_y, stride_u, stride_v) = buffer.strides();
    let (dst_y, dst_u, dst_v) = buffer.data_mut();
    // SAFETY: The source slice covers the locked CVPixelBuffer plane for the duration of this
    // call, and the destination planes come from a freshly allocated I420Buffer with matching
    // width, height, and strides.
    let ret = unsafe {
        yuv_sys::rs_UYVYToI420(
            uyvy.data.as_ptr(),
            uyvy.stride as i32,
            dst_y.as_mut_ptr(),
            stride_y as i32,
            dst_u.as_mut_ptr(),
            stride_u as i32,
            dst_v.as_mut_ptr(),
            stride_v as i32,
            width as i32,
            height as i32,
        )
    };
    if ret != 0 {
        return Err(AvFoundationError::Convert("UYVYToI420 failed"));
    }
    Ok(buffer)
}

/// Converts a locked packed YUY2 buffer (2 bytes per pixel) to I420.
fn convert_yuy2(
    pixel_buffer: &CVPixelBuffer,
    width: u32,
    height: u32,
) -> Result<I420Buffer, AvFoundationError> {
    let yuy2 = packed_plane(pixel_buffer, 2)?;
    let mut buffer = I420Buffer::new(width, height);
    let (stride_y, stride_u, stride_v) = buffer.strides();
    let (dst_y, dst_u, dst_v) = buffer.data_mut();
    // SAFETY: The source slice covers the locked CVPixelBuffer plane for the duration of this
    // call, and the destination planes come from a freshly allocated I420Buffer with matching
    // width, height, and strides.
    let ret = unsafe {
        yuv_sys::rs_YUY2ToI420(
            yuy2.data.as_ptr(),
            yuy2.stride as i32,
            dst_y.as_mut_ptr(),
            stride_y as i32,
            dst_u.as_mut_ptr(),
            stride_u as i32,
            dst_v.as_mut_ptr(),
            stride_v as i32,
            width as i32,
            height as i32,
        )
    };
    if ret != 0 {
        return Err(AvFoundationError::Convert("YUY2ToI420 failed"));
    }
    Ok(buffer)
}

/// Copies a locked three-plane I420 buffer into a new I420 buffer.
fn convert_i420(
    pixel_buffer: &CVPixelBuffer,
    width: u32,
    height: u32,
) -> Result<I420Buffer, AvFoundationError> {
    if CVPixelBufferGetPlaneCount(pixel_buffer) < 3 {
        return Err(AvFoundationError::InvalidFrame("I420 buffer has fewer than three planes"));
    }

    let y = plane(pixel_buffer, 0)?;
    let u = plane(pixel_buffer, 1)?;
    let v = plane(pixel_buffer, 2)?;
    let mut buffer = I420Buffer::new(width, height);
    let (stride_y, stride_u, stride_v) = buffer.strides();
    let (dst_y, dst_u, dst_v) = buffer.data_mut();
    // SAFETY: The source pointers come from `plane`, which rejects null base
    // addresses, and the destination planes belong to the I420Buffer allocated
    // above for the same width and height.
    let ret = unsafe {
        yuv_sys::rs_I420Copy(
            y.data.as_ptr(),
            y.stride as i32,
            u.data.as_ptr(),
            u.stride as i32,
            v.data.as_ptr(),
            v.stride as i32,
            dst_y.as_mut_ptr(),
            stride_y as i32,
            dst_u.as_mut_ptr(),
            stride_u as i32,
            dst_v.as_mut_ptr(),
            stride_v as i32,
            width as i32,
            height as i32,
        )
    };
    if ret != 0 {
        return Err(AvFoundationError::Convert("I420Copy failed"));
    }
    Ok(buffer)
}

/// Borrowed view of one pixel plane: its bytes and its row stride in bytes.
struct Plane<'a> {
    data: &'a [u8],
    stride: usize,
}

/// Borrows plane `index` of a locked planar pixel buffer.
///
/// The slice spans `stride * (height - 1) + width` bytes as reported by
/// CoreVideo for that plane.
///
/// # Errors
///
/// [`AvFoundationError::InvalidFrame`] when the index is out of range, the
/// plane has no base address, or the size computation overflows.
fn plane(pixel_buffer: &CVPixelBuffer, index: usize) -> Result<Plane<'_>, AvFoundationError> {
    let plane_count = CVPixelBufferGetPlaneCount(pixel_buffer);
    if index >= plane_count {
        return Err(AvFoundationError::InvalidFrame("plane index is out of range"));
    }

    let base = CVPixelBufferGetBaseAddressOfPlane(pixel_buffer, index);
    if base.is_null() {
        return Err(AvFoundationError::InvalidFrame("pixel plane has no base address"));
    }
    let stride = CVPixelBufferGetBytesPerRowOfPlane(pixel_buffer, index);
    let height = CVPixelBufferGetHeightOfPlane(pixel_buffer, index);
    let width = CVPixelBufferGetWidthOfPlane(pixel_buffer, index);
    // NOTE(review): `width` is added as a byte count. For the interleaved
    // chroma plane of NV12 each sample is presumably two bytes, so this slice
    // may be shorter than the bytes the converter reads — confirm.
    let min_len = stride
        .checked_mul(height.saturating_sub(1))
        .and_then(|value| value.checked_add(width))
        .ok_or(AvFoundationError::InvalidFrame("pixel plane size overflow"))?;

    // SAFETY: The CVPixelBuffer is locked for read-only access, the plane
    // base address is non-null, and CoreVideo reports the minimum readable
    // extent for this plane.
    let data = unsafe { std::slice::from_raw_parts(base.cast::<u8>(), min_len) };
    Ok(Plane { data, stride })
}
min_len = stride + .checked_mul(height.saturating_sub(1)) + .and_then(|value| value.checked_add(width)) + .ok_or(AvFoundationError::InvalidFrame("pixel plane size overflow"))?; + + // SAFETY: The CVPixelBuffer is locked for read-only access, the plane + // base address is non-null, and CoreVideo reports the minimum readable + // extent for this plane. + let data = unsafe { std::slice::from_raw_parts(base.cast::(), min_len) }; + Ok(Plane { data, stride }) + } + + fn packed_plane( + pixel_buffer: &CVPixelBuffer, + bytes_per_pixel: usize, + ) -> Result, AvFoundationError> { + let base = CVPixelBufferGetBaseAddress(pixel_buffer); + if base.is_null() { + return Err(AvFoundationError::InvalidFrame("pixel buffer has no base address")); + } + let stride = CVPixelBufferGetBytesPerRow(pixel_buffer); + let height = CVPixelBufferGetHeight(pixel_buffer); + let width = CVPixelBufferGetWidth(pixel_buffer) + .checked_mul(bytes_per_pixel) + .ok_or(AvFoundationError::InvalidFrame("packed pixel row size overflow"))?; + let min_len = stride + .checked_mul(height.saturating_sub(1)) + .and_then(|value| value.checked_add(width)) + .ok_or(AvFoundationError::InvalidFrame("packed pixel buffer size overflow"))?; + + // SAFETY: The CVPixelBuffer is locked for read-only access, the base + // address is non-null, and CoreVideo reports the minimum readable extent + // for this packed buffer. + let data = unsafe { std::slice::from_raw_parts(base.cast::(), min_len) }; + Ok(Plane { data, stride }) + } +} + +#[cfg(all(test, target_os = "macos"))] +mod tests { + use std::sync::{mpsc, Arc}; + use std::time::Duration; + + use super::{macos::FrameQueue, AvFoundationError, AvFoundationStopHandle}; + + /// Upper bound on how long a woken capture wait may take to return before + /// the test declares the stop path broken. 
+ const STOP_WAIT_TIMEOUT: Duration = Duration::from_secs(5); + + // `FrameQueue` is pure Rust state, so these tests run on macOS CI hosts + // without camera hardware or AVFoundation involvement. + + #[test] + fn stop_handle_unblocks_take_frame() { + let queue = Arc::new(FrameQueue::default()); + let stop_handle = AvFoundationStopHandle { shared: Arc::clone(&queue) }; + + let (done_tx, done_rx) = mpsc::channel(); + let waiter = std::thread::spawn(move || { + let result = queue.take_frame(); + let _ = done_tx.send(()); + result + }); + + // Give the waiter time to block on the condvar. There is no race if + // the stop lands first: the wait loop re-checks `stopped` before every + // wait. + std::thread::sleep(Duration::from_millis(50)); + stop_handle.stop(); + + done_rx + .recv_timeout(STOP_WAIT_TIMEOUT) + .expect("take_frame did not return after the stop handle fired"); + let result = waiter.join().expect("take_frame thread panicked"); + assert!( + matches!(result, Err(AvFoundationError::NotRunning)), + "unexpected take_frame result: {result:?}" + ); + } + + #[test] + fn capture_waits_fail_fast_once_stopped() { + let queue = Arc::new(FrameQueue::default()); + let stop_handle = AvFoundationStopHandle { shared: Arc::clone(&queue) }; + stop_handle.stop(); + // Stopping is idempotent. + stop_handle.stop(); + + assert!(matches!(queue.take_frame(), Err(AvFoundationError::NotRunning))); + assert!(matches!(queue.take_native_frame(), Err(AvFoundationError::NotRunning))); + } +} diff --git a/livekit-capture/src/sources/gstreamer.rs b/livekit-capture/src/sources/gstreamer.rs new file mode 100644 index 000000000..58bf0906f --- /dev/null +++ b/livekit-capture/src/sources/gstreamer.rs @@ -0,0 +1,700 @@ +// Copyright 2026 LiveKit, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::error::Error as StdError; + +use bytes::Bytes; +use thiserror::Error; + +use ::gstreamer as gst; +use ::gstreamer_app as gst_app; +use gst::prelude::*; + +use crate::{ + encoded::{ + h26x::{access_unit_from_annex_b, access_unit_from_h264_avc}, + ingress::EncodedAccessUnitSource, + CodecSpecific, EncodedFrameType, EncodedVideoCodec, OwnedEncodedAccessUnit, + }, + error::CaptureError, +}; + +/// Encoded sample format expected from a GStreamer appsink. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +#[non_exhaustive] +pub enum GStreamerSampleFormat { + /// H.264 Annex-B access units, usually from `h264parse` with byte-stream caps. + H264AnnexB, + /// H.264 access units with AVC length-prefixed NAL units. + H264Avc { + /// Length-prefix size in bytes. + nal_length_size: u8, + }, + /// H.265 Annex-B access units, usually from `h265parse` with byte-stream caps. + H265AnnexB, + /// One already-delimited encoded access unit per appsink sample. + AccessUnit { + /// Codec carried by each appsink sample. + codec: EncodedVideoCodec, + }, +} + +impl GStreamerSampleFormat { + /// Returns the encoded codec carried by this sample format. + pub fn codec(self) -> EncodedVideoCodec { + match self { + Self::H264AnnexB => EncodedVideoCodec::H264, + Self::H264Avc { .. } => EncodedVideoCodec::H264, + Self::H265AnnexB => EncodedVideoCodec::H265, + Self::AccessUnit { codec } => codec, + } + } +} + +/// Configuration for a GStreamer appsink encoded source. 
+#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct GStreamerAppSinkConfig { + /// Format of encoded buffers pulled from appsink. + pub sample_format: GStreamerSampleFormat, + /// Timestamp added to the first buffer timestamp, or used directly as fallback. + pub start_timestamp_us: i64, + /// Fallback frame interval when a GStreamer buffer has no PTS or DTS. + pub frame_interval_us: i64, + /// Encoded frame width in pixels. + pub width: u32, + /// Encoded frame height in pixels. + pub height: u32, +} + +impl GStreamerAppSinkConfig { + /// Creates GStreamer appsink source configuration. + pub fn new( + sample_format: GStreamerSampleFormat, + start_timestamp_us: i64, + frame_interval_us: i64, + width: u32, + height: u32, + ) -> Self { + Self { sample_format, start_timestamp_us, frame_interval_us, width, height } + } +} + +/// Encoded source backed by a GStreamer appsink. +#[derive(Debug)] +pub struct GStreamerAppSinkEncodedSource { + appsink: gst_app::AppSink, + config: GStreamerAppSinkConfig, + next_fallback_timestamp_us: i64, +} + +impl GStreamerAppSinkEncodedSource { + /// Creates an encoded source from an existing GStreamer appsink. + pub fn new(appsink: gst_app::AppSink, config: GStreamerAppSinkConfig) -> Self { + Self { appsink, config, next_fallback_timestamp_us: config.start_timestamp_us } + } + + /// Returns the wrapped appsink. + pub fn appsink(&self) -> &gst_app::AppSink { + &self.appsink + } + + /// Returns the source configuration. + pub fn config(&self) -> GStreamerAppSinkConfig { + self.config + } + + /// Consumes this source and returns the wrapped appsink. 
+ pub fn into_appsink(self) -> gst_app::AppSink { + self.appsink + } + + fn access_unit_from_sample( + &mut self, + sample: &gst::Sample, + ) -> Result { + let buffer = sample.buffer().ok_or(GStreamerSourceError::MissingBuffer)?; + let timestamp_us = self.timestamp_us(buffer); + let frame_type = if buffer.flags().contains(gst::BufferFlags::DELTA_UNIT) { + EncodedFrameType::Delta + } else { + EncodedFrameType::Key + }; + + let map = buffer + .map_readable() + .map_err(|err| GStreamerSourceError::MapReadable(err.to_string()))?; + let payload = map.as_ref(); + access_unit_from_sample_payload( + self.config.sample_format, + payload, + timestamp_us, + frame_type, + self.config.width, + self.config.height, + ) + .map_err(GStreamerSourceError::Capture) + } + + fn timestamp_us(&mut self, buffer: &gst::BufferRef) -> i64 { + if let Some(timestamp) = buffer.pts().or_else(|| buffer.dts()) { + let timestamp_us = + clock_time_to_timestamp_us(self.config.start_timestamp_us, timestamp); + self.next_fallback_timestamp_us = + timestamp_us.saturating_add(self.config.frame_interval_us); + return timestamp_us; + } + + let timestamp_us = self.next_fallback_timestamp_us; + self.next_fallback_timestamp_us = + self.next_fallback_timestamp_us.saturating_add(self.config.frame_interval_us); + timestamp_us + } +} + +impl EncodedAccessUnitSource for GStreamerAppSinkEncodedSource { + type Error = GStreamerSourceError; + + fn next_access_unit(&mut self) -> Result, Self::Error> { + match self.appsink.pull_sample() { + Ok(sample) => self.access_unit_from_sample(&sample).map(Some), + Err(_err) if self.appsink.is_eos() => Ok(None), + Err(err) => Err(GStreamerSourceError::PullSample(err.to_string())), + } + } + + fn request_keyframe(&mut self) { + // The `GstForceKeyUnit` custom upstream event is understood by every + // GStreamer video encoder (it is what gst-video's force-key-unit + // helper builds), so downstream PLI/FIR reaches the producer. 
+ let structure = + gst::Structure::builder("GstForceKeyUnit").field("all-headers", true).build(); + let _ = self.appsink.send_event(gst::event::CustomUpstream::new(structure)); + } +} + +/// Error returned by GStreamer appsink encoded sources. +#[derive(Debug, Error)] +pub enum GStreamerSourceError { + /// The appsink failed to produce a sample. + #[error("failed to pull GStreamer appsink sample: {0}")] + PullSample(String), + /// The sample did not contain an encoded buffer. + #[error("GStreamer sample did not contain a buffer")] + MissingBuffer, + /// The sample buffer could not be mapped for reading. + #[error("failed to map GStreamer buffer for reading: {0}")] + MapReadable(String), + /// Access-unit construction failed. + #[error(transparent)] + Capture(CaptureError), +} + +/// Callback-backed encoded source for GStreamer appsink integrations. +#[derive(Debug)] +pub struct GStreamerAppSinkSource { + next_access_unit: F, +} + +impl GStreamerAppSinkSource { + /// Creates a source from a callback that pulls the next encoded appsink sample. + pub fn new(next_access_unit: F) -> Self { + Self { next_access_unit } + } + + /// Returns the wrapped callback. + pub fn callback(&self) -> &F { + &self.next_access_unit + } + + /// Returns the wrapped callback mutably. + pub fn callback_mut(&mut self) -> &mut F { + &mut self.next_access_unit + } + + /// Consumes this source and returns the wrapped callback. 
+ pub fn into_callback(self) -> F { + self.next_access_unit + } +} + +impl EncodedAccessUnitSource for GStreamerAppSinkSource +where + F: FnMut() -> Result, E>, + E: StdError + Send + Sync + 'static, +{ + type Error = E; + + fn next_access_unit(&mut self) -> Result, Self::Error> { + (self.next_access_unit)() + } +} + +fn access_unit_from_sample_payload( + sample_format: GStreamerSampleFormat, + payload: &[u8], + timestamp_us: i64, + frame_type: EncodedFrameType, + width: u32, + height: u32, +) -> Result { + match sample_format { + GStreamerSampleFormat::H264AnnexB => access_unit_from_annex_b( + EncodedVideoCodec::H264, + Bytes::copy_from_slice(payload), + timestamp_us, + width, + height, + ), + GStreamerSampleFormat::H264Avc { nal_length_size } => { + access_unit_from_h264_avc(payload, nal_length_size, timestamp_us, width, height) + } + GStreamerSampleFormat::H265AnnexB => access_unit_from_annex_b( + EncodedVideoCodec::H265, + Bytes::copy_from_slice(payload), + timestamp_us, + width, + height, + ), + GStreamerSampleFormat::AccessUnit { codec } => { + if payload.is_empty() { + return Err(CaptureError::EmptyPayload); + } + + let mut access_unit = OwnedEncodedAccessUnit::new( + codec, + Bytes::copy_from_slice(payload), + timestamp_us, + frame_type, + width, + height, + ); + access_unit.codec_specific = CodecSpecific::default_for(codec); + Ok(access_unit) + } + } +} + +fn clock_time_to_timestamp_us(start_timestamp_us: i64, timestamp: gst::ClockTime) -> i64 { + let timestamp_us = timestamp.useconds().min(i64::MAX as u64) as i64; + start_timestamp_us.saturating_add(timestamp_us) +} + +/// Name of the appsink element the pipeline helpers look up or create. +pub const ENCODED_APPSINK_NAME: &str = "lk_appsink"; + +/// Error returned by the GStreamer pipeline helpers. +#[derive(Debug, Error)] +#[non_exhaustive] +pub enum GStreamerPipelineError { + /// The requested codec does not match what the pipeline advertises. 
+ #[error( + "GStreamer codec mismatch: requested {requested:?}, but {location} advertises {advertised:?}" + )] + CodecMismatch { + /// Codec requested by the caller. + requested: EncodedVideoCodec, + /// Codec advertised by the pipeline. + advertised: EncodedVideoCodec, + /// Pipeline location that advertised the codec. + location: String, + }, + /// The pipeline has no usable appsink and no unlinked encoded pad. + #[error( + "GStreamer pipeline must include `appsink name={ENCODED_APPSINK_NAME}` or leave one \ + encoded video source pad unlinked" + )] + MissingAppSink, + /// The named element exists but is not an appsink. + #[error("GStreamer element {ENCODED_APPSINK_NAME} is not an appsink")] + NotAnAppSink, + /// Pad caps advertise no supported encoded video codec. + #[error("unlinked GStreamer pad '{0}' does not advertise supported encoded video caps")] + UnsupportedPadCaps(String), + /// Caps advertise a stream layout the encoded sources cannot consume. + #[error("unsupported GStreamer caps: {0}")] + UnsupportedCaps(String), + /// Element creation or linking failed. + #[error("{0}")] + Pipeline(String), +} + +/// Returns the appsink caps for a codec as a launch-string fragment. +/// +/// This is the single per-codec caps table: [`encoded_caps`] and pipeline +/// descriptions embedding a capsfilter should all derive from it. +pub fn encoded_caps_string(codec: EncodedVideoCodec) -> &'static str { + match codec { + EncodedVideoCodec::H264 => "video/x-h264,stream-format=byte-stream,alignment=au", + EncodedVideoCodec::H265 => "video/x-h265,stream-format=byte-stream,alignment=au", + EncodedVideoCodec::VP8 => "video/x-vp8", + EncodedVideoCodec::VP9 => "video/x-vp9,profile=(string)0", + EncodedVideoCodec::AV1 => "video/x-av1,stream-format=obu-stream,alignment=tu", + } +} + +/// Returns the appsink caps for a codec. 
+pub fn encoded_caps(codec: EncodedVideoCodec) -> Result { + encoded_caps_string(codec) + .parse::() + .map_err(|err| GStreamerPipelineError::Pipeline(format!("invalid encoded caps: {err}"))) +} + +/// Returns the appsink sample format used to ingest a codec. +pub fn sample_format_for_codec(codec: EncodedVideoCodec) -> GStreamerSampleFormat { + match codec { + EncodedVideoCodec::H264 => GStreamerSampleFormat::H264AnnexB, + EncodedVideoCodec::H265 => GStreamerSampleFormat::H265AnnexB, + EncodedVideoCodec::VP8 | EncodedVideoCodec::VP9 | EncodedVideoCodec::AV1 => { + GStreamerSampleFormat::AccessUnit { codec } + } + } +} + +/// Returns the parser element name used to normalize a codec, when one is needed. +pub fn parser_name(codec: EncodedVideoCodec) -> Option<&'static str> { + match codec { + EncodedVideoCodec::H264 => Some("h264parse"), + EncodedVideoCodec::H265 => Some("h265parse"), + EncodedVideoCodec::VP8 | EncodedVideoCodec::VP9 => None, + EncodedVideoCodec::AV1 => Some("av1parse"), + } +} + +/// Finds or builds the encoded appsink in a pipeline. +/// +/// When the pipeline already contains `appsink name=lk_appsink`, it is used +/// as-is (its sink caps decide the sample format). Otherwise the pipeline +/// must leave one encoded video source pad unlinked; the codec parser, a +/// capsfilter, and an appsink are created and linked to it. +pub fn ensure_encoded_appsink( + pipeline: &gst::Pipeline, + requested_codec: Option, +) -> Result<(gst_app::AppSink, GStreamerSampleFormat), GStreamerPipelineError> { + if let Some(appsink) = pipeline.by_name(ENCODED_APPSINK_NAME) { + let sample_format = match sample_format_from_element_sink_caps(&appsink)? 
{ + Some(sample_format) => { + if let Some(requested_codec) = requested_codec { + if requested_codec != sample_format.codec() { + return Err(GStreamerPipelineError::CodecMismatch { + requested: requested_codec, + advertised: sample_format.codec(), + location: format!("appsink '{ENCODED_APPSINK_NAME}'"), + }); + } + } + sample_format + } + None => sample_format_for_codec(requested_codec.unwrap_or(EncodedVideoCodec::H264)), + }; + let appsink = appsink + .downcast::() + .map_err(|_| GStreamerPipelineError::NotAnAppSink)?; + return Ok((appsink, sample_format)); + } + + let src_pad = pipeline + .find_unlinked_pad(gst::PadDirection::Src) + .ok_or(GStreamerPipelineError::MissingAppSink)?; + let inferred_codec = codec_from_pad_caps(&src_pad) + .ok_or_else(|| GStreamerPipelineError::UnsupportedPadCaps(src_pad.name().to_string()))?; + let codec = match requested_codec { + Some(requested_codec) if requested_codec != inferred_codec => { + return Err(GStreamerPipelineError::CodecMismatch { + requested: requested_codec, + advertised: inferred_codec, + location: format!("unlinked pad '{}'", src_pad.name()), + }); + } + Some(requested_codec) => requested_codec, + None => inferred_codec, + }; + let sample_format = sample_format_for_codec(codec); + let src_element = src_pad.parent_element().ok_or_else(|| { + GStreamerPipelineError::Pipeline( + "unlinked GStreamer encoded pad has no parent element".to_owned(), + ) + })?; + + let parser = parser_element_for_codec(codec)?; + let codec_caps = encoded_caps(codec)?; + let capsfilter = gst::ElementFactory::make("capsfilter") + .property("caps", codec_caps) + .build() + .map_err(|err| { + GStreamerPipelineError::Pipeline(format!("failed to create {codec:?} capsfilter: {err}")) + })?; + let appsink = gst::ElementFactory::make("appsink") + .name(ENCODED_APPSINK_NAME) + .property("sync", false) + .property("max-buffers", 8u32) + .property("drop", true) + .build() + .map_err(|err| { + GStreamerPipelineError::Pipeline(format!("failed to create 
appsink: {err}")) + })?; + + if let Some(parser) = &parser { + pipeline.add(parser).map_err(|err| { + GStreamerPipelineError::Pipeline(format!( + "failed to add {} to GStreamer pipeline: {err}", + parser.name() + )) + })?; + } + pipeline.add(&capsfilter).map_err(|err| { + GStreamerPipelineError::Pipeline(format!( + "failed to add capsfilter to GStreamer pipeline: {err}" + )) + })?; + pipeline.add(&appsink).map_err(|err| { + GStreamerPipelineError::Pipeline(format!( + "failed to add appsink to GStreamer pipeline: {err}" + )) + })?; + if let Some(parser) = &parser { + gst::Element::link_many([parser, &capsfilter, &appsink]).map_err(|err| { + GStreamerPipelineError::Pipeline(format!( + "failed to link {} to appsink: {err}", + parser.name() + )) + })?; + } else { + gst::Element::link_many([&capsfilter, &appsink]).map_err(|err| { + GStreamerPipelineError::Pipeline(format!("failed to link capsfilter to appsink: {err}")) + })?; + } + let link_target = parser.as_ref().unwrap_or(&capsfilter); + let sink_pad = link_target.static_pad("sink").ok_or_else(|| { + GStreamerPipelineError::Pipeline(format!( + "{} did not expose a sink pad", + link_target.name() + )) + })?; + src_pad.link(&sink_pad).map_err(|err| { + GStreamerPipelineError::Pipeline(format!( + "failed to link '{}' to {}: {err}", + src_element.name(), + link_target.name() + )) + })?; + + let appsink = + appsink.downcast::().map_err(|_| GStreamerPipelineError::NotAnAppSink)?; + Ok((appsink, sample_format)) +} + +fn parser_element_for_codec( + codec: EncodedVideoCodec, +) -> Result, GStreamerPipelineError> { + let Some(name) = parser_name(codec) else { + return Ok(None); + }; + let mut builder = gst::ElementFactory::make(name); + if matches!(codec, EncodedVideoCodec::H264 | EncodedVideoCodec::H265) { + builder = builder.property("config-interval", -1i32); + } + builder + .build() + .map(Some) + .map_err(|err| GStreamerPipelineError::Pipeline(format!("failed to create {name}: {err}"))) +} + +fn 
sample_format_from_element_sink_caps( + element: &gst::Element, +) -> Result, GStreamerPipelineError> { + let Some(sink_pad) = element.static_pad("sink") else { + return Ok(None); + }; + sample_format_from_pad_caps(&sink_pad) +} + +fn sample_format_from_pad_caps( + pad: &gst::Pad, +) -> Result, GStreamerPipelineError> { + let caps = pad.current_caps().unwrap_or_else(|| pad.query_caps(None)); + for structure in caps.iter() { + if let Some(sample_format) = sample_format_from_caps_structure(structure)? { + return Ok(Some(sample_format)); + } + } + Ok(None) +} + +/// Infers the appsink sample format from a caps structure. +pub fn sample_format_from_caps_structure( + structure: &gst::StructureRef, +) -> Result, GStreamerPipelineError> { + let Some(codec) = codec_from_caps_name(structure.name()) else { + return Ok(None); + }; + + match codec { + EncodedVideoCodec::H264 => { + let stream_format = structure.get::("stream-format").ok(); + match stream_format.as_deref() { + Some("avc") | Some("avc3") => Ok(Some(GStreamerSampleFormat::H264Avc { + nal_length_size: h264_avc_nal_length_size_from_caps(structure), + })), + Some("byte-stream") | None => Ok(Some(GStreamerSampleFormat::H264AnnexB)), + Some(stream_format) => Err(GStreamerPipelineError::UnsupportedCaps(format!( + "H.264 stream-format '{stream_format}'; expected byte-stream or avc" + ))), + } + } + EncodedVideoCodec::H265 => Ok(Some(GStreamerSampleFormat::H265AnnexB)), + EncodedVideoCodec::VP8 => Ok(Some(GStreamerSampleFormat::AccessUnit { codec })), + EncodedVideoCodec::VP9 => { + let profile = structure.get::("profile").ok(); + match profile.as_deref() { + Some("0") | None => Ok(Some(GStreamerSampleFormat::AccessUnit { codec })), + Some(profile) => Err(GStreamerPipelineError::UnsupportedCaps(format!( + "VP9 profile '{profile}'; expected profile 0" + ))), + } + } + EncodedVideoCodec::AV1 => { + let stream_format = structure.get::("stream-format").ok(); + match stream_format.as_deref() { + Some("obu-stream") | None => 
Ok(Some(GStreamerSampleFormat::AccessUnit { codec })), + Some(stream_format) => Err(GStreamerPipelineError::UnsupportedCaps(format!( + "AV1 stream-format '{stream_format}'; expected obu-stream" + ))), + } + } + } +} + +fn h264_avc_nal_length_size_from_caps(structure: &gst::StructureRef) -> u8 { + let Ok(codec_data) = structure.get::("codec_data") else { + return 4; + }; + let Ok(codec_data) = codec_data.map_readable() else { + return 4; + }; + h264_avc_nal_length_size_from_codec_data(codec_data.as_ref()).unwrap_or(4) +} + +/// Reads the AVC NAL length-prefix size from `avcC` codec data. +pub fn h264_avc_nal_length_size_from_codec_data(codec_data: &[u8]) -> Option { + let length_size = (codec_data.get(4)? & 0x03) + 1; + (1..=4).contains(&length_size).then_some(length_size) +} + +/// Infers the encoded codec advertised by a pad's caps. +pub fn codec_from_pad_caps(pad: &gst::Pad) -> Option { + let caps = pad.current_caps().unwrap_or_else(|| pad.query_caps(None)); + caps.iter().find_map(|structure| codec_from_caps_name(structure.name())) +} + +/// Maps a caps media-type name to an encoded codec. 
+pub fn codec_from_caps_name(name: &str) -> Option { + match name { + "video/x-h264" => Some(EncodedVideoCodec::H264), + "video/x-h265" => Some(EncodedVideoCodec::H265), + "video/x-vp8" => Some(EncodedVideoCodec::VP8), + "video/x-vp9" => Some(EncodedVideoCodec::VP9), + "video/x-av1" => Some(EncodedVideoCodec::AV1), + _ => None, + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn sample_payload_h264_annex_b_detects_keyframe() { + let access_unit = access_unit_from_sample_payload( + GStreamerSampleFormat::H264AnnexB, + &[0, 0, 1, 0x65, 1, 2], + 1_000, + EncodedFrameType::Delta, + 640, + 480, + ) + .unwrap(); + + assert_eq!(access_unit.codec, EncodedVideoCodec::H264); + assert_eq!(access_unit.frame_type, EncodedFrameType::Key); + assert_eq!(access_unit.timestamp_us, 1_000); + } + + #[test] + fn sample_payload_h264_avc_converts_to_annex_b_and_detects_keyframe() { + let access_unit = access_unit_from_sample_payload( + GStreamerSampleFormat::H264Avc { nal_length_size: 4 }, + &[0, 0, 0, 3, 0x65, 1, 2], + 1_000, + EncodedFrameType::Delta, + 640, + 480, + ) + .unwrap(); + + assert_eq!(access_unit.codec, EncodedVideoCodec::H264); + assert_eq!(access_unit.frame_type, EncodedFrameType::Key); + assert_eq!(access_unit.payload.as_ref(), &[0, 0, 0, 1, 0x65, 1, 2]); + } + + #[test] + fn sample_payload_access_unit_uses_buffer_delta_flag() { + let access_unit = access_unit_from_sample_payload( + GStreamerSampleFormat::AccessUnit { codec: EncodedVideoCodec::VP8 }, + &[1, 2, 3], + 2_000, + EncodedFrameType::Delta, + 640, + 480, + ) + .unwrap(); + + assert_eq!(access_unit.codec, EncodedVideoCodec::VP8); + assert_eq!(access_unit.frame_type, EncodedFrameType::Delta); + assert_eq!( + access_unit.codec_specific, + CodecSpecific::VP8 { temporal_id: None, layer_sync: false } + ); + } + + #[test] + fn sample_payload_access_unit_sets_vp9_and_av1_specifics() { + let vp9 = access_unit_from_sample_payload( + GStreamerSampleFormat::AccessUnit { codec: EncodedVideoCodec::VP9 }, + 
/// Reads exactly `buf.len()` bytes from `reader`.
///
/// Returns `Ok(true)` once the buffer is full (immediately for an empty
/// buffer) and `Ok(false)` when the stream ends cleanly before the first
/// byte.
///
/// # Errors
///
/// Returns `UnexpectedEof` when the stream ends after some, but not all, of
/// the buffer was filled. Reads failing with `ErrorKind::Interrupted` are
/// retried, matching `Read::read_exact`; every other I/O error is returned
/// unchanged.
pub(crate) fn read_exact_or_clean_eof(reader: &mut impl Read, buf: &mut [u8]) -> io::Result<bool> {
    let mut offset = 0;
    while offset < buf.len() {
        match reader.read(&mut buf[offset..]) {
            Ok(0) if offset == 0 => return Ok(false),
            Ok(0) => return Err(io::Error::from(io::ErrorKind::UnexpectedEof)),
            Ok(read) => offset += read,
            // An interrupted call transferred nothing; bailing out here would
            // discard the bytes already read and desynchronise the stream.
            Err(err) if err.kind() == io::ErrorKind::Interrupted => continue,
            Err(err) => return Err(err),
        }
    }
    Ok(true)
}
// // Exposes a simple C API for the Rust FFI in argus.rs: -// lk_argus_create_session – open sensor, configure ISP, start repeating capture -// lk_argus_acquire_frame – dequeue next frame, return NvBufSurface DMA fd -// lk_argus_release_frame – release frame back to Argus buffer pool -// lk_argus_destroy_session – tear down everything +// lk_argus_create_session - open sensor, configure ISP, start repeating capture +// lk_argus_acquire_frame - dequeue next frame, return NvBufSurface DMA fd +// lk_argus_release_frame - release frame back to Argus buffer pool +// lk_argus_destroy_session - tear down everything #include #include +#include #include #include @@ -53,6 +68,26 @@ struct LkArgusSession { static const uint64_t kAcquireTimeoutNs = 1000000000ULL; // 1 second +static constexpr int kCopyI420InvalidArgument = -1; +static constexpr int kCopyI420SurfaceNotFound = -2; +static constexpr int kCopyI420InvalidSurface = -4; + +static int copy_i420_error_code(int ret) { + return ret < 0 ? -ret : ret; +} + +static int copy_i420_map_error(int ret) { + return -1000 - copy_i420_error_code(ret); +} + +static int copy_i420_sync_error(int ret) { + return -2000 - copy_i420_error_code(ret); +} + +static int copy_i420_unmap_error(int ret) { + return -100 - copy_i420_error_code(ret); +} + enum class SensorTimestampStatus { Available, InvalidArgs, @@ -218,6 +253,19 @@ static SensorTimestampStatus read_sensor_timestamp_ns( return egl_status; } +// Destroys the persistent NvBufSurface ring entries [0, count), releasing +// their DMA-BUF fds. Entries that were never created (nullptr) are skipped, +// so this is safe on a partially-initialized session. 
+static void destroy_dmabuf_surfaces(LkArgusSession* s, int count) { + for (int i = 0; i < count; i++) { + if (s->dmabuf_surfaces[i]) { + NvBufSurfaceDestroy(s->dmabuf_surfaces[i]); + s->dmabuf_surfaces[i] = nullptr; + } + s->dmabuf_fds[i] = -1; + } +} + extern "C" { void* lk_argus_create_session(int sensor_index, int width, int height, int fps) { @@ -443,9 +491,11 @@ void* lk_argus_create_session(int sensor_index, int width, int height, int fps) NvBufSurface* surface = nullptr; if (NvBufSurfaceCreate(&surface, 1, &create_params) != 0 || !surface) { fprintf(stderr, "[lk_argus] Failed to create NvBufSurface[%d]\n", i); + destroy_dmabuf_surfaces(s, i); delete s; return nullptr; } + surface->numFilled = 1; s->dmabuf_fds[i] = surface->surfaceList[0].bufferDesc; s->dmabuf_surfaces[i] = surface; } @@ -455,6 +505,7 @@ void* lk_argus_create_session(int sensor_index, int width, int height, int fps) if (status != Argus::STATUS_OK) { fprintf(stderr, "[lk_argus] Failed to start repeating capture: %d\n", static_cast(status)); + destroy_dmabuf_surfaces(s, kNumDmaBufs); delete s; return nullptr; } @@ -552,7 +603,7 @@ int lk_argus_acquire_frame_with_metadata( if (acquire_wait_ns) *acquire_wait_ns = static_cast(acquire_duration_ns); if (blit_ns) *blit_ns = static_cast(blit_duration_ns); - // Release the Argus frame immediately – the pixel data has been blitted + // Release the Argus frame immediately - the pixel data has been blitted // into our persistent NvBufSurface so we no longer need the EGLStream frame. 
s->current_frame.reset(); @@ -569,6 +620,106 @@ int lk_argus_acquire_frame(void* handle) { return lk_argus_acquire_frame_with_metadata(handle, nullptr, nullptr, nullptr); } +int lk_argus_copy_frame_to_i420( + void* handle, + int dmabuf_fd, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + uint64_t* copy_to_i420_ns) { + using Clock = std::chrono::steady_clock; + + auto* s = static_cast(handle); + if (!s || dmabuf_fd < 0 || !dst_y || !dst_u || !dst_v) { + return kCopyI420InvalidArgument; + } + + const int width = s->width; + const int height = s->height; + const int chroma_width = (width + 1) / 2; + const int chroma_height = (height + 1) / 2; + if (width <= 0 || height <= 0 || + dst_stride_y < width || + dst_stride_u < chroma_width || + dst_stride_v < chroma_width) { + return kCopyI420InvalidArgument; + } + + NvBufSurface* surface = nullptr; + for (int i = 0; i < kNumDmaBufs; i++) { + if (s->dmabuf_fds[i] == dmabuf_fd) { + surface = s->dmabuf_surfaces[i]; + break; + } + } + if (!surface || surface->batchSize < 1) { + return kCopyI420SurfaceNotFound; + } + + auto t0 = Clock::now(); + int ret = NvBufSurfaceMap(surface, 0, -1, NVBUF_MAP_READ); + if (ret != 0) { + return copy_i420_map_error(ret); + } + + ret = NvBufSurfaceSyncForCpu(surface, 0, -1); + if (ret != 0) { + int unmap_ret = NvBufSurfaceUnMap(surface, 0, -1); + if (unmap_ret != 0) { + return copy_i420_unmap_error(unmap_ret); + } + return copy_i420_sync_error(ret); + } + + const NvBufSurfaceParams& params = surface->surfaceList[0]; + const uint8_t* src_y = + static_cast(params.mappedAddr.addr[0]); + const uint8_t* src_uv = + static_cast(params.mappedAddr.addr[1]); + const int src_stride_y = static_cast(params.planeParams.pitch[0]); + const int src_stride_uv = static_cast(params.planeParams.pitch[1]); + + if (!src_y || !src_uv || + src_stride_y < width || + src_stride_uv < chroma_width * 2) { + ret = NvBufSurfaceUnMap(surface, 0, -1); + if (ret != 
0) { + return copy_i420_unmap_error(ret); + } + return kCopyI420InvalidSurface; + } + + for (int row = 0; row < height; row++) { + std::memcpy(dst_y + row * dst_stride_y, + src_y + row * src_stride_y, + static_cast(width)); + } + + for (int row = 0; row < chroma_height; row++) { + const uint8_t* src_row = src_uv + row * src_stride_uv; + uint8_t* dst_u_row = dst_u + row * dst_stride_u; + uint8_t* dst_v_row = dst_v + row * dst_stride_v; + for (int col = 0; col < chroma_width; col++) { + dst_u_row[col] = src_row[col * 2]; + dst_v_row[col] = src_row[col * 2 + 1]; + } + } + + ret = NvBufSurfaceUnMap(surface, 0, -1); + auto t1 = Clock::now(); + if (copy_to_i420_ns) { + *copy_to_i420_ns = static_cast( + std::chrono::duration_cast(t1 - t0).count()); + } + if (ret != 0) { + return copy_i420_unmap_error(ret); + } + return 0; +} + void lk_argus_release_frame(void* handle) { auto* s = static_cast(handle); if (!s) return; @@ -589,13 +740,7 @@ void lk_argus_destroy_session(void* handle) { s->current_frame.reset(); // Free all persistent NvBufSurface buffers using the original pointers. - for (int i = 0; i < kNumDmaBufs; i++) { - if (s->dmabuf_surfaces[i]) { - NvBufSurfaceDestroy(s->dmabuf_surfaces[i]); - s->dmabuf_surfaces[i] = nullptr; - } - s->dmabuf_fds[i] = -1; - } + destroy_dmabuf_surfaces(s, kNumDmaBufs); delete s; fprintf(stderr, "[lk_argus] Session destroyed\n"); diff --git a/livekit-capture/src/sources/mod.rs b/livekit-capture/src/sources/mod.rs new file mode 100644 index 000000000..8f13320a2 --- /dev/null +++ b/livekit-capture/src/sources/mod.rs @@ -0,0 +1,30 @@ +// Copyright 2026 LiveKit, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Optional capture sources that feed the shared capture paths. + +#[cfg(feature = "libargus")] +pub mod argus; +#[cfg(feature = "avfoundation")] +pub mod avfoundation; +#[cfg(feature = "gstreamer")] +pub mod gstreamer; +#[cfg(feature = "tcpsink")] +pub(crate) mod io; +#[cfg(feature = "rtsp")] +pub mod rtsp; +#[cfg(feature = "tcpsink")] +pub mod tcp; +#[cfg(feature = "v4l")] +pub mod v4l; diff --git a/livekit-capture/src/sources/rtsp.rs b/livekit-capture/src/sources/rtsp.rs new file mode 100644 index 000000000..316d88935 --- /dev/null +++ b/livekit-capture/src/sources/rtsp.rs @@ -0,0 +1,1817 @@ +// Copyright 2026 LiveKit, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::{ + io::{self, Read, Write}, + net::TcpStream, + ops::Range, + str, + time::{Duration, Instant, SystemTime, UNIX_EPOCH}, +}; + +use base64::{engine::general_purpose, Engine as _}; +use md5::{Digest, Md5}; +use thiserror::Error; + +use crate::encoded::{ + ingress::EncodedAccessUnitSource, + rtp::{RtpAccessUnitAssembler, RtpDepacketizerError}, + EncodedVideoCodec, OwnedEncodedAccessUnit, +}; + +const DEFAULT_RTSP_CLOCK_RATE: u32 = 90_000; +const MAX_RTSP_HEADER_BYTES: usize = 64 * 1024; +const RTSP_STREAM_READ_CHUNK_BYTES: usize = 8 * 1024; +const DEFAULT_RTSP_READ_TIMEOUT: Duration = Duration::from_secs(10); +const DEFAULT_RTSP_IDLE_TIMEOUT: Duration = Duration::from_secs(30); + +/// Options used to open an RTSP encoded video source. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct RtspSourceOptions { + /// Expected video codec, when the caller wants to reject mismatched SDP. + pub expected_codec: Option, + /// Timestamp assigned to the first emitted access unit. + pub start_timestamp_us: i64, + /// Encoded frame width in pixels. + pub width: u32, + /// Encoded frame height in pixels. + pub height: u32, + /// Non-zero socket read timeout applied to the RTSP TCP stream (default 10s). + /// + /// Handshake reads that exceed it fail with [`RtspSourceError::Timeout`]. + /// Streaming reads treat it as the retry granularity instead, so session + /// keepalives keep flowing while the stream is silent. + pub read_timeout: Duration, + /// Maximum stream silence tolerated before [`RtspSourceError::Timeout`] + /// (default 30s). Receiving any interleaved bytes resets the limit. + pub idle_timeout: Duration, +} + +impl RtspSourceOptions { + /// Creates RTSP source options for encoded frames with the supplied dimensions. 
+ pub fn new(width: u32, height: u32) -> Self { + Self { + expected_codec: None, + start_timestamp_us: 0, + width, + height, + read_timeout: DEFAULT_RTSP_READ_TIMEOUT, + idle_timeout: DEFAULT_RTSP_IDLE_TIMEOUT, + } + } + + /// Requires the SDP video track to use the supplied codec. + pub fn with_expected_codec(mut self, codec: EncodedVideoCodec) -> Self { + self.expected_codec = Some(codec); + self + } + + /// Sets the timestamp assigned to the first emitted access unit. + pub fn with_start_timestamp_us(mut self, start_timestamp_us: i64) -> Self { + self.start_timestamp_us = start_timestamp_us; + self + } + + /// Sets the socket read timeout. + pub fn with_read_timeout(mut self, read_timeout: Duration) -> Self { + self.read_timeout = read_timeout; + self + } + + /// Sets the maximum stream silence tolerated before a timeout error. + pub fn with_idle_timeout(mut self, idle_timeout: Duration) -> Self { + self.idle_timeout = idle_timeout; + self + } +} + +/// RTSP session details discovered while opening a source. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct RtspSessionInfo { + /// RTP payload codec selected from SDP. + pub codec: EncodedVideoCodec, + /// RTP payload type selected from SDP. + pub payload_type: u8, + /// RTP timestamp clock rate. + pub clock_rate: u32, + /// RTSP interleaved channel carrying video RTP packets. + pub video_channel: u8, + /// RTSP media control URL used for SETUP. + pub control_url: String, + /// RTSP session identifier returned by SETUP. + pub session_id: String, +} + +/// Encoded RTSP source that performs DESCRIBE, SETUP, and PLAY over TCP. +#[derive(Debug)] +pub struct RtspEncodedSource { + source: RtspInterleavedRtpSource, + session_info: RtspSessionInfo, + keepalive: RtspKeepalive, +} + +impl RtspEncodedSource { + /// Connects to an RTSP URL and starts TCP-interleaved RTP playback. 
+ pub fn connect(url: &str, options: RtspSourceOptions) -> Result { + let rtsp_url = RtspUrl::parse(url)?; + let mut stream = TcpStream::connect((rtsp_url.connect_host.as_str(), rtsp_url.port)) + .map_err(RtspSourceError::Io)?; + let _ = stream.set_nodelay(true); + stream.set_read_timeout(Some(options.read_timeout)).map_err(RtspSourceError::Io)?; + let mut auth = RtspAuthContext::new(rtsp_url.credentials.clone()); + let mut cseq = 1; + + let describe = send_authenticated_rtsp_request( + &mut stream, + "DESCRIBE", + &rtsp_url.original, + &mut cseq, + &[("Host", rtsp_url.host_header.as_str()), ("Accept", "application/sdp")], + &mut auth, + )?; + let sdp = str::from_utf8(&describe.body).map_err(|_| RtspSourceError::InvalidSdp)?; + let media = parse_sdp_video_track(&rtsp_url, sdp, options.expected_codec)?; + + let setup = send_authenticated_rtsp_request( + &mut stream, + "SETUP", + &media.control_url, + &mut cseq, + &[ + ("Host", rtsp_url.host_header.as_str()), + ("Transport", "RTP/AVP/TCP;unicast;interleaved=0-1"), + ], + &mut auth, + )?; + let session_header = + setup.header("session").ok_or(RtspSourceError::MissingHeader("Session"))?; + let session_id = parse_session_id(session_header)?; + let session_timeout_secs = parse_session_timeout_secs(session_header); + let video_channel = parse_interleaved_channel(setup.header("transport")); + + send_authenticated_rtsp_request( + &mut stream, + "PLAY", + &rtsp_url.original, + &mut cseq, + &[ + ("Host", rtsp_url.host_header.as_str()), + ("Session", session_id.as_str()), + ("Range", "npt=0.000-"), + ], + &mut auth, + )?; + + let session_info = RtspSessionInfo { + codec: media.codec, + payload_type: media.payload_type, + clock_rate: media.clock_rate, + video_channel, + control_url: media.control_url, + session_id, + }; + let config = RtspInterleavedSourceConfig { + codec: session_info.codec, + clock_rate: session_info.clock_rate, + video_channel: session_info.video_channel, + start_timestamp_us: options.start_timestamp_us, + 
width: options.width, + height: options.height, + idle_timeout: options.idle_timeout, + }; + let source = RtspInterleavedRtpSource::new(stream, config)?; + let keepalive = RtspKeepalive::new( + rtsp_url.original, + rtsp_url.host_header, + session_info.session_id.clone(), + cseq, + auth, + session_timeout_secs, + ); + + Ok(Self { source, session_info, keepalive }) + } + + /// Returns RTSP session details discovered during setup. + pub fn session_info(&self) -> &RtspSessionInfo { + &self.session_info + } + + /// Attempts to clone the underlying TCP stream. + pub fn try_clone_stream(&self) -> io::Result { + self.source.reader().try_clone() + } +} + +impl EncodedAccessUnitSource for RtspEncodedSource { + type Error = RtspSourceError; + + fn next_access_unit(&mut self) -> Result, Self::Error> { + loop { + self.keepalive.maybe_send(self.source.reader_mut())?; + match self.source.poll_access_unit()? { + AccessUnitPoll::AccessUnit(access_unit) => return Ok(Some(access_unit)), + AccessUnitPoll::EndOfStream => return Ok(None), + // A stream read timed out; loop so a due keepalive can be + // sent even while the interleaved stream is silent. 
+ AccessUnitPoll::TimedOut => {} + } + } + } +} + +#[derive(Debug)] +struct RtspKeepalive { + request_uri: String, + host_header: String, + session_id: String, + cseq: u32, + auth: RtspAuthContext, + interval: Duration, + next_due: Instant, +} + +impl RtspKeepalive { + fn new( + request_uri: String, + host_header: String, + session_id: String, + cseq: u32, + auth: RtspAuthContext, + session_timeout_secs: Option, + ) -> Self { + let interval_secs = session_timeout_secs.map(|timeout| (timeout / 2).max(1)).unwrap_or(30); + let interval = Duration::from_secs(interval_secs); + Self { + request_uri, + host_header, + session_id, + cseq, + auth, + interval, + next_due: Instant::now() + interval, + } + } + + fn maybe_send(&mut self, stream: &mut TcpStream) -> Result<(), RtspSourceError> { + if Instant::now() < self.next_due { + return Ok(()); + } + + let authorization = self.auth.header("OPTIONS", &self.request_uri)?; + write_rtsp_request( + stream, + "OPTIONS", + &self.request_uri, + next_cseq(&mut self.cseq), + &[("Host", self.host_header.as_str()), ("Session", self.session_id.as_str())], + authorization, + )?; + self.next_due = Instant::now() + self.interval; + Ok(()) + } +} + +/// Configuration for RTSP interleaved RTP media. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct RtspInterleavedSourceConfig { + /// RTP payload codec. + pub codec: EncodedVideoCodec, + /// RTP timestamp clock rate. + pub clock_rate: u32, + /// RTSP interleaved channel carrying video RTP packets. + pub video_channel: u8, + /// Timestamp assigned to the first emitted access unit. + pub start_timestamp_us: i64, + /// Encoded frame width in pixels. + pub width: u32, + /// Encoded frame height in pixels. + pub height: u32, + /// Maximum stream silence tolerated before timed-out reads become a hard + /// [`RtspSourceError::Timeout`]. Receiving any bytes resets the limit. + pub idle_timeout: Duration, +} + +/// Encoded source for RTSP interleaved RTP streams. 
+#[derive(Debug)] +pub struct RtspInterleavedRtpSource { + reader: R, + config: RtspInterleavedSourceConfig, + assembler: RtpAccessUnitAssembler, + /// Unconsumed stream bytes; may end with a partial unit that is kept + /// across timed-out reads so framing survives read timeouts. + stream_buf: Vec, + /// Consumed prefix of `stream_buf`, compacted before each fill. + stream_pos: usize, + /// When the last stream bytes were received, for the idle limit. + last_read_at: Instant, + eof: bool, +} + +/// Progress from polling the interleaved stream for one access unit. +#[derive(Debug)] +enum AccessUnitPoll { + /// A complete access unit was assembled. + AccessUnit(OwnedEncodedAccessUnit), + /// The stream ended cleanly at a unit boundary. + EndOfStream, + /// A read timed out mid-stream; retry after running periodic work. + TimedOut, +} + +/// Result of one attempt to read more interleaved stream bytes. +#[derive(Debug)] +enum StreamFill { + Filled, + Eof, + TimedOut, +} + +impl RtspInterleavedRtpSource +where + R: Read, +{ + /// Creates a source for an RTSP stream that is already in interleaved RTP mode. + pub fn new(reader: R, config: RtspInterleavedSourceConfig) -> Result { + let assembler = RtpAccessUnitAssembler::new( + config.codec, + config.clock_rate, + config.start_timestamp_us, + config.width, + config.height, + )?; + Ok(Self { + reader, + config, + assembler, + stream_buf: Vec::new(), + stream_pos: 0, + last_read_at: Instant::now(), + eof: false, + }) + } + + /// Returns the wrapped reader. + pub fn reader(&self) -> &R { + &self.reader + } + + /// Returns the wrapped reader mutably. + pub fn reader_mut(&mut self) -> &mut R { + &mut self.reader + } + + /// Consumes this source and returns its reader. + pub fn into_reader(self) -> R { + self.reader + } + + /// Advances the stream until an access unit completes, the stream ends, + /// or a read times out with framing state preserved for the next poll. 
+ fn poll_access_unit(&mut self) -> Result { + loop { + if self.eof { + return Ok(AccessUnitPoll::EndOfStream); + } + + while let Some(unit) = parse_interleaved_unit(&self.stream_buf[self.stream_pos..])? { + let unit_start = self.stream_pos; + match unit { + ParsedInterleavedUnit::Frame { channel, payload, len } => { + self.stream_pos = unit_start + len; + if channel != self.config.video_channel { + continue; + } + let payload = + &self.stream_buf[unit_start + payload.start..unit_start + payload.end]; + if let Some(access_unit) = self.assembler.push(payload)? { + return Ok(AccessUnitPoll::AccessUnit(access_unit)); + } + } + ParsedInterleavedUnit::RtspResponse { len } => { + self.stream_pos = unit_start + len; + } + } + } + + match self.fill_stream_buf()? { + StreamFill::Filled => {} + StreamFill::Eof => { + self.eof = true; + return Ok(AccessUnitPoll::EndOfStream); + } + StreamFill::TimedOut => return Ok(AccessUnitPoll::TimedOut), + } + } + } + + /// Reads more stream bytes into `stream_buf`, compacting consumed data first. + fn fill_stream_buf(&mut self) -> Result { + if self.stream_pos > 0 { + self.stream_buf.drain(..self.stream_pos); + self.stream_pos = 0; + } + let filled = self.stream_buf.len(); + self.stream_buf.resize(filled + RTSP_STREAM_READ_CHUNK_BYTES, 0); + loop { + match self.reader.read(&mut self.stream_buf[filled..]) { + Ok(0) => { + self.stream_buf.truncate(filled); + return if filled == 0 { + Ok(StreamFill::Eof) + } else { + // The stream ended inside an interleaved unit. 
+ Err(RtspSourceError::Io(io::Error::from(io::ErrorKind::UnexpectedEof))) + }; + } + Ok(read) => { + self.stream_buf.truncate(filled + read); + self.last_read_at = Instant::now(); + return Ok(StreamFill::Filled); + } + Err(err) if err.kind() == io::ErrorKind::Interrupted => {} + Err(err) if is_timeout_io_error(&err) => { + self.stream_buf.truncate(filled); + return if self.last_read_at.elapsed() >= self.config.idle_timeout { + Err(RtspSourceError::Timeout { + phase: "interleaved stream data".to_owned(), + }) + } else { + Ok(StreamFill::TimedOut) + }; + } + Err(err) => { + self.stream_buf.truncate(filled); + return Err(RtspSourceError::Io(err)); + } + } + } + } +} + +impl EncodedAccessUnitSource for RtspInterleavedRtpSource +where + R: Read + Send + Sync + 'static, +{ + type Error = RtspSourceError; + + fn next_access_unit(&mut self) -> Result, Self::Error> { + loop { + match self.poll_access_unit()? { + AccessUnitPoll::AccessUnit(access_unit) => return Ok(Some(access_unit)), + AccessUnitPoll::EndOfStream => return Ok(None), + // Keep waiting until the configured idle limit turns + // timed-out reads into a hard error. + AccessUnitPoll::TimedOut => {} + } + } + } +} + +/// One unit parsed from the front of the interleaved stream buffer. +#[derive(Debug)] +enum ParsedInterleavedUnit { + /// Interleaved binary frame with its payload range and total length. + Frame { channel: u8, payload: Range, len: usize }, + /// In-stream RTSP response (for example a keepalive reply) to skip. + RtspResponse { len: usize }, +} + +/// Parses one interleaved unit from the front of `buf`, returning `Ok(None)` +/// when more bytes are needed. 
+fn parse_interleaved_unit(buf: &[u8]) -> Result, RtspSourceError> { + let Some(&magic) = buf.first() else { + return Ok(None); + }; + match magic { + b'$' => { + if buf.len() < 4 { + return Ok(None); + } + let channel = buf[1]; + let len = 4 + u16::from_be_bytes([buf[2], buf[3]]) as usize; + if buf.len() < len { + return Ok(None); + } + Ok(Some(ParsedInterleavedUnit::Frame { channel, payload: 4..len, len })) + } + b'R' => { + let mut remaining = buf; + match read_rtsp_response(&mut remaining) { + Ok(_response) => Ok(Some(ParsedInterleavedUnit::RtspResponse { + len: buf.len() - remaining.len(), + })), + Err(RtspSourceError::Io(err)) if err.kind() == io::ErrorKind::UnexpectedEof => { + Ok(None) + } + Err(err) => Err(err), + } + } + _ => Err(RtspSourceError::UnexpectedData), + } +} + +/// Error returned by RTSP encoded sources. +#[derive(Debug, Error)] +pub enum RtspSourceError { + /// I/O failed while reading RTSP interleaved data. + #[error("RTSP I/O failed: {0}")] + Io(io::Error), + /// An RTSP read exceeded the configured timeout. + #[error("RTSP timed out waiting for {phase}")] + Timeout { + /// Protocol phase or data the client was waiting for. + phase: String, + }, + /// RTSP URL was invalid or unsupported. + #[error("invalid RTSP URL: {0}")] + InvalidUrl(&'static str), + /// RTSP server returned a non-success status. + #[error("RTSP request failed with status {code} {reason}")] + RtspStatus { + /// RTSP status code. + code: u16, + /// RTSP status reason. + reason: String, + }, + /// RTSP response was malformed. + #[error("invalid RTSP response: {0}")] + InvalidResponse(&'static str), + /// RTSP response was missing a required header. + #[error("RTSP response missing {0} header")] + MissingHeader(&'static str), + /// RTSP server requested authentication but no URL credentials were supplied. + #[error("RTSP authentication required but the URL does not contain credentials")] + MissingCredentials, + /// RTSP authentication challenge was malformed. 
/// Reports whether `err` carries one of the error kinds treated as a read
/// timeout: [`io::ErrorKind::WouldBlock`] or [`io::ErrorKind::TimedOut`].
///
/// Every other kind, including `Interrupted` and `UnexpectedEof`, is not a
/// timeout as far as this helper is concerned.
fn is_timeout_io_error(err: &io::Error) -> bool {
    let kind = err.kind();
    kind == io::ErrorKind::WouldBlock || kind == io::ErrorKind::TimedOut
}
(None, authority), + }; + if host_port.is_empty() { + return Err(RtspSourceError::InvalidUrl("missing host")); + } + let (connect_host, port) = parse_host_port(host_port)?; + let host_header = if host_port.contains(':') { + host_port.to_owned() + } else { + format!("{host_port}:{port}") + }; + + Ok(Self { + original: format!("rtsp://{host_port}{path_suffix}"), + authority: host_port.to_owned(), + connect_host, + host_header, + port, + credentials, + }) + } +} + +fn parse_userinfo(userinfo: &str) -> Result { + let (username, password) = userinfo.split_once(':').unwrap_or((userinfo, "")); + if username.is_empty() { + return Err(RtspSourceError::InvalidUrl("missing username")); + } + Ok(RtspCredentials { username: username.to_owned(), password: password.to_owned() }) +} + +fn parse_host_port(host_port: &str) -> Result<(String, u16), RtspSourceError> { + if let Some(rest) = host_port.strip_prefix('[') { + let Some((host, after_host)) = rest.split_once(']') else { + return Err(RtspSourceError::InvalidUrl("malformed IPv6 host")); + }; + let port = after_host.strip_prefix(':').map(parse_port).transpose()?.unwrap_or(554); + return Ok((host.to_owned(), port)); + } + + if let Some((host, port)) = host_port.rsplit_once(':') { + if !host.contains(':') { + return Ok((host.to_owned(), parse_port(port)?)); + } + } + + Ok((host_port.to_owned(), 554)) +} + +fn parse_port(port: &str) -> Result { + port.parse().map_err(|_| RtspSourceError::InvalidUrl("invalid port")) +} + +#[derive(Debug, Clone)] +struct RtspResponse { + status_code: u16, + reason: String, + headers: Vec<(String, String)>, + body: Vec, +} + +impl RtspResponse { + fn header(&self, name: &str) -> Option<&str> { + self.headers + .iter() + .find(|(header_name, _)| header_name.eq_ignore_ascii_case(name)) + .map(|(_, value)| value.as_str()) + } + + fn headers<'a>(&'a self, name: &'a str) -> impl Iterator + 'a { + self.headers + .iter() + .filter(move |(header_name, _)| header_name.eq_ignore_ascii_case(name)) + .map(|(_, 
/// Hands out the current CSeq value and advances the counter for the next
/// request.
///
/// The counter saturates at `u32::MAX` instead of wrapping, so once the
/// maximum is reached the same value is returned on every later call.
fn next_cseq(cseq: &mut u32) -> u32 {
    let following = cseq.saturating_add(1);
    std::mem::replace(cseq, following)
}
+ RtspSourceError::Io(io_err) if is_timeout_io_error(&io_err) => { + RtspSourceError::Timeout { phase: format!("{method} response") } + } + err => err, + }) +} + +fn write_rtsp_request( + stream: &mut TcpStream, + method: &str, + uri: &str, + cseq: u32, + headers: &[(&str, &str)], + authorization: Option, +) -> Result<(), RtspSourceError> { + write!(stream, "{method} {uri} RTSP/1.0\r\n").map_err(RtspSourceError::Io)?; + write!(stream, "CSeq: {cseq}\r\n").map_err(RtspSourceError::Io)?; + write!(stream, "User-Agent: livekit-capture/0.1\r\n").map_err(RtspSourceError::Io)?; + if let Some(authorization) = authorization { + write!(stream, "Authorization: {authorization}\r\n").map_err(RtspSourceError::Io)?; + } + for (name, value) in headers { + write!(stream, "{name}: {value}\r\n").map_err(RtspSourceError::Io)?; + } + write!(stream, "\r\n").map_err(RtspSourceError::Io)?; + stream.flush().map_err(RtspSourceError::Io)?; + Ok(()) +} + +#[derive(Debug, Clone)] +struct RtspAuthContext { + credentials: Option, + challenge: Option, + nonce_count: u32, + cnonce: String, +} + +impl RtspAuthContext { + fn new(credentials: Option) -> Self { + Self { credentials, challenge: None, nonce_count: 0, cnonce: make_cnonce() } + } + + fn header(&mut self, method: &str, uri: &str) -> Result, RtspSourceError> { + let Some(challenge) = self.challenge.clone() else { + return Ok(None); + }; + let credentials = self.credentials.as_ref().ok_or(RtspSourceError::MissingCredentials)?; + match challenge { + RtspAuthChallenge::Basic => { + let token = general_purpose::STANDARD + .encode(format!("{}:{}", credentials.username, credentials.password)); + Ok(Some(format!("Basic {token}"))) + } + RtspAuthChallenge::Digest(challenge) => { + self.nonce_count = self.nonce_count.saturating_add(1); + Ok(Some(build_digest_authorization( + credentials, + &challenge, + method, + uri, + self.nonce_count, + &self.cnonce, + ))) + } + } + } + + fn update_from_unauthorized(&mut self, response: &RtspResponse) -> 
Result<(), RtspSourceError> { + if self.credentials.is_none() { + return Err(RtspSourceError::MissingCredentials); + } + self.challenge = Some(parse_authenticate_header( + response.headers("www-authenticate").collect::>().as_slice(), + )?); + self.nonce_count = 0; + Ok(()) + } +} + +#[derive(Debug, Clone, PartialEq, Eq)] +enum RtspAuthChallenge { + Basic, + Digest(DigestAuthChallenge), +} + +#[derive(Debug, Clone, PartialEq, Eq)] +struct DigestAuthChallenge { + realm: String, + nonce: String, + opaque: Option, + qop: Option, +} + +fn parse_authenticate_header(headers: &[&str]) -> Result { + for header in headers { + if strip_auth_scheme(header, "Digest").is_some() { + return parse_digest_challenge(header); + } + } + for header in headers { + if strip_auth_scheme(header, "Basic").is_some() { + return Ok(RtspAuthChallenge::Basic); + } + } + Err(RtspSourceError::UnsupportedAuthScheme( + headers.first().copied().unwrap_or_default().to_owned(), + )) +} + +fn parse_digest_challenge(header: &str) -> Result { + let params = parse_auth_params( + strip_auth_scheme(header, "Digest").ok_or(RtspSourceError::InvalidAuthChallenge)?, + ); + let realm = params + .iter() + .find(|(name, _)| name.eq_ignore_ascii_case("realm")) + .map(|(_, value)| value.to_owned()) + .ok_or(RtspSourceError::InvalidAuthChallenge)?; + let nonce = params + .iter() + .find(|(name, _)| name.eq_ignore_ascii_case("nonce")) + .map(|(_, value)| value.to_owned()) + .ok_or(RtspSourceError::InvalidAuthChallenge)?; + if let Some((_, algorithm)) = + params.iter().find(|(name, _)| name.eq_ignore_ascii_case("algorithm")) + { + if !algorithm.eq_ignore_ascii_case("MD5") { + return Err(RtspSourceError::UnsupportedAuthScheme(format!( + "Digest algorithm={algorithm}" + ))); + } + } + let qop = params + .iter() + .find(|(name, _)| name.eq_ignore_ascii_case("qop")) + .and_then(|(_, value)| select_digest_qop(value)); + let opaque = params + .iter() + .find(|(name, _)| name.eq_ignore_ascii_case("opaque")) + .map(|(_, value)| 
/// Splits the parameter list of an authentication challenge into
/// `(name, value)` pairs.
///
/// Parameters are separated by commas that sit outside double quotes. Inside
/// quotes a backslash escapes the next character, so an escaped quote does
/// not end the quoted section. Items without an `=` are dropped, and quoted
/// values are returned unquoted and unescaped.
fn parse_auth_params(params: &str) -> Vec<(String, String)> {
    let mut parsed = Vec::new();
    let mut current = String::new();
    let mut in_quotes = false;
    let mut escaped = false;
    for ch in params.chars() {
        if escaped {
            // Character following a backslash: keep it verbatim.
            current.push(ch);
            escaped = false;
            continue;
        }
        match ch {
            '\\' if in_quotes => {
                escaped = true;
                current.push(ch);
            }
            '"' => {
                in_quotes = !in_quotes;
                current.push(ch);
            }
            ',' if !in_quotes => {
                push_auth_param(&mut parsed, &current);
                current.clear();
            }
            _ => current.push(ch),
        }
    }
    // The final parameter has no trailing comma.
    push_auth_param(&mut parsed, &current);
    parsed
}

/// Appends one `name=value` item to `parsed`, ignoring items without `=`.
///
/// The item is split on its first `=`, so `=` characters inside the value
/// are preserved. Name and value are trimmed, and the value is unquoted.
fn push_auth_param(parsed: &mut Vec<(String, String)>, param: &str) {
    let Some((name, value)) = param.trim().split_once('=') else {
        return;
    };
    parsed.push((name.trim().to_owned(), unquote_auth_value(value.trim())));
}

/// Removes surrounding double quotes from `value` and resolves backslash
/// escapes inside them.
///
/// A value that is not wrapped in a pair of double quotes is returned
/// unchanged.
fn unquote_auth_value(value: &str) -> String {
    let Some(value) = value.strip_prefix('"').and_then(|value| value.strip_suffix('"')) else {
        return value.to_owned();
    };
    let mut unquoted = String::new();
    let mut escaped = false;
    for ch in value.chars() {
        if escaped {
            unquoted.push(ch);
            escaped = false;
        } else if ch == '\\' {
            escaped = true;
        } else {
            unquoted.push(ch);
        }
    }
    unquoted
}
ha1 = + md5_hex(format!("{}:{}:{}", credentials.username, challenge.realm, credentials.password)); + let ha2 = md5_hex(format!("{method}:{uri}")); + let response = if let Some(qop) = &challenge.qop { + md5_hex(format!("{ha1}:{}:{nonce_count:08x}:{cnonce}:{qop}:{ha2}", challenge.nonce)) + } else { + md5_hex(format!("{ha1}:{}:{ha2}", challenge.nonce)) + }; + + let mut header = format!( + "Digest username=\"{}\", realm=\"{}\", nonce=\"{}\", uri=\"{}\", response=\"{}\"", + quote_auth_value(&credentials.username), + quote_auth_value(&challenge.realm), + quote_auth_value(&challenge.nonce), + quote_auth_value(uri), + response + ); + if let Some(qop) = &challenge.qop { + header.push_str(&format!( + ", qop={}, nc={nonce_count:08x}, cnonce=\"{}\"", + quote_auth_value(qop), + quote_auth_value(cnonce) + )); + } + if let Some(opaque) = &challenge.opaque { + header.push_str(&format!(", opaque=\"{}\"", quote_auth_value(opaque))); + } + header +} + +fn quote_auth_value(value: &str) -> String { + value.replace('\\', "\\\\").replace('"', "\\\"") +} + +fn md5_hex(input: impl AsRef<[u8]>) -> String { + format!("{:x}", Md5::digest(input)) +} + +fn make_cnonce() -> String { + let nanos = SystemTime::now() + .duration_since(UNIX_EPOCH) + .map(|duration| duration.as_nanos()) + .unwrap_or_default(); + format!("{nanos:032x}") +} + +fn read_rtsp_response(reader: &mut impl Read) -> Result { + let mut header = Vec::new(); + let mut byte = [0u8; 1]; + loop { + reader.read_exact(&mut byte).map_err(RtspSourceError::Io)?; + header.push(byte[0]); + if header.ends_with(b"\r\n\r\n") { + break; + } + if header.len() > MAX_RTSP_HEADER_BYTES { + return Err(RtspSourceError::InvalidResponse("header too large")); + } + } + + let header_text = + str::from_utf8(&header).map_err(|_| RtspSourceError::InvalidResponse("header UTF-8"))?; + let mut lines = header_text.trim_end_matches("\r\n\r\n").split("\r\n"); + let status_line = + lines.next().ok_or(RtspSourceError::InvalidResponse("missing status line"))?; + 
let mut status_parts = status_line.splitn(3, ' '); + if status_parts.next() != Some("RTSP/1.0") { + return Err(RtspSourceError::InvalidResponse("unsupported version")); + } + let status_code = status_parts + .next() + .ok_or(RtspSourceError::InvalidResponse("missing status code"))? + .parse() + .map_err(|_| RtspSourceError::InvalidResponse("invalid status code"))?; + let reason = status_parts.next().unwrap_or_default().to_owned(); + + let mut headers = Vec::new(); + for line in lines { + let Some((name, value)) = line.split_once(':') else { + return Err(RtspSourceError::InvalidResponse("malformed header")); + }; + headers.push((name.trim().to_owned(), value.trim().to_owned())); + } + + let content_length = headers + .iter() + .find(|(name, _)| name.eq_ignore_ascii_case("content-length")) + .map(|(_, value)| value.parse::()) + .transpose() + .map_err(|_| RtspSourceError::InvalidResponse("invalid content length"))? + .unwrap_or(0); + let mut body = vec![0; content_length]; + if content_length > 0 { + reader.read_exact(&mut body).map_err(RtspSourceError::Io)?; + } + + Ok(RtspResponse { status_code, reason, headers, body }) +} + +#[derive(Debug, Clone, PartialEq, Eq)] +struct SdpVideoTrack { + codec: EncodedVideoCodec, + payload_type: u8, + clock_rate: u32, + control_url: String, +} + +#[derive(Debug, Clone, Default)] +struct PartialSdpVideoTrack { + payload_types: Vec, + rtp_maps: Vec, + control: Option, +} + +#[derive(Debug, Clone, PartialEq, Eq)] +struct SdpRtpMap { + payload_type: u8, + codec: EncodedVideoCodec, + clock_rate: u32, +} + +fn parse_sdp_video_track( + base_url: &RtspUrl, + sdp: &str, + expected_codec: Option, +) -> Result { + let mut tracks = Vec::new(); + let mut current = None; + + for line in sdp.lines().map(str::trim).filter(|line| !line.is_empty()) { + if let Some(media) = line.strip_prefix("m=") { + if let Some(track) = current.take() { + tracks.push(track); + } + if let Some(video) = media.strip_prefix("video ") { + current = 
Some(parse_video_media(video)); + } + continue; + } + + let Some(track) = current.as_mut() else { + continue; + }; + if let Some(control) = line.strip_prefix("a=control:") { + track.control = Some(control.trim().to_owned()); + } else if let Some(rtpmap) = line.strip_prefix("a=rtpmap:") { + if let Some(rtp_map) = parse_rtpmap(rtpmap) { + track.rtp_maps.push(rtp_map); + } + } + } + if let Some(track) = current { + tracks.push(track); + } + + let mut offered = Vec::new(); + for track in tracks { + for payload_type in &track.payload_types { + let Some(rtp_map) = track.rtp_maps.iter().find(|map| map.payload_type == *payload_type) + else { + continue; + }; + if let Some(expected) = expected_codec { + if rtp_map.codec != expected { + if !offered.contains(&rtp_map.codec) { + offered.push(rtp_map.codec); + } + continue; + } + } + + return Ok(SdpVideoTrack { + codec: rtp_map.codec, + payload_type: *payload_type, + clock_rate: rtp_map.clock_rate, + control_url: resolve_control_url(base_url, track.control.as_deref()), + }); + } + } + + match expected_codec { + Some(expected) if !offered.is_empty() => { + Err(RtspSourceError::CodecMismatch { expected, actual: offered }) + } + _ => Err(RtspSourceError::MissingVideoTrack), + } +} + +fn parse_video_media(media: &str) -> PartialSdpVideoTrack { + let payload_types = media + .split_whitespace() + .skip(2) + .filter_map(|payload_type| payload_type.parse().ok()) + .collect(); + PartialSdpVideoTrack { payload_types, ..Default::default() } +} + +fn parse_rtpmap(rtpmap: &str) -> Option { + let (payload_type, encoding) = rtpmap.trim().split_once(' ')?; + let payload_type = payload_type.parse().ok()?; + let mut encoding_parts = encoding.split('/'); + let codec_name = encoding_parts.next()?; + let codec = parse_sdp_codec(codec_name)?; + let clock_rate = encoding_parts + .next() + .and_then(|clock_rate| clock_rate.parse().ok()) + .unwrap_or(DEFAULT_RTSP_CLOCK_RATE); + Some(SdpRtpMap { payload_type, codec, clock_rate }) +} + +fn 
parse_sdp_codec(codec_name: &str) -> Option { + if codec_name.eq_ignore_ascii_case("H264") { + Some(EncodedVideoCodec::H264) + } else if codec_name.eq_ignore_ascii_case("H265") || codec_name.eq_ignore_ascii_case("HEVC") { + Some(EncodedVideoCodec::H265) + } else if codec_name.eq_ignore_ascii_case("VP8") { + Some(EncodedVideoCodec::VP8) + } else if codec_name.eq_ignore_ascii_case("VP9") { + Some(EncodedVideoCodec::VP9) + } else if codec_name.eq_ignore_ascii_case("AV1") { + Some(EncodedVideoCodec::AV1) + } else { + None + } +} + +fn resolve_control_url(base_url: &RtspUrl, control: Option<&str>) -> String { + let Some(control) = control.map(str::trim).filter(|control| !control.is_empty()) else { + return base_url.original.clone(); + }; + if control == "*" { + return base_url.original.clone(); + } + if control.starts_with("rtsp://") { + return control.to_owned(); + } + if control.starts_with('/') { + return format!("rtsp://{}{}", base_url.authority, control); + } + format!("{}/{}", base_url.original.trim_end_matches('/'), control) +} + +fn parse_session_id(session_header: &str) -> Result { + let session_id = session_header.split(';').next().unwrap_or_default().trim(); + if session_id.is_empty() { + return Err(RtspSourceError::InvalidResponse("empty session id")); + } + Ok(session_id.to_owned()) +} + +fn parse_session_timeout_secs(session_header: &str) -> Option { + session_header.split(';').skip(1).find_map(|part| { + let (name, value) = part.trim().split_once('=')?; + if name.trim().eq_ignore_ascii_case("timeout") { + value.trim().parse().ok() + } else { + None + } + }) +} + +fn parse_interleaved_channel(transport_header: Option<&str>) -> u8 { + let Some(transport_header) = transport_header else { + return 0; + }; + for part in transport_header.split(';') { + let Some(value) = part.trim().strip_prefix("interleaved=") else { + continue; + }; + if let Some(first) = value.split('-').next().and_then(|channel| channel.parse().ok()) { + return first; + } + } + 0 +} + 
+#[cfg(test)] +mod tests { + use std::{ + io::{Cursor, Write}, + net::TcpListener, + thread, + }; + + use super::*; + + fn rtp_packet(sequence_number: u16, timestamp: u32, marker: bool, payload: &[u8]) -> Vec { + let mut packet = Vec::with_capacity(12 + payload.len()); + packet.push(0x80); + packet.push(if marker { 0x80 | 96 } else { 96 }); + packet.extend_from_slice(&sequence_number.to_be_bytes()); + packet.extend_from_slice(×tamp.to_be_bytes()); + packet.extend_from_slice(&0x1122_3344_u32.to_be_bytes()); + packet.extend_from_slice(payload); + packet + } + + fn interleaved(channel: u8, payload: &[u8]) -> Vec { + let mut frame = Vec::with_capacity(4 + payload.len()); + frame.push(b'$'); + frame.push(channel); + frame.extend_from_slice(&(payload.len() as u16).to_be_bytes()); + frame.extend_from_slice(payload); + frame + } + + fn interleaved_config(video_channel: u8) -> RtspInterleavedSourceConfig { + RtspInterleavedSourceConfig { + codec: EncodedVideoCodec::H264, + clock_rate: 90_000, + video_channel, + start_timestamp_us: 0, + width: 640, + height: 480, + idle_timeout: Duration::from_secs(30), + } + } + + #[test] + fn reads_rtsp_interleaved_rtp_access_unit() { + let packet = rtp_packet(10, 12_000, true, &[0x65, 1, 2]); + let stream = interleaved(0, &packet); + let mut source = + RtspInterleavedRtpSource::new(Cursor::new(stream), interleaved_config(0)).unwrap(); + + let access_unit = source.next_access_unit().unwrap().unwrap(); + assert_eq!(access_unit.payload.as_ref(), &[0, 0, 0, 1, 0x65, 1, 2]); + assert!(source.next_access_unit().unwrap().is_none()); + } + + #[test] + fn skips_rtsp_response_between_interleaved_frames() { + let packet = rtp_packet(10, 12_000, true, &[0x65, 1, 2]); + let mut stream = Vec::new(); + write_status_response(&mut stream, 4, &[], &[], 200, "OK"); + stream.extend_from_slice(&interleaved(0, &packet)); + let mut source = + RtspInterleavedRtpSource::new(Cursor::new(stream), interleaved_config(0)).unwrap(); + + let access_unit = 
source.next_access_unit().unwrap().unwrap(); + + assert_eq!(access_unit.payload.as_ref(), &[0, 0, 0, 1, 0x65, 1, 2]); + assert!(source.next_access_unit().unwrap().is_none()); + } + + #[test] + fn recovers_interleaved_framing_across_read_timeouts() { + let listener = TcpListener::bind("127.0.0.1:0").unwrap(); + let addr = listener.local_addr().unwrap(); + let server = thread::spawn(move || { + let (mut stream, _) = listener.accept().unwrap(); + let packet = rtp_packet(10, 12_000, true, &[0x65, 1, 2]); + let frame = interleaved(0, &packet); + // Split inside the 4-byte interleaved header and pause long + // enough for several client read timeouts in between. + let (head, tail) = frame.split_at(2); + stream.write_all(head).unwrap(); + stream.flush().unwrap(); + thread::sleep(Duration::from_millis(150)); + stream.write_all(tail).unwrap(); + stream.flush().unwrap(); + }); + + let client = std::net::TcpStream::connect(addr).unwrap(); + client.set_read_timeout(Some(Duration::from_millis(25))).unwrap(); + let mut source = RtspInterleavedRtpSource::new(client, interleaved_config(0)).unwrap(); + + let access_unit = source.next_access_unit().unwrap().unwrap(); + + assert_eq!(access_unit.payload.as_ref(), &[0, 0, 0, 1, 0x65, 1, 2]); + server.join().unwrap(); + } + + #[test] + fn interleaved_stream_times_out_after_idle_limit() { + let listener = TcpListener::bind("127.0.0.1:0").unwrap(); + let addr = listener.local_addr().unwrap(); + let server = thread::spawn(move || { + let (stream, _) = listener.accept().unwrap(); + // Stay silent past the client's idle limit before closing. 
+ thread::sleep(Duration::from_millis(500)); + drop(stream); + }); + + let client = std::net::TcpStream::connect(addr).unwrap(); + client.set_read_timeout(Some(Duration::from_millis(20))).unwrap(); + let config = RtspInterleavedSourceConfig { + idle_timeout: Duration::from_millis(80), + ..interleaved_config(0) + }; + let mut source = RtspInterleavedRtpSource::new(client, config).unwrap(); + + let err = source.next_access_unit().unwrap_err(); + + assert!(matches!(err, RtspSourceError::Timeout { .. })); + server.join().unwrap(); + } + + #[test] + fn parses_sdp_video_track() { + let base_url = RtspUrl::parse("rtsp://camera.example/live").unwrap(); + let sdp = "\ +v=0\r\n\ +m=video 0 RTP/AVP 96\r\n\ +a=control:trackID=1\r\n\ +a=rtpmap:96 H264/90000\r\n"; + + let track = parse_sdp_video_track(&base_url, sdp, Some(EncodedVideoCodec::H264)).unwrap(); + + assert_eq!(track.codec, EncodedVideoCodec::H264); + assert_eq!(track.payload_type, 96); + assert_eq!(track.clock_rate, 90_000); + assert_eq!(track.control_url, "rtsp://camera.example/live/trackID=1"); + } + + #[test] + fn parses_vp8_vp9_and_av1_sdp_video_tracks() { + let base_url = RtspUrl::parse("rtsp://camera.example/live").unwrap(); + + for (rtpmap, codec) in [ + ("VP8/90000", EncodedVideoCodec::VP8), + ("VP9/90000", EncodedVideoCodec::VP9), + ("AV1/90000", EncodedVideoCodec::AV1), + ] { + let sdp = format!( + "\ +v=0\r\n\ +m=video 0 RTP/AVP 96\r\n\ +a=control:trackID=1\r\n\ +a=rtpmap:96 {rtpmap}\r\n" + ); + + let track = parse_sdp_video_track(&base_url, &sdp, Some(codec)).unwrap(); + + assert_eq!(track.codec, codec); + assert_eq!(track.payload_type, 96); + assert_eq!(track.clock_rate, 90_000); + } + } + + #[test] + fn rejects_sdp_codec_mismatch_for_vpx_av1() { + let base_url = RtspUrl::parse("rtsp://camera.example/live").unwrap(); + let sdp = "\ +v=0\r\n\ +m=video 0 RTP/AVP 96\r\n\ +a=control:trackID=1\r\n\ +a=rtpmap:96 VP9/90000\r\n"; + + let err = parse_sdp_video_track(&base_url, sdp, 
Some(EncodedVideoCodec::AV1)).unwrap_err(); + + match err { + RtspSourceError::CodecMismatch { expected, actual } => { + assert_eq!(expected, EncodedVideoCodec::AV1); + assert_eq!(actual, vec![EncodedVideoCodec::VP9]); + } + other => panic!("expected codec mismatch, got {other:?}"), + } + } + + #[test] + fn selects_expected_codec_among_multiple_payload_types() { + let base_url = RtspUrl::parse("rtsp://camera.example/live").unwrap(); + let sdp = "\ +v=0\r\n\ +m=video 0 RTP/AVP 98 96\r\n\ +a=control:trackID=1\r\n\ +a=rtpmap:98 H265/90000\r\n\ +a=rtpmap:96 H264/90000\r\n"; + + let track = parse_sdp_video_track(&base_url, sdp, Some(EncodedVideoCodec::H264)).unwrap(); + + assert_eq!(track.codec, EncodedVideoCodec::H264); + assert_eq!(track.payload_type, 96); + assert_eq!(track.clock_rate, 90_000); + assert_eq!(track.control_url, "rtsp://camera.example/live/trackID=1"); + } + + #[test] + fn selects_expected_codec_from_later_video_section() { + let base_url = RtspUrl::parse("rtsp://camera.example/live").unwrap(); + let sdp = "\ +v=0\r\n\ +m=video 0 RTP/AVP 98\r\n\ +a=control:trackID=1\r\n\ +a=rtpmap:98 H265/90000\r\n\ +m=video 0 RTP/AVP 96\r\n\ +a=control:trackID=2\r\n\ +a=rtpmap:96 H264/90000\r\n"; + + let track = parse_sdp_video_track(&base_url, sdp, Some(EncodedVideoCodec::H264)).unwrap(); + + assert_eq!(track.codec, EncodedVideoCodec::H264); + assert_eq!(track.payload_type, 96); + assert_eq!(track.control_url, "rtsp://camera.example/live/trackID=2"); + } + + #[test] + fn rejects_sdp_listing_all_offered_codecs_when_none_match() { + let base_url = RtspUrl::parse("rtsp://camera.example/live").unwrap(); + let sdp = "\ +v=0\r\n\ +m=video 0 RTP/AVP 98 96\r\n\ +a=control:trackID=1\r\n\ +a=rtpmap:98 H265/90000\r\n\ +a=rtpmap:96 H264/90000\r\n"; + + let err = parse_sdp_video_track(&base_url, sdp, Some(EncodedVideoCodec::VP8)).unwrap_err(); + + match err { + RtspSourceError::CodecMismatch { expected, actual } => { + assert_eq!(expected, EncodedVideoCodec::VP8); + 
assert_eq!(actual, vec![EncodedVideoCodec::H265, EncodedVideoCodec::H264]); + } + other => panic!("expected codec mismatch, got {other:?}"), + } + } + + #[test] + fn resolves_absolute_path_control_url() { + let base_url = RtspUrl::parse("rtsp://camera.example/live").unwrap(); + assert_eq!( + resolve_control_url(&base_url, Some("/stream/trackID=1")), + "rtsp://camera.example/stream/trackID=1" + ); + } + + #[test] + fn parses_session_timeout() { + assert_eq!(parse_session_timeout_secs("abc123;timeout=60"), Some(60)); + assert_eq!(parse_session_timeout_secs("abc123; Timeout = 30"), Some(30)); + assert_eq!(parse_session_timeout_secs("abc123"), None); + } + + #[test] + fn parses_credentials_but_strips_them_from_request_url() { + let url = RtspUrl::parse("rtsp://admin:secret@camera.example:554/live").unwrap(); + + assert_eq!(url.original, "rtsp://camera.example:554/live"); + assert_eq!(url.authority, "camera.example:554"); + assert_eq!( + url.credentials, + Some(RtspCredentials { username: "admin".to_owned(), password: "secret".to_owned() }) + ); + } + + #[test] + fn builds_digest_authorization_with_qop_auth() { + let credentials = RtspCredentials { + username: "Mufasa".to_owned(), + password: "Circle Of Life".to_owned(), + }; + let challenge = DigestAuthChallenge { + realm: "testrealm@host.com".to_owned(), + nonce: "dcd98b7102dd2f0e8b11d0f600bfb0c093".to_owned(), + opaque: Some("5ccc069c403ebaf9f0171e9517f40e41".to_owned()), + qop: Some("auth".to_owned()), + }; + + let authorization = build_digest_authorization( + &credentials, + &challenge, + "GET", + "/dir/index.html", + 1, + "0a4f113b", + ); + + assert!(authorization.contains("response=\"6629fae49393a05397450978507c4ef1\"")); + assert!(authorization.contains("qop=auth")); + assert!(authorization.contains("nc=00000001")); + } + + #[test] + fn sends_rtsp_keepalive_when_due() { + let listener = TcpListener::bind("127.0.0.1:0").unwrap(); + let addr = listener.local_addr().unwrap(); + let server = thread::spawn(move || { 
+ let (mut stream, _) = listener.accept().unwrap(); + read_request(&mut stream) + }); + let mut client = std::net::TcpStream::connect(addr).unwrap(); + let mut keepalive = RtspKeepalive::new( + "rtsp://camera.example/live".to_owned(), + "camera.example:554".to_owned(), + "abc123".to_owned(), + 4, + RtspAuthContext::new(None), + Some(2), + ); + keepalive.next_due = Instant::now() - Duration::from_secs(1); + + keepalive.maybe_send(&mut client).unwrap(); + let request = server.join().unwrap(); + + assert!(request.starts_with("OPTIONS rtsp://camera.example/live RTSP/1.0")); + assert!(request.contains("CSeq: 4")); + assert!(request.contains("Session: abc123")); + } + + #[test] + fn connects_and_reads_rtsp_access_unit() { + let listener = TcpListener::bind("127.0.0.1:0").unwrap(); + let addr = listener.local_addr().unwrap(); + let server = thread::spawn(move || { + let (mut stream, _) = listener.accept().unwrap(); + let describe = read_request(&mut stream); + assert!(describe.starts_with("DESCRIBE rtsp://")); + let sdp = "\ +v=0\r\n\ +m=video 0 RTP/AVP 96\r\n\ +a=control:trackID=0\r\n\ +a=rtpmap:96 H264/90000\r\n"; + write_response( + &mut stream, + 1, + &[("Content-Type", "application/sdp"), ("Content-Length", &sdp.len().to_string())], + sdp.as_bytes(), + ); + + let setup = read_request(&mut stream); + assert!(setup.starts_with("SETUP rtsp://")); + assert!(setup.contains("Transport: RTP/AVP/TCP;unicast;interleaved=0-1")); + write_response( + &mut stream, + 2, + &[ + ("Session", "abc123;timeout=60"), + ("Transport", "RTP/AVP/TCP;unicast;interleaved=2-3"), + ], + &[], + ); + + let play = read_request(&mut stream); + assert!(play.starts_with("PLAY rtsp://")); + assert!(play.contains("Session: abc123")); + write_response(&mut stream, 3, &[], &[]); + + let packet = rtp_packet(10, 12_000, true, &[0x65, 1, 2]); + stream.write_all(&interleaved(2, &packet)).unwrap(); + }); + + let options = RtspSourceOptions::new(640, 480) + .with_expected_codec(EncodedVideoCodec::H264) + 
.with_start_timestamp_us(0); + let mut source = + RtspEncodedSource::connect(&format!("rtsp://{addr}/camera"), options).unwrap(); + assert_eq!(source.session_info().codec, EncodedVideoCodec::H264); + assert_eq!(source.session_info().video_channel, 2); + assert_eq!(source.session_info().session_id, "abc123"); + + let access_unit = source.next_access_unit().unwrap().unwrap(); + assert_eq!(access_unit.payload.as_ref(), &[0, 0, 0, 1, 0x65, 1, 2]); + server.join().unwrap(); + } + + #[test] + fn connects_with_rtsp_digest_auth() { + let listener = TcpListener::bind("127.0.0.1:0").unwrap(); + let addr = listener.local_addr().unwrap(); + let server = thread::spawn(move || { + let (mut stream, _) = listener.accept().unwrap(); + let first_describe = read_request(&mut stream); + assert!(first_describe.starts_with(&format!("DESCRIBE rtsp://{addr}/camera"))); + assert!(!first_describe.contains("Authorization:")); + write_status_response( + &mut stream, + 1, + &[("WWW-Authenticate", "Digest realm=\"camera\", nonce=\"abcdef\", qop=\"auth\"")], + &[], + 401, + "Unauthorized", + ); + + let second_describe = read_request(&mut stream); + assert!(second_describe.starts_with(&format!("DESCRIBE rtsp://{addr}/camera"))); + assert!(!second_describe.contains("admin:secret@")); + assert!(second_describe.contains("Authorization: Digest username=\"admin\"")); + assert!(second_describe.contains(&format!("uri=\"rtsp://{addr}/camera\""))); + assert!(second_describe.contains("qop=auth")); + let sdp = "\ +v=0\r\n\ +m=video 0 RTP/AVP 96\r\n\ +a=control:trackID=0\r\n\ +a=rtpmap:96 H264/90000\r\n"; + write_status_response( + &mut stream, + 2, + &[("Content-Type", "application/sdp"), ("Content-Length", &sdp.len().to_string())], + sdp.as_bytes(), + 200, + "OK", + ); + + let setup = read_request(&mut stream); + assert!(setup.contains("Authorization: Digest username=\"admin\"")); + write_status_response( + &mut stream, + 3, + &[ + ("Session", "abc123;timeout=60"), + ("Transport", 
"RTP/AVP/TCP;unicast;interleaved=0-1"), + ], + &[], + 200, + "OK", + ); + + let play = read_request(&mut stream); + assert!(play.contains("Authorization: Digest username=\"admin\"")); + write_status_response(&mut stream, 4, &[], &[], 200, "OK"); + + let packet = rtp_packet(10, 12_000, true, &[0x65, 1, 2]); + stream.write_all(&interleaved(0, &packet)).unwrap(); + }); + + let options = RtspSourceOptions::new(640, 480) + .with_expected_codec(EncodedVideoCodec::H264) + .with_start_timestamp_us(0); + let mut source = + RtspEncodedSource::connect(&format!("rtsp://admin:secret@{addr}/camera"), options) + .unwrap(); + + let access_unit = source.next_access_unit().unwrap().unwrap(); + assert_eq!(access_unit.payload.as_ref(), &[0, 0, 0, 1, 0x65, 1, 2]); + server.join().unwrap(); + } + + #[test] + fn sends_keepalive_during_stream_silence() { + let listener = TcpListener::bind("127.0.0.1:0").unwrap(); + let addr = listener.local_addr().unwrap(); + let server = thread::spawn(move || { + let (mut stream, _) = listener.accept().unwrap(); + let _describe = read_request(&mut stream); + let sdp = "\ +v=0\r\n\ +m=video 0 RTP/AVP 96\r\n\ +a=control:trackID=0\r\n\ +a=rtpmap:96 H264/90000\r\n"; + write_response( + &mut stream, + 1, + &[("Content-Type", "application/sdp"), ("Content-Length", &sdp.len().to_string())], + sdp.as_bytes(), + ); + let _setup = read_request(&mut stream); + write_response( + &mut stream, + 2, + &[ + ("Session", "abc123;timeout=60"), + ("Transport", "RTP/AVP/TCP;unicast;interleaved=0-1"), + ], + &[], + ); + let _play = read_request(&mut stream); + write_response(&mut stream, 3, &[], &[]); + + // Send no interleaved data; the keepalive must arrive during the + // silence. Only then reply and send the first video frame. 
+ let keepalive = read_request(&mut stream); + write_response(&mut stream, 4, &[], &[]); + let packet = rtp_packet(10, 12_000, true, &[0x65, 1, 2]); + stream.write_all(&interleaved(0, &packet)).unwrap(); + keepalive + }); + + let options = RtspSourceOptions::new(640, 480) + .with_expected_codec(EncodedVideoCodec::H264) + .with_read_timeout(Duration::from_millis(100)) + .with_idle_timeout(Duration::from_secs(5)); + let mut source = + RtspEncodedSource::connect(&format!("rtsp://{addr}/camera"), options).unwrap(); + source.keepalive.next_due = Instant::now() + Duration::from_millis(250); + + let access_unit = source.next_access_unit().unwrap().unwrap(); + + assert_eq!(access_unit.payload.as_ref(), &[0, 0, 0, 1, 0x65, 1, 2]); + let keepalive = server.join().unwrap(); + assert!(keepalive.starts_with("OPTIONS rtsp://")); + assert!(keepalive.contains("Session: abc123")); + } + + #[test] + fn handshake_read_timeout_is_hard_error() { + let listener = TcpListener::bind("127.0.0.1:0").unwrap(); + let addr = listener.local_addr().unwrap(); + let server = thread::spawn(move || { + let (mut stream, _) = listener.accept().unwrap(); + let _describe = read_request(&mut stream); + // Never respond; hold the connection open past the read timeout. 
+ thread::sleep(Duration::from_millis(300)); + }); + + let options = RtspSourceOptions::new(640, 480).with_read_timeout(Duration::from_millis(50)); + let err = + RtspEncodedSource::connect(&format!("rtsp://{addr}/camera"), options).unwrap_err(); + + assert!( + matches!(&err, RtspSourceError::Timeout { phase } if phase.contains("DESCRIBE")), + "expected DESCRIBE timeout, got {err:?}" + ); + server.join().unwrap(); + } + + fn read_request(stream: &mut impl Read) -> String { + let mut request = Vec::new(); + let mut byte = [0u8; 1]; + loop { + stream.read_exact(&mut byte).unwrap(); + request.push(byte[0]); + if request.ends_with(b"\r\n\r\n") { + break; + } + } + String::from_utf8(request).unwrap() + } + + fn write_response(stream: &mut impl Write, cseq: u32, headers: &[(&str, &str)], body: &[u8]) { + write_status_response(stream, cseq, headers, body, 200, "OK"); + } + + fn write_status_response( + stream: &mut impl Write, + cseq: u32, + headers: &[(&str, &str)], + body: &[u8], + status_code: u16, + reason: &str, + ) { + write!(stream, "RTSP/1.0 {status_code} {reason}\r\nCSeq: {cseq}\r\n").unwrap(); + for (name, value) in headers { + write!(stream, "{name}: {value}\r\n").unwrap(); + } + write!(stream, "\r\n").unwrap(); + if !body.is_empty() { + stream.write_all(body).unwrap(); + } + stream.flush().unwrap(); + } +} diff --git a/livekit-capture/src/sources/tcp.rs b/livekit-capture/src/sources/tcp.rs new file mode 100644 index 000000000..d84715f35 --- /dev/null +++ b/livekit-capture/src/sources/tcp.rs @@ -0,0 +1,499 @@ +// Copyright 2026 LiveKit, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::{ + io::{self, Read}, + net::{SocketAddr, TcpListener, TcpStream, ToSocketAddrs}, +}; + +use thiserror::Error; + +use crate::{ + encoded::{ + h26x::{AccessUnitParser, AnnexBAccessUnitParser, AvcAccessUnitParser}, + ingress::EncodedAccessUnitSource, + rtp::{RtpAccessUnitAssembler, RtpDepacketizerError}, + EncodedVideoCodec, EncodedWireFormat, OwnedEncodedAccessUnit, + }, + error::CaptureError, + sources::io::read_exact_or_clean_eof, +}; + +const DEFAULT_CHUNK_SIZE: usize = 4096; + +/// Configuration for a byte-stream encoded source. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct ByteStreamSourceConfig { + /// Declared stream wire format. + pub wire_format: EncodedWireFormat, + /// Timestamp assigned to the first emitted access unit. + pub start_timestamp_us: i64, + /// Frame interval used for Annex-B byte streams. + pub frame_interval_us: i64, + /// Encoded frame width in pixels. + pub width: u32, + /// Encoded frame height in pixels. + pub height: u32, + /// Read chunk size for Annex-B byte streams. + pub read_chunk_size: usize, +} + +impl ByteStreamSourceConfig { + /// Creates byte-stream source configuration with a 4096-byte read chunk. + pub fn new( + wire_format: EncodedWireFormat, + start_timestamp_us: i64, + frame_interval_us: i64, + width: u32, + height: u32, + ) -> Self { + Self { + wire_format, + start_timestamp_us, + frame_interval_us, + width, + height, + read_chunk_size: DEFAULT_CHUNK_SIZE, + } + } + + /// Sets the read chunk size used for Annex-B byte streams. 
+ pub fn with_read_chunk_size(mut self, read_chunk_size: usize) -> Self { + self.read_chunk_size = read_chunk_size.max(1); + self + } +} + +/// Encoded source backed by any blocking byte stream. +#[derive(Debug)] +pub struct ByteStreamEncodedSource { + reader: R, + config: ByteStreamSourceConfig, + parser: ByteStreamParser, + read_chunk: Vec, + eof: bool, + /// Whether the parser may still hold complete access units from the last + /// push, which must be drained before reading more from the stream. + drain_pending: bool, +} + +/// TCP encoded source using the same parser as other byte streams. +pub type TcpEncodedSource = ByteStreamEncodedSource; + +#[derive(Debug)] +enum ByteStreamParser { + H26x(AnnexBAccessUnitParser), + H264Avc(AvcAccessUnitParser), + Rtp(RtpAccessUnitAssembler), +} + +impl ByteStreamEncodedSource +where + R: Read, +{ + /// Creates an encoded source for a declared byte-stream wire format. + pub fn new(reader: R, config: ByteStreamSourceConfig) -> Result { + let parser = match config.wire_format { + EncodedWireFormat::H264AnnexB => ByteStreamParser::H26x( + AnnexBAccessUnitParser::new( + EncodedVideoCodec::H264, + config.start_timestamp_us, + config.frame_interval_us, + config.width, + config.height, + ) + .map_err(TcpSourceError::Capture)?, + ), + EncodedWireFormat::H264Avc { nal_length_size } => ByteStreamParser::H264Avc( + AvcAccessUnitParser::new( + nal_length_size, + config.start_timestamp_us, + config.frame_interval_us, + config.width, + config.height, + ) + .map_err(TcpSourceError::Capture)?, + ), + EncodedWireFormat::H265AnnexB => ByteStreamParser::H26x( + AnnexBAccessUnitParser::new( + EncodedVideoCodec::H265, + config.start_timestamp_us, + config.frame_interval_us, + config.width, + config.height, + ) + .map_err(TcpSourceError::Capture)?, + ), + EncodedWireFormat::Rtp { codec, clock_rate } => { + ByteStreamParser::Rtp(RtpAccessUnitAssembler::new( + codec, + clock_rate, + config.start_timestamp_us, + config.width, + config.height, + 
)?) + } + EncodedWireFormat::MpegTs => { + return Err(TcpSourceError::UnsupportedWireFormat(config.wire_format)); + } + }; + + Ok(Self { + reader, + config, + parser, + read_chunk: vec![0; config.read_chunk_size.max(1)], + eof: false, + drain_pending: false, + }) + } + + /// Returns the source configuration. + pub fn config(&self) -> ByteStreamSourceConfig { + self.config + } + + /// Returns the wrapped reader. + pub fn reader(&self) -> &R { + &self.reader + } + + /// Returns the wrapped reader mutably. + pub fn reader_mut(&mut self) -> &mut R { + &mut self.reader + } + + /// Consumes this source and returns its reader. + pub fn into_reader(self) -> R { + self.reader + } + + fn next_from_parser( + reader: &mut R, + read_chunk: &mut [u8], + parser: &mut P, + eof: &mut bool, + drain_pending: &mut bool, + ) -> Result, TcpSourceError> { + loop { + if *drain_pending { + if let Some(access_unit) = parser.drain().map_err(TcpSourceError::Capture)? { + return Ok(Some(access_unit)); + } + *drain_pending = false; + } + if *eof { + return parser.flush().map_err(TcpSourceError::Capture); + } + + let read = reader.read(read_chunk).map_err(TcpSourceError::Io)?; + if read == 0 { + *eof = true; + continue; + } + if let Some(access_unit) = + parser.push(&read_chunk[..read]).map_err(TcpSourceError::Capture)? + { + *drain_pending = true; + return Ok(Some(access_unit)); + } + } + } + + fn next_rtp( + reader: &mut R, + packet: &mut Vec, + assembler: &mut RtpAccessUnitAssembler, + eof: &mut bool, + ) -> Result, TcpSourceError> { + while !*eof { + let mut len = [0u8; 2]; + if !read_exact_or_clean_eof(reader, &mut len).map_err(TcpSourceError::Io)? { + *eof = true; + return Ok(None); + } + + let packet_len = u16::from_be_bytes(len) as usize; + if packet_len == 0 { + continue; + } + + packet.resize(packet_len, 0); + reader.read_exact(packet).map_err(TcpSourceError::Io)?; + if let Some(access_unit) = assembler.push(packet)? 
{ + return Ok(Some(access_unit)); + } + } + + Ok(None) + } +} + +impl ByteStreamEncodedSource { + /// Connects to a TCP producer and parses the declared encoded wire format. + pub fn connect( + addr: A, + config: ByteStreamSourceConfig, + ) -> Result { + let stream = TcpStream::connect(addr).map_err(TcpSourceError::Io)?; + Self::new(stream, config) + } + + /// Creates a TCP encoded source from an already connected stream. + pub fn from_tcp_stream( + stream: TcpStream, + config: ByteStreamSourceConfig, + ) -> Result { + Self::new(stream, config) + } +} + +/// TCP listener for producer-initiated encoded byte streams. +#[derive(Debug)] +pub struct TcpEncodedListener { + listener: TcpListener, + config: ByteStreamSourceConfig, +} + +impl TcpEncodedListener { + /// Binds a TCP listener for encoded byte-stream producers. + pub fn bind( + addr: A, + config: ByteStreamSourceConfig, + ) -> Result { + let listener = TcpListener::bind(addr).map_err(TcpSourceError::Io)?; + Ok(Self { listener, config }) + } + + /// Creates an encoded listener from an existing [`TcpListener`]. + pub fn from_listener(listener: TcpListener, config: ByteStreamSourceConfig) -> Self { + Self { listener, config } + } + + /// Returns the listener configuration. + pub fn config(&self) -> ByteStreamSourceConfig { + self.config + } + + /// Returns the bound local socket address. + pub fn local_addr(&self) -> Result { + self.listener.local_addr().map_err(TcpSourceError::Io) + } + + /// Returns the wrapped TCP listener. + pub fn listener(&self) -> &TcpListener { + &self.listener + } + + /// Returns the wrapped TCP listener mutably. + pub fn listener_mut(&mut self) -> &mut TcpListener { + &mut self.listener + } + + /// Accepts one producer connection and returns it as a TCP encoded source. + pub fn accept(&self) -> Result { + self.accept_with_peer().map(|(source, _peer)| source) + } + + /// Accepts one producer connection and returns the source plus peer address. 
+ pub fn accept_with_peer(&self) -> Result<(TcpEncodedSource, SocketAddr), TcpSourceError> { + let (stream, peer) = self.listener.accept().map_err(TcpSourceError::Io)?; + Ok((TcpEncodedSource::from_tcp_stream(stream, self.config)?, peer)) + } +} + +impl EncodedAccessUnitSource for ByteStreamEncodedSource +where + R: Read + Send + Sync + 'static, +{ + type Error = TcpSourceError; + + fn next_access_unit(&mut self) -> Result, Self::Error> { + match &mut self.parser { + ByteStreamParser::H26x(parser) => Self::next_from_parser( + &mut self.reader, + &mut self.read_chunk, + parser, + &mut self.eof, + &mut self.drain_pending, + ), + ByteStreamParser::H264Avc(parser) => Self::next_from_parser( + &mut self.reader, + &mut self.read_chunk, + parser, + &mut self.eof, + &mut self.drain_pending, + ), + ByteStreamParser::Rtp(assembler) => { + Self::next_rtp(&mut self.reader, &mut self.read_chunk, assembler, &mut self.eof) + } + } + } +} + +/// Error returned by byte-stream encoded sources. +#[derive(Debug, Error)] +pub enum TcpSourceError { + /// I/O failed while reading the byte stream. + #[error("byte-stream read failed: {0}")] + Io(io::Error), + /// The declared wire format is not supported by this source. + #[error("unsupported byte-stream wire format: {0:?}")] + UnsupportedWireFormat(EncodedWireFormat), + /// RTP depayloading failed. + #[error(transparent)] + Rtp(#[from] RtpDepacketizerError), + /// Access-unit construction failed. 
+ #[error(transparent)] + Capture(CaptureError), +} + +#[cfg(test)] +mod tests { + use std::{ + io::{Cursor, Write}, + net::{Shutdown, TcpListener as StdTcpListener, TcpStream as StdTcpStream}, + thread, + }; + + use super::*; + + fn rtp_packet(sequence_number: u16, timestamp: u32, marker: bool, payload: &[u8]) -> Vec { + let mut packet = Vec::with_capacity(12 + payload.len()); + packet.push(0x80); + packet.push(if marker { 0x80 | 96 } else { 96 }); + packet.extend_from_slice(&sequence_number.to_be_bytes()); + packet.extend_from_slice(×tamp.to_be_bytes()); + packet.extend_from_slice(&0x1122_3344_u32.to_be_bytes()); + packet.extend_from_slice(payload); + packet + } + + fn rfc4571(packet: &[u8]) -> Vec { + let mut bytes = Vec::with_capacity(2 + packet.len()); + bytes.extend_from_slice(&(packet.len() as u16).to_be_bytes()); + bytes.extend_from_slice(packet); + bytes + } + + fn annex_b_stream() -> Vec { + vec![0, 0, 1, 0x09, 0x10, 0, 0, 1, 0x65, 1, 2, 0, 0, 1, 0x09, 0x10, 0, 0, 1, 0x41, 3] + } + + fn annex_b_config() -> ByteStreamSourceConfig { + ByteStreamSourceConfig::new(EncodedWireFormat::H264AnnexB, 0, 33_333, 640, 480) + } + + fn avc_stream() -> Vec { + vec![ + 0, 0, 0, 2, 0x09, 0x10, 0, 0, 0, 3, 0x65, 1, 2, 0, 0, 0, 2, 0x09, 0x10, 0, 0, 0, 2, + 0x41, 3, + ] + } + + fn avc_config() -> ByteStreamSourceConfig { + ByteStreamSourceConfig::new( + EncodedWireFormat::H264Avc { nal_length_size: 4 }, + 0, + 33_333, + 640, + 480, + ) + } + + #[test] + fn reads_annex_b_access_units() { + let stream = annex_b_stream(); + let config = annex_b_config(); + let mut source = ByteStreamEncodedSource::new(Cursor::new(stream), config).unwrap(); + + let first = source.next_access_unit().unwrap().unwrap(); + assert_eq!(first.payload.as_ref(), &[0, 0, 1, 0x09, 0x10, 0, 0, 1, 0x65, 1, 2]); + let second = source.next_access_unit().unwrap().unwrap(); + assert_eq!(second.payload.as_ref(), &[0, 0, 1, 0x09, 0x10, 0, 0, 1, 0x41, 3]); + assert!(source.next_access_unit().unwrap().is_none()); + 
} + + #[test] + fn reads_h264_avc_access_units_as_annex_b() { + let stream = avc_stream(); + let config = avc_config(); + let mut source = ByteStreamEncodedSource::new(Cursor::new(stream), config).unwrap(); + + let first = source.next_access_unit().unwrap().unwrap(); + assert_eq!(first.payload.as_ref(), &[0, 0, 0, 1, 0x09, 0x10, 0, 0, 0, 1, 0x65, 1, 2]); + let second = source.next_access_unit().unwrap().unwrap(); + assert_eq!(second.payload.as_ref(), &[0, 0, 0, 1, 0x09, 0x10, 0, 0, 0, 1, 0x41, 3]); + assert!(source.next_access_unit().unwrap().is_none()); + } + + #[test] + fn tcp_connect_reads_annex_b_access_units() { + let listener = StdTcpListener::bind("127.0.0.1:0").unwrap(); + let addr = listener.local_addr().unwrap(); + let writer = thread::spawn(move || { + let (mut stream, _) = listener.accept().unwrap(); + stream.write_all(&annex_b_stream()).unwrap(); + stream.shutdown(Shutdown::Write).unwrap(); + }); + + let mut source = TcpEncodedSource::connect(addr, annex_b_config()).unwrap(); + let first = source.next_access_unit().unwrap().unwrap(); + assert_eq!(first.payload.as_ref(), &[0, 0, 1, 0x09, 0x10, 0, 0, 1, 0x65, 1, 2]); + let second = source.next_access_unit().unwrap().unwrap(); + assert_eq!(second.payload.as_ref(), &[0, 0, 1, 0x09, 0x10, 0, 0, 1, 0x41, 3]); + assert!(source.next_access_unit().unwrap().is_none()); + writer.join().unwrap(); + } + + #[test] + fn tcp_listener_accepts_annex_b_source() { + let listener = TcpEncodedListener::bind("127.0.0.1:0", annex_b_config()).unwrap(); + let addr = listener.local_addr().unwrap(); + let writer = thread::spawn(move || { + let mut stream = StdTcpStream::connect(addr).unwrap(); + stream.write_all(&annex_b_stream()).unwrap(); + stream.shutdown(Shutdown::Write).unwrap(); + }); + + let (mut source, peer) = listener.accept_with_peer().unwrap(); + assert_eq!(peer.ip(), addr.ip()); + assert_eq!(source.config(), annex_b_config()); + + let first = source.next_access_unit().unwrap().unwrap(); + 
assert_eq!(first.payload.as_ref(), &[0, 0, 1, 0x09, 0x10, 0, 0, 1, 0x65, 1, 2]); + let second = source.next_access_unit().unwrap().unwrap(); + assert_eq!(second.payload.as_ref(), &[0, 0, 1, 0x09, 0x10, 0, 0, 1, 0x41, 3]); + assert!(source.next_access_unit().unwrap().is_none()); + writer.join().unwrap(); + } + + #[test] + fn reads_rfc4571_rtp_access_unit() { + let packet = rtp_packet(10, 12_000, true, &[0x65, 1, 2]); + let stream = rfc4571(&packet); + let config = ByteStreamSourceConfig::new( + EncodedWireFormat::Rtp { codec: EncodedVideoCodec::H264, clock_rate: 90_000 }, + 0, + 33_333, + 640, + 480, + ); + let mut source = ByteStreamEncodedSource::new(Cursor::new(stream), config).unwrap(); + + let access_unit = source.next_access_unit().unwrap().unwrap(); + assert_eq!(access_unit.payload.as_ref(), &[0, 0, 0, 1, 0x65, 1, 2]); + assert!(source.next_access_unit().unwrap().is_none()); + } +} diff --git a/livekit-capture/src/sources/v4l.rs b/livekit-capture/src/sources/v4l.rs new file mode 100644 index 000000000..6c203c880 --- /dev/null +++ b/livekit-capture/src/sources/v4l.rs @@ -0,0 +1,1361 @@ +// Copyright 2026 LiveKit, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Linux V4L2 capture using direct V4L2 access. 
+ +use std::time::Duration; +#[cfg(target_os = "linux")] +use std::{path::Path, time::Instant}; + +#[cfg(target_os = "linux")] +use livekit::webrtc::video_frame::VideoRotation; +use livekit::webrtc::video_frame::{I420Buffer, VideoFrame}; +use thiserror::Error; +#[cfg(target_os = "linux")] +use v4l::{ + buffer::Type as V4lBufferType, + capability::Flags as V4lCapabilityFlags, + context, + format::{Format as V4lFormat, FourCC}, + frameinterval::FrameIntervalEnum, + framesize::FrameSizeEnum, + io::{mmap::Stream as MmapStream, traits::CaptureStream}, + video::{capture::Parameters as V4lCaptureParameters, Capture}, + Device, +}; + +#[cfg(target_os = "linux")] +use crate::device::CaptureBackend; +use crate::device::{ + CaptureDeviceInfo, CaptureDeviceSelector, CaptureFormat, CaptureFormatRequest, + CaptureFrameFormat, CapturePath, CaptureResolution, +}; +#[cfg(any(target_os = "linux", test))] +use crate::time::validate_capture_timestamp_us; +#[cfg(target_os = "linux")] +use crate::time::{elapsed_us, unix_time_us_now}; + +/// Options used to open a Linux V4L2 capture session. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct V4lCaptureOptions { + /// Device to open. + pub device: CaptureDeviceSelector, + /// Requested format policy. + pub format: CaptureFormatRequest, + /// Ordered source frame formats to try. + pub frame_formats: Vec, +} + +impl V4lCaptureOptions { + /// Creates options that try YUYV, MJPEG, greyscale, RGB24, and NV12 at the requested format. + pub fn new( + device: CaptureDeviceSelector, + resolution: CaptureResolution, + frame_rate: u32, + ) -> Self { + Self { + device, + format: CaptureFormatRequest::Exact(CaptureFormat::new( + resolution, + frame_rate, + CaptureFrameFormat::Yuyv, + )), + frame_formats: default_frame_formats(), + } + } +} + +impl Default for V4lCaptureOptions { + fn default() -> Self { + Self::new(CaptureDeviceSelector::Default, CaptureResolution::new(1280, 720), 30) + } +} + +/// Error returned by the V4L capture backend. 
+#[derive(Debug, Error)] +pub enum V4lError { + /// V4L capture is only available on Linux. + #[error("V4L capture is not supported on this platform")] + UnsupportedPlatform, + /// The requested frame format is not supported by this backend. + #[error("V4L capture does not support frame format {0:?}")] + UnsupportedFrameFormat(CaptureFrameFormat), + /// The requested option is invalid. + #[error("invalid V4L capture option: {0}")] + InvalidOption(&'static str), + /// A numeric option could not be represented by the V4L backend. + #[error("V4L capture option is out of range: {0}")] + OptionOutOfRange(&'static str), + /// The camera backend returned an error. + #[error("V4L camera error: {0}")] + Camera(String), + /// Captured frame bytes did not match the negotiated format. + #[error("invalid V4L frame buffer: {0}")] + InvalidFrame(&'static str), + /// Pixel conversion failed. + #[error("failed to convert V4L frame to I420: {0}")] + Convert(&'static str), + /// MJPEG fallback decoding failed. + #[error("failed to decode MJPEG frame: {0}")] + Decode(String), +} + +/// One V4L frame converted to I420. +#[derive(Debug)] +pub struct V4lFrame { + /// Decoded I420 frame suitable for [`crate::VideoCaptureTrack::capture_frame`]. + pub frame: VideoFrame, + /// Source frame format delivered by the camera backend. + pub source_format: CaptureFrameFormat, + /// Backend-provided capture timestamp, when available. + pub backend_capture_timestamp: Option, + /// Wall-clock timestamp selected for metadata and timing correlation. + pub capture_wall_time_us: u64, + /// Wall-clock timestamp recorded after the frame was read from the camera backend. + pub read_wall_time_us: u64, + /// Sensor timestamp translated to UNIX-epoch microseconds, when available. + pub sensor_timestamp_us: Option, + /// Whether conversion from the source format to I420 was needed. + pub used_conversion: bool, + /// Whether compressed image decoding was needed before conversion. 
+ pub used_decode_path: bool, +} + +impl V4lFrame { + /// Returns the decoded video frame. + pub fn video_frame(&self) -> &VideoFrame { + &self.frame + } +} + +/// Linux V4L2 capture session that emits decoded I420 frames. +pub struct V4lCaptureSession { + #[cfg(target_os = "linux")] + stream: MmapStream<'static>, + format: CaptureFormat, + /// Driver-reported row stride in bytes (V4L2 `bytesperline`). + #[cfg(target_os = "linux")] + stride: u32, + options: V4lCaptureOptions, + #[cfg(target_os = "linux")] + started_at: Instant, +} + +impl std::fmt::Debug for V4lCaptureSession { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let mut debug = f.debug_struct("V4lCaptureSession"); + debug.field("format", &self.format); + debug.field("options", &self.options); + debug.finish() + } +} + +impl V4lCaptureSession { + /// Opens a Linux V4L2 capture session. + pub fn new(options: V4lCaptureOptions) -> Result { + validate_options(&options)?; + Self::open(options) + } + + /// Captures the next frame and converts it to I420. + pub fn capture_frame(&mut self) -> Result { + self.capture_frame_inner() + } + + /// Returns the negotiated capture format. + pub fn format(&self) -> CaptureFormat { + self.format + } + + /// Returns the configured capture options. + pub fn options(&self) -> &V4lCaptureOptions { + &self.options + } + + /// Returns the capture path produced by this session. 
+ pub fn capture_path(&self) -> CapturePath { + CapturePath::Raw + } + + #[cfg(target_os = "linux")] + fn open(options: V4lCaptureOptions) -> Result { + let frame_formats = frame_formats_for_request(&options)?; + let device = open_device(&options.device)?; + let all_formats = enumerate_device_formats(&device)?; + let (format, stride) = + apply_format_request(&device, &options, &frame_formats, &all_formats)?; + let stream = + MmapStream::with_buffers(&device, V4lBufferType::VideoCapture, 4).map_err(v4l_error)?; + Ok(Self { stream, format, stride, options, started_at: Instant::now() }) + } + + #[cfg(not(target_os = "linux"))] + fn open(_options: V4lCaptureOptions) -> Result { + Err(V4lError::UnsupportedPlatform) + } + + #[cfg(target_os = "linux")] + fn capture_frame_inner(&mut self) -> Result { + let fallback_wall_time_us = unix_time_us_now().unwrap_or_default(); + let format = self.format; + let (buffer, metadata) = self.stream.next().map_err(v4l_error)?; + let read_wall_time_us = unix_time_us_now().unwrap_or(fallback_wall_time_us); + let backend_capture_timestamp = monotonic_to_wallclock(metadata.timestamp); + let capture_wall_time_us = select_capture_wall_time_us( + backend_capture_timestamp, + fallback_wall_time_us, + read_wall_time_us, + ); + + let width = format.resolution.width; + let height = format.resolution.height; + let mut frame = VideoFrame { + rotation: VideoRotation::VideoRotation0, + timestamp_us: elapsed_us(self.started_at.elapsed()), + frame_metadata: None, + buffer: I420Buffer::new(width, height), + }; + let source = frame_bytes(buffer, metadata.bytesused); + let used_decode_path = convert_to_i420( + format.frame_format, + source, + width, + height, + self.stride, + &mut frame.buffer, + )?; + + Ok(V4lFrame { + frame, + source_format: format.frame_format, + backend_capture_timestamp, + capture_wall_time_us, + read_wall_time_us, + sensor_timestamp_us: None, + used_conversion: format.frame_format != CaptureFrameFormat::I420, + used_decode_path, + }) 
+ } + + #[cfg(not(target_os = "linux"))] + fn capture_frame_inner(&mut self) -> Result { + Err(V4lError::UnsupportedPlatform) + } +} + +/// Returns Linux V4L2 capture devices. +#[cfg(target_os = "linux")] +pub fn devices() -> Result, V4lError> { + context::enum_devices() + .into_iter() + .filter_map(|node| { + let id = node.index().to_string(); + let fallback_name = + node.name().unwrap_or_else(|| node.path().to_string_lossy().into_owned()); + let mut name = fallback_name; + let mut model_id = None; + let mut manufacturer = None; + let mut formats = Vec::new(); + let mut formats_complete = false; + + if let Ok(device) = Device::with_path(node.path()) { + if let Ok(capabilities) = device.query_caps() { + if !capabilities.capabilities.contains(V4lCapabilityFlags::VIDEO_CAPTURE) { + return None; + } + if !capabilities.card.is_empty() { + name = capabilities.card; + } + model_id = Some(capabilities.bus).filter(|value| !value.is_empty()); + manufacturer = Some(capabilities.driver).filter(|value| !value.is_empty()); + } + + if let Ok(device_formats) = enumerate_device_formats(&device) { + formats = device_formats; + formats_complete = true; + } + }; + + Some(Ok(CaptureDeviceInfo { + backend: CaptureBackend::V4l2, + id: id.clone(), + selector: CaptureDeviceSelector::Id(id), + name, + model_id, + manufacturer, + paths: vec![CapturePath::Raw], + formats, + formats_complete, + })) + }) + .collect() +} + +/// Returns Linux V4L2 capture devices. +#[cfg(not(target_os = "linux"))] +pub fn devices() -> Result, V4lError> { + Err(V4lError::UnsupportedPlatform) +} + +/// Returns the default ordered V4L source frame formats. +pub fn default_frame_formats() -> Vec { + vec![ + CaptureFrameFormat::Yuyv, + CaptureFrameFormat::Mjpeg, + CaptureFrameFormat::Grey, + CaptureFrameFormat::Rgb24, + CaptureFrameFormat::Nv12, + ] +} + +/// Returns default V4L source frame formats with `first` preferred. 
+pub fn ordered_frame_formats_with_first(first: CaptureFrameFormat) -> Vec { + ordered_formats_with_first(&default_frame_formats(), first) +} + +fn validate_options(options: &V4lCaptureOptions) -> Result<(), V4lError> { + match &options.device { + CaptureDeviceSelector::Default => {} + CaptureDeviceSelector::Index(index) => { + u32::try_from(*index).map_err(|_| V4lError::OptionOutOfRange("device index"))?; + } + CaptureDeviceSelector::Id(id) => { + if id.is_empty() { + return Err(V4lError::InvalidOption("device id must be non-empty")); + } + } + } + + if options.frame_formats.is_empty() { + return Err(V4lError::InvalidOption("frame_formats must include at least one format")); + } + for frame_format in &options.frame_formats { + if !is_supported_source_format(*frame_format) { + return Err(V4lError::UnsupportedFrameFormat(*frame_format)); + } + } + + validate_format_request(&options.format) +} + +fn validate_format_request(format: &CaptureFormatRequest) -> Result<(), V4lError> { + let validate_format = |format: &CaptureFormat| { + if format.resolution.width == 0 { + return Err(V4lError::InvalidOption("width must be non-zero")); + } + if format.resolution.height == 0 { + return Err(V4lError::InvalidOption("height must be non-zero")); + } + if format.frame_rate == 0 { + return Err(V4lError::InvalidOption("frame_rate must be non-zero")); + } + if !is_supported_source_format(format.frame_format) { + return Err(V4lError::UnsupportedFrameFormat(format.frame_format)); + } + Ok(()) + }; + + match format { + CaptureFormatRequest::Default => Ok(()), + CaptureFormatRequest::Exact(format) | CaptureFormatRequest::Closest(format) => { + validate_format(format) + } + CaptureFormatRequest::HighestFrameRate { resolution, frame_format } => { + if let Some(resolution) = resolution { + validate_resolution(*resolution)?; + } + if let Some(frame_format) = frame_format { + if !is_supported_source_format(*frame_format) { + return Err(V4lError::UnsupportedFrameFormat(*frame_format)); + } + } 
+ Ok(()) + } + CaptureFormatRequest::HighestResolution { frame_rate, frame_format } => { + if matches!(frame_rate, Some(0)) { + return Err(V4lError::InvalidOption("frame_rate must be non-zero")); + } + if let Some(frame_format) = frame_format { + if !is_supported_source_format(*frame_format) { + return Err(V4lError::UnsupportedFrameFormat(*frame_format)); + } + } + Ok(()) + } + } +} + +fn validate_resolution(resolution: CaptureResolution) -> Result<(), V4lError> { + if resolution.width == 0 { + return Err(V4lError::InvalidOption("width must be non-zero")); + } + if resolution.height == 0 { + return Err(V4lError::InvalidOption("height must be non-zero")); + } + Ok(()) +} + +#[cfg(target_os = "linux")] +fn open_device(selector: &CaptureDeviceSelector) -> Result { + match selector { + CaptureDeviceSelector::Default => Device::new(0).map_err(v4l_error), + CaptureDeviceSelector::Index(index) => Device::new(*index).map_err(v4l_error), + CaptureDeviceSelector::Id(id) => open_device_id(id), + } +} + +#[cfg(target_os = "linux")] +fn open_device_id(id: &str) -> Result { + if let Ok(index) = id.parse::() { + return Device::new(index).map_err(v4l_error); + } + + Device::with_path(Path::new(id)).map_err(v4l_error) +} + +#[cfg(target_os = "linux")] +fn frame_formats_for_request( + options: &V4lCaptureOptions, +) -> Result, V4lError> { + let mut formats = match &options.format { + CaptureFormatRequest::Exact(format) | CaptureFormatRequest::Closest(format) => { + ordered_formats_with_first(&options.frame_formats, format.frame_format) + } + CaptureFormatRequest::HighestFrameRate { frame_format: Some(frame_format), .. } + | CaptureFormatRequest::HighestResolution { frame_format: Some(frame_format), .. } => { + vec![*frame_format] + } + CaptureFormatRequest::Default + | CaptureFormatRequest::HighestFrameRate { frame_format: None, .. } + | CaptureFormatRequest::HighestResolution { frame_format: None, .. 
} => { + options.frame_formats.clone() + } + }; + formats.dedup(); + for format in &formats { + if !is_supported_source_format(*format) { + return Err(V4lError::UnsupportedFrameFormat(*format)); + } + } + Ok(formats) +} + +fn ordered_formats_with_first( + frame_formats: &[CaptureFrameFormat], + first: CaptureFrameFormat, +) -> Vec { + std::iter::once(first) + .chain(frame_formats.iter().copied().filter(|format| *format != first)) + .collect() +} + +#[cfg(target_os = "linux")] +fn apply_format_request( + device: &Device, + options: &V4lCaptureOptions, + frame_formats: &[CaptureFrameFormat], + all_formats: &[CaptureFormat], +) -> Result<(CaptureFormat, u32), V4lError> { + match options.format { + CaptureFormatRequest::Default => { + let selected = select_format_for_request(&options.format, frame_formats, all_formats)?; + set_device_format(device, selected) + } + CaptureFormatRequest::Exact(_) | CaptureFormatRequest::Closest(_) => { + apply_ordered_format_request(device, options, frame_formats, all_formats) + } + CaptureFormatRequest::HighestFrameRate { .. } + | CaptureFormatRequest::HighestResolution { .. 
} => { + let selected = select_format_for_request(&options.format, frame_formats, all_formats)?; + set_device_format(device, selected) + } + } +} + +#[cfg(target_os = "linux")] +fn apply_ordered_format_request( + device: &Device, + options: &V4lCaptureOptions, + frame_formats: &[CaptureFrameFormat], + all_formats: &[CaptureFormat], +) -> Result<(CaptureFormat, u32), V4lError> { + let mut last_error = None; + for frame_format in frame_formats { + let request = format_request_with_frame_format(&options.format, *frame_format); + let selected = match select_format_for_request(&request, &[*frame_format], all_formats) { + Ok(selected) => selected, + Err(error) => { + last_error = Some(error); + continue; + } + }; + + match set_device_format(device, selected) { + Ok(format) => return Ok(format), + Err(error) => last_error = Some(error), + } + } + + Err(last_error.unwrap_or(V4lError::InvalidOption("no V4L frame formats were requested"))) +} + +#[cfg(target_os = "linux")] +fn format_request_with_frame_format( + request: &CaptureFormatRequest, + frame_format: CaptureFrameFormat, +) -> CaptureFormatRequest { + match request { + CaptureFormatRequest::Exact(format) => CaptureFormatRequest::Exact(CaptureFormat::new( + format.resolution, + format.frame_rate, + frame_format, + )), + CaptureFormatRequest::Closest(format) => CaptureFormatRequest::Closest(CaptureFormat::new( + format.resolution, + format.frame_rate, + frame_format, + )), + CaptureFormatRequest::Default => CaptureFormatRequest::Default, + CaptureFormatRequest::HighestFrameRate { resolution, .. } => { + CaptureFormatRequest::HighestFrameRate { + resolution: *resolution, + frame_format: Some(frame_format), + } + } + CaptureFormatRequest::HighestResolution { frame_rate, .. 
} => { + CaptureFormatRequest::HighestResolution { + frame_rate: *frame_rate, + frame_format: Some(frame_format), + } + } + } +} + +#[cfg(target_os = "linux")] +fn select_format_for_request( + request: &CaptureFormatRequest, + frame_formats: &[CaptureFrameFormat], + all_formats: &[CaptureFormat], +) -> Result { + let selected = match request { + CaptureFormatRequest::Default => { + all_formats.iter().find(|format| frame_formats.contains(&format.frame_format)).copied() + } + CaptureFormatRequest::Exact(format) => { + if frame_formats.contains(&format.frame_format) { + Some(*format) + } else { + None + } + } + CaptureFormatRequest::Closest(format) => { + select_closest_format(*format, frame_formats, all_formats) + } + CaptureFormatRequest::HighestFrameRate { .. } => { + select_highest_frame_rate_format(request, frame_formats, all_formats) + } + CaptureFormatRequest::HighestResolution { .. } => { + select_highest_resolution_format(request, frame_formats, all_formats) + } + }; + + selected.ok_or_else(|| V4lError::Camera("CameraFormat: Failed to Fufill".to_string())) +} + +#[cfg(target_os = "linux")] +fn select_closest_format( + requested: CaptureFormat, + frame_formats: &[CaptureFrameFormat], + all_formats: &[CaptureFormat], +) -> Option { + if !frame_formats.contains(&requested.frame_format) { + return None; + } + + let resolution = all_formats + .iter() + .copied() + .filter(|format| format.frame_format == requested.frame_format) + .min_by_key(|format| resolution_distance(format.resolution, requested.resolution))? + .resolution; + + let frame_rate = all_formats + .iter() + .copied() + .filter(|format| { + format.frame_format == requested.frame_format && format.resolution == resolution + }) + .min_by_key(|format| format.frame_rate.abs_diff(requested.frame_rate))? 
+ .frame_rate; + + Some(CaptureFormat::new(resolution, frame_rate, requested.frame_format)) +} + +#[cfg(target_os = "linux")] +fn select_highest_frame_rate_format( + request: &CaptureFormatRequest, + frame_formats: &[CaptureFrameFormat], + all_formats: &[CaptureFormat], +) -> Option { + all_formats + .iter() + .copied() + .filter(|format| frame_formats.contains(&format.frame_format)) + .filter(|format| match request { + CaptureFormatRequest::HighestFrameRate { resolution, frame_format } => { + resolution.map(|resolution| format.resolution == resolution).unwrap_or(true) + && frame_format + .map(|frame_format| format.frame_format == frame_format) + .unwrap_or(true) + } + _ => false, + }) + .max_by(|left, right| { + left.frame_rate + .cmp(&right.frame_rate) + .then_with(|| compare_resolution(left.resolution, right.resolution)) + .then_with(|| { + compare_format_preference(left.frame_format, right.frame_format, frame_formats) + }) + }) +} + +#[cfg(target_os = "linux")] +fn select_highest_resolution_format( + request: &CaptureFormatRequest, + frame_formats: &[CaptureFrameFormat], + all_formats: &[CaptureFormat], +) -> Option { + all_formats + .iter() + .copied() + .filter(|format| frame_formats.contains(&format.frame_format)) + .filter(|format| match request { + CaptureFormatRequest::HighestResolution { frame_rate, frame_format } => { + frame_rate.map(|frame_rate| format.frame_rate == frame_rate).unwrap_or(true) + && frame_format + .map(|frame_format| format.frame_format == frame_format) + .unwrap_or(true) + } + _ => false, + }) + .max_by(|left, right| { + compare_resolution(left.resolution, right.resolution) + .then_with(|| left.frame_rate.cmp(&right.frame_rate)) + .then_with(|| { + compare_format_preference(left.frame_format, right.frame_format, frame_formats) + }) + }) +} + +#[cfg(target_os = "linux")] +fn compare_resolution(left: CaptureResolution, right: CaptureResolution) -> std::cmp::Ordering { + frame_area(left) + .cmp(&frame_area(right)) + .then_with(|| 
left.width.cmp(&right.width)) + .then_with(|| left.height.cmp(&right.height)) +} + +#[cfg(target_os = "linux")] +fn resolution_distance(left: CaptureResolution, right: CaptureResolution) -> u64 { + let width = i64::from(left.width) - i64::from(right.width); + let height = i64::from(left.height) - i64::from(right.height); + width.unsigned_abs().pow(2) + height.unsigned_abs().pow(2) +} + +#[cfg(target_os = "linux")] +fn frame_area(resolution: CaptureResolution) -> u64 { + u64::from(resolution.width) * u64::from(resolution.height) +} + +#[cfg(target_os = "linux")] +fn compare_format_preference( + left: CaptureFrameFormat, + right: CaptureFrameFormat, + frame_formats: &[CaptureFrameFormat], +) -> std::cmp::Ordering { + let left_index = frame_formats.iter().position(|format| *format == left).unwrap_or(usize::MAX); + let right_index = + frame_formats.iter().position(|format| *format == right).unwrap_or(usize::MAX); + right_index.cmp(&left_index) +} + +#[cfg(target_os = "linux")] +fn set_device_format( + device: &Device, + selected: CaptureFormat, +) -> Result<(CaptureFormat, u32), V4lError> { + let (current, _) = device_capture_format(device)?; + let format_changed = + current.resolution != selected.resolution || current.frame_format != selected.frame_format; + if format_changed { + device + .set_format(&V4lFormat::new( + selected.resolution.width, + selected.resolution.height, + fourcc_for_frame_format(selected.frame_format) + .ok_or(V4lError::UnsupportedFrameFormat(selected.frame_format))?, + )) + .map_err(v4l_error)?; + } + if format_changed || current.frame_rate != selected.frame_rate { + device + .set_params(&V4lCaptureParameters::with_fps(selected.frame_rate)) + .map_err(v4l_error)?; + } + + let (actual, stride) = device_capture_format(device)?; + if actual != selected { + return Err(V4lError::Camera(format!( + "CameraFormat rejected: requested {:?}, got {:?}", + selected, actual + ))); + } + Ok((actual, stride)) +} + +/// Returns the device's current capture 
format and its row stride in bytes +/// (V4L2 `bytesperline`). +#[cfg(target_os = "linux")] +fn device_capture_format(device: &Device) -> Result<(CaptureFormat, u32), V4lError> { + let format = device.format().map_err(v4l_error)?; + let params = device.params().map_err(v4l_error)?; + let frame_rate = + frame_rate_from_fraction(params.interval.numerator, params.interval.denominator) + .ok_or(V4lError::InvalidOption("V4L frame interval must be non-zero"))?; + let capture_format = CaptureFormat::new( + CaptureResolution::new(format.width, format.height), + frame_rate, + capture_frame_format_from_fourcc(format.fourcc) + .ok_or_else(|| V4lError::Camera(format!("unsupported V4L fourcc {}", format.fourcc)))?, + ); + Ok((capture_format, format.stride)) +} + +#[cfg(target_os = "linux")] +fn enumerate_device_formats(device: &Device) -> Result, V4lError> { + let mut formats = Vec::new(); + let fourccs = device + .enum_formats() + .map_err(v4l_error)? + .into_iter() + .filter_map(|format| capture_frame_format_from_fourcc(format.fourcc).map(|_| format.fourcc)) + .collect::>(); + + for fourcc in dedup_fourccs(fourccs) { + let Some(frame_format) = capture_frame_format_from_fourcc(fourcc) else { + continue; + }; + let frame_sizes = device.enum_framesizes(fourcc).map_err(v4l_error)?; + for resolution in frame_sizes.into_iter().flat_map(resolutions_from_frame_size) { + let intervals = device + .enum_frameintervals(fourcc, resolution.width, resolution.height) + .unwrap_or_default(); + for frame_rate in intervals.into_iter().flat_map(frame_rates_from_interval) { + formats.push(CaptureFormat::new(resolution, frame_rate, frame_format)); + } + } + } + + Ok(formats) +} + +fn is_supported_source_format(frame_format: CaptureFrameFormat) -> bool { + matches!( + frame_format, + CaptureFrameFormat::Nv12 + | CaptureFrameFormat::Rgb24 + | CaptureFrameFormat::Bgr24 + | CaptureFrameFormat::Yuyv + | CaptureFrameFormat::Grey + | CaptureFrameFormat::Mjpeg + ) +} + +#[cfg(target_os = "linux")] +fn 
fourcc_for_frame_format(frame_format: CaptureFrameFormat) -> Option { + match frame_format { + CaptureFrameFormat::Nv12 => Some(FourCC::new(b"NV12")), + CaptureFrameFormat::Rgb24 => Some(FourCC::new(b"RGB3")), + CaptureFrameFormat::Bgr24 => Some(FourCC::new(b"BGR3")), + CaptureFrameFormat::Yuyv => Some(FourCC::new(b"YUYV")), + CaptureFrameFormat::Grey => Some(FourCC::new(b"GREY")), + CaptureFrameFormat::Mjpeg => Some(FourCC::new(b"MJPG")), + CaptureFrameFormat::I420 | CaptureFrameFormat::Bgra | CaptureFrameFormat::Uyvy => None, + } +} + +#[cfg(target_os = "linux")] +fn capture_frame_format_from_fourcc(fourcc: FourCC) -> Option { + match fourcc.str().ok()? { + "NV12" => Some(CaptureFrameFormat::Nv12), + "RGB3" => Some(CaptureFrameFormat::Rgb24), + "BGR3" => Some(CaptureFrameFormat::Bgr24), + "YUYV" | "YUY2" => Some(CaptureFrameFormat::Yuyv), + "GREY" => Some(CaptureFrameFormat::Grey), + "MJPG" | "JPEG" => Some(CaptureFrameFormat::Mjpeg), + _ => None, + } +} + +#[cfg(target_os = "linux")] +fn dedup_fourccs(fourccs: Vec) -> Vec { + let mut deduped = Vec::new(); + for fourcc in fourccs { + if !deduped.contains(&fourcc) { + deduped.push(fourcc); + } + } + deduped +} + +#[cfg(target_os = "linux")] +fn resolutions_from_frame_size(size: v4l::FrameSize) -> Vec { + match size.size { + FrameSizeEnum::Discrete(discrete) => { + vec![CaptureResolution::new(discrete.width, discrete.height)] + } + FrameSizeEnum::Stepwise(stepwise) => { + let mut resolutions = Vec::new(); + push_stepwise_resolution( + &mut resolutions, + CaptureResolution::new(stepwise.min_width, stepwise.min_height), + ); + push_stepwise_resolution( + &mut resolutions, + CaptureResolution::new(stepwise.max_width, stepwise.max_height), + ); + resolutions + } + } +} + +#[cfg(target_os = "linux")] +fn push_stepwise_resolution( + resolutions: &mut Vec, + resolution: CaptureResolution, +) { + if resolution.width != 0 && resolution.height != 0 && !resolutions.contains(&resolution) { + resolutions.push(resolution); + } 
+} + +#[cfg(target_os = "linux")] +fn frame_rates_from_interval(interval: v4l::FrameInterval) -> Vec { + match interval.interval { + FrameIntervalEnum::Discrete(fraction) => { + frame_rate_from_fraction(fraction.numerator, fraction.denominator).into_iter().collect() + } + FrameIntervalEnum::Stepwise(stepwise) => { + let mut frame_rates = Vec::new(); + for fraction in [stepwise.min, stepwise.max] { + if let Some(frame_rate) = + frame_rate_from_fraction(fraction.numerator, fraction.denominator) + { + if !frame_rates.contains(&frame_rate) { + frame_rates.push(frame_rate); + } + } + } + frame_rates + } + } +} + +/// Converts a V4L2 frame interval (seconds per frame) to frames per second. +/// +/// Non-integer rates (e.g. the NTSC interval 1001/30000 = 29.97fps) round to +/// the nearest whole rate, never below 1. +#[cfg(any(target_os = "linux", test))] +fn frame_rate_from_fraction(numerator: u32, denominator: u32) -> Option { + if numerator == 0 || denominator == 0 { + return None; + } + if denominator % numerator == 0 { + return Some(denominator / numerator); + } + let rounded = (u64::from(denominator) + u64::from(numerator) / 2) / u64::from(numerator); + Some(u32::try_from(rounded).unwrap_or(u32::MAX).max(1)) +} + +#[cfg(target_os = "linux")] +fn frame_bytes(buffer: &[u8], bytes_used: u32) -> &[u8] { + let bytes_used = usize::try_from(bytes_used).unwrap_or(buffer.len()).min(buffer.len()); + if bytes_used == 0 { + buffer + } else { + &buffer[..bytes_used] + } +} + +#[cfg(any(target_os = "linux", test))] +fn convert_to_i420( + source_format: CaptureFrameFormat, + source: &[u8], + width: u32, + height: u32, + source_stride: u32, + destination: &mut I420Buffer, +) -> Result { + let (stride_y, stride_u, stride_v) = destination.strides(); + let (dst_y, dst_u, dst_v) = destination.data_mut(); + let width_i32 = i32_from_u32(width, "width")?; + let height_i32 = i32_from_u32(height, "height")?; + + let ret = match source_format { + CaptureFrameFormat::Yuyv => { + let stride = 
source_row_stride(source_stride, width as usize * 2); + validate_len(source, stride * height as usize, "YUYV frame")?; + let stride_i32 = i32_from_usize(stride, "stride")?; + unsafe { + // SAFETY: Source and destination slices are valid for the dimensions and strides. + yuv_sys::rs_YUY2ToI420( + source.as_ptr(), + stride_i32, + dst_y.as_mut_ptr(), + stride_y as i32, + dst_u.as_mut_ptr(), + stride_u as i32, + dst_v.as_mut_ptr(), + stride_v as i32, + width_i32, + height_i32, + ) + } + } + CaptureFrameFormat::Rgb24 => { + let stride = source_row_stride(source_stride, width as usize * 3); + validate_len(source, stride * height as usize, "RGB24 frame")?; + let stride_i32 = i32_from_usize(stride, "stride")?; + unsafe { + // SAFETY: Source and destination slices are valid for the dimensions and strides. + yuv_sys::rs_RGB24ToI420( + source.as_ptr(), + stride_i32, + dst_y.as_mut_ptr(), + stride_y as i32, + dst_u.as_mut_ptr(), + stride_u as i32, + dst_v.as_mut_ptr(), + stride_v as i32, + width_i32, + height_i32, + ) + } + } + CaptureFrameFormat::Bgr24 => { + let stride = source_row_stride(source_stride, width as usize * 3); + validate_len(source, stride * height as usize, "BGR24 frame")?; + let stride_i32 = i32_from_usize(stride, "stride")?; + unsafe { + // SAFETY: Source and destination slices are valid for the dimensions and strides. + yuv_sys::rs_RAWToI420( + source.as_ptr(), + stride_i32, + dst_y.as_mut_ptr(), + stride_y as i32, + dst_u.as_mut_ptr(), + stride_u as i32, + dst_v.as_mut_ptr(), + stride_v as i32, + width_i32, + height_i32, + ) + } + } + CaptureFrameFormat::Grey => { + let stride = source_row_stride(source_stride, width as usize); + validate_len(source, stride * height as usize, "GREY frame")?; + let stride_i32 = i32_from_usize(stride, "stride")?; + unsafe { + // SAFETY: Source and destination slices are valid for the dimensions and strides. 
+ yuv_sys::rs_I400ToI420( + source.as_ptr(), + stride_i32, + dst_y.as_mut_ptr(), + stride_y as i32, + dst_u.as_mut_ptr(), + stride_u as i32, + dst_v.as_mut_ptr(), + stride_v as i32, + width_i32, + height_i32, + ) + } + } + CaptureFrameFormat::Nv12 => { + // Single-planar V4L2 NV12: the interleaved chroma plane follows the + // luma plane at `stride * height` and shares the luma stride. + let stride = source_row_stride(source_stride, width as usize); + let y_size = stride * height as usize; + validate_len(source, y_size + y_size / 2, "NV12 frame")?; + let stride_i32 = i32_from_usize(stride, "stride")?; + unsafe { + // SAFETY: Source and destination slices are valid for the dimensions and strides. + yuv_sys::rs_NV12ToI420( + source.as_ptr(), + stride_i32, + source[y_size..].as_ptr(), + stride_i32, + dst_y.as_mut_ptr(), + stride_y as i32, + dst_u.as_mut_ptr(), + stride_u as i32, + dst_v.as_mut_ptr(), + stride_v as i32, + width_i32, + height_i32, + ) + } + } + CaptureFrameFormat::Mjpeg => { + return convert_mjpeg_to_i420(source, width, height, destination).map(|()| true); + } + CaptureFrameFormat::I420 | CaptureFrameFormat::Bgra | CaptureFrameFormat::Uyvy => { + return Err(V4lError::UnsupportedFrameFormat(source_format)); + } + }; + + if ret == 0 { + Ok(false) + } else { + Err(V4lError::Convert("libyuv conversion failed")) + } +} + +/// Returns the effective source row stride in bytes, falling back to the +/// packed width-derived stride when the driver reports `bytesperline` as zero +/// or smaller than one packed row. 
+#[cfg(any(target_os = "linux", test))] +fn source_row_stride(reported_stride: u32, packed_stride: usize) -> usize { + (reported_stride as usize).max(packed_stride) +} + +#[cfg(any(target_os = "linux", test))] +fn convert_mjpeg_to_i420( + source: &[u8], + width: u32, + height: u32, + destination: &mut I420Buffer, +) -> Result<(), V4lError> { + let (stride_y, stride_u, stride_v) = destination.strides(); + let (dst_y, dst_u, dst_v) = destination.data_mut(); + let width_i32 = i32_from_u32(width, "width")?; + let height_i32 = i32_from_u32(height, "height")?; + + let ret = unsafe { + // SAFETY: Source and destination slices are valid for the dimensions and strides. + yuv_sys::rs_MJPGToI420( + source.as_ptr(), + source.len(), + dst_y.as_mut_ptr(), + stride_y as i32, + dst_u.as_mut_ptr(), + stride_u as i32, + dst_v.as_mut_ptr(), + stride_v as i32, + width_i32, + height_i32, + width_i32, + height_i32, + ) + }; + if ret == 0 { + return Ok(()); + } + + let rgb = image::load_from_memory(source) + .map_err(|error| V4lError::Decode(error.to_string()))? + .to_rgb8(); + if rgb.width() != width || rgb.height() != height { + return Err(V4lError::InvalidFrame("decoded MJPEG dimensions changed")); + } + let ret = unsafe { + // SAFETY: Source and destination slices are valid for the dimensions and strides. 
+ yuv_sys::rs_RGB24ToI420( + rgb.as_raw().as_ptr(), + width_i32 * 3, + dst_y.as_mut_ptr(), + stride_y as i32, + dst_u.as_mut_ptr(), + stride_u as i32, + dst_v.as_mut_ptr(), + stride_v as i32, + width_i32, + height_i32, + ) + }; + if ret == 0 { + Ok(()) + } else { + Err(V4lError::Convert("RGB24 fallback conversion failed")) + } +} + +#[cfg(any(target_os = "linux", test))] +fn validate_len(source: &[u8], expected: usize, label: &'static str) -> Result<(), V4lError> { + if source.len() < expected { + return Err(V4lError::InvalidFrame(label)); + } + Ok(()) +} + +#[cfg(any(target_os = "linux", test))] +fn select_capture_wall_time_us( + backend_capture_timestamp: Option, + fallback_wall_time_us: u64, + read_wall_time_us: u64, +) -> u64 { + backend_capture_timestamp + .and_then(|timestamp| u64::try_from(timestamp.as_micros()).ok()) + .and_then(|timestamp_us| validate_capture_timestamp_us(timestamp_us, read_wall_time_us)) + .unwrap_or(fallback_wall_time_us) +} + +#[cfg(any(target_os = "linux", test))] +fn i32_from_u32(value: u32, field: &'static str) -> Result { + i32::try_from(value).map_err(|_| V4lError::OptionOutOfRange(field)) +} + +#[cfg(any(target_os = "linux", test))] +fn i32_from_usize(value: usize, field: &'static str) -> Result { + i32::try_from(value).map_err(|_| V4lError::OptionOutOfRange(field)) +} + +#[cfg(target_os = "linux")] +fn v4l_error(error: std::io::Error) -> V4lError { + V4lError::Camera(error.to_string()) +} + +#[cfg(target_os = "linux")] +fn monotonic_to_wallclock(timestamp: v4l::Timestamp) -> Option { + let frame_monotonic = Duration::from(timestamp); + if frame_monotonic.is_zero() { + return None; + } + + let monotonic_now = clock_time(libc::CLOCK_MONOTONIC)?; + let wall_now = clock_time(libc::CLOCK_REALTIME)?; + let frame_age = monotonic_now.checked_sub(frame_monotonic)?; + wall_now.checked_sub(frame_age) +} + +#[cfg(target_os = "linux")] +fn clock_time(clock_id: libc::clockid_t) -> Option { + let mut time = libc::timespec { tv_sec: 0, tv_nsec: 
0 }; + let ret = unsafe { + // SAFETY: `time` is a valid out pointer and `clock_id` is supplied by libc constants. + libc::clock_gettime(clock_id, &mut time) + }; + if ret != 0 || time.tv_sec < 0 || time.tv_nsec < 0 { + return None; + } + + Some(Duration::new(time.tv_sec as u64, time.tv_nsec as u32)) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::time::MAX_CAPTURE_TIMESTAMP_AGE_US; + + #[test] + fn rejects_empty_frame_format_preferences() { + let mut options = V4lCaptureOptions::default(); + options.frame_formats.clear(); + let err = V4lCaptureSession::new(options).expect_err("empty formats must be rejected"); + assert!(matches!(err, V4lError::InvalidOption(_))); + } + + #[test] + fn rejects_unsupported_i420_source_format() { + let mut options = V4lCaptureOptions::default(); + options.frame_formats = vec![CaptureFrameFormat::I420]; + let err = V4lCaptureSession::new(options).expect_err("I420 source must be rejected"); + assert!(matches!(err, V4lError::UnsupportedFrameFormat(CaptureFrameFormat::I420))); + } + + #[test] + fn rejects_zero_frame_rate() { + let options = V4lCaptureOptions::new( + CaptureDeviceSelector::Default, + CaptureResolution::new(640, 480), + 0, + ); + let err = V4lCaptureSession::new(options).expect_err("zero fps must be rejected"); + assert!(matches!(err, V4lError::InvalidOption(_))); + } + + #[test] + fn ignores_stream_relative_capture_timestamp() { + let selected = + select_capture_wall_time_us(Some(Duration::from_micros(10)), 10_000_000, 10_000_000); + assert_eq!(selected, 10_000_000); + } + + #[test] + fn accepts_recent_backend_capture_timestamp() { + let read_us = 20_000_000; + let recent = Duration::from_micros(read_us - 1_000); + assert_eq!(select_capture_wall_time_us(Some(recent), 42, read_us), read_us - 1_000); + } + + #[test] + fn ignores_backend_capture_timestamp_older_than_max_age() { + let read_us = 20_000_000; + let stale = Duration::from_micros(read_us - MAX_CAPTURE_TIMESTAMP_AGE_US - 1); + 
assert_eq!(select_capture_wall_time_us(Some(stale), 42, read_us), 42); + } + + #[test] + fn frame_rate_from_fraction_rounds_fractional_intervals() { + assert_eq!(frame_rate_from_fraction(1, 30), Some(30)); + assert_eq!(frame_rate_from_fraction(1001, 30000), Some(30)); + assert_eq!(frame_rate_from_fraction(1001, 60000), Some(60)); + assert_eq!(frame_rate_from_fraction(3, 1), Some(1)); + } + + #[test] + fn frame_rate_from_fraction_rejects_zero_terms() { + assert_eq!(frame_rate_from_fraction(0, 30000), None); + assert_eq!(frame_rate_from_fraction(1001, 0), None); + } + + #[test] + fn converts_padded_stride_nv12_frame() { + let width = 6u32; + let height = 4u32; + let stride = 8usize; + let y_size = stride * height as usize; + // Padding bytes past each 6-pixel row must never reach the output. + let mut source = vec![0xEE; y_size + y_size / 2]; + for row in 0..height as usize { + for col in 0..width as usize { + source[row * stride + col] = (100 + row * 10 + col) as u8; + } + } + for row in 0..height as usize / 2 { + for pair in 0..width as usize / 2 { + source[y_size + row * stride + pair * 2] = (50 + row * 10 + pair) as u8; + source[y_size + row * stride + pair * 2 + 1] = (150 + row * 10 + pair) as u8; + } + } + + let mut destination = I420Buffer::new(width, height); + let used_decode_path = convert_to_i420( + CaptureFrameFormat::Nv12, + &source, + width, + height, + stride as u32, + &mut destination, + ) + .expect("padded NV12 frame must convert"); + assert!(!used_decode_path); + + let (stride_y, stride_u, stride_v) = destination.strides(); + let (dst_y, dst_u, dst_v) = destination.data(); + for row in 0..height as usize { + for col in 0..width as usize { + assert_eq!( + dst_y[row * stride_y as usize + col], + (100 + row * 10 + col) as u8, + "Y({row},{col})" + ); + } + } + for row in 0..height as usize / 2 { + for pair in 0..width as usize / 2 { + assert_eq!(dst_u[row * stride_u as usize + pair], (50 + row * 10 + pair) as u8); + assert_eq!(dst_v[row * stride_v as 
usize + pair], (150 + row * 10 + pair) as u8); + } + } + } + + #[test] + fn converts_padded_stride_yuyv_frame() { + let width = 6u32; + let height = 2u32; + let stride = 16usize; + // Padding bytes past each 12-byte packed row must never reach the output. + let mut source = vec![0xEE; stride * height as usize]; + for row in 0..height as usize { + for col in 0..width as usize { + source[row * stride + col * 2] = (40 + row * 10 + col) as u8; + source[row * stride + col * 2 + 1] = 128; + } + } + + let mut destination = I420Buffer::new(width, height); + convert_to_i420( + CaptureFrameFormat::Yuyv, + &source, + width, + height, + stride as u32, + &mut destination, + ) + .expect("padded YUYV frame must convert"); + + let (stride_y, _, _) = destination.strides(); + let (dst_y, _, _) = destination.data(); + for row in 0..height as usize { + for col in 0..width as usize { + assert_eq!( + dst_y[row * stride_y as usize + col], + (40 + row * 10 + col) as u8, + "Y({row},{col})" + ); + } + } + } + + #[test] + fn rejects_nv12_frame_shorter_than_padded_stride_size() { + let width = 6u32; + let height = 4u32; + let packed = vec![0u8; (width * height) as usize * 3 / 2]; + let mut destination = I420Buffer::new(width, height); + let err = + convert_to_i420(CaptureFrameFormat::Nv12, &packed, width, height, 8, &mut destination) + .expect_err("packed-size buffer must fail the stride-aware length check"); + assert!(matches!(err, V4lError::InvalidFrame("NV12 frame"))); + } + + #[test] + fn falls_back_to_packed_stride_when_driver_reports_zero() { + let width = 4u32; + let height = 2u32; + let y_size = (width * height) as usize; + let mut source = vec![128u8; y_size + y_size / 2]; + for (index, value) in source.iter_mut().take(y_size).enumerate() { + *value = index as u8; + } + + let mut destination = I420Buffer::new(width, height); + convert_to_i420(CaptureFrameFormat::Nv12, &source, width, height, 0, &mut destination) + .expect("packed NV12 frame with zero reported stride must convert"); + 
+ let (stride_y, _, _) = destination.strides(); + let (dst_y, _, _) = destination.data(); + for row in 0..height as usize { + for col in 0..width as usize { + assert_eq!( + dst_y[row * stride_y as usize + col], + (row * width as usize + col) as u8 + ); + } + } + } +} diff --git a/livekit-capture/src/time.rs b/livekit-capture/src/time.rs new file mode 100644 index 000000000..e9733149f --- /dev/null +++ b/livekit-capture/src/time.rs @@ -0,0 +1,65 @@ +// Copyright 2026 LiveKit, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Shared capture-timestamp helpers used by the capture backends. + +use std::time::{Duration, SystemTime, UNIX_EPOCH}; + +/// Maximum age a backend-reported capture timestamp may have, relative to the +/// wall-clock read time, before it is considered stale and discarded. +pub(crate) const MAX_CAPTURE_TIMESTAMP_AGE_US: u64 = 5_000_000; + +/// Returns the current UNIX wall-clock time in microseconds. +pub(crate) fn unix_time_us_now() -> Option { + let elapsed = SystemTime::now().duration_since(UNIX_EPOCH).ok()?; + u64::try_from(elapsed.as_micros()).ok() +} + +/// Converts a duration to whole microseconds, saturating at `i64::MAX`. 
+pub(crate) fn elapsed_us(duration: Duration) -> i64 { + i64::try_from(duration.as_micros()).unwrap_or(i64::MAX) +} + +/// Validates a backend-reported capture timestamp against the wall-clock read +/// time: zero, future, and stale (older than +/// [`MAX_CAPTURE_TIMESTAMP_AGE_US`]) timestamps are rejected. +pub(crate) fn validate_capture_timestamp_us( + capture_timestamp_us: u64, + read_wall_time_us: u64, +) -> Option { + if capture_timestamp_us == 0 || capture_timestamp_us > read_wall_time_us { + return None; + } + if read_wall_time_us - capture_timestamp_us > MAX_CAPTURE_TIMESTAMP_AGE_US { + return None; + } + Some(capture_timestamp_us) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn validate_rejects_zero_future_and_stale_timestamps() { + let now = 10_000_000; + assert_eq!(validate_capture_timestamp_us(0, now), None); + assert_eq!(validate_capture_timestamp_us(now + 1, now), None); + assert_eq!( + validate_capture_timestamp_us(now - MAX_CAPTURE_TIMESTAMP_AGE_US - 1, now), + None + ); + assert_eq!(validate_capture_timestamp_us(now - 1, now), Some(now - 1)); + } +} diff --git a/livekit-capture/src/track.rs b/livekit-capture/src/track.rs new file mode 100644 index 000000000..0576001bc --- /dev/null +++ b/livekit-capture/src/track.rs @@ -0,0 +1,261 @@ +// Copyright 2026 LiveKit, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use livekit::{ + options::{TrackPublishOptions, VideoEncoderBackend}, + prelude::LocalVideoTrack, + webrtc::{ + video_frame::{EncodedVideoFrame, VideoBuffer, VideoFrame}, + video_source::{native::NativeVideoSource, RtcVideoSource, VideoResolution}, + }, +}; + +use crate::{ + encoded::{ + CodecSpecific, EncodedAccessUnit, EncodedLayerInfo, EncodedPayload, EncodedVideoCodec, + }, + error::CaptureError, +}; + +pub use crate::device::CapturePath; +#[cfg(target_os = "linux")] +use crate::dmabuf::DmaBufFrame; + +/// Capture source backed by a LiveKit local video track. +#[derive(Debug, Clone)] +pub struct VideoCaptureTrack { + source: NativeVideoSource, + track: LocalVideoTrack, +} + +impl VideoCaptureTrack { + /// Creates a capture track with the supplied resolution. + pub fn new(name: &str, resolution: VideoResolution, is_screencast: bool) -> Self { + let source = NativeVideoSource::new(resolution, is_screencast); + let track = + LocalVideoTrack::create_video_track(name, RtcVideoSource::Native(source.clone())); + Self { source, track } + } + + /// Creates a capture track for pre-encoded access units. + /// + /// Unlike [`VideoCaptureTrack::new`], no raw keepalive frames are + /// injected before the first capture, so the sender starts directly on + /// the passthrough encoder instead of briefly encoding black frames. + pub fn new_encoded(name: &str, resolution: VideoResolution) -> Self { + let source = NativeVideoSource::new_encoded(resolution); + let track = + LocalVideoTrack::create_video_track(name, RtcVideoSource::Native(source.clone())); + Self { source, track } + } + + /// Returns the publishable local video track. + pub fn track(&self) -> LocalVideoTrack { + self.track.clone() + } + + /// Captures one decoded video frame. + pub fn capture_frame>(&self, frame: &VideoFrame) { + self.source.capture_frame(frame); + } + + /// Captures one DMA-BUF backed frame. 
+ /// + /// The native capture path hands a single file descriptor to the driver + /// and derives the plane layout from the underlying buffer itself + /// (NvBufSurface); per-plane offsets, strides, and DRM modifiers in + /// [`DmaBufFrame`] are informational and must describe that derived + /// layout. Frames whose planes span multiple file descriptors or start + /// at a nonzero offset are rejected rather than silently truncated. + #[cfg(target_os = "linux")] + pub fn capture_dmabuf(&self, frame: &DmaBufFrame) -> Result<(), CaptureError> { + let plane = frame.planes.first().ok_or(CaptureError::MissingDmaBufPlane)?; + if frame.planes.iter().any(|other| other.fd != plane.fd) { + return Err(CaptureError::UnsupportedDmaBufLayout( + "planes must share one DMA-BUF file descriptor", + )); + } + if plane.offset != 0 { + return Err(CaptureError::UnsupportedDmaBufLayout( + "first plane must start at offset 0", + )); + } + let ok = self.source.capture_dmabuf_frame( + plane.fd, + frame.width, + frame.height, + frame.pixel_format.as_native(), + frame.timestamp_us, + ); + ok.then_some(()).ok_or(CaptureError::CaptureFailed) + } + + /// Captures one encoded video access unit. + /// + /// The passthrough path forwards single-layer streams: access units + /// carrying temporal/spatial layer ids, an AV1 dependency descriptor, or + /// a non-`L1T1` scalability mode are rejected so callers are not misled + /// into thinking that metadata reaches the wire. 
+ pub fn capture_encoded(&self, access_unit: &EncodedAccessUnit<'_>) -> Result<(), CaptureError> { + validate_encoded_access_unit(access_unit)?; + + let mut scratch = Vec::new(); + let payload: &[u8] = match &access_unit.payload { + EncodedPayload::Contiguous(bytes) => bytes, + EncodedPayload::Owned(bytes) => bytes, + EncodedPayload::Fragments(_) => { + scratch = access_unit.payload.to_vec(); + &scratch + } + }; + let frame = EncodedVideoFrame { + codec: access_unit.codec.into(), + payload, + timestamp_us: access_unit.timestamp_us, + frame_type: access_unit.frame_type.into(), + width: access_unit.width, + height: access_unit.height, + frame_metadata: None, + }; + self.source.capture_encoded_frame(&frame).then_some(()).ok_or(CaptureError::CaptureFailed) + } + + /// Returns and clears the pending keyframe request raised by the + /// passthrough encoder (PLI/FIR from the SFU, late subscriber join, or + /// sender reconfiguration). + /// + /// Poll this from the capture loop and forward the request to the + /// upstream encoder so it produces an IDR; until one arrives, new + /// subscribers cannot render the track. + pub fn take_keyframe_request(&self) -> bool { + self.source.take_keyframe_request() + } + + /// Returns publish options appropriate for encoded passthrough. 
+ pub fn encoded_publish_options(codec: EncodedVideoCodec) -> TrackPublishOptions { + TrackPublishOptions { + video_codec: codec.into(), + video_encoder: VideoEncoderBackend::PreEncoded, + simulcast: false, + ..Default::default() + } + } +} + +fn validate_encoded_access_unit(access_unit: &EncodedAccessUnit<'_>) -> Result<(), CaptureError> { + if access_unit.payload.is_empty() { + return Err(CaptureError::EmptyPayload); + } + if access_unit.layers != EncodedLayerInfo::default() { + return Err(CaptureError::UnsupportedLayeredEncoding( + "temporal/spatial layer ids are not forwarded by the passthrough encoder", + )); + } + let default_specific = CodecSpecific::default_for(access_unit.codec); + if access_unit.codec_specific != CodecSpecific::None + && access_unit.codec_specific != default_specific + { + return Err(CaptureError::UnsupportedLayeredEncoding( + "codec-specific layering metadata is not forwarded by the passthrough encoder", + )); + } + Ok(()) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::encoded::EncodedFrameType; + + #[test] + fn accepts_vp8_vp9_and_av1_access_units() { + for codec in [EncodedVideoCodec::VP8, EncodedVideoCodec::VP9, EncodedVideoCodec::AV1] { + let access_unit = EncodedAccessUnit::contiguous( + codec, + &[1, 2, 3], + 0, + EncodedFrameType::Key, + 640, + 480, + ); + + assert!(validate_encoded_access_unit(&access_unit).is_ok()); + } + } + + #[test] + fn rejects_empty_encoded_access_units() { + let access_unit = EncodedAccessUnit::contiguous( + EncodedVideoCodec::VP8, + &[], + 0, + EncodedFrameType::Key, + 640, + 480, + ); + + assert_eq!(validate_encoded_access_unit(&access_unit), Err(CaptureError::EmptyPayload)); + } + + #[test] + fn accepts_default_codec_specific_metadata() { + let mut access_unit = EncodedAccessUnit::contiguous( + EncodedVideoCodec::AV1, + &[1, 2, 3], + 0, + EncodedFrameType::Key, + 640, + 480, + ); + access_unit.codec_specific = CodecSpecific::default_for(EncodedVideoCodec::AV1); + + 
assert!(validate_encoded_access_unit(&access_unit).is_ok()); + } + + #[test] + fn rejects_layered_access_units() { + let mut access_unit = EncodedAccessUnit::contiguous( + EncodedVideoCodec::VP9, + &[1, 2, 3], + 0, + EncodedFrameType::Key, + 640, + 480, + ); + access_unit.layers = EncodedLayerInfo { spatial_id: None, temporal_id: Some(1) }; + + assert!(matches!( + validate_encoded_access_unit(&access_unit), + Err(CaptureError::UnsupportedLayeredEncoding(_)) + )); + } + + #[test] + fn rejects_non_default_codec_specific_metadata() { + let mut access_unit = EncodedAccessUnit::contiguous( + EncodedVideoCodec::VP8, + &[1, 2, 3], + 0, + EncodedFrameType::Key, + 640, + 480, + ); + access_unit.codec_specific = CodecSpecific::VP8 { temporal_id: Some(1), layer_sync: true }; + + assert!(matches!( + validate_encoded_access_unit(&access_unit), + Err(CaptureError::UnsupportedLayeredEncoding(_)) + )); + } +} diff --git a/livekit-protocol/src/livekit.serde.rs b/livekit-protocol/src/livekit.serde.rs index ebd07f129..29bd3ca22 100644 --- a/livekit-protocol/src/livekit.serde.rs +++ b/livekit-protocol/src/livekit.serde.rs @@ -12092,7 +12092,7 @@ impl<'de> serde::Deserialize<'de> for data_stream::Header { if inline_content__.is_some() { return Err(serde::de::Error::duplicate_field("inlineContent")); } - inline_content__ = + inline_content__ = map_.next_value::<::std::option::Option<::pbjson::private::BytesDeserialize<_>>>()?.map(|x| x.0) ; } diff --git a/webrtc-sys/build.rs b/webrtc-sys/build.rs index a7e73a6ef..b69c5a877 100644 --- a/webrtc-sys/build.rs +++ b/webrtc-sys/build.rs @@ -84,7 +84,9 @@ fn main() { "src/video_frame.cpp", "src/video_frame_buffer.cpp", "src/dmabuf_video_frame_buffer.cpp", + "src/encoded_video_frame_buffer.cpp", "src/video_encoder_factory.cpp", + "src/passthrough_video_encoder.cpp", "src/video_decoder_factory.cpp", "src/synthetic_audio_device.cpp", "src/adm_proxy.cpp", @@ -96,6 +98,7 @@ fn main() { "src/audio_mixer.cpp", "src/packet_trailer.cpp", 
"src/packet_trailer_av1.cpp", + "src/jetson/jetson_av1_bitstream.cpp", ]); if is_desktop { @@ -231,7 +234,6 @@ fn main() { .file("src/jetson/h264_encoder_impl.cpp") .file("src/jetson/h265_encoder_impl.cpp") .file("src/jetson/av1_encoder_impl.cpp") - .file("src/jetson/jetson_av1_bitstream.cpp") .file("src/jetson/jetson_encoder_factory.cpp") .flag("-DUSE_JETSON_VIDEO_CODEC=1"); diff --git a/webrtc-sys/include/livekit/encoded_video_frame_buffer.h b/webrtc-sys/include/livekit/encoded_video_frame_buffer.h new file mode 100644 index 000000000..a5be935f0 --- /dev/null +++ b/webrtc-sys/include/livekit/encoded_video_frame_buffer.h @@ -0,0 +1,94 @@ +/* + * Copyright 2026 LiveKit, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +#include "api/video/encoded_image.h" +#include "api/video/video_frame_buffer.h" + +namespace livekit { + +enum class EncodedVideoCodec { + kH264, + kH265, + kVP8, + kVP9, + kAV1, +}; + +enum class EncodedFrameType { + kKey, + kDelta, +}; + +// A native WebRTC frame buffer carrying one encoded video access unit. +class EncodedVideoFrameBuffer : public webrtc::VideoFrameBuffer { + public: + // `keyframe_request_flag` is shared with the owning video source: the + // pass-through encoder sets it when the RTP layer asks for a keyframe the + // pending frame cannot satisfy, and the capture side polls it to forward + // the request upstream. 
+ EncodedVideoFrameBuffer( + int width, + int height, + EncodedVideoCodec codec, + EncodedFrameType frame_type, + webrtc::scoped_refptr payload, + std::shared_ptr> keyframe_request_flag = nullptr); + ~EncodedVideoFrameBuffer() override = default; + + Type type() const override; + int width() const override; + int height() const override; + webrtc::scoped_refptr ToI420() override; + webrtc::scoped_refptr CropAndScale( + int offset_x, + int offset_y, + int crop_width, + int crop_height, + int scaled_width, + int scaled_height) override; + + EncodedVideoCodec codec() const { return codec_; } + EncodedFrameType frame_type() const { return frame_type_; } + + // The encoded access unit. Shared with the pass-through encoder so the + // payload is not copied again on the send path. + webrtc::scoped_refptr encoded_data() const { + return payload_; + } + const uint8_t* payload_data() const { return payload_->data(); } + size_t payload_size() const { return payload_->size(); } + + // Asks the capture side to produce a keyframe (e.g. on PLI/FIR). + void request_keyframe() const; + + static EncodedVideoFrameBuffer* FromNative(webrtc::VideoFrameBuffer* buffer); + + private: + int width_; + int height_; + EncodedVideoCodec codec_; + EncodedFrameType frame_type_; + webrtc::scoped_refptr payload_; + std::shared_ptr> keyframe_request_flag_; +}; + +} // namespace livekit diff --git a/webrtc-sys/include/livekit/passthrough_video_encoder.h b/webrtc-sys/include/livekit/passthrough_video_encoder.h new file mode 100644 index 000000000..a50f9f63a --- /dev/null +++ b/webrtc-sys/include/livekit/passthrough_video_encoder.h @@ -0,0 +1,47 @@ +/* + * Copyright 2026 LiveKit, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +#include "api/environment/environment.h" +#include "api/video_codecs/sdp_video_format.h" +#include "api/video_codecs/video_encoder.h" +#include "api/video_codecs/video_encoder_factory.h" + +namespace livekit_ffi { + +class PassthroughVideoEncoderFactory : public webrtc::VideoEncoderFactory { + public: + PassthroughVideoEncoderFactory(); + ~PassthroughVideoEncoderFactory() override = default; + + std::vector GetSupportedFormats() const override; + std::vector GetImplementations() const override; + CodecSupport QueryCodecSupport( + const webrtc::SdpVideoFormat& format, + std::optional scalability_mode) const override; + std::unique_ptr Create( + const webrtc::Environment& env, + const webrtc::SdpVideoFormat& format) override; + + private: + std::vector supported_formats_; +}; + +} // namespace livekit_ffi diff --git a/webrtc-sys/include/livekit/video_track.h b/webrtc-sys/include/livekit/video_track.h index 5520cf2ec..8b35774bf 100644 --- a/webrtc-sys/include/livekit/video_track.h +++ b/webrtc-sys/include/livekit/video_track.h @@ -16,6 +16,7 @@ #pragma once +#include #include #include "api/media_stream_interface.h" @@ -105,11 +106,19 @@ class VideoTrackSource { void set_packet_trailer_handler( std::shared_ptr handler); + // Shared with every EncodedVideoFrameBuffer this source emits; the + // pass-through encoder raises it on unsatisfied keyframe requests. 
+ std::shared_ptr> keyframe_request_flag() const { + return keyframe_request_flag_; + } + private: mutable webrtc::Mutex mutex_; webrtc::TimestampAligner timestamp_aligner_; VideoResolution resolution_; std::shared_ptr packet_trailer_handler_; + std::shared_ptr> keyframe_request_flag_ = + std::make_shared>(false); bool is_screencast_; }; @@ -132,6 +141,17 @@ class VideoTrackSource { int64_t timestamp_us, const FrameMetadata& frame_metadata) const; + bool capture_encoded_frame(int width, + int height, + const EncodedVideoFrameData& frame, + rust::Slice payload, + const FrameMetadata& frame_metadata) const; + + // Returns and clears the pending upstream keyframe request raised by the + // pass-through encoder (PLI/FIR or post-reconfigure). Poll from the + // capture loop. + bool take_keyframe_request() const; + void set_packet_trailer_handler( std::shared_ptr handler) const; diff --git a/webrtc-sys/src/encoded_video_frame_buffer.cpp b/webrtc-sys/src/encoded_video_frame_buffer.cpp new file mode 100644 index 000000000..62a474d6f --- /dev/null +++ b/webrtc-sys/src/encoded_video_frame_buffer.cpp @@ -0,0 +1,98 @@ +/* + * Copyright 2026 LiveKit, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "livekit/encoded_video_frame_buffer.h" + +#include + +#include "api/video/i420_buffer.h" +#include "rtc_base/logging.h" + +namespace livekit { + +EncodedVideoFrameBuffer::EncodedVideoFrameBuffer( + int width, + int height, + EncodedVideoCodec codec, + EncodedFrameType frame_type, + webrtc::scoped_refptr payload, + std::shared_ptr> keyframe_request_flag) + : width_(width), + height_(height), + codec_(codec), + frame_type_(frame_type), + payload_(std::move(payload)), + keyframe_request_flag_(std::move(keyframe_request_flag)) {} + +webrtc::VideoFrameBuffer::Type EncodedVideoFrameBuffer::type() const { + return Type::kNative; +} + +int EncodedVideoFrameBuffer::width() const { + return width_; +} + +int EncodedVideoFrameBuffer::height() const { + return height_; +} + +webrtc::scoped_refptr +EncodedVideoFrameBuffer::ToI420() { + // Sinks attached to a pre-encoded track (local preview, FFI color + // conversion) convert whatever buffer they receive; the encoded payload + // cannot be decoded here, so hand back a black frame instead of a null + // buffer that would crash the caller. + static std::atomic logged{false}; + if (!logged.exchange(true)) { + RTC_LOG(LS_WARNING) << "EncodedVideoFrameBuffer::ToI420 cannot decode an " + "encoded access unit; returning black frames"; + } + webrtc::scoped_refptr buffer = + webrtc::I420Buffer::Create(width_, height_); + webrtc::I420Buffer::SetBlack(buffer.get()); + return buffer; +} + +webrtc::scoped_refptr +EncodedVideoFrameBuffer::CropAndScale(int /* offset_x */, + int /* offset_y */, + int /* crop_width */, + int /* crop_height */, + int /* scaled_width */, + int /* scaled_height */) { + // Encoded payloads cannot be rescaled; returning the buffer unchanged + // keeps misbehaving callers alive (the capture path never scales encoded + // frames). 
+ RTC_LOG(LS_WARNING) << "EncodedVideoFrameBuffer::CropAndScale is " + "unsupported; returning the frame unscaled"; + return webrtc::scoped_refptr(this); +} + +void EncodedVideoFrameBuffer::request_keyframe() const { + if (keyframe_request_flag_) { + keyframe_request_flag_->store(true, std::memory_order_relaxed); + } +} + +EncodedVideoFrameBuffer* EncodedVideoFrameBuffer::FromNative( + webrtc::VideoFrameBuffer* buffer) { + if (!buffer || buffer->type() != webrtc::VideoFrameBuffer::Type::kNative) { + return nullptr; + } + return dynamic_cast(buffer); +} + +} // namespace livekit diff --git a/webrtc-sys/src/jetson/av1_encoder_impl.cpp b/webrtc-sys/src/jetson/av1_encoder_impl.cpp index 9626de4b4..679328d64 100644 --- a/webrtc-sys/src/jetson/av1_encoder_impl.cpp +++ b/webrtc-sys/src/jetson/av1_encoder_impl.cpp @@ -283,13 +283,12 @@ int32_t JetsonAV1EncoderImpl::Encode( return WEBRTC_VIDEO_CODEC_NO_OUTPUT; } - livekit::av1::StripIvfFrameHeaderIfPresent(&packet); + livekit::av1::NormalizeForRtp(&packet); if (packet.empty()) { - RTC_LOG(LS_ERROR) - << "Jetson MMAPI AV1 packet contained only IVF framing; skipping."; + RTC_LOG(LS_ERROR) << "Jetson MMAPI AV1 packet contained no transferable " + "OBUs after RTP normalization; skipping."; return WEBRTC_VIDEO_CODEC_NO_OUTPUT; } - livekit::av1::ConvertAnnexBToLowOverheadIfPresent(&packet); std::vector sequence_header; if (livekit::av1::ExtractSequenceHeaderObu(packet.data(), packet.size(), @@ -353,6 +352,7 @@ int32_t JetsonAV1EncoderImpl::ProcessEncodedFrame( encoded_image_.qp_ = -1; CodecSpecificInfo codecInfo; + codecInfo.codecSpecific = {}; codecInfo.codecType = kVideoCodecAV1; codecInfo.end_of_picture = true; codecInfo.scalability_mode = ScalabilityMode::kL1T1; diff --git a/webrtc-sys/src/jetson/jetson_av1_bitstream.cpp b/webrtc-sys/src/jetson/jetson_av1_bitstream.cpp index bc3ab2206..2792ac36a 100644 --- a/webrtc-sys/src/jetson/jetson_av1_bitstream.cpp +++ b/webrtc-sys/src/jetson/jetson_av1_bitstream.cpp @@ -200,6 +200,39 
@@ bool ConvertAnnexBToLowOverhead(std::vector* packet) { return true; } +bool StripNonTransferObus(std::vector* packet) { + if (!packet || packet->empty()) { + return false; + } + + const std::vector obus = ParseObus(packet->data(), packet->size()); + if (obus.empty()) { + return false; + } + + size_t transfer_size = 0; + bool already_contiguous = true; + size_t next_offset = 0; + for (const ObuSpan& obu : obus) { + transfer_size += obu.total_size; + already_contiguous = already_contiguous && obu.offset == next_offset; + next_offset = obu.offset + obu.total_size; + } + + if (transfer_size == packet->size() && already_contiguous) { + return false; + } + + std::vector filtered; + filtered.reserve(transfer_size); + for (const ObuSpan& obu : obus) { + filtered.insert(filtered.end(), packet->begin() + obu.offset, + packet->begin() + obu.offset + obu.total_size); + } + packet->swap(filtered); + return true; +} + } // namespace std::vector ParseObus(const uint8_t* data, size_t len) { @@ -308,6 +341,16 @@ void ConvertAnnexBToLowOverheadIfPresent(std::vector* packet) { ConvertAnnexBToLowOverhead(packet); } +void StripNonTransferObusIfPresent(std::vector* packet) { + StripNonTransferObus(packet); +} + +void NormalizeForRtp(std::vector* packet) { + StripIvfFrameHeaderIfPresent(packet); + ConvertAnnexBToLowOverheadIfPresent(packet); + StripNonTransferObusIfPresent(packet); +} + bool IsWebRtcParseable(const uint8_t* data, size_t len) { if (!data || len == 0) { return false; diff --git a/webrtc-sys/src/jetson/jetson_av1_bitstream.h b/webrtc-sys/src/jetson/jetson_av1_bitstream.h index 0af9614e0..2a7605661 100644 --- a/webrtc-sys/src/jetson/jetson_av1_bitstream.h +++ b/webrtc-sys/src/jetson/jetson_av1_bitstream.h @@ -55,6 +55,15 @@ void StripIvfFrameHeaderIfPresent(std::vector* packet); /// present. void ConvertAnnexBToLowOverheadIfPresent(std::vector* packet); +/// Strip OBUs that should not be transferred in WebRTC RTP payloads when present. 
+void StripNonTransferObusIfPresent(std::vector* packet); + +/// Normalizes an AV1 temporal unit for WebRTC RTP packetization: strips IVF +/// framing, converts Annex-B units to low-overhead OBUs, and strips +/// non-transfer OBUs. Shared by every encoder that emits AV1 into the RTP +/// pipeline so the steps cannot drift apart. +void NormalizeForRtp(std::vector* packet); + /// Basic validation that WebRTC's AV1 packetizer can parse the bitstream. bool IsWebRtcParseable(const uint8_t* data, size_t len); diff --git a/webrtc-sys/src/passthrough_video_encoder.cpp b/webrtc-sys/src/passthrough_video_encoder.cpp new file mode 100644 index 000000000..f10a8aaf4 --- /dev/null +++ b/webrtc-sys/src/passthrough_video_encoder.cpp @@ -0,0 +1,404 @@ +/* + * Copyright 2026 LiveKit, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "livekit/passthrough_video_encoder.h" + +#include +#include +#include +#include +#include +#include +#include + +#include "absl/container/inlined_vector.h" +#include "api/video/encoded_image.h" +#include "api/video/video_frame.h" +#include "api/video/video_codec_constants.h" +#include "api/video_codecs/scalability_mode.h" +#include "api/video_codecs/video_encoder.h" +#include "common_video/h264/h264_common.h" +#include "jetson/jetson_av1_bitstream.h" +#include "livekit/encoded_video_frame_buffer.h" +#include "media/base/media_constants.h" +#include "modules/video_coding/include/video_codec_interface.h" +#include "modules/video_coding/include/video_error_codes.h" +#include "modules/video_coding/svc/scalable_video_controller_no_layering.h" +#include "rtc_base/logging.h" + +namespace livekit_ffi { +namespace { + +using livekit::EncodedVideoFrameBuffer; +using webrtc::CodecSpecificInfo; +using webrtc::EncodedImage; +using webrtc::EncodedImageBuffer; +using webrtc::EncodedImageCallback; +using webrtc::Environment; +using webrtc::H264PacketizationMode; +using webrtc::ScalabilityMode; +using webrtc::ScalableVideoController; +using webrtc::ScalableVideoControllerNoLayering; +using webrtc::SdpVideoFormat; +using webrtc::VideoCodec; +using webrtc::VideoCodecType; +using webrtc::VideoEncoder; +using webrtc::VideoFrame; +using webrtc::VideoFrameBuffer; +using webrtc::VideoFrameType; + +VideoCodecType CodecTypeFromFormat(const SdpVideoFormat& format) { + if (format.name == "H264") { + return webrtc::kVideoCodecH264; + } + if (format.name == "H265" || format.name == "HEVC") { + return webrtc::kVideoCodecH265; + } + if (format.name == "VP8") { + return webrtc::kVideoCodecVP8; + } + if (format.name == "VP9") { + return webrtc::kVideoCodecVP9; + } + if (format.name == "AV1") { + return webrtc::kVideoCodecAV1; + } + return webrtc::kVideoCodecGeneric; +} + +VideoCodecType CodecTypeFromBuffer(livekit::EncodedVideoCodec codec) { + switch (codec) { + case 
livekit::EncodedVideoCodec::kH264: + return webrtc::kVideoCodecH264; + case livekit::EncodedVideoCodec::kH265: + return webrtc::kVideoCodecH265; + case livekit::EncodedVideoCodec::kVP8: + return webrtc::kVideoCodecVP8; + case livekit::EncodedVideoCodec::kVP9: + return webrtc::kVideoCodecVP9; + case livekit::EncodedVideoCodec::kAV1: + return webrtc::kVideoCodecAV1; + } +} + +VideoFrameType FrameTypeFromBuffer(livekit::EncodedFrameType frame_type) { + switch (frame_type) { + case livekit::EncodedFrameType::kKey: + return VideoFrameType::kVideoFrameKey; + case livekit::EncodedFrameType::kDelta: + return VideoFrameType::kVideoFrameDelta; + } +} + +bool IsAv1Codec(VideoCodecType codec_type) { + return codec_type == webrtc::kVideoCodecAV1; +} + +bool IsKeyframe(livekit::EncodedFrameType frame_type) { + return frame_type == livekit::EncodedFrameType::kKey; +} + +// SDP profile parameters constrain real encoders, not a pass-through: the +// forwarded bytes are whatever the upstream encoder produced. Match formats +// by codec only (H265/HEVC are aliases via CodecTypeFromFormat). 
+bool IsSameCodecType(const SdpVideoFormat& a, const SdpVideoFormat& b) { + VideoCodecType type_a = CodecTypeFromFormat(a); + return type_a != webrtc::kVideoCodecGeneric && + type_a == CodecTypeFromFormat(b); +} + +void FillSingleLayerCodecSpecific( + CodecSpecificInfo* codec_info, + VideoCodecType codec_type, + int width, + int height, + bool keyframe, + ScalableVideoControllerNoLayering* av1_svc_controller) { + codec_info->codecType = codec_type; + codec_info->end_of_picture = true; + + switch (codec_type) { + case webrtc::kVideoCodecH264: + codec_info->codecSpecific.H264.packetization_mode = + H264PacketizationMode::NonInterleaved; + break; + case webrtc::kVideoCodecVP8: + codec_info->codecSpecific.VP8.nonReference = false; + codec_info->codecSpecific.VP8.temporalIdx = 0; + codec_info->codecSpecific.VP8.layerSync = false; + codec_info->codecSpecific.VP8.keyIdx = -1; + break; + case webrtc::kVideoCodecVP9: + codec_info->codecSpecific.VP9.first_frame_in_picture = true; + codec_info->codecSpecific.VP9.inter_pic_predicted = !keyframe; + codec_info->codecSpecific.VP9.flexible_mode = false; + codec_info->codecSpecific.VP9.ss_data_available = keyframe; + codec_info->codecSpecific.VP9.temporal_idx = 0; + codec_info->codecSpecific.VP9.temporal_up_switch = true; + codec_info->codecSpecific.VP9.inter_layer_predicted = false; + codec_info->codecSpecific.VP9.gof_idx = 0; + codec_info->codecSpecific.VP9.num_spatial_layers = 1; + codec_info->codecSpecific.VP9.first_active_layer = 0; + codec_info->codecSpecific.VP9.spatial_layer_resolution_present = keyframe; + codec_info->codecSpecific.VP9.width[0] = width; + codec_info->codecSpecific.VP9.height[0] = height; + codec_info->codecSpecific.VP9.gof.SetGofInfoVP9( + webrtc::kTemporalStructureMode1); + codec_info->codecSpecific.VP9.num_ref_pics = keyframe ? 
0 : 1; + codec_info->codecSpecific.VP9.p_diff[0] = 1; + break; + case webrtc::kVideoCodecAV1: { + codec_info->scalability_mode = ScalabilityMode::kL1T1; + std::vector layer_frames = + av1_svc_controller->NextFrameConfig(/*restart=*/keyframe); + if (!layer_frames.empty()) { + const ScalableVideoController::LayerFrameConfig& layer_frame = + layer_frames.front(); + codec_info->generic_frame_info = + av1_svc_controller->OnEncodeDone(layer_frame); + if (layer_frame.IsKeyframe()) { + codec_info->template_structure = + av1_svc_controller->DependencyStructure(); + } + } + break; + } + default: + break; + } +} + +class PassthroughVideoEncoder final : public VideoEncoder { + public: + PassthroughVideoEncoder(const Environment& env, const SdpVideoFormat& format) + : env_(env), format_(format), codec_type_(CodecTypeFromFormat(format)) {} + + int32_t InitEncode(const VideoCodec* codec_settings, + const Settings& /* settings */) override { + if (!codec_settings || codec_settings->codecType != codec_type_) { + return WEBRTC_VIDEO_CODEC_ERR_PARAMETER; + } + codec_ = *codec_settings; + cached_sequence_header_obu_.clear(); + av1_svc_controller_ = ScalableVideoControllerNoLayering(); + if (IsAv1Codec(codec_type_) && !codec_.GetScalabilityMode().has_value()) { + codec_.SetScalabilityMode(ScalabilityMode::kL1T1); + } + return WEBRTC_VIDEO_CODEC_OK; + } + + int32_t RegisterEncodeCompleteCallback( + EncodedImageCallback* callback) override { + encoded_image_callback_ = callback; + return WEBRTC_VIDEO_CODEC_OK; + } + + int32_t Release() override { + encoded_image_callback_ = nullptr; + return WEBRTC_VIDEO_CODEC_OK; + } + + int32_t Encode(const VideoFrame& frame, + const std::vector* frame_types) override { + if (!encoded_image_callback_) { + RTC_LOG(LS_ERROR) + << "PassthroughVideoEncoder callback is not registered"; + return WEBRTC_VIDEO_CODEC_UNINITIALIZED; + } + + webrtc::scoped_refptr frame_buffer = + frame.video_frame_buffer(); + EncodedVideoFrameBuffer* encoded_buffer = + 
EncodedVideoFrameBuffer::FromNative(frame_buffer.get()); + if (!encoded_buffer) { + RTC_LOG(LS_ERROR) + << "PassthroughVideoEncoder received a non-encoded frame buffer"; + return WEBRTC_VIDEO_CODEC_ERR_PARAMETER; + } + + if (CodecTypeFromBuffer(encoded_buffer->codec()) != codec_type_) { + RTC_LOG(LS_ERROR) + << "PassthroughVideoEncoder frame codec does not match sender codec"; + return WEBRTC_VIDEO_CODEC_ERR_PARAMETER; + } + + const bool is_keyframe = IsKeyframe(encoded_buffer->frame_type()); + + // A pass-through cannot synthesize the keyframe the RTP layer wants + // (PLI/FIR, late subscriber, reconfiguration); forward the request to + // the capture source so the upstream encoder can produce an IDR. + const bool keyframe_requested = + frame_types != nullptr && + std::any_of(frame_types->begin(), frame_types->end(), + [](VideoFrameType type) { + return type == VideoFrameType::kVideoFrameKey; + }); + if (keyframe_requested && !is_keyframe) { + encoded_buffer->request_keyframe(); + } + + if (encoded_buffer->payload_size() == 0) { + RTC_LOG(LS_ERROR) << "PassthroughVideoEncoder received an empty frame"; + return WEBRTC_VIDEO_CODEC_ERR_PARAMETER; + } + + // Non-AV1 payloads are forwarded without copying: the buffer already + // owns a webrtc::EncodedImageBuffer. AV1 needs RTP normalization, which + // may rewrite the bytes, so it works on a copy. 
+ webrtc::scoped_refptr encoded_data; + if (IsAv1Codec(codec_type_)) { + std::vector payload( + encoded_buffer->payload_data(), + encoded_buffer->payload_data() + encoded_buffer->payload_size()); + livekit::av1::NormalizeForRtp(&payload); + + std::vector sequence_header; + if (livekit::av1::ExtractSequenceHeaderObu( + payload.data(), payload.size(), &sequence_header)) { + cached_sequence_header_obu_ = std::move(sequence_header); + } else if (is_keyframe && !cached_sequence_header_obu_.empty()) { + livekit::av1::EnsureSequenceHeaderOnKeyframe( + &payload, cached_sequence_header_obu_); + } + if (payload.empty() || + !livekit::av1::IsWebRtcParseable(payload.data(), payload.size())) { + RTC_LOG(LS_ERROR) + << "PassthroughVideoEncoder received an AV1 frame that WebRTC " + "cannot packetize"; + return WEBRTC_VIDEO_CODEC_ERR_PARAMETER; + } + encoded_data = EncodedImageBuffer::Create(payload.data(), payload.size()); + } else { + encoded_data = encoded_buffer->encoded_data(); + } + + EncodedImage encoded_image; + encoded_image._encodedWidth = encoded_buffer->width(); + encoded_image._encodedHeight = encoded_buffer->height(); + encoded_image.SetRtpTimestamp(frame.rtp_timestamp()); + encoded_image.SetSimulcastIndex(0); + encoded_image.ntp_time_ms_ = frame.ntp_time_ms(); + encoded_image.capture_time_ms_ = frame.render_time_ms(); + encoded_image.rotation_ = frame.rotation(); + encoded_image.content_type_ = webrtc::VideoContentType::UNSPECIFIED; + encoded_image.timing_.flags = webrtc::VideoSendTiming::kInvalid; + encoded_image._frameType = FrameTypeFromBuffer(encoded_buffer->frame_type()); + encoded_image.SetColorSpace(frame.color_space()); + const size_t encoded_size = encoded_data->size(); + encoded_image.SetEncodedData(std::move(encoded_data)); + encoded_image.set_size(encoded_size); + encoded_image.qp_ = -1; + + CodecSpecificInfo codec_info; + codec_info.codecSpecific = {}; + FillSingleLayerCodecSpecific(&codec_info, codec_type_, encoded_buffer->width(), + 
encoded_buffer->height(), is_keyframe, + &av1_svc_controller_); + + const auto result = + encoded_image_callback_->OnEncodedImage(encoded_image, &codec_info); + if (result.error != EncodedImageCallback::Result::OK) { + RTC_LOG(LS_ERROR) << "PassthroughVideoEncoder callback failed " + << result.error; + return WEBRTC_VIDEO_CODEC_ERROR; + } + return WEBRTC_VIDEO_CODEC_OK; + } + + void SetRates(const RateControlParameters& /* parameters */) override {} + + EncoderInfo GetEncoderInfo() const override { + EncoderInfo info; + info.supports_native_handle = true; + info.implementation_name = "LiveKit pre-encoded passthrough"; + info.scaling_settings = VideoEncoder::ScalingSettings::kOff; + info.is_hardware_accelerated = false; + info.supports_simulcast = false; + info.preferred_pixel_formats = {VideoFrameBuffer::Type::kNative}; + return info; + } + + private: + Environment env_; + SdpVideoFormat format_; + VideoCodecType codec_type_; + VideoCodec codec_; + EncodedImageCallback* encoded_image_callback_ = nullptr; + ScalableVideoControllerNoLayering av1_svc_controller_; + std::vector cached_sequence_header_obu_; +}; + +} // namespace + +PassthroughVideoEncoderFactory::PassthroughVideoEncoderFactory() { + std::map h264_parameters = { + {"profile-level-id", "42e01f"}, + {"level-asymmetry-allowed", "1"}, + {"packetization-mode", "1"}, + }; + absl::InlinedVector + scalability_modes; + scalability_modes.push_back(ScalabilityMode::kL1T1); + supported_formats_.push_back(SdpVideoFormat::VP8()); + supported_formats_.push_back(SdpVideoFormat::VP9Profile0()); + supported_formats_.push_back( + SdpVideoFormat(SdpVideoFormat::AV1Profile0(), scalability_modes)); + supported_formats_.push_back(SdpVideoFormat("H264", h264_parameters)); + supported_formats_.push_back(SdpVideoFormat("H265")); + supported_formats_.push_back(SdpVideoFormat("HEVC")); +} + +std::vector +PassthroughVideoEncoderFactory::GetSupportedFormats() const { + return supported_formats_; +} + +std::vector 
+PassthroughVideoEncoderFactory::GetImplementations() const { + return supported_formats_; +} + +PassthroughVideoEncoderFactory::CodecSupport +PassthroughVideoEncoderFactory::QueryCodecSupport( + const SdpVideoFormat& format, + std::optional scalability_mode) const { + for (const auto& supported_format : supported_formats_) { + if (IsSameCodecType(format, supported_format)) { + if (format.name == "AV1" && scalability_mode.has_value() && + *scalability_mode != "L1T1") { + return {.is_supported = false, .is_power_efficient = false}; + } + return {.is_supported = true, .is_power_efficient = true}; + } + } + return {.is_supported = false, .is_power_efficient = false}; +} + +std::unique_ptr PassthroughVideoEncoderFactory::Create( + const Environment& env, + const SdpVideoFormat& format) { + // Match by codec, not by exact profile: rejecting e.g. a High-profile + // H264 negotiation here would hand the session to a real encoder that + // cannot consume pre-encoded frames. + for (const auto& supported_format : supported_formats_) { + if (IsSameCodecType(format, supported_format)) { + return std::make_unique(env, format); + } + } + return nullptr; +} + +} // namespace livekit_ffi diff --git a/webrtc-sys/src/rtp_sender.cpp b/webrtc-sys/src/rtp_sender.cpp index cb7809d4a..e97bed2d6 100644 --- a/webrtc-sys/src/rtp_sender.cpp +++ b/webrtc-sys/src/rtp_sender.cpp @@ -46,6 +46,8 @@ const char* BackendName(VideoEncoderBackend backend) { return "vaapi"; case VideoEncoderBackend::VideoToolbox: return "videotoolbox"; + case VideoEncoderBackend::PreEncoded: + return "preencoded"; } } @@ -71,6 +73,9 @@ std::optional BackendFromFormat( if (it->second == BackendName(VideoEncoderBackend::VideoToolbox)) { return VideoEncoderBackend::VideoToolbox; } + if (it->second == BackendName(VideoEncoderBackend::PreEncoded)) { + return VideoEncoderBackend::PreEncoded; + } return std::nullopt; } @@ -105,7 +110,16 @@ class FixedVideoEncoderSelector final } std::optional OnEncoderBroken() override { - 
return std::nullopt; + // The preferred backend is a hard requirement for this sender (e.g. + // pre-encoded pass-through). When the active encoder breaks — including + // when the initial untagged encoder could not even be created — request + // the preferred backend explicitly instead of giving up, so the sender + // recovers onto the right encoder. + if (!current_encoder_) { + return std::nullopt; + } + requested_ = true; + return WithBackend(*current_encoder_, backend_); } private: diff --git a/webrtc-sys/src/video_encoder_factory.cpp b/webrtc-sys/src/video_encoder_factory.cpp index 98505813e..d8b7fd454 100644 --- a/webrtc-sys/src/video_encoder_factory.cpp +++ b/webrtc-sys/src/video_encoder_factory.cpp @@ -16,6 +16,8 @@ #include "livekit/video_encoder_factory.h" +#include +#include #include #include #include @@ -26,6 +28,7 @@ #include "api/video_codecs/video_encoder.h" #include "api/video_codecs/video_encoder_factory_template.h" #include "livekit/objc_video_factory.h" +#include "livekit/passthrough_video_encoder.h" #include "livekit/webrtc.h" #include "media/base/media_constants.h" #include "media/engine/simulcast_encoder_adapter.h" @@ -107,6 +110,8 @@ const char* BackendName(VideoEncoderBackend backend) { return "vaapi"; case VideoEncoderBackend::VideoToolbox: return "videotoolbox"; + case VideoEncoderBackend::PreEncoded: + return "preencoded"; } } @@ -132,6 +137,9 @@ std::optional BackendFromFormat( if (it->second == BackendName(VideoEncoderBackend::VideoToolbox)) { return VideoEncoderBackend::VideoToolbox; } + if (it->second == BackendName(VideoEncoderBackend::PreEncoded)) { + return VideoEncoderBackend::PreEncoded; + } return std::nullopt; } @@ -160,8 +168,43 @@ bool IsSpecificHardwareBackend(VideoEncoderBackend backend) { bool BackendMatches(VideoEncoderBackend requested, VideoEncoderBackend actual) { return requested == actual || (requested == VideoEncoderBackend::Hardware && - actual != VideoEncoderBackend::Software && - actual != 
VideoEncoderBackend::Auto); + (actual == VideoEncoderBackend::Hardware || + IsSpecificHardwareBackend(actual))); +} + +bool IsAutomaticFallbackBackend(VideoEncoderBackend backend) { + return backend != VideoEncoderBackend::PreEncoded; +} + +bool EqualsIgnoreAsciiCase(std::string_view a, std::string_view b) { + return a.size() == b.size() && + std::equal(a.begin(), a.end(), b.begin(), [](char x, char y) { + return std::tolower(static_cast(x)) == + std::tolower(static_cast(y)); + }); +} + +bool IsSameCodecName(std::string_view a, std::string_view b) { + if (EqualsIgnoreAsciiCase(a, b)) { + return true; + } + auto is_h265 = [](std::string_view name) { + return EqualsIgnoreAsciiCase(name, "H265") || + EqualsIgnoreAsciiCase(name, "HEVC"); + }; + return is_h265(a) && is_h265(b); +} + +// The pass-through backend forwards pre-encoded bytes, so SDP profile +// parameters do not constrain it: match it by codec name only. Real +// encoder backends keep exact profile matching. +bool FormatSupportedByBackendFactory(VideoEncoderBackend backend, + const webrtc::SdpVideoFormat& supported, + const webrtc::SdpVideoFormat& requested) { + if (backend == VideoEncoderBackend::PreEncoded) { + return IsSameCodecName(supported.name, requested.name); + } + return supported.IsSameCodec(requested); } void AddBackendFactory( @@ -256,6 +299,7 @@ rust::Vec video_encoder_backend_list() { rust::Vec backends; backends.push_back(VideoEncoderBackend::Auto); backends.push_back(VideoEncoderBackend::Software); + backends.push_back(VideoEncoderBackend::PreEncoded); bool has_hardware_backend = false; bool hardware_backend_listed = false; @@ -299,6 +343,11 @@ rust::Vec video_encoder_backend_list() { } VideoEncoderFactory::InternalFactory::InternalFactory() { + AddBackendFactory( + factories_, + VideoEncoderBackend::PreEncoded, + std::make_unique()); + #ifdef __APPLE__ AddBackendFactory( factories_, @@ -331,10 +380,35 @@ VideoEncoderFactory::InternalFactory::GetSupportedFormats() const { std::vector 
formats = Factory().GetSupportedFormats(); for (const auto& backend_factory : factories_) { + if (backend_factory.backend == VideoEncoderBackend::PreEncoded) { + continue; + } auto supported_formats = backend_factory.factory->GetSupportedFormats(); formats.insert(formats.end(), supported_formats.begin(), supported_formats.end()); } + + // The pass-through factory would otherwise advertise codecs no real + // encoder implements (e.g. H265 on desktops); a normal session + // negotiating such a codec would end up with a sender that cannot create + // an encoder. Only advertise pass-through formats for codecs some real + // encoder already supports. + const size_t real_format_count = formats.size(); + for (const auto& backend_factory : factories_) { + if (backend_factory.backend != VideoEncoderBackend::PreEncoded) { + continue; + } + for (const auto& format : backend_factory.factory->GetSupportedFormats()) { + const bool codec_available = std::any_of( + formats.begin(), formats.begin() + real_format_count, + [&](const webrtc::SdpVideoFormat& existing) { + return IsSameCodecName(existing.name, format.name); + }); + if (codec_available) { + formats.push_back(format); + } + } + } return formats; } @@ -342,6 +416,9 @@ std::vector VideoEncoderFactory::InternalFactory::GetImplementations() const { std::vector formats; for (const auto& backend_factory : factories_) { + if (backend_factory.backend == VideoEncoderBackend::PreEncoded) { + continue; + } for (const auto& format : backend_factory.factory->GetImplementations()) { formats.push_back(WithBackend(format, backend_factory.backend)); if (IsSpecificHardwareBackend(backend_factory.backend)) { @@ -384,7 +461,9 @@ VideoEncoderFactory::InternalFactory::QueryCodecSupport( for (const auto& supported_format : backend_factory.factory->GetSupportedFormats()) { - if (stripped_format.IsSameCodec(supported_format)) { + if (FormatSupportedByBackendFactory(backend_factory.backend, + supported_format, + stripped_format)) { return 
webrtc::VideoEncoderFactory::CodecSupport{ .is_supported = true, .is_power_efficient = true, @@ -406,6 +485,9 @@ VideoEncoderFactory::InternalFactory::QueryCodecSupport( } for (const auto& backend_factory : factories_) { + if (!IsAutomaticFallbackBackend(backend_factory.backend)) { + continue; + } for (const auto& supported_format : backend_factory.factory->GetSupportedFormats()) { if (stripped_format.IsSameCodec(supported_format)) { @@ -448,7 +530,9 @@ VideoEncoderFactory::InternalFactory::Create( for (const auto& supported_format : backend_factory.factory->GetSupportedFormats()) { - if (supported_format.IsSameCodec(stripped_format)) { + if (FormatSupportedByBackendFactory(backend_factory.backend, + supported_format, + stripped_format)) { auto encoder = backend_factory.factory->Create(env, stripped_format); if (encoder) { return encoder; @@ -457,6 +541,17 @@ VideoEncoderFactory::InternalFactory::Create( } } + // A real encoder cannot consume the pre-encoded native frame buffers + // this session produces, so falling back would yield a silently broken + // sender. Fail loudly instead. 
+ if (*requested_backend == VideoEncoderBackend::PreEncoded) { + RTC_LOG(LS_ERROR) + << "Pre-encoded pass-through encoder is unavailable for " + << stripped_format.name + << "; refusing to fall back to a real encoder."; + return nullptr; + } + requested_backend_unavailable = true; } @@ -468,6 +563,9 @@ VideoEncoderFactory::InternalFactory::Create( } for (const auto& backend_factory : factories_) { + if (!IsAutomaticFallbackBackend(backend_factory.backend)) { + continue; + } for (const auto& supported_format : backend_factory.factory->GetSupportedFormats()) { if (supported_format.IsSameCodec(stripped_format)) diff --git a/webrtc-sys/src/video_track.cpp b/webrtc-sys/src/video_track.cpp index 44d1351fa..8a08aee0c 100644 --- a/webrtc-sys/src/video_track.cpp +++ b/webrtc-sys/src/video_track.cpp @@ -27,6 +27,7 @@ #include "audio/remix_resample.h" #include "common_audio/include/audio_util.h" #include "livekit/dmabuf_video_frame_buffer.h" +#include "livekit/encoded_video_frame_buffer.h" #include "livekit/media_stream.h" #include "livekit/packet_trailer.h" #include "livekit/video_track.h" @@ -38,6 +39,33 @@ #include "webrtc-sys/src/video_track.rs.h" namespace livekit_ffi { +namespace { + +livekit::EncodedVideoCodec ToNativeEncodedCodec(EncodedVideoCodec codec) { + switch (codec) { + case EncodedVideoCodec::H264: + return livekit::EncodedVideoCodec::kH264; + case EncodedVideoCodec::H265: + return livekit::EncodedVideoCodec::kH265; + case EncodedVideoCodec::VP8: + return livekit::EncodedVideoCodec::kVP8; + case EncodedVideoCodec::VP9: + return livekit::EncodedVideoCodec::kVP9; + case EncodedVideoCodec::AV1: + return livekit::EncodedVideoCodec::kAV1; + } +} + +livekit::EncodedFrameType ToNativeEncodedFrameType(EncodedFrameType frame_type) { + switch (frame_type) { + case EncodedFrameType::Key: + return livekit::EncodedFrameType::kKey; + case EncodedFrameType::Delta: + return livekit::EncodedFrameType::kDelta; + } +} + +} // namespace VideoTrack::VideoTrack(std::shared_ptr 
rtc_runtime, webrtc::scoped_refptr track) @@ -164,6 +192,25 @@ bool VideoTrackSource::InternalSource::on_captured_frame( static_cast(buffer->height())}; } + // Pre-encoded access units bypass the adapter entirely: frame-rate and + // resolution adaptation operate on raw frames, and dropping or scaling an + // encoded delta frame would corrupt the bitstream for every receiver. + if (livekit::EncodedVideoFrameBuffer::FromNative(buffer.get())) { + if (packet_trailer_handler_) { + packet_trailer_handler_->emit_publish_timing( + VideoPublishTimingStage::EncoderUpload, + frame_metadata.has_packet_trailer ? frame_metadata.user_timestamp + : 0, + frame_metadata.has_packet_trailer ? frame_metadata.frame_id : 0); + } + OnFrame(webrtc::VideoFrame::Builder() + .set_video_frame_buffer(buffer) + .set_rotation(frame.rotation()) + .set_timestamp_us(aligned_timestamp_us) + .build()); + return true; + } + int adapted_width, adapted_height, crop_width, crop_height, crop_x, crop_y; if (!AdaptFrame(buffer->width(), buffer->height(), aligned_timestamp_us, &adapted_width, &adapted_height, &crop_width, &crop_height, @@ -246,6 +293,35 @@ bool VideoTrackSource::capture_dmabuf_frame(int dmabuf_fd, return source_->on_captured_frame(frame, frame_metadata); } +bool VideoTrackSource::capture_encoded_frame( + int width, + int height, + const EncodedVideoFrameData& encoded_frame, + rust::Slice payload, + const FrameMetadata& frame_metadata) const { + // The single unavoidable copy on this path: the Rust payload only lives + // for the duration of this call, while the EncodedImageBuffer is shared + // (uncopied) with the pass-through encoder downstream. 
+ auto buffer = webrtc::make_ref_counted( + width, height, ToNativeEncodedCodec(encoded_frame.codec), + ToNativeEncodedFrameType(encoded_frame.frame_type), + webrtc::EncodedImageBuffer::Create(payload.data(), payload.size()), + source_->keyframe_request_flag()); + + auto frame = webrtc::VideoFrame::Builder() + .set_video_frame_buffer(std::move(buffer)) + .set_rotation(webrtc::kVideoRotation_0) + .set_timestamp_us(encoded_frame.timestamp_us) + .build(); + + return source_->on_captured_frame(frame, frame_metadata); +} + +bool VideoTrackSource::take_keyframe_request() const { + return source_->keyframe_request_flag()->exchange(false, + std::memory_order_relaxed); +} + void VideoTrackSource::set_packet_trailer_handler( std::shared_ptr handler) const { source_->set_packet_trailer_handler(std::move(handler)); diff --git a/webrtc-sys/src/video_track.rs b/webrtc-sys/src/video_track.rs index 6c0a584e9..f7c8b354c 100644 --- a/webrtc-sys/src/video_track.rs +++ b/webrtc-sys/src/video_track.rs @@ -50,6 +50,30 @@ pub mod ffi { pub user_data: Vec, } + #[derive(Debug, Clone, Copy, PartialEq, Eq)] + #[repr(i32)] + pub enum EncodedVideoCodec { + H264, + H265, + VP8, + VP9, + AV1, + } + + #[derive(Debug, Clone, Copy, PartialEq, Eq)] + #[repr(i32)] + pub enum EncodedFrameType { + Key, + Delta, + } + + #[derive(Debug)] + pub struct EncodedVideoFrameData { + pub codec: EncodedVideoCodec, + pub frame_type: EncodedFrameType, + pub timestamp_us: i64, + } + extern "C++" { include!("livekit/video_frame.h"); include!("livekit/media_stream_track.h"); @@ -94,6 +118,15 @@ pub mod ffi { timestamp_us: i64, frame_metadata: &FrameMetadata, ) -> bool; + fn capture_encoded_frame( + self: &VideoTrackSource, + width: i32, + height: i32, + frame: &EncodedVideoFrameData, + payload: &[u8], + frame_metadata: &FrameMetadata, + ) -> bool; + fn take_keyframe_request(self: &VideoTrackSource) -> bool; fn set_packet_trailer_handler( self: &VideoTrackSource, handler: SharedPtr, diff --git 
a/webrtc-sys/src/webrtc.rs b/webrtc-sys/src/webrtc.rs index b7878341c..fc7b7d099 100644 --- a/webrtc-sys/src/webrtc.rs +++ b/webrtc-sys/src/webrtc.rs @@ -63,6 +63,7 @@ pub mod ffi { Nvenc, Vaapi, VideoToolbox, + PreEncoded, } unsafe extern "C++" {