NVIDIA · farbod-nv · May 16, 2026
diff --git a/src/viz/core/cpp/device_image.cpp b/src/viz/core/cpp/device_image.cpp
@@ -127,13 +127,17 @@ std::unique_ptr<DeviceImage> DeviceImage::create(const VkContext& ctx,
     {
         throw std::invalid_argument("DeviceImage: resolution must be non-zero");
     }
-    if (format != PixelFormat::kRGBA8)
+    if (format != PixelFormat::kRGBA8 && format != PixelFormat::kD32F)
     {
-        // kD32F is reserved for ProjectionLayer's depth path. The
-        // CUDA-Vulkan interop contract for a depth image (sample
-        // semantics, layout transitions, color-space view) is not
-        // worked out yet, so refuse to half-build it.
-        throw std::invalid_argument("DeviceImage: only PixelFormat::kRGBA8 is supported");
+        throw std::invalid_argument("DeviceImage: unsupported PixelFormat");
+    }
+    if (format == PixelFormat::kD32F && mip_levels > 1)
+    {
+        // Depth + mip chain is meaningless (filtering depth across mip
+        // levels produces incorrect occlusion) and would need a
+        // special-cased blit-down pipeline, so reject it explicitly.
+        // NOTE(review): mip_levels == 0 (auto chain) passes this check — confirm.
+        throw std::invalid_argument("DeviceImage: kD32F does not support mip_levels > 1");
     }
     // mip_levels == 0 -> auto-compute full chain to 1x1.
     if (mip_levels == 0)

diff --git a/src/viz/layers/cpp/CMakeLists.txt b/src/viz/layers/cpp/CMakeLists.txt
@@ -10,7 +10,9 @@ cmake_minimum_required(VERSION 3.20)
 # viz/layers_tests/.
 add_library(viz_layers STATIC
     quad_layer.cpp
+    projection_layer.cpp
     inc/viz/layers/quad_layer.hpp
+    inc/viz/layers/projection_layer.hpp
 )
 
 target_include_directories(viz_layers

diff --git a/src/viz/layers/cpp/inc/viz/layers/projection_layer.hpp b/src/viz/layers/cpp/inc/viz/layers/projection_layer.hpp
@@ -0,0 +1,200 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include <viz/core/device_image.hpp>
+#include <viz/core/viz_buffer.hpp>
+#include <viz/core/viz_types.hpp>
+#include <viz/session/layer_base.hpp>
+#include <vulkan/vulkan.h>
+
+#include <array>
+#include <atomic>
+#include <cstdint>
+#include <cuda_runtime.h>
+#include <memory>
+#include <optional>
+#include <string>
+#include <vector>
+
+namespace viz
+{
+
+class VkContext;
+
+// ProjectionLayer: full-view RGBD composited into the shared render
+// target. Designed for renderers (gsplat, nvblox, neural reconstruction)
+// that produce (color, depth) buffers per frame.
+//
+// Frame loop contract — IMPORTANT:
+//
+//     info = session.begin_frame()                    // xrLocateViews
+//     color, depth = renderer.render(info.views)      // render against THIS frame's views
+//     layer.submit(color, depth)                      // publish for THIS frame
+//     session.end_frame()                             // composite + xrEndFrame
+//
+// ``submit()`` MUST be called between ``begin_frame()`` and
+// ``end_frame()``. The renderer MUST render against
+// ``info.views[i].pose`` (the predicted-display-time pose for this
+// frame). The runtime / CloudXR paces the application via xrWaitFrame;
+// if the renderer is slower than the display rate, the runtime's
+// compositor reprojects the last submitted frame at display rate while
+// the app's framerate matches the renderer's speed.
+//
+// In ``kXr``, a visible ProjectionLayer that does NOT receive a
+// ``submit()`` for the current frame is SKIPPED at record time (the
+// layer's region of the shared RT keeps the clear color). This prevents
+// the runtime from compositing a previous frame's RGBD content under
+// the current frame's projection-layer pose, which would produce a
+// visible reprojection error. In ``kWindow`` / ``kOffscreen`` the
+// freshness gate is off — the most recent publish stays on screen until
+// replaced (the QuadLayer pattern), since no XR pose mismatch is possible.
+//
+// Mailbox: kSlotCount per-eye (color, depth) DeviceImage pairs. submit()
+// picks a slot that's neither ``latest_`` nor in any ``in_use_`` entry,
+// memcpys + signals cuda_done_writing on the caller's stream, blocks on
+// cudaStreamSynchronize so the caller can re-use source buffers
+// immediately, then atomically promotes the slot to ``latest_``.
+// record_pre_render_pass promotes ``latest_`` to ``in_use_[in_flight_slot]``.
+//
+// Stereo: when Config::stereo is true, the layer allocates paired
+// (left, right) storage per slot. submit() must ship both eyes on a
+// single CUDA stream; stream ordering keeps the pair atomic. In kXr
+// view 0 (left eye) samples the left buffer, view 1 (right eye) the
+// right. In kWindow / kOffscreen the left buffer is sampled.
+//
+// Memory (per layer):
+//   mono   1024² RGBA8+D32F: 7 slots × 1024² × 8 B    ≈  56 MiB
+//   stereo 1024² RGBA8+D32F:                          ≈ 112 MiB
+//   stereo 2048² RGBA8+D32F:                          ≈ 448 MiB
+class ProjectionLayer : public LayerBase
+{
+public:
+    // Covers backend image counts up to 5; +2 leaves room for latest_ and one free slot.
+    static constexpr uint32_t kMaxFramesInFlight = 5;
+    static constexpr uint32_t kSlotCount = kMaxFramesInFlight + 2;
+
+    struct Config
+    {
+        std::string name = "ProjectionLayer";
+        Resolution view_resolution{};
+        PixelFormat color_format = PixelFormat::kRGBA8;
+
+        // nullopt → no depth buffer allocated; ProjectionLayer always
+        // writes gl_FragDepth = 1.0 (far). Without depth, this layer
+        // loses Z-compositing with QuadLayer. Useful for renderers that
+        // genuinely have no depth (sky / background fills).
+        std::optional<PixelFormat> depth_format = PixelFormat::kD32F;
+
+        // true → per-eye paired storage. submit MUST ship both eyes.
+        // In kWindow / kOffscreen the LEFT buffer is sampled; in kXr
+        // view 0 → LEFT, view 1 → RIGHT.
+        bool stereo = false;
+    };
+
+    ProjectionLayer(const VkContext& ctx, VkRenderPass render_pass, Config config);
+    ~ProjectionLayer() override;
+    void destroy();
+
+    ProjectionLayer(const ProjectionLayer&) = delete;
+    ProjectionLayer& operator=(const ProjectionLayer&) = delete;
+
+    // Publish a frame. Each buffer is a CUDA-linear VizBuffer (kDevice
+    // space) matching the layer's resolution and the relevant format
+    // (color → color_format, depth → kD32F). Validated against the
+    // config; mismatch throws std::invalid_argument.
+    //
+    // Mono no-depth:           submit(color)
+    // Mono with depth:         submit(color, &depth)
+    // Stereo no-depth:         submit(left_color, nullptr, &right_color, nullptr)
+    // Stereo with depth:       submit(left_color, &left_depth, &right_color, &right_depth)
+    //
+    // submit() does one cudaMemcpy2DToArrayAsync per provided buffer
+    // on ``stream``, signals cuda_done_writing on the same stream, then
+    // BLOCKS on cudaStreamSynchronize so the caller can re-use source
+    // buffers immediately. Cost: ~0.5 ms / 1024² color + depth on a
+    // discrete GPU.
+    //
+    // Marks the layer "fresh for this frame" so record() will draw it.
+    // VizSession::begin_frame clears the flag at the start of each
+    // frame; a renderer that doesn't submit will see its content
+    // skipped in kXr.
+    //
+    // GIL: pybind binding releases the GIL across this whole call.
+    void submit(const VizBuffer& left_color,
+                const VizBuffer* left_depth = nullptr,
+                const VizBuffer* right_color = nullptr,
+                const VizBuffer* right_depth = nullptr,
+                cudaStream_t stream = 0);
+
+    // LayerBase contract.
+    void on_frame_begin() override; // clears submitted_this_frame_ flag
+    void record_pre_render_pass(VkCommandBuffer cmd, uint32_t in_flight_slot) override;
+    void record(VkCommandBuffer cmd,
+                const std::vector<ViewInfo>& views,
+                const RenderTarget& target,
+                uint32_t in_flight_slot) override;
+
+    // cuda_done_writing waits for color + depth of every active view in
+    // the in-use slot. kSlotNone → empty vector.
+    std::vector<LayerBase::WaitSemaphore> get_wait_semaphores() const override;
+
+    // Accessors.
+    Resolution view_resolution() const noexcept;
+    PixelFormat color_format() const noexcept;
+    std::optional<PixelFormat> depth_format() const noexcept;
+    bool is_stereo() const noexcept;
+    uint32_t view_count() const noexcept;
+
+    // Diagnostic — nullptr outside valid ranges.
+    const DeviceImage* color_image(uint32_t slot, uint32_t view) const noexcept;
+    const DeviceImage* depth_image(uint32_t slot, uint32_t view) const noexcept;
+
+private:
+    static constexpr uint8_t kSlotNone = 0xFF; // sentinel: no slot selected
+
+    void init();
+    void create_sampler();
+    void create_descriptor_set_layout();
+    void create_pipeline_layout();
+    void create_pipeline();
+    void create_descriptor_pool();
+    void allocate_descriptor_sets();
+    void update_descriptor_sets();
+
+    uint8_t pick_free_slot() const noexcept;
+
+    void validate_submit_buffer(const VizBuffer& buf, PixelFormat expected_format, const char* label) const;
+    void enqueue_copy(const VizBuffer& src, DeviceImage& dst, cudaStream_t stream) const;
+
+    const VkContext* ctx_ = nullptr;
+    VkRenderPass render_pass_ = VK_NULL_HANDLE; // borrowed
+    Config config_;
+    uint32_t view_count_ = 1;
+    bool has_depth_ = true;
+
+    std::array<std::vector<std::unique_ptr<DeviceImage>>, kSlotCount> slots_color_;
+    std::array<std::vector<std::unique_ptr<DeviceImage>>, kSlotCount> slots_depth_;
+
+    VkSampler color_sampler_ = VK_NULL_HANDLE;
+    VkSampler depth_sampler_ = VK_NULL_HANDLE;
+    VkDescriptorSetLayout descriptor_set_layout_ = VK_NULL_HANDLE;
+    VkPipelineLayout pipeline_layout_ = VK_NULL_HANDLE;
+    VkPipeline pipeline_with_depth_ = VK_NULL_HANDLE;
+    VkPipeline pipeline_no_depth_ = VK_NULL_HANDLE;
+
+    VkDescriptorPool descriptor_pool_ = VK_NULL_HANDLE;
+    std::array<std::vector<VkDescriptorSet>, kSlotCount> descriptor_sets_;
+
+    // Mailbox. NOTE(review): in_use_{} value-inits entries to 0 (slot 0), not kSlotNone — confirm init() resets.
+    std::atomic<uint8_t> latest_{ kSlotNone };
+    std::array<std::atomic<uint8_t>, kMaxFramesInFlight> in_use_{};
+    std::atomic<uint8_t> last_in_use_slot_{ kSlotNone };
+
+    // Set by submit(), cleared by on_frame_begin(). record() consults
+    // this in kXr to gate stale-RGBD-under-new-pose composites.
+    std::atomic<bool> submitted_this_frame_{ false };
+};
+
+} // namespace viz