From ba5ba4b4a22bdbb26ca71e3fec26adcdfbd5082a Mon Sep 17 00:00:00 2001 From: Alexandre Rulleau Date: Tue, 7 Apr 2026 17:08:21 +0200 Subject: [PATCH 1/8] fix(sidecar): retry connect on Windows to fix startup race in named-pipe IPC MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On Windows, a named-pipe client's CreateFileA fails immediately if the server has not yet called ConnectNamedPipe — there is no kernel-level connection backlog as Unix domain sockets provide. After daemonize() returns the child process exists but its Tokio runtime may not have entered the accept loop yet, causing test_ddog_sidecar_register_app to fail intermittently. Add connect_to_sidecar_with_retry() that retries the connection with exponential back-off (1 ms → 100 ms cap, up to 20 attempts ≈ 1.9 s total) when just_spawned is true. On non-Windows platforms is_pipe_not_ready() always returns false so the helper is a single-attempt no-op there. Remove the APMSP-2356 ignore annotation from test_ddog_sidecar_register_app so the fix is validated in CI. --- datadog-sidecar-ffi/tests/sidecar.rs | 4 --- datadog-sidecar/src/entry.rs | 54 +++++++++++++++++++++++++--- 2 files changed, 49 insertions(+), 9 deletions(-) diff --git a/datadog-sidecar-ffi/tests/sidecar.rs b/datadog-sidecar-ffi/tests/sidecar.rs index 1536995d2b..e086b89694 100644 --- a/datadog-sidecar-ffi/tests/sidecar.rs +++ b/datadog-sidecar-ffi/tests/sidecar.rs @@ -68,10 +68,6 @@ fn test_ddog_sidecar_connection() { } #[test] -#[cfg_attr( - target_os = "windows", - ignore = "APMSP-2356 Investigate flakiness on Windows" -)] #[cfg_attr(miri, ignore)] fn test_ddog_sidecar_register_app() { set_sidecar_per_process(); diff --git a/datadog-sidecar/src/entry.rs b/datadog-sidecar/src/entry.rs index 4da1491bd7..1033a1bba0 100644 --- a/datadog-sidecar/src/entry.rs +++ b/datadog-sidecar/src/entry.rs @@ -274,6 +274,50 @@ pub fn daemonize(listener: IpcServer, mut cfg: Config) -> anyhow::Result<()> { Ok(()) } +/// Returns `true` for transient Windows named-pipe errors that indicate the server is not +/// yet ready to accept connections (pipe instance doesn't exist yet, or all instances are +/// busy). Always returns `false` on non-Windows so the retry loop is a no-op there. +#[cfg(windows)] +fn is_pipe_not_ready(e: &io::Error) -> bool { + matches!( + e.kind(), + io::ErrorKind::NotFound + | io::ErrorKind::ConnectionRefused + | io::ErrorKind::WouldBlock + ) +} + +#[cfg(not(windows))] +fn is_pipe_not_ready(_e: &io::Error) -> bool { + false +} + +/// Connect to the sidecar, optionally retrying with exponential back-off. +/// +/// On Windows, a named-pipe client's `CreateFileA` fails immediately if the server has not +/// yet called `ConnectNamedPipe` — there is no kernel-level connection backlog like Unix +/// domain sockets provide. When `just_spawned` is `true` (we just called `daemonize()`), +/// we retry up to 20 times with doubling delays (1 ms → 100 ms cap, ≈ 1.9 s total budget) +/// to give the child time to start its Tokio runtime and enter the accept loop. +fn connect_to_sidecar_with_retry( + liaison: &impl setup::Liaison, + just_spawned: bool, +) -> io::Result { + let max_attempts: u32 = if just_spawned { 20 } else { 1 }; + let mut delay = std::time::Duration::from_millis(1); + for attempt in 0..max_attempts { + match liaison.connect_to_server() { + Ok(conn) => return Ok(SidecarTransport::from(conn)), + Err(e) if attempt + 1 < max_attempts && is_pipe_not_ready(&e) => { + std::thread::sleep(delay); + delay = (delay * 2).min(std::time::Duration::from_millis(100)); + } + Err(e) => return Err(e), + } + } + liaison.connect_to_server().map(SidecarTransport::from) +} + pub fn start_or_connect_to_sidecar(cfg: Config) -> anyhow::Result { // On Windows, named-pipe buffer sizes are fixed at creation time. Set the global before // attempt_listen so that the initial server pipe (created by this process and handed to the @@ -289,18 +333,18 @@ pub fn start_or_connect_to_sidecar(cfg: Config) -> anyhow::Result setup::DefaultLiason::ipc_per_process(), }; + let mut just_spawned = false; let err = match liaison.attempt_listen() { Ok(Some(listener)) => { daemonize(listener, cfg)?; + just_spawned = true; None } Ok(None) => None, err => err.context("Error starting sidecar").err(), }; - Ok(SidecarTransport::from( - liaison - .connect_to_server() - .map_err(|e| err.unwrap_or(e.into()))?, - )) + connect_to_sidecar_with_retry(&liaison, just_spawned) + .map_err(|e| err.unwrap_or(e.into())) + .map_err(Into::into) } From 5bdeb13f1e09c6af9b1c07f1abf7b4e33dd6f242 Mon Sep 17 00:00:00 2001 From: Alexandre Rulleau Date: Tue, 7 Apr 2026 17:29:02 +0200 Subject: [PATCH 2/8] fix(clippy): add parentheses for operator precedence in mapper.rs Fix clippy::precedence warning in libdd-trace-utils/src/otlp_encoder/mapper.rs: the shift operator binds tighter than bitwise OR, but explicit parentheses make the intent unambiguous. --- datadog-sidecar/src/entry.rs | 8 ++------ libdd-trace-utils/src/otlp_encoder/mapper.rs | 2 +- 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/datadog-sidecar/src/entry.rs b/datadog-sidecar/src/entry.rs index 1033a1bba0..1d5cf6c760 100644 --- a/datadog-sidecar/src/entry.rs +++ b/datadog-sidecar/src/entry.rs @@ -281,9 +281,7 @@ pub fn daemonize(listener: IpcServer, mut cfg: Config) -> anyhow::Result<()> { fn is_pipe_not_ready(e: &io::Error) -> bool { matches!( e.kind(), - io::ErrorKind::NotFound - | io::ErrorKind::ConnectionRefused - | io::ErrorKind::WouldBlock + io::ErrorKind::NotFound | io::ErrorKind::ConnectionRefused | io::ErrorKind::WouldBlock ) } @@ -344,7 +342,5 @@ pub fn start_or_connect_to_sidecar(cfg: Config) -> anyhow::Result err.context("Error starting sidecar").err(), }; - connect_to_sidecar_with_retry(&liaison, just_spawned) - .map_err(|e| err.unwrap_or(e.into())) - .map_err(Into::into) + connect_to_sidecar_with_retry(&liaison, just_spawned).map_err(|e| err.unwrap_or(e.into())) } diff --git a/libdd-trace-utils/src/otlp_encoder/mapper.rs b/libdd-trace-utils/src/otlp_encoder/mapper.rs index baa4ac6a89..399aa8cd88 100644 --- a/libdd-trace-utils/src/otlp_encoder/mapper.rs +++ b/libdd-trace-utils/src/otlp_encoder/mapper.rs @@ -173,7 +173,7 @@ fn map_span(span: &Span, resource_service: &str) -> OtlpSpan { } fn map_span_link(link: &SpanLink) -> OtlpSpanLink { - let trace_id_128 = (link.trace_id_high as u128) << 64 | (link.trace_id as u128); + let trace_id_128 = ((link.trace_id_high as u128) << 64) | (link.trace_id as u128); let trace_id_hex = format!("{:032x}", trace_id_128); let span_id_hex = format!("{:016x}", link.span_id); let trace_state = if link.tracestate.borrow().is_empty() { From b4d40b0976254d8a0454839a8e5a9f791cc86efd Mon Sep 17 00:00:00 2001 From: Alexandre Rulleau Date: Wed, 8 Apr 2026 11:13:41 +0200 Subject: [PATCH 3/8] fix(sidecar): address review feedback on Windows pipe retry logic - Remove WouldBlock from is_pipe_not_ready: CreateFileA uses synchronous mode and only returns ERROR_FILE_NOT_FOUND (NotFound) or ERROR_PIPE_BUSY (ConnectionRefused); WouldBlock can never appear on this path. Add comments mapping Windows error codes to io::ErrorKind variants. - Replace unreachable post-loop connect_to_server() call with unreachable!(): every loop iteration already returns via Ok, retry, or Err arm, making the trailing call dead code. - Add "Safe to block" comment on std::thread::sleep to clarify it is called from synchronous FFI context, not from within a Tokio task. --- datadog-sidecar/src/entry.rs | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/datadog-sidecar/src/entry.rs b/datadog-sidecar/src/entry.rs index 1d5cf6c760..a8cbef2a5c 100644 --- a/datadog-sidecar/src/entry.rs +++ b/datadog-sidecar/src/entry.rs @@ -279,9 +279,12 @@ pub fn daemonize(listener: IpcServer, mut cfg: Config) -> anyhow::Result<()> { /// busy). Always returns `false` on non-Windows so the retry loop is a no-op there. #[cfg(windows)] fn is_pipe_not_ready(e: &io::Error) -> bool { + // ERROR_FILE_NOT_FOUND → NotFound: pipe name not created yet + // ERROR_PIPE_BUSY → ConnectionRefused: all pipe instances are occupied + // (see SeqpacketConn::connect in datadog-ipc/src/platform/windows/sockets.rs) matches!( e.kind(), - io::ErrorKind::NotFound | io::ErrorKind::ConnectionRefused | io::ErrorKind::WouldBlock + io::ErrorKind::NotFound | io::ErrorKind::ConnectionRefused ) } @@ -307,13 +310,14 @@ fn connect_to_sidecar_with_retry( match liaison.connect_to_server() { Ok(conn) => return Ok(SidecarTransport::from(conn)), Err(e) if attempt + 1 < max_attempts && is_pipe_not_ready(&e) => { + // Safe to block: called from synchronous FFI context, never from a Tokio task. std::thread::sleep(delay); delay = (delay * 2).min(std::time::Duration::from_millis(100)); } Err(e) => return Err(e), } } - liaison.connect_to_server().map(SidecarTransport::from) + unreachable!("every loop iteration returns via Ok, retry, or Err arm") } pub fn start_or_connect_to_sidecar(cfg: Config) -> anyhow::Result { From 189eac7ff14c337b98ec607a7d1ffd93eaae6eed Mon Sep 17 00:00:00 2001 From: Alexandre Rulleau Date: Wed, 8 Apr 2026 14:42:18 +0200 Subject: [PATCH 4/8] fix(ipc): replace retry workaround with WaitNamedPipe + readiness signal Replace the exponential-backoff retry loop in start_or_connect_to_sidecar() with two proper Windows-native fixes that address the root cause: 1. WaitNamedPipeA in SeqpacketConn::connect(): instead of immediately returning ConnectionRefused on ERROR_PIPE_BUSY, block up to 5 s for a pipe instance to become available, then retry CreateFileA. 2. Readiness signal via named Windows event: setup_daemon_process() creates a named manual-reset event (Local\dd-sidecar-ready-), passes its name to the child via DD_SIDECAR_READY_EVENT, and returns a ReadinessWaiter that blocks on WaitForSingleObject(30 s). The child calls signal_sidecar_ready() inside acquire_listener (Tokio runtime running, listener ready) before entering the accept loop. daemonize() calls the waiter after wait_spawn() so the parent cannot connect before the pipe is guaranteed to exist. On Unix, domain sockets have a kernel-level connection backlog so neither fix is needed; setup_daemon_process returns a no-op ReadinessWaiter. --- datadog-ipc/src/platform/windows/sockets.rs | 42 ++++++++------ datadog-sidecar/src/entry.rs | 61 ++++---------------- datadog-sidecar/src/unix.rs | 7 ++- datadog-sidecar/src/windows.rs | 63 +++++++++++++++++++-- 4 files changed, 98 insertions(+), 75 deletions(-) diff --git a/datadog-ipc/src/platform/windows/sockets.rs b/datadog-ipc/src/platform/windows/sockets.rs index f0b6e007ee..6f0edee020 100644 --- a/datadog-ipc/src/platform/windows/sockets.rs +++ b/datadog-ipc/src/platform/windows/sockets.rs @@ -53,8 +53,8 @@ use windows_sys::Win32::Storage::FileSystem::{ ReadFile, WriteFile, FILE_FLAG_FIRST_PIPE_INSTANCE, FILE_FLAG_OVERLAPPED, PIPE_ACCESS_DUPLEX, }; use windows_sys::Win32::System::Pipes::{ - ConnectNamedPipe, CreateNamedPipeA, PeekNamedPipe, SetNamedPipeHandleState, PIPE_NOWAIT, - PIPE_READMODE_MESSAGE, PIPE_TYPE_MESSAGE, PIPE_UNLIMITED_INSTANCES, PIPE_WAIT, + ConnectNamedPipe, CreateNamedPipeA, PeekNamedPipe, SetNamedPipeHandleState, WaitNamedPipeA, + PIPE_NOWAIT, PIPE_READMODE_MESSAGE, PIPE_TYPE_MESSAGE, PIPE_UNLIMITED_INSTANCES, PIPE_WAIT, }; use windows_sys::Win32::System::Threading::{ CreateEventA, SetEvent, WaitForMultipleObjects, WaitForSingleObject, INFINITE, @@ -479,24 +479,30 @@ impl SeqpacketConn { use winapi::um::winnt::{GENERIC_READ, GENERIC_WRITE}; let name = path_to_null_terminated(path.as_ref()); - let h = unsafe { - CreateFileA( - name.as_ptr() as *const i8, - GENERIC_READ | GENERIC_WRITE, - 0, - null_mut(), - OPEN_EXISTING, - 0, // synchronous, non-overlapped - null_mut(), - ) - }; - if h == INVALID_HANDLE_VALUE { + let h = loop { + let h = unsafe { + CreateFileA( + name.as_ptr() as *const i8, + GENERIC_READ | GENERIC_WRITE, + 0, + null_mut(), + OPEN_EXISTING, + 0, // synchronous, non-overlapped + null_mut(), + ) + }; + if h != INVALID_HANDLE_VALUE { + break h; + } let err = io::Error::last_os_error(); - if err.raw_os_error() == Some(ERROR_PIPE_BUSY as i32) { - return Err(io::ErrorKind::ConnectionRefused.into()); + if err.raw_os_error() != Some(ERROR_PIPE_BUSY as i32) { + return Err(err); } - return Err(err); - } + // All pipe instances are busy — block up to 5 s for one to become available. + if unsafe { WaitNamedPipeA(name.as_ptr() as *const u8, 5000) } == 0 { + return Err(io::Error::last_os_error()); + } + }; // Upgrade to message read-mode. let mode = PIPE_READMODE_MESSAGE; diff --git a/datadog-sidecar/src/entry.rs b/datadog-sidecar/src/entry.rs index a8cbef2a5c..69166c30ed 100644 --- a/datadog-sidecar/src/entry.rs +++ b/datadog-sidecar/src/entry.rs @@ -31,6 +31,10 @@ use crate::tracer::SHM_LIMITER; use crate::watchdog::Watchdog; use crate::{ddog_daemon_entry_point, setup_daemon_process}; +/// Callable returned by `setup_daemon_process` that blocks until the child sidecar +/// signals it is ready to accept connections. +pub type ReadinessWaiter = Box io::Result<()>>; + /// Configuration for main_loop behavior pub struct MainLoopConfig { pub enable_ctrl_c_handler: bool, @@ -256,7 +260,7 @@ pub fn daemonize(listener: IpcServer, mut cfg: Config) -> anyhow::Result<()> { } spawn_cfg.append_env("LSAN_OPTIONS", "detect_leaks=0"); - setup_daemon_process(listener, &mut spawn_cfg)?; + let wait_ready = setup_daemon_process(listener, &mut spawn_cfg)?; let mut lib_deps = cfg.library_dependencies; if let Some(appsec) = cfg.appsec_config.as_ref() { @@ -271,53 +275,9 @@ pub fn daemonize(listener: IpcServer, mut cfg: Config) -> anyhow::Result<()> { .map_err(io::Error::other) .context("Could not spawn the sidecar daemon")?; - Ok(()) -} - -/// Returns `true` for transient Windows named-pipe errors that indicate the server is not -/// yet ready to accept connections (pipe instance doesn't exist yet, or all instances are -/// busy). Always returns `false` on non-Windows so the retry loop is a no-op there. -#[cfg(windows)] -fn is_pipe_not_ready(e: &io::Error) -> bool { - // ERROR_FILE_NOT_FOUND → NotFound: pipe name not created yet - // ERROR_PIPE_BUSY → ConnectionRefused: all pipe instances are occupied - // (see SeqpacketConn::connect in datadog-ipc/src/platform/windows/sockets.rs) - matches!( - e.kind(), - io::ErrorKind::NotFound | io::ErrorKind::ConnectionRefused - ) -} + wait_ready().map_err(|e| anyhow::anyhow!("Sidecar failed to signal readiness: {e}"))?; -#[cfg(not(windows))] -fn is_pipe_not_ready(_e: &io::Error) -> bool { - false -} - -/// Connect to the sidecar, optionally retrying with exponential back-off. -/// -/// On Windows, a named-pipe client's `CreateFileA` fails immediately if the server has not -/// yet called `ConnectNamedPipe` — there is no kernel-level connection backlog like Unix -/// domain sockets provide. When `just_spawned` is `true` (we just called `daemonize()`), -/// we retry up to 20 times with doubling delays (1 ms → 100 ms cap, ≈ 1.9 s total budget) -/// to give the child time to start its Tokio runtime and enter the accept loop. -fn connect_to_sidecar_with_retry( - liaison: &impl setup::Liaison, - just_spawned: bool, -) -> io::Result { - let max_attempts: u32 = if just_spawned { 20 } else { 1 }; - let mut delay = std::time::Duration::from_millis(1); - for attempt in 0..max_attempts { - match liaison.connect_to_server() { - Ok(conn) => return Ok(SidecarTransport::from(conn)), - Err(e) if attempt + 1 < max_attempts && is_pipe_not_ready(&e) => { - // Safe to block: called from synchronous FFI context, never from a Tokio task. - std::thread::sleep(delay); - delay = (delay * 2).min(std::time::Duration::from_millis(100)); - } - Err(e) => return Err(e), - } - } - unreachable!("every loop iteration returns via Ok, retry, or Err arm") + Ok(()) } pub fn start_or_connect_to_sidecar(cfg: Config) -> anyhow::Result { @@ -335,16 +295,17 @@ pub fn start_or_connect_to_sidecar(cfg: Config) -> anyhow::Result setup::DefaultLiason::ipc_per_process(), }; - let mut just_spawned = false; let err = match liaison.attempt_listen() { Ok(Some(listener)) => { daemonize(listener, cfg)?; - just_spawned = true; None } Ok(None) => None, err => err.context("Error starting sidecar").err(), }; - connect_to_sidecar_with_retry(&liaison, just_spawned).map_err(|e| err.unwrap_or(e.into())) + liaison + .connect_to_server() + .map(SidecarTransport::from) + .map_err(|e| err.unwrap_or(e.into())) } diff --git a/datadog-sidecar/src/unix.rs b/datadog-sidecar/src/unix.rs index be42b52d49..31bf062f34 100644 --- a/datadog-sidecar/src/unix.rs +++ b/datadog-sidecar/src/unix.rs @@ -146,14 +146,17 @@ async fn accept_socket_loop( pub fn setup_daemon_process( listener: SeqpacketListener, spawn_cfg: &mut SpawnWorker, -) -> io::Result<()> { +) -> io::Result { spawn_cfg .daemonize(true) .process_name("datadog-ipc-helper") .pass_fd(unsafe { OwnedFd::from_raw_fd(listener.into_raw_fd()) }) .stdin(Stdio::Null); - Ok(()) + // Unix domain sockets have a kernel-level connection backlog: clients can connect + // successfully as soon as bind()+listen() have been called, even before the sidecar + // has called accept(). No readiness wait is needed on Unix. + Ok(Box::new(|| Ok(()))) } pub fn primary_sidecar_identifier() -> u32 { diff --git a/datadog-sidecar/src/windows.rs b/datadog-sidecar/src/windows.rs index 9bebe20d1f..774187833c 100644 --- a/datadog-sidecar/src/windows.rs +++ b/datadog-sidecar/src/windows.rs @@ -20,6 +20,9 @@ use std::sync::{Arc, Mutex}; use std::time::Instant; use tokio::select; use tracing::{error, info}; +use winapi::um::processthreadsapi::GetCurrentProcessId; +use winapi::um::synchapi::{CreateEventA, OpenEventA, SetEvent, WaitForSingleObject}; +use winapi::um::winbase::WAIT_OBJECT_0; use winapi::um::winnt::HANDLE; use winapi::{ shared::{sddl::ConvertSidToStringSidA, winerror::ERROR_INSUFFICIENT_BUFFER}, @@ -63,6 +66,9 @@ pub extern "C" fn ddog_daemon_entry_point(_trampoline_data: &TrampolineData) { } }; + // Notify the parent that we are ready to accept connections. + signal_sidecar_ready(); + Ok(( |handler| accept_socket_loop(listener, closed_future, handler), cancel, @@ -98,12 +104,47 @@ async fn accept_socket_loop( Ok(()) } +/// Signal the parent process that the sidecar is ready to accept connections. +/// +/// The parent's `wait_ready` closure (returned by `setup_daemon_process`) blocks on the +/// named event until this function fires. If the env var is absent (thread mode, tests) +/// this is a no-op. +fn signal_sidecar_ready() { + if let Ok(name) = std::env::var("DD_SIDECAR_READY_EVENT") { + if let Ok(name_c) = std::ffi::CString::new(name) { + unsafe { + // EVENT_MODIFY_STATE = 0x0002 + let h = OpenEventA(0x0002, 0, name_c.as_ptr() as *const i8); + if !h.is_null() { + SetEvent(h); + CloseHandle(h); + } + } + } + } +} + pub fn setup_daemon_process( listener: SeqpacketListener, spawn_cfg: &mut SpawnWorker, -) -> io::Result<()> { - // Ensure unique process names - we spawn one sidecar per console session id (see - // setup/windows.rs for the reasoning) +) -> io::Result { + // Create a named manual-reset event that the child sidecar will signal once its + // Tokio runtime is running and it is about to enter the accept loop. + let pid = unsafe { GetCurrentProcessId() }; + let event_name = format!("Local\\dd-sidecar-ready-{}", pid); + let event_name_nul = format!("{}\0", event_name); + let event = unsafe { + CreateEventA( + null_mut(), // default security attributes + 1, // manual-reset + 0, // initially non-signaled + event_name_nul.as_ptr() as *const i8, + ) + }; + if event.is_null() { + return Err(io::Error::last_os_error()); + } + let raw = listener.into_raw_handle(); let owned = unsafe { OwnedHandle::from_raw_handle(raw) }; spawn_cfg @@ -113,8 +154,20 @@ pub fn setup_daemon_process( )) .pass_handle(owned) .stdin(Stdio::Null); - - Ok(()) + spawn_cfg.append_env("DD_SIDECAR_READY_EVENT", &event_name); + + // Store as usize so the closure is Send (raw HANDLE is *mut c_void). + let event_val = event as usize; + Ok(Box::new(move || { + let h = event_val as HANDLE; + let result = unsafe { WaitForSingleObject(h, 30_000) }; + unsafe { CloseHandle(h) }; + if result == WAIT_OBJECT_0 { + Ok(()) + } else { + Err(io::Error::last_os_error()) + } + })) } pub fn ddog_setup_crashtracking(endpoint: Option<&Endpoint>, metadata: Metadata) -> bool { From c65ffccb3a1a86093fb70789e5d45015a8b2747e Mon Sep 17 00:00:00 2001 From: Alexandre Rulleau Date: Wed, 8 Apr 2026 14:50:39 +0200 Subject: [PATCH 5/8] fix(ipc): remove unnecessary cast on WaitNamedPipeA argument --- datadog-ipc/src/platform/windows/sockets.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datadog-ipc/src/platform/windows/sockets.rs b/datadog-ipc/src/platform/windows/sockets.rs index 6f0edee020..aad4a6376b 100644 --- a/datadog-ipc/src/platform/windows/sockets.rs +++ b/datadog-ipc/src/platform/windows/sockets.rs @@ -499,7 +499,7 @@ impl SeqpacketConn { return Err(err); } // All pipe instances are busy — block up to 5 s for one to become available. - if unsafe { WaitNamedPipeA(name.as_ptr() as *const u8, 5000) } == 0 { + if unsafe { WaitNamedPipeA(name.as_ptr(), 5000) } == 0 { return Err(io::Error::last_os_error()); } }; From ecfdeb6d84a66832aac569e8fe88254daf07190f Mon Sep 17 00:00:00 2001 From: Alexandre Rulleau Date: Wed, 8 Apr 2026 15:32:52 +0200 Subject: [PATCH 6/8] =?UTF-8?q?fix(ipc):=20apply=20review=20suggestions=20?= =?UTF-8?q?=E2=80=94=20handle=20leak,=20timeout=20msg,=20bounded=20retry,?= =?UTF-8?q?=20constants?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- datadog-ipc/src/platform/windows/sockets.rs | 17 +++++- datadog-sidecar/src/windows.rs | 68 +++++++++++++++------ 2 files changed, 64 insertions(+), 21 deletions(-) diff --git a/datadog-ipc/src/platform/windows/sockets.rs b/datadog-ipc/src/platform/windows/sockets.rs index aad4a6376b..5b1f9d2df0 100644 --- a/datadog-ipc/src/platform/windows/sockets.rs +++ b/datadog-ipc/src/platform/windows/sockets.rs @@ -24,6 +24,7 @@ use crate::platform::message::MAX_FDS; use std::task::{Context, Poll}; +use std::time::{Duration, Instant}; use std::{ cell::RefCell, future::Future, @@ -479,6 +480,7 @@ impl SeqpacketConn { use winapi::um::winnt::{GENERIC_READ, GENERIC_WRITE}; let name = path_to_null_terminated(path.as_ref()); + let deadline = Instant::now() + Duration::from_secs(30); let h = loop { let h = unsafe { CreateFileA( @@ -498,8 +500,19 @@ impl SeqpacketConn { if err.raw_os_error() != Some(ERROR_PIPE_BUSY as i32) { return Err(err); } - // All pipe instances are busy — block up to 5 s for one to become available. - if unsafe { WaitNamedPipeA(name.as_ptr(), 5000) } == 0 { + // All pipe instances are busy — block up to 5 s (capped at remaining budget) + // for one to become available, then retry CreateFileA. + let remaining_ms = deadline + .saturating_duration_since(Instant::now()) + .as_millis(); + if remaining_ms == 0 { + return Err(io::Error::new( + io::ErrorKind::TimedOut, + "named pipe busy for 30 s: giving up", + )); + } + let wait_ms = remaining_ms.min(5000) as u32; + if unsafe { WaitNamedPipeA(name.as_ptr(), wait_ms) } == 0 { return Err(io::Error::last_os_error()); } }; diff --git a/datadog-sidecar/src/windows.rs b/datadog-sidecar/src/windows.rs index 774187833c..0f29b68b06 100644 --- a/datadog-sidecar/src/windows.rs +++ b/datadog-sidecar/src/windows.rs @@ -15,6 +15,7 @@ use std::ffi::CStr; use std::io::{self, Error}; use std::os::windows::io::{FromRawHandle, IntoRawHandle, OwnedHandle}; use std::ptr::null_mut; +use std::sync::atomic::{AtomicU64, Ordering}; use std::sync::LazyLock; use std::sync::{Arc, Mutex}; use std::time::Instant; @@ -22,7 +23,7 @@ use tokio::select; use tracing::{error, info}; use winapi::um::processthreadsapi::GetCurrentProcessId; use winapi::um::synchapi::{CreateEventA, OpenEventA, SetEvent, WaitForSingleObject}; -use winapi::um::winbase::WAIT_OBJECT_0; +use winapi::um::winbase::{WAIT_OBJECT_0, WAIT_TIMEOUT}; use winapi::um::winnt::HANDLE; use winapi::{ shared::{sddl::ConvertSidToStringSidA, winerror::ERROR_INSUFFICIENT_BUFFER}, @@ -31,10 +32,17 @@ use winapi::{ processthreadsapi::{GetCurrentProcess, OpenProcessToken}, securitybaseapi::{GetSidSubAuthority, GetSidSubAuthorityCount, GetTokenInformation}, winbase::LocalFree, - winnt::{TokenIntegrityLevel, TokenUser, TOKEN_MANDATORY_LABEL, TOKEN_QUERY, TOKEN_USER}, + winnt::{ + TokenIntegrityLevel, TokenUser, EVENT_MODIFY_STATE, TOKEN_MANDATORY_LABEL, TOKEN_QUERY, + TOKEN_USER, + }, }, }; +/// Monotonic counter so concurrent `setup_daemon_process` calls (e.g. in tests) +/// always get a unique readiness-event name within the same process. +static SPAWN_SEQ: AtomicU64 = AtomicU64::new(0); + /// cbindgen:ignore #[no_mangle] pub extern "C" fn ddog_daemon_entry_point(_trampoline_data: &TrampolineData) { @@ -113,32 +121,52 @@ fn signal_sidecar_ready() { if let Ok(name) = std::env::var("DD_SIDECAR_READY_EVENT") { if let Ok(name_c) = std::ffi::CString::new(name) { unsafe { - // EVENT_MODIFY_STATE = 0x0002 - let h = OpenEventA(0x0002, 0, name_c.as_ptr() as *const i8); + let h = OpenEventA(EVENT_MODIFY_STATE, 0, name_c.as_ptr()); if !h.is_null() { SetEvent(h); CloseHandle(h); + } else { + tracing::warn!( + "signal_sidecar_ready: OpenEventA failed: {:?}", + io::Error::last_os_error() + ); } } } } } +/// RAII wrapper that closes a Windows event handle on drop, ensuring no handle +/// leak even if the `ReadinessWaiter` closure is dropped without being called +/// (e.g. when `wait_spawn()` returns an error). +struct OwnedEvent(usize); + +impl Drop for OwnedEvent { + fn drop(&mut self) { + unsafe { CloseHandle(self.0 as HANDLE) }; + } +} + pub fn setup_daemon_process( listener: SeqpacketListener, spawn_cfg: &mut SpawnWorker, ) -> io::Result { // Create a named manual-reset event that the child sidecar will signal once its // Tokio runtime is running and it is about to enter the accept loop. + // + // The name includes the PID and a monotonic sequence number so that concurrent + // calls from the same process (e.g. in parallel tests) use distinct events. let pid = unsafe { GetCurrentProcessId() }; - let event_name = format!("Local\\dd-sidecar-ready-{}", pid); - let event_name_nul = format!("{}\0", event_name); + let seq = SPAWN_SEQ.fetch_add(1, Ordering::Relaxed); + let event_name = format!("Local\\dd-sidecar-ready-{}-{}", pid, seq); + let event_name_c = std::ffi::CString::new(event_name.as_str()) + .map_err(|e| io::Error::new(io::ErrorKind::InvalidInput, e))?; let event = unsafe { CreateEventA( - null_mut(), // default security attributes - 1, // manual-reset - 0, // initially non-signaled - event_name_nul.as_ptr() as *const i8, + null_mut(), // default security attributes + 1, // manual-reset + 0, // initially non-signaled + event_name_c.as_ptr(), // CString::as_ptr() → *const c_char, no extra cast needed ) }; if event.is_null() { @@ -156,16 +184,18 @@ pub fn setup_daemon_process( .stdin(Stdio::Null); spawn_cfg.append_env("DD_SIDECAR_READY_EVENT", &event_name); - // Store as usize so the closure is Send (raw HANDLE is *mut c_void). - let event_val = event as usize; + // OwnedEvent ensures CloseHandle is called whether the closure is called or dropped. + let guard = OwnedEvent(event as usize); Ok(Box::new(move || { - let h = event_val as HANDLE; - let result = unsafe { WaitForSingleObject(h, 30_000) }; - unsafe { CloseHandle(h) }; - if result == WAIT_OBJECT_0 { - Ok(()) - } else { - Err(io::Error::last_os_error()) + let result = unsafe { WaitForSingleObject(guard.0 as HANDLE, 30_000) }; + // `guard` is dropped here (or when the Box itself is dropped), closing the handle. + match result { + WAIT_OBJECT_0 => Ok(()), + WAIT_TIMEOUT => Err(io::Error::new( + io::ErrorKind::TimedOut, + "sidecar did not signal readiness within 30 s", + )), + _ => Err(io::Error::last_os_error()), // WAIT_FAILED } })) } From 4d433340e1727b07e14169de422cb96aacf7b783 Mon Sep 17 00:00:00 2001 From: Alexandre Rulleau Date: Wed, 8 Apr 2026 15:47:31 +0200 Subject: [PATCH 7/8] fix(sidecar): move WAIT_TIMEOUT import to winapi::shared::winerror --- datadog-sidecar/src/windows.rs | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/datadog-sidecar/src/windows.rs b/datadog-sidecar/src/windows.rs index 0f29b68b06..66d18c0887 100644 --- a/datadog-sidecar/src/windows.rs +++ b/datadog-sidecar/src/windows.rs @@ -23,10 +23,13 @@ use tokio::select; use tracing::{error, info}; use winapi::um::processthreadsapi::GetCurrentProcessId; use winapi::um::synchapi::{CreateEventA, OpenEventA, SetEvent, WaitForSingleObject}; -use winapi::um::winbase::{WAIT_OBJECT_0, WAIT_TIMEOUT}; +use winapi::um::winbase::WAIT_OBJECT_0; use winapi::um::winnt::HANDLE; use winapi::{ - shared::{sddl::ConvertSidToStringSidA, winerror::ERROR_INSUFFICIENT_BUFFER}, + shared::{ + sddl::ConvertSidToStringSidA, + winerror::{ERROR_INSUFFICIENT_BUFFER, WAIT_TIMEOUT}, + }, um::{ handleapi::CloseHandle, processthreadsapi::{GetCurrentProcess, OpenProcessToken}, From bb8deb214dfe4ea9fcfeaf53f919470a868db2c2 Mon Sep 17 00:00:00 2001 From: Alexandre Rulleau Date: Mon, 13 Apr 2026 13:18:42 +0200 Subject: [PATCH 8/8] fix(sidecar): store HANDLE directly in OwnedEvent instead of usize MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The pointer→integer→pointer round-trip (event as usize / guard.0 as HANDLE) creates a pointer with no provenance in Rust's strict provenance model, which can produce ERROR_INVALID_HANDLE (os error 6) at runtime. Store the HANDLE (*mut c_void) directly in OwnedEvent, eliminating all casts. Add `unsafe impl Send` with a SAFETY comment explaining why cross-thread use of a Windows HANDLE is valid. Also explicitly declare the winapi features used by this module (synchapi, handleapi, processthreadsapi) in Cargo.toml so they are guaranteed to be linked regardless of transitive dependencies. Fixes test_ddog_sidecar_register_app panicking with "Sidecar failed to signal readiness: The handle is invalid. (os error 6)". --- datadog-sidecar/Cargo.toml | 10 +++++++++- datadog-sidecar/src/windows.rs | 14 ++++++++++---- 2 files changed, 19 insertions(+), 5 deletions(-) diff --git a/datadog-sidecar/Cargo.toml b/datadog-sidecar/Cargo.toml index e0837e8f37..91bf7fb866 100644 --- a/datadog-sidecar/Cargo.toml +++ b/datadog-sidecar/Cargo.toml @@ -95,7 +95,15 @@ sendfd = { version = "0.4", features = ["tokio"] } [target.'cfg(windows)'.dependencies] libdd-common-ffi = { path = "../libdd-common-ffi", default-features = false } libdd-crashtracker-ffi = { path = "../libdd-crashtracker-ffi", default-features = false, features = ["collector", "collector_windows"] } -winapi = { version = "0.3.9", features = ["securitybaseapi", "sddl", "winerror", "winbase"] } +winapi = { version = "0.3.9", features = [ + "securitybaseapi", + "sddl", + "winerror", + "winbase", + "synchapi", + "handleapi", + "processthreadsapi", +] } windows-sys = { version = "0.52.0", features = ["Win32_System_SystemInformation"] } [target.'cfg(windows_seh_wrapper)'.dependencies] diff --git a/datadog-sidecar/src/windows.rs b/datadog-sidecar/src/windows.rs index 66d18c0887..c552f050ee 100644 --- a/datadog-sidecar/src/windows.rs +++ b/datadog-sidecar/src/windows.rs @@ -142,11 +142,17 @@ fn signal_sidecar_ready() { /// RAII wrapper that closes a Windows event handle on drop, ensuring no handle /// leak even if the `ReadinessWaiter` closure is dropped without being called /// (e.g. when `wait_spawn()` returns an error). -struct OwnedEvent(usize); +struct OwnedEvent(HANDLE); + +// SAFETY: A Windows HANDLE is an opaque process-global integer (pointer-sized +// for historical reasons). Any thread within the process can call CloseHandle / +// WaitForSingleObject on the same handle value, so transferring ownership across +// threads is safe. +unsafe impl Send for OwnedEvent {} impl Drop for OwnedEvent { fn drop(&mut self) { - unsafe { CloseHandle(self.0 as HANDLE) }; + unsafe { CloseHandle(self.0) }; } } @@ -188,9 +194,9 @@ pub fn setup_daemon_process( spawn_cfg.append_env("DD_SIDECAR_READY_EVENT", &event_name); // OwnedEvent ensures CloseHandle is called whether the closure is called or dropped. - let guard = OwnedEvent(event as usize); + let guard = OwnedEvent(event); Ok(Box::new(move || { - let result = unsafe { WaitForSingleObject(guard.0 as HANDLE, 30_000) }; + let result = unsafe { WaitForSingleObject(guard.0, 30_000) }; // `guard` is dropped here (or when the Box itself is dropped), closing the handle. match result { WAIT_OBJECT_0 => Ok(()),