From 77dd9fa8227885327c7672efd81839eb77b2aa2e Mon Sep 17 00:00:00 2001 From: Taegyun Kim Date: Thu, 4 Dec 2025 17:25:13 -0500 Subject: [PATCH 01/43] feat(profiling): python 3.14 support --- .riot/requirements/16ed166.txt | 32 +++ .riot/requirements/170c255.txt | 32 +++ .riot/requirements/173f5b3.txt | 36 ++++ .riot/requirements/1a4c947.txt | 31 +++ .riot/requirements/72ed1ec.txt | 32 +++ .../stack_v2/echion/echion/cpython/tasks.h | 107 ++++++++- .../profiling/stack_v2/echion/echion/frame.h | 11 +- .../stack_v2/echion/echion/greenlets.h | 5 + .../profiling/stack_v2/echion/echion/state.h | 4 + .../profiling/stack_v2/echion/echion/tasks.h | 204 ++++++++++++++++-- .../stack_v2/echion/echion/threads.h | 67 ++++-- .../profiling/stack_v2/src/echion/frame.cc | 43 +++- .../profiling/stack_v2/src/sampler.cpp | 4 +- ddtrace/internal/settings/profiling.py | 6 +- ddtrace/profiling/_asyncio.py | 50 +++-- ddtrace/profiling/collector/threading.py | 1 + ...rofiling-314-support-bc850ac5330c27fc.yaml | 3 + riotfile.py | 7 +- setup.py | 64 +++--- tests/profiling/collector/test_generators.py | 83 ++++--- tests/smoke_test.py | 2 +- 21 files changed, 692 insertions(+), 132 deletions(-) create mode 100644 .riot/requirements/16ed166.txt create mode 100644 .riot/requirements/170c255.txt create mode 100644 .riot/requirements/173f5b3.txt create mode 100644 .riot/requirements/1a4c947.txt create mode 100644 .riot/requirements/72ed1ec.txt create mode 100644 releasenotes/notes/profiling-314-support-bc850ac5330c27fc.yaml diff --git a/.riot/requirements/16ed166.txt b/.riot/requirements/16ed166.txt new file mode 100644 index 00000000000..ff1e68a0328 --- /dev/null +++ b/.riot/requirements/16ed166.txt @@ -0,0 +1,32 @@ +# +# This file is autogenerated by pip-compile with Python 3.14 +# by the following command: +# +# pip-compile --allow-unsafe --no-annotate .riot/requirements/16ed166.in +# +attrs==25.4.0 +coverage[toml]==7.12.0 +gunicorn==23.0.0 +hypothesis==6.45.0 +iniconfig==2.3.0 +jsonschema==4.25.1 
+jsonschema-specifications==2025.9.1 +mock==5.2.0 +opentracing==2.4.0 +packaging==25.0 +pluggy==1.6.0 +protobuf==6.33.1 +py-cpuinfo==8.0.0 +pygments==2.19.2 +pytest==9.0.1 +pytest-asyncio==0.21.1 +pytest-benchmark==5.2.3 +pytest-cov==7.0.0 +pytest-cpp==2.6.0 +pytest-mock==3.15.1 +pytest-randomly==4.0.1 +referencing==0.37.0 +rpds-py==0.30.0 +sortedcontainers==2.4.0 +uwsgi==2.0.29 +zstandard==0.25.0 diff --git a/.riot/requirements/170c255.txt b/.riot/requirements/170c255.txt new file mode 100644 index 00000000000..30657486a78 --- /dev/null +++ b/.riot/requirements/170c255.txt @@ -0,0 +1,32 @@ +# +# This file is autogenerated by pip-compile with Python 3.14 +# by the following command: +# +# pip-compile --allow-unsafe --no-annotate .riot/requirements/170c255.in +# +attrs==25.4.0 +coverage[toml]==7.12.0 +gunicorn==23.0.0 +hypothesis==6.45.0 +iniconfig==2.3.0 +jsonschema==4.25.1 +jsonschema-specifications==2025.9.1 +mock==5.2.0 +opentracing==2.4.0 +packaging==25.0 +pluggy==1.6.0 +protobuf==4.22.0 +py-cpuinfo==8.0.0 +pygments==2.19.2 +pytest==9.0.1 +pytest-asyncio==0.21.1 +pytest-benchmark==5.2.3 +pytest-cov==7.0.0 +pytest-cpp==2.6.0 +pytest-mock==3.15.1 +pytest-randomly==4.0.1 +referencing==0.37.0 +rpds-py==0.30.0 +sortedcontainers==2.4.0 +uwsgi==2.0.31 +zstandard==0.25.0 diff --git a/.riot/requirements/173f5b3.txt b/.riot/requirements/173f5b3.txt new file mode 100644 index 00000000000..83216fb74bd --- /dev/null +++ b/.riot/requirements/173f5b3.txt @@ -0,0 +1,36 @@ +# +# This file is autogenerated by pip-compile with Python 3.14 +# by the following command: +# +# pip-compile --allow-unsafe --no-annotate .riot/requirements/173f5b3.in +# +attrs==25.4.0 +coverage[toml]==7.12.0 +gevent==25.9.1 +greenlet==3.3.0 +gunicorn[gevent]==23.0.0 +hypothesis==6.45.0 +iniconfig==2.3.0 +jsonschema==4.25.1 +jsonschema-specifications==2025.9.1 +mock==5.2.0 +opentracing==2.4.0 +packaging==25.0 +pluggy==1.6.0 +protobuf==6.33.1 +py-cpuinfo==8.0.0 +pygments==2.19.2 +pytest==9.0.1 
+pytest-asyncio==0.21.1 +pytest-benchmark==5.2.3 +pytest-cov==7.0.0 +pytest-cpp==2.6.0 +pytest-mock==3.15.1 +pytest-randomly==4.0.1 +referencing==0.37.0 +rpds-py==0.30.0 +sortedcontainers==2.4.0 +uwsgi==2.0.31 +zope-event==6.1 +zope-interface==8.1.1 +zstandard==0.25.0 diff --git a/.riot/requirements/1a4c947.txt b/.riot/requirements/1a4c947.txt new file mode 100644 index 00000000000..ae55f5306be --- /dev/null +++ b/.riot/requirements/1a4c947.txt @@ -0,0 +1,31 @@ +# +# This file is autogenerated by pip-compile with Python 3.14 +# by the following command: +# +# pip-compile --allow-unsafe --no-annotate .riot/requirements/1a4c947.in +# +attrs==25.4.0 +coverage[toml]==7.12.0 +gunicorn==23.0.0 +hypothesis==6.45.0 +iniconfig==2.3.0 +jsonschema==4.25.1 +jsonschema-specifications==2025.9.1 +mock==5.2.0 +opentracing==2.4.0 +packaging==25.0 +pluggy==1.6.0 +protobuf==6.33.1 +py-cpuinfo==8.0.0 +pygments==2.19.2 +pytest==9.0.1 +pytest-asyncio==0.21.1 +pytest-benchmark==5.2.3 +pytest-cov==7.0.0 +pytest-cpp==2.6.0 +pytest-mock==3.15.1 +pytest-randomly==4.0.1 +referencing==0.37.0 +rpds-py==0.30.0 +sortedcontainers==2.4.0 +zstandard==0.25.0 diff --git a/.riot/requirements/72ed1ec.txt b/.riot/requirements/72ed1ec.txt new file mode 100644 index 00000000000..453ed140c3d --- /dev/null +++ b/.riot/requirements/72ed1ec.txt @@ -0,0 +1,32 @@ +# +# This file is autogenerated by pip-compile with Python 3.14 +# by the following command: +# +# pip-compile --allow-unsafe --no-annotate .riot/requirements/72ed1ec.in +# +attrs==25.4.0 +coverage[toml]==7.12.0 +gunicorn==23.0.0 +hypothesis==6.45.0 +iniconfig==2.3.0 +jsonschema==4.25.1 +jsonschema-specifications==2025.9.1 +mock==5.2.0 +opentracing==2.4.0 +packaging==25.0 +pluggy==1.6.0 +protobuf==6.33.1 +py-cpuinfo==8.0.0 +pygments==2.19.2 +pytest==9.0.1 +pytest-asyncio==0.21.1 +pytest-benchmark==5.2.3 +pytest-cov==7.0.0 +pytest-cpp==2.6.0 +pytest-mock==3.15.1 +pytest-randomly==4.0.1 +referencing==0.37.0 +rpds-py==0.30.0 +sortedcontainers==2.4.0 
+uwsgi==2.0.31 +zstandard==0.25.0 diff --git a/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/cpython/tasks.h b/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/cpython/tasks.h index dbdfada1832..83b23e5338f 100644 --- a/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/cpython/tasks.h +++ b/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/cpython/tasks.h @@ -11,7 +11,15 @@ #include #define Py_BUILD_CORE -#if PY_VERSION_HEX >= 0x030d0000 +#if PY_VERSION_HEX >= 0x030e0000 +// Python 3.14+: _PyInterpreterFrame moved to new header +#include +#include +#include +#include // For llist_node structure +#include +#include +#elif PY_VERSION_HEX >= 0x030d0000 #include #else #include @@ -38,7 +46,32 @@ extern "C" STATE_FINISHED } fut_state; -#if PY_VERSION_HEX >= 0x030d0000 +#if PY_VERSION_HEX >= 0x030e0000 +// Python 3.14+: New fields added (awaited_by, is_task, awaited_by_is_set) +#define FutureObj_HEAD(prefix) \ + PyObject_HEAD PyObject* prefix##_loop; \ + PyObject* prefix##_callback0; \ + PyObject* prefix##_context0; \ + PyObject* prefix##_callbacks; \ + PyObject* prefix##_exception; \ + PyObject* prefix##_exception_tb; \ + PyObject* prefix##_result; \ + PyObject* prefix##_source_tb; \ + PyObject* prefix##_cancel_msg; \ + PyObject* prefix##_cancelled_exc; \ + PyObject* prefix##_awaited_by; \ + fut_state prefix##_state; \ + /* Used by profilers to make traversing the stack from an external \ + process faster. */ \ + char prefix##_is_task; \ + char prefix##_awaited_by_is_set; \ + /* These bitfields need to be at the end of the struct \ + so that these and bitfields from TaskObj are contiguous. 
\ + */ \ + unsigned prefix##_log_tb : 1; \ + unsigned prefix##_blocking : 1; + +#elif PY_VERSION_HEX >= 0x030d0000 #define FutureObj_HEAD(prefix) \ PyObject_HEAD PyObject* prefix##_loop; \ PyObject* prefix##_callback0; \ @@ -131,7 +164,24 @@ extern "C" FutureObj_HEAD(future) } FutureObj; -#if PY_VERSION_HEX >= 0x030d0000 +#if PY_VERSION_HEX >= 0x030e0000 + // Python 3.14+: TaskObj includes task_node for linked-list storage + typedef struct + { + FutureObj_HEAD(task) unsigned task_must_cancel : 1; + unsigned task_log_destroy_pending : 1; + int task_num_cancels_requested; + PyObject* task_fut_waiter; + PyObject* task_coro; + PyObject* task_name; + PyObject* task_context; + struct llist_node task_node; +#ifdef Py_GIL_DISABLED + // thread id of the thread where this task was created + uintptr_t task_tid; +#endif + } TaskObj; +#elif PY_VERSION_HEX >= 0x030d0000 typedef struct { FutureObj_HEAD(task) unsigned task_must_cancel : 1; @@ -173,7 +223,56 @@ extern "C" #define RESUME_QUICK INSTRUMENTED_RESUME #endif -#if PY_VERSION_HEX >= 0x030d0000 +#if PY_VERSION_HEX >= 0x030e0000 + // Python 3.14+: Use stackpointer and _PyStackRef + // We can't use CPython API helpers as we're copying partial structs + inline PyObject* PyGen_yf(PyGenObject* gen, PyObject* frame_addr) + { + if (gen->gi_frame_state != FRAME_SUSPENDED_YIELD_FROM) { + return nullptr; + } + + _PyInterpreterFrame frame; + if (copy_type(frame_addr, frame)) { + return nullptr; + } + + // Get the code object from f_executable.bits to know co_nlocalsplus + PyCodeObject code; + PyCodeObject* code_ptr = reinterpret_cast(frame.f_executable.bits); + if (copy_type(code_ptr, code)) { + return nullptr; + } + + // Calculate addresses in remote process + uintptr_t frame_addr_uint = reinterpret_cast(frame_addr); + uintptr_t localsplus_addr = frame_addr_uint + offsetof(_PyInterpreterFrame, localsplus); + uintptr_t stackbase_addr = localsplus_addr + code.co_nlocalsplus * sizeof(_PyStackRef); + + // stackpointer is a pointer field 
- when copied, it contains the remote address + // Calculate stacktop from pointer difference + uintptr_t stackpointer_addr = reinterpret_cast(frame.stackpointer); + if (stackpointer_addr < stackbase_addr) { + return nullptr; + } + + int stacktop = (int)((stackpointer_addr - stackbase_addr) / sizeof(_PyStackRef)); + + if (stacktop < 1 || stacktop > MAX_STACK_SIZE) { + return nullptr; + } + + // Read the top of stack directly from remote memory + _PyStackRef top_ref; + if (copy_type(reinterpret_cast(stackpointer_addr - sizeof(_PyStackRef)), top_ref)) { + return nullptr; + } + + // Extract PyObject* from _PyStackRef.bits + return reinterpret_cast(top_ref.bits); + } + +#elif PY_VERSION_HEX >= 0x030d0000 inline PyObject* PyGen_yf(PyGenObject* gen, PyObject* frame_addr) { diff --git a/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/frame.h b/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/frame.h index 1ffd0f7b5c3..72e9de0fdcc 100644 --- a/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/frame.h +++ b/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/frame.h @@ -14,11 +14,18 @@ #undef _PyGC_FINALIZED #endif #include -#if PY_VERSION_HEX >= 0x030d0000 +#if PY_VERSION_HEX >= 0x030e0000 +// Python 3.14+: _PyInterpreterFrame moved to new header +#define Py_BUILD_CORE +#include +#include // Needed for complete PyFrameObject definition +#include +#include +#elif PY_VERSION_HEX >= 0x030d0000 #define Py_BUILD_CORE #include #endif // PY_VERSION_HEX >= 0x030d0000 -#if PY_VERSION_HEX >= 0x030b0000 +#if PY_VERSION_HEX >= 0x030b0000 && PY_VERSION_HEX < 0x030e0000 #define Py_BUILD_CORE #include #endif diff --git a/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/greenlets.h b/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/greenlets.h index 997171f33c7..5448361f9c8 100644 --- a/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/greenlets.h +++ b/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/greenlets.h @@ -7,6 
+7,11 @@ #include #define Py_BUILD_CORE +#if PY_VERSION_HEX >= 0x030e0000 +// Python 3.14+: Need internal/pycore_frame.h for struct _frame (PyFrameObject) definition +#include +#endif + #include #include diff --git a/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/state.h b/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/state.h index 62657538fd6..90ae634e540 100644 --- a/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/state.h +++ b/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/state.h @@ -17,6 +17,10 @@ #endif #define Py_BUILD_CORE #include +#if PY_VERSION_HEX >= 0x030e0000 +// Python 3.14+: _PyRuntime is declared in pycore_runtime.h +#include +#endif #include diff --git a/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/tasks.h b/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/tasks.h index 848af75aef2..cf5bab9fc6b 100644 --- a/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/tasks.h +++ b/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/tasks.h @@ -8,15 +8,25 @@ #define PY_SSIZE_T_CLEAN #include +#include #include #if PY_VERSION_HEX >= 0x030b0000 #include #define Py_BUILD_CORE -#if PY_VERSION_HEX >= 0x030d0000 +#if PY_VERSION_HEX >= 0x030e0000 +#include // For offsetof macro +#include // for FRAME_CLEARED +#include // For PyInterpreterState +#include // For llist_node structure +#include +// Note: _PyThreadStateImpl is already available via echion/state.h which includes +// with Py_BUILD_CORE defined. +#elif PY_VERSION_HEX >= 0x030d0000 #include #else +#include #include #endif // PY_VERSION_HEX >= 0x030d0000 #else @@ -134,11 +144,14 @@ GenInfo::create(PyObject* gen_addr) auto frame = (PyObject*)gen.gi_frame; #endif +#if PY_VERSION_HEX >= 0x030a0000 && PY_VERSION_HEX < 0x030b0000 + // Python 3.10: Need PyFrameObject for _PyFrame_IsExecuting PyFrameObject f; if (copy_type(frame, f)) { recursion_depth--; return ErrorKind::GenInfoError; } +#endif PyObject* yf = (frame != NULL ? 
PyGen_yf(&gen, frame) : NULL); GenInfo::Ptr await = nullptr; @@ -287,15 +300,171 @@ TaskInfo::current(PyObject* loop) return TaskInfo::create(reinterpret_cast(task)); } +// ---------------------------------------------------------------------------- +#if PY_VERSION_HEX >= 0x030e0000 +// Python 3.14+: Get tasks from a single thread's linked-list +inline void +get_tasks_from_linked_list(uintptr_t head_addr, PyObject* loop, std::vector& tasks) +{ + if (head_addr == 0 || loop == NULL) { + return; + } + + // Copy head node struct from remote memory to local memory + struct llist_node head_node_local; + if (copy_type(reinterpret_cast(head_addr), head_node_local)) { + return; + } + + // Check if list is empty (head points to itself in circular list) + uintptr_t head_addr_uint = head_addr; + uintptr_t next_as_uint = reinterpret_cast(head_node_local.next); + uintptr_t prev_as_uint = reinterpret_cast(head_node_local.prev); + if (next_as_uint == head_addr_uint && prev_as_uint == head_addr_uint) { + return; + } + + struct llist_node current_node = head_node_local; // Start with head node + uintptr_t current_node_addr = head_addr; // Address of current node + + // Copied from CPython's _remote_debugging_module.c: MAX_ITERATIONS + const size_t MAX_ITERATIONS = 2 << 15; + size_t iteration_count = 0; + + // Iterate over linked-list. The linked list is circular, so we stop + // when we're back at head. 
+ while (reinterpret_cast(current_node.next) != head_addr_uint) { + // Safety: prevent infinite loops + if (++iteration_count > MAX_ITERATIONS) { + return; + } + + if (current_node.next == NULL) { + return; // NULL pointer - invalid list + } + + uintptr_t next_node_addr = reinterpret_cast(current_node.next); + + // Calculate task_addr from current_node.next + size_t task_node_offset_val = offsetof(TaskObj, task_node); + uintptr_t task_addr_uint = next_node_addr - task_node_offset_val; + + // Create TaskInfo for the task + auto maybe_task_info = TaskInfo::create(reinterpret_cast(task_addr_uint)); + if (maybe_task_info) { + if ((*maybe_task_info)->loop == loop) { + tasks.push_back(std::move(*maybe_task_info)); + } + } + + // Read next node from current_node.next into current_node + if (copy_type(reinterpret_cast(next_node_addr), current_node)) { + return; // Failed to read next node + } + current_node_addr = next_node_addr; // Update address for next iteration + } +} + +inline void +get_tasks_from_thread_linked_list(uintptr_t tstate_addr, PyObject* loop, std::vector& tasks) +{ + if (tstate_addr == 0 || loop == NULL) { + return; + } + + // Calculate offset to asyncio_tasks_head field + // NOTE: tstate_addr points to PyThreadState base, which is the first field of _PyThreadStateImpl + size_t asyncio_tasks_head_offset = offsetof(_PyThreadStateImpl, asyncio_tasks_head); + uintptr_t head_addr = tstate_addr + asyncio_tasks_head_offset; + + // Copy the llist_node struct from remote memory to local memory + struct llist_node head_node_local; + if (copy_type(reinterpret_cast(head_addr), head_node_local)) { + return; // Failed to read head from remote memory + } + + // Check if list is empty (head points to itself in circular list) + uintptr_t next_as_uint = reinterpret_cast(head_node_local.next); + uintptr_t prev_as_uint = reinterpret_cast(head_node_local.prev); + if (next_as_uint == head_addr && prev_as_uint == head_addr) { + return; // Empty list + } + + // Iterate over the 
linked-list + get_tasks_from_linked_list(head_addr, loop, tasks); +} + +// CRITICAL: All memory access must copy structs to local memory first! +// Get tasks from interpreter's linked-list (for lingering tasks) +inline void +get_tasks_from_interpreter_linked_list(PyThreadState* tstate, PyObject* loop, std::vector& tasks) +{ + if (tstate == NULL || loop == NULL) { + return; + } + + // Step 1: Get interpreter state from thread state + // tstate->interp points to PyInterpreterState + PyInterpreterState interp; + if (copy_type(tstate->interp, interp)) { + return; + } + + // Step 2: Calculate interpreter's asyncio_tasks_head address + uintptr_t interp_addr = reinterpret_cast(tstate->interp); + size_t asyncio_tasks_head_offset = offsetof(PyInterpreterState, asyncio_tasks_head); + uintptr_t head_addr = interp_addr + asyncio_tasks_head_offset; + + // Step 3: Call the shared linked-list iteration function + get_tasks_from_linked_list(head_addr, loop, tasks); +} +#endif + // ---------------------------------------------------------------------------- // TODO: Make this a "for_each_task" function? [[nodiscard]] inline Result> -get_all_tasks(PyObject* loop) +get_all_tasks(PyObject* loop, PyThreadState* tstate = NULL, uintptr_t tstate_addr = 0) { std::vector tasks; if (loop == NULL) return tasks; +#if PY_VERSION_HEX >= 0x030e0000 + // Python 3.14+: Native tasks are in linked-list per thread AND per interpreter + // CPython iterates over both: + // 1. Per-thread list: tstate->asyncio_tasks_head (active tasks) + // 2. 
Per-interpreter list: interp->asyncio_tasks_head (lingering tasks) + // First, get tasks from this thread's linked-list (if tstate_addr is provided) + if (tstate_addr != 0) { + get_tasks_from_thread_linked_list(tstate_addr, loop, tasks); + } + + // Second, get tasks from interpreter's linked-list (lingering tasks) + // This needs tstate to dereference tstate->interp + if (tstate != NULL) { + get_tasks_from_interpreter_linked_list(tstate, loop, tasks); + } + + // Handle third-party tasks from Python _scheduled_tasks.data (set) + // (asyncio_scheduled_tasks is now WeakSet.data, which is a Python set) + // These are global, not per-thread, so we collect them once + // If MirrorSet::create() fails, the set might be empty or invalid - skip it + if (asyncio_scheduled_tasks == NULL) { + // Skip if not initialized + } else if (auto maybe_scheduled_tasks_set = MirrorSet::create(asyncio_scheduled_tasks)) { + auto scheduled_tasks_set = std::move(*maybe_scheduled_tasks_set); + if (auto maybe_scheduled_tasks = scheduled_tasks_set.as_unordered_set()) { + auto scheduled_tasks = std::move(*maybe_scheduled_tasks); + for (auto task_addr : scheduled_tasks) { + // In WeakSet.data (set), elements are the Task objects themselves + auto maybe_task_info = TaskInfo::create(reinterpret_cast(task_addr)); + if (maybe_task_info && (*maybe_task_info)->loop == loop) { + tasks.push_back(std::move(*maybe_task_info)); + } + } + } + } +#else auto maybe_scheduled_tasks_set = MirrorSet::create(asyncio_scheduled_tasks); if (!maybe_scheduled_tasks_set) { return ErrorKind::TaskInfoError; @@ -320,29 +489,26 @@ get_all_tasks(PyObject* loop) } } } +#endif if (asyncio_eager_tasks != NULL) { auto maybe_eager_tasks_set = MirrorSet::create(asyncio_eager_tasks); - if (!maybe_eager_tasks_set) { - return ErrorKind::TaskInfoError; - } - - auto eager_tasks_set = std::move(*maybe_eager_tasks_set); - - auto maybe_eager_tasks = eager_tasks_set.as_unordered_set(); - if (!maybe_eager_tasks) { - return 
ErrorKind::TaskInfoError; - } - - auto eager_tasks = std::move(*maybe_eager_tasks); - for (auto task_addr : eager_tasks) { - auto maybe_task_info = TaskInfo::create(reinterpret_cast(task_addr)); - if (maybe_task_info) { - if ((*maybe_task_info)->loop == loop) { - tasks.push_back(std::move(*maybe_task_info)); + if (maybe_eager_tasks_set) { + auto eager_tasks_set = std::move(*maybe_eager_tasks_set); + auto maybe_eager_tasks = eager_tasks_set.as_unordered_set(); + if (maybe_eager_tasks) { + auto eager_tasks = std::move(*maybe_eager_tasks); + for (auto task_addr : eager_tasks) { + auto maybe_task_info = TaskInfo::create(reinterpret_cast(task_addr)); + if (maybe_task_info) { + if ((*maybe_task_info)->loop == loop) { + tasks.push_back(std::move(*maybe_task_info)); + } + } } } } + // If MirrorSet::create() fails, the set might be empty or invalid - skip it } return tasks; diff --git a/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/threads.h b/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/threads.h index decb72f5900..7e52cafb3df 100644 --- a/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/threads.h +++ b/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/threads.h @@ -50,8 +50,8 @@ class ThreadInfo [[nodiscard]] Result update_cpu_time(); bool is_running(); - [[nodiscard]] Result sample(int64_t, PyThreadState*, microsecond_t); - void unwind(PyThreadState*); + [[nodiscard]] Result sample(int64_t, PyThreadState*, microsecond_t, uintptr_t tstate_addr = 0); + void unwind(PyThreadState*, uintptr_t tstate_addr = 0); // ------------------------------------------------------------------------ #if defined PL_LINUX @@ -101,7 +101,7 @@ class ThreadInfo }; private: - [[nodiscard]] Result unwind_tasks(); + [[nodiscard]] Result unwind_tasks(PyThreadState* tstate, uintptr_t tstate_addr = 0); void unwind_greenlets(PyThreadState*, unsigned long); }; @@ -186,13 +186,14 @@ inline std::mutex thread_info_map_lock; // 
---------------------------------------------------------------------------- inline void -ThreadInfo::unwind(PyThreadState* tstate) +ThreadInfo::unwind(PyThreadState* tstate, uintptr_t tstate_addr) { unwind_python_stack(tstate); if (asyncio_loop) { // unwind_tasks returns a [[nodiscard]] Result. // We cast it to void to ignore failures. - (void)unwind_tasks(); + // Pass tstate and tstate_addr to unwind_tasks() so it can access this thread's linked-list + (void)unwind_tasks(tstate, tstate_addr); } // We make the assumption that gevent and asyncio are not mixed @@ -203,14 +204,16 @@ ThreadInfo::unwind(PyThreadState* tstate) // ---------------------------------------------------------------------------- inline Result -ThreadInfo::unwind_tasks() +ThreadInfo::unwind_tasks(PyThreadState* tstate, uintptr_t tstate_addr) { std::vector leaf_tasks; std::unordered_set parent_tasks; std::unordered_map waitee_map; // Indexed by task origin std::unordered_map origin_map; // Indexed by task origin - auto maybe_all_tasks = get_all_tasks(reinterpret_cast(asyncio_loop)); + // Pass tstate and tstate_addr to get_all_tasks() to get tasks from this thread's linked-list (Python 3.14+) + // tstate is used for dereferencing (e.g., tstate->interp), tstate_addr is used for offset calculations + auto maybe_all_tasks = get_all_tasks(reinterpret_cast(asyncio_loop), tstate, tstate_addr); if (!maybe_all_tasks) { return ErrorKind::TaskInfoError; } @@ -245,13 +248,29 @@ ThreadInfo::unwind_tasks() for (auto& task : all_tasks) { origin_map.emplace(task->origin, std::ref(*task)); - if (task->waiter != NULL) + // task->waiter is only set if task_fut_waiter points to another Task + // If task_fut_waiter points to a Future/Coroutine, waiter will be NULL + if (task->waiter != NULL) { waitee_map.emplace(task->waiter->origin, std::ref(*task)); - else if (parent_tasks.find(task->origin) == parent_tasks.end()) { + } else if (parent_tasks.find(task->origin) == parent_tasks.end()) { 
leaf_tasks.push_back(std::ref(*task)); } } +#if PY_VERSION_HEX >= 0x030e0000 + // Python 3.14+: If no leaf tasks found but we have tasks, unwind all tasks that aren't in parent_tasks + // This handles the case where all tasks are waiting on other Tasks (not Futures/Coroutines) + // In normal asyncio usage, tasks awaiting Futures/Coroutines should have waiter=NULL and be leaf tasks + // But if all tasks are waiting on other Tasks, we need this fallback + if (leaf_tasks.empty() && !all_tasks.empty()) { + for (auto& task : all_tasks) { + if (parent_tasks.find(task->origin) == parent_tasks.end()) { + leaf_tasks.push_back(std::ref(*task)); + } + } + } +#endif + // Only one Task can be on CPU at a time. // Since determining if a task is on CPU is somewhat costly, we // stop checking if Tasks are on CPU after seeing the first one. @@ -263,7 +282,9 @@ ThreadInfo::unwind_tasks() on_cpu_task_seen = on_cpu; } - auto stack_info = std::make_unique(leaf_task.get().name, on_cpu); + // Start with leaf task name, but we'll update it if we follow parent chain to a parent task + StringTable::Key sample_task_name = leaf_task.get().name; + auto stack_info = std::make_unique(sample_task_name, on_cpu); auto& stack = stack_info->stack; for (auto current_task = leaf_task;;) { auto& task = current_task.get(); @@ -295,15 +316,10 @@ ThreadInfo::unwind_tasks() // Get the next task in the chain PyObject* task_origin = task.origin; - if (waitee_map.find(task_origin) != waitee_map.end()) { - current_task = waitee_map.find(task_origin)->second; - continue; - } + // Check for parent (gather) links first { - // Check for, e.g., gather links std::lock_guard lock(task_link_map_lock); - if (task_link_map.find(task_origin) != task_link_map.end() && origin_map.find(task_link_map[task_origin]) != origin_map.end()) { current_task = origin_map.find(task_link_map[task_origin])->second; @@ -311,6 +327,12 @@ ThreadInfo::unwind_tasks() } } + // Then check for waiter links + if (waitee_map.find(task_origin) != 
waitee_map.end()) { + current_task = waitee_map.find(task_origin)->second; + continue; + } + break; } @@ -390,7 +412,7 @@ ThreadInfo::unwind_greenlets(PyThreadState* tstate, unsigned long cur_native_id) // ---------------------------------------------------------------------------- inline Result -ThreadInfo::sample(int64_t iid, PyThreadState* tstate, microsecond_t delta) +ThreadInfo::sample(int64_t iid, PyThreadState* tstate, microsecond_t delta, uintptr_t tstate_addr) { Renderer::get().render_thread_begin(tstate, name, delta, thread_id, native_id); @@ -404,7 +426,7 @@ ThreadInfo::sample(int64_t iid, PyThreadState* tstate, microsecond_t delta) Renderer::get().render_cpu_time(thread_is_running ? cpu_time - previous_cpu_time : 0); - this->unwind(tstate); + this->unwind(tstate, tstate_addr); // Render in this order of priority // 1. asyncio Tasks stacks (if any) @@ -465,7 +487,7 @@ ThreadInfo::sample(int64_t iid, PyThreadState* tstate, microsecond_t delta) // ---------------------------------------------------------------------------- static void -for_each_thread(InterpreterInfo& interp, std::function callback) +for_each_thread(InterpreterInfo& interp, std::function callback) { std::unordered_set threads; std::unordered_set seen_threads; @@ -533,8 +555,11 @@ for_each_thread(InterpreterInfo& interp, std::functionsecond); + // Call back with the thread state, actual address, and thread info. 
+ // CRITICAL: Pass both &tstate (local copy for dereferencing) and tstate_addr (actual address for offset + // calculations) + callback( + &tstate, reinterpret_cast(tstate_addr), *thread_info_map.find(tstate.thread_id)->second); } } } diff --git a/ddtrace/internal/datadog/profiling/stack_v2/src/echion/frame.cc b/ddtrace/internal/datadog/profiling/stack_v2/src/echion/frame.cc index f0809e6f90a..206b20ffdc2 100644 --- a/ddtrace/internal/datadog/profiling/stack_v2/src/echion/frame.cc +++ b/ddtrace/internal/datadog/profiling/stack_v2/src/echion/frame.cc @@ -246,7 +246,21 @@ Frame::read(PyObject* frame_addr, PyObject** prev_addr) // We cannot use _PyInterpreterFrame_LASTI because _PyCode_CODE reads // from the code object. -#if PY_VERSION_HEX >= 0x030d0000 +#if PY_VERSION_HEX >= 0x030e0000 + // Python 3.14+: f_executable is _PyStackRef, access bits directly + // We can't use CPython API helpers as we're copying partial structs + const int lasti = + (static_cast( + (frame_addr->instr_ptr - 1 - + reinterpret_cast<_Py_CODEUNIT*>((reinterpret_cast(frame_addr->f_executable.bits)))))) - + offsetof(PyCodeObject, co_code_adaptive) / sizeof(_Py_CODEUNIT); + auto maybe_frame = Frame::get(reinterpret_cast(frame_addr->f_executable.bits), lasti); + if (!maybe_frame) { + return ErrorKind::FrameError; + } + + auto& frame = maybe_frame->get(); +#elif PY_VERSION_HEX >= 0x030d0000 const int lasti = (static_cast( (frame_addr->instr_ptr - 1 - @@ -268,7 +282,7 @@ Frame::read(PyObject* frame_addr, PyObject** prev_addr) } auto& frame = maybe_frame->get(); -#endif // PY_VERSION_HEX >= 0x030d0000 +#endif // PY_VERSION_HEX >= 0x030e0000 if (&frame != &INVALID_FRAME) { #if PY_VERSION_HEX >= 0x030c0000 frame.is_entry = (frame_addr->owner == FRAME_OWNED_BY_CSTACK); // Shim frame @@ -277,7 +291,32 @@ Frame::read(PyObject* frame_addr, PyObject** prev_addr) #endif // PY_VERSION_HEX >= 0x030c0000 } +#if PY_VERSION_HEX >= 0x030e0000 + // Python 3.14+: Generator frames have previous = NULL 
(intentionally broken frame chain) + // See docs/python-3.14-generator-frame-limitation.md for details + // In _PyFrame_Copy(), CPython explicitly sets dest->previous = NULL to prevent + // dangling pointers when creating generator/coroutine frames. + if (frame_addr->previous == NULL && frame_addr->owner == FRAME_OWNED_BY_GENERATOR) { + // Best-effort fallback: try frame_obj->f_back->f_frame if available + // This is unreliable because frame_obj is lazily created and often NULL, + // and even when it exists, f_back is often NULL for generator frames. + // However, it might occasionally help in edge cases. + *prev_addr = NULL; + if (frame_addr->frame_obj != NULL) { + PyFrameObject frame_obj; + if (copy_type(frame_addr->frame_obj, frame_obj) == 0 && frame_obj.f_back != NULL) { + PyFrameObject prev_frame_obj; + if (copy_type(frame_obj.f_back, prev_frame_obj) == 0 && prev_frame_obj.f_frame != NULL) { + *prev_addr = prev_frame_obj.f_frame; + } + } + } + } else { + *prev_addr = &frame == &INVALID_FRAME ? NULL : frame_addr->previous; + } +#else *prev_addr = &frame == &INVALID_FRAME ? NULL : frame_addr->previous; +#endif #else // PY_VERSION_HEX < 0x030b0000 // Unwind the stack from leaf to root and store it in a stack. 
This way we diff --git a/ddtrace/internal/datadog/profiling/stack_v2/src/sampler.cpp b/ddtrace/internal/datadog/profiling/stack_v2/src/sampler.cpp index 70797ebfa8f..8827e492d73 100644 --- a/ddtrace/internal/datadog/profiling/stack_v2/src/sampler.cpp +++ b/ddtrace/internal/datadog/profiling/stack_v2/src/sampler.cpp @@ -167,8 +167,8 @@ Sampler::sampling_thread(const uint64_t seq_num) // Perform the sample for_each_interp([&](InterpreterInfo& interp) -> void { - for_each_thread(interp, [&](PyThreadState* tstate, ThreadInfo& thread) { - auto success = thread.sample(interp.id, tstate, wall_time_us); + for_each_thread(interp, [&](PyThreadState* tstate, uintptr_t tstate_addr, ThreadInfo& thread) { + auto success = thread.sample(interp.id, tstate, wall_time_us, tstate_addr); if (success) { ddup_increment_sample_count(); } diff --git a/ddtrace/internal/settings/profiling.py b/ddtrace/internal/settings/profiling.py index 82653d97642..a5ba711a503 100644 --- a/ddtrace/internal/settings/profiling.py +++ b/ddtrace/internal/settings/profiling.py @@ -65,7 +65,7 @@ def _check_for_stack_v2_available(): def _parse_profiling_enabled(raw: str) -> bool: - if sys.version_info >= (3, 14): + if sys.version_info >= (3, 15): return False # Try to derive whether we're enabled via DD_INJECTION_ENABLED @@ -253,7 +253,7 @@ class ProfilingConfigStack(DDConfig): enabled = DDConfig.v( bool, "enabled", - default=sys.version_info < (3, 14), + default=sys.version_info < (3, 15), help_type="Boolean", help="Whether to enable the stack profiler", ) @@ -364,7 +364,7 @@ class ProfilingConfigPytorch(DDConfig): # We need to check if ddup is available, and turn off profiling if it is not. 
if not ddup_is_available: # We know it is not supported on 3.14, so don't report the error, but still disable - if sys.version_info < (3, 14): + if sys.version_info < (3, 15): msg = ddup_failure_msg or "libdd not available" logger.warning("Failed to load ddup module (%s), disabling profiling", msg) telemetry_writer.add_log( diff --git a/ddtrace/profiling/_asyncio.py b/ddtrace/profiling/_asyncio.py index 2dcbaa08ceb..020ff091299 100644 --- a/ddtrace/profiling/_asyncio.py +++ b/ddtrace/profiling/_asyncio.py @@ -43,7 +43,15 @@ def _task_get_name(task: "asyncio.Task[typing.Any]") -> str: def _call_init_asyncio(asyncio: ModuleType) -> None: from asyncio import tasks as asyncio_tasks - if sys.hexversion >= 0x030C0000: + if sys.hexversion >= 0x030E0000: + # Python 3.14+: + # - Native tasks are in linked-list (handled in C++) + # - Third-party tasks are in Python _scheduled_tasks WeakSet + # - Pass _scheduled_tasks.data (set) so C++ can iterate it with MirrorSet + scheduled_tasks = asyncio_tasks._scheduled_tasks.data # type: ignore[attr-defined] + eager_tasks = asyncio_tasks._eager_tasks # type: ignore[attr-defined] + elif sys.hexversion >= 0x030C0000: + # Python 3.12-3.13: _scheduled_tasks has .data attribute from C extension scheduled_tasks = asyncio_tasks._scheduled_tasks.data # type: ignore[attr-defined] eager_tasks = asyncio_tasks._eager_tasks # type: ignore[attr-defined] else: @@ -103,20 +111,32 @@ def _(asyncio: ModuleType) -> None: init_stack_v2: bool = config.stack.enabled and stack_v2.is_available - @partial(wrap, sys.modules["asyncio.events"].BaseDefaultEventLoopPolicy.set_event_loop) - def _( - f: typing.Callable[..., typing.Any], args: tuple[typing.Any, ...], kwargs: dict[str, typing.Any] - ) -> typing.Any: - loop: typing.Optional["aio.AbstractEventLoop"] = get_argument_value(args, kwargs, 1, "loop") - try: - if init_stack_v2: - stack_v2.track_asyncio_loop(typing.cast(int, ddtrace_threading.current_thread().ident), loop) - return f(*args, **kwargs) - finally: 
- assert THREAD_LINK is not None # nosec: assert is used for typing - THREAD_LINK.clear_threads(set(sys._current_frames().keys())) - if loop is not None: - THREAD_LINK.link_object(loop) + # Python 3.14+: BaseDefaultEventLoopPolicy was renamed to _BaseDefaultEventLoopPolicy + # Try both names for compatibility + events_module = sys.modules["asyncio.events"] + if sys.hexversion >= 0x030E0000: + # Python 3.14+: Use _BaseDefaultEventLoopPolicy + policy_class = getattr(events_module, "_BaseDefaultEventLoopPolicy", None) + else: + # Python < 3.14: Use BaseDefaultEventLoopPolicy + policy_class = getattr(events_module, "BaseDefaultEventLoopPolicy", None) + + if policy_class is not None: + + @partial(wrap, policy_class.set_event_loop) + def _( + f: typing.Callable[..., typing.Any], args: tuple[typing.Any, ...], kwargs: dict[str, typing.Any] + ) -> typing.Any: + loop: typing.Optional["aio.AbstractEventLoop"] = get_argument_value(args, kwargs, 1, "loop") + try: + if init_stack_v2: + stack_v2.track_asyncio_loop(typing.cast(int, ddtrace_threading.current_thread().ident), loop) + return f(*args, **kwargs) + finally: + assert THREAD_LINK is not None # nosec: assert is used for typing + THREAD_LINK.clear_threads(set(sys._current_frames().keys())) + if loop is not None: + THREAD_LINK.link_object(loop) if init_stack_v2: diff --git a/ddtrace/profiling/collector/threading.py b/ddtrace/profiling/collector/threading.py index 7e940e08e7d..e30db93eba7 100644 --- a/ddtrace/profiling/collector/threading.py +++ b/ddtrace/profiling/collector/threading.py @@ -1,6 +1,7 @@ from __future__ import absolute_import import threading +import typing from ddtrace.internal._unpatched import _threading as ddtrace_threading from ddtrace.internal.datadog.profiling import stack_v2 diff --git a/releasenotes/notes/profiling-314-support-bc850ac5330c27fc.yaml b/releasenotes/notes/profiling-314-support-bc850ac5330c27fc.yaml new file mode 100644 index 00000000000..a345c497963 --- /dev/null +++ 
b/releasenotes/notes/profiling-314-support-bc850ac5330c27fc.yaml @@ -0,0 +1,3 @@ +features: + - | + profiling: This adds support for Python 3.14 in the Continuous Profiler. diff --git a/riotfile.py b/riotfile.py index d1f5f76f7d5..294e723c2d4 100644 --- a/riotfile.py +++ b/riotfile.py @@ -3280,7 +3280,7 @@ def select_pys(min_version: str = MIN_PYTHON_VERSION, max_version: str = MAX_PYT Venv( name="profile-uwsgi", command="python -m tests.profiling.run pytest -v --no-cov --capture=no --benchmark-disable {cmdargs} tests/profiling/test_uwsgi.py", # noqa: E501 - pys=select_pys(max_version="3.13"), + pys=select_pys(max_version="3.14"), pkgs={ "uwsgi": "<2.0.30", "protobuf": latest, @@ -3365,7 +3365,7 @@ def select_pys(min_version: str = MIN_PYTHON_VERSION, max_version: str = MAX_PYT ), # Python >= 3.11 Venv( - pys=select_pys("3.11", "3.13"), + pys=select_pys("3.11", "3.14"), pkgs={"uwsgi": latest}, venvs=[ Venv( @@ -3398,8 +3398,7 @@ def select_pys(min_version: str = MIN_PYTHON_VERSION, max_version: str = MAX_PYT Venv( name="profile-memalloc", command="python -m tests.profiling.run pytest -v --no-cov --capture=no --benchmark-disable {cmdargs} tests/profiling/collector/test_memalloc.py", # noqa: E501 - # skipping v3.14 for now due to an unstable `lz4 ` lib issue: https://gitlab.ddbuild.io/DataDog/apm-reliability/dd-trace-py/-/jobs/1163312347 - pys=select_pys(max_version="3.13"), + pys=select_pys(max_version="3.14"), pkgs={ "protobuf": latest, }, diff --git a/setup.py b/setup.py index 8e244ec4740..b480689ad10 100644 --- a/setup.py +++ b/setup.py @@ -269,8 +269,7 @@ def is_64_bit_python(): rust_features = [] if CURRENT_OS in ("Linux", "Darwin") and is_64_bit_python(): rust_features.append("crashtracker") - if sys.version_info[:2] < (3, 14): - rust_features.append("profiling") + rust_features.append("profiling") class PatchedDistribution(Distribution): @@ -618,7 +617,7 @@ def run(self): self.build_rust() # Build libdd_wrapper before building other extensions that depend on 
it - if CURRENT_OS in ("Linux", "Darwin") and is_64_bit_python() and sys.version_info < (3, 14): + if CURRENT_OS in ("Linux", "Darwin") and is_64_bit_python(): self.build_libdd_wrapper() super().run() @@ -1157,40 +1156,39 @@ def get_exts_for(name): ) if CURRENT_OS in ("Linux", "Darwin") and is_64_bit_python(): - if sys.version_info < (3, 14): - # Memory profiler now uses CMake to support Abseil dependency - MEMALLOC_DIR = HERE / "ddtrace" / "profiling" / "collector" - ext_modules.append( - CMakeExtension( - "ddtrace.profiling.collector._memalloc", - source_dir=MEMALLOC_DIR, - optional=False, - ) + # Memory profiler now uses CMake to support Abseil dependency + MEMALLOC_DIR = HERE / "ddtrace" / "profiling" / "collector" + ext_modules.append( + CMakeExtension( + "ddtrace.profiling.collector._memalloc", + source_dir=MEMALLOC_DIR, + optional=False, ) + ) - ext_modules.append( - CMakeExtension( - "ddtrace.internal.datadog.profiling.ddup._ddup", - source_dir=DDUP_DIR, - extra_source_dirs=[ - DDUP_DIR / ".." / "cmake", - DDUP_DIR / ".." / "dd_wrapper", - ], - optional=False, - ) + ext_modules.append( + CMakeExtension( + "ddtrace.internal.datadog.profiling.ddup._ddup", + source_dir=DDUP_DIR, + extra_source_dirs=[ + DDUP_DIR / ".." / "cmake", + DDUP_DIR / ".." / "dd_wrapper", + ], + optional=False, ) + ) - ext_modules.append( - CMakeExtension( - "ddtrace.internal.datadog.profiling.stack_v2._stack_v2", - source_dir=STACK_V2_DIR, - extra_source_dirs=[ - STACK_V2_DIR / ".." / "cmake", - STACK_V2_DIR / ".." / "dd_wrapper", - ], - optional=False, - ), - ) + ext_modules.append( + CMakeExtension( + "ddtrace.internal.datadog.profiling.stack_v2._stack_v2", + source_dir=STACK_V2_DIR, + extra_source_dirs=[ + STACK_V2_DIR / ".." / "cmake", + STACK_V2_DIR / ".." 
/ "dd_wrapper", + ], + optional=False, + ), + ) else: diff --git a/tests/profiling/collector/test_generators.py b/tests/profiling/collector/test_generators.py index 4588c9c1780..aa4786997b7 100644 --- a/tests/profiling/collector/test_generators.py +++ b/tests/profiling/collector/test_generators.py @@ -10,6 +10,7 @@ # For macOS: err=None ignores expected stderr from tracer failing to connect to agent (not relevant to this test) def test_generators_stacks() -> None: import os + import sys import time from typing import Generator @@ -49,29 +50,59 @@ def my_function() -> int: samples = list(profile.sample) assert len(samples) > 0 - # Test that we have samples with the expected stack trace - # Main Thread should have: my_function -> generator -> generator2 - pprof_utils.assert_profile_has_sample( - profile, - samples, - expected_sample=pprof_utils.StackEvent( - thread_name="MainThread", - locations=[ - pprof_utils.StackLocation( - function_name="generator2", - filename="test_generators.py", - line_no=generator2.__code__.co_firstlineno + 1, - ), - pprof_utils.StackLocation( - function_name="generator", - filename="test_generators.py", - line_no=generator.__code__.co_firstlineno + 1, - ), - pprof_utils.StackLocation( - function_name="my_function", - filename="test_generators.py", - line_no=my_function.__code__.co_firstlineno + 2, - ), - ], - ), - ) + # In Python 3.14+, generator frames intentionally break the frame chain by setting + # previous = NULL to prevent dangling pointers. This means we cannot unwind from + # generator frames back to their callers. See docs/python-3.14-generator-frame-limitation.md + # for details. 
+ # + # Expected behavior: + # - Python < 3.14: my_function -> generator -> generator2 (full stack trace) + # - Python >= 3.14: generator -> generator2 (cannot unwind to my_function) + if sys.version_info >= (3, 14): + # Python 3.14+: Generator frames have previous = NULL, so we can only unwind + # generator -> generator2, but not generator -> my_function + pprof_utils.assert_profile_has_sample( + profile, + samples, + expected_sample=pprof_utils.StackEvent( + thread_name="MainThread", + locations=[ + pprof_utils.StackLocation( + function_name="generator2", + filename="test_generators.py", + line_no=generator2.__code__.co_firstlineno + 1, + ), + pprof_utils.StackLocation( + function_name="generator", + filename="test_generators.py", + line_no=generator.__code__.co_firstlineno + 1, + ), + ], + ), + ) + else: + # Python < 3.14: Full stack trace should be available + pprof_utils.assert_profile_has_sample( + profile, + samples, + expected_sample=pprof_utils.StackEvent( + thread_name="MainThread", + locations=[ + pprof_utils.StackLocation( + function_name="generator2", + filename="test_generators.py", + line_no=generator2.__code__.co_firstlineno + 1, + ), + pprof_utils.StackLocation( + function_name="generator", + filename="test_generators.py", + line_no=generator.__code__.co_firstlineno + 1, + ), + pprof_utils.StackLocation( + function_name="my_function", + filename="test_generators.py", + line_no=my_function.__code__.co_firstlineno + 2, + ), + ], + ), + ) diff --git a/tests/smoke_test.py b/tests/smoke_test.py index 7f0c02b9b6c..149d85ecd21 100644 --- a/tests/smoke_test.py +++ b/tests/smoke_test.py @@ -73,7 +73,7 @@ def emit(self, record): print("Skipping test, 32-bit DDWAF not ready yet") # Profiling smoke test - if platform.system() in ("Linux", "Darwin") and sys.maxsize > (1 << 32) and sys.version_info[:2] < (3, 14): + if platform.system() in ("Linux", "Darwin") and sys.maxsize > (1 << 32): print("Running profiling smoke test...") profiling_cmd = [sys.executable, 
"-c", "import ddtrace.profiling.auto"] result = subprocess.run(profiling_cmd, capture_output=True, text=True) From be86eea4ff2ee3295d7e9e0eee6c57b3082741e7 Mon Sep 17 00:00:00 2001 From: Taegyun Kim Date: Sun, 7 Dec 2025 18:54:54 -0500 Subject: [PATCH 02/43] check existence of header to decide rebuild --- setup.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index b480689ad10..415de7ae978 100644 --- a/setup.py +++ b/setup.py @@ -663,8 +663,15 @@ def build_rust(self): if src_file.exists(): newest_source_time = max(newest_source_time, src_file.stat().st_mtime) - # Only rebuild if source files are newer than the destination - should_build = newest_source_time > library_mtime + required_headers = ["common.h"] + if "profiling" in rust_features: + required_headers.append("profiling.h") + + include_dir = CARGO_TARGET_DIR / "include" / "datadog" + headers_exist = include_dir.exists() and all((include_dir / header).exists() for header in required_headers) + + # Only rebuild if source files are newer than the destination OR if any required header is missing + should_build = newest_source_time > library_mtime or not headers_exist if should_build: # Create and run the CustomBuildRust command From 52e00ba9715043bda795a16fe7522424f788713d Mon Sep 17 00:00:00 2001 From: Taegyun Kim Date: Sun, 7 Dec 2025 20:15:15 -0500 Subject: [PATCH 03/43] remove protobuf==4.22.0 variant for 3.14 --- .riot/requirements/170c255.txt | 32 ----------------------------- riotfile.py | 37 ++++++++++++++++++++++++++++++++-- 2 files changed, 35 insertions(+), 34 deletions(-) delete mode 100644 .riot/requirements/170c255.txt diff --git a/.riot/requirements/170c255.txt b/.riot/requirements/170c255.txt deleted file mode 100644 index 30657486a78..00000000000 --- a/.riot/requirements/170c255.txt +++ /dev/null @@ -1,32 +0,0 @@ -# -# This file is autogenerated by pip-compile with Python 3.14 -# by the following command: -# -# pip-compile --allow-unsafe 
--no-annotate .riot/requirements/170c255.in -# -attrs==25.4.0 -coverage[toml]==7.12.0 -gunicorn==23.0.0 -hypothesis==6.45.0 -iniconfig==2.3.0 -jsonschema==4.25.1 -jsonschema-specifications==2025.9.1 -mock==5.2.0 -opentracing==2.4.0 -packaging==25.0 -pluggy==1.6.0 -protobuf==4.22.0 -py-cpuinfo==8.0.0 -pygments==2.19.2 -pytest==9.0.1 -pytest-asyncio==0.21.1 -pytest-benchmark==5.2.3 -pytest-cov==7.0.0 -pytest-cpp==2.6.0 -pytest-mock==3.15.1 -pytest-randomly==4.0.1 -referencing==0.37.0 -rpds-py==0.30.0 -sortedcontainers==2.4.0 -uwsgi==2.0.31 -zstandard==0.25.0 diff --git a/riotfile.py b/riotfile.py index 0e1b0eab414..5ec644e926f 100644 --- a/riotfile.py +++ b/riotfile.py @@ -3363,9 +3363,9 @@ def select_pys(min_version: str = MIN_PYTHON_VERSION, max_version: str = MAX_PYT ), ], ), - # Python >= 3.11 + # Python >= 3.11 (excluding 3.14) Venv( - pys=select_pys("3.11", "3.14"), + pys=select_pys("3.11", "3.13"), pkgs={"uwsgi": latest}, venvs=[ Venv( @@ -3395,6 +3395,39 @@ def select_pys(min_version: str = MIN_PYTHON_VERSION, max_version: str = MAX_PYT ), ], ), + # Python 3.14 - protobuf 4.22.0 is not compatible (TypeError: Metaclasses with custom tp_new) + Venv( + pys="3.14", + pkgs={"uwsgi": latest}, + venvs=[ + Venv( + pkgs={ + # Use latest only - protobuf 4.22.0 fails with Python 3.14 + "protobuf": latest, + }, + ), + # Gevent + Venv( + env={ + "DD_PROFILE_TEST_GEVENT": "1", + }, + pkgs={ + "gunicorn[gevent]": latest, + "gevent": latest, + "protobuf": latest, + }, + ), + # memcpy-based sampler + Venv( + env={ + "ECHION_USE_FAST_COPY_MEMORY": "1", + }, + pkgs={ + "protobuf": latest, + }, + ), + ], + ), Venv( name="profile-memalloc", command="python -m tests.profiling.run pytest -v --no-cov --capture=no --benchmark-disable {cmdargs} tests/profiling/collector/test_memalloc.py", # noqa: E501 From 43918725a4f1438e3c94a5b8f3af9bb2160972dd Mon Sep 17 00:00:00 2001 From: Taegyun Kim Date: Sun, 7 Dec 2025 22:26:24 -0500 Subject: [PATCH 04/43] uwsgi<2.0.30 is not compatible with 
python 3.14 --- .riot/requirements/16ed166.txt | 32 -------------------------------- riotfile.py | 2 +- 2 files changed, 1 insertion(+), 33 deletions(-) delete mode 100644 .riot/requirements/16ed166.txt diff --git a/.riot/requirements/16ed166.txt b/.riot/requirements/16ed166.txt deleted file mode 100644 index ff1e68a0328..00000000000 --- a/.riot/requirements/16ed166.txt +++ /dev/null @@ -1,32 +0,0 @@ -# -# This file is autogenerated by pip-compile with Python 3.14 -# by the following command: -# -# pip-compile --allow-unsafe --no-annotate .riot/requirements/16ed166.in -# -attrs==25.4.0 -coverage[toml]==7.12.0 -gunicorn==23.0.0 -hypothesis==6.45.0 -iniconfig==2.3.0 -jsonschema==4.25.1 -jsonschema-specifications==2025.9.1 -mock==5.2.0 -opentracing==2.4.0 -packaging==25.0 -pluggy==1.6.0 -protobuf==6.33.1 -py-cpuinfo==8.0.0 -pygments==2.19.2 -pytest==9.0.1 -pytest-asyncio==0.21.1 -pytest-benchmark==5.2.3 -pytest-cov==7.0.0 -pytest-cpp==2.6.0 -pytest-mock==3.15.1 -pytest-randomly==4.0.1 -referencing==0.37.0 -rpds-py==0.30.0 -sortedcontainers==2.4.0 -uwsgi==2.0.29 -zstandard==0.25.0 diff --git a/riotfile.py b/riotfile.py index 5ec644e926f..4f0267dc395 100644 --- a/riotfile.py +++ b/riotfile.py @@ -3280,7 +3280,7 @@ def select_pys(min_version: str = MIN_PYTHON_VERSION, max_version: str = MAX_PYT Venv( name="profile-uwsgi", command="python -m tests.profiling.run pytest -v --no-cov --capture=no --benchmark-disable {cmdargs} tests/profiling/test_uwsgi.py", # noqa: E501 - pys=select_pys(max_version="3.14"), + pys=select_pys(max_version="3.13"), # uwsgi<2.0.30 is not compatible with Python 3.14 pkgs={ "uwsgi": "<2.0.30", "protobuf": latest, From 8690dbc990fc95ed642cc3d32622aaae155fd99a Mon Sep 17 00:00:00 2001 From: Taegyun Kim Date: Sun, 7 Dec 2025 22:27:46 -0500 Subject: [PATCH 05/43] update serverless import test --- tests/internal/test_serverless.py | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/tests/internal/test_serverless.py 
b/tests/internal/test_serverless.py index a8f202223f6..9011f82acc3 100644 --- a/tests/internal/test_serverless.py +++ b/tests/internal/test_serverless.py @@ -1,5 +1,3 @@ -import sys - import pytest from ddtrace.internal.serverless import in_azure_function @@ -134,16 +132,7 @@ def find_spec(self, fullname, *args): ("ddtrace.internal.utils", "http"), ("ddtrace.llmobs", "LLMObs"), ("ddtrace.opentelemetry", "TracerProvider"), - pytest.param( - "ddtrace.profiling", - "profiler", - # when 3.14 is officially supported, this xfail can be removed. - marks=pytest.mark.xfail( - reason="throws AttributeError: module 'asyncio.events' has no attribute 'BaseDefaultEventLoopPolicy'", - condition=sys.version_info >= (3, 14), - strict=True, - ), - ), + ("ddtrace.profiling", "profiler"), ("ddtrace.propagation.http", "HTTPPropagator"), ("ddtrace.trace", "Context, Span, tracer"), ("ddtrace.trace", "Span"), From 28a956fbf2968d5943a1316a159e57330a0beecc Mon Sep 17 00:00:00 2001 From: Taegyun Kim Date: Sun, 7 Dec 2025 22:40:22 -0500 Subject: [PATCH 06/43] fix internal telemetry test --- ddtrace/internal/settings/profiling.py | 19 ++++++++++--------- tests/telemetry/test_writer.py | 2 +- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/ddtrace/internal/settings/profiling.py b/ddtrace/internal/settings/profiling.py index a5ba711a503..636b0a15a62 100644 --- a/ddtrace/internal/settings/profiling.py +++ b/ddtrace/internal/settings/profiling.py @@ -65,6 +65,9 @@ def _check_for_stack_v2_available(): def _parse_profiling_enabled(raw: str) -> bool: + # We keep the default value to be gated by Python version, as the Profiler + # relies on CPython internal structs and APIs that could change between + # different Python versions. 
We disable it by default for Python 3.15+ if sys.version_info >= (3, 15): return False @@ -253,7 +256,7 @@ class ProfilingConfigStack(DDConfig): enabled = DDConfig.v( bool, "enabled", - default=sys.version_info < (3, 15), + default=True, help_type="Boolean", help="Whether to enable the stack profiler", ) @@ -363,14 +366,12 @@ class ProfilingConfigPytorch(DDConfig): # We need to check if ddup is available, and turn off profiling if it is not. if not ddup_is_available: - # We know it is not supported on 3.14, so don't report the error, but still disable - if sys.version_info < (3, 15): - msg = ddup_failure_msg or "libdd not available" - logger.warning("Failed to load ddup module (%s), disabling profiling", msg) - telemetry_writer.add_log( - TELEMETRY_LOG_LEVEL.ERROR, - "Failed to load ddup module (%s), disabling profiling" % ddup_failure_msg, - ) + msg = ddup_failure_msg or "libdd not available" + logger.warning("Failed to load ddup module (%s), disabling profiling", msg) + telemetry_writer.add_log( + TELEMETRY_LOG_LEVEL.ERROR, + "Failed to load ddup module (%s), disabling profiling" % ddup_failure_msg, + ) config.enabled = False # We also need to check if stack_v2 module is available, and turn if off diff --git a/tests/telemetry/test_writer.py b/tests/telemetry/test_writer.py index 914ab46a4bc..f95926874dd 100644 --- a/tests/telemetry/test_writer.py +++ b/tests/telemetry/test_writer.py @@ -290,7 +290,7 @@ def test_app_started_event_configuration_override(test_agent_session, run_python {"name": "DD_PROFILING_AGENTLESS", "origin": "default", "value": False}, {"name": "DD_PROFILING_API_TIMEOUT_MS", "origin": "default", "value": 10000}, {"name": "DD_PROFILING_CAPTURE_PCT", "origin": "env_var", "value": 5.0}, - {"name": "DD_PROFILING_ENABLED", "origin": "env_var", "value": PYTHON_VERSION_INFO < (3, 14)}, + {"name": "DD_PROFILING_ENABLED", "origin": "env_var", "value": PYTHON_VERSION_INFO < (3, 15)}, {"name": "DD_PROFILING_ENABLE_ASSERTS", "origin": "default", "value": 
False}, {"name": "DD_PROFILING_ENABLE_CODE_PROVENANCE", "origin": "default", "value": True}, {"name": "DD_PROFILING_ENDPOINT_COLLECTION_ENABLED", "origin": "default", "value": True}, From ed01ba71da29691165efa933d56430cb2a50412d Mon Sep 17 00:00:00 2001 From: Taegyun Kim Date: Mon, 8 Dec 2025 01:09:03 -0500 Subject: [PATCH 07/43] update test --- tests/profiling/collector/test_memalloc.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/tests/profiling/collector/test_memalloc.py b/tests/profiling/collector/test_memalloc.py index 3006c100149..06c08a27f79 100644 --- a/tests/profiling/collector/test_memalloc.py +++ b/tests/profiling/collector/test_memalloc.py @@ -12,6 +12,7 @@ from tests.profiling.collector import pprof_utils +PY_314_OR_ABOVE = sys.version_info[:2] >= (3, 14) PY_313_OR_ABOVE = sys.version_info[:2] >= (3, 13) PY_311_OR_ABOVE = sys.version_info[:2] >= (3, 11) @@ -847,7 +848,14 @@ def test_memory_collector_thread_lifecycle(tmp_path): def worker(): for i in range(10): - data = [i] * 100 + # On Python 3.14+, increase the allocation size to more reliably + # trigger sampling. The CPython internal could have optimized + # small allocations, and/or allocations that are deallocated too + # quickly. 
+ if PY_314_OR_ABOVE: + data = [i] * 10000000 + else: + data = [i] * 100 del data # Capture reference before context manager exits From a97c618056bc0e10b589ad23621b7c1d1e974ca9 Mon Sep 17 00:00:00 2001 From: Taegyun Kim Date: Mon, 8 Dec 2025 01:31:43 -0500 Subject: [PATCH 08/43] simplify code a bit --- ddtrace/internal/settings/profiling.py | 7 ------- tests/telemetry/test_writer.py | 3 +-- 2 files changed, 1 insertion(+), 9 deletions(-) diff --git a/ddtrace/internal/settings/profiling.py b/ddtrace/internal/settings/profiling.py index 636b0a15a62..003df3f28c2 100644 --- a/ddtrace/internal/settings/profiling.py +++ b/ddtrace/internal/settings/profiling.py @@ -1,7 +1,6 @@ import itertools import math import os -import sys import typing as t from ddtrace.ext.git import COMMIT_SHA @@ -65,12 +64,6 @@ def _check_for_stack_v2_available(): def _parse_profiling_enabled(raw: str) -> bool: - # We keep the default value to be gated by Python version, as the Profiler - # relies on CPython internal structs and APIs that could change between - # different Python versions. 
We disable it by default for Python 3.15+ - if sys.version_info >= (3, 15): - return False - # Try to derive whether we're enabled via DD_INJECTION_ENABLED # - Are we injected (DD_INJECTION_ENABLED set) # - Is profiling enabled ("profiler" in the list) diff --git a/tests/telemetry/test_writer.py b/tests/telemetry/test_writer.py index f95926874dd..4c3d1b1339d 100644 --- a/tests/telemetry/test_writer.py +++ b/tests/telemetry/test_writer.py @@ -11,7 +11,6 @@ import pytest from ddtrace import config -from ddtrace.internal.compat import PYTHON_VERSION_INFO from ddtrace.internal.settings._agent import get_agent_hostname from ddtrace.internal.settings._telemetry import config as telemetry_config import ddtrace.internal.telemetry @@ -290,7 +289,7 @@ def test_app_started_event_configuration_override(test_agent_session, run_python {"name": "DD_PROFILING_AGENTLESS", "origin": "default", "value": False}, {"name": "DD_PROFILING_API_TIMEOUT_MS", "origin": "default", "value": 10000}, {"name": "DD_PROFILING_CAPTURE_PCT", "origin": "env_var", "value": 5.0}, - {"name": "DD_PROFILING_ENABLED", "origin": "env_var", "value": PYTHON_VERSION_INFO < (3, 15)}, + {"name": "DD_PROFILING_ENABLED", "origin": "env_var", "value": True}, {"name": "DD_PROFILING_ENABLE_ASSERTS", "origin": "default", "value": False}, {"name": "DD_PROFILING_ENABLE_CODE_PROVENANCE", "origin": "default", "value": True}, {"name": "DD_PROFILING_ENDPOINT_COLLECTION_ENABLED", "origin": "default", "value": True}, From f86b3eb47fec02cb0ce141a5b1720c092377264c Mon Sep 17 00:00:00 2001 From: Taegyun Kim Date: Mon, 8 Dec 2025 08:56:23 -0500 Subject: [PATCH 09/43] reduce diff - these are not necessary --- tests/profiling/collector/test_stack.py | 5 ++-- tests/profiling/collector/test_threading.py | 6 ++--- tests/profiling/exporter/test_ddup.py | 3 +-- tests/profiling/test_accuracy.py | 4 +-- tests/profiling/test_profiler.py | 28 +++++++-------------- 5 files changed, 15 insertions(+), 31 deletions(-) diff --git 
a/tests/profiling/collector/test_stack.py b/tests/profiling/collector/test_stack.py index 65fa565e668..6b0c07bd482 100644 --- a/tests/profiling/collector/test_stack.py +++ b/tests/profiling/collector/test_stack.py @@ -50,8 +50,7 @@ def func5(): env=dict( DD_PROFILING_MAX_FRAMES="5", DD_PROFILING_OUTPUT_PPROF="/tmp/test_collect_truncate", - ), - err=None, + ) ) def test_collect_truncate(): import os @@ -507,7 +506,7 @@ def _fib(n): @pytest.mark.skipif(not TESTING_GEVENT, reason="Not testing gevent") -@pytest.mark.subprocess(ddtrace_run=True, err=None) +@pytest.mark.subprocess(ddtrace_run=True) def test_collect_gevent_thread_task(): # TODO(taegyunkim): update echion to support gevent and test with stack v2 diff --git a/tests/profiling/collector/test_threading.py b/tests/profiling/collector/test_threading.py index 2edee5c2bd6..1360f21b68f 100644 --- a/tests/profiling/collector/test_threading.py +++ b/tests/profiling/collector/test_threading.py @@ -234,7 +234,6 @@ def test_user_threads_have_native_id(): @pytest.mark.skipif(not os.getenv("DD_PROFILE_TEST_GEVENT"), reason="gevent is not available") @pytest.mark.subprocess( env=dict(DD_PROFILING_FILE_PATH=__file__), - err=None, ) def test_lock_gevent_tasks() -> None: from gevent import monkey @@ -328,7 +327,6 @@ def validate_and_cleanup() -> None: @pytest.mark.skipif(not os.getenv("DD_PROFILE_TEST_GEVENT"), reason="gevent is not available") @pytest.mark.subprocess( env=dict(DD_PROFILING_FILE_PATH=__file__), - err=None, ) def test_rlock_gevent_tasks() -> None: from gevent import monkey @@ -417,7 +415,7 @@ def validate_and_cleanup() -> None: validate_and_cleanup() -@pytest.mark.subprocess(env=dict(DD_PROFILING_ENABLE_ASSERTS="true"), err=None) +@pytest.mark.subprocess(env=dict(DD_PROFILING_ENABLE_ASSERTS="true")) def test_assertion_error_raised_with_enable_asserts(): """Ensure that AssertionError is propagated when config.enable_asserts=True.""" import threading @@ -441,7 +439,7 @@ def 
test_assertion_error_raised_with_enable_asserts(): lock.acquire() -@pytest.mark.subprocess(env=dict(DD_PROFILING_ENABLE_ASSERTS="false"), err=None) +@pytest.mark.subprocess(env=dict(DD_PROFILING_ENABLE_ASSERTS="false")) def test_all_exceptions_suppressed_by_default() -> None: """ Ensure that exceptions are silently suppressed in the `_acquire` method diff --git a/tests/profiling/exporter/test_ddup.py b/tests/profiling/exporter/test_ddup.py index 838e6f2cc6e..f799bfe0e28 100644 --- a/tests/profiling/exporter/test_ddup.py +++ b/tests/profiling/exporter/test_ddup.py @@ -36,8 +36,7 @@ def test_ddup_start(): env=dict( DD_TAGS="hello:world", DD_PROFILING_TAGS="foo:bar,hello:python", - ), - err=None, + ) ) def test_tags_propagated(): import sys diff --git a/tests/profiling/test_accuracy.py b/tests/profiling/test_accuracy.py index affe0eca819..6fd014af181 100644 --- a/tests/profiling/test_accuracy.py +++ b/tests/profiling/test_accuracy.py @@ -60,9 +60,7 @@ def assert_almost_equal(value, target, tolerance=TOLERANCE): env=dict( DD_PROFILING_OUTPUT_PPROF="/tmp/test_accuracy_stack_v2.pprof", _DD_PROFILING_STACK_V2_ADAPTIVE_SAMPLING_ENABLED="0", - ), - # err=None suppresses psutil warning when running in Docker (Linux) on macOS - err=None, + ) ) def test_accuracy_stack_v2(): import collections diff --git a/tests/profiling/test_profiler.py b/tests/profiling/test_profiler.py index f0ec5323efa..09db302abe8 100644 --- a/tests/profiling/test_profiler.py +++ b/tests/profiling/test_profiler.py @@ -52,10 +52,7 @@ def test_tracer_api(monkeypatch): pytest.fail("Unable to find stack collector") -@pytest.mark.subprocess( - # err=None suppresses psutil warning when running in Docker (Linux) on macOS - err=None, -) +@pytest.mark.subprocess() def test_default_memory(): from ddtrace.profiling import profiler from ddtrace.profiling.collector import memalloc @@ -63,11 +60,7 @@ def test_default_memory(): assert any(isinstance(col, memalloc.MemoryCollector) for col in 
profiler.Profiler()._profiler._collectors) -@pytest.mark.subprocess( - env=dict(DD_PROFILING_MEMORY_ENABLED="true"), - # err=None suppresses psutil warning when running in Docker (Linux) on macOS - err=None, -) +@pytest.mark.subprocess(env=dict(DD_PROFILING_MEMORY_ENABLED="true")) def test_enable_memory(): from ddtrace.profiling import profiler from ddtrace.profiling.collector import memalloc @@ -75,11 +68,7 @@ def test_enable_memory(): assert any(isinstance(col, memalloc.MemoryCollector) for col in profiler.Profiler()._profiler._collectors) -@pytest.mark.subprocess( - env=dict(DD_PROFILING_MEMORY_ENABLED="false"), - # err=None suppresses psutil warning when running in Docker (Linux) on macOS - err=None, -) +@pytest.mark.subprocess(env=dict(DD_PROFILING_MEMORY_ENABLED="false")) def test_disable_memory(): from ddtrace.profiling import profiler from ddtrace.profiling.collector import memalloc @@ -160,7 +149,7 @@ def test_profiler_serverless(monkeypatch): @pytest.mark.skipif(PYTHON_VERSION_INFO < (3, 10), reason="ddtrace under Python 3.9 is deprecated") -@pytest.mark.subprocess(err=None) +@pytest.mark.subprocess() def test_profiler_ddtrace_deprecation(): """ ddtrace interfaces loaded by the profiler can be marked deprecated, and we should update @@ -183,7 +172,7 @@ def test_profiler_ddtrace_deprecation(): @pytest.mark.subprocess( env=dict(DD_PROFILING_ENABLED="true"), - err=lambda stderr: "Failed to load ddup module (mock failure message), disabling profiling" in stderr, + err="Failed to load ddup module (mock failure message), disabling profiling\n", ) def test_libdd_failure_telemetry_logging(): """Test that libdd initialization failures log to telemetry. 
This mimics @@ -214,7 +203,9 @@ def test_libdd_failure_telemetry_logging(): @pytest.mark.subprocess( - err=lambda stderr: "Failed to load ddup module" in stderr and "mock failure message" in stderr, + # We'd like to check the stderr, but it somehow leads to triggering the + # upload code path on macOS + err=None ) def test_libdd_failure_telemetry_logging_with_auto(): from unittest import mock @@ -240,8 +231,7 @@ def test_libdd_failure_telemetry_logging_with_auto(): @pytest.mark.subprocess( env=dict(DD_PROFILING_ENABLED="true"), - err=lambda stderr: "Failed to load stack_v2 module (mock failure message), falling back to v1 stack sampler" - in stderr, + err="Failed to load stack_v2 module (mock failure message), falling back to v1 stack sampler\n", ) def test_stack_v2_failure_telemetry_logging(): # Test that stack_v2 initialization failures log to telemetry. This is From c9117cfd5422bd8d7525bd37533792feabb03b0e Mon Sep 17 00:00:00 2001 From: Taegyun Kim Date: Mon, 8 Dec 2025 09:53:06 -0500 Subject: [PATCH 10/43] collapse idential branches --- ddtrace/profiling/_asyncio.py | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/ddtrace/profiling/_asyncio.py b/ddtrace/profiling/_asyncio.py index 020ff091299..b1968305879 100644 --- a/ddtrace/profiling/_asyncio.py +++ b/ddtrace/profiling/_asyncio.py @@ -43,15 +43,7 @@ def _task_get_name(task: "asyncio.Task[typing.Any]") -> str: def _call_init_asyncio(asyncio: ModuleType) -> None: from asyncio import tasks as asyncio_tasks - if sys.hexversion >= 0x030E0000: - # Python 3.14+: - # - Native tasks are in linked-list (handled in C++) - # - Third-party tasks are in Python _scheduled_tasks WeakSet - # - Pass _scheduled_tasks.data (set) so C++ can iterate it with MirrorSet - scheduled_tasks = asyncio_tasks._scheduled_tasks.data # type: ignore[attr-defined] - eager_tasks = asyncio_tasks._eager_tasks # type: ignore[attr-defined] - elif sys.hexversion >= 0x030C0000: - # Python 3.12-3.13: _scheduled_tasks has 
.data attribute from C extension + if sys.hexversion >= 0x030C0000: scheduled_tasks = asyncio_tasks._scheduled_tasks.data # type: ignore[attr-defined] eager_tasks = asyncio_tasks._eager_tasks # type: ignore[attr-defined] else: From b03e5b4f612a491ea1d824f696dcd4fa2cbac36c Mon Sep 17 00:00:00 2001 From: Taegyun Kim Date: Mon, 8 Dec 2025 09:54:28 -0500 Subject: [PATCH 11/43] remove redundant code --- .../profiling/stack_v2/echion/echion/threads.h | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/threads.h b/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/threads.h index a2ae058c622..32e6145fe9a 100644 --- a/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/threads.h +++ b/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/threads.h @@ -257,20 +257,6 @@ ThreadInfo::unwind_tasks(PyThreadState* tstate, uintptr_t tstate_addr) } } -#if PY_VERSION_HEX >= 0x030e0000 - // Python 3.14+: If no leaf tasks found but we have tasks, unwind all tasks that aren't in parent_tasks - // This handles the case where all tasks are waiting on other Tasks (not Futures/Coroutines) - // In normal asyncio usage, tasks awaiting Futures/Coroutines should have waiter=NULL and be leaf tasks - // But if all tasks are waiting on other Tasks, we need this fallback - if (leaf_tasks.empty() && !all_tasks.empty()) { - for (auto& task : all_tasks) { - if (parent_tasks.find(task->origin) == parent_tasks.end()) { - leaf_tasks.push_back(std::ref(*task)); - } - } - } -#endif - #if PY_VERSION_HEX >= 0x030e0000 // Python 3.14+: If no leaf tasks found but we have tasks, unwind all tasks that aren't in parent_tasks // This handles the case where all tasks are waiting on other Tasks (not Futures/Coroutines) From f9676e4532a1651ea4dbb049cfb56f2e96bf47b2 Mon Sep 17 00:00:00 2001 From: Taegyun Kim Date: Mon, 8 Dec 2025 09:57:29 -0500 Subject: [PATCH 12/43] use nullptr instead of NULL --- 
.../profiling/stack_v2/echion/echion/tasks.h | 16 ++++++++-------- .../profiling/stack_v2/echion/echion/threads.h | 6 +++--- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/tasks.h b/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/tasks.h index e0ac7a1fb06..83a36cdfc5e 100644 --- a/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/tasks.h +++ b/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/tasks.h @@ -295,7 +295,7 @@ TaskInfo::current(PyObject* loop) inline void get_tasks_from_linked_list(uintptr_t head_addr, PyObject* loop, std::vector& tasks) { - if (head_addr == 0 || loop == NULL) { + if (head_addr == 0 || loop == nullptr) { return; } @@ -328,8 +328,8 @@ get_tasks_from_linked_list(uintptr_t head_addr, PyObject* loop, std::vector(current_node.next); @@ -357,7 +357,7 @@ get_tasks_from_linked_list(uintptr_t head_addr, PyObject* loop, std::vector& tasks) { - if (tstate_addr == 0 || loop == NULL) { + if (tstate_addr == 0 || loop == nullptr) { return; } @@ -388,7 +388,7 @@ get_tasks_from_thread_linked_list(uintptr_t tstate_addr, PyObject* loop, std::ve inline void get_tasks_from_interpreter_linked_list(PyThreadState* tstate, PyObject* loop, std::vector& tasks) { - if (tstate == NULL || loop == NULL) { + if (tstate == nullptr || loop == nullptr) { return; } @@ -412,7 +412,7 @@ get_tasks_from_interpreter_linked_list(PyThreadState* tstate, PyObject* loop, st // ---------------------------------------------------------------------------- // TODO: Make this a "for_each_task" function? 
[[nodiscard]] inline Result> -get_all_tasks(PyObject* loop, PyThreadState* tstate = NULL, uintptr_t tstate_addr = 0) +get_all_tasks(PyObject* loop, PyThreadState* tstate = nullptr, uintptr_t tstate_addr = 0) { std::vector tasks; if (loop == NULL) @@ -430,7 +430,7 @@ get_all_tasks(PyObject* loop, PyThreadState* tstate = NULL, uintptr_t tstate_add // Second, get tasks from interpreter's linked-list (lingering tasks) // This needs tstate to dereference tstate->interp - if (tstate != NULL) { + if (tstate != nullptr) { get_tasks_from_interpreter_linked_list(tstate, loop, tasks); } @@ -438,7 +438,7 @@ get_all_tasks(PyObject* loop, PyThreadState* tstate = NULL, uintptr_t tstate_add // (asyncio_scheduled_tasks is now WeakSet.data, which is a Python set) // These are global, not per-thread, so we collect them once // If MirrorSet::create() fails, the set might be empty or invalid - skip it - if (asyncio_scheduled_tasks == NULL) { + if (asyncio_scheduled_tasks == nullptr) { // Skip if not initialized } else if (auto maybe_scheduled_tasks_set = MirrorSet::create(asyncio_scheduled_tasks)) { auto scheduled_tasks_set = std::move(*maybe_scheduled_tasks_set); diff --git a/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/threads.h b/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/threads.h index 32e6145fe9a..b4fd2a2eb3e 100644 --- a/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/threads.h +++ b/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/threads.h @@ -249,8 +249,8 @@ ThreadInfo::unwind_tasks(PyThreadState* tstate, uintptr_t tstate_addr) origin_map.emplace(task->origin, std::ref(*task)); // task->waiter is only set if task_fut_waiter points to another Task - // If task_fut_waiter points to a Future/Coroutine, waiter will be NULL - if (task->waiter != NULL) { + // If task_fut_waiter points to a Future/Coroutine, waiter will be nullptr + if (task->waiter != nullptr) { waitee_map.emplace(task->waiter->origin, std::ref(*task)); } else if 
(parent_tasks.find(task->origin) == parent_tasks.end()) { leaf_tasks.push_back(std::ref(*task)); @@ -260,7 +260,7 @@ ThreadInfo::unwind_tasks(PyThreadState* tstate, uintptr_t tstate_addr) #if PY_VERSION_HEX >= 0x030e0000 // Python 3.14+: If no leaf tasks found but we have tasks, unwind all tasks that aren't in parent_tasks // This handles the case where all tasks are waiting on other Tasks (not Futures/Coroutines) - // In normal asyncio usage, tasks awaiting Futures/Coroutines should have waiter=NULL and be leaf tasks + // In normal asyncio usage, tasks awaiting Futures/Coroutines should have waiter=nullptr and be leaf tasks // But if all tasks are waiting on other Tasks, we need this fallback if (leaf_tasks.empty() && !all_tasks.empty()) { for (auto& task : all_tasks) { From f9ff0eb49db034a7c805f76c1656d4e5154ccfd1 Mon Sep 17 00:00:00 2001 From: Taegyun Kim Date: Mon, 8 Dec 2025 09:58:54 -0500 Subject: [PATCH 13/43] split into multiple lines for readability --- .../datadog/profiling/stack_v2/src/echion/frame.cc | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/ddtrace/internal/datadog/profiling/stack_v2/src/echion/frame.cc b/ddtrace/internal/datadog/profiling/stack_v2/src/echion/frame.cc index 206b20ffdc2..bc09be9645b 100644 --- a/ddtrace/internal/datadog/profiling/stack_v2/src/echion/frame.cc +++ b/ddtrace/internal/datadog/profiling/stack_v2/src/echion/frame.cc @@ -249,11 +249,11 @@ Frame::read(PyObject* frame_addr, PyObject** prev_addr) #if PY_VERSION_HEX >= 0x030e0000 // Python 3.14+: f_executable is _PyStackRef, access bits directly // We can't use CPython API helpers as we're copying partial structs - const int lasti = - (static_cast( - (frame_addr->instr_ptr - 1 - - reinterpret_cast<_Py_CODEUNIT*>((reinterpret_cast(frame_addr->f_executable.bits)))))) - - offsetof(PyCodeObject, co_code_adaptive) / sizeof(_Py_CODEUNIT); + PyCodeObject* code_obj = reinterpret_cast(frame_addr->f_executable.bits); + _Py_CODEUNIT* code_units = 
reinterpret_cast<_Py_CODEUNIT*>(code_obj); + int instr_offset = static_cast(frame_addr->instr_ptr - 1 - code_units); + int code_offset = offsetof(PyCodeObject, co_code_adaptive) / sizeof(_Py_CODEUNIT); + const int lasti = instr_offset - code_offset; auto maybe_frame = Frame::get(reinterpret_cast(frame_addr->f_executable.bits), lasti); if (!maybe_frame) { return ErrorKind::FrameError; From 514bfe99190f64e601f121cf22464c5bdef1960b Mon Sep 17 00:00:00 2001 From: Taegyun Kim Date: Mon, 8 Dec 2025 10:59:30 -0500 Subject: [PATCH 14/43] reduce diff --- .../profiling/stack_v2/echion/echion/threads.h | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/threads.h b/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/threads.h index b4fd2a2eb3e..005934600ab 100644 --- a/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/threads.h +++ b/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/threads.h @@ -304,10 +304,15 @@ ThreadInfo::unwind_tasks(PyThreadState* tstate, uintptr_t tstate_addr) // Get the next task in the chain PyObject* task_origin = task.origin; + if (waitee_map.find(task_origin) != waitee_map.end()) { + current_task = waitee_map.find(task_origin)->second; + continue; + } - // Check for parent (gather) links first { + // Check for, e.g., gather links std::lock_guard lock(task_link_map_lock); + if (task_link_map.find(task_origin) != task_link_map.end() && origin_map.find(task_link_map[task_origin]) != origin_map.end()) { current_task = origin_map.find(task_link_map[task_origin])->second; @@ -315,12 +320,6 @@ ThreadInfo::unwind_tasks(PyThreadState* tstate, uintptr_t tstate_addr) } } - // Then check for waiter links - if (waitee_map.find(task_origin) != waitee_map.end()) { - current_task = waitee_map.find(task_origin)->second; - continue; - } - break; } From 95be05633e4a70fb03561d3755b2fba5cc60461b Mon Sep 17 00:00:00 2001 From: Taegyun Kim Date: Mon, 8 Dec 2025 
11:12:39 -0500 Subject: [PATCH 15/43] remove 3.14 special handling which is actually not needed --- .../profiling/stack_v2/echion/echion/threads.h | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/threads.h b/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/threads.h index 005934600ab..f25b6662dd8 100644 --- a/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/threads.h +++ b/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/threads.h @@ -257,20 +257,6 @@ ThreadInfo::unwind_tasks(PyThreadState* tstate, uintptr_t tstate_addr) } } -#if PY_VERSION_HEX >= 0x030e0000 - // Python 3.14+: If no leaf tasks found but we have tasks, unwind all tasks that aren't in parent_tasks - // This handles the case where all tasks are waiting on other Tasks (not Futures/Coroutines) - // In normal asyncio usage, tasks awaiting Futures/Coroutines should have waiter=nullptr and be leaf tasks - // But if all tasks are waiting on other Tasks, we need this fallback - if (leaf_tasks.empty() && !all_tasks.empty()) { - for (auto& task : all_tasks) { - if (parent_tasks.find(task->origin) == parent_tasks.end()) { - leaf_tasks.push_back(std::ref(*task)); - } - } - } -#endif - for (auto& leaf_task : leaf_tasks) { auto stack_info = std::make_unique(leaf_task.get().name, leaf_task.get().is_on_cpu); auto& stack = stack_info->stack; From 434ab996b70266a9c88d771f32b7045346d79a55 Mon Sep 17 00:00:00 2001 From: Taegyun Kim Date: Mon, 8 Dec 2025 11:30:44 -0500 Subject: [PATCH 16/43] reduce diff --- .../internal/datadog/profiling/stack_v2/echion/echion/tasks.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/tasks.h b/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/tasks.h index 83a36cdfc5e..2f0bea88f81 100644 --- a/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/tasks.h +++ 
b/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/tasks.h @@ -146,14 +146,11 @@ GenInfo::create(PyObject* gen_addr) auto frame = (PyObject*)gen.gi_frame; #endif -#if PY_VERSION_HEX >= 0x030a0000 && PY_VERSION_HEX < 0x030b0000 - // Python 3.10: Need PyFrameObject for _PyFrame_IsExecuting PyFrameObject f; if (copy_type(frame, f)) { recursion_depth--; return ErrorKind::GenInfoError; } -#endif PyObject* yf = (frame != NULL ? PyGen_yf(&gen, frame) : NULL); GenInfo::Ptr await = nullptr; From 17f3f89b50c085d6f6500fa2a2c57e174b3786bc Mon Sep 17 00:00:00 2001 From: Taegyun Kim Date: Mon, 8 Dec 2025 11:34:30 -0500 Subject: [PATCH 17/43] remove code that's not doing anything --- .../profiling/stack_v2/src/echion/frame.cc | 26 ------------------- 1 file changed, 26 deletions(-) diff --git a/ddtrace/internal/datadog/profiling/stack_v2/src/echion/frame.cc b/ddtrace/internal/datadog/profiling/stack_v2/src/echion/frame.cc index bc09be9645b..29fd2447f6c 100644 --- a/ddtrace/internal/datadog/profiling/stack_v2/src/echion/frame.cc +++ b/ddtrace/internal/datadog/profiling/stack_v2/src/echion/frame.cc @@ -290,33 +290,7 @@ Frame::read(PyObject* frame_addr, PyObject** prev_addr) frame.is_entry = frame_addr->is_entry; #endif // PY_VERSION_HEX >= 0x030c0000 } - -#if PY_VERSION_HEX >= 0x030e0000 - // Python 3.14+: Generator frames have previous = NULL (intentionally broken frame chain) - // See docs/python-3.14-generator-frame-limitation.md for details - // In _PyFrame_Copy(), CPython explicitly sets dest->previous = NULL to prevent - // dangling pointers when creating generator/coroutine frames. - if (frame_addr->previous == NULL && frame_addr->owner == FRAME_OWNED_BY_GENERATOR) { - // Best-effort fallback: try frame_obj->f_back->f_frame if available - // This is unreliable because frame_obj is lazily created and often NULL, - // and even when it exists, f_back is often NULL for generator frames. - // However, it might occasionally help in edge cases.
- *prev_addr = NULL; - if (frame_addr->frame_obj != NULL) { - PyFrameObject frame_obj; - if (copy_type(frame_addr->frame_obj, frame_obj) == 0 && frame_obj.f_back != NULL) { - PyFrameObject prev_frame_obj; - if (copy_type(frame_obj.f_back, prev_frame_obj) == 0 && prev_frame_obj.f_frame != NULL) { - *prev_addr = prev_frame_obj.f_frame; - } - } - } - } else { - *prev_addr = &frame == &INVALID_FRAME ? NULL : frame_addr->previous; - } -#else *prev_addr = &frame == &INVALID_FRAME ? NULL : frame_addr->previous; -#endif #else // PY_VERSION_HEX < 0x030b0000 // Unwind the stack from leaf to root and store it in a stack. This way we From a6b070ff9f18d57c9a88d875d5588f119eaa2d0f Mon Sep 17 00:00:00 2001 From: Taegyun Kim Date: Mon, 8 Dec 2025 11:39:50 -0500 Subject: [PATCH 18/43] reduce diff --- .../profiling/stack_v2/echion/echion/tasks.h | 30 +++++++++++-------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/tasks.h b/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/tasks.h index 2f0bea88f81..ead4a128f21 100644 --- a/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/tasks.h +++ b/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/tasks.h @@ -479,22 +479,26 @@ get_all_tasks(PyObject* loop, PyThreadState* tstate = nullptr, uintptr_t tstate_ if (asyncio_eager_tasks != NULL) { auto maybe_eager_tasks_set = MirrorSet::create(asyncio_eager_tasks); - if (maybe_eager_tasks_set) { - auto eager_tasks_set = std::move(*maybe_eager_tasks_set); - auto maybe_eager_tasks = eager_tasks_set.as_unordered_set(); - if (maybe_eager_tasks) { - auto eager_tasks = std::move(*maybe_eager_tasks); - for (auto task_addr : eager_tasks) { - auto maybe_task_info = TaskInfo::create(reinterpret_cast(task_addr)); - if (maybe_task_info) { - if ((*maybe_task_info)->loop == loop) { - tasks.push_back(std::move(*maybe_task_info)); - } - } + if (!maybe_eager_tasks_set) { + return ErrorKind::TaskInfoError; + } + + auto 
eager_tasks_set = std::move(*maybe_eager_tasks_set); + + auto maybe_eager_tasks = eager_tasks_set.as_unordered_set(); + if (!maybe_eager_tasks) { + return ErrorKind::TaskInfoError; + } + + auto eager_tasks = std::move(*maybe_eager_tasks); + for (auto task_addr : eager_tasks) { + auto maybe_task_info = TaskInfo::create(reinterpret_cast(task_addr)); + if (maybe_task_info) { + if ((*maybe_task_info)->loop == loop) { + tasks.push_back(std::move(*maybe_task_info)); } } } - // If MirrorSet::create() fails, the set might be empty or invalid - skip it } return tasks; From 3d011893f1973946f724012aabc9440d924a1314 Mon Sep 17 00:00:00 2001 From: Taegyun Kim Date: Mon, 8 Dec 2025 12:58:53 -0500 Subject: [PATCH 19/43] tidy-up includes following include what you use principle --- .../stack_v2/echion/echion/cpython/tasks.h | 2 ++ .../profiling/stack_v2/echion/echion/frame.h | 14 ++++---------- .../profiling/stack_v2/echion/echion/tasks.h | 6 +++--- .../datadog/profiling/stack_v2/src/echion/frame.cc | 13 +++++++++++++ 4 files changed, 22 insertions(+), 13 deletions(-) diff --git a/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/cpython/tasks.h b/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/cpython/tasks.h index 83b23e5338f..3e441112f2c 100644 --- a/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/cpython/tasks.h +++ b/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/cpython/tasks.h @@ -13,6 +13,7 @@ #define Py_BUILD_CORE #if PY_VERSION_HEX >= 0x030e0000 // Python 3.14+: _PyInterpreterFrame moved to new header +#include // For offsetof macro #include #include #include @@ -22,6 +23,7 @@ #elif PY_VERSION_HEX >= 0x030d0000 #include #else +#include // For offsetof macro #include #include #endif // PY_VERSION_HEX >= 0x030d0000 diff --git a/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/frame.h b/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/frame.h index 72e9de0fdcc..35f2a063485 100644 --- 
a/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/frame.h +++ b/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/frame.h @@ -17,17 +17,11 @@ #if PY_VERSION_HEX >= 0x030e0000 // Python 3.14+: _PyInterpreterFrame moved to new header #define Py_BUILD_CORE -#include -#include // Needed for complete PyFrameObject definition -#include -#include -#elif PY_VERSION_HEX >= 0x030d0000 +#include // For _PyInterpreterFrame type definition +#elif PY_VERSION_HEX >= 0x030b0000 +// Python 3.11-3.13: _PyInterpreterFrame is in pycore_frame.h #define Py_BUILD_CORE -#include -#endif // PY_VERSION_HEX >= 0x030d0000 -#if PY_VERSION_HEX >= 0x030b0000 && PY_VERSION_HEX < 0x030e0000 -#define Py_BUILD_CORE -#include +#include // For _PyInterpreterFrame type definition #endif #include diff --git a/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/tasks.h b/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/tasks.h index ead4a128f21..143b492cb33 100644 --- a/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/tasks.h +++ b/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/tasks.h @@ -15,9 +15,9 @@ #include #define Py_BUILD_CORE +#include // For offsetof macro #if PY_VERSION_HEX >= 0x030e0000 -#include // For offsetof macro -#include // for FRAME_CLEARED +#include // for FRAME_CLEARED, FRAME_EXECUTING #include // For PyInterpreterState #include // For llist_node structure #include @@ -26,7 +26,7 @@ #elif PY_VERSION_HEX >= 0x030d0000 #include #else -#include +#include // for FRAME_CLEARED, FRAME_EXECUTING #include #endif // PY_VERSION_HEX >= 0x030d0000 #else diff --git a/ddtrace/internal/datadog/profiling/stack_v2/src/echion/frame.cc b/ddtrace/internal/datadog/profiling/stack_v2/src/echion/frame.cc index 29fd2447f6c..fdd57a01878 100644 --- a/ddtrace/internal/datadog/profiling/stack_v2/src/echion/frame.cc +++ b/ddtrace/internal/datadog/profiling/stack_v2/src/echion/frame.cc @@ -3,6 +3,19 @@ #include #include +#if PY_VERSION_HEX >= 0x030b0000 +// 
Common headers needed for Python 3.11+ implementation +#include // For offsetof macro +#include // For _Py_CODEUNIT +#include // For _PyInterpreterFrame and FRAME_OWNED_BY_* constants + +#if PY_VERSION_HEX >= 0x030e0000 +// Python 3.14+: Additional headers for new structure definitions +#include // For _PyInterpreterFrame complete definition +#include // For _PyStackRef +#endif // PY_VERSION_HEX >= 0x030e0000 +#endif // PY_VERSION_HEX >= 0x030b0000 + // ---------------------------------------------------------------------------- #if PY_VERSION_HEX >= 0x030b0000 static inline int From 88a8e36ade2e44c03f9e89fca8ff6356e471088a Mon Sep 17 00:00:00 2001 From: Taegyun Kim Date: Mon, 8 Dec 2025 13:03:21 -0500 Subject: [PATCH 20/43] update comment and include --- .../internal/datadog/profiling/stack_v2/echion/echion/tasks.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/tasks.h b/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/tasks.h index 143b492cb33..4bd525a366b 100644 --- a/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/tasks.h +++ b/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/tasks.h @@ -20,9 +20,8 @@ #include // for FRAME_CLEARED, FRAME_EXECUTING #include // For PyInterpreterState #include // For llist_node structure +#include // For _PyThreadStateImpl #include -// Note: _PyThreadStateImpl is already available via echion/state.h which includes -// with Py_BUILD_CORE defined.
#elif PY_VERSION_HEX >= 0x030d0000 #include #else From ab58a609e0f2b8d70e6222fd209fd4c824264c3f Mon Sep 17 00:00:00 2001 From: Taegyun Kim Date: Mon, 8 Dec 2025 13:43:53 -0500 Subject: [PATCH 21/43] clear LSB to get the PyCodeObject --- .../profiling/stack_v2/echion/echion/cpython/tasks.h | 6 ++++-- .../datadog/profiling/stack_v2/src/echion/frame.cc | 7 +++++-- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/cpython/tasks.h b/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/cpython/tasks.h index 3e441112f2c..8308d730779 100644 --- a/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/cpython/tasks.h +++ b/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/cpython/tasks.h @@ -240,8 +240,9 @@ extern "C" } // Get the code object from f_executable.bits to know co_nlocalsplus + // Per Python 3.14 release notes (gh-123923): clear LSB to recover PyObject* pointer PyCodeObject code; - PyCodeObject* code_ptr = reinterpret_cast(frame.f_executable.bits); + PyCodeObject* code_ptr = reinterpret_cast(BITS_TO_PTR_MASKED(frame.f_executable)); if (copy_type(code_ptr, code)) { return nullptr; } @@ -271,7 +272,8 @@ extern "C" } // Extract PyObject* from _PyStackRef.bits - return reinterpret_cast(top_ref.bits); + // Per Python 3.14 release notes (gh-123923): clear LSB to recover PyObject* pointer + return BITS_TO_PTR_MASKED(top_ref); } #elif PY_VERSION_HEX >= 0x030d0000 diff --git a/ddtrace/internal/datadog/profiling/stack_v2/src/echion/frame.cc b/ddtrace/internal/datadog/profiling/stack_v2/src/echion/frame.cc index fdd57a01878..79a41b3497d 100644 --- a/ddtrace/internal/datadog/profiling/stack_v2/src/echion/frame.cc +++ b/ddtrace/internal/datadog/profiling/stack_v2/src/echion/frame.cc @@ -262,12 +262,15 @@ Frame::read(PyObject* frame_addr, PyObject** prev_addr) #if PY_VERSION_HEX >= 0x030e0000 // Python 3.14+: f_executable is _PyStackRef, access bits directly // We can't use CPython API 
helpers as we're copying partial structs - PyCodeObject* code_obj = reinterpret_cast(frame_addr->f_executable.bits); + // + // Per Python 3.14 release notes (gh-123923): f_executable uses a tagged pointer. + // Profilers must clear the least significant bit to recover the PyObject* pointer. + PyCodeObject* code_obj = reinterpret_cast(BITS_TO_PTR_MASKED(frame_addr->f_executable)); _Py_CODEUNIT* code_units = reinterpret_cast<_Py_CODEUNIT*>(code_obj); int instr_offset = static_cast(frame_addr->instr_ptr - 1 - code_units); int code_offset = offsetof(PyCodeObject, co_code_adaptive) / sizeof(_Py_CODEUNIT); const int lasti = instr_offset - code_offset; - auto maybe_frame = Frame::get(reinterpret_cast(frame_addr->f_executable.bits), lasti); + auto maybe_frame = Frame::get(code_obj, lasti); if (!maybe_frame) { return ErrorKind::FrameError; } From 52f8a9b749d393c0f5036bbfa5a7d7efa08cc9af Mon Sep 17 00:00:00 2001 From: Taegyun Kim Date: Mon, 8 Dec 2025 14:42:58 -0500 Subject: [PATCH 22/43] ignore frame owned by interpreter --- .../datadog/profiling/stack_v2/src/echion/frame.cc | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/ddtrace/internal/datadog/profiling/stack_v2/src/echion/frame.cc b/ddtrace/internal/datadog/profiling/stack_v2/src/echion/frame.cc index 79a41b3497d..0e60733442b 100644 --- a/ddtrace/internal/datadog/profiling/stack_v2/src/echion/frame.cc +++ b/ddtrace/internal/datadog/profiling/stack_v2/src/echion/frame.cc @@ -246,7 +246,15 @@ Frame::read(PyObject* frame_addr, PyObject** prev_addr) } #if PY_VERSION_HEX >= 0x030c0000 +#if PY_VERSION_HEX >= 0x030e0000 + // Python 3.14 introduced FRAME_OWNED_BY_INTERPRETER, and frames of this + // type are also ignored by the upstream profiler. 
+ // See + // https://github.com/python/cpython/blob/ebf955df7a89ed0c7968f79faec1de49f61ed7cb/Modules/_remote_debugging_module.c#L2134 + if (frame_addr->owner == FRAME_OWNED_BY_CSTACK || frame_addr->owner == FRAME_OWNED_BY_INTERPRETER) { +#else if (frame_addr->owner == FRAME_OWNED_BY_CSTACK) { +#endif // PY_VERSION_HEX >= 0x030e0000 *prev_addr = frame_addr->previous; // This is a C frame, we just need to ignore it return std::ref(C_FRAME); From 4b92ca2dfd84e3baa94e4a67abd0cd5a29a97810 Mon Sep 17 00:00:00 2001 From: Taegyun Kim Date: Mon, 8 Dec 2025 15:31:07 -0500 Subject: [PATCH 23/43] cosmetic change --- .../datadog/profiling/stack_v2/echion/echion/cpython/tasks.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/cpython/tasks.h b/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/cpython/tasks.h index 8308d730779..715c0d893cd 100644 --- a/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/cpython/tasks.h +++ b/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/cpython/tasks.h @@ -242,8 +242,8 @@ extern "C" // Get the code object from f_executable.bits to know co_nlocalsplus // Per Python 3.14 release notes (gh-123923): clear LSB to recover PyObject* pointer PyCodeObject code; - PyCodeObject* code_ptr = reinterpret_cast(BITS_TO_PTR_MASKED(frame.f_executable)); - if (copy_type(code_ptr, code)) { + auto code_addr = reinterpret_cast(BITS_TO_PTR_MASKED(frame.f_executable)); + if (copy_type(code_addr, code)) { return nullptr; } From d163dfcc09e233e93c0b5f9041730b8b7bd72aa5 Mon Sep 17 00:00:00 2001 From: Taegyun Kim Date: Mon, 8 Dec 2025 16:35:25 -0500 Subject: [PATCH 24/43] update comments and checks for PyGen_yf --- .../stack_v2/echion/echion/cpython/tasks.h | 23 +++++++++++++------ 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/cpython/tasks.h 
b/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/cpython/tasks.h index 715c0d893cd..9a1cf2e7d7b 100644 --- a/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/cpython/tasks.h +++ b/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/cpython/tasks.h @@ -227,7 +227,7 @@ extern "C" #if PY_VERSION_HEX >= 0x030e0000 // Python 3.14+: Use stackpointer and _PyStackRef - // We can't use CPython API helpers as we're copying partial structs + inline PyObject* PyGen_yf(PyGenObject* gen, PyObject* frame_addr) { if (gen->gi_frame_state != FRAME_SUSPENDED_YIELD_FROM) { @@ -239,26 +239,34 @@ extern "C" return nullptr; } - // Get the code object from f_executable.bits to know co_nlocalsplus - // Per Python 3.14 release notes (gh-123923): clear LSB to recover PyObject* pointer + // CPython asserts the following: + // assert(f->stackpointer > f->localsplus + _PyFrame_GetCode(f)->co_nlocalsplus); + // assert(!PyStackRef_IsNull(f->stackpointer[-1])); + + // Though we have to pay the price of copying the code object, we need + // to do this to catch the case where the stack is empty, as accessing + // frame.stackpointer[-1] would be an undefined behavior. + // This is necessary as frame.stacktop is removed in 3.14. PyCodeObject code; auto code_addr = reinterpret_cast(BITS_TO_PTR_MASKED(frame.f_executable)); if (copy_type(code_addr, code)) { return nullptr; } - // Calculate addresses in remote process uintptr_t frame_addr_uint = reinterpret_cast(frame_addr); uintptr_t localsplus_addr = frame_addr_uint + offsetof(_PyInterpreterFrame, localsplus); + // This computes f->localsplus + code.co_nlocalsplus. 
uintptr_t stackbase_addr = localsplus_addr + code.co_nlocalsplus * sizeof(_PyStackRef); - // stackpointer is a pointer field - when copied, it contains the remote address - // Calculate stacktop from pointer difference uintptr_t stackpointer_addr = reinterpret_cast(frame.stackpointer); - if (stackpointer_addr < stackbase_addr) { + // We want stackpointer_addr to be greater than the stackbase_addr, + // that is, the stack is not empty. + if (stackpointer_addr <= stackbase_addr) { return nullptr; } + // We can also calculate stacktop and check that it is within a reasonable range. + // Similar to 3.13's stacktop check below. int stacktop = (int)((stackpointer_addr - stackbase_addr) / sizeof(_PyStackRef)); if (stacktop < 1 || stacktop > MAX_STACK_SIZE) { @@ -266,6 +274,7 @@ extern "C" } // Read the top of stack directly from remote memory + // This is equivalent to CPython's frame.stackpointer[-1]. _PyStackRef top_ref; if (copy_type(reinterpret_cast(stackpointer_addr - sizeof(_PyStackRef)), top_ref)) { return nullptr; From 7104cb15ab4b12169a3e56bef89385598cf4ca53 Mon Sep 17 00:00:00 2001 From: Taegyun Kim Date: Mon, 8 Dec 2025 17:28:58 -0500 Subject: [PATCH 25/43] simplify code by not passing around uintptr tstate_addr --- .../profiling/stack_v2/echion/echion/tasks.h | 37 ++++---- .../stack_v2/echion/echion/threads.h | 87 ++++++++++++------- .../profiling/stack_v2/src/sampler.cpp | 14 +-- 3 files changed, 87 insertions(+), 51 deletions(-) diff --git a/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/tasks.h b/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/tasks.h index 4bd525a366b..57a3ffbb264 100644 --- a/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/tasks.h +++ b/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/tasks.h @@ -351,16 +351,15 @@ get_tasks_from_linked_list(uintptr_t head_addr, PyObject* loop, std::vector& tasks) +get_tasks_from_thread_linked_list(_PyThreadStateImpl* tstate_impl, PyObject* loop, std::vector& tasks) 
{ - if (tstate_addr == 0 || loop == nullptr) { + if (tstate_impl == nullptr || loop == nullptr) { return; } - // Calculate offset to asyncio_tasks_head field - // NOTE: tstate_addr points to PyThreadState base, which is the first field of _PyThreadStateImpl - size_t asyncio_tasks_head_offset = offsetof(_PyThreadStateImpl, asyncio_tasks_head); - uintptr_t head_addr = tstate_addr + asyncio_tasks_head_offset; + // Access asyncio_tasks_head directly from _PyThreadStateImpl + // No need for offset calculations since we have the full struct + uintptr_t head_addr = reinterpret_cast(&tstate_impl->asyncio_tasks_head); // Copy the llist_node struct from remote memory to local memory struct llist_node head_node_local; @@ -407,26 +406,25 @@ get_tasks_from_interpreter_linked_list(PyThreadState* tstate, PyObject* loop, st // ---------------------------------------------------------------------------- // TODO: Make this a "for_each_task" function? +#if PY_VERSION_HEX >= 0x030e0000 [[nodiscard]] inline Result> -get_all_tasks(PyObject* loop, PyThreadState* tstate = nullptr, uintptr_t tstate_addr = 0) +get_all_tasks(PyObject* loop, _PyThreadStateImpl* tstate_impl = nullptr) { std::vector tasks; if (loop == NULL) return tasks; -#if PY_VERSION_HEX >= 0x030e0000 // Python 3.14+: Native tasks are in linked-list per thread AND per interpreter // CPython iterates over both: - // 1. Per-thread list: tstate->asyncio_tasks_head (active tasks) + // 1. Per-thread list: tstate_impl->asyncio_tasks_head (active tasks) // 2. 
Per-interpreter list: interp->asyncio_tasks_head (lingering tasks) - // First, get tasks from this thread's linked-list (if tstate_addr is provided) - if (tstate_addr != 0) { - get_tasks_from_thread_linked_list(tstate_addr, loop, tasks); - } + // First, get tasks from this thread's linked-list (if tstate_impl is provided) + if (tstate_impl != nullptr) { + get_tasks_from_thread_linked_list(tstate_impl, loop, tasks); - // Second, get tasks from interpreter's linked-list (lingering tasks) - // This needs tstate to dereference tstate->interp - if (tstate != nullptr) { + // Second, get tasks from interpreter's linked-list (lingering tasks) + // Access PyThreadState via the first field of _PyThreadStateImpl + PyThreadState* tstate = reinterpret_cast(tstate_impl); get_tasks_from_interpreter_linked_list(tstate, loop, tasks); } @@ -450,6 +448,13 @@ get_all_tasks(PyObject* loop, PyThreadState* tstate = nullptr, uintptr_t tstate_ } } #else +[[nodiscard]] inline Result> +get_all_tasks(PyObject* loop, PyThreadState* tstate = nullptr) +{ + std::vector tasks; + if (loop == NULL) + return tasks; + auto maybe_scheduled_tasks_set = MirrorSet::create(asyncio_scheduled_tasks); if (!maybe_scheduled_tasks_set) { return ErrorKind::TaskInfoError; diff --git a/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/threads.h b/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/threads.h index f25b6662dd8..e1fb9916db0 100644 --- a/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/threads.h +++ b/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/threads.h @@ -7,6 +7,10 @@ #include #define Py_BUILD_CORE +#if PY_VERSION_HEX >= 0x030e0000 +#include // For _PyThreadStateImpl +#endif + #include #include #include @@ -28,6 +32,16 @@ #include #include +#if PY_VERSION_HEX >= 0x030e0000 +// Note: _PythreadStateImpl was introduced in Python 3.13. Every PyThreadState +// is actually allocated as a _PyThreadStateImpl. 
+// Python 3.14+: Use _PyThreadStateImpl to access asyncio_tasks_head directly +using ThreadStateType = _PyThreadStateImpl; +#else +// Pre-Python 3.14: Use PyThreadState (no asyncio_tasks_head field) +using ThreadStateType = PyThreadState; +#endif + class ThreadInfo { public: @@ -50,8 +64,8 @@ class ThreadInfo [[nodiscard]] Result update_cpu_time(); bool is_running(); - [[nodiscard]] Result sample(int64_t, PyThreadState*, microsecond_t, uintptr_t tstate_addr = 0); - void unwind(PyThreadState*, uintptr_t tstate_addr = 0); + [[nodiscard]] Result sample(int64_t, ThreadStateType*, microsecond_t); + void unwind(ThreadStateType*); // ------------------------------------------------------------------------ #if defined PL_LINUX @@ -101,7 +115,7 @@ class ThreadInfo }; private: - [[nodiscard]] Result unwind_tasks(PyThreadState* tstate, uintptr_t tstate_addr = 0); + [[nodiscard]] Result unwind_tasks(ThreadStateType*); void unwind_greenlets(PyThreadState*, unsigned long); }; @@ -186,14 +200,15 @@ inline std::mutex thread_info_map_lock; // ---------------------------------------------------------------------------- inline void -ThreadInfo::unwind(PyThreadState* tstate, uintptr_t tstate_addr) +ThreadInfo::unwind(ThreadStateType* tstate_ptr) { + PyThreadState* tstate = reinterpret_cast(tstate_ptr); + unwind_python_stack(tstate); if (asyncio_loop) { // unwind_tasks returns a [[nodiscard]] Result. // We cast it to void to ignore failures. 
- // Pass tstate and tstate_addr to unwind_tasks() so it can access this thread's linked-list - (void)unwind_tasks(tstate, tstate_addr); + (void)unwind_tasks(tstate_ptr); } // We make the assumption that gevent and asyncio are not mixed @@ -204,16 +219,14 @@ ThreadInfo::unwind(PyThreadState* tstate, uintptr_t tstate_addr) // ---------------------------------------------------------------------------- inline Result -ThreadInfo::unwind_tasks(PyThreadState* tstate, uintptr_t tstate_addr) +ThreadInfo::unwind_tasks(ThreadStateType* tstate_ptr) { std::vector leaf_tasks; std::unordered_set parent_tasks; std::unordered_map waitee_map; // Indexed by task origin std::unordered_map origin_map; // Indexed by task origin - // Pass tstate and tstate_addr to get_all_tasks() to get tasks from this thread's linked-list (Python 3.14+) - // tstate is used for dereferencing (e.g., tstate->interp), tstate_addr is used for offset calculations - auto maybe_all_tasks = get_all_tasks(reinterpret_cast(asyncio_loop), tstate, tstate_addr); + auto maybe_all_tasks = get_all_tasks(reinterpret_cast(asyncio_loop), tstate_ptr); if (!maybe_all_tasks) { return ErrorKind::TaskInfoError; } @@ -385,8 +398,10 @@ ThreadInfo::unwind_greenlets(PyThreadState* tstate, unsigned long cur_native_id) // ---------------------------------------------------------------------------- inline Result -ThreadInfo::sample(int64_t iid, PyThreadState* tstate, microsecond_t delta, uintptr_t tstate_addr) +ThreadInfo::sample(int64_t iid, ThreadStateType* tstate_ptr, microsecond_t delta) { + PyThreadState* tstate = reinterpret_cast(tstate_ptr); + Renderer::get().render_thread_begin(tstate, name, delta, thread_id, native_id); microsecond_t previous_cpu_time = cpu_time; @@ -399,7 +414,7 @@ ThreadInfo::sample(int64_t iid, PyThreadState* tstate, microsecond_t delta, uint Renderer::get().render_cpu_time(thread_is_running ? 
cpu_time - previous_cpu_time : 0); - this->unwind(tstate, tstate_addr); + this->unwind(tstate_ptr); // Render in this order of priority // 1. asyncio Tasks stacks (if any) @@ -459,8 +474,14 @@ ThreadInfo::sample(int64_t iid, PyThreadState* tstate, microsecond_t delta, uint } // ---------------------------------------------------------------------------- +#if PY_VERSION_HEX >= 0x030e0000 +using ThreadStateCallback = std::function; +#else +using ThreadStateCallback = std::function; +#endif + static void -for_each_thread(InterpreterInfo& interp, std::function callback) +for_each_thread(InterpreterInfo& interp, microsecond_t delta, ThreadStateCallback callback) { std::unordered_set threads; std::unordered_set seen_threads; @@ -481,21 +502,32 @@ for_each_thread(InterpreterInfo& interp, std::function= 0x030e0000 + // For Python 3.14+, copy _PyThreadStateImpl (which contains PyThreadState as first field) + // so we can access asyncio_tasks_head directly without offset calculations. + ThreadStateType tstate_copy; + if (copy_type(reinterpret_cast(tstate_addr), tstate_copy)) continue; + // Access PyThreadState fields via the first field of _PyThreadStateImpl + PyThreadState* tstate = reinterpret_cast(&tstate_copy); +#else + // Pre-Python 3.14: copy PyThreadState directly + ThreadStateType tstate_copy; + if (copy_type(tstate_addr, tstate_copy)) + continue; + PyThreadState* tstate = &tstate_copy; +#endif // Enqueue the unseen threads that we can reach from this thread. 
- if (tstate.next != NULL && seen_threads.find(tstate.next) == seen_threads.end()) - threads.insert(tstate.next); - if (tstate.prev != NULL && seen_threads.find(tstate.prev) == seen_threads.end()) - threads.insert(tstate.prev); + if (tstate->next != NULL && seen_threads.find(tstate->next) == seen_threads.end()) + threads.insert(tstate->next); + if (tstate->prev != NULL && seen_threads.find(tstate->prev) == seen_threads.end()) + threads.insert(tstate->prev); { const std::lock_guard guard(thread_info_map_lock); - if (thread_info_map.find(tstate.thread_id) == thread_info_map.end()) { + if (thread_info_map.find(tstate->thread_id) == thread_info_map.end()) { // If the threading module was not imported in the target then // we mistakenly take the hypno thread as the main thread. We // assume that any missing thread is the actual main thread, @@ -503,7 +535,7 @@ for_each_thread(InterpreterInfo& interp, std::function= 0x030b0000 - auto native_id = tstate.native_thread_id; + auto native_id = tstate->native_thread_id; #else auto native_id = getpid(); #endif @@ -517,7 +549,7 @@ for_each_thread(InterpreterInfo& interp, std::functionthread_id, native_id, "MainThread"); if (!maybe_thread_info) { // We failed to create the thread info object so we skip it. // We'll likely try again later with the valid thread @@ -525,14 +557,11 @@ for_each_thread(InterpreterInfo& interp, std::functionthread_id, std::move(*maybe_thread_info)); } - // Call back with the thread state, actual address, and thread info. 
- // CRITICAL: Pass both &tstate (local copy for dereferencing) and tstate_addr (actual address for offset - // calculations) - callback( - &tstate, reinterpret_cast(tstate_addr), *thread_info_map.find(tstate.thread_id)->second); + // Call back with the copied thread state + callback(&tstate_copy, delta, *thread_info_map.find(tstate->thread_id)->second); } } } diff --git a/ddtrace/internal/datadog/profiling/stack_v2/src/sampler.cpp b/ddtrace/internal/datadog/profiling/stack_v2/src/sampler.cpp index 8827e492d73..d96c3fb7d94 100644 --- a/ddtrace/internal/datadog/profiling/stack_v2/src/sampler.cpp +++ b/ddtrace/internal/datadog/profiling/stack_v2/src/sampler.cpp @@ -167,12 +167,14 @@ Sampler::sampling_thread(const uint64_t seq_num) // Perform the sample for_each_interp([&](InterpreterInfo& interp) -> void { - for_each_thread(interp, [&](PyThreadState* tstate, uintptr_t tstate_addr, ThreadInfo& thread) { - auto success = thread.sample(interp.id, tstate, wall_time_us, tstate_addr); - if (success) { - ddup_increment_sample_count(); - } - }); + // Use ThreadStateType typedef which is _PyThreadStateImpl* for 3.14+ and PyThreadState* for pre-3.14 + for_each_thread( + interp, wall_time_us, [&](ThreadStateType* tstate, microsecond_t delta, ThreadInfo& thread) { + auto success = thread.sample(interp.id, tstate, delta); + if (success) { + ddup_increment_sample_count(); + } + }); }); ddup_increment_sampling_event_count(); From d1a10ef0b666686a33b364bd4a076a54d2b1c176 Mon Sep 17 00:00:00 2001 From: Taegyun Kim Date: Mon, 8 Dec 2025 17:55:19 -0500 Subject: [PATCH 26/43] clean up some code and comments --- .../profiling/stack_v2/echion/echion/tasks.h | 48 +++++++------------ .../profiling/stack_v2/src/echion/frame.cc | 3 -- 2 files changed, 16 insertions(+), 35 deletions(-) diff --git a/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/tasks.h b/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/tasks.h index 57a3ffbb264..e9c25b47806 100644 --- 
a/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/tasks.h +++ b/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/tasks.h @@ -357,28 +357,11 @@ get_tasks_from_thread_linked_list(_PyThreadStateImpl* tstate_impl, PyObject* loo return; } - // Access asyncio_tasks_head directly from _PyThreadStateImpl - // No need for offset calculations since we have the full struct uintptr_t head_addr = reinterpret_cast(&tstate_impl->asyncio_tasks_head); - // Copy the llist_node struct from remote memory to local memory - struct llist_node head_node_local; - if (copy_type(reinterpret_cast(head_addr), head_node_local)) { - return; // Failed to read head from remote memory - } - - // Check if list is empty (head points to itself in circular list) - uintptr_t next_as_uint = reinterpret_cast(head_node_local.next); - uintptr_t prev_as_uint = reinterpret_cast(head_node_local.prev); - if (next_as_uint == head_addr && prev_as_uint == head_addr) { - return; // Empty list - } - - // Iterate over the linked-list get_tasks_from_linked_list(head_addr, loop, tasks); } -// CRITICAL: All memory access must copy structs to local memory first! 
// Get tasks from interpreter's linked-list (for lingering tasks) inline void get_tasks_from_interpreter_linked_list(PyThreadState* tstate, PyObject* loop, std::vector& tasks) @@ -428,21 +411,22 @@ get_all_tasks(PyObject* loop, _PyThreadStateImpl* tstate_impl = nullptr) get_tasks_from_interpreter_linked_list(tstate, loop, tasks); } - // Handle third-party tasks from Python _scheduled_tasks.data (set) - // (asyncio_scheduled_tasks is now WeakSet.data, which is a Python set) - // These are global, not per-thread, so we collect them once - // If MirrorSet::create() fails, the set might be empty or invalid - skip it - if (asyncio_scheduled_tasks == nullptr) { - // Skip if not initialized - } else if (auto maybe_scheduled_tasks_set = MirrorSet::create(asyncio_scheduled_tasks)) { - auto scheduled_tasks_set = std::move(*maybe_scheduled_tasks_set); - if (auto maybe_scheduled_tasks = scheduled_tasks_set.as_unordered_set()) { - auto scheduled_tasks = std::move(*maybe_scheduled_tasks); - for (auto task_addr : scheduled_tasks) { - // In WeakSet.data (set), elements are the Task objects themselves - auto maybe_task_info = TaskInfo::create(reinterpret_cast(task_addr)); - if (maybe_task_info && (*maybe_task_info)->loop == loop) { - tasks.push_back(std::move(*maybe_task_info)); + // Handle third-party tasks from Python _scheduled_tasks WeakSet + // In Python 3.14+, _scheduled_tasks is a Python-level weakref.WeakSet() that only contains + // tasks that don't inherit from asyncio.Task. Native asyncio.Task instances are stored + // in linked-lists (handled above) and are NOT added to _scheduled_tasks. + // This is typically empty in practice, but we handle it for completeness. 
+ if (asyncio_scheduled_tasks != nullptr) { + if (auto maybe_scheduled_tasks_set = MirrorSet::create(asyncio_scheduled_tasks)) { + auto scheduled_tasks_set = std::move(*maybe_scheduled_tasks_set); + if (auto maybe_scheduled_tasks = scheduled_tasks_set.as_unordered_set()) { + auto scheduled_tasks = std::move(*maybe_scheduled_tasks); + for (auto task_addr : scheduled_tasks) { + // In WeakSet.data (set), elements are the Task objects themselves + auto maybe_task_info = TaskInfo::create(reinterpret_cast(task_addr)); + if (maybe_task_info && (*maybe_task_info)->loop == loop) { + tasks.push_back(std::move(*maybe_task_info)); + } } } } diff --git a/ddtrace/internal/datadog/profiling/stack_v2/src/echion/frame.cc b/ddtrace/internal/datadog/profiling/stack_v2/src/echion/frame.cc index 0e60733442b..d99058f8e0e 100644 --- a/ddtrace/internal/datadog/profiling/stack_v2/src/echion/frame.cc +++ b/ddtrace/internal/datadog/profiling/stack_v2/src/echion/frame.cc @@ -268,9 +268,6 @@ Frame::read(PyObject* frame_addr, PyObject** prev_addr) // We cannot use _PyInterpreterFrame_LASTI because _PyCode_CODE reads // from the code object. #if PY_VERSION_HEX >= 0x030e0000 - // Python 3.14+: f_executable is _PyStackRef, access bits directly - // We can't use CPython API helpers as we're copying partial structs - // // Per Python 3.14 release notes (gh-123923): f_executable uses a tagged pointer. // Profilers must clear the least significant bit to recover the PyObject* pointer. 
PyCodeObject* code_obj = reinterpret_cast(BITS_TO_PTR_MASKED(frame_addr->f_executable)); From ea99d06a90af31cb537556385209c929a5f5d7b2 Mon Sep 17 00:00:00 2001 From: Taegyun Kim Date: Tue, 9 Dec 2025 11:55:53 -0500 Subject: [PATCH 27/43] update test_asyncio_as_completed --- tests/profiling/collector/test_asyncio_as_completed.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tests/profiling/collector/test_asyncio_as_completed.py b/tests/profiling/collector/test_asyncio_as_completed.py index 67d237f4387..47f09622f5a 100644 --- a/tests/profiling/collector/test_asyncio_as_completed.py +++ b/tests/profiling/collector/test_asyncio_as_completed.py @@ -32,10 +32,13 @@ async def wait_and_return_delay(t: float) -> float: async def main() -> None: # Create a mix of Tasks and Coroutines + # TODO(taegyunkim): For Python 3.14+, investigate why we need to increase + # the sleep time to get all the samples as expected. The divisor was 10 + # and changed to 5. futures = [ - asyncio.create_task(wait_and_return_delay(float(i) / 10)) + asyncio.create_task(wait_and_return_delay(float(i) / 5)) if i % 2 == 0 - else wait_and_return_delay(float(i) / 10) + else wait_and_return_delay(float(i) / 5) for i in range(2, 12) ] assert len(futures) == 10 From bc24c6a9c63e039341bc1175b0e43f0ea00e0acb Mon Sep 17 00:00:00 2001 From: Taegyun Kim Date: Tue, 9 Dec 2025 12:31:46 -0500 Subject: [PATCH 28/43] update lineno --- tests/profiling/collector/test_asyncio_as_completed.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/profiling/collector/test_asyncio_as_completed.py b/tests/profiling/collector/test_asyncio_as_completed.py index 47f09622f5a..2e715b4a8d7 100644 --- a/tests/profiling/collector/test_asyncio_as_completed.py +++ b/tests/profiling/collector/test_asyncio_as_completed.py @@ -93,7 +93,7 @@ async def main() -> None: pprof_utils.StackLocation( function_name="main", filename="test_asyncio_as_completed.py", - line_no=main.__code__.co_firstlineno 
+ 17, + line_no=main.__code__.co_firstlineno + 20, ), ] From 49444d7548c777a4dd83391e33300435d98c83e9 Mon Sep 17 00:00:00 2001 From: Taegyun Kim Date: Tue, 9 Dec 2025 16:44:49 -0500 Subject: [PATCH 29/43] update to 3 --- tests/profiling/collector/test_asyncio_as_completed.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/profiling/collector/test_asyncio_as_completed.py b/tests/profiling/collector/test_asyncio_as_completed.py index 2e715b4a8d7..52da6d69530 100644 --- a/tests/profiling/collector/test_asyncio_as_completed.py +++ b/tests/profiling/collector/test_asyncio_as_completed.py @@ -34,11 +34,11 @@ async def main() -> None: # Create a mix of Tasks and Coroutines # TODO(taegyunkim): For Python 3.14+, investigate why we need to increase # the sleep time to get all the samples as expected. The divisor was 10 - # and changed to 5. + # and changed to 3. futures = [ - asyncio.create_task(wait_and_return_delay(float(i) / 5)) + asyncio.create_task(wait_and_return_delay(float(i) / 3)) if i % 2 == 0 - else wait_and_return_delay(float(i) / 5) + else wait_and_return_delay(float(i) / 3) for i in range(2, 12) ] assert len(futures) == 10 From cc21de71c37e9c5984802630d02b7a27ba8321d5 Mon Sep 17 00:00:00 2001 From: Taegyun Kim Date: Tue, 9 Dec 2025 16:58:22 -0500 Subject: [PATCH 30/43] Update comment --- .../collector/test_asyncio_as_completed.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/tests/profiling/collector/test_asyncio_as_completed.py b/tests/profiling/collector/test_asyncio_as_completed.py index 52da6d69530..0dc071d8bfd 100644 --- a/tests/profiling/collector/test_asyncio_as_completed.py +++ b/tests/profiling/collector/test_asyncio_as_completed.py @@ -32,13 +32,15 @@ async def wait_and_return_delay(t: float) -> float: async def main() -> None: # Create a mix of Tasks and Coroutines - # TODO(taegyunkim): For Python 3.14+, investigate why we need to increase - # the sleep time to get all the samples as 
expected. The divisor was 10 - # and changed to 3. + divisor = 10 if PYVERSION < (3, 14) else 3 + # For Python 3.14+, we increase the sleep time to get all the samples + # as expected. It's likely because the CPython 3.14+ keeps track of + # the tasks in a linked list, and each node needs to be copied using + # a system call. futures = [ - asyncio.create_task(wait_and_return_delay(float(i) / 3)) + asyncio.create_task(wait_and_return_delay(float(i) / divisor)) if i % 2 == 0 - else wait_and_return_delay(float(i) / 3) + else wait_and_return_delay(float(i) / divisor) for i in range(2, 12) ] assert len(futures) == 10 @@ -93,7 +95,7 @@ async def main() -> None: pprof_utils.StackLocation( function_name="main", filename="test_asyncio_as_completed.py", - line_no=main.__code__.co_firstlineno + 20, + line_no=main.__code__.co_firstlineno + 22, ), ] From 7104d623fc2caa69b53466147381c709cc7704e6 Mon Sep 17 00:00:00 2001 From: Taegyun Kim Date: Wed, 10 Dec 2025 15:23:57 -0500 Subject: [PATCH 31/43] remove unnecessary comments --- .../stack_v2/echion/echion/cpython/tasks.h | 6 +++--- .../profiling/stack_v2/echion/echion/frame.h | 4 ++-- .../profiling/stack_v2/echion/echion/tasks.h | 12 ++++++------ .../profiling/stack_v2/echion/echion/threads.h | 2 +- .../datadog/profiling/stack_v2/src/echion/frame.cc | 14 +++++++------- 5 files changed, 19 insertions(+), 19 deletions(-) diff --git a/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/cpython/tasks.h b/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/cpython/tasks.h index 9a1cf2e7d7b..c5dbf088181 100644 --- a/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/cpython/tasks.h +++ b/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/cpython/tasks.h @@ -13,17 +13,17 @@ #define Py_BUILD_CORE #if PY_VERSION_HEX >= 0x030e0000 // Python 3.14+: _PyInterpreterFrame moved to new header -#include // For offsetof macro +#include #include #include #include -#include // For llist_node structure +#include #include 
#include #elif PY_VERSION_HEX >= 0x030d0000 #include #else -#include // For offsetof macro +#include #include #include #endif // PY_VERSION_HEX >= 0x030d0000 diff --git a/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/frame.h b/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/frame.h index 35f2a063485..1bcab6c5cd9 100644 --- a/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/frame.h +++ b/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/frame.h @@ -17,11 +17,11 @@ #if PY_VERSION_HEX >= 0x030e0000 // Python 3.14+: _PyInterpreterFrame moved to new header #define Py_BUILD_CORE -#include // For _PyInterpreterFrame type definition +#include #elif PY_VERSION_HEX >= 0x030b0000 // Python 3.11-3.13: _PyInterpreterFrame is in pycore_frame.h #define Py_BUILD_CORE -#include // For _PyInterpreterFrame type definition +#include #endif #include diff --git a/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/tasks.h b/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/tasks.h index e9c25b47806..2d1dae47606 100644 --- a/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/tasks.h +++ b/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/tasks.h @@ -15,17 +15,17 @@ #include #define Py_BUILD_CORE -#include // For offsetof macro +#include #if PY_VERSION_HEX >= 0x030e0000 -#include // for FRAME_CLEARED, FRAME_EXECUTING -#include // For PyInterpreterState -#include // For llist_node structure -#include // For _PyThreadStateImpl +#include +#include +#include +#include #include #elif PY_VERSION_HEX >= 0x030d0000 #include #else -#include // for FRAME_CLEARED, FRAME_EXECUTING +#include #include #endif // PY_VERSION_HEX >= 0x030d0000 #else diff --git a/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/threads.h b/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/threads.h index b9b81a84081..336f8583c67 100644 --- a/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/threads.h +++ 
b/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/threads.h @@ -8,7 +8,7 @@ #define Py_BUILD_CORE #if PY_VERSION_HEX >= 0x030e0000 -#include // For _PyThreadStateImpl +#include #endif #include diff --git a/ddtrace/internal/datadog/profiling/stack_v2/src/echion/frame.cc b/ddtrace/internal/datadog/profiling/stack_v2/src/echion/frame.cc index d99058f8e0e..d5e1d512832 100644 --- a/ddtrace/internal/datadog/profiling/stack_v2/src/echion/frame.cc +++ b/ddtrace/internal/datadog/profiling/stack_v2/src/echion/frame.cc @@ -5,16 +5,16 @@ #if PY_VERSION_HEX >= 0x030b0000 // Common headers needed for Python 3.11+ implementation -#include // For offsetof macro -#include // For _Py_CODEUNIT -#include // For _PyInterpreterFrame and FRAME_OWNED_BY_* constants +#include +#include +#include #if PY_VERSION_HEX >= 0x030e0000 // Python 3.14+: Additional headers for new structure definitions -#include // For _PyInterpreterFrame complete definition -#include // For _PyStackRef -#endif // PY_VERSION_HEX >= 0x030e0000 -#endif // PY_VERSION_HEX >= 0x030b0000 +#include +#include +#endif // PY_VERSION_HEX >= 0x030e0000 +#endif // PY_VERSION_HEX >= 0x030b0000 // ---------------------------------------------------------------------------- #if PY_VERSION_HEX >= 0x030b0000 From 7f3be8aaefbe7cbd5bd530c045e1db64b0ee1880 Mon Sep 17 00:00:00 2001 From: Taegyun Kim Date: Wed, 10 Dec 2025 15:26:13 -0500 Subject: [PATCH 32/43] use 1<<16 --- .../internal/datadog/profiling/stack_v2/echion/echion/tasks.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/tasks.h b/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/tasks.h index 2d1dae47606..1c6dadb3d33 100644 --- a/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/tasks.h +++ b/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/tasks.h @@ -313,7 +313,7 @@ get_tasks_from_linked_list(uintptr_t head_addr, PyObject* loop, std::vector Date: Wed, 10 Dec 2025 
15:29:43 -0500 Subject: [PATCH 33/43] adopt review suggestions --- .../datadog/profiling/stack_v2/echion/echion/tasks.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/tasks.h b/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/tasks.h index 1c6dadb3d33..ca1ebe7800f 100644 --- a/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/tasks.h +++ b/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/tasks.h @@ -337,8 +337,9 @@ get_tasks_from_linked_list(uintptr_t head_addr, PyObject* loop, std::vector(task_addr_uint)); if (maybe_task_info) { - if ((*maybe_task_info)->loop == loop) { - tasks.push_back(std::move(*maybe_task_info)); + auto& task_info = *maybe_task_info; + if (task_info->loop == loop) { + tasks.push_back(std::move(task_info)); } } From a2971f704275973b265a1c593826bf437eafeffc Mon Sep 17 00:00:00 2001 From: Taegyun Kim Date: Wed, 10 Dec 2025 15:51:48 -0500 Subject: [PATCH 34/43] use return --- .../profiling/stack_v2/echion/echion/tasks.h | 46 ++++++++++++------- 1 file changed, 29 insertions(+), 17 deletions(-) diff --git a/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/tasks.h b/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/tasks.h index ca1ebe7800f..763ba34a903 100644 --- a/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/tasks.h +++ b/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/tasks.h @@ -288,17 +288,17 @@ TaskInfo::current(PyObject* loop) // ---------------------------------------------------------------------------- #if PY_VERSION_HEX >= 0x030e0000 // Python 3.14+: Get tasks from a single thread's linked-list -inline void +[[nodiscard]] inline Result get_tasks_from_linked_list(uintptr_t head_addr, PyObject* loop, std::vector& tasks) { if (head_addr == 0 || loop == nullptr) { - return; + return ErrorKind::TaskInfoError; } // Copy head node struct from remote memory to local memory struct llist_node 
head_node_local; if (copy_type(reinterpret_cast(head_addr), head_node_local)) { - return; + return ErrorKind::TaskInfoError; } // Check if list is empty (head points to itself in circular list) @@ -306,7 +306,7 @@ get_tasks_from_linked_list(uintptr_t head_addr, PyObject* loop, std::vector(head_node_local.next); uintptr_t prev_as_uint = reinterpret_cast(head_node_local.prev); if (next_as_uint == head_addr_uint && prev_as_uint == head_addr_uint) { - return; + return Result::ok(); } struct llist_node current_node = head_node_local; // Start with head node @@ -321,11 +321,11 @@ get_tasks_from_linked_list(uintptr_t head_addr, PyObject* loop, std::vector(current_node.next) != head_addr_uint) { // Safety: prevent infinite loops if (++iteration_count > MAX_ITERATIONS) { - return; + return ErrorKind::TaskInfoError; } if (current_node.next == nullptr) { - return; // nullptr pointer - invalid list + return ErrorKind::TaskInfoError; // nullptr pointer - invalid list } uintptr_t next_node_addr = reinterpret_cast(current_node.next); @@ -345,46 +345,57 @@ get_tasks_from_linked_list(uintptr_t head_addr, PyObject* loop, std::vector(next_node_addr), current_node)) { - return; // Failed to read next node + return ErrorKind::TaskInfoError; // Failed to read next node } current_node_addr = next_node_addr; // Update address for next iteration } + + return Result::ok(); } -inline void +// Get tasks from thread's linked-list (for active tasks) +// NOTE: This function uses an output parameter instead of returning Result> +// for performance reasons. When accumulating tasks from multiple sources (thread list, interpreter list, +// scheduled tasks), using output parameters allows direct appending to a single vector, avoiding the +// overhead of moving/copying elements between intermediate vectors. 
+[[nodiscard]] inline Result get_tasks_from_thread_linked_list(_PyThreadStateImpl* tstate_impl, PyObject* loop, std::vector& tasks) { if (tstate_impl == nullptr || loop == nullptr) { - return; + return ErrorKind::TaskInfoError; } uintptr_t head_addr = reinterpret_cast(&tstate_impl->asyncio_tasks_head); - get_tasks_from_linked_list(head_addr, loop, tasks); + return get_tasks_from_linked_list(head_addr, loop, tasks); } // Get tasks from interpreter's linked-list (for lingering tasks) -inline void +// NOTE: This function uses an output parameter instead of returning Result> +// for performance reasons. When accumulating tasks from multiple sources (thread list, interpreter list, +// scheduled tasks), using output parameters allows direct appending to a single vector, avoiding the +// overhead of moving/copying elements between intermediate vectors. +[[nodiscard]] inline Result get_tasks_from_interpreter_linked_list(PyThreadState* tstate, PyObject* loop, std::vector& tasks) { if (tstate == nullptr || loop == nullptr) { - return; + return ErrorKind::TaskInfoError; } // Step 1: Get interpreter state from thread state // tstate->interp points to PyInterpreterState PyInterpreterState interp; if (copy_type(tstate->interp, interp)) { - return; + return ErrorKind::TaskInfoError; } // Step 2: Calculate interpreter's asyncio_tasks_head address uintptr_t interp_addr = reinterpret_cast(tstate->interp); - size_t asyncio_tasks_head_offset = offsetof(PyInterpreterState, asyncio_tasks_head); + constexpr size_t asyncio_tasks_head_offset = offsetof(PyInterpreterState, asyncio_tasks_head); uintptr_t head_addr = interp_addr + asyncio_tasks_head_offset; // Step 3: Call the shared linked-list iteration function - get_tasks_from_linked_list(head_addr, loop, tasks); + return get_tasks_from_linked_list(head_addr, loop, tasks); } #endif @@ -403,13 +414,14 @@ get_all_tasks(PyObject* loop, _PyThreadStateImpl* tstate_impl = nullptr) // 1. 
Per-thread list: tstate_impl->asyncio_tasks_head (active tasks) // 2. Per-interpreter list: interp->asyncio_tasks_head (lingering tasks) // First, get tasks from this thread's linked-list (if tstate_impl is provided) + // Note: We continue processing even if one source fails to maximize partial results if (tstate_impl != nullptr) { - get_tasks_from_thread_linked_list(tstate_impl, loop, tasks); + (void)get_tasks_from_thread_linked_list(tstate_impl, loop, tasks); // Second, get tasks from interpreter's linked-list (lingering tasks) // Access PyThreadState via the first field of _PyThreadStateImpl PyThreadState* tstate = reinterpret_cast(tstate_impl); - get_tasks_from_interpreter_linked_list(tstate, loop, tasks); + (void)get_tasks_from_interpreter_linked_list(tstate, loop, tasks); } // Handle third-party tasks from Python _scheduled_tasks WeakSet From 9f762f396d9c707ff2dd96000eda1939d0d613bf Mon Sep 17 00:00:00 2001 From: Taegyun Kim Date: Wed, 10 Dec 2025 15:52:01 -0500 Subject: [PATCH 35/43] revert lines --- .../datadog/profiling/stack_v2/echion/echion/threads.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/threads.h b/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/threads.h index 336f8583c67..0e1925888e9 100644 --- a/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/threads.h +++ b/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/threads.h @@ -273,11 +273,9 @@ ThreadInfo::unwind_tasks(ThreadStateType* tstate_ptr) for (auto& task : all_tasks) { origin_map.emplace(task->origin, std::ref(*task)); - // task->waiter is only set if task_fut_waiter points to another Task - // If task_fut_waiter points to a Future/Coroutine, waiter will be nullptr - if (task->waiter != nullptr) { + if (task->waiter != nullptr) waitee_map.emplace(task->waiter->origin, std::ref(*task)); - } else if (parent_tasks.find(task->origin) == parent_tasks.end()) { + else if 
(parent_tasks.find(task->origin) == parent_tasks.end()) { leaf_tasks.push_back(std::ref(*task)); } } From 613505353edcffb7c64ee1d2ba724692c6a5311c Mon Sep 17 00:00:00 2001 From: Taegyun Kim Date: Wed, 10 Dec 2025 17:32:21 -0500 Subject: [PATCH 36/43] avoid copying over large struct, _PyThreadStateImpl, and copy only needed --- .../profiling/stack_v2/echion/echion/tasks.h | 220 ------------ .../stack_v2/echion/echion/threads.h | 317 +++++++++++++++--- .../profiling/stack_v2/src/sampler.cpp | 14 +- 3 files changed, 271 insertions(+), 280 deletions(-) diff --git a/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/tasks.h b/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/tasks.h index 763ba34a903..30061c9bf0e 100644 --- a/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/tasks.h +++ b/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/tasks.h @@ -285,226 +285,6 @@ TaskInfo::current(PyObject* loop) return TaskInfo::create(reinterpret_cast(task)); } -// ---------------------------------------------------------------------------- -#if PY_VERSION_HEX >= 0x030e0000 -// Python 3.14+: Get tasks from a single thread's linked-list -[[nodiscard]] inline Result -get_tasks_from_linked_list(uintptr_t head_addr, PyObject* loop, std::vector& tasks) -{ - if (head_addr == 0 || loop == nullptr) { - return ErrorKind::TaskInfoError; - } - - // Copy head node struct from remote memory to local memory - struct llist_node head_node_local; - if (copy_type(reinterpret_cast(head_addr), head_node_local)) { - return ErrorKind::TaskInfoError; - } - - // Check if list is empty (head points to itself in circular list) - uintptr_t head_addr_uint = head_addr; - uintptr_t next_as_uint = reinterpret_cast(head_node_local.next); - uintptr_t prev_as_uint = reinterpret_cast(head_node_local.prev); - if (next_as_uint == head_addr_uint && prev_as_uint == head_addr_uint) { - return Result::ok(); - } - - struct llist_node current_node = head_node_local; // Start with head node - 
uintptr_t current_node_addr = head_addr; // Address of current node - - // Copied from CPython's _remote_debugging_module.c: MAX_ITERATIONS - const size_t MAX_ITERATIONS = 1 << 16; - size_t iteration_count = 0; - - // Iterate over linked-list. The linked list is circular, so we stop - // when we're back at head. - while (reinterpret_cast(current_node.next) != head_addr_uint) { - // Safety: prevent infinite loops - if (++iteration_count > MAX_ITERATIONS) { - return ErrorKind::TaskInfoError; - } - - if (current_node.next == nullptr) { - return ErrorKind::TaskInfoError; // nullptr pointer - invalid list - } - - uintptr_t next_node_addr = reinterpret_cast(current_node.next); - - // Calculate task_addr from current_node.next - size_t task_node_offset_val = offsetof(TaskObj, task_node); - uintptr_t task_addr_uint = next_node_addr - task_node_offset_val; - - // Create TaskInfo for the task - auto maybe_task_info = TaskInfo::create(reinterpret_cast(task_addr_uint)); - if (maybe_task_info) { - auto& task_info = *maybe_task_info; - if (task_info->loop == loop) { - tasks.push_back(std::move(task_info)); - } - } - - // Read next node from current_node.next into current_node - if (copy_type(reinterpret_cast(next_node_addr), current_node)) { - return ErrorKind::TaskInfoError; // Failed to read next node - } - current_node_addr = next_node_addr; // Update address for next iteration - } - - return Result::ok(); -} - -// Get tasks from thread's linked-list (for active tasks) -// NOTE: This function uses an output parameter instead of returning Result> -// for performance reasons. When accumulating tasks from multiple sources (thread list, interpreter list, -// scheduled tasks), using output parameters allows direct appending to a single vector, avoiding the -// overhead of moving/copying elements between intermediate vectors. 
-[[nodiscard]] inline Result -get_tasks_from_thread_linked_list(_PyThreadStateImpl* tstate_impl, PyObject* loop, std::vector& tasks) -{ - if (tstate_impl == nullptr || loop == nullptr) { - return ErrorKind::TaskInfoError; - } - - uintptr_t head_addr = reinterpret_cast(&tstate_impl->asyncio_tasks_head); - - return get_tasks_from_linked_list(head_addr, loop, tasks); -} - -// Get tasks from interpreter's linked-list (for lingering tasks) -// NOTE: This function uses an output parameter instead of returning Result> -// for performance reasons. When accumulating tasks from multiple sources (thread list, interpreter list, -// scheduled tasks), using output parameters allows direct appending to a single vector, avoiding the -// overhead of moving/copying elements between intermediate vectors. -[[nodiscard]] inline Result -get_tasks_from_interpreter_linked_list(PyThreadState* tstate, PyObject* loop, std::vector& tasks) -{ - if (tstate == nullptr || loop == nullptr) { - return ErrorKind::TaskInfoError; - } - - // Step 1: Get interpreter state from thread state - // tstate->interp points to PyInterpreterState - PyInterpreterState interp; - if (copy_type(tstate->interp, interp)) { - return ErrorKind::TaskInfoError; - } - - // Step 2: Calculate interpreter's asyncio_tasks_head address - uintptr_t interp_addr = reinterpret_cast(tstate->interp); - constexpr size_t asyncio_tasks_head_offset = offsetof(PyInterpreterState, asyncio_tasks_head); - uintptr_t head_addr = interp_addr + asyncio_tasks_head_offset; - - // Step 3: Call the shared linked-list iteration function - return get_tasks_from_linked_list(head_addr, loop, tasks); -} -#endif - -// ---------------------------------------------------------------------------- -// TODO: Make this a "for_each_task" function? 
-#if PY_VERSION_HEX >= 0x030e0000 -[[nodiscard]] inline Result> -get_all_tasks(PyObject* loop, _PyThreadStateImpl* tstate_impl = nullptr) -{ - std::vector tasks; - if (loop == NULL) - return tasks; - - // Python 3.14+: Native tasks are in linked-list per thread AND per interpreter - // CPython iterates over both: - // 1. Per-thread list: tstate_impl->asyncio_tasks_head (active tasks) - // 2. Per-interpreter list: interp->asyncio_tasks_head (lingering tasks) - // First, get tasks from this thread's linked-list (if tstate_impl is provided) - // Note: We continue processing even if one source fails to maximize partial results - if (tstate_impl != nullptr) { - (void)get_tasks_from_thread_linked_list(tstate_impl, loop, tasks); - - // Second, get tasks from interpreter's linked-list (lingering tasks) - // Access PyThreadState via the first field of _PyThreadStateImpl - PyThreadState* tstate = reinterpret_cast(tstate_impl); - (void)get_tasks_from_interpreter_linked_list(tstate, loop, tasks); - } - - // Handle third-party tasks from Python _scheduled_tasks WeakSet - // In Python 3.14+, _scheduled_tasks is a Python-level weakref.WeakSet() that only contains - // tasks that don't inherit from asyncio.Task. Native asyncio.Task instances are stored - // in linked-lists (handled above) and are NOT added to _scheduled_tasks. - // This is typically empty in practice, but we handle it for completeness. 
- if (asyncio_scheduled_tasks != nullptr) { - if (auto maybe_scheduled_tasks_set = MirrorSet::create(asyncio_scheduled_tasks)) { - auto scheduled_tasks_set = std::move(*maybe_scheduled_tasks_set); - if (auto maybe_scheduled_tasks = scheduled_tasks_set.as_unordered_set()) { - auto scheduled_tasks = std::move(*maybe_scheduled_tasks); - for (auto task_addr : scheduled_tasks) { - // In WeakSet.data (set), elements are the Task objects themselves - auto maybe_task_info = TaskInfo::create(reinterpret_cast(task_addr)); - if (maybe_task_info && (*maybe_task_info)->loop == loop) { - tasks.push_back(std::move(*maybe_task_info)); - } - } - } - } - } -#else -[[nodiscard]] inline Result> -get_all_tasks(PyObject* loop, PyThreadState* tstate = nullptr) -{ - std::vector tasks; - if (loop == NULL) - return tasks; - - auto maybe_scheduled_tasks_set = MirrorSet::create(asyncio_scheduled_tasks); - if (!maybe_scheduled_tasks_set) { - return ErrorKind::TaskInfoError; - } - - auto scheduled_tasks_set = std::move(*maybe_scheduled_tasks_set); - auto maybe_scheduled_tasks = scheduled_tasks_set.as_unordered_set(); - if (!maybe_scheduled_tasks) { - return ErrorKind::TaskInfoError; - } - - auto scheduled_tasks = std::move(*maybe_scheduled_tasks); - for (auto task_wr_addr : scheduled_tasks) { - PyWeakReference task_wr; - if (copy_type(task_wr_addr, task_wr)) - continue; - - auto maybe_task_info = TaskInfo::create(reinterpret_cast(task_wr.wr_object)); - if (maybe_task_info) { - if ((*maybe_task_info)->loop == loop) { - tasks.push_back(std::move(*maybe_task_info)); - } - } - } -#endif - - if (asyncio_eager_tasks != NULL) { - auto maybe_eager_tasks_set = MirrorSet::create(asyncio_eager_tasks); - if (!maybe_eager_tasks_set) { - return ErrorKind::TaskInfoError; - } - - auto eager_tasks_set = std::move(*maybe_eager_tasks_set); - - auto maybe_eager_tasks = eager_tasks_set.as_unordered_set(); - if (!maybe_eager_tasks) { - return ErrorKind::TaskInfoError; - } - - auto eager_tasks = 
std::move(*maybe_eager_tasks); - for (auto task_addr : eager_tasks) { - auto maybe_task_info = TaskInfo::create(reinterpret_cast(task_addr)); - if (maybe_task_info) { - if ((*maybe_task_info)->loop == loop) { - tasks.push_back(std::move(*maybe_task_info)); - } - } - } - } - - return tasks; -} - // ---------------------------------------------------------------------------- inline std::vector> current_tasks; diff --git a/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/threads.h b/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/threads.h index 0e1925888e9..78cdc93e2af 100644 --- a/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/threads.h +++ b/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/threads.h @@ -32,16 +32,6 @@ #include #include -#if PY_VERSION_HEX >= 0x030e0000 -// Note: _PythreadStateImpl was introduced in Python 3.13. Every PyThreadState -// is actually allocated as a _PyThreadStateImpl. -// Python 3.14+: Use _PyThreadStateImpl to access asyncio_tasks_head directly -using ThreadStateType = _PyThreadStateImpl; -#else -// Pre-Python 3.14: Use PyThreadState (no asyncio_tasks_head field) -using ThreadStateType = PyThreadState; -#endif - class ThreadInfo { public: @@ -60,12 +50,13 @@ class ThreadInfo microsecond_t cpu_time; uintptr_t asyncio_loop = 0; + uintptr_t tstate_addr = 0; // Remote address of PyThreadState for accessing asyncio_tasks_head [[nodiscard]] Result update_cpu_time(); bool is_running(); - [[nodiscard]] Result sample(int64_t, ThreadStateType*, microsecond_t); - void unwind(ThreadStateType*); + [[nodiscard]] Result sample(int64_t, PyThreadState*, microsecond_t); + void unwind(PyThreadState*); // ------------------------------------------------------------------------ #if defined PL_LINUX @@ -115,8 +106,15 @@ class ThreadInfo }; private: - [[nodiscard]] Result unwind_tasks(ThreadStateType*); + [[nodiscard]] Result unwind_tasks(PyThreadState*); void unwind_greenlets(PyThreadState*, unsigned long); + 
[[nodiscard]] Result> get_all_tasks(PyThreadState* tstate); +#if PY_VERSION_HEX >= 0x030e0000 + [[nodiscard]] Result get_tasks_from_thread_linked_list(std::vector& tasks); + [[nodiscard]] Result get_tasks_from_interpreter_linked_list(PyThreadState* tstate, + std::vector& tasks); + [[nodiscard]] Result get_tasks_from_linked_list(uintptr_t head_addr, std::vector& tasks); +#endif }; inline Result @@ -200,15 +198,13 @@ inline std::mutex thread_info_map_lock; // ---------------------------------------------------------------------------- inline void -ThreadInfo::unwind(ThreadStateType* tstate_ptr) +ThreadInfo::unwind(PyThreadState* tstate) { - PyThreadState* tstate = reinterpret_cast(tstate_ptr); - unwind_python_stack(tstate); if (asyncio_loop) { // unwind_tasks returns a [[nodiscard]] Result. // We cast it to void to ignore failures. - (void)unwind_tasks(tstate_ptr); + (void)unwind_tasks(tstate); } // We make the assumption that gevent and asyncio are not mixed @@ -219,7 +215,7 @@ ThreadInfo::unwind(ThreadStateType* tstate_ptr) // ---------------------------------------------------------------------------- inline Result -ThreadInfo::unwind_tasks(ThreadStateType* tstate_ptr) +ThreadInfo::unwind_tasks(PyThreadState* tstate) { std::vector leaf_tasks; std::unordered_set parent_tasks; @@ -227,7 +223,7 @@ ThreadInfo::unwind_tasks(ThreadStateType* tstate_ptr) std::unordered_map origin_map; // Indexed by task origin static std::unordered_set previous_task_objects; - auto maybe_all_tasks = get_all_tasks(reinterpret_cast(asyncio_loop), tstate_ptr); + auto maybe_all_tasks = get_all_tasks(tstate); if (!maybe_all_tasks) { return ErrorKind::TaskInfoError; } @@ -342,6 +338,235 @@ ThreadInfo::unwind_tasks(ThreadStateType* tstate_ptr) return Result::ok(); } +// ---------------------------------------------------------------------------- +#if PY_VERSION_HEX >= 0x030e0000 +inline Result +ThreadInfo::get_tasks_from_thread_linked_list(std::vector& tasks) +{ + if (this->tstate_addr == 0 || 
this->asyncio_loop == 0) { + return ErrorKind::TaskInfoError; + } + + // Calculate thread state's asyncio_tasks_head remote address + // Note: Since 3.13+, every PyThreadState is actually allocated as a _PyThreadStateImpl. + // We use PyThreadState* everywhere and cast to _PyThreadStateImpl* only when we need + // to access asyncio_tasks_head (which is only available in Python 3.14+). + // Since tstate_addr is a remote address, we calculate the offset and add it to the address. + // get_tasks_from_linked_list will handle copying the head node from remote memory internally. + constexpr size_t asyncio_tasks_head_offset = offsetof(_PyThreadStateImpl, asyncio_tasks_head); + uintptr_t head_addr = this->tstate_addr + asyncio_tasks_head_offset; + + return get_tasks_from_linked_list(head_addr, tasks); +} + +inline Result +ThreadInfo::get_tasks_from_interpreter_linked_list(PyThreadState* tstate, std::vector& tasks) +{ + if (tstate == nullptr || tstate->interp == nullptr || this->asyncio_loop == 0) { + return ErrorKind::TaskInfoError; + } + + constexpr size_t asyncio_tasks_head_offset = offsetof(PyInterpreterState, asyncio_tasks_head); + uintptr_t head_addr = reinterpret_cast(tstate->interp) + asyncio_tasks_head_offset; + + return get_tasks_from_linked_list(head_addr, tasks); +} + +inline Result +ThreadInfo::get_tasks_from_linked_list(uintptr_t head_addr, std::vector& tasks) +{ + if (head_addr == 0 || this->asyncio_loop == 0) { + return ErrorKind::TaskInfoError; + } + + // Copy head node struct from remote memory to local memory + struct llist_node head_node_local; + if (copy_type(reinterpret_cast(head_addr), head_node_local)) { + return ErrorKind::TaskInfoError; + } + + // Check if list is empty (head points to itself in circular list) + uintptr_t head_addr_uint = head_addr; + uintptr_t next_as_uint = reinterpret_cast(head_node_local.next); + uintptr_t prev_as_uint = reinterpret_cast(head_node_local.prev); + if (next_as_uint == head_addr_uint && prev_as_uint == 
head_addr_uint) { + return Result::ok(); + } + + struct llist_node current_node = head_node_local; // Start with head node + uintptr_t current_node_addr = head_addr; // Address of current node + + // Copied from CPython's _remote_debugging_module.c: MAX_ITERATIONS + const size_t MAX_ITERATIONS = 1 << 16; + size_t iteration_count = 0; + + // Iterate over linked-list. The linked list is circular, so we stop + // when we're back at head. + while (reinterpret_cast(current_node.next) != head_addr_uint) { + // Safety: prevent infinite loops + if (++iteration_count > MAX_ITERATIONS) { + return ErrorKind::TaskInfoError; + } + + if (current_node.next == nullptr) { + return ErrorKind::TaskInfoError; // nullptr pointer - invalid list + } + + uintptr_t next_node_addr = reinterpret_cast(current_node.next); + + // Calculate task_addr from current_node.next + size_t task_node_offset_val = offsetof(TaskObj, task_node); + uintptr_t task_addr_uint = next_node_addr - task_node_offset_val; + + // Create TaskInfo for the task + auto maybe_task_info = TaskInfo::create(reinterpret_cast(task_addr_uint)); + if (maybe_task_info) { + auto& task_info = *maybe_task_info; + if (task_info->loop == reinterpret_cast(this->asyncio_loop)) { + tasks.push_back(std::move(task_info)); + } + } + + // Read next node from current_node.next into current_node + if (copy_type(reinterpret_cast(next_node_addr), current_node)) { + return ErrorKind::TaskInfoError; // Failed to read next node + } + current_node_addr = next_node_addr; // Update address for next iteration + } + + return Result::ok(); +} + +inline Result> +ThreadInfo::get_all_tasks(PyThreadState* tstate) +{ + std::vector tasks; + if (this->asyncio_loop == 0) + return tasks; + + // Python 3.14+: Native tasks are in linked-list per thread AND per interpreter + // CPython iterates over both: + // 1. Per-thread list: tstate->asyncio_tasks_head (active tasks) + // 2. 
Per-interpreter list: interp->asyncio_tasks_head (lingering tasks) + // First, get tasks from this thread's linked-list (if tstate_addr is set) + // Note: We continue processing even if one source fails to maximize partial results + if (tstate != nullptr && this->tstate_addr != 0) { + (void)get_tasks_from_thread_linked_list(tasks); + + // Second, get tasks from interpreter's linked-list (lingering tasks) + (void)get_tasks_from_interpreter_linked_list(tstate, tasks); + } + + // Handle third-party tasks from Python _scheduled_tasks WeakSet + // In Python 3.14+, _scheduled_tasks is a Python-level weakref.WeakSet() that only contains + // tasks that don't inherit from asyncio.Task. Native asyncio.Task instances are stored + // in linked-lists (handled above) and are NOT added to _scheduled_tasks. + // This is typically empty in practice, but we handle it for completeness. + if (asyncio_scheduled_tasks != nullptr) { + if (auto maybe_scheduled_tasks_set = MirrorSet::create(asyncio_scheduled_tasks)) { + auto scheduled_tasks_set = std::move(*maybe_scheduled_tasks_set); + if (auto maybe_scheduled_tasks = scheduled_tasks_set.as_unordered_set()) { + auto scheduled_tasks = std::move(*maybe_scheduled_tasks); + for (auto task_addr : scheduled_tasks) { + // In WeakSet.data (set), elements are the Task objects themselves + auto maybe_task_info = TaskInfo::create(reinterpret_cast(task_addr)); + if (maybe_task_info && + (*maybe_task_info)->loop == reinterpret_cast(this->asyncio_loop)) { + tasks.push_back(std::move(*maybe_task_info)); + } + } + } + } + } + + if (asyncio_eager_tasks != NULL) { + auto maybe_eager_tasks_set = MirrorSet::create(asyncio_eager_tasks); + if (!maybe_eager_tasks_set) { + return ErrorKind::TaskInfoError; + } + + auto eager_tasks_set = std::move(*maybe_eager_tasks_set); + + auto maybe_eager_tasks = eager_tasks_set.as_unordered_set(); + if (!maybe_eager_tasks) { + return ErrorKind::TaskInfoError; + } + + auto eager_tasks = std::move(*maybe_eager_tasks); + for 
(auto task_addr : eager_tasks) { + auto maybe_task_info = TaskInfo::create(reinterpret_cast(task_addr)); + if (maybe_task_info) { + if ((*maybe_task_info)->loop == reinterpret_cast(this->asyncio_loop)) { + tasks.push_back(std::move(*maybe_task_info)); + } + } + } + } + + return tasks; +} +#else +// Pre-Python 3.14: get_all_tasks uses WeakSet approach +inline Result> +ThreadInfo::get_all_tasks(PyThreadState*) +{ + std::vector tasks; + if (this->asyncio_loop == 0) + return tasks; + + auto maybe_scheduled_tasks_set = MirrorSet::create(asyncio_scheduled_tasks); + if (!maybe_scheduled_tasks_set) { + return ErrorKind::TaskInfoError; + } + + auto scheduled_tasks_set = std::move(*maybe_scheduled_tasks_set); + auto maybe_scheduled_tasks = scheduled_tasks_set.as_unordered_set(); + if (!maybe_scheduled_tasks) { + return ErrorKind::TaskInfoError; + } + + auto scheduled_tasks = std::move(*maybe_scheduled_tasks); + for (auto task_wr_addr : scheduled_tasks) { + PyWeakReference task_wr; + if (copy_type(task_wr_addr, task_wr)) + continue; + + auto maybe_task_info = TaskInfo::create(reinterpret_cast(task_wr.wr_object)); + if (maybe_task_info) { + if ((*maybe_task_info)->loop == reinterpret_cast(this->asyncio_loop)) { + tasks.push_back(std::move(*maybe_task_info)); + } + } + } + + if (asyncio_eager_tasks != NULL) { + auto maybe_eager_tasks_set = MirrorSet::create(asyncio_eager_tasks); + if (!maybe_eager_tasks_set) { + return ErrorKind::TaskInfoError; + } + + auto eager_tasks_set = std::move(*maybe_eager_tasks_set); + + auto maybe_eager_tasks = eager_tasks_set.as_unordered_set(); + if (!maybe_eager_tasks) { + return ErrorKind::TaskInfoError; + } + + auto eager_tasks = std::move(*maybe_eager_tasks); + for (auto task_addr : eager_tasks) { + auto maybe_task_info = TaskInfo::create(reinterpret_cast(task_addr)); + if (maybe_task_info) { + if ((*maybe_task_info)->loop == reinterpret_cast(this->asyncio_loop)) { + tasks.push_back(std::move(*maybe_task_info)); + } + } + } + } + + return tasks; 
+} +#endif // PY_VERSION_HEX >= 0x030e0000 + // ---------------------------------------------------------------------------- inline void ThreadInfo::unwind_greenlets(PyThreadState* tstate, unsigned long cur_native_id) @@ -408,10 +633,8 @@ ThreadInfo::unwind_greenlets(PyThreadState* tstate, unsigned long cur_native_id) // ---------------------------------------------------------------------------- inline Result -ThreadInfo::sample(int64_t iid, ThreadStateType* tstate_ptr, microsecond_t delta) +ThreadInfo::sample(int64_t iid, PyThreadState* tstate, microsecond_t delta) { - PyThreadState* tstate = reinterpret_cast(tstate_ptr); - Renderer::get().render_thread_begin(tstate, name, delta, thread_id, native_id); microsecond_t previous_cpu_time = cpu_time; @@ -424,7 +647,7 @@ ThreadInfo::sample(int64_t iid, ThreadStateType* tstate_ptr, microsecond_t delta Renderer::get().render_cpu_time(thread_is_running ? cpu_time - previous_cpu_time : 0); - this->unwind(tstate_ptr); + this->unwind(tstate); // Render in this order of priority // 1. asyncio Tasks stacks (if any) @@ -484,14 +707,10 @@ ThreadInfo::sample(int64_t iid, ThreadStateType* tstate_ptr, microsecond_t delta } // ---------------------------------------------------------------------------- -#if PY_VERSION_HEX >= 0x030e0000 -using ThreadStateCallback = std::function; -#else -using ThreadStateCallback = std::function; -#endif +using PyThreadStateCallback = std::function; static void -for_each_thread(InterpreterInfo& interp, microsecond_t delta, ThreadStateCallback callback) +for_each_thread(InterpreterInfo& interp, microsecond_t delta, PyThreadStateCallback callback) { std::unordered_set threads; std::unordered_set seen_threads; @@ -512,32 +731,20 @@ for_each_thread(InterpreterInfo& interp, microsecond_t delta, ThreadStateCallbac // Since threads can be created and destroyed at any time, we make // a copy of the structure before trying to read its fields. 
-#if PY_VERSION_HEX >= 0x030e0000 - // For Python 3.14+, copy _PyThreadStateImpl (which contains PyThreadState as first field) - // so we can access asyncio_tasks_head directly without offset calculations. - ThreadStateType tstate_copy; - if (copy_type(reinterpret_cast(tstate_addr), tstate_copy)) - continue; - // Access PyThreadState fields via the first field of _PyThreadStateImpl - PyThreadState* tstate = reinterpret_cast(&tstate_copy); -#else - // Pre-Python 3.14: copy PyThreadState directly - ThreadStateType tstate_copy; - if (copy_type(tstate_addr, tstate_copy)) + PyThreadState tstate; + if (copy_type(tstate_addr, tstate)) continue; - PyThreadState* tstate = &tstate_copy; -#endif // Enqueue the unseen threads that we can reach from this thread. - if (tstate->next != NULL && seen_threads.find(tstate->next) == seen_threads.end()) - threads.insert(tstate->next); - if (tstate->prev != NULL && seen_threads.find(tstate->prev) == seen_threads.end()) - threads.insert(tstate->prev); + if (tstate.next != NULL && seen_threads.find(tstate.next) == seen_threads.end()) + threads.insert(tstate.next); + if (tstate.prev != NULL && seen_threads.find(tstate.prev) == seen_threads.end()) + threads.insert(tstate.prev); { const std::lock_guard guard(thread_info_map_lock); - if (thread_info_map.find(tstate->thread_id) == thread_info_map.end()) { + if (thread_info_map.find(tstate.thread_id) == thread_info_map.end()) { // If the threading module was not imported in the target then // we mistakenly take the hypno thread as the main thread. We // assume that any missing thread is the actual main thread, @@ -545,7 +752,7 @@ for_each_thread(InterpreterInfo& interp, microsecond_t delta, ThreadStateCallbac // "MainThread". Note that this can also happen on shutdown, so // we need to avoid doing anything in that case. 
#if PY_VERSION_HEX >= 0x030b0000 - auto native_id = tstate->native_thread_id; + auto native_id = tstate.native_thread_id; #else auto native_id = getpid(); #endif @@ -559,7 +766,7 @@ for_each_thread(InterpreterInfo& interp, microsecond_t delta, ThreadStateCallbac if (main_thread_tracked) continue; - auto maybe_thread_info = ThreadInfo::create(tstate->thread_id, native_id, "MainThread"); + auto maybe_thread_info = ThreadInfo::create(tstate.thread_id, native_id, "MainThread"); if (!maybe_thread_info) { // We failed to create the thread info object so we skip it. // We'll likely try again later with the valid thread @@ -567,11 +774,17 @@ for_each_thread(InterpreterInfo& interp, microsecond_t delta, ThreadStateCallbac continue; } - thread_info_map.emplace(tstate->thread_id, std::move(*maybe_thread_info)); + thread_info_map.emplace(tstate.thread_id, std::move(*maybe_thread_info)); } + // Update the tstate_addr for thread info, so we can access + // asyncio_tasks_head field from `_PyThreadStateImpl` struct + // later when we unwind tasks. 
+ auto thread_info = thread_info_map.find(tstate.thread_id)->second.get(); + thread_info->tstate_addr = reinterpret_cast(tstate_addr); + // Call back with the copied thread state - callback(&tstate_copy, delta, *thread_info_map.find(tstate->thread_id)->second); + callback(&tstate, delta, *thread_info); } } } diff --git a/ddtrace/internal/datadog/profiling/stack_v2/src/sampler.cpp b/ddtrace/internal/datadog/profiling/stack_v2/src/sampler.cpp index d96c3fb7d94..094e72c5d05 100644 --- a/ddtrace/internal/datadog/profiling/stack_v2/src/sampler.cpp +++ b/ddtrace/internal/datadog/profiling/stack_v2/src/sampler.cpp @@ -167,14 +167,12 @@ Sampler::sampling_thread(const uint64_t seq_num) // Perform the sample for_each_interp([&](InterpreterInfo& interp) -> void { - // Use ThreadStateType typedef which is _PyThreadStateImpl* for 3.14+ and PyThreadState* for pre-3.14 - for_each_thread( - interp, wall_time_us, [&](ThreadStateType* tstate, microsecond_t delta, ThreadInfo& thread) { - auto success = thread.sample(interp.id, tstate, delta); - if (success) { - ddup_increment_sample_count(); - } - }); + for_each_thread(interp, wall_time_us, [&](PyThreadState* tstate, microsecond_t delta, ThreadInfo& thread) { + auto success = thread.sample(interp.id, tstate, delta); + if (success) { + ddup_increment_sample_count(); + } + }); }); ddup_increment_sampling_event_count(); From 8429a249a7b98ec7c4cb373455ccb5f0d6cbe2d3 Mon Sep 17 00:00:00 2001 From: Taegyun Kim Date: Wed, 10 Dec 2025 17:33:05 -0500 Subject: [PATCH 37/43] Would this work with the previous commit? 
--- .../profiling/collector/test_asyncio_as_completed.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/tests/profiling/collector/test_asyncio_as_completed.py b/tests/profiling/collector/test_asyncio_as_completed.py index 0dc071d8bfd..67d237f4387 100644 --- a/tests/profiling/collector/test_asyncio_as_completed.py +++ b/tests/profiling/collector/test_asyncio_as_completed.py @@ -32,15 +32,10 @@ async def wait_and_return_delay(t: float) -> float: async def main() -> None: # Create a mix of Tasks and Coroutines - divisor = 10 if PYVERSION < (3, 14) else 3 - # For Python 3.14+, we increase the sleep time to get all the samples - # as expected. It's likely because the CPython 3.14+ keeps track of - # the tasks in a linked list, and each node needs to be copied using - # a system call. futures = [ - asyncio.create_task(wait_and_return_delay(float(i) / divisor)) + asyncio.create_task(wait_and_return_delay(float(i) / 10)) if i % 2 == 0 - else wait_and_return_delay(float(i) / divisor) + else wait_and_return_delay(float(i) / 10) for i in range(2, 12) ] assert len(futures) == 10 @@ -95,7 +90,7 @@ async def main() -> None: pprof_utils.StackLocation( function_name="main", filename="test_asyncio_as_completed.py", - line_no=main.__code__.co_firstlineno + 22, + line_no=main.__code__.co_firstlineno + 17, ), ] From 1cf23bb9619bf15c16f485f7eb8b01f5618ff7db Mon Sep 17 00:00:00 2001 From: Taegyun Kim Date: Thu, 11 Dec 2025 11:32:59 -0500 Subject: [PATCH 38/43] somehow this is now fixed? 
--- tests/profiling/collector/test_generators.py | 83 ++++++-------------- 1 file changed, 26 insertions(+), 57 deletions(-) diff --git a/tests/profiling/collector/test_generators.py b/tests/profiling/collector/test_generators.py index aa4786997b7..4588c9c1780 100644 --- a/tests/profiling/collector/test_generators.py +++ b/tests/profiling/collector/test_generators.py @@ -10,7 +10,6 @@ # For macOS: err=None ignores expected stderr from tracer failing to connect to agent (not relevant to this test) def test_generators_stacks() -> None: import os - import sys import time from typing import Generator @@ -50,59 +49,29 @@ def my_function() -> int: samples = list(profile.sample) assert len(samples) > 0 - # In Python 3.14+, generator frames intentionally break the frame chain by setting - # previous = NULL to prevent dangling pointers. This means we cannot unwind from - # generator frames back to their callers. See docs/python-3.14-generator-frame-limitation.md - # for details. - # - # Expected behavior: - # - Python < 3.14: my_function -> generator -> generator2 (full stack trace) - # - Python >= 3.14: generator -> generator2 (cannot unwind to my_function) - if sys.version_info >= (3, 14): - # Python 3.14+: Generator frames have previous = NULL, so we can only unwind - # generator -> generator2, but not generator -> my_function - pprof_utils.assert_profile_has_sample( - profile, - samples, - expected_sample=pprof_utils.StackEvent( - thread_name="MainThread", - locations=[ - pprof_utils.StackLocation( - function_name="generator2", - filename="test_generators.py", - line_no=generator2.__code__.co_firstlineno + 1, - ), - pprof_utils.StackLocation( - function_name="generator", - filename="test_generators.py", - line_no=generator.__code__.co_firstlineno + 1, - ), - ], - ), - ) - else: - # Python < 3.14: Full stack trace should be available - pprof_utils.assert_profile_has_sample( - profile, - samples, - expected_sample=pprof_utils.StackEvent( - thread_name="MainThread", - 
locations=[ - pprof_utils.StackLocation( - function_name="generator2", - filename="test_generators.py", - line_no=generator2.__code__.co_firstlineno + 1, - ), - pprof_utils.StackLocation( - function_name="generator", - filename="test_generators.py", - line_no=generator.__code__.co_firstlineno + 1, - ), - pprof_utils.StackLocation( - function_name="my_function", - filename="test_generators.py", - line_no=my_function.__code__.co_firstlineno + 2, - ), - ], - ), - ) + # Test that we have samples with the expected stack trace + # Main Thread should have: my_function -> generator -> generator2 + pprof_utils.assert_profile_has_sample( + profile, + samples, + expected_sample=pprof_utils.StackEvent( + thread_name="MainThread", + locations=[ + pprof_utils.StackLocation( + function_name="generator2", + filename="test_generators.py", + line_no=generator2.__code__.co_firstlineno + 1, + ), + pprof_utils.StackLocation( + function_name="generator", + filename="test_generators.py", + line_no=generator.__code__.co_firstlineno + 1, + ), + pprof_utils.StackLocation( + function_name="my_function", + filename="test_generators.py", + line_no=my_function.__code__.co_firstlineno + 2, + ), + ], + ), + ) From e61b878a539beee7b8e48667e56c64d309c156c7 Mon Sep 17 00:00:00 2001 From: Taegyun Kim Date: Thu, 11 Dec 2025 11:33:15 -0500 Subject: [PATCH 39/43] simplify includes --- .../datadog/profiling/stack_v2/echion/echion/tasks.h | 6 ------ 1 file changed, 6 deletions(-) diff --git a/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/tasks.h b/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/tasks.h index 9b8d3e7f99d..e3fa35ac120 100644 --- a/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/tasks.h +++ b/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/tasks.h @@ -4,12 +4,9 @@ #pragma once -#include - #define PY_SSIZE_T_CLEAN #include #include -#include #if PY_VERSION_HEX >= 0x030b0000 #include @@ -18,9 +15,6 @@ #include #if PY_VERSION_HEX >= 0x030e0000 #include 
-#include -#include -#include #include #elif PY_VERSION_HEX >= 0x030d0000 #include From 9eadc943f8cc760b21066f41450f72f8c327e9b0 Mon Sep 17 00:00:00 2001 From: Taegyun Kim Date: Thu, 11 Dec 2025 11:58:09 -0500 Subject: [PATCH 40/43] revert unnecessary changes --- .../datadog/profiling/stack_v2/echion/echion/threads.h | 6 +++--- ddtrace/internal/datadog/profiling/stack_v2/src/sampler.cpp | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/threads.h b/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/threads.h index d66eed8c7f2..9f4de0e7722 100644 --- a/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/threads.h +++ b/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/threads.h @@ -734,10 +734,10 @@ ThreadInfo::sample(int64_t iid, PyThreadState* tstate, microsecond_t delta) } // ---------------------------------------------------------------------------- -using PyThreadStateCallback = std::function; +using PyThreadStateCallback = std::function; static void -for_each_thread(InterpreterInfo& interp, microsecond_t delta, PyThreadStateCallback callback) +for_each_thread(InterpreterInfo& interp, PyThreadStateCallback callback) { std::unordered_set threads; std::unordered_set seen_threads; @@ -811,7 +811,7 @@ for_each_thread(InterpreterInfo& interp, microsecond_t delta, PyThreadStateCallb thread_info->tstate_addr = reinterpret_cast(tstate_addr); // Call back with the copied thread state - callback(&tstate, delta, *thread_info); + callback(&tstate, *thread_info); } } } diff --git a/ddtrace/internal/datadog/profiling/stack_v2/src/sampler.cpp b/ddtrace/internal/datadog/profiling/stack_v2/src/sampler.cpp index 094e72c5d05..70797ebfa8f 100644 --- a/ddtrace/internal/datadog/profiling/stack_v2/src/sampler.cpp +++ b/ddtrace/internal/datadog/profiling/stack_v2/src/sampler.cpp @@ -167,8 +167,8 @@ Sampler::sampling_thread(const uint64_t seq_num) // Perform the sample 
for_each_interp([&](InterpreterInfo& interp) -> void { - for_each_thread(interp, wall_time_us, [&](PyThreadState* tstate, microsecond_t delta, ThreadInfo& thread) { - auto success = thread.sample(interp.id, tstate, delta); + for_each_thread(interp, [&](PyThreadState* tstate, ThreadInfo& thread) { + auto success = thread.sample(interp.id, tstate, wall_time_us); if (success) { ddup_increment_sample_count(); } From 76eabaa236f2a9d57be988490c83f677b69838b8 Mon Sep 17 00:00:00 2001 From: Taegyun Kim Date: Thu, 11 Dec 2025 12:00:43 -0500 Subject: [PATCH 41/43] remove unnecessary comments --- .../datadog/profiling/stack_v2/echion/echion/cpython/tasks.h | 1 - .../internal/datadog/profiling/stack_v2/echion/echion/frame.h | 2 -- .../datadog/profiling/stack_v2/echion/echion/greenlets.h | 1 - .../internal/datadog/profiling/stack_v2/echion/echion/state.h | 1 - ddtrace/internal/datadog/profiling/stack_v2/src/echion/frame.cc | 2 -- 5 files changed, 7 deletions(-) diff --git a/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/cpython/tasks.h b/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/cpython/tasks.h index c5dbf088181..2d88e67821b 100644 --- a/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/cpython/tasks.h +++ b/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/cpython/tasks.h @@ -12,7 +12,6 @@ #define Py_BUILD_CORE #if PY_VERSION_HEX >= 0x030e0000 -// Python 3.14+: _PyInterpreterFrame moved to new header #include #include #include diff --git a/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/frame.h b/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/frame.h index 1bcab6c5cd9..1092d15d52f 100644 --- a/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/frame.h +++ b/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/frame.h @@ -15,11 +15,9 @@ #endif #include #if PY_VERSION_HEX >= 0x030e0000 -// Python 3.14+: _PyInterpreterFrame moved to new header #define Py_BUILD_CORE #include #elif PY_VERSION_HEX >= 
0x030b0000 -// Python 3.11-3.13: _PyInterpreterFrame is in pycore_frame.h #define Py_BUILD_CORE #include #endif diff --git a/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/greenlets.h b/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/greenlets.h index 5448361f9c8..4aba2d9961f 100644 --- a/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/greenlets.h +++ b/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/greenlets.h @@ -8,7 +8,6 @@ #define Py_BUILD_CORE #if PY_VERSION_HEX >= 0x030e0000 -// Python 3.14+: Need internal/pycore_frame.h for struct _frame (PyFrameObject) definition #include #endif diff --git a/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/state.h b/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/state.h index 90ae634e540..bc033bb8e66 100644 --- a/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/state.h +++ b/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/state.h @@ -18,7 +18,6 @@ #define Py_BUILD_CORE #include #if PY_VERSION_HEX >= 0x030e0000 -// Python 3.14+: _PyRuntime is declared in pycore_runtime.h #include #endif diff --git a/ddtrace/internal/datadog/profiling/stack_v2/src/echion/frame.cc b/ddtrace/internal/datadog/profiling/stack_v2/src/echion/frame.cc index d5e1d512832..582acc6ce82 100644 --- a/ddtrace/internal/datadog/profiling/stack_v2/src/echion/frame.cc +++ b/ddtrace/internal/datadog/profiling/stack_v2/src/echion/frame.cc @@ -4,13 +4,11 @@ #include #if PY_VERSION_HEX >= 0x030b0000 -// Common headers needed for Python 3.11+ implementation #include #include #include #if PY_VERSION_HEX >= 0x030e0000 -// Python 3.14+: Additional headers for new structure definitions #include #include #endif // PY_VERSION_HEX >= 0x030e0000 From c49290f643e3c7445b22c6be7f93f687b257cc39 Mon Sep 17 00:00:00 2001 From: Taegyun Kim Date: Thu, 11 Dec 2025 13:03:29 -0500 Subject: [PATCH 42/43] keep the comment --- .../internal/datadog/profiling/stack_v2/echion/echion/threads.h | 1 + 1 file 
changed, 1 insertion(+) diff --git a/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/threads.h b/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/threads.h index 9f4de0e7722..4b1ba9da22c 100644 --- a/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/threads.h +++ b/ddtrace/internal/datadog/profiling/stack_v2/echion/echion/threads.h @@ -760,6 +760,7 @@ for_each_thread(InterpreterInfo& interp, PyThreadStateCallback callback) // a copy of the structure before trying to read its fields. PyThreadState tstate; if (copy_type(tstate_addr, tstate)) + // We failed to copy the thread so we skip it. continue; // Enqueue the unseen threads that we can reach from this thread. From 870e787ec340344f240721ee761923d7eacfa942 Mon Sep 17 00:00:00 2001 From: Taegyun Kim Date: Fri, 12 Dec 2025 05:15:08 +0900 Subject: [PATCH 43/43] Update riotfile.py Co-authored-by: Brett Langdon --- riotfile.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/riotfile.py b/riotfile.py index 4e98e82cc69..12a58b9efe3 100644 --- a/riotfile.py +++ b/riotfile.py @@ -3433,7 +3433,7 @@ def select_pys(min_version: str = MIN_PYTHON_VERSION, max_version: str = MAX_PYT Venv( name="profile-memalloc", command="python -m tests.profiling.run pytest -v --no-cov --capture=no --benchmark-disable {cmdargs} tests/profiling/collector/test_memalloc.py", # noqa: E501 - pys=select_pys(max_version="3.14"), + pys=select_pys(), pkgs={ "protobuf": latest, },