
Commit 44b50e2

threads: Implement asymmetric atomic fences
Asymmetric atomic fences are a performance optimization of regular atomic fences (the seq_cst version of which we expose as `Base.Threads.atomic_fence`). The problem with these regular fences is that they require a CPU fence instruction, which can be very expensive and is thus unsuitable for code on the hot path. Asymmetric fences instead split an ordinary fence into two halves: a `light` side where the fence is extremely cheap (only a compiler reordering barrier) and a `heavy` side where the fence is very expensive. The way it works is that the heavy side makes a system call that issues an inter-processor interrupt (IPI), which then executes the appropriate barrier instruction on the other CPU (i.e. both CPUs will have issued a barrier instruction; one of them just does it asynchronously, from the interrupt). The `light` and `heavy` naming here is taken from C++ paper P1202R5 [1], which is the proposal for the same feature in the C++ standard library (to appear in the next iteration of the C++ concurrency spec).

On the Julia side, these functions are exposed as `Threads.atomic_fence_light` and `Threads.atomic_fence_heavy`. The light side lowers to `fence syncscope("singlethread")` in LLVM IR (the `Core.Intrinsics.atomic_fence` intrinsic is adjusted appropriately to facilitate this). The heavy side has OS-specific implementations:

1. Linux/FreeBSD try to use the `membarrier` syscall, with an `mprotect`-based fallback for systems that don't have it.
2. Windows uses the `FlushProcessWriteBuffers` API.
3. macOS uses an implementation from the dotnet runtime (dotnet/runtime#44670), which the dotnet folks have confirmed with Apple does the right thing by happenstance (i.e. an IPI/memory barrier is needed to execute the syscall), even though it looks a little nonsensical by itself. However, since it's what Apple recommended to dotnet, I don't see much risk here, though I wouldn't be surprised if Apple added a proper syscall for this in the future (since FreeBSD has one now).

Note that unlike the C++ spec, I have specified that `atomic_fence_heavy` does synchronize with `atomic_fence`. This matches the underlying system call. I suspect C++ chose to omit this guarantee for a hypothetical future architecture that has instruction support for doing this from userspace, which would then not synchronize with ordinary barriers, but I think I would rather cross that bridge when we get there.

I intend to use this in #60281, but it's an independently useful feature.

[1] https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2022/p1202r5.pdf
1 parent 6c75e91 commit 44b50e2
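To make the intended usage concrete, here is a minimal sketch (not part of this commit; `Flags`, `hot_path`, and `cold_path` are hypothetical names) of the Dekker-style pattern that asymmetric fences accelerate: the hot side pairs relaxed atomics with the cheap light fence, while the rare cold side pays for the heavy fence:

```julia
# Illustrative only: a Dekker-style handshake between a hot and a cold path,
# assuming a build that contains this commit.
mutable struct Flags
    @atomic hot::Int
    @atomic cold::Int
end
const flags = Flags(0, 0)

function hot_path()
    @atomic :monotonic flags.hot = 1
    # Compiler reordering barrier only; no fence instruction is emitted.
    Threads.atomic_fence_light()
    return @atomic :monotonic flags.cold
end

function cold_path()
    @atomic :monotonic flags.cold = 1
    # Syscall + IPI; synchronizes with light fences on other threads.
    Threads.atomic_fence_heavy()
    return @atomic :monotonic flags.hot
end

# If hot_path() and cold_path() run concurrently, at least one of them
# observes the other's write, just as if both had used full fences.
```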

14 files changed: +290 -16 lines changed

Compiler/src/tfuncs.jl

Lines changed: 2 additions & 2 deletions
```diff
@@ -716,7 +716,7 @@ end
 @nospecs function pointerset_tfunc(𝕃::AbstractLattice, a, v, i, align)
     return a
 end
-@nospecs function atomic_fence_tfunc(𝕃::AbstractLattice, order)
+@nospecs function atomic_fence_tfunc(𝕃::AbstractLattice, order, syncscope)
     return Nothing
 end
 @nospecs function atomic_pointerref_tfunc(𝕃::AbstractLattice, a, order)
@@ -757,7 +757,7 @@ add_tfunc(add_ptr, 2, 2, pointerarith_tfunc, 1)
 add_tfunc(sub_ptr, 2, 2, pointerarith_tfunc, 1)
 add_tfunc(pointerref, 3, 3, pointerref_tfunc, 4)
 add_tfunc(pointerset, 4, 4, pointerset_tfunc, 5)
-add_tfunc(atomic_fence, 1, 1, atomic_fence_tfunc, 4)
+add_tfunc(atomic_fence, 2, 2, atomic_fence_tfunc, 4)
 add_tfunc(atomic_pointerref, 2, 2, atomic_pointerref_tfunc, 4)
 add_tfunc(atomic_pointerset, 3, 3, atomic_pointerset_tfunc, 5)
 add_tfunc(atomic_pointerswap, 3, 3, atomic_pointerswap_tfunc, 5)
```

base/asyncevent.jl

Lines changed: 1 addition & 1 deletion
```diff
@@ -165,7 +165,7 @@ function _trywait(t::Union{Timer, AsyncCondition})
     set = t.set
     if set
         # full barrier now for AsyncCondition
-        t isa Timer || Core.Intrinsics.atomic_fence(:acquire_release)
+        t isa Timer || Core.Intrinsics.atomic_fence(:acquire_release, :system)
     else
         if !isopen(t)
             set = t.set
```

base/atomics.jl

Lines changed: 25 additions & 1 deletion
```diff
@@ -329,4 +329,28 @@ fences should not be necessary in most cases.
 
 For further details, see LLVM's `fence` instruction.
 """
-atomic_fence() = Core.Intrinsics.atomic_fence(:sequentially_consistent)
+atomic_fence() = Core.Intrinsics.atomic_fence(:sequentially_consistent, :system)
+
+"""
+    Threads.atomic_fence_light()
+
+This is a read-optimized sequential-consistency memory fence.
+On supported operating systems and architectures, this fence is cheaper
+than `Threads.atomic_fence()`, but synchronizes only with
+[`atomic_fence_heavy`](@ref) calls from other threads.
+"""
+atomic_fence_light() = Core.Intrinsics.atomic_fence(:sequentially_consistent, :singlethread)
+
+"""
+    Threads.atomic_fence_heavy()
+
+This is a write-optimized sequential-consistency memory fence.
+This fence is significantly more expensive than `Threads.atomic_fence`.
+It generally requires a system call and a full interprocessor interrupt
+to all other processors in the system. It synchronizes with both
+[`atomic_fence_light`](@ref) and [`atomic_fence`](@ref) calls from other threads.
+
+For further details, see the Linux `membarrier` syscall or the Windows
+`FlushProcessWriteBuffers` API.
+"""
+atomic_fence_heavy() = ccall(:jl_membarrier, Cvoid, ())
```
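A quick, hedged way to check the light fence's lowering in a build containing this commit (exact IR output will vary by version and platform):

```julia
using InteractiveUtils  # for @code_llvm

# The printed IR should contain a singlethread-scoped fence, e.g.
#   fence syncscope("singlethread") seq_cst
# rather than a machine fence instruction.
@code_llvm Threads.atomic_fence_light()
```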

src/ast.c

Lines changed: 2 additions & 0 deletions
```diff
@@ -319,6 +319,8 @@ void jl_init_common_symbols(void)
     jl_atomic_sym = jl_symbol("atomic");
     jl_not_atomic_sym = jl_symbol("not_atomic");
     jl_unordered_sym = jl_symbol("unordered");
+    jl_singlethread_sym = jl_symbol("singlethread");
+    jl_system_sym = jl_symbol("system");
     jl_monotonic_sym = jl_symbol("monotonic");
     jl_acquire_sym = jl_symbol("acquire");
     jl_release_sym = jl_symbol("release");
```

src/intrinsics.cpp

Lines changed: 12 additions & 3 deletions
```diff
@@ -915,17 +915,26 @@ static jl_cgval_t emit_pointerarith(jl_codectx_t &ctx, intrinsic f,
 static jl_cgval_t emit_atomicfence(jl_codectx_t &ctx, ArrayRef<jl_cgval_t> argv)
 {
     const jl_cgval_t &ord = argv[0];
+    const jl_cgval_t &ssid_arg = argv[1];
+    llvm::SyncScope::ID ssid = llvm::SyncScope::System;
+    if (!ssid_arg.constant || !jl_is_symbol(ssid_arg.constant) ||
+        ((jl_sym_t*)ssid_arg.constant != jl_singlethread_sym &&
+         (jl_sym_t*)ssid_arg.constant != jl_system_sym)) {
+        return emit_runtime_call(ctx, atomic_fence, argv, 2);
+    }
+    if ((jl_sym_t*)ssid_arg.constant == jl_singlethread_sym)
+        ssid = llvm::SyncScope::SingleThread;
     if (ord.constant && jl_is_symbol(ord.constant)) {
         enum jl_memory_order order = jl_get_atomic_order((jl_sym_t*)ord.constant, true, true);
         if (order == jl_memory_order_invalid) {
             emit_atomic_error(ctx, "invalid atomic ordering");
             return jl_cgval_t(); // unreachable
         }
         if (order > jl_memory_order_monotonic)
-            ctx.builder.CreateFence(get_llvm_atomic_order(order));
+            ctx.builder.CreateFence(get_llvm_atomic_order(order), ssid);
         return ghostValue(ctx, jl_nothing_type);
     }
-    return emit_runtime_call(ctx, atomic_fence, argv, 1);
+    return emit_runtime_call(ctx, atomic_fence, argv, 2);
 }
 
 static jl_cgval_t emit_atomic_pointerref(jl_codectx_t &ctx, ArrayRef<jl_cgval_t> argv)
@@ -1339,7 +1348,7 @@ static jl_cgval_t emit_intrinsic(jl_codectx_t &ctx, intrinsic f, jl_value_t **ar
 
     case atomic_fence:
         ++Emitted_atomic_fence;
-        assert(nargs == 1);
+        assert(nargs == 2);
         return emit_atomicfence(ctx, argv);
     case atomic_pointerref:
         ++Emitted_atomic_pointerref;
```
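One consequence of the constant check in `emit_atomicfence` above, sketched here for illustration (assuming a build with this commit): only a compile-time-constant syncscope symbol takes the inline LLVM `fence` path; anything else is routed through the runtime call:

```julia
# Constant syncscope: handled entirely in codegen, emits an LLVM `fence`.
Core.Intrinsics.atomic_fence(:sequentially_consistent, :system)

# Non-constant syncscope: codegen cannot see the symbol's value, so this
# lowers to a call to the jl_atomic_fence runtime fallback instead.
scope = rand(Bool) ? :system : :singlethread
Core.Intrinsics.atomic_fence(:sequentially_consistent, scope)
```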

src/intrinsics.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,7 @@
9595
ADD_I(pointerref, 3) \
9696
ADD_I(pointerset, 4) \
9797
/* pointer atomics */ \
98-
ADD_I(atomic_fence, 1) \
98+
ADD_I(atomic_fence, 2) \
9999
ADD_I(atomic_pointerref, 2) \
100100
ADD_I(atomic_pointerset, 3) \
101101
ADD_I(atomic_pointerswap, 3) \

src/jl_exported_funcs.inc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -506,6 +506,7 @@
506506
XX(jl_vprintf) \
507507
XX(jl_wakeup_thread) \
508508
XX(jl_write_compiler_output) \
509+
XX(jl_membarrier) \
509510

510511
#define JL_RUNTIME_EXPORTED_FUNCS_WIN(XX) \
511512
XX(jl_setjmp) \

src/julia_internal.h

Lines changed: 3 additions & 1 deletion
```diff
@@ -1689,7 +1689,7 @@ STATIC_INLINE int is_valid_intrinsic_elptr(jl_value_t *ety)
 JL_DLLEXPORT jl_value_t *jl_bitcast(jl_value_t *ty, jl_value_t *v);
 JL_DLLEXPORT jl_value_t *jl_pointerref(jl_value_t *p, jl_value_t *i, jl_value_t *align);
 JL_DLLEXPORT jl_value_t *jl_pointerset(jl_value_t *p, jl_value_t *x, jl_value_t *align, jl_value_t *i);
-JL_DLLEXPORT jl_value_t *jl_atomic_fence(jl_value_t *order);
+JL_DLLEXPORT jl_value_t *jl_atomic_fence(jl_value_t *order, jl_value_t *syncscope);
 JL_DLLEXPORT jl_value_t *jl_atomic_pointerref(jl_value_t *p, jl_value_t *order);
 JL_DLLEXPORT jl_value_t *jl_atomic_pointerset(jl_value_t *p, jl_value_t *x, jl_value_t *order);
 JL_DLLEXPORT jl_value_t *jl_atomic_pointerswap(jl_value_t *p, jl_value_t *x, jl_value_t *order);
@@ -2010,6 +2010,8 @@ JL_DLLEXPORT int jl_isabspath(const char *in) JL_NOTSAFEPOINT;
     XX(uninferred_sym) \
     XX(unordered_sym) \
     XX(unused_sym) \
+    XX(singlethread_sym) \
+    XX(system_sym)
 
 #define XX(name) extern JL_DLLEXPORT jl_sym_t *jl_##name;
 JL_COMMON_SYMBOLS(XX)
```

src/runtime_intrinsics.c

Lines changed: 8 additions & 1 deletion
```diff
@@ -622,9 +622,16 @@ JL_DLLEXPORT jl_value_t *jl_atomic_pointerreplace(jl_value_t *p, jl_value_t *exp
     return result;
 }
 
-JL_DLLEXPORT jl_value_t *jl_atomic_fence(jl_value_t *order_sym)
+JL_DLLEXPORT jl_value_t *jl_atomic_fence(jl_value_t *order_sym, jl_value_t *syncscope_sym)
 {
     JL_TYPECHK(fence, symbol, order_sym);
+    JL_TYPECHK(fence, symbol, syncscope_sym);
+    if ((jl_sym_t*)syncscope_sym == jl_singlethread_sym) {
+        asm volatile ("" : : : "memory");
+        return jl_nothing;
+    } else if ((jl_sym_t*)syncscope_sym != jl_system_sym) {
+        jl_error("atomic_fence: invalid syncscope");
+    }
     enum jl_memory_order order = jl_get_atomic_order_checked((jl_sym_t*)order_sym, 1, 1);
     if (order > jl_memory_order_monotonic)
         jl_fence();
```
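Correspondingly, the runtime fallback above makes this behavior observable from Julia. A hedged sketch, assuming a build with this commit:

```julia
# A :singlethread fence at runtime reduces to the empty-asm compiler barrier.
Core.Intrinsics.atomic_fence(:sequentially_consistent, :singlethread)

# Any symbol other than :singlethread or :system reaches jl_atomic_fence
# via the codegen fallback and throws.
try
    Core.Intrinsics.atomic_fence(:sequentially_consistent, :bogus)
catch err
    @assert occursin("invalid syncscope", sprint(showerror, err))
end
```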

src/signals-mach.c

Lines changed: 52 additions & 0 deletions
```diff
@@ -5,6 +5,7 @@
 #include <mach/clock.h>
 #include <mach/clock_types.h>
 #include <mach/clock_reply.h>
+#include <mach/thread_state.h>
 #include <mach/mach_traps.h>
 #include <mach/task.h>
 #include <mach/mig_errors.h>
@@ -891,3 +892,54 @@ JL_DLLEXPORT void jl_profile_stop_timer(void)
     profile_all_tasks = 0;
     uv_mutex_unlock(&bt_data_prof_lock);
 }
+
+// The mprotect implementation in signals-unix.c does not work on macOS/aarch64, as mentioned.
+// This implementation comes from dotnet, but is similarly dependent on undocumented behavior of the OS.
+// Copyright (c) .NET Foundation and Contributors
+// MIT LICENSE
+JL_DLLEXPORT void jl_membarrier(void) {
+    mach_msg_type_number_t cThreads;
+    thread_act_t *pThreads;
+    kern_return_t machret = task_threads(mach_task_self(), &pThreads, &cThreads);
+    HANDLE_MACH_ERROR("task_threads()", machret);
+
+    uintptr_t sp;
+    uintptr_t registerValues[128];
+
+    // Iterate through each of the threads in the list.
+    for (mach_msg_type_number_t i = 0; i < cThreads; i++)
+    {
+        if (__builtin_available (macOS 10.14, iOS 12, tvOS 9, *))
+        {
+            // Request the threads pointer values to force the thread to emit a memory barrier
+            size_t registers = 128;
+            machret = thread_get_register_pointer_values(pThreads[i], &sp, &registers, registerValues);
+        }
+        else
+        {
+            // fallback implementation for older OS versions
+#if defined(_CPU_X86_64_)
+            x86_thread_state64_t threadState;
+            mach_msg_type_number_t count = x86_THREAD_STATE64_COUNT;
+            machret = thread_get_state(pThreads[i], x86_THREAD_STATE64, (thread_state_t)&threadState, &count);
+#elif defined(_CPU_AARCH64_)
+            arm_thread_state64_t threadState;
+            mach_msg_type_number_t count = ARM_THREAD_STATE64_COUNT;
+            machret = thread_get_state(pThreads[i], ARM_THREAD_STATE64, (thread_state_t)&threadState, &count);
+#else
+#error Unexpected architecture
+#endif
+        }
+
+        if (machret == KERN_INSUFFICIENT_BUFFER_SIZE)
+        {
+            HANDLE_MACH_ERROR("thread_get_register_pointer_values()", machret);
+        }
+
+        machret = mach_port_deallocate(mach_task_self(), pThreads[i]);
+        HANDLE_MACH_ERROR("mach_port_deallocate()", machret);
+    }
+    // Deallocate the thread list now we're done with it.
+    machret = vm_deallocate(mach_task_self(), (vm_address_t)pThreads, cThreads * sizeof(thread_act_t));
+    HANDLE_MACH_ERROR("vm_deallocate()", machret);
+}
```
