
Commit 44b50e2

threads: Implement asymmetric atomic fences
Asymmetric atomic fences are a performance optimization of regular atomic fences (the seq_cst version of which we expose as `Base.Threads.atomic_fence`). The problem with these regular fences is that they require a CPU fence instruction, which can be very expensive and is thus unsuitable for code on the hot path. Asymmetric fences instead split an ordinary fence into two halves: a `light` side where the fence is extremely cheap (only a compiler reordering barrier) and a `heavy` side where the fence is very expensive. The way it works is that the heavy side makes a system call that issues an inter-processor interrupt (IPI), which then executes the appropriate barrier instruction on the other CPU (i.e. both CPUs will have issued a barrier instruction; one of them just does it asynchronously, from the interrupt). The `light` and `heavy` naming here is taken from C++ paper P1202R5 [1], which is the proposal for the same feature in the C++ standard library (to appear in the next iteration of the C++ concurrency spec).

On the Julia side, these functions are exposed as `Threads.atomic_fence_light` and `Threads.atomic_fence_heavy`. The light side lowers to `fence syncscope("singlethread")` in LLVM IR (the `Core.Intrinsics.atomic_fence` intrinsic is adjusted appropriately to facilitate this). The heavy side has OS-specific implementations:

1. Linux/FreeBSD try to use the `membarrier` syscall, with an `mprotect`-based fallback for systems that don't have it.
2. Windows uses the `FlushProcessWriteBuffers` API.
3. macOS uses an implementation from the dotnet runtime (dotnet/runtime#44670), which the dotnet folks have confirmed with Apple does the right thing by happenstance (i.e. an IPI/memory barrier is needed to execute the syscall), even though it looks a little nonsensical by itself. However, since it's what Apple recommended to dotnet, I don't see much risk here, though I wouldn't be surprised if Apple added a proper syscall for this in the future (since FreeBSD has one now).

Note that unlike the C++ spec, I have specified that `atomic_fence_heavy` does synchronize with `atomic_fence`. This matches the underlying system call. I suspect C++ chose to omit this guarantee for a hypothetical future architecture that has instruction support for doing this from userspace, which would then not synchronize with ordinary barriers, but I think I would rather cross that bridge when we get there.

I intend to use this in #60281, but it's an independently useful feature.

[1] https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2022/p1202r5.pdf
1 parent 6c75e91 commit 44b50e2
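To make the intended usage concrete, here is a minimal sketch (not part of this commit; `Flags`, `hot_path`, and `cold_path` are hypothetical names) of the Dekker-style pattern that asymmetric fences accelerate: the hot side pairs relaxed atomics with the cheap light fence, while the rare cold side pays for the heavy fence:

```julia
# Illustrative only: a Dekker-style handshake between a hot and a cold path,
# assuming a build that contains this commit.
mutable struct Flags
    @atomic hot::Int
    @atomic cold::Int
end
const flags = Flags(0, 0)

function hot_path()
    @atomic :monotonic flags.hot = 1
    # Compiler reordering barrier only; no fence instruction is emitted.
    Threads.atomic_fence_light()
    return @atomic :monotonic flags.cold
end

function cold_path()
    @atomic :monotonic flags.cold = 1
    # Syscall + IPI; synchronizes with light fences on other threads.
    Threads.atomic_fence_heavy()
    return @atomic :monotonic flags.hot
end

# If hot_path() and cold_path() run concurrently, at least one of them
# observes the other's write, just as if both had used full fences.
```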

14 files changed: +290 -16 lines changed

Compiler/src/tfuncs.jl

Lines changed: 2 additions & 2 deletions
```diff
@@ -716,7 +716,7 @@ end
 @nospecs function pointerset_tfunc(𝕃::AbstractLattice, a, v, i, align)
     return a
 end
-@nospecs function atomic_fence_tfunc(𝕃::AbstractLattice, order)
+@nospecs function atomic_fence_tfunc(𝕃::AbstractLattice, order, syncscope)
     return Nothing
 end
 @nospecs function atomic_pointerref_tfunc(𝕃::AbstractLattice, a, order)
@@ -757,7 +757,7 @@ add_tfunc(add_ptr, 2, 2, pointerarith_tfunc, 1)
 add_tfunc(sub_ptr, 2, 2, pointerarith_tfunc, 1)
 add_tfunc(pointerref, 3, 3, pointerref_tfunc, 4)
 add_tfunc(pointerset, 4, 4, pointerset_tfunc, 5)
-add_tfunc(atomic_fence, 1, 1, atomic_fence_tfunc, 4)
+add_tfunc(atomic_fence, 2, 2, atomic_fence_tfunc, 4)
 add_tfunc(atomic_pointerref, 2, 2, atomic_pointerref_tfunc, 4)
 add_tfunc(atomic_pointerset, 3, 3, atomic_pointerset_tfunc, 5)
 add_tfunc(atomic_pointerswap, 3, 3, atomic_pointerswap_tfunc, 5)
```

base/asyncevent.jl

Lines changed: 1 addition & 1 deletion
```diff
@@ -165,7 +165,7 @@ function _trywait(t::Union{Timer, AsyncCondition})
     set = t.set
     if set
         # full barrier now for AsyncCondition
-        t isa Timer || Core.Intrinsics.atomic_fence(:acquire_release)
+        t isa Timer || Core.Intrinsics.atomic_fence(:acquire_release, :system)
     else
         if !isopen(t)
             set = t.set
```

base/atomics.jl

Lines changed: 25 additions & 1 deletion
```diff
@@ -329,4 +329,28 @@ fences should not be necessary in most cases.
 
 For further details, see LLVM's `fence` instruction.
 """
-atomic_fence() = Core.Intrinsics.atomic_fence(:sequentially_consistent)
+atomic_fence() = Core.Intrinsics.atomic_fence(:sequentially_consistent, :system)
+
+"""
+    Threads.atomic_fence_light()
+
+This is a read-optimized sequential-consistency memory fence.
+On supported operating systems and architectures, this fence is cheaper
+than `Threads.atomic_fence()`, but synchronizes only with
+[`atomic_fence_heavy`](@ref) calls from other threads.
+"""
+atomic_fence_light() = Core.Intrinsics.atomic_fence(:sequentially_consistent, :singlethread)
+
+"""
+    Threads.atomic_fence_heavy()
+
+This is a write-optimized sequential-consistency memory fence.
+This fence is significantly more expensive than `Threads.atomic_fence`.
+It generally requires a system call and a full interprocessor interrupt
+to all other processors in the system. It synchronizes with both
+[`atomic_fence_light`](@ref) and [`atomic_fence`](@ref) calls from other threads.
+
+For further details, see the Linux `membarrier` syscall or the Windows
+`FlushProcessWriteBuffers` API.
+"""
+atomic_fence_heavy() = ccall(:jl_membarrier, Cvoid, ())
```
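A quick, hedged way to check the light fence's lowering in a build containing this commit (exact IR output will vary by version and platform):

```julia
using InteractiveUtils  # for @code_llvm

# The printed IR should contain a singlethread-scoped fence, e.g.
#   fence syncscope("singlethread") seq_cst
# rather than a machine fence instruction.
@code_llvm Threads.atomic_fence_light()
```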

src/ast.c

Lines changed: 2 additions & 0 deletions
```diff
@@ -319,6 +319,8 @@ void jl_init_common_symbols(void)
     jl_atomic_sym = jl_symbol("atomic");
     jl_not_atomic_sym = jl_symbol("not_atomic");
     jl_unordered_sym = jl_symbol("unordered");
+    jl_singlethread_sym = jl_symbol("singlethread");
+    jl_system_sym = jl_symbol("system");
     jl_monotonic_sym = jl_symbol("monotonic");
     jl_acquire_sym = jl_symbol("acquire");
     jl_release_sym = jl_symbol("release");
```

src/intrinsics.cpp

Lines changed: 12 additions & 3 deletions
```diff
@@ -915,17 +915,26 @@ static jl_cgval_t emit_pointerarith(jl_codectx_t &ctx, intrinsic f,
 static jl_cgval_t emit_atomicfence(jl_codectx_t &ctx, ArrayRef<jl_cgval_t> argv)
 {
     const jl_cgval_t &ord = argv[0];
+    const jl_cgval_t &ssid_arg = argv[1];
+    llvm::SyncScope::ID ssid = llvm::SyncScope::System;
+    if (!ssid_arg.constant || !jl_is_symbol(ssid_arg.constant) ||
+        ((jl_sym_t*)ssid_arg.constant != jl_singlethread_sym &&
+         (jl_sym_t*)ssid_arg.constant != jl_system_sym)) {
+        return emit_runtime_call(ctx, atomic_fence, argv, 2);
+    }
+    if ((jl_sym_t*)ssid_arg.constant == jl_singlethread_sym)
+        ssid = llvm::SyncScope::SingleThread;
     if (ord.constant && jl_is_symbol(ord.constant)) {
         enum jl_memory_order order = jl_get_atomic_order((jl_sym_t*)ord.constant, true, true);
         if (order == jl_memory_order_invalid) {
             emit_atomic_error(ctx, "invalid atomic ordering");
             return jl_cgval_t(); // unreachable
         }
         if (order > jl_memory_order_monotonic)
-            ctx.builder.CreateFence(get_llvm_atomic_order(order));
+            ctx.builder.CreateFence(get_llvm_atomic_order(order), ssid);
         return ghostValue(ctx, jl_nothing_type);
     }
-    return emit_runtime_call(ctx, atomic_fence, argv, 1);
+    return emit_runtime_call(ctx, atomic_fence, argv, 2);
 }
 
 static jl_cgval_t emit_atomic_pointerref(jl_codectx_t &ctx, ArrayRef<jl_cgval_t> argv)
@@ -1339,7 +1348,7 @@ static jl_cgval_t emit_intrinsic(jl_codectx_t &ctx, intrinsic f, jl_value_t **ar
 
     case atomic_fence:
         ++Emitted_atomic_fence;
-        assert(nargs == 1);
+        assert(nargs == 2);
         return emit_atomicfence(ctx, argv);
     case atomic_pointerref:
         ++Emitted_atomic_pointerref;
```
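One consequence of the constant check in `emit_atomicfence` above, sketched here for illustration (assuming a build with this commit): only a compile-time-constant syncscope symbol takes the inline LLVM `fence` path; anything else is routed through the runtime call:

```julia
# Constant syncscope: handled entirely in codegen, emits an LLVM `fence`.
Core.Intrinsics.atomic_fence(:sequentially_consistent, :system)

# Non-constant syncscope: codegen cannot see the symbol's value, so this
# lowers to a call to the jl_atomic_fence runtime fallback instead.
scope = rand(Bool) ? :system : :singlethread
Core.Intrinsics.atomic_fence(:sequentially_consistent, scope)
```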

src/intrinsics.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,7 @@
9595
ADD_I(pointerref, 3) \
9696
ADD_I(pointerset, 4) \
9797
/* pointer atomics */ \
98-
ADD_I(atomic_fence, 1) \
98+
ADD_I(atomic_fence, 2) \
9999
ADD_I(atomic_pointerref, 2) \
100100
ADD_I(atomic_pointerset, 3) \
101101
ADD_I(atomic_pointerswap, 3) \

src/jl_exported_funcs.inc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -506,6 +506,7 @@
506506
XX(jl_vprintf) \
507507
XX(jl_wakeup_thread) \
508508
XX(jl_write_compiler_output) \
509+
XX(jl_membarrier) \
509510

510511
#define JL_RUNTIME_EXPORTED_FUNCS_WIN(XX) \
511512
XX(jl_setjmp) \

src/julia_internal.h

Lines changed: 3 additions & 1 deletion
```diff
@@ -1689,7 +1689,7 @@ STATIC_INLINE int is_valid_intrinsic_elptr(jl_value_t *ety)
 JL_DLLEXPORT jl_value_t *jl_bitcast(jl_value_t *ty, jl_value_t *v);
 JL_DLLEXPORT jl_value_t *jl_pointerref(jl_value_t *p, jl_value_t *i, jl_value_t *align);
 JL_DLLEXPORT jl_value_t *jl_pointerset(jl_value_t *p, jl_value_t *x, jl_value_t *align, jl_value_t *i);
-JL_DLLEXPORT jl_value_t *jl_atomic_fence(jl_value_t *order);
+JL_DLLEXPORT jl_value_t *jl_atomic_fence(jl_value_t *order, jl_value_t *syncscope);
 JL_DLLEXPORT jl_value_t *jl_atomic_pointerref(jl_value_t *p, jl_value_t *order);
 JL_DLLEXPORT jl_value_t *jl_atomic_pointerset(jl_value_t *p, jl_value_t *x, jl_value_t *order);
 JL_DLLEXPORT jl_value_t *jl_atomic_pointerswap(jl_value_t *p, jl_value_t *x, jl_value_t *order);
@@ -2010,6 +2010,8 @@ JL_DLLEXPORT int jl_isabspath(const char *in) JL_NOTSAFEPOINT;
     XX(uninferred_sym) \
     XX(unordered_sym) \
     XX(unused_sym) \
+    XX(singlethread_sym) \
+    XX(system_sym)
 
 #define XX(name) extern JL_DLLEXPORT jl_sym_t *jl_##name;
 JL_COMMON_SYMBOLS(XX)
```

src/runtime_intrinsics.c

Lines changed: 8 additions & 1 deletion
```diff
@@ -622,9 +622,16 @@ JL_DLLEXPORT jl_value_t *jl_atomic_pointerreplace(jl_value_t *p, jl_value_t *exp
     return result;
 }
 
-JL_DLLEXPORT jl_value_t *jl_atomic_fence(jl_value_t *order_sym)
+JL_DLLEXPORT jl_value_t *jl_atomic_fence(jl_value_t *order_sym, jl_value_t *syncscope_sym)
 {
     JL_TYPECHK(fence, symbol, order_sym);
+    JL_TYPECHK(fence, symbol, syncscope_sym);
+    if ((jl_sym_t*)syncscope_sym == jl_singlethread_sym) {
+        asm volatile ("" : : : "memory");
+        return jl_nothing;
+    } else if ((jl_sym_t*)syncscope_sym != jl_system_sym) {
+        jl_error("atomic_fence: invalid syncscope");
+    }
     enum jl_memory_order order = jl_get_atomic_order_checked((jl_sym_t*)order_sym, 1, 1);
     if (order > jl_memory_order_monotonic)
         jl_fence();
```
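Correspondingly, the runtime fallback above makes this behavior observable from Julia. A hedged sketch, assuming a build with this commit:

```julia
# A :singlethread fence at runtime reduces to the empty-asm compiler barrier.
Core.Intrinsics.atomic_fence(:sequentially_consistent, :singlethread)

# Any symbol other than :singlethread or :system reaches jl_atomic_fence
# via the codegen fallback and throws.
try
    Core.Intrinsics.atomic_fence(:sequentially_consistent, :bogus)
catch err
    @assert occursin("invalid syncscope", sprint(showerror, err))
end
```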

src/signals-mach.c

Lines changed: 52 additions & 0 deletions
```diff
@@ -5,6 +5,7 @@
 #include <mach/clock.h>
 #include <mach/clock_types.h>
 #include <mach/clock_reply.h>
+#include <mach/thread_state.h>
 #include <mach/mach_traps.h>
 #include <mach/task.h>
 #include <mach/mig_errors.h>
@@ -891,3 +892,54 @@ JL_DLLEXPORT void jl_profile_stop_timer(void)
     profile_all_tasks = 0;
     uv_mutex_unlock(&bt_data_prof_lock);
 }
+
+// The mprotect implementation in signals-unix.c does not work on macOS/aarch64, as mentioned.
+// This implementation comes from dotnet, but is similarly dependent on undocumented behavior of the OS.
+// Copyright (c) .NET Foundation and Contributors
+// MIT LICENSE
+JL_DLLEXPORT void jl_membarrier(void) {
+    mach_msg_type_number_t cThreads;
+    thread_act_t *pThreads;
+    kern_return_t machret = task_threads(mach_task_self(), &pThreads, &cThreads);
+    HANDLE_MACH_ERROR("task_threads()", machret);
+
+    uintptr_t sp;
+    uintptr_t registerValues[128];
+
+    // Iterate through each of the threads in the list.
+    for (mach_msg_type_number_t i = 0; i < cThreads; i++)
+    {
+        if (__builtin_available (macOS 10.14, iOS 12, tvOS 9, *))
+        {
+            // Request the threads pointer values to force the thread to emit a memory barrier
+            size_t registers = 128;
+            machret = thread_get_register_pointer_values(pThreads[i], &sp, &registers, registerValues);
+        }
+        else
+        {
+            // fallback implementation for older OS versions
+#if defined(_CPU_X86_64_)
+            x86_thread_state64_t threadState;
+            mach_msg_type_number_t count = x86_THREAD_STATE64_COUNT;
+            machret = thread_get_state(pThreads[i], x86_THREAD_STATE64, (thread_state_t)&threadState, &count);
+#elif defined(_CPU_AARCH64_)
+            arm_thread_state64_t threadState;
+            mach_msg_type_number_t count = ARM_THREAD_STATE64_COUNT;
+            machret = thread_get_state(pThreads[i], ARM_THREAD_STATE64, (thread_state_t)&threadState, &count);
+#else
+#error Unexpected architecture
+#endif
+        }
+
+        if (machret == KERN_INSUFFICIENT_BUFFER_SIZE)
+        {
+            HANDLE_MACH_ERROR("thread_get_register_pointer_values()", machret);
+        }
+
+        machret = mach_port_deallocate(mach_task_self(), pThreads[i]);
+        HANDLE_MACH_ERROR("mach_port_deallocate()", machret);
+    }
+    // Deallocate the thread list now we're done with it.
+    machret = vm_deallocate(mach_task_self(), (vm_address_t)pThreads, cThreads * sizeof(thread_act_t));
+    HANDLE_MACH_ERROR("vm_deallocate()", machret);
+}
```
