Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 53 additions & 7 deletions src/utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -426,17 +426,43 @@ impl<N: SimdRealCopy> SimdAngularInertia<N> for SdpMatrix3<N> {
// to zero, and automatically resetting previous flags once it is dropped.
#[derive(Clone, Debug, PartialEq, Eq)]
pub(crate) struct FlushToZeroDenormalsAreZeroFlags {
// Exactly one of the three `original_flags` declarations below is compiled in:
// the cfg predicates are mutually exclusive and together cover every
// combination of the `enhanced-determinism` feature and target architecture.

// Fallback: selected when `enhanced-determinism` is enabled, or when the target
// is neither aarch64 nor x86/x86_64 with SSE. The unit type stores nothing.
#[cfg(any(
feature = "enhanced-determinism",
not(any(
target_arch = "aarch64",
all(
any(target_arch = "x86", target_arch = "x86_64"),
target_feature = "sse"
)
))
))]
original_flags: (),

// x86/x86_64 with SSE, without `enhanced-determinism`: a 32-bit saved value.
// NOTE(review): presumably the control/status word read with `_mm_getcsr()` by
// the matching constructor; confirm against the x86 implementation.
#[cfg(all(
not(feature = "enhanced-determinism"),
any(target_arch = "x86", target_arch = "x86_64"),
target_feature = "sse"
))]
original_flags: u32,

// aarch64, without `enhanced-determinism`: the 64-bit FPCR value that the
// aarch64 constructor reads with `mrs` before modifying the register.
#[cfg(all(not(feature = "enhanced-determinism"), target_arch = "aarch64"))]
original_flags: u64,
}

// Flush denormals & underflows to zero as this has a significant impact on the solver's performance.
impl FlushToZeroDenormalsAreZeroFlags {
/// Fallback constructor that leaves the floating-point environment untouched.
///
/// Compiled when the `enhanced-determinism` feature is enabled, or when the
/// target is neither aarch64 nor x86/x86_64 with SSE. In that configuration
/// the `original_flags` field is the unit type, so there is nothing to save.
#[cfg(any(
    feature = "enhanced-determinism",
    not(any(
        target_arch = "aarch64",
        all(
            any(target_arch = "x86", target_arch = "x86_64"),
            target_feature = "sse"
        )
    ))
))]
pub fn flush_denormal_to_zero() -> Self {
    Self { original_flags: () }
}

#[cfg(all(
Expand All @@ -452,14 +478,27 @@ impl FlushToZeroDenormalsAreZeroFlags {
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::{_MM_FLUSH_ZERO_ON, _mm_getcsr, _mm_setcsr};

// Flush denormals & underflows to zero as this has a significant impact on the solver's performance.
// To enable this we need to set the bit 15 (given by _MM_FLUSH_ZERO_ON) and the bit 6 (for denormals-are-zero).
// See https://software.intel.com/content/www/us/en/develop/articles/x87-and-sse-floating-point-assists-in-ia-32-flush-to-zero-ftz-and-denormals-are-zero-daz.html
let original_flags = _mm_getcsr();
_mm_setcsr(original_flags | _MM_FLUSH_ZERO_ON | (1 << 6));
Self { original_flags }
}
}

#[cfg(all(not(feature = "enhanced-determinism"), target_arch = "aarch64"))]
pub fn flush_denormal_to_zero() -> Self {
    // Bits of FPCR (Floating-point Control Register) switched on here:
    //   FZ,   bit 24 - flush denormalized numbers to zero
    //   FZ16, bit 19 - enable the same flushing for half-precision (f16) numbers
    // See https://developer.arm.com/documentation/ddi0601/2025-06/AArch64-Registers/FPCR--Floating-point-Control-Register
    const FZ: u64 = 1 << 24;
    const FZ16: u64 = 1 << 19;

    let saved_fpcr: u64;
    // SAFETY: the two instructions only copy FPCR into a general-purpose
    // register and write it back with the FZ/FZ16 bits added; no memory is
    // accessed. The previous value is kept so it can be written back later.
    unsafe {
        std::arch::asm!("mrs {}, fpcr", out(reg) saved_fpcr);
        std::arch::asm!("msr fpcr, {}", in(reg) saved_fpcr | FZ | FZ16);
    }

    Self {
        original_flags: saved_fpcr,
    }
}
}

#[cfg(all(
Expand All @@ -481,6 +520,13 @@ impl Drop for FlushToZeroDenormalsAreZeroFlags {
}
}

#[cfg(all(not(feature = "enhanced-determinism"), target_arch = "aarch64"))]
impl Drop for FlushToZeroDenormalsAreZeroFlags {
    /// Writes the value stored in `original_flags` back into FPCR.
    fn drop(&mut self) {
        let saved_fpcr = self.original_flags;
        // SAFETY: a single `msr` that writes a general-purpose register into
        // FPCR; no memory is accessed.
        unsafe {
            std::arch::asm!("msr fpcr, {}", in(reg) saved_fpcr);
        }
    }
}

/// This is an RAII structure that disables floating point exceptions while
/// it is alive, so that operations which generate NaNs and infinite values
/// intentionally will not trip an exception when debugging problematic
Expand Down