Skip to content

Commit 79598bf

Browse files
committed
Rework SSE intrinsics to avoid Clang 19 bugs
Fixes the Clang part of #1312. The problem is that LLVM wrongly treats __m128 as having floating-point semantics at all times. When it sees an all-ones mask in some component, it wrongly interprets it as a NaN and, in -ffinite-math-only mode, replaces it with an undefined value. In the cases where the new first_XYZ_second_W is used, its shuffle sequence is actually what Clang itself outputs after optimizing the original intrinsics code (with a non-bugged flag configuration). So hopefully it is faster anyway.
1 parent cc3592e commit 79598bf

File tree

1 file changed

+14
-16
lines changed

1 file changed

+14
-16
lines changed

src/engine/qcommon/q_shared.h

Lines changed: 14 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1166,7 +1166,7 @@ inline vec_t VectorNormalize2( const vec3_t v, vec3_t out )
11661166
#define sseSwizzle( a, mask ) _mm_shuffle_ps( (a), (a), SWZ_##mask )
11671167

11681168
inline __m128 unitQuat() {
1169-
return _mm_set_ps( 1.0f, 0.0f, 0.0f, 0.0f ); // order is reversed
1169+
return _mm_setr_ps( 0.0f, 0.0f, 0.0f, 1.0f );
11701170
}
11711171
inline __m128 sseLoadInts( const int vec[4] ) {
11721172
return *(__m128 *)vec;
@@ -1175,13 +1175,14 @@ inline vec_t VectorNormalize2( const vec3_t v, vec3_t out )
11751175
alignas(16) static const std::array<int, 4> vec = { 0, 0, 0, 0 };
11761176
return sseLoadInts( vec.data() );
11771177
}
1178-
inline __m128 mask_000W() {
1179-
alignas(16) static const std::array<int, 4> vec = { 0, 0, 0, -1 };
1180-
return sseLoadInts( vec.data() );
1181-
}
1182-
inline __m128 mask_XYZ0() {
1183-
alignas(16) static const std::array<int, 4> vec = { -1, -1, -1, 0 };
1184-
return sseLoadInts( vec.data() );
1178+
1179+
// {first.x, first.y, first.z, second.w}
1180+
inline __m128 first_XYZ_second_W( __m128 first, __m128 second)
1181+
{
1182+
// second.w, dontcare, first.z, dontcare
1183+
__m128 tmp = _mm_shuffle_ps(second, first, 3 << 0 | 2 << 4);
1184+
// first.x, first.y, tmp.z, tmp.x
1185+
return _mm_shuffle_ps(first, tmp, 0 << 0 | 1 << 2 | 2 << 4 | 0 << 6);
11851186
}
11861187

11871188
inline __m128 sign_000W() {
@@ -1295,10 +1296,8 @@ inline vec_t VectorNormalize2( const vec3_t v, vec3_t out )
12951296
t->sseTransScale = _mm_or_ps( v, unitQuat() );
12961297
}
12971298
inline void TransInitScale( float factor, transform_t *t ) {
1298-
__m128 f = _mm_set1_ps( factor );
1299-
f = _mm_and_ps( f, mask_000W() );
13001299
t->sseRot = unitQuat();
1301-
t->sseTransScale = f;
1300+
t->sseTransScale = _mm_setr_ps( 0.0f, 0.0f, 0.0f, factor );
13021301
}
13031302
inline void TransInsRotationQuat( const quat_t quat, transform_t *t ) {
13041303
__m128 q = _mm_loadu_ps( quat );
@@ -1318,11 +1317,10 @@ inline vec_t VectorNormalize2( const vec3_t v, vec3_t out )
13181317
}
13191318
inline void TransInsTranslation(
13201319
const vec3_t vec, transform_t *t ) {
1321-
__m128 v = sseLoadVec3Unsafe( vec );
1320+
__m128 v = sseLoadVec3( vec );
13221321
__m128 ts = t->sseTransScale;
13231322
v = sseQuatTransform( t->sseRot, v );
13241323
v = _mm_mul_ps( v, sseSwizzle( ts, WWWW ) );
1325-
v = _mm_and_ps( v, mask_XYZ0() );
13261324
t->sseTransScale = _mm_add_ps( ts, v );
13271325
}
13281326
inline void TransAddTranslation(
@@ -1339,7 +1337,8 @@ inline vec_t VectorNormalize2( const vec3_t v, vec3_t out )
13391337
__m128 bTS = b->sseTransScale;
13401338
__m128 tmp = sseQuatTransform( bRot, aTS );
13411339
tmp = _mm_mul_ps( tmp, sseSwizzle( bTS, WWWW ) );
1342-
out->sseTransScale = _mm_add_ps( tmp, _mm_and_ps( bTS, mask_XYZ0() ) );
1340+
__m128 bT = first_XYZ_second_W( bTS, mask_0000() );
1341+
out->sseTransScale = _mm_add_ps( tmp, bT );
13431342
out->sseRot = sseQuatMul( bRot, aRot );
13441343
}
13451344
inline void TransInverse( const transform_t *in,
@@ -1352,8 +1351,7 @@ inline vec_t VectorNormalize2( const vec3_t v, vec3_t out )
13521351
__m128 tmp = sseQuatTransform( invRot, invT );
13531352
tmp = _mm_mul_ps( tmp, invS );
13541353
out->sseRot = invRot;
1355-
out->sseTransScale = _mm_or_ps( _mm_and_ps( tmp, mask_XYZ0() ),
1356-
_mm_and_ps( invS, mask_000W() ) );
1354+
out->sseTransScale = first_XYZ_second_W( tmp, invS );
13571355
}
13581356
inline void TransStartLerp( transform_t *t ) {
13591357
t->sseRot = mask_0000();

0 commit comments

Comments
 (0)