official-clockwork · TheRealGioviok · Dec 26, 2025 · Dec 26, 2025
diff --git a/src/geometry.cpp b/src/geometry.cpp
@@ -3,22 +3,31 @@
 
 namespace Clockwork::geometry {
 
-// clang-format off
 // Offset arrangement is AVX2-specific (due to punpck-ordering).
 constexpr std::array<u8, 64> AVX2_OFFSETS{{
-    0210, 0211, 0212, 0213, 0214, 0215, 0216, 0217,
-    0310, 0311, 0312, 0313, 0314, 0315, 0316, 0317,
-    0230, 0231, 0232, 0233, 0234, 0235, 0236, 0237,
-    0330, 0331, 0332, 0333, 0334, 0335, 0336, 0337,
-    0250, 0251, 0252, 0253, 0254, 0255, 0256, 0257,
-    0350, 0351, 0352, 0353, 0354, 0355, 0356, 0357,
-    0270, 0271, 0272, 0273, 0274, 0275, 0276, 0277,
-    0370, 0371, 0372, 0373, 0374, 0375, 0376, 0377,
+  0210, 0211, 0212, 0213, 0214, 0215, 0216, 0217,  //
+  0310, 0311, 0312, 0313, 0314, 0315, 0316, 0317,  //
+  0230, 0231, 0232, 0233, 0234, 0235, 0236, 0237,  //
+  0330, 0331, 0332, 0333, 0334, 0335, 0336, 0337,  //
+  0250, 0251, 0252, 0253, 0254, 0255, 0256, 0257,  //
+  0350, 0351, 0352, 0353, 0354, 0355, 0356, 0357,  //
+  0270, 0271, 0272, 0273, 0274, 0275, 0276, 0277,  //
+  0370, 0371, 0372, 0373, 0374, 0375, 0376, 0377,  //
 }};
-// clang-format on
 
+constexpr std::array<u8, 64> AVX512_OFFSETS{{
+  0210, 0211, 0212, 0213, 0214, 0215, 0216, 0217,  //
+  0230, 0231, 0232, 0233, 0234, 0235, 0236, 0237,  //
+  0250, 0251, 0252, 0253, 0254, 0255, 0256, 0257,  //
+  0270, 0271, 0272, 0273, 0274, 0275, 0276, 0277,  //
+  0310, 0311, 0312, 0313, 0314, 0315, 0316, 0317,  //
+  0330, 0331, 0332, 0333, 0334, 0335, 0336, 0337,  //
+  0350, 0351, 0352, 0353, 0354, 0355, 0356, 0357,  //
+  0370, 0371, 0372, 0373, 0374, 0375, 0376, 0377,  //
+}};
 
-const std::array<u8x64, 64> SUPERPIECE_INVERSE_RAYS_AVX2_TABLE = []() {
+template<std::array<u8, 64> OFFSETS, u8 RAY_OFFSET>
+consteval std::array<u8x64, 64> calc_superpiece_inverse_rays_table() {
     // clang-format off
     constexpr u8 NONE = 0x80;
     constexpr std::array<u8, 256> BASE{{
@@ -46,15 +55,25 @@ const std::array<u8x64, 64> SUPERPIECE_INVERSE_RAYS_AVX2_TABLE = []() {
         u8                 esq = internal::expand_sq(Square{sq});
         std::array<u8, 64> b;
         for (usize i = 0; i < 64; i++) {
-            u8 value = BASE[AVX2_OFFSETS[i] - esq];
+            u8 value = BASE[OFFSETS[i] - esq];
+            value    = value != NONE ? (value + RAY_OFFSET) % 64 : NONE;
             b[i]     = value;
         }
         table[sq] = u8x64{b};
     }
     return table;
-}();
+}
+
 
-const std::array<u8x64, 64> PIECE_MOVES_AVX2_TABLE = []() {
+const std::array<u8x64, 64> SUPERPIECE_INVERSE_RAYS_AVX2_TABLE =
+  calc_superpiece_inverse_rays_table<AVX2_OFFSETS, 0>();
+const std::array<u8x64, 64> SUPERPIECE_INVERSE_RAYS_AVX512_TABLE =
+  calc_superpiece_inverse_rays_table<AVX512_OFFSETS, 0>();
+const std::array<u8x64, 64> SUPERPIECE_INVERSE_RAYS_FLIPPED_AVX512_TABLE =
+  calc_superpiece_inverse_rays_table<AVX512_OFFSETS, 32>();
+
+template<std::array<u8, 64> OFFSETS>
+consteval std::array<u8x64, 64> calc_piece_moves_table() {
     // clang-format off
     constexpr u8 K = 1 << static_cast<i32>(PieceType::King);
     constexpr u8 Q = 1 << static_cast<i32>(PieceType::Queen);
@@ -92,11 +111,14 @@ const std::array<u8x64, 64> PIECE_MOVES_AVX2_TABLE = []() {
         u8                 esq = internal::expand_sq(Square{sq});
         std::array<u8, 64> b;
         for (usize i = 0; i < 64; i++) {
-            b[i] = BASE[AVX2_OFFSETS[i] - esq];
+            b[i] = BASE[OFFSETS[i] - esq];
         }
         table[sq] = u8x64{b};
     }
     return table;
-}();
+}
+
+const std::array<u8x64, 64> PIECE_MOVES_AVX2_TABLE   = calc_piece_moves_table<AVX2_OFFSETS>();
+const std::array<u8x64, 64> PIECE_MOVES_AVX512_TABLE = calc_piece_moves_table<AVX512_OFFSETS>();
 
 }  // namespace Clockwork::geometry
diff --git a/src/geometry.hpp b/src/geometry.hpp
@@ -124,6 +124,18 @@ forceinline u8x64 superpiece_inverse_rays_avx2(Square sq) {
     return SUPERPIECE_INVERSE_RAYS_AVX2_TABLE[sq.raw];
 }
 
+extern const std::array<u8x64, 64> SUPERPIECE_INVERSE_RAYS_AVX512_TABLE;
+
+forceinline u8x64 superpiece_inverse_rays_avx512(Square sq) {
+    return SUPERPIECE_INVERSE_RAYS_AVX512_TABLE[sq.raw];
+}
+
+extern const std::array<u8x64, 64> SUPERPIECE_INVERSE_RAYS_FLIPPED_AVX512_TABLE;
+
+forceinline u8x64 superpiece_inverse_rays_flipped_avx512(Square sq) {
+    return SUPERPIECE_INVERSE_RAYS_FLIPPED_AVX512_TABLE[sq.raw];
+}
+
 extern const std::array<u8x64, 64> PIECE_MOVES_AVX2_TABLE;
 
 forceinline m8x64 piece_moves_avx2(bool color, PieceType ptype, Square sq) {
@@ -134,6 +146,16 @@ forceinline m8x64 piece_moves_avx2(bool color, PieceType ptype, Square sq) {
     return table.test(bit);
 }
 
+extern const std::array<u8x64, 64> PIECE_MOVES_AVX512_TABLE;
+
+forceinline m8x64 piece_moves_avx512(bool color, PieceType ptype, Square sq) {
+    assert(ptype != PieceType::None);
+    i32   index = ptype == PieceType::Pawn ? color : static_cast<i32>(ptype);
+    u8x64 bit   = u8x64::splat(static_cast<u8>(1 << index));
+    u8x64 table = PIECE_MOVES_AVX512_TABLE[sq.raw];
+    return table.test(bit);
+}
+
 forceinline u8x64 slider_broadcast(u8x64 x) {
 #if LPS_AVX512
     constexpr u8 NONE = 0xFF;
@@ -172,7 +194,19 @@ forceinline u8x64 slider_broadcast(u8x64 x) {
 }
 
 forceinline u8x64 lane_broadcast(u8x64 x) {
-#if LPS_AVX2
+#if LPS_AVX512
+    u8x64 EXPAND_IDX{{
+      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,  //
+      0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08,  //
+      0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,  //
+      0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,  //
+      0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,  //
+      0x28, 0x28, 0x28, 0x28, 0x28, 0x28, 0x28, 0x28,  //
+      0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30,  //
+      0x38, 0x38, 0x38, 0x38, 0x38, 0x38, 0x38, 0x38,  //
+    }};
+    return EXPAND_IDX.swizzle(u8x64{_mm512_sad_epu8(x.raw, _mm512_setzero_si512())});
+#elif LPS_AVX2
     u8x64 y;
     y.raw[0].raw = _mm256_sad_epu8(x.raw[0].raw, _mm256_setzero_si256());
     y.raw[1].raw = _mm256_sad_epu8(x.raw[1].raw, _mm256_setzero_si256());

diff --git a/src/movegen.cpp b/src/movegen.cpp
@@ -443,10 +443,21 @@ bool MoveGen::is_hside_castling_legal(Bitboard empty, Bitboard danger) const {
 }
 
 void MoveGen::write(MoveList& moves, Square dest, PieceMask piecemask, MoveFlags mf) {
+#if LPS_AVX512
+    moves.unsafe_append([&](Move* data) {
+        u16x16 vec =
+          u16x16::splat(Move{Square{0}, dest, mf}.raw)
+          | u16x16{_mm256_cvtepu8_epi16(m_position.piece_list_sq(m_active_color).to_vector().raw)};
+        vec = m16x16{piecemask.value()}.compress(vec);
+        std::memcpy(data, &vec, sizeof(vec));
+        return piecemask.popcount();
+    });
+#else
     for (PieceId id : piecemask) {
         Square src = m_position.piece_list_sq(m_active_color)[id];
         moves.push_back(Move{src, dest, mf});
     }
+#endif
 }
 
 void MoveGen::write(MoveList&                        moves,
@@ -460,10 +471,32 @@ void MoveGen::write(MoveList&                        moves,
 }
 
 void MoveGen::write_pawn(MoveList& moves, Bitboard src_bb, i32 shift, MoveFlags mf) {
+#if LPS_AVX512
+    u16x32 base = []() consteval {
+        std::array<Move, 32> base;
+        for (usize i = 0; i < 32; i++) {
+            Square src{static_cast<u8>(i)};
+            Square dest{static_cast<u8>(i)};
+            base[i] = Move{src, dest, static_cast<MoveFlags>(0)};
+        }
+        return std::bit_cast<u16x32>(base);
+    }();
+    for (int i : {0, 32}) {
+        moves.unsafe_append([&](Move* data) {
+            m16x32 mask{static_cast<u32>(src_bb.value() >> i)};
+            u16x32 vec =
+              u16x32::splat(static_cast<u16>(i + ((i + shift) << 6) + static_cast<u16>(mf))) + base;
+            vec = mask.compress(vec);
+            std::memcpy(data, &vec, sizeof(vec));
+            return static_cast<usize>(std::popcount(mask.raw));
+        });
+    }
+#else
     for (Square src : src_bb) {
         Square dest{static_cast<u8>(src.raw + shift)};
         moves.push_back(Move{src, dest, mf});
     }
+#endif
 }
 
 bool MoveGen::is_ep_clearance_pinned(PieceMask ep_attackers_mask) const {

diff --git a/src/position.cpp b/src/position.cpp
@@ -175,22 +175,44 @@ void Position::incrementally_move_piece(
     dst_slider_ids = dst_raymask.mask(geometry::flip_rays(dst_slider_ids));  // flip rays
     dst_slider_ids |= dst_raymask.mask(u8x64::splat(0x20));  // pack information for efficiency
 
+#if LPS_AVX512
+    u8x64 src_inv_perm = geometry::superpiece_inverse_rays_avx512(from);
+    u8x64 dst_inv_perm = geometry::superpiece_inverse_rays_avx512(to);
+#else
     u8x64 src_inv_perm = geometry::superpiece_inverse_rays_avx2(from);
     u8x64 dst_inv_perm = geometry::superpiece_inverse_rays_avx2(to);
+#endif
 
     // Transform into board layout
     src_slider_ids = src_inv_perm.swizzle(src_slider_ids);
     dst_slider_ids = dst_inv_perm.swizzle(dst_slider_ids);
 
     // Recover color information
-    u8x64 src_col = src_slider_ids.test(u8x64::splat(0x10)).to_vector();
-    u8x64 dst_col = dst_slider_ids.test(u8x64::splat(0x10)).to_vector();
+    m8x64 src_col = src_slider_ids.test(u8x64::splat(0x10));
+    m8x64 dst_col = dst_slider_ids.test(u8x64::splat(0x10));
     // Recover ray mask information
     m8x64 ret = dst_slider_ids.test(u8x64::splat(0x20));
 
     src_slider_ids &= u8x64::splat(0x0F);
     dst_slider_ids &= u8x64::splat(0x0F);
 
+#if LPS_AVX512
+    u16x64 src_slider_ids2 = std::bit_cast<u16x64>(
+      std::array{_mm512_cvtepu8_epi16(_mm512_castsi512_si256(src_slider_ids.raw)),
+                 _mm512_cvtepu8_epi16(_mm512_extracti64x4_epi64(src_slider_ids.raw, 1))});
+    u16x64 dst_slider_ids2 = std::bit_cast<u16x64>(
+      std::array{_mm512_cvtepu8_epi16(_mm512_castsi512_si256(dst_slider_ids.raw)),
+                 _mm512_cvtepu8_epi16(_mm512_extracti64x4_epi64(dst_slider_ids.raw, 1))});
+
+    u16x64 src_at = m16x64{src_slider_ids.nonzeros().raw}.mask(u16x64::splat(1) << src_slider_ids2);
+    u16x64 dst_at = m16x64{dst_slider_ids.nonzeros().raw}.mask(u16x64::splat(1) << dst_slider_ids2);
+
+    m16x64 src_color{src_col.raw};
+    m16x64 dst_color{dst_col.raw};
+
+    m_attack_table[0].raw ^= (~src_color).mask(src_at) ^ (~dst_color).mask(dst_at);
+    m_attack_table[1].raw ^= src_color.mask(src_at) ^ dst_color.mask(dst_at);
+#else
     // AVX2 doesn't have a variable word shift, so were're doing it this way.
     // Index zero is invalid here (the king is never a slider), so 0 converts to 0.
     static const u8x16 BITS_LO{{0x00, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,  //
@@ -202,10 +224,10 @@ void Position::incrementally_move_piece(
     u8x64              dst_at_lo = dst_slider_ids.swizzle(BITS_LO);
     u8x64              dst_at_hi = dst_slider_ids.swizzle(BITS_HI);
 
-    u8x64 src_color0 = src_col.zip_low_128lanes(src_col);
-    u8x64 src_color1 = src_col.zip_high_128lanes(src_col);
-    u8x64 dst_color0 = dst_col.zip_low_128lanes(dst_col);
-    u8x64 dst_color1 = dst_col.zip_high_128lanes(dst_col);
+    u8x64 src_color0 = src_col.to_vector().zip_low_128lanes(src_col.to_vector());
+    u8x64 src_color1 = src_col.to_vector().zip_high_128lanes(src_col.to_vector());
+    u8x64 dst_color0 = dst_col.to_vector().zip_low_128lanes(dst_col.to_vector());
+    u8x64 dst_color1 = dst_col.to_vector().zip_high_128lanes(dst_col.to_vector());
 
     u16x64 src_color = std::bit_cast<u16x64>(std::array<u8x64, 2>{src_color0, src_color1});
     u16x64 dst_color = std::bit_cast<u16x64>(std::array<u8x64, 2>{dst_color0, dst_color1});
@@ -220,6 +242,7 @@ void Position::incrementally_move_piece(
 
     m_attack_table[0].raw ^= src_at.andnot(src_color) ^ dst_at.andnot(dst_color);
     m_attack_table[1].raw ^= (src_at & src_color) ^ (dst_at & dst_color);
+#endif
 
     add_attacks(color, p.id(), to, p.ptype(), ret);
 }
@@ -242,18 +265,34 @@ m8x64 Position::toggle_rays(Square sq) {
     slider_ids = raymask.mask(geometry::flip_rays(slider_ids));  // flip rays
     slider_ids |= raymask.mask(u8x64::splat(0x20));              // pack information for efficiency
 
+#if LPS_AVX512
+    u8x64 inv_perm = geometry::superpiece_inverse_rays_avx512(sq);
+#else
     u8x64 inv_perm = geometry::superpiece_inverse_rays_avx2(sq);
+#endif
 
     // Transform into board layout
     slider_ids = inv_perm.swizzle(slider_ids);
 
     // Recover color information
-    u8x64 col = slider_ids.test(u8x64::splat(0x10)).to_vector();
+    m8x64 col = slider_ids.test(u8x64::splat(0x10));
     // Recover ray mask information
     m8x64 ret = slider_ids.test(u8x64::splat(0x20));
 
     slider_ids &= u8x64::splat(0x0F);
 
+#if LPS_AVX512
+    u16x64 slider_ids2 = std::bit_cast<u16x64>(
+      std::array{_mm512_cvtepu8_epi16(_mm512_castsi512_si256(slider_ids.raw)),
+                 _mm512_cvtepu8_epi16(_mm512_extracti64x4_epi64(slider_ids.raw, 1))});
+
+    u16x64 at = m16x64{slider_ids.nonzeros().raw}.mask(u16x64::splat(1) << slider_ids2);
+
+    m16x64 color{col.raw};
+
+    m_attack_table[0].raw ^= (~color).mask(at);
+    m_attack_table[1].raw ^= color.mask(at);
+#else
     // AVX2 doesn't have a variable word shift, so were're doing it this way.
     // Index zero is invalid here (the king is never a slider), so 0 converts to 0.
     static const u8x16 BITS_LO{{0x00, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,  //
@@ -263,8 +302,8 @@ m8x64 Position::toggle_rays(Square sq) {
     u8x64              at_lo = slider_ids.swizzle(BITS_LO);
     u8x64              at_hi = slider_ids.swizzle(BITS_HI);
 
-    u8x64  color0 = col.zip_low_128lanes(col);
-    u8x64  color1 = col.zip_high_128lanes(col);
+    u8x64  color0 = col.to_vector().zip_low_128lanes(col.to_vector());
+    u8x64  color1 = col.to_vector().zip_high_128lanes(col.to_vector());
     u16x64 color  = std::bit_cast<u16x64>(std::array<u8x64, 2>{color0, color1});
 
     u8x64  at0 = at_lo.zip_low_128lanes(at_hi);
@@ -273,6 +312,7 @@ m8x64 Position::toggle_rays(Square sq) {
 
     m_attack_table[0].raw ^= at.andnot(color);
     m_attack_table[1].raw ^= at & color;
+#endif
 
     return ret;
 }
@@ -293,7 +333,11 @@ void Position::add_attacks(bool color, PieceId id, Square sq, PieceType ptype) {
         u8x64 ray_places             = ray_coords.swizzle(m_board.to_vector());
         m8x64 raymask                = geometry::superpiece_attacks(ray_places, ray_valid);
 
-        u8x64 inv_perm  = geometry::superpiece_inverse_rays_avx2(sq);
+#if LPS_AVX512
+        u8x64 inv_perm = geometry::superpiece_inverse_rays_avx512(sq);
+#else
+        u8x64 inv_perm = geometry::superpiece_inverse_rays_avx2(sq);
+#endif
         m8x64 boardmask = inv_perm.swizzle(raymask);
 
         add_attacks(color, id, sq, ptype, boardmask);
@@ -303,14 +347,22 @@ void Position::add_attacks(bool color, PieceId id, Square sq, PieceType ptype) {
 }
 
 void Position::add_attacks(bool color, PieceId id, Square sq, PieceType ptype, m8x64 mask) {
-    u8x64 moves = (mask & geometry::piece_moves_avx2(color, ptype, sq)).to_vector();
+    u16x64 bit = u16x64::splat(id.to_piece_mask().value());
 
-    u8x64  m0 = moves.zip_low_128lanes(moves);
-    u8x64  m1 = moves.zip_high_128lanes(moves);
+#if LPS_AVX512
+    m8x64 moves = mask & geometry::piece_moves_avx512(color, ptype, sq);
+
+    m_attack_table[color].raw |= m16x64{moves.raw}.mask(bit);
+#else
+    m8x64 moves     = mask & geometry::piece_moves_avx2(color, ptype, sq);
+    u8x64 moves_vec = moves.to_vector();
+
+    u8x64  m0 = moves_vec.zip_low_128lanes(moves_vec);
+    u8x64  m1 = moves_vec.zip_high_128lanes(moves_vec);
     u16x64 m  = std::bit_cast<u16x64>(std::array<u8x64, 2>{m0, m1});
 
-    u16x64 bit = u16x64::splat(id.to_piece_mask().value());
     m_attack_table[color].raw |= m & bit;
+#endif
 }
 
 template<bool UPDATE_PSQT>
@@ -523,13 +575,14 @@ std::tuple<Wordboard, Bitboard> Position::calc_pin_mask() const {
 
 // Does this ray have a pinner?
 #if LPS_AVX512
-    m8x64 no_pinner_mask{
-      std::bit_cast<vm8x64>(std::bit_cast<u64x8>(pinner.to_vector()).zeros().to_vector())
-        .to_bits()};
+    const m8x16 has_attacker_vecmask = u8x16{_mm_set1_epi64x(pinner.raw)}.nonzeros();
+    const m8x64 pinned               = m8x64{static_cast<u64>(
+      _mm_cvtsi128_si64(has_attacker_vecmask.mask(u8x16{_mm_set1_epi64x(maybe_pinned.raw)}).raw))};
 #else
     m8x64 no_pinner_mask = std::bit_cast<m8x64>(std::bit_cast<m64x8>(pinner).to_vector().zeros());
+    m8x64 pinned         = maybe_pinned.andnot(no_pinner_mask);
 #endif
-    m8x64 pinned = maybe_pinned.andnot(no_pinner_mask);
+
 
     u8x64 nonmasked_pinned_ids =
       geometry::lane_broadcast(pinned.mask(ray_places & u8x64::splat(0xF)));

diff --git a/src/util/static_vector.hpp b/src/util/static_vector.hpp
@@ -92,6 +92,12 @@ class StaticVector {
         return res;
     }
 
+    template<typename F>
+    void unsafe_append(F&& f) {
+        usize sz = f(end());
+        m_len += sz;
+    }
+
     void append(const StaticVector& other) {
         assert(m_len + other.m_len <= cap);
         std::uninitialized_copy(other.begin(), other.end(), end());