compsec-epfl · MuhtasimTanmoy · Dec 26, 2025 · Dec 26, 2025 · Dec 26, 2025 · Dec 27, 2025
diff --git a/benches/experimental_benches.rs b/benches/experimental_benches.rs
@@ -7,11 +7,11 @@ use efficient_sumcheck::{
         m31::{
             evaluate_bf::evaluate_bf, evaluate_ef::evaluate_ef, reduce_bf::reduce_bf,
             reduce_ef::reduce_ef, sumcheck,
-        },
+        }
     },
     multilinear::{pairwise, ReduceMode, TimeProver},
     prover::Prover,
-    tests::{BenchStream, Fp4SmallM31, SmallM31, F128},
+    tests::{BenchStream, Fp4SmallM31, SmallM31, F128, SmallGoldilocks, Fp2SmallGoldilocks},
     Sumcheck,
 };
 
@@ -181,6 +181,107 @@ fn bench_reduce_ef(c: &mut Criterion) {
     });
 }
 
+
+fn bench_reduce_ef_goldilocks(c: &mut Criterion) {
+    // Benchmarks the Goldilocks reduce_ef and pairwise::reduce_evaluations on the same inputs at five vector lengths.
+    use efficient_sumcheck::experimental::goldilocks::reduce_ef::reduce_ef; // local import: inside this function `reduce_ef` means the goldilocks version, not the m31 one imported at file level
+    const LEN_XSMALL: usize = 1 << 10; // 1K
+    const LEN_SMALL: usize = 1 << 14; // 16K
+    const LEN_MED: usize = 1 << 16; // 64K
+    const LEN_LARGE: usize = 1 << 18; // 256K
+    const LEN_XLARGE: usize = 1 << 20; // 1M
+
+    let mut rng = test_rng();
+
+    // Shared random Fp2SmallGoldilocks input vectors, one per size; both benchmark families below clone from these
+    let src_xsmall: Vec<Fp2SmallGoldilocks> = (0..LEN_XSMALL)
+        .map(|_| Fp2SmallGoldilocks::rand(&mut rng))
+        .collect();
+    let src_small: Vec<Fp2SmallGoldilocks> = (0..LEN_SMALL)
+        .map(|_| Fp2SmallGoldilocks::rand(&mut rng))
+        .collect();
+    let src_med: Vec<Fp2SmallGoldilocks> = (0..LEN_MED).map(|_| Fp2SmallGoldilocks::rand(&mut rng)).collect();
+    let src_large: Vec<Fp2SmallGoldilocks> = (0..LEN_LARGE)
+        .map(|_| Fp2SmallGoldilocks::rand(&mut rng))
+        .collect();
+    let src_xlarge: Vec<Fp2SmallGoldilocks> = (0..LEN_XLARGE)
+        .map(|_| Fp2SmallGoldilocks::rand(&mut rng))
+        .collect();
+
+    let challenge_ef = Fp2SmallGoldilocks::from(7); // fixed challenge, shared by every benchmark below
+
+    // Goldilocks-specific reduce_ef: expected to be faster than the pairwise baseline benchmarked further down
+    c.bench_function("reduce_ef::goldilocks::reduce_1K", |b| {
+        b.iter(|| {
+            let mut v = src_xsmall.clone(); // fresh copy per iteration; the clone sits inside the measured closure, so its cost is included
+            reduce_ef(black_box(&mut v), challenge_ef);
+        });
+    });
+
+    c.bench_function("reduce_ef::goldilocks::reduce_16K", |b| {
+        b.iter(|| {
+            let mut v = src_small.clone();
+            reduce_ef(black_box(&mut v), challenge_ef);
+        });
+    });
+
+    c.bench_function("reduce_ef::goldilocks::reduce_64K", |b| {
+        b.iter(|| {
+            let mut v = src_med.clone();
+            reduce_ef(black_box(&mut v), challenge_ef);
+        });
+    });
+
+    c.bench_function("reduce_ef::goldilocks::reduce_256K", |b| {
+        b.iter(|| {
+            let mut v = src_large.clone();
+            reduce_ef(black_box(&mut v), challenge_ef);
+        });
+    });
+
+    c.bench_function("reduce_ef::goldilocks::reduce_1M", |b| {
+        b.iter(|| {
+            let mut v = src_xlarge.clone();
+            reduce_ef(black_box(&mut v), challenge_ef);
+        });
+    });
+    // Baseline: pairwise::reduce_evaluations on the same inputs and challenge
+    c.bench_function("ef_pairwise::reduce_1K", |b| { // NOTE(review): confirm these "ef_pairwise::*" names do not clash with benchmark names used elsewhere in this file
+        b.iter(|| {
+            let mut v = src_xsmall.clone();
+            pairwise::reduce_evaluations(black_box(&mut v), challenge_ef);
+        });
+    });
+
+    c.bench_function("ef_pairwise::reduce_16K", |b| {
+        b.iter(|| {
+            let mut v = src_small.clone();
+            pairwise::reduce_evaluations(black_box(&mut v), challenge_ef);
+        });
+    });
+
+    c.bench_function("ef_pairwise::reduce_64K", |b| {
+        b.iter(|| {
+            let mut v = src_med.clone();
+            pairwise::reduce_evaluations(black_box(&mut v), challenge_ef);
+        });
+    });
+
+    c.bench_function("ef_pairwise::reduce_256K", |b| {
+        b.iter(|| {
+            let mut v = src_large.clone();
+            pairwise::reduce_evaluations(black_box(&mut v), challenge_ef);
+        });
+    });
+
+    c.bench_function("ef_pairwise::reduce_1M", |b| {
+        b.iter(|| {
+            let mut v = src_xlarge.clone();
+            pairwise::reduce_evaluations(black_box(&mut v), challenge_ef);
+        });
+    });
+}
+
 fn bench_reduce_bf(c: &mut Criterion) {
     const LEN_XSMALL: usize = 1 << 10; // 1K
     const LEN_SMALL: usize = 1 << 14; // 16K
@@ -476,5 +577,6 @@ criterion_group!(
     bench_evaluate_bf,
     bench_evaluate_ef,
     bench_reduce_ef,
+    bench_reduce_ef_goldilocks
 );
 criterion_main!(benches);
diff --git a/src/experimental/goldilocks/arithmetic/add.rs b/src/experimental/goldilocks/arithmetic/add.rs
@@ -0,0 +1,57 @@
+use ark_std::simd::{cmp::SimdPartialOrd, LaneCount, Simd, SupportedLaneCount};
+use super::super::{MODULUS, EPSILON};
+use crate::experimental::goldilocks::utils::{assume, branch_hint};
+
+// Scalar Goldilocks addition on raw u64 values; adapted from https://github.com/zhenfeizhang/Goldilocks/blob/872114997b82d0157e29a702992a3bd2023aa7ba/src/primefield/fp.rs#L377
+#[inline(always)]
+pub fn add(a: u64, b: u64) -> u64 {
+    let (sum, over) = a.overflowing_add(b); // wrapped a + b, plus a flag for whether it passed 2^64
+    let (mut sum, over) = sum.overflowing_add((over as u64) * EPSILON); // branchless: adds EPSILON (= 2^64 - MODULUS) only when the first add wrapped
+    if over {
+        // NB: a > MODULUS && b > MODULUS is necessary but not sufficient for double-overflow.
+        // This assume does two things:
+        //  1. If the compiler knows that either a or b <= MODULUS, then it can skip this check.
+        //  2. Hints to the compiler how rare this double-overflow is (thus handled better with a branch).
+        assume(a > MODULUS && b > MODULUS); // NOTE(review): `assume` lives in utils and is not shown here — confirm its contract when the condition is false
+        branch_hint(); // presumably marks this path as unlikely — verify in utils
+        sum += EPSILON; // Cannot overflow.
+    }
+    sum // no comparison against MODULUS is made, so the returned value may be >= MODULUS (e.g. add(MODULUS - 1, 1) returns MODULUS)
+}
+/// Lane-wise Goldilocks addition of `a` and `b`, finishing with one conditional subtraction of MODULUS per lane.
+#[inline(always)]
+pub fn add_v<const LANES: usize>(a: &Simd<u64, LANES>, b: &Simd<u64, LANES>) -> Simd<u64, LANES>
+where
+    LaneCount<LANES>: SupportedLaneCount,
+{
+    let modulus = Simd::<u64, LANES>::splat(MODULUS);
+    let epsilon = Simd::<u64, LANES>::splat(EPSILON);
+    let sum = a + b; // 1. Lane-wise add; the overflow check below relies on this wrapping modulo 2^64
+
+    // 2. Detect where overflow occurred (a + b >= 2^64)
+    // For unsigned lanes, a wrapped sum is strictly smaller than either input, so comparing against `a` suffices.
+    let overflow_mask = sum.simd_lt(*a);
+
+    // 3. Add epsilon (= 2^64 - MODULUS) to lanes that overflowed
+    let mut res = overflow_mask.select(sum + epsilon, sum); // NOTE(review): unlike scalar `add`, a second wrap from `sum + epsilon` is not handled — assumes inputs are < MODULUS, TODO confirm
+
+    // 4. Final reduction: if res >= modulus { res - modulus }
+    res = res.simd_ge(modulus).select(res - modulus, res);
+
+    res
+}
+
+
+#[cfg(test)]
+mod tests {
+    use super::add_v;
+    use ark_std::simd::Simd;
+
+    #[test]
+    fn sanity() {
+        let a: [u64; 1] = [9];
+        let b: [u64; 1] = [7];
+        let sum = add_v(&Simd::from_array(a), &Simd::from_array(b)); // single-lane vectors
+        assert_eq!(sum[0], 16); // 9 + 7: smoke test only; neither the overflow path nor the MODULUS subtraction is exercised
+    }
+}
diff --git a/src/experimental/goldilocks/arithmetic/mod.rs b/src/experimental/goldilocks/arithmetic/mod.rs
@@ -0,0 +1,3 @@
+pub mod add;
+pub mod mul;
+pub mod sub;
diff --git a/src/experimental/goldilocks/arithmetic/mul.rs b/src/experimental/goldilocks/arithmetic/mul.rs
@@ -0,0 +1,142 @@
+use std::simd::{Mask};
+
+use ark_std::{
+    mem,
+    simd::{cmp::SimdPartialOrd, LaneCount, Simd, SupportedLaneCount},
+};
+
+use crate::tests::SmallGoldilocks;
+use super::super::{MODULUS, EPSILON};
+/// Multiplies `a` and `b` by reinterpreting each u64 as a `SmallGoldilocks`, multiplying in that type, and reinterpreting the product as a u64.
+pub fn mul(a: u64, b: u64) -> u64 {
+    let prod = unsafe { mem::transmute::<u64, SmallGoldilocks>(a) } // NOTE(review): assumes every u64 bit pattern is a valid SmallGoldilocks — confirm against its definition
+        * unsafe { mem::transmute::<u64, SmallGoldilocks>(b) };
+    unsafe { mem::transmute::<SmallGoldilocks, u64>(prod) } // transmute only compiles when both types have the same size; whether the raw bits are the canonical integer is not visible here
+}
+
+/// Lane-wise Goldilocks multiplication: builds each 128-bit product from 32-bit halves, then reduces it modulo MODULUS.
+#[inline(always)]
+pub fn mul_v<const LANES: usize>(
+    a: &Simd<u64, LANES>,
+    b: &Simd<u64, LANES>,
+) -> Simd<u64, LANES>
+where
+    LaneCount<LANES>: SupportedLaneCount,
+{
+    let mask32 = Simd::splat(0xFFFFFFFFu64); // keeps the low 32 bits of a lane
+
+    let a_lo = *a & mask32;
+    let a_hi = *a >> 32;
+    let b_lo = *b & mask32;
+    let b_hi = *b >> 32;
+
+    let lo_lo = a_lo * b_lo; // each partial product multiplies two 32-bit values, so it fits in 64 bits
+    let lo_hi = a_lo * b_hi;
+    let hi_lo = a_hi * b_lo;
+    let hi_hi = a_hi * b_hi;
+
+    let mid = lo_hi + hi_lo; // sum of the cross terms (weight 2^32); this add can wrap
+    let mid_carry = mid.simd_lt(lo_hi).select(Simd::splat(1 << 32), Simd::splat(0)); // a wrap of `mid` is worth 2^96 overall, i.e. 2^32 within the high word
+
+    let mid_lo = mid & mask32;
+    let mid_hi = mid >> 32;
+
+    let x_lo = lo_lo + (mid_lo << 32); // low 64 bits of the 128-bit product
+    let x_lo_carry = x_lo.simd_lt(lo_lo).select(Simd::splat(1), Simd::splat(0)); // carry out of the low word
+    let x_hi = hi_hi + mid_hi + mid_carry + x_lo_carry; // high 64 bits of the 128-bit product
+
+    let x_hi_hi = x_hi >> 32; // product bits 96..128
+    let x_hi_lo = x_hi & mask32; // product bits 64..96
+
+
+    let mut t0 = x_lo - x_hi_hi; // 2^96 is congruent to -1 (mod MODULUS), so the top 32 bits are subtracted
+    let borrow_mask = x_lo.simd_lt(x_hi_hi);
+    t0 = borrow_mask.select(t0 - Simd::splat(EPSILON), t0); // after a borrow t0 is 2^64 too large; 2^64 is congruent to EPSILON, so subtract EPSILON
+
+
+    let t1 = x_hi_lo * Simd::splat(EPSILON); // bits 64..96 contribute x_hi_lo * 2^64, congruent to x_hi_lo * EPSILON; both factors are below 2^32
+
+    let (t2_wrapped, carry) = overflowing_add_simd(t0, t1);
+    let mut r = t2_wrapped + (carry.select(Simd::splat(EPSILON), Simd::splat(0))); // fold a carry (worth 2^64) back in as EPSILON
+
+    let p = Simd::splat(MODULUS);
+    r = r.simd_ge(p).select(r - p, r); // single conditional subtraction of MODULUS
+
+    r
+}
+
+/// Lane-wise add that returns the sum together with a mask of the lanes where the sum compares below `a` (i.e. where the add wrapped).
+#[inline(always)]
+fn overflowing_add_simd<const LANES: usize>(
+    a: Simd<u64, LANES>, 
+    b: Simd<u64, LANES>
+) -> (Simd<u64, LANES>, Mask<i64, LANES>) 
+where LaneCount<LANES>: SupportedLaneCount 
+{
+    let res = a + b; // the carry test below relies on this add wrapping modulo 2^64
+    (res, res.simd_lt(a)) // for unsigned lanes, res < a exactly when the add wrapped
+}
+
+
+#[cfg(test)]
+mod tests {
+    use crate::experimental::goldilocks::MODULUS;
+
+    use super::mul_v;
+    use ark_std::{rand::RngCore, simd::Simd, test_rng};
+
+     #[test]
+    fn single() {        
+        // Single fixed input pair; see https://asecuritysite.com/zk/go_plonk4
+
+        let a_input: [u64; 1] = [10719222850664546238];
+        let b_input: [u64; 1] = [301075827032876239];
+
+        // 1. Expected value: widen to u128, multiply, reduce with `%`
+        let expected = ((a_input[0] as u128 * b_input[0] as u128) % MODULUS as u128) as u64;
+
+        // 2. Received value: the same product computed by mul_v on one-lane vectors
+        const LANES: usize = 1;
+        let a_simd = Simd::<u64, LANES>::from_slice(&a_input);
+        let b_simd = Simd::<u64, LANES>::from_slice(&b_input);
+        let res_simd = mul_v(&a_simd, &b_simd);
+        let received = res_simd.as_array()[0];
+
+        println!("Expected: {}, Received: {}", expected, received);
+        assert_eq!(expected, received);
+    }
+
+    #[test]
+    fn sanity() {
+        const LEN: usize = 1 << 20;
+        let mut rng = test_rng();
+
+        // random inputs: multipliers are reduced below MODULUS, the left operands are arbitrary u64 values
+        let multipliers: Vec<u64> = (0..LEN).map(|_| rng.next_u64() % MODULUS).collect();
+        let mut expected_ef: Vec<u64> = (0..LEN).map(|_| rng.next_u64()).collect();
+
+        let mut received_ef = expected_ef.clone();
+
+        // control: reference result via u128 multiplication and `%`
+        expected_ef
+            .iter_mut()
+            .zip(multipliers.iter())
+            .for_each(|(a, b)| {
+                let prod = (*a as u128) * (*b as u128);
+                *a = (prod % MODULUS as u128) as u64;
+            });
+
+        // path under test: mul_v applied LANES elements at a time, in place
+        const LANES: usize = 16;
+        for (a_chunk, b_chunk) in received_ef.chunks_mut(LANES).zip(multipliers.chunks(LANES)) { // LEN is a multiple of LANES, so every chunk is full
+            let a_simd = Simd::<u64, LANES>::from_slice(a_chunk);
+            let b_simd = Simd::<u64, LANES>::from_slice(b_chunk);
+            // perform op
+            let res = mul_v(&a_simd, &b_simd);
+            // write back into slice
+            a_chunk.copy_from_slice(res.as_array());
+        }
+
+        assert_eq!(expected_ef, received_ef);
+    }
+}
diff --git a/src/experimental/goldilocks/arithmetic/sub.rs b/src/experimental/goldilocks/arithmetic/sub.rs
@@ -0,0 +1,47 @@
+use ark_std::simd::{cmp::SimdPartialOrd, LaneCount, Simd, SupportedLaneCount};
+
+// Scalar Goldilocks subtraction on raw u64 values; adapted from https://github.com/zhenfeizhang/Goldilocks/blob/872114997b82d0157e29a702992a3bd2023aa7ba/src/primefield/fp.rs#L424
+#[inline(always)]
+pub fn sub(a: u64, b: u64) -> u64 {
+    let (diff, underflow) = a.overflowing_sub(b); // wrapped a - b, plus a flag for a < b
+    if underflow {
+        // If a < b, the raw diff is (a - b) + 2^64.
+        // Since 2^64 mod p = 2^32 - 1, we subtract (2^32 - 1) to correct it.
+        diff.wrapping_sub(0xFFFFFFFF) // NOTE(review): 0xFFFFFFFF is the same value as EPSILON in constants.rs; a second wrap here is not corrected — assumes a, b < MODULUS, TODO confirm
+    } else {
+        diff
+    }
+}
+/// Lane-wise Goldilocks subtraction `a - b`: lanes where `a < b` are corrected by subtracting 2^32 - 1 from the wrapped difference.
+#[inline(always)]
+pub fn sub_v<const LANES: usize>(a: &Simd<u64, LANES>, b: &Simd<u64, LANES>) -> Simd<u64, LANES>
+where
+    LaneCount<LANES>: SupportedLaneCount,
+{
+    let epsilon = Simd::<u64, LANES>::splat(0xFFFFFFFF); // 2^32 - 1; same value as EPSILON in constants.rs
+
+    // 1. Lane-wise subtraction; step 3 relies on it wrapping modulo 2^64
+    let diff = a - b;
+
+    // 2. Detect underflow (a < b)
+    let underflow_mask = a.simd_lt(*b);
+
+    // 3. If underflowed, we have diff = (a - b) + 2^64.
+    // To get (a - b) mod p, we need (a - b) + (2^64 - 2^32 + 1).
+    // So we subtract (2^32 - 1).
+    underflow_mask.select(diff - epsilon, diff) // NOTE(review): no further reduction is applied — assumes inputs are < MODULUS, TODO confirm
+}
+
+#[cfg(test)]
+mod tests {
+    use super::sub_v;
+    use ark_std::simd::Simd;
+
+    #[test]
+    fn sanity() {
+        let a: [u64; 1] = [9];
+        let b: [u64; 1] = [7];
+        let diff = sub_v(&Simd::from_array(a), &Simd::from_array(b)); // single-lane vectors
+        assert_eq!(diff[0], 2); // 9 - 7: smoke test only; the underflow (a < b) path is not exercised
+    }
+}
diff --git a/src/experimental/goldilocks/constants.rs b/src/experimental/goldilocks/constants.rs
@@ -0,0 +1,5 @@
+/// Goldilocks field modulus: 2^64 - 2^32 + 1
+pub const MODULUS: u64 = 0xffffffff00000001;
+
+/// 2^32 - 1, which equals 2^64 - MODULUS (the value of 2^64 reduced modulo MODULUS)
+pub const EPSILON: u64 = 0xffffffff;