diff --git a/benches/experimental_benches.rs b/benches/experimental_benches.rs
index d0b9e70..a133e3b 100644
--- a/benches/experimental_benches.rs
+++ b/benches/experimental_benches.rs
@@ -7,11 +7,11 @@ use efficient_sumcheck::{
         m31::{
             evaluate_bf::evaluate_bf, evaluate_ef::evaluate_ef, reduce_bf::reduce_bf,
             reduce_ef::reduce_ef, sumcheck,
-        },
+        }
     },
     multilinear::{pairwise, ReduceMode, TimeProver},
     prover::Prover,
-    tests::{BenchStream, Fp4SmallM31, SmallM31, F128},
+    tests::{BenchStream, Fp4SmallM31, SmallM31, F128, SmallGoldilocks, Fp2SmallGoldilocks},
     Sumcheck,
 };
 
@@ -181,6 +181,107 @@ fn bench_reduce_ef(c: &mut Criterion) {
     });
 }
 
+
+fn bench_reduce_ef_goldilocks(c: &mut Criterion) {
+
+    use efficient_sumcheck::experimental::goldilocks::reduce_ef::reduce_ef;
+    const LEN_XSMALL: usize = 1 << 10; // 1K
+    const LEN_SMALL: usize = 1 << 14; // 16K
+    const LEN_MED: usize = 1 << 16; // 64K
+    const LEN_LARGE: usize = 1 << 18; // 256K
+    const LEN_XLARGE: usize = 1 << 20; // 1M
+
+    let mut rng = test_rng();
+
+    // Shared input vector in the base field
+    let src_xsmall: Vec<Fp2SmallGoldilocks> = (0..LEN_XSMALL)
+        .map(|_| Fp2SmallGoldilocks::rand(&mut rng))
+        .collect();
+    let src_small: Vec<Fp2SmallGoldilocks> = (0..LEN_SMALL)
+        .map(|_| Fp2SmallGoldilocks::rand(&mut rng))
+        .collect();
+    let src_med: Vec<Fp2SmallGoldilocks> = (0..LEN_MED).map(|_| Fp2SmallGoldilocks::rand(&mut rng)).collect();
+    let src_large: Vec<Fp2SmallGoldilocks> = (0..LEN_LARGE)
+        .map(|_| Fp2SmallGoldilocks::rand(&mut rng))
+        .collect();
+    let src_xlarge: Vec<Fp2SmallGoldilocks> = (0..LEN_XLARGE)
+        .map(|_| Fp2SmallGoldilocks::rand(&mut rng))
+        .collect();
+
+    let challenge_ef = Fp2SmallGoldilocks::from(7);
+
+    // This should be faster
+    c.bench_function("reduce_ef::goldilocks::reduce_1K", |b| {
+        b.iter(|| {
+            let mut v = src_xsmall.clone();
+            reduce_ef(black_box(&mut v), challenge_ef);
+        });
+    });
+
+    c.bench_function("reduce_ef::goldilocks::reduce_16K", |b| {
+        b.iter(|| {
+            let mut v = src_small.clone();
+            reduce_ef(black_box(&mut v), challenge_ef);
+        });
+    });
+
+    c.bench_function("reduce_ef::goldilocks::reduce_64K", |b| {
+        b.iter(|| {
+            let mut v = src_med.clone();
+            reduce_ef(black_box(&mut v), challenge_ef);
+        });
+    });
+
+    c.bench_function("reduce_ef::goldilocks::reduce_256K", |b| {
+        b.iter(|| {
+            let mut v = src_large.clone();
+            reduce_ef(black_box(&mut v), challenge_ef);
+        });
+    });
+
+    c.bench_function("reduce_ef::goldilocks::reduce_1M", |b| {
+        b.iter(|| {
+            let mut v = src_xlarge.clone();
+            reduce_ef(black_box(&mut v), challenge_ef);
+        });
+    });
+
+    c.bench_function("ef_pairwise::reduce_1K", |b| {
+        b.iter(|| {
+            let mut v = src_xsmall.clone();
+            pairwise::reduce_evaluations(black_box(&mut v), challenge_ef);
+        });
+    });
+
+    c.bench_function("ef_pairwise::reduce_16K", |b| {
+        b.iter(|| {
+            let mut v = src_small.clone();
+            pairwise::reduce_evaluations(black_box(&mut v), challenge_ef);
+        });
+    });
+
+    c.bench_function("ef_pairwise::reduce_64K", |b| {
+        b.iter(|| {
+            let mut v = src_med.clone();
+            pairwise::reduce_evaluations(black_box(&mut v), challenge_ef);
+        });
+    });
+
+    c.bench_function("ef_pairwise::reduce_256K", |b| {
+        b.iter(|| {
+            let mut v = src_large.clone();
+            pairwise::reduce_evaluations(black_box(&mut v), challenge_ef);
+        });
+    });
+
+    c.bench_function("ef_pairwise::reduce_1M", |b| {
+        b.iter(|| {
+            let mut v = src_xlarge.clone();
+            pairwise::reduce_evaluations(black_box(&mut v), challenge_ef);
+        });
+    });
+}
+
 fn bench_reduce_bf(c: &mut Criterion) {
     const LEN_XSMALL: usize = 1 << 10; // 1K
     const LEN_SMALL: usize = 1 << 14; // 16K
@@ -373,6 +474,65 @@ fn bench_evaluate_bf(c: &mut Criterion) {
     });
 }
 
+
+fn bench_evaluate_bf_goldilocks(c: &mut Criterion) {
+    use efficient_sumcheck::experimental::goldilocks::evaluate_bf::evaluate_bf;
+    use efficient_sumcheck::experimental::goldilocks::MODULUS;
+
+    const LEN_XSMALL: usize = 1 << 10; // 1K
+    const LEN_SMALL: usize = 1 << 14; // 16K
+    const LEN_MED: usize = 1 << 16; // 64K
+    const LEN_LARGE: usize = 1 << 18; // 256K
+    const LEN_XLARGE: usize = 1 << 20; // 1M
+
+    let mut rng = test_rng();
+
+    // Shared input vector in the base field
+    let src_xsmall: Vec<SmallGoldilocks> = (0..LEN_XSMALL).map(|_| SmallGoldilocks::rand(&mut rng)).collect();
+    let src_small: Vec<SmallGoldilocks> = (0..LEN_SMALL).map(|_| SmallGoldilocks::rand(&mut rng)).collect();
+    let src_med: Vec<SmallGoldilocks> = (0..LEN_MED).map(|_| SmallGoldilocks::rand(&mut rng)).collect();
+    let src_large: Vec<SmallGoldilocks> = (0..LEN_LARGE).map(|_| SmallGoldilocks::rand(&mut rng)).collect();
+    let src_xlarge: Vec<SmallGoldilocks> = (0..LEN_XLARGE).map(|_| SmallGoldilocks::rand(&mut rng)).collect();
+
+    // This should be faster
+    c.bench_function("evaluate_bf::goldilocks::evaluate_1K", |b| {
+        b.iter(|| {
+            let v = src_xsmall.clone();
+            evaluate_bf::<MODULUS>(black_box(&v));
+        });
+    });
+
+    c.bench_function("evaluate_bf::goldilocks::evaluate_16K", |b| {
+        b.iter(|| {
+            let v = src_small.clone();
+            evaluate_bf::<MODULUS>(black_box(&v));
+        });
+    });
+
+    c.bench_function("evaluate_bf::goldilocks::evaluate_64K", |b| {
+        b.iter(|| {
+            let v = src_med.clone();
+            evaluate_bf::<MODULUS>(black_box(&v));
+        });
+    });
+
+    c.bench_function("evaluate_bf::goldilocks::evaluate_256K", |b| {
+        b.iter(|| {
+            let v = src_large.clone();
+            evaluate_bf::<MODULUS>(black_box(&v));
+        });
+    });
+
+    c.bench_function("evaluate_bf::goldilocks::evaluate_1M", |b| {
+        b.iter(|| {
+            let v = src_xlarge.clone();
+            evaluate_bf::<MODULUS>(black_box(&v));
+        });
+    });
+
+}
+
+
 fn bench_evaluate_ef(c: &mut Criterion) {
     const LEN_XSMALL: usize = 1 << 10; // 1K
     const LEN_SMALL: usize = 1 << 14; // 16K
@@ -469,12 +629,114 @@ fn bench_evaluate_ef(c: &mut Criterion) {
     });
 }
 
+fn bench_evaluate_ef_goldilocks(c: &mut Criterion) {
+    use efficient_sumcheck::experimental::goldilocks::evaluate_ef::evaluate_ef;
+    use efficient_sumcheck::experimental::goldilocks::MODULUS;
+    
+    const LEN_XSMALL: usize = 1 << 10; // 1K
+    const LEN_SMALL: usize = 1 << 14; // 16K
+    const LEN_MED: usize = 1 << 16; // 64K
+    const LEN_LARGE: usize = 1 << 18; // 256K
+    const LEN_XLARGE: usize = 1 << 20; // 1M
+
+    let mut rng = test_rng();
+
+    // Shared input vector in the base field
+    let src_xsmall: Vec<Fp2SmallGoldilocks> = (0..LEN_XSMALL)
+        .map(|_| Fp2SmallGoldilocks::rand(&mut rng))
+        .collect();
+    let src_small: Vec<Fp2SmallGoldilocks> = (0..LEN_SMALL)
+        .map(|_| Fp2SmallGoldilocks::rand(&mut rng))
+        .collect();
+    let src_med: Vec<Fp2SmallGoldilocks> = (0..LEN_MED).map(|_| Fp2SmallGoldilocks::rand(&mut rng)).collect();
+    let src_large: Vec<Fp2SmallGoldilocks> = (0..LEN_LARGE)
+        .map(|_| Fp2SmallGoldilocks::rand(&mut rng))
+        .collect();
+    let src_xlarge: Vec<Fp2SmallGoldilocks> = (0..LEN_XLARGE)
+        .map(|_| Fp2SmallGoldilocks::rand(&mut rng))
+        .collect();
+
+    // This should be faster
+    c.bench_function("evaluate_ef::goldilocks::evaluate_1K", |b| {
+        b.iter(|| {
+            let v = src_xsmall.clone();
+            evaluate_ef::<MODULUS>(black_box(&v));
+        });
+    });
+
+    c.bench_function("evaluate_ef::goldilocks::evaluate_16K", |b| {
+        b.iter(|| {
+            let v = src_small.clone();
+            evaluate_ef::<MODULUS>(black_box(&v));
+        });
+    });
+
+    c.bench_function("evaluate_ef::goldilocks::evaluate_64K", |b| {
+        b.iter(|| {
+            let v = src_med.clone();
+            evaluate_ef::<MODULUS>(black_box(&v));
+        });
+    });
+
+    c.bench_function("evaluate_ef::goldilocks::evaluate_256K", |b| {
+        b.iter(|| {
+            let v = src_large.clone();
+            evaluate_ef::<MODULUS>(black_box(&v));
+        });
+    });
+
+    c.bench_function("evaluate_ef::goldilocks::evaluate_1M", |b| {
+        b.iter(|| {
+            let v = src_xlarge.clone();
+            evaluate_ef::<MODULUS>(black_box(&v));
+        });
+    });
+
+    c.bench_function("pairwise::goldilocks::evaluate_1K", |b| {
+        b.iter(|| {
+            let v = src_xsmall.clone();
+            pairwise::evaluate(black_box(&v));
+        });
+    });
+
+    c.bench_function("pairwise::goldilocks::evaluate_16K", |b| {
+        b.iter(|| {
+            let v = src_small.clone();
+            pairwise::evaluate(black_box(&v));
+        });
+    });
+
+    c.bench_function("pairwise::goldilocks::evaluate_64K", |b| {
+        b.iter(|| {
+            let v = src_med.clone();
+            pairwise::evaluate(black_box(&v));
+        });
+    });
+
+    c.bench_function("pairwise::goldilocks::evaluate_256K", |b| {
+        b.iter(|| {
+            let v = src_large.clone();
+            pairwise::evaluate(black_box(&v));
+        });
+    });
+
+    c.bench_function("pairwise::goldilocks::evaluate_1M", |b| {
+        b.iter(|| {
+            let v = src_xlarge.clone();
+            pairwise::evaluate(black_box(&v));
+        });
+    });
+}
+
 criterion_group!(
     benches,
     bench_reduce_bf,
     bench_sumcheck_time,
     bench_evaluate_bf,
+    bench_evaluate_bf_goldilocks,
     bench_evaluate_ef,
+    bench_evaluate_ef_goldilocks,
     bench_reduce_ef,
+    bench_reduce_ef_goldilocks
 );
 criterion_main!(benches);
diff --git a/src/experimental/goldilocks/arithmetic/add.rs b/src/experimental/goldilocks/arithmetic/add.rs
new file mode 100644
index 0000000..a2686a9
--- /dev/null
+++ b/src/experimental/goldilocks/arithmetic/add.rs
@@ -0,0 +1,57 @@
+use ark_std::simd::{cmp::SimdPartialOrd, LaneCount, Simd, SupportedLaneCount};
+use super::super::{MODULUS, EPSILON};
+use crate::experimental::goldilocks::utils::{assume, branch_hint};
+
+// https://github.com/zhenfeizhang/Goldilocks/blob/872114997b82d0157e29a702992a3bd2023aa7ba/src/primefield/fp.rs#L377
+#[inline(always)]
+pub fn add(a: u64, b: u64) -> u64 {
+    let (sum, over) = a.overflowing_add(b);
+    let (mut sum, over) = sum.overflowing_add((over as u64) * EPSILON);
+    if over {
+        // NB: a > Self::ORDER && b > Self::ORDER is necessary but not sufficient for double-overflow.
+        // This assume does two things:
+        //  1. If compiler knows that either a or b <= ORDER, then it can skip this check.
+        //  2. Hints to the compiler how rare this double-overflow is (thus handled better with a branch).
+        assume(a > MODULUS && b > MODULUS);
+        branch_hint();
+        sum += EPSILON; // Cannot overflow.
+    }
+    sum   
+}
+
+#[inline(always)]
+pub fn add_v<const LANES: usize>(a: &Simd<u64, LANES>, b: &Simd<u64, LANES>) -> Simd<u64, LANES>
+where
+    LaneCount<LANES>: SupportedLaneCount,
+{
+    let modulus = Simd::<u64, LANES>::splat(MODULUS);
+    let epsilon = Simd::<u64, LANES>::splat(EPSILON);
+    let sum = a + b;
+
+    // 2. Detect where overflow occurred (a + b >= 2^64)
+    // In SIMD, if the sum is less than one of the inputs, an overflow happened.
+    let overflow_mask = sum.simd_lt(*a);
+
+    // 3. Add epsilon to lanes that overflowed
+    let mut res = overflow_mask.select(sum + epsilon, sum);
+
+    // 4. Final canonical reduction: if res >= modulus { res - modulus }
+    res = res.simd_ge(modulus).select(res - modulus, res);
+
+    res
+}
+
+
+#[cfg(test)]
+mod tests {
+    use super::add_v;
+    use ark_std::simd::Simd;
+
+    #[test]
+    fn sanity() {
+        let a: [u64; 1] = [9];
+        let b: [u64; 1] = [7];
+        let sum = add_v(&Simd::from_array(a), &Simd::from_array(b));
+        assert_eq!(sum[0], 16);
+    }
+}
diff --git a/src/experimental/goldilocks/arithmetic/mod.rs b/src/experimental/goldilocks/arithmetic/mod.rs
new file mode 100644
index 0000000..3d90c41
--- /dev/null
+++ b/src/experimental/goldilocks/arithmetic/mod.rs
@@ -0,0 +1,3 @@
+pub mod add;
+pub mod mul;
+pub mod sub;
\ No newline at end of file
diff --git a/src/experimental/goldilocks/arithmetic/mul.rs b/src/experimental/goldilocks/arithmetic/mul.rs
new file mode 100644
index 0000000..4f39d18
--- /dev/null
+++ b/src/experimental/goldilocks/arithmetic/mul.rs
@@ -0,0 +1,160 @@
+use std::simd::{Mask};
+
+use ark_std::{
+    mem,
+    simd::{cmp::SimdPartialOrd, LaneCount, Simd, SupportedLaneCount},
+};
+
+use crate::tests::SmallGoldilocks;
+use super::super::{MODULUS, EPSILON};
+
+pub fn mul(a: u64, b: u64) -> u64 {
+    let prod = unsafe { mem::transmute::<u64, SmallGoldilocks>(a) }
+        * unsafe { mem::transmute::<u64, SmallGoldilocks>(b) };
+    unsafe { mem::transmute::<SmallGoldilocks, u64>(prod) }
+}
+
+
+#[inline(always)]
+pub fn mul_v<const LANES: usize>(
+    a: &Simd<u64, LANES>,
+    b: &Simd<u64, LANES>,
+) -> Simd<u64, LANES>
+where
+    LaneCount<LANES>: SupportedLaneCount,
+{
+    let mask32 = Simd::splat(0xFFFFFFFFu64);
+
+    // 32-bit Limb Split
+    let a_lo = *a & mask32;
+    let a_hi = *a >> 32;
+
+    let b_lo = *b & mask32;
+    let b_hi = *b >> 32;
+
+    // lo_lo, alo ​× blo​,   0 through 63
+    // lo_hi, alo​ × bhi​,   32 through 95
+    // hi_lo, ahi​ × blo​,   32 through 95
+    // hi_hi, ahi​ × bhi​,   64 through 127
+    let lo_lo = a_lo * b_lo;
+    let lo_hi = a_lo * b_hi;
+    let hi_lo = a_hi * b_lo;
+    let hi_hi = a_hi * b_hi;
+
+    // Reconstruct 128-bit product (x_hi, x_lo)
+    let mid = lo_hi + hi_lo;
+    let mid_lo = mid & mask32;
+    let mid_hi = mid >> 32;
+
+    // if overflow cause sum is less than its arguemnts
+    let mid_carry = mid.simd_lt(lo_hi).select(Simd::splat(1 << 32), Simd::splat(0));
+
+    // take the absolute bottom product (lo_lo) and add the lower half of your middle sum. Since the middle sum starts at bit 32, you shift mid_lo left by 32 to align it.
+    let x_lo = lo_lo + (mid_lo << 32);
+
+    // if overflow cause sum is less than its arguemnts
+    let x_lo_carry = x_lo.simd_lt(lo_lo).select(Simd::splat(1), Simd::splat(0));
+    
+    // Goldilocks Reduction (Matching your reduce128 logic)
+    // x_hi_hi is the top 32 bits of the 128-bit product
+    // x_hi_lo is the bits 64..96
+    
+    let x_hi = hi_hi + mid_hi + mid_carry + x_lo_carry;
+    let x_hi_hi = x_hi >> 32;
+    let x_hi_lo = x_hi & mask32;
+
+    // Step A: t0 = x_lo - x_hi_hi
+    let mut t0 = x_lo - x_hi_hi;
+    let borrow_mask = x_lo.simd_lt(x_hi_hi);
+    // If borrow, subtract EPSILON (which is equivalent to adding 2^64 - EPSILON)
+    t0 = borrow_mask.select(t0 - Simd::splat(EPSILON), t0);
+
+    // Step B: t1 = x_hi_lo * EPSILON
+    let t1 = x_hi_lo * Simd::splat(EPSILON);
+
+    // Step C: t2 = t0 + t1 (mod 2^64)
+    let (t2_wrapped, carry) = overflowing_add_simd(t0, t1);
+    let mut r = t2_wrapped + (carry.select(Simd::splat(EPSILON), Simd::splat(0)));
+
+    // Final Canonicalization
+    let p = Simd::splat(MODULUS);
+    r = r.simd_ge(p).select(r - p, r);
+
+    r
+}
+
+/// Helper for overflowing add in SIMD
+#[inline(always)]
+fn overflowing_add_simd<const LANES: usize>(
+    a: Simd<u64, LANES>, 
+    b: Simd<u64, LANES>
+) -> (Simd<u64, LANES>, Mask<i64, LANES>) 
+where LaneCount<LANES>: SupportedLaneCount 
+{
+    let res = a + b;
+    (res, res.simd_lt(a))
+}
+
+
+#[cfg(test)]
+mod tests {
+    use crate::experimental::goldilocks::MODULUS;
+
+    use super::mul_v;
+    use ark_std::{rand::RngCore, simd::Simd, test_rng};
+
+     #[test]
+    fn single() {        
+        // https://asecuritysite.com/zk/go_plonk4
+
+        let a_input: [u64; 1] = [10719222850664546238];
+        let b_input: [u64; 1] = [301075827032876239];
+
+        // 1. Calculate Expected using u128
+        let expected = ((a_input[0] as u128 * b_input[0] as u128) % MODULUS as u128) as u64;
+
+        // 2. Calculate Received using your mul_v
+        const LANES: usize = 1;
+        let a_simd = Simd::<u64, LANES>::from_slice(&a_input);
+        let b_simd = Simd::<u64, LANES>::from_slice(&b_input);
+        let res_simd = mul_v(&a_simd, &b_simd);
+        let received = res_simd.as_array()[0];
+
+        println!("Expected: {}, Received: {}", expected, received);
+        assert_eq!(expected, received);
+    }
+
+    #[test]
+    fn sanity() {
+        const LEN: usize = 1 << 20;
+        let mut rng = test_rng();
+
+        // random elements
+        let multipliers: Vec<u64> = (0..LEN).map(|_| rng.next_u64() % MODULUS).collect();
+        let mut expected_ef: Vec<u64> = (0..LEN).map(|_| rng.next_u64()).collect();
+
+        let mut received_ef = expected_ef.clone();
+
+        // control
+        expected_ef
+            .iter_mut()
+            .zip(multipliers.iter())
+            .for_each(|(a, b)| {
+                let prod = (*a as u128) * (*b as u128);
+                *a = (prod % MODULUS as u128) as u64;
+            });
+
+        
+        const LANES: usize = 16;
+        for (a_chunk, b_chunk) in received_ef.chunks_mut(LANES).zip(multipliers.chunks(LANES)) {
+            let a_simd = Simd::<u64, LANES>::from_slice(a_chunk);
+            let b_simd = Simd::<u64, LANES>::from_slice(b_chunk);
+            // perfom op
+            let res = mul_v(&a_simd, &b_simd);
+            // write back into slice
+            a_chunk.copy_from_slice(res.as_array());
+        }
+
+        assert_eq!(expected_ef, received_ef);
+    }
+}
diff --git a/src/experimental/goldilocks/arithmetic/sub.rs b/src/experimental/goldilocks/arithmetic/sub.rs
new file mode 100644
index 0000000..3b69109
--- /dev/null
+++ b/src/experimental/goldilocks/arithmetic/sub.rs
@@ -0,0 +1,47 @@
+use ark_std::simd::{cmp::SimdPartialOrd, LaneCount, Simd, SupportedLaneCount};
+
+// https://github.com/zhenfeizhang/Goldilocks/blob/872114997b82d0157e29a702992a3bd2023aa7ba/src/primefield/fp.rs#L424
+#[inline(always)]
+pub fn sub(a: u64, b: u64) -> u64 {
+    let (diff, underflow) = a.overflowing_sub(b);
+    if underflow {
+        // If a < b, the raw diff is (a - b) + 2^64.
+        // Since 2^64 mod p = 2^32 - 1, we subtract (2^32 - 1) to correct it.
+        diff.wrapping_sub(0xFFFFFFFF)
+    } else {
+        diff
+    }
+}
+
+#[inline(always)]
+pub fn sub_v<const LANES: usize>(a: &Simd<u64, LANES>, b: &Simd<u64, LANES>) -> Simd<u64, LANES>
+where
+    LaneCount<LANES>: SupportedLaneCount,
+{
+    let epsilon = Simd::<u64, LANES>::splat(0xFFFFFFFF);
+    
+    // 1. Standard wrapping subtraction
+    let diff = a - b;
+    
+    // 2. Detect underflow (a < b)
+    let underflow_mask = a.simd_lt(*b);
+    
+    // 3. If underflowed, we have diff = (a - b) + 2^64.
+    // To get (a - b) mod p, we need (a - b) + (2^64 - 2^32 + 1).
+    // So we subtract (2^32 - 1).
+    underflow_mask.select(diff - epsilon, diff)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::sub_v;
+    use ark_std::simd::Simd;
+
+    #[test]
+    fn sanity() {
+        let a: [u64; 1] = [9];
+        let b: [u64; 1] = [7];
+        let diff = sub_v(&Simd::from_array(a), &Simd::from_array(b));
+        assert_eq!(diff[0], 2);
+    }
+}
diff --git a/src/experimental/goldilocks/constants.rs b/src/experimental/goldilocks/constants.rs
new file mode 100644
index 0000000..45adb4b
--- /dev/null
+++ b/src/experimental/goldilocks/constants.rs
@@ -0,0 +1,5 @@
+/// 2^64 - 2^32 + 1
+pub const MODULUS: u64 = 0xffffffff00000001;
+
+/// 2^32 - 1
+pub const EPSILON: u64 = 0xffffffff;
\ No newline at end of file
diff --git a/src/experimental/goldilocks/evaluate_bf.rs b/src/experimental/goldilocks/evaluate_bf.rs
new file mode 100644
index 0000000..32f98df
--- /dev/null
+++ b/src/experimental/goldilocks/evaluate_bf.rs
@@ -0,0 +1,114 @@
+use std::simd::{LaneCount, SupportedLaneCount};
+
+use ark_std::{mem, simd::Simd};
+
+#[cfg(feature = "parallel")]
+use rayon::{iter::ParallelIterator, prelude::ParallelSlice};
+
+use crate::{
+    experimental::goldilocks::arithmetic::add::{add, add_v},
+    tests::SmallGoldilocks,
+};
+
+#[inline(always)]
+fn sum_v<const LANES: usize>(src: &[u64]) -> (u64, u64)
+where
+    LaneCount<LANES>: SupportedLaneCount,
+{
+    let mut acc0 = Simd::<u64, LANES>::splat(0);
+    let mut acc1 = Simd::<u64, LANES>::splat(0);
+    let mut acc2 = Simd::<u64, LANES>::splat(0);
+    let mut acc3 = Simd::<u64, LANES>::splat(0);
+
+    let chunk_size = 4 * LANES;
+    for i in (0..src.len()).step_by(chunk_size) {
+
+        acc0 = add_v(
+            &acc0, 
+            &Simd::<u64, LANES>::from_slice(&src[i..i + LANES]));
+
+        acc1 = add_v(
+            &acc1,
+            &Simd::<u64, LANES>::from_slice(&src[i + LANES..i + 2 * LANES]),
+        );
+
+        acc2 = add_v(
+            &acc2,
+            &Simd::<u64, LANES>::from_slice(&src[i + 2 * LANES..i + 3 * LANES]),
+        );
+        
+        acc3 = add_v(
+            &acc3,
+            &Simd::<u64, LANES>::from_slice(&src[i + 3 * LANES..i + 4 * LANES]),
+        );
+    }
+
+    #[inline(always)]
+    fn sum_indexwise<const LANES: usize>(accs: &[[u64; LANES]]) -> (u64, u64) {
+        let mut even_sum = 0u64;
+        let mut odd_sum = 0u64;
+        for i in (0..LANES).step_by(2) {
+            for acc in accs {
+                even_sum = add(even_sum, acc[i]);
+                odd_sum = add(odd_sum, acc[i + 1]);
+            }
+        }
+        (even_sum, odd_sum)
+    }
+
+    sum_indexwise(&[
+        acc0.to_array(),
+        acc1.to_array(),
+        acc2.to_array(),
+        acc3.to_array(),
+    ])
+}
+
+pub fn evaluate_bf<const MODULUS: u64>(src: &[SmallGoldilocks]) -> (SmallGoldilocks, SmallGoldilocks) {
+    const CHUNK_SIZE: usize = 32_768;
+    let sums = src
+        .par_chunks(CHUNK_SIZE)
+        .map(|chunk| {
+            let chunk_raw: &[u64] = unsafe { core::slice::from_raw_parts(chunk.as_ptr() as *const u64, chunk.len()) };
+            sum_v::<16>(chunk_raw)
+        })
+        .reduce(
+            || (0, 0),
+            |(e1, o1), (e2, o2)| {
+                (add(e1, e2), add(o1, o2))
+            },
+        );
+
+    let (sum_0_u64, sum_1_u64) = sums;
+    let sum_0: SmallGoldilocks = unsafe { mem::transmute::<u64, SmallGoldilocks>(sum_0_u64) };
+    let sum_1: SmallGoldilocks = unsafe { mem::transmute::<u64, SmallGoldilocks>(sum_1_u64) };
+    (sum_0, sum_1)
+}
+
+#[cfg(test)]
+mod tests {
+    use ark_ff::UniformRand;
+    use ark_std::test_rng;
+    use super::super::MODULUS;
+
+    use super::evaluate_bf;
+    use crate::multilinear::pairwise;
+    use crate::tests::SmallGoldilocks;
+
+    #[test]
+    fn sanity() {
+        // setup
+        const LEN: usize = 4 * 16;
+        let mut rng = test_rng();
+
+        // random elements
+        let src: Vec<SmallGoldilocks> = (0..LEN).map(|_| SmallGoldilocks::rand(&mut rng)).collect();
+
+        // run function
+        let expected = pairwise::evaluate(&src);
+
+        let received = evaluate_bf::<MODULUS>(&src);
+
+        assert_eq!(expected, received);
+    }
+}
\ No newline at end of file
diff --git a/src/experimental/goldilocks/evaluate_ef.rs b/src/experimental/goldilocks/evaluate_ef.rs
new file mode 100644
index 0000000..15c5e2f
--- /dev/null
+++ b/src/experimental/goldilocks/evaluate_ef.rs
@@ -0,0 +1,116 @@
+use std::simd::{LaneCount, SupportedLaneCount};
+
+use ark_std::{mem, simd::Simd, slice};
+#[cfg(feature = "parallel")]
+use rayon::{iter::ParallelIterator, prelude::ParallelSlice};
+use crate::{experimental::goldilocks::arithmetic::add::add_v, tests::Fp2SmallGoldilocks};
+
+#[inline(always)]
+fn sum_v<const LANES: usize>(src: &[u64]) -> ([u64; 2], [u64; 2])
+where
+    LaneCount<LANES>: SupportedLaneCount,
+{
+    let mut acc0 = Simd::<u64, LANES>::splat(0);
+    let mut acc1 = Simd::<u64, LANES>::splat(0);
+
+    // We process 2 * LANES to keep the accumulators independent for ILP
+    let chunk_size = 2 * LANES;
+    let fast_path_end = src.len() - (src.len() % chunk_size);
+    
+    for i in (0..fast_path_end).step_by(chunk_size) {
+        // add_v here is your Goldilocks modular addition
+        acc0 = add_v(&acc0, &Simd::<u64, LANES>::from_slice(&src[i..i + LANES]));
+        acc1 = add_v(&acc1, &Simd::<u64, LANES>::from_slice(&src[i + LANES..i + 2 * LANES]));
+    }
+
+    // This handles the "Horizontal" reduction to separate Even/Odd FP2 elements
+    #[inline(always)]
+    fn sum_fp2wise(accs: &[&[u64]]) -> ([u64; 2], [u64; 2]) {
+        let mut even_sum = Simd::<u64, 2>::splat(0); // [real, imag]
+        let mut odd_sum = Simd::<u64, 2>::splat(0);
+        let mut is_even = true;
+
+        for acc in accs {
+            for j in (0..acc.len()).step_by(2) {
+                let val = Simd::from_slice(&acc[j..j + 2]);
+                if is_even {
+                    even_sum = add_v(&even_sum, &val);
+                } else {
+                    odd_sum = add_v(&odd_sum, &val);
+                }
+                is_even = !is_even;
+            }
+        }
+        (even_sum.to_array(), odd_sum.to_array())
+    }
+
+    sum_fp2wise(&[
+        &acc0.to_array(),
+        &acc1.to_array(),
+        &src[fast_path_end..],
+    ])
+}
+
+
+pub fn evaluate_ef<const MODULUS: u64>(src: &[Fp2SmallGoldilocks]) -> (Fp2SmallGoldilocks, Fp2SmallGoldilocks) {
+    const CHUNK_SIZE: usize = 16_384; 
+    let sums = src
+        .par_chunks(CHUNK_SIZE)
+        .map(|chunk| {
+            // Treat the slice of FP2 elements as a raw slice of u64
+            let chunk_raw: &[u64] = unsafe { 
+                slice::from_raw_parts(chunk.as_ptr() as *const u64, chunk.len() * 2) 
+            };
+            sum_v::<16>(chunk_raw) // Using 16 lanes for AVX-512 (1024-bit) or 8 for AVX2
+        })
+        .reduce(
+            || ([0u64; 2], [0u64; 2]),
+            |(e1, o1), (e2, o2)| {
+                (
+                    add_v(&Simd::from_array(e1), &Simd::from_array(e2)).to_array(),
+                    add_v(&Simd::from_array(o1), &Simd::from_array(o2)).to_array(),
+                )
+            },
+        );
+
+    let sum0: Fp2SmallGoldilocks = unsafe { mem::transmute(sums.0) };
+    let sum1: Fp2SmallGoldilocks = unsafe { mem::transmute(sums.1) };
+    (sum0, sum1)
+}
+
+
+// has error
+#[cfg(test)]
+mod tests {
+    use ark_ff::{Field, UniformRand};
+    use ark_std::test_rng;
+
+    use super::evaluate_ef;
+    use crate::multilinear::pairwise;
+    use crate::tests::{Fp2SmallGoldilocks, SmallGoldilocks};
+    use super::super::{MODULUS};
+
+    #[test]
+    fn sanity() {
+        // setup
+        const LEN: usize = 1 << 20;
+        let mut rng = test_rng();
+
+        // random elements
+        let src_bf: Vec<SmallGoldilocks> = (0..LEN).map(|_| SmallGoldilocks::rand(&mut rng)).collect();
+        let src_ef: Vec<Fp2SmallGoldilocks> = src_bf
+            .clone()
+            .into_iter()
+            .map(Fp2SmallGoldilocks::from_base_prime_field)
+            .collect();
+
+        // run function
+        let expected_bf = pairwise::evaluate(&src_bf);
+        let expected_ef = (
+            Fp2SmallGoldilocks::from_base_prime_field(expected_bf.0),
+            Fp2SmallGoldilocks::from_base_prime_field(expected_bf.1),
+        );
+        let received_ef = evaluate_ef::<MODULUS>(&src_ef);
+        assert_eq!(expected_ef, received_ef);
+    }
+}
\ No newline at end of file
diff --git a/src/experimental/goldilocks/mod.rs b/src/experimental/goldilocks/mod.rs
new file mode 100644
index 0000000..0e2ba94
--- /dev/null
+++ b/src/experimental/goldilocks/mod.rs
@@ -0,0 +1,10 @@
+pub mod arithmetic;
+pub mod constants;
+pub mod evaluate_bf;
+pub mod evaluate_ef;
+pub mod reduce_bf;
+pub mod reduce_ef;
+pub mod sumcheck;
+pub mod utils;
+
+pub use constants::{MODULUS, EPSILON};
\ No newline at end of file
diff --git a/src/experimental/goldilocks/reduce_bf.rs b/src/experimental/goldilocks/reduce_bf.rs
new file mode 100644
index 0000000..eb4b96c
--- /dev/null
+++ b/src/experimental/goldilocks/reduce_bf.rs
@@ -0,0 +1,85 @@
+use ark_std::{cfg_chunks, cfg_into_iter, mem, simd::Simd};
+
+#[cfg(feature = "parallel")]
+use rayon::{
+    iter::{IntoParallelIterator, ParallelIterator},
+    prelude::ParallelSlice,
+};
+
+use crate::{
+    experimental::goldilocks::arithmetic::{
+        add::add,
+        mul::mul_v,
+        sub::{sub, sub_v},
+    },
+    tests::{Fp2SmallGoldilocks, SmallGoldilocks},
+};
+
+
+#[inline(always)]
+fn single_compress(src: &[u64], challenge: &Simd<u64, 2>) -> Fp2SmallGoldilocks {
+    let a = src.first().unwrap();
+    let b = src.get(1).unwrap();
+
+    let b_minus_a = sub(*b, *a);
+
+    let mut tmp = Simd::splat(b_minus_a);
+    tmp = mul_v(&tmp, challenge);
+    let mut raw = *tmp.as_array();
+
+    raw[0] = add(raw[0], *a);
+
+    unsafe { mem::transmute::<[u64; 2], Fp2SmallGoldilocks>(raw) }
+}
+
+pub fn reduce_bf(src: &[SmallGoldilocks], verifier_message: Fp2SmallGoldilocks) -> Vec<Fp2SmallGoldilocks> {
+    let verifier_challenge_raw =
+        unsafe { mem::transmute::<Fp2SmallGoldilocks, [u64; 2]>(verifier_message) };
+    let src_raw: &[u64] =
+        unsafe { core::slice::from_raw_parts(src.as_ptr() as *const u64, src.len()) };
+
+    let verifier_challenge_vector: Simd<u64, 2> = Simd::from_array(verifier_challenge_raw);
+    let out: Vec<Fp2SmallGoldilocks> = cfg_into_iter!(0..src.len() / 2)
+        .map(|i| {
+            let start = 2 * i;
+            let end = start + 2;
+            single_compress(&src_raw[start..end], &verifier_challenge_vector)
+        })
+        .collect();
+    out
+}
+
+
+// has error
+#[cfg(test)]
+mod tests {
+    use ark_ff::{Field, UniformRand};
+    use ark_std::test_rng;
+
+    use crate::experimental::goldilocks::reduce_bf::reduce_bf;
+    use crate::multilinear::pairwise;
+    use crate::tests::{Fp2SmallGoldilocks, SmallGoldilocks};
+
+    #[test]
+    fn sanity() {
+        // setup
+        const LEN: usize = 1 << 20;
+        let mut rng = test_rng();
+
+        // random elements
+        let mut src: Vec<SmallGoldilocks> = (0..LEN).map(|_| SmallGoldilocks::rand(&mut rng)).collect();
+        let src_copy: Vec<SmallGoldilocks> = src.clone();
+        let challenge_bf = SmallGoldilocks::from(7);
+        let challenge_ef = Fp2SmallGoldilocks::from_base_prime_field(challenge_bf);
+
+        // run function
+        pairwise::reduce_evaluations(&mut src, challenge_bf);
+        let expected_ef: Vec<Fp2SmallGoldilocks> = src
+            .into_iter()
+            .map(Fp2SmallGoldilocks::from_base_prime_field)
+            .collect();
+        let received_ef = reduce_bf(&src_copy, challenge_ef);
+
+        assert_eq!(expected_ef, received_ef);
+    }
+}
\ No newline at end of file
diff --git a/src/experimental/goldilocks/reduce_ef.rs b/src/experimental/goldilocks/reduce_ef.rs
new file mode 100644
index 0000000..5c77856
--- /dev/null
+++ b/src/experimental/goldilocks/reduce_ef.rs
@@ -0,0 +1,91 @@
+use ark_std::{cfg_into_iter, mem, simd::Simd};
+
+#[cfg(feature = "parallel")]
+use rayon::iter::{IntoParallelIterator, ParallelIterator};
+
+use crate::{
+    experimental::goldilocks::arithmetic::{add::add, mul::mul},
+    tests::Fp2SmallGoldilocks,
+};
+
+#[inline(always)]
+pub fn mul_fp2_smallgoldilocks(scalar: [u64; 2], b: [u64; 2]) -> [u64; 2] {
+    let [a0, a1] = scalar;
+    let [b0, b1] = b;
+
+    // Standard schoolbook multiplication for extensions:
+    // c0 = a0*b0 + non_residue * a1*b1
+    // c1 = a0*b1 + a1*b0
+    
+    let a0b0 = mul(a0, b0);
+    let a1b1 = mul(a1, b1);
+    let a0b1 = mul(a0, b1);
+    let a1b0 = mul(a1, b0);
+
+    // Goldilocks non-residue is 7
+    let non_residue_times_a1b1 = mul(a1b1, 7);
+
+    let c0 = add(a0b0, non_residue_times_a1b1);
+    let c1 = add(a0b1, a1b0);
+
+    [c0, c1]
+}
+
+pub fn reduce_ef(src: &mut Vec<Fp2SmallGoldilocks>, verifier_message: Fp2SmallGoldilocks) {
+    // will use these in the loop
+    let verifier_message_raw = unsafe { mem::transmute::<Fp2SmallGoldilocks, [u64; 2]>(verifier_message) };
+
+    // generate out
+    let out: Vec<Fp2SmallGoldilocks> = cfg_into_iter!(0..src.len() / 2)
+        .map(|i| {
+            let a = src.get(2 * i).unwrap();
+            let b = src.get((2 * i) + 1).unwrap();
+
+            // (b - a)
+            let b_minus_a = b - a;
+            let b_minus_a_raw = unsafe { mem::transmute::<Fp2SmallGoldilocks, [u64; 2]>(b_minus_a) };
+
+            // verifier_message * (b - a)
+            let tmp0 = mul_fp2_smallgoldilocks(verifier_message_raw, b_minus_a_raw);
+
+            // a + verifier_message * (b - a)
+            let mut tmp1 = unsafe { mem::transmute::<[u64; 2], Fp2SmallGoldilocks>(tmp0) };
+            tmp1 += a;
+
+            tmp1
+        })
+        .collect();
+
+    // write back into src
+    src[..out.len()].copy_from_slice(&out);
+    src.truncate(out.len());
+}
+
+#[cfg(test)]
+mod tests {
+    use ark_ff::UniformRand;
+    use ark_std::test_rng;
+
+    use crate::experimental::goldilocks::reduce_ef::reduce_ef;
+    use crate::multilinear::pairwise;
+    use crate::tests::Fp2SmallGoldilocks;
+
+    #[test]
+    fn sanity() {
+        // setup
+        const LEN: usize = 1 << 20;
+        let mut rng = test_rng();
+
+        // random elements
+        let mut expected_ef: Vec<Fp2SmallGoldilocks> =
+            (0..LEN).map(|_| Fp2SmallGoldilocks::rand(&mut rng)).collect();
+        let mut received_ef: Vec<Fp2SmallGoldilocks> = expected_ef.clone();
+        let challenge_ef = Fp2SmallGoldilocks::from(7);
+
+        // run function
+        pairwise::reduce_evaluations(&mut expected_ef, challenge_ef);
+        reduce_ef(&mut received_ef, challenge_ef);
+
+        assert_eq!(expected_ef, received_ef);
+    }
+}
\ No newline at end of file
diff --git a/src/experimental/goldilocks/sumcheck.rs b/src/experimental/goldilocks/sumcheck.rs
new file mode 100644
index 0000000..0b45897
--- /dev/null
+++ b/src/experimental/goldilocks/sumcheck.rs
@@ -0,0 +1,147 @@
+use ark_ff::Field;
+
+use crate::experimental::fiat_shamir::FiatShamir;
+// Import the Goldilocks versions of your SIMD functions
+use crate::experimental::goldilocks::{
+    evaluate_bf::evaluate_bf, evaluate_ef::evaluate_ef, 
+    reduce_bf::reduce_bf, reduce_ef::reduce_ef,
+};
+use crate::multilinear::pairwise;
+// Swap types to Goldilocks
+use crate::tests::{Fp2SmallGoldilocks, SmallGoldilocks};
+use crate::Sumcheck;
+use super::MODULUS;
+
+pub fn prove(
+    evals: &[SmallGoldilocks], 
+    fs: &mut impl FiatShamir<Fp2SmallGoldilocks>
+) -> Sumcheck<Fp2SmallGoldilocks> {
+    let len = evals.len();
+    assert!(len.count_ones() == 1, "evals len must be power of 2");
+    let num_vars = len.trailing_zeros();
+    let mut prover_messages = vec![];
+    let mut verifier_messages = vec![];
+    let mut new_evals = vec![];
+
+    for i in 0..num_vars {
+        if i == 0 {
+            // 1. Evaluate Base Field (Goldilocks 2^64 - 2^32 + 1)
+            // Use the Goldilocks modulus constant
+            let sums: (SmallGoldilocks, SmallGoldilocks) = evaluate_bf::<MODULUS>(evals);
+            
+            // 2. Promote to Extension Field (Fp2)
+            let (sum_0, sum_1) = (
+                Fp2SmallGoldilocks::from_base_prime_field(sums.0),
+                Fp2SmallGoldilocks::from_base_prime_field(sums.1),
+            );
+            prover_messages.push((sum_0, sum_1));
+            
+            // 3. Fiat-Shamir
+            fs.absorb(sum_0);
+            fs.absorb(sum_1);
+            let verifier_message = fs.squeeze();
+            verifier_messages.push(verifier_message);
+            
+            // 4. Reduce Base Field to Extension Field
+            // This reduction takes base field elements and a challenge in Fp2
+            new_evals = reduce_bf(evals, verifier_message);
+        } else {
+            // Evaluate Extension Field
+            let sums = if i < num_vars - 1 {
+                evaluate_ef::<MODULUS>(&new_evals)
+            } else {
+                // Last step uses standard pairwise evaluation
+                pairwise::evaluate(&new_evals)
+            };
+
+            prover_messages.push(sums);
+
+            fs.absorb(sums.0);
+            fs.absorb(sums.1);
+            
+            if i != num_vars - 1 {
+                let verifier_message = fs.squeeze();
+                verifier_messages.push(verifier_message);
+                // Reduce Extension Field
+                reduce_ef(&mut new_evals, verifier_message);
+            }
+        }
+    }
+    
+    Sumcheck::<Fp2SmallGoldilocks> {
+        prover_messages,
+        verifier_messages,
+        is_accepted: true,
+    }
+}
+
+
+// has error
+#[cfg(test)]
+mod tests {
+    use super::Sumcheck;
+    use crate::{
+        experimental::{fiat_shamir::BenchFiatShamir, goldilocks::sumcheck::prove},
+        multilinear::{ReduceMode, TimeProver},
+        prover::Prover,
+        tests::{BenchStream, Fp2SmallGoldilocks, SmallGoldilocks},
+    };
+    use ark_ff::Field;
+
+    #[test]
+    fn sanity() {
+        const NUM_VARIABLES: usize = 16;
+
+        // take an evaluation stream
+        let evaluation_stream: BenchStream<SmallGoldilocks> = BenchStream::new(NUM_VARIABLES);
+        let claim = evaluation_stream.claimed_sum;
+
+        // time_prover
+        let mut time_prover =
+            TimeProver::<SmallGoldilocks, BenchStream<SmallGoldilocks>>::new(<TimeProver<
+                SmallGoldilocks,
+                BenchStream<SmallGoldilocks>,
+            > as Prover<SmallGoldilocks>>::ProverConfig::new(
+                claim,
+                NUM_VARIABLES,
+                evaluation_stream.clone(),
+                ReduceMode::Pairwise,
+            ));
+        let time_prover_transcript = Sumcheck::<SmallGoldilocks>::prove::<
+            BenchStream<SmallGoldilocks>,
+            TimeProver<SmallGoldilocks, BenchStream<SmallGoldilocks>>,
+        >(&mut time_prover, &mut ark_std::test_rng());
+
+        // take the same evaluation stream
+        let len = 1 << NUM_VARIABLES;
+        let evals: Vec<SmallGoldilocks> = (0..len).map(|x| SmallGoldilocks::from(x as u32)).collect();
+        let mut fs = BenchFiatShamir::<Fp2SmallGoldilocks, _>::new(ark_std::test_rng());
+        let transcript = prove(&evals, &mut fs);
+
+        // were the challenges the same?
+        let verifier_messages_fp4: Vec<Fp2SmallGoldilocks> = time_prover_transcript
+            .verifier_messages
+            .iter()
+            .map(|a| Fp2SmallGoldilocks::from_base_prime_field(*a))
+            .collect();
+        assert_eq!(
+            verifier_messages_fp4, transcript.verifier_messages,
+            "challenges not same"
+        );
+
+        let prover_messages_fp4: Vec<(Fp2SmallGoldilocks, Fp2SmallGoldilocks)> = time_prover_transcript
+            .prover_messages
+            .iter()
+            .map(|&(a, b)| {
+                (
+                    Fp2SmallGoldilocks::from_base_prime_field(a),
+                    Fp2SmallGoldilocks::from_base_prime_field(b),
+                )
+            })
+            .collect();
+        assert_eq!(
+            prover_messages_fp4, transcript.prover_messages,
+            "prover messages not same"
+        );
+    }
+}
diff --git a/src/experimental/goldilocks/utils.rs b/src/experimental/goldilocks/utils.rs
new file mode 100644
index 0000000..1c42312
--- /dev/null
+++ b/src/experimental/goldilocks/utils.rs
@@ -0,0 +1,27 @@
+use std::hint::unreachable_unchecked;
+use std::arch::asm;
+
+#[inline(always)]
+pub fn assume(p: bool) {
+    debug_assert!(p);
+    if !p {
+        unsafe {
+            unreachable_unchecked();
+        }
+    }
+}
+
+/// Try to force Rust to emit a branch. Example:
+///     if x > 2 {
+///         y = foo();
+///         branch_hint();
+///     } else {
+///         y = bar();
+///     }
+/// This function has no semantics. It is a hint only.
+#[inline(always)]
+pub fn branch_hint() {
+    unsafe {
+        asm!("", options(nomem, nostack, preserves_flags));
+    }
+}
\ No newline at end of file
diff --git a/src/experimental/mod.rs b/src/experimental/mod.rs
index db16a87..dc22bef 100644
--- a/src/experimental/mod.rs
+++ b/src/experimental/mod.rs
@@ -2,3 +2,4 @@ pub mod duplex_sponge;
 
 pub mod fiat_shamir;
 pub mod m31;
+pub mod goldilocks;
\ No newline at end of file
diff --git a/src/tests/fields.rs b/src/tests/fields.rs
index e0eed1c..45f3983 100644
--- a/src/tests/fields.rs
+++ b/src/tests/fields.rs
@@ -1,7 +1,7 @@
 use ark_ff::{
     ark_ff_macros::SmallFpConfig,
     fields::{Fp128, Fp64, MontBackend, MontConfig},
-    BigInt, SmallFp, SmallFpConfig,
+    BigInt, SmallFp,
 };
 use ark_ff::{Fp2, Fp2Config, Fp4, Fp4Config, SqrtPrecomputation};
 #[derive(MontConfig)]
@@ -84,3 +84,27 @@ impl Fp4Config for Fp4SmallM31Config {
 }
 
 pub type Fp4SmallM31 = Fp4<Fp4SmallM31Config>;
+
+
+// SmallGoldilocks extensions
+#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
+pub struct Fp2SmallGoldilocksConfig;
+
+impl Fp2Config for Fp2SmallGoldilocksConfig {
+    type Fp = SmallGoldilocks;
+
+    /// For Goldilocks, 7 is a quadratic non-residue. 
+    /// The extension is formed by SmallGoldilocks[u] / (u^2 - 7)
+    /// Reference: https://github.com/zhenfeizhang/Goldilocks
+    const NONRESIDUE: SmallGoldilocks = SmallGoldilocks::new(7);
+
+    /// Frobenius coefficients are used for computing elements raised to the power of the modulus.
+    /// In a quadratic extension, these are often just [1, -1] or precomputed constants.
+    const FROBENIUS_COEFF_FP2_C1: &'static [SmallGoldilocks] = &[
+        SmallGoldilocks::new(1),
+        // This is typically -1 in the field, or the precomputed Frobenius constant.
+        SmallGoldilocks::new(18446744069414584320), 
+    ];
+}
+
+pub type Fp2SmallGoldilocks = Fp2<Fp2SmallGoldilocksConfig>;
\ No newline at end of file
diff --git a/src/tests/mod.rs b/src/tests/mod.rs
index 49f8e6f..bea25e2 100644
--- a/src/tests/mod.rs
+++ b/src/tests/mod.rs
@@ -5,6 +5,6 @@ pub mod multilinear;
 pub mod multilinear_product;
 pub mod polynomials;
 pub use fields::{
-    Fp2SmallM31, Fp4SmallM31, SmallF16, SmallGoldilocks, SmallM31, F128, F19, F64, M31,
+    Fp2SmallGoldilocks, Fp2SmallM31, Fp4SmallM31, SmallF16, SmallGoldilocks, SmallM31, F128, F19, F64, M31,
 };
 pub use streams::BenchStream;