diff --git a/benches/experimental_benches.rs b/benches/experimental_benches.rs index d0b9e70..a133e3b 100644 --- a/benches/experimental_benches.rs +++ b/benches/experimental_benches.rs @@ -7,11 +7,11 @@ use efficient_sumcheck::{ m31::{ evaluate_bf::evaluate_bf, evaluate_ef::evaluate_ef, reduce_bf::reduce_bf, reduce_ef::reduce_ef, sumcheck, - }, + } }, multilinear::{pairwise, ReduceMode, TimeProver}, prover::Prover, - tests::{BenchStream, Fp4SmallM31, SmallM31, F128}, + tests::{BenchStream, Fp4SmallM31, SmallM31, F128, SmallGoldilocks, Fp2SmallGoldilocks}, Sumcheck, }; @@ -181,6 +181,107 @@ fn bench_reduce_ef(c: &mut Criterion) { }); } + +fn bench_reduce_ef_goldilocks(c: &mut Criterion) { + + use efficient_sumcheck::experimental::goldilocks::reduce_ef::reduce_ef; + const LEN_XSMALL: usize = 1 << 10; // 1K + const LEN_SMALL: usize = 1 << 14; // 16K + const LEN_MED: usize = 1 << 16; // 64K + const LEN_LARGE: usize = 1 << 18; // 256K + const LEN_XLARGE: usize = 1 << 20; // 1M + + let mut rng = test_rng(); + + // Shared input vector in the base field + let src_xsmall: Vec = (0..LEN_XSMALL) + .map(|_| Fp2SmallGoldilocks::rand(&mut rng)) + .collect(); + let src_small: Vec = (0..LEN_SMALL) + .map(|_| Fp2SmallGoldilocks::rand(&mut rng)) + .collect(); + let src_med: Vec = (0..LEN_MED).map(|_| Fp2SmallGoldilocks::rand(&mut rng)).collect(); + let src_large: Vec = (0..LEN_LARGE) + .map(|_| Fp2SmallGoldilocks::rand(&mut rng)) + .collect(); + let src_xlarge: Vec = (0..LEN_XLARGE) + .map(|_| Fp2SmallGoldilocks::rand(&mut rng)) + .collect(); + + let challenge_ef = Fp2SmallGoldilocks::from(7); + + // This should be faster + c.bench_function("reduce_ef::goldilocks::reduce_1K", |b| { + b.iter(|| { + let mut v = src_xsmall.clone(); + reduce_ef(black_box(&mut v), challenge_ef); + }); + }); + + c.bench_function("reduce_ef::goldilocks::reduce_16K", |b| { + b.iter(|| { + let mut v = src_small.clone(); + reduce_ef(black_box(&mut v), challenge_ef); + }); + }); + + c.bench_function("reduce_ef::goldilocks::reduce_64K", |b| { + b.iter(|| { + let mut v = src_med.clone(); + reduce_ef(black_box(&mut v), challenge_ef); + }); + }); + + c.bench_function("reduce_ef::goldilocks::reduce_256K", |b| { + b.iter(|| { + let mut v = src_large.clone(); + reduce_ef(black_box(&mut v), challenge_ef); + }); + }); + + c.bench_function("reduce_ef::goldilocks::reduce_1M", |b| { + b.iter(|| { + let mut v = src_xlarge.clone(); + reduce_ef(black_box(&mut v), challenge_ef); + }); + }); + + c.bench_function("ef_pairwise::reduce_1K", |b| { + b.iter(|| { + let mut v = src_xsmall.clone(); + pairwise::reduce_evaluations(black_box(&mut v), challenge_ef); + }); + }); + + c.bench_function("ef_pairwise::reduce_16K", |b| { + b.iter(|| { + let mut v = src_small.clone(); + pairwise::reduce_evaluations(black_box(&mut v), challenge_ef); + }); + }); + + c.bench_function("ef_pairwise::reduce_64K", |b| { + b.iter(|| { + let mut v = src_med.clone(); + pairwise::reduce_evaluations(black_box(&mut v), challenge_ef); + }); + }); + + c.bench_function("ef_pairwise::reduce_256K", |b| { + b.iter(|| { + let mut v = src_large.clone(); + pairwise::reduce_evaluations(black_box(&mut v), challenge_ef); + }); + }); + + c.bench_function("ef_pairwise::reduce_1M", |b| { + b.iter(|| { + let mut v = src_xlarge.clone(); + pairwise::reduce_evaluations(black_box(&mut v), challenge_ef); + }); + }); +} + fn bench_reduce_bf(c: &mut Criterion) { const LEN_XSMALL: usize = 1 << 10; // 1K const LEN_SMALL: usize = 1 << 14; // 16K @@ -373,6 +474,65 @@ fn bench_evaluate_bf(c: &mut Criterion) { }); } + +fn bench_evaluate_bf_goldilocks(c: &mut Criterion) { + use efficient_sumcheck::experimental::goldilocks::evaluate_bf::evaluate_bf; + use efficient_sumcheck::experimental::goldilocks::MODULUS; + + const LEN_XSMALL: usize = 1 << 10; // 1K + const LEN_SMALL: usize = 1 << 14; // 16K + const LEN_MED: usize = 1 << 16; // 64K + const LEN_LARGE: usize = 1 << 18; // 256K + const LEN_XLARGE: usize = 1 << 20; // 1M + + let mut rng = test_rng(); + + // Shared input vector in the base field + let src_xsmall: Vec = (0..LEN_XSMALL).map(|_| SmallGoldilocks::rand(&mut rng)).collect(); + let src_small: Vec = (0..LEN_SMALL).map(|_| SmallGoldilocks::rand(&mut rng)).collect(); + let src_med: Vec = (0..LEN_MED).map(|_| SmallGoldilocks::rand(&mut rng)).collect(); + let src_large: Vec = (0..LEN_LARGE).map(|_| SmallGoldilocks::rand(&mut rng)).collect(); + let src_xlarge: Vec = (0..LEN_XLARGE).map(|_| SmallGoldilocks::rand(&mut rng)).collect(); + + // This should be faster + c.bench_function("evaluate_bf::goldilocks::evaluate_1K", |b| { + b.iter(|| { + let v = src_xsmall.clone(); + evaluate_bf::(black_box(&v)); + }); + }); + + c.bench_function("evaluate_bf::goldilocks::evaluate_16K", |b| { + b.iter(|| { + let v = src_small.clone(); + evaluate_bf::(black_box(&v)); + }); + }); + + c.bench_function("evaluate_bf::goldilocks::evaluate_64K", |b| { + b.iter(|| { + let v = src_med.clone(); + evaluate_bf::(black_box(&v)); + }); + }); + + c.bench_function("evaluate_bf::goldilocks::evaluate_256K", |b| { + b.iter(|| { + let v = src_large.clone(); + evaluate_bf::(black_box(&v)); + }); + }); + + c.bench_function("evaluate_bf::goldilocks::evaluate_1M", |b| { + b.iter(|| { + let v = src_xlarge.clone(); + evaluate_bf::(black_box(&v)); + }); + }); + +} + + fn bench_evaluate_ef(c: &mut Criterion) { const LEN_XSMALL: usize = 1 << 10; // 1K const LEN_SMALL: usize = 1 << 14; // 16K @@ -469,12 +629,114 @@ fn bench_evaluate_ef(c: &mut Criterion) { }); } +fn bench_evaluate_ef_goldilocks(c: &mut Criterion) { + use efficient_sumcheck::experimental::goldilocks::evaluate_ef::evaluate_ef; + use efficient_sumcheck::experimental::goldilocks::MODULUS; + + const LEN_XSMALL: usize = 1 << 10; // 1K + const LEN_SMALL: usize = 1 << 14; // 16K + const LEN_MED: usize = 1 << 16; // 64K + const LEN_LARGE: usize = 1 << 18; // 256K + const LEN_XLARGE: usize = 1 << 20; // 1M + + let mut rng = test_rng(); + + // Shared input vector in the base field + let src_xsmall: Vec = (0..LEN_XSMALL) + .map(|_| Fp2SmallGoldilocks::rand(&mut rng)) + .collect(); + let src_small: Vec = (0..LEN_SMALL) + .map(|_| Fp2SmallGoldilocks::rand(&mut rng)) + .collect(); + let src_med: Vec = (0..LEN_MED).map(|_| Fp2SmallGoldilocks::rand(&mut rng)).collect(); + let src_large: Vec = (0..LEN_LARGE) + .map(|_| Fp2SmallGoldilocks::rand(&mut rng)) + .collect(); + let src_xlarge: Vec = (0..LEN_XLARGE) + .map(|_| Fp2SmallGoldilocks::rand(&mut rng)) + .collect(); + + // This should be faster + c.bench_function("evaluate_ef::goldilocks::evaluate_1K", |b| { + b.iter(|| { + let v = src_xsmall.clone(); + evaluate_ef::(black_box(&v)); + }); + }); + + c.bench_function("evaluate_ef::goldilocks::evaluate_16K", |b| { + b.iter(|| { + let v = src_small.clone(); + evaluate_ef::(black_box(&v)); + }); + }); + + c.bench_function("evaluate_ef::goldilocks::evaluate_64K", |b| { + b.iter(|| { + let v = src_med.clone(); + evaluate_ef::(black_box(&v)); + }); + }); + + c.bench_function("evaluate_ef::goldilocks::evaluate_256K", |b| { + b.iter(|| { + let v = src_large.clone(); + evaluate_ef::(black_box(&v)); + }); + }); + + c.bench_function("evaluate_ef::goldilocks::evaluate_1M", |b| { + b.iter(|| { + let v = src_xlarge.clone(); + evaluate_ef::(black_box(&v)); + }); + }); + + c.bench_function("pairwise::goldilocks::evaluate_1K", |b| { + b.iter(|| { + let v = src_xsmall.clone(); + pairwise::evaluate(black_box(&v)); + }); + }); + + c.bench_function("pairwise::goldilocks::evaluate_16K", |b| { + b.iter(|| { + let v = src_small.clone(); + pairwise::evaluate(black_box(&v)); + }); + }); + + c.bench_function("pairwise::goldilocks::evaluate_64K", |b| { + b.iter(|| { + let v = src_med.clone(); + pairwise::evaluate(black_box(&v)); + }); + }); + + c.bench_function("pairwise::goldilocks::evaluate_256K", |b| { + b.iter(|| { + let v = src_large.clone(); + pairwise::evaluate(black_box(&v)); + }); + }); + + c.bench_function("pairwise::goldilocks::evaluate_1M", |b| { + b.iter(|| { + let v = src_xlarge.clone(); + pairwise::evaluate(black_box(&v)); + }); + }); +} + criterion_group!( benches, bench_reduce_bf, bench_sumcheck_time, bench_evaluate_bf, + bench_evaluate_bf_goldilocks, bench_evaluate_ef, + bench_evaluate_ef_goldilocks, bench_reduce_ef, + bench_reduce_ef_goldilocks ); criterion_main!(benches); diff --git a/src/experimental/goldilocks/arithmetic/add.rs b/src/experimental/goldilocks/arithmetic/add.rs new file mode 100644 index 0000000..a2686a9 --- /dev/null +++ b/src/experimental/goldilocks/arithmetic/add.rs @@ -0,0 +1,57 @@ +use ark_std::simd::{cmp::SimdPartialOrd, LaneCount, Simd, SupportedLaneCount}; +use super::super::{MODULUS, EPSILON}; +use crate::experimental::goldilocks::utils::{assume, branch_hint}; + +// https://github.com/zhenfeizhang/Goldilocks/blob/872114997b82d0157e29a702992a3bd2023aa7ba/src/primefield/fp.rs#L377 +#[inline(always)] +pub fn add(a: u64, b: u64) -> u64 { + let (sum, over) = a.overflowing_add(b); + let (mut sum, over) = sum.overflowing_add((over as u64) * EPSILON); + if over { + // NB: a > Self::ORDER && b > Self::ORDER is necessary but not sufficient for double-overflow. + // This assume does two things: + // 1. If compiler knows that either a or b <= ORDER, then it can skip this check. + // 2. Hints to the compiler how rare this double-overflow is (thus handled better with a branch). + assume(a > MODULUS && b > MODULUS); + branch_hint(); + sum += EPSILON; // Cannot overflow. + } + sum +} + +#[inline(always)] +pub fn add_v(a: &Simd, b: &Simd) -> Simd +where + LaneCount: SupportedLaneCount, +{ + let modulus = Simd::::splat(MODULUS); + let epsilon = Simd::::splat(EPSILON); + let sum = a + b; + + // 2. Detect where overflow occurred (a + b >= 2^64) + // In SIMD, if the sum is less than one of the inputs, an overflow happened. + let overflow_mask = sum.simd_lt(*a); + + // 3. Add epsilon to lanes that overflowed + let mut res = overflow_mask.select(sum + epsilon, sum); + + // 4. Final canonical reduction: if res >= modulus { res - modulus } + res = res.simd_ge(modulus).select(res - modulus, res); + + res +} + + +#[cfg(test)] +mod tests { + use super::add_v; + use ark_std::simd::Simd; + + #[test] + fn sanity() { + let a: [u64; 1] = [9]; + let b: [u64; 1] = [7]; + let sum = add_v(&Simd::from_array(a), &Simd::from_array(b)); + assert_eq!(sum[0], 16); + } +} diff --git a/src/experimental/goldilocks/arithmetic/mod.rs b/src/experimental/goldilocks/arithmetic/mod.rs new file mode 100644 index 0000000..3d90c41 --- /dev/null +++ b/src/experimental/goldilocks/arithmetic/mod.rs @@ -0,0 +1,3 @@ +pub mod add; +pub mod mul; +pub mod sub; \ No newline at end of file diff --git a/src/experimental/goldilocks/arithmetic/mul.rs b/src/experimental/goldilocks/arithmetic/mul.rs new file mode 100644 index 0000000..4f39d18 --- /dev/null +++ b/src/experimental/goldilocks/arithmetic/mul.rs @@ -0,0 +1,160 @@ +use std::simd::{Mask}; + +use ark_std::{ + mem, + simd::{cmp::SimdPartialOrd, LaneCount, Simd, SupportedLaneCount}, +}; + +use crate::tests::SmallGoldilocks; +use super::super::{MODULUS, EPSILON}; + +pub fn mul(a: u64, b: u64) -> u64 { + let prod = unsafe { mem::transmute::(a) } + * unsafe { mem::transmute::(b) }; + unsafe { mem::transmute::(prod) } +} + + +#[inline(always)] +pub fn mul_v( + a: &Simd, + b: &Simd, +) -> Simd +where + LaneCount: SupportedLaneCount, +{ + let mask32 = Simd::splat(0xFFFFFFFFu64); + + // 32-bit Limb Split + let a_lo = *a & mask32; + let a_hi = *a >> 32; + + let b_lo = *b & mask32; + let b_hi = *b >> 32; + + // lo_lo, alo ​× blo​, 0 through 63 + // lo_hi, alo​ × bhi​, 32 through 95 + // hi_lo, ahi​ × blo​, 32 through 95 + // hi_hi, ahi​ × bhi​, 64 through 127 + let lo_lo = a_lo * b_lo; + let lo_hi = a_lo * b_hi; + let hi_lo = a_hi * b_lo; + let hi_hi = a_hi * b_hi; + + // Reconstruct 128-bit product (x_hi, x_lo) + let mid = lo_hi + hi_lo; + let mid_lo = mid & mask32; + let mid_hi = mid >> 32; + + // if overflow cause sum is less than its arguemnts + let mid_carry = mid.simd_lt(lo_hi).select(Simd::splat(1 << 32), Simd::splat(0)); + + // take the absolute bottom product (lo_lo) and add the lower half of your middle sum. Since the middle sum starts at bit 32, you shift mid_lo left by 32 to align it. + let x_lo = lo_lo + (mid_lo << 32); + + // if overflow cause sum is less than its arguemnts + let x_lo_carry = x_lo.simd_lt(lo_lo).select(Simd::splat(1), Simd::splat(0)); + + // Goldilocks Reduction (Matching your reduce128 logic) + // x_hi_hi is the top 32 bits of the 128-bit product + // x_hi_lo is the bits 64..96 + + let x_hi = hi_hi + mid_hi + mid_carry + x_lo_carry; + let x_hi_hi = x_hi >> 32; + let x_hi_lo = x_hi & mask32; + + // Step A: t0 = x_lo - x_hi_hi + let mut t0 = x_lo - x_hi_hi; + let borrow_mask = x_lo.simd_lt(x_hi_hi); + // If borrow, subtract EPSILON (which is equivalent to adding 2^64 - EPSILON) + t0 = borrow_mask.select(t0 - Simd::splat(EPSILON), t0); + + // Step B: t1 = x_hi_lo * EPSILON + let t1 = x_hi_lo * Simd::splat(EPSILON); + + // Step C: t2 = t0 + t1 (mod 2^64) + let (t2_wrapped, carry) = overflowing_add_simd(t0, t1); + let mut r = t2_wrapped + (carry.select(Simd::splat(EPSILON), Simd::splat(0))); + + // Final Canonicalization + let p = Simd::splat(MODULUS); + r = r.simd_ge(p).select(r - p, r); + + r +} + +/// Helper for overflowing add in SIMD +#[inline(always)] +fn overflowing_add_simd( + a: Simd, + b: Simd +) -> (Simd, Mask) +where LaneCount: SupportedLaneCount +{ + let res = a + b; + (res, res.simd_lt(a)) +} + + +#[cfg(test)] +mod tests { + use crate::experimental::goldilocks::MODULUS; + + use super::mul_v; + use ark_std::{rand::RngCore, simd::Simd, test_rng}; + + #[test] + fn single() { + // https://asecuritysite.com/zk/go_plonk4 + + let a_input: [u64; 1] = [10719222850664546238]; + let b_input: [u64; 1] = [301075827032876239]; + + // 1. Calculate Expected using u128 + let expected = ((a_input[0] as u128 * b_input[0] as u128) % MODULUS as u128) as u64; + + // 2. Calculate Received using your mul_v + const LANES: usize = 1; + let a_simd = Simd::::from_slice(&a_input); + let b_simd = Simd::::from_slice(&b_input); + let res_simd = mul_v(&a_simd, &b_simd); + let received = res_simd.as_array()[0]; + + println!("Expected: {}, Received: {}", expected, received); + assert_eq!(expected, received); + } + + #[test] + fn sanity() { + const LEN: usize = 1 << 20; + let mut rng = test_rng(); + + // random elements + let multipliers: Vec = (0..LEN).map(|_| rng.next_u64() % MODULUS).collect(); + let mut expected_ef: Vec = (0..LEN).map(|_| rng.next_u64()).collect(); + + let mut received_ef = expected_ef.clone(); + + // control + expected_ef + .iter_mut() + .zip(multipliers.iter()) + .for_each(|(a, b)| { + let prod = (*a as u128) * (*b as u128); + *a = (prod % MODULUS as u128) as u64; + }); + + + const LANES: usize = 16; + for (a_chunk, b_chunk) in received_ef.chunks_mut(LANES).zip(multipliers.chunks(LANES)) { + let a_simd = Simd::::from_slice(a_chunk); + let b_simd = Simd::::from_slice(b_chunk); + // perfom op + let res = mul_v(&a_simd, &b_simd); + // write back into slice + a_chunk.copy_from_slice(res.as_array()); + } + + assert_eq!(expected_ef, received_ef); + } +} diff --git a/src/experimental/goldilocks/arithmetic/sub.rs b/src/experimental/goldilocks/arithmetic/sub.rs new file mode 100644 index 0000000..3b69109 --- /dev/null +++ b/src/experimental/goldilocks/arithmetic/sub.rs @@ -0,0 +1,47 @@ +use ark_std::simd::{cmp::SimdPartialOrd, LaneCount, Simd, SupportedLaneCount}; + +// https://github.com/zhenfeizhang/Goldilocks/blob/872114997b82d0157e29a702992a3bd2023aa7ba/src/primefield/fp.rs#L424 +#[inline(always)] +pub fn sub(a: u64, b: u64) -> u64 { + let (diff, underflow) = a.overflowing_sub(b); + if underflow { + // If a < b, the raw diff is (a - b) + 2^64. + // Since 2^64 mod p = 2^32 - 1, we subtract (2^32 - 1) to correct it. + diff.wrapping_sub(0xFFFFFFFF) + } else { + diff + } +} + +#[inline(always)] +pub fn sub_v(a: &Simd, b: &Simd) -> Simd +where + LaneCount: SupportedLaneCount, +{ + let epsilon = Simd::::splat(0xFFFFFFFF); + + // 1. Standard wrapping subtraction + let diff = a - b; + + // 2. Detect underflow (a < b) + let underflow_mask = a.simd_lt(*b); + + // 3. If underflowed, we have diff = (a - b) + 2^64. + // To get (a - b) mod p, we need (a - b) + (2^64 - 2^32 + 1). + // So we subtract (2^32 - 1). + underflow_mask.select(diff - epsilon, diff) +} + +#[cfg(test)] +mod tests { + use super::sub_v; + use ark_std::simd::Simd; + + #[test] + fn sanity() { + let a: [u64; 1] = [9]; + let b: [u64; 1] = [7]; + let diff = sub_v(&Simd::from_array(a), &Simd::from_array(b)); + assert_eq!(diff[0], 2); + } +} diff --git a/src/experimental/goldilocks/constants.rs b/src/experimental/goldilocks/constants.rs new file mode 100644 index 0000000..45adb4b --- /dev/null +++ b/src/experimental/goldilocks/constants.rs @@ -0,0 +1,5 @@ +/// 2^64 - 2^32 + 1 +pub const MODULUS: u64 = 0xffffffff00000001; + +/// 2^32 - 1 +pub const EPSILON: u64 = 0xffffffff; \ No newline at end of file diff --git a/src/experimental/goldilocks/evaluate_bf.rs b/src/experimental/goldilocks/evaluate_bf.rs new file mode 100644 index 0000000..32f98df --- /dev/null +++ b/src/experimental/goldilocks/evaluate_bf.rs @@ -0,0 +1,114 @@ +use std::simd::{LaneCount, SupportedLaneCount}; + +use ark_std::{mem, simd::Simd}; + +#[cfg(feature = "parallel")] +use rayon::{iter::ParallelIterator, prelude::ParallelSlice}; + +use crate::{ + experimental::goldilocks::arithmetic::add::{add, add_v}, + tests::SmallGoldilocks, +}; + +#[inline(always)] +fn sum_v(src: &[u64]) -> (u64, u64) +where + LaneCount: SupportedLaneCount, +{ + let mut acc0 = Simd::::splat(0); + let mut acc1 = Simd::::splat(0); + let mut acc2 = Simd::::splat(0); + let mut acc3 = Simd::::splat(0); + + let chunk_size = 4 * LANES; + for i in (0..src.len()).step_by(chunk_size) { + + acc0 = add_v( + &acc0, + &Simd::::from_slice(&src[i..i + LANES])); + + acc1 = add_v( + &acc1, + &Simd::::from_slice(&src[i + LANES..i + 2 * LANES]), + ); + + acc2 = add_v( + &acc2, + &Simd::::from_slice(&src[i + 2 * LANES..i + 3 * LANES]), + ); + + acc3 = add_v( + &acc3, + &Simd::::from_slice(&src[i + 3 * LANES..i + 4 * LANES]), + ); + } + + #[inline(always)] + fn sum_indexwise(accs: &[[u64; LANES]]) -> (u64, u64) { + let mut even_sum = 0u64; + let mut odd_sum = 0u64; + for i in (0..LANES).step_by(2) { + for acc in accs { + even_sum = add(even_sum, acc[i]); + odd_sum = add(odd_sum, acc[i + 1]); + } + } + (even_sum, odd_sum) + } + + sum_indexwise(&[ + acc0.to_array(), + acc1.to_array(), + acc2.to_array(), + acc3.to_array(), + ]) +} + +pub fn evaluate_bf(src: &[SmallGoldilocks]) -> (SmallGoldilocks, SmallGoldilocks) { + const CHUNK_SIZE: usize = 32_768; + let sums = src + .par_chunks(CHUNK_SIZE) + .map(|chunk| { + let chunk_raw: &[u64] = unsafe { core::slice::from_raw_parts(chunk.as_ptr() as *const u64, chunk.len()) }; + sum_v::<16>(chunk_raw) + }) + .reduce( + || (0, 0), + |(e1, o1), (e2, o2)| { + (add(e1, e2), add(o1, o2)) + }, + ); + + let (sum_0_u64, sum_1_u64) = sums; + let sum_0: SmallGoldilocks = unsafe { mem::transmute::(sum_0_u64) }; + let sum_1: SmallGoldilocks = unsafe { mem::transmute::(sum_1_u64) }; + (sum_0, sum_1) +} + +#[cfg(test)] +mod tests { + use ark_ff::UniformRand; + use ark_std::test_rng; + use super::super::MODULUS; + + use super::evaluate_bf; + use crate::multilinear::pairwise; + use crate::tests::SmallGoldilocks; + + #[test] + fn sanity() { + // setup + const LEN: usize = 4 * 16; + let mut rng = test_rng(); + + // random elements + let src: Vec = (0..LEN).map(|_| SmallGoldilocks::rand(&mut rng)).collect(); + + // run function + let expected = pairwise::evaluate(&src); + + let received = evaluate_bf::(&src); + + assert_eq!(expected, received); + } +} \ No newline at end of file diff --git a/src/experimental/goldilocks/evaluate_ef.rs b/src/experimental/goldilocks/evaluate_ef.rs new file mode 100644 index 0000000..15c5e2f --- /dev/null +++ b/src/experimental/goldilocks/evaluate_ef.rs @@ -0,0 +1,116 @@ +use std::simd::{LaneCount, SupportedLaneCount}; + +use ark_std::{mem, simd::Simd, slice}; +#[cfg(feature = "parallel")] +use rayon::{iter::ParallelIterator, prelude::ParallelSlice}; +use crate::{experimental::goldilocks::arithmetic::add::add_v, tests::Fp2SmallGoldilocks}; + +#[inline(always)] +fn sum_v(src: &[u64]) -> ([u64; 2], [u64; 2]) +where + LaneCount: SupportedLaneCount, +{ + let mut acc0 = Simd::::splat(0); + let mut acc1 = Simd::::splat(0); + + // We process 2 * LANES to keep the accumulators independent for ILP + let chunk_size = 2 * LANES; + let fast_path_end = src.len() - (src.len() % chunk_size); + + for i in (0..fast_path_end).step_by(chunk_size) { + // add_v here is your Goldilocks modular addition + acc0 = add_v(&acc0, &Simd::::from_slice(&src[i..i + LANES])); + acc1 = add_v(&acc1, &Simd::::from_slice(&src[i + LANES..i + 2 * LANES])); + } + + // This handles the "Horizontal" reduction to separate Even/Odd FP2 elements + #[inline(always)] + fn sum_fp2wise(accs: &[&[u64]]) -> ([u64; 2], [u64; 2]) { + let mut even_sum = Simd::::splat(0); // [real, imag] + let mut odd_sum = Simd::::splat(0); + let mut is_even = true; + + for acc in accs { + for j in (0..acc.len()).step_by(2) { + let val = Simd::from_slice(&acc[j..j + 2]); + if is_even { + even_sum = add_v(&even_sum, &val); + } else { + odd_sum = add_v(&odd_sum, &val); + } + is_even = !is_even; + } + } + (even_sum.to_array(), odd_sum.to_array()) + } + + sum_fp2wise(&[ + &acc0.to_array(), + &acc1.to_array(), + &src[fast_path_end..], + ]) +} + + +pub fn evaluate_ef(src: &[Fp2SmallGoldilocks]) -> (Fp2SmallGoldilocks, Fp2SmallGoldilocks) { + const CHUNK_SIZE: usize = 16_384; + let sums = src + .par_chunks(CHUNK_SIZE) + .map(|chunk| { + // Treat the slice of FP2 elements as a raw slice of u64 + let chunk_raw: &[u64] = unsafe { + slice::from_raw_parts(chunk.as_ptr() as *const u64, chunk.len() * 2) + }; + sum_v::<16>(chunk_raw) // Using 16 lanes for AVX-512 (1024-bit) or 8 for AVX2 + }) + .reduce( + || ([0u64; 2], [0u64; 2]), + |(e1, o1), (e2, o2)| { + ( + add_v(&Simd::from_array(e1), &Simd::from_array(e2)).to_array(), + add_v(&Simd::from_array(o1), &Simd::from_array(o2)).to_array(), + ) + }, + ); + + let sum0: Fp2SmallGoldilocks = unsafe { mem::transmute(sums.0) }; + let sum1: Fp2SmallGoldilocks = unsafe { mem::transmute(sums.1) }; + (sum0, sum1) +} + + +// has error +#[cfg(test)] +mod tests { + use ark_ff::{Field, UniformRand}; + use ark_std::test_rng; + + use super::evaluate_ef; + use crate::multilinear::pairwise; + use crate::tests::{Fp2SmallGoldilocks, SmallGoldilocks}; + use super::super::{MODULUS}; + + #[test] + fn sanity() { + // setup + const LEN: usize = 1 << 20; + let mut rng = test_rng(); + + // random elements + let src_bf: Vec = (0..LEN).map(|_| SmallGoldilocks::rand(&mut rng)).collect(); + let src_ef: Vec = src_bf + .clone() + .into_iter() + .map(Fp2SmallGoldilocks::from_base_prime_field) + .collect(); + + // run function + let expected_bf = pairwise::evaluate(&src_bf); + let expected_ef = ( + Fp2SmallGoldilocks::from_base_prime_field(expected_bf.0), + Fp2SmallGoldilocks::from_base_prime_field(expected_bf.1), + ); + let received_ef = evaluate_ef::(&src_ef); + assert_eq!(expected_ef, received_ef); + } +} \ No newline at end of file diff --git a/src/experimental/goldilocks/mod.rs b/src/experimental/goldilocks/mod.rs new file mode 100644 index 0000000..0e2ba94 --- /dev/null +++ b/src/experimental/goldilocks/mod.rs @@ -0,0 +1,10 @@ +pub mod arithmetic; +pub mod constants; +pub mod evaluate_bf; +pub mod evaluate_ef; +pub mod reduce_bf; +pub mod reduce_ef; +pub mod sumcheck; +pub mod utils; + +pub use constants::{MODULUS, EPSILON}; \ No newline at end of file diff --git a/src/experimental/goldilocks/reduce_bf.rs b/src/experimental/goldilocks/reduce_bf.rs new file mode 100644 index 0000000..eb4b96c --- /dev/null +++ b/src/experimental/goldilocks/reduce_bf.rs @@ -0,0 +1,85 @@ +use ark_std::{cfg_chunks, cfg_into_iter, mem, simd::Simd}; + +#[cfg(feature = "parallel")] +use rayon::{ + iter::{IntoParallelIterator, ParallelIterator}, + prelude::ParallelSlice, +}; + +use crate::{ + experimental::goldilocks::arithmetic::{ + add::add, + mul::mul_v, + sub::{sub, sub_v}, + }, + tests::{Fp2SmallGoldilocks, SmallGoldilocks}, +}; + + +#[inline(always)] +fn single_compress(src: &[u64], challenge: &Simd) -> Fp2SmallGoldilocks { + let a = src.first().unwrap(); + let b = src.get(1).unwrap(); + + let b_minus_a = sub(*b, *a); + + let mut tmp = Simd::splat(b_minus_a); + tmp = mul_v(&tmp, challenge); + let mut raw = *tmp.as_array(); + + raw[0] = add(raw[0], *a); + + unsafe { mem::transmute::<[u64; 2], Fp2SmallGoldilocks>(raw) } +} + +pub fn reduce_bf(src: &[SmallGoldilocks], verifier_message: Fp2SmallGoldilocks) -> Vec { + let verifier_challenge_raw = + unsafe { mem::transmute::(verifier_message) }; + let src_raw: &[u64] = + unsafe { core::slice::from_raw_parts(src.as_ptr() as *const u64, src.len()) }; + + let verifier_challenge_vector: Simd = Simd::from_array(verifier_challenge_raw); + let out: Vec = cfg_into_iter!(0..src.len() / 2) + .map(|i| { + let start = 2 * i; + let end = start + 2; + single_compress(&src_raw[start..end], &verifier_challenge_vector) + }) + .collect(); + out +} + + +// has error +#[cfg(test)] +mod tests { + use ark_ff::{Field, UniformRand}; + use ark_std::test_rng; + + use crate::experimental::goldilocks::reduce_bf::reduce_bf; + use crate::multilinear::pairwise; + use crate::tests::{Fp2SmallGoldilocks, SmallGoldilocks}; + + #[test] + fn sanity() { + // setup + const LEN: usize = 1 << 20; + let mut rng = test_rng(); + + // random elements + let mut src: Vec = (0..LEN).map(|_| SmallGoldilocks::rand(&mut rng)).collect(); + let src_copy: Vec = src.clone(); + let challenge_bf = SmallGoldilocks::from(7); + let challenge_ef = Fp2SmallGoldilocks::from_base_prime_field(challenge_bf); + + // run function + pairwise::reduce_evaluations(&mut src, challenge_bf); + let expected_ef: Vec = src + .into_iter() + .map(Fp2SmallGoldilocks::from_base_prime_field) + .collect(); + let received_ef = reduce_bf(&src_copy, challenge_ef); + + assert_eq!(expected_ef, received_ef); + } +} \ No newline at end of file diff --git a/src/experimental/goldilocks/reduce_ef.rs b/src/experimental/goldilocks/reduce_ef.rs new file mode 100644 index 0000000..5c77856 --- /dev/null +++ b/src/experimental/goldilocks/reduce_ef.rs @@ -0,0 +1,91 @@ +use ark_std::{cfg_into_iter, mem, simd::Simd}; + +#[cfg(feature = "parallel")] +use rayon::iter::{IntoParallelIterator, ParallelIterator}; + +use crate::{ + experimental::goldilocks::arithmetic::{add::add, mul::mul}, + tests::Fp2SmallGoldilocks, +}; + +#[inline(always)] +pub fn mul_fp2_smallgoldilocks(scalar: [u64; 2], b: [u64; 2]) -> [u64; 2] { + let [a0, a1] = scalar; + let [b0, b1] = b; + + // Standard schoolbook multiplication for extensions: + // c0 = a0*b0 + non_residue * a1*b1 + // c1 = a0*b1 + a1*b0 + + let a0b0 = mul(a0, b0); + let a1b1 = mul(a1, b1); + let a0b1 = mul(a0, b1); + let a1b0 = mul(a1, b0); + + // Goldilocks non-residue is 7 + let non_residue_times_a1b1 = mul(a1b1, 7); + + let c0 = add(a0b0, non_residue_times_a1b1); + let c1 = add(a0b1, a1b0); + + [c0, c1] +} + +pub fn reduce_ef(src: &mut Vec, verifier_message: Fp2SmallGoldilocks) { + // will use these in the loop + let verifier_message_raw = unsafe { mem::transmute::(verifier_message) }; + + // generate out + let out: Vec = cfg_into_iter!(0..src.len() / 2) + .map(|i| { + let a = src.get(2 * i).unwrap(); + let b = src.get((2 * i) + 1).unwrap(); + + // (b - a) + let b_minus_a = b - a; + let b_minus_a_raw = unsafe { mem::transmute::(b_minus_a) }; + + // verifier_message * (b - a) + let tmp0 = mul_fp2_smallgoldilocks(verifier_message_raw, b_minus_a_raw); + + // a + verifier_message * (b - a) + let mut tmp1 = unsafe { mem::transmute::<[u64; 2], Fp2SmallGoldilocks>(tmp0) }; + tmp1 += a; + + tmp1 + }) + .collect(); + + // write back into src + src[..out.len()].copy_from_slice(&out); + src.truncate(out.len()); +} + +#[cfg(test)] +mod tests { + use ark_ff::UniformRand; + use ark_std::test_rng; + + use crate::experimental::goldilocks::reduce_ef::reduce_ef; + use crate::multilinear::pairwise; + use crate::tests::Fp2SmallGoldilocks; + + #[test] + fn sanity() { + // setup + const LEN: usize = 1 << 20; + let mut rng = test_rng(); + + // random elements + let mut expected_ef: Vec = + (0..LEN).map(|_| Fp2SmallGoldilocks::rand(&mut rng)).collect(); + let mut received_ef: Vec = expected_ef.clone(); + let challenge_ef = Fp2SmallGoldilocks::from(7); + + // run function + pairwise::reduce_evaluations(&mut expected_ef, challenge_ef); + reduce_ef(&mut received_ef, challenge_ef); + + assert_eq!(expected_ef, received_ef); + } +} \ No newline at end of file diff --git a/src/experimental/goldilocks/sumcheck.rs b/src/experimental/goldilocks/sumcheck.rs new file mode 100644 index 0000000..0b45897 --- /dev/null +++ b/src/experimental/goldilocks/sumcheck.rs @@ -0,0 +1,147 @@ +use ark_ff::Field; + +use crate::experimental::fiat_shamir::FiatShamir; +// Import the Goldilocks versions of your SIMD functions +use crate::experimental::goldilocks::{ + evaluate_bf::evaluate_bf, evaluate_ef::evaluate_ef, + reduce_bf::reduce_bf, reduce_ef::reduce_ef, +}; +use crate::multilinear::pairwise; +// Swap types to Goldilocks +use crate::tests::{Fp2SmallGoldilocks, SmallGoldilocks}; +use crate::Sumcheck; +use super::MODULUS; + +pub fn prove( + evals: &[SmallGoldilocks], + fs: &mut impl FiatShamir +) -> Sumcheck { + let len = evals.len(); + assert!(len.count_ones() == 1, "evals len must be power of 2"); + let num_vars = len.trailing_zeros(); + let mut prover_messages = vec![]; + let mut verifier_messages = vec![]; + let mut new_evals = vec![]; + + for i in 0..num_vars { + if i == 0 { + // 1. Evaluate Base Field (Goldilocks 2^64 - 2^32 + 1) + // Use the Goldilocks modulus constant + let sums: (SmallGoldilocks, SmallGoldilocks) = evaluate_bf::(evals); + + // 2. Promote to Extension Field (Fp2) + let (sum_0, sum_1) = ( + Fp2SmallGoldilocks::from_base_prime_field(sums.0), + Fp2SmallGoldilocks::from_base_prime_field(sums.1), + ); + prover_messages.push((sum_0, sum_1)); + + // 3. Fiat-Shamir + fs.absorb(sum_0); + fs.absorb(sum_1); + let verifier_message = fs.squeeze(); + verifier_messages.push(verifier_message); + + // 4. Reduce Base Field to Extension Field + // This reduction takes base field elements and a challenge in Fp2 + new_evals = reduce_bf(evals, verifier_message); + } else { + // Evaluate Extension Field + let sums = if i < num_vars - 1 { + evaluate_ef::(&new_evals) + } else { + // Last step uses standard pairwise evaluation + pairwise::evaluate(&new_evals) + }; + + prover_messages.push(sums); + + fs.absorb(sums.0); + fs.absorb(sums.1); + + if i != num_vars - 1 { + let verifier_message = fs.squeeze(); + verifier_messages.push(verifier_message); + // Reduce Extension Field + reduce_ef(&mut new_evals, verifier_message); + } + } + } + + Sumcheck:: { + prover_messages, + verifier_messages, + is_accepted: true, + } +} + + +// has error +#[cfg(test)] +mod tests { + use super::Sumcheck; + use crate::{ + experimental::{fiat_shamir::BenchFiatShamir, goldilocks::sumcheck::prove}, + multilinear::{ReduceMode, TimeProver}, + prover::Prover, + tests::{BenchStream, Fp2SmallGoldilocks, SmallGoldilocks}, + }; + use ark_ff::Field; + + #[test] + fn sanity() { + const NUM_VARIABLES: usize = 16; + + // take an evaluation stream + let evaluation_stream: BenchStream = BenchStream::new(NUM_VARIABLES); + let claim = evaluation_stream.claimed_sum; + + // time_prover + let mut time_prover = + TimeProver::>::new(, + > as Prover>::ProverConfig::new( + claim, + NUM_VARIABLES, + evaluation_stream.clone(), + ReduceMode::Pairwise, + )); + let time_prover_transcript = Sumcheck::::prove::< + BenchStream, + TimeProver>, + >(&mut time_prover, &mut ark_std::test_rng()); + + // take the same evaluation stream + let len = 1 << NUM_VARIABLES; + let evals: Vec = (0..len).map(|x| SmallGoldilocks::from(x as u32)).collect(); + let mut fs = BenchFiatShamir::::new(ark_std::test_rng()); + let transcript = prove(&evals, &mut fs); + + // were the challenges the same? + let verifier_messages_fp4: Vec = time_prover_transcript + .verifier_messages + .iter() + .map(|a| Fp2SmallGoldilocks::from_base_prime_field(*a)) + .collect(); + assert_eq!( + verifier_messages_fp4, transcript.verifier_messages, + "challenges not same" + ); + + let prover_messages_fp4: Vec<(Fp2SmallGoldilocks, Fp2SmallGoldilocks)> = time_prover_transcript + .prover_messages + .iter() + .map(|&(a, b)| { + ( + Fp2SmallGoldilocks::from_base_prime_field(a), + Fp2SmallGoldilocks::from_base_prime_field(b), + ) + }) + .collect(); + assert_eq!( + prover_messages_fp4, transcript.prover_messages, + "prover messages not same" + ); + } +} diff --git a/src/experimental/goldilocks/utils.rs b/src/experimental/goldilocks/utils.rs new file mode 100644 index 0000000..1c42312 --- /dev/null +++ b/src/experimental/goldilocks/utils.rs @@ -0,0 +1,27 @@ +use std::hint::unreachable_unchecked; +use std::arch::asm; + +#[inline(always)] +pub fn assume(p: bool) { + debug_assert!(p); + if !p { + unsafe { + unreachable_unchecked(); + } + } +} + +/// Try to force Rust to emit a branch. Example: +/// if x > 2 { +/// y = foo(); +/// branch_hint(); +/// } else { +/// y = bar(); +/// } +/// This function has no semantics. It is a hint only. +#[inline(always)] +pub fn branch_hint() { + unsafe { + asm!("", options(nomem, nostack, preserves_flags)); + } +} \ No newline at end of file diff --git a/src/experimental/mod.rs b/src/experimental/mod.rs index db16a87..dc22bef 100644 --- a/src/experimental/mod.rs +++ b/src/experimental/mod.rs @@ -2,3 +2,4 @@ pub mod duplex_sponge; pub mod fiat_shamir; pub mod m31; +pub mod goldilocks; \ No newline at end of file diff --git a/src/tests/fields.rs b/src/tests/fields.rs index e0eed1c..45f3983 100644 --- a/src/tests/fields.rs +++ b/src/tests/fields.rs @@ -1,7 +1,7 @@ use ark_ff::{ ark_ff_macros::SmallFpConfig, fields::{Fp128, Fp64, MontBackend, MontConfig}, - BigInt, SmallFp, SmallFpConfig, + BigInt, SmallFp, }; use ark_ff::{Fp2, Fp2Config, Fp4, Fp4Config, SqrtPrecomputation}; #[derive(MontConfig)] @@ -84,3 +84,27 @@ impl Fp4Config for Fp4SmallM31Config { } pub type Fp4SmallM31 = Fp4; + + +// SmallGoldilocks extensions +#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)] +pub struct Fp2SmallGoldilocksConfig; + +impl Fp2Config for Fp2SmallGoldilocksConfig { + type Fp = SmallGoldilocks; + + /// For Goldilocks, 7 is a quadratic non-residue. + /// The extension is formed by SmallGoldilocks[u] / (u^2 - 7) + /// Reference: https://github.com/zhenfeizhang/Goldilocks + const NONRESIDUE: SmallGoldilocks = SmallGoldilocks::new(7); + + /// Frobenius coefficients are used for computing elements raised to the power of the modulus. + /// In a quadratic extension, these are often just [1, -1] or precomputed constants. + const FROBENIUS_COEFF_FP2_C1: &'static [SmallGoldilocks] = &[ + SmallGoldilocks::new(1), + // This is typically -1 in the field, or the precomputed Frobenius constant. + SmallGoldilocks::new(18446744069414584320), + ]; +} + +pub type Fp2SmallGoldilocks = Fp2; \ No newline at end of file diff --git a/src/tests/mod.rs b/src/tests/mod.rs index 49f8e6f..bea25e2 100644 --- a/src/tests/mod.rs +++ b/src/tests/mod.rs @@ -5,6 +5,6 @@ pub mod multilinear; pub mod multilinear_product; pub mod polynomials; pub use fields::{ - Fp2SmallM31, Fp4SmallM31, SmallF16, SmallGoldilocks, SmallM31, F128, F19, F64, M31, + Fp2SmallGoldilocks, Fp2SmallM31, Fp4SmallM31, SmallF16, SmallGoldilocks, SmallM31, F128, F19, F64, M31, }; pub use streams::BenchStream;