Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@ rayon = { version = "1.10.0" }
thread_local = "1.1.8"
crossbeam-channel = "0.5.15"
ordered-channel = { version = "1.2.0", features = ["crossbeam-channel"] }
fearless_simd = { version = "0.3.0", default-features = false }
fearless_simd = { git = "https://github.com/linebender/fearless_simd", rev = "5999991", default-features = false }

# The below crates are experimental!
vello_api = { path = "sparse_strips/vello_api", default-features = false }
Expand Down
1 change: 1 addition & 0 deletions sparse_strips/vello_bench/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ publish = false
vello_common = { workspace = true }
vello_cpu = { workspace = true }
vello_dev_macros = { workspace = true }
fearless_simd = { workspace = true, features = ["force_support_fallback"] }
criterion = { workspace = true }
parley = { version = "0.5.0", default-features = true }
rand = { workspace = true }
Expand Down
3 changes: 2 additions & 1 deletion sparse_strips/vello_bench/src/data.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
// Copyright 2025 the Vello Authors
// SPDX-License-Identifier: Apache-2.0 OR MIT

use fearless_simd::Fallback;
use std::path::Path;
use std::sync::OnceLock;
use usvg::tiny_skia_path::PathSegment;
Expand Down Expand Up @@ -152,7 +153,7 @@ impl DataItem {
let tiles = self.sorted_tiles();

strip::render(
Level::fallback(),
Level::Fallback(Fallback::new()),
&tiles,
&mut strip_buf,
&mut alpha_buf,
Expand Down
2 changes: 1 addition & 1 deletion sparse_strips/vello_bench/src/glyph.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ pub fn glyph(c: &mut Criterion) {
strip_generator: StripGenerator::new(
WIDTH,
HEIGHT,
Level::try_detect().unwrap_or(Level::fallback()),
Level::try_detect().unwrap_or(Level::baseline()),
),
strip_storage: StripStorage::default(),
glyph_caches: Default::default(),
Expand Down
6 changes: 3 additions & 3 deletions sparse_strips/vello_common/src/encode.rs
Original file line number Diff line number Diff line change
Expand Up @@ -954,7 +954,7 @@ impl FromF32Color for u8 {

fn from_f32<S: Simd>(mut color: f32x4<S>) -> [Self; 4] {
let simd = color.simd;
color = color.madd(f32x4::splat(simd, 255.0), f32x4::splat(simd, 0.5));
color = color.mul_add(f32x4::splat(simd, 255.0), f32x4::splat(simd, 0.5));

[
color[0] as Self,
Expand Down Expand Up @@ -1009,11 +1009,11 @@ impl<T: FromF32Color> GradientLut<T> {
let scales = f32x16::block_splat(f32x4::from_slice(simd, &range.scale));

ramp_range.clone().step_by(4).for_each(|idx| {
let t_vals = f32x4::splat(simd, idx as f32).madd(inv_lut_scale, add_factor);
let t_vals = f32x4::splat(simd, idx as f32).mul_add(inv_lut_scale, add_factor);

let t_vals = element_wise_splat(simd, t_vals);

let mut result = scales.madd(t_vals, biases);
let mut result = scales.mul_add(t_vals, biases);
let alphas = result.splat_4th();
// Premultiply colors, since we did interpolation in unpremultiplied space.
if range.interpolation_alpha_space == InterpolationAlphaSpace::Unpremultiplied {
Expand Down
30 changes: 15 additions & 15 deletions sparse_strips/vello_common/src/flatten_simd.rs
Original file line number Diff line number Diff line change
Expand Up @@ -263,7 +263,7 @@ fn approx_parabola_integral_simd<S: Simd>(x: f32x8<S>) -> f32x8<S> {
const D: f32 = 0.67;
const D_POWI_4: f32 = 0.201_511_2;

let temp1 = f32x8::splat(simd, 0.25).madd(x * x, f32x8::splat(simd, D_POWI_4));
let temp1 = f32x8::splat(simd, 0.25).mul_add(x * x, f32x8::splat(simd, D_POWI_4));
let temp2 = temp1.sqrt();
let temp3 = temp2.sqrt();
let temp4 = f32x8::splat(simd, 1.0) - f32x8::splat(simd, D);
Expand All @@ -278,7 +278,7 @@ fn approx_parabola_integral_simd_x4<S: Simd>(x: f32x4<S>) -> f32x4<S> {
const D: f32 = 0.67;
const D_POWI_4: f32 = 0.201_511_2;

let temp1 = f32x4::splat(simd, 0.25).madd(x * x, f32x4::splat(simd, D_POWI_4));
let temp1 = f32x4::splat(simd, 0.25).mul_add(x * x, f32x4::splat(simd, D_POWI_4));
let temp2 = temp1.sqrt();
let temp3 = temp2.sqrt();
let temp4 = f32x4::splat(simd, 1.0) - f32x4::splat(simd, D);
Expand All @@ -294,7 +294,7 @@ fn approx_parabola_inv_integral_simd<S: Simd>(x: f32x8<S>) -> f32x8<S> {
const ONE_MINUS_B: f32 = 1.0 - B;

let temp1 = f32x8::splat(simd, B * B);
let temp2 = f32x8::splat(simd, 0.25).madd(x * x, temp1);
let temp2 = f32x8::splat(simd, 0.25).mul_add(x * x, temp1);
let temp3 = temp2.sqrt();
let temp4 = f32x8::splat(simd, ONE_MINUS_B) + temp3;

Expand All @@ -319,9 +319,9 @@ fn eval_simd<S: Simd>(
let im0 = p0 * mt * mt * mt;
let im1 = p1 * mt * mt * 3.0;
let im2 = p2 * mt * 3.0;
let im3 = p3.madd(t, im2) * t;
let im3 = p3.mul_add(t, im2) * t;

(im1 + im3).madd(t, im0)
(im1 + im3).mul_add(t, im0)
}

#[inline(always)]
Expand Down Expand Up @@ -386,8 +386,8 @@ fn estimate_subdiv_simd<S: Simd>(simd: S, sqrt_tol: f32, ctx: &mut FlattenCtx) {
let p_onehalf = f32x8::from_slice(simd, &odd_pts[i * 8..][..8]);
let p2 = f32x8::from_slice(simd, &even_pts[(i * 8 + 2)..][..8]);
let x = p0 * -0.5;
let x1 = p_onehalf.madd(2.0, x);
let p1 = p2.madd(-0.5, x1);
let x1 = p_onehalf.mul_add(2.0, x);
let p1 = p2.mul_add(-0.5, x1);

odd_pts[(i * 8)..][..8].copy_from_slice(p1.as_slice());

Expand All @@ -402,7 +402,7 @@ fn estimate_subdiv_simd<S: Simd>(simd: S, sqrt_tol: f32, ctx: &mut FlattenCtx) {
let d02x = d01x + d12x;
let d02y = d01y + d12y;
// (d02x * ddy) - (d02y * ddx)
let cross = ddx.madd(-d02y, d02x * ddy);
let cross = ddx.mul_add(-d02y, d02x * ddy);

let x0_x2_a = {
let (d01x_low, _) = simd.split_f32x8(d01x);
Expand All @@ -416,11 +416,11 @@ fn estimate_subdiv_simd<S: Simd>(simd: S, sqrt_tol: f32, ctx: &mut FlattenCtx) {

simd.combine_f32x4(d12y_low, d01y_low)
};
let x0_x2_num = temp1.madd(ddy, x0_x2_a);
let x0_x2_num = temp1.mul_add(ddy, x0_x2_a);
let x0_x2 = x0_x2_num / cross;
let (ddx_low, _) = simd.split_f32x8(ddx);
let (ddy_low, _) = simd.split_f32x8(ddy);
let dd_hypot = ddy_low.madd(ddy_low, ddx_low * ddx_low).sqrt();
let dd_hypot = ddy_low.mul_add(ddy_low, ddx_low * ddx_low).sqrt();
let (x0, x2) = simd.split_f32x8(x0_x2);
let scale_denom = dd_hypot * (x2 - x0);
let (temp2, _) = simd.split_f32x8(cross);
Expand Down Expand Up @@ -469,19 +469,19 @@ fn output_lines_simd<S: Simd>(

const IOTA2: [f32; 8] = [0., 0., 1., 1., 2., 2., 3., 3.];
let iota2 = f32x8::from_slice(simd, IOTA2.as_ref());
let x = iota2.madd(dx, f32x8::splat(simd, x0));
let x = iota2.mul_add(dx, f32x8::splat(simd, x0));
let da = f32x8::splat(simd, ctx.da[i]);
let mut a = da.madd(x, f32x8::splat(simd, ctx.a0[i]));
let mut a = da.mul_add(x, f32x8::splat(simd, ctx.a0[i]));
let a_inc = 4.0 * dx * da;
let uscale = f32x8::splat(simd, ctx.uscale[i]);

for j in 0..n.div_ceil(4) {
let u = approx_parabola_inv_integral_simd(a);
let t = u.madd(uscale, -ctx.u0[i] * uscale);
let t = u.mul_add(uscale, -ctx.u0[i] * uscale);
let mt = 1.0 - t;
let z = p0 * mt * mt;
let z1 = p1.madd(2.0 * t * mt, z);
let p = p2.madd(t * t, z1);
let z1 = p1.mul_add(2.0 * t * mt, z);
let p = p2.mul_add(t * t, z1);

out[j * 8..][..8].copy_from_slice(p.as_slice());

Expand Down
25 changes: 13 additions & 12 deletions sparse_strips/vello_common/src/strip.rs
Original file line number Diff line number Diff line change
Expand Up @@ -169,7 +169,7 @@ fn render_impl<S: Simd>(
for x in 0..Tile::WIDTH as usize {
let area = location_winding[x];
let coverage = area.abs();
let mulled = coverage.madd(p2, p1);
let mulled = coverage.mul_add(p2, p1);
// Note that we are not storing the location winding here but the actual
// alpha value as f32, so we reuse the variable as a temporary storage.
// Also note that we need the `min` here because the winding can be > 1
Expand All @@ -185,9 +185,9 @@ fn render_impl<S: Simd>(
#[expect(clippy::needless_range_loop, reason = "dimension clarity")]
for x in 0..Tile::WIDTH as usize {
let area = location_winding[x];
let im1 = area.madd(p1, p1).floor();
let coverage = p2.madd(im1, area).abs();
let mulled = p3.madd(coverage, p1);
let im1 = area.mul_add(p1, p1).floor();
let coverage = p2.mul_add(im1, area).abs();
let mulled = p3.mul_add(coverage, p1);
// TODO: It is possible that, unlike for `NonZero`, we don't need the `min`
// here.
location_winding[x] = mulled.min(p3);
Expand Down Expand Up @@ -346,9 +346,10 @@ fn render_impl<S: Simd>(
let ymin = px_top_y.max(ymin);
let ymax = px_bottom_y.min(ymax);
let h = (ymax - ymin).max(0.0);
accumulated_winding = h.madd(sign, accumulated_winding);
accumulated_winding = h.mul_add(sign, accumulated_winding);
for x_idx in 0..Tile::WIDTH {
location_winding[x_idx as usize] = h.madd(sign, location_winding[x_idx as usize]);
location_winding[x_idx as usize] =
h.mul_add(sign, location_winding[x_idx as usize]);
}

if line_right_x < 0. {
Expand Down Expand Up @@ -392,26 +393,26 @@ fn render_impl<S: Simd>(
// will be NaN, as `0 * inf` results in NaN. This is true for both the left and
// right edge. In both cases, the call to `max_precise` (relied on here to behave
// like `f32::max` and drop a NaN operand) will set this to `ymin`.
let line_px_left_y = (px_left_x - line_top_x)
.madd(y_slope, line_top_y)
.mul_add(y_slope, line_top_y)
.max_precise(ymin)
.min_precise(ymax);
let line_px_right_y = (px_right_x - line_top_x)
.madd(y_slope, line_top_y)
.mul_add(y_slope, line_top_y)
.max_precise(ymin)
.min_precise(ymax);

// `x_slope` is always finite, as horizontal geometry is elided.
let line_px_left_yx =
(line_px_left_y - line_top_y).madd(x_slope, f32x4::splat(s, line_top_x));
(line_px_left_y - line_top_y).mul_add(x_slope, f32x4::splat(s, line_top_x));
let line_px_right_yx =
(line_px_right_y - line_top_y).madd(x_slope, f32x4::splat(s, line_top_x));
(line_px_right_y - line_top_y).mul_add(x_slope, f32x4::splat(s, line_top_x));
let h = (line_px_right_y - line_px_left_y).abs();

// The trapezoidal area enclosed between the line and the right edge of the pixel
// square.
let area = 0.5 * h * (2. * px_right_x - line_px_right_yx - line_px_left_yx);
location_winding[x_idx as usize] += area.madd(sign, acc);
acc = h.madd(sign, acc);
location_winding[x_idx as usize] += area.mul_add(sign, acc);
acc = h.mul_add(sign, acc);
}

accumulated_winding += acc;
Expand Down
2 changes: 1 addition & 1 deletion sparse_strips/vello_common/src/strip_generator.rs
Original file line number Diff line number Diff line change
Expand Up @@ -192,7 +192,7 @@ mod tests {

#[test]
fn reset() {
let mut generator = StripGenerator::new(100, 100, Level::fallback());
let mut generator = StripGenerator::new(100, 100, Level::baseline());
let mut storage = StripStorage::default();
let rect = Rect::new(0.0, 0.0, 100.0, 100.0);

Expand Down
Loading