Skip to content
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ categories = ["multimedia::images"]
keywords = ["jpg", "jpeg", "encoder", "image"]
readme = "README.md"
repository = "https://github.com/vstroebel/jpeg-encoder"
rust-version = "1.61"
rust-version = "1.86"

[features]
default = ["std"]
Expand Down
104 changes: 50 additions & 54 deletions src/avx2/ycbcr.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,26 +20,38 @@ macro_rules! ycbcr_image_avx2 {

impl<'a> $name<'a> {
#[target_feature(enable = "avx2")]
unsafe fn fill_buffers_avx2(&self, y: u16, buffers: &mut [Vec<u8>; 4]) {
unsafe fn load3(data: *const u8) -> __m256i {
fn fill_buffers_avx2(&self, y: u16, buffers: &mut [Vec<u8>; 4]) {
#[inline]
#[target_feature(enable = "avx2")]
fn load3(data: &[u8]) -> __m256i {
_ = data[7 * $num_colors]; // dummy indexing operation up front to avoid bounds checks later
_mm256_set_epi32(
*data as i32,
*data.offset(1 * $num_colors) as i32,
*data.offset(2 * $num_colors) as i32,
*data.offset(3 * $num_colors) as i32,
*data.offset(4 * $num_colors) as i32,
*data.offset(5 * $num_colors) as i32,
*data.offset(6 * $num_colors) as i32,
*data.offset(7 * $num_colors) as i32,
data[0] as i32,
data[1 * $num_colors] as i32,
data[2 * $num_colors] as i32,
data[3 * $num_colors] as i32,
data[4 * $num_colors] as i32,
data[5 * $num_colors] as i32,
data[6 * $num_colors] as i32,
data[7 * $num_colors] as i32,
)
}

let mut y_buffer = buffers[0].as_mut_ptr().add(buffers[0].len());
buffers[0].set_len(buffers[0].len() + self.width() as usize);
let mut cb_buffer = buffers[1].as_mut_ptr().add(buffers[1].len());
buffers[1].set_len(buffers[1].len() + self.width() as usize);
let mut cr_buffer = buffers[2].as_mut_ptr().add(buffers[2].len());
buffers[2].set_len(buffers[2].len() + self.width() as usize);
#[inline]
#[target_feature(enable = "avx2")]
fn avx_as_i32_array(data: __m256i) -> [i32; 8] {
// Safety preconditions. Optimized away in release mode, no runtime cost.
assert!(core::mem::size_of::<__m256i>() == core::mem::size_of::<[i32; 8]>());
assert!(core::mem::align_of::<__m256i>() >= core::mem::align_of::<[i32; 8]>());
// SAFETY: size and alignment preconditions checked above.
// Both types are plain old data: no pointers, lifetimes, etc.
unsafe { core::mem::transmute(data) }
}

let [y_buffer, cb_buffer, cr_buffer, _] = buffers;
y_buffer.reserve(self.width() as usize);
cb_buffer.reserve(self.width() as usize);
cr_buffer.reserve(self.width() as usize);

let ymulr = _mm256_set1_epi32(19595);
let ymulg = _mm256_set1_epi32(38470);
Expand All @@ -53,17 +65,14 @@ macro_rules! ycbcr_image_avx2 {
let crmulg = _mm256_set1_epi32(27439);
let crmulb = _mm256_set1_epi32(5329);

let mut data = self
.0
.as_ptr()
.offset((y as isize * self.1 as isize * $num_colors));
let mut data = &self.0[(y as usize * self.1 as usize * $num_colors)..];

for _ in 0..self.width() / 8 {
let r = load3(data.offset($o1));
let g = load3(data.offset($o2));
let b = load3(data.offset($o3));
let r = load3(&data[$o1..]);
let g = load3(&data[$o2..]);
let b = load3(&data[$o3..]);

data = data.add($num_colors * 8);
data = &data[($num_colors * 8)..];

let yr = _mm256_mullo_epi32(ymulr, r);
let yg = _mm256_mullo_epi32(ymulg, g);
Expand All @@ -72,7 +81,10 @@ macro_rules! ycbcr_image_avx2 {
let y = _mm256_add_epi32(_mm256_add_epi32(yr, yg), yb);
let y = _mm256_add_epi32(y, _mm256_set1_epi32(0x7FFF));
let y = _mm256_srli_epi32(y, 16);
let y: [i32; 8] = core::mem::transmute(y);
let y: [i32; 8] = avx_as_i32_array(y);
let mut y: [u8; 8] = y.map(|x| x as u8);
y.reverse();
y_buffer.extend_from_slice(&y);

let cbr = _mm256_mullo_epi32(cbmulr, r);
let cbg = _mm256_mullo_epi32(cbmulg, g);
Expand All @@ -82,7 +94,10 @@ macro_rules! ycbcr_image_avx2 {
let cb = _mm256_add_epi32(cb, _mm256_set1_epi32(128 << 16));
let cb = _mm256_add_epi32(cb, _mm256_set1_epi32(0x7FFF));
let cb = _mm256_srli_epi32(cb, 16);
let cb: [i32; 8] = core::mem::transmute(cb);
let cb: [i32; 8] = avx_as_i32_array(cb);
let mut cb: [u8; 8] = cb.map(|x| x as u8);
cb.reverse();
cb_buffer.extend_from_slice(&cb);

let crr = _mm256_mullo_epi32(crmulr, r);
let crg = _mm256_mullo_epi32(crmulg, g);
Expand All @@ -92,38 +107,19 @@ macro_rules! ycbcr_image_avx2 {
let cr = _mm256_add_epi32(cr, _mm256_set1_epi32(128 << 16));
let cr = _mm256_add_epi32(cr, _mm256_set1_epi32(0x7FFF));
let cr = _mm256_srli_epi32(cr, 16);
let cr: [i32; 8] = core::mem::transmute(cr);

for y in y.iter().rev() {
*y_buffer = *y as u8;
y_buffer = y_buffer.offset(1);
}

for cb in cb.iter().rev() {
*cb_buffer = *cb as u8;
cb_buffer = cb_buffer.offset(1);
}

for cr in cr.iter().rev() {
*cr_buffer = *cr as u8;
cr_buffer = cr_buffer.offset(1);
}
let cr: [i32; 8] = avx_as_i32_array(cr);
let mut cr: [u8; 8] = cr.map(|x| x as u8);
cr.reverse();
cr_buffer.extend_from_slice(&cr);
}

for _ in 0..self.width() % 8 {
let (y, cb, cr) =
rgb_to_ycbcr(*data.offset($o1), *data.offset($o2), *data.offset($o3));

data = data.add($num_colors);

*y_buffer = y;
y_buffer = y_buffer.offset(1);

*cb_buffer = cb;
cb_buffer = cb_buffer.offset(1);
let (y, cb, cr) = rgb_to_ycbcr(data[$o1], data[$o2], data[$o3]);
data = &data[$num_colors..];

*cr_buffer = cr;
cr_buffer = cr_buffer.offset(1);
y_buffer.push(y);
cb_buffer.push(cb);
cr_buffer.push(cr);
}
}
}
Expand Down
Loading