From 0c46495b1d1bdab1618ef0be28e110d3e324172d Mon Sep 17 00:00:00 2001
From: Alfred Klomp <git@alfredklomp.com>
Date: Tue, 2 Sep 2014 18:20:44 +0200
Subject: [PATCH 01/11] alg_noise_tune: split off into separate file to enable
 testing

---
 alg.c                           | 33 ++--------------
 alg/alg_noise_tune.plain.c      | 41 +++++++++++++++++++
 alg/tests/Makefile              | 16 ++++++++
 alg/tests/test_alg_noise_tune.c | 70 +++++++++++++++++++++++++++++++++
 alg/tests/timer.c               | 35 +++++++++++++++++
 alg/tests/timer.h               |  3 ++
 6 files changed, 168 insertions(+), 30 deletions(-)
 create mode 100644 alg/alg_noise_tune.plain.c
 create mode 100644 alg/tests/Makefile
 create mode 100644 alg/tests/test_alg_noise_tune.c
 create mode 100644 alg/tests/timer.c
 create mode 100644 alg/tests/timer.h

diff --git a/alg.c b/alg.c
index 93c260f..72c3856 100644
--- a/alg.c
+++ b/alg.c
@@ -352,42 +352,15 @@ void alg_draw_red_location(struct coord *cent, struct images *imgs, int width, u
 #define DIFF(x, y)         (ABS((x)-(y)))
 #define NDIFF(x, y)        (ABS(x) * NORM / (ABS(x) + 2 * DIFF(x, y)))
 
+#include "alg/alg_noise_tune.plain.c"
+
 /**
  * alg_noise_tune
  *
  */
 void alg_noise_tune(struct context *cnt, unsigned char *new)
 {
-    struct images *imgs = &cnt->imgs;
-    int i;
-    unsigned char *ref = imgs->ref;
-    int diff, sum = 0, count = 0;
-    unsigned char *mask = imgs->mask;
-    unsigned char *smartmask = imgs->smartmask_final;
-
-    i = imgs->motionsize;
-            
-    for (; i > 0; i--) {
-        diff = ABS(*ref - *new);
-
-        if (mask)
-            diff = ((diff * *mask++) / 255);
-
-        if (*smartmask) {
-            sum += diff + 1;
-            count++;
-        }
-
-        ref++;
-        new++;
-        smartmask++;
-    }
-
-    if (count > 3)  /* Avoid divide by zero. */
-        sum /= count / 3;
-    
-    /* 5: safe, 4: regular, 3: more sensitive */
-    cnt->noise = 4 + (cnt->noise + sum) / 2;
+    alg_noise_tune_plain(cnt, new);
 }
 
 /**
diff --git a/alg/alg_noise_tune.plain.c b/alg/alg_noise_tune.plain.c
new file mode 100644
index 0000000..d35238d
--- /dev/null
+++ b/alg/alg_noise_tune.plain.c
@@ -0,0 +1,41 @@
+#ifndef ABS
+#define ABS(x)             ((x) < 0 ? -(x) : (x))
+#endif
+
+/**
+ * alg_noise_tune_plain
+ *
+ */
+static void alg_noise_tune_plain(struct context *cnt, unsigned char *new)
+{
+    struct images *imgs = &cnt->imgs;
+    int i;
+    unsigned char *ref = imgs->ref;
+    int diff, sum = 0, count = 0;
+    unsigned char *mask = imgs->mask;
+    unsigned char *smartmask = imgs->smartmask_final;
+
+    i = imgs->motionsize;
+
+    for (; i > 0; i--) {
+        diff = ABS(*ref - *new);
+
+        if (mask)
+            diff = ((diff * *mask++) / 255);
+
+        if (*smartmask) {
+            sum += diff + 1;
+            count++;
+        }
+
+        ref++;
+        new++;
+        smartmask++;
+    }
+
+    if (count > 3)  /* Avoid divide by zero. */
+        sum /= count / 3;
+
+    /* 5: safe, 4: regular, 3: more sensitive */
+    cnt->noise = 4 + (cnt->noise + sum) / 2;
+}
diff --git a/alg/tests/Makefile b/alg/tests/Makefile
new file mode 100644
index 0000000..c4d4ce3
--- /dev/null
+++ b/alg/tests/Makefile
@@ -0,0 +1,16 @@
+CFLAGS += -std=c89 -Werror -Wall -Wextra -pedantic -msse2 -O3
+LDFLAGS += -lrt
+
+.PHONY: clean
+
+test_alg_noise_tune: test_alg_noise_tune.o timer.o
+	$(CC) $(LDFLAGS) -o $@ $^
+
+test_alg_noise_tune.o: ../alg_noise_tune.plain.c test_alg_noise_tune.c
+	$(CC) $(CFLAGS) -o $@ -c test_alg_noise_tune.c
+
+timer.o: timer.c
+	$(CC) $(CFLAGS) -o $@ -c $^
+
+clean:
+	rm -f *.o test_alg_noise_tune
diff --git a/alg/tests/test_alg_noise_tune.c b/alg/tests/test_alg_noise_tune.c
new file mode 100644
index 0000000..49c5386
--- /dev/null
+++ b/alg/tests/test_alg_noise_tune.c
@@ -0,0 +1,70 @@
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "timer.h"
+
+/* Stub structures for test purposes: */
+struct images
+{
+	unsigned char *ref;
+	unsigned char *mask;
+	unsigned char *smartmask_final;
+	int motionsize;
+};
+
+struct context
+{
+	struct images imgs;
+	int noise;
+};
+
+#define WIDTH    600
+#define HEIGHT   400
+
+static void
+init (struct context *ctx, unsigned char **new)
+{
+	ctx->noise = 0;
+	ctx->imgs.motionsize = WIDTH * HEIGHT;
+	ctx->imgs.ref = malloc(ctx->imgs.motionsize);
+	ctx->imgs.mask = malloc(ctx->imgs.motionsize);
+	ctx->imgs.smartmask_final = malloc(ctx->imgs.motionsize);
+	*new = malloc(ctx->imgs.motionsize);
+}
+
+static void
+testsuite (char *name, struct context *ctx, unsigned char *new, void (*func)(struct context *, unsigned char *))
+{
+	int i;
+
+	printf("---\n%s\n", name);
+
+	timer_start();
+	for (i = 100; i > 0; i--) {
+		func(ctx, new);
+	}
+	timer_stop();
+
+	printf("Noise level: %d\nTime: %.4f sec\n", ctx->noise, timer_sec());
+
+}
+
+#include "../alg_noise_tune.plain.c"
+
+int
+main ()
+{
+	struct context ctx;
+	unsigned char *new;
+
+	init(&ctx, &new);
+
+	testsuite("plain", &ctx, new, alg_noise_tune_plain);
+
+	free(new);
+	free(ctx.imgs.ref);
+	free(ctx.imgs.mask);
+	free(ctx.imgs.smartmask_final);
+
+	return 0;
+}
diff --git a/alg/tests/timer.c b/alg/tests/timer.c
new file mode 100644
index 0000000..10e64a5
--- /dev/null
+++ b/alg/tests/timer.c
@@ -0,0 +1,35 @@
+#define _POSIX_C_SOURCE 199309L
+
+#include <time.h>
+
+/* This is not threadsafe at all, but that's fine for our purposes. */
+
+static struct timespec start;
+static struct timespec end;
+
+void
+timer_start (void)
+{
+	clock_gettime(CLOCK_MONOTONIC, &start);	/* Remember when timing began. */
+}
+
+void
+timer_stop (void)
+{
+	clock_gettime(CLOCK_MONOTONIC, &end);	/* Remember when timing ended. */
+}
+
+float
+timer_sec (void)
+{
+	struct timespec temp;
+	/* temp = end - start; borrow one second if the nanoseconds go negative: */
+	if ((end.tv_nsec - start.tv_nsec) < 0) {
+		temp.tv_sec = end.tv_sec - start.tv_sec - 1;
+		temp.tv_nsec = 1000000000 + end.tv_nsec - start.tv_nsec;
+	} else {
+		temp.tv_sec = end.tv_sec - start.tv_sec;
+		temp.tv_nsec = end.tv_nsec - start.tv_nsec;
+	}
+	return (float)(temp.tv_sec + ((float)temp.tv_nsec / 1000000000.0));
+}
diff --git a/alg/tests/timer.h b/alg/tests/timer.h
new file mode 100644
index 0000000..8c90baf
--- /dev/null
+++ b/alg/tests/timer.h
@@ -0,0 +1,3 @@
+void timer_start (void);	/* Record the start of the timed interval. */
+void timer_stop (void);	/* Record the end of the timed interval. */
+float timer_sec (void);	/* Seconds elapsed between start and stop. */

From 6f4a1ba1950a5310e4db6e29e63afae05ac65f31 Mon Sep 17 00:00:00 2001
From: Alfred Klomp <git@alfredklomp.com>
Date: Tue, 2 Sep 2014 20:22:58 +0200
Subject: [PATCH 02/11] alg_noise_tune: add test pattern framework

---
 alg/tests/test_alg_noise_tune.c | 65 ++++++++++++++++++++++++++++++++-
 1 file changed, 64 insertions(+), 1 deletion(-)

diff --git a/alg/tests/test_alg_noise_tune.c b/alg/tests/test_alg_noise_tune.c
index 49c5386..8aaeb56 100644
--- a/alg/tests/test_alg_noise_tune.c
+++ b/alg/tests/test_alg_noise_tune.c
@@ -1,5 +1,6 @@
 #include <stdio.h>
 #include <stdlib.h>
+#include <string.h>
 
 #include "timer.h"
 
@@ -20,11 +21,11 @@ struct context
 
 #define WIDTH    600
 #define HEIGHT   400
+#define BLOCKPX   50
 
 static void
 init (struct context *ctx, unsigned char **new)
 {
-	ctx->noise = 0;
 	ctx->imgs.motionsize = WIDTH * HEIGHT;
 	ctx->imgs.ref = malloc(ctx->imgs.motionsize);
 	ctx->imgs.mask = malloc(ctx->imgs.motionsize);
@@ -32,12 +33,67 @@ init (struct context *ctx, unsigned char **new)
 	*new = malloc(ctx->imgs.motionsize);
 }
 
+static void
+clean (struct context *ctx, unsigned char *new)
+{
+	ctx->noise = 0;
+	memset(ctx->imgs.ref, 0, WIDTH * HEIGHT);
+	memset(ctx->imgs.mask, 0, WIDTH * HEIGHT);
+	memset(ctx->imgs.smartmask_final, 0, WIDTH * HEIGHT);
+	memset(new, 0, WIDTH * HEIGHT);
+}
+
+static void
+apply_pattern (unsigned char *pattern, unsigned char *img)
+{
+	int x, y = 0;
+	/* Expand a grid of pattern cells into a WIDTH * HEIGHT image;
+	 * each pattern cell covers BLOCKPX * BLOCKPX pixels in the output: */
+	while (y < HEIGHT) {
+		unsigned char *col = pattern;
+		for (x = 0; x < WIDTH; ) {	/* restart at column 0 on every row */
+			*img++ = *col;
+			if (++x % BLOCKPX == 0) {
+				col++;
+			}
+		}
+		/* After BLOCKPX rows, move to the next row of pattern cells: */
+		if (++y % BLOCKPX == 0) {
+			pattern += WIDTH / BLOCKPX;
+		}
+	}
+}
+
+static void
+random_patterns (int seed, struct context *ctx, unsigned char *new)
+{
+	int i;
+	unsigned char *c;
+	unsigned char pattern[(HEIGHT / BLOCKPX) * (WIDTH / BLOCKPX)];
+	unsigned char *ptrs[4];
+	/* pattern holds one byte per BLOCKPX * BLOCKPX cell; fill these images: */
+	ptrs[0] = ctx->imgs.ref;
+	ptrs[1] = ctx->imgs.mask;
+	ptrs[2] = ctx->imgs.smartmask_final;
+	ptrs[3] = new;
+	/* A fixed seed makes the generated patterns reproducible: */
+	srand(seed);
+	/* The + 1 in the divisor keeps large rand() values from mapping to 256: */
+	for (i = 0; i < 4; i++) {
+		for (c = pattern; c < (pattern + sizeof(pattern)); c++) {
+			*c = rand() / (RAND_MAX / 256 + 1);
+		}
+		apply_pattern(pattern, ptrs[i]);
+	}
+}
+
 static void
 testsuite (char *name, struct context *ctx, unsigned char *new, void (*func)(struct context *, unsigned char *))
 {
 	int i;
 
 	printf("---\n%s\n", name);
+	clean(ctx, new);
 
 	timer_start();
 	for (i = 100; i > 0; i--) {
@@ -47,6 +103,13 @@ testsuite (char *name, struct context *ctx, unsigned char *new, void (*func)(str
 
 	printf("Noise level: %d\nTime: %.4f sec\n", ctx->noise, timer_sec());
 
+	for (i = 100; i > 0; i--) {
+		clean(ctx, new);
+		random_patterns(i, ctx, new);
+		func(ctx, new);
+		printf("%d ", ctx->noise);
+	}
+	puts("");
 }
 
 #include "../alg_noise_tune.plain.c"

From b825c6ef488e8ec84e8bfb11fc2913b8f25ac183 Mon Sep 17 00:00:00 2001
From: Alfred Klomp <git@alfredklomp.com>
Date: Tue, 2 Sep 2014 21:52:58 +0200
Subject: [PATCH 03/11] alg_noise_tune: change ints to unsigned for 10% speed
 improvement

---
 alg/alg_noise_tune.plain.c | 15 +++++----------
 1 file changed, 5 insertions(+), 10 deletions(-)

diff --git a/alg/alg_noise_tune.plain.c b/alg/alg_noise_tune.plain.c
index d35238d..ddb861e 100644
--- a/alg/alg_noise_tune.plain.c
+++ b/alg/alg_noise_tune.plain.c
@@ -1,7 +1,3 @@
-#ifndef ABS
-#define ABS(x)             ((x) < 0 ? -(x) : (x))
-#endif
-
 /**
  * alg_noise_tune_plain
  *
@@ -9,22 +5,21 @@
 static void alg_noise_tune_plain(struct context *cnt, unsigned char *new)
 {
     struct images *imgs = &cnt->imgs;
-    int i;
     unsigned char *ref = imgs->ref;
-    int diff, sum = 0, count = 0;
+    unsigned int sum = 0, count = 0;
     unsigned char *mask = imgs->mask;
     unsigned char *smartmask = imgs->smartmask_final;
 
-    i = imgs->motionsize;
+    int i = imgs->motionsize;
 
     for (; i > 0; i--) {
-        diff = ABS(*ref - *new);
+        unsigned char absdiff = (*ref > *new) ? (*ref - *new) : (*new - *ref);
 
         if (mask)
-            diff = ((diff * *mask++) / 255);
+            absdiff = ((absdiff * *mask++) / 255);
 
         if (*smartmask) {
-            sum += diff + 1;
+            sum += absdiff + 1;
             count++;
         }
 

From bb321eb5e1f0b057bb12972a58ad174909dc348e Mon Sep 17 00:00:00 2001
From: Alfred Klomp <git@alfredklomp.com>
Date: Tue, 2 Sep 2014 22:27:43 +0200
Subject: [PATCH 04/11] alg_noise_tune: add optimized SSE2 routine

---
 alg.c                           |  14 ++++
 alg/alg_noise_tune.sse2.c       | 129 ++++++++++++++++++++++++++++++++
 alg/sse2.h                      |  58 ++++++++++++++
 alg/tests/Makefile              |   2 +-
 alg/tests/test_alg_noise_tune.c |   5 ++
 5 files changed, 207 insertions(+), 1 deletion(-)
 create mode 100644 alg/alg_noise_tune.sse2.c
 create mode 100644 alg/sse2.h

diff --git a/alg.c b/alg.c
index 72c3856..24ee518 100644
--- a/alg.c
+++ b/alg.c
@@ -14,6 +14,12 @@
 #include "mmx.h"
 #endif
 
+#ifdef __SSE2__
+#define HAVE_SSE2
+#include <emmintrin.h>
+#include "alg/sse2.h"
+#endif
+
 #define MAX2(x, y) ((x) > (y) ? (x) : (y))
 #define MAX3(x, y, z) ((x) > (y) ? ((x) > (z) ? (x) : (z)) : ((y) > (z) ? (y) : (z)))
 
@@ -352,7 +358,11 @@ void alg_draw_red_location(struct coord *cent, struct images *imgs, int width, u
 #define DIFF(x, y)         (ABS((x)-(y)))
 #define NDIFF(x, y)        (ABS(x) * NORM / (ABS(x) + 2 * DIFF(x, y)))
 
+#ifdef HAVE_SSE2
+#include "alg/alg_noise_tune.sse2.c"
+#else
 #include "alg/alg_noise_tune.plain.c"
+#endif
 
 /**
  * alg_noise_tune
@@ -360,7 +370,11 @@ void alg_draw_red_location(struct coord *cent, struct images *imgs, int width, u
  */
 void alg_noise_tune(struct context *cnt, unsigned char *new)
 {
+#ifdef HAVE_SSE2
+    alg_noise_tune_sse2(cnt, new);
+#else
     alg_noise_tune_plain(cnt, new);
+#endif
 }
 
 /**
diff --git a/alg/alg_noise_tune.sse2.c b/alg/alg_noise_tune.sse2.c
new file mode 100644
index 0000000..9c6f042
--- /dev/null
+++ b/alg/alg_noise_tune.sse2.c
@@ -0,0 +1,129 @@
+/**
+ * alg_noise_tune_sse2
+ * SSE2 variant of alg_noise_tune_plain: handles 16 pixels per iteration.
+ */
+static void alg_noise_tune_sse2(struct context *cnt, unsigned char *new)
+{
+    struct images *imgs = &cnt->imgs;
+    unsigned char *ref = imgs->ref;
+    unsigned int sum = 0, count = 0;
+    unsigned char *mask = imgs->mask;
+    unsigned char *smartmask = imgs->smartmask_final;
+
+    int j, i = imgs->motionsize;
+
+    int sse_iters;
+    __m128i maskrow, zeromask;
+    __m128i alo, ahi;
+    __m128i ones = _mm_set1_epi8(1);
+    __m128i sum16lo = _mm_setzero_si128();
+    __m128i sum16hi = _mm_setzero_si128();
+    __m128i sum32 = _mm_setzero_si128();
+    __m128i count8 = _mm_setzero_si128();
+    uint32_t total[4];
+    uint8_t counts[16] __attribute__((aligned(16)));
+
+    /* SSE reads 16 bytes at a time; truncating division: */
+    for (sse_iters = i >> 4; sse_iters > 0; sse_iters--)
+    {
+        /* Load 16 bytes from images. Addresses need not be 16-byte aligned: */
+        __m128i refrow = _mm_loadu_si128((__m128i *)ref);
+        __m128i newrow = _mm_loadu_si128((__m128i *)new);
+
+        /* Calculate absolute difference per byte: abs(ref - new): */
+        __m128i absdiff = _mm_absdiff_epu8(refrow, newrow);
+
+        /* If there is a mask image, alpha blend the absdiff by its pixels: */
+        if (mask)
+        {
+            /* Load mask image data: */
+            maskrow = _mm_loadu_si128((__m128i *)mask);
+            mask += 16;
+
+            /* "Alpha blend" absdiff with mask, absdiff *= (mask / 255): */
+            absdiff = _mm_scale_epu8(absdiff, maskrow);
+        }
+        /* The "+ 1" that every counted pixel contributes is added after the
+         * loop as "+ count"; adding it here would saturate 255 + 1 to 255. */
+
+        /* Fetch the smartmask values: */
+        maskrow = _mm_loadu_si128((__m128i *)smartmask);
+
+        /* Set diff values to 0 where smartmask is 0: */
+        zeromask = _mm_cmpeq_epi8(maskrow, _mm_setzero_si128());
+        absdiff = _mm_andnot_si128(zeromask, absdiff);
+
+        /* Increment count for every nonzero value of smartmask: */
+        count8 = _mm_adds_epu8(count8, _mm_andnot_si128(zeromask, ones));
+
+        /* Split 16 bytes of sum into 16x16-bit values:
+         * 0 . 1 . 2 . 3 . 4 . 5 . 6 . 7 .
+         * 8 . 9 . A . B . C . D . E . F .
+         */
+        sse_u8_to_u16(absdiff, &alo, &ahi);
+        sum16lo = _mm_adds_epu16(sum16lo, alo);
+        sum16hi = _mm_adds_epu16(sum16hi, ahi);
+
+        /* Offload these 16-bit counters into a 32-bit counter at least once
+         * every 128 rounds to prevent overflow:
+         * Also do this in the last iteration to empty out the counters: */
+        if (!(sse_iters & 0x7F) || sse_iters == 1)
+        {
+            /* Split these two into 4x32 bits and do 32-bit additions:
+             * 0 . . . 1 . . . 2 . . . 3 . . . +
+             * 4 . . . 5 . . . 6 . . . 7 . . . +
+             * 8 . . . 9 . . . A . . . B . . . +
+             * C . . . D . . . E . . . F . . .
+             * Add all of these to the running sum: */
+
+            sse_u16_to_u32(sum16lo, &alo, &ahi);
+            sum32 = _mm_add_epi32(sum32, _mm_add_epi32(alo, ahi));
+
+            sse_u16_to_u32(sum16hi, &alo, &ahi);
+            sum32 = _mm_add_epi32(sum32, _mm_add_epi32(alo, ahi));
+
+            sum16lo = _mm_setzero_si128();
+            sum16hi = _mm_setzero_si128();
+
+            _mm_store_si128((__m128i *)counts, count8);
+            for (j = 0; j < 16; j++) {
+                count += counts[j];
+            }
+            count8 = _mm_setzero_si128();
+        }
+
+        ref += 16;
+        new += 16;
+        smartmask += 16;
+    }
+    /* Outside the hot loop, write out the running sum to memory and add the
+     * four uint32's, plus 1 per counted pixel (the scalar "absdiff + 1"): */
+    _mm_storeu_si128((__m128i *)&total, sum32);
+    sum = total[0] + total[1] + total[2] + total[3] + count;
+
+    /* We handled all 16-byte blocks. Truncate i to its value mod 16, so that
+     * the regular bytewise code can handle the remainder: */
+    i &= 0x0F;
+
+    for (; i > 0; i--) {
+        unsigned char absdiff = (*ref > *new) ? (*ref - *new) : (*new - *ref);
+
+        if (mask)
+            absdiff = ((absdiff * *mask++) / 255);
+
+        if (*smartmask) {
+            sum += absdiff + 1;
+            count++;
+        }
+
+        ref++;
+        new++;
+        smartmask++;
+    }
+
+    if (count > 3)  /* Avoid divide by zero. */
+        sum /= count / 3;
+
+    /* 5: safe, 4: regular, 3: more sensitive */
+    cnt->noise = 4 + (cnt->noise + sum) / 2;
+}
diff --git a/alg/sse2.h b/alg/sse2.h
new file mode 100644
index 0000000..8e193d7
--- /dev/null
+++ b/alg/sse2.h
@@ -0,0 +1,58 @@
+static __inline __m128i
+_mm_absdiff_epu8 (__m128i x, __m128i y)
+{
+    /* Calculate absolute difference: abs(x - y): */
+    return _mm_or_si128(_mm_subs_epu8(x, y), _mm_subs_epu8(y, x));
+}
+
+static __inline __m128i
+_mm_div255_epu16 (__m128i x)
+{
+    /* Divide each of the 8 unsigned 16-bit lanes by 255 without a division,
+     * using shifts and saturating adds: x := ((x + 1) + (x >> 8)) >> 8: */
+    return _mm_srli_epi16(_mm_adds_epu16(
+        _mm_adds_epu16(x, _mm_set1_epi16(1)),
+        _mm_srli_epi16(x, 8)), 8);
+}
+
+static __inline void
+sse_u8_to_u16 (__m128i in, __m128i *__restrict lo, __m128i *__restrict hi)
+{
+    /* Zero-extend an 8-bit vector to two 16-bit vectors: */
+    *lo = _mm_unpacklo_epi8(in, _mm_setzero_si128());
+    *hi = _mm_unpackhi_epi8(in, _mm_setzero_si128());
+}
+
+static __inline void
+sse_u16_to_u32 (__m128i in, __m128i *__restrict lo, __m128i *__restrict hi)
+{
+    /* Zero-extend a 16-bit vector to two 32-bit vectors: */
+    *lo = _mm_unpacklo_epi16(in, _mm_setzero_si128());
+    *hi = _mm_unpackhi_epi16(in, _mm_setzero_si128());
+}
+
+static __inline __m128i
+_mm_scale_epu8 (__m128i x, __m128i y)
+{
+    /* Returns an "alpha blend" of x with y;
+     *   x := x * (y / 255)
+     * Reorder: x := (x * y) / 255
+     */
+    __m128i xlo, xhi;
+    __m128i ylo, yhi;
+
+    /* Unpack x and y into 16-bit uints: */
+    sse_u8_to_u16(x, &xlo, &xhi);
+    sse_u8_to_u16(y, &ylo, &yhi);
+
+    /* Multiply x with y, keeping the low 16 bits: */
+    xlo = _mm_mullo_epi16(xlo, ylo);
+    xhi = _mm_mullo_epi16(xhi, yhi);
+
+    /* Divide by 255: */
+    xlo = _mm_div255_epu16(xlo);
+    xhi = _mm_div255_epu16(xhi);
+
+    /* Repack the 16-bit uints to 8-bit values: */
+    return _mm_packus_epi16(xlo, xhi);
+}
diff --git a/alg/tests/Makefile b/alg/tests/Makefile
index c4d4ce3..e815551 100644
--- a/alg/tests/Makefile
+++ b/alg/tests/Makefile
@@ -6,7 +6,7 @@ LDFLAGS += -lrt
 test_alg_noise_tune: test_alg_noise_tune.o timer.o
 	$(CC) $(LDFLAGS) -o $@ $^
 
-test_alg_noise_tune.o: ../alg_noise_tune.plain.c test_alg_noise_tune.c
+test_alg_noise_tune.o: ../alg_noise_tune.plain.c ../alg_noise_tune.sse2.c test_alg_noise_tune.c
 	$(CC) $(CFLAGS) -o $@ -c test_alg_noise_tune.c
 
 timer.o: timer.c
diff --git a/alg/tests/test_alg_noise_tune.c b/alg/tests/test_alg_noise_tune.c
index 8aaeb56..50cf807 100644
--- a/alg/tests/test_alg_noise_tune.c
+++ b/alg/tests/test_alg_noise_tune.c
@@ -1,7 +1,10 @@
+#include <stdint.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+#include <emmintrin.h>
 
+#include "../sse2.h"
 #include "timer.h"
 
 /* Stub structures for test purposes: */
@@ -113,6 +116,7 @@ testsuite (char *name, struct context *ctx, unsigned char *new, void (*func)(str
 }
 
 #include "../alg_noise_tune.plain.c"
+#include "../alg_noise_tune.sse2.c"
 
 int
 main ()
@@ -123,6 +127,7 @@ main ()
 	init(&ctx, &new);
 
 	testsuite("plain", &ctx, new, alg_noise_tune_plain);
+	testsuite("sse2", &ctx, new, alg_noise_tune_sse2);
 
 	free(new);
 	free(ctx.imgs.ref);

From 316171c6817c6c28cf6a9a6b863ecfa033e33054 Mon Sep 17 00:00:00 2001
From: Alfred Klomp <git@alfredklomp.com>
Date: Tue, 2 Sep 2014 20:41:48 +0200
Subject: [PATCH 05/11] alg_update_reference_frame: factor out existing
 function

---
 alg.c                                  | 55 +++-----------------------
 alg/alg_update_reference_frame.plain.c | 50 +++++++++++++++++++++++
 2 files changed, 56 insertions(+), 49 deletions(-)
 create mode 100644 alg/alg_update_reference_frame.plain.c

diff --git a/alg.c b/alg.c
index 24ee518..fcdbec2 100644
--- a/alg.c
+++ b/alg.c
@@ -1288,6 +1288,11 @@ int alg_switchfilter(struct context *cnt, int diffs, unsigned char *newimg)
     return 0;
 }
 
+#define ACCEPT_STATIC_OBJECT_TIME 10  /* Seconds */
+#define EXCLUDE_LEVEL_PERCENT 20
+
+#include "alg/alg_update_reference_frame.plain.c"
+
 /** 
  * alg_update_reference_frame
  *
@@ -1301,55 +1306,7 @@ int alg_switchfilter(struct context *cnt, int diffs, unsigned char *newimg)
  *   action - UPDATE_REF_FRAME or RESET_REF_FRAME
  *
  */
-#define ACCEPT_STATIC_OBJECT_TIME 10  /* Seconds */
-#define EXCLUDE_LEVEL_PERCENT 20
 void alg_update_reference_frame(struct context *cnt, int action) 
 {
-    int accept_timer = cnt->lastrate * ACCEPT_STATIC_OBJECT_TIME;
-    int i, threshold_ref;
-    int *ref_dyn = cnt->imgs.ref_dyn;
-    unsigned char *image_virgin = cnt->imgs.image_virgin;
-    unsigned char *ref = cnt->imgs.ref;
-    unsigned char *smartmask = cnt->imgs.smartmask_final;
-    unsigned char *out = cnt->imgs.out;
-
-    if (cnt->lastrate > 5)  /* Match rate limit */
-        accept_timer /= (cnt->lastrate / 3);
-
-    if (action == UPDATE_REF_FRAME) { /* Black&white only for better performance. */
-        threshold_ref = cnt->noise * EXCLUDE_LEVEL_PERCENT / 100;
-
-        for (i = cnt->imgs.motionsize; i > 0; i--) {
-            /* Exclude pixels from ref frame well below noise level. */
-            if (((int)(abs(*ref - *image_virgin)) > threshold_ref) && (*smartmask)) {
-                if (*ref_dyn == 0) { /* Always give new pixels a chance. */
-                    *ref_dyn = 1;
-                } else if (*ref_dyn > accept_timer) { /* Include static Object after some time. */
-                    *ref_dyn = 0;
-                    *ref = *image_virgin;
-                } else if (*out) {
-                    (*ref_dyn)++; /* Motionpixel? Keep excluding from ref frame. */
-                } else {
-                    *ref_dyn = 0; /* Nothing special - release pixel. */
-                    *ref = (*ref + *image_virgin) / 2;
-                }
-
-            } else {  /* No motion: copy to ref frame. */
-                *ref_dyn = 0; /* Reset pixel */
-                *ref = *image_virgin;
-            }
-
-            ref++;
-            image_virgin++;
-            smartmask++;
-            ref_dyn++;
-            out++;
-        } /* end for i */
-
-    } else {   /* action == RESET_REF_FRAME - also used to initialize the frame at startup. */
-        /* Copy fresh image */
-        memcpy(cnt->imgs.ref, cnt->imgs.image_virgin, cnt->imgs.size);
-        /* Reset static objects */
-        memset(cnt->imgs.ref_dyn, 0, cnt->imgs.motionsize * sizeof(cnt->imgs.ref_dyn)); 
-    }
+    alg_update_reference_frame_plain(cnt, action);
 }
diff --git a/alg/alg_update_reference_frame.plain.c b/alg/alg_update_reference_frame.plain.c
new file mode 100644
index 0000000..d4b2b76
--- /dev/null
+++ b/alg/alg_update_reference_frame.plain.c
@@ -0,0 +1,50 @@
+static void alg_update_reference_frame_plain(struct context *cnt, int action)
+{
+    int accept_timer = cnt->lastrate * ACCEPT_STATIC_OBJECT_TIME;
+    int i, threshold_ref;
+    int *ref_dyn = cnt->imgs.ref_dyn;
+    unsigned char *image_virgin = cnt->imgs.image_virgin;
+    unsigned char *ref = cnt->imgs.ref;
+    unsigned char *smartmask = cnt->imgs.smartmask_final;
+    unsigned char *out = cnt->imgs.out;
+
+    if (cnt->lastrate > 5)  /* Match rate limit */
+        accept_timer /= (cnt->lastrate / 3);
+
+    if (action == UPDATE_REF_FRAME) { /* Black&white only for better performance. */
+        threshold_ref = cnt->noise * EXCLUDE_LEVEL_PERCENT / 100;
+
+        for (i = cnt->imgs.motionsize; i > 0; i--) {
+            /* Exclude pixels from ref frame well below noise level. */
+            if (((int)(abs(*ref - *image_virgin)) > threshold_ref) && (*smartmask)) {
+                if (*ref_dyn == 0) { /* Always give new pixels a chance. */
+                    *ref_dyn = 1;
+                } else if (*ref_dyn > accept_timer) { /* Include static Object after some time. */
+                    *ref_dyn = 0;
+                    *ref = *image_virgin;
+                } else if (*out) {
+                    (*ref_dyn)++; /* Motionpixel? Keep excluding from ref frame. */
+                } else {
+                    *ref_dyn = 0; /* Nothing special - release pixel. */
+                    *ref = (*ref + *image_virgin) / 2;
+                }
+
+            } else {  /* No motion: copy to ref frame. */
+                *ref_dyn = 0; /* Reset pixel */
+                *ref = *image_virgin;
+            }
+
+            ref++;
+            image_virgin++;
+            smartmask++;
+            ref_dyn++;
+            out++;
+        } /* end for i */
+
+    } else {   /* action == RESET_REF_FRAME - also used to initialize the frame at startup. */
+        /* Copy fresh image */
+        memcpy(cnt->imgs.ref, cnt->imgs.image_virgin, cnt->imgs.size);
+        /* Reset static objects */
+        memset(cnt->imgs.ref_dyn, 0, cnt->imgs.motionsize * sizeof(*cnt->imgs.ref_dyn));
+    }
+}

From 77210cab07ee403699e2e522429800a2dace1b14 Mon Sep 17 00:00:00 2001
From: Alfred Klomp <git@alfredklomp.com>
Date: Tue, 2 Sep 2014 20:54:05 +0200
Subject: [PATCH 06/11] alg_update_reference_frame: add test harness

---
 alg/tests/Makefile                          |  12 +-
 alg/tests/test_alg_update_reference_frame.c | 159 ++++++++++++++++++++
 2 files changed, 169 insertions(+), 2 deletions(-)
 create mode 100644 alg/tests/test_alg_update_reference_frame.c

diff --git a/alg/tests/Makefile b/alg/tests/Makefile
index e815551..78ca939 100644
--- a/alg/tests/Makefile
+++ b/alg/tests/Makefile
@@ -1,7 +1,9 @@
 CFLAGS += -std=c89 -Werror -Wall -Wextra -pedantic -msse2 -O3
 LDFLAGS += -lrt
 
-.PHONY: clean
+.PHONY: all clean
+
+all: test_alg_noise_tune test_alg_update_reference_frame
 
 test_alg_noise_tune: test_alg_noise_tune.o timer.o
 	$(CC) $(LDFLAGS) -o $@ $^
@@ -9,8 +11,14 @@ test_alg_noise_tune: test_alg_noise_tune.o timer.o
 test_alg_noise_tune.o: ../alg_noise_tune.plain.c ../alg_noise_tune.sse2.c test_alg_noise_tune.c
 	$(CC) $(CFLAGS) -o $@ -c test_alg_noise_tune.c
 
+test_alg_update_reference_frame: test_alg_update_reference_frame.o timer.o
+	$(CC) $(LDFLAGS) -o $@ $^
+
+test_alg_update_reference_frame.o: ../alg_update_reference_frame.plain.c test_alg_update_reference_frame.c
+	$(CC) $(CFLAGS) -o $@ -c test_alg_update_reference_frame.c
+
 timer.o: timer.c
 	$(CC) $(CFLAGS) -o $@ -c $^
 
 clean:
-	rm -f *.o test_alg_noise_tune
+	rm -f *.o test_alg_noise_tune test_alg_update_reference_frame
diff --git a/alg/tests/test_alg_update_reference_frame.c b/alg/tests/test_alg_update_reference_frame.c
new file mode 100644
index 0000000..2d8fc6c
--- /dev/null
+++ b/alg/tests/test_alg_update_reference_frame.c
@@ -0,0 +1,159 @@
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "timer.h"
+
+/* Stub structures for test purposes: */
+struct images
+{
+	unsigned char *ref;
+	unsigned char *out;
+	int *ref_dyn;
+	unsigned char *image_virgin;
+	unsigned char *smartmask_final;
+	int size;
+	int motionsize;
+};
+
+struct context
+{
+	struct images imgs;
+	int noise;
+	unsigned int lastrate;
+};
+
+#define WIDTH    600
+#define HEIGHT   400
+#define BLOCKPX   50
+
+static void
+init (struct context *ctx)
+{
+	ctx->imgs.motionsize = WIDTH * HEIGHT;
+	ctx->imgs.ref = malloc(ctx->imgs.motionsize);
+	ctx->imgs.out = malloc(ctx->imgs.motionsize);
+	ctx->imgs.ref_dyn = malloc(ctx->imgs.motionsize * sizeof(*ctx->imgs.ref_dyn));
+	ctx->imgs.image_virgin = malloc(ctx->imgs.motionsize);
+	ctx->imgs.smartmask_final = malloc(ctx->imgs.motionsize);
+}
+
+static void
+clean (struct context *ctx)
+{
+	ctx->noise = 0;
+	ctx->lastrate = 0;
+	memset(ctx->imgs.ref, 0, WIDTH * HEIGHT);
+	memset(ctx->imgs.out, 0, WIDTH * HEIGHT);
+	memset(ctx->imgs.ref_dyn, 0, WIDTH * HEIGHT * sizeof(*ctx->imgs.ref_dyn));
+	memset(ctx->imgs.image_virgin, 0, WIDTH * HEIGHT);
+	memset(ctx->imgs.smartmask_final, 0, WIDTH * HEIGHT);
+	ctx->imgs.size = WIDTH * HEIGHT;
+	ctx->imgs.motionsize = WIDTH * HEIGHT;
+}
+
+static void
+permutate (int action, void (*func)(struct context *, int))
+{
+	unsigned char ref[16];
+	unsigned char out[16];
+	unsigned char image_virgin[16];
+	unsigned char smartmask_final[16];
+	int ref_dyn[16];
+	struct context ctx;
+	unsigned int ref_cksum;
+	unsigned int ref_dyn_cksum;
+
+	int i, iter_ref_dyn, iter_smartmask, iter_image_virgin, iter_out, iter_ref;
+	/* Run func on 16-pixel buffers for each input combination; print checksums: */
+	ctx.noise = 0;
+	ctx.lastrate = 0;
+	ctx.imgs.ref = ref;
+	ctx.imgs.out = out;
+	ctx.imgs.image_virgin = image_virgin;
+	ctx.imgs.smartmask_final = smartmask_final;
+	ctx.imgs.ref_dyn = ref_dyn;
+	ctx.imgs.size = 16;
+	ctx.imgs.motionsize = 16;
+
+	/* For the purposes of the routine, smartmask is zero or nonzero: */
+	for (iter_smartmask = 0; iter_smartmask < 2; iter_smartmask++) {
+		memset(smartmask_final, iter_smartmask, ctx.imgs.size);
+
+		/* For the purposes of the routine, out is zero or nonzero: */
+		for (iter_out = 0; iter_out < 2; iter_out++) {
+			memset(out, iter_out, ctx.imgs.size);
+
+			ref_cksum = 0;
+			ref_dyn_cksum = 0;
+
+			for (iter_image_virgin = 0; iter_image_virgin < 256; iter_image_virgin++) {
+				memset(image_virgin, iter_image_virgin, ctx.imgs.size);
+
+				/* ref_dyn has a limited range: */
+				for (iter_ref_dyn = 0; iter_ref_dyn < 10; iter_ref_dyn++) {
+					for (i = 0; i < 16; i++) {
+						ref_dyn[i] = iter_ref_dyn + 1;
+					}
+					for (iter_ref = 0; iter_ref < 256; iter_ref++) {
+						memset(ref, iter_ref, ctx.imgs.size);
+						func(&ctx, action);
+						ref_cksum += ref[0];
+
+						for (i = 0; i < 16; i++) {
+							ref_dyn_cksum += ref_dyn[i];
+						}
+					}
+				}
+			}
+			printf("%u %u\n", ref_cksum, ref_dyn_cksum);
+		}
+	}
+}
+
+static void
+testsuite (char *name, struct context *ctx, int action, void (*func)(struct context *, int))
+{
+	int i;
+	float total_time = 0.0f;
+
+	printf("---\n%s\n", name);
+	clean(ctx);
+
+	for (i = 300; i > 0; i--) {
+		timer_start();
+		func(ctx, action);
+		timer_stop();
+		total_time += timer_sec();
+	}
+
+	/* Print bogus value to prevent the loop from being optimized out: */
+	printf("Value: %d\nTime: %.4f sec\n", ctx->imgs.ref[0], total_time);
+
+	permutate(action, func);
+}
+
+#define UPDATE_REF_FRAME  1
+#define ACCEPT_STATIC_OBJECT_TIME 10  /* Seconds */
+#define EXCLUDE_LEVEL_PERCENT 20
+
+#include "../alg_update_reference_frame.plain.c"
+
+int
+main ()
+{
+	struct context ctx;
+
+	init(&ctx);
+
+	testsuite("plain", &ctx, UPDATE_REF_FRAME, alg_update_reference_frame_plain);
+
+	free(ctx.imgs.ref);
+	free(ctx.imgs.out);
+	free(ctx.imgs.ref_dyn);
+	free(ctx.imgs.image_virgin);
+	free(ctx.imgs.smartmask_final);
+
+	return 0;
+}

From 61c7a307132ad9b0f5905e18a036012595be41c8 Mon Sep 17 00:00:00 2001
From: Alfred Klomp <git@alfredklomp.com>
Date: Wed, 3 Sep 2014 19:23:06 +0200
Subject: [PATCH 07/11] alg_update_reference_frame: factor test masks out of
 branch condition

Taking the tests out of the branch condition results in a 50% speedup
with GCC when compiled with -O3.
---
 alg/alg_update_reference_frame.plain.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/alg/alg_update_reference_frame.plain.c b/alg/alg_update_reference_frame.plain.c
index d4b2b76..b03d82f 100644
--- a/alg/alg_update_reference_frame.plain.c
+++ b/alg/alg_update_reference_frame.plain.c
@@ -15,8 +15,11 @@ static void alg_update_reference_frame_plain(struct context *cnt, int action)
         threshold_ref = cnt->noise * EXCLUDE_LEVEL_PERCENT / 100;
 
         for (i = cnt->imgs.motionsize; i > 0; i--) {
+            int thresholdmask = ((int)(abs(*ref - *image_virgin)) > threshold_ref);
+            int includemask = (thresholdmask && (*smartmask != 0));
+
             /* Exclude pixels from ref frame well below noise level. */
-            if (((int)(abs(*ref - *image_virgin)) > threshold_ref) && (*smartmask)) {
+            if (includemask) {
                 if (*ref_dyn == 0) { /* Always give new pixels a chance. */
                     *ref_dyn = 1;
                 } else if (*ref_dyn > accept_timer) { /* Include static Object after some time. */

From d7a388a2ee7f47c7cc9e7385c638eb4ed6e6bf34 Mon Sep 17 00:00:00 2001
From: Alfred Klomp <git@alfredklomp.com>
Date: Wed, 3 Sep 2014 19:39:10 +0200
Subject: [PATCH 08/11] alg_update_reference_frame: add SSE2 algorithm
 demo/test

This file implements the masking algorithm we will use in the SSE2 code.
It demonstrates identical output when compared to the plain function.
It's a lot slower than the regular routine when used on individual
pixels, but is speedier when vectorized.

Instead of branching, we use masks to composite the output values. The
masks were found by breaking down the branch conditions into boolean
operations, and repeatedly applying De Morgan's laws to simplify them.
Since SSE has the 'andnot' operator, optimize for that form.
---
 alg/alg_update_reference_frame.sse2-algo.c  | 55 +++++++++++++++++++++
 alg/tests/Makefile                          |  2 +-
 alg/tests/test_alg_update_reference_frame.c |  2 +
 3 files changed, 58 insertions(+), 1 deletion(-)
 create mode 100644 alg/alg_update_reference_frame.sse2-algo.c

diff --git a/alg/alg_update_reference_frame.sse2-algo.c b/alg/alg_update_reference_frame.sse2-algo.c
new file mode 100644
index 0000000..eecb62b
--- /dev/null
+++ b/alg/alg_update_reference_frame.sse2-algo.c
@@ -0,0 +1,55 @@
+/* This file is not meant to be included into the main program; it's intended
+ * to showcase, benchmark and test the algorithm used in the SSE2 version of
+ * this routine, in simple, non-vectorized code.
+ * The idea is to replace all conditionals from the "plain" function with a
+ * series of mask operations. This is slow when done per pixel (since we do all
+ * calculations for all pixels), but fast in parallel.
+ */
+static void alg_update_reference_frame_sse2_algo(struct context *cnt, int action)
+{
+    int accept_timer = cnt->lastrate * ACCEPT_STATIC_OBJECT_TIME;
+    int i, threshold_ref;
+    int *ref_dyn = cnt->imgs.ref_dyn;
+    unsigned char *image_virgin = cnt->imgs.image_virgin;
+    unsigned char *ref = cnt->imgs.ref;
+    unsigned char *smartmask = cnt->imgs.smartmask_final;
+    unsigned char *out = cnt->imgs.out;
+
+    if (cnt->lastrate > 5)  /* Match rate limit */
+        accept_timer /= (cnt->lastrate / 3);
+
+    if (action == UPDATE_REF_FRAME) { /* Black&white only for better performance. */
+        threshold_ref = cnt->noise * EXCLUDE_LEVEL_PERCENT / 100;
+
+        for (i = cnt->imgs.motionsize; i > 0; i--) {
+            int thresholdmask = ((int)(abs(*ref - *image_virgin)) > threshold_ref);
+            int includemask = (thresholdmask && !(*smartmask == 0));
+            int refdynzero = (*ref_dyn == 0);
+            int refdyntimer = (*ref_dyn > accept_timer);
+            int outzero = (*out == 0);
+
+            *ref_dyn &= -(includemask && !(refdynzero || refdyntimer || outzero)); /* -1 keeps every bit, 0 clears the counter */
+
+            if (includemask && !(refdynzero || refdyntimer) && outzero) {
+                *ref = (*ref + *image_virgin) / 2;
+            }
+            if (includemask && !((refdyntimer || outzero) && !refdynzero)) {
+                *ref_dyn += 1;
+            }
+            if (!(includemask && !(refdyntimer && !refdynzero))) {
+                *ref = *image_virgin;
+            }
+            ref++;
+            image_virgin++;
+            smartmask++;
+            ref_dyn++;
+            out++;
+        } /* end for i */
+
+    } else {   /* action == RESET_REF_FRAME - also used to initialize the frame at startup. */
+        /* Copy fresh image */
+        memcpy(cnt->imgs.ref, cnt->imgs.image_virgin, cnt->imgs.size);
+        /* Reset static objects */
+        memset(cnt->imgs.ref_dyn, 0, cnt->imgs.motionsize * sizeof(*cnt->imgs.ref_dyn));
+    }
+}
diff --git a/alg/tests/Makefile b/alg/tests/Makefile
index 78ca939..2a0f3b7 100644
--- a/alg/tests/Makefile
+++ b/alg/tests/Makefile
@@ -14,7 +14,7 @@ test_alg_noise_tune.o: ../alg_noise_tune.plain.c ../alg_noise_tune.sse2.c test_a
 test_alg_update_reference_frame: test_alg_update_reference_frame.o timer.o
 	$(CC) $(LDFLAGS) -o $@ $^
 
-test_alg_update_reference_frame.o: ../alg_update_reference_frame.plain.c test_alg_update_reference_frame.c
+test_alg_update_reference_frame.o: ../alg_update_reference_frame.plain.c ../alg_update_reference_frame.sse2-algo.c test_alg_update_reference_frame.c
 	$(CC) $(CFLAGS) -o $@ -c test_alg_update_reference_frame.c
 
 timer.o: timer.c
diff --git a/alg/tests/test_alg_update_reference_frame.c b/alg/tests/test_alg_update_reference_frame.c
index 2d8fc6c..061b0e7 100644
--- a/alg/tests/test_alg_update_reference_frame.c
+++ b/alg/tests/test_alg_update_reference_frame.c
@@ -139,6 +139,7 @@ testsuite (char *name, struct context *ctx, int action, void (*func)(struct cont
 #define EXCLUDE_LEVEL_PERCENT 20
 
 #include "../alg_update_reference_frame.plain.c"
+#include "../alg_update_reference_frame.sse2-algo.c"
 
 int
 main ()
@@ -148,6 +149,7 @@ main ()
 	init(&ctx);
 
 	testsuite("plain", &ctx, UPDATE_REF_FRAME, alg_update_reference_frame_plain);
+	testsuite("plain, SSE2 algorithm demo", &ctx, UPDATE_REF_FRAME, alg_update_reference_frame_sse2_algo);
 
 	free(ctx.imgs.ref);
 	free(ctx.imgs.out);

From 391587bf69773704d924f59127465c94fb10a461 Mon Sep 17 00:00:00 2001
From: Alfred Klomp <git@alfredklomp.com>
Date: Wed, 3 Sep 2014 21:12:34 +0200
Subject: [PATCH 09/11] cnt->imgs.ref_dyn: specify type as uint16_t, not
 generic int

To convert the algorithms to SSE2, we need to know exactly which width
and type of int we're dealing with. Make this a uint16_t; the type
looks large enough for a counter that updates every frame. At 10 Hz, it
will take almost 2 hours for the counter to saturate; enough time to
finally accept a static object.
---
 alg/alg_update_reference_frame.plain.c      | 2 +-
 alg/alg_update_reference_frame.sse2-algo.c  | 2 +-
 alg/tests/test_alg_update_reference_frame.c | 4 ++--
 motion.h                                    | 2 +-
 4 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/alg/alg_update_reference_frame.plain.c b/alg/alg_update_reference_frame.plain.c
index b03d82f..71c0ab1 100644
--- a/alg/alg_update_reference_frame.plain.c
+++ b/alg/alg_update_reference_frame.plain.c
@@ -2,7 +2,7 @@ static void alg_update_reference_frame_plain(struct context *cnt, int action)
 {
     int accept_timer = cnt->lastrate * ACCEPT_STATIC_OBJECT_TIME;
     int i, threshold_ref;
-    int *ref_dyn = cnt->imgs.ref_dyn;
+    uint16_t *ref_dyn = cnt->imgs.ref_dyn;
     unsigned char *image_virgin = cnt->imgs.image_virgin;
     unsigned char *ref = cnt->imgs.ref;
     unsigned char *smartmask = cnt->imgs.smartmask_final;
diff --git a/alg/alg_update_reference_frame.sse2-algo.c b/alg/alg_update_reference_frame.sse2-algo.c
index eecb62b..b8fbef8 100644
--- a/alg/alg_update_reference_frame.sse2-algo.c
+++ b/alg/alg_update_reference_frame.sse2-algo.c
@@ -9,7 +9,7 @@ static void alg_update_reference_frame_sse2_algo(struct context *cnt, int action
 {
     int accept_timer = cnt->lastrate * ACCEPT_STATIC_OBJECT_TIME;
     int i, threshold_ref;
-    int *ref_dyn = cnt->imgs.ref_dyn;
+    uint16_t *ref_dyn = cnt->imgs.ref_dyn;
     unsigned char *image_virgin = cnt->imgs.image_virgin;
     unsigned char *ref = cnt->imgs.ref;
     unsigned char *smartmask = cnt->imgs.smartmask_final;
diff --git a/alg/tests/test_alg_update_reference_frame.c b/alg/tests/test_alg_update_reference_frame.c
index 061b0e7..5431030 100644
--- a/alg/tests/test_alg_update_reference_frame.c
+++ b/alg/tests/test_alg_update_reference_frame.c
@@ -10,7 +10,7 @@ struct images
 {
 	unsigned char *ref;
 	unsigned char *out;
-	int *ref_dyn;
+	uint16_t *ref_dyn;
 	unsigned char *image_virgin;
 	unsigned char *smartmask_final;
 	int size;
@@ -60,7 +60,7 @@ permutate (int action, void (*func)(struct context *, int))
 	unsigned char out[16];
 	unsigned char image_virgin[16];
 	unsigned char smartmask_final[16];
-	int ref_dyn[16];
+	uint16_t ref_dyn[16];
 	struct context ctx;
 	unsigned int ref_cksum;
 	unsigned int ref_dyn_cksum;
diff --git a/motion.h b/motion.h
index c08d84f..9c12255 100644
--- a/motion.h
+++ b/motion.h
@@ -289,7 +289,7 @@ struct images {
 
     unsigned char *ref;               /* The reference frame */
     unsigned char *out;               /* Picture buffer for motion images */
-    int *ref_dyn;                     /* Dynamic objects to be excluded from reference frame */
+    uint16_t *ref_dyn;                /* Dynamic objects to be excluded from reference frame */
     unsigned char *image_virgin;      /* Last picture frame with no text or locate overlay */
     struct image_data preview_image;  /* Picture buffer for best image when enables */
     unsigned char *mask;              /* Buffer for the mask file */

From e28d698f45f7e8bad7cec3e807f033ce975b1bff Mon Sep 17 00:00:00 2001
From: Alfred Klomp <git@alfredklomp.com>
Date: Wed, 3 Sep 2014 22:09:13 +0200
Subject: [PATCH 10/11] alg_update_reference_frame: add optimized SSE2 routine

---
 alg.c                                       |   8 ++
 alg/alg_update_reference_frame.sse2.c       | 142 ++++++++++++++++++++
 alg/sse2.h                                  |  31 +++++
 alg/tests/Makefile                          |   8 +-
 alg/tests/test_alg_update_reference_frame.c |   4 +
 5 files changed, 191 insertions(+), 2 deletions(-)
 create mode 100644 alg/alg_update_reference_frame.sse2.c

diff --git a/alg.c b/alg.c
index fcdbec2..1692537 100644
--- a/alg.c
+++ b/alg.c
@@ -1291,7 +1291,11 @@ int alg_switchfilter(struct context *cnt, int diffs, unsigned char *newimg)
 #define ACCEPT_STATIC_OBJECT_TIME 10  /* Seconds */
 #define EXCLUDE_LEVEL_PERCENT 20
 
+#ifdef HAVE_SSE2
+#include "alg/alg_update_reference_frame.sse2.c"
+#else
 #include "alg/alg_update_reference_frame.plain.c"
+#endif
 
 /** 
  * alg_update_reference_frame
@@ -1308,5 +1312,9 @@ int alg_switchfilter(struct context *cnt, int diffs, unsigned char *newimg)
  */
 void alg_update_reference_frame(struct context *cnt, int action) 
 {
+#ifdef HAVE_SSE2
+    alg_update_reference_frame_sse2(cnt, action);
+#else
     alg_update_reference_frame_plain(cnt, action);
+#endif
 }
diff --git a/alg/alg_update_reference_frame.sse2.c b/alg/alg_update_reference_frame.sse2.c
new file mode 100644
index 0000000..4ce6668
--- /dev/null
+++ b/alg/alg_update_reference_frame.sse2.c
@@ -0,0 +1,142 @@
+/* The basic algorithm is demonstrated in 'alg_update_reference_frame.sse2-algo.c'
+ *  as regular (non-SIMD), more readable code. Comments below allude to
+ *  snippets from that file. The idea is to use masks instead of
+ *  branches to compose the output, then do it in parallel. */
+
+static void alg_update_reference_frame_sse2(struct context *cnt, int action)
+{
+    int accept_timer = cnt->lastrate * ACCEPT_STATIC_OBJECT_TIME;
+    int i, threshold_ref;
+    uint16_t *ref_dyn = cnt->imgs.ref_dyn;
+    unsigned char *image_virgin = cnt->imgs.image_virgin;
+    unsigned char *ref = cnt->imgs.ref;
+    unsigned char *smartmask = cnt->imgs.smartmask_final;
+    unsigned char *out = cnt->imgs.out;
+
+    int sse_iters;
+    __m128i threshrow, accepttimerrow, mask;
+
+    if (cnt->lastrate > 5)  /* Match rate limit */
+        accept_timer /= (cnt->lastrate / 3);
+
+    if (action == UPDATE_REF_FRAME) { /* Black&white only for better performance. */
+        threshold_ref = cnt->noise * EXCLUDE_LEVEL_PERCENT / 100;
+
+        i = cnt->imgs.motionsize;
+
+        /* Below we'll do a calculation to see whether our 8-bit uints
+         * are *larger* than threshold_ref. Threshold_ref is an int, but
+         * for the purposes of this check we can cast it to an 8-bit uint
+         * and clamp it to 255; the comparator can never exceed that value: */
+        threshrow = _mm_set1_epi8((threshold_ref > 0xFF) ? 0xFF : threshold_ref);
+
+        /* Row of 8 uint16_t's holding accept_timer clamped to 0xFFFE (presumably so a saturated 0xFFFF ref_dyn still compares greater): */
+        accepttimerrow = _mm_set1_epi16((accept_timer > 0xFFFE) ? 0xFFFE : accept_timer);
+
+        /* SSE row size is 16 bytes: */
+        for (sse_iters = i >> 4; sse_iters > 0; sse_iters--)
+        {
+            /* Load reference row and virgin image: */
+            __m128i refrow = _mm_loadu_si128((__m128i *)ref);
+            __m128i vgnrow = _mm_loadu_si128((__m128i *)image_virgin);
+
+            /* int thresholdmask = ((int)(abs(*ref - *image_virgin)) > threshold_ref); */
+            __m128i thresholdmask = _mm_cmpgt_epu8(_mm_absdiff_epu8(refrow, vgnrow), threshrow);
+
+            /* int includemask = (thresholdmask && !(*smartmask == 0)); */
+            __m128i smartmaskzero = _mm_cmpeq_epi8(_mm_loadu_si128((__m128i *)smartmask), _mm_setzero_si128());
+            __m128i includemask = _mm_andnot_si128(smartmaskzero, thresholdmask);
+
+            /* Load the two ref_dyn's: */
+            __m128i refdynlo = _mm_loadu_si128((__m128i *)(ref_dyn + 0));
+            __m128i refdynhi = _mm_loadu_si128((__m128i *)(ref_dyn + 8));
+
+            /* int refdynzero = (*ref_dyn == 0); */
+            /* Make an 8-bit mask with 0xFF where ref_dyn == 0: */
+            __m128i refdynzero = _mm_packs_epi16(
+                _mm_cmpeq_epi16(refdynlo, _mm_setzero_si128()),
+                _mm_cmpeq_epi16(refdynhi, _mm_setzero_si128())
+            );
+
+            /* int refdyntimer = (*ref_dyn > accept_timer); */
+            /* Make an 8-bit mask with 0xFF where ref_dyn > accept_timer: */
+            __m128i refdyntimer = _mm_packs_epi16(
+                _mm_cmpgt_epu16(refdynlo, accepttimerrow),
+                _mm_cmpgt_epu16(refdynhi, accepttimerrow)
+            );
+
+            /* int outzero = (*out == 0); */
+            __m128i outzero = _mm_cmpeq_epi8(_mm_loadu_si128((__m128i *)out), _mm_setzero_si128());
+
+            /* *ref_dyn &= (includemask && !(refdynzero || refdyntimer || outzero)); */
+            mask = _mm_andnot_si128(_mm_or_si128(_mm_or_si128(refdynzero, refdyntimer), outzero), includemask);
+
+            /* Duplicate mask to 16-bit widths: */
+            refdynlo = _mm_and_si128(refdynlo, _mm_unpacklo_epi8(mask, mask));
+            refdynhi = _mm_and_si128(refdynhi, _mm_unpackhi_epi8(mask, mask));
+
+            /* if (includemask && !(refdynzero || refdyntimer) && outzero) *ref = (*ref + *image_virgin) / 2; (avg_epu8 rounds up, so subtract the low bit of ref ^ virgin to truncate like the C code) */
+            mask = _mm_and_si128(_mm_andnot_si128(_mm_or_si128(refdynzero, refdyntimer), includemask), outzero);
+            refrow = _mm_blendv_si128(refrow, _mm_sub_epi8(_mm_avg_epu8(refrow, vgnrow), _mm_and_si128(_mm_xor_si128(refrow, vgnrow), _mm_set1_epi8(1))), mask);
+
+            /* if (includemask && !((refdyntimer || outzero) && !refdynzero)) *ref_dyn += 1; */
+            mask = _mm_andnot_si128(_mm_andnot_si128(refdynzero, _mm_or_si128(refdyntimer, outzero)), includemask);
+            refdynlo = _mm_adds_epu16(refdynlo, _mm_and_si128(_mm_set1_epi16(1), _mm_unpacklo_epi8(mask, mask)));
+            refdynhi = _mm_adds_epu16(refdynhi, _mm_and_si128(_mm_set1_epi16(1), _mm_unpackhi_epi8(mask, mask)));
+
+            /* Store the two ref dyn's back: */
+            _mm_storeu_si128((__m128i *)(ref_dyn + 0), refdynlo);
+            _mm_storeu_si128((__m128i *)(ref_dyn + 8), refdynhi);
+
+            /* if (!(includemask && !(refdyntimer && !refdynzero))) *ref = *image_virgin; */
+            mask = _mm_andnot_si128(_mm_andnot_si128(refdynzero, refdyntimer), includemask);
+            refrow = _mm_blendv_si128(vgnrow, refrow, mask);
+
+            /* Store ref back: */
+            _mm_storeu_si128((__m128i *)ref, refrow);
+
+            ref += 16;
+            image_virgin += 16;
+            smartmask += 16;
+            ref_dyn += 16;
+            out += 16;
+        }
+
+        /* Let the bytewise code handle the remaining bytes: */
+        for (i = cnt->imgs.motionsize & 0x0F; i > 0; i--) {
+            int thresholdmask = ((int)(abs(*ref - *image_virgin)) > threshold_ref);
+            int includemask = (thresholdmask && (*smartmask != 0));
+
+            /* Exclude pixels from ref frame well below noise level. */
+            if (includemask) {
+                if (*ref_dyn == 0) { /* Always give new pixels a chance. */
+                    *ref_dyn = 1;
+                } else if (*ref_dyn > accept_timer) { /* Include static Object after some time. */
+                    *ref_dyn = 0;
+                    *ref = *image_virgin;
+                } else if (*out) {
+                    (*ref_dyn)++; /* Motionpixel? Keep excluding from ref frame. */
+                } else {
+                    *ref_dyn = 0; /* Nothing special - release pixel. */
+                    *ref = (*ref + *image_virgin) / 2;
+                }
+
+            } else {  /* No motion: copy to ref frame. */
+                *ref_dyn = 0; /* Reset pixel */
+                *ref = *image_virgin;
+            }
+
+            ref++;
+            image_virgin++;
+            smartmask++;
+            ref_dyn++;
+            out++;
+        } /* end for i */
+
+    } else {   /* action == RESET_REF_FRAME - also used to initialize the frame at startup. */
+        /* Copy fresh image */
+        memcpy(cnt->imgs.ref, cnt->imgs.image_virgin, cnt->imgs.size);
+        /* Reset static objects */
+        memset(cnt->imgs.ref_dyn, 0, cnt->imgs.motionsize * sizeof(*cnt->imgs.ref_dyn));
+    }
+}
diff --git a/alg/sse2.h b/alg/sse2.h
index 8e193d7..f7e5a64 100644
--- a/alg/sse2.h
+++ b/alg/sse2.h
@@ -1,3 +1,27 @@
+static __inline __m128i
+_mm_cmpgt_epu8 (__m128i x, __m128i y)
+{
+    /* Unsigned bytewise compare, 0xFF where x > y: (max(x, y) == x) marks x >= y, and the andnot drops the x == y lanes. */
+    return _mm_andnot_si128(
+        _mm_cmpeq_epi8(x, y),
+        _mm_cmpeq_epi8(_mm_max_epu8(x, y), x)
+    );
+}
+
+static __inline __m128i
+_mm_cmple_epu16 (__m128i x, __m128i y)
+{
+    /* Unsigned 16-bit compare, 0xFFFF where x <= y: the saturating subtraction x - y is zero exactly in those lanes. */
+    return _mm_cmpeq_epi16(_mm_subs_epu16(x, y), _mm_setzero_si128());
+}
+
+static __inline __m128i
+_mm_cmpgt_epu16 (__m128i x, __m128i y)
+{
+    /* Unsigned 16-bit compare, 0xFFFF where x > y: take the (y <= x) lanes and mask off those where x == y. */
+    return _mm_andnot_si128(_mm_cmpeq_epi16(x, y), _mm_cmple_epu16(y, x));
+}
+
 static __inline __m128i
 _mm_absdiff_epu8 (__m128i x, __m128i y)
 {
@@ -5,6 +29,13 @@ _mm_absdiff_epu8 (__m128i x, __m128i y)
     return _mm_or_si128(_mm_subs_epu8(x, y), _mm_subs_epu8(y, x));
 }
 
+static __inline __m128i
+_mm_blendv_si128 (__m128i x, __m128i y, __m128i mask)
+{
+    /* Bitwise select: each result bit comes from y where the matching mask bit is set, and from x where it is clear. */
+    return _mm_or_si128(_mm_andnot_si128(mask, x), _mm_and_si128(mask, y));
+}
+
 static __inline __m128i
 _mm_div255_epu16 (__m128i x)
 {
diff --git a/alg/tests/Makefile b/alg/tests/Makefile
index 2a0f3b7..a02b4f5 100644
--- a/alg/tests/Makefile
+++ b/alg/tests/Makefile
@@ -14,11 +14,15 @@ test_alg_noise_tune.o: ../alg_noise_tune.plain.c ../alg_noise_tune.sse2.c test_a
 test_alg_update_reference_frame: test_alg_update_reference_frame.o timer.o
 	$(CC) $(LDFLAGS) -o $@ $^
 
-test_alg_update_reference_frame.o: ../alg_update_reference_frame.plain.c ../alg_update_reference_frame.sse2-algo.c test_alg_update_reference_frame.c
+test_alg_update_reference_frame.o: ../alg_update_reference_frame.plain.c ../alg_update_reference_frame.sse2-algo.c ../alg_update_reference_frame.sse2.c test_alg_update_reference_frame.c
 	$(CC) $(CFLAGS) -o $@ -c test_alg_update_reference_frame.c
 
 timer.o: timer.c
 	$(CC) $(CFLAGS) -o $@ -c $^
 
+# This one is just for curiosity:
+test_alg_update_reference_frame.s: test_alg_update_reference_frame.c
+	$(CC) $(CFLAGS) -S -o $@ -c $^
+
 clean:
-	rm -f *.o test_alg_noise_tune test_alg_update_reference_frame
+	rm -f *.o *.s test_alg_noise_tune test_alg_update_reference_frame
diff --git a/alg/tests/test_alg_update_reference_frame.c b/alg/tests/test_alg_update_reference_frame.c
index 5431030..2925538 100644
--- a/alg/tests/test_alg_update_reference_frame.c
+++ b/alg/tests/test_alg_update_reference_frame.c
@@ -2,7 +2,9 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+#include <emmintrin.h>
 
+#include "../sse2.h"
 #include "timer.h"
 
 /* Stub structures for test purposes: */
@@ -140,6 +142,7 @@ testsuite (char *name, struct context *ctx, int action, void (*func)(struct cont
 
 #include "../alg_update_reference_frame.plain.c"
 #include "../alg_update_reference_frame.sse2-algo.c"
+#include "../alg_update_reference_frame.sse2.c"
 
 int
 main ()
@@ -150,6 +153,7 @@ main ()
 
 	testsuite("plain", &ctx, UPDATE_REF_FRAME, alg_update_reference_frame_plain);
 	testsuite("plain, SSE2 algorithm demo", &ctx, UPDATE_REF_FRAME, alg_update_reference_frame_sse2_algo);
+	testsuite("SSE2", &ctx, UPDATE_REF_FRAME, alg_update_reference_frame_sse2);
 
 	free(ctx.imgs.ref);
 	free(ctx.imgs.out);

From 282e571655278889b87a59b889ad93c176a0b3b5 Mon Sep 17 00:00:00 2001
From: Alfred Klomp <git@alfredklomp.com>
Date: Wed, 3 Sep 2014 22:34:20 +0200
Subject: [PATCH 11/11] alg_update_reference_frame: test harness: directly
 compare functions

Directly run two functions on the same input and check whether they give
the same output; don't just rely on printing a few numbers to the screen
and eyeballing the results.
---
 alg/tests/test_alg_update_reference_frame.c | 103 ++++++++++++++------
 1 file changed, 73 insertions(+), 30 deletions(-)

diff --git a/alg/tests/test_alg_update_reference_frame.c b/alg/tests/test_alg_update_reference_frame.c
index 2925538..218fa31 100644
--- a/alg/tests/test_alg_update_reference_frame.c
+++ b/alg/tests/test_alg_update_reference_frame.c
@@ -55,17 +55,59 @@ clean (struct context *ctx)
 	ctx->imgs.motionsize = WIDTH * HEIGHT;
 }
 
+/* Run func_a and func_b on private deep copies of ctx; return 1 if their ref and ref_dyn outputs are identical, else 0. */
+static int
+equal_output (struct context *ctx, int action, void (*func_a)(struct context *, int), void (*func_b)(struct context *, int))
+{
+	int i, ret = 1;
+	struct context cxs[2];
+
+	for (i = 0; i < 2; i++)
+	{
+		/* Copy original context (this includes the embedded imgs struct): */
+		memcpy(&cxs[i], ctx, sizeof(*ctx));
+
+		/* Copy the original image structures: */
+		#define CPY(x)  cxs[i].imgs.x = malloc(ctx->imgs.size * sizeof(*ctx->imgs.x)); memcpy(cxs[i].imgs.x, ctx->imgs.x, ctx->imgs.size * sizeof(*ctx->imgs.x));
+		CPY(ref)
+		CPY(out)
+		CPY(image_virgin)
+		CPY(smartmask_final)
+		CPY(ref_dyn)
+		#undef CPY
+	}
+	/* Run both functions on their own copy: */
+	func_a(&cxs[0], action);
+	func_b(&cxs[1], action);
+
+	/* Compare image outputs over the whole buffer, not just the first element: */
+	#define CMP(x)  if (memcmp(cxs[0].imgs.x, cxs[1].imgs.x, ctx->imgs.size * sizeof(*cxs[0].imgs.x)) != 0) { ret = 0; goto out; }
+	CMP(ref)
+	CMP(ref_dyn)
+	#undef CMP
+
+out:	/* Free memory, return: */
+	for (i = 0; i < 2; i++) {
+		free(cxs[i].imgs.ref);
+		free(cxs[i].imgs.out);
+		free(cxs[i].imgs.image_virgin);
+		free(cxs[i].imgs.smartmask_final);
+		free(cxs[i].imgs.ref_dyn);
+	}
+	return ret;
+}
+
 static void
-permutate (int action, void (*func)(struct context *, int))
+permutate (int action, void (*func_a)(struct context *, int), void (*func_b)(struct context *, int))
 {
-	unsigned char ref[16];
-	unsigned char out[16];
-	unsigned char image_virgin[16];
-	unsigned char smartmask_final[16];
-	uint16_t ref_dyn[16];
+	#define STRIPSZ 41
+
+	unsigned char ref[STRIPSZ];
+	unsigned char out[STRIPSZ];
+	unsigned char image_virgin[STRIPSZ];
+	unsigned char smartmask_final[STRIPSZ];
+	uint16_t ref_dyn[STRIPSZ];
 	struct context ctx;
-	unsigned int ref_cksum;
-	unsigned int ref_dyn_cksum;
 
 	int i, iter_ref_dyn, iter_smartmask, iter_image_virgin, iter_out, iter_ref;
 
@@ -76,8 +118,8 @@ permutate (int action, void (*func)(struct context *, int))
 	ctx.imgs.image_virgin = image_virgin;
 	ctx.imgs.smartmask_final = smartmask_final;
 	ctx.imgs.ref_dyn = ref_dyn;
-	ctx.imgs.size = 16;
-	ctx.imgs.motionsize = 16;
+	ctx.imgs.size = STRIPSZ;
+	ctx.imgs.motionsize = STRIPSZ;
 
 	/* For the purposes of the routine, smartmask is zero or nonzero: */
 	for (iter_smartmask = 0; iter_smartmask < 2; iter_smartmask++) {
@@ -87,35 +129,35 @@ permutate (int action, void (*func)(struct context *, int))
 		for (iter_out = 0; iter_out < 2; iter_out++) {
 			memset(out, iter_out, ctx.imgs.size);
 
-			ref_cksum = 0;
-			ref_dyn_cksum = 0;
-
 			for (iter_image_virgin = 0; iter_image_virgin < 256; iter_image_virgin++) {
-				memset(image_virgin, iter_image_virgin, ctx.imgs.size);
-
+				for (i = 0; i < ctx.imgs.size; i++) {
+					image_virgin[i] = iter_image_virgin + i;
+				}
 				/* ref_dyn has a limited range: */
 				for (iter_ref_dyn = 0; iter_ref_dyn < 10; iter_ref_dyn++) {
-					for (i = 0; i < 16; i++) {
-						ref_dyn[i] = iter_ref_dyn + 1;
+					for (i = 0; i < ctx.imgs.size; i++) {
+						ref_dyn[i] = iter_ref_dyn + i;
 					}
 					for (iter_ref = 0; iter_ref < 256; iter_ref++) {
-						memset(ref, iter_ref, ctx.imgs.size);
-						func(&ctx, action);
-						ref_cksum += ref[0];
-
-						for (i = 0; i < 16; i++) {
-							ref_dyn_cksum += ref_dyn[i];
+						for (i = 0; i < ctx.imgs.size; i++) {
+							ref[i] = iter_ref + i;
+						}
+						/* For this permutation, check that both functions
+						 * return the same output data: */
+						if (equal_output(&ctx, action, func_a, func_b) == 0) {
+							printf("Functions do NOT match!\n");
+							return;
 						}
 					}
 				}
 			}
-			printf("%d %d\n", ref_cksum, ref_dyn_cksum);
 		}
 	}
+	printf("Functions MATCH\n");
 }
 
 static void
-testsuite (char *name, struct context *ctx, int action, void (*func)(struct context *, int))
+timing (char *name, struct context *ctx, int action, void (*func)(struct context *, int))
 {
 	int i;
 	float total_time = 0.0f;
@@ -132,8 +174,6 @@ testsuite (char *name, struct context *ctx, int action, void (*func)(struct cont
 
 	/* Print bogus value to prevent the loop from being optimized out: */
 	printf("Value: %d\nTime: %.4f sec\n", ctx->imgs.ref[0], total_time);
-
-	permutate(action, func);
 }
 
 #define UPDATE_REF_FRAME  1
@@ -151,9 +191,12 @@ main ()
 
 	init(&ctx);
 
-	testsuite("plain", &ctx, UPDATE_REF_FRAME, alg_update_reference_frame_plain);
-	testsuite("plain, SSE2 algorithm demo", &ctx, UPDATE_REF_FRAME, alg_update_reference_frame_sse2_algo);
-	testsuite("SSE2", &ctx, UPDATE_REF_FRAME, alg_update_reference_frame_sse2);
+	timing("plain", &ctx, UPDATE_REF_FRAME, alg_update_reference_frame_plain);
+	timing("plain, SSE2 algorithm demo", &ctx, UPDATE_REF_FRAME, alg_update_reference_frame_sse2_algo);
+	timing("SSE2", &ctx, UPDATE_REF_FRAME, alg_update_reference_frame_sse2);
+
+	permutate(UPDATE_REF_FRAME, alg_update_reference_frame_plain, alg_update_reference_frame_sse2_algo);
+	permutate(UPDATE_REF_FRAME, alg_update_reference_frame_plain, alg_update_reference_frame_sse2);
 
 	free(ctx.imgs.ref);
 	free(ctx.imgs.out);