diff --git a/CMakeLists.txt b/CMakeLists.txt
index afc7423f..a8bc6a39 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -69,7 +69,7 @@ if(NOT CMAKE_BUILD_TYPE)
   set(CMAKE_BUILD_TYPE Release)
 endif()
 
-set(CMAKE_CXX_FLAGS_RELEASE "-O2")
+set(CMAKE_CXX_FLAGS_RELEASE "-O2 -march=native")
 
 #== GTest 
 option(RUN_GTEST "Downloads google unit test API and runs google test scripts to test Nyxus" OFF)
@@ -137,11 +137,13 @@ set(SOURCE
 	src/nyx/output_2_csv.cpp
 	src/nyx/parallel.cpp
 	src/nyx/phase1.cpp
+	src/nyx/phase1_fastloop.cpp
 	src/nyx/phase2.cpp
 	src/nyx/phase3.cpp
 	src/nyx/pixel_feed.cpp
 	src/nyx/reduce_by_feature.cpp
 	src/nyx/reduce_trivial_rois.cpp
+	src/nyx/rle.cpp
 	src/nyx/roi_cache.cpp
 	src/nyx/roi_cache_basic.cpp
 	src/nyx/scan_fastloader_way.cpp
diff --git a/src/nyx/features/aabb.h b/src/nyx/features/aabb.h
index 7c93d032..5ed5ca90 100644
--- a/src/nyx/features/aabb.h
+++ b/src/nyx/features/aabb.h
@@ -26,6 +26,14 @@ class AABB
 		xmin = std::min(xmin, x);
 		xmax = std::max(xmax, x);
 	}
+	void update_xmax(StatsInt x)
+	{
+		xmax = std::max(xmax, x);
+	}
+	void update_xmin(StatsInt x)
+	{
+		xmin = std::min(xmin, x);
+	}
 	void update_y(StatsInt y)
 	{
 		ymin = std::min(ymin, y);
@@ -52,4 +60,4 @@ class AABB
 
 private:
 	StatsInt xmin = INT32_MAX, xmax = INT32_MIN, ymin = INT32_MAX, ymax = INT32_MIN;
-};
\ No newline at end of file
+};
diff --git a/src/nyx/features/basic_morphology.cpp b/src/nyx/features/basic_morphology.cpp
index b77293e2..8d2adb24 100644
--- a/src/nyx/features/basic_morphology.cpp
+++ b/src/nyx/features/basic_morphology.cpp
@@ -36,10 +36,19 @@ void BasicMorphologyFeatures::calculate(LR& r)
 	// --CENTROID_XY
 	double cen_x = 0.0,
 		cen_y = 0.0;
-	for (auto& px : r.raw_pixels)
-	{
-		cen_x += px.x;
-		cen_y += px.y;
+	if (r.rle_pixels.empty()) {
+		for (auto& px : r.raw_pixels)
+		{
+			cen_x += px.x;
+			cen_y += px.y;
+		}
+	} else {
+		for (int i = 0; i < r.rle_pixels.size(); i+=2) {
+			double n = r.rle_pixels[i+1].x - r.rle_pixels[i].x + 1;
+			double s = r.rle_pixels[i+1].x + r.rle_pixels[i].x;
+			cen_x += n/2 * s;
+			cen_y += r.rle_pixels[i].y * n;
+		}
 	}
 
 	val_CENTROID_X = cen_x;
@@ -62,10 +71,19 @@ void BasicMorphologyFeatures::calculate(LR& r)
 
 	//==== Basic morphology :: Centroids
 	val_CENTROID_X = val_CENTROID_Y = 0;
-	for (auto& px : r.raw_pixels)
-	{
-		val_CENTROID_X += px.x;
-		val_CENTROID_Y += px.y;
+	if (r.rle_pixels.empty()) {
+		for (auto& px : r.raw_pixels)
+		{
+			val_CENTROID_X += px.x;
+			val_CENTROID_Y += px.y;
+		}
+	} else {
+		for (int i = 0; i < r.rle_pixels.size(); i+=2) {
+			double n = r.rle_pixels[i+1].x - r.rle_pixels[i].x + 1;
+			double s = r.rle_pixels[i+1].x + r.rle_pixels[i].x;
+			val_CENTROID_X += n/2 * s;
+			val_CENTROID_Y += r.rle_pixels[i].y * n;
+		}
 	}
 	val_CENTROID_X /= n;
 	val_CENTROID_Y /= n;
diff --git a/src/nyx/features/contour.cpp b/src/nyx/features/contour.cpp
index 1632ea93..c169e2f1 100644
--- a/src/nyx/features/contour.cpp
+++ b/src/nyx/features/contour.cpp
@@ -1,3 +1,6 @@
+#include "pixel.h"
+#include <cstdint>
+#include <vector>
 #define _USE_MATH_DEFINES	// For M_PI, etc.
 #include <cmath>
 #include <memory>
@@ -30,8 +33,169 @@ ContourFeature::ContourFeature() : FeatureMethod("ContourFeature")
 		});
 }
 
+void ContourFeature::buildRegularContourFast(LR& r)
+{
+
+	r.contour.clear();
+
+	// Order the RLE
+	std::vector<Pixel2> rle = r.rle_pixels;
+	std::sort (rle.begin(), rle.end(), compare_locations);
+
+	// Get starting points of each row
+	// Also merge overlapping RLE
+	std::vector<uint32_t> rows;
+	rows.push_back(0);
+	for (uint32_t i = 2; i < rle.size(); i+=2) {
+
+		// Finds row boundary
+		if (rle[i].y != rle[i-1].y) {
+			rows.push_back(i);
+			continue;
+		}
+
+		// Find overlapping RLE and merge them
+		if (rle[i].x-1 == rle[i-1].x) {
+			rle[i-1] = rle[i+1];
+			rle[i].x = rle[i+1].x+1;
+		}
+	}
+	rows.push_back(rle.size());
+
+	// The first row must be entirely on the edge, so store it in the contour
+	uint32_t pix_ind = 0;
+	for (uint32_t i = 0; i < rows[1]; i+=2) {
+		uint32_t n = rle[i+1].x - rle[i].x + 1;
+		for (uint32_t ind = pix_ind; ind < pix_ind + n; ++ind) {
+			Pixel2 p(r.raw_pixels[ind].x,r.raw_pixels[ind].y,r.raw_pixels[ind].inten);
+			r.contour.push_back(p);
+		}
+		pix_ind += n;
+	}
+
+	// Loop over rows from the 2nd row until the 2nd to last row
+	for (uint32_t p = 1; p < rows.size()-2; ++p) {
+		uint32_t prev = rows[p-1];
+		uint32_t prev_end = rows[p];
+		uint32_t next = rows[p+1];
+		uint32_t next_end = rows[p+2];
+
+		// Find the intersection of previous and next rows
+		std::vector<uint32_t> overlap;
+		while (prev < prev_end && next < next_end) {
+			if (rle[prev].x > rle[next+1].x) {
+				next += 2;
+				continue;
+			}
+			if (rle[next].x > rle[prev+1].x) {
+				prev += 2;
+				continue;
+			}
+
+			// If we get to this point, we have found overlap
+			overlap.push_back(std::max(rle[prev].x,rle[next].x));
+			overlap.push_back(std::min(rle[prev+1].x,rle[next+1].x));
+
+			// Determine whether to advance next or previous
+			if (rle[prev+1].x > rle[next+1].x) {
+				next += 2;
+			} else {
+				prev += 2;
+			}
+		}
+
+		// If there is no overlap, all pixels are edge pixels
+		if (overlap.empty()) {
+			for (uint32_t current = rows[p]; current < rows[p+1]; current+=2) {
+				uint32_t n = rle[current+1].x - rle[current].x + 1;
+				for (uint32_t ind = pix_ind; ind < pix_ind + n; ++ind) {
+					Pixel2 p(r.raw_pixels[ind].x,r.raw_pixels[ind].y,r.raw_pixels[ind].inten);
+					r.contour.push_back(p);
+				}
+				pix_ind += n;
+			}
+		} else {
+			// Add pixels to non-overlapping regions
+			uint32_t oind = 0;
+			uint32_t offset = 0;
+			std::vector<uint32_t> nonoverlap;
+			for (uint32_t current = rows[p]; current < rows[p+1]; current+=2) {
+
+				// skip over invalid rle
+				if (rle[current+1].x < rle[current].x) {
+					continue;
+				}
+				
+				// The first pixel is always on the edge
+				nonoverlap.push_back(offset);
+
+				uint32_t x_start = rle[current].x;
+
+				while (oind < overlap.size()) {
+
+					if (overlap[oind+1] < rle[current].x) {
+						oind += 2;
+						continue;
+					}
+					
+					if (overlap[oind] < rle[current+1].x) {
+						long end = std::max(rle[current].x + 1, (long) overlap[oind]);
+						long start = std::min(rle[current + 1].x, (long) overlap[oind + 1]+1);
+						nonoverlap.push_back(end - x_start + offset);
+						nonoverlap.push_back(start - x_start + offset);
+					}
+
+					if (overlap[oind+1] < rle[current+1].x) {
+						oind += 2;
+					} else {
+						break;
+					}
+				}
+
+				// The last pixel is always on the edge
+				// In the case of a single pixel gap, it might already be there
+				uint32_t last = rle[current+1].x - x_start + 1 + offset;
+				if (nonoverlap[nonoverlap.size()-2] == last) {
+					nonoverlap.pop_back();
+				} else {
+					nonoverlap.push_back(last);
+				}
+
+				offset += rle[current+1].x - rle[current].x + 1;
+			}
+
+			// Add pixels to the contour
+			for (uint32_t ind = 0; ind < nonoverlap.size(); ind += 2) {
+				for (uint32_t pind = nonoverlap[ind]; pind < nonoverlap[ind+1]; ++pind) {
+					Pixel2 p(r.raw_pixels[pix_ind+pind].x, r.raw_pixels[pix_ind+pind].y, r.raw_pixels[pix_ind+pind].inten);
+					r.contour.push_back(p);
+				}
+			}
+			pix_ind += offset;
+		}
+	}
+
+	// The last row must be entirely on the edge, so store it in the contour
+	pix_ind = r.raw_pixels.size();
+	for (uint32_t i = rows[rows.size()-2]; i < rows[rows.size()-1]; i+=2) {
+		uint32_t n = rle[i+1].x - rle[i].x + 1;
+		pix_ind -= n;
+		for (uint32_t ind = pix_ind; ind < pix_ind + n; ++ind) {
+			Pixel2 p(r.raw_pixels[ind].x,r.raw_pixels[ind].y,r.raw_pixels[ind].inten);
+			r.contour.push_back(p);
+		}
+	}
+}
+
 void ContourFeature::buildRegularContour(LR& r)
 {
+
+	// If RLE is present, use it
+	if (!r.rle_pixels.empty()) {
+		buildRegularContourFast(r);
+		return;
+	}
+
 	//==== Pad the image
 
 	int width = r.aux_image_matrix.width,
@@ -179,6 +343,11 @@ void ContourFeature::buildRegularContour(LR& r)
 	}
 }
 
+bool ContourFeature::compare_locations (const Pixel2& lhs, const Pixel2& rhs)
+{
+	return (lhs.y < rhs.y) || (lhs.y == rhs.y && lhs.x < rhs.x);
+}
+
 void ContourFeature::buildWholeSlideContour(LR& r)
 {
 	// Push the 4 slide vertices of dummy intensity 999
@@ -265,8 +434,10 @@ namespace Nyxus
 		if (r.roi_disabled)
 			return;
 
-		//==== Calculate ROI's image matrix
-		r.aux_image_matrix.calculate_from_pixelcloud (r.raw_pixels, r.aabb);
+		//==== Calculate ROI's image matrix if no RLE is present
+		if (r.rle_pixels.empty()) {
+			r.aux_image_matrix.calculate_from_pixelcloud (r.raw_pixels, r.aabb);
+		}
 
 		//==== Contour, ROI perimeter, equivalent circle diameter
 		ContourFeature f;
diff --git a/src/nyx/features/contour.h b/src/nyx/features/contour.h
index 1e4a817c..2696981d 100644
--- a/src/nyx/features/contour.h
+++ b/src/nyx/features/contour.h
@@ -113,6 +113,8 @@ class ContourFeature: public FeatureMethod
 
 private:
 	void buildRegularContour(LR& r);
+	void buildRegularContourFast(LR& r);
+	static bool compare_locations (const Pixel2& lhs, const Pixel2& rhs);
 	void buildWholeSlideContour(LR& r);
 	std::tuple<double, double, double, double> calc_min_max_mean_stddev_intensity (const std::vector<Pixel2> & contour_pixels);
 	double
diff --git a/src/nyx/features/convex_hull_nontriv.cpp b/src/nyx/features/convex_hull_nontriv.cpp
index 27bdd6dd..17d098a7 100644
--- a/src/nyx/features/convex_hull_nontriv.cpp
+++ b/src/nyx/features/convex_hull_nontriv.cpp
@@ -24,7 +24,11 @@ void ConvexHullFeature::cleanup_instance()
 void ConvexHullFeature::calculate (LR& r)
 {
 	// Build the convex hull
-	build_convex_hull(r.raw_pixels, r.convHull_CH);
+	if (r.rle_pixels.empty()) {
+		build_convex_hull(r.raw_pixels, r.convHull_CH);
+	} else {
+		build_convex_hull(r.rle_pixels, r.convHull_CH);
+	}
 
 	// Calculate related features
 	double s_hull = polygon_area(r.convHull_CH),
diff --git a/src/nyx/features_calc_workflow.cpp b/src/nyx/features_calc_workflow.cpp
index b165f34c..122890c4 100644
--- a/src/nyx/features_calc_workflow.cpp
+++ b/src/nyx/features_calc_workflow.cpp
@@ -134,6 +134,27 @@ namespace Nyxus
 		r.intFname = intFile;
 	}
 
+	void init_label_record_2_fast (LR& r, const std::string& segFile, const std::string& intFile, int x1, int x2, int y, int label, PixIntens maxInt, PixIntens minInt, unsigned int tile_index)
+	{
+		// Cache the host tile's index
+		r.host_tiles.insert (tile_index);
+
+		// Initialize basic counters
+		r.aux_area = x2-x1;
+		r.aux_min = minInt;
+		r.aux_max = maxInt;
+		r.aabb.init_y (y);
+		r.aabb.init_x(x1);
+		r.aabb.update_xmax(x2-1);
+
+		// Cache the ROI label
+		r.label = label;
+
+		// File names
+		r.segFname = segFile;
+		r.intFname = intFile;
+	}
+
 	// This function 'digests' the 2nd and the following pixel of a label and updates the label's feature calculation state - the instance of structure 'LR'
 	void update_label_record(LR& lr, int x, int y, int label, PixIntens intensity)
 	{
@@ -163,4 +184,18 @@ namespace Nyxus
 		lr.update_aabb (x,y);
 	}
 
-}
\ No newline at end of file
+	void update_label_record_2_fast (LR& lr, int x1, int x2, int y, int label, PixIntens maxInt, PixIntens minInt, unsigned int tile_index)
+	{
+		// Cache the host tile's index
+		lr.host_tiles.insert(tile_index);
+
+		// Initialize basic counters
+		lr.aux_area += x2-x1;
+		lr.aux_min = std::min(lr.aux_min, minInt);
+		lr.aux_max = std::max(lr.aux_max, maxInt);
+		lr.aabb.update_xmin (x1);
+		lr.aabb.update_xmax (x2-1);
+		lr.aabb.update_y (y);
+	}
+
+}
diff --git a/src/nyx/globals.h b/src/nyx/globals.h
index a4dcbeaa..5c4aee7e 100644
--- a/src/nyx/globals.h
+++ b/src/nyx/globals.h
@@ -7,6 +7,7 @@
 #include <vector>
 #include <unordered_map>
 #include <unordered_set>
+#include "features/pixel.h"
 #include "featureset.h"
 #include "feature_method.h"
 #include "feature_mgr.h"
@@ -21,9 +22,11 @@ namespace Nyxus
 
 	bool scanFilePairParallel(const std::string& intens_fpath, const std::string& label_fpath, int num_fastloader_threads, int num_sensemaker_threads, int filepair_index, int tot_num_filepairs);
 	std::string getPureFname(const std::string& fpath);
+	int processDataset(const std::vector<std::string>& intensFiles, const std::vector<std::string>& labelFiles, int numFastloaderThreads, int numSensemakerThreads, int numReduceThreads, int min_online_roi_size, bool save2csv, const std::string& csvOutputDir, bool use_fastloop);
 	int processDataset(const std::vector<std::string>& intensFiles, const std::vector<std::string>& labelFiles, int numFastloaderThreads, int numSensemakerThreads, int numReduceThreads, int min_online_roi_size, bool save2csv, const std::string& csvOutputDir);
 	bool gatherRoisMetrics(const std::string& intens_fpath, const std::string& label_fpath, int num_FL_threads);
-	bool processTrivialRois (const std::vector<int>& trivRoiLabels, const std::string& intens_fpath, const std::string& label_fpath, int num_FL_threads, size_t memory_limit);
+	bool gatherRoisMetricsFast(const std::string& intens_fpath, const std::string& label_fpath, int num_FL_threads);
+	bool processTrivialRois (const std::vector<int>& trivRoiLabels, const std::string& intens_fpath, const std::string& label_fpath, int num_FL_threads, size_t memory_limit, bool use_fastloop);
 	bool processNontrivialRois (const std::vector<int>& nontrivRoiLabels, const std::string& intens_fpath, const std::string& label_fpath, int num_FL_threads);
 	void dump_roi_metrics(const std::string & label_fpath);
 
@@ -49,8 +52,11 @@ namespace Nyxus
 
 	void init_label_record(LR& lr, const std::string& segFile, const std::string& intFile, int x, int y, int label, PixIntens intensity);
 	void init_label_record_2(LR& lr, const std::string& segFile, const std::string& intFile, int x, int y, int label, PixIntens intensity, unsigned int tile_index);
+	void init_label_record_2_fast (LR& r, const std::string& segFile, const std::string& intFile, int x1, int x2, int y, int label, PixIntens maxInt, PixIntens minInt, unsigned int tile_index);
 	void update_label_record(LR& lr, int x, int y, int label, PixIntens intensity);
+	void update_label_record_fast(LR& lr, int x1, int x2, int y, int label, PixIntens maxInt, PixIntens minInt);
 	void update_label_record_2(LR& lr, int x, int y, int label, PixIntens intensity, unsigned int tile_index);
+	void update_label_record_2_fast(LR& lr, int x1, int x2, int y, int label, PixIntens maxInt, PixIntens minInt, unsigned int tile_index);
 	void reduce_neighbors(int labels_collision_radius);
 
 	void allocateTrivialRoisBuffers(const std::vector<int>& Pending);
@@ -69,6 +75,7 @@ namespace Nyxus
 	/// @param intensity -- pixel's intensity
 	/// @param tile_index -- index of pixel's tile in the image
 	void feed_pixel_2_metrics(int x, int y, PixIntens intensity, int label, unsigned int tile_index);
+	void feed_pixel_2_metrics_fast(int x1, int x2, int y, PixIntens maxInt, PixIntens minInt, int label, unsigned int tile_index);
 
 	/// @brief Copies a pixel to the ROI's cache. 
 	/// @param x -- x-coordinate of the pixel in the image
@@ -76,6 +83,8 @@ namespace Nyxus
 	/// @param label -- label of pixel's segment 
 	/// @param intensity -- pixel's intensity
 	void feed_pixel_2_cache(int x, int y, PixIntens intensity, int label);
+	
+	void feed_rle_2_cache(int x1, int x2, PixIntens i1, PixIntens i2, int y, int label);
 
 	// System resources
 	unsigned long long getAvailPhysMemory();
diff --git a/src/nyx/phase1_fastloop.cpp b/src/nyx/phase1_fastloop.cpp
new file mode 100644
index 00000000..82e477d2
--- /dev/null
+++ b/src/nyx/phase1_fastloop.cpp
@@ -0,0 +1,140 @@
+#include <cstdint>
+#include <popcntintrin.h>
+#include <ratio>
+#include <smmintrin.h>
+#include <string>
+#include <sstream>
+#include <vector>
+#include <map>
+#include <array>
+#ifdef WITH_PYTHON_H
+#include <pybind11/pybind11.h>
+#endif
+#include "environment.h"
+#include "globals.h"
+#include "helpers/timing.h"
+#include "rle.hpp"
+
+namespace Nyxus
+{
+	bool gatherRoisMetricsFast (const std::string& intens_fpath, const std::string& label_fpath, int num_FL_threads)
+	{
+
+		int lvl = 0, // Pyramid level
+			lyr = 0; //	Layer
+
+		// Read the tiff. The image loader is put in the open state in processDataset()
+		size_t nth = theImLoader.get_num_tiles_hor(),
+			ntv = theImLoader.get_num_tiles_vert(),
+			fw = theImLoader.get_tile_width(),
+			th = theImLoader.get_tile_height(),
+			tw = theImLoader.get_tile_width(),
+			tileSize = theImLoader.get_tile_size(),
+			fullwidth = theImLoader.get_full_width(),
+			fullheight = theImLoader.get_full_height();
+
+		int cnt = 1;
+		for (unsigned int row = 0; row < nth; row++)
+			for (unsigned int col = 0; col < ntv; col++)
+			{
+				// Fetch the tile 
+				bool ok = theImLoader.load_tile(row, col);
+				if (!ok)
+				{
+					std::stringstream ss;
+					ss << "Error fetching tile row=" << row << " col=" << col;
+					#ifdef WITH_PYTHON_H
+						throw ss.str();
+					#endif	
+					std::cerr << ss.str() << "\n";
+					return false;
+				}
+
+				// Get ahold of tile's pixel buffer
+				auto tileIdx = row * nth + col;
+				auto dataI = theImLoader.get_int_tile_buffer(),
+					dataL = theImLoader.get_seg_tile_buffer();
+
+				// Iterate pixels
+                int y = 0;
+				int y_max = (fullheight < y+th) ? fullheight - row*th : th;
+				while (y < y_max)
+				{
+
+                    size_t i = y*tw;
+                    
+                    int x = 0;
+					int x_max = (fullwidth < x+tw) ? fullwidth - col*tw : tw;
+
+
+					int x_stream = 32 * (x_max / 32);
+
+					// Compress row to an RLE stream
+					RLEStream<uint32_t, uint16_t> stream = rle_encode_long_stream_32(dataL.data()+i,x_stream);
+
+					// Loop over row objects in RLE stream
+					unsigned int ind = 0;
+					while (ind+1 < stream.offsets.size()) {
+						uint16_t x1 = stream.offsets[ind++];
+						auto label = dataL[i+x1];
+
+						// If not background, store the object
+						if (label > 0) {
+							// Collapse all the labels to one if single-ROI mde is requested
+							if (theEnvironment.singleROI)
+								label = 1;
+
+							uint16_t x2 = stream.offsets[ind];
+							auto minInt = dataI[i+x1];
+							auto maxInt = dataI[i+x1];
+
+							// Find the min and max intensity
+							for (int xi=i+x1; xi<i+x2; xi++) {
+								if (maxInt < dataI[xi]) {
+									maxInt = dataI[xi];
+								} else if (minInt > dataI[xi]) {
+									minInt = dataI[xi];
+								}
+
+							}
+
+							feed_pixel_2_metrics_fast (x1, x2, y, maxInt, minInt, label, tileIdx); // Updates 'uniqueLabels' and 'roiData'
+						}
+					}
+
+					i += x_stream;
+					x = x_stream;
+					while (x < x_max) {
+
+                        // Skip non-mask pixels
+                        auto label = dataL[i];
+                        if (label == 0) {
+                            ++x, ++i;
+                            continue;
+                        }
+
+                        // Collapse all the labels to one if single-ROI mde is requested
+                        if (theEnvironment.singleROI)
+                            label = 1;
+                        
+                        // Update pixel's ROI metrics
+                        feed_pixel_2_metrics (x, y, dataI[i], label, tileIdx); // Updates 'uniqueLabels' and 'roiData'
+                        ++x, ++i;
+                    }
+                    ++y;
+				}
+
+#ifdef WITH_PYTHON_H
+				if (PyErr_CheckSignals() != 0)
+					throw pybind11::error_already_set();
+#endif
+
+				// Show stayalive progress info
+				if (cnt++ % 4 == 0)
+					std::cout << "\t" << int((row * nth + col) * 100 / float(nth * ntv) * 100) / 100. << "%\t" << uniqueLabels.size() << " ROIs" << "\n";
+			}
+
+		return true;
+	}
+
+}
diff --git a/src/nyx/phase2.cpp b/src/nyx/phase2.cpp
index 43559a88..0bdc025c 100644
--- a/src/nyx/phase2.cpp
+++ b/src/nyx/phase2.cpp
@@ -10,6 +10,7 @@
 #include "environment.h"
 #include "globals.h"
 #include "helpers/timing.h"
+#include "rle.hpp"
 
 namespace Nyxus
 {
@@ -105,7 +106,6 @@ namespace Nyxus
 		for (unsigned int row = 0; row < nth; row++)
 			for (unsigned int col = 0; col < ntv; col++)
 			{
-				// Fetch the tile 
 				bool ok = theImLoader.load_tile(row, col);
 				if (!ok)
 				{
@@ -158,6 +158,124 @@ namespace Nyxus
 		return true;
 	}
 
+	bool scanTrivialRoisFast (const std::vector<int>& batch_labels, const std::string& intens_fpath, const std::string& label_fpath, int num_FL_threads)
+	{
+		// Sort the batch's labels to enable binary searching in it
+		std::vector<int> whiteList = batch_labels;
+		std::sort (whiteList.begin(), whiteList.end());
+
+		int lvl = 0,	// Pyramid level
+			lyr = 0;	//	Layer
+
+		// Read the tiffs
+		size_t nth = theImLoader.get_num_tiles_hor(),
+			ntv = theImLoader.get_num_tiles_vert(),
+			fw = theImLoader.get_tile_width(),
+			th = theImLoader.get_tile_height(),
+			tw = theImLoader.get_tile_width(),
+			tileSize = theImLoader.get_tile_size(),
+			fullwidth = theImLoader.get_full_width(),
+			fullheight = theImLoader.get_full_height();
+
+		int cnt = 1;
+		for (unsigned int row = 0; row < nth; row++)
+			for (unsigned int col = 0; col < ntv; col++)
+		{
+			// Fetch the tile 
+			bool ok = theImLoader.load_tile(row, col);
+			if (!ok)
+			{
+				std::stringstream ss;
+				ss << "Error fetching tile row=" << row << " col=" << col;
+				#ifdef WITH_PYTHON_H
+					throw ss.str();
+				#endif	
+				std::cerr << ss.str() << "\n";
+				return false;
+			}
+
+			// Get ahold of tile's pixel buffer
+			auto dataI = theImLoader.get_int_tile_buffer(),
+				dataL = theImLoader.get_seg_tile_buffer();
+
+			// Iterate pixels
+			int y = 0;
+			int y_max = (fullheight < y+th) ? fullheight - row*th : th;
+			while (y < y_max)
+			{
+
+				size_t i = y*tw;
+				
+				int x = 0;
+				int x_max = (fullwidth < x+tw) ? fullwidth - col*tw : tw;
+				
+
+				int x_stream = 32 * (x_max / 32);
+
+				RLEStream<uint32_t, uint16_t> stream = rle_encode_long_stream_32(dataL.data()+i,x_stream);
+
+				unsigned int ind = 0;
+				while (ind+1 < stream.offsets.size()) {
+					uint16_t x1 = stream.offsets[ind++];
+					auto label = dataL[i+x1];
+
+					if (label > 0) {
+						uint16_t x2 = stream.offsets[ind];
+
+						if (! theEnvironment.singleROI && ! std::binary_search(whiteList.begin(), whiteList.end(), label)) //--slow-- if (std::find(PendingRoiLabels.begin(), PendingRoiLabels.end(), label) == PendingRoiLabels.end())
+						{
+							continue;
+						}
+
+						for (int xi=x1; xi<x2; xi++) {
+
+							feed_pixel_2_cache (xi, y, dataI[xi+i], label);
+
+						}
+
+						feed_rle_2_cache(x1, x2, dataI[x1+i], dataI[x2+i], y, label);
+					}
+				}
+
+				i += x_stream;
+				x = x_stream;
+
+				stream = rle_encode_stream(dataL.data() + i, (uint16_t) (x_max - x_stream));
+
+				ind = 0;
+				while (ind+1 < stream.offsets.size()) {
+					uint16_t x1 = stream.offsets[ind++];
+					auto label = dataL[i+x1];
+
+					if (label > 0) {
+						uint16_t x2 = stream.offsets[ind];
+
+						if (! theEnvironment.singleROI && ! std::binary_search(whiteList.begin(), whiteList.end(), label)) //--slow-- if (std::find(PendingRoiLabels.begin(), PendingRoiLabels.end(), label) == PendingRoiLabels.end())
+						{
+							continue;
+						}
+
+						for (int xi=x1; xi<x2; xi++) {
+
+							feed_pixel_2_cache (xi+x, y, dataI[xi+i], label);
+
+						}
+
+						feed_rle_2_cache(x1+x, x2+x, dataI[x1+i], dataI[x2+i], y, label);
+					}
+				}
+
+				++y;
+			}
+
+			// Show stayalive progress info
+			if (cnt++ % 4 == 0)
+				VERBOSLVL1(std::cout << "\tscan trivial " << int((row * nth + col) * 100 / float(nth * ntv) * 100) / 100. << "% of image scanned \n";)
+		}
+
+		return true;
+	}
+
 	PixIntens* ImageMatrixBuffer = nullptr;
 	size_t imageMatrixBufferLen = 0;
 
@@ -196,7 +314,7 @@ namespace Nyxus
 		delete ImageMatrixBuffer;
 	}
 
-	bool processTrivialRois (const std::vector<int>& trivRoiLabels, const std::string& intens_fpath, const std::string& label_fpath, int num_FL_threads, size_t memory_limit)
+	bool processTrivialRois (const std::vector<int>& trivRoiLabels, const std::string& intens_fpath, const std::string& label_fpath, int num_FL_threads, size_t memory_limit, bool use_fastloop)
 	{
 		std::vector<int> Pending;
 		size_t batchDemand = 0;
@@ -225,7 +343,11 @@ namespace Nyxus
 					else
 						std::cout << ">>> (ROIs " << Pending[0] << " ... " << Pending[Pending.size() - 1] << ")\n";
 					)
-				scanTrivialRois(Pending, intens_fpath, label_fpath, num_FL_threads);
+				if (use_fastloop) {
+					scanTrivialRoisFast(Pending, intens_fpath, label_fpath, num_FL_threads);
+				} else {
+					scanTrivialRois(Pending, intens_fpath, label_fpath, num_FL_threads);
+				}
 
 				// Allocate memory
 				VERBOSLVL1(std::cout << "\tallocating ROI buffers\n";)
@@ -273,7 +395,11 @@ namespace Nyxus
 				else
 					std::cout << ">>> (ROIs " << Pending[0] << " ... " << Pending[Pending.size() - 1] << ")\n";
 				)
-			scanTrivialRois(Pending, intens_fpath, label_fpath, num_FL_threads);
+				if (use_fastloop) {
+					scanTrivialRoisFast(Pending, intens_fpath, label_fpath, num_FL_threads);
+				} else {
+					scanTrivialRois(Pending, intens_fpath, label_fpath, num_FL_threads);
+				}
 
 			// Allocate memory
 			VERBOSLVL1(std::cout << "\tallocating ROI buffers\n";)
diff --git a/src/nyx/pixel_feed.cpp b/src/nyx/pixel_feed.cpp
index b965dd7f..c95fa4af 100644
--- a/src/nyx/pixel_feed.cpp
+++ b/src/nyx/pixel_feed.cpp
@@ -1,3 +1,4 @@
+#include "features/pixel.h"
 #include <string>
 #include <vector>
 #include <map>
@@ -37,6 +38,26 @@ namespace Nyxus
 		}
 	}
 
+	void feed_pixel_2_metrics_fast(int x1, int x2, int y, PixIntens maxInt, PixIntens minInt, int label, unsigned int tile_index)
+	{
+		if (uniqueLabels.find(label) == uniqueLabels.end())
+		{
+			// Remember this label
+			uniqueLabels.insert(label);
+
+			// Initialize the ROI label record
+			LR newData;
+			init_label_record_2_fast(newData, theSegFname, theIntFname, x1, x2, y, label, maxInt, minInt, tile_index);
+			roiData[label] = newData;
+		}
+		else
+		{
+			// Update basic ROI info (info that doesn't require costly calculations)
+			LR& existingData = roiData[label];
+			update_label_record_2_fast(existingData, x1, x2, y, label, maxInt, minInt, tile_index);
+		}
+	}
+
 	/// @brief Copies a pixel to the ROI's cache. 
 	/// @param x -- x-coordinate of the pixel in the image
 	/// @param y -- y-coordinate of the pixel in the image
@@ -49,4 +70,12 @@ namespace Nyxus
 		r.raw_pixels.push_back(Pixel2(x, y, intensity));
 	}
 
+	void feed_rle_2_cache(int x1, int x2, PixIntens i1, PixIntens i2, int y, int label)
+	{
+		// Update basic ROI info (info that doesn't require costly calculations)
+		LR& r = roiData[label];
+		r.rle_pixels.push_back(Pixel2(x1, y, i1));
+		r.rle_pixels.push_back(Pixel2(x2-1, y, i2));
+	}
+
 }
diff --git a/src/nyx/python/new_bindings_py.cpp b/src/nyx/python/new_bindings_py.cpp
index 0e365ff6..2284b373 100644
--- a/src/nyx/python/new_bindings_py.cpp
+++ b/src/nyx/python/new_bindings_py.cpp
@@ -70,10 +70,11 @@ void initialize_environment(
     #endif
 }
 
-py::tuple featurize_directory_imp (
+py::tuple featurize_directory_imp_fast (
     const std::string &intensity_dir,
     const std::string &labels_dir,
-    const std::string &file_pattern)
+    const std::string &file_pattern,
+    bool use_fastloop)
 {
     theEnvironment.intensity_dir = intensity_dir;
     theEnvironment.labels_dir = labels_dir;
@@ -110,7 +111,8 @@ py::tuple featurize_directory_imp (
         theEnvironment.n_reduce_threads,
         min_online_roi_size,
         false, // 'true' to save to csv
-        theEnvironment.output_dir);
+        theEnvironment.output_dir,
+        use_fastloop);
 
     if (errorCode)
         throw std::runtime_error("Error occurred during dataset processing.");
@@ -125,7 +127,19 @@ py::tuple featurize_directory_imp (
     return py::make_tuple(pyHeader, pyStrData, pyNumData);
 }
 
-py::tuple featurize_fname_lists_imp (const py::list& int_fnames, const py::list & seg_fnames)
+py::tuple featurize_directory_imp (
+    const std::string &intensity_dir,
+    const std::string &labels_dir,
+    const std::string &file_pattern)
+{
+    return featurize_directory_imp_fast (
+    intensity_dir,
+    labels_dir,
+    file_pattern,
+    false);
+}
+
+py::tuple featurize_fname_lists_imp_fast (const py::list& int_fnames, const py::list & seg_fnames, bool use_fastloop)
 {
     std::vector<std::string> intensFiles, labelFiles;
     for (auto it = int_fnames.begin(); it != int_fnames.end(); ++it)
@@ -177,7 +191,8 @@ py::tuple featurize_fname_lists_imp (const py::list& int_fnames, const py::list
         theEnvironment.n_reduce_threads,
         min_online_roi_size,
         false, // 'true' to save to csv
-        theEnvironment.output_dir);
+        theEnvironment.output_dir,
+        use_fastloop);
     if (errorCode)
         throw std::runtime_error("Error occurred during dataset processing.");
 
@@ -191,6 +206,12 @@ py::tuple featurize_fname_lists_imp (const py::list& int_fnames, const py::list
     return py::make_tuple(pyHeader, pyStrData, pyNumData);
 }
 
+
+py::tuple featurize_fname_lists_imp (const py::list& int_fnames, const py::list & seg_fnames)
+{
+    return featurize_fname_lists_imp_fast (int_fnames, seg_fnames, false);
+}
+
 py::tuple findrelations_imp(
         std::string& label_dir,
         std::string& parent_file_pattern,
@@ -255,6 +276,8 @@ PYBIND11_MODULE(backend, m)
     m.def("initialize_environment", &initialize_environment, "Environment initialization");
     m.def("featurize_directory_imp", &featurize_directory_imp, "Calculate features of images defined by intensity and mask image collection directories");
     m.def("featurize_fname_lists_imp", &featurize_fname_lists_imp, "Calculate features of intensity-mask image pairs defined by lists of image file names");
+    m.def("featurize_directory_imp_fast", &featurize_directory_imp_fast, "Calculate features of images defined by intensity and mask image collection directories");
+    m.def("featurize_fname_lists_imp_fast", &featurize_fname_lists_imp_fast, "Calculate features of intensity-mask image pairs defined by lists of image file names");
     m.def("findrelations_imp", &findrelations_imp, "Find relations in segmentation images");
     m.def("gpu_available", &Environment::gpu_is_available, "Check if CUDA gpu is available");
     m.def("use_gpu", &use_gpu, "Enable/disable GPU features");
diff --git a/src/nyx/python/nyxus/nyxus.py b/src/nyx/python/nyxus/nyxus.py
index f913154a..c392ed34 100644
--- a/src/nyx/python/nyxus/nyxus.py
+++ b/src/nyx/python/nyxus/nyxus.py
@@ -1,4 +1,4 @@
-from .backend import initialize_environment, featurize_directory_imp, featurize_fname_lists_imp, findrelations_imp, use_gpu, gpu_available 
+from .backend import initialize_environment, featurize_directory_imp, featurize_directory_imp_fast, featurize_fname_lists_imp, findrelations_imp, use_gpu, gpu_available 
 import os
 import numpy as np
 import pandas as pd
@@ -96,6 +96,7 @@ def featurize_directory(
         intensity_dir: str,
         label_dir: Optional[str] = None,
         file_pattern: Optional[str] = ".*",
+        use_fastloop: bool = False
     ):
         """Extract features from all the images satisfying the file pattern of provided image directories.
 
@@ -133,7 +134,7 @@ def featurize_directory(
         if label_dir is None:
             label_dir = intensity_dir
 
-        header, string_data, numeric_data = featurize_directory_imp (intensity_dir, label_dir, file_pattern)
+        header, string_data, numeric_data = featurize_directory_imp_fast (intensity_dir, label_dir, file_pattern, use_fastloop)
 
         df = pd.concat(
             [
diff --git a/src/nyx/rle.cpp b/src/nyx/rle.cpp
new file mode 100644
index 00000000..60440dee
--- /dev/null
+++ b/src/nyx/rle.cpp
@@ -0,0 +1,291 @@
+#include "rle.hpp"
+#include <emmintrin.h>
+#include <immintrin.h>
+
+// Vector to shuffle 4-bit data into the lower 64 bits
+const __m128i pshufbcnst_128 =
+    _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x0E, 0x0C,
+                 0x0A, 0x08, 0x06, 0x04, 0x02, 0x00);
+
+// zero vector
+const __m128i zeroes_128 = _mm_setzero_si128();
+const __m256i zeroes_256 = _mm256_setzero_si256();
+
+// Shuffle 4-bit vectors
+const __m256i shf = _mm256_set_epi8(
+    0x80, 0x0F, 0x80, 0x07, 0x80, 0x0E, 0x80, 0x06, 0x80, 0x0D, 0x80, 0x05,
+    0x80, 0x0C, 0x80, 0x04, 0x80, 0x0B, 0x80, 0x03, 0x80, 0x0A, 0x80, 0x02,
+    0x80, 0x09, 0x80, 0x01, 0x80, 0x08, 0x80, 0x00);
+
+// Bit shifts to decode 4-bit packing
+const __m256i shft = _mm256_set_epi64x(0x04, 0x00, 0x04, 0x00);
+
+// For casting to 4-bit mask
+const __m256i vmsk = _mm256_set1_epi8(0x0F);
+
+// For casting 4-bit boolean to position index
+const uint64_t cast_index = 0xFEDCBA9876543210;
+
+/**
+ * @brief Get indices of non-zero byte values in a 128 bit vector.
+ *
+ * Assuming data in a 128 bit vector is composed of byte data, this function
+ * returns indices of non-zero values as packed uint16 values stored in a 256
+ * bit vector.
+ *
+ * @param data
+ * @return PackedIndex
+ */
+PackedIndex<__m256i> get_128_indices_word(__m128i data) {
+
+  uint64_t ind; // Holds 4-bit indexes
+
+  PackedIndex<__m256i> output{};
+
+  // Determine how many edges were found
+  data = _mm_srli_epi64(data, 4); // Pack 16x8 bit to 32x4 bit mask.
+  data = _mm_shuffle_epi8(data, pshufbcnst_128); // Align the 16x4 bit mask.
+  ind = ~_mm_cvtsi128_si64(data);                // Extract the 16x4 bit mask.
+  output.count = static_cast<uint64_t>(_mm_popcnt_u64(ind)) >> 2u;
+  ind = _pext_u64(cast_index, ind); // Get 1-15 index offset of edges
+
+  // Unpack the 4 bit values
+  output.indices = _mm256_set1_epi64x(ind);
+  output.indices = _mm256_srlv_epi64(output.indices, shft);
+  output.indices = _mm256_and_si256(output.indices, vmsk);
+  output.indices = _mm256_shuffle_epi8(output.indices, shf);
+
+  return output;
+}
+
+PackedIndex<__m256i> get_256_indices_word(__m256i data) {
+
+  uint64_t ind; // Holds 4-bit indexes
+
+  PackedIndex<__m256i> output{};
+
+  // Determine how many edges were found
+  ind = static_cast<unsigned int>(~_mm256_movemask_epi8(data));
+  output.count = static_cast<uint64_t>(_mm_popcnt_u64(ind)) >> 2u;
+  ind = _pext_u64(cast_index, ind); // Get 1-15 index offset of edges
+
+  // Unpack the 4 bit values
+  output.indices = _mm256_set1_epi64x(ind);
+  output.indices = _mm256_srlv_epi64(output.indices, shft);
+  output.indices = _mm256_and_si256(output.indices, vmsk);
+  output.indices = _mm256_shuffle_epi8(output.indices, shf);
+
+  return output;
+}
+
+void get_long_edges_256(const uint32_t *data, __m256i *mask, int i,
+                        uint32_t last_val) {
+
+  __m256i offset[4];
+
+  int index = i * 8;
+  int i1 = i + 1;
+  int i2 = i + 2;
+  int i3 = i + 3;
+
+  // Unrolled data loading
+  mask[i] = _mm256_loadu_si256((__m256i *)&data[index]);
+  mask[i1] = _mm256_loadu_si256((__m256i *)&data[index + 8]);
+  mask[i2] = _mm256_loadu_si256((__m256i *)&data[index + 16]);
+  mask[i3] = _mm256_loadu_si256((__m256i *)&data[index + 24]);
+
+  // Create shifted values
+  offset[0] = _mm256_srli_si256(mask[i], 4);
+  offset[1] = _mm256_srli_si256(mask[i1], 4);
+  offset[2] = _mm256_srli_si256(mask[i2], 4);
+  offset[3] = _mm256_srli_si256(mask[i3], 4);
+  offset[0] = _mm256_insert_epi32(offset[0], _mm256_extract_epi32(mask[i1], 0), 7);
+  offset[1] = _mm256_insert_epi32(offset[1], _mm256_extract_epi32(mask[i2], 0), 7);
+  offset[2] = _mm256_insert_epi32(offset[2], _mm256_extract_epi32(mask[i3], 0), 7);
+  offset[3] = _mm256_insert_epi32(offset[3], last_val, 7);
+  offset[0] = _mm256_insert_epi32(offset[0], _mm256_extract_epi32(mask[i], 4), 3);
+  offset[1] = _mm256_insert_epi32(offset[1], _mm256_extract_epi32(mask[i1], 4), 3);
+  offset[2] = _mm256_insert_epi32(offset[2], _mm256_extract_epi32(mask[i2], 4), 3);
+  offset[3] = _mm256_insert_epi32(offset[3], _mm256_extract_epi32(mask[i3], 4), 3);
+  
+  // Unrolled edge detection using xor
+  mask[i] = _mm256_xor_si256(mask[i], offset[0]);
+  mask[i1] = _mm256_xor_si256(mask[i1], offset[1]);
+  mask[i2] = _mm256_xor_si256(mask[i2], offset[2]);
+  mask[i3] = _mm256_xor_si256(mask[i3], offset[3]);
+
+  // Generate 16x8 bit mask.
+  mask[i] = _mm256_cmpeq_epi32(mask[i], zeroes_256);
+  mask[i1] = _mm256_cmpeq_epi32(mask[i1], zeroes_256);
+  mask[i2] = _mm256_cmpeq_epi32(mask[i2], zeroes_256);
+  mask[i3] = _mm256_cmpeq_epi32(mask[i3], zeroes_256);
+
+}
+
+void get_byte_edges_128(const uint8_t *data, __m128i *mask, int i,
+                        uint8_t last_val) {
+
+  __m128i offset[4];
+
+  int index = i * 16;
+  int i1 = i + 1;
+  int i2 = i + 2;
+  int i3 = i + 3;
+
+  // Unrolled data loading
+  mask[i] = _mm_load_si128((__m128i *)&data[index]);
+  mask[i1] = _mm_load_si128((__m128i *)&data[index + 16]);
+  mask[i2] = _mm_load_si128((__m128i *)&data[index + 32]);
+  mask[i3] = _mm_load_si128((__m128i *)&data[index + 48]);
+
+  // Create shifted values
+  offset[0] = _mm_bsrli_si128(mask[i], 1);
+  offset[1] = _mm_bsrli_si128(mask[i1], 1);
+  offset[2] = _mm_bsrli_si128(mask[i2], 1);
+  offset[3] = _mm_bsrli_si128(mask[i3], 1);
+  offset[0] = _mm_insert_epi8(offset[0], _mm_extract_epi8(mask[i1], 0), 15);
+  offset[1] = _mm_insert_epi8(offset[1], _mm_extract_epi8(mask[i2], 0), 15);
+  offset[2] = _mm_insert_epi8(offset[2], _mm_extract_epi8(mask[i3], 0), 15);
+  offset[3] = _mm_insert_epi8(offset[3], last_val, 15);
+
+  // Unrolled edge detection using xor
+  mask[i] = _mm_xor_si128(mask[i], offset[0]);
+  mask[i1] = _mm_xor_si128(mask[i1], offset[1]);
+  mask[i2] = _mm_xor_si128(mask[i2], offset[2]);
+  mask[i3] = _mm_xor_si128(mask[i3], offset[3]);
+
+  // Generate 16x8 bit mask.
+  mask[i] = _mm_cmpeq_epi8(mask[i], zeroes_128);
+  mask[i1] = _mm_cmpeq_epi8(mask[i1], zeroes_128);
+  mask[i2] = _mm_cmpeq_epi8(mask[i2], zeroes_128);
+  mask[i3] = _mm_cmpeq_epi8(mask[i3], zeroes_128);
+}
+
+RLEStream<uint32_t, uint16_t> rle_encode_long_stream_32(const uint32_t *data,
+                                                        const uint16_t len) {
+  if ((len % 32) != 0) {
+    throw std::invalid_argument(
+        "rle.rle_encode_long_stream_32: len must be a multiple of 32. "
+        "Arbitrary lengths should use rle_encode_stream.");
+  }
+
+  RLEStream<uint32_t, uint16_t> stream; // Initialize the output
+  stream.offsets.resize(len + 1);       // Allocate memory
+  uint16_t stream_ind = 0;              // Current index of stream.data
+  __m256i edges;                        // Edge data
+
+  stream.start = data;
+
+  // Load the data and get offsets
+  int num_chunks = len / 8;
+  __m256i mask[num_chunks];
+
+  // Loop over data
+  for (int i = 0; i < num_chunks - 4; i += 4) {
+    get_long_edges_256(data, mask, i, data[(i + 4) * 8]);
+  }
+
+  // Get edges of the last 16 values
+  get_long_edges_256(data, mask, (num_chunks - 4), data[len - 1]);
+
+  if (data[0] != 0u) {
+    stream.offsets[stream_ind++] = 0;
+  }
+
+  for (int i = 0; i < num_chunks; ++i) {
+    // Skip if no edges present
+    if (_mm256_movemask_epi8(mask[i]) == 0xFFFFFFFF) {
+      continue;
+    }
+
+    // Get the indices of edges
+    PackedIndex<__m256i> edges = get_256_indices_word(mask[i]);
+    edges.indices =
+        _mm256_add_epi16(_mm256_set1_epi16(i * 8 + 1), edges.indices);
+
+    // store the results
+    _mm256_storeu_si256(
+        reinterpret_cast<__m256i *>(&stream.offsets[stream_ind]),
+        edges.indices);
+    stream_ind += edges.count;
+  }
+
+  // Check the last value and add the index if present
+  if (data[len - 1] != 0u) {
+    stream.offsets[stream_ind++] = len;
+  }
+
+  stream.offsets.resize(stream_ind);
+
+  return stream;
+};
+
+RLEStream<uint8_t, uint16_t> rle_encode_byte_stream_64(const uint8_t *data,
+                                                       const uint16_t len) {
+  if ((len % 64) != 0) {
+    throw std::invalid_argument(
+        "rle.rle_encode_byte_64: len must be a multiple of 64. "
+        "Arbitrary lengths should use rle_encode_stream.");
+  }
+
+  RLEStream<uint8_t, uint16_t> stream; // Initialize the output
+  stream.offsets.resize(len + 1);      // Allocate memory
+  uint16_t stream_ind = 0;             // Current index of stream.data
+  __m256i edges;                       // Edge data
+
+  stream.start = data;
+
+  // Load the data and get offsets
+  int num_chunks = len / 16;
+  __m128i mask[num_chunks];
+  // Loop over data
+  for (int i = 0; i < num_chunks - 4; i += 4) {
+    get_byte_edges_128(data, mask, i, data[(i + 4) * 16]);
+  }
+  // Get edges of the late 64 bytes
+  get_byte_edges_128(data, mask, (num_chunks - 4), data[len - 1]);
+
+  if (data[0] != 0u) {
+    stream.offsets[stream_ind++] = 0;
+  }
+
+  for (int i = 0; i < num_chunks; ++i) {
+    // Skip if no edges present
+    if (_mm_movemask_epi8(mask[i]) == 0xFFFFFFFFFFFFFFFF) {
+      continue;
+    }
+
+    // Get the indices of edges
+    PackedIndex<__m256i> edges = get_128_indices_word(mask[i]);
+    edges.indices =
+        _mm256_add_epi16(_mm256_set1_epi16(i * 16 + 1), edges.indices);
+
+    // store the results
+    _mm256_storeu_si256(
+        reinterpret_cast<__m256i *>(&stream.offsets[stream_ind]),
+        edges.indices);
+    stream_ind += edges.count;
+  }
+
+  // Check the last value and add the index if present
+  if (data[len - 1] != 0u) {
+    stream.offsets[stream_ind++] = len;
+  }
+
+  stream.offsets.resize(stream_ind);
+
+  return stream;
+};
+
+std::vector<RLEStream<uint8_t, uint16_t>>
+rle_encode_byte_stream_64_tile(const uint8_t *data, const uint8_t *end,
+                               const uint16_t stride,
+                               const uint16_t chunk_size) {
+  std::vector<RLEStream<uint8_t, uint16_t>> tile;
+
+  for (; data < end; data += stride) {
+    tile.push_back(rle_encode_byte_stream_64(data, chunk_size));
+  }
+
+  return tile;
+}
diff --git a/src/nyx/rle.hpp b/src/nyx/rle.hpp
new file mode 100644
index 00000000..7eb98951
--- /dev/null
+++ b/src/nyx/rle.hpp
@@ -0,0 +1,147 @@
+#ifndef _FTL_RLE_HPP
+#define _FTL_RLE_HPP
+/**
+ * @file rle.hpp
+ * @author Nick Schaub (nicholas.j.schaub@gmail.com)
+ * @brief Optimized methods for performing run length encoding
+ * @version 0.1
+ * @date 2022-04-02
+ *
+ * @copyright Copyright (c) 2022
+ *
+ */
+
+#include <cstdint>
+#include <stdexcept>
+#include <type_traits>
+#include <vector>
+#include <iostream>
+#include <x86intrin.h>
+
+/**
+ * @brief Offsets from a a starting point where data changes value.
+ *
+ * Run Length Encoding (RLE) compresses data by finding consecutive,
+ * repeated values. This is usually represented by a value followed by the
+ * number of times the value should be repeated. This structure only records the
+ * offsets to a change in value, and not the actual values themselves.
+ *
+ * A good explanation of RLE is in the packbits compression algorithm
+ * Wikipedia page.
+ *
+ * NOTE: The quirk to this data representation is that the first offset may not
+ * be 0, and in that case the values from start to the first offset are 0.
+ *
+ * https://en.wikipedia.org/wiki/PackBits
+ *
+ * @tparam T Data type of data to be compressed
+ * @tparam U Data type of offsets to change in value
+ */
+template <typename T, typename U> struct RLEStream {
+  const T *start;
+  std::vector<U> offsets;
+};
+
+/**
+ * @brief Run Length Encoding representation
+ *
+ * This structure contains all information needed for run length encoding. It
+ * inherits from RLEStream, but unlike RLEStream it stores values for each
+ * offset. Refer to RLEStream for additional details.
+ *
+ * @tparam T Data type of data to be compressed
+ * @tparam U Data type of offsets to change in value
+ */
+template <typename T, typename U> struct RLE : public RLEStream<U, T> {
+  std::vector<uint32_t> values;
+};
+
+// Structure to hold packed index
+template <typename T> struct PackedIndex {
+  static_assert(std::is_same<T, __m128i>::value or
+#ifdef __AVX512BW__
+                    std::is_same<T, __m512i>::value or
+#endif
+                    std::is_same<T, __m256i>::value,
+                "!");
+  T indices;
+  uint8_t count;
+};
+
+RLEStream<uint8_t, uint16_t> rle_encode_byte_stream_64(const uint8_t *data,
+                                                       uint16_t len);
+
+RLEStream<uint32_t, uint16_t> rle_encode_long_stream_32(const uint32_t *data,
+                                                        uint16_t len);
+
+template <typename T>
+RLEStream<T, uint16_t> rle_encode_byte_stream_64(const T *data, uint16_t len) {
+  static_assert(std::is_same<T, unsigned char>::value or
+                    std::is_same<T, char>::value or
+                    std::is_same<T, bool>::value,
+                "!");
+
+  RLEStream<uint8_t, uint16_t> rle =
+      rle_encode_byte_stream_64((uint8_t *)data, len);
+
+  RLEStream<T, uint16_t> output = {(std::vector<T>)rle.offsets, rle.start};
+
+  return output;
+}
+
+/**
+ * @brief Universal RLE compressor, get offsets to changes in value.
+ *
+ * This function is a universal handler for calculating offsets to changes in
+ * value. Handles all data stream types and streams of any size.
+ *
+ * @tparam U Data type of input data
+ * @tparam T Data type of offsets
+ * @param data Pointer to data for compression
+ * @param len Length of stream to compress
+ * @return RLEByteStream<U, T>
+ */
+template <typename U, typename T>
+RLEStream<U, T> rle_encode_stream(const U *data, T len) {
+  RLEStream<U, T> stream;
+  stream.start = data;
+  U on_object = 0;
+  const U *end = data + len;
+  const U *start = data;
+  for (; data < end; data++) {
+    if (*data) {
+      if (*data != on_object) {
+        stream.offsets.push_back((T)(data - start));
+        on_object = *data;
+      }
+    } else if (on_object) {
+      stream.offsets.push_back((T)(data - start));
+      on_object = 0;
+    }
+  }
+
+  if (on_object) {
+    stream.offsets.push_back((T)(end - start));
+  }
+
+  return stream;
+};
+
+template <typename U, typename T>
+std::vector<RLEStream<U, T>> rle_encode_stream_tile(const U *data, const U *end,
+                                                    const T stride,
+                                                    const T chunk_size) {
+  std::vector<RLEStream<U, T>> tile;
+
+  for (; data < end; data += stride) {
+    tile.push_back(rle_encode_stream(data, chunk_size));
+  }
+
+  return tile;
+}
+
+std::vector<RLEStream<uint8_t, uint16_t>>
+rle_encode_byte_stream_64_tile(const uint8_t *data, const uint8_t *end,
+                               uint16_t stride, uint16_t chunk_size);
+
+#endif // _FTL_RLE_HPP
diff --git a/src/nyx/roi_cache.h b/src/nyx/roi_cache.h
index 1b5fa5a1..55f80291 100644
--- a/src/nyx/roi_cache.h
+++ b/src/nyx/roi_cache.h
@@ -42,6 +42,7 @@ class LR: public BasicLR
 
 	bool roi_disabled = false;
 
+	std::vector <Pixel2> rle_pixels;
 	std::vector <Pixel2> raw_pixels;
 	OutOfRamPixelCloud osized_pixel_cloud;
 	unsigned int aux_area = 0;
diff --git a/src/nyx/scan_fastloader_way.cpp b/src/nyx/scan_fastloader_way.cpp
index b6ed7a0a..5d5f29af 100644
--- a/src/nyx/scan_fastloader_way.cpp
+++ b/src/nyx/scan_fastloader_way.cpp
@@ -30,7 +30,7 @@
 
 namespace Nyxus
 {
-	bool processIntSegImagePair (const std::string& intens_fpath, const std::string& label_fpath, int num_FL_threads, int filepair_index, int tot_num_filepairs)
+	bool processIntSegImagePair (const std::string& intens_fpath, const std::string& label_fpath, int num_FL_threads, int filepair_index, int tot_num_filepairs, bool use_fast)
 	{
 		std::vector<int> trivRoiLabels, nontrivRoiLabels;
 
@@ -52,14 +52,17 @@ namespace Nyxus
 					int digits = 2, k = std::pow(10.f, digits);
 				float perCent = float(filepair_index * 100 * k / tot_num_filepairs) / float(k);
 				VERBOSLVL1(std::cout << "[ " << std::setw(digits + 2) << perCent << "% ]\t" << " INT: " << intens_fpath << " SEG: " << label_fpath << "\n";)
-
 			}
 
 			{ STOPWATCH("Image scan2a/ImgScan2a/Scan2a/lightsteelblue", "\t=");
 
 			// Phase 1: gather ROI metrics
 			VERBOSLVL1(std::cout << "Gathering ROI metrics\n";)
-				gatherRoisMetrics(intens_fpath, label_fpath, num_FL_threads);	// Output - set of ROI labels, label-ROI cache mappings
+				if (use_fast) {
+					gatherRoisMetricsFast(intens_fpath, label_fpath, num_FL_threads);	// Output - set of ROI labels, label-ROI cache mappings
+				} else {
+					gatherRoisMetrics(intens_fpath, label_fpath, num_FL_threads);	// Output - set of ROI labels, label-ROI cache mappings
+				}
 
 			}
 
@@ -101,7 +104,7 @@ namespace Nyxus
 		if (trivRoiLabels.size())
 		{
 			VERBOSLVL1(std::cout << "Processing trivial ROIs\n";)
-			processTrivialRois (trivRoiLabels, intens_fpath, label_fpath, num_FL_threads, theEnvironment.get_ram_limit());
+			processTrivialRois (trivRoiLabels, intens_fpath, label_fpath, num_FL_threads, theEnvironment.get_ram_limit(), use_fast);
 		}
 
 		// Phase 3: process nontrivial (oversized) ROIs, if any
@@ -122,7 +125,8 @@ namespace Nyxus
 		int numReduceThreads,
 		int min_online_roi_size,
 		bool save2csv,
-		const std::string& csvOutputDir)
+		const std::string& csvOutputDir,
+		bool use_fastloop)
 	{
 		bool ok = true;
 
@@ -148,7 +152,7 @@ namespace Nyxus
 				return 1;
 			}
 
-			ok = processIntSegImagePair (ifp, lfp, numFastloaderThreads, i, nf);		// Phased processing
+			ok = processIntSegImagePair (ifp, lfp, numFastloaderThreads, i, nf, use_fastloop);		// Phased processing
 
 			if (ok == false)
 			{
@@ -185,6 +189,30 @@ namespace Nyxus
 		return 0; // success
 	}
 
+	int processDataset(
+		const std::vector<std::string>& intensFiles,
+		const std::vector<std::string>& labelFiles,
+		int numFastloaderThreads,
+		int numSensemakerThreads,
+		int numReduceThreads,
+		int min_online_roi_size,
+		bool save2csv,
+		const std::string& csvOutputDir) 
+		{
+			return processDataset(
+				intensFiles,
+				labelFiles,
+				numFastloaderThreads,
+				numSensemakerThreads,
+				numReduceThreads,
+				min_online_roi_size,
+				save2csv,
+				csvOutputDir,
+				false
+			);
+
+		}
+
 	void dump_roi_metrics(const std::string & label_fpath)
 	{
 		fs::path pseg (label_fpath);
diff --git a/src/nyx/sort.c b/src/nyx/sort.c
new file mode 100644
index 00000000..111e1021
--- /dev/null
+++ b/src/nyx/sort.c
@@ -0,0 +1,486 @@
+/*
+ * Sorting algorithms for compression
+ */
+
+#include "sort.h"
+
+/*
+ * Sorting networks
+ *
+ * Below is a series of bitonic sorting network components. Sorting networks
+ * for 2, 4, and 8 values receive all receive a 256-bit vector of 2, 4, or 8
+ * uint32 values (respectively).
+ *
+ * NOTE: These sort data into descending order.
+ *
+ * TODO: Optimize the layered sorting to remove extraneous shuffle operations.
+ *       For example, when sorting an array of 4 values merge_sort_2 is called
+ *       which currently swaps the values into place if they are out of order.
+ *       This could be optimized to swap values into the proper order for the
+ *       next set of comparisons, thus removing one swap operation for each
+ *       level of sorting.
+ *
+ * Ideas in these sorting implementations were taken from the following sources:
+ * https://ieeexplore.ieee.org/document/8638394
+ * https://www.vldb.org/pvldb/vol1/1454171.pdf
+ */
+
+// Swap adjacent integers into order
+const uint8_t merge_swap_2 = 0xB1;
+const uint8_t merge_shuf_2 = 0xF5;
+void merge_sort_2(__m256i *values, __m256i *indices) {
+
+    // Switch adjacent values
+    __m256i values_swapped = _mm256_shuffle_epi32(*values,merge_swap_2);
+
+    // Swap adjacent indices
+    __m256i indices_swapped = _mm256_shuffle_epi32(*indices,merge_swap_2);
+
+    // Compare values
+    __m256i cmp = _mm256_cmpgt_epi32(*values,values_swapped);
+
+    // Get blend indices
+    cmp = _mm256_shuffle_epi32(cmp,merge_shuf_2);
+
+    // Swap values
+    *values = _mm256_blendv_epi8(*values,values_swapped,cmp);
+    *indices = _mm256_blendv_epi8(*indices,indices_swapped,cmp);
+
+}
+
+// Sort integers in upper and lower lanes
+const uint8_t merge_swap_4 = 0x1B;
+const uint8_t merge_shuf_4 = 0xEB;
+void merge_sort_4(__m256i *values, __m256i *indices) {
+
+    // Sort adjacent integers
+    merge_sort_2(values,indices);
+
+    // Swap inner and outer values for comparison
+    __m256i values_swapped = _mm256_shuffle_epi32(*values,merge_swap_4);
+
+    // Swap inner and outer indices
+    __m256i indices_swapped = _mm256_shuffle_epi32(*indices,merge_swap_4);
+
+    // Compare values
+    __m256i cmp = _mm256_cmpgt_epi32(*values,values_swapped);
+
+    // Get blend indices
+    cmp = _mm256_shuffle_epi32(cmp,merge_shuf_4);
+
+    // Swap values
+    *values = _mm256_blendv_epi8(*values,values_swapped,cmp);
+    *indices = _mm256_blendv_epi8(*indices,indices_swapped,cmp);
+
+    // Finally, sort adjacent integers
+    merge_sort_2(values,indices);
+
+}
+
+// Sort integers in upper and lower lanes using interleaved comparisons
+// Used when more than 4 values are compared and merge_sort_4 has been called
+const uint8_t merge_swap_inter_4 = 0x4E;
+const uint8_t merge_shuf_inter_4 = 0xEE;
+void merge_sort_inter_4(__m256i *values, __m256i *indices) {
+
+    // Swap inner and outer values for comparison
+    __m256i values_swapped = _mm256_shuffle_epi32(*values,merge_swap_inter_4);
+
+    // Swap inner and outer indices
+    __m256i indices_swapped = _mm256_shuffle_epi32(*indices,merge_swap_inter_4);
+
+    // Compare values
+    __m256i cmp = _mm256_cmpgt_epi32(*values,values_swapped);
+
+    // Get blend indices
+    cmp = _mm256_shuffle_epi32(cmp,merge_shuf_inter_4);
+
+    // Swap values
+    *values = _mm256_blendv_epi8(*values,values_swapped,cmp);
+    *indices = _mm256_blendv_epi8(*indices,indices_swapped,cmp);
+
+    // Finally, sort adjacent integers
+    merge_sort_2(values,indices);
+
+}
+
+// Sort integers across 128 bit lanes
+void merge_sort_8(__m256i *values, __m256i *indices) {
+    const __m256i merge_swap_8 = _mm256_set_epi32(0,1,2,3,4,5,6,7);
+    const __m256i merge_shuf_8 = _mm256_set_epi32(7,6,5,4,4,5,6,7);
+
+    // Sort integers within each lane
+    merge_sort_4(values,indices);
+
+    // Swap inner and outer values for comparison
+    __m256i values_swapped = _mm256_permutevar8x32_epi32(*values,merge_swap_8);
+
+    // Swap inner and outer indices
+    __m256i indices_swapped = _mm256_permutevar8x32_epi32(*indices,merge_swap_8);
+
+    // Compare values
+    __m256i cmp = _mm256_cmpgt_epi32(*values,values_swapped);
+
+    // Get blend indices
+    cmp = _mm256_permutevar8x32_epi32(cmp,merge_shuf_8);
+
+    // Swap values
+    *values = _mm256_blendv_epi8(*values,values_swapped,cmp);
+    *indices = _mm256_blendv_epi8(*indices,indices_swapped,cmp);
+
+    // Values are sorted into high and low values, 
+    merge_sort_inter_4(values,indices);
+
+}
+
+// Sort integers across 128 bit lanes using interleaved comparisons
+// Used when more than 8 values are compared
+void merge_sort_inter_8(__m256i *values, __m256i *indices) {
+    const __m256i merge_swap_8 = _mm256_set_epi32(3,2,1,0,7,6,5,4);
+    const __m256i merge_shuf_8 = _mm256_set_epi32(7,6,5,4,7,6,5,4);
+
+    // Swap inner and outer values for comparison
+    __m256i values_swapped = _mm256_permutevar8x32_epi32(*values,merge_swap_8);
+
+    // Swap inner and outer indices
+    __m256i indices_swapped = _mm256_permutevar8x32_epi32(*indices,merge_swap_8);
+
+    // Compare values
+    __m256i cmp = _mm256_cmpgt_epi32(*values,values_swapped);
+
+    // Get blend indices
+    cmp = _mm256_permutevar8x32_epi32(cmp,merge_shuf_8);
+
+    // Swap values
+    *values = _mm256_blendv_epi8(*values,values_swapped,cmp);
+    *indices = _mm256_blendv_epi8(*indices,indices_swapped,cmp);
+
+    // Values are sorted into high and low values, 
+    merge_sort_inter_4(values,indices);
+
+}
+
+// Sort integers across vectors
+// values and indices must be an array of __m256i vectors, and the first two
+// are sorted
+void merge_sort_16(__m256i *values, __m256i *indices) {
+    const __m256i merge_swap_16 = _mm256_set_epi32(0,1,2,3,4,5,6,7);
+
+    // Sort integers within each vector
+    merge_sort_8(&values[0],&indices[0]);
+    merge_sort_8(&values[1],&indices[1]);
+
+    // Swap inner and outer values for comparison
+    __m256i values_swapped[2];
+    __m256i indices_swapped[2];
+    values_swapped[0] = _mm256_permutevar8x32_epi32(values[0],merge_swap_16);
+    indices_swapped[1] = _mm256_permutevar8x32_epi32(indices[1],merge_swap_16);
+    values_swapped[1] = _mm256_permutevar8x32_epi32(values[1],merge_swap_16);
+    indices_swapped[0] = _mm256_permutevar8x32_epi32(indices[0],merge_swap_16);
+
+    // Compare values
+    __m256i cmp = _mm256_cmpgt_epi32(values_swapped[1],values[0]);
+
+    // Blend the first vector
+    values[0] = _mm256_blendv_epi8(values[0],values_swapped[1],cmp);
+    __m256i cmp_reversed = _mm256_permutevar8x32_epi32(cmp,merge_swap_16);
+    indices[0] = _mm256_blendv_epi8(indices[0],indices_swapped[1],cmp);
+
+    // Blend the second vector
+    values[1] = _mm256_blendv_epi8(values[1],values_swapped[0],cmp_reversed);
+    indices[1] = _mm256_blendv_epi8(indices[1],indices_swapped[0],cmp_reversed);
+
+    // Values are sorted into high and low values, 
+    merge_sort_inter_8(&values[0],&indices[0]);
+    merge_sort_inter_8(&values[1],&indices[1]);
+
+}
+
+// Sort integers across vectors using interleaved comparisons
+void merge_sort_inter_16(__m256i *values, __m256i *indices) {
+
+    // Swap inner and outer values for comparison
+    __m256i values_swapped[2];
+    __m256i indices_swapped[2];
+    values_swapped[0] = values[0];
+    indices_swapped[1] = indices[1];
+    values_swapped[1] = values[1];
+    indices_swapped[0] = indices[0];
+
+    // Compare values
+    __m256i cmp = _mm256_cmpgt_epi32(values_swapped[1],values[0]);
+
+    // Blend the first vector
+    values[0] = _mm256_blendv_epi8(values[0],values_swapped[1],cmp);
+    __m256i cmp_reversed = ~cmp;
+    indices[0] = _mm256_blendv_epi8(indices[0],indices_swapped[1],cmp);
+
+    // Blend the second vector
+    values[1] = _mm256_blendv_epi8(values[1],values_swapped[0],cmp_reversed);
+    indices[1] = _mm256_blendv_epi8(indices[1],indices_swapped[0],cmp_reversed);
+
+    // Values are sorted into high and low values, 
+    merge_sort_inter_8(&values[0],&indices[0]);
+    merge_sort_inter_8(&values[1],&indices[1]);
+
+}
+
+// Sort integers across 4 vectors
+// values and indices must be an array of __m256i vectors, and the first four
+// are sorted
+void merge_sort_16(__m256i *values, __m256i *indices) {
+    const __m256i merge_swap_16 = _mm256_set_epi32(0,1,2,3,4,5,6,7);
+
+    // Sort integers within each vector
+    merge_sort_16(values,indices);
+    merge_sort_16(values+2,indices+2);
+
+    // Swap inner and outer values for comparison
+    __m256i values_swapped[4];
+    __m256i indices_swapped[4];
+    values_swapped[0] = _mm256_permutevar8x32_epi32(values[0],merge_swap_16);
+    indices_swapped[1] = _mm256_permutevar8x32_epi32(indices[1],merge_swap_16);
+    values_swapped[1] = _mm256_permutevar8x32_epi32(values[1],merge_swap_16);
+    indices_swapped[0] = _mm256_permutevar8x32_epi32(indices[0],merge_swap_16);
+
+    // Compare values
+    __m256i cmp[2];
+    __m256i cmp[0] = _mm256_cmpgt_epi32(values_swapped[1],values[0]);
+
+    // Blend the first vector
+    values[0] = _mm256_blendv_epi8(values[0],values_swapped[1],cmp);
+    __m256i cmp_reversed = _mm256_permutevar8x32_epi32(cmp,merge_swap_16);
+    indices[0] = _mm256_blendv_epi8(indices[0],indices_swapped[1],cmp);
+
+    // Blend the second vector
+    values[1] = _mm256_blendv_epi8(values[1],values_swapped[0],cmp_reversed);
+    indices[1] = _mm256_blendv_epi8(indices[1],indices_swapped[0],cmp_reversed);
+
+    // Values are sorted into high and low values, 
+    merge_sort_inter_8(&values[0],&indices[0]);
+    merge_sort_inter_8(&values[1],&indices[1]);
+
+}
+
+/*
+ * First pass binning algorithm for matching uint8 values
+ *
+ * This function determines the number of values in each bin of a histogram and
+ * determines the offsets where values belong as a preliminary step in a count
+ * sort algorithm. Each bin in the histogram represents a pair of bytes, so a
+ * bin value of 10 in bin 0x0000, then there are exactly 10 locations in the
+ * byte array where there is repeated null byte (with possible overlap).
+ *
+ * There are two steps to this algorithm
+ * 1. Count the number of times each pair of bytes occurs.
+ * 2. Calculate the offsets for where an index of a byte pair should be placed
+ *    using count sort, such that the pairs of bytes that occur most frequently
+ *    occur first.
+ *
+ * Modified from https://github.com/powturbo/Turbo-Histogram/blob/master/turbohist_.c
+ */
+static void bin_turbo_init(uint8_t *bytes,
+                           uint32_t size,
+                           uint32_t *counts,
+                           uint32_t *parts,
+                           uint32_t *part_count) {
+
+    /*
+     * Step 1: Count the number of occurences of each pair of bytes
+     */
+
+    // Two arrays to hold byte pair counts. Two arrays helps prevent collisions.
+    uint32_t c[2][65536]={0};
+
+    // Pointer to position in byte array
+    uint8_t *ip;
+
+    // Variable for preloading next set of bytes
+    __m128i cpv = _mm_loadu_si128((__m128i*)bytes);
+
+    // TODO: Remove the vector reversal as an optimization
+    // Vector used to reverse the vector, useful when debugging.
+    __m128i reverse = _mm_set_epi8( 0, 1, 2, 3, 4, 5, 6, 7,
+                                    8, 9,10,11,12,13,14,15);
+
+    // Loop through the data 32 bytes at a time.
+    // Intrinsics are used to load the data to help reduce cache misses
+    for( ip = bytes; ip != bytes+(size&~(32-1)); ) {
+        ip+=16;
+
+        // Load and reverse the vectors
+        uint8_t cvt = ip[0];
+        __m128i cv=cpv;
+        __m128i dv = _mm_loadu_si128((__m128i*)(ip));
+        cv = _mm_shuffle_epi8(cv,reverse);
+        dv = _mm_shuffle_epi8(dv,reverse);
+        ip+=16;
+        uint8_t dvt = ip[0];
+        cpv = _mm_loadu_si128((__m128i*)(ip));
+
+        // Bin the data as uint16, bins even indexed values
+        c[0][_mm_extract_epi16(cv,  0)]++;
+        c[1][_mm_extract_epi16(dv,  0)]++;
+        c[0][_mm_extract_epi16(cv,  1)]++;
+        c[1][_mm_extract_epi16(dv,  1)]++;
+        c[0][_mm_extract_epi16(cv,  2)]++;
+        c[1][_mm_extract_epi16(dv,  2)]++;
+        c[0][_mm_extract_epi16(cv,  3)]++;
+        c[1][_mm_extract_epi16(dv,  3)]++;
+        c[0][_mm_extract_epi16(cv,  4)]++;
+        c[1][_mm_extract_epi16(dv,  4)]++;
+        c[0][_mm_extract_epi16(cv,  5)]++;
+        c[1][_mm_extract_epi16(dv,  5)]++;
+        c[0][_mm_extract_epi16(cv,  6)]++;
+        c[1][_mm_extract_epi16(dv,  6)]++;
+        c[0][_mm_extract_epi16(cv,  7)]++;
+        c[1][_mm_extract_epi16(dv,  7)]++;
+
+        // shift, add in missing value, bin odd indexed values
+        cv = _mm_slli_si128(cv,1);
+        dv = _mm_slli_si128(dv,1);
+        cv = _mm_insert_epi8(cv,cvt,0);
+        dv = _mm_insert_epi8(dv,dvt,0);
+
+        c[0][_mm_extract_epi16(cv,  0)]++;
+        c[1][_mm_extract_epi16(dv,  0)]++;
+        c[0][_mm_extract_epi16(cv,  1)]++;
+        c[1][_mm_extract_epi16(dv,  1)]++;
+        c[0][_mm_extract_epi16(cv,  2)]++;
+        c[1][_mm_extract_epi16(dv,  2)]++;
+        c[0][_mm_extract_epi16(cv,  3)]++;
+        c[1][_mm_extract_epi16(dv,  3)]++;
+        c[0][_mm_extract_epi16(cv,  4)]++;
+        c[1][_mm_extract_epi16(dv,  4)]++;
+        c[0][_mm_extract_epi16(cv,  5)]++;
+        c[1][_mm_extract_epi16(dv,  5)]++;
+        c[0][_mm_extract_epi16(cv,  6)]++;
+        c[1][_mm_extract_epi16(dv,  6)]++;
+        c[0][_mm_extract_epi16(cv,  7)]++;
+        c[1][_mm_extract_epi16(dv,  7)]++;
+
+        // Prefetch data to improve vector loading
+        __builtin_prefetch(ip+512, 0);
+    }
+
+    // Finish binning the data
+    uint16_t val;
+    while( ip < bytes+size ) {
+        val = *((uint16_t *) ip);
+        c[0][(uint16_t) ((val << 8) + (val >> 8))]++;
+        ip++;
+    }
+
+    /*
+     * Step 2: Calculate the index offsets such that the indices of the most
+     *         frequently occuring values occur first.
+     */
+
+    uint32_t indices[65536];
+    __m256i index = _mm256_set_epi32(7,6,5,4,3,2,1,0);
+    __m256i delta = _mm256_set_epi32(8,8,8,8,8,8,8,8);
+    __m256i c0;
+    for ( uint32_t i = 0; i < 65536; ) {
+        _mm256_storeu_si256((__m256i *) &indices[i],index);
+        c0 = _mm256_loadu_si256((__m256i *) &c[0][i]);
+        __m256i c1 = _mm256_loadu_si256((__m256i *) &c[1][i]);
+        index = _mm256_add_epi32(index,delta);
+        c0 = _mm256_add_epi32(c0,c1);
+        _mm256_storeu_si256((__m256i *) &c[0][i],c0);
+        i += 8;
+    }
+
+    // Calculate the offsets - Could probably be optimized
+    // Modified from https://stackoverflow.com/a/31229764
+    int cmp( const void *a, const void *b ){
+        int ia = *(int *)a;
+        int ib = *(int *)b;
+        return (c[0][ia] > c[0][ib]) - (c[0][ia] < c[0][ib]);
+    }
+
+    qsort(indices, 65536, sizeof(*indices), cmp);
+
+    uint32_t count = 0;
+
+    // Loop backwards so smallest clusters are last
+    parts[0] = 0;
+    (*part_count)++;
+    for (int32_t i = 65535; i >= 0;--i) {
+        counts[indices[i]] = count;
+        count += c[0][indices[i]];
+        if (count - counts[indices[i]] > 1) {
+            parts[65536 - i] = count;
+            (*part_count)++;
+        }
+    }
+}
+
+/*
+ * Function for the first pass of sorting
+ *
+ * The first pass of sorting is simple: Count the number of times each pair of
+ * characters  appears, then directly assign an index to the offset for each
+ * set of values. This is a fast, stable sort. Subsequent sorting of partitions
+ * requires slightly more complexity because previous sorting positions need to
+ * be known, requiring a reference from a separate array.
+ */
+void count_sort_init(uint8_t *bytes,
+                     uint32_t *indices,
+                     uint32_t size,
+                     uint32_t *counts,
+                     uint32_t *parts,
+                     uint32_t *part_count) {
+
+    // Bin the data
+    bin_turbo_init(bytes,size,counts,parts,part_count);
+    uint32_t c;
+
+    __m128i reverse = _mm_set_epi8( 0, 1, 2, 3, 4, 5, 6, 7,
+                                    8, 9,10,11,12,13,14,15);
+
+    // Sort the indices, unroll 16 times
+    uint8_t *ip;
+    __m256i index1 = _mm256_set_epi32( 7 ,6, 5, 4, 3, 2, 1, 0);
+    __m256i index2 = _mm256_set_epi32(15,14,13,12,11,10, 9, 8);
+    __m128i cpv = _mm_loadu_si128((__m128i*)bytes);
+    __m256i delta = _mm256_set1_epi32(16);
+    for(ip = bytes; ip != bytes+(size&~(16-1)); ) {
+        // Load the values and start loading a new set of values
+        __m128i cv = cpv;
+        __m128i dv = _mm_srli_si128(cpv,1);
+        ip += 16;
+        cv = _mm_shuffle_epi8(cv,reverse);
+        dv = _mm_insert_epi8(dv,ip[0],15);
+        cpv = _mm_loadu_si128((__m128i*)(ip));
+        dv = _mm_shuffle_epi8(dv,reverse);
+
+        // Set the data
+        indices[counts[_mm_extract_epi16(cv,  7)]++] = _mm256_extract_epi32(index1,  0);
+        indices[counts[_mm_extract_epi16(dv,  7)]++] = _mm256_extract_epi32(index1,  1);
+        indices[counts[_mm_extract_epi16(cv,  6)]++] = _mm256_extract_epi32(index1,  2);
+        indices[counts[_mm_extract_epi16(dv,  6)]++] = _mm256_extract_epi32(index1,  3);
+        indices[counts[_mm_extract_epi16(cv,  5)]++] = _mm256_extract_epi32(index1,  4);
+        indices[counts[_mm_extract_epi16(dv,  5)]++] = _mm256_extract_epi32(index1,  5);
+        indices[counts[_mm_extract_epi16(cv,  4)]++] = _mm256_extract_epi32(index1,  6);
+        indices[counts[_mm_extract_epi16(dv,  4)]++] = _mm256_extract_epi32(index1,  7);
+        index1 = _mm256_add_epi32(index1,delta);
+        indices[counts[_mm_extract_epi16(cv,  3)]++] = _mm256_extract_epi32(index2,  0);
+        indices[counts[_mm_extract_epi16(dv,  3)]++] = _mm256_extract_epi32(index2,  1);
+        indices[counts[_mm_extract_epi16(cv,  2)]++] = _mm256_extract_epi32(index2,  2);
+        indices[counts[_mm_extract_epi16(dv,  2)]++] = _mm256_extract_epi32(index2,  3);
+        indices[counts[_mm_extract_epi16(cv,  1)]++] = _mm256_extract_epi32(index2,  4);
+        indices[counts[_mm_extract_epi16(dv,  1)]++] = _mm256_extract_epi32(index2,  5);
+        indices[counts[_mm_extract_epi16(cv,  0)]++] = _mm256_extract_epi32(index2,  6);
+        indices[counts[_mm_extract_epi16(dv,  0)]++] = _mm256_extract_epi32(index2,  7);
+        index2 = _mm256_add_epi32(index2,delta);
+        __builtin_prefetch(ip+512, 0);
+    }
+    c = _mm256_extract_epi32(index1,  0);
+    uint16_t val;
+    while(ip < bytes+size) {
+        val = *((uint16_t *) ip);
+        indices[counts[(uint16_t) ((val << 8) + (val >> 8))]++] = c;
+        c++;
+        ip++;
+    }
+};
diff --git a/src/nyx/sort.h b/src/nyx/sort.h
new file mode 100644
index 00000000..727cd4db
--- /dev/null
+++ b/src/nyx/sort.h
@@ -0,0 +1,25 @@
+#ifndef SORT_H
+#define SORT_H
+
+#include <stdio.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <immintrin.h>
+
+void count_sort_init(uint8_t *bytes,
+                     uint32_t *indices,
+                     uint32_t size,
+                     uint32_t *counts,
+                     uint32_t *parts,
+                     uint32_t *part_count);
+
+void merge_sort_2(__m256i *values, __m256i *indices);
+
+void merge_sort_4(__m256i *values, __m256i *indices);
+
+void merge_sort_8(__m256i *values, __m256i *indices);
+
+void merge_sort_16(__m256i *values, __m256i *indices);
+
+#endif // SORT_H
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index e7ce3eb6..275b0588 100755
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -92,6 +92,7 @@ set(TEST_SRC
 	../src/nyx/output_2_buffer.cpp
 	../src/nyx/output_2_csv.cpp
 	../src/nyx/parallel.cpp
+	../src/nyx/phase1_fastloop.cpp
 	../src/nyx/phase1.cpp
 	../src/nyx/phase2.cpp
 	../src/nyx/phase3.cpp
diff --git a/tests/python/test_nyxus.py b/tests/python/test_nyxus.py
index 6e4320a8..f41f0a1a 100644
--- a/tests/python/test_nyxus.py
+++ b/tests/python/test_nyxus.py
@@ -17,34 +17,90 @@ def test_import(self):
         assert nyxus.__name__ == "nyxus" 
         
 class TestNyxus():
-        PATH = PATH = Path(__file__).with_name('data')
+    PATH = PATH = Path(__file__).with_name('data')
 
-        def test_gabor_gpu(self):
-                urls = ['https://github.com/stardist/stardist/releases/download/0.1.0/dsb2018.zip']
-                dir_names = ['dsb2018']
+    def test_gabor_gpu_validate(self):
+            urls = ['https://github.com/stardist/stardist/releases/download/0.1.0/dsb2018.zip']
+            dir_names = ['dsb2018']
+            
+            download_datasets(urls, dir_names)
+            
+            for dset in dir_names:
                 
-                download_datasets(urls, dir_names)
+                intens = str(self.PATH/dset/"train/images")
+                seg = str(self.PATH/dset/"train/masks")
                 
-                for dset in dir_names:
-                    
-                    intens = str(self.PATH/dset/"train/images")
-                    seg = str(self.PATH/dset/"train/masks")
+                # cpu gabor
+                cpu_nyx = nyxus.Nyxus(["GABOR"])
+                if (nyxus.gpu_is_available()):
+                    cpu_nyx.using_gpu(False)
+                cpu_features = cpu_nyx.featurize_directory(intens, seg)
+
+                if (nyxus.gpu_is_available()):
+                    # gpu gabor
+                    gpu_nyx = nyxus.Nyxus(["GABOR"], using_gpu=0)
+                    gpu_nyx.using_gpu(True)
+                    gpu_features = gpu_nyx.featurize_directory(intens, seg)
                     
-                    # cpu gabor
-                    cpu_nyx = nyxus.Nyxus(["GABOR"])
-                    if (nyxus.gpu_is_available()):
-                        cpu_nyx.using_gpu(False)
-                    cpu_features = cpu_nyx.featurize_directory(intens, seg)
+                    assert gpu_features.equals(cpu_features)
+                else:
+                    print("Gpu not available")
+                    assert True
+    
+    @pytest.mark.parametrize("use_fastloop",[True, False])
+    def test_intensity_benchmark(self,use_fastloop,benchmark):
+        
+        urls = ['https://github.com/stardist/stardist/releases/download/0.1.0/dsb2018.zip']
+        dir_names = ['dsb2018']
+        
+        download_datasets(urls, dir_names)
+        
+        for dset in dir_names:
+            
+            intens = str(self.PATH/dset/"train/images")
+            seg = str(self.PATH/dset/"train/masks")
+            
+            # cpu gabor
+            cpu_nyx = nyxus.Nyxus(["*ALL_MORPHOLOGY*"])
+            cpu_nyx.featurize_directory
+            cpu_features = benchmark(cpu_nyx.featurize_directory,intens, seg, ".*", use_fastloop)
+            
+    def test_intensity_validate(self):
+        
+        urls = ['https://github.com/stardist/stardist/releases/download/0.1.0/dsb2018.zip']
+        dir_names = ['dsb2018']
+        
+        download_datasets(urls, dir_names)
+        
+        for dset in dir_names:
+            
+            intens = str(self.PATH/dset/"train/images")
+            seg = str(self.PATH/dset/"train/masks")
+            
+            # cpu gabor
+            cpu_nyx = nyxus.Nyxus(["*ALL_MORPHOLOGY*"])
+            cpu_features = cpu_nyx.featurize_directory(intens, seg, ".*", False)
+            
+            fast_features = cpu_nyx.featurize_directory(intens, seg, ".*", True)
+            
+            assert cpu_features.equals(fast_features)
 
-                    if (nyxus.gpu_is_available()):
-                        # gpu gabor
-                        gpu_nyx = nyxus.Nyxus(["GABOR"], using_gpu=0)
-                        gpu_nyx.using_gpu(True)
-                        gpu_features = gpu_nyx.featurize_directory(intens, seg)
-                        
-                        assert gpu_features.equals(cpu_features)
-                    else:
-                        print("Gpu not available")
-                        assert True
-                
+    # For benchmarking gabor features            
+    # @pytest.mark.parametrize("use_gpu",[True, False])
+    # def test_gabor_benchmark(self, use_gpu, benchmark):
+        
+    #     urls = ['https://github.com/stardist/stardist/releases/download/0.1.0/dsb2018.zip']
+    #     dir_names = ['dsb2018']
+        
+    #     download_datasets(urls, dir_names)
+        
+    #     for dset in dir_names:
+            
+    #         intens = str(self.PATH/dset/"train/images")
+    #         seg = str(self.PATH/dset/"train/masks")
+            
+    #         # cpu gabor
+    #         cpu_nyx = nyxus.Nyxus(["GABOR"])
+    #         cpu_nyx.using_gpu(use_gpu)
+    #         cpu_features = benchmark(cpu_nyx.featurize_directory,intens, seg)
         
\ No newline at end of file