diff --git a/CMakeLists.txt b/CMakeLists.txt index afc7423f..a8bc6a39 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -69,7 +69,7 @@ if(NOT CMAKE_BUILD_TYPE) set(CMAKE_BUILD_TYPE Release) endif() -set(CMAKE_CXX_FLAGS_RELEASE "-O2") +set(CMAKE_CXX_FLAGS_RELEASE "-O2 -march=native") #== GTest option(RUN_GTEST "Downloads google unit test API and runs google test scripts to test Nyxus" OFF) @@ -137,11 +137,13 @@ set(SOURCE src/nyx/output_2_csv.cpp src/nyx/parallel.cpp src/nyx/phase1.cpp + src/nyx/phase1_fastloop.cpp src/nyx/phase2.cpp src/nyx/phase3.cpp src/nyx/pixel_feed.cpp src/nyx/reduce_by_feature.cpp src/nyx/reduce_trivial_rois.cpp + src/nyx/rle.cpp src/nyx/roi_cache.cpp src/nyx/roi_cache_basic.cpp src/nyx/scan_fastloader_way.cpp diff --git a/src/nyx/features/aabb.h b/src/nyx/features/aabb.h index 7c93d032..5ed5ca90 100644 --- a/src/nyx/features/aabb.h +++ b/src/nyx/features/aabb.h @@ -26,6 +26,14 @@ class AABB xmin = std::min(xmin, x); xmax = std::max(xmax, x); } + void update_xmax(StatsInt x) + { + xmax = std::max(xmax, x); + } + void update_xmin(StatsInt x) + { + xmin = std::min(xmin, x); + } void update_y(StatsInt y) { ymin = std::min(ymin, y); @@ -52,4 +60,4 @@ class AABB private: StatsInt xmin = INT32_MAX, xmax = INT32_MIN, ymin = INT32_MAX, ymax = INT32_MIN; -}; \ No newline at end of file +}; diff --git a/src/nyx/features/basic_morphology.cpp b/src/nyx/features/basic_morphology.cpp index b77293e2..8d2adb24 100644 --- a/src/nyx/features/basic_morphology.cpp +++ b/src/nyx/features/basic_morphology.cpp @@ -36,10 +36,19 @@ void BasicMorphologyFeatures::calculate(LR& r) // --CENTROID_XY double cen_x = 0.0, cen_y = 0.0; - for (auto& px : r.raw_pixels) - { - cen_x += px.x; - cen_y += px.y; + if (r.rle_pixels.empty()) { + for (auto& px : r.raw_pixels) + { + cen_x += px.x; + cen_y += px.y; + } + } else { + for (int i = 0; i < r.rle_pixels.size(); i+=2) { + double n = r.rle_pixels[i+1].x - r.rle_pixels[i].x + 1; + double s = r.rle_pixels[i+1].x + r.rle_pixels[i].x; + cen_x += n/2 * s; + cen_y += r.rle_pixels[i].y * n; + } } val_CENTROID_X = cen_x; @@ -62,10 +71,19 @@ void BasicMorphologyFeatures::calculate(LR& r) //==== Basic morphology :: Centroids val_CENTROID_X = val_CENTROID_Y = 0; - for (auto& px : r.raw_pixels) - { - val_CENTROID_X += px.x; - val_CENTROID_Y += px.y; + if (r.rle_pixels.empty()) { + for (auto& px : r.raw_pixels) + { + val_CENTROID_X += px.x; + val_CENTROID_Y += px.y; + } + } else { + for (int i = 0; i < r.rle_pixels.size(); i+=2) { + double n = r.rle_pixels[i+1].x - r.rle_pixels[i].x + 1; + double s = r.rle_pixels[i+1].x + r.rle_pixels[i].x; + val_CENTROID_X += n/2 * s; + val_CENTROID_Y += r.rle_pixels[i].y * n; + } } val_CENTROID_X /= n; val_CENTROID_Y /= n; diff --git a/src/nyx/features/contour.cpp b/src/nyx/features/contour.cpp index 1632ea93..c169e2f1 100644 --- a/src/nyx/features/contour.cpp +++ b/src/nyx/features/contour.cpp @@ -1,3 +1,6 @@ +#include "pixel.h" +#include +#include #define _USE_MATH_DEFINES // For M_PI, etc. #include #include @@ -30,8 +33,169 @@ ContourFeature::ContourFeature() : FeatureMethod("ContourFeature") }); } +void ContourFeature::buildRegularContourFast(LR& r) +{ + + r.contour.clear(); + + // Order the RLE + std::vector rle = r.rle_pixels; + std::sort (rle.begin(), rle.end(), compare_locations); + + // Get starting points of each row + // Also merge overlapping RLE + std::vector rows; + rows.push_back(0); + for (uint32_t i = 2; i < rle.size(); i+=2) { + + // Finds row boundary + if (rle[i].y != rle[i-1].y) { + rows.push_back(i); + continue; + } + + // Find overlapping RLE and merge them + if (rle[i].x-1 == rle[i-1].x) { + rle[i-1] = rle[i+1]; + rle[i].x = rle[i+1].x+1; + } + } + rows.push_back(rle.size()); + + // The first row must be entirely on the edge, so store it in the contour + uint32_t pix_ind = 0; + for (uint32_t i = 0; i < rows[1]; i+=2) { + uint32_t n = rle[i+1].x - rle[i].x + 1; + for (uint32_t ind = pix_ind; ind < pix_ind + n; ++ind) { + Pixel2 p(r.raw_pixels[ind].x,r.raw_pixels[ind].y,r.raw_pixels[ind].inten); + r.contour.push_back(p); + } + pix_ind += n; + } + + // Loop over rows from the 2nd row until the 2nd to last row + for (uint32_t p = 1; p < rows.size()-2; ++p) { + uint32_t prev = rows[p-1]; + uint32_t prev_end = rows[p]; + uint32_t next = rows[p+1]; + uint32_t next_end = rows[p+2]; + + // Find the intersection of previous and next rows + std::vector overlap; + while (prev < prev_end && next < next_end) { + if (rle[prev].x > rle[next+1].x) { + next += 2; + continue; + } + if (rle[next].x > rle[prev+1].x) { + prev += 2; + continue; + } + + // If we get to this point, we have found overlap + overlap.push_back(std::max(rle[prev].x,rle[next].x)); + overlap.push_back(std::min(rle[prev+1].x,rle[next+1].x)); + + // Determine whether to advance next or previous + if (rle[prev+1].x > rle[next+1].x) { + next += 2; + } else { + prev += 2; + } + } + + // If there is no overlap, all pixels are edge pixels + if (overlap.empty()) { + for (uint32_t current = rows[p]; current < rows[p+1]; current+=2) { + uint32_t n = rle[current+1].x - rle[current].x + 1; + for (uint32_t ind = pix_ind; ind < pix_ind + n; ++ind) { + Pixel2 p(r.raw_pixels[ind].x,r.raw_pixels[ind].y,r.raw_pixels[ind].inten); + r.contour.push_back(p); + } + pix_ind += n; + } + } else { + // Add pixels to non-overlapping regions + uint32_t oind = 0; + uint32_t offset = 0; + std::vector nonoverlap; + for (uint32_t current = rows[p]; current < rows[p+1]; current+=2) { + + // skip over invalid rle + if (rle[current+1].x < rle[current].x) { + continue; + } + + // The first pixel is always on the edge + nonoverlap.push_back(offset); + + uint32_t x_start = rle[current].x; + + while (oind < overlap.size()) { + + if (overlap[oind+1] < rle[current].x) { + oind += 2; + continue; + } + + if (overlap[oind] < rle[current+1].x) { + long end = std::max(rle[current].x + 1, (long) overlap[oind]); + long start = std::min(rle[current + 1].x, (long) overlap[oind + 1]+1); + nonoverlap.push_back(end - x_start + offset); + nonoverlap.push_back(start - x_start + offset); + } + + if (overlap[oind+1] < rle[current+1].x) { + oind += 2; + } else { + break; + } + } + + // The last pixel is always on the edge + // In the case of a single pixel gap, it might already be there + uint32_t last = rle[current+1].x - x_start + 1 + offset; + if (nonoverlap[nonoverlap.size()-2] == last) { + nonoverlap.pop_back(); + } else { + nonoverlap.push_back(last); + } + + offset += rle[current+1].x - rle[current].x + 1; + } + + // Add pixels to the contour + for (uint32_t ind = 0; ind < nonoverlap.size(); ind += 2) { + for (uint32_t pind = nonoverlap[ind]; pind < nonoverlap[ind+1]; ++pind) { + Pixel2 p(r.raw_pixels[pix_ind+pind].x, r.raw_pixels[pix_ind+pind].y, r.raw_pixels[pix_ind+pind].inten); + r.contour.push_back(p); + } + } + pix_ind += offset; + } + } + + // The last row must be entirely on the edge, so store it in the contour + pix_ind = r.raw_pixels.size(); + for (uint32_t i = rows[rows.size()-2]; i < rows[rows.size()-1]; i+=2) { + uint32_t n = rle[i+1].x - rle[i].x + 1; + pix_ind -= n; + for (uint32_t ind = pix_ind; ind < pix_ind + n; ++ind) { + Pixel2 p(r.raw_pixels[ind].x,r.raw_pixels[ind].y,r.raw_pixels[ind].inten); + r.contour.push_back(p); + } + } +} + void ContourFeature::buildRegularContour(LR& r) { + + // If RLE is present, use it + if (!r.rle_pixels.empty()) { + buildRegularContourFast(r); + return; + } + //==== Pad the image int width = r.aux_image_matrix.width, @@ -179,6 +343,11 @@ void ContourFeature::buildRegularContour(LR& r) } } +bool ContourFeature::compare_locations (const Pixel2& lhs, const Pixel2& rhs) +{ + return (lhs.y < rhs.y) || (lhs.y == rhs.y && lhs.x < rhs.x); +} + void ContourFeature::buildWholeSlideContour(LR& r) { // Push the 4 slide vertices of dummy intensity 999 @@ -265,8 +434,10 @@ namespace Nyxus if (r.roi_disabled) return; - //==== Calculate ROI's image matrix - r.aux_image_matrix.calculate_from_pixelcloud (r.raw_pixels, r.aabb); + //==== Calculate ROI's image matrix if no RLE is present + if (r.rle_pixels.empty()) { + r.aux_image_matrix.calculate_from_pixelcloud (r.raw_pixels, r.aabb); + } //==== Contour, ROI perimeter, equivalent circle diameter ContourFeature f; diff --git a/src/nyx/features/contour.h b/src/nyx/features/contour.h index 1e4a817c..2696981d 100644 --- a/src/nyx/features/contour.h +++ b/src/nyx/features/contour.h @@ -113,6 +113,8 @@ class ContourFeature: public FeatureMethod private: void buildRegularContour(LR& r); + void buildRegularContourFast(LR& r); + static bool compare_locations (const Pixel2& lhs, const Pixel2& rhs); void buildWholeSlideContour(LR& r); std::tuple calc_min_max_mean_stddev_intensity (const std::vector & contour_pixels); double diff --git a/src/nyx/features/convex_hull_nontriv.cpp b/src/nyx/features/convex_hull_nontriv.cpp index 27bdd6dd..17d098a7 100644 --- a/src/nyx/features/convex_hull_nontriv.cpp +++ b/src/nyx/features/convex_hull_nontriv.cpp @@ -24,7 +24,11 @@ void ConvexHullFeature::cleanup_instance() void ConvexHullFeature::calculate (LR& r) { // Build the convex hull - build_convex_hull(r.raw_pixels, r.convHull_CH); + if (r.rle_pixels.empty()) { + build_convex_hull(r.raw_pixels, r.convHull_CH); + } else { + build_convex_hull(r.rle_pixels, r.convHull_CH); + } // Calculate related features double s_hull = polygon_area(r.convHull_CH), diff --git a/src/nyx/features_calc_workflow.cpp b/src/nyx/features_calc_workflow.cpp index b165f34c..122890c4 100644 --- a/src/nyx/features_calc_workflow.cpp +++ b/src/nyx/features_calc_workflow.cpp @@ -134,6 +134,27 @@ namespace Nyxus r.intFname = intFile; } + void init_label_record_2_fast (LR& r, const std::string& segFile, const std::string& intFile, int x1, int x2, int y, int label, PixIntens maxInt, PixIntens minInt, unsigned int tile_index) + { + // Cache the host tile's index + r.host_tiles.insert (tile_index); + + // Initialize basic counters + r.aux_area = x2-x1; + r.aux_min = minInt; + r.aux_max = maxInt; + r.aabb.init_y (y); + r.aabb.init_x(x1); + r.aabb.update_xmax(x2-1); + + // Cache the ROI label + r.label = label; + + // File names + r.segFname = segFile; + r.intFname = intFile; + } + // This function 'digests' the 2nd and the following pixel of a label and updates the label's feature calculation state - the instance of structure 'LR' void update_label_record(LR& lr, int x, int y, int label, PixIntens intensity) { @@ -163,4 +184,18 @@ namespace Nyxus lr.update_aabb (x,y); } -} \ No newline at end of file + void update_label_record_2_fast (LR& lr, int x1, int x2, int y, int label, PixIntens maxInt, PixIntens minInt, unsigned int tile_index) + { + // Cache the host tile's index + lr.host_tiles.insert(tile_index); + + // Initialize basic counters + lr.aux_area += x2-x1; + lr.aux_min = std::min(lr.aux_min, minInt); + lr.aux_max = std::max(lr.aux_max, maxInt); + lr.aabb.update_xmin (x1); + lr.aabb.update_xmax (x2-1); + lr.aabb.update_y (y); + } + +} diff --git a/src/nyx/globals.h b/src/nyx/globals.h index a4dcbeaa..5c4aee7e 100644 --- a/src/nyx/globals.h +++ b/src/nyx/globals.h @@ -7,6 +7,7 @@ #include #include #include +#include "features/pixel.h" #include "featureset.h" #include "feature_method.h" #include "feature_mgr.h" @@ -21,9 +22,11 @@ namespace Nyxus bool scanFilePairParallel(const std::string& intens_fpath, const std::string& label_fpath, int num_fastloader_threads, int num_sensemaker_threads, int filepair_index, int tot_num_filepairs); std::string getPureFname(const std::string& fpath); + int processDataset(const std::vector& intensFiles, const std::vector& labelFiles, int numFastloaderThreads, int numSensemakerThreads, int numReduceThreads, int min_online_roi_size, bool save2csv, const std::string& csvOutputDir, bool use_fastloop); int processDataset(const std::vector& intensFiles, const std::vector& labelFiles, int numFastloaderThreads, int numSensemakerThreads, int numReduceThreads, int min_online_roi_size, bool save2csv, const std::string& csvOutputDir); bool gatherRoisMetrics(const std::string& intens_fpath, const std::string& label_fpath, int num_FL_threads); - bool processTrivialRois (const std::vector& trivRoiLabels, const std::string& intens_fpath, const std::string& label_fpath, int num_FL_threads, size_t memory_limit); + bool gatherRoisMetricsFast(const std::string& intens_fpath, const std::string& label_fpath, int num_FL_threads); + bool processTrivialRois (const std::vector& trivRoiLabels, const std::string& intens_fpath, const std::string& label_fpath, int num_FL_threads, size_t memory_limit, bool use_fastloop); bool processNontrivialRois (const std::vector& nontrivRoiLabels, const std::string& intens_fpath, const std::string& label_fpath, int num_FL_threads); void dump_roi_metrics(const std::string & label_fpath); @@ -49,8 +52,11 @@ namespace Nyxus void init_label_record(LR& lr, const std::string& segFile, const std::string& intFile, int x, int y, int label, PixIntens intensity); void init_label_record_2(LR& lr, const std::string& segFile, const std::string& intFile, int x, int y, int label, PixIntens intensity, unsigned int tile_index); + void init_label_record_2_fast (LR& r, const std::string& segFile, const std::string& intFile, int x1, int x2, int y, int label, PixIntens maxInt, PixIntens minInt, unsigned int tile_index); void update_label_record(LR& lr, int x, int y, int label, PixIntens intensity); + void update_label_record_fast(LR& lr, int x1, int x2, int y, int label, PixIntens maxInt, PixIntens minInt); void update_label_record_2(LR& lr, int x, int y, int label, PixIntens intensity, unsigned int tile_index); + void update_label_record_2_fast(LR& lr, int x1, int x2, int y, int label, PixIntens maxInt, PixIntens minInt, unsigned int tile_index); void reduce_neighbors(int labels_collision_radius); void allocateTrivialRoisBuffers(const std::vector& Pending); @@ -69,6 +75,7 @@ namespace Nyxus /// @param intensity -- pixel's intensity /// @param tile_index -- index of pixel's tile in the image void feed_pixel_2_metrics(int x, int y, PixIntens intensity, int label, unsigned int tile_index); + void feed_pixel_2_metrics_fast(int x1, int x2, int y, PixIntens maxInt, PixIntens minInt, int label, unsigned int tile_index); /// @brief Copies a pixel to the ROI's cache. /// @param x -- x-coordinate of the pixel in the image @@ -76,6 +83,8 @@ namespace Nyxus /// @param label -- label of pixel's segment /// @param intensity -- pixel's intensity void feed_pixel_2_cache(int x, int y, PixIntens intensity, int label); + + void feed_rle_2_cache(int x1, int x2, PixIntens i1, PixIntens i2, int y, int label); // System resources unsigned long long getAvailPhysMemory(); diff --git a/src/nyx/phase1_fastloop.cpp b/src/nyx/phase1_fastloop.cpp new file mode 100644 index 00000000..82e477d2 --- /dev/null +++ b/src/nyx/phase1_fastloop.cpp @@ -0,0 +1,140 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef WITH_PYTHON_H +#include +#endif +#include "environment.h" +#include "globals.h" +#include "helpers/timing.h" +#include "rle.hpp" + +namespace Nyxus +{ + bool gatherRoisMetricsFast (const std::string& intens_fpath, const std::string& label_fpath, int num_FL_threads) + { + + int lvl = 0, // Pyramid level + lyr = 0; // Layer + + // Read the tiff. The image loader is put in the open state in processDataset() + size_t nth = theImLoader.get_num_tiles_hor(), + ntv = theImLoader.get_num_tiles_vert(), + fw = theImLoader.get_tile_width(), + th = theImLoader.get_tile_height(), + tw = theImLoader.get_tile_width(), + tileSize = theImLoader.get_tile_size(), + fullwidth = theImLoader.get_full_width(), + fullheight = theImLoader.get_full_height(); + + int cnt = 1; + for (unsigned int row = 0; row < nth; row++) + for (unsigned int col = 0; col < ntv; col++) + { + // Fetch the tile + bool ok = theImLoader.load_tile(row, col); + if (!ok) + { + std::stringstream ss; + ss << "Error fetching tile row=" << row << " col=" << col; + #ifdef WITH_PYTHON_H + throw ss.str(); + #endif + std::cerr << ss.str() << "\n"; + return false; + } + + // Get ahold of tile's pixel buffer + auto tileIdx = row * nth + col; + auto dataI = theImLoader.get_int_tile_buffer(), + dataL = theImLoader.get_seg_tile_buffer(); + + // Iterate pixels + int y = 0; + int y_max = (fullheight < y+th) ? fullheight - row*th : th; + while (y < y_max) + { + + size_t i = y*tw; + + int x = 0; + int x_max = (fullwidth < x+tw) ? fullwidth - col*tw : tw; + + + int x_stream = 32 * (x_max / 32); + + // Compress row to an RLE stream + RLEStream stream = rle_encode_long_stream_32(dataL.data()+i,x_stream); + + // Loop over row objects in RLE stream + unsigned int ind = 0; + while (ind+1 < stream.offsets.size()) { + uint16_t x1 = stream.offsets[ind++]; + auto label = dataL[i+x1]; + + // If not background, store the object + if (label > 0) { + // Collapse all the labels to one if single-ROI mde is requested + if (theEnvironment.singleROI) + label = 1; + + uint16_t x2 = stream.offsets[ind]; + auto minInt = dataI[i+x1]; + auto maxInt = dataI[i+x1]; + + // Find the min and max intensity + for (int xi=i+x1; xi dataI[xi]) { + minInt = dataI[xi]; + } + + } + + feed_pixel_2_metrics_fast (x1, x2, y, maxInt, minInt, label, tileIdx); // Updates 'uniqueLabels' and 'roiData' + } + } + + i += x_stream; + x = x_stream; + while (x < x_max) { + + // Skip non-mask pixels + auto label = dataL[i]; + if (label == 0) { + ++x, ++i; + continue; + } + + // Collapse all the labels to one if single-ROI mde is requested + if (theEnvironment.singleROI) + label = 1; + + // Update pixel's ROI metrics + feed_pixel_2_metrics (x, y, dataI[i], label, tileIdx); // Updates 'uniqueLabels' and 'roiData' + ++x, ++i; + } + ++y; + } + +#ifdef WITH_PYTHON_H + if (PyErr_CheckSignals() != 0) + throw pybind11::error_already_set(); +#endif + + // Show stayalive progress info + if (cnt++ % 4 == 0) + std::cout << "\t" << int((row * nth + col) * 100 / float(nth * ntv) * 100) / 100. << "%\t" << uniqueLabels.size() << " ROIs" << "\n"; + } + + return true; + } + +} diff --git a/src/nyx/phase2.cpp b/src/nyx/phase2.cpp index 43559a88..0bdc025c 100644 --- a/src/nyx/phase2.cpp +++ b/src/nyx/phase2.cpp @@ -10,6 +10,7 @@ #include "environment.h" #include "globals.h" #include "helpers/timing.h" +#include "rle.hpp" namespace Nyxus { @@ -105,7 +106,6 @@ namespace Nyxus for (unsigned int row = 0; row < nth; row++) for (unsigned int col = 0; col < ntv; col++) { - // Fetch the tile bool ok = theImLoader.load_tile(row, col); if (!ok) { @@ -158,6 +158,124 @@ namespace Nyxus return true; } + bool scanTrivialRoisFast (const std::vector& batch_labels, const std::string& intens_fpath, const std::string& label_fpath, int num_FL_threads) + { + // Sort the batch's labels to enable binary searching in it + std::vector whiteList = batch_labels; + std::sort (whiteList.begin(), whiteList.end()); + + int lvl = 0, // Pyramid level + lyr = 0; // Layer + + // Read the tiffs + size_t nth = theImLoader.get_num_tiles_hor(), + ntv = theImLoader.get_num_tiles_vert(), + fw = theImLoader.get_tile_width(), + th = theImLoader.get_tile_height(), + tw = theImLoader.get_tile_width(), + tileSize = theImLoader.get_tile_size(), + fullwidth = theImLoader.get_full_width(), + fullheight = theImLoader.get_full_height(); + + int cnt = 1; + for (unsigned int row = 0; row < nth; row++) + for (unsigned int col = 0; col < ntv; col++) + { + // Fetch the tile + bool ok = theImLoader.load_tile(row, col); + if (!ok) + { + std::stringstream ss; + ss << "Error fetching tile row=" << row << " col=" << col; + #ifdef WITH_PYTHON_H + throw ss.str(); + #endif + std::cerr << ss.str() << "\n"; + return false; + } + + // Get ahold of tile's pixel buffer + auto dataI = theImLoader.get_int_tile_buffer(), + dataL = theImLoader.get_seg_tile_buffer(); + + // Iterate pixels + int y = 0; + int y_max = (fullheight < y+th) ? fullheight - row*th : th; + while (y < y_max) + { + + size_t i = y*tw; + + int x = 0; + int x_max = (fullwidth < x+tw) ? fullwidth - col*tw : tw; + + + int x_stream = 32 * (x_max / 32); + + RLEStream stream = rle_encode_long_stream_32(dataL.data()+i,x_stream); + + unsigned int ind = 0; + while (ind+1 < stream.offsets.size()) { + uint16_t x1 = stream.offsets[ind++]; + auto label = dataL[i+x1]; + + if (label > 0) { + uint16_t x2 = stream.offsets[ind]; + + if (! theEnvironment.singleROI && ! std::binary_search(whiteList.begin(), whiteList.end(), label)) //--slow-- if (std::find(PendingRoiLabels.begin(), PendingRoiLabels.end(), label) == PendingRoiLabels.end()) + { + continue; + } + + for (int xi=x1; xi 0) { + uint16_t x2 = stream.offsets[ind]; + + if (! theEnvironment.singleROI && ! std::binary_search(whiteList.begin(), whiteList.end(), label)) //--slow-- if (std::find(PendingRoiLabels.begin(), PendingRoiLabels.end(), label) == PendingRoiLabels.end()) + { + continue; + } + + for (int xi=x1; xi& trivRoiLabels, const std::string& intens_fpath, const std::string& label_fpath, int num_FL_threads, size_t memory_limit) + bool processTrivialRois (const std::vector& trivRoiLabels, const std::string& intens_fpath, const std::string& label_fpath, int num_FL_threads, size_t memory_limit, bool use_fastloop) { std::vector Pending; size_t batchDemand = 0; @@ -225,7 +343,11 @@ namespace Nyxus else std::cout << ">>> (ROIs " << Pending[0] << " ... " << Pending[Pending.size() - 1] << ")\n"; ) - scanTrivialRois(Pending, intens_fpath, label_fpath, num_FL_threads); + if (use_fastloop) { + scanTrivialRoisFast(Pending, intens_fpath, label_fpath, num_FL_threads); + } else { + scanTrivialRois(Pending, intens_fpath, label_fpath, num_FL_threads); + } // Allocate memory VERBOSLVL1(std::cout << "\tallocating ROI buffers\n";) @@ -273,7 +395,11 @@ namespace Nyxus else std::cout << ">>> (ROIs " << Pending[0] << " ... " << Pending[Pending.size() - 1] << ")\n"; ) - scanTrivialRois(Pending, intens_fpath, label_fpath, num_FL_threads); + if (use_fastloop) { + scanTrivialRoisFast(Pending, intens_fpath, label_fpath, num_FL_threads); + } else { + scanTrivialRois(Pending, intens_fpath, label_fpath, num_FL_threads); + } // Allocate memory VERBOSLVL1(std::cout << "\tallocating ROI buffers\n";) diff --git a/src/nyx/pixel_feed.cpp b/src/nyx/pixel_feed.cpp index b965dd7f..c95fa4af 100644 --- a/src/nyx/pixel_feed.cpp +++ b/src/nyx/pixel_feed.cpp @@ -1,3 +1,4 @@ +#include "features/pixel.h" #include #include #include @@ -37,6 +38,26 @@ namespace Nyxus } } + void feed_pixel_2_metrics_fast(int x1, int x2, int y, PixIntens maxInt, PixIntens minInt, int label, unsigned int tile_index) + { + if (uniqueLabels.find(label) == uniqueLabels.end()) + { + // Remember this label + uniqueLabels.insert(label); + + // Initialize the ROI label record + LR newData; + init_label_record_2_fast(newData, theSegFname, theIntFname, x1, x2, y, label, maxInt, minInt, tile_index); + roiData[label] = newData; + } + else + { + // Update basic ROI info (info that doesn't require costly calculations) + LR& existingData = roiData[label]; + update_label_record_2_fast(existingData, x1, x2, y, label, maxInt, minInt, tile_index); + } + } + /// @brief Copies a pixel to the ROI's cache. /// @param x -- x-coordinate of the pixel in the image /// @param y -- y-coordinate of the pixel in the image @@ -49,4 +70,12 @@ namespace Nyxus r.raw_pixels.push_back(Pixel2(x, y, intensity)); } + void feed_rle_2_cache(int x1, int x2, PixIntens i1, PixIntens i2, int y, int label) + { + // Update basic ROI info (info that doesn't require costly calculations) + LR& r = roiData[label]; + r.rle_pixels.push_back(Pixel2(x1, y, i1)); + r.rle_pixels.push_back(Pixel2(x2-1, y, i2)); + } + } diff --git a/src/nyx/python/new_bindings_py.cpp b/src/nyx/python/new_bindings_py.cpp index 0e365ff6..2284b373 100644 --- a/src/nyx/python/new_bindings_py.cpp +++ b/src/nyx/python/new_bindings_py.cpp @@ -70,10 +70,11 @@ void initialize_environment( #endif } -py::tuple featurize_directory_imp ( +py::tuple featurize_directory_imp_fast ( const std::string &intensity_dir, const std::string &labels_dir, - const std::string &file_pattern) + const std::string &file_pattern, + bool use_fastloop) { theEnvironment.intensity_dir = intensity_dir; theEnvironment.labels_dir = labels_dir; @@ -110,7 +111,8 @@ py::tuple featurize_directory_imp ( theEnvironment.n_reduce_threads, min_online_roi_size, false, // 'true' to save to csv - theEnvironment.output_dir); + theEnvironment.output_dir, + use_fastloop); if (errorCode) throw std::runtime_error("Error occurred during dataset processing."); @@ -125,7 +127,19 @@ py::tuple featurize_directory_imp ( return py::make_tuple(pyHeader, pyStrData, pyNumData); } -py::tuple featurize_fname_lists_imp (const py::list& int_fnames, const py::list & seg_fnames) +py::tuple featurize_directory_imp ( + const std::string &intensity_dir, + const std::string &labels_dir, + const std::string &file_pattern) +{ + return featurize_directory_imp_fast ( + intensity_dir, + labels_dir, + file_pattern, + false); +} + +py::tuple featurize_fname_lists_imp_fast (const py::list& int_fnames, const py::list & seg_fnames, bool use_fastloop) { std::vector intensFiles, labelFiles; for (auto it = int_fnames.begin(); it != int_fnames.end(); ++it) @@ -177,7 +191,8 @@ py::tuple featurize_fname_lists_imp (const py::list& int_fnames, const py::list theEnvironment.n_reduce_threads, min_online_roi_size, false, // 'true' to save to csv - theEnvironment.output_dir); + theEnvironment.output_dir, + use_fastloop); if (errorCode) throw std::runtime_error("Error occurred during dataset processing."); @@ -191,6 +206,12 @@ py::tuple featurize_fname_lists_imp (const py::list& int_fnames, const py::list return py::make_tuple(pyHeader, pyStrData, pyNumData); } + +py::tuple featurize_fname_lists_imp (const py::list& int_fnames, const py::list & seg_fnames) +{ + return featurize_fname_lists_imp_fast (int_fnames, seg_fnames, false); +} + py::tuple findrelations_imp( std::string& label_dir, std::string& parent_file_pattern, @@ -255,6 +276,8 @@ PYBIND11_MODULE(backend, m) m.def("initialize_environment", &initialize_environment, "Environment initialization"); m.def("featurize_directory_imp", &featurize_directory_imp, "Calculate features of images defined by intensity and mask image collection directories"); m.def("featurize_fname_lists_imp", &featurize_fname_lists_imp, "Calculate features of intensity-mask image pairs defined by lists of image file names"); + m.def("featurize_directory_imp_fast", &featurize_directory_imp_fast, "Calculate features of images defined by intensity and mask image collection directories"); + m.def("featurize_fname_lists_imp_fast", &featurize_fname_lists_imp_fast, "Calculate features of intensity-mask image pairs defined by lists of image file names"); m.def("findrelations_imp", &findrelations_imp, "Find relations in segmentation images"); m.def("gpu_available", &Environment::gpu_is_available, "Check if CUDA gpu is available"); m.def("use_gpu", &use_gpu, "Enable/disable GPU features"); diff --git a/src/nyx/python/nyxus/nyxus.py b/src/nyx/python/nyxus/nyxus.py index f913154a..c392ed34 100644 --- a/src/nyx/python/nyxus/nyxus.py +++ b/src/nyx/python/nyxus/nyxus.py @@ -1,4 +1,4 @@ -from .backend import initialize_environment, featurize_directory_imp, featurize_fname_lists_imp, findrelations_imp, use_gpu, gpu_available +from .backend import initialize_environment, featurize_directory_imp, featurize_directory_imp_fast, featurize_fname_lists_imp, findrelations_imp, use_gpu, gpu_available import os import numpy as np import pandas as pd @@ -96,6 +96,7 @@ def featurize_directory( intensity_dir: str, label_dir: Optional[str] = None, file_pattern: Optional[str] = ".*", + use_fastloop: bool = False ): """Extract features from all the images satisfying the file pattern of provided image directories. @@ -133,7 +134,7 @@ def featurize_directory( if label_dir is None: label_dir = intensity_dir - header, string_data, numeric_data = featurize_directory_imp (intensity_dir, label_dir, file_pattern) + header, string_data, numeric_data = featurize_directory_imp_fast (intensity_dir, label_dir, file_pattern, use_fastloop) df = pd.concat( [ diff --git a/src/nyx/rle.cpp b/src/nyx/rle.cpp new file mode 100644 index 00000000..60440dee --- /dev/null +++ b/src/nyx/rle.cpp @@ -0,0 +1,291 @@ +#include "rle.hpp" +#include +#include + +// Vector to shuffle 4-bit data into the lower 64 bits +const __m128i pshufbcnst_128 = + _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x0E, 0x0C, + 0x0A, 0x08, 0x06, 0x04, 0x02, 0x00); + +// zero vector +const __m128i zeroes_128 = _mm_setzero_si128(); +const __m256i zeroes_256 = _mm256_setzero_si256(); + +// Shuffle 4-bit vectors +const __m256i shf = _mm256_set_epi8( + 0x80, 0x0F, 0x80, 0x07, 0x80, 0x0E, 0x80, 0x06, 0x80, 0x0D, 0x80, 0x05, + 0x80, 0x0C, 0x80, 0x04, 0x80, 0x0B, 0x80, 0x03, 0x80, 0x0A, 0x80, 0x02, + 0x80, 0x09, 0x80, 0x01, 0x80, 0x08, 0x80, 0x00); + +// Bit shifts to decode 4-bit packing +const __m256i shft = _mm256_set_epi64x(0x04, 0x00, 0x04, 0x00); + +// For casting to 4-bit mask +const __m256i vmsk = _mm256_set1_epi8(0x0F); + +// For casting 4-bit boolean to position index +const uint64_t cast_index = 0xFEDCBA9876543210; + +/** + * @brief Get indices of non-zero byte values in a 128 bit vector. + * + * Assuming data in a 128 bit vector is composed of byte data, this function + * returns indices of non-zero values as packed uint16 values stored in a 256 + * bit vector. + * + * @param data + * @return PackedIndex + */ +PackedIndex<__m256i> get_128_indices_word(__m128i data) { + + uint64_t ind; // Holds 4-bit indexes + + PackedIndex<__m256i> output{}; + + // Determine how many edges were found + data = _mm_srli_epi64(data, 4); // Pack 16x8 bit to 32x4 bit mask. + data = _mm_shuffle_epi8(data, pshufbcnst_128); // Align the 16x4 bit mask. + ind = ~_mm_cvtsi128_si64(data); // Extract the 16x4 bit mask. + output.count = static_cast(_mm_popcnt_u64(ind)) >> 2u; + ind = _pext_u64(cast_index, ind); // Get 1-15 index offset of edges + + // Unpack the 4 bit values + output.indices = _mm256_set1_epi64x(ind); + output.indices = _mm256_srlv_epi64(output.indices, shft); + output.indices = _mm256_and_si256(output.indices, vmsk); + output.indices = _mm256_shuffle_epi8(output.indices, shf); + + return output; +} + +PackedIndex<__m256i> get_256_indices_word(__m256i data) { + + uint64_t ind; // Holds 4-bit indexes + + PackedIndex<__m256i> output{}; + + // Determine how many edges were found + ind = static_cast(~_mm256_movemask_epi8(data)); + output.count = static_cast(_mm_popcnt_u64(ind)) >> 2u; + ind = _pext_u64(cast_index, ind); // Get 1-15 index offset of edges + + // Unpack the 4 bit values + output.indices = _mm256_set1_epi64x(ind); + output.indices = _mm256_srlv_epi64(output.indices, shft); + output.indices = _mm256_and_si256(output.indices, vmsk); + output.indices = _mm256_shuffle_epi8(output.indices, shf); + + return output; +} + +void get_long_edges_256(const uint32_t *data, __m256i *mask, int i, + uint32_t last_val) { + + __m256i offset[4]; + + int index = i * 8; + int i1 = i + 1; + int i2 = i + 2; + int i3 = i + 3; + + // Unrolled data loading + mask[i] = _mm256_loadu_si256((__m256i *)&data[index]); + mask[i1] = _mm256_loadu_si256((__m256i *)&data[index + 8]); + mask[i2] = _mm256_loadu_si256((__m256i *)&data[index + 16]); + mask[i3] = _mm256_loadu_si256((__m256i *)&data[index + 24]); + + // Create shifted values + offset[0] = _mm256_srli_si256(mask[i], 4); + offset[1] = _mm256_srli_si256(mask[i1], 4); + offset[2] = _mm256_srli_si256(mask[i2], 4); + offset[3] = _mm256_srli_si256(mask[i3], 4); + offset[0] = _mm256_insert_epi32(offset[0], _mm256_extract_epi32(mask[i1], 0), 7); + offset[1] = _mm256_insert_epi32(offset[1], _mm256_extract_epi32(mask[i2], 0), 7); + offset[2] = _mm256_insert_epi32(offset[2], _mm256_extract_epi32(mask[i3], 0), 7); + offset[3] = _mm256_insert_epi32(offset[3], last_val, 7); + offset[0] = _mm256_insert_epi32(offset[0], _mm256_extract_epi32(mask[i], 4), 3); + offset[1] = _mm256_insert_epi32(offset[1], _mm256_extract_epi32(mask[i1], 4), 3); + offset[2] = _mm256_insert_epi32(offset[2], _mm256_extract_epi32(mask[i2], 4), 3); + offset[3] = _mm256_insert_epi32(offset[3], _mm256_extract_epi32(mask[i3], 4), 3); + + // Unrolled edge detection using xor + mask[i] = _mm256_xor_si256(mask[i], offset[0]); + mask[i1] = _mm256_xor_si256(mask[i1], offset[1]); + mask[i2] = _mm256_xor_si256(mask[i2], offset[2]); + mask[i3] = _mm256_xor_si256(mask[i3], offset[3]); + + // Generate 16x8 bit mask. + mask[i] = _mm256_cmpeq_epi32(mask[i], zeroes_256); + mask[i1] = _mm256_cmpeq_epi32(mask[i1], zeroes_256); + mask[i2] = _mm256_cmpeq_epi32(mask[i2], zeroes_256); + mask[i3] = _mm256_cmpeq_epi32(mask[i3], zeroes_256); + +} + +void get_byte_edges_128(const uint8_t *data, __m128i *mask, int i, + uint8_t last_val) { + + __m128i offset[4]; + + int index = i * 16; + int i1 = i + 1; + int i2 = i + 2; + int i3 = i + 3; + + // Unrolled data loading + mask[i] = _mm_load_si128((__m128i *)&data[index]); + mask[i1] = _mm_load_si128((__m128i *)&data[index + 16]); + mask[i2] = _mm_load_si128((__m128i *)&data[index + 32]); + mask[i3] = _mm_load_si128((__m128i *)&data[index + 48]); + + // Create shifted values + offset[0] = _mm_bsrli_si128(mask[i], 1); + offset[1] = _mm_bsrli_si128(mask[i1], 1); + offset[2] = _mm_bsrli_si128(mask[i2], 1); + offset[3] = _mm_bsrli_si128(mask[i3], 1); + offset[0] = _mm_insert_epi8(offset[0], _mm_extract_epi8(mask[i1], 0), 15); + offset[1] = _mm_insert_epi8(offset[1], _mm_extract_epi8(mask[i2], 0), 15); + offset[2] = _mm_insert_epi8(offset[2], _mm_extract_epi8(mask[i3], 0), 15); + offset[3] = _mm_insert_epi8(offset[3], last_val, 15); + + // Unrolled edge detection using xor + mask[i] = _mm_xor_si128(mask[i], offset[0]); + mask[i1] = _mm_xor_si128(mask[i1], offset[1]); + mask[i2] = _mm_xor_si128(mask[i2], offset[2]); + mask[i3] = _mm_xor_si128(mask[i3], offset[3]); + + // Generate 16x8 bit mask. + mask[i] = _mm_cmpeq_epi8(mask[i], zeroes_128); + mask[i1] = _mm_cmpeq_epi8(mask[i1], zeroes_128); + mask[i2] = _mm_cmpeq_epi8(mask[i2], zeroes_128); + mask[i3] = _mm_cmpeq_epi8(mask[i3], zeroes_128); +} + +RLEStream rle_encode_long_stream_32(const uint32_t *data, + const uint16_t len) { + if ((len % 32) != 0) { + throw std::invalid_argument( + "rle.rle_encode_long_stream_32: len must be a multiple of 32. " + "Arbitrary lengths should use rle_encode_stream."); + } + + RLEStream stream; // Initialize the output + stream.offsets.resize(len + 1); // Allocate memory + uint16_t stream_ind = 0; // Current index of stream.data + __m256i edges; // Edge data + + stream.start = data; + + // Load the data and get offsets + int num_chunks = len / 8; + __m256i mask[num_chunks]; + + // Loop over data + for (int i = 0; i < num_chunks - 4; i += 4) { + get_long_edges_256(data, mask, i, data[(i + 4) * 8]); + } + + // Get edges of the last 16 values + get_long_edges_256(data, mask, (num_chunks - 4), data[len - 1]); + + if (data[0] != 0u) { + stream.offsets[stream_ind++] = 0; + } + + for (int i = 0; i < num_chunks; ++i) { + // Skip if no edges present + if (_mm256_movemask_epi8(mask[i]) == 0xFFFFFFFF) { + continue; + } + + // Get the indices of edges + PackedIndex<__m256i> edges = get_256_indices_word(mask[i]); + edges.indices = + _mm256_add_epi16(_mm256_set1_epi16(i * 8 + 1), edges.indices); + + // store the results + _mm256_storeu_si256( + reinterpret_cast<__m256i *>(&stream.offsets[stream_ind]), + edges.indices); + stream_ind += edges.count; + } + + // Check the last value and add the index if present + if (data[len - 1] != 0u) { + stream.offsets[stream_ind++] = len; + } + + stream.offsets.resize(stream_ind); + + return stream; +}; + +RLEStream rle_encode_byte_stream_64(const uint8_t *data, + const uint16_t len) { + if ((len % 64) != 0) { + throw std::invalid_argument( + "rle.rle_encode_byte_64: len must be a multiple of 64. " + "Arbitrary lengths should use rle_encode_stream."); + } + + RLEStream stream; // Initialize the output + stream.offsets.resize(len + 1); // Allocate memory + uint16_t stream_ind = 0; // Current index of stream.data + __m256i edges; // Edge data + + stream.start = data; + + // Load the data and get offsets + int num_chunks = len / 16; + __m128i mask[num_chunks]; + // Loop over data + for (int i = 0; i < num_chunks - 4; i += 4) { + get_byte_edges_128(data, mask, i, data[(i + 4) * 16]); + } + // Get edges of the late 64 bytes + get_byte_edges_128(data, mask, (num_chunks - 4), data[len - 1]); + + if (data[0] != 0u) { + stream.offsets[stream_ind++] = 0; + } + + for (int i = 0; i < num_chunks; ++i) { + // Skip if no edges present + if (_mm_movemask_epi8(mask[i]) == 0xFFFFFFFFFFFFFFFF) { + continue; + } + + // Get the indices of edges + PackedIndex<__m256i> edges = get_128_indices_word(mask[i]); + edges.indices = + _mm256_add_epi16(_mm256_set1_epi16(i * 16 + 1), edges.indices); + + // store the results + _mm256_storeu_si256( + reinterpret_cast<__m256i *>(&stream.offsets[stream_ind]), + edges.indices); + stream_ind += edges.count; + } + + // Check the last value and add the index if present + if (data[len - 1] != 0u) { + stream.offsets[stream_ind++] = len; + } + + stream.offsets.resize(stream_ind); + + return stream; +}; + +std::vector> +rle_encode_byte_stream_64_tile(const uint8_t *data, const uint8_t *end, + const uint16_t stride, + const uint16_t chunk_size) { + std::vector> tile; + + for (; data < end; data += stride) { + tile.push_back(rle_encode_byte_stream_64(data, chunk_size)); + } + + return tile; +} diff --git a/src/nyx/rle.hpp b/src/nyx/rle.hpp new file mode 100644 index 00000000..7eb98951 --- /dev/null +++ b/src/nyx/rle.hpp @@ -0,0 +1,147 @@ +#ifndef _FTL_RLE_HPP +#define _FTL_RLE_HPP +/** + * @file rle.hpp + * @author Nick Schaub (nicholas.j.schaub@gmail.com) + * @brief Optimized methods for performing run length encoding + * @version 0.1 + * @date 2022-04-02 + * + * @copyright Copyright (c) 2022 + * + */ + +#include +#include +#include +#include +#include +#include + +/** + * @brief Offsets from a a starting point where data changes value. + * + * Run Length Encoding (RLE) compresses data by finding consecutive, + * repeated values. This is usually represented by a value followed by the + * number of times the value should be repeated. This structure only records the + * offsets to a change in value, and not the actual values themselves. + * + * A good explanation of RLE is in the packbits compression algorithm + * Wikipedia page. + * + * NOTE: The quirk to this data representation is that the first offset may not + * be 0, and in that case the values from start to the first offset are 0. + * + * https://en.wikipedia.org/wiki/PackBits + * + * @tparam T Data type of data to be compressed + * @tparam U Data type of offsets to change in value + */ +template struct RLEStream { + const T *start; + std::vector offsets; +}; + +/** + * @brief Run Length Encoding representation + * + * This structure contains all information needed for run length encoding. It + * inherits from RLEStream, but unlike RLEStream it stores values for each + * offset. Refer to RLEStream for additional details. + * + * @tparam T Data type of data to be compressed + * @tparam U Data type of offsets to change in value + */ +template struct RLE : public RLEStream { + std::vector values; +}; + +// Structure to hold packed index +template struct PackedIndex { + static_assert(std::is_same::value or +#ifdef __AVX512BW__ + std::is_same::value or +#endif + std::is_same::value, + "!"); + T indices; + uint8_t count; +}; + +RLEStream rle_encode_byte_stream_64(const uint8_t *data, + uint16_t len); + +RLEStream rle_encode_long_stream_32(const uint32_t *data, + uint16_t len); + +template +RLEStream rle_encode_byte_stream_64(const T *data, uint16_t len) { + static_assert(std::is_same::value or + std::is_same::value or + std::is_same::value, + "!"); + + RLEStream rle = + rle_encode_byte_stream_64((uint8_t *)data, len); + + RLEStream output = {(std::vector)rle.offsets, rle.start}; + + return output; +} + +/** + * @brief Universal RLE compressor, get offsets to changes in value. + * + * This function is a universal handler for calculating offsets to changes in + * value. Handles all data stream types and streams of any size. + * + * @tparam U Data type of input data + * @tparam T Data type of offsets + * @param data Pointer to data for compression + * @param len Length of stream to compress + * @return RLEByteStream + */ +template +RLEStream rle_encode_stream(const U *data, T len) { + RLEStream stream; + stream.start = data; + U on_object = 0; + const U *end = data + len; + const U *start = data; + for (; data < end; data++) { + if (*data) { + if (*data != on_object) { + stream.offsets.push_back((T)(data - start)); + on_object = *data; + } + } else if (on_object) { + stream.offsets.push_back((T)(data - start)); + on_object = 0; + } + } + + if (on_object) { + stream.offsets.push_back((T)(end - start)); + } + + return stream; +}; + +template +std::vector> rle_encode_stream_tile(const U *data, const U *end, + const T stride, + const T chunk_size) { + std::vector> tile; + + for (; data < end; data += stride) { + tile.push_back(rle_encode_stream(data, chunk_size)); + } + + return tile; +} + +std::vector> +rle_encode_byte_stream_64_tile(const uint8_t *data, const uint8_t *end, + uint16_t stride, uint16_t chunk_size); + +#endif // _FTL_RLE_HPP diff --git a/src/nyx/roi_cache.h b/src/nyx/roi_cache.h index 1b5fa5a1..55f80291 100644 --- a/src/nyx/roi_cache.h +++ b/src/nyx/roi_cache.h @@ -42,6 +42,7 @@ class LR: public BasicLR bool roi_disabled = false; + std::vector rle_pixels; std::vector raw_pixels; OutOfRamPixelCloud osized_pixel_cloud; unsigned int aux_area = 0; diff --git a/src/nyx/scan_fastloader_way.cpp b/src/nyx/scan_fastloader_way.cpp index b6ed7a0a..5d5f29af 100644 --- a/src/nyx/scan_fastloader_way.cpp +++ b/src/nyx/scan_fastloader_way.cpp @@ -30,7 +30,7 @@ namespace Nyxus { - bool processIntSegImagePair (const std::string& intens_fpath, const std::string& label_fpath, int num_FL_threads, int filepair_index, int tot_num_filepairs) + bool processIntSegImagePair (const std::string& intens_fpath, const std::string& label_fpath, int num_FL_threads, int filepair_index, int tot_num_filepairs, bool use_fast) { std::vector trivRoiLabels, nontrivRoiLabels; @@ -52,14 +52,17 @@ namespace Nyxus int digits = 2, k = std::pow(10.f, digits); float perCent = float(filepair_index * 100 * k / tot_num_filepairs) / float(k); VERBOSLVL1(std::cout << "[ " << std::setw(digits + 2) << perCent << "% ]\t" << " INT: " << intens_fpath << " SEG: " << label_fpath << "\n";) - } { STOPWATCH("Image scan2a/ImgScan2a/Scan2a/lightsteelblue", "\t="); // Phase 1: gather ROI metrics VERBOSLVL1(std::cout << "Gathering ROI metrics\n";) - gatherRoisMetrics(intens_fpath, label_fpath, num_FL_threads); // Output - set of ROI labels, label-ROI cache mappings + if (use_fast) { + gatherRoisMetricsFast(intens_fpath, label_fpath, num_FL_threads); // Output - set of ROI labels, label-ROI cache mappings + } else { + gatherRoisMetrics(intens_fpath, label_fpath, num_FL_threads); // Output - set of ROI labels, label-ROI cache mappings + } } @@ -101,7 +104,7 @@ namespace Nyxus if (trivRoiLabels.size()) { VERBOSLVL1(std::cout << "Processing trivial ROIs\n";) - processTrivialRois (trivRoiLabels, intens_fpath, label_fpath, num_FL_threads, theEnvironment.get_ram_limit()); + processTrivialRois (trivRoiLabels, intens_fpath, label_fpath, num_FL_threads, theEnvironment.get_ram_limit(), use_fast); } // Phase 3: process nontrivial (oversized) ROIs, if any @@ -122,7 +125,8 @@ namespace Nyxus int numReduceThreads, int min_online_roi_size, bool save2csv, - const std::string& csvOutputDir) + const std::string& csvOutputDir, + bool use_fastloop) { bool ok = true; @@ -148,7 +152,7 @@ namespace Nyxus return 1; } - ok = processIntSegImagePair (ifp, lfp, numFastloaderThreads, i, nf); // Phased processing + ok = processIntSegImagePair (ifp, lfp, numFastloaderThreads, i, nf, use_fastloop); // Phased processing if (ok == false) { @@ -185,6 +189,30 @@ namespace Nyxus return 0; // success } + int processDataset( + const std::vector& intensFiles, + const std::vector& labelFiles, + int numFastloaderThreads, + int numSensemakerThreads, + int numReduceThreads, + int min_online_roi_size, + bool save2csv, + const std::string& csvOutputDir) + { + return processDataset( + intensFiles, + labelFiles, + numFastloaderThreads, + numSensemakerThreads, + numReduceThreads, + min_online_roi_size, + save2csv, + csvOutputDir, + false + ); + + } + void dump_roi_metrics(const std::string & label_fpath) { fs::path pseg (label_fpath); diff --git a/src/nyx/sort.c b/src/nyx/sort.c new file mode 100644 index 00000000..111e1021 --- /dev/null +++ b/src/nyx/sort.c @@ -0,0 +1,486 @@ +/* + * Sorting algorithms for compression + */ + +#include "sort.h" + +/* + * Sorting networks + * + * Below is a series of bitonic sorting network components. Sorting networks + * for 2, 4, and 8 values receive all receive a 256-bit vector of 2, 4, or 8 + * uint32 values (respectively). + * + * NOTE: These sort data into descending order. + * + * TODO: Optimize the layered sorting to remove extraneous shuffle operations. + * For example, when sorting an array of 4 values merge_sort_2 is called + * which currently swaps the values into place if they are out of order. + * This could be optimized to swap values into the proper order for the + * next set of comparisons, thus removing one swap operation for each + * level of sorting. + * + * Ideas in these sorting implementations were taken from the following sources: + * https://ieeexplore.ieee.org/document/8638394 + * https://www.vldb.org/pvldb/vol1/1454171.pdf + */ + +// Swap adjacent integers into order +const uint8_t merge_swap_2 = 0xB1; +const uint8_t merge_shuf_2 = 0xF5; +void merge_sort_2(__m256i *values, __m256i *indices) { + + // Switch adjacent values + __m256i values_swapped = _mm256_shuffle_epi32(*values,merge_swap_2); + + // Swap adjacent indices + __m256i indices_swapped = _mm256_shuffle_epi32(*indices,merge_swap_2); + + // Compare values + __m256i cmp = _mm256_cmpgt_epi32(*values,values_swapped); + + // Get blend indices + cmp = _mm256_shuffle_epi32(cmp,merge_shuf_2); + + // Swap values + *values = _mm256_blendv_epi8(*values,values_swapped,cmp); + *indices = _mm256_blendv_epi8(*indices,indices_swapped,cmp); + +} + +// Sort integers in upper and lower lanes +const uint8_t merge_swap_4 = 0x1B; +const uint8_t merge_shuf_4 = 0xEB; +void merge_sort_4(__m256i *values, __m256i *indices) { + + // Sort adjacent integers + merge_sort_2(values,indices); + + // Swap inner and outer values for comparison + __m256i values_swapped = _mm256_shuffle_epi32(*values,merge_swap_4); + + // Swap inner and outer indices + __m256i indices_swapped = _mm256_shuffle_epi32(*indices,merge_swap_4); + + // Compare values + __m256i cmp = _mm256_cmpgt_epi32(*values,values_swapped); + + // Get blend indices + cmp = _mm256_shuffle_epi32(cmp,merge_shuf_4); + + // Swap values + *values = _mm256_blendv_epi8(*values,values_swapped,cmp); + *indices = _mm256_blendv_epi8(*indices,indices_swapped,cmp); + + // Finally, sort adjacent integers + merge_sort_2(values,indices); + +} + +// Sort integers in upper and lower lanes using interleaved comparisons +// Used when more than 4 values are compared and merge_sort_4 has been called +const uint8_t merge_swap_inter_4 = 0x4E; +const uint8_t merge_shuf_inter_4 = 0xEE; +void merge_sort_inter_4(__m256i *values, __m256i *indices) { + + // Swap inner and outer values for comparison + __m256i values_swapped = _mm256_shuffle_epi32(*values,merge_swap_inter_4); + + // Swap inner and outer indices + __m256i indices_swapped = _mm256_shuffle_epi32(*indices,merge_swap_inter_4); + + // Compare values + __m256i cmp = _mm256_cmpgt_epi32(*values,values_swapped); + + // Get blend indices + cmp = _mm256_shuffle_epi32(cmp,merge_shuf_inter_4); + + // Swap values + *values = _mm256_blendv_epi8(*values,values_swapped,cmp); + *indices = _mm256_blendv_epi8(*indices,indices_swapped,cmp); + + // Finally, sort adjacent integers + merge_sort_2(values,indices); + +} + +// Sort integers across 128 bit lanes +void merge_sort_8(__m256i *values, __m256i *indices) { + const __m256i merge_swap_8 = _mm256_set_epi32(0,1,2,3,4,5,6,7); + const __m256i merge_shuf_8 = _mm256_set_epi32(7,6,5,4,4,5,6,7); + + // Sort integers within each lane + merge_sort_4(values,indices); + + // Swap inner and outer values for comparison + __m256i values_swapped = _mm256_permutevar8x32_epi32(*values,merge_swap_8); + + // Swap inner and outer indices + __m256i indices_swapped = _mm256_permutevar8x32_epi32(*indices,merge_swap_8); + + // Compare values + __m256i cmp = _mm256_cmpgt_epi32(*values,values_swapped); + + // Get blend indices + cmp = _mm256_permutevar8x32_epi32(cmp,merge_shuf_8); + + // Swap values + *values = _mm256_blendv_epi8(*values,values_swapped,cmp); + *indices = _mm256_blendv_epi8(*indices,indices_swapped,cmp); + + // Values are sorted into high and low values, + merge_sort_inter_4(values,indices); + +} + +// Sort integers across 128 bit lanes using interleaved comparisons +// Used when more than 8 values are compared +void merge_sort_inter_8(__m256i *values, __m256i *indices) { + const __m256i merge_swap_8 = _mm256_set_epi32(3,2,1,0,7,6,5,4); + const __m256i merge_shuf_8 = _mm256_set_epi32(7,6,5,4,7,6,5,4); + + // Swap inner and outer values for comparison + __m256i values_swapped = _mm256_permutevar8x32_epi32(*values,merge_swap_8); + + // Swap inner and outer indices + __m256i indices_swapped = _mm256_permutevar8x32_epi32(*indices,merge_swap_8); + + // Compare values + __m256i cmp = _mm256_cmpgt_epi32(*values,values_swapped); + + // Get blend indices + cmp = _mm256_permutevar8x32_epi32(cmp,merge_shuf_8); + + // Swap values + *values = _mm256_blendv_epi8(*values,values_swapped,cmp); + *indices = _mm256_blendv_epi8(*indices,indices_swapped,cmp); + + // Values are sorted into high and low values, + merge_sort_inter_4(values,indices); + +} + +// Sort integers across vectors +// values and indices must be an array of __m256i vectors, and the first two +// are sorted +void merge_sort_16(__m256i *values, __m256i *indices) { + const __m256i merge_swap_16 = _mm256_set_epi32(0,1,2,3,4,5,6,7); + + // Sort integers within each vector + merge_sort_8(&values[0],&indices[0]); + merge_sort_8(&values[1],&indices[1]); + + // Swap inner and outer values for comparison + __m256i values_swapped[2]; + __m256i indices_swapped[2]; + values_swapped[0] = _mm256_permutevar8x32_epi32(values[0],merge_swap_16); + indices_swapped[1] = _mm256_permutevar8x32_epi32(indices[1],merge_swap_16); + values_swapped[1] = _mm256_permutevar8x32_epi32(values[1],merge_swap_16); + indices_swapped[0] = _mm256_permutevar8x32_epi32(indices[0],merge_swap_16); + + // Compare values + __m256i cmp = _mm256_cmpgt_epi32(values_swapped[1],values[0]); + + // Blend the first vector + values[0] = _mm256_blendv_epi8(values[0],values_swapped[1],cmp); + __m256i cmp_reversed = _mm256_permutevar8x32_epi32(cmp,merge_swap_16); + indices[0] = _mm256_blendv_epi8(indices[0],indices_swapped[1],cmp); + + // Blend the second vector + values[1] = _mm256_blendv_epi8(values[1],values_swapped[0],cmp_reversed); + indices[1] = _mm256_blendv_epi8(indices[1],indices_swapped[0],cmp_reversed); + + // Values are sorted into high and low values, + merge_sort_inter_8(&values[0],&indices[0]); + merge_sort_inter_8(&values[1],&indices[1]); + +} + +// Sort integers across vectors using interleaved comparisons +void merge_sort_inter_16(__m256i *values, __m256i *indices) { + + // Swap inner and outer values for comparison + __m256i values_swapped[2]; + __m256i indices_swapped[2]; + values_swapped[0] = values[0]; + indices_swapped[1] = indices[1]; + values_swapped[1] = values[1]; + indices_swapped[0] = indices[0]; + + // Compare values + __m256i cmp = _mm256_cmpgt_epi32(values_swapped[1],values[0]); + + // Blend the first vector + values[0] = _mm256_blendv_epi8(values[0],values_swapped[1],cmp); + __m256i cmp_reversed = ~cmp; + indices[0] = _mm256_blendv_epi8(indices[0],indices_swapped[1],cmp); + + // Blend the second vector + values[1] = _mm256_blendv_epi8(values[1],values_swapped[0],cmp_reversed); + indices[1] = _mm256_blendv_epi8(indices[1],indices_swapped[0],cmp_reversed); + + // Values are sorted into high and low values, + merge_sort_inter_8(&values[0],&indices[0]); + merge_sort_inter_8(&values[1],&indices[1]); + +} + +// Sort integers across 4 vectors +// values and indices must be an array of __m256i vectors, and the first four +// are sorted +void merge_sort_16(__m256i *values, __m256i *indices) { + const __m256i merge_swap_16 = _mm256_set_epi32(0,1,2,3,4,5,6,7); + + // Sort integers within each vector + merge_sort_16(values,indices); + merge_sort_16(values+2,indices+2); + + // Swap inner and outer values for comparison + __m256i values_swapped[4]; + __m256i indices_swapped[4]; + values_swapped[0] = _mm256_permutevar8x32_epi32(values[0],merge_swap_16); + indices_swapped[1] = _mm256_permutevar8x32_epi32(indices[1],merge_swap_16); + values_swapped[1] = _mm256_permutevar8x32_epi32(values[1],merge_swap_16); + indices_swapped[0] = _mm256_permutevar8x32_epi32(indices[0],merge_swap_16); + + // Compare values + __m256i cmp[2]; + __m256i cmp[0] = _mm256_cmpgt_epi32(values_swapped[1],values[0]); + + // Blend the first vector + values[0] = _mm256_blendv_epi8(values[0],values_swapped[1],cmp); + __m256i cmp_reversed = _mm256_permutevar8x32_epi32(cmp,merge_swap_16); + indices[0] = _mm256_blendv_epi8(indices[0],indices_swapped[1],cmp); + + // Blend the second vector + values[1] = _mm256_blendv_epi8(values[1],values_swapped[0],cmp_reversed); + indices[1] = _mm256_blendv_epi8(indices[1],indices_swapped[0],cmp_reversed); + + // Values are sorted into high and low values, + merge_sort_inter_8(&values[0],&indices[0]); + merge_sort_inter_8(&values[1],&indices[1]); + +} + +/* + * First pass binning algorithm for matching uint8 values + * + * This function determines the number of values in each bin of a histogram and + * determines the offsets where values belong as a preliminary step in a count + * sort algorithm. Each bin in the histogram represents a pair of bytes, so a + * bin value of 10 in bin 0x0000, then there are exactly 10 locations in the + * byte array where there is repeated null byte (with possible overlap). + * + * There are two steps to this algorithm + * 1. Count the number of times each pair of bytes occurs. + * 2. Calculate the offsets for where an index of a byte pair should be placed + * using count sort, such that the pairs of bytes that occur most frequently + * occur first. + * + * Modified from https://github.com/powturbo/Turbo-Histogram/blob/master/turbohist_.c + */ +static void bin_turbo_init(uint8_t *bytes, + uint32_t size, + uint32_t *counts, + uint32_t *parts, + uint32_t *part_count) { + + /* + * Step 1: Count the number of occurences of each pair of bytes + */ + + // Two arrays to hold byte pair counts. Two arrays helps prevent collisions. + uint32_t c[2][65536]={0}; + + // Pointer to position in byte array + uint8_t *ip; + + // Variable for preloading next set of bytes + __m128i cpv = _mm_loadu_si128((__m128i*)bytes); + + // TODO: Remove the vector reversal as an optimization + // Vector used to reverse the vector, useful when debugging. + __m128i reverse = _mm_set_epi8( 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9,10,11,12,13,14,15); + + // Loop through the data 32 bytes at a time. + // Intrinsics are used to load the data to help reduce cache misses + for( ip = bytes; ip != bytes+(size&~(32-1)); ) { + ip+=16; + + // Load and reverse the vectors + uint8_t cvt = ip[0]; + __m128i cv=cpv; + __m128i dv = _mm_loadu_si128((__m128i*)(ip)); + cv = _mm_shuffle_epi8(cv,reverse); + dv = _mm_shuffle_epi8(dv,reverse); + ip+=16; + uint8_t dvt = ip[0]; + cpv = _mm_loadu_si128((__m128i*)(ip)); + + // Bin the data as uint16, bins even indexed values + c[0][_mm_extract_epi16(cv, 0)]++; + c[1][_mm_extract_epi16(dv, 0)]++; + c[0][_mm_extract_epi16(cv, 1)]++; + c[1][_mm_extract_epi16(dv, 1)]++; + c[0][_mm_extract_epi16(cv, 2)]++; + c[1][_mm_extract_epi16(dv, 2)]++; + c[0][_mm_extract_epi16(cv, 3)]++; + c[1][_mm_extract_epi16(dv, 3)]++; + c[0][_mm_extract_epi16(cv, 4)]++; + c[1][_mm_extract_epi16(dv, 4)]++; + c[0][_mm_extract_epi16(cv, 5)]++; + c[1][_mm_extract_epi16(dv, 5)]++; + c[0][_mm_extract_epi16(cv, 6)]++; + c[1][_mm_extract_epi16(dv, 6)]++; + c[0][_mm_extract_epi16(cv, 7)]++; + c[1][_mm_extract_epi16(dv, 7)]++; + + // shift, add in missing value, bin odd indexed values + cv = _mm_slli_si128(cv,1); + dv = _mm_slli_si128(dv,1); + cv = _mm_insert_epi8(cv,cvt,0); + dv = _mm_insert_epi8(dv,dvt,0); + + c[0][_mm_extract_epi16(cv, 0)]++; + c[1][_mm_extract_epi16(dv, 0)]++; + c[0][_mm_extract_epi16(cv, 1)]++; + c[1][_mm_extract_epi16(dv, 1)]++; + c[0][_mm_extract_epi16(cv, 2)]++; + c[1][_mm_extract_epi16(dv, 2)]++; + c[0][_mm_extract_epi16(cv, 3)]++; + c[1][_mm_extract_epi16(dv, 3)]++; + c[0][_mm_extract_epi16(cv, 4)]++; + c[1][_mm_extract_epi16(dv, 4)]++; + c[0][_mm_extract_epi16(cv, 5)]++; + c[1][_mm_extract_epi16(dv, 5)]++; + c[0][_mm_extract_epi16(cv, 6)]++; + c[1][_mm_extract_epi16(dv, 6)]++; + c[0][_mm_extract_epi16(cv, 7)]++; + c[1][_mm_extract_epi16(dv, 7)]++; + + // Prefetch data to improve vector loading + __builtin_prefetch(ip+512, 0); + } + + // Finish binning the data + uint16_t val; + while( ip < bytes+size ) { + val = *((uint16_t *) ip); + c[0][(uint16_t) ((val << 8) + (val >> 8))]++; + ip++; + } + + /* + * Step 2: Calculate the index offsets such that the indices of the most + * frequently occuring values occur first. + */ + + uint32_t indices[65536]; + __m256i index = _mm256_set_epi32(7,6,5,4,3,2,1,0); + __m256i delta = _mm256_set_epi32(8,8,8,8,8,8,8,8); + __m256i c0; + for ( uint32_t i = 0; i < 65536; ) { + _mm256_storeu_si256((__m256i *) &indices[i],index); + c0 = _mm256_loadu_si256((__m256i *) &c[0][i]); + __m256i c1 = _mm256_loadu_si256((__m256i *) &c[1][i]); + index = _mm256_add_epi32(index,delta); + c0 = _mm256_add_epi32(c0,c1); + _mm256_storeu_si256((__m256i *) &c[0][i],c0); + i += 8; + } + + // Calculate the offsets - Could probably be optimized + // Modified from https://stackoverflow.com/a/31229764 + int cmp( const void *a, const void *b ){ + int ia = *(int *)a; + int ib = *(int *)b; + return (c[0][ia] > c[0][ib]) - (c[0][ia] < c[0][ib]); + } + + qsort(indices, 65536, sizeof(*indices), cmp); + + uint32_t count = 0; + + // Loop backwards so smallest clusters are last + parts[0] = 0; + (*part_count)++; + for (int32_t i = 65535; i >= 0;--i) { + counts[indices[i]] = count; + count += c[0][indices[i]]; + if (count - counts[indices[i]] > 1) { + parts[65536 - i] = count; + (*part_count)++; + } + } +} + +/* + * Function for the first pass of sorting + * + * The first pass of sorting is simple: Count the number of times each pair of + * characters appears, then directly assign an index to the offset for each + * set of values. This is a fast, stable sort. Subsequent sorting of partitions + * requires slightly more complexity because previous sorting positions need to + * be known, requiring a reference from a separate array. + */ +void count_sort_init(uint8_t *bytes, + uint32_t *indices, + uint32_t size, + uint32_t *counts, + uint32_t *parts, + uint32_t *part_count) { + + // Bin the data + bin_turbo_init(bytes,size,counts,parts,part_count); + uint32_t c; + + __m128i reverse = _mm_set_epi8( 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9,10,11,12,13,14,15); + + // Sort the indices, unroll 16 times + uint8_t *ip; + __m256i index1 = _mm256_set_epi32( 7 ,6, 5, 4, 3, 2, 1, 0); + __m256i index2 = _mm256_set_epi32(15,14,13,12,11,10, 9, 8); + __m128i cpv = _mm_loadu_si128((__m128i*)bytes); + __m256i delta = _mm256_set1_epi32(16); + for(ip = bytes; ip != bytes+(size&~(16-1)); ) { + // Load the values and start loading a new set of values + __m128i cv = cpv; + __m128i dv = _mm_srli_si128(cpv,1); + ip += 16; + cv = _mm_shuffle_epi8(cv,reverse); + dv = _mm_insert_epi8(dv,ip[0],15); + cpv = _mm_loadu_si128((__m128i*)(ip)); + dv = _mm_shuffle_epi8(dv,reverse); + + // Set the data + indices[counts[_mm_extract_epi16(cv, 7)]++] = _mm256_extract_epi32(index1, 0); + indices[counts[_mm_extract_epi16(dv, 7)]++] = _mm256_extract_epi32(index1, 1); + indices[counts[_mm_extract_epi16(cv, 6)]++] = _mm256_extract_epi32(index1, 2); + indices[counts[_mm_extract_epi16(dv, 6)]++] = _mm256_extract_epi32(index1, 3); + indices[counts[_mm_extract_epi16(cv, 5)]++] = _mm256_extract_epi32(index1, 4); + indices[counts[_mm_extract_epi16(dv, 5)]++] = _mm256_extract_epi32(index1, 5); + indices[counts[_mm_extract_epi16(cv, 4)]++] = _mm256_extract_epi32(index1, 6); + indices[counts[_mm_extract_epi16(dv, 4)]++] = _mm256_extract_epi32(index1, 7); + index1 = _mm256_add_epi32(index1,delta); + indices[counts[_mm_extract_epi16(cv, 3)]++] = _mm256_extract_epi32(index2, 0); + indices[counts[_mm_extract_epi16(dv, 3)]++] = _mm256_extract_epi32(index2, 1); + indices[counts[_mm_extract_epi16(cv, 2)]++] = _mm256_extract_epi32(index2, 2); + indices[counts[_mm_extract_epi16(dv, 2)]++] = _mm256_extract_epi32(index2, 3); + indices[counts[_mm_extract_epi16(cv, 1)]++] = _mm256_extract_epi32(index2, 4); + indices[counts[_mm_extract_epi16(dv, 1)]++] = _mm256_extract_epi32(index2, 5); + indices[counts[_mm_extract_epi16(cv, 0)]++] = _mm256_extract_epi32(index2, 6); + indices[counts[_mm_extract_epi16(dv, 0)]++] = _mm256_extract_epi32(index2, 7); + index2 = _mm256_add_epi32(index2,delta); + __builtin_prefetch(ip+512, 0); + } + c = _mm256_extract_epi32(index1, 0); + uint16_t val; + while(ip < bytes+size) { + val = *((uint16_t *) ip); + indices[counts[(uint16_t) ((val << 8) + (val >> 8))]++] = c; + c++; + ip++; + } +}; diff --git a/src/nyx/sort.h b/src/nyx/sort.h new file mode 100644 index 00000000..727cd4db --- /dev/null +++ b/src/nyx/sort.h @@ -0,0 +1,25 @@ +#ifndef SORT_H +#define SORT_H + +#include +#include +#include +#include +#include + +void count_sort_init(uint8_t *bytes, + uint32_t *indices, + uint32_t size, + uint32_t *counts, + uint32_t *parts, + uint32_t *part_count); + +void merge_sort_2(__m256i *values, __m256i *indices); + +void merge_sort_4(__m256i *values, __m256i *indices); + +void merge_sort_8(__m256i *values, __m256i *indices); + +void merge_sort_16(__m256i *values, __m256i *indices); + +#endif // SORT_H diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index e7ce3eb6..275b0588 100755 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -92,6 +92,7 @@ set(TEST_SRC ../src/nyx/output_2_buffer.cpp ../src/nyx/output_2_csv.cpp ../src/nyx/parallel.cpp + ../src/nyx/phase1_fastloop.cpp ../src/nyx/phase1.cpp ../src/nyx/phase2.cpp ../src/nyx/phase3.cpp diff --git a/tests/python/test_nyxus.py b/tests/python/test_nyxus.py index 6e4320a8..f41f0a1a 100644 --- a/tests/python/test_nyxus.py +++ b/tests/python/test_nyxus.py @@ -17,34 +17,90 @@ def test_import(self): assert nyxus.__name__ == "nyxus" class TestNyxus(): - PATH = PATH = Path(__file__).with_name('data') + PATH = PATH = Path(__file__).with_name('data') - def test_gabor_gpu(self): - urls = ['https://github.com/stardist/stardist/releases/download/0.1.0/dsb2018.zip'] - dir_names = ['dsb2018'] + def test_gabor_gpu_validate(self): + urls = ['https://github.com/stardist/stardist/releases/download/0.1.0/dsb2018.zip'] + dir_names = ['dsb2018'] + + download_datasets(urls, dir_names) + + for dset in dir_names: - download_datasets(urls, dir_names) + intens = str(self.PATH/dset/"train/images") + seg = str(self.PATH/dset/"train/masks") - for dset in dir_names: - - intens = str(self.PATH/dset/"train/images") - seg = str(self.PATH/dset/"train/masks") + # cpu gabor + cpu_nyx = nyxus.Nyxus(["GABOR"]) + if (nyxus.gpu_is_available()): + cpu_nyx.using_gpu(False) + cpu_features = cpu_nyx.featurize_directory(intens, seg) + + if (nyxus.gpu_is_available()): + # gpu gabor + gpu_nyx = nyxus.Nyxus(["GABOR"], using_gpu=0) + gpu_nyx.using_gpu(True) + gpu_features = gpu_nyx.featurize_directory(intens, seg) - # cpu gabor - cpu_nyx = nyxus.Nyxus(["GABOR"]) - if (nyxus.gpu_is_available()): - cpu_nyx.using_gpu(False) - cpu_features = cpu_nyx.featurize_directory(intens, seg) + assert gpu_features.equals(cpu_features) + else: + print("Gpu not available") + assert True + + @pytest.mark.parametrize("use_fastloop",[True, False]) + def test_intensity_benchmark(self,use_fastloop,benchmark): + + urls = ['https://github.com/stardist/stardist/releases/download/0.1.0/dsb2018.zip'] + dir_names = ['dsb2018'] + + download_datasets(urls, dir_names) + + for dset in dir_names: + + intens = str(self.PATH/dset/"train/images") + seg = str(self.PATH/dset/"train/masks") + + # cpu gabor + cpu_nyx = nyxus.Nyxus(["*ALL_MORPHOLOGY*"]) + cpu_nyx.featurize_directory + cpu_features = benchmark(cpu_nyx.featurize_directory,intens, seg, ".*", use_fastloop) + + def test_intensity_validate(self): + + urls = ['https://github.com/stardist/stardist/releases/download/0.1.0/dsb2018.zip'] + dir_names = ['dsb2018'] + + download_datasets(urls, dir_names) + + for dset in dir_names: + + intens = str(self.PATH/dset/"train/images") + seg = str(self.PATH/dset/"train/masks") + + # cpu gabor + cpu_nyx = nyxus.Nyxus(["*ALL_MORPHOLOGY*"]) + cpu_features = cpu_nyx.featurize_directory(intens, seg, ".*", False) + + fast_features = cpu_nyx.featurize_directory(intens, seg, ".*", True) + + assert cpu_features.equals(fast_features) - if (nyxus.gpu_is_available()): - # gpu gabor - gpu_nyx = nyxus.Nyxus(["GABOR"], using_gpu=0) - gpu_nyx.using_gpu(True) - gpu_features = gpu_nyx.featurize_directory(intens, seg) - - assert gpu_features.equals(cpu_features) - else: - print("Gpu not available") - assert True - + # For benchmarking gabor features + # @pytest.mark.parametrize("use_gpu",[True, False]) + # def test_gabor_benchmark(self, use_gpu, benchmark): + + # urls = ['https://github.com/stardist/stardist/releases/download/0.1.0/dsb2018.zip'] + # dir_names = ['dsb2018'] + + # download_datasets(urls, dir_names) + + # for dset in dir_names: + + # intens = str(self.PATH/dset/"train/images") + # seg = str(self.PATH/dset/"train/masks") + + # # cpu gabor + # cpu_nyx = nyxus.Nyxus(["GABOR"]) + # cpu_nyx.using_gpu(use_gpu) + # cpu_features = benchmark(cpu_nyx.featurize_directory,intens, seg) \ No newline at end of file