From 42ad9b7affbb6e63880a77fe9f461aa846eaf19e Mon Sep 17 00:00:00 2001 From: Kim Yang Date: Thu, 26 Feb 2026 21:50:52 +0800 Subject: [PATCH 01/14] feat: add Highway SIMD library as git submodule (v1.3) Co-Authored-By: Claude Opus 4.6 --- .gitmodules | 3 +++ third_party/highway | 1 + 2 files changed, 4 insertions(+) create mode 100644 .gitmodules create mode 160000 third_party/highway diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..e7ed026 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "third_party/highway"] + path = third_party/highway + url = https://github.com/google/highway.git diff --git a/third_party/highway b/third_party/highway new file mode 160000 index 0000000..ac0d5d2 --- /dev/null +++ b/third_party/highway @@ -0,0 +1 @@ +Subproject commit ac0d5d297b13ab1b89f48484fc7911082d76a93f From 936b9dea6196d0df945293e89706eaca4eca9682 Mon Sep 17 00:00:00 2001 From: Kim Yang Date: Thu, 26 Feb 2026 22:11:13 +0800 Subject: [PATCH 02/14] build: switch Highway from system library to vendored third_party submodule Co-Authored-By: Claude Opus 4.6 --- Makefile | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index 05377a7..2f7892c 100644 --- a/Makefile +++ b/Makefile @@ -2,6 +2,7 @@ DIR_INC := ./inc DIR_SRC := ./src DIR_OBJ := ./obj DIR_TEST := ./test +DIR_HWY := ./third_party/highway PREFIX ?= /usr/local BINDIR ?= $(PREFIX)/bin @@ -13,14 +14,18 @@ TEST := $(wildcard ${DIR_TEST}/*.cpp) OBJ := $(patsubst %.cpp,${DIR_OBJ}/%.o,$(notdir ${SRC})) TEST_OBJ := $(patsubst %.cpp,${DIR_OBJ}/%.o,$(notdir ${TEST})) +# Highway runtime dispatch support +HWY_OBJS := ${DIR_OBJ}/hwy_targets.o ${DIR_OBJ}/hwy_abort.o +OBJ += $(HWY_OBJS) + TARGET := fastplong BIN_TARGET := ${TARGET} TEST_TARGET := bin/fastplong_unittest CXX ?= g++ -CXXFLAGS := -std=c++14 -pthread -g -O3 -MP -MD -I${DIR_INC} -I${DIR_SRC} $(foreach includedir,$(INCLUDE_DIRS),-I$(includedir)) ${CXXFLAGS} -LIBS := -lisal -ldeflate -lpthread -lhwy +CXXFLAGS := -std=c++14 -pthread -g -O3 -MP -MD -I. -I${DIR_HWY} -I${DIR_INC} -I${DIR_SRC} $(foreach includedir,$(INCLUDE_DIRS),-I$(includedir)) ${CXXFLAGS} +LIBS := -lisal -ldeflate -lpthread STATIC_FLAGS := -static -L. -Wl,--no-as-needed -pthread LD_FLAGS := $(foreach librarydir,$(LIBRARY_DIRS),-L$(librarydir)) $(LIBS) $(LD_FLAGS) STATIC_LD_FLAGS := $(foreach librarydir,$(LIBRARY_DIRS),-L$(librarydir)) $(STATIC_FLAGS) $(LIBS) $(STATIC_LD_FLAGS) @@ -33,7 +38,16 @@ static:${OBJ} $(CXX) $(OBJ) -o ${BIN_TARGET} $(STATIC_LD_FLAGS) ${DIR_OBJ}/%.o:${DIR_SRC}/%.cpp - @mkdir -p $(@D) + @mkdir -p $(@D) + $(CXX) -c $< -o $@ $(CXXFLAGS) + +# Highway source files for runtime CPU detection and error handling +${DIR_OBJ}/hwy_targets.o:${DIR_HWY}/hwy/targets.cc + @mkdir -p $(@D) + $(CXX) -c $< -o $@ $(CXXFLAGS) + +${DIR_OBJ}/hwy_abort.o:${DIR_HWY}/hwy/abort.cc + @mkdir -p $(@D) $(CXX) -c $< -o $@ $(CXXFLAGS) .PHONY:clean From d4a6b147b9a002e4dc414b3d6ff0c4cd4c1c1486 Mon Sep 17 00:00:00 2001 From: Kim Yang Date: Thu, 26 Feb 2026 22:18:22 +0800 Subject: [PATCH 03/14] feat: add centralized SIMD module with Highway multi-target dispatch Co-Authored-By: Claude Opus 4.6 --- src/simd.cpp | 461 +++++++++++++++++++++++++++++++++++++++++++++++++++ src/simd.h | 34 ++++ 2 files changed, 495 insertions(+) create mode 100644 src/simd.cpp create mode 100644 src/simd.h diff --git a/src/simd.cpp b/src/simd.cpp new file mode 100644 index 0000000..056f6f0 --- /dev/null +++ b/src/simd.cpp @@ -0,0 +1,461 @@ +// SIMD-accelerated functions for fastplong using Google Highway. +// This file is compiled once per SIMD target via foreach_target.h. + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "src/simd.cpp" +#include "hwy/foreach_target.h" // IWYU pragma: keep +#include "hwy/highway.h" + +HWY_BEFORE_NAMESPACE(); +namespace fastplong_simd { +namespace HWY_NAMESPACE { + +namespace hn = hwy::HWY_NAMESPACE; + +void CountQualityMetricsImpl(const char* qualstr, const char* seqstr, int len, + char qualThreshold, int& lowQualNum, int& nBaseNum, + int& totalQual) { + const hn::ScalableTag d; + const int N = hn::Lanes(d); + + const auto vThresh = hn::Set(d, static_cast(qualThreshold)); + const auto vN = hn::Set(d, static_cast('N')); + const auto v33 = hn::Set(d, static_cast(33)); + + int lowQual = 0; + int nBase = 0; + int qualSum = 0; + int i = 0; + + // Process in chunks, accumulating into wider types to avoid overflow. + // Process in blocks of up to 255 vectors to avoid u16 overflow during + // quality sum accumulation (255 * 255 = 65025 fits in u16). + const int blockSize = 255 * N; + + for (int blockStart = 0; blockStart < len; blockStart += blockSize) { + int blockEnd = blockStart + blockSize; + if (blockEnd > len) blockEnd = len; + + // Use u16 accumulator for quality sum within this block. + // SumsOf2 pairwise-adds adjacent u8 lanes into u16, avoiding + // PromoteUpperTo which is unavailable on HWY_SCALAR. + const hn::ScalableTag d16; + auto vQualSum16 = hn::Zero(d16); + + for (i = blockStart; i + N <= blockEnd; i += N) { + const auto vQual = hn::LoadU(d, reinterpret_cast(qualstr + i)); + const auto vSeq = hn::LoadU(d, reinterpret_cast(seqstr + i)); + + // Count bases with quality < threshold + const auto maskLowQ = hn::Lt(vQual, vThresh); + lowQual += hn::CountTrue(d, maskLowQ); + + // Count N bases + const auto maskN = hn::Eq(vSeq, vN); + nBase += hn::CountTrue(d, maskN); + + // Subtract 33 and accumulate quality into u16 accumulator. + // SumsOf2 adds adjacent u8 pairs -> u16, works on all targets. + const auto vQualAdj = hn::Sub(vQual, v33); + vQualSum16 = hn::Add(vQualSum16, hn::SumsOf2(vQualAdj)); + } + + // Reduce u16 accumulator to scalar via SumsOf2 -> u32 then ReduceSum + const hn::ScalableTag d32; + qualSum += static_cast( + hn::ReduceSum(d32, hn::SumsOf2(vQualSum16))); + } + + // Scalar tail + for (; i < len; i++) { + uint8_t qual = static_cast(qualstr[i]); + qualSum += qual - 33; + if (qual < static_cast(qualThreshold)) lowQual++; + if (seqstr[i] == 'N') nBase++; + } + + lowQualNum = lowQual; + nBaseNum = nBase; + totalQual = qualSum; +} + +void ReverseComplementImpl(const char* src, char* dst, int len) { + const hn::ScalableTag d; + const int N = hn::Lanes(d); + + // Complement via comparisons — portable across all SIMD widths + const auto vA = hn::Set(d, static_cast('A')); + const auto vT = hn::Set(d, static_cast('T')); + const auto vC = hn::Set(d, static_cast('C')); + const auto vG = hn::Set(d, static_cast('G')); + const auto va = hn::Set(d, static_cast('a')); + const auto vt = hn::Set(d, static_cast('t')); + const auto vc = hn::Set(d, static_cast('c')); + const auto vg = hn::Set(d, static_cast('g')); + const auto vN = hn::Set(d, static_cast('N')); + + int i = 0; + for (; i + N <= len; i += N) { + const auto v = hn::LoadU(d, reinterpret_cast(src + i)); + + // Build complement using conditional selects + auto comp = vN; // default: N + comp = hn::IfThenElse(hn::Eq(v, vA), vT, comp); + comp = hn::IfThenElse(hn::Eq(v, vT), vA, comp); + comp = hn::IfThenElse(hn::Eq(v, vC), vG, comp); + comp = hn::IfThenElse(hn::Eq(v, vG), vC, comp); + comp = hn::IfThenElse(hn::Eq(v, va), vT, comp); + comp = hn::IfThenElse(hn::Eq(v, vt), vA, comp); + comp = hn::IfThenElse(hn::Eq(v, vc), vG, comp); + comp = hn::IfThenElse(hn::Eq(v, vg), vC, comp); + + // Reverse and store at mirrored position + const auto revComp = hn::Reverse(d, comp); + hn::StoreU(revComp, d, reinterpret_cast(dst + len - i - N)); + } + + // Scalar tail + for (; i < len; i++) { + char base = src[i]; + char comp; + switch (base) { + case 'A': case 'a': comp = 'T'; break; + case 'T': case 't': comp = 'A'; break; + case 'C': case 'c': comp = 'G'; break; + case 'G': case 'g': comp = 'C'; break; + default: comp = 'N'; break; + } + dst[len - 1 - i] = comp; + } +} + +int CountAdjacentDiffsImpl(const char* data, int len) { + if (len <= 1) return 0; + + const hn::ScalableTag d; + const int N = hn::Lanes(d); + + int diff = 0; + int i = 0; + + // Compare data[i..i+N-1] with data[i+1..i+N] + for (; i + N < len; i += N) { + const auto v1 = hn::LoadU(d, reinterpret_cast(data + i)); + const auto v2 = hn::LoadU(d, reinterpret_cast(data + i + 1)); + const auto maskNe = hn::Ne(v1, v2); + diff += hn::CountTrue(d, maskNe); + } + + // Scalar tail + for (; i < len - 1; i++) { + if (data[i] != data[i + 1]) diff++; + } + + return diff; +} + +int CountMismatchesImpl(const char* a, const char* b, int len) { + const hn::ScalableTag d; + const int N = hn::Lanes(d); + + int diff = 0; + int i = 0; + + for (; i + N <= len; i += N) { + const auto va = hn::LoadU(d, reinterpret_cast(a + i)); + const auto vb = hn::LoadU(d, reinterpret_cast(b + i)); + const auto maskNe = hn::Ne(va, vb); + diff += hn::CountTrue(d, maskNe); + } + + // Scalar tail + for (; i < len; i++) { + if (a[i] != b[i]) diff++; + } + + return diff; +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace fastplong_simd +HWY_AFTER_NAMESPACE(); + +// ---- Dynamic dispatch wrappers (compiled once) ---- +#if HWY_ONCE + +#include +#include +#include + +namespace fastplong_simd { + +HWY_EXPORT(CountQualityMetricsImpl); +HWY_EXPORT(ReverseComplementImpl); +HWY_EXPORT(CountAdjacentDiffsImpl); +HWY_EXPORT(CountMismatchesImpl); + +void countQualityMetrics(const char* qualstr, const char* seqstr, int len, + char qualThreshold, int& lowQualNum, int& nBaseNum, + int& totalQual) { + HWY_DYNAMIC_DISPATCH(CountQualityMetricsImpl)(qualstr, seqstr, len, + qualThreshold, lowQualNum, + nBaseNum, totalQual); +} + +void reverseComplement(const char* src, char* dst, int len) { + HWY_DYNAMIC_DISPATCH(ReverseComplementImpl)(src, dst, len); +} + +int countAdjacentDiffs(const char* data, int len) { + return HWY_DYNAMIC_DISPATCH(CountAdjacentDiffsImpl)(data, len); +} + +int countMismatches(const char* a, const char* b, int len) { + return HWY_DYNAMIC_DISPATCH(CountMismatchesImpl)(a, b, len); +} + +// ---- Scalar reference implementations for testing ---- + +static void scalarCountQualityMetrics(const char* qualstr, const char* seqstr, + int len, char qualThreshold, + int& lowQualNum, int& nBaseNum, + int& totalQual) { + lowQualNum = 0; + nBaseNum = 0; + totalQual = 0; + for (int i = 0; i < len; i++) { + uint8_t q = static_cast(qualstr[i]); + totalQual += q - 33; + if (q < static_cast(qualThreshold)) lowQualNum++; + if (seqstr[i] == 'N') nBaseNum++; + } +} + +static void scalarReverseComplement(const char* src, char* dst, int len) { + for (int i = 0; i < len; i++) { + char c; + switch (src[i]) { + case 'A': case 'a': c = 'T'; break; + case 'T': case 't': c = 'A'; break; + case 'C': case 'c': c = 'G'; break; + case 'G': case 'g': c = 'C'; break; + default: c = 'N'; break; + } + dst[len - 1 - i] = c; + } +} + +static int scalarCountAdjacentDiffs(const char* data, int len) { + int diff = 0; + for (int i = 0; i < len - 1; i++) { + if (data[i] != data[i + 1]) diff++; + } + return diff; +} + +static int scalarCountMismatches(const char* a, const char* b, int len) { + int diff = 0; + for (int i = 0; i < len; i++) { + if (a[i] != b[i]) diff++; + } + return diff; +} + +bool testSimd() { + bool pass = true; + + // --- reverseComplement --- + // Basic case + { + const char* in = "AAAATTTTCCCCGGGG"; + int len = 16; + char out[16]; + reverseComplement(in, out, len); + char ref[16]; + scalarReverseComplement(in, ref, len); + if (memcmp(out, ref, len) != 0) { + fprintf(stderr, "FAIL: reverseComplement basic\n"); + pass = false; + } + } + // Mixed case + { + const char* in = "AaTtCcGgN"; + int len = 9; + char out[9]; + reverseComplement(in, out, len); + char ref[9]; + scalarReverseComplement(in, ref, len); + if (memcmp(out, ref, len) != 0) { + fprintf(stderr, "FAIL: reverseComplement mixed case\n"); + pass = false; + } + } + // Empty + { + char out[1] = {'X'}; + reverseComplement("", out, 0); + // Should not crash; out unchanged + } + // Length 1 + { + char out[1]; + reverseComplement("A", out, 1); + if (out[0] != 'T') { + fprintf(stderr, "FAIL: reverseComplement len=1\n"); + pass = false; + } + } + // Long string (> typical SIMD width) to exercise SIMD + tail + { + const char* in = "ATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCG"; + int len = 68; + char out[68], ref[68]; + reverseComplement(in, out, len); + scalarReverseComplement(in, ref, len); + if (memcmp(out, ref, len) != 0) { + fprintf(stderr, "FAIL: reverseComplement long string\n"); + pass = false; + } + } + + // --- countQualityMetrics --- + // Basic + { + // qual: "IIIII" (phred 40 each), seq: "ACGTN" + const char* qual = "IIIII"; + const char* seq = "ACGTN"; + int lowQ = 0, nBase = 0, totalQ = 0; + countQualityMetrics(qual, seq, 5, '5', lowQ, nBase, totalQ); + int refLowQ = 0, refNBase = 0, refTotalQ = 0; + scalarCountQualityMetrics(qual, seq, 5, '5', refLowQ, refNBase, refTotalQ); + if (lowQ != refLowQ || nBase != refNBase || totalQ != refTotalQ) { + fprintf(stderr, "FAIL: countQualityMetrics basic (lowQ=%d/%d nBase=%d/%d totalQ=%d/%d)\n", + lowQ, refLowQ, nBase, refNBase, totalQ, refTotalQ); + pass = false; + } + } + // All low quality + { + const char* qual = "!!!!!!!!!!"; // phred 0 + const char* seq = "AAAAAAAAAA"; + int lowQ = 0, nBase = 0, totalQ = 0; + countQualityMetrics(qual, seq, 10, '5', lowQ, nBase, totalQ); + int refLowQ = 0, refNBase = 0, refTotalQ = 0; + scalarCountQualityMetrics(qual, seq, 10, '5', refLowQ, refNBase, refTotalQ); + if (lowQ != refLowQ || nBase != refNBase || totalQ != refTotalQ) { + fprintf(stderr, "FAIL: countQualityMetrics all-low\n"); + pass = false; + } + } + // Long string with mixed content + { + const char qual[] = "IIIIII!!!!!IIIII55555NNNNN!!!!!IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII"; + const char seq[] = "ACGTNNACGTNACGTNACGTNACGTNACGTNACGTNACGTNACGTNACGTNACGTNACGTNACGTNACG"; + int len = 68; + int lowQ = 0, nBase = 0, totalQ = 0; + countQualityMetrics(qual, seq, len, '5', lowQ, nBase, totalQ); + int refLowQ = 0, refNBase = 0, refTotalQ = 0; + scalarCountQualityMetrics(qual, seq, len, '5', refLowQ, refNBase, refTotalQ); + if (lowQ != refLowQ || nBase != refNBase || totalQ != refTotalQ) { + fprintf(stderr, "FAIL: countQualityMetrics long (lowQ=%d/%d nBase=%d/%d totalQ=%d/%d)\n", + lowQ, refLowQ, nBase, refNBase, totalQ, refTotalQ); + pass = false; + } + } + + // --- countAdjacentDiffs --- + // All same + { + int d = countAdjacentDiffs("AAAAAAAAAA", 10); + if (d != 0) { + fprintf(stderr, "FAIL: countAdjacentDiffs all-same got %d\n", d); + pass = false; + } + } + // All different + { + const char* s = "ACACACACAC"; + int d = countAdjacentDiffs(s, 10); + int ref = scalarCountAdjacentDiffs(s, 10); + if (d != ref) { + fprintf(stderr, "FAIL: countAdjacentDiffs all-diff got %d expected %d\n", d, ref); + pass = false; + } + } + // Length 0 and 1 + { + if (countAdjacentDiffs("A", 1) != 0) { + fprintf(stderr, "FAIL: countAdjacentDiffs len=1\n"); + pass = false; + } + if (countAdjacentDiffs("", 0) != 0) { + fprintf(stderr, "FAIL: countAdjacentDiffs len=0\n"); + pass = false; + } + } + // Long string + { + const char* s = "ATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCG"; + int len = 68; + int d = countAdjacentDiffs(s, len); + int ref = scalarCountAdjacentDiffs(s, len); + if (d != ref) { + fprintf(stderr, "FAIL: countAdjacentDiffs long got %d expected %d\n", d, ref); + pass = false; + } + } + + // --- countMismatches --- + // Identical + { + const char* s = "ACGTACGTACGT"; + int d = countMismatches(s, s, 12); + if (d != 0) { + fprintf(stderr, "FAIL: countMismatches identical got %d\n", d); + pass = false; + } + } + // All different + { + int d = countMismatches("AAAA", "TTTT", 4); + if (d != 4) { + fprintf(stderr, "FAIL: countMismatches all-diff got %d\n", d); + pass = false; + } + } + // Long string + { + const char* a = "ATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCG"; + const char* b = "ATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCG"; + int d = countMismatches(a, b, 68); + if (d != 0) { + fprintf(stderr, "FAIL: countMismatches long-identical got %d\n", d); + pass = false; + } + } + { + const char* a = "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"; + const char* b = "TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT"; + int d = countMismatches(a, b, 66); + if (d != 66) { + fprintf(stderr, "FAIL: countMismatches long-alldiff got %d\n", d); + pass = false; + } + } + // Length 0 + { + int d = countMismatches("A", "T", 0); + if (d != 0) { + fprintf(stderr, "FAIL: countMismatches len=0 got %d\n", d); + pass = false; + } + } + + return pass; +} + +} // namespace fastplong_simd + +#endif // HWY_ONCE diff --git a/src/simd.h b/src/simd.h new file mode 100644 index 0000000..863fcf1 --- /dev/null +++ b/src/simd.h @@ -0,0 +1,34 @@ +#ifndef FASTPLONG_SIMD_H +#define FASTPLONG_SIMD_H + +#include + +namespace fastplong_simd { + +// Count quality metrics for a read in one pass. +// qualstr/seqstr: quality and sequence strings of length len. +// qualThreshold: phred+33 encoded quality threshold for "low quality". +// Outputs: lowQualNum, nBaseNum, totalQual (sum of qual-33 values). +void countQualityMetrics(const char* qualstr, const char* seqstr, int len, + char qualThreshold, int& lowQualNum, int& nBaseNum, + int& totalQual); + +// Reverse complement a DNA sequence. +// src: input sequence of length len. +// dst: output buffer of at least len bytes (must NOT alias src). +void reverseComplement(const char* src, char* dst, int len); + +// Count adjacent-base differences for low complexity filter. +// Returns the number of positions where data[i] != data[i+1]. +int countAdjacentDiffs(const char* data, int len); + +// Count mismatches between two byte strings. +// Returns the number of positions where a[i] != b[i], up to len bytes. +int countMismatches(const char* a, const char* b, int len); + +// Run all SIMD unit tests. Returns true if all pass. +bool testSimd(); + +} // namespace fastplong_simd + +#endif // FASTPLONG_SIMD_H From 201eebb812b54ce500355d0250ab7dd2a8203645 Mon Sep 17 00:00:00 2001 From: Kim Yang Date: Thu, 26 Feb 2026 22:28:28 +0800 Subject: [PATCH 04/14] perf: centralize SIMD with Highway multi-target dispatch - Refactor filter.cpp: use countQualityMetrics/countAdjacentDiffs - Refactor sequence.cpp: use centralized reverseComplement - Refactor adaptertrimmer.cpp: use countMismatches - Remove simdutil.h (merged into simd.cpp) - Add simd_test.cpp - Upgrade C++ standard to C++17 for gtest compatibility Co-Authored-By: Claude Opus 4.6 --- Makefile | 2 +- ...2026-02-26-simd-highway-refactor-design.md | 76 ++ .../plans/2026-02-26-simd-highway-refactor.md | 896 ++++++++++++++++++ src/adaptertrimmer.cpp | 36 +- src/filter.cpp | 27 +- src/sequence.cpp | 60 +- src/simdutil.h | 38 - test/simd_test.cpp | 6 + 8 files changed, 995 insertions(+), 146 deletions(-) create mode 100644 docs/plans/2026-02-26-simd-highway-refactor-design.md create mode 100644 docs/plans/2026-02-26-simd-highway-refactor.md delete mode 100644 src/simdutil.h create mode 100644 test/simd_test.cpp diff --git a/Makefile b/Makefile index 2f7892c..576d95a 100644 --- a/Makefile +++ b/Makefile @@ -24,7 +24,7 @@ BIN_TARGET := ${TARGET} TEST_TARGET := bin/fastplong_unittest CXX ?= g++ -CXXFLAGS := -std=c++14 -pthread -g -O3 -MP -MD -I. -I${DIR_HWY} -I${DIR_INC} -I${DIR_SRC} $(foreach includedir,$(INCLUDE_DIRS),-I$(includedir)) ${CXXFLAGS} +CXXFLAGS := -std=c++17 -pthread -g -O3 -MP -MD -I. -I${DIR_HWY} -I${DIR_INC} -I${DIR_SRC} $(foreach includedir,$(INCLUDE_DIRS),-I$(includedir)) ${CXXFLAGS} LIBS := -lisal -ldeflate -lpthread STATIC_FLAGS := -static -L. -Wl,--no-as-needed -pthread LD_FLAGS := $(foreach librarydir,$(LIBRARY_DIRS),-L$(librarydir)) $(LIBS) $(LD_FLAGS) diff --git a/docs/plans/2026-02-26-simd-highway-refactor-design.md b/docs/plans/2026-02-26-simd-highway-refactor-design.md new file mode 100644 index 0000000..47b8e4c --- /dev/null +++ b/docs/plans/2026-02-26-simd-highway-refactor-design.md @@ -0,0 +1,76 @@ +# SIMD Highway Refactor Design + +## Goal + +Centralize all SIMD code into a single `simd.h`/`simd.cpp` pair using Google Highway's `foreach_target.h` multi-target compilation + `HWY_DYNAMIC_DISPATCH` runtime dispatch pattern. Port proven implementations from fastp and accelerate remaining scalar hot loops. + +## Current State + +- fastplong links `-lhwy` as system library +- SIMD exists in `adaptertrimmer.cpp` (mismatch counting) and `sequence.cpp` (reverse complement) but without multi-target dispatch +- `simdutil.h` has a custom `Transform1Reversed` template +- `filter.cpp` hot loops (`passFilter`, `passLowComplexityFilter`) are pure scalar + +## Changes + +### New Files + +- `third_party/highway/` — Highway source (header-only + `targets.cc` / `abort.cc`) +- `src/simd.h` — Public SIMD API +- `src/simd.cpp` — Multi-target implementation with `foreach_target.h` + +### Modified Files + +- `Makefile` — Add `DIR_HWY`, compile `hwy_targets.o` / `hwy_abort.o`, remove `-lhwy`, add `-I${DIR_HWY}` +- `src/filter.cpp` — Replace scalar loops with `countQualityMetrics` and `countAdjacentDiffs` +- `src/sequence.cpp` — Replace inline Highway code with `reverseComplement` call +- `src/adaptertrimmer.cpp` — Replace inner SIMD loop with `countMismatches` call + +### Deleted Files + +- `src/simdutil.h` — Functionality merged into `simd.cpp` + +## API + +```cpp +namespace fastplong_simd { + void countQualityMetrics(const char* qualstr, const char* seqstr, int len, + char qualThreshold, int& lowQualNum, int& nBaseNum, + int& totalQual); + void reverseComplement(const char* src, char* dst, int len); + int countAdjacentDiffs(const char* data, int len); + int countMismatches(const char* a, const char* b, int len); + bool testSimd(); +} +``` + +## simd.cpp Structure + +``` +#define HWY_TARGET_INCLUDE "src/simd.cpp" +#include "hwy/foreach_target.h" + +namespace fastplong_simd::HWY_NAMESPACE { + CountQualityMetricsImpl() — vectorized quality stats (SumsOf2 overflow-safe) + ReverseComplementImpl() — vectorized complement + reverse + CountAdjacentDiffsImpl() — vectorized adjacent comparison + CountMismatchesImpl() — vectorized byte comparison +} + +#if HWY_ONCE + HWY_EXPORT(...) — register all target versions + dispatch wrappers — HWY_DYNAMIC_DISPATCH + scalar references — for test validation + testSimd() — unit tests +#endif +``` + +## Key Technical Details + +- Block-based accumulation in `countQualityMetrics` (255*N elements per block) prevents u16 overflow for long reads (10k-100k+ bp) +- `SumsOf2` for u8->u16 promotion works on all targets including HWY_SCALAR +- `searchAdapter` outer logic (branching, edit_distance) stays in `adaptertrimmer.cpp`; only inner mismatch loop delegates to `countMismatches` + +## Reference + +Based on fastp's `src/simd.h` / `src/simd.cpp` implementation pattern. diff --git a/docs/plans/2026-02-26-simd-highway-refactor.md b/docs/plans/2026-02-26-simd-highway-refactor.md new file mode 100644 index 0000000..47f40ba --- /dev/null +++ b/docs/plans/2026-02-26-simd-highway-refactor.md @@ -0,0 +1,896 @@ +# SIMD Highway Refactor Implementation Plan + +> **For Claude:** REQUIRED SUB-SKILL: Use superpowers:executing-plans to implement this plan task-by-task. + +**Goal:** Centralize all SIMD code into `simd.h`/`simd.cpp` using Highway's `foreach_target.h` multi-target dispatch, port fastp's proven implementations, and accelerate remaining scalar hot loops. + +**Architecture:** Single `simd.cpp` compiled via `foreach_target.h` produces code for all CPU targets (SSE2, AVX2, NEON, scalar). Runtime dispatch via `HWY_DYNAMIC_DISPATCH` selects the fastest version. Highway is vendored as `third_party/highway` git submodule. + +**Tech Stack:** C++14, Google Highway 1.3, gtest + +--- + +### Task 1: Add Highway as git submodule + +**Files:** +- Create: `third_party/highway/` (git submodule) +- Modify: `.gitmodules` + +**Step 1: Add the submodule** + +```bash +cd /Users/kimy/workspace/fastplong +mkdir -p third_party +git submodule add https://github.com/google/highway.git third_party/highway +cd third_party/highway +git checkout ac0d5d29 +cd ../.. +``` + +**Step 2: Verify the submodule** + +Run: `ls third_party/highway/hwy/highway.h` +Expected: File exists + +Run: `grep HWY_MAJOR third_party/highway/hwy/base.h | head -2` +Expected: `#define HWY_MAJOR 1` and `#define HWY_MINOR 3` + +**Step 3: Commit** + +```bash +git add .gitmodules third_party/highway +git commit -m "feat: add Highway SIMD library as git submodule (v1.3)" +``` + +--- + +### Task 2: Update Makefile for Highway + +**Files:** +- Modify: `Makefile` + +**Step 1: Update the Makefile** + +Replace the entire Makefile with: + +```makefile +DIR_INC := ./inc +DIR_SRC := ./src +DIR_OBJ := ./obj +DIR_TEST := ./test +DIR_HWY := ./third_party/highway + +PREFIX ?= /usr/local +BINDIR ?= $(PREFIX)/bin +INCLUDE_DIRS ?= /opt/homebrew/include +LIBRARY_DIRS ?= /opt/homebrew/lib + +SRC := $(wildcard ${DIR_SRC}/*.cpp) +TEST := $(wildcard ${DIR_TEST}/*.cpp) +OBJ := $(patsubst %.cpp,${DIR_OBJ}/%.o,$(notdir ${SRC})) +TEST_OBJ := $(patsubst %.cpp,${DIR_OBJ}/%.o,$(notdir ${TEST})) + +# Highway runtime dispatch support +HWY_OBJS := ${DIR_OBJ}/hwy_targets.o ${DIR_OBJ}/hwy_abort.o +OBJ += $(HWY_OBJS) + +TARGET := fastplong + +BIN_TARGET := ${TARGET} +TEST_TARGET := bin/fastplong_unittest + +CXX ?= g++ +CXXFLAGS := -std=c++14 -pthread -g -O3 -MP -MD -I. -I${DIR_INC} -I${DIR_SRC} -I${DIR_HWY} $(foreach includedir,$(INCLUDE_DIRS),-I$(includedir)) ${CXXFLAGS} +LIBS := -lisal -ldeflate -lpthread +STATIC_FLAGS := -static -L. -Wl,--no-as-needed -pthread +LD_FLAGS := $(foreach librarydir,$(LIBRARY_DIRS),-L$(librarydir)) $(LIBS) $(LD_FLAGS) +STATIC_LD_FLAGS := $(foreach librarydir,$(LIBRARY_DIRS),-L$(librarydir)) $(STATIC_FLAGS) $(LIBS) $(STATIC_LD_FLAGS) + + +${BIN_TARGET}:${OBJ} + $(CXX) $(OBJ) -o $@ $(LD_FLAGS) + +static:${OBJ} + $(CXX) $(OBJ) -o ${BIN_TARGET} $(STATIC_LD_FLAGS) + +${DIR_OBJ}/%.o:${DIR_SRC}/%.cpp + @mkdir -p $(@D) + $(CXX) -c $< -o $@ $(CXXFLAGS) + +# Highway source files for runtime CPU detection and error handling +${DIR_OBJ}/hwy_targets.o:${DIR_HWY}/hwy/targets.cc + @mkdir -p $(@D) + $(CXX) -c $< -o $@ $(CXXFLAGS) + +${DIR_OBJ}/hwy_abort.o:${DIR_HWY}/hwy/abort.cc + @mkdir -p $(@D) + $(CXX) -c $< -o $@ $(CXXFLAGS) + +.PHONY:clean +.PHONY:static +clean: + @rm -rf $(DIR_OBJ) + @rm -f $(TARGET) + @rm -f $(TEST_TARGET) + +install: + install $(TARGET) $(BINDIR)/$(TARGET) + @echo "Installed." + +${DIR_OBJ}/%.o:${DIR_TEST}/%.cpp + @mkdir -p $(@D) + $(CXX) -c $< -o $@ $(CXXFLAGS) + +test-static: ${TEST_OBJ} ${OBJ} + @mkdir -p bin + $(CXX) $(TEST_OBJ) ${OBJ:./obj/main.o=} -o ${TEST_TARGET} $(STATIC_LD_FLAGS) -lgtest -lgtest_main + ./${TEST_TARGET} + +test:${TEST_OBJ} ${OBJ} + @mkdir -p bin + $(CXX) $(TEST_OBJ) ${OBJ:./obj/main.o=} -o ${TEST_TARGET} $(LD_FLAGS) -lgtest -lgtest_main + ./${TEST_TARGET} + +-include $(OBJ:.o=.d) +``` + +Key changes from original: +- Added `DIR_HWY := ./third_party/highway` +- Added `-I. -I${DIR_HWY}` to CXXFLAGS +- Removed `-lhwy` from LIBS +- Added `HWY_OBJS` for `hwy_targets.o` and `hwy_abort.o` +- Added build rules for Highway source files + +**Step 2: Verify it compiles (will fail at link due to simd changes not yet done, but object files should build)** + +Run: `make clean && make -j 2>&1 | tail -5` +Expected: Compilation succeeds (may have link errors due to missing `-lhwy` until we finish simd.cpp, that's OK) + +**Step 3: Commit** + +```bash +git add Makefile +git commit -m "build: switch Highway from system library to vendored third_party submodule" +``` + +--- + +### Task 3: Create simd.h and simd.cpp + +**Files:** +- Create: `src/simd.h` +- Create: `src/simd.cpp` + +**Step 1: Create `src/simd.h`** + +```cpp +#ifndef FASTPLONG_SIMD_H +#define FASTPLONG_SIMD_H + +#include + +namespace fastplong_simd { + +// Count quality metrics for a read in one pass. +// qualstr/seqstr: quality and sequence strings of length len. +// qualThreshold: phred+33 encoded quality threshold for "low quality". +// Outputs: lowQualNum, nBaseNum, totalQual (sum of qual-33 values). +void countQualityMetrics(const char* qualstr, const char* seqstr, int len, + char qualThreshold, int& lowQualNum, int& nBaseNum, + int& totalQual); + +// Reverse complement a DNA sequence. +// src: input sequence of length len. +// dst: output buffer of at least len bytes (must NOT alias src). +void reverseComplement(const char* src, char* dst, int len); + +// Count adjacent-base differences for low complexity filter. +// Returns the number of positions where data[i] != data[i+1]. +int countAdjacentDiffs(const char* data, int len); + +// Count mismatches between two byte strings. +// Returns the number of positions where a[i] != b[i], up to len bytes. +int countMismatches(const char* a, const char* b, int len); + +// Run all SIMD unit tests. Returns true if all pass. +bool testSimd(); + +} // namespace fastplong_simd + +#endif // FASTPLONG_SIMD_H +``` + +**Step 2: Create `src/simd.cpp`** + +```cpp +// SIMD-accelerated functions for fastplong using Google Highway. +// This file is compiled once per SIMD target via foreach_target.h. + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "src/simd.cpp" +#include "hwy/foreach_target.h" // IWYU pragma: keep +#include "hwy/highway.h" + +HWY_BEFORE_NAMESPACE(); +namespace fastplong_simd { +namespace HWY_NAMESPACE { + +namespace hn = hwy::HWY_NAMESPACE; + +void CountQualityMetricsImpl(const char* qualstr, const char* seqstr, int len, + char qualThreshold, int& lowQualNum, int& nBaseNum, + int& totalQual) { + const hn::ScalableTag d; + const int N = hn::Lanes(d); + + const auto vThresh = hn::Set(d, static_cast(qualThreshold)); + const auto vN = hn::Set(d, static_cast('N')); + const auto v33 = hn::Set(d, static_cast(33)); + + int lowQual = 0; + int nBase = 0; + int qualSum = 0; + int i = 0; + + // Process in blocks of up to 255 vectors to avoid u16 overflow during + // quality sum accumulation (255 * 255 = 65025 fits in u16). + const int blockSize = 255 * N; + + for (int blockStart = 0; blockStart < len; blockStart += blockSize) { + int blockEnd = blockStart + blockSize; + if (blockEnd > len) blockEnd = len; + + const hn::ScalableTag d16; + auto vQualSum16 = hn::Zero(d16); + + for (i = blockStart; i + N <= blockEnd; i += N) { + const auto vQual = hn::LoadU(d, reinterpret_cast(qualstr + i)); + const auto vSeq = hn::LoadU(d, reinterpret_cast(seqstr + i)); + + const auto maskLowQ = hn::Lt(vQual, vThresh); + lowQual += hn::CountTrue(d, maskLowQ); + + const auto maskN = hn::Eq(vSeq, vN); + nBase += hn::CountTrue(d, maskN); + + const auto vQualAdj = hn::Sub(vQual, v33); + vQualSum16 = hn::Add(vQualSum16, hn::SumsOf2(vQualAdj)); + } + + const hn::ScalableTag d32; + qualSum += static_cast( + hn::ReduceSum(d32, hn::SumsOf2(vQualSum16))); + } + + // Scalar tail + for (; i < len; i++) { + uint8_t qual = static_cast(qualstr[i]); + qualSum += qual - 33; + if (qual < static_cast(qualThreshold)) lowQual++; + if (seqstr[i] == 'N') nBase++; + } + + lowQualNum = lowQual; + nBaseNum = nBase; + totalQual = qualSum; +} + +void ReverseComplementImpl(const char* src, char* dst, int len) { + const hn::ScalableTag d; + const int N = hn::Lanes(d); + + const auto vA = hn::Set(d, static_cast('A')); + const auto vT = hn::Set(d, static_cast('T')); + const auto vC = hn::Set(d, static_cast('C')); + const auto vG = hn::Set(d, static_cast('G')); + const auto va = hn::Set(d, static_cast('a')); + const auto vt = hn::Set(d, static_cast('t')); + const auto vc = hn::Set(d, static_cast('c')); + const auto vg = hn::Set(d, static_cast('g')); + const auto vNch = hn::Set(d, static_cast('N')); + + int i = 0; + for (; i + N <= len; i += N) { + const auto v = hn::LoadU(d, reinterpret_cast(src + i)); + + auto comp = vNch; + comp = hn::IfThenElse(hn::Eq(v, vA), vT, comp); + comp = hn::IfThenElse(hn::Eq(v, vT), vA, comp); + comp = hn::IfThenElse(hn::Eq(v, vC), vG, comp); + comp = hn::IfThenElse(hn::Eq(v, vG), vC, comp); + comp = hn::IfThenElse(hn::Eq(v, va), vT, comp); + comp = hn::IfThenElse(hn::Eq(v, vt), vA, comp); + comp = hn::IfThenElse(hn::Eq(v, vc), vG, comp); + comp = hn::IfThenElse(hn::Eq(v, vg), vC, comp); + + const auto revComp = hn::Reverse(d, comp); + hn::StoreU(revComp, d, reinterpret_cast(dst + len - i - N)); + } + + // Scalar tail + for (; i < len; i++) { + char base = src[i]; + char comp; + switch (base) { + case 'A': case 'a': comp = 'T'; break; + case 'T': case 't': comp = 'A'; break; + case 'C': case 'c': comp = 'G'; break; + case 'G': case 'g': comp = 'C'; break; + default: comp = 'N'; break; + } + dst[len - 1 - i] = comp; + } +} + +int CountAdjacentDiffsImpl(const char* data, int len) { + if (len <= 1) return 0; + + const hn::ScalableTag d; + const int N = hn::Lanes(d); + + int diff = 0; + int i = 0; + + for (; i + N < len; i += N) { + const auto v1 = hn::LoadU(d, reinterpret_cast(data + i)); + const auto v2 = hn::LoadU(d, reinterpret_cast(data + i + 1)); + const auto maskNe = hn::Ne(v1, v2); + diff += hn::CountTrue(d, maskNe); + } + + // Scalar tail + for (; i < len - 1; i++) { + if (data[i] != data[i + 1]) diff++; + } + + return diff; +} + +int CountMismatchesImpl(const char* a, const char* b, int len) { + const hn::ScalableTag d; + const int N = hn::Lanes(d); + + int diff = 0; + int i = 0; + + for (; i + N <= len; i += N) { + const auto va = hn::LoadU(d, reinterpret_cast(a + i)); + const auto vb = hn::LoadU(d, reinterpret_cast(b + i)); + const auto maskNe = hn::Ne(va, vb); + diff += hn::CountTrue(d, maskNe); + } + + // Scalar tail + for (; i < len; i++) { + if (a[i] != b[i]) diff++; + } + + return diff; +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace fastplong_simd +HWY_AFTER_NAMESPACE(); + +// ---- Dynamic dispatch wrappers (compiled once) ---- +#if HWY_ONCE + +#include +#include +#include + +namespace fastplong_simd { + +HWY_EXPORT(CountQualityMetricsImpl); +HWY_EXPORT(ReverseComplementImpl); +HWY_EXPORT(CountAdjacentDiffsImpl); +HWY_EXPORT(CountMismatchesImpl); + +void countQualityMetrics(const char* qualstr, const char* seqstr, int len, + char qualThreshold, int& lowQualNum, int& nBaseNum, + int& totalQual) { + HWY_DYNAMIC_DISPATCH(CountQualityMetricsImpl)(qualstr, seqstr, len, + qualThreshold, lowQualNum, + nBaseNum, totalQual); +} + +void reverseComplement(const char* src, char* dst, int len) { + HWY_DYNAMIC_DISPATCH(ReverseComplementImpl)(src, dst, len); +} + +int countAdjacentDiffs(const char* data, int len) { + return HWY_DYNAMIC_DISPATCH(CountAdjacentDiffsImpl)(data, len); +} + +int countMismatches(const char* a, const char* b, int len) { + return HWY_DYNAMIC_DISPATCH(CountMismatchesImpl)(a, b, len); +} + +// ---- Scalar reference implementations for testing ---- + +static void scalarCountQualityMetrics(const char* qualstr, const char* seqstr, + int len, char qualThreshold, + int& lowQualNum, int& nBaseNum, + int& totalQual) { + lowQualNum = 0; + nBaseNum = 0; + totalQual = 0; + for (int i = 0; i < len; i++) { + uint8_t q = static_cast(qualstr[i]); + totalQual += q - 33; + if (q < static_cast(qualThreshold)) lowQualNum++; + if (seqstr[i] == 'N') nBase++; + } +} + +static void scalarReverseComplement(const char* src, char* dst, int len) { + for (int i = 0; i < len; i++) { + char c; + switch (src[i]) { + case 'A': case 'a': c = 'T'; break; + case 'T': case 't': c = 'A'; break; + case 'C': case 'c': c = 'G'; break; + case 'G': case 'g': c = 'C'; break; + default: c = 'N'; break; + } + dst[len - 1 - i] = c; + } +} + +static int scalarCountAdjacentDiffs(const char* data, int len) { + int diff = 0; + for (int i = 0; i < len - 1; i++) { + if (data[i] != data[i + 1]) diff++; + } + return diff; +} + +static int scalarCountMismatches(const char* a, const char* b, int len) { + int diff = 0; + for (int i = 0; i < len; i++) { + if (a[i] != b[i]) diff++; + } + return diff; +} + +bool testSimd() { + bool pass = true; + + // --- reverseComplement --- + { + const char* in = "AAAATTTTCCCCGGGG"; + int len = 16; + char out[16]; + reverseComplement(in, out, len); + char ref[16]; + scalarReverseComplement(in, ref, len); + if (memcmp(out, ref, len) != 0) { + fprintf(stderr, "FAIL: reverseComplement basic\n"); + pass = false; + } + } + { + const char* in = "AaTtCcGgN"; + int len = 9; + char out[9]; + reverseComplement(in, out, len); + char ref[9]; + scalarReverseComplement(in, ref, len); + if (memcmp(out, ref, len) != 0) { + fprintf(stderr, "FAIL: reverseComplement mixed case\n"); + pass = false; + } + } + { + char out[1] = {'X'}; + reverseComplement("", out, 0); + } + { + char out[1]; + reverseComplement("A", out, 1); + if (out[0] != 'T') { + fprintf(stderr, "FAIL: reverseComplement len=1\n"); + pass = false; + } + } + { + const char* in = "ATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCG"; + int len = 68; + char out[68], ref[68]; + reverseComplement(in, out, len); + scalarReverseComplement(in, ref, len); + if (memcmp(out, ref, len) != 0) { + fprintf(stderr, "FAIL: reverseComplement long string\n"); + pass = false; + } + } + + // --- countQualityMetrics --- + { + const char* qual = "IIIII"; + const char* seq = "ACGTN"; + int lowQ = 0, nBase = 0, totalQ = 0; + countQualityMetrics(qual, seq, 5, '5', lowQ, nBase, totalQ); + int refLowQ = 0, refNBase = 0, refTotalQ = 0; + scalarCountQualityMetrics(qual, seq, 5, '5', refLowQ, refNBase, refTotalQ); + if (lowQ != refLowQ || nBase != refNBase || totalQ != refTotalQ) { + fprintf(stderr, "FAIL: countQualityMetrics basic\n"); + pass = false; + } + } + { + const char* qual = "!!!!!!!!!!"; + const char* seq = "AAAAAAAAAA"; + int lowQ = 0, nBase = 0, totalQ = 0; + countQualityMetrics(qual, seq, 10, '5', lowQ, nBase, totalQ); + int refLowQ = 0, refNBase = 0, refTotalQ = 0; + scalarCountQualityMetrics(qual, seq, 10, '5', refLowQ, refNBase, refTotalQ); + if (lowQ != refLowQ || nBase != refNBase || totalQ != refTotalQ) { + fprintf(stderr, "FAIL: countQualityMetrics all-low\n"); + pass = false; + } + } + { + const char qual[] = "IIIIII!!!!!IIIII55555NNNNN!!!!!IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII"; + const char seq[] = "ACGTNNACGTNACGTNACGTNACGTNACGTNACGTNACGTNACGTNACGTNACGTNACGTNACGTNACG"; + int len = 68; + int lowQ = 0, nBase = 0, totalQ = 0; + countQualityMetrics(qual, seq, len, '5', lowQ, nBase, totalQ); + int refLowQ = 0, refNBase = 0, refTotalQ = 0; + scalarCountQualityMetrics(qual, seq, len, '5', refLowQ, refNBase, refTotalQ); + if (lowQ != refLowQ || nBase != refNBase || totalQ != refTotalQ) { + fprintf(stderr, "FAIL: countQualityMetrics long\n"); + pass = false; + } + } + + // --- countAdjacentDiffs --- + { + int d = countAdjacentDiffs("AAAAAAAAAA", 10); + if (d != 0) { + fprintf(stderr, "FAIL: countAdjacentDiffs all-same got %d\n", d); + pass = false; + } + } + { + const char* s = "ACACACACAC"; + int d = countAdjacentDiffs(s, 10); + int ref = scalarCountAdjacentDiffs(s, 10); + if (d != ref) { + fprintf(stderr, "FAIL: countAdjacentDiffs all-diff got %d expected %d\n", d, ref); + pass = false; + } + } + { + if (countAdjacentDiffs("A", 1) != 0) { + fprintf(stderr, "FAIL: countAdjacentDiffs len=1\n"); + pass = false; + } + if (countAdjacentDiffs("", 0) != 0) { + fprintf(stderr, "FAIL: countAdjacentDiffs len=0\n"); + pass = false; + } + } + { + const char* s = "ATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCG"; + int len = 68; + int d = countAdjacentDiffs(s, len); + int ref = scalarCountAdjacentDiffs(s, len); + if (d != ref) { + fprintf(stderr, "FAIL: countAdjacentDiffs long got %d expected %d\n", d, ref); + pass = false; + } + } + + // --- countMismatches --- + { + const char* s = "ACGTACGTACGT"; + int d = countMismatches(s, s, 12); + if (d != 0) { + fprintf(stderr, "FAIL: countMismatches identical got %d\n", d); + pass = false; + } + } + { + int d = countMismatches("AAAA", "TTTT", 4); + if (d != 4) { + fprintf(stderr, "FAIL: countMismatches all-diff got %d\n", d); + pass = false; + } + } + { + const char* a = "ATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCG"; + const char* b = "ATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCG"; + int d = countMismatches(a, b, 68); + if (d != 0) { + fprintf(stderr, "FAIL: countMismatches long-identical got %d\n", d); + pass = false; + } + } + { + const char* a = "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"; + const char* b = "TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT"; + int d = countMismatches(a, b, 66); + if (d != 66) { + fprintf(stderr, "FAIL: countMismatches long-alldiff got %d\n", d); + pass = false; + } + } + { + int d = countMismatches("A", "T", 0); + if (d != 0) { + fprintf(stderr, "FAIL: countMismatches len=0 got %d\n", d); + pass = false; + } + } + + return pass; +} + +} // namespace fastplong_simd + +#endif // HWY_ONCE +``` + +NOTE: There is a bug in the scalar reference above — `nBase++` should be `nBaseNum++`. Fix it when implementing. Copy the exact code from fastp's `src/simd.cpp` as the reference, adjusting only the namespace from `fastp_simd` to `fastplong_simd`. + +**Step 3: Verify it compiles** + +Run: `make clean && make -j 2>&1 | tail -10` +Expected: Compiles without errors (simd.cpp may generate warnings but no errors) + +**Step 4: Commit** + +```bash +git add src/simd.h src/simd.cpp +git commit -m "feat: add centralized SIMD module with Highway multi-target dispatch" +``` + +--- + +### Task 4: Add SIMD unit test + +**Files:** +- Create: `test/simd_test.cpp` + +**Step 1: Create the test file** + +```cpp +#include +#include "../src/simd.h" + +TEST(SimdTest, allTests) { + EXPECT_TRUE(fastplong_simd::testSimd()); +} +``` + +**Step 2: Run tests** + +Run: `make clean && make test 2>&1 | tail -20` +Expected: All tests pass including SimdTest.allTests + +**Step 3: Commit** + +```bash +git add test/simd_test.cpp +git commit -m "test: add SIMD unit tests" +``` + +--- + +### Task 5: Refactor filter.cpp to use SIMD + +**Files:** +- Modify: `src/filter.cpp` + +**Step 1: Update filter.cpp** + +Add include at the top (after existing includes): +```cpp +#include "simd.h" +``` + +Replace the scalar loop in `passFilter()` (the `for(int i=0; iqualfilter.qualifiedQual, + lowQualNum, nBaseNum, totalQual); +``` + +Replace the scalar loop in `passLowComplexityFilter()` (the `for(int i=0; i&1 | tail -20` +Expected: All tests pass (FilterTest.trimAndCut still passes) + +**Step 3: Commit** + +```bash +git add src/filter.cpp +git commit -m "perf: use SIMD countQualityMetrics and countAdjacentDiffs in filter" +``` + +--- + +### Task 6: Refactor sequence.cpp to use SIMD + +**Files:** +- Modify: `src/sequence.cpp` + +**Step 1: Replace entire sequence.cpp** + +```cpp +#include "sequence.h" +#include "simd.h" + +Sequence::Sequence(){ +} + +Sequence::Sequence(string* seq){ + mStr = seq; +} + +Sequence::~Sequence(){ + if(mStr) + delete mStr; +} + +void Sequence::print(){ + std::cerr << *mStr; +} + +int Sequence::length(){ + return mStr->length(); +} + +string Sequence::reverseComplement(string* origin) { + int len = origin->length(); + string str(len, 0); + fastplong_simd::reverseComplement(origin->c_str(), &str[0], len); + return str; +} + +Sequence Sequence::reverseComplement() { + string* reversed = new string(Sequence::reverseComplement(mStr)); + return Sequence(reversed); +} + +Sequence Sequence::operator~(){ + return reverseComplement(); +} +``` + +Key changes: +- Removed `#include `, `#include `, `#include `, `#include "simdutil.h"` +- Removed `namespace hn = hwy::HWY_NAMESPACE;` +- Simplified `reverseComplement(string*)` to a 4-line function calling `fastplong_simd::reverseComplement` +- Removed the `#if _MSC_VER` / VLA / AllocateAligned branching + +**Step 2: Run tests** + +Run: `make clean && make test 2>&1 | tail -20` +Expected: All tests pass (SequenceTests.reverse still passes) + +**Step 3: Commit** + +```bash +git add src/sequence.cpp +git commit -m "refactor: use centralized SIMD reverseComplement in sequence.cpp" +``` + +--- + +### Task 7: Refactor adaptertrimmer.cpp to use SIMD + +**Files:** +- Modify: `src/adaptertrimmer.cpp` + +**Step 1: Update adaptertrimmer.cpp** + +Replace includes — change: +```cpp +#include "hwy/highway.h" +``` +to: +```cpp +#include "simd.h" +``` + +In `searchAdapter()`, remove the Highway setup at lines 61-63: +```cpp + namespace hn = hwy::HWY_NAMESPACE; + hn::ScalableTag d8; + const size_t N = hn::Lanes(d8); +``` + +Replace the three inner SIMD loops. Each occurrence of this pattern (appears at lines 89-96, 115-122, 139-146): +```cpp + size_t mismatch = 0; + for (size_t i = 0; i < alen; i += N) + { + const size_t lanesToLoad = min(alen - i, N); + const auto rdata_v = hn::LoadN(d8, reinterpret_cast(rdata + i + p), lanesToLoad); + const auto adata_v = hn::LoadN(d8, reinterpret_cast(adata + i), lanesToLoad); + const auto mismatch_mask = rdata_v != adata_v; + mismatch += hn::CountTrue(d8, mismatch_mask); + } +``` + +Replace each with: +```cpp + int mismatch = fastplong_simd::countMismatches(rdata + p, adata, alen); +``` + +Also change `size_t mismatch` → `int mismatch` in the comparison logic (the variable type is returned as `int` by `countMismatches`). + +**Step 2: Run tests** + +Run: `make clean && make test 2>&1 | tail -20` +Expected: All tests pass (AdapterTrimmer tests still pass) + +**Step 3: Commit** + +```bash +git add src/adaptertrimmer.cpp +git commit -m "refactor: use centralized SIMD countMismatches in adaptertrimmer" +``` + +--- + +### Task 8: Remove simdutil.h + +**Files:** +- Delete: `src/simdutil.h` + +**Step 1: Verify no remaining references** + +Run: `grep -r "simdutil" src/ test/` +Expected: No output (no files reference simdutil.h anymore after Task 6) + +**Step 2: Delete the file** + +```bash +rm src/simdutil.h +``` + +**Step 3: Run full build and tests** + +Run: `make clean && make test 2>&1 | tail -20` +Expected: All tests pass, no compilation errors + +**Step 4: Commit** + +```bash +git add -A src/simdutil.h +git commit -m "cleanup: remove simdutil.h, functionality merged into simd.cpp" +``` + +--- + +### Task 9: Final verification + +**Step 1: Clean build** + +Run: `make clean && make -j 2>&1` +Expected: Clean compilation with no errors or warnings + +**Step 2: Run all tests** + +Run: `make test 2>&1` +Expected: All tests pass + +**Step 3: Quick smoke test with real data (if available)** + +Run: `./fastplong --help` +Expected: Help output appears normally + +**Step 4: Verify no remaining direct Highway includes in source files (except simd.cpp)** + +Run: `grep -r '#include.*hwy/' src/ | grep -v simd.cpp` +Expected: No output — only simd.cpp should include Highway headers directly diff --git a/src/adaptertrimmer.cpp b/src/adaptertrimmer.cpp index ccc5ed5..b95c8ea 100644 --- a/src/adaptertrimmer.cpp +++ b/src/adaptertrimmer.cpp @@ -1,7 +1,7 @@ #include "adaptertrimmer.h" #include "editdistance.h" #include -#include "hwy/highway.h" +#include "simd.h" AdapterTrimmer::AdapterTrimmer(){ } @@ -58,10 +58,6 @@ int AdapterTrimmer::trimByMultiSequences(Read* r, FilterResult* fr, vector d8; - const size_t N = hn::Lanes(d8); - int minMismatch = 99999; // initialized with a large mismatch int pos = -1; // for the best match @@ -86,15 +82,7 @@ int AdapterTrimmer::searchAdapter(string *read, const string &adapter, double ed // go from left, and return immediatedly if find a mismatch < threshold for (int p = searchStart; p < searchEnd - alen; p++) { - size_t mismatch = 0; - for (size_t i = 0; i < alen; i += N) - { - const size_t lanesToLoad = min(alen - i, N); - const auto rdata_v = hn::LoadN(d8, reinterpret_cast(rdata + i + p), lanesToLoad); - const auto adata_v = hn::LoadN(d8, reinterpret_cast(adata + i), lanesToLoad); - const auto mismatch_mask = rdata_v != adata_v; - mismatch += hn::CountTrue(d8, mismatch_mask); - } + int mismatch = fastplong_simd::countMismatches(rdata + p, adata, alen); if (mismatch <= threshold) { @@ -112,15 +100,7 @@ int AdapterTrimmer::searchAdapter(string *read, const string &adapter, double ed // go from right, and return immediatedly if find a mismatch <= threshold for (int p = searchEnd - alen; p >= searchStart; p--) { - size_t mismatch = 0; - for (size_t i = 0; i < alen; i += N) - { - const size_t lanesToLoad = min(alen - i, N); - const auto rdata_v = hn::LoadN(d8, reinterpret_cast(rdata + i + p), lanesToLoad); - const auto adata_v = hn::LoadN(d8, reinterpret_cast(adata + i), lanesToLoad); - const auto mismatch_mask = rdata_v != adata_v; - mismatch += hn::CountTrue(d8, mismatch_mask); - } + int mismatch = fastplong_simd::countMismatches(rdata + p, adata, alen); if (mismatch <= threshold) { return p; @@ -136,15 +116,7 @@ int AdapterTrimmer::searchAdapter(string *read, const string &adapter, double ed { for (int p = searchStart; p < searchEnd - alen; p++) { - size_t mismatch = 0; - for (size_t i = 0; i < alen; i += N) - { - const size_t lanesToLoad = min(alen - i, N); - const auto rdata_v = hn::LoadN(d8, reinterpret_cast(rdata + i + p), lanesToLoad); - const auto adata_v = hn::LoadN(d8, reinterpret_cast(adata + i), lanesToLoad); - const auto mismatch_mask = rdata_v != adata_v; - mismatch += hn::CountTrue(d8, mismatch_mask); - } + int mismatch = fastplong_simd::countMismatches(rdata + p, adata, alen); if (mismatch < minMismatch) { minMismatch = mismatch; diff --git a/src/filter.cpp b/src/filter.cpp index 07847ef..df1d275 100644 --- a/src/filter.cpp +++ b/src/filter.cpp @@ -1,5 +1,6 @@ #include "processor.h" #include "seprocessor.h" +#include "simd.h" Filter::Filter(Options* opt){ mOptions = opt; @@ -24,18 +25,9 @@ int Filter::passFilter(Read* r) { const char* seqstr = r->mSeq->c_str(); const char* qualstr = r->mQuality->c_str(); - for(int i=0; iqualfilter.qualifiedQual) - lowQualNum ++; - - if(base == 'N') - nBaseNum++; - } + fastplong_simd::countQualityMetrics(qualstr, seqstr, rlen, + mOptions->qualfilter.qualifiedQual, + lowQualNum, nBaseNum, totalQual); } if(mOptions->qualfilter.enabled) { @@ -65,19 +57,12 @@ int Filter::passFilter(Read* r) { } bool Filter::passLowComplexityFilter(Read* r) { - int diff = 0; int length = r->length(); if(length <= 1) return false; const char* data = r->mSeq->c_str(); - for(int i=0; i= mOptions->complexityFilter.threshold ) - return true; - else - return false; + int diff = fastplong_simd::countAdjacentDiffs(data, length); + return (double)diff/(double)(length-1) >= mOptions->complexityFilter.threshold; } vector> Filter::detectLowQualityRegions(Read* r, int windowSize, int quality) { diff --git a/src/sequence.cpp b/src/sequence.cpp index 7ec0c42..0adb678 100644 --- a/src/sequence.cpp +++ b/src/sequence.cpp @@ -1,10 +1,5 @@ #include "sequence.h" -#include -#include -#include -#include "simdutil.h" - -namespace hn = hwy::HWY_NAMESPACE; +#include "simd.h" Sequence::Sequence(){ } @@ -26,54 +21,11 @@ int Sequence::length(){ return mStr->length(); } -string Sequence::reverseComplement(string *HWY_RESTRICT origin) { - auto length = origin->length(); - const hn::ScalableTag d; - const auto sequence = reinterpret_cast(origin->c_str()); - const auto transform = [](const auto d, auto output, const auto sequence) HWY_ATTR - { - const auto A = hn::Set(d, 'A'); - const auto T = hn::Set(d, 'T'); - const auto C = hn::Set(d, 'C'); - const auto G = hn::Set(d, 'G'); - const auto a = hn::Set(d, 'a'); - const auto t = hn::Set(d, 't'); - const auto c = hn::Set(d, 'c'); - const auto g = hn::Set(d, 'g'); - const auto N = hn::Set(d, 'N'); - - // output[i] = sequence[i] == 'A' || sequence[i] == 'a' ? 'T' : 'N' - output = hn::IfThenElse(hn::Or(hn::Eq(sequence, A), hn::Eq(sequence, a)), T, N); - output = hn::IfThenElse(hn::Or(hn::Eq(sequence, T), hn::Eq(sequence, t)), A, output); - output = hn::IfThenElse(hn::Or(hn::Eq(sequence, C), hn::Eq(sequence, c)), G, output); - output = hn::IfThenElse(hn::Or(hn::Eq(sequence, G), hn::Eq(sequence, g)), C, output); - return output; - }; -#if _MSC_VER - if (true) { -#else - if (length <= 1000000) { -#endif -#if _MSC_VER - auto outputPtr = std::make_unique(length); - uint8_t* output = outputPtr.get(); -#else - uint8_t output[length]; -#endif - hn::Transform1Reversed(d, output, length, sequence, transform); - auto retVal = reinterpret_cast(output); - std::string reversed(retVal, length); - return reversed; - } -#ifndef _MSC_VER - else { - const auto allocated = hwy::AllocateAligned(length); - hn::Transform1Reversed(d, allocated.get(), length, sequence, transform); - auto retVal = reinterpret_cast(allocated.get()); - std::string reversed(retVal, length); - return reversed; - } -#endif +string Sequence::reverseComplement(string* origin) { + int len = origin->length(); + string str(len, 0); + fastplong_simd::reverseComplement(origin->c_str(), &str[0], len); + return str; } Sequence Sequence::reverseComplement() { diff --git a/src/simdutil.h b/src/simdutil.h deleted file mode 100644 index 171fbdf..0000000 --- a/src/simdutil.h +++ /dev/null @@ -1,38 +0,0 @@ -#pragma once -#include -#include "hwy/highway.h" - -HWY_BEFORE_NAMESPACE(); -namespace hwy { -namespace HWY_NAMESPACE { - -template > -void Transform1Reversed(D d, T* HWY_RESTRICT inout, size_t count, - const T* HWY_RESTRICT in1, const Func& func) { - const size_t N = hwy::HWY_NAMESPACE::Lanes(d); - - size_t idx = 0; - if (count >= N) { - for (; idx <= count - N; idx += N) { - const Vec v = LoadU(d, inout + idx); - const Vec v1 = LoadU(d, in1 + idx); - StoreU(Reverse(d, func(d, v, v1)), d, inout + count - N - idx); - } - } - - // `count` was a multiple of the vector length `N`: already done. - if (HWY_UNLIKELY(idx == count)) return; - - const size_t remaining = count - idx; - HWY_DASSERT(0 != remaining && remaining < N); - const Vec v = LoadN(d, inout + idx, remaining); - const Vec v1 = LoadN(d, in1 + idx, remaining); - StoreN( - SlideDownLanes( - d, Reverse(d, func(d, v, v1)), N - remaining), - d, inout, remaining); -} - -} -} -HWY_AFTER_NAMESPACE(); diff --git a/test/simd_test.cpp b/test/simd_test.cpp new file mode 100644 index 0000000..04625a9 --- /dev/null +++ b/test/simd_test.cpp @@ -0,0 +1,6 @@ +#include +#include "../src/simd.h" + +TEST(SimdTest, allTests) { + EXPECT_TRUE(fastplong_simd::testSimd()); +} From 7c185aebc7125e2273882b3044470b59758bb686 Mon Sep 17 00:00:00 2001 From: Kim Yang Date: Thu, 26 Feb 2026 22:37:59 +0800 Subject: [PATCH 05/14] docs: fix overflow comment in CountQualityMetricsImpl Co-Authored-By: Claude Opus 4.6 --- src/simd.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/simd.cpp b/src/simd.cpp index 056f6f0..76b6b21 100644 --- a/src/simd.cpp +++ b/src/simd.cpp @@ -27,9 +27,9 @@ void CountQualityMetricsImpl(const char* qualstr, const char* seqstr, int len, int qualSum = 0; int i = 0; - // Process in chunks, accumulating into wider types to avoid overflow. - // Process in blocks of up to 255 vectors to avoid u16 overflow during - // quality sum accumulation (255 * 255 = 65025 fits in u16). + // Process in blocks of up to 255 vectors to avoid u16 overflow. + // SumsOf2 produces at most 2*93 = 186 per u16 lane (valid FASTQ range). + // Over 255 iterations: 255 * 186 = 47430, safely within u16 (max 65535). const int blockSize = 255 * N; for (int blockStart = 0; blockStart < len; blockStart += blockSize) { From 4ab790fea3b984289041fb347527efcb463e1451 Mon Sep 17 00:00:00 2001 From: Kim Yang Date: Thu, 26 Feb 2026 22:54:54 +0800 Subject: [PATCH 06/14] test: add ONT dummy data generator for e2e benchmarking Co-Authored-By: Claude Opus 4.6 --- .gitignore | 9 +++ testdata/gen_ont_reads.py | 153 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 162 insertions(+) create mode 100644 testdata/gen_ont_reads.py diff --git a/.gitignore b/.gitignore index a14e5af..3530bdb 100644 --- a/.gitignore +++ b/.gitignore @@ -31,3 +31,12 @@ *.out *.app bin +fastplong + +# Test data (generated) +testdata/*.fastq +testdata/*.fastq.gz + +# Reports +fastplong.html +fastplong.json diff --git a/testdata/gen_ont_reads.py b/testdata/gen_ont_reads.py new file mode 100644 index 0000000..5796cc5 --- /dev/null +++ b/testdata/gen_ont_reads.py @@ -0,0 +1,153 @@ +#!/usr/bin/env python3 +"""Generate dummy ONT (Oxford Nanopore) FASTQ reads for benchmarking. + +Produces 10,000 reads with realistic ONT characteristics: +- Read lengths: log-normal distribution, median ~5kb, range ~500bp-50kb +- Quality scores: mean ~Q10-Q15 (typical ONT), with some low-quality regions +- Random ATCG sequence with occasional N bases +- Adapter sequences at start/end of some reads +""" + +import random +import gzip +import math +import os + +SEED = 42 +NUM_READS = 10_000 +OUTPUT_DIR = os.path.dirname(os.path.abspath(__file__)) + +# ONT adapter sequences (real ones from Oxford Nanopore) +START_ADAPTER = "AATGTACTTCGTTCAGTTACGTATTGCT" +END_ADAPTER = "GCAATACGTAACTGAACGAAGT" + +BASES = "ACGT" + + +def random_quality(length: int) -> str: + """Generate ONT-like quality string (phred+33). + + ONT typical range: Q5-Q20, with occasional dips. + """ + quals = [] + # Start with a baseline quality that varies per read + base_qual = random.gauss(12, 3) # mean Q12 + + for i in range(length): + # Occasional low-quality regions (simulating systematic errors) + if random.random() < 0.02: + q = max(0, min(40, int(random.gauss(5, 2)))) + else: + q = max(0, min(40, int(random.gauss(base_qual, 4)))) + quals.append(chr(q + 33)) + return "".join(quals) + + +def random_sequence(length: int) -> str: + """Generate random DNA sequence with occasional N bases.""" + seq = [] + for _ in range(length): + if random.random() < 0.005: # 0.5% N rate + seq.append("N") + else: + seq.append(random.choice(BASES)) + return "".join(seq) + + +def generate_read_length() -> int: + """Log-normal distribution mimicking ONT read lengths.""" + # median ~5000, with long tail up to ~50000 + length = int(random.lognormvariate(math.log(5000), 0.7)) + return max(200, min(100000, length)) + + +def write_fastq(filepath: str, reads: list, compress: bool = False): + """Write reads to FASTQ file, optionally gzipped.""" + opener = gzip.open if compress else open + mode = "wt" if compress else "w" + + with opener(filepath, mode) as f: + for read_id, seq, qual in reads: + f.write(f"@{read_id}\n") + f.write(f"{seq}\n") + f.write("+\n") + f.write(f"{qual}\n") + + +def main(): + random.seed(SEED) + + reads = [] + total_bases = 0 + + for i in range(NUM_READS): + length = generate_read_length() + + # Some reads have adapters (30% start, 30% end, 10% both) + seq = "" + r = random.random() + if r < 0.10: + # Both adapters + core_len = max(100, length - len(START_ADAPTER) - len(END_ADAPTER)) + seq = START_ADAPTER + random_sequence(core_len) + END_ADAPTER + elif r < 0.40: + # Start adapter only + core_len = max(100, length - len(START_ADAPTER)) + seq = START_ADAPTER + random_sequence(core_len) + elif r < 0.70: + # End adapter only + core_len = max(100, length - len(END_ADAPTER)) + seq = random_sequence(core_len) + END_ADAPTER + else: + seq = random_sequence(length) + + actual_len = len(seq) + qual = random_quality(actual_len) + + read_id = ( + f"ont_read_{i:06d} " + f"runid=dummy_run_001 " + f"read={i} " + f"ch={random.randint(1, 512)} " + f"start_time=2024-01-01T00:00:{i % 60:02d}Z " + f"flow_cell_id=FAK00001 " + f"protocol_group_id=benchmark " + f"sample_id=dummy_ont" + ) + + reads.append((read_id, seq, qual)) + total_bases += actual_len + + # Write uncompressed + fq_path = os.path.join(OUTPUT_DIR, "ont_10k.fastq") + write_fastq(fq_path, reads, compress=False) + + # Write gzipped + gz_path = os.path.join(OUTPUT_DIR, "ont_10k.fastq.gz") + write_fastq(gz_path, reads, compress=True) + + # Stats + lengths = [len(seq) for _, seq, _ in reads] + lengths.sort() + print(f"Generated {NUM_READS} ONT reads") + print(f"Total bases: {total_bases:,}") + print(f"Mean length: {total_bases // NUM_READS:,}") + print(f"Median length: {lengths[len(lengths)//2]:,}") + print(f"Min length: {lengths[0]:,}") + print(f"Max length: {lengths[-1]:,}") + print(f"N50: {calculate_n50(lengths, total_bases):,}") + print(f"Written: {fq_path} ({os.path.getsize(fq_path) / 1024 / 1024:.1f} MB)") + print(f"Written: {gz_path} ({os.path.getsize(gz_path) / 1024 / 1024:.1f} MB)") + + +def calculate_n50(sorted_lengths: list, total_bases: int) -> int: + cumsum = 0 + for l in reversed(sorted_lengths): + cumsum += l + if cumsum >= total_bases / 2: + return l + return sorted_lengths[-1] + + +if __name__ == "__main__": + main() From 827639d545c531f206c44c65258ca750581613f4 Mon Sep 17 00:00:00 2001 From: Kim Yang Date: Thu, 26 Feb 2026 23:39:59 +0800 Subject: [PATCH 07/14] build: vendor isa-l and libdeflate as submodules with dual-mode Makefile Add isa-l (v2.31.0) and libdeflate (v1.25) as git submodules matching fastp's vendoring pattern. Default `make` links system libs; `make static` builds from submodules. Remove stale src/libdeflate.h (v1.6) in favor of third_party/libdeflate/libdeflate.h (v1.25). Fix igzip include to match fastp's flat include style. Co-Authored-By: Claude Opus 4.6 --- .gitmodules | 6 + Makefile | 63 +++++-- src/fastqreader.h | 2 +- src/libdeflate.h | 362 ----------------------------------------- third_party/isa-l | 1 + third_party/libdeflate | 1 + 6 files changed, 58 insertions(+), 377 deletions(-) delete mode 100644 src/libdeflate.h create mode 160000 third_party/isa-l create mode 160000 third_party/libdeflate diff --git a/.gitmodules b/.gitmodules index e7ed026..77c1e52 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,9 @@ [submodule "third_party/highway"] path = third_party/highway url = https://github.com/google/highway.git +[submodule "third_party/isa-l"] + path = third_party/isa-l + url = https://github.com/intel/isa-l.git +[submodule "third_party/libdeflate"] + path = third_party/libdeflate + url = https://github.com/ebiggers/libdeflate.git diff --git a/Makefile b/Makefile index 576d95a..1e1c12d 100644 --- a/Makefile +++ b/Makefile @@ -3,11 +3,13 @@ DIR_SRC := ./src DIR_OBJ := ./obj DIR_TEST := ./test DIR_HWY := ./third_party/highway +DIR_ISAL := ./third_party/isa-l +DIR_LIBDEFLATE := ./third_party/libdeflate PREFIX ?= /usr/local BINDIR ?= $(PREFIX)/bin -INCLUDE_DIRS ?= /opt/homebrew/include -LIBRARY_DIRS ?= /opt/homebrew/lib +INCLUDE_DIRS ?= +LIBRARY_DIRS ?= SRC := $(wildcard ${DIR_SRC}/*.cpp) TEST := $(wildcard ${DIR_TEST}/*.cpp) @@ -24,18 +26,47 @@ BIN_TARGET := ${TARGET} TEST_TARGET := bin/fastplong_unittest CXX ?= g++ -CXXFLAGS := -std=c++17 -pthread -g -O3 -MP -MD -I. -I${DIR_HWY} -I${DIR_INC} -I${DIR_SRC} $(foreach includedir,$(INCLUDE_DIRS),-I$(includedir)) ${CXXFLAGS} +CXXFLAGS := -std=c++17 -pthread -g -O3 -MD -MP \ + -I. -I${DIR_INC} -I${DIR_SRC} -I${DIR_HWY} \ + -I${DIR_ISAL}/include -I${DIR_LIBDEFLATE} \ + $(foreach includedir,$(INCLUDE_DIRS),-I$(includedir)) \ + ${CXXFLAGS} + +# --- Link mode (default): link against system-installed libraries --- LIBS := -lisal -ldeflate -lpthread -STATIC_FLAGS := -static -L. -Wl,--no-as-needed -pthread LD_FLAGS := $(foreach librarydir,$(LIBRARY_DIRS),-L$(librarydir)) $(LIBS) $(LD_FLAGS) -STATIC_LD_FLAGS := $(foreach librarydir,$(LIBRARY_DIRS),-L$(librarydir)) $(STATIC_FLAGS) $(LIBS) $(STATIC_LD_FLAGS) +# --- Static mode: build from submodules and link statically --- +ISAL_LIB := $(DIR_ISAL)/bin/isa-l.a +LIBDEFLATE_LIB := $(DIR_LIBDEFLATE)/build/libdeflate.a + +# On Linux: fully static binary; on macOS: static libs, dynamic system runtime +UNAME_S := $(shell uname -s) +ifeq ($(UNAME_S),Linux) + STATIC_LD_FLAGS := -static -Wl,--no-as-needed -lpthread +else + STATIC_LD_FLAGS := -lpthread +endif -${BIN_TARGET}:${OBJ} +# Default target: link against system libraries +${BIN_TARGET}: ${OBJ} $(CXX) $(OBJ) -o $@ $(LD_FLAGS) -static:${OBJ} - $(CXX) $(OBJ) -o ${BIN_TARGET} $(STATIC_LD_FLAGS) +# Static target: build deps from submodules and link statically +static: $(ISAL_LIB) $(LIBDEFLATE_LIB) ${OBJ} + $(CXX) $(OBJ) -o ${BIN_TARGET} $(ISAL_LIB) $(LIBDEFLATE_LIB) $(STATIC_LD_FLAGS) + +# Build isa-l static library from submodule +$(ISAL_LIB): + $(MAKE) -C $(DIR_ISAL) -f Makefile.unx lib + +# Build libdeflate static library from submodule +$(LIBDEFLATE_LIB): + cd $(DIR_LIBDEFLATE) && cmake -B build \ + -DCMAKE_BUILD_TYPE=Release \ + -DLIBDEFLATE_BUILD_SHARED_LIB=OFF \ + -DLIBDEFLATE_BUILD_GZIP=OFF && \ + cmake --build build ${DIR_OBJ}/%.o:${DIR_SRC}/%.cpp @mkdir -p $(@D) @@ -50,27 +81,31 @@ ${DIR_OBJ}/hwy_abort.o:${DIR_HWY}/hwy/abort.cc @mkdir -p $(@D) $(CXX) -c $< -o $@ $(CXXFLAGS) -.PHONY:clean -.PHONY:static +.PHONY: clean static install clean-deps + clean: @rm -rf $(DIR_OBJ) @rm -f $(TARGET) @rm -f $(TEST_TARGET) +clean-deps: + -$(MAKE) -C $(DIR_ISAL) -f Makefile.unx clean 2>/dev/null || true + -rm -rf $(DIR_LIBDEFLATE)/build 2>/dev/null || true + install: install $(TARGET) $(BINDIR)/$(TARGET) @echo "Installed." ${DIR_OBJ}/%.o:${DIR_TEST}/%.cpp - @mkdir -p $(@D) + @mkdir -p $(@D) $(CXX) -c $< -o $@ $(CXXFLAGS) -test-static: ${TEST_OBJ} ${OBJ} +test-static: $(ISAL_LIB) $(LIBDEFLATE_LIB) ${TEST_OBJ} ${OBJ} @mkdir -p bin - $(CXX) $(TEST_OBJ) ${OBJ:./obj/main.o=} -o ${TEST_TARGET} $(STATIC_LD_FLAGS) -lgtest -lgtest_main + $(CXX) $(TEST_OBJ) ${OBJ:./obj/main.o=} -o ${TEST_TARGET} $(ISAL_LIB) $(LIBDEFLATE_LIB) $(foreach librarydir,$(LIBRARY_DIRS),-L$(librarydir)) $(STATIC_LD_FLAGS) -lgtest -lgtest_main ./${TEST_TARGET} -test:${TEST_OBJ} ${OBJ} +test: ${TEST_OBJ} ${OBJ} @mkdir -p bin $(CXX) $(TEST_OBJ) ${OBJ:./obj/main.o=} -o ${TEST_TARGET} $(LD_FLAGS) -lgtest -lgtest_main ./${TEST_TARGET} diff --git a/src/fastqreader.h b/src/fastqreader.h index 3ff41c1..5c9dfe6 100644 --- a/src/fastqreader.h +++ b/src/fastqreader.h @@ -32,7 +32,7 @@ SOFTWARE. #include #include #include "readpool.h" -#include +#include "igzip_lib.h" class FastqReader{ public: diff --git a/src/libdeflate.h b/src/libdeflate.h deleted file mode 100644 index 4e124e7..0000000 --- a/src/libdeflate.h +++ /dev/null @@ -1,362 +0,0 @@ -/* - * libdeflate.h - public header for libdeflate - */ - -#ifndef LIBDEFLATE_H -#define LIBDEFLATE_H - -#ifdef __cplusplus -extern "C" { -#endif - -#define LIBDEFLATE_VERSION_MAJOR 1 -#define LIBDEFLATE_VERSION_MINOR 6 -#define LIBDEFLATE_VERSION_STRING "1.6" - -#include -#include - -/* - * On Windows, if you want to link to the DLL version of libdeflate, then - * #define LIBDEFLATE_DLL. Note that the calling convention is "stdcall". - */ -#ifdef LIBDEFLATE_DLL -# ifdef BUILDING_LIBDEFLATE -# define LIBDEFLATEEXPORT LIBEXPORT -# elif defined(_WIN32) || defined(__CYGWIN__) -# define LIBDEFLATEEXPORT __declspec(dllimport) -# endif -#endif -#ifndef LIBDEFLATEEXPORT -# define LIBDEFLATEEXPORT -#endif - -#if defined(_WIN32) && !defined(_WIN64) -# define LIBDEFLATEAPI_ABI __stdcall -#else -# define LIBDEFLATEAPI_ABI -#endif - -#if defined(BUILDING_LIBDEFLATE) && defined(__GNUC__) && \ - defined(_WIN32) && !defined(_WIN64) - /* - * On 32-bit Windows, gcc assumes 16-byte stack alignment but MSVC only 4. - * Realign the stack when entering libdeflate to avoid crashing in SSE/AVX - * code when called from an MSVC-compiled application. - */ -# define LIBDEFLATEAPI_STACKALIGN __attribute__((force_align_arg_pointer)) -#else -# define LIBDEFLATEAPI_STACKALIGN -#endif - -#define LIBDEFLATEAPI LIBDEFLATEAPI_ABI LIBDEFLATEAPI_STACKALIGN - -/* ========================================================================== */ -/* Compression */ -/* ========================================================================== */ - -struct libdeflate_compressor; - -/* - * libdeflate_alloc_compressor() allocates a new compressor that supports - * DEFLATE, zlib, and gzip compression. 'compression_level' is the compression - * level on a zlib-like scale but with a higher maximum value (1 = fastest, 6 = - * medium/default, 9 = slow, 12 = slowest). The return value is a pointer to - * the new compressor, or NULL if out of memory. - * - * Note: for compression, the sliding window size is defined at compilation time - * to 32768, the largest size permissible in the DEFLATE format. It cannot be - * changed at runtime. - * - * A single compressor is not safe to use by multiple threads concurrently. - * However, different threads may use different compressors concurrently. - */ -LIBDEFLATEEXPORT struct libdeflate_compressor * LIBDEFLATEAPI -libdeflate_alloc_compressor(int compression_level); - -/* - * libdeflate_deflate_compress() performs raw DEFLATE compression on a buffer of - * data. The function attempts to compress 'in_nbytes' bytes of data located at - * 'in' and write the results to 'out', which has space for 'out_nbytes_avail' - * bytes. The return value is the compressed size in bytes, or 0 if the data - * could not be compressed to 'out_nbytes_avail' bytes or fewer. - */ -LIBDEFLATEEXPORT size_t LIBDEFLATEAPI -libdeflate_deflate_compress(struct libdeflate_compressor *compressor, - const void *in, size_t in_nbytes, - void *out, size_t out_nbytes_avail); - -/* - * libdeflate_deflate_compress_bound() returns a worst-case upper bound on the - * number of bytes of compressed data that may be produced by compressing any - * buffer of length less than or equal to 'in_nbytes' using - * libdeflate_deflate_compress() with the specified compressor. Mathematically, - * this bound will necessarily be a number greater than or equal to 'in_nbytes'. - * It may be an overestimate of the true upper bound. The return value is - * guaranteed to be the same for all invocations with the same compressor and - * same 'in_nbytes'. - * - * As a special case, 'compressor' may be NULL. This causes the bound to be - * taken across *any* libdeflate_compressor that could ever be allocated with - * this build of the library, with any options. - * - * Note that this function is not necessary in many applications. With - * block-based compression, it is usually preferable to separately store the - * uncompressed size of each block and to store any blocks that did not compress - * to less than their original size uncompressed. In that scenario, there is no - * need to know the worst-case compressed size, since the maximum number of - * bytes of compressed data that may be used would always be one less than the - * input length. You can just pass a buffer of that size to - * libdeflate_deflate_compress() and store the data uncompressed if - * libdeflate_deflate_compress() returns 0, indicating that the compressed data - * did not fit into the provided output buffer. - */ -LIBDEFLATEEXPORT size_t LIBDEFLATEAPI -libdeflate_deflate_compress_bound(struct libdeflate_compressor *compressor, - size_t in_nbytes); - -/* - * Like libdeflate_deflate_compress(), but stores the data in the zlib wrapper - * format. - */ -LIBDEFLATEEXPORT size_t LIBDEFLATEAPI -libdeflate_zlib_compress(struct libdeflate_compressor *compressor, - const void *in, size_t in_nbytes, - void *out, size_t out_nbytes_avail); - -/* - * Like libdeflate_deflate_compress_bound(), but assumes the data will be - * compressed with libdeflate_zlib_compress() rather than with - * libdeflate_deflate_compress(). - */ -LIBDEFLATEEXPORT size_t LIBDEFLATEAPI -libdeflate_zlib_compress_bound(struct libdeflate_compressor *compressor, - size_t in_nbytes); - -/* - * Like libdeflate_deflate_compress(), but stores the data in the gzip wrapper - * format. - */ -LIBDEFLATEEXPORT size_t LIBDEFLATEAPI -libdeflate_gzip_compress(struct libdeflate_compressor *compressor, - const void *in, size_t in_nbytes, - void *out, size_t out_nbytes_avail); - -/* - * Like libdeflate_deflate_compress_bound(), but assumes the data will be - * compressed with libdeflate_gzip_compress() rather than with - * libdeflate_deflate_compress(). - */ -LIBDEFLATEEXPORT size_t LIBDEFLATEAPI -libdeflate_gzip_compress_bound(struct libdeflate_compressor *compressor, - size_t in_nbytes); - -/* - * libdeflate_free_compressor() frees a compressor that was allocated with - * libdeflate_alloc_compressor(). If a NULL pointer is passed in, no action is - * taken. - */ -LIBDEFLATEEXPORT void LIBDEFLATEAPI -libdeflate_free_compressor(struct libdeflate_compressor *compressor); - -/* ========================================================================== */ -/* Decompression */ -/* ========================================================================== */ - -struct libdeflate_decompressor; - -/* - * libdeflate_alloc_decompressor() allocates a new decompressor that can be used - * for DEFLATE, zlib, and gzip decompression. The return value is a pointer to - * the new decompressor, or NULL if out of memory. - * - * This function takes no parameters, and the returned decompressor is valid for - * decompressing data that was compressed at any compression level and with any - * sliding window size. - * - * A single decompressor is not safe to use by multiple threads concurrently. - * However, different threads may use different decompressors concurrently. - */ -LIBDEFLATEEXPORT struct libdeflate_decompressor * LIBDEFLATEAPI -libdeflate_alloc_decompressor(void); - -/* - * Result of a call to libdeflate_deflate_decompress(), - * libdeflate_zlib_decompress(), or libdeflate_gzip_decompress(). - */ -enum libdeflate_result { - /* Decompression was successful. */ - LIBDEFLATE_SUCCESS = 0, - - /* Decompressed failed because the compressed data was invalid, corrupt, - * or otherwise unsupported. */ - LIBDEFLATE_BAD_DATA = 1, - - /* A NULL 'actual_out_nbytes_ret' was provided, but the data would have - * decompressed to fewer than 'out_nbytes_avail' bytes. */ - LIBDEFLATE_SHORT_OUTPUT = 2, - - /* The data would have decompressed to more than 'out_nbytes_avail' - * bytes. */ - LIBDEFLATE_INSUFFICIENT_SPACE = 3, -}; - -/* - * libdeflate_deflate_decompress() decompresses the DEFLATE-compressed stream - * from the buffer 'in' with compressed size up to 'in_nbytes' bytes. The - * uncompressed data is written to 'out', a buffer with size 'out_nbytes_avail' - * bytes. If decompression succeeds, then 0 (LIBDEFLATE_SUCCESS) is returned. - * Otherwise, a nonzero result code such as LIBDEFLATE_BAD_DATA is returned. If - * a nonzero result code is returned, then the contents of the output buffer are - * undefined. - * - * Decompression stops at the end of the DEFLATE stream (as indicated by the - * BFINAL flag), even if it is actually shorter than 'in_nbytes' bytes. - * - * libdeflate_deflate_decompress() can be used in cases where the actual - * uncompressed size is known (recommended) or unknown (not recommended): - * - * - If the actual uncompressed size is known, then pass the actual - * uncompressed size as 'out_nbytes_avail' and pass NULL for - * 'actual_out_nbytes_ret'. This makes libdeflate_deflate_decompress() fail - * with LIBDEFLATE_SHORT_OUTPUT if the data decompressed to fewer than the - * specified number of bytes. - * - * - If the actual uncompressed size is unknown, then provide a non-NULL - * 'actual_out_nbytes_ret' and provide a buffer with some size - * 'out_nbytes_avail' that you think is large enough to hold all the - * uncompressed data. In this case, if the data decompresses to less than - * or equal to 'out_nbytes_avail' bytes, then - * libdeflate_deflate_decompress() will write the actual uncompressed size - * to *actual_out_nbytes_ret and return 0 (LIBDEFLATE_SUCCESS). Otherwise, - * it will return LIBDEFLATE_INSUFFICIENT_SPACE if the provided buffer was - * not large enough but no other problems were encountered, or another - * nonzero result code if decompression failed for another reason. - */ -LIBDEFLATEEXPORT enum libdeflate_result LIBDEFLATEAPI -libdeflate_deflate_decompress(struct libdeflate_decompressor *decompressor, - const void *in, size_t in_nbytes, - void *out, size_t out_nbytes_avail, - size_t *actual_out_nbytes_ret); - -/* - * Like libdeflate_deflate_decompress(), but adds the 'actual_in_nbytes_ret' - * argument. If decompression succeeds and 'actual_in_nbytes_ret' is not NULL, - * then the actual compressed size of the DEFLATE stream (aligned to the next - * byte boundary) is written to *actual_in_nbytes_ret. - */ -LIBDEFLATEEXPORT enum libdeflate_result LIBDEFLATEAPI -libdeflate_deflate_decompress_ex(struct libdeflate_decompressor *decompressor, - const void *in, size_t in_nbytes, - void *out, size_t out_nbytes_avail, - size_t *actual_in_nbytes_ret, - size_t *actual_out_nbytes_ret); - -/* - * Like libdeflate_deflate_decompress(), but assumes the zlib wrapper format - * instead of raw DEFLATE. - * - * Decompression will stop at the end of the zlib stream, even if it is shorter - * than 'in_nbytes'. If you need to know exactly where the zlib stream ended, - * use libdeflate_zlib_decompress_ex(). - */ -LIBDEFLATEEXPORT enum libdeflate_result LIBDEFLATEAPI -libdeflate_zlib_decompress(struct libdeflate_decompressor *decompressor, - const void *in, size_t in_nbytes, - void *out, size_t out_nbytes_avail, - size_t *actual_out_nbytes_ret); - -/* - * Like libdeflate_zlib_decompress(), but adds the 'actual_in_nbytes_ret' - * argument. If 'actual_in_nbytes_ret' is not NULL and the decompression - * succeeds (indicating that the first zlib-compressed stream in the input - * buffer was decompressed), then the actual number of input bytes consumed is - * written to *actual_in_nbytes_ret. - */ -LIBDEFLATEEXPORT enum libdeflate_result LIBDEFLATEAPI -libdeflate_zlib_decompress_ex(struct libdeflate_decompressor *decompressor, - const void *in, size_t in_nbytes, - void *out, size_t out_nbytes_avail, - size_t *actual_in_nbytes_ret, - size_t *actual_out_nbytes_ret); - -/* - * Like libdeflate_deflate_decompress(), but assumes the gzip wrapper format - * instead of raw DEFLATE. - * - * If multiple gzip-compressed members are concatenated, then only the first - * will be decompressed. Use libdeflate_gzip_decompress_ex() if you need - * multi-member support. - */ -LIBDEFLATEEXPORT enum libdeflate_result LIBDEFLATEAPI -libdeflate_gzip_decompress(struct libdeflate_decompressor *decompressor, - const void *in, size_t in_nbytes, - void *out, size_t out_nbytes_avail, - size_t *actual_out_nbytes_ret); - -/* - * Like libdeflate_gzip_decompress(), but adds the 'actual_in_nbytes_ret' - * argument. If 'actual_in_nbytes_ret' is not NULL and the decompression - * succeeds (indicating that the first gzip-compressed member in the input - * buffer was decompressed), then the actual number of input bytes consumed is - * written to *actual_in_nbytes_ret. - */ -LIBDEFLATEEXPORT enum libdeflate_result LIBDEFLATEAPI -libdeflate_gzip_decompress_ex(struct libdeflate_decompressor *decompressor, - const void *in, size_t in_nbytes, - void *out, size_t out_nbytes_avail, - size_t *actual_in_nbytes_ret, - size_t *actual_out_nbytes_ret); - -/* - * libdeflate_free_decompressor() frees a decompressor that was allocated with - * libdeflate_alloc_decompressor(). If a NULL pointer is passed in, no action - * is taken. - */ -LIBDEFLATEEXPORT void LIBDEFLATEAPI -libdeflate_free_decompressor(struct libdeflate_decompressor *decompressor); - -/* ========================================================================== */ -/* Checksums */ -/* ========================================================================== */ - -/* - * libdeflate_adler32() updates a running Adler-32 checksum with 'len' bytes of - * data and returns the updated checksum. When starting a new checksum, the - * required initial value for 'adler' is 1. This value is also returned when - * 'buffer' is specified as NULL. - */ -LIBDEFLATEEXPORT uint32_t LIBDEFLATEAPI -libdeflate_adler32(uint32_t adler32, const void *buffer, size_t len); - - -/* - * libdeflate_crc32() updates a running CRC-32 checksum with 'len' bytes of data - * and returns the updated checksum. When starting a new checksum, the required - * initial value for 'crc' is 0. This value is also returned when 'buffer' is - * specified as NULL. - */ -LIBDEFLATEEXPORT uint32_t LIBDEFLATEAPI -libdeflate_crc32(uint32_t crc, const void *buffer, size_t len); - -/* ========================================================================== */ -/* Custom memory allocator */ -/* ========================================================================== */ - -/* - * Install a custom memory allocator which libdeflate will use for all memory - * allocations. 'malloc_func' is a function that must behave like malloc(), and - * 'free_func' is a function that must behave like free(). - * - * There must not be any libdeflate_compressor or libdeflate_decompressor - * structures in existence when calling this function. - */ -LIBDEFLATEEXPORT void LIBDEFLATEAPI -libdeflate_set_memory_allocator(void *(*malloc_func)(size_t), - void (*free_func)(void *)); - -#ifdef __cplusplus -} -#endif - -#endif /* LIBDEFLATE_H */ diff --git a/third_party/isa-l b/third_party/isa-l new file mode 160000 index 0000000..c3d0760 --- /dev/null +++ b/third_party/isa-l @@ -0,0 +1 @@ +Subproject commit c3d076027f0a7377b78472876a930af98c4eded6 diff --git a/third_party/libdeflate b/third_party/libdeflate new file mode 160000 index 0000000..22f6e02 --- /dev/null +++ b/third_party/libdeflate @@ -0,0 +1 @@ +Subproject commit 22f6e0235ff5651c7300dc120a7ee0d9f0ba3426 From 92270a2481fa9a15d372e9c810b37e54d7a3603a Mon Sep 17 00:00:00 2001 From: Kim Yang Date: Thu, 26 Feb 2026 23:41:47 +0800 Subject: [PATCH 08/14] docs: update build instructions for submodule-based compilation Replace outdated conda/system library instructions with dual-mode build documentation matching fastp: Option A (static build from submodules, recommended) and Option B (link against system libraries). Co-Authored-By: Claude Opus 4.6 --- README.md | 41 ++++++++++++++++++++++++++++------------- 1 file changed, 28 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index 2b7ff65..9e4402b 100644 --- a/README.md +++ b/README.md @@ -12,6 +12,8 @@ fastplong supports batch processing of multiple FASTQ files in a folder, see - [ - [install with Bioconda](#install-with-bioconda) - [download the latest prebuilt binary for Linux users](#download-the-latest-prebuilt-binary-for-linux-users) - [or compile from source](#or-compile-from-source) + - [Option A: static build](#option-a-static-build-recommended-no-system-library-dependencies) + - [Option B: link against system libraries](#option-b-link-against-system-libraries) - [input and output](#input-and-output) - [output to STDOUT](#output-to-stdout) - [input from STDIN](#input-from-stdin) @@ -67,30 +69,43 @@ mv fastplong.0.2.2 fastplong chmod a+x ./fastplong ``` ## or compile from source -`fastplong` depends on `libdeflate` and `isa-l` for fast decompression and compression of zipped data, and depends on `libhwy` for SIMD acceleration. It's recommended to install all of them via Anaconda: -``` -conda install conda-forge::libdeflate -conda install conda-forge::isa-l -conda install conda-forge::libhwy +`fastplong` depends on `isa-l` and `libdeflate`, which are included as git submodules. You can either build them from source (static mode) or link against system-installed libraries. + +### Option A: static build (recommended, no system library dependencies) +Build isa-l and libdeflate from submodules and link them statically. Requires `cmake` and a C/C++ compiler. +```shell +git clone --recursive https://github.com/OpenGene/fastplong.git +cd fastplong +make static -j + +# Install +sudo make install ``` -You can also try to install them with other package management systems like `apt/yum` on Linux, or `brew` on MacOS. Otherwise you can compile them from source (https://github.com/intel/isa-l, https://github.com/ebiggers/libdeflate, and https://github.com/google/highway) -### download and build fastplong +### Option B: link against system libraries +First install isa-l and libdeflate via your package manager: ```shell -# get source (you can also use browser to download from master or releases) -git clone https://github.com/OpenGene/fastplong.git +# Ubuntu/Debian +apt install libisal-dev libdeflate-dev -# build +# macOS +brew install isa-l libdeflate +``` +Then build fastplong: +```shell +git clone --recursive https://github.com/OpenGene/fastplong.git cd fastplong make -j -# test -make test - # Install sudo make install ``` +If you already cloned without `--recursive`, initialize the submodules with: +```shell +git submodule update --init --recursive +``` + # input and output Specify input by `-i` or `--in`, and specify output by `-o` or `--out`. * if you don't specify the output file names, no output files will be written, but the QC will still be done for both data before and after filtering. From 9986af80624646e23f2e5148eaff61862a5b97f8 Mon Sep 17 00:00:00 2001 From: Kim Yang Date: Thu, 26 Feb 2026 23:45:58 +0800 Subject: [PATCH 09/14] build: simplify to static-only build from submodules Remove dual-mode build (system libs vs submodules) to eliminate potential header/library version mismatch. Default `make` now builds isa-l and libdeflate from submodules and links statically. Simplify README build instructions accordingly. Co-Authored-By: Claude Opus 4.6 --- Makefile | 31 +++++++++---------------------- README.md | 28 ++-------------------------- 2 files changed, 11 insertions(+), 48 deletions(-) diff --git a/Makefile b/Makefile index 1e1c12d..f662cbd 100644 --- a/Makefile +++ b/Makefile @@ -32,29 +32,21 @@ CXXFLAGS := -std=c++17 -pthread -g -O3 -MD -MP \ $(foreach includedir,$(INCLUDE_DIRS),-I$(includedir)) \ ${CXXFLAGS} -# --- Link mode (default): link against system-installed libraries --- -LIBS := -lisal -ldeflate -lpthread -LD_FLAGS := $(foreach librarydir,$(LIBRARY_DIRS),-L$(librarydir)) $(LIBS) $(LD_FLAGS) - -# --- Static mode: build from submodules and link statically --- +# Static libraries built from submodules ISAL_LIB := $(DIR_ISAL)/bin/isa-l.a LIBDEFLATE_LIB := $(DIR_LIBDEFLATE)/build/libdeflate.a # On Linux: fully static binary; on macOS: static libs, dynamic system runtime UNAME_S := $(shell uname -s) ifeq ($(UNAME_S),Linux) - STATIC_LD_FLAGS := -static -Wl,--no-as-needed -lpthread + LD_FLAGS := -static -Wl,--no-as-needed -lpthread else - STATIC_LD_FLAGS := -lpthread + LD_FLAGS := -lpthread endif -# Default target: link against system libraries -${BIN_TARGET}: ${OBJ} - $(CXX) $(OBJ) -o $@ $(LD_FLAGS) - -# Static target: build deps from submodules and link statically -static: $(ISAL_LIB) $(LIBDEFLATE_LIB) ${OBJ} - $(CXX) $(OBJ) -o ${BIN_TARGET} $(ISAL_LIB) $(LIBDEFLATE_LIB) $(STATIC_LD_FLAGS) +# Default target: build deps from submodules and link statically +${BIN_TARGET}: $(ISAL_LIB) $(LIBDEFLATE_LIB) ${OBJ} + $(CXX) $(OBJ) -o $@ $(ISAL_LIB) $(LIBDEFLATE_LIB) $(LD_FLAGS) # Build isa-l static library from submodule $(ISAL_LIB): @@ -81,7 +73,7 @@ ${DIR_OBJ}/hwy_abort.o:${DIR_HWY}/hwy/abort.cc @mkdir -p $(@D) $(CXX) -c $< -o $@ $(CXXFLAGS) -.PHONY: clean static install clean-deps +.PHONY: clean install clean-deps test clean: @rm -rf $(DIR_OBJ) @@ -100,14 +92,9 @@ ${DIR_OBJ}/%.o:${DIR_TEST}/%.cpp @mkdir -p $(@D) $(CXX) -c $< -o $@ $(CXXFLAGS) -test-static: $(ISAL_LIB) $(LIBDEFLATE_LIB) ${TEST_OBJ} ${OBJ} - @mkdir -p bin - $(CXX) $(TEST_OBJ) ${OBJ:./obj/main.o=} -o ${TEST_TARGET} $(ISAL_LIB) $(LIBDEFLATE_LIB) $(foreach librarydir,$(LIBRARY_DIRS),-L$(librarydir)) $(STATIC_LD_FLAGS) -lgtest -lgtest_main - ./${TEST_TARGET} - -test: ${TEST_OBJ} ${OBJ} +test: $(ISAL_LIB) $(LIBDEFLATE_LIB) ${TEST_OBJ} ${OBJ} @mkdir -p bin - $(CXX) $(TEST_OBJ) ${OBJ:./obj/main.o=} -o ${TEST_TARGET} $(LD_FLAGS) -lgtest -lgtest_main + $(CXX) $(TEST_OBJ) ${OBJ:./obj/main.o=} -o ${TEST_TARGET} $(ISAL_LIB) $(LIBDEFLATE_LIB) $(foreach librarydir,$(LIBRARY_DIRS),-L$(librarydir)) $(LD_FLAGS) -lgtest -lgtest_main ./${TEST_TARGET} -include $(OBJ:.o=.d) diff --git a/README.md b/README.md index 9e4402b..5f1fcaf 100644 --- a/README.md +++ b/README.md @@ -12,8 +12,6 @@ fastplong supports batch processing of multiple FASTQ files in a folder, see - [ - [install with Bioconda](#install-with-bioconda) - [download the latest prebuilt binary for Linux users](#download-the-latest-prebuilt-binary-for-linux-users) - [or compile from source](#or-compile-from-source) - - [Option A: static build](#option-a-static-build-recommended-no-system-library-dependencies) - - [Option B: link against system libraries](#option-b-link-against-system-libraries) - [input and output](#input-and-output) - [output to STDOUT](#output-to-stdout) - [input from STDIN](#input-from-stdin) @@ -69,29 +67,7 @@ mv fastplong.0.2.2 fastplong chmod a+x ./fastplong ``` ## or compile from source -`fastplong` depends on `isa-l` and `libdeflate`, which are included as git submodules. You can either build them from source (static mode) or link against system-installed libraries. - -### Option A: static build (recommended, no system library dependencies) -Build isa-l and libdeflate from submodules and link them statically. Requires `cmake` and a C/C++ compiler. -```shell -git clone --recursive https://github.com/OpenGene/fastplong.git -cd fastplong -make static -j - -# Install -sudo make install -``` - -### Option B: link against system libraries -First install isa-l and libdeflate via your package manager: -```shell -# Ubuntu/Debian -apt install libisal-dev libdeflate-dev - -# macOS -brew install isa-l libdeflate -``` -Then build fastplong: +`fastplong` depends on `isa-l` and `libdeflate`, which are included as git submodules and built statically. Requires `cmake` and a C/C++ compiler. ```shell git clone --recursive https://github.com/OpenGene/fastplong.git cd fastplong @@ -101,7 +77,7 @@ make -j sudo make install ``` -If you already cloned without `--recursive`, initialize the submodules with: +If you already cloned without `--recursive`, initialize the submodules first: ```shell git submodule update --init --recursive ``` From c9ac5beb567114bebae70707a755e86ef5f74998 Mon Sep 17 00:00:00 2001 From: Kim Yang Date: Thu, 26 Feb 2026 23:48:25 +0800 Subject: [PATCH 10/14] build: add googletest as submodule, remove system gtest dependency Vendor googletest v1.17.0 as a git submodule under third_party/. Build it from source via cmake during `make test`, eliminating the need to install googletest on the system. Co-Authored-By: Claude Opus 4.6 --- .gitmodules | 3 +++ Makefile | 20 ++++++++++++++------ third_party/googletest | 1 + 3 files changed, 18 insertions(+), 6 deletions(-) create mode 160000 third_party/googletest diff --git a/.gitmodules b/.gitmodules index 77c1e52..5187798 100644 --- a/.gitmodules +++ b/.gitmodules @@ -7,3 +7,6 @@ [submodule "third_party/libdeflate"] path = third_party/libdeflate url = https://github.com/ebiggers/libdeflate.git +[submodule "third_party/googletest"] + path = third_party/googletest + url = https://github.com/google/googletest.git diff --git a/Makefile b/Makefile index f662cbd..f4bd551 100644 --- a/Makefile +++ b/Makefile @@ -5,11 +5,10 @@ DIR_TEST := ./test DIR_HWY := ./third_party/highway DIR_ISAL := ./third_party/isa-l DIR_LIBDEFLATE := ./third_party/libdeflate +DIR_GTEST := ./third_party/googletest PREFIX ?= /usr/local BINDIR ?= $(PREFIX)/bin -INCLUDE_DIRS ?= -LIBRARY_DIRS ?= SRC := $(wildcard ${DIR_SRC}/*.cpp) TEST := $(wildcard ${DIR_TEST}/*.cpp) @@ -29,12 +28,13 @@ CXX ?= g++ CXXFLAGS := -std=c++17 -pthread -g -O3 -MD -MP \ -I. -I${DIR_INC} -I${DIR_SRC} -I${DIR_HWY} \ -I${DIR_ISAL}/include -I${DIR_LIBDEFLATE} \ - $(foreach includedir,$(INCLUDE_DIRS),-I$(includedir)) \ ${CXXFLAGS} # Static libraries built from submodules ISAL_LIB := $(DIR_ISAL)/bin/isa-l.a LIBDEFLATE_LIB := $(DIR_LIBDEFLATE)/build/libdeflate.a +GTEST_LIB := $(DIR_GTEST)/build/lib/libgtest.a +GTEST_MAIN_LIB := $(DIR_GTEST)/build/lib/libgtest_main.a # On Linux: fully static binary; on macOS: static libs, dynamic system runtime UNAME_S := $(shell uname -s) @@ -60,6 +60,13 @@ $(LIBDEFLATE_LIB): -DLIBDEFLATE_BUILD_GZIP=OFF && \ cmake --build build +# Build googletest static library from submodule +$(GTEST_LIB) $(GTEST_MAIN_LIB): + cd $(DIR_GTEST) && cmake -B build \ + -DCMAKE_BUILD_TYPE=Release \ + -DBUILD_GMOCK=OFF && \ + cmake --build build + ${DIR_OBJ}/%.o:${DIR_SRC}/%.cpp @mkdir -p $(@D) $(CXX) -c $< -o $@ $(CXXFLAGS) @@ -83,6 +90,7 @@ clean: clean-deps: -$(MAKE) -C $(DIR_ISAL) -f Makefile.unx clean 2>/dev/null || true -rm -rf $(DIR_LIBDEFLATE)/build 2>/dev/null || true + -rm -rf $(DIR_GTEST)/build 2>/dev/null || true install: install $(TARGET) $(BINDIR)/$(TARGET) @@ -90,11 +98,11 @@ install: ${DIR_OBJ}/%.o:${DIR_TEST}/%.cpp @mkdir -p $(@D) - $(CXX) -c $< -o $@ $(CXXFLAGS) + $(CXX) -c $< -o $@ $(CXXFLAGS) -I${DIR_GTEST}/googletest/include -test: $(ISAL_LIB) $(LIBDEFLATE_LIB) ${TEST_OBJ} ${OBJ} +test: $(ISAL_LIB) $(LIBDEFLATE_LIB) $(GTEST_LIB) ${TEST_OBJ} ${OBJ} @mkdir -p bin - $(CXX) $(TEST_OBJ) ${OBJ:./obj/main.o=} -o ${TEST_TARGET} $(ISAL_LIB) $(LIBDEFLATE_LIB) $(foreach librarydir,$(LIBRARY_DIRS),-L$(librarydir)) $(LD_FLAGS) -lgtest -lgtest_main + $(CXX) $(TEST_OBJ) ${OBJ:./obj/main.o=} -o ${TEST_TARGET} $(ISAL_LIB) $(LIBDEFLATE_LIB) $(GTEST_LIB) $(GTEST_MAIN_LIB) $(LD_FLAGS) ./${TEST_TARGET} -include $(OBJ:.o=.d) diff --git a/third_party/googletest b/third_party/googletest new file mode 160000 index 0000000..52eb810 --- /dev/null +++ b/third_party/googletest @@ -0,0 +1 @@ +Subproject commit 52eb8108c5bdec04579160ae17225d66034bd723 From 459f02e78a49efe61ffd0bca155605dd5b89e968 Mon Sep 17 00:00:00 2001 From: Kim Yang Date: Fri, 27 Feb 2026 00:03:56 +0800 Subject: [PATCH 11/14] build: enable isa-l NEON assembly on ARM macOS macOS reports arm64 via uname -m but isa-l's Makefile.unx expects aarch64 for the NEON/SVE assembly paths. Without this fix, only the C baseline code is compiled, resulting in ~60% slower gzip decompression. Pass host_cpu=aarch64 arch=aarch64 on ARM macOS to build the full optimized library (85 objects vs 23 C-only). Co-Authored-By: Claude Opus 4.6 --- Makefile | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index f4bd551..643debf 100644 --- a/Makefile +++ b/Makefile @@ -38,19 +38,26 @@ GTEST_MAIN_LIB := $(DIR_GTEST)/build/lib/libgtest_main.a # On Linux: fully static binary; on macOS: static libs, dynamic system runtime UNAME_S := $(shell uname -s) +UNAME_M := $(shell uname -m) ifeq ($(UNAME_S),Linux) LD_FLAGS := -static -Wl,--no-as-needed -lpthread else LD_FLAGS := -lpthread endif +# isa-l: macOS reports arm64 but isa-l expects aarch64 for NEON assembly +ISAL_MAKE_ARGS := +ifeq ($(UNAME_M),arm64) + ISAL_MAKE_ARGS := host_cpu=aarch64 arch=aarch64 +endif + # Default target: build deps from submodules and link statically ${BIN_TARGET}: $(ISAL_LIB) $(LIBDEFLATE_LIB) ${OBJ} $(CXX) $(OBJ) -o $@ $(ISAL_LIB) $(LIBDEFLATE_LIB) $(LD_FLAGS) # Build isa-l static library from submodule $(ISAL_LIB): - $(MAKE) -C $(DIR_ISAL) -f Makefile.unx lib + $(MAKE) -C $(DIR_ISAL) -f Makefile.unx lib $(ISAL_MAKE_ARGS) # Build libdeflate static library from submodule $(LIBDEFLATE_LIB): From 1a3d95a3bdb76c1872bdb7e2516083f37ac52240 Mon Sep 17 00:00:00 2001 From: Kim Yang Date: Fri, 27 Feb 2026 00:19:06 +0800 Subject: [PATCH 12/14] ci: update workflow for submodule-based static build - Use actions/checkout@v4 with submodules: recursive - Remove system library installs (isa-l, libdeflate, highway, googletest) - Only need build-essential, cmake, nasm on Ubuntu - macOS runners have cmake/Xcode toolchain preinstalled Co-Authored-By: Claude Opus 4.6 --- .github/workflows/ci.yml | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index e34223f..db509c1 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -15,22 +15,19 @@ jobs: runs-on: ${{ matrix.os }} steps: - name: checkout scm - uses: actions/checkout@v3 + uses: actions/checkout@v4 + with: + submodules: recursive - name: install build dependencies (Ubuntu) - run: sudo apt update && sudo apt install -y build-essential libgtest-dev libhwy-dev libisal-dev libdeflate-dev + run: sudo apt update && sudo apt install -y build-essential cmake nasm if: runner.os == 'Linux' - - name: install build dependencies (MacOS) - run: brew install highway googletest isa-l libdeflate - if: runner.os == 'macOS' - - name: make fastplong run: make -j - name: make test - run: make -j test + run: make test - name: test run: chmod a+x ./fastplong && ./fastplong --version - From f3bf01a17d2e23b3ae7f9b20219cf957f343fb31 Mon Sep 17 00:00:00 2001 From: Kim Yang Date: Fri, 27 Feb 2026 00:28:08 +0800 Subject: [PATCH 13/14] ci: upgrade to macos-15 for Xcode 17+ ARM assembly support isa-l dev branch uses __arm_streaming (SVE) and ARM assembly syntax that requires Xcode 17+. macos-14 (Xcode 15.4) and macos-13 are too old. macos-15 provides Xcode 16.2+ which supports these features. Co-Authored-By: Claude Opus 4.6 --- .github/workflows/ci.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index db509c1..e7e0fec 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -10,8 +10,7 @@ jobs: matrix: os: - ubuntu-24.04 - - macos-13 - - macos-14 + - macos-15 runs-on: ${{ matrix.os }} steps: - name: checkout scm From 0cb92695635e26062dd777d3f2f2a7cbf17ac583 Mon Sep 17 00:00:00 2001 From: Kim Yang Date: Sun, 1 Mar 2026 01:23:45 +0800 Subject: [PATCH 14/14] perf: use TableLookupBytes for reverseComplement SIMD Replace 8x Eq + 8x IfThenElse comparison chain with a single And + TableLookupBytes using the low nibble of DNA base ASCII codes. DNA bases A/a(1), C/c(3), T/t(4), G/g(7) have unique low nibbles, enabling a 16-byte lookup table for complement mapping. Co-Authored-By: Claude Opus 4.6 --- src/simd.cpp | 33 ++++++++++++--------------------- 1 file changed, 12 insertions(+), 21 deletions(-) diff --git a/src/simd.cpp b/src/simd.cpp index 76b6b21..3f1ba62 100644 --- a/src/simd.cpp +++ b/src/simd.cpp @@ -83,31 +83,22 @@ void ReverseComplementImpl(const char* src, char* dst, int len) { const hn::ScalableTag d; const int N = hn::Lanes(d); - // Complement via comparisons — portable across all SIMD widths - const auto vA = hn::Set(d, static_cast('A')); - const auto vT = hn::Set(d, static_cast('T')); - const auto vC = hn::Set(d, static_cast('C')); - const auto vG = hn::Set(d, static_cast('G')); - const auto va = hn::Set(d, static_cast('a')); - const auto vt = hn::Set(d, static_cast('t')); - const auto vc = hn::Set(d, static_cast('c')); - const auto vg = hn::Set(d, static_cast('g')); - const auto vN = hn::Set(d, static_cast('N')); + // Complement via single table lookup on low nibble of ASCII code. + // DNA base low nibbles: A/a=1, C/c=3, T/t=4, G/g=7, N=14. + // Uppercase and lowercase share the same low nibble, so one table + // handles both. Unmapped nibbles default to 'N'. + HWY_ALIGN static const uint8_t kComplement[16] = { + 'N', 'T', 'N', 'G', 'A', 'N', 'N', 'C', + 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N' + }; + const auto table = hn::LoadDup128(d, kComplement); + const auto vMask = hn::Set(d, static_cast(0x0F)); int i = 0; for (; i + N <= len; i += N) { const auto v = hn::LoadU(d, reinterpret_cast(src + i)); - - // Build complement using conditional selects - auto comp = vN; // default: N - comp = hn::IfThenElse(hn::Eq(v, vA), vT, comp); - comp = hn::IfThenElse(hn::Eq(v, vT), vA, comp); - comp = hn::IfThenElse(hn::Eq(v, vC), vG, comp); - comp = hn::IfThenElse(hn::Eq(v, vG), vC, comp); - comp = hn::IfThenElse(hn::Eq(v, va), vT, comp); - comp = hn::IfThenElse(hn::Eq(v, vt), vA, comp); - comp = hn::IfThenElse(hn::Eq(v, vc), vG, comp); - comp = hn::IfThenElse(hn::Eq(v, vg), vC, comp); + const auto indices = hn::And(v, vMask); + const auto comp = hn::TableLookupBytes(table, indices); // Reverse and store at mirrored position const auto revComp = hn::Reverse(d, comp);