diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index e34223f..e7e0fec 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -10,27 +10,23 @@ jobs: matrix: os: - ubuntu-24.04 - - macos-13 - - macos-14 + - macos-15 runs-on: ${{ matrix.os }} steps: - name: checkout scm - uses: actions/checkout@v3 + uses: actions/checkout@v4 + with: + submodules: recursive - name: install build dependencies (Ubuntu) - run: sudo apt update && sudo apt install -y build-essential libgtest-dev libhwy-dev libisal-dev libdeflate-dev + run: sudo apt update && sudo apt install -y build-essential cmake nasm if: runner.os == 'Linux' - - name: install build dependencies (MacOS) - run: brew install highway googletest isa-l libdeflate - if: runner.os == 'macOS' - - name: make fastplong run: make -j - name: make test - run: make -j test + run: make test - name: test run: chmod a+x ./fastplong && ./fastplong --version - diff --git a/.gitignore b/.gitignore index a14e5af..3530bdb 100644 --- a/.gitignore +++ b/.gitignore @@ -31,3 +31,12 @@ *.out *.app bin +fastplong + +# Test data (generated) +testdata/*.fastq +testdata/*.fastq.gz + +# Reports +fastplong.html +fastplong.json diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..5187798 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,12 @@ +[submodule "third_party/highway"] + path = third_party/highway + url = https://github.com/google/highway.git +[submodule "third_party/isa-l"] + path = third_party/isa-l + url = https://github.com/intel/isa-l.git +[submodule "third_party/libdeflate"] + path = third_party/libdeflate + url = https://github.com/ebiggers/libdeflate.git +[submodule "third_party/googletest"] + path = third_party/googletest + url = https://github.com/google/googletest.git diff --git a/Makefile b/Makefile index 05377a7..643debf 100644 --- a/Makefile +++ b/Makefile @@ -2,63 +2,114 @@ DIR_INC := ./inc DIR_SRC := ./src DIR_OBJ := ./obj DIR_TEST := ./test +DIR_HWY := ./third_party/highway +DIR_ISAL := ./third_party/isa-l +DIR_LIBDEFLATE := ./third_party/libdeflate +DIR_GTEST := ./third_party/googletest PREFIX ?= /usr/local BINDIR ?= $(PREFIX)/bin -INCLUDE_DIRS ?= /opt/homebrew/include -LIBRARY_DIRS ?= /opt/homebrew/lib SRC := $(wildcard ${DIR_SRC}/*.cpp) TEST := $(wildcard ${DIR_TEST}/*.cpp) OBJ := $(patsubst %.cpp,${DIR_OBJ}/%.o,$(notdir ${SRC})) TEST_OBJ := $(patsubst %.cpp,${DIR_OBJ}/%.o,$(notdir ${TEST})) +# Highway runtime dispatch support +HWY_OBJS := ${DIR_OBJ}/hwy_targets.o ${DIR_OBJ}/hwy_abort.o +OBJ += $(HWY_OBJS) + TARGET := fastplong BIN_TARGET := ${TARGET} TEST_TARGET := bin/fastplong_unittest CXX ?= g++ -CXXFLAGS := -std=c++14 -pthread -g -O3 -MP -MD -I${DIR_INC} -I${DIR_SRC} $(foreach includedir,$(INCLUDE_DIRS),-I$(includedir)) ${CXXFLAGS} -LIBS := -lisal -ldeflate -lpthread -lhwy -STATIC_FLAGS := -static -L. -Wl,--no-as-needed -pthread -LD_FLAGS := $(foreach librarydir,$(LIBRARY_DIRS),-L$(librarydir)) $(LIBS) $(LD_FLAGS) -STATIC_LD_FLAGS := $(foreach librarydir,$(LIBRARY_DIRS),-L$(librarydir)) $(STATIC_FLAGS) $(LIBS) $(STATIC_LD_FLAGS) +CXXFLAGS := -std=c++17 -pthread -g -O3 -MD -MP \ + -I. -I${DIR_INC} -I${DIR_SRC} -I${DIR_HWY} \ + -I${DIR_ISAL}/include -I${DIR_LIBDEFLATE} \ + ${CXXFLAGS} + +# Static libraries built from submodules +ISAL_LIB := $(DIR_ISAL)/bin/isa-l.a +LIBDEFLATE_LIB := $(DIR_LIBDEFLATE)/build/libdeflate.a +GTEST_LIB := $(DIR_GTEST)/build/lib/libgtest.a +GTEST_MAIN_LIB := $(DIR_GTEST)/build/lib/libgtest_main.a + +# On Linux: fully static binary; on macOS: static libs, dynamic system runtime +UNAME_S := $(shell uname -s) +UNAME_M := $(shell uname -m) +ifeq ($(UNAME_S),Linux) + LD_FLAGS := -static -Wl,--no-as-needed -lpthread +else + LD_FLAGS := -lpthread +endif + +# isa-l: macOS reports arm64 but isa-l expects aarch64 for NEON assembly +ISAL_MAKE_ARGS := +ifeq ($(UNAME_M),arm64) + ISAL_MAKE_ARGS := host_cpu=aarch64 arch=aarch64 +endif +# Default target: build deps from submodules and link statically +${BIN_TARGET}: $(ISAL_LIB) $(LIBDEFLATE_LIB) ${OBJ} + $(CXX) $(OBJ) -o $@ $(ISAL_LIB) $(LIBDEFLATE_LIB) $(LD_FLAGS) -${BIN_TARGET}:${OBJ} - $(CXX) $(OBJ) -o $@ $(LD_FLAGS) +# Build isa-l static library from submodule +$(ISAL_LIB): + $(MAKE) -C $(DIR_ISAL) -f Makefile.unx lib $(ISAL_MAKE_ARGS) -static:${OBJ} - $(CXX) $(OBJ) -o ${BIN_TARGET} $(STATIC_LD_FLAGS) +# Build libdeflate static library from submodule +$(LIBDEFLATE_LIB): + cd $(DIR_LIBDEFLATE) && cmake -B build \ + -DCMAKE_BUILD_TYPE=Release \ + -DLIBDEFLATE_BUILD_SHARED_LIB=OFF \ + -DLIBDEFLATE_BUILD_GZIP=OFF && \ + cmake --build build + +# Build googletest static library from submodule +$(GTEST_LIB) $(GTEST_MAIN_LIB): + cd $(DIR_GTEST) && cmake -B build \ + -DCMAKE_BUILD_TYPE=Release \ + -DBUILD_GMOCK=OFF && \ + cmake --build build ${DIR_OBJ}/%.o:${DIR_SRC}/%.cpp - @mkdir -p $(@D) + @mkdir -p $(@D) + $(CXX) -c $< -o $@ $(CXXFLAGS) + +# Highway source files for runtime CPU detection and error handling +${DIR_OBJ}/hwy_targets.o:${DIR_HWY}/hwy/targets.cc + @mkdir -p $(@D) $(CXX) -c $< -o $@ $(CXXFLAGS) -.PHONY:clean -.PHONY:static +${DIR_OBJ}/hwy_abort.o:${DIR_HWY}/hwy/abort.cc + @mkdir -p $(@D) + $(CXX) -c $< -o $@ $(CXXFLAGS) + +.PHONY: clean install clean-deps test + clean: @rm -rf $(DIR_OBJ) @rm -f $(TARGET) @rm -f $(TEST_TARGET) +clean-deps: + -$(MAKE) -C $(DIR_ISAL) -f Makefile.unx clean 2>/dev/null || true + -rm -rf $(DIR_LIBDEFLATE)/build 2>/dev/null || true + -rm -rf $(DIR_GTEST)/build 2>/dev/null || true + install: install $(TARGET) $(BINDIR)/$(TARGET) @echo "Installed." ${DIR_OBJ}/%.o:${DIR_TEST}/%.cpp - @mkdir -p $(@D) - $(CXX) -c $< -o $@ $(CXXFLAGS) - -test-static: ${TEST_OBJ} ${OBJ} - @mkdir -p bin - $(CXX) $(TEST_OBJ) ${OBJ:./obj/main.o=} -o ${TEST_TARGET} $(STATIC_LD_FLAGS) -lgtest -lgtest_main - ./${TEST_TARGET} + @mkdir -p $(@D) + $(CXX) -c $< -o $@ $(CXXFLAGS) -I${DIR_GTEST}/googletest/include -test:${TEST_OBJ} ${OBJ} +test: $(ISAL_LIB) $(LIBDEFLATE_LIB) $(GTEST_LIB) ${TEST_OBJ} ${OBJ} @mkdir -p bin - $(CXX) $(TEST_OBJ) ${OBJ:./obj/main.o=} -o ${TEST_TARGET} $(LD_FLAGS) -lgtest -lgtest_main + $(CXX) $(TEST_OBJ) ${OBJ:./obj/main.o=} -o ${TEST_TARGET} $(ISAL_LIB) $(LIBDEFLATE_LIB) $(GTEST_LIB) $(GTEST_MAIN_LIB) $(LD_FLAGS) ./${TEST_TARGET} -include $(OBJ:.o=.d) diff --git a/README.md b/README.md index 2b7ff65..5f1fcaf 100644 --- a/README.md +++ b/README.md @@ -67,30 +67,21 @@ mv fastplong.0.2.2 fastplong chmod a+x ./fastplong ``` ## or compile from source -`fastplong` depends on `libdeflate` and `isa-l` for fast decompression and compression of zipped data, and depends on `libhwy` for SIMD acceleration. It's recommended to install all of them via Anaconda: -``` -conda install conda-forge::libdeflate -conda install conda-forge::isa-l -conda install conda-forge::libhwy -``` -You can also try to install them with other package management systems like `apt/yum` on Linux, or `brew` on MacOS. Otherwise you can compile them from source (https://github.com/intel/isa-l, https://github.com/ebiggers/libdeflate, and https://github.com/google/highway) - -### download and build fastplong +`fastplong` depends on `isa-l` and `libdeflate`, which are included as git submodules and built statically. Requires `cmake` and a C/C++ compiler. ```shell -# get source (you can also use browser to download from master or releases) -git clone https://github.com/OpenGene/fastplong.git - -# build +git clone --recursive https://github.com/OpenGene/fastplong.git cd fastplong make -j -# test -make test - # Install sudo make install ``` +If you already cloned without `--recursive`, initialize the submodules first: +```shell +git submodule update --init --recursive +``` + # input and output Specify input by `-i` or `--in`, and specify output by `-o` or `--out`. * if you don't specify the output file names, no output files will be written, but the QC will still be done for both data before and after filtering. diff --git a/docs/plans/2026-02-26-simd-highway-refactor-design.md b/docs/plans/2026-02-26-simd-highway-refactor-design.md new file mode 100644 index 0000000..47b8e4c --- /dev/null +++ b/docs/plans/2026-02-26-simd-highway-refactor-design.md @@ -0,0 +1,76 @@ +# SIMD Highway Refactor Design + +## Goal + +Centralize all SIMD code into a single `simd.h`/`simd.cpp` pair using Google Highway's `foreach_target.h` multi-target compilation + `HWY_DYNAMIC_DISPATCH` runtime dispatch pattern. Port proven implementations from fastp and accelerate remaining scalar hot loops. + +## Current State + +- fastplong links `-lhwy` as system library +- SIMD exists in `adaptertrimmer.cpp` (mismatch counting) and `sequence.cpp` (reverse complement) but without multi-target dispatch +- `simdutil.h` has a custom `Transform1Reversed` template +- `filter.cpp` hot loops (`passFilter`, `passLowComplexityFilter`) are pure scalar + +## Changes + +### New Files + +- `third_party/highway/` — Highway source (header-only + `targets.cc` / `abort.cc`) +- `src/simd.h` — Public SIMD API +- `src/simd.cpp` — Multi-target implementation with `foreach_target.h` + +### Modified Files + +- `Makefile` — Add `DIR_HWY`, compile `hwy_targets.o` / `hwy_abort.o`, remove `-lhwy`, add `-I${DIR_HWY}` +- `src/filter.cpp` — Replace scalar loops with `countQualityMetrics` and `countAdjacentDiffs` +- `src/sequence.cpp` — Replace inline Highway code with `reverseComplement` call +- `src/adaptertrimmer.cpp` — Replace inner SIMD loop with `countMismatches` call + +### Deleted Files + +- `src/simdutil.h` — Functionality merged into `simd.cpp` + +## API + +```cpp +namespace fastplong_simd { + void countQualityMetrics(const char* qualstr, const char* seqstr, int len, + char qualThreshold, int& lowQualNum, int& nBaseNum, + int& totalQual); + void reverseComplement(const char* src, char* dst, int len); + int countAdjacentDiffs(const char* data, int len); + int countMismatches(const char* a, const char* b, int len); + bool testSimd(); +} +``` + +## simd.cpp Structure + +``` +#define HWY_TARGET_INCLUDE "src/simd.cpp" +#include "hwy/foreach_target.h" + +namespace fastplong_simd::HWY_NAMESPACE { + CountQualityMetricsImpl() — vectorized quality stats (SumsOf2 overflow-safe) + ReverseComplementImpl() — vectorized complement + reverse + CountAdjacentDiffsImpl() — vectorized adjacent comparison + CountMismatchesImpl() — vectorized byte comparison +} + +#if HWY_ONCE + HWY_EXPORT(...) — register all target versions + dispatch wrappers — HWY_DYNAMIC_DISPATCH + scalar references — for test validation + testSimd() — unit tests +#endif +``` + +## Key Technical Details + +- Block-based accumulation in `countQualityMetrics` (255*N elements per block) prevents u16 overflow for long reads (10k-100k+ bp) +- `SumsOf2` for u8->u16 promotion works on all targets including HWY_SCALAR +- `searchAdapter` outer logic (branching, edit_distance) stays in `adaptertrimmer.cpp`; only inner mismatch loop delegates to `countMismatches` + +## Reference + +Based on fastp's `src/simd.h` / `src/simd.cpp` implementation pattern. diff --git a/docs/plans/2026-02-26-simd-highway-refactor.md b/docs/plans/2026-02-26-simd-highway-refactor.md new file mode 100644 index 0000000..47f40ba --- /dev/null +++ b/docs/plans/2026-02-26-simd-highway-refactor.md @@ -0,0 +1,896 @@ +# SIMD Highway Refactor Implementation Plan + +> **For Claude:** REQUIRED SUB-SKILL: Use superpowers:executing-plans to implement this plan task-by-task. + +**Goal:** Centralize all SIMD code into `simd.h`/`simd.cpp` using Highway's `foreach_target.h` multi-target dispatch, port fastp's proven implementations, and accelerate remaining scalar hot loops. + +**Architecture:** Single `simd.cpp` compiled via `foreach_target.h` produces code for all CPU targets (SSE2, AVX2, NEON, scalar). Runtime dispatch via `HWY_DYNAMIC_DISPATCH` selects the fastest version. Highway is vendored as `third_party/highway` git submodule. + +**Tech Stack:** C++14, Google Highway 1.3, gtest + +--- + +### Task 1: Add Highway as git submodule + +**Files:** +- Create: `third_party/highway/` (git submodule) +- Modify: `.gitmodules` + +**Step 1: Add the submodule** + +```bash +cd /Users/kimy/workspace/fastplong +mkdir -p third_party +git submodule add https://github.com/google/highway.git third_party/highway +cd third_party/highway +git checkout ac0d5d29 +cd ../.. +``` + +**Step 2: Verify the submodule** + +Run: `ls third_party/highway/hwy/highway.h` +Expected: File exists + +Run: `grep HWY_MAJOR third_party/highway/hwy/base.h | head -2` +Expected: `#define HWY_MAJOR 1` and `#define HWY_MINOR 3` + +**Step 3: Commit** + +```bash +git add .gitmodules third_party/highway +git commit -m "feat: add Highway SIMD library as git submodule (v1.3)" +``` + +--- + +### Task 2: Update Makefile for Highway + +**Files:** +- Modify: `Makefile` + +**Step 1: Update the Makefile** + +Replace the entire Makefile with: + +```makefile +DIR_INC := ./inc +DIR_SRC := ./src +DIR_OBJ := ./obj +DIR_TEST := ./test +DIR_HWY := ./third_party/highway + +PREFIX ?= /usr/local +BINDIR ?= $(PREFIX)/bin +INCLUDE_DIRS ?= /opt/homebrew/include +LIBRARY_DIRS ?= /opt/homebrew/lib + +SRC := $(wildcard ${DIR_SRC}/*.cpp) +TEST := $(wildcard ${DIR_TEST}/*.cpp) +OBJ := $(patsubst %.cpp,${DIR_OBJ}/%.o,$(notdir ${SRC})) +TEST_OBJ := $(patsubst %.cpp,${DIR_OBJ}/%.o,$(notdir ${TEST})) + +# Highway runtime dispatch support +HWY_OBJS := ${DIR_OBJ}/hwy_targets.o ${DIR_OBJ}/hwy_abort.o +OBJ += $(HWY_OBJS) + +TARGET := fastplong + +BIN_TARGET := ${TARGET} +TEST_TARGET := bin/fastplong_unittest + +CXX ?= g++ +CXXFLAGS := -std=c++14 -pthread -g -O3 -MP -MD -I. -I${DIR_INC} -I${DIR_SRC} -I${DIR_HWY} $(foreach includedir,$(INCLUDE_DIRS),-I$(includedir)) ${CXXFLAGS} +LIBS := -lisal -ldeflate -lpthread +STATIC_FLAGS := -static -L. -Wl,--no-as-needed -pthread +LD_FLAGS := $(foreach librarydir,$(LIBRARY_DIRS),-L$(librarydir)) $(LIBS) $(LD_FLAGS) +STATIC_LD_FLAGS := $(foreach librarydir,$(LIBRARY_DIRS),-L$(librarydir)) $(STATIC_FLAGS) $(LIBS) $(STATIC_LD_FLAGS) + + +${BIN_TARGET}:${OBJ} + $(CXX) $(OBJ) -o $@ $(LD_FLAGS) + +static:${OBJ} + $(CXX) $(OBJ) -o ${BIN_TARGET} $(STATIC_LD_FLAGS) + +${DIR_OBJ}/%.o:${DIR_SRC}/%.cpp + @mkdir -p $(@D) + $(CXX) -c $< -o $@ $(CXXFLAGS) + +# Highway source files for runtime CPU detection and error handling +${DIR_OBJ}/hwy_targets.o:${DIR_HWY}/hwy/targets.cc + @mkdir -p $(@D) + $(CXX) -c $< -o $@ $(CXXFLAGS) + +${DIR_OBJ}/hwy_abort.o:${DIR_HWY}/hwy/abort.cc + @mkdir -p $(@D) + $(CXX) -c $< -o $@ $(CXXFLAGS) + +.PHONY:clean +.PHONY:static +clean: + @rm -rf $(DIR_OBJ) + @rm -f $(TARGET) + @rm -f $(TEST_TARGET) + +install: + install $(TARGET) $(BINDIR)/$(TARGET) + @echo "Installed." + +${DIR_OBJ}/%.o:${DIR_TEST}/%.cpp + @mkdir -p $(@D) + $(CXX) -c $< -o $@ $(CXXFLAGS) + +test-static: ${TEST_OBJ} ${OBJ} + @mkdir -p bin + $(CXX) $(TEST_OBJ) ${OBJ:./obj/main.o=} -o ${TEST_TARGET} $(STATIC_LD_FLAGS) -lgtest -lgtest_main + ./${TEST_TARGET} + +test:${TEST_OBJ} ${OBJ} + @mkdir -p bin + $(CXX) $(TEST_OBJ) ${OBJ:./obj/main.o=} -o ${TEST_TARGET} $(LD_FLAGS) -lgtest -lgtest_main + ./${TEST_TARGET} + +-include $(OBJ:.o=.d) +``` + +Key changes from original: +- Added `DIR_HWY := ./third_party/highway` +- Added `-I. -I${DIR_HWY}` to CXXFLAGS +- Removed `-lhwy` from LIBS +- Added `HWY_OBJS` for `hwy_targets.o` and `hwy_abort.o` +- Added build rules for Highway source files + +**Step 2: Verify it compiles (will fail at link due to simd changes not yet done, but object files should build)** + +Run: `make clean && make -j 2>&1 | tail -5` +Expected: Compilation succeeds (may have link errors due to missing `-lhwy` until we finish simd.cpp, that's OK) + +**Step 3: Commit** + +```bash +git add Makefile +git commit -m "build: switch Highway from system library to vendored third_party submodule" +``` + +--- + +### Task 3: Create simd.h and simd.cpp + +**Files:** +- Create: `src/simd.h` +- Create: `src/simd.cpp` + +**Step 1: Create `src/simd.h`** + +```cpp +#ifndef FASTPLONG_SIMD_H +#define FASTPLONG_SIMD_H + +#include + +namespace fastplong_simd { + +// Count quality metrics for a read in one pass. +// qualstr/seqstr: quality and sequence strings of length len. +// qualThreshold: phred+33 encoded quality threshold for "low quality". +// Outputs: lowQualNum, nBaseNum, totalQual (sum of qual-33 values). +void countQualityMetrics(const char* qualstr, const char* seqstr, int len, + char qualThreshold, int& lowQualNum, int& nBaseNum, + int& totalQual); + +// Reverse complement a DNA sequence. +// src: input sequence of length len. +// dst: output buffer of at least len bytes (must NOT alias src). +void reverseComplement(const char* src, char* dst, int len); + +// Count adjacent-base differences for low complexity filter. +// Returns the number of positions where data[i] != data[i+1]. +int countAdjacentDiffs(const char* data, int len); + +// Count mismatches between two byte strings. +// Returns the number of positions where a[i] != b[i], up to len bytes. +int countMismatches(const char* a, const char* b, int len); + +// Run all SIMD unit tests. Returns true if all pass. +bool testSimd(); + +} // namespace fastplong_simd + +#endif // FASTPLONG_SIMD_H +``` + +**Step 2: Create `src/simd.cpp`** + +```cpp +// SIMD-accelerated functions for fastplong using Google Highway. +// This file is compiled once per SIMD target via foreach_target.h. + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "src/simd.cpp" +#include "hwy/foreach_target.h" // IWYU pragma: keep +#include "hwy/highway.h" + +HWY_BEFORE_NAMESPACE(); +namespace fastplong_simd { +namespace HWY_NAMESPACE { + +namespace hn = hwy::HWY_NAMESPACE; + +void CountQualityMetricsImpl(const char* qualstr, const char* seqstr, int len, + char qualThreshold, int& lowQualNum, int& nBaseNum, + int& totalQual) { + const hn::ScalableTag d; + const int N = hn::Lanes(d); + + const auto vThresh = hn::Set(d, static_cast(qualThreshold)); + const auto vN = hn::Set(d, static_cast('N')); + const auto v33 = hn::Set(d, static_cast(33)); + + int lowQual = 0; + int nBase = 0; + int qualSum = 0; + int i = 0; + + // Process in blocks of up to 255 vectors to avoid u16 overflow during + // quality sum accumulation (255 * 255 = 65025 fits in u16). + const int blockSize = 255 * N; + + for (int blockStart = 0; blockStart < len; blockStart += blockSize) { + int blockEnd = blockStart + blockSize; + if (blockEnd > len) blockEnd = len; + + const hn::ScalableTag d16; + auto vQualSum16 = hn::Zero(d16); + + for (i = blockStart; i + N <= blockEnd; i += N) { + const auto vQual = hn::LoadU(d, reinterpret_cast(qualstr + i)); + const auto vSeq = hn::LoadU(d, reinterpret_cast(seqstr + i)); + + const auto maskLowQ = hn::Lt(vQual, vThresh); + lowQual += hn::CountTrue(d, maskLowQ); + + const auto maskN = hn::Eq(vSeq, vN); + nBase += hn::CountTrue(d, maskN); + + const auto vQualAdj = hn::Sub(vQual, v33); + vQualSum16 = hn::Add(vQualSum16, hn::SumsOf2(vQualAdj)); + } + + const hn::ScalableTag d32; + qualSum += static_cast( + hn::ReduceSum(d32, hn::SumsOf2(vQualSum16))); + } + + // Scalar tail + for (; i < len; i++) { + uint8_t qual = static_cast(qualstr[i]); + qualSum += qual - 33; + if (qual < static_cast(qualThreshold)) lowQual++; + if (seqstr[i] == 'N') nBase++; + } + + lowQualNum = lowQual; + nBaseNum = nBase; + totalQual = qualSum; +} + +void ReverseComplementImpl(const char* src, char* dst, int len) { + const hn::ScalableTag d; + const int N = hn::Lanes(d); + + const auto vA = hn::Set(d, static_cast('A')); + const auto vT = hn::Set(d, static_cast('T')); + const auto vC = hn::Set(d, static_cast('C')); + const auto vG = hn::Set(d, static_cast('G')); + const auto va = hn::Set(d, static_cast('a')); + const auto vt = hn::Set(d, static_cast('t')); + const auto vc = hn::Set(d, static_cast('c')); + const auto vg = hn::Set(d, static_cast('g')); + const auto vNch = hn::Set(d, static_cast('N')); + + int i = 0; + for (; i + N <= len; i += N) { + const auto v = hn::LoadU(d, reinterpret_cast(src + i)); + + auto comp = vNch; + comp = hn::IfThenElse(hn::Eq(v, vA), vT, comp); + comp = hn::IfThenElse(hn::Eq(v, vT), vA, comp); + comp = hn::IfThenElse(hn::Eq(v, vC), vG, comp); + comp = hn::IfThenElse(hn::Eq(v, vG), vC, comp); + comp = hn::IfThenElse(hn::Eq(v, va), vT, comp); + comp = hn::IfThenElse(hn::Eq(v, vt), vA, comp); + comp = hn::IfThenElse(hn::Eq(v, vc), vG, comp); + comp = hn::IfThenElse(hn::Eq(v, vg), vC, comp); + + const auto revComp = hn::Reverse(d, comp); + hn::StoreU(revComp, d, reinterpret_cast(dst + len - i - N)); + } + + // Scalar tail + for (; i < len; i++) { + char base = src[i]; + char comp; + switch (base) { + case 'A': case 'a': comp = 'T'; break; + case 'T': case 't': comp = 'A'; break; + case 'C': case 'c': comp = 'G'; break; + case 'G': case 'g': comp = 'C'; break; + default: comp = 'N'; break; + } + dst[len - 1 - i] = comp; + } +} + +int CountAdjacentDiffsImpl(const char* data, int len) { + if (len <= 1) return 0; + + const hn::ScalableTag d; + const int N = hn::Lanes(d); + + int diff = 0; + int i = 0; + + for (; i + N < len; i += N) { + const auto v1 = hn::LoadU(d, reinterpret_cast(data + i)); + const auto v2 = hn::LoadU(d, reinterpret_cast(data + i + 1)); + const auto maskNe = hn::Ne(v1, v2); + diff += hn::CountTrue(d, maskNe); + } + + // Scalar tail + for (; i < len - 1; i++) { + if (data[i] != data[i + 1]) diff++; + } + + return diff; +} + +int CountMismatchesImpl(const char* a, const char* b, int len) { + const hn::ScalableTag d; + const int N = hn::Lanes(d); + + int diff = 0; + int i = 0; + + for (; i + N <= len; i += N) { + const auto va = hn::LoadU(d, reinterpret_cast(a + i)); + const auto vb = hn::LoadU(d, reinterpret_cast(b + i)); + const auto maskNe = hn::Ne(va, vb); + diff += hn::CountTrue(d, maskNe); + } + + // Scalar tail + for (; i < len; i++) { + if (a[i] != b[i]) diff++; + } + + return diff; +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace fastplong_simd +HWY_AFTER_NAMESPACE(); + +// ---- Dynamic dispatch wrappers (compiled once) ---- +#if HWY_ONCE + +#include +#include +#include + +namespace fastplong_simd { + +HWY_EXPORT(CountQualityMetricsImpl); +HWY_EXPORT(ReverseComplementImpl); +HWY_EXPORT(CountAdjacentDiffsImpl); +HWY_EXPORT(CountMismatchesImpl); + +void countQualityMetrics(const char* qualstr, const char* seqstr, int len, + char qualThreshold, int& lowQualNum, int& nBaseNum, + int& totalQual) { + HWY_DYNAMIC_DISPATCH(CountQualityMetricsImpl)(qualstr, seqstr, len, + qualThreshold, lowQualNum, + nBaseNum, totalQual); +} + +void reverseComplement(const char* src, char* dst, int len) { + HWY_DYNAMIC_DISPATCH(ReverseComplementImpl)(src, dst, len); +} + +int countAdjacentDiffs(const char* data, int len) { + return HWY_DYNAMIC_DISPATCH(CountAdjacentDiffsImpl)(data, len); +} + +int countMismatches(const char* a, const char* b, int len) { + return HWY_DYNAMIC_DISPATCH(CountMismatchesImpl)(a, b, len); +} + +// ---- Scalar reference implementations for testing ---- + +static void scalarCountQualityMetrics(const char* qualstr, const char* seqstr, + int len, char qualThreshold, + int& lowQualNum, int& nBaseNum, + int& totalQual) { + lowQualNum = 0; + nBaseNum = 0; + totalQual = 0; + for (int i = 0; i < len; i++) { + uint8_t q = static_cast(qualstr[i]); + totalQual += q - 33; + if (q < static_cast(qualThreshold)) lowQualNum++; + if (seqstr[i] == 'N') nBase++; + } +} + +static void scalarReverseComplement(const char* src, char* dst, int len) { + for (int i = 0; i < len; i++) { + char c; + switch (src[i]) { + case 'A': case 'a': c = 'T'; break; + case 'T': case 't': c = 'A'; break; + case 'C': case 'c': c = 'G'; break; + case 'G': case 'g': c = 'C'; break; + default: c = 'N'; break; + } + dst[len - 1 - i] = c; + } +} + +static int scalarCountAdjacentDiffs(const char* data, int len) { + int diff = 0; + for (int i = 0; i < len - 1; i++) { + if (data[i] != data[i + 1]) diff++; + } + return diff; +} + +static int scalarCountMismatches(const char* a, const char* b, int len) { + int diff = 0; + for (int i = 0; i < len; i++) { + if (a[i] != b[i]) diff++; + } + return diff; +} + +bool testSimd() { + bool pass = true; + + // --- reverseComplement --- + { + const char* in = "AAAATTTTCCCCGGGG"; + int len = 16; + char out[16]; + reverseComplement(in, out, len); + char ref[16]; + scalarReverseComplement(in, ref, len); + if (memcmp(out, ref, len) != 0) { + fprintf(stderr, "FAIL: reverseComplement basic\n"); + pass = false; + } + } + { + const char* in = "AaTtCcGgN"; + int len = 9; + char out[9]; + reverseComplement(in, out, len); + char ref[9]; + scalarReverseComplement(in, ref, len); + if (memcmp(out, ref, len) != 0) { + fprintf(stderr, "FAIL: reverseComplement mixed case\n"); + pass = false; + } + } + { + char out[1] = {'X'}; + reverseComplement("", out, 0); + } + { + char out[1]; + reverseComplement("A", out, 1); + if (out[0] != 'T') { + fprintf(stderr, "FAIL: reverseComplement len=1\n"); + pass = false; + } + } + { + const char* in = "ATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCG"; + int len = 68; + char out[68], ref[68]; + reverseComplement(in, out, len); + scalarReverseComplement(in, ref, len); + if (memcmp(out, ref, len) != 0) { + fprintf(stderr, "FAIL: reverseComplement long string\n"); + pass = false; + } + } + + // --- countQualityMetrics --- + { + const char* qual = "IIIII"; + const char* seq = "ACGTN"; + int lowQ = 0, nBase = 0, totalQ = 0; + countQualityMetrics(qual, seq, 5, '5', lowQ, nBase, totalQ); + int refLowQ = 0, refNBase = 0, refTotalQ = 0; + scalarCountQualityMetrics(qual, seq, 5, '5', refLowQ, refNBase, refTotalQ); + if (lowQ != refLowQ || nBase != refNBase || totalQ != refTotalQ) { + fprintf(stderr, "FAIL: countQualityMetrics basic\n"); + pass = false; + } + } + { + const char* qual = "!!!!!!!!!!"; + const char* seq = "AAAAAAAAAA"; + int lowQ = 0, nBase = 0, totalQ = 0; + countQualityMetrics(qual, seq, 10, '5', lowQ, nBase, totalQ); + int refLowQ = 0, refNBase = 0, refTotalQ = 0; + scalarCountQualityMetrics(qual, seq, 10, '5', refLowQ, refNBase, refTotalQ); + if (lowQ != refLowQ || nBase != refNBase || totalQ != refTotalQ) { + fprintf(stderr, "FAIL: countQualityMetrics all-low\n"); + pass = false; + } + } + { + const char qual[] = "IIIIII!!!!!IIIII55555NNNNN!!!!!IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII"; + const char seq[] = "ACGTNNACGTNACGTNACGTNACGTNACGTNACGTNACGTNACGTNACGTNACGTNACGTNACGTNACG"; + int len = 68; + int lowQ = 0, nBase = 0, totalQ = 0; + countQualityMetrics(qual, seq, len, '5', lowQ, nBase, totalQ); + int refLowQ = 0, refNBase = 0, refTotalQ = 0; + scalarCountQualityMetrics(qual, seq, len, '5', refLowQ, refNBase, refTotalQ); + if (lowQ != refLowQ || nBase != refNBase || totalQ != refTotalQ) { + fprintf(stderr, "FAIL: countQualityMetrics long\n"); + pass = false; + } + } + + // --- countAdjacentDiffs --- + { + int d = countAdjacentDiffs("AAAAAAAAAA", 10); + if (d != 0) { + fprintf(stderr, "FAIL: countAdjacentDiffs all-same got %d\n", d); + pass = false; + } + } + { + const char* s = "ACACACACAC"; + int d = countAdjacentDiffs(s, 10); + int ref = scalarCountAdjacentDiffs(s, 10); + if (d != ref) { + fprintf(stderr, "FAIL: countAdjacentDiffs all-diff got %d expected %d\n", d, ref); + pass = false; + } + } + { + if (countAdjacentDiffs("A", 1) != 0) { + fprintf(stderr, "FAIL: countAdjacentDiffs len=1\n"); + pass = false; + } + if (countAdjacentDiffs("", 0) != 0) { + fprintf(stderr, "FAIL: countAdjacentDiffs len=0\n"); + pass = false; + } + } + { + const char* s = "ATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCG"; + int len = 68; + int d = countAdjacentDiffs(s, len); + int ref = scalarCountAdjacentDiffs(s, len); + if (d != ref) { + fprintf(stderr, "FAIL: countAdjacentDiffs long got %d expected %d\n", d, ref); + pass = false; + } + } + + // --- countMismatches --- + { + const char* s = "ACGTACGTACGT"; + int d = countMismatches(s, s, 12); + if (d != 0) { + fprintf(stderr, "FAIL: countMismatches identical got %d\n", d); + pass = false; + } + } + { + int d = countMismatches("AAAA", "TTTT", 4); + if (d != 4) { + fprintf(stderr, "FAIL: countMismatches all-diff got %d\n", d); + pass = false; + } + } + { + const char* a = "ATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCG"; + const char* b = "ATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCG"; + int d = countMismatches(a, b, 68); + if (d != 0) { + fprintf(stderr, "FAIL: countMismatches long-identical got %d\n", d); + pass = false; + } + } + { + const char* a = "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"; + const char* b = "TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT"; + int d = countMismatches(a, b, 66); + if (d != 66) { + fprintf(stderr, "FAIL: countMismatches long-alldiff got %d\n", d); + pass = false; + } + } + { + int d = countMismatches("A", "T", 0); + if (d != 0) { + fprintf(stderr, "FAIL: countMismatches len=0 got %d\n", d); + pass = false; + } + } + + return pass; +} + +} // namespace fastplong_simd + +#endif // HWY_ONCE +``` + +NOTE: There is a bug in the scalar reference above — `nBase++` should be `nBaseNum++`. Fix it when implementing. Copy the exact code from fastp's `src/simd.cpp` as the reference, adjusting only the namespace from `fastp_simd` to `fastplong_simd`. + +**Step 3: Verify it compiles** + +Run: `make clean && make -j 2>&1 | tail -10` +Expected: Compiles without errors (simd.cpp may generate warnings but no errors) + +**Step 4: Commit** + +```bash +git add src/simd.h src/simd.cpp +git commit -m "feat: add centralized SIMD module with Highway multi-target dispatch" +``` + +--- + +### Task 4: Add SIMD unit test + +**Files:** +- Create: `test/simd_test.cpp` + +**Step 1: Create the test file** + +```cpp +#include +#include "../src/simd.h" + +TEST(SimdTest, allTests) { + EXPECT_TRUE(fastplong_simd::testSimd()); +} +``` + +**Step 2: Run tests** + +Run: `make clean && make test 2>&1 | tail -20` +Expected: All tests pass including SimdTest.allTests + +**Step 3: Commit** + +```bash +git add test/simd_test.cpp +git commit -m "test: add SIMD unit tests" +``` + +--- + +### Task 5: Refactor filter.cpp to use SIMD + +**Files:** +- Modify: `src/filter.cpp` + +**Step 1: Update filter.cpp** + +Add include at the top (after existing includes): +```cpp +#include "simd.h" +``` + +Replace the scalar loop in `passFilter()` (the `for(int i=0; iqualfilter.qualifiedQual, + lowQualNum, nBaseNum, totalQual); +``` + +Replace the scalar loop in `passLowComplexityFilter()` (the `for(int i=0; i&1 | tail -20` +Expected: All tests pass (FilterTest.trimAndCut still passes) + +**Step 3: Commit** + +```bash +git add src/filter.cpp +git commit -m "perf: use SIMD countQualityMetrics and countAdjacentDiffs in filter" +``` + +--- + +### Task 6: Refactor sequence.cpp to use SIMD + +**Files:** +- Modify: `src/sequence.cpp` + +**Step 1: Replace entire sequence.cpp** + +```cpp +#include "sequence.h" +#include "simd.h" + +Sequence::Sequence(){ +} + +Sequence::Sequence(string* seq){ + mStr = seq; +} + +Sequence::~Sequence(){ + if(mStr) + delete mStr; +} + +void Sequence::print(){ + std::cerr << *mStr; +} + +int Sequence::length(){ + return mStr->length(); +} + +string Sequence::reverseComplement(string* origin) { + int len = origin->length(); + string str(len, 0); + fastplong_simd::reverseComplement(origin->c_str(), &str[0], len); + return str; +} + +Sequence Sequence::reverseComplement() { + string* reversed = new string(Sequence::reverseComplement(mStr)); + return Sequence(reversed); +} + +Sequence Sequence::operator~(){ + return reverseComplement(); +} +``` + +Key changes: +- Removed `#include `, `#include `, `#include `, `#include "simdutil.h"` +- Removed `namespace hn = hwy::HWY_NAMESPACE;` +- Simplified `reverseComplement(string*)` to a 4-line function calling `fastplong_simd::reverseComplement` +- Removed the `#if _MSC_VER` / VLA / AllocateAligned branching + +**Step 2: Run tests** + +Run: `make clean && make test 2>&1 | tail -20` +Expected: All tests pass (SequenceTests.reverse still passes) + +**Step 3: Commit** + +```bash +git add src/sequence.cpp +git commit -m "refactor: use centralized SIMD reverseComplement in sequence.cpp" +``` + +--- + +### Task 7: Refactor adaptertrimmer.cpp to use SIMD + +**Files:** +- Modify: `src/adaptertrimmer.cpp` + +**Step 1: Update adaptertrimmer.cpp** + +Replace includes — change: +```cpp +#include "hwy/highway.h" +``` +to: +```cpp +#include "simd.h" +``` + +In `searchAdapter()`, remove the Highway setup at lines 61-63: +```cpp + namespace hn = hwy::HWY_NAMESPACE; + hn::ScalableTag d8; + const size_t N = hn::Lanes(d8); +``` + +Replace the three inner SIMD loops. Each occurrence of this pattern (appears at lines 89-96, 115-122, 139-146): +```cpp + size_t mismatch = 0; + for (size_t i = 0; i < alen; i += N) + { + const size_t lanesToLoad = min(alen - i, N); + const auto rdata_v = hn::LoadN(d8, reinterpret_cast(rdata + i + p), lanesToLoad); + const auto adata_v = hn::LoadN(d8, reinterpret_cast(adata + i), lanesToLoad); + const auto mismatch_mask = rdata_v != adata_v; + mismatch += hn::CountTrue(d8, mismatch_mask); + } +``` + +Replace each with: +```cpp + int mismatch = fastplong_simd::countMismatches(rdata + p, adata, alen); +``` + +Also change `size_t mismatch` → `int mismatch` in the comparison logic (the variable type is returned as `int` by `countMismatches`). + +**Step 2: Run tests** + +Run: `make clean && make test 2>&1 | tail -20` +Expected: All tests pass (AdapterTrimmer tests still pass) + +**Step 3: Commit** + +```bash +git add src/adaptertrimmer.cpp +git commit -m "refactor: use centralized SIMD countMismatches in adaptertrimmer" +``` + +--- + +### Task 8: Remove simdutil.h + +**Files:** +- Delete: `src/simdutil.h` + +**Step 1: Verify no remaining references** + +Run: `grep -r "simdutil" src/ test/` +Expected: No output (no files reference simdutil.h anymore after Task 6) + +**Step 2: Delete the file** + +```bash +rm src/simdutil.h +``` + +**Step 3: Run full build and tests** + +Run: `make clean && make test 2>&1 | tail -20` +Expected: All tests pass, no compilation errors + +**Step 4: Commit** + +```bash +git add -A src/simdutil.h +git commit -m "cleanup: remove simdutil.h, functionality merged into simd.cpp" +``` + +--- + +### Task 9: Final verification + +**Step 1: Clean build** + +Run: `make clean && make -j 2>&1` +Expected: Clean compilation with no errors or warnings + +**Step 2: Run all tests** + +Run: `make test 2>&1` +Expected: All tests pass + +**Step 3: Quick smoke test with real data (if available)** + +Run: `./fastplong --help` +Expected: Help output appears normally + +**Step 4: Verify no remaining direct Highway includes in source files (except simd.cpp)** + +Run: `grep -r '#include.*hwy/' src/ | grep -v simd.cpp` +Expected: No output — only simd.cpp should include Highway headers directly diff --git a/src/adaptertrimmer.cpp b/src/adaptertrimmer.cpp index ccc5ed5..b95c8ea 100644 --- a/src/adaptertrimmer.cpp +++ b/src/adaptertrimmer.cpp @@ -1,7 +1,7 @@ #include "adaptertrimmer.h" #include "editdistance.h" #include -#include "hwy/highway.h" +#include "simd.h" AdapterTrimmer::AdapterTrimmer(){ } @@ -58,10 +58,6 @@ int AdapterTrimmer::trimByMultiSequences(Read* r, FilterResult* fr, vector d8; - const size_t N = hn::Lanes(d8); - int minMismatch = 99999; // initialized with a large mismatch int pos = -1; // for the best match @@ -86,15 +82,7 @@ int AdapterTrimmer::searchAdapter(string *read, const string &adapter, double ed // go from left, and return immediatedly if find a mismatch < threshold for (int p = searchStart; p < searchEnd - alen; p++) { - size_t mismatch = 0; - for (size_t i = 0; i < alen; i += N) - { - const size_t lanesToLoad = min(alen - i, N); - const auto rdata_v = hn::LoadN(d8, reinterpret_cast(rdata + i + p), lanesToLoad); - const auto adata_v = hn::LoadN(d8, reinterpret_cast(adata + i), lanesToLoad); - const auto mismatch_mask = rdata_v != adata_v; - mismatch += hn::CountTrue(d8, mismatch_mask); - } + int mismatch = fastplong_simd::countMismatches(rdata + p, adata, alen); if (mismatch <= threshold) { @@ -112,15 +100,7 @@ int AdapterTrimmer::searchAdapter(string *read, const string &adapter, double ed // go from right, and return immediatedly if find a mismatch <= threshold for (int p = searchEnd - alen; p >= searchStart; p--) { - size_t mismatch = 0; - for (size_t i = 0; i < alen; i += N) - { - const size_t lanesToLoad = min(alen - i, N); - const auto rdata_v = hn::LoadN(d8, reinterpret_cast(rdata + i + p), lanesToLoad); - const auto adata_v = hn::LoadN(d8, reinterpret_cast(adata + i), lanesToLoad); - const auto mismatch_mask = rdata_v != adata_v; - mismatch += hn::CountTrue(d8, mismatch_mask); - } + int mismatch = fastplong_simd::countMismatches(rdata + p, adata, alen); if (mismatch <= threshold) { return p; @@ -136,15 +116,7 @@ int AdapterTrimmer::searchAdapter(string *read, const string &adapter, double ed { for (int p = searchStart; p < searchEnd - alen; p++) { - size_t mismatch = 0; - for (size_t i = 0; i < alen; i += N) - { - const size_t lanesToLoad = min(alen - i, N); - const auto rdata_v = hn::LoadN(d8, reinterpret_cast(rdata + i + p), lanesToLoad); - const auto adata_v = hn::LoadN(d8, reinterpret_cast(adata + i), lanesToLoad); - const auto mismatch_mask = rdata_v != adata_v; - mismatch += hn::CountTrue(d8, mismatch_mask); - } + int mismatch = fastplong_simd::countMismatches(rdata + p, adata, alen); if (mismatch < minMismatch) { minMismatch = mismatch; diff --git a/src/fastqreader.h b/src/fastqreader.h index 3ff41c1..5c9dfe6 100644 --- a/src/fastqreader.h +++ b/src/fastqreader.h @@ -32,7 +32,7 @@ SOFTWARE. #include #include #include "readpool.h" -#include +#include "igzip_lib.h" class FastqReader{ public: diff --git a/src/filter.cpp b/src/filter.cpp index 07847ef..df1d275 100644 --- a/src/filter.cpp +++ b/src/filter.cpp @@ -1,5 +1,6 @@ #include "processor.h" #include "seprocessor.h" +#include "simd.h" Filter::Filter(Options* opt){ mOptions = opt; @@ -24,18 +25,9 @@ int Filter::passFilter(Read* r) { const char* seqstr = r->mSeq->c_str(); const char* qualstr = r->mQuality->c_str(); - for(int i=0; iqualfilter.qualifiedQual) - lowQualNum ++; - - if(base == 'N') - nBaseNum++; - } + fastplong_simd::countQualityMetrics(qualstr, seqstr, rlen, + mOptions->qualfilter.qualifiedQual, + lowQualNum, nBaseNum, totalQual); } if(mOptions->qualfilter.enabled) { @@ -65,19 +57,12 @@ int Filter::passFilter(Read* r) { } bool Filter::passLowComplexityFilter(Read* r) { - int diff = 0; int length = r->length(); if(length <= 1) return false; const char* data = r->mSeq->c_str(); - for(int i=0; i= mOptions->complexityFilter.threshold ) - return true; - else - return false; + int diff = fastplong_simd::countAdjacentDiffs(data, length); + return (double)diff/(double)(length-1) >= mOptions->complexityFilter.threshold; } vector> Filter::detectLowQualityRegions(Read* r, int windowSize, int quality) { diff --git a/src/libdeflate.h b/src/libdeflate.h deleted file mode 100644 index 4e124e7..0000000 --- a/src/libdeflate.h +++ /dev/null @@ -1,362 +0,0 @@ -/* - * libdeflate.h - public header for libdeflate - */ - -#ifndef LIBDEFLATE_H -#define LIBDEFLATE_H - -#ifdef __cplusplus -extern "C" { -#endif - -#define LIBDEFLATE_VERSION_MAJOR 1 -#define LIBDEFLATE_VERSION_MINOR 6 -#define LIBDEFLATE_VERSION_STRING "1.6" - -#include -#include - -/* - * On Windows, if you want to link to the DLL version of libdeflate, then - * #define LIBDEFLATE_DLL. Note that the calling convention is "stdcall". - */ -#ifdef LIBDEFLATE_DLL -# ifdef BUILDING_LIBDEFLATE -# define LIBDEFLATEEXPORT LIBEXPORT -# elif defined(_WIN32) || defined(__CYGWIN__) -# define LIBDEFLATEEXPORT __declspec(dllimport) -# endif -#endif -#ifndef LIBDEFLATEEXPORT -# define LIBDEFLATEEXPORT -#endif - -#if defined(_WIN32) && !defined(_WIN64) -# define LIBDEFLATEAPI_ABI __stdcall -#else -# define LIBDEFLATEAPI_ABI -#endif - -#if defined(BUILDING_LIBDEFLATE) && defined(__GNUC__) && \ - defined(_WIN32) && !defined(_WIN64) - /* - * On 32-bit Windows, gcc assumes 16-byte stack alignment but MSVC only 4. - * Realign the stack when entering libdeflate to avoid crashing in SSE/AVX - * code when called from an MSVC-compiled application. - */ -# define LIBDEFLATEAPI_STACKALIGN __attribute__((force_align_arg_pointer)) -#else -# define LIBDEFLATEAPI_STACKALIGN -#endif - -#define LIBDEFLATEAPI LIBDEFLATEAPI_ABI LIBDEFLATEAPI_STACKALIGN - -/* ========================================================================== */ -/* Compression */ -/* ========================================================================== */ - -struct libdeflate_compressor; - -/* - * libdeflate_alloc_compressor() allocates a new compressor that supports - * DEFLATE, zlib, and gzip compression. 'compression_level' is the compression - * level on a zlib-like scale but with a higher maximum value (1 = fastest, 6 = - * medium/default, 9 = slow, 12 = slowest). The return value is a pointer to - * the new compressor, or NULL if out of memory. - * - * Note: for compression, the sliding window size is defined at compilation time - * to 32768, the largest size permissible in the DEFLATE format. It cannot be - * changed at runtime. - * - * A single compressor is not safe to use by multiple threads concurrently. - * However, different threads may use different compressors concurrently. - */ -LIBDEFLATEEXPORT struct libdeflate_compressor * LIBDEFLATEAPI -libdeflate_alloc_compressor(int compression_level); - -/* - * libdeflate_deflate_compress() performs raw DEFLATE compression on a buffer of - * data. The function attempts to compress 'in_nbytes' bytes of data located at - * 'in' and write the results to 'out', which has space for 'out_nbytes_avail' - * bytes. The return value is the compressed size in bytes, or 0 if the data - * could not be compressed to 'out_nbytes_avail' bytes or fewer. - */ -LIBDEFLATEEXPORT size_t LIBDEFLATEAPI -libdeflate_deflate_compress(struct libdeflate_compressor *compressor, - const void *in, size_t in_nbytes, - void *out, size_t out_nbytes_avail); - -/* - * libdeflate_deflate_compress_bound() returns a worst-case upper bound on the - * number of bytes of compressed data that may be produced by compressing any - * buffer of length less than or equal to 'in_nbytes' using - * libdeflate_deflate_compress() with the specified compressor. Mathematically, - * this bound will necessarily be a number greater than or equal to 'in_nbytes'. - * It may be an overestimate of the true upper bound. The return value is - * guaranteed to be the same for all invocations with the same compressor and - * same 'in_nbytes'. - * - * As a special case, 'compressor' may be NULL. This causes the bound to be - * taken across *any* libdeflate_compressor that could ever be allocated with - * this build of the library, with any options. - * - * Note that this function is not necessary in many applications. With - * block-based compression, it is usually preferable to separately store the - * uncompressed size of each block and to store any blocks that did not compress - * to less than their original size uncompressed. In that scenario, there is no - * need to know the worst-case compressed size, since the maximum number of - * bytes of compressed data that may be used would always be one less than the - * input length. You can just pass a buffer of that size to - * libdeflate_deflate_compress() and store the data uncompressed if - * libdeflate_deflate_compress() returns 0, indicating that the compressed data - * did not fit into the provided output buffer. - */ -LIBDEFLATEEXPORT size_t LIBDEFLATEAPI -libdeflate_deflate_compress_bound(struct libdeflate_compressor *compressor, - size_t in_nbytes); - -/* - * Like libdeflate_deflate_compress(), but stores the data in the zlib wrapper - * format. - */ -LIBDEFLATEEXPORT size_t LIBDEFLATEAPI -libdeflate_zlib_compress(struct libdeflate_compressor *compressor, - const void *in, size_t in_nbytes, - void *out, size_t out_nbytes_avail); - -/* - * Like libdeflate_deflate_compress_bound(), but assumes the data will be - * compressed with libdeflate_zlib_compress() rather than with - * libdeflate_deflate_compress(). - */ -LIBDEFLATEEXPORT size_t LIBDEFLATEAPI -libdeflate_zlib_compress_bound(struct libdeflate_compressor *compressor, - size_t in_nbytes); - -/* - * Like libdeflate_deflate_compress(), but stores the data in the gzip wrapper - * format. - */ -LIBDEFLATEEXPORT size_t LIBDEFLATEAPI -libdeflate_gzip_compress(struct libdeflate_compressor *compressor, - const void *in, size_t in_nbytes, - void *out, size_t out_nbytes_avail); - -/* - * Like libdeflate_deflate_compress_bound(), but assumes the data will be - * compressed with libdeflate_gzip_compress() rather than with - * libdeflate_deflate_compress(). - */ -LIBDEFLATEEXPORT size_t LIBDEFLATEAPI -libdeflate_gzip_compress_bound(struct libdeflate_compressor *compressor, - size_t in_nbytes); - -/* - * libdeflate_free_compressor() frees a compressor that was allocated with - * libdeflate_alloc_compressor(). If a NULL pointer is passed in, no action is - * taken. - */ -LIBDEFLATEEXPORT void LIBDEFLATEAPI -libdeflate_free_compressor(struct libdeflate_compressor *compressor); - -/* ========================================================================== */ -/* Decompression */ -/* ========================================================================== */ - -struct libdeflate_decompressor; - -/* - * libdeflate_alloc_decompressor() allocates a new decompressor that can be used - * for DEFLATE, zlib, and gzip decompression. The return value is a pointer to - * the new decompressor, or NULL if out of memory. - * - * This function takes no parameters, and the returned decompressor is valid for - * decompressing data that was compressed at any compression level and with any - * sliding window size. - * - * A single decompressor is not safe to use by multiple threads concurrently. - * However, different threads may use different decompressors concurrently. - */ -LIBDEFLATEEXPORT struct libdeflate_decompressor * LIBDEFLATEAPI -libdeflate_alloc_decompressor(void); - -/* - * Result of a call to libdeflate_deflate_decompress(), - * libdeflate_zlib_decompress(), or libdeflate_gzip_decompress(). - */ -enum libdeflate_result { - /* Decompression was successful. */ - LIBDEFLATE_SUCCESS = 0, - - /* Decompressed failed because the compressed data was invalid, corrupt, - * or otherwise unsupported. */ - LIBDEFLATE_BAD_DATA = 1, - - /* A NULL 'actual_out_nbytes_ret' was provided, but the data would have - * decompressed to fewer than 'out_nbytes_avail' bytes. */ - LIBDEFLATE_SHORT_OUTPUT = 2, - - /* The data would have decompressed to more than 'out_nbytes_avail' - * bytes. */ - LIBDEFLATE_INSUFFICIENT_SPACE = 3, -}; - -/* - * libdeflate_deflate_decompress() decompresses the DEFLATE-compressed stream - * from the buffer 'in' with compressed size up to 'in_nbytes' bytes. The - * uncompressed data is written to 'out', a buffer with size 'out_nbytes_avail' - * bytes. If decompression succeeds, then 0 (LIBDEFLATE_SUCCESS) is returned. - * Otherwise, a nonzero result code such as LIBDEFLATE_BAD_DATA is returned. If - * a nonzero result code is returned, then the contents of the output buffer are - * undefined. - * - * Decompression stops at the end of the DEFLATE stream (as indicated by the - * BFINAL flag), even if it is actually shorter than 'in_nbytes' bytes. - * - * libdeflate_deflate_decompress() can be used in cases where the actual - * uncompressed size is known (recommended) or unknown (not recommended): - * - * - If the actual uncompressed size is known, then pass the actual - * uncompressed size as 'out_nbytes_avail' and pass NULL for - * 'actual_out_nbytes_ret'. This makes libdeflate_deflate_decompress() fail - * with LIBDEFLATE_SHORT_OUTPUT if the data decompressed to fewer than the - * specified number of bytes. - * - * - If the actual uncompressed size is unknown, then provide a non-NULL - * 'actual_out_nbytes_ret' and provide a buffer with some size - * 'out_nbytes_avail' that you think is large enough to hold all the - * uncompressed data. In this case, if the data decompresses to less than - * or equal to 'out_nbytes_avail' bytes, then - * libdeflate_deflate_decompress() will write the actual uncompressed size - * to *actual_out_nbytes_ret and return 0 (LIBDEFLATE_SUCCESS). Otherwise, - * it will return LIBDEFLATE_INSUFFICIENT_SPACE if the provided buffer was - * not large enough but no other problems were encountered, or another - * nonzero result code if decompression failed for another reason. - */ -LIBDEFLATEEXPORT enum libdeflate_result LIBDEFLATEAPI -libdeflate_deflate_decompress(struct libdeflate_decompressor *decompressor, - const void *in, size_t in_nbytes, - void *out, size_t out_nbytes_avail, - size_t *actual_out_nbytes_ret); - -/* - * Like libdeflate_deflate_decompress(), but adds the 'actual_in_nbytes_ret' - * argument. If decompression succeeds and 'actual_in_nbytes_ret' is not NULL, - * then the actual compressed size of the DEFLATE stream (aligned to the next - * byte boundary) is written to *actual_in_nbytes_ret. - */ -LIBDEFLATEEXPORT enum libdeflate_result LIBDEFLATEAPI -libdeflate_deflate_decompress_ex(struct libdeflate_decompressor *decompressor, - const void *in, size_t in_nbytes, - void *out, size_t out_nbytes_avail, - size_t *actual_in_nbytes_ret, - size_t *actual_out_nbytes_ret); - -/* - * Like libdeflate_deflate_decompress(), but assumes the zlib wrapper format - * instead of raw DEFLATE. - * - * Decompression will stop at the end of the zlib stream, even if it is shorter - * than 'in_nbytes'. If you need to know exactly where the zlib stream ended, - * use libdeflate_zlib_decompress_ex(). - */ -LIBDEFLATEEXPORT enum libdeflate_result LIBDEFLATEAPI -libdeflate_zlib_decompress(struct libdeflate_decompressor *decompressor, - const void *in, size_t in_nbytes, - void *out, size_t out_nbytes_avail, - size_t *actual_out_nbytes_ret); - -/* - * Like libdeflate_zlib_decompress(), but adds the 'actual_in_nbytes_ret' - * argument. If 'actual_in_nbytes_ret' is not NULL and the decompression - * succeeds (indicating that the first zlib-compressed stream in the input - * buffer was decompressed), then the actual number of input bytes consumed is - * written to *actual_in_nbytes_ret. - */ -LIBDEFLATEEXPORT enum libdeflate_result LIBDEFLATEAPI -libdeflate_zlib_decompress_ex(struct libdeflate_decompressor *decompressor, - const void *in, size_t in_nbytes, - void *out, size_t out_nbytes_avail, - size_t *actual_in_nbytes_ret, - size_t *actual_out_nbytes_ret); - -/* - * Like libdeflate_deflate_decompress(), but assumes the gzip wrapper format - * instead of raw DEFLATE. - * - * If multiple gzip-compressed members are concatenated, then only the first - * will be decompressed. Use libdeflate_gzip_decompress_ex() if you need - * multi-member support. - */ -LIBDEFLATEEXPORT enum libdeflate_result LIBDEFLATEAPI -libdeflate_gzip_decompress(struct libdeflate_decompressor *decompressor, - const void *in, size_t in_nbytes, - void *out, size_t out_nbytes_avail, - size_t *actual_out_nbytes_ret); - -/* - * Like libdeflate_gzip_decompress(), but adds the 'actual_in_nbytes_ret' - * argument. If 'actual_in_nbytes_ret' is not NULL and the decompression - * succeeds (indicating that the first gzip-compressed member in the input - * buffer was decompressed), then the actual number of input bytes consumed is - * written to *actual_in_nbytes_ret. - */ -LIBDEFLATEEXPORT enum libdeflate_result LIBDEFLATEAPI -libdeflate_gzip_decompress_ex(struct libdeflate_decompressor *decompressor, - const void *in, size_t in_nbytes, - void *out, size_t out_nbytes_avail, - size_t *actual_in_nbytes_ret, - size_t *actual_out_nbytes_ret); - -/* - * libdeflate_free_decompressor() frees a decompressor that was allocated with - * libdeflate_alloc_decompressor(). If a NULL pointer is passed in, no action - * is taken. - */ -LIBDEFLATEEXPORT void LIBDEFLATEAPI -libdeflate_free_decompressor(struct libdeflate_decompressor *decompressor); - -/* ========================================================================== */ -/* Checksums */ -/* ========================================================================== */ - -/* - * libdeflate_adler32() updates a running Adler-32 checksum with 'len' bytes of - * data and returns the updated checksum. When starting a new checksum, the - * required initial value for 'adler' is 1. This value is also returned when - * 'buffer' is specified as NULL. - */ -LIBDEFLATEEXPORT uint32_t LIBDEFLATEAPI -libdeflate_adler32(uint32_t adler32, const void *buffer, size_t len); - - -/* - * libdeflate_crc32() updates a running CRC-32 checksum with 'len' bytes of data - * and returns the updated checksum. When starting a new checksum, the required - * initial value for 'crc' is 0. This value is also returned when 'buffer' is - * specified as NULL. - */ -LIBDEFLATEEXPORT uint32_t LIBDEFLATEAPI -libdeflate_crc32(uint32_t crc, const void *buffer, size_t len); - -/* ========================================================================== */ -/* Custom memory allocator */ -/* ========================================================================== */ - -/* - * Install a custom memory allocator which libdeflate will use for all memory - * allocations. 'malloc_func' is a function that must behave like malloc(), and - * 'free_func' is a function that must behave like free(). - * - * There must not be any libdeflate_compressor or libdeflate_decompressor - * structures in existence when calling this function. - */ -LIBDEFLATEEXPORT void LIBDEFLATEAPI -libdeflate_set_memory_allocator(void *(*malloc_func)(size_t), - void (*free_func)(void *)); - -#ifdef __cplusplus -} -#endif - -#endif /* LIBDEFLATE_H */ diff --git a/src/sequence.cpp b/src/sequence.cpp index 7ec0c42..0adb678 100644 --- a/src/sequence.cpp +++ b/src/sequence.cpp @@ -1,10 +1,5 @@ #include "sequence.h" -#include -#include -#include -#include "simdutil.h" - -namespace hn = hwy::HWY_NAMESPACE; +#include "simd.h" Sequence::Sequence(){ } @@ -26,54 +21,11 @@ int Sequence::length(){ return mStr->length(); } -string Sequence::reverseComplement(string *HWY_RESTRICT origin) { - auto length = origin->length(); - const hn::ScalableTag d; - const auto sequence = reinterpret_cast(origin->c_str()); - const auto transform = [](const auto d, auto output, const auto sequence) HWY_ATTR - { - const auto A = hn::Set(d, 'A'); - const auto T = hn::Set(d, 'T'); - const auto C = hn::Set(d, 'C'); - const auto G = hn::Set(d, 'G'); - const auto a = hn::Set(d, 'a'); - const auto t = hn::Set(d, 't'); - const auto c = hn::Set(d, 'c'); - const auto g = hn::Set(d, 'g'); - const auto N = hn::Set(d, 'N'); - - // output[i] = sequence[i] == 'A' || sequence[i] == 'a' ? 'T' : 'N' - output = hn::IfThenElse(hn::Or(hn::Eq(sequence, A), hn::Eq(sequence, a)), T, N); - output = hn::IfThenElse(hn::Or(hn::Eq(sequence, T), hn::Eq(sequence, t)), A, output); - output = hn::IfThenElse(hn::Or(hn::Eq(sequence, C), hn::Eq(sequence, c)), G, output); - output = hn::IfThenElse(hn::Or(hn::Eq(sequence, G), hn::Eq(sequence, g)), C, output); - return output; - }; -#if _MSC_VER - if (true) { -#else - if (length <= 1000000) { -#endif -#if _MSC_VER - auto outputPtr = std::make_unique(length); - uint8_t* output = outputPtr.get(); -#else - uint8_t output[length]; -#endif - hn::Transform1Reversed(d, output, length, sequence, transform); - auto retVal = reinterpret_cast(output); - std::string reversed(retVal, length); - return reversed; - } -#ifndef _MSC_VER - else { - const auto allocated = hwy::AllocateAligned(length); - hn::Transform1Reversed(d, allocated.get(), length, sequence, transform); - auto retVal = reinterpret_cast(allocated.get()); - std::string reversed(retVal, length); - return reversed; - } -#endif +string Sequence::reverseComplement(string* origin) { + int len = origin->length(); + string str(len, 0); + fastplong_simd::reverseComplement(origin->c_str(), &str[0], len); + return str; } Sequence Sequence::reverseComplement() { diff --git a/src/simd.cpp b/src/simd.cpp new file mode 100644 index 0000000..3f1ba62 --- /dev/null +++ b/src/simd.cpp @@ -0,0 +1,452 @@ +// SIMD-accelerated functions for fastplong using Google Highway. +// This file is compiled once per SIMD target via foreach_target.h. + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "src/simd.cpp" +#include "hwy/foreach_target.h" // IWYU pragma: keep +#include "hwy/highway.h" + +HWY_BEFORE_NAMESPACE(); +namespace fastplong_simd { +namespace HWY_NAMESPACE { + +namespace hn = hwy::HWY_NAMESPACE; + +void CountQualityMetricsImpl(const char* qualstr, const char* seqstr, int len, + char qualThreshold, int& lowQualNum, int& nBaseNum, + int& totalQual) { + const hn::ScalableTag d; + const int N = hn::Lanes(d); + + const auto vThresh = hn::Set(d, static_cast(qualThreshold)); + const auto vN = hn::Set(d, static_cast('N')); + const auto v33 = hn::Set(d, static_cast(33)); + + int lowQual = 0; + int nBase = 0; + int qualSum = 0; + int i = 0; + + // Process in blocks of up to 255 vectors to avoid u16 overflow. + // SumsOf2 produces at most 2*93 = 186 per u16 lane (valid FASTQ range). + // Over 255 iterations: 255 * 186 = 47430, safely within u16 (max 65535). + const int blockSize = 255 * N; + + for (int blockStart = 0; blockStart < len; blockStart += blockSize) { + int blockEnd = blockStart + blockSize; + if (blockEnd > len) blockEnd = len; + + // Use u16 accumulator for quality sum within this block. + // SumsOf2 pairwise-adds adjacent u8 lanes into u16, avoiding + // PromoteUpperTo which is unavailable on HWY_SCALAR. + const hn::ScalableTag d16; + auto vQualSum16 = hn::Zero(d16); + + for (i = blockStart; i + N <= blockEnd; i += N) { + const auto vQual = hn::LoadU(d, reinterpret_cast(qualstr + i)); + const auto vSeq = hn::LoadU(d, reinterpret_cast(seqstr + i)); + + // Count bases with quality < threshold + const auto maskLowQ = hn::Lt(vQual, vThresh); + lowQual += hn::CountTrue(d, maskLowQ); + + // Count N bases + const auto maskN = hn::Eq(vSeq, vN); + nBase += hn::CountTrue(d, maskN); + + // Subtract 33 and accumulate quality into u16 accumulator. + // SumsOf2 adds adjacent u8 pairs -> u16, works on all targets. + const auto vQualAdj = hn::Sub(vQual, v33); + vQualSum16 = hn::Add(vQualSum16, hn::SumsOf2(vQualAdj)); + } + + // Reduce u16 accumulator to scalar via SumsOf2 -> u32 then ReduceSum + const hn::ScalableTag d32; + qualSum += static_cast( + hn::ReduceSum(d32, hn::SumsOf2(vQualSum16))); + } + + // Scalar tail + for (; i < len; i++) { + uint8_t qual = static_cast(qualstr[i]); + qualSum += qual - 33; + if (qual < static_cast(qualThreshold)) lowQual++; + if (seqstr[i] == 'N') nBase++; + } + + lowQualNum = lowQual; + nBaseNum = nBase; + totalQual = qualSum; +} + +void ReverseComplementImpl(const char* src, char* dst, int len) { + const hn::ScalableTag d; + const int N = hn::Lanes(d); + + // Complement via single table lookup on low nibble of ASCII code. + // DNA base low nibbles: A/a=1, C/c=3, T/t=4, G/g=7, N=14. + // Uppercase and lowercase share the same low nibble, so one table + // handles both. Unmapped nibbles default to 'N'. + HWY_ALIGN static const uint8_t kComplement[16] = { + 'N', 'T', 'N', 'G', 'A', 'N', 'N', 'C', + 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N' + }; + const auto table = hn::LoadDup128(d, kComplement); + const auto vMask = hn::Set(d, static_cast(0x0F)); + + int i = 0; + for (; i + N <= len; i += N) { + const auto v = hn::LoadU(d, reinterpret_cast(src + i)); + const auto indices = hn::And(v, vMask); + const auto comp = hn::TableLookupBytes(table, indices); + + // Reverse and store at mirrored position + const auto revComp = hn::Reverse(d, comp); + hn::StoreU(revComp, d, reinterpret_cast(dst + len - i - N)); + } + + // Scalar tail + for (; i < len; i++) { + char base = src[i]; + char comp; + switch (base) { + case 'A': case 'a': comp = 'T'; break; + case 'T': case 't': comp = 'A'; break; + case 'C': case 'c': comp = 'G'; break; + case 'G': case 'g': comp = 'C'; break; + default: comp = 'N'; break; + } + dst[len - 1 - i] = comp; + } +} + +int CountAdjacentDiffsImpl(const char* data, int len) { + if (len <= 1) return 0; + + const hn::ScalableTag d; + const int N = hn::Lanes(d); + + int diff = 0; + int i = 0; + + // Compare data[i..i+N-1] with data[i+1..i+N] + for (; i + N < len; i += N) { + const auto v1 = hn::LoadU(d, reinterpret_cast(data + i)); + const auto v2 = hn::LoadU(d, reinterpret_cast(data + i + 1)); + const auto maskNe = hn::Ne(v1, v2); + diff += hn::CountTrue(d, maskNe); + } + + // Scalar tail + for (; i < len - 1; i++) { + if (data[i] != data[i + 1]) diff++; + } + + return diff; +} + +int CountMismatchesImpl(const char* a, const char* b, int len) { + const hn::ScalableTag d; + const int N = hn::Lanes(d); + + int diff = 0; + int i = 0; + + for (; i + N <= len; i += N) { + const auto va = hn::LoadU(d, reinterpret_cast(a + i)); + const auto vb = hn::LoadU(d, reinterpret_cast(b + i)); + const auto maskNe = hn::Ne(va, vb); + diff += hn::CountTrue(d, maskNe); + } + + // Scalar tail + for (; i < len; i++) { + if (a[i] != b[i]) diff++; + } + + return diff; +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace fastplong_simd +HWY_AFTER_NAMESPACE(); + +// ---- Dynamic dispatch wrappers (compiled once) ---- +#if HWY_ONCE + +#include +#include +#include + +namespace fastplong_simd { + +HWY_EXPORT(CountQualityMetricsImpl); +HWY_EXPORT(ReverseComplementImpl); +HWY_EXPORT(CountAdjacentDiffsImpl); +HWY_EXPORT(CountMismatchesImpl); + +void countQualityMetrics(const char* qualstr, const char* seqstr, int len, + char qualThreshold, int& lowQualNum, int& nBaseNum, + int& totalQual) { + HWY_DYNAMIC_DISPATCH(CountQualityMetricsImpl)(qualstr, seqstr, len, + qualThreshold, lowQualNum, + nBaseNum, totalQual); +} + +void reverseComplement(const char* src, char* dst, int len) { + HWY_DYNAMIC_DISPATCH(ReverseComplementImpl)(src, dst, len); +} + +int countAdjacentDiffs(const char* data, int len) { + return HWY_DYNAMIC_DISPATCH(CountAdjacentDiffsImpl)(data, len); +} + +int countMismatches(const char* a, const char* b, int len) { + return HWY_DYNAMIC_DISPATCH(CountMismatchesImpl)(a, b, len); +} + +// ---- Scalar reference implementations for testing ---- + +static void scalarCountQualityMetrics(const char* qualstr, const char* seqstr, + int len, char qualThreshold, + int& lowQualNum, int& nBaseNum, + int& totalQual) { + lowQualNum = 0; + nBaseNum = 0; + totalQual = 0; + for (int i = 0; i < len; i++) { + uint8_t q = static_cast(qualstr[i]); + totalQual += q - 33; + if (q < static_cast(qualThreshold)) lowQualNum++; + if (seqstr[i] == 'N') nBaseNum++; + } +} + +static void scalarReverseComplement(const char* src, char* dst, int len) { + for (int i = 0; i < len; i++) { + char c; + switch (src[i]) { + case 'A': case 'a': c = 'T'; break; + case 'T': case 't': c = 'A'; break; + case 'C': case 'c': c = 'G'; break; + case 'G': case 'g': c = 'C'; break; + default: c = 'N'; break; + } + dst[len - 1 - i] = c; + } +} + +static int scalarCountAdjacentDiffs(const char* data, int len) { + int diff = 0; + for (int i = 0; i < len - 1; i++) { + if (data[i] != data[i + 1]) diff++; + } + return diff; +} + +static int scalarCountMismatches(const char* a, const char* b, int len) { + int diff = 0; + for (int i = 0; i < len; i++) { + if (a[i] != b[i]) diff++; + } + return diff; +} + +bool testSimd() { + bool pass = true; + + // --- reverseComplement --- + // Basic case + { + const char* in = "AAAATTTTCCCCGGGG"; + int len = 16; + char out[16]; + reverseComplement(in, out, len); + char ref[16]; + scalarReverseComplement(in, ref, len); + if (memcmp(out, ref, len) != 0) { + fprintf(stderr, "FAIL: reverseComplement basic\n"); + pass = false; + } + } + // Mixed case + { + const char* in = "AaTtCcGgN"; + int len = 9; + char out[9]; + reverseComplement(in, out, len); + char ref[9]; + scalarReverseComplement(in, ref, len); + if (memcmp(out, ref, len) != 0) { + fprintf(stderr, "FAIL: reverseComplement mixed case\n"); + pass = false; + } + } + // Empty + { + char out[1] = {'X'}; + reverseComplement("", out, 0); + // Should not crash; out unchanged + } + // Length 1 + { + char out[1]; + reverseComplement("A", out, 1); + if (out[0] != 'T') { + fprintf(stderr, "FAIL: reverseComplement len=1\n"); + pass = false; + } + } + // Long string (> typical SIMD width) to exercise SIMD + tail + { + const char* in = "ATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCG"; + int len = 68; + char out[68], ref[68]; + reverseComplement(in, out, len); + scalarReverseComplement(in, ref, len); + if (memcmp(out, ref, len) != 0) { + fprintf(stderr, "FAIL: reverseComplement long string\n"); + pass = false; + } + } + + // --- countQualityMetrics --- + // Basic + { + // qual: "IIIII" (phred 40 each), seq: "ACGTN" + const char* qual = "IIIII"; + const char* seq = "ACGTN"; + int lowQ = 0, nBase = 0, totalQ = 0; + countQualityMetrics(qual, seq, 5, '5', lowQ, nBase, totalQ); + int refLowQ = 0, refNBase = 0, refTotalQ = 0; + scalarCountQualityMetrics(qual, seq, 5, '5', refLowQ, refNBase, refTotalQ); + if (lowQ != refLowQ || nBase != refNBase || totalQ != refTotalQ) { + fprintf(stderr, "FAIL: countQualityMetrics basic (lowQ=%d/%d nBase=%d/%d totalQ=%d/%d)\n", + lowQ, refLowQ, nBase, refNBase, totalQ, refTotalQ); + pass = false; + } + } + // All low quality + { + const char* qual = "!!!!!!!!!!"; // phred 0 + const char* seq = "AAAAAAAAAA"; + int lowQ = 0, nBase = 0, totalQ = 0; + countQualityMetrics(qual, seq, 10, '5', lowQ, nBase, totalQ); + int refLowQ = 0, refNBase = 0, refTotalQ = 0; + scalarCountQualityMetrics(qual, seq, 10, '5', refLowQ, refNBase, refTotalQ); + if (lowQ != refLowQ || nBase != refNBase || totalQ != refTotalQ) { + fprintf(stderr, "FAIL: countQualityMetrics all-low\n"); + pass = false; + } + } + // Long string with mixed content + { + const char qual[] = "IIIIII!!!!!IIIII55555NNNNN!!!!!IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII"; + const char seq[] = "ACGTNNACGTNACGTNACGTNACGTNACGTNACGTNACGTNACGTNACGTNACGTNACGTNACGTNACG"; + int len = 68; + int lowQ = 0, nBase = 0, totalQ = 0; + countQualityMetrics(qual, seq, len, '5', lowQ, nBase, totalQ); + int refLowQ = 0, refNBase = 0, refTotalQ = 0; + scalarCountQualityMetrics(qual, seq, len, '5', refLowQ, refNBase, refTotalQ); + if (lowQ != refLowQ || nBase != refNBase || totalQ != refTotalQ) { + fprintf(stderr, "FAIL: countQualityMetrics long (lowQ=%d/%d nBase=%d/%d totalQ=%d/%d)\n", + lowQ, refLowQ, nBase, refNBase, totalQ, refTotalQ); + pass = false; + } + } + + // --- countAdjacentDiffs --- + // All same + { + int d = countAdjacentDiffs("AAAAAAAAAA", 10); + if (d != 0) { + fprintf(stderr, "FAIL: countAdjacentDiffs all-same got %d\n", d); + pass = false; + } + } + // All different + { + const char* s = "ACACACACAC"; + int d = countAdjacentDiffs(s, 10); + int ref = scalarCountAdjacentDiffs(s, 10); + if (d != ref) { + fprintf(stderr, "FAIL: countAdjacentDiffs all-diff got %d expected %d\n", d, ref); + pass = false; + } + } + // Length 0 and 1 + { + if (countAdjacentDiffs("A", 1) != 0) { + fprintf(stderr, "FAIL: countAdjacentDiffs len=1\n"); + pass = false; + } + if (countAdjacentDiffs("", 0) != 0) { + fprintf(stderr, "FAIL: countAdjacentDiffs len=0\n"); + pass = false; + } + } + // Long string + { + const char* s = "ATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCG"; + int len = 68; + int d = countAdjacentDiffs(s, len); + int ref = scalarCountAdjacentDiffs(s, len); + if (d != ref) { + fprintf(stderr, "FAIL: countAdjacentDiffs long got %d expected %d\n", d, ref); + pass = false; + } + } + + // --- countMismatches --- + // Identical + { + const char* s = "ACGTACGTACGT"; + int d = countMismatches(s, s, 12); + if (d != 0) { + fprintf(stderr, "FAIL: countMismatches identical got %d\n", d); + pass = false; + } + } + // All different + { + int d = countMismatches("AAAA", "TTTT", 4); + if (d != 4) { + fprintf(stderr, "FAIL: countMismatches all-diff got %d\n", d); + pass = false; + } + } + // Long string + { + const char* a = "ATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCG"; + const char* b = "ATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCG"; + int d = countMismatches(a, b, 68); + if (d != 0) { + fprintf(stderr, "FAIL: countMismatches long-identical got %d\n", d); + pass = false; + } + } + { + const char* a = "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"; + const char* b = "TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT"; + int d = countMismatches(a, b, 66); + if (d != 66) { + fprintf(stderr, "FAIL: countMismatches long-alldiff got %d\n", d); + pass = false; + } + } + // Length 0 + { + int d = countMismatches("A", "T", 0); + if (d != 0) { + fprintf(stderr, "FAIL: countMismatches len=0 got %d\n", d); + pass = false; + } + } + + return pass; +} + +} // namespace fastplong_simd + +#endif // HWY_ONCE diff --git a/src/simd.h b/src/simd.h new file mode 100644 index 0000000..863fcf1 --- /dev/null +++ b/src/simd.h @@ -0,0 +1,34 @@ +#ifndef FASTPLONG_SIMD_H +#define FASTPLONG_SIMD_H + +#include + +namespace fastplong_simd { + +// Count quality metrics for a read in one pass. +// qualstr/seqstr: quality and sequence strings of length len. +// qualThreshold: phred+33 encoded quality threshold for "low quality". +// Outputs: lowQualNum, nBaseNum, totalQual (sum of qual-33 values). +void countQualityMetrics(const char* qualstr, const char* seqstr, int len, + char qualThreshold, int& lowQualNum, int& nBaseNum, + int& totalQual); + +// Reverse complement a DNA sequence. +// src: input sequence of length len. +// dst: output buffer of at least len bytes (must NOT alias src). +void reverseComplement(const char* src, char* dst, int len); + +// Count adjacent-base differences for low complexity filter. +// Returns the number of positions where data[i] != data[i+1]. +int countAdjacentDiffs(const char* data, int len); + +// Count mismatches between two byte strings. +// Returns the number of positions where a[i] != b[i], up to len bytes. +int countMismatches(const char* a, const char* b, int len); + +// Run all SIMD unit tests. Returns true if all pass. +bool testSimd(); + +} // namespace fastplong_simd + +#endif // FASTPLONG_SIMD_H diff --git a/src/simdutil.h b/src/simdutil.h deleted file mode 100644 index 171fbdf..0000000 --- a/src/simdutil.h +++ /dev/null @@ -1,38 +0,0 @@ -#pragma once -#include -#include "hwy/highway.h" - -HWY_BEFORE_NAMESPACE(); -namespace hwy { -namespace HWY_NAMESPACE { - -template > -void Transform1Reversed(D d, T* HWY_RESTRICT inout, size_t count, - const T* HWY_RESTRICT in1, const Func& func) { - const size_t N = hwy::HWY_NAMESPACE::Lanes(d); - - size_t idx = 0; - if (count >= N) { - for (; idx <= count - N; idx += N) { - const Vec v = LoadU(d, inout + idx); - const Vec v1 = LoadU(d, in1 + idx); - StoreU(Reverse(d, func(d, v, v1)), d, inout + count - N - idx); - } - } - - // `count` was a multiple of the vector length `N`: already done. - if (HWY_UNLIKELY(idx == count)) return; - - const size_t remaining = count - idx; - HWY_DASSERT(0 != remaining && remaining < N); - const Vec v = LoadN(d, inout + idx, remaining); - const Vec v1 = LoadN(d, in1 + idx, remaining); - StoreN( - SlideDownLanes( - d, Reverse(d, func(d, v, v1)), N - remaining), - d, inout, remaining); -} - -} -} -HWY_AFTER_NAMESPACE(); diff --git a/test/simd_test.cpp b/test/simd_test.cpp new file mode 100644 index 0000000..04625a9 --- /dev/null +++ b/test/simd_test.cpp @@ -0,0 +1,6 @@ +#include +#include "../src/simd.h" + +TEST(SimdTest, allTests) { + EXPECT_TRUE(fastplong_simd::testSimd()); +} diff --git a/testdata/gen_ont_reads.py b/testdata/gen_ont_reads.py new file mode 100644 index 0000000..5796cc5 --- /dev/null +++ b/testdata/gen_ont_reads.py @@ -0,0 +1,153 @@ +#!/usr/bin/env python3 +"""Generate dummy ONT (Oxford Nanopore) FASTQ reads for benchmarking. + +Produces 10,000 reads with realistic ONT characteristics: +- Read lengths: log-normal distribution, median ~5kb, range ~500bp-50kb +- Quality scores: mean ~Q10-Q15 (typical ONT), with some low-quality regions +- Random ATCG sequence with occasional N bases +- Adapter sequences at start/end of some reads +""" + +import random +import gzip +import math +import os + +SEED = 42 +NUM_READS = 10_000 +OUTPUT_DIR = os.path.dirname(os.path.abspath(__file__)) + +# ONT adapter sequences (real ones from Oxford Nanopore) +START_ADAPTER = "AATGTACTTCGTTCAGTTACGTATTGCT" +END_ADAPTER = "GCAATACGTAACTGAACGAAGT" + +BASES = "ACGT" + + +def random_quality(length: int) -> str: + """Generate ONT-like quality string (phred+33). + + ONT typical range: Q5-Q20, with occasional dips. + """ + quals = [] + # Start with a baseline quality that varies per read + base_qual = random.gauss(12, 3) # mean Q12 + + for i in range(length): + # Occasional low-quality regions (simulating systematic errors) + if random.random() < 0.02: + q = max(0, min(40, int(random.gauss(5, 2)))) + else: + q = max(0, min(40, int(random.gauss(base_qual, 4)))) + quals.append(chr(q + 33)) + return "".join(quals) + + +def random_sequence(length: int) -> str: + """Generate random DNA sequence with occasional N bases.""" + seq = [] + for _ in range(length): + if random.random() < 0.005: # 0.5% N rate + seq.append("N") + else: + seq.append(random.choice(BASES)) + return "".join(seq) + + +def generate_read_length() -> int: + """Log-normal distribution mimicking ONT read lengths.""" + # median ~5000, with long tail up to ~50000 + length = int(random.lognormvariate(math.log(5000), 0.7)) + return max(200, min(100000, length)) + + +def write_fastq(filepath: str, reads: list, compress: bool = False): + """Write reads to FASTQ file, optionally gzipped.""" + opener = gzip.open if compress else open + mode = "wt" if compress else "w" + + with opener(filepath, mode) as f: + for read_id, seq, qual in reads: + f.write(f"@{read_id}\n") + f.write(f"{seq}\n") + f.write("+\n") + f.write(f"{qual}\n") + + +def main(): + random.seed(SEED) + + reads = [] + total_bases = 0 + + for i in range(NUM_READS): + length = generate_read_length() + + # Some reads have adapters (30% start, 30% end, 10% both) + seq = "" + r = random.random() + if r < 0.10: + # Both adapters + core_len = max(100, length - len(START_ADAPTER) - len(END_ADAPTER)) + seq = START_ADAPTER + random_sequence(core_len) + END_ADAPTER + elif r < 0.40: + # Start adapter only + core_len = max(100, length - len(START_ADAPTER)) + seq = START_ADAPTER + random_sequence(core_len) + elif r < 0.70: + # End adapter only + core_len = max(100, length - len(END_ADAPTER)) + seq = random_sequence(core_len) + END_ADAPTER + else: + seq = random_sequence(length) + + actual_len = len(seq) + qual = random_quality(actual_len) + + read_id = ( + f"ont_read_{i:06d} " + f"runid=dummy_run_001 " + f"read={i} " + f"ch={random.randint(1, 512)} " + f"start_time=2024-01-01T00:00:{i % 60:02d}Z " + f"flow_cell_id=FAK00001 " + f"protocol_group_id=benchmark " + f"sample_id=dummy_ont" + ) + + reads.append((read_id, seq, qual)) + total_bases += actual_len + + # Write uncompressed + fq_path = os.path.join(OUTPUT_DIR, "ont_10k.fastq") + write_fastq(fq_path, reads, compress=False) + + # Write gzipped + gz_path = os.path.join(OUTPUT_DIR, "ont_10k.fastq.gz") + write_fastq(gz_path, reads, compress=True) + + # Stats + lengths = [len(seq) for _, seq, _ in reads] + lengths.sort() + print(f"Generated {NUM_READS} ONT reads") + print(f"Total bases: {total_bases:,}") + print(f"Mean length: {total_bases // NUM_READS:,}") + print(f"Median length: {lengths[len(lengths)//2]:,}") + print(f"Min length: {lengths[0]:,}") + print(f"Max length: {lengths[-1]:,}") + print(f"N50: {calculate_n50(lengths, total_bases):,}") + print(f"Written: {fq_path} ({os.path.getsize(fq_path) / 1024 / 1024:.1f} MB)") + print(f"Written: {gz_path} ({os.path.getsize(gz_path) / 1024 / 1024:.1f} MB)") + + +def calculate_n50(sorted_lengths: list, total_bases: int) -> int: + cumsum = 0 + for l in reversed(sorted_lengths): + cumsum += l + if cumsum >= total_bases / 2: + return l + return sorted_lengths[-1] + + +if __name__ == "__main__": + main() diff --git a/third_party/googletest b/third_party/googletest new file mode 160000 index 0000000..52eb810 --- /dev/null +++ b/third_party/googletest @@ -0,0 +1 @@ +Subproject commit 52eb8108c5bdec04579160ae17225d66034bd723 diff --git a/third_party/highway b/third_party/highway new file mode 160000 index 0000000..ac0d5d2 --- /dev/null +++ b/third_party/highway @@ -0,0 +1 @@ +Subproject commit ac0d5d297b13ab1b89f48484fc7911082d76a93f diff --git a/third_party/isa-l b/third_party/isa-l new file mode 160000 index 0000000..c3d0760 --- /dev/null +++ b/third_party/isa-l @@ -0,0 +1 @@ +Subproject commit c3d076027f0a7377b78472876a930af98c4eded6 diff --git a/third_party/libdeflate b/third_party/libdeflate new file mode 160000 index 0000000..22f6e02 --- /dev/null +++ b/third_party/libdeflate @@ -0,0 +1 @@ +Subproject commit 22f6e0235ff5651c7300dc120a7ee0d9f0ba3426