diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index e34223f..e7e0fec 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -10,27 +10,23 @@ jobs:
       matrix:
         os:
           - ubuntu-24.04
-          - macos-13
-          - macos-14
+          - macos-15
     runs-on: ${{ matrix.os }}
     steps:
       - name: checkout scm
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
+        with:
+          submodules: recursive
 
       - name: install build dependencies (Ubuntu)
-        run: sudo apt update && sudo apt install -y build-essential libgtest-dev libhwy-dev libisal-dev libdeflate-dev
+        run: sudo apt update && sudo apt install -y build-essential cmake nasm
         if: runner.os == 'Linux'
 
-      - name: install build dependencies (MacOS)
-        run: brew install highway googletest isa-l libdeflate
-        if: runner.os == 'macOS'
-
       - name: make fastplong
         run: make -j
 
       - name: make test
-        run: make -j test
+        run: make test
 
       - name: test
         run: chmod a+x ./fastplong && ./fastplong --version
-
diff --git a/.gitignore b/.gitignore
index a14e5af..3530bdb 100644
--- a/.gitignore
+++ b/.gitignore
@@ -31,3 +31,12 @@
 *.out
 *.app
 bin
+fastplong
+
+# Test data (generated)
+testdata/*.fastq
+testdata/*.fastq.gz
+
+# Reports
+fastplong.html
+fastplong.json
diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000..5187798
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,12 @@
+[submodule "third_party/highway"]
+	path = third_party/highway
+	url = https://github.com/google/highway.git
+[submodule "third_party/isa-l"]
+	path = third_party/isa-l
+	url = https://github.com/intel/isa-l.git
+[submodule "third_party/libdeflate"]
+	path = third_party/libdeflate
+	url = https://github.com/ebiggers/libdeflate.git
+[submodule "third_party/googletest"]
+	path = third_party/googletest
+	url = https://github.com/google/googletest.git
diff --git a/Makefile b/Makefile
index 05377a7..643debf 100644
--- a/Makefile
+++ b/Makefile
@@ -2,63 +2,114 @@ DIR_INC := ./inc
 DIR_SRC := ./src
 DIR_OBJ := ./obj
 DIR_TEST := ./test
+DIR_HWY := ./third_party/highway
+DIR_ISAL := ./third_party/isa-l
+DIR_LIBDEFLATE := ./third_party/libdeflate
+DIR_GTEST := ./third_party/googletest
 
 PREFIX ?= /usr/local
 BINDIR ?= $(PREFIX)/bin
-INCLUDE_DIRS ?= /opt/homebrew/include
-LIBRARY_DIRS ?= /opt/homebrew/lib
 
 SRC := $(wildcard ${DIR_SRC}/*.cpp)
 TEST := $(wildcard ${DIR_TEST}/*.cpp)
 OBJ := $(patsubst %.cpp,${DIR_OBJ}/%.o,$(notdir ${SRC}))
 TEST_OBJ := $(patsubst %.cpp,${DIR_OBJ}/%.o,$(notdir ${TEST}))
 
+# Highway runtime dispatch support
+HWY_OBJS := ${DIR_OBJ}/hwy_targets.o ${DIR_OBJ}/hwy_abort.o
+OBJ += $(HWY_OBJS)
+
 TARGET := fastplong
 
 BIN_TARGET := ${TARGET}
 TEST_TARGET := bin/fastplong_unittest
 
 CXX ?= g++
-CXXFLAGS := -std=c++14 -pthread -g -O3 -MP -MD -I${DIR_INC} -I${DIR_SRC} $(foreach includedir,$(INCLUDE_DIRS),-I$(includedir)) ${CXXFLAGS}
-LIBS := -lisal -ldeflate -lpthread -lhwy
-STATIC_FLAGS := -static -L. -Wl,--no-as-needed -pthread
-LD_FLAGS := $(foreach librarydir,$(LIBRARY_DIRS),-L$(librarydir)) $(LIBS) $(LD_FLAGS)
-STATIC_LD_FLAGS := $(foreach librarydir,$(LIBRARY_DIRS),-L$(librarydir)) $(STATIC_FLAGS) $(LIBS) $(STATIC_LD_FLAGS)
+CXXFLAGS := -std=c++17 -pthread -g -O3 -MD -MP \
+	-I. -I${DIR_INC} -I${DIR_SRC} -I${DIR_HWY} \
+	-I${DIR_ISAL}/include -I${DIR_LIBDEFLATE} \
+	${CXXFLAGS}
+
+# Static libraries built from submodules
+ISAL_LIB := $(DIR_ISAL)/bin/isa-l.a
+LIBDEFLATE_LIB := $(DIR_LIBDEFLATE)/build/libdeflate.a
+GTEST_LIB := $(DIR_GTEST)/build/lib/libgtest.a
+GTEST_MAIN_LIB := $(DIR_GTEST)/build/lib/libgtest_main.a
+
+# On Linux: fully static binary; on macOS: static libs, dynamic system runtime
+UNAME_S := $(shell uname -s)
+UNAME_M := $(shell uname -m)
+ifeq ($(UNAME_S),Linux)
+  LD_FLAGS := -static -Wl,--no-as-needed -lpthread
+else
+  LD_FLAGS := -lpthread
+endif
+
+# isa-l: macOS reports arm64 but isa-l expects aarch64 for NEON assembly
+ISAL_MAKE_ARGS :=
+ifeq ($(UNAME_M),arm64)
+  ISAL_MAKE_ARGS := host_cpu=aarch64 arch=aarch64
+endif
 
+# Default target: build deps from submodules and link statically
+${BIN_TARGET}: $(ISAL_LIB) $(LIBDEFLATE_LIB) ${OBJ}
+	$(CXX) $(OBJ) -o $@ $(ISAL_LIB) $(LIBDEFLATE_LIB) $(LD_FLAGS)
 
-${BIN_TARGET}:${OBJ}
-	$(CXX) $(OBJ) -o $@ $(LD_FLAGS)
+# Build isa-l static library from submodule
+$(ISAL_LIB):
+	$(MAKE) -C $(DIR_ISAL) -f Makefile.unx lib $(ISAL_MAKE_ARGS)
 
-static:${OBJ}
-	$(CXX) $(OBJ) -o ${BIN_TARGET} $(STATIC_LD_FLAGS)
+# Build libdeflate static library from submodule
+$(LIBDEFLATE_LIB):
+	cd $(DIR_LIBDEFLATE) && cmake -B build \
+		-DCMAKE_BUILD_TYPE=Release \
+		-DLIBDEFLATE_BUILD_SHARED_LIB=OFF \
+		-DLIBDEFLATE_BUILD_GZIP=OFF && \
+		cmake --build build
+
+# Build googletest static library from submodule
+$(GTEST_LIB) $(GTEST_MAIN_LIB):
+	cd $(DIR_GTEST) && cmake -B build \
+		-DCMAKE_BUILD_TYPE=Release \
+		-DBUILD_GMOCK=OFF && \
+		cmake --build build
 
 ${DIR_OBJ}/%.o:${DIR_SRC}/%.cpp
-	@mkdir -p $(@D) 
+	@mkdir -p $(@D)
+	$(CXX) -c $< -o $@ $(CXXFLAGS)
+
+# Highway source files for runtime CPU detection and error handling
+${DIR_OBJ}/hwy_targets.o:${DIR_HWY}/hwy/targets.cc
+	@mkdir -p $(@D)
 	$(CXX) -c $< -o $@ $(CXXFLAGS)
 
-.PHONY:clean
-.PHONY:static
+${DIR_OBJ}/hwy_abort.o:${DIR_HWY}/hwy/abort.cc
+	@mkdir -p $(@D)
+	$(CXX) -c $< -o $@ $(CXXFLAGS)
+
+.PHONY: clean install clean-deps test
+
 clean:
 	@rm -rf $(DIR_OBJ)
 	@rm -f $(TARGET)
 	@rm -f $(TEST_TARGET)
 
+clean-deps:
+	-$(MAKE) -C $(DIR_ISAL) -f Makefile.unx clean 2>/dev/null || true
+	-rm -rf $(DIR_LIBDEFLATE)/build 2>/dev/null || true
+	-rm -rf $(DIR_GTEST)/build 2>/dev/null || true
+
 install:
 	install $(TARGET) $(BINDIR)/$(TARGET)
 	@echo "Installed."
 
 ${DIR_OBJ}/%.o:${DIR_TEST}/%.cpp
-	@mkdir -p $(@D) 
-	$(CXX) -c $< -o $@ $(CXXFLAGS)
-
-test-static: ${TEST_OBJ} ${OBJ}
-	@mkdir -p bin
-	$(CXX) $(TEST_OBJ) ${OBJ:./obj/main.o=} -o ${TEST_TARGET} $(STATIC_LD_FLAGS) -lgtest -lgtest_main
-	./${TEST_TARGET}
+	@mkdir -p $(@D)
+	$(CXX) -c $< -o $@ $(CXXFLAGS) -I${DIR_GTEST}/googletest/include
 
-test:${TEST_OBJ} ${OBJ}
+test: $(ISAL_LIB) $(LIBDEFLATE_LIB) $(GTEST_LIB) ${TEST_OBJ} ${OBJ}
 	@mkdir -p bin
-	$(CXX) $(TEST_OBJ) ${OBJ:./obj/main.o=} -o ${TEST_TARGET} $(LD_FLAGS) -lgtest -lgtest_main
+	$(CXX) $(TEST_OBJ) ${OBJ:./obj/main.o=} -o ${TEST_TARGET} $(ISAL_LIB) $(LIBDEFLATE_LIB) $(GTEST_LIB) $(GTEST_MAIN_LIB) $(LD_FLAGS)
 	./${TEST_TARGET}
 
 -include $(OBJ:.o=.d)
diff --git a/README.md b/README.md
index 2b7ff65..5f1fcaf 100644
--- a/README.md
+++ b/README.md
@@ -67,30 +67,21 @@ mv fastplong.0.2.2 fastplong
 chmod a+x ./fastplong
 ```
 ## or compile from source
-`fastplong` depends on `libdeflate` and `isa-l` for fast decompression and compression of zipped data, and depends on `libhwy` for SIMD acceleration. It's recommended to install all of them via Anaconda:
-```
-conda install conda-forge::libdeflate
-conda install conda-forge::isa-l
-conda install conda-forge::libhwy
-```
-You can also try to install them with other package management systems like `apt/yum` on Linux, or `brew` on MacOS. Otherwise you can compile them from source (https://github.com/intel/isa-l, https://github.com/ebiggers/libdeflate, and https://github.com/google/highway)
-
-### download and build fastplong
+`fastplong` depends on `isa-l` and `libdeflate`, which are included as git submodules and built statically. Requires `cmake` and a C/C++ compiler.
 ```shell
-# get source (you can also use browser to download from master or releases)
-git clone https://github.com/OpenGene/fastplong.git
-
-# build
+git clone --recursive https://github.com/OpenGene/fastplong.git
 cd fastplong
 make -j
 
-# test
-make test
-
 # Install
 sudo make install
 ```
 
+If you already cloned without `--recursive`, initialize the submodules first:
+```shell
+git submodule update --init --recursive
+```
+
 # input and output
 Specify input by `-i` or `--in`, and specify output by `-o` or `--out`.
 * if you don't specify the output file names, no output files will be written, but the QC will still be done for both data before and after filtering.
diff --git a/docs/plans/2026-02-26-simd-highway-refactor-design.md b/docs/plans/2026-02-26-simd-highway-refactor-design.md
new file mode 100644
index 0000000..47b8e4c
--- /dev/null
+++ b/docs/plans/2026-02-26-simd-highway-refactor-design.md
@@ -0,0 +1,76 @@
+# SIMD Highway Refactor Design
+
+## Goal
+
+Centralize all SIMD code into a single `simd.h`/`simd.cpp` pair using Google Highway's `foreach_target.h` multi-target compilation + `HWY_DYNAMIC_DISPATCH` runtime dispatch pattern. Port proven implementations from fastp and accelerate remaining scalar hot loops.
+
+## Current State
+
+- fastplong links `-lhwy` as system library
+- SIMD exists in `adaptertrimmer.cpp` (mismatch counting) and `sequence.cpp` (reverse complement) but without multi-target dispatch
+- `simdutil.h` has a custom `Transform1Reversed` template
+- `filter.cpp` hot loops (`passFilter`, `passLowComplexityFilter`) are pure scalar
+
+## Changes
+
+### New Files
+
+- `third_party/highway/` — Highway source (header-only + `targets.cc` / `abort.cc`)
+- `src/simd.h` — Public SIMD API
+- `src/simd.cpp` — Multi-target implementation with `foreach_target.h`
+
+### Modified Files
+
+- `Makefile` — Add `DIR_HWY`, compile `hwy_targets.o` / `hwy_abort.o`, remove `-lhwy`, add `-I${DIR_HWY}`
+- `src/filter.cpp` — Replace scalar loops with `countQualityMetrics` and `countAdjacentDiffs`
+- `src/sequence.cpp` — Replace inline Highway code with `reverseComplement` call
+- `src/adaptertrimmer.cpp` — Replace inner SIMD loop with `countMismatches` call
+
+### Deleted Files
+
+- `src/simdutil.h` — Functionality merged into `simd.cpp`
+
+## API
+
+```cpp
+namespace fastplong_simd {
+    void countQualityMetrics(const char* qualstr, const char* seqstr, int len,
+                             char qualThreshold, int& lowQualNum, int& nBaseNum,
+                             int& totalQual);
+    void reverseComplement(const char* src, char* dst, int len);
+    int countAdjacentDiffs(const char* data, int len);
+    int countMismatches(const char* a, const char* b, int len);
+    bool testSimd();
+}
+```
+
+## simd.cpp Structure
+
+```
+#define HWY_TARGET_INCLUDE "src/simd.cpp"
+#include "hwy/foreach_target.h"
+
+namespace fastplong_simd::HWY_NAMESPACE {
+    CountQualityMetricsImpl()   — vectorized quality stats (SumsOf2 overflow-safe)
+    ReverseComplementImpl()     — vectorized complement + reverse
+    CountAdjacentDiffsImpl()    — vectorized adjacent comparison
+    CountMismatchesImpl()       — vectorized byte comparison
+}
+
+#if HWY_ONCE
+    HWY_EXPORT(...)             — register all target versions
+    dispatch wrappers           — HWY_DYNAMIC_DISPATCH
+    scalar references           — for test validation
+    testSimd()                  — unit tests
+#endif
+```
+
+## Key Technical Details
+
+- Block-based accumulation in `countQualityMetrics` (255*N elements per block) prevents u16 overflow for long reads (10k-100k+ bp)
+- `SumsOf2` for u8->u16 promotion works on all targets including HWY_SCALAR
+- `searchAdapter` outer logic (branching, edit_distance) stays in `adaptertrimmer.cpp`; only inner mismatch loop delegates to `countMismatches`
+
+## Reference
+
+Based on fastp's `src/simd.h` / `src/simd.cpp` implementation pattern.
diff --git a/docs/plans/2026-02-26-simd-highway-refactor.md b/docs/plans/2026-02-26-simd-highway-refactor.md
new file mode 100644
index 0000000..47f40ba
--- /dev/null
+++ b/docs/plans/2026-02-26-simd-highway-refactor.md
@@ -0,0 +1,896 @@
+# SIMD Highway Refactor Implementation Plan
+
+> **For Claude:** REQUIRED SUB-SKILL: Use superpowers:executing-plans to implement this plan task-by-task.
+
+**Goal:** Centralize all SIMD code into `simd.h`/`simd.cpp` using Highway's `foreach_target.h` multi-target dispatch, port fastp's proven implementations, and accelerate remaining scalar hot loops.
+
+**Architecture:** Single `simd.cpp` compiled via `foreach_target.h` produces code for all CPU targets (SSE2, AVX2, NEON, scalar). Runtime dispatch via `HWY_DYNAMIC_DISPATCH` selects the fastest version. Highway is vendored as `third_party/highway` git submodule.
+
+**Tech Stack:** C++14, Google Highway 1.3, gtest
+
+---
+
+### Task 1: Add Highway as git submodule
+
+**Files:**
+- Create: `third_party/highway/` (git submodule)
+- Modify: `.gitmodules`
+
+**Step 1: Add the submodule**
+
+```bash
+cd /Users/kimy/workspace/fastplong
+mkdir -p third_party
+git submodule add https://github.com/google/highway.git third_party/highway
+cd third_party/highway
+git checkout ac0d5d29
+cd ../..
+```
+
+**Step 2: Verify the submodule**
+
+Run: `ls third_party/highway/hwy/highway.h`
+Expected: File exists
+
+Run: `grep HWY_MAJOR third_party/highway/hwy/base.h | head -2`
+Expected: `#define HWY_MAJOR 1` and `#define HWY_MINOR 3`
+
+**Step 3: Commit**
+
+```bash
+git add .gitmodules third_party/highway
+git commit -m "feat: add Highway SIMD library as git submodule (v1.3)"
+```
+
+---
+
+### Task 2: Update Makefile for Highway
+
+**Files:**
+- Modify: `Makefile`
+
+**Step 1: Update the Makefile**
+
+Replace the entire Makefile with:
+
+```makefile
+DIR_INC := ./inc
+DIR_SRC := ./src
+DIR_OBJ := ./obj
+DIR_TEST := ./test
+DIR_HWY := ./third_party/highway
+
+PREFIX ?= /usr/local
+BINDIR ?= $(PREFIX)/bin
+INCLUDE_DIRS ?= /opt/homebrew/include
+LIBRARY_DIRS ?= /opt/homebrew/lib
+
+SRC := $(wildcard ${DIR_SRC}/*.cpp)
+TEST := $(wildcard ${DIR_TEST}/*.cpp)
+OBJ := $(patsubst %.cpp,${DIR_OBJ}/%.o,$(notdir ${SRC}))
+TEST_OBJ := $(patsubst %.cpp,${DIR_OBJ}/%.o,$(notdir ${TEST}))
+
+# Highway runtime dispatch support
+HWY_OBJS := ${DIR_OBJ}/hwy_targets.o ${DIR_OBJ}/hwy_abort.o
+OBJ += $(HWY_OBJS)
+
+TARGET := fastplong
+
+BIN_TARGET := ${TARGET}
+TEST_TARGET := bin/fastplong_unittest
+
+CXX ?= g++
+CXXFLAGS := -std=c++14 -pthread -g -O3 -MP -MD -I. -I${DIR_INC} -I${DIR_SRC} -I${DIR_HWY} $(foreach includedir,$(INCLUDE_DIRS),-I$(includedir)) ${CXXFLAGS}
+LIBS := -lisal -ldeflate -lpthread
+STATIC_FLAGS := -static -L. -Wl,--no-as-needed -pthread
+LD_FLAGS := $(foreach librarydir,$(LIBRARY_DIRS),-L$(librarydir)) $(LIBS) $(LD_FLAGS)
+STATIC_LD_FLAGS := $(foreach librarydir,$(LIBRARY_DIRS),-L$(librarydir)) $(STATIC_FLAGS) $(LIBS) $(STATIC_LD_FLAGS)
+
+
+${BIN_TARGET}:${OBJ}
+	$(CXX) $(OBJ) -o $@ $(LD_FLAGS)
+
+static:${OBJ}
+	$(CXX) $(OBJ) -o ${BIN_TARGET} $(STATIC_LD_FLAGS)
+
+${DIR_OBJ}/%.o:${DIR_SRC}/%.cpp
+	@mkdir -p $(@D)
+	$(CXX) -c $< -o $@ $(CXXFLAGS)
+
+# Highway source files for runtime CPU detection and error handling
+${DIR_OBJ}/hwy_targets.o:${DIR_HWY}/hwy/targets.cc
+	@mkdir -p $(@D)
+	$(CXX) -c $< -o $@ $(CXXFLAGS)
+
+${DIR_OBJ}/hwy_abort.o:${DIR_HWY}/hwy/abort.cc
+	@mkdir -p $(@D)
+	$(CXX) -c $< -o $@ $(CXXFLAGS)
+
+.PHONY:clean
+.PHONY:static
+clean:
+	@rm -rf $(DIR_OBJ)
+	@rm -f $(TARGET)
+	@rm -f $(TEST_TARGET)
+
+install:
+	install $(TARGET) $(BINDIR)/$(TARGET)
+	@echo "Installed."
+
+${DIR_OBJ}/%.o:${DIR_TEST}/%.cpp
+	@mkdir -p $(@D)
+	$(CXX) -c $< -o $@ $(CXXFLAGS)
+
+test-static: ${TEST_OBJ} ${OBJ}
+	@mkdir -p bin
+	$(CXX) $(TEST_OBJ) ${OBJ:./obj/main.o=} -o ${TEST_TARGET} $(STATIC_LD_FLAGS) -lgtest -lgtest_main
+	./${TEST_TARGET}
+
+test:${TEST_OBJ} ${OBJ}
+	@mkdir -p bin
+	$(CXX) $(TEST_OBJ) ${OBJ:./obj/main.o=} -o ${TEST_TARGET} $(LD_FLAGS) -lgtest -lgtest_main
+	./${TEST_TARGET}
+
+-include $(OBJ:.o=.d)
+```
+
+Key changes from original:
+- Added `DIR_HWY := ./third_party/highway`
+- Added `-I. -I${DIR_HWY}` to CXXFLAGS
+- Removed `-lhwy` from LIBS
+- Added `HWY_OBJS` for `hwy_targets.o` and `hwy_abort.o`
+- Added build rules for Highway source files
+
+**Step 2: Verify it compiles (will fail at link due to simd changes not yet done, but object files should build)**
+
+Run: `make clean && make -j 2>&1 | tail -5`
+Expected: Compilation succeeds (may have link errors due to missing `-lhwy` until we finish simd.cpp, that's OK)
+
+**Step 3: Commit**
+
+```bash
+git add Makefile
+git commit -m "build: switch Highway from system library to vendored third_party submodule"
+```
+
+---
+
+### Task 3: Create simd.h and simd.cpp
+
+**Files:**
+- Create: `src/simd.h`
+- Create: `src/simd.cpp`
+
+**Step 1: Create `src/simd.h`**
+
+```cpp
+#ifndef FASTPLONG_SIMD_H
+#define FASTPLONG_SIMD_H
+
+#include <cstddef>
+
+namespace fastplong_simd {
+
+// Count quality metrics for a read in one pass.
+// qualstr/seqstr: quality and sequence strings of length len.
+// qualThreshold: phred+33 encoded quality threshold for "low quality".
+// Outputs: lowQualNum, nBaseNum, totalQual (sum of qual-33 values).
+void countQualityMetrics(const char* qualstr, const char* seqstr, int len,
+                         char qualThreshold, int& lowQualNum, int& nBaseNum,
+                         int& totalQual);
+
+// Reverse complement a DNA sequence.
+// src: input sequence of length len.
+// dst: output buffer of at least len bytes (must NOT alias src).
+void reverseComplement(const char* src, char* dst, int len);
+
+// Count adjacent-base differences for low complexity filter.
+// Returns the number of positions where data[i] != data[i+1].
+int countAdjacentDiffs(const char* data, int len);
+
+// Count mismatches between two byte strings.
+// Returns the number of positions where a[i] != b[i], up to len bytes.
+int countMismatches(const char* a, const char* b, int len);
+
+// Run all SIMD unit tests. Returns true if all pass.
+bool testSimd();
+
+}  // namespace fastplong_simd
+
+#endif  // FASTPLONG_SIMD_H
+```
+
+**Step 2: Create `src/simd.cpp`**
+
+```cpp
+// SIMD-accelerated functions for fastplong using Google Highway.
+// This file is compiled once per SIMD target via foreach_target.h.
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "src/simd.cpp"
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
+#include "hwy/highway.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace fastplong_simd {
+namespace HWY_NAMESPACE {
+
+namespace hn = hwy::HWY_NAMESPACE;
+
+void CountQualityMetricsImpl(const char* qualstr, const char* seqstr, int len,
+                             char qualThreshold, int& lowQualNum, int& nBaseNum,
+                             int& totalQual) {
+    const hn::ScalableTag<uint8_t> d;
+    const int N = hn::Lanes(d);
+
+    const auto vThresh = hn::Set(d, static_cast<uint8_t>(qualThreshold));
+    const auto vN = hn::Set(d, static_cast<uint8_t>('N'));
+    const auto v33 = hn::Set(d, static_cast<uint8_t>(33));
+
+    int lowQual = 0;
+    int nBase = 0;
+    int qualSum = 0;
+    int i = 0;
+
+    // Process in blocks of up to 255 vectors to avoid u16 overflow during
+    // quality sum accumulation (255 * 255 = 65025 fits in u16).
+    const int blockSize = 255 * N;
+
+    for (int blockStart = 0; blockStart < len; blockStart += blockSize) {
+        int blockEnd = blockStart + blockSize;
+        if (blockEnd > len) blockEnd = len;
+
+        const hn::ScalableTag<uint16_t> d16;
+        auto vQualSum16 = hn::Zero(d16);
+
+        for (i = blockStart; i + N <= blockEnd; i += N) {
+            const auto vQual = hn::LoadU(d, reinterpret_cast<const uint8_t*>(qualstr + i));
+            const auto vSeq = hn::LoadU(d, reinterpret_cast<const uint8_t*>(seqstr + i));
+
+            const auto maskLowQ = hn::Lt(vQual, vThresh);
+            lowQual += hn::CountTrue(d, maskLowQ);
+
+            const auto maskN = hn::Eq(vSeq, vN);
+            nBase += hn::CountTrue(d, maskN);
+
+            const auto vQualAdj = hn::Sub(vQual, v33);
+            vQualSum16 = hn::Add(vQualSum16, hn::SumsOf2(vQualAdj));
+        }
+
+        const hn::ScalableTag<uint32_t> d32;
+        qualSum += static_cast<int>(
+            hn::ReduceSum(d32, hn::SumsOf2(vQualSum16)));
+    }
+
+    // Scalar tail
+    for (; i < len; i++) {
+        uint8_t qual = static_cast<uint8_t>(qualstr[i]);
+        qualSum += qual - 33;
+        if (qual < static_cast<uint8_t>(qualThreshold)) lowQual++;
+        if (seqstr[i] == 'N') nBase++;
+    }
+
+    lowQualNum = lowQual;
+    nBaseNum = nBase;
+    totalQual = qualSum;
+}
+
+void ReverseComplementImpl(const char* src, char* dst, int len) {
+    const hn::ScalableTag<uint8_t> d;
+    const int N = hn::Lanes(d);
+
+    const auto vA = hn::Set(d, static_cast<uint8_t>('A'));
+    const auto vT = hn::Set(d, static_cast<uint8_t>('T'));
+    const auto vC = hn::Set(d, static_cast<uint8_t>('C'));
+    const auto vG = hn::Set(d, static_cast<uint8_t>('G'));
+    const auto va = hn::Set(d, static_cast<uint8_t>('a'));
+    const auto vt = hn::Set(d, static_cast<uint8_t>('t'));
+    const auto vc = hn::Set(d, static_cast<uint8_t>('c'));
+    const auto vg = hn::Set(d, static_cast<uint8_t>('g'));
+    const auto vNch = hn::Set(d, static_cast<uint8_t>('N'));
+
+    int i = 0;
+    for (; i + N <= len; i += N) {
+        const auto v = hn::LoadU(d, reinterpret_cast<const uint8_t*>(src + i));
+
+        auto comp = vNch;
+        comp = hn::IfThenElse(hn::Eq(v, vA), vT, comp);
+        comp = hn::IfThenElse(hn::Eq(v, vT), vA, comp);
+        comp = hn::IfThenElse(hn::Eq(v, vC), vG, comp);
+        comp = hn::IfThenElse(hn::Eq(v, vG), vC, comp);
+        comp = hn::IfThenElse(hn::Eq(v, va), vT, comp);
+        comp = hn::IfThenElse(hn::Eq(v, vt), vA, comp);
+        comp = hn::IfThenElse(hn::Eq(v, vc), vG, comp);
+        comp = hn::IfThenElse(hn::Eq(v, vg), vC, comp);
+
+        const auto revComp = hn::Reverse(d, comp);
+        hn::StoreU(revComp, d, reinterpret_cast<uint8_t*>(dst + len - i - N));
+    }
+
+    // Scalar tail
+    for (; i < len; i++) {
+        char base = src[i];
+        char comp;
+        switch (base) {
+            case 'A': case 'a': comp = 'T'; break;
+            case 'T': case 't': comp = 'A'; break;
+            case 'C': case 'c': comp = 'G'; break;
+            case 'G': case 'g': comp = 'C'; break;
+            default: comp = 'N'; break;
+        }
+        dst[len - 1 - i] = comp;
+    }
+}
+
+int CountAdjacentDiffsImpl(const char* data, int len) {
+    if (len <= 1) return 0;
+
+    const hn::ScalableTag<uint8_t> d;
+    const int N = hn::Lanes(d);
+
+    int diff = 0;
+    int i = 0;
+
+    for (; i + N < len; i += N) {
+        const auto v1 = hn::LoadU(d, reinterpret_cast<const uint8_t*>(data + i));
+        const auto v2 = hn::LoadU(d, reinterpret_cast<const uint8_t*>(data + i + 1));
+        const auto maskNe = hn::Ne(v1, v2);
+        diff += hn::CountTrue(d, maskNe);
+    }
+
+    // Scalar tail
+    for (; i < len - 1; i++) {
+        if (data[i] != data[i + 1]) diff++;
+    }
+
+    return diff;
+}
+
+int CountMismatchesImpl(const char* a, const char* b, int len) {
+    const hn::ScalableTag<uint8_t> d;
+    const int N = hn::Lanes(d);
+
+    int diff = 0;
+    int i = 0;
+
+    for (; i + N <= len; i += N) {
+        const auto va = hn::LoadU(d, reinterpret_cast<const uint8_t*>(a + i));
+        const auto vb = hn::LoadU(d, reinterpret_cast<const uint8_t*>(b + i));
+        const auto maskNe = hn::Ne(va, vb);
+        diff += hn::CountTrue(d, maskNe);
+    }
+
+    // Scalar tail
+    for (; i < len; i++) {
+        if (a[i] != b[i]) diff++;
+    }
+
+    return diff;
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace fastplong_simd
+HWY_AFTER_NAMESPACE();
+
+// ---- Dynamic dispatch wrappers (compiled once) ----
+#if HWY_ONCE
+
+#include <cstdio>
+#include <cstring>
+#include <cstdint>
+
+namespace fastplong_simd {
+
+HWY_EXPORT(CountQualityMetricsImpl);
+HWY_EXPORT(ReverseComplementImpl);
+HWY_EXPORT(CountAdjacentDiffsImpl);
+HWY_EXPORT(CountMismatchesImpl);
+
+void countQualityMetrics(const char* qualstr, const char* seqstr, int len,
+                         char qualThreshold, int& lowQualNum, int& nBaseNum,
+                         int& totalQual) {
+    HWY_DYNAMIC_DISPATCH(CountQualityMetricsImpl)(qualstr, seqstr, len,
+                                                   qualThreshold, lowQualNum,
+                                                   nBaseNum, totalQual);
+}
+
+void reverseComplement(const char* src, char* dst, int len) {
+    HWY_DYNAMIC_DISPATCH(ReverseComplementImpl)(src, dst, len);
+}
+
+int countAdjacentDiffs(const char* data, int len) {
+    return HWY_DYNAMIC_DISPATCH(CountAdjacentDiffsImpl)(data, len);
+}
+
+int countMismatches(const char* a, const char* b, int len) {
+    return HWY_DYNAMIC_DISPATCH(CountMismatchesImpl)(a, b, len);
+}
+
+// ---- Scalar reference implementations for testing ----
+
+static void scalarCountQualityMetrics(const char* qualstr, const char* seqstr,
+                                       int len, char qualThreshold,
+                                       int& lowQualNum, int& nBaseNum,
+                                       int& totalQual) {
+    lowQualNum = 0;
+    nBaseNum = 0;
+    totalQual = 0;
+    for (int i = 0; i < len; i++) {
+        uint8_t q = static_cast<uint8_t>(qualstr[i]);
+        totalQual += q - 33;
+        if (q < static_cast<uint8_t>(qualThreshold)) lowQualNum++;
+        if (seqstr[i] == 'N') nBase++;
+    }
+}
+
+static void scalarReverseComplement(const char* src, char* dst, int len) {
+    for (int i = 0; i < len; i++) {
+        char c;
+        switch (src[i]) {
+            case 'A': case 'a': c = 'T'; break;
+            case 'T': case 't': c = 'A'; break;
+            case 'C': case 'c': c = 'G'; break;
+            case 'G': case 'g': c = 'C'; break;
+            default: c = 'N'; break;
+        }
+        dst[len - 1 - i] = c;
+    }
+}
+
+static int scalarCountAdjacentDiffs(const char* data, int len) {
+    int diff = 0;
+    for (int i = 0; i < len - 1; i++) {
+        if (data[i] != data[i + 1]) diff++;
+    }
+    return diff;
+}
+
+static int scalarCountMismatches(const char* a, const char* b, int len) {
+    int diff = 0;
+    for (int i = 0; i < len; i++) {
+        if (a[i] != b[i]) diff++;
+    }
+    return diff;
+}
+
+bool testSimd() {
+    bool pass = true;
+
+    // --- reverseComplement ---
+    {
+        const char* in = "AAAATTTTCCCCGGGG";
+        int len = 16;
+        char out[16];
+        reverseComplement(in, out, len);
+        char ref[16];
+        scalarReverseComplement(in, ref, len);
+        if (memcmp(out, ref, len) != 0) {
+            fprintf(stderr, "FAIL: reverseComplement basic\n");
+            pass = false;
+        }
+    }
+    {
+        const char* in = "AaTtCcGgN";
+        int len = 9;
+        char out[9];
+        reverseComplement(in, out, len);
+        char ref[9];
+        scalarReverseComplement(in, ref, len);
+        if (memcmp(out, ref, len) != 0) {
+            fprintf(stderr, "FAIL: reverseComplement mixed case\n");
+            pass = false;
+        }
+    }
+    {
+        char out[1] = {'X'};
+        reverseComplement("", out, 0);
+    }
+    {
+        char out[1];
+        reverseComplement("A", out, 1);
+        if (out[0] != 'T') {
+            fprintf(stderr, "FAIL: reverseComplement len=1\n");
+            pass = false;
+        }
+    }
+    {
+        const char* in = "ATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCG";
+        int len = 68;
+        char out[68], ref[68];
+        reverseComplement(in, out, len);
+        scalarReverseComplement(in, ref, len);
+        if (memcmp(out, ref, len) != 0) {
+            fprintf(stderr, "FAIL: reverseComplement long string\n");
+            pass = false;
+        }
+    }
+
+    // --- countQualityMetrics ---
+    {
+        const char* qual = "IIIII";
+        const char* seq = "ACGTN";
+        int lowQ = 0, nBase = 0, totalQ = 0;
+        countQualityMetrics(qual, seq, 5, '5', lowQ, nBase, totalQ);
+        int refLowQ = 0, refNBase = 0, refTotalQ = 0;
+        scalarCountQualityMetrics(qual, seq, 5, '5', refLowQ, refNBase, refTotalQ);
+        if (lowQ != refLowQ || nBase != refNBase || totalQ != refTotalQ) {
+            fprintf(stderr, "FAIL: countQualityMetrics basic\n");
+            pass = false;
+        }
+    }
+    {
+        const char* qual = "!!!!!!!!!!";
+        const char* seq = "AAAAAAAAAA";
+        int lowQ = 0, nBase = 0, totalQ = 0;
+        countQualityMetrics(qual, seq, 10, '5', lowQ, nBase, totalQ);
+        int refLowQ = 0, refNBase = 0, refTotalQ = 0;
+        scalarCountQualityMetrics(qual, seq, 10, '5', refLowQ, refNBase, refTotalQ);
+        if (lowQ != refLowQ || nBase != refNBase || totalQ != refTotalQ) {
+            fprintf(stderr, "FAIL: countQualityMetrics all-low\n");
+            pass = false;
+        }
+    }
+    {
+        const char qual[] = "IIIIII!!!!!IIIII55555NNNNN!!!!!IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII";
+        const char seq[]  = "ACGTNNACGTNACGTNACGTNACGTNACGTNACGTNACGTNACGTNACGTNACGTNACGTNACGTNACG";
+        int len = 68;
+        int lowQ = 0, nBase = 0, totalQ = 0;
+        countQualityMetrics(qual, seq, len, '5', lowQ, nBase, totalQ);
+        int refLowQ = 0, refNBase = 0, refTotalQ = 0;
+        scalarCountQualityMetrics(qual, seq, len, '5', refLowQ, refNBase, refTotalQ);
+        if (lowQ != refLowQ || nBase != refNBase || totalQ != refTotalQ) {
+            fprintf(stderr, "FAIL: countQualityMetrics long\n");
+            pass = false;
+        }
+    }
+
+    // --- countAdjacentDiffs ---
+    {
+        int d = countAdjacentDiffs("AAAAAAAAAA", 10);
+        if (d != 0) {
+            fprintf(stderr, "FAIL: countAdjacentDiffs all-same got %d\n", d);
+            pass = false;
+        }
+    }
+    {
+        const char* s = "ACACACACAC";
+        int d = countAdjacentDiffs(s, 10);
+        int ref = scalarCountAdjacentDiffs(s, 10);
+        if (d != ref) {
+            fprintf(stderr, "FAIL: countAdjacentDiffs all-diff got %d expected %d\n", d, ref);
+            pass = false;
+        }
+    }
+    {
+        if (countAdjacentDiffs("A", 1) != 0) {
+            fprintf(stderr, "FAIL: countAdjacentDiffs len=1\n");
+            pass = false;
+        }
+        if (countAdjacentDiffs("", 0) != 0) {
+            fprintf(stderr, "FAIL: countAdjacentDiffs len=0\n");
+            pass = false;
+        }
+    }
+    {
+        const char* s = "ATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCG";
+        int len = 68;
+        int d = countAdjacentDiffs(s, len);
+        int ref = scalarCountAdjacentDiffs(s, len);
+        if (d != ref) {
+            fprintf(stderr, "FAIL: countAdjacentDiffs long got %d expected %d\n", d, ref);
+            pass = false;
+        }
+    }
+
+    // --- countMismatches ---
+    {
+        const char* s = "ACGTACGTACGT";
+        int d = countMismatches(s, s, 12);
+        if (d != 0) {
+            fprintf(stderr, "FAIL: countMismatches identical got %d\n", d);
+            pass = false;
+        }
+    }
+    {
+        int d = countMismatches("AAAA", "TTTT", 4);
+        if (d != 4) {
+            fprintf(stderr, "FAIL: countMismatches all-diff got %d\n", d);
+            pass = false;
+        }
+    }
+    {
+        const char* a = "ATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCG";
+        const char* b = "ATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCG";
+        int d = countMismatches(a, b, 68);
+        if (d != 0) {
+            fprintf(stderr, "FAIL: countMismatches long-identical got %d\n", d);
+            pass = false;
+        }
+    }
+    {
+        const char* a = "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA";
+        const char* b = "TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT";
+        int d = countMismatches(a, b, 66);
+        if (d != 66) {
+            fprintf(stderr, "FAIL: countMismatches long-alldiff got %d\n", d);
+            pass = false;
+        }
+    }
+    {
+        int d = countMismatches("A", "T", 0);
+        if (d != 0) {
+            fprintf(stderr, "FAIL: countMismatches len=0 got %d\n", d);
+            pass = false;
+        }
+    }
+
+    return pass;
+}
+
+}  // namespace fastplong_simd
+
+#endif  // HWY_ONCE
+```
+
+NOTE: There is a bug in the scalar reference above — `nBase++` should be `nBaseNum++`. Fix it when implementing. Copy the exact code from fastp's `src/simd.cpp` as the reference, adjusting only the namespace from `fastp_simd` to `fastplong_simd`.
+
+**Step 3: Verify it compiles**
+
+Run: `make clean && make -j 2>&1 | tail -10`
+Expected: Compiles without errors (simd.cpp may generate warnings but no errors)
+
+**Step 4: Commit**
+
+```bash
+git add src/simd.h src/simd.cpp
+git commit -m "feat: add centralized SIMD module with Highway multi-target dispatch"
+```
+
+---
+
+### Task 4: Add SIMD unit test
+
+**Files:**
+- Create: `test/simd_test.cpp`
+
+**Step 1: Create the test file**
+
+```cpp
+#include <gtest/gtest.h>
+#include "../src/simd.h"
+
+TEST(SimdTest, allTests) {
+    EXPECT_TRUE(fastplong_simd::testSimd());
+}
+```
+
+**Step 2: Run tests**
+
+Run: `make clean && make test 2>&1 | tail -20`
+Expected: All tests pass including SimdTest.allTests
+
+**Step 3: Commit**
+
+```bash
+git add test/simd_test.cpp
+git commit -m "test: add SIMD unit tests"
+```
+
+---
+
+### Task 5: Refactor filter.cpp to use SIMD
+
+**Files:**
+- Modify: `src/filter.cpp`
+
+**Step 1: Update filter.cpp**
+
+Add include at the top (after existing includes):
+```cpp
+#include "simd.h"
+```
+
+Replace the scalar loop in `passFilter()` (the `for(int i=0; i<rlen; i++)` block at lines 27-38) with:
+```cpp
+        fastplong_simd::countQualityMetrics(qualstr, seqstr, rlen,
+                                            mOptions->qualfilter.qualifiedQual,
+                                            lowQualNum, nBaseNum, totalQual);
+```
+
+Replace the scalar loop in `passLowComplexityFilter()` (the `for(int i=0; i<length-1; i++)` block at lines 73-76) with:
+```cpp
+    int diff = fastplong_simd::countAdjacentDiffs(data, length);
+```
+Remove the `int diff = 0;` declaration at line 68 since `countAdjacentDiffs` returns it.
+
+**Step 2: Run tests**
+
+Run: `make clean && make test 2>&1 | tail -20`
+Expected: All tests pass (FilterTest.trimAndCut still passes)
+
+**Step 3: Commit**
+
+```bash
+git add src/filter.cpp
+git commit -m "perf: use SIMD countQualityMetrics and countAdjacentDiffs in filter"
+```
+
+---
+
+### Task 6: Refactor sequence.cpp to use SIMD
+
+**Files:**
+- Modify: `src/sequence.cpp`
+
+**Step 1: Replace entire sequence.cpp**
+
+```cpp
+#include "sequence.h"
+#include "simd.h"
+
+Sequence::Sequence(){
+}
+
+Sequence::Sequence(string* seq){
+    mStr = seq;
+}
+
+Sequence::~Sequence(){
+    if(mStr)
+        delete mStr;
+}
+
+void Sequence::print(){
+    std::cerr << *mStr;
+}
+
+int Sequence::length(){
+    return mStr->length();
+}
+
+string Sequence::reverseComplement(string* origin) {
+    int len = origin->length();
+    string str(len, 0);
+    fastplong_simd::reverseComplement(origin->c_str(), &str[0], len);
+    return str;
+}
+
+Sequence Sequence::reverseComplement() {
+    string* reversed = new string(Sequence::reverseComplement(mStr));
+    return Sequence(reversed);
+}
+
+Sequence Sequence::operator~(){
+    return reverseComplement();
+}
+```
+
+Key changes:
+- Removed `#include <hwy/highway.h>`, `#include <hwy/contrib/algo/transform-inl.h>`, `#include <hwy/aligned_allocator.h>`, `#include "simdutil.h"`
+- Removed `namespace hn = hwy::HWY_NAMESPACE;`
+- Simplified `reverseComplement(string*)` to a 4-line function calling `fastplong_simd::reverseComplement`
+- Removed the `#if _MSC_VER` / VLA / AllocateAligned branching
+
+**Step 2: Run tests**
+
+Run: `make clean && make test 2>&1 | tail -20`
+Expected: All tests pass (SequenceTests.reverse still passes)
+
+**Step 3: Commit**
+
+```bash
+git add src/sequence.cpp
+git commit -m "refactor: use centralized SIMD reverseComplement in sequence.cpp"
+```
+
+---
+
+### Task 7: Refactor adaptertrimmer.cpp to use SIMD
+
+**Files:**
+- Modify: `src/adaptertrimmer.cpp`
+
+**Step 1: Update adaptertrimmer.cpp**
+
+Replace includes — change:
+```cpp
+#include "hwy/highway.h"
+```
+to:
+```cpp
+#include "simd.h"
+```
+
+In `searchAdapter()`, remove the Highway setup at lines 61-63:
+```cpp
+    namespace hn = hwy::HWY_NAMESPACE;
+    hn::ScalableTag<uint8_t> d8;
+    const size_t N = hn::Lanes(d8);
+```
+
+Replace the three inner SIMD loops. Each occurrence of this pattern (appears at lines 89-96, 115-122, 139-146):
+```cpp
+            size_t mismatch = 0;
+            for (size_t i = 0; i < alen; i += N)
+            {
+                const size_t lanesToLoad = min(alen - i, N);
+                const auto rdata_v = hn::LoadN(d8, reinterpret_cast<const uint8_t *>(rdata + i + p), lanesToLoad);
+                const auto adata_v = hn::LoadN(d8, reinterpret_cast<const uint8_t *>(adata + i), lanesToLoad);
+                const auto mismatch_mask = rdata_v != adata_v;
+                mismatch += hn::CountTrue(d8, mismatch_mask);
+            }
+```
+
+Replace each with:
+```cpp
+            int mismatch = fastplong_simd::countMismatches(rdata + p, adata, alen);
+```
+
+Also change `size_t mismatch` → `int mismatch` in the comparison logic (the variable type is returned as `int` by `countMismatches`).
+
+**Step 2: Run tests**
+
+Run: `make clean && make test 2>&1 | tail -20`
+Expected: All tests pass (AdapterTrimmer tests still pass)
+
+**Step 3: Commit**
+
+```bash
+git add src/adaptertrimmer.cpp
+git commit -m "refactor: use centralized SIMD countMismatches in adaptertrimmer"
+```
+
+---
+
+### Task 8: Remove simdutil.h
+
+**Files:**
+- Delete: `src/simdutil.h`
+
+**Step 1: Verify no remaining references**
+
+Run: `grep -r "simdutil" src/ test/`
+Expected: No output (no files reference simdutil.h anymore after Task 6)
+
+**Step 2: Delete the file**
+
+```bash
+rm src/simdutil.h
+```
+
+**Step 3: Run full build and tests**
+
+Run: `make clean && make test 2>&1 | tail -20`
+Expected: All tests pass, no compilation errors
+
+**Step 4: Commit**
+
+```bash
+git add -A src/simdutil.h
+git commit -m "cleanup: remove simdutil.h, functionality merged into simd.cpp"
+```
+
+---
+
+### Task 9: Final verification
+
+**Step 1: Clean build**
+
+Run: `make clean && make -j 2>&1`
+Expected: Clean compilation with no errors or warnings
+
+**Step 2: Run all tests**
+
+Run: `make test 2>&1`
+Expected: All tests pass
+
+**Step 3: Quick smoke test with real data (if available)**
+
+Run: `./fastplong --help`
+Expected: Help output appears normally
+
+**Step 4: Verify no remaining direct Highway includes in source files (except simd.cpp)**
+
+Run: `grep -r '#include.*hwy/' src/ | grep -v simd.cpp`
+Expected: No output — only simd.cpp should include Highway headers directly
diff --git a/src/adaptertrimmer.cpp b/src/adaptertrimmer.cpp
index ccc5ed5..b95c8ea 100644
--- a/src/adaptertrimmer.cpp
+++ b/src/adaptertrimmer.cpp
@@ -1,7 +1,7 @@
 #include "adaptertrimmer.h"
 #include "editdistance.h"
 #include <math.h>
-#include "hwy/highway.h"
+#include "simd.h"
 
 AdapterTrimmer::AdapterTrimmer(){
 }
@@ -58,10 +58,6 @@ int AdapterTrimmer::trimByMultiSequences(Read* r, FilterResult* fr, vector<strin
 
 int AdapterTrimmer::searchAdapter(string *read, const string &adapter, double edMax, int searchStart, int searchLen, bool asLeftAsPossible, bool asRightAsPossible)
 {
-    namespace hn = hwy::HWY_NAMESPACE;
-    hn::ScalableTag<uint8_t> d8;
-    const size_t N = hn::Lanes(d8);
-
     int minMismatch = 99999; // initialized with a large mismatch
     int pos = -1;
     // for the best match
@@ -86,15 +82,7 @@ int AdapterTrimmer::searchAdapter(string *read, const string &adapter, double ed
         // go from left, and return immediatedly if find a mismatch < threshold
         for (int p = searchStart; p < searchEnd - alen; p++)
         {
-            size_t mismatch = 0;
-            for (size_t i = 0; i < alen; i += N)
-            {
-                const size_t lanesToLoad = min(alen - i, N);
-                const auto rdata_v = hn::LoadN(d8, reinterpret_cast<const uint8_t *>(rdata + i + p), lanesToLoad);
-                const auto adata_v = hn::LoadN(d8, reinterpret_cast<const uint8_t *>(adata + i), lanesToLoad);
-                const auto mismatch_mask = rdata_v != adata_v;
-                mismatch += hn::CountTrue(d8, mismatch_mask);
-            }
+            int mismatch = fastplong_simd::countMismatches(rdata + p, adata, alen);
 
             if (mismatch <= threshold)
             {
@@ -112,15 +100,7 @@ int AdapterTrimmer::searchAdapter(string *read, const string &adapter, double ed
         // go from right, and return immediatedly if find a mismatch <= threshold
         for (int p = searchEnd - alen; p >= searchStart; p--)
         {
-            size_t mismatch = 0;
-            for (size_t i = 0; i < alen; i += N)
-            {
-                const size_t lanesToLoad = min(alen - i, N);
-                const auto rdata_v = hn::LoadN(d8, reinterpret_cast<const uint8_t *>(rdata + i + p), lanesToLoad);
-                const auto adata_v = hn::LoadN(d8, reinterpret_cast<const uint8_t *>(adata + i), lanesToLoad);
-                const auto mismatch_mask = rdata_v != adata_v;
-                mismatch += hn::CountTrue(d8, mismatch_mask);
-            }
+            int mismatch = fastplong_simd::countMismatches(rdata + p, adata, alen);
             if (mismatch <= threshold)
             {
                 return p;
@@ -136,15 +116,7 @@ int AdapterTrimmer::searchAdapter(string *read, const string &adapter, double ed
     {
         for (int p = searchStart; p < searchEnd - alen; p++)
         {
-            size_t mismatch = 0;
-            for (size_t i = 0; i < alen; i += N)
-            {
-                const size_t lanesToLoad = min(alen - i, N);
-                const auto rdata_v = hn::LoadN(d8, reinterpret_cast<const uint8_t *>(rdata + i + p), lanesToLoad);
-                const auto adata_v = hn::LoadN(d8, reinterpret_cast<const uint8_t *>(adata + i), lanesToLoad);
-                const auto mismatch_mask = rdata_v != adata_v;
-                mismatch += hn::CountTrue(d8, mismatch_mask);
-            }
+            int mismatch = fastplong_simd::countMismatches(rdata + p, adata, alen);
             if (mismatch < minMismatch)
             {
                 minMismatch = mismatch;
diff --git a/src/fastqreader.h b/src/fastqreader.h
index 3ff41c1..5c9dfe6 100644
--- a/src/fastqreader.h
+++ b/src/fastqreader.h
@@ -32,7 +32,7 @@ SOFTWARE.
 #include <iostream>
 #include <fstream>
 #include "readpool.h"
-#include <isa-l/igzip_lib.h>
+#include "igzip_lib.h"
 
 class FastqReader{
 public:
diff --git a/src/filter.cpp b/src/filter.cpp
index 07847ef..df1d275 100644
--- a/src/filter.cpp
+++ b/src/filter.cpp
@@ -1,5 +1,6 @@
 #include "processor.h"
 #include "seprocessor.h"
+#include "simd.h"
 
 Filter::Filter(Options* opt){
     mOptions = opt;
@@ -24,18 +25,9 @@ int Filter::passFilter(Read* r) {
         const char* seqstr = r->mSeq->c_str();
         const char* qualstr = r->mQuality->c_str();
 
-        for(int i=0; i<rlen; i++) {
-            char base = seqstr[i];
-            char qual = qualstr[i];
-
-            totalQual += qual - 33;
-
-            if(qual < mOptions->qualfilter.qualifiedQual)
-                lowQualNum ++;
-
-            if(base == 'N')
-                nBaseNum++;
-        }
+        fastplong_simd::countQualityMetrics(qualstr, seqstr, rlen,
+                                            mOptions->qualfilter.qualifiedQual,
+                                            lowQualNum, nBaseNum, totalQual);
     }
 
     if(mOptions->qualfilter.enabled) {
@@ -65,19 +57,12 @@ int Filter::passFilter(Read* r) {
 }
 
 bool Filter::passLowComplexityFilter(Read* r) {
-    int diff = 0;
     int length = r->length();
     if(length <= 1)
         return false;
     const char* data = r->mSeq->c_str();
-    for(int i=0; i<length-1; i++) {
-        if(data[i] != data[i+1])
-            diff++;
-    }
-    if( (double)diff/(double)(length-1) >= mOptions->complexityFilter.threshold )
-        return true;
-    else
-        return false;
+    int diff = fastplong_simd::countAdjacentDiffs(data, length);
+    return (double)diff/(double)(length-1) >= mOptions->complexityFilter.threshold;
 }
 
 vector<pair<int, int>> Filter::detectLowQualityRegions(Read* r, int windowSize, int quality) {
diff --git a/src/libdeflate.h b/src/libdeflate.h
deleted file mode 100644
index 4e124e7..0000000
--- a/src/libdeflate.h
+++ /dev/null
@@ -1,362 +0,0 @@
-/*
- * libdeflate.h - public header for libdeflate
- */
-
-#ifndef LIBDEFLATE_H
-#define LIBDEFLATE_H
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#define LIBDEFLATE_VERSION_MAJOR	1
-#define LIBDEFLATE_VERSION_MINOR	6
-#define LIBDEFLATE_VERSION_STRING	"1.6"
-
-#include <stddef.h>
-#include <stdint.h>
-
-/*
- * On Windows, if you want to link to the DLL version of libdeflate, then
- * #define LIBDEFLATE_DLL.  Note that the calling convention is "stdcall".
- */
-#ifdef LIBDEFLATE_DLL
-#  ifdef BUILDING_LIBDEFLATE
-#    define LIBDEFLATEEXPORT	LIBEXPORT
-#  elif defined(_WIN32) || defined(__CYGWIN__)
-#    define LIBDEFLATEEXPORT	__declspec(dllimport)
-#  endif
-#endif
-#ifndef LIBDEFLATEEXPORT
-#  define LIBDEFLATEEXPORT
-#endif
-
-#if defined(_WIN32) && !defined(_WIN64)
-#  define LIBDEFLATEAPI_ABI	__stdcall
-#else
-#  define LIBDEFLATEAPI_ABI
-#endif
-
-#if defined(BUILDING_LIBDEFLATE) && defined(__GNUC__) && \
-	defined(_WIN32) && !defined(_WIN64)
-    /*
-     * On 32-bit Windows, gcc assumes 16-byte stack alignment but MSVC only 4.
-     * Realign the stack when entering libdeflate to avoid crashing in SSE/AVX
-     * code when called from an MSVC-compiled application.
-     */
-#  define LIBDEFLATEAPI_STACKALIGN	__attribute__((force_align_arg_pointer))
-#else
-#  define LIBDEFLATEAPI_STACKALIGN
-#endif
-
-#define LIBDEFLATEAPI	LIBDEFLATEAPI_ABI LIBDEFLATEAPI_STACKALIGN
-
-/* ========================================================================== */
-/*                             Compression                                    */
-/* ========================================================================== */
-
-struct libdeflate_compressor;
-
-/*
- * libdeflate_alloc_compressor() allocates a new compressor that supports
- * DEFLATE, zlib, and gzip compression.  'compression_level' is the compression
- * level on a zlib-like scale but with a higher maximum value (1 = fastest, 6 =
- * medium/default, 9 = slow, 12 = slowest).  The return value is a pointer to
- * the new compressor, or NULL if out of memory.
- *
- * Note: for compression, the sliding window size is defined at compilation time
- * to 32768, the largest size permissible in the DEFLATE format.  It cannot be
- * changed at runtime.
- *
- * A single compressor is not safe to use by multiple threads concurrently.
- * However, different threads may use different compressors concurrently.
- */
-LIBDEFLATEEXPORT struct libdeflate_compressor * LIBDEFLATEAPI
-libdeflate_alloc_compressor(int compression_level);
-
-/*
- * libdeflate_deflate_compress() performs raw DEFLATE compression on a buffer of
- * data.  The function attempts to compress 'in_nbytes' bytes of data located at
- * 'in' and write the results to 'out', which has space for 'out_nbytes_avail'
- * bytes.  The return value is the compressed size in bytes, or 0 if the data
- * could not be compressed to 'out_nbytes_avail' bytes or fewer.
- */
-LIBDEFLATEEXPORT size_t LIBDEFLATEAPI
-libdeflate_deflate_compress(struct libdeflate_compressor *compressor,
-			    const void *in, size_t in_nbytes,
-			    void *out, size_t out_nbytes_avail);
-
-/*
- * libdeflate_deflate_compress_bound() returns a worst-case upper bound on the
- * number of bytes of compressed data that may be produced by compressing any
- * buffer of length less than or equal to 'in_nbytes' using
- * libdeflate_deflate_compress() with the specified compressor.  Mathematically,
- * this bound will necessarily be a number greater than or equal to 'in_nbytes'.
- * It may be an overestimate of the true upper bound.  The return value is
- * guaranteed to be the same for all invocations with the same compressor and
- * same 'in_nbytes'.
- *
- * As a special case, 'compressor' may be NULL.  This causes the bound to be
- * taken across *any* libdeflate_compressor that could ever be allocated with
- * this build of the library, with any options.
- *
- * Note that this function is not necessary in many applications.  With
- * block-based compression, it is usually preferable to separately store the
- * uncompressed size of each block and to store any blocks that did not compress
- * to less than their original size uncompressed.  In that scenario, there is no
- * need to know the worst-case compressed size, since the maximum number of
- * bytes of compressed data that may be used would always be one less than the
- * input length.  You can just pass a buffer of that size to
- * libdeflate_deflate_compress() and store the data uncompressed if
- * libdeflate_deflate_compress() returns 0, indicating that the compressed data
- * did not fit into the provided output buffer.
- */
-LIBDEFLATEEXPORT size_t LIBDEFLATEAPI
-libdeflate_deflate_compress_bound(struct libdeflate_compressor *compressor,
-				  size_t in_nbytes);
-
-/*
- * Like libdeflate_deflate_compress(), but stores the data in the zlib wrapper
- * format.
- */
-LIBDEFLATEEXPORT size_t LIBDEFLATEAPI
-libdeflate_zlib_compress(struct libdeflate_compressor *compressor,
-			 const void *in, size_t in_nbytes,
-			 void *out, size_t out_nbytes_avail);
-
-/*
- * Like libdeflate_deflate_compress_bound(), but assumes the data will be
- * compressed with libdeflate_zlib_compress() rather than with
- * libdeflate_deflate_compress().
- */
-LIBDEFLATEEXPORT size_t LIBDEFLATEAPI
-libdeflate_zlib_compress_bound(struct libdeflate_compressor *compressor,
-			       size_t in_nbytes);
-
-/*
- * Like libdeflate_deflate_compress(), but stores the data in the gzip wrapper
- * format.
- */
-LIBDEFLATEEXPORT size_t LIBDEFLATEAPI
-libdeflate_gzip_compress(struct libdeflate_compressor *compressor,
-			 const void *in, size_t in_nbytes,
-			 void *out, size_t out_nbytes_avail);
-
-/*
- * Like libdeflate_deflate_compress_bound(), but assumes the data will be
- * compressed with libdeflate_gzip_compress() rather than with
- * libdeflate_deflate_compress().
- */
-LIBDEFLATEEXPORT size_t LIBDEFLATEAPI
-libdeflate_gzip_compress_bound(struct libdeflate_compressor *compressor,
-			       size_t in_nbytes);
-
-/*
- * libdeflate_free_compressor() frees a compressor that was allocated with
- * libdeflate_alloc_compressor().  If a NULL pointer is passed in, no action is
- * taken.
- */
-LIBDEFLATEEXPORT void LIBDEFLATEAPI
-libdeflate_free_compressor(struct libdeflate_compressor *compressor);
-
-/* ========================================================================== */
-/*                             Decompression                                  */
-/* ========================================================================== */
-
-struct libdeflate_decompressor;
-
-/*
- * libdeflate_alloc_decompressor() allocates a new decompressor that can be used
- * for DEFLATE, zlib, and gzip decompression.  The return value is a pointer to
- * the new decompressor, or NULL if out of memory.
- *
- * This function takes no parameters, and the returned decompressor is valid for
- * decompressing data that was compressed at any compression level and with any
- * sliding window size.
- *
- * A single decompressor is not safe to use by multiple threads concurrently.
- * However, different threads may use different decompressors concurrently.
- */
-LIBDEFLATEEXPORT struct libdeflate_decompressor * LIBDEFLATEAPI
-libdeflate_alloc_decompressor(void);
-
-/*
- * Result of a call to libdeflate_deflate_decompress(),
- * libdeflate_zlib_decompress(), or libdeflate_gzip_decompress().
- */
-enum libdeflate_result {
-	/* Decompression was successful.  */
-	LIBDEFLATE_SUCCESS = 0,
-
-	/* Decompressed failed because the compressed data was invalid, corrupt,
-	 * or otherwise unsupported.  */
-	LIBDEFLATE_BAD_DATA = 1,
-
-	/* A NULL 'actual_out_nbytes_ret' was provided, but the data would have
-	 * decompressed to fewer than 'out_nbytes_avail' bytes.  */
-	LIBDEFLATE_SHORT_OUTPUT = 2,
-
-	/* The data would have decompressed to more than 'out_nbytes_avail'
-	 * bytes.  */
-	LIBDEFLATE_INSUFFICIENT_SPACE = 3,
-};
-
-/*
- * libdeflate_deflate_decompress() decompresses the DEFLATE-compressed stream
- * from the buffer 'in' with compressed size up to 'in_nbytes' bytes.  The
- * uncompressed data is written to 'out', a buffer with size 'out_nbytes_avail'
- * bytes.  If decompression succeeds, then 0 (LIBDEFLATE_SUCCESS) is returned.
- * Otherwise, a nonzero result code such as LIBDEFLATE_BAD_DATA is returned.  If
- * a nonzero result code is returned, then the contents of the output buffer are
- * undefined.
- *
- * Decompression stops at the end of the DEFLATE stream (as indicated by the
- * BFINAL flag), even if it is actually shorter than 'in_nbytes' bytes.
- *
- * libdeflate_deflate_decompress() can be used in cases where the actual
- * uncompressed size is known (recommended) or unknown (not recommended):
- *
- *   - If the actual uncompressed size is known, then pass the actual
- *     uncompressed size as 'out_nbytes_avail' and pass NULL for
- *     'actual_out_nbytes_ret'.  This makes libdeflate_deflate_decompress() fail
- *     with LIBDEFLATE_SHORT_OUTPUT if the data decompressed to fewer than the
- *     specified number of bytes.
- *
- *   - If the actual uncompressed size is unknown, then provide a non-NULL
- *     'actual_out_nbytes_ret' and provide a buffer with some size
- *     'out_nbytes_avail' that you think is large enough to hold all the
- *     uncompressed data.  In this case, if the data decompresses to less than
- *     or equal to 'out_nbytes_avail' bytes, then
- *     libdeflate_deflate_decompress() will write the actual uncompressed size
- *     to *actual_out_nbytes_ret and return 0 (LIBDEFLATE_SUCCESS).  Otherwise,
- *     it will return LIBDEFLATE_INSUFFICIENT_SPACE if the provided buffer was
- *     not large enough but no other problems were encountered, or another
- *     nonzero result code if decompression failed for another reason.
- */
-LIBDEFLATEEXPORT enum libdeflate_result LIBDEFLATEAPI
-libdeflate_deflate_decompress(struct libdeflate_decompressor *decompressor,
-			      const void *in, size_t in_nbytes,
-			      void *out, size_t out_nbytes_avail,
-			      size_t *actual_out_nbytes_ret);
-
-/*
- * Like libdeflate_deflate_decompress(), but adds the 'actual_in_nbytes_ret'
- * argument.  If decompression succeeds and 'actual_in_nbytes_ret' is not NULL,
- * then the actual compressed size of the DEFLATE stream (aligned to the next
- * byte boundary) is written to *actual_in_nbytes_ret.
- */
-LIBDEFLATEEXPORT enum libdeflate_result LIBDEFLATEAPI
-libdeflate_deflate_decompress_ex(struct libdeflate_decompressor *decompressor,
-				 const void *in, size_t in_nbytes,
-				 void *out, size_t out_nbytes_avail,
-				 size_t *actual_in_nbytes_ret,
-				 size_t *actual_out_nbytes_ret);
-
-/*
- * Like libdeflate_deflate_decompress(), but assumes the zlib wrapper format
- * instead of raw DEFLATE.
- *
- * Decompression will stop at the end of the zlib stream, even if it is shorter
- * than 'in_nbytes'.  If you need to know exactly where the zlib stream ended,
- * use libdeflate_zlib_decompress_ex().
- */
-LIBDEFLATEEXPORT enum libdeflate_result LIBDEFLATEAPI
-libdeflate_zlib_decompress(struct libdeflate_decompressor *decompressor,
-			   const void *in, size_t in_nbytes,
-			   void *out, size_t out_nbytes_avail,
-			   size_t *actual_out_nbytes_ret);
-
-/*
- * Like libdeflate_zlib_decompress(), but adds the 'actual_in_nbytes_ret'
- * argument.  If 'actual_in_nbytes_ret' is not NULL and the decompression
- * succeeds (indicating that the first zlib-compressed stream in the input
- * buffer was decompressed), then the actual number of input bytes consumed is
- * written to *actual_in_nbytes_ret.
- */
-LIBDEFLATEEXPORT enum libdeflate_result LIBDEFLATEAPI
-libdeflate_zlib_decompress_ex(struct libdeflate_decompressor *decompressor,
-			      const void *in, size_t in_nbytes,
-			      void *out, size_t out_nbytes_avail,
-			      size_t *actual_in_nbytes_ret,
-			      size_t *actual_out_nbytes_ret);
-
-/*
- * Like libdeflate_deflate_decompress(), but assumes the gzip wrapper format
- * instead of raw DEFLATE.
- *
- * If multiple gzip-compressed members are concatenated, then only the first
- * will be decompressed.  Use libdeflate_gzip_decompress_ex() if you need
- * multi-member support.
- */
-LIBDEFLATEEXPORT enum libdeflate_result LIBDEFLATEAPI
-libdeflate_gzip_decompress(struct libdeflate_decompressor *decompressor,
-			   const void *in, size_t in_nbytes,
-			   void *out, size_t out_nbytes_avail,
-			   size_t *actual_out_nbytes_ret);
-
-/*
- * Like libdeflate_gzip_decompress(), but adds the 'actual_in_nbytes_ret'
- * argument.  If 'actual_in_nbytes_ret' is not NULL and the decompression
- * succeeds (indicating that the first gzip-compressed member in the input
- * buffer was decompressed), then the actual number of input bytes consumed is
- * written to *actual_in_nbytes_ret.
- */
-LIBDEFLATEEXPORT enum libdeflate_result LIBDEFLATEAPI
-libdeflate_gzip_decompress_ex(struct libdeflate_decompressor *decompressor,
-			      const void *in, size_t in_nbytes,
-			      void *out, size_t out_nbytes_avail,
-			      size_t *actual_in_nbytes_ret,
-			      size_t *actual_out_nbytes_ret);
-
-/*
- * libdeflate_free_decompressor() frees a decompressor that was allocated with
- * libdeflate_alloc_decompressor().  If a NULL pointer is passed in, no action
- * is taken.
- */
-LIBDEFLATEEXPORT void LIBDEFLATEAPI
-libdeflate_free_decompressor(struct libdeflate_decompressor *decompressor);
-
-/* ========================================================================== */
-/*                                Checksums                                   */
-/* ========================================================================== */
-
-/*
- * libdeflate_adler32() updates a running Adler-32 checksum with 'len' bytes of
- * data and returns the updated checksum.  When starting a new checksum, the
- * required initial value for 'adler' is 1.  This value is also returned when
- * 'buffer' is specified as NULL.
- */
-LIBDEFLATEEXPORT uint32_t LIBDEFLATEAPI
-libdeflate_adler32(uint32_t adler32, const void *buffer, size_t len);
-
-
-/*
- * libdeflate_crc32() updates a running CRC-32 checksum with 'len' bytes of data
- * and returns the updated checksum.  When starting a new checksum, the required
- * initial value for 'crc' is 0.  This value is also returned when 'buffer' is
- * specified as NULL.
- */
-LIBDEFLATEEXPORT uint32_t LIBDEFLATEAPI
-libdeflate_crc32(uint32_t crc, const void *buffer, size_t len);
-
-/* ========================================================================== */
-/*                           Custom memory allocator                          */
-/* ========================================================================== */
-
-/*
- * Install a custom memory allocator which libdeflate will use for all memory
- * allocations.  'malloc_func' is a function that must behave like malloc(), and
- * 'free_func' is a function that must behave like free().
- *
- * There must not be any libdeflate_compressor or libdeflate_decompressor
- * structures in existence when calling this function.
- */
-LIBDEFLATEEXPORT void LIBDEFLATEAPI
-libdeflate_set_memory_allocator(void *(*malloc_func)(size_t),
-				void (*free_func)(void *));
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* LIBDEFLATE_H */
diff --git a/src/sequence.cpp b/src/sequence.cpp
index 7ec0c42..0adb678 100644
--- a/src/sequence.cpp
+++ b/src/sequence.cpp
@@ -1,10 +1,5 @@
 #include "sequence.h"
-#include <hwy/highway.h>
-#include <hwy/contrib/algo/transform-inl.h>
-#include <hwy/aligned_allocator.h>
-#include "simdutil.h"
-
-namespace hn = hwy::HWY_NAMESPACE;
+#include "simd.h"
 
 Sequence::Sequence(){
 }
@@ -26,54 +21,11 @@ int Sequence::length(){
     return mStr->length();
 }
 
-string Sequence::reverseComplement(string *HWY_RESTRICT origin) {
-    auto length = origin->length();
-    const hn::ScalableTag<uint8_t> d;
-    const auto sequence = reinterpret_cast<const uint8_t*>(origin->c_str());
-    const auto transform = [](const auto d, auto output, const auto sequence) HWY_ATTR
-    {
-        const auto A = hn::Set(d, 'A');
-        const auto T = hn::Set(d, 'T');
-        const auto C = hn::Set(d, 'C');
-        const auto G = hn::Set(d, 'G');
-        const auto a = hn::Set(d, 'a');
-        const auto t = hn::Set(d, 't');
-        const auto c = hn::Set(d, 'c');
-        const auto g = hn::Set(d, 'g');
-        const auto N = hn::Set(d, 'N');
-
-        // output[i] = sequence[i] == 'A' || sequence[i] == 'a' ? 'T' : 'N'
-        output = hn::IfThenElse(hn::Or(hn::Eq(sequence, A), hn::Eq(sequence, a)), T, N);
-        output = hn::IfThenElse(hn::Or(hn::Eq(sequence, T), hn::Eq(sequence, t)), A, output);
-        output = hn::IfThenElse(hn::Or(hn::Eq(sequence, C), hn::Eq(sequence, c)), G, output);
-        output = hn::IfThenElse(hn::Or(hn::Eq(sequence, G), hn::Eq(sequence, g)), C, output);
-        return output;
-    };
-#if _MSC_VER
-    if (true) {
-#else
-    if (length <= 1000000) {
-#endif
-#if _MSC_VER
-        auto outputPtr = std::make_unique<uint8_t[]>(length);
-        uint8_t* output = outputPtr.get();
-#else
-        uint8_t output[length];
-#endif
-        hn::Transform1Reversed(d, output, length, sequence, transform);
-        auto retVal = reinterpret_cast<char *>(output);
-        std::string reversed(retVal, length);
-        return reversed;
-    }
-#ifndef _MSC_VER
-    else {
-        const auto allocated = hwy::AllocateAligned<uint8_t>(length);
-        hn::Transform1Reversed(d, allocated.get(), length, sequence, transform);
-        auto retVal = reinterpret_cast<char *>(allocated.get());
-        std::string reversed(retVal, length);
-        return reversed;
-    }
-#endif
+string Sequence::reverseComplement(string* origin) {
+    int len = origin->length();
+    string str(len, 0);
+    fastplong_simd::reverseComplement(origin->c_str(), &str[0], len);
+    return str;
 }
 
 Sequence Sequence::reverseComplement() {
diff --git a/src/simd.cpp b/src/simd.cpp
new file mode 100644
index 0000000..3f1ba62
--- /dev/null
+++ b/src/simd.cpp
@@ -0,0 +1,452 @@
+// SIMD-accelerated functions for fastplong using Google Highway.
+// This file is compiled once per SIMD target via foreach_target.h.
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "src/simd.cpp"
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
+#include "hwy/highway.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace fastplong_simd {
+namespace HWY_NAMESPACE {
+
+namespace hn = hwy::HWY_NAMESPACE;
+
+void CountQualityMetricsImpl(const char* qualstr, const char* seqstr, int len,
+                             char qualThreshold, int& lowQualNum, int& nBaseNum,
+                             int& totalQual) {
+    const hn::ScalableTag<uint8_t> d;
+    const int N = hn::Lanes(d);
+
+    const auto vThresh = hn::Set(d, static_cast<uint8_t>(qualThreshold));
+    const auto vN = hn::Set(d, static_cast<uint8_t>('N'));
+    const auto v33 = hn::Set(d, static_cast<uint8_t>(33));
+
+    int lowQual = 0;
+    int nBase = 0;
+    int qualSum = 0;
+    int i = 0;
+
+    // Process in blocks of up to 255 vectors to avoid u16 overflow.
+    // SumsOf2 produces at most 2*93 = 186 per u16 lane (valid FASTQ range).
+    // Over 255 iterations: 255 * 186 = 47430, safely within u16 (max 65535).
+    const int blockSize = 255 * N;
+
+    for (int blockStart = 0; blockStart < len; blockStart += blockSize) {
+        int blockEnd = blockStart + blockSize;
+        if (blockEnd > len) blockEnd = len;
+
+        // Use u16 accumulator for quality sum within this block.
+        // SumsOf2 pairwise-adds adjacent u8 lanes into u16, avoiding
+        // PromoteUpperTo which is unavailable on HWY_SCALAR.
+        const hn::ScalableTag<uint16_t> d16;
+        auto vQualSum16 = hn::Zero(d16);
+
+        for (i = blockStart; i + N <= blockEnd; i += N) {
+            const auto vQual = hn::LoadU(d, reinterpret_cast<const uint8_t*>(qualstr + i));
+            const auto vSeq = hn::LoadU(d, reinterpret_cast<const uint8_t*>(seqstr + i));
+
+            // Count bases with quality < threshold
+            const auto maskLowQ = hn::Lt(vQual, vThresh);
+            lowQual += hn::CountTrue(d, maskLowQ);
+
+            // Count N bases
+            const auto maskN = hn::Eq(vSeq, vN);
+            nBase += hn::CountTrue(d, maskN);
+
+            // Subtract 33 and accumulate quality into u16 accumulator.
+            // SumsOf2 adds adjacent u8 pairs -> u16, works on all targets.
+            const auto vQualAdj = hn::Sub(vQual, v33);
+            vQualSum16 = hn::Add(vQualSum16, hn::SumsOf2(vQualAdj));
+        }
+
+        // Reduce u16 accumulator to scalar via SumsOf2 -> u32 then ReduceSum
+        const hn::ScalableTag<uint32_t> d32;
+        qualSum += static_cast<int>(
+            hn::ReduceSum(d32, hn::SumsOf2(vQualSum16)));
+    }
+
+    // Scalar tail
+    for (; i < len; i++) {
+        uint8_t qual = static_cast<uint8_t>(qualstr[i]);
+        qualSum += qual - 33;
+        if (qual < static_cast<uint8_t>(qualThreshold)) lowQual++;
+        if (seqstr[i] == 'N') nBase++;
+    }
+
+    lowQualNum = lowQual;
+    nBaseNum = nBase;
+    totalQual = qualSum;
+}
+
+void ReverseComplementImpl(const char* src, char* dst, int len) {
+    const hn::ScalableTag<uint8_t> d;
+    const int N = hn::Lanes(d);
+
+    // Complement via single table lookup on low nibble of ASCII code.
+    // DNA base low nibbles: A/a=1, C/c=3, T/t=4, G/g=7, N=14.
+    // Uppercase and lowercase share the same low nibble, so one table
+    // handles both.  Unmapped nibbles default to 'N'.
+    HWY_ALIGN static const uint8_t kComplement[16] = {
+        'N', 'T', 'N', 'G', 'A', 'N', 'N', 'C',
+        'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N'
+    };
+    const auto table = hn::LoadDup128(d, kComplement);
+    const auto vMask = hn::Set(d, static_cast<uint8_t>(0x0F));
+
+    int i = 0;
+    for (; i + N <= len; i += N) {
+        const auto v = hn::LoadU(d, reinterpret_cast<const uint8_t*>(src + i));
+        const auto indices = hn::And(v, vMask);
+        const auto comp = hn::TableLookupBytes(table, indices);
+
+        // Reverse and store at mirrored position
+        const auto revComp = hn::Reverse(d, comp);
+        hn::StoreU(revComp, d, reinterpret_cast<uint8_t*>(dst + len - i - N));
+    }
+
+    // Scalar tail
+    for (; i < len; i++) {
+        char base = src[i];
+        char comp;
+        switch (base) {
+            case 'A': case 'a': comp = 'T'; break;
+            case 'T': case 't': comp = 'A'; break;
+            case 'C': case 'c': comp = 'G'; break;
+            case 'G': case 'g': comp = 'C'; break;
+            default: comp = 'N'; break;
+        }
+        dst[len - 1 - i] = comp;
+    }
+}
+
+int CountAdjacentDiffsImpl(const char* data, int len) {
+    if (len <= 1) return 0;
+
+    const hn::ScalableTag<uint8_t> d;
+    const int N = hn::Lanes(d);
+
+    int diff = 0;
+    int i = 0;
+
+    // Compare data[i..i+N-1] with data[i+1..i+N]
+    for (; i + N < len; i += N) {
+        const auto v1 = hn::LoadU(d, reinterpret_cast<const uint8_t*>(data + i));
+        const auto v2 = hn::LoadU(d, reinterpret_cast<const uint8_t*>(data + i + 1));
+        const auto maskNe = hn::Ne(v1, v2);
+        diff += hn::CountTrue(d, maskNe);
+    }
+
+    // Scalar tail
+    for (; i < len - 1; i++) {
+        if (data[i] != data[i + 1]) diff++;
+    }
+
+    return diff;
+}
+
+int CountMismatchesImpl(const char* a, const char* b, int len) {
+    const hn::ScalableTag<uint8_t> d;
+    const int N = hn::Lanes(d);
+
+    int diff = 0;
+    int i = 0;
+
+    for (; i + N <= len; i += N) {
+        const auto va = hn::LoadU(d, reinterpret_cast<const uint8_t*>(a + i));
+        const auto vb = hn::LoadU(d, reinterpret_cast<const uint8_t*>(b + i));
+        const auto maskNe = hn::Ne(va, vb);
+        diff += hn::CountTrue(d, maskNe);
+    }
+
+    // Scalar tail
+    for (; i < len; i++) {
+        if (a[i] != b[i]) diff++;
+    }
+
+    return diff;
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace fastplong_simd
+HWY_AFTER_NAMESPACE();
+
+// ---- Dynamic dispatch wrappers (compiled once) ----
+#if HWY_ONCE
+
+#include <cstdio>
+#include <cstring>
+#include <cstdint>
+
+namespace fastplong_simd {
+
+HWY_EXPORT(CountQualityMetricsImpl);
+HWY_EXPORT(ReverseComplementImpl);
+HWY_EXPORT(CountAdjacentDiffsImpl);
+HWY_EXPORT(CountMismatchesImpl);
+
+void countQualityMetrics(const char* qualstr, const char* seqstr, int len,
+                         char qualThreshold, int& lowQualNum, int& nBaseNum,
+                         int& totalQual) {
+    HWY_DYNAMIC_DISPATCH(CountQualityMetricsImpl)(qualstr, seqstr, len,
+                                                   qualThreshold, lowQualNum,
+                                                   nBaseNum, totalQual);
+}
+
+void reverseComplement(const char* src, char* dst, int len) {
+    HWY_DYNAMIC_DISPATCH(ReverseComplementImpl)(src, dst, len);
+}
+
+int countAdjacentDiffs(const char* data, int len) {
+    return HWY_DYNAMIC_DISPATCH(CountAdjacentDiffsImpl)(data, len);
+}
+
+int countMismatches(const char* a, const char* b, int len) {
+    return HWY_DYNAMIC_DISPATCH(CountMismatchesImpl)(a, b, len);
+}
+
+// ---- Scalar reference implementations for testing ----
+
+static void scalarCountQualityMetrics(const char* qualstr, const char* seqstr,
+                                       int len, char qualThreshold,
+                                       int& lowQualNum, int& nBaseNum,
+                                       int& totalQual) {
+    lowQualNum = 0;
+    nBaseNum = 0;
+    totalQual = 0;
+    for (int i = 0; i < len; i++) {
+        uint8_t q = static_cast<uint8_t>(qualstr[i]);
+        totalQual += q - 33;
+        if (q < static_cast<uint8_t>(qualThreshold)) lowQualNum++;
+        if (seqstr[i] == 'N') nBaseNum++;
+    }
+}
+
+static void scalarReverseComplement(const char* src, char* dst, int len) {
+    for (int i = 0; i < len; i++) {
+        char c;
+        switch (src[i]) {
+            case 'A': case 'a': c = 'T'; break;
+            case 'T': case 't': c = 'A'; break;
+            case 'C': case 'c': c = 'G'; break;
+            case 'G': case 'g': c = 'C'; break;
+            default: c = 'N'; break;
+        }
+        dst[len - 1 - i] = c;
+    }
+}
+
+static int scalarCountAdjacentDiffs(const char* data, int len) {
+    int diff = 0;
+    for (int i = 0; i < len - 1; i++) {
+        if (data[i] != data[i + 1]) diff++;
+    }
+    return diff;
+}
+
+static int scalarCountMismatches(const char* a, const char* b, int len) {
+    int diff = 0;
+    for (int i = 0; i < len; i++) {
+        if (a[i] != b[i]) diff++;
+    }
+    return diff;
+}
+
+bool testSimd() {
+    bool pass = true;
+
+    // --- reverseComplement ---
+    // Basic case
+    {
+        const char* in = "AAAATTTTCCCCGGGG";
+        int len = 16;
+        char out[16];
+        reverseComplement(in, out, len);
+        char ref[16];
+        scalarReverseComplement(in, ref, len);
+        if (memcmp(out, ref, len) != 0) {
+            fprintf(stderr, "FAIL: reverseComplement basic\n");
+            pass = false;
+        }
+    }
+    // Mixed case
+    {
+        const char* in = "AaTtCcGgN";
+        int len = 9;
+        char out[9];
+        reverseComplement(in, out, len);
+        char ref[9];
+        scalarReverseComplement(in, ref, len);
+        if (memcmp(out, ref, len) != 0) {
+            fprintf(stderr, "FAIL: reverseComplement mixed case\n");
+            pass = false;
+        }
+    }
+    // Empty
+    {
+        char out[1] = {'X'};
+        reverseComplement("", out, 0);
+        // Should not crash; out unchanged
+    }
+    // Length 1
+    {
+        char out[1];
+        reverseComplement("A", out, 1);
+        if (out[0] != 'T') {
+            fprintf(stderr, "FAIL: reverseComplement len=1\n");
+            pass = false;
+        }
+    }
+    // Long string (> typical SIMD width) to exercise SIMD + tail
+    {
+        const char* in = "ATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCG";
+        int len = 68;
+        char out[68], ref[68];
+        reverseComplement(in, out, len);
+        scalarReverseComplement(in, ref, len);
+        if (memcmp(out, ref, len) != 0) {
+            fprintf(stderr, "FAIL: reverseComplement long string\n");
+            pass = false;
+        }
+    }
+
+    // --- countQualityMetrics ---
+    // Basic
+    {
+        // qual: "IIIII" (phred 40 each), seq: "ACGTN"
+        const char* qual = "IIIII";
+        const char* seq = "ACGTN";
+        int lowQ = 0, nBase = 0, totalQ = 0;
+        countQualityMetrics(qual, seq, 5, '5', lowQ, nBase, totalQ);
+        int refLowQ = 0, refNBase = 0, refTotalQ = 0;
+        scalarCountQualityMetrics(qual, seq, 5, '5', refLowQ, refNBase, refTotalQ);
+        if (lowQ != refLowQ || nBase != refNBase || totalQ != refTotalQ) {
+            fprintf(stderr, "FAIL: countQualityMetrics basic (lowQ=%d/%d nBase=%d/%d totalQ=%d/%d)\n",
+                    lowQ, refLowQ, nBase, refNBase, totalQ, refTotalQ);
+            pass = false;
+        }
+    }
+    // All low quality
+    {
+        const char* qual = "!!!!!!!!!!";  // phred 0
+        const char* seq = "AAAAAAAAAA";
+        int lowQ = 0, nBase = 0, totalQ = 0;
+        countQualityMetrics(qual, seq, 10, '5', lowQ, nBase, totalQ);
+        int refLowQ = 0, refNBase = 0, refTotalQ = 0;
+        scalarCountQualityMetrics(qual, seq, 10, '5', refLowQ, refNBase, refTotalQ);
+        if (lowQ != refLowQ || nBase != refNBase || totalQ != refTotalQ) {
+            fprintf(stderr, "FAIL: countQualityMetrics all-low\n");
+            pass = false;
+        }
+    }
+    // Long string with mixed content
+    {
+        const char qual[] = "IIIIII!!!!!IIIII55555NNNNN!!!!!IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII";
+        const char seq[]  = "ACGTNNACGTNACGTNACGTNACGTNACGTNACGTNACGTNACGTNACGTNACGTNACGTNACGTNACG";
+        int len = 68;
+        int lowQ = 0, nBase = 0, totalQ = 0;
+        countQualityMetrics(qual, seq, len, '5', lowQ, nBase, totalQ);
+        int refLowQ = 0, refNBase = 0, refTotalQ = 0;
+        scalarCountQualityMetrics(qual, seq, len, '5', refLowQ, refNBase, refTotalQ);
+        if (lowQ != refLowQ || nBase != refNBase || totalQ != refTotalQ) {
+            fprintf(stderr, "FAIL: countQualityMetrics long (lowQ=%d/%d nBase=%d/%d totalQ=%d/%d)\n",
+                    lowQ, refLowQ, nBase, refNBase, totalQ, refTotalQ);
+            pass = false;
+        }
+    }
+
+    // --- countAdjacentDiffs ---
+    // All same
+    {
+        int d = countAdjacentDiffs("AAAAAAAAAA", 10);
+        if (d != 0) {
+            fprintf(stderr, "FAIL: countAdjacentDiffs all-same got %d\n", d);
+            pass = false;
+        }
+    }
+    // All different
+    {
+        const char* s = "ACACACACAC";
+        int d = countAdjacentDiffs(s, 10);
+        int ref = scalarCountAdjacentDiffs(s, 10);
+        if (d != ref) {
+            fprintf(stderr, "FAIL: countAdjacentDiffs all-diff got %d expected %d\n", d, ref);
+            pass = false;
+        }
+    }
+    // Length 0 and 1
+    {
+        if (countAdjacentDiffs("A", 1) != 0) {
+            fprintf(stderr, "FAIL: countAdjacentDiffs len=1\n");
+            pass = false;
+        }
+        if (countAdjacentDiffs("", 0) != 0) {
+            fprintf(stderr, "FAIL: countAdjacentDiffs len=0\n");
+            pass = false;
+        }
+    }
+    // Long string
+    {
+        const char* s = "ATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCG";
+        int len = 68;
+        int d = countAdjacentDiffs(s, len);
+        int ref = scalarCountAdjacentDiffs(s, len);
+        if (d != ref) {
+            fprintf(stderr, "FAIL: countAdjacentDiffs long got %d expected %d\n", d, ref);
+            pass = false;
+        }
+    }
+
+    // --- countMismatches ---
+    // Identical
+    {
+        const char* s = "ACGTACGTACGT";
+        int d = countMismatches(s, s, 12);
+        if (d != 0) {
+            fprintf(stderr, "FAIL: countMismatches identical got %d\n", d);
+            pass = false;
+        }
+    }
+    // All different
+    {
+        int d = countMismatches("AAAA", "TTTT", 4);
+        if (d != 4) {
+            fprintf(stderr, "FAIL: countMismatches all-diff got %d\n", d);
+            pass = false;
+        }
+    }
+    // Long string
+    {
+        const char* a = "ATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCG";
+        const char* b = "ATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCG";
+        int d = countMismatches(a, b, 68);
+        if (d != 0) {
+            fprintf(stderr, "FAIL: countMismatches long-identical got %d\n", d);
+            pass = false;
+        }
+    }
+    {
+        const char* a = "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA";
+        const char* b = "TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT";
+        int d = countMismatches(a, b, 66);
+        if (d != 66) {
+            fprintf(stderr, "FAIL: countMismatches long-alldiff got %d\n", d);
+            pass = false;
+        }
+    }
+    // Length 0
+    {
+        int d = countMismatches("A", "T", 0);
+        if (d != 0) {
+            fprintf(stderr, "FAIL: countMismatches len=0 got %d\n", d);
+            pass = false;
+        }
+    }
+
+    return pass;
+}
+
+}  // namespace fastplong_simd
+
+#endif  // HWY_ONCE
diff --git a/src/simd.h b/src/simd.h
new file mode 100644
index 0000000..863fcf1
--- /dev/null
+++ b/src/simd.h
@@ -0,0 +1,34 @@
+#ifndef FASTPLONG_SIMD_H
+#define FASTPLONG_SIMD_H
+
+#include <cstddef>
+
+namespace fastplong_simd {
+
+// Count quality metrics for a read in one pass.
+// qualstr/seqstr: quality and sequence strings of length len.
+// qualThreshold: phred+33 encoded quality threshold for "low quality".
+// Outputs: lowQualNum, nBaseNum, totalQual (sum of qual-33 values).
+void countQualityMetrics(const char* qualstr, const char* seqstr, int len,
+                         char qualThreshold, int& lowQualNum, int& nBaseNum,
+                         int& totalQual);
+
+// Reverse complement a DNA sequence.
+// src: input sequence of length len.
+// dst: output buffer of at least len bytes (must NOT alias src).
+void reverseComplement(const char* src, char* dst, int len);
+
+// Count adjacent-base differences for low complexity filter.
+// Returns the number of positions where data[i] != data[i+1].
+int countAdjacentDiffs(const char* data, int len);
+
+// Count mismatches between two byte strings.
+// Returns the number of positions where a[i] != b[i], up to len bytes.
+int countMismatches(const char* a, const char* b, int len);
+
+// Run all SIMD unit tests. Returns true if all pass.
+bool testSimd();
+
+}  // namespace fastplong_simd
+
+#endif  // FASTPLONG_SIMD_H
diff --git a/src/simdutil.h b/src/simdutil.h
deleted file mode 100644
index 171fbdf..0000000
--- a/src/simdutil.h
+++ /dev/null
@@ -1,38 +0,0 @@
-#pragma once
-#include <string>
-#include "hwy/highway.h"
-
-HWY_BEFORE_NAMESPACE();
-namespace hwy {
-namespace HWY_NAMESPACE {
-
-template <class D, class Func, typename T = hwy::HWY_NAMESPACE::TFromD<D>>
-void Transform1Reversed(D d, T* HWY_RESTRICT inout, size_t count,
-                const T* HWY_RESTRICT in1, const Func& func) {
-  const size_t N = hwy::HWY_NAMESPACE::Lanes(d);
-
-  size_t idx = 0;
-  if (count >= N) {
-    for (; idx <= count - N; idx += N) {
-      const Vec<D> v = LoadU(d, inout + idx);
-      const Vec<D> v1 = LoadU(d, in1 + idx);
-      StoreU(Reverse(d, func(d, v, v1)), d, inout + count - N - idx);
-    }
-  }
-
-  // `count` was a multiple of the vector length `N`: already done.
-  if (HWY_UNLIKELY(idx == count)) return;
-
-  const size_t remaining = count - idx;
-  HWY_DASSERT(0 != remaining && remaining < N);
-  const Vec<D> v = LoadN(d, inout + idx, remaining);
-  const Vec<D> v1 = LoadN(d, in1 + idx, remaining);
-  StoreN(
-      SlideDownLanes(
-          d, Reverse(d, func(d, v, v1)), N - remaining),
-      d, inout, remaining);
-}
-
-}
-}
-HWY_AFTER_NAMESPACE();
diff --git a/test/simd_test.cpp b/test/simd_test.cpp
new file mode 100644
index 0000000..04625a9
--- /dev/null
+++ b/test/simd_test.cpp
@@ -0,0 +1,6 @@
+#include <gtest/gtest.h>
+#include "../src/simd.h"
+
+TEST(SimdTest, allTests) {
+    EXPECT_TRUE(fastplong_simd::testSimd());
+}
diff --git a/testdata/gen_ont_reads.py b/testdata/gen_ont_reads.py
new file mode 100644
index 0000000..5796cc5
--- /dev/null
+++ b/testdata/gen_ont_reads.py
@@ -0,0 +1,153 @@
+#!/usr/bin/env python3
+"""Generate dummy ONT (Oxford Nanopore) FASTQ reads for benchmarking.
+
+Produces 10,000 reads with realistic ONT characteristics:
+- Read lengths: log-normal distribution, median ~5kb, range ~500bp-50kb
+- Quality scores: mean ~Q10-Q15 (typical ONT), with some low-quality regions
+- Random ATCG sequence with occasional N bases
+- Adapter sequences at start/end of some reads
+"""
+
+import random
+import gzip
+import math
+import os
+
+SEED = 42
+NUM_READS = 10_000
+OUTPUT_DIR = os.path.dirname(os.path.abspath(__file__))
+
+# ONT adapter sequences (real ones from Oxford Nanopore)
+START_ADAPTER = "AATGTACTTCGTTCAGTTACGTATTGCT"
+END_ADAPTER = "GCAATACGTAACTGAACGAAGT"
+
+BASES = "ACGT"
+
+
+def random_quality(length: int) -> str:
+    """Generate ONT-like quality string (phred+33).
+
+    ONT typical range: Q5-Q20, with occasional dips.
+    """
+    quals = []
+    # Start with a baseline quality that varies per read
+    base_qual = random.gauss(12, 3)  # mean Q12
+
+    for i in range(length):
+        # Occasional low-quality regions (simulating systematic errors)
+        if random.random() < 0.02:
+            q = max(0, min(40, int(random.gauss(5, 2))))
+        else:
+            q = max(0, min(40, int(random.gauss(base_qual, 4))))
+        quals.append(chr(q + 33))
+    return "".join(quals)
+
+
+def random_sequence(length: int) -> str:
+    """Generate random DNA sequence with occasional N bases."""
+    seq = []
+    for _ in range(length):
+        if random.random() < 0.005:  # 0.5% N rate
+            seq.append("N")
+        else:
+            seq.append(random.choice(BASES))
+    return "".join(seq)
+
+
+def generate_read_length() -> int:
+    """Log-normal distribution mimicking ONT read lengths."""
+    # median ~5000, with long tail up to ~50000
+    length = int(random.lognormvariate(math.log(5000), 0.7))
+    return max(200, min(100000, length))
+
+
+def write_fastq(filepath: str, reads: list, compress: bool = False):
+    """Write reads to FASTQ file, optionally gzipped."""
+    opener = gzip.open if compress else open
+    mode = "wt" if compress else "w"
+
+    with opener(filepath, mode) as f:
+        for read_id, seq, qual in reads:
+            f.write(f"@{read_id}\n")
+            f.write(f"{seq}\n")
+            f.write("+\n")
+            f.write(f"{qual}\n")
+
+
+def main():
+    random.seed(SEED)
+
+    reads = []
+    total_bases = 0
+
+    for i in range(NUM_READS):
+        length = generate_read_length()
+
+        # Some reads have adapters (30% start, 30% end, 10% both)
+        seq = ""
+        r = random.random()
+        if r < 0.10:
+            # Both adapters
+            core_len = max(100, length - len(START_ADAPTER) - len(END_ADAPTER))
+            seq = START_ADAPTER + random_sequence(core_len) + END_ADAPTER
+        elif r < 0.40:
+            # Start adapter only
+            core_len = max(100, length - len(START_ADAPTER))
+            seq = START_ADAPTER + random_sequence(core_len)
+        elif r < 0.70:
+            # End adapter only
+            core_len = max(100, length - len(END_ADAPTER))
+            seq = random_sequence(core_len) + END_ADAPTER
+        else:
+            seq = random_sequence(length)
+
+        actual_len = len(seq)
+        qual = random_quality(actual_len)
+
+        read_id = (
+            f"ont_read_{i:06d} "
+            f"runid=dummy_run_001 "
+            f"read={i} "
+            f"ch={random.randint(1, 512)} "
+            f"start_time=2024-01-01T00:00:{i % 60:02d}Z "
+            f"flow_cell_id=FAK00001 "
+            f"protocol_group_id=benchmark "
+            f"sample_id=dummy_ont"
+        )
+
+        reads.append((read_id, seq, qual))
+        total_bases += actual_len
+
+    # Write uncompressed
+    fq_path = os.path.join(OUTPUT_DIR, "ont_10k.fastq")
+    write_fastq(fq_path, reads, compress=False)
+
+    # Write gzipped
+    gz_path = os.path.join(OUTPUT_DIR, "ont_10k.fastq.gz")
+    write_fastq(gz_path, reads, compress=True)
+
+    # Stats
+    lengths = [len(seq) for _, seq, _ in reads]
+    lengths.sort()
+    print(f"Generated {NUM_READS} ONT reads")
+    print(f"Total bases: {total_bases:,}")
+    print(f"Mean length: {total_bases // NUM_READS:,}")
+    print(f"Median length: {lengths[len(lengths)//2]:,}")
+    print(f"Min length: {lengths[0]:,}")
+    print(f"Max length: {lengths[-1]:,}")
+    print(f"N50: {calculate_n50(lengths, total_bases):,}")
+    print(f"Written: {fq_path} ({os.path.getsize(fq_path) / 1024 / 1024:.1f} MB)")
+    print(f"Written: {gz_path} ({os.path.getsize(gz_path) / 1024 / 1024:.1f} MB)")
+
+
+def calculate_n50(sorted_lengths: list, total_bases: int) -> int:
+    cumsum = 0
+    for l in reversed(sorted_lengths):
+        cumsum += l
+        if cumsum >= total_bases / 2:
+            return l
+    return sorted_lengths[-1]
+
+
+if __name__ == "__main__":
+    main()
diff --git a/third_party/googletest b/third_party/googletest
new file mode 160000
index 0000000..52eb810
--- /dev/null
+++ b/third_party/googletest
@@ -0,0 +1 @@
+Subproject commit 52eb8108c5bdec04579160ae17225d66034bd723
diff --git a/third_party/highway b/third_party/highway
new file mode 160000
index 0000000..ac0d5d2
--- /dev/null
+++ b/third_party/highway
@@ -0,0 +1 @@
+Subproject commit ac0d5d297b13ab1b89f48484fc7911082d76a93f
diff --git a/third_party/isa-l b/third_party/isa-l
new file mode 160000
index 0000000..c3d0760
--- /dev/null
+++ b/third_party/isa-l
@@ -0,0 +1 @@
+Subproject commit c3d076027f0a7377b78472876a930af98c4eded6
diff --git a/third_party/libdeflate b/third_party/libdeflate
new file mode 160000
index 0000000..22f6e02
--- /dev/null
+++ b/third_party/libdeflate
@@ -0,0 +1 @@
+Subproject commit 22f6e0235ff5651c7300dc120a7ee0d9f0ba3426