acg-team · KondratievaOlesya · Feb 26, 2026 · Feb 25, 2026 · Feb 25, 2026 · Feb 25, 2026
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -18,6 +18,8 @@ jobs:
         python-version: ${{ matrix.python-version }}
     - name: Install dependencies
       run: |
+        sudo apt-get update
+        sudo apt-get install -y libbz2-dev zlib1g-dev liblzma-dev
         python -m pip install --upgrade pip
         python -m pip install tox tox-gh-actions
     - name: Test with tox

diff --git a/.gitignore b/.gitignore
@@ -12,4 +12,5 @@ docs/build
 .tox
 dist
 log*
-coverage.xml
+coverage.xml
+tests/data/GRCh38_repeats.bed.sorted*
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -5,6 +5,29 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [1.0.0] - 2026-02-26
+
+* **Tabix-based STR panel handling**
+
+  * STR reference is no longer loaded fully into memory.
+  * The panel is now prepared automatically:
+
+    * sorted by genomic chromosome order,
+    * BGZF-compressed,
+    * tabix-indexed.
+  * During annotation, STR regions are queried directly from the tabix index,
+    enabling fast genomic lookups and significantly reducing memory usage.
+  * Improves scalability and allows safe multi-worker execution.
+
+* **Parallel directory processing (`jobs`)**
+
+  * Added new `jobs` option to control parallel annotation of VCF files.
+  * Each worker processes one VCF file independently.
+  * If `jobs` is not provided, the tool now estimates an optimal number of workers
+    based on available CPU cores and system memory.
+  * This can substantially speed up processing of large VCF directories.
+
+
 ## [0.3.0] - 2026-01-12
 - **Mismatch handling between VCF and STR panel**
   * Added support for cases where the VCF reference does not exactly match the STR panel.

diff --git a/docs/conf.py b/docs/conf.py
@@ -1,7 +1,7 @@
 project = 'strvcf_annotator'
 copyright = '2026, Olesia Kondrateva'
 author = 'Olesia Kondrateva'
-release = '0.3.0'
+release = '1.0.0'
 
 extensions = [
     "sphinx.ext.autodoc",

diff --git a/pyproject.toml b/pyproject.toml
@@ -31,7 +31,8 @@ classifiers = [
 dependencies = [
     "pysam>=0.22.0",
     "pandas>=2.0.0",
-    "trtools>=5.0.0"
+    "trtools>=5.0.0",
+    "psutil>=5.9.0"
 ]
 
 [project.optional-dependencies]

diff --git a/requirements.txt b/requirements.txt
@@ -1,3 +1,4 @@
 pysam>=0.22.0
 pandas>=2.0.0
-trtools>=5.0.0
+trtools>=5.0.0
+psutil
diff --git a/setup.py b/setup.py
@@ -20,6 +20,8 @@
 requirements = [
     "pysam>=0.22.0",
     "pandas>=2.0.0",
+    "trtools>=5.0.0",
+    "psutil>=5.9.0",
 ]
 
 test_requirements = [
@@ -32,7 +34,7 @@
     author_email="xkdnoa@gmail.com",
     python_requires=">=3.8",
     classifiers=[
-        "Development Status :: 2 - Pre-Alpha",
+        "Development Status :: 5 - Production/Stable",
         "Intended Audience :: Developers",
         "License :: OSI Approved :: MIT License",
         "Natural Language :: English",

diff --git a/src/strvcf_annotator/__init__.py b/src/strvcf_annotator/__init__.py
@@ -7,7 +7,7 @@
 
 __author__ = """Olesia Kondrateva"""
 __email__ = "xkdnoa@gmail.com"
-__version__ = "0.3.0"
+__version__ = "1.0.0"
 
 # Public API exports
 from .api import STRAnnotator, annotate_vcf

diff --git a/src/strvcf_annotator/api.py b/src/strvcf_annotator/api.py
@@ -1,6 +1,8 @@
 """Library API for programmatic access to STR annotation functionality."""
 
 import logging
+import statistics
+from collections import Counter
 from typing import Iterator, Optional
 
 import pysam
@@ -53,8 +55,8 @@ class STRAnnotator:
     ----------
     str_bed_path : str
         Path to STR BED file
-    str_df : pd.DataFrame
-        Loaded STR reference data
+    str_panel_gz : str
+        Path to BGZF-compressed, tabix-indexed STR reference file.
     parser : BaseVCFParser
         Parser for genotype extraction
     somatic_mode : bool
@@ -83,17 +85,14 @@ def __init__(
         mismatch_truth: str = "panel",  # "panel" | "vcf" | "skip"
     ):
         validate_str_bed_file(str_bed_path)
-        self.str_bed_path = str_bed_path
-        self.str_df = load_str_reference(str_bed_path)
+        self.str_panel_gz = load_str_reference(str_bed_path)
 
         self.parser = parser if parser is not None else GenericParser()
         self.somatic_mode = somatic_mode
 
         self.ignore_mismatch_warnings = ignore_mismatch_warnings
         self.mismatch_truth = mismatch_truth
 
-        logger.info(f"Loaded {len(self.str_df)} STR regions from {str_bed_path}")
-
     def annotate_vcf_file(
         self,
         input_path: str,
@@ -135,7 +134,6 @@ def annotate_vcf_file(
             - ``"skip"``: skip variants with mismatches entirely
 
             If ``None``, the value set on the annotator instance is used.
-
         Raises
         ------
         ValidationError
@@ -160,7 +158,7 @@ def annotate_vcf_file(
         logger.info(f"Annotating {input_path}...")
         annotate_vcf_to_file(
             input_path,
-            self.str_df,
+            self.str_panel_gz,
             output_path,
             self.parser,
             somatic_mode=smode,
@@ -231,7 +229,7 @@ def annotate_vcf_stream(
 
         yield from generate_annotated_records(
             vcf_in=vcf_in,
-            str_df=self.str_df,
+            str_panel_gz=self.str_panel_gz,
             parser=self.parser,
             somatic_mode=smode,
             ignore_mismatch_warnings=imw,
@@ -246,6 +244,7 @@ def process_directory(
         somatic_mode: Optional[bool] = None,
         ignore_mismatch_warnings: Optional[bool] = None,
         mismatch_truth: Optional[str] = None,
+        jobs: Optional[int] = None,
     ) -> None:
         """
         Batch process a directory of VCF files.
@@ -281,7 +280,11 @@ def process_directory(
             - ``"skip"``: skip variants with mismatches entirely
 
             If ``None``, the value set on the annotator instance is used.
-
+        jobs: int, optional
+            - If jobs is None: compute jobs automatically:
+                jobs_auto = min(cpu_cores, n_files)
+                jobs_auto = min(jobs_auto, floor(available_ram / ram_per_worker_estimate))
+            - If jobs is provided: use it exactly.
         Raises
         ------
         ValidationError
@@ -307,12 +310,13 @@ def process_directory(
         logger.info(f"Processing VCF files in {input_dir}...")
         process_directory(
             input_dir=input_dir,
-            str_bed_path=self.str_bed_path,
+            str_panel_gz=self.str_panel_gz,
             output_dir=output_dir,
             parser=self.parser,
             somatic_mode=smode,
             ignore_mismatch_warnings=imw,
             mismatch_truth=mtruth,
+            jobs=jobs,
         )
         logger.info(f"Batch processing complete. Output in {output_dir}")
 
@@ -341,7 +345,7 @@ def get_str_at_position(self, chrom: str, pos: int) -> Optional[dict]:
         """
         from .core.str_reference import get_str_at_position
 
-        return get_str_at_position(self.str_df, chrom, pos)
+        return get_str_at_position(self.str_panel_gz, chrom, pos)
 
     def get_statistics(self) -> dict:
         """
@@ -358,15 +362,49 @@ def get_statistics(self) -> dict:
         >>> stats = annotator.get_statistics()
         >>> print(f"Total STR regions: {stats['total_regions']}")
         """
-        stats = {
-            "total_regions": len(self.str_df),
-            "chromosomes": self.str_df["CHROM"].nunique(),
-            "unique_repeat_units": self.str_df["RU"].nunique(),
-            "period_distribution": self.str_df["PERIOD"].value_counts().to_dict(),
-            "mean_repeat_count": self.str_df["COUNT"].mean(),
-            "median_repeat_count": self.str_df["COUNT"].median(),
+        tbx = pysam.TabixFile(self.str_panel_gz)
+
+        total_regions = 0
+        chromosomes = set()
+        repeat_units = set()
+        period_counter = Counter()
+        counts = []
+
+        # Iterate through all records in the file
+        for line in tbx.fetch():
+            parts = line.rstrip("\n").split("\t")
+            if len(parts) < 5:
+                continue
+
+            try:
+                chrom = parts[0]
+                start = int(parts[1])
+                end = int(parts[2])
+                period = int(parts[3])
+                ru = parts[4]
+                count = int((end - start + 1) / period)
+            except ValueError:
+                continue
+
+            total_regions += 1
+            chromosomes.add(chrom)
+            repeat_units.add(ru)
+            period_counter[period] += 1
+            counts.append(count)
+
+        tbx.close()
-        tbx = pysam.TabixFile(self.str_panel_gz)
-
-        total_regions = 0
-        chromosomes = set()
-        repeat_units = set()
-        period_counter = Counter()
-        counts = []
-
-        # Iterate through all records in the file
-        for line in tbx.fetch():
-            parts = line.rstrip("\n").split("\t")
-            if len(parts) < 5:
-                continue
-
-            try:
-                chrom = parts[0]
-                period = int(parts[3])
-                ru = parts[4]
-                count = int(parts[5]) if len(parts) > 5 else None
-            except ValueError:
-                continue
-
-            total_regions += 1
-            chromosomes.add(chrom)
-            repeat_units.add(ru)
-            period_counter[period] += 1
-            if count is not None:
-                counts.append(count)
-
-        tbx.close()
+        total_regions = 0
+        chromosomes = set()
+        repeat_units = set()
+        period_counter = Counter()
+        counts = []
+
+        tbx = pysam.TabixFile(self.str_panel_gz)
+        try:
+            # Iterate through all records in the file
+            for line in tbx.fetch():
+                parts = line.rstrip("\n").split("\t")
+                if len(parts) < 5:
+                    continue
+
+                try:
+                    chrom = parts[0]
+                    start = int(parts[1])
+                    end = int(parts[2])
+                    period = int(parts[3])
+                    ru = parts[4]
+
+                    count: Optional[float]
+                    if len(parts) > 5 and parts[5] != "":
+                        # Use explicit COUNT column if present
+                        count = float(parts[5])
+                    else:
+                        # Derive COUNT from start, end, and period when possible
+                        if period > 0 and end > start:
+                            count = (end - start) / period
+                        else:
+                            count = None
+                except ValueError:
+                    continue
+
+                total_regions += 1
+                chromosomes.add(chrom)
+                repeat_units.add(ru)
+                period_counter[period] += 1
+                if count is not None:
+                    counts.append(count)
+        finally:
+            tbx.close()
-        tbx = pysam.TabixFile(self.str_panel_gz)
-
-        total_regions = 0
-        chromosomes = set()
-        repeat_units = set()
-        period_counter = Counter()
-        counts = []
-
-        # Iterate through all records in the file
-        for line in tbx.fetch():
-            parts = line.rstrip("\n").split("\t")
-            if len(parts) < 5:
-                continue
-
-            try:
-                chrom = parts[0]
-                period = int(parts[3])
-                ru = parts[4]
-                count = int(parts[5]) if len(parts) > 5 else None
-            except ValueError:
-                continue
-
-            total_regions += 1
-            chromosomes.add(chrom)
-            repeat_units.add(ru)
-            period_counter[period] += 1
-            if count is not None:
-                counts.append(count)
-
-        tbx.close()
+        total_regions = 0
+        chromosomes = set()
+        repeat_units = set()
+        period_counter = Counter()
+        counts = []
+
+        tbx = pysam.TabixFile(self.str_panel_gz)
+        try:
+            # Iterate through all records in the file
+            for line in tbx.fetch():
+                parts = line.rstrip("\n").split("\t")
+                if len(parts) < 5:
+                    continue
+
+                try:
+                    chrom = parts[0]
+                    start = int(parts[1])
+                    end = int(parts[2])
+                    period = int(parts[3])
+                    ru = parts[4]
+
+                    count: Optional[float]
+                    if len(parts) > 5 and parts[5] != "":
+                        # Use explicit COUNT column if present
+                        count = float(parts[5])
+                    else:
+                        # Derive COUNT from start, end, and period when possible
+                        if period > 0 and end > start:
+                            count = (end - start) / period
+                        else:
+                            count = None
+                except ValueError:
+                    continue
+
+                total_regions += 1
+                chromosomes.add(chrom)
+                repeat_units.add(ru)
+                period_counter[period] += 1
+                if count is not None:
+                    counts.append(count)
+        finally:
+            tbx.close()
+
+        mean_count = statistics.mean(counts) if counts else None
+        median_count = statistics.median(counts) if counts else None
+
+        return {
+            "total_regions": total_regions,
+            "chromosomes": len(chromosomes),
+            "unique_repeat_units": len(repeat_units),
+            "period_distribution": dict(period_counter),
+            "mean_repeat_count": mean_count,
+            "median_repeat_count": median_count,
         }
-        return stats
 
 
 def annotate_vcf(

diff --git a/src/strvcf_annotator/cli.py b/src/strvcf_annotator/cli.py
@@ -94,6 +94,13 @@ def create_parser() -> argparse.ArgumentParser:
             "and VCF REF allele."
         ),
     )
+    parser.add_argument(
+        "--jobs",
+        type=int,
+        help=(
+            "Number of parallel jobs to use for processing. Each job processes one VCF file. If not specified, the number of jobs is automatically determined based on CPU cores, number of files, and available RAM."
+        ),
+    )
     parser.add_argument(
         "--mismatch-truth",
         choices=["panel", "vcf", "skip"],
@@ -153,6 +160,7 @@ def main():
         somatic_mode = getattr(args, "somatic_mode", False)
         ignore_mismatch_warnings = getattr(args, "ignore_mismatch_warnings", False)
         mismatch_truth = getattr(args, "mismatch_truth", "panel")
+        jobs = getattr(args, "jobs", None)
         annotator = STRAnnotator(
             args.str_bed,
             somatic_mode=somatic_mode,
@@ -176,7 +184,7 @@ def main():
         elif args.input_dir:
             # Batch directory mode
             logger.info(f"Processing directory: {args.input_dir}")
-            annotator.process_directory(args.input_dir, args.output_dir)
+            annotator.process_directory(args.input_dir, args.output_dir, jobs=jobs)
             logger.info(f"Successfully processed all VCF files to {args.output_dir}")
 
         logger.info("Annotation complete!")

diff --git a/src/strvcf_annotator/core/__init__.py b/src/strvcf_annotator/core/__init__.py
@@ -1,13 +1,13 @@
 """Core modules for STR annotation functionality."""
 
+from .annotation import build_new_record, make_modified_header, should_skip_genotype
+from .repeat_utils import apply_variant_to_repeat, count_repeat_units, extract_repeat_sequence
 from .str_reference import load_str_reference
-from .repeat_utils import extract_repeat_sequence, count_repeat_units, apply_variant_to_repeat
-from .annotation import make_modified_header, build_new_record, should_skip_genotype
 from .vcf_processor import (
+    annotate_vcf_to_file,
     check_vcf_sorted,
-    reset_and_sort_vcf,
     generate_annotated_records,
-    annotate_vcf_to_file
+    reset_and_sort_vcf,
 )
 
 __all__ = [