6 changes: 3 additions & 3 deletions .github/workflows/test.yaml
@@ -12,16 +12,16 @@ jobs:
     name: "Run test across all Python versions"
     strategy:
       matrix:
-        environment: [py310, py311, py312]
+        environment: [py310, py311, py312, py313]
     steps:
       - name: Check out
        uses: actions/checkout@v4
        with:
          fetch-depth: 0
      - name: Setup pixi
-       uses: prefix-dev/setup-pixi@v0.9.3
+       uses: prefix-dev/setup-pixi@v0.9.4
        with:
-         pixi-version: v0.62.1
+         pixi-version: v0.63.2
          cache: true
          environments: ${{ matrix.environment }}
      - name: Test
7,527 changes: 3,390 additions & 4,137 deletions pixi.lock

Large diffs are not rendered by default.

84 changes: 47 additions & 37 deletions pixi.toml
@@ -3,16 +3,15 @@ name = "genvarloader"
 channels = ["conda-forge", "bioconda"]
 platforms = ["linux-64"]
 
-[activation.env]
-LD_LIBRARY_PATH = "$CONDA_PREFIX/lib"
 
 [environments]
 default = { features = ["default", "py310"] }
 dev = { features = ["pytorch-cpu", "basenji2", "py310"] }
 docs = { features = ["docs", "pytorch-cpu", "basenji2", "py312"] }
 docs-gpu = { features = ["docs", "pytorch-gpu", "basenji2", "py312"] }
 py310 = { features = ["pytorch-cpu", "py310"] }
 py311 = { features = ["pytorch-cpu", "py311"] }
 py312 = { features = ["pytorch-cpu", "py312"] }
+py313 = { features = ["pytorch-cpu", "py313"] }
 no-torch = { features = ["py310"] }
 demo = { features = ["demo", "py310"] }

@@ -22,47 +21,22 @@ samtools = "*"
 bcftools = "*"
 plink2 = "*"
 ruff = "*"
-pre-commit = "*"
+prek = "*"
 commitizen = "*"
 pyarrow = "<22"
 maturin = ">=1.6,<2"
 typing-extensions = ">=4.14"
 
-[pypi-dependencies]
-genvarloader = { path = ".", editable = true }
-hirola = "==0.3"
-seqpro = "==0.9.0"
-genoray = "==1.0.1"
-numba = ">=0.58.1"
-polars = "==1.30.0"
-loguru = "*"
-attrs = "*"
-natsort = "*"
-cyvcf2 = "*"
-pgenlib = "*"
-pandera = "*"
-pysam = "*"
-pyranges = "*"
-more-itertools = "*"
-tqdm = "*"
-pybigwig = "*"
-einops = "*"
-tbb = "*"
-joblib = "*"
-pooch = "*"
-awkward = "*"
-typer = "*"
-pytest-cases = "*"
-pytest-cov = "*"
-pytest-benchmark = "*"
-pytest = "*"
-memray = "*"
-py-spy = "*"
-icecream = "*"
-pydantic = ">=2,<3"
+pytest-cases = "*"
+pytest-cov = "*"
+pytest-benchmark = "*"
 hypothesis = "*"
 filelock = "*"
-patchelf = "*"
+typer = "*"
+
+[pypi-dependencies]
+genvarloader = { path = ".", editable = true }
+
 [feature.docs.dependencies]
 sphinx = ">=7.4.7"
@@ -88,15 +62,51 @@ basenji2-pytorch = ">=0.1.2"
 
 [feature.py310.dependencies]
 python = "3.10.*"
-numpy = "<2"
 
+[feature.py310.pypi-dependencies]
+numpy = "==1.26.*"
+numba = "==0.59.1"
+pyarrow = ">=21"
+hirola = "==0.3"
+seqpro = "==0.9.0"
+genoray = "==2.1.1"
+polars = "==1.37.1"
+loguru = "*"
+attrs = "*"
+natsort = "*"
+cyvcf2 = "*"
+pgenlib = "*"
+pandera = "*"
+pysam = "*"
+pyranges = "*"
+more-itertools = "*"
+tqdm = "*"
+pybigwig = "*"
+einops = "*"
+tbb = "*"
+joblib = "*"
+pooch = "*"
+awkward = "*"
+pydantic = ">=2,<3"
+hypothesis = "*"
+filelock = "*"
+
 [feature.py311.dependencies]
 python = "3.11.*"
 
 [feature.py312.dependencies]
 python = "3.12.*"
 
+[feature.py313.dependencies]
+python = "3.13.*"
+numpy = "<2.3"
+
+[feature.py313.pypi-dependencies]
+numba = "==0.61.2"
+
 [tasks]
-pre-commit = "pre-commit install --hook-type commit-msg --hook-type pre-push"
+prek-install = "prek install --hook-type commit-msg --hook-type pre-push"
 gen = { cmd = "python tests/data/generate_ground_truth.py" }
 test = { cmd = "pytest tests && cargo test --release", depends-on = ["gen"] }

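The per-interpreter pins above track numba's NumPy support window: numba 0.59.x only supports NumPy 1.x (hence `numpy = "==1.26.*"` on py310), while numba 0.61.2 supports NumPy up to 2.2 (hence `numpy = "<2.3"` on py313). A throwaway sanity check of a resolved environment, assuming these pins, might look like:

```python
# Throwaway check (not part of this PR) that a resolved environment
# matches the per-interpreter pins: numba 0.59.x requires NumPy < 2,
# numba 0.61.2 supports NumPy up to 2.2.
import sys

import numba
import numpy

print(sys.version_info[:2], numpy.__version__, numba.__version__)
# e.g. (3, 10) 1.26.4 0.59.1  or  (3, 13) 2.2.x 0.61.2
```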
8 changes: 4 additions & 4 deletions pyproject.toml
@@ -8,13 +8,13 @@ authors = [
 ]
 readme = "README.md"
 license = { file = "LICENSE.txt" }
-requires-python = ">=3.10,<3.13" # >= 3.13 blocked by cyvcf2
+requires-python = ">=3.10,<3.14" # >= 3.14 blocked by pyarrow/genoray
 dependencies = [
-    "numba>=0.58.1",
+    "numba>=0.59.1",
     "loguru",
     "attrs",
     "natsort",
-    "polars>=1.30",
+    "polars>=1.37.1",
     "cyvcf2",
     "pandera",
     "pysam",
@@ -31,7 +31,7 @@ dependencies = [
     "awkward",
     "hirola>=0.3,<0.4",
     "seqpro>=0.9",
-    "genoray>=1.0.1,<2",
+    "genoray>=2.1.1,<3",
 ]

[project.urls]
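A quick way to confirm an installed environment satisfies the raised floors (illustrative only; `packaging` is assumed available, as the codebase already imports `packaging.version`):

```python
# Illustrative check of the raised floors; not part of the package.
from importlib.metadata import version

from packaging.version import Version

assert Version(version("numba")) >= Version("0.59.1")
assert Version(version("polars")) >= Version("1.37.1")
assert Version("2.1.1") <= Version(version("genoray")) < Version("3")
```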
11 changes: 6 additions & 5 deletions python/genvarloader/_dataset/_rag_variants.py
@@ -15,7 +15,7 @@
 )
 from genoray._types import DOSAGE_TYPE, POS_TYPE, V_IDX_TYPE
 from numpy.typing import NDArray
-from seqpro.rag import OFFSET_TYPE, Ragged, lengths_to_offsets
+from seqpro.rag import OFFSET_TYPE, Ragged, is_rag_dtype, lengths_to_offsets
 from typing_extensions import Self
 
 from .._ragged import reverse_complement
@@ -32,8 +32,9 @@ class RaggedVariant(ak.Record):
 
 
 class RaggedVariants(ak.Array):
-    """An awkward record array, typically with shape (batch, ploidy, ~variants).
-    Guaranteed to at least have the field "alt" and "start" and one of "ref" or "ilen"."""
+    """An awkward record array with shape :code:`(batch, ploidy, ~variants, [~length])`.
+    Guaranteed to have at least the fields :code:`"alt"` and :code:`"start"` and one of :code:`"ref"` or :code:`"ilen"`.
+    """
 
     def __init__(
         self,
Expand Down Expand Up @@ -179,8 +180,8 @@ def infer_germline_ccfs_(
raise ValueError(f"Cannot infer germline CCFs without {ccf_field}.")

ccfs = self[ccf_field]
if not isinstance(ccfs, Ragged):
raise ValueError(f"{ccf_field} must be a Ragged array.")
if not isinstance(ccfs, Ragged) or not is_rag_dtype(ccfs, DOSAGE_TYPE):
raise ValueError(f"{ccf_field} must be a Ragged array of {DOSAGE_TYPE}.")

_infer_germline_ccfs(
ccfs.data,
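For readers unfamiliar with awkward layouts, here is a toy array in the shape the revised docstring describes; the field names come from the docstring, but the concrete values and dtypes are illustrative assumptions:

```python
# Toy RaggedVariants-like layout: one batch element, diploid, with a
# different number of variants per haplotype (the "~variants" axis).
import awkward as ak

rv = ak.Array(
    [
        [
            # haplotype 0: a single SNP
            [{"alt": "T", "start": 100, "ilen": 0}],
            # haplotype 1: a 1 bp insertion and a SNP
            [{"alt": "AT", "start": 100, "ilen": 1}, {"alt": "G", "start": 250, "ilen": 0}],
        ]
    ]
)
print(rv.type)  # 1 * var * var * {alt: string, start: int64, ilen: int64}
```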
8 changes: 8 additions & 0 deletions python/genvarloader/_dataset/_reconstruct.py
@@ -16,6 +16,7 @@
 from einops import repeat
 from genoray._svar import SparseDosages, SparseGenotypes
 from genoray._types import DOSAGE_TYPE, POS_TYPE, V_IDX_TYPE
+from genoray.exprs import ILEN
 from loguru import logger
 from numpy.typing import NDArray
 from packaging.version import Version
@@ -67,6 +68,13 @@ def from_table(cls, path: str | Path, one_based: bool = True):
         """
         path = Path(path).resolve()
         variants = pl.read_ipc(path, memory_map=False)
+        if variants.schema["ALT"] == pl.List(pl.Utf8):
+            ilen = ILEN
+        else:
+            ilen = pl.col("ALT").str.len_bytes().cast(pl.Int32) - pl.col(
+                "REF"
+            ).str.len_bytes().cast(pl.Int32)
+        variants = variants.with_columns(ILEN=ilen)
         is_list_type = [
             col for col in ("ALT", "ILEN") if variants[col].dtype == pl.List
         ]
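The new fallback derives ILEN when a pre-existing table lacks genoray's list-typed ALT column. Running the same expression on a toy frame (the scalar REF/ALT branch) shows the convention: positive for insertions, negative for deletions, zero for SNPs.

```python
# Toy run of the fallback ILEN expression: len(ALT) - len(REF) in bytes.
import polars as pl

variants = pl.DataFrame({"REF": ["A", "ATT", "C"], "ALT": ["AGG", "A", "G"]})
ilen = pl.col("ALT").str.len_bytes().cast(pl.Int32) - pl.col(
    "REF"
).str.len_bytes().cast(pl.Int32)
print(variants.with_columns(ILEN=ilen)["ILEN"].to_list())  # [2, -2, 0]
```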
4 changes: 2 additions & 2 deletions python/genvarloader/_dataset/_reference.py
@@ -158,7 +158,7 @@ def _fetch_impl(
     for i in nb.prange(len(c_idxs)):
         r_s, r_e = ref_offsets[c_idxs[i]], ref_offsets[c_idxs[i] + 1]
         o_s, o_e = out_offsets[i], out_offsets[i + 1]
-        out[o_s:o_e] = padded_slice(reference[r_s:r_e], starts[i], ends[i], pad_char)
+        padded_slice(reference[r_s:r_e], starts[i], ends[i], pad_char, out[o_s:o_e])
     return out


Expand Down Expand Up @@ -507,7 +507,7 @@ def get_reference(
c_idx, start, end = regions[i, :3]
c_s = ref_offsets[c_idx]
c_e = ref_offsets[c_idx + 1]
out[o_s:o_e] = padded_slice(reference[c_s:c_e], start, end, pad_char)
padded_slice(reference[c_s:c_e], start, end, pad_char, out[o_s:o_e])
return out


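Both call sites now hand `padded_slice` a view into a single preallocated buffer rather than assigning its return value, which keeps the `nb.prange` loop bodies allocation-free. A minimal sketch of the pattern, assuming offsets are exclusive prefix sums of per-region lengths as elsewhere in this file:

```python
# Fill-in-place pattern: each region writes into its slot of one output
# buffer addressed by prefix-sum offsets; no allocation inside the loop.
import numpy as np

lengths = np.array([4, 8, 2])
out_offsets = np.concatenate(([0], np.cumsum(lengths)))  # [0, 4, 12, 14]
out = np.empty(out_offsets[-1], np.uint8)
for i in range(len(lengths)):
    o_s, o_e = out_offsets[i], out_offsets[i + 1]
    out[o_s:o_e] = ord("N")  # stand-in for padded_slice(..., out[o_s:o_e])
```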
5 changes: 1 addition & 4 deletions python/genvarloader/_dataset/_utils.py
@@ -16,11 +16,8 @@ def padded_slice(
     start: int,
     stop: int,
     pad_val: int,
-    out: NDArray[DTYPE] | None = None,
+    out: NDArray[DTYPE],
 ) -> NDArray[DTYPE]:
-    if out is None:
-        out = np.empty(stop - start, arr.dtype)
-
     if start >= stop:
         return out
     elif stop < 0:
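With the `None` default removed, every caller must pass a destination of length `stop - start`. A self-contained sketch of the intended semantics (plain NumPy, not the numba-compiled implementation itself):

```python
# Reference semantics for padded_slice: copy the overlap of [start, stop)
# with arr into out, and fill the out-of-bounds remainder with pad_val.
import numpy as np

def padded_slice_sketch(arr, start, stop, pad_val, out):
    out[:] = pad_val
    s, e = max(start, 0), min(stop, len(arr))
    if s < e:
        out[s - start : e - start] = arr[s:e]
    return out

out = np.empty(8, np.uint8)
padded_slice_sketch(np.frombuffer(b"ACGT", np.uint8), -2, 6, ord("N"), out)
print(out.tobytes())  # b'NNACGTNN'
```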
10 changes: 5 additions & 5 deletions python/genvarloader/_dataset/_write.py
@@ -248,18 +248,18 @@ def _write_from_vcf(path: Path, bed: pl.DataFrame, vcf: VCF, max_mem: int):
     if vcf._index is None:
         if not vcf._valid_index():
             logger.info("VCF genoray index is invalid, writing")
-            vcf._write_gvi_index(progress=True)
+            vcf._write_gvi_index()
 
         vcf._load_index()
 
     assert vcf._index is not None
 
-    if vcf._index.df.select((pl.col("ALT").list.len() > 1).any()).item():
+    if vcf._index.select((pl.col("ALT").list.len() > 1).any()).item():
         raise ValueError(
             "VCF with filtering applied still contains multi-allelic variants. Please filter or split them."
         )
 
-    shutil.copy(vcf._index_path(), out_dir / "variants.arrow")
+    (out_dir / "variants.arrow").hardlink_to(vcf._index_path())
 
     unextended_var_idxs: dict[str, list[NDArray[V_IDX_TYPE]]] = {}
     for (contig,), df in bed.partition_by(
@@ -368,7 +368,7 @@ def _write_from_pgen(path: Path, bed: pl.DataFrame, pgen: PGEN, max_mem: int):
     out_dir = path / "genotypes"
     out_dir.mkdir(parents=True, exist_ok=True)
 
-    shutil.copy(pgen._index_path(), out_dir / "variants.arrow")
+    (out_dir / "variants.arrow").hardlink_to(pgen._index_path())
 
     pbar = tqdm(total=bed.height, unit=" region")
 
@@ -459,7 +459,7 @@ def _write_from_svar(
     with open(out_dir / "svar_meta.json", "w") as f:
         json.dump({"shape": offsets.shape, "dtype": offsets.dtype.str}, f)
 
-    v_ends = svar.var_table.select(
+    v_ends = svar.index.select(
         end=pl.col("POS") - pl.col("ILEN").list.first().clip(upper_bound=0)
     )["end"].to_numpy()
     max_ends = np.empty(bed.height, np.int32)
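Two behavioral changes here: the variant index is now hardlinked into the dataset instead of copied (`Path.hardlink_to` is available on the Python 3.10+ floor, but it does require the dataset and index to live on the same filesystem), and reference end positions are derived from POS and ILEN. A toy run of the end expression:

```python
# End = POS - min(ILEN, 0): a deletion (negative ILEN) spans -ILEN extra
# reference bases past POS; SNPs and insertions end at POS itself.
import polars as pl

idx = pl.DataFrame({"POS": [100, 200, 300], "ILEN": [[0], [5], [-3]]})
ends = idx.select(end=pl.col("POS") - pl.col("ILEN").list.first().clip(upper_bound=0))
print(ends["end"].to_list())  # [100, 200, 303]
```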
3 changes: 2 additions & 1 deletion tests/test_ref_ds.py
@@ -94,5 +94,6 @@ def padded_slice_pad_both():
 def test_padded_slice(
     arr: np.ndarray, start: int, stop: int, pad_val: int, desired: np.ndarray
 ):
-    actual = padded_slice(arr, start, stop, pad_val)
+    actual = np.empty_like(desired)
+    padded_slice(arr, start, stop, pad_val, actual)
     np.testing.assert_equal(actual, desired)