6 changes: 3 additions & 3 deletions .github/workflows/test.yaml
@@ -12,16 +12,16 @@ jobs:
     name: "Run test across all Python versions"
     strategy:
       matrix:
-        environment: [py310, py311, py312]
+        environment: [py310, py311, py312, py313]
     steps:
       - name: Check out
        uses: actions/checkout@v4
        with:
          fetch-depth: 0
      - name: Setup pixi
-       uses: prefix-dev/setup-pixi@v0.9.3
+       uses: prefix-dev/setup-pixi@v0.9.4
        with:
-         pixi-version: v0.62.1
+         pixi-version: v0.63.2
          cache: true
          environments: ${{ matrix.environment }}
      - name: Test
7,527 changes: 3,390 additions & 4,137 deletions pixi.lock

Large diffs are not rendered by default.

84 changes: 47 additions & 37 deletions pixi.toml
@@ -3,16 +3,15 @@ name = "genvarloader"
 channels = ["conda-forge", "bioconda"]
 platforms = ["linux-64"]
 
-[activation.env]
-LD_LIBRARY_PATH = "$CONDA_PREFIX/lib"
 
 [environments]
 default = { features = ["default", "py310"] }
 dev = { features = ["pytorch-cpu", "basenji2", "py310"] }
 docs = { features = ["docs", "pytorch-cpu", "basenji2", "py312"] }
 docs-gpu = { features = ["docs", "pytorch-gpu", "basenji2", "py312"] }
 py310 = { features = ["pytorch-cpu", "py310"] }
 py311 = { features = ["pytorch-cpu", "py311"] }
 py312 = { features = ["pytorch-cpu", "py312"] }
+py313 = { features = ["pytorch-cpu", "py313"] }
 no-torch = { features = ["py310"] }
 demo = { features = ["demo", "py310"] }

@@ -22,47 +21,22 @@ samtools = "*"
 bcftools = "*"
 plink2 = "*"
 ruff = "*"
-pre-commit = "*"
+prek = "*"
 commitizen = "*"
 pyarrow = "<22"
 maturin = ">=1.6,<2"
 typing-extensions = ">=4.14"
 
-[pypi-dependencies]
-genvarloader = { path = ".", editable = true }
-hirola = "==0.3"
-seqpro = "==0.9.0"
-genoray = "==1.0.1"
-numba = ">=0.58.1"
-polars = "==1.30.0"
-loguru = "*"
-attrs = "*"
-natsort = "*"
-cyvcf2 = "*"
-pgenlib = "*"
-pandera = "*"
-pysam = "*"
-pyranges = "*"
-more-itertools = "*"
-tqdm = "*"
-pybigwig = "*"
-einops = "*"
-tbb = "*"
-joblib = "*"
-pooch = "*"
-awkward = "*"
-typer = "*"
-pytest-cases = "*"
-pytest-cov = "*"
-pytest-benchmark = "*"
-pytest = "*"
-memray = "*"
-py-spy = "*"
-icecream = "*"
-pydantic = ">=2,<3"
+pytest-cases = "*"
+pytest-cov = "*"
+pytest-benchmark = "*"
 hypothesis = "*"
 filelock = "*"
-patchelf = "*"
+typer = "*"
+
+[pypi-dependencies]
+genvarloader = { path = ".", editable = true }
+
 [feature.docs.dependencies]
 sphinx = ">=7.4.7"
@@ -88,15 +62,51 @@ basenji2-pytorch = ">=0.1.2"
 
 [feature.py310.dependencies]
 python = "3.10.*"
-numpy = "<2"
 
+[feature.py310.pypi-dependencies]
+numpy = "==1.26.*"
+numba = "==0.59.1"
+pyarrow = ">=21"
+hirola = "==0.3"
+seqpro = "==0.9.0"
+genoray = "==2.1.1"
+polars = "==1.37.1"
+loguru = "*"
+attrs = "*"
+natsort = "*"
+cyvcf2 = "*"
+pgenlib = "*"
+pandera = "*"
+pysam = "*"
+pyranges = "*"
+more-itertools = "*"
+tqdm = "*"
+pybigwig = "*"
+einops = "*"
+tbb = "*"
+joblib = "*"
+pooch = "*"
+awkward = "*"
+pydantic = ">=2,<3"
+hypothesis = "*"
+filelock = "*"
+
 [feature.py311.dependencies]
 python = "3.11.*"
 
 [feature.py312.dependencies]
 python = "3.12.*"
 
+[feature.py313.dependencies]
+python = "3.13.*"
+numpy = "<2.3"
+
+[feature.py313.pypi-dependencies]
+numba = "==0.61.2"
+
 [tasks]
-pre-commit = "pre-commit install --hook-type commit-msg --hook-type pre-push"
+prek-install = "prek install --hook-type commit-msg --hook-type pre-push"
 gen = { cmd = "python tests/data/generate_ground_truth.py" }
 test = { cmd = "pytest tests && cargo test --release", depends-on = ["gen"] }

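The per-interpreter pins above track numba's NumPy support window: numba 0.59.x only supports NumPy 1.x (hence `numpy = "==1.26.*"` on py310), while numba 0.61.2 supports NumPy up to 2.2 (hence `numpy = "<2.3"` on py313). A throwaway sanity check of a resolved environment, assuming these pins, might look like:

```python
# Throwaway check (not part of this PR) that a resolved environment
# matches the per-interpreter pins: numba 0.59.x requires NumPy < 2,
# numba 0.61.2 supports NumPy up to 2.2.
import sys

import numba
import numpy

print(sys.version_info[:2], numpy.__version__, numba.__version__)
# e.g. (3, 10) 1.26.4 0.59.1  or  (3, 13) 2.2.x 0.61.2
```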
8 changes: 4 additions & 4 deletions pyproject.toml
@@ -8,13 +8,13 @@ authors = [
 ]
 readme = "README.md"
 license = { file = "LICENSE.txt" }
-requires-python = ">=3.10,<3.13" # >= 3.13 blocked by cyvcf2
+requires-python = ">=3.10,<3.14" # >= 3.14 blocked by pyarrow/genoray
 dependencies = [
-    "numba>=0.58.1",
+    "numba>=0.59.1",
     "loguru",
     "attrs",
     "natsort",
-    "polars>=1.30",
+    "polars>=1.37.1",
     "cyvcf2",
     "pandera",
     "pysam",
@@ -31,7 +31,7 @@ dependencies = [
     "awkward",
     "hirola>=0.3,<0.4",
     "seqpro>=0.9",
-    "genoray>=1.0.1,<2",
+    "genoray>=2.1.1,<3",
 ]

[project.urls]
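A quick way to confirm an installed environment satisfies the raised floors (illustrative only; `packaging` is assumed available, as the codebase already imports `packaging.version`):

```python
# Illustrative check of the raised floors; not part of the package.
from importlib.metadata import version

from packaging.version import Version

assert Version(version("numba")) >= Version("0.59.1")
assert Version(version("polars")) >= Version("1.37.1")
assert Version("2.1.1") <= Version(version("genoray")) < Version("3")
```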
11 changes: 6 additions & 5 deletions python/genvarloader/_dataset/_rag_variants.py
@@ -15,7 +15,7 @@
 )
 from genoray._types import DOSAGE_TYPE, POS_TYPE, V_IDX_TYPE
 from numpy.typing import NDArray
-from seqpro.rag import OFFSET_TYPE, Ragged, lengths_to_offsets
+from seqpro.rag import OFFSET_TYPE, Ragged, is_rag_dtype, lengths_to_offsets
 from typing_extensions import Self
 
 from .._ragged import reverse_complement
@@ -32,8 +32,9 @@ class RaggedVariant(ak.Record):
 
 
 class RaggedVariants(ak.Array):
-    """An awkward record array, typically with shape (batch, ploidy, ~variants).
-    Guaranteed to at least have the field "alt" and "start" and one of "ref" or "ilen"."""
+    """An awkward record array with shape :code:`(batch, ploidy, ~variants, [~length])`.
+    Guaranteed to have at least the fields :code:`"alt"` and :code:`"start"` and one of :code:`"ref"` or :code:`"ilen"`.
+    """
 
     def __init__(
         self,
Expand Down Expand Up @@ -179,8 +180,8 @@ def infer_germline_ccfs_(
raise ValueError(f"Cannot infer germline CCFs without {ccf_field}.")

ccfs = self[ccf_field]
if not isinstance(ccfs, Ragged):
raise ValueError(f"{ccf_field} must be a Ragged array.")
if not isinstance(ccfs, Ragged) or not is_rag_dtype(ccfs, DOSAGE_TYPE):
raise ValueError(f"{ccf_field} must be a Ragged array of {DOSAGE_TYPE}.")

_infer_germline_ccfs(
ccfs.data,
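For readers unfamiliar with awkward layouts, here is a toy array in the shape the revised docstring describes; the field names come from the docstring, but the concrete values and dtypes are illustrative assumptions:

```python
# Toy RaggedVariants-like layout: one batch element, diploid, with a
# different number of variants per haplotype (the "~variants" axis).
import awkward as ak

rv = ak.Array(
    [
        [
            # haplotype 0: a single SNP
            [{"alt": "T", "start": 100, "ilen": 0}],
            # haplotype 1: a 1 bp insertion and a SNP
            [{"alt": "AT", "start": 100, "ilen": 1}, {"alt": "G", "start": 250, "ilen": 0}],
        ]
    ]
)
print(rv.type)  # 1 * var * var * {alt: string, start: int64, ilen: int64}
```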
8 changes: 8 additions & 0 deletions python/genvarloader/_dataset/_reconstruct.py
@@ -16,6 +16,7 @@
 from einops import repeat
 from genoray._svar import SparseDosages, SparseGenotypes
 from genoray._types import DOSAGE_TYPE, POS_TYPE, V_IDX_TYPE
+from genoray.exprs import ILEN
 from loguru import logger
 from numpy.typing import NDArray
 from packaging.version import Version
@@ -67,6 +68,13 @@ def from_table(cls, path: str | Path, one_based: bool = True):
         """
         path = Path(path).resolve()
         variants = pl.read_ipc(path, memory_map=False)
+        if variants.schema["ALT"] == pl.List(pl.Utf8):
+            ilen = ILEN
+        else:
+            ilen = pl.col("ALT").str.len_bytes().cast(pl.Int32) - pl.col(
+                "REF"
+            ).str.len_bytes().cast(pl.Int32)
+        variants = variants.with_columns(ILEN=ilen)
         is_list_type = [
             col for col in ("ALT", "ILEN") if variants[col].dtype == pl.List
         ]
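The new fallback derives ILEN when a pre-existing table lacks genoray's list-typed ALT column. Running the same expression on a toy frame (the scalar REF/ALT branch) shows the convention: positive for insertions, negative for deletions, zero for SNPs.

```python
# Toy run of the fallback ILEN expression: len(ALT) - len(REF) in bytes.
import polars as pl

variants = pl.DataFrame({"REF": ["A", "ATT", "C"], "ALT": ["AGG", "A", "G"]})
ilen = pl.col("ALT").str.len_bytes().cast(pl.Int32) - pl.col(
    "REF"
).str.len_bytes().cast(pl.Int32)
print(variants.with_columns(ILEN=ilen)["ILEN"].to_list())  # [2, -2, 0]
```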
4 changes: 2 additions & 2 deletions python/genvarloader/_dataset/_reference.py
@@ -158,7 +158,7 @@ def _fetch_impl(
     for i in nb.prange(len(c_idxs)):
         r_s, r_e = ref_offsets[c_idxs[i]], ref_offsets[c_idxs[i] + 1]
         o_s, o_e = out_offsets[i], out_offsets[i + 1]
-        out[o_s:o_e] = padded_slice(reference[r_s:r_e], starts[i], ends[i], pad_char)
+        padded_slice(reference[r_s:r_e], starts[i], ends[i], pad_char, out[o_s:o_e])
     return out


Expand Down Expand Up @@ -507,7 +507,7 @@ def get_reference(
c_idx, start, end = regions[i, :3]
c_s = ref_offsets[c_idx]
c_e = ref_offsets[c_idx + 1]
out[o_s:o_e] = padded_slice(reference[c_s:c_e], start, end, pad_char)
padded_slice(reference[c_s:c_e], start, end, pad_char, out[o_s:o_e])
return out


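Both call sites now hand `padded_slice` a view into a single preallocated buffer rather than assigning its return value, which keeps the `nb.prange` loop bodies allocation-free. A minimal sketch of the pattern, assuming offsets are exclusive prefix sums of per-region lengths as elsewhere in this file:

```python
# Fill-in-place pattern: each region writes into its slot of one output
# buffer addressed by prefix-sum offsets; no allocation inside the loop.
import numpy as np

lengths = np.array([4, 8, 2])
out_offsets = np.concatenate(([0], np.cumsum(lengths)))  # [0, 4, 12, 14]
out = np.empty(out_offsets[-1], np.uint8)
for i in range(len(lengths)):
    o_s, o_e = out_offsets[i], out_offsets[i + 1]
    out[o_s:o_e] = ord("N")  # stand-in for padded_slice(..., out[o_s:o_e])
```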
5 changes: 1 addition & 4 deletions python/genvarloader/_dataset/_utils.py
@@ -16,11 +16,8 @@ def padded_slice(
     start: int,
     stop: int,
     pad_val: int,
-    out: NDArray[DTYPE] | None = None,
+    out: NDArray[DTYPE],
 ) -> NDArray[DTYPE]:
-    if out is None:
-        out = np.empty(stop - start, arr.dtype)
-
     if start >= stop:
         return out
     elif stop < 0:
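With the `None` default removed, every caller must pass a destination of length `stop - start`. A self-contained sketch of the intended semantics (plain NumPy, not the numba-compiled implementation itself):

```python
# Reference semantics for padded_slice: copy the overlap of [start, stop)
# with arr into out, and fill the out-of-bounds remainder with pad_val.
import numpy as np

def padded_slice_sketch(arr, start, stop, pad_val, out):
    out[:] = pad_val
    s, e = max(start, 0), min(stop, len(arr))
    if s < e:
        out[s - start : e - start] = arr[s:e]
    return out

out = np.empty(8, np.uint8)
padded_slice_sketch(np.frombuffer(b"ACGT", np.uint8), -2, 6, ord("N"), out)
print(out.tobytes())  # b'NNACGTNN'
```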
10 changes: 5 additions & 5 deletions python/genvarloader/_dataset/_write.py
@@ -248,18 +248,18 @@ def _write_from_vcf(path: Path, bed: pl.DataFrame, vcf: VCF, max_mem: int):
     if vcf._index is None:
         if not vcf._valid_index():
             logger.info("VCF genoray index is invalid, writing")
-            vcf._write_gvi_index(progress=True)
+            vcf._write_gvi_index()
 
         vcf._load_index()
 
     assert vcf._index is not None
 
-    if vcf._index.df.select((pl.col("ALT").list.len() > 1).any()).item():
+    if vcf._index.select((pl.col("ALT").list.len() > 1).any()).item():
         raise ValueError(
             "VCF with filtering applied still contains multi-allelic variants. Please filter or split them."
         )
 
-    shutil.copy(vcf._index_path(), out_dir / "variants.arrow")
+    (out_dir / "variants.arrow").hardlink_to(vcf._index_path())
 
     unextended_var_idxs: dict[str, list[NDArray[V_IDX_TYPE]]] = {}
     for (contig,), df in bed.partition_by(
@@ -368,7 +368,7 @@ def _write_from_pgen(path: Path, bed: pl.DataFrame, pgen: PGEN, max_mem: int):
     out_dir = path / "genotypes"
     out_dir.mkdir(parents=True, exist_ok=True)
 
-    shutil.copy(pgen._index_path(), out_dir / "variants.arrow")
+    (out_dir / "variants.arrow").hardlink_to(pgen._index_path())
 
     pbar = tqdm(total=bed.height, unit=" region")
 
@@ -459,7 +459,7 @@ def _write_from_svar(
     with open(out_dir / "svar_meta.json", "w") as f:
         json.dump({"shape": offsets.shape, "dtype": offsets.dtype.str}, f)
 
-    v_ends = svar.var_table.select(
+    v_ends = svar.index.select(
         end=pl.col("POS") - pl.col("ILEN").list.first().clip(upper_bound=0)
     )["end"].to_numpy()
     max_ends = np.empty(bed.height, np.int32)
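Two behavioral changes here: the variant index is now hardlinked into the dataset instead of copied (`Path.hardlink_to` is available on the Python 3.10+ floor, but it does require the dataset and index to live on the same filesystem), and reference end positions are derived from POS and ILEN. A toy run of the end expression:

```python
# End = POS - min(ILEN, 0): a deletion (negative ILEN) spans -ILEN extra
# reference bases past POS; SNPs and insertions end at POS itself.
import polars as pl

idx = pl.DataFrame({"POS": [100, 200, 300], "ILEN": [[0], [5], [-3]]})
ends = idx.select(end=pl.col("POS") - pl.col("ILEN").list.first().clip(upper_bound=0))
print(ends["end"].to_list())  # [100, 200, 303]
```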
3 changes: 2 additions & 1 deletion tests/test_ref_ds.py
@@ -94,5 +94,6 @@ def padded_slice_pad_both():
 def test_padded_slice(
     arr: np.ndarray, start: int, stop: int, pad_val: int, desired: np.ndarray
 ):
-    actual = padded_slice(arr, start, stop, pad_val)
+    actual = np.empty_like(desired)
+    padded_slice(arr, start, stop, pad_val, actual)
     np.testing.assert_equal(actual, desired)