Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,11 @@

## Unreleased

Bug fixes:

- Fix region edge cases and improve test coverage (#262). Region queries or views were in
some cases omitting variants that should have been returned.

Breaking:

- Require NumPy 2 (#249)
Expand Down
7 changes: 5 additions & 2 deletions tests/test_vcf_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,7 @@ def test_write_vcf__filtering(tmp_path, include, exclude, expected_chrom_pos):

# targets only
(None, "19", [("19", 111), ("19", 112)]),
(None, "19:111", [("19", 111)]),
(None, "19:112", [("19", 112)]),
(None, "20:1230236-", [("20", 1230237), ("20", 1234567), ("20", 1235237)]),
(None, "20:1230237-", [("20", 1230237), ("20", 1234567), ("20", 1235237)]),
Expand All @@ -168,10 +169,12 @@ def test_write_vcf__filtering(tmp_path, include, exclude, expected_chrom_pos):
]
)
# fmt: on
def test_write_vcf__regions(tmp_path, regions, targets, expected_chrom_pos):
@pytest.mark.parametrize("variants_chunk_size", [None, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
def test_write_vcf__regions(tmp_path, regions, targets, expected_chrom_pos,
variants_chunk_size):

original = pathlib.Path("tests/data/vcf") / "sample.vcf.gz"
vcz = vcz_path_cache(original)
vcz = vcz_path_cache(original, variants_chunk_size=variants_chunk_size)
output = tmp_path.joinpath("output.vcf")

write_vcf(vcz, output, regions=regions, targets=targets)
Expand Down
42 changes: 28 additions & 14 deletions tests/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -213,28 +213,42 @@ def assert_vcfs_close(f1, f2, *, rtol=1e-05, atol=1e-03, allow_zero_variants=Fal
assert count > 0, "No variants in file"


def vcz_path_cache(vcf_path):
def vcz_path_cache(vcf_path, variants_chunk_size=None, samples_chunk_size=None):
"""
Store converted files in a cache to speed up tests. We're not testing
vcf2zarr here, so no point in running over and over again.
"""
cache_path = pathlib.Path("vcz_test_cache")
if not cache_path.exists():
cache_path.mkdir()
cached_vcz_path = (cache_path / vcf_path.name).with_suffix(".vcz")

# special case chunk size for chr22 vcf
if variants_chunk_size is None and vcf_path.name.startswith("chr22"):
variants_chunk_size = 10
if samples_chunk_size is None and vcf_path.name.startswith("chr22"):
samples_chunk_size = 10

# remove suffixes
cached_vcz_path = vcf_path
while cached_vcz_path.suffix:
cached_vcz_path = cached_vcz_path.with_suffix("")
cached_vcz_filename = cached_vcz_path.name

# incorporate chunk sizes in filename
if variants_chunk_size is not None:
cached_vcz_filename = f"{cached_vcz_filename}_vcs={variants_chunk_size}"
if samples_chunk_size is not None:
cached_vcz_filename = f"{cached_vcz_filename}_scs={samples_chunk_size}"

cached_vcz_path = (cache_path / cached_vcz_filename).with_suffix(".vcz")
if not cached_vcz_path.exists():
if vcf_path.name.startswith("chr22"):
vcf.convert(
[vcf_path],
cached_vcz_path,
worker_processes=0,
variants_chunk_size=10,
samples_chunk_size=10,
)
else:
vcf.convert(
[vcf_path], cached_vcz_path, worker_processes=0, local_alleles=False
)
vcf.convert(
[vcf_path],
cached_vcz_path,
worker_processes=0,
variants_chunk_size=variants_chunk_size,
samples_chunk_size=samples_chunk_size,
)
return cached_vcz_path


Expand Down
4 changes: 3 additions & 1 deletion vcztools/regions.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,11 +102,13 @@ def regions_to_chunk_indexes(
start_position = regions_index[:, 2]
end_position = regions_index[:, 3]
max_end_position = regions_index[:, 4]
# subtract 1 from start coordinate to convert intervals
# from VCF (1-based, fully-closed) to Python (0-based, half-open)
df = pd.DataFrame(
{
"chunk_index": chunk_index,
"Chromosome": contig_id,
"Start": start_position,
"Start": start_position - 1,
"End": max_end_position if regions is not None else end_position,
}
)
Expand Down
47 changes: 18 additions & 29 deletions vcztools/retrieval.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import collections.abc

import numpy as np
import zarr

from vcztools import filter as filter_mod
from vcztools.regions import (
Expand Down Expand Up @@ -101,39 +100,29 @@ def variant_chunk_index_iter(root, regions=None, targets=None):
region_index,
)

# Then use only load required variant_contig/position chunks
if len(chunk_indexes) == 0:
# no chunks - no variants to write
return
elif len(chunk_indexes) == 1:
# single chunk
block_sel = chunk_indexes[0]
else:
# zarr.blocks doesn't support int array indexing - use that when it does
block_sel = slice(chunk_indexes[0], chunk_indexes[-1] + 1)

region_variant_contig = root["variant_contig"].blocks[block_sel][:]
region_variant_position = root["variant_position"].blocks[block_sel][:]
region_variant_length = root["variant_length"].blocks[block_sel][:]
# Then only load required variant_contig/position chunks
for chunk_index in chunk_indexes:
region_variant_contig = root["variant_contig"].blocks[chunk_index][:]
region_variant_position = root["variant_position"].blocks[chunk_index][:]
region_variant_length = root["variant_length"].blocks[chunk_index][:]

# Find the variant selection for the chunk
variant_selection = regions_to_selection(
regions_pyranges,
targets_pyranges,
complement,
region_variant_contig,
region_variant_position,
region_variant_length,
)
variant_mask = np.zeros(region_variant_position.shape[0], dtype=bool)
variant_mask[variant_selection] = 1

# Find the final variant selection
variant_selection = regions_to_selection(
regions_pyranges,
targets_pyranges,
complement,
region_variant_contig,
region_variant_position,
region_variant_length,
)
variant_mask = np.zeros(region_variant_position.shape[0], dtype=bool)
variant_mask[variant_selection] = 1
# Use zarr arrays to get mask chunks aligned with the main data
# for convenience.
z_variant_mask = zarr.array(variant_mask, chunks=pos.chunks[0])

for i, v_chunk in enumerate(chunk_indexes):
v_mask_chunk = z_variant_mask.blocks[i]
yield v_chunk, v_mask_chunk
yield chunk_index, variant_mask


def variant_chunk_index_iter_with_filtering(
Expand Down
Loading