diff --git a/CHANGELOG.md b/CHANGELOG.md index 7f4324e..b79c678 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,11 @@ ## Unreleased +Bug fixes: + +- Fix region edge cases and improve test coverage (#262). Region queries or views were in +some cases omitting variants that should have been returned. + Breaking: - Require NumPy 2 (#249) diff --git a/tests/test_vcf_writer.py b/tests/test_vcf_writer.py index e445649..29e5a28 100644 --- a/tests/test_vcf_writer.py +++ b/tests/test_vcf_writer.py @@ -152,6 +152,7 @@ def test_write_vcf__filtering(tmp_path, include, exclude, expected_chrom_pos): # targets only (None, "19", [("19", 111), ("19", 112)]), + (None, "19:111", [("19", 111)]), (None, "19:112", [("19", 112)]), (None, "20:1230236-", [("20", 1230237), ("20", 1234567), ("20", 1235237)]), (None, "20:1230237-", [("20", 1230237), ("20", 1234567), ("20", 1235237)]), @@ -168,10 +169,12 @@ def test_write_vcf__filtering(tmp_path, include, exclude, expected_chrom_pos): ] ) # fmt: on -def test_write_vcf__regions(tmp_path, regions, targets, expected_chrom_pos): +@pytest.mark.parametrize("variants_chunk_size", [None, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]) +def test_write_vcf__regions(tmp_path, regions, targets, expected_chrom_pos, + variants_chunk_size): original = pathlib.Path("tests/data/vcf") / "sample.vcf.gz" - vcz = vcz_path_cache(original) + vcz = vcz_path_cache(original, variants_chunk_size=variants_chunk_size) output = tmp_path.joinpath("output.vcf") write_vcf(vcz, output, regions=regions, targets=targets) diff --git a/tests/utils.py b/tests/utils.py index efcf33b..e6a4994 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -213,7 +213,7 @@ def assert_vcfs_close(f1, f2, *, rtol=1e-05, atol=1e-03, allow_zero_variants=Fal assert count > 0, "No variants in file" -def vcz_path_cache(vcf_path): +def vcz_path_cache(vcf_path, variants_chunk_size=None, samples_chunk_size=None): """ Store converted files in a cache to speed up tests. We're not testing vcf2zarr here, so no point in running over and over again. @@ -221,20 +221,34 @@ def vcz_path_cache(vcf_path): cache_path = pathlib.Path("vcz_test_cache") if not cache_path.exists(): cache_path.mkdir() - cached_vcz_path = (cache_path / vcf_path.name).with_suffix(".vcz") + + # special case chunk size for chr22 vcf + if variants_chunk_size is None and vcf_path.name.startswith("chr22"): + variants_chunk_size = 10 + if samples_chunk_size is None and vcf_path.name.startswith("chr22"): + samples_chunk_size = 10 + + # remove suffixes + cached_vcz_path = vcf_path + while cached_vcz_path.suffix: + cached_vcz_path = cached_vcz_path.with_suffix("") + cached_vcz_filename = cached_vcz_path.name + + # incorporate chunk sizes in filename + if variants_chunk_size is not None: + cached_vcz_filename = f"{cached_vcz_filename}_vcs={variants_chunk_size}" + if samples_chunk_size is not None: + cached_vcz_filename = f"{cached_vcz_filename}_scs={samples_chunk_size}" + + cached_vcz_path = (cache_path / cached_vcz_filename).with_suffix(".vcz") if not cached_vcz_path.exists(): - if vcf_path.name.startswith("chr22"): - vcf.convert( - [vcf_path], - cached_vcz_path, - worker_processes=0, - variants_chunk_size=10, - samples_chunk_size=10, - ) - else: - vcf.convert( - [vcf_path], cached_vcz_path, worker_processes=0, local_alleles=False - ) + vcf.convert( + [vcf_path], + cached_vcz_path, + worker_processes=0, + variants_chunk_size=variants_chunk_size, + samples_chunk_size=samples_chunk_size, + ) return cached_vcz_path diff --git a/vcztools/regions.py b/vcztools/regions.py index 6e542d3..0d7515e 100644 --- a/vcztools/regions.py +++ b/vcztools/regions.py @@ -102,11 +102,13 @@ def regions_to_chunk_indexes( start_position = regions_index[:, 2] end_position = regions_index[:, 3] max_end_position = regions_index[:, 4] + # subtract 1 from start coordinate to convert intervals + # from VCF (1-based, fully-closed) to Python (0-based, half-open) df = pd.DataFrame( { "chunk_index": chunk_index, "Chromosome": contig_id, - "Start": start_position, + "Start": start_position - 1, "End": max_end_position if regions is not None else end_position, } ) diff --git a/vcztools/retrieval.py b/vcztools/retrieval.py index e892e3f..93620be 100644 --- a/vcztools/retrieval.py +++ b/vcztools/retrieval.py @@ -1,7 +1,6 @@ import collections.abc import numpy as np -import zarr from vcztools import filter as filter_mod from vcztools.regions import ( @@ -101,39 +100,29 @@ def variant_chunk_index_iter(root, regions=None, targets=None): region_index, ) - # Then use only load required variant_contig/position chunks if len(chunk_indexes) == 0: # no chunks - no variants to write return - elif len(chunk_indexes) == 1: - # single chunk - block_sel = chunk_indexes[0] - else: - # zarr.blocks doesn't support int array indexing - use that when it does - block_sel = slice(chunk_indexes[0], chunk_indexes[-1] + 1) - region_variant_contig = root["variant_contig"].blocks[block_sel][:] - region_variant_position = root["variant_position"].blocks[block_sel][:] - region_variant_length = root["variant_length"].blocks[block_sel][:] + # Then only load required variant_contig/position chunks + for chunk_index in chunk_indexes: + region_variant_contig = root["variant_contig"].blocks[chunk_index][:] + region_variant_position = root["variant_position"].blocks[chunk_index][:] + region_variant_length = root["variant_length"].blocks[chunk_index][:] + + # Find the variant selection for the chunk + variant_selection = regions_to_selection( + regions_pyranges, + targets_pyranges, + complement, + region_variant_contig, + region_variant_position, + region_variant_length, + ) + variant_mask = np.zeros(region_variant_position.shape[0], dtype=bool) + variant_mask[variant_selection] = 1 - # Find the final variant selection - variant_selection = regions_to_selection( - regions_pyranges, - targets_pyranges, - complement, - region_variant_contig, - region_variant_position, - region_variant_length, - ) - variant_mask = np.zeros(region_variant_position.shape[0], dtype=bool) - variant_mask[variant_selection] = 1 - # Use zarr arrays to get mask chunks aligned with the main data - # for convenience. - z_variant_mask = zarr.array(variant_mask, chunks=pos.chunks[0]) - - for i, v_chunk in enumerate(chunk_indexes): - v_mask_chunk = z_variant_mask.blocks[i] - yield v_chunk, v_mask_chunk + yield chunk_index, variant_mask def variant_chunk_index_iter_with_filtering(