sgkit-dev · tomwhite · Feb 19, 2026 · Feb 19, 2026 · Feb 19, 2026 · Feb 19, 2026
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -2,6 +2,11 @@
 
 ## Unreleased
 
+Bug fixes:
+
+- Fix region edge cases and improve test coverage (#262). Region queries or views were in
+some cases omitting variants that should have been returned.
+
 Breaking:
 
 - Require NumPy 2 (#249)

diff --git a/tests/test_vcf_writer.py b/tests/test_vcf_writer.py
@@ -152,6 +152,7 @@ def test_write_vcf__filtering(tmp_path, include, exclude, expected_chrom_pos):
 
         # targets only
         (None, "19", [("19", 111), ("19", 112)]),
+        (None, "19:111", [("19", 111)]),
         (None, "19:112", [("19", 112)]),
         (None, "20:1230236-", [("20", 1230237), ("20", 1234567), ("20", 1235237)]),
         (None, "20:1230237-", [("20", 1230237), ("20", 1234567), ("20", 1235237)]),
@@ -168,10 +169,12 @@ def test_write_vcf__filtering(tmp_path, include, exclude, expected_chrom_pos):
     ]
 )
 # fmt: on
-def test_write_vcf__regions(tmp_path, regions, targets, expected_chrom_pos):
+@pytest.mark.parametrize("variants_chunk_size", [None, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
+def test_write_vcf__regions(tmp_path, regions, targets, expected_chrom_pos,
+                            variants_chunk_size):
 
     original = pathlib.Path("tests/data/vcf") / "sample.vcf.gz"
-    vcz = vcz_path_cache(original)
+    vcz = vcz_path_cache(original, variants_chunk_size=variants_chunk_size)
     output = tmp_path.joinpath("output.vcf")
 
     write_vcf(vcz, output, regions=regions, targets=targets)

diff --git a/tests/utils.py b/tests/utils.py
@@ -213,28 +213,42 @@ def assert_vcfs_close(f1, f2, *, rtol=1e-05, atol=1e-03, allow_zero_variants=Fal
             assert count > 0, "No variants in file"
 
 
-def vcz_path_cache(vcf_path):
+def vcz_path_cache(vcf_path, variants_chunk_size=None, samples_chunk_size=None):
     """
     Store converted files in a cache to speed up tests. We're not testing
     vcf2zarr here, so no point in running over and over again.
     """
     cache_path = pathlib.Path("vcz_test_cache")
     if not cache_path.exists():
         cache_path.mkdir()
-    cached_vcz_path = (cache_path / vcf_path.name).with_suffix(".vcz")
+
+    # special case chunk size for chr22 vcf
+    if variants_chunk_size is None and vcf_path.name.startswith("chr22"):
+        variants_chunk_size = 10
+    if samples_chunk_size is None and vcf_path.name.startswith("chr22"):
+        samples_chunk_size = 10
+
+    # remove suffixes
+    cached_vcz_path = vcf_path
+    while cached_vcz_path.suffix:
+        cached_vcz_path = cached_vcz_path.with_suffix("")
+    cached_vcz_filename = cached_vcz_path.name
+
+    # incorporate chunk sizes in filename
+    if variants_chunk_size is not None:
+        cached_vcz_filename = f"{cached_vcz_filename}_vcs={variants_chunk_size}"
+    if samples_chunk_size is not None:
+        cached_vcz_filename = f"{cached_vcz_filename}_scs={samples_chunk_size}"
+
+    cached_vcz_path = (cache_path / cached_vcz_filename).with_suffix(".vcz")
     if not cached_vcz_path.exists():
-        if vcf_path.name.startswith("chr22"):
-            vcf.convert(
-                [vcf_path],
-                cached_vcz_path,
-                worker_processes=0,
-                variants_chunk_size=10,
-                samples_chunk_size=10,
-            )
-        else:
-            vcf.convert(
-                [vcf_path], cached_vcz_path, worker_processes=0, local_alleles=False
-            )
+        vcf.convert(
+            [vcf_path],
+            cached_vcz_path,
+            worker_processes=0,
+            variants_chunk_size=variants_chunk_size,
+            samples_chunk_size=samples_chunk_size,
+        )
     return cached_vcz_path
 
 

diff --git a/vcztools/regions.py b/vcztools/regions.py
@@ -102,11 +102,13 @@ def regions_to_chunk_indexes(
     start_position = regions_index[:, 2]
     end_position = regions_index[:, 3]
     max_end_position = regions_index[:, 4]
+    # subtract 1 from start coordinate to convert intervals
+    # from VCF (1-based, fully-closed) to Python (0-based, half-open)
     df = pd.DataFrame(
         {
             "chunk_index": chunk_index,
             "Chromosome": contig_id,
-            "Start": start_position,
+            "Start": start_position - 1,
             "End": max_end_position if regions is not None else end_position,
         }
     )

diff --git a/vcztools/retrieval.py b/vcztools/retrieval.py
@@ -1,7 +1,6 @@
 import collections.abc
 
 import numpy as np
-import zarr
 
 from vcztools import filter as filter_mod
 from vcztools.regions import (
@@ -101,39 +100,29 @@ def variant_chunk_index_iter(root, regions=None, targets=None):
             region_index,
         )
 
-        # Then use only load required variant_contig/position chunks
         if len(chunk_indexes) == 0:
             # no chunks - no variants to write
             return
-        elif len(chunk_indexes) == 1:
-            # single chunk
-            block_sel = chunk_indexes[0]
-        else:
-            # zarr.blocks doesn't support int array indexing - use that when it does
-            block_sel = slice(chunk_indexes[0], chunk_indexes[-1] + 1)
 
-        region_variant_contig = root["variant_contig"].blocks[block_sel][:]
-        region_variant_position = root["variant_position"].blocks[block_sel][:]
-        region_variant_length = root["variant_length"].blocks[block_sel][:]
+        # Then only load required variant_contig/position chunks
+        for chunk_index in chunk_indexes:
+            region_variant_contig = root["variant_contig"].blocks[chunk_index][:]
+            region_variant_position = root["variant_position"].blocks[chunk_index][:]
+            region_variant_length = root["variant_length"].blocks[chunk_index][:]
+
+            # Find the variant selection for the chunk
+            variant_selection = regions_to_selection(
+                regions_pyranges,
+                targets_pyranges,
+                complement,
+                region_variant_contig,
+                region_variant_position,
+                region_variant_length,
+            )
+            variant_mask = np.zeros(region_variant_position.shape[0], dtype=bool)
+            variant_mask[variant_selection] = 1
 
-        # Find the final variant selection
-        variant_selection = regions_to_selection(
-            regions_pyranges,
-            targets_pyranges,
-            complement,
-            region_variant_contig,
-            region_variant_position,
-            region_variant_length,
-        )
-        variant_mask = np.zeros(region_variant_position.shape[0], dtype=bool)
-        variant_mask[variant_selection] = 1
-        # Use zarr arrays to get mask chunks aligned with the main data
-        # for convenience.
-        z_variant_mask = zarr.array(variant_mask, chunks=pos.chunks[0])
-
-        for i, v_chunk in enumerate(chunk_indexes):
-            v_mask_chunk = z_variant_mask.blocks[i]
-            yield v_chunk, v_mask_chunk
+            yield chunk_index, variant_mask
 
 
 def variant_chunk_index_iter_with_filtering(