From 0da318b6c32621db3e13eff92f665a8cd3cb997e Mon Sep 17 00:00:00 2001 From: Tom White Date: Mon, 23 Feb 2026 12:53:21 +0000 Subject: [PATCH] Support -S/--samples-file in query --- tests/test_bcftools_validation.py | 4 ++++ tests/test_samples.py | 11 +++++++++- vcztools/cli.py | 35 +++++++++++++++++-------------- vcztools/samples.py | 18 ++++++++++++++++ 4 files changed, 51 insertions(+), 17 deletions(-) diff --git a/tests/test_bcftools_validation.py b/tests/test_bcftools_validation.py index ece075e..e97d41b 100644 --- a/tests/test_bcftools_validation.py +++ b/tests/test_bcftools_validation.py @@ -235,6 +235,10 @@ def test_vcf_output_with_output_option(tmp_path, args, vcf_file): r"query -f '[%CHROM %POS %SAMPLE %GT %DP %GQ\n]' -r '20:1230236-' -i 'FMT/DP>3' -s 'NA00002,NA00003'", # noqa: E501 "sample.vcf.gz", ), + ( + r"query -f '[%CHROM %POS %SAMPLE %GT %DP %GQ\n]' -r '20:1230236-' -i 'FMT/DP>3' -S tests/data/txt/samples.txt", # noqa: E501 + "sample.vcf.gz", + ), ], ) def test_output(tmp_path, args, vcf_name): diff --git a/tests/test_samples.py b/tests/test_samples.py index e90f441..2d69cba 100644 --- a/tests/test_samples.py +++ b/tests/test_samples.py @@ -2,7 +2,7 @@ import numpy.testing as nt import pytest -from vcztools.samples import parse_samples +from vcztools.samples import parse_samples, parse_samples_file @pytest.mark.parametrize( @@ -41,3 +41,12 @@ def test_parse_samples( nt.assert_array_equal(sample_ids, expected_sample_ids) nt.assert_array_equal(samples_selection, expected_samples_selection) + + +def test_parse_samples_file(): + nt.assert_array_equal( + parse_samples_file("tests/data/txt/samples.txt"), "NA00001,NA00003" + ) + nt.assert_array_equal( + parse_samples_file("^tests/data/txt/samples.txt"), "^NA00001,NA00003" + ) diff --git a/vcztools/cli.py b/vcztools/cli.py index 24c9e60..553c858 100644 --- a/vcztools/cli.py +++ b/vcztools/cli.py @@ -5,6 +5,8 @@ import click +from vcztools.samples import parse_samples_file + from . 
import plink, provenance, vcf_writer from . import query as query_module from . import stats as stats_module @@ -76,6 +78,13 @@ def wrapper(*args, **kwargs): default=None, help="Samples to include.", ) +samples_file = click.option( + "-S", + "--samples-file", + type=str, + default=None, + help="File of sample names to include.", +) targets = click.option( "-t", "--targets", @@ -154,6 +163,7 @@ def index(path, nrecords, stats, zarr_backend_storage): @regions @force_samples @samples +@samples_file @targets @include @exclude @@ -175,6 +185,7 @@ def query( targets, force_samples, samples, + samples_file, include, exclude, disable_automatic_newline, @@ -200,6 +211,12 @@ def query( if format is None: raise click.UsageError("Missing option -f / --format") + + if samples_file: + if samples is not None: + raise ValueError("vcztools does not support combining -s and -S") + samples = parse_samples_file(samples_file) + with handle_broken_pipe(output): query_module.write_query( path, @@ -245,13 +262,7 @@ def query( help="Do not recalculate INFO fields for the sample subset.", ) @samples -@click.option( - "-S", - "--samples-file", - type=str, - default=None, - help="File of sample names to include.", -) +@samples_file @click.option( "-G", "--drop-genotypes", @@ -300,15 +311,7 @@ def view( if samples_file: if samples is not None: raise ValueError("vcztools does not support combining -s and -S") - - samples = "" - exclude_samples_file = samples_file.startswith("^") - samples_file = samples_file.lstrip("^") - - with open(samples_file) as file: - if exclude_samples_file: - samples = "^" + samples - samples += ",".join(line.strip() for line in file.readlines()) + samples = parse_samples_file(samples_file) with handle_broken_pipe(output): vcf_writer.write_vcf( diff --git a/vcztools/samples.py b/vcztools/samples.py index 0e77d1d..8c84d05 100644 --- a/vcztools/samples.py +++ b/vcztools/samples.py @@ -68,3 +68,21 @@ def parse_samples( samples_selection = np.setdiff1d(samples_selection, 
masked_sample_ids) sample_ids = all_samples[samples_selection] return sample_ids, samples_selection + + +def parse_samples_file(samples_file: str) -> str: + """Parse a file of sample IDs. + + Returns a comma-delimited string of sample IDs, + optionally preceded by a ^ character to indicate complement. + """ + samples = "" + exclude_samples_file = samples_file.startswith("^") + samples_file = samples_file.lstrip("^") + + with open(samples_file) as file: + if exclude_samples_file: + samples = "^" + samples + samples += ",".join(line.strip() for line in file.readlines()) + + return samples