diff --git a/docs/extras/vcf_annotator.md b/docs/extras/vcf_annotator.md
index e119aa40..81d25c1c 100644
--- a/docs/extras/vcf_annotator.md
+++ b/docs/extras/vcf_annotator.md
@@ -1,6 +1,6 @@
 # VCF Annotator
 
-The [VCF Annotator tool](../../src/ga4gh/vrs/extras/vcf_annotation.py) provides utility for annotating VCF's with VRS Allele IDs.
+The [VCF Annotator tool](../../src/ga4gh/vrs/extras/annotator/vcf.py) provides a Python class for annotating VCFs with VRS Allele IDs. A [command-line interface](../../src/ga4gh/vrs/extras/annotator/cli.py) is available for accessing these functions from a shell or shell script.
 
 ## How to use
 
diff --git a/pyproject.toml b/pyproject.toml
index f4a62cf2..cefeb0eb 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -78,7 +78,7 @@ Source = "https://github.com/ga4gh/vrs-python"
 "Bug Tracker" = "https://github.com/ga4gh/vrs-python/issues"
 
 [project.scripts]
-vrs-annotate = "ga4gh.vrs.extras.vcf_annotation:_cli"
+vrs-annotate = "ga4gh.vrs.extras.annotator.cli:_cli"
 
 [build-system]
 requires = ["setuptools>=65.3", "setuptools_scm>=8"]
@@ -193,7 +193,7 @@ exclude = [
     "ANN201",
     "ANN202",
 ]
-"src/ga4gh/vrs/extras/vcf_annotation.py" = [
+"src/ga4gh/vrs/extras/annotator/vcf.py" = [
     "PTH123",  # see https://github.com/ga4gh/vrs-python/issues/482
 ]
 "src/ga4gh/vrs/extras/object_store.py" = [
diff --git a/src/ga4gh/vrs/extras/annotator/__init__.py b/src/ga4gh/vrs/extras/annotator/__init__.py
new file mode 100644
index 00000000..7b787f94
--- /dev/null
+++ b/src/ga4gh/vrs/extras/annotator/__init__.py
@@ -0,0 +1 @@
+"""Provide tools for annotating data with corresponding VRS objects and attributes."""
diff --git a/src/ga4gh/vrs/extras/annotator/cli.py b/src/ga4gh/vrs/extras/annotator/cli.py
new file mode 100644
index 00000000..6b3de705
--- /dev/null
+++ b/src/ga4gh/vrs/extras/annotator/cli.py
@@ -0,0 +1,199 @@
+"""Define command-line interface for VRS annotator tool.
+
+$ vrs-annotate vcf input.vcf.gz --vcf_out output.vcf.gz --vrs_pickle_out vrs_objects.pkl
+
+"""
+
+import logging
+from collections.abc import Callable
+from enum import Enum
+from pathlib import Path
+from timeit import default_timer as timer
+
+import click
+
+from ga4gh.vrs.extras.annotator.vcf import SeqRepoProxyType, VCFAnnotator
+
+_logger = logging.getLogger(__name__)
+
+
+@click.group()
+def _cli() -> None:
+    """Annotate input files with VRS variation objects."""
+    logging.basicConfig(
+        filename="vrs-annotate.log",
+        level=logging.INFO,
+        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
+    )
+
+
+class _LogLevel(str, Enum):
+    """Define legal values for `--log_level` option."""
+
+    DEBUG = "debug"
+    INFO = "info"
+    WARNING = "warning"
+    ERROR = "error"
+    CRITICAL = "critical"
+
+
+def _log_level_option(func: Callable) -> Callable:
+    """Provide reusable log level CLI option decorator.
+
+    Adds a `--log_level` CLI option to any decorated command. Doesn't pass on any
+    values, just sets the logging level for the enclosing annotator package.
+
+    :param func: incoming click command
+    :return: same command, wrapped with log level option
+    """
+
+    def _set_log_level(ctx: dict, param: str, value: _LogLevel) -> None:  # noqa: ARG001
+        level_map = {
+            _LogLevel.DEBUG: logging.DEBUG,
+            _LogLevel.INFO: logging.INFO,
+            _LogLevel.WARNING: logging.WARNING,
+            _LogLevel.ERROR: logging.ERROR,
+            _LogLevel.CRITICAL: logging.CRITICAL,
+        }
+        # Configure the parent package logger rather than this module's logger so
+        # the chosen level is also inherited by the VCFAnnotator module's logger.
+        logging.getLogger(__name__.rpartition(".")[0]).setLevel(level_map[value])
+
+    return click.option(
+        "--log_level",
+        type=click.Choice([v.value for v in _LogLevel.__members__.values()]),
+        default="info",
+        help="Set the logging level.",
+        callback=_set_log_level,
+        expose_value=False,
+        is_eager=True,
+    )(func)
+
+
+@_cli.command(name="vcf")
+@_log_level_option
+@click.argument(
+    "vcf_in",
+    nargs=1,
+    type=click.Path(exists=True, readable=True, dir_okay=False, path_type=Path),
+)
+@click.option(
+    "--vcf_out",
+    required=False,
+    type=click.Path(writable=True, allow_dash=False, path_type=Path),
+    help=(
+        "Declare save location for output annotated VCF. If not provided, must provide --vrs_pickle_out."
+    ),
+)
+@click.option(
+    "--vrs_pickle_out",
+    required=False,
+    type=click.Path(writable=True, allow_dash=False, path_type=Path),
+    help=(
+        "Declare save location for output VCF pickle. If not provided, must provide --vcf_out."
+    ),
+)
+@click.option(
+    "--vrs_attributes",
+    is_flag=True,
+    default=False,
+    help="Include VRS_Start, VRS_End, and VRS_State fields in the VCF output INFO field.",
+)
+@click.option(
+    "--seqrepo_dp_type",
+    required=False,
+    default=SeqRepoProxyType.LOCAL,
+    type=click.Choice(
+        [v.value for v in SeqRepoProxyType.__members__.values()], case_sensitive=True
+    ),
+    help="Specify type of SeqRepo dataproxy to use.",
+    show_default=True,
+    show_choices=True,
+)
+@click.option(
+    "--seqrepo_root_dir",
+    required=False,
+    default=Path("/usr/local/share/seqrepo/latest"),
+    type=click.Path(path_type=Path),
+    help="Define root directory for local SeqRepo instance, if --seqrepo_dp_type=local.",
+    show_default=True,
+)
+@click.option(
+    "--seqrepo_base_url",
+    required=False,
+    default="http://localhost:5000/seqrepo",
+    help="Specify base URL for SeqRepo REST API, if --seqrepo_dp_type=rest.",
+    show_default=True,
+)
+@click.option(
+    "--assembly",
+    required=False,
+    default="GRCh38",
+    show_default=True,
+    help="Specify assembly that was used to create input VCF.",
+    type=str,
+)
+@click.option(
+    "--skip_ref",
+    is_flag=True,
+    default=False,
+    help="Skip VRS computation for REF alleles.",
+)
+@click.option(
+    "--require_validation",
+    is_flag=True,
+    default=False,
+    help="Require validation checks to pass to construct a VRS object.",
+)
+@click.option(
+    "--silent",
+    "-s",
+    is_flag=True,
+    default=False,
+    help="Suppress messages printed to stdout",
+)
+def _annotate_vcf_cli(
+    vcf_in: Path,
+    vcf_out: Path | None,
+    vrs_pickle_out: Path | None,
+    vrs_attributes: bool,
+    seqrepo_dp_type: SeqRepoProxyType,
+    seqrepo_root_dir: Path,
+    seqrepo_base_url: str,
+    assembly: str,
+    skip_ref: bool,
+    require_validation: bool,
+    silent: bool,
+) -> None:
+    """Extract VRS objects from VCF located at VCF_IN.
+
+        $ vrs-annotate vcf input.vcf.gz --vcf_out output.vcf.gz --vrs_pickle_out vrs_objects.pkl
+
+    Note that at least one of --vcf_out or --vrs_pickle_out must be selected and defined.
+    """
+    annotator = VCFAnnotator(
+        seqrepo_dp_type, seqrepo_base_url, str(seqrepo_root_dir.absolute())
+    )
+    vcf_out_str = str(vcf_out.absolute()) if vcf_out is not None else vcf_out
+    vrs_pkl_out_str = (
+        str(vrs_pickle_out.absolute()) if vrs_pickle_out is not None else vrs_pickle_out
+    )
+    start = timer()
+    msg = f"Annotating {vcf_in} with the VCF Annotator..."
+    _logger.info(msg)
+    if not silent:
+        click.echo(msg)
+    annotator.annotate(
+        str(vcf_in.absolute()),
+        vcf_out=vcf_out_str,
+        vrs_pickle_out=vrs_pkl_out_str,
+        vrs_attributes=vrs_attributes,
+        assembly=assembly,
+        compute_for_ref=(not skip_ref),
+        require_validation=require_validation,
+    )
+    end = timer()
+    msg = f"VCF Annotator finished in {(end - start):.5f} seconds"
+    _logger.info(msg)
+    if not silent:
+        click.echo(msg)
diff --git a/src/ga4gh/vrs/extras/vcf_annotation.py b/src/ga4gh/vrs/extras/annotator/vcf.py
similarity index 72%
rename from src/ga4gh/vrs/extras/vcf_annotation.py
rename to src/ga4gh/vrs/extras/annotator/vcf.py
index 0d413175..fe2a6069 100644
--- a/src/ga4gh/vrs/extras/vcf_annotation.py
+++ b/src/ga4gh/vrs/extras/annotator/vcf.py
@@ -1,22 +1,17 @@
-"""Annotate VCFs with VRS
-
-$ vrs-annotate vcf input.vcf.gz --vcf_out output.vcf.gz --vrs_pickle_out vrs_objects.pkl
-
-"""
+"""Annotate VCFs with VRS identifiers and attributes."""
 
 import logging
-import pathlib
 import pickle
-from collections.abc import Callable
 from enum import Enum
-from timeit import default_timer as timer
 
-import click
 import pysam
 from biocommons.seqrepo import SeqRepo
 from pydantic import ValidationError
 
-from ga4gh.core import VrsObjectIdentifierIs, use_ga4gh_compute_identifier_when
+from ga4gh.core.identifiers import (
+    VrsObjectIdentifierIs,
+    use_ga4gh_compute_identifier_when,
+)
 from ga4gh.vrs.dataproxy import (
     DataProxyValidationError,
     SeqRepoDataProxy,
@@ -25,7 +20,6 @@
 from ga4gh.vrs.extras.translator import AlleleTranslator
 
 _logger = logging.getLogger(__name__)
-_logger.setLevel(logging.DEBUG)
 
 
 class VCFAnnotatorError(Exception):
@@ -39,186 +33,6 @@ class SeqRepoProxyType(str, Enum):
     REST = "rest"
 
 
-@click.group()
-def _cli() -> None:
-    """Annotate input files with VRS variation objects."""
-    logging.basicConfig(
-        filename="vrs-annotate.log",
-        level=logging.INFO,
-        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
-    )
-
-
-class _LogLevel(str, Enum):
-    """Define legal values for `--log_level` option."""
-
-    DEBUG = "debug"
-    INFO = "info"
-    WARNING = "warning"
-    ERROR = "error"
-    CRITICAL = "critical"
-
-
-def _log_level_option(func: Callable) -> Callable:
-    """Provide reusable log level CLI option decorator.
-
-    Adds a `--log_level` CLI option to any decorated command. Doesn't pass on any
-    values, just sets the logging level for this module.
-
-    :param func: incoming click command
-    :return: same command, wrapped with log level option
-    """
-
-    def _set_log_level(ctx: dict, param: str, value: _LogLevel) -> None:  # noqa: ARG001
-        level_map = {
-            _LogLevel.DEBUG: logging.DEBUG,
-            _LogLevel.INFO: logging.INFO,
-            _LogLevel.WARNING: logging.WARNING,
-            _LogLevel.ERROR: logging.ERROR,
-            _LogLevel.CRITICAL: logging.CRITICAL,
-        }
-        logging.getLogger(__name__).setLevel(level_map[value])
-
-    return click.option(
-        "--log_level",
-        type=click.Choice([v.value for v in _LogLevel.__members__.values()]),
-        default="info",
-        help="Set the logging level.",
-        callback=_set_log_level,
-        expose_value=False,
-        is_eager=True,
-    )(func)
-
-
-@_cli.command(name="vcf")
-@_log_level_option
-@click.argument(
-    "vcf_in",
-    nargs=1,
-    type=click.Path(exists=True, readable=True, dir_okay=False, path_type=pathlib.Path),
-)
-@click.option(
-    "--vcf_out",
-    required=False,
-    type=click.Path(writable=True, allow_dash=False, path_type=pathlib.Path),
-    help=(
-        "Declare save location for output annotated VCF. If not provided, must provide --vrs_pickle_out."
-    ),
-)
-@click.option(
-    "--vrs_pickle_out",
-    required=False,
-    type=click.Path(writable=True, allow_dash=False, path_type=pathlib.Path),
-    help=(
-        "Declare save location for output VCF pickle. If not provided, must provide --vcf_out."
-    ),
-)
-@click.option(
-    "--vrs_attributes",
-    is_flag=True,
-    default=False,
-    help="Include VRS_Start, VRS_End, and VRS_State fields in the VCF output INFO field.",
-)
-@click.option(
-    "--seqrepo_dp_type",
-    required=False,
-    default=SeqRepoProxyType.LOCAL,
-    type=click.Choice(
-        [v.value for v in SeqRepoProxyType.__members__.values()], case_sensitive=True
-    ),
-    help="Specify type of SeqRepo dataproxy to use.",
-    show_default=True,
-    show_choices=True,
-)
-@click.option(
-    "--seqrepo_root_dir",
-    required=False,
-    default=pathlib.Path("/usr/local/share/seqrepo/latest"),
-    type=click.Path(path_type=pathlib.Path),
-    help="Define root directory for local SeqRepo instance, if --seqrepo_dp_type=local.",
-    show_default=True,
-)
-@click.option(
-    "--seqrepo_base_url",
-    required=False,
-    default="http://localhost:5000/seqrepo",
-    help="Specify base URL for SeqRepo REST API, if --seqrepo_dp_type=rest.",
-    show_default=True,
-)
-@click.option(
-    "--assembly",
-    required=False,
-    default="GRCh38",
-    show_default=True,
-    help="Specify assembly that was used to create input VCF.",
-    type=str,
-)
-@click.option(
-    "--skip_ref",
-    is_flag=True,
-    default=False,
-    help="Skip VRS computation for REF alleles.",
-)
-@click.option(
-    "--require_validation",
-    is_flag=True,
-    default=False,
-    help="Require validation checks to pass to construct a VRS object.",
-)
-@click.option(
-    "--silent",
-    "-s",
-    is_flag=True,
-    default=False,
-    help="Suppress messages printed to stdout",
-)
-def _annotate_vcf_cli(
-    vcf_in: pathlib.Path,
-    vcf_out: pathlib.Path | None,
-    vrs_pickle_out: pathlib.Path | None,
-    vrs_attributes: bool,
-    seqrepo_dp_type: SeqRepoProxyType,
-    seqrepo_root_dir: pathlib.Path,
-    seqrepo_base_url: str,
-    assembly: str,
-    skip_ref: bool,
-    require_validation: bool,
-    silent: bool,
-) -> None:
-    """Extract VRS objects from VCF located at VCF_IN.
-
-        $ vrs-annotate vcf input.vcf.gz --vcf_out output.vcf.gz --vrs_pickle_out vrs_objects.pkl
-
-    Note that at least one of --vcf_out or --vrs_pickle_out must be selected and defined.
-    """
-    annotator = VCFAnnotator(
-        seqrepo_dp_type, seqrepo_base_url, str(seqrepo_root_dir.absolute())
-    )
-    vcf_out_str = str(vcf_out.absolute()) if vcf_out is not None else vcf_out
-    vrs_pkl_out_str = (
-        str(vrs_pickle_out.absolute()) if vrs_pickle_out is not None else vrs_pickle_out
-    )
-    start = timer()
-    msg = f"Annotating {vcf_in} with the VCF Annotator..."
-    _logger.info(msg)
-    if not silent:
-        click.echo(msg)
-    annotator.annotate(
-        str(vcf_in.absolute()),
-        vcf_out=vcf_out_str,
-        vrs_pickle_out=vrs_pkl_out_str,
-        vrs_attributes=vrs_attributes,
-        assembly=assembly,
-        compute_for_ref=(not skip_ref),
-        require_validation=require_validation,
-    )
-    end = timer()
-    msg = f"VCF Annotator finished in {(end - start):.5f} seconds"
-    _logger.info(msg)
-    if not silent:
-        click.echo(msg)
-
-
 class VCFAnnotator:
     """Annotate VCFs with VRS allele IDs.
 
diff --git a/tests/extras/test_vcf_annotation.py b/tests/extras/test_annotate_vcf.py
similarity index 97%
rename from tests/extras/test_vcf_annotation.py
rename to tests/extras/test_annotate_vcf.py
index dc1e7ecf..e006057d 100644
--- a/tests/extras/test_vcf_annotation.py
+++ b/tests/extras/test_annotate_vcf.py
@@ -1,13 +1,14 @@
 """Ensure proper functionality of VCFAnnotator"""
 
 import gzip
+import logging
 import re
 from pathlib import Path
 
 import pytest
 
 from ga4gh.vrs.dataproxy import DataProxyValidationError
-from ga4gh.vrs.extras.vcf_annotation import VCFAnnotator, VCFAnnotatorError
+from ga4gh.vrs.extras.annotator.vcf import VCFAnnotator, VCFAnnotatorError
 
 TEST_DATA_DIR = "tests/extras/data"
 
@@ -168,6 +169,9 @@ def test_annotate_vcf_input_validation(vcf_annotator):
 @pytest.mark.vcr
 def test_get_vrs_object_invalid_input(vcf_annotator, caplog):
     """Test that _get_vrs_object method works as expected with invalid input"""
+    # some tests below are checking for debug logging statements
+    caplog.set_level(logging.DEBUG)
+
     # No CHROM
     vcf_annotator._get_vrs_object(".-140753336-A-T", {}, [], "GRCh38")
     assert "KeyError when getting refget accession: GRCh38:." in caplog.text