Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions settings/.env.dev
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,12 @@ MAVEDB_API_KEY=

SEQREPO_ROOT_DIR=/usr/local/share/seqrepo/2024-12-20

####################################################################################################
# Environment variables for ClinGen
####################################################################################################

CLINGEN_API_URL=https://reg.genome.network/allele

####################################################################################################
# Environment variables for ensembl
####################################################################################################
Expand Down
3 changes: 3 additions & 0 deletions src/dcd_mapping/resource_utils.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Provide basic utilities for fetching and storing external data."""
import logging
import os
import time
from pathlib import Path
Expand All @@ -7,6 +8,8 @@
import requests
from tqdm import tqdm

_logger = logging.getLogger(__name__)

MAVEDB_API_KEY = os.environ.get("MAVEDB_API_KEY")
MAVEDB_BASE_URL = os.environ.get("MAVEDB_BASE_URL")
ENSEMBL_API_URL = os.environ.get("ENSEMBL_API_URL", "https://rest.ensembl.org") # TODO
Expand Down
56 changes: 46 additions & 10 deletions src/dcd_mapping/vrs_map.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""Map transcripts to VRS objects."""

import logging
import os
from collections.abc import Iterable
from itertools import cycle

Expand Down Expand Up @@ -31,7 +32,7 @@
get_seqrepo,
translate_hgvs_to_vrs,
)
from dcd_mapping.resource_utils import CDOT_URL
from dcd_mapping.resource_utils import CDOT_URL, request_with_backoff
from dcd_mapping.schemas import (
AlignmentResult,
MappedScore,
Expand All @@ -48,6 +49,8 @@

_logger = logging.getLogger(__name__)

CLINGEN_API_URL = os.environ.get("CLINGEN_API_URL", "https://reg.genome.network/allele")


def _hgvs_variant_is_valid(hgvs_string: str) -> bool:
return not hgvs_string.endswith((".=", ")", "X"))
Expand Down Expand Up @@ -86,6 +89,27 @@ def is_intronic_variant(variant: Variant) -> bool:
return False


def fetch_clingen_genomic_hgvs(hgvs: str) -> str | None:
"""Fetch the genomic HGVS string from ClinGen.

:param hgvs: The HGVS string to fetch
:return: The genomic HGVS string on GRCh38, or None if not found
"""
if CLINGEN_API_URL is None:
msg = "CLINGEN_API_URL environment variable is not set and default is unavailable."
_logger.error(msg)
raise ValueError(msg)
response = request_with_backoff("GET", f"{CLINGEN_API_URL}?hgvs={hgvs}", timeout=30)
if response.status_code == 200:
data = response.json()
for allele in data.get("genomicAlleles", []):
if allele.get("referenceGenome") == "GRCh38":
for coordinates in allele.get("hgvs", []):
if coordinates.startswith("NC_"):
return coordinates
return None


def _create_pre_mapped_hgvs_strings(
raw_description: str,
layer: AnnotationLayer,
Expand Down Expand Up @@ -156,6 +180,7 @@ def _create_post_mapped_hgvs_strings(
layer: AnnotationLayer,
tx: TxSelectResult | None = None,
alignment: AlignmentResult | None = None,
accession_id: str | None = None,
) -> list[str]:
"""Generate a list of (post-mapped) HGVS strings from one long string containing many valid HGVS substrings.

Expand All @@ -167,13 +192,14 @@ def _create_post_mapped_hgvs_strings(
:param layer: An enum denoting the targeted annotation layer of these HGVS strings
:param tx: A TxSelectResult object defining the transcript we are mapping to (or None)
:param alignment: An AlignmentResult object defining the alignment we are mapping to (or None)
:param accession_id: An accession id describing the reference sequence (or None). Only used for accession-based variants.
:return: A list of HGVS strings relative to the `tx` or `alignment`
"""
if layer is AnnotationLayer.PROTEIN and tx is None:
msg = f"Transcript result must be provided for {layer} annotations (Transcript was `{tx}`)."
raise ValueError(msg)
if layer is AnnotationLayer.GENOMIC and alignment is None:
msg = f"Alignment result must be provided for {layer} annotations (Alignment was `{alignment}`)."
if layer is AnnotationLayer.GENOMIC and alignment is None and accession_id is None:
msg = f"Alignment result or accession ID must be provided for {layer} annotations (Alignment was `{alignment}` and Accession ID was `{accession_id}`)."
raise ValueError(msg)

raw_variants = _parse_raw_variant_str(raw_description)
Expand All @@ -197,12 +223,22 @@ def _create_post_mapped_hgvs_strings(
variant = _adjust_protein_variant_to_ref(variant, tx)
hgvs_strings.append(tx.np + ":" + str(variant))
elif layer is AnnotationLayer.GENOMIC:
assert alignment # noqa: S101. mypy help

variant = _adjust_genomic_variant_to_ref(variant, alignment)
hgvs_strings.append(
get_chromosome_identifier(alignment.chrom) + ":" + str(variant)
)
if accession_id:
pre_mapped_hgvs = accession_id + ":" + str(variant)
# use ClinGen to align accession-based variants to genomic reference
genomic_hgvs = fetch_clingen_genomic_hgvs(pre_mapped_hgvs)
if genomic_hgvs:
hgvs_strings.append(genomic_hgvs)
else:
msg = f"Could not fetch genomic HGVS on GRCh38 for accession-based variant: {pre_mapped_hgvs}"
raise ValueError(msg)
else:
assert alignment # noqa: S101. mypy help

variant = _adjust_genomic_variant_to_ref(variant, alignment)
hgvs_strings.append(
get_chromosome_identifier(alignment.chrom) + ":" + str(variant)
)
else:
msg = (
f"Could not generate HGVS strings for invalid AnnotationLayer: {layer}"
Expand Down Expand Up @@ -540,7 +576,7 @@ def _map_genomic(
post_mapped_hgvs_strings = _create_post_mapped_hgvs_strings(
row.hgvs_nt,
AnnotationLayer.GENOMIC,
alignment=align_result,
accession_id=sequence_id,
)
post_mapped_genomic = _construct_vrs_allele(
post_mapped_hgvs_strings,
Expand Down