Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
36273a8
feat: configure server logging on startup
bencap Aug 27, 2025
97cfc58
wip: auto failover for protein level alignment
bencap Aug 29, 2025
84b5e58
Add more specific error messages for target alignment failures
sallybg Dec 3, 2025
e421e82
refactor: change transcript parameter type to set for simplified tran…
bencap Dec 10, 2025
b6bbc59
feat: don't rely on user supplied target metadata for transcript sele…
bencap Dec 10, 2025
1170947
feat: Add application log config and initialize in app entrypoints
bencap Dec 11, 2025
8c77d7d
fix: Update score assignment to handle None values correctly in allel…
bencap Dec 12, 2025
fd2b69d
feat: Add hgnc_symbol field to TxSelectResult and update protein refe…
bencap Dec 12, 2025
6598890
feat: Implement get_overlapping_features_for_region function
bencap Dec 12, 2025
26cba72
Use ClinGen to map genomic accession-based variants to genome
sallybg Jan 14, 2026
c4e3d91
Merge branch 'mavedb-dev' into feature/bencap/57/auto-failover-to-pro…
bencap Jan 29, 2026
e84c20d
fix: improper import order from merge conflict resolution
bencap Jan 29, 2026
3a2bd0f
Merge pull request #64 from VariantEffect/feature/bencap/57/auto-fail…
bencap Jan 29, 2026
b6cd6a6
Merge pull request #65 from VariantEffect/feature/bencap/63/reduce-re…
bencap Jan 30, 2026
9669507
feat: compute gene info for all mapped targets.
bencap Dec 12, 2025
f503cc7
Merge branch 'mavedb-dev' into feature/bencap/55/hgnc-name-for-mapped…
bencap Jan 30, 2026
8b34557
Merge pull request #67 from VariantEffect/feature/bencap/55/hgnc-name…
bencap Jan 30, 2026
ec55365
Add ClinGen API URL as environment variable
sallybg Feb 3, 2026
e3552f1
Make requests to ClinGen with retries upon failure
sallybg Feb 4, 2026
bbabc89
feat: add CDOT_URL envvar to allow configurable endpoint in seq fetches
bencap Feb 4, 2026
6e6c31d
Merge pull request #72 from VariantEffect/feature/bencap/71/configura…
bencap Feb 5, 2026
814828d
Merge branch 'mavedb-dev' into fix-endogenous-genomic-mapping
bencap Feb 5, 2026
3520a81
Merge pull request #68 from VariantEffect/fix-endogenous-genomic-mapping
bencap Feb 5, 2026
f3bb387
chore: bump version to 2026.1.0
bencap Feb 5, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions settings/.env.dev
Original file line number Diff line number Diff line change
Expand Up @@ -31,3 +31,21 @@ MAVEDB_API_KEY=
####################################################################################################

SEQREPO_ROOT_DIR=/usr/local/share/seqrepo/2024-12-20

####################################################################################################
# Environment variables for ClinGen
####################################################################################################

CLINGEN_API_URL=https://reg.genome.network/allele

####################################################################################################
# Environment variables for ensembl
####################################################################################################

ENSEMBL_API_URL=https://rest.ensembl.org

####################################################################################################
# Environment variables for cdot
####################################################################################################

CDOT_URL=http://cdot-rest:8000
42 changes: 31 additions & 11 deletions src/api/routers/map.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
""""Provide mapping router"""
import logging
from pathlib import Path

from cool_seq_tool.schemas import AnnotationLayer
Expand All @@ -12,6 +13,7 @@
_get_mapped_reference_sequence,
_set_scoreset_layer,
annotate,
compute_target_gene_info,
)
from dcd_mapping.exceptions import (
AlignmentError,
Expand All @@ -34,6 +36,7 @@
from dcd_mapping.schemas import (
ScoreAnnotation,
ScoresetMapping,
TargetAnnotation,
TargetType,
TxSelectResult,
VrsVersion,
Expand All @@ -45,6 +48,8 @@
prefix="/api/v1", tags=["mappings"], responses={404: {"description": "Not found"}}
)

_logger = logging.getLogger(__name__)


@router.post(path="/map/{urn}", status_code=200, response_model=ScoresetMapping)
@with_mavedb_score_set
Expand All @@ -57,7 +62,7 @@ async def map_scoreset(urn: str, store_path: Path | None = None) -> JSONResponse
try:
metadata = get_scoreset_metadata(urn, store_path)
records = get_scoreset_records(metadata, True, store_path)
metadata = patch_target_sequence_type(metadata, records)
metadata = patch_target_sequence_type(metadata, records, force=False)
except ScoresetNotSupportedError as e:
return JSONResponse(
content=ScoresetMapping(
Expand Down Expand Up @@ -196,29 +201,41 @@ async def map_scoreset(urn: str, store_path: Path | None = None) -> JSONResponse

try:
raw_metadata = get_raw_scoreset_metadata(urn, store_path)
reference_sequences: dict[str, dict] = {}
reference_sequences: dict[str, TargetAnnotation] = {}
mapped_scores: list[ScoreAnnotation] = []
for target_gene in annotated_vrs_results:
preferred_layers = {
_set_scoreset_layer(urn, annotated_vrs_results[target_gene]),
}
target_gene_name = metadata.target_genes[target_gene].target_gene_name
reference_sequences[target_gene_name] = {
reference_sequences[target_gene_name] = TargetAnnotation()
reference_sequences[target_gene_name].layers = {
layer: {
"computed_reference_sequence": None,
"mapped_reference_sequence": None,
}
for layer in preferred_layers
}

# sometimes None shows up in the preferred_layers set; remove it before iterating
preferred_layers.discard(None)

# Determine one gene symbol per target and its selection method
gene_info = await compute_target_gene_info(
target_key=target_gene,
transcripts=transcripts,
alignment_results=alignment_results,
metadata=metadata,
mapped_scores=annotated_vrs_results[target_gene],
)

for layer in preferred_layers:
reference_sequences[target_gene_name][layer][
reference_sequences[target_gene_name].layers[layer][
"computed_reference_sequence"
] = _get_computed_reference_sequence(
metadata.target_genes[target_gene], layer, transcripts[target_gene]
)
reference_sequences[target_gene_name][layer][
reference_sequences[target_gene_name].layers[layer][
"mapped_reference_sequence"
] = _get_mapped_reference_sequence(
metadata.target_genes[target_gene],
Expand All @@ -227,6 +244,9 @@ async def map_scoreset(urn: str, store_path: Path | None = None) -> JSONResponse
alignment_results[target_gene],
)

if gene_info is not None:
reference_sequences[target_gene_name].gene_info = gene_info

for m in annotated_vrs_results[target_gene]:
if m.pre_mapped is None:
mapped_scores.append(ScoreAnnotation(**m.model_dump()))
Expand All @@ -236,15 +256,15 @@ async def map_scoreset(urn: str, store_path: Path | None = None) -> JSONResponse

# if genomic layer, not accession-based, and target gene type is coding, add cdna entry (just the sequence accession) to reference_sequences dict
if (
AnnotationLayer.GENOMIC in reference_sequences[target_gene_name]
AnnotationLayer.GENOMIC in reference_sequences[target_gene_name].layers
and metadata.target_genes[target_gene].target_gene_category
== TargetType.PROTEIN_CODING
and metadata.target_genes[target_gene].target_accession_id is None
and transcripts[target_gene] is not None
and isinstance(transcripts[target_gene], TxSelectResult)
and transcripts[target_gene].nm is not None
):
reference_sequences[target_gene_name][AnnotationLayer.CDNA] = {
reference_sequences[target_gene_name].layers[AnnotationLayer.CDNA] = {
"computed_reference_sequence": None,
"mapped_reference_sequence": {
"sequence_accessions": [transcripts[target_gene].nm]
Expand All @@ -253,18 +273,18 @@ async def map_scoreset(urn: str, store_path: Path | None = None) -> JSONResponse

# drop Nonetype reference sequences
for target_gene in reference_sequences:
for layer in list(reference_sequences[target_gene].keys()):
for layer in list(reference_sequences[target_gene].layers.keys()):
if (
reference_sequences[target_gene][layer][
reference_sequences[target_gene].layers[layer][
"mapped_reference_sequence"
]
is None
and reference_sequences[target_gene][layer][
and reference_sequences[target_gene].layers[layer][
"computed_reference_sequence"
]
is None
) or layer is None:
del reference_sequences[target_gene][layer]
del reference_sequences[target_gene].layers[layer]

except Exception as e:
return JSONResponse(
Expand Down
11 changes: 11 additions & 0 deletions src/api/server_main.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,24 @@
"""FastAPI server file"""
import logging

import uvicorn
from fastapi import FastAPI

from api.routers import map
from application_logging import init_logging
from dcd_mapping import dcd_mapping_version

# Configure logging first so that everything logged below (and by the routers
# once they start handling requests) uses the application-wide format and level.
init_logging()
_logger = logging.getLogger(__name__)
_logger.info("dcd-mapping API: %s", dcd_mapping_version)

# Module-level ASGI application object with the mapping router attached.
app = FastAPI()

app.include_router(map.router)

# Startup banner. The previous message carried a stray ")" after the version.
msg = f"Starting DCD Mapping server v{dcd_mapping_version}"
_logger.info(msg)


# If the application is not already being run within a uvicorn server, start uvicorn here.
if __name__ == "__main__":
Expand Down
19 changes: 19 additions & 0 deletions src/application_logging/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
"""Application logging initialization"""

import logging
import os

# Log level as seen when this module was first imported. Kept as a public
# constant for backward compatibility; init_logging() re-reads the environment
# at call time and only falls back to this value when the variable is unset.
LOG_LEVEL = os.getenv("LOG_LEVEL", "INFO").upper()


def init_logging() -> None:
    """Initialize application-wide logging with configured log level and format.

    The level name is read from the ``LOG_LEVEL`` environment variable at call
    time, falling back to the import-time ``LOG_LEVEL`` constant when the
    variable is not set. Reading it here (rather than only at import) means a
    value that enters the environment after this module was imported is still
    honoured.

    Names that do not resolve to an integer logging level fall back to
    ``logging.INFO`` instead of raising from ``logging.basicConfig``.

    The root logger is reconfigured with ``force=True``, so any handlers that
    were already attached to it are removed and replaced.
    """
    level_name = os.getenv("LOG_LEVEL", LOG_LEVEL).strip().upper()
    level = getattr(logging, level_name, None)
    # getattr() can also resolve non-level attributes of the logging module
    # (e.g. "BASIC_FORMAT" is a str); accept genuine integer levels only.
    if not isinstance(level, int) or isinstance(level, bool):
        level = logging.INFO

    logging.basicConfig(
        format="%(asctime)s %(levelname)s %(name)s: %(message)s",
        level=level,
        force=True,
    )
8 changes: 8 additions & 0 deletions src/dcd_mapping/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,20 @@
See the mapping manuscript for more information:
https://www.biorxiv.org/content/10.1101/2023.06.20.545702v1
"""
import logging

from dotenv import load_dotenv

from application_logging import init_logging

from .main import map_scoreset, map_scoreset_urn
from .version import dcd_mapping_version

# Public API of the package: the two mapping entry points imported from .main.
__all__ = ["map_scoreset", "map_scoreset_urn"]
__version__ = dcd_mapping_version

# NOTE(review): presumably loads variables from a local .env file into the
# process environment (python-dotenv). This runs after the imports above, so
# any module that reads an environment variable at import time will not see
# values defined only in .env — confirm this ordering is intended.
load_dotenv()

# Configure logging as a side effect of importing the package, then record the
# package version at INFO level.
# NOTE(review): init_logging() reconfigures the root logger, which also affects
# host applications that import dcd_mapping as a library — confirm this is
# desired.
init_logging()
_logger = logging.getLogger(__name__)
_logger.info("dcd-mapping: %s", dcd_mapping_version)
34 changes: 29 additions & 5 deletions src/dcd_mapping/align.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import os
import subprocess
import tempfile
from collections.abc import Mapping
from pathlib import Path
from urllib.parse import urlparse

Expand All @@ -19,8 +20,8 @@
ScoresetNotSupportedError,
)
from dcd_mapping.lookup import get_chromosome_identifier, get_gene_location
from dcd_mapping.mavedb_data import LOCAL_STORE_PATH
from dcd_mapping.resource_utils import http_download
from dcd_mapping.mavedb_data import LOCAL_STORE_PATH, patch_target_sequence_type
from dcd_mapping.resource_utils import CDOT_URL, http_download
from dcd_mapping.schemas import (
AlignmentResult,
GeneLocation,
Expand Down Expand Up @@ -376,7 +377,7 @@ def fetch_alignment(
if accession_id.startswith(("NP", "ENSP", "NC_")):
alignment_results[accession_id] = None
else:
url = f"https://cdot.cc/transcript/{accession_id}"
url = f"{CDOT_URL}/transcript/{accession_id}"
r = requests.get(url, timeout=30)

try:
Expand Down Expand Up @@ -441,7 +442,7 @@ def parse_cdot_mapping(cdot_mapping: dict, silent: bool) -> AlignmentResult:

def build_alignment_result(
metadata: ScoresetMetadata, silent: bool
) -> dict[str, AlignmentResult | None]:
) -> Mapping[str, AlignmentResult | None]:
# NOTE: Score set must contain all accession-based target genes or all sequence-based target genes
# This decision was made because it is most efficient to run BLAT all together, so the alignment function
# works on an entire score set rather than per target gene.
Expand All @@ -462,7 +463,30 @@ def build_alignment_result(
score_set_type = "sequence"

if score_set_type == "sequence":
alignment_result = align(metadata, silent)
try:
alignment_result = align(metadata, silent)
except AlignmentError as e:
failed_at_nucleotide_level = any(
target_gene.target_sequence_type == TargetSequenceType.DNA
for target_gene in metadata.target_genes.values()
)

if failed_at_nucleotide_level:
msg = f"BLAT alignment failed for {metadata.urn} at the nucleotide level. This alignment will be retried at the protein level."
_logger.warning(msg)
else:
raise AlignmentError from e

# So long as force=True, the content of the records dict is irrelevant.
try:
alignment_result = align(
patch_target_sequence_type(metadata, {}, force=True), silent
)
except AlignmentError as e2:
msg = f"BLAT alignment failed for {metadata.urn} at the protein level after failing at the nucleotide level."
_logger.error(msg)
raise AlignmentError(msg) from e2

else:
alignment_result = fetch_alignment(metadata, silent)

Expand Down
Loading