Skip to content
Binary file modified .DS_Store
Binary file not shown.
8 changes: 4 additions & 4 deletions VariantFormatter/__init__.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,20 @@
import pkg_resources
import importlib.metadata
import re
import warnings

# Pull in use_scm_version=True enabled version number
_is_released_version = False
try:
__version__ = pkg_resources.get_distribution("VariantFormatter").version
__version__ = importlib.metadata.version("VariantFormatter")
if re.match(r"^\d+\.\d+\.\d+$", __version__) is not None:
_is_released_version = True
except pkg_resources.DistributionNotFound as e:
except importlib.metadata.PackageNotFoundError:
warnings.warn("can't get __version__ because %s package isn't installed" % __package__, Warning)
__version__ = None


# <LICENSE>
# Copyright (C) 2016-2025 VariantValidator Contributors
# Copyright (C) 2016-2026 VariantValidator Contributors
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
Expand Down
7 changes: 5 additions & 2 deletions VariantFormatter/formatter.py
Original file line number Diff line number Diff line change
Expand Up @@ -306,6 +306,7 @@ def fetch_aligned_transcripts(hgvs_genomic, transcript_model, vfo, genome_build)
)
enst_list_3 = []
enst_list_2 = evm.relevant_transcripts(hgvs_genomic)

for tx in enst_list_2:
enst_list_3.append([tx])

Expand Down Expand Up @@ -346,7 +347,9 @@ def fetch_aligned_transcripts(hgvs_genomic, transcript_model, vfo, genome_build)

# Filter out non-latest
if ((vfo.select_transcripts != 'raw' and vfo.select_transcripts != 'select'
and "NM_" not in str(vfo.select_transcripts) and "ENST" not in str(vfo.select_transcripts))):
and "NR" not in str(vfo.select_transcripts)
and "NM_" not in str(vfo.select_transcripts)
and "ENST" not in str(vfo.select_transcripts))):
tx_list = vfo.transcript_filter(tx_list)
return tx_list

Expand Down Expand Up @@ -413,7 +416,7 @@ def gap_checker(hgvs_transcript, hgvs_genomic, genome_build, vfo, transcript_mod


# <LICENSE>
# Copyright (C) 2016-2025 VariantValidator Contributors
# Copyright (C) 2016-2026 VariantValidator Contributors
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
Expand Down
2 changes: 1 addition & 1 deletion VariantFormatter/gapGenes.py
Original file line number Diff line number Diff line change
Expand Up @@ -225,7 +225,7 @@ def fully_normalize(hgvs_tx, hgvs_genomic, hn, reverse_normalizer, vm, vfo):


# <LICENSE>
# Copyright (C) 2016-2025 VariantValidator Contributors
# Copyright (C) 2016-2026 VariantValidator Contributors
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
Expand Down
37 changes: 24 additions & 13 deletions VariantFormatter/simpleVariantFormatter.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
import VariantValidator
import VariantFormatter
import VariantFormatter.variantformatter as vf
from VariantValidator.modules import vcf_to_pvcf
GLOBAL_VFO = VariantValidator.Validator()

# Collect metadata
Expand All @@ -27,6 +28,7 @@
# VariantValidator instance, due to non thread-safe SQLite3 access via SeqRepo
def format(batch_input, genome_build, transcript_model=None, specify_transcripts=None,
checkOnly=False, liftover=False, validator=GLOBAL_VFO, testing=None):

# Testing?
if testing is True:
validator.testing = True
Expand All @@ -45,18 +47,6 @@ def format(batch_input, genome_build, transcript_model=None, specify_transcripts
if specify_transcripts == '["select"]':
specify_transcripts = "select"

# Format specify transcripts key options
if specify_transcripts == '["all"]':
specify_transcripts = "all"
if specify_transcripts == '["raw"]':
specify_transcripts = "raw"
if specify_transcripts == '["mane"]':
specify_transcripts = "mane"
if specify_transcripts == '["mane_select"]':
specify_transcripts = "mane_select"
if specify_transcripts == '["select"]':
specify_transcripts = "select"

# Set select_transcripts == 'all' to None
vfo = validator
vfo.select_transcripts = specify_transcripts
Expand All @@ -78,6 +68,25 @@ def format(batch_input, genome_build, transcript_model=None, specify_transcripts
bypass = False
# remove external whitespace
variant = variant.strip()

# Process VCF lines
# Only tab-separated (TSV) lines are treated as VCF; comma-separated input brings too many
# issues, so it is deliberately not handled here.
vcf_processing_warnings = []
if "\t" in variant and not re.search(r"[gcrnmo]\.", variant):
    try:
        variant = vcf_to_pvcf.vcf_to_shorthand(variant)
    except vcf_to_pvcf.VcfConversionError:
        # Conversion failed: keep the submitted text unchanged
        pass
    else:
        vcf_processing_warnings.append(f"VcfConversionWarning: VCF line identified and converted to {variant}")
        # NOTE(review): assumes the shorthand has at least four '-'/':' separated fields - confirm
        # against vcf_to_pvcf.vcf_to_shorthand. The length guard avoids an IndexError otherwise.
        vcf_data = re.split(r'[-:]', variant)
        # A numeric third field together with a del/inv fourth field is treated as a CNV range
        if (len(vcf_data) >= 4
                and re.search(r"\d+", vcf_data[2])
                and re.search(r"del|inv", vcf_data[3], re.IGNORECASE)):
            if not re.search(r"[gatcnmo]\.", str(vcf_data)):
                variant = f"{vcf_data[0]}:{vcf_data[1]}_{vcf_data[2]}{vcf_data[3].lower()}"
                vcf_processing_warnings.append(f"VcfConversionWarning: CNV identified, and mapped to {variant}")

# Remove internal whitespace
wsl = variant.split()
variant = ''.join(wsl)
Expand Down Expand Up @@ -162,14 +171,16 @@ def format(batch_input, genome_build, transcript_model=None, specify_transcripts
res = result.stucture_data()
formatted_variants[variant]['flag'] = result.warning_level
formatted_variants[variant][needs_formatting] = res[needs_formatting]
if vcf_processing_warnings != []:
formatted_variants[variant][needs_formatting]['genomic_variant_warnings'] = vcf_processing_warnings

# Add metadata
formatted_variants['metadata'] = metadata
return formatted_variants


# <LICENSE>
# Copyright (C) 2016-2025 VariantValidator Contributors
# Copyright (C) 2016-2026 VariantValidator Contributors
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
Expand Down
66 changes: 55 additions & 11 deletions VariantFormatter/variantformatter.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,7 @@ def __init__(self, p_vcf, g_hgvs, un_norm_hgvs, hgvs_ref_bases, gen_error, genom
self.un_norm_hgvs = un_norm_hgvs
self.g_hgvs_ref = hgvs_ref_bases
self.gen_error = gen_error
self.gen_warnings = None
self.selected_build = genome_build


Expand Down Expand Up @@ -388,23 +389,28 @@ def __init__(self, variant_description, genome_build, vfo, transcript_model=None
else:
transcript_list = formatter.fetch_aligned_transcripts(g_hgvs, self.transcript_model, self.vfo, genome_build)

# Remove malformed IDs
cp_transcript_list = copy.copy(transcript_list)
transcript_list = []
for tx in cp_transcript_list:
# De-duplicate and remove malformed IDs (e.g. Ensembl sequences that differ between GRCh37 and GRCh38 but share
# the same ID and version number).
# Also sort by the number of main-chromosome-like mappings; this avoids PyLiftover where possible, since we re-use
# the first non-failing hit for all liftover. (Where mapping details exist the order is ref, alt, strand, ...)
transcript_dict = {}
for tx in transcript_list:
# Known UTA ID malforms
if re.search('\/', tx[0]):
continue
else:
transcript_list.append(tx)
# don't exclude any tx, even if we have no chr like mapping
if tx[0] not in transcript_dict:
transcript_dict[tx[0]] = +1
# but add to the sort number if we do have a main chr like mapping
if len(tx) > 2 and (tx[1].startswith('NC_00') or tx[1] in ['NC_012920.1', 'NC_001807.4']):
transcript_dict[tx[0]] = transcript_dict[tx[0]] +1

transcript_list = sorted(transcript_dict.keys(),key=lambda k:transcript_dict[k],reverse=True)

# Create a variable to trap direct g_g liftover
g_to_g_lift = {}

# Create transcript level descriptions
for tx_alignment_data in transcript_list:
tx_id = tx_alignment_data[0]

for tx_id in transcript_list:
# Get transcript annotations
try:
annotation = vfo.db.get_transcript_annotation(tx_id)
Expand Down Expand Up @@ -458,10 +464,10 @@ def __init__(self, variant_description, genome_build, vfo, transcript_model=None
overlapping_tx = formatter.fetch_aligned_transcripts(g_hgvs, self.transcript_model,
self.vfo,
genome_build)

if tx_id not in str(overlapping_tx):
continue


hgvs_transcript_dict = formatter.hgvs_genomic2hgvs_transcript(g_hgvs, tx_id, self.vfo)

# Gap checking
Expand Down Expand Up @@ -496,6 +502,7 @@ def __init__(self, variant_description, genome_build, vfo, transcript_model=None
hgvs_protein_tlc = formatter.hgvs_transcript2hgvs_protein(am_i_gapped['hgvs_transcript'],
self.genome_build,
self.vfo)
hgvs_protein_tlc = formatter.remove_reference(hgvs_protein_tlc)
# Handle edits that have been stringified
try:
hgvs_protein_tlc.posedit.edit.ref
Expand Down Expand Up @@ -577,6 +584,29 @@ def __init__(self, variant_description, genome_build, vfo, transcript_model=None
specified_tx_variant=specified_tx_variant
)

# Use PyLiftover if needed to add missing lifts to primary assembly
if current_lift[build_to.lower()] == {}:
    # The transcript-guided lift returned nothing for the target build, so repeat the
    # lift with force_pyliftover=True
    direct_lift = lo.liftover(self.genomic_descriptions.g_hgvs,
                              self.genomic_descriptions.selected_build,
                              build_to,
                              vfo.splign_normalizer,
                              vfo.reverse_splign_normalizer,
                              None,
                              vfo,
                              specify_tx=tx_id,
                              liftover_level=self.liftover,
                              gap_map=formatter.gap_checker,
                              vfo=self.vfo,
                              specified_tx_variant=specified_tx_variant,
                              force_pyliftover=True
                              )

    # Fill the empty entry and mirror it under the matching hg-style build key
    current_lift[build_to.lower()] = direct_lift[build_to.lower()]
    if build_to == "GRCh37":
        current_lift["hg19"] = direct_lift[build_to.lower()]
    if build_to == "GRCh38":
        # Key was misspelt "hg39", which left the "hg38" entry untouched
        current_lift["hg38"] = direct_lift[build_to.lower()]

if "am_i_gapped" in current_lift.keys():
if order_my_tp['gapped_alignment_warning'] == "":
order_my_tp['gapped_alignment_warning'] = current_lift['am_i_gapped'][
Expand All @@ -591,6 +621,12 @@ def __init__(self, variant_description, genome_build, vfo, transcript_model=None
elif order_my_tp['transcript_variant_error'] is not None and g_to_g_lift != {}:
current_lift = g_to_g_lift

# Convert every lifted genomic description to its text form, as required for output
for build_lifts in current_lift.values():
    for locus in build_lifts.values():
        description = locus['hgvs_genomic_description']
        locus['hgvs_genomic_description'] = description.format({'max_ref_length': 0})
# Copy the liftover and split into primary and alt
cp_current_lift = copy.deepcopy(current_lift)
scaff_lift = copy.deepcopy(current_lift)
Expand Down Expand Up @@ -640,6 +676,7 @@ def stucture_data(self):
bring_order['g_hgvs'] = self.genomic_descriptions.g_hgvs # Is the removed ref version!
bring_order['selected_build'] = self.genomic_descriptions.selected_build
bring_order['genomic_variant_error'] = self.genomic_descriptions.gen_error
bring_order['genomic_variant_warnings'] = self.genomic_descriptions.gen_warnings
try:
if self.t_and_p_descriptions == {}:
bring_order['hgvs_t_and_p'] = {'intergenic': {'primary_assembly_loci': None}}
Expand All @@ -662,6 +699,13 @@ def stucture_data(self):
liftover_level=self.liftover
)

# Convert every lifted genomic description to its text form, as required for output
for build_lifts in current_lift.values():
    for locus in build_lifts.values():
        description = locus['hgvs_genomic_description']
        locus['hgvs_genomic_description'] = description.format({'max_ref_length': 0})

# Copy the liftover and split into primary and alt
cp_current_lift = copy.deepcopy(current_lift)
scaff_lift = copy.deepcopy(current_lift)
Expand Down Expand Up @@ -701,7 +745,7 @@ def collect_metadata(self):
return meta

# <LICENSE>
# Copyright (C) 2016-2025 VariantValidator Contributors
# Copyright (C) 2016-2026 VariantValidator Contributors
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
Expand Down