openvar · Peter-J-Freeman · May 1, 2025 · May 20, 2025 · May 20, 2025 · Jul 22, 2025
diff --git a/.DS_Store b/.DS_Store
diff --git a/VariantFormatter/__init__.py b/VariantFormatter/__init__.py
@@ -1,20 +1,20 @@
-import pkg_resources
+import importlib.metadata
 import re
 import warnings
 
 # Pull in use_scm_version=True enabled version number
 _is_released_version = False
 try:
-    __version__ = pkg_resources.get_distribution("VariantFormatter").version
+    __version__ = importlib.metadata.version("VariantFormatter")
     if re.match(r"^\d+\.\d+\.\d+$", __version__) is not None:
         _is_released_version = True
-except pkg_resources.DistributionNotFound as e:
+except importlib.metadata.PackageNotFoundError:
     warnings.warn("can't get __version__ because %s package isn't installed" % __package__, Warning)
     __version__ = None
 
 
 # <LICENSE>
-# Copyright (C) 2016-2025 VariantValidator Contributors
+# Copyright (C) 2016-2026 VariantValidator Contributors
 #
 # This program is free software: you can redistribute it and/or modify
 # it under the terms of the GNU Affero General Public License as

diff --git a/VariantFormatter/formatter.py b/VariantFormatter/formatter.py
@@ -306,6 +306,7 @@ def fetch_aligned_transcripts(hgvs_genomic, transcript_model, vfo, genome_build)
                                                    )
         enst_list_3 = []
         enst_list_2 = evm.relevant_transcripts(hgvs_genomic)
+
         for tx in enst_list_2:
             enst_list_3.append([tx])
 
@@ -346,7 +347,9 @@ def fetch_aligned_transcripts(hgvs_genomic, transcript_model, vfo, genome_build)
 
     # Filter out non-latest
     if ((vfo.select_transcripts != 'raw' and vfo.select_transcripts != 'select'
-            and "NM_" not in str(vfo.select_transcripts) and "ENST" not in str(vfo.select_transcripts))):
+            and "NR" not in str(vfo.select_transcripts)
+            and "NM_" not in str(vfo.select_transcripts)
+            and "ENST" not in str(vfo.select_transcripts))):
         tx_list = vfo.transcript_filter(tx_list)
     return tx_list
 
@@ -413,7 +416,7 @@ def gap_checker(hgvs_transcript, hgvs_genomic, genome_build, vfo, transcript_mod
 
 
 # <LICENSE>
-# Copyright (C) 2016-2025 VariantValidator Contributors
+# Copyright (C) 2016-2026 VariantValidator Contributors
 #
 # This program is free software: you can redistribute it and/or modify
 # it under the terms of the GNU Affero General Public License as

diff --git a/VariantFormatter/gapGenes.py b/VariantFormatter/gapGenes.py
@@ -225,7 +225,7 @@ def fully_normalize(hgvs_tx, hgvs_genomic, hn, reverse_normalizer, vm, vfo):
 
 
 # <LICENSE>
-# Copyright (C) 2016-2025 VariantValidator Contributors
+# Copyright (C) 2016-2026 VariantValidator Contributors
 #
 # This program is free software: you can redistribute it and/or modify
 # it under the terms of the GNU Affero General Public License as

diff --git a/VariantFormatter/simpleVariantFormatter.py b/VariantFormatter/simpleVariantFormatter.py
@@ -14,6 +14,7 @@
 import VariantValidator
 import VariantFormatter
 import VariantFormatter.variantformatter as vf
+from VariantValidator.modules import vcf_to_pvcf
 GLOBAL_VFO = VariantValidator.Validator()
 
 # Collect metadata
@@ -27,6 +28,7 @@
 # VariantValidator instance, due to non thread-safe SQLite3 access via SeqRepo
 def format(batch_input, genome_build, transcript_model=None, specify_transcripts=None,
            checkOnly=False, liftover=False, validator=GLOBAL_VFO, testing=None):
+
     # Testing?
     if testing is True:
         validator.testing = True
@@ -45,18 +47,6 @@ def format(batch_input, genome_build, transcript_model=None, specify_transcripts
     if specify_transcripts == '["select"]':
         specify_transcripts = "select"
 
-    # Format specify transcripts key options
-    if specify_transcripts == '["all"]':
-        specify_transcripts = "all"
-    if specify_transcripts == '["raw"]':
-        specify_transcripts = "raw"
-    if specify_transcripts == '["mane"]':
-        specify_transcripts = "mane"
-    if specify_transcripts == '["mane_select"]':
-        specify_transcripts = "mane_select"
-    if specify_transcripts == '["select"]':
-        specify_transcripts = "select"
-
     # Set select_transcripts == 'all' to None
     vfo = validator
     vfo.select_transcripts = specify_transcripts
@@ -78,6 +68,25 @@ def format(batch_input, genome_build, transcript_model=None, specify_transcripts
         bypass = False
         # remove external whitespace
         variant = variant.strip()
+
+        # Process VCF lines
+        # Note: only tab-separated (TSV) VCF lines are handled, because handling CSV brings too many issues
+        vcf_processing_warnings = []
+        if "\t" in variant and not re.search(r"[gcrnmo]\.", variant):
+            try:
+                variant = vcf_to_pvcf.vcf_to_shorthand(variant)
+            except vcf_to_pvcf.VcfConversionError:
+                pass
+            else:
+                vcf_processing_warnings.append(f"VcfConversionWarning: VCF line identified and converted to {variant}")
+                vcf_data = re.split(r'[-:]', variant)
+                if (re.search(r"\d+", vcf_data[2]) and (
+                        re.search("del", vcf_data[3], re.IGNORECASE) or
+                        re.search("inv", vcf_data[3], re.IGNORECASE))):
+                    if not re.search(r"[gatcnmo]\.", str(vcf_data)):
+                        variant = f"{vcf_data[0]}:{vcf_data[1]}_{vcf_data[2]}{vcf_data[3].lower()}"
+                        vcf_processing_warnings.append(f"VcfConversionWarning: CNV identified, and mapped to {variant}")
+
         # Remove internal whitespace
         wsl = variant.split()
         variant = ''.join(wsl)
@@ -162,14 +171,16 @@ def format(batch_input, genome_build, transcript_model=None, specify_transcripts
             res = result.stucture_data()
             formatted_variants[variant]['flag'] = result.warning_level
             formatted_variants[variant][needs_formatting] = res[needs_formatting]
+            if vcf_processing_warnings:
+                formatted_variants[variant][needs_formatting]['genomic_variant_warnings'] = vcf_processing_warnings
 
     # Add metadata
     formatted_variants['metadata'] = metadata
     return formatted_variants
 
 
 # <LICENSE>
-# Copyright (C) 2016-2025 VariantValidator Contributors
+# Copyright (C) 2016-2026 VariantValidator Contributors
 #
 # This program is free software: you can redistribute it and/or modify
 # it under the terms of the GNU Affero General Public License as

diff --git a/VariantFormatter/variantformatter.py b/VariantFormatter/variantformatter.py
@@ -80,6 +80,7 @@ def __init__(self, p_vcf, g_hgvs, un_norm_hgvs, hgvs_ref_bases, gen_error, genom
         self.un_norm_hgvs = un_norm_hgvs
         self.g_hgvs_ref = hgvs_ref_bases
         self.gen_error = gen_error
+        self.gen_warnings = None
         self.selected_build = genome_build
 
 
@@ -388,23 +389,28 @@ def __init__(self, variant_description, genome_build, vfo, transcript_model=None
         else:
             transcript_list = formatter.fetch_aligned_transcripts(g_hgvs, self.transcript_model, self.vfo, genome_build)
 
-        # Remove malform IDs
-        cp_transcript_list = copy.copy(transcript_list)
-        transcript_list = []
-        for tx in cp_transcript_list:
+        # De-dup and remove malformed IDs (e.g. ENS seqs with different GRCh37 and GRCh38 ver. but same ID and version no.)
+        # Also sort by the number of main-chr-like mappings; this avoids pyliftover where possible, since we re-use the
+        # first non-failing hit for all liftover. (Where mapping details exist, the order is: ref, alt, strand, ...)
+        transcript_dict = {}
+        for tx in transcript_list:
             # Known UTA ID malforms
             if re.search('\/', tx[0]):
                 continue
-            else:
-                transcript_list.append(tx)
+            # don't exclude any tx, even if we have no chr like mapping
+            if tx[0] not in transcript_dict:
+                transcript_dict[tx[0]] = 1
+            # but add to the sort number if we do have a main chr like mapping
+            if len(tx) > 2 and (tx[1].startswith('NC_00') or tx[1] in ('NC_012920.1', 'NC_001807.4')):
+                transcript_dict[tx[0]] += 1
+
+        transcript_list = sorted(transcript_dict, key=transcript_dict.get, reverse=True)
 
         # Create a variable to trap direct g_g liftover
         g_to_g_lift = {}
 
         # Create transcript level descriptions
-        for tx_alignment_data in transcript_list:
-            tx_id = tx_alignment_data[0]
-
+        for tx_id in transcript_list:
             # Get transcript annotations
             try:
                 annotation = vfo.db.get_transcript_annotation(tx_id)
@@ -458,10 +464,10 @@ def __init__(self, variant_description, genome_build, vfo, transcript_model=None
                 overlapping_tx = formatter.fetch_aligned_transcripts(g_hgvs, self.transcript_model,
                                                                      self.vfo,
                                                                      genome_build)
+
                 if tx_id not in str(overlapping_tx):
                     continue
 
-
             hgvs_transcript_dict = formatter.hgvs_genomic2hgvs_transcript(g_hgvs, tx_id, self.vfo)
 
             # Gap checking
@@ -496,6 +502,7 @@ def __init__(self, variant_description, genome_build, vfo, transcript_model=None
                             hgvs_protein_tlc = formatter.hgvs_transcript2hgvs_protein(am_i_gapped['hgvs_transcript'],
                                                                                       self.genome_build,
                                                                                       self.vfo)
+                            hgvs_protein_tlc = formatter.remove_reference(hgvs_protein_tlc)
                             # Handle edits that have been stringified
                             try:
                                 hgvs_protein_tlc.posedit.edit.ref
@@ -577,6 +584,29 @@ def __init__(self, variant_description, genome_build, vfo, transcript_model=None
                                                specified_tx_variant=specified_tx_variant
                                                )
 
+                    # Use PyLiftover if needed to add missing lifts to primary assembly
+                    if current_lift[build_to.lower()] == {}:
+                        direct_lift = lo.liftover(self.genomic_descriptions.g_hgvs,
+                                                   self.genomic_descriptions.selected_build,
+                                                   build_to,
+                                                   vfo.splign_normalizer,
+                                                   vfo.reverse_splign_normalizer,
+                                                   None,
+                                                   vfo,
+                                                   specify_tx=tx_id,
+                                                   liftover_level=self.liftover,
+                                                   gap_map=formatter.gap_checker,
+                                                   vfo=self.vfo,
+                                                   specified_tx_variant=specified_tx_variant,
+                                                   force_pyliftover=True
+                                                   )
+
+                        current_lift[build_to.lower()] = direct_lift[build_to.lower()]
+                        if build_to == "GRCh37":
+                            current_lift["hg19"] = direct_lift[build_to.lower()]
+                        if build_to == "GRCh38":
+                            current_lift["hg38"] = direct_lift[build_to.lower()]  # hg38 is the UCSC name for GRCh38
+
                     if "am_i_gapped" in current_lift.keys():
                         if order_my_tp['gapped_alignment_warning'] == "":
                             order_my_tp['gapped_alignment_warning'] = current_lift['am_i_gapped'][
@@ -591,6 +621,12 @@ def __init__(self, variant_description, genome_build, vfo, transcript_model=None
                 elif order_my_tp['transcript_variant_error'] is not None and g_to_g_lift != {}:
                     current_lift = g_to_g_lift
 
+                # First convert the liftover descriptions to text, as required for output
+                for key, val in current_lift.items():
+                    for chr_type in val.keys():
+                        current_lift[key][chr_type]['hgvs_genomic_description'] = \
+                            current_lift[key][chr_type]['hgvs_genomic_description'].format(
+                                {'max_ref_length': 0})
                 # Copy the liftover and split into primary and alt
                 cp_current_lift = copy.deepcopy(current_lift)
                 scaff_lift = copy.deepcopy(current_lift)
@@ -640,6 +676,7 @@ def stucture_data(self):
         bring_order['g_hgvs'] = self.genomic_descriptions.g_hgvs  # Is the removed ref version!
         bring_order['selected_build'] = self.genomic_descriptions.selected_build
         bring_order['genomic_variant_error'] = self.genomic_descriptions.gen_error
+        bring_order['genomic_variant_warnings'] = self.genomic_descriptions.gen_warnings
         try:
             if self.t_and_p_descriptions == {}:
                 bring_order['hgvs_t_and_p'] = {'intergenic': {'primary_assembly_loci': None}}
@@ -662,6 +699,13 @@ def stucture_data(self):
                                                liftover_level=self.liftover
                                                )
 
+                    # First convert the liftover descriptions to text, as required for output
+                    for key, val in current_lift.items():
+                        for chr_type in val.keys():
+                            current_lift[key][chr_type]['hgvs_genomic_description'] = \
+                                current_lift[key][chr_type]['hgvs_genomic_description'].format(
+                                    {'max_ref_length': 0})
+
                     # Copy the liftover and split into primary and alt
                     cp_current_lift = copy.deepcopy(current_lift)
                     scaff_lift = copy.deepcopy(current_lift)
@@ -701,7 +745,7 @@ def collect_metadata(self):
         return meta
 
 # <LICENSE>
-# Copyright (C) 2016-2025 VariantValidator Contributors
+# Copyright (C) 2016-2026 VariantValidator Contributors
 #
 # This program is free software: you can redistribute it and/or modify
 # it under the terms of the GNU Affero General Public License as