723 changes: 362 additions & 361 deletions docs/usage/blast.ipynb

Large diffs are not rendered by default.

338 changes: 169 additions & 169 deletions docs/usage/clustalo.ipynb

Large diffs are not rendered by default.

5 changes: 3 additions & 2 deletions docs/usage/embeddings_analysis.ipynb
@@ -24,9 +24,10 @@
"source": [
"import sys\n",
"\n",
"from loguru import logger\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"import pandas as pd\n",
"from loguru import logger\n",
"\n",
"from pyeed import Pyeed\n",
"from pyeed.analysis.embedding_analysis import EmbeddingTool\n",
"\n",
3 changes: 1 addition & 2 deletions docs/usage/mmseqs.ipynb
@@ -20,6 +20,7 @@
"outputs": [],
"source": [
"from pyeed import Pyeed\n",
"from pyeed.model import Protein\n",
"from pyeed.tools.mmseqs import MMSeqs"
]
},
@@ -134,8 +135,6 @@
"pyeed = Pyeed(uri=\"bolt://localhost:7687\", user=\"neo4j\", password=\"12345678\")\n",
"\n",
"# Get first 100 protein IDs from database\n",
"from pyeed.model import Protein\n",
"\n",
"accession_ids = [protein.accession_id for protein in Protein.nodes.all()][:100]\n",
"\n",
"# Cluster sequences\n",
1 change: 1 addition & 0 deletions docs/usage/mutation_analysis.ipynb
@@ -16,6 +16,7 @@
"outputs": [],
"source": [
"import sys\n",
"\n",
"from loguru import logger\n",
"\n",
"from pyeed import Pyeed\n",
1 change: 1 addition & 0 deletions docs/usage/network_analysis.ipynb
@@ -11,6 +11,7 @@
"import matplotlib.pyplot as plt\n",
"import networkx as nx\n",
"from loguru import logger\n",
"\n",
"from pyeed import Pyeed\n",
"from pyeed.analysis.network_analysis import NetworkAnalysis\n",
"from pyeed.analysis.sequence_alignment import PairwiseAligner\n",
2 changes: 1 addition & 1 deletion docs/usage/standard_numbering.ipynb
@@ -23,10 +23,10 @@
"%reload_ext autoreload\n",
"%autoreload 2\n",
"import sys\n",
"\n",
"from loguru import logger\n",
"\n",
"from pyeed import Pyeed\n",
"from pyeed.analysis.mutation_detection import MutationDetection\n",
"from pyeed.analysis.standard_numbering import StandardNumberingTool\n",
"\n",
"logger.remove()\n",
3 changes: 3 additions & 0 deletions pyproject.toml
@@ -36,7 +36,10 @@ esm = "^3.1.3"
rdflib = "^6.0.0"
docker = "5.0.0"
absl-py = "1.0.0"
crc64iso = "0.0.2"
SPARQLWrapper = "2.0.0"
pysam = "0.23.0"
types-requests = "2.32.0.20250328"

[tool.poetry.group.dev.dependencies]
mkdocstrings = {extras = ["python"], version = "^0.26.2"}
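The crc64iso and pysam additions support the checksum-based UniParc lookup introduced in this PR. A minimal sketch of the CRC64 step, using a placeholder sequence rather than real data:

from crc64iso import crc64iso

# Placeholder amino acid sequence, not a real protein
sequence = "MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQ"

# CRC64-ISO digest of the sequence; this value is used as the
# sequencechecksum query parameter of the UniParc API
checksum = crc64iso.crc64(sequence)
print(checksum)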
2 changes: 2 additions & 0 deletions src/pyeed/adapter/ncbi_protein_mapper.py
@@ -281,6 +281,8 @@ def add_to_db(self, response: Response) -> None:
protein = Protein(**protein_data)
protein.save()

if not isinstance(organism, Organism):
raise TypeError(f"Expected Organism, but got {type(organism)}")
protein.organism.connect(organism)

# Add features
132 changes: 132 additions & 0 deletions src/pyeed/adapter/ncbi_to_uniprot_mapper.py
@@ -0,0 +1,132 @@
import json
import logging
import os
import sys
from typing import List

import httpx
from crc64iso import crc64iso
from pysam import FastaFile

logger = logging.getLogger(__name__)


class NCBIToUniprotMapper:
def __init__(self, ids: List[str], file: str):
self.ids = ids
self.file = file
self.uniparc_url = "https://www.ebi.ac.uk/proteins/api/uniparc?offset=0&size=100&sequencechecksum="
self.ncbi_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"

def download_fasta(self, refseq_id: str) -> None:
"""
Downloads a FASTA file for a given RefSeq ID using httpx and saves it locally.

Args:
refseq_id (str): NCBI RefSeq ID
"""

params = {
"db": "protein",
"id": refseq_id,
"rettype": "fasta",
"retmode": "text",
}

try:
response = httpx.get(self.ncbi_url, params=params, timeout=10.0)

if response.status_code == 200:
filename = f"{refseq_id}.fasta"
with open(filename, "w") as f:
f.write(response.text)
print(f"✅ Downloaded: {filename}")
else:
print(
f"❌ Failed to download {refseq_id} (Status: {response.status_code})"
)

except httpx.HTTPError as e:
print(f"❌ HTTP error occurred while downloading {refseq_id}: {e}")

def get_checksum(self, refseq_id: str) -> str:
"""Fetches and calculates the checksum for a given RefSeq ID.

Args:
refseq_id (str): NCBI RefSeq ID

Returns:
str: checksum ID
"""

self.download_fasta(refseq_id)
fa = FastaFile(f"{refseq_id}.fasta")
seq = fa.fetch(fa.references[0])
return f"{crc64iso.crc64(seq)}"

def checksum_list(self, refseq_ids: List[str]) -> List[str]:
"""Creates a list of checksum IDs and deletes the FASTA files after processing.

Args:
refseq_ids (List[str]): NCBI RefSeq IDs

Returns:
List[str]: checksum IDs
"""

checksums = []
for refseq_id in refseq_ids:
checksums.append(self.get_checksum(refseq_id))
fasta_file_path = f"{refseq_id}.fasta"
fai_file_path = f"{refseq_id}.fasta.fai"

if os.path.exists(fasta_file_path):
os.remove(fasta_file_path) # Delete the fasta file

if os.path.exists(fai_file_path):
os.remove(fai_file_path)
return checksums

def execute_request(self) -> None:
"""Fetches the uniparc and uniprot ids for the given refseq ids and saves them in a json file."""

checksum_list = self.checksum_list(self.ids)

id_mapping_uniprot = {}
id_mapping_uniparc = {}
counter = 0

for checksum in checksum_list:
url = f"{self.uniparc_url}{checksum}"

# perform request and get response as JSON
with httpx.Client() as client:
response = client.get(url, headers={"Accept": "application/json"})

# check if the request was successful
if response.status_code != 200:
print(f"Request failed with status code {response.status_code}")
response.raise_for_status() # Raise exception for any non-200 response
sys.exit()

# Check if the response body is empty
if not response.content.strip(): # Check if the body is empty
print("The response body is empty.")
sys.exit()

# extract the UniProt and UniParc IDs from the response and store them in the mapping dictionaries
response_body = response.json()
for item in response_body:
uniparc_id = item.get("accession", None)
for ref in item.get("dbReference", []):
if ref.get("type") == "UniProtKB/TrEMBL":
uniprot_id = ref.get("id", None)
id_mapping_uniparc[self.ids[counter]] = uniparc_id
id_mapping_uniprot[self.ids[counter]] = uniprot_id
counter += 1

with open(f"{self.file}_uniprot.json", "w") as f:
json.dump(id_mapping_uniprot, f)

with open(f"{self.file}_uniparc.json", "w") as f:
json.dump(id_mapping_uniparc, f)
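A minimal usage sketch of the new mapper, assuming it is imported from the module path added above; the RefSeq accessions and the output prefix are placeholders:

from pyeed.adapter.ncbi_to_uniprot_mapper import NCBIToUniprotMapper

# Placeholder RefSeq protein accessions and output file prefix
mapper = NCBIToUniprotMapper(
    ids=["WP_000000001.1", "WP_000000002.1"],
    file="refseq_mapping",
)

# Downloads a FASTA per ID, computes the CRC64 checksum, queries UniParc,
# and writes refseq_mapping_uniprot.json and refseq_mapping_uniparc.json
mapper.execute_request()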
69 changes: 40 additions & 29 deletions src/pyeed/adapter/uniprot_mapper.py
@@ -1,9 +1,9 @@
import json
from collections import defaultdict
from typing import Any, List
from typing import Any, List, Optional

import requests
from bs4 import BeautifulSoup
from bs4 import BeautifulSoup, Tag
from httpx import Response
from loguru import logger
from SPARQLWrapper import JSON, SPARQLWrapper
@@ -82,13 +82,15 @@ def add_sites(self, record: dict[str, Any], protein: Protein) -> None:
site.save()

protein.site.connect(site, {"positions": positions})

def get_substrates_and_products_from_rhea(self, rhea_id: str) -> dict[str, List[str]]:

def get_substrates_and_products_from_rhea(
self, rhea_id: str
) -> dict[str, List[str]]:
"""Fetch substrates and products from Rhea by parsing the side URI (_L = substrate, _R = product).

Args:
rhea_id (str or int): The Rhea reaction ID (e.g., 49528)

Returns:
dict: {
'substrates': [list of chebi URIs],
@@ -118,7 +120,11 @@ def get_substrates_and_products_from_rhea(self, rhea_id: str) -> dict[str, List[
sparql.setReturnFormat(JSON)
sparql.addCustomHttpHeader("User-Agent", "MyPythonClient/1.0")

results = sparql.query().convert()
results_raw = sparql.query().convert()
if not isinstance(results_raw, dict):
raise TypeError("Expected dict from SPARQL query")

results: dict[str, Any] = results_raw

substrates = set()
products = set()
@@ -134,30 +140,35 @@ def get_substrates_and_products_from_rhea(self, rhea_id: str) -> dict[str, List[
elif side_uri.endswith("_R"):
products.add(chebi_uri)

return {
"substrates": sorted(substrates),
"products": sorted(products)
}
return {"substrates": sorted(substrates), "products": sorted(products)}


def get_smiles_from_chebi_web(self, chebi_url: str) -> str:
def get_smiles_from_chebi_web(self, chebi_url: str) -> Optional[str]:
"""
Extract SMILES from the official ChEBI page using HTML scraping.
"""
chebi_id = chebi_url.split('_')[-1]
chebi_id = chebi_url.split("_")[-1]
url = f"https://www.ebi.ac.uk/chebi/searchId.do?chebiId=CHEBI:{chebi_id}"

response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")

# Look for table rows that contain the SMILES label
for table in soup.find_all("table", class_="chebiTableContent"):
if not isinstance(table, Tag):
continue
for row in table.find_all("tr"):
if not isinstance(row, Tag):
continue
headers = row.find_all("td", class_="chebiDataHeader")
if headers and "SMILES" in headers[0].text:
data_cell = row.find_all("td")[-1] # Get the last <td> in row
return data_cell.text.strip()

if (
headers
and isinstance(headers[0], Tag)
and "SMILES" in headers[0].text
):
data_cells = row.find_all("td")
if data_cells:
return f"{data_cells[-1].text.strip()}"
return None

def add_reaction(self, record: dict[str, Any], protein: Protein) -> None:
for reference in record.get("comments", []): # Safe retrieval with .get()
Expand All @@ -168,39 +179,39 @@ def add_reaction(self, record: dict[str, Any], protein: Protein) -> None:
if db_ref.get("id", "").startswith("RHEA:"):
rhea_id = db_ref["id"]
break # Stop after finding the first match

catalytic_annotation = Reaction.get_or_save(
rhea_id=rhea_id,
)
self.add_molecule(rhea_id, catalytic_annotation)
protein.reaction.connect(catalytic_annotation)
if rhea_id is not None:
self.add_molecule(rhea_id, catalytic_annotation)
protein.reaction.connect(catalytic_annotation)

def add_molecule(self, rhea_id: str, reaction: Reaction) -> None:

chebi = self.get_substrates_and_products_from_rhea(rhea_id)

substrate_ids = chebi["substrates"]
product_ids = chebi["products"]

for i in substrate_ids:
smiles = self.get_smiles_from_chebi_web(i)
chebi_id = i.split('_')[-1]

chebi_id = i.split("_")[-1]
chebi_id = f"CHEBI:{chebi_id}"
substrate = Molecule.get_or_save(
chebi_id=chebi_id,
smiles = smiles,
smiles=smiles,
)
reaction.substrate.connect(substrate)

for i in product_ids:
smiles = self.get_smiles_from_chebi_web(i)

chebi_id = i.split('_')[-1]
chebi_id = i.split("_")[-1]
chebi_id = f"CHEBI:{chebi_id}"
product = Molecule.get_or_save(
chebi_id=chebi_id,
smiles = smiles,
smiles=smiles,
)
reaction.product.connect(product)

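A sketch of the Rhea-to-SMILES flow that the revised helpers enable. The enclosing mapper class is not named in the hunks shown here, so mapper stands for an already constructed instance of it; the Rhea ID is the one from the docstring example:

# Substrate/product ChEBI URIs for a Rhea reaction (mapper is a placeholder instance)
chebi = mapper.get_substrates_and_products_from_rhea("49528")

for chebi_uri in chebi["substrates"]:
    # May return None if the ChEBI page has no SMILES row
    smiles = mapper.get_smiles_from_chebi_web(chebi_uri)
    chebi_id = f"CHEBI:{chebi_uri.split('_')[-1]}"
    print(chebi_id, smiles)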
1 change: 0 additions & 1 deletion src/pyeed/analysis/mutation_detection.py
@@ -1,7 +1,6 @@
from typing import Any, Optional

from loguru import logger

from pyeed.dbconnect import DatabaseConnector

