723 changes: 362 additions & 361 deletions docs/usage/blast.ipynb

Large diffs are not rendered by default.

338 changes: 169 additions & 169 deletions docs/usage/clustalo.ipynb

Large diffs are not rendered by default.

5 changes: 3 additions & 2 deletions docs/usage/embeddings_analysis.ipynb
@@ -24,9 +24,10 @@
"source": [
"import sys\n",
"\n",
"from loguru import logger\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"import pandas as pd\n",
"from loguru import logger\n",
"\n",
"from pyeed import Pyeed\n",
"from pyeed.analysis.embedding_analysis import EmbeddingTool\n",
"\n",
3 changes: 1 addition & 2 deletions docs/usage/mmseqs.ipynb
@@ -20,6 +20,7 @@
"outputs": [],
"source": [
"from pyeed import Pyeed\n",
"from pyeed.model import Protein\n",
"from pyeed.tools.mmseqs import MMSeqs"
]
},
@@ -134,8 +135,6 @@
"pyeed = Pyeed(uri=\"bolt://localhost:7687\", user=\"neo4j\", password=\"12345678\")\n",
"\n",
"# Get first 100 protein IDs from database\n",
"from pyeed.model import Protein\n",
"\n",
"accession_ids = [protein.accession_id for protein in Protein.nodes.all()][:100]\n",
"\n",
"# Cluster sequences\n",
1 change: 1 addition & 0 deletions docs/usage/mutation_analysis.ipynb
@@ -16,6 +16,7 @@
"outputs": [],
"source": [
"import sys\n",
"\n",
"from loguru import logger\n",
"\n",
"from pyeed import Pyeed\n",
1 change: 1 addition & 0 deletions docs/usage/network_analysis.ipynb
@@ -11,6 +11,7 @@
"import matplotlib.pyplot as plt\n",
"import networkx as nx\n",
"from loguru import logger\n",
"\n",
"from pyeed import Pyeed\n",
"from pyeed.analysis.network_analysis import NetworkAnalysis\n",
"from pyeed.analysis.sequence_alignment import PairwiseAligner\n",
2 changes: 1 addition & 1 deletion docs/usage/standard_numbering.ipynb
@@ -23,10 +23,10 @@
"%reload_ext autoreload\n",
"%autoreload 2\n",
"import sys\n",
"\n",
"from loguru import logger\n",
"\n",
"from pyeed import Pyeed\n",
"from pyeed.analysis.mutation_detection import MutationDetection\n",
"from pyeed.analysis.standard_numbering import StandardNumberingTool\n",
"\n",
"logger.remove()\n",
3 changes: 3 additions & 0 deletions pyproject.toml
@@ -36,7 +36,10 @@ esm = "^3.1.3"
rdflib = "^6.0.0"
docker = "5.0.0"
absl-py = "1.0.0"
crc64iso = "0.0.2"
SPARQLWrapper = "2.0.0"
pysam = "0.23.0"
types-requests = "2.32.0.20250328"

[tool.poetry.group.dev.dependencies]
mkdocstrings = {extras = ["python"], version = "^0.26.2"}
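The crc64iso and pysam additions support the checksum-based UniParc lookup introduced in this PR. A minimal sketch of the CRC64 step, using a placeholder sequence rather than real data:

from crc64iso import crc64iso

# Placeholder amino acid sequence, not a real protein
sequence = "MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQ"

# CRC64-ISO digest of the sequence; this value is used as the
# sequencechecksum query parameter of the UniParc API
checksum = crc64iso.crc64(sequence)
print(checksum)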
2 changes: 2 additions & 0 deletions src/pyeed/adapter/ncbi_protein_mapper.py
@@ -281,6 +281,8 @@ def add_to_db(self, response: Response) -> None:
protein = Protein(**protein_data)
protein.save()

if not isinstance(organism, Organism):
raise TypeError(f"Expected Organism, but got {type(organism)}")
protein.organism.connect(organism)

# Add features
132 changes: 132 additions & 0 deletions src/pyeed/adapter/ncbi_to_uniprot_mapper.py
@@ -0,0 +1,132 @@
import json
import logging
import os
import sys
from typing import List

import httpx
from crc64iso import crc64iso
from pysam import FastaFile

logger = logging.getLogger(__name__)


class NCBIToUniprotMapper:
def __init__(self, ids: List[str], file: str):
self.ids = ids
self.file = file
self.uniparc_url = "https://www.ebi.ac.uk/proteins/api/uniparc?offset=0&size=100&sequencechecksum="
self.ncbi_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"

def download_fasta(self, refseq_id: str) -> None:
"""
Downloads a FASTA file for a given RefSeq ID using httpx and saves it locally.

Args:
refseq_id (str): NCBI RefSeq ID
"""

params = {
"db": "protein",
"id": refseq_id,
"rettype": "fasta",
"retmode": "text",
}

try:
response = httpx.get(self.ncbi_url, params=params, timeout=10.0)

if response.status_code == 200:
filename = f"{refseq_id}.fasta"
with open(filename, "w") as f:
f.write(response.text)
print(f"✅ Downloaded: {filename}")
else:
print(
f"❌ Failed to download {refseq_id} (Status: {response.status_code})"
)

except httpx.HTTPError as e:
print(f"❌ HTTP error occurred while downloading {refseq_id}: {e}")

def get_checksum(self, refseq_id: str) -> str:
"""Fetches and calculates the checksum for a given RefSeq ID.

Args:
refseq_id (str): NCBI RefSeq ID

Returns:
str: checksum ID
"""

self.download_fasta(refseq_id)
fa = FastaFile(f"{refseq_id}.fasta")
seq = fa.fetch(fa.references[0])
return f"{crc64iso.crc64(seq)}"

def checksum_list(self, refseq_ids: List[str]) -> List[str]:
"""Creates a list of checksum IDs and deletes the FASTA files after processing.

Args:
refseq_ids (List[str]): NCBI RefSeq IDs

Returns:
List[str]: checksum IDs
"""

checksums = []
for refseq_id in refseq_ids:
checksums.append(self.get_checksum(refseq_id))
fasta_file_path = f"{refseq_id}.fasta"
fai_file_path = f"{refseq_id}.fasta.fai"

if os.path.exists(fasta_file_path):
os.remove(fasta_file_path) # Delete the fasta file

if os.path.exists(fai_file_path):
os.remove(fai_file_path)
return checksums

def execute_request(self) -> None:
"""Fetches the uniparc and uniprot ids for the given refseq ids and saves them in a json file."""

checksum_list = self.checksum_list(self.ids)

id_mapping_uniprot = {}
id_mapping_uniparc = {}
counter = 0

for checksum in checksum_list:
url = f"{self.uniparc_url}{checksum}"

# perform request and get response as JSON
with httpx.Client() as client:
response = client.get(url, headers={"Accept": "application/json"})

# check if the request was successful
if response.status_code != 200:
print(f"Request failed with status code {response.status_code}")
response.raise_for_status() # Raise exception for any non-200 response
sys.exit()

# Check if the response body is empty
if not response.content.strip(): # Check if the body is empty
print("The response body is empty.")
sys.exit()

# extract the UniProt and UniParc IDs from the response and store them in the mapping dictionaries
response_body = response.json()
for item in response_body:
uniparc_id = item.get("accession", None)
for ref in item.get("dbReference", []):
if ref.get("type") == "UniProtKB/TrEMBL":
uniprot_id = ref.get("id", None)
id_mapping_uniparc[self.ids[counter]] = uniparc_id
id_mapping_uniprot[self.ids[counter]] = uniprot_id
counter += 1

with open(f"{self.file}_uniprot.json", "w") as f:
json.dump(id_mapping_uniprot, f)

with open(f"{self.file}_uniparc.json", "w") as f:
json.dump(id_mapping_uniparc, f)
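A minimal usage sketch of the new mapper, assuming it is imported from the module path added above; the RefSeq accessions and the output prefix are placeholders:

from pyeed.adapter.ncbi_to_uniprot_mapper import NCBIToUniprotMapper

# Placeholder RefSeq protein accessions and output file prefix
mapper = NCBIToUniprotMapper(
    ids=["WP_000000001.1", "WP_000000002.1"],
    file="refseq_mapping",
)

# Downloads a FASTA per ID, computes the CRC64 checksum, queries UniParc,
# and writes refseq_mapping_uniprot.json and refseq_mapping_uniparc.json
mapper.execute_request()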
69 changes: 40 additions & 29 deletions src/pyeed/adapter/uniprot_mapper.py
@@ -1,9 +1,9 @@
import json
from collections import defaultdict
from typing import Any, List
from typing import Any, List, Optional

import requests
from bs4 import BeautifulSoup
from bs4 import BeautifulSoup, Tag
from httpx import Response
from loguru import logger
from SPARQLWrapper import JSON, SPARQLWrapper
@@ -82,13 +82,15 @@ def add_sites(self, record: dict[str, Any], protein: Protein) -> None:
site.save()

protein.site.connect(site, {"positions": positions})

def get_substrates_and_products_from_rhea(self, rhea_id: str) -> dict[str, List[str]]:

def get_substrates_and_products_from_rhea(
self, rhea_id: str
) -> dict[str, List[str]]:
"""Fetch substrates and products from Rhea by parsing the side URI (_L = substrate, _R = product).

Args:
rhea_id (str or int): The Rhea reaction ID (e.g., 49528)

Returns:
dict: {
'substrates': [list of chebi URIs],
@@ -118,7 +120,11 @@ def get_substrates_and_products_from_rhea(self, rhea_id: str) -> dict[str, List[
sparql.setReturnFormat(JSON)
sparql.addCustomHttpHeader("User-Agent", "MyPythonClient/1.0")

results = sparql.query().convert()
results_raw = sparql.query().convert()
if not isinstance(results_raw, dict):
raise TypeError("Expected dict from SPARQL query")

results: dict[str, Any] = results_raw

substrates = set()
products = set()
@@ -134,30 +140,35 @@ def get_substrates_and_products_from_rhea(self, rhea_id: str) -> dict[str, List[
elif side_uri.endswith("_R"):
products.add(chebi_uri)

return {
"substrates": sorted(substrates),
"products": sorted(products)
}
return {"substrates": sorted(substrates), "products": sorted(products)}


def get_smiles_from_chebi_web(self, chebi_url: str) -> str:
def get_smiles_from_chebi_web(self, chebi_url: str) -> Optional[str]:
"""
Extract SMILES from the official ChEBI page using HTML scraping.
"""
chebi_id = chebi_url.split('_')[-1]
chebi_id = chebi_url.split("_")[-1]
url = f"https://www.ebi.ac.uk/chebi/searchId.do?chebiId=CHEBI:{chebi_id}"

response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")

# Look for table rows that contain the SMILES label
for table in soup.find_all("table", class_="chebiTableContent"):
if not isinstance(table, Tag):
continue
for row in table.find_all("tr"):
if not isinstance(row, Tag):
continue
headers = row.find_all("td", class_="chebiDataHeader")
if headers and "SMILES" in headers[0].text:
data_cell = row.find_all("td")[-1] # Get the last <td> in row
return data_cell.text.strip()

if (
headers
and isinstance(headers[0], Tag)
and "SMILES" in headers[0].text
):
data_cells = row.find_all("td")
if data_cells:
return f"{data_cells[-1].text.strip()}"
return None

def add_reaction(self, record: dict[str, Any], protein: Protein) -> None:
for reference in record.get("comments", []): # Safe retrieval with .get()
Expand All @@ -168,39 +179,39 @@ def add_reaction(self, record: dict[str, Any], protein: Protein) -> None:
if db_ref.get("id", "").startswith("RHEA:"):
rhea_id = db_ref["id"]
break # Stop after finding the first match

catalytic_annotation = Reaction.get_or_save(
rhea_id=rhea_id,
)
self.add_molecule(rhea_id, catalytic_annotation)
protein.reaction.connect(catalytic_annotation)
if rhea_id is not None:
self.add_molecule(rhea_id, catalytic_annotation)
protein.reaction.connect(catalytic_annotation)

def add_molecule(self, rhea_id: str, reaction: Reaction) -> None:

chebi = self.get_substrates_and_products_from_rhea(rhea_id)

substrate_ids = chebi["substrates"]
product_ids = chebi["products"]

for i in substrate_ids:
smiles = self.get_smiles_from_chebi_web(i)
chebi_id = i.split('_')[-1]

chebi_id = i.split("_")[-1]
chebi_id = f"CHEBI:{chebi_id}"
substrate = Molecule.get_or_save(
chebi_id=chebi_id,
smiles = smiles,
smiles=smiles,
)
reaction.substrate.connect(substrate)

for i in product_ids:
smiles = self.get_smiles_from_chebi_web(i)

chebi_id = i.split('_')[-1]
chebi_id = i.split("_")[-1]
chebi_id = f"CHEBI:{chebi_id}"
product = Molecule.get_or_save(
chebi_id=chebi_id,
smiles = smiles,
smiles=smiles,
)
reaction.product.connect(product)

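A sketch of the Rhea-to-SMILES flow that the revised helpers enable. The enclosing mapper class is not named in the hunks shown here, so mapper stands for an already constructed instance of it; the Rhea ID is the one from the docstring example:

# Substrate/product ChEBI URIs for a Rhea reaction (mapper is a placeholder instance)
chebi = mapper.get_substrates_and_products_from_rhea("49528")

for chebi_uri in chebi["substrates"]:
    # May return None if the ChEBI page has no SMILES row
    smiles = mapper.get_smiles_from_chebi_web(chebi_uri)
    chebi_id = f"CHEBI:{chebi_uri.split('_')[-1]}"
    print(chebi_id, smiles)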
1 change: 0 additions & 1 deletion src/pyeed/analysis/mutation_detection.py
@@ -1,7 +1,6 @@
from typing import Any, Optional

from loguru import logger

from pyeed.dbconnect import DatabaseConnector

