From f661eb643abcde43362dae60773828363a595a9d Mon Sep 17 00:00:00 2001
From: alacheim
Date: Tue, 11 Mar 2025 10:30:16 +0000
Subject: [PATCH 01/19] added ncbi to uniprot mapper

---
 src/pyeed/adapter/ncbi_to_uniprot_mapper.py | 131 ++++++++++++++++++++
 src/pyeed/main.py                           |  14 +++
 2 files changed, 145 insertions(+)
 create mode 100644 src/pyeed/adapter/ncbi_to_uniprot_mapper.py

diff --git a/src/pyeed/adapter/ncbi_to_uniprot_mapper.py b/src/pyeed/adapter/ncbi_to_uniprot_mapper.py
new file mode 100644
index 00000000..3b6098ad
--- /dev/null
+++ b/src/pyeed/adapter/ncbi_to_uniprot_mapper.py
@@ -0,0 +1,131 @@
+import httpx
+import logging
+from pysam import FastaFile
+from crc64iso import crc64iso
+import sys
+import json
+import os
+from typing import List
+
+logger = logging.getLogger(__name__)
+
+class NCBIToUniprotMapper:
+    def __init__(self, ids):
+        self.ids = ids
+        self.uniparc_url = "https://www.ebi.ac.uk/proteins/api/uniparc?offset=0&size=100&sequencechecksum="
+        self.ncbi_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
+
+
+    def download_fasta(self, refseq_id: str) -> None:
+        """
+        Downloads a FASTA file for a given RefSeq ID using httpx and saves it locally.
+
+        Args:
+            refseq_id str: NCBI ID
+        """
+
+        params = {
+            "db": "protein",
+            "id": refseq_id,
+            "rettype": "fasta",
+            "retmode": "text"
+        }
+
+        try:
+            response = httpx.get(self.ncbi_url, params=params, timeout=10.0)
+
+            if response.status_code == 200:
+                filename = f"{refseq_id}.fasta"
+                with open(filename, "w") as f:
+                    f.write(response.text)
+                print(f"✅ Downloaded: {filename}")
+            else:
+                print(f"❌ Failed to download {refseq_id} (Status: {response.status_code})")
+
+        except httpx.HTTPError as e:
+            print(f"❌ HTTP error occurred while downloading {refseq_id}: {e}")
+
+    def get_checksum(self, refseq_id: str) -> str:
+        """Fetches and calculates the checksum for a given RefSeq ID.
+
+        Args:
+            refseq_id str: NCBI ID
+
+        Returns:
+            str: checksum ID
+        """
+
+        self.download_fasta(refseq_id)
+        fa = FastaFile(f"{refseq_id}.fasta")
+        seq = fa.fetch(fa.references[0])
+        return crc64iso.crc64(seq)
+
+    def checksum_list(self, refseq_ids: List[str]) -> List[str]:
+        """Creates a list of checksum IDs and deletes the FASTA files after processing.
+
+        Args:
+            refseq_ids str: NCBI IDs
+
+        Returns:
+            List[str]: checksum IDs
+        """
+
+        checksums = []
+        for refseq_id in refseq_ids:
+            checksums.append(self.get_checksum(refseq_id))
+            fasta_file_path = f"{refseq_id}.fasta"
+            fai_file_path = f"{refseq_id}.fasta.fai"
+
+            if os.path.exists(fasta_file_path):
+                os.remove(fasta_file_path)  # Delete the fasta file
+
+            if os.path.exists(fai_file_path):
+                os.remove(fai_file_path)
+        return checksums
+
+    def execute_request(self) -> None:
+        """Fetches the uniparc and uniprot ids for the given refseq ids and saves them in a json file.
+        """
+
+        checksum_list = self.checksum_list(self.ids)
+
+        id_mapping_uniprot = {}
+        id_mapping_uniparc = {}
+        counter = 0
+
+        for checksum in checksum_list:
+            url = f"{self.uniparc_url}{checksum}"
+
+            #perform request and get response as JSON
+            with httpx.Client() as client:
+                response = client.get(url, headers={ "Accept" : "application/json"})
+
+            #check if the request was successful
+            if response.status_code != 200:
+                print(f"Request failed with status code {r.status_code}")
+                response.raise_for_status()  # Raise exception for any non-200 response
+                sys.exit()
+
+            # Check if the response body is empty
+            if not response.content.strip():  # Check if the body is empty
+                print("The response body is empty.")
+                sys.exit()
+
+            #extracts the uniprot and the uniparc id from the response and saves them in a dictionary
+            response_body = response.json()
+            for item in response_body:
+                uniparc_id = item.get('accession', None)
+                for ref in item.get('dbReference', []):
+                    if ref.get('type') == 'UniProtKB/TrEMBL':
+                        uniprot_id = ref.get('id', None)
+            id_mapping_uniparc[self.ids[counter]] = uniparc_id
+            id_mapping_uniprot[self.ids[counter]] = uniprot_id
+            counter += 1
+
+        with open("id_mapping_uniprot.json", "w") as f:
+            json.dump(id_mapping_uniprot, f)
+
+        with open("id_mapping_uniparc.json", "w") as f:
+            json.dump(id_mapping_uniparc, f)
+
+
diff --git a/src/pyeed/main.py b/src/pyeed/main.py
index 5950965d..c7707d13 100644
--- a/src/pyeed/main.py
+++ b/src/pyeed/main.py
@@ -8,6 +8,7 @@
 from pyeed.adapter.ncbi_protein_mapper import NCBIProteinToPyeed
 from pyeed.adapter.primary_db_adapter import PrimaryDBAdapter
 from pyeed.adapter.uniprot_mapper import UniprotToPyeed
+from pyeed.adapter.ncbi_to_uniprot_mapper import NCBIToUniprotMapper
 from pyeed.dbchat import DBChat
 from pyeed.dbconnect import DatabaseConnector
 from pyeed.embedding import (
@@ -185,6 +186,19 @@ def fetch_ncbi_nucleotide(self, ids: list[str]) -> None:
         asyncio.run(adapter.execute_requests())
         nest_asyncio.apply()
 
+    def database_id_mapper(self, ids: list[str]) -> None:
+        """
+        Maps IDs from one database to another using the UniProt ID mapping service
+
+        Args:
+            ids (list[str]): List of IDs to map.
+ """ + + mapper = NCBIToUniprotMapper(ids) + mapper.execute_request() + + nest_asyncio.apply() def calculate_sequence_embeddings( self, From f44a2f3126d6b3ddb23082b2bdc193e5ace91956 Mon Sep 17 00:00:00 2001 From: alacheim Date: Thu, 13 Mar 2025 14:21:07 +0000 Subject: [PATCH 02/19] changes in mapper --- src/pyeed/adapter/ncbi_to_uniprot_mapper.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/pyeed/adapter/ncbi_to_uniprot_mapper.py b/src/pyeed/adapter/ncbi_to_uniprot_mapper.py index 3b6098ad..134d2ff5 100644 --- a/src/pyeed/adapter/ncbi_to_uniprot_mapper.py +++ b/src/pyeed/adapter/ncbi_to_uniprot_mapper.py @@ -127,5 +127,4 @@ def execute_request(self) -> None: with open("id_mapping_uniparc.json", "w") as f: json.dump(id_mapping_uniparc, f) - - + \ No newline at end of file From 27ff2317166e9c2b43572e08ca378240405fa7ab Mon Sep 17 00:00:00 2001 From: alacheim Date: Fri, 14 Mar 2025 09:46:57 +0000 Subject: [PATCH 03/19] fixed bug in organism mapper --- src/pyeed/adapter/ncbi_protein_mapper.py | 2 ++ src/pyeed/model.py | 14 ++++++++++++++ 2 files changed, 16 insertions(+) diff --git a/src/pyeed/adapter/ncbi_protein_mapper.py b/src/pyeed/adapter/ncbi_protein_mapper.py index e11d4fe7..3ecf485c 100644 --- a/src/pyeed/adapter/ncbi_protein_mapper.py +++ b/src/pyeed/adapter/ncbi_protein_mapper.py @@ -281,6 +281,8 @@ def add_to_db(self, response: Response) -> None: protein = Protein(**protein_data) protein.save() + if not isinstance(organism, Organism): + raise TypeError(f"Expected Organism, but got {type(organism)}") protein.organism.connect(organism) # Add features diff --git a/src/pyeed/model.py b/src/pyeed/model.py index 7a720560..c9c1d4a4 100644 --- a/src/pyeed/model.py +++ b/src/pyeed/model.py @@ -145,6 +145,20 @@ class Annotation(Enum): class Organism(StrictStructuredNode): taxonomy_id: int = IntegerProperty(required=True, unique_index=True) name = StringProperty() + + @classmethod + def get_or_save(cls, taxonomy_id, name) -> "Organism": + try: + organism = cls.nodes.get(taxonomy_id=taxonomy_id) + return organism + except cls.DoesNotExist: + try: + organism = cls(taxonomy_id=taxonomy_id, name=name) + organism.save() + return organism + except Exception as e: + print(f"Error during saving of the organism: {e}") + raise class SiteRel(StructuredRel): # type: ignore From faf38fd4a32ab6ba27edc78748069464fe066195 Mon Sep 17 00:00:00 2001 From: alacheim Date: Fri, 25 Apr 2025 10:09:19 +0000 Subject: [PATCH 04/19] added individual file name to mapper --- src/pyeed/adapter/ncbi_to_uniprot_mapper.py | 7 ++++--- src/pyeed/main.py | 4 ++-- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/src/pyeed/adapter/ncbi_to_uniprot_mapper.py b/src/pyeed/adapter/ncbi_to_uniprot_mapper.py index 134d2ff5..4ea11801 100644 --- a/src/pyeed/adapter/ncbi_to_uniprot_mapper.py +++ b/src/pyeed/adapter/ncbi_to_uniprot_mapper.py @@ -10,8 +10,9 @@ logger = logging.getLogger(__name__) class NCBIToUniprotMapper: - def __init__(self, ids): + def __init__(self, ids: List[str], file: str): self.ids = ids + self.file = file self.uniparc_url = "https://www.ebi.ac.uk/proteins/api/uniparc?offset=0&size=100&sequencechecksum=" self.ncbi_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi" @@ -122,9 +123,9 @@ def execute_request(self) -> None: id_mapping_uniprot[self.ids[counter]] = uniprot_id counter += 1 - with open("id_mapping_uniprot.json", "w") as f: + with open(f"{self.file}_uniprot.json", "w") as f: json.dump(id_mapping_uniprot, f) - with open("id_mapping_uniparc.json", "w") as f: + 
with open(f"{self.file}_uniparc.json", "w") as f: json.dump(id_mapping_uniparc, f) \ No newline at end of file diff --git a/src/pyeed/main.py b/src/pyeed/main.py index c7707d13..44fd6d52 100644 --- a/src/pyeed/main.py +++ b/src/pyeed/main.py @@ -187,7 +187,7 @@ def fetch_ncbi_nucleotide(self, ids: list[str]) -> None: asyncio.run(adapter.execute_requests()) nest_asyncio.apply() - def database_id_mapper(self, ids: list[str]) -> None: + def database_id_mapper(self, ids: list[str], file: str) -> None: """ Maps IDs from one database to another using the UniProt ID mapping service @@ -195,7 +195,7 @@ def database_id_mapper(self, ids: list[str]) -> None: ids (list[str]): List of IDs to map. """ - mapper = NCBIToUniprotMapper(ids) + mapper = NCBIToUniprotMapper(ids, file) mapper.execute_request() nest_asyncio.apply() From 056cb6b75cb45efc650648b6e2a7b56c446803b3 Mon Sep 17 00:00:00 2001 From: alacheim Date: Wed, 30 Apr 2025 09:50:29 +0000 Subject: [PATCH 05/19] fixing ruff errors --- src/pyeed/adapter/ncbi_to_uniprot_mapper.py | 11 ++++++----- src/pyeed/main.py | 11 ++++------- 2 files changed, 10 insertions(+), 12 deletions(-) diff --git a/src/pyeed/adapter/ncbi_to_uniprot_mapper.py b/src/pyeed/adapter/ncbi_to_uniprot_mapper.py index 4ea11801..6969e2e8 100644 --- a/src/pyeed/adapter/ncbi_to_uniprot_mapper.py +++ b/src/pyeed/adapter/ncbi_to_uniprot_mapper.py @@ -1,12 +1,13 @@ -import httpx +import json import logging -from pysam import FastaFile -from crc64iso import crc64iso -import sys -import json import os +import sys from typing import List +import httpx +from crc64iso import crc64iso +from pysam import FastaFile + logger = logging.getLogger(__name__) class NCBIToUniprotMapper: diff --git a/src/pyeed/main.py b/src/pyeed/main.py index c971fe16..7effe1f1 100644 --- a/src/pyeed/main.py +++ b/src/pyeed/main.py @@ -1,25 +1,22 @@ import asyncio -from typing import Any, Literal import time from concurrent.futures import ThreadPoolExecutor -import torch +from typing import Any, Literal import nest_asyncio +import torch from loguru import logger from pyeed.adapter.ncbi_dna_mapper import NCBIDNAToPyeed from pyeed.adapter.ncbi_protein_mapper import NCBIProteinToPyeed +from pyeed.adapter.ncbi_to_uniprot_mapper import NCBIToUniprotMapper from pyeed.adapter.primary_db_adapter import PrimaryDBAdapter from pyeed.adapter.uniprot_mapper import UniprotToPyeed -from pyeed.adapter.ncbi_to_uniprot_mapper import NCBIToUniprotMapper from pyeed.dbchat import DBChat from pyeed.dbconnect import DatabaseConnector from pyeed.embedding import ( - free_memory, - get_batch_embeddings, load_model_and_tokenizer, - update_protein_embeddings_in_db, - process_batches_on_gpu + process_batches_on_gpu, ) From e7369e93c2dfd504150afbe422c76c73a20043fd Mon Sep 17 00:00:00 2001 From: alacheim Date: Wed, 30 Apr 2025 10:05:54 +0000 Subject: [PATCH 06/19] added crc64iso to dependencies --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index dd10629a..41b2c8fc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,6 +36,7 @@ esm = "^3.1.3" rdflib = "^6.0.0" docker = "5.0.0" absl-py = "1.0.0" +crc64iso = "0.0.2" [tool.poetry.group.dev.dependencies] mkdocstrings = {extras = ["python"], version = "^0.26.2"} From f9897fe4e1185dfa0d5003b5881572a8e9355118 Mon Sep 17 00:00:00 2001 From: alacheim Date: Wed, 30 Apr 2025 10:09:18 +0000 Subject: [PATCH 07/19] added pysam to dependencies --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 
2a57449b..2f81fd56 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -38,6 +38,7 @@ docker = "5.0.0" absl-py = "1.0.0" crc64iso = "0.0.2" SPARQLWrapper = "2.0.0" +pysam = "0.23.0" [tool.poetry.group.dev.dependencies] mkdocstrings = {extras = ["python"], version = "^0.26.2"} From 9d81e9b2204023d8c2a2ebff0b8ac88ff7e384cb Mon Sep 17 00:00:00 2001 From: alacheim Date: Wed, 30 Apr 2025 10:21:54 +0000 Subject: [PATCH 08/19] fixing linting errors --- src/pyeed/adapter/ncbi_to_uniprot_mapper.py | 2 +- src/pyeed/analysis/sequence_alignment.py | 3 +-- src/pyeed/embedding.py | 5 ++--- 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/src/pyeed/adapter/ncbi_to_uniprot_mapper.py b/src/pyeed/adapter/ncbi_to_uniprot_mapper.py index 6969e2e8..8e543f52 100644 --- a/src/pyeed/adapter/ncbi_to_uniprot_mapper.py +++ b/src/pyeed/adapter/ncbi_to_uniprot_mapper.py @@ -104,7 +104,7 @@ def execute_request(self) -> None: #check if the request was successful if response.status_code != 200: - print(f"Request failed with status code {r.status_code}") + print(f"Request failed with status code {response.status_code}") response.raise_for_status() # Raise exception for any non-200 response sys.exit() diff --git a/src/pyeed/analysis/sequence_alignment.py b/src/pyeed/analysis/sequence_alignment.py index 946200b2..9255d55f 100644 --- a/src/pyeed/analysis/sequence_alignment.py +++ b/src/pyeed/analysis/sequence_alignment.py @@ -5,10 +5,9 @@ from Bio.Align import PairwiseAligner as BioPairwiseAligner from Bio.Align.substitution_matrices import Array as BioSubstitutionMatrix from joblib import Parallel, cpu_count, delayed -from rich.progress import Progress - from pyeed.dbconnect import DatabaseConnector from pyeed.tools.utility import chunks +from rich.progress import Progress class PairwiseAligner: diff --git a/src/pyeed/embedding.py b/src/pyeed/embedding.py index c8fa91db..1b0d4955 100644 --- a/src/pyeed/embedding.py +++ b/src/pyeed/embedding.py @@ -1,21 +1,20 @@ import gc import os from typing import Any, Tuple, Union -from loguru import logger import numpy as np import torch -from esm.models.esmc import ESMC from esm.models.esm3 import ESM3 +from esm.models.esmc import ESMC from esm.sdk.api import ESM3InferenceClient, ESMProtein, LogitsConfig, SamplingConfig from huggingface_hub import HfFolder, login +from loguru import logger from numpy.typing import NDArray from transformers import EsmModel, EsmTokenizer from pyeed.dbconnect import DatabaseConnector - def get_hf_token() -> str: """Get or request Hugging Face token.""" if os.getenv("PYTEST_DISABLE_HF_LOGIN"): # Disable Hugging Face login in tests From 3955718fa2d05a3739b1cb3c4e8bd3dc1c06d676 Mon Sep 17 00:00:00 2001 From: alacheim Date: Wed, 30 Apr 2025 11:29:50 +0000 Subject: [PATCH 09/19] reformatting --- src/pyeed/adapter/ncbi_to_uniprot_mapper.py | 68 ++++++++++----------- src/pyeed/adapter/uniprot_mapper.py | 36 +++++------ src/pyeed/analysis/network_analysis.py | 36 +++++------ src/pyeed/analysis/sequence_alignment.py | 13 ++-- src/pyeed/dbconnect.py | 7 ++- src/pyeed/embedding.py | 15 +++-- src/pyeed/main.py | 25 ++++---- src/pyeed/model.py | 23 ++++--- 8 files changed, 114 insertions(+), 109 deletions(-) diff --git a/src/pyeed/adapter/ncbi_to_uniprot_mapper.py b/src/pyeed/adapter/ncbi_to_uniprot_mapper.py index 8e543f52..2f711e16 100644 --- a/src/pyeed/adapter/ncbi_to_uniprot_mapper.py +++ b/src/pyeed/adapter/ncbi_to_uniprot_mapper.py @@ -10,18 +10,18 @@ logger = logging.getLogger(__name__) + class NCBIToUniprotMapper: def __init__(self, ids: 
List[str], file: str): self.ids = ids self.file = file self.uniparc_url = "https://www.ebi.ac.uk/proteins/api/uniparc?offset=0&size=100&sequencechecksum=" self.ncbi_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi" - - + def download_fasta(self, refseq_id: str) -> None: """ Downloads a FASTA file for a given RefSeq ID using httpx and saves it locally. - + Args: refseq_id str: NCBI ID """ @@ -30,9 +30,9 @@ def download_fasta(self, refseq_id: str) -> None: "db": "protein", "id": refseq_id, "rettype": "fasta", - "retmode": "text" + "retmode": "text", } - + try: response = httpx.get(self.ncbi_url, params=params, timeout=10.0) @@ -42,21 +42,23 @@ def download_fasta(self, refseq_id: str) -> None: f.write(response.text) print(f"✅ Downloaded: {filename}") else: - print(f"❌ Failed to download {refseq_id} (Status: {response.status_code})") + print( + f"❌ Failed to download {refseq_id} (Status: {response.status_code})" + ) except httpx.HTTPError as e: print(f"❌ HTTP error occurred while downloading {refseq_id}: {e}") def get_checksum(self, refseq_id: str) -> str: """Fetches and calculates the checksum for a given RefSeq ID. - - Args: + + Args: refseq_id str: NCBI ID - + Returns: str: checksum ID """ - + self.download_fasta(refseq_id) fa = FastaFile(f"{refseq_id}.fasta") seq = fa.fetch(fa.references[0]) @@ -71,7 +73,7 @@ def checksum_list(self, refseq_ids: List[str]) -> List[str]: Returns: List[str]: cheksum IDs """ - + checksums = [] for refseq_id in refseq_ids: checksums.append(self.get_checksum(refseq_id)) @@ -85,48 +87,46 @@ def checksum_list(self, refseq_ids: List[str]) -> List[str]: os.remove(fai_file_path) return checksums - def execute_request(self) -> None: - """Fetches the uniparc and uniprot ids for the given refseq ids and saves them in a json file. 
- """ - + def execute_request(self) -> None: + """Fetches the uniparc and uniprot ids for the given refseq ids and saves them in a json file.""" + checksum_list = self.checksum_list(self.ids) - + id_mapping_uniprot = {} id_mapping_uniparc = {} counter = 0 - - for checksum in checksum_list: + + for checksum in checksum_list: url = f"{self.uniparc_url}{checksum}" - - #perform request and get response as JSON + + # perform request and get response as JSON with httpx.Client() as client: - response = client.get(url, headers={ "Accept" : "application/json"}) - - #check if the request was successful + response = client.get(url, headers={"Accept": "application/json"}) + + # check if the request was successful if response.status_code != 200: print(f"Request failed with status code {response.status_code}") response.raise_for_status() # Raise exception for any non-200 response sys.exit() - + # Check if the response body is empty if not response.content.strip(): # Check if the body is empty print("The response body is empty.") sys.exit() - - #extracts the uniprot and the uniparc id from the repsonse and saves them in a dictionary + + # extracts the uniprot and the uniparc id from the repsonse and saves them in a dictionary response_body = response.json() - for item in response_body: - uniparc_id = item.get('accession', None) - for ref in item.get('dbReference', []): - if ref.get('type') == 'UniProtKB/TrEMBL': - uniprot_id = ref.get('id', None) + for item in response_body: + uniparc_id = item.get("accession", None) + for ref in item.get("dbReference", []): + if ref.get("type") == "UniProtKB/TrEMBL": + uniprot_id = ref.get("id", None) id_mapping_uniparc[self.ids[counter]] = uniparc_id id_mapping_uniprot[self.ids[counter]] = uniprot_id counter += 1 - + with open(f"{self.file}_uniprot.json", "w") as f: json.dump(id_mapping_uniprot, f) - + with open(f"{self.file}_uniparc.json", "w") as f: json.dump(id_mapping_uniparc, f) - \ No newline at end of file diff --git a/src/pyeed/adapter/uniprot_mapper.py b/src/pyeed/adapter/uniprot_mapper.py index 477249eb..36ed3577 100644 --- a/src/pyeed/adapter/uniprot_mapper.py +++ b/src/pyeed/adapter/uniprot_mapper.py @@ -82,13 +82,15 @@ def add_sites(self, record: dict[str, Any], protein: Protein) -> None: site.save() protein.site.connect(site, {"positions": positions}) - - def get_substrates_and_products_from_rhea(self, rhea_id: str) -> dict[str, List[str]]: + + def get_substrates_and_products_from_rhea( + self, rhea_id: str + ) -> dict[str, List[str]]: """Fetch substrates and products from Rhea by parsing the side URI (_L = substrate, _R = product). - + Args: rhea_id (str or int): The Rhea reaction ID (e.g., 49528) - + Returns: dict: { 'substrates': [list of chebi URIs], @@ -134,17 +136,13 @@ def get_substrates_and_products_from_rhea(self, rhea_id: str) -> dict[str, List[ elif side_uri.endswith("_R"): products.add(chebi_uri) - return { - "substrates": sorted(substrates), - "products": sorted(products) - } + return {"substrates": sorted(substrates), "products": sorted(products)} - def get_smiles_from_chebi_web(self, chebi_url: str) -> str: """ Extract SMILES from the official ChEBI page using HTML scraping. 
""" - chebi_id = chebi_url.split('_')[-1] + chebi_id = chebi_url.split("_")[-1] url = f"https://www.ebi.ac.uk/chebi/searchId.do?chebiId=CHEBI:{chebi_id}" response = requests.get(url) @@ -157,7 +155,6 @@ def get_smiles_from_chebi_web(self, chebi_url: str) -> str: if headers and "SMILES" in headers[0].text: data_cell = row.find_all("td")[-1] # Get the last in row return data_cell.text.strip() - def add_reaction(self, record: dict[str, Any], protein: Protein) -> None: for reference in record.get("comments", []): # Safe retrieval with .get() @@ -168,7 +165,7 @@ def add_reaction(self, record: dict[str, Any], protein: Protein) -> None: if db_ref.get("id", "").startswith("RHEA:"): rhea_id = db_ref["id"] break # Stop after finding the first match - + catalytic_annotation = Reaction.get_or_save( rhea_id=rhea_id, ) @@ -176,31 +173,30 @@ def add_reaction(self, record: dict[str, Any], protein: Protein) -> None: protein.reaction.connect(catalytic_annotation) def add_molecule(self, rhea_id: str, reaction: Reaction) -> None: - chebi = self.get_substrates_and_products_from_rhea(rhea_id) substrate_ids = chebi["substrates"] product_ids = chebi["products"] - + for i in substrate_ids: smiles = self.get_smiles_from_chebi_web(i) - - chebi_id = i.split('_')[-1] + + chebi_id = i.split("_")[-1] chebi_id = f"CHEBI:{chebi_id}" substrate = Molecule.get_or_save( chebi_id=chebi_id, - smiles = smiles, + smiles=smiles, ) reaction.substrate.connect(substrate) - + for i in product_ids: smiles = self.get_smiles_from_chebi_web(i) - chebi_id = i.split('_')[-1] + chebi_id = i.split("_")[-1] chebi_id = f"CHEBI:{chebi_id}" product = Molecule.get_or_save( chebi_id=chebi_id, - smiles = smiles, + smiles=smiles, ) reaction.product.connect(product) diff --git a/src/pyeed/analysis/network_analysis.py b/src/pyeed/analysis/network_analysis.py index fd354ebe..dd66b45c 100644 --- a/src/pyeed/analysis/network_analysis.py +++ b/src/pyeed/analysis/network_analysis.py @@ -66,57 +66,51 @@ def create_graph( base_query = """ MATCH (n) """ - + # Add node filters node_filters = [] if nodes: node_filters.append("labels(n)[0] IN $node_types") if ids: node_filters.append("n.accession_id IN $accession_ids") - + if node_filters: base_query += "WHERE " + " AND ".join(node_filters) - + # Add relationship pattern and filters base_query += """ OPTIONAL MATCH (n)-[r]->(m) """ - + # Add relationship type filter if specified if relationships: base_query += "WHERE type(r) IN $relationships " - + # Return both nodes and relationships in a single query base_query += """ RETURN collect(DISTINCT {id: ID(n), labels: labels(n), properties: properties(n)}) as nodes, collect(DISTINCT {source: ID(n), target: ID(m), type: type(r), properties: properties(r)}) as relationships """ - + logger.info("Executing combined query for nodes and relationships") results = self.db.execute_read( base_query, - { - "node_types": nodes, - "accession_ids": ids, - "relationships": relationships - } + {"node_types": nodes, "accession_ids": ids, "relationships": relationships}, ) - + if not results or not results[0]: logger.warning("No results found in the database") return self.graph - + # Process nodes nodes_data = results[0]["nodes"] for node in nodes_data: self.graph.add_node( - node["id"], - labels=node["labels"], - properties=node["properties"] + node["id"], labels=node["labels"], properties=node["properties"] ) logger.info(f"Added {len(nodes_data)} nodes to the graph") - + # Process relationships relationships_data = results[0]["relationships"] for rel in relationships_data: @@ 
-125,10 +119,10 @@ def create_graph( rel["source"], rel["target"], type=rel["type"], - properties=rel["properties"] + properties=rel["properties"], ) logger.info(f"Added {len(relationships_data)} relationships to the graph") - + return self.graph def compute_degree_centrality(self) -> dict[Any, float]: @@ -263,8 +257,8 @@ def calculate_positions_2d( filtered_graph.remove_edges_from(self_referential_edges) # Find isolated nodes - #isolated_nodes = self.find_isolated_nodes(filtered_graph) - #filtered_graph.remove_nodes_from(isolated_nodes) + # isolated_nodes = self.find_isolated_nodes(filtered_graph) + # filtered_graph.remove_nodes_from(isolated_nodes) # Use spring layout for force-directed graph weight_attr = attribute if attribute is not None else None diff --git a/src/pyeed/analysis/sequence_alignment.py b/src/pyeed/analysis/sequence_alignment.py index 9255d55f..d57c5e63 100644 --- a/src/pyeed/analysis/sequence_alignment.py +++ b/src/pyeed/analysis/sequence_alignment.py @@ -141,12 +141,17 @@ def align_multipairwise( MATCH (p1:Protein)-[:PAIRWISE_ALIGNED]->(p2:Protein) RETURN p1.accession_id AS Protein1_ID, p2.accession_id AS Protein2_ID """ - + # Fetch results properly as a list of tuples - existing_pairs = set(tuple(sorted((row["Protein1_ID"], row["Protein2_ID"]))) for row in db.execute_write(query)) + existing_pairs = set( + tuple(sorted((row["Protein1_ID"], row["Protein2_ID"]))) + for row in db.execute_write(query) + ) # Filter new pairs that are not in existing_pairs - new_pairs = [pair for pair in pairs if tuple(sorted(pair)) not in existing_pairs] + new_pairs = [ + pair for pair in pairs if tuple(sorted(pair)) not in existing_pairs + ] print(f"Number of existing pairs: {len(existing_pairs)}") print(f"Number of total pairs: {len(pairs)}") @@ -351,4 +356,4 @@ def _get_id_sequence_dict( def _load_substitution_matrix(self) -> "BioSubstitutionMatrix": from Bio.Align import substitution_matrices - return substitution_matrices.load(self.substitution_matrix) # type: ignore \ No newline at end of file + return substitution_matrices.load(self.substitution_matrix) # type: ignore diff --git a/src/pyeed/dbconnect.py b/src/pyeed/dbconnect.py index d208deec..8abcab52 100644 --- a/src/pyeed/dbconnect.py +++ b/src/pyeed/dbconnect.py @@ -227,8 +227,11 @@ def _get_driver(uri: str, user: str | None, password: str | None) -> Driver: Creates a new Neo4j driver instance. """ auth = (user, password) if user and password else None - return GraphDatabase.driver(uri, auth=auth, connection_timeout=60, # Increase initial connection timeout - max_connection_lifetime=86400, # Keep connections alive longer + return GraphDatabase.driver( + uri, + auth=auth, + connection_timeout=60, # Increase initial connection timeout + max_connection_lifetime=86400, # Keep connections alive longer ) @property diff --git a/src/pyeed/embedding.py b/src/pyeed/embedding.py index 1b0d4955..a0229385 100644 --- a/src/pyeed/embedding.py +++ b/src/pyeed/embedding.py @@ -31,6 +31,7 @@ def get_hf_token() -> str: else: raise RuntimeError("Failed to get Hugging Face token") + def process_batches_on_gpu(data, batch_size, model, tokenizer, device, db): """ Splits data into batches and processes them on a single GPU. 
@@ -64,15 +65,21 @@ def process_batches_on_gpu(data, batch_size, model, tokenizer, device, db): ) # Update the database - update_protein_embeddings_in_db(db, list(accessions[:current_batch_size]), embeddings_batch) + update_protein_embeddings_in_db( + db, list(accessions[:current_batch_size]), embeddings_batch + ) # Move to the next batch break # Successful execution, move to the next batch except torch.cuda.OutOfMemoryError: torch.cuda.empty_cache() - current_batch_size = max(1, current_batch_size // 2) # Reduce batch size - logger.warning(f"Reduced batch size to {current_batch_size} due to OOM error.") + current_batch_size = max( + 1, current_batch_size // 2 + ) # Reduce batch size + logger.warning( + f"Reduced batch size to {current_batch_size} due to OOM error." + ) # Free memory del model @@ -82,7 +89,7 @@ def process_batches_on_gpu(data, batch_size, model, tokenizer, device, db): def load_model_and_tokenizer( model_name: str, device: str, - ) -> Tuple[Any, Union[Any, None], str]: +) -> Tuple[Any, Union[Any, None], str]: """ Loads the model and assigns it to a specific GPU. diff --git a/src/pyeed/main.py b/src/pyeed/main.py index 7effe1f1..5ba41d0d 100644 --- a/src/pyeed/main.py +++ b/src/pyeed/main.py @@ -190,8 +190,7 @@ def fetch_ncbi_nucleotide(self, ids: list[str]) -> None: # Fix: apply nest_asyncio and then run the coroutine with the event loop nest_asyncio.apply() asyncio.get_event_loop().run_until_complete(adapter.execute_requests()) - - + def database_id_mapper(self, ids: list[str], file: str) -> None: """ Maps IDs from one database to another using the UniProt ID mapping service @@ -202,7 +201,7 @@ def database_id_mapper(self, ids: list[str], file: str) -> None: mapper = NCBIToUniprotMapper(ids, file) mapper.execute_request() - + nest_asyncio.apply() def calculate_sequence_embeddings( @@ -210,9 +209,9 @@ def calculate_sequence_embeddings( batch_size: int = 16, model_name: str = "facebook/esm2_t33_650M_UR50D", num_gpus: int = None, # Number of GPUs to use - ) -> None: + ) -> None: """ - Calculates embeddings for all sequences in the database that do not have embeddings, + Calculates embeddings for all sequences in the database that do not have embeddings, distributing the workload across available GPUs. 
Args: @@ -243,18 +242,19 @@ def calculate_sequence_embeddings( """ results = self.db.execute_read(query) data = [(result["accession"], result["sequence"]) for result in results] - + if not data: logger.info("No sequences to process.") return - + accessions, sequences = zip(*data) total_sequences = len(sequences) logger.debug(f"Total sequences to process: {total_sequences}") # Split the data into num_gpus chunks gpu_batches = [ - list(zip(accessions[i::num_gpus], sequences[i::num_gpus])) for i in range(num_gpus) + list(zip(accessions[i::num_gpus], sequences[i::num_gpus])) + for i in range(num_gpus) ] start_time = time.time() @@ -275,16 +275,17 @@ def calculate_sequence_embeddings( model, tokenizer, device, - self.db + self.db, ) ) - + for future in futures: future.result() # Wait for all threads to complete - end_time = time.time() - logger.info(f"Total embedding calculation time: {end_time - start_time:.2f} seconds") + logger.info( + f"Total embedding calculation time: {end_time - start_time:.2f} seconds" + ) # Cleanup for model, _, _ in models_and_tokenizers: diff --git a/src/pyeed/model.py b/src/pyeed/model.py index 19d83e8b..c7a193b7 100644 --- a/src/pyeed/model.py +++ b/src/pyeed/model.py @@ -112,13 +112,12 @@ def save(self, *args: Any, **kwargs: Any) -> None: elif isinstance(base_property, FloatProperty): if not all(isinstance(item, float) for item in prop): raise TypeError(f"All items in '{field}' must be floats") - - #Validate BoleanProperty + + # Validate BoleanProperty elif isinstance(neo_type, BooleanProperty) and not isinstance(prop, bool): raise TypeError( f"Expected a boolean for '{field}', got {type(prop).__name__}" ) - super().save(*args, **kwargs) # Don't return the result @@ -153,7 +152,7 @@ class Annotation(Enum): class Organism(StrictStructuredNode): taxonomy_id = IntegerProperty(required=True, unique_index=True) name = StringProperty() - + @classmethod def get_or_save(cls, taxonomy_id, name) -> "Organism": try: @@ -395,25 +394,25 @@ class Reaction(StrictStructuredNode): """ A node representing a reaction. """ - + rhea_id = StringProperty(unique_index=True, required=True) chebi_id = ArrayProperty(StringProperty()) # Relationships substrate = RelationshipTo("Molecule", "SUBSTRATE") product = RelationshipTo("Molecule", "PRODUCT") - - + @property def label(self) -> str: """The label of the reaction.""" return {self.rhea_id} + class Molecule(StrictStructuredNode): """ A node representing a molecule in the database. 
""" - + chebi_id = StringProperty(unique_index=True, required=True) rhea_compound_id = StringProperty() smiles = StringProperty() @@ -431,13 +430,13 @@ def get_or_save(cls, chebi_id, smiles) -> "Molecule": except Exception as e: print(f"Error during saving of the molecule: {e}") raise - - @property + + @property def label(self) -> str: """The label of the molecule.""" return {self.chebi_id} - - + + class StandardNumbering(StrictStructuredNode): name = StringProperty(required=True, unique_index=True) definition = StringProperty(required=True) From 9b7f28b4b0649a37e654d604ad3a479bcb555e18 Mon Sep 17 00:00:00 2001 From: alacheim Date: Wed, 30 Apr 2025 12:51:26 +0000 Subject: [PATCH 10/19] fixing mypy errors --- pyproject.toml | 1 + src/pyeed/adapter/ncbi_to_uniprot_mapper.py | 2 +- src/pyeed/model.py | 18 +++++++++++------- 3 files changed, 13 insertions(+), 8 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 2f81fd56..948c493b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -39,6 +39,7 @@ absl-py = "1.0.0" crc64iso = "0.0.2" SPARQLWrapper = "2.0.0" pysam = "0.23.0" +types-requests = "2.32.0" [tool.poetry.group.dev.dependencies] mkdocstrings = {extras = ["python"], version = "^0.26.2"} diff --git a/src/pyeed/adapter/ncbi_to_uniprot_mapper.py b/src/pyeed/adapter/ncbi_to_uniprot_mapper.py index 2f711e16..1373547a 100644 --- a/src/pyeed/adapter/ncbi_to_uniprot_mapper.py +++ b/src/pyeed/adapter/ncbi_to_uniprot_mapper.py @@ -62,7 +62,7 @@ def get_checksum(self, refseq_id: str) -> str: self.download_fasta(refseq_id) fa = FastaFile(f"{refseq_id}.fasta") seq = fa.fetch(fa.references[0]) - return crc64iso.crc64(seq) + return f"{crc64iso.crc64(seq)}" def checksum_list(self, refseq_ids: List[str]) -> List[str]: """Creates a list of checksum IDs and deletes the FASTA files after processing. 
diff --git a/src/pyeed/model.py b/src/pyeed/model.py index c7a193b7..aa498aee 100644 --- a/src/pyeed/model.py +++ b/src/pyeed/model.py @@ -1,5 +1,5 @@ from enum import Enum -from typing import Any +from typing import Any, cast # from pyeed.nodes_and_relations import StrictStructuredNode from neomodel import ( @@ -154,9 +154,11 @@ class Organism(StrictStructuredNode): name = StringProperty() @classmethod - def get_or_save(cls, taxonomy_id, name) -> "Organism": + def get_or_save(cls, **kwargs: Any) -> "Organism": + taxonomy_id = kwargs.get("taxonomy_id") + name = kwargs.get("name") try: - organism = cls.nodes.get(taxonomy_id=taxonomy_id) + organism = cast(Organism, cls.nodes.get(taxonomy_id=taxonomy_id)) return organism except cls.DoesNotExist: try: @@ -405,7 +407,7 @@ class Reaction(StrictStructuredNode): @property def label(self) -> str: """The label of the reaction.""" - return {self.rhea_id} + return f"{self.rhea_id}" class Molecule(StrictStructuredNode): @@ -418,9 +420,11 @@ class Molecule(StrictStructuredNode): smiles = StringProperty() @classmethod - def get_or_save(cls, chebi_id, smiles) -> "Molecule": + def get_or_save(cls, **kwargs:Any) -> "Molecule": + chebi_id = kwargs.get("chebi_id") + smiles = kwargs.get("smiles") try: - molecule = cls.nodes.get(chebi_id=chebi_id) + molecule = cast(Molecule, cls.nodes.get(chebi_id=chebi_id)) return molecule except cls.DoesNotExist: try: @@ -434,7 +438,7 @@ def get_or_save(cls, chebi_id, smiles) -> "Molecule": @property def label(self) -> str: """The label of the molecule.""" - return {self.chebi_id} + return f"{self.chebi_id}" class StandardNumbering(StrictStructuredNode): From cf742d77f52e9533e6b0b5813a70ac29d0756ef8 Mon Sep 17 00:00:00 2001 From: alacheim Date: Wed, 30 Apr 2025 14:02:52 +0000 Subject: [PATCH 11/19] fixing mypy errors --- src/pyeed/adapter/uniprot_mapper.py | 29 ++++++++++++++++++++--------- src/pyeed/embedding.py | 28 ++++++++++++++++++++-------- src/pyeed/main.py | 11 +++++++---- 3 files changed, 47 insertions(+), 21 deletions(-) diff --git a/src/pyeed/adapter/uniprot_mapper.py b/src/pyeed/adapter/uniprot_mapper.py index 36ed3577..7f26893e 100644 --- a/src/pyeed/adapter/uniprot_mapper.py +++ b/src/pyeed/adapter/uniprot_mapper.py @@ -1,9 +1,9 @@ import json from collections import defaultdict -from typing import Any, List +from typing import Any, List, Optional import requests -from bs4 import BeautifulSoup +from bs4 import BeautifulSoup, Tag from httpx import Response from loguru import logger from SPARQLWrapper import JSON, SPARQLWrapper @@ -120,7 +120,11 @@ def get_substrates_and_products_from_rhea( sparql.setReturnFormat(JSON) sparql.addCustomHttpHeader("User-Agent", "MyPythonClient/1.0") - results = sparql.query().convert() + results_raw = sparql.query().convert() + if not isinstance(results_raw, dict): + raise TypeError("Expected dict from SPARQL query") + + results: dict[str, Any] = results_raw substrates = set() products = set() @@ -138,7 +142,7 @@ def get_substrates_and_products_from_rhea( return {"substrates": sorted(substrates), "products": sorted(products)} - def get_smiles_from_chebi_web(self, chebi_url: str) -> str: + def get_smiles_from_chebi_web(self, chebi_url: str) -> Optional[str]: """ Extract SMILES from the official ChEBI page using HTML scraping. 
""" @@ -150,11 +154,17 @@ def get_smiles_from_chebi_web(self, chebi_url: str) -> str: # Look for table rows that contain the SMILES label for table in soup.find_all("table", class_="chebiTableContent"): + if not isinstance(table, Tag): + continue for row in table.find_all("tr"): + if not isinstance(row, Tag): + continue headers = row.find_all("td", class_="chebiDataHeader") - if headers and "SMILES" in headers[0].text: - data_cell = row.find_all("td")[-1] # Get the last in row - return data_cell.text.strip() + if headers and isinstance(headers[0], Tag) and "SMILES" in headers[0].text: + data_cells = row.find_all("td") + if data_cells: + return f"{data_cells[-1].text.strip()}" + return None def add_reaction(self, record: dict[str, Any], protein: Protein) -> None: for reference in record.get("comments", []): # Safe retrieval with .get() @@ -169,8 +179,9 @@ def add_reaction(self, record: dict[str, Any], protein: Protein) -> None: catalytic_annotation = Reaction.get_or_save( rhea_id=rhea_id, ) - self.add_molecule(rhea_id, catalytic_annotation) - protein.reaction.connect(catalytic_annotation) + if rhea_id is not None: + self.add_molecule(rhea_id, catalytic_annotation) + protein.reaction.connect(catalytic_annotation) def add_molecule(self, rhea_id: str, reaction: Reaction) -> None: chebi = self.get_substrates_and_products_from_rhea(rhea_id) diff --git a/src/pyeed/embedding.py b/src/pyeed/embedding.py index a0229385..05895dfc 100644 --- a/src/pyeed/embedding.py +++ b/src/pyeed/embedding.py @@ -11,6 +11,7 @@ from loguru import logger from numpy.typing import NDArray from transformers import EsmModel, EsmTokenizer +from torch.nn import DataParallel, Module from pyeed.dbconnect import DatabaseConnector @@ -32,7 +33,14 @@ def get_hf_token() -> str: raise RuntimeError("Failed to get Hugging Face token") -def process_batches_on_gpu(data, batch_size, model, tokenizer, device, db): +def process_batches_on_gpu( + data: list[tuple[str, str]], + batch_size: int, + model:Module, + tokenizer: EsmTokenizer, + db:DatabaseConnector, + device:torch.device, + ) -> None: """ Splits data into batches and processes them on a single GPU. @@ -88,8 +96,8 @@ def process_batches_on_gpu(data, batch_size, model, tokenizer, device, db): def load_model_and_tokenizer( model_name: str, - device: str, -) -> Tuple[Any, Union[Any, None], str]: + device:torch.device, +) -> Tuple[Any, Union[Any, None], torch.device]: """ Loads the model and assigns it to a specific GPU. @@ -125,7 +133,7 @@ def get_batch_embeddings( model: Union[ EsmModel, ESMC, - torch.nn.DataParallel, + DataParallel[Module], ESM3InferenceClient, ESM3, ], @@ -209,7 +217,9 @@ def get_batch_embeddings( def calculate_single_sequence_embedding_last_hidden_state( - sequence: str, model_name: str = "facebook/esm2_t33_650M_UR50D" + sequence: str, + device: torch.device, + model_name: str = "facebook/esm2_t33_650M_UR50D", ) -> NDArray[np.float64]: """ Calculates an embedding for a single sequence. 
@@ -221,12 +231,14 @@ def calculate_single_sequence_embedding_last_hidden_state( Returns: NDArray[np.float64]: Normalized embedding vector for the sequence """ - model, tokenizer, device = load_model_and_tokenizer(model_name) + model, tokenizer, device = load_model_and_tokenizer(model_name, device) return get_single_embedding_last_hidden_state(sequence, model, tokenizer, device) def calculate_single_sequence_embedding_all_layers( - sequence: str, model_name: str = "facebook/esm2_t33_650M_UR50D" + sequence: str, + device: torch.device, + model_name: str = "facebook/esm2_t33_650M_UR50D", ) -> NDArray[np.float64]: """ Calculates embeddings for a single sequence across all layers. @@ -238,7 +250,7 @@ def calculate_single_sequence_embedding_all_layers( Returns: NDArray[np.float64]: A numpy array containing layer embeddings for the sequence. """ - model, tokenizer, device = load_model_and_tokenizer(model_name) + model, tokenizer, device = load_model_and_tokenizer(model_name, device) return get_single_embedding_all_layers(sequence, model, tokenizer, device) diff --git a/src/pyeed/main.py b/src/pyeed/main.py index 5ba41d0d..4dee81e0 100644 --- a/src/pyeed/main.py +++ b/src/pyeed/main.py @@ -208,7 +208,7 @@ def calculate_sequence_embeddings( self, batch_size: int = 16, model_name: str = "facebook/esm2_t33_650M_UR50D", - num_gpus: int = None, # Number of GPUs to use + num_gpus: int = 1, # Number of GPUs to use ) -> None: """ Calculates embeddings for all sequences in the database that do not have embeddings, @@ -229,9 +229,12 @@ def calculate_sequence_embeddings( logger.warning("No GPU available! Running on CPU.") # Load separate models for each GPU - devices = [f"cuda:{i}" for i in range(num_gpus)] if num_gpus > 0 else ["cpu"] + devices = [ + torch.device(f"cuda:{i}") for i in range(num_gpus) + ] if num_gpus > 0 else [torch.device("cpu")] + models_and_tokenizers = [ - load_model_and_tokenizer(model_name, device) for device in devices + load_model_and_tokenizer(model_name, device) for device in devices ] # Retrieve sequences without embeddings @@ -274,8 +277,8 @@ def calculate_sequence_embeddings( batch_size, model, tokenizer, - device, self.db, + device, ) ) From f073cd6e03f61b7e3a4a65e946d8ae0f2ff23bdd Mon Sep 17 00:00:00 2001 From: alacheim Date: Wed, 30 Apr 2025 14:11:28 +0000 Subject: [PATCH 12/19] fixing ruff error --- src/pyeed/embedding.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pyeed/embedding.py b/src/pyeed/embedding.py index 05895dfc..5686ca36 100644 --- a/src/pyeed/embedding.py +++ b/src/pyeed/embedding.py @@ -10,8 +10,8 @@ from huggingface_hub import HfFolder, login from loguru import logger from numpy.typing import NDArray -from transformers import EsmModel, EsmTokenizer from torch.nn import DataParallel, Module +from transformers import EsmModel, EsmTokenizer from pyeed.dbconnect import DatabaseConnector From 3ec0368351652eba4fbc18ddc818460c2e4381e4 Mon Sep 17 00:00:00 2001 From: alacheim Date: Wed, 30 Apr 2025 14:13:18 +0000 Subject: [PATCH 13/19] fixing ruff error --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 948c493b..b9897071 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -39,7 +39,7 @@ absl-py = "1.0.0" crc64iso = "0.0.2" SPARQLWrapper = "2.0.0" pysam = "0.23.0" -types-requests = "2.32.0" +types-requests = "2.32.0.20250328" [tool.poetry.group.dev.dependencies] mkdocstrings = {extras = ["python"], version = "^0.26.2"} From 37afe5c31421a124bcbb5a15fc3cb1bc3b3e321c Mon Sep 
17 00:00:00 2001 From: alacheim Date: Wed, 30 Apr 2025 14:17:37 +0000 Subject: [PATCH 14/19] trigger pipeline --- src/pyeed/adapter/ncbi_to_uniprot_mapper.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/pyeed/adapter/ncbi_to_uniprot_mapper.py b/src/pyeed/adapter/ncbi_to_uniprot_mapper.py index 1373547a..78b493a5 100644 --- a/src/pyeed/adapter/ncbi_to_uniprot_mapper.py +++ b/src/pyeed/adapter/ncbi_to_uniprot_mapper.py @@ -10,7 +10,6 @@ logger = logging.getLogger(__name__) - class NCBIToUniprotMapper: def __init__(self, ids: List[str], file: str): self.ids = ids From d84c74e7afabf2ef518220b84ade1b2dfca090cc Mon Sep 17 00:00:00 2001 From: alacheim Date: Wed, 30 Apr 2025 14:23:10 +0000 Subject: [PATCH 15/19] formated with ruff --- src/pyeed/adapter/ncbi_to_uniprot_mapper.py | 1 + src/pyeed/adapter/uniprot_mapper.py | 6 +++++- src/pyeed/embedding.py | 20 ++++++++++---------- src/pyeed/main.py | 10 ++++++---- src/pyeed/model.py | 2 +- 5 files changed, 23 insertions(+), 16 deletions(-) diff --git a/src/pyeed/adapter/ncbi_to_uniprot_mapper.py b/src/pyeed/adapter/ncbi_to_uniprot_mapper.py index 78b493a5..1373547a 100644 --- a/src/pyeed/adapter/ncbi_to_uniprot_mapper.py +++ b/src/pyeed/adapter/ncbi_to_uniprot_mapper.py @@ -10,6 +10,7 @@ logger = logging.getLogger(__name__) + class NCBIToUniprotMapper: def __init__(self, ids: List[str], file: str): self.ids = ids diff --git a/src/pyeed/adapter/uniprot_mapper.py b/src/pyeed/adapter/uniprot_mapper.py index 7f26893e..f52d01b3 100644 --- a/src/pyeed/adapter/uniprot_mapper.py +++ b/src/pyeed/adapter/uniprot_mapper.py @@ -160,7 +160,11 @@ def get_smiles_from_chebi_web(self, chebi_url: str) -> Optional[str]: if not isinstance(row, Tag): continue headers = row.find_all("td", class_="chebiDataHeader") - if headers and isinstance(headers[0], Tag) and "SMILES" in headers[0].text: + if ( + headers + and isinstance(headers[0], Tag) + and "SMILES" in headers[0].text + ): data_cells = row.find_all("td") if data_cells: return f"{data_cells[-1].text.strip()}" diff --git a/src/pyeed/embedding.py b/src/pyeed/embedding.py index 5686ca36..28f66a1b 100644 --- a/src/pyeed/embedding.py +++ b/src/pyeed/embedding.py @@ -34,13 +34,13 @@ def get_hf_token() -> str: def process_batches_on_gpu( - data: list[tuple[str, str]], - batch_size: int, - model:Module, - tokenizer: EsmTokenizer, - db:DatabaseConnector, - device:torch.device, - ) -> None: + data: list[tuple[str, str]], + batch_size: int, + model: Module, + tokenizer: EsmTokenizer, + db: DatabaseConnector, + device: torch.device, +) -> None: """ Splits data into batches and processes them on a single GPU. @@ -96,7 +96,7 @@ def process_batches_on_gpu( def load_model_and_tokenizer( model_name: str, - device:torch.device, + device: torch.device, ) -> Tuple[Any, Union[Any, None], torch.device]: """ Loads the model and assigns it to a specific GPU. 
@@ -217,7 +217,7 @@ def get_batch_embeddings( def calculate_single_sequence_embedding_last_hidden_state( - sequence: str, + sequence: str, device: torch.device, model_name: str = "facebook/esm2_t33_650M_UR50D", ) -> NDArray[np.float64]: @@ -236,7 +236,7 @@ def calculate_single_sequence_embedding_last_hidden_state( def calculate_single_sequence_embedding_all_layers( - sequence: str, + sequence: str, device: torch.device, model_name: str = "facebook/esm2_t33_650M_UR50D", ) -> NDArray[np.float64]: diff --git a/src/pyeed/main.py b/src/pyeed/main.py index 4dee81e0..1189fcb3 100644 --- a/src/pyeed/main.py +++ b/src/pyeed/main.py @@ -229,12 +229,14 @@ def calculate_sequence_embeddings( logger.warning("No GPU available! Running on CPU.") # Load separate models for each GPU - devices = [ - torch.device(f"cuda:{i}") for i in range(num_gpus) - ] if num_gpus > 0 else [torch.device("cpu")] + devices = ( + [torch.device(f"cuda:{i}") for i in range(num_gpus)] + if num_gpus > 0 + else [torch.device("cpu")] + ) models_and_tokenizers = [ - load_model_and_tokenizer(model_name, device) for device in devices + load_model_and_tokenizer(model_name, device) for device in devices ] # Retrieve sequences without embeddings diff --git a/src/pyeed/model.py b/src/pyeed/model.py index aa498aee..5a3bf188 100644 --- a/src/pyeed/model.py +++ b/src/pyeed/model.py @@ -420,7 +420,7 @@ class Molecule(StrictStructuredNode): smiles = StringProperty() @classmethod - def get_or_save(cls, **kwargs:Any) -> "Molecule": + def get_or_save(cls, **kwargs: Any) -> "Molecule": chebi_id = kwargs.get("chebi_id") smiles = kwargs.get("smiles") try: From 014dcdc220e309205bfb7cd9982f20f5750bd06d Mon Sep 17 00:00:00 2001 From: Niklas Abraham GPU Date: Thu, 1 May 2025 13:48:08 +0000 Subject: [PATCH 16/19] fixed linter issue in sequence alignment --- src/pyeed/analysis/sequence_alignment.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/src/pyeed/analysis/sequence_alignment.py b/src/pyeed/analysis/sequence_alignment.py index d57c5e63..9b68818e 100644 --- a/src/pyeed/analysis/sequence_alignment.py +++ b/src/pyeed/analysis/sequence_alignment.py @@ -5,9 +5,10 @@ from Bio.Align import PairwiseAligner as BioPairwiseAligner from Bio.Align.substitution_matrices import Array as BioSubstitutionMatrix from joblib import Parallel, cpu_count, delayed +from rich.progress import Progress + from pyeed.dbconnect import DatabaseConnector from pyeed.tools.utility import chunks -from rich.progress import Progress class PairwiseAligner: @@ -142,11 +143,13 @@ def align_multipairwise( RETURN p1.accession_id AS Protein1_ID, p2.accession_id AS Protein2_ID """ - # Fetch results properly as a list of tuples - existing_pairs = set( - tuple(sorted((row["Protein1_ID"], row["Protein2_ID"]))) - for row in db.execute_write(query) - ) + if db is not None: + existing_pairs = set( + tuple(sorted((row["Protein1_ID"], row["Protein2_ID"]))) + for row in db.execute_write(query) + ) + else: + existing_pairs = set() # Filter new pairs that are not in existing_pairs new_pairs = [ From a435764e26f07ca9094bf016f44b1676cccbd732 Mon Sep 17 00:00:00 2001 From: Niklas Abraham GPU Date: Thu, 1 May 2025 14:17:08 +0000 Subject: [PATCH 17/19] fixed ruff files --- docs/usage/blast.ipynb | 723 ++++++++++++----------- docs/usage/clustalo.ipynb | 338 +++++------ docs/usage/embeddings_analysis.ipynb | 5 +- docs/usage/mmseqs.ipynb | 3 +- docs/usage/mutation_analysis.ipynb | 1 + docs/usage/network_analysis.ipynb | 1 + docs/usage/standard_numbering.ipynb | 2 +- 
src/pyeed/analysis/embedding_analysis.py | 3 +- src/pyeed/analysis/network_analysis.py | 1 + src/pyeed/analysis/ontology_loading.py | 3 +- src/pyeed/analysis/standard_numbering.py | 1 + tests/unit/test_dbchat.py | 1 + 12 files changed, 545 insertions(+), 537 deletions(-) diff --git a/docs/usage/blast.ipynb b/docs/usage/blast.ipynb index b56140d7..d6cd57ef 100644 --- a/docs/usage/blast.ipynb +++ b/docs/usage/blast.ipynb @@ -1,363 +1,364 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# BLAST Search\n", - "\n", - "## Setup\n", - "\n", - "The BLAST service runs in a Docker container and requires:\n", - "1. A local BLAST database\n", - "2. The Docker service running" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "# change log level to INFO\n", - "import sys\n", - "from loguru import logger\n", - "\n", - "logger.remove()\n", - "level = logger.add(sys.stderr, level=\"WARNING\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Basic Usage\n", - "\n", - "The `Blast` class provides an interface to search protein or nucleotide sequences against a local BLAST database." - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
subject_ididentityalignment_lengthmismatchesgap_opensquery_startquery_endsubject_startsubject_endevaluebit_score
0seq781.8182231315111320.00322.3
1seq1100.00025001251250.00422.3
2seq261.5382610020455300.03819.2
\n", - "
" - ], - "text/plain": [ - " subject_id identity alignment_length mismatches gap_opens query_start \\\n", - "0 seq7 81.818 22 3 1 31 \n", - "1 seq1 100.000 25 0 0 1 \n", - "2 seq2 61.538 26 10 0 20 \n", - "\n", - " query_end subject_start subject_end evalue bit_score \n", - "0 51 11 32 0.003 22.3 \n", - "1 25 1 25 0.004 22.3 \n", - "2 45 5 30 0.038 19.2 " - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from pyeed.tools import Blast\n", - "\n", - "# Example protein sequence\n", - "sequence = \"MSEQVAAVAKLRAKASEAAKEAKAREAAKKLAEAAKKAKAKEAAKRAEAKLAEKAKAAKRAEAKAAKEAKRAAAKRAEAKLAEKAKAAK\"\n", - "\n", - "# Initialize BLAST search\n", - "blast = Blast(\n", - " # service_url=\"http://localhost:6001/blast\",\n", - " mode=\"blastp\", # Use blastp for protein sequences\n", - " db_path=\"/usr/local/bin/data/test_db\", # Path in Docker container\n", - " db_name=\"protein_db\", # Name of your BLAST database\n", - " evalue=0.1, # E-value threshold\n", - " max_target_seqs=10, # Maximum number of hits to return\n", - ")\n", - "\n", - "# Perform search\n", - "results = blast.search(sequence)\n", - "results" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The results are returned as a pandas DataFrame with the following columns:\n", - "- subject_id: ID of the matched sequence\n", - "- identity: Percentage identity\n", - "- alignment_length: Length of the alignment\n", - "- mismatches: Number of mismatches\n", - "- gap_opens: Number of gap openings\n", - "- query_start/end: Start/end positions in query sequence\n", - "- subject_start/end: Start/end positions in subject sequence\n", - "- evalue: Expectation value\n", - "- bit_score: Bit score" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Creating a BLAST Database\n", - "\n", - "Before using BLAST, you need to create a local database. Here's how to create one from a FASTA file:" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "```bash\n", - "# For protein sequences\n", - "makeblastdb -in proteins.fasta -dbtype prot -out blast_db/my_proteins\n", - "\n", - "# For nucleotide sequences\n", - "makeblastdb -in nucleotides.fasta -dbtype nucl -out blast_db/my_nucleotides\n", - "```\n", - "\n", - "To access the BLAST Docker container shell and create databases:\n", - "\n", - "```bash\n", - "# Enter the BLAST container shell\n", - "docker compose exec blast bash\n", - "# \n", - "# Navigate to database directory\n", - "cd /usr/local/bin/data/blast_db\n", - "# \n", - "# Create protein database\n", - "makeblastdb -in proteins.fasta -dbtype prot -out my_proteins\n", - "# \n", - "# Create nucleotide database \n", - "makeblastdb -in nucleotides.fasta -dbtype nucl -out my_nucleotides\n", - "```\n", - "Make sure your FASTA files are mounted in the container's `/usr/local/bin/data/blast_db` directory.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Advanced Usage\n", - "\n", - "You can customize the BLAST search parameters:" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
subject_ididentityalignment_lengthmismatchesgap_opensquery_startquery_endsubject_startsubject_endevaluebit_score
0seq781.8182231315111320.00322.3
1seq1100.00025001251250.00422.3
\n", - "
" - ], - "text/plain": [ - " subject_id identity alignment_length mismatches gap_opens query_start \\\n", - "0 seq7 81.818 22 3 1 31 \n", - "1 seq1 100.000 25 0 0 1 \n", - "\n", - " query_end subject_start subject_end evalue bit_score \n", - "0 51 11 32 0.003 22.3 \n", - "1 25 1 25 0.004 22.3 " - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Configure BLAST for sensitive protein search\n", - "blast = Blast(\n", - " # service_url=\"http://localhost:6001/blast\",\n", - " mode=\"blastp\",\n", - " db_path=\"/usr/local/bin/data/test_db\",\n", - " db_name=\"protein_db\",\n", - " evalue=1e-1, # More stringent E-value\n", - " max_target_seqs=100, # Return more hits\n", - " num_threads=4, # Use 4 CPU threads\n", - ")\n", - "\n", - "# Search with longer timeout\n", - "results = blast.search(sequence, timeout=7200) # 2 hour timeout\n", - "\n", - "# Filter results\n", - "significant_hits = results[results[\"identity\"] > 80] # Only hits with >90% identity\n", - "significant_hits" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Thereafter, the ids of the hits can be added to the pyeed database, using the `fetch_from_primary_db` function." - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "pyeed", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.5" - } - }, - "nbformat": 4, - "nbformat_minor": 2 + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# BLAST Search\n", + "\n", + "## Setup\n", + "\n", + "The BLAST service runs in a Docker container and requires:\n", + "1. A local BLAST database\n", + "2. The Docker service running" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "# change log level to INFO\n", + "import sys\n", + "\n", + "from loguru import logger\n", + "\n", + "logger.remove()\n", + "level = logger.add(sys.stderr, level=\"WARNING\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Basic Usage\n", + "\n", + "The `Blast` class provides an interface to search protein or nucleotide sequences against a local BLAST database." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
subject_ididentityalignment_lengthmismatchesgap_opensquery_startquery_endsubject_startsubject_endevaluebit_score
0seq781.8182231315111320.00322.3
1seq1100.00025001251250.00422.3
2seq261.5382610020455300.03819.2
\n", + "
" + ], + "text/plain": [ + " subject_id identity alignment_length mismatches gap_opens query_start \\\n", + "0 seq7 81.818 22 3 1 31 \n", + "1 seq1 100.000 25 0 0 1 \n", + "2 seq2 61.538 26 10 0 20 \n", + "\n", + " query_end subject_start subject_end evalue bit_score \n", + "0 51 11 32 0.003 22.3 \n", + "1 25 1 25 0.004 22.3 \n", + "2 45 5 30 0.038 19.2 " + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from pyeed.tools import Blast\n", + "\n", + "# Example protein sequence\n", + "sequence = \"MSEQVAAVAKLRAKASEAAKEAKAREAAKKLAEAAKKAKAKEAAKRAEAKLAEKAKAAKRAEAKAAKEAKRAAAKRAEAKLAEKAKAAK\"\n", + "\n", + "# Initialize BLAST search\n", + "blast = Blast(\n", + " # service_url=\"http://localhost:6001/blast\",\n", + " mode=\"blastp\", # Use blastp for protein sequences\n", + " db_path=\"/usr/local/bin/data/test_db\", # Path in Docker container\n", + " db_name=\"protein_db\", # Name of your BLAST database\n", + " evalue=0.1, # E-value threshold\n", + " max_target_seqs=10, # Maximum number of hits to return\n", + ")\n", + "\n", + "# Perform search\n", + "results = blast.search(sequence)\n", + "results" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The results are returned as a pandas DataFrame with the following columns:\n", + "- subject_id: ID of the matched sequence\n", + "- identity: Percentage identity\n", + "- alignment_length: Length of the alignment\n", + "- mismatches: Number of mismatches\n", + "- gap_opens: Number of gap openings\n", + "- query_start/end: Start/end positions in query sequence\n", + "- subject_start/end: Start/end positions in subject sequence\n", + "- evalue: Expectation value\n", + "- bit_score: Bit score" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Creating a BLAST Database\n", + "\n", + "Before using BLAST, you need to create a local database. Here's how to create one from a FASTA file:" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "```bash\n", + "# For protein sequences\n", + "makeblastdb -in proteins.fasta -dbtype prot -out blast_db/my_proteins\n", + "\n", + "# For nucleotide sequences\n", + "makeblastdb -in nucleotides.fasta -dbtype nucl -out blast_db/my_nucleotides\n", + "```\n", + "\n", + "To access the BLAST Docker container shell and create databases:\n", + "\n", + "```bash\n", + "# Enter the BLAST container shell\n", + "docker compose exec blast bash\n", + "# \n", + "# Navigate to database directory\n", + "cd /usr/local/bin/data/blast_db\n", + "# \n", + "# Create protein database\n", + "makeblastdb -in proteins.fasta -dbtype prot -out my_proteins\n", + "# \n", + "# Create nucleotide database \n", + "makeblastdb -in nucleotides.fasta -dbtype nucl -out my_nucleotides\n", + "```\n", + "Make sure your FASTA files are mounted in the container's `/usr/local/bin/data/blast_db` directory.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Advanced Usage\n", + "\n", + "You can customize the BLAST search parameters:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
subject_ididentityalignment_lengthmismatchesgap_opensquery_startquery_endsubject_startsubject_endevaluebit_score
0seq781.8182231315111320.00322.3
1seq1100.00025001251250.00422.3
\n", + "
" + ], + "text/plain": [ + " subject_id identity alignment_length mismatches gap_opens query_start \\\n", + "0 seq7 81.818 22 3 1 31 \n", + "1 seq1 100.000 25 0 0 1 \n", + "\n", + " query_end subject_start subject_end evalue bit_score \n", + "0 51 11 32 0.003 22.3 \n", + "1 25 1 25 0.004 22.3 " + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Configure BLAST for sensitive protein search\n", + "blast = Blast(\n", + " # service_url=\"http://localhost:6001/blast\",\n", + " mode=\"blastp\",\n", + " db_path=\"/usr/local/bin/data/test_db\",\n", + " db_name=\"protein_db\",\n", + " evalue=1e-1, # More stringent E-value\n", + " max_target_seqs=100, # Return more hits\n", + " num_threads=4, # Use 4 CPU threads\n", + ")\n", + "\n", + "# Search with longer timeout\n", + "results = blast.search(sequence, timeout=7200) # 2 hour timeout\n", + "\n", + "# Filter results\n", + "significant_hits = results[results[\"identity\"] > 80] # Only hits with >90% identity\n", + "significant_hits" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Thereafter, the ids of the hits can be added to the pyeed database, using the `fetch_from_primary_db` function." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "pyeed", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 } diff --git a/docs/usage/clustalo.ipynb b/docs/usage/clustalo.ipynb index 64ed62ee..d3ba2fba 100644 --- a/docs/usage/clustalo.ipynb +++ b/docs/usage/clustalo.ipynb @@ -1,171 +1,171 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Multiple Sequence Alignment with Clustal Omega\n", - "\n", - "PyEED provides a convenient interface to Clustal Omega for multiple sequence alignment. This notebook demonstrates how to:\n", - "1. Align sequences from a dictionary\n", - "2. 
Align sequences directly from the database" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [], - "source": [ - "from pyeed import Pyeed\n", - "from pyeed.tools.clustalo import ClustalOmega\n", - "\n", - "# change log level to INFO\n", - "import sys\n", - "from loguru import logger\n", - "\n", - "logger.remove()\n", - "level = logger.add(sys.stderr, level=\"INFO\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Direct Sequence Alignment\n", - "\n", - "You can align sequences directly by providing a dictionary of sequences:" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Aligned sequences:\n", - "seq1 AKFVMPDRAWHLYTGNECSKQRLYVWFHDGAPILKTQSDNMGAYRCPLFHVTKNWEI\n", - "seq2 AKFVMPDRQWHLYTGQECSKQRLYVWFHDGAPILKTQSDNMGAYRCPLFHVTKNWEI\n", - "seq3 AKFVMPDRQWHLYTGNECSKQRLYVWFHDGAPILKTQADNMGAYRCALFHVTK----\n" - ] - } - ], - "source": [ - "# Initialize ClustalOmega\n", - "clustalo = ClustalOmega()\n", - "\n", - "# Example sequences\n", - "sequences = {\n", - " \"seq1\": \"AKFVMPDRAWHLYTGNECSKQRLYVWFHDGAPILKTQSDNMGAYRCPLFHVTKNWEI\",\n", - " \"seq2\": \"AKFVMPDRQWHLYTGQECSKQRLYVWFHDGAPILKTQSDNMGAYRCPLFHVTKNWEI\",\n", - " \"seq3\": \"AKFVMPDRQWHLYTGNECSKQRLYVWFHDGAPILKTQADNMGAYRCALFHVTK\",\n", - "}\n", - "\n", - "# Perform alignment\n", - "alignment = clustalo.align(sequences)\n", - "print(\"Aligned sequences:\")\n", - "print(alignment)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Database-based Alignment\n", - "\n", - "You can also align sequences directly from the database by providing a list of accession IDs:" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Pyeed Graph Object Mapping constraints not defined. 
Use _install_labels() to set up model constraints.\n", - "📡 Connected to database.\n", - "Database alignment:\n", - "AAP20891.1 MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDQLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVEYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDRWEPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGAGERGSRGIIAALGPDGKPSRIVVIYTTGSQATMDERNRQIAEIGASLIKHW\n", - "CAJ85677.1 MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDKLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVEYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDRWEPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGAGERGSRGIIAALGPDGKPSRIVVIYTTGSQATMDERNRQIAEIGASLIKHW\n", - "SAQ02853.1 MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDKLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVKYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDRWEPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGASERGSRGIIAALGPDGKPSRIVVIYTTGSQATMDERNRQIAEIGASLIKHW\n", - "CDR98216.1 MSIQHFRVALIPFFAAFCFPVFAHPETLVKVKDAEDQLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVKYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDRWEPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGASERGSRGIIAALGPDGKPSRIVVIYMTGSQATMDERNRQIAEIGASLIKHW\n", - "WP_109963600.1 MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDQLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVEYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDSWEPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGTGKRGSRGIIAALGPDGKPSRIVVIYTTGSQATMDERNRQIAEIGASLIKHW\n", - "CAA41038.1 MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDQLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVKYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDHWEPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGAGERGSRGIIAALGPDGKPSRIVVIYTTGSQATMDERNRQIAEIGASLIKHW\n", - "WP_109874025.1 MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDKLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVEYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDSWEPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGAGERGSRGIIAALGPDGKPSRIVVIYTTGSQATMDERNRQIAEIGASLIKHW\n", - "CAA46344.1 MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDKLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVKYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDSWEPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGASERGSRGIIAALGPDGKPSRIVVIYTTGSQATMDERNRQIAEIGASLIKHW\n", - "APG33178.1 MSIQHFRVALIPFFAAFCFPVFAHPETLVKVKDAEDQLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVKYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDSWEPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGAGERGSRGIIAALGPDGKPSRIVVIYMTGSQATMDERNRQIAEIGASLIKHW\n", - "AKC98298.1 MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDKLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVEYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDHWEPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGAGERGSRGIIAALGPDGKPSRIVVIYTTGSQATMDERNRQIAEIGASLIKHW\n" - ] - } - ], - "source": [ - "# Connect to database\n", - "pyeed = Pyeed(uri=\"bolt://129.69.129.130:7687\", user=\"neo4j\", password=\"12345678\")\n", - "\n", - "# Get protein IDs from database\n", - "from pyeed.model import Protein\n", - "\n", - "accession_ids = [protein.accession_id for protein in Protein.nodes.all()][:10]\n", - "\n", - "# Align sequences from database\n", - 
"alignment = clustalo.align_from_db(accession_ids, pyeed.db)\n", - "print(\"Database alignment:\")\n", - "print(alignment)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Understanding Alignment Results\n", - "\n", - "The alignment result is a `MultipleSequenceAlignment` object with:\n", - "- List of `Sequence` objects\n", - "- Each sequence has an ID and aligned sequence\n", - "- Gaps are represented by '-' characters\n", - "- Sequences are padded to equal length\n", - "\n", - "The alignment preserves sequence order and maintains sequence IDs from the input." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Configuration\n", - "\n", - "ClustalOmega requires the PyEED Docker service to be running. Make sure to:\n", - "1. Have Docker installed\n", - "2. Start the service with `docker-compose up -d`\n", - "3. The service runs on port 5001 by default" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "pyeed_niklas", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.12.8" - } - }, - "nbformat": 4, - "nbformat_minor": 2 + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Multiple Sequence Alignment with Clustal Omega\n", + "\n", + "PyEED provides a convenient interface to Clustal Omega for multiple sequence alignment. This notebook demonstrates how to:\n", + "1. Align sequences from a dictionary\n", + "2. Align sequences directly from the database" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "# change log level to INFO\n", + "import sys\n", + "\n", + "from loguru import logger\n", + "\n", + "from pyeed import Pyeed\n", + "from pyeed.model import Protein\n", + "from pyeed.tools.clustalo import ClustalOmega\n", + "\n", + "logger.remove()\n", + "level = logger.add(sys.stderr, level=\"INFO\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Direct Sequence Alignment\n", + "\n", + "You can align sequences directly by providing a dictionary of sequences:" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Aligned sequences:\n", + "seq1 AKFVMPDRAWHLYTGNECSKQRLYVWFHDGAPILKTQSDNMGAYRCPLFHVTKNWEI\n", + "seq2 AKFVMPDRQWHLYTGQECSKQRLYVWFHDGAPILKTQSDNMGAYRCPLFHVTKNWEI\n", + "seq3 AKFVMPDRQWHLYTGNECSKQRLYVWFHDGAPILKTQADNMGAYRCALFHVTK----\n" + ] + } + ], + "source": [ + "# Initialize ClustalOmega\n", + "clustalo = ClustalOmega()\n", + "\n", + "# Example sequences\n", + "sequences = {\n", + " \"seq1\": \"AKFVMPDRAWHLYTGNECSKQRLYVWFHDGAPILKTQSDNMGAYRCPLFHVTKNWEI\",\n", + " \"seq2\": \"AKFVMPDRQWHLYTGQECSKQRLYVWFHDGAPILKTQSDNMGAYRCPLFHVTKNWEI\",\n", + " \"seq3\": \"AKFVMPDRQWHLYTGNECSKQRLYVWFHDGAPILKTQADNMGAYRCALFHVTK\",\n", + "}\n", + "\n", + "# Perform alignment\n", + "alignment = clustalo.align(sequences)\n", + "print(\"Aligned sequences:\")\n", + "print(alignment)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Database-based Alignment\n", + "\n", + "You can also align sequences directly from the database by providing a list of accession IDs:" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": 
[ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Pyeed Graph Object Mapping constraints not defined. Use _install_labels() to set up model constraints.\n", + "📡 Connected to database.\n", + "Database alignment:\n", + "AAP20891.1 MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDQLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVEYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDRWEPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGAGERGSRGIIAALGPDGKPSRIVVIYTTGSQATMDERNRQIAEIGASLIKHW\n", + "CAJ85677.1 MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDKLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVEYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDRWEPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGAGERGSRGIIAALGPDGKPSRIVVIYTTGSQATMDERNRQIAEIGASLIKHW\n", + "SAQ02853.1 MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDKLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVKYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDRWEPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGASERGSRGIIAALGPDGKPSRIVVIYTTGSQATMDERNRQIAEIGASLIKHW\n", + "CDR98216.1 MSIQHFRVALIPFFAAFCFPVFAHPETLVKVKDAEDQLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVKYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDRWEPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGASERGSRGIIAALGPDGKPSRIVVIYMTGSQATMDERNRQIAEIGASLIKHW\n", + "WP_109963600.1 MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDQLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVEYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDSWEPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGTGKRGSRGIIAALGPDGKPSRIVVIYTTGSQATMDERNRQIAEIGASLIKHW\n", + "CAA41038.1 MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDQLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVKYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDHWEPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGAGERGSRGIIAALGPDGKPSRIVVIYTTGSQATMDERNRQIAEIGASLIKHW\n", + "WP_109874025.1 MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDKLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVEYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDSWEPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGAGERGSRGIIAALGPDGKPSRIVVIYTTGSQATMDERNRQIAEIGASLIKHW\n", + "CAA46344.1 MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDKLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVKYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDSWEPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGASERGSRGIIAALGPDGKPSRIVVIYTTGSQATMDERNRQIAEIGASLIKHW\n", + "APG33178.1 MSIQHFRVALIPFFAAFCFPVFAHPETLVKVKDAEDQLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVKYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDSWEPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGAGERGSRGIIAALGPDGKPSRIVVIYMTGSQATMDERNRQIAEIGASLIKHW\n", + "AKC98298.1 MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDKLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVEYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDHWEPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGAGERGSRGIIAALGPDGKPSRIVVIYTTGSQATMDERNRQIAEIGASLIKHW\n" + ] + } + ], + "source": [ + "# Connect to database\n", + "pyeed = Pyeed(uri=\"bolt://129.69.129.130:7687\", user=\"neo4j\", password=\"12345678\")\n", + "\n", + "# Get protein IDs from database\n", + "accession_ids = [protein.accession_id for protein in 
Protein.nodes.all()][:10]\n",
+    "\n",
+    "# Align sequences from database\n",
+    "alignment = clustalo.align_from_db(accession_ids, pyeed.db)\n",
+    "print(\"Database alignment:\")\n",
+    "print(alignment)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Understanding Alignment Results\n",
+    "\n",
+    "The alignment result is a `MultipleSequenceAlignment` object with:\n",
+    "- A list of `Sequence` objects\n",
+    "- Each sequence has an ID and an aligned sequence\n",
+    "- Gaps are represented by '-' characters\n",
+    "- Sequences are padded to equal length\n",
+    "\n",
+    "The alignment preserves sequence order and maintains sequence IDs from the input."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Configuration\n",
+    "\n",
+    "ClustalOmega requires the PyEED Docker service to be running. Make sure to:\n",
+    "1. Have Docker installed\n",
+    "2. Start the service with `docker compose up -d`\n",
+    "3. Note that the service runs on port 5001 by default"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "pyeed_niklas",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.8"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
 }
diff --git a/docs/usage/embeddings_analysis.ipynb b/docs/usage/embeddings_analysis.ipynb
index 65a2398c..0b72e743 100644
--- a/docs/usage/embeddings_analysis.ipynb
+++ b/docs/usage/embeddings_analysis.ipynb
@@ -24,9 +24,10 @@
 "source": [
 "import sys\n",
-"from loguru import logger\n",
-"import pandas as pd\n",
 "import matplotlib.pyplot as plt\n",
+"import pandas as pd\n",
+"from loguru import logger\n",
+"\n",
 "from pyeed import Pyeed\n",
 "from pyeed.analysis.embedding_analysis import EmbeddingTool\n",
diff --git a/docs/usage/mmseqs.ipynb b/docs/usage/mmseqs.ipynb
index 1185c6fe..2253fd8a 100644
--- a/docs/usage/mmseqs.ipynb
+++ b/docs/usage/mmseqs.ipynb
@@ -20,6 +20,7 @@
 "outputs": [],
 "source": [
 "from pyeed import Pyeed\n",
+"from pyeed.model import Protein\n",
 "from pyeed.tools.mmseqs import MMSeqs"
 ]
 },
@@ -134,8 +135,6 @@
 "pyeed = Pyeed(uri=\"bolt://localhost:7687\", user=\"neo4j\", password=\"12345678\")\n",
 "\n",
 "# Get first 100 protein IDs from database\n",
-"from pyeed.model import Protein\n",
-"\n",
 "accession_ids = [protein.accession_id for protein in Protein.nodes.all()][:100]\n",
 "\n",
 "# Cluster sequences\n",
diff --git a/docs/usage/mutation_analysis.ipynb b/docs/usage/mutation_analysis.ipynb
index 9b31c996..7d10d360 100644
--- a/docs/usage/mutation_analysis.ipynb
+++ b/docs/usage/mutation_analysis.ipynb
@@ -16,6 +16,7 @@
 "outputs": [],
 "source": [
 "import sys\n",
+"\n",
 "from loguru import logger\n",
 "\n",
 "from pyeed import Pyeed\n",
diff --git a/docs/usage/network_analysis.ipynb b/docs/usage/network_analysis.ipynb
index 4d45db71..0b254610 100644
--- a/docs/usage/network_analysis.ipynb
+++ b/docs/usage/network_analysis.ipynb
@@ -11,6 +11,7 @@
 "import matplotlib.pyplot as plt\n",
 "import networkx as nx\n",
 "from loguru import logger\n",
+"\n",
 "from pyeed import Pyeed\n",
 "from pyeed.analysis.network_analysis import NetworkAnalysis\n",
 "from pyeed.analysis.sequence_alignment import PairwiseAligner\n",
diff --git a/docs/usage/standard_numbering.ipynb b/docs/usage/standard_numbering.ipynb
index cd84cad9..54374cd6 100644
--- a/docs/usage/standard_numbering.ipynb
+++ b/docs/usage/standard_numbering.ipynb
@@ -23,10 +23,10 @@
 "%reload_ext autoreload\n",
 "%autoreload 2\n",
 "import sys\n",
+"\n",
 "from loguru import logger\n",
 "\n",
 "from pyeed import Pyeed\n",
-"from pyeed.analysis.mutation_detection import MutationDetection\n",
 "from pyeed.analysis.standard_numbering import StandardNumberingTool\n",
 "\n",
 "logger.remove()\n",
diff --git a/src/pyeed/analysis/embedding_analysis.py b/src/pyeed/analysis/embedding_analysis.py
index b3535f74..c27b670f 100644
--- a/src/pyeed/analysis/embedding_analysis.py
+++ b/src/pyeed/analysis/embedding_analysis.py
@@ -6,9 +6,10 @@
 import scipy.spatial as sp
 from matplotlib.figure import Figure
 from numpy.typing import NDArray
-from pyeed.dbconnect import DatabaseConnector
 from scipy.spatial.distance import cosine
+
+from pyeed.dbconnect import DatabaseConnector
 
 logger = logging.getLogger(__name__)
diff --git a/src/pyeed/analysis/network_analysis.py b/src/pyeed/analysis/network_analysis.py
index dd66b45c..3ab9aeaa 100644
--- a/src/pyeed/analysis/network_analysis.py
+++ b/src/pyeed/analysis/network_analysis.py
@@ -2,6 +2,7 @@
 
 import networkx as nx
 from loguru import logger
+
 from pyeed.dbconnect import DatabaseConnector
diff --git a/src/pyeed/analysis/ontology_loading.py b/src/pyeed/analysis/ontology_loading.py
index ee909636..5b6341f5 100644
--- a/src/pyeed/analysis/ontology_loading.py
+++ b/src/pyeed/analysis/ontology_loading.py
@@ -1,8 +1,9 @@
 from typing import Dict
 
-from pyeed.dbconnect import DatabaseConnector
 from rdflib import OWL, RDF, RDFS, Graph, Namespace, URIRef
+
+from pyeed.dbconnect import DatabaseConnector
 
 
 class OntologyAdapter:
     """
diff --git a/src/pyeed/analysis/standard_numbering.py b/src/pyeed/analysis/standard_numbering.py
index 6f81869f..b2ea0667 100644
--- a/src/pyeed/analysis/standard_numbering.py
+++ b/src/pyeed/analysis/standard_numbering.py
@@ -13,6 +13,7 @@
 from typing import Any, Dict, List, Optional, Tuple
 
 from loguru import logger
+
 from pyeed.analysis.sequence_alignment import PairwiseAligner
 from pyeed.dbconnect import DatabaseConnector
 from pyeed.model import StandardNumbering
diff --git a/tests/unit/test_dbchat.py b/tests/unit/test_dbchat.py
index d1e202c6..bf6226ac 100644
--- a/tests/unit/test_dbchat.py
+++ b/tests/unit/test_dbchat.py
@@ -2,6 +2,7 @@
 
 import pytest
 from neo4j.exceptions import CypherSyntaxError
+
 from pyeed.dbchat import DBChat
 from pyeed.dbconnect import DatabaseConnector

From c0739bbf3aae20c6747c216bd53ebbcced8ed396 Mon Sep 17 00:00:00 2001
From: Niklas Abraham GPU
Date: Thu, 1 May 2025 14:32:06 +0000
Subject: [PATCH 18/19] fixed ruff import version mismatches

---
 src/pyeed/analysis/embedding_analysis.py | 3 +--
 src/pyeed/analysis/mutation_detection.py | 1 -
 src/pyeed/analysis/network_analysis.py   | 1 -
 src/pyeed/analysis/ontology_loading.py   | 3 +--
 src/pyeed/analysis/sequence_alignment.py | 3 +--
 src/pyeed/analysis/standard_numbering.py | 1 -
 tests/unit/test_dbchat.py                | 1 -
 7 files changed, 3 insertions(+), 10 deletions(-)

diff --git a/src/pyeed/analysis/embedding_analysis.py b/src/pyeed/analysis/embedding_analysis.py
index c27b670f..b3535f74 100644
--- a/src/pyeed/analysis/embedding_analysis.py
+++ b/src/pyeed/analysis/embedding_analysis.py
@@ -6,9 +6,8 @@
 import scipy.spatial as sp
 from matplotlib.figure import Figure
 from numpy.typing import NDArray
-from scipy.spatial.distance import cosine
-
 from pyeed.dbconnect import DatabaseConnector
+from scipy.spatial.distance import cosine
 
 logger = logging.getLogger(__name__)
diff --git a/src/pyeed/analysis/mutation_detection.py b/src/pyeed/analysis/mutation_detection.py
index 5c6809e8..c2562ae1 100644
--- a/src/pyeed/analysis/mutation_detection.py
+++ b/src/pyeed/analysis/mutation_detection.py
@@ -1,7 +1,6 @@
 from typing import Any, Optional
 
 from loguru import logger
-
 from pyeed.dbconnect import DatabaseConnector
diff --git a/src/pyeed/analysis/network_analysis.py b/src/pyeed/analysis/network_analysis.py
index 3ab9aeaa..dd66b45c 100644
--- a/src/pyeed/analysis/network_analysis.py
+++ b/src/pyeed/analysis/network_analysis.py
@@ -2,7 +2,6 @@
 
 import networkx as nx
 from loguru import logger
-
 from pyeed.dbconnect import DatabaseConnector
diff --git a/src/pyeed/analysis/ontology_loading.py b/src/pyeed/analysis/ontology_loading.py
index 5b6341f5..ee909636 100644
--- a/src/pyeed/analysis/ontology_loading.py
+++ b/src/pyeed/analysis/ontology_loading.py
@@ -1,8 +1,7 @@
 from typing import Dict
 
-from rdflib import OWL, RDF, RDFS, Graph, Namespace, URIRef
-
 from pyeed.dbconnect import DatabaseConnector
+from rdflib import OWL, RDF, RDFS, Graph, Namespace, URIRef
 
 
 class OntologyAdapter:
diff --git a/src/pyeed/analysis/sequence_alignment.py b/src/pyeed/analysis/sequence_alignment.py
index cb6acff4..440cbb1e 100644
--- a/src/pyeed/analysis/sequence_alignment.py
+++ b/src/pyeed/analysis/sequence_alignment.py
@@ -5,10 +5,9 @@
 from Bio.Align import PairwiseAligner as BioPairwiseAligner
 from Bio.Align.substitution_matrices import Array as BioSubstitutionMatrix
 from joblib import Parallel, cpu_count, delayed
-from rich.progress import Progress
-
 from pyeed.dbconnect import DatabaseConnector
 from pyeed.tools.utility import chunks
+from rich.progress import Progress
 
 
 class PairwiseAligner:
diff --git a/src/pyeed/analysis/standard_numbering.py b/src/pyeed/analysis/standard_numbering.py
index b2ea0667..6f81869f 100644
--- a/src/pyeed/analysis/standard_numbering.py
+++ b/src/pyeed/analysis/standard_numbering.py
@@ -13,7 +13,6 @@
 from typing import Any, Dict, List, Optional, Tuple
 
 from loguru import logger
-
 from pyeed.analysis.sequence_alignment import PairwiseAligner
 from pyeed.dbconnect import DatabaseConnector
 from pyeed.model import StandardNumbering
diff --git a/tests/unit/test_dbchat.py b/tests/unit/test_dbchat.py
index bf6226ac..d1e202c6 100644
--- a/tests/unit/test_dbchat.py
+++ b/tests/unit/test_dbchat.py
@@ -2,7 +2,6 @@
 
 import pytest
 from neo4j.exceptions import CypherSyntaxError
-
 from pyeed.dbchat import DBChat
 from pyeed.dbconnect import DatabaseConnector

From cf19b94a87f3d800a40e8a95be4020853c59d689 Mon Sep 17 00:00:00 2001
From: alacheim
Date: Fri, 2 May 2025 08:58:14 +0000
Subject: [PATCH 19/19] fixed mypy error, formatted file

---
 src/pyeed/embedding.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/pyeed/embedding.py b/src/pyeed/embedding.py
index 30b15f0c..28f66a1b 100644
--- a/src/pyeed/embedding.py
+++ b/src/pyeed/embedding.py
@@ -97,7 +97,7 @@ def process_batches_on_gpu(
 
 
 def load_model_and_tokenizer(
     model_name: str,
     device: torch.device,
-) -> Tuple[Any, Union[Any, None], str]:
+) -> Tuple[Any, Union[Any, None], torch.device]:
     """
     Loads the model and assigns it to a specific GPU.
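
Note on the docs touched in this series: the BLAST notebook ends by pointing at `fetch_from_primary_db`, and the Clustal Omega notebook aligns by accession ID, so the hits of a search can flow straight into a database-backed alignment. Below is a minimal sketch of that hand-off; it assumes `fetch_from_primary_db` accepts a list of accession IDs (the method is named in the notebook text but its signature is not shown in this series), and it reuses the connection details and database paths from the examples above.

```python
from pyeed import Pyeed
from pyeed.tools import Blast
from pyeed.tools.clustalo import ClustalOmega

# Connect to the pyeed graph database (credentials as in the notebooks)
pyeed = Pyeed(uri="bolt://localhost:7687", user="neo4j", password="12345678")

# Search a query protein against the local BLAST database
blast = Blast(
    mode="blastp",
    db_path="/usr/local/bin/data/test_db",
    db_name="protein_db",
    evalue=0.1,
    max_target_seqs=10,
)
query = "MSEQVAAVAKLRAKASEAAKEAKAREAAKKLAEAAKKAKAKEAAKRAEAKLAEKAKAAK"
results = blast.search(query)

# Keep only strong hits and import them into the database;
# fetch_from_primary_db is assumed here to take a list of accession IDs
hit_ids = results[results["identity"] > 80]["subject_id"].tolist()
pyeed.fetch_from_primary_db(hit_ids)

# Align the imported sequences directly from the database
clustalo = ClustalOmega()
alignment = clustalo.align_from_db(hit_ids, pyeed.db)
print(alignment)
```

The only moving part is the ID hand-off: BLAST reports `subject_id` values, and `align_from_db` resolves the same accessions against the `Protein` nodes, so no intermediate FASTA files are needed.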