From f661eb643abcde43362dae60773828363a595a9d Mon Sep 17 00:00:00 2001
From: alacheim
Date: Tue, 11 Mar 2025 10:30:16 +0000
Subject: [PATCH 01/39] added ncbi to uniprot mapper

---
 src/pyeed/adapter/ncbi_to_uniprot_mapper.py | 131 ++++++++++++++++++++
 src/pyeed/main.py                           |  14 +++
 2 files changed, 145 insertions(+)
 create mode 100644 src/pyeed/adapter/ncbi_to_uniprot_mapper.py

diff --git a/src/pyeed/adapter/ncbi_to_uniprot_mapper.py b/src/pyeed/adapter/ncbi_to_uniprot_mapper.py
new file mode 100644
index 00000000..3b6098ad
--- /dev/null
+++ b/src/pyeed/adapter/ncbi_to_uniprot_mapper.py
@@ -0,0 +1,131 @@
+import httpx
+import logging
+from pysam import FastaFile
+from crc64iso import crc64iso
+import sys
+import json
+import os
+from typing import List
+
+logger = logging.getLogger(__name__)
+
+class NCBIToUniprotMapper:
+    def __init__(self, ids):
+        self.ids = ids
+        self.uniparc_url = "https://www.ebi.ac.uk/proteins/api/uniparc?offset=0&size=100&sequencechecksum="
+        self.ncbi_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
+
+
+    def download_fasta(self, refseq_id: str) -> None:
+        """
+        Downloads a FASTA file for a given RefSeq ID using httpx and saves it locally.
+
+        Args:
+            refseq_id (str): NCBI ID
+        """
+
+        params = {
+            "db": "protein",
+            "id": refseq_id,
+            "rettype": "fasta",
+            "retmode": "text"
+        }
+
+        try:
+            response = httpx.get(self.ncbi_url, params=params, timeout=10.0)
+
+            if response.status_code == 200:
+                filename = f"{refseq_id}.fasta"
+                with open(filename, "w") as f:
+                    f.write(response.text)
+                print(f"✅ Downloaded: {filename}")
+            else:
+                print(f"❌ Failed to download {refseq_id} (Status: {response.status_code})")
+
+        except httpx.HTTPError as e:
+            print(f"❌ HTTP error occurred while downloading {refseq_id}: {e}")
+
+    def get_checksum(self, refseq_id: str) -> str:
+        """Fetches the sequence and calculates its CRC64 checksum for a given RefSeq ID.
+
+        Args:
+            refseq_id (str): NCBI ID
+
+        Returns:
+            str: checksum ID
+        """
+
+        self.download_fasta(refseq_id)
+        fa = FastaFile(f"{refseq_id}.fasta")
+        seq = fa.fetch(fa.references[0])
+        return crc64iso.crc64(seq)
+
+    def checksum_list(self, refseq_ids: List[str]) -> List[str]:
+        """Creates a list of checksum IDs and deletes the FASTA files after processing.
+
+        Args:
+            refseq_ids (List[str]): NCBI IDs
+
+        Returns:
+            List[str]: checksum IDs
+        """
+
+        checksums = []
+        for refseq_id in refseq_ids:
+            checksums.append(self.get_checksum(refseq_id))
+            fasta_file_path = f"{refseq_id}.fasta"
+            fai_file_path = f"{refseq_id}.fasta.fai"
+
+            if os.path.exists(fasta_file_path):
+                os.remove(fasta_file_path)  # Delete the FASTA file
+
+            if os.path.exists(fai_file_path):
+                os.remove(fai_file_path)  # Delete the pysam index file
+        return checksums
+
+    def execute_request(self) -> None:
+        """Fetches the UniParc and UniProt IDs for the given RefSeq IDs and saves them in JSON files.
+        """
+
+        checksum_list = self.checksum_list(self.ids)
+
+        id_mapping_uniprot = {}
+        id_mapping_uniparc = {}
+        counter = 0
+
+        for checksum in checksum_list:
+            url = f"{self.uniparc_url}{checksum}"
+
+            # perform request and get response as JSON
+            with httpx.Client() as client:
+                response = client.get(url, headers={"Accept": "application/json"})
+
+            # check if the request was successful
+            if response.status_code != 200:
+                print(f"Request failed with status code {response.status_code}")
+                response.raise_for_status()  # Raise exception for any non-200 response
+                sys.exit()
+
+            # Check if the response body is empty
+            if not response.content.strip():
+                print("The response body is empty.")
+                sys.exit()
+
+            # extract the uniprot and the uniparc id from the response and save them in a dictionary
+            response_body = response.json()
+            for item in response_body:
+                uniparc_id = item.get('accession', None)
+                for ref in item.get('dbReference', []):
+                    if ref.get('type') == 'UniProtKB/TrEMBL':
+                        uniprot_id = ref.get('id', None)
+            id_mapping_uniparc[self.ids[counter]] = uniparc_id
+            id_mapping_uniprot[self.ids[counter]] = uniprot_id
+            counter += 1
+
+        with open("id_mapping_uniprot.json", "w") as f:
+            json.dump(id_mapping_uniprot, f)
+
+        with open("id_mapping_uniparc.json", "w") as f:
+            json.dump(id_mapping_uniparc, f)
+
+
diff --git a/src/pyeed/main.py b/src/pyeed/main.py
index 5950965d..c7707d13 100644
--- a/src/pyeed/main.py
+++ b/src/pyeed/main.py
@@ -8,6 +8,7 @@
 from pyeed.adapter.ncbi_protein_mapper import NCBIProteinToPyeed
 from pyeed.adapter.primary_db_adapter import PrimaryDBAdapter
 from pyeed.adapter.uniprot_mapper import UniprotToPyeed
+from pyeed.adapter.ncbi_to_uniprot_mapper import NCBIToUniprotMapper
 from pyeed.dbchat import DBChat
 from pyeed.dbconnect import DatabaseConnector
 from pyeed.embedding import (
@@ -185,6 +186,19 @@ def fetch_ncbi_nucleotide(self, ids: list[str]) -> None:
 
         asyncio.run(adapter.execute_requests())
         nest_asyncio.apply()
+
+    def database_id_mapper(self, ids: list[str]) -> None:
+        """
+        Maps NCBI protein IDs to UniParc and UniProt IDs via CRC64 sequence checksums.
+
+        Args:
+            ids (list[str]): List of IDs to map.
+        """
+
+        mapper = NCBIToUniprotMapper(ids)
+        mapper.execute_request()
+
+        nest_asyncio.apply()
 
     def calculate_sequence_embeddings(
         self,

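Taken together, PATCH 01 resolves IDs by sequence identity rather than by accession cross-references: each FASTA is fetched from NCBI, the CRC64-ISO checksum of the sequence is computed, and UniParc is queried by that checksum. A minimal sketch of the core lookup, reusing the endpoint and crc64iso call from the patch (the helper name `uniparc_entries_for_sequence` is illustrative, not part of the patch):

    import httpx
    from crc64iso import crc64iso

    def uniparc_entries_for_sequence(sequence: str) -> list:
        # UniParc indexes entries by the CRC64 checksum of the raw sequence
        checksum = crc64iso.crc64(sequence)
        url = (
            "https://www.ebi.ac.uk/proteins/api/uniparc"
            f"?offset=0&size=100&sequencechecksum={checksum}"
        )
        response = httpx.get(url, headers={"Accept": "application/json"})
        response.raise_for_status()
        # each entry carries an 'accession' (UPI...) and 'dbReference' cross-links,
        # from which execute_request() picks the UniProtKB/TrEMBL id
        return response.json()

Because the lookup is checksum-based, it only finds UniParc entries whose sequence matches the NCBI record exactly; a single differing residue yields no hit.
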
From f44a2f3126d6b3ddb23082b2bdc193e5ace91956 Mon Sep 17 00:00:00 2001
From: alacheim
Date: Thu, 13 Mar 2025 14:21:07 +0000
Subject: [PATCH 02/39] changes in mapper

---
 src/pyeed/adapter/ncbi_to_uniprot_mapper.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/pyeed/adapter/ncbi_to_uniprot_mapper.py b/src/pyeed/adapter/ncbi_to_uniprot_mapper.py
index 3b6098ad..134d2ff5 100644
--- a/src/pyeed/adapter/ncbi_to_uniprot_mapper.py
+++ b/src/pyeed/adapter/ncbi_to_uniprot_mapper.py
@@ -127,5 +127,4 @@ def execute_request(self) -> None:
 
         with open("id_mapping_uniparc.json", "w") as f:
             json.dump(id_mapping_uniparc, f)
-
-
+ 
\ No newline at end of file

From 27ff2317166e9c2b43572e08ca378240405fa7ab Mon Sep 17 00:00:00 2001
From: alacheim
Date: Fri, 14 Mar 2025 09:46:57 +0000
Subject: [PATCH 03/39] fixed bug in organism mapper

---
 src/pyeed/adapter/ncbi_protein_mapper.py |  2 ++
 src/pyeed/model.py                       | 14 ++++++++++++++
 2 files changed, 16 insertions(+)

diff --git a/src/pyeed/adapter/ncbi_protein_mapper.py b/src/pyeed/adapter/ncbi_protein_mapper.py
index e11d4fe7..3ecf485c 100644
--- a/src/pyeed/adapter/ncbi_protein_mapper.py
+++ b/src/pyeed/adapter/ncbi_protein_mapper.py
@@ -281,6 +281,8 @@ def add_to_db(self, response: Response) -> None:
             protein = Protein(**protein_data)
             protein.save()
 
+            if not isinstance(organism, Organism):
+                raise TypeError(f"Expected Organism, but got {type(organism)}")
             protein.organism.connect(organism)
 
             # Add features
diff --git a/src/pyeed/model.py b/src/pyeed/model.py
index 7a720560..c9c1d4a4 100644
--- a/src/pyeed/model.py
+++ b/src/pyeed/model.py
@@ -145,6 +145,20 @@ class Annotation(Enum):
 class Organism(StrictStructuredNode):
     taxonomy_id: int = IntegerProperty(required=True, unique_index=True)
     name = StringProperty()
+
+    @classmethod
+    def get_or_save(cls, taxonomy_id, name) -> "Organism":
+        try:
+            organism = cls.nodes.get(taxonomy_id=taxonomy_id)
+            return organism
+        except cls.DoesNotExist:
+            try:
+                organism = cls(taxonomy_id=taxonomy_id, name=name)
+                organism.save()
+                return organism
+            except Exception as e:
+                print(f"Error during saving of the organism: {e}")
+                raise
 
 
 class SiteRel(StructuredRel):  # type: ignore

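PATCH 03 hardens organism handling in two places: `add_to_db` now fails loudly if anything other than an `Organism` reaches `protein.organism.connect`, and `Organism.get_or_save` wraps the get-or-create pattern so repeated imports reuse the existing node instead of tripping the unique index on `taxonomy_id`. A short usage sketch (taxonomy id and name are illustrative):

    from pyeed.model import Organism

    # returns the existing node for this taxonomy_id, or creates and saves a new one
    organism = Organism.get_or_save(taxonomy_id=562, name="Escherichia coli")
    protein.organism.connect(organism)  # assumes an existing Protein node bound to `protein`
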
"eedb.fetch_dna_entries_for_proteins()\n", + "eedb.create_coding_sequences_regions()" ] }, { @@ -102,7 +103,32 @@ "cell_type": "code", "execution_count": 4, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "42e2c4d2f86f47eb970236f6f26eda6c", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Output()" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n"
+      ],
+      "text/plain": []
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
    "source": [
     "sn_protein = StandardNumberingTool(name=\"test_standard_numbering_protein\")\n",
     "\n",
@@ -113,9 +139,9 @@
     "\n",
     "sn_dna = StandardNumberingTool(name=\"test_standard_numbering_dna\")\n",
     "\n",
-    "sn_dna.apply_standard_numbering(\n",
-    "    base_sequence_id=\"AF190695.1\", db=eedb.db, node_type=\"DNA\"\n",
-    ")\n"
+    "sn_dna.apply_standard_numbering_pairwise(\n",
+    "    base_sequence_id=\"AF190695.1\", db=eedb.db, node_type=\"DNA\", region_based_sequence='coding sequence'\n",
+    ")"
    ]
   },
   {
@@ -164,7 +190,7 @@
     "name_of_standard_numbering_tool = \"test_standard_numbering_dna\"\n",
     "\n",
     "mutations_dna = md.get_mutations_between_sequences(\n",
-    "    seq1, seq2, eedb.db, name_of_standard_numbering_tool, node_type=\"DNA\"\n",
+    "    seq1, seq2, eedb.db, name_of_standard_numbering_tool, node_type=\"DNA\", region_based_sequence=\"coding sequence\"\n",
     ")"
    ]
   },
@@ -190,7 +216,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "{'from_positions': [241, 272, 125], 'to_positions': [241, 272, 125], 'from_monomers': ['R', 'D', 'V'], 'to_monomers': ['S', 'N', 'I']}\n"
+      "{'from_positions': [272, 125, 241], 'to_positions': [272, 125, 241], 'from_monomers': ['D', 'V', 'R'], 'to_monomers': ['N', 'I', 'S']}\n"
      ]
     }
    ],
@@ -223,22 +249,14 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Mutation on position 682 -> 615 with a nucleotide change of T -> C\n",
-      "Mutation on position 407 -> 340 with a nucleotide change of C -> A\n",
-      "Mutation on position 92 -> 25 with a nucleotide change of C -> A\n",
-      "Mutation on position 162 -> 95 with a nucleotide change of G -> T\n",
-      "Mutation on position 929 -> 862 with a nucleotide change of A -> C\n",
-      "Mutation on position 346 -> 279 with a nucleotide change of A -> G\n",
-      "Mutation on position 87 -> 20 with a nucleotide change of C -> A\n",
-      "Mutation on position 88 -> 21 with a nucleotide change of T -> C\n",
-      "Mutation on position 130 -> 63 with a nucleotide change of C -> T\n",
-      "Mutation on position 175 -> 108 with a nucleotide change of G -> A\n",
-      "Mutation on position 131 -> 64 with a nucleotide change of T -> C\n",
-      "Mutation on position 132 -> 65 with a nucleotide change of A -> T\n",
-      "Mutation on position 914 -> 847 with a nucleotide change of G -> A\n",
-      "Mutation on position 604 -> 537 with a nucleotide change of T -> G\n",
-      "Mutation on position 925 -> 858 with a nucleotide change of G -> A\n",
-      "Mutation on position 226 -> 159 with a nucleotide change of T -> C\n"
+      "Mutation on position 705 -> 705 with a nucleotide change of G -> A\n",
+      "Mutation on position 198 -> 198 with a nucleotide change of C -> A\n",
+      "Mutation on position 720 -> 720 with a nucleotide change of A -> C\n",
+      "Mutation on position 473 -> 473 with a nucleotide change of T -> C\n",
+      "Mutation on position 17 -> 17 with a nucleotide change of T -> C\n",
+      "Mutation on position 716 -> 716 with a nucleotide change of G -> A\n",
+      "Mutation on position 137 -> 137 with a nucleotide change of A -> G\n",
+      "Mutation on position 395 -> 395 with a nucleotide change of T -> G\n"
      ]
     }
    ],
diff --git a/src/pyeed/analysis/mutation_detection.py b/src/pyeed/analysis/mutation_detection.py
index 082314f6..d469ef3d 100644
--- a/src/pyeed/analysis/mutation_detection.py
+++ b/src/pyeed/analysis/mutation_detection.py
@@ -1,4 +1,4 @@
-from typing import Any
+from typing import Any, Optional
 
 from loguru import logger
 from pyeed.dbconnect import DatabaseConnector
@@ -15,6 +15,7 @@ def get_sequence_data(
         db: DatabaseConnector,
         standard_numbering_tool_name: str,
         node_type: str = "Protein",
+        region_based_sequence: Optional[str] = None,
     ) -> tuple[dict[str, str], dict[str, list[str]]]:
         """Fetch sequence and position data for two sequences from the database.
 
@@ -23,6 +24,8 @@ def get_sequence_data(
             sequence_id2: Second sequence accession ID
             db: Database connection instance
             standard_numbering_tool_name: Name of standard numbering tool to use
+            node_type: Type of node to use (default: "Protein")
+            region_based_sequence: Annotation of region to use for numbering (default: None)
 
         Returns:
             tuple containing:
@@ -32,23 +35,43 @@ def get_sequence_data(
         Raises:
             ValueError: If standard numbering positions not found for both sequences
         """
-        query = f"""
-        MATCH (p:{node_type})-[r:HAS_STANDARD_NUMBERING]->(s:StandardNumbering)
-        WHERE p.accession_id IN ['{sequence_id1}', '{sequence_id2}'] 
-        AND s.name = '{standard_numbering_tool_name}'
-        RETURN p.accession_id as id, p.sequence as sequence, r.positions as positions
-        """
-        results = db.execute_read(query)
+        if region_based_sequence is not None:
+            query = f"""
+            MATCH (p:{node_type})-[rel:HAS_REGION]->(r:Region {{annotation: '{region_based_sequence}'}})-[rel2:HAS_STANDARD_NUMBERING]->(s:StandardNumbering)
+            WHERE p.accession_id IN ['{sequence_id1}', '{sequence_id2}'] 
+            AND s.name = '{standard_numbering_tool_name}'
+            RETURN p.accession_id as id, p.sequence as sequence, rel2.positions as positions, rel.start as start, rel.end as end
+            """
+            results = db.execute_read(query)
+        else:
+            query = f"""
+            MATCH (p:{node_type})-[r:HAS_STANDARD_NUMBERING]->(s:StandardNumbering)
+            WHERE p.accession_id IN ['{sequence_id1}', '{sequence_id2}'] 
+            AND s.name = '{standard_numbering_tool_name}'
+            RETURN p.accession_id as id, p.sequence as sequence, r.positions as positions
+            """
+            results = db.execute_read(query)
 
         if len(results) < 2:
             raise ValueError(
                 f"Could not find standard numbering positions for both sequences {sequence_id1} and {sequence_id2}"
             )
+        if region_based_sequence is not None:
+            sequences = {
+                results[i]["id"]: results[i]["sequence"][
+                    results[i]["start"] : results[i]["end"]
+                ]
+                for i in range(len(results))
+            }
+            positions = {
+                results[i]["id"]: results[i]["positions"] for i in range(len(results))
+            }
 
-        sequences = {result["id"]: result["sequence"] for result in results}
-        positions = {result["id"]: result["positions"] for result in results}
-
-        return sequences, positions
+            return sequences, positions
+        else:
+            sequences = {result["id"]: result["sequence"] for result in results}
+            positions = {result["id"]: result["positions"] for result in results}
+            return sequences, positions
 
     def find_mutations(
         self,
@@ -105,6 +128,7 @@ def save_mutations_to_db(
         sequence_id1: str,
         sequence_id2: str,
         node_type: str = "Protein",
+        region_based_sequence: Optional[str] = None,
     ) -> None:
         """Save detected mutations to the database.
 
@@ -117,41 +141,79 @@ def save_mutations_to_db(
             db: Database connection instance
             sequence_id1: First sequence accession ID
             sequence_id2: Second sequence accession ID
+            node_type: Type of node to use (default: "Protein")
+            region_based_sequence: Annotation of region to use for numbering (default: None)
         """
-
         # Check if a mutation relationship already exists between these proteins
-        existing_mutations = db.execute_read(
-            f"""
-            MATCH (p1:{node_type})-[r:MUTATION]->(p2:{node_type})
-            WHERE p1.accession_id = $sequence_id1 AND p2.accession_id = $sequence_id2
-            RETURN r
-            """,
-            {"sequence_id1": sequence_id1, "sequence_id2": sequence_id2},
-        )
+        if region_based_sequence is not None:
+            query = f"""
+            MATCH (p1:{node_type} {{accession_id: $sequence_id1}})-[rel:HAS_REGION]->(r1:Region {{annotation: $region_based_sequence}})-[rel_mutation:MUTATION]->(r2:Region {{annotation: $region_based_sequence}})<-[rel2:HAS_REGION]-(p2:{node_type} {{accession_id: $sequence_id2}})
+            RETURN rel_mutation
+            """
+            existing_mutations = db.execute_read(
+                query,
+                {
+                    "sequence_id1": sequence_id1,
+                    "sequence_id2": sequence_id2,
+                    "region_based_sequence": region_based_sequence,
+                },
+            )
+        else:
+            existing_mutations = db.execute_read(
+                f"""
+                MATCH (p1:{node_type})-[r:MUTATION]->(p2:{node_type})
+                WHERE p1.accession_id = $sequence_id1 AND p2.accession_id = $sequence_id2
+                RETURN r
+                """,
+                {"sequence_id1": sequence_id1, "sequence_id2": sequence_id2},
+            )
         if existing_mutations:
             logger.debug(
                 f"Mutation relationship already exists between {sequence_id1} and {sequence_id2}"
             )
             return
 
-        query = f"""
-        MATCH (p1:{node_type}), (p2:{node_type})
-        WHERE p1.accession_id = $sequence_id1 AND p2.accession_id = $sequence_id2
-        CREATE (p1)-[r:MUTATION]->(p2)
-        SET r.from_positions = $from_positions,
-            r.to_positions = $to_positions,
-            r.from_monomers = $from_monomers,
-            r.to_monomers = $to_monomers
-        """
-        params = {
-            "sequence_id1": sequence_id1,
-            "sequence_id2": sequence_id2,
-            "from_positions": mutations["from_positions"],
-            "to_positions": mutations["to_positions"],
-            "from_monomers": mutations["from_monomers"],
-            "to_monomers": mutations["to_monomers"],
-        }
-        db.execute_write(query, params)
+        if region_based_sequence is not None:
+            # saving the mutation between the regions
+            query = f"""
+            MATCH (r1:Region {{annotation: $region_based_sequence}})<-[rel:HAS_REGION]-(p1:{node_type} {{accession_id: $sequence_id1}})
+            MATCH (r2:Region {{annotation: $region_based_sequence}})<-[rel2:HAS_REGION]-(p2:{node_type} {{accession_id: $sequence_id2}})
+            CREATE (r1)-[r:MUTATION]->(r2)
+            SET r.from_positions = $from_positions,
+                r.to_positions = $to_positions,
+                r.from_monomers = $from_monomers,
+                r.to_monomers = $to_monomers
+            """
+            params = {
+                "sequence_id1": sequence_id1,
+                "sequence_id2": sequence_id2,
+                "region_based_sequence": region_based_sequence,
+                "from_positions": mutations["from_positions"],
+                "to_positions": mutations["to_positions"],
+                "from_monomers": mutations["from_monomers"],
+                "to_monomers": mutations["to_monomers"],
+            }
+            db.execute_write(query, params)
+        else:
+            query = f"""
+            MATCH (p1:{node_type}), (p2:{node_type})
+            WHERE p1.accession_id = $sequence_id1 AND p2.accession_id = $sequence_id2
+            CREATE (p1)-[r:MUTATION]->(p2)
+            SET r.from_positions = $from_positions,
+                r.to_positions = $to_positions,
+                r.from_monomers = $from_monomers,
+                r.to_monomers = $to_monomers
+            """
+            params = {
+                "sequence_id1": sequence_id1,
+                "sequence_id2": sequence_id2,
+                "from_positions": mutations["from_positions"],
+                "to_positions": mutations["to_positions"],
+                "from_monomers": mutations["from_monomers"],
+                "to_monomers": mutations["to_monomers"],
+            }
+            db.execute_write(query, params)
+
         logger.debug(
             f"Saved {len(list(params['from_positions']))} mutations to database"
         )
@@ -165,6 +227,7 @@ def get_mutations_between_sequences(
         save_to_db: bool = True,
         debug: bool = False,
         node_type: str = "Protein",
+        region_based_sequence: Optional[str] = None,
     ) -> dict[str, list[int | str]]:
         """Get mutations between two sequences using standard numbering.
 
@@ -174,6 +237,8 @@ def get_mutations_between_sequences(
             db: Database connection instance
             standard_numbering_tool_name: Name of standard numbering tool to use
             save_to_db: Whether to save mutations to database (default: True)
+            node_type: Type of node to use (default: "Protein")
+            region_based_sequence: Annotation of region to use for numbering (default: None)
 
         Returns:
             dict containing mutation information:
@@ -186,7 +251,12 @@ def get_mutations_between_sequences(
             ValueError: If standard numbering positions not found for both sequences
         """
         sequences, positions = self.get_sequence_data(
-            sequence_id1, sequence_id2, db, standard_numbering_tool_name, node_type
+            sequence_id1,
+            sequence_id2,
+            db,
+            standard_numbering_tool_name,
+            node_type,
+            region_based_sequence,
         )
 
         if debug:
@@ -201,7 +271,12 @@ def get_mutations_between_sequences(
 
         if save_to_db:
             self.save_mutations_to_db(
-                mutations, db, sequence_id1, sequence_id2, node_type
+                mutations,
+                db,
+                sequence_id1,
+                sequence_id2,
+                node_type,
+                region_based_sequence,
             )
 
         return mutations
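
When `region_based_sequence` is set, the query above also returns the region's `start` and `end` from the HAS_REGION relationship, and all downstream position handling operates on the slice `sequence[start:end]`. Python slices are 0-based and end-exclusive, so positions computed on the region are offsets into that slice rather than into the full record; a tiny sketch with made-up coordinates:

    dna = "TTATGGCTTGATT"           # full DNA record (illustrative)
    start, end = 2, 11              # start/end stored on the HAS_REGION relationship
    coding = dna[start:end]         # "ATGGCTTGA": 0-based, end-exclusive
    assert coding[0] == dna[start]  # region position i maps to full-sequence position start + i

This is why the notebook output above reports identical from/to positions for the DNA mutations: both sequences are numbered within their coding-sequence regions instead of their full records.
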
diff --git a/src/pyeed/analysis/sequence_alignment.py b/src/pyeed/analysis/sequence_alignment.py
index 8dd41553..c130f8d3 100644
--- a/src/pyeed/analysis/sequence_alignment.py
+++ b/src/pyeed/analysis/sequence_alignment.py
@@ -91,6 +91,7 @@ def align_multipairwise(
         return_results: bool = True,
         pairs: Optional[list[tuple[str, str]]] = None,
         node_type: str = "Protein",
+        region_based_sequence: Optional[str] = None,
     ) -> Optional[list[dict[str, Any]]]:
         """
         Creates all possible pairwise alignments from a dictionary of sequences or from sequence IDs.
@@ -114,6 +115,7 @@ def align_multipairwise(
             pairs (Optional[list[tuple[str, str]]]): A list of tuples, where each tuple contains two
                 sequence IDs to align. If provided, only these pairs will be aligned.
             node_type (str): The type of node to align. Defaults to "Protein".
+            region_based_sequence (Optional[str]): The annotation of the region to use for the alignment. Defaults to None.
         Returns:
             Optional[List[dict]]: A list of dictionaries containing the alignment results if
             `return_results` is True. If False, returns None.
@@ -121,7 +123,9 @@ def align_multipairwise(
 
         # Fetch sequences if ids are provided
         if ids is not None and db is not None:
-            sequences = self._get_id_sequence_dict(db, ids, node_type)
+            sequences = self._get_id_sequence_dict(
+                db, ids, node_type, region_based_sequence
+            )
 
         if not sequences:
             raise ValueError(
@@ -156,7 +160,7 @@ def align_multipairwise(
                 progress.update(align_task, advance=len(pair_chunk))
 
                 if db:
-                    self._to_db(alignments, db)
+                    self._to_db(alignments, db, node_type, region_based_sequence)
                     progress.update(db_task, advance=len(pair_chunk))
 
                 if return_results:
@@ -168,28 +172,52 @@ def _to_db(
         self,
         alignments: list[dict[str, Any]],
         db: DatabaseConnector,
+        node_type: str = "Protein",
+        region_based_sequence: Optional[str] = None,
     ) -> None:
         """Inserts the alignment results to pyeed graph database.
 
         Args:
             alignments (list[dict]): A list of dictionaries containing the alignment results.
             db (DatabaseConnector): A `DatabaseConnector` object.
+            node_type (str): The type of node to align. Defaults to "Protein".
+            region_based_sequence (Optional[str]): The annotation of the region to use for the alignment. Defaults to None.
         """
 
-        query = """
-        UNWIND $alignments AS alignment
-        MATCH (p1:Protein {accession_id: alignment.query_id})
-        MATCH (p2:Protein {accession_id: alignment.target_id})
-        MERGE (p1)-[r:PAIRWISE_ALIGNED]->(p2)
-        SET r.similarity = alignment.identity,
+        if region_based_sequence is None:
+            query = f"""
+            UNWIND $alignments AS alignment
+            MATCH (p1:{node_type} {{accession_id: alignment.query_id}})
+            MATCH (p2:{node_type} {{accession_id: alignment.target_id}})
+            MERGE (p1)-[r:PAIRWISE_ALIGNED]->(p2)
+            SET r.similarity = alignment.identity,
             r.mismatches = alignment.mismatches,
             r.gaps = alignment.gaps,
             r.score = alignment.score,
             r.query_aligned = alignment.query_aligned,
             r.target_aligned = alignment.target_aligned
-        """
-
-        db.execute_write(query, {"alignments": alignments})
+            """
+            db.execute_write(query, parameters={"alignments": alignments})
+        else:
+            query = f"""
+            UNWIND $alignments AS alignment
+            MATCH (p1:{node_type} {{accession_id: alignment.query_id}})-[rel1:HAS_REGION]->(r1:Region {{annotation: $region_based_sequence}})
+            MATCH (p2:{node_type} {{accession_id: alignment.target_id}})-[rel2:HAS_REGION]->(r2:Region {{annotation: $region_based_sequence}})
+            MERGE (r1)-[r:PAIRWISE_ALIGNED]->(r2)
+            SET r.similarity = alignment.identity,
+            r.mismatches = alignment.mismatches,
+            r.gaps = alignment.gaps,
+            r.score = alignment.score,
+            r.query_aligned = alignment.query_aligned,
+            r.target_aligned = alignment.target_aligned
+            """
+            db.execute_write(
+                query,
+                {
+                    "alignments": alignments,
+                    "region_based_sequence": region_based_sequence,
+                },
+            )
 
     def _get_aligner(self) -> BioPairwiseAligner:
         """Creates a BioPython pairwise aligner object with the specified parameters
@@ -244,6 +272,7 @@ def _get_id_sequence_dict(
         db: DatabaseConnector,
         ids: list[str] = [],
         node_type: str = "Protein",
+        region_based_sequence: Optional[str] = None,
     ) -> dict[str, str]:
         """Gets all sequences from the database and returns them in a dictionary.
         Key is the accession id and value is the sequence.
@@ -258,20 +287,51 @@ def _get_id_sequence_dict(
         """
 
         if not ids:
-            query = f"""
-            MATCH (p:{node_type})
-            RETURN p.accession_id AS accession_id, p.sequence AS sequence
-            """
-            nodes = db.execute_read(query)
-        else:
-            query = f"""
-            MATCH (p:{node_type})
-            WHERE p.accession_id IN $ids
-            RETURN p.accession_id AS accession_id, p.sequence AS sequence
-            """
-            nodes = db.execute_read(query, {"ids": ids})
+            if region_based_sequence is not None:
+                query = f"""
+                MATCH (p:{node_type})-[e:HAS_REGION]->(r:Region {{annotation: $region_based_sequence}})
+                RETURN p.accession_id AS accession_id, e.start AS start, e.end AS end, p.sequence AS sequence
+                """
+                nodes = db.execute_read(
+                    query,
+                    parameters={"region_based_sequence": region_based_sequence},
+                )
 
-        return {node["accession_id"]: node["sequence"] for node in nodes}
+            else:
+                query = f"""
+                MATCH (p:{node_type})
+                RETURN p.accession_id AS accession_id, p.sequence AS sequence
+                """
+                nodes = db.execute_read(query)
+        else:
+            if region_based_sequence is not None:
+                query = f"""
+                MATCH (p:{node_type})-[e:HAS_REGION]->(r:Region {{annotation: $region_based_sequence}})
+                WHERE p.accession_id IN $ids
+                RETURN p.accession_id AS accession_id, e.start AS start, e.end AS end, p.sequence AS sequence
+                """
+                nodes = db.execute_read(
+                    query,
+                    parameters={
+                        "ids": ids,
+                        "region_based_sequence": region_based_sequence,
+                    },
+                )
+            else:
+                query = f"""
+                MATCH (p:{node_type})
+                WHERE p.accession_id IN $ids
+                RETURN p.accession_id AS accession_id, p.sequence AS sequence
+                """
+                nodes = db.execute_read(query, parameters={"ids": ids})
+
+        if region_based_sequence is not None:
+            return {
+                node["accession_id"]: node["sequence"][node["start"] : node["end"]]
+                for node in nodes
+            }
+        else:
+            return {node["accession_id"]: node["sequence"] for node in nodes}
 
     def _load_substitution_matrix(self) -> "BioSubstitutionMatrix":
         from Bio.Align import substitution_matrices
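
With these changes an alignment can be restricted to an annotated region, in which case the PAIRWISE_ALIGNED relationship is attached to the Region nodes rather than to the sequence nodes. A hedged usage sketch (assuming the aligner class in this module is importable as `PairwiseAligner` and that `eedb` is a connected Pyeed instance):

    from pyeed.analysis.sequence_alignment import PairwiseAligner

    aligner = PairwiseAligner()
    aligner.align_multipairwise(
        ids=["AF190695.1", "JX042489.1"],
        db=eedb.db,
        node_type="DNA",
        region_based_sequence="coding sequence",  # align only the annotated CDS slices
        return_results=False,                     # write results straight to the graph
    )
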
diff --git a/src/pyeed/analysis/standard_numbering.py b/src/pyeed/analysis/standard_numbering.py
index 04d78d96..8d6bad66 100644
--- a/src/pyeed/analysis/standard_numbering.py
+++ b/src/pyeed/analysis/standard_numbering.py
@@ -41,7 +41,11 @@ def __init__(self, name: str) -> None:
         self.name = name
 
     def get_node_base_sequence(
-        self, base_sequence_id: str, db: DatabaseConnector, node_type: str = "Protein"
+        self,
+        base_sequence_id: str,
+        db: DatabaseConnector,
+        node_type: str = "Protein",
+        region_based_sequence: Optional[str] = None,
     ) -> dict[str, str]:
         """
         Retrieve the base node sequence from the database for a given accession id.
@@ -52,21 +56,36 @@ def get_node_base_sequence(
         Args:
             base_sequence_id: The accession id of the base node sequence.
             db: The database connector instance to perform the query.
-
+            region_based_sequence: The annotation of the region to use for the numbering. Default is None.
         Returns:
             A dictionary with keys 'id' and 'sequence' holding the node type id and its sequence.
         """
-        query = f"""
-        MATCH (p:{node_type})
-        WHERE p.accession_id = '{base_sequence_id}'
-        RETURN p.accession_id AS accession_id, p.sequence AS sequence
-        """
+        if region_based_sequence:
+            query = f"""
+            MATCH (p:{node_type})-[e:HAS_REGION]->(r:Region {{annotation: '{region_based_sequence}'}})
+            WHERE p.accession_id = '{base_sequence_id}'
+            RETURN p.accession_id AS accession_id, e.start AS start, e.end AS end, p.sequence AS sequence
+            """
+        else:
+            query = f"""
+            MATCH (p:{node_type})
+            WHERE p.accession_id = '{base_sequence_id}'
+            RETURN p.accession_id AS accession_id, p.sequence AS sequence
+            """
         base_sequence_read = db.execute_read(query)
         # Assume the first returned record is the desired base sequence
-        base_sequence = {
-            "id": base_sequence_read[0]["accession_id"],
-            "sequence": base_sequence_read[0]["sequence"],
-        }
+        if region_based_sequence:
+            base_sequence = {
+                "id": base_sequence_read[0]["accession_id"],
+                "sequence": base_sequence_read[0]["sequence"][
+                    base_sequence_read[0]["start"] : base_sequence_read[0]["end"]
+                ],
+            }
+        else:
+            base_sequence = {
+                "id": base_sequence_read[0]["accession_id"],
+                "sequence": base_sequence_read[0]["sequence"],
+            }
         return base_sequence
 
     def save_positions(
@@ -74,6 +93,7 @@ def save_positions(
         db: DatabaseConnector,
         positions: dict[str, list[str]],
         node_type: str = "Protein",
+        region_based_sequence: Optional[str] = None,
     ) -> None:
         """
         Save the calculated numbering positions for each protein into the database.
@@ -84,14 +104,25 @@ def save_positions(
 
         Args:
             db: The database connector instance used to execute the write queries.
+            positions: A dictionary mapping protein accession ids to lists of numbering positions.
+            node_type: The type of node to process. Default is "Protein".
+            region_based_sequence: Annotation of the region the numbering is attached to; if set, positions are stored on that Region. Default is None.
         """
         for protein_id in positions:
-            query = f"""
-                MATCH (p:{node_type} {{accession_id: '{protein_id}'}})
-                MATCH (s:StandardNumbering {{name: '{self.name}'}})
-                MERGE (p)-[r:HAS_STANDARD_NUMBERING]->(s)
-                SET r.positions = {str(positions[protein_id])}
-            """
+            if region_based_sequence:
+                query = f"""
+                    MATCH (p:{node_type} {{accession_id: '{protein_id}'}})-[e:HAS_REGION]->(r:Region {{annotation: '{region_based_sequence}'}})
+                    MATCH (s:StandardNumbering {{name: '{self.name}'}})
+                    MERGE (r)-[rel:HAS_STANDARD_NUMBERING]->(s)
+                    SET rel.positions = {str(positions[protein_id])}
+                """
+            else:
+                query = f"""
+                    MATCH (p:{node_type} {{accession_id: '{protein_id}'}})
+                    MATCH (s:StandardNumbering {{name: '{self.name}'}})
+                    MERGE (p)-[rel:HAS_STANDARD_NUMBERING]->(s)
+                    SET rel.positions = {str(positions[protein_id])}
+                """
             # Execute the write query to update the standard numbering relationship.
             db.execute_write(query)
 
@@ -317,6 +348,7 @@ def apply_standard_numbering_pairwise(
         list_of_seq_ids: Optional[List[str]] = None,
         return_positions: bool = False,
         node_type: str = "Protein",
+        region_based_sequence: Optional[str] = None,
     ) -> Optional[Dict[str, List[str]]]:
         """
         Apply standard numbering via pairwise alignment using a base sequence.
@@ -332,7 +364,7 @@ def apply_standard_numbering_pairwise(
             list_of_seq_ids: An optional list of node type ids to process. If None, all node type ids are used.
             return_positions: If True, the method returns the computed positions dictionary after processing.
             node_type: The type of node to process. Default is "Protein".
-
+            region_based_sequence: The annotation of the region to use for the numbering. Default is None.
         Raises:
             ValueError: If the pairwise alignment fails and returns no results.
         """
@@ -362,15 +394,33 @@ def apply_standard_numbering_pairwise(
             pairs.append((base_sequence_id, node_id))
 
         # check if the pairs are already existing with the same name under the same standard numbering node
-        query = f"""
-        MATCH (s:StandardNumbering {{name: $name}})
-        MATCH (p:{node_type})-[r:HAS_STANDARD_NUMBERING]->(s)
-        WHERE p.accession_id IN $list_of_seq_ids
-        RETURN p.accession_id AS accession_id
-        """
-        results = db.execute_read(
-            query, parameters={"list_of_seq_ids": list_of_seq_ids, "name": self.name}
-        )
+        if node_type == "DNA" and region_based_sequence is not None:
+            query = """
+            MATCH (s:StandardNumbering {name: $name})
+            MATCH (p:DNA)-[:HAS_REGION]->(r:Region {annotation: $region_based_sequence})-[rel:HAS_STANDARD_NUMBERING]->(s)
+            WHERE p.accession_id IN $list_of_seq_ids
+            RETURN p.accession_id AS accession_id
+            """
+
+            results = db.execute_read(
+                query,
+                parameters={
+                    "list_of_seq_ids": list_of_seq_ids,
+                    "name": self.name,
+                    "region_based_sequence": region_based_sequence,
+                },
+            )
+        else:
+            query = f"""
+            MATCH (s:StandardNumbering {{name: $name}})
+            MATCH (p:{node_type})-[rel:HAS_STANDARD_NUMBERING]->(s)
+            WHERE p.accession_id IN $list_of_seq_ids
+            RETURN p.accession_id AS accession_id
+            """
+            results = db.execute_read(
+                query,
+                parameters={"list_of_seq_ids": list_of_seq_ids, "name": self.name},
+            )
         if results is not None:
             for row in results:
                 if row is not None:
@@ -398,6 +448,7 @@ def apply_standard_numbering_pairwise(
             db=db,
             pairs=pairs,  # List of sequence pairs to be aligned
             node_type=node_type,
+            region_based_sequence=region_based_sequence,
         )
 
         logger.info(f"Pairwise alignment results: {results_pairwise}")
@@ -435,7 +486,7 @@ def apply_standard_numbering_pairwise(
         )
 
         # Update the database with the calculated positions.
-        self.save_positions(db, positions, node_type)
+        self.save_positions(db, positions, node_type, region_based_sequence)
 
         if return_positions:
             return positions
@@ -447,6 +498,7 @@ def apply_standard_numbering(
         db: DatabaseConnector,
         list_of_seq_ids: Optional[List[str]] = None,
         node_type: str = "Protein",
+        region_based_sequence: Optional[str] = None,
     ) -> None:
         """
         Apply a standard numbering scheme to a collection of nodes using multiple sequence alignment.
@@ -460,6 +512,7 @@ def apply_standard_numbering(
             db: DatabaseConnector instance used for executing queries.
             list_of_seq_ids: An optional list of specific node type ids to process. If None, all node type ids are used.
             node_type: The type of node to process. Default is "Protein".
+            region_based_sequence: The annotation of the region to use for the numbering. Default is None.
         """
 
         if list_of_seq_ids is None:
@@ -489,12 +542,36 @@ def apply_standard_numbering(
             nodes_read = []
         else:
             nodes_read = query_result
-        nodes_dict = {node["accession_id"]: node["sequence"] for node in nodes_read}
+
+        if node_type == "DNA" and region_based_sequence is not None:
+            # then the sequence is a region based sequence.
+            # get the region objects for each of the nodes as well
+            query = f"""
+            MATCH (p:{node_type})-[e:HAS_REGION]->(r:Region {{annotation: $region_based_sequence}})
+            WHERE p.accession_id IN $list_of_seq_ids
+            RETURN p.accession_id AS accession_id, e.start AS start, e.end AS end, p.sequence AS sequence
+            """
+            region_read = db.execute_read(
+                query,
+                parameters={
+                    "list_of_seq_ids": list_of_seq_ids,
+                    "region_based_sequence": region_based_sequence,
+                },
+            )
+            nodes_dict = {
+                node["accession_id"]: node["sequence"][node["start"] : node["end"]]
+                for node in region_read
+            }
+
+        else:
+            nodes_dict = {node["accession_id"]: node["sequence"] for node in nodes_read}
 
         logger.info(f"Using {len(nodes_dict)} sequences for standard numbering")
 
         # Obtain the base sequence details from the database.
-        base_sequence = self.get_node_base_sequence(base_sequence_id, db, node_type)
+        base_sequence = self.get_node_base_sequence(
+            base_sequence_id, db, node_type, region_based_sequence
+        )
 
         # Remove the base sequence from the nodes list to prevent duplicate alignment.
         if base_sequence_id in nodes_dict:
@@ -525,4 +602,4 @@ def apply_standard_numbering(
         )
 
         # Update the database with the relationships between nodes and standard numbering.
-        self.save_positions(db, positions, node_type)
+        self.save_positions(db, positions, node_type, region_based_sequence)
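
After `save_positions` runs with a region annotation, the numbering hangs off the Region node instead of the sequence node, i.e. the graph pattern is `(DNA)-[:HAS_REGION]->(Region)-[:HAS_STANDARD_NUMBERING {positions}]->(StandardNumbering)`. A sketch of reading it back (names mirror the queries above; `eedb` is assumed to be a connected Pyeed instance):

    rows = eedb.db.execute_read(
        """
        MATCH (d:DNA)-[:HAS_REGION]->(:Region {annotation: 'coding sequence'})
              -[rel:HAS_STANDARD_NUMBERING]->(s:StandardNumbering {name: $name})
        RETURN d.accession_id AS accession_id, rel.positions AS positions
        """,
        {"name": "test_standard_numbering_dna_pairwise"},
    )
    for row in rows:
        print(row["accession_id"], row["positions"][:5])  # first few numbered positions
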
diff --git a/src/pyeed/main.py b/src/pyeed/main.py
index 25a1b225..ea3a6462 100644
--- a/src/pyeed/main.py
+++ b/src/pyeed/main.py
@@ -426,3 +426,20 @@ def fetch_dna_entries_for_proteins(self, ids: list[str] | None = None) -> None:
                     f"Error processing relationship batch {i//BATCH_SIZE + 1}: {str(e)}"
                 )
                 continue
+
+    def create_coding_sequences_regions(self) -> None:
+        """
+        Creates coding sequence regions for all proteins in the database.
+
+        For each protein with a nucleotide link, it creates a Region node annotated as 'coding sequence'
+        and connects it to the corresponding DNA record, storing the nucleotide start and end positions on the relationship.
+        """
+        query = """
+        MATCH (p:Protein)
+        WHERE p.nucleotide_id IS NOT NULL
+        MATCH (d:DNA {accession_id: p.nucleotide_id})
+        CREATE (r:Region {annotation: 'coding sequence'})
+        WITH p, r, d
+        CREATE (d)-[:HAS_REGION {start: p.nucleotide_start, end: p.nucleotide_end}]->(r)
+        """
+        self.db.execute_write(query)
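
The Cypher above creates one Region node per protein that carries a nucleotide link and attaches it to the matching DNA record, with the CDS boundaries stored on the HAS_REGION relationship. A quick sanity check after running it (sketch, assuming a connected `eedb`):

    rows = eedb.db.execute_read(
        """
        MATCH (d:DNA)-[e:HAS_REGION]->(r:Region {annotation: 'coding sequence'})
        RETURN d.accession_id AS dna, e.start AS start, e.end AS end
        """
    )
    for row in rows:
        assert 0 <= row["start"] < row["end"]  # slice bounds used for sequence[start:end]
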
diff --git a/src/pyeed/model.py b/src/pyeed/model.py
index 869a4091..128e7eab 100644
--- a/src/pyeed/model.py
+++ b/src/pyeed/model.py
@@ -147,6 +147,118 @@ class Organism(StrictStructuredNode):
     name = StringProperty()
 
 
+class Mutation(StructuredRel):  # type: ignore
+    """A relationship representing mutations between two sequences."""
+
+    from_positions = ArrayProperty(IntegerProperty(), required=True)
+    to_positions = ArrayProperty(IntegerProperty(), required=True)
+    from_monomers = ArrayProperty(StringProperty(), required=True)
+    to_monomers = ArrayProperty(StringProperty(), required=True)
+
+    @classmethod
+    def validate_and_connect(
+        cls,
+        molecule1: Any,
+        molecule2: Any,
+        from_positions: list[int],
+        to_positions: list[int],
+        from_monomers: list[str],
+        to_monomers: list[str],
+    ) -> "Mutation":
+        """Validates the mutations and connects the two molecules, ensuring that no double mutations
+        occur – i.e. if a mutation affecting any of the same positions already exists between these proteins,
+        a new mutation cannot be created.
+
+        Raises:
+            ValueError: If input lists have different lengths or if a mutation for any of these positions
+                        already exists.
+        """
+        # Instead of checking *any* mutation, retrieve all mutation relationships between these proteins.
+        # Here molecule1.mutation.relationship(molecule2) returns a list of mutation relationship instances.
+        existing_mutations = molecule1.mutation.relationship(molecule2)
+
+        if existing_mutations:
+            raise ValueError(
+                "A mutation relationship affecting one or more of these positions already exists between these proteins."
+            )
+
+        if (
+            len(from_positions) != len(to_positions)
+            or len(from_positions) != len(from_monomers)
+            or len(from_positions) != len(to_monomers)
+        ):
+            raise ValueError("All input lists must have the same length.")
+
+        for from_position, from_monomer in zip(from_positions, from_monomers):
+            if molecule1.sequence[from_position] != from_monomer:
+                raise ValueError(
+                    f"Monomer '{from_monomer}' does not match the sequence {molecule1.accession_id} at position {from_position}"
+                )
+
+        for to_position, to_monomer in zip(to_positions, to_monomers):
+            if molecule2.sequence[to_position] != to_monomer:
+                raise ValueError(
+                    f"Monomer '{to_monomer}' does not match the sequence {molecule2.accession_id} at position {to_position}"
+                )
+
+        molecule1.mutation.connect(
+            molecule2,
+            {
+                "from_positions": from_positions,
+                "to_positions": to_positions,
+                "from_monomers": from_monomers,
+                "to_monomers": to_monomers,
+            },
+        )
+
+        return cls(
+            from_positions=from_positions,
+            to_positions=to_positions,
+            from_monomers=from_monomers,
+            to_monomers=to_monomers,
+        )
+
+    @property
+    def label(self) -> str:
+        """The label of the mutation."""
+        return ",".join(
+            f"{from_monomer}{from_position}{to_monomer}"
+            for from_position, from_monomer, to_monomer in zip(
+                list(self.from_positions),
+                list(self.from_monomers),
+                list(self.to_monomers),
+            )
+        )
+
+
+class StandardNumberingRel(StructuredRel):  # type: ignore
+    positions = ArrayProperty(StringProperty(), required=True)
+
+    @classmethod
+    def validate_and_connect(
+        cls,
+        molecule1: Any,
+        molecule2: Any,
+        positions: list[str],
+    ) -> "StandardNumberingRel":
+        """Validates the positions and connects the two molecules."""
+        molecule1.sequences_protein.connect(
+            molecule2,
+            {
+                "positions": positions,
+            },
+        )
+
+        return cls(
+            positions=positions,
+        )
+
+    @property
+    def label(self) -> str:
+        """The label of the standard numbering."""
+        return f"{self.positions}"
+
+
 class SiteRel(StructuredRel):  # type: ignore
     positions = ArrayProperty(IntegerProperty(), required=True)
 
@@ -188,6 +300,12 @@ class Region(StrictStructuredNode):
         choices=[(e.value, e.name) for e in Annotation], required=True
     )
 
+    # Relationships
+    has_mutation_region = RelationshipTo("Region", "MUTATION", model=Mutation)
+    has_standard_numbering = RelationshipTo(
+        "StandardNumbering", "HAS_STANDARD_NUMBERING", model=StandardNumberingRel
+    )
+
 
 class DNAProteinRel(StructuredRel):  # type: ignore
     """A relationship between a DNA and a protein."""
@@ -264,34 +382,6 @@ def label(self) -> str:
         return str(self.name)
 
 
-class StandardNumberingRel(StructuredRel):  # type: ignore
-    positions = ArrayProperty(StringProperty(), required=True)
-
-    @classmethod
-    def validate_and_connect(
-        cls,
-        molecule1: Any,
-        molecule2: Any,
-        positions: list[str],
-    ) -> "StandardNumberingRel":
-        """Validates the positions and connects the two molecules."""
-        molecule1.sequences_protein.connect(
-            molecule2,
-            {
-                "positions": positions,
-            },
-        )
-
-        return cls(
-            positions=positions,
-        )
-
-    @property
-    def label(self) -> str:
-        """The label of the standard numbering."""
-        return f"{self.positions}"
-
-
 class StandardNumbering(StrictStructuredNode):
     name = StringProperty(required=True, unique_index=True)
     definition = StringProperty(required=True)
@@ -389,90 +479,6 @@ def label(self) -> str:
         return str(self.term)
 
 
-class Mutation(StructuredRel):  # type: ignore
-    """A relationship representing mutations between two sequences."""
-
-    from_positions = ArrayProperty(IntegerProperty(), required=True)
-    to_positions = ArrayProperty(IntegerProperty(), required=True)
-    from_monomers = ArrayProperty(StringProperty(), required=True)
-    to_monomers = ArrayProperty(StringProperty(), required=True)
-
-    @classmethod
-    def validate_and_connect(
-        cls,
-        molecule1: Any,
-        molecule2: Any,
-        from_positions: list[int],
-        to_positions: list[int],
-        from_monomers: list[str],
-        to_monomers: list[str],
-    ) -> "Mutation":
-        """Validates the mutations and connects the two molecules, ensuring that no double mutations
-        occur – i.e. if a mutation affecting any of the same positions already exists between these proteins,
-        a new mutation cannot be created.
-
-        Raises:
-            ValueError: If input lists have different lengths or if a mutation for any of these positions
-                        already exists.
-        """
-        # Instead of checking *any* mutation, retrieve all mutation relationships between these proteins.
-        # Here molecule1.mutation.relationship(molecule2) returns a list of mutation relationship instances.
-        existing_mutations = molecule1.mutation.relationship(molecule2)
-
-        if existing_mutations:
-            raise ValueError(
-                "A mutation relationship affecting one or more of these positions already exists between these proteins."
-            )
-
-        if (
-            len(from_positions) != len(to_positions)
-            or len(from_positions) != len(from_monomers)
-            or len(from_positions) != len(to_monomers)
-        ):
-            raise ValueError("All input lists must have the same length.")
-
-        for from_position, from_monomer in zip(from_positions, from_monomers):
-            if molecule1.sequence[from_position] != from_monomer:
-                raise ValueError(
-                    f"Monomer '{from_monomer}' does not match the sequence {molecule1.accession_id} at position {from_position}"
-                )
-
-        for to_position, to_monomer in zip(to_positions, to_monomers):
-            if molecule2.sequence[to_position] != to_monomer:
-                raise ValueError(
-                    f"Monomer '{to_monomer}' does not match the sequence {molecule2.accession_id} at position {to_position}"
-                )
-
-        molecule1.mutation.connect(
-            molecule2,
-            {
-                "from_positions": from_positions,
-                "to_positions": to_positions,
-                "from_monomers": from_monomers,
-                "to_monomers": to_monomers,
-            },
-        )
-
-        return cls(
-            from_positions=from_positions,
-            to_positions=to_positions,
-            from_monomers=from_monomers,
-            to_monomers=to_monomers,
-        )
-
-    @property
-    def label(self) -> str:
-        """The label of the mutation."""
-        return ",".join(
-            f"{from_monomer}{from_position}{to_monomer}"
-            for from_position, from_monomer, to_monomer in zip(
-                list(self.from_positions),
-                list(self.from_monomers),
-                list(self.to_monomers),
-            )
-        )
-
-
 class Protein(StrictStructuredNode):
     """A protein sequence node in the database."""
 
@@ -507,6 +513,9 @@ class Protein(StrictStructuredNode):
     pairwise_aligned = RelationshipTo(
         "Protein", "PAIRWISE_ALIGNED", model=PairwiseAlignmentResult
     )
+    has_standard_numbering = RelationshipTo(
+        "StandardNumbering", "HAS_STANDARD_NUMBERING", model=StandardNumberingRel
+    )
 
 
 class DNA(StrictStructuredNode):
@@ -533,6 +542,9 @@ class DNA(StrictStructuredNode):
     pairwise_aligned = RelationshipTo(
         "DNA", "PAIRWISE_ALIGNED", model=PairwiseAlignmentResult
     )
+    has_standard_numbering = RelationshipTo(
+        "StandardNumbering", "HAS_STANDARD_NUMBERING", model=StandardNumberingRel
+    )
 
 
 class CustomRealationship(StructuredRel):  # type: ignore
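
PATCH 04 also reorders model.py: `Mutation` and `StandardNumberingRel` move ahead of `Region` so the new Region relationships can reference them, and `Protein`, `DNA`, and `Region` each gain a HAS_STANDARD_NUMBERING relationship. The `Mutation.label` property renders mutations in the usual compact notation; for the values reported in the notebook above:

    # from_positions=[272, 125], from_monomers=["D", "V"], to_monomers=["N", "I"]
    label = ",".join(
        f"{f}{p}{t}"
        for p, f, t in zip([272, 125], ["D", "V"], ["N", "I"])
    )
    assert label == "D272N,V125I"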

From 44ca574c2f5470d4080cd3301aa75aad615c7c7c Mon Sep 17 00:00:00 2001
From: Niklas Abraham GPU 
Date: Wed, 19 Mar 2025 13:01:40 +0000
Subject: [PATCH 05/39] updated notebooks

---
 docs/usage/mutation_analysis.ipynb  | 38 ++++++++---------
 docs/usage/standard_numbering.ipynb | 65 +++++++++++++++++++++++------
 2 files changed, 71 insertions(+), 32 deletions(-)

diff --git a/docs/usage/mutation_analysis.ipynb b/docs/usage/mutation_analysis.ipynb
index 86ed7ea6..8dbe2b07 100644
--- a/docs/usage/mutation_analysis.ipynb
+++ b/docs/usage/mutation_analysis.ipynb
@@ -11,7 +11,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 21,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -37,7 +37,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 22,
    "metadata": {},
    "outputs": [
     {
@@ -56,7 +56,7 @@
     "\n",
     "eedb = Pyeed(uri, user=user, password=password)\n",
     "\n",
-    "eedb.db.wipe_database(date=\"2025-03-17\")"
+    "eedb.db.wipe_database(date=\"2025-03-19\")"
    ]
   },
   {
@@ -75,7 +75,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 23,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -101,13 +101,13 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 24,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "42e2c4d2f86f47eb970236f6f26eda6c",
+       "model_id": "19d06e22f1a74d589f68135fcdbd232f",
        "version_major": 2,
        "version_minor": 0
       },
@@ -137,7 +137,7 @@
     "    base_sequence_id=\"AAM15527.1\", db=eedb.db, list_of_seq_ids=ids\n",
     ")\n",
     "\n",
-    "sn_dna = StandardNumberingTool(name=\"test_standard_numbering_dna\")\n",
+    "sn_dna = StandardNumberingTool(name=\"test_standard_numbering_dna_pairwise\")\n",
     "\n",
     "sn_dna.apply_standard_numbering_pairwise(\n",
     "    base_sequence_id=\"AF190695.1\", db=eedb.db, node_type=\"DNA\", region_based_sequence='coding sequence'\n",
@@ -162,7 +162,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 25,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -179,7 +179,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 26,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -187,7 +187,7 @@
     "\n",
     "seq1 = \"AF190695.1\"\n",
     "seq2 = \"JX042489.1\"\n",
-    "name_of_standard_numbering_tool = \"test_standard_numbering_dna\"\n",
+    "name_of_standard_numbering_tool = \"test_standard_numbering_dna_pairwise\"\n",
     "\n",
     "mutations_dna = md.get_mutations_between_sequences(\n",
     "    seq1, seq2, eedb.db, name_of_standard_numbering_tool, node_type=\"DNA\", region_based_sequence=\"coding sequence\"\n",
@@ -209,14 +209,14 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 27,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "{'from_positions': [272, 125, 241], 'to_positions': [272, 125, 241], 'from_monomers': ['D', 'V', 'R'], 'to_monomers': ['N', 'I', 'S']}\n"
+      "{'from_positions': [125, 272, 241], 'to_positions': [125, 272, 241], 'from_monomers': ['V', 'D', 'R'], 'to_monomers': ['I', 'N', 'S']}\n"
      ]
     }
    ],
@@ -242,21 +242,21 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 28,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Mutation on position 705 -> 705 with a nucleotide change of G -> A\n",
-      "Mutation on position 198 -> 198 with a nucleotide change of C -> A\n",
       "Mutation on position 720 -> 720 with a nucleotide change of A -> C\n",
-      "Mutation on position 473 -> 473 with a nucleotide change of T -> C\n",
-      "Mutation on position 17 -> 17 with a nucleotide change of T -> C\n",
-      "Mutation on position 716 -> 716 with a nucleotide change of G -> A\n",
       "Mutation on position 137 -> 137 with a nucleotide change of A -> G\n",
-      "Mutation on position 395 -> 395 with a nucleotide change of T -> G\n"
+      "Mutation on position 705 -> 705 with a nucleotide change of G -> A\n",
+      "Mutation on position 395 -> 395 with a nucleotide change of T -> G\n",
+      "Mutation on position 716 -> 716 with a nucleotide change of G -> A\n",
+      "Mutation on position 17 -> 17 with a nucleotide change of T -> C\n",
+      "Mutation on position 198 -> 198 with a nucleotide change of C -> A\n",
+      "Mutation on position 473 -> 473 with a nucleotide change of T -> C\n"
      ]
     }
    ],
diff --git a/docs/usage/standard_numbering.ipynb b/docs/usage/standard_numbering.ipynb
index d2132d3c..f2bf1004 100644
--- a/docs/usage/standard_numbering.ipynb
+++ b/docs/usage/standard_numbering.ipynb
@@ -16,7 +16,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 36,
+   "execution_count": 1,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -35,7 +35,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 37,
+   "execution_count": 2,
    "metadata": {},
    "outputs": [
     {
@@ -94,32 +94,33 @@
     "password = \"12345678\"\n",
     "\n",
     "eedb = Pyeed(uri, user=user, password=password)\n",
-    "eedb.db.wipe_database(date=\"2025-03-14\")\n",
+    "eedb.db.wipe_database(date=\"2025-03-19\")\n",
     "\n",
     "eedb.db.initialize_db_constraints(user=user, password=password)\n"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 38,
+   "execution_count": 3,
    "metadata": {},
    "outputs": [],
    "source": [
     "ids = [\"AAM15527.1\", \"AAF05614.1\", \"AFN21551.1\", \"CAA76794.1\", \"AGQ50511.1\"]\n",
     "\n",
     "eedb.fetch_from_primary_db(ids, db=\"ncbi_protein\")\n",
-    "eedb.fetch_dna_entries_for_proteins()"
+    "eedb.fetch_dna_entries_for_proteins()\n",
+    "eedb.create_coding_sequences_regions()"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 39,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "6470d4b80eb648e1af401b2e59cbe95b",
+       "model_id": "9e75d0d45b214ded9bae2253574798c2",
        "version_major": 2,
        "version_minor": 0
       },
@@ -152,13 +153,13 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 40,
+   "execution_count": 5,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "47406b43c98e4b31ba41eb15f7cdd000",
+       "model_id": "0bbf1bf2c7454c6e9b224de596b51835",
        "version_major": 2,
        "version_minor": 0
       },
@@ -188,7 +189,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 41,
+   "execution_count": 6,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -201,7 +202,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 42,
+   "execution_count": 7,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -214,13 +215,13 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 43,
+   "execution_count": 8,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "6a3fb35a08714174b353558dedff592c",
+       "model_id": "1269c760e0f743b2937ce95400d3240b",
        "version_major": 2,
        "version_minor": 0
       },
@@ -250,6 +251,44 @@
     ")"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "9484ba5fa2cb4d89b8371b0554d54510",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Output()"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "\n"
+      ],
+      "text/plain": []
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "sn_dna_region = StandardNumberingTool(name=\"test_standard_numbering_dna_pairwise_region\")\n",
+    "\n",
+    "sn_dna_region.apply_standard_numbering_pairwise(\n",
+    "    base_sequence_id=\"AF190695.1\", db=eedb.db, node_type=\"DNA\", region_based_sequence='coding sequence'\n",
+    ")"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},

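A condensed sketch of the workflow these notebook cells exercise; the import paths and connection details are assumptions for illustration, while the calls themselves mirror the cells above:

    from pyeed.main import Pyeed  # import path assumed
    from pyeed.analysis.standard_numbering import StandardNumberingTool

    eedb = Pyeed("bolt://localhost:7687", user="neo4j", password="12345678")

    ids = ["AAM15527.1", "AAF05614.1", "AFN21551.1", "CAA76794.1", "AGQ50511.1"]
    eedb.fetch_from_primary_db(ids, db="ncbi_protein")
    eedb.fetch_dna_entries_for_proteins()
    eedb.create_coding_sequences_regions()

    # number DNA sequences against a base sequence, restricted to coding regions
    sn_dna = StandardNumberingTool(name="test_standard_numbering_dna_pairwise")
    sn_dna.apply_standard_numbering_pairwise(
        base_sequence_id="AF190695.1",
        db=eedb.db,
        node_type="DNA",
        region_based_sequence="coding sequence",
    )
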
From 9037be2b0a7d457c7c2ffb38fd2e645854c14cdb Mon Sep 17 00:00:00 2001
From: Niklas Abraham GPU 
Date: Wed, 19 Mar 2025 16:06:14 +0000
Subject: [PATCH 06/39] build in multiple region support for big DNA with
 multiple coding sequences via region id

---
 docs/usage/mutation_analysis.ipynb       | 54 +++++++++++++++-------
 docs/usage/standard_numbering.ipynb      | 52 +++++++++++++++------
 src/pyeed/analysis/mutation_detection.py | 50 ++++++++++++--------
 src/pyeed/analysis/sequence_alignment.py | 58 ++++++++++++------------
 src/pyeed/analysis/standard_numbering.py | 58 ++++++++++++++----------
 src/pyeed/main.py                        |  2 +-
 src/pyeed/model.py                       |  1 +
 7 files changed, 170 insertions(+), 105 deletions(-)

diff --git a/docs/usage/mutation_analysis.ipynb b/docs/usage/mutation_analysis.ipynb
index 8dbe2b07..9b31c996 100644
--- a/docs/usage/mutation_analysis.ipynb
+++ b/docs/usage/mutation_analysis.ipynb
@@ -11,7 +11,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 21,
+   "execution_count": 9,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -37,7 +37,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 22,
+   "execution_count": 10,
    "metadata": {},
    "outputs": [
     {
@@ -75,7 +75,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 23,
+   "execution_count": 11,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -101,13 +101,13 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 24,
+   "execution_count": 12,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "19d06e22f1a74d589f68135fcdbd232f",
+       "model_id": "6ed852d438ab480fa4d1c6129eacfd26",
        "version_major": 2,
        "version_minor": 0
       },
@@ -118,6 +118,14 @@
      "metadata": {},
      "output_type": "display_data"
     },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Region ids: [143, 129, 128, 69, 9]\n",
+      "len of ids: 5\n"
+     ]
+    },
     {
      "data": {
       "text/html": [
@@ -139,8 +147,19 @@
     "\n",
     "sn_dna = StandardNumberingTool(name=\"test_standard_numbering_dna_pairwise\")\n",
     "\n",
+    "query_get_region_ids = \"\"\"\n",
+    "MATCH (p:Protein)<-[rel:ENCODES]-(d:DNA)-[rel2:HAS_REGION]->(r:Region)\n",
+    "WHERE r.annotation = $region_annotation AND p.accession_id IN $protein_id\n",
+    "RETURN id(r)\n",
+    "\"\"\"\n",
+    "\n",
+    "region_ids = eedb.db.execute_read(query_get_region_ids, parameters={\"protein_id\": ids, \"region_annotation\": \"coding sequence\"})\n",
+    "region_ids = [id['id(r)'] for id in region_ids]\n",
+    "print(f\"Region ids: {region_ids}\")\n",
+    "print(f\"len of ids: {len(ids)}\")\n",
+    "\n",
     "sn_dna.apply_standard_numbering_pairwise(\n",
-    "    base_sequence_id=\"AF190695.1\", db=eedb.db, node_type=\"DNA\", region_based_sequence='coding sequence'\n",
+    "    base_sequence_id=\"AF190695.1\", db=eedb.db, node_type=\"DNA\", region_ids_neo4j=region_ids\n",
     ")"
    ]
   },
@@ -162,7 +181,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 25,
+   "execution_count": 13,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -179,18 +198,19 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 26,
+   "execution_count": 14,
    "metadata": {},
    "outputs": [],
    "source": [
     "md = MutationDetection()\n",
     "\n",
+    "\n",
     "seq1 = \"AF190695.1\"\n",
     "seq2 = \"JX042489.1\"\n",
     "name_of_standard_numbering_tool = \"test_standard_numbering_dna_pairwise\"\n",
     "\n",
     "mutations_dna = md.get_mutations_between_sequences(\n",
-    "    seq1, seq2, eedb.db, name_of_standard_numbering_tool, node_type=\"DNA\", region_based_sequence=\"coding sequence\"\n",
+    "    seq1, seq2, eedb.db, name_of_standard_numbering_tool, node_type=\"DNA\", region_ids_neo4j=region_ids\n",
     ")"
    ]
   },
@@ -209,14 +229,14 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 27,
+   "execution_count": 15,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "{'from_positions': [125, 272, 241], 'to_positions': [125, 272, 241], 'from_monomers': ['V', 'D', 'R'], 'to_monomers': ['I', 'N', 'S']}\n"
+      "{'from_positions': [241, 125, 272], 'to_positions': [241, 125, 272], 'from_monomers': ['R', 'V', 'D'], 'to_monomers': ['S', 'I', 'N']}\n"
      ]
     }
    ],
@@ -242,21 +262,21 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 28,
+   "execution_count": 16,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Mutation on position 720 -> 720 with a nucleotide change of A -> C\n",
-      "Mutation on position 137 -> 137 with a nucleotide change of A -> G\n",
       "Mutation on position 705 -> 705 with a nucleotide change of G -> A\n",
       "Mutation on position 395 -> 395 with a nucleotide change of T -> G\n",
-      "Mutation on position 716 -> 716 with a nucleotide change of G -> A\n",
+      "Mutation on position 137 -> 137 with a nucleotide change of A -> G\n",
       "Mutation on position 17 -> 17 with a nucleotide change of T -> C\n",
-      "Mutation on position 198 -> 198 with a nucleotide change of C -> A\n",
-      "Mutation on position 473 -> 473 with a nucleotide change of T -> C\n"
+      "Mutation on position 473 -> 473 with a nucleotide change of T -> C\n",
+      "Mutation on position 716 -> 716 with a nucleotide change of G -> A\n",
+      "Mutation on position 720 -> 720 with a nucleotide change of A -> C\n",
+      "Mutation on position 198 -> 198 with a nucleotide change of C -> A\n"
      ]
     }
    ],
diff --git a/docs/usage/standard_numbering.ipynb b/docs/usage/standard_numbering.ipynb
index f2bf1004..cd84cad9 100644
--- a/docs/usage/standard_numbering.ipynb
+++ b/docs/usage/standard_numbering.ipynb
@@ -16,7 +16,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 19,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -35,7 +35,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 20,
    "metadata": {},
    "outputs": [
     {
@@ -101,7 +101,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 21,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -114,13 +114,13 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 22,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "9e75d0d45b214ded9bae2253574798c2",
+       "model_id": "0f961c177f1444fb8190669487a1cb89",
        "version_major": 2,
        "version_minor": 0
       },
@@ -153,13 +153,13 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 23,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "0bbf1bf2c7454c6e9b224de596b51835",
+       "model_id": "b7c38c15de4c4fa2bcf3f0a223d527b0",
        "version_major": 2,
        "version_minor": 0
       },
@@ -189,7 +189,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 24,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -202,7 +202,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 25,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -215,13 +215,13 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 26,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "1269c760e0f743b2937ce95400d3240b",
+       "model_id": "b204fcf51571421b8fff36de4e9ba9dd",
        "version_major": 2,
        "version_minor": 0
       },
@@ -253,13 +253,13 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 27,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "9484ba5fa2cb4d89b8371b0554d54510",
+       "model_id": "526ca870c8fb4b76b2df332a4b06af18",
        "version_major": 2,
        "version_minor": 0
       },
@@ -270,6 +270,14 @@
      "metadata": {},
      "output_type": "display_data"
     },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Region ids: [13, 0, 41, 38, 19]\n",
+      "len of ids: 5\n"
+     ]
+    },
     {
      "data": {
       "text/html": [
@@ -284,8 +292,24 @@
    "source": [
     "sn_dna_region = StandardNumberingTool(name=\"test_standard_numbering_dna_pairwise_region\")\n",
     "\n",
+    "\n",
+    "ids = [\"AAM15527.1\", \"AAF05614.1\", \"AFN21551.1\", \"CAA76794.1\", \"AGQ50511.1\"]\n",
+    "\n",
+    "\n",
+    "query_get_region_ids = \"\"\"\n",
+    "MATCH (p:Protein)<-[rel:ENCODES]-(d:DNA)-[rel2:HAS_REGION]->(r:Region)\n",
+    "WHERE r.annotation = $region_annotation AND p.accession_id IN $protein_id\n",
+    "RETURN id(r)\n",
+    "\"\"\"\n",
+    "\n",
+    "region_ids = eedb.db.execute_read(query_get_region_ids, parameters={\"protein_id\": ids, \"region_annotation\": \"coding sequence\"})\n",
+    "region_ids = [id['id(r)'] for id in region_ids]\n",
+    "print(f\"Region ids: {region_ids}\")\n",
+    "print(f\"len of ids: {len(ids)}\")\n",
+    "\n",
+    "\n",
     "sn_dna_region.apply_standard_numbering_pairwise(\n",
-    "    base_sequence_id=\"AF190695.1\", db=eedb.db, node_type=\"DNA\", region_based_sequence='coding sequence'\n",
+    "    base_sequence_id=\"AF190695.1\", db=eedb.db, node_type=\"DNA\", region_ids_neo4j=region_ids\n",
     ")"
    ]
   },
diff --git a/src/pyeed/analysis/mutation_detection.py b/src/pyeed/analysis/mutation_detection.py
index d469ef3d..c2562ae1 100644
--- a/src/pyeed/analysis/mutation_detection.py
+++ b/src/pyeed/analysis/mutation_detection.py
@@ -15,7 +15,7 @@ def get_sequence_data(
         db: DatabaseConnector,
         standard_numbering_tool_name: str,
         node_type: str = "Protein",
-        region_based_sequence: Optional[str] = None,
+        region_ids_neo4j: Optional[list[int]] = None,
     ) -> tuple[dict[str, str], dict[str, list[str]]]:
         """Fetch sequence and position data for two sequences from the database.
 
@@ -25,7 +25,7 @@ def get_sequence_data(
             db: Database connection instance
             standard_numbering_tool_name: Name of standard numbering tool to use
             node_type: Type of node to use (default: "Protein")
-            region_based_sequence: Annotation of region to use for numbering (default: None)
+            region_ids_neo4j: List of Neo4j region IDs used to cut the sequences to their regions (default: None).
 
         Returns:
             tuple containing:
@@ -35,14 +35,18 @@ def get_sequence_data(
         Raises:
             ValueError: If standard numbering positions not found for both sequences
         """
-        if region_based_sequence is not None:
+        if region_ids_neo4j is not None:
             query = f"""
-            MATCH (p:{node_type})-[rel:HAS_REGION]->(r:Region {{annotation: '{region_based_sequence}'}})-[rel2:HAS_STANDARD_NUMBERING]->(s:StandardNumbering)
+            MATCH (p:{node_type})-[rel:HAS_REGION]->(r:Region)
+            WHERE id(r) IN $region_ids_neo4j
+            MATCH (r)-[rel2:HAS_STANDARD_NUMBERING]->(s:StandardNumbering)
             WHERE p.accession_id IN ['{sequence_id1}', '{sequence_id2}'] 
             AND s.name = '{standard_numbering_tool_name}'
             RETURN p.accession_id as id, p.sequence as sequence, rel2.positions as positions, rel.start as start, rel.end as end
             """
-            results = db.execute_read(query)
+            results = db.execute_read(
+                query, parameters={"region_ids_neo4j": region_ids_neo4j}
+            )
         else:
             query = f"""
             MATCH (p:{node_type})-[r:HAS_STANDARD_NUMBERING]->(s:StandardNumbering)
@@ -56,7 +60,7 @@ def get_sequence_data(
             raise ValueError(
                 f"Could not find standard numbering positions for both sequences {sequence_id1} and {sequence_id2}"
             )
-        if region_based_sequence is not None:
+        if region_ids_neo4j is not None:
             sequences = {
                 results[i]["id"]: results[i]["sequence"][
                     results[i]["start"] : results[i]["end"]
@@ -128,7 +132,7 @@ def save_mutations_to_db(
         sequence_id1: str,
         sequence_id2: str,
         node_type: str = "Protein",
-        region_based_sequence: Optional[str] = None,
+        region_ids_neo4j: Optional[list[int]] = None,
     ) -> None:
         """Save detected mutations to the database.
 
@@ -142,12 +146,16 @@ def save_mutations_to_db(
             sequence_id1: First sequence accession ID
             sequence_id2: Second sequence accession ID
             node_type: Type of node to use (default: "Protein")
-            region_based_sequence: Annotation of region to use for numbering (default: None)
+            region_ids_neo4j: List of Neo4j region IDs used to cut the sequences to their regions (default: None).
         """
         # Check if a mutation relationship already exists between these proteins
-        if region_based_sequence is not None:
+        if region_ids_neo4j is not None:
             query = f"""
-            MATCH (p1:{node_type} {{accession_id: $sequence_id1}})-[rel:HAS_REGION]->(r1:Region {{annotation: $region_based_sequence}})-[rel_mutation:MUTATION]->(r2:Region {{annotation: $region_based_sequence}})<-[rel2:HAS_REGION]-(p2:{node_type} {{accession_id: $sequence_id2}})
+            MATCH (p1:{node_type} {{accession_id: $sequence_id1}})-[rel:HAS_REGION]->(r1:Region)
+            WHERE id(r1) IN $region_ids_neo4j
+            MATCH (r1)-[rel_mutation:MUTATION]->(r2:Region)
+            WHERE id(r2) IN $region_ids_neo4j
+            MATCH (r2)<-[:HAS_REGION]-(p2:{node_type} {{accession_id: $sequence_id2}})
             RETURN rel_mutation
             """
             existing_mutations = db.execute_read(
@@ -155,7 +163,7 @@ def save_mutations_to_db(
                 {
                     "sequence_id1": sequence_id1,
                     "sequence_id2": sequence_id2,
-                    "region_based_sequence": region_based_sequence,
+                    "region_ids_neo4j": region_ids_neo4j,
                 },
             )
         else:
@@ -173,11 +181,15 @@ def save_mutations_to_db(
             )
             return
 
-        if region_based_sequence is not None:
+        if region_ids_neo4j is not None:
             # saving the mutation between the regions
             query = f"""
-            MATCH (r1:Region {{annotation: $region_based_sequence}})<-[rel:HAS_REGION]-(p1:{node_type} {{accession_id: $sequence_id1}})
-            MATCH (r2:Region {{annotation: $region_based_sequence}})<-[rel2:HAS_REGION]-(p2:{node_type} {{accession_id: $sequence_id2}})
+            MATCH (r1:Region)
+            WHERE id(r1) IN $region_ids_neo4j
+            MATCH (r1)<-[:HAS_REGION]-(p1:{node_type} {{accession_id: $sequence_id1}})
+            MATCH (r2:Region)
+            WHERE id(r2) IN $region_ids_neo4j
+            MATCH (r2)<-[:HAS_REGION]-(p2:{node_type} {{accession_id: $sequence_id2}})
             CREATE (r1)-[r:MUTATION]->(r2)
             SET r.from_positions = $from_positions,
                 r.to_positions = $to_positions,
@@ -187,7 +199,7 @@ def save_mutations_to_db(
             params = {
                 "sequence_id1": sequence_id1,
                 "sequence_id2": sequence_id2,
-                "region_based_sequence": region_based_sequence,
+                "region_ids_neo4j": region_ids_neo4j,
                 "from_positions": mutations["from_positions"],
                 "to_positions": mutations["to_positions"],
                 "from_monomers": mutations["from_monomers"],
@@ -227,7 +239,7 @@ def get_mutations_between_sequences(
         save_to_db: bool = True,
         debug: bool = False,
         node_type: str = "Protein",
-        region_based_sequence: Optional[str] = None,
+        region_ids_neo4j: Optional[list[int]] = None,
     ) -> dict[str, list[int | str]]:
         """Get mutations between two sequences using standard numbering.
 
@@ -238,7 +250,7 @@ def get_mutations_between_sequences(
             standard_numbering_tool_name: Name of standard numbering tool to use
             save_to_db: Whether to save mutations to database (default: True)
             node_type: Type of node to use (default: "Protein")
-            region_based_sequence: Annotation of region to use for numbering (default: None)
+            region_ids_neo4j: List of Neo4j region IDs used to cut the sequences to their regions (default: None).
 
         Returns:
             dict containing mutation information:
@@ -256,7 +268,7 @@ def get_mutations_between_sequences(
             db,
             standard_numbering_tool_name,
             node_type,
-            region_based_sequence,
+            region_ids_neo4j,
         )
 
         if debug:
@@ -276,7 +288,7 @@ def get_mutations_between_sequences(
                 sequence_id1,
                 sequence_id2,
                 node_type,
-                region_based_sequence,
+                region_ids_neo4j,
             )
 
         return mutations
diff --git a/src/pyeed/analysis/sequence_alignment.py b/src/pyeed/analysis/sequence_alignment.py
index c130f8d3..3bfd019c 100644
--- a/src/pyeed/analysis/sequence_alignment.py
+++ b/src/pyeed/analysis/sequence_alignment.py
@@ -91,7 +91,8 @@ def align_multipairwise(
         return_results: bool = True,
         pairs: Optional[list[tuple[str, str]]] = None,
         node_type: str = "Protein",
-        region_based_sequence: Optional[str] = None,
+        region_ids_neo4j: Optional[list[int]] = None,
+        num_cores: int = cpu_count() - 1,
     ) -> Optional[list[dict[str, Any]]]:
         """
         Creates all possible pairwise alignments from a dictionary of sequences or from sequence IDs.
@@ -115,7 +116,7 @@ def align_multipairwise(
             pairs (Optional[list[tuple[str, str]]]): A list of tuples, where each tuple contains two
                 sequence IDs to align. If provided, only these pairs will be aligned.
             node_type (str): The type of node to align. Defaults to "Protein".
-            region_based_sequence (Optional[str]): The annotation of the region to use for the alignment. Defaults to None.
+            region_ids_neo4j (Optional[list[int]]): A list of Neo4j region IDs used to cut the sequences to their regions.
+            num_cores (int): Number of processes used for the parallel alignment. Defaults to cpu_count() - 1.
         Returns:
             Optional[List[dict]]: A list of dictionaries containing the alignment results if
             `return_results` is True. If False, returns None.
@@ -123,9 +124,7 @@ def align_multipairwise(
 
         # Fetch sequences if ids are provided
         if ids is not None and db is not None:
-            sequences = self._get_id_sequence_dict(
-                db, ids, node_type, region_based_sequence
-            )
+            sequences = self._get_id_sequence_dict(db, ids, node_type, region_ids_neo4j)
 
         if not sequences:
             raise ValueError(
@@ -148,7 +147,7 @@ def align_multipairwise(
 
             for pair_chunk in chunks(pairs, batch_size):
                 # Align the pairs in the current chunk
-                alignments = Parallel(n_jobs=cpu_count(), prefer="processes")(
+                alignments = Parallel(n_jobs=num_cores, prefer="processes")(
                     delayed(self.align_pairwise)(
                         {pair[0]: sequences[pair[0]]},
                         {pair[1]: sequences[pair[1]]},
@@ -160,7 +159,7 @@ def align_multipairwise(
                 progress.update(align_task, advance=len(pair_chunk))
 
                 if db:
-                    self._to_db(alignments, db, node_type, region_based_sequence)
+                    self._to_db(alignments, db, node_type, region_ids_neo4j)
                     progress.update(db_task, advance=len(pair_chunk))
 
                 if return_results:
@@ -173,7 +172,7 @@ def _to_db(
         alignments: list[dict[str, Any]],
         db: DatabaseConnector,
         node_type: str = "Protein",
-        region_based_sequence: Optional[str] = None,
+        region_ids_neo4j: Optional[list[int]] = None,
     ) -> None:
         """Inserts the alignment results to pyeed graph database.
 
@@ -181,10 +180,10 @@ def _to_db(
             alignments (list[dict]): A list of dictionaries containing the alignment results.
             db (DatabaseConnector): A `DatabaseConnector` object.
             node_type (str): The type of node to align. Defaults to "Protein".
-            region_based_sequence (Optional[str]): The annotation of the region to use for the alignment. Defaults to None.
+            region_ids_neo4j (Optional[list[int]]): A list of Neo4j region IDs used to cut the sequences to their regions.
         """
 
-        if region_based_sequence is None:
+        if region_ids_neo4j is None:
             query = f"""
             UNWIND $alignments AS alignment
             MATCH (p1:{node_type} {{accession_id: alignment.query_id}})
@@ -201,8 +200,9 @@ def _to_db(
         else:
             query = f"""
             UNWIND $alignments AS alignment
-            MATCH (p1:{node_type} {{accession_id: alignment.query_id}})-[rel1:HAS_REGION]->(r1:Region {{annotation: $region_based_sequence}})
-            MATCH (p2:{node_type} {{accession_id: alignment.target_id}})-[rel2:HAS_REGION]->(r2:Region {{annotation: $region_based_sequence}})
+            MATCH (p1:{node_type} {{accession_id: alignment.query_id}})-[rel1:HAS_REGION]->(r1:Region)
+            MATCH (p2:{node_type} {{accession_id: alignment.target_id}})-[rel2:HAS_REGION]->(r2:Region)
+            WHERE id(r1) IN $region_ids_neo4j AND id(r2) IN $region_ids_neo4j
             MERGE (r1)-[r:PAIRWISE_ALIGNED]->(r2)
             SET r.similarity = alignment.identity,
             r.mismatches = alignment.mismatches,
@@ -213,9 +213,9 @@ def _to_db(
             """
             db.execute_write(
                 query,
-                {
+                parameters={
                     "alignments": alignments,
-                    "region_based_sequence": region_based_sequence,
+                    "region_ids_neo4j": region_ids_neo4j,
                 },
             )
 
@@ -272,7 +272,7 @@ def _get_id_sequence_dict(
         db: DatabaseConnector,
         ids: list[str] = [],
         node_type: str = "Protein",
-        region_based_sequence: Optional[str] = None,
+        region_ids_neo4j: Optional[list[int]] = None,
     ) -> dict[str, str]:
         """Gets all sequences from the database and returns them in a dictionary.
         Key is the accession id and value is the sequence.
@@ -286,46 +286,46 @@ def _get_id_sequence_dict(
             dict[str, str]: Dictionary of sequences with accession id as key.
         """
 
-        if not ids:
-            if region_based_sequence is not None:
+        if ids:
+            if region_ids_neo4j is not None:
                 query = f"""
-                MATCH (p:{node_type})-[e:HAS_REGION]->(r:Region {{annotation: $region_based_sequence}})
+                MATCH (p:{node_type})-[e:HAS_REGION]->(r:Region)
+                WHERE id(r) IN $region_ids_neo4j AND p.accession_id IN $ids
                 RETURN p.accession_id AS accession_id, e.start AS start, e.end AS end, p.sequence AS sequence
                 """
                 nodes = db.execute_read(
                     query,
-                    parameters={"region_based_sequence": region_based_sequence},
+                    parameters={"region_ids_neo4j": region_ids_neo4j, "ids": ids},
                 )
-
             else:
                 query = f"""
                 MATCH (p:{node_type})
+                WHERE p.accession_id IN $ids
                 RETURN p.accession_id AS accession_id, p.sequence AS sequence
                 """
-                nodes = db.execute_read(query)
+                nodes = db.execute_read(query, parameters={"ids": ids})
+
         else:
-            if region_based_sequence is not None:
+            if region_ids_neo4j is not None:
                 query = f"""
-                MATCH (p:{node_type})-[e:HAS_REGION]->(r:Region {{annotation: $region_based_sequence}})
-                WHERE p.accession_id IN $ids
+                MATCH (p:{node_type})-[e:HAS_REGION]->(r:Region)
+                WHERE id(r) IN $region_ids_neo4j
                 RETURN p.accession_id AS accession_id, e.start AS start, e.end AS end, p.sequence AS sequence
                 """
                 nodes = db.execute_read(
                     query,
                     parameters={
-                        "ids": ids,
-                        "region_based_sequence": region_based_sequence,
+                        "region_ids_neo4j": region_ids_neo4j,
                     },
                 )
             else:
                 query = f"""
                 MATCH (p:{node_type})
-                WHERE p.accession_id IN $ids
                 RETURN p.accession_id AS accession_id, p.sequence AS sequence
                 """
-                nodes = db.execute_read(query, parameters={"ids": ids})
+                nodes = db.execute_read(query)
 
-        if region_based_sequence is not None:
+        if region_ids_neo4j is not None:
             return {
                 node["accession_id"]: node["sequence"][node["start"] : node["end"]]
                 for node in nodes
diff --git a/src/pyeed/analysis/standard_numbering.py b/src/pyeed/analysis/standard_numbering.py
index 8d6bad66..6f81869f 100644
--- a/src/pyeed/analysis/standard_numbering.py
+++ b/src/pyeed/analysis/standard_numbering.py
@@ -45,7 +45,7 @@ def get_node_base_sequence(
         base_sequence_id: str,
         db: DatabaseConnector,
         node_type: str = "Protein",
-        region_based_sequence: Optional[str] = None,
+        region_ids_neo4j: Optional[list[int]] = None,
     ) -> dict[str, str]:
         """
         Retrieve the base node sequence from the database for a given accession id.
@@ -56,13 +56,14 @@ def get_node_base_sequence(
         Args:
             base_sequence_id: The accession id of the base node sequence.
             db: The database connector instance to perform the query.
-            region_based_sequence: The annotation of the region to use for the numbering. Default is None.
+            region_ids_neo4j: A list of Neo4j region IDs used to cut the sequences to their regions. Default is None.
         Returns:
             A dictionary with keys 'id' and 'sequence' holding the node type id and its sequence.
         """
-        if region_based_sequence:
+        if region_ids_neo4j:
             query = f"""
-            MATCH (p:{node_type})-[e:HAS_REGION]->(r:Region annotation: {region_based_sequence})
+            MATCH (p:{node_type})-[e:HAS_REGION]->(r:Region)
+            WHERE id(r) IN {region_ids_neo4j}
-            WHERE p.accession_id = '{base_sequence_id}'
+            AND p.accession_id = '{base_sequence_id}'
             RETURN p.accession_id AS accession_id, e.start AS start, e.end AS end, p.sequence AS sequence
             """
@@ -74,7 +75,7 @@ def get_node_base_sequence(
             """
         base_sequence_read = db.execute_read(query)
         # Assume the first returned record is the desired base sequence
-        if region_based_sequence:
+        if region_ids_neo4j:
             base_sequence = {
                 "id": base_sequence_read[0]["accession_id"],
                 "sequence": base_sequence_read[0]["sequence"][
@@ -93,7 +94,7 @@ def save_positions(
         db: DatabaseConnector,
         positions: dict[str, list[str]],
         node_type: str = "Protein",
-        region_based_sequence: bool = False,
+        region_ids_neo4j: Optional[list[int]] = None,
     ) -> None:
         """
         Save the calculated numbering positions for each protein into the database.
@@ -106,16 +107,20 @@ def save_positions(
             db: The database connector instance used to execute the write queries.
             positions: A dictionary mapping protein accession ids to lists of numbering positions.
             node_type: The type of node to process. Default is "Protein".
-            region_based_sequence: If True, the sequence is a region based sequence. Default is False.
+            region_ids_neo4j: A list of Neo4j region IDs used to cut the sequences to their regions. Default is None.
         """
         for protein_id in positions:
-            if region_based_sequence:
+            if region_ids_neo4j:
                 query = f"""
-                    MATCH (p:{node_type} {{accession_id: '{protein_id}'}})-[e:HAS_REGION]->(r:Region {{annotation: 'coding sequence'}})
+                    MATCH (p:{node_type} {{accession_id: '{protein_id}'}})-[e:HAS_REGION]->(r:Region)
+                    WHERE id(r) IN $region_ids_neo4j
                     MATCH (s:StandardNumbering {{name: '{self.name}'}})
                     MERGE (r)-[rel:HAS_STANDARD_NUMBERING]->(s)
                     SET rel.positions = {str(positions[protein_id])}
                 """
+                db.execute_write(
+                    query, parameters={"region_ids_neo4j": region_ids_neo4j}
+                )
             else:
                 query = f"""
                     MATCH (p:{node_type} {{accession_id: '{protein_id}'}})
@@ -123,8 +128,7 @@ def save_positions(
                     MERGE (p)-[rel:HAS_STANDARD_NUMBERING]->(s)
                     SET rel.positions = {str(positions[protein_id])}
                 """
-            # Execute the write query to update the standard numbering relationship.
-            db.execute_write(query)
+                db.execute_write(query)
 
     def run_numbering_algorithm_clustalo(
         self, base_sequence_id: str, alignment: Any
@@ -348,7 +352,7 @@ def apply_standard_numbering_pairwise(
         list_of_seq_ids: Optional[List[str]] = None,
         return_positions: bool = False,
         node_type: str = "Protein",
-        region_based_sequence: Optional[str] = None,
+        region_ids_neo4j: Optional[list[int]] = None,
     ) -> Optional[Dict[str, List[str]]]:
         """
         Apply standard numbering via pairwise alignment using a base sequence.
@@ -364,7 +368,7 @@ def apply_standard_numbering_pairwise(
             list_of_seq_ids: An optional list of node type ids to process. If None, all node type ids are used.
             return_positions: If True, the method returns the computed positions dictionary after processing.
             node_type: The type of node to process. Default is "Protein".
-            region_based_sequence: The annotation of the region to use for the numbering. Default is None.
+            region_ids_neo4j: A list of Neo4j region IDs used to cut the sequences to their regions. Default is None.
         Raises:
             ValueError: If the pairwise alignment fails and returns no results.
         """
@@ -394,10 +398,12 @@ def apply_standard_numbering_pairwise(
             pairs.append((base_sequence_id, node_id))
 
         # check if the pairs are already existing with the same name under the same standard numbering node
-        if node_type == "DNA" and region_based_sequence is not None:
+        if node_type == "DNA" and region_ids_neo4j is not None:
             query = """
             MATCH (s:StandardNumbering {name: $name})
-            MATCH (r:Region {annotation: $region_based_sequence})<-[rel:HAS_STANDARD_NUMBERING]-(s)
+            MATCH (r:Region)
+            WHERE id(r) IN $region_ids_neo4j
+            MATCH (r:Region)<-[:HAS_STANDARD_NUMBERING]-(s)
             WHERE r.accession_id IN $list_of_seq_ids
             RETURN r.accession_id AS accession_id
             """
@@ -407,7 +413,7 @@ def apply_standard_numbering_pairwise(
                 parameters={
                     "list_of_seq_ids": list_of_seq_ids,
                     "name": self.name,
-                    "region_based_sequence": region_based_sequence,
+                    "region_ids_neo4j": region_ids_neo4j,
                 },
             )
         else:
@@ -421,6 +427,7 @@ def apply_standard_numbering_pairwise(
                 query,
                 parameters={"list_of_seq_ids": list_of_seq_ids, "name": self.name},
             )
+
         if results is not None:
             for row in results:
                 if row is not None:
@@ -448,7 +455,7 @@ def apply_standard_numbering_pairwise(
             db=db,
             pairs=pairs,  # List of sequence pairs to be aligned
             node_type=node_type,
-            region_based_sequence=region_based_sequence,
+            region_ids_neo4j=region_ids_neo4j,
         )
 
         logger.info(f"Pairwise alignment results: {results_pairwise}")
@@ -486,7 +493,7 @@ def apply_standard_numbering_pairwise(
         )
 
         # Update the database with the calculated positions.
-        self.save_positions(db, positions, node_type, region_based_sequence)
+        self.save_positions(db, positions, node_type, region_ids_neo4j)
 
         if return_positions:
             return positions
@@ -498,7 +505,7 @@ def apply_standard_numbering(
         db: DatabaseConnector,
         list_of_seq_ids: Optional[List[str]] = None,
         node_type: str = "Protein",
-        region_based_sequence: Optional[str] = None,
+        region_ids_neo4j: Optional[list[int]] = None,
     ) -> None:
         """
         Apply a standard numbering scheme to a collection of nodes using multiple sequence alignment.
@@ -512,7 +519,7 @@ def apply_standard_numbering(
             db: DatabaseConnector instance used for executing queries.
             list_of_seq_ids: An optional list of specific node type ids to process. If None, all node type ids are used.
             node_type: The type of node to process. Default is "Protein".
-            region_based_sequence: The annotation of the region to use for the numbering. Default is None.
+            region_ids_neo4j: A list of Neo4j region IDs used to cut the sequences to their regions. Default is None.
         """
 
         if list_of_seq_ids is None:
@@ -543,11 +550,12 @@ def apply_standard_numbering(
         else:
             nodes_read = query_result
 
-        if node_type == "DNA" and region_based_sequence is not None:
+        if node_type == "DNA" and region_ids_neo4j is not None:
             # then the sequence is a region based sequence.
             # get the region objects for each of the nodes as well
             query = f"""
-            MATCH (p:{node_type})-[e:HAS_REGION]->(r:Region {{annotation: $region_based_sequence}})
+            MATCH (p:{node_type})-[e:HAS_REGION]->(r:Region)
+            WHERE id(r) IN $region_ids_neo4j
-            WHERE p.accession_id IN $list_of_seq_ids
+            AND p.accession_id IN $list_of_seq_ids
             RETURN p.accession_id AS accession_id, e.start AS start, e.end AS end, p.sequence AS sequence
             """
@@ -555,7 +563,7 @@ def apply_standard_numbering(
                 query,
                 parameters={
                     "list_of_seq_ids": list_of_seq_ids,
-                    "region_based_sequence": region_based_sequence,
+                    "region_ids_neo4j": region_ids_neo4j,
                 },
             )
             nodes_dict = {
@@ -570,7 +578,7 @@ def apply_standard_numbering(
 
         # Obtain the base sequence details from the database.
         base_sequence = self.get_node_base_sequence(
-            base_sequence_id, db, node_type, region_based_sequence
+            base_sequence_id, db, node_type, region_ids_neo4j
         )
 
         # Remove the base sequence from the nodes list to prevent duplicate alignment.
@@ -602,4 +610,4 @@ def apply_standard_numbering(
         )
 
         # Update the database with the relationships between nodes and standard numbering.
-        self.save_positions(db, positions, node_type, region_based_sequence)
+        self.save_positions(db, positions, node_type, region_ids_neo4j)
diff --git a/src/pyeed/main.py b/src/pyeed/main.py
index ea3a6462..d4a520b9 100644
--- a/src/pyeed/main.py
+++ b/src/pyeed/main.py
@@ -437,7 +437,7 @@ def create_coding_sequences_regions(self) -> None:
         query = """
         MATCH (p:Protein)
         WHERE p.nucleotide_id IS NOT NULL
-        CREATE (r:Region {annotation: 'coding sequence'})
+        CREATE (r:Region {annotation: 'coding sequence', sequence_id: p.accession_id})
         WITH p, r
         MATCH (d:DNA {accession_id: p.nucleotide_id})
         CREATE (d)-[:HAS_REGION {start: p.nucleotide_start, end: p.nucleotide_end}]->(r)
diff --git a/src/pyeed/model.py b/src/pyeed/model.py
index 128e7eab..aa374669 100644
--- a/src/pyeed/model.py
+++ b/src/pyeed/model.py
@@ -299,6 +299,7 @@ class Region(StrictStructuredNode):
     annotation = StringProperty(
         choices=[(e.value, e.name) for e in Annotation], required=True
     )
+    sequence_id = StringProperty()
 
     # Relationships
     has_mutation_region = RelationshipTo("Region", "MUTATION", model=Mutation)

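The same numbering run under the region-id API introduced in this patch, as a sketch: the Cypher is taken from the notebook cells above, and `eedb`, `ids`, and `sn_dna` are as in the previous sketch:

    query_get_region_ids = """
    MATCH (p:Protein)<-[:ENCODES]-(d:DNA)-[:HAS_REGION]->(r:Region)
    WHERE r.annotation = $region_annotation AND p.accession_id IN $protein_id
    RETURN id(r)
    """

    rows = eedb.db.execute_read(
        query_get_region_ids,
        parameters={"protein_id": ids, "region_annotation": "coding sequence"},
    )
    region_ids = [row["id(r)"] for row in rows]

    sn_dna.apply_standard_numbering_pairwise(
        base_sequence_id="AF190695.1",
        db=eedb.db,
        node_type="DNA",
        region_ids_neo4j=region_ids,
    )
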
From 366e90f3892af470551498157ff92d885794d211 Mon Sep 17 00:00:00 2001
From: alacheim 
Date: Mon, 24 Mar 2025 15:23:08 +0000
Subject: [PATCH 07/39] added existing pairs check in pairwise alignment

---
 src/pyeed/analysis/sequence_alignment.py | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/src/pyeed/analysis/sequence_alignment.py b/src/pyeed/analysis/sequence_alignment.py
index 3bfd019c..386ba295 100644
--- a/src/pyeed/analysis/sequence_alignment.py
+++ b/src/pyeed/analysis/sequence_alignment.py
@@ -137,6 +137,21 @@ def align_multipairwise(
         total_pairs = len(pairs)
         all_alignments = []
 
+        query = """
+        MATCH (p1:Protein)-[:PAIRWISE_ALIGNED]->(p2:Protein)
+        RETURN p1.accession_id AS Protein1_ID, p2.accession_id AS Protein2_ID
+        """
+
+        # Collect already-aligned pairs as order-independent tuples
+        existing_pairs = set(
+            tuple(sorted((row["Protein1_ID"], row["Protein2_ID"])))
+            for row in db.execute_write(query)
+        )
+
+        # Filter new pairs that are not in existing_pairs
+        new_pairs = [pair for pair in pairs if tuple(sorted(pair)) not in existing_pairs]
+
+        print(f"Number of existing pairs: {len(existing_pairs)}")
+        print(f"Number of total pairs: {len(pairs)}")
+        print(f"Number of pairs to align: {len(new_pairs)}")
+
         with Progress() as progress:
             align_task = progress.add_task(
                 f"⛓️ Aligning {total_pairs} sequence pairs...", total=total_pairs
@@ -145,7 +160,7 @@ def align_multipairwise(
                 "📥 Inserting alignment results to database...", total=total_pairs
             )
 
-            for pair_chunk in chunks(pairs, batch_size):
+            for pair_chunk in chunks(new_pairs, batch_size):
                 # Align the pairs in the current chunk
                 alignments = Parallel(n_jobs=num_cores, prefer="processes")(
                     delayed(self.align_pairwise)(
@@ -155,6 +170,7 @@ def align_multipairwise(
                     )
                     for pair in pair_chunk
                 )
+                print(f"chunk size: {len(pair_chunk)}, chunk done")
 
                 progress.update(align_task, advance=len(pair_chunk))
 

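The dedup step in isolation: sorting each pair before the membership test makes the comparison order-independent, so (A, B) and (B, A) count as the same alignment:

    pairs = [("P1", "P2"), ("P2", "P1"), ("P1", "P3")]
    existing_pairs = {tuple(sorted(("P2", "P1")))}  # i.e. {("P1", "P2")}
    new_pairs = [p for p in pairs if tuple(sorted(p)) not in existing_pairs]
    print(new_pairs)  # [('P1', 'P3')]
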
From c9566b51bd33a83937d7e92ba02acedd631eab47 Mon Sep 17 00:00:00 2001
From: alacheim 
Date: Mon, 24 Mar 2025 23:13:28 +0000
Subject: [PATCH 08/39] adapted embeddings for multiple gpus

---
 src/pyeed/embedding.py | 26 ++++++++++++++++++++------
 src/pyeed/main.py      |  5 +++--
 2 files changed, 23 insertions(+), 8 deletions(-)

diff --git a/src/pyeed/embedding.py b/src/pyeed/embedding.py
index 82b5c6d7..cb7f2d82 100644
--- a/src/pyeed/embedding.py
+++ b/src/pyeed/embedding.py
@@ -32,6 +32,7 @@ def get_hf_token() -> str:
 
 def load_model_and_tokenizer(
     model_name: str,
+    use_all_gpus: bool = True
 ) -> Tuple[
     Union[EsmModel, ESMC],  # Changed from ESM3InferenceClient to ESMC
     Union[EsmTokenizer, None],
@@ -50,12 +51,14 @@ def load_model_and_tokenizer(
     # Get token only when loading model
     token = get_hf_token()
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    num_gpus = torch.cuda.device_count()
 
     # Check if this is an ESM-3 variant
     if "esmc" in model_name.lower():
         # Using ESMC from_pretrained
-        model = ESMC.from_pretrained(model_name)
-        model = model.to(device)
+        model = ESMC.from_pretrained(model_name).to(device)
+        if use_all_gpus and torch.cuda.device_count() > 1:
+            model = torch.nn.DataParallel(model)
         return model, None, device
     else:
         # Otherwise, assume it's an ESM-2 model on Hugging Face
@@ -64,9 +67,11 @@ def load_model_and_tokenizer(
             if model_name.startswith("facebook/")
             else f"facebook/{model_name}"
         )
-        model = EsmModel.from_pretrained(full_model_name, use_auth_token=token)
+        model = EsmModel.from_pretrained(full_model_name, use_auth_token=token).to(device)
         tokenizer = EsmTokenizer.from_pretrained(full_model_name, use_auth_token=token)
-        model = model.to(device)
+        if use_all_gpus and torch.cuda.device_count() > 1:
+            model = torch.nn.DataParallel(model)
+
         return model, tokenizer, device
 
 
@@ -76,6 +81,7 @@ def get_batch_embeddings(
     tokenizer_or_alphabet: Union[EsmTokenizer, None],
     device: torch.device,
     pool_embeddings: bool = True,
+    use_all_gpus: bool = True,
 ) -> list[NDArray[np.float64]]:
     """
     Generates mean-pooled embeddings for a batch of sequences.
@@ -90,13 +96,21 @@ def get_batch_embeddings(
     Returns:
         list[NDArray[np.float64]]: A list of embeddings as NumPy arrays.
     """
-    if isinstance(model, ESMC):
+    # Wrap model in DataParallel if multiple GPUs should be used (guard against
+    # double-wrapping, since load_model_and_tokenizer may already have wrapped it)
+    if use_all_gpus and torch.cuda.device_count() > 1 and not isinstance(
+        model, torch.nn.DataParallel
+    ):
+        model = torch.nn.DataParallel(model)  # Multi-GPU
+        print(f"Using {torch.cuda.device_count()} GPUs for inference.")
+
+    model = model.to(device)
+    model.eval()  # Ensure model is in inference mode
+    
+    if isinstance(model.module if isinstance(model, torch.nn.DataParallel) else model, ESMC):
         with torch.no_grad():
             embedding_list = []
             for sequence in batch_sequences:
                 # Process each sequence individually
                 protein = ESMProtein(sequence=sequence)
-                protein_tensor = model.encode(protein)
+                protein_tensor = model.module.encode(protein) if isinstance(model, torch.nn.DataParallel) else model.encode(protein)
                 logits_output = model.logits(
                     protein_tensor, LogitsConfig(sequence=True, return_embeddings=True)
                 )
diff --git a/src/pyeed/main.py b/src/pyeed/main.py
index d4a520b9..f4783f59 100644
--- a/src/pyeed/main.py
+++ b/src/pyeed/main.py
@@ -193,6 +193,7 @@ def calculate_sequence_embeddings(
         self,
         batch_size: int = 16,
         model_name: str = "facebook/esm2_t33_650M_UR50D",
+        use_all_gpus: bool = True,
     ) -> None:
         """
         Calculates embeddings for all sequences in the database that do not have embeddings, processing in batches.
@@ -205,7 +206,7 @@ def calculate_sequence_embeddings(
         """
 
         # Load the model, tokenizer, and device
-        model, tokenizer, device = load_model_and_tokenizer(model_name)
+        model, tokenizer, device = load_model_and_tokenizer(model_name, use_all_gpus)
 
         # Cypher query to retrieve proteins without embeddings and with valid sequences
         query = """
@@ -236,7 +237,7 @@ def calculate_sequence_embeddings(
 
             # Get embeddings for the current batch
             embeddings_batch = get_batch_embeddings(
-                list(batch_sequences), model, tokenizer, device
+                list(batch_sequences), model, tokenizer, device, use_all_gpus
             )
 
             # Update the database for the current batch

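A minimal sketch, with a toy module, of the wrapping pattern this patch relies on: torch.nn.DataParallel hides the wrapped model behind .module, so type checks and custom methods such as encode() must unwrap first, and checking the wrapper type is more robust than checking the use_all_gpus flag (wrapping only happens when more than one GPU is present):

    import torch

    model: torch.nn.Module = torch.nn.Linear(8, 8)  # stand-in for an ESM model
    if torch.cuda.device_count() > 1:
        model = torch.nn.DataParallel(model)

    # unwrap regardless of whether DataParallel was applied
    base_model = model.module if isinstance(model, torch.nn.DataParallel) else model
    print(type(base_model).__name__)  # Linear
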
From 5e031bca1ccf68428eb9d4adc68492df86b734f4 Mon Sep 17 00:00:00 2001
From: alacheim 
Date: Tue, 25 Mar 2025 14:31:58 +0000
Subject: [PATCH 09/39] changed connection timeout

---
 src/pyeed/analysis/sequence_alignment.py | 9 +++++----
 src/pyeed/dbconnect.py                   | 4 +++-
 2 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/src/pyeed/analysis/sequence_alignment.py b/src/pyeed/analysis/sequence_alignment.py
index 386ba295..4b7837a9 100644
--- a/src/pyeed/analysis/sequence_alignment.py
+++ b/src/pyeed/analysis/sequence_alignment.py
@@ -1,5 +1,7 @@
 from itertools import combinations
-from typing import Any, Dict, Optional
+from typing import Any, Dict, List, Optional, Tuple
+import time
+import asyncio
 
 from Bio.Align import Alignment as Alignment
 from Bio.Align import PairwiseAligner as BioPairwiseAligner
@@ -151,7 +153,7 @@ def align_multipairwise(
         print(f"Number of existing pairs: {len(existing_pairs)}")
         print(f"Number of total pairs: {len(pairs)}")
         print(f"Number of pairs to align: {len(new_pairs)}")
-        
+
         with Progress() as progress:
             align_task = progress.add_task(
                 f"⛓️ Aligning {total_pairs} sequence pairs...", total=total_pairs
@@ -170,7 +172,6 @@ def align_multipairwise(
                     )
                     for pair in pair_chunk
                 )
-                print(f"chunk size: {len(pair_chunk)}, chunk done")
 
                 progress.update(align_task, advance=len(pair_chunk))
 
@@ -352,4 +353,4 @@ def _get_id_sequence_dict(
     def _load_substitution_matrix(self) -> "BioSubstitutionMatrix":
         from Bio.Align import substitution_matrices
 
-        return substitution_matrices.load(self.substitution_matrix)  # type: ignore
+        return substitution_matrices.load(self.substitution_matrix)  # type: ignore
\ No newline at end of file
diff --git a/src/pyeed/dbconnect.py b/src/pyeed/dbconnect.py
index ec2df259..d208deec 100644
--- a/src/pyeed/dbconnect.py
+++ b/src/pyeed/dbconnect.py
@@ -227,7 +227,9 @@ def _get_driver(uri: str, user: str | None, password: str | None) -> Driver:
         Creates a new Neo4j driver instance.
         """
         auth = (user, password) if user and password else None
-        return GraphDatabase.driver(uri, auth=auth)
+        return GraphDatabase.driver(
+            uri,
+            auth=auth,
+            connection_timeout=60,  # increase initial connection timeout
+            max_connection_lifetime=86400,  # keep pooled connections alive longer
+        )
 
     @property
     def node_properties(self) -> list[dict[str, str]]:

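The driver settings in isolation; both keyword arguments are regular configuration options of the official neo4j Python driver, and the URI and credentials here are illustrative:

    from neo4j import GraphDatabase

    driver = GraphDatabase.driver(
        "bolt://localhost:7687",
        auth=("neo4j", "password"),
        connection_timeout=60,          # seconds to wait for the initial socket
        max_connection_lifetime=86400,  # recycle pooled connections after 24 h
    )
    driver.verify_connectivity()
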
From f4d04c8f1581f9abba6e1de440525d2c03909cae Mon Sep 17 00:00:00 2001
From: alacheim 
Date: Wed, 26 Mar 2025 16:36:23 +0000
Subject: [PATCH 10/39] added catalytic activity to uniprot model

---
 src/pyeed/adapter/uniprot_mapper.py | 17 +++++++++++++++--
 src/pyeed/model.py                  |  8 +++++---
 2 files changed, 20 insertions(+), 5 deletions(-)

diff --git a/src/pyeed/adapter/uniprot_mapper.py b/src/pyeed/adapter/uniprot_mapper.py
index 5a285adb..cbc9460b 100644
--- a/src/pyeed/adapter/uniprot_mapper.py
+++ b/src/pyeed/adapter/uniprot_mapper.py
@@ -83,17 +83,30 @@ def add_catalytic_activity(self, record: dict[str, Any], protein: Protein) -> No
         try:
             for reference in record["comments"]:
                 if reference["type"] == "CATALYTIC_ACTIVITY":
+                    name = reference["reaction"]["name"]
+                    for i in reference["reaction"]["dbReferences"]:
+                        if i['id'].startswith("RHEA:"):
+                            rhea_id = i['id']
+                            break
+                    left_side, right_side = name.split("=")
+
+                    # Further split each side by "+"
+                    left_list = list(left_side.strip().split(" + "))
+                    right_list = list(right_side.strip().split(" + "))
+                    
                     catalytic_annotation = CatalyticActivity.get_or_save(
                         catalytic_id=int(reference["id"])
                         if reference.get("id")
                         else None,
-                        name=reference["reaction"]["name"],
+                        rhea_id=rhea_id,
+                        reactants=left_list,
+                        products=right_list,
                     )
                     protein.catalytic_annotation.connect(catalytic_annotation)
 
         except Exception as e:
             logger.error(
-                f"Error saving catalytic activity for {protein.accession_id}: {e}"
+                f"No Catalytic Activity for {protein.accession_id}: {e}"
             )
 
     def add_go(self, record: dict[str, Any], protein: Protein) -> None:
diff --git a/src/pyeed/model.py b/src/pyeed/model.py
index aa374669..d1c155b7 100644
--- a/src/pyeed/model.py
+++ b/src/pyeed/model.py
@@ -375,12 +375,14 @@ class CatalyticActivity(StrictStructuredNode):
     """
 
     catalytic_id = IntegerProperty(required=False, unique_index=True)
-    name = StringProperty()
+    rhea_id = StringProperty(required=False, unique_index=True)
+    reactants = ArrayProperty()
+    products = ArrayProperty()
 
     @property
     def label(self) -> str:
         """The label of the catalytic activity."""
-        return str(self.name)
+        return str(self.rhea_id)
 
 
 class StandardNumbering(StrictStructuredNode):
@@ -508,7 +510,7 @@ class Protein(StrictStructuredNode):
     site = RelationshipTo("Site", "HAS_SITE", model=SiteRel)
     region = RelationshipTo("Region", "HAS_REGION", model=RegionRel)
     go_annotation = RelationshipTo("GOAnnotation", "ASSOCIATED_WITH")
-    catalytic_annotation = RelationshipTo("CatalyticActivity", "CATALYTIC_ACTIVITY")
+    catalytic_annotation = RelationshipTo("CatalyticActivity", "HAS_CATALYTIC_ACTIVITY")
     ontology_object = RelationshipTo("OntologyObject", "ASSOCIATED_WITH")
     mutation = RelationshipTo("Protein", "MUTATION", model=Mutation)
     pairwise_aligned = RelationshipTo(

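The reaction-string parsing from this patch, run on an illustrative reaction name: UniProt writes reactions as "substrates = products" with the components of each side joined by " + ":

    name = "ATP + H2O = ADP + phosphate"  # illustrative reaction string
    left_side, right_side = name.split("=", 1)
    reactants = left_side.strip().split(" + ")
    products = right_side.strip().split(" + ")
    print(reactants, products)  # ['ATP', 'H2O'] ['ADP', 'phosphate']
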
From ee68c0af8c97591a4f8d849cef8529633d798710 Mon Sep 17 00:00:00 2001
From: alacheim 
Date: Wed, 26 Mar 2025 17:24:42 +0000
Subject: [PATCH 11/39] possibility to split data across multiple gpus

---
 src/pyeed/embedding.py | 165 ++++++++++++++++++++++++++++-------------
 src/pyeed/main.py      |  95 +++++++++++++++---------
 2 files changed, 175 insertions(+), 85 deletions(-)

diff --git a/src/pyeed/embedding.py b/src/pyeed/embedding.py
index cb7f2d82..31ce7dfd 100644
--- a/src/pyeed/embedding.py
+++ b/src/pyeed/embedding.py
@@ -1,11 +1,13 @@
 import gc
 import os
 from typing import Any, Tuple, Union
+from loguru import logger
 
 import numpy as np
 import torch
 from esm.models.esmc import ESMC
-from esm.sdk.api import ESMProtein, LogitsConfig
+from esm.models.esm3 import ESM3
+from esm.sdk.api import ESM3InferenceClient, ESMProtein, LogitsConfig, SamplingConfig
 from huggingface_hub import HfFolder, login
 from numpy.typing import NDArray
 from transformers import EsmModel, EsmTokenizer
@@ -29,92 +31,129 @@ def get_hf_token() -> str:
     else:
         raise RuntimeError("Failed to get Hugging Face token")
 
+def process_batches_on_gpu(data, batch_size, model, tokenizer, device, db):
+    """
+    Splits data into batches and processes them on a single GPU.
+
+    Args:
+        data (list): List of (accession_id, sequence) tuples.
+        batch_size (int): Size of each batch.
+        model: The model instance for this GPU.
+        tokenizer: The tokenizer for the model.
+        device (str): The assigned GPU device.
+        db: Database connection.
+    """
+    logger.debug(f"Processing {len(data)} sequences on {device}.")
+
+    model = model.to(device)
+
+    # Split data into smaller batches
+    for batch_start in range(0, len(data), batch_size):
+        batch_end = min(batch_start + batch_size, len(data))
+        batch = data[batch_start:batch_end]
+
+        accessions, sequences = zip(*batch)
+
+        current_batch_size = len(sequences)
+
+        while current_batch_size > 0:
+            try:
+                # Compute embeddings
+                embeddings_batch = get_batch_embeddings(
+                    list(sequences[:current_batch_size]), model, tokenizer, device
+                )
+
+                # Update the database
+                update_protein_embeddings_in_db(db, list(accessions[:current_batch_size]), embeddings_batch)
+
+                # Move to the next batch
+                break  # Successful execution, move to the next batch
+
+            except torch.cuda.OutOfMemoryError:
+                torch.cuda.empty_cache()
+                current_batch_size = max(1, current_batch_size // 2)  # Reduce batch size
+                logger.warning(f"Reduced batch size to {current_batch_size} due to OOM error.")
+
+    # Free memory
+    del model
+    torch.cuda.empty_cache()
+
 
 def load_model_and_tokenizer(
     model_name: str,
-    use_all_gpus: bool = True
-) -> Tuple[
-    Union[EsmModel, ESMC],  # Changed from ESM3InferenceClient to ESMC
-    Union[EsmTokenizer, None],
-    torch.device,
-]:
+    device: str,
+    ) -> Tuple[Any, Union[Any, None], str]:
     """
-    Loads either an ESM-3 (using ESMC) or an ESM-2 (using Transformers) model,
-    depending on the `model_name` provided.
+    Loads the model and assigns it to a specific GPU.
 
     Args:
-        model_name (str): The model name or identifier (e.g., 'esmc' or 'esm2_t12_35M_UR50D').
+        model_name (str): The model name.
+        device (str): The specific GPU device.
 
     Returns:
-        Tuple of (model, tokenizer, device)
+        Tuple: (model, tokenizer, device)
     """
-    # Get token only when loading model
     token = get_hf_token()
-    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-    num_gpus = torch.cuda.device_count()
+    tokenizer = None
 
-    # Check if this is an ESM-3 variant
     if "esmc" in model_name.lower():
-        # Using ESMC from_pretrained
-        model = ESMC.from_pretrained(model_name).to(device)
-        if use_all_gpus and torch.cuda.device_count() > 1:
-            model = torch.nn.DataParallel(model)
-        return model, None, device
+        model = ESMC.from_pretrained(model_name)
+    elif "esm3-sm-open-v1" in model_name.lower():
+        model = ESM3.from_pretrained("esm3_sm_open_v1")
     else:
-        # Otherwise, assume it's an ESM-2 model on Hugging Face
         full_model_name = (
             model_name
             if model_name.startswith("facebook/")
             else f"facebook/{model_name}"
         )
-        model = EsmModel.from_pretrained(full_model_name, use_auth_token=token).to(device)
+        model = EsmModel.from_pretrained(full_model_name, use_auth_token=token)
         tokenizer = EsmTokenizer.from_pretrained(full_model_name, use_auth_token=token)
-        if use_all_gpus and torch.cuda.device_count() > 1:
-            model = torch.nn.DataParallel(model)
 
-        return model, tokenizer, device
+    model = model.to(device)
+    return model, tokenizer, device
 
 
 def get_batch_embeddings(
     batch_sequences: list[str],
-    model: Union[EsmModel, ESMC],
+    model: Union[
+        EsmModel,
+        ESMC,
+        torch.nn.DataParallel,
+        ESM3InferenceClient,
+        ESM3,
+    ],
     tokenizer_or_alphabet: Union[EsmTokenizer, None],
     device: torch.device,
     pool_embeddings: bool = True,
-    use_all_gpus: bool = True,
 ) -> list[NDArray[np.float64]]:
     """
     Generates mean-pooled embeddings for a batch of sequences.
+    Supports ESM-C (ESMC), ESM-2, and ESM-3 models.
 
     Args:
-        batch_sequences (list[str]): List of sequence strings to be embedded.
-        model (Union[EsmModel, ESMC]): Loaded model (ESM-2 or ESM-3).
-        tokenizer_or_alphabet (Union[EsmTokenizer, None]): Tokenizer if ESM-2, None if ESM-3.
-        device (torch.device): Device on which to run inference (CPU or GPU).
-        pool_embeddings (bool): Whether to pool embeddings across sequence length.
+        batch_sequences (list[str]): List of sequence strings.
+        model: Loaded model (could be wrapped in DataParallel).
+        tokenizer_or_alphabet: Tokenizer if needed.
+        device: Inference device (CPU/GPU).
+        pool_embeddings (bool): Whether to average embeddings across the sequence length.
 
     Returns:
-        list[NDArray[np.float64]]: A list of embeddings as NumPy arrays.
+        List of embeddings as NumPy arrays.
     """
-    # Wrap model in DataParallel if multiple GPUs should be used
-    if use_all_gpus and torch.cuda.device_count() > 1:
-        model = torch.nn.DataParallel(model)  # Multi-GPU
-        print(f"Using {torch.cuda.device_count()} GPUs for inference.")
+    # First, determine the base model type
+    base_model = model.module if isinstance(model, torch.nn.DataParallel) else model
 
-    model = model.to(device)
-    model.eval()  # Ensure model is in inference mode
-    
-    if isinstance(model.module if use_all_gpus else model, ESMC):
+    if isinstance(base_model, ESMC):
+        # For ESMC models
+        embedding_list = []
         with torch.no_grad():
-            embedding_list = []
             for sequence in batch_sequences:
-                # Process each sequence individually
                 protein = ESMProtein(sequence=sequence)
-                protein_tensor = model.module.encode(protein) if use_all_gpus else model.encode(protein)
-                logits_output = model.logits(
+                # Use the model directly - DataParallel handles internal distribution
+                protein_tensor = base_model.encode(protein)
+                logits_output = base_model.logits(
                     protein_tensor, LogitsConfig(sequence=True, return_embeddings=True)
                 )
-                # Convert embeddings to numpy array - ensure embeddings is not None
                 if logits_output.embeddings is None:
                     raise ValueError(
                         "Model did not return embeddings. Check LogitsConfig settings."
@@ -123,9 +162,27 @@ def get_batch_embeddings(
                 if pool_embeddings:
                     embeddings = embeddings.mean(axis=1)
                 embedding_list.append(embeddings[0])
-
         return embedding_list
-
+    elif isinstance(base_model, ESM3):
+        # For ESM3 models
+        embedding_list = []
+        with torch.no_grad():
+            for sequence in batch_sequences:
+                protein = ESMProtein(sequence=sequence)
+                sequence_encoding = base_model.encode(protein)
+                result = base_model.forward_and_sample(
+                    sequence_encoding,
+                    SamplingConfig(return_per_residue_embeddings=True),
+                )
+                if result is None or result.per_residue_embedding is None:
+                    raise ValueError("Model did not return embeddings")
+                embeddings = (
+                    result.per_residue_embedding.to(torch.float32).cpu().numpy()
+                )
+                if pool_embeddings:
+                    embeddings = embeddings.mean(axis=0)
+                embedding_list.append(embeddings)
+        return embedding_list
     else:
         # ESM-2 logic
         assert tokenizer_or_alphabet is not None, "Tokenizer required for ESM-2 models"
@@ -133,11 +190,15 @@ def get_batch_embeddings(
             batch_sequences, padding=True, truncation=True, return_tensors="pt"
         ).to(device)
         with torch.no_grad():
-            outputs = model(**inputs)
-        embeddings = outputs.last_hidden_state.cpu().numpy()
+            outputs = model(**inputs, output_hidden_states=True)
+
+        # Get last hidden state for each sequence
+        hidden_states = outputs.last_hidden_state.cpu().numpy()
+
         if pool_embeddings:
-            return [embedding.mean(axis=0) for embedding in embeddings]
-        return list(embeddings)
+            # Mean pooling across sequence length
+            return [embedding.mean(axis=0) for embedding in hidden_states]
+        return list(hidden_states)
 
 
 def calculate_single_sequence_embedding_last_hidden_state(
diff --git a/src/pyeed/main.py b/src/pyeed/main.py
index f4783f59..30984592 100644
--- a/src/pyeed/main.py
+++ b/src/pyeed/main.py
@@ -1,5 +1,8 @@
 import asyncio
 from typing import Any, Literal
+import time
+from concurrent.futures import ThreadPoolExecutor
+import torch
 
 import nest_asyncio
 from loguru import logger
@@ -15,6 +18,7 @@
     get_batch_embeddings,
     load_model_and_tokenizer,
     update_protein_embeddings_in_db,
+    process_batches_on_gpu
 )
 
 
@@ -193,61 +197,86 @@ def calculate_sequence_embeddings(
         self,
         batch_size: int = 16,
         model_name: str = "facebook/esm2_t33_650M_UR50D",
-        use_all_gpus: bool = True,
-    ) -> None:
+        num_gpus: int | None = None,  # Number of GPUs to use
+    ) -> None:
         """
-        Calculates embeddings for all sequences in the database that do not have embeddings, processing in batches.
+        Calculates embeddings for all sequences in the database that do not have embeddings, 
+        distributing the workload across available GPUs.
 
         Args:
             batch_size (int): Number of sequences to process in each batch.
-            model_name (str): Name of the model to use for calculating embeddings.
-                Defaults to "facebook/esm2_t33_650M_UR50D".
-                Available models can be found at https://huggingface.co/facebook/esm2_t6_8M_UR50D.
+            model_name (str): Model used for calculating embeddings.
+            num_gpus (int, optional): Number of GPUs to use. If None, use all available GPUs.
         """
 
-        # Load the model, tokenizer, and device
-        model, tokenizer, device = load_model_and_tokenizer(model_name, use_all_gpus)
+        # Get the available GPUs
+        available_gpus = torch.cuda.device_count()
+        if num_gpus is None or num_gpus > available_gpus:
+            num_gpus = available_gpus
+
+        if num_gpus == 0:
+            logger.warning("No GPU available! Running on CPU.")
+
+        # One worker per GPU, or a single CPU worker when no GPU is present;
+        # max_workers=0 would make ThreadPoolExecutor raise.
+        num_workers = max(num_gpus, 1)
+
+        # Load separate models for each GPU
+        devices = [f"cuda:{i}" for i in range(num_gpus)] if num_gpus > 0 else ["cpu"]
+        models_and_tokenizers = [
+            load_model_and_tokenizer(model_name, device) for device in devices
+        ]
 
-        # Cypher query to retrieve proteins without embeddings and with valid sequences
+        # Retrieve sequences without embeddings
         query = """
         MATCH (p:Protein)
         WHERE p.embedding IS NULL AND p.sequence IS NOT NULL
         RETURN p.accession_id AS accession, p.sequence AS sequence
         """
-
-        # Execute the query and retrieve the results
         results = self.db.execute_read(query)
         data = [(result["accession"], result["sequence"]) for result in results]
+        
         if not data:
             logger.info("No sequences to process.")
             return
+        
         accessions, sequences = zip(*data)
         total_sequences = len(sequences)
-        logger.debug(f"Calculating embeddings for {total_sequences} sequences.")
-
-        # Process and save embeddings batch by batch
-        for batch_start in range(0, total_sequences, batch_size):
-            batch_end = min(batch_start + batch_size, total_sequences)
-            batch_sequences = sequences[batch_start:batch_end]
-            batch_accessions = accessions[batch_start:batch_end]
-            logger.debug(
-                f"Processing batch {batch_start // batch_size + 1}/"
-                f"{(total_sequences + batch_size - 1) // batch_size + 1}"
-            )
+        logger.debug(f"Total sequences to process: {total_sequences}")
 
-            # Get embeddings for the current batch
-            embeddings_batch = get_batch_embeddings(
-                list(batch_sequences), model, tokenizer, device, use_all_gpus
-            )
+        # Split the data into one round-robin chunk per worker
+        gpu_batches = [
+            list(zip(accessions[i::num_workers], sequences[i::num_workers]))
+            for i in range(num_workers)
+        ]
 
-            # Update the database for the current batch
-            update_protein_embeddings_in_db(
-                self.db, list(batch_accessions), embeddings_batch
-            )
+        start_time = time.time()
+
+        # Process batches in parallel across GPUs
+        with ThreadPoolExecutor(max_workers=num_workers) as executor:
+            futures = []
+            for i, gpu_data in enumerate(gpu_batches):
+                if not gpu_data:
+                    continue  # Skip empty GPU batches
+
+                model, tokenizer, device = models_and_tokenizers[i]
+                futures.append(
+                    executor.submit(
+                        process_batches_on_gpu,
+                        gpu_data,
+                        batch_size,
+                        model,
+                        tokenizer,
+                        device,
+                        self.db
+                    )
+                )
+            
+            for future in futures:
+                future.result()  # Wait for all threads to complete
+
+
+        end_time = time.time()
+        logger.info(f"Total embedding calculation time: {end_time - start_time:.2f} seconds")
 
-        # Free memory after processing all batches
-        del model, tokenizer
-        free_memory()
+        # Cleanup
+        for model, _, _ in models_and_tokenizers:
+            del model
 
     def get_proteins(self, accession_ids: list[str]) -> list[dict[str, Any]]:
         """

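For reference, the data split above is a round-robin striping: sequence i is assigned to worker i % num_workers, and each chunk is handed to process_batches_on_gpu on its own thread, whose OOM handler halves the batch size until a batch fits. A minimal sketch with invented accession IDs and sequences:

    # Each tuple is (accession_id, sequence); the data is invented.
    data = [("P1", "MKT"), ("P2", "MAA"), ("P3", "MGG"), ("P4", "MVV"), ("P5", "MLL")]
    num_workers = 2
    accessions, sequences = zip(*data)
    gpu_batches = [
        list(zip(accessions[i::num_workers], sequences[i::num_workers]))
        for i in range(num_workers)
    ]
    # gpu_batches[0] == [("P1", "MKT"), ("P3", "MGG"), ("P5", "MLL")]
    # gpu_batches[1] == [("P2", "MAA"), ("P4", "MVV")]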
From 01fc23aa654795cb438c4c298da8469f1ea26dc2 Mon Sep 17 00:00:00 2001
From: alacheim 
Date: Wed, 26 Mar 2025 22:15:35 +0000
Subject: [PATCH 12/39] changed embeddings to multi gpu

---
 src/pyeed/embedding.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/pyeed/embedding.py b/src/pyeed/embedding.py
index 31ce7dfd..c8fa91db 100644
--- a/src/pyeed/embedding.py
+++ b/src/pyeed/embedding.py
@@ -15,6 +15,7 @@
 from pyeed.dbconnect import DatabaseConnector
 
 
+
 def get_hf_token() -> str:
     """Get or request Hugging Face token."""
     if os.getenv("PYTEST_DISABLE_HF_LOGIN"):  # Disable Hugging Face login in tests

From b7c37e6cc3ce9d99b5c5ee7a0dfa387ce040d6f7 Mon Sep 17 00:00:00 2001
From: alacheim 
Date: Mon, 31 Mar 2025 16:42:48 +0000
Subject: [PATCH 13/39] changed catalytic activity to reaction

---
 src/pyeed/adapter/uniprot_mapper.py | 55 +++++++++++++++--------------
 src/pyeed/model.py                  | 15 ++++----
 2 files changed, 36 insertions(+), 34 deletions(-)

diff --git a/src/pyeed/adapter/uniprot_mapper.py b/src/pyeed/adapter/uniprot_mapper.py
index cbc9460b..f6d0a202 100644
--- a/src/pyeed/adapter/uniprot_mapper.py
+++ b/src/pyeed/adapter/uniprot_mapper.py
@@ -8,7 +8,7 @@
 from pyeed.adapter.primary_db_adapter import PrimaryDBMapper
 from pyeed.model import (
     Annotation,
-    CatalyticActivity,
+    Reaction,
     GOAnnotation,
     Organism,
     Protein,
@@ -57,9 +57,9 @@ def add_to_db(self, response: Response) -> None:
                 return
 
             protein.organism.connect(organism)
+            self.add_reaction(record, protein)
 
         self.add_sites(record, protein)
-        self.add_catalytic_activity(record, protein)
         self.add_go(record, protein)
 
     def add_sites(self, record: dict[str, Any], protein: Protein) -> None:
@@ -79,35 +79,38 @@ def add_sites(self, record: dict[str, Any], protein: Protein) -> None:
 
             protein.site.connect(site, {"positions": positions})
 
-    def add_catalytic_activity(self, record: dict[str, Any], protein: Protein) -> None:
-        try:
-            for reference in record["comments"]:
-                if reference["type"] == "CATALYTIC_ACTIVITY":
-                    name = reference["reaction"]["name"]
-                    for i in reference["reaction"]["dbReferences"]:
-                        if i['id'].startswith("RHEA:"):
-                            rhea_id = i['id']
-                            break
+    def add_reaction(self, record: dict[str, Any], protein: Protein) -> None:
+        for reference in record.get("comments", []):  # Safe retrieval with .get()
+            if reference.get("type") == "CATALYTIC_ACTIVITY":
+                name = reference.get("reaction", {}).get("name", "")
+                rhea_id = None  # Default value
+
+                for db_ref in reference.get("reaction", {}).get("dbReferences", []):
+                    if db_ref.get("id", "").startswith("RHEA:"):
+                        rhea_id = db_ref["id"]
+                        break  # Stop after finding the first match
+
+                # Ensure we have both a reaction name and an RHEA ID
+                if not name or not rhea_id:
+                    logger.warning(f"Skipping {protein.accession_id}: Missing reaction name or RHEA ID")
+                    continue  # Move to the next reference
+
+                try:
                     left_side, right_side = name.split("=")
+                    left_list = [s.strip() for s in left_side.split(" + ")]
+                    right_list = [s.strip() for s in right_side.split(" + ")]
 
-                    # Further split each side by "+"
-                    left_list = list(left_side.strip().split(" + "))
-                    right_list = list(right_side.strip().split(" + "))
-                    
-                    catalytic_annotation = CatalyticActivity.get_or_save(
-                        catalytic_id=int(reference["id"])
-                        if reference.get("id")
-                        else None,
+                    catalytic_annotation = Reaction.get_or_save(
                         rhea_id=rhea_id,
-                        reactants = left_list,
-                        products = right_list,
+                        reactants=left_list,
+                        products=right_list,
                     )
-                    protein.catalytic_annotation.connect(catalytic_annotation)
+                    protein.reaction.connect(catalytic_annotation)
+
+                except Exception as parse_error:
+                    logger.error(f"Error processing reaction for {protein.accession_id}: {parse_error}")
+                    continue  # Continue processing next accession_id
 
-        except Exception as e:
-            logger.error(
-                f"No Catalytic Activity for {protein.accession_id}: {e}"
-            )
 
     def add_go(self, record: dict[str, Any], protein: Protein) -> None:
         for reference in record["dbReferences"]:
diff --git a/src/pyeed/model.py b/src/pyeed/model.py
index d1c155b7..1162933f 100644
--- a/src/pyeed/model.py
+++ b/src/pyeed/model.py
@@ -369,20 +369,19 @@ def label(self) -> str:
         return f"{self.start}-{self.end}"
 
 
-class CatalyticActivity(StrictStructuredNode):
+class Reaction(StrictStructuredNode):
     """
-    A node representing a catalytic activity.
+    A node representing a reaction.
     """
-
-    catalytic_id = IntegerProperty(required=False, unique_index=True)
+    
     rhea_id = StringProperty(required=False, unique_index=True)
     reactants = ArrayProperty()
     products = ArrayProperty()
 
     @property
     def label(self) -> str:
-        """The label of the catalytic activity."""
-        return str(self.rhea_id)
+        """The label of the reaction."""
+        return str(self.rhea_id)
 
 
 class StandardNumbering(StrictStructuredNode):
@@ -497,7 +496,7 @@ class Protein(StrictStructuredNode):
     locus_tag = StringProperty()
     structure_ids = ArrayProperty(StringProperty())
     go_terms = ArrayProperty(StringProperty())
-    catalytic_name = ArrayProperty(StringProperty())
+    rhea_id = ArrayProperty(StringProperty())
     embedding = ArrayProperty(
         FloatProperty(),
         vector_index=VectorIndex(dimensions=1280),
@@ -510,7 +509,7 @@ class Protein(StrictStructuredNode):
     site = RelationshipTo("Site", "HAS_SITE", model=SiteRel)
     region = RelationshipTo("Region", "HAS_REGION", model=RegionRel)
     go_annotation = RelationshipTo("GOAnnotation", "ASSOCIATED_WITH")
-    catalytic_annotation = RelationshipTo("CatalyticActivity", "HAS_CATALYTIC_ACTIVITY")
+    reaction = RelationshipTo("Reaction", "HAS_REACTION")
     ontology_object = RelationshipTo("OntologyObject", "ASSOCIATED_WITH")
     mutation = RelationshipTo("Protein", "MUTATION", model=Mutation)
     pairwise_aligned = RelationshipTo(

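With the relationship renamed to HAS_REACTION, proteins can be fetched per reaction with the same kind of parameterized read query used elsewhere in pyeed. A hedged sketch, assuming a connected DatabaseConnector instance db (the Rhea ID is the example value used later in this series):

    query = """
    MATCH (p:Protein)-[:HAS_REACTION]->(r:Reaction {rhea_id: $rhea_id})
    RETURN p.accession_id AS accession
    """
    results = db.execute_read(query, {"rhea_id": "RHEA:49528"})
    accessions = [row["accession"] for row in results]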
From b8a82df7f66713e46eb3feb44a098da16cf7e100 Mon Sep 17 00:00:00 2001
From: alacheim 
Date: Mon, 31 Mar 2025 16:44:14 +0000
Subject: [PATCH 14/39] add more efficient cypher queries

---
 src/pyeed/analysis/network_analysis.py | 118 ++++++++++++++++---------
 1 file changed, 74 insertions(+), 44 deletions(-)

diff --git a/src/pyeed/analysis/network_analysis.py b/src/pyeed/analysis/network_analysis.py
index dab37fa0..fd354ebe 100644
--- a/src/pyeed/analysis/network_analysis.py
+++ b/src/pyeed/analysis/network_analysis.py
@@ -20,6 +20,27 @@ def __init__(self, db: DatabaseConnector):
         self.db: DatabaseConnector = db
         self.graph: nx.Graph = nx.Graph()
 
+    def check_indexes(self) -> list[dict[str, Any]]:
+        """
+        Checks all existing indexes in the Neo4j database.
+
+        Returns:
+            list[dict[str, Any]]: List of dictionaries containing index information including:
+                - name: The name of the index
+                - type: The type of index (e.g., "BTREE", "LOOKUP")
+                - labelsOrTypes: The labels or relationship types the index is on
+                - properties: The properties the index is on
+                - uniqueness: Whether the index is unique
+                - state: The state of the index (e.g., "ONLINE", "POPULATING")
+        """
+        query = """
+        SHOW INDEXES
+        """
+        logger.info("Checking existing indexes in the database")
+        indexes = self.db.execute_read(query)
+        logger.info(f"Found {len(indexes)} indexes")
+        return indexes
+
     def create_graph(
         self,
         nodes: Optional[list[str]] = None,
@@ -37,68 +58,77 @@ def create_graph(
         Returns:
             networkx.Graph: The created graph.
         """
-
         logger.info(
             f"Creating graph with node types: {nodes} and relationships: {relationships} and ids: {ids}"
         )
 
-        # Query to fetch nodes with filters
-        node_filter = ""
+        # Build the base query
+        base_query = """
+        MATCH (n)
+        """
+        
+        # Add node filters
+        node_filters = []
         if nodes:
-            node_filter += "WHERE labels(n)[0] IN $node_types "
+            node_filters.append("labels(n)[0] IN $node_types")
         if ids:
-            if "WHERE" in node_filter:
-                node_filter += "AND n.accession_id IN $accession_ids "
-            else:
-                node_filter += "WHERE n.accession_id IN $accession_ids "
-
-        query_nodes = f"""
-        MATCH (n)
-        {node_filter}
-        RETURN ID(n) as id, labels(n) as labels, properties(n) as properties
+            node_filters.append("n.accession_id IN $accession_ids")
+            
+        if node_filters:
+            base_query += "WHERE " + " AND ".join(node_filters)
+            
+        # Add relationship pattern and filters
+        base_query += """
+        OPTIONAL MATCH (n)-[r]->(m)
         """
-
-        # Query to fetch relationships with filters
-        relationship_filter = ""
+        
+        # Add relationship type filter if specified
         if relationships:
-            relationship_filter += "WHERE type(r) IN $relationships "
-
-        query_relationships = f"""
-        MATCH (n)-[r]->(m)
-        {relationship_filter}
-        RETURN ID(n) as source, ID(m) as target, type(r) as type, properties(r) as properties
+            base_query += "WHERE type(r) IN $relationships "
+            
+        # Return both nodes and relationships in a single query
+        base_query += """
+        RETURN 
+            collect(DISTINCT {id: ID(n), labels: labels(n), properties: properties(n)}) as nodes,
+            collect(DISTINCT {source: ID(n), target: ID(m), type: type(r), properties: properties(r)}) as relationships
         """
-
-        # Fetch nodes and relationships
-        logger.debug(f"Executing query: {query_nodes}")
-        nodes_results = self.db.execute_read(
-            query_nodes, {"node_types": nodes, "accession_ids": ids}
+        
+        logger.info("Executing combined query for nodes and relationships")
+        results = self.db.execute_read(
+            base_query,
+            {
+                "node_types": nodes,
+                "accession_ids": ids,
+                "relationships": relationships
+            }
         )
-        logger.debug(f"Executing query: {query_relationships}")
-        relationships_results = self.db.execute_read(
-            query_relationships, {"relationships": relationships}
-        )
-        logger.debug(f"Number of nodes: {len(nodes_results)}")
-        logger.debug(f"Number of relationships: {len(relationships_results)}")
-
-        # Add nodes
-        for node in nodes_results:
+        
+        if not results or not results[0]:
+            logger.warning("No results found in the database")
+            return self.graph
+            
+        # Process nodes
+        nodes_data = results[0]["nodes"]
+        for node in nodes_data:
             self.graph.add_node(
                 node["id"],
                 labels=node["labels"],
-                properties=node["properties"],
+                properties=node["properties"]
             )
-
-        # Add relationships
-        for rel in relationships_results:
+        logger.info(f"Added {len(nodes_data)} nodes to the graph")
+        
+        # Process relationships
+        relationships_data = results[0]["relationships"]
+        for rel in relationships_data:
             if rel["source"] in self.graph and rel["target"] in self.graph:
                 self.graph.add_edge(
                     rel["source"],
                     rel["target"],
                     type=rel["type"],
-                    properties=rel["properties"],
+                    properties=rel["properties"]
                 )
-
+        logger.info(f"Added {len(relationships_data)} relationships to the graph")
+        
         return self.graph
 
     def compute_degree_centrality(self) -> dict[Any, float]:
@@ -233,8 +263,8 @@ def calculate_positions_2d(
         filtered_graph.remove_edges_from(self_referential_edges)
 
         # Find isolated nodes
-        isolated_nodes = self.find_isolated_nodes(filtered_graph)
-        filtered_graph.remove_nodes_from(isolated_nodes)
+        #isolated_nodes = self.find_isolated_nodes(filtered_graph)
+        #filtered_graph.remove_nodes_from(isolated_nodes)
 
         # Use spring layout for force-directed graph
         weight_attr = attribute if attribute is not None else None

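The rewritten create_graph now collects nodes and relationships in a single round trip instead of two separate queries. A usage sketch, assuming na is an instance of the analysis class defined in this file (the class name sits outside these hunks) and an invented accession ID:

    graph = na.create_graph(
        nodes=["Protein"],               # node labels to keep
        relationships=["HAS_REACTION"],  # relationship types to keep
        ids=["ABC12345"],                # invented accession filter
    )
    print(graph.number_of_nodes(), graph.number_of_edges())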
From 4406df396f850a4cbb8a8f38931a8d713172f2b2 Mon Sep 17 00:00:00 2001
From: alacheim 
Date: Wed, 16 Apr 2025 09:16:32 +0000
Subject: [PATCH 15/39] added substrates and products to catalytic activity

---
 src/pyeed/adapter/uniprot_mapper.py      | 141 +++++++++++++++++++----
 src/pyeed/analysis/embedding_analysis.py |   5 +-
 src/pyeed/model.py                       |  43 ++++++-
 3 files changed, 160 insertions(+), 29 deletions(-)

diff --git a/src/pyeed/adapter/uniprot_mapper.py b/src/pyeed/adapter/uniprot_mapper.py
index f6d0a202..4f48c31b 100644
--- a/src/pyeed/adapter/uniprot_mapper.py
+++ b/src/pyeed/adapter/uniprot_mapper.py
@@ -1,6 +1,9 @@
 import json
 from collections import defaultdict
-from typing import Any
+from typing import Any, List
+import requests
+from bs4 import BeautifulSoup
+from SPARQLWrapper import SPARQLWrapper, JSON
 
 from httpx import Response
 from loguru import logger
@@ -13,6 +16,8 @@
     Organism,
     Protein,
     Site,
+    Reaction, 
+    Molecule,
 )
 
 
@@ -78,6 +83,83 @@ def add_sites(self, record: dict[str, Any], protein: Protein) -> None:
             site.save()
 
             protein.site.connect(site, {"positions": positions})
+    
+    def get_substrates_and_products_from_rhea(self, rhea_id: str) -> dict[str, List[str]]:
+        """Fetch substrates and products from Rhea by parsing the side URI (_L = substrate, _R = product).
+        
+        Args:
+            rhea_id (str or int): The Rhea reaction ID (e.g., 49528)
+        
+        Returns:
+            dict: {
+                'substrates': [list of chebi URIs],
+                'products': [list of chebi URIs]
+            }
+        """
+        rhea_id = rhea_id.strip().replace("RHEA:", "")
+        rhea_id_str = str(rhea_id).strip()
+        sparql = SPARQLWrapper("https://sparql.rhea-db.org/sparql")
+        sparql.setQuery(f"""
+        PREFIX rh: <http://rdf.rhea-db.org/>
+        PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
+
+        SELECT DISTINCT ?participant ?compound ?chebi ?side
+        WHERE {{
+        rh:{rhea_id_str} rh:side ?side .
+        ?side rh:contains ?participant .
+        ?participant rh:compound ?compound .
+        OPTIONAL {{ ?compound rh:chebi ?chebi . }}
+        OPTIONAL {{ ?compound rh:underlyingChebi ?chebi . }}
+        OPTIONAL {{
+            ?compound rdfs:seeAlso ?chebi .
+            FILTER STRSTARTS(STR(?chebi), "http://purl.obolibrary.org/obo/CHEBI_")
+        }}
+        }}
+        """)
+        sparql.setReturnFormat(JSON)
+        sparql.addCustomHttpHeader("User-Agent", "MyPythonClient/1.0")
+
+        results = sparql.query().convert()
+
+        substrates = set()
+        products = set()
+
+        for r in results["results"]["bindings"]:
+            chebi_uri = r.get("chebi", {}).get("value")
+            if not chebi_uri:
+                logger.info(f"No ChEBI URI found for compound {r['compound']['value']}")
+                continue
+
+            side_uri = r["side"]["value"]
+            if side_uri.endswith("_L"):
+                substrates.add(chebi_uri)
+            elif side_uri.endswith("_R"):
+                products.add(chebi_uri)
+
+        return {
+            "substrates": sorted(substrates),
+            "products": sorted(products)
+        }
+
+    
+
+    def get_smiles_from_chebi_web(self, chebi_url: str) -> str:
+        """
+        Extract SMILES from the official ChEBI page using HTML scraping.
+        """
+        chebi_id = chebi_url.split('_')[-1]
+        url = f"https://www.ebi.ac.uk/chebi/searchId.do?chebiId=CHEBI:{chebi_id}"
+
+        response = requests.get(url)
+        soup = BeautifulSoup(response.text, "html.parser")
+
+        # Look for table rows that contain the SMILES label
+        for table in soup.find_all("table", class_="chebiTableContent"):
+            for row in table.find_all("tr"):
+                headers = row.find_all("td", class_="chebiDataHeader")
+                if headers and "SMILES" in headers[0].text:
+                    data_cell = row.find_all("td")[-1]  # Get the last <td> cell in the row
+                    return data_cell.text.strip()
+                
 
     def add_reaction(self, record: dict[str, Any], protein: Protein) -> None:
         for reference in record.get("comments", []):  # Safe retrieval with .get()
@@ -89,28 +171,41 @@ def add_reaction(self, record: dict[str, Any], protein: Protein) -> None:
                     if db_ref.get("id", "").startswith("RHEA:"):
                         rhea_id = db_ref["id"]
                         break  # Stop after finding the first match
-
-                # Ensure we have both a reaction name and an RHEA ID
-                if not name or not rhea_id:
-                    logger.warning(f"Skipping {protein.accession_id}: Missing reaction name or RHEA ID")
-                    continue  # Move to the next reference
-
-                try:
-                    left_side, right_side = name.split("=")
-                    left_list = [s.strip() for s in left_side.split(" + ")]
-                    right_list = [s.strip() for s in right_side.split(" + ")]
-
-                    catalytic_annotation = Reaction.get_or_save(
-                        rhea_id=rhea_id,
-                        reactants=left_list,
-                        products=right_list,
-                    )
-                    protein.reaction.connect(catalytic_annotation)
-
-                except Exception as parse_error:
-                    logger.error(f"Error processing reaction for {protein.accession_id}: {parse_error}")
-                    continue  # Continue processing next accession_id
-
+                
+                catalytic_annotation = Reaction.get_or_save(
+                    rhea_id=rhea_id,
+                )
+                self.add_molecule(rhea_id, catalytic_annotation)
+                protein.reaction.connect(catalytic_annotation)
+
+    def add_molecule(self, rhea_id: str, reaction: Reaction) -> None:
+    
+        chebi = self.get_substrates_and_products_from_rhea(rhea_id)
+
+        substrate_ids = chebi["substrates"]
+        product_ids = chebi["products"]
+        
+        for i in substrate_ids:
+            smiles = self.get_smiles_from_chebi_web(i)
+            
+            chebi_id = i.split('_')[-1]
+            chebi_id = f"CHEBI:{chebi_id}"
+            substrate = Molecule.get_or_save(
+                chebi_id=chebi_id,
+                smiles=smiles,
+            )
+            reaction.substrate.connect(substrate)
+        
+        for i in product_ids:
+            smiles = self.get_smiles_from_chebi_web(i)
+
+            chebi_id = i.split('_')[-1]
+            chebi_id = f"CHEBI:{chebi_id}"
+            product = Molecule.get_or_save(
+                chebi_id=chebi_id,
+                smiles=smiles,
+            )
+            reaction.product.connect(product)
 
     def add_go(self, record: dict[str, Any], protein: Protein) -> None:
         for reference in record["dbReferences"]:
diff --git a/src/pyeed/analysis/embedding_analysis.py b/src/pyeed/analysis/embedding_analysis.py
index fa9d6c0e..b3535f74 100644
--- a/src/pyeed/analysis/embedding_analysis.py
+++ b/src/pyeed/analysis/embedding_analysis.py
@@ -348,8 +348,8 @@ def create_embedding_vector_index_neo4j(
 
     def find_nearest_neighbors_based_on_vector_index(
         self,
+        query_id: str,
         db: DatabaseConnector,
-        query_protein_id: str,
         index_name: str = "embedding_index",
         number_of_neighbors: int = 50,
     ) -> list[tuple[str, float]]:
@@ -406,10 +406,11 @@ def find_nearest_neighbors_based_on_vector_index(
             logger.info(f"Index {index_name} is populated, finding nearest neighbors")
 
         query_find_nearest_neighbors = f"""
-        MATCH (source:Protein {{accession_id: '{query_protein_id}'}})
+        MATCH (source:Protein {{accession_id: '{query_id}'}})
         WITH source.embedding AS embedding
         CALL db.index.vector.queryNodes('{index_name}', {number_of_neighbors}, embedding)
         YIELD node AS fprotein, score
+        WHERE score > 0.95
         RETURN fprotein.accession_id, score
         """
         results = db.execute_read(query_find_nearest_neighbors)
diff --git a/src/pyeed/model.py b/src/pyeed/model.py
index 1162933f..26478aef 100644
--- a/src/pyeed/model.py
+++ b/src/pyeed/model.py
@@ -1,11 +1,13 @@
 from enum import Enum
 from typing import Any
 
+
 # from pyeed.nodes_and_relations import StrictStructuredNode
 from neomodel import (
     ArrayProperty,
     FloatProperty,
     IntegerProperty,
+    BooleanProperty,
     RelationshipTo,
     StringProperty,
     StructuredNode,
@@ -111,6 +113,13 @@ def save(self, *args: Any, **kwargs: Any) -> None:
                 elif isinstance(base_property, FloatProperty):
                     if not all(isinstance(item, float) for item in prop):
                         raise TypeError(f"All items in '{field}' must be floats")
+                    
+            # Validate BooleanProperty
+            elif isinstance(neo_type, BooleanProperty) and not isinstance(prop, bool):
+                raise TypeError(
+                    f"Expected a boolean for '{field}', got {type(prop).__name__}"
+                )
+                
 
         super().save(*args, **kwargs)  # Don't return the result
 
@@ -374,16 +383,35 @@ class Reaction(StrictStructuredNode):
     A node representing a reaction.
     """
     
-    rhea_id = StringProperty(required=False, unique_index=True)
-    reactants = ArrayProperty()
-    products = ArrayProperty()
+    rhea_id = StringProperty(unique_index=True, required=True)
+    chebi_id = ArrayProperty(StringProperty())
+    smiles = ArrayProperty(StringProperty())
 
+    # Relationships
+    substrate = RelationshipTo("Molecule", "SUBSTRATE")
+    product = RelationshipTo("Molecule", "PRODUCT")
+    
+    
     @property
     def label(self) -> str:
         """The label of the reaction."""
         return str(self.rhea_id)
 
-
+class Molecule(StrictStructuredNode):
+    """
+    A node representing a molecule in the database.
+    """
+    
+    chebi_id = StringProperty(unique_index=True, required=True)
+    rhea_compound_id = StringProperty()
+    smiles = StringProperty()
+    
+    @property
+    def label(self) -> str:
+        """The label of the molecule."""
+        return str(self.chebi_id)
+    
+    
 class StandardNumbering(StrictStructuredNode):
     name = StringProperty(required=True, unique_index=True)
     definition = StringProperty(required=True)
@@ -497,12 +525,17 @@ class Protein(StrictStructuredNode):
     structure_ids = ArrayProperty(StringProperty())
     go_terms = ArrayProperty(StringProperty())
     rhea_id = ArrayProperty(StringProperty())
+    chebi_id = ArrayProperty(StringProperty())
     embedding = ArrayProperty(
         FloatProperty(),
         vector_index=VectorIndex(dimensions=1280),
         index_type="hnsw",
         distance_metric="COSINE",
     )
+    TBT = StringProperty()
+    PCL = StringProperty()
+    BHET = StringProperty()
+    PET_powder = StringProperty()
 
     # Relationships
     organism = RelationshipTo("Organism", "ORIGINATES_FROM")
@@ -510,6 +543,8 @@ class Protein(StrictStructuredNode):
     region = RelationshipTo("Region", "HAS_REGION", model=RegionRel)
     go_annotation = RelationshipTo("GOAnnotation", "ASSOCIATED_WITH")
     reaction = RelationshipTo("Reaction", "HAS_REACTION")
+    substrate = RelationshipTo("Molecule", "SUBSTRATE")
+    product = RelationshipTo("Molecule", "PRODUCT")
     ontology_object = RelationshipTo("OntologyObject", "ASSOCIATED_WITH")
     mutation = RelationshipTo("Protein", "MUTATION", model=Mutation)
     pairwise_aligned = RelationshipTo(

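The Rhea SPARQL helper above relies on the convention that a reaction's left-hand side URI ends in _L (substrates) and its right-hand side in _R (products), and it returns plain ChEBI URIs. A sketch of the expected return shape, using two real ChEBI entries as examples (15377 is water, 16761 is ADP):

    # Illustrative return value of get_substrates_and_products_from_rhea; the
    # URIs follow the OBO pattern matched by the SPARQL FILTER above.
    chebi = {
        "substrates": ["http://purl.obolibrary.org/obo/CHEBI_15377"],
        "products": ["http://purl.obolibrary.org/obo/CHEBI_16761"],
    }
    for uri in chebi["substrates"]:
        chebi_id = f"CHEBI:{uri.split('_')[-1]}"  # "CHEBI:15377", as in add_molecule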
From edd47b6980531eeb66c51d2b4efb4705dc6b093f Mon Sep 17 00:00:00 2001
From: alacheim 
Date: Wed, 16 Apr 2025 16:08:30 +0000
Subject: [PATCH 16/39] fixed saving of molecule nodes

---
 src/pyeed/adapter/uniprot_mapper.py |  5 ++---
 src/pyeed/model.py                  | 15 ++++++++++++++-
 2 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/src/pyeed/adapter/uniprot_mapper.py b/src/pyeed/adapter/uniprot_mapper.py
index 4f48c31b..26605256 100644
--- a/src/pyeed/adapter/uniprot_mapper.py
+++ b/src/pyeed/adapter/uniprot_mapper.py
@@ -164,7 +164,6 @@ def get_smiles_from_chebi_web(self, chebi_url: str) -> str:
     def add_reaction(self, record: dict[str, Any], protein: Protein) -> None:
         for reference in record.get("comments", []):  # Safe retrieval with .get()
             if reference.get("type") == "CATALYTIC_ACTIVITY":
-                name = reference.get("reaction", {}).get("name", "")
                 rhea_id = None  # Default value
 
                 for db_ref in reference.get("reaction", {}).get("dbReferences", []):
@@ -192,7 +191,7 @@ def add_molecule(self, rhea_id: str, reaction: Reaction) -> None:
             chebi_id = f"CHEBI:{chebi_id}"
             substrate = Molecule.get_or_save(
                 chebi_id=chebi_id,
-                smiles=smiles,
+                smiles = smiles,
             )
             reaction.substrate.connect(substrate)
         
@@ -203,7 +202,7 @@ def add_molecule(self, rhea_id: str, reaction: Reaction) -> None:
             chebi_id = f"CHEBI:{chebi_id}"
             product = Molecule.get_or_save(
                 chebi_id=chebi_id,
-                smiles=smiles,
+                smiles = smiles,
             )
             reaction.product.connect(product)
 
diff --git a/src/pyeed/model.py b/src/pyeed/model.py
index 26478aef..42c487c8 100644
--- a/src/pyeed/model.py
+++ b/src/pyeed/model.py
@@ -385,7 +385,6 @@ class Reaction(StrictStructuredNode):
     
     rhea_id = StringProperty(unique_index=True, required=True)
     chebi_id = ArrayProperty(StringProperty())
-    smiles = ArrayProperty(StringProperty())
 
     # Relationships
     substrate = RelationshipTo("Molecule", "SUBSTRATE")
@@ -405,6 +404,20 @@ class Molecule(StrictStructuredNode):
     chebi_id = StringProperty(unique_index=True, required=True)
     rhea_compound_id = StringProperty()
     smiles = StringProperty()
+
+    @classmethod
+    def get_or_save(cls, chebi_id, smiles) -> "Molecule":
+        try:
+            molecule = cls.nodes.get(chebi_id=chebi_id)
+            return molecule
+        except cls.DoesNotExist:
+            try:
+                molecule = cls(chebi_id=chebi_id, smiles=smiles)
+                molecule.save()
+                return molecule
+            except Exception as e:
+                print(f"Error during saving of the molecule: {e}")
+                raise
     
     @property
     def label(self) -> str:

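The new Molecule.get_or_save follows the usual look-up-then-create pattern keyed on the unique chebi_id, so re-importing a compound reuses the existing node instead of raising a uniqueness violation. A sketch assuming a live Neo4j connection, with water (SMILES "O") as the example:

    m1 = Molecule.get_or_save(chebi_id="CHEBI:15377", smiles="O")  # created on first call
    m2 = Molecule.get_or_save(chebi_id="CHEBI:15377", smiles="O")  # fetched on second call
    # Both names now refer to the same database node, so repeated
    # reaction.substrate.connect(...) calls do not duplicate molecules.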
From faf38fd4a32ab6ba27edc78748069464fe066195 Mon Sep 17 00:00:00 2001
From: alacheim 
Date: Fri, 25 Apr 2025 10:09:19 +0000
Subject: [PATCH 17/39] added individual file name to mapper

---
 src/pyeed/adapter/ncbi_to_uniprot_mapper.py | 7 ++++---
 src/pyeed/main.py                           | 4 ++--
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/src/pyeed/adapter/ncbi_to_uniprot_mapper.py b/src/pyeed/adapter/ncbi_to_uniprot_mapper.py
index 134d2ff5..4ea11801 100644
--- a/src/pyeed/adapter/ncbi_to_uniprot_mapper.py
+++ b/src/pyeed/adapter/ncbi_to_uniprot_mapper.py
@@ -10,8 +10,9 @@
 logger = logging.getLogger(__name__)
 
 class NCBIToUniprotMapper:
-    def __init__(self, ids):
+    def __init__(self, ids: List[str], file: str):
         self.ids = ids
+        self.file = file
         self.uniparc_url = "https://www.ebi.ac.uk/proteins/api/uniparc?offset=0&size=100&sequencechecksum="
         self.ncbi_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
     
@@ -122,9 +123,9 @@ def execute_request(self)  -> None:
                         id_mapping_uniprot[self.ids[counter]] = uniprot_id
             counter += 1
         
-        with open("id_mapping_uniprot.json", "w") as f:
+        with open(f"{self.file}_uniprot.json", "w") as f:
             json.dump(id_mapping_uniprot, f)
             
-        with open("id_mapping_uniparc.json", "w") as f:
+        with open(f"{self.file}_uniparc.json", "w") as f:
             json.dump(id_mapping_uniparc, f)
     
\ No newline at end of file
diff --git a/src/pyeed/main.py b/src/pyeed/main.py
index c7707d13..44fd6d52 100644
--- a/src/pyeed/main.py
+++ b/src/pyeed/main.py
@@ -187,7 +187,7 @@ def fetch_ncbi_nucleotide(self, ids: list[str]) -> None:
         asyncio.run(adapter.execute_requests())
         nest_asyncio.apply()
     
-    def database_id_mapper(self, ids: list[str]) -> None:
+    def database_id_mapper(self, ids: list[str], file: str) -> None:
         """
         Maps IDs from one database to another using the UniProt ID mapping service
 
@@ -195,7 +195,7 @@ def database_id_mapper(self, ids: list[str]) -> None:
             ids (list[str]): List of IDs to map.
         """
 
-        mapper = NCBIToUniprotMapper(ids)
+        mapper = NCBIToUniprotMapper(ids, file)
         mapper.execute_request()
         
         nest_asyncio.apply()

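With the file prefix threaded through, one mapper run now writes two JSON files named after the caller-supplied prefix. A usage sketch with an invented RefSeq ID (note that execute_request performs live NCBI and UniParc requests):

    mapper = NCBIToUniprotMapper(ids=["WP_000000001.1"], file="my_mapping")
    mapper.execute_request()
    # Writes my_mapping_uniprot.json and my_mapping_uniparc.json to the
    # working directory, keyed by the input RefSeq IDs.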
From 4df6c1733823e2cc7386a481915d6b705ffe7193 Mon Sep 17 00:00:00 2001
From: alacheim 
Date: Tue, 29 Apr 2025 08:56:28 +0000
Subject: [PATCH 18/39] added SPARQLWrapper as dependency

---
 pyproject.toml                      | 1 +
 src/pyeed/adapter/uniprot_mapper.py | 1 -
 2 files changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index dd10629a..86edb7ea 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -36,6 +36,7 @@ esm = "^3.1.3"
 rdflib = "^6.0.0"
 docker = "5.0.0"
 absl-py = "1.0.0"
+SPARQLWrapper = "2.0.0"
 
 [tool.poetry.group.dev.dependencies]
 mkdocstrings = {extras = ["python"], version = "^0.26.2"}
diff --git a/src/pyeed/adapter/uniprot_mapper.py b/src/pyeed/adapter/uniprot_mapper.py
index 26605256..4b339a3d 100644
--- a/src/pyeed/adapter/uniprot_mapper.py
+++ b/src/pyeed/adapter/uniprot_mapper.py
@@ -141,7 +141,6 @@ def get_substrates_and_products_from_rhea(self, rhea_id: str) -> dict[str, List[
         }
 
     
-
     def get_smiles_from_chebi_web(self, chebi_url: str) -> str:
         """
         Extract SMILES from the official ChEBI page using HTML scraping.

From a02d128e81b527f59b8a603d552e33e4c0b2fc16 Mon Sep 17 00:00:00 2001
From: alacheim 
Date: Wed, 30 Apr 2025 08:49:10 +0000
Subject: [PATCH 19/39] fixing with ruff

---
 src/pyeed/adapter/uniprot_mapper.py | 9 ++++-----
 src/pyeed/main.py                   | 9 +++------
 src/pyeed/model.py                  | 3 +--
 3 files changed, 8 insertions(+), 13 deletions(-)

diff --git a/src/pyeed/adapter/uniprot_mapper.py b/src/pyeed/adapter/uniprot_mapper.py
index 4b339a3d..477249eb 100644
--- a/src/pyeed/adapter/uniprot_mapper.py
+++ b/src/pyeed/adapter/uniprot_mapper.py
@@ -1,23 +1,22 @@
 import json
 from collections import defaultdict
 from typing import Any, List
+
 import requests
 from bs4 import BeautifulSoup
-from SPARQLWrapper import SPARQLWrapper, JSON
-
 from httpx import Response
 from loguru import logger
+from SPARQLWrapper import JSON, SPARQLWrapper
 
 from pyeed.adapter.primary_db_adapter import PrimaryDBMapper
 from pyeed.model import (
     Annotation,
-    Reaction,
     GOAnnotation,
+    Molecule,
     Organism,
     Protein,
+    Reaction,
     Site,
-    Reaction, 
-    Molecule,
 )
 
 
diff --git a/src/pyeed/main.py b/src/pyeed/main.py
index 30984592..cf0daed2 100644
--- a/src/pyeed/main.py
+++ b/src/pyeed/main.py
@@ -1,10 +1,10 @@
 import asyncio
-from typing import Any, Literal
 import time
 from concurrent.futures import ThreadPoolExecutor
-import torch
+from typing import Any, Literal
 
 import nest_asyncio
+import torch
 from loguru import logger
 
 from pyeed.adapter.ncbi_dna_mapper import NCBIDNAToPyeed
@@ -14,11 +14,8 @@
 from pyeed.dbchat import DBChat
 from pyeed.dbconnect import DatabaseConnector
 from pyeed.embedding import (
-    free_memory,
-    get_batch_embeddings,
     load_model_and_tokenizer,
-    update_protein_embeddings_in_db,
-    process_batches_on_gpu
+    process_batches_on_gpu,
 )
 
 
diff --git a/src/pyeed/model.py b/src/pyeed/model.py
index 42c487c8..95f9ffce 100644
--- a/src/pyeed/model.py
+++ b/src/pyeed/model.py
@@ -1,13 +1,12 @@
 from enum import Enum
 from typing import Any
 
-
 # from pyeed.nodes_and_relations import StrictStructuredNode
 from neomodel import (
     ArrayProperty,
+    BooleanProperty,
     FloatProperty,
     IntegerProperty,
-    BooleanProperty,
     RelationshipTo,
     StringProperty,
     StructuredNode,

From 9baa8f4f9790604f8289ecb621e0cfddfa870b23 Mon Sep 17 00:00:00 2001
From: alacheim 
Date: Wed, 30 Apr 2025 09:40:45 +0000
Subject: [PATCH 20/39] fixing errors with ruff

---
 src/pyeed/analysis/sequence_alignment.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/src/pyeed/analysis/sequence_alignment.py b/src/pyeed/analysis/sequence_alignment.py
index 4b7837a9..946200b2 100644
--- a/src/pyeed/analysis/sequence_alignment.py
+++ b/src/pyeed/analysis/sequence_alignment.py
@@ -1,15 +1,14 @@
 from itertools import combinations
-from typing import List,Any, Dict, Optional, Tuple
-import time
-import asyncio
+from typing import Any, Dict, Optional
 
 from Bio.Align import Alignment as Alignment
 from Bio.Align import PairwiseAligner as BioPairwiseAligner
 from Bio.Align.substitution_matrices import Array as BioSubstitutionMatrix
 from joblib import Parallel, cpu_count, delayed
+from rich.progress import Progress
+
 from pyeed.dbconnect import DatabaseConnector
 from pyeed.tools.utility import chunks
-from rich.progress import Progress
 
 
 class PairwiseAligner:

From 056cb6b75cb45efc650648b6e2a7b56c446803b3 Mon Sep 17 00:00:00 2001
From: alacheim 
Date: Wed, 30 Apr 2025 09:50:29 +0000
Subject: [PATCH 21/39] fixing ruff errors

---
 src/pyeed/adapter/ncbi_to_uniprot_mapper.py | 11 ++++++-----
 src/pyeed/main.py                           | 11 ++++-------
 2 files changed, 10 insertions(+), 12 deletions(-)

diff --git a/src/pyeed/adapter/ncbi_to_uniprot_mapper.py b/src/pyeed/adapter/ncbi_to_uniprot_mapper.py
index 4ea11801..6969e2e8 100644
--- a/src/pyeed/adapter/ncbi_to_uniprot_mapper.py
+++ b/src/pyeed/adapter/ncbi_to_uniprot_mapper.py
@@ -1,12 +1,13 @@
-import httpx
+import json
 import logging
-from pysam import FastaFile
-from crc64iso import crc64iso
-import sys
-import json 
 import os
+import sys
 from typing import List
 
+import httpx
+from crc64iso import crc64iso
+from pysam import FastaFile
+
 logger = logging.getLogger(__name__)
 
 class NCBIToUniprotMapper:
diff --git a/src/pyeed/main.py b/src/pyeed/main.py
index c971fe16..7effe1f1 100644
--- a/src/pyeed/main.py
+++ b/src/pyeed/main.py
@@ -1,25 +1,22 @@
 import asyncio
-from typing import Any, Literal
 import time
 from concurrent.futures import ThreadPoolExecutor
-import torch
+from typing import Any, Literal
 
 import nest_asyncio
+import torch
 from loguru import logger
 
 from pyeed.adapter.ncbi_dna_mapper import NCBIDNAToPyeed
 from pyeed.adapter.ncbi_protein_mapper import NCBIProteinToPyeed
+from pyeed.adapter.ncbi_to_uniprot_mapper import NCBIToUniprotMapper
 from pyeed.adapter.primary_db_adapter import PrimaryDBAdapter
 from pyeed.adapter.uniprot_mapper import UniprotToPyeed
-from pyeed.adapter.ncbi_to_uniprot_mapper import NCBIToUniprotMapper
 from pyeed.dbchat import DBChat
 from pyeed.dbconnect import DatabaseConnector
 from pyeed.embedding import (
-    free_memory,
-    get_batch_embeddings,
     load_model_and_tokenizer,
-    update_protein_embeddings_in_db,
-    process_batches_on_gpu
+    process_batches_on_gpu,
 )
 
 

From e7369e93c2dfd504150afbe422c76c73a20043fd Mon Sep 17 00:00:00 2001
From: alacheim 
Date: Wed, 30 Apr 2025 10:05:54 +0000
Subject: [PATCH 22/39] added crc64iso to dependencies

---
 pyproject.toml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pyproject.toml b/pyproject.toml
index dd10629a..41b2c8fc 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -36,6 +36,7 @@ esm = "^3.1.3"
 rdflib = "^6.0.0"
 docker = "5.0.0"
 absl-py = "1.0.0"
+crc64iso = "0.0.2"
 
 [tool.poetry.group.dev.dependencies]
 mkdocstrings = {extras = ["python"], version = "^0.26.2"}

From f9897fe4e1185dfa0d5003b5881572a8e9355118 Mon Sep 17 00:00:00 2001
From: alacheim 
Date: Wed, 30 Apr 2025 10:09:18 +0000
Subject: [PATCH 23/39] added pysam to dependencies

---
 pyproject.toml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pyproject.toml b/pyproject.toml
index 2a57449b..2f81fd56 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -38,6 +38,7 @@ docker = "5.0.0"
 absl-py = "1.0.0"
 crc64iso = "0.0.2"
 SPARQLWrapper = "2.0.0"
+pysam = "0.23.0"
 
 [tool.poetry.group.dev.dependencies]
 mkdocstrings = {extras = ["python"], version = "^0.26.2"}

From 9d81e9b2204023d8c2a2ebff0b8ac88ff7e384cb Mon Sep 17 00:00:00 2001
From: alacheim 
Date: Wed, 30 Apr 2025 10:21:54 +0000
Subject: [PATCH 24/39] fixing linting errors

---
 src/pyeed/adapter/ncbi_to_uniprot_mapper.py | 2 +-
 src/pyeed/analysis/sequence_alignment.py    | 3 +--
 src/pyeed/embedding.py                      | 5 ++---
 3 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/src/pyeed/adapter/ncbi_to_uniprot_mapper.py b/src/pyeed/adapter/ncbi_to_uniprot_mapper.py
index 6969e2e8..8e543f52 100644
--- a/src/pyeed/adapter/ncbi_to_uniprot_mapper.py
+++ b/src/pyeed/adapter/ncbi_to_uniprot_mapper.py
@@ -104,7 +104,7 @@ def execute_request(self)  -> None:
                 
             #check if the request was successful
             if response.status_code != 200:
-                print(f"Request failed with status code {r.status_code}")
+                print(f"Request failed with status code {response.status_code}")
                 response.raise_for_status()  # Raise exception for any non-200 response
                 sys.exit()
             
diff --git a/src/pyeed/analysis/sequence_alignment.py b/src/pyeed/analysis/sequence_alignment.py
index 946200b2..9255d55f 100644
--- a/src/pyeed/analysis/sequence_alignment.py
+++ b/src/pyeed/analysis/sequence_alignment.py
@@ -5,10 +5,9 @@
 from Bio.Align import PairwiseAligner as BioPairwiseAligner
 from Bio.Align.substitution_matrices import Array as BioSubstitutionMatrix
 from joblib import Parallel, cpu_count, delayed
-from rich.progress import Progress
-
 from pyeed.dbconnect import DatabaseConnector
 from pyeed.tools.utility import chunks
+from rich.progress import Progress
 
 
 class PairwiseAligner:
diff --git a/src/pyeed/embedding.py b/src/pyeed/embedding.py
index c8fa91db..1b0d4955 100644
--- a/src/pyeed/embedding.py
+++ b/src/pyeed/embedding.py
@@ -1,21 +1,20 @@
 import gc
 import os
 from typing import Any, Tuple, Union
-from loguru import logger
 
 import numpy as np
 import torch
-from esm.models.esmc import ESMC
 from esm.models.esm3 import ESM3
+from esm.models.esmc import ESMC
 from esm.sdk.api import ESM3InferenceClient, ESMProtein, LogitsConfig, SamplingConfig
 from huggingface_hub import HfFolder, login
+from loguru import logger
 from numpy.typing import NDArray
 from transformers import EsmModel, EsmTokenizer
 
 from pyeed.dbconnect import DatabaseConnector
 
 
-
 def get_hf_token() -> str:
     """Get or request Hugging Face token."""
     if os.getenv("PYTEST_DISABLE_HF_LOGIN"):  # Disable Hugging Face login in tests

From 3955718fa2d05a3739b1cb3c4e8bd3dc1c06d676 Mon Sep 17 00:00:00 2001
From: alacheim 
Date: Wed, 30 Apr 2025 11:29:50 +0000
Subject: [PATCH 25/39] reformatting

---
 src/pyeed/adapter/ncbi_to_uniprot_mapper.py | 68 ++++++++++-----------
 src/pyeed/adapter/uniprot_mapper.py         | 36 +++++------
 src/pyeed/analysis/network_analysis.py      | 36 +++++------
 src/pyeed/analysis/sequence_alignment.py    | 13 ++--
 src/pyeed/dbconnect.py                      |  7 ++-
 src/pyeed/embedding.py                      | 15 +++--
 src/pyeed/main.py                           | 25 ++++----
 src/pyeed/model.py                          | 23 ++++---
 8 files changed, 114 insertions(+), 109 deletions(-)

diff --git a/src/pyeed/adapter/ncbi_to_uniprot_mapper.py b/src/pyeed/adapter/ncbi_to_uniprot_mapper.py
index 8e543f52..2f711e16 100644
--- a/src/pyeed/adapter/ncbi_to_uniprot_mapper.py
+++ b/src/pyeed/adapter/ncbi_to_uniprot_mapper.py
@@ -10,18 +10,18 @@
 
 logger = logging.getLogger(__name__)
 
+
 class NCBIToUniprotMapper:
     def __init__(self, ids: List[str], file: str):
         self.ids = ids
         self.file = file
         self.uniparc_url = "https://www.ebi.ac.uk/proteins/api/uniparc?offset=0&size=100&sequencechecksum="
         self.ncbi_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
-    
-    
+
     def download_fasta(self, refseq_id: str) -> None:
         """
         Downloads a FASTA file for a given RefSeq ID using httpx and saves it locally.
-        
+
         Args:
             refseq_id str: NCBI ID
         """
@@ -30,9 +30,9 @@ def download_fasta(self, refseq_id: str) -> None:
             "db": "protein",
             "id": refseq_id,
             "rettype": "fasta",
-            "retmode": "text"
+            "retmode": "text",
         }
-        
+
         try:
             response = httpx.get(self.ncbi_url, params=params, timeout=10.0)
 
@@ -42,21 +42,23 @@ def download_fasta(self, refseq_id: str) -> None:
                     f.write(response.text)
                 print(f"✅ Downloaded: {filename}")
             else:
-                print(f"❌ Failed to download {refseq_id} (Status: {response.status_code})")
+                print(
+                    f"❌ Failed to download {refseq_id} (Status: {response.status_code})"
+                )
 
         except httpx.HTTPError as e:
             print(f"❌ HTTP error occurred while downloading {refseq_id}: {e}")
 
     def get_checksum(self, refseq_id: str) -> str:
         """Fetches and calculates the checksum for a given RefSeq ID.
-        
-        Args: 
+
+        Args:
             refseq_id (str): NCBI ID
-        
+
         Returns:
             str: checksum ID
         """
-        
+
         self.download_fasta(refseq_id)
         fa = FastaFile(f"{refseq_id}.fasta")
         seq = fa.fetch(fa.references[0])
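
Note for review: `get_checksum` produces the CRC-64 (ISO) digest that the UniParc endpoint indexes sequences by. The computation in isolation, using only `crc64iso` as the adapter does (the sequence is illustrative):

```python
from crc64iso import crc64iso

seq = "MSEQVAAVAKLRAKASEAAK"  # illustrative protein sequence
checksum = crc64iso.crc64(seq)
print(checksum)  # hex digest string, passed as the sequencechecksum query parameter
```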
@@ -71,7 +73,7 @@ def checksum_list(self, refseq_ids: List[str]) -> List[str]:
         Returns:
             List[str]: checksum IDs
         """
-        
+
         checksums = []
         for refseq_id in refseq_ids:
             checksums.append(self.get_checksum(refseq_id))
@@ -85,48 +87,46 @@ def checksum_list(self, refseq_ids: List[str]) -> List[str]:
                 os.remove(fai_file_path)
         return checksums
 
-    def execute_request(self)  -> None: 
-        """Fetches the uniparc and uniprot ids for the given refseq ids and saves them in a json file.
-        """
-        
+    def execute_request(self) -> None:
+        """Fetches the UniParc and UniProt IDs for the given RefSeq IDs and saves them in JSON files."""
+
         checksum_list = self.checksum_list(self.ids)
-        
+
         id_mapping_uniprot = {}
         id_mapping_uniparc = {}
         counter = 0
-        
-        for checksum in checksum_list: 
+
+        for checksum in checksum_list:
             url = f"{self.uniparc_url}{checksum}"
-            
-            #perform request and get response as JSON
+
+            # perform request and get response as JSON
             with httpx.Client() as client:
-                response = client.get(url, headers={ "Accept" : "application/json"})
-                
-            #check if the request was successful
+                response = client.get(url, headers={"Accept": "application/json"})
+
+            # check if the request was successful
             if response.status_code != 200:
                 print(f"Request failed with status code {response.status_code}")
                 response.raise_for_status()  # Raise exception for any non-200 response
                 sys.exit()
-            
+
             # Check if the response body is empty
             if not response.content.strip():  # Check if the body is empty
                 print("The response body is empty.")
                 sys.exit()
-            
-            #extracts the uniprot and the uniparc id from the repsonse and saves them in a dictionary
+
+            # extract the UniProt and UniParc IDs from the response and save them in the mapping dictionaries
             response_body = response.json()
-            for item in response_body: 
-                uniparc_id = item.get('accession', None)
-                for ref in item.get('dbReference', []):  
-                    if ref.get('type') == 'UniProtKB/TrEMBL':
-                        uniprot_id = ref.get('id', None)
+            for item in response_body:
+                uniparc_id = item.get("accession", None)
+                for ref in item.get("dbReference", []):
+                    if ref.get("type") == "UniProtKB/TrEMBL":
+                        uniprot_id = ref.get("id", None)
                         id_mapping_uniparc[self.ids[counter]] = uniparc_id
                         id_mapping_uniprot[self.ids[counter]] = uniprot_id
             counter += 1
-        
+
         with open(f"{self.file}_uniprot.json", "w") as f:
             json.dump(id_mapping_uniprot, f)
-            
+
         with open(f"{self.file}_uniparc.json", "w") as f:
             json.dump(id_mapping_uniparc, f)
-    
\ No newline at end of file
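
Note for review: end to end, the mapper downloads each FASTA, computes the checksum, and queries UniParc. A minimal usage sketch under the post-patch signature (the accessions and the `mapping` output prefix are illustrative, not part of the patch):

```python
from pyeed.adapter.ncbi_to_uniprot_mapper import NCBIToUniprotMapper

ids = ["WP_000027057.1", "WP_000027050.1"]  # illustrative RefSeq protein accessions

# Writes mapping_uniprot.json and mapping_uniparc.json to the working directory.
mapper = NCBIToUniprotMapper(ids, file="mapping")
mapper.execute_request()
```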
diff --git a/src/pyeed/adapter/uniprot_mapper.py b/src/pyeed/adapter/uniprot_mapper.py
index 477249eb..36ed3577 100644
--- a/src/pyeed/adapter/uniprot_mapper.py
+++ b/src/pyeed/adapter/uniprot_mapper.py
@@ -82,13 +82,15 @@ def add_sites(self, record: dict[str, Any], protein: Protein) -> None:
             site.save()
 
             protein.site.connect(site, {"positions": positions})
-    
-    def get_substrates_and_products_from_rhea(self, rhea_id: str) -> dict[str, List[str]]:
+
+    def get_substrates_and_products_from_rhea(
+        self, rhea_id: str
+    ) -> dict[str, List[str]]:
         """Fetch substrates and products from Rhea by parsing the side URI (_L = substrate, _R = product).
-        
+
         Args:
             rhea_id (str or int): The Rhea reaction ID (e.g., 49528)
-        
+
         Returns:
             dict: {
                 'substrates': [list of chebi URIs],
@@ -134,17 +136,13 @@ def get_substrates_and_products_from_rhea(self, rhea_id: str) -> dict[str, List[
             elif side_uri.endswith("_R"):
                 products.add(chebi_uri)
 
-        return {
-            "substrates": sorted(substrates),
-            "products": sorted(products)
-        }
+        return {"substrates": sorted(substrates), "products": sorted(products)}
 
-    
     def get_smiles_from_chebi_web(self, chebi_url: str) -> str:
         """
         Extract SMILES from the official ChEBI page using HTML scraping.
         """
-        chebi_id = chebi_url.split('_')[-1]
+        chebi_id = chebi_url.split("_")[-1]
         url = f"https://www.ebi.ac.uk/chebi/searchId.do?chebiId=CHEBI:{chebi_id}"
 
         response = requests.get(url)
@@ -157,7 +155,6 @@ def get_smiles_from_chebi_web(self, chebi_url: str) -> str:
                 if headers and "SMILES" in headers[0].text:
                     data_cell = row.find_all("td")[-1]  # Get the last <td> in row
                     return data_cell.text.strip()
-                
 
     def add_reaction(self, record: dict[str, Any], protein: Protein) -> None:
         for reference in record.get("comments", []):  # Safe retrieval with .get()
@@ -168,7 +165,7 @@ def add_reaction(self, record: dict[str, Any], protein: Protein) -> None:
                     if db_ref.get("id", "").startswith("RHEA:"):
                         rhea_id = db_ref["id"]
                         break  # Stop after finding the first match
-                
+
                 catalytic_annotation = Reaction.get_or_save(
                     rhea_id=rhea_id,
                 )
@@ -176,31 +173,30 @@ def add_reaction(self, record: dict[str, Any], protein: Protein) -> None:
                 protein.reaction.connect(catalytic_annotation)
 
     def add_molecule(self, rhea_id: str, reaction: Reaction) -> None:
-    
         chebi = self.get_substrates_and_products_from_rhea(rhea_id)
 
         substrate_ids = chebi["substrates"]
         product_ids = chebi["products"]
-        
+
         for i in substrate_ids:
             smiles = self.get_smiles_from_chebi_web(i)
-            
-            chebi_id = i.split('_')[-1]
+
+            chebi_id = i.split("_")[-1]
             chebi_id = f"CHEBI:{chebi_id}"
             substrate = Molecule.get_or_save(
                 chebi_id=chebi_id,
-                smiles = smiles,
+                smiles=smiles,
             )
             reaction.substrate.connect(substrate)
-        
+
         for i in product_ids:
             smiles = self.get_smiles_from_chebi_web(i)
 
-            chebi_id = i.split('_')[-1]
+            chebi_id = i.split("_")[-1]
             chebi_id = f"CHEBI:{chebi_id}"
             product = Molecule.get_or_save(
                 chebi_id=chebi_id,
-                smiles = smiles,
+                smiles=smiles,
             )
             reaction.product.connect(product)
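
Note for review: a hedged sketch of how the two Rhea/ChEBI helpers above compose. The Rhea ID format follows the docstring example, and the no-argument constructor is an assumption:

```python
from pyeed.adapter.uniprot_mapper import UniprotToPyeed

mapper = UniprotToPyeed()  # assumption: no constructor arguments needed here

sides = mapper.get_substrates_and_products_from_rhea("49528")
# -> {"substrates": [ChEBI URIs from the _L side], "products": [ChEBI URIs from the _R side]}
for uri in sides["substrates"]:
    smiles = mapper.get_smiles_from_chebi_web(uri)  # may be None if no SMILES row exists
    print(uri, smiles)
```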
 
diff --git a/src/pyeed/analysis/network_analysis.py b/src/pyeed/analysis/network_analysis.py
index fd354ebe..dd66b45c 100644
--- a/src/pyeed/analysis/network_analysis.py
+++ b/src/pyeed/analysis/network_analysis.py
@@ -66,57 +66,51 @@ def create_graph(
         base_query = """
         MATCH (n)
         """
-        
+
         # Add node filters
         node_filters = []
         if nodes:
             node_filters.append("labels(n)[0] IN $node_types")
         if ids:
             node_filters.append("n.accession_id IN $accession_ids")
-            
+
         if node_filters:
             base_query += "WHERE " + " AND ".join(node_filters)
-            
+
         # Add relationship pattern and filters
         base_query += """
         OPTIONAL MATCH (n)-[r]->(m)
         """
-        
+
         # Add relationship type filter if specified
         if relationships:
             base_query += "WHERE type(r) IN $relationships "
-            
+
         # Return both nodes and relationships in a single query
         base_query += """
         RETURN 
             collect(DISTINCT {id: ID(n), labels: labels(n), properties: properties(n)}) as nodes,
             collect(DISTINCT {source: ID(n), target: ID(m), type: type(r), properties: properties(r)}) as relationships
         """
-        
+
         logger.info("Executing combined query for nodes and relationships")
         results = self.db.execute_read(
             base_query,
-            {
-                "node_types": nodes,
-                "accession_ids": ids,
-                "relationships": relationships
-            }
+            {"node_types": nodes, "accession_ids": ids, "relationships": relationships},
         )
-        
+
         if not results or not results[0]:
             logger.warning("No results found in the database")
             return self.graph
-            
+
         # Process nodes
         nodes_data = results[0]["nodes"]
         for node in nodes_data:
             self.graph.add_node(
-                node["id"],
-                labels=node["labels"],
-                properties=node["properties"]
+                node["id"], labels=node["labels"], properties=node["properties"]
             )
         logger.info(f"Added {len(nodes_data)} nodes to the graph")
-        
+
         # Process relationships
         relationships_data = results[0]["relationships"]
         for rel in relationships_data:
@@ -125,10 +119,10 @@ def create_graph(
                     rel["source"],
                     rel["target"],
                     type=rel["type"],
-                    properties=rel["properties"]
+                    properties=rel["properties"],
                 )
         logger.info(f"Added {len(relationships_data)} relationships to the graph")
-        
+
         return self.graph
 
     def compute_degree_centrality(self) -> dict[Any, float]:
@@ -263,8 +257,8 @@ def calculate_positions_2d(
         filtered_graph.remove_edges_from(self_referential_edges)
 
         # Find isolated nodes
-        #isolated_nodes = self.find_isolated_nodes(filtered_graph)
-        #filtered_graph.remove_nodes_from(isolated_nodes)
+        # isolated_nodes = self.find_isolated_nodes(filtered_graph)
+        # filtered_graph.remove_nodes_from(isolated_nodes)
 
         # Use spring layout for force-directed graph
         weight_attr = attribute if attribute is not None else None
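
Note for review: the reformatted query builds the NetworkX graph in a single round trip. A hedged call-site sketch; the analysis class name and constructor, the connection details, and the accession IDs are all assumptions for illustration, since this hunk does not show them:

```python
from pyeed.analysis.network_analysis import NetworkAnalysis  # class name assumed
from pyeed.dbconnect import DatabaseConnector

db = DatabaseConnector("bolt://localhost:7687", "neo4j", "password")  # details assumed
na = NetworkAnalysis(db)  # constructor signature assumed

# Restrict to Protein nodes with given accessions, linked by pairwise alignments.
graph = na.create_graph(
    nodes=["Protein"],
    ids=["AAM15527.1", "AAF05614.1"],  # illustrative accession IDs
    relationships=["PAIRWISE_ALIGNED"],
)
print(graph.number_of_nodes(), graph.number_of_edges())
```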
diff --git a/src/pyeed/analysis/sequence_alignment.py b/src/pyeed/analysis/sequence_alignment.py
index 9255d55f..d57c5e63 100644
--- a/src/pyeed/analysis/sequence_alignment.py
+++ b/src/pyeed/analysis/sequence_alignment.py
@@ -141,12 +141,17 @@ def align_multipairwise(
         MATCH (p1:Protein)-[:PAIRWISE_ALIGNED]->(p2:Protein)
         RETURN p1.accession_id AS Protein1_ID, p2.accession_id AS Protein2_ID
         """
-        
+
         # Fetch results properly as a list of tuples
-        existing_pairs = set(tuple(sorted((row["Protein1_ID"], row["Protein2_ID"]))) for row in db.execute_write(query))
+        existing_pairs = set(
+            tuple(sorted((row["Protein1_ID"], row["Protein2_ID"])))
+            for row in db.execute_write(query)
+        )
 
         # Filter new pairs that are not in existing_pairs
-        new_pairs = [pair for pair in pairs if tuple(sorted(pair)) not in existing_pairs]
+        new_pairs = [
+            pair for pair in pairs if tuple(sorted(pair)) not in existing_pairs
+        ]
 
         print(f"Number of existing pairs: {len(existing_pairs)}")
         print(f"Number of total pairs: {len(pairs)}")
@@ -351,4 +356,4 @@ def _get_id_sequence_dict(
     def _load_substitution_matrix(self) -> "BioSubstitutionMatrix":
         from Bio.Align import substitution_matrices
 
-        return substitution_matrices.load(self.substitution_matrix)  # type: ignore
\ No newline at end of file
+        return substitution_matrices.load(self.substitution_matrix)  # type: ignore
diff --git a/src/pyeed/dbconnect.py b/src/pyeed/dbconnect.py
index d208deec..8abcab52 100644
--- a/src/pyeed/dbconnect.py
+++ b/src/pyeed/dbconnect.py
@@ -227,8 +227,11 @@ def _get_driver(uri: str, user: str | None, password: str | None) -> Driver:
         Creates a new Neo4j driver instance.
         """
         auth = (user, password) if user and password else None
-        return GraphDatabase.driver(uri, auth=auth, connection_timeout=60,  # Increase initial connection timeout
-        max_connection_lifetime=86400,  # Keep connections alive longer
+        return GraphDatabase.driver(
+            uri,
+            auth=auth,
+            connection_timeout=60,  # Increase initial connection timeout
+            max_connection_lifetime=86400,  # Keep connections alive longer
         )
 
     @property
diff --git a/src/pyeed/embedding.py b/src/pyeed/embedding.py
index 1b0d4955..a0229385 100644
--- a/src/pyeed/embedding.py
+++ b/src/pyeed/embedding.py
@@ -31,6 +31,7 @@ def get_hf_token() -> str:
     else:
         raise RuntimeError("Failed to get Hugging Face token")
 
+
 def process_batches_on_gpu(data, batch_size, model, tokenizer, device, db):
     """
     Splits data into batches and processes them on a single GPU.
@@ -64,15 +65,21 @@ def process_batches_on_gpu(data, batch_size, model, tokenizer, device, db):
                 )
 
                 # Update the database
-                update_protein_embeddings_in_db(db, list(accessions[:current_batch_size]), embeddings_batch)
+                update_protein_embeddings_in_db(
+                    db, list(accessions[:current_batch_size]), embeddings_batch
+                )
 
                 # Move to the next batch
                 break  # Successful execution, move to the next batch
 
             except torch.cuda.OutOfMemoryError:
                 torch.cuda.empty_cache()
-                current_batch_size = max(1, current_batch_size // 2)  # Reduce batch size
-                logger.warning(f"Reduced batch size to {current_batch_size} due to OOM error.")
+                current_batch_size = max(
+                    1, current_batch_size // 2
+                )  # Reduce batch size
+                logger.warning(
+                    f"Reduced batch size to {current_batch_size} due to OOM error."
+                )
 
     # Free memory
     del model
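
Note for review: the OOM handler above halves `current_batch_size` until a batch fits. The same backoff pattern as a self-contained sketch, where the `embed` callable stands in for the model forward pass:

```python
import torch

def embed_with_backoff(items: list[str], embed, max_batch: int) -> list:
    """Embed `items` in chunks, halving the chunk size on CUDA OOM."""
    results, start, size = [], 0, max_batch
    while start < len(items):
        try:
            results.extend(embed(items[start : start + size]))
            start += size  # chunk succeeded, advance
        except torch.cuda.OutOfMemoryError:
            torch.cuda.empty_cache()
            if size == 1:
                raise  # a single item does not fit on the device
            size = max(1, size // 2)  # halve and retry the same chunk
    return results
```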
@@ -82,7 +89,7 @@ def process_batches_on_gpu(data, batch_size, model, tokenizer, device, db):
 def load_model_and_tokenizer(
     model_name: str,
     device: str,
-    ) -> Tuple[Any, Union[Any, None], str]:
+) -> Tuple[Any, Union[Any, None], str]:
     """
     Loads the model and assigns it to a specific GPU.
 
diff --git a/src/pyeed/main.py b/src/pyeed/main.py
index 7effe1f1..5ba41d0d 100644
--- a/src/pyeed/main.py
+++ b/src/pyeed/main.py
@@ -190,8 +190,7 @@ def fetch_ncbi_nucleotide(self, ids: list[str]) -> None:
         # Fix: apply nest_asyncio and then run the coroutine with the event loop
         nest_asyncio.apply()
         asyncio.get_event_loop().run_until_complete(adapter.execute_requests())
-    
-    
+
     def database_id_mapper(self, ids: list[str], file: str) -> None:
         """
         Maps NCBI protein IDs to UniParc and UniProt accessions via a sequence-checksum lookup against the EBI Proteins API.
@@ -202,7 +201,7 @@ def database_id_mapper(self, ids: list[str], file: str) -> None:
 
         mapper = NCBIToUniprotMapper(ids, file)
         mapper.execute_request()
-        
+
         nest_asyncio.apply()
 
     def calculate_sequence_embeddings(
@@ -210,9 +209,9 @@ def calculate_sequence_embeddings(
         batch_size: int = 16,
         model_name: str = "facebook/esm2_t33_650M_UR50D",
         num_gpus: int = None,  # Number of GPUs to use
-        ) -> None:
+    ) -> None:
         """
-        Calculates embeddings for all sequences in the database that do not have embeddings, 
+        Calculates embeddings for all sequences in the database that do not have embeddings,
         distributing the workload across available GPUs.
 
         Args:
@@ -243,18 +242,19 @@ def calculate_sequence_embeddings(
         """
         results = self.db.execute_read(query)
         data = [(result["accession"], result["sequence"]) for result in results]
-        
+
         if not data:
             logger.info("No sequences to process.")
             return
-        
+
         accessions, sequences = zip(*data)
         total_sequences = len(sequences)
         logger.debug(f"Total sequences to process: {total_sequences}")
 
         # Split the data into num_gpus chunks
         gpu_batches = [
-            list(zip(accessions[i::num_gpus], sequences[i::num_gpus])) for i in range(num_gpus)
+            list(zip(accessions[i::num_gpus], sequences[i::num_gpus]))
+            for i in range(num_gpus)
         ]
 
         start_time = time.time()
@@ -275,16 +275,17 @@ def calculate_sequence_embeddings(
                         model,
                         tokenizer,
                         device,
-                        self.db
+                        self.db,
                     )
                 )
-            
+
             for future in futures:
                 future.result()  # Wait for all threads to complete
 
-
         end_time = time.time()
-        logger.info(f"Total embedding calculation time: {end_time - start_time:.2f} seconds")
+        logger.info(
+            f"Total embedding calculation time: {end_time - start_time:.2f} seconds"
+        )
 
         # Cleanup
         for model, _, _ in models_and_tokenizers:
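
Note for review: `accessions[i::num_gpus]` stripes the workload round-robin across devices. The slicing in isolation, on toy data:

```python
accessions = ["A1", "A2", "A3", "A4", "A5"]
sequences = ["MKT", "MVL", "MLA", "MAV", "MTT"]  # toy sequences
num_gpus = 2

# i::num_gpus takes every num_gpus-th element starting at offset i.
gpu_batches = [
    list(zip(accessions[i::num_gpus], sequences[i::num_gpus]))
    for i in range(num_gpus)
]
# gpu_batches[0] == [("A1", "MKT"), ("A3", "MLA"), ("A5", "MTT")]
# gpu_batches[1] == [("A2", "MVL"), ("A4", "MAV")]
```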
diff --git a/src/pyeed/model.py b/src/pyeed/model.py
index 19d83e8b..c7a193b7 100644
--- a/src/pyeed/model.py
+++ b/src/pyeed/model.py
@@ -112,13 +112,12 @@ def save(self, *args: Any, **kwargs: Any) -> None:
                 elif isinstance(base_property, FloatProperty):
                     if not all(isinstance(item, float) for item in prop):
                         raise TypeError(f"All items in '{field}' must be floats")
-                    
-            #Validate BoleanProperty
+
+            # Validate BooleanProperty
             elif isinstance(neo_type, BooleanProperty) and not isinstance(prop, bool):
                 raise TypeError(
                     f"Expected a boolean for '{field}', got {type(prop).__name__}"
                 )
-                
 
         super().save(*args, **kwargs)  # Don't return the result
 
@@ -153,7 +152,7 @@ class Annotation(Enum):
 class Organism(StrictStructuredNode):
     taxonomy_id = IntegerProperty(required=True, unique_index=True)
     name = StringProperty()
-    
+
     @classmethod
     def get_or_save(cls, taxonomy_id, name) -> "Organism":
         try:
@@ -395,25 +394,25 @@ class Reaction(StrictStructuredNode):
     """
     A node representing a reaction.
     """
-    
+
     rhea_id = StringProperty(unique_index=True, required=True)
     chebi_id = ArrayProperty(StringProperty())
 
     # Relationships
     substrate = RelationshipTo("Molecule", "SUBSTRATE")
     product = RelationshipTo("Molecule", "PRODUCT")
-    
-    
+
     @property
     def label(self) -> str:
         """The label of the reaction."""
         return {self.rhea_id}
 
+
 class Molecule(StrictStructuredNode):
     """
     A node representing a molecule in the database.
     """
-    
+
     chebi_id = StringProperty(unique_index=True, required=True)
     rhea_compound_id = StringProperty()
     smiles = StringProperty()
@@ -431,13 +430,13 @@ def get_or_save(cls, chebi_id, smiles) -> "Molecule":
             except Exception as e:
                 print(f"Error during saving of the molecule: {e}")
                 raise
-    
-    @property 
+
+    @property
     def label(self) -> str:
         """The label of the molecule."""
         return {self.chebi_id}
-    
-    
+
+
 class StandardNumbering(StrictStructuredNode):
     name = StringProperty(required=True, unique_index=True)
     definition = StringProperty(required=True)

From 9b7f28b4b0649a37e654d604ad3a479bcb555e18 Mon Sep 17 00:00:00 2001
From: alacheim 
Date: Wed, 30 Apr 2025 12:51:26 +0000
Subject: [PATCH 26/39] fixing mypy errors

---
 pyproject.toml                              |  1 +
 src/pyeed/adapter/ncbi_to_uniprot_mapper.py |  2 +-
 src/pyeed/model.py                          | 18 +++++++++++-------
 3 files changed, 13 insertions(+), 8 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 2f81fd56..948c493b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -39,6 +39,7 @@ absl-py = "1.0.0"
 crc64iso = "0.0.2"
 SPARQLWrapper = "2.0.0"
 pysam = "0.23.0"
+types-requests = "2.32.0"
 
 [tool.poetry.group.dev.dependencies]
 mkdocstrings = {extras = ["python"], version = "^0.26.2"}
diff --git a/src/pyeed/adapter/ncbi_to_uniprot_mapper.py b/src/pyeed/adapter/ncbi_to_uniprot_mapper.py
index 2f711e16..1373547a 100644
--- a/src/pyeed/adapter/ncbi_to_uniprot_mapper.py
+++ b/src/pyeed/adapter/ncbi_to_uniprot_mapper.py
@@ -62,7 +62,7 @@ def get_checksum(self, refseq_id: str) -> str:
         self.download_fasta(refseq_id)
         fa = FastaFile(f"{refseq_id}.fasta")
         seq = fa.fetch(fa.references[0])
-        return crc64iso.crc64(seq)
+        return f"{crc64iso.crc64(seq)}"
 
     def checksum_list(self, refseq_ids: List[str]) -> List[str]:
         """Creates a list of checksum IDs and deletes the FASTA files after processing.
diff --git a/src/pyeed/model.py b/src/pyeed/model.py
index c7a193b7..aa498aee 100644
--- a/src/pyeed/model.py
+++ b/src/pyeed/model.py
@@ -1,5 +1,5 @@
 from enum import Enum
-from typing import Any
+from typing import Any, cast
 
 # from pyeed.nodes_and_relations import StrictStructuredNode
 from neomodel import (
@@ -154,9 +154,11 @@ class Organism(StrictStructuredNode):
     name = StringProperty()
 
     @classmethod
-    def get_or_save(cls, taxonomy_id, name) -> "Organism":
+    def get_or_save(cls, **kwargs: Any) -> "Organism":
+        taxonomy_id = kwargs.get("taxonomy_id")
+        name = kwargs.get("name")
         try:
-            organism = cls.nodes.get(taxonomy_id=taxonomy_id)
+            organism = cast(Organism, cls.nodes.get(taxonomy_id=taxonomy_id))
             return organism
         except cls.DoesNotExist:
             try:
@@ -405,7 +407,7 @@ class Reaction(StrictStructuredNode):
     @property
     def label(self) -> str:
         """The label of the reaction."""
-        return {self.rhea_id}
+        return f"{self.rhea_id}"
 
 
 class Molecule(StrictStructuredNode):
@@ -418,9 +420,11 @@ class Molecule(StrictStructuredNode):
     smiles = StringProperty()
 
     @classmethod
-    def get_or_save(cls, chebi_id, smiles) -> "Molecule":
+    def get_or_save(cls, **kwargs:Any) -> "Molecule":
+        chebi_id = kwargs.get("chebi_id")
+        smiles = kwargs.get("smiles")
         try:
-            molecule = cls.nodes.get(chebi_id=chebi_id)
+            molecule = cast(Molecule, cls.nodes.get(chebi_id=chebi_id))
             return molecule
         except cls.DoesNotExist:
             try:
@@ -434,7 +438,7 @@ def get_or_save(cls, chebi_id, smiles) -> "Molecule":
     @property
     def label(self) -> str:
         """The label of the molecule."""
-        return {self.chebi_id}
+        return f"{self.chebi_id}"
 
 
 class StandardNumbering(StrictStructuredNode):
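
Note for review: the pattern introduced here, `**kwargs` plus `cast` so mypy accepts neomodel's untyped `nodes.get`, reduced to a minimal sketch (`MoleculeSketch` is a stand-in, not the real model class):

```python
from typing import Any, cast

from neomodel import StringProperty, StructuredNode

class MoleculeSketch(StructuredNode):
    chebi_id = StringProperty(unique_index=True, required=True)
    smiles = StringProperty()

    @classmethod
    def get_or_save(cls, **kwargs: Any) -> "MoleculeSketch":
        try:
            # nodes.get returns an untyped node; cast narrows it for mypy.
            return cast("MoleculeSketch", cls.nodes.get(chebi_id=kwargs.get("chebi_id")))
        except cls.DoesNotExist:
            node = cls(**kwargs)
            node.save()  # requires a configured neomodel connection
            return node
```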

From cf742d77f52e9533e6b0b5813a70ac29d0756ef8 Mon Sep 17 00:00:00 2001
From: alacheim 
Date: Wed, 30 Apr 2025 14:02:52 +0000
Subject: [PATCH 27/39] fixing mypy errors

---
 src/pyeed/adapter/uniprot_mapper.py | 29 ++++++++++++++++++++---------
 src/pyeed/embedding.py              | 28 ++++++++++++++++++++--------
 src/pyeed/main.py                   | 11 +++++++----
 3 files changed, 47 insertions(+), 21 deletions(-)

diff --git a/src/pyeed/adapter/uniprot_mapper.py b/src/pyeed/adapter/uniprot_mapper.py
index 36ed3577..7f26893e 100644
--- a/src/pyeed/adapter/uniprot_mapper.py
+++ b/src/pyeed/adapter/uniprot_mapper.py
@@ -1,9 +1,9 @@
 import json
 from collections import defaultdict
-from typing import Any, List
+from typing import Any, List, Optional
 
 import requests
-from bs4 import BeautifulSoup
+from bs4 import BeautifulSoup, Tag
 from httpx import Response
 from loguru import logger
 from SPARQLWrapper import JSON, SPARQLWrapper
@@ -120,7 +120,11 @@ def get_substrates_and_products_from_rhea(
         sparql.setReturnFormat(JSON)
         sparql.addCustomHttpHeader("User-Agent", "MyPythonClient/1.0")
 
-        results = sparql.query().convert()
+        results_raw = sparql.query().convert()
+        if not isinstance(results_raw, dict):
+            raise TypeError("Expected dict from SPARQL query")
+
+        results: dict[str, Any] = results_raw
 
         substrates = set()
         products = set()
@@ -138,7 +142,7 @@ def get_substrates_and_products_from_rhea(
 
         return {"substrates": sorted(substrates), "products": sorted(products)}
 
-    def get_smiles_from_chebi_web(self, chebi_url: str) -> str:
+    def get_smiles_from_chebi_web(self, chebi_url: str) -> Optional[str]:
         """
         Extract SMILES from the official ChEBI page using HTML scraping.
         """
@@ -150,11 +154,17 @@ def get_smiles_from_chebi_web(self, chebi_url: str) -> str:
 
         # Look for table rows that contain the SMILES label
         for table in soup.find_all("table", class_="chebiTableContent"):
+            if not isinstance(table, Tag):
+                continue
             for row in table.find_all("tr"):
+                if not isinstance(row, Tag):
+                    continue
                 headers = row.find_all("td", class_="chebiDataHeader")
-                if headers and "SMILES" in headers[0].text:
-                    data_cell = row.find_all("td")[-1]  # Get the last <td> in row
-                    return data_cell.text.strip()
+                if headers and isinstance(headers[0], Tag) and "SMILES" in headers[0].text:
+                    data_cells = row.find_all("td")
+                    if data_cells:
+                        return f"{data_cells[-1].text.strip()}"
+        return None
 
     def add_reaction(self, record: dict[str, Any], protein: Protein) -> None:
         for reference in record.get("comments", []):  # Safe retrieval with .get()
@@ -169,8 +179,9 @@ def add_reaction(self, record: dict[str, Any], protein: Protein) -> None:
                 catalytic_annotation = Reaction.get_or_save(
                     rhea_id=rhea_id,
                 )
-                self.add_molecule(rhea_id, catalytic_annotation)
-                protein.reaction.connect(catalytic_annotation)
+                if rhea_id is not None:
+                    self.add_molecule(rhea_id, catalytic_annotation)
+                    protein.reaction.connect(catalytic_annotation)
 
     def add_molecule(self, rhea_id: str, reaction: Reaction) -> None:
         chebi = self.get_substrates_and_products_from_rhea(rhea_id)
diff --git a/src/pyeed/embedding.py b/src/pyeed/embedding.py
index a0229385..05895dfc 100644
--- a/src/pyeed/embedding.py
+++ b/src/pyeed/embedding.py
@@ -11,6 +11,7 @@
 from loguru import logger
 from numpy.typing import NDArray
 from transformers import EsmModel, EsmTokenizer
+from torch.nn import DataParallel, Module
 
 from pyeed.dbconnect import DatabaseConnector
 
@@ -32,7 +33,14 @@ def get_hf_token() -> str:
         raise RuntimeError("Failed to get Hugging Face token")
 
 
-def process_batches_on_gpu(data, batch_size, model, tokenizer, device, db):
+def process_batches_on_gpu(
+    data: list[tuple[str, str]], 
+    batch_size: int, 
+    model:Module, 
+    tokenizer: EsmTokenizer, 
+    db:DatabaseConnector,
+    device:torch.device,
+    ) -> None:
     """
     Splits data into batches and processes them on a single GPU.
 
@@ -88,8 +96,8 @@ def process_batches_on_gpu(data, batch_size, model, tokenizer, device, db):
 
 def load_model_and_tokenizer(
     model_name: str,
-    device: str,
-) -> Tuple[Any, Union[Any, None], str]:
+    device:torch.device,
+) -> Tuple[Any, Union[Any, None], torch.device]:
     """
     Loads the model and assigns it to a specific GPU.
 
@@ -125,7 +133,7 @@ def get_batch_embeddings(
     model: Union[
         EsmModel,
         ESMC,
-        torch.nn.DataParallel,
+        DataParallel[Module],
         ESM3InferenceClient,
         ESM3,
     ],
@@ -209,7 +217,9 @@ def get_batch_embeddings(
 
 
 def calculate_single_sequence_embedding_last_hidden_state(
-    sequence: str, model_name: str = "facebook/esm2_t33_650M_UR50D"
+    sequence: str, 
+    device: torch.device,
+    model_name: str = "facebook/esm2_t33_650M_UR50D",
 ) -> NDArray[np.float64]:
     """
     Calculates an embedding for a single sequence.
@@ -221,12 +231,14 @@ def calculate_single_sequence_embedding_last_hidden_state(
     Returns:
         NDArray[np.float64]: Normalized embedding vector for the sequence
     """
-    model, tokenizer, device = load_model_and_tokenizer(model_name)
+    model, tokenizer, device = load_model_and_tokenizer(model_name, device)
     return get_single_embedding_last_hidden_state(sequence, model, tokenizer, device)
 
 
 def calculate_single_sequence_embedding_all_layers(
-    sequence: str, model_name: str = "facebook/esm2_t33_650M_UR50D"
+    sequence: str, 
+    device: torch.device,
+    model_name: str = "facebook/esm2_t33_650M_UR50D",
 ) -> NDArray[np.float64]:
     """
     Calculates embeddings for a single sequence across all layers.
@@ -238,7 +250,7 @@ def calculate_single_sequence_embedding_all_layers(
     Returns:
         NDArray[np.float64]: A numpy array containing layer embeddings for the sequence.
     """
-    model, tokenizer, device = load_model_and_tokenizer(model_name)
+    model, tokenizer, device = load_model_and_tokenizer(model_name, device)
     return get_single_embedding_all_layers(sequence, model, tokenizer, device)
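
Note for review: after this change the single-sequence helpers take an explicit device. A hedged call-site sketch (the sequence is illustrative):

```python
import torch

from pyeed.embedding import calculate_single_sequence_embedding_last_hidden_state

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
embedding = calculate_single_sequence_embedding_last_hidden_state(
    "MSEQVAAVAKLRAKASEAAK",  # illustrative protein sequence
    device,
)
print(embedding.shape)
```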
 
 
diff --git a/src/pyeed/main.py b/src/pyeed/main.py
index 5ba41d0d..4dee81e0 100644
--- a/src/pyeed/main.py
+++ b/src/pyeed/main.py
@@ -208,7 +208,7 @@ def calculate_sequence_embeddings(
         self,
         batch_size: int = 16,
         model_name: str = "facebook/esm2_t33_650M_UR50D",
-        num_gpus: int = None,  # Number of GPUs to use
+        num_gpus: int = 1,  # Number of GPUs to use
     ) -> None:
         """
         Calculates embeddings for all sequences in the database that do not have embeddings,
@@ -229,9 +229,12 @@ def calculate_sequence_embeddings(
             logger.warning("No GPU available! Running on CPU.")
 
         # Load separate models for each GPU
-        devices = [f"cuda:{i}" for i in range(num_gpus)] if num_gpus > 0 else ["cpu"]
+        devices = [
+        torch.device(f"cuda:{i}") for i in range(num_gpus)
+        ] if num_gpus > 0 else [torch.device("cpu")]
+
         models_and_tokenizers = [
-            load_model_and_tokenizer(model_name, device) for device in devices
+        load_model_and_tokenizer(model_name, device) for device in devices
         ]
 
         # Retrieve sequences without embeddings
@@ -274,8 +277,8 @@ def calculate_sequence_embeddings(
                         batch_size,
                         model,
                         tokenizer,
-                        device,
                         self.db,
+                        device,
                     )
                 )
 

From f073cd6e03f61b7e3a4a65e946d8ae0f2ff23bdd Mon Sep 17 00:00:00 2001
From: alacheim 
Date: Wed, 30 Apr 2025 14:11:28 +0000
Subject: [PATCH 28/39] fixing ruff error

---
 src/pyeed/embedding.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/pyeed/embedding.py b/src/pyeed/embedding.py
index 05895dfc..5686ca36 100644
--- a/src/pyeed/embedding.py
+++ b/src/pyeed/embedding.py
@@ -10,8 +10,8 @@
 from huggingface_hub import HfFolder, login
 from loguru import logger
 from numpy.typing import NDArray
-from transformers import EsmModel, EsmTokenizer
 from torch.nn import DataParallel, Module
+from transformers import EsmModel, EsmTokenizer
 
 from pyeed.dbconnect import DatabaseConnector
 

From 3ec0368351652eba4fbc18ddc818460c2e4381e4 Mon Sep 17 00:00:00 2001
From: alacheim 
Date: Wed, 30 Apr 2025 14:13:18 +0000
Subject: [PATCH 29/39] fixing ruff error

---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 948c493b..b9897071 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -39,7 +39,7 @@ absl-py = "1.0.0"
 crc64iso = "0.0.2"
 SPARQLWrapper = "2.0.0"
 pysam = "0.23.0"
-types-requests = "2.32.0"
+types-requests = "2.32.0.20250328"
 
 [tool.poetry.group.dev.dependencies]
 mkdocstrings = {extras = ["python"], version = "^0.26.2"}

From 37afe5c31421a124bcbb5a15fc3cb1bc3b3e321c Mon Sep 17 00:00:00 2001
From: alacheim 
Date: Wed, 30 Apr 2025 14:17:37 +0000
Subject: [PATCH 30/39] trigger pipeline

---
 src/pyeed/adapter/ncbi_to_uniprot_mapper.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/pyeed/adapter/ncbi_to_uniprot_mapper.py b/src/pyeed/adapter/ncbi_to_uniprot_mapper.py
index 1373547a..78b493a5 100644
--- a/src/pyeed/adapter/ncbi_to_uniprot_mapper.py
+++ b/src/pyeed/adapter/ncbi_to_uniprot_mapper.py
@@ -10,7 +10,6 @@
 
 logger = logging.getLogger(__name__)
 
-
 class NCBIToUniprotMapper:
     def __init__(self, ids: List[str], file: str):
         self.ids = ids

From d84c74e7afabf2ef518220b84ade1b2dfca090cc Mon Sep 17 00:00:00 2001
From: alacheim 
Date: Wed, 30 Apr 2025 14:23:10 +0000
Subject: [PATCH 31/39] formatted with ruff

---
 src/pyeed/adapter/ncbi_to_uniprot_mapper.py |  1 +
 src/pyeed/adapter/uniprot_mapper.py         |  6 +++++-
 src/pyeed/embedding.py                      | 20 ++++++++++----------
 src/pyeed/main.py                           | 10 ++++++----
 src/pyeed/model.py                          |  2 +-
 5 files changed, 23 insertions(+), 16 deletions(-)

diff --git a/src/pyeed/adapter/ncbi_to_uniprot_mapper.py b/src/pyeed/adapter/ncbi_to_uniprot_mapper.py
index 78b493a5..1373547a 100644
--- a/src/pyeed/adapter/ncbi_to_uniprot_mapper.py
+++ b/src/pyeed/adapter/ncbi_to_uniprot_mapper.py
@@ -10,6 +10,7 @@
 
 logger = logging.getLogger(__name__)
 
+
 class NCBIToUniprotMapper:
     def __init__(self, ids: List[str], file: str):
         self.ids = ids
diff --git a/src/pyeed/adapter/uniprot_mapper.py b/src/pyeed/adapter/uniprot_mapper.py
index 7f26893e..f52d01b3 100644
--- a/src/pyeed/adapter/uniprot_mapper.py
+++ b/src/pyeed/adapter/uniprot_mapper.py
@@ -160,7 +160,11 @@ def get_smiles_from_chebi_web(self, chebi_url: str) -> Optional[str]:
                 if not isinstance(row, Tag):
                     continue
                 headers = row.find_all("td", class_="chebiDataHeader")
-                if headers and isinstance(headers[0], Tag) and "SMILES" in headers[0].text:
+                if (
+                    headers
+                    and isinstance(headers[0], Tag)
+                    and "SMILES" in headers[0].text
+                ):
                     data_cells = row.find_all("td")
                     if data_cells:
                         return f"{data_cells[-1].text.strip()}"
diff --git a/src/pyeed/embedding.py b/src/pyeed/embedding.py
index 5686ca36..28f66a1b 100644
--- a/src/pyeed/embedding.py
+++ b/src/pyeed/embedding.py
@@ -34,13 +34,13 @@ def get_hf_token() -> str:
 
 
 def process_batches_on_gpu(
-    data: list[tuple[str, str]], 
-    batch_size: int, 
-    model:Module, 
-    tokenizer: EsmTokenizer, 
-    db:DatabaseConnector,
-    device:torch.device,
-    ) -> None:
+    data: list[tuple[str, str]],
+    batch_size: int,
+    model: Module,
+    tokenizer: EsmTokenizer,
+    db: DatabaseConnector,
+    device: torch.device,
+) -> None:
     """
     Splits data into batches and processes them on a single GPU.
 
@@ -96,7 +96,7 @@ def process_batches_on_gpu(
 
 def load_model_and_tokenizer(
     model_name: str,
-    device:torch.device,
+    device: torch.device,
 ) -> Tuple[Any, Union[Any, None], torch.device]:
     """
     Loads the model and assigns it to a specific GPU.
@@ -217,7 +217,7 @@ def get_batch_embeddings(
 
 
 def calculate_single_sequence_embedding_last_hidden_state(
-    sequence: str, 
+    sequence: str,
     device: torch.device,
     model_name: str = "facebook/esm2_t33_650M_UR50D",
 ) -> NDArray[np.float64]:
@@ -236,7 +236,7 @@ def calculate_single_sequence_embedding_last_hidden_state(
 
 
 def calculate_single_sequence_embedding_all_layers(
-    sequence: str, 
+    sequence: str,
     device: torch.device,
     model_name: str = "facebook/esm2_t33_650M_UR50D",
 ) -> NDArray[np.float64]:
diff --git a/src/pyeed/main.py b/src/pyeed/main.py
index 4dee81e0..1189fcb3 100644
--- a/src/pyeed/main.py
+++ b/src/pyeed/main.py
@@ -229,12 +229,14 @@ def calculate_sequence_embeddings(
             logger.warning("No GPU available! Running on CPU.")
 
         # Load separate models for each GPU
-        devices = [
-        torch.device(f"cuda:{i}") for i in range(num_gpus)
-        ] if num_gpus > 0 else [torch.device("cpu")]
+        devices = (
+            [torch.device(f"cuda:{i}") for i in range(num_gpus)]
+            if num_gpus > 0
+            else [torch.device("cpu")]
+        )
 
         models_and_tokenizers = [
-        load_model_and_tokenizer(model_name, device) for device in devices
+            load_model_and_tokenizer(model_name, device) for device in devices
         ]
 
         # Retrieve sequences without embeddings
diff --git a/src/pyeed/model.py b/src/pyeed/model.py
index aa498aee..5a3bf188 100644
--- a/src/pyeed/model.py
+++ b/src/pyeed/model.py
@@ -420,7 +420,7 @@ class Molecule(StrictStructuredNode):
     smiles = StringProperty()
 
     @classmethod
-    def get_or_save(cls, **kwargs:Any) -> "Molecule":
+    def get_or_save(cls, **kwargs: Any) -> "Molecule":
         chebi_id = kwargs.get("chebi_id")
         smiles = kwargs.get("smiles")
         try:

From 57c09b41856ec179da911a8db503a1939382907d Mon Sep 17 00:00:00 2001
From: Niklas Abraham GPU 
Date: Thu, 1 May 2025 13:42:51 +0000
Subject: [PATCH 32/39] fixed linter issue in sequence alignment

---
 src/pyeed/analysis/sequence_alignment.py | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/src/pyeed/analysis/sequence_alignment.py b/src/pyeed/analysis/sequence_alignment.py
index 946200b2..cb6acff4 100644
--- a/src/pyeed/analysis/sequence_alignment.py
+++ b/src/pyeed/analysis/sequence_alignment.py
@@ -142,12 +142,19 @@ def align_multipairwise(
         MATCH (p1:Protein)-[:PAIRWISE_ALIGNED]->(p2:Protein)
         RETURN p1.accession_id AS Protein1_ID, p2.accession_id AS Protein2_ID
         """
-        
+
         # Fetch results properly as a list of tuples
-        existing_pairs = set(tuple(sorted((row["Protein1_ID"], row["Protein2_ID"]))) for row in db.execute_write(query))
+        existing_pairs = set()
+        if db is not None:
+            existing_pairs = set(
+                tuple(sorted((row["Protein1_ID"], row["Protein2_ID"])))
+                for row in db.execute_write(query)
+            )
 
         # Filter new pairs that are not in existing_pairs
-        new_pairs = [pair for pair in pairs if tuple(sorted(pair)) not in existing_pairs]
+        new_pairs = [
+            pair for pair in pairs if tuple(sorted(pair)) not in existing_pairs
+        ]
 
         print(f"Number of existing pairs: {len(existing_pairs)}")
         print(f"Number of total pairs: {len(pairs)}")
@@ -352,4 +359,4 @@ def _get_id_sequence_dict(
     def _load_substitution_matrix(self) -> "BioSubstitutionMatrix":
         from Bio.Align import substitution_matrices
 
-        return substitution_matrices.load(self.substitution_matrix)  # type: ignore
\ No newline at end of file
+        return substitution_matrices.load(self.substitution_matrix)  # type: ignore
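
Note for review: both this fix and the follow-up rely on canonicalising each pair as a sorted tuple, so (A, B) and (B, A) deduplicate to a single key. The trick in isolation:

```python
pairs = [("P1", "P2"), ("P2", "P1"), ("P1", "P3")]
existing_pairs = {("P1", "P2")}

# Sorting each pair makes membership checks orientation-independent.
new_pairs = [p for p in pairs if tuple(sorted(p)) not in existing_pairs]
print(new_pairs)  # [('P1', 'P3')]
```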

From 014dcdc220e309205bfb7cd9982f20f5750bd06d Mon Sep 17 00:00:00 2001
From: Niklas Abraham GPU 
Date: Thu, 1 May 2025 13:48:08 +0000
Subject: [PATCH 33/39] fixed linter issue in sequence alignment

---
 src/pyeed/analysis/sequence_alignment.py | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/src/pyeed/analysis/sequence_alignment.py b/src/pyeed/analysis/sequence_alignment.py
index d57c5e63..9b68818e 100644
--- a/src/pyeed/analysis/sequence_alignment.py
+++ b/src/pyeed/analysis/sequence_alignment.py
@@ -5,9 +5,10 @@
 from Bio.Align import PairwiseAligner as BioPairwiseAligner
 from Bio.Align.substitution_matrices import Array as BioSubstitutionMatrix
 from joblib import Parallel, cpu_count, delayed
+from rich.progress import Progress
+
 from pyeed.dbconnect import DatabaseConnector
 from pyeed.tools.utility import chunks
-from rich.progress import Progress
 
 
 class PairwiseAligner:
@@ -142,11 +143,13 @@ def align_multipairwise(
         RETURN p1.accession_id AS Protein1_ID, p2.accession_id AS Protein2_ID
         """
 
-        # Fetch results properly as a list of tuples
-        existing_pairs = set(
-            tuple(sorted((row["Protein1_ID"], row["Protein2_ID"])))
-            for row in db.execute_write(query)
-        )
+        if db is not None:
+            existing_pairs = set(
+                tuple(sorted((row["Protein1_ID"], row["Protein2_ID"])))
+                for row in db.execute_write(query)
+            )
+        else:
+            existing_pairs = set()
 
         # Filter new pairs that are not in existing_pairs
         new_pairs = [

From ad840ca33f90718c174be150656858a9079aead8 Mon Sep 17 00:00:00 2001
From: Niklas Abraham GPU 
Date: Thu, 1 May 2025 14:01:51 +0000
Subject: [PATCH 34/39] fixed ruff errors imports

---
 src/pyeed/analysis/mutation_detection.py |  1 +
 src/pyeed/embedding.py                   | 20 +++++++++++++-------
 2 files changed, 14 insertions(+), 7 deletions(-)

diff --git a/src/pyeed/analysis/mutation_detection.py b/src/pyeed/analysis/mutation_detection.py
index c2562ae1..5c6809e8 100644
--- a/src/pyeed/analysis/mutation_detection.py
+++ b/src/pyeed/analysis/mutation_detection.py
@@ -1,6 +1,7 @@
 from typing import Any, Optional
 
 from loguru import logger
+
 from pyeed.dbconnect import DatabaseConnector
 
 
diff --git a/src/pyeed/embedding.py b/src/pyeed/embedding.py
index c8fa91db..a0229385 100644
--- a/src/pyeed/embedding.py
+++ b/src/pyeed/embedding.py
@@ -1,21 +1,20 @@
 import gc
 import os
 from typing import Any, Tuple, Union
-from loguru import logger
 
 import numpy as np
 import torch
-from esm.models.esmc import ESMC
 from esm.models.esm3 import ESM3
+from esm.models.esmc import ESMC
 from esm.sdk.api import ESM3InferenceClient, ESMProtein, LogitsConfig, SamplingConfig
 from huggingface_hub import HfFolder, login
+from loguru import logger
 from numpy.typing import NDArray
 from transformers import EsmModel, EsmTokenizer
 
 from pyeed.dbconnect import DatabaseConnector
 
 
-
 def get_hf_token() -> str:
     """Get or request Hugging Face token."""
     if os.getenv("PYTEST_DISABLE_HF_LOGIN"):  # Disable Hugging Face login in tests
@@ -32,6 +31,7 @@ def get_hf_token() -> str:
     else:
         raise RuntimeError("Failed to get Hugging Face token")
 
+
 def process_batches_on_gpu(data, batch_size, model, tokenizer, device, db):
     """
     Splits data into batches and processes them on a single GPU.
@@ -65,15 +65,21 @@ def process_batches_on_gpu(data, batch_size, model, tokenizer, device, db):
                 )
 
                 # Update the database
-                update_protein_embeddings_in_db(db, list(accessions[:current_batch_size]), embeddings_batch)
+                update_protein_embeddings_in_db(
+                    db, list(accessions[:current_batch_size]), embeddings_batch
+                )
 
                 # Move to the next batch
                 break  # Successful execution, move to the next batch
 
             except torch.cuda.OutOfMemoryError:
                 torch.cuda.empty_cache()
-                current_batch_size = max(1, current_batch_size // 2)  # Reduce batch size
-                logger.warning(f"Reduced batch size to {current_batch_size} due to OOM error.")
+                current_batch_size = max(
+                    1, current_batch_size // 2
+                )  # Reduce batch size
+                logger.warning(
+                    f"Reduced batch size to {current_batch_size} due to OOM error."
+                )
 
     # Free memory
     del model
@@ -83,7 +89,7 @@ def process_batches_on_gpu(data, batch_size, model, tokenizer, device, db):
 def load_model_and_tokenizer(
     model_name: str,
     device: str,
-    ) -> Tuple[Any, Union[Any, None], str]:
+) -> Tuple[Any, Union[Any, None], str]:
     """
     Loads the model and assigns it to a specific GPU.
 

From 70710f0bd50f03a9ba11f654ccd1b9f493e5da05 Mon Sep 17 00:00:00 2001
From: Niklas Abraham GPU 
Date: Thu, 1 May 2025 14:15:44 +0000
Subject: [PATCH 35/39] fixed files for ruff stuff

---
 docs/usage/blast.ipynb                   | 723 ++++++++++++-----------
 docs/usage/clustalo.ipynb                | 338 +++++------
 docs/usage/embeddings_analysis.ipynb     |   5 +-
 docs/usage/mmseqs.ipynb                  |   3 +-
 docs/usage/mutation_analysis.ipynb       |   1 +
 docs/usage/network_analysis.ipynb        |   1 +
 docs/usage/standard_numbering.ipynb      |   2 +-
 src/pyeed/analysis/embedding_analysis.py |   3 +-
 src/pyeed/analysis/network_analysis.py   |   1 +
 src/pyeed/analysis/ontology_loading.py   |   3 +-
 src/pyeed/analysis/standard_numbering.py |   1 +
 tests/unit/test_dbchat.py                |   1 +
 12 files changed, 545 insertions(+), 537 deletions(-)

diff --git a/docs/usage/blast.ipynb b/docs/usage/blast.ipynb
index b56140d7..d6cd57ef 100644
--- a/docs/usage/blast.ipynb
+++ b/docs/usage/blast.ipynb
@@ -1,363 +1,364 @@
 {
-    "cells": [
-        {
-            "cell_type": "markdown",
-            "metadata": {},
-            "source": [
-                "# BLAST Search\n",
-                "\n",
-                "## Setup\n",
-                "\n",
-                "The BLAST service runs in a Docker container and requires:\n",
-                "1. A local BLAST database\n",
-                "2. The Docker service running"
-            ]
-        },
-        {
-            "cell_type": "code",
-            "execution_count": 1,
-            "metadata": {},
-            "outputs": [],
-            "source": [
-                "# change log level to INFO\n",
-                "import sys\n",
-                "from loguru import logger\n",
-                "\n",
-                "logger.remove()\n",
-                "level = logger.add(sys.stderr, level=\"WARNING\")"
-            ]
-        },
-        {
-            "cell_type": "markdown",
-            "metadata": {},
-            "source": [
-                "## Basic Usage\n",
-                "\n",
-                "The `Blast` class provides an interface to search protein or nucleotide sequences against a local BLAST database."
-            ]
-        },
-        {
-            "cell_type": "code",
-            "execution_count": 2,
-            "metadata": {},
-            "outputs": [
-                {
-                    "data": {
-                        "text/html": [
-                            "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
subject_ididentityalignment_lengthmismatchesgap_opensquery_startquery_endsubject_startsubject_endevaluebit_score
0seq781.8182231315111320.00322.3
1seq1100.00025001251250.00422.3
2seq261.5382610020455300.03819.2
\n", - "
" - ], - "text/plain": [ - " subject_id identity alignment_length mismatches gap_opens query_start \\\n", - "0 seq7 81.818 22 3 1 31 \n", - "1 seq1 100.000 25 0 0 1 \n", - "2 seq2 61.538 26 10 0 20 \n", - "\n", - " query_end subject_start subject_end evalue bit_score \n", - "0 51 11 32 0.003 22.3 \n", - "1 25 1 25 0.004 22.3 \n", - "2 45 5 30 0.038 19.2 " - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from pyeed.tools import Blast\n", - "\n", - "# Example protein sequence\n", - "sequence = \"MSEQVAAVAKLRAKASEAAKEAKAREAAKKLAEAAKKAKAKEAAKRAEAKLAEKAKAAKRAEAKAAKEAKRAAAKRAEAKLAEKAKAAK\"\n", - "\n", - "# Initialize BLAST search\n", - "blast = Blast(\n", - " # service_url=\"http://localhost:6001/blast\",\n", - " mode=\"blastp\", # Use blastp for protein sequences\n", - " db_path=\"/usr/local/bin/data/test_db\", # Path in Docker container\n", - " db_name=\"protein_db\", # Name of your BLAST database\n", - " evalue=0.1, # E-value threshold\n", - " max_target_seqs=10, # Maximum number of hits to return\n", - ")\n", - "\n", - "# Perform search\n", - "results = blast.search(sequence)\n", - "results" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The results are returned as a pandas DataFrame with the following columns:\n", - "- subject_id: ID of the matched sequence\n", - "- identity: Percentage identity\n", - "- alignment_length: Length of the alignment\n", - "- mismatches: Number of mismatches\n", - "- gap_opens: Number of gap openings\n", - "- query_start/end: Start/end positions in query sequence\n", - "- subject_start/end: Start/end positions in subject sequence\n", - "- evalue: Expectation value\n", - "- bit_score: Bit score" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Creating a BLAST Database\n", - "\n", - "Before using BLAST, you need to create a local database. Here's how to create one from a FASTA file:" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "```bash\n", - "# For protein sequences\n", - "makeblastdb -in proteins.fasta -dbtype prot -out blast_db/my_proteins\n", - "\n", - "# For nucleotide sequences\n", - "makeblastdb -in nucleotides.fasta -dbtype nucl -out blast_db/my_nucleotides\n", - "```\n", - "\n", - "To access the BLAST Docker container shell and create databases:\n", - "\n", - "```bash\n", - "# Enter the BLAST container shell\n", - "docker compose exec blast bash\n", - "# \n", - "# Navigate to database directory\n", - "cd /usr/local/bin/data/blast_db\n", - "# \n", - "# Create protein database\n", - "makeblastdb -in proteins.fasta -dbtype prot -out my_proteins\n", - "# \n", - "# Create nucleotide database \n", - "makeblastdb -in nucleotides.fasta -dbtype nucl -out my_nucleotides\n", - "```\n", - "Make sure your FASTA files are mounted in the container's `/usr/local/bin/data/blast_db` directory.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Advanced Usage\n", - "\n", - "You can customize the BLAST search parameters:" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
subject_ididentityalignment_lengthmismatchesgap_opensquery_startquery_endsubject_startsubject_endevaluebit_score
0seq781.8182231315111320.00322.3
1seq1100.00025001251250.00422.3
\n", - "
" - ], - "text/plain": [ - " subject_id identity alignment_length mismatches gap_opens query_start \\\n", - "0 seq7 81.818 22 3 1 31 \n", - "1 seq1 100.000 25 0 0 1 \n", - "\n", - " query_end subject_start subject_end evalue bit_score \n", - "0 51 11 32 0.003 22.3 \n", - "1 25 1 25 0.004 22.3 " - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Configure BLAST for sensitive protein search\n", - "blast = Blast(\n", - " # service_url=\"http://localhost:6001/blast\",\n", - " mode=\"blastp\",\n", - " db_path=\"/usr/local/bin/data/test_db\",\n", - " db_name=\"protein_db\",\n", - " evalue=1e-1, # More stringent E-value\n", - " max_target_seqs=100, # Return more hits\n", - " num_threads=4, # Use 4 CPU threads\n", - ")\n", - "\n", - "# Search with longer timeout\n", - "results = blast.search(sequence, timeout=7200) # 2 hour timeout\n", - "\n", - "# Filter results\n", - "significant_hits = results[results[\"identity\"] > 80] # Only hits with >90% identity\n", - "significant_hits" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Thereafter, the ids of the hits can be added to the pyeed database, using the `fetch_from_primary_db` function." - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "pyeed", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.5" - } - }, - "nbformat": 4, - "nbformat_minor": 2 + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# BLAST Search\n", + "\n", + "## Setup\n", + "\n", + "The BLAST service runs in a Docker container and requires:\n", + "1. A local BLAST database\n", + "2. The Docker service running" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "# change log level to INFO\n", + "import sys\n", + "\n", + "from loguru import logger\n", + "\n", + "logger.remove()\n", + "level = logger.add(sys.stderr, level=\"WARNING\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Basic Usage\n", + "\n", + "The `Blast` class provides an interface to search protein or nucleotide sequences against a local BLAST database." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
subject_ididentityalignment_lengthmismatchesgap_opensquery_startquery_endsubject_startsubject_endevaluebit_score
0seq781.8182231315111320.00322.3
1seq1100.00025001251250.00422.3
2seq261.5382610020455300.03819.2
\n", + "
" + ], + "text/plain": [ + " subject_id identity alignment_length mismatches gap_opens query_start \\\n", + "0 seq7 81.818 22 3 1 31 \n", + "1 seq1 100.000 25 0 0 1 \n", + "2 seq2 61.538 26 10 0 20 \n", + "\n", + " query_end subject_start subject_end evalue bit_score \n", + "0 51 11 32 0.003 22.3 \n", + "1 25 1 25 0.004 22.3 \n", + "2 45 5 30 0.038 19.2 " + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from pyeed.tools import Blast\n", + "\n", + "# Example protein sequence\n", + "sequence = \"MSEQVAAVAKLRAKASEAAKEAKAREAAKKLAEAAKKAKAKEAAKRAEAKLAEKAKAAKRAEAKAAKEAKRAAAKRAEAKLAEKAKAAK\"\n", + "\n", + "# Initialize BLAST search\n", + "blast = Blast(\n", + " # service_url=\"http://localhost:6001/blast\",\n", + " mode=\"blastp\", # Use blastp for protein sequences\n", + " db_path=\"/usr/local/bin/data/test_db\", # Path in Docker container\n", + " db_name=\"protein_db\", # Name of your BLAST database\n", + " evalue=0.1, # E-value threshold\n", + " max_target_seqs=10, # Maximum number of hits to return\n", + ")\n", + "\n", + "# Perform search\n", + "results = blast.search(sequence)\n", + "results" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The results are returned as a pandas DataFrame with the following columns:\n", + "- subject_id: ID of the matched sequence\n", + "- identity: Percentage identity\n", + "- alignment_length: Length of the alignment\n", + "- mismatches: Number of mismatches\n", + "- gap_opens: Number of gap openings\n", + "- query_start/end: Start/end positions in query sequence\n", + "- subject_start/end: Start/end positions in subject sequence\n", + "- evalue: Expectation value\n", + "- bit_score: Bit score" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Creating a BLAST Database\n", + "\n", + "Before using BLAST, you need to create a local database. Here's how to create one from a FASTA file:" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "```bash\n", + "# For protein sequences\n", + "makeblastdb -in proteins.fasta -dbtype prot -out blast_db/my_proteins\n", + "\n", + "# For nucleotide sequences\n", + "makeblastdb -in nucleotides.fasta -dbtype nucl -out blast_db/my_nucleotides\n", + "```\n", + "\n", + "To access the BLAST Docker container shell and create databases:\n", + "\n", + "```bash\n", + "# Enter the BLAST container shell\n", + "docker compose exec blast bash\n", + "# \n", + "# Navigate to database directory\n", + "cd /usr/local/bin/data/blast_db\n", + "# \n", + "# Create protein database\n", + "makeblastdb -in proteins.fasta -dbtype prot -out my_proteins\n", + "# \n", + "# Create nucleotide database \n", + "makeblastdb -in nucleotides.fasta -dbtype nucl -out my_nucleotides\n", + "```\n", + "Make sure your FASTA files are mounted in the container's `/usr/local/bin/data/blast_db` directory.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Advanced Usage\n", + "\n", + "You can customize the BLAST search parameters:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
subject_ididentityalignment_lengthmismatchesgap_opensquery_startquery_endsubject_startsubject_endevaluebit_score
0seq781.8182231315111320.00322.3
1seq1100.00025001251250.00422.3
\n", + "
" + ], + "text/plain": [ + " subject_id identity alignment_length mismatches gap_opens query_start \\\n", + "0 seq7 81.818 22 3 1 31 \n", + "1 seq1 100.000 25 0 0 1 \n", + "\n", + " query_end subject_start subject_end evalue bit_score \n", + "0 51 11 32 0.003 22.3 \n", + "1 25 1 25 0.004 22.3 " + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Configure BLAST for sensitive protein search\n", + "blast = Blast(\n", + " # service_url=\"http://localhost:6001/blast\",\n", + " mode=\"blastp\",\n", + " db_path=\"/usr/local/bin/data/test_db\",\n", + " db_name=\"protein_db\",\n", + " evalue=1e-1, # More stringent E-value\n", + " max_target_seqs=100, # Return more hits\n", + " num_threads=4, # Use 4 CPU threads\n", + ")\n", + "\n", + "# Search with longer timeout\n", + "results = blast.search(sequence, timeout=7200) # 2 hour timeout\n", + "\n", + "# Filter results\n", + "significant_hits = results[results[\"identity\"] > 80] # Only hits with >90% identity\n", + "significant_hits" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Thereafter, the ids of the hits can be added to the pyeed database, using the `fetch_from_primary_db` function." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "pyeed", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 } diff --git a/docs/usage/clustalo.ipynb b/docs/usage/clustalo.ipynb index 64ed62ee..d3ba2fba 100644 --- a/docs/usage/clustalo.ipynb +++ b/docs/usage/clustalo.ipynb @@ -1,171 +1,171 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Multiple Sequence Alignment with Clustal Omega\n", - "\n", - "PyEED provides a convenient interface to Clustal Omega for multiple sequence alignment. This notebook demonstrates how to:\n", - "1. Align sequences from a dictionary\n", - "2. 
Align sequences directly from the database" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [], - "source": [ - "from pyeed import Pyeed\n", - "from pyeed.tools.clustalo import ClustalOmega\n", - "\n", - "# change log level to INFO\n", - "import sys\n", - "from loguru import logger\n", - "\n", - "logger.remove()\n", - "level = logger.add(sys.stderr, level=\"INFO\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Direct Sequence Alignment\n", - "\n", - "You can align sequences directly by providing a dictionary of sequences:" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Aligned sequences:\n", - "seq1 AKFVMPDRAWHLYTGNECSKQRLYVWFHDGAPILKTQSDNMGAYRCPLFHVTKNWEI\n", - "seq2 AKFVMPDRQWHLYTGQECSKQRLYVWFHDGAPILKTQSDNMGAYRCPLFHVTKNWEI\n", - "seq3 AKFVMPDRQWHLYTGNECSKQRLYVWFHDGAPILKTQADNMGAYRCALFHVTK----\n" - ] - } - ], - "source": [ - "# Initialize ClustalOmega\n", - "clustalo = ClustalOmega()\n", - "\n", - "# Example sequences\n", - "sequences = {\n", - " \"seq1\": \"AKFVMPDRAWHLYTGNECSKQRLYVWFHDGAPILKTQSDNMGAYRCPLFHVTKNWEI\",\n", - " \"seq2\": \"AKFVMPDRQWHLYTGQECSKQRLYVWFHDGAPILKTQSDNMGAYRCPLFHVTKNWEI\",\n", - " \"seq3\": \"AKFVMPDRQWHLYTGNECSKQRLYVWFHDGAPILKTQADNMGAYRCALFHVTK\",\n", - "}\n", - "\n", - "# Perform alignment\n", - "alignment = clustalo.align(sequences)\n", - "print(\"Aligned sequences:\")\n", - "print(alignment)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Database-based Alignment\n", - "\n", - "You can also align sequences directly from the database by providing a list of accession IDs:" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Pyeed Graph Object Mapping constraints not defined. 
Use _install_labels() to set up model constraints.\n", - "📡 Connected to database.\n", - "Database alignment:\n", - "AAP20891.1 MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDQLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVEYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDRWEPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGAGERGSRGIIAALGPDGKPSRIVVIYTTGSQATMDERNRQIAEIGASLIKHW\n", - "CAJ85677.1 MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDKLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVEYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDRWEPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGAGERGSRGIIAALGPDGKPSRIVVIYTTGSQATMDERNRQIAEIGASLIKHW\n", - "SAQ02853.1 MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDKLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVKYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDRWEPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGASERGSRGIIAALGPDGKPSRIVVIYTTGSQATMDERNRQIAEIGASLIKHW\n", - "CDR98216.1 MSIQHFRVALIPFFAAFCFPVFAHPETLVKVKDAEDQLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVKYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDRWEPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGASERGSRGIIAALGPDGKPSRIVVIYMTGSQATMDERNRQIAEIGASLIKHW\n", - "WP_109963600.1 MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDQLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVEYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDSWEPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGTGKRGSRGIIAALGPDGKPSRIVVIYTTGSQATMDERNRQIAEIGASLIKHW\n", - "CAA41038.1 MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDQLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVKYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDHWEPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGAGERGSRGIIAALGPDGKPSRIVVIYTTGSQATMDERNRQIAEIGASLIKHW\n", - "WP_109874025.1 MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDKLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVEYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDSWEPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGAGERGSRGIIAALGPDGKPSRIVVIYTTGSQATMDERNRQIAEIGASLIKHW\n", - "CAA46344.1 MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDKLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVKYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDSWEPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGASERGSRGIIAALGPDGKPSRIVVIYTTGSQATMDERNRQIAEIGASLIKHW\n", - "APG33178.1 MSIQHFRVALIPFFAAFCFPVFAHPETLVKVKDAEDQLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVKYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDSWEPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGAGERGSRGIIAALGPDGKPSRIVVIYMTGSQATMDERNRQIAEIGASLIKHW\n", - "AKC98298.1 MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDKLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVEYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDHWEPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGAGERGSRGIIAALGPDGKPSRIVVIYTTGSQATMDERNRQIAEIGASLIKHW\n" - ] - } - ], - "source": [ - "# Connect to database\n", - "pyeed = Pyeed(uri=\"bolt://129.69.129.130:7687\", user=\"neo4j\", password=\"12345678\")\n", - "\n", - "# Get protein IDs from database\n", - "from pyeed.model import Protein\n", - "\n", - "accession_ids = [protein.accession_id for protein in Protein.nodes.all()][:10]\n", - "\n", - "# Align sequences from database\n", - 
"alignment = clustalo.align_from_db(accession_ids, pyeed.db)\n", - "print(\"Database alignment:\")\n", - "print(alignment)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Understanding Alignment Results\n", - "\n", - "The alignment result is a `MultipleSequenceAlignment` object with:\n", - "- List of `Sequence` objects\n", - "- Each sequence has an ID and aligned sequence\n", - "- Gaps are represented by '-' characters\n", - "- Sequences are padded to equal length\n", - "\n", - "The alignment preserves sequence order and maintains sequence IDs from the input." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Configuration\n", - "\n", - "ClustalOmega requires the PyEED Docker service to be running. Make sure to:\n", - "1. Have Docker installed\n", - "2. Start the service with `docker-compose up -d`\n", - "3. The service runs on port 5001 by default" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "pyeed_niklas", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.12.8" - } - }, - "nbformat": 4, - "nbformat_minor": 2 + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Multiple Sequence Alignment with Clustal Omega\n", + "\n", + "PyEED provides a convenient interface to Clustal Omega for multiple sequence alignment. This notebook demonstrates how to:\n", + "1. Align sequences from a dictionary\n", + "2. Align sequences directly from the database" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "# change log level to INFO\n", + "import sys\n", + "\n", + "from loguru import logger\n", + "\n", + "from pyeed import Pyeed\n", + "from pyeed.model import Protein\n", + "from pyeed.tools.clustalo import ClustalOmega\n", + "\n", + "logger.remove()\n", + "level = logger.add(sys.stderr, level=\"INFO\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Direct Sequence Alignment\n", + "\n", + "You can align sequences directly by providing a dictionary of sequences:" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Aligned sequences:\n", + "seq1 AKFVMPDRAWHLYTGNECSKQRLYVWFHDGAPILKTQSDNMGAYRCPLFHVTKNWEI\n", + "seq2 AKFVMPDRQWHLYTGQECSKQRLYVWFHDGAPILKTQSDNMGAYRCPLFHVTKNWEI\n", + "seq3 AKFVMPDRQWHLYTGNECSKQRLYVWFHDGAPILKTQADNMGAYRCALFHVTK----\n" + ] + } + ], + "source": [ + "# Initialize ClustalOmega\n", + "clustalo = ClustalOmega()\n", + "\n", + "# Example sequences\n", + "sequences = {\n", + " \"seq1\": \"AKFVMPDRAWHLYTGNECSKQRLYVWFHDGAPILKTQSDNMGAYRCPLFHVTKNWEI\",\n", + " \"seq2\": \"AKFVMPDRQWHLYTGQECSKQRLYVWFHDGAPILKTQSDNMGAYRCPLFHVTKNWEI\",\n", + " \"seq3\": \"AKFVMPDRQWHLYTGNECSKQRLYVWFHDGAPILKTQADNMGAYRCALFHVTK\",\n", + "}\n", + "\n", + "# Perform alignment\n", + "alignment = clustalo.align(sequences)\n", + "print(\"Aligned sequences:\")\n", + "print(alignment)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Database-based Alignment\n", + "\n", + "You can also align sequences directly from the database by providing a list of accession IDs:" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": 
[ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Pyeed Graph Object Mapping constraints not defined. Use _install_labels() to set up model constraints.\n", + "📡 Connected to database.\n", + "Database alignment:\n", + "AAP20891.1 MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDQLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVEYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDRWEPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGAGERGSRGIIAALGPDGKPSRIVVIYTTGSQATMDERNRQIAEIGASLIKHW\n", + "CAJ85677.1 MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDKLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVEYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDRWEPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGAGERGSRGIIAALGPDGKPSRIVVIYTTGSQATMDERNRQIAEIGASLIKHW\n", + "SAQ02853.1 MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDKLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVKYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDRWEPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGASERGSRGIIAALGPDGKPSRIVVIYTTGSQATMDERNRQIAEIGASLIKHW\n", + "CDR98216.1 MSIQHFRVALIPFFAAFCFPVFAHPETLVKVKDAEDQLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVKYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDRWEPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGASERGSRGIIAALGPDGKPSRIVVIYMTGSQATMDERNRQIAEIGASLIKHW\n", + "WP_109963600.1 MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDQLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVEYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDSWEPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGTGKRGSRGIIAALGPDGKPSRIVVIYTTGSQATMDERNRQIAEIGASLIKHW\n", + "CAA41038.1 MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDQLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVKYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDHWEPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGAGERGSRGIIAALGPDGKPSRIVVIYTTGSQATMDERNRQIAEIGASLIKHW\n", + "WP_109874025.1 MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDKLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVEYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDSWEPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGAGERGSRGIIAALGPDGKPSRIVVIYTTGSQATMDERNRQIAEIGASLIKHW\n", + "CAA46344.1 MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDKLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVKYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDSWEPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGASERGSRGIIAALGPDGKPSRIVVIYTTGSQATMDERNRQIAEIGASLIKHW\n", + "APG33178.1 MSIQHFRVALIPFFAAFCFPVFAHPETLVKVKDAEDQLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVKYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDSWEPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGAGERGSRGIIAALGPDGKPSRIVVIYMTGSQATMDERNRQIAEIGASLIKHW\n", + "AKC98298.1 MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDKLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVEYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDHWEPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGAGERGSRGIIAALGPDGKPSRIVVIYTTGSQATMDERNRQIAEIGASLIKHW\n" + ] + } + ], + "source": [ + "# Connect to database\n", + "pyeed = Pyeed(uri=\"bolt://129.69.129.130:7687\", user=\"neo4j\", password=\"12345678\")\n", + "\n", + "# Get protein IDs from database\n", + "accession_ids = [protein.accession_id for protein in 
Protein.nodes.all()][:10]\n", + "\n", + "# Align sequences from database\n", + "alignment = clustalo.align_from_db(accession_ids, pyeed.db)\n", + "print(\"Database alignment:\")\n", + "print(alignment)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Understanding Alignment Results\n", + "\n", + "The alignment result is a `MultipleSequenceAlignment` object with:\n", + "- List of `Sequence` objects\n", + "- Each sequence has an ID and aligned sequence\n", + "- Gaps are represented by '-' characters\n", + "- Sequences are padded to equal length\n", + "\n", + "The alignment preserves sequence order and maintains sequence IDs from the input." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Configuration\n", + "\n", + "ClustalOmega requires the PyEED Docker service to be running. Make sure to:\n", + "1. Have Docker installed\n", + "2. Start the service with `docker-compose up -d`\n", + "3. The service runs on port 5001 by default" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "pyeed_niklas", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.8" + } + }, + "nbformat": 4, + "nbformat_minor": 2 } diff --git a/docs/usage/embeddings_analysis.ipynb b/docs/usage/embeddings_analysis.ipynb index 65a2398c..0b72e743 100644 --- a/docs/usage/embeddings_analysis.ipynb +++ b/docs/usage/embeddings_analysis.ipynb @@ -24,9 +24,10 @@ "source": [ "import sys\n", "\n", - "from loguru import logger\n", - "import pandas as pd\n", "import matplotlib.pyplot as plt\n", + "import pandas as pd\n", + "from loguru import logger\n", + "\n", "from pyeed import Pyeed\n", "from pyeed.analysis.embedding_analysis import EmbeddingTool\n", "\n", diff --git a/docs/usage/mmseqs.ipynb b/docs/usage/mmseqs.ipynb index 1185c6fe..2253fd8a 100644 --- a/docs/usage/mmseqs.ipynb +++ b/docs/usage/mmseqs.ipynb @@ -20,6 +20,7 @@ "outputs": [], "source": [ "from pyeed import Pyeed\n", + "from pyeed.model import Protein\n", "from pyeed.tools.mmseqs import MMSeqs" ] }, @@ -134,8 +135,6 @@ "pyeed = Pyeed(uri=\"bolt://localhost:7687\", user=\"neo4j\", password=\"12345678\")\n", "\n", "# Get first 100 protein IDs from database\n", - "from pyeed.model import Protein\n", - "\n", "accession_ids = [protein.accession_id for protein in Protein.nodes.all()][:100]\n", "\n", "# Cluster sequences\n", diff --git a/docs/usage/mutation_analysis.ipynb b/docs/usage/mutation_analysis.ipynb index 9b31c996..7d10d360 100644 --- a/docs/usage/mutation_analysis.ipynb +++ b/docs/usage/mutation_analysis.ipynb @@ -16,6 +16,7 @@ "outputs": [], "source": [ "import sys\n", + "\n", "from loguru import logger\n", "\n", "from pyeed import Pyeed\n", diff --git a/docs/usage/network_analysis.ipynb b/docs/usage/network_analysis.ipynb index 4d45db71..0b254610 100644 --- a/docs/usage/network_analysis.ipynb +++ b/docs/usage/network_analysis.ipynb @@ -11,6 +11,7 @@ "import matplotlib.pyplot as plt\n", "import networkx as nx\n", "from loguru import logger\n", + "\n", "from pyeed import Pyeed\n", "from pyeed.analysis.network_analysis import NetworkAnalysis\n", "from pyeed.analysis.sequence_alignment import PairwiseAligner\n", diff --git a/docs/usage/standard_numbering.ipynb b/docs/usage/standard_numbering.ipynb index cd84cad9..54374cd6 100644 --- 
a/docs/usage/standard_numbering.ipynb +++ b/docs/usage/standard_numbering.ipynb @@ -23,10 +23,10 @@ "%reload_ext autoreload\n", "%autoreload 2\n", "import sys\n", + "\n", "from loguru import logger\n", "\n", "from pyeed import Pyeed\n", - "from pyeed.analysis.mutation_detection import MutationDetection\n", "from pyeed.analysis.standard_numbering import StandardNumberingTool\n", "\n", "logger.remove()\n", diff --git a/src/pyeed/analysis/embedding_analysis.py b/src/pyeed/analysis/embedding_analysis.py index b3535f74..c27b670f 100644 --- a/src/pyeed/analysis/embedding_analysis.py +++ b/src/pyeed/analysis/embedding_analysis.py @@ -6,9 +6,10 @@ import scipy.spatial as sp from matplotlib.figure import Figure from numpy.typing import NDArray -from pyeed.dbconnect import DatabaseConnector from scipy.spatial.distance import cosine +from pyeed.dbconnect import DatabaseConnector + logger = logging.getLogger(__name__) diff --git a/src/pyeed/analysis/network_analysis.py b/src/pyeed/analysis/network_analysis.py index fd354ebe..5627461e 100644 --- a/src/pyeed/analysis/network_analysis.py +++ b/src/pyeed/analysis/network_analysis.py @@ -2,6 +2,7 @@ import networkx as nx from loguru import logger + from pyeed.dbconnect import DatabaseConnector diff --git a/src/pyeed/analysis/ontology_loading.py b/src/pyeed/analysis/ontology_loading.py index ee909636..5b6341f5 100644 --- a/src/pyeed/analysis/ontology_loading.py +++ b/src/pyeed/analysis/ontology_loading.py @@ -1,8 +1,9 @@ from typing import Dict -from pyeed.dbconnect import DatabaseConnector from rdflib import OWL, RDF, RDFS, Graph, Namespace, URIRef +from pyeed.dbconnect import DatabaseConnector + class OntologyAdapter: """ diff --git a/src/pyeed/analysis/standard_numbering.py b/src/pyeed/analysis/standard_numbering.py index 6f81869f..b2ea0667 100644 --- a/src/pyeed/analysis/standard_numbering.py +++ b/src/pyeed/analysis/standard_numbering.py @@ -13,6 +13,7 @@ from typing import Any, Dict, List, Optional, Tuple from loguru import logger + from pyeed.analysis.sequence_alignment import PairwiseAligner from pyeed.dbconnect import DatabaseConnector from pyeed.model import StandardNumbering diff --git a/tests/unit/test_dbchat.py b/tests/unit/test_dbchat.py index d1e202c6..bf6226ac 100644 --- a/tests/unit/test_dbchat.py +++ b/tests/unit/test_dbchat.py @@ -2,6 +2,7 @@ import pytest from neo4j.exceptions import CypherSyntaxError + from pyeed.dbchat import DBChat from pyeed.dbconnect import DatabaseConnector From a435764e26f07ca9094bf016f44b1676cccbd732 Mon Sep 17 00:00:00 2001 From: Niklas Abraham GPU Date: Thu, 1 May 2025 14:17:08 +0000 Subject: [PATCH 36/39] fixed ruff files --- docs/usage/blast.ipynb | 723 ++++++++++++----------- docs/usage/clustalo.ipynb | 338 +++++------ docs/usage/embeddings_analysis.ipynb | 5 +- docs/usage/mmseqs.ipynb | 3 +- docs/usage/mutation_analysis.ipynb | 1 + docs/usage/network_analysis.ipynb | 1 + docs/usage/standard_numbering.ipynb | 2 +- src/pyeed/analysis/embedding_analysis.py | 3 +- src/pyeed/analysis/network_analysis.py | 1 + src/pyeed/analysis/ontology_loading.py | 3 +- src/pyeed/analysis/standard_numbering.py | 1 + tests/unit/test_dbchat.py | 1 + 12 files changed, 545 insertions(+), 537 deletions(-) diff --git a/docs/usage/blast.ipynb b/docs/usage/blast.ipynb index b56140d7..d6cd57ef 100644 --- a/docs/usage/blast.ipynb +++ b/docs/usage/blast.ipynb @@ -1,363 +1,364 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# BLAST Search\n", - "\n", - "## Setup\n", - "\n", - "The BLAST service runs in a 
Docker container and requires:\n", - "1. A local BLAST database\n", - "2. The Docker service running" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "# change log level to INFO\n", - "import sys\n", - "from loguru import logger\n", - "\n", - "logger.remove()\n", - "level = logger.add(sys.stderr, level=\"WARNING\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Basic Usage\n", - "\n", - "The `Blast` class provides an interface to search protein or nucleotide sequences against a local BLAST database." - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
subject_ididentityalignment_lengthmismatchesgap_opensquery_startquery_endsubject_startsubject_endevaluebit_score
0seq781.8182231315111320.00322.3
1seq1100.00025001251250.00422.3
2seq261.5382610020455300.03819.2
\n", - "
" - ], - "text/plain": [ - " subject_id identity alignment_length mismatches gap_opens query_start \\\n", - "0 seq7 81.818 22 3 1 31 \n", - "1 seq1 100.000 25 0 0 1 \n", - "2 seq2 61.538 26 10 0 20 \n", - "\n", - " query_end subject_start subject_end evalue bit_score \n", - "0 51 11 32 0.003 22.3 \n", - "1 25 1 25 0.004 22.3 \n", - "2 45 5 30 0.038 19.2 " - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from pyeed.tools import Blast\n", - "\n", - "# Example protein sequence\n", - "sequence = \"MSEQVAAVAKLRAKASEAAKEAKAREAAKKLAEAAKKAKAKEAAKRAEAKLAEKAKAAKRAEAKAAKEAKRAAAKRAEAKLAEKAKAAK\"\n", - "\n", - "# Initialize BLAST search\n", - "blast = Blast(\n", - " # service_url=\"http://localhost:6001/blast\",\n", - " mode=\"blastp\", # Use blastp for protein sequences\n", - " db_path=\"/usr/local/bin/data/test_db\", # Path in Docker container\n", - " db_name=\"protein_db\", # Name of your BLAST database\n", - " evalue=0.1, # E-value threshold\n", - " max_target_seqs=10, # Maximum number of hits to return\n", - ")\n", - "\n", - "# Perform search\n", - "results = blast.search(sequence)\n", - "results" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The results are returned as a pandas DataFrame with the following columns:\n", - "- subject_id: ID of the matched sequence\n", - "- identity: Percentage identity\n", - "- alignment_length: Length of the alignment\n", - "- mismatches: Number of mismatches\n", - "- gap_opens: Number of gap openings\n", - "- query_start/end: Start/end positions in query sequence\n", - "- subject_start/end: Start/end positions in subject sequence\n", - "- evalue: Expectation value\n", - "- bit_score: Bit score" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Creating a BLAST Database\n", - "\n", - "Before using BLAST, you need to create a local database. Here's how to create one from a FASTA file:" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "```bash\n", - "# For protein sequences\n", - "makeblastdb -in proteins.fasta -dbtype prot -out blast_db/my_proteins\n", - "\n", - "# For nucleotide sequences\n", - "makeblastdb -in nucleotides.fasta -dbtype nucl -out blast_db/my_nucleotides\n", - "```\n", - "\n", - "To access the BLAST Docker container shell and create databases:\n", - "\n", - "```bash\n", - "# Enter the BLAST container shell\n", - "docker compose exec blast bash\n", - "# \n", - "# Navigate to database directory\n", - "cd /usr/local/bin/data/blast_db\n", - "# \n", - "# Create protein database\n", - "makeblastdb -in proteins.fasta -dbtype prot -out my_proteins\n", - "# \n", - "# Create nucleotide database \n", - "makeblastdb -in nucleotides.fasta -dbtype nucl -out my_nucleotides\n", - "```\n", - "Make sure your FASTA files are mounted in the container's `/usr/local/bin/data/blast_db` directory.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Advanced Usage\n", - "\n", - "You can customize the BLAST search parameters:" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
subject_ididentityalignment_lengthmismatchesgap_opensquery_startquery_endsubject_startsubject_endevaluebit_score
0seq781.8182231315111320.00322.3
1seq1100.00025001251250.00422.3
\n", - "
" - ], - "text/plain": [ - " subject_id identity alignment_length mismatches gap_opens query_start \\\n", - "0 seq7 81.818 22 3 1 31 \n", - "1 seq1 100.000 25 0 0 1 \n", - "\n", - " query_end subject_start subject_end evalue bit_score \n", - "0 51 11 32 0.003 22.3 \n", - "1 25 1 25 0.004 22.3 " - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Configure BLAST for sensitive protein search\n", - "blast = Blast(\n", - " # service_url=\"http://localhost:6001/blast\",\n", - " mode=\"blastp\",\n", - " db_path=\"/usr/local/bin/data/test_db\",\n", - " db_name=\"protein_db\",\n", - " evalue=1e-1, # More stringent E-value\n", - " max_target_seqs=100, # Return more hits\n", - " num_threads=4, # Use 4 CPU threads\n", - ")\n", - "\n", - "# Search with longer timeout\n", - "results = blast.search(sequence, timeout=7200) # 2 hour timeout\n", - "\n", - "# Filter results\n", - "significant_hits = results[results[\"identity\"] > 80] # Only hits with >90% identity\n", - "significant_hits" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Thereafter, the ids of the hits can be added to the pyeed database, using the `fetch_from_primary_db` function." - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "pyeed", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.5" - } - }, - "nbformat": 4, - "nbformat_minor": 2 + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# BLAST Search\n", + "\n", + "## Setup\n", + "\n", + "The BLAST service runs in a Docker container and requires:\n", + "1. A local BLAST database\n", + "2. The Docker service running" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "# change log level to INFO\n", + "import sys\n", + "\n", + "from loguru import logger\n", + "\n", + "logger.remove()\n", + "level = logger.add(sys.stderr, level=\"WARNING\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Basic Usage\n", + "\n", + "The `Blast` class provides an interface to search protein or nucleotide sequences against a local BLAST database." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
subject_ididentityalignment_lengthmismatchesgap_opensquery_startquery_endsubject_startsubject_endevaluebit_score
0seq781.8182231315111320.00322.3
1seq1100.00025001251250.00422.3
2seq261.5382610020455300.03819.2
\n", + "
" + ], + "text/plain": [ + " subject_id identity alignment_length mismatches gap_opens query_start \\\n", + "0 seq7 81.818 22 3 1 31 \n", + "1 seq1 100.000 25 0 0 1 \n", + "2 seq2 61.538 26 10 0 20 \n", + "\n", + " query_end subject_start subject_end evalue bit_score \n", + "0 51 11 32 0.003 22.3 \n", + "1 25 1 25 0.004 22.3 \n", + "2 45 5 30 0.038 19.2 " + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from pyeed.tools import Blast\n", + "\n", + "# Example protein sequence\n", + "sequence = \"MSEQVAAVAKLRAKASEAAKEAKAREAAKKLAEAAKKAKAKEAAKRAEAKLAEKAKAAKRAEAKAAKEAKRAAAKRAEAKLAEKAKAAK\"\n", + "\n", + "# Initialize BLAST search\n", + "blast = Blast(\n", + " # service_url=\"http://localhost:6001/blast\",\n", + " mode=\"blastp\", # Use blastp for protein sequences\n", + " db_path=\"/usr/local/bin/data/test_db\", # Path in Docker container\n", + " db_name=\"protein_db\", # Name of your BLAST database\n", + " evalue=0.1, # E-value threshold\n", + " max_target_seqs=10, # Maximum number of hits to return\n", + ")\n", + "\n", + "# Perform search\n", + "results = blast.search(sequence)\n", + "results" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The results are returned as a pandas DataFrame with the following columns:\n", + "- subject_id: ID of the matched sequence\n", + "- identity: Percentage identity\n", + "- alignment_length: Length of the alignment\n", + "- mismatches: Number of mismatches\n", + "- gap_opens: Number of gap openings\n", + "- query_start/end: Start/end positions in query sequence\n", + "- subject_start/end: Start/end positions in subject sequence\n", + "- evalue: Expectation value\n", + "- bit_score: Bit score" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Creating a BLAST Database\n", + "\n", + "Before using BLAST, you need to create a local database. Here's how to create one from a FASTA file:" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "```bash\n", + "# For protein sequences\n", + "makeblastdb -in proteins.fasta -dbtype prot -out blast_db/my_proteins\n", + "\n", + "# For nucleotide sequences\n", + "makeblastdb -in nucleotides.fasta -dbtype nucl -out blast_db/my_nucleotides\n", + "```\n", + "\n", + "To access the BLAST Docker container shell and create databases:\n", + "\n", + "```bash\n", + "# Enter the BLAST container shell\n", + "docker compose exec blast bash\n", + "# \n", + "# Navigate to database directory\n", + "cd /usr/local/bin/data/blast_db\n", + "# \n", + "# Create protein database\n", + "makeblastdb -in proteins.fasta -dbtype prot -out my_proteins\n", + "# \n", + "# Create nucleotide database \n", + "makeblastdb -in nucleotides.fasta -dbtype nucl -out my_nucleotides\n", + "```\n", + "Make sure your FASTA files are mounted in the container's `/usr/local/bin/data/blast_db` directory.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Advanced Usage\n", + "\n", + "You can customize the BLAST search parameters:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
subject_ididentityalignment_lengthmismatchesgap_opensquery_startquery_endsubject_startsubject_endevaluebit_score
0seq781.8182231315111320.00322.3
1seq1100.00025001251250.00422.3
\n", + "
" + ], + "text/plain": [ + " subject_id identity alignment_length mismatches gap_opens query_start \\\n", + "0 seq7 81.818 22 3 1 31 \n", + "1 seq1 100.000 25 0 0 1 \n", + "\n", + " query_end subject_start subject_end evalue bit_score \n", + "0 51 11 32 0.003 22.3 \n", + "1 25 1 25 0.004 22.3 " + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Configure BLAST for sensitive protein search\n", + "blast = Blast(\n", + " # service_url=\"http://localhost:6001/blast\",\n", + " mode=\"blastp\",\n", + " db_path=\"/usr/local/bin/data/test_db\",\n", + " db_name=\"protein_db\",\n", + " evalue=1e-1, # More stringent E-value\n", + " max_target_seqs=100, # Return more hits\n", + " num_threads=4, # Use 4 CPU threads\n", + ")\n", + "\n", + "# Search with longer timeout\n", + "results = blast.search(sequence, timeout=7200) # 2 hour timeout\n", + "\n", + "# Filter results\n", + "significant_hits = results[results[\"identity\"] > 80] # Only hits with >90% identity\n", + "significant_hits" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Thereafter, the ids of the hits can be added to the pyeed database, using the `fetch_from_primary_db` function." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "pyeed", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 } diff --git a/docs/usage/clustalo.ipynb b/docs/usage/clustalo.ipynb index 64ed62ee..d3ba2fba 100644 --- a/docs/usage/clustalo.ipynb +++ b/docs/usage/clustalo.ipynb @@ -1,171 +1,171 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Multiple Sequence Alignment with Clustal Omega\n", - "\n", - "PyEED provides a convenient interface to Clustal Omega for multiple sequence alignment. This notebook demonstrates how to:\n", - "1. Align sequences from a dictionary\n", - "2. 
Align sequences directly from the database" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [], - "source": [ - "from pyeed import Pyeed\n", - "from pyeed.tools.clustalo import ClustalOmega\n", - "\n", - "# change log level to INFO\n", - "import sys\n", - "from loguru import logger\n", - "\n", - "logger.remove()\n", - "level = logger.add(sys.stderr, level=\"INFO\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Direct Sequence Alignment\n", - "\n", - "You can align sequences directly by providing a dictionary of sequences:" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Aligned sequences:\n", - "seq1 AKFVMPDRAWHLYTGNECSKQRLYVWFHDGAPILKTQSDNMGAYRCPLFHVTKNWEI\n", - "seq2 AKFVMPDRQWHLYTGQECSKQRLYVWFHDGAPILKTQSDNMGAYRCPLFHVTKNWEI\n", - "seq3 AKFVMPDRQWHLYTGNECSKQRLYVWFHDGAPILKTQADNMGAYRCALFHVTK----\n" - ] - } - ], - "source": [ - "# Initialize ClustalOmega\n", - "clustalo = ClustalOmega()\n", - "\n", - "# Example sequences\n", - "sequences = {\n", - " \"seq1\": \"AKFVMPDRAWHLYTGNECSKQRLYVWFHDGAPILKTQSDNMGAYRCPLFHVTKNWEI\",\n", - " \"seq2\": \"AKFVMPDRQWHLYTGQECSKQRLYVWFHDGAPILKTQSDNMGAYRCPLFHVTKNWEI\",\n", - " \"seq3\": \"AKFVMPDRQWHLYTGNECSKQRLYVWFHDGAPILKTQADNMGAYRCALFHVTK\",\n", - "}\n", - "\n", - "# Perform alignment\n", - "alignment = clustalo.align(sequences)\n", - "print(\"Aligned sequences:\")\n", - "print(alignment)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Database-based Alignment\n", - "\n", - "You can also align sequences directly from the database by providing a list of accession IDs:" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Pyeed Graph Object Mapping constraints not defined. 
Use _install_labels() to set up model constraints.\n", - "📡 Connected to database.\n", - "Database alignment:\n", - "AAP20891.1 MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDQLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVEYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDRWEPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGAGERGSRGIIAALGPDGKPSRIVVIYTTGSQATMDERNRQIAEIGASLIKHW\n", - "CAJ85677.1 MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDKLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVEYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDRWEPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGAGERGSRGIIAALGPDGKPSRIVVIYTTGSQATMDERNRQIAEIGASLIKHW\n", - "SAQ02853.1 MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDKLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVKYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDRWEPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGASERGSRGIIAALGPDGKPSRIVVIYTTGSQATMDERNRQIAEIGASLIKHW\n", - "CDR98216.1 MSIQHFRVALIPFFAAFCFPVFAHPETLVKVKDAEDQLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVKYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDRWEPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGASERGSRGIIAALGPDGKPSRIVVIYMTGSQATMDERNRQIAEIGASLIKHW\n", - "WP_109963600.1 MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDQLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVEYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDSWEPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGTGKRGSRGIIAALGPDGKPSRIVVIYTTGSQATMDERNRQIAEIGASLIKHW\n", - "CAA41038.1 MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDQLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVKYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDHWEPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGAGERGSRGIIAALGPDGKPSRIVVIYTTGSQATMDERNRQIAEIGASLIKHW\n", - "WP_109874025.1 MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDKLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVEYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDSWEPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGAGERGSRGIIAALGPDGKPSRIVVIYTTGSQATMDERNRQIAEIGASLIKHW\n", - "CAA46344.1 MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDKLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVKYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDSWEPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGASERGSRGIIAALGPDGKPSRIVVIYTTGSQATMDERNRQIAEIGASLIKHW\n", - "APG33178.1 MSIQHFRVALIPFFAAFCFPVFAHPETLVKVKDAEDQLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVKYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDSWEPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGAGERGSRGIIAALGPDGKPSRIVVIYMTGSQATMDERNRQIAEIGASLIKHW\n", - "AKC98298.1 MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDKLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVEYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDHWEPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGAGERGSRGIIAALGPDGKPSRIVVIYTTGSQATMDERNRQIAEIGASLIKHW\n" - ] - } - ], - "source": [ - "# Connect to database\n", - "pyeed = Pyeed(uri=\"bolt://129.69.129.130:7687\", user=\"neo4j\", password=\"12345678\")\n", - "\n", - "# Get protein IDs from database\n", - "from pyeed.model import Protein\n", - "\n", - "accession_ids = [protein.accession_id for protein in Protein.nodes.all()][:10]\n", - "\n", - "# Align sequences from database\n", - 
"alignment = clustalo.align_from_db(accession_ids, pyeed.db)\n", - "print(\"Database alignment:\")\n", - "print(alignment)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Understanding Alignment Results\n", - "\n", - "The alignment result is a `MultipleSequenceAlignment` object with:\n", - "- List of `Sequence` objects\n", - "- Each sequence has an ID and aligned sequence\n", - "- Gaps are represented by '-' characters\n", - "- Sequences are padded to equal length\n", - "\n", - "The alignment preserves sequence order and maintains sequence IDs from the input." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Configuration\n", - "\n", - "ClustalOmega requires the PyEED Docker service to be running. Make sure to:\n", - "1. Have Docker installed\n", - "2. Start the service with `docker-compose up -d`\n", - "3. The service runs on port 5001 by default" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "pyeed_niklas", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.12.8" - } - }, - "nbformat": 4, - "nbformat_minor": 2 + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Multiple Sequence Alignment with Clustal Omega\n", + "\n", + "PyEED provides a convenient interface to Clustal Omega for multiple sequence alignment. This notebook demonstrates how to:\n", + "1. Align sequences from a dictionary\n", + "2. Align sequences directly from the database" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "# change log level to INFO\n", + "import sys\n", + "\n", + "from loguru import logger\n", + "\n", + "from pyeed import Pyeed\n", + "from pyeed.model import Protein\n", + "from pyeed.tools.clustalo import ClustalOmega\n", + "\n", + "logger.remove()\n", + "level = logger.add(sys.stderr, level=\"INFO\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Direct Sequence Alignment\n", + "\n", + "You can align sequences directly by providing a dictionary of sequences:" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Aligned sequences:\n", + "seq1 AKFVMPDRAWHLYTGNECSKQRLYVWFHDGAPILKTQSDNMGAYRCPLFHVTKNWEI\n", + "seq2 AKFVMPDRQWHLYTGQECSKQRLYVWFHDGAPILKTQSDNMGAYRCPLFHVTKNWEI\n", + "seq3 AKFVMPDRQWHLYTGNECSKQRLYVWFHDGAPILKTQADNMGAYRCALFHVTK----\n" + ] + } + ], + "source": [ + "# Initialize ClustalOmega\n", + "clustalo = ClustalOmega()\n", + "\n", + "# Example sequences\n", + "sequences = {\n", + " \"seq1\": \"AKFVMPDRAWHLYTGNECSKQRLYVWFHDGAPILKTQSDNMGAYRCPLFHVTKNWEI\",\n", + " \"seq2\": \"AKFVMPDRQWHLYTGQECSKQRLYVWFHDGAPILKTQSDNMGAYRCPLFHVTKNWEI\",\n", + " \"seq3\": \"AKFVMPDRQWHLYTGNECSKQRLYVWFHDGAPILKTQADNMGAYRCALFHVTK\",\n", + "}\n", + "\n", + "# Perform alignment\n", + "alignment = clustalo.align(sequences)\n", + "print(\"Aligned sequences:\")\n", + "print(alignment)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Database-based Alignment\n", + "\n", + "You can also align sequences directly from the database by providing a list of accession IDs:" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": 
[ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Pyeed Graph Object Mapping constraints not defined. Use _install_labels() to set up model constraints.\n", + "📡 Connected to database.\n", + "Database alignment:\n", + "AAP20891.1 MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDQLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVEYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDRWEPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGAGERGSRGIIAALGPDGKPSRIVVIYTTGSQATMDERNRQIAEIGASLIKHW\n", + "CAJ85677.1 MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDKLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVEYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDRWEPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGAGERGSRGIIAALGPDGKPSRIVVIYTTGSQATMDERNRQIAEIGASLIKHW\n", + "SAQ02853.1 MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDKLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVKYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDRWEPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGASERGSRGIIAALGPDGKPSRIVVIYTTGSQATMDERNRQIAEIGASLIKHW\n", + "CDR98216.1 MSIQHFRVALIPFFAAFCFPVFAHPETLVKVKDAEDQLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVKYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDRWEPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGASERGSRGIIAALGPDGKPSRIVVIYMTGSQATMDERNRQIAEIGASLIKHW\n", + "WP_109963600.1 MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDQLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVEYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDSWEPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGTGKRGSRGIIAALGPDGKPSRIVVIYTTGSQATMDERNRQIAEIGASLIKHW\n", + "CAA41038.1 MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDQLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVKYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDHWEPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGAGERGSRGIIAALGPDGKPSRIVVIYTTGSQATMDERNRQIAEIGASLIKHW\n", + "WP_109874025.1 MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDKLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVEYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDSWEPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGAGERGSRGIIAALGPDGKPSRIVVIYTTGSQATMDERNRQIAEIGASLIKHW\n", + "CAA46344.1 MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDKLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVKYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDSWEPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGASERGSRGIIAALGPDGKPSRIVVIYTTGSQATMDERNRQIAEIGASLIKHW\n", + "APG33178.1 MSIQHFRVALIPFFAAFCFPVFAHPETLVKVKDAEDQLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVKYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDSWEPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGAGERGSRGIIAALGPDGKPSRIVVIYMTGSQATMDERNRQIAEIGASLIKHW\n", + "AKC98298.1 MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDKLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVEYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDHWEPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGAGERGSRGIIAALGPDGKPSRIVVIYTTGSQATMDERNRQIAEIGASLIKHW\n" + ] + } + ], + "source": [ + "# Connect to database\n", + "pyeed = Pyeed(uri=\"bolt://129.69.129.130:7687\", user=\"neo4j\", password=\"12345678\")\n", + "\n", + "# Get protein IDs from database\n", + "accession_ids = [protein.accession_id for protein in 
Protein.nodes.all()][:10]\n", + "\n", + "# Align sequences from database\n", + "alignment = clustalo.align_from_db(accession_ids, pyeed.db)\n", + "print(\"Database alignment:\")\n", + "print(alignment)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Understanding Alignment Results\n", + "\n", + "The alignment result is a `MultipleSequenceAlignment` object with:\n", + "- List of `Sequence` objects\n", + "- Each sequence has an ID and aligned sequence\n", + "- Gaps are represented by '-' characters\n", + "- Sequences are padded to equal length\n", + "\n", + "The alignment preserves sequence order and maintains sequence IDs from the input." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Configuration\n", + "\n", + "ClustalOmega requires the PyEED Docker service to be running. Make sure to:\n", + "1. Have Docker installed\n", + "2. Start the service with `docker-compose up -d`\n", + "3. The service runs on port 5001 by default" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "pyeed_niklas", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.8" + } + }, + "nbformat": 4, + "nbformat_minor": 2 } diff --git a/docs/usage/embeddings_analysis.ipynb b/docs/usage/embeddings_analysis.ipynb index 65a2398c..0b72e743 100644 --- a/docs/usage/embeddings_analysis.ipynb +++ b/docs/usage/embeddings_analysis.ipynb @@ -24,9 +24,10 @@ "source": [ "import sys\n", "\n", - "from loguru import logger\n", - "import pandas as pd\n", "import matplotlib.pyplot as plt\n", + "import pandas as pd\n", + "from loguru import logger\n", + "\n", "from pyeed import Pyeed\n", "from pyeed.analysis.embedding_analysis import EmbeddingTool\n", "\n", diff --git a/docs/usage/mmseqs.ipynb b/docs/usage/mmseqs.ipynb index 1185c6fe..2253fd8a 100644 --- a/docs/usage/mmseqs.ipynb +++ b/docs/usage/mmseqs.ipynb @@ -20,6 +20,7 @@ "outputs": [], "source": [ "from pyeed import Pyeed\n", + "from pyeed.model import Protein\n", "from pyeed.tools.mmseqs import MMSeqs" ] }, @@ -134,8 +135,6 @@ "pyeed = Pyeed(uri=\"bolt://localhost:7687\", user=\"neo4j\", password=\"12345678\")\n", "\n", "# Get first 100 protein IDs from database\n", - "from pyeed.model import Protein\n", - "\n", "accession_ids = [protein.accession_id for protein in Protein.nodes.all()][:100]\n", "\n", "# Cluster sequences\n", diff --git a/docs/usage/mutation_analysis.ipynb b/docs/usage/mutation_analysis.ipynb index 9b31c996..7d10d360 100644 --- a/docs/usage/mutation_analysis.ipynb +++ b/docs/usage/mutation_analysis.ipynb @@ -16,6 +16,7 @@ "outputs": [], "source": [ "import sys\n", + "\n", "from loguru import logger\n", "\n", "from pyeed import Pyeed\n", diff --git a/docs/usage/network_analysis.ipynb b/docs/usage/network_analysis.ipynb index 4d45db71..0b254610 100644 --- a/docs/usage/network_analysis.ipynb +++ b/docs/usage/network_analysis.ipynb @@ -11,6 +11,7 @@ "import matplotlib.pyplot as plt\n", "import networkx as nx\n", "from loguru import logger\n", + "\n", "from pyeed import Pyeed\n", "from pyeed.analysis.network_analysis import NetworkAnalysis\n", "from pyeed.analysis.sequence_alignment import PairwiseAligner\n", diff --git a/docs/usage/standard_numbering.ipynb b/docs/usage/standard_numbering.ipynb index cd84cad9..54374cd6 100644 --- 
a/docs/usage/standard_numbering.ipynb +++ b/docs/usage/standard_numbering.ipynb @@ -23,10 +23,10 @@ "%reload_ext autoreload\n", "%autoreload 2\n", "import sys\n", + "\n", "from loguru import logger\n", "\n", "from pyeed import Pyeed\n", - "from pyeed.analysis.mutation_detection import MutationDetection\n", "from pyeed.analysis.standard_numbering import StandardNumberingTool\n", "\n", "logger.remove()\n", diff --git a/src/pyeed/analysis/embedding_analysis.py b/src/pyeed/analysis/embedding_analysis.py index b3535f74..c27b670f 100644 --- a/src/pyeed/analysis/embedding_analysis.py +++ b/src/pyeed/analysis/embedding_analysis.py @@ -6,9 +6,10 @@ import scipy.spatial as sp from matplotlib.figure import Figure from numpy.typing import NDArray -from pyeed.dbconnect import DatabaseConnector from scipy.spatial.distance import cosine +from pyeed.dbconnect import DatabaseConnector + logger = logging.getLogger(__name__) diff --git a/src/pyeed/analysis/network_analysis.py b/src/pyeed/analysis/network_analysis.py index dd66b45c..3ab9aeaa 100644 --- a/src/pyeed/analysis/network_analysis.py +++ b/src/pyeed/analysis/network_analysis.py @@ -2,6 +2,7 @@ import networkx as nx from loguru import logger + from pyeed.dbconnect import DatabaseConnector diff --git a/src/pyeed/analysis/ontology_loading.py b/src/pyeed/analysis/ontology_loading.py index ee909636..5b6341f5 100644 --- a/src/pyeed/analysis/ontology_loading.py +++ b/src/pyeed/analysis/ontology_loading.py @@ -1,8 +1,9 @@ from typing import Dict -from pyeed.dbconnect import DatabaseConnector from rdflib import OWL, RDF, RDFS, Graph, Namespace, URIRef +from pyeed.dbconnect import DatabaseConnector + class OntologyAdapter: """ diff --git a/src/pyeed/analysis/standard_numbering.py b/src/pyeed/analysis/standard_numbering.py index 6f81869f..b2ea0667 100644 --- a/src/pyeed/analysis/standard_numbering.py +++ b/src/pyeed/analysis/standard_numbering.py @@ -13,6 +13,7 @@ from typing import Any, Dict, List, Optional, Tuple from loguru import logger + from pyeed.analysis.sequence_alignment import PairwiseAligner from pyeed.dbconnect import DatabaseConnector from pyeed.model import StandardNumbering diff --git a/tests/unit/test_dbchat.py b/tests/unit/test_dbchat.py index d1e202c6..bf6226ac 100644 --- a/tests/unit/test_dbchat.py +++ b/tests/unit/test_dbchat.py @@ -2,6 +2,7 @@ import pytest from neo4j.exceptions import CypherSyntaxError + from pyeed.dbchat import DBChat from pyeed.dbconnect import DatabaseConnector From c0739bbf3aae20c6747c216bd53ebbcced8ed396 Mon Sep 17 00:00:00 2001 From: Niklas Abraham GPU Date: Thu, 1 May 2025 14:32:06 +0000 Subject: [PATCH 37/39] fixed ruff import version mismatches --- src/pyeed/analysis/embedding_analysis.py | 3 +-- src/pyeed/analysis/mutation_detection.py | 1 - src/pyeed/analysis/network_analysis.py | 1 - src/pyeed/analysis/ontology_loading.py | 3 +-- src/pyeed/analysis/sequence_alignment.py | 3 +-- src/pyeed/analysis/standard_numbering.py | 1 - tests/unit/test_dbchat.py | 1 - 7 files changed, 3 insertions(+), 10 deletions(-) diff --git a/src/pyeed/analysis/embedding_analysis.py b/src/pyeed/analysis/embedding_analysis.py index c27b670f..b3535f74 100644 --- a/src/pyeed/analysis/embedding_analysis.py +++ b/src/pyeed/analysis/embedding_analysis.py @@ -6,9 +6,8 @@ import scipy.spatial as sp from matplotlib.figure import Figure from numpy.typing import NDArray -from scipy.spatial.distance import cosine - from pyeed.dbconnect import DatabaseConnector +from scipy.spatial.distance import cosine logger = logging.getLogger(__name__) diff 
--git a/src/pyeed/analysis/mutation_detection.py b/src/pyeed/analysis/mutation_detection.py index 5c6809e8..c2562ae1 100644 --- a/src/pyeed/analysis/mutation_detection.py +++ b/src/pyeed/analysis/mutation_detection.py @@ -1,7 +1,6 @@ from typing import Any, Optional from loguru import logger - from pyeed.dbconnect import DatabaseConnector diff --git a/src/pyeed/analysis/network_analysis.py b/src/pyeed/analysis/network_analysis.py index 3ab9aeaa..dd66b45c 100644 --- a/src/pyeed/analysis/network_analysis.py +++ b/src/pyeed/analysis/network_analysis.py @@ -2,7 +2,6 @@ import networkx as nx from loguru import logger - from pyeed.dbconnect import DatabaseConnector diff --git a/src/pyeed/analysis/ontology_loading.py b/src/pyeed/analysis/ontology_loading.py index 5b6341f5..ee909636 100644 --- a/src/pyeed/analysis/ontology_loading.py +++ b/src/pyeed/analysis/ontology_loading.py @@ -1,8 +1,7 @@ from typing import Dict -from rdflib import OWL, RDF, RDFS, Graph, Namespace, URIRef - from pyeed.dbconnect import DatabaseConnector +from rdflib import OWL, RDF, RDFS, Graph, Namespace, URIRef class OntologyAdapter: diff --git a/src/pyeed/analysis/sequence_alignment.py b/src/pyeed/analysis/sequence_alignment.py index cb6acff4..440cbb1e 100644 --- a/src/pyeed/analysis/sequence_alignment.py +++ b/src/pyeed/analysis/sequence_alignment.py @@ -5,10 +5,9 @@ from Bio.Align import PairwiseAligner as BioPairwiseAligner from Bio.Align.substitution_matrices import Array as BioSubstitutionMatrix from joblib import Parallel, cpu_count, delayed -from rich.progress import Progress - from pyeed.dbconnect import DatabaseConnector from pyeed.tools.utility import chunks +from rich.progress import Progress class PairwiseAligner: diff --git a/src/pyeed/analysis/standard_numbering.py b/src/pyeed/analysis/standard_numbering.py index b2ea0667..6f81869f 100644 --- a/src/pyeed/analysis/standard_numbering.py +++ b/src/pyeed/analysis/standard_numbering.py @@ -13,7 +13,6 @@ from typing import Any, Dict, List, Optional, Tuple from loguru import logger - from pyeed.analysis.sequence_alignment import PairwiseAligner from pyeed.dbconnect import DatabaseConnector from pyeed.model import StandardNumbering diff --git a/tests/unit/test_dbchat.py b/tests/unit/test_dbchat.py index bf6226ac..d1e202c6 100644 --- a/tests/unit/test_dbchat.py +++ b/tests/unit/test_dbchat.py @@ -2,7 +2,6 @@ import pytest from neo4j.exceptions import CypherSyntaxError - from pyeed.dbchat import DBChat from pyeed.dbconnect import DatabaseConnector From 667882db8b550ee14653dfbf127f64c6ac0c99a9 Mon Sep 17 00:00:00 2001 From: Niklas Abraham GPU Date: Thu, 1 May 2025 14:32:31 +0000 Subject: [PATCH 38/39] fixed ruff mismatches run --- src/pyeed/analysis/embedding_analysis.py | 3 +-- src/pyeed/analysis/mutation_detection.py | 1 - src/pyeed/analysis/network_analysis.py | 1 - src/pyeed/analysis/ontology_loading.py | 3 +-- src/pyeed/analysis/sequence_alignment.py | 3 +-- src/pyeed/analysis/standard_numbering.py | 1 - tests/unit/test_dbchat.py | 1 - 7 files changed, 3 insertions(+), 10 deletions(-) diff --git a/src/pyeed/analysis/embedding_analysis.py b/src/pyeed/analysis/embedding_analysis.py index c27b670f..b3535f74 100644 --- a/src/pyeed/analysis/embedding_analysis.py +++ b/src/pyeed/analysis/embedding_analysis.py @@ -6,9 +6,8 @@ import scipy.spatial as sp from matplotlib.figure import Figure from numpy.typing import NDArray -from scipy.spatial.distance import cosine - from pyeed.dbconnect import DatabaseConnector +from scipy.spatial.distance import cosine logger = 
logging.getLogger(__name__) diff --git a/src/pyeed/analysis/mutation_detection.py b/src/pyeed/analysis/mutation_detection.py index 5c6809e8..c2562ae1 100644 --- a/src/pyeed/analysis/mutation_detection.py +++ b/src/pyeed/analysis/mutation_detection.py @@ -1,7 +1,6 @@ from typing import Any, Optional from loguru import logger - from pyeed.dbconnect import DatabaseConnector diff --git a/src/pyeed/analysis/network_analysis.py b/src/pyeed/analysis/network_analysis.py index 5627461e..fd354ebe 100644 --- a/src/pyeed/analysis/network_analysis.py +++ b/src/pyeed/analysis/network_analysis.py @@ -2,7 +2,6 @@ import networkx as nx from loguru import logger - from pyeed.dbconnect import DatabaseConnector diff --git a/src/pyeed/analysis/ontology_loading.py b/src/pyeed/analysis/ontology_loading.py index 5b6341f5..ee909636 100644 --- a/src/pyeed/analysis/ontology_loading.py +++ b/src/pyeed/analysis/ontology_loading.py @@ -1,8 +1,7 @@ from typing import Dict -from rdflib import OWL, RDF, RDFS, Graph, Namespace, URIRef - from pyeed.dbconnect import DatabaseConnector +from rdflib import OWL, RDF, RDFS, Graph, Namespace, URIRef class OntologyAdapter: diff --git a/src/pyeed/analysis/sequence_alignment.py b/src/pyeed/analysis/sequence_alignment.py index cb6acff4..440cbb1e 100644 --- a/src/pyeed/analysis/sequence_alignment.py +++ b/src/pyeed/analysis/sequence_alignment.py @@ -5,10 +5,9 @@ from Bio.Align import PairwiseAligner as BioPairwiseAligner from Bio.Align.substitution_matrices import Array as BioSubstitutionMatrix from joblib import Parallel, cpu_count, delayed -from rich.progress import Progress - from pyeed.dbconnect import DatabaseConnector from pyeed.tools.utility import chunks +from rich.progress import Progress class PairwiseAligner: diff --git a/src/pyeed/analysis/standard_numbering.py b/src/pyeed/analysis/standard_numbering.py index b2ea0667..6f81869f 100644 --- a/src/pyeed/analysis/standard_numbering.py +++ b/src/pyeed/analysis/standard_numbering.py @@ -13,7 +13,6 @@ from typing import Any, Dict, List, Optional, Tuple from loguru import logger - from pyeed.analysis.sequence_alignment import PairwiseAligner from pyeed.dbconnect import DatabaseConnector from pyeed.model import StandardNumbering diff --git a/tests/unit/test_dbchat.py b/tests/unit/test_dbchat.py index bf6226ac..d1e202c6 100644 --- a/tests/unit/test_dbchat.py +++ b/tests/unit/test_dbchat.py @@ -2,7 +2,6 @@ import pytest from neo4j.exceptions import CypherSyntaxError - from pyeed.dbchat import DBChat from pyeed.dbconnect import DatabaseConnector From cf19b94a87f3d800a40e8a95be4020853c59d689 Mon Sep 17 00:00:00 2001 From: alacheim Date: Fri, 2 May 2025 08:58:14 +0000 Subject: [PATCH 39/39] fixed mypy error, formated file --- src/pyeed/embedding.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pyeed/embedding.py b/src/pyeed/embedding.py index 30b15f0c..28f66a1b 100644 --- a/src/pyeed/embedding.py +++ b/src/pyeed/embedding.py @@ -97,7 +97,7 @@ def process_batches_on_gpu( def load_model_and_tokenizer( model_name: str, device: torch.device, -) -> Tuple[Any, Union[Any, None], str]: +) -> Tuple[Any, Union[Any, None], torch.device]: """ Loads the model and assigns it to a specific GPU.