From f661eb643abcde43362dae60773828363a595a9d Mon Sep 17 00:00:00 2001
From: alacheim
Date: Tue, 11 Mar 2025 10:30:16 +0000
Subject: [PATCH 01/19] added ncbi to uniprot mapper

---
 src/pyeed/adapter/ncbi_to_uniprot_mapper.py | 131 ++++++++++++++++++++
 src/pyeed/main.py                           |  14 +++
 2 files changed, 145 insertions(+)
 create mode 100644 src/pyeed/adapter/ncbi_to_uniprot_mapper.py

diff --git a/src/pyeed/adapter/ncbi_to_uniprot_mapper.py b/src/pyeed/adapter/ncbi_to_uniprot_mapper.py
new file mode 100644
index 00000000..3b6098ad
--- /dev/null
+++ b/src/pyeed/adapter/ncbi_to_uniprot_mapper.py
@@ -0,0 +1,131 @@
+import httpx
+import logging
+from pysam import FastaFile
+from crc64iso import crc64iso
+import sys
+import json
+import os
+from typing import List
+
+logger = logging.getLogger(__name__)
+
+class NCBIToUniprotMapper:
+    def __init__(self, ids):
+        self.ids = ids
+        self.uniparc_url = "https://www.ebi.ac.uk/proteins/api/uniparc?offset=0&size=100&sequencechecksum="
+        self.ncbi_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
+
+
+    def download_fasta(self, refseq_id: str) -> None:
+        """
+        Downloads a FASTA file for a given RefSeq ID using httpx and saves it locally.
+
+        Args:
+            refseq_id str: NCBI ID
+        """
+
+        params = {
+            "db": "protein",
+            "id": refseq_id,
+            "rettype": "fasta",
+            "retmode": "text"
+        }
+
+        try:
+            response = httpx.get(self.ncbi_url, params=params, timeout=10.0)
+
+            if response.status_code == 200:
+                filename = f"{refseq_id}.fasta"
+                with open(filename, "w") as f:
+                    f.write(response.text)
+                print(f"✅ Downloaded: {filename}")
+            else:
+                print(f"❌ Failed to download {refseq_id} (Status: {response.status_code})")
+
+        except httpx.HTTPError as e:
+            print(f"❌ HTTP error occurred while downloading {refseq_id}: {e}")
+
+    def get_checksum(self, refseq_id: str) -> str:
+        """Fetches and calculates the checksum for a given RefSeq ID.
+
+        Args:
+            refseq_id str: NCBI ID
+
+        Returns:
+            str: checksum ID
+        """
+
+        self.download_fasta(refseq_id)
+        fa = FastaFile(f"{refseq_id}.fasta")
+        seq = fa.fetch(fa.references[0])
+        return crc64iso.crc64(seq)
+
+    def checksum_list(self, refseq_ids: List[str]) -> List[str]:
+        """Creates a list of checksum IDs and deletes the FASTA files after processing.
+
+        Args:
+            refseq_ids str: NCBI IDs
+
+        Returns:
+            List[str]: checksum IDs
+        """
+
+        checksums = []
+        for refseq_id in refseq_ids:
+            checksums.append(self.get_checksum(refseq_id))
+            fasta_file_path = f"{refseq_id}.fasta"
+            fai_file_path = f"{refseq_id}.fasta.fai"
+
+            if os.path.exists(fasta_file_path):
+                os.remove(fasta_file_path)  # Delete the fasta file
+
+            if os.path.exists(fai_file_path):
+                os.remove(fai_file_path)
+        return checksums
+
+    def execute_request(self) -> None:
+        """Fetches the uniparc and uniprot ids for the given refseq ids and saves them in a json file.
+        """
+
+        checksum_list = self.checksum_list(self.ids)
+
+        id_mapping_uniprot = {}
+        id_mapping_uniparc = {}
+        counter = 0
+
+        for checksum in checksum_list:
+            url = f"{self.uniparc_url}{checksum}"
+
+            #perform request and get response as JSON
+            with httpx.Client() as client:
+                response = client.get(url, headers={ "Accept" : "application/json"})
+
+            #check if the request was successful
+            if response.status_code != 200:
+                print(f"Request failed with status code {r.status_code}")
+                response.raise_for_status()  # Raise exception for any non-200 response
+                sys.exit()
+
+            # Check if the response body is empty
+            if not response.content.strip():  # Check if the body is empty
+                print("The response body is empty.")
+                sys.exit()
+
+            #extracts the uniprot and the uniparc id from the response and saves them in a dictionary
+            response_body = response.json()
+            for item in response_body:
+                uniparc_id = item.get('accession', None)
+                for ref in item.get('dbReference', []):
+                    if ref.get('type') == 'UniProtKB/TrEMBL':
+                        uniprot_id = ref.get('id', None)
+            id_mapping_uniparc[self.ids[counter]] = uniparc_id
+            id_mapping_uniprot[self.ids[counter]] = uniprot_id
+            counter += 1
+
+        with open("id_mapping_uniprot.json", "w") as f:
+            json.dump(id_mapping_uniprot, f)
+
+        with open("id_mapping_uniparc.json", "w") as f:
+            json.dump(id_mapping_uniparc, f)
+
+
diff --git a/src/pyeed/main.py b/src/pyeed/main.py
index 5950965d..c7707d13 100644
--- a/src/pyeed/main.py
+++ b/src/pyeed/main.py
@@ -8,6 +8,7 @@
 from pyeed.adapter.ncbi_protein_mapper import NCBIProteinToPyeed
 from pyeed.adapter.primary_db_adapter import PrimaryDBAdapter
 from pyeed.adapter.uniprot_mapper import UniprotToPyeed
+from pyeed.adapter.ncbi_to_uniprot_mapper import NCBIToUniprotMapper
 from pyeed.dbchat import DBChat
 from pyeed.dbconnect import DatabaseConnector
 from pyeed.embedding import (
@@ -185,6 +186,19 @@ def fetch_ncbi_nucleotide(self, ids: list[str]) -> None:
         asyncio.run(adapter.execute_requests())
         nest_asyncio.apply()
 
+    def database_id_mapper(self, ids: list[str]) -> None:
+        """
+        Maps IDs from one database to another using the UniProt ID mapping service
+
+        Args:
+            ids (list[str]): List of IDs to map.
+ """ + + mapper = NCBIToUniprotMapper(ids) + mapper.execute_request() + + nest_asyncio.apply() def calculate_sequence_embeddings( self, From f44a2f3126d6b3ddb23082b2bdc193e5ace91956 Mon Sep 17 00:00:00 2001 From: alacheim Date: Thu, 13 Mar 2025 14:21:07 +0000 Subject: [PATCH 02/19] changes in mapper --- src/pyeed/adapter/ncbi_to_uniprot_mapper.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/pyeed/adapter/ncbi_to_uniprot_mapper.py b/src/pyeed/adapter/ncbi_to_uniprot_mapper.py index 3b6098ad..134d2ff5 100644 --- a/src/pyeed/adapter/ncbi_to_uniprot_mapper.py +++ b/src/pyeed/adapter/ncbi_to_uniprot_mapper.py @@ -127,5 +127,4 @@ def execute_request(self) -> None: with open("id_mapping_uniparc.json", "w") as f: json.dump(id_mapping_uniparc, f) - - + \ No newline at end of file From 27ff2317166e9c2b43572e08ca378240405fa7ab Mon Sep 17 00:00:00 2001 From: alacheim Date: Fri, 14 Mar 2025 09:46:57 +0000 Subject: [PATCH 03/19] fixed bug in organism mapper --- src/pyeed/adapter/ncbi_protein_mapper.py | 2 ++ src/pyeed/model.py | 14 ++++++++++++++ 2 files changed, 16 insertions(+) diff --git a/src/pyeed/adapter/ncbi_protein_mapper.py b/src/pyeed/adapter/ncbi_protein_mapper.py index e11d4fe7..3ecf485c 100644 --- a/src/pyeed/adapter/ncbi_protein_mapper.py +++ b/src/pyeed/adapter/ncbi_protein_mapper.py @@ -281,6 +281,8 @@ def add_to_db(self, response: Response) -> None: protein = Protein(**protein_data) protein.save() + if not isinstance(organism, Organism): + raise TypeError(f"Expected Organism, but got {type(organism)}") protein.organism.connect(organism) # Add features diff --git a/src/pyeed/model.py b/src/pyeed/model.py index 7a720560..c9c1d4a4 100644 --- a/src/pyeed/model.py +++ b/src/pyeed/model.py @@ -145,6 +145,20 @@ class Annotation(Enum): class Organism(StrictStructuredNode): taxonomy_id: int = IntegerProperty(required=True, unique_index=True) name = StringProperty() + + @classmethod + def get_or_save(cls, taxonomy_id, name) -> "Organism": + try: + organism = cls.nodes.get(taxonomy_id=taxonomy_id) + return organism + except cls.DoesNotExist: + try: + organism = cls(taxonomy_id=taxonomy_id, name=name) + organism.save() + return organism + except Exception as e: + print(f"Error during saving of the organism: {e}") + raise class SiteRel(StructuredRel): # type: ignore From faf38fd4a32ab6ba27edc78748069464fe066195 Mon Sep 17 00:00:00 2001 From: alacheim Date: Fri, 25 Apr 2025 10:09:19 +0000 Subject: [PATCH 04/19] added individual file name to mapper --- src/pyeed/adapter/ncbi_to_uniprot_mapper.py | 7 ++++--- src/pyeed/main.py | 4 ++-- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/src/pyeed/adapter/ncbi_to_uniprot_mapper.py b/src/pyeed/adapter/ncbi_to_uniprot_mapper.py index 134d2ff5..4ea11801 100644 --- a/src/pyeed/adapter/ncbi_to_uniprot_mapper.py +++ b/src/pyeed/adapter/ncbi_to_uniprot_mapper.py @@ -10,8 +10,9 @@ logger = logging.getLogger(__name__) class NCBIToUniprotMapper: - def __init__(self, ids): + def __init__(self, ids: List[str], file: str): self.ids = ids + self.file = file self.uniparc_url = "https://www.ebi.ac.uk/proteins/api/uniparc?offset=0&size=100&sequencechecksum=" self.ncbi_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi" @@ -122,9 +123,9 @@ def execute_request(self) -> None: id_mapping_uniprot[self.ids[counter]] = uniprot_id counter += 1 - with open("id_mapping_uniprot.json", "w") as f: + with open(f"{self.file}_uniprot.json", "w") as f: json.dump(id_mapping_uniprot, f) - with open("id_mapping_uniparc.json", "w") as f: + 
with open(f"{self.file}_uniparc.json", "w") as f: json.dump(id_mapping_uniparc, f) \ No newline at end of file diff --git a/src/pyeed/main.py b/src/pyeed/main.py index c7707d13..44fd6d52 100644 --- a/src/pyeed/main.py +++ b/src/pyeed/main.py @@ -187,7 +187,7 @@ def fetch_ncbi_nucleotide(self, ids: list[str]) -> None: asyncio.run(adapter.execute_requests()) nest_asyncio.apply() - def database_id_mapper(self, ids: list[str]) -> None: + def database_id_mapper(self, ids: list[str], file: str) -> None: """ Maps IDs from one database to another using the UniProt ID mapping service @@ -195,7 +195,7 @@ def database_id_mapper(self, ids: list[str]) -> None: ids (list[str]): List of IDs to map. """ - mapper = NCBIToUniprotMapper(ids) + mapper = NCBIToUniprotMapper(ids, file) mapper.execute_request() nest_asyncio.apply() From 056cb6b75cb45efc650648b6e2a7b56c446803b3 Mon Sep 17 00:00:00 2001 From: alacheim Date: Wed, 30 Apr 2025 09:50:29 +0000 Subject: [PATCH 05/19] fixing ruff errors --- src/pyeed/adapter/ncbi_to_uniprot_mapper.py | 11 ++++++----- src/pyeed/main.py | 11 ++++------- 2 files changed, 10 insertions(+), 12 deletions(-) diff --git a/src/pyeed/adapter/ncbi_to_uniprot_mapper.py b/src/pyeed/adapter/ncbi_to_uniprot_mapper.py index 4ea11801..6969e2e8 100644 --- a/src/pyeed/adapter/ncbi_to_uniprot_mapper.py +++ b/src/pyeed/adapter/ncbi_to_uniprot_mapper.py @@ -1,12 +1,13 @@ -import httpx +import json import logging -from pysam import FastaFile -from crc64iso import crc64iso -import sys -import json import os +import sys from typing import List +import httpx +from crc64iso import crc64iso +from pysam import FastaFile + logger = logging.getLogger(__name__) class NCBIToUniprotMapper: diff --git a/src/pyeed/main.py b/src/pyeed/main.py index c971fe16..7effe1f1 100644 --- a/src/pyeed/main.py +++ b/src/pyeed/main.py @@ -1,25 +1,22 @@ import asyncio -from typing import Any, Literal import time from concurrent.futures import ThreadPoolExecutor -import torch +from typing import Any, Literal import nest_asyncio +import torch from loguru import logger from pyeed.adapter.ncbi_dna_mapper import NCBIDNAToPyeed from pyeed.adapter.ncbi_protein_mapper import NCBIProteinToPyeed +from pyeed.adapter.ncbi_to_uniprot_mapper import NCBIToUniprotMapper from pyeed.adapter.primary_db_adapter import PrimaryDBAdapter from pyeed.adapter.uniprot_mapper import UniprotToPyeed -from pyeed.adapter.ncbi_to_uniprot_mapper import NCBIToUniprotMapper from pyeed.dbchat import DBChat from pyeed.dbconnect import DatabaseConnector from pyeed.embedding import ( - free_memory, - get_batch_embeddings, load_model_and_tokenizer, - update_protein_embeddings_in_db, - process_batches_on_gpu + process_batches_on_gpu, ) From e7369e93c2dfd504150afbe422c76c73a20043fd Mon Sep 17 00:00:00 2001 From: alacheim Date: Wed, 30 Apr 2025 10:05:54 +0000 Subject: [PATCH 06/19] added crc64iso to dependencies --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index dd10629a..41b2c8fc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,6 +36,7 @@ esm = "^3.1.3" rdflib = "^6.0.0" docker = "5.0.0" absl-py = "1.0.0" +crc64iso = "0.0.2" [tool.poetry.group.dev.dependencies] mkdocstrings = {extras = ["python"], version = "^0.26.2"} From f9897fe4e1185dfa0d5003b5881572a8e9355118 Mon Sep 17 00:00:00 2001 From: alacheim Date: Wed, 30 Apr 2025 10:09:18 +0000 Subject: [PATCH 07/19] added pysam to dependencies --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 
2a57449b..2f81fd56 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -38,6 +38,7 @@ docker = "5.0.0" absl-py = "1.0.0" crc64iso = "0.0.2" SPARQLWrapper = "2.0.0" +pysam = "0.23.0" [tool.poetry.group.dev.dependencies] mkdocstrings = {extras = ["python"], version = "^0.26.2"} From 9d81e9b2204023d8c2a2ebff0b8ac88ff7e384cb Mon Sep 17 00:00:00 2001 From: alacheim Date: Wed, 30 Apr 2025 10:21:54 +0000 Subject: [PATCH 08/19] fixing linting errors --- src/pyeed/adapter/ncbi_to_uniprot_mapper.py | 2 +- src/pyeed/analysis/sequence_alignment.py | 3 +-- src/pyeed/embedding.py | 5 ++--- 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/src/pyeed/adapter/ncbi_to_uniprot_mapper.py b/src/pyeed/adapter/ncbi_to_uniprot_mapper.py index 6969e2e8..8e543f52 100644 --- a/src/pyeed/adapter/ncbi_to_uniprot_mapper.py +++ b/src/pyeed/adapter/ncbi_to_uniprot_mapper.py @@ -104,7 +104,7 @@ def execute_request(self) -> None: #check if the request was successful if response.status_code != 200: - print(f"Request failed with status code {r.status_code}") + print(f"Request failed with status code {response.status_code}") response.raise_for_status() # Raise exception for any non-200 response sys.exit() diff --git a/src/pyeed/analysis/sequence_alignment.py b/src/pyeed/analysis/sequence_alignment.py index 946200b2..9255d55f 100644 --- a/src/pyeed/analysis/sequence_alignment.py +++ b/src/pyeed/analysis/sequence_alignment.py @@ -5,10 +5,9 @@ from Bio.Align import PairwiseAligner as BioPairwiseAligner from Bio.Align.substitution_matrices import Array as BioSubstitutionMatrix from joblib import Parallel, cpu_count, delayed -from rich.progress import Progress - from pyeed.dbconnect import DatabaseConnector from pyeed.tools.utility import chunks +from rich.progress import Progress class PairwiseAligner: diff --git a/src/pyeed/embedding.py b/src/pyeed/embedding.py index c8fa91db..1b0d4955 100644 --- a/src/pyeed/embedding.py +++ b/src/pyeed/embedding.py @@ -1,21 +1,20 @@ import gc import os from typing import Any, Tuple, Union -from loguru import logger import numpy as np import torch -from esm.models.esmc import ESMC from esm.models.esm3 import ESM3 +from esm.models.esmc import ESMC from esm.sdk.api import ESM3InferenceClient, ESMProtein, LogitsConfig, SamplingConfig from huggingface_hub import HfFolder, login +from loguru import logger from numpy.typing import NDArray from transformers import EsmModel, EsmTokenizer from pyeed.dbconnect import DatabaseConnector - def get_hf_token() -> str: """Get or request Hugging Face token.""" if os.getenv("PYTEST_DISABLE_HF_LOGIN"): # Disable Hugging Face login in tests From 3955718fa2d05a3739b1cb3c4e8bd3dc1c06d676 Mon Sep 17 00:00:00 2001 From: alacheim Date: Wed, 30 Apr 2025 11:29:50 +0000 Subject: [PATCH 09/19] reformatting --- src/pyeed/adapter/ncbi_to_uniprot_mapper.py | 68 ++++++++++----------- src/pyeed/adapter/uniprot_mapper.py | 36 +++++------ src/pyeed/analysis/network_analysis.py | 36 +++++------ src/pyeed/analysis/sequence_alignment.py | 13 ++-- src/pyeed/dbconnect.py | 7 ++- src/pyeed/embedding.py | 15 +++-- src/pyeed/main.py | 25 ++++---- src/pyeed/model.py | 23 ++++--- 8 files changed, 114 insertions(+), 109 deletions(-) diff --git a/src/pyeed/adapter/ncbi_to_uniprot_mapper.py b/src/pyeed/adapter/ncbi_to_uniprot_mapper.py index 8e543f52..2f711e16 100644 --- a/src/pyeed/adapter/ncbi_to_uniprot_mapper.py +++ b/src/pyeed/adapter/ncbi_to_uniprot_mapper.py @@ -10,18 +10,18 @@ logger = logging.getLogger(__name__) + class NCBIToUniprotMapper: def __init__(self, ids: 
List[str], file: str): self.ids = ids self.file = file self.uniparc_url = "https://www.ebi.ac.uk/proteins/api/uniparc?offset=0&size=100&sequencechecksum=" self.ncbi_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi" - - + def download_fasta(self, refseq_id: str) -> None: """ Downloads a FASTA file for a given RefSeq ID using httpx and saves it locally. - + Args: refseq_id str: NCBI ID """ @@ -30,9 +30,9 @@ def download_fasta(self, refseq_id: str) -> None: "db": "protein", "id": refseq_id, "rettype": "fasta", - "retmode": "text" + "retmode": "text", } - + try: response = httpx.get(self.ncbi_url, params=params, timeout=10.0) @@ -42,21 +42,23 @@ def download_fasta(self, refseq_id: str) -> None: f.write(response.text) print(f"✅ Downloaded: {filename}") else: - print(f"❌ Failed to download {refseq_id} (Status: {response.status_code})") + print( + f"❌ Failed to download {refseq_id} (Status: {response.status_code})" + ) except httpx.HTTPError as e: print(f"❌ HTTP error occurred while downloading {refseq_id}: {e}") def get_checksum(self, refseq_id: str) -> str: """Fetches and calculates the checksum for a given RefSeq ID. - - Args: + + Args: refseq_id str: NCBI ID - + Returns: str: checksum ID """ - + self.download_fasta(refseq_id) fa = FastaFile(f"{refseq_id}.fasta") seq = fa.fetch(fa.references[0]) @@ -71,7 +73,7 @@ def checksum_list(self, refseq_ids: List[str]) -> List[str]: Returns: List[str]: cheksum IDs """ - + checksums = [] for refseq_id in refseq_ids: checksums.append(self.get_checksum(refseq_id)) @@ -85,48 +87,46 @@ def checksum_list(self, refseq_ids: List[str]) -> List[str]: os.remove(fai_file_path) return checksums - def execute_request(self) -> None: - """Fetches the uniparc and uniprot ids for the given refseq ids and saves them in a json file. 
- """ - + def execute_request(self) -> None: + """Fetches the uniparc and uniprot ids for the given refseq ids and saves them in a json file.""" + checksum_list = self.checksum_list(self.ids) - + id_mapping_uniprot = {} id_mapping_uniparc = {} counter = 0 - - for checksum in checksum_list: + + for checksum in checksum_list: url = f"{self.uniparc_url}{checksum}" - - #perform request and get response as JSON + + # perform request and get response as JSON with httpx.Client() as client: - response = client.get(url, headers={ "Accept" : "application/json"}) - - #check if the request was successful + response = client.get(url, headers={"Accept": "application/json"}) + + # check if the request was successful if response.status_code != 200: print(f"Request failed with status code {response.status_code}") response.raise_for_status() # Raise exception for any non-200 response sys.exit() - + # Check if the response body is empty if not response.content.strip(): # Check if the body is empty print("The response body is empty.") sys.exit() - - #extracts the uniprot and the uniparc id from the repsonse and saves them in a dictionary + + # extracts the uniprot and the uniparc id from the repsonse and saves them in a dictionary response_body = response.json() - for item in response_body: - uniparc_id = item.get('accession', None) - for ref in item.get('dbReference', []): - if ref.get('type') == 'UniProtKB/TrEMBL': - uniprot_id = ref.get('id', None) + for item in response_body: + uniparc_id = item.get("accession", None) + for ref in item.get("dbReference", []): + if ref.get("type") == "UniProtKB/TrEMBL": + uniprot_id = ref.get("id", None) id_mapping_uniparc[self.ids[counter]] = uniparc_id id_mapping_uniprot[self.ids[counter]] = uniprot_id counter += 1 - + with open(f"{self.file}_uniprot.json", "w") as f: json.dump(id_mapping_uniprot, f) - + with open(f"{self.file}_uniparc.json", "w") as f: json.dump(id_mapping_uniparc, f) - \ No newline at end of file diff --git a/src/pyeed/adapter/uniprot_mapper.py b/src/pyeed/adapter/uniprot_mapper.py index 477249eb..36ed3577 100644 --- a/src/pyeed/adapter/uniprot_mapper.py +++ b/src/pyeed/adapter/uniprot_mapper.py @@ -82,13 +82,15 @@ def add_sites(self, record: dict[str, Any], protein: Protein) -> None: site.save() protein.site.connect(site, {"positions": positions}) - - def get_substrates_and_products_from_rhea(self, rhea_id: str) -> dict[str, List[str]]: + + def get_substrates_and_products_from_rhea( + self, rhea_id: str + ) -> dict[str, List[str]]: """Fetch substrates and products from Rhea by parsing the side URI (_L = substrate, _R = product). - + Args: rhea_id (str or int): The Rhea reaction ID (e.g., 49528) - + Returns: dict: { 'substrates': [list of chebi URIs], @@ -134,17 +136,13 @@ def get_substrates_and_products_from_rhea(self, rhea_id: str) -> dict[str, List[ elif side_uri.endswith("_R"): products.add(chebi_uri) - return { - "substrates": sorted(substrates), - "products": sorted(products) - } + return {"substrates": sorted(substrates), "products": sorted(products)} - def get_smiles_from_chebi_web(self, chebi_url: str) -> str: """ Extract SMILES from the official ChEBI page using HTML scraping. 
""" - chebi_id = chebi_url.split('_')[-1] + chebi_id = chebi_url.split("_")[-1] url = f"https://www.ebi.ac.uk/chebi/searchId.do?chebiId=CHEBI:{chebi_id}" response = requests.get(url) @@ -157,7 +155,6 @@ def get_smiles_from_chebi_web(self, chebi_url: str) -> str: if headers and "SMILES" in headers[0].text: data_cell = row.find_all("td")[-1] # Get the last in row return data_cell.text.strip() - def add_reaction(self, record: dict[str, Any], protein: Protein) -> None: for reference in record.get("comments", []): # Safe retrieval with .get() @@ -168,7 +165,7 @@ def add_reaction(self, record: dict[str, Any], protein: Protein) -> None: if db_ref.get("id", "").startswith("RHEA:"): rhea_id = db_ref["id"] break # Stop after finding the first match - + catalytic_annotation = Reaction.get_or_save( rhea_id=rhea_id, ) @@ -176,31 +173,30 @@ def add_reaction(self, record: dict[str, Any], protein: Protein) -> None: protein.reaction.connect(catalytic_annotation) def add_molecule(self, rhea_id: str, reaction: Reaction) -> None: - chebi = self.get_substrates_and_products_from_rhea(rhea_id) substrate_ids = chebi["substrates"] product_ids = chebi["products"] - + for i in substrate_ids: smiles = self.get_smiles_from_chebi_web(i) - - chebi_id = i.split('_')[-1] + + chebi_id = i.split("_")[-1] chebi_id = f"CHEBI:{chebi_id}" substrate = Molecule.get_or_save( chebi_id=chebi_id, - smiles = smiles, + smiles=smiles, ) reaction.substrate.connect(substrate) - + for i in product_ids: smiles = self.get_smiles_from_chebi_web(i) - chebi_id = i.split('_')[-1] + chebi_id = i.split("_")[-1] chebi_id = f"CHEBI:{chebi_id}" product = Molecule.get_or_save( chebi_id=chebi_id, - smiles = smiles, + smiles=smiles, ) reaction.product.connect(product) diff --git a/src/pyeed/analysis/network_analysis.py b/src/pyeed/analysis/network_analysis.py index fd354ebe..dd66b45c 100644 --- a/src/pyeed/analysis/network_analysis.py +++ b/src/pyeed/analysis/network_analysis.py @@ -66,57 +66,51 @@ def create_graph( base_query = """ MATCH (n) """ - + # Add node filters node_filters = [] if nodes: node_filters.append("labels(n)[0] IN $node_types") if ids: node_filters.append("n.accession_id IN $accession_ids") - + if node_filters: base_query += "WHERE " + " AND ".join(node_filters) - + # Add relationship pattern and filters base_query += """ OPTIONAL MATCH (n)-[r]->(m) """ - + # Add relationship type filter if specified if relationships: base_query += "WHERE type(r) IN $relationships " - + # Return both nodes and relationships in a single query base_query += """ RETURN collect(DISTINCT {id: ID(n), labels: labels(n), properties: properties(n)}) as nodes, collect(DISTINCT {source: ID(n), target: ID(m), type: type(r), properties: properties(r)}) as relationships """ - + logger.info("Executing combined query for nodes and relationships") results = self.db.execute_read( base_query, - { - "node_types": nodes, - "accession_ids": ids, - "relationships": relationships - } + {"node_types": nodes, "accession_ids": ids, "relationships": relationships}, ) - + if not results or not results[0]: logger.warning("No results found in the database") return self.graph - + # Process nodes nodes_data = results[0]["nodes"] for node in nodes_data: self.graph.add_node( - node["id"], - labels=node["labels"], - properties=node["properties"] + node["id"], labels=node["labels"], properties=node["properties"] ) logger.info(f"Added {len(nodes_data)} nodes to the graph") - + # Process relationships relationships_data = results[0]["relationships"] for rel in relationships_data: @@ 
-125,10 +119,10 @@ def create_graph( rel["source"], rel["target"], type=rel["type"], - properties=rel["properties"] + properties=rel["properties"], ) logger.info(f"Added {len(relationships_data)} relationships to the graph") - + return self.graph def compute_degree_centrality(self) -> dict[Any, float]: @@ -263,8 +257,8 @@ def calculate_positions_2d( filtered_graph.remove_edges_from(self_referential_edges) # Find isolated nodes - #isolated_nodes = self.find_isolated_nodes(filtered_graph) - #filtered_graph.remove_nodes_from(isolated_nodes) + # isolated_nodes = self.find_isolated_nodes(filtered_graph) + # filtered_graph.remove_nodes_from(isolated_nodes) # Use spring layout for force-directed graph weight_attr = attribute if attribute is not None else None diff --git a/src/pyeed/analysis/sequence_alignment.py b/src/pyeed/analysis/sequence_alignment.py index 9255d55f..d57c5e63 100644 --- a/src/pyeed/analysis/sequence_alignment.py +++ b/src/pyeed/analysis/sequence_alignment.py @@ -141,12 +141,17 @@ def align_multipairwise( MATCH (p1:Protein)-[:PAIRWISE_ALIGNED]->(p2:Protein) RETURN p1.accession_id AS Protein1_ID, p2.accession_id AS Protein2_ID """ - + # Fetch results properly as a list of tuples - existing_pairs = set(tuple(sorted((row["Protein1_ID"], row["Protein2_ID"]))) for row in db.execute_write(query)) + existing_pairs = set( + tuple(sorted((row["Protein1_ID"], row["Protein2_ID"]))) + for row in db.execute_write(query) + ) # Filter new pairs that are not in existing_pairs - new_pairs = [pair for pair in pairs if tuple(sorted(pair)) not in existing_pairs] + new_pairs = [ + pair for pair in pairs if tuple(sorted(pair)) not in existing_pairs + ] print(f"Number of existing pairs: {len(existing_pairs)}") print(f"Number of total pairs: {len(pairs)}") @@ -351,4 +356,4 @@ def _get_id_sequence_dict( def _load_substitution_matrix(self) -> "BioSubstitutionMatrix": from Bio.Align import substitution_matrices - return substitution_matrices.load(self.substitution_matrix) # type: ignore \ No newline at end of file + return substitution_matrices.load(self.substitution_matrix) # type: ignore diff --git a/src/pyeed/dbconnect.py b/src/pyeed/dbconnect.py index d208deec..8abcab52 100644 --- a/src/pyeed/dbconnect.py +++ b/src/pyeed/dbconnect.py @@ -227,8 +227,11 @@ def _get_driver(uri: str, user: str | None, password: str | None) -> Driver: Creates a new Neo4j driver instance. """ auth = (user, password) if user and password else None - return GraphDatabase.driver(uri, auth=auth, connection_timeout=60, # Increase initial connection timeout - max_connection_lifetime=86400, # Keep connections alive longer + return GraphDatabase.driver( + uri, + auth=auth, + connection_timeout=60, # Increase initial connection timeout + max_connection_lifetime=86400, # Keep connections alive longer ) @property diff --git a/src/pyeed/embedding.py b/src/pyeed/embedding.py index 1b0d4955..a0229385 100644 --- a/src/pyeed/embedding.py +++ b/src/pyeed/embedding.py @@ -31,6 +31,7 @@ def get_hf_token() -> str: else: raise RuntimeError("Failed to get Hugging Face token") + def process_batches_on_gpu(data, batch_size, model, tokenizer, device, db): """ Splits data into batches and processes them on a single GPU. 
@@ -64,15 +65,21 @@ def process_batches_on_gpu(data, batch_size, model, tokenizer, device, db): ) # Update the database - update_protein_embeddings_in_db(db, list(accessions[:current_batch_size]), embeddings_batch) + update_protein_embeddings_in_db( + db, list(accessions[:current_batch_size]), embeddings_batch + ) # Move to the next batch break # Successful execution, move to the next batch except torch.cuda.OutOfMemoryError: torch.cuda.empty_cache() - current_batch_size = max(1, current_batch_size // 2) # Reduce batch size - logger.warning(f"Reduced batch size to {current_batch_size} due to OOM error.") + current_batch_size = max( + 1, current_batch_size // 2 + ) # Reduce batch size + logger.warning( + f"Reduced batch size to {current_batch_size} due to OOM error." + ) # Free memory del model @@ -82,7 +89,7 @@ def process_batches_on_gpu(data, batch_size, model, tokenizer, device, db): def load_model_and_tokenizer( model_name: str, device: str, - ) -> Tuple[Any, Union[Any, None], str]: +) -> Tuple[Any, Union[Any, None], str]: """ Loads the model and assigns it to a specific GPU. diff --git a/src/pyeed/main.py b/src/pyeed/main.py index 7effe1f1..5ba41d0d 100644 --- a/src/pyeed/main.py +++ b/src/pyeed/main.py @@ -190,8 +190,7 @@ def fetch_ncbi_nucleotide(self, ids: list[str]) -> None: # Fix: apply nest_asyncio and then run the coroutine with the event loop nest_asyncio.apply() asyncio.get_event_loop().run_until_complete(adapter.execute_requests()) - - + def database_id_mapper(self, ids: list[str], file: str) -> None: """ Maps IDs from one database to another using the UniProt ID mapping service @@ -202,7 +201,7 @@ def database_id_mapper(self, ids: list[str], file: str) -> None: mapper = NCBIToUniprotMapper(ids, file) mapper.execute_request() - + nest_asyncio.apply() def calculate_sequence_embeddings( @@ -210,9 +209,9 @@ def calculate_sequence_embeddings( batch_size: int = 16, model_name: str = "facebook/esm2_t33_650M_UR50D", num_gpus: int = None, # Number of GPUs to use - ) -> None: + ) -> None: """ - Calculates embeddings for all sequences in the database that do not have embeddings, + Calculates embeddings for all sequences in the database that do not have embeddings, distributing the workload across available GPUs. 
Args: @@ -243,18 +242,19 @@ def calculate_sequence_embeddings( """ results = self.db.execute_read(query) data = [(result["accession"], result["sequence"]) for result in results] - + if not data: logger.info("No sequences to process.") return - + accessions, sequences = zip(*data) total_sequences = len(sequences) logger.debug(f"Total sequences to process: {total_sequences}") # Split the data into num_gpus chunks gpu_batches = [ - list(zip(accessions[i::num_gpus], sequences[i::num_gpus])) for i in range(num_gpus) + list(zip(accessions[i::num_gpus], sequences[i::num_gpus])) + for i in range(num_gpus) ] start_time = time.time() @@ -275,16 +275,17 @@ def calculate_sequence_embeddings( model, tokenizer, device, - self.db + self.db, ) ) - + for future in futures: future.result() # Wait for all threads to complete - end_time = time.time() - logger.info(f"Total embedding calculation time: {end_time - start_time:.2f} seconds") + logger.info( + f"Total embedding calculation time: {end_time - start_time:.2f} seconds" + ) # Cleanup for model, _, _ in models_and_tokenizers: diff --git a/src/pyeed/model.py b/src/pyeed/model.py index 19d83e8b..c7a193b7 100644 --- a/src/pyeed/model.py +++ b/src/pyeed/model.py @@ -112,13 +112,12 @@ def save(self, *args: Any, **kwargs: Any) -> None: elif isinstance(base_property, FloatProperty): if not all(isinstance(item, float) for item in prop): raise TypeError(f"All items in '{field}' must be floats") - - #Validate BoleanProperty + + # Validate BoleanProperty elif isinstance(neo_type, BooleanProperty) and not isinstance(prop, bool): raise TypeError( f"Expected a boolean for '{field}', got {type(prop).__name__}" ) - super().save(*args, **kwargs) # Don't return the result @@ -153,7 +152,7 @@ class Annotation(Enum): class Organism(StrictStructuredNode): taxonomy_id = IntegerProperty(required=True, unique_index=True) name = StringProperty() - + @classmethod def get_or_save(cls, taxonomy_id, name) -> "Organism": try: @@ -395,25 +394,25 @@ class Reaction(StrictStructuredNode): """ A node representing a reaction. """ - + rhea_id = StringProperty(unique_index=True, required=True) chebi_id = ArrayProperty(StringProperty()) # Relationships substrate = RelationshipTo("Molecule", "SUBSTRATE") product = RelationshipTo("Molecule", "PRODUCT") - - + @property def label(self) -> str: """The label of the reaction.""" return {self.rhea_id} + class Molecule(StrictStructuredNode): """ A node representing a molecule in the database. 
""" - + chebi_id = StringProperty(unique_index=True, required=True) rhea_compound_id = StringProperty() smiles = StringProperty() @@ -431,13 +430,13 @@ def get_or_save(cls, chebi_id, smiles) -> "Molecule": except Exception as e: print(f"Error during saving of the molecule: {e}") raise - - @property + + @property def label(self) -> str: """The label of the molecule.""" return {self.chebi_id} - - + + class StandardNumbering(StrictStructuredNode): name = StringProperty(required=True, unique_index=True) definition = StringProperty(required=True) From 9b7f28b4b0649a37e654d604ad3a479bcb555e18 Mon Sep 17 00:00:00 2001 From: alacheim Date: Wed, 30 Apr 2025 12:51:26 +0000 Subject: [PATCH 10/19] fixing mypy errors --- pyproject.toml | 1 + src/pyeed/adapter/ncbi_to_uniprot_mapper.py | 2 +- src/pyeed/model.py | 18 +++++++++++------- 3 files changed, 13 insertions(+), 8 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 2f81fd56..948c493b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -39,6 +39,7 @@ absl-py = "1.0.0" crc64iso = "0.0.2" SPARQLWrapper = "2.0.0" pysam = "0.23.0" +types-requests = "2.32.0" [tool.poetry.group.dev.dependencies] mkdocstrings = {extras = ["python"], version = "^0.26.2"} diff --git a/src/pyeed/adapter/ncbi_to_uniprot_mapper.py b/src/pyeed/adapter/ncbi_to_uniprot_mapper.py index 2f711e16..1373547a 100644 --- a/src/pyeed/adapter/ncbi_to_uniprot_mapper.py +++ b/src/pyeed/adapter/ncbi_to_uniprot_mapper.py @@ -62,7 +62,7 @@ def get_checksum(self, refseq_id: str) -> str: self.download_fasta(refseq_id) fa = FastaFile(f"{refseq_id}.fasta") seq = fa.fetch(fa.references[0]) - return crc64iso.crc64(seq) + return f"{crc64iso.crc64(seq)}" def checksum_list(self, refseq_ids: List[str]) -> List[str]: """Creates a list of checksum IDs and deletes the FASTA files after processing. 
diff --git a/src/pyeed/model.py b/src/pyeed/model.py index c7a193b7..aa498aee 100644 --- a/src/pyeed/model.py +++ b/src/pyeed/model.py @@ -1,5 +1,5 @@ from enum import Enum -from typing import Any +from typing import Any, cast # from pyeed.nodes_and_relations import StrictStructuredNode from neomodel import ( @@ -154,9 +154,11 @@ class Organism(StrictStructuredNode): name = StringProperty() @classmethod - def get_or_save(cls, taxonomy_id, name) -> "Organism": + def get_or_save(cls, **kwargs: Any) -> "Organism": + taxonomy_id = kwargs.get("taxonomy_id") + name = kwargs.get("name") try: - organism = cls.nodes.get(taxonomy_id=taxonomy_id) + organism = cast(Organism, cls.nodes.get(taxonomy_id=taxonomy_id)) return organism except cls.DoesNotExist: try: @@ -405,7 +407,7 @@ class Reaction(StrictStructuredNode): @property def label(self) -> str: """The label of the reaction.""" - return {self.rhea_id} + return f"{self.rhea_id}" class Molecule(StrictStructuredNode): @@ -418,9 +420,11 @@ class Molecule(StrictStructuredNode): smiles = StringProperty() @classmethod - def get_or_save(cls, chebi_id, smiles) -> "Molecule": + def get_or_save(cls, **kwargs:Any) -> "Molecule": + chebi_id = kwargs.get("chebi_id") + smiles = kwargs.get("smiles") try: - molecule = cls.nodes.get(chebi_id=chebi_id) + molecule = cast(Molecule, cls.nodes.get(chebi_id=chebi_id)) return molecule except cls.DoesNotExist: try: @@ -434,7 +438,7 @@ def get_or_save(cls, chebi_id, smiles) -> "Molecule": @property def label(self) -> str: """The label of the molecule.""" - return {self.chebi_id} + return f"{self.chebi_id}" class StandardNumbering(StrictStructuredNode): From cf742d77f52e9533e6b0b5813a70ac29d0756ef8 Mon Sep 17 00:00:00 2001 From: alacheim Date: Wed, 30 Apr 2025 14:02:52 +0000 Subject: [PATCH 11/19] fixing mypy errors --- src/pyeed/adapter/uniprot_mapper.py | 29 ++++++++++++++++++++--------- src/pyeed/embedding.py | 28 ++++++++++++++++++++-------- src/pyeed/main.py | 11 +++++++---- 3 files changed, 47 insertions(+), 21 deletions(-) diff --git a/src/pyeed/adapter/uniprot_mapper.py b/src/pyeed/adapter/uniprot_mapper.py index 36ed3577..7f26893e 100644 --- a/src/pyeed/adapter/uniprot_mapper.py +++ b/src/pyeed/adapter/uniprot_mapper.py @@ -1,9 +1,9 @@ import json from collections import defaultdict -from typing import Any, List +from typing import Any, List, Optional import requests -from bs4 import BeautifulSoup +from bs4 import BeautifulSoup, Tag from httpx import Response from loguru import logger from SPARQLWrapper import JSON, SPARQLWrapper @@ -120,7 +120,11 @@ def get_substrates_and_products_from_rhea( sparql.setReturnFormat(JSON) sparql.addCustomHttpHeader("User-Agent", "MyPythonClient/1.0") - results = sparql.query().convert() + results_raw = sparql.query().convert() + if not isinstance(results_raw, dict): + raise TypeError("Expected dict from SPARQL query") + + results: dict[str, Any] = results_raw substrates = set() products = set() @@ -138,7 +142,7 @@ def get_substrates_and_products_from_rhea( return {"substrates": sorted(substrates), "products": sorted(products)} - def get_smiles_from_chebi_web(self, chebi_url: str) -> str: + def get_smiles_from_chebi_web(self, chebi_url: str) -> Optional[str]: """ Extract SMILES from the official ChEBI page using HTML scraping. 
""" @@ -150,11 +154,17 @@ def get_smiles_from_chebi_web(self, chebi_url: str) -> str: # Look for table rows that contain the SMILES label for table in soup.find_all("table", class_="chebiTableContent"): + if not isinstance(table, Tag): + continue for row in table.find_all("tr"): + if not isinstance(row, Tag): + continue headers = row.find_all("td", class_="chebiDataHeader") - if headers and "SMILES" in headers[0].text: - data_cell = row.find_all("td")[-1] # Get the last in row - return data_cell.text.strip() + if headers and isinstance(headers[0], Tag) and "SMILES" in headers[0].text: + data_cells = row.find_all("td") + if data_cells: + return f"{data_cells[-1].text.strip()}" + return None def add_reaction(self, record: dict[str, Any], protein: Protein) -> None: for reference in record.get("comments", []): # Safe retrieval with .get() @@ -169,8 +179,9 @@ def add_reaction(self, record: dict[str, Any], protein: Protein) -> None: catalytic_annotation = Reaction.get_or_save( rhea_id=rhea_id, ) - self.add_molecule(rhea_id, catalytic_annotation) - protein.reaction.connect(catalytic_annotation) + if rhea_id is not None: + self.add_molecule(rhea_id, catalytic_annotation) + protein.reaction.connect(catalytic_annotation) def add_molecule(self, rhea_id: str, reaction: Reaction) -> None: chebi = self.get_substrates_and_products_from_rhea(rhea_id) diff --git a/src/pyeed/embedding.py b/src/pyeed/embedding.py index a0229385..05895dfc 100644 --- a/src/pyeed/embedding.py +++ b/src/pyeed/embedding.py @@ -11,6 +11,7 @@ from loguru import logger from numpy.typing import NDArray from transformers import EsmModel, EsmTokenizer +from torch.nn import DataParallel, Module from pyeed.dbconnect import DatabaseConnector @@ -32,7 +33,14 @@ def get_hf_token() -> str: raise RuntimeError("Failed to get Hugging Face token") -def process_batches_on_gpu(data, batch_size, model, tokenizer, device, db): +def process_batches_on_gpu( + data: list[tuple[str, str]], + batch_size: int, + model:Module, + tokenizer: EsmTokenizer, + db:DatabaseConnector, + device:torch.device, + ) -> None: """ Splits data into batches and processes them on a single GPU. @@ -88,8 +96,8 @@ def process_batches_on_gpu(data, batch_size, model, tokenizer, device, db): def load_model_and_tokenizer( model_name: str, - device: str, -) -> Tuple[Any, Union[Any, None], str]: + device:torch.device, +) -> Tuple[Any, Union[Any, None], torch.device]: """ Loads the model and assigns it to a specific GPU. @@ -125,7 +133,7 @@ def get_batch_embeddings( model: Union[ EsmModel, ESMC, - torch.nn.DataParallel, + DataParallel[Module], ESM3InferenceClient, ESM3, ], @@ -209,7 +217,9 @@ def get_batch_embeddings( def calculate_single_sequence_embedding_last_hidden_state( - sequence: str, model_name: str = "facebook/esm2_t33_650M_UR50D" + sequence: str, + device: torch.device, + model_name: str = "facebook/esm2_t33_650M_UR50D", ) -> NDArray[np.float64]: """ Calculates an embedding for a single sequence. 
@@ -221,12 +231,14 @@ def calculate_single_sequence_embedding_last_hidden_state( Returns: NDArray[np.float64]: Normalized embedding vector for the sequence """ - model, tokenizer, device = load_model_and_tokenizer(model_name) + model, tokenizer, device = load_model_and_tokenizer(model_name, device) return get_single_embedding_last_hidden_state(sequence, model, tokenizer, device) def calculate_single_sequence_embedding_all_layers( - sequence: str, model_name: str = "facebook/esm2_t33_650M_UR50D" + sequence: str, + device: torch.device, + model_name: str = "facebook/esm2_t33_650M_UR50D", ) -> NDArray[np.float64]: """ Calculates embeddings for a single sequence across all layers. @@ -238,7 +250,7 @@ def calculate_single_sequence_embedding_all_layers( Returns: NDArray[np.float64]: A numpy array containing layer embeddings for the sequence. """ - model, tokenizer, device = load_model_and_tokenizer(model_name) + model, tokenizer, device = load_model_and_tokenizer(model_name, device) return get_single_embedding_all_layers(sequence, model, tokenizer, device) diff --git a/src/pyeed/main.py b/src/pyeed/main.py index 5ba41d0d..4dee81e0 100644 --- a/src/pyeed/main.py +++ b/src/pyeed/main.py @@ -208,7 +208,7 @@ def calculate_sequence_embeddings( self, batch_size: int = 16, model_name: str = "facebook/esm2_t33_650M_UR50D", - num_gpus: int = None, # Number of GPUs to use + num_gpus: int = 1, # Number of GPUs to use ) -> None: """ Calculates embeddings for all sequences in the database that do not have embeddings, @@ -229,9 +229,12 @@ def calculate_sequence_embeddings( logger.warning("No GPU available! Running on CPU.") # Load separate models for each GPU - devices = [f"cuda:{i}" for i in range(num_gpus)] if num_gpus > 0 else ["cpu"] + devices = [ + torch.device(f"cuda:{i}") for i in range(num_gpus) + ] if num_gpus > 0 else [torch.device("cpu")] + models_and_tokenizers = [ - load_model_and_tokenizer(model_name, device) for device in devices + load_model_and_tokenizer(model_name, device) for device in devices ] # Retrieve sequences without embeddings @@ -274,8 +277,8 @@ def calculate_sequence_embeddings( batch_size, model, tokenizer, - device, self.db, + device, ) ) From f073cd6e03f61b7e3a4a65e946d8ae0f2ff23bdd Mon Sep 17 00:00:00 2001 From: alacheim Date: Wed, 30 Apr 2025 14:11:28 +0000 Subject: [PATCH 12/19] fixing ruff error --- src/pyeed/embedding.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pyeed/embedding.py b/src/pyeed/embedding.py index 05895dfc..5686ca36 100644 --- a/src/pyeed/embedding.py +++ b/src/pyeed/embedding.py @@ -10,8 +10,8 @@ from huggingface_hub import HfFolder, login from loguru import logger from numpy.typing import NDArray -from transformers import EsmModel, EsmTokenizer from torch.nn import DataParallel, Module +from transformers import EsmModel, EsmTokenizer from pyeed.dbconnect import DatabaseConnector From 3ec0368351652eba4fbc18ddc818460c2e4381e4 Mon Sep 17 00:00:00 2001 From: alacheim Date: Wed, 30 Apr 2025 14:13:18 +0000 Subject: [PATCH 13/19] fixing ruff error --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 948c493b..b9897071 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -39,7 +39,7 @@ absl-py = "1.0.0" crc64iso = "0.0.2" SPARQLWrapper = "2.0.0" pysam = "0.23.0" -types-requests = "2.32.0" +types-requests = "2.32.0.20250328" [tool.poetry.group.dev.dependencies] mkdocstrings = {extras = ["python"], version = "^0.26.2"} From 37afe5c31421a124bcbb5a15fc3cb1bc3b3e321c Mon Sep 
17 00:00:00 2001 From: alacheim Date: Wed, 30 Apr 2025 14:17:37 +0000 Subject: [PATCH 14/19] trigger pipeline --- src/pyeed/adapter/ncbi_to_uniprot_mapper.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/pyeed/adapter/ncbi_to_uniprot_mapper.py b/src/pyeed/adapter/ncbi_to_uniprot_mapper.py index 1373547a..78b493a5 100644 --- a/src/pyeed/adapter/ncbi_to_uniprot_mapper.py +++ b/src/pyeed/adapter/ncbi_to_uniprot_mapper.py @@ -10,7 +10,6 @@ logger = logging.getLogger(__name__) - class NCBIToUniprotMapper: def __init__(self, ids: List[str], file: str): self.ids = ids From d84c74e7afabf2ef518220b84ade1b2dfca090cc Mon Sep 17 00:00:00 2001 From: alacheim Date: Wed, 30 Apr 2025 14:23:10 +0000 Subject: [PATCH 15/19] formated with ruff --- src/pyeed/adapter/ncbi_to_uniprot_mapper.py | 1 + src/pyeed/adapter/uniprot_mapper.py | 6 +++++- src/pyeed/embedding.py | 20 ++++++++++---------- src/pyeed/main.py | 10 ++++++---- src/pyeed/model.py | 2 +- 5 files changed, 23 insertions(+), 16 deletions(-) diff --git a/src/pyeed/adapter/ncbi_to_uniprot_mapper.py b/src/pyeed/adapter/ncbi_to_uniprot_mapper.py index 78b493a5..1373547a 100644 --- a/src/pyeed/adapter/ncbi_to_uniprot_mapper.py +++ b/src/pyeed/adapter/ncbi_to_uniprot_mapper.py @@ -10,6 +10,7 @@ logger = logging.getLogger(__name__) + class NCBIToUniprotMapper: def __init__(self, ids: List[str], file: str): self.ids = ids diff --git a/src/pyeed/adapter/uniprot_mapper.py b/src/pyeed/adapter/uniprot_mapper.py index 7f26893e..f52d01b3 100644 --- a/src/pyeed/adapter/uniprot_mapper.py +++ b/src/pyeed/adapter/uniprot_mapper.py @@ -160,7 +160,11 @@ def get_smiles_from_chebi_web(self, chebi_url: str) -> Optional[str]: if not isinstance(row, Tag): continue headers = row.find_all("td", class_="chebiDataHeader") - if headers and isinstance(headers[0], Tag) and "SMILES" in headers[0].text: + if ( + headers + and isinstance(headers[0], Tag) + and "SMILES" in headers[0].text + ): data_cells = row.find_all("td") if data_cells: return f"{data_cells[-1].text.strip()}" diff --git a/src/pyeed/embedding.py b/src/pyeed/embedding.py index 5686ca36..28f66a1b 100644 --- a/src/pyeed/embedding.py +++ b/src/pyeed/embedding.py @@ -34,13 +34,13 @@ def get_hf_token() -> str: def process_batches_on_gpu( - data: list[tuple[str, str]], - batch_size: int, - model:Module, - tokenizer: EsmTokenizer, - db:DatabaseConnector, - device:torch.device, - ) -> None: + data: list[tuple[str, str]], + batch_size: int, + model: Module, + tokenizer: EsmTokenizer, + db: DatabaseConnector, + device: torch.device, +) -> None: """ Splits data into batches and processes them on a single GPU. @@ -96,7 +96,7 @@ def process_batches_on_gpu( def load_model_and_tokenizer( model_name: str, - device:torch.device, + device: torch.device, ) -> Tuple[Any, Union[Any, None], torch.device]: """ Loads the model and assigns it to a specific GPU. 
@@ -217,7 +217,7 @@ def get_batch_embeddings( def calculate_single_sequence_embedding_last_hidden_state( - sequence: str, + sequence: str, device: torch.device, model_name: str = "facebook/esm2_t33_650M_UR50D", ) -> NDArray[np.float64]: @@ -236,7 +236,7 @@ def calculate_single_sequence_embedding_last_hidden_state( def calculate_single_sequence_embedding_all_layers( - sequence: str, + sequence: str, device: torch.device, model_name: str = "facebook/esm2_t33_650M_UR50D", ) -> NDArray[np.float64]: diff --git a/src/pyeed/main.py b/src/pyeed/main.py index 4dee81e0..1189fcb3 100644 --- a/src/pyeed/main.py +++ b/src/pyeed/main.py @@ -229,12 +229,14 @@ def calculate_sequence_embeddings( logger.warning("No GPU available! Running on CPU.") # Load separate models for each GPU - devices = [ - torch.device(f"cuda:{i}") for i in range(num_gpus) - ] if num_gpus > 0 else [torch.device("cpu")] + devices = ( + [torch.device(f"cuda:{i}") for i in range(num_gpus)] + if num_gpus > 0 + else [torch.device("cpu")] + ) models_and_tokenizers = [ - load_model_and_tokenizer(model_name, device) for device in devices + load_model_and_tokenizer(model_name, device) for device in devices ] # Retrieve sequences without embeddings diff --git a/src/pyeed/model.py b/src/pyeed/model.py index aa498aee..5a3bf188 100644 --- a/src/pyeed/model.py +++ b/src/pyeed/model.py @@ -420,7 +420,7 @@ class Molecule(StrictStructuredNode): smiles = StringProperty() @classmethod - def get_or_save(cls, **kwargs:Any) -> "Molecule": + def get_or_save(cls, **kwargs: Any) -> "Molecule": chebi_id = kwargs.get("chebi_id") smiles = kwargs.get("smiles") try: From 014dcdc220e309205bfb7cd9982f20f5750bd06d Mon Sep 17 00:00:00 2001 From: Niklas Abraham GPU Date: Thu, 1 May 2025 13:48:08 +0000 Subject: [PATCH 16/19] fixed linter issue in sequence alignment --- src/pyeed/analysis/sequence_alignment.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/src/pyeed/analysis/sequence_alignment.py b/src/pyeed/analysis/sequence_alignment.py index d57c5e63..9b68818e 100644 --- a/src/pyeed/analysis/sequence_alignment.py +++ b/src/pyeed/analysis/sequence_alignment.py @@ -5,9 +5,10 @@ from Bio.Align import PairwiseAligner as BioPairwiseAligner from Bio.Align.substitution_matrices import Array as BioSubstitutionMatrix from joblib import Parallel, cpu_count, delayed +from rich.progress import Progress + from pyeed.dbconnect import DatabaseConnector from pyeed.tools.utility import chunks -from rich.progress import Progress class PairwiseAligner: @@ -142,11 +143,13 @@ def align_multipairwise( RETURN p1.accession_id AS Protein1_ID, p2.accession_id AS Protein2_ID """ - # Fetch results properly as a list of tuples - existing_pairs = set( - tuple(sorted((row["Protein1_ID"], row["Protein2_ID"]))) - for row in db.execute_write(query) - ) + if db is not None: + existing_pairs = set( + tuple(sorted((row["Protein1_ID"], row["Protein2_ID"]))) + for row in db.execute_write(query) + ) + else: + existing_pairs = set() # Filter new pairs that are not in existing_pairs new_pairs = [ From a435764e26f07ca9094bf016f44b1676cccbd732 Mon Sep 17 00:00:00 2001 From: Niklas Abraham GPU Date: Thu, 1 May 2025 14:17:08 +0000 Subject: [PATCH 17/19] fixed ruff files --- docs/usage/blast.ipynb | 723 ++++++++++++----------- docs/usage/clustalo.ipynb | 338 +++++------ docs/usage/embeddings_analysis.ipynb | 5 +- docs/usage/mmseqs.ipynb | 3 +- docs/usage/mutation_analysis.ipynb | 1 + docs/usage/network_analysis.ipynb | 1 + docs/usage/standard_numbering.ipynb | 2 +- 
src/pyeed/analysis/embedding_analysis.py | 3 +- src/pyeed/analysis/network_analysis.py | 1 + src/pyeed/analysis/ontology_loading.py | 3 +- src/pyeed/analysis/standard_numbering.py | 1 + tests/unit/test_dbchat.py | 1 + 12 files changed, 545 insertions(+), 537 deletions(-) diff --git a/docs/usage/blast.ipynb b/docs/usage/blast.ipynb index b56140d7..d6cd57ef 100644 --- a/docs/usage/blast.ipynb +++ b/docs/usage/blast.ipynb @@ -1,363 +1,364 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# BLAST Search\n", - "\n", - "## Setup\n", - "\n", - "The BLAST service runs in a Docker container and requires:\n", - "1. A local BLAST database\n", - "2. The Docker service running" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "# change log level to INFO\n", - "import sys\n", - "from loguru import logger\n", - "\n", - "logger.remove()\n", - "level = logger.add(sys.stderr, level=\"WARNING\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Basic Usage\n", - "\n", - "The `Blast` class provides an interface to search protein or nucleotide sequences against a local BLAST database." - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
subject_ididentityalignment_lengthmismatchesgap_opensquery_startquery_endsubject_startsubject_endevaluebit_score
0seq781.8182231315111320.00322.3
1seq1100.00025001251250.00422.3
2seq261.5382610020455300.03819.2
\n", - "
" - ], - "text/plain": [ - " subject_id identity alignment_length mismatches gap_opens query_start \\\n", - "0 seq7 81.818 22 3 1 31 \n", - "1 seq1 100.000 25 0 0 1 \n", - "2 seq2 61.538 26 10 0 20 \n", - "\n", - " query_end subject_start subject_end evalue bit_score \n", - "0 51 11 32 0.003 22.3 \n", - "1 25 1 25 0.004 22.3 \n", - "2 45 5 30 0.038 19.2 " - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from pyeed.tools import Blast\n", - "\n", - "# Example protein sequence\n", - "sequence = \"MSEQVAAVAKLRAKASEAAKEAKAREAAKKLAEAAKKAKAKEAAKRAEAKLAEKAKAAKRAEAKAAKEAKRAAAKRAEAKLAEKAKAAK\"\n", - "\n", - "# Initialize BLAST search\n", - "blast = Blast(\n", - " # service_url=\"http://localhost:6001/blast\",\n", - " mode=\"blastp\", # Use blastp for protein sequences\n", - " db_path=\"/usr/local/bin/data/test_db\", # Path in Docker container\n", - " db_name=\"protein_db\", # Name of your BLAST database\n", - " evalue=0.1, # E-value threshold\n", - " max_target_seqs=10, # Maximum number of hits to return\n", - ")\n", - "\n", - "# Perform search\n", - "results = blast.search(sequence)\n", - "results" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The results are returned as a pandas DataFrame with the following columns:\n", - "- subject_id: ID of the matched sequence\n", - "- identity: Percentage identity\n", - "- alignment_length: Length of the alignment\n", - "- mismatches: Number of mismatches\n", - "- gap_opens: Number of gap openings\n", - "- query_start/end: Start/end positions in query sequence\n", - "- subject_start/end: Start/end positions in subject sequence\n", - "- evalue: Expectation value\n", - "- bit_score: Bit score" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Creating a BLAST Database\n", - "\n", - "Before using BLAST, you need to create a local database. Here's how to create one from a FASTA file:" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "```bash\n", - "# For protein sequences\n", - "makeblastdb -in proteins.fasta -dbtype prot -out blast_db/my_proteins\n", - "\n", - "# For nucleotide sequences\n", - "makeblastdb -in nucleotides.fasta -dbtype nucl -out blast_db/my_nucleotides\n", - "```\n", - "\n", - "To access the BLAST Docker container shell and create databases:\n", - "\n", - "```bash\n", - "# Enter the BLAST container shell\n", - "docker compose exec blast bash\n", - "# \n", - "# Navigate to database directory\n", - "cd /usr/local/bin/data/blast_db\n", - "# \n", - "# Create protein database\n", - "makeblastdb -in proteins.fasta -dbtype prot -out my_proteins\n", - "# \n", - "# Create nucleotide database \n", - "makeblastdb -in nucleotides.fasta -dbtype nucl -out my_nucleotides\n", - "```\n", - "Make sure your FASTA files are mounted in the container's `/usr/local/bin/data/blast_db` directory.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Advanced Usage\n", - "\n", - "You can customize the BLAST search parameters:" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
subject_ididentityalignment_lengthmismatchesgap_opensquery_startquery_endsubject_startsubject_endevaluebit_score
0seq781.8182231315111320.00322.3
1seq1100.00025001251250.00422.3
\n", - "
" - ], - "text/plain": [ - " subject_id identity alignment_length mismatches gap_opens query_start \\\n", - "0 seq7 81.818 22 3 1 31 \n", - "1 seq1 100.000 25 0 0 1 \n", - "\n", - " query_end subject_start subject_end evalue bit_score \n", - "0 51 11 32 0.003 22.3 \n", - "1 25 1 25 0.004 22.3 " - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Configure BLAST for sensitive protein search\n", - "blast = Blast(\n", - " # service_url=\"http://localhost:6001/blast\",\n", - " mode=\"blastp\",\n", - " db_path=\"/usr/local/bin/data/test_db\",\n", - " db_name=\"protein_db\",\n", - " evalue=1e-1, # More stringent E-value\n", - " max_target_seqs=100, # Return more hits\n", - " num_threads=4, # Use 4 CPU threads\n", - ")\n", - "\n", - "# Search with longer timeout\n", - "results = blast.search(sequence, timeout=7200) # 2 hour timeout\n", - "\n", - "# Filter results\n", - "significant_hits = results[results[\"identity\"] > 80] # Only hits with >90% identity\n", - "significant_hits" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Thereafter, the ids of the hits can be added to the pyeed database, using the `fetch_from_primary_db` function." - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "pyeed", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.5" - } - }, - "nbformat": 4, - "nbformat_minor": 2 + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# BLAST Search\n", + "\n", + "## Setup\n", + "\n", + "The BLAST service runs in a Docker container and requires:\n", + "1. A local BLAST database\n", + "2. The Docker service running" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "# change log level to INFO\n", + "import sys\n", + "\n", + "from loguru import logger\n", + "\n", + "logger.remove()\n", + "level = logger.add(sys.stderr, level=\"WARNING\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Basic Usage\n", + "\n", + "The `Blast` class provides an interface to search protein or nucleotide sequences against a local BLAST database." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
subject_ididentityalignment_lengthmismatchesgap_opensquery_startquery_endsubject_startsubject_endevaluebit_score
0seq781.8182231315111320.00322.3
1seq1100.00025001251250.00422.3
2seq261.5382610020455300.03819.2
\n", + "
" + ], + "text/plain": [ + " subject_id identity alignment_length mismatches gap_opens query_start \\\n", + "0 seq7 81.818 22 3 1 31 \n", + "1 seq1 100.000 25 0 0 1 \n", + "2 seq2 61.538 26 10 0 20 \n", + "\n", + " query_end subject_start subject_end evalue bit_score \n", + "0 51 11 32 0.003 22.3 \n", + "1 25 1 25 0.004 22.3 \n", + "2 45 5 30 0.038 19.2 " + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from pyeed.tools import Blast\n", + "\n", + "# Example protein sequence\n", + "sequence = \"MSEQVAAVAKLRAKASEAAKEAKAREAAKKLAEAAKKAKAKEAAKRAEAKLAEKAKAAKRAEAKAAKEAKRAAAKRAEAKLAEKAKAAK\"\n", + "\n", + "# Initialize BLAST search\n", + "blast = Blast(\n", + " # service_url=\"http://localhost:6001/blast\",\n", + " mode=\"blastp\", # Use blastp for protein sequences\n", + " db_path=\"/usr/local/bin/data/test_db\", # Path in Docker container\n", + " db_name=\"protein_db\", # Name of your BLAST database\n", + " evalue=0.1, # E-value threshold\n", + " max_target_seqs=10, # Maximum number of hits to return\n", + ")\n", + "\n", + "# Perform search\n", + "results = blast.search(sequence)\n", + "results" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The results are returned as a pandas DataFrame with the following columns:\n", + "- subject_id: ID of the matched sequence\n", + "- identity: Percentage identity\n", + "- alignment_length: Length of the alignment\n", + "- mismatches: Number of mismatches\n", + "- gap_opens: Number of gap openings\n", + "- query_start/end: Start/end positions in query sequence\n", + "- subject_start/end: Start/end positions in subject sequence\n", + "- evalue: Expectation value\n", + "- bit_score: Bit score" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Creating a BLAST Database\n", + "\n", + "Before using BLAST, you need to create a local database. Here's how to create one from a FASTA file:" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "```bash\n", + "# For protein sequences\n", + "makeblastdb -in proteins.fasta -dbtype prot -out blast_db/my_proteins\n", + "\n", + "# For nucleotide sequences\n", + "makeblastdb -in nucleotides.fasta -dbtype nucl -out blast_db/my_nucleotides\n", + "```\n", + "\n", + "To access the BLAST Docker container shell and create databases:\n", + "\n", + "```bash\n", + "# Enter the BLAST container shell\n", + "docker compose exec blast bash\n", + "# \n", + "# Navigate to database directory\n", + "cd /usr/local/bin/data/blast_db\n", + "# \n", + "# Create protein database\n", + "makeblastdb -in proteins.fasta -dbtype prot -out my_proteins\n", + "# \n", + "# Create nucleotide database \n", + "makeblastdb -in nucleotides.fasta -dbtype nucl -out my_nucleotides\n", + "```\n", + "Make sure your FASTA files are mounted in the container's `/usr/local/bin/data/blast_db` directory.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Advanced Usage\n", + "\n", + "You can customize the BLAST search parameters:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
subject_ididentityalignment_lengthmismatchesgap_opensquery_startquery_endsubject_startsubject_endevaluebit_score
0seq781.8182231315111320.00322.3
1seq1100.00025001251250.00422.3
\n", + "
" + ], + "text/plain": [ + " subject_id identity alignment_length mismatches gap_opens query_start \\\n", + "0 seq7 81.818 22 3 1 31 \n", + "1 seq1 100.000 25 0 0 1 \n", + "\n", + " query_end subject_start subject_end evalue bit_score \n", + "0 51 11 32 0.003 22.3 \n", + "1 25 1 25 0.004 22.3 " + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Configure BLAST for sensitive protein search\n", + "blast = Blast(\n", + " # service_url=\"http://localhost:6001/blast\",\n", + " mode=\"blastp\",\n", + " db_path=\"/usr/local/bin/data/test_db\",\n", + " db_name=\"protein_db\",\n", + " evalue=1e-1, # More stringent E-value\n", + " max_target_seqs=100, # Return more hits\n", + " num_threads=4, # Use 4 CPU threads\n", + ")\n", + "\n", + "# Search with longer timeout\n", + "results = blast.search(sequence, timeout=7200) # 2 hour timeout\n", + "\n", + "# Filter results\n", + "significant_hits = results[results[\"identity\"] > 80] # Only hits with >90% identity\n", + "significant_hits" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Thereafter, the ids of the hits can be added to the pyeed database, using the `fetch_from_primary_db` function." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "pyeed", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 } diff --git a/docs/usage/clustalo.ipynb b/docs/usage/clustalo.ipynb index 64ed62ee..d3ba2fba 100644 --- a/docs/usage/clustalo.ipynb +++ b/docs/usage/clustalo.ipynb @@ -1,171 +1,171 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Multiple Sequence Alignment with Clustal Omega\n", - "\n", - "PyEED provides a convenient interface to Clustal Omega for multiple sequence alignment. This notebook demonstrates how to:\n", - "1. Align sequences from a dictionary\n", - "2. 
Align sequences directly from the database" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [], - "source": [ - "from pyeed import Pyeed\n", - "from pyeed.tools.clustalo import ClustalOmega\n", - "\n", - "# change log level to INFO\n", - "import sys\n", - "from loguru import logger\n", - "\n", - "logger.remove()\n", - "level = logger.add(sys.stderr, level=\"INFO\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Direct Sequence Alignment\n", - "\n", - "You can align sequences directly by providing a dictionary of sequences:" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Aligned sequences:\n", - "seq1 AKFVMPDRAWHLYTGNECSKQRLYVWFHDGAPILKTQSDNMGAYRCPLFHVTKNWEI\n", - "seq2 AKFVMPDRQWHLYTGQECSKQRLYVWFHDGAPILKTQSDNMGAYRCPLFHVTKNWEI\n", - "seq3 AKFVMPDRQWHLYTGNECSKQRLYVWFHDGAPILKTQADNMGAYRCALFHVTK----\n" - ] - } - ], - "source": [ - "# Initialize ClustalOmega\n", - "clustalo = ClustalOmega()\n", - "\n", - "# Example sequences\n", - "sequences = {\n", - " \"seq1\": \"AKFVMPDRAWHLYTGNECSKQRLYVWFHDGAPILKTQSDNMGAYRCPLFHVTKNWEI\",\n", - " \"seq2\": \"AKFVMPDRQWHLYTGQECSKQRLYVWFHDGAPILKTQSDNMGAYRCPLFHVTKNWEI\",\n", - " \"seq3\": \"AKFVMPDRQWHLYTGNECSKQRLYVWFHDGAPILKTQADNMGAYRCALFHVTK\",\n", - "}\n", - "\n", - "# Perform alignment\n", - "alignment = clustalo.align(sequences)\n", - "print(\"Aligned sequences:\")\n", - "print(alignment)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Database-based Alignment\n", - "\n", - "You can also align sequences directly from the database by providing a list of accession IDs:" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Pyeed Graph Object Mapping constraints not defined. 
Use _install_labels() to set up model constraints.\n", - "📡 Connected to database.\n", - "Database alignment:\n", - "AAP20891.1 MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDQLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVEYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDRWEPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGAGERGSRGIIAALGPDGKPSRIVVIYTTGSQATMDERNRQIAEIGASLIKHW\n", - "CAJ85677.1 MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDKLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVEYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDRWEPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGAGERGSRGIIAALGPDGKPSRIVVIYTTGSQATMDERNRQIAEIGASLIKHW\n", - "SAQ02853.1 MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDKLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVKYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDRWEPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGASERGSRGIIAALGPDGKPSRIVVIYTTGSQATMDERNRQIAEIGASLIKHW\n", - "CDR98216.1 MSIQHFRVALIPFFAAFCFPVFAHPETLVKVKDAEDQLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVKYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDRWEPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGASERGSRGIIAALGPDGKPSRIVVIYMTGSQATMDERNRQIAEIGASLIKHW\n", - "WP_109963600.1 MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDQLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVEYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDSWEPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGTGKRGSRGIIAALGPDGKPSRIVVIYTTGSQATMDERNRQIAEIGASLIKHW\n", - "CAA41038.1 MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDQLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVKYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDHWEPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGAGERGSRGIIAALGPDGKPSRIVVIYTTGSQATMDERNRQIAEIGASLIKHW\n", - "WP_109874025.1 MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDKLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVEYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDSWEPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGAGERGSRGIIAALGPDGKPSRIVVIYTTGSQATMDERNRQIAEIGASLIKHW\n", - "CAA46344.1 MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDKLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVKYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDSWEPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGASERGSRGIIAALGPDGKPSRIVVIYTTGSQATMDERNRQIAEIGASLIKHW\n", - "APG33178.1 MSIQHFRVALIPFFAAFCFPVFAHPETLVKVKDAEDQLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVKYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDSWEPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGAGERGSRGIIAALGPDGKPSRIVVIYMTGSQATMDERNRQIAEIGASLIKHW\n", - "AKC98298.1 MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDKLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVEYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDHWEPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGAGERGSRGIIAALGPDGKPSRIVVIYTTGSQATMDERNRQIAEIGASLIKHW\n" - ] - } - ], - "source": [ - "# Connect to database\n", - "pyeed = Pyeed(uri=\"bolt://129.69.129.130:7687\", user=\"neo4j\", password=\"12345678\")\n", - "\n", - "# Get protein IDs from database\n", - "from pyeed.model import Protein\n", - "\n", - "accession_ids = [protein.accession_id for protein in Protein.nodes.all()][:10]\n", - "\n", - "# Align sequences from database\n", - 
"alignment = clustalo.align_from_db(accession_ids, pyeed.db)\n", - "print(\"Database alignment:\")\n", - "print(alignment)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Understanding Alignment Results\n", - "\n", - "The alignment result is a `MultipleSequenceAlignment` object with:\n", - "- List of `Sequence` objects\n", - "- Each sequence has an ID and aligned sequence\n", - "- Gaps are represented by '-' characters\n", - "- Sequences are padded to equal length\n", - "\n", - "The alignment preserves sequence order and maintains sequence IDs from the input." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Configuration\n", - "\n", - "ClustalOmega requires the PyEED Docker service to be running. Make sure to:\n", - "1. Have Docker installed\n", - "2. Start the service with `docker-compose up -d`\n", - "3. The service runs on port 5001 by default" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "pyeed_niklas", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.12.8" - } - }, - "nbformat": 4, - "nbformat_minor": 2 + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Multiple Sequence Alignment with Clustal Omega\n", + "\n", + "PyEED provides a convenient interface to Clustal Omega for multiple sequence alignment. This notebook demonstrates how to:\n", + "1. Align sequences from a dictionary\n", + "2. Align sequences directly from the database" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "# change log level to INFO\n", + "import sys\n", + "\n", + "from loguru import logger\n", + "\n", + "from pyeed import Pyeed\n", + "from pyeed.model import Protein\n", + "from pyeed.tools.clustalo import ClustalOmega\n", + "\n", + "logger.remove()\n", + "level = logger.add(sys.stderr, level=\"INFO\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Direct Sequence Alignment\n", + "\n", + "You can align sequences directly by providing a dictionary of sequences:" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Aligned sequences:\n", + "seq1 AKFVMPDRAWHLYTGNECSKQRLYVWFHDGAPILKTQSDNMGAYRCPLFHVTKNWEI\n", + "seq2 AKFVMPDRQWHLYTGQECSKQRLYVWFHDGAPILKTQSDNMGAYRCPLFHVTKNWEI\n", + "seq3 AKFVMPDRQWHLYTGNECSKQRLYVWFHDGAPILKTQADNMGAYRCALFHVTK----\n" + ] + } + ], + "source": [ + "# Initialize ClustalOmega\n", + "clustalo = ClustalOmega()\n", + "\n", + "# Example sequences\n", + "sequences = {\n", + " \"seq1\": \"AKFVMPDRAWHLYTGNECSKQRLYVWFHDGAPILKTQSDNMGAYRCPLFHVTKNWEI\",\n", + " \"seq2\": \"AKFVMPDRQWHLYTGQECSKQRLYVWFHDGAPILKTQSDNMGAYRCPLFHVTKNWEI\",\n", + " \"seq3\": \"AKFVMPDRQWHLYTGNECSKQRLYVWFHDGAPILKTQADNMGAYRCALFHVTK\",\n", + "}\n", + "\n", + "# Perform alignment\n", + "alignment = clustalo.align(sequences)\n", + "print(\"Aligned sequences:\")\n", + "print(alignment)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Database-based Alignment\n", + "\n", + "You can also align sequences directly from the database by providing a list of accession IDs:" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": 
[ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Pyeed Graph Object Mapping constraints not defined. Use _install_labels() to set up model constraints.\n", + "📡 Connected to database.\n", + "Database alignment:\n", + "AAP20891.1 MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDQLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVEYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDRWEPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGAGERGSRGIIAALGPDGKPSRIVVIYTTGSQATMDERNRQIAEIGASLIKHW\n", + "CAJ85677.1 MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDKLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVEYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDRWEPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGAGERGSRGIIAALGPDGKPSRIVVIYTTGSQATMDERNRQIAEIGASLIKHW\n", + "SAQ02853.1 MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDKLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVKYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDRWEPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGASERGSRGIIAALGPDGKPSRIVVIYTTGSQATMDERNRQIAEIGASLIKHW\n", + "CDR98216.1 MSIQHFRVALIPFFAAFCFPVFAHPETLVKVKDAEDQLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVKYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDRWEPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGASERGSRGIIAALGPDGKPSRIVVIYMTGSQATMDERNRQIAEIGASLIKHW\n", + "WP_109963600.1 MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDQLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVEYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDSWEPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGTGKRGSRGIIAALGPDGKPSRIVVIYTTGSQATMDERNRQIAEIGASLIKHW\n", + "CAA41038.1 MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDQLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVKYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDHWEPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGAGERGSRGIIAALGPDGKPSRIVVIYTTGSQATMDERNRQIAEIGASLIKHW\n", + "WP_109874025.1 MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDKLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVEYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDSWEPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGAGERGSRGIIAALGPDGKPSRIVVIYTTGSQATMDERNRQIAEIGASLIKHW\n", + "CAA46344.1 MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDKLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVKYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDSWEPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGASERGSRGIIAALGPDGKPSRIVVIYTTGSQATMDERNRQIAEIGASLIKHW\n", + "APG33178.1 MSIQHFRVALIPFFAAFCFPVFAHPETLVKVKDAEDQLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVKYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDSWEPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGAGERGSRGIIAALGPDGKPSRIVVIYMTGSQATMDERNRQIAEIGASLIKHW\n", + "AKC98298.1 MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDKLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVEYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDHWEPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGAGERGSRGIIAALGPDGKPSRIVVIYTTGSQATMDERNRQIAEIGASLIKHW\n" + ] + } + ], + "source": [ + "# Connect to database\n", + "pyeed = Pyeed(uri=\"bolt://129.69.129.130:7687\", user=\"neo4j\", password=\"12345678\")\n", + "\n", + "# Get protein IDs from database\n", + "accession_ids = [protein.accession_id for protein in 
Protein.nodes.all()][:10]\n",
+    "\n",
+    "# Align sequences from database\n",
+    "alignment = clustalo.align_from_db(accession_ids, pyeed.db)\n",
+    "print(\"Database alignment:\")\n",
+    "print(alignment)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Understanding Alignment Results\n",
+    "\n",
+    "The alignment result is a `MultipleSequenceAlignment` object with:\n",
+    "- A list of `Sequence` objects\n",
+    "- Each sequence has an ID and an aligned sequence\n",
+    "- Gaps are represented by '-' characters\n",
+    "- Sequences are padded to equal length\n",
+    "\n",
+    "The alignment preserves sequence order and maintains sequence IDs from the input."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Configuration\n",
+    "\n",
+    "ClustalOmega requires the PyEED Docker service to be running. Make sure to:\n",
+    "1. Have Docker installed\n",
+    "2. Start the service with `docker compose up -d`\n",
+    "3. Note that the service runs on port 5001 by default"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "pyeed_niklas",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.8"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
 }
diff --git a/docs/usage/embeddings_analysis.ipynb b/docs/usage/embeddings_analysis.ipynb
index 65a2398c..0b72e743 100644
--- a/docs/usage/embeddings_analysis.ipynb
+++ b/docs/usage/embeddings_analysis.ipynb
@@ -24,9 +24,10 @@
 "source": [
 "import sys\n",
-"from loguru import logger\n",
-"import pandas as pd\n",
 "import matplotlib.pyplot as plt\n",
+"import pandas as pd\n",
+"from loguru import logger\n",
+"\n",
 "from pyeed import Pyeed\n",
 "from pyeed.analysis.embedding_analysis import EmbeddingTool\n",
diff --git a/docs/usage/mmseqs.ipynb b/docs/usage/mmseqs.ipynb
index 1185c6fe..2253fd8a 100644
--- a/docs/usage/mmseqs.ipynb
+++ b/docs/usage/mmseqs.ipynb
@@ -20,6 +20,7 @@
 "outputs": [],
 "source": [
 "from pyeed import Pyeed\n",
+"from pyeed.model import Protein\n",
 "from pyeed.tools.mmseqs import MMSeqs"
 ]
 },
@@ -134,8 +135,6 @@
 "pyeed = Pyeed(uri=\"bolt://localhost:7687\", user=\"neo4j\", password=\"12345678\")\n",
 "\n",
 "# Get first 100 protein IDs from database\n",
-"from pyeed.model import Protein\n",
-"\n",
 "accession_ids = [protein.accession_id for protein in Protein.nodes.all()][:100]\n",
 "\n",
 "# Cluster sequences\n",
diff --git a/docs/usage/mutation_analysis.ipynb b/docs/usage/mutation_analysis.ipynb
index 9b31c996..7d10d360 100644
--- a/docs/usage/mutation_analysis.ipynb
+++ b/docs/usage/mutation_analysis.ipynb
@@ -16,6 +16,7 @@
 "outputs": [],
 "source": [
 "import sys\n",
+"\n",
 "from loguru import logger\n",
 "\n",
 "from pyeed import Pyeed\n",
diff --git a/docs/usage/network_analysis.ipynb b/docs/usage/network_analysis.ipynb
index 4d45db71..0b254610 100644
--- a/docs/usage/network_analysis.ipynb
+++ b/docs/usage/network_analysis.ipynb
@@ -11,6 +11,7 @@
 "import matplotlib.pyplot as plt\n",
 "import networkx as nx\n",
 "from loguru import logger\n",
+"\n",
 "from pyeed import Pyeed\n",
 "from pyeed.analysis.network_analysis import NetworkAnalysis\n",
 "from pyeed.analysis.sequence_alignment import PairwiseAligner\n",
diff --git a/docs/usage/standard_numbering.ipynb b/docs/usage/standard_numbering.ipynb
index cd84cad9..54374cd6 100644
--- a/docs/usage/standard_numbering.ipynb
+++ b/docs/usage/standard_numbering.ipynb
@@ -23,10 +23,10 @@
 "%reload_ext autoreload\n",
 "%autoreload 2\n",
 "import sys\n",
+"\n",
 "from loguru import logger\n",
 "\n",
 "from pyeed import Pyeed\n",
-"from pyeed.analysis.mutation_detection import MutationDetection\n",
 "from pyeed.analysis.standard_numbering import StandardNumberingTool\n",
 "\n",
 "logger.remove()\n",
diff --git a/src/pyeed/analysis/embedding_analysis.py b/src/pyeed/analysis/embedding_analysis.py
index b3535f74..c27b670f 100644
--- a/src/pyeed/analysis/embedding_analysis.py
+++ b/src/pyeed/analysis/embedding_analysis.py
@@ -6,9 +6,10 @@
 import scipy.spatial as sp
 from matplotlib.figure import Figure
 from numpy.typing import NDArray
-from pyeed.dbconnect import DatabaseConnector
 from scipy.spatial.distance import cosine
+
+from pyeed.dbconnect import DatabaseConnector
 
 logger = logging.getLogger(__name__)
diff --git a/src/pyeed/analysis/network_analysis.py b/src/pyeed/analysis/network_analysis.py
index dd66b45c..3ab9aeaa 100644
--- a/src/pyeed/analysis/network_analysis.py
+++ b/src/pyeed/analysis/network_analysis.py
@@ -2,6 +2,7 @@
 
 import networkx as nx
 from loguru import logger
+
 from pyeed.dbconnect import DatabaseConnector
diff --git a/src/pyeed/analysis/ontology_loading.py b/src/pyeed/analysis/ontology_loading.py
index ee909636..5b6341f5 100644
--- a/src/pyeed/analysis/ontology_loading.py
+++ b/src/pyeed/analysis/ontology_loading.py
@@ -1,8 +1,9 @@
 from typing import Dict
 
-from pyeed.dbconnect import DatabaseConnector
 from rdflib import OWL, RDF, RDFS, Graph, Namespace, URIRef
+
+from pyeed.dbconnect import DatabaseConnector
 
 
 class OntologyAdapter:
     """
diff --git a/src/pyeed/analysis/standard_numbering.py b/src/pyeed/analysis/standard_numbering.py
index 6f81869f..b2ea0667 100644
--- a/src/pyeed/analysis/standard_numbering.py
+++ b/src/pyeed/analysis/standard_numbering.py
@@ -13,6 +13,7 @@
 from typing import Any, Dict, List, Optional, Tuple
 
 from loguru import logger
+
 from pyeed.analysis.sequence_alignment import PairwiseAligner
 from pyeed.dbconnect import DatabaseConnector
 from pyeed.model import StandardNumbering
diff --git a/tests/unit/test_dbchat.py b/tests/unit/test_dbchat.py
index d1e202c6..bf6226ac 100644
--- a/tests/unit/test_dbchat.py
+++ b/tests/unit/test_dbchat.py
@@ -2,6 +2,7 @@
 
 import pytest
 from neo4j.exceptions import CypherSyntaxError
+
 from pyeed.dbchat import DBChat
 from pyeed.dbconnect import DatabaseConnector

From c0739bbf3aae20c6747c216bd53ebbcced8ed396 Mon Sep 17 00:00:00 2001
From: Niklas Abraham GPU
Date: Thu, 1 May 2025 14:32:06 +0000
Subject: [PATCH 18/19] fixed ruff import version mismatches

---
 src/pyeed/analysis/embedding_analysis.py | 3 +--
 src/pyeed/analysis/mutation_detection.py | 1 -
 src/pyeed/analysis/network_analysis.py   | 1 -
 src/pyeed/analysis/ontology_loading.py   | 3 +--
 src/pyeed/analysis/sequence_alignment.py | 3 +--
 src/pyeed/analysis/standard_numbering.py | 1 -
 tests/unit/test_dbchat.py                | 1 -
 7 files changed, 3 insertions(+), 10 deletions(-)

diff --git a/src/pyeed/analysis/embedding_analysis.py b/src/pyeed/analysis/embedding_analysis.py
index c27b670f..b3535f74 100644
--- a/src/pyeed/analysis/embedding_analysis.py
+++ b/src/pyeed/analysis/embedding_analysis.py
@@ -6,9 +6,8 @@
 import scipy.spatial as sp
 from matplotlib.figure import Figure
 from numpy.typing import NDArray
-from scipy.spatial.distance import cosine
-
 from pyeed.dbconnect import DatabaseConnector
+from scipy.spatial.distance import cosine
 
 logger = logging.getLogger(__name__)
diff --git a/src/pyeed/analysis/mutation_detection.py b/src/pyeed/analysis/mutation_detection.py
index 5c6809e8..c2562ae1 100644
--- a/src/pyeed/analysis/mutation_detection.py
+++ b/src/pyeed/analysis/mutation_detection.py
@@ -1,7 +1,6 @@
 from typing import Any, Optional
 
 from loguru import logger
-
 from pyeed.dbconnect import DatabaseConnector
diff --git a/src/pyeed/analysis/network_analysis.py b/src/pyeed/analysis/network_analysis.py
index 3ab9aeaa..dd66b45c 100644
--- a/src/pyeed/analysis/network_analysis.py
+++ b/src/pyeed/analysis/network_analysis.py
@@ -2,7 +2,6 @@
 
 import networkx as nx
 from loguru import logger
-
 from pyeed.dbconnect import DatabaseConnector
diff --git a/src/pyeed/analysis/ontology_loading.py b/src/pyeed/analysis/ontology_loading.py
index 5b6341f5..ee909636 100644
--- a/src/pyeed/analysis/ontology_loading.py
+++ b/src/pyeed/analysis/ontology_loading.py
@@ -1,8 +1,7 @@
 from typing import Dict
 
-from rdflib import OWL, RDF, RDFS, Graph, Namespace, URIRef
-
 from pyeed.dbconnect import DatabaseConnector
+from rdflib import OWL, RDF, RDFS, Graph, Namespace, URIRef
 
 
 class OntologyAdapter:
diff --git a/src/pyeed/analysis/sequence_alignment.py b/src/pyeed/analysis/sequence_alignment.py
index cb6acff4..440cbb1e 100644
--- a/src/pyeed/analysis/sequence_alignment.py
+++ b/src/pyeed/analysis/sequence_alignment.py
@@ -5,10 +5,9 @@
 from Bio.Align import PairwiseAligner as BioPairwiseAligner
 from Bio.Align.substitution_matrices import Array as BioSubstitutionMatrix
 from joblib import Parallel, cpu_count, delayed
-from rich.progress import Progress
-
 from pyeed.dbconnect import DatabaseConnector
 from pyeed.tools.utility import chunks
+from rich.progress import Progress
 
 
 class PairwiseAligner:
diff --git a/src/pyeed/analysis/standard_numbering.py b/src/pyeed/analysis/standard_numbering.py
index b2ea0667..6f81869f 100644
--- a/src/pyeed/analysis/standard_numbering.py
+++ b/src/pyeed/analysis/standard_numbering.py
@@ -13,7 +13,6 @@
 from typing import Any, Dict, List, Optional, Tuple
 
 from loguru import logger
-
 from pyeed.analysis.sequence_alignment import PairwiseAligner
 from pyeed.dbconnect import DatabaseConnector
 from pyeed.model import StandardNumbering
diff --git a/tests/unit/test_dbchat.py b/tests/unit/test_dbchat.py
index bf6226ac..d1e202c6 100644
--- a/tests/unit/test_dbchat.py
+++ b/tests/unit/test_dbchat.py
@@ -2,7 +2,6 @@
 
 import pytest
 from neo4j.exceptions import CypherSyntaxError
-
 from pyeed.dbchat import DBChat
 from pyeed.dbconnect import DatabaseConnector

From cf19b94a87f3d800a40e8a95be4020853c59d689 Mon Sep 17 00:00:00 2001
From: alacheim
Date: Fri, 2 May 2025 08:58:14 +0000
Subject: [PATCH 19/19] fixed mypy error, formatted file

---
 src/pyeed/embedding.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/pyeed/embedding.py b/src/pyeed/embedding.py
index 30b15f0c..28f66a1b 100644
--- a/src/pyeed/embedding.py
+++ b/src/pyeed/embedding.py
@@ -97,7 +97,7 @@ def process_batches_on_gpu(
 
 
 def load_model_and_tokenizer(
     model_name: str,
     device: torch.device,
-) -> Tuple[Any, Union[Any, None], str]:
+) -> Tuple[Any, Union[Any, None], torch.device]:
     """
     Loads the model and assigns it to a specific GPU.
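
Note on the docs touched in this series: the BLAST notebook ends by pointing at `fetch_from_primary_db`, and the Clustal Omega notebook aligns by accession ID, so the hits of a search can flow straight into a database-backed alignment. Below is a minimal sketch of that hand-off; it assumes `fetch_from_primary_db` accepts a list of accession IDs (the method is named in the notebook text but its signature is not shown in this series), and it reuses the connection details and database paths from the examples above.

```python
from pyeed import Pyeed
from pyeed.tools import Blast
from pyeed.tools.clustalo import ClustalOmega

# Connect to the pyeed graph database (credentials as in the notebooks)
pyeed = Pyeed(uri="bolt://localhost:7687", user="neo4j", password="12345678")

# Search a query protein against the local BLAST database
blast = Blast(
    mode="blastp",
    db_path="/usr/local/bin/data/test_db",
    db_name="protein_db",
    evalue=0.1,
    max_target_seqs=10,
)
query = "MSEQVAAVAKLRAKASEAAKEAKAREAAKKLAEAAKKAKAKEAAKRAEAKLAEKAKAAK"
results = blast.search(query)

# Keep only strong hits and import them into the database;
# fetch_from_primary_db is assumed here to take a list of accession IDs
hit_ids = results[results["identity"] > 80]["subject_id"].tolist()
pyeed.fetch_from_primary_db(hit_ids)

# Align the imported sequences directly from the database
clustalo = ClustalOmega()
alignment = clustalo.align_from_db(hit_ids, pyeed.db)
print(alignment)
```

The only moving part is the ID hand-off: BLAST reports `subject_id` values, and `align_from_db` resolves the same accessions against the `Protein` nodes, so no intermediate FASTA files are needed.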