diff --git a/docs/usage/blast.ipynb b/docs/usage/blast.ipynb
index b56140d7..d6cd57ef 100644
--- a/docs/usage/blast.ipynb
+++ b/docs/usage/blast.ipynb
@@ -1,363 +1,364 @@
{
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# BLAST Search\n",
- "\n",
- "## Setup\n",
- "\n",
- "The BLAST service runs in a Docker container and requires:\n",
- "1. A local BLAST database\n",
- "2. The Docker service running"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "metadata": {},
- "outputs": [],
- "source": [
- "# change log level to INFO\n",
- "import sys\n",
- "from loguru import logger\n",
- "\n",
- "logger.remove()\n",
- "level = logger.add(sys.stderr, level=\"WARNING\")"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Basic Usage\n",
- "\n",
- "The `Blast` class provides an interface to search protein or nucleotide sequences against a local BLAST database."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "
\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " subject_id | \n",
- " identity | \n",
- " alignment_length | \n",
- " mismatches | \n",
- " gap_opens | \n",
- " query_start | \n",
- " query_end | \n",
- " subject_start | \n",
- " subject_end | \n",
- " evalue | \n",
- " bit_score | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " | 0 | \n",
- " seq7 | \n",
- " 81.818 | \n",
- " 22 | \n",
- " 3 | \n",
- " 1 | \n",
- " 31 | \n",
- " 51 | \n",
- " 11 | \n",
- " 32 | \n",
- " 0.003 | \n",
- " 22.3 | \n",
- "
\n",
- " \n",
- " | 1 | \n",
- " seq1 | \n",
- " 100.000 | \n",
- " 25 | \n",
- " 0 | \n",
- " 0 | \n",
- " 1 | \n",
- " 25 | \n",
- " 1 | \n",
- " 25 | \n",
- " 0.004 | \n",
- " 22.3 | \n",
- "
\n",
- " \n",
- " | 2 | \n",
- " seq2 | \n",
- " 61.538 | \n",
- " 26 | \n",
- " 10 | \n",
- " 0 | \n",
- " 20 | \n",
- " 45 | \n",
- " 5 | \n",
- " 30 | \n",
- " 0.038 | \n",
- " 19.2 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " subject_id identity alignment_length mismatches gap_opens query_start \\\n",
- "0 seq7 81.818 22 3 1 31 \n",
- "1 seq1 100.000 25 0 0 1 \n",
- "2 seq2 61.538 26 10 0 20 \n",
- "\n",
- " query_end subject_start subject_end evalue bit_score \n",
- "0 51 11 32 0.003 22.3 \n",
- "1 25 1 25 0.004 22.3 \n",
- "2 45 5 30 0.038 19.2 "
- ]
- },
- "execution_count": 2,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "from pyeed.tools import Blast\n",
- "\n",
- "# Example protein sequence\n",
- "sequence = \"MSEQVAAVAKLRAKASEAAKEAKAREAAKKLAEAAKKAKAKEAAKRAEAKLAEKAKAAKRAEAKAAKEAKRAAAKRAEAKLAEKAKAAK\"\n",
- "\n",
- "# Initialize BLAST search\n",
- "blast = Blast(\n",
- " # service_url=\"http://localhost:6001/blast\",\n",
- " mode=\"blastp\", # Use blastp for protein sequences\n",
- " db_path=\"/usr/local/bin/data/test_db\", # Path in Docker container\n",
- " db_name=\"protein_db\", # Name of your BLAST database\n",
- " evalue=0.1, # E-value threshold\n",
- " max_target_seqs=10, # Maximum number of hits to return\n",
- ")\n",
- "\n",
- "# Perform search\n",
- "results = blast.search(sequence)\n",
- "results"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "The results are returned as a pandas DataFrame with the following columns:\n",
- "- subject_id: ID of the matched sequence\n",
- "- identity: Percentage identity\n",
- "- alignment_length: Length of the alignment\n",
- "- mismatches: Number of mismatches\n",
- "- gap_opens: Number of gap openings\n",
- "- query_start/end: Start/end positions in query sequence\n",
- "- subject_start/end: Start/end positions in subject sequence\n",
- "- evalue: Expectation value\n",
- "- bit_score: Bit score"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Creating a BLAST Database\n",
- "\n",
- "Before using BLAST, you need to create a local database. Here's how to create one from a FASTA file:"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "```bash\n",
- "# For protein sequences\n",
- "makeblastdb -in proteins.fasta -dbtype prot -out blast_db/my_proteins\n",
- "\n",
- "# For nucleotide sequences\n",
- "makeblastdb -in nucleotides.fasta -dbtype nucl -out blast_db/my_nucleotides\n",
- "```\n",
- "\n",
- "To access the BLAST Docker container shell and create databases:\n",
- "\n",
- "```bash\n",
- "# Enter the BLAST container shell\n",
- "docker compose exec blast bash\n",
- "# \n",
- "# Navigate to database directory\n",
- "cd /usr/local/bin/data/blast_db\n",
- "# \n",
- "# Create protein database\n",
- "makeblastdb -in proteins.fasta -dbtype prot -out my_proteins\n",
- "# \n",
- "# Create nucleotide database \n",
- "makeblastdb -in nucleotides.fasta -dbtype nucl -out my_nucleotides\n",
- "```\n",
- "Make sure your FASTA files are mounted in the container's `/usr/local/bin/data/blast_db` directory.\n"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Advanced Usage\n",
- "\n",
- "You can customize the BLAST search parameters:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " subject_id | \n",
- " identity | \n",
- " alignment_length | \n",
- " mismatches | \n",
- " gap_opens | \n",
- " query_start | \n",
- " query_end | \n",
- " subject_start | \n",
- " subject_end | \n",
- " evalue | \n",
- " bit_score | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " | 0 | \n",
- " seq7 | \n",
- " 81.818 | \n",
- " 22 | \n",
- " 3 | \n",
- " 1 | \n",
- " 31 | \n",
- " 51 | \n",
- " 11 | \n",
- " 32 | \n",
- " 0.003 | \n",
- " 22.3 | \n",
- "
\n",
- " \n",
- " | 1 | \n",
- " seq1 | \n",
- " 100.000 | \n",
- " 25 | \n",
- " 0 | \n",
- " 0 | \n",
- " 1 | \n",
- " 25 | \n",
- " 1 | \n",
- " 25 | \n",
- " 0.004 | \n",
- " 22.3 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " subject_id identity alignment_length mismatches gap_opens query_start \\\n",
- "0 seq7 81.818 22 3 1 31 \n",
- "1 seq1 100.000 25 0 0 1 \n",
- "\n",
- " query_end subject_start subject_end evalue bit_score \n",
- "0 51 11 32 0.003 22.3 \n",
- "1 25 1 25 0.004 22.3 "
- ]
- },
- "execution_count": 3,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# Configure BLAST for sensitive protein search\n",
- "blast = Blast(\n",
- " # service_url=\"http://localhost:6001/blast\",\n",
- " mode=\"blastp\",\n",
- " db_path=\"/usr/local/bin/data/test_db\",\n",
- " db_name=\"protein_db\",\n",
- " evalue=1e-1, # More stringent E-value\n",
- " max_target_seqs=100, # Return more hits\n",
- " num_threads=4, # Use 4 CPU threads\n",
- ")\n",
- "\n",
- "# Search with longer timeout\n",
- "results = blast.search(sequence, timeout=7200) # 2 hour timeout\n",
- "\n",
- "# Filter results\n",
- "significant_hits = results[results[\"identity\"] > 80] # Only hits with >90% identity\n",
- "significant_hits"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Thereafter, the ids of the hits can be added to the pyeed database, using the `fetch_from_primary_db` function."
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "pyeed",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.11.5"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 2
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# BLAST Search\n",
+ "\n",
+ "## Setup\n",
+ "\n",
+ "The BLAST service runs in a Docker container and requires:\n",
+ "1. A local BLAST database\n",
+ "2. The Docker service running"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# change log level to INFO\n",
+ "import sys\n",
+ "\n",
+ "from loguru import logger\n",
+ "\n",
+ "logger.remove()\n",
+ "level = logger.add(sys.stderr, level=\"WARNING\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Basic Usage\n",
+ "\n",
+ "The `Blast` class provides an interface to search protein or nucleotide sequences against a local BLAST database."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " subject_id | \n",
+ " identity | \n",
+ " alignment_length | \n",
+ " mismatches | \n",
+ " gap_opens | \n",
+ " query_start | \n",
+ " query_end | \n",
+ " subject_start | \n",
+ " subject_end | \n",
+ " evalue | \n",
+ " bit_score | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " seq7 | \n",
+ " 81.818 | \n",
+ " 22 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 31 | \n",
+ " 51 | \n",
+ " 11 | \n",
+ " 32 | \n",
+ " 0.003 | \n",
+ " 22.3 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " seq1 | \n",
+ " 100.000 | \n",
+ " 25 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 25 | \n",
+ " 1 | \n",
+ " 25 | \n",
+ " 0.004 | \n",
+ " 22.3 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " seq2 | \n",
+ " 61.538 | \n",
+ " 26 | \n",
+ " 10 | \n",
+ " 0 | \n",
+ " 20 | \n",
+ " 45 | \n",
+ " 5 | \n",
+ " 30 | \n",
+ " 0.038 | \n",
+ " 19.2 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " subject_id identity alignment_length mismatches gap_opens query_start \\\n",
+ "0 seq7 81.818 22 3 1 31 \n",
+ "1 seq1 100.000 25 0 0 1 \n",
+ "2 seq2 61.538 26 10 0 20 \n",
+ "\n",
+ " query_end subject_start subject_end evalue bit_score \n",
+ "0 51 11 32 0.003 22.3 \n",
+ "1 25 1 25 0.004 22.3 \n",
+ "2 45 5 30 0.038 19.2 "
+ ]
+ },
+ "execution_count": 2,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from pyeed.tools import Blast\n",
+ "\n",
+ "# Example protein sequence\n",
+ "sequence = \"MSEQVAAVAKLRAKASEAAKEAKAREAAKKLAEAAKKAKAKEAAKRAEAKLAEKAKAAKRAEAKAAKEAKRAAAKRAEAKLAEKAKAAK\"\n",
+ "\n",
+ "# Initialize BLAST search\n",
+ "blast = Blast(\n",
+ " # service_url=\"http://localhost:6001/blast\",\n",
+ " mode=\"blastp\", # Use blastp for protein sequences\n",
+ " db_path=\"/usr/local/bin/data/test_db\", # Path in Docker container\n",
+ " db_name=\"protein_db\", # Name of your BLAST database\n",
+ " evalue=0.1, # E-value threshold\n",
+ " max_target_seqs=10, # Maximum number of hits to return\n",
+ ")\n",
+ "\n",
+ "# Perform search\n",
+ "results = blast.search(sequence)\n",
+ "results"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The results are returned as a pandas DataFrame with the following columns:\n",
+ "- subject_id: ID of the matched sequence\n",
+ "- identity: Percentage identity\n",
+ "- alignment_length: Length of the alignment\n",
+ "- mismatches: Number of mismatches\n",
+ "- gap_opens: Number of gap openings\n",
+ "- query_start/end: Start/end positions in query sequence\n",
+ "- subject_start/end: Start/end positions in subject sequence\n",
+ "- evalue: Expectation value\n",
+ "- bit_score: Bit score"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Creating a BLAST Database\n",
+ "\n",
+ "Before using BLAST, you need to create a local database. Here's how to create one from a FASTA file:"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "```bash\n",
+ "# For protein sequences\n",
+ "makeblastdb -in proteins.fasta -dbtype prot -out blast_db/my_proteins\n",
+ "\n",
+ "# For nucleotide sequences\n",
+ "makeblastdb -in nucleotides.fasta -dbtype nucl -out blast_db/my_nucleotides\n",
+ "```\n",
+ "\n",
+ "To access the BLAST Docker container shell and create databases:\n",
+ "\n",
+ "```bash\n",
+ "# Enter the BLAST container shell\n",
+ "docker compose exec blast bash\n",
+ "# \n",
+ "# Navigate to database directory\n",
+ "cd /usr/local/bin/data/blast_db\n",
+ "# \n",
+ "# Create protein database\n",
+ "makeblastdb -in proteins.fasta -dbtype prot -out my_proteins\n",
+ "# \n",
+ "# Create nucleotide database \n",
+ "makeblastdb -in nucleotides.fasta -dbtype nucl -out my_nucleotides\n",
+ "```\n",
+ "Make sure your FASTA files are mounted in the container's `/usr/local/bin/data/blast_db` directory.\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Advanced Usage\n",
+ "\n",
+ "You can customize the BLAST search parameters:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " subject_id | \n",
+ " identity | \n",
+ " alignment_length | \n",
+ " mismatches | \n",
+ " gap_opens | \n",
+ " query_start | \n",
+ " query_end | \n",
+ " subject_start | \n",
+ " subject_end | \n",
+ " evalue | \n",
+ " bit_score | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " seq7 | \n",
+ " 81.818 | \n",
+ " 22 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 31 | \n",
+ " 51 | \n",
+ " 11 | \n",
+ " 32 | \n",
+ " 0.003 | \n",
+ " 22.3 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " seq1 | \n",
+ " 100.000 | \n",
+ " 25 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 25 | \n",
+ " 1 | \n",
+ " 25 | \n",
+ " 0.004 | \n",
+ " 22.3 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " subject_id identity alignment_length mismatches gap_opens query_start \\\n",
+ "0 seq7 81.818 22 3 1 31 \n",
+ "1 seq1 100.000 25 0 0 1 \n",
+ "\n",
+ " query_end subject_start subject_end evalue bit_score \n",
+ "0 51 11 32 0.003 22.3 \n",
+ "1 25 1 25 0.004 22.3 "
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Configure BLAST for sensitive protein search\n",
+ "blast = Blast(\n",
+ " # service_url=\"http://localhost:6001/blast\",\n",
+ " mode=\"blastp\",\n",
+ " db_path=\"/usr/local/bin/data/test_db\",\n",
+ " db_name=\"protein_db\",\n",
+ " evalue=1e-1, # More stringent E-value\n",
+ " max_target_seqs=100, # Return more hits\n",
+ " num_threads=4, # Use 4 CPU threads\n",
+ ")\n",
+ "\n",
+ "# Search with longer timeout\n",
+ "results = blast.search(sequence, timeout=7200) # 2 hour timeout\n",
+ "\n",
+ "# Filter results\n",
+ "significant_hits = results[results[\"identity\"] > 80] # Only hits with >90% identity\n",
+ "significant_hits"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Thereafter, the ids of the hits can be added to the pyeed database, using the `fetch_from_primary_db` function."
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "pyeed",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.11.5"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
}
diff --git a/docs/usage/clustalo.ipynb b/docs/usage/clustalo.ipynb
index 64ed62ee..d3ba2fba 100644
--- a/docs/usage/clustalo.ipynb
+++ b/docs/usage/clustalo.ipynb
@@ -1,171 +1,171 @@
{
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# Multiple Sequence Alignment with Clustal Omega\n",
- "\n",
- "PyEED provides a convenient interface to Clustal Omega for multiple sequence alignment. This notebook demonstrates how to:\n",
- "1. Align sequences from a dictionary\n",
- "2. Align sequences directly from the database"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 13,
- "metadata": {},
- "outputs": [],
- "source": [
- "from pyeed import Pyeed\n",
- "from pyeed.tools.clustalo import ClustalOmega\n",
- "\n",
- "# change log level to INFO\n",
- "import sys\n",
- "from loguru import logger\n",
- "\n",
- "logger.remove()\n",
- "level = logger.add(sys.stderr, level=\"INFO\")"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Direct Sequence Alignment\n",
- "\n",
- "You can align sequences directly by providing a dictionary of sequences:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 14,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Aligned sequences:\n",
- "seq1 AKFVMPDRAWHLYTGNECSKQRLYVWFHDGAPILKTQSDNMGAYRCPLFHVTKNWEI\n",
- "seq2 AKFVMPDRQWHLYTGQECSKQRLYVWFHDGAPILKTQSDNMGAYRCPLFHVTKNWEI\n",
- "seq3 AKFVMPDRQWHLYTGNECSKQRLYVWFHDGAPILKTQADNMGAYRCALFHVTK----\n"
- ]
- }
- ],
- "source": [
- "# Initialize ClustalOmega\n",
- "clustalo = ClustalOmega()\n",
- "\n",
- "# Example sequences\n",
- "sequences = {\n",
- " \"seq1\": \"AKFVMPDRAWHLYTGNECSKQRLYVWFHDGAPILKTQSDNMGAYRCPLFHVTKNWEI\",\n",
- " \"seq2\": \"AKFVMPDRQWHLYTGQECSKQRLYVWFHDGAPILKTQSDNMGAYRCPLFHVTKNWEI\",\n",
- " \"seq3\": \"AKFVMPDRQWHLYTGNECSKQRLYVWFHDGAPILKTQADNMGAYRCALFHVTK\",\n",
- "}\n",
- "\n",
- "# Perform alignment\n",
- "alignment = clustalo.align(sequences)\n",
- "print(\"Aligned sequences:\")\n",
- "print(alignment)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Database-based Alignment\n",
- "\n",
- "You can also align sequences directly from the database by providing a list of accession IDs:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 15,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Pyeed Graph Object Mapping constraints not defined. Use _install_labels() to set up model constraints.\n",
- "📡 Connected to database.\n",
- "Database alignment:\n",
- "AAP20891.1 MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDQLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVEYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDRWEPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGAGERGSRGIIAALGPDGKPSRIVVIYTTGSQATMDERNRQIAEIGASLIKHW\n",
- "CAJ85677.1 MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDKLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVEYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDRWEPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGAGERGSRGIIAALGPDGKPSRIVVIYTTGSQATMDERNRQIAEIGASLIKHW\n",
- "SAQ02853.1 MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDKLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVKYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDRWEPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGASERGSRGIIAALGPDGKPSRIVVIYTTGSQATMDERNRQIAEIGASLIKHW\n",
- "CDR98216.1 MSIQHFRVALIPFFAAFCFPVFAHPETLVKVKDAEDQLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVKYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDRWEPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGASERGSRGIIAALGPDGKPSRIVVIYMTGSQATMDERNRQIAEIGASLIKHW\n",
- "WP_109963600.1 MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDQLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVEYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDSWEPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGTGKRGSRGIIAALGPDGKPSRIVVIYTTGSQATMDERNRQIAEIGASLIKHW\n",
- "CAA41038.1 MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDQLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVKYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDHWEPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGAGERGSRGIIAALGPDGKPSRIVVIYTTGSQATMDERNRQIAEIGASLIKHW\n",
- "WP_109874025.1 MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDKLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVEYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDSWEPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGAGERGSRGIIAALGPDGKPSRIVVIYTTGSQATMDERNRQIAEIGASLIKHW\n",
- "CAA46344.1 MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDKLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVKYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDSWEPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGASERGSRGIIAALGPDGKPSRIVVIYTTGSQATMDERNRQIAEIGASLIKHW\n",
- "APG33178.1 MSIQHFRVALIPFFAAFCFPVFAHPETLVKVKDAEDQLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVKYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDSWEPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGAGERGSRGIIAALGPDGKPSRIVVIYMTGSQATMDERNRQIAEIGASLIKHW\n",
- "AKC98298.1 MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDKLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVEYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDHWEPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGAGERGSRGIIAALGPDGKPSRIVVIYTTGSQATMDERNRQIAEIGASLIKHW\n"
- ]
- }
- ],
- "source": [
- "# Connect to database\n",
- "pyeed = Pyeed(uri=\"bolt://129.69.129.130:7687\", user=\"neo4j\", password=\"12345678\")\n",
- "\n",
- "# Get protein IDs from database\n",
- "from pyeed.model import Protein\n",
- "\n",
- "accession_ids = [protein.accession_id for protein in Protein.nodes.all()][:10]\n",
- "\n",
- "# Align sequences from database\n",
- "alignment = clustalo.align_from_db(accession_ids, pyeed.db)\n",
- "print(\"Database alignment:\")\n",
- "print(alignment)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Understanding Alignment Results\n",
- "\n",
- "The alignment result is a `MultipleSequenceAlignment` object with:\n",
- "- List of `Sequence` objects\n",
- "- Each sequence has an ID and aligned sequence\n",
- "- Gaps are represented by '-' characters\n",
- "- Sequences are padded to equal length\n",
- "\n",
- "The alignment preserves sequence order and maintains sequence IDs from the input."
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Configuration\n",
- "\n",
- "ClustalOmega requires the PyEED Docker service to be running. Make sure to:\n",
- "1. Have Docker installed\n",
- "2. Start the service with `docker-compose up -d`\n",
- "3. The service runs on port 5001 by default"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "pyeed_niklas",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.12.8"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 2
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Multiple Sequence Alignment with Clustal Omega\n",
+ "\n",
+ "PyEED provides a convenient interface to Clustal Omega for multiple sequence alignment. This notebook demonstrates how to:\n",
+ "1. Align sequences from a dictionary\n",
+ "2. Align sequences directly from the database"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# change log level to INFO\n",
+ "import sys\n",
+ "\n",
+ "from loguru import logger\n",
+ "\n",
+ "from pyeed import Pyeed\n",
+ "from pyeed.model import Protein\n",
+ "from pyeed.tools.clustalo import ClustalOmega\n",
+ "\n",
+ "logger.remove()\n",
+ "level = logger.add(sys.stderr, level=\"INFO\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Direct Sequence Alignment\n",
+ "\n",
+ "You can align sequences directly by providing a dictionary of sequences:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Aligned sequences:\n",
+ "seq1 AKFVMPDRAWHLYTGNECSKQRLYVWFHDGAPILKTQSDNMGAYRCPLFHVTKNWEI\n",
+ "seq2 AKFVMPDRQWHLYTGQECSKQRLYVWFHDGAPILKTQSDNMGAYRCPLFHVTKNWEI\n",
+ "seq3 AKFVMPDRQWHLYTGNECSKQRLYVWFHDGAPILKTQADNMGAYRCALFHVTK----\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Initialize ClustalOmega\n",
+ "clustalo = ClustalOmega()\n",
+ "\n",
+ "# Example sequences\n",
+ "sequences = {\n",
+ " \"seq1\": \"AKFVMPDRAWHLYTGNECSKQRLYVWFHDGAPILKTQSDNMGAYRCPLFHVTKNWEI\",\n",
+ " \"seq2\": \"AKFVMPDRQWHLYTGQECSKQRLYVWFHDGAPILKTQSDNMGAYRCPLFHVTKNWEI\",\n",
+ " \"seq3\": \"AKFVMPDRQWHLYTGNECSKQRLYVWFHDGAPILKTQADNMGAYRCALFHVTK\",\n",
+ "}\n",
+ "\n",
+ "# Perform alignment\n",
+ "alignment = clustalo.align(sequences)\n",
+ "print(\"Aligned sequences:\")\n",
+ "print(alignment)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Database-based Alignment\n",
+ "\n",
+ "You can also align sequences directly from the database by providing a list of accession IDs:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Pyeed Graph Object Mapping constraints not defined. Use _install_labels() to set up model constraints.\n",
+ "📡 Connected to database.\n",
+ "Database alignment:\n",
+ "AAP20891.1 MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDQLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVEYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDRWEPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGAGERGSRGIIAALGPDGKPSRIVVIYTTGSQATMDERNRQIAEIGASLIKHW\n",
+ "CAJ85677.1 MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDKLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVEYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDRWEPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGAGERGSRGIIAALGPDGKPSRIVVIYTTGSQATMDERNRQIAEIGASLIKHW\n",
+ "SAQ02853.1 MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDKLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVKYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDRWEPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGASERGSRGIIAALGPDGKPSRIVVIYTTGSQATMDERNRQIAEIGASLIKHW\n",
+ "CDR98216.1 MSIQHFRVALIPFFAAFCFPVFAHPETLVKVKDAEDQLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVKYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDRWEPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGASERGSRGIIAALGPDGKPSRIVVIYMTGSQATMDERNRQIAEIGASLIKHW\n",
+ "WP_109963600.1 MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDQLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVEYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDSWEPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGTGKRGSRGIIAALGPDGKPSRIVVIYTTGSQATMDERNRQIAEIGASLIKHW\n",
+ "CAA41038.1 MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDQLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVKYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDHWEPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGAGERGSRGIIAALGPDGKPSRIVVIYTTGSQATMDERNRQIAEIGASLIKHW\n",
+ "WP_109874025.1 MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDKLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVEYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDSWEPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGAGERGSRGIIAALGPDGKPSRIVVIYTTGSQATMDERNRQIAEIGASLIKHW\n",
+ "CAA46344.1 MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDKLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVKYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDSWEPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGASERGSRGIIAALGPDGKPSRIVVIYTTGSQATMDERNRQIAEIGASLIKHW\n",
+ "APG33178.1 MSIQHFRVALIPFFAAFCFPVFAHPETLVKVKDAEDQLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVKYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDSWEPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGAGERGSRGIIAALGPDGKPSRIVVIYMTGSQATMDERNRQIAEIGASLIKHW\n",
+ "AKC98298.1 MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDKLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVEYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDHWEPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGAGERGSRGIIAALGPDGKPSRIVVIYTTGSQATMDERNRQIAEIGASLIKHW\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Connect to database\n",
+ "pyeed = Pyeed(uri=\"bolt://129.69.129.130:7687\", user=\"neo4j\", password=\"12345678\")\n",
+ "\n",
+ "# Get protein IDs from database\n",
+ "accession_ids = [protein.accession_id for protein in Protein.nodes.all()][:10]\n",
+ "\n",
+ "# Align sequences from database\n",
+ "alignment = clustalo.align_from_db(accession_ids, pyeed.db)\n",
+ "print(\"Database alignment:\")\n",
+ "print(alignment)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Understanding Alignment Results\n",
+ "\n",
+ "The alignment result is a `MultipleSequenceAlignment` object with:\n",
+ "- List of `Sequence` objects\n",
+ "- Each sequence has an ID and aligned sequence\n",
+ "- Gaps are represented by '-' characters\n",
+ "- Sequences are padded to equal length\n",
+ "\n",
+ "The alignment preserves sequence order and maintains sequence IDs from the input."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Configuration\n",
+ "\n",
+ "ClustalOmega requires the PyEED Docker service to be running. Make sure to:\n",
+ "1. Have Docker installed\n",
+ "2. Start the service with `docker-compose up -d`\n",
+ "3. The service runs on port 5001 by default"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "pyeed_niklas",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.12.8"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
}
diff --git a/docs/usage/embeddings_analysis.ipynb b/docs/usage/embeddings_analysis.ipynb
index 65a2398c..0b72e743 100644
--- a/docs/usage/embeddings_analysis.ipynb
+++ b/docs/usage/embeddings_analysis.ipynb
@@ -24,9 +24,10 @@
"source": [
"import sys\n",
"\n",
- "from loguru import logger\n",
- "import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
+ "import pandas as pd\n",
+ "from loguru import logger\n",
+ "\n",
"from pyeed import Pyeed\n",
"from pyeed.analysis.embedding_analysis import EmbeddingTool\n",
"\n",
diff --git a/docs/usage/mmseqs.ipynb b/docs/usage/mmseqs.ipynb
index 1185c6fe..2253fd8a 100644
--- a/docs/usage/mmseqs.ipynb
+++ b/docs/usage/mmseqs.ipynb
@@ -20,6 +20,7 @@
"outputs": [],
"source": [
"from pyeed import Pyeed\n",
+ "from pyeed.model import Protein\n",
"from pyeed.tools.mmseqs import MMSeqs"
]
},
@@ -134,8 +135,6 @@
"pyeed = Pyeed(uri=\"bolt://localhost:7687\", user=\"neo4j\", password=\"12345678\")\n",
"\n",
"# Get first 100 protein IDs from database\n",
- "from pyeed.model import Protein\n",
- "\n",
"accession_ids = [protein.accession_id for protein in Protein.nodes.all()][:100]\n",
"\n",
"# Cluster sequences\n",
diff --git a/docs/usage/mutation_analysis.ipynb b/docs/usage/mutation_analysis.ipynb
index 9b31c996..7d10d360 100644
--- a/docs/usage/mutation_analysis.ipynb
+++ b/docs/usage/mutation_analysis.ipynb
@@ -16,6 +16,7 @@
"outputs": [],
"source": [
"import sys\n",
+ "\n",
"from loguru import logger\n",
"\n",
"from pyeed import Pyeed\n",
diff --git a/docs/usage/network_analysis.ipynb b/docs/usage/network_analysis.ipynb
index 4d45db71..0b254610 100644
--- a/docs/usage/network_analysis.ipynb
+++ b/docs/usage/network_analysis.ipynb
@@ -11,6 +11,7 @@
"import matplotlib.pyplot as plt\n",
"import networkx as nx\n",
"from loguru import logger\n",
+ "\n",
"from pyeed import Pyeed\n",
"from pyeed.analysis.network_analysis import NetworkAnalysis\n",
"from pyeed.analysis.sequence_alignment import PairwiseAligner\n",
diff --git a/docs/usage/standard_numbering.ipynb b/docs/usage/standard_numbering.ipynb
index cd84cad9..54374cd6 100644
--- a/docs/usage/standard_numbering.ipynb
+++ b/docs/usage/standard_numbering.ipynb
@@ -23,10 +23,10 @@
"%reload_ext autoreload\n",
"%autoreload 2\n",
"import sys\n",
+ "\n",
"from loguru import logger\n",
"\n",
"from pyeed import Pyeed\n",
- "from pyeed.analysis.mutation_detection import MutationDetection\n",
"from pyeed.analysis.standard_numbering import StandardNumberingTool\n",
"\n",
"logger.remove()\n",
diff --git a/pyproject.toml b/pyproject.toml
index 86edb7ea..b9897071 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -36,7 +36,10 @@ esm = "^3.1.3"
rdflib = "^6.0.0"
docker = "5.0.0"
absl-py = "1.0.0"
+crc64iso = "0.0.2"
SPARQLWrapper = "2.0.0"
+pysam = "0.23.0"
+types-requests = "2.32.0.20250328"
[tool.poetry.group.dev.dependencies]
mkdocstrings = {extras = ["python"], version = "^0.26.2"}
diff --git a/src/pyeed/adapter/ncbi_protein_mapper.py b/src/pyeed/adapter/ncbi_protein_mapper.py
index e11d4fe7..3ecf485c 100644
--- a/src/pyeed/adapter/ncbi_protein_mapper.py
+++ b/src/pyeed/adapter/ncbi_protein_mapper.py
@@ -281,6 +281,8 @@ def add_to_db(self, response: Response) -> None:
protein = Protein(**protein_data)
protein.save()
+ if not isinstance(organism, Organism):
+ raise TypeError(f"Expected Organism, but got {type(organism)}")
protein.organism.connect(organism)
# Add features
diff --git a/src/pyeed/adapter/ncbi_to_uniprot_mapper.py b/src/pyeed/adapter/ncbi_to_uniprot_mapper.py
new file mode 100644
index 00000000..1373547a
--- /dev/null
+++ b/src/pyeed/adapter/ncbi_to_uniprot_mapper.py
@@ -0,0 +1,132 @@
+import json
+import logging
+import os
+import sys
+from typing import List
+
+import httpx
+from crc64iso import crc64iso
+from pysam import FastaFile
+
+logger = logging.getLogger(__name__)
+
+
+class NCBIToUniprotMapper:
+ def __init__(self, ids: List[str], file: str):
+ self.ids = ids
+ self.file = file
+ self.uniparc_url = "https://www.ebi.ac.uk/proteins/api/uniparc?offset=0&size=100&sequencechecksum="
+ self.ncbi_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
+
+ def download_fasta(self, refseq_id: str) -> None:
+ """
+ Downloads a FASTA file for a given RefSeq ID using httpx and saves it locally.
+
+ Args:
+ refseq_id str: NCBI ID
+ """
+
+ params = {
+ "db": "protein",
+ "id": refseq_id,
+ "rettype": "fasta",
+ "retmode": "text",
+ }
+
+ try:
+ response = httpx.get(self.ncbi_url, params=params, timeout=10.0)
+
+ if response.status_code == 200:
+ filename = f"{refseq_id}.fasta"
+ with open(filename, "w") as f:
+ f.write(response.text)
+ print(f"✅ Downloaded: {filename}")
+ else:
+ print(
+ f"❌ Failed to download {refseq_id} (Status: {response.status_code})"
+ )
+
+ except httpx.HTTPError as e:
+ print(f"❌ HTTP error occurred while downloading {refseq_id}: {e}")
+
+ def get_checksum(self, refseq_id: str) -> str:
+ """Fetches and calculates the checksum for a given RefSeq ID.
+
+ Args:
+ refseq_id str: NCBI ID
+
+ Returns:
+ str: checksum ID
+ """
+
+ self.download_fasta(refseq_id)
+ fa = FastaFile(f"{refseq_id}.fasta")
+ seq = fa.fetch(fa.references[0])
+ return f"{crc64iso.crc64(seq)}"
+
+ def checksum_list(self, refseq_ids: List[str]) -> List[str]:
+ """Creates a list of checksum IDs and deletes the FASTA files after processing.
+
+ Args:
+ refseq_ids str: NCBI IDs
+
+ Returns:
+ List[str]: cheksum IDs
+ """
+
+ checksums = []
+ for refseq_id in refseq_ids:
+ checksums.append(self.get_checksum(refseq_id))
+ fasta_file_path = f"{refseq_id}.fasta"
+ fai_file_path = f"{refseq_id}.fasta.fai"
+
+ if os.path.exists(fasta_file_path):
+ os.remove(fasta_file_path) # Delete the fasta file
+
+ if os.path.exists(fai_file_path):
+ os.remove(fai_file_path)
+ return checksums
+
+ def execute_request(self) -> None:
+ """Fetches the uniparc and uniprot ids for the given refseq ids and saves them in a json file."""
+
+ checksum_list = self.checksum_list(self.ids)
+
+ id_mapping_uniprot = {}
+ id_mapping_uniparc = {}
+ counter = 0
+
+ for checksum in checksum_list:
+ url = f"{self.uniparc_url}{checksum}"
+
+ # perform request and get response as JSON
+ with httpx.Client() as client:
+ response = client.get(url, headers={"Accept": "application/json"})
+
+ # check if the request was successful
+ if response.status_code != 200:
+ print(f"Request failed with status code {response.status_code}")
+ response.raise_for_status() # Raise exception for any non-200 response
+ sys.exit()
+
+ # Check if the response body is empty
+ if not response.content.strip(): # Check if the body is empty
+ print("The response body is empty.")
+ sys.exit()
+
+ # extracts the uniprot and the uniparc id from the repsonse and saves them in a dictionary
+ response_body = response.json()
+ for item in response_body:
+ uniparc_id = item.get("accession", None)
+ for ref in item.get("dbReference", []):
+ if ref.get("type") == "UniProtKB/TrEMBL":
+ uniprot_id = ref.get("id", None)
+ id_mapping_uniparc[self.ids[counter]] = uniparc_id
+ id_mapping_uniprot[self.ids[counter]] = uniprot_id
+ counter += 1
+
+ with open(f"{self.file}_uniprot.json", "w") as f:
+ json.dump(id_mapping_uniprot, f)
+
+ with open(f"{self.file}_uniparc.json", "w") as f:
+ json.dump(id_mapping_uniparc, f)
diff --git a/src/pyeed/adapter/uniprot_mapper.py b/src/pyeed/adapter/uniprot_mapper.py
index 477249eb..f52d01b3 100644
--- a/src/pyeed/adapter/uniprot_mapper.py
+++ b/src/pyeed/adapter/uniprot_mapper.py
@@ -1,9 +1,9 @@
import json
from collections import defaultdict
-from typing import Any, List
+from typing import Any, List, Optional
import requests
-from bs4 import BeautifulSoup
+from bs4 import BeautifulSoup, Tag
from httpx import Response
from loguru import logger
from SPARQLWrapper import JSON, SPARQLWrapper
@@ -82,13 +82,15 @@ def add_sites(self, record: dict[str, Any], protein: Protein) -> None:
site.save()
protein.site.connect(site, {"positions": positions})
-
- def get_substrates_and_products_from_rhea(self, rhea_id: str) -> dict[str, List[str]]:
+
+ def get_substrates_and_products_from_rhea(
+ self, rhea_id: str
+ ) -> dict[str, List[str]]:
"""Fetch substrates and products from Rhea by parsing the side URI (_L = substrate, _R = product).
-
+
Args:
rhea_id (str or int): The Rhea reaction ID (e.g., 49528)
-
+
Returns:
dict: {
'substrates': [list of chebi URIs],
@@ -118,7 +120,11 @@ def get_substrates_and_products_from_rhea(self, rhea_id: str) -> dict[str, List[
sparql.setReturnFormat(JSON)
sparql.addCustomHttpHeader("User-Agent", "MyPythonClient/1.0")
- results = sparql.query().convert()
+ results_raw = sparql.query().convert()
+ if not isinstance(results_raw, dict):
+ raise TypeError("Expected dict from SPARQL query")
+
+ results: dict[str, Any] = results_raw
substrates = set()
products = set()
@@ -134,17 +140,13 @@ def get_substrates_and_products_from_rhea(self, rhea_id: str) -> dict[str, List[
elif side_uri.endswith("_R"):
products.add(chebi_uri)
- return {
- "substrates": sorted(substrates),
- "products": sorted(products)
- }
+ return {"substrates": sorted(substrates), "products": sorted(products)}
-
- def get_smiles_from_chebi_web(self, chebi_url: str) -> str:
+ def get_smiles_from_chebi_web(self, chebi_url: str) -> Optional[str]:
"""
Extract SMILES from the official ChEBI page using HTML scraping.
"""
- chebi_id = chebi_url.split('_')[-1]
+ chebi_id = chebi_url.split("_")[-1]
url = f"https://www.ebi.ac.uk/chebi/searchId.do?chebiId=CHEBI:{chebi_id}"
response = requests.get(url)
@@ -152,12 +154,21 @@ def get_smiles_from_chebi_web(self, chebi_url: str) -> str:
# Look for table rows that contain the SMILES label
for table in soup.find_all("table", class_="chebiTableContent"):
+ if not isinstance(table, Tag):
+ continue
for row in table.find_all("tr"):
+ if not isinstance(row, Tag):
+ continue
headers = row.find_all("td", class_="chebiDataHeader")
- if headers and "SMILES" in headers[0].text:
- data_cell = row.find_all("td")[-1] # Get the last in row
- return data_cell.text.strip()
-
+ if (
+ headers
+ and isinstance(headers[0], Tag)
+ and "SMILES" in headers[0].text
+ ):
+ data_cells = row.find_all("td")
+ if data_cells:
+ return f"{data_cells[-1].text.strip()}"
+ return None
def add_reaction(self, record: dict[str, Any], protein: Protein) -> None:
for reference in record.get("comments", []): # Safe retrieval with .get()
@@ -168,39 +179,39 @@ def add_reaction(self, record: dict[str, Any], protein: Protein) -> None:
if db_ref.get("id", "").startswith("RHEA:"):
rhea_id = db_ref["id"]
break # Stop after finding the first match
-
+
catalytic_annotation = Reaction.get_or_save(
rhea_id=rhea_id,
)
- self.add_molecule(rhea_id, catalytic_annotation)
- protein.reaction.connect(catalytic_annotation)
+ if rhea_id is not None:
+ self.add_molecule(rhea_id, catalytic_annotation)
+ protein.reaction.connect(catalytic_annotation)
def add_molecule(self, rhea_id: str, reaction: Reaction) -> None:
-
chebi = self.get_substrates_and_products_from_rhea(rhea_id)
substrate_ids = chebi["substrates"]
product_ids = chebi["products"]
-
+
for i in substrate_ids:
smiles = self.get_smiles_from_chebi_web(i)
-
- chebi_id = i.split('_')[-1]
+
+ chebi_id = i.split("_")[-1]
chebi_id = f"CHEBI:{chebi_id}"
substrate = Molecule.get_or_save(
chebi_id=chebi_id,
- smiles = smiles,
+ smiles=smiles,
)
reaction.substrate.connect(substrate)
-
+
for i in product_ids:
smiles = self.get_smiles_from_chebi_web(i)
- chebi_id = i.split('_')[-1]
+ chebi_id = i.split("_")[-1]
chebi_id = f"CHEBI:{chebi_id}"
product = Molecule.get_or_save(
chebi_id=chebi_id,
- smiles = smiles,
+ smiles=smiles,
)
reaction.product.connect(product)
diff --git a/src/pyeed/analysis/mutation_detection.py b/src/pyeed/analysis/mutation_detection.py
index 5c6809e8..c2562ae1 100644
--- a/src/pyeed/analysis/mutation_detection.py
+++ b/src/pyeed/analysis/mutation_detection.py
@@ -1,7 +1,6 @@
from typing import Any, Optional
from loguru import logger
-
from pyeed.dbconnect import DatabaseConnector
diff --git a/src/pyeed/analysis/network_analysis.py b/src/pyeed/analysis/network_analysis.py
index fd354ebe..dd66b45c 100644
--- a/src/pyeed/analysis/network_analysis.py
+++ b/src/pyeed/analysis/network_analysis.py
@@ -66,57 +66,51 @@ def create_graph(
base_query = """
MATCH (n)
"""
-
+
# Add node filters
node_filters = []
if nodes:
node_filters.append("labels(n)[0] IN $node_types")
if ids:
node_filters.append("n.accession_id IN $accession_ids")
-
+
if node_filters:
base_query += "WHERE " + " AND ".join(node_filters)
-
+
# Add relationship pattern and filters
base_query += """
OPTIONAL MATCH (n)-[r]->(m)
"""
-
+
# Add relationship type filter if specified
if relationships:
base_query += "WHERE type(r) IN $relationships "
-
+
# Return both nodes and relationships in a single query
base_query += """
RETURN
collect(DISTINCT {id: ID(n), labels: labels(n), properties: properties(n)}) as nodes,
collect(DISTINCT {source: ID(n), target: ID(m), type: type(r), properties: properties(r)}) as relationships
"""
-
+
logger.info("Executing combined query for nodes and relationships")
results = self.db.execute_read(
base_query,
- {
- "node_types": nodes,
- "accession_ids": ids,
- "relationships": relationships
- }
+ {"node_types": nodes, "accession_ids": ids, "relationships": relationships},
)
-
+
if not results or not results[0]:
logger.warning("No results found in the database")
return self.graph
-
+
# Process nodes
nodes_data = results[0]["nodes"]
for node in nodes_data:
self.graph.add_node(
- node["id"],
- labels=node["labels"],
- properties=node["properties"]
+ node["id"], labels=node["labels"], properties=node["properties"]
)
logger.info(f"Added {len(nodes_data)} nodes to the graph")
-
+
# Process relationships
relationships_data = results[0]["relationships"]
for rel in relationships_data:
@@ -125,10 +119,10 @@ def create_graph(
rel["source"],
rel["target"],
type=rel["type"],
- properties=rel["properties"]
+ properties=rel["properties"],
)
logger.info(f"Added {len(relationships_data)} relationships to the graph")
-
+
return self.graph
def compute_degree_centrality(self) -> dict[Any, float]:
@@ -263,8 +257,8 @@ def calculate_positions_2d(
filtered_graph.remove_edges_from(self_referential_edges)
# Find isolated nodes
- #isolated_nodes = self.find_isolated_nodes(filtered_graph)
- #filtered_graph.remove_nodes_from(isolated_nodes)
+ # isolated_nodes = self.find_isolated_nodes(filtered_graph)
+ # filtered_graph.remove_nodes_from(isolated_nodes)
# Use spring layout for force-directed graph
weight_attr = attribute if attribute is not None else None
diff --git a/src/pyeed/analysis/sequence_alignment.py b/src/pyeed/analysis/sequence_alignment.py
index cb6acff4..440cbb1e 100644
--- a/src/pyeed/analysis/sequence_alignment.py
+++ b/src/pyeed/analysis/sequence_alignment.py
@@ -5,10 +5,9 @@
from Bio.Align import PairwiseAligner as BioPairwiseAligner
from Bio.Align.substitution_matrices import Array as BioSubstitutionMatrix
from joblib import Parallel, cpu_count, delayed
-from rich.progress import Progress
-
from pyeed.dbconnect import DatabaseConnector
from pyeed.tools.utility import chunks
+from rich.progress import Progress
class PairwiseAligner:
diff --git a/src/pyeed/dbconnect.py b/src/pyeed/dbconnect.py
index d208deec..8abcab52 100644
--- a/src/pyeed/dbconnect.py
+++ b/src/pyeed/dbconnect.py
@@ -227,8 +227,11 @@ def _get_driver(uri: str, user: str | None, password: str | None) -> Driver:
Creates a new Neo4j driver instance.
"""
auth = (user, password) if user and password else None
- return GraphDatabase.driver(uri, auth=auth, connection_timeout=60, # Increase initial connection timeout
- max_connection_lifetime=86400, # Keep connections alive longer
+ return GraphDatabase.driver(
+ uri,
+ auth=auth,
+ connection_timeout=60, # Increase initial connection timeout
+ max_connection_lifetime=86400, # Keep connections alive longer
)
@property
diff --git a/src/pyeed/embedding.py b/src/pyeed/embedding.py
index a0229385..28f66a1b 100644
--- a/src/pyeed/embedding.py
+++ b/src/pyeed/embedding.py
@@ -10,6 +10,7 @@
from huggingface_hub import HfFolder, login
from loguru import logger
from numpy.typing import NDArray
+from torch.nn import DataParallel, Module
from transformers import EsmModel, EsmTokenizer
from pyeed.dbconnect import DatabaseConnector
@@ -32,7 +33,14 @@ def get_hf_token() -> str:
raise RuntimeError("Failed to get Hugging Face token")
-def process_batches_on_gpu(data, batch_size, model, tokenizer, device, db):
+def process_batches_on_gpu(
+ data: list[tuple[str, str]],
+ batch_size: int,
+ model: Module,
+ tokenizer: EsmTokenizer,
+ db: DatabaseConnector,
+ device: torch.device,
+) -> None:
"""
Splits data into batches and processes them on a single GPU.
@@ -88,8 +96,8 @@ def process_batches_on_gpu(data, batch_size, model, tokenizer, device, db):
def load_model_and_tokenizer(
model_name: str,
- device: str,
-) -> Tuple[Any, Union[Any, None], str]:
+ device: torch.device,
+) -> Tuple[Any, Union[Any, None], torch.device]:
"""
Loads the model and assigns it to a specific GPU.
@@ -125,7 +133,7 @@ def get_batch_embeddings(
model: Union[
EsmModel,
ESMC,
- torch.nn.DataParallel,
+ DataParallel[Module],
ESM3InferenceClient,
ESM3,
],
@@ -209,7 +217,9 @@ def get_batch_embeddings(
def calculate_single_sequence_embedding_last_hidden_state(
- sequence: str, model_name: str = "facebook/esm2_t33_650M_UR50D"
+ sequence: str,
+ device: torch.device,
+ model_name: str = "facebook/esm2_t33_650M_UR50D",
) -> NDArray[np.float64]:
"""
Calculates an embedding for a single sequence.
@@ -221,12 +231,14 @@ def calculate_single_sequence_embedding_last_hidden_state(
Returns:
NDArray[np.float64]: Normalized embedding vector for the sequence
"""
- model, tokenizer, device = load_model_and_tokenizer(model_name)
+ model, tokenizer, device = load_model_and_tokenizer(model_name, device)
return get_single_embedding_last_hidden_state(sequence, model, tokenizer, device)
def calculate_single_sequence_embedding_all_layers(
- sequence: str, model_name: str = "facebook/esm2_t33_650M_UR50D"
+ sequence: str,
+ device: torch.device,
+ model_name: str = "facebook/esm2_t33_650M_UR50D",
) -> NDArray[np.float64]:
"""
Calculates embeddings for a single sequence across all layers.
@@ -238,7 +250,7 @@ def calculate_single_sequence_embedding_all_layers(
Returns:
NDArray[np.float64]: A numpy array containing layer embeddings for the sequence.
"""
- model, tokenizer, device = load_model_and_tokenizer(model_name)
+ model, tokenizer, device = load_model_and_tokenizer(model_name, device)
return get_single_embedding_all_layers(sequence, model, tokenizer, device)
diff --git a/src/pyeed/main.py b/src/pyeed/main.py
index cf0daed2..1189fcb3 100644
--- a/src/pyeed/main.py
+++ b/src/pyeed/main.py
@@ -9,6 +9,7 @@
from pyeed.adapter.ncbi_dna_mapper import NCBIDNAToPyeed
from pyeed.adapter.ncbi_protein_mapper import NCBIProteinToPyeed
+from pyeed.adapter.ncbi_to_uniprot_mapper import NCBIToUniprotMapper
from pyeed.adapter.primary_db_adapter import PrimaryDBAdapter
from pyeed.adapter.uniprot_mapper import UniprotToPyeed
from pyeed.dbchat import DBChat
@@ -190,14 +191,27 @@ def fetch_ncbi_nucleotide(self, ids: list[str]) -> None:
nest_asyncio.apply()
asyncio.get_event_loop().run_until_complete(adapter.execute_requests())
+ def database_id_mapper(self, ids: list[str], file: str) -> None:
+ """
+ Maps IDs from one database to another using the UniProt ID mapping service
+
+ Args:
+ ids (list[str]): List of IDs to map.
+ """
+
+ mapper = NCBIToUniprotMapper(ids, file)
+ mapper.execute_request()
+
+ nest_asyncio.apply()
+
def calculate_sequence_embeddings(
self,
batch_size: int = 16,
model_name: str = "facebook/esm2_t33_650M_UR50D",
- num_gpus: int = None, # Number of GPUs to use
- ) -> None:
+ num_gpus: int = 1, # Number of GPUs to use
+ ) -> None:
"""
- Calculates embeddings for all sequences in the database that do not have embeddings,
+ Calculates embeddings for all sequences in the database that do not have embeddings,
distributing the workload across available GPUs.
Args:
@@ -215,7 +229,12 @@ def calculate_sequence_embeddings(
logger.warning("No GPU available! Running on CPU.")
# Load separate models for each GPU
- devices = [f"cuda:{i}" for i in range(num_gpus)] if num_gpus > 0 else ["cpu"]
+ devices = (
+ [torch.device(f"cuda:{i}") for i in range(num_gpus)]
+ if num_gpus > 0
+ else [torch.device("cpu")]
+ )
+
models_and_tokenizers = [
load_model_and_tokenizer(model_name, device) for device in devices
]
@@ -228,18 +247,19 @@ def calculate_sequence_embeddings(
"""
results = self.db.execute_read(query)
data = [(result["accession"], result["sequence"]) for result in results]
-
+
if not data:
logger.info("No sequences to process.")
return
-
+
accessions, sequences = zip(*data)
total_sequences = len(sequences)
logger.debug(f"Total sequences to process: {total_sequences}")
# Split the data into num_gpus chunks
gpu_batches = [
- list(zip(accessions[i::num_gpus], sequences[i::num_gpus])) for i in range(num_gpus)
+ list(zip(accessions[i::num_gpus], sequences[i::num_gpus]))
+ for i in range(num_gpus)
]
start_time = time.time()
@@ -259,17 +279,18 @@ def calculate_sequence_embeddings(
batch_size,
model,
tokenizer,
+ self.db,
device,
- self.db
)
)
-
+
for future in futures:
future.result() # Wait for all threads to complete
-
end_time = time.time()
- logger.info(f"Total embedding calculation time: {end_time - start_time:.2f} seconds")
+ logger.info(
+ f"Total embedding calculation time: {end_time - start_time:.2f} seconds"
+ )
# Cleanup
for model, _, _ in models_and_tokenizers:
diff --git a/src/pyeed/model.py b/src/pyeed/model.py
index 95f9ffce..5a3bf188 100644
--- a/src/pyeed/model.py
+++ b/src/pyeed/model.py
@@ -1,5 +1,5 @@
from enum import Enum
-from typing import Any
+from typing import Any, cast
# from pyeed.nodes_and_relations import StrictStructuredNode
from neomodel import (
@@ -112,13 +112,12 @@ def save(self, *args: Any, **kwargs: Any) -> None:
elif isinstance(base_property, FloatProperty):
if not all(isinstance(item, float) for item in prop):
raise TypeError(f"All items in '{field}' must be floats")
-
- #Validate BoleanProperty
+
+ # Validate BoleanProperty
elif isinstance(neo_type, BooleanProperty) and not isinstance(prop, bool):
raise TypeError(
f"Expected a boolean for '{field}', got {type(prop).__name__}"
)
-
super().save(*args, **kwargs) # Don't return the result
@@ -154,6 +153,22 @@ class Organism(StrictStructuredNode):
taxonomy_id = IntegerProperty(required=True, unique_index=True)
name = StringProperty()
+ @classmethod
+ def get_or_save(cls, **kwargs: Any) -> "Organism":
+ taxonomy_id = kwargs.get("taxonomy_id")
+ name = kwargs.get("name")
+ try:
+ organism = cast(Organism, cls.nodes.get(taxonomy_id=taxonomy_id))
+ return organism
+ except cls.DoesNotExist:
+ try:
+ organism = cls(taxonomy_id=taxonomy_id, name=name)
+ organism.save()
+ return organism
+ except Exception as e:
+ print(f"Error during saving of the organism: {e}")
+ raise
+
class Mutation(StructuredRel): # type: ignore
"""A relationship representing mutations between two sequences."""
@@ -381,33 +396,35 @@ class Reaction(StrictStructuredNode):
"""
A node representing a reaction.
"""
-
+
rhea_id = StringProperty(unique_index=True, required=True)
chebi_id = ArrayProperty(StringProperty())
# Relationships
substrate = RelationshipTo("Molecule", "SUBSTRATE")
product = RelationshipTo("Molecule", "PRODUCT")
-
-
+
@property
def label(self) -> str:
"""The label of the reaction."""
- return {self.rhea_id}
+ return f"{self.rhea_id}"
+
class Molecule(StrictStructuredNode):
"""
A node representing a molecule in the database.
"""
-
+
chebi_id = StringProperty(unique_index=True, required=True)
rhea_compound_id = StringProperty()
smiles = StringProperty()
@classmethod
- def get_or_save(cls, chebi_id, smiles) -> "Molecule":
+ def get_or_save(cls, **kwargs: Any) -> "Molecule":
+ chebi_id = kwargs.get("chebi_id")
+ smiles = kwargs.get("smiles")
try:
- molecule = cls.nodes.get(chebi_id=chebi_id)
+ molecule = cast(Molecule, cls.nodes.get(chebi_id=chebi_id))
return molecule
except cls.DoesNotExist:
try:
@@ -417,13 +434,13 @@ def get_or_save(cls, chebi_id, smiles) -> "Molecule":
except Exception as e:
print(f"Error during saving of the molecule: {e}")
raise
-
- @property
+
+ @property
def label(self) -> str:
"""The label of the molecule."""
- return {self.chebi_id}
-
-
+ return f"{self.chebi_id}"
+
+
class StandardNumbering(StrictStructuredNode):
name = StringProperty(required=True, unique_index=True)
definition = StringProperty(required=True)
|