diff --git a/docs/usage/blast.ipynb b/docs/usage/blast.ipynb index b56140d7..d6cd57ef 100644 --- a/docs/usage/blast.ipynb +++ b/docs/usage/blast.ipynb @@ -1,363 +1,364 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# BLAST Search\n", - "\n", - "## Setup\n", - "\n", - "The BLAST service runs in a Docker container and requires:\n", - "1. A local BLAST database\n", - "2. The Docker service running" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "# change log level to INFO\n", - "import sys\n", - "from loguru import logger\n", - "\n", - "logger.remove()\n", - "level = logger.add(sys.stderr, level=\"WARNING\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Basic Usage\n", - "\n", - "The `Blast` class provides an interface to search protein or nucleotide sequences against a local BLAST database." - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
subject_ididentityalignment_lengthmismatchesgap_opensquery_startquery_endsubject_startsubject_endevaluebit_score
0seq781.8182231315111320.00322.3
1seq1100.00025001251250.00422.3
2seq261.5382610020455300.03819.2
\n", - "
" - ], - "text/plain": [ - " subject_id identity alignment_length mismatches gap_opens query_start \\\n", - "0 seq7 81.818 22 3 1 31 \n", - "1 seq1 100.000 25 0 0 1 \n", - "2 seq2 61.538 26 10 0 20 \n", - "\n", - " query_end subject_start subject_end evalue bit_score \n", - "0 51 11 32 0.003 22.3 \n", - "1 25 1 25 0.004 22.3 \n", - "2 45 5 30 0.038 19.2 " - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from pyeed.tools import Blast\n", - "\n", - "# Example protein sequence\n", - "sequence = \"MSEQVAAVAKLRAKASEAAKEAKAREAAKKLAEAAKKAKAKEAAKRAEAKLAEKAKAAKRAEAKAAKEAKRAAAKRAEAKLAEKAKAAK\"\n", - "\n", - "# Initialize BLAST search\n", - "blast = Blast(\n", - " # service_url=\"http://localhost:6001/blast\",\n", - " mode=\"blastp\", # Use blastp for protein sequences\n", - " db_path=\"/usr/local/bin/data/test_db\", # Path in Docker container\n", - " db_name=\"protein_db\", # Name of your BLAST database\n", - " evalue=0.1, # E-value threshold\n", - " max_target_seqs=10, # Maximum number of hits to return\n", - ")\n", - "\n", - "# Perform search\n", - "results = blast.search(sequence)\n", - "results" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The results are returned as a pandas DataFrame with the following columns:\n", - "- subject_id: ID of the matched sequence\n", - "- identity: Percentage identity\n", - "- alignment_length: Length of the alignment\n", - "- mismatches: Number of mismatches\n", - "- gap_opens: Number of gap openings\n", - "- query_start/end: Start/end positions in query sequence\n", - "- subject_start/end: Start/end positions in subject sequence\n", - "- evalue: Expectation value\n", - "- bit_score: Bit score" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Creating a BLAST Database\n", - "\n", - "Before using BLAST, you need to create a local database. Here's how to create one from a FASTA file:" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "```bash\n", - "# For protein sequences\n", - "makeblastdb -in proteins.fasta -dbtype prot -out blast_db/my_proteins\n", - "\n", - "# For nucleotide sequences\n", - "makeblastdb -in nucleotides.fasta -dbtype nucl -out blast_db/my_nucleotides\n", - "```\n", - "\n", - "To access the BLAST Docker container shell and create databases:\n", - "\n", - "```bash\n", - "# Enter the BLAST container shell\n", - "docker compose exec blast bash\n", - "# \n", - "# Navigate to database directory\n", - "cd /usr/local/bin/data/blast_db\n", - "# \n", - "# Create protein database\n", - "makeblastdb -in proteins.fasta -dbtype prot -out my_proteins\n", - "# \n", - "# Create nucleotide database \n", - "makeblastdb -in nucleotides.fasta -dbtype nucl -out my_nucleotides\n", - "```\n", - "Make sure your FASTA files are mounted in the container's `/usr/local/bin/data/blast_db` directory.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Advanced Usage\n", - "\n", - "You can customize the BLAST search parameters:" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
subject_ididentityalignment_lengthmismatchesgap_opensquery_startquery_endsubject_startsubject_endevaluebit_score
0seq781.8182231315111320.00322.3
1seq1100.00025001251250.00422.3
\n", - "
" - ], - "text/plain": [ - " subject_id identity alignment_length mismatches gap_opens query_start \\\n", - "0 seq7 81.818 22 3 1 31 \n", - "1 seq1 100.000 25 0 0 1 \n", - "\n", - " query_end subject_start subject_end evalue bit_score \n", - "0 51 11 32 0.003 22.3 \n", - "1 25 1 25 0.004 22.3 " - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Configure BLAST for sensitive protein search\n", - "blast = Blast(\n", - " # service_url=\"http://localhost:6001/blast\",\n", - " mode=\"blastp\",\n", - " db_path=\"/usr/local/bin/data/test_db\",\n", - " db_name=\"protein_db\",\n", - " evalue=1e-1, # More stringent E-value\n", - " max_target_seqs=100, # Return more hits\n", - " num_threads=4, # Use 4 CPU threads\n", - ")\n", - "\n", - "# Search with longer timeout\n", - "results = blast.search(sequence, timeout=7200) # 2 hour timeout\n", - "\n", - "# Filter results\n", - "significant_hits = results[results[\"identity\"] > 80] # Only hits with >90% identity\n", - "significant_hits" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Thereafter, the ids of the hits can be added to the pyeed database, using the `fetch_from_primary_db` function." - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "pyeed", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.5" - } - }, - "nbformat": 4, - "nbformat_minor": 2 + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# BLAST Search\n", + "\n", + "## Setup\n", + "\n", + "The BLAST service runs in a Docker container and requires:\n", + "1. A local BLAST database\n", + "2. The Docker service running" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "# change log level to INFO\n", + "import sys\n", + "\n", + "from loguru import logger\n", + "\n", + "logger.remove()\n", + "level = logger.add(sys.stderr, level=\"WARNING\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Basic Usage\n", + "\n", + "The `Blast` class provides an interface to search protein or nucleotide sequences against a local BLAST database." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
subject_ididentityalignment_lengthmismatchesgap_opensquery_startquery_endsubject_startsubject_endevaluebit_score
0seq781.8182231315111320.00322.3
1seq1100.00025001251250.00422.3
2seq261.5382610020455300.03819.2
\n", + "
" + ], + "text/plain": [ + " subject_id identity alignment_length mismatches gap_opens query_start \\\n", + "0 seq7 81.818 22 3 1 31 \n", + "1 seq1 100.000 25 0 0 1 \n", + "2 seq2 61.538 26 10 0 20 \n", + "\n", + " query_end subject_start subject_end evalue bit_score \n", + "0 51 11 32 0.003 22.3 \n", + "1 25 1 25 0.004 22.3 \n", + "2 45 5 30 0.038 19.2 " + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from pyeed.tools import Blast\n", + "\n", + "# Example protein sequence\n", + "sequence = \"MSEQVAAVAKLRAKASEAAKEAKAREAAKKLAEAAKKAKAKEAAKRAEAKLAEKAKAAKRAEAKAAKEAKRAAAKRAEAKLAEKAKAAK\"\n", + "\n", + "# Initialize BLAST search\n", + "blast = Blast(\n", + " # service_url=\"http://localhost:6001/blast\",\n", + " mode=\"blastp\", # Use blastp for protein sequences\n", + " db_path=\"/usr/local/bin/data/test_db\", # Path in Docker container\n", + " db_name=\"protein_db\", # Name of your BLAST database\n", + " evalue=0.1, # E-value threshold\n", + " max_target_seqs=10, # Maximum number of hits to return\n", + ")\n", + "\n", + "# Perform search\n", + "results = blast.search(sequence)\n", + "results" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The results are returned as a pandas DataFrame with the following columns:\n", + "- subject_id: ID of the matched sequence\n", + "- identity: Percentage identity\n", + "- alignment_length: Length of the alignment\n", + "- mismatches: Number of mismatches\n", + "- gap_opens: Number of gap openings\n", + "- query_start/end: Start/end positions in query sequence\n", + "- subject_start/end: Start/end positions in subject sequence\n", + "- evalue: Expectation value\n", + "- bit_score: Bit score" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Creating a BLAST Database\n", + "\n", + "Before using BLAST, you need to create a local database. Here's how to create one from a FASTA file:" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "```bash\n", + "# For protein sequences\n", + "makeblastdb -in proteins.fasta -dbtype prot -out blast_db/my_proteins\n", + "\n", + "# For nucleotide sequences\n", + "makeblastdb -in nucleotides.fasta -dbtype nucl -out blast_db/my_nucleotides\n", + "```\n", + "\n", + "To access the BLAST Docker container shell and create databases:\n", + "\n", + "```bash\n", + "# Enter the BLAST container shell\n", + "docker compose exec blast bash\n", + "# \n", + "# Navigate to database directory\n", + "cd /usr/local/bin/data/blast_db\n", + "# \n", + "# Create protein database\n", + "makeblastdb -in proteins.fasta -dbtype prot -out my_proteins\n", + "# \n", + "# Create nucleotide database \n", + "makeblastdb -in nucleotides.fasta -dbtype nucl -out my_nucleotides\n", + "```\n", + "Make sure your FASTA files are mounted in the container's `/usr/local/bin/data/blast_db` directory.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Advanced Usage\n", + "\n", + "You can customize the BLAST search parameters:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
subject_ididentityalignment_lengthmismatchesgap_opensquery_startquery_endsubject_startsubject_endevaluebit_score
0seq781.8182231315111320.00322.3
1seq1100.00025001251250.00422.3
\n", + "
" + ], + "text/plain": [ + " subject_id identity alignment_length mismatches gap_opens query_start \\\n", + "0 seq7 81.818 22 3 1 31 \n", + "1 seq1 100.000 25 0 0 1 \n", + "\n", + " query_end subject_start subject_end evalue bit_score \n", + "0 51 11 32 0.003 22.3 \n", + "1 25 1 25 0.004 22.3 " + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Configure BLAST for sensitive protein search\n", + "blast = Blast(\n", + " # service_url=\"http://localhost:6001/blast\",\n", + " mode=\"blastp\",\n", + " db_path=\"/usr/local/bin/data/test_db\",\n", + " db_name=\"protein_db\",\n", + " evalue=1e-1, # More stringent E-value\n", + " max_target_seqs=100, # Return more hits\n", + " num_threads=4, # Use 4 CPU threads\n", + ")\n", + "\n", + "# Search with longer timeout\n", + "results = blast.search(sequence, timeout=7200) # 2 hour timeout\n", + "\n", + "# Filter results\n", + "significant_hits = results[results[\"identity\"] > 80] # Only hits with >90% identity\n", + "significant_hits" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Thereafter, the ids of the hits can be added to the pyeed database, using the `fetch_from_primary_db` function." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "pyeed", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 } diff --git a/docs/usage/clustalo.ipynb b/docs/usage/clustalo.ipynb index 64ed62ee..d3ba2fba 100644 --- a/docs/usage/clustalo.ipynb +++ b/docs/usage/clustalo.ipynb @@ -1,171 +1,171 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Multiple Sequence Alignment with Clustal Omega\n", - "\n", - "PyEED provides a convenient interface to Clustal Omega for multiple sequence alignment. This notebook demonstrates how to:\n", - "1. Align sequences from a dictionary\n", - "2. Align sequences directly from the database" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [], - "source": [ - "from pyeed import Pyeed\n", - "from pyeed.tools.clustalo import ClustalOmega\n", - "\n", - "# change log level to INFO\n", - "import sys\n", - "from loguru import logger\n", - "\n", - "logger.remove()\n", - "level = logger.add(sys.stderr, level=\"INFO\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Direct Sequence Alignment\n", - "\n", - "You can align sequences directly by providing a dictionary of sequences:" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Aligned sequences:\n", - "seq1 AKFVMPDRAWHLYTGNECSKQRLYVWFHDGAPILKTQSDNMGAYRCPLFHVTKNWEI\n", - "seq2 AKFVMPDRQWHLYTGQECSKQRLYVWFHDGAPILKTQSDNMGAYRCPLFHVTKNWEI\n", - "seq3 AKFVMPDRQWHLYTGNECSKQRLYVWFHDGAPILKTQADNMGAYRCALFHVTK----\n" - ] - } - ], - "source": [ - "# Initialize ClustalOmega\n", - "clustalo = ClustalOmega()\n", - "\n", - "# Example sequences\n", - "sequences = {\n", - " \"seq1\": \"AKFVMPDRAWHLYTGNECSKQRLYVWFHDGAPILKTQSDNMGAYRCPLFHVTKNWEI\",\n", - " \"seq2\": \"AKFVMPDRQWHLYTGQECSKQRLYVWFHDGAPILKTQSDNMGAYRCPLFHVTKNWEI\",\n", - " \"seq3\": \"AKFVMPDRQWHLYTGNECSKQRLYVWFHDGAPILKTQADNMGAYRCALFHVTK\",\n", - "}\n", - "\n", - "# Perform alignment\n", - "alignment = clustalo.align(sequences)\n", - "print(\"Aligned sequences:\")\n", - "print(alignment)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Database-based Alignment\n", - "\n", - "You can also align sequences directly from the database by providing a list of accession IDs:" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Pyeed Graph Object Mapping constraints not defined. Use _install_labels() to set up model constraints.\n", - "📡 Connected to database.\n", - "Database alignment:\n", - "AAP20891.1 MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDQLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVEYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDRWEPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGAGERGSRGIIAALGPDGKPSRIVVIYTTGSQATMDERNRQIAEIGASLIKHW\n", - "CAJ85677.1 MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDKLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVEYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDRWEPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGAGERGSRGIIAALGPDGKPSRIVVIYTTGSQATMDERNRQIAEIGASLIKHW\n", - "SAQ02853.1 MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDKLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVKYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDRWEPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGASERGSRGIIAALGPDGKPSRIVVIYTTGSQATMDERNRQIAEIGASLIKHW\n", - "CDR98216.1 MSIQHFRVALIPFFAAFCFPVFAHPETLVKVKDAEDQLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVKYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDRWEPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGASERGSRGIIAALGPDGKPSRIVVIYMTGSQATMDERNRQIAEIGASLIKHW\n", - "WP_109963600.1 MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDQLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVEYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDSWEPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGTGKRGSRGIIAALGPDGKPSRIVVIYTTGSQATMDERNRQIAEIGASLIKHW\n", - "CAA41038.1 MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDQLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVKYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDHWEPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGAGERGSRGIIAALGPDGKPSRIVVIYTTGSQATMDERNRQIAEIGASLIKHW\n", - "WP_109874025.1 MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDKLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVEYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDSWEPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGAGERGSRGIIAALGPDGKPSRIVVIYTTGSQATMDERNRQIAEIGASLIKHW\n", - "CAA46344.1 MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDKLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVKYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDSWEPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGASERGSRGIIAALGPDGKPSRIVVIYTTGSQATMDERNRQIAEIGASLIKHW\n", - "APG33178.1 MSIQHFRVALIPFFAAFCFPVFAHPETLVKVKDAEDQLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVKYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDSWEPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGAGERGSRGIIAALGPDGKPSRIVVIYMTGSQATMDERNRQIAEIGASLIKHW\n", - "AKC98298.1 MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDKLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVEYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDHWEPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGAGERGSRGIIAALGPDGKPSRIVVIYTTGSQATMDERNRQIAEIGASLIKHW\n" - ] - } - ], - "source": [ - "# Connect to database\n", - "pyeed = Pyeed(uri=\"bolt://129.69.129.130:7687\", user=\"neo4j\", password=\"12345678\")\n", - "\n", - "# Get protein IDs from database\n", - "from pyeed.model import Protein\n", - "\n", - "accession_ids = [protein.accession_id for protein in Protein.nodes.all()][:10]\n", - "\n", - "# Align sequences from database\n", - "alignment = clustalo.align_from_db(accession_ids, pyeed.db)\n", - "print(\"Database alignment:\")\n", - "print(alignment)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Understanding Alignment Results\n", - "\n", - "The alignment result is a `MultipleSequenceAlignment` object with:\n", - "- List of `Sequence` objects\n", - "- Each sequence has an ID and aligned sequence\n", - "- Gaps are represented by '-' characters\n", - "- Sequences are padded to equal length\n", - "\n", - "The alignment preserves sequence order and maintains sequence IDs from the input." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Configuration\n", - "\n", - "ClustalOmega requires the PyEED Docker service to be running. Make sure to:\n", - "1. Have Docker installed\n", - "2. Start the service with `docker-compose up -d`\n", - "3. The service runs on port 5001 by default" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "pyeed_niklas", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.12.8" - } - }, - "nbformat": 4, - "nbformat_minor": 2 + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Multiple Sequence Alignment with Clustal Omega\n", + "\n", + "PyEED provides a convenient interface to Clustal Omega for multiple sequence alignment. This notebook demonstrates how to:\n", + "1. Align sequences from a dictionary\n", + "2. Align sequences directly from the database" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "# change log level to INFO\n", + "import sys\n", + "\n", + "from loguru import logger\n", + "\n", + "from pyeed import Pyeed\n", + "from pyeed.model import Protein\n", + "from pyeed.tools.clustalo import ClustalOmega\n", + "\n", + "logger.remove()\n", + "level = logger.add(sys.stderr, level=\"INFO\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Direct Sequence Alignment\n", + "\n", + "You can align sequences directly by providing a dictionary of sequences:" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Aligned sequences:\n", + "seq1 AKFVMPDRAWHLYTGNECSKQRLYVWFHDGAPILKTQSDNMGAYRCPLFHVTKNWEI\n", + "seq2 AKFVMPDRQWHLYTGQECSKQRLYVWFHDGAPILKTQSDNMGAYRCPLFHVTKNWEI\n", + "seq3 AKFVMPDRQWHLYTGNECSKQRLYVWFHDGAPILKTQADNMGAYRCALFHVTK----\n" + ] + } + ], + "source": [ + "# Initialize ClustalOmega\n", + "clustalo = ClustalOmega()\n", + "\n", + "# Example sequences\n", + "sequences = {\n", + " \"seq1\": \"AKFVMPDRAWHLYTGNECSKQRLYVWFHDGAPILKTQSDNMGAYRCPLFHVTKNWEI\",\n", + " \"seq2\": \"AKFVMPDRQWHLYTGQECSKQRLYVWFHDGAPILKTQSDNMGAYRCPLFHVTKNWEI\",\n", + " \"seq3\": \"AKFVMPDRQWHLYTGNECSKQRLYVWFHDGAPILKTQADNMGAYRCALFHVTK\",\n", + "}\n", + "\n", + "# Perform alignment\n", + "alignment = clustalo.align(sequences)\n", + "print(\"Aligned sequences:\")\n", + "print(alignment)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Database-based Alignment\n", + "\n", + "You can also align sequences directly from the database by providing a list of accession IDs:" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Pyeed Graph Object Mapping constraints not defined. Use _install_labels() to set up model constraints.\n", + "📡 Connected to database.\n", + "Database alignment:\n", + "AAP20891.1 MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDQLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVEYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDRWEPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGAGERGSRGIIAALGPDGKPSRIVVIYTTGSQATMDERNRQIAEIGASLIKHW\n", + "CAJ85677.1 MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDKLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVEYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDRWEPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGAGERGSRGIIAALGPDGKPSRIVVIYTTGSQATMDERNRQIAEIGASLIKHW\n", + "SAQ02853.1 MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDKLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVKYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDRWEPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGASERGSRGIIAALGPDGKPSRIVVIYTTGSQATMDERNRQIAEIGASLIKHW\n", + "CDR98216.1 MSIQHFRVALIPFFAAFCFPVFAHPETLVKVKDAEDQLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVKYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDRWEPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGASERGSRGIIAALGPDGKPSRIVVIYMTGSQATMDERNRQIAEIGASLIKHW\n", + "WP_109963600.1 MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDQLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVEYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDSWEPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGTGKRGSRGIIAALGPDGKPSRIVVIYTTGSQATMDERNRQIAEIGASLIKHW\n", + "CAA41038.1 MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDQLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVKYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDHWEPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGAGERGSRGIIAALGPDGKPSRIVVIYTTGSQATMDERNRQIAEIGASLIKHW\n", + "WP_109874025.1 MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDKLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVEYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDSWEPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGAGERGSRGIIAALGPDGKPSRIVVIYTTGSQATMDERNRQIAEIGASLIKHW\n", + "CAA46344.1 MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDKLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVKYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDSWEPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGASERGSRGIIAALGPDGKPSRIVVIYTTGSQATMDERNRQIAEIGASLIKHW\n", + "APG33178.1 MSIQHFRVALIPFFAAFCFPVFAHPETLVKVKDAEDQLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVKYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDSWEPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGAGERGSRGIIAALGPDGKPSRIVVIYMTGSQATMDERNRQIAEIGASLIKHW\n", + "AKC98298.1 MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDKLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVEYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDHWEPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGAGERGSRGIIAALGPDGKPSRIVVIYTTGSQATMDERNRQIAEIGASLIKHW\n" + ] + } + ], + "source": [ + "# Connect to database\n", + "pyeed = Pyeed(uri=\"bolt://129.69.129.130:7687\", user=\"neo4j\", password=\"12345678\")\n", + "\n", + "# Get protein IDs from database\n", + "accession_ids = [protein.accession_id for protein in Protein.nodes.all()][:10]\n", + "\n", + "# Align sequences from database\n", + "alignment = clustalo.align_from_db(accession_ids, pyeed.db)\n", + "print(\"Database alignment:\")\n", + "print(alignment)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Understanding Alignment Results\n", + "\n", + "The alignment result is a `MultipleSequenceAlignment` object with:\n", + "- List of `Sequence` objects\n", + "- Each sequence has an ID and aligned sequence\n", + "- Gaps are represented by '-' characters\n", + "- Sequences are padded to equal length\n", + "\n", + "The alignment preserves sequence order and maintains sequence IDs from the input." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Configuration\n", + "\n", + "ClustalOmega requires the PyEED Docker service to be running. Make sure to:\n", + "1. Have Docker installed\n", + "2. Start the service with `docker-compose up -d`\n", + "3. The service runs on port 5001 by default" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "pyeed_niklas", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.8" + } + }, + "nbformat": 4, + "nbformat_minor": 2 } diff --git a/docs/usage/embeddings_analysis.ipynb b/docs/usage/embeddings_analysis.ipynb index 65a2398c..0b72e743 100644 --- a/docs/usage/embeddings_analysis.ipynb +++ b/docs/usage/embeddings_analysis.ipynb @@ -24,9 +24,10 @@ "source": [ "import sys\n", "\n", - "from loguru import logger\n", - "import pandas as pd\n", "import matplotlib.pyplot as plt\n", + "import pandas as pd\n", + "from loguru import logger\n", + "\n", "from pyeed import Pyeed\n", "from pyeed.analysis.embedding_analysis import EmbeddingTool\n", "\n", diff --git a/docs/usage/mmseqs.ipynb b/docs/usage/mmseqs.ipynb index 1185c6fe..2253fd8a 100644 --- a/docs/usage/mmseqs.ipynb +++ b/docs/usage/mmseqs.ipynb @@ -20,6 +20,7 @@ "outputs": [], "source": [ "from pyeed import Pyeed\n", + "from pyeed.model import Protein\n", "from pyeed.tools.mmseqs import MMSeqs" ] }, @@ -134,8 +135,6 @@ "pyeed = Pyeed(uri=\"bolt://localhost:7687\", user=\"neo4j\", password=\"12345678\")\n", "\n", "# Get first 100 protein IDs from database\n", - "from pyeed.model import Protein\n", - "\n", "accession_ids = [protein.accession_id for protein in Protein.nodes.all()][:100]\n", "\n", "# Cluster sequences\n", diff --git a/docs/usage/mutation_analysis.ipynb b/docs/usage/mutation_analysis.ipynb index 9b31c996..7d10d360 100644 --- a/docs/usage/mutation_analysis.ipynb +++ b/docs/usage/mutation_analysis.ipynb @@ -16,6 +16,7 @@ "outputs": [], "source": [ "import sys\n", + "\n", "from loguru import logger\n", "\n", "from pyeed import Pyeed\n", diff --git a/docs/usage/network_analysis.ipynb b/docs/usage/network_analysis.ipynb index 4d45db71..0b254610 100644 --- a/docs/usage/network_analysis.ipynb +++ b/docs/usage/network_analysis.ipynb @@ -11,6 +11,7 @@ "import matplotlib.pyplot as plt\n", "import networkx as nx\n", "from loguru import logger\n", + "\n", "from pyeed import Pyeed\n", "from pyeed.analysis.network_analysis import NetworkAnalysis\n", "from pyeed.analysis.sequence_alignment import PairwiseAligner\n", diff --git a/docs/usage/standard_numbering.ipynb b/docs/usage/standard_numbering.ipynb index cd84cad9..54374cd6 100644 --- a/docs/usage/standard_numbering.ipynb +++ b/docs/usage/standard_numbering.ipynb @@ -23,10 +23,10 @@ "%reload_ext autoreload\n", "%autoreload 2\n", "import sys\n", + "\n", "from loguru import logger\n", "\n", "from pyeed import Pyeed\n", - "from pyeed.analysis.mutation_detection import MutationDetection\n", "from pyeed.analysis.standard_numbering import StandardNumberingTool\n", "\n", "logger.remove()\n", diff --git a/pyproject.toml b/pyproject.toml index 86edb7ea..b9897071 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,7 +36,10 @@ esm = "^3.1.3" rdflib = "^6.0.0" docker = "5.0.0" absl-py = "1.0.0" +crc64iso = "0.0.2" SPARQLWrapper = "2.0.0" +pysam = "0.23.0" +types-requests = "2.32.0.20250328" [tool.poetry.group.dev.dependencies] mkdocstrings = {extras = ["python"], version = "^0.26.2"} diff --git a/src/pyeed/adapter/ncbi_protein_mapper.py b/src/pyeed/adapter/ncbi_protein_mapper.py index e11d4fe7..3ecf485c 100644 --- a/src/pyeed/adapter/ncbi_protein_mapper.py +++ b/src/pyeed/adapter/ncbi_protein_mapper.py @@ -281,6 +281,8 @@ def add_to_db(self, response: Response) -> None: protein = Protein(**protein_data) protein.save() + if not isinstance(organism, Organism): + raise TypeError(f"Expected Organism, but got {type(organism)}") protein.organism.connect(organism) # Add features diff --git a/src/pyeed/adapter/ncbi_to_uniprot_mapper.py b/src/pyeed/adapter/ncbi_to_uniprot_mapper.py new file mode 100644 index 00000000..1373547a --- /dev/null +++ b/src/pyeed/adapter/ncbi_to_uniprot_mapper.py @@ -0,0 +1,132 @@ +import json +import logging +import os +import sys +from typing import List + +import httpx +from crc64iso import crc64iso +from pysam import FastaFile + +logger = logging.getLogger(__name__) + + +class NCBIToUniprotMapper: + def __init__(self, ids: List[str], file: str): + self.ids = ids + self.file = file + self.uniparc_url = "https://www.ebi.ac.uk/proteins/api/uniparc?offset=0&size=100&sequencechecksum=" + self.ncbi_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi" + + def download_fasta(self, refseq_id: str) -> None: + """ + Downloads a FASTA file for a given RefSeq ID using httpx and saves it locally. + + Args: + refseq_id str: NCBI ID + """ + + params = { + "db": "protein", + "id": refseq_id, + "rettype": "fasta", + "retmode": "text", + } + + try: + response = httpx.get(self.ncbi_url, params=params, timeout=10.0) + + if response.status_code == 200: + filename = f"{refseq_id}.fasta" + with open(filename, "w") as f: + f.write(response.text) + print(f"✅ Downloaded: {filename}") + else: + print( + f"❌ Failed to download {refseq_id} (Status: {response.status_code})" + ) + + except httpx.HTTPError as e: + print(f"❌ HTTP error occurred while downloading {refseq_id}: {e}") + + def get_checksum(self, refseq_id: str) -> str: + """Fetches and calculates the checksum for a given RefSeq ID. + + Args: + refseq_id str: NCBI ID + + Returns: + str: checksum ID + """ + + self.download_fasta(refseq_id) + fa = FastaFile(f"{refseq_id}.fasta") + seq = fa.fetch(fa.references[0]) + return f"{crc64iso.crc64(seq)}" + + def checksum_list(self, refseq_ids: List[str]) -> List[str]: + """Creates a list of checksum IDs and deletes the FASTA files after processing. + + Args: + refseq_ids str: NCBI IDs + + Returns: + List[str]: cheksum IDs + """ + + checksums = [] + for refseq_id in refseq_ids: + checksums.append(self.get_checksum(refseq_id)) + fasta_file_path = f"{refseq_id}.fasta" + fai_file_path = f"{refseq_id}.fasta.fai" + + if os.path.exists(fasta_file_path): + os.remove(fasta_file_path) # Delete the fasta file + + if os.path.exists(fai_file_path): + os.remove(fai_file_path) + return checksums + + def execute_request(self) -> None: + """Fetches the uniparc and uniprot ids for the given refseq ids and saves them in a json file.""" + + checksum_list = self.checksum_list(self.ids) + + id_mapping_uniprot = {} + id_mapping_uniparc = {} + counter = 0 + + for checksum in checksum_list: + url = f"{self.uniparc_url}{checksum}" + + # perform request and get response as JSON + with httpx.Client() as client: + response = client.get(url, headers={"Accept": "application/json"}) + + # check if the request was successful + if response.status_code != 200: + print(f"Request failed with status code {response.status_code}") + response.raise_for_status() # Raise exception for any non-200 response + sys.exit() + + # Check if the response body is empty + if not response.content.strip(): # Check if the body is empty + print("The response body is empty.") + sys.exit() + + # extracts the uniprot and the uniparc id from the repsonse and saves them in a dictionary + response_body = response.json() + for item in response_body: + uniparc_id = item.get("accession", None) + for ref in item.get("dbReference", []): + if ref.get("type") == "UniProtKB/TrEMBL": + uniprot_id = ref.get("id", None) + id_mapping_uniparc[self.ids[counter]] = uniparc_id + id_mapping_uniprot[self.ids[counter]] = uniprot_id + counter += 1 + + with open(f"{self.file}_uniprot.json", "w") as f: + json.dump(id_mapping_uniprot, f) + + with open(f"{self.file}_uniparc.json", "w") as f: + json.dump(id_mapping_uniparc, f) diff --git a/src/pyeed/adapter/uniprot_mapper.py b/src/pyeed/adapter/uniprot_mapper.py index 477249eb..f52d01b3 100644 --- a/src/pyeed/adapter/uniprot_mapper.py +++ b/src/pyeed/adapter/uniprot_mapper.py @@ -1,9 +1,9 @@ import json from collections import defaultdict -from typing import Any, List +from typing import Any, List, Optional import requests -from bs4 import BeautifulSoup +from bs4 import BeautifulSoup, Tag from httpx import Response from loguru import logger from SPARQLWrapper import JSON, SPARQLWrapper @@ -82,13 +82,15 @@ def add_sites(self, record: dict[str, Any], protein: Protein) -> None: site.save() protein.site.connect(site, {"positions": positions}) - - def get_substrates_and_products_from_rhea(self, rhea_id: str) -> dict[str, List[str]]: + + def get_substrates_and_products_from_rhea( + self, rhea_id: str + ) -> dict[str, List[str]]: """Fetch substrates and products from Rhea by parsing the side URI (_L = substrate, _R = product). - + Args: rhea_id (str or int): The Rhea reaction ID (e.g., 49528) - + Returns: dict: { 'substrates': [list of chebi URIs], @@ -118,7 +120,11 @@ def get_substrates_and_products_from_rhea(self, rhea_id: str) -> dict[str, List[ sparql.setReturnFormat(JSON) sparql.addCustomHttpHeader("User-Agent", "MyPythonClient/1.0") - results = sparql.query().convert() + results_raw = sparql.query().convert() + if not isinstance(results_raw, dict): + raise TypeError("Expected dict from SPARQL query") + + results: dict[str, Any] = results_raw substrates = set() products = set() @@ -134,17 +140,13 @@ def get_substrates_and_products_from_rhea(self, rhea_id: str) -> dict[str, List[ elif side_uri.endswith("_R"): products.add(chebi_uri) - return { - "substrates": sorted(substrates), - "products": sorted(products) - } + return {"substrates": sorted(substrates), "products": sorted(products)} - - def get_smiles_from_chebi_web(self, chebi_url: str) -> str: + def get_smiles_from_chebi_web(self, chebi_url: str) -> Optional[str]: """ Extract SMILES from the official ChEBI page using HTML scraping. """ - chebi_id = chebi_url.split('_')[-1] + chebi_id = chebi_url.split("_")[-1] url = f"https://www.ebi.ac.uk/chebi/searchId.do?chebiId=CHEBI:{chebi_id}" response = requests.get(url) @@ -152,12 +154,21 @@ def get_smiles_from_chebi_web(self, chebi_url: str) -> str: # Look for table rows that contain the SMILES label for table in soup.find_all("table", class_="chebiTableContent"): + if not isinstance(table, Tag): + continue for row in table.find_all("tr"): + if not isinstance(row, Tag): + continue headers = row.find_all("td", class_="chebiDataHeader") - if headers and "SMILES" in headers[0].text: - data_cell = row.find_all("td")[-1] # Get the last in row - return data_cell.text.strip() - + if ( + headers + and isinstance(headers[0], Tag) + and "SMILES" in headers[0].text + ): + data_cells = row.find_all("td") + if data_cells: + return f"{data_cells[-1].text.strip()}" + return None def add_reaction(self, record: dict[str, Any], protein: Protein) -> None: for reference in record.get("comments", []): # Safe retrieval with .get() @@ -168,39 +179,39 @@ def add_reaction(self, record: dict[str, Any], protein: Protein) -> None: if db_ref.get("id", "").startswith("RHEA:"): rhea_id = db_ref["id"] break # Stop after finding the first match - + catalytic_annotation = Reaction.get_or_save( rhea_id=rhea_id, ) - self.add_molecule(rhea_id, catalytic_annotation) - protein.reaction.connect(catalytic_annotation) + if rhea_id is not None: + self.add_molecule(rhea_id, catalytic_annotation) + protein.reaction.connect(catalytic_annotation) def add_molecule(self, rhea_id: str, reaction: Reaction) -> None: - chebi = self.get_substrates_and_products_from_rhea(rhea_id) substrate_ids = chebi["substrates"] product_ids = chebi["products"] - + for i in substrate_ids: smiles = self.get_smiles_from_chebi_web(i) - - chebi_id = i.split('_')[-1] + + chebi_id = i.split("_")[-1] chebi_id = f"CHEBI:{chebi_id}" substrate = Molecule.get_or_save( chebi_id=chebi_id, - smiles = smiles, + smiles=smiles, ) reaction.substrate.connect(substrate) - + for i in product_ids: smiles = self.get_smiles_from_chebi_web(i) - chebi_id = i.split('_')[-1] + chebi_id = i.split("_")[-1] chebi_id = f"CHEBI:{chebi_id}" product = Molecule.get_or_save( chebi_id=chebi_id, - smiles = smiles, + smiles=smiles, ) reaction.product.connect(product) diff --git a/src/pyeed/analysis/mutation_detection.py b/src/pyeed/analysis/mutation_detection.py index 5c6809e8..c2562ae1 100644 --- a/src/pyeed/analysis/mutation_detection.py +++ b/src/pyeed/analysis/mutation_detection.py @@ -1,7 +1,6 @@ from typing import Any, Optional from loguru import logger - from pyeed.dbconnect import DatabaseConnector diff --git a/src/pyeed/analysis/network_analysis.py b/src/pyeed/analysis/network_analysis.py index fd354ebe..dd66b45c 100644 --- a/src/pyeed/analysis/network_analysis.py +++ b/src/pyeed/analysis/network_analysis.py @@ -66,57 +66,51 @@ def create_graph( base_query = """ MATCH (n) """ - + # Add node filters node_filters = [] if nodes: node_filters.append("labels(n)[0] IN $node_types") if ids: node_filters.append("n.accession_id IN $accession_ids") - + if node_filters: base_query += "WHERE " + " AND ".join(node_filters) - + # Add relationship pattern and filters base_query += """ OPTIONAL MATCH (n)-[r]->(m) """ - + # Add relationship type filter if specified if relationships: base_query += "WHERE type(r) IN $relationships " - + # Return both nodes and relationships in a single query base_query += """ RETURN collect(DISTINCT {id: ID(n), labels: labels(n), properties: properties(n)}) as nodes, collect(DISTINCT {source: ID(n), target: ID(m), type: type(r), properties: properties(r)}) as relationships """ - + logger.info("Executing combined query for nodes and relationships") results = self.db.execute_read( base_query, - { - "node_types": nodes, - "accession_ids": ids, - "relationships": relationships - } + {"node_types": nodes, "accession_ids": ids, "relationships": relationships}, ) - + if not results or not results[0]: logger.warning("No results found in the database") return self.graph - + # Process nodes nodes_data = results[0]["nodes"] for node in nodes_data: self.graph.add_node( - node["id"], - labels=node["labels"], - properties=node["properties"] + node["id"], labels=node["labels"], properties=node["properties"] ) logger.info(f"Added {len(nodes_data)} nodes to the graph") - + # Process relationships relationships_data = results[0]["relationships"] for rel in relationships_data: @@ -125,10 +119,10 @@ def create_graph( rel["source"], rel["target"], type=rel["type"], - properties=rel["properties"] + properties=rel["properties"], ) logger.info(f"Added {len(relationships_data)} relationships to the graph") - + return self.graph def compute_degree_centrality(self) -> dict[Any, float]: @@ -263,8 +257,8 @@ def calculate_positions_2d( filtered_graph.remove_edges_from(self_referential_edges) # Find isolated nodes - #isolated_nodes = self.find_isolated_nodes(filtered_graph) - #filtered_graph.remove_nodes_from(isolated_nodes) + # isolated_nodes = self.find_isolated_nodes(filtered_graph) + # filtered_graph.remove_nodes_from(isolated_nodes) # Use spring layout for force-directed graph weight_attr = attribute if attribute is not None else None diff --git a/src/pyeed/analysis/sequence_alignment.py b/src/pyeed/analysis/sequence_alignment.py index cb6acff4..440cbb1e 100644 --- a/src/pyeed/analysis/sequence_alignment.py +++ b/src/pyeed/analysis/sequence_alignment.py @@ -5,10 +5,9 @@ from Bio.Align import PairwiseAligner as BioPairwiseAligner from Bio.Align.substitution_matrices import Array as BioSubstitutionMatrix from joblib import Parallel, cpu_count, delayed -from rich.progress import Progress - from pyeed.dbconnect import DatabaseConnector from pyeed.tools.utility import chunks +from rich.progress import Progress class PairwiseAligner: diff --git a/src/pyeed/dbconnect.py b/src/pyeed/dbconnect.py index d208deec..8abcab52 100644 --- a/src/pyeed/dbconnect.py +++ b/src/pyeed/dbconnect.py @@ -227,8 +227,11 @@ def _get_driver(uri: str, user: str | None, password: str | None) -> Driver: Creates a new Neo4j driver instance. """ auth = (user, password) if user and password else None - return GraphDatabase.driver(uri, auth=auth, connection_timeout=60, # Increase initial connection timeout - max_connection_lifetime=86400, # Keep connections alive longer + return GraphDatabase.driver( + uri, + auth=auth, + connection_timeout=60, # Increase initial connection timeout + max_connection_lifetime=86400, # Keep connections alive longer ) @property diff --git a/src/pyeed/embedding.py b/src/pyeed/embedding.py index a0229385..28f66a1b 100644 --- a/src/pyeed/embedding.py +++ b/src/pyeed/embedding.py @@ -10,6 +10,7 @@ from huggingface_hub import HfFolder, login from loguru import logger from numpy.typing import NDArray +from torch.nn import DataParallel, Module from transformers import EsmModel, EsmTokenizer from pyeed.dbconnect import DatabaseConnector @@ -32,7 +33,14 @@ def get_hf_token() -> str: raise RuntimeError("Failed to get Hugging Face token") -def process_batches_on_gpu(data, batch_size, model, tokenizer, device, db): +def process_batches_on_gpu( + data: list[tuple[str, str]], + batch_size: int, + model: Module, + tokenizer: EsmTokenizer, + db: DatabaseConnector, + device: torch.device, +) -> None: """ Splits data into batches and processes them on a single GPU. @@ -88,8 +96,8 @@ def process_batches_on_gpu(data, batch_size, model, tokenizer, device, db): def load_model_and_tokenizer( model_name: str, - device: str, -) -> Tuple[Any, Union[Any, None], str]: + device: torch.device, +) -> Tuple[Any, Union[Any, None], torch.device]: """ Loads the model and assigns it to a specific GPU. @@ -125,7 +133,7 @@ def get_batch_embeddings( model: Union[ EsmModel, ESMC, - torch.nn.DataParallel, + DataParallel[Module], ESM3InferenceClient, ESM3, ], @@ -209,7 +217,9 @@ def get_batch_embeddings( def calculate_single_sequence_embedding_last_hidden_state( - sequence: str, model_name: str = "facebook/esm2_t33_650M_UR50D" + sequence: str, + device: torch.device, + model_name: str = "facebook/esm2_t33_650M_UR50D", ) -> NDArray[np.float64]: """ Calculates an embedding for a single sequence. @@ -221,12 +231,14 @@ def calculate_single_sequence_embedding_last_hidden_state( Returns: NDArray[np.float64]: Normalized embedding vector for the sequence """ - model, tokenizer, device = load_model_and_tokenizer(model_name) + model, tokenizer, device = load_model_and_tokenizer(model_name, device) return get_single_embedding_last_hidden_state(sequence, model, tokenizer, device) def calculate_single_sequence_embedding_all_layers( - sequence: str, model_name: str = "facebook/esm2_t33_650M_UR50D" + sequence: str, + device: torch.device, + model_name: str = "facebook/esm2_t33_650M_UR50D", ) -> NDArray[np.float64]: """ Calculates embeddings for a single sequence across all layers. @@ -238,7 +250,7 @@ def calculate_single_sequence_embedding_all_layers( Returns: NDArray[np.float64]: A numpy array containing layer embeddings for the sequence. """ - model, tokenizer, device = load_model_and_tokenizer(model_name) + model, tokenizer, device = load_model_and_tokenizer(model_name, device) return get_single_embedding_all_layers(sequence, model, tokenizer, device) diff --git a/src/pyeed/main.py b/src/pyeed/main.py index cf0daed2..1189fcb3 100644 --- a/src/pyeed/main.py +++ b/src/pyeed/main.py @@ -9,6 +9,7 @@ from pyeed.adapter.ncbi_dna_mapper import NCBIDNAToPyeed from pyeed.adapter.ncbi_protein_mapper import NCBIProteinToPyeed +from pyeed.adapter.ncbi_to_uniprot_mapper import NCBIToUniprotMapper from pyeed.adapter.primary_db_adapter import PrimaryDBAdapter from pyeed.adapter.uniprot_mapper import UniprotToPyeed from pyeed.dbchat import DBChat @@ -190,14 +191,27 @@ def fetch_ncbi_nucleotide(self, ids: list[str]) -> None: nest_asyncio.apply() asyncio.get_event_loop().run_until_complete(adapter.execute_requests()) + def database_id_mapper(self, ids: list[str], file: str) -> None: + """ + Maps IDs from one database to another using the UniProt ID mapping service + + Args: + ids (list[str]): List of IDs to map. + """ + + mapper = NCBIToUniprotMapper(ids, file) + mapper.execute_request() + + nest_asyncio.apply() + def calculate_sequence_embeddings( self, batch_size: int = 16, model_name: str = "facebook/esm2_t33_650M_UR50D", - num_gpus: int = None, # Number of GPUs to use - ) -> None: + num_gpus: int = 1, # Number of GPUs to use + ) -> None: """ - Calculates embeddings for all sequences in the database that do not have embeddings, + Calculates embeddings for all sequences in the database that do not have embeddings, distributing the workload across available GPUs. Args: @@ -215,7 +229,12 @@ def calculate_sequence_embeddings( logger.warning("No GPU available! Running on CPU.") # Load separate models for each GPU - devices = [f"cuda:{i}" for i in range(num_gpus)] if num_gpus > 0 else ["cpu"] + devices = ( + [torch.device(f"cuda:{i}") for i in range(num_gpus)] + if num_gpus > 0 + else [torch.device("cpu")] + ) + models_and_tokenizers = [ load_model_and_tokenizer(model_name, device) for device in devices ] @@ -228,18 +247,19 @@ def calculate_sequence_embeddings( """ results = self.db.execute_read(query) data = [(result["accession"], result["sequence"]) for result in results] - + if not data: logger.info("No sequences to process.") return - + accessions, sequences = zip(*data) total_sequences = len(sequences) logger.debug(f"Total sequences to process: {total_sequences}") # Split the data into num_gpus chunks gpu_batches = [ - list(zip(accessions[i::num_gpus], sequences[i::num_gpus])) for i in range(num_gpus) + list(zip(accessions[i::num_gpus], sequences[i::num_gpus])) + for i in range(num_gpus) ] start_time = time.time() @@ -259,17 +279,18 @@ def calculate_sequence_embeddings( batch_size, model, tokenizer, + self.db, device, - self.db ) ) - + for future in futures: future.result() # Wait for all threads to complete - end_time = time.time() - logger.info(f"Total embedding calculation time: {end_time - start_time:.2f} seconds") + logger.info( + f"Total embedding calculation time: {end_time - start_time:.2f} seconds" + ) # Cleanup for model, _, _ in models_and_tokenizers: diff --git a/src/pyeed/model.py b/src/pyeed/model.py index 95f9ffce..5a3bf188 100644 --- a/src/pyeed/model.py +++ b/src/pyeed/model.py @@ -1,5 +1,5 @@ from enum import Enum -from typing import Any +from typing import Any, cast # from pyeed.nodes_and_relations import StrictStructuredNode from neomodel import ( @@ -112,13 +112,12 @@ def save(self, *args: Any, **kwargs: Any) -> None: elif isinstance(base_property, FloatProperty): if not all(isinstance(item, float) for item in prop): raise TypeError(f"All items in '{field}' must be floats") - - #Validate BoleanProperty + + # Validate BoleanProperty elif isinstance(neo_type, BooleanProperty) and not isinstance(prop, bool): raise TypeError( f"Expected a boolean for '{field}', got {type(prop).__name__}" ) - super().save(*args, **kwargs) # Don't return the result @@ -154,6 +153,22 @@ class Organism(StrictStructuredNode): taxonomy_id = IntegerProperty(required=True, unique_index=True) name = StringProperty() + @classmethod + def get_or_save(cls, **kwargs: Any) -> "Organism": + taxonomy_id = kwargs.get("taxonomy_id") + name = kwargs.get("name") + try: + organism = cast(Organism, cls.nodes.get(taxonomy_id=taxonomy_id)) + return organism + except cls.DoesNotExist: + try: + organism = cls(taxonomy_id=taxonomy_id, name=name) + organism.save() + return organism + except Exception as e: + print(f"Error during saving of the organism: {e}") + raise + class Mutation(StructuredRel): # type: ignore """A relationship representing mutations between two sequences.""" @@ -381,33 +396,35 @@ class Reaction(StrictStructuredNode): """ A node representing a reaction. """ - + rhea_id = StringProperty(unique_index=True, required=True) chebi_id = ArrayProperty(StringProperty()) # Relationships substrate = RelationshipTo("Molecule", "SUBSTRATE") product = RelationshipTo("Molecule", "PRODUCT") - - + @property def label(self) -> str: """The label of the reaction.""" - return {self.rhea_id} + return f"{self.rhea_id}" + class Molecule(StrictStructuredNode): """ A node representing a molecule in the database. """ - + chebi_id = StringProperty(unique_index=True, required=True) rhea_compound_id = StringProperty() smiles = StringProperty() @classmethod - def get_or_save(cls, chebi_id, smiles) -> "Molecule": + def get_or_save(cls, **kwargs: Any) -> "Molecule": + chebi_id = kwargs.get("chebi_id") + smiles = kwargs.get("smiles") try: - molecule = cls.nodes.get(chebi_id=chebi_id) + molecule = cast(Molecule, cls.nodes.get(chebi_id=chebi_id)) return molecule except cls.DoesNotExist: try: @@ -417,13 +434,13 @@ def get_or_save(cls, chebi_id, smiles) -> "Molecule": except Exception as e: print(f"Error during saving of the molecule: {e}") raise - - @property + + @property def label(self) -> str: """The label of the molecule.""" - return {self.chebi_id} - - + return f"{self.chebi_id}" + + class StandardNumbering(StrictStructuredNode): name = StringProperty(required=True, unique_index=True) definition = StringProperty(required=True)