Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/lint.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ jobs:
- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: "3.x"
python-version: "3.12"

- name: Install dependencies
run: |
Expand Down
25 changes: 19 additions & 6 deletions docs/usage/mutation_analysis.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -166,13 +166,19 @@
"RETURN id(r)\n",
"\"\"\"\n",
"\n",
"region_ids = eedb.db.execute_read(query_get_region_ids, parameters={\"protein_id\": ids, \"region_annotation\": \"coding sequence\"})\n",
"region_ids = [id['id(r)'] for id in region_ids]\n",
"region_ids = eedb.db.execute_read(\n",
" query_get_region_ids,\n",
" parameters={\"protein_id\": ids, \"region_annotation\": \"coding sequence\"},\n",
")\n",
"region_ids = [id[\"id(r)\"] for id in region_ids]\n",
"print(f\"Region ids: {region_ids}\")\n",
"print(f\"len of ids: {len(ids)}\")\n",
"\n",
"sn_dna.apply_standard_numbering_pairwise(\n",
" base_sequence_id=\"AF190695.1\", db=eedb.db, node_type=\"DNA\", region_ids_neo4j=region_ids\n",
" base_sequence_id=\"AF190695.1\",\n",
" db=eedb.db,\n",
" node_type=\"DNA\",\n",
" region_ids_neo4j=region_ids,\n",
")"
]
},
Expand Down Expand Up @@ -223,7 +229,12 @@
"name_of_standard_numbering_tool = \"test_standard_numbering_dna_pairwise\"\n",
"\n",
"mutations_dna = md.get_mutations_between_sequences(\n",
" seq1, seq2, eedb.db, name_of_standard_numbering_tool, node_type=\"DNA\", region_ids_neo4j=region_ids\n",
" seq1,\n",
" seq2,\n",
" eedb.db,\n",
" name_of_standard_numbering_tool,\n",
" node_type=\"DNA\",\n",
" region_ids_neo4j=region_ids,\n",
")"
]
},
Expand Down Expand Up @@ -306,8 +317,10 @@
}
],
"source": [
"for i in range(len(mutations_dna['from_positions'])):\n",
" print(f\"Mutation on position {mutations_dna['from_positions'][i]} -> {mutations_dna['to_positions'][i]} with a nucleotide change of {mutations_dna['from_monomers'][i]} -> {mutations_dna['to_monomers'][i]}\")"
"for i in range(len(mutations_dna[\"from_positions\"])):\n",
" print(\n",
" f\"Mutation on position {mutations_dna['from_positions'][i]} -> {mutations_dna['to_positions'][i]} with a nucleotide change of {mutations_dna['from_monomers'][i]} -> {mutations_dna['to_monomers'][i]}\"\n",
" )"
]
},
{
Expand Down
22 changes: 15 additions & 7 deletions docs/usage/standard_numbering.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@
"eedb = Pyeed(uri, user=user, password=password)\n",
"eedb.db.wipe_database(date=\"2025-03-19\")\n",
"\n",
"eedb.db.initialize_db_constraints(user=user, password=password)\n"
"eedb.db.initialize_db_constraints(user=user, password=password)"
]
},
{
Expand Down Expand Up @@ -148,7 +148,7 @@
"\n",
"sn.apply_standard_numbering_pairwise(\n",
" base_sequence_id=\"AAM15527.1\", db=eedb.db, list_of_seq_ids=ids[0:5]\n",
")\n"
")"
]
},
{
Expand Down Expand Up @@ -184,7 +184,7 @@
"source": [
"sn.apply_standard_numbering_pairwise(\n",
" base_sequence_id=\"AAM15527.1\", db=eedb.db, list_of_seq_ids=ids\n",
")\n"
")"
]
},
{
Expand Down Expand Up @@ -290,7 +290,9 @@
}
],
"source": [
"sn_dna_region = StandardNumberingTool(name=\"test_standard_numbering_dna_pairwise_region\")\n",
"sn_dna_region = StandardNumberingTool(\n",
" name=\"test_standard_numbering_dna_pairwise_region\"\n",
")\n",
"\n",
"\n",
"ids = [\"AAM15527.1\", \"AAF05614.1\", \"AFN21551.1\", \"CAA76794.1\", \"AGQ50511.1\"]\n",
Expand All @@ -302,14 +304,20 @@
"RETURN id(r)\n",
"\"\"\"\n",
"\n",
"region_ids = eedb.db.execute_read(query_get_region_ids, parameters={\"protein_id\": ids, \"region_annotation\": \"coding sequence\"})\n",
"region_ids = [id['id(r)'] for id in region_ids]\n",
"region_ids = eedb.db.execute_read(\n",
" query_get_region_ids,\n",
" parameters={\"protein_id\": ids, \"region_annotation\": \"coding sequence\"},\n",
")\n",
"region_ids = [id[\"id(r)\"] for id in region_ids]\n",
"print(f\"Region ids: {region_ids}\")\n",
"print(f\"len of ids: {len(ids)}\")\n",
"\n",
"\n",
"sn_dna_region.apply_standard_numbering_pairwise(\n",
" base_sequence_id=\"AF190695.1\", db=eedb.db, node_type=\"DNA\", region_ids_neo4j=region_ids\n",
" base_sequence_id=\"AF190695.1\",\n",
" db=eedb.db,\n",
" node_type=\"DNA\",\n",
" region_ids_neo4j=region_ids,\n",
")"
]
},
Expand Down
100 changes: 51 additions & 49 deletions src/pyeed/analysis/mutation_detection.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,23 +17,24 @@ def get_sequence_data(
node_type: str = "Protein",
region_ids_neo4j: Optional[list[int]] = None,
) -> tuple[dict[str, str], dict[str, list[str]]]:
"""Fetch sequence and position data for two sequences from the database.
"""
Fetch sequence and standard numbering position data for two sequences from the database.

Args:
sequence_id1: First sequence accession ID
sequence_id2: Second sequence accession ID
db: Database connection instance
standard_numbering_tool_name: Name of standard numbering tool to use
node_type: Type of node to use (default: "Protein")
region_ids_neo4j: List of region IDs for the sequence cuting based on region_based_sequence.
sequence_id1 (str): Accession ID of the first sequence.
sequence_id2 (str): Accession ID of the second sequence.
db (DatabaseConnector): Database connection instance.
standard_numbering_tool_name (str): Name of the standard numbering tool to use.
node_type (str, optional): Type of node to use (default: "Protein").
region_ids_neo4j (Optional[list[int]], optional): List of region IDs for region-based sequence extraction.

Returns:
tuple containing:
- dict[str, str]: Mapping of sequence IDs to sequences
- dict[str, list[str]]: Mapping of sequence IDs to position lists
tuple[dict[str, str], dict[str, list[str]]]:
- Mapping of sequence IDs to sequences.
- Mapping of sequence IDs to position lists.

Raises:
ValueError: If standard numbering positions not found for both sequences
ValueError: If standard numbering positions are not found for both sequences.
"""
if region_ids_neo4j is not None:
query = f"""
Expand Down Expand Up @@ -84,20 +85,21 @@ def find_mutations(
pos1: list[str],
pos2: list[str],
) -> dict[str, Any]:
"""Compare two sequences and identify mutations between them.
"""
Compare two sequences and identify mutations between them using standard numbering positions.

Args:
seq1: First amino acid sequence
seq2: Second amino acid sequence
pos1: Standard numbering positions for first sequence
pos2: Standard numbering positions for second sequence
seq1 (str): First amino acid sequence.
seq2 (str): Second amino acid sequence.
pos1 (list[str]): Standard numbering positions for the first sequence.
pos2 (list[str]): Standard numbering positions for the second sequence.

Returns:
dict containing mutation information:
- from_positions: List[int] - Source positions (1-based)
- to_positions: List[int] - Target positions (1-based)
- from_monomers: List[str] - Source amino acids
- to_monomers: List[str] - Target amino acids
dict[str, Any]: Dictionary containing mutation information:
- from_positions (List[int]): Source positions (1-based).
- to_positions (List[int]): Target positions (1-based).
- from_monomers (List[str]): Source amino acids.
- to_monomers (List[str]): Target amino acids.
"""
pos_to_idx1 = {pos: idx for idx, pos in enumerate(pos1)}
pos_to_idx2 = {pos: idx for idx, pos in enumerate(pos2)}
Expand Down Expand Up @@ -134,19 +136,20 @@ def save_mutations_to_db(
node_type: str = "Protein",
region_ids_neo4j: Optional[list[int]] = None,
) -> None:
"""Save detected mutations to the database.
"""
Save detected mutations to the database as relationships between nodes.

Args:
mutations: Dictionary containing mutation information:
- from_positions: List[int] - Source positions
- to_positions: List[int] - Target positions
- from_monomers: List[str] - Source amino acids
- to_monomers: List[str] - Target amino acids
db: Database connection instance
sequence_id1: First sequence accession ID
sequence_id2: Second sequence accession ID
node_type: Type of node to use (default: "Protein")
region_ids_neo4j: List of region IDs for the sequence cuting based on region_based_sequence.
mutations (dict[str, list[int | str]]): Dictionary containing mutation information:
- from_positions (List[int]): Source positions.
- to_positions (List[int]): Target positions.
- from_monomers (List[str]): Source amino acids.
- to_monomers (List[str]): Target amino acids.
db (DatabaseConnector): Database connection instance.
sequence_id1 (str): Accession ID of the first sequence.
sequence_id2 (str): Accession ID of the second sequence.
node_type (str, optional): Type of node to use (default: "Protein").
region_ids_neo4j (Optional[list[int]], optional): List of region IDs for region-based sequence extraction.
"""
# Check if a mutation relationship already exists between these proteins
if region_ids_neo4j is not None:
Expand Down Expand Up @@ -237,30 +240,30 @@ def get_mutations_between_sequences(
db: DatabaseConnector,
standard_numbering_tool_name: str,
save_to_db: bool = True,
debug: bool = False,
node_type: str = "Protein",
region_ids_neo4j: Optional[list[int]] = None,
) -> dict[str, list[int | str]]:
"""Get mutations between two sequences using standard numbering.
"""
Get mutations between two sequences using standard numbering and optionally save them to the database.

Args:
sequence_id1: First sequence accession ID
sequence_id2: Second sequence accession ID
db: Database connection instance
standard_numbering_tool_name: Name of standard numbering tool to use
save_to_db: Whether to save mutations to database (default: True)
node_type: Type of node to use (default: "Protein")
region_ids_neo4j: List of region IDs for the sequence cuting based on region_based_sequence.
sequence_id1 (str): Accession ID of the first sequence.
sequence_id2 (str): Accession ID of the second sequence.
db (DatabaseConnector): Database connection instance.
standard_numbering_tool_name (str): Name of the standard numbering tool to use.
save_to_db (bool, optional): Whether to save mutations to the database (default: True).
node_type (str, optional): Type of node to use (default: "Protein").
region_ids_neo4j (Optional[list[int]], optional): List of region IDs for region-based sequence extraction.

Returns:
dict containing mutation information:
- from_positions: List[int] - Source positions (1-based)
- to_positions: List[int] - Target positions (1-based)
- from_monomers: List[str] - Source amino acids
- to_monomers: List[str] - Target amino acids
dict[str, list[int | str]]: Dictionary containing mutation information:
- from_positions (List[int]): Source positions (1-based).
- to_positions (List[int]): Target positions (1-based).
- from_monomers (List[str]): Source amino acids.
- to_monomers (List[str]): Target amino acids.

Raises:
ValueError: If standard numbering positions not found for both sequences
ValueError: If standard numbering positions are not found for both sequences.
"""
sequences, positions = self.get_sequence_data(
sequence_id1,
Expand All @@ -271,8 +274,7 @@ def get_mutations_between_sequences(
region_ids_neo4j,
)

if debug:
logger.info(f"Debug mode output: {sequences} and {positions}")
logger.debug(f"Debug mode output: {sequences} and {positions}")

mutations = self.find_mutations(
sequences[sequence_id1],
Expand Down
2 changes: 1 addition & 1 deletion src/pyeed/tools/resources/alphafold/docker_run.py
Original file line number Diff line number Diff line change
Expand Up @@ -214,7 +214,7 @@ def main(argv): # type: ignore
mount, target_path = _create_mount(f"fasta_path_{i}", fasta_path)
mounts.append(mount)
target_fasta_paths.append(target_path)
command_args.append(f'--fasta_paths={",".join(target_fasta_paths)}')
command_args.append(f"--fasta_paths={','.join(target_fasta_paths)}")

database_paths = [
("uniref90_database_path", uniref90_database_path),
Expand Down