diff --git a/.github/workflows/lint.yaml b/.github/workflows/lint.yaml index 40a6b665..cdef9c83 100644 --- a/.github/workflows/lint.yaml +++ b/.github/workflows/lint.yaml @@ -10,7 +10,7 @@ jobs: - name: Set up Python uses: actions/setup-python@v4 with: - python-version: "3.x" + python-version: "3.12" - name: Install dependencies run: | diff --git a/docs/usage/mutation_analysis.ipynb b/docs/usage/mutation_analysis.ipynb index a1131338..ac608881 100644 --- a/docs/usage/mutation_analysis.ipynb +++ b/docs/usage/mutation_analysis.ipynb @@ -166,13 +166,19 @@ "RETURN id(r)\n", "\"\"\"\n", "\n", - "region_ids = eedb.db.execute_read(query_get_region_ids, parameters={\"protein_id\": ids, \"region_annotation\": \"coding sequence\"})\n", - "region_ids = [id['id(r)'] for id in region_ids]\n", + "region_ids = eedb.db.execute_read(\n", + " query_get_region_ids,\n", + " parameters={\"protein_id\": ids, \"region_annotation\": \"coding sequence\"},\n", + ")\n", + "region_ids = [id[\"id(r)\"] for id in region_ids]\n", "print(f\"Region ids: {region_ids}\")\n", "print(f\"len of ids: {len(ids)}\")\n", "\n", "sn_dna.apply_standard_numbering_pairwise(\n", - " base_sequence_id=\"AF190695.1\", db=eedb.db, node_type=\"DNA\", region_ids_neo4j=region_ids\n", + " base_sequence_id=\"AF190695.1\",\n", + " db=eedb.db,\n", + " node_type=\"DNA\",\n", + " region_ids_neo4j=region_ids,\n", ")" ] }, @@ -223,7 +229,12 @@ "name_of_standard_numbering_tool = \"test_standard_numbering_dna_pairwise\"\n", "\n", "mutations_dna = md.get_mutations_between_sequences(\n", - " seq1, seq2, eedb.db, name_of_standard_numbering_tool, node_type=\"DNA\", region_ids_neo4j=region_ids\n", + " seq1,\n", + " seq2,\n", + " eedb.db,\n", + " name_of_standard_numbering_tool,\n", + " node_type=\"DNA\",\n", + " region_ids_neo4j=region_ids,\n", ")" ] }, @@ -306,8 +317,10 @@ } ], "source": [ - "for i in range(len(mutations_dna['from_positions'])):\n", - " print(f\"Mutation on position {mutations_dna['from_positions'][i]} -> {mutations_dna['to_positions'][i]} with a nucleotide change of {mutations_dna['from_monomers'][i]} -> {mutations_dna['to_monomers'][i]}\")" + "for i in range(len(mutations_dna[\"from_positions\"])):\n", + " print(\n", + " f\"Mutation on position {mutations_dna['from_positions'][i]} -> {mutations_dna['to_positions'][i]} with a nucleotide change of {mutations_dna['from_monomers'][i]} -> {mutations_dna['to_monomers'][i]}\"\n", + " )" ] }, { diff --git a/docs/usage/standard_numbering.ipynb b/docs/usage/standard_numbering.ipynb index 54374cd6..b5ee657b 100644 --- a/docs/usage/standard_numbering.ipynb +++ b/docs/usage/standard_numbering.ipynb @@ -96,7 +96,7 @@ "eedb = Pyeed(uri, user=user, password=password)\n", "eedb.db.wipe_database(date=\"2025-03-19\")\n", "\n", - "eedb.db.initialize_db_constraints(user=user, password=password)\n" + "eedb.db.initialize_db_constraints(user=user, password=password)" ] }, { @@ -148,7 +148,7 @@ "\n", "sn.apply_standard_numbering_pairwise(\n", " base_sequence_id=\"AAM15527.1\", db=eedb.db, list_of_seq_ids=ids[0:5]\n", - ")\n" + ")" ] }, { @@ -184,7 +184,7 @@ "source": [ "sn.apply_standard_numbering_pairwise(\n", " base_sequence_id=\"AAM15527.1\", db=eedb.db, list_of_seq_ids=ids\n", - ")\n" + ")" ] }, { @@ -290,7 +290,9 @@ } ], "source": [ - "sn_dna_region = StandardNumberingTool(name=\"test_standard_numbering_dna_pairwise_region\")\n", + "sn_dna_region = StandardNumberingTool(\n", + " name=\"test_standard_numbering_dna_pairwise_region\"\n", + ")\n", "\n", "\n", "ids = [\"AAM15527.1\", \"AAF05614.1\", \"AFN21551.1\", \"CAA76794.1\", \"AGQ50511.1\"]\n", @@ -302,14 +304,20 @@ "RETURN id(r)\n", "\"\"\"\n", "\n", - "region_ids = eedb.db.execute_read(query_get_region_ids, parameters={\"protein_id\": ids, \"region_annotation\": \"coding sequence\"})\n", - "region_ids = [id['id(r)'] for id in region_ids]\n", + "region_ids = eedb.db.execute_read(\n", + " query_get_region_ids,\n", + " parameters={\"protein_id\": ids, \"region_annotation\": \"coding sequence\"},\n", + ")\n", + "region_ids = [id[\"id(r)\"] for id in region_ids]\n", "print(f\"Region ids: {region_ids}\")\n", "print(f\"len of ids: {len(ids)}\")\n", "\n", "\n", "sn_dna_region.apply_standard_numbering_pairwise(\n", - " base_sequence_id=\"AF190695.1\", db=eedb.db, node_type=\"DNA\", region_ids_neo4j=region_ids\n", + " base_sequence_id=\"AF190695.1\",\n", + " db=eedb.db,\n", + " node_type=\"DNA\",\n", + " region_ids_neo4j=region_ids,\n", ")" ] }, diff --git a/src/pyeed/analysis/mutation_detection.py b/src/pyeed/analysis/mutation_detection.py index c2562ae1..274e168b 100644 --- a/src/pyeed/analysis/mutation_detection.py +++ b/src/pyeed/analysis/mutation_detection.py @@ -17,23 +17,24 @@ def get_sequence_data( node_type: str = "Protein", region_ids_neo4j: Optional[list[int]] = None, ) -> tuple[dict[str, str], dict[str, list[str]]]: - """Fetch sequence and position data for two sequences from the database. + """ + Fetch sequence and standard numbering position data for two sequences from the database. Args: - sequence_id1: First sequence accession ID - sequence_id2: Second sequence accession ID - db: Database connection instance - standard_numbering_tool_name: Name of standard numbering tool to use - node_type: Type of node to use (default: "Protein") - region_ids_neo4j: List of region IDs for the sequence cuting based on region_based_sequence. + sequence_id1 (str): Accession ID of the first sequence. + sequence_id2 (str): Accession ID of the second sequence. + db (DatabaseConnector): Database connection instance. + standard_numbering_tool_name (str): Name of the standard numbering tool to use. + node_type (str, optional): Type of node to use (default: "Protein"). + region_ids_neo4j (Optional[list[int]], optional): List of region IDs for region-based sequence extraction. Returns: - tuple containing: - - dict[str, str]: Mapping of sequence IDs to sequences - - dict[str, list[str]]: Mapping of sequence IDs to position lists + tuple[dict[str, str], dict[str, list[str]]]: + - Mapping of sequence IDs to sequences. + - Mapping of sequence IDs to position lists. Raises: - ValueError: If standard numbering positions not found for both sequences + ValueError: If standard numbering positions are not found for both sequences. """ if region_ids_neo4j is not None: query = f""" @@ -84,20 +85,21 @@ def find_mutations( pos1: list[str], pos2: list[str], ) -> dict[str, Any]: - """Compare two sequences and identify mutations between them. + """ + Compare two sequences and identify mutations between them using standard numbering positions. Args: - seq1: First amino acid sequence - seq2: Second amino acid sequence - pos1: Standard numbering positions for first sequence - pos2: Standard numbering positions for second sequence + seq1 (str): First amino acid sequence. + seq2 (str): Second amino acid sequence. + pos1 (list[str]): Standard numbering positions for the first sequence. + pos2 (list[str]): Standard numbering positions for the second sequence. Returns: - dict containing mutation information: - - from_positions: List[int] - Source positions (1-based) - - to_positions: List[int] - Target positions (1-based) - - from_monomers: List[str] - Source amino acids - - to_monomers: List[str] - Target amino acids + dict[str, Any]: Dictionary containing mutation information: + - from_positions (List[int]): Source positions (1-based). + - to_positions (List[int]): Target positions (1-based). + - from_monomers (List[str]): Source amino acids. + - to_monomers (List[str]): Target amino acids. """ pos_to_idx1 = {pos: idx for idx, pos in enumerate(pos1)} pos_to_idx2 = {pos: idx for idx, pos in enumerate(pos2)} @@ -134,19 +136,20 @@ def save_mutations_to_db( node_type: str = "Protein", region_ids_neo4j: Optional[list[int]] = None, ) -> None: - """Save detected mutations to the database. + """ + Save detected mutations to the database as relationships between nodes. Args: - mutations: Dictionary containing mutation information: - - from_positions: List[int] - Source positions - - to_positions: List[int] - Target positions - - from_monomers: List[str] - Source amino acids - - to_monomers: List[str] - Target amino acids - db: Database connection instance - sequence_id1: First sequence accession ID - sequence_id2: Second sequence accession ID - node_type: Type of node to use (default: "Protein") - region_ids_neo4j: List of region IDs for the sequence cuting based on region_based_sequence. + mutations (dict[str, list[int | str]]): Dictionary containing mutation information: + - from_positions (List[int]): Source positions. + - to_positions (List[int]): Target positions. + - from_monomers (List[str]): Source amino acids. + - to_monomers (List[str]): Target amino acids. + db (DatabaseConnector): Database connection instance. + sequence_id1 (str): Accession ID of the first sequence. + sequence_id2 (str): Accession ID of the second sequence. + node_type (str, optional): Type of node to use (default: "Protein"). + region_ids_neo4j (Optional[list[int]], optional): List of region IDs for region-based sequence extraction. """ # Check if a mutation relationship already exists between these proteins if region_ids_neo4j is not None: @@ -237,30 +240,30 @@ def get_mutations_between_sequences( db: DatabaseConnector, standard_numbering_tool_name: str, save_to_db: bool = True, - debug: bool = False, node_type: str = "Protein", region_ids_neo4j: Optional[list[int]] = None, ) -> dict[str, list[int | str]]: - """Get mutations between two sequences using standard numbering. + """ + Get mutations between two sequences using standard numbering and optionally save them to the database. Args: - sequence_id1: First sequence accession ID - sequence_id2: Second sequence accession ID - db: Database connection instance - standard_numbering_tool_name: Name of standard numbering tool to use - save_to_db: Whether to save mutations to database (default: True) - node_type: Type of node to use (default: "Protein") - region_ids_neo4j: List of region IDs for the sequence cuting based on region_based_sequence. + sequence_id1 (str): Accession ID of the first sequence. + sequence_id2 (str): Accession ID of the second sequence. + db (DatabaseConnector): Database connection instance. + standard_numbering_tool_name (str): Name of the standard numbering tool to use. + save_to_db (bool, optional): Whether to save mutations to the database (default: True). + node_type (str, optional): Type of node to use (default: "Protein"). + region_ids_neo4j (Optional[list[int]], optional): List of region IDs for region-based sequence extraction. Returns: - dict containing mutation information: - - from_positions: List[int] - Source positions (1-based) - - to_positions: List[int] - Target positions (1-based) - - from_monomers: List[str] - Source amino acids - - to_monomers: List[str] - Target amino acids + dict[str, list[int | str]]: Dictionary containing mutation information: + - from_positions (List[int]): Source positions (1-based). + - to_positions (List[int]): Target positions (1-based). + - from_monomers (List[str]): Source amino acids. + - to_monomers (List[str]): Target amino acids. Raises: - ValueError: If standard numbering positions not found for both sequences + ValueError: If standard numbering positions are not found for both sequences. """ sequences, positions = self.get_sequence_data( sequence_id1, @@ -271,8 +274,7 @@ def get_mutations_between_sequences( region_ids_neo4j, ) - if debug: - logger.info(f"Debug mode output: {sequences} and {positions}") + logger.debug(f"Debug mode output: {sequences} and {positions}") mutations = self.find_mutations( sequences[sequence_id1], diff --git a/src/pyeed/tools/resources/alphafold/docker_run.py b/src/pyeed/tools/resources/alphafold/docker_run.py index 6c21ade7..5d8f4f5c 100644 --- a/src/pyeed/tools/resources/alphafold/docker_run.py +++ b/src/pyeed/tools/resources/alphafold/docker_run.py @@ -214,7 +214,7 @@ def main(argv): # type: ignore mount, target_path = _create_mount(f"fasta_path_{i}", fasta_path) mounts.append(mount) target_fasta_paths.append(target_path) - command_args.append(f'--fasta_paths={",".join(target_fasta_paths)}') + command_args.append(f"--fasta_paths={','.join(target_fasta_paths)}") database_paths = [ ("uniref90_database_path", uniref90_database_path),