diff --git a/.github/workflows/docker_build.yml b/.github/workflows/docker_build.yml index 178f886..4f1be86 100644 --- a/.github/workflows/docker_build.yml +++ b/.github/workflows/docker_build.yml @@ -23,10 +23,11 @@ jobs: calculate_net_charge, generate_conformers, mol2_to_pdbqt, nmr4md, openbabel, remove_terminal_residue_name_prefixes, - rename_residues_mol, combine_structure, + rename_residues, combine_structure, remove_terminal_residue_name_prefixes, molgan, pdbbind_refined, onionnet-sfct, smina, pdbfixer, - fix_pdb_atom_column, extract_protein, extract_ligand_protein, generate_conformers] # No username for pdbind_refined + fix_pdb_atom_column, extract_protein, extract_ligand_protein, generate_conformers, + onionnet, combine_pdb] # No username for pdbind_refined # skip data/ and cwl_adapters/file_format_conversions/biosimspace/ runs-on: [ubuntu-latest] diff --git a/cwl_adapters/autodock_vina.cwl b/cwl_adapters/autodock_vina.cwl new file mode 100644 index 0000000..ad4167d --- /dev/null +++ b/cwl_adapters/autodock_vina.cwl @@ -0,0 +1,221 @@ + +#!/usr/bin/env cwl-runner +cwlVersion: v1.1 # See `loadContents: true` below! + +class: CommandLineTool + +label: Wrapper of the AutoDock Vina software. + +doc: |- + This class performs docking of the ligand to a set of grids describing the target protein via the AutoDock Vina software. + +baseCommand: vina # NOTE: Only version >=1.2 supports --batch! +arguments: +# Need to parse box.pdb and pass in each number separately. +# REMARK BOX CENTER: 0.102 0.019 -0.004 SIZE: 30.195 31.940 27.005 +# - "--dir" # Need to explicitly pass --dir . in --batch mode +# - "." +- "--center_x" +- $(inputs.input_box_path.contents.split("\n")[0].split(" ").filter(function(s) {return !isNaN(parseFloat(s))})[0]) +- "--center_y" +- $(inputs.input_box_path.contents.split("\n")[0].split(" ").filter(function(s) {return !isNaN(parseFloat(s))})[1]) +- "--center_z" +- $(inputs.input_box_path.contents.split("\n")[0].split(" ").filter(function(s) {return !isNaN(parseFloat(s))})[2]) +- "--size_x" +- $(inputs.input_box_path.contents.split("\n")[0].split(" ").filter(function(s) {return !isNaN(parseFloat(s))})[3]) +- "--size_y" +- $(inputs.input_box_path.contents.split("\n")[0].split(" ").filter(function(s) {return !isNaN(parseFloat(s))})[4]) +- "--size_z" +- $(inputs.input_box_path.contents.split("\n")[0].split(" ").filter(function(s) {return !isNaN(parseFloat(s))})[5]) +# NOTE: Cannot use a single javascript expression to create the entire arguments list because CWL treats it as a string: +# "the `arguments` field is not valid because value is a str" +# ${ +# var words = inputs.input_box_path.contents.split("\n")[0].split(" "); +# var nums = words.filter(function(s) {return !isNaN(parseFloat(s))}); +# var args = {}; +# args.push("--dir"); // Need to explicitly pass --dir . in --batch mode +# args.push("."); +# args.push("--center_x"); +# args.push(nums[0]); +# args.push("--center_y"); +# args.push(nums[1]); +# args.push("--center_z"); +# args.push(nums[2]); +# args.push("--size_x"); +# args.push(nums[3]); +# args.push("--size_y"); +# args.push(nums[4]); +# args.push("--size_z"); +# args.push(nums[5]); +# return args; +# } + +hints: + DockerRequirement: + dockerPull: jakefennick/autodock_vina + +requirements: + InlineJavascriptRequirement: {} + +inputs: + input_ligand_pdbqt_path: + label: Path to the input PDBQT ligand + doc: |- + Path to the input PDBQT ligand + Type: string + File type: input + Accepted formats: pdbqt + Example file: https://github.com/bioexcel/biobb_vs/raw/master/biobb_vs/test/data/vina/vina_ligand.pdbqt + type: File + format: + - edam:format_1476 + inputBinding: + prefix: --ligand + + input_receptor_pdbqt_path: + label: Path to the input PDBQT receptor + doc: |- + Path to the input PDBQT receptor + Type: string + File type: input + Accepted formats: pdbqt + Example file: https://github.com/bioexcel/biobb_vs/raw/master/biobb_vs/test/data/vina/vina_receptor.pdbqt + type: File + format: + - edam:format_1476 + inputBinding: + #position: 2 + prefix: --receptor + + input_box_path: + label: Path to the PDB containing the residues belonging to the binding site + doc: |- + Path to the PDB containing the residues belonging to the binding site + Type: string + File type: input + Accepted formats: pdb + Example file: https://github.com/bioexcel/biobb_vs/raw/master/biobb_vs/test/data/vina/vina_box.pdb + type: File + format: + - edam:format_1476 +# inputBinding: +# position: 3 +# prefix: --input_box_path + loadContents: true # requires cwlVersion: v1.1 + # See https://www.commonwl.org/v1.1/CommandLineTool.html#Changelog + # Curiously, cwlVersion: v1.0 allows loadContents for outputs, but not inputs. + + output_pdbqt_path: + label: Path to the output PDBQT file + doc: |- + Path to the output PDBQT file + Type: string + File type: output + Accepted formats: pdbqt + Example file: https://github.com/bioexcel/biobb_vs/raw/master/biobb_vs/test/reference/vina/ref_output_vina.pdbqt + type: string + format: + - edam:format_1476 + inputBinding: + prefix: --out + default: system.pdbqt + + output_log_path: + label: Path to the log file + doc: |- + Path to the log file + Type: string + File type: output + Accepted formats: log + Example file: https://github.com/bioexcel/biobb_vs/raw/master/biobb_vs/test/reference/vina/ref_output_vina.log + type: string + format: + - edam:format_2330 + default: system.log + + cpu: + label: The number of CPUs to use + doc: |- + The number of CPUs to use + Type: int + type: int? + format: + - edam:format_2330 + inputBinding: + prefix: --cpu + default: 1 + + exhaustiveness: + label: exhaustiveness of the global search (roughly proportional to time). + doc: |- + exhaustiveness of the global search (roughly proportional to time). + Type: int + type: int? + format: + - edam:format_2330 + inputBinding: + prefix: --exhaustiveness + default: 8 + + num_modes: + label: maximum number of binding modes to generate + doc: |- + Tmaximum number of binding modes to generate + Type: int + type: int? + format: + - edam:format_2330 + inputBinding: + prefix: --num_modes + default: 9 + + min_rmsd: + label: The minimum RMSD between output poses + doc: |- + The minimum RMSD between output poses + Type: int + type: int? + format: + - edam:format_2330 + inputBinding: + prefix: --min_rmsd + default: 1 + + energy_range: + label: maximum energy difference between the best binding mode and the worst one displayed (kcal/mol). + doc: |- + maximum energy difference between the best binding mode and the worst one displayed (kcal/mol). + Type: int + type: int? + format: + - edam:format_2330 + inputBinding: + prefix: --energy_range + default: 3 + +outputs: + output_pdbqt_path: + label: Path to the output PDBQT file + doc: |- + Path to the output PDBQT file + type: File + outputBinding: + glob: $(inputs.output_pdbqt_path) + format: edam:format_1476 + + output_log_path: + label: Path to the log file + doc: |- + Path to the log file + type: File + outputBinding: + glob: $(inputs.output_log_path) + format: edam:format_2330 + +stdout: $(inputs.output_log_path) + +$namespaces: + edam: https://edamontology.org/ + +$schemas: +- https://raw.githubusercontent.com/edamontology/edamontology/master/EDAM_dev.owl diff --git a/cwl_adapters/combine_pdb.cwl b/cwl_adapters/combine_pdb.cwl new file mode 100644 index 0000000..5dac17f --- /dev/null +++ b/cwl_adapters/combine_pdb.cwl @@ -0,0 +1,72 @@ +#!/usr/bin/env cwl-runner +cwlVersion: v1.0 + +class: CommandLineTool + +label: A tool that employs MDAnalysis to combine two PDB structures in a single PDB file. + +doc: |- + A tool that employs MDAnalysis to combine two PDB structures in a single PDB file. + +baseCommand: ['python', '/combine_pdb.py'] + +hints: + DockerRequirement: + dockerPull: ndonyapour/combine_pdb + +inputs: + input_structure1: + label: Input PDB structure 1 file path + doc: |- + Input PDB structure 1 file path + Type: string + File type: input + Accepted formats: pdb + type: File + format: + - edam:format_1476 + inputBinding: + prefix: --input_structure1 + + input_structure2: + label: Input PDB structure 2 file path + doc: |- + Input PDB structure 2 file path + Type: string + File type: input + Accepted formats: pdb + type: File + format: + - edam:format_1476 + inputBinding: + prefix: --input_structure2 + + output_structure_path: + label: Output combined PDB file path + doc: |- + Output combined PDB file path + Type: string + File type: output + Accepted formats: pdb + Example file: https://github.com/bioexcel/biobb_structure_utils/raw/master/biobb_structure_utils/test/reference/utils/ref_cat_pdb.pdb + type: string + format: + - edam:format_1476 + inputBinding: + prefix: --output_structure_path + default: system.pdb +outputs: + output_structure_path: + label: Output protein file path + doc: |- + Output protein file path + type: File + outputBinding: + glob: $(inputs.output_structure_path) + format: edam:format_1476 + +$namespaces: + edam: https://edamontology.org/ + +$schemas: +- https://raw.githubusercontent.com/edamontology/edamontology/master/EDAM_dev.owl diff --git a/cwl_adapters/convert_pdb.cwl b/cwl_adapters/convert_pdb.cwl new file mode 100644 index 0000000..94ead7f --- /dev/null +++ b/cwl_adapters/convert_pdb.cwl @@ -0,0 +1,129 @@ +#!/usr/bin/env cwl-runner +cwlVersion: v1.0 + +class: CommandLineTool + +label: This class is a wrapper of the Open Babel tool. + +doc: |- + Small molecule format conversion for structures or trajectories. Open Babel is a chemical toolbox designed to speak the many languages of chemical data. It's an open, collaborative project allowing anyone to search, convert, analyze, or store data from molecular modeling, chemistry, solid-state materials, biochemistry, or related areas. Visit the official page. + +baseCommand: obabel +#Usage: +#obabel[-i] [-o] -O [Options] +#... +#Options, other than -i -o -O -m, must come after the input files. +arguments: [$(inputs.input_path), "-h", "-o", "pdb", "-O", $(inputs.output_pdb_path)] +# NOTE: These arguments must be given individually; they cannot be concatenated together. +# (e.g. -xrhn) Otherwise, all but the first argument will be silently ignored! + +# pdb format options +# Read Options e.g. -as +# s Output single bonds only +# b Disable bonding entirely +# c Ignore CONECT records + +# Write Options, e.g. -xo +# n Do not write duplicate CONECT records to indicate bond order +# o Write origin in space group label (CRYST1 section) + +hints: + DockerRequirement: + dockerPull: quay.io/biocontainers/biobb_chemistry:4.0.0--pyhdfd78af_1 + +requirements: + InlineJavascriptRequirement: {} + +inputs: + first_molecule: + label: Index of the first molecule (1-based) + doc: |- + Input Index of the first molecule (1-based) + Type: string? + type: string? + format: + - edam:format_2330 # 'Textual format' + inputBinding: + prefix: -f + + last_molecule: + label: Index of the last molecule (1-based) + doc: |- + Input Index of the last molecule (1-based) + Type: string? + type: string? + format: + - edam:format_2330 # 'Textual format' + inputBinding: + prefix: -l + + input_path: + label: Path to the input file + doc: |- + Path to the input file + Type: string + File type: input + Accepted formats: dat, ent, fa, fasta, gro, inp, log, mcif, mdl, mmcif, mol, mol2, pdb, pdbqt, png, sdf, smi, smiles, txt, xml, xtc + Example file: https://github.com/bioexcel/biobb_chemistry/raw/master/biobb_chemistry/test/data/babel/babel.smi + type: File + format: + - edam:format_1637 + - edam:format_1476 + - edam:format_1929 + - edam:format_1929 + - edam:format_2033 + - edam:format_3878 + - edam:format_2030 + - edam:format_1477 + - edam:format_3815 + - edam:format_1477 + - edam:format_3815 + - edam:format_3816 + - edam:format_1476 + - edam:format_1476 + - edam:format_3603 + - edam:format_3814 + - edam:format_1196 + - edam:format_1196 + - edam:format_2033 + - edam:format_2332 + - edam:format_3875 + + output_pdb_path: + label: Path to the output file + doc: |- + Path to the output file + Type: string + File type: output + Accepted formats: pdb + type: string + format: + - edam:format_1476 # pdb + default: system.pdb + + arg1: + label: Additional arguments + doc: |- + Additional arguments + Type: string? + type: string? + format: + - edam:format_2330 # 'Textual format' + inputBinding: + position: 1 + +outputs: + output_pdb_path: + label: Path to the output file + doc: |- + Path to the output file + type: File + outputBinding: + glob: $(inputs.output_pdb_path) + format: edam:format_1476 # pdb + +$namespaces: + edam: https://edamontology.org/ + +$schemas: +- https://raw.githubusercontent.com/edamontology/edamontology/master/EDAM_dev.owl diff --git a/cwl_adapters/onionnet-sfct.cwl b/cwl_adapters/onionnet-sfct.cwl index 3cf2fb4..d4a8be4 100644 --- a/cwl_adapters/onionnet-sfct.cwl +++ b/cwl_adapters/onionnet-sfct.cwl @@ -9,7 +9,7 @@ baseCommand: ["python", "/OnionNet-SFCT/scorer.py"] hints: DockerRequirement: - dockerPull: polusai/onionnet-sfct-tool:1.0.0 + dockerPull: polusai/onionnet-sfct-tool@sha256:8b51dd5e9d1218a4033cb6126c44aed0ea67a7ad4cfcf49ee4703fce7520c6be requirements: InlineJavascriptRequirement: {} diff --git a/cwl_adapters/onionnet_featurize.cwl b/cwl_adapters/onionnet_featurize.cwl new file mode 100644 index 0000000..42a977c --- /dev/null +++ b/cwl_adapters/onionnet_featurize.cwl @@ -0,0 +1,70 @@ +#!/usr/bin/env cwl-runner + +cwlVersion: v1.0 +class: CommandLineTool + +label: Generate the features for the protein-ligand complexes using OnionNet V1 + +baseCommand: ["conda", "run", "-n", "py37", "python", "/onionnet/generate_features.py"] +arguments: ["-inp", "input.dat"] + +hints: + DockerRequirement: + dockerPull: ndonyapour/onionnet + +requirements: + - class: InlineJavascriptRequirement + # This enables staging input files and dynamically generating a file + # containing the file paths on the fly. + - class: InitialWorkDirRequirement + listing: | + ${ + var dat_file_contents = ""; + for (var i = 0; i < inputs.pdb_paths.length; i++) { + dat_file_contents += inputs.pdb_paths[i].path + "\n"; + } + // Note: Uses https://www.commonwl.org/v1.0/CommandLineTool.html#Dirent + // and https://www.commonwl.org/user_guide/topics/creating-files-at-runtime.html + // "If the value is a string literal or an expression which evaluates to a string, + // a new file must be created with the string as the file contents." + return ([{"entryname": "input.dat", "entry": dat_file_contents}]); + } + +inputs: + pdb_paths: + label: The path of input pdb files + type: File[] + format: + - edam:format_1476 # PDB + + num_of_cpus: + label: number of CPUs to use. + type: int? + format: + - edam:format_2330 + inputBinding: + prefix: -nt + default: 1 + + output_feature_file: + label: the output file name containing the features. + type: string? + format: + - edam:format_3752 + inputBinding: + prefix: -out + default: "output.csv" + +outputs: + output_feature_file: + type: File + format: edam:format_3752 + outputBinding: + glob: $(inputs.output_feature_file) + +$namespaces: + edam: https://edamontology.org/ + cwltool: http://commonwl.org/cwltool# + +$schemas: +- https://raw.githubusercontent.com/edamontology/edamontology/master/EDAM_dev.owl diff --git a/cwl_adapters/onionnet_score.cwl b/cwl_adapters/onionnet_score.cwl new file mode 100644 index 0000000..6accd62 --- /dev/null +++ b/cwl_adapters/onionnet_score.cwl @@ -0,0 +1,113 @@ +#!/usr/bin/env cwl-runner + +cwlVersion: v1.0 +class: CommandLineTool + +label: OnionNet (version1) for rescoring of docking poses + +baseCommand: ["conda", "run", "-n", "py37", "python", "/onionnet/predict.py"] + +hints: + DockerRequirement: + dockerPull: ndonyapour/onionnet + +requirements: + InlineJavascriptRequirement: {} + +inputs: + input_feature_file: + label: feature csv file for protein-ligand complexes + type: File? + format: + - edam:format_3752 + inputBinding: + prefix: -fn + + scaler: + label: the standard scaler file. + type: string? + format: + - edam:format_2330 + inputBinding: + prefix: -scaler + default: "/onionnet/models/StandardScaler.model" + + weights: + label: the trained DNN model file. + type: string? + format: + - edam:format_2330 + inputBinding: + prefix: -weights + default: "/onionnet/models/CNN_final_model_weights.h5" + + output_score_file: + label: the predicted pKa values file + type: string? + format: + - edam:format_3752 + inputBinding: + prefix: -out + default: "predicted_pKa.csv" + + onionnet_score: + type: string? + +outputs: + output_score_file: + type: File + outputBinding: + glob: $(inputs.output_score_file) + format: edam:format_3752 + + onionnet_score: + label: Estimated Free Energy of Binding (onionnet score) + doc: |- + Estimated Free Energy of Binding + type: float[] + outputBinding: + glob: $(inputs.output_score_file) + loadContents: true + outputEval: | + ${ + const lines = self[0].contents.split("\n"); + // Remove blank lines + var non_blank_lines = lines.filter(function(line) {return line.trim() !== '';}); + var onionnet_dGs = []; + + // Calculate the binding free energy from pKa, + // See https://github.com/PolusAI/mm-workflows/blob/1c2284e5bf517b90950c2f7af574caff3aa536d8/examples/scripts/generate_conformers.py#L31 + // Calculate pKa = -log(Kd), See https://en.wikipedia.org/wiki/Dissociation_constant, + // Calculate the binding free energy from Kd so we can make the correlation plots. dG = RT*ln(Kd/c) + // See https://en.wikipedia.org/wiki/Binding_constant + const ideal_gas_constant = 8.31446261815324; // J/(Mol*K) + const kcal_per_joule = 4184; + // NOTE: Unfortunately, the temperature at which experimental Kd binding data was taken + // is often not recorded. Thus, we are forced to guess. The two standard guesses are + // physiological body temperature (310K) or room temperature (298K). + const temperature = 298; + const RT = (ideal_gas_constant / kcal_per_joule) * temperature; + // NOTE: For performance, simulations are often done in a very small unit cell, and + // thus at a very high concentration. The size of the unit cell bounds the volume. + // For shorter simulations where the ligand has not explored the entire box, it may + // be less. See the Yank paper for a method of calculating the correct volumes. + const standard_concentration = 1; // Units of mol / L, but see comment above. + + for (var i = 1; i < non_blank_lines.length; i++) { + // The correct lines of output score file should be of the form + // pKa_predicted + // /var/lib/cwl/stg19c300d1-f7fd-4a38-80d2-0f5615e3eb8f/complex_pdbs.pdb,7.441 + var pKa_string = non_blank_lines[i].split(",").filter(function(s) {return !isNaN(parseFloat(s))})[0]; + var Kd = (10 ** - parseFloat(pKa_string)); + var dG = RT * Math.log(Kd / standard_concentration); + onionnet_dGs.push(dG); + } + return onionnet_dGs; + } + +$namespaces: + edam: https://edamontology.org/ + cwltool: http://commonwl.org/cwltool# + +$schemas: +- https://raw.githubusercontent.com/edamontology/edamontology/master/EDAM_dev.owl diff --git a/cwl_adapters/rename_residues_mol2.cwl b/cwl_adapters/rename_residues_mol2.cwl new file mode 100644 index 0000000..8be5d22 --- /dev/null +++ b/cwl_adapters/rename_residues_mol2.cwl @@ -0,0 +1,79 @@ +#!/usr/bin/env cwl-runner +cwlVersion: v1.0 + +class: CommandLineTool + +label: Change the labels of residues + +doc: | + Change the labels of residues + +baseCommand: ["python", "/rename_residues.py"] + +hints: + DockerRequirement: + dockerPull: ndonyapour/rename_residues + +inputs: + input_path: + type: File + format: + - edam:format_3816 # mol2 + inputBinding: + prefix: --input_path + + output_path: + label: Path to the output file + doc: |- + Path to the output file + type: string + format: + - edam:format_3816 # mol2 + inputBinding: + prefix: --output_path + default: system.mol2 + + residue_name: + label: The new residue name to which all residues should be changed + doc: |- + The new residue name to which all residues should be changed + type: string + format: + - edam:format_2330 + inputBinding: + prefix: --residue_name + + column_index: + label: The index of column of the line + doc: |- + The index of column of the line + type: int? + format: + - edam:format_2330 + inputBinding: + prefix: --column_index + + line_start_column_idxs: + label: A dictionary string indicating the line start and corresponding column indice + doc: |- + A dictionary string indicating the line start and corresponding column indice + type: string? + format: + - edam:format_2330 + inputBinding: + prefix: --line_start_column_idxs +outputs: + output_path: + label: Path to the output file + doc: |- + Path to the output file + type: File + format: edam:format_3816 # mol2 + outputBinding: + glob: $(inputs.output_path) + +$namespaces: + edam: https://edamontology.org/ + +$schemas: +- https://raw.githubusercontent.com/edamontology/edamontology/master/EDAM_dev.owl \ No newline at end of file diff --git a/cwl_adapters/rename_residues_pdb.cwl b/cwl_adapters/rename_residues_pdb.cwl new file mode 100644 index 0000000..72d5f36 --- /dev/null +++ b/cwl_adapters/rename_residues_pdb.cwl @@ -0,0 +1,76 @@ +#!/usr/bin/env cwl-runner +cwlVersion: v1.0 + +class: CommandLineTool + +label: Change the labels of residues + +doc: | + Change the labels of residues + +baseCommand: ["python", "/rename_residues.py"] + +hints: + DockerRequirement: + dockerPull: ndonyapour/rename_residues + +inputs: + input_path: + label: Input pdb file + type: File + format: + - edam:format_1476 # pdb + inputBinding: + prefix: --input_path + + output_path: + label: Output pdb file + type: string? + format: + - edam:format_1476 # pdb + inputBinding: + prefix: --output_path + default: system.pdb + + residue_name: + label: The new residue name to which all residues should be changed + doc: |- + The new residue name to which all residues should be changed + type: string + format: + - edam:format_2330 + inputBinding: + prefix: --residue_name + + column_index: + label: The index of column of the line + doc: |- + The index of column of the line + type: int? + format: + - edam:format_2330 + inputBinding: + prefix: --column_index + + line_start_column_idxs: + label: A dictionary string indicating the line start and corresponding column indice + doc: |- + A dictionary string indicating the line start and corresponding column indice + type: string? + format: + - edam:format_2330 + inputBinding: + prefix: --line_start_column_idxs + +outputs: + output_path: + type: File + format: edam:format_1476 + outputBinding: + glob: $(inputs.output_path) + +$namespaces: + edam: https://edamontology.org/ + +$schemas: +- https://raw.githubusercontent.com/edamontology/edamontology/master/EDAM_dev.owl \ No newline at end of file diff --git a/cwl_adapters/smina_docking.cwl b/cwl_adapters/smina_docking.cwl index 78b2614..a16c623 100644 --- a/cwl_adapters/smina_docking.cwl +++ b/cwl_adapters/smina_docking.cwl @@ -33,7 +33,6 @@ inputs: - edam:format_3815 - edam:format_3816 inputBinding: - position: 1 prefix: -r ligand_file: @@ -50,7 +49,6 @@ inputs: - edam:format_3815 - edam:format_3816 inputBinding: - position: 2 prefix: -l ligand_box: @@ -67,14 +65,24 @@ inputs: - edam:format_3815 - edam:format_3816 inputBinding: - position: 3 prefix: --autobox_ligand + local_only: + label: try local minimization only rather than docking + type: boolean? + inputBinding: + prefix: --local_only + + score_only: + label: Do not do any conformational search; simply rescore. + type: boolean? + inputBinding: + prefix: --score_only + scoring: label: scoring function option, default is vina, options can be (vina, vinardo, or a customized scoring function) type: string? inputBinding: - position: 4 prefix: --scoring default: "vina" @@ -83,7 +91,6 @@ inputs: type: string? format: edam:format_1476 inputBinding: - position: 5 prefix: -o default: "docked.pdb" diff --git a/docker/dockerBuild.sh b/docker/dockerBuild.sh index 368b8fa..5cfed75 100755 --- a/docker/dockerBuild.sh +++ b/docker/dockerBuild.sh @@ -27,7 +27,6 @@ sudo docker build --no-cache --pull -f Dockerfile_mol2_to_pdbqt -t jakefennick/m sudo docker build --no-cache --pull -f Dockerfile_nmr4md -t jakefennick/nmr4md . sudo docker build --no-cache --pull -f Dockerfile_openbabel -t ndonyapour/openbabel . sudo docker build --no-cache --pull -f Dockerfile_remove_terminal_residue_name_prefixes -t jakefennick/remove_terminal_residue_name_prefixes . -sudo docker build --no-cache --pull -f Dockerfile_rename_residues_mol -t jakefennick/rename_residues_mol . sudo docker build --no-cache --pull -f Dockerfile_combine_structure -t ndonyapour/combine_structure . sudo docker build --no-cache --pull -f Dockerfile_molgan -t ndonyapour/molgan . sudo docker build --no-cache --pull -f Dockerfile_onionnet-sfct -t polusai/onionnet-sfct-tool:1.0.0 . @@ -37,6 +36,9 @@ sudo docker build --no-cache --pull -f Dockerfile_extract_protein -t ndonyapour/ sudo docker build --no-cache --pull -f Dockerfile_extract_ligand_protein -t mrbrandonwalker/extract_ligand_protein . sudo docker build --no-cache --pull -f Dockerfile_fix_pdb_atom_column -t ndonyapour/fix_pdb_atom_column . sudo docker build --no-cache --pull -f Dockerfile_generate_conformers -t ndonyapour/generate_conformers . +sudo docker build --no-cache --pull -f Dockerfile_onionnet -t ndonyapour/onionnet . +sudo docker build --no-cache --pull -f Dockerfile_combine_pdb -t ndonyapour/combine_pdb . +sudo docker build --no-cache --pull -f Dockerfile_rename_residues -t ndonyapour/rename_residues . sudo docker build --no-cache --pull -f Dockerfile_pdbbind_refined -t pdbbind_refined_v2020 . # NOTE: no username cd ../.. diff --git a/dockerPull.sh b/dockerPull.sh index d635d77..0ff87fa 100755 --- a/dockerPull.sh +++ b/dockerPull.sh @@ -19,7 +19,6 @@ docker pull jakefennick/mol2_to_pdbqt docker pull jakefennick/nmr4md docker pull ndonyapour/openbabel docker pull jakefennick/remove_terminal_residue_name_prefixes -docker pull jakefennick/rename_residues_mol docker pull ndonyapour/molgan docker pull cyangnyu/onionnet-sfct docker pull cyangnyu/smina @@ -30,4 +29,7 @@ docker pull ndonyapour/pdbfixer docker pull ndonyapour/extract_protein docker pull mrbrandonwalker/extract_ligand_protein docker pull ndonyapour/fix_pdb_atom_column -docker pull ndonyapour/generate_conformers \ No newline at end of file +docker pull ndonyapour/generate_conformers +docker pull ndonyapour/onionnet +docker pull ndonyapour/combine_pdb +docker pull ndonyapour/rename_residues diff --git a/examples/config_ci.json b/examples/config_ci.json index 467efbd..a31fa5b 100644 --- a/examples/config_ci.json +++ b/examples/config_ci.json @@ -10,7 +10,8 @@ "vs_demo_2_weekly", "vs_demo_3_weekly", "vs_demo_4_weekly", - "docking_rescoring_weekly" + "docking_rescoring_weekly", + "docking_rescoring_onionnet" ], // NOTE: Most of the workflows in this list have free variables because they are subworkflows // i.e. if you try to run them, you will get "Missing required input parameter" diff --git a/examples/docking/convert_ligand_mol2_to_pdbqt_obabel.wic b/examples/docking/convert_ligand_mol2_to_pdbqt_obabel.wic index a18ae51..f4de973 100644 --- a/examples/docking/convert_ligand_mol2_to_pdbqt_obabel.wic +++ b/examples/docking/convert_ligand_mol2_to_pdbqt_obabel.wic @@ -1,7 +1,12 @@ +inputs: + input_path: + type: File + format: + - edam:format_3816 steps: - id: convert_pdbqt in: - input_path: !* conformer.mol2 + input_path: input_path # out: # - output_pdb_path: !& ligand_keywords.pdbqt # NOTE: There is no way to preserve the order of the atoms. Indeed, the diff --git a/examples/docking/docking.wic b/examples/docking/docking.wic index 10cf0f0..07c4402 100644 --- a/examples/docking/docking.wic +++ b/examples/docking/docking.wic @@ -9,9 +9,9 @@ steps: # NOTE: Searching for conformers tends to cause the ligand to curl up into a ball. # Although this lowers its energy in isolation, the decreased surface area tends # to weaken the binding free energy! (as reported by autodock vina) - flc.wic: - in: - sdf_path: sdf_path + flc.wic: + in: + sdf_path: sdf_path # minimize_ligand_only.wic: # NOTE: We converted to mol2 format above because it allows explicit charges. @@ -22,92 +22,96 @@ steps: # NOTE: Rename all residues to MOL before calling acpypye. Otherwise, acpype crashes with: # "ERROR: more than one residue detected '{'UNL', 'MOL'}'" - python3_mol2_to_mol2: - in: - script: !ii /rename_residues_mol.py # NOTE: Initial / required - input_mol2_path: !* ligand_min.mol2 - out: - - output_mol2_path: !& conformer.mol2 + rename_residues_mol2: + in: + input_path: !* ligand_min.mol2 + output_path: !ii conformer.mol2 + residue_name: !ii MOL + column_index: !ii 7 + out: + - output_path: !& conformer.mol2 # NOTE: minimize before calling acpype so 1. tleap complains less about close contacts: # /usr/local/bin/teLeap: Warning! # Close contact of 1.418311 angstroms between .R.A and .R.A # and 2. acpype doesn't complain about 'ERROR: Atoms TOO alone (> 3.0 Ang.)' # (acpype suggests using --force, but it's better to just minimize.) - acpype: - # NOTE: We are using our own acpypye CWL adapter (NOT the biobb version) so - # we have the choice of using charges from the mol2 file. - in: - input_path: !* conformer.mol2 # Do NOT use pose_ligand.pdb - out: - - output_itp_path: !& ligand_GMX.itp - # NOTE: Although we don't need the *.itp topology file yet, we - # need to use these coordinates with autodock because they are - # sorted to agree with the *.itp topology include file. - # Otherwise, we will get the grompp atom name warning (see below). - - output_gro_path: !& ligand_GMX.gro - - output_top_path: !& ligand_GMX.top - #charge_method: user # take charges from mol2 file + acpype: + # NOTE: We are using our own acpypye CWL adapter (NOT the biobb version) so + # we have the choice of using charges from the mol2 file. + in: + input_path: !* conformer.mol2 # Do NOT use pose_ligand.pdb + out: + - output_itp_path: !& ligand_GMX.itp + # NOTE: Although we don't need the *.itp topology file yet, we + # need to use these coordinates with autodock because they are + # sorted to agree with the *.itp topology include file. + # Otherwise, we will get the grompp atom name warning (see below). + - output_gro_path: !& ligand_GMX.gro + - output_top_path: !& ligand_GMX.top + #charge_method: user # take charges from mol2 file # NOTE: acpype doesn't add forcefield or water topology #include lines. - bash_top: - in: - script: !ii /gmx_add_topology_includes.sh # NOTE: Initial / required - input_top_path: !* ligand_GMX.top - out: - - output_top_path: !& ligand_GMX_includes.top - zip_top: - in: - input_top_path: !* ligand_GMX_includes.top - input_itp_path: !* ligand_GMX.itp - out: - - output_top_zip_path: !& ligand_GMX.zip + bash_top: + in: + script: !ii /gmx_add_topology_includes.sh # NOTE: Initial / required + input_top_path: !* ligand_GMX.top + out: + - output_top_path: !& ligand_GMX_includes.top + zip_top: + in: + input_top_path: !* ligand_GMX_includes.top + input_itp_path: !* ligand_GMX.itp + out: + - output_top_zip_path: !& ligand_GMX.zip # Docking - convert_ligand_mol2_to_pdbqt_obabel.wic: -# convert_ligand_mol2_to_pdbqt_mdanalysis.wic: # generates ligand_rigid.pdbqt only + convert_ligand_mol2_to_pdbqt_obabel.wic: + in: + input_path: !* conformer.mol2 +# - convert_ligand_mol2_to_pdbqt_mdanalysis.yml: # generates ligand_rigid.pdbqt only - autodock_vina_run: - in: - input_ligand_pdbqt_path: !* ligand_flex.pdbqt - #input_ligand_pdbqt_path: !* ligand_rigid.pdbqt - input_receptor_pdbqt_path: !* pdb.pdbqt - input_box_path: !* box.pdb - out: - - output_pdbqt_path: !& poses_ligand.pdbqt - - output_log_path: !& vina.log - extract_model_pdbqt: - in: - input_pdbqt_path: !* poses_ligand.pdbqt - output_pdbqt_path: !ii pose_ligand.pdbqt - config: !ii - model: 1 # NOTE: score, rmsd l.b., rmsd u.b. stored in REMARK lines - out: - - output_pdbqt_path: !& pose_ligand.pdbqt - convert_xyz: - in: - input_path: !* pose_ligand.pdbqt - output_xyz_path: !ii pose_ligand.xyz - out: - - output_xyz_path: !& pose_ligand.xyz + autodock_vina_run: + in: + input_ligand_pdbqt_path: !* ligand_flex.pdbqt + #input_ligand_pdbqt_path: !* ligand_rigid.pdbqt + input_receptor_pdbqt_path: !* pdb.pdbqt + input_box_path: !* box.pdb + out: + - output_pdbqt_path: !& poses_ligand.pdbqt + - output_log_path: !& vina.log + extract_model_pdbqt: + in: + input_pdbqt_path: !* poses_ligand.pdbqt + output_pdbqt_path: !ii pose_ligand.pdbqt + config: !ii + model: 1 # NOTE: score, rmsd l.b., rmsd u.b. stored in REMARK lines + out: + - output_pdbqt_path: !& pose_ligand.pdbqt + convert_xyz: + in: + input_path: !* pose_ligand.pdbqt + output_xyz_path: !ii pose_ligand.xyz + out: + - output_xyz_path: !& pose_ligand.xyz # Molecular Dynamics combine receptor & ligand - append_ligand: - in: - input_itp_path: !* ligand_GMX.itp - input_top_zip_path: !* receptor.zip - out: - - output_top_zip_path: !& complex_vac.zip - combine_structure: - in: - input_structure1: !* receptor.xyz # receptor_hydrogens.pdb - input_structure2: !* pose_ligand.xyz - out: - - output_structure_path: !& complex_vac.pdb + append_ligand: + in: + input_itp_path: !* ligand_GMX.itp + input_top_zip_path: !* receptor.zip + out: + - output_top_zip_path: !& complex_vac.zip + combine_structure: + in: + input_structure1: !* receptor.xyz # receptor_hydrogens.pdb + input_structure2: !* pose_ligand.xyz + out: + - output_structure_path: !& complex_vac.pdb wic: graphviz: label: Cheminformatics + Docking + Initial Topology Setup steps: - (2, python3_mol2_to_mol2): + (2, rename_residues_mol2): wic: graphviz: label: Rename Ligand\nResidues to MOL diff --git a/examples/docking/gen_topol_params.wic b/examples/docking/gen_topol_params.wic index 1efb759..430d236 100644 --- a/examples/docking/gen_topol_params.wic +++ b/examples/docking/gen_topol_params.wic @@ -18,12 +18,10 @@ steps: # input_path: !* ligand.pdbqt # out: # - output_mol2_path: !& pose.mol2 - python3_mol2_to_mol2: + rename_residues_mol2: in: - script: !ii /rename_residues_mol.py # NOTE: Initial / required -# input_mol2_path: !* pose.mol2 -# out: -# - output_mol2_path: !& pose_mol.mol2 + residue_name: !ii MOL + column_index: !ii 7 # NOTE: minimize before calling acpype so 1. tleap complains less about close contacts: # /usr/local/bin/teLeap: Warning! # Close contact of 1.418311 angstroms between .R.A and .R.A @@ -89,7 +87,7 @@ wic: wic: graphviz: label: Convert to\nmol2 format - (2, python3_mol2_to_mol2): + (2, rename_residues_mol2): wic: graphviz: label: Rename Residues\nto MOL diff --git a/examples/docking/ligand_modeling_docking.wic b/examples/docking/ligand_modeling_docking.wic index b9a26ec..dbb9d51 100644 --- a/examples/docking/ligand_modeling_docking.wic +++ b/examples/docking/ligand_modeling_docking.wic @@ -9,9 +9,9 @@ steps: # NOTE: Searching for conformers tends to cause the ligand to curl up into a ball. # Although this lowers its energy in isolation, the decreased surface area tends # to weaken the binding free energy! (as reported by autodock vina) - flc.wic: - in: - sdf_path: sdf_path + flc.wic: + in: + sdf_path: sdf_path # minimize_ligand_only.wic: # in: # sdf_path: sdf_path @@ -24,36 +24,41 @@ steps: # NOTE: Rename all residues to MOL before calling acpypye. Otherwise, acpype crashes with: # "ERROR: more than one residue detected '{'UNL', 'MOL'}'" - python3_mol2_to_mol2: - in: - script: !ii /rename_residues_mol.py # NOTE: Initial / required - input_mol2_path: !* ligand_min.mol2 - out: - - output_mol2_path: !& conformer.mol2 + rename_residues_mol2: + in: + input_path: !* ligand_min.mol2 + output_path: !ii conformer.mol2 + residue_name: !ii MOL + column_index: !ii 7 + out: + - output_path: !& conformer.mol2 + # Docking - convert_ligand_mol2_to_pdbqt_obabel.wic: -# convert_ligand_mol2_to_pdbqt_mdanalysis.wic: # generates ligand_rigid.pdbqt only - autodock_vina_run: - in: - input_ligand_pdbqt_path: !* ligand_flex.pdbqt - #input_ligand_pdbqt_path: !* ligand_rigid.pdbqt - input_receptor_pdbqt_path: !* pdb.pdbqt - input_box_path: !* box.pdb - out: + convert_ligand_mol2_to_pdbqt_obabel.wic: + in: + input_path: !* conformer.mol2 +# - convert_ligand_mol2_to_pdbqt_mdanalysis.yml: # generates ligand_rigid.pdbqt only + autodock_vina_run: + in: + input_ligand_pdbqt_path: !* ligand_flex.pdbqt + #input_ligand_pdbqt_path: !* ligand_rigid.pdbqt + input_receptor_pdbqt_path: !* pdb.pdbqt + input_box_path: !* box.pdb + out: # - output_pdbqt_path: !& poses_ligand.pdbqt - - output_log_path: !& vina.log - split_pdbqt: + - output_log_path: !& vina.log + split_pdbqt: # in: # input_path: !* poses_ligand.pdbqt # Scalar type - out: - - output_pdb_path: !& ligand_nested_split.pdbqt + out: + - output_pdb_path: !& ligand_nested_split.pdbqt wic: graphviz: label: Perform Ligand\nModeling & Docking steps: - (2, python3_mol2_to_mol2): + (2, rename_residues_mol2): wic: graphviz: label: Rename Residues\nto MOL diff --git a/examples/onionnet/autodock_onionnet.wic b/examples/onionnet/autodock_onionnet.wic new file mode 100644 index 0000000..8d887f0 --- /dev/null +++ b/examples/onionnet/autodock_onionnet.wic @@ -0,0 +1,255 @@ +## Protein-ligand docking and docking poses re-ranking +## +## input: pdb structures from PDBbind refined dataset +## output: +## 1. Docking +## 2. Scoring the best pose using OnionNet + +steps: + extract_pdbbind_refined: + in: + # https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.query.html + # The query() method uses a slightly modified Python syntax by default. + # For example, the & and | (bitwise) operators have the precedence of their + # boolean cousins, and and or. This is syntactically valid Python, however + # the semantics are different. + query: !ii '(Kd_Ki == "Kd") and (value < 0.001)' + max_row: !ii 1 #25 # Use 1 for CI + convert_Kd_dG: !ii True + output_txt_path: !ii binding_data.txt + out: + - output_txt_path: !& binding_data.txt + - output_pdb_paths: !& pdbbind_pdbs + - output_sdf_paths: !& pdbbind_sdfs + - experimental_dGs: !& exp_dGs + - pdb_ids: !& pdbids + extract_protein: + scatter: [input_pdb_path] + in: + input_pdb_path: !* pdbbind_pdbs + output_pdb_path: !ii pdbbind_protein.pdb + out: + - output_pdb_path: !& pdbbind_protein.pdb + config_tag_pdb: + scatter: [pdb_id] + in: + pdb_id: !* pdbids + filter: !ii False + out: + - output_config_string: !& pdb_id_str + pdb: + scatter: [config] + in: + output_pdb_path: !ii rcsb_protein.pdb + config: !* pdb_id_str + out: + - output_pdb_path: !& rcsb_protein.pdb + pdb_fixer: + in: + input_pdb_path: !* pdbbind_protein.pdb + input_helper_pdb_path: !* rcsb_protein.pdb # Note: PDBFixer utilizes sequence information from this file + output_pdb_path: !ii pdbbind_pdbfixer.pdb + add_atoms: !ii heavy + add_residues: !ii True + replace_nonstandard: !ii True + keep_heterogens: !ii none + scatter: [input_pdb_path, input_helper_pdb_path] + scatterMethod: dotproduct + out: + - output_pdb_path: !& pdbbind_pdbfixer.pdb + setup_pdb.wic: + scatter: [input_pdb_path] + in: + input_pdb_path: !* pdbbind_pdbfixer.pdb + pdb_path: !ii pdb.pdbqt + box_path: !ii box.pdb + box_buffer: !ii 20 # Angstroms + water_type: !ii spce + forcefield: !ii amber99sb-ildn + out: + - pdb_path: !& pdb.pdbqt + - box_path: !& box.pdb + + # assign partial charges (ligand) + convert_mol2: + scatter: [input_path] + in: + input_path: !* pdbbind_sdfs + + rename_residues_mol2: + scatter: [input_path] + in: + output_path: !ii ligand.mol2 + residue_name: !ii MOL + column_index: !ii 7 + out: + - output_path: !& ligand.mol2 + + convert_ligand_mol2_to_pdbqt_obabel.wic: + scatter: [input_path] + autodock_vina: + scatter: [input_ligand_pdbqt_path, input_receptor_pdbqt_path, input_box_path] + scatterMethod: dotproduct + in: + input_ligand_pdbqt_path: !* ligand_rigid.pdbqt + input_receptor_pdbqt_path: !* pdb.pdbqt + input_box_path: !* box.pdb + output_pdbqt_path: !ii poses_ligand.pdbqt + output_log_path: !ii vina.log + num_modes: !ii 20 + out: + - output_pdbqt_path: !& poses_ligand.pdbqt + - output_log_path: !& vina.log' + + extract_model_pdbqt: + scatter: [input_pdbqt_path] + in: + input_pdbqt_path: !* poses_ligand.pdbqt + output_pdbqt_path: !ii pose_ligand.pdbqt + config: !ii + model: 1 # NOTE: score, rmsd l.b., rmsd u.b. stored in REMARK lines + out: + - output_pdbqt_path: !& pose_ligand.pdbqt + + convert_pdb: + scatter: [input_path] + in: + input_path: !* pose_ligand.pdbqt + output_pdb_path: !ii pose_ligand.pdb + out: + - output_pdb_path: !& pose_ligand.pdb + + rename_residues_pdb: + scatter: [input_path] + in: + input_path: !* pose_ligand.pdb + output_path: !ii ligand.pdb + residue_name: !ii LIG + line_start_column_idxs: !ii '{"ATOM": 3, "HETATM": 3, "TER": 2}' + out: + - output_path: !& ligand.pdb + + combine_pdb: + scatter: [input_structure1, input_structure2] + scatterMethod: dotproduct + in: + input_structure1: !* pdbbind_pdbfixer.pdb + input_structure2: !* ligand.pdb + output_structure_path: !ii complex_pdb.pdb + out: + - output_structure_path: !& complex_pdb.pdb + + onionnet_featurize: + in: + pdb_paths: !* complex_pdb.pdb + output_feature_file: !ii output_features.csv + out: + - output_feature_file: !& output_features.csv + + onionnet_score: + in: + input_feature_file: !* output_features.csv + output_score_file: !ii predicted_pKa.csv + onionnet_score: !ii onionnet_score + out: + - output_score_file: !& predicted_pKa.csv + - onionnet_score: !& onionnet_score + + scatter_plot: + in: + xs: !* exp_dGs + ys: !* onionnet_score + output_png_path: !ii dG_exp_wpred.png + out: + - output_png_path: !& dG_exp_wpred.png + + check_linear_fit: + in: + xs: !* exp_dGs + ys: !* onionnet_score + # Use massive tolerances because the re-docking performance of classical scoring functions (Vina) are unreliable. + tol_quad: !ii 0.18 + slope_min: !ii -10 + slope_max: !ii 10 + +wic: + graphviz: + label: Virtual Screening Demo + steps: + (1, extract_pdbbind_refined): + wic: + graphviz: + label: Extract PDBbind Data + (2, extract_protein): + wic: + inlineable: False + graphviz: + label: Extract protein + (3, config_tag_pdb): + wic: + inlineable: False + graphviz: + label: Specify PDB Code + (4, pdb): + wic: + inlineable: False + graphviz: + label: Download PDB File + (5, pdbfixer): + wic: + inlineable: False + graphviz: + label: Fix Protein Structure + (6, setup_pdb.yml): + wic: + inlineable: True + graphviz: + label: Setup PDB + (7, convert_mol2): + wic: + graphviz: + label: Convert Ligand\nsdf to mol2 + (8, rename_residues_mol2): + wic: + graphviz: + label: Rename Ligand\nResidues to MOL + (9, convert_ligand_mol2_to_pdbqt_obabel.yml): + wic: + graphviz: + label: Rename Residues\nto MOL + (10, autodock_vina): + wic: + graphviz: + label: Docking + (11, extract_model_pdbqt): + wic: + graphviz: + label: Extract Best\nDocking Pose + (12, convert_pdb): + wic: + graphviz: + label: Convert to\npdb format + (13, rename_residues_pdb): + wic: + graphviz: + label: Rename residues to LIG + (14, combine_pdb): + wic: + graphviz: + label: Combine Protein & Ligand\nPDB Structures + (15, onionnet_featurize): + wic: + graphviz: + label: Generate features for the Docking Poses + (16, onionnet_score): + wic: + graphviz: + label: Predict Binding Scores + (17, scatter_plot): + wic: + graphviz: + label: Plot Experimental\nvs Predicted Binding + (18, check_linear_fit): + wic: + graphviz: + label: True if fitted slope is between slope_min and slope_max \ No newline at end of file diff --git a/examples/rescoring/docking_rescoring.wic b/examples/onionnet/docking_rescoring.wic similarity index 98% rename from examples/rescoring/docking_rescoring.wic rename to examples/onionnet/docking_rescoring.wic index 97c7b1a..0c2e958 100644 --- a/examples/rescoring/docking_rescoring.wic +++ b/examples/onionnet/docking_rescoring.wic @@ -6,7 +6,6 @@ ## 2. scoring file (vina score, sfct correction, combined_score for re-ranking docking poses) steps: -# - id: extract_pdbbind_refined in: # https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.query.html @@ -15,7 +14,7 @@ steps: # This is syntactically valid Python, however the semantics are different." query: !ii '(Kd_Ki == "Kd") and ((value < 0.0001) or ((0.08 < value) and ( value < 0.1)) or ((8 < value) and ( value < 10)) or (1000 < value))' # to obtain a broader experimental dGs - max_row: !ii 500 # first 500 rows selection. + max_row: !ii 100 # first 500 rows selection. convert_Kd_dG: !ii True output_txt_path: !ii binding_data.txt out: diff --git a/examples/rescoring/docking_rescoring_weekly.wic b/examples/onionnet/docking_rescoring_weekly.wic similarity index 84% rename from examples/rescoring/docking_rescoring_weekly.wic rename to examples/onionnet/docking_rescoring_weekly.wic index 8e56403..6c847f2 100644 --- a/examples/rescoring/docking_rescoring_weekly.wic +++ b/examples/onionnet/docking_rescoring_weekly.wic @@ -8,7 +8,7 @@ wic: steps: (1, extract_pdbbind_refined): in: - max_row: !ii 1000 # Default 500 + max_row: !ii 1000 # Default 100 (2, random_subset_rows): in: num_of_samples: !ii 32 \ No newline at end of file diff --git a/examples/onionnet/replicate_onionnet_paper.wic b/examples/onionnet/replicate_onionnet_paper.wic new file mode 100644 index 0000000..ec3e460 --- /dev/null +++ b/examples/onionnet/replicate_onionnet_paper.wic @@ -0,0 +1,124 @@ +## Protein-ligand dG predcition +## +## input: pdb structures from PDBbind refined dataset +## output: +## 1. Predicting protein-ligand pKa using OnionNet + +steps: + extract_pdbbind_refined: + in: + # https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.query.html + # The query() method uses a slightly modified Python syntax by default. + # For example, the & and | (bitwise) operators have the precedence of their + # boolean cousins, and and or. This is syntactically valid Python, however + # the semantics are different. + query: !ii '(Kd_Ki == "Kd") and (value < 0.001)' + max_row: !ii 1 #25 # Use 1 for CI + convert_Kd_dG: !ii True + output_txt_path: !ii binding_data.txt + out: + - output_txt_path: !& binding_data.txt + - output_pdb_paths: !& pdbbind_pdbs + - output_sdf_paths: !& pdbbind_sdfs + - experimental_dGs: !& exp_dGs + - pdb_ids: !& pdbids + + convert_pdb: + scatter: [input_path] + in: + input_path: !* pdbbind_sdfs + output_pdb_path: !ii ligand_pdbbind.pdb + out: + - output_pdb_path: !& ligand_pdbbind.pdb + + rename_residues_pdb: + scatter: [input_path] + in: + input_path: !* ligand_pdbbind.pdb + output_path: !ii ligand.pdb + residue_name: !ii LIG # Note, the residue name should be three characters + # See: https://www.cgl.ucsf.edu/chimera/1.4/docs/UsersGuide/tutorials/pdbintro.html + line_start_column_idxs: !ii '{"ATOM": 3, "HETATM": 3, "TER": 2}' + out: + - output_path: !& ligand.pdb + + combine_pdb: + scatter: [input_structure1, input_structure2] + scatterMethod: dotproduct + in: + input_structure1: !* pdbbind_pdbs + input_structure2: !* ligand.pdb + output_structure_path: !ii complex_pdbs.pdb + out: + - output_structure_path: !& complex_pdbs.pdb + + onionnet_featurize: + in: + pdb_paths: !* complex_pdbs.pdb + output_feature_file: !ii output_features.csv + out: + - output_feature_file: !& output_features.csv + + onionnet_score: + in: + input_feature_file: !* output_features.csv + output_score_file: !ii predicted_pKa.csv + onionnet_score: !ii onionnet_score + out: + - output_score_file: !& predicted_pKa.csv + - onionnet_score: !& onionnet_score + + scatter_plot: + in: + xs: !* exp_dGs + ys: !* onionnet_score + output_png_path: !ii dG_exp_wpred.png + out: + - output_png_path: !& dG_exp_wpred.png + + check_linear_fit: + in: + xs: !* exp_dGs + ys: !* onionnet_score + # Use massive tolerances because the re-docking performance of classical scoring functions (Vina) are unreliable. + tol_quad: !ii 0.18 + slope_min: !ii -10 + slope_max: !ii 10 + +wic: + graphviz: + label: Virtual Screening Demo + steps: + (1, extract_pdbbind_refined): + wic: + graphviz: + label: Extract PDBbind Data + + (2, convert_pdb): + wic: + graphviz: + label: Convert to PDB + (3, rename_residues_pdb): + wic: + graphviz: + label: Rename residues to LIG + (4, combine_pdb): + wic: + graphviz: + label: Combine Protein & Ligand\nPDB Structures + (5, onionnet_featurize): + wic: + graphviz: + label: Generate features for the Docking Poses + (6, onionnet_score): + wic: + graphviz: + label: Predict Binding Scores + (7, scatter_plot): + wic: + graphviz: + label: Plot Experimental\nvs Predicted Binding + (8, check_linear_fit): + wic: + graphviz: + label: True if fitted slope is between slope_min and slope_max \ No newline at end of file diff --git a/examples/scripts/Dockerfile_combine_pdb b/examples/scripts/Dockerfile_combine_pdb new file mode 100644 index 0000000..71de53f --- /dev/null +++ b/examples/scripts/Dockerfile_combine_pdb @@ -0,0 +1,7 @@ +FROM condaforge/mambaforge +# NOT mambaforge-pypy3 (mdanalysis is incompatible with pypy) + +RUN mamba install -c conda-forge mdanalysis + +ADD combine_pdb.py . +ADD Dockerfile_combine_pdb . diff --git a/examples/scripts/Dockerfile_onionnet b/examples/scripts/Dockerfile_onionnet new file mode 100644 index 0000000..d9132db --- /dev/null +++ b/examples/scripts/Dockerfile_onionnet @@ -0,0 +1,34 @@ +FROM condaforge/mambaforge +# NOT mambaforge-pypy3 (pandas & rdkit & mdtraj are incompatible with pypy) + +# Install requirements +RUN apt-get update && apt-get install -y wget git + +# Create environment +# Since python 3.10 is already installed in the base image condaforge/mambaforge, +# See https://github.com/zhenglz/onionnet#installation +# OnionNet uses sklearn.externals.joblib which is removed in sklearn 0.23 +# RUN mamba install -y python=3.7 Doesn't work +RUN mamba create -n py37 python=3.7 + +# Activate the conda environment +SHELL ["conda", "run", "-n", "py37", "/bin/bash", "-c"] +RUN echo "source activate py37" > ~/.bashrc +ENV PATH /opt/conda/envs/env/bin:$PATH + +# Install the required packages +RUN mamba install -c conda-forge scipy numpy pandas "scikit-learn<0.23" mdtraj openbabel tensorflow -y + +# cleanup +RUN apt-get clean +RUN mamba clean --all --yes + +# Install onionnet +RUN git clone https://github.com/zhenglz/onionnet.git + +# Download models +## the default model of onionnet-v1 in github repo is not correct, the actually size is around 600 MB. +## The authors provided a google drive link to download it, +## but their command wget "https://drive.google.com/uc?export=download&id=1cwJN44TgaVBWYEEb_SGU5JBJp6WbFdM1" -O "CNN_final_model_weights.h5" is not working. +RUN cd /onionnet/models && rm CNN_final_model_weights.h5 && wget -nv --no-clobber https://huggingface.co/ndonyapour/OnionNet-V1/resolve/main/CNN_final_model_weights.h5 +ADD Dockerfile_onionnet . \ No newline at end of file diff --git a/examples/scripts/Dockerfile_rename_residues b/examples/scripts/Dockerfile_rename_residues new file mode 100644 index 0000000..2481697 --- /dev/null +++ b/examples/scripts/Dockerfile_rename_residues @@ -0,0 +1,7 @@ +FROM python + +RUN apt-get update && apt-get install -y wget +RUN apt-get clean + +Add rename_residues.py . +Add Dockerfile_rename_residues . diff --git a/examples/scripts/combine_pdb.py b/examples/scripts/combine_pdb.py new file mode 100644 index 0000000..9c00e08 --- /dev/null +++ b/examples/scripts/combine_pdb.py @@ -0,0 +1,61 @@ +import sys +import os +import argparse +from typing import Optional +import numpy as np + +import MDAnalysis as mda + + +def parse_arguments() -> argparse.Namespace: + """ This function parses the arguments. + + Returns: + argparse.Namespace: The command line arguments + """ + parser = argparse.ArgumentParser() + parser.add_argument('--input_structure1', type=str) + parser.add_argument('--input_structure2', type=str) + parser.add_argument('--output_structure_path', type=str) + args = parser.parse_args() + return args + + +def combine_structure_rdkit(input_structure1_path: str, input_structure2_path: str, output_structure_path: str) -> None: + """ Combine two structures into a single PDB file using MDAnalysis + + Args: + input_structure1_path (str): The path to the PDB structure 1 + input_structure2_path (str): The path to the PDB structure 2 + output_structure_path (str): The path to the output combined PDB structure + """ + + u1 = mda.Universe(input_structure1_path) + u2 = mda.Universe(input_structure2_path) + + merged = mda.Merge(u1.atoms, u2.atoms) + merged_elements = np.concatenate([u1.atoms.elements, u2.atoms.elements]) + + merged.atoms.elements = merged_elements + all_atoms = merged.select_atoms("all") + all_atoms.write(output_structure_path) + + +def main() -> None: + """ Reads the command line arguments and combine two structures into a single PDB file + """ + args = parse_arguments() + + if not os.path.exists(args.input_structure1): + print(f'Error: Can not find file {args.input_structure1}') + sys.exit(1) + + if not os.path.exists(args.input_structure2): + print(f'Error: Can not find file {args.input_structure2}') + sys.exit(1) + + combine_structure_rdkit(args.input_structure1, args.input_structure2, args.output_structure_path) + + +if __name__ == '__main__': + main() diff --git a/examples/scripts/rename_residues.py b/examples/scripts/rename_residues.py new file mode 100644 index 0000000..50a255f --- /dev/null +++ b/examples/scripts/rename_residues.py @@ -0,0 +1,86 @@ +import sys +import os +import argparse +import ast +from typing import Dict + + +def parse_arguments() -> argparse.Namespace: + """ This function parses the arguments. + + Returns: + argparse.Namespace: The command line arguments + """ + parser = argparse.ArgumentParser() + parser.add_argument('--input_path', type=str) + parser.add_argument('--output_path', type=str) + parser.add_argument('--residue_name', type=str) + parser.add_argument('--column_index', type=int, required=False, default=None) + parser.add_argument('--line_start_column_idxs', type=str, required=False, default=None) + args = parser.parse_args() + return args + + +def rename_residues(input_path: str, output_path: str, residue_name: str, + column_index: int, line_start_column_idxs: str) -> None: + """ Modifies the names of residues based on the specified column index or + line start and column indices + + Args: + input_path (str): input file path + output_path (str): output file path + residue_name (str): The new residue name to which all residues should be changed + column_index (int): The index of column + line_start_column_idxs (str): A dictionary string indicating the line start and corresponding column indices, + such as '{"ATOM": 3, "HETATM": 3}'. + """ + line_start_column_idxs_dict: Dict[str, int] = {} + + if line_start_column_idxs: + # Parse the dictionary string to a dictionary + line_start_column_idxs_dict = ast.literal_eval(line_start_column_idxs) + + with open(input_path, mode='r', encoding='utf-8') as f: + lines = f.readlines() + + lines_new = [] + for line in lines: + l = line + words = l.split() + if column_index: + if len(words) >= column_index: + l = l.replace(words[column_index], residue_name) + elif line_start_column_idxs: + for line_start, cindex in line_start_column_idxs_dict.items(): + if words[0] == line_start: + l = l.replace(words[cindex], residue_name) + + lines_new.append(l) + + with open(output_path, mode='w', encoding='utf-8') as f: + f.writelines(lines_new) + + +def main() -> None: + """ Reads the command line arguments and renames the residues + """ + args = parse_arguments() + + if args.input_path and not os.path.exists(args.input_path): + print(f'Error: Can not find file {args.input_path}') + sys.exit(1) + + if (args.column_index is None) and (args.line_start_column_idxs is None): + print("Error: No column index information is provided") + sys.exit(1) + + if (args.column_index) and (args.line_start_column_idxs): + print("Error: One of column_index or line_start_column_idxs arguments should be provided") + sys.exit(1) + + rename_residues(args.input_path, args.output_path, args.residue_name, + args.column_index, args.line_start_column_idxs) + + +if __name__ == '__main__': + main() diff --git a/examples/scripts/rename_residues_mol.py b/examples/scripts/rename_residues_mol.py deleted file mode 100644 index 29550fc..0000000 --- a/examples/scripts/rename_residues_mol.py +++ /dev/null @@ -1,20 +0,0 @@ -import sys - -input_mol2_path = sys.argv[1] -output_mol2_path = sys.argv[2] - -with open(input_mol2_path, mode='r', encoding='utf-8') as f: - lines = f.readlines() - -lines_new = [] -index = 7 # mol2 file format residue name column index -for line in lines: - l = line - words = l.split() - if len(words) >= index: # TODO: and only for ATOM records - l = l.replace(words[index], 'MOL') - - lines_new.append(l) - -with open(output_mol2_path, mode='w', encoding='utf-8') as f: - f.writelines(lines_new)