|
4 | 4 | "cell_type": "markdown", |
5 | 5 | "metadata": {}, |
6 | 6 | "source": [ |
7 | | - "## Dataset Analysis Plotting" |
| 7 | + "## Dataset Interaction Analysis Plotting" |
8 | 8 | ] |
9 | 9 | }, |
10 | 10 | { |
|
67 | 67 | "source": [ |
68 | 68 | "ad_set_dir = os.path.join(\"..\", \"data\", \"astex_diverse_set\")\n", |
69 | 69 | "pb_set_dir = os.path.join(\"..\", \"data\", \"posebusters_benchmark_set\")\n", |
| 70 | + "dg_set_dir = os.path.join(\"..\", \"data\", \"dockgen_set\")\n", |
70 | 71 | "casp15_set_dir = os.path.join(\n", |
71 | 72 | " \"..\",\n", |
72 | 73 | " \"data\",\n", |
73 | | - " \"casp15_set\",\n", |
| 74 | + " \"casp15_set_public\",\n", |
74 | 75 | " \"targets\",\n", |
75 | | - ")\n", |
| 76 | + ") # NOTE: change to `casp15_set` directory as needed\n", |
76 | 77 | "assert os.path.exists(\n", |
77 | 78 | " ad_set_dir\n", |
78 | 79 | "), \"Please download the Astex Diverse set from `https://zenodo.org/records/11199233` before proceeding.\"\n", |
79 | 80 | "assert os.path.exists(\n", |
80 | 81 | " pb_set_dir\n", |
81 | 82 | "), \"Please download the PoseBusters Benchmark set from `https://zenodo.org/records/11199233` before proceeding.\"\n", |
82 | 83 | "assert os.path.exists(\n", |
| 84 | + " dg_set_dir\n", |
| 85 | + "), \"Please download the DockGen set from `https://zenodo.org/records/11199233` before proceeding.\"\n", |
| 86 | + "assert os.path.exists(\n", |
83 | 87 | " casp15_set_dir\n", |
84 | 88 | "), \"Please download the (public) CASP15 set from `https://zenodo.org/records/11199233` before proceeding.\"\n", |
85 | 89 | "\n", |
|
248 | 252 | " store.put(f\"df_{i}\", df)" |
249 | 253 | ] |
250 | 254 | }, |
| 255 | + { |
| 256 | + "cell_type": "markdown", |
| 257 | + "metadata": {}, |
| 258 | + "source": [ |
| 259 | + "##### Analyze `DockGen` set interactions" |
| 260 | + ] |
| 261 | + }, |
| 262 | + { |
| 263 | + "cell_type": "code", |
| 264 | + "execution_count": null, |
| 265 | + "metadata": {}, |
| 266 | + "outputs": [], |
| 267 | + "source": [ |
| 268 | + "if not os.path.exists(\"dockgen_interaction_dataframes.h5\"):\n", |
| 269 | + " dockgen_test_ids_filepath = os.path.join(\n", |
| 270 | + " \"..\", \"data\", \"dockgen_set\", \"split_test.txt\"\n", |
| 271 | + " ) # NOTE: change as needed\n", |
| 272 | + " assert os.path.exists(\n", |
| 273 | + " dockgen_test_ids_filepath\n", |
| 274 | + " ), f\"Invalid test IDs filepath for DockGen: {os.path.exists(dockgen_test_ids_filepath)}.\"\n", |
| 275 | + " with open(dockgen_test_ids_filepath) as f:\n", |
| 276 | + " pdb_ids = {line.replace(\" \", \"-\") for line in f.read().splitlines()}\n", |
| 277 | + " dg_protein_ligand_filepath_pairs = []\n", |
| 278 | + " for item in os.listdir(dg_set_dir):\n", |
| 279 | + " if item not in pdb_ids:\n", |
| 280 | + " continue\n", |
| 281 | + " item_path = os.path.join(dg_set_dir, item)\n", |
| 282 | + " if os.path.isdir(item_path):\n", |
| 283 | + " protein_filepath = os.path.join(item_path, f\"{item}_protein_processed.pdb\")\n", |
| 284 | + " ligand_filepath = os.path.join(item_path, f\"{item}_ligand.pdb\")\n", |
| 285 | + " if os.path.exists(protein_filepath) and os.path.exists(ligand_filepath):\n", |
| 286 | + " dg_protein_ligand_filepath_pairs.append((protein_filepath, ligand_filepath))\n", |
| 287 | + "\n", |
| 288 | + " pc = (\n", |
| 289 | + " PoseCheck()\n", |
| 290 | + " ) # NOTE: despite what `PoseCheck` might say, `reduce` should be available in the `PoseBench` environment\n", |
| 291 | + " dg_protein_ligand_interaction_dfs = []\n", |
| 292 | + " for protein_filepath, ligand_filepath in tqdm(\n", |
| 293 | + " dg_protein_ligand_filepath_pairs, desc=\"Processing DockGen set\"\n", |
| 294 | + " ):\n", |
| 295 | + " try:\n", |
| 296 | + " temp_protein_filepath = create_temp_pdb_with_only_molecule_type_residues(\n", |
| 297 | + " protein_filepath, molecule_type=\"protein\"\n", |
| 298 | + " )\n", |
| 299 | + " ligand_mol = Chem.MolFromPDBFile(ligand_filepath)\n", |
| 300 | + " if ligand_mol is None:\n", |
| 301 | + " ligand_mol = Chem.MolFromPDFile(ligand_filepath, sanitize=False)\n", |
| 302 | + " pc.load_protein_from_pdb(temp_protein_filepath)\n", |
| 303 | + " pc.load_ligands_from_mols([ligand_mol])\n", |
| 304 | + " dg_protein_ligand_interaction_dfs.append(pc.calculate_interactions())\n", |
| 305 | + " except Exception as e:\n", |
| 306 | + " print(\n", |
| 307 | + " f\"Error processing Dockgen filepath pari {temp_protein_filepath} and {ligand_filepath} due to: {e}. Skipping...\"\n", |
| 308 | + " )\n", |
| 309 | + " continue\n", |
| 310 | + "\n", |
| 311 | + " # NOTE: we iteratively save the interaction dataframes to an HDF5 file\n", |
| 312 | + " with pd.HDFStore(\"dockgen_interaction_dataframes.h5\") as store:\n", |
| 313 | + " for i, df in enumerate(dg_protein_ligand_interaction_dfs):\n", |
| 314 | + " store.put(f\"df_{i}\", df)" |
| 315 | + ] |
| 316 | + }, |
251 | 317 | { |
252 | 318 | "cell_type": "markdown", |
253 | 319 | "metadata": {}, |
|
293 | 359 | " pc.load_ligands_from_mols(\n", |
294 | 360 | " Chem.GetMolFrags(ligand_mol, asMols=True, sanitizeFrags=False)\n", |
295 | 361 | " )\n", |
296 | | - " interactions = pc.calculate_interactions()\n", |
297 | | - " casp15_protein_ligand_interaction_dfs.append(interactions)\n", |
| 362 | + " casp15_protein_ligand_interaction_dfs.append(pc.calculate_interactions())\n", |
298 | 363 | " except Exception as e:\n", |
299 | 364 | " print(\n", |
300 | 365 | " f\"Error processing CASP15 target {protein_ligand_complex_filepath} due to: {e}. Skipping...\"\n", |
|
366 | 431 | " process_dataset(\"posebusters_benchmark_interaction_dataframes.h5\", \"PoseBusters Benchmark\")\n", |
367 | 432 | " )\n", |
368 | 433 | "\n", |
| 434 | + "if os.path.exists(\"dockgen_interaction_dataframes.h5\"):\n", |
| 435 | + " dfs.append(process_dataset(\"dockgen_interaction_dataframes.h5\", \"DockGen\"))\n", |
| 436 | + "\n", |
369 | 437 | "if os.path.exists(\"casp15_interaction_dataframes.h5\"):\n", |
370 | 438 | " dfs.append(process_dataset(\"casp15_interaction_dataframes.h5\", \"CASP15\"))\n", |
371 | 439 | "\n", |
|
416 | 484 | " ax.grid(True)\n", |
417 | 485 | "\n", |
418 | 486 | "plt.tight_layout()\n", |
419 | | - "plt.savefig(\"astex_posebusters_casp15_interaction_analysis.png\", dpi=300)\n", |
| 487 | + "plt.savefig(\"dataset_interaction_analysis.png\", dpi=300)\n", |
420 | 488 | "plt.show()" |
421 | 489 | ] |
422 | 490 | } |
|
0 commit comments