diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 0484715..ca4b547 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -38,7 +38,7 @@ repos: # Ruff for linting and formatting Python files - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.14.10 + rev: v0.14.13 hooks: - id: ruff-check args: ["--fix"] diff --git a/notebooks/0.download-data/2.preprocessing.ipynb b/notebooks/0.download-data/2.preprocessing.ipynb index 6b40b1d..5ecd508 100644 --- a/notebooks/0.download-data/2.preprocessing.ipynb +++ b/notebooks/0.download-data/2.preprocessing.ipynb @@ -548,6 +548,9 @@ "# adding a unique cell ID based on all features\n", "cfret_profiles = add_cell_id_hash(cfret_profiles, force=True)\n", "\n", + "# drop rows cells that have been treated with drug_x\n", + "cfret_profiles = cfret_profiles.filter(pl.col(\"Metadata_treatment\") != \"drug_x\")\n", + "\n", "# split features\n", "meta_cols, features_cols = split_meta_and_features(cfret_profiles)\n", "\n", diff --git a/notebooks/0.download-data/nbconverted/2.preprocessing.py b/notebooks/0.download-data/nbconverted/2.preprocessing.py index b643cef..07005d7 100644 --- a/notebooks/0.download-data/nbconverted/2.preprocessing.py +++ b/notebooks/0.download-data/nbconverted/2.preprocessing.py @@ -453,6 +453,9 @@ def remove_feature_prefixes(df: pl.DataFrame, prefix: str = "CP__") -> pl.DataFr # adding a unique cell ID based on all features cfret_profiles = add_cell_id_hash(cfret_profiles, force=True) +# drop rows cells that have been treated with drug_x +cfret_profiles = cfret_profiles.filter(pl.col("Metadata_treatment") != "drug_x") + # split features meta_cols, features_cols = split_meta_and_features(cfret_profiles) diff --git a/notebooks/2.cfret-analysis/1.cfret-pilot-buscar-analysis.ipynb b/notebooks/2.cfret-analysis/1.cfret-pilot-buscar-analysis.ipynb index 4866206..790c96c 100644 --- a/notebooks/2.cfret-analysis/1.cfret-pilot-buscar-analysis.ipynb +++ b/notebooks/2.cfret-analysis/1.cfret-pilot-buscar-analysis.ipynb @@ -70,18 +70,14 @@ "treatment_col = \"Metadata_treatment\"\n", "treatment_heart_col = \"Metadata_treatment_and_heart\"\n", "\n", - "# parameters used for clustering optimization\n", + "# parameter grid for clustering optimization\n", "cfret_pilot_cluster_param_grid = {\n", - " # Clustering resolution: how granular the clusters should be\n", - " \"cluster_resolution\": {\"type\": \"float\", \"low\": 0.1, \"high\": 2.2},\n", - " # Number of neighbors for graph construction\n", - " \"n_neighbors\": {\"type\": \"int\", \"low\": 5, \"high\": 100},\n", - " # Clustering algorithm\n", - " \"cluster_method\": {\"type\": \"categorical\", \"choices\": [\"leiden\"]},\n", - " # Distance metric for neighbor computation\n", + " \"cluster_resolution\": {\"type\": \"float\", \"low\": 0.05, \"high\": 3.0},\n", + " \"n_neighbors\": {\"type\": \"int\", \"low\": 10, \"high\": 100},\n", + " \"cluster_method\": {\"type\": \"categorical\", \"choices\": [\"leiden\", \"louvain\"]},\n", " \"neighbor_distance_metric\": {\n", " \"type\": \"categorical\",\n", - " \"choices\": [\"euclidean\", \"cosine\", \"manhattan\"],\n", + " \"choices\": [\"cosine\", \"euclidean\", \"manhattan\"],\n", " },\n", "}" ] @@ -113,8 +109,24 @@ ").resolve(strict=True)\n", "\n", "# make results dir\n", - "results_dir = pathlib.Path(\"./results/cfret-pilot\").resolve()\n", - "results_dir.mkdir(parents=True, exist_ok=True)" + "results_dir = pathlib.Path(\"./results\").resolve()\n", + "results_dir.mkdir(parents=True, exist_ok=True)\n", + "\n", + "# set signatures results dir\n", + "signatures_results_dir = (results_dir / \"signatures\").resolve()\n", + "signatures_results_dir.mkdir(parents=True, exist_ok=True)\n", + "\n", + "# set cluster labels results dir\n", + "cluster_labels_results_dir = (results_dir / \"clusters\").resolve()\n", + "cluster_labels_results_dir.mkdir(parents=True, exist_ok=True)\n", + "\n", + "# set pca results dir\n", + "transformed_results_dir = (results_dir / \"transformed-data\").resolve()\n", + "transformed_results_dir.mkdir(parents=True, exist_ok=True)\n", + "\n", + "# set phenotypic scores results dir\n", + "phenotypic_scores_results_dir = (results_dir / \"phenotypic_scores\").resolve()\n", + "phenotypic_scores_results_dir.mkdir(parents=True, exist_ok=True)" ] }, { @@ -225,20 +237,20 @@ "cfret_df.head()" ] }, + { + "cell_type": "markdown", + "id": "d75068b9", + "metadata": {}, + "source": [ + "Display the treatments and number of cells per heart-treatment combination " + ] + }, { "cell_type": "code", "execution_count": 5, "id": "c7e0c3d2", "metadata": {}, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/tmp/ipykernel_339079/1240663315.py:4: DeprecationWarning: `GroupBy.count` was renamed; use `GroupBy.len` instead\n", - " cfret_df.group_by(treatment_heart_col).count().sort(treatment_heart_col)\n" - ] - }, { "data": { "text/html": [ @@ -249,22 +261,20 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (6, 2)
Metadata_treatment_and_heartcount
stru32
"DMSO_heart_11"1366
"DMSO_heart_9"9153
"TGFRi_heart_11"1486
"TGFRi_heart_9"3788
"drug_x_heart_11"1427
"drug_x_heart_9"3645
" + "shape: (4, 2)
Metadata_treatment_and_heartlen
stru32
"DMSO_heart_11"1366
"DMSO_heart_9"9153
"TGFRi_heart_11"1486
"TGFRi_heart_9"3788
" ], "text/plain": [ - "shape: (6, 2)\n", - "┌──────────────────────────────┬───────┐\n", - "│ Metadata_treatment_and_heart ┆ count │\n", - "│ --- ┆ --- │\n", - "│ str ┆ u32 │\n", - "╞══════════════════════════════╪═══════╡\n", - "│ DMSO_heart_11 ┆ 1366 │\n", - "│ DMSO_heart_9 ┆ 9153 │\n", - "│ TGFRi_heart_11 ┆ 1486 │\n", - "│ TGFRi_heart_9 ┆ 3788 │\n", - "│ drug_x_heart_11 ┆ 1427 │\n", - "│ drug_x_heart_9 ┆ 3645 │\n", - "└──────────────────────────────┴───────┘" + "shape: (4, 2)\n", + "┌──────────────────────────────┬──────┐\n", + "│ Metadata_treatment_and_heart ┆ len │\n", + "│ --- ┆ --- │\n", + "│ str ┆ u32 │\n", + "╞══════════════════════════════╪══════╡\n", + "│ DMSO_heart_11 ┆ 1366 │\n", + "│ DMSO_heart_9 ┆ 9153 │\n", + "│ TGFRi_heart_11 ┆ 1486 │\n", + "│ TGFRi_heart_9 ┆ 3788 │\n", + "└──────────────────────────────┴──────┘" ] }, "execution_count": 5, @@ -276,7 +286,7 @@ "# show how many cells per treatment\n", "# shows the number of cells per treatment that will be clustered.\n", "cells_per_treatment_counts = (\n", - " cfret_df.group_by(treatment_heart_col).count().sort(treatment_heart_col)\n", + " cfret_df.group_by(treatment_heart_col).len().sort(treatment_heart_col)\n", ")\n", "cells_per_treatment_counts" ] @@ -302,10 +312,18 @@ "execution_count": 6, "id": "437ca668", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Signatures already exist, skipping this step.\n" + ] + } + ], "source": [ "# setting output paths\n", - "signatures_outpath = (results_dir / \"cfret_pilot_signatures.json\").resolve()\n", + "signatures_outpath = (signatures_results_dir / \"cfret_pilot_signatures.json\").resolve()\n", "\n", "if signatures_outpath.exists():\n", " print(\"Signatures already exist, skipping this step.\")\n", @@ -333,10 +351,10 @@ }, { "cell_type": "markdown", - "id": "19b63da0", + "id": "3260617b", "metadata": {}, "source": [ - "Search for heterogenous effects for each treatment" + "Transform raw data into PCA components that explains 95% of the variance." ] }, { @@ -352,11 +370,33 @@ " meta_features=cfret_meta,\n", " morph_features=cfret_feats,\n", " var_explained=0.95,\n", - " seed=0,\n", + " random_state=0,\n", ")\n", "\n", + "# save PCA transformed data\n", + "pca_cfret_outpath = (\n", + " transformed_results_dir / \"cfret_pca_profiles_95var.parquet\"\n", + ").resolve()\n", + "pca_cfret_df.write_parquet(pca_cfret_outpath)\n", + "\n", "# update cfret_feats because PCA was applied\n", - "cfret_pca_feats = pca_cfret_df.drop(cfret_meta).columns" + "cfret_pca_feats = pca_cfret_df.drop(cfret_meta).columns\n", + "\n", + "# save feature space\n", + "with open(transformed_results_dir / \"cfret_pca_feature_space.json\", \"w\") as f:\n", + " json.dump(\n", + " {\"metadata-features\": cfret_meta, \"morphological-features\": cfret_pca_feats},\n", + " f,\n", + " indent=4,\n", + " )" + ] + }, + { + "cell_type": "markdown", + "id": "d68e152d", + "metadata": {}, + "source": [ + "This section applies clustering to identify distinct cell populations within each treatment condition. The clustering is optimized using Optuna to find the best hyperparameters (resolution, number of neighbors, and distance metric) that maximize separation of cell populations while maintaining biological relevance." ] }, { @@ -369,87 +409,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "Number of unique treatments (hearts + treatment): 6\n", - "Optimizing clustering for 6 treatment(s) with 6 job(s)...\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/erikserrano/Software/miniconda3/envs/buscar/lib/python3.12/site-packages/louvain/__init__.py:54: UserWarning: pkg_resources is deprecated as an API. See https://setuptools.pypa.io/en/latest/pkg_resources.html. The pkg_resources package is slated for removal as early as 2025-11-30. Refrain from using this package or pin to Setuptools<81.\n", - " from pkg_resources import get_distribution, DistributionNotFound\n", - "/home/erikserrano/Software/miniconda3/envs/buscar/lib/python3.12/site-packages/louvain/__init__.py:54: UserWarning: pkg_resources is deprecated as an API. See https://setuptools.pypa.io/en/latest/pkg_resources.html. The pkg_resources package is slated for removal as early as 2025-11-30. Refrain from using this package or pin to Setuptools<81.\n", - " from pkg_resources import get_distribution, DistributionNotFound\n", - "/home/erikserrano/Software/miniconda3/envs/buscar/lib/python3.12/site-packages/louvain/__init__.py:54: UserWarning: pkg_resources is deprecated as an API. See https://setuptools.pypa.io/en/latest/pkg_resources.html. The pkg_resources package is slated for removal as early as 2025-11-30. Refrain from using this package or pin to Setuptools<81.\n", - " from pkg_resources import get_distribution, DistributionNotFound\n", - "/home/erikserrano/Software/miniconda3/envs/buscar/lib/python3.12/site-packages/louvain/__init__.py:54: UserWarning: pkg_resources is deprecated as an API. See https://setuptools.pypa.io/en/latest/pkg_resources.html. The pkg_resources package is slated for removal as early as 2025-11-30. Refrain from using this package or pin to Setuptools<81.\n", - " from pkg_resources import get_distribution, DistributionNotFound\n", - "/home/erikserrano/Software/miniconda3/envs/buscar/lib/python3.12/site-packages/louvain/__init__.py:54: UserWarning: pkg_resources is deprecated as an API. See https://setuptools.pypa.io/en/latest/pkg_resources.html. The pkg_resources package is slated for removal as early as 2025-11-30. Refrain from using this package or pin to Setuptools<81.\n", - " from pkg_resources import get_distribution, DistributionNotFound\n", - "/home/erikserrano/Software/miniconda3/envs/buscar/lib/python3.12/site-packages/louvain/__init__.py:54: UserWarning: pkg_resources is deprecated as an API. See https://setuptools.pypa.io/en/latest/pkg_resources.html. The pkg_resources package is slated for removal as early as 2025-11-30. Refrain from using this package or pin to Setuptools<81.\n", - " from pkg_resources import get_distribution, DistributionNotFound\n", - "[I 2025-11-09 12:30:35,009] A new study created in memory with name: cfret_pilot_pca_DMSO_heart_11\n", - "[I 2025-11-09 12:30:35,076] A new study created in memory with name: cfret_pilot_pca_drug_x_heart_11\n", - "[I 2025-11-09 12:30:35,158] A new study created in memory with name: cfret_pilot_pca_TGFRi_heart_9\n", - "[I 2025-11-09 12:30:35,173] A new study created in memory with name: cfret_pilot_pca_drug_x_heart_9\n", - "[I 2025-11-09 12:30:35,196] A new study created in memory with name: cfret_pilot_pca_TGFRi_heart_11\n", - "[I 2025-11-09 12:30:35,222] A new study created in memory with name: cfret_pilot_pca_DMSO_heart_9\n", - "/home/erikserrano/Projects/buscar/notebooks/2.cfret-analysis/../../utils/heterogeneity.py:236: FutureWarning: In the future, the default backend for leiden will be igraph instead of leidenalg.\n", - "\n", - " To achieve the future defaults please pass: flavor=\"igraph\" and n_iterations=2. directed must also be False to work with igraph's implementation.\n", - " sc.tl.leiden(\n", - "/home/erikserrano/Projects/buscar/notebooks/2.cfret-analysis/../../utils/heterogeneity.py:236: FutureWarning: In the future, the default backend for leiden will be igraph instead of leidenalg.\n", - "\n", - " To achieve the future defaults please pass: flavor=\"igraph\" and n_iterations=2. directed must also be False to work with igraph's implementation.\n", - " sc.tl.leiden(\n", - "/home/erikserrano/Projects/buscar/notebooks/2.cfret-analysis/../../utils/heterogeneity.py:236: FutureWarning: In the future, the default backend for leiden will be igraph instead of leidenalg.\n", - "\n", - " To achieve the future defaults please pass: flavor=\"igraph\" and n_iterations=2. directed must also be False to work with igraph's implementation.\n", - " sc.tl.leiden(\n", - "/home/erikserrano/Projects/buscar/notebooks/2.cfret-analysis/../../utils/heterogeneity.py:236: FutureWarning: In the future, the default backend for leiden will be igraph instead of leidenalg.\n", - "\n", - " To achieve the future defaults please pass: flavor=\"igraph\" and n_iterations=2. directed must also be False to work with igraph's implementation.\n", - " sc.tl.leiden(\n", - "[I 2025-11-09 12:30:42,826] Trial 0 finished with value: 0.04100142061144193 and parameters: {'cluster_resolution': 0.615874986133352, 'n_neighbors': 35, 'cluster_method': 'leiden', 'neighbor_distance_metric': 'cosine'}. Best is trial 0 with value: 0.04100142061144193.\n", - "/home/erikserrano/Projects/buscar/notebooks/2.cfret-analysis/../../utils/heterogeneity.py:236: FutureWarning: In the future, the default backend for leiden will be igraph instead of leidenalg.\n", - "\n", - " To achieve the future defaults please pass: flavor=\"igraph\" and n_iterations=2. directed must also be False to work with igraph's implementation.\n", - " sc.tl.leiden(\n", - "[I 2025-11-09 12:30:43,599] Trial 0 finished with value: -1.0 and parameters: {'cluster_resolution': 0.14721895087458015, 'n_neighbors': 19, 'cluster_method': 'leiden', 'neighbor_distance_metric': 'manhattan'}. Best is trial 0 with value: -1.0.\n", - "[I 2025-11-09 12:30:43,604] Trial 0 finished with value: 0.028557868208511677 and parameters: {'cluster_resolution': 0.43038472133043715, 'n_neighbors': 63, 'cluster_method': 'leiden', 'neighbor_distance_metric': 'cosine'}. Best is trial 0 with value: 0.028557868208511677.\n", - "[I 2025-11-09 12:30:43,622] Trial 1 finished with value: 0.04132911168771784 and parameters: {'cluster_resolution': 0.7747165384593787, 'n_neighbors': 43, 'cluster_method': 'leiden', 'neighbor_distance_metric': 'euclidean'}. Best is trial 1 with value: 0.04132911168771784.\n", - "[I 2025-11-09 12:30:43,988] Trial 0 finished with value: -0.011059941009945867 and parameters: {'cluster_resolution': 1.9927959039991667, 'n_neighbors': 5, 'cluster_method': 'leiden', 'neighbor_distance_metric': 'euclidean'}. Best is trial 0 with value: -0.011059941009945867.\n", - "[I 2025-11-09 12:30:44,240] Trial 1 finished with value: 0.20252471787791898 and parameters: {'cluster_resolution': 0.26087142409254793, 'n_neighbors': 20, 'cluster_method': 'leiden', 'neighbor_distance_metric': 'cosine'}. Best is trial 1 with value: 0.20252471787791898.\n", - "[I 2025-11-09 12:30:44,422] Trial 0 finished with value: 0.005309892082091613 and parameters: {'cluster_resolution': 1.806574588577733, 'n_neighbors': 55, 'cluster_method': 'leiden', 'neighbor_distance_metric': 'cosine'}. Best is trial 0 with value: 0.005309892082091613.\n", - "[I 2025-11-09 12:30:46,969] Trial 1 finished with value: 0.010633223155152081 and parameters: {'cluster_resolution': 1.808553434505587, 'n_neighbors': 94, 'cluster_method': 'leiden', 'neighbor_distance_metric': 'cosine'}. Best is trial 1 with value: 0.010633223155152081.\n", - "[I 2025-11-09 12:30:47,238] Trial 1 finished with value: 0.00606484131259955 and parameters: {'cluster_resolution': 1.1850800391535194, 'n_neighbors': 30, 'cluster_method': 'leiden', 'neighbor_distance_metric': 'manhattan'}. Best is trial 1 with value: 0.00606484131259955.\n", - "[I 2025-11-09 12:30:48,420] Trial 1 finished with value: 0.004045281868313049 and parameters: {'cluster_resolution': 2.0546310508099337, 'n_neighbors': 57, 'cluster_method': 'leiden', 'neighbor_distance_metric': 'euclidean'}. Best is trial 1 with value: 0.004045281868313049.\n", - "/home/erikserrano/Projects/buscar/notebooks/2.cfret-analysis/../../utils/heterogeneity.py:236: FutureWarning: In the future, the default backend for leiden will be igraph instead of leidenalg.\n", - "\n", - " To achieve the future defaults please pass: flavor=\"igraph\" and n_iterations=2. directed must also be False to work with igraph's implementation.\n", - " sc.tl.leiden(\n", - "[I 2025-11-09 12:31:19,651] Trial 0 finished with value: -1.0 and parameters: {'cluster_resolution': 0.34704506427437753, 'n_neighbors': 75, 'cluster_method': 'leiden', 'neighbor_distance_metric': 'manhattan'}. Best is trial 0 with value: -1.0.\n", - "[I 2025-11-09 12:31:30,259] Trial 1 finished with value: -0.00138083707769107 and parameters: {'cluster_resolution': 1.2885634587473769, 'n_neighbors': 14, 'cluster_method': 'leiden', 'neighbor_distance_metric': 'manhattan'}. Best is trial 1 with value: -0.00138083707769107.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " DMSO_heart_9: silhouette=-0.001, params={'cluster_resolution': 1.2885634587473769, 'n_neighbors': 14, 'cluster_method': 'leiden', 'neighbor_distance_metric': 'manhattan'}\n", - " drug_x_heart_9: silhouette=0.006, params={'cluster_resolution': 1.1850800391535194, 'n_neighbors': 30, 'cluster_method': 'leiden', 'neighbor_distance_metric': 'manhattan'}\n", - " TGFRi_heart_9: silhouette=0.004, params={'cluster_resolution': 2.0546310508099337, 'n_neighbors': 57, 'cluster_method': 'leiden', 'neighbor_distance_metric': 'euclidean'}\n", - " DMSO_heart_11: silhouette=0.011, params={'cluster_resolution': 1.808553434505587, 'n_neighbors': 94, 'cluster_method': 'leiden', 'neighbor_distance_metric': 'cosine'}\n", - " drug_x_heart_11: silhouette=0.203, params={'cluster_resolution': 0.26087142409254793, 'n_neighbors': 20, 'cluster_method': 'leiden', 'neighbor_distance_metric': 'cosine'}\n", - " TGFRi_heart_11: silhouette=0.041, params={'cluster_resolution': 0.7747165384593787, 'n_neighbors': 43, 'cluster_method': 'leiden', 'neighbor_distance_metric': 'euclidean'}\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "sys:1: CategoricalRemappingWarning: Local categoricals have different encodings, expensive re-encoding is done to perform this merge operation. Consider using a StringCache or an Enum type if the categories are known in advance\n" + "Number of unique treatments (hearts + treatment): 4\n", + "Cluster labels already exist, skipping clustering optimization.\n" ] } ], @@ -461,7 +422,9 @@ "\n", "# check if the cluster labels already exist; if so just load the labels and skip optimization\n", "# if not run optimization\n", - "cluster_labels_output = (results_dir / \"cfret_pilot_cluster_labels.parquet\").resolve()\n", + "cluster_labels_output = (\n", + " cluster_labels_results_dir / \"cfret_pilot_cluster_labels.parquet\"\n", + ").resolve()\n", "if cluster_labels_output.exists():\n", " print(\"Cluster labels already exist, skipping clustering optimization.\")\n", " cfret_cluster_labels_df = pl.read_parquet(cluster_labels_output)\n", @@ -486,13 +449,23 @@ " cfret_cluster_labels_df.write_parquet(cluster_labels_output)\n", "\n", " # write best params as a json file\n", - " with open(results_dir / \"cfret_pilot_best_clustering_params.json\", \"w\") as f:\n", + " with open(\n", + " cluster_labels_results_dir / \"cfret_pilot_best_clustering_params.json\", \"w\"\n", + " ) as f:\n", " json.dump(cfret_best_params, f, indent=4)" ] }, + { + "cell_type": "markdown", + "id": "14570634", + "metadata": {}, + "source": [ + "This section measures the phenotypic distance between each treatment and the reference control (DMSO_heart_11) using the on and off signatures. The phenotypic scores are then used to rank treatments and identify top-ranking compounds based on their morphological activity." + ] + }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "id": "6ef0c6e0", "metadata": {}, "outputs": [], @@ -511,15 +484,25 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "id": "dc6be2d0", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Treatment phenotypic distance scores already exist, skipping this step.\n" + ] + } + ], "source": [ "# setting output paths\n", "treatment_dist_scores_outpath = (\n", - " results_dir / \"treatment_phenotypic_scores.csv\"\n", + " phenotypic_scores_results_dir / \"treatment_phenotypic_scores.csv\"\n", ").resolve()\n", + "\n", + "# calculate phenotypic distance scores\n", "if treatment_dist_scores_outpath.exists():\n", " print(\"Treatment phenotypic distance scores already exist, skipping this step.\")\n", " treatment_heart_dist_scores = pl.read_csv(treatment_dist_scores_outpath)\n", @@ -539,74 +522,25 @@ }, { "cell_type": "code", - "execution_count": 12, - "id": "d65cd245", + "execution_count": 11, + "id": "83bfbb82", "metadata": {}, "outputs": [ { - "data": { - "text/html": [ - "
\n", - "shape: (564, 6)
ref_clustertreatmenttrt_clusteron_distoff_distexp_cluster_ratio
strstrstrf64f64f64
"DMSO_heart_11_leiden_3""TGFRi_heart_9""TGFRi_heart_9_leiden_1"29.88296511.2526780.091077
"DMSO_heart_11_leiden_6""TGFRi_heart_9""TGFRi_heart_9_leiden_1"25.3105111.2269790.091077
"DMSO_heart_11_leiden_1""TGFRi_heart_9""TGFRi_heart_9_leiden_1"30.73077211.389310.091077
"DMSO_heart_11_leiden_7""TGFRi_heart_9""TGFRi_heart_9_leiden_1"28.19400810.7659770.091077
"DMSO_heart_11_leiden_10""TGFRi_heart_9""TGFRi_heart_9_leiden_1"28.50708411.2781460.091077
"DMSO_heart_11_leiden_11""DMSO_heart_9""DMSO_heart_9_leiden_11"34.03375912.9210120.022506
"DMSO_heart_11_leiden_5""DMSO_heart_9""DMSO_heart_9_leiden_11"33.11779312.0168390.022506
"DMSO_heart_11_leiden_9""DMSO_heart_9""DMSO_heart_9_leiden_11"39.26314712.847640.022506
"DMSO_heart_11_leiden_8""DMSO_heart_9""DMSO_heart_9_leiden_11"33.9826111.5037930.022506
"DMSO_heart_11_leiden_2""DMSO_heart_9""DMSO_heart_9_leiden_11"36.37054111.7612030.022506
" - ], - "text/plain": [ - "shape: (564, 6)\n", - "┌───────────────────┬───────────────┬───────────────────┬───────────┬───────────┬──────────────────┐\n", - "│ ref_cluster ┆ treatment ┆ trt_cluster ┆ on_dist ┆ off_dist ┆ exp_cluster_rati │\n", - "│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ o │\n", - "│ str ┆ str ┆ str ┆ f64 ┆ f64 ┆ --- │\n", - "│ ┆ ┆ ┆ ┆ ┆ f64 │\n", - "╞═══════════════════╪═══════════════╪═══════════════════╪═══════════╪═══════════╪══════════════════╡\n", - "│ DMSO_heart_11_lei ┆ TGFRi_heart_9 ┆ TGFRi_heart_9_lei ┆ 29.882965 ┆ 11.252678 ┆ 0.091077 │\n", - "│ den_3 ┆ ┆ den_1 ┆ ┆ ┆ │\n", - "│ DMSO_heart_11_lei ┆ TGFRi_heart_9 ┆ TGFRi_heart_9_lei ┆ 25.31051 ┆ 11.226979 ┆ 0.091077 │\n", - "│ den_6 ┆ ┆ den_1 ┆ ┆ ┆ │\n", - "│ DMSO_heart_11_lei ┆ TGFRi_heart_9 ┆ TGFRi_heart_9_lei ┆ 30.730772 ┆ 11.38931 ┆ 0.091077 │\n", - "│ den_1 ┆ ┆ den_1 ┆ ┆ ┆ │\n", - "│ DMSO_heart_11_lei ┆ TGFRi_heart_9 ┆ TGFRi_heart_9_lei ┆ 28.194008 ┆ 10.765977 ┆ 0.091077 │\n", - "│ den_7 ┆ ┆ den_1 ┆ ┆ ┆ │\n", - "│ DMSO_heart_11_lei ┆ TGFRi_heart_9 ┆ TGFRi_heart_9_lei ┆ 28.507084 ┆ 11.278146 ┆ 0.091077 │\n", - "│ den_10 ┆ ┆ den_1 ┆ ┆ ┆ │\n", - "│ … ┆ … ┆ … ┆ … ┆ … ┆ … │\n", - "│ DMSO_heart_11_lei ┆ DMSO_heart_9 ┆ DMSO_heart_9_leid ┆ 34.033759 ┆ 12.921012 ┆ 0.022506 │\n", - "│ den_11 ┆ ┆ en_11 ┆ ┆ ┆ │\n", - "│ DMSO_heart_11_lei ┆ DMSO_heart_9 ┆ DMSO_heart_9_leid ┆ 33.117793 ┆ 12.016839 ┆ 0.022506 │\n", - "│ den_5 ┆ ┆ en_11 ┆ ┆ ┆ │\n", - "│ DMSO_heart_11_lei ┆ DMSO_heart_9 ┆ DMSO_heart_9_leid ┆ 39.263147 ┆ 12.84764 ┆ 0.022506 │\n", - "│ den_9 ┆ ┆ en_11 ┆ ┆ ┆ │\n", - "│ DMSO_heart_11_lei ┆ DMSO_heart_9 ┆ DMSO_heart_9_leid ┆ 33.98261 ┆ 11.503793 ┆ 0.022506 │\n", - "│ den_8 ┆ ┆ en_11 ┆ ┆ ┆ │\n", - "│ DMSO_heart_11_lei ┆ DMSO_heart_9 ┆ DMSO_heart_9_leid ┆ 36.370541 ┆ 11.761203 ┆ 0.022506 │\n", - "│ den_2 ┆ ┆ en_11 ┆ ┆ ┆ │\n", - "└───────────────────┴───────────────┴───────────────────┴───────────┴───────────┴──────────────────┘" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" + "name": "stdout", + "output_type": "stream", + "text": [ + "Treatment heart rankings already exist, skipping this step.\n" + ] } ], "source": [ - "treatment_heart_dist_scores" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "83bfbb82", - "metadata": {}, - "outputs": [], - "source": [ - "# setting outptut paths\n", + "# setting output paths\n", "treatment_heart_rankings_outpath = (\n", - " results_dir / \"treatment_heart_rankings.csv\"\n", + " phenotypic_scores_results_dir / \"treatment_heart_rankings.csv\"\n", ").resolve()\n", + "\n", + "# identify hits based on distance scores\n", "if treatment_heart_rankings_outpath.exists():\n", " print(\"Treatment heart rankings already exist, skipping this step.\")\n", " treatment_heart_rankings = pl.read_csv(treatment_heart_rankings_outpath)\n", diff --git a/notebooks/2.cfret-analysis/2.generate-aggregate-profiles.ipynb b/notebooks/2.cfret-analysis/2.generate-aggregate-profiles.ipynb new file mode 100644 index 0000000..1e2ebe6 --- /dev/null +++ b/notebooks/2.cfret-analysis/2.generate-aggregate-profiles.ipynb @@ -0,0 +1,253 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "e4243db2", + "metadata": {}, + "source": [ + "# 2. Generating Aggregate Profiles\n", + "\n", + "This notebook transforms single-cell morphological profiles from the CFReT pilot dataset into summary representations for downstream analysis. Aggregation reduces noise and enables robust comparisons between experimental conditions by collapsing hundreds or thousands of single-cell measurements into representative profiles.\n", + "\n", + "Two levels of aggregation are generated:\n", + "1. **Replicate-level profiles**: Aggregate cells by well position, heart number, cell type, and treatment to create technical replicate profiles\n", + "2. **Consensus profiles**: Further aggregate replicates by heart type and treatment to generate condition-level consensus signatures\n", + "\n", + "Here we used `pycytominer.aggregate()` to apply median aggregation to generate two profiles explained above. Then output profiles are saved as parquet files." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "4583045e", + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "import pathlib\n", + "\n", + "import polars as pl\n", + "from pycytominer import aggregate\n", + "\n", + "sys.path.append(\"../../\")\n", + "from utils.data_utils import split_meta_and_features" + ] + }, + { + "cell_type": "markdown", + "id": "e9d61ea3", + "metadata": {}, + "source": [ + "Setting input and output paths " + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "cc7b356b", + "metadata": {}, + "outputs": [], + "source": [ + "# setting data path for cfret-pilot dataset\n", + "cfret_profiles_path = pathlib.Path(\n", + " \"../0.download-data/data/sc-profiles/cfret/localhost230405150001_sc_feature_selected.parquet\"\n", + ").resolve(strict=True)\n", + "\n", + "# set results directory path\n", + "results_dir = pathlib.Path(\"./results\").resolve()\n", + "results_dir.mkdir(exist_ok=True)\n", + "\n", + "# make aggregate profile directory\n", + "aggregate_profiles_dir = results_dir / \"aggregate_profiles\"\n", + "aggregate_profiles_dir.mkdir(exist_ok=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "3c81e833", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(15793, 678)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "shape: (5, 678)
Metadata_cell_idMetadata_WellRowMetadata_WellColMetadata_heart_numberMetadata_cell_typeMetadata_heart_failure_typeMetadata_treatmentMetadata_Nuclei_Location_Center_XMetadata_Nuclei_Location_Center_YMetadata_Cells_Location_Center_XMetadata_Cells_Location_Center_YMetadata_Image_Count_CellsMetadata_ImageNumberMetadata_PlateMetadata_WellMetadata_Cells_Number_Object_NumberMetadata_Cytoplasm_Parent_CellsMetadata_Cytoplasm_Parent_NucleiMetadata_Nuclei_Number_Object_NumberMetadata_SiteCytoplasm_AreaShape_BoundingBoxMinimum_XCytoplasm_AreaShape_CompactnessCytoplasm_AreaShape_EccentricityCytoplasm_AreaShape_ExtentCytoplasm_AreaShape_FormFactorCytoplasm_AreaShape_MajorAxisLengthCytoplasm_AreaShape_MeanRadiusCytoplasm_AreaShape_MinorAxisLengthCytoplasm_AreaShape_PerimeterCytoplasm_AreaShape_SolidityCytoplasm_AreaShape_Zernike_0_0Cytoplasm_AreaShape_Zernike_1_1Cytoplasm_AreaShape_Zernike_2_0Cytoplasm_AreaShape_Zernike_2_2Cytoplasm_AreaShape_Zernike_3_1Cytoplasm_AreaShape_Zernike_4_0Cytoplasm_AreaShape_Zernike_4_2Nuclei_Texture_DifferenceVariance_Actin_3_01_256Nuclei_Texture_DifferenceVariance_Mitochondria_3_03_256Nuclei_Texture_DifferenceVariance_PM_3_03_256Nuclei_Texture_InfoMeas1_ER_3_00_256Nuclei_Texture_InfoMeas1_ER_3_01_256Nuclei_Texture_InfoMeas1_ER_3_02_256Nuclei_Texture_InfoMeas1_ER_3_03_256Nuclei_Texture_InfoMeas1_Hoechst_3_00_256Nuclei_Texture_InfoMeas1_Hoechst_3_01_256Nuclei_Texture_InfoMeas1_Hoechst_3_02_256Nuclei_Texture_InfoMeas1_Hoechst_3_03_256Nuclei_Texture_InfoMeas1_Mitochondria_3_00_256Nuclei_Texture_InfoMeas1_Mitochondria_3_01_256Nuclei_Texture_InfoMeas1_Mitochondria_3_02_256Nuclei_Texture_InfoMeas1_Mitochondria_3_03_256Nuclei_Texture_InfoMeas1_PM_3_00_256Nuclei_Texture_InfoMeas1_PM_3_01_256Nuclei_Texture_InfoMeas1_PM_3_02_256Nuclei_Texture_InfoMeas1_PM_3_03_256Nuclei_Texture_InfoMeas2_ER_3_01_256Nuclei_Texture_InfoMeas2_ER_3_03_256Nuclei_Texture_InfoMeas2_Hoechst_3_01_256Nuclei_Texture_InfoMeas2_Hoechst_3_03_256Nuclei_Texture_InfoMeas2_PM_3_01_256Nuclei_Texture_InfoMeas2_PM_3_03_256Nuclei_Texture_InverseDifferenceMoment_Actin_3_02_256Nuclei_Texture_InverseDifferenceMoment_ER_3_01_256Nuclei_Texture_InverseDifferenceMoment_ER_3_03_256Nuclei_Texture_InverseDifferenceMoment_Mitochondria_3_03_256Nuclei_Texture_InverseDifferenceMoment_PM_3_01_256Nuclei_Texture_InverseDifferenceMoment_PM_3_03_256Nuclei_Texture_SumEntropy_PM_3_01_256Nuclei_Texture_SumVariance_ER_3_03_256Nuclei_Texture_SumVariance_Hoechst_3_03_256Nuclei_Texture_SumVariance_Mitochondria_3_01_256Nuclei_Texture_SumVariance_PM_3_01_256Metadata_heart_treatment
strstri64i64strstrstrf64f64f64f64i64i64strstri64i64i64i64strf64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64str
"210e4376efde9d557a5c60029bdda6…"B"29"failing""rejected""DMSO"221.046761137.115493246.6028109.285755401"localhost230405150001""B02"1166"f00"-1.354940.8412290.648883-0.850138-1.0452141.2983580.3761650.9351011.530228-0.983617-0.261031-0.299817-0.7219770.9447250.1610740.5323291.845864-0.0527190.7970950.359081-0.1733360.3000410.217945-0.0397740.4885310.4721640.286590.4643590.5016490.5076231.0766630.741941-0.696022-0.1787620.1867410.1582220.3415950.50487-0.440604-0.4269660.194372-0.0351170.400021-0.619206-0.3934480.9612140.4060680.374039-0.280532-0.158967-0.344804-0.263653-0.305486"failing_DMSO"
"cef18f209640ef8ae98ec110cfdcb6…"B"29"failing""rejected""DMSO"690.596142183.067828716.170091177.132195401"localhost230405150001""B02"2277"f00"0.657107-0.850399-0.5849312.0909251.263259-0.0210311.6279570.944161-0.0855111.4753452.164761-0.6884621.2150151.499086-0.7706671.0127210.6791-0.318777-1.154168-0.664730.1348350.263514-0.1243090.6345170.9685120.8595620.3511440.914468-2.508508-2.389124-1.80698-2.121536-0.231231-0.763949-1.055166-0.2581520.2823190.048807-0.981164-1.07430.6129960.2903390.030854-0.421502-0.61852-0.0509250.4247530.323462-0.096856-0.218001-0.3592972.621455-0.175679"failing_DMSO"
"cca07fa581da808fdefe80f9c0542d…"B"29"failing""rejected""DMSO"626.56149206.923698623.94374199.90644401"localhost230405150001""B02"3388"f00"0.384287-0.7273440.3998130.6995680.778991-0.192578-0.166121-0.185078-0.6205640.3853250.41953-1.35377-0.1890271.88019-0.1988230.778262.084304-0.4372250.0970140.148712-0.1262390.315114-0.682006-0.9529940.5345210.448969-0.5122130.68761-0.333052-1.116806-0.671374-0.0855830.5656590.117809-0.0352320.3400220.3921090.906171-0.637012-0.912759-0.139719-0.319312-0.119514-0.62708-0.2139980.4920220.7834650.531513-0.515924-0.090464-0.381751-0.23489-0.312005"failing_DMSO"
"c076728ed2ebba7c01e6adb4244b02…"B"29"failing""rejected""DMSO"559.448583220.68816528.646623196.955552401"localhost230405150001""B02"4499"f00"-0.08178-0.31057-1.9844630.923396-0.152527-0.4547480.4856720.9781430.0758530.3330351.0367022.124015-0.11271-1.2760170.6634991.351768-2.07981-0.1802730.1544550.355861-0.2851380.187411-0.401472-1.3237160.2164790.6944550.223340.272893-1.610123-1.983535-1.990444-1.759351-0.667021-1.511134-1.70973-1.0256080.389880.970785-0.723812-0.2404651.028610.8178750.731123-0.4102790.0669510.2339850.6976680.38680.216837-0.078625-0.345897-0.148249-0.205381"failing_DMSO"
"2e8f5f11d29d8f82baa39f573e2e51…"B"29"failing""rejected""DMSO"909.019946247.69434897.965996253.621836401"localhost230405150001""B02"551010"f00"1.384627-0.2368570.651571-0.525561-0.256208-0.352022-0.51073-0.650514-0.61187-0.390602-0.9156440.274757-0.807468-0.2639141.0128770.3330810.457026-0.235359-0.8743221.0367520.5603280.0870480.5009351.0246880.6823560.703425-0.5599190.535412-0.446346-0.250839-0.325067-0.2207810.135176-0.068065-1.328074-0.471597-0.313553-1.011855-0.921082-0.718369-0.17010.0766690.1510630.784110.796587-0.8330350.9717810.96971-0.859995-0.437968-0.3754270.054053-0.346036"failing_DMSO"
" + ], + "text/plain": [ + "shape: (5, 678)\n", + "┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐\n", + "│ Metadata_ ┆ Metadata_ ┆ Metadata_ ┆ Metadata_ ┆ … ┆ Nuclei_Te ┆ Nuclei_Te ┆ Nuclei_Te ┆ Metadata │\n", + "│ cell_id ┆ WellRow ┆ WellCol ┆ heart_num ┆ ┆ xture_Sum ┆ xture_Sum ┆ xture_Sum ┆ _heart_t │\n", + "│ --- ┆ --- ┆ --- ┆ ber ┆ ┆ Variance_ ┆ Variance_ ┆ Variance_ ┆ reatment │\n", + "│ str ┆ str ┆ i64 ┆ --- ┆ ┆ Hoe… ┆ Mit… ┆ PM_… ┆ --- │\n", + "│ ┆ ┆ ┆ i64 ┆ ┆ --- ┆ --- ┆ --- ┆ str │\n", + "│ ┆ ┆ ┆ ┆ ┆ f64 ┆ f64 ┆ f64 ┆ │\n", + "╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡\n", + "│ 210e4376e ┆ B ┆ 2 ┆ 9 ┆ … ┆ -0.344804 ┆ -0.263653 ┆ -0.305486 ┆ failing_ │\n", + "│ fde9d557a ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ DMSO │\n", + "│ 5c60029bd ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ da6… ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ cef18f209 ┆ B ┆ 2 ┆ 9 ┆ … ┆ -0.359297 ┆ 2.621455 ┆ -0.175679 ┆ failing_ │\n", + "│ 640ef8ae9 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ DMSO │\n", + "│ 8ec110cfd ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ cb6… ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ cca07fa58 ┆ B ┆ 2 ┆ 9 ┆ … ┆ -0.381751 ┆ -0.23489 ┆ -0.312005 ┆ failing_ │\n", + "│ 1da808fde ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ DMSO │\n", + "│ fe80f9c05 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ 42d… ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ c076728ed ┆ B ┆ 2 ┆ 9 ┆ … ┆ -0.345897 ┆ -0.148249 ┆ -0.205381 ┆ failing_ │\n", + "│ 2ebba7c01 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ DMSO │\n", + "│ e6adb4244 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ b02… ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ 2e8f5f11d ┆ B ┆ 2 ┆ 9 ┆ … ┆ -0.375427 ┆ 0.054053 ┆ -0.346036 ┆ failing_ │\n", + "│ 29d8f82ba ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ DMSO │\n", + "│ a39f573e2 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ e51… ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "└───────────┴───────────┴───────────┴───────────┴───┴───────────┴───────────┴───────────┴──────────┘" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# load in the cfret-pilot dataset\n", + "cfret_df = pl.read_parquet(cfret_profiles_path)\n", + "\n", + "# add a column that indicates the heart and treatment added\n", + "cfret_df = cfret_df.with_columns(\n", + " pl.concat_str(\n", + " [\n", + " pl.col(\"Metadata_cell_type\"),\n", + " pl.col(\"Metadata_treatment\"),\n", + " ],\n", + " separator=\"_\",\n", + " ).alias(\"Metadata_heart_treatment\")\n", + ")\n", + "\n", + "# split feature space\n", + "cfret_meta, cfret_feats = split_meta_and_features(cfret_df)\n", + "\n", + "# display\n", + "print(cfret_df.shape)\n", + "cfret_df.head()" + ] + }, + { + "cell_type": "markdown", + "id": "3e32377a", + "metadata": {}, + "source": [ + "Generating aggregate profiles at the replicate level " + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "5524105f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "PosixPath('/home/erikserrano/Projects/buscar/notebooks/2.cfret-analysis/results/aggregate_profiles/cfret_replicate_profiles.parquet')" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "aggregate(\n", + " population_df=cfret_df.to_pandas(),\n", + " strata=[\n", + " \"Metadata_heart_treatment\",\n", + " \"Metadata_WellRow\",\n", + " \"Metadata_WellCol\",\n", + " \"Metadata_heart_number\",\n", + " \"Metadata_cell_type\",\n", + " \"Metadata_treatment\",\n", + " ],\n", + " features=cfret_feats,\n", + " operation=\"median\",\n", + " output_type=\"parquet\",\n", + " output_file=(aggregate_profiles_dir / \"cfret_replicate_profiles.parquet\").resolve(),\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "399f6d2e", + "metadata": {}, + "source": [ + "Generating consensus profiles of of the treatment and heart type " + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "ef7d7c6a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "PosixPath('/home/erikserrano/Projects/buscar/notebooks/2.cfret-analysis/results/aggregate_profiles/cfret_consensus_profiles.parquet')" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# aggregating profiles by heart and treatment\n", + "aggregate(\n", + " population_df=cfret_df.to_pandas(),\n", + " strata=[\"Metadata_heart_treatment\", \"Metadata_cell_type\", \"Metadata_treatment\"],\n", + " features=cfret_feats,\n", + " operation=\"median\",\n", + " output_type=\"parquet\",\n", + " output_file=(aggregate_profiles_dir / \"cfret_consensus_profiles.parquet\").resolve(),\n", + ")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "buscar", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/notebooks/2.cfret-analysis/3.generate-centroid.ipynb b/notebooks/2.cfret-analysis/3.generate-centroid.ipynb new file mode 100644 index 0000000..4abf4bc --- /dev/null +++ b/notebooks/2.cfret-analysis/3.generate-centroid.ipynb @@ -0,0 +1,356 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "21cce3f3", + "metadata": {}, + "source": [ + "## 3 Generating centroid profiles\n", + "In this notebook, we identify the centroid for each cluster found in the single-cell profiles after running the buscar clustering module.\n", + "\n", + "The centroid is the representative cell that best captures the distribution of cells within a cluster." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "f5f30dda", + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "import pathlib\n", + "\n", + "import polars as pl\n", + "from pycytominer import aggregate\n", + "\n", + "sys.path.append(\"../../\")\n", + "from utils.data_utils import split_meta_and_features" + ] + }, + { + "cell_type": "markdown", + "id": "d7ce927b", + "metadata": {}, + "source": [ + "Setting input and output paths" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "90cd4cde", + "metadata": {}, + "outputs": [], + "source": [ + "# setting data path for cfret-pilot dataset\n", + "cfret_profiles_path = pathlib.Path(\n", + " \"../0.download-data/data/sc-profiles/cfret/localhost230405150001_sc_feature_selected.parquet\"\n", + ").resolve(strict=True)\n", + "\n", + "# setting cluster labels path\n", + "cluster_labels_path = pathlib.Path(\n", + " \"./results/clusters/cfret_pilot_cluster_labels.parquet\"\n", + ").resolve(strict=True)\n", + "\n", + "# setting outpaths for results\n", + "results_dir = pathlib.Path(\"./results\").resolve()\n", + "results_dir.mkdir(exist_ok=True)\n", + "\n", + "# setting outpath for centroids\n", + "centroids_dir = (results_dir / \"centroids\").resolve()\n", + "centroids_dir.mkdir(exist_ok=True)" + ] + }, + { + "cell_type": "markdown", + "id": "947cc2f0", + "metadata": {}, + "source": [ + "Loading profiles" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "65c89040", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(15793, 682)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "shape: (5, 682)
Metadata_cell_idMetadata_WellRowMetadata_WellColMetadata_heart_numberMetadata_cell_typeMetadata_heart_failure_typeMetadata_treatmentMetadata_Nuclei_Location_Center_XMetadata_Nuclei_Location_Center_YMetadata_Cells_Location_Center_XMetadata_Cells_Location_Center_YMetadata_Image_Count_CellsMetadata_ImageNumberMetadata_PlateMetadata_WellMetadata_Cells_Number_Object_NumberMetadata_Cytoplasm_Parent_CellsMetadata_Cytoplasm_Parent_NucleiMetadata_Nuclei_Number_Object_NumberMetadata_SiteMetadata_cluster_idMetadata_cluster_n_cellsMetadata_treatment_n_cellsMetadata_cluster_ratioMetadata_heart_treatmentCytoplasm_AreaShape_BoundingBoxMinimum_XCytoplasm_AreaShape_CompactnessCytoplasm_AreaShape_EccentricityCytoplasm_AreaShape_ExtentCytoplasm_AreaShape_FormFactorCytoplasm_AreaShape_MajorAxisLengthCytoplasm_AreaShape_MeanRadiusCytoplasm_AreaShape_MinorAxisLengthCytoplasm_AreaShape_PerimeterCytoplasm_AreaShape_SolidityCytoplasm_AreaShape_Zernike_0_0Cytoplasm_AreaShape_Zernike_1_1Nuclei_Texture_DifferenceEntropy_PM_3_00_256Nuclei_Texture_DifferenceVariance_Actin_3_01_256Nuclei_Texture_DifferenceVariance_Mitochondria_3_03_256Nuclei_Texture_DifferenceVariance_PM_3_03_256Nuclei_Texture_InfoMeas1_ER_3_00_256Nuclei_Texture_InfoMeas1_ER_3_01_256Nuclei_Texture_InfoMeas1_ER_3_02_256Nuclei_Texture_InfoMeas1_ER_3_03_256Nuclei_Texture_InfoMeas1_Hoechst_3_00_256Nuclei_Texture_InfoMeas1_Hoechst_3_01_256Nuclei_Texture_InfoMeas1_Hoechst_3_02_256Nuclei_Texture_InfoMeas1_Hoechst_3_03_256Nuclei_Texture_InfoMeas1_Mitochondria_3_00_256Nuclei_Texture_InfoMeas1_Mitochondria_3_01_256Nuclei_Texture_InfoMeas1_Mitochondria_3_02_256Nuclei_Texture_InfoMeas1_Mitochondria_3_03_256Nuclei_Texture_InfoMeas1_PM_3_00_256Nuclei_Texture_InfoMeas1_PM_3_01_256Nuclei_Texture_InfoMeas1_PM_3_02_256Nuclei_Texture_InfoMeas1_PM_3_03_256Nuclei_Texture_InfoMeas2_ER_3_01_256Nuclei_Texture_InfoMeas2_ER_3_03_256Nuclei_Texture_InfoMeas2_Hoechst_3_01_256Nuclei_Texture_InfoMeas2_Hoechst_3_03_256Nuclei_Texture_InfoMeas2_PM_3_01_256Nuclei_Texture_InfoMeas2_PM_3_03_256Nuclei_Texture_InverseDifferenceMoment_Actin_3_02_256Nuclei_Texture_InverseDifferenceMoment_ER_3_01_256Nuclei_Texture_InverseDifferenceMoment_ER_3_03_256Nuclei_Texture_InverseDifferenceMoment_Mitochondria_3_03_256Nuclei_Texture_InverseDifferenceMoment_PM_3_01_256Nuclei_Texture_InverseDifferenceMoment_PM_3_03_256Nuclei_Texture_SumEntropy_PM_3_01_256Nuclei_Texture_SumVariance_ER_3_03_256Nuclei_Texture_SumVariance_Hoechst_3_03_256Nuclei_Texture_SumVariance_Mitochondria_3_01_256Nuclei_Texture_SumVariance_PM_3_01_256
strstri64i64strstrstrf64f64f64f64i64i64strstri64i64i64i64strcatu32u32f64strf64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64
"210e4376efde9d557a5c60029bdda6…"B"29"failing""rejected""DMSO"221.046761137.115493246.6028109.285755401"localhost230405150001""B02"1166"f00""DMSO_heart_9_louvain_0"914291530.998798"failing_DMSO"-1.354940.8412290.648883-0.850138-1.0452141.2983580.3761650.9351011.530228-0.983617-0.261031-0.299817-0.740763-0.0527190.7970950.359081-0.1733360.3000410.217945-0.0397740.4885310.4721640.286590.4643590.5016490.5076231.0766630.741941-0.696022-0.1787620.1867410.1582220.3415950.50487-0.440604-0.4269660.194372-0.0351170.400021-0.619206-0.3934480.9612140.4060680.374039-0.280532-0.158967-0.344804-0.263653-0.305486
"cef18f209640ef8ae98ec110cfdcb6…"B"29"failing""rejected""DMSO"690.596142183.067828716.170091177.132195401"localhost230405150001""B02"2277"f00""DMSO_heart_9_louvain_0"914291530.998798"failing_DMSO"0.657107-0.850399-0.5849312.0909251.263259-0.0210311.6279570.944161-0.0855111.4753452.164761-0.6884620.037684-0.318777-1.154168-0.664730.1348350.263514-0.1243090.6345170.9685120.8595620.3511440.914468-2.508508-2.389124-1.80698-2.121536-0.231231-0.763949-1.055166-0.2581520.2823190.048807-0.981164-1.07430.6129960.2903390.030854-0.421502-0.61852-0.0509250.4247530.323462-0.096856-0.218001-0.3592972.621455-0.175679
"cca07fa581da808fdefe80f9c0542d…"B"29"failing""rejected""DMSO"626.56149206.923698623.94374199.90644401"localhost230405150001""B02"3388"f00""DMSO_heart_9_louvain_0"914291530.998798"failing_DMSO"0.384287-0.7273440.3998130.6995680.778991-0.192578-0.166121-0.185078-0.6205640.3853250.41953-1.35377-0.439591-0.4372250.0970140.148712-0.1262390.315114-0.682006-0.9529940.5345210.448969-0.5122130.68761-0.333052-1.116806-0.671374-0.0855830.5656590.117809-0.0352320.3400220.3921090.906171-0.637012-0.912759-0.139719-0.319312-0.119514-0.62708-0.2139980.4920220.7834650.531513-0.515924-0.090464-0.381751-0.23489-0.312005
"c076728ed2ebba7c01e6adb4244b02…"B"29"failing""rejected""DMSO"559.448583220.68816528.646623196.955552401"localhost230405150001""B02"4499"f00""DMSO_heart_9_louvain_0"914291530.998798"failing_DMSO"-0.08178-0.31057-1.9844630.923396-0.152527-0.4547480.4856720.9781430.0758530.3330351.0367022.124015-0.370192-0.1802730.1544550.355861-0.2851380.187411-0.401472-1.3237160.2164790.6944550.223340.272893-1.610123-1.983535-1.990444-1.759351-0.667021-1.511134-1.70973-1.0256080.389880.970785-0.723812-0.2404651.028610.8178750.731123-0.4102790.0669510.2339850.6976680.38680.216837-0.078625-0.345897-0.148249-0.205381
"2e8f5f11d29d8f82baa39f573e2e51…"B"29"failing""rejected""DMSO"909.019946247.69434897.965996253.621836401"localhost230405150001""B02"551010"f00""DMSO_heart_9_louvain_0"914291530.998798"failing_DMSO"1.384627-0.2368570.651571-0.525561-0.256208-0.352022-0.51073-0.650514-0.61187-0.390602-0.9156440.274757-0.784562-0.235359-0.8743221.0367520.5603280.0870480.5009351.0246880.6823560.703425-0.5599190.535412-0.446346-0.250839-0.325067-0.2207810.135176-0.068065-1.328074-0.471597-0.313553-1.011855-0.921082-0.718369-0.17010.0766690.1510630.784110.796587-0.8330350.9717810.96971-0.859995-0.437968-0.3754270.054053-0.346036
" + ], + "text/plain": [ + "shape: (5, 682)\n", + "┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐\n", + "│ Metadata_ ┆ Metadata_ ┆ Metadata_ ┆ Metadata_ ┆ … ┆ Nuclei_Te ┆ Nuclei_Te ┆ Nuclei_Te ┆ Nuclei_T │\n", + "│ cell_id ┆ WellRow ┆ WellCol ┆ heart_num ┆ ┆ xture_Sum ┆ xture_Sum ┆ xture_Sum ┆ exture_S │\n", + "│ --- ┆ --- ┆ --- ┆ ber ┆ ┆ Variance_ ┆ Variance_ ┆ Variance_ ┆ umVarian │\n", + "│ str ┆ str ┆ i64 ┆ --- ┆ ┆ ER_… ┆ Hoe… ┆ Mit… ┆ ce_PM_… │\n", + "│ ┆ ┆ ┆ i64 ┆ ┆ --- ┆ --- ┆ --- ┆ --- │\n", + "│ ┆ ┆ ┆ ┆ ┆ f64 ┆ f64 ┆ f64 ┆ f64 │\n", + "╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡\n", + "│ 210e4376e ┆ B ┆ 2 ┆ 9 ┆ … ┆ -0.158967 ┆ -0.344804 ┆ -0.263653 ┆ -0.30548 │\n", + "│ fde9d557a ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 6 │\n", + "│ 5c60029bd ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ da6… ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ cef18f209 ┆ B ┆ 2 ┆ 9 ┆ … ┆ -0.218001 ┆ -0.359297 ┆ 2.621455 ┆ -0.17567 │\n", + "│ 640ef8ae9 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 9 │\n", + "│ 8ec110cfd ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ cb6… ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ cca07fa58 ┆ B ┆ 2 ┆ 9 ┆ … ┆ -0.090464 ┆ -0.381751 ┆ -0.23489 ┆ -0.31200 │\n", + "│ 1da808fde ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 5 │\n", + "│ fe80f9c05 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ 42d… ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ c076728ed ┆ B ┆ 2 ┆ 9 ┆ … ┆ -0.078625 ┆ -0.345897 ┆ -0.148249 ┆ -0.20538 │\n", + "│ 2ebba7c01 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 1 │\n", + "│ e6adb4244 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ b02… ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ 2e8f5f11d ┆ B ┆ 2 ┆ 9 ┆ … ┆ -0.437968 ┆ -0.375427 ┆ 0.054053 ┆ -0.34603 │\n", + "│ 29d8f82ba ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 6 │\n", + "│ a39f573e2 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ e51… ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "└───────────┴───────────┴───────────┴───────────┴───┴───────────┴───────────┴───────────┴──────────┘" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# loading in profiles and add cluster labels to profiles dataframe\n", + "cfret_df = pl.read_parquet(cfret_profiles_path).join(\n", + " pl.read_parquet(cluster_labels_path), on=\"Metadata_cell_id\", how=\"inner\"\n", + ")\n", + "\n", + "# add a column that indicates the heart and treatment added\n", + "cfret_df = cfret_df.with_columns(\n", + " pl.concat_str(\n", + " [\n", + " pl.col(\"Metadata_cell_type\"),\n", + " pl.col(\"Metadata_treatment\"),\n", + " ],\n", + " separator=\"_\",\n", + " ).alias(\"Metadata_heart_treatment\")\n", + ")\n", + "\n", + "# split feature space\n", + "cfret_meta, cfret_feats = split_meta_and_features(cfret_df)\n", + "\n", + "# display\n", + "print(cfret_df.shape)\n", + "\n", + "\n", + "cfret_df.select(cfret_meta + cfret_feats).head()" + ] + }, + { + "cell_type": "markdown", + "id": "035e6b11", + "metadata": {}, + "source": [ + "We use **median aggregation** to generate centroid profiles for each cluster. For each cluster, we calculate the component-wise median across all cells to create a synthetic representative profile that captures the central tendency. This approach is robust to outliers, consistent with replicate and consensus profile generation workflow, and works well for high-dimensional morphological features." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "94ec613d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Total cells: 15793\n", + "Number of features: 657\n", + "Unique clusters: 9\n" + ] + } + ], + "source": [ + "# split metadata and features\n", + "cfret_meta, cfret_feats = split_meta_and_features(cfret_df)\n", + "\n", + "print(f\"Total cells: {len(cfret_df)}\")\n", + "print(f\"Number of features: {len(cfret_feats)}\")\n", + "print(f\"Unique clusters: {cfret_df['Metadata_cluster_id'].n_unique()}\")" + ] + }, + { + "cell_type": "markdown", + "id": "94be9962", + "metadata": {}, + "source": [ + "Save centroid profiles" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "09599fbc", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/erikserrano/Software/miniconda3/envs/buscar/lib/python3.12/site-packages/pycytominer/aggregate.py:108: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.\n", + " population_df = population_df.groupby(strata, dropna=False) # type: ignore[assignment]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Total centroids generated: 9\n", + "Centroid shape: (9, 658)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "shape: (9, 658)
Metadata_cluster_idCytoplasm_AreaShape_BoundingBoxMinimum_XCytoplasm_AreaShape_CompactnessCytoplasm_AreaShape_EccentricityCytoplasm_AreaShape_ExtentCytoplasm_AreaShape_FormFactorCytoplasm_AreaShape_MajorAxisLengthCytoplasm_AreaShape_MeanRadiusCytoplasm_AreaShape_MinorAxisLengthCytoplasm_AreaShape_PerimeterCytoplasm_AreaShape_SolidityCytoplasm_AreaShape_Zernike_0_0Cytoplasm_AreaShape_Zernike_1_1Cytoplasm_AreaShape_Zernike_2_0Cytoplasm_AreaShape_Zernike_2_2Cytoplasm_AreaShape_Zernike_3_1Cytoplasm_AreaShape_Zernike_4_0Cytoplasm_AreaShape_Zernike_4_2Cytoplasm_AreaShape_Zernike_5_1Cytoplasm_AreaShape_Zernike_5_3Cytoplasm_AreaShape_Zernike_6_0Cytoplasm_AreaShape_Zernike_6_2Cytoplasm_AreaShape_Zernike_6_4Cytoplasm_AreaShape_Zernike_7_1Cytoplasm_AreaShape_Zernike_7_3Cytoplasm_AreaShape_Zernike_7_5Cytoplasm_AreaShape_Zernike_8_0Cytoplasm_AreaShape_Zernike_8_2Cytoplasm_AreaShape_Zernike_8_4Cytoplasm_AreaShape_Zernike_8_6Cytoplasm_AreaShape_Zernike_9_1Cytoplasm_AreaShape_Zernike_9_3Cytoplasm_AreaShape_Zernike_9_5Cytoplasm_AreaShape_Zernike_9_7Cytoplasm_Correlation_Correlation_Actin_HoechstCytoplasm_Correlation_Correlation_Actin_MitochondriaCytoplasm_Correlation_Correlation_Actin_PMNuclei_Texture_DifferenceEntropy_PM_3_00_256Nuclei_Texture_DifferenceVariance_Actin_3_01_256Nuclei_Texture_DifferenceVariance_Mitochondria_3_03_256Nuclei_Texture_DifferenceVariance_PM_3_03_256Nuclei_Texture_InfoMeas1_ER_3_00_256Nuclei_Texture_InfoMeas1_ER_3_01_256Nuclei_Texture_InfoMeas1_ER_3_02_256Nuclei_Texture_InfoMeas1_ER_3_03_256Nuclei_Texture_InfoMeas1_Hoechst_3_00_256Nuclei_Texture_InfoMeas1_Hoechst_3_01_256Nuclei_Texture_InfoMeas1_Hoechst_3_02_256Nuclei_Texture_InfoMeas1_Hoechst_3_03_256Nuclei_Texture_InfoMeas1_Mitochondria_3_00_256Nuclei_Texture_InfoMeas1_Mitochondria_3_01_256Nuclei_Texture_InfoMeas1_Mitochondria_3_02_256Nuclei_Texture_InfoMeas1_Mitochondria_3_03_256Nuclei_Texture_InfoMeas1_PM_3_00_256Nuclei_Texture_InfoMeas1_PM_3_01_256Nuclei_Texture_InfoMeas1_PM_3_02_256Nuclei_Texture_InfoMeas1_PM_3_03_256Nuclei_Texture_InfoMeas2_ER_3_01_256Nuclei_Texture_InfoMeas2_ER_3_03_256Nuclei_Texture_InfoMeas2_Hoechst_3_01_256Nuclei_Texture_InfoMeas2_Hoechst_3_03_256Nuclei_Texture_InfoMeas2_PM_3_01_256Nuclei_Texture_InfoMeas2_PM_3_03_256Nuclei_Texture_InverseDifferenceMoment_Actin_3_02_256Nuclei_Texture_InverseDifferenceMoment_ER_3_01_256Nuclei_Texture_InverseDifferenceMoment_ER_3_03_256Nuclei_Texture_InverseDifferenceMoment_Mitochondria_3_03_256Nuclei_Texture_InverseDifferenceMoment_PM_3_01_256Nuclei_Texture_InverseDifferenceMoment_PM_3_03_256Nuclei_Texture_SumEntropy_PM_3_01_256Nuclei_Texture_SumVariance_ER_3_03_256Nuclei_Texture_SumVariance_Hoechst_3_03_256Nuclei_Texture_SumVariance_Mitochondria_3_01_256Nuclei_Texture_SumVariance_PM_3_01_256
catf64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64
"DMSO_heart_9_louvain_0"-0.017365-0.3759480.0288750.200863-0.04977-0.2283980.0468750.063641-0.2517730.2022990.2163030.0328260.183010.037060.0231-0.16592-0.0071580.028768-0.048361-0.142402-0.033751-0.093717-0.005448-0.063685-0.063426-0.149138-0.033556-0.129697-0.116366-0.05216-0.084436-0.103767-0.092716-0.065370.4821940.064856-0.02033-0.471746-0.436971-0.282468-0.135781-0.088347-0.131908-0.0598390.1845210.3217210.1786240.3202-0.084185-0.041163-0.064363-0.021267-0.345281-0.249803-0.31299-0.2328830.4930850.481681-0.209778-0.2119740.4423420.437698-0.508997-0.369874-0.374387-0.1431250.1061850.0913770.125246-0.14294-0.321629-0.179205-0.172281
"DMSO_heart_9_louvain_1"0.3956542.378332-0.985671-2.826533-1.423156-1.207654-2.215587-0.621809-1.44509-3.881097-2.090087-1.561002-1.840888-1.807643-1.559165-1.006071-0.817123-1.440496-1.233745-0.89177-1.22465-1.198378-1.218086-1.310631-1.194103-0.893521-1.153585-0.154594-1.371785-1.10184-0.817372-0.981738-0.956727-2.3099490.034426-2.595523-0.081897-0.2634180.51143-0.438628-0.221524-0.549879-0.411148-0.03664-0.0807660.043208-0.553022-0.1026190.4575550.1369550.2923890.493841-1.226464-1.13966-1.176091-0.8386150.4749930.24530.1864220.0988951.0882970.9683010.3411020.084556-0.049190.6482680.284350.2519950.241521-0.231621-0.320909-0.257407-0.016748
"TGFRi_heart_9_leiden_0"0.05084-0.5127120.1220430.214090.206682-0.534524-0.363768-0.438155-0.5909740.3059950.1849020.0227150.1011760.0631080.007036-0.166722-0.0378730.000546-0.074703-0.159134-0.037196-0.107186-0.073931-0.043619-0.065998-0.190028-0.022159-0.039249-0.094816-0.06538-0.027334-0.023891-0.1214670.1116650.0734030.1912010.119815-0.400729-0.582538-0.489821-0.0434470.000504-0.038286-0.0006530.014190.038980.0156320.066566-0.094978-0.032138-0.027835-0.0121330.0431690.0614520.0837770.1106050.3699340.3667130.2090910.1976810.2800640.272913-0.137388-0.20213-0.220531-0.354877-0.222919-0.2158130.213626-0.224205-0.255745-0.15432-0.180122
"TGFRi_heart_9_leiden_1"0.069786-0.469497-0.1316150.1757950.118696-0.845099-0.725856-0.570358-0.8369880.0560.246278-0.048875-0.3307480.230854-0.0176780.3728950.0046170.342574-0.0636960.0960230.2436250.0271360.0497190.040431-0.057737-0.0507850.0914170.025771-0.0673950.0055610.1617930.046476-0.1601860.1248130.0487120.1286581.17069-0.473631-0.984541-1.258119-0.189699-0.127408-0.101198-0.195698-1.802566-1.904054-1.782106-1.955272-0.416038-0.266277-0.287373-0.351814-0.401621-0.40268-0.301852-0.4725520.7148480.767741.7474781.7615031.0294491.058242-0.533105-0.889939-0.853182-1.13172-1.359421-1.3291241.5713690.2189841.4793550.1284190.631976
"TGFRi_heart_9_leiden_2"-0.203034-0.1801820.1780970.186032-0.3271880.3557010.5201990.6798240.8366990.2877730.3164850.0936140.651911-0.134109-0.000078-0.3904550.306842-0.3556610.165521-0.29452-0.282744-0.094311-0.484958-0.589681-0.163747-0.462171-0.155039-0.064892-0.003906-0.194986-0.552613-0.397471-0.0169630.14380.1082940.136585-0.809491-0.380828-0.081690.370777-0.09104-0.220186-0.187921-0.1869191.2035341.1678951.3604821.0942220.043290.140568-0.1430150.0619890.1400470.0279570.186810.0723040.0526840.158956-1.79827-1.617563-0.042937-0.130666-0.1081450.3394220.4100490.2341640.6894650.738672-0.430098-0.318539-0.397507-0.206243-0.323603
"DMSO_heart_11_louvain_0"-0.089359-0.0486280.77182-0.552878-0.4750730.780666-0.039416-0.1007980.300961-0.320947-0.787942-0.462016-0.488334-0.381523-0.378915-0.481662-0.214635-0.450028-0.388782-0.465964-0.414834-0.095099-0.419727-0.344907-0.26114-0.423281-0.337739-0.1496910.007461-0.36725-0.289816-0.204504-0.1958970.107897-0.979987-0.248679-0.7482332.6891320.4384590.8081480.02390.0796220.0021430.1083130.575440.4515010.5360750.4585060.0718660.142473-0.0267390.1605750.6286210.6790680.5764620.661105-0.148037-0.125223-0.252605-0.258211-0.773978-0.7730821.8194760.7276290.7015510.8092750.7117070.681499-0.914648-0.409342-0.294257-0.251505-0.381779
"DMSO_heart_11_louvain_1"-0.185982-0.704415-0.061790.4061810.704153-1.013965-0.709926-0.77931-1.0248920.5609550.3942070.0824760.0344590.2197230.1182810.23893-0.0668120.406346-0.1800880.1647060.241641-0.1254710.1407570.156625-0.145210.1242410.1752120.02932-0.1678710.02120.0890250.059247-0.1126330.998810.3258661.0944470.9078710.00493-0.662954-0.927316-0.128128-0.217484-0.173287-0.425216-0.64718-0.742306-0.754983-0.931694-0.294991-0.292186-0.488872-0.6075450.160981-0.092709-0.061026-0.3066240.8129250.8876891.142671.2168250.6643340.8202880.862601-1.21605-1.147641-0.438408-1.078519-0.8003690.8185340.4524290.200085-0.1093160.069745
"TGFRi_heart_11_louvain_0"-0.08178-0.1336470.832756-0.547976-0.3833680.618722-0.262382-0.3301290.084263-0.241623-0.797622-0.530191-0.451588-0.281318-0.49391-0.427594-0.210232-0.469392-0.350128-0.410353-0.4139050.022236-0.391863-0.292197-0.294512-0.361586-0.331971-0.1889040.06988-0.22934-0.283059-0.250591-0.2653380.238552-0.7093290.001229-0.6972652.7480620.5524210.615608-0.238093-0.176665-0.172067-0.1601160.3459570.2285070.3801410.251612-0.006810.0707880.0772720.1181420.4652190.5116640.528550.5860910.119780.1007350.0627990.075708-0.541789-0.6080471.8305360.5681230.5527620.8019930.5465210.517282-0.75612-0.380266-0.258423-0.254312-0.367058
"TGFRi_heart_11_louvain_1"-0.149985-0.784473-0.1991550.6698080.984705-0.845036-0.318641-0.515862-0.9109990.857370.6908630.1894860.448860.202892-0.0478810.144906-0.0068140.272627-0.142710.0777340.033925-0.1196250.3477160.173774-0.033673-0.0250590.185187-0.089088-0.0464910.190270.297102-0.075725-0.1174520.4860940.0619340.4628640.8761190.227659-0.390094-1.080284-0.041793-0.116102-0.006852-0.014854-0.980267-1.209736-1.012096-1.132649-0.337989-0.408025-0.347322-0.2023610.192599-0.0211310.124581-0.1243790.6874180.6483471.4396431.382720.6506560.7103791.094288-0.880076-0.921128-0.136533-1.218487-1.1871440.9408080.1237050.395675-0.1693750.131136
" + ], + "text/plain": [ + "shape: (9, 658)\n", + "┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐\n", + "│ Metadata_ ┆ Cytoplasm ┆ Cytoplasm ┆ Cytoplasm ┆ … ┆ Nuclei_Te ┆ Nuclei_Te ┆ Nuclei_Te ┆ Nuclei_T │\n", + "│ cluster_i ┆ _AreaShap ┆ _AreaShap ┆ _AreaShap ┆ ┆ xture_Sum ┆ xture_Sum ┆ xture_Sum ┆ exture_S │\n", + "│ d ┆ e_Boundin ┆ e_Compact ┆ e_Eccentr ┆ ┆ Variance_ ┆ Variance_ ┆ Variance_ ┆ umVarian │\n", + "│ --- ┆ gBo… ┆ nes… ┆ ici… ┆ ┆ ER_… ┆ Hoe… ┆ Mit… ┆ ce_PM_… │\n", + "│ cat ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │\n", + "│ ┆ f64 ┆ f64 ┆ f64 ┆ ┆ f64 ┆ f64 ┆ f64 ┆ f64 │\n", + "╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡\n", + "│ DMSO_hear ┆ -0.017365 ┆ -0.375948 ┆ 0.028875 ┆ … ┆ -0.14294 ┆ -0.321629 ┆ -0.179205 ┆ -0.17228 │\n", + "│ t_9_louva ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 1 │\n", + "│ in_0 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ DMSO_hear ┆ 0.395654 ┆ 2.378332 ┆ -0.985671 ┆ … ┆ -0.231621 ┆ -0.320909 ┆ -0.257407 ┆ -0.01674 │\n", + "│ t_9_louva ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 8 │\n", + "│ in_1 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ TGFRi_hea ┆ 0.05084 ┆ -0.512712 ┆ 0.122043 ┆ … ┆ -0.224205 ┆ -0.255745 ┆ -0.15432 ┆ -0.18012 │\n", + "│ rt_9_leid ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 2 │\n", + "│ en_0 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ TGFRi_hea ┆ 0.069786 ┆ -0.469497 ┆ -0.131615 ┆ … ┆ 0.218984 ┆ 1.479355 ┆ 0.128419 ┆ 0.631976 │\n", + "│ rt_9_leid ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ en_1 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ TGFRi_hea ┆ -0.203034 ┆ -0.180182 ┆ 0.178097 ┆ … ┆ -0.318539 ┆ -0.397507 ┆ -0.206243 ┆ -0.32360 │\n", + "│ rt_9_leid ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 3 │\n", + "│ en_2 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ DMSO_hear ┆ -0.089359 ┆ -0.048628 ┆ 0.77182 ┆ … ┆ -0.409342 ┆ -0.294257 ┆ -0.251505 ┆ -0.38177 │\n", + "│ t_11_louv ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 9 │\n", + "│ ain_0 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ DMSO_hear ┆ -0.185982 ┆ -0.704415 ┆ -0.06179 ┆ … ┆ 0.452429 ┆ 0.200085 ┆ -0.109316 ┆ 0.069745 │\n", + "│ t_11_louv ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ ain_1 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ TGFRi_hea ┆ -0.08178 ┆ -0.133647 ┆ 0.832756 ┆ … ┆ -0.380266 ┆ -0.258423 ┆ -0.254312 ┆ -0.36705 │\n", + "│ rt_11_lou ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 8 │\n", + "│ vain_0 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ TGFRi_hea ┆ -0.149985 ┆ -0.784473 ┆ -0.199155 ┆ … ┆ 0.123705 ┆ 0.395675 ┆ -0.169375 ┆ 0.131136 │\n", + "│ rt_11_lou ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ vain_1 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "└───────────┴───────────┴───────────┴───────────┴───┴───────────┴───────────┴───────────┴──────────┘" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# aggregate by cluster using median to generate centroid profiles\n", + "centroids_df = aggregate(\n", + " population_df=cfret_df.to_pandas(),\n", + " strata=[\"Metadata_cluster_id\"],\n", + " features=cfret_feats,\n", + " operation=\"median\",\n", + ")\n", + "\n", + "# convert back to polars\n", + "centroids_df = pl.from_pandas(centroids_df)\n", + "\n", + "print(f\"Total centroids generated: {len(centroids_df)}\")\n", + "print(f\"Centroid shape: {centroids_df.shape}\")\n", + "centroids_df" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "00db4614", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Centroids saved to: /home/erikserrano/Projects/buscar/notebooks/2.cfret-analysis/results/centroids/cfret_pilot_centroids.parquet\n", + "Final centroid shape: (9, 659)\n" + ] + } + ], + "source": [ + "# create a mapping of cluster_id to heart_treatment (unique per cluster)\n", + "cluster_treatment_mapping = cfret_df.select(\n", + " [\"Metadata_cluster_id\", \"Metadata_heart_treatment\"]\n", + ").unique()\n", + "\n", + "# join centroids with the treatment metadata\n", + "centroids_df = centroids_df.join(\n", + " cluster_treatment_mapping, on=\"Metadata_cluster_id\", how=\"left\"\n", + ")\n", + "\n", + "# save centroids to parquet file\n", + "centroids_output_path = centroids_dir / \"cfret_pilot_centroids.parquet\"\n", + "centroids_df.write_parquet(centroids_output_path)\n", + "\n", + "print(f\"Centroids saved to: {centroids_output_path}\")\n", + "print(f\"Final centroid shape: {centroids_df.shape}\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "buscar", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.11" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/notebooks/2.cfret-analysis/nbconverted/1.cfret-pilot-buscar-analysis.py b/notebooks/2.cfret-analysis/nbconverted/1.cfret-pilot-buscar-analysis.py index cd1b7a9..aaeb0eb 100644 --- a/notebooks/2.cfret-analysis/nbconverted/1.cfret-pilot-buscar-analysis.py +++ b/notebooks/2.cfret-analysis/nbconverted/1.cfret-pilot-buscar-analysis.py @@ -39,18 +39,14 @@ treatment_col = "Metadata_treatment" treatment_heart_col = "Metadata_treatment_and_heart" -# parameters used for clustering optimization +# parameter grid for clustering optimization cfret_pilot_cluster_param_grid = { - # Clustering resolution: how granular the clusters should be - "cluster_resolution": {"type": "float", "low": 0.1, "high": 2.2}, - # Number of neighbors for graph construction - "n_neighbors": {"type": "int", "low": 5, "high": 100}, - # Clustering algorithm - "cluster_method": {"type": "categorical", "choices": ["leiden"]}, - # Distance metric for neighbor computation + "cluster_resolution": {"type": "float", "low": 0.05, "high": 3.0}, + "n_neighbors": {"type": "int", "low": 10, "high": 100}, + "cluster_method": {"type": "categorical", "choices": ["leiden", "louvain"]}, "neighbor_distance_metric": { "type": "categorical", - "choices": ["euclidean", "cosine", "manhattan"], + "choices": ["cosine", "euclidean", "manhattan"], }, } @@ -72,9 +68,25 @@ ).resolve(strict=True) # make results dir -results_dir = pathlib.Path("./results/cfret-pilot").resolve() +results_dir = pathlib.Path("./results").resolve() results_dir.mkdir(parents=True, exist_ok=True) +# set signatures results dir +signatures_results_dir = (results_dir / "signatures").resolve() +signatures_results_dir.mkdir(parents=True, exist_ok=True) + +# set cluster labels results dir +cluster_labels_results_dir = (results_dir / "clusters").resolve() +cluster_labels_results_dir.mkdir(parents=True, exist_ok=True) + +# set pca results dir +transformed_results_dir = (results_dir / "transformed-data").resolve() +transformed_results_dir.mkdir(parents=True, exist_ok=True) + +# set phenotypic scores results dir +phenotypic_scores_results_dir = (results_dir / "phenotypic_scores").resolve() +phenotypic_scores_results_dir.mkdir(parents=True, exist_ok=True) + # Data preprocessing # - @@ -124,13 +136,15 @@ cfret_df.head() +# Display the treatments and number of cells per heart-treatment combination + # In[5]: # show how many cells per treatment # shows the number of cells per treatment that will be clustered. cells_per_treatment_counts = ( - cfret_df.group_by(treatment_heart_col).count().sort(treatment_heart_col) + cfret_df.group_by(treatment_heart_col).len().sort(treatment_heart_col) ) cells_per_treatment_counts @@ -143,7 +157,7 @@ # setting output paths -signatures_outpath = (results_dir / "cfret_pilot_signatures.json").resolve() +signatures_outpath = (signatures_results_dir / "cfret_pilot_signatures.json").resolve() if signatures_outpath.exists(): print("Signatures already exist, skipping this step.") @@ -169,7 +183,7 @@ json.dump({"on": on_sigs, "off": off_sigs}, f, indent=4) -# Search for heterogenous effects for each treatment +# Transform raw data into PCA components that explains 95% of the variance. # In[7]: @@ -180,12 +194,28 @@ meta_features=cfret_meta, morph_features=cfret_feats, var_explained=0.95, - seed=0, + random_state=0, ) +# save PCA transformed data +pca_cfret_outpath = ( + transformed_results_dir / "cfret_pca_profiles_95var.parquet" +).resolve() +pca_cfret_df.write_parquet(pca_cfret_outpath) + # update cfret_feats because PCA was applied cfret_pca_feats = pca_cfret_df.drop(cfret_meta).columns +# save feature space +with open(transformed_results_dir / "cfret_pca_feature_space.json", "w") as f: + json.dump( + {"metadata-features": cfret_meta, "morphological-features": cfret_pca_feats}, + f, + indent=4, + ) + + +# This section applies clustering to identify distinct cell populations within each treatment condition. The clustering is optimized using Optuna to find the best hyperparameters (resolution, number of neighbors, and distance metric) that maximize separation of cell populations while maintaining biological relevance. # In[ ]: @@ -197,7 +227,9 @@ # check if the cluster labels already exist; if so just load the labels and skip optimization # if not run optimization -cluster_labels_output = (results_dir / "cfret_pilot_cluster_labels.parquet").resolve() +cluster_labels_output = ( + cluster_labels_results_dir / "cfret_pilot_cluster_labels.parquet" +).resolve() if cluster_labels_output.exists(): print("Cluster labels already exist, skipping clustering optimization.") cfret_cluster_labels_df = pl.read_parquet(cluster_labels_output) @@ -222,11 +254,15 @@ cfret_cluster_labels_df.write_parquet(cluster_labels_output) # write best params as a json file - with open(results_dir / "cfret_pilot_best_clustering_params.json", "w") as f: + with open( + cluster_labels_results_dir / "cfret_pilot_best_clustering_params.json", "w" + ) as f: json.dump(cfret_best_params, f, indent=4) -# In[ ]: +# This section measures the phenotypic distance between each treatment and the reference control (DMSO_heart_11) using the on and off signatures. The phenotypic scores are then used to rank treatments and identify top-ranking compounds based on their morphological activity. + +# In[9]: # merge cfret_df with the cluster labels and make sure to drop duplicate Metadata_cell_id columns @@ -241,13 +277,15 @@ raise ValueError("Merged DataFrame has different number of rows!") -# In[ ]: +# In[10]: # setting output paths treatment_dist_scores_outpath = ( - results_dir / "treatment_phenotypic_scores.csv" + phenotypic_scores_results_dir / "treatment_phenotypic_scores.csv" ).resolve() + +# calculate phenotypic distance scores if treatment_dist_scores_outpath.exists(): print("Treatment phenotypic distance scores already exist, skipping this step.") treatment_heart_dist_scores = pl.read_csv(treatment_dist_scores_outpath) @@ -265,19 +303,15 @@ treatment_heart_dist_scores.write_csv(treatment_dist_scores_outpath) -# In[12]: - +# In[11]: -treatment_heart_dist_scores - -# In[13]: - - -# setting outptut paths +# setting output paths treatment_heart_rankings_outpath = ( - results_dir / "treatment_heart_rankings.csv" + phenotypic_scores_results_dir / "treatment_heart_rankings.csv" ).resolve() + +# identify hits based on distance scores if treatment_heart_rankings_outpath.exists(): print("Treatment heart rankings already exist, skipping this step.") treatment_heart_rankings = pl.read_csv(treatment_heart_rankings_outpath) diff --git a/notebooks/2.cfret-analysis/nbconverted/2.generate-aggregate-profiles.py b/notebooks/2.cfret-analysis/nbconverted/2.generate-aggregate-profiles.py new file mode 100644 index 0000000..7bd2c9e --- /dev/null +++ b/notebooks/2.cfret-analysis/nbconverted/2.generate-aggregate-profiles.py @@ -0,0 +1,104 @@ +#!/usr/bin/env python + +# # 2. Generating Aggregate Profiles +# +# This notebook transforms single-cell morphological profiles from the CFReT pilot dataset into summary representations for downstream analysis. Aggregation reduces noise and enables robust comparisons between experimental conditions by collapsing hundreds or thousands of single-cell measurements into representative profiles. +# +# Two levels of aggregation are generated: +# 1. **Replicate-level profiles**: Aggregate cells by well position, heart number, cell type, and treatment to create technical replicate profiles +# 2. **Consensus profiles**: Further aggregate replicates by heart type and treatment to generate condition-level consensus signatures +# +# Here we used `pycytominer.aggregate()` to apply median aggregation to generate two profiles explained above. Then output profiles are saved as parquet files. + +# In[1]: + + +import pathlib +import sys + +import polars as pl +from pycytominer import aggregate + +sys.path.append("../../") +from utils.data_utils import split_meta_and_features + +# Setting input and output paths + +# In[2]: + + +# setting data path for cfret-pilot dataset +cfret_profiles_path = pathlib.Path( + "../0.download-data/data/sc-profiles/cfret/localhost230405150001_sc_feature_selected.parquet" +).resolve(strict=True) + +# set results directory path +results_dir = pathlib.Path("./results").resolve() +results_dir.mkdir(exist_ok=True) + +# make aggregate profile directory +aggregate_profiles_dir = results_dir / "aggregate_profiles" +aggregate_profiles_dir.mkdir(exist_ok=True) + + +# In[3]: + + +# load in the cfret-pilot dataset +cfret_df = pl.read_parquet(cfret_profiles_path) + +# add a column that indicates the heart and treatment added +cfret_df = cfret_df.with_columns( + pl.concat_str( + [ + pl.col("Metadata_cell_type"), + pl.col("Metadata_treatment"), + ], + separator="_", + ).alias("Metadata_heart_treatment") +) + +# split feature space +cfret_meta, cfret_feats = split_meta_and_features(cfret_df) + +# display +print(cfret_df.shape) +cfret_df.head() + + +# Generating aggregate profiles at the replicate level + +# In[4]: + + +aggregate( + population_df=cfret_df.to_pandas(), + strata=[ + "Metadata_heart_treatment", + "Metadata_WellRow", + "Metadata_WellCol", + "Metadata_heart_number", + "Metadata_cell_type", + "Metadata_treatment", + ], + features=cfret_feats, + operation="median", + output_type="parquet", + output_file=(aggregate_profiles_dir / "cfret_replicate_profiles.parquet").resolve(), +) + + +# Generating consensus profiles of of the treatment and heart type + +# In[5]: + + +# aggregating profiles by heart and treatment +aggregate( + population_df=cfret_df.to_pandas(), + strata=["Metadata_heart_treatment", "Metadata_cell_type", "Metadata_treatment"], + features=cfret_feats, + operation="median", + output_type="parquet", + output_file=(aggregate_profiles_dir / "cfret_consensus_profiles.parquet").resolve(), +) diff --git a/notebooks/2.cfret-analysis/nbconverted/3.generate-centroid.py b/notebooks/2.cfret-analysis/nbconverted/3.generate-centroid.py new file mode 100644 index 0000000..27ab6d3 --- /dev/null +++ b/notebooks/2.cfret-analysis/nbconverted/3.generate-centroid.py @@ -0,0 +1,127 @@ +#!/usr/bin/env python + +# ## 3 Generating centroid profiles +# In this notebook, we identify the centroid for each cluster found in the single-cell profiles after running the buscar clustering module. +# +# The centroid is the representative cell that best captures the distribution of cells within a cluster. + +# In[1]: + + +import pathlib +import sys + +import polars as pl +from pycytominer import aggregate + +sys.path.append("../../") +from utils.data_utils import split_meta_and_features + +# Setting input and output paths + +# In[2]: + + +# setting data path for cfret-pilot dataset +cfret_profiles_path = pathlib.Path( + "../0.download-data/data/sc-profiles/cfret/localhost230405150001_sc_feature_selected.parquet" +).resolve(strict=True) + +# setting cluster labels path +cluster_labels_path = pathlib.Path( + "./results/clusters/cfret_pilot_cluster_labels.parquet" +).resolve(strict=True) + +# setting outpaths for results +results_dir = pathlib.Path("./results").resolve() +results_dir.mkdir(exist_ok=True) + +# setting outpath for centroids +centroids_dir = (results_dir / "centroids").resolve() +centroids_dir.mkdir(exist_ok=True) + + +# Loading profiles + +# In[3]: + + +# loading in profiles and add cluster labels to profiles dataframe +cfret_df = pl.read_parquet(cfret_profiles_path).join( + pl.read_parquet(cluster_labels_path), on="Metadata_cell_id", how="inner" +) + +# add a column that indicates the heart and treatment added +cfret_df = cfret_df.with_columns( + pl.concat_str( + [ + pl.col("Metadata_cell_type"), + pl.col("Metadata_treatment"), + ], + separator="_", + ).alias("Metadata_heart_treatment") +) + +# split feature space +cfret_meta, cfret_feats = split_meta_and_features(cfret_df) + +# display +print(cfret_df.shape) + + +cfret_df.select(cfret_meta + cfret_feats).head() + + +# We use **median aggregation** to generate centroid profiles for each cluster. For each cluster, we calculate the component-wise median across all cells to create a synthetic representative profile that captures the central tendency. This approach is robust to outliers, consistent with replicate and consensus profile generation workflow, and works well for high-dimensional morphological features. + +# In[4]: + + +# split metadata and features +cfret_meta, cfret_feats = split_meta_and_features(cfret_df) + +print(f"Total cells: {len(cfret_df)}") +print(f"Number of features: {len(cfret_feats)}") +print(f"Unique clusters: {cfret_df['Metadata_cluster_id'].n_unique()}") + + +# Save centroid profiles + +# In[5]: + + +# aggregate by cluster using median to generate centroid profiles +centroids_df = aggregate( + population_df=cfret_df.to_pandas(), + strata=["Metadata_cluster_id"], + features=cfret_feats, + operation="median", +) + +# convert back to polars +centroids_df = pl.from_pandas(centroids_df) + +print(f"Total centroids generated: {len(centroids_df)}") +print(f"Centroid shape: {centroids_df.shape}") +centroids_df + + +# In[6]: + + +# create a mapping of cluster_id to heart_treatment (unique per cluster) +cluster_treatment_mapping = cfret_df.select( + ["Metadata_cluster_id", "Metadata_heart_treatment"] +).unique() + +# join centroids with the treatment metadata +centroids_df = centroids_df.join( + cluster_treatment_mapping, on="Metadata_cluster_id", how="left" +) + +# save centroids to parquet file +centroids_output_path = centroids_dir / "cfret_pilot_centroids.parquet" +centroids_df.write_parquet(centroids_output_path) + +print(f"Centroids saved to: {centroids_output_path}") +print(f"Final centroid shape: {centroids_df.shape}")