Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ repos:

# Ruff for linting and formatting Python files
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.14.10
rev: v0.14.13
hooks:
- id: ruff-check
args: ["--fix"]
Expand Down
3 changes: 3 additions & 0 deletions notebooks/0.download-data/2.preprocessing.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -548,6 +548,9 @@
"# adding a unique cell ID based on all features\n",
"cfret_profiles = add_cell_id_hash(cfret_profiles, force=True)\n",
"\n",
"# drop rows cells that have been treated with drug_x\n",
"cfret_profiles = cfret_profiles.filter(pl.col(\"Metadata_treatment\") != \"drug_x\")\n",
"\n",
"# split features\n",
"meta_cols, features_cols = split_meta_and_features(cfret_profiles)\n",
"\n",
Expand Down
3 changes: 3 additions & 0 deletions notebooks/0.download-data/nbconverted/2.preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -453,6 +453,9 @@ def remove_feature_prefixes(df: pl.DataFrame, prefix: str = "CP__") -> pl.DataFr
# adding a unique cell ID based on all features
cfret_profiles = add_cell_id_hash(cfret_profiles, force=True)

# drop cells (rows) that have been treated with drug_x
cfret_profiles = cfret_profiles.filter(pl.col("Metadata_treatment") != "drug_x")

# split features
meta_cols, features_cols = split_meta_and_features(cfret_profiles)

Expand Down
308 changes: 121 additions & 187 deletions notebooks/2.cfret-analysis/1.cfret-pilot-buscar-analysis.ipynb

Large diffs are not rendered by default.

253 changes: 253 additions & 0 deletions notebooks/2.cfret-analysis/2.generate-aggregate-profiles.ipynb

Large diffs are not rendered by default.

356 changes: 356 additions & 0 deletions notebooks/2.cfret-analysis/3.generate-centroid.ipynb

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -39,18 +39,14 @@
treatment_col = "Metadata_treatment"
treatment_heart_col = "Metadata_treatment_and_heart"

# parameters used for clustering optimization
# parameter grid for clustering optimization
cfret_pilot_cluster_param_grid = {
# Clustering resolution: how granular the clusters should be
"cluster_resolution": {"type": "float", "low": 0.1, "high": 2.2},
# Number of neighbors for graph construction
"n_neighbors": {"type": "int", "low": 5, "high": 100},
# Clustering algorithm
"cluster_method": {"type": "categorical", "choices": ["leiden"]},
# Distance metric for neighbor computation
"cluster_resolution": {"type": "float", "low": 0.05, "high": 3.0},
"n_neighbors": {"type": "int", "low": 10, "high": 100},
"cluster_method": {"type": "categorical", "choices": ["leiden", "louvain"]},
"neighbor_distance_metric": {
"type": "categorical",
"choices": ["euclidean", "cosine", "manhattan"],
"choices": ["cosine", "euclidean", "manhattan"],
},
}

Expand All @@ -72,9 +68,25 @@
).resolve(strict=True)

# make results dir
results_dir = pathlib.Path("./results/cfret-pilot").resolve()
results_dir = pathlib.Path("./results").resolve()
results_dir.mkdir(parents=True, exist_ok=True)

# set signatures results dir
signatures_results_dir = (results_dir / "signatures").resolve()
signatures_results_dir.mkdir(parents=True, exist_ok=True)

# set cluster labels results dir
cluster_labels_results_dir = (results_dir / "clusters").resolve()
cluster_labels_results_dir.mkdir(parents=True, exist_ok=True)

# set pca results dir
transformed_results_dir = (results_dir / "transformed-data").resolve()
transformed_results_dir.mkdir(parents=True, exist_ok=True)

# set phenotypic scores results dir
phenotypic_scores_results_dir = (results_dir / "phenotypic_scores").resolve()
phenotypic_scores_results_dir.mkdir(parents=True, exist_ok=True)


# Data preprocessing
# -
Expand Down Expand Up @@ -124,13 +136,15 @@
cfret_df.head()


# Display the treatments and number of cells per heart-treatment combination

# In[5]:


# show how many cells per treatment
# shows the number of cells per treatment that will be clustered.
cells_per_treatment_counts = (
cfret_df.group_by(treatment_heart_col).count().sort(treatment_heart_col)
cfret_df.group_by(treatment_heart_col).len().sort(treatment_heart_col)
)
cells_per_treatment_counts

Expand All @@ -143,7 +157,7 @@


# setting output paths
signatures_outpath = (results_dir / "cfret_pilot_signatures.json").resolve()
signatures_outpath = (signatures_results_dir / "cfret_pilot_signatures.json").resolve()

if signatures_outpath.exists():
print("Signatures already exist, skipping this step.")
Expand All @@ -169,7 +183,7 @@
json.dump({"on": on_sigs, "off": off_sigs}, f, indent=4)


# Search for heterogenous effects for each treatment
# Transform raw data into PCA components that explains 95% of the variance.

# In[7]:

Expand All @@ -180,12 +194,28 @@
meta_features=cfret_meta,
morph_features=cfret_feats,
var_explained=0.95,
seed=0,
random_state=0,
)

# save PCA transformed data
pca_cfret_outpath = (
transformed_results_dir / "cfret_pca_profiles_95var.parquet"
).resolve()
pca_cfret_df.write_parquet(pca_cfret_outpath)

# update cfret_feats because PCA was applied
cfret_pca_feats = pca_cfret_df.drop(cfret_meta).columns

# save feature space
with open(transformed_results_dir / "cfret_pca_feature_space.json", "w") as f:
json.dump(
{"metadata-features": cfret_meta, "morphological-features": cfret_pca_feats},
f,
indent=4,
)


# This section applies clustering to identify distinct cell populations within each treatment condition. The clustering is optimized using Optuna to find the best hyperparameters (resolution, number of neighbors, and distance metric) that maximize separation of cell populations while maintaining biological relevance.

# In[ ]:

Expand All @@ -197,7 +227,9 @@

# check if the cluster labels already exist; if so just load the labels and skip optimization
# if not run optimization
cluster_labels_output = (results_dir / "cfret_pilot_cluster_labels.parquet").resolve()
cluster_labels_output = (
cluster_labels_results_dir / "cfret_pilot_cluster_labels.parquet"
).resolve()
if cluster_labels_output.exists():
print("Cluster labels already exist, skipping clustering optimization.")
cfret_cluster_labels_df = pl.read_parquet(cluster_labels_output)
Expand All @@ -222,11 +254,15 @@
cfret_cluster_labels_df.write_parquet(cluster_labels_output)

# write best params as a json file
with open(results_dir / "cfret_pilot_best_clustering_params.json", "w") as f:
with open(
cluster_labels_results_dir / "cfret_pilot_best_clustering_params.json", "w"
) as f:
json.dump(cfret_best_params, f, indent=4)


# In[ ]:
# This section measures the phenotypic distance between each treatment and the reference control (DMSO_heart_11) using the on and off signatures. The phenotypic scores are then used to rank treatments and identify top-ranking compounds based on their morphological activity.

# In[9]:


# merge cfret_df with the cluster labels and make sure to drop duplicate Metadata_cell_id columns
Expand All @@ -241,13 +277,15 @@
raise ValueError("Merged DataFrame has different number of rows!")


# In[ ]:
# In[10]:


# setting output paths
treatment_dist_scores_outpath = (
results_dir / "treatment_phenotypic_scores.csv"
phenotypic_scores_results_dir / "treatment_phenotypic_scores.csv"
).resolve()

# calculate phenotypic distance scores
if treatment_dist_scores_outpath.exists():
print("Treatment phenotypic distance scores already exist, skipping this step.")
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

might want to perform some content check to ensure everything needed is present so in the event you include/uninclude samples earlier the pipeline knows if stuff needs to be re-generated or perhaps a force-re-run flag would be handy.

treatment_heart_dist_scores = pl.read_csv(treatment_dist_scores_outpath)
Expand All @@ -265,19 +303,15 @@
treatment_heart_dist_scores.write_csv(treatment_dist_scores_outpath)


# In[12]:

# In[11]:

treatment_heart_dist_scores


# In[13]:


# setting outptut paths
# setting output paths
treatment_heart_rankings_outpath = (
results_dir / "treatment_heart_rankings.csv"
phenotypic_scores_results_dir / "treatment_heart_rankings.csv"
).resolve()

# identify hits based on distance scores
if treatment_heart_rankings_outpath.exists():
print("Treatment heart rankings already exist, skipping this step.")
treatment_heart_rankings = pl.read_csv(treatment_heart_rankings_outpath)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
#!/usr/bin/env python

# # 2. Generating Aggregate Profiles
#
# This notebook transforms single-cell morphological profiles from the CFReT pilot dataset into summary representations for downstream analysis. Aggregation reduces noise and enables robust comparisons between experimental conditions by collapsing hundreds or thousands of single-cell measurements into representative profiles.
#
# Two levels of aggregation are generated:
# 1. **Replicate-level profiles**: Aggregate cells by well position, heart number, cell type, and treatment to create technical replicate profiles
# 2. **Consensus profiles**: Further aggregate replicates by heart type and treatment to generate condition-level consensus signatures
#
# Here we used `pycytominer.aggregate()` to apply median aggregation to generate two profiles explained above. Then output profiles are saved as parquet files.

# In[1]:


import pathlib
import sys

import polars as pl
from pycytominer import aggregate

sys.path.append("../../")
from utils.data_utils import split_meta_and_features

# Setting input and output paths

# In[2]:


# input: feature-selected single-cell profiles produced by the download/preprocessing
# module; strict=True makes the script fail fast if that step has not been run yet
cfret_profiles_path = pathlib.Path(
    "../0.download-data/data/sc-profiles/cfret/localhost230405150001_sc_feature_selected.parquet"
).resolve(strict=True)

# results directory for this notebook's outputs; parents=True matches the
# mkdir convention used by the sibling analysis notebooks and is safe if any
# intermediate directory is missing
results_dir = pathlib.Path("./results").resolve()
results_dir.mkdir(parents=True, exist_ok=True)

# subdirectory holding both the replicate-level and consensus aggregate profiles
aggregate_profiles_dir = results_dir / "aggregate_profiles"
aggregate_profiles_dir.mkdir(parents=True, exist_ok=True)


# In[3]:


# read the single-cell cfret-pilot profiles into a polars DataFrame
cfret_df = pl.read_parquet(cfret_profiles_path)

# build a combined "<cell_type>_<treatment>" label used as an aggregation key
heart_treatment_label = pl.concat_str(
    [pl.col("Metadata_cell_type"), pl.col("Metadata_treatment")],
    separator="_",
).alias("Metadata_heart_treatment")
cfret_df = cfret_df.with_columns(heart_treatment_label)

# separate metadata columns from morphology feature columns
cfret_meta, cfret_feats = split_meta_and_features(cfret_df)

# show the dataframe dimensions and a preview of the first rows
print(cfret_df.shape)
cfret_df.head()


# Generating aggregate profiles at the replicate level

# In[4]:


# columns that define one technical replicate: well position plus the
# heart/treatment condition metadata
replicate_strata = [
    "Metadata_heart_treatment",
    "Metadata_WellRow",
    "Metadata_WellCol",
    "Metadata_heart_number",
    "Metadata_cell_type",
    "Metadata_treatment",
]

# median-collapse single cells into per-well replicate profiles and write
# the result directly to parquet
aggregate(
    population_df=cfret_df.to_pandas(),
    strata=replicate_strata,
    features=cfret_feats,
    operation="median",
    output_type="parquet",
    output_file=(aggregate_profiles_dir / "cfret_replicate_profiles.parquet").resolve(),
)


# Generating consensus profiles of the treatment and heart type

# In[5]:


# columns that define one consensus profile: the heart/treatment condition only
# (no well position), collapsing replicates into condition-level signatures
consensus_strata = [
    "Metadata_heart_treatment",
    "Metadata_cell_type",
    "Metadata_treatment",
]

# median-collapse all cells of each condition and write the result to parquet
aggregate(
    population_df=cfret_df.to_pandas(),
    strata=consensus_strata,
    features=cfret_feats,
    operation="median",
    output_type="parquet",
    output_file=(aggregate_profiles_dir / "cfret_consensus_profiles.parquet").resolve(),
)
Loading