Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ repos:

# Ruff for linting and formatting Python files
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.14.10
rev: v0.14.13
hooks:
- id: ruff-check
args: ["--fix"]
Expand Down
3 changes: 3 additions & 0 deletions notebooks/0.download-data/2.preprocessing.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -548,6 +548,9 @@
"# adding a unique cell ID based on all features\n",
"cfret_profiles = add_cell_id_hash(cfret_profiles, force=True)\n",
"\n",
"# drop rows cells that have been treated with drug_x\n",
"cfret_profiles = cfret_profiles.filter(pl.col(\"Metadata_treatment\") != \"drug_x\")\n",
"\n",
"# split features\n",
"meta_cols, features_cols = split_meta_and_features(cfret_profiles)\n",
"\n",
Expand Down
3 changes: 3 additions & 0 deletions notebooks/0.download-data/nbconverted/2.preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -453,6 +453,9 @@ def remove_feature_prefixes(df: pl.DataFrame, prefix: str = "CP__") -> pl.DataFr
# adding a unique cell ID based on all features
cfret_profiles = add_cell_id_hash(cfret_profiles, force=True)

# drop cells (rows) that have been treated with drug_x
cfret_profiles = cfret_profiles.filter(pl.col("Metadata_treatment") != "drug_x")

# split features
meta_cols, features_cols = split_meta_and_features(cfret_profiles)

Expand Down
308 changes: 121 additions & 187 deletions notebooks/2.cfret-analysis/1.cfret-pilot-buscar-analysis.ipynb

Large diffs are not rendered by default.

253 changes: 253 additions & 0 deletions notebooks/2.cfret-analysis/2.generate-aggregate-profiles.ipynb

Large diffs are not rendered by default.

356 changes: 356 additions & 0 deletions notebooks/2.cfret-analysis/3.generate-centroid.ipynb

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -39,18 +39,14 @@
treatment_col = "Metadata_treatment"
treatment_heart_col = "Metadata_treatment_and_heart"

# parameters used for clustering optimization
# parameter grid for clustering optimization
cfret_pilot_cluster_param_grid = {
# Clustering resolution: how granular the clusters should be
"cluster_resolution": {"type": "float", "low": 0.1, "high": 2.2},
# Number of neighbors for graph construction
"n_neighbors": {"type": "int", "low": 5, "high": 100},
# Clustering algorithm
"cluster_method": {"type": "categorical", "choices": ["leiden"]},
# Distance metric for neighbor computation
"cluster_resolution": {"type": "float", "low": 0.05, "high": 3.0},
"n_neighbors": {"type": "int", "low": 10, "high": 100},
"cluster_method": {"type": "categorical", "choices": ["leiden", "louvain"]},
"neighbor_distance_metric": {
"type": "categorical",
"choices": ["euclidean", "cosine", "manhattan"],
"choices": ["cosine", "euclidean", "manhattan"],
},
}

Expand All @@ -72,9 +68,25 @@
).resolve(strict=True)

# make results dir
results_dir = pathlib.Path("./results/cfret-pilot").resolve()
results_dir = pathlib.Path("./results").resolve()
results_dir.mkdir(parents=True, exist_ok=True)

# set signatures results dir
signatures_results_dir = (results_dir / "signatures").resolve()
signatures_results_dir.mkdir(parents=True, exist_ok=True)

# set cluster labels results dir
cluster_labels_results_dir = (results_dir / "clusters").resolve()
cluster_labels_results_dir.mkdir(parents=True, exist_ok=True)

# set pca results dir
transformed_results_dir = (results_dir / "transformed-data").resolve()
transformed_results_dir.mkdir(parents=True, exist_ok=True)

# set phenotypic scores results dir
phenotypic_scores_results_dir = (results_dir / "phenotypic_scores").resolve()
phenotypic_scores_results_dir.mkdir(parents=True, exist_ok=True)


# Data preprocessing
# -
Expand Down Expand Up @@ -124,13 +136,15 @@
cfret_df.head()


# Display the treatments and number of cells per heart-treatment combination

# In[5]:


# show how many cells per treatment
# shows the number of cells per treatment that will be clustered.
cells_per_treatment_counts = (
cfret_df.group_by(treatment_heart_col).count().sort(treatment_heart_col)
cfret_df.group_by(treatment_heart_col).len().sort(treatment_heart_col)
)
cells_per_treatment_counts

Expand All @@ -143,7 +157,7 @@


# setting output paths
signatures_outpath = (results_dir / "cfret_pilot_signatures.json").resolve()
signatures_outpath = (signatures_results_dir / "cfret_pilot_signatures.json").resolve()

if signatures_outpath.exists():
print("Signatures already exist, skipping this step.")
Expand All @@ -169,7 +183,7 @@
json.dump({"on": on_sigs, "off": off_sigs}, f, indent=4)


# Search for heterogenous effects for each treatment
# Transform raw data into PCA components that explains 95% of the variance.

# In[7]:

Expand All @@ -180,12 +194,28 @@
meta_features=cfret_meta,
morph_features=cfret_feats,
var_explained=0.95,
seed=0,
random_state=0,
)

# save PCA transformed data
pca_cfret_outpath = (
transformed_results_dir / "cfret_pca_profiles_95var.parquet"
).resolve()
pca_cfret_df.write_parquet(pca_cfret_outpath)

# update cfret_feats because PCA was applied
cfret_pca_feats = pca_cfret_df.drop(cfret_meta).columns

# save feature space
with open(transformed_results_dir / "cfret_pca_feature_space.json", "w") as f:
json.dump(
{"metadata-features": cfret_meta, "morphological-features": cfret_pca_feats},
f,
indent=4,
)


# This section applies clustering to identify distinct cell populations within each treatment condition. The clustering is optimized using Optuna to find the best hyperparameters (resolution, number of neighbors, and distance metric) that maximize separation of cell populations while maintaining biological relevance.

# In[ ]:

Expand All @@ -197,7 +227,9 @@

# check if the cluster labels already exist; if so just load the labels and skip optimization
# if not run optimization
cluster_labels_output = (results_dir / "cfret_pilot_cluster_labels.parquet").resolve()
cluster_labels_output = (
cluster_labels_results_dir / "cfret_pilot_cluster_labels.parquet"
).resolve()
if cluster_labels_output.exists():
print("Cluster labels already exist, skipping clustering optimization.")
cfret_cluster_labels_df = pl.read_parquet(cluster_labels_output)
Expand All @@ -222,11 +254,15 @@
cfret_cluster_labels_df.write_parquet(cluster_labels_output)

# write best params as a json file
with open(results_dir / "cfret_pilot_best_clustering_params.json", "w") as f:
with open(
cluster_labels_results_dir / "cfret_pilot_best_clustering_params.json", "w"
) as f:
json.dump(cfret_best_params, f, indent=4)


# In[ ]:
# This section measures the phenotypic distance between each treatment and the reference control (DMSO_heart_11) using the on and off signatures. The phenotypic scores are then used to rank treatments and identify top-ranking compounds based on their morphological activity.

# In[9]:


# merge cfret_df with the cluster labels and make sure to drop duplicate Metadata_cell_id columns
Expand All @@ -241,13 +277,15 @@
raise ValueError("Merged DataFrame has different number of rows!")


# In[ ]:
# In[10]:


# setting output paths
treatment_dist_scores_outpath = (
results_dir / "treatment_phenotypic_scores.csv"
phenotypic_scores_results_dir / "treatment_phenotypic_scores.csv"
).resolve()

# calculate phenotypic distance scores
if treatment_dist_scores_outpath.exists():
print("Treatment phenotypic distance scores already exist, skipping this step.")
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

might want to perform some content check to ensure everything needed is present so in the event you include/uninclude samples earlier the pipeline knows if stuff needs to be re-generated or perhaps a force-re-run flag would be handy.

treatment_heart_dist_scores = pl.read_csv(treatment_dist_scores_outpath)
Expand All @@ -265,19 +303,15 @@
treatment_heart_dist_scores.write_csv(treatment_dist_scores_outpath)


# In[12]:

# In[11]:

treatment_heart_dist_scores


# In[13]:


# setting outptut paths
# setting output paths
treatment_heart_rankings_outpath = (
results_dir / "treatment_heart_rankings.csv"
phenotypic_scores_results_dir / "treatment_heart_rankings.csv"
).resolve()

# identify hits based on distance scores
if treatment_heart_rankings_outpath.exists():
print("Treatment heart rankings already exist, skipping this step.")
treatment_heart_rankings = pl.read_csv(treatment_heart_rankings_outpath)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
#!/usr/bin/env python

# # 2. Generating Aggregate Profiles
#
# This notebook transforms single-cell morphological profiles from the CFReT pilot dataset into summary representations for downstream analysis. Aggregation reduces noise and enables robust comparisons between experimental conditions by collapsing hundreds or thousands of single-cell measurements into representative profiles.
#
# Two levels of aggregation are generated:
# 1. **Replicate-level profiles**: Aggregate cells by well position, heart number, cell type, and treatment to create technical replicate profiles
# 2. **Consensus profiles**: Further aggregate replicates by heart type and treatment to generate condition-level consensus signatures
#
# Here we used `pycytominer.aggregate()` to apply median aggregation to generate two profiles explained above. Then output profiles are saved as parquet files.

# In[1]:


import pathlib
import sys

import polars as pl
from pycytominer import aggregate

sys.path.append("../../")
from utils.data_utils import split_meta_and_features

# Setting input and output paths

# In[2]:


# input: feature-selected single-cell profiles produced by the download/preprocessing
# module; strict=True makes the script fail fast if that step has not been run yet
cfret_profiles_path = pathlib.Path(
    "../0.download-data/data/sc-profiles/cfret/localhost230405150001_sc_feature_selected.parquet"
).resolve(strict=True)

# results directory for this notebook's outputs; parents=True matches the
# mkdir convention used by the sibling analysis notebooks and is safe if any
# intermediate directory is missing
results_dir = pathlib.Path("./results").resolve()
results_dir.mkdir(parents=True, exist_ok=True)

# subdirectory holding both the replicate-level and consensus aggregate profiles
aggregate_profiles_dir = results_dir / "aggregate_profiles"
aggregate_profiles_dir.mkdir(parents=True, exist_ok=True)


# In[3]:


# read the single-cell cfret-pilot profiles into a polars DataFrame
cfret_df = pl.read_parquet(cfret_profiles_path)

# build a combined "<cell_type>_<treatment>" label used as an aggregation key
heart_treatment_label = pl.concat_str(
    [pl.col("Metadata_cell_type"), pl.col("Metadata_treatment")],
    separator="_",
).alias("Metadata_heart_treatment")
cfret_df = cfret_df.with_columns(heart_treatment_label)

# separate metadata columns from morphology feature columns
cfret_meta, cfret_feats = split_meta_and_features(cfret_df)

# show the dataframe dimensions and a preview of the first rows
print(cfret_df.shape)
cfret_df.head()


# Generating aggregate profiles at the replicate level

# In[4]:


# columns that define one technical replicate: well position plus the
# heart/treatment condition metadata
replicate_strata = [
    "Metadata_heart_treatment",
    "Metadata_WellRow",
    "Metadata_WellCol",
    "Metadata_heart_number",
    "Metadata_cell_type",
    "Metadata_treatment",
]

# median-collapse single cells into per-well replicate profiles and write
# the result directly to parquet
aggregate(
    population_df=cfret_df.to_pandas(),
    strata=replicate_strata,
    features=cfret_feats,
    operation="median",
    output_type="parquet",
    output_file=(aggregate_profiles_dir / "cfret_replicate_profiles.parquet").resolve(),
)


# Generating consensus profiles of the treatment and heart type

# In[5]:


# columns that define one consensus profile: the heart/treatment condition only
# (no well position), collapsing replicates into condition-level signatures
consensus_strata = [
    "Metadata_heart_treatment",
    "Metadata_cell_type",
    "Metadata_treatment",
]

# median-collapse all cells of each condition and write the result to parquet
aggregate(
    population_df=cfret_df.to_pandas(),
    strata=consensus_strata,
    features=cfret_feats,
    operation="median",
    output_type="parquet",
    output_file=(aggregate_profiles_dir / "cfret_consensus_profiles.parquet").resolve(),
)
Loading