-
Notifications
You must be signed in to change notification settings - Fork 1
Cfret pilot analysis #60
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
ca5b704
9876715
b3a30de
395f1b2
c146135
b34981d
ae59428
d2ffdaa
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -39,18 +39,14 @@ | |
| treatment_col = "Metadata_treatment" | ||
| treatment_heart_col = "Metadata_treatment_and_heart" | ||
|
|
||
| # parameters used for clustering optimization | ||
| # parameter grid for clustering optimization | ||
| cfret_pilot_cluster_param_grid = { | ||
| # Clustering resolution: how granular the clusters should be | ||
| "cluster_resolution": {"type": "float", "low": 0.1, "high": 2.2}, | ||
| # Number of neighbors for graph construction | ||
| "n_neighbors": {"type": "int", "low": 5, "high": 100}, | ||
| # Clustering algorithm | ||
| "cluster_method": {"type": "categorical", "choices": ["leiden"]}, | ||
| # Distance metric for neighbor computation | ||
| "cluster_resolution": {"type": "float", "low": 0.05, "high": 3.0}, | ||
| "n_neighbors": {"type": "int", "low": 10, "high": 100}, | ||
| "cluster_method": {"type": "categorical", "choices": ["leiden", "louvain"]}, | ||
| "neighbor_distance_metric": { | ||
| "type": "categorical", | ||
| "choices": ["euclidean", "cosine", "manhattan"], | ||
| "choices": ["cosine", "euclidean", "manhattan"], | ||
| }, | ||
| } | ||
|
|
||
|
|
@@ -72,9 +68,25 @@ | |
| ).resolve(strict=True) | ||
|
|
||
| # make results dir | ||
| results_dir = pathlib.Path("./results/cfret-pilot").resolve() | ||
| results_dir = pathlib.Path("./results").resolve() | ||
| results_dir.mkdir(parents=True, exist_ok=True) | ||
|
|
||
| # set signatures results dir | ||
| signatures_results_dir = (results_dir / "signatures").resolve() | ||
| signatures_results_dir.mkdir(parents=True, exist_ok=True) | ||
|
|
||
| # set cluster labels results dir | ||
| cluster_labels_results_dir = (results_dir / "clusters").resolve() | ||
| cluster_labels_results_dir.mkdir(parents=True, exist_ok=True) | ||
|
|
||
| # set transformed data (e.g., PCA profiles) results dir | ||
| transformed_results_dir = (results_dir / "transformed-data").resolve() | ||
| transformed_results_dir.mkdir(parents=True, exist_ok=True) | ||
|
|
||
| # set phenotypic scores results dir | ||
| phenotypic_scores_results_dir = (results_dir / "phenotypic_scores").resolve() | ||
| phenotypic_scores_results_dir.mkdir(parents=True, exist_ok=True) | ||
|
|
||
|
|
||
| # Data preprocessing | ||
| # - | ||
|
|
@@ -124,13 +136,15 @@ | |
| cfret_df.head() | ||
|
|
||
|
|
||
| # Display the treatments and number of cells per heart-treatment combination | ||
|
|
||
| # In[5]: | ||
|
|
||
|
|
||
| # show how many cells per treatment | ||
| # shows the number of cells per treatment that will be clustered. | ||
| cells_per_treatment_counts = ( | ||
| cfret_df.group_by(treatment_heart_col).count().sort(treatment_heart_col) | ||
| cfret_df.group_by(treatment_heart_col).len().sort(treatment_heart_col) | ||
| ) | ||
| cells_per_treatment_counts | ||
|
|
||
|
|
@@ -143,7 +157,7 @@ | |
|
|
||
|
|
||
| # setting output paths | ||
| signatures_outpath = (results_dir / "cfret_pilot_signatures.json").resolve() | ||
| signatures_outpath = (signatures_results_dir / "cfret_pilot_signatures.json").resolve() | ||
|
|
||
| if signatures_outpath.exists(): | ||
| print("Signatures already exist, skipping this step.") | ||
|
|
@@ -169,7 +183,7 @@ | |
| json.dump({"on": on_sigs, "off": off_sigs}, f, indent=4) | ||
|
|
||
|
|
||
| # Search for heterogenous effects for each treatment | ||
| # Transform raw data into PCA components that explains 95% of the variance. | ||
|
|
||
| # In[7]: | ||
|
|
||
|
|
@@ -180,12 +194,28 @@ | |
| meta_features=cfret_meta, | ||
| morph_features=cfret_feats, | ||
| var_explained=0.95, | ||
| seed=0, | ||
| random_state=0, | ||
| ) | ||
|
|
||
| # save PCA transformed data | ||
| pca_cfret_outpath = ( | ||
| transformed_results_dir / "cfret_pca_profiles_95var.parquet" | ||
| ).resolve() | ||
| pca_cfret_df.write_parquet(pca_cfret_outpath) | ||
|
|
||
| # update cfret_feats because PCA was applied | ||
| cfret_pca_feats = pca_cfret_df.drop(cfret_meta).columns | ||
|
|
||
| # save feature space | ||
| with open(transformed_results_dir / "cfret_pca_feature_space.json", "w") as f: | ||
| json.dump( | ||
| {"metadata-features": cfret_meta, "morphological-features": cfret_pca_feats}, | ||
| f, | ||
| indent=4, | ||
| ) | ||
|
|
||
|
|
||
| # This section applies clustering to identify distinct cell populations within each treatment condition. The clustering is optimized using Optuna to find the best hyperparameters (resolution, number of neighbors, and distance metric) that maximize separation of cell populations while maintaining biological relevance. | ||
|
|
||
| # In[ ]: | ||
|
|
||
|
|
@@ -197,7 +227,9 @@ | |
|
|
||
| # check if the cluster labels already exist; if so just load the labels and skip optimization | ||
| # if not run optimization | ||
| cluster_labels_output = (results_dir / "cfret_pilot_cluster_labels.parquet").resolve() | ||
| cluster_labels_output = ( | ||
| cluster_labels_results_dir / "cfret_pilot_cluster_labels.parquet" | ||
| ).resolve() | ||
| if cluster_labels_output.exists(): | ||
| print("Cluster labels already exist, skipping clustering optimization.") | ||
| cfret_cluster_labels_df = pl.read_parquet(cluster_labels_output) | ||
|
|
@@ -222,11 +254,15 @@ | |
| cfret_cluster_labels_df.write_parquet(cluster_labels_output) | ||
|
|
||
| # write best params as a json file | ||
| with open(results_dir / "cfret_pilot_best_clustering_params.json", "w") as f: | ||
| with open( | ||
| cluster_labels_results_dir / "cfret_pilot_best_clustering_params.json", "w" | ||
| ) as f: | ||
| json.dump(cfret_best_params, f, indent=4) | ||
|
|
||
|
|
||
| # In[ ]: | ||
| # This section measures the phenotypic distance between each treatment and the reference control (DMSO_heart_11) using the on and off signatures. The phenotypic scores are then used to rank treatments and identify top-ranking compounds based on their morphological activity. | ||
|
|
||
| # In[9]: | ||
|
|
||
|
|
||
| # merge cfret_df with the cluster labels and make sure to drop duplicate Metadata_cell_id columns | ||
|
|
@@ -241,13 +277,15 @@ | |
| raise ValueError("Merged DataFrame has different number of rows!") | ||
|
|
||
|
|
||
| # In[ ]: | ||
| # In[10]: | ||
|
|
||
|
|
||
| # setting output paths | ||
| treatment_dist_scores_outpath = ( | ||
| results_dir / "treatment_phenotypic_scores.csv" | ||
| phenotypic_scores_results_dir / "treatment_phenotypic_scores.csv" | ||
| ).resolve() | ||
|
|
||
| # calculate phenotypic distance scores | ||
| if treatment_dist_scores_outpath.exists(): | ||
| print("Treatment phenotypic distance scores already exist, skipping this step.") | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. You might want to perform a content check to ensure everything needed is present, so that if you include or exclude samples earlier, the pipeline knows whether outputs need to be regenerated. Alternatively, a force-re-run flag would be handy. |
||
| treatment_heart_dist_scores = pl.read_csv(treatment_dist_scores_outpath) | ||
|
|
@@ -265,19 +303,15 @@ | |
| treatment_heart_dist_scores.write_csv(treatment_dist_scores_outpath) | ||
|
|
||
|
|
||
| # In[12]: | ||
|
|
||
| # In[11]: | ||
|
|
||
| treatment_heart_dist_scores | ||
|
|
||
|
|
||
| # In[13]: | ||
|
|
||
|
|
||
| # setting outptut paths | ||
| # setting output paths | ||
| treatment_heart_rankings_outpath = ( | ||
| results_dir / "treatment_heart_rankings.csv" | ||
| phenotypic_scores_results_dir / "treatment_heart_rankings.csv" | ||
| ).resolve() | ||
|
|
||
| # identify hits based on distance scores | ||
| if treatment_heart_rankings_outpath.exists(): | ||
| print("Treatment heart rankings already exist, skipping this step.") | ||
| treatment_heart_rankings = pl.read_csv(treatment_heart_rankings_outpath) | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,104 @@ | ||
| #!/usr/bin/env python | ||
|
|
||
| # # 2. Generating Aggregate Profiles | ||
| # | ||
| # This notebook transforms single-cell morphological profiles from the CFReT pilot dataset into summary representations for downstream analysis. Aggregation reduces noise and enables robust comparisons between experimental conditions by collapsing hundreds or thousands of single-cell measurements into representative profiles. | ||
| # | ||
| # Two levels of aggregation are generated: | ||
| # 1. **Replicate-level profiles**: Aggregate cells by well position, heart number, cell type, and treatment to create technical replicate profiles | ||
| # 2. **Consensus profiles**: Further aggregate replicates by heart type and treatment to generate condition-level consensus signatures | ||
| # | ||
| # Here we use `pycytominer.aggregate()` with median aggregation to generate the two profile types described above. The output profiles are then saved as parquet files. | ||
|
|
||
| # In[1]: | ||
|
|
||
|
|
||
| import pathlib | ||
| import sys | ||
|
|
||
| import polars as pl | ||
| from pycytominer import aggregate | ||
|
|
||
| sys.path.append("../../") | ||
| from utils.data_utils import split_meta_and_features | ||
|
|
||
| # Setting input and output paths | ||
|
|
||
| # In[2]: | ||
|
|
||
|
|
||
| # setting data path for cfret-pilot dataset | ||
| cfret_profiles_path = pathlib.Path( | ||
| "../0.download-data/data/sc-profiles/cfret/localhost230405150001_sc_feature_selected.parquet" | ||
| ).resolve(strict=True) | ||
|
|
||
| # set results directory path | ||
| results_dir = pathlib.Path("./results").resolve() | ||
| results_dir.mkdir(exist_ok=True) | ||
|
|
||
| # make aggregate profile directory | ||
| aggregate_profiles_dir = results_dir / "aggregate_profiles" | ||
| aggregate_profiles_dir.mkdir(exist_ok=True) | ||
|
|
||
|
|
||
| # In[3]: | ||
|
|
||
|
|
||
| # load in the cfret-pilot dataset | ||
| cfret_df = pl.read_parquet(cfret_profiles_path) | ||
|
|
||
| # add a column that combines the cell type and the treatment (joined with "_") | ||
| cfret_df = cfret_df.with_columns( | ||
| pl.concat_str( | ||
| [ | ||
| pl.col("Metadata_cell_type"), | ||
| pl.col("Metadata_treatment"), | ||
| ], | ||
| separator="_", | ||
| ).alias("Metadata_heart_treatment") | ||
| ) | ||
|
|
||
| # split feature space | ||
| cfret_meta, cfret_feats = split_meta_and_features(cfret_df) | ||
|
|
||
| # display | ||
| print(cfret_df.shape) | ||
| cfret_df.head() | ||
|
|
||
|
|
||
| # Generating aggregate profiles at the replicate level | ||
|
|
||
| # In[4]: | ||
|
|
||
|
|
||
| aggregate( | ||
| population_df=cfret_df.to_pandas(), | ||
| strata=[ | ||
| "Metadata_heart_treatment", | ||
| "Metadata_WellRow", | ||
| "Metadata_WellCol", | ||
| "Metadata_heart_number", | ||
| "Metadata_cell_type", | ||
| "Metadata_treatment", | ||
| ], | ||
| features=cfret_feats, | ||
| operation="median", | ||
| output_type="parquet", | ||
| output_file=(aggregate_profiles_dir / "cfret_replicate_profiles.parquet").resolve(), | ||
| ) | ||
|
|
||
|
|
||
| # Generating consensus profiles of the treatment and heart type | ||
|
|
||
| # In[5]: | ||
|
|
||
|
|
||
| # aggregating profiles by heart and treatment | ||
| aggregate( | ||
| population_df=cfret_df.to_pandas(), | ||
| strata=["Metadata_heart_treatment", "Metadata_cell_type", "Metadata_treatment"], | ||
| features=cfret_feats, | ||
| operation="median", | ||
| output_type="parquet", | ||
| output_file=(aggregate_profiles_dir / "cfret_consensus_profiles.parquet").resolve(), | ||
| ) |
Uh oh!
There was an error while loading. Please reload this page.