From a19ed167b208b04740b603f52d499c937474c174 Mon Sep 17 00:00:00 2001 From: MrBones1102 Date: Tue, 14 Apr 2026 11:10:04 -0700 Subject: [PATCH 01/10] update dependencies --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 6572865..7053fd9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,7 +26,7 @@ dependencies = [ "networkx>=3.3", "gseapy~=1.1.6", "statsmodels>=0.14.5", - "scipy<=1.15.3", + "scipy<=1.16", ] From a886f8facae28d6b2609bb80a1d49929e818f431 Mon Sep 17 00:00:00 2001 From: MrBones1102 Date: Tue, 14 Apr 2026 11:10:22 -0700 Subject: [PATCH 02/10] drop-in pcr replacement --- pf2/predict.py | 185 ++++++++++++++++++++++++++----------------------- 1 file changed, 100 insertions(+), 85 deletions(-) diff --git a/pf2/predict.py b/pf2/predict.py index 6b8ca48..0800ec9 100644 --- a/pf2/predict.py +++ b/pf2/predict.py @@ -1,177 +1,180 @@ import pandas as pd import numpy as np -from sklearn.cross_decomposition import PLSRegression +from sklearn.decomposition import PCA +from sklearn.linear_model import LogisticRegression from sklearn.metrics import accuracy_score, roc_auc_score from sklearn.model_selection import StratifiedGroupKFold, StratifiedKFold +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import StandardScaler import anndata + SKF = StratifiedKFold(n_splits=10) SGKF = StratifiedGroupKFold(n_splits=10) -def run_plsr( +def run_pca_lr( data: pd.DataFrame, labels: pd.DataFrame, proba: bool = False, n_components: int = 1 -) -> tuple[pd.Series, PLSRegression]: +) -> tuple[pd.Series, Pipeline]: """ - Predicts labels via PLSR cross-validation. + Predicts labels via PCA + logistic regression cross-validation. Args: data (pd.DataFrame): data to predict - labels (pd.Series): classification labels - proba (bool, default:False): return probability of prediction - n_components (int, default:1): number of PLSR components to use + labels (pd.DataFrame): classification labels with binary_outcome and patient_id + proba (bool, default:False): return probability of positive class + n_components (int, default:1): number of PCA components to use Returns: - predicted (pd.Series): predicted mortality for patients; if proba is - True, returns probabilities of mortality - plsr (PLSRegression): fitted PLSR model + predicted (pd.Series): predicted mortality (or probabilities if proba=True) + pipeline (Pipeline): fitted PCA + LR pipeline """ if not isinstance(data, pd.DataFrame): data = pd.DataFrame(data) - plsr = PLSRegression(n_components=n_components, scale=True, max_iter=int(1e5)) + pipeline = Pipeline([ + ("scaler", StandardScaler()), + ("pca", PCA(n_components=n_components)), + ("lr", LogisticRegression(max_iter=int(1e5))), + ]) - probabilities = pd.Series(0, dtype=float, index=data.index) + probabilities = pd.Series(0.0, dtype=float, index=data.index) for train_index, test_index in SGKF.split( data, labels.loc[:, "binary_outcome"], - labels.loc[:, "patient_id"] + labels.loc[:, "patient_id"], ): - train_group_data = data.iloc[train_index, :] + train_data = data.iloc[train_index] train_labels = labels.iloc[train_index].loc[:, "binary_outcome"] - test_group_data = data.iloc[test_index] - plsr.fit(train_group_data, train_labels) - probabilities.iloc[test_index] = plsr.predict(test_group_data) + test_data = data.iloc[test_index] + pipeline.fit(train_data, train_labels) + probabilities.iloc[test_index] = pipeline.predict_proba(test_data)[:, 1] - plsr.fit(data, labels.loc[:, "binary_outcome"]) - plsr.coef_ = pd.Series(plsr.coef_.squeeze(), index=data.columns) + pipeline.fit(data, labels.loc[:, "binary_outcome"]) if proba: - return probabilities, plsr - + return probabilities, pipeline else: - predicted = probabilities.round().astype(int) - return predicted, plsr + predicted = (probabilities > 0.5).astype(int) + return predicted, pipeline + - def predict_mortality_all( - X: anndata.AnnData, data: pd.DataFrame, proba: bool = False, n_components=1, bulk=False + X: anndata.AnnData, + data: pd.DataFrame, + proba: bool = False, + n_components=1, + bulk=False, ): """ Predicts mortality via cross-validation without breaking up by status. Parameters: - data (pd.DataFrame): data to predict - meta (pd.DataFrame): patient meta-data + X (anndata.AnnData): factorization results + data (pd.DataFrame): patient meta-data proba (bool, default:False): return probability of prediction + n_components (int, default:1): number of PCA components + bulk (bool, default:False): use bulk features instead of Pf2 components Returns: if proba: - probabilities (pd.Series): predicted probability of mortality for - patients + probabilities (pd.Series): predicted probability of mortality labels (pd.Series): classification targets else: accuracy (float): prediction accuracy labels (pd.Series): classification targets - model: fitted PLSR models + model: fitted pipeline """ if not isinstance(data, pd.DataFrame): data = pd.DataFrame(data) cond_fact_meta_df = data[data["patient_category"] != "Non-Pneumonia Control"] - + labels = cond_fact_meta_df.loc[:, ["binary_outcome", "patient_id"]] predictions = pd.Series(index=cond_fact_meta_df.index) - + + n_cmps = X.uns["Pf2_A"].shape[1] if bulk is False: - predictions[:], all_plsr = run_plsr( - cond_fact_meta_df[[f"Cmp. {i}" for i in np.arange(1, X.uns["Pf2_A"].shape[1] + 1)]], + predictions[:], model = run_pca_lr( + cond_fact_meta_df[[f"Cmp. {i}" for i in np.arange(1, n_cmps + 1)]], labels, proba=proba, n_components=n_components ) else: - predictions[:], all_plsr = run_plsr( + predictions[:], model = run_pca_lr( cond_fact_meta_df.iloc[:, :-3], labels, proba=proba, n_components=n_components ) - + if proba: return predictions, labels.loc[:, "binary_outcome"] - else: predicted = predictions.round().astype(int) return ( accuracy_score(labels.loc[:, "binary_outcome"], predicted), labels.loc[:, "binary_outcome"], - all_plsr + model ) - + def predict_mortality( X: anndata.AnnData, data: pd.DataFrame, n_components: int = 1, proba: bool = False -) -> tuple[pd.Series, pd.Series, tuple[PLSRegression, PLSRegression]]: +) -> tuple[pd.Series, pd.Series, tuple[Pipeline, Pipeline]]: """ - Predicts mortality via cross-validation. + Predicts mortality via cross-validation, separately for C19 and nC19. Parameters: X (anndata.AnnData): factorization results data (pd.DataFrame): patient meta-data - n_components (int, default:1): number of PLS components to use + n_components (int, default:1): number of PCA components proba (bool, default:False): return probability of prediction Returns: - predictions (pd.Series): if proba, probabilities of mortality for each - sample; else, predicted mortality outcome + predictions (pd.Series): if proba, probabilities; else predicted outcome labels (pd.Series): classification targets - models (tuple[COVID, Non-COVID]): fitted PLSR models + models (tuple[C19, nC19]): fitted pipelines """ if not isinstance(data, pd.DataFrame): data = pd.DataFrame(data) cond_fact_meta_df = data.loc[ - data.loc[:, "patient_category"] != "Non-Pneumonia Control", - : + data.loc[:, "patient_category"] != "Non-Pneumonia Control", : ] - + labels = cond_fact_meta_df.loc[ - :, - ["binary_outcome", "patient_id"] - ].astype(int) - - covid_data = cond_fact_meta_df.loc[cond_fact_meta_df.loc[:, "patient_category"] == "COVID-19", :] - covid_labels = labels.loc[ - cond_fact_meta_df.loc[:, "patient_category"] == "COVID-19", - : - ] - nc_data = cond_fact_meta_df.loc[cond_fact_meta_df.loc[:, "patient_category"] != "COVID-19", :] - nc_labels = labels.loc[ - cond_fact_meta_df.loc[:, "patient_category"] != "COVID-19", - : - ] + :, ["binary_outcome", "patient_id"] + ].astype(int) + + c19_mask = cond_fact_meta_df.loc[:, "patient_category"] == "COVID-19" + n_cmps = X.uns["Pf2_A"].shape[1] + cmps = [f"Cmp. {i}" for i in np.arange(1, n_cmps + 1)] + + covid_data = cond_fact_meta_df.loc[c19_mask, :] + covid_labels = labels.loc[c19_mask, :] + nc_data = cond_fact_meta_df.loc[~c19_mask, :] + nc_labels = labels.loc[~c19_mask, :] predictions = pd.Series(index=cond_fact_meta_df.index) - predictions.loc[cond_fact_meta_df.loc[:, "patient_category"] == "COVID-19"], c_plsr = run_plsr( - covid_data[[f"Cmp. {i}" for i in np.arange(1, X.uns["Pf2_A"].shape[1] + 1)]], - covid_labels, proba=proba, n_components=n_components + predictions.loc[c19_mask], c_model = run_pca_lr( + covid_data[cmps], covid_labels, proba=proba, n_components=n_components ) - predictions.loc[cond_fact_meta_df.loc[:, "patient_category"] != "COVID-19"], nc_plsr = run_plsr( - nc_data[[f"Cmp. {i}" for i in np.arange(1, X.uns["Pf2_A"].shape[1] + 1)]], - nc_labels, proba=proba, n_components=n_components + predictions.loc[~c19_mask], nc_model = run_pca_lr( + nc_data[cmps], nc_labels, proba=proba, n_components=n_components ) return ( predictions, labels.loc[:, "binary_outcome"].squeeze(), - (c_plsr, nc_plsr) + (c_model, nc_model) ) def plsr_acc_proba(X, patient_factor_matrix, n_components=1, roc_auc=True): - """Runs PLSR and obtains average prediction accuracy""" + """Runs PCA+LR and obtains average prediction accuracy""" acc_df = pd.DataFrame(columns=["Overall", "C19", "nC19"]) @@ -179,20 +182,32 @@ def plsr_acc_proba(X, patient_factor_matrix, n_components=1, roc_auc=True): X, patient_factor_matrix, n_components=n_components, proba=True ) + c19_mask = patient_factor_matrix.loc[:, "patient_category"] == "COVID-19" + nc19_mask = patient_factor_matrix.loc[:, "patient_category"] != "COVID-19" + if roc_auc: - score = roc_auc_score + covid_acc = roc_auc_score( + labels_all.loc[c19_mask].to_numpy().astype(int), + probabilities_all.loc[c19_mask], + ) + nc_acc = roc_auc_score( + labels_all.loc[nc19_mask].to_numpy().astype(int), + probabilities_all.loc[nc19_mask], + ) + acc = roc_auc_score(labels_all.to_numpy().astype(int), probabilities_all) else: - score = accuracy_score - - covid_acc = score( - labels_all.loc[patient_factor_matrix.loc[:, "patient_category"] == "COVID-19"].to_numpy().astype(int), - probabilities_all.loc[patient_factor_matrix.loc[:, "patient_category"] == "COVID-19"].round().astype(int), - ) - nc_acc = score( - labels_all.loc[patient_factor_matrix.loc[:, "patient_category"] != "COVID-19"].to_numpy().astype(int), - probabilities_all.loc[patient_factor_matrix.loc[:, "patient_category"] != "COVID-19"].round().astype(int), - ) - acc = score(labels_all.to_numpy().astype(int), probabilities_all.round().astype(int)) + covid_acc = accuracy_score( + labels_all.loc[c19_mask].to_numpy().astype(int), + probabilities_all.loc[c19_mask].round().astype(int), + ) + nc_acc = accuracy_score( + labels_all.loc[nc19_mask].to_numpy().astype(int), + probabilities_all.loc[nc19_mask].round().astype(int), + ) + acc = accuracy_score( + labels_all.to_numpy().astype(int), + probabilities_all.round().astype(int), + ) acc_df.loc[0, :] = [acc, covid_acc, nc_acc] @@ -200,10 +215,10 @@ def plsr_acc_proba(X, patient_factor_matrix, n_components=1, roc_auc=True): def plsr_acc(X, patient_factor_matrix, n_components=1): - """Runs PLSR and obtains average prediction accuracy for C19 and nC19""" + """Runs PCA+LR and returns labels and fitted models for C19 and nC19""" - _, labels, [c19_plsr, nc19_plsr] = predict_mortality(X, + _, labels, [c19_model, nc19_model] = predict_mortality(X, patient_factor_matrix, n_components=n_components, ) - return labels, [c19_plsr, nc19_plsr] + return labels, [c19_model, nc19_model] From 58e452e5cbf9049ba7fcf40f42c8a2614da64d92 Mon Sep 17 00:00:00 2001 From: MrBones1102 Date: Tue, 14 Apr 2026 14:58:27 -0700 Subject: [PATCH 03/10] rename funcs --- pf2/predict.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pf2/predict.py b/pf2/predict.py index 0800ec9..f29d461 100644 --- a/pf2/predict.py +++ b/pf2/predict.py @@ -173,7 +173,7 @@ def predict_mortality( ) -def plsr_acc_proba(X, patient_factor_matrix, n_components=1, roc_auc=True): +def pca_acc_proba(X, patient_factor_matrix, n_components=1, roc_auc=True): """Runs PCA+LR and obtains average prediction accuracy""" acc_df = pd.DataFrame(columns=["Overall", "C19", "nC19"]) @@ -214,7 +214,7 @@ def plsr_acc_proba(X, patient_factor_matrix, n_components=1, roc_auc=True): return acc_df -def plsr_acc(X, patient_factor_matrix, n_components=1): +def pca_acc(X, patient_factor_matrix, n_components=1): """Runs PCA+LR and returns labels and fitted models for C19 and nC19""" _, labels, [c19_model, nc19_model] = predict_mortality(X, From c846f0c09b2b449c67569e59cf51851f8396b7fa Mon Sep 17 00:00:00 2001 From: MrBones1102 Date: Tue, 14 Apr 2026 14:59:04 -0700 Subject: [PATCH 04/10] refactor FigureA3b_c --- pf2/figures/figureA3b_c.py | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/pf2/figures/figureA3b_c.py b/pf2/figures/figureA3b_c.py index 48517e7..49ec15d 100644 --- a/pf2/figures/figureA3b_c.py +++ b/pf2/figures/figureA3b_c.py @@ -6,7 +6,7 @@ import anndata import seaborn as sns from ..data_import import condition_factors_meta, add_obs, combine_cell_types -from ..predict import predict_mortality_all, plsr_acc_proba +from ..predict import predict_mortality_all, pca_acc_proba from .common import subplotLabel, getSetup from sklearn.metrics import RocCurveDisplay @@ -25,19 +25,19 @@ def makeFigure(): roc_auc = [False, True] for i in range(2): - plsr_acc_df = pd.DataFrame([]) - for j in range(3): - df = plsr_acc_proba( + pca_acc_df = pd.DataFrame([]) + for j in range(5): + df = pca_acc_proba( X, cond_fact_meta_df, n_components=j + 1, roc_auc=roc_auc[i] ) df["Component"] = j + 1 - plsr_acc_df = pd.concat([plsr_acc_df, df], axis=0) + pca_acc_df = pd.concat([pca_acc_df, df], axis=0) - plsr_acc_df = plsr_acc_df.melt( + pca_acc_df = pca_acc_df.melt( id_vars="Component", var_name="Category", value_name="Accuracy" ) sns.barplot( - data=plsr_acc_df, x="Component", y="Accuracy", hue="Category", ax=ax[i], + data=pca_acc_df, x="Component", y="Accuracy", hue="Category", ax=ax[i], hue_order=["C19", "nC19", "Overall"] ) if roc_auc[i] is True: @@ -45,16 +45,22 @@ def makeFigure(): else: ax[i].set(ylim=[0, 1], ylabel="Prediction Accuracy") + # Find the top performing PCA models based on best total accuracy within a component + top_acc_idx = pca_acc_df.groupby("Component")["Accuracy"].idxmax().tolist() + pca_acc_df_top = pca_acc_df.loc[top_acc_idx].sort_values("Accuracy", ascending=False) + top_performing = pca_acc_df_top["Component"].tolist() + + # Get the top 2 performing PCA models and plot the ROC curve for each for i in range(2): - plot_plsr_auc_roc(X, cond_fact_meta_df, n_components=i + 1, ax=ax[i + 2]) - ax[i + 2].set(title=f"PLSR {i + 1} Components") + plot_pca_auc_roc(X, cond_fact_meta_df, n_components=top_performing[i], ax=ax[i + 2]) + ax[i + 2].set(title=f"PCA {top_performing[i]} Components") return f -def plot_plsr_auc_roc(X, patient_factor_matrix, n_components, ax): - """Runs PLSR and plots ROC AUC based on actual and prediction labels""" +def plot_pca_auc_roc(X, patient_factor_matrix, n_components, ax): + """Runs PCA and plots ROC AUC based on actual and prediction labels""" probabilities_all, labels_all = predict_mortality_all(X, patient_factor_matrix, n_components=n_components, proba=True) From ef126e955fdde4a7aebedc39fe79091894eb8bde Mon Sep 17 00:00:00 2001 From: MrBones1102 Date: Tue, 14 Apr 2026 15:37:59 -0700 Subject: [PATCH 05/10] refactor Figure3Ad_g, add configurable PC --- pf2/figures/figureA3d_g.py | 91 +++++++++++++++++--------------------- 1 file changed, 40 insertions(+), 51 deletions(-) diff --git a/pf2/figures/figureA3d_g.py b/pf2/figures/figureA3d_g.py index 53edc65..7d49360 100644 --- a/pf2/figures/figureA3d_g.py +++ b/pf2/figures/figureA3d_g.py @@ -2,14 +2,14 @@ Figure A3d_g """ +import anndata import numpy as np import pandas as pd -import anndata import seaborn as sns from ..data_import import condition_factors_meta -from ..predict import plsr_acc +from ..predict import pca_acc from .common import subplotLabel, getSetup -import seaborn as sns + def makeFigure(): """Get a list of the axis objects and create a figure.""" @@ -19,54 +19,47 @@ def makeFigure(): X = anndata.read_h5ad("/opt/northwest_bal/full_fitted.h5ad") cond_fact_meta_df = condition_factors_meta(X) - - labels, plsr_results_both = plsr_acc(X, cond_fact_meta_df, n_components=1) - plot_plsr_loadings(plsr_results_both, ax[0], ax[1]) + labels, pca_results_both = pca_acc(X, cond_fact_meta_df, n_components=3) + + plot_pca_loadings(pca_results_both, ax[0], ax[1]) ax[0].set(xlim=[-0.35, 0.35]) ax[1].set(xlim=[-0.35, 0.35]) - plot_plsr_scores(plsr_results_both, cond_fact_meta_df, labels, ax[2], ax[3]) + plot_pca_scores(pca_results_both, cond_fact_meta_df, labels, ax[2], ax[3]) ax[2].set(xlim=[-7, 7]) ax[3].set(xlim=[-9.5, 9.5]) return f -def plot_plsr_loadings(plsr_results, ax1, ax2): - """Runs PLSR and plots ROC AUC based on actual and prediction labels""" +def plot_pca_loadings(pca_results, ax1, ax2): + """Plots PCA component loadings for C19 and nC19 models.""" + TARGET_PC = 3 + TARGET_STR = f"PC {TARGET_PC}" ax = [ax1, ax2] type_of_data = ["C19", "nC19"] for i in range(2): - x_load = plsr_results[i].x_loadings_[:, 0] + pca = pca_results[i].named_steps["pca"] + + x_load = pca.components_[TARGET_PC - 1, :] if i == 1: - x_load =-1*x_load - df_xload = pd.DataFrame(data=x_load, columns=["PLSR 1"]) + x_load = -1 * x_load + df_xload = pd.DataFrame(data=x_load, columns=[TARGET_STR]) df_xload["Component"] = np.arange(df_xload.shape[0]) + 1 - print(df_xload.sort_values(by="PLSR 1")) - y_load = plsr_results[i].y_loadings_[0, 0] - if i == 1: - y_load =-1*y_load - df_yload = pd.DataFrame(data=[[y_load]], columns=["PLSR 1"]) - sns.swarmplot( - data=df_xload, - x="PLSR 1", - ax=ax[i], - color="k", - ) - sns.swarmplot( - data=df_yload, - x="PLSR 1", - ax=ax[i], - color="r", - + print(df_xload.sort_values(by=TARGET_STR)) + + sns.swarmplot(data=df_xload, x=TARGET_STR, ax=ax[i], color="k") + ax[i].set( + xlabel=TARGET_STR, ylabel="Pf2 Components", title=f"{type_of_data[i]}-loadings" ) - ax[i].set(xlabel="PLSR 1", ylabel="Pf2 Components", title=f"{type_of_data[i]}-loadings") -def plot_plsr_scores(plsr_results, cond_fact_meta_df, labels, ax1, ax2): - """Runs PLSR and plots ROC AUC based on actual and prediction labels""" +def plot_pca_scores(pca_results, cond_fact_meta_df, labels, ax1, ax2): + """Plots PCA scores for C19 and nC19 patients colored by mortality outcome.""" + TARGET_PC = 3 + TARGET_STR = f"PC {TARGET_PC}" ax = [ax1, ax2] type_of_data = ["C19", "nC19"] @@ -74,32 +67,28 @@ def plot_plsr_scores(plsr_results, cond_fact_meta_df, labels, ax1, ax2): cond_fact_meta_df.loc[:, "patient_category"] != "Non-Pneumonia Control", : ] + cmp_cols = [c for c in cond_fact_meta_df.columns if c.startswith("Cmp.")] + c19_mask = cond_fact_meta_df.loc[:, "patient_category"] == "COVID-19" + for i in range(2): - if i == 0: - score_labels = labels.loc[ - cond_fact_meta_df.loc[:, "patient_category"] == "COVID-19" - ] - else: - score_labels = labels.loc[ - cond_fact_meta_df.loc[:, "patient_category"] != "COVID-19" - ] + mask = c19_mask if i == 0 else ~c19_mask + score_labels = labels.loc[mask] + subset_data = cond_fact_meta_df.loc[mask, cmp_cols] - pal = sns.color_palette() - if i == 0: - numb1=0; numb2=2 - else: - numb1=1; numb2=3 - - x_scores = plsr_results[i].x_scores_[:, 0] + x_scores = pca_results[i][:-1].transform(subset_data)[:, TARGET_PC - 1] if i == 1: - x_scores =-1*x_scores - df_xscores = pd.DataFrame(data=x_scores, columns=["PLSR 1"]) + x_scores = -1 * x_scores + + pal = sns.color_palette() + numb1, numb2 = (0, 2) if i == 0 else (1, 3) + + df_xscores = pd.DataFrame(data=x_scores, columns=[TARGET_STR]) sns.swarmplot( data=df_xscores, - x="PLSR 1", + x=TARGET_STR, ax=ax[i], hue=score_labels.to_numpy(), palette=[pal[numb1], pal[numb2]], hue_order=[1, 0], ) - ax[i].set(xlabel="PLSR 1", ylabel="Samples", title=f"{type_of_data[i]}-scores") \ No newline at end of file + ax[i].set(xlabel=TARGET_STR, ylabel="Samples", title=f"{type_of_data[i]}-scores") From 9175ae6b0d7f42526b634f24bf8421e667fd1426 Mon Sep 17 00:00:00 2001 From: MrBones1102 Date: Wed, 15 Apr 2026 07:33:39 -0700 Subject: [PATCH 06/10] predict mortality defaults to 3 pca --- pf2/predict.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pf2/predict.py b/pf2/predict.py index f29d461..9dd3a0b 100644 --- a/pf2/predict.py +++ b/pf2/predict.py @@ -121,7 +121,7 @@ def predict_mortality_all( def predict_mortality( X: anndata.AnnData, data: pd.DataFrame, - n_components: int = 1, + n_components: int = 3, proba: bool = False ) -> tuple[pd.Series, pd.Series, tuple[Pipeline, Pipeline]]: """ From b039b3c440a4674976bc98a6a17e6218c18d5df4 Mon Sep 17 00:00:00 2001 From: MrBones1102 Date: Thu, 30 Apr 2026 14:39:36 -0700 Subject: [PATCH 07/10] 2d plot of scores --- pf2/figures/figureA3d_g.py | 48 +++++++++++++++++++++++++++++++++++--- 1 file changed, 45 insertions(+), 3 deletions(-) diff --git a/pf2/figures/figureA3d_g.py b/pf2/figures/figureA3d_g.py index 7d49360..26d6cd6 100644 --- a/pf2/figures/figureA3d_g.py +++ b/pf2/figures/figureA3d_g.py @@ -13,7 +13,7 @@ def makeFigure(): """Get a list of the axis objects and create a figure.""" - ax, f = getSetup((3, 7), (4, 1)) + ax, f = getSetup((6, 8), (3, 2)) subplotLabel(ax) X = anndata.read_h5ad("/opt/northwest_bal/full_fitted.h5ad") @@ -27,8 +27,8 @@ def makeFigure(): ax[1].set(xlim=[-0.35, 0.35]) plot_pca_scores(pca_results_both, cond_fact_meta_df, labels, ax[2], ax[3]) - ax[2].set(xlim=[-7, 7]) - ax[3].set(xlim=[-9.5, 9.5]) + + plot_pca_scores_2d(pca_results_both, cond_fact_meta_df, labels, ax[4], ax[5]) return f @@ -92,3 +92,45 @@ def plot_pca_scores(pca_results, cond_fact_meta_df, labels, ax1, ax2): hue_order=[1, 0], ) ax[i].set(xlabel=TARGET_STR, ylabel="Samples", title=f"{type_of_data[i]}-scores") + +def plot_pca_scores_2d(pca_results, cond_fact_meta_df, labels, ax1, ax2): + """Plots PCA scores of PC1 vs PC2 and PC1 vs PC3 for C19 and nC19 patients colored by mortality outcome.""" + ax = [ax1, ax2] + pc_pairs = [(0, 1), (0, 2)] + + cond_fact_meta_df = cond_fact_meta_df.loc[ + cond_fact_meta_df.loc[:, "patient_category"] != "Non-Pneumonia Control", : + ] + + cmp_cols = [c for c in cond_fact_meta_df.columns if c.startswith("Cmp.")] + c19_mask = cond_fact_meta_df.loc[:, "patient_category"] == "COVID-19" + + for i, (pc_x, pc_y) in enumerate(pc_pairs): + score_labels = labels.loc[c19_mask] + subset_data = cond_fact_meta_df.loc[c19_mask, cmp_cols] + + scores = pca_results[0][:-1].transform(subset_data)[:, :3] + + pal = sns.color_palette() + df_scores = pd.DataFrame( + { + f"PC {pc_x + 1}": scores[:, pc_x], + f"PC {pc_y + 1}": scores[:, pc_y], + "Outcome": score_labels.to_numpy(), + } + ) + sns.scatterplot( + data=df_scores, + x=f"PC {pc_x + 1}", + y=f"PC {pc_y + 1}", + hue="Outcome", + palette=[pal[0], pal[2]], + hue_order=[1, 0], + ax=ax[i], + ) + ax[i].set( + xlabel=f"PC {pc_x + 1}", + ylabel=f"PC {pc_y + 1}", + title="C19-scores 2D", + ) + From dda5433ec66759eb968a83d2ca96532c0a5d2fca Mon Sep 17 00:00:00 2001 From: MrBones1102 Date: Thu, 30 Apr 2026 14:39:59 -0700 Subject: [PATCH 08/10] add anova test of 5-component PCA model --- pf2/figures/figureA3b_c.py | 64 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 62 insertions(+), 2 deletions(-) diff --git a/pf2/figures/figureA3b_c.py b/pf2/figures/figureA3b_c.py index 49ec15d..fb1bf85 100644 --- a/pf2/figures/figureA3b_c.py +++ b/pf2/figures/figureA3b_c.py @@ -2,9 +2,14 @@ Figure A3b_c """ +import numpy as np import pandas as pd import anndata import seaborn as sns +from statsmodels.stats.anova import anova_lm +from statsmodels.formula.api import ols +from sklearn.decomposition import PCA +from sklearn.preprocessing import StandardScaler from ..data_import import condition_factors_meta, add_obs, combine_cell_types from ..predict import predict_mortality_all, pca_acc_proba from .common import subplotLabel, getSetup @@ -13,7 +18,7 @@ def makeFigure(): """Get a list of the axis objects and create a figure.""" - ax, f = getSetup((6, 6), (2, 2)) + ax, f = getSetup((9, 9), (3, 2)) subplotLabel(ax) X = anndata.read_h5ad("/opt/northwest_bal/full_fitted.h5ad") @@ -55,6 +60,10 @@ def makeFigure(): plot_pca_auc_roc(X, cond_fact_meta_df, n_components=top_performing[i], ax=ax[i + 2]) ax[i + 2].set(title=f"PCA {top_performing[i]} Components") + anova_df = pc_anova_analysis(cond_fact_meta_df) + plot_pc_anova_heatmap(anova_df, ax=ax[4]) + ax[5].set_visible(False) + return f @@ -78,4 +87,55 @@ def plot_pca_auc_roc(X, patient_factor_matrix, n_components, ax): ) RocCurveDisplay.from_predictions( labels_all.to_numpy().astype(int), probabilities_all, plot_chance_level=True, ax=ax, name="Overall" - ) \ No newline at end of file + ) + + +def pc_anova_analysis(cond_fact_meta_df, n_components=5): + """ANOVA for each PC against C19 outcome, nC19 outcome, and C19 vs nC19.""" + feature_cols = [c for c in cond_fact_meta_df.columns if c.startswith("Cmp.")] + df = cond_fact_meta_df[ + cond_fact_meta_df["patient_category"] != "Non-Pneumonia Control" + ].copy() + + pca_scores = PCA(n_components=n_components).fit_transform( + StandardScaler().fit_transform(df[feature_cols]) + ) + pc_df = pd.DataFrame( + pca_scores, + columns=[f"PC{i+1}" for i in range(n_components)], + index=df.index, + ) + pc_df["binary_outcome"] = df["binary_outcome"] + pc_df["patient_category"] = df["patient_category"] + pc_df["is_c19"] = (pc_df["patient_category"] == "COVID-19").astype(int) + + def _p(formula, data, term): + return anova_lm(ols(formula, data=data).fit(), typ=2).loc[term, "PR(>F)"] + + results = {} + for i in range(n_components): + pc = f"PC{i+1}" + c19 = pc_df[pc_df["patient_category"] == "COVID-19"].dropna( + subset=["binary_outcome"] + ) + nc19 = pc_df[pc_df["patient_category"] != "COVID-19"].dropna( + subset=["binary_outcome"] + ) + results[pc] = { + "C19 Outcome": _p(f"{pc} ~ C(binary_outcome)", c19, "C(binary_outcome)"), + "nC19 Outcome": _p(f"{pc} ~ C(binary_outcome)", nc19, "C(binary_outcome)"), + "C19 vs nC19": _p(f"{pc} ~ C(is_c19)", pc_df, "C(is_c19)"), + } + + return pd.DataFrame(results).T + + +def plot_pc_anova_heatmap(anova_df, ax): + """Heatmap of -log10(p-values) from per-PC ANOVA associations.""" + neg_log_p = -np.log10(anova_df.astype(float)) + sns.heatmap( + neg_log_p, ax=ax, cmap="YlOrRd", annot=True, fmt=".2f", + linewidths=0.5, cbar_kws={"label": "-log10(p-value)"}, + ) + ax.axhline(y=0, color="black", linewidth=0.5) + ax.set(title="PC Associations (ANOVA)", xlabel="", ylabel="") \ No newline at end of file From ce854a1184a59af9453900b15247169d62282bee Mon Sep 17 00:00:00 2001 From: MrBones1102 Date: Fri, 1 May 2026 11:23:40 -0700 Subject: [PATCH 09/10] Add SVM to 2D plots Co-authored-by: Copilot --- pf2/figures/figureA3d_g.py | 80 +++++++++++++++++++++++++++----------- 1 file changed, 58 insertions(+), 22 deletions(-) diff --git a/pf2/figures/figureA3d_g.py b/pf2/figures/figureA3d_g.py index 26d6cd6..76600d8 100644 --- a/pf2/figures/figureA3d_g.py +++ b/pf2/figures/figureA3d_g.py @@ -6,10 +6,24 @@ import numpy as np import pandas as pd import seaborn as sns +from sklearn.model_selection import cross_val_score +from sklearn.svm import SVC from ..data_import import condition_factors_meta from ..predict import pca_acc from .common import subplotLabel, getSetup +# Make modifying the target PCs easier while exploring +C19_TARGET = 3 +nC19_TARGET = 4 +C19_TARGET_STR = f"PC {C19_TARGET}" +nC19_TARGET_STR = f"PC {nC19_TARGET}" +target_pcs = {C19_TARGET_STR: C19_TARGET, nC19_TARGET_STR: nC19_TARGET} + +def get_nth_key(target_dict, n): + for i, key in enumerate(target_dict): + if i == n: + return key + raise IndexError("Index out of range") def makeFigure(): """Get a list of the axis objects and create a figure.""" @@ -20,7 +34,7 @@ def makeFigure(): cond_fact_meta_df = condition_factors_meta(X) - labels, pca_results_both = pca_acc(X, cond_fact_meta_df, n_components=3) + labels, pca_results_both = pca_acc(X, cond_fact_meta_df, n_components=5) plot_pca_loadings(pca_results_both, ax[0], ax[1]) ax[0].set(xlim=[-0.35, 0.35]) @@ -35,31 +49,27 @@ def makeFigure(): def plot_pca_loadings(pca_results, ax1, ax2): """Plots PCA component loadings for C19 and nC19 models.""" - TARGET_PC = 3 - TARGET_STR = f"PC {TARGET_PC}" ax = [ax1, ax2] type_of_data = ["C19", "nC19"] for i in range(2): pca = pca_results[i].named_steps["pca"] - x_load = pca.components_[TARGET_PC - 1, :] + x_load = pca.components_[target_pcs[get_nth_key(target_pcs, i)] - 1, :] if i == 1: x_load = -1 * x_load - df_xload = pd.DataFrame(data=x_load, columns=[TARGET_STR]) + df_xload = pd.DataFrame(data=x_load, columns=[get_nth_key(target_pcs, i)]) df_xload["Component"] = np.arange(df_xload.shape[0]) + 1 - print(df_xload.sort_values(by=TARGET_STR)) + print(df_xload.sort_values(by=get_nth_key(target_pcs, i))) - sns.swarmplot(data=df_xload, x=TARGET_STR, ax=ax[i], color="k") + sns.swarmplot(data=df_xload, x=get_nth_key(target_pcs, i), ax=ax[i], color="k") ax[i].set( - xlabel=TARGET_STR, ylabel="Pf2 Components", title=f"{type_of_data[i]}-loadings" + xlabel=get_nth_key(target_pcs, i), ylabel="Pf2 Components", title=f"{type_of_data[i]}-loadings" ) def plot_pca_scores(pca_results, cond_fact_meta_df, labels, ax1, ax2): """Plots PCA scores for C19 and nC19 patients colored by mortality outcome.""" - TARGET_PC = 3 - TARGET_STR = f"PC {TARGET_PC}" ax = [ax1, ax2] type_of_data = ["C19", "nC19"] @@ -75,28 +85,28 @@ def plot_pca_scores(pca_results, cond_fact_meta_df, labels, ax1, ax2): score_labels = labels.loc[mask] subset_data = cond_fact_meta_df.loc[mask, cmp_cols] - x_scores = pca_results[i][:-1].transform(subset_data)[:, TARGET_PC - 1] + x_scores = pca_results[i][:-1].transform(subset_data)[:, target_pcs[get_nth_key(target_pcs, i)] - 1] if i == 1: x_scores = -1 * x_scores pal = sns.color_palette() numb1, numb2 = (0, 2) if i == 0 else (1, 3) - df_xscores = pd.DataFrame(data=x_scores, columns=[TARGET_STR]) + df_xscores = pd.DataFrame(data=x_scores, columns=[get_nth_key(target_pcs, i)]) sns.swarmplot( data=df_xscores, - x=TARGET_STR, + x=get_nth_key(target_pcs, i), ax=ax[i], hue=score_labels.to_numpy(), palette=[pal[numb1], pal[numb2]], hue_order=[1, 0], ) - ax[i].set(xlabel=TARGET_STR, ylabel="Samples", title=f"{type_of_data[i]}-scores") + ax[i].set(xlabel=get_nth_key(target_pcs, i), ylabel="Samples", title=f"{type_of_data[i]}-scores") def plot_pca_scores_2d(pca_results, cond_fact_meta_df, labels, ax1, ax2): - """Plots PCA scores of PC1 vs PC2 and PC1 vs PC3 for C19 and nC19 patients colored by mortality outcome.""" + """Plots PCA scores 2D with RBF-SVM decision boundaries and cross-validated AUC.""" ax = [ax1, ax2] - pc_pairs = [(0, 1), (0, 2)] + pc_pairs = [(1, 2), (0, 2)] cond_fact_meta_df = cond_fact_meta_df.loc[ cond_fact_meta_df.loc[:, "patient_category"] != "Non-Pneumonia Control", : @@ -105,13 +115,39 @@ def plot_pca_scores_2d(pca_results, cond_fact_meta_df, labels, ax1, ax2): cmp_cols = [c for c in cond_fact_meta_df.columns if c.startswith("Cmp.")] c19_mask = cond_fact_meta_df.loc[:, "patient_category"] == "COVID-19" - for i, (pc_x, pc_y) in enumerate(pc_pairs): - score_labels = labels.loc[c19_mask] - subset_data = cond_fact_meta_df.loc[c19_mask, cmp_cols] + score_labels = labels.loc[c19_mask] + subset_data = cond_fact_meta_df.loc[c19_mask, cmp_cols] + scores = pca_results[0][:-1].transform(subset_data)[:, :3] + y_svm = score_labels.to_numpy().astype(int) - scores = pca_results[0][:-1].transform(subset_data)[:, :3] + pal = sns.color_palette() + + for i, (pc_x, pc_y) in enumerate(pc_pairs): + X_svm = np.column_stack([scores[:, pc_x], scores[:, pc_y]]) + + svm = SVC(kernel="rbf", probability=True) + svm.fit(X_svm, y_svm) + cv_auc = cross_val_score( + SVC(kernel="rbf", probability=True), + X_svm, y_svm, cv=5, scoring="roc_auc", + ).mean() + + # Decision boundary drawn before scatter so points sit on top + x_pad = (X_svm[:, 0].max() - X_svm[:, 0].min()) * 0.1 + y_pad = (X_svm[:, 1].max() - X_svm[:, 1].min()) * 0.1 + xx, yy = np.meshgrid( + np.linspace(X_svm[:, 0].min() - x_pad, X_svm[:, 0].max() + x_pad, 300), + np.linspace(X_svm[:, 1].min() - y_pad, X_svm[:, 1].max() + y_pad, 300), + ) + Z = svm.decision_function(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape) + # Z<0 → class 0 (Lived, pal[2]); Z>0 → class 1 (Died, pal[0]) + ax[i].contourf( + xx, yy, Z, levels=[-1000, 0, 1000], alpha=0.15, colors=[pal[2], pal[0]] + ) + ax[i].contour( + xx, yy, Z, levels=[0], colors="k", linewidths=1.5, linestyles="--" + ) - pal = sns.color_palette() df_scores = pd.DataFrame( { f"PC {pc_x + 1}": scores[:, pc_x], @@ -131,6 +167,6 @@ def plot_pca_scores_2d(pca_results, cond_fact_meta_df, labels, ax1, ax2): ax[i].set( xlabel=f"PC {pc_x + 1}", ylabel=f"PC {pc_y + 1}", - title="C19-scores 2D", + title=f"C19-scores 2D (CV AUC={cv_auc:.2f})", ) From 36edabb942d423b41e5eeb1ee72ac122333471dc Mon Sep 17 00:00:00 2001 From: MrBones1102 Date: Wed, 6 May 2026 11:29:20 -0700 Subject: [PATCH 10/10] peform SVM for full 5-component model Co-authored-by: Copilot --- pf2/figures/figureA3d_g.py | 34 +++++++++++++++++++--------------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/pf2/figures/figureA3d_g.py b/pf2/figures/figureA3d_g.py index 76600d8..8005743 100644 --- a/pf2/figures/figureA3d_g.py +++ b/pf2/figures/figureA3d_g.py @@ -19,6 +19,7 @@ nC19_TARGET_STR = f"PC {nC19_TARGET}" target_pcs = {C19_TARGET_STR: C19_TARGET, nC19_TARGET_STR: nC19_TARGET} +# Helper function to get keys by index since dicts don't support direct indexing def get_nth_key(target_dict, n): for i, key in enumerate(target_dict): if i == n: @@ -106,7 +107,7 @@ def plot_pca_scores(pca_results, cond_fact_meta_df, labels, ax1, ax2): def plot_pca_scores_2d(pca_results, cond_fact_meta_df, labels, ax1, ax2): """Plots PCA scores 2D with RBF-SVM decision boundaries and cross-validated AUC.""" ax = [ax1, ax2] - pc_pairs = [(1, 2), (0, 2)] + pc_pairs = [(1, 2)] # PC2 vs. PC3 cond_fact_meta_df = cond_fact_meta_df.loc[ cond_fact_meta_df.loc[:, "patient_category"] != "Non-Pneumonia Control", : @@ -117,29 +118,32 @@ def plot_pca_scores_2d(pca_results, cond_fact_meta_df, labels, ax1, ax2): score_labels = labels.loc[c19_mask] subset_data = cond_fact_meta_df.loc[c19_mask, cmp_cols] - scores = pca_results[0][:-1].transform(subset_data)[:, :3] + scores = pca_results[0][:-1].transform(subset_data)[:, :5] y_svm = score_labels.to_numpy().astype(int) + # Perform SVM on 5-component model for total accuracy reporting + cv_auc_5d = cross_val_score( + SVC(kernel="rbf", probability=True), + scores, y_svm, cv=5, scoring="roc_auc", + ).mean() + pal = sns.color_palette() + # Plot just pairs of PCs with SVM decision boundaries for visualization for i, (pc_x, pc_y) in enumerate(pc_pairs): - X_svm = np.column_stack([scores[:, pc_x], scores[:, pc_y]]) + X_2d = np.column_stack([scores[:, pc_x], scores[:, pc_y]]) - svm = SVC(kernel="rbf", probability=True) - svm.fit(X_svm, y_svm) - cv_auc = cross_val_score( - SVC(kernel="rbf", probability=True), - X_svm, y_svm, cv=5, scoring="roc_auc", - ).mean() + svm_2d = SVC(kernel="rbf", probability=True) + svm_2d.fit(X_2d, y_svm) # Decision boundary drawn before scatter so points sit on top - x_pad = (X_svm[:, 0].max() - X_svm[:, 0].min()) * 0.1 - y_pad = (X_svm[:, 1].max() - X_svm[:, 1].min()) * 0.1 + x_pad = (X_2d[:, 0].max() - X_2d[:, 0].min()) * 0.1 + y_pad = (X_2d[:, 1].max() - X_2d[:, 1].min()) * 0.1 xx, yy = np.meshgrid( - np.linspace(X_svm[:, 0].min() - x_pad, X_svm[:, 0].max() + x_pad, 300), - np.linspace(X_svm[:, 1].min() - y_pad, X_svm[:, 1].max() + y_pad, 300), + np.linspace(X_2d[:, 0].min() - x_pad, X_2d[:, 0].max() + x_pad, 300), + np.linspace(X_2d[:, 1].min() - y_pad, X_2d[:, 1].max() + y_pad, 300), ) - Z = svm.decision_function(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape) + Z = svm_2d.decision_function(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape) # Z<0 → class 0 (Lived, pal[2]); Z>0 → class 1 (Died, pal[0]) ax[i].contourf( xx, yy, Z, levels=[-1000, 0, 1000], alpha=0.15, colors=[pal[2], pal[0]] @@ -167,6 +171,6 @@ def plot_pca_scores_2d(pca_results, cond_fact_meta_df, labels, ax1, ax2): ax[i].set( xlabel=f"PC {pc_x + 1}", ylabel=f"PC {pc_y + 1}", - title=f"C19-scores 2D (CV AUC={cv_auc:.2f})", + title=f"C19-scores 2D (5-PC SVM CV AUC={cv_auc_5d:.2f})", )