From a2091f0380dbeeb27dec12ec0dc7c3c806f44e1a Mon Sep 17 00:00:00 2001 From: henning Date: Wed, 12 Jun 2024 17:33:33 +0200 Subject: [PATCH 01/12] Improve error handling to only handle specific errors, also print stack_trace based on debug mode of Django --- protzilla/disk_operator.py | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/protzilla/disk_operator.py b/protzilla/disk_operator.py index 9f36c67d..63701a89 100644 --- a/protzilla/disk_operator.py +++ b/protzilla/disk_operator.py @@ -1,5 +1,6 @@ from __future__ import annotations +import traceback from dataclasses import dataclass from pathlib import Path @@ -12,6 +13,13 @@ from protzilla.constants.protzilla_logging import logger from protzilla.steps import Messages, Output, Plots, Step, StepManager +try: + from django.conf import settings + + DEBUG_MODE = settings.DEBUG +except ImportError: + DEBUG_MODE = False + class ErrorHandler: def __enter__(self): @@ -23,9 +31,10 @@ def __exit__(self, exc_type, exc_val, exc_tb): logger.error(f"File not found: {exc_val}") elif issubclass(exc_type, PermissionError): logger.error(f"Permission denied: {exc_val}") - else: - logger.error("An error occurred", exc_info=(exc_type, exc_val, exc_tb)) - return False + if DEBUG_MODE: + traceback.print_exception(exc_type, exc_val, exc_tb) + return False + return True class YamlOperator: @@ -100,7 +109,11 @@ def read_run(self, file: Path | None = None) -> StepManager: step_manager = StepManager() step_manager.df_mode = run.get(KEYS.DF_MODE, "disk") for step_data in run[KEYS.STEPS]: - step = self._read_step(step_data, step_manager) + try: + step = self._read_step(step_data, step_manager) + except Exception as e: + logger.error(f"Error reading step: {e}") + continue step_manager.add_step(step) step_manager.current_step_index = run.get(KEYS.CURRENT_STEP_INDEX, 0) return step_manager From 9c7db1913b59c41e52ff263915b553f848c880d2 Mon Sep 17 00:00:00 2001 From: henning Date: Thu, 13 Jun 2024 13:02:19 
+0200 Subject: [PATCH 02/12] Remove outdated tests --- protzilla/disk_operator.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/protzilla/disk_operator.py b/protzilla/disk_operator.py index 63701a89..d71deb42 100644 --- a/protzilla/disk_operator.py +++ b/protzilla/disk_operator.py @@ -115,7 +115,9 @@ def read_run(self, file: Path | None = None) -> StepManager: logger.error(f"Error reading step: {e}") continue step_manager.add_step(step) - step_manager.current_step_index = run.get(KEYS.CURRENT_STEP_INDEX, 0) + step_manager.current_step_index = max( + run.get(KEYS.CURRENT_STEP_INDEX, 0), len(step_manager.all_steps) - 1 + ) return step_manager def write_run(self, step_manager: StepManager) -> None: From 5c97022ce8b3165222caa5368d350f99bdbd3d2a Mon Sep 17 00:00:00 2001 From: lucatreide Date: Mon, 17 Jun 2024 11:49:38 +0200 Subject: [PATCH 03/12] add flexiquantlf to protzilla --- protzilla/data_analysis/ptm_quantification.py | 508 ++++++++++++++++++ protzilla/methods/data_analysis.py | 32 ++ ui/runs/form_mapping.py | 1 + ui/runs/forms/data_analysis.py | 33 ++ ui/runs/forms/fill_helper.py | 4 +- 5 files changed, 577 insertions(+), 1 deletion(-) create mode 100644 protzilla/data_analysis/ptm_quantification.py diff --git a/protzilla/data_analysis/ptm_quantification.py b/protzilla/data_analysis/ptm_quantification.py new file mode 100644 index 00000000..8ff7109b --- /dev/null +++ b/protzilla/data_analysis/ptm_quantification.py @@ -0,0 +1,508 @@ +import logging + +import matplotlib +import pandas as pd +from numpy import array, nan, sqrt, square +from scipy.stats import f, median_abs_deviation +from sklearn import linear_model + +matplotlib.use("Agg") +import matplotlib.pyplot as plt +from matplotlib import gridspec +from matplotlib.colors import LinearSegmentedColormap +from seaborn import distplot, lineplot, scatterplot + +from protzilla.utilities.utilities import fig_to_base64 + + +def flexiquant_lf( + peptide_df: pd.DataFrame, + metadata_df: 
pd.DataFrame, + reference_group: str, + protein_id: str, + num_init: int = 50, + mod_cutoff: float = 0.5, +) -> dict: + df = peptide_df[peptide_df["Protein ID"] == protein_id].pivot_table( + index="Sample", columns="Sequence", values="Intensity", aggfunc="first" + ) + df.reset_index(inplace=True) + + df = pd.merge( + left=df, + right=metadata_df[["Sample", "Group"]], + on="Sample", + copy=False, + ) + + try: + df["Group"] + except KeyError: + return dict( + messages=[ + dict( + level=logging.ERROR, + msg="No 'Group' column found in provided dataframe.", + ) + ] + ) + + # delete columns where all entries are nan + df.dropna(how="all", axis=1, inplace=True) + + if reference_group not in df["Group"].unique(): + return dict( + messages=[ + dict( + level=logging.ERROR, + msg=f"Reference sample '{reference_group}' not found in provided dataframe.", + ) + ] + ) + else: + # filter dataframe for controls + df_control = df[df["Group"] == reference_group] + + # get modified peptides + modified = [] + for elm in df_control.columns: + if "ph" in elm or "ac" in elm or "gly" in elm: + modified.append(elm) + + # delete modified peptides + df_control: pd.DataFrame = df_control.drop(modified, axis=1) + df.drop(modified, inplace=True, axis=1) + + # delete Group column + group_column = df["Group"] + df_control.drop("Group", axis=1, inplace=True) + df.drop("Group", axis=1, inplace=True) + + sample_column = df["Sample"] + + # calculate median intensities for unmodified peptides of control + median_intensities = df_control.median(axis=0) + + # initiate empty lists to save results of linear regressions + slope_list = [] + r2_score_list_model = [] + r2_score_list_data = [] + reproducibility_list = [] + + df_distance_RL = df.copy() + + regression_plots = [] + + plot_dict = {} + + for idx, row in df.iterrows(): + df_train = pd.DataFrame( + {"Sample intensity": row, "Reference intensity": median_intensities} + ) + + df_train.dropna(inplace=True) + + df_train.sort_index(inplace=True, axis=0) + 
+ # if number of peptides is smaller than 5, skip sample and continue with next interation + if len(df_train) < 5: + # set all metrices to nan + df_distance_RL.loc[idx] = nan + df_train["CB low"] = nan + df_train["CB high"] = nan + slope_list.append(nan) + r2_score_list_model.append(nan) + r2_score_list_data.append(nan) + reproducibility_list.append(nan) + + continue + + # initiate empty list to save results of linear regression + list_model_slopes = [] + list_r2_model = [] + list_r2_data = [] + all_iter_slopes = [] + all_iter_r2_data = [] + all_iter_r2_model = [] + + # set training data + X = array(df_train["Reference intensity"]).reshape(-1, 1) + y = df_train["Sample intensity"] + + sq_mad = square(median_abs_deviation(df_train["Sample intensity"])) + + for i in range(num_init): + ransac_model = linear_model.RANSACRegressor( + estimator=linear_model.LinearRegression(fit_intercept=False, n_jobs=-2), + max_trials=1000, + stop_probability=1, + min_samples=0.5, + residual_threshold=sq_mad, + ) + + ransac_model.fit(X, y) + + slope = float(ransac_model.estimator_.coef_) + + inlier_mask = ransac_model.inlier_mask_ + + df_train["Outlier"] = ~inlier_mask.astype(bool) + + # calculate R2 score based on inliers + df_train_inlier = df_train[~df_train["Outlier"]] + X_inlier = array(df_train_inlier["Reference intensity"]).reshape(-1, 1) + y_inlier = df_train_inlier["Sample intensity"] + r2_score_model = round(ransac_model.score(X_inlier, y_inlier), 4) + r2_score_data = round(ransac_model.score(X, y), 4) + + list_model_slopes.append(slope) + list_r2_model.append(r2_score_model) + list_r2_data.append(r2_score_data) + + all_iter_slopes.append(list_model_slopes) + all_iter_r2_model.append(list_r2_model) + all_iter_r2_data.append(list_r2_data) + + # determine best model based on r2 scores + best_model = list_r2_model.index(max(list_r2_model)) + + # save slope of best model to slope_list + slope_list.append(list_model_slopes[best_model]) + slope = list_model_slopes[best_model] + + # 
calculate reproducibility factor and save to list + series_slopes = pd.Series(list_model_slopes) + reproducibility_factor = max(series_slopes.value_counts()) / num_init + reproducibility_list.append(reproducibility_factor) + + # get r2 scores of best model + r2_score_model = list_r2_model[best_model] + r2_score_data = list_r2_data[best_model] + + # save best r2 score to lists + r2_score_list_model.append(r2_score_model) + r2_score_list_data.append(r2_score_data) + + # calculate confidence band + alpha = 0.3 + df_distance_RL, df_train = calculate_confidence_band( + slope, median_intensities, df_train, X, y, row, idx, df_distance_RL, alpha + ) + + # plot scatter plot with regression line + + plot_dict[sample_column[idx]] = [ + df_train, + idx, + r2_score_model, + r2_score_data, + slope, + alpha, + ] + # regression_plots.append(fig_to_base64(create_regression_plots(df_train, idx, r2_score_model, r2_score_data, slope, alpha, sample_column))) + + df_distance_RL["Slope"] = slope_list + df_raw_scores = calc_raw_scores(df_distance_RL, median_intensities) + + # Assume df_raw_scores is your input DataFrame + # calculate MAD per sample + df_raw_scores.drop("Slope", axis=1, inplace=True) + df_raw_scores_T = df_raw_scores.T + df_raw_scores_T = df_raw_scores_T.apply(pd.to_numeric, errors="coerce") + mad = df_raw_scores_T.mad(axis=0) + median = df_raw_scores_T.median(axis=0) + + # calculate cutoff value for each time point (> 3*MAD) + cutoff = median + 3 * mad + + # remove peptides with raw scores > cutoff for each sample + df_raw_scores_T_cutoff = df_raw_scores_T[ + round(df_raw_scores_T, 5) <= round(cutoff, 5) + ] + removed = pd.Series( + df_raw_scores_T_cutoff.index[df_raw_scores_T_cutoff.isna().all(axis=1)] + ) + df_raw_scores_T_cutoff.dropna(axis=0, how="all", inplace=True) + df_raw_scores_cutoff = df_raw_scores_T_cutoff.T + + # apply t3median normalization to calculate RM scores + df_RM = normalize_t3median(df_raw_scores_cutoff) + + # check if peptides are modified (RM 
score below modification cutoff) + df_RM_mod = df_RM < mod_cutoff + + df_raw_scores["Slope"] = slope_list + df_raw_scores["R2 model"] = r2_score_list_model + df_raw_scores["R2 data"] = r2_score_list_data + df_raw_scores["Reproducibility factor"] = reproducibility_list + + df_RM["Slope"] = slope_list + df_RM["R2 model"] = r2_score_list_model + df_RM["R2 data"] = r2_score_list_data + df_RM["Reproducibility factor"] = reproducibility_list + + # add Group column again + df_raw_scores["Group"] = group_column + df_RM["Group"] = group_column + df_RM_mod["Group"] = group_column + + # add Sample column again + df_raw_scores["Sample"] = sample_column + df_RM["Sample"] = sample_column + df_RM_mod["Sample"] = sample_column + + for sample in sample_column: + if sample in plot_dict: + regression_plots.append( + fig_to_base64( + create_regression_plots( + *plot_dict[sample], + sample_column, + df_RM[df_RM["Sample"] == sample].iloc[0], + ) + ) + ) + + return dict( + raw_scores=df_raw_scores, + RM_scores=df_RM[df_RM.columns[::-1]], + diff_modified=df_RM_mod[df_RM_mod.columns[::-1]], + removed_peptides=removed.to_list(), + plots=regression_plots, + ) + + +def calculate_confidence_band( + slope, median_int, dataframe_train, X, y, row, idx, matrix_distance_RL, alpha +): + """Calculates confidence bands arround the regression line""" + + # calculate predicted intensity with Reference intensity of a peptide and slope of the sample (Y hat) + Y_pred = slope * median_int + + # calculate W + N = len(dataframe_train) + F = f.ppf(q=1 - alpha, dfn=2, dfd=N - 1) + W = sqrt(2 * F) + + # calculate standard deviation (s(Y hat)) + # calculate prediction error + error = y - Y_pred + error.dropna(inplace=True) + + # calculate mean squared error + MSE = sum(error.apply(square)) / (N - 1) + + # calculate mean X intensity + X_bar = dataframe_train["Reference intensity"].mean() + + # iterate over all peptides of the sample + CB_low = [] + CB_high = [] + + # iterate through median peptide intensities + 
for idx_2, elm in dataframe_train["Reference intensity"].items(): + # calculate squared distance to mean X (numerator) + dist_X_bar = square(elm - X_bar) + + # calculate sum of squared distances to mean X(denominator) + sum_dist_X_bar = sum(square(X - X_bar)) + + # calculate standard deviation + s = float(sqrt(MSE * ((1 / N) + (dist_X_bar / sum_dist_X_bar)))) + + # calculate predicted intensity for given X + Y_hat = slope * elm + + # calculate high and low CB values and append to list + cb_low = Y_hat - W * s + cb_high = Y_hat + W * s + + CB_low.append(cb_low) + CB_high.append(cb_high) + + # calculate predicted intensities + pred_ints = median_int * slope + + # calculate distance to regression line + distance_RL = pred_ints - row + + # save distances in matrix_distance + matrix_distance_RL.loc[idx] = distance_RL + + # add CBs as columns to dataframe_train + dataframe_train["CB low"] = CB_low + dataframe_train["CB high"] = CB_high + + return matrix_distance_RL, dataframe_train + + +def create_regression_plots( + dataframe_train, + idx, + r2_score_model, + r2_score_data, + slope, + alpha, + sample_column, + rm_scores, +): + """Creates a scatter plot with regression line and confidence bands""" + + # create new figure with two subplots + fig = plt.figure(figsize=(16, 9)) + gs = gridspec.GridSpec(2, 1, height_ratios=[1, 6]) + ax1 = plt.subplot(gs[1]) + ax0 = plt.subplot(gs[0], sharex=ax1) + + # set space between subplots + gs.update(hspace=0.05) + + # plot histogram in upper subplot + plt.sca(ax0) + + # add title + plt.title("RANSAC Linear Regression of Sample " + str(sample_column[idx])) + + # plot histogram + distplot(a=dataframe_train["Reference intensity"], bins=150, kde=False) + + # remove axis and tick labels + plt.xlabel("") + plt.tick_params( + axis="x", # changes apply to the x-axis + which="both", # both major and minor ticks are affected + bottom=True, # ticks along the bottom edge are off + top=False, # ticks along the top edge are off + labelbottom=False, 
+ ) # labels along the bottom edge are off + + # plot scatter plot + plt.sca(ax1) + + rm_scores = rm_scores.drop( + ["Slope", "R2 model", "R2 data", "Reproducibility factor", "Group", "Sample"] + ) + rm_scores.dropna(inplace=True) + rm_scores.clip(0, 1, inplace=True) + + colors = ["red", "blue", "green"] # Red to green + cmap = LinearSegmentedColormap.from_list("custom_red_green", colors, N=256) + + rm_scores = rm_scores.to_frame(name="RM score") + # outliers in dataframe_train don't have an RM score + rm_scores = dataframe_train.merge( + rm_scores, left_index=True, right_index=True, how="left" + ) + rm_scores.fillna(1, inplace=True) + + scatterplot( + x="Reference intensity", + y="Sample intensity", + data=dataframe_train, + hue=list(rm_scores.index), + palette=cmap(list(rm_scores["RM score"])), + ) + + # draw regression line + line_label = "R2 model: " + str(r2_score_model) + "\nR2 data: " + str(r2_score_data) + max_int = dataframe_train["Reference intensity"].max() + min_int = min( + dataframe_train["Reference intensity"].min(), + dataframe_train["Sample intensity"].min(), + ) + X = [min_int - 2, max_int] + y = [min_int - 2, slope * max_int] + plt.plot(X, y, color="darkblue", linestyle="-", label=line_label) + + # draw confidence band + lineplot( + x="Reference intensity", + y="CB low", + data=dataframe_train, + color="darkgreen", + label="CB, alpha=" + str(alpha), + ) + lineplot( + x="Reference intensity", y="CB high", data=dataframe_train, color="darkgreen" + ) + + # set line style of CB lines to dashed + for i in [len(ax1.lines) - 1, len(ax1.lines) - 2]: + ax1.lines[i].set_linestyle("--") + + # create legend if sample has 20 peptides or less otherwise don't create a legend + if len(dataframe_train) <= 20: + # set right x axis limit + plt.gca().set_xlim(right=1.4 * max_int) + plt.legend() + else: + plt.gca().get_legend().remove() + + # set y axis label + plt.ylabel("Intensity sample " + str(sample_column[idx])) + plt.xlabel("Reference intensity") + + return 
fig + + +def calc_raw_scores(df_distance, median_int): + """ + Parameters: + df_distance: Pandas DataFrame containing the vertical distances to the regression line + rows: samples, columns: peptides + median_int: Pandas Series containing the median intensities for each peptide of the reference samples + Returns: + Pandas DataFrame of same dimension as df_distance containing raw scores + """ + # copy df_distance + df_rs = df_distance.copy() + + # iterate through rows of df_distance (samples) + for idx, row in df_distance.iterrows(): + # extract slope + slope = row["Slope"] + + # delete slope from row + row.drop("Slope", inplace=True) + + # calculate raw scores + raw_scores = 1 - row / (slope * median_int) + + # add slope to raw scores + raw_scores["Slope"] = slope + + # replace idx row in df_RM_score with calculated raw scores + df_rs.loc[idx] = raw_scores + + return df_rs + + +def normalize_t3median(dataframe): + """ + Applies Top3 median normalization to dataframe + Determines the median of the three highest values in each row and divides every value in the row by it + + Parameter: + dataframe: Pandas dataframe of datatype float or integer + + Returns: + normalized pandas dataframe of same dimensions as input dataframe + """ + # copy dataframe + dataframe_t3med = dataframe.copy() + + # for each row, normalize values by dividing each value by the median + # of the three highest values of the row + # iterate over rows of dataframe + for idx, row in dataframe.iterrows(): + # calculate the median of the three highest values + median_top3 = row.nlargest(3).median() + + # normalize each value of row by dividing by median_top3 + row_norm = row / median_top3 + + # update row in dataframe_norm with row_norm + dataframe_t3med.loc[idx] = row_norm + + return dataframe_t3med diff --git a/protzilla/methods/data_analysis.py b/protzilla/methods/data_analysis.py index f74909dc..76a76db7 100644 --- a/protzilla/methods/data_analysis.py +++ b/protzilla/methods/data_analysis.py @@ 
-16,6 +16,7 @@ scatter_plot, ) from protzilla.data_analysis.protein_graphs import peptides_to_isoform, variation_graph +from protzilla.data_analysis.ptm_quantification import flexiquant_lf from protzilla.methods.data_preprocessing import TransformationLog from protzilla.steps import Plots, Step, StepManager @@ -599,3 +600,34 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict: inputs["peptide_df"] = steps.peptide_df inputs["isoform_df"] = steps.isoform_df return inputs + + +class FLEXIQuantLF(PlotStep): + display_name = "FLEXIQuant-LF" + operation = "modification_quantification" + method_description = "FLEXIQuant-LF is an unbiased, label-free computational tool to indirectly detect modified peptides and to quantify the degree of modification based solely on the unmodified peptide species." + + input_keys = [ + "peptide_df", + "metadata_df", + "reference_group", + "protein_id", + "num_init", + "mod_cutoff", + ] + output_keys = [ + "raw_scores", + "RM_scores", + "diff_modified", + "removed_peptides", + ] + + def method(self, inputs: dict) -> dict: + return flexiquant_lf(**inputs) + + def insert_dataframes(self, steps: StepManager, inputs) -> dict: + inputs["peptide_df"] = steps.get_step_output( + Step, "peptide_df", inputs["peptide_df"] + ) + inputs["metadata_df"] = steps.metadata_df + return inputs diff --git a/ui/runs/form_mapping.py b/ui/runs/form_mapping.py index 178132c9..b54d724f 100644 --- a/ui/runs/form_mapping.py +++ b/ui/runs/form_mapping.py @@ -59,6 +59,7 @@ data_analysis.DimensionReductionUMAP: data_analysis_forms.DimensionReductionUMAPForm, data_analysis.ProteinGraphPeptidesToIsoform: data_analysis_forms.ProteinGraphPeptidesToIsoformForm, data_analysis.ProteinGraphVariationGraph: data_analysis_forms.ProteinGraphVariationGraphForm, + data_analysis.FLEXIQuantLF: data_analysis_forms.FLEXIQuantLFForm, data_preprocessing.ImputationByMinPerSample: data_preprocessing_forms.ImputationByMinPerSampleForms, 
data_integration.EnrichmentAnalysisGOAnalysisWithString: data_integration_forms.EnrichmentAnalysisGOAnalysisWithStringForm, data_integration.EnrichmentAnalysisGOAnalysisWithEnrichr: data_integration_forms.EnrichmentAnalysisGOAnalysisWithEnrichrForm, diff --git a/ui/runs/forms/data_analysis.py b/ui/runs/forms/data_analysis.py index 353d7cff..aa2014a6 100644 --- a/ui/runs/forms/data_analysis.py +++ b/ui/runs/forms/data_analysis.py @@ -881,3 +881,36 @@ class ProteinGraphVariationGraphForm(MethodForm): label="Protein ID", initial="Enter the Uniprot-ID of the protein" ) # TODO: workflow_meta line 2291 - 2295 + + +class FLEXIQuantLFForm(MethodForm): + is_dynamic = True + + peptide_df = CustomChoiceField(label="Peptide dataframe", choices=[]) + reference_group = CustomChoiceField(label="Reference group", choices=[]) + protein_id = CustomChoiceField(label="Protein ID", choices=[]) + num_init = CustomNumberField( + label="Number of RANSAC initiations", + initial=30, + min_value=1, + max_value=60, + step_size=1, + ) + mod_cutoff = CustomFloatField( + label="Modification cutoff", initial=0.5, min_value=0, max_value=1 + ) + + def fill_form(self, run: Run) -> None: + self.fields["peptide_df"].choices = fill_helper.get_choices(run, "peptide_df") + self.fields["reference_group"].choices = fill_helper.to_choices( + run.steps.metadata_df["Group"].unique() + ) + peptide_df_instance_id = self.data.get( + "peptide_df", self.fields["peptide_df"].choices[0][0] + ) + + self.fields["protein_id"].choices = fill_helper.to_choices( + run.steps.get_step_output(Step, "peptide_df", peptide_df_instance_id)[ + "Protein ID" + ].unique() + ) diff --git a/ui/runs/forms/fill_helper.py b/ui/runs/forms/fill_helper.py index dbea9279..6b790c45 100644 --- a/ui/runs/forms/fill_helper.py +++ b/ui/runs/forms/fill_helper.py @@ -23,7 +23,9 @@ def get_choices( :param output_key: the output key (e.g. 
"protein_df" or "enrichment_df" :return: a list of tuples containing the instance identifier and the instance identifier """ - return to_choices(run.steps.get_instance_identifiers(step_type, output_key)) + return reversed( + to_choices(run.steps.get_instance_identifiers(step_type, output_key)) + ) def get_choices_for_metadata_non_sample_columns(run: Run) -> list[tuple[str, str]]: From 61492741b211ea56079f152a84f26c2fa6b459ff Mon Sep 17 00:00:00 2001 From: lucatreide Date: Thu, 20 Jun 2024 14:46:26 +0200 Subject: [PATCH 04/12] add final changes for flexiquant-lf --- protzilla/data_analysis/ptm_quantification.py | 80 +++++++++++++------ protzilla/importing/peptide_import.py | 48 ++++++----- 2 files changed, 87 insertions(+), 41 deletions(-) diff --git a/protzilla/data_analysis/ptm_quantification.py b/protzilla/data_analysis/ptm_quantification.py index 8ff7109b..5ef6f715 100644 --- a/protzilla/data_analysis/ptm_quantification.py +++ b/protzilla/data_analysis/ptm_quantification.py @@ -9,11 +9,12 @@ matplotlib.use("Agg") import matplotlib.pyplot as plt from matplotlib import gridspec -from matplotlib.colors import LinearSegmentedColormap -from seaborn import distplot, lineplot, scatterplot +from seaborn import distplot, diverging_palette, lineplot, scatterplot from protzilla.utilities.utilities import fig_to_base64 +CONFIDENCE_BAND_ALPHA = 0.3 + def flexiquant_lf( peptide_df: pd.DataFrame, @@ -35,9 +36,7 @@ def flexiquant_lf( copy=False, ) - try: - df["Group"] - except KeyError: + if not "Group" in df: return dict( messages=[ dict( @@ -55,7 +54,7 @@ def flexiquant_lf( messages=[ dict( level=logging.ERROR, - msg=f"Reference sample '{reference_group}' not found in provided dataframe.", + msg=f"Reference sample '{reference_group}' not found in provided data.", ) ] ) @@ -186,7 +185,15 @@ def flexiquant_lf( # calculate confidence band alpha = 0.3 df_distance_RL, df_train = calculate_confidence_band( - slope, median_intensities, df_train, X, y, row, idx, df_distance_RL, alpha 
+ slope, + median_intensities, + df_train, + X, + y, + row, + idx, + df_distance_RL, + CONFIDENCE_BAND_ALPHA, ) # plot scatter plot with regression line @@ -259,6 +266,7 @@ def flexiquant_lf( *plot_dict[sample], sample_column, df_RM[df_RM["Sample"] == sample].iloc[0], + mod_cutoff=mod_cutoff, ) ) ) @@ -273,7 +281,15 @@ def flexiquant_lf( def calculate_confidence_band( - slope, median_int, dataframe_train, X, y, row, idx, matrix_distance_RL, alpha + slope: float, + median_int: float, + dataframe_train: pd.DataFrame, + X: array, + y: pd.Series, + row: pd.Series, + idx: int, + matrix_distance_RL: pd.DataFrame, + alpha: float, ): """Calculates confidence bands arround the regression line""" @@ -338,14 +354,15 @@ def calculate_confidence_band( def create_regression_plots( - dataframe_train, - idx, - r2_score_model, - r2_score_data, - slope, - alpha, - sample_column, - rm_scores, + dataframe_train: pd.DataFrame, + idx: int, + r2_score_model: float, + r2_score_data: float, + slope: float, + alpha: float, + sample_column: pd.Series, + rm_scores: pd.DataFrame, + mod_cutoff: float, ): """Creates a scatter plot with regression line and confidence bands""" @@ -383,25 +400,31 @@ def create_regression_plots( rm_scores = rm_scores.drop( ["Slope", "R2 model", "R2 data", "Reproducibility factor", "Group", "Sample"] ) - rm_scores.dropna(inplace=True) + # rm_scores.dropna(inplace=True) rm_scores.clip(0, 1, inplace=True) - colors = ["red", "blue", "green"] # Red to green - cmap = LinearSegmentedColormap.from_list("custom_red_green", colors, N=256) - rm_scores = rm_scores.to_frame(name="RM score") # outliers in dataframe_train don't have an RM score rm_scores = dataframe_train.merge( rm_scores, left_index=True, right_index=True, how="left" ) - rm_scores.fillna(1, inplace=True) + rm_scores.fillna(-1, inplace=True) + + palette = diverging_palette(h_neg=0, h_pos=120, as_cmap=True, center="dark") + + def cmap(values: list[float]): + nanIdx = set([i for i, x in enumerate(values) if x == 
-1]) + return [ + color if i not in nanIdx else [0.75, 0.75, 0.75, 1.0] + for i, color in enumerate(palette(values)) + ] scatterplot( x="Reference intensity", y="Sample intensity", data=dataframe_train, hue=list(rm_scores.index), - palette=cmap(list(rm_scores["RM score"])), + palette=cmap(scale_to_mod_cutoff(list(rm_scores["RM score"]), mod_cutoff)), ) # draw regression line @@ -446,7 +469,7 @@ def create_regression_plots( return fig -def calc_raw_scores(df_distance, median_int): +def calc_raw_scores(df_distance: pd.DataFrame, median_int: pd.Series): """ Parameters: df_distance: Pandas DataFrame containing the vertical distances to the regression line @@ -506,3 +529,14 @@ def normalize_t3median(dataframe): dataframe_t3med.loc[idx] = row_norm return dataframe_t3med + + +def scale_to_mod_cutoff(values: list[float], cutoff: float) -> list[float]: + return [ + 0.5 + (v - cutoff) * 0.5 / (1 - cutoff) + if v >= 0.5 + else v * 0.5 / cutoff + if v >= 0 + else v + for v in values + ] diff --git a/protzilla/importing/peptide_import.py b/protzilla/importing/peptide_import.py index 2a393fbd..3bd60f56 100644 --- a/protzilla/importing/peptide_import.py +++ b/protzilla/importing/peptide_import.py @@ -33,25 +33,37 @@ def peptide_import(file_path, intensity_name, map_to_uniprot) -> dict: keep_default_na=True, ) - df = read.drop(columns=["Intensity"]) - id_df = df[id_columns] - intensity_df = df.filter(regex=f"^{peptide_intensity_name} ", axis=1) - intensity_df.columns = [ - c[len(peptide_intensity_name) + 1 :] for c in intensity_df.columns - ] - molten = pd.melt( - pd.concat([id_df, intensity_df], axis=1), - id_vars=id_columns, - var_name="Sample", - value_name="Intensity", - ) + if peptide_intensity_name != "Intensity": + df = read.drop(columns=["Intensity"]) + else: + df = read + + if "Sample" not in df.columns: + id_df = df[id_columns] + intensity_df = df.filter(regex=f"^{peptide_intensity_name} ", axis=1) + intensity_df.columns = [ + c[len(peptide_intensity_name) + 1 :] for c 
in intensity_df.columns + ] + molten = pd.melt( + pd.concat([id_df, intensity_df], axis=1), + id_vars=id_columns, + var_name="Sample", + value_name="Intensity", + ) - molten = molten.rename(columns={"Proteins": "Protein ID"}) - ordered = molten[ - ["Sample", "Protein ID", "Sequence", "Intensity", "PEP"] - ] - ordered.dropna(subset=["Protein ID"], inplace=True) - ordered.sort_values(by=["Sample", "Protein ID"], ignore_index=True, inplace=True) + molten = molten.rename(columns={"Proteins": "Protein ID"}) + ordered = molten[["Sample", "Protein ID", "Sequence", "Intensity", "PEP"]] + ordered.dropna(subset=["Protein ID"], inplace=True) + ordered.sort_values( + by=["Sample", "Protein ID"], ignore_index=True, inplace=True + ) + else: + final_df = df.rename(columns={"Proteins": "Protein ID"}) + ordered = final_df[["Sample", "Protein ID", "Sequence", "Intensity", "PEP"]] + ordered.dropna(subset=["Protein ID"], inplace=True) + ordered.sort_values( + by=["Sample", "Protein ID"], ignore_index=True, inplace=True + ) new_groups, filtered_proteins = clean_protein_groups( ordered["Protein ID"].tolist(), map_to_uniprot From 71230a57ae802111e70c365392e3a9a2fd955e32 Mon Sep 17 00:00:00 2001 From: lucatreide Date: Thu, 20 Jun 2024 14:47:14 +0200 Subject: [PATCH 05/12] remove unnecessary commit --- protzilla/data_analysis/ptm_quantification.py | 1 - 1 file changed, 1 deletion(-) diff --git a/protzilla/data_analysis/ptm_quantification.py b/protzilla/data_analysis/ptm_quantification.py index 5ef6f715..17d9c893 100644 --- a/protzilla/data_analysis/ptm_quantification.py +++ b/protzilla/data_analysis/ptm_quantification.py @@ -206,7 +206,6 @@ def flexiquant_lf( slope, alpha, ] - # regression_plots.append(fig_to_base64(create_regression_plots(df_train, idx, r2_score_model, r2_score_data, slope, alpha, sample_column))) df_distance_RL["Slope"] = slope_list df_raw_scores = calc_raw_scores(df_distance_RL, median_intensities) From 2cd66c4053b44f19384ee6f252372ef2297b5845 Mon Sep 17 00:00:00 2001 
From: lucatreide Date: Thu, 20 Jun 2024 15:20:17 +0200 Subject: [PATCH 06/12] add messages to flexiquant --- protzilla/data_analysis/ptm_quantification.py | 26 +++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/protzilla/data_analysis/ptm_quantification.py b/protzilla/data_analysis/ptm_quantification.py index 17d9c893..624f6c5b 100644 --- a/protzilla/data_analysis/ptm_quantification.py +++ b/protzilla/data_analysis/ptm_quantification.py @@ -270,12 +270,38 @@ def flexiquant_lf( ) ) + messages = [] + if len(regression_plots) == 0: + messages.append( + dict( + level=logging.WARNING, + msg="No samples were processed. This is probably due to the fact that there are not enough valid peptides in the samples.", + ) + ) + else: + if len(regression_plots) == len(sample_column): + messages.append( + dict( + level=logging.INFO, + msg=f"All {len(sample_column)} samples have been processed successfully. {len(removed)} peptides have been removed.", + ) + ) + else: + messages.append( + dict( + level=logging.INFO, + msg=f"{len(regression_plots)}/{len(sample_column)} samples have been processed successfully. " + f"The remaining samples have been skipped due to insufficient valid peptides. 
{len(removed)} peptides have been removed.", + ) + ) + return dict( raw_scores=df_raw_scores, RM_scores=df_RM[df_RM.columns[::-1]], diff_modified=df_RM_mod[df_RM_mod.columns[::-1]], removed_peptides=removed.to_list(), plots=regression_plots, + messages=messages, ) From f901ee613e39737272fc65b450905c82a30655fa Mon Sep 17 00:00:00 2001 From: lucatreide Date: Sat, 22 Jun 2024 14:39:26 +0200 Subject: [PATCH 07/12] add first stub for multiflex --- .../ptm_quantification/__init__.py | 0 .../ptm_quantification/flexiquant.py | 566 ++++++++++++++++++ .../ptm_quantification/multiflex.py | 88 +++ protzilla/methods/data_analysis.py | 44 +- ui/runs/form_mapping.py | 1 + ui/runs/forms/data_analysis.py | 55 +- 6 files changed, 748 insertions(+), 6 deletions(-) create mode 100644 protzilla/data_analysis/ptm_quantification/__init__.py create mode 100644 protzilla/data_analysis/ptm_quantification/flexiquant.py create mode 100644 protzilla/data_analysis/ptm_quantification/multiflex.py diff --git a/protzilla/data_analysis/ptm_quantification/__init__.py b/protzilla/data_analysis/ptm_quantification/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/protzilla/data_analysis/ptm_quantification/flexiquant.py b/protzilla/data_analysis/ptm_quantification/flexiquant.py new file mode 100644 index 00000000..bbc30be3 --- /dev/null +++ b/protzilla/data_analysis/ptm_quantification/flexiquant.py @@ -0,0 +1,566 @@ +import logging + +import matplotlib +import pandas as pd +from numpy import array, nan, sqrt, square +from scipy.stats import f, median_abs_deviation +from sklearn import linear_model + +matplotlib.use("Agg") +import matplotlib.pyplot as plt +from matplotlib import gridspec +from seaborn import distplot, diverging_palette, lineplot, scatterplot + +from protzilla.utilities.utilities import fig_to_base64 + +CONFIDENCE_BAND_ALPHA = 0.3 + + +def flexiquant_lf( + peptide_df: pd.DataFrame, + metadata_df: pd.DataFrame, + reference_group: str, + protein_id: str, + num_init: int 
= 50, + mod_cutoff: float = 0.5, +) -> dict: + """ + FLEXIQuant-LF is a method to quantify protein modification extent in label-free proteomics data. + + Parts of the implementation have been adapted from https://github.com/SteenOmicsLab/FLEXIQuantLF. + """ + + df = peptide_df[peptide_df["Protein ID"] == protein_id].pivot_table( + index="Sample", columns="Sequence", values="Intensity", aggfunc="first" + ) + df.reset_index(inplace=True) + + df = pd.merge( + left=df, + right=metadata_df[["Sample", "Group"]], + on="Sample", + copy=False, + ) + + if not "Group" in df: + return dict( + messages=[ + dict( + level=logging.ERROR, + msg="No 'Group' column found in provided dataframe.", + ) + ] + ) + + # delete columns where all entries are nan + df.dropna(how="all", axis=1, inplace=True) + + if reference_group not in df["Group"].unique(): + return dict( + messages=[ + dict( + level=logging.ERROR, + msg=f"Reference sample '{reference_group}' not found in provided data.", + ) + ] + ) + else: + # filter dataframe for controls + df_control = df[df["Group"] == reference_group] + + # get modified peptides + modified = [] + for elm in df_control.columns: + if "ph" in elm or "ac" in elm or "gly" in elm: + modified.append(elm) + + # delete modified peptides + df_control: pd.DataFrame = df_control.drop(modified, axis=1) + df.drop(modified, inplace=True, axis=1) + + # delete Group column + group_column = df["Group"] + df_control.drop("Group", axis=1, inplace=True) + df.drop("Group", axis=1, inplace=True) + + sample_column = df["Sample"] + + # calculate median intensities for unmodified peptides of control + median_intensities = df_control.median(axis=0) + + # initiate empty lists to save results of linear regressions + slope_list = [] + r2_score_list_model = [] + r2_score_list_data = [] + reproducibility_list = [] + + df_distance_RL = df.copy() + + regression_plots = [] + + plot_dict = {} + + for idx, row in df.iterrows(): + df_train = pd.DataFrame( + {"Sample intensity": row, 
"Reference intensity": median_intensities} + ) + + df_train.dropna(inplace=True) + + df_train.sort_index(inplace=True, axis=0) + + # if number of peptides is smaller than 5, skip sample and continue with next interation + if len(df_train) < 5: + # set all metrices to nan + df_distance_RL.loc[idx] = nan + df_train["CB low"] = nan + df_train["CB high"] = nan + slope_list.append(nan) + r2_score_list_model.append(nan) + r2_score_list_data.append(nan) + reproducibility_list.append(nan) + + continue + + # initiate empty list to save results of linear regression + list_model_slopes = [] + list_r2_model = [] + list_r2_data = [] + all_iter_slopes = [] + all_iter_r2_data = [] + all_iter_r2_model = [] + + # set training data + X = array(df_train["Reference intensity"]).reshape(-1, 1) + y = df_train["Sample intensity"] + + sq_mad = square(median_abs_deviation(df_train["Sample intensity"])) + + for i in range(num_init): + ransac_model = linear_model.RANSACRegressor( + estimator=linear_model.LinearRegression(fit_intercept=False, n_jobs=-2), + max_trials=1000, + stop_probability=1, + min_samples=0.5, + residual_threshold=sq_mad, + ) + + ransac_model.fit(X, y) + + slope = float(ransac_model.estimator_.coef_) + + inlier_mask = ransac_model.inlier_mask_ + + df_train["Outlier"] = ~inlier_mask.astype(bool) + + # calculate R2 score based on inliers + df_train_inlier = df_train[~df_train["Outlier"]] + X_inlier = array(df_train_inlier["Reference intensity"]).reshape(-1, 1) + y_inlier = df_train_inlier["Sample intensity"] + r2_score_model = round(ransac_model.score(X_inlier, y_inlier), 4) + r2_score_data = round(ransac_model.score(X, y), 4) + + list_model_slopes.append(slope) + list_r2_model.append(r2_score_model) + list_r2_data.append(r2_score_data) + + all_iter_slopes.append(list_model_slopes) + all_iter_r2_model.append(list_r2_model) + all_iter_r2_data.append(list_r2_data) + + # determine best model based on r2 scores + best_model = list_r2_model.index(max(list_r2_model)) + + # save 
slope of best model to slope_list + slope_list.append(list_model_slopes[best_model]) + slope = list_model_slopes[best_model] + + # calculate reproducibility factor and save to list + series_slopes = pd.Series(list_model_slopes) + reproducibility_factor = max(series_slopes.value_counts()) / num_init + reproducibility_list.append(reproducibility_factor) + + # get r2 scores of best model + r2_score_model = list_r2_model[best_model] + r2_score_data = list_r2_data[best_model] + + # save best r2 score to lists + r2_score_list_model.append(r2_score_model) + r2_score_list_data.append(r2_score_data) + + # calculate confidence band + alpha = 0.3 + df_distance_RL, df_train = calculate_confidence_band( + slope, + median_intensities, + df_train, + X, + y, + row, + idx, + df_distance_RL, + CONFIDENCE_BAND_ALPHA, + ) + + # plot scatter plot with regression line + + plot_dict[sample_column[idx]] = [ + df_train, + idx, + r2_score_model, + r2_score_data, + slope, + alpha, + ] + + df_distance_RL["Slope"] = slope_list + df_raw_scores = calc_raw_scores(df_distance_RL, median_intensities) + + # Assume df_raw_scores is your input DataFrame + # calculate MAD per sample + df_raw_scores.drop("Slope", axis=1, inplace=True) + df_raw_scores_T = df_raw_scores.T + df_raw_scores_T = df_raw_scores_T.apply(pd.to_numeric, errors="coerce") + mad = df_raw_scores_T.mad(axis=0) + median = df_raw_scores_T.median(axis=0) + + # calculate cutoff value for each time point (> 3*MAD) + cutoff = median + 3 * mad + + # remove peptides with raw scores > cutoff for each sample + df_raw_scores_T_cutoff = df_raw_scores_T[ + round(df_raw_scores_T, 5) <= round(cutoff, 5) + ] + removed = pd.Series( + df_raw_scores_T_cutoff.index[df_raw_scores_T_cutoff.isna().all(axis=1)] + ) + df_raw_scores_T_cutoff.dropna(axis=0, how="all", inplace=True) + df_raw_scores_cutoff = df_raw_scores_T_cutoff.T + + # apply t3median normalization to calculate RM scores + df_RM = normalize_t3median(df_raw_scores_cutoff) + + # check if peptides 
are modified (RM score below modification cutoff) + df_RM_mod = df_RM < mod_cutoff + + df_raw_scores["Slope"] = slope_list + df_raw_scores["R2 model"] = r2_score_list_model + df_raw_scores["R2 data"] = r2_score_list_data + df_raw_scores["Reproducibility factor"] = reproducibility_list + + df_RM["Slope"] = slope_list + df_RM["R2 model"] = r2_score_list_model + df_RM["R2 data"] = r2_score_list_data + df_RM["Reproducibility factor"] = reproducibility_list + + # add Group column again + df_raw_scores["Group"] = group_column + df_RM["Group"] = group_column + df_RM_mod["Group"] = group_column + + # add Sample column again + df_raw_scores["Sample"] = sample_column + df_RM["Sample"] = sample_column + df_RM_mod["Sample"] = sample_column + + for sample in sample_column: + if sample in plot_dict: + regression_plots.append( + fig_to_base64( + create_regression_plots( + *plot_dict[sample], + sample_column, + df_RM[df_RM["Sample"] == sample].iloc[0], + mod_cutoff=mod_cutoff, + ) + ) + ) + + messages = [] + if len(regression_plots) == 0: + messages.append( + dict( + level=logging.WARNING, + msg="No samples were processed. This is probably due to the fact that there are not enough valid peptides in the samples.", + ) + ) + else: + if len(regression_plots) == len(sample_column): + messages.append( + dict( + level=logging.INFO, + msg=f"All {len(sample_column)} samples have been processed successfully. {len(removed)} peptides have been removed.", + ) + ) + else: + messages.append( + dict( + level=logging.INFO, + msg=f"{len(regression_plots)}/{len(sample_column)} samples have been processed successfully. " + f"The remaining samples have been skipped due to insufficient valid peptides. 
{len(removed)} peptides have been removed.", + ) + ) + + return dict( + raw_scores=df_raw_scores, + RM_scores=df_RM[df_RM.columns[::-1]], + diff_modified=df_RM_mod[df_RM_mod.columns[::-1]], + removed_peptides=removed.to_list(), + plots=regression_plots, + messages=messages, + ) + + +def calculate_confidence_band( + slope: float, + median_int: float, + dataframe_train: pd.DataFrame, + X: array, + y: pd.Series, + row: pd.Series, + idx: int, + matrix_distance_RL: pd.DataFrame, + alpha: float, +): + """ + Calculates confidence bands arround the regression line. + """ + + # calculate predicted intensity with Reference intensity of a peptide and slope of the sample (Y hat) + Y_pred = slope * median_int + + # calculate W + N = len(dataframe_train) + F = f.ppf(q=1 - alpha, dfn=2, dfd=N - 1) + W = sqrt(2 * F) + + # calculate standard deviation (s(Y hat)) + # calculate prediction error + error = y - Y_pred + error.dropna(inplace=True) + + # calculate mean squared error + MSE = sum(error.apply(square)) / (N - 1) + + # calculate mean X intensity + X_bar = dataframe_train["Reference intensity"].mean() + + # iterate over all peptides of the sample + CB_low = [] + CB_high = [] + + # iterate through median peptide intensities + for idx_2, elm in dataframe_train["Reference intensity"].items(): + # calculate squared distance to mean X (numerator) + dist_X_bar = square(elm - X_bar) + + # calculate sum of squared distances to mean X(denominator) + sum_dist_X_bar = sum(square(X - X_bar)) + + # calculate standard deviation + s = float(sqrt(MSE * ((1 / N) + (dist_X_bar / sum_dist_X_bar)))) + + # calculate predicted intensity for given X + Y_hat = slope * elm + + # calculate high and low CB values and append to list + cb_low = Y_hat - W * s + cb_high = Y_hat + W * s + + CB_low.append(cb_low) + CB_high.append(cb_high) + + # calculate predicted intensities + pred_ints = median_int * slope + + # calculate distance to regression line + distance_RL = pred_ints - row + + # save distances in 
matrix_distance + matrix_distance_RL.loc[idx] = distance_RL + + # add CBs as columns to dataframe_train + dataframe_train["CB low"] = CB_low + dataframe_train["CB high"] = CB_high + + return matrix_distance_RL, dataframe_train + + +def create_regression_plots( + dataframe_train: pd.DataFrame, + idx: int, + r2_score_model: float, + r2_score_data: float, + slope: float, + alpha: float, + sample_column: pd.Series, + rm_scores: pd.DataFrame, + mod_cutoff: float, +): + """ + Creates a scatter plot with regression line and confidence bands. + """ + + # create new figure with two subplots + fig = plt.figure(figsize=(16, 9)) + gs = gridspec.GridSpec(2, 1, height_ratios=[1, 6]) + ax1 = plt.subplot(gs[1]) + ax0 = plt.subplot(gs[0], sharex=ax1) + + # set space between subplots + gs.update(hspace=0.05) + + # plot histogram in upper subplot + plt.sca(ax0) + + # add title + plt.title("RANSAC Linear Regression of Sample " + str(sample_column[idx])) + + # plot histogram + distplot(a=dataframe_train["Reference intensity"], bins=150, kde=False) + + # remove axis and tick labels + plt.xlabel("") + plt.tick_params( + axis="x", # changes apply to the x-axis + which="both", # both major and minor ticks are affected + bottom=True, # ticks along the bottom edge are off + top=False, # ticks along the top edge are off + labelbottom=False, + ) # labels along the bottom edge are off + + # plot scatter plot + plt.sca(ax1) + + rm_scores = rm_scores.drop( + ["Slope", "R2 model", "R2 data", "Reproducibility factor", "Group", "Sample"] + ) + # rm_scores.dropna(inplace=True) + rm_scores.clip(0, 1, inplace=True) + + rm_scores = rm_scores.to_frame(name="RM score") + # outliers in dataframe_train don't have an RM score + rm_scores = dataframe_train.merge( + rm_scores, left_index=True, right_index=True, how="left" + ) + rm_scores.fillna(-1, inplace=True) + + palette = diverging_palette(h_neg=0, h_pos=120, as_cmap=True, center="dark") + + def cmap(values: list[float]): + nanIdx = set([i for i, x in 
enumerate(values) if x == -1]) + return [ + color if i not in nanIdx else [0.75, 0.75, 0.75, 1.0] + for i, color in enumerate(palette(values)) + ] + + scatterplot( + x="Reference intensity", + y="Sample intensity", + data=dataframe_train, + hue=list(rm_scores.index), + palette=cmap(scale_to_mod_cutoff(list(rm_scores["RM score"]), mod_cutoff)), + ) + + # draw regression line + line_label = "R2 model: " + str(r2_score_model) + "\nR2 data: " + str(r2_score_data) + max_int = dataframe_train["Reference intensity"].max() + min_int = min( + dataframe_train["Reference intensity"].min(), + dataframe_train["Sample intensity"].min(), + ) + X = [min_int - 2, max_int] + y = [min_int - 2, slope * max_int] + plt.plot(X, y, color="darkblue", linestyle="-", label=line_label) + + # draw confidence band + lineplot( + x="Reference intensity", + y="CB low", + data=dataframe_train, + color="darkgreen", + label="CB, alpha=" + str(alpha), + ) + lineplot( + x="Reference intensity", y="CB high", data=dataframe_train, color="darkgreen" + ) + + # set line style of CB lines to dashed + for i in [len(ax1.lines) - 1, len(ax1.lines) - 2]: + ax1.lines[i].set_linestyle("--") + + # create legend if sample has 20 peptides or less otherwise don't create a legend + if len(dataframe_train) <= 20: + # set right x axis limit + plt.gca().set_xlim(right=1.4 * max_int) + plt.legend() + else: + plt.gca().get_legend().remove() + + # set y axis label + plt.ylabel("Intensity sample " + str(sample_column[idx])) + plt.xlabel("Reference intensity") + + return fig + + +def calc_raw_scores(df_distance: pd.DataFrame, median_int: pd.Series): + """ + Calculates raw scores for each sample based on the distance to the regression line. 
+ """ + # copy df_distance + df_rs = df_distance.copy() + + # iterate through rows of df_distance (samples) + for idx, row in df_distance.iterrows(): + # extract slope + slope = row["Slope"] + + # delete slope from row + row.drop("Slope", inplace=True) + + # calculate raw scores + raw_scores = 1 - row / (slope * median_int) + + # add slope to raw scores + raw_scores["Slope"] = slope + + # replace idx row in df_RM_score with calculated raw scores + df_rs.loc[idx] = raw_scores + + return df_rs + + +def normalize_t3median(dataframe: pd.DataFrame): + """ + Applies Top3 median normalization to dataframe + Determines the median of the three highest values in each row and divides every value in the row by it + """ + # copy dataframe + dataframe_t3med = dataframe.copy() + + # for each row, normalize values by dividing each value by the median + # of the three highest values of the row + # iterate over rows of dataframe + for idx, row in dataframe.iterrows(): + # calculate the median of the three highest values + median_top3 = row.nlargest(3).median() + + # normalize each value of row by dividing by median_top3 + row_norm = row / median_top3 + + # update row in dataframe_norm with row_norm + dataframe_t3med.loc[idx] = row_norm + + return dataframe_t3med + + +def scale_to_mod_cutoff(values: list[float], cutoff: float) -> list[float]: + return [ + ( + 0.5 + (v - cutoff) * 0.5 / (1 - cutoff) + if v >= 0.5 + else v * 0.5 / cutoff if v >= 0 else v + ) + for v in values + ] diff --git a/protzilla/data_analysis/ptm_quantification/multiflex.py b/protzilla/data_analysis/ptm_quantification/multiflex.py new file mode 100644 index 00000000..3fa06c69 --- /dev/null +++ b/protzilla/data_analysis/ptm_quantification/multiflex.py @@ -0,0 +1,88 @@ +import logging +import time + +import pandas as pd + +from protzilla.data_analysis.ptm_quantification.flexiquant import flexiquant_lf + + +def multiflex_lf( + peptide_df: pd.DataFrame, + metadata_df: pd.DataFrame, + reference_group: str, + 
num_init: int = 30, + mod_cutoff: float = 0.5, + remove_outliers: bool = True, + imputation_cosine_similarity: float = 0.98, + deseq2_normalization: bool = True, + colormap: int = 1, +): + """ + Quantifies the extent of protein modifications in proteomics data by using robust linear regression to compare modified and unmodified peptide precursors + and facilitates the analysis of modification dynamics and coregulated modifications across large datasets without the need for preselecting specific proteins. + + Parts of the implementation have been adapted from https://gitlab.com/SteenOmicsLab/multiflex-lf. + """ + + # get current system time to track the runtime + time.time() + + # create dataframe input for multiflex-lf + df_intens_matrix_all_proteins = pd.DataFrame( + { + "ProteinID": peptide_df["Protein ID"], + "PeptideID": peptide_df["Sequence"], + "Sample": peptide_df["Sample"], + "Intensity": peptide_df["Intensity"], + } + ) + + # add Group column to input + df_intens_matrix_all_proteins = pd.merge( + df_intens_matrix_all_proteins, metadata_df[["Sample", "Group"]], on="Sample" + ) + + # check if reference identifier exists in Group column + if str(reference_group) not in set( + df_intens_matrix_all_proteins["Group"].astype(str) + ): + return dict( + messages=[ + dict( + level=logging.ERROR, + msg=f"Reference group {reference_group} not found in metadata.", + ) + ], + ) + + df_intens_matrix_all_proteins = ( + df_intens_matrix_all_proteins.dropna(subset=["Intensity"]) + .groupby(["ProteinID", "PeptideID", "Group", "Sample"])["Intensity"] + .apply(max) + .unstack(level=["Group", "Sample"]) + .T + ) + df_intens_matrix_all_proteins = df_intens_matrix_all_proteins.set_index( + [ + df_intens_matrix_all_proteins.index.get_level_values("Group"), + df_intens_matrix_all_proteins.index.get_level_values("Sample"), + ] + ) + df_intens_matrix_all_proteins = df_intens_matrix_all_proteins.sort_index( + axis=0 + ).sort_index(axis=1) + + # dataframe for the calculated RM scores 
of all proteins and peptides + pd.DataFrame() + + # create a list of all proteins in the dataset + list_proteins = ( + df_intens_matrix_all_proteins.columns.get_level_values("ProteinID") + .unique() + .sort_values() + ) + + for protein in list_proteins: + flexiquant_lf( + peptide_df, metadata_df, reference_group, protein, num_init, mod_cutoff + ) diff --git a/protzilla/methods/data_analysis.py b/protzilla/methods/data_analysis.py index 99a2099a..7b79330a 100644 --- a/protzilla/methods/data_analysis.py +++ b/protzilla/methods/data_analysis.py @@ -10,7 +10,6 @@ from protzilla.data_analysis.differential_expression_linear_model import linear_model from protzilla.data_analysis.differential_expression_t_test import t_test from protzilla.data_analysis.dimension_reduction import t_sne, umap -from protzilla.data_analysis.ptm_analysis import filter_peptides_of_protein from protzilla.data_analysis.model_evaluation import evaluate_classification_model from protzilla.data_analysis.plots import ( clustergram_plot, @@ -19,7 +18,9 @@ scatter_plot, ) from protzilla.data_analysis.protein_graphs import peptides_to_isoform, variation_graph -from protzilla.data_analysis.ptm_quantification import flexiquant_lf +from protzilla.data_analysis.ptm_analysis import filter_peptides_of_protein +from protzilla.data_analysis.ptm_quantification.flexiquant import flexiquant_lf +from protzilla.data_analysis.ptm_quantification.multiflex import multiflex_lf from protzilla.methods.data_preprocessing import TransformationLog from protzilla.steps import Plots, Step, StepManager @@ -636,6 +637,45 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict: ) inputs["metadata_df"] = steps.metadata_df + return inputs + + +class MultiFLEXLF(PlotStep): + display_name = "MultiFLEX-LF" + operation = "modification_quantification" + method_description = "Quantifies the extent of protein modifications in proteomics data by using robust linear regression to compare modified and unmodified peptide precursors 
and facilitates the analysis of modification dynamics and coregulated modifications across large datasets without the need for preselecting specific proteins." + + input_keys = [ + "peptide_df", + "metadata_df", + "reference_group", + "num_init", + "mod_cutoff", + "remove_outliers", + "imputation_cosine_similarity", + "deseq2_normalization", + "colormap", + ] + + output_keys = [ + "RM_scores_clustered", + "diff_modified", + "raw_scores", + "removed_peptides", + "RM_scores", + "sample_groups", + ] + + def method(self, inputs: dict) -> dict: + return multiflex_lf(**inputs) + + def insert_dataframes(self, steps: StepManager, inputs) -> dict: + inputs["peptide_df"] = steps.get_step_output( + Step, "peptide_df", inputs["peptide_df"] + ) + + inputs["metadata_df"] = steps.metadata_df + return inputs class SelectPeptidesForProtein(DataAnalysisStep): diff --git a/ui/runs/form_mapping.py b/ui/runs/form_mapping.py index b3521f6d..ce110b09 100644 --- a/ui/runs/form_mapping.py +++ b/ui/runs/form_mapping.py @@ -61,6 +61,7 @@ data_analysis.ProteinGraphVariationGraph: data_analysis_forms.ProteinGraphVariationGraphForm, data_analysis.SelectPeptidesForProtein: data_analysis_forms.SelectPeptidesForProteinForm, data_analysis.FLEXIQuantLF: data_analysis_forms.FLEXIQuantLFForm, + data_analysis.MultiFLEXLF: data_analysis_forms.MultiFLEXLFForm, data_preprocessing.ImputationByMinPerSample: data_preprocessing_forms.ImputationByMinPerSampleForms, data_integration.EnrichmentAnalysisGOAnalysisWithString: data_integration_forms.EnrichmentAnalysisGOAnalysisWithStringForm, data_integration.EnrichmentAnalysisGOAnalysisWithEnrichr: data_integration_forms.EnrichmentAnalysisGOAnalysisWithEnrichrForm, diff --git a/ui/runs/forms/data_analysis.py b/ui/runs/forms/data_analysis.py index 84eaeb3d..78fac7c4 100644 --- a/ui/runs/forms/data_analysis.py +++ b/ui/runs/forms/data_analysis.py @@ -1,25 +1,24 @@ -import logging from enum import Enum, StrEnum -from protzilla.methods.data_preprocessing import 
DataPreprocessingStep from protzilla.methods.data_analysis import ( + DataAnalysisStep, DifferentialExpressionLinearModel, DifferentialExpressionTTest, DimensionReductionUMAP, - DataAnalysisStep, ) +from protzilla.methods.data_preprocessing import DataPreprocessingStep from protzilla.run import Run from protzilla.steps import Step from . import fill_helper from .base import MethodForm from .custom_fields import ( + CustomBooleanField, CustomCharField, CustomChoiceField, CustomFloatField, CustomMultipleChoiceField, CustomNumberField, - CustomBooleanField, ) @@ -920,6 +919,54 @@ def fill_form(self, run: Run) -> None: ) +class MultiFLEXLFForm(MethodForm): + peptide_df = CustomChoiceField(label="Peptide dataframe", choices=[]) + reference_group = CustomChoiceField(label="Reference group", choices=[]) + num_init = CustomNumberField( + label="Number of RANSAC initiations", + initial=30, + min_value=1, + max_value=60, + step_size=1, + ) + mod_cutoff = CustomFloatField( + label="Modification cutoff", initial=0.5, min_value=0, max_value=1 + ) + imputation_cosine_similarity = CustomFloatField( + label="Minimal cosine similarity value for missing value imputation of RM scores for clustering", + initial=0.98, + ) + colormap = CustomChoiceField( + label="Color map used for heatmap", + choices=[ + (1, "red-white-blue"), + (2, "pink-white-green"), + (3, "purple-white-green"), + (4, "brown-white-bluegreen"), + (5, "orange-white-purple"), + (6, "red-white-grey"), + (7, "red-yellow-green"), + (8, "red-yellow-blue"), + ], + ) + remove_outliers = CustomBooleanField( + label="Remove peptides with outlier intensities from RM score calculation", + required=False, + initial=True, + ) + deseq2_normalization = CustomBooleanField( + label="Apply DESeq2 normalization to RM scores before clustering", + required=False, + initial=True, + ) + + def fill_form(self, run: Run) -> None: + self.fields["peptide_df"].choices = fill_helper.get_choices(run, "peptide_df") + 
self.fields["reference_group"].choices = fill_helper.to_choices( + run.steps.metadata_df["Group"].unique() + ) + + class SelectPeptidesForProteinForm(MethodForm): is_dynamic = True From 26140e24e7da85c21c92cbf89209e9491922fe4a Mon Sep 17 00:00:00 2001 From: lucatreide Date: Mon, 8 Jul 2024 11:58:32 +0200 Subject: [PATCH 08/12] finalise multiflex --- .../ptm_quantification/multiflex.py | 685 +++++++++++++++++- protzilla/importing/peptide_import.py | 10 +- protzilla/methods/data_analysis.py | 3 +- requirements.txt | 3 +- 4 files changed, 687 insertions(+), 14 deletions(-) diff --git a/protzilla/data_analysis/ptm_quantification/multiflex.py b/protzilla/data_analysis/ptm_quantification/multiflex.py index 3fa06c69..adff084b 100644 --- a/protzilla/data_analysis/ptm_quantification/multiflex.py +++ b/protzilla/data_analysis/ptm_quantification/multiflex.py @@ -1,9 +1,23 @@ import logging import time +from collections import Counter +from copy import copy +from math import sqrt +import matplotlib.pyplot as plt import pandas as pd +from matplotlib.colors import Normalize +from numpy import arange, array, flip, nan, ones +from plotly.figure_factory import create_dendrogram +from plotly.graph_objects import Heatmap +from plotly.subplots import make_subplots +from pydeseq2.preprocessing import deseq2_norm +from scipy.cluster.hierarchy import linkage +from scipy.spatial.distance import cdist +from seaborn import color_palette, histplot from protzilla.data_analysis.ptm_quantification.flexiquant import flexiquant_lf +from protzilla.utilities.utilities import fig_to_base64 def multiflex_lf( @@ -12,7 +26,6 @@ def multiflex_lf( reference_group: str, num_init: int = 30, mod_cutoff: float = 0.5, - remove_outliers: bool = True, imputation_cosine_similarity: float = 0.98, deseq2_normalization: bool = True, colormap: int = 1, @@ -25,7 +38,7 @@ def multiflex_lf( """ # get current system time to track the runtime - time.time() + start = time.time() # create dataframe input for multiflex-lf 
df_intens_matrix_all_proteins = pd.DataFrame( @@ -72,9 +85,6 @@ def multiflex_lf( axis=0 ).sort_index(axis=1) - # dataframe for the calculated RM scores of all proteins and peptides - pd.DataFrame() - # create a list of all proteins in the dataset list_proteins = ( df_intens_matrix_all_proteins.columns.get_level_values("ProteinID") @@ -82,7 +92,670 @@ def multiflex_lf( .sort_values() ) + df_diff_modified = pd.DataFrame() + df_raw_scores = pd.DataFrame() + df_removed_peptides = pd.DataFrame() + df_RM_scores = pd.DataFrame() + + skipped_proteins = [] + for protein in list_proteins: - flexiquant_lf( + flexi_result = flexiquant_lf( peptide_df, metadata_df, reference_group, protein, num_init, mod_cutoff ) + + if any( + [ + message + for message in flexi_result["messages"] + if message["level"] == logging.ERROR + ] + ): + skipped_proteins.append(protein) + continue + + protein_raw_scores = flexi_result["raw_scores"] + protein_raw_scores = protein_raw_scores.T + protein_raw_scores.columns = protein_raw_scores.loc["Sample"] + protein_raw_scores["ProteinID"] = protein + protein_raw_scores.drop( + index=[ + "Sample", + "Slope", + "R2 model", + "R2 data", + "Reproducibility factor", + "Group", + ], + inplace=True, + ) + protein_raw_scores = ( + protein_raw_scores.reset_index() + .rename(columns={"index": "PeptideID"}) + .set_index(["ProteinID", "PeptideID"]) + ) + df_raw_scores = pd.concat([df_raw_scores, protein_raw_scores]) + + protein_diff_modified = flexi_result["diff_modified"] + protein_diff_modified.drop(columns=["Group"], inplace=True) + protein_diff_modified = protein_diff_modified.T + protein_diff_modified.columns = protein_diff_modified.loc["Sample"] + protein_diff_modified["ProteinID"] = protein + protein_diff_modified.drop(index="Sample", inplace=True) + protein_diff_modified = ( + protein_diff_modified.reset_index() + .rename(columns={"index": "PeptideID"}) + .set_index(["ProteinID", "PeptideID"]) + ) + df_diff_modified = pd.concat([df_diff_modified, 
protein_diff_modified]) + + protein_removed_peptides = flexi_result["removed_peptides"] + protein_removed_peptides.remove("Sample") + protein_removed_peptides_df = pd.DataFrame( + {"ProteinID": protein, "PeptideID": protein_removed_peptides} + ) + df_removed_peptides = pd.concat( + [df_removed_peptides, protein_removed_peptides_df] + ) + + protein_RM_scores = flexi_result["RM_scores"] + protein_RM_scores = protein_RM_scores.T + protein_RM_scores.columns = pd.MultiIndex.from_arrays( + [protein_RM_scores.loc["Group"], protein_RM_scores.loc["Sample"]], + names=["Group", "Sample"], + ) + protein_RM_scores["ProteinID"] = protein + protein_RM_scores.drop( + index=[ + "Sample", + "Slope", + "R2 model", + "R2 data", + "Reproducibility factor", + "Group", + ], + inplace=True, + ) + protein_RM_scores = ( + protein_RM_scores.reset_index() + .rename(columns={"index": "PeptideID"}) + .set_index(["ProteinID", "PeptideID"]) + ) + df_RM_scores = pd.concat([df_RM_scores, protein_RM_scores]) + + if df_RM_scores.empty: + return dict( + messages=[ + dict( + level=logging.WARNING, + msg="RM scores were not computed! 
Intensities of at least 5 peptides per protein have to be given!", + ) + ], + ) + + # list of all groups for creation the distribution plots and protein-wise heatmaps + list_groups = list(set(df_RM_scores.columns.get_level_values("Group"))) + list_groups.sort() + + rm_score_dist_plots = fig_to_base64( + create_RM_score_distribution_plots(df_RM_scores, list_groups) + ) + + # define the colormap for the heatmap as specified by the user + if colormap == 1: + color_map = "RdBu" + elif colormap == 2: + color_map = "PiYG" + elif colormap == 3: + color_map = "PRGn" + elif colormap == 3: + color_map = "BrBG" + elif colormap == 4: + color_map = "PuOr" + elif colormap == 5: + color_map = "RdGy" + elif colormap == 6: + color_map = "RdYlGn" + elif colormap == 7: + color_map = "RdYlBu" + + # check if colormap is valid + try: + custom_cmap = copy(plt.get_cmap(color_map)) + except ValueError: + return dict( + messages=[ + dict( + level=logging.ERROR, + msg="Invalid color map!\n Please choose a valid color map. 
See: https://matplotlib.org/stable/tutorials/colors/colormaps.html", + ) + ], + ) + + # create title page + fig = plt.figure(figsize=(1, 1)) + plt.title("Heatmaps of RM scores of multiFLEX-LF", fontsize=20) + plt.axis("off") + plt.close() + + # define color map for the heatmaps + custom_cmap = copy(plt.get_cmap(color_map)) + # missing values are set to black + custom_cmap.set_bad("black", 1.0) + custom_norm = Normalize(vmin=0, vmax=mod_cutoff * 2) + + # sort the proteins descending by number of peptides and samples with a RM scores below the modification cutoff + sorted_proteins = list( + df_RM_scores[df_RM_scores < mod_cutoff] + .count(axis=1) + .groupby("ProteinID") + .sum() + .sort_values(ascending=False) + .index + ) + + heatmap_plots = [] + # go through all protein in the sorted order + for protein_id in sorted_proteins: + # dataframe of the RM scores of the current protein + df_RM_scores_protein = df_RM_scores.loc[protein_id] + + # skip the protein, if dataframe empty + if df_RM_scores_protein.empty: + continue + + # ignore nan-slice warnings + # with warnings.catch_warnings(): + # warnings.filterwarnings("ignore", r"All-NaN (slice|axis) encountered") + + # create heatmap of the current protein + heatmap_plots.append( + create_heatmap(df_RM_scores_protein, protein_id, color_map, mod_cutoff) + ) + + # keep only peptides that have RM scores in at least two groups + to_remove = df_RM_scores.loc[ + df_RM_scores.groupby("Group", axis=1).count().replace(0, nan).count(axis=1) < 2 + ].index + df_RM_scores_all_proteins_reduced = df_RM_scores.drop(to_remove, axis=0) + removed_peptides = pd.DataFrame(list(to_remove)) + to_remove = pd.DataFrame() + + # impute missing values for clustering + ( + df_RM_scores_all_proteins_reduced, + df_RM_scores_all_proteins_imputed, + removed, + ) = missing_value_imputation( + df_RM_scores_all_proteins_reduced, round(1 - imputation_cosine_similarity, 3) + ) + removed_peptides = pd.concat([removed_peptides, removed]) + removed = 
pd.DataFrame() + + # check if RM scores dataframe is empty, if true return error and finish analysis + if df_RM_scores_all_proteins_imputed.empty: + # add removed peptides to csv file + if len(removed_peptides) > 0: + removed_peptides.columns = ["ProteinID", "PeptideID"] + removed_peptides = removed_peptides.set_index(["ProteinID"]) + + return dict( + messages=[ + dict( + level=logging.ERROR, + msg="Imputation of RM scores for clustering was unsuccessful! Too many missing values in the data!", + ) + ], + removed_peptides=removed_peptides, + ) + + if deseq2_normalization: + groups = pd.DataFrame(list(df_RM_scores.columns)) + groups.columns = ["Group", "Sample"] + df_normalization = df_RM_scores_all_proteins_imputed.copy() + + # one column per peptide, one row per sample + df_normalization = df_normalization.T + df_normalization = df_normalization.astype(float) + + # apply normalization + df_normalization = deseq2_norm(df_normalization)[0] + + # transpose back + df_normalization = df_normalization.T + + # keep only normalized RM scores which were not missing before the previous imputation + # then reimpute the missing values + df_RM_scores_all_proteins_reduced = df_normalization[ + df_RM_scores_all_proteins_reduced.isna() == False + ] + df_RM_scores_all_proteins_reduced = round(df_RM_scores_all_proteins_reduced, 5) + + # impute missing values again + ( + df_RM_scores_all_proteins_reduced, + df_RM_scores_all_proteins_imputed, + removed, + ) = missing_value_imputation( + df_RM_scores_all_proteins_reduced, + round(1 - imputation_cosine_similarity, 5), + ) + # dataframe of peptides that were removed during imputation + removed_peptides = removed_peptides.append(removed) + removed = pd.DataFrame() + + rm_score_dist_plots = fig_to_base64( + create_RM_score_distribution_plots( + df_RM_scores_all_proteins_reduced, list_groups + ) + ) + + if len(removed_peptides) > 0: + removed_peptides.columns = ["ProteinID", "PeptideID"] + removed_peptides = 
removed_peptides.set_index(["ProteinID"]) + + linkage_matrix = linkage( + df_RM_scores_all_proteins_imputed, + metric=lambda u, v: RM_score_distance(u, v, mod_cutoff), + method="average", + ) + + # create plotly figure + ( + peptide_clustering_fig, + array_RM_scores_all_proteins_reduced, + ordered_peptides, + ) = peptide_clustering( + df_RM_scores_all_proteins_reduced, + linkage_matrix, + mod_cutoff, + color_map, + ["black"] * 8, + None, + [], + ) + + # create output of the RM scores in same order as in the heatmap + output_df = pd.DataFrame(flip(array_RM_scores_all_proteins_reduced, axis=0)) + output_df.columns = df_RM_scores_all_proteins_reduced.columns.get_level_values( + "Sample" + ) + # add the ID column + output_df.index = pd.MultiIndex.from_tuples( + flip(array(df_RM_scores_all_proteins_reduced.index)[ordered_peptides]), + names=("ProteinID", "PeptideID"), + ) + output_df = output_df.reset_index() + output_df.index.names = ["ID"] + + end = time.time() + message = dict( + level=logging.INFO, + msg=f"Finished with MultiFLEX-LF analysis in {end-start:.1f} seconds.", + ) + + return dict( + RM_scores_clustered=output_df, + diff_modified=df_diff_modified, + raw_scores=df_raw_scores, + removed_peptides=removed_peptides, + RM_scores=df_RM_scores, + plots=[rm_score_dist_plots, peptide_clustering_fig] + heatmap_plots, + messages=[message], + ) + + +def create_RM_score_distribution_plots(df_RM_scores, list_groups): + """ + Constructs a figure of distribution plots of the RM scores. For every group a seperate plot is created + with the different samples in different colors. 
+ """ + + # initialize the figure with a size that accounts for 7ptx7pt plots of every sample group + fig_size = sqrt(len(list_groups)) + + if fig_size % 1 != 0: + fig_size = int(int(fig_size) + 1) + else: + fig_size = int(fig_size) + + fig = plt.figure(figsize=(7 * fig_size, 7 * fig_size)) + + # list of colors for the color coding of the different samples in one group + colors_list = color_palette( + "husl", + Counter(df_RM_scores.columns.get_level_values("Group")).most_common(1)[0][1], + ) # int(df_group_all_prots.shape[1]/2) + + # create the distribution plots for every group and apply kernel density estimation if possible + i = 1 + for group in list_groups: + df_group = df_RM_scores[group] + + ax = fig.add_subplot(fig_size, fig_size, i) + + try: + histplot( + df_group, + ax=ax, + kde=True, + stat="count", + bins=30, + palette=colors_list[: df_group.shape[1]], + edgecolor=None, + ) + except: + histplot( + df_group, + ax=ax, + kde=False, + stat="count", + bins=30, + palette=colors_list[: df_group.shape[1]], + edgecolor=None, + ) + + plt.xticks(fontsize=8) + plt.yticks(fontsize=8) + plt.xlim(0, 3) + plt.title("Group: " + group, fontsize=16) + plt.xlabel("RM score") + + # show sample legend if group contains 10 samples or less + if df_group.shape[1] > 10: + plt.legend([], [], frameon=False) + + plt.tight_layout(h_pad=2) + + i += 1 + + fig.suptitle("Distribution of RM scores of FLEXIQuant-LF ", fontsize=20) + plt.subplots_adjust(top=0.90) + + plt.close() + + return fig + + +def create_heatmap( + df_RM_scores: pd.DataFrame, protein_id: str, color_scale: str, mod_cutoff: float +): + """ + Constructs a heatmap of the RM scores for a protein + """ + + # Create Plotly subplots figure + fig = make_subplots( + rows=1, + cols=1, # +1 for colorbar + shared_yaxes=True, + horizontal_spacing=0.01, + ) + + df_RM_scores = df_RM_scores.astype(float) + fig.add_trace( + Heatmap( + z=df_RM_scores.values, + x=df_RM_scores.columns.get_level_values("Sample"), + 
y=df_RM_scores.index.get_level_values("PeptideID"), + zmin=0, + zmax=mod_cutoff * 2, + hovertemplate="Sample: %{x}
Peptide: %{y}
RM score: %{z}", + colorscale=color_scale, + showscale=True, + colorbar=dict( + title="RM score", + titleside="top", + tickmode="array", + thicknessmode="pixels", + thickness=25, + len=1, + x=1.05, + ticks="outside", + dtick=5, + ), + ), + ) + + # Update layout + fig.update_layout( + title_text=f"Protein: {protein_id}", + title_x=0.5, + height=20 * df_RM_scores.shape[0] + 200, # Adjust height to fit + autosize=True, + xaxis_title="", + yaxis_title="Peptides", + yaxis_nticks=df_RM_scores.shape[0], # Adjust number of ticks + ) + + return fig + + +def missing_value_imputation(df_RM_scores: pd.DataFrame, max_cos_dist: float): + """ + Impute missing values by calculating the median of the RM scores of all peptides + with a cosine distance (i.e. 1 - cosine similarity) of at most max_cos_dist from + the current peptide. If not all missing values of a peptide were imputed, it is + removed from further analysis. + """ + + # copy df + df_RM_scores_imputed = df_RM_scores.copy() + + for peptide in df_RM_scores.index: + # get RM scores of the current peptide + df_RM_scores_pep = df_RM_scores.loc[peptide] + + # skip if no missing value for peptide + if not df_RM_scores_pep.isna().any(): + continue + + # remove NaN values + df_RM_scores_pep = pd.DataFrame(df_RM_scores_pep.dropna()) + + # get all other peptides and keep only samples that have a RM scores for the current peptides + df_RM_scores_other_peps = df_RM_scores[df_RM_scores_pep.index].drop( + peptide, axis=0 + ) + + # calculate all pairwise cosine distances between the current peptide and all other + cos_dist_other_peps = cdist( + df_RM_scores_pep.T, df_RM_scores_other_peps, "cosine" + )[0] + + # get the index of the closest peptides + index_impute = df_RM_scores_other_peps[ + cos_dist_other_peps <= max_cos_dist + ].index + + # skip peptide if less than 2 close peptides were found + if len(index_impute) < 2: + continue + + # calculate the median RM scores of the closest peptides + df_imputation_values = ( + 
df_RM_scores.loc[index_impute].median().drop(df_RM_scores_pep.index) + ) + + # replace the missing values with the calculated values + df_RM_scores_imputed.loc[ + peptide, df_imputation_values.index + ] = df_imputation_values + + # remove all peptides that still have missing values + remove_nans = df_RM_scores_imputed[df_RM_scores_imputed.isna().any(axis=1)].index + df_RM_scores_imputed = df_RM_scores_imputed.drop(remove_nans, axis=0) + df_RM_scores = df_RM_scores.drop(remove_nans, axis=0) + + return df_RM_scores, df_RM_scores_imputed, pd.DataFrame(list(remove_nans)) + + +def RM_score_distance(u: float, v: float, mod_cutoff: float): + """ + Calculation of the customized Manhattan distance between the arrays of RM scores u and v + """ + # initialize distance vector + if len(u) == len(v): + dist = ones(len(u)) + else: + return + + # calculate absolute differences between the elements of u and v + for i in range(len(dist)): + x = u[i] + y = v[i] + + # penalize jumps from below to above the modification cutoff + if (x < mod_cutoff and y >= mod_cutoff) or (x >= mod_cutoff and y < mod_cutoff): + dist[i] = abs(x - y) + 1 + else: + dist[i] = abs(x - y) + + # return sum of the penalized absolute differences + return dist.sum() + + +def peptide_clustering( + df_RM_scores: pd.DataFrame, + linkage_matrix: pd.DataFrame, + mod_cutoff: float, + cmap: str, + colors: list[str], + clust_threshold: float, + clust_ids: list, +): + """ + Clustering results are saved as interactive HTML file with the dendrogram and the heatmap. + """ + # initialize plotly figure and create the dendrogram based on the linkage matrix + plotly_figure = create_dendrogram( + df_RM_scores.astype(float), + orientation="right", + linkagefun=lambda x: linkage_matrix, + colorscale=colors, + color_threshold=clust_threshold, + ) + + # set x-axis of the dendrogram to x2 (axis nr. 
2) + for i in range(len(plotly_figure["data"])): + plotly_figure["data"][i]["xaxis"] = "x2" + + # get order of the peptides in the dendrogram + clust_leaves = plotly_figure["layout"]["yaxis"]["ticktext"] + clust_leaves = list(map(int, clust_leaves)) + + # create numpy array from the RM scores dataframe + # and sort the peptides by the order in the dendrogram + heat_data = df_RM_scores.to_numpy() + heat_data = heat_data[clust_leaves, :] + + # define row and column labels for the heatmap + row_names = [str(i[0]) + "
Peptide: " + str(i[1]) for i in df_RM_scores.index] + row_names = list(array(row_names)[clust_leaves]) + col_names = list(df_RM_scores.columns.get_level_values(1)) + + if len(clust_ids) == 0: + # define the row IDs shown upon hovering over the cells of the heatmap + clust_id = array( + [[i] * df_RM_scores.shape[1] for i in range(len(clust_leaves) - 1, -1, -1)] + ) + else: + ## if cluster ids given add the cluster id to the hover information + clust_id = array( + [ + [str(i) + "
Cluster: " + str(clust_ids[i])] * df_RM_scores.shape[1] + for i in range(len(clust_leaves) - 1, -1, -1) + ] + ) + + # create heatmap + heatmap = Heatmap( + z=heat_data, + colorscale=cmap, + zmin=0, + zmax=mod_cutoff * 2, + customdata=clust_id, + hovertemplate="Sample: %{x}
Protein: %{y}
RM score: %{z}
ID: %{customdata}", + colorbar=dict( + title="RM score", + titleside="top", + tickmode="array", + thicknessmode="pixels", + thickness=25, + lenmode="pixels", + len=250, + yanchor="top", + y=1, + x=1.05, + ticks="outside", + dtick=5, + ), + ) + + # align y-axis of heatmap to dendrogram + heatmap["y"] = plotly_figure["layout"]["yaxis"]["tickvals"] + + # add the heatmap to plotly figure plotly_figure + plotly_figure.add_trace(heatmap) + + # edit layout of plotly_figure + figure_width = min(max(df_RM_scores.shape[1] * 50 + 100, 500), 1500) + plotly_figure.update_layout(autosize=True, height=900, font={"size": 11}) + + # amount of the figure size used for the dendrogram + dendrogram_width = 100 / figure_width + + # update x-axis of the heatmap + plotly_figure.update_layout( + xaxis={ + "domain": [dendrogram_width, 1], + "mirror": False, + "showgrid": False, + "showline": False, + "zeroline": False, + "showticklabels": True, + "ticks": "outside", + "ticktext": col_names, # list of sample names + "tickvals": arange(0, len(col_names)), + } + ) + + # update x-axis (xaxis2) of the dendrogram + plotly_figure.update_layout( + xaxis2={ + "domain": [0, dendrogram_width], + "mirror": False, + "showgrid": False, + "showline": False, + "zeroline": False, + "showticklabels": True, + "ticks": "", + } + ) + + # update y-axis of the heatmap + plotly_figure.update_layout( + yaxis={ + "domain": [0, 1], + "mirror": False, + "showgrid": False, + "showline": False, + "zeroline": False, + "showticklabels": False, + "ticks": "", + "side": "right", + "ticktext": row_names, # list of the peptides + } + ) + + # set plot title and axes lables + plotly_figure.update_layout( + title="Clustered Heatmap of RM scores", + xaxis_title="Sample", + yaxis_title="Peptides", + xaxis2_title="", + template="plotly_white", + ) + + # return the figure, the matrix of the RM scores in clustering order and the clustering order of the peptides + return plotly_figure, heat_data, clust_leaves diff --git 
a/protzilla/importing/peptide_import.py b/protzilla/importing/peptide_import.py index 3bd60f56..4ecae3d0 100644 --- a/protzilla/importing/peptide_import.py +++ b/protzilla/importing/peptide_import.py @@ -65,12 +65,12 @@ def peptide_import(file_path, intensity_name, map_to_uniprot) -> dict: by=["Sample", "Protein ID"], ignore_index=True, inplace=True ) - new_groups, filtered_proteins = clean_protein_groups( - ordered["Protein ID"].tolist(), map_to_uniprot - ) - cleaned = ordered.assign(**{"Protein ID": new_groups}) + # new_groups, filtered_proteins = clean_protein_groups( + # ordered["Protein ID"].tolist(), map_to_uniprot + # ) + # cleaned = ordered.assign(**{"Protein ID": new_groups}) - return dict(peptide_df=cleaned) + return dict(peptide_df=ordered) def evidence_import(file_path, intensity_name, map_to_uniprot) -> dict: diff --git a/protzilla/methods/data_analysis.py b/protzilla/methods/data_analysis.py index 7b79330a..079bc316 100644 --- a/protzilla/methods/data_analysis.py +++ b/protzilla/methods/data_analysis.py @@ -651,7 +651,6 @@ class MultiFLEXLF(PlotStep): "reference_group", "num_init", "mod_cutoff", - "remove_outliers", "imputation_cosine_similarity", "deseq2_normalization", "colormap", @@ -663,7 +662,6 @@ class MultiFLEXLF(PlotStep): "raw_scores", "removed_peptides", "RM_scores", - "sample_groups", ] def method(self, inputs: dict) -> dict: @@ -675,6 +673,7 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict: ) inputs["metadata_df"] = steps.metadata_df + inputs["colormap"] = int(inputs["colormap"]) return inputs diff --git a/requirements.txt b/requirements.txt index bc175e2a..c3d1439f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -19,7 +19,7 @@ pytest-django==4.5.2 pytest-order==1.1.0 restring==0.1.20 scikit-learn==1.2.2 -scipy==1.10.1 +scipy==1.11.0 statsmodels==0.13.5 umap-learn==0.5.3 Werkzeug==2.2.3 @@ -36,3 +36,4 @@ beautifulsoup4==4.12.2 sphinx==7.2.6 sphinx-autoapi==3.0.0 openpyxl==3.1.2 +pydeseq2==0.4.9 From 
ae2e0dd10d087cc4aab633c6b33c8e98c8a0db89 Mon Sep 17 00:00:00 2001 From: lucatreide Date: Mon, 8 Jul 2024 12:42:51 +0200 Subject: [PATCH 09/12] small adjustments --- protzilla/data_analysis/ptm_quantification/multiflex.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/protzilla/data_analysis/ptm_quantification/multiflex.py b/protzilla/data_analysis/ptm_quantification/multiflex.py index adff084b..324d019f 100644 --- a/protzilla/data_analysis/ptm_quantification/multiflex.py +++ b/protzilla/data_analysis/ptm_quantification/multiflex.py @@ -391,7 +391,7 @@ def multiflex_lf( end = time.time() message = dict( level=logging.INFO, - msg=f"Finished with MultiFLEX-LF analysis in {end-start:.1f} seconds.", + msg=f"Finished with MultiFLEX-LF analysis in ~{end-start:.1f} seconds.", ) return dict( From 82fbb5365da78db617198984218bbb6908198bab Mon Sep 17 00:00:00 2001 From: lucatreide Date: Thu, 11 Jul 2024 11:36:53 +0200 Subject: [PATCH 10/12] add necessary packages for multiflex --- requirements.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/requirements.txt b/requirements.txt index c3d1439f..d7491384 100644 --- a/requirements.txt +++ b/requirements.txt @@ -37,3 +37,5 @@ sphinx==7.2.6 sphinx-autoapi==3.0.0 openpyxl==3.1.2 pydeseq2==0.4.9 +seaborn==0.13.0 +matplotlib==3.8.0 From 7c6127d8bf938f931a8cf4482437fc22afd22360 Mon Sep 17 00:00:00 2001 From: lucatreide Date: Thu, 11 Jul 2024 17:25:27 +0200 Subject: [PATCH 11/12] move flexiquant implementation --- protzilla/data_analysis/ptm_quantification.py | 616 ------------------ .../ptm_quantification/flexiquant.py | 88 ++- 2 files changed, 69 insertions(+), 635 deletions(-) delete mode 100644 protzilla/data_analysis/ptm_quantification.py diff --git a/protzilla/data_analysis/ptm_quantification.py b/protzilla/data_analysis/ptm_quantification.py deleted file mode 100644 index a98ea22b..00000000 --- a/protzilla/data_analysis/ptm_quantification.py +++ /dev/null @@ -1,616 +0,0 @@ -import logging - -import 
matplotlib -import pandas as pd -from numpy import array, nan, sqrt, square -from scipy.stats import f, median_abs_deviation -from sklearn import linear_model - -matplotlib.use("Agg") -import matplotlib.pyplot as plt -from matplotlib import gridspec -from seaborn import distplot, diverging_palette, lineplot, scatterplot - -from protzilla.utilities.utilities import fig_to_base64 - -CONFIDENCE_BAND_ALPHA = 0.3 - - -def flexiquant_lf( - peptide_df: pd.DataFrame, - metadata_df: pd.DataFrame, - reference_group: str, - protein_id: str, - grouping_column: str, - num_init: int = 50, - mod_cutoff: float = 0.5, -) -> dict: - """ - FLEXIQuant-LF is a method to quantify protein modification extent in label-free proteomics data. - - Parts of the implementation have been adapted from https://github.com/SteenOmicsLab/FLEXIQuantLF. - - :param peptide_df: DataFrame containing peptide intensities. - :param metadata_df: DataFrame containing metadata. - :param reference_group: Name of the reference group. - :param protein_id: Protein ID that should be analysed. - :param num_init: Number of initializations for RANSAC regression. - :param mod_cutoff: RM score cutoff value for modified peptides. 
- """ - - df = peptide_df[peptide_df["Protein ID"] == protein_id].pivot_table( - index="Sample", columns="Sequence", values="Intensity", aggfunc="first" - ) - df.reset_index(inplace=True) - - df = pd.merge( - left=df, - right=metadata_df[["Sample", grouping_column]], - on="Sample", - copy=False, - ) - - if not grouping_column in df: - return dict( - messages=[ - dict( - level=logging.ERROR, - msg=f"No {grouping_column} column found in provided dataframe.", - ) - ] - ) - - # delete columns where all entries are nan - df.dropna(how="all", axis=1, inplace=True) - - if reference_group not in df[grouping_column].unique(): - return dict( - messages=[ - dict( - level=logging.ERROR, - msg=f"Reference sample '{reference_group}' not found in provided data.", - ) - ] - ) - else: - # filter dataframe for controls - df_control = df[df[grouping_column] == reference_group] - - # get modified peptides - modified = [] - for elm in df_control.columns: - if "ph" in elm or "ac" in elm or "gly" in elm: - modified.append(elm) - - # delete modified peptides - df_control: pd.DataFrame = df_control.drop(modified, axis=1) - df.drop(modified, inplace=True, axis=1) - - # delete Group column - group_column = df[grouping_column] - df_control.drop(grouping_column, axis=1, inplace=True) - df.drop(grouping_column, axis=1, inplace=True) - - sample_column = df["Sample"] - - # calculate median intensities for unmodified peptides of control - median_intensities = df_control.median(axis=0) - - # initiate empty lists to save results of linear regressions - slope_list = [] - r2_score_list_model = [] - r2_score_list_data = [] - reproducibility_list = [] - - df_distance_RL = df.copy() - - regression_plots = [] - - plot_dict = {} - - for idx, row in df.iterrows(): - df_train = pd.DataFrame( - {"Sample intensity": row, "Reference intensity": median_intensities} - ) - - df_train.dropna(inplace=True) - - df_train.sort_index(inplace=True, axis=0) - - # if number of peptides is smaller than 5, skip sample and 
continue with next interation - if len(df_train) < 5: - # set all metrices to nan - df_distance_RL.loc[idx] = nan - df_train["CB low"] = nan - df_train["CB high"] = nan - slope_list.append(nan) - r2_score_list_model.append(nan) - r2_score_list_data.append(nan) - reproducibility_list.append(nan) - - continue - - # initiate empty list to save results of linear regression - list_model_slopes = [] - list_r2_model = [] - list_r2_data = [] - all_iter_slopes = [] - all_iter_r2_data = [] - all_iter_r2_model = [] - - # set training data - X = array(df_train["Reference intensity"]).reshape(-1, 1) - y = df_train["Sample intensity"] - - sq_mad = square(median_abs_deviation(df_train["Sample intensity"])) - - for i in range(num_init): - ransac_model = linear_model.RANSACRegressor( - estimator=linear_model.LinearRegression(fit_intercept=False, n_jobs=-2), - max_trials=1000, - stop_probability=1, - min_samples=0.5, - residual_threshold=sq_mad, - ) - - ransac_model.fit(X, y) - - slope = float(ransac_model.estimator_.coef_) - - inlier_mask = ransac_model.inlier_mask_ - - df_train["Outlier"] = ~inlier_mask.astype(bool) - - # calculate R2 score based on inliers - df_train_inlier = df_train[~df_train["Outlier"]] - X_inlier = array(df_train_inlier["Reference intensity"]).reshape(-1, 1) - y_inlier = df_train_inlier["Sample intensity"] - r2_score_model = round(ransac_model.score(X_inlier, y_inlier), 4) - r2_score_data = round(ransac_model.score(X, y), 4) - - list_model_slopes.append(slope) - list_r2_model.append(r2_score_model) - list_r2_data.append(r2_score_data) - - all_iter_slopes.append(list_model_slopes) - all_iter_r2_model.append(list_r2_model) - all_iter_r2_data.append(list_r2_data) - - # determine best model based on r2 scores - best_model = list_r2_model.index(max(list_r2_model)) - - # save slope of best model to slope_list - slope_list.append(list_model_slopes[best_model]) - slope = list_model_slopes[best_model] - - # calculate reproducibility factor and save to list - 
series_slopes = pd.Series(list_model_slopes) - reproducibility_factor = max(series_slopes.value_counts()) / num_init - reproducibility_list.append(reproducibility_factor) - - # get r2 scores of best model - r2_score_model = list_r2_model[best_model] - r2_score_data = list_r2_data[best_model] - - # save best r2 score to lists - r2_score_list_model.append(r2_score_model) - r2_score_list_data.append(r2_score_data) - - # calculate confidence band - alpha = 0.3 - df_distance_RL, df_train = calculate_confidence_band( - slope, - median_intensities, - df_train, - X, - y, - row, - idx, - df_distance_RL, - CONFIDENCE_BAND_ALPHA, - ) - - # plot scatter plot with regression line - - plot_dict[sample_column[idx]] = [ - df_train, - idx, - r2_score_model, - r2_score_data, - slope, - alpha, - ] - - df_distance_RL["Slope"] = slope_list - df_raw_scores = calc_raw_scores(df_distance_RL, median_intensities) - - # Assume df_raw_scores is your input DataFrame - # calculate MAD per sample - df_raw_scores.drop("Slope", axis=1, inplace=True) - df_raw_scores_T = df_raw_scores.T - df_raw_scores_T = df_raw_scores_T.apply(pd.to_numeric, errors="coerce") - mad = df_raw_scores_T.mad(axis=0) - median = df_raw_scores_T.median(axis=0) - - # calculate cutoff value for each time point (> 3*MAD) - cutoff = median + 3 * mad - - # remove peptides with raw scores > cutoff for each sample - df_raw_scores_T_cutoff = df_raw_scores_T[ - round(df_raw_scores_T, 5) <= round(cutoff, 5) - ] - removed = pd.Series( - df_raw_scores_T_cutoff.index[df_raw_scores_T_cutoff.isna().all(axis=1)] - ) - df_raw_scores_T_cutoff.dropna(axis=0, how="all", inplace=True) - df_raw_scores_cutoff = df_raw_scores_T_cutoff.T - - # apply t3median normalization to calculate RM scores - df_RM = normalize_t3median(df_raw_scores_cutoff) - - # check if peptides are modified (RM score below modification cutoff) - df_RM_mod = df_RM < mod_cutoff - - df_raw_scores["Slope"] = slope_list - df_raw_scores["R2 model"] = r2_score_list_model - 
df_raw_scores["R2 data"] = r2_score_list_data - df_raw_scores["Reproducibility factor"] = reproducibility_list - - df_RM["Slope"] = slope_list - df_RM["R2 model"] = r2_score_list_model - df_RM["R2 data"] = r2_score_list_data - df_RM["Reproducibility factor"] = reproducibility_list - - # add Group column again - df_raw_scores[grouping_column] = group_column - df_RM[grouping_column] = group_column - df_RM_mod[grouping_column] = group_column - - # add Sample column again - df_raw_scores["Sample"] = sample_column - df_RM["Sample"] = sample_column - df_RM_mod["Sample"] = sample_column - - for sample in sample_column: - if sample in plot_dict: - regression_plots.append( - fig_to_base64( - create_regression_plots( - *plot_dict[sample], - sample_column, - df_RM[df_RM["Sample"] == sample].iloc[0], - mod_cutoff=mod_cutoff, - grouping_column=grouping_column, - ) - ) - ) - - messages = [] - if len(regression_plots) == 0: - messages.append( - dict( - level=logging.WARNING, - msg="No samples were processed. This is probably due to the fact that there are not enough valid peptides in the samples.", - ) - ) - else: - if len(regression_plots) == len(sample_column): - messages.append( - dict( - level=logging.INFO, - msg=f"All {len(sample_column)} samples have been processed successfully. {len(removed)} peptides have been removed.", - ) - ) - else: - messages.append( - dict( - level=logging.INFO, - msg=f"{len(regression_plots)}/{len(sample_column)} samples have been processed successfully. " - f"The remaining samples have been skipped due to insufficient valid peptides. 
{len(removed)} peptides have been removed.", - ) - ) - - return dict( - raw_scores=df_raw_scores, - RM_scores=df_RM[df_RM.columns[::-1]], - diff_modified=df_RM_mod[df_RM_mod.columns[::-1]], - removed_peptides=removed.to_list(), - plots=regression_plots, - messages=messages, - ) - - -def calculate_confidence_band( - slope: float, - median_int: float, - dataframe_train: pd.DataFrame, - X: array, - y: pd.Series, - row: pd.Series, - idx: int, - matrix_distance_RL: pd.DataFrame, - alpha: float, -): - """ - Calculates confidence bands arround the regression line. - - :param slope: Slope of the regression line. - :param median_int: Median intensity of the reference group. - :param dataframe_train: DataFrame containing the training data. - :param X: Array containing the reference intensities. - :param y: Series containing the sample intensities. - :param row: Series containing the sample intensities. - :param idx: Index of the sample. - :param matrix_distance_RL: DataFrame containing the distances to the regression line. - :param alpha: Alpha value for the confidence band. 
- """ - - # calculate predicted intensity with Reference intensity of a peptide and slope of the sample (Y hat) - Y_pred = slope * median_int - - # calculate W - N = len(dataframe_train) - F = f.ppf(q=1 - alpha, dfn=2, dfd=N - 1) - W = sqrt(2 * F) - - # calculate standard deviation (s(Y hat)) - # calculate prediction error - error = y - Y_pred - error.dropna(inplace=True) - - # calculate mean squared error - MSE = sum(error.apply(square)) / (N - 1) - - # calculate mean X intensity - X_bar = dataframe_train["Reference intensity"].mean() - - # iterate over all peptides of the sample - CB_low = [] - CB_high = [] - - # iterate through median peptide intensities - for idx_2, elm in dataframe_train["Reference intensity"].items(): - # calculate squared distance to mean X (numerator) - dist_X_bar = square(elm - X_bar) - - # calculate sum of squared distances to mean X(denominator) - sum_dist_X_bar = sum(square(X - X_bar)) - - # calculate standard deviation - s = float(sqrt(MSE * ((1 / N) + (dist_X_bar / sum_dist_X_bar)))) - - # calculate predicted intensity for given X - Y_hat = slope * elm - - # calculate high and low CB values and append to list - cb_low = Y_hat - W * s - cb_high = Y_hat + W * s - - CB_low.append(cb_low) - CB_high.append(cb_high) - - # calculate predicted intensities - pred_ints = median_int * slope - - # calculate distance to regression line - distance_RL = pred_ints - row - - # save distances in matrix_distance - matrix_distance_RL.loc[idx] = distance_RL - - # add CBs as columns to dataframe_train - dataframe_train["CB low"] = CB_low - dataframe_train["CB high"] = CB_high - - return matrix_distance_RL, dataframe_train - - -def create_regression_plots( - dataframe_train: pd.DataFrame, - idx: int, - r2_score_model: float, - r2_score_data: float, - slope: float, - alpha: float, - sample_column: pd.Series, - rm_scores: pd.DataFrame, - mod_cutoff: float, - grouping_column: str, -): - """ - Creates a scatter plot with regression line and confidence bands. 
- - :param dataframe_train: DataFrame containing the training data. - :param idx: Index of the sample. - :param r2_score_model: R2 score of the model. - :param r2_score_data: R2 score of the data. - :param slope: Slope of the regression line. - :param alpha: Alpha value for the confidence band. - :param sample_column: Series containing the sample names. - :param rm_scores: DataFrame containing the RM scores. - :param mod_cutoff: RM score cutoff value for modified peptides. - :param grouping_column: Name of the grouping column. - """ - - # create new figure with two subplots - fig = plt.figure(figsize=(16, 9)) - gs = gridspec.GridSpec(2, 1, height_ratios=[1, 6]) - ax1 = plt.subplot(gs[1]) - ax0 = plt.subplot(gs[0], sharex=ax1) - - # set space between subplots - gs.update(hspace=0.05) - - # plot histogram in upper subplot - plt.sca(ax0) - - # add title - plt.title("RANSAC Linear Regression of Sample " + str(sample_column[idx])) - - # plot histogram - distplot(a=dataframe_train["Reference intensity"], bins=150, kde=False) - - # remove axis and tick labels - plt.xlabel("") - plt.tick_params( - axis="x", # changes apply to the x-axis - which="both", # both major and minor ticks are affected - bottom=True, # ticks along the bottom edge are off - top=False, # ticks along the top edge are off - labelbottom=False, - ) # labels along the bottom edge are off - - # plot scatter plot - plt.sca(ax1) - - rm_scores = rm_scores.drop( - [ - "Slope", - "R2 model", - "R2 data", - "Reproducibility factor", - grouping_column, - "Sample", - ] - ) - # rm_scores.dropna(inplace=True) - rm_scores.clip(0, 1, inplace=True) - - rm_scores = rm_scores.to_frame(name="RM score") - # outliers in dataframe_train don't have an RM score - rm_scores = dataframe_train.merge( - rm_scores, left_index=True, right_index=True, how="left" - ) - rm_scores.fillna(-1, inplace=True) - - palette = diverging_palette(h_neg=0, h_pos=120, as_cmap=True, center="dark") - - def cmap(values: list[float]): - nanIdx = set([i 
for i, x in enumerate(values) if x == -1]) - return [ - color if i not in nanIdx else [0.75, 0.75, 0.75, 1.0] - for i, color in enumerate(palette(values)) - ] - - scatterplot( - x="Reference intensity", - y="Sample intensity", - data=dataframe_train, - hue=list(rm_scores.index), - palette=cmap(scale_to_mod_cutoff(list(rm_scores["RM score"]), mod_cutoff)), - ) - - # draw regression line - line_label = "R2 model: " + str(r2_score_model) + "\nR2 data: " + str(r2_score_data) - max_int = dataframe_train["Reference intensity"].max() - min_int = min( - dataframe_train["Reference intensity"].min(), - dataframe_train["Sample intensity"].min(), - ) - X = [min_int - 2, max_int] - y = [min_int - 2, slope * max_int] - plt.plot(X, y, color="darkblue", linestyle="-", label=line_label) - - # draw confidence band - lineplot( - x="Reference intensity", - y="CB low", - data=dataframe_train, - color="darkgreen", - label="CB, alpha=" + str(alpha), - ) - lineplot( - x="Reference intensity", y="CB high", data=dataframe_train, color="darkgreen" - ) - - # set line style of CB lines to dashed - for i in [len(ax1.lines) - 1, len(ax1.lines) - 2]: - ax1.lines[i].set_linestyle("--") - - # create legend if sample has 20 peptides or less otherwise don't create a legend - if len(dataframe_train) <= 20: - # set right x axis limit - plt.gca().set_xlim(right=1.4 * max_int) - plt.legend() - else: - plt.gca().get_legend().remove() - - # set y axis label - plt.ylabel("Intensity sample " + str(sample_column[idx])) - plt.xlabel("Reference intensity") - - return fig - - -def calc_raw_scores(df_distance: pd.DataFrame, median_int: pd.Series): - """ - Calculates raw scores for each sample based on the distance to the regression line. - - :param df_distance: DataFrame containing the distances to the regression line. 
- :param median_int: Median intensity of the reference group - """ - # copy df_distance - df_rs = df_distance.copy() - - # iterate through rows of df_distance (samples) - for idx, row in df_distance.iterrows(): - # extract slope - slope = row["Slope"] - - # delete slope from row - row.drop("Slope", inplace=True) - - # calculate raw scores - raw_scores = 1 - row / (slope * median_int) - - # add slope to raw scores - raw_scores["Slope"] = slope - - # replace idx row in df_RM_score with calculated raw scores - df_rs.loc[idx] = raw_scores - - return df_rs - - -def normalize_t3median(dataframe: pd.DataFrame): - """ - Applies Top3 median normalization to dataframe. - Determines the median of the three highest values in each row and divides every value in the row by it. - - :param dataframe: DataFrame containing the data to be normalized. - """ - # copy dataframe - dataframe_t3med = dataframe.copy() - - # for each row, normalize values by dividing each value by the median - # of the three highest values of the row - # iterate over rows of dataframe - for idx, row in dataframe.iterrows(): - # calculate the median of the three highest values - median_top3 = row.nlargest(3).median() - - # normalize each value of row by dividing by median_top3 - row_norm = row / median_top3 - - # update row in dataframe_norm with row_norm - dataframe_t3med.loc[idx] = row_norm - - return dataframe_t3med - - -def scale_to_mod_cutoff(values: list[float], cutoff: float) -> list[float]: - """ - Scales values to a cutoff value. - - :param values: List of values to be scaled. - :param cutoff: Cutoff value. 
- """ - - return [ - 0.5 + (v - cutoff) * 0.5 / (1 - cutoff) - if v >= 0.5 - else v * 0.5 / cutoff - if v >= 0 - else v - for v in values - ] diff --git a/protzilla/data_analysis/ptm_quantification/flexiquant.py b/protzilla/data_analysis/ptm_quantification/flexiquant.py index bbc30be3..a98ea22b 100644 --- a/protzilla/data_analysis/ptm_quantification/flexiquant.py +++ b/protzilla/data_analysis/ptm_quantification/flexiquant.py @@ -21,6 +21,7 @@ def flexiquant_lf( metadata_df: pd.DataFrame, reference_group: str, protein_id: str, + grouping_column: str, num_init: int = 50, mod_cutoff: float = 0.5, ) -> dict: @@ -28,6 +29,13 @@ def flexiquant_lf( FLEXIQuant-LF is a method to quantify protein modification extent in label-free proteomics data. Parts of the implementation have been adapted from https://github.com/SteenOmicsLab/FLEXIQuantLF. + + :param peptide_df: DataFrame containing peptide intensities. + :param metadata_df: DataFrame containing metadata. + :param reference_group: Name of the reference group. + :param protein_id: Protein ID that should be analysed. + :param num_init: Number of initializations for RANSAC regression. + :param mod_cutoff: RM score cutoff value for modified peptides. 
""" df = peptide_df[peptide_df["Protein ID"] == protein_id].pivot_table( @@ -37,17 +45,17 @@ def flexiquant_lf( df = pd.merge( left=df, - right=metadata_df[["Sample", "Group"]], + right=metadata_df[["Sample", grouping_column]], on="Sample", copy=False, ) - if not "Group" in df: + if not grouping_column in df: return dict( messages=[ dict( level=logging.ERROR, - msg="No 'Group' column found in provided dataframe.", + msg=f"No {grouping_column} column found in provided dataframe.", ) ] ) @@ -55,7 +63,7 @@ def flexiquant_lf( # delete columns where all entries are nan df.dropna(how="all", axis=1, inplace=True) - if reference_group not in df["Group"].unique(): + if reference_group not in df[grouping_column].unique(): return dict( messages=[ dict( @@ -66,7 +74,7 @@ def flexiquant_lf( ) else: # filter dataframe for controls - df_control = df[df["Group"] == reference_group] + df_control = df[df[grouping_column] == reference_group] # get modified peptides modified = [] @@ -79,9 +87,9 @@ def flexiquant_lf( df.drop(modified, inplace=True, axis=1) # delete Group column - group_column = df["Group"] - df_control.drop("Group", axis=1, inplace=True) - df.drop("Group", axis=1, inplace=True) + group_column = df[grouping_column] + df_control.drop(grouping_column, axis=1, inplace=True) + df.drop(grouping_column, axis=1, inplace=True) sample_column = df["Sample"] @@ -254,9 +262,9 @@ def flexiquant_lf( df_RM["Reproducibility factor"] = reproducibility_list # add Group column again - df_raw_scores["Group"] = group_column - df_RM["Group"] = group_column - df_RM_mod["Group"] = group_column + df_raw_scores[grouping_column] = group_column + df_RM[grouping_column] = group_column + df_RM_mod[grouping_column] = group_column # add Sample column again df_raw_scores["Sample"] = sample_column @@ -272,6 +280,7 @@ def flexiquant_lf( sample_column, df_RM[df_RM["Sample"] == sample].iloc[0], mod_cutoff=mod_cutoff, + grouping_column=grouping_column, ) ) ) @@ -324,6 +333,16 @@ def 
calculate_confidence_band( ): """ Calculates confidence bands around the regression line. + + :param slope: Slope of the regression line. + :param median_int: Median intensity of the reference group. + :param dataframe_train: DataFrame containing the training data. + :param X: Array containing the reference intensities. + :param y: Series containing the sample intensities. + :param row: Series containing the sample intensities. + :param idx: Index of the sample. + :param matrix_distance_RL: DataFrame containing the distances to the regression line. + :param alpha: Alpha value for the confidence band. """ # calculate predicted intensity with Reference intensity of a peptide and slope of the sample (Y hat) @@ -396,9 +415,21 @@ def create_regression_plots( sample_column: pd.Series, rm_scores: pd.DataFrame, mod_cutoff: float, + grouping_column: str, ): """ Creates a scatter plot with regression line and confidence bands. + + :param dataframe_train: DataFrame containing the training data. + :param idx: Index of the sample. + :param r2_score_model: R2 score of the model. + :param r2_score_data: R2 score of the data. + :param slope: Slope of the regression line. + :param alpha: Alpha value for the confidence band. + :param sample_column: Series containing the sample names. + :param rm_scores: DataFrame containing the RM scores. + :param mod_cutoff: RM score cutoff value for modified peptides. + :param grouping_column: Name of the grouping column. 
""" # create new figure with two subplots @@ -433,7 +464,14 @@ def create_regression_plots( plt.sca(ax1) rm_scores = rm_scores.drop( - ["Slope", "R2 model", "R2 data", "Reproducibility factor", "Group", "Sample"] + [ + "Slope", + "R2 model", + "R2 data", + "Reproducibility factor", + grouping_column, + "Sample", + ] ) # rm_scores.dropna(inplace=True) rm_scores.clip(0, 1, inplace=True) @@ -507,6 +545,9 @@ def cmap(values: list[float]): def calc_raw_scores(df_distance: pd.DataFrame, median_int: pd.Series): """ Calculates raw scores for each sample based on the distance to the regression line. + + :param df_distance: DataFrame containing the distances to the regression line. + :param median_int: Median intensity of the reference group """ # copy df_distance df_rs = df_distance.copy() @@ -533,8 +574,10 @@ def calc_raw_scores(df_distance: pd.DataFrame, median_int: pd.Series): def normalize_t3median(dataframe: pd.DataFrame): """ - Applies Top3 median normalization to dataframe - Determines the median of the three highest values in each row and divides every value in the row by it + Applies Top3 median normalization to dataframe. + Determines the median of the three highest values in each row and divides every value in the row by it. + + :param dataframe: DataFrame containing the data to be normalized. """ # copy dataframe dataframe_t3med = dataframe.copy() @@ -556,11 +599,18 @@ def normalize_t3median(dataframe: pd.DataFrame): def scale_to_mod_cutoff(values: list[float], cutoff: float) -> list[float]: + """ + Scales values to a cutoff value. + + :param values: List of values to be scaled. + :param cutoff: Cutoff value. 
+ """ + return [ - ( - 0.5 + (v - cutoff) * 0.5 / (1 - cutoff) - if v >= 0.5 - else v * 0.5 / cutoff if v >= 0 else v - ) + 0.5 + (v - cutoff) * 0.5 / (1 - cutoff) + if v >= 0.5 + else v * 0.5 / cutoff + if v >= 0 + else v for v in values ] From d3ae110f5f5672c59350fdd3b770f56a87e47aa9 Mon Sep 17 00:00:00 2001 From: lucatreide Date: Fri, 12 Jul 2024 11:22:26 +0200 Subject: [PATCH 12/12] fix grouping column --- protzilla/methods/data_analysis.py | 1 + ui/runs/forms/data_analysis.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/protzilla/methods/data_analysis.py b/protzilla/methods/data_analysis.py index fa449c7a..0eba72e5 100644 --- a/protzilla/methods/data_analysis.py +++ b/protzilla/methods/data_analysis.py @@ -706,6 +706,7 @@ class FLEXIQuantLF(PlotStep): "peptide_df", "metadata_df", "reference_group", + "grouping_column", "protein_id", "num_init", "mod_cutoff", diff --git a/ui/runs/forms/data_analysis.py b/ui/runs/forms/data_analysis.py index 620cc747..2ff6fdf7 100644 --- a/ui/runs/forms/data_analysis.py +++ b/ui/runs/forms/data_analysis.py @@ -1007,7 +1007,7 @@ class FLEXIQuantLFForm(MethodForm): def fill_form(self, run: Run) -> None: self.fields["peptide_df"].choices = fill_helper.get_choices(run, "peptide_df") self.fields["grouping_column"].choices = fill_helper.to_choices( - run.steps.metadata_df.drop("Sample", axis=1).columns[1:] + run.steps.metadata_df.drop("Sample", axis=1).columns ) chosen_grouping_column = self.data.get(