From ae8a7b3e34d84baf46274f60a4ccfef3d470692a Mon Sep 17 00:00:00 2001 From: AK Date: Fri, 21 Jun 2024 16:56:19 +0200 Subject: [PATCH 01/52] Fixed import for circadian mouse data --- protzilla/importing/ms_data_import.py | 8 +- protzilla/importing/peptide_import.py | 1 + user_data/workflows/workflow_Plot-Thesis.yaml | 94 +++++++++++++++++++ 3 files changed, 102 insertions(+), 1 deletion(-) create mode 100644 user_data/workflows/workflow_Plot-Thesis.yaml diff --git a/protzilla/importing/ms_data_import.py b/protzilla/importing/ms_data_import.py index d102d7ca..e8cf2df6 100644 --- a/protzilla/importing/ms_data_import.py +++ b/protzilla/importing/ms_data_import.py @@ -191,9 +191,15 @@ def clean_protein_groups(protein_groups, map_to_uniprot=True): found_ids_per_group = [] # go through all groups and find the valid proteins # non uniprot ids are put into extracted_ids, so they can be mapped + extract_protein_id_regex = re.compile(r'\|([^|]+)\|') + + # Function to extract protein IDs from the formatted string + def extract_protein_ids(protein_group_str): + return extract_protein_id_regex.findall(protein_group_str) + for group in protein_groups: found_in_group = set() - for protein_id in group.split(";"): + for protein_id in extract_protein_ids(group) or group.split(";"): if not protein_id.startswith("ENSP") and ( match := uniprot_regex.search(protein_id) ): diff --git a/protzilla/importing/peptide_import.py b/protzilla/importing/peptide_import.py index 2a393fbd..b1c1e96f 100644 --- a/protzilla/importing/peptide_import.py +++ b/protzilla/importing/peptide_import.py @@ -87,6 +87,7 @@ def evidence_import(file_path, intensity_name, map_to_uniprot) -> dict: "Missed cleavages", "PEP", "Raw file", + "Retention time", ] read = pd.read_csv( diff --git a/user_data/workflows/workflow_Plot-Thesis.yaml b/user_data/workflows/workflow_Plot-Thesis.yaml new file mode 100644 index 00000000..0bdccd0f --- /dev/null +++ b/user_data/workflows/workflow_Plot-Thesis.yaml @@ -0,0 +1,94 @@ +df_mode: disk_memory +steps: +- form_inputs: + feature_orientation: Columns (samples in rows, features in columns) + inputs: {} + instance_identifier: MetadataImport_1 + type: MetadataImport +- form_inputs: {} + inputs: {} + instance_identifier: EvidenceImport_1 + type: EvidenceImport +- form_inputs: + percentage: 0.5 + inputs: {} + instance_identifier: FilterProteinsBySamplesMissing_1 + plot_inputs: + graph_type: Bar chart + type: FilterProteinsBySamplesMissing +- form_inputs: + deviation_threshold: 2.0 + inputs: {} + instance_identifier: FilterSamplesByProteinIntensitiesSum_1 + plot_inputs: + graph_type: Bar chart + type: FilterSamplesByProteinIntensitiesSum +- form_inputs: + number_of_neighbours: 5 + inputs: {} + instance_identifier: ImputationByKNN_1 + plot_inputs: + graph_type: Boxplot + graph_type_quantities: Bar chart + group_by: None + visual_transformation: log10 + type: ImputationByKNN +- form_inputs: + number_of_neighbors: 20 + inputs: {} + instance_identifier: OutlierDetectionByLocalOutlierFactor_1 + plot_inputs: {} + type: OutlierDetectionByLocalOutlierFactor +- form_inputs: + percentile: 0.5 + inputs: {} + instance_identifier: NormalisationByMedian_1 + plot_inputs: + graph_type: Boxplot + group_by: None + visual_transformation: log10 + type: NormalisationByMedian +- form_inputs: + log_base: log2 + inputs: {} + instance_identifier: TransformationLog_1 + plot_inputs: + graph_type: Histogram + group_by: None + type: TransformationLog +- form_inputs: + similarity_measure: euclidean distance + inputs: {} + instance_identifier: PlotProtQuant_1 + type: PlotProtQuant +- form_inputs: + alpha: 0.05 + inputs: {} + instance_identifier: DifferentialExpressionTTest_1 + type: DifferentialExpressionTTest +- form_inputs: + fc_threshold: 1 + inputs: {} + instance_identifier: PlotVolcano_1 + type: PlotVolcano +- form_inputs: + differential_expression_threshold: 1 + direction: both + gene_sets_restring: [] + organism: 9606 + inputs: {} + instance_identifier: EnrichmentAnalysisGOAnalysisWithString_1 + type: EnrichmentAnalysisGOAnalysisWithString +- form_inputs: + colors: [] + cutoff: 0.05 + gene_sets: + - Process + - Component + - Function + - KEGG + top_terms: 10 + value: p-value + inputs: {} + instance_identifier: PlotGOEnrichmentBarPlot_1 + type: PlotGOEnrichmentBarPlot From 53d6bafe458e43cb8f33dcbe40529058057c6b34 Mon Sep 17 00:00:00 2001 From: AK Date: Tue, 2 Jul 2024 11:10:34 +0200 Subject: [PATCH 02/52] Plotquantplot for peptide --- .../data_preprocessing/peptide_filter.py | 64 ++++++++++++++ protzilla/methods/data_analysis.py | 20 +++++ protzilla/methods/data_preprocessing.py | 4 +- ui/runs/form_mapping.py | 1 + ui/runs/forms/data_analysis.py | 69 ++++++--------- ui/runs/forms/data_preprocessing.py | 1 - user_data/workflows/workflow_Plot-Thesis.yaml | 86 +------------------ 7 files changed, 115 insertions(+), 130 deletions(-) diff --git a/protzilla/data_preprocessing/peptide_filter.py b/protzilla/data_preprocessing/peptide_filter.py index 3b1caee9..67745d15 100644 --- a/protzilla/data_preprocessing/peptide_filter.py +++ b/protzilla/data_preprocessing/peptide_filter.py @@ -50,3 +50,67 @@ def by_pep_value_plot(method_inputs, method_outputs, graph_type): elif graph_type == "Bar chart": fig = create_bar_plot(**value_dict) return [fig] + +def by_samples_missing( + protein_df: pd.DataFrame | None, + peptide_df: pd.DataFrame | None, + percentage: float = 0.5, +) -> dict: + """ + This function filters proteins based on the amount of samples with nan values, if the percentage of nan values + is below a threshold (percentage). + + :param protein_df: the protein dataframe that should be filtered + :param peptide_df: the peptide dataframe that should be filtered in accordance to the intensity dataframe (optional) + :param percentage: ranging from 0 to 1. Defining the relative share of samples the proteins need to be present in, + in order for the protein to be kept. + :return: returns the filtered df as a Dataframe and a dict with a list of Protein IDs that were discarded + and a list of Protein IDs that were kept + """ + + filter_threshold: int = percentage * len(protein_df.Sample.unique()) + transformed_df = long_to_wide(protein_df) + + remaining_proteins_list = transformed_df.dropna( + axis=1, thresh=filter_threshold + ).columns.tolist() + filtered_proteins_list = ( + transformed_df.drop(remaining_proteins_list, axis=1).columns.unique().tolist() + ) + filtered_df = protein_df[ + (protein_df["Protein ID"].isin(remaining_proteins_list)) + ] + filtered_peptide_df = None + if peptide_df is not None: + filtered_peptide_df = peptide_df[ + (peptide_df["Protein ID"].isin(remaining_proteins_list)) + ] + return dict( + protein_df=filtered_df, + peptide_df=filtered_peptide_df, + filtered_proteins=filtered_proteins_list, + remaining_proteins=remaining_proteins_list, + ) + + +def _build_pie_bar_plot(remaining_proteins, filtered_proteins, graph_type): + if graph_type == "Pie chart": + fig = create_pie_plot( + values_of_sectors=[ + len(remaining_proteins), + len(filtered_proteins), + ], + names_of_sectors=["Proteins kept", "Proteins filtered"], + heading="Number of Filtered Proteins", + ) + elif graph_type == "Bar chart": + fig = create_bar_plot( + values_of_sectors=[ + len(remaining_proteins), + len(filtered_proteins), + ], + names_of_sectors=["Proteins kept", "Proteins filtered"], + heading="Number of Filtered Proteins", + y_title="Number of Proteins", + ) + return [fig] \ No newline at end of file diff --git a/protzilla/methods/data_analysis.py b/protzilla/methods/data_analysis.py index 79982ea2..c7f110e3 100644 --- a/protzilla/methods/data_analysis.py +++ b/protzilla/methods/data_analysis.py @@ -18,6 +18,7 @@ prot_quant_plot, scatter_plot, ) +from protzilla.data_analysis.prot_quant_plot_peptide import prot_quant_plot_peptide from protzilla.data_analysis.protein_graphs import peptides_to_isoform, variation_graph from protzilla.methods.data_preprocessing import TransformationLog from protzilla.steps import Plots, Step, StepManager @@ -252,6 +253,25 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict: ) return inputs +class PlotProtQuantPeptide(PlotStep): + display_name = "Protein Quantification Plot For Peptide" + operation = "plot" + method_description = ( + "Creates a line chart for intensity across samples for protein groups" + ) + + input_keys = ["input_df", "protein_group", "similarity_measure", "similarity"] + output_keys = [] + + def method(self, inputs: dict) -> dict: + return prot_quant_plot_peptide(**inputs) + + def insert_dataframes(self, steps: StepManager, inputs) -> dict: + inputs["input_df"] = steps.get_step_output( + Step, "peptide_df", inputs["input_df"] + ) + return inputs + class PlotPrecisionRecallCurve(PlotStep): display_name = "Precision Recall" diff --git a/protzilla/methods/data_preprocessing.py b/protzilla/methods/data_preprocessing.py index 0565eaf0..50373899 100644 --- a/protzilla/methods/data_preprocessing.py +++ b/protzilla/methods/data_preprocessing.py @@ -329,8 +329,8 @@ class FilterPeptidesByPEPThreshold(DataPreprocessingStep): operation = "filter_peptides" method_description = "Filter by PEP-threshold" - input_keys = ["protein_df", "peptide_df", "threshold"] - output_keys = ["protein_df", "peptide_df", "filtered_peptides"] + input_keys = ["peptide_df", "threshold"] + output_keys = ["peptide_df", "filtered_peptides"] def method(self, inputs): return peptide_filter.by_pep_value(**inputs) diff --git a/ui/runs/form_mapping.py b/ui/runs/form_mapping.py index ab27a219..dcfc6306 100644 --- a/ui/runs/form_mapping.py +++ b/ui/runs/form_mapping.py @@ -47,6 +47,7 @@ data_analysis.PlotScatterPlot: data_analysis_forms.PlotScatterPlotForm, data_analysis.PlotClustergram: data_analysis_forms.PlotClustergramForm, data_analysis.PlotProtQuant: data_analysis_forms.PlotProtQuantForm, + data_analysis.PlotProtQuantPeptide: data_analysis_forms.PlotProtQuantPeptideForm, data_analysis.PlotPrecisionRecallCurve: data_analysis_forms.PlotPrecisionRecallCurveForm, data_analysis.PlotROC: data_analysis_forms.PlotROCCurveForm, data_analysis.ClusteringKMeans: data_analysis_forms.ClusteringKMeansForm, diff --git a/ui/runs/forms/data_analysis.py b/ui/runs/forms/data_analysis.py index 09c6a73c..de7c3fa4 100644 --- a/ui/runs/forms/data_analysis.py +++ b/ui/runs/forms/data_analysis.py @@ -368,56 +368,35 @@ class PlotProtQuantForm(MethodForm): label="Similarity", min_value=-1, max_value=999, step_size=1, initial=1 ) +class PlotProtQuantPeptideForm(MethodForm): + is_dynamic = True + + input_df = CustomChoiceField( + choices=[], + label="Choose dataframe to be plotted", + ) + protein_group = CustomChoiceField( + choices=[], + label="Protein group: choose highlighted protein group", + ) + similarity_measure = CustomChoiceField( + choices=SimilarityMeasure, + label="Similarity Measurement: choose how to compare protein groups", + initial=SimilarityMeasure.euclidean_distance, + ) + similarity = CustomNumberField( + label="Similarity", min_value=-1, max_value=999, step_size=1, initial=1 + ) + def fill_form(self, run: Run) -> None: - self.fields["input_df"].choices = fill_helper.get_choices_for_protein_df_steps( - run - ) + self.fields["input_df"].choices = fill_helper.get_choices(run, "peptide_df") - input_df_instance_id = self.data.get( - "input_df", self.fields["input_df"].choices[0][0] - ) self.fields["protein_group"].choices = fill_helper.to_choices( - run.steps.get_step_output( - step_type=Step, - output_key="protein_df", - instance_identifier=input_df_instance_id, - )["Protein ID"].unique() - ) - - similarity_measure = self.data.get( - "similarity_measure", self.fields["similarity_measure"].choices[0][0] + run.steps.get_step_output(Step, "peptide_df")[ + "Protein ID" + ].unique() ) - self.data = self.data.copy() - if similarity_measure == SimilarityMeasure.cosine_similarity: - self.fields["similarity"] = CustomFloatField( - label="Cosine Similarity", - min_value=-1, - max_value=1, - step_size=0.1, - initial=0, - ) - if ( - "similarity" not in self.data - or float(self.data["similarity"]) < -1 - or float(self.data["similarity"]) > 1 - ): - self.data["similarity"] = 0 - else: - self.fields["similarity"] = CustomNumberField( - label="Euclidean Distance", - min_value=0, - max_value=999, - step_size=1, - initial=1, - ) - if ( - "similarity" not in self.data - or float(self.data["similarity"]) < 0 - or float(self.data["similarity"]) > 999 - ): - self.data["similarity"] = 1 - class PlotPrecisionRecallCurveForm(MethodForm): # Todo: Input diff --git a/ui/runs/forms/data_preprocessing.py b/ui/runs/forms/data_preprocessing.py index 08590b79..40ca2f78 100644 --- a/ui/runs/forms/data_preprocessing.py +++ b/ui/runs/forms/data_preprocessing.py @@ -469,7 +469,6 @@ class FilterPeptidesByPEPThresholdForm(MethodForm): threshold = CustomFloatField( label="Threshold value for PEP", min_value=0, initial=0 ) - peptide_df = CustomChoiceField(choices=EmptyEnum, label="peptide_df") class FilterPeptidesByPEPThresholdPlotForm(MethodForm): diff --git a/user_data/workflows/workflow_Plot-Thesis.yaml b/user_data/workflows/workflow_Plot-Thesis.yaml index 0bdccd0f..474c0839 100644 --- a/user_data/workflows/workflow_Plot-Thesis.yaml +++ b/user_data/workflows/workflow_Plot-Thesis.yaml @@ -1,94 +1,16 @@ df_mode: disk_memory steps: -- form_inputs: - feature_orientation: Columns (samples in rows, features in columns) - inputs: {} - instance_identifier: MetadataImport_1 - type: MetadataImport - form_inputs: {} inputs: {} instance_identifier: EvidenceImport_1 type: EvidenceImport - form_inputs: - percentage: 0.5 - inputs: {} - instance_identifier: FilterProteinsBySamplesMissing_1 - plot_inputs: - graph_type: Bar chart - type: FilterProteinsBySamplesMissing -- form_inputs: - deviation_threshold: 2.0 - inputs: {} - instance_identifier: FilterSamplesByProteinIntensitiesSum_1 - plot_inputs: - graph_type: Bar chart - type: FilterSamplesByProteinIntensitiesSum -- form_inputs: - number_of_neighbours: 5 - inputs: {} - instance_identifier: ImputationByKNN_1 - plot_inputs: - graph_type: Boxplot - graph_type_quantities: Bar chart - group_by: None - visual_transformation: log10 - type: ImputationByKNN -- form_inputs: - number_of_neighbors: 20 - inputs: {} - instance_identifier: OutlierDetectionByLocalOutlierFactor_1 - plot_inputs: {} - type: OutlierDetectionByLocalOutlierFactor -- form_inputs: - percentile: 0.5 - inputs: {} - instance_identifier: NormalisationByMedian_1 - plot_inputs: - graph_type: Boxplot - group_by: None - visual_transformation: log10 - type: NormalisationByMedian -- form_inputs: - log_base: log2 + feature_orientation: Columns (samples in rows, features in columns) inputs: {} - instance_identifier: TransformationLog_1 - plot_inputs: - graph_type: Histogram - group_by: None - type: TransformationLog + instance_identifier: MetadataImport_1 + type: MetadataImport - form_inputs: similarity_measure: euclidean distance inputs: {} instance_identifier: PlotProtQuant_1 - type: PlotProtQuant -- form_inputs: - alpha: 0.05 - inputs: {} - instance_identifier: DifferentialExpressionTTest_1 - type: DifferentialExpressionTTest -- form_inputs: - fc_threshold: 1 - inputs: {} - instance_identifier: PlotVolcano_1 - type: PlotVolcano -- form_inputs: - differential_expression_threshold: 1 - direction: both - gene_sets_restring: [] - organism: 9606 - inputs: {} - instance_identifier: EnrichmentAnalysisGOAnalysisWithString_1 - type: EnrichmentAnalysisGOAnalysisWithString -- form_inputs: - colors: [] - cutoff: 0.05 - gene_sets: - - Process - - Component - - Function - - KEGG - top_terms: 10 - value: p-value - inputs: {} - instance_identifier: PlotGOEnrichmentBarPlot_1 - type: PlotGOEnrichmentBarPlot + type: PlotProtQuantPeptide \ No newline at end of file From 786da450da5fe037475fb632de0c926f5cb787d2 Mon Sep 17 00:00:00 2001 From: AK Date: Wed, 10 Jul 2024 08:22:48 +0200 Subject: [PATCH 03/52] Plotquantplot for peptide --- protzilla/methods/data_analysis.py | 1 + ui/runs/forms/data_analysis.py | 48 ++++++++++- ui/runs/forms/fill_helper.py | 4 + user_data/workflows/workflow_Plot-Thesis.yaml | 79 +++++++++++++++---- 4 files changed, 114 insertions(+), 18 deletions(-) diff --git a/protzilla/methods/data_analysis.py b/protzilla/methods/data_analysis.py index 8407f4c0..5fffaa67 100644 --- a/protzilla/methods/data_analysis.py +++ b/protzilla/methods/data_analysis.py @@ -339,6 +339,7 @@ class PlotProtQuantPeptide(PlotStep): def method(self, inputs: dict) -> dict: return prot_quant_plot_peptide(**inputs) + def insert_dataframes(self, steps: StepManager, inputs) -> dict: inputs["input_df"] = steps.get_step_output( Step, "peptide_df", inputs["input_df"] diff --git a/ui/runs/forms/data_analysis.py b/ui/runs/forms/data_analysis.py index 5a824a08..30845d74 100644 --- a/ui/runs/forms/data_analysis.py +++ b/ui/runs/forms/data_analysis.py @@ -535,15 +535,55 @@ class PlotProtQuantPeptideForm(MethodForm): ) def fill_form(self, run: Run) -> None: - self.fields["input_df"].choices = fill_helper.get_choices(run, "peptide_df") + self.fields["input_df"].choices = fill_helper.get_choices_for_peptide_df_steps( + run + ) + input_df_instance_id = self.data.get( + "input_df", self.fields["input_df"].choices[0][0] + ) self.fields["protein_group"].choices = fill_helper.to_choices( - run.steps.get_step_output(Step, "peptide_df")[ - "Protein ID" - ].unique() + run.steps.get_step_output( + step_type=Step, + output_key="peptide_df", + instance_identifier=input_df_instance_id, + )["Protein ID"].unique() ) + similarity_measure = self.data.get( + "similarity_measure", self.fields["similarity_measure"].choices[0][0] + ) + self.data = self.data.copy() + if similarity_measure == SimilarityMeasure.cosine_similarity: + self.fields["similarity"] = CustomFloatField( + label="Cosine Similarity", + min_value=-1, + max_value=1, + step_size=0.1, + initial=0, + ) + if ( + "similarity" not in self.data + or float(self.data["similarity"]) < -1 + or float(self.data["similarity"]) > 1 + ): + self.data["similarity"] = 0 + else: + self.fields["similarity"] = CustomNumberField( + label="Euclidean Distance", + min_value=0, + max_value=999, + step_size=1, + initial=1, + ) + if ( + "similarity" not in self.data + or float(self.data["similarity"]) < 0 + or float(self.data["similarity"]) > 999 + ): + self.data["similarity"] = 1 + class PlotPrecisionRecallCurveForm(MethodForm): # Todo: Input diff --git a/ui/runs/forms/fill_helper.py b/ui/runs/forms/fill_helper.py index 0b416f8f..1d0ea050 100644 --- a/ui/runs/forms/fill_helper.py +++ b/ui/runs/forms/fill_helper.py @@ -14,6 +14,10 @@ def get_choices_for_protein_df_steps(run: Run) -> list[tuple[str, str]]: return reversed(to_choices(run.steps.get_instance_identifiers(Step, "protein_df"))) +def get_choices_for_peptide_df_steps(run: Run) -> list[tuple[str, str]]: + return reversed(to_choices(run.steps.get_instance_identifiers(Step, "peptide_df"))) + + def get_choices( run: Run, output_key: str, step_type: type[Step] = Step ) -> list[tuple[str, str]]: diff --git a/user_data/workflows/workflow_Plot-Thesis.yaml b/user_data/workflows/workflow_Plot-Thesis.yaml index 474c0839..1758d861 100644 --- a/user_data/workflows/workflow_Plot-Thesis.yaml +++ b/user_data/workflows/workflow_Plot-Thesis.yaml @@ -1,16 +1,67 @@ df_mode: disk_memory steps: -- form_inputs: {} - inputs: {} - instance_identifier: EvidenceImport_1 - type: EvidenceImport -- form_inputs: - feature_orientation: Columns (samples in rows, features in columns) - inputs: {} - instance_identifier: MetadataImport_1 - type: MetadataImport -- form_inputs: - similarity_measure: euclidean distance - inputs: {} - instance_identifier: PlotProtQuant_1 - type: PlotProtQuantPeptide \ No newline at end of file + - form_inputs: + intensity_name: iBAQ + map_to_uniprot: false + aggregation_mode: Sum + inputs: { } + type: MaxQuantImport + - form_inputs: {} + inputs: {} + instance_identifier: EvidenceImport_1 + type: EvidenceImport + - form_inputs: + feature_orientation: Columns (samples in rows, features in columns) + inputs: {} + instance_identifier: MetadataImport_1 + type: MetadataImport + - form_inputs: + similarity_measure: euclidean distance + inputs: {} + instance_identifier: PlotProtQuant_1 + type: PlotProtQuantPeptide + - form_inputs: + percentage: 0.5 + inputs: { } + plot_inputs: + graph_type: Bar chart + type: FilterProteinsBySamplesMissing + - form_inputs: + deviation_threshold: 2.0 + inputs: { } + plot_inputs: + graph_type: Bar chart + type: FilterSamplesByProteinIntensitiesSum + - form_inputs: + number_of_neighbours: 5 + inputs: { } + plot_inputs: + graph_type: Boxplot + graph_type_quantities: Bar chart + group_by: None + visual_transformation: log10 + type: ImputationByKNN + - form_inputs: + number_of_neighbors: 20 + inputs: { } + plot_inputs: { } + type: OutlierDetectionByLocalOutlierFactor + - form_inputs: + percentile: 0.5 + inputs: { } + plot_inputs: + graph_type: Boxplot + group_by: None + visual_transformation: log10 + type: NormalisationByMedian + - form_inputs: + log_base: log2 + inputs: { } + plot_inputs: + graph_type: Histogram + group_by: None + type: TransformationLog + - form_inputs: + similarity_measure: euclidean distance + inputs: { } + type: PlotProtQuantPeptide \ No newline at end of file From dc9fc74e317abea886b33d8e4191bbd5156ba079 Mon Sep 17 00:00:00 2001 From: AK Date: Wed, 10 Jul 2024 11:32:13 +0200 Subject: [PATCH 04/52] updated transform_dfs.py so that it supports peptide DFs --- protzilla/utilities/transform_dfs.py | 34 ++++++++++++++++++++-------- 1 file changed, 25 insertions(+), 9 deletions(-) diff --git a/protzilla/utilities/transform_dfs.py b/protzilla/utilities/transform_dfs.py index f3605b08..aa2bc6c0 100644 --- a/protzilla/utilities/transform_dfs.py +++ b/protzilla/utilities/transform_dfs.py @@ -17,10 +17,17 @@ def long_to_wide(intensity_df: pd.DataFrame, value_name: str = None): packages such as sklearn :rtype: pd.DataFrame """ + + if intensity_df.duplicated(subset=["Sample", "Protein ID"]).any(): + intensity_df = intensity_df.groupby(["Sample", "Protein ID"]).mean().reset_index() + intensity_df = intensity_df.dropna() + values_name = default_intensity_column(intensity_df) if value_name is None else value_name - return pd.pivot( + intensity_df = pd.pivot( intensity_df, index="Sample", columns="Protein ID", values=values_name ) + intensity_df = intensity_df.fillna(intensity_df.mean()) + return intensity_df def wide_to_long(wide_df: pd.DataFrame, original_long_df: pd.DataFrame): @@ -40,26 +47,35 @@ def wide_to_long(wide_df: pd.DataFrame, original_long_df: pd.DataFrame): """ # Read out info from original dataframe intensity_name = default_intensity_column(original_long_df) - gene_info = original_long_df["Gene"] - # Turn the wide format into the long format - intensity_df = pd.melt( - wide_df.reset_index(), + + # Identify the additional columns from the original long dataframe + additional_columns = ['Modification', 'Retention Time'] + existing_additional_columns = [col for col in additional_columns if col in original_long_df.columns] + + # Melt the wide format back to long format + melted_df = pd.melt( + wide_df, id_vars="Sample", var_name="Protein ID", value_name=intensity_name, ) - intensity_df.sort_values( + melted_df.sort_values( by=["Sample", "Protein ID"], ignore_index=True, inplace=True, ) - intensity_df.insert(2, "Gene", gene_info) - return intensity_df + # Add back the additional columns if they exist in the original dataframe + for col in existing_additional_columns: + melted_df[col] = original_long_df[col] + + return melted_df def is_long_format(df: pd.DataFrame): - return set(df.columns[:3]) == {"Sample", "Protein ID", "Gene"} + required_columns = {"Sample", "Protein ID"} + additional_columns = {"Gene", "Retention time"} + return required_columns.issubset(df.columns) and any(col in df.columns for col in additional_columns) def is_intensity_df(df: pd.DataFrame): From 286023daf8b5fe1fe25dcf4334f7d37f539f7b5d Mon Sep 17 00:00:00 2001 From: AK Date: Wed, 10 Jul 2024 16:55:25 +0200 Subject: [PATCH 05/52] updated transform_dfs.py so that it supports peptide DFs --- .../data_analysis/prot_quant_plot_peptide.py | 194 ++++++++++++++++++ 1 file changed, 194 insertions(+) create mode 100644 protzilla/data_analysis/prot_quant_plot_peptide.py diff --git a/protzilla/data_analysis/prot_quant_plot_peptide.py b/protzilla/data_analysis/prot_quant_plot_peptide.py new file mode 100644 index 00000000..eeefbcfc --- /dev/null +++ b/protzilla/data_analysis/prot_quant_plot_peptide.py @@ -0,0 +1,194 @@ +import pandas as pd +import plotly.graph_objects as go +from scipy import stats +from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances + +from protzilla.utilities.transform_dfs import is_long_format, long_to_wide_retention_time + +# Define color constants +PROTZILLA_DISCRETE_COLOR_SEQUENCE = ["#636EFA", "#EF553B", "#00CC96", "#AB63FA", "#19D3F3", "#E763FA", "#FECB52", "#FFA15A", "#FF6692", "#B6E880"] +colors = { + "plot_bgcolor": "white", + "gridcolor": "#F1F1F1", + "linecolor": "#F1F1F1", + "annotation_text_color": "#ffffff", + "annotation_proteins_of_interest": "#4A536A", +} + +def prot_quant_plot_peptide( + input_df: pd.DataFrame, + protein_group: str, + similarity: float = 1.0, + similarity_measure: str = "euclidean distance", +) -> dict: + """ + A function to create a graph visualising protein quantifications across all samples + as a line diagram using retention time and intensity. It's possible to select one proteingroup + that will be displayed in orange and choose a similarity measurement with a similarity score + to get all proteingroups that are similar displayed in another color in this line diagram. + All other proteingroups are displayed in the background as a grey polygon. + + :param input_df: A dataframe in protzilla wide format, where each row + represents a sample and each column represents a feature. + :param protein_group: Protein IDs as the columnheader of the dataframe + :param similarity_measure: method to compare the chosen proteingroup with all others. The two + methods are "cosine similarity" and "euclidean distance". + :param similarity: similarity score of the chosen similarity measurement method. + + :return: returns a dictionary containing a list with a plotly figure and/or a list of messages + """ + # Ensure the dataframe includes retention time + if 'Retention time' not in input_df.columns: + raise ValueError("The input dataframe must include a 'Retention time' column.") + + wide_df = input_df.interpolate(method='linear', axis=0) + wide_df = long_to_wide_retention_time(wide_df) if is_long_format(wide_df) else wide_df + + if protein_group not in wide_df.columns: + raise ValueError("Please select a valid protein group.") + elif similarity_measure == "euclidean distance" and similarity < 0: + raise ValueError( + "Similarity for euclidean distance should be greater than or equal to 0." + ) + elif similarity_measure == "cosine similarity" and ( + similarity < -1 or similarity > 1 + ): + raise ValueError("Similarity for cosine similarity should be between -1 and 1") + + fig = go.Figure() + + color_mapping = { + "A": PROTZILLA_DISCRETE_COLOR_SEQUENCE[0], + "C": PROTZILLA_DISCRETE_COLOR_SEQUENCE[1], + } + + lower_upper_x = [] + lower_upper_y = [] + + lower_upper_x.append(wide_df['Retention time'].iloc[0]) + lower_upper_y.append(wide_df.iloc[0].min()) + + for index, row in wide_df.iterrows(): + lower_upper_x.append(row['Retention time']) + lower_upper_y.append(row.max()) + + for index, row in reversed(list(wide_df.iterrows())): + lower_upper_x.append(row['Retention time']) + lower_upper_y.append(row.min()) + + fig.add_trace( + go.Scatter( + x=lower_upper_x, + y=lower_upper_y, + fill="toself", + name="Intensity Range", + line=dict(color="silver"), + ) + ) + + similar_groups = [] + for group_to_compare in wide_df.columns: + if group_to_compare not in ['Retention time', protein_group]: + if similarity_measure == "euclidean distance": + distance = euclidean_distances( + stats.zscore(wide_df[protein_group]).values.reshape(1, -1), + stats.zscore(wide_df[group_to_compare]).values.reshape(1, -1), + )[0][0] + else: + distance = cosine_similarity( + stats.zscore(wide_df[protein_group]).values.reshape(1, -1), + stats.zscore(wide_df[group_to_compare]).values.reshape(1, -1), + )[0][0] + if similarity_measure == "euclidean distance": + if distance <= similarity: + similar_groups.append(group_to_compare) + else: + if distance >= similarity: + similar_groups.append(group_to_compare) + + for group in similar_groups: + fig.add_trace( + go.Scatter( + x=wide_df['Retention time'], + y=wide_df[group], + mode="lines", + name=group[:15] + "..." if len(group) > 15 else group, + line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[1]), + showlegend=len(similar_groups) <= 7, + ) + ) + + if len(similar_groups) > 7: + fig.add_trace( + go.Scatter( + x=[None], + y=[None], + mode="lines", + line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[1]), + name="Similar Protein Groups", + ) + ) + + formatted_protein_name = ( + protein_group[:15] + "..." if len(protein_group) > 15 else protein_group + ) + fig.add_trace( + go.Scatter( + x=wide_df['Retention time'], + y=wide_df[protein_group], + mode="lines", + name=formatted_protein_name, + line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[2]), + ) + ) + + fig.add_trace( + go.Scatter( + x=[None], + y=[None], + mode="markers", + marker=dict(color=color_mapping.get("A")), + name="Experimental Group", + ) + ) + + fig.add_trace( + go.Scatter( + x=[None], + y=[None], + mode="markers", + marker=dict(color=color_mapping.get("C")), + name="Control Group", + ) + ) + + fig.update_layout( + title=f"Intensity of {formatted_protein_name} across retention time", + plot_bgcolor=colors["plot_bgcolor"], + xaxis_gridcolor=colors["gridcolor"], + yaxis_gridcolor=colors["gridcolor"], + xaxis_linecolor=colors["linecolor"], + yaxis_linecolor=colors["linecolor"], + xaxis_title="Retention Time", + yaxis_title="Intensity", + legend_title="Legend", + xaxis=dict( + tickmode="array", + tickangle=0, + tickvals=sorted(wide_df['Retention time']), + ticktext=[ + f"" + for label in wide_df['Retention time'] + ], + ), + autosize=True, + margin=dict(l=100, r=300, t=100, b=100), + legend=dict( + x=1.05, + y=1, + bgcolor="rgba(255, 255, 255, 0.5)", + orientation="v", + ), + ) + + return dict(plots=[fig]) From 4b940f0f7631fa69ddc69d1ea976b1cb43078f10 Mon Sep 17 00:00:00 2001 From: AK Date: Wed, 10 Jul 2024 17:55:46 +0200 Subject: [PATCH 06/52] Implemeted Protquantplot with retention time instead of Intensities --- .../data_analysis/prot_quant_plot_peptide.py | 33 +++++++++---------- protzilla/utilities/transform_dfs.py | 25 ++++++++++++++ 2 files changed, 40 insertions(+), 18 deletions(-) diff --git a/protzilla/data_analysis/prot_quant_plot_peptide.py b/protzilla/data_analysis/prot_quant_plot_peptide.py index eeefbcfc..7df80807 100644 --- a/protzilla/data_analysis/prot_quant_plot_peptide.py +++ b/protzilla/data_analysis/prot_quant_plot_peptide.py @@ -23,7 +23,7 @@ def prot_quant_plot_peptide( ) -> dict: """ A function to create a graph visualising protein quantifications across all samples - as a line diagram using retention time and intensity. It's possible to select one proteingroup + as a line diagram using retention time. It's possible to select one proteingroup that will be displayed in orange and choose a similarity measurement with a similarity score to get all proteingroups that are similar displayed in another color in this line diagram. All other proteingroups are displayed in the background as a grey polygon. @@ -37,13 +37,10 @@ def prot_quant_plot_peptide( :return: returns a dictionary containing a list with a plotly figure and/or a list of messages """ - # Ensure the dataframe includes retention time - if 'Retention time' not in input_df.columns: - raise ValueError("The input dataframe must include a 'Retention time' column.") - wide_df = input_df.interpolate(method='linear', axis=0) wide_df = long_to_wide_retention_time(wide_df) if is_long_format(wide_df) else wide_df + if protein_group not in wide_df.columns: raise ValueError("Please select a valid protein group.") elif similarity_measure == "euclidean distance" and similarity < 0: @@ -65,15 +62,15 @@ def prot_quant_plot_peptide( lower_upper_x = [] lower_upper_y = [] - lower_upper_x.append(wide_df['Retention time'].iloc[0]) + lower_upper_x.append(wide_df.index[0]) lower_upper_y.append(wide_df.iloc[0].min()) for index, row in wide_df.iterrows(): - lower_upper_x.append(row['Retention time']) + lower_upper_x.append(index) lower_upper_y.append(row.max()) for index, row in reversed(list(wide_df.iterrows())): - lower_upper_x.append(row['Retention time']) + lower_upper_x.append(index) lower_upper_y.append(row.min()) fig.add_trace( @@ -81,14 +78,14 @@ def prot_quant_plot_peptide( x=lower_upper_x, y=lower_upper_y, fill="toself", - name="Intensity Range", + name="Retention time of all protein groups", line=dict(color="silver"), ) ) similar_groups = [] for group_to_compare in wide_df.columns: - if group_to_compare not in ['Retention time', protein_group]: + if group_to_compare != protein_group: if similarity_measure == "euclidean distance": distance = euclidean_distances( stats.zscore(wide_df[protein_group]).values.reshape(1, -1), @@ -109,7 +106,7 @@ def prot_quant_plot_peptide( for group in similar_groups: fig.add_trace( go.Scatter( - x=wide_df['Retention time'], + x=wide_df.index, y=wide_df[group], mode="lines", name=group[:15] + "..." if len(group) > 15 else group, @@ -134,7 +131,7 @@ def prot_quant_plot_peptide( ) fig.add_trace( go.Scatter( - x=wide_df['Retention time'], + x=wide_df.index, y=wide_df[protein_group], mode="lines", name=formatted_protein_name, @@ -163,22 +160,22 @@ def prot_quant_plot_peptide( ) fig.update_layout( - title=f"Intensity of {formatted_protein_name} across retention time", + title=f"Retention time of {formatted_protein_name} in all samples", plot_bgcolor=colors["plot_bgcolor"], xaxis_gridcolor=colors["gridcolor"], yaxis_gridcolor=colors["gridcolor"], xaxis_linecolor=colors["linecolor"], yaxis_linecolor=colors["linecolor"], - xaxis_title="Retention Time", - yaxis_title="Intensity", + xaxis_title="Sample", + yaxis_title="Retention time", legend_title="Legend", xaxis=dict( tickmode="array", tickangle=0, - tickvals=sorted(wide_df['Retention time']), + tickvals=wide_df.index, ticktext=[ f"" - for label in wide_df['Retention time'] + for label in wide_df.index ], ), autosize=True, @@ -191,4 +188,4 @@ def prot_quant_plot_peptide( ), ) - return dict(plots=[fig]) + return dict(plots=[fig]) \ No newline at end of file diff --git a/protzilla/utilities/transform_dfs.py b/protzilla/utilities/transform_dfs.py index aa2bc6c0..0b1e7976 100644 --- a/protzilla/utilities/transform_dfs.py +++ b/protzilla/utilities/transform_dfs.py @@ -29,6 +29,31 @@ def long_to_wide(intensity_df: pd.DataFrame, value_name: str = None): intensity_df = intensity_df.fillna(intensity_df.mean()) return intensity_df +def long_to_wide_retention_time(intensity_df: pd.DataFrame, value_name: str = None): + """ + This function transforms the dataframe to a wide format that + can be more easily handled by packages such as sklearn. + Each sample gets one row with all observations as columns. + + :param intensity_df: the dataframe that should be transformed into + long format + :type intensity_df: pd.DataFrame + + :return: returns dataframe in wide format suitable for use by + packages such as sklearn + :rtype: pd.DataFrame + """ + + if intensity_df.duplicated(subset=["Sample", "Protein ID"]).any(): + intensity_df = intensity_df.groupby(["Sample", "Protein ID"]).mean().reset_index() + intensity_df = intensity_df.dropna() + + values_name = 'Retention time' + intensity_df = pd.pivot( + intensity_df, index="Sample", columns="Protein ID", values=values_name + ) + intensity_df = intensity_df.fillna(intensity_df.mean()) + return intensity_df def wide_to_long(wide_df: pd.DataFrame, original_long_df: pd.DataFrame): """ From 3d2bb3032cc293b9ac48f41da75fcaa3d72f78dd Mon Sep 17 00:00:00 2001 From: AK Date: Thu, 11 Jul 2024 15:29:09 +0200 Subject: [PATCH 07/52] Implemeted Timequantplot --- ..._peptide.py => time_quant_plot_peptide.py} | 43 ++++++++--------- protzilla/methods/data_analysis.py | 13 ++--- protzilla/utilities/transform_dfs.py | 47 +++++++------------ ui/runs/form_mapping.py | 2 +- ui/runs/forms/data_analysis.py | 2 +- 5 files changed, 44 insertions(+), 63 deletions(-) rename protzilla/data_analysis/{prot_quant_plot_peptide.py => time_quant_plot_peptide.py} (88%) diff --git a/protzilla/data_analysis/prot_quant_plot_peptide.py b/protzilla/data_analysis/time_quant_plot_peptide.py similarity index 88% rename from protzilla/data_analysis/prot_quant_plot_peptide.py rename to protzilla/data_analysis/time_quant_plot_peptide.py index 7df80807..f5921ae4 100644 --- a/protzilla/data_analysis/prot_quant_plot_peptide.py +++ b/protzilla/data_analysis/time_quant_plot_peptide.py @@ -3,7 +3,7 @@ from scipy import stats from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances -from protzilla.utilities.transform_dfs import is_long_format, long_to_wide_retention_time +from protzilla.utilities.transform_dfs import is_long_format, long_to_wide_time # Define color constants PROTZILLA_DISCRETE_COLOR_SEQUENCE = ["#636EFA", "#EF553B", "#00CC96", "#AB63FA", "#19D3F3", "#E763FA", "#FECB52", "#FFA15A", "#FF6692", "#B6E880"] @@ -15,8 +15,9 @@ "annotation_proteins_of_interest": "#4A536A", } -def prot_quant_plot_peptide( +def time_quant_plot_peptide( input_df: pd.DataFrame, + metadata_df: pd.DataFrame, protein_group: str, similarity: float = 1.0, similarity_measure: str = "euclidean distance", @@ -30,6 +31,7 @@ def prot_quant_plot_peptide( :param input_df: A dataframe in protzilla wide format, where each row represents a sample and each column represents a feature. + :param metadata_df: A dataframe containing the metadata of the samples. :param protein_group: Protein IDs as the columnheader of the dataframe :param similarity_measure: method to compare the chosen proteingroup with all others. The two methods are "cosine similarity" and "euclidean distance". @@ -37,8 +39,16 @@ def prot_quant_plot_peptide( :return: returns a dictionary containing a list with a plotly figure and/or a list of messages """ + + input_df = pd.merge( + left=input_df, + right=metadata_df[["Sample", "Time"]], + on="Sample", + copy=False, + ) + wide_df = input_df.interpolate(method='linear', axis=0) - wide_df = long_to_wide_retention_time(wide_df) if is_long_format(wide_df) else wide_df + wide_df = long_to_wide_time(wide_df) if is_long_format(wide_df) else wide_df if protein_group not in wide_df.columns: @@ -78,7 +88,7 @@ def prot_quant_plot_peptide( x=lower_upper_x, y=lower_upper_y, fill="toself", - name="Retention time of all protein groups", + name="Intensity Range", line=dict(color="silver"), ) ) @@ -138,45 +148,30 @@ def prot_quant_plot_peptide( line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[2]), ) ) - fig.add_trace( go.Scatter( x=[None], y=[None], mode="markers", marker=dict(color=color_mapping.get("A")), - name="Experimental Group", + name="Intensity", ) ) - - fig.add_trace( - go.Scatter( - x=[None], - y=[None], - mode="markers", - marker=dict(color=color_mapping.get("C")), - name="Control Group", - ) - ) - fig.update_layout( - title=f"Retention time of {formatted_protein_name} in all samples", + title=f"Time Series of {formatted_protein_name} in all samples", plot_bgcolor=colors["plot_bgcolor"], xaxis_gridcolor=colors["gridcolor"], yaxis_gridcolor=colors["gridcolor"], xaxis_linecolor=colors["linecolor"], yaxis_linecolor=colors["linecolor"], - xaxis_title="Sample", - yaxis_title="Retention time", + xaxis_title="Time", + yaxis_title="Intensity", legend_title="Legend", xaxis=dict( tickmode="array", tickangle=0, tickvals=wide_df.index, - ticktext=[ - f"" - for label in wide_df.index - ], + ticktext=[wide_df["Time"].unique() for wide_df["Time"] in wide_df.index], ), autosize=True, margin=dict(l=100, r=300, t=100, b=100), diff --git a/protzilla/methods/data_analysis.py b/protzilla/methods/data_analysis.py index 5fffaa67..bcd705a5 100644 --- a/protzilla/methods/data_analysis.py +++ b/protzilla/methods/data_analysis.py @@ -21,7 +21,7 @@ prot_quant_plot, scatter_plot, ) -from protzilla.data_analysis.prot_quant_plot_peptide import prot_quant_plot_peptide +from protzilla.data_analysis.time_quant_plot_peptide import time_quant_plot_peptide from protzilla.data_analysis.protein_graphs import peptides_to_isoform, variation_graph from protzilla.methods.data_preprocessing import TransformationLog from protzilla.steps import Plots, Step, StepManager @@ -326,24 +326,25 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict: ) return inputs -class PlotProtQuantPeptide(PlotStep): - display_name = "Protein Quantification Plot For Peptide" +class PlotTimeQuantPeptide(PlotStep): + display_name = "Time Quantification Plot For Peptide" operation = "plot" method_description = ( - "Creates a line chart for intensity across samples for protein groups" + "Creates a line chart for intensity across Time for protein groups" ) - input_keys = ["input_df", "protein_group", "similarity_measure", "similarity"] + input_keys = ["input_df", "metadata_df", "protein_group", "similarity_measure", "similarity"] output_keys = [] def method(self, inputs: dict) -> dict: - return prot_quant_plot_peptide(**inputs) + return time_quant_plot_peptide(**inputs) def insert_dataframes(self, steps: StepManager, inputs) -> dict: inputs["input_df"] = steps.get_step_output( Step, "peptide_df", inputs["input_df"] ) + inputs["metadata_df"] = steps.metadata_df return inputs diff --git a/protzilla/utilities/transform_dfs.py b/protzilla/utilities/transform_dfs.py index 0b1e7976..5e5c5e99 100644 --- a/protzilla/utilities/transform_dfs.py +++ b/protzilla/utilities/transform_dfs.py @@ -3,6 +3,7 @@ from protzilla.utilities import default_intensity_column + def long_to_wide(intensity_df: pd.DataFrame, value_name: str = None): """ This function transforms the dataframe to a wide format that @@ -17,19 +18,13 @@ def long_to_wide(intensity_df: pd.DataFrame, value_name: str = None): packages such as sklearn :rtype: pd.DataFrame """ - - if intensity_df.duplicated(subset=["Sample", "Protein ID"]).any(): - intensity_df = intensity_df.groupby(["Sample", "Protein ID"]).mean().reset_index() - intensity_df = intensity_df.dropna() - values_name = default_intensity_column(intensity_df) if value_name is None else value_name - intensity_df = pd.pivot( + return pd.pivot( intensity_df, index="Sample", columns="Protein ID", values=values_name ) - intensity_df = intensity_df.fillna(intensity_df.mean()) - return intensity_df -def long_to_wide_retention_time(intensity_df: pd.DataFrame, value_name: str = None): + +def long_to_wide_time(intensity_df: pd.DataFrame, value_name: str = None): """ This function transforms the dataframe to a wide format that can be more easily handled by packages such as sklearn. @@ -43,14 +38,11 @@ def long_to_wide_retention_time(intensity_df: pd.DataFrame, value_name: str = No packages such as sklearn :rtype: pd.DataFrame """ - - if intensity_df.duplicated(subset=["Sample", "Protein ID"]).any(): - intensity_df = intensity_df.groupby(["Sample", "Protein ID"]).mean().reset_index() - intensity_df = intensity_df.dropna() - - values_name = 'Retention time' + if intensity_df.duplicated(subset=["Time", "Protein ID"]).any(): + intensity_df = intensity_df.groupby(["Time", "Protein ID"]).mean().reset_index() + values_name = default_intensity_column(intensity_df) if value_name is None else value_name intensity_df = pd.pivot( - intensity_df, index="Sample", columns="Protein ID", values=values_name + intensity_df, index="Time", columns="Protein ID", values=values_name ) intensity_df = intensity_df.fillna(intensity_df.mean()) return intensity_df @@ -72,34 +64,27 @@ def wide_to_long(wide_df: pd.DataFrame, original_long_df: pd.DataFrame): """ # Read out info from original dataframe intensity_name = default_intensity_column(original_long_df) - - # Identify the additional columns from the original long dataframe - additional_columns = ['Modification', 'Retention Time'] - existing_additional_columns = [col for col in additional_columns if col in original_long_df.columns] - - # Melt the wide format back to long format - melted_df = pd.melt( - wide_df, + gene_info = original_long_df["Gene"] + # Turn the wide format into the long format + intensity_df = pd.melt( + wide_df.reset_index(), id_vars="Sample", var_name="Protein ID", value_name=intensity_name, ) - melted_df.sort_values( + intensity_df.sort_values( by=["Sample", "Protein ID"], ignore_index=True, inplace=True, ) + intensity_df.insert(2, "Gene", gene_info) - # Add back the additional columns if they exist in the original dataframe - for col in existing_additional_columns: - melted_df[col] = original_long_df[col] - - return melted_df + return intensity_df def is_long_format(df: pd.DataFrame): required_columns = {"Sample", "Protein ID"} - additional_columns = {"Gene", "Retention time"} + additional_columns = {"Gene", "Time"} return required_columns.issubset(df.columns) and any(col in df.columns for col in additional_columns) diff --git a/ui/runs/form_mapping.py b/ui/runs/form_mapping.py index f195f3b6..adac90a1 100644 --- a/ui/runs/form_mapping.py +++ b/ui/runs/form_mapping.py @@ -49,7 +49,7 @@ data_analysis.PlotScatterPlot: data_analysis_forms.PlotScatterPlotForm, data_analysis.PlotClustergram: data_analysis_forms.PlotClustergramForm, data_analysis.PlotProtQuant: data_analysis_forms.PlotProtQuantForm, - data_analysis.PlotProtQuantPeptide: data_analysis_forms.PlotProtQuantPeptideForm, + data_analysis.PlotTimeQuantPeptide: data_analysis_forms.PlotTimeQuantPeptideForm, data_analysis.PlotPrecisionRecallCurve: data_analysis_forms.PlotPrecisionRecallCurveForm, data_analysis.PlotROC: data_analysis_forms.PlotROCCurveForm, data_analysis.ClusteringKMeans: data_analysis_forms.ClusteringKMeansForm, diff --git a/ui/runs/forms/data_analysis.py b/ui/runs/forms/data_analysis.py index 30845d74..de182b5b 100644 --- a/ui/runs/forms/data_analysis.py +++ b/ui/runs/forms/data_analysis.py @@ -514,7 +514,7 @@ def fill_form(self, run: Run) -> None: self.data["similarity"] = 1 -class PlotProtQuantPeptideForm(MethodForm): +class PlotTimeQuantPeptideForm(MethodForm): is_dynamic = True input_df = CustomChoiceField( From 468ac23d78986abfd9c943b89fffe80cfe93293d Mon Sep 17 00:00:00 2001 From: AK Date: Thu, 11 Jul 2024 15:33:02 +0200 Subject: [PATCH 08/52] Renamed the plot to time series plot --- ...time_quant_plot_peptide.py => time_series_plot_peptide.py} | 2 +- protzilla/methods/data_analysis.py | 4 ++-- ui/runs/form_mapping.py | 2 +- ui/runs/forms/data_analysis.py | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) rename protzilla/data_analysis/{time_quant_plot_peptide.py => time_series_plot_peptide.py} (99%) diff --git a/protzilla/data_analysis/time_quant_plot_peptide.py b/protzilla/data_analysis/time_series_plot_peptide.py similarity index 99% rename from protzilla/data_analysis/time_quant_plot_peptide.py rename to protzilla/data_analysis/time_series_plot_peptide.py index f5921ae4..5f5ac64e 100644 --- a/protzilla/data_analysis/time_quant_plot_peptide.py +++ b/protzilla/data_analysis/time_series_plot_peptide.py @@ -15,7 +15,7 @@ "annotation_proteins_of_interest": "#4A536A", } -def time_quant_plot_peptide( +def time_series_plot_peptide( input_df: pd.DataFrame, metadata_df: pd.DataFrame, protein_group: str, diff --git a/protzilla/methods/data_analysis.py b/protzilla/methods/data_analysis.py index bcd705a5..42a77182 100644 --- a/protzilla/methods/data_analysis.py +++ b/protzilla/methods/data_analysis.py @@ -21,7 +21,7 @@ prot_quant_plot, scatter_plot, ) -from protzilla.data_analysis.time_quant_plot_peptide import time_quant_plot_peptide +from protzilla.data_analysis.time_series_plot_peptide import time_quant_plot_peptide from protzilla.data_analysis.protein_graphs import peptides_to_isoform, variation_graph from protzilla.methods.data_preprocessing import TransformationLog from protzilla.steps import Plots, Step, StepManager @@ -326,7 +326,7 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict: ) return inputs -class PlotTimeQuantPeptide(PlotStep): +class PlotTimeSeriesPeptide(PlotStep): display_name = "Time Quantification Plot For Peptide" operation = "plot" method_description = ( diff --git a/ui/runs/form_mapping.py b/ui/runs/form_mapping.py index adac90a1..083676f0 100644 --- a/ui/runs/form_mapping.py +++ b/ui/runs/form_mapping.py @@ -49,7 +49,7 @@ data_analysis.PlotScatterPlot: data_analysis_forms.PlotScatterPlotForm, data_analysis.PlotClustergram: data_analysis_forms.PlotClustergramForm, data_analysis.PlotProtQuant: data_analysis_forms.PlotProtQuantForm, - data_analysis.PlotTimeQuantPeptide: data_analysis_forms.PlotTimeQuantPeptideForm, + data_analysis.PlotTimeSeriesPeptide: data_analysis_forms.PlotTimeSeriesForm, data_analysis.PlotPrecisionRecallCurve: data_analysis_forms.PlotPrecisionRecallCurveForm, data_analysis.PlotROC: data_analysis_forms.PlotROCCurveForm, data_analysis.ClusteringKMeans: data_analysis_forms.ClusteringKMeansForm, diff --git a/ui/runs/forms/data_analysis.py b/ui/runs/forms/data_analysis.py index de182b5b..de3651b6 100644 --- a/ui/runs/forms/data_analysis.py +++ b/ui/runs/forms/data_analysis.py @@ -514,7 +514,7 @@ def fill_form(self, run: Run) -> None: self.data["similarity"] = 1 -class PlotTimeQuantPeptideForm(MethodForm): +class PlotTimeSeriesForm(MethodForm): is_dynamic = True input_df = CustomChoiceField( From 8a419b27f5b2a1cc6e0aae3fe7ffc7ec096f8e24 Mon Sep 17 00:00:00 2001 From: AK Date: Thu, 11 Jul 2024 16:15:38 +0200 Subject: [PATCH 09/52] Fixed Tests --- protzilla/importing/peptide_import.py | 1 - protzilla/methods/data_analysis.py | 4 ++-- user_data/workflows/workflow_Plot-Thesis.yaml | 4 ++-- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/protzilla/importing/peptide_import.py b/protzilla/importing/peptide_import.py index e5400204..d38495dd 100644 --- a/protzilla/importing/peptide_import.py +++ b/protzilla/importing/peptide_import.py @@ -87,7 +87,6 @@ def evidence_import(file_path, intensity_name, map_to_uniprot) -> dict: "Missed cleavages", "PEP", "Raw file", - "Retention time", ] read = pd.read_csv( diff --git a/protzilla/methods/data_analysis.py b/protzilla/methods/data_analysis.py index 42a77182..b2bbbbdf 100644 --- a/protzilla/methods/data_analysis.py +++ b/protzilla/methods/data_analysis.py @@ -21,7 +21,7 @@ prot_quant_plot, scatter_plot, ) -from protzilla.data_analysis.time_series_plot_peptide import time_quant_plot_peptide +from protzilla.data_analysis.time_series_plot_peptide import time_series_plot_peptide from protzilla.data_analysis.protein_graphs import peptides_to_isoform, variation_graph from protzilla.methods.data_preprocessing import TransformationLog from protzilla.steps import Plots, Step, StepManager @@ -337,7 +337,7 @@ class PlotTimeSeriesPeptide(PlotStep): output_keys = [] def method(self, inputs: dict) -> dict: - return time_quant_plot_peptide(**inputs) + return time_series_plot_peptide(**inputs) def insert_dataframes(self, steps: StepManager, inputs) -> dict: diff --git a/user_data/workflows/workflow_Plot-Thesis.yaml b/user_data/workflows/workflow_Plot-Thesis.yaml index 1758d861..a3dee9fa 100644 --- a/user_data/workflows/workflow_Plot-Thesis.yaml +++ b/user_data/workflows/workflow_Plot-Thesis.yaml @@ -18,8 +18,8 @@ steps: - form_inputs: similarity_measure: euclidean distance inputs: {} - instance_identifier: PlotProtQuant_1 - type: PlotProtQuantPeptide + instance_identifier: PlotTimeSeries_1 + type: PlotTimeSeriesPeptide - form_inputs: percentage: 0.5 inputs: { } From 61b6df881bc2d805c9cc89e82afa6597c8667489 Mon Sep 17 00:00:00 2001 From: AK Date: Thu, 11 Jul 2024 16:40:44 +0200 Subject: [PATCH 10/52] Implemented test for time series plot --- .../data_analysis/test_time_series_plots.py | 65 +++++++++++++++++++ 1 file changed, 65 insertions(+) create mode 100644 tests/protzilla/data_analysis/test_time_series_plots.py diff --git a/tests/protzilla/data_analysis/test_time_series_plots.py b/tests/protzilla/data_analysis/test_time_series_plots.py new file mode 100644 index 00000000..85bfad11 --- /dev/null +++ b/tests/protzilla/data_analysis/test_time_series_plots.py @@ -0,0 +1,65 @@ +import numpy as np +import pandas as pd +import pytest + +from protzilla.data_analysis.time_series_plot_peptide import time_series_plot_peptide + + +@pytest.fixture +def time_series_test_data(): + test_intensity_list = ( + ["Sample1", "Protein1", "Gene1", 20], + ["Sample1", "Protein2", "Gene1", 16], + ["Sample1", "Protein3", "Gene1", 1], + ["Sample1", "Protein4", "Gene1", 14], + ["Sample2", "Protein1", "Gene1", 20], + ["Sample2", "Protein2", "Gene1", 15], + ["Sample2", "Protein3", "Gene1", 2], + ["Sample2", "Protein4", "Gene1", 15], + ["Sample3", "Protein1", "Gene1", 22], + ["Sample3", "Protein2", "Gene1", 14], + ["Sample3", "Protein3", "Gene1", 3], + ["Sample3", "Protein4", "Gene1", 16], + ["Sample4", "Protein1", "Gene1", 8], + ["Sample4", "Protein2", "Gene1", 15], + ["Sample4", "Protein3", "Gene1", 1], + ["Sample4", "Protein4", "Gene1", 9], + ["Sample5", "Protein1", "Gene1", 10], + ["Sample5", "Protein2", "Gene1", 14], + ["Sample5", "Protein3", "Gene1", 2], + ["Sample5", "Protein4", "Gene1", 10], + ["Sample6", "Protein1", "Gene1", 12], + ["Sample6", "Protein2", "Gene1", 13], + ["Sample6", "Protein3", "Gene1", 3], + ["Sample6", "Protein4", "Gene1", 11], + ["Sample7", "Protein1", "Gene1", 12], + ["Sample7", "Protein2", "Gene1", 13], + ["Sample7", "Protein3", "Gene1", 3], + ["Sample7", "Protein4", "Gene1", 11], + ) + + test_intensity_df = pd.DataFrame( + data=test_intensity_list, + columns=["Sample", "Protein ID", "Gene", "Intensity"], + ) + + test_metadata_df = ( + ["Sample1", "02:00:00", 1], + ["Sample2", "06:00:00", 1], + ["Sample3", "10:00:00", 1], + ["Sample4", "14:00:00", 1], + ) + test_metadata_df = pd.DataFrame( + data=test_metadata_df, + columns=["Sample", "Time", "Day"], + ) + return test_intensity_df, test_metadata_df + +def test_time_series_plot(show_figures, time_series_test_data): + test_intensity, test_metadata = time_series_test_data + outputs = time_series_plot_peptide(test_intensity, test_metadata, "Protein1") + assert "plots" in outputs + fig = outputs["plots"][0] + if show_figures: + fig.show() + return \ No newline at end of file From 935f0b6f800c43de951209dec09876a4dbfe5565 Mon Sep 17 00:00:00 2001 From: AK Date: Thu, 18 Jul 2024 14:26:27 +0200 Subject: [PATCH 11/52] Implemented time series regression analysis --- protzilla/data_analysis/time_series_helper.py | 7 ++ .../time_series_regression_analysis.py | 91 +++++++++++++++++++ protzilla/methods/data_analysis.py | 23 +++++ ui/runs/form_mapping.py | 1 + ui/runs/forms/data_analysis.py | 20 +++- 5 files changed, 141 insertions(+), 1 deletion(-) create mode 100644 protzilla/data_analysis/time_series_helper.py create mode 100644 protzilla/data_analysis/time_series_regression_analysis.py diff --git a/protzilla/data_analysis/time_series_helper.py b/protzilla/data_analysis/time_series_helper.py new file mode 100644 index 00000000..b3ebe0c5 --- /dev/null +++ b/protzilla/data_analysis/time_series_helper.py @@ -0,0 +1,7 @@ +import pandas as pd +from datetime import datetime + +def convert_time_to_datetime(time_str): + time_obj = datetime.strptime(time_str, '%H:%M:%S') + seconds_since_midnight = time_obj.second + time_obj.minute * 60 + time_obj.hour * 3600 + return seconds_since_midnight diff --git a/protzilla/data_analysis/time_series_regression_analysis.py b/protzilla/data_analysis/time_series_regression_analysis.py new file mode 100644 index 00000000..3b06c2f3 --- /dev/null +++ b/protzilla/data_analysis/time_series_regression_analysis.py @@ -0,0 +1,91 @@ +import logging + +import numpy as np +import pandas as pd +import plotly.graph_objects as go +import plotly.express as px + +from protzilla.data_analysis.time_series_helper import convert_time_to_datetime + +from sklearn.linear_model import LinearRegression +from sklearn.model_selection import train_test_split +from sklearn.metrics import mean_squared_error, r2_score + +def time_series_linear_regression( + input_df: pd.DataFrame, + metadata_df: pd.DataFrame, + test_size: float, +): + + input_df = pd.merge( + left=input_df, + right=metadata_df, + on="Sample", + copy=False, + ) + + input_df["Time"] = input_df["Time"].apply(convert_time_to_datetime) + input_df = input_df.interpolate(method='linear', axis=0) + X = input_df[["Time"]] + y = input_df["Intensity"] + + X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, shuffle=False) + model = LinearRegression() + model.fit(X_train, y_train) + + y_pred_train = model.predict(X_train) + y_pred_test = model.predict(X_test) + + + """ + train_rmse = np.sqrt(mean_squared_error(y_train, y_pred_train)) + test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test)) + train_r2 = r2_score(y_train, y_pred_train) + test_r2 = r2_score(y_test, y_pred_test) + return dict( + train_rmse=train_rmse, + test_rmse=test_rmse, + train_r2=train_r2, + test_r2=test_r2, + ) + """ + + train_df = pd.DataFrame({'Time': X_train['Time'], 'Intensity': y_train, 'Predicted': y_pred_train, 'Type': 'Train'}) + test_df = pd.DataFrame({'Time': X_test['Time'], 'Intensity': y_test, 'Predicted': y_pred_test, 'Type': 'Test'}) + plot_df = pd.concat([train_df, test_df]) + + fig = go.Figure() + + fig.add_trace(go.Scatter( + x=plot_df['Time'], + y=plot_df['Intensity'], + mode='markers', + name='Actual Intensity', + marker=dict(color='blue') + )) + + fig.add_trace(go.Scatter( + x=plot_df['Time'], + y=plot_df['Predicted'], + mode='lines', + name='Predicted Intensity', + line=dict(color='red') + )) + + fig.update_layout( + title={ + "text": "Intensity over Time", + "font": dict(size=16), + "y": 0.98, + "x": 0.5, + "xanchor": "center", + "yanchor": "top", + }, + xaxis_title="Time", + yaxis_title="Intensity", + plot_bgcolor="white", + yaxis={"gridcolor": "lightgrey", "zerolinecolor": "lightgrey"}, + font=dict(size=14, family="Arial") + ) + + return dict(plot=[fig]) diff --git a/protzilla/methods/data_analysis.py b/protzilla/methods/data_analysis.py index b2bbbbdf..b8171a9a 100644 --- a/protzilla/methods/data_analysis.py +++ b/protzilla/methods/data_analysis.py @@ -12,6 +12,7 @@ mann_whitney_test_on_intensity_data from protzilla.data_analysis.differential_expression_t_test import t_test from protzilla.data_analysis.dimension_reduction import t_sne, umap +from protzilla.data_analysis.time_series_regression_analysis import time_series_linear_regression from protzilla.data_analysis.ptm_analysis import filter_peptides_of_protein, ptms_per_sample, \ ptms_per_protein_and_sample from protzilla.data_analysis.model_evaluation import evaluate_classification_model @@ -738,6 +739,28 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict: return inputs +class TimeSeriesLinearRegression(PlotStep): + display_name = "Time Series Linear Regression" + operation = "Time series analysis" + method_description = ("A function to fit a linear model using ordinary least squares for each protein. " + "The linear model fits the protein intensities on Y axis and the Time on X. " + "The p-values are corrected for multiple testing.") + + input_keys = [ + "input_df", + "metadata_df", + "test_size", + ] + output_keys = [] + + def method(self, inputs: dict) -> dict: + return time_series_linear_regression(**inputs) + + def insert_dataframes(self, steps: StepManager, inputs) -> dict: + inputs["input_df"] = steps.get_step_output(Step, "peptide_df", inputs["input_df"]) + inputs["metadata_df"] = steps.metadata_df + return inputs + class PTMsPerSample(DataAnalysisStep): display_name = "PTMs per Sample" operation = "Peptide analysis" diff --git a/ui/runs/form_mapping.py b/ui/runs/form_mapping.py index 083676f0..c14d9a20 100644 --- a/ui/runs/form_mapping.py +++ b/ui/runs/form_mapping.py @@ -65,6 +65,7 @@ data_analysis.SelectPeptidesForProtein: data_analysis_forms.SelectPeptidesForProteinForm, data_analysis.PTMsPerSample: data_analysis_forms.PTMsPerSampleForm, data_analysis.PTMsProteinAndPerSample: data_analysis_forms.PTMsPerProteinAndSampleForm, + data_analysis.TimeSeriesLinearRegression: data_analysis_forms.PlotTimeSeriesLinearRegressionForm, data_preprocessing.ImputationByMinPerSample: data_preprocessing_forms.ImputationByMinPerSampleForms, data_integration.EnrichmentAnalysisGOAnalysisWithString: data_integration_forms.EnrichmentAnalysisGOAnalysisWithStringForm, data_integration.EnrichmentAnalysisGOAnalysisWithEnrichr: data_integration_forms.EnrichmentAnalysisGOAnalysisWithEnrichrForm, diff --git a/ui/runs/forms/data_analysis.py b/ui/runs/forms/data_analysis.py index de3651b6..9e6d6f29 100644 --- a/ui/runs/forms/data_analysis.py +++ b/ui/runs/forms/data_analysis.py @@ -1161,4 +1161,22 @@ def fill_form(self, run: Run) -> None: SelectPeptidesForProtein, "peptide_df" ) if single_protein_peptides: - self.fields["peptide_df"].initial = single_protein_peptides[0] \ No newline at end of file + self.fields["peptide_df"].initial = single_protein_peptides[0] + + +class PlotTimeSeriesLinearRegressionForm(MethodForm): + input_df = CustomChoiceField( + choices=[], + label="Peptide dataframe containing the peptides of a single protein", + ) + test_size = CustomFloatField( + label="Test size", + min_value=0, + max_value=1, + initial=0.2 + ) + + def fill_form(self, run: Run) -> None: + self.fields["input_df"].choices = fill_helper.get_choices_for_peptide_df_steps( + run + ) From de89b56a7215089b35f735387962d845bb868e77 Mon Sep 17 00:00:00 2001 From: AK Date: Fri, 19 Jul 2024 13:41:23 +0200 Subject: [PATCH 12/52] Implemented time series regression analysis --- protzilla/data_analysis/time_series_helper.py | 4 +- .../time_series_regression_analysis.py | 70 +++++++++++-------- protzilla/methods/data_analysis.py | 10 ++- ui/runs/form_mapping.py | 2 +- ui/runs/forms/data_analysis.py | 23 +++++- 5 files changed, 71 insertions(+), 38 deletions(-) diff --git a/protzilla/data_analysis/time_series_helper.py b/protzilla/data_analysis/time_series_helper.py index b3ebe0c5..b10b13b6 100644 --- a/protzilla/data_analysis/time_series_helper.py +++ b/protzilla/data_analysis/time_series_helper.py @@ -3,5 +3,5 @@ def convert_time_to_datetime(time_str): time_obj = datetime.strptime(time_str, '%H:%M:%S') - seconds_since_midnight = time_obj.second + time_obj.minute * 60 + time_obj.hour * 3600 - return seconds_since_midnight + hours_since_midnight = time_obj.hour + return hours_since_midnight \ No newline at end of file diff --git a/protzilla/data_analysis/time_series_regression_analysis.py b/protzilla/data_analysis/time_series_regression_analysis.py index 3b06c2f3..b3fb2099 100644 --- a/protzilla/data_analysis/time_series_regression_analysis.py +++ b/protzilla/data_analysis/time_series_regression_analysis.py @@ -3,20 +3,35 @@ import numpy as np import pandas as pd import plotly.graph_objects as go -import plotly.express as px -from protzilla.data_analysis.time_series_helper import convert_time_to_datetime +from protzilla.data_analysis.time_series_helper import convert_time_to_datetime +from protzilla.constants.colors import PROTZILLA_DISCRETE_COLOR_SEQUENCE from sklearn.linear_model import LinearRegression from sklearn.model_selection import train_test_split from sklearn.metrics import mean_squared_error, r2_score +colors = { + "plot_bgcolor": "white", + "gridcolor": "#F1F1F1", + "linecolor": "#F1F1F1", + "annotation_text_color": "#ffffff", + "annotation_proteins_of_interest": "#4A536A", +} + + def time_series_linear_regression( input_df: pd.DataFrame, metadata_df: pd.DataFrame, + protein_group: str, test_size: float, ): + if test_size < 0 or test_size > 1 : + raise ValueError("Test size should be between 0 and 1") + + input_df = input_df[input_df['Protein ID'] == protein_group] + input_df = pd.merge( left=input_df, right=metadata_df, @@ -36,19 +51,10 @@ def time_series_linear_regression( y_pred_train = model.predict(X_train) y_pred_test = model.predict(X_test) - - """ - train_rmse = np.sqrt(mean_squared_error(y_train, y_pred_train)) - test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test)) - train_r2 = r2_score(y_train, y_pred_train) - test_r2 = r2_score(y_test, y_pred_test) - return dict( - train_rmse=train_rmse, - test_rmse=test_rmse, - train_r2=train_r2, - test_r2=test_r2, - ) - """ + train_rmse = np.sqrt(mean_squared_error(y_train, y_pred_train)) + test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test)) + train_r2 = r2_score(y_train, y_pred_train) + test_r2 = r2_score(y_test, y_pred_test) train_df = pd.DataFrame({'Time': X_train['Time'], 'Intensity': y_train, 'Predicted': y_pred_train, 'Type': 'Train'}) test_df = pd.DataFrame({'Time': X_test['Time'], 'Intensity': y_test, 'Predicted': y_pred_test, 'Type': 'Test'}) @@ -61,7 +67,7 @@ def time_series_linear_regression( y=plot_df['Intensity'], mode='markers', name='Actual Intensity', - marker=dict(color='blue') + marker=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[0]) )) fig.add_trace(go.Scatter( @@ -69,23 +75,27 @@ def time_series_linear_regression( y=plot_df['Predicted'], mode='lines', name='Predicted Intensity', - line=dict(color='red') + line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[2]) )) fig.update_layout( - title={ - "text": "Intensity over Time", - "font": dict(size=16), - "y": 0.98, - "x": 0.5, - "xanchor": "center", - "yanchor": "top", - }, - xaxis_title="Time", + title=f"Intensity over Time for {protein_group}", + plot_bgcolor=colors["plot_bgcolor"], + xaxis_gridcolor=colors["gridcolor"], + yaxis_gridcolor=colors["gridcolor"], + xaxis_linecolor=colors["linecolor"], + yaxis_linecolor=colors["linecolor"], + xaxis_title="Time (hours)", yaxis_title="Intensity", - plot_bgcolor="white", - yaxis={"gridcolor": "lightgrey", "zerolinecolor": "lightgrey"}, - font=dict(size=14, family="Arial") + legend_title="Legend", + autosize=True, + margin=dict(l=100, r=300, t=100, b=100), ) - return dict(plot=[fig]) + return dict( + train_root_mean_squared=train_rmse, + test_root_mean_squared=test_rmse, + train_r2_score=train_r2, + test_r2_score=test_r2, + plots=[fig], + ) diff --git a/protzilla/methods/data_analysis.py b/protzilla/methods/data_analysis.py index 2bbd0c6a..752bb5c6 100644 --- a/protzilla/methods/data_analysis.py +++ b/protzilla/methods/data_analysis.py @@ -799,7 +799,7 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict: class TimeSeriesLinearRegression(PlotStep): - display_name = "Time Series Linear Regression" + display_name = "Linear Regression" operation = "Time series analysis" method_description = ("A function to fit a linear model using ordinary least squares for each protein. " "The linear model fits the protein intensities on Y axis and the Time on X. " @@ -808,9 +808,15 @@ class TimeSeriesLinearRegression(PlotStep): input_keys = [ "input_df", "metadata_df", + "protein_group", "test_size", ] - output_keys = [] + output_keys = [ + "train_root_mean_squared", + "test_root_mean_squared", + "train_r2_score", + "test_r2_score", + ] def method(self, inputs: dict) -> dict: return time_series_linear_regression(**inputs) diff --git a/ui/runs/form_mapping.py b/ui/runs/form_mapping.py index b2cb4c8e..a6d350d2 100644 --- a/ui/runs/form_mapping.py +++ b/ui/runs/form_mapping.py @@ -66,7 +66,7 @@ data_analysis.FLEXIQuantLF: data_analysis_forms.FLEXIQuantLFForm, data_analysis.PTMsPerSample: data_analysis_forms.PTMsPerSampleForm, data_analysis.PTMsProteinAndPerSample: data_analysis_forms.PTMsPerProteinAndSampleForm, - data_analysis.TimeSeriesLinearRegression: data_analysis_forms.PlotTimeSeriesLinearRegressionForm, + data_analysis.TimeSeriesLinearRegression: data_analysis_forms.TimeSeriesLinearRegressionForm, data_preprocessing.ImputationByMinPerSample: data_preprocessing_forms.ImputationByMinPerSampleForms, data_integration.EnrichmentAnalysisGOAnalysisWithString: data_integration_forms.EnrichmentAnalysisGOAnalysisWithStringForm, data_integration.EnrichmentAnalysisGOAnalysisWithEnrichr: data_integration_forms.EnrichmentAnalysisGOAnalysisWithEnrichrForm, diff --git a/ui/runs/forms/data_analysis.py b/ui/runs/forms/data_analysis.py index fa3a2825..7b80d0d0 100644 --- a/ui/runs/forms/data_analysis.py +++ b/ui/runs/forms/data_analysis.py @@ -1214,19 +1214,36 @@ def fill_form(self, run: Run) -> None: self.fields["peptide_df"].initial = single_protein_peptides[0] -class PlotTimeSeriesLinearRegressionForm(MethodForm): +class TimeSeriesLinearRegressionForm(MethodForm): input_df = CustomChoiceField( choices=[], - label="Peptide dataframe containing the peptides of a single protein", + label="Peptide dataframe", + ) + protein_group = CustomChoiceField( + choices=[], + label="Protein group: which protein group to perform the linear regression on", ) test_size = CustomFloatField( - label="Test size", + label="Test size: proportion of the dataset to include in the test split", min_value=0, max_value=1, + step_size=0.1, initial=0.2 ) + def fill_form(self, run: Run) -> None: self.fields["input_df"].choices = fill_helper.get_choices_for_peptide_df_steps( run ) + input_df_instance_id = self.data.get( + "input_df", self.fields["input_df"].choices[0][0] + ) + + self.fields["protein_group"].choices = fill_helper.to_choices( + run.steps.get_step_output( + step_type=Step, + output_key="peptide_df", + instance_identifier=input_df_instance_id, + )["Protein ID"].unique() + ) From 97ace4af9cfe128f85bceb2f9c99fcde82639a42 Mon Sep 17 00:00:00 2001 From: AK Date: Fri, 19 Jul 2024 13:51:20 +0200 Subject: [PATCH 13/52] Added Docstrings --- protzilla/data_analysis/time_series_helper.py | 6 ++++++ .../data_analysis/time_series_regression_analysis.py | 9 +++++++++ 2 files changed, 15 insertions(+) diff --git a/protzilla/data_analysis/time_series_helper.py b/protzilla/data_analysis/time_series_helper.py index b10b13b6..e2ead503 100644 --- a/protzilla/data_analysis/time_series_helper.py +++ b/protzilla/data_analysis/time_series_helper.py @@ -2,6 +2,12 @@ from datetime import datetime def convert_time_to_datetime(time_str): + """ + Convert a string time to a datetime object + :param time_str: The time string to convert + + :return: A datetime object + """ time_obj = datetime.strptime(time_str, '%H:%M:%S') hours_since_midnight = time_obj.hour return hours_since_midnight \ No newline at end of file diff --git a/protzilla/data_analysis/time_series_regression_analysis.py b/protzilla/data_analysis/time_series_regression_analysis.py index b3fb2099..f117bccb 100644 --- a/protzilla/data_analysis/time_series_regression_analysis.py +++ b/protzilla/data_analysis/time_series_regression_analysis.py @@ -26,6 +26,15 @@ def time_series_linear_regression( protein_group: str, test_size: float, ): + """ + Perform linear regression on the time series data for a given protein group. + :param input_df: Peptide dataframe which contains the intensity of each sample + :param metadata_df: Metadata dataframe which contains the timestamps + :param protein_group: Protein group to perform the analysis on + :param test_size: The proportion of the dataset to include in the test split + + :return: A dictionary containing the root mean squared error and r2 score for the training and test sets + """ if test_size < 0 or test_size > 1 : raise ValueError("Test size should be between 0 and 1") From 38eb985c0872f8d39ba4bc85a94ab91ce4207134 Mon Sep 17 00:00:00 2001 From: AK Date: Fri, 19 Jul 2024 14:22:08 +0200 Subject: [PATCH 14/52] Implemented tests --- .../test_time_series_analysis.py | 79 +++++++++++++++++++ .../data_analysis/test_time_series_plots.py | 7 +- 2 files changed, 85 insertions(+), 1 deletion(-) create mode 100644 tests/protzilla/data_analysis/test_time_series_analysis.py diff --git a/tests/protzilla/data_analysis/test_time_series_analysis.py b/tests/protzilla/data_analysis/test_time_series_analysis.py new file mode 100644 index 00000000..ff01ba50 --- /dev/null +++ b/tests/protzilla/data_analysis/test_time_series_analysis.py @@ -0,0 +1,79 @@ +import pandas as pd +import pytest + +from protzilla.data_analysis.time_series_regression_analysis import time_series_linear_regression + + +@pytest.fixture +def time_series_test_data(): + test_intensity_list = ( + ["Sample1", "Protein1", "Gene1", 20], + ["Sample1", "Protein2", "Gene1", 16], + ["Sample1", "Protein3", "Gene1", 1], + ["Sample1", "Protein4", "Gene1", 14], + ["Sample2", "Protein1", "Gene1", 20], + ["Sample2", "Protein2", "Gene1", 15], + ["Sample2", "Protein3", "Gene1", 2], + ["Sample2", "Protein4", "Gene1", 15], + ["Sample3", "Protein1", "Gene1", 22], + ["Sample3", "Protein2", "Gene1", 14], + ["Sample3", "Protein3", "Gene1", 3], + ["Sample3", "Protein4", "Gene1", 16], + ["Sample4", "Protein1", "Gene1", 8], + ["Sample4", "Protein2", "Gene1", 15], + ["Sample4", "Protein3", "Gene1", 1], + ["Sample4", "Protein4", "Gene1", 9], + ["Sample5", "Protein1", "Gene1", 10], + ["Sample5", "Protein2", "Gene1", 14], + ["Sample5", "Protein3", "Gene1", 2], + ["Sample5", "Protein4", "Gene1", 10], + ["Sample6", "Protein1", "Gene1", 12], + ["Sample6", "Protein2", "Gene1", 13], + ["Sample6", "Protein3", "Gene1", 3], + ["Sample6", "Protein4", "Gene1", 11], + ["Sample7", "Protein1", "Gene1", 12], + ["Sample7", "Protein2", "Gene1", 13], + ["Sample7", "Protein3", "Gene1", 3], + ["Sample7", "Protein4", "Gene1", 11], + ) + + test_intensity_df = pd.DataFrame( + data=test_intensity_list, + columns=["Sample", "Protein ID", "Gene", "Intensity"], + ) + + test_metadata_df = ( + ["Sample1", "02:00:00", 1], + ["Sample2", "06:00:00", 1], + ["Sample3", "10:00:00", 1], + ["Sample4", "14:00:00", 1], + ) + test_metadata_df = pd.DataFrame( + data=test_metadata_df, + columns=["Sample", "Time", "Day"], + ) + return test_intensity_df, test_metadata_df + +def test_linear_regression_plot(show_figures, time_series_test_data): + test_intensity, test_metadata = time_series_test_data + outputs = time_series_linear_regression(test_intensity, test_metadata, "Protein1", 0.2) + assert "plots" in outputs + fig = outputs["plots"][0] + if show_figures: + fig.show() + return + +def test_linear_regression_plot_invalid_test_size(time_series_test_data): + test_intensity, test_metadata = time_series_test_data + with pytest.raises(ValueError): + time_series_linear_regression(test_intensity, test_metadata, "Protein1", 2) + return + +def test_linear_regression_outputs(time_series_test_data): + test_intensity, test_metadata = time_series_test_data + outputs = time_series_linear_regression(test_intensity, test_metadata, "Protein1", 0.2) + assert "train_root_mean_squared" in outputs + assert "test_root_mean_squared" in outputs + assert "train_r2_score" in outputs + assert "test_r2_score" in outputs + return \ No newline at end of file diff --git a/tests/protzilla/data_analysis/test_time_series_plots.py b/tests/protzilla/data_analysis/test_time_series_plots.py index 85bfad11..904d46a2 100644 --- a/tests/protzilla/data_analysis/test_time_series_plots.py +++ b/tests/protzilla/data_analysis/test_time_series_plots.py @@ -1,4 +1,3 @@ -import numpy as np import pandas as pd import pytest @@ -62,4 +61,10 @@ def test_time_series_plot(show_figures, time_series_test_data): fig = outputs["plots"][0] if show_figures: fig.show() + return + +def test_time_series_plot_invalid_similarity(time_series_test_data): + test_intensity, test_metadata = time_series_test_data + with pytest.raises(ValueError): + time_series_plot_peptide(test_intensity, test_metadata, "Protein1", similarity=-1, similarity_measure="euclidean distance") return \ No newline at end of file From d7522254a7402b8eb52deb9c4a1c01264d609670 Mon Sep 17 00:00:00 2001 From: AK Date: Fri, 19 Jul 2024 14:27:43 +0200 Subject: [PATCH 15/52] Implemented tests --- tests/protzilla/data_analysis/test_time_series_plots.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/tests/protzilla/data_analysis/test_time_series_plots.py b/tests/protzilla/data_analysis/test_time_series_plots.py index 904d46a2..12249fb0 100644 --- a/tests/protzilla/data_analysis/test_time_series_plots.py +++ b/tests/protzilla/data_analysis/test_time_series_plots.py @@ -63,8 +63,14 @@ def test_time_series_plot(show_figures, time_series_test_data): fig.show() return -def test_time_series_plot_invalid_similarity(time_series_test_data): +def test_time_series_plot_invalid_euclidean_similarity(time_series_test_data): test_intensity, test_metadata = time_series_test_data with pytest.raises(ValueError): time_series_plot_peptide(test_intensity, test_metadata, "Protein1", similarity=-1, similarity_measure="euclidean distance") + return + +def test_time_series_plot_invalid_cosine_similarity(time_series_test_data): + test_intensity, test_metadata = time_series_test_data + with pytest.raises(ValueError): + time_series_plot_peptide(test_intensity, test_metadata, "Protein1", similarity=2, similarity_measure="cosine similarity") return \ No newline at end of file From 47556f372a88aa4cd1c0fc5b1d48b8db2c235e68 Mon Sep 17 00:00:00 2001 From: AK Date: Fri, 19 Jul 2024 14:38:38 +0200 Subject: [PATCH 16/52] made some minor changes --- protzilla/data_analysis/time_series_helper.py | 1 - protzilla/data_analysis/time_series_regression_analysis.py | 2 -- 2 files changed, 3 deletions(-) diff --git a/protzilla/data_analysis/time_series_helper.py b/protzilla/data_analysis/time_series_helper.py index e2ead503..077e7e06 100644 --- a/protzilla/data_analysis/time_series_helper.py +++ b/protzilla/data_analysis/time_series_helper.py @@ -1,4 +1,3 @@ -import pandas as pd from datetime import datetime def convert_time_to_datetime(time_str): diff --git a/protzilla/data_analysis/time_series_regression_analysis.py b/protzilla/data_analysis/time_series_regression_analysis.py index f117bccb..3785116f 100644 --- a/protzilla/data_analysis/time_series_regression_analysis.py +++ b/protzilla/data_analysis/time_series_regression_analysis.py @@ -1,5 +1,3 @@ -import logging - import numpy as np import pandas as pd import plotly.graph_objects as go From 4f8737defec2ae3486cbc1b76f9e08e4df56c3e8 Mon Sep 17 00:00:00 2001 From: AK Date: Wed, 24 Jul 2024 15:08:59 +0200 Subject: [PATCH 17/52] Implemented RANSAC regression --- .../time_series_regression_analysis.py | 181 +++++++++++++++++- protzilla/methods/data_analysis.py | 29 ++- ui/runs/form_mapping.py | 1 + ui/runs/forms/data_analysis.py | 35 ++++ 4 files changed, 238 insertions(+), 8 deletions(-) diff --git a/protzilla/data_analysis/time_series_regression_analysis.py b/protzilla/data_analysis/time_series_regression_analysis.py index 3785116f..2d5622ff 100644 --- a/protzilla/data_analysis/time_series_regression_analysis.py +++ b/protzilla/data_analysis/time_series_regression_analysis.py @@ -5,15 +5,16 @@ from protzilla.data_analysis.time_series_helper import convert_time_to_datetime from protzilla.constants.colors import PROTZILLA_DISCRETE_COLOR_SEQUENCE -from sklearn.linear_model import LinearRegression +from sklearn.linear_model import LinearRegression, RANSACRegressor from sklearn.model_selection import train_test_split from sklearn.metrics import mean_squared_error, r2_score +from plotly.subplots import make_subplots colors = { "plot_bgcolor": "white", "gridcolor": "#F1F1F1", "linecolor": "#F1F1F1", - "annotation_text_color": "#ffffff", + "annotation_text_color": "#4c4c4c", "annotation_proteins_of_interest": "#4A536A", } @@ -67,15 +68,16 @@ def time_series_linear_regression( test_df = pd.DataFrame({'Time': X_test['Time'], 'Intensity': y_test, 'Predicted': y_pred_test, 'Type': 'Test'}) plot_df = pd.concat([train_df, test_df]) - fig = go.Figure() + fig = make_subplots(rows=1, cols=2, column_widths=[0.75, 0.25], vertical_spacing=0.025) + # Add main plot traces fig.add_trace(go.Scatter( x=plot_df['Time'], y=plot_df['Intensity'], mode='markers', name='Actual Intensity', marker=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[0]) - )) + ), row=1, col=1) fig.add_trace(go.Scatter( x=plot_df['Time'], @@ -83,8 +85,26 @@ def time_series_linear_regression( mode='lines', name='Predicted Intensity', line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[2]) - )) + ), row=1, col=1) + # Add annotation text as a separate trace in the subplot + annotation_text = ( + f"Train RMSE: {train_rmse:.3f}
" + f"Test RMSE: {test_rmse:.3f}
" + f"Train R²: {train_r2:.3f}
" + f"Test R²: {test_r2:.3f}" + ) + + fig.add_trace(go.Scatter( + x=[0], + y=[0.25], + text=[annotation_text], + mode='text', + textfont=dict(size=12), + showlegend=False + ), row=1, col=2) + + # Update layout fig.update_layout( title=f"Intensity over Time for {protein_group}", plot_bgcolor=colors["plot_bgcolor"], @@ -92,13 +112,160 @@ def time_series_linear_regression( yaxis_gridcolor=colors["gridcolor"], xaxis_linecolor=colors["linecolor"], yaxis_linecolor=colors["linecolor"], - xaxis_title="Time (hours)", + xaxis_title="Time", yaxis_title="Intensity", legend_title="Legend", autosize=True, - margin=dict(l=100, r=300, t=100, b=100), + margin=dict(l=100, r=100, t=100, b=50), + legend=dict( + yanchor="top", + y=0.95, + xanchor="right", + x=0.85 + ) + ) + + # Hide x-axis of the annotation subplot + fig.update_xaxes(showticklabels=False, showgrid=False, zeroline=False, row=1, col=2) + fig.update_yaxes(showticklabels=False, showgrid=False, zeroline=False, row=1, col=2) + + # Adjust subplot titles + fig.update_annotations(font_size=12) + + return dict( + train_root_mean_squared=train_rmse, + test_root_mean_squared=test_rmse, + train_r2_score=train_r2, + test_r2_score=test_r2, + plots=[fig], ) + +def time_series_ransac_regression( + input_df: pd.DataFrame, + metadata_df: pd.DataFrame, + protein_group: str, + test_size: float, +): + """ + Perform RANSAC regression on the time series data for a given protein group. + :param input_df: Peptide dataframe which contains the intensity of each sample + :param metadata_df: Metadata dataframe which contains the timestamps + :param protein_group: Protein group to perform the analysis on + :param test_size: The proportion of the dataset to include in the test split + + :return: A dictionary containing the root mean squared error and r2 score for the training and test sets + """ + + if test_size < 0 or test_size > 1: + raise ValueError("Test size should be between 0 and 1") + + input_df = input_df[input_df['Protein ID'] == protein_group] + + input_df = pd.merge( + left=input_df, + right=metadata_df, + on="Sample", + copy=False, + ) + + input_df["Time"] = input_df["Time"].apply(convert_time_to_datetime) + input_df = input_df.interpolate(method='linear', axis=0) + X = input_df[["Time"]] + y = input_df["Intensity"] + + X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, shuffle=False) + model = RANSACRegressor(base_estimator=LinearRegression()) + model.fit(X_train, y_train) + + inlier_mask = model.inlier_mask_ + + y_pred_train = model.predict(X_train) + y_pred_test = model.predict(X_test) + + train_rmse = np.sqrt(mean_squared_error(y_train[inlier_mask], y_pred_train[inlier_mask])) + test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test)) + train_r2 = r2_score(y_train[inlier_mask], y_pred_train[inlier_mask]) + test_r2 = r2_score(y_test, y_pred_test) + + train_df = pd.DataFrame({'Time': X_train["Time"], 'Intensity': y_train, 'Predicted': y_pred_train, 'Type': 'Train'}) + test_df = pd.DataFrame({'Time': X_test["Time"], 'Intensity': y_test, 'Predicted': y_pred_test, 'Type': 'Test'}) + train_df['Inlier'] = inlier_mask + test_df['Inlier'] = False + plot_df = pd.concat([train_df, test_df]) + + fig = make_subplots(rows=1, cols=2, column_widths=[0.75, 0.25], vertical_spacing=0.025) + + # Add main plot traces + fig.add_trace(go.Scatter( + x=plot_df['Time'], + y=plot_df['Intensity'], + mode='markers', + name='Actual Intensity', + marker=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[0]) + ), row=1, col=1) + + fig.add_trace(go.Scatter( + x=plot_df['Time'], + y=plot_df['Predicted'], + mode='lines', + name='Predicted Intensity', + line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[2]) + ), row=1, col=1) + + fig.add_trace(go.Scatter( + x=plot_df[plot_df['Inlier'] == False]['Time'], + y=plot_df[plot_df['Inlier'] == False]['Intensity'], + mode='markers', + name='Outliers', + marker=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[1]) + ), row=1, col=1) + + # Add annotation text as a separate trace in the subplot + annotation_text = ( + f"Train RMSE: {train_rmse:.3f}
" + f"Test RMSE: {test_rmse:.3f}
" + f"Train R²: {train_r2:.3f}
" + f"Test R²: {test_r2:.3f}" + ) + + fig.add_trace(go.Scatter( + x=[0], + y=[0.25], + text=[annotation_text], + mode='text', + textfont=dict(size=12), + showlegend=False + ), row=1, col=2) + + # Update layout + fig.update_layout( + title=f"Intensity over Time for {protein_group}", + plot_bgcolor=colors["plot_bgcolor"], + xaxis_gridcolor=colors["gridcolor"], + yaxis_gridcolor=colors["gridcolor"], + xaxis_linecolor=colors["linecolor"], + yaxis_linecolor=colors["linecolor"], + xaxis_title="Time", + yaxis_title="Intensity", + legend_title="Legend", + autosize=True, + margin=dict(l=100, r=100, t=100, b=50), + legend=dict( + yanchor="top", + y=0.95, + xanchor="right", + x=0.85 + ) + ) + + # Hide x-axis of the annotation subplot + fig.update_xaxes(showticklabels=False, showgrid=False, zeroline=False, row=1, col=2) + fig.update_yaxes(showticklabels=False, showgrid=False, zeroline=False, row=1, col=2) + + # Adjust subplot titles + fig.update_annotations(font_size=12) + return dict( train_root_mean_squared=train_rmse, test_root_mean_squared=test_rmse, diff --git a/protzilla/methods/data_analysis.py b/protzilla/methods/data_analysis.py index 752bb5c6..6f45e7e1 100644 --- a/protzilla/methods/data_analysis.py +++ b/protzilla/methods/data_analysis.py @@ -14,7 +14,7 @@ ) from protzilla.data_analysis.differential_expression_t_test import t_test from protzilla.data_analysis.dimension_reduction import t_sne, umap -from protzilla.data_analysis.time_series_regression_analysis import time_series_linear_regression +from protzilla.data_analysis.time_series_regression_analysis import time_series_linear_regression, time_series_ransac_regression from protzilla.data_analysis.ptm_analysis import filter_peptides_of_protein, ptms_per_sample, \ ptms_per_protein_and_sample from protzilla.data_analysis.model_evaluation import evaluate_classification_model @@ -826,6 +826,33 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict: inputs["metadata_df"] = steps.metadata_df return inputs + +class TimeSeriesRANSACRegression(PlotStep): + display_name = "RANSAC Regression" + operation = "Time series analysis" + method_description = " Perform RANSAC regression on the time series data for a given protein group." + + input_keys = [ + "input_df", + "metadata_df", + "protein_group", + "test_size", + ] + output_keys = [ + "train_root_mean_squared", + "test_root_mean_squared", + "train_r2_score", + "test_r2_score", + ] + + def method(self, inputs: dict) -> dict: + return time_series_ransac_regression(**inputs) + + def insert_dataframes(self, steps: StepManager, inputs) -> dict: + inputs["input_df"] = steps.get_step_output(Step, "peptide_df", inputs["input_df"]) + inputs["metadata_df"] = steps.metadata_df + return inputs + class PTMsPerSample(DataAnalysisStep): display_name = "PTMs per Sample" operation = "Peptide analysis" diff --git a/ui/runs/form_mapping.py b/ui/runs/form_mapping.py index a6d350d2..079e1569 100644 --- a/ui/runs/form_mapping.py +++ b/ui/runs/form_mapping.py @@ -67,6 +67,7 @@ data_analysis.PTMsPerSample: data_analysis_forms.PTMsPerSampleForm, data_analysis.PTMsProteinAndPerSample: data_analysis_forms.PTMsPerProteinAndSampleForm, data_analysis.TimeSeriesLinearRegression: data_analysis_forms.TimeSeriesLinearRegressionForm, + data_analysis.TimeSeriesRANSACRegression: data_analysis_forms.TimeSeriesRANSACRegressionForm, data_preprocessing.ImputationByMinPerSample: data_preprocessing_forms.ImputationByMinPerSampleForms, data_integration.EnrichmentAnalysisGOAnalysisWithString: data_integration_forms.EnrichmentAnalysisGOAnalysisWithStringForm, data_integration.EnrichmentAnalysisGOAnalysisWithEnrichr: data_integration_forms.EnrichmentAnalysisGOAnalysisWithEnrichrForm, diff --git a/ui/runs/forms/data_analysis.py b/ui/runs/forms/data_analysis.py index 7b80d0d0..99d23798 100644 --- a/ui/runs/forms/data_analysis.py +++ b/ui/runs/forms/data_analysis.py @@ -1247,3 +1247,38 @@ def fill_form(self, run: Run) -> None: instance_identifier=input_df_instance_id, )["Protein ID"].unique() ) + + +class TimeSeriesRANSACRegressionForm(MethodForm): + input_df = CustomChoiceField( + choices=[], + label="Peptide dataframe", + ) + protein_group = CustomChoiceField( + choices=[], + label="Protein group: which protein group to perform the RANSAC regression on", + ) + test_size = CustomFloatField( + label="Test size: proportion of the dataset to include in the test split", + min_value=0, + max_value=1, + step_size=0.1, + initial=0.2 + ) + + + def fill_form(self, run: Run) -> None: + self.fields["input_df"].choices = fill_helper.get_choices_for_peptide_df_steps( + run + ) + input_df_instance_id = self.data.get( + "input_df", self.fields["input_df"].choices[0][0] + ) + + self.fields["protein_group"].choices = fill_helper.to_choices( + run.steps.get_step_output( + step_type=Step, + output_key="peptide_df", + instance_identifier=input_df_instance_id, + )["Protein ID"].unique() + ) \ No newline at end of file From c3fae9b14c97e6504256f59149d17700cf308303 Mon Sep 17 00:00:00 2001 From: selenabr Date: Thu, 20 Jun 2024 17:33:18 +0200 Subject: [PATCH 18/52] output field for result --- protzilla/methods/data_analysis.py | 4 ++++ protzilla/steps.py | 14 ++++++++++++++ 2 files changed, 18 insertions(+) diff --git a/protzilla/methods/data_analysis.py b/protzilla/methods/data_analysis.py index 6f45e7e1..d1f56e00 100644 --- a/protzilla/methods/data_analysis.py +++ b/protzilla/methods/data_analysis.py @@ -901,3 +901,7 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict: Step, "peptide_df", inputs["peptide_df"] ) return inputs + + def handle_outputs(self, outputs: dict): + super().handle_outputs(outputs) + self.display_output["required_sample_size"] = outputs["required_sample_size"] \ No newline at end of file diff --git a/protzilla/steps.py b/protzilla/steps.py index d5fb124e..7dec4936 100644 --- a/protzilla/steps.py +++ b/protzilla/steps.py @@ -36,6 +36,7 @@ def __init__(self, instance_identifier: str | None = None): self.messages: Messages = Messages([]) self.output: Output = Output() self.plots: Plots = Plots() + self.display_output: DisplayOutput = DisplayOutput() self.instance_identifier = instance_identifier if self.instance_identifier is None: @@ -310,6 +311,19 @@ def export(self, format_): exports.append(BytesIO(base64.b64decode(plot))) return exports +class DisplayOutput: + + def __init__(self, display_output: dict = None): + if display_output is None: + display_output = [] + self.display_output = display_output + def __iter__(self): + return iter(self.display_output) + def __repr__(self): + return f"DisplayOutput: {self.display_output}" + def __contains__(self, key): + return key in self.display_output + class StepManager: def __repr__(self): From 67c59c7f8fc6fc8661694f6cb9e0725be01fd889 Mon Sep 17 00:00:00 2001 From: selenabr Date: Fri, 21 Jun 2024 20:24:11 +0200 Subject: [PATCH 19/52] further implementation of output field for result --- protzilla/methods/data_analysis.py | 3 +-- protzilla/steps.py | 10 +++++++++- ui/runs/templates/runs/details.html | 7 +++++++ ui/runs/views.py | 8 ++++++++ 4 files changed, 25 insertions(+), 3 deletions(-) diff --git a/protzilla/methods/data_analysis.py b/protzilla/methods/data_analysis.py index d1f56e00..1c54da36 100644 --- a/protzilla/methods/data_analysis.py +++ b/protzilla/methods/data_analysis.py @@ -33,7 +33,7 @@ ) from protzilla.data_analysis.ptm_quantification import flexiquant_lf from protzilla.methods.data_preprocessing import TransformationLog -from protzilla.steps import Plots, Step, StepManager +from protzilla.steps import Plots, Step, StepManager, DisplayOutput class DataAnalysisStep(Step): @@ -844,7 +844,6 @@ class TimeSeriesRANSACRegression(PlotStep): "train_r2_score", "test_r2_score", ] - def method(self, inputs: dict) -> dict: return time_series_ransac_regression(**inputs) diff --git a/protzilla/steps.py b/protzilla/steps.py index 7dec4936..32ce93b3 100644 --- a/protzilla/steps.py +++ b/protzilla/steps.py @@ -315,7 +315,7 @@ class DisplayOutput: def __init__(self, display_output: dict = None): if display_output is None: - display_output = [] + display_output = {} self.display_output = display_output def __iter__(self): return iter(self.display_output) @@ -323,6 +323,14 @@ def __repr__(self): return f"DisplayOutput: {self.display_output}" def __contains__(self, key): return key in self.display_output + def __getitem__(self, key): + return self.display_output[key] + def __setitem__(self, key, value): + self.display_output[key] = value + def is_empty(self) -> bool: + return len(self.display_output) == 0 + + class StepManager: diff --git a/ui/runs/templates/runs/details.html b/ui/runs/templates/runs/details.html index 5809d356..a930f486 100644 --- a/ui/runs/templates/runs/details.html +++ b/ui/runs/templates/runs/details.html @@ -211,6 +211,13 @@

{{ display_name }}

{% endif %} {% endif %} + {% if display_output %} +
+ + +
+ {% endif %} {% else %}

You are at the end of the run. Go back to add more steps of the same section, or add steps of diff --git a/ui/runs/views.py b/ui/runs/views.py index b95be756..67635169 100644 --- a/ui/runs/views.py +++ b/ui/runs/views.py @@ -121,6 +121,12 @@ def detail(request: HttpRequest, run_name: str): and Path(run.current_outputs["graph_path"]).exists() ) + display_output_form = ( + run.steps.current_step.display_output is not None + and not run.current_step.display_output.is_empty() + ) + display_output_text = f"{run.current_step.display_output}" + return render( request, "runs/details.html", @@ -156,6 +162,8 @@ def detail(request: HttpRequest, run_name: str): method_form=method_form, is_form_dynamic=method_form.is_dynamic, plot_form=plot_form, + display_output=display_output_form, + display_output_result=display_output_text, ), ) From 20b5e69464137ecc9ee6d33ad378d90469aa3efb Mon Sep 17 00:00:00 2001 From: selenabr Date: Sun, 23 Jun 2024 02:43:34 +0200 Subject: [PATCH 20/52] display display_output in output field --- protzilla/methods/data_analysis.py | 2 +- ui/runs/templates/runs/details.html | 4 ++-- ui/runs/views.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/protzilla/methods/data_analysis.py b/protzilla/methods/data_analysis.py index 1c54da36..081b0d7f 100644 --- a/protzilla/methods/data_analysis.py +++ b/protzilla/methods/data_analysis.py @@ -903,4 +903,4 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict: def handle_outputs(self, outputs: dict): super().handle_outputs(outputs) - self.display_output["required_sample_size"] = outputs["required_sample_size"] \ No newline at end of file + self.display_output["required_sample_size"] = f"Required Sample Size: {outputs['required_sample_size']}" diff --git a/ui/runs/templates/runs/details.html b/ui/runs/templates/runs/details.html index a930f486..361875f7 100644 --- a/ui/runs/templates/runs/details.html +++ b/ui/runs/templates/runs/details.html @@ -213,8 +213,8 @@

{{ display_name }}

{% endif %} {% if display_output %}
- -
{% endif %} diff --git a/ui/runs/views.py b/ui/runs/views.py index 67635169..c4314306 100644 --- a/ui/runs/views.py +++ b/ui/runs/views.py @@ -125,7 +125,7 @@ def detail(request: HttpRequest, run_name: str): run.steps.current_step.display_output is not None and not run.current_step.display_output.is_empty() ) - display_output_text = f"{run.current_step.display_output}" + display_output_text = next(iter(run.current_step.display_output.display_output.values())) return render( request, From 3aa711d808a6326119c85193db0f0cb5a7a0bdec Mon Sep 17 00:00:00 2001 From: selenabr Date: Tue, 25 Jun 2024 13:25:53 +0200 Subject: [PATCH 21/52] display_output field displayed in the same size and position as the other fields --- ui/runs/static/runs/style.css | 7 +++++++ ui/runs/templates/runs/details.html | 15 ++++++++------- ui/runs/views.py | 2 +- 3 files changed, 16 insertions(+), 8 deletions(-) diff --git a/ui/runs/static/runs/style.css b/ui/runs/static/runs/style.css index 63d66a0b..477e0f11 100644 --- a/ui/runs/static/runs/style.css +++ b/ui/runs/static/runs/style.css @@ -75,3 +75,10 @@ html, body { #gsea_enrichment_plot_img { width: 800px; } + +.display-output-textarea { + display: flex; + width: 100%; + height: auto; + resize: none; +} \ No newline at end of file diff --git a/ui/runs/templates/runs/details.html b/ui/runs/templates/runs/details.html index 361875f7..84ec3cfd 100644 --- a/ui/runs/templates/runs/details.html +++ b/ui/runs/templates/runs/details.html @@ -209,13 +209,14 @@

{{ display_name }}

{% endif %} - - {% endif %} - {% if display_output %} -
- - + {% if display_output %} +
+ + +
+ {% endif %}
{% endif %} diff --git a/ui/runs/views.py b/ui/runs/views.py index c4314306..6d98d025 100644 --- a/ui/runs/views.py +++ b/ui/runs/views.py @@ -125,7 +125,7 @@ def detail(request: HttpRequest, run_name: str): run.steps.current_step.display_output is not None and not run.current_step.display_output.is_empty() ) - display_output_text = next(iter(run.current_step.display_output.display_output.values())) + display_output_text = next(iter(run.current_step.display_output.display_output.values()), None) return render( request, From 2b483f9ceb005979b7ae371e4393daf6b13e1d67 Mon Sep 17 00:00:00 2001 From: AK Date: Thu, 25 Jul 2024 10:38:34 +0200 Subject: [PATCH 22/52] Changed is_dynamic to True --- ui/runs/forms/data_analysis.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ui/runs/forms/data_analysis.py b/ui/runs/forms/data_analysis.py index 99d23798..fd8de70d 100644 --- a/ui/runs/forms/data_analysis.py +++ b/ui/runs/forms/data_analysis.py @@ -1215,6 +1215,7 @@ def fill_form(self, run: Run) -> None: class TimeSeriesLinearRegressionForm(MethodForm): + is_dynamic = True input_df = CustomChoiceField( choices=[], label="Peptide dataframe", @@ -1250,6 +1251,7 @@ def fill_form(self, run: Run) -> None: class TimeSeriesRANSACRegressionForm(MethodForm): + is_dynamic = True input_df = CustomChoiceField( choices=[], label="Peptide dataframe", From 5553ca769cb9f34b3762004e4cbc73780aec81d7 Mon Sep 17 00:00:00 2001 From: AK Date: Thu, 25 Jul 2024 10:59:44 +0200 Subject: [PATCH 23/52] Made some minor changes to the Plot positioning --- protzilla/data_analysis/time_series_regression_analysis.py | 3 ++- protzilla/methods/data_analysis.py | 3 --- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/protzilla/data_analysis/time_series_regression_analysis.py b/protzilla/data_analysis/time_series_regression_analysis.py index 2d5622ff..a69eced9 100644 --- a/protzilla/data_analysis/time_series_regression_analysis.py +++ b/protzilla/data_analysis/time_series_regression_analysis.py @@ -255,7 +255,7 @@ def time_series_ransac_regression( yanchor="top", y=0.95, xanchor="right", - x=0.85 + x=0.825 ) ) @@ -273,3 +273,4 @@ def time_series_ransac_regression( test_r2_score=test_r2, plots=[fig], ) + diff --git a/protzilla/methods/data_analysis.py b/protzilla/methods/data_analysis.py index 081b0d7f..687987a4 100644 --- a/protzilla/methods/data_analysis.py +++ b/protzilla/methods/data_analysis.py @@ -901,6 +901,3 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict: ) return inputs - def handle_outputs(self, outputs: dict): - super().handle_outputs(outputs) - self.display_output["required_sample_size"] = f"Required Sample Size: {outputs['required_sample_size']}" From 14dac5e1ef398812061f8e293d61fb6b15c50658 Mon Sep 17 00:00:00 2001 From: AK Date: Thu, 25 Jul 2024 11:01:38 +0200 Subject: [PATCH 24/52] Made some minor changes to the Plot positioning --- protzilla/data_analysis/time_series_regression_analysis.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/protzilla/data_analysis/time_series_regression_analysis.py b/protzilla/data_analysis/time_series_regression_analysis.py index a69eced9..d61c5815 100644 --- a/protzilla/data_analysis/time_series_regression_analysis.py +++ b/protzilla/data_analysis/time_series_regression_analysis.py @@ -70,7 +70,6 @@ def time_series_linear_regression( fig = make_subplots(rows=1, cols=2, column_widths=[0.75, 0.25], vertical_spacing=0.025) - # Add main plot traces fig.add_trace(go.Scatter( x=plot_df['Time'], y=plot_df['Intensity'], @@ -104,7 +103,6 @@ def time_series_linear_regression( showlegend=False ), row=1, col=2) - # Update layout fig.update_layout( title=f"Intensity over Time for {protein_group}", plot_bgcolor=colors["plot_bgcolor"], @@ -121,7 +119,7 @@ def time_series_linear_regression( yanchor="top", y=0.95, xanchor="right", - x=0.85 + x=0.825 ) ) @@ -129,7 +127,6 @@ def time_series_linear_regression( fig.update_xaxes(showticklabels=False, showgrid=False, zeroline=False, row=1, col=2) fig.update_yaxes(showticklabels=False, showgrid=False, zeroline=False, row=1, col=2) - # Adjust subplot titles fig.update_annotations(font_size=12) return dict( @@ -238,7 +235,6 @@ def time_series_ransac_regression( showlegend=False ), row=1, col=2) - # Update layout fig.update_layout( title=f"Intensity over Time for {protein_group}", plot_bgcolor=colors["plot_bgcolor"], @@ -263,7 +259,6 @@ def time_series_ransac_regression( fig.update_xaxes(showticklabels=False, showgrid=False, zeroline=False, row=1, col=2) fig.update_yaxes(showticklabels=False, showgrid=False, zeroline=False, row=1, col=2) - # Adjust subplot titles fig.update_annotations(font_size=12) return dict( From e9c9acf0b6885bceb57ee20b2a1a8726061bf867 Mon Sep 17 00:00:00 2001 From: AK Date: Thu, 25 Jul 2024 11:23:08 +0200 Subject: [PATCH 25/52] Created a thesis Workflow and added some tests for RANSAC --- .../test_time_series_analysis.py | 25 ++++++ user_data/workflows/workflow_BA_Kuganash.yaml | 89 +++++++++++++++++++ 2 files changed, 114 insertions(+) create mode 100644 user_data/workflows/workflow_BA_Kuganash.yaml diff --git a/tests/protzilla/data_analysis/test_time_series_analysis.py b/tests/protzilla/data_analysis/test_time_series_analysis.py index ff01ba50..74c5d5c1 100644 --- a/tests/protzilla/data_analysis/test_time_series_analysis.py +++ b/tests/protzilla/data_analysis/test_time_series_analysis.py @@ -70,6 +70,31 @@ def test_linear_regression_plot_invalid_test_size(time_series_test_data): return def test_linear_regression_outputs(time_series_test_data): + test_intensity, test_metadata = time_series_test_data + outputs = time_series_linear_regression(test_intensity, test_metadata, "Protein1", 0.2) + assert "train_root_mean_squared" in outputs + assert "test_root_mean_squared" in outputs + assert "train_r2_score" in outputs + assert "test_r2_score" in outputs + return + + +def test_ransac_regression_plot(show_figures, time_series_test_data): + test_intensity, test_metadata = time_series_test_data + outputs = time_series_linear_regression(test_intensity, test_metadata, "Protein1", 0.2) + assert "plots" in outputs + fig = outputs["plots"][0] + if show_figures: + fig.show() + return + +def test_linear_ransac_plot_invalid_test_size(time_series_test_data): + test_intensity, test_metadata = time_series_test_data + with pytest.raises(ValueError): + time_series_linear_regression(test_intensity, test_metadata, "Protein1", 2) + return + +def test_ransac_regression_outputs(time_series_test_data): test_intensity, test_metadata = time_series_test_data outputs = time_series_linear_regression(test_intensity, test_metadata, "Protein1", 0.2) assert "train_root_mean_squared" in outputs diff --git a/user_data/workflows/workflow_BA_Kuganash.yaml b/user_data/workflows/workflow_BA_Kuganash.yaml new file mode 100644 index 00000000..1a19947c --- /dev/null +++ b/user_data/workflows/workflow_BA_Kuganash.yaml @@ -0,0 +1,89 @@ +df_mode: disk_memory +steps: +- form_inputs: + aggregation_method: Median + intensity_name: Intensity + map_to_uniprot: false + inputs: {} + instance_identifier: MaxQuantImport_1 + type: MaxQuantImport +- form_inputs: + intensity_name: Intensity + map_to_uniprot: false + inputs: {} + instance_identifier: EvidenceImport_1 + type: EvidenceImport +- form_inputs: + feature_orientation: Columns (samples in rows, features in columns) + inputs: {} + instance_identifier: MetadataImport_1 + type: MetadataImport +- form_inputs: + percentage: 0.5 + inputs: {} + instance_identifier: FilterProteinsBySamplesMissing_1 + plot_inputs: + graph_type: Bar chart + type: FilterProteinsBySamplesMissing +- form_inputs: + deviation_threshold: 2.0 + inputs: {} + instance_identifier: FilterSamplesByProteinIntensitiesSum_1 + plot_inputs: + graph_type: Bar chart + type: FilterSamplesByProteinIntensitiesSum +- form_inputs: + number_of_neighbours: 5 + inputs: {} + instance_identifier: ImputationByKNN_1 + plot_inputs: + graph_type: Boxplot + graph_type_quantities: Bar chart + group_by: None + visual_transformation: log10 + type: ImputationByKNN +- form_inputs: + number_of_neighbors: 20 + inputs: {} + instance_identifier: OutlierDetectionByLocalOutlierFactor_1 + plot_inputs: {} + type: OutlierDetectionByLocalOutlierFactor +- form_inputs: + percentile: 0.5 + inputs: {} + instance_identifier: NormalisationByMedian_1 + plot_inputs: + graph_type: Boxplot + group_by: None + visual_transformation: log10 + type: NormalisationByMedian +- form_inputs: + log_base: log2 + inputs: {} + instance_identifier: TransformationLog_1 + plot_inputs: + graph_type: Histogram + group_by: None + type: TransformationLog +- form_inputs: + input_df: TransformationLog_1 + protein_group: D3YYU8 + similarity: 1 + similarity_measure: euclidean distance + inputs: {} + instance_identifier: PlotTimeSeries_1 + type: PlotTimeSeriesPeptide +- form_inputs: + input_df: TransformationLog_1 + protein_group: D3YYU8 + test_size: 0.2 + inputs: {} + instance_identifier: TimeSeriesLinearRegression_1 + type: TimeSeriesLinearRegression +- form_inputs: + input_df: TransformationLog_1 + protein_group: D3YYU8 + test_size: 0.2 + inputs: {} + instance_identifier: TimeSeriesRANSACRegression_1 + type: TimeSeriesRANSACRegression From 37180af62f7ea667abb3b07d9144ea297dbabf25 Mon Sep 17 00:00:00 2001 From: AK Date: Thu, 25 Jul 2024 14:12:51 +0200 Subject: [PATCH 26/52] Implemented Augmented Dickey-Fuller test to check if a time series data stationary or not --- .../time_series_regression_analysis.py | 103 ++++++++++++++++++ protzilla/methods/data_analysis.py | 32 +++++- .../test_time_series_analysis.py | 20 +++- ui/runs/form_mapping.py | 1 + ui/runs/forms/data_analysis.py | 34 ++++++ 5 files changed, 184 insertions(+), 6 deletions(-) diff --git a/protzilla/data_analysis/time_series_regression_analysis.py b/protzilla/data_analysis/time_series_regression_analysis.py index d61c5815..1af2a6a1 100644 --- a/protzilla/data_analysis/time_series_regression_analysis.py +++ b/protzilla/data_analysis/time_series_regression_analysis.py @@ -1,3 +1,5 @@ +import logging + import numpy as np import pandas as pd import plotly.graph_objects as go @@ -8,6 +10,8 @@ from sklearn.linear_model import LinearRegression, RANSACRegressor from sklearn.model_selection import train_test_split from sklearn.metrics import mean_squared_error, r2_score +from statsmodels.tsa.arima.model import ARIMA +from statsmodels.tsa.stattools import adfuller from plotly.subplots import make_subplots colors = { @@ -269,3 +273,102 @@ def time_series_ransac_regression( plots=[fig], ) + +def adfuller_test( + input_df: pd.DataFrame, + metadata_df: pd.DataFrame, + protein_group: str, + alpha: float = 0.05, +) -> dict: + """ + Perform the Augmented Dickey-Fuller test to check for stationarity in a time series. + :param input_df: The dataframe containing the time series data. + :param metadata_df: The dataframe containing the metadata. + :param protein_group: The protein group to perform the test on. + :param alpha: The significance level for the test (default is 0.05). + + :return: A dictionary containing: + - test_statistic: The test statistic from the ADF test. + - p_value: The p-value from the ADF test. + - critical_values: The critical values for different significance levels. + - is_stationary: A boolean indicating if the series is stationary. + - messages: A list of messages for the user. + """ + + messages = [] + input_df = input_df[input_df['Protein ID'] == protein_group] + + input_df = pd.merge( + left=input_df, + right=metadata_df, + on="Sample", + copy=False, + ) + + input_df = input_df["Intensity"].dropna() + + # Perform the ADF test + result = adfuller(input_df) + test_statistic = result[0] + p_value = result[1] + critical_values = result[4] + + # Determine if the series is stationary + is_stationary = p_value < alpha + + # Create a message for the user + if is_stationary: + messages.append( + { + "level": logging.INFO, + "msg": f"The time series is stationary (p-value: {p_value:.5f}).", + } + ) + else: + messages.append( + { + "level": logging.WARNING, + "msg": f"The time series is not stationary (p-value: {p_value:.5f}).", + } + ) + """ + fig = go.Figure() + + annotation_text = ( + f"Test Statistic: {test_statistic:.3f}
" + f"P-Value: {p_value:.3f}
" + f"Critical Values:
" + f"Is Stationary: {is_stationary}" + ) + + fig.add_trace( + go.Scatter( + x=[0], + y=[0.25], + text=[annotation_text], + mode='text', + textfont=dict(size=12), + showlegend=False + ) + ) + + fig.update_layout( + title=f"Augmented Dickey-Fuller Test for {protein_group}", + autosize=True, + margin=dict(l=100, r=100, t=100, b=50), + ) + + # Hide x-axis of the annotation subplot + fig.update_xaxes(showticklabels=False, showgrid=False, zeroline=False) + fig.update_yaxes(showticklabels=False, showgrid=False, zeroline=False) + + fig.update_annotations(font_size=12) + """ + return dict( + test_statistic=test_statistic, + p_value=p_value, + critical_values=critical_values, + is_stationary=is_stationary, + messages=messages, + ) + diff --git a/protzilla/methods/data_analysis.py b/protzilla/methods/data_analysis.py index 687987a4..92844a21 100644 --- a/protzilla/methods/data_analysis.py +++ b/protzilla/methods/data_analysis.py @@ -14,7 +14,7 @@ ) from protzilla.data_analysis.differential_expression_t_test import t_test from protzilla.data_analysis.dimension_reduction import t_sne, umap -from protzilla.data_analysis.time_series_regression_analysis import time_series_linear_regression, time_series_ransac_regression +from protzilla.data_analysis.time_series_regression_analysis import time_series_linear_regression, time_series_ransac_regression, adfuller_test from protzilla.data_analysis.ptm_analysis import filter_peptides_of_protein, ptms_per_sample, \ ptms_per_protein_and_sample from protzilla.data_analysis.model_evaluation import evaluate_classification_model @@ -33,7 +33,7 @@ ) from protzilla.data_analysis.ptm_quantification import flexiquant_lf from protzilla.methods.data_preprocessing import TransformationLog -from protzilla.steps import Plots, Step, StepManager, DisplayOutput +from protzilla.steps import Plots, Step, StepManager class DataAnalysisStep(Step): @@ -852,6 +852,34 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict: inputs["metadata_df"] = steps.metadata_df return inputs + +class TimeSeriesADFullerTest(DataAnalysisStep): + display_name = "Augmented Dickey-Fuller Test" + operation = "Time series analysis" + method_description = "Perform Augmented Dickey-Fuller test on the time series data for a given protein group." + + input_keys = [ + "input_df", + "metadata_df", + "protein_group", + "alpha", + ] + output_keys = [ + "test_statistic", + "p_value", + "critical_values", + "is_stationary", + ] + + def method(self, inputs: dict) -> dict: + return adfuller_test(**inputs) + + def insert_dataframes(self, steps: StepManager, inputs) -> dict: + inputs["input_df"] = steps.get_step_output(Step, "peptide_df", inputs["input_df"]) + inputs["metadata_df"] = steps.metadata_df + return inputs + + class PTMsPerSample(DataAnalysisStep): display_name = "PTMs per Sample" operation = "Peptide analysis" diff --git a/tests/protzilla/data_analysis/test_time_series_analysis.py b/tests/protzilla/data_analysis/test_time_series_analysis.py index 74c5d5c1..5c359d5d 100644 --- a/tests/protzilla/data_analysis/test_time_series_analysis.py +++ b/tests/protzilla/data_analysis/test_time_series_analysis.py @@ -1,7 +1,7 @@ import pandas as pd import pytest -from protzilla.data_analysis.time_series_regression_analysis import time_series_linear_regression +from protzilla.data_analysis.time_series_regression_analysis import time_series_linear_regression, time_series_ransac_regression, adfuller_test @pytest.fixture @@ -81,7 +81,7 @@ def test_linear_regression_outputs(time_series_test_data): def test_ransac_regression_plot(show_figures, time_series_test_data): test_intensity, test_metadata = time_series_test_data - outputs = time_series_linear_regression(test_intensity, test_metadata, "Protein1", 0.2) + outputs = time_series_ransac_regression(test_intensity, test_metadata, "Protein1", 0.2) assert "plots" in outputs fig = outputs["plots"][0] if show_figures: @@ -91,14 +91,26 @@ def test_ransac_regression_plot(show_figures, time_series_test_data): def test_linear_ransac_plot_invalid_test_size(time_series_test_data): test_intensity, test_metadata = time_series_test_data with pytest.raises(ValueError): - time_series_linear_regression(test_intensity, test_metadata, "Protein1", 2) + time_series_ransac_regression(test_intensity, test_metadata, "Protein1", 2) return def test_ransac_regression_outputs(time_series_test_data): test_intensity, test_metadata = time_series_test_data - outputs = time_series_linear_regression(test_intensity, test_metadata, "Protein1", 0.2) + outputs = time_series_ransac_regression(test_intensity, test_metadata, "Protein1", 0.2) assert "train_root_mean_squared" in outputs assert "test_root_mean_squared" in outputs assert "train_r2_score" in outputs assert "test_r2_score" in outputs + return + + +def test_adfuller_test(time_series_test_data): + test_intensity, test_metadata = time_series_test_data + outputs = adfuller_test(test_intensity, test_metadata, "Protein1") + + assert "test_statistic" in outputs + assert "p_value" in outputs + assert "critical_values" in outputs + assert "is_stationary" in outputs + assert "messages" in outputs return \ No newline at end of file diff --git a/ui/runs/form_mapping.py b/ui/runs/form_mapping.py index 079e1569..bca07db4 100644 --- a/ui/runs/form_mapping.py +++ b/ui/runs/form_mapping.py @@ -68,6 +68,7 @@ data_analysis.PTMsProteinAndPerSample: data_analysis_forms.PTMsPerProteinAndSampleForm, data_analysis.TimeSeriesLinearRegression: data_analysis_forms.TimeSeriesLinearRegressionForm, data_analysis.TimeSeriesRANSACRegression: data_analysis_forms.TimeSeriesRANSACRegressionForm, + data_analysis.TimeSeriesADFullerTest: data_analysis_forms.TimeSeriesADFullerTestForm, data_preprocessing.ImputationByMinPerSample: data_preprocessing_forms.ImputationByMinPerSampleForms, data_integration.EnrichmentAnalysisGOAnalysisWithString: data_integration_forms.EnrichmentAnalysisGOAnalysisWithStringForm, data_integration.EnrichmentAnalysisGOAnalysisWithEnrichr: data_integration_forms.EnrichmentAnalysisGOAnalysisWithEnrichrForm, diff --git a/ui/runs/forms/data_analysis.py b/ui/runs/forms/data_analysis.py index fd8de70d..a215f8bf 100644 --- a/ui/runs/forms/data_analysis.py +++ b/ui/runs/forms/data_analysis.py @@ -1269,6 +1269,40 @@ class TimeSeriesRANSACRegressionForm(MethodForm): ) + def fill_form(self, run: Run) -> None: + self.fields["input_df"].choices = fill_helper.get_choices_for_peptide_df_steps( + run + ) + input_df_instance_id = self.data.get( + "input_df", self.fields["input_df"].choices[0][0] + ) + + self.fields["protein_group"].choices = fill_helper.to_choices( + run.steps.get_step_output( + step_type=Step, + output_key="peptide_df", + instance_identifier=input_df_instance_id, + )["Protein ID"].unique() + ) + + +class TimeSeriesADFullerTestForm(MethodForm): + is_dynamic = True + input_df = CustomChoiceField( + choices=[], + label="Peptide dataframe", + ) + protein_group = CustomChoiceField( + choices=[], + label="Protein group: which protein group to perform the ADFuller test on", + ) + alpha = CustomFloatField( + label="Significance level", + min_value=0, + max_value=1, + initial=0.05 + ) + def fill_form(self, run: Run) -> None: self.fields["input_df"].choices = fill_helper.get_choices_for_peptide_df_steps( run From 21468fc26c3cddecb3e4643075d73b3ef12c04af Mon Sep 17 00:00:00 2001 From: henning Date: Wed, 10 Jul 2024 16:00:05 +0200 Subject: [PATCH 27/52] Implemented the option to do regression on each group --- protzilla/constants/colors.py | 25 +- .../time_series_regression_analysis.py | 360 +++++++++++------- protzilla/methods/data_analysis.py | 12 +- ui/runs/forms/data_analysis.py | 14 + 4 files changed, 270 insertions(+), 141 deletions(-) diff --git a/protzilla/constants/colors.py b/protzilla/constants/colors.py index eec08b1b..3f33249b 100644 --- a/protzilla/constants/colors.py +++ b/protzilla/constants/colors.py @@ -1,8 +1,23 @@ PROTZILLA_DISCRETE_COLOR_SEQUENCE = [ - "#4A536A", - "#87A8B9", - "#CE5A5A", - "#8E3325", - "#E2A46D", + #Muted Dark Slate + "#252935", + "#4A536A", + '#a4a9b4', +# Muted Indian Red + "#CE5A5A", + "#B04A4A", + "#EBBDBD", +# Muted Light Steel Blue + "#51646f", + "#87A8B9", + "#B7CAD5", + # Muted Sienna + "#804538", + "#8E3325", + "#471912", + #Muted Sandy Brown + "#715236", + "#E2A46D", + "F0D1B6", ] PROTZILLA_DISCRETE_COLOR_OUTLIER_SEQUENCE = ["#4A536A", "#CE5A5A"] diff --git a/protzilla/data_analysis/time_series_regression_analysis.py b/protzilla/data_analysis/time_series_regression_analysis.py index 1af2a6a1..feb9997f 100644 --- a/protzilla/data_analysis/time_series_regression_analysis.py +++ b/protzilla/data_analysis/time_series_regression_analysis.py @@ -27,7 +27,8 @@ def time_series_linear_regression( input_df: pd.DataFrame, metadata_df: pd.DataFrame, protein_group: str, - test_size: float, + grouping: str = None, + test_size: float = 0.2, ): """ Perform linear regression on the time series data for a given protein group. @@ -35,11 +36,12 @@ def time_series_linear_regression( :param metadata_df: Metadata dataframe which contains the timestamps :param protein_group: Protein group to perform the analysis on :param test_size: The proportion of the dataset to include in the test split + :param grouping: Option to select whether regression should be performed on the entire dataset or separately on the control and experimental groups :return: A dictionary containing the root mean squared error and r2 score for the training and test sets """ - - if test_size < 0 or test_size > 1 : + color_index = 0 + if test_size < 0 or test_size > 1: raise ValueError("Test size should be between 0 and 1") input_df = input_df[input_df['Protein ID'] == protein_group] @@ -56,47 +58,109 @@ def time_series_linear_regression( X = input_df[["Time"]] y = input_df["Intensity"] - X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, shuffle=False) - model = LinearRegression() - model.fit(X_train, y_train) - - y_pred_train = model.predict(X_train) - y_pred_test = model.predict(X_test) - - train_rmse = np.sqrt(mean_squared_error(y_train, y_pred_train)) - test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test)) - train_r2 = r2_score(y_train, y_pred_train) - test_r2 = r2_score(y_test, y_pred_test) - - train_df = pd.DataFrame({'Time': X_train['Time'], 'Intensity': y_train, 'Predicted': y_pred_train, 'Type': 'Train'}) - test_df = pd.DataFrame({'Time': X_test['Time'], 'Intensity': y_test, 'Predicted': y_pred_test, 'Type': 'Test'}) - plot_df = pd.concat([train_df, test_df]) - fig = make_subplots(rows=1, cols=2, column_widths=[0.75, 0.25], vertical_spacing=0.025) - fig.add_trace(go.Scatter( - x=plot_df['Time'], - y=plot_df['Intensity'], - mode='markers', - name='Actual Intensity', - marker=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[0]) - ), row=1, col=1) + scores = [] + + if grouping == "With Grouping" and "Group" in input_df.columns: + groups = input_df["Group"].unique() + for group in groups: + group_df = input_df[input_df["Group"] == group] + X_group = group_df[["Time"]] + y_group = group_df["Intensity"] + + X_train, X_test, y_train, y_test = train_test_split(X_group, y_group, test_size=test_size, shuffle=False) + model = LinearRegression() + model.fit(X_train, y_train) + + y_pred_train = model.predict(X_train) + y_pred_test = model.predict(X_test) + + train_rmse = np.sqrt(mean_squared_error(y_train, y_pred_train)) + test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test)) + train_r2 = r2_score(y_train, y_pred_train) + test_r2 = r2_score(y_test, y_pred_test) + + train_df = pd.DataFrame({'Time': X_train['Time'], 'Intensity': y_train, 'Predicted': y_pred_train, 'Type': 'Train'}) + test_df = pd.DataFrame({'Time': X_test['Time'], 'Intensity': y_test, 'Predicted': y_pred_test, 'Type': 'Test'}) + plot_df = pd.concat([train_df, test_df]) + + color = PROTZILLA_DISCRETE_COLOR_SEQUENCE[color_index % len(PROTZILLA_DISCRETE_COLOR_SEQUENCE)] + color_index += 3 + + fig.add_trace(go.Scatter( + x=plot_df['Time'], + y=plot_df['Intensity'], + mode='markers', + name=f'Actual Intensity ({group})', + marker=dict(color=color) + ), row=1, col=1) + + fig.add_trace(go.Scatter( + x=plot_df['Time'], + y=plot_df['Predicted'], + mode='lines', + name=f'Predicted Intensity ({group})', + line=dict(color=color) + ), row=1, col=1) + + scores.append({ + 'group': group, + 'train_root_mean_squared': train_rmse, + 'test_root_mean_squared': test_rmse, + 'train_r2_score': train_r2, + 'test_r2_score': test_r2, + }) - fig.add_trace(go.Scatter( - x=plot_df['Time'], - y=plot_df['Predicted'], - mode='lines', - name='Predicted Intensity', - line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[2]) - ), row=1, col=1) + else: + X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, shuffle=False) + model = LinearRegression() + model.fit(X_train, y_train) + + y_pred_train = model.predict(X_train) + y_pred_test = model.predict(X_test) + + train_rmse = np.sqrt(mean_squared_error(y_train, y_pred_train)) + test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test)) + train_r2 = r2_score(y_train, y_pred_train) + test_r2 = r2_score(y_test, y_pred_test) + + train_df = pd.DataFrame({'Time': X_train['Time'], 'Intensity': y_train, 'Predicted': y_pred_train, 'Type': 'Train'}) + test_df = pd.DataFrame({'Time': X_test['Time'], 'Intensity': y_test, 'Predicted': y_pred_test, 'Type': 'Test'}) + plot_df = pd.concat([train_df, test_df]) + + fig.add_trace(go.Scatter( + x=plot_df['Time'], + y=plot_df['Intensity'], + mode='markers', + name='Actual Intensity', + marker=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[0]) + ), row=1, col=1) + + fig.add_trace(go.Scatter( + x=plot_df['Time'], + y=plot_df['Predicted'], + mode='lines', + name='Predicted Intensity', + line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[2]) + ), row=1, col=1) + + scores.append({ + 'group': 'Overall', + 'train_root_mean_squared': train_rmse, + 'test_root_mean_squared': test_rmse, + 'train_r2_score': train_r2, + 'test_r2_score': test_r2, + }) # Add annotation text as a separate trace in the subplot - annotation_text = ( - f"Train RMSE: {train_rmse:.3f}
" - f"Test RMSE: {test_rmse:.3f}
" - f"Train R²: {train_r2:.3f}
" - f"Test R²: {test_r2:.3f}" - ) + annotation_text = "
".join([ + f"Group: {res['group']}
Train RMSE: {res['train_root_mean_squared']:.3f}
" + f"Test RMSE: {res['test_root_mean_squared']:.3f}
" + f"Train R²: {res['train_r2_score']:.3f}
" + f"Test R²: {res['test_r2_score']:.3f}" + for res in scores + ]) fig.add_trace(go.Scatter( x=[0], @@ -134,10 +198,7 @@ def time_series_linear_regression( fig.update_annotations(font_size=12) return dict( - train_root_mean_squared=train_rmse, - test_root_mean_squared=test_rmse, - train_r2_score=train_r2, - test_r2_score=test_r2, + scores=scores, plots=[fig], ) @@ -146,6 +207,7 @@ def time_series_ransac_regression( input_df: pd.DataFrame, metadata_df: pd.DataFrame, protein_group: str, + grouping: str, test_size: float, ): """ @@ -158,6 +220,7 @@ def time_series_ransac_regression( :return: A dictionary containing the root mean squared error and r2 score for the training and test sets """ + color_index = 0 if test_size < 0 or test_size > 1: raise ValueError("Test size should be between 0 and 1") @@ -175,60 +238,134 @@ def time_series_ransac_regression( X = input_df[["Time"]] y = input_df["Intensity"] - X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, shuffle=False) - model = RANSACRegressor(base_estimator=LinearRegression()) - model.fit(X_train, y_train) - - inlier_mask = model.inlier_mask_ - - y_pred_train = model.predict(X_train) - y_pred_test = model.predict(X_test) - - train_rmse = np.sqrt(mean_squared_error(y_train[inlier_mask], y_pred_train[inlier_mask])) - test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test)) - train_r2 = r2_score(y_train[inlier_mask], y_pred_train[inlier_mask]) - test_r2 = r2_score(y_test, y_pred_test) - - train_df = pd.DataFrame({'Time': X_train["Time"], 'Intensity': y_train, 'Predicted': y_pred_train, 'Type': 'Train'}) - test_df = pd.DataFrame({'Time': X_test["Time"], 'Intensity': y_test, 'Predicted': y_pred_test, 'Type': 'Test'}) - train_df['Inlier'] = inlier_mask - test_df['Inlier'] = False - plot_df = pd.concat([train_df, test_df]) - fig = make_subplots(rows=1, cols=2, column_widths=[0.75, 0.25], vertical_spacing=0.025) - # Add main plot traces - fig.add_trace(go.Scatter( - x=plot_df['Time'], - y=plot_df['Intensity'], - mode='markers', - name='Actual Intensity', - marker=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[0]) - ), row=1, col=1) - - fig.add_trace(go.Scatter( - x=plot_df['Time'], - y=plot_df['Predicted'], - mode='lines', - name='Predicted Intensity', - line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[2]) - ), row=1, col=1) + scores = [] + + if grouping == "With Grouping" and "Group" in input_df.columns: + groups = input_df["Group"].unique() + for group in groups: + group_df = input_df[input_df["Group"] == group] + X_group = group_df[["Time"]] + y_group = group_df["Intensity"] + + X_train, X_test, y_train, y_test = train_test_split(X_group, y_group, test_size=test_size, shuffle=False) + model = RANSACRegressor(base_estimator=LinearRegression()) + model.fit(X_train, y_train) + + inlier_mask = model.inlier_mask_ + + y_pred_train = model.predict(X_train) + y_pred_test = model.predict(X_test) + + train_rmse = np.sqrt(mean_squared_error(y_train[inlier_mask], y_pred_train[inlier_mask])) + test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test)) + train_r2 = r2_score(y_train[inlier_mask], y_pred_train[inlier_mask]) + test_r2 = r2_score(y_test, y_pred_test) + + train_df = pd.DataFrame({'Time': X_train["Time"], 'Intensity': y_train, 'Predicted': y_pred_train, 'Type': 'Train'}) + test_df = pd.DataFrame({'Time': X_test["Time"], 'Intensity': y_test, 'Predicted': y_pred_test, 'Type': 'Test'}) + train_df['Inlier'] = inlier_mask + test_df['Inlier'] = False + plot_df = pd.concat([train_df, test_df]) + + # Add main plot traces + fig.add_trace(go.Scatter( + x=plot_df['Time'], + y=plot_df['Intensity'], + mode='markers', + name='Actual Intensity', + marker=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[color_index]) + ), row=1, col=1) + + fig.add_trace(go.Scatter( + x=plot_df['Time'], + y=plot_df['Predicted'], + mode='lines', + name='Predicted Intensity', + line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[color_index + 1]) + ), row=1, col=1) + + fig.add_trace(go.Scatter( + x=plot_df[plot_df['Inlier'] == False]['Time'], + y=plot_df[plot_df['Inlier'] == False]['Intensity'], + mode='markers', + name='Outliers', + marker=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[color_index + 2]) + ), row=1, col=1) + + color_index += 3 + + scores.append({ + 'group': group, + 'train_root_mean_squared': train_rmse, + 'test_root_mean_squared': test_rmse, + 'train_r2_score': train_r2, + 'test_r2_score': test_r2, + }) - fig.add_trace(go.Scatter( - x=plot_df[plot_df['Inlier'] == False]['Time'], - y=plot_df[plot_df['Inlier'] == False]['Intensity'], - mode='markers', - name='Outliers', - marker=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[1]) - ), row=1, col=1) + else: + X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, shuffle=False) + model = RANSACRegressor(base_estimator=LinearRegression()) + model.fit(X_train, y_train) + + inlier_mask = model.inlier_mask_ + + y_pred_train = model.predict(X_train) + y_pred_test = model.predict(X_test) + + train_rmse = np.sqrt(mean_squared_error(y_train[inlier_mask], y_pred_train[inlier_mask])) + test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test)) + train_r2 = r2_score(y_train[inlier_mask], y_pred_train[inlier_mask]) + test_r2 = r2_score(y_test, y_pred_test) + + train_df = pd.DataFrame({'Time': X_train["Time"], 'Intensity': y_train, 'Predicted': y_pred_train, 'Type': 'Train'}) + test_df = pd.DataFrame({'Time': X_test["Time"], 'Intensity': y_test, 'Predicted': y_pred_test, 'Type': 'Test'}) + train_df['Inlier'] = inlier_mask + test_df['Inlier'] = False + plot_df = pd.concat([train_df, test_df]) + + # Add main plot traces + fig.add_trace(go.Scatter( + x=plot_df['Time'], + y=plot_df['Intensity'], + mode='markers', + name='Actual Intensity', + marker=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[0]) + ), row=1, col=1) + + fig.add_trace(go.Scatter( + x=plot_df['Time'], + y=plot_df['Predicted'], + mode='lines', + name='Predicted Intensity', + line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[0]) + ), row=1, col=1) + + fig.add_trace(go.Scatter( + x=plot_df[plot_df['Inlier'] == False]['Time'], + y=plot_df[plot_df['Inlier'] == False]['Intensity'], + mode='markers', + name='Outliers', + marker=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[2]) + ), row=1, col=1) + + scores.append({ + 'group': 'Overall', + 'train_root_mean_squared': train_rmse, + 'test_root_mean_squared': test_rmse, + 'train_r2_score': train_r2, + 'test_r2_score': test_r2, + }) # Add annotation text as a separate trace in the subplot - annotation_text = ( - f"Train RMSE: {train_rmse:.3f}
" - f"Test RMSE: {test_rmse:.3f}
" - f"Train R²: {train_r2:.3f}
" - f"Test R²: {test_r2:.3f}" - ) + annotation_text = "
".join([ + f"Group: {res['group']}
Train RMSE: {res['train_root_mean_squared']:.3f}
" + f"Test RMSE: {res['test_root_mean_squared']:.3f}
" + f"Train R²: {res['train_r2_score']:.3f}
" + f"Test R²: {res['test_r2_score']:.3f}" + for res in scores + ]) fig.add_trace(go.Scatter( x=[0], @@ -266,10 +403,7 @@ def time_series_ransac_regression( fig.update_annotations(font_size=12) return dict( - train_root_mean_squared=train_rmse, - test_root_mean_squared=test_rmse, - train_r2_score=train_r2, - test_r2_score=test_r2, + scores=scores, plots=[fig], ) @@ -295,6 +429,8 @@ def adfuller_test( - messages: A list of messages for the user. """ + # TODO: Info box for the user + messages = [] input_df = input_df[input_df['Protein ID'] == protein_group] @@ -331,39 +467,7 @@ def adfuller_test( "msg": f"The time series is not stationary (p-value: {p_value:.5f}).", } ) - """ - fig = go.Figure() - - annotation_text = ( - f"Test Statistic: {test_statistic:.3f}
" - f"P-Value: {p_value:.3f}
" - f"Critical Values:
" - f"Is Stationary: {is_stationary}" - ) - - fig.add_trace( - go.Scatter( - x=[0], - y=[0.25], - text=[annotation_text], - mode='text', - textfont=dict(size=12), - showlegend=False - ) - ) - fig.update_layout( - title=f"Augmented Dickey-Fuller Test for {protein_group}", - autosize=True, - margin=dict(l=100, r=100, t=100, b=50), - ) - - # Hide x-axis of the annotation subplot - fig.update_xaxes(showticklabels=False, showgrid=False, zeroline=False) - fig.update_yaxes(showticklabels=False, showgrid=False, zeroline=False) - - fig.update_annotations(font_size=12) - """ return dict( test_statistic=test_statistic, p_value=p_value, diff --git a/protzilla/methods/data_analysis.py b/protzilla/methods/data_analysis.py index 92844a21..1c3f7093 100644 --- a/protzilla/methods/data_analysis.py +++ b/protzilla/methods/data_analysis.py @@ -809,13 +809,11 @@ class TimeSeriesLinearRegression(PlotStep): "input_df", "metadata_df", "protein_group", + "grouping", "test_size", ] output_keys = [ - "train_root_mean_squared", - "test_root_mean_squared", - "train_r2_score", - "test_r2_score", + "scores", ] def method(self, inputs: dict) -> dict: @@ -836,13 +834,11 @@ class TimeSeriesRANSACRegression(PlotStep): "input_df", "metadata_df", "protein_group", + "grouping", "test_size", ] output_keys = [ - "train_root_mean_squared", - "test_root_mean_squared", - "train_r2_score", - "test_r2_score", + "scores", ] def method(self, inputs: dict) -> dict: return time_series_ransac_regression(**inputs) diff --git a/ui/runs/forms/data_analysis.py b/ui/runs/forms/data_analysis.py index a215f8bf..f3d2e849 100644 --- a/ui/runs/forms/data_analysis.py +++ b/ui/runs/forms/data_analysis.py @@ -145,6 +145,10 @@ class DimensionReductionMetric(Enum): cosine = "cosine" havensine = "havensine" +class TimeSeriesGrouping(Enum): + with_grouping = "With Grouping" + without_grouping = "Without Grouping" + class DifferentialExpressionANOVAForm(MethodForm): is_dynamic = True @@ -1224,6 +1228,11 @@ class TimeSeriesLinearRegressionForm(MethodForm): choices=[], label="Protein group: which protein group to perform the linear regression on", ) + grouping = CustomChoiceField( + choices= TimeSeriesGrouping, + label="Option to select whether regression should be performed on the entire dataset or separately on the control and experimental groups", + initial=TimeSeriesGrouping.with_grouping + ) test_size = CustomFloatField( label="Test size: proportion of the dataset to include in the test split", min_value=0, @@ -1260,6 +1269,11 @@ class TimeSeriesRANSACRegressionForm(MethodForm): choices=[], label="Protein group: which protein group to perform the RANSAC regression on", ) + grouping = CustomChoiceField( + choices= TimeSeriesGrouping, + label="Option to select whether regression should be performed on the entire dataset or separately on the control and experimental groups", + initial=TimeSeriesGrouping.with_grouping + ) test_size = CustomFloatField( label="Test size: proportion of the dataset to include in the test split", min_value=0, From 0153a1a61e9b96c4b8441980bc86c3fa43f696ed Mon Sep 17 00:00:00 2001 From: AK Date: Wed, 31 Jul 2024 12:07:55 +0200 Subject: [PATCH 28/52] Cherry picked Text Field from Henning's BA --- ui/runs/forms/custom_fields.py | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/ui/runs/forms/custom_fields.py b/ui/runs/forms/custom_fields.py index 7171f173..7370b64b 100644 --- a/ui/runs/forms/custom_fields.py +++ b/ui/runs/forms/custom_fields.py @@ -1,6 +1,8 @@ +import json import logging from enum import Enum +import django.forms as forms from django.forms import ( BooleanField, CharField, @@ -126,3 +128,32 @@ class CustomFloatField(FloatField): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.widget.attrs.update({"class": "form-control mb-2"}) + + +from django import forms +from django.utils.safestring import mark_safe + + +class TextDisplayWidget(forms.Widget): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.attrs.update() + + def render(self, name, value, attrs=None, renderer=None): + display_text = self.attrs.get("data-display-text", "") + return mark_safe(f"
{display_text}
") + + +class TextDisplayField(forms.Field): + widget = TextDisplayWidget + + def __init__(self, *args, **kwargs): + self.text = kwargs.pop("text", "") + kwargs["required"] = False + super().__init__(*args, **kwargs) + self.update_text() + + def update_text(self, text=None): + if text is not None: + self.text = text + self.widget.attrs["data-display-text"] = self.text From 31bd7af965043b05e7fbf47243a8a792b6b2a5ee Mon Sep 17 00:00:00 2001 From: AK Date: Wed, 31 Jul 2024 12:46:13 +0200 Subject: [PATCH 29/52] Added info box for ADFuller Test --- ui/runs/forms/data_analysis.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/ui/runs/forms/data_analysis.py b/ui/runs/forms/data_analysis.py index f3d2e849..77b7384a 100644 --- a/ui/runs/forms/data_analysis.py +++ b/ui/runs/forms/data_analysis.py @@ -21,6 +21,7 @@ CustomFloatField, CustomMultipleChoiceField, CustomNumberField, + TextDisplayField ) @@ -1302,6 +1303,16 @@ def fill_form(self, run: Run) -> None: class TimeSeriesADFullerTestForm(MethodForm): is_dynamic = True + test_info = TextDisplayField( + label="Information about the Augmented Dickey-Fuller test", + text=( + "The Augmented Dickey-Fuller test is a type of statistical test called a unit root test. The test " + "determines how strongly a time series is defined by a trend. The null hypothesis of the test is that the " + "time series can be represented by a unit root, which implies that the time series is not stationary. " + "The alternative hypothesis is that the time series is stationary. If the p-value is less than the " + "significance level, the null hypothesis can be rejected and the time series is considered stationary." + ), + ) input_df = CustomChoiceField( choices=[], label="Peptide dataframe", From fb476e4a45b34a11ae827bac50a6522865e976b1 Mon Sep 17 00:00:00 2001 From: AK Date: Thu, 1 Aug 2024 10:56:43 +0200 Subject: [PATCH 30/52] Fixed Tests --- protzilla/data_analysis/time_series_helper.py | 10 +-- .../test_time_series_analysis.py | 65 ++++++++++++------- 2 files changed, 47 insertions(+), 28 deletions(-) diff --git a/protzilla/data_analysis/time_series_helper.py b/protzilla/data_analysis/time_series_helper.py index 077e7e06..0fb294ed 100644 --- a/protzilla/data_analysis/time_series_helper.py +++ b/protzilla/data_analysis/time_series_helper.py @@ -1,12 +1,12 @@ from datetime import datetime -def convert_time_to_datetime(time_str): +def convert_time_to_hours(time_str): """ - Convert a string time to a datetime object - :param time_str: The time string to convert + Convert a string time to the number of hours since midnight. + :param time_str: The time string to convert in format '%H:%M:%S' - :return: A datetime object + :return: Number of hours since midnight as a float """ time_obj = datetime.strptime(time_str, '%H:%M:%S') - hours_since_midnight = time_obj.hour + hours_since_midnight = time_obj.hour + time_obj.minute / 60 + time_obj.second / 3600 return hours_since_midnight \ No newline at end of file diff --git a/tests/protzilla/data_analysis/test_time_series_analysis.py b/tests/protzilla/data_analysis/test_time_series_analysis.py index 5c359d5d..4520d3f2 100644 --- a/tests/protzilla/data_analysis/test_time_series_analysis.py +++ b/tests/protzilla/data_analysis/test_time_series_analysis.py @@ -1,7 +1,11 @@ import pandas as pd import pytest -from protzilla.data_analysis.time_series_regression_analysis import time_series_linear_regression, time_series_ransac_regression, adfuller_test +from protzilla.data_analysis.time_series_regression_analysis import ( + time_series_linear_regression, + time_series_ransac_regression, + adfuller_test, +) @pytest.fixture @@ -43,20 +47,32 @@ def time_series_test_data(): ) test_metadata_df = ( - ["Sample1", "02:00:00", 1], - ["Sample2", "06:00:00", 1], - ["Sample3", "10:00:00", 1], - ["Sample4", "14:00:00", 1], + ["Sample1", "02:00:00", "1"], + ["Sample2", "06:00:00", "1"], + ["Sample3", "10:00:00", "1"], + ["Sample4", "14:00:00", "1"], + ["Sample5", "2:00:00", "2"], + ["Sample6", "4:00:00", "2"], + ["Sample7", "6:00:00", "2"], ) test_metadata_df = pd.DataFrame( data=test_metadata_df, - columns=["Sample", "Time", "Day"], + columns=["Sample", "Time", "Group"], ) return test_intensity_df, test_metadata_df -def test_linear_regression_plot(show_figures, time_series_test_data): +def test_linear_regression_plot_with_grouping(show_figures, time_series_test_data): test_intensity, test_metadata = time_series_test_data - outputs = time_series_linear_regression(test_intensity, test_metadata, "Protein1", 0.2) + outputs = time_series_linear_regression(test_intensity, test_metadata, "Protein1", 0.2,"With Grouping") + assert "plots" in outputs + fig = outputs["plots"][0] + if show_figures: + fig.show() + return + +def test_linear_regression_plot_without_grouping(show_figures, time_series_test_data): + test_intensity, test_metadata = time_series_test_data + outputs = time_series_linear_regression(test_intensity, test_metadata, "Protein1", 0.2,"Without Grouping") assert "plots" in outputs fig = outputs["plots"][0] if show_figures: @@ -66,41 +82,44 @@ def test_linear_regression_plot(show_figures, time_series_test_data): def test_linear_regression_plot_invalid_test_size(time_series_test_data): test_intensity, test_metadata = time_series_test_data with pytest.raises(ValueError): - time_series_linear_regression(test_intensity, test_metadata, "Protein1", 2) + time_series_linear_regression(test_intensity, test_metadata, "Protein1", 2, "Without Grouping") return def test_linear_regression_outputs(time_series_test_data): test_intensity, test_metadata = time_series_test_data - outputs = time_series_linear_regression(test_intensity, test_metadata, "Protein1", 0.2) - assert "train_root_mean_squared" in outputs - assert "test_root_mean_squared" in outputs - assert "train_r2_score" in outputs - assert "test_r2_score" in outputs + outputs = time_series_linear_regression(test_intensity, test_metadata, "Protein1", 0.2, "Without Grouping") + assert "scores" in outputs return -def test_ransac_regression_plot(show_figures, time_series_test_data): +def test_ransac_regression_plot_with_grouping(show_figures, time_series_test_data): + test_intensity, test_metadata = time_series_test_data + outputs = time_series_ransac_regression(test_intensity, test_metadata, "Protein1", 0.2, "With Grouping") + assert "plots" in outputs + fig = outputs["plots"][0] + if show_figures: + fig.show() + return + +def test_ransac_regression_plot_without_grouping(show_figures, time_series_test_data): test_intensity, test_metadata = time_series_test_data - outputs = time_series_ransac_regression(test_intensity, test_metadata, "Protein1", 0.2) + outputs = time_series_ransac_regression(test_intensity, test_metadata, "Protein1", 0.2, "Without Grouping") assert "plots" in outputs fig = outputs["plots"][0] if show_figures: fig.show() return -def test_linear_ransac_plot_invalid_test_size(time_series_test_data): +def test_ransac_plot_invalid_test_size(time_series_test_data): test_intensity, test_metadata = time_series_test_data with pytest.raises(ValueError): - time_series_ransac_regression(test_intensity, test_metadata, "Protein1", 2) + time_series_ransac_regression(test_intensity, test_metadata, "Protein1", 2, "Without Grouping") return def test_ransac_regression_outputs(time_series_test_data): test_intensity, test_metadata = time_series_test_data - outputs = time_series_ransac_regression(test_intensity, test_metadata, "Protein1", 0.2) - assert "train_root_mean_squared" in outputs - assert "test_root_mean_squared" in outputs - assert "train_r2_score" in outputs - assert "test_r2_score" in outputs + outputs = time_series_ransac_regression(test_intensity, test_metadata, "Protein1", 0.2, "Without Grouping") + assert "scores" in outputs return From f7da4aaffd00d951113956378dcd84803f4e49fe Mon Sep 17 00:00:00 2001 From: AK Date: Thu, 1 Aug 2024 11:00:04 +0200 Subject: [PATCH 31/52] implemented Auto ARIMA --- .../time_series_regression_analysis.py | 198 ++++++++++++++++-- protzilla/methods/data_analysis.py | 41 +++- requirements.txt | 1 + ui/runs/form_mapping.py | 1 + ui/runs/forms/data_analysis.py | 95 +++++++-- 5 files changed, 306 insertions(+), 30 deletions(-) diff --git a/protzilla/data_analysis/time_series_regression_analysis.py b/protzilla/data_analysis/time_series_regression_analysis.py index feb9997f..0e0a43ec 100644 --- a/protzilla/data_analysis/time_series_regression_analysis.py +++ b/protzilla/data_analysis/time_series_regression_analysis.py @@ -4,7 +4,7 @@ import pandas as pd import plotly.graph_objects as go -from protzilla.data_analysis.time_series_helper import convert_time_to_datetime +from protzilla.data_analysis.time_series_helper import convert_time_to_hours from protzilla.constants.colors import PROTZILLA_DISCRETE_COLOR_SEQUENCE from sklearn.linear_model import LinearRegression, RANSACRegressor @@ -12,6 +12,7 @@ from sklearn.metrics import mean_squared_error, r2_score from statsmodels.tsa.arima.model import ARIMA from statsmodels.tsa.stattools import adfuller +from pmdarima import auto_arima from plotly.subplots import make_subplots colors = { @@ -27,21 +28,21 @@ def time_series_linear_regression( input_df: pd.DataFrame, metadata_df: pd.DataFrame, protein_group: str, + train_size: float = 0.2, grouping: str = None, - test_size: float = 0.2, ): """ Perform linear regression on the time series data for a given protein group. :param input_df: Peptide dataframe which contains the intensity of each sample :param metadata_df: Metadata dataframe which contains the timestamps :param protein_group: Protein group to perform the analysis on - :param test_size: The proportion of the dataset to include in the test split + :param train_size: The proportion of the dataset to include in the test split :param grouping: Option to select whether regression should be performed on the entire dataset or separately on the control and experimental groups :return: A dictionary containing the root mean squared error and r2 score for the training and test sets """ color_index = 0 - if test_size < 0 or test_size > 1: + if train_size < 0 or train_size > 1: raise ValueError("Test size should be between 0 and 1") input_df = input_df[input_df['Protein ID'] == protein_group] @@ -53,7 +54,7 @@ def time_series_linear_regression( copy=False, ) - input_df["Time"] = input_df["Time"].apply(convert_time_to_datetime) + input_df["Time"] = input_df["Time"].apply(convert_time_to_hours) input_df = input_df.interpolate(method='linear', axis=0) X = input_df[["Time"]] y = input_df["Intensity"] @@ -69,7 +70,7 @@ def time_series_linear_regression( X_group = group_df[["Time"]] y_group = group_df["Intensity"] - X_train, X_test, y_train, y_test = train_test_split(X_group, y_group, test_size=test_size, shuffle=False) + X_train, X_test, y_train, y_test = train_test_split(X_group, y_group, test_size=train_size, shuffle=False) model = LinearRegression() model.fit(X_train, y_train) @@ -113,7 +114,7 @@ def time_series_linear_regression( }) else: - X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, shuffle=False) + X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=train_size, shuffle=False) model = LinearRegression() model.fit(X_train, y_train) @@ -207,21 +208,21 @@ def time_series_ransac_regression( input_df: pd.DataFrame, metadata_df: pd.DataFrame, protein_group: str, + train_size: float, grouping: str, - test_size: float, ): """ Perform RANSAC regression on the time series data for a given protein group. :param input_df: Peptide dataframe which contains the intensity of each sample :param metadata_df: Metadata dataframe which contains the timestamps :param protein_group: Protein group to perform the analysis on - :param test_size: The proportion of the dataset to include in the test split + :param train_size: The proportion of the dataset to include in the test split :return: A dictionary containing the root mean squared error and r2 score for the training and test sets """ color_index = 0 - if test_size < 0 or test_size > 1: + if train_size < 0 or train_size > 1: raise ValueError("Test size should be between 0 and 1") input_df = input_df[input_df['Protein ID'] == protein_group] @@ -233,7 +234,7 @@ def time_series_ransac_regression( copy=False, ) - input_df["Time"] = input_df["Time"].apply(convert_time_to_datetime) + input_df["Time"] = input_df["Time"].apply(convert_time_to_hours) input_df = input_df.interpolate(method='linear', axis=0) X = input_df[["Time"]] y = input_df["Intensity"] @@ -249,7 +250,7 @@ def time_series_ransac_regression( X_group = group_df[["Time"]] y_group = group_df["Intensity"] - X_train, X_test, y_train, y_test = train_test_split(X_group, y_group, test_size=test_size, shuffle=False) + X_train, X_test, y_train, y_test = train_test_split(X_group, y_group, test_size=train_size, shuffle=False) model = RANSACRegressor(base_estimator=LinearRegression()) model.fit(X_train, y_train) @@ -305,7 +306,7 @@ def time_series_ransac_regression( }) else: - X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, shuffle=False) + X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=train_size, shuffle=False) model = RANSACRegressor(base_estimator=LinearRegression()) model.fit(X_train, y_train) @@ -476,3 +477,174 @@ def adfuller_test( messages=messages, ) + +def time_series_auto_arima( + input_df: pd.DataFrame, + metadata_df: pd.DataFrame, + protein_group: str, + seasonal: str, + m: int, + train_size: float, + forecast_steps: int, + grouping: str, +) -> dict: + """ + Perform an automatic ARIMA model selection on the time series data for a given protein group. + :param input_df: Peptide dataframe which contains the intensity of each sample + :param metadata_df: Metadata dataframe which contains the timestamps + :param protein_group: Protein group to perform the analysis on + :param seasonal: Whether the ARIMA model should be seasonal + :param m: The number of time steps for a single seasonal period (ignored if seasonal=False) + :param train_size: The proportion of the dataset to include in the test split + :param forecast_steps: The number of steps to forecast + + :return: A dictionary containing the root mean squared error and r2 score for the training and test sets + """ + + color_index = 0 + + if train_size < 0 or train_size > 1: + raise ValueError("Train size should be between 0 and 1") + if seasonal == "Yes": + seasonal = True + else: + seasonal = False + + input_df = input_df[input_df['Protein ID'] == protein_group] + + input_df = pd.merge( + left=input_df, + right=metadata_df, + on="Sample", + copy=False, + ) + + input_df["Time"] = input_df["Time"].apply(convert_time_to_hours) + input_df.set_index("Time", inplace=True) + input_df = input_df.interpolate(method='linear', axis=0) + + data = input_df["Intensity"] + + train_size = int(len(data) * train_size) + train, test = data[:train_size], data[train_size:] + + # Fit the ARIMA model + model = auto_arima( + train, + seasonal=seasonal, + m=m, + trace=True, + error_action='ignore', + suppress_warnings=True, + stepwise=True, + ) + + # Forecast the test set + forecast = model.predict(n_periods=forecast_steps) + + last_time = data.index[-1] +1 + forecast_index = np.arange(last_time, last_time + forecast_steps) + forecast_series = pd.Series(forecast, index=forecast_index) + + test_for_comparison = test[:forecast_steps] + forecast_for_comparison = forecast_series[: len(test_for_comparison)] + + + + test_rmse = np.sqrt(mean_squared_error(test_for_comparison, forecast_for_comparison)) + test_r2 = r2_score(test_for_comparison, forecast_for_comparison) + train_rmse = np.sqrt(mean_squared_error(train, model.predict_in_sample())) + train_r2 = r2_score(train, model.predict_in_sample()) + + fig = make_subplots(rows=1, cols=2, column_widths=[0.7, 0.3]) + + scores = [] + + plot_df = pd.DataFrame({ + 'Time': test.index[:forecast_steps], + 'Intensity': test[:forecast_steps], + 'Predicted': forecast_series, + 'Inlier': np.abs(test[:forecast_steps] - forecast_series) < (1.5 * np.std(test[:forecast_steps])) + }) + + fig.add_trace(go.Scatter( + x=plot_df['Time'], + y=plot_df['Intensity'], + mode='markers', + name='Actual Intensity', + marker=dict(color='blue') + ), row=1, col=1) + + fig.add_trace(go.Scatter( + x=plot_df['Time'], + y=plot_df['Predicted'], + mode='lines', + name='Predicted Intensity', + line=dict(color='red') + ), row=1, col=1) + + fig.add_trace(go.Scatter( + x=plot_df[plot_df['Inlier'] == False]['Time'], + y=plot_df[plot_df['Inlier'] == False]['Intensity'], + mode='markers', + name='Outliers', + marker=dict(color='green') + ), row=1, col=1) + + scores.append({ + 'group': 'Overall', + 'train_root_mean_squared': train_rmse, + 'test_root_mean_squared': test_rmse, + 'train_r2_score': train_r2, + 'test_r2_score': test_r2, + }) + + # Add annotation text as a separate trace in the subplot + annotation_text = "
".join([ + f"Group: {res['group']}
Train RMSE: {res['train_root_mean_squared']:.3f}
" + f"Test RMSE: {res['test_root_mean_squared']:.3f}
" + f"Train R²: {res['train_r2_score']:.3f}
" + f"Test R²: {res['test_r2_score']:.3f}" + for res in scores + ]) + + fig.add_trace(go.Scatter( + x=[0], + y=[0.25], + text=[annotation_text], + mode='text', + textfont=dict(size=12), + showlegend=False + ), row=1, col=2) + + fig.update_layout( + title=f"Intensity over Time for {protein_group}", + plot_bgcolor=colors["plot_bgcolor"], + xaxis_gridcolor=colors["gridcolor"], + yaxis_gridcolor=colors["gridcolor"], + xaxis_linecolor=colors["linecolor"], + yaxis_linecolor=colors["linecolor"], + xaxis_title="Time", + yaxis_title="Intensity", + legend_title="Legend", + autosize=True, + margin=dict(l=100, r=100, t=100, b=50), + legend=dict( + yanchor="top", + y=0.95, + xanchor="right", + x=0.825 + ) + ) + + fig.update_xaxes(showticklabels=False, showgrid=False, zeroline=False, row=1, col=2) + fig.update_yaxes(showticklabels=False, showgrid=False, zeroline=False, row=1, col=2) + + fig.update_annotations(font_size=12) + + fig.show() + + return dict( + scores=scores, + plots=[fig], + ) diff --git a/protzilla/methods/data_analysis.py b/protzilla/methods/data_analysis.py index 1c3f7093..ad84a523 100644 --- a/protzilla/methods/data_analysis.py +++ b/protzilla/methods/data_analysis.py @@ -14,7 +14,12 @@ ) from protzilla.data_analysis.differential_expression_t_test import t_test from protzilla.data_analysis.dimension_reduction import t_sne, umap -from protzilla.data_analysis.time_series_regression_analysis import time_series_linear_regression, time_series_ransac_regression, adfuller_test +from protzilla.data_analysis.time_series_regression_analysis import ( + time_series_linear_regression, + time_series_ransac_regression, + adfuller_test, + time_series_auto_arima, +) from protzilla.data_analysis.ptm_analysis import filter_peptides_of_protein, ptms_per_sample, \ ptms_per_protein_and_sample from protzilla.data_analysis.model_evaluation import evaluate_classification_model @@ -809,8 +814,8 @@ class TimeSeriesLinearRegression(PlotStep): "input_df", "metadata_df", "protein_group", + "train_size", "grouping", - "test_size", ] output_keys = [ "scores", @@ -834,8 +839,8 @@ class TimeSeriesRANSACRegression(PlotStep): "input_df", "metadata_df", "protein_group", + "train_size", "grouping", - "test_size", ] output_keys = [ "scores", @@ -876,6 +881,36 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict: return inputs +class TimeSeriesAutoARIMA(PlotStep): + display_name = "Auto ARIMA (AutoRegressive Integrated Moving Average)" + operation = "Time series analysis" + method_description = ( + "Perform Auto ARIMA on the time series data for a given protein group." + ) + + input_keys = [ + "input_df", + "metadata_df", + "protein_group", + "seasonal", + "m", + "train_size", + "forecast_steps", + "grouping", + ] + output_keys = [ + "scores", + ] + + def method(self, inputs: dict) -> dict: + return time_series_auto_arima(**inputs) + + def insert_dataframes(self, steps: StepManager, inputs) -> dict: + inputs["input_df"] = steps.get_step_output(Step, "peptide_df", inputs["input_df"]) + inputs["metadata_df"] = steps.metadata_df + return inputs + + class PTMsPerSample(DataAnalysisStep): display_name = "PTMs per Sample" operation = "Peptide analysis" diff --git a/requirements.txt b/requirements.txt index bc175e2a..e7f0c7ed 100644 --- a/requirements.txt +++ b/requirements.txt @@ -21,6 +21,7 @@ restring==0.1.20 scikit-learn==1.2.2 scipy==1.10.1 statsmodels==0.13.5 +pmdarima==2.0.4 umap-learn==0.5.3 Werkzeug==2.2.3 numba==0.57.0 diff --git a/ui/runs/form_mapping.py b/ui/runs/form_mapping.py index bca07db4..a1bcd37b 100644 --- a/ui/runs/form_mapping.py +++ b/ui/runs/form_mapping.py @@ -69,6 +69,7 @@ data_analysis.TimeSeriesLinearRegression: data_analysis_forms.TimeSeriesLinearRegressionForm, data_analysis.TimeSeriesRANSACRegression: data_analysis_forms.TimeSeriesRANSACRegressionForm, data_analysis.TimeSeriesADFullerTest: data_analysis_forms.TimeSeriesADFullerTestForm, + data_analysis.TimeSeriesAutoARIMA: data_analysis_forms.TimeSeriesAutoARIMAForm, data_preprocessing.ImputationByMinPerSample: data_preprocessing_forms.ImputationByMinPerSampleForms, data_integration.EnrichmentAnalysisGOAnalysisWithString: data_integration_forms.EnrichmentAnalysisGOAnalysisWithStringForm, data_integration.EnrichmentAnalysisGOAnalysisWithEnrichr: data_integration_forms.EnrichmentAnalysisGOAnalysisWithEnrichrForm, diff --git a/ui/runs/forms/data_analysis.py b/ui/runs/forms/data_analysis.py index 77b7384a..960dd33e 100644 --- a/ui/runs/forms/data_analysis.py +++ b/ui/runs/forms/data_analysis.py @@ -1229,18 +1229,18 @@ class TimeSeriesLinearRegressionForm(MethodForm): choices=[], label="Protein group: which protein group to perform the linear regression on", ) - grouping = CustomChoiceField( - choices= TimeSeriesGrouping, - label="Option to select whether regression should be performed on the entire dataset or separately on the control and experimental groups", - initial=TimeSeriesGrouping.with_grouping - ) - test_size = CustomFloatField( - label="Test size: proportion of the dataset to include in the test split", + train_size = CustomFloatField( + label="Train size: proportion of the dataset to include in the test split", min_value=0, max_value=1, step_size=0.1, initial=0.2 ) + grouping = CustomChoiceField( + choices= TimeSeriesGrouping, + label="Option to select whether regression should be performed on the entire dataset or separately on the control and experimental groups", + initial=TimeSeriesGrouping.with_grouping + ) def fill_form(self, run: Run) -> None: @@ -1270,18 +1270,18 @@ class TimeSeriesRANSACRegressionForm(MethodForm): choices=[], label="Protein group: which protein group to perform the RANSAC regression on", ) - grouping = CustomChoiceField( - choices= TimeSeriesGrouping, - label="Option to select whether regression should be performed on the entire dataset or separately on the control and experimental groups", - initial=TimeSeriesGrouping.with_grouping - ) - test_size = CustomFloatField( - label="Test size: proportion of the dataset to include in the test split", + train_size = CustomFloatField( + label="Train size: proportion of the dataset to include in the test split", min_value=0, max_value=1, step_size=0.1, initial=0.2 ) + grouping = CustomChoiceField( + choices= TimeSeriesGrouping, + label="Option to select whether regression should be performed on the entire dataset or separately on the control and experimental groups", + initial=TimeSeriesGrouping.with_grouping + ) def fill_form(self, run: Run) -> None: @@ -1328,6 +1328,73 @@ class TimeSeriesADFullerTestForm(MethodForm): initial=0.05 ) + def fill_form(self, run: Run) -> None: + self.fields["input_df"].choices = fill_helper.get_choices_for_peptide_df_steps( + run + ) + input_df_instance_id = self.data.get( + "input_df", self.fields["input_df"].choices[0][0] + ) + + self.fields["protein_group"].choices = fill_helper.to_choices( + run.steps.get_step_output( + step_type=Step, + output_key="peptide_df", + instance_identifier=input_df_instance_id, + )["Protein ID"].unique() + ) + + +class TimeSeriesAutoARIMAForm(MethodForm): + is_dynamic = True + model_info = TextDisplayField( + label="Information about the AutoARIMA model", + text=( + "Auto ARIMA is a function that automatically selects the best-fitting ARIMA model for a time series" + "by iterating over multiple combinations of model parameters to minimize an information criterion like AIC (Akaike Information Criterion)." + "It simplifies the model selection process, handling both seasonal and non-seasonal data," + " and helps in making accurate forecasts." + ), + ) + input_df = CustomChoiceField( + choices=[], + label="Peptide dataframe", + ) + protein_group = CustomChoiceField( + choices=[], + label="Protein group: which protein group to perform the AutoARIMA on", + ) + seasonal = CustomChoiceField( + choices=YesNo, + label="Seasonal: Whether the ARIMA model should be seasonal", + initial=YesNo.no + ) + m = CustomNumberField( + label = "The number of time steps for a single seasonal period (ignored if seasonal=No)", + min_value=1, + step_size=1, + initial=1, + ) + train_size = CustomFloatField( + label="Train size: proportion of the dataset to include in the test split", + min_value=0, + max_value=1, + step_size=0.1, + initial=0.8, + ) + forecast_steps = CustomNumberField( + label="Number of steps to forecast", + min_value=1, + step_size=1, + initial=10 + ) + grouping = CustomChoiceField( + choices= TimeSeriesGrouping, + label="Option to select whether regression should be performed on the entire dataset or separately on the control and experimental groups", + initial=TimeSeriesGrouping.with_grouping + ) + + def fill_form(self, run: Run) -> None: self.fields["input_df"].choices = fill_helper.get_choices_for_peptide_df_steps( run From 2e22aa6df4c0cdaa02fdc6ff9280e0beea5d0905 Mon Sep 17 00:00:00 2001 From: AK Date: Thu, 8 Aug 2024 10:51:01 +0200 Subject: [PATCH 32/52] implemented Auto ARIMA --- .../time_series_regression_analysis.py | 232 +++++++++++------- protzilla/methods/data_analysis.py | 4 +- ui/runs/forms/data_analysis.py | 32 ++- 3 files changed, 175 insertions(+), 93 deletions(-) diff --git a/protzilla/data_analysis/time_series_regression_analysis.py b/protzilla/data_analysis/time_series_regression_analysis.py index 0e0a43ec..1646b532 100644 --- a/protzilla/data_analysis/time_series_regression_analysis.py +++ b/protzilla/data_analysis/time_series_regression_analysis.py @@ -56,6 +56,9 @@ def time_series_linear_regression( input_df["Time"] = input_df["Time"].apply(convert_time_to_hours) input_df = input_df.interpolate(method='linear', axis=0) + + input_df = input_df.sample(frac=1, random_state = 42).reset_index(drop=True) + X = input_df[["Time"]] y = input_df["Intensity"] @@ -114,7 +117,7 @@ def time_series_linear_regression( }) else: - X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=train_size, shuffle=False) + X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=train_size, shuffle=False) model = LinearRegression() model.fit(X_train, y_train) @@ -156,10 +159,9 @@ def time_series_linear_regression( # Add annotation text as a separate trace in the subplot annotation_text = "
".join([ - f"Group: {res['group']}
Train RMSE: {res['train_root_mean_squared']:.3f}
" - f"Test RMSE: {res['test_root_mean_squared']:.3f}
" - f"Train R²: {res['train_r2_score']:.3f}
" - f"Test R²: {res['test_r2_score']:.3f}" + f"Group: {res['group']} (Train/Test)" + f"
RMSE: {res['train_root_mean_squared']:.3f} / {res['test_root_mean_squared']:.3f}
" + f"Train R²: {res['train_r2_score']:.3f} / {res['test_r2_score']:.3f}
" for res in scores ]) @@ -188,7 +190,7 @@ def time_series_linear_regression( yanchor="top", y=0.95, xanchor="right", - x=0.825 + x=0.8 ) ) @@ -208,6 +210,9 @@ def time_series_ransac_regression( input_df: pd.DataFrame, metadata_df: pd.DataFrame, protein_group: str, + max_trials: int, + stop_probability: float, + loss: str, train_size: float, grouping: str, ): @@ -236,6 +241,9 @@ def time_series_ransac_regression( input_df["Time"] = input_df["Time"].apply(convert_time_to_hours) input_df = input_df.interpolate(method='linear', axis=0) + + input_df = input_df.sample(frac=1, random_state = 42).reset_index(drop=True) + X = input_df[["Time"]] y = input_df["Intensity"] @@ -250,8 +258,8 @@ def time_series_ransac_regression( X_group = group_df[["Time"]] y_group = group_df["Intensity"] - X_train, X_test, y_train, y_test = train_test_split(X_group, y_group, test_size=train_size, shuffle=False) - model = RANSACRegressor(base_estimator=LinearRegression()) + X_train, X_test, y_train, y_test = train_test_split(X_group, y_group, train_size=train_size, shuffle=False) + model = RANSACRegressor(max_trials = max_trials, stop_probability = stop_probability, loss = loss, base_estimator=LinearRegression()) model.fit(X_train, y_train) inlier_mask = model.inlier_mask_ @@ -361,10 +369,9 @@ def time_series_ransac_regression( # Add annotation text as a separate trace in the subplot annotation_text = "
".join([ - f"Group: {res['group']}
Train RMSE: {res['train_root_mean_squared']:.3f}
" - f"Test RMSE: {res['test_root_mean_squared']:.3f}
" - f"Train R²: {res['train_r2_score']:.3f}
" - f"Test R²: {res['test_r2_score']:.3f}" + f"Group: {res['group']} (Train/Test)" + f"
RMSE: {res['train_root_mean_squared']:.3f} / {res['test_root_mean_squared']:.3f}
" + f"Train R²: {res['train_r2_score']:.3f} / {res['test_r2_score']:.3f}
" for res in scores ]) @@ -393,7 +400,7 @@ def time_series_ransac_regression( yanchor="top", y=0.95, xanchor="right", - x=0.825 + x=0.8 ) ) @@ -485,7 +492,6 @@ def time_series_auto_arima( seasonal: str, m: int, train_size: float, - forecast_steps: int, grouping: str, ) -> dict: """ @@ -511,6 +517,7 @@ def time_series_auto_arima( seasonal = False input_df = input_df[input_df['Protein ID'] == protein_group] + input_df = input_df.sample(frac=1, random_state=42).reset_index(drop=True) input_df = pd.merge( left=input_df, @@ -519,92 +526,150 @@ def time_series_auto_arima( copy=False, ) - input_df["Time"] = input_df["Time"].apply(convert_time_to_hours) - input_df.set_index("Time", inplace=True) - input_df = input_df.interpolate(method='linear', axis=0) + fig = make_subplots(rows=1, cols=2, column_widths=[0.7, 0.3]) + scores = [] - data = input_df["Intensity"] + if grouping == "With Grouping" and "Group" in input_df.columns: + groups = input_df["Group"].unique() + for group in groups: + group_df = input_df[input_df["Group"] == group] - train_size = int(len(data) * train_size) - train, test = data[:train_size], data[train_size:] + group_df["Time"] = group_df["Time"].apply(convert_time_to_hours) + group_df = group_df.interpolate(method='linear', axis=0) - # Fit the ARIMA model - model = auto_arima( - train, - seasonal=seasonal, - m=m, - trace=True, - error_action='ignore', - suppress_warnings=True, - stepwise=True, - ) + train_df_size = int(len(group_df) * train_size) + train_df, test_df = group_df[:train_df_size], group_df[train_df_size:] - # Forecast the test set - forecast = model.predict(n_periods=forecast_steps) + train_df = train_df.set_index("Time")["Intensity"] + test_df = test_df.set_index("Time")["Intensity"] - last_time = data.index[-1] +1 - forecast_index = np.arange(last_time, last_time + forecast_steps) - forecast_series = pd.Series(forecast, index=forecast_index) + # Fit the ARIMA model + model = auto_arima( + train_df, + seasonal=seasonal, + m=m, + trace=True, + error_action='ignore', + suppress_warnings=True, + stepwise=True, + ) - test_for_comparison = test[:forecast_steps] - forecast_for_comparison = forecast_series[: len(test_for_comparison)] + # Forecast the test set + forecast = model.predict(n_periods=test_df.shape[0]) + test_rmse = np.sqrt(mean_squared_error(test_df, forecast)) + test_r2 = r2_score(test_df, forecast) + train_rmse = np.sqrt(mean_squared_error(train_df, model.predict_in_sample())) + train_r2 = r2_score(train_df, model.predict_in_sample()) + forecast_reset = forecast.reset_index(drop=True) + forecast_plot = pd.Series(forecast_reset.values, index=test_df.index) + forecast_plot = forecast_plot.groupby(forecast_plot.index).mean() - test_rmse = np.sqrt(mean_squared_error(test_for_comparison, forecast_for_comparison)) - test_r2 = r2_score(test_for_comparison, forecast_for_comparison) - train_rmse = np.sqrt(mean_squared_error(train, model.predict_in_sample())) - train_r2 = r2_score(train, model.predict_in_sample()) + fig.add_trace(go.Scatter( + x=test_df.index, + y=test_df, + mode='markers', + name='Actual Intensity', + marker=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[color_index]) + ), row=1, col=1) - fig = make_subplots(rows=1, cols=2, column_widths=[0.7, 0.3]) + fig.add_trace(go.Scatter( + x=test_df.index, + y=forecast, + mode='markers', + name='Predicted Intensity', + line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[color_index + 1]) + ), row=1, col=1) - scores = [] + fig.add_trace(go.Scatter( + x = forecast_plot.index, + y = forecast_plot, + mode = 'lines', + name = 'Mean Predicted Intensity', + line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[color_index + 2]) + ), row=1, col=1) - plot_df = pd.DataFrame({ - 'Time': test.index[:forecast_steps], - 'Intensity': test[:forecast_steps], - 'Predicted': forecast_series, - 'Inlier': np.abs(test[:forecast_steps] - forecast_series) < (1.5 * np.std(test[:forecast_steps])) - }) + color_index += 3 - fig.add_trace(go.Scatter( - x=plot_df['Time'], - y=plot_df['Intensity'], - mode='markers', - name='Actual Intensity', - marker=dict(color='blue') - ), row=1, col=1) + scores.append({ + 'group': group, + 'train_root_mean_squared': train_rmse, + 'test_root_mean_squared': test_rmse, + 'train_r2_score': train_r2, + 'test_r2_score': test_r2, + }) - fig.add_trace(go.Scatter( - x=plot_df['Time'], - y=plot_df['Predicted'], - mode='lines', - name='Predicted Intensity', - line=dict(color='red') - ), row=1, col=1) + else: + input_df["Time"] = input_df["Time"].apply(convert_time_to_hours) + input_df = input_df.interpolate(method='linear', axis=0) + + train_size = int(len(input_df) * train_size) + train_df, test_df = input_df[:train_size], input_df[train_size:] + + train_df = train_df.set_index("Time")["Intensity"] + test_df = test_df.set_index("Time")["Intensity"] + + # Fit the ARIMA model + model = auto_arima( + train_df, + seasonal=seasonal, + m=m, + trace=True, + error_action='ignore', + suppress_warnings=True, + stepwise=True, + ) - fig.add_trace(go.Scatter( - x=plot_df[plot_df['Inlier'] == False]['Time'], - y=plot_df[plot_df['Inlier'] == False]['Intensity'], - mode='markers', - name='Outliers', - marker=dict(color='green') - ), row=1, col=1) - - scores.append({ - 'group': 'Overall', - 'train_root_mean_squared': train_rmse, - 'test_root_mean_squared': test_rmse, - 'train_r2_score': train_r2, - 'test_r2_score': test_r2, - }) + # Forecast the test set + forecast = model.predict(n_periods=test_df.shape[0]) + + test_rmse = np.sqrt(mean_squared_error(test_df, forecast)) + test_r2 = r2_score(test_df, forecast) + train_rmse = np.sqrt(mean_squared_error(train_df, model.predict_in_sample())) + train_r2 = r2_score(train_df, model.predict_in_sample()) + + forecast_reset = forecast.reset_index(drop=True) + forecast_plot = pd.Series(forecast_reset.values, index=test_df.index) + forecast_plot = forecast_plot.groupby(forecast_plot.index).mean() + + fig.add_trace(go.Scatter( + x=test_df.index, + y=test_df, + mode='markers', + name='Actual Intensity', + marker=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[0]) + ), row=1, col=1) + + fig.add_trace(go.Scatter( + x=test_df.index, + y=forecast, + mode='markers', + name='Predicted Intensity', + line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[2]) + ), row=1, col=1) + + fig.add_trace(go.Scatter( + x=forecast_plot.index, + y=forecast_plot, + mode='lines', + name='Mean Predicted Intensity', + line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[3]) + ), row=1, col=1) + + scores.append({ + 'group': 'Overall', + 'train_root_mean_squared': train_rmse, + 'test_root_mean_squared': test_rmse, + 'train_r2_score': train_r2, + 'test_r2_score': test_r2, + }) # Add annotation text as a separate trace in the subplot annotation_text = "
".join([ - f"Group: {res['group']}
Train RMSE: {res['train_root_mean_squared']:.3f}
" - f"Test RMSE: {res['test_root_mean_squared']:.3f}
" - f"Train R²: {res['train_r2_score']:.3f}
" - f"Test R²: {res['test_r2_score']:.3f}" + f"Group: {res['group']} (Train/Test)" + f"
RMSE: {res['train_root_mean_squared']:.3f} / {res['test_root_mean_squared']:.3f}
" + f"Train R²: {res['train_r2_score']:.3f} / {res['test_r2_score']:.3f}
" for res in scores ]) @@ -633,7 +698,7 @@ def time_series_auto_arima( yanchor="top", y=0.95, xanchor="right", - x=0.825 + x=0.775 ) ) @@ -642,7 +707,6 @@ def time_series_auto_arima( fig.update_annotations(font_size=12) - fig.show() return dict( scores=scores, diff --git a/protzilla/methods/data_analysis.py b/protzilla/methods/data_analysis.py index ad84a523..4733ff90 100644 --- a/protzilla/methods/data_analysis.py +++ b/protzilla/methods/data_analysis.py @@ -839,6 +839,9 @@ class TimeSeriesRANSACRegression(PlotStep): "input_df", "metadata_df", "protein_group", + "max_trials", + "stop_probability", + "loss", "train_size", "grouping", ] @@ -895,7 +898,6 @@ class TimeSeriesAutoARIMA(PlotStep): "seasonal", "m", "train_size", - "forecast_steps", "grouping", ] output_keys = [ diff --git a/ui/runs/forms/data_analysis.py b/ui/runs/forms/data_analysis.py index 960dd33e..9d15c199 100644 --- a/ui/runs/forms/data_analysis.py +++ b/ui/runs/forms/data_analysis.py @@ -150,6 +150,10 @@ class TimeSeriesGrouping(Enum): with_grouping = "With Grouping" without_grouping = "Without Grouping" +class TimeSeriesRANSACLoss(Enum): + absolute_error = "absolute_error" + squared_error = "squared_error" + class DifferentialExpressionANOVAForm(MethodForm): is_dynamic = True @@ -1234,7 +1238,7 @@ class TimeSeriesLinearRegressionForm(MethodForm): min_value=0, max_value=1, step_size=0.1, - initial=0.2 + initial=0.8 ) grouping = CustomChoiceField( choices= TimeSeriesGrouping, @@ -1270,12 +1274,30 @@ class TimeSeriesRANSACRegressionForm(MethodForm): choices=[], label="Protein group: which protein group to perform the RANSAC regression on", ) + max_trials = CustomNumberField( + label="Max trials: the maximum number of iterations for random sample selection", + min_value=1, + step_size=1, + initial=100, + ) + stop_probability = CustomFloatField( + label="Stop Probability: the probability that the algorithm stops after a certain number of iterations if at least one outlier-free set of the training data is sampled", + min_value=0, + max_value=1, + step_size=0.01, + initial=0.99 + ) + loss = CustomChoiceField( + choices= TimeSeriesRANSACLoss, + label="Loss function: the loss function to be used for fitting the linear model", + initial=TimeSeriesRANSACLoss.absolute_error, + ) train_size = CustomFloatField( label="Train size: proportion of the dataset to include in the test split", min_value=0, max_value=1, step_size=0.1, - initial=0.2 + initial=0.8 ) grouping = CustomChoiceField( choices= TimeSeriesGrouping, @@ -1382,12 +1404,6 @@ class TimeSeriesAutoARIMAForm(MethodForm): step_size=0.1, initial=0.8, ) - forecast_steps = CustomNumberField( - label="Number of steps to forecast", - min_value=1, - step_size=1, - initial=10 - ) grouping = CustomChoiceField( choices= TimeSeriesGrouping, label="Option to select whether regression should be performed on the entire dataset or separately on the control and experimental groups", From 82a550ad211716584888984645609a3246e5d7c0 Mon Sep 17 00:00:00 2001 From: AK Date: Thu, 8 Aug 2024 15:39:16 +0200 Subject: [PATCH 33/52] implemented ARIMA --- .../time_series_regression_analysis.py | 226 +++++++++++++++++- protzilla/methods/data_analysis.py | 32 +++ ui/runs/form_mapping.py | 1 + ui/runs/forms/data_analysis.py | 81 ++++++- 4 files changed, 330 insertions(+), 10 deletions(-) diff --git a/protzilla/data_analysis/time_series_regression_analysis.py b/protzilla/data_analysis/time_series_regression_analysis.py index 1646b532..7777f833 100644 --- a/protzilla/data_analysis/time_series_regression_analysis.py +++ b/protzilla/data_analysis/time_series_regression_analysis.py @@ -437,8 +437,6 @@ def adfuller_test( - messages: A list of messages for the user. """ - # TODO: Info box for the user - messages = [] input_df = input_df[input_df['Protein ID'] == protein_group] @@ -502,7 +500,7 @@ def time_series_auto_arima( :param seasonal: Whether the ARIMA model should be seasonal :param m: The number of time steps for a single seasonal period (ignored if seasonal=False) :param train_size: The proportion of the dataset to include in the test split - :param forecast_steps: The number of steps to forecast + :param grouping: Whether to group the data by the 'Group' column :return: A dictionary containing the root mean squared error and r2 score for the training and test sets """ @@ -579,7 +577,7 @@ def time_series_auto_arima( y=forecast, mode='markers', name='Predicted Intensity', - line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[color_index + 1]) + line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[color_index + 2]) ), row=1, col=1) fig.add_trace(go.Scatter( @@ -712,3 +710,223 @@ def time_series_auto_arima( scores=scores, plots=[fig], ) + + +def time_series_arima( + input_df: pd.DataFrame, + metadata_df: pd.DataFrame, + protein_group: str, + seasonal: str, + p: int, + d: int, + q: int, + train_size: float, + grouping: str, +) -> dict: + + """ + Perform ARIMA model selection on the time series data for a given protein group. + :param input_df: Peptide dataframe which contains the intensity of each sample + :param metadata_df: Metadata dataframe which contains the timestamps + :param protein_group: Protein group to perform the analysis on + :param seasonal: Whether the ARIMA model should be seasonal + :param p: ARIMA p parameter + :param d: ARIMA d parameter + :param q: ARIMA q parameter + :param train_size: The proportion of the dataset to include in the test split + :param grouping: Whether to group the data by the 'Group' column + + :return: A dictionary containing the root mean squared error and r2 score for the training and test sets + """ + + color_index = 0 + + if train_size < 0 or train_size > 1: + raise ValueError("Train size should be between 0 and 1") + + input_df = input_df[input_df['Protein ID'] == protein_group] + input_df = input_df.sample(frac=1, random_state=42).reset_index(drop=True) + + input_df = pd.merge(left=input_df, right=metadata_df, on="Sample", copy=False) + + fig = make_subplots(rows=1, cols=2, column_widths=[0.7, 0.3]) + scores = [] + + if grouping == "With Grouping" and "Group" in input_df.columns: + groups = input_df["Group"].unique() + for group in groups: + group_df = input_df[input_df["Group"] == group] + + group_df["Time"] = group_df["Time"].apply(convert_time_to_hours) + group_df = group_df.interpolate(method='linear', axis=0) + + train_df_size = int(len(group_df) * train_size) + train_df, test_df = group_df[:train_df_size], group_df[train_df_size:] + + train_df = train_df.set_index("Time")["Intensity"] + test_df = test_df.set_index("Time")["Intensity"] + + if seasonal == "Yes": + model = ARIMA( + train_df, + order=(p, d, q), + #seasonal_order=(P, D, Q, m) + ) + else: + model = ARIMA( + train_df, + order=(p, d, q) + ) + + model_fit = model.fit() + + forecast = model_fit.forecast(steps=len(test_df)) + + test_rmse = np.sqrt(mean_squared_error(test_df, forecast)) + test_r2 = r2_score(test_df, forecast) + train_rmse = np.sqrt(mean_squared_error(train_df, model_fit.fittedvalues)) + train_r2 = r2_score(train_df, model_fit.fittedvalues) + + forecast_reset = forecast.reset_index(drop=True) + forecast_plot = pd.Series(forecast_reset.values, index=test_df.index) + forecast_mean_plot = forecast_plot.groupby(forecast_plot.index).mean() + + fig.add_trace(go.Scatter( + x=test_df.index, + y=test_df, + mode='markers', + name='Actual Intensity', + marker=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[color_index]) + ), row=1, col=1) + + fig.add_trace(go.Scatter( + x=forecast_plot.index, + y=forecast_plot, + mode='markers', + name='Predicted Intensity', + line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[color_index + 2]) + ), row=1, col=1) + + fig.add_trace(go.Scatter( + x = forecast_mean_plot.index, + y = forecast_mean_plot, + mode = 'lines', + name = 'Mean Predicted Intensity', + line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[color_index + 2]) + ), row=1, col=1) + + color_index += 3 + + scores.append({ + 'group': group, + 'train_root_mean_squared': train_rmse, + 'test_root_mean_squared': test_rmse, + 'train_r2_score': train_r2, + 'test_r2_score': test_r2, + }) + + else: + input_df["Time"] = input_df["Time"].apply(convert_time_to_hours) + input_df = input_df.interpolate(method='linear', axis=0) + + train_size = int(len(input_df) * train_size) + train_df, test_df = input_df[:train_size], input_df[train_size:] + + train_df = train_df.set_index("Time")["Intensity"] + test_df = test_df.set_index("Time")["Intensity"] + + if seasonal == "Yes": + model = ARIMA(train_df, order=(p, d, q)) + else: + model = ARIMA(train_df, order=(p, d, q)) + + model_fit = model.fit() + + forecast = model_fit.forecast(steps=len(test_df)) + + test_rmse = np.sqrt(mean_squared_error(test_df, forecast)) + test_r2 = r2_score(test_df, forecast) + train_rmse = np.sqrt(mean_squared_error(train_df, model_fit.fittedvalues)) + train_r2 = r2_score(train_df, model_fit.fittedvalues) + + forecast_reset = forecast.reset_index(drop=True) + forecast_plot = pd.Series(forecast_reset.values, index=test_df.index) + forecast_plot = forecast_plot.groupby(forecast_plot.index).mean() + + fig.add_trace(go.Scatter( + x=test_df.index, + y=test_df, + mode='markers', + name='Actual Intensity', + marker=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[0]) + ), row=1, col=1) + + fig.add_trace(go.Scatter( + x=test_df.index, + y=forecast, + mode='markers', + name='Predicted Intensity', + line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[2]) + ), row=1, col=1) + + fig.add_trace(go.Scatter( + x=forecast_plot.index, + y=forecast_plot, + mode='lines', + name='Mean Predicted Intensity', + line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[3]) + ), row=1, col=1) + + scores.append({ + 'group': 'Overall', + 'train_root_mean_squared': train_rmse, + 'test_root_mean_squared': test_rmse, + 'train_r2_score': train_r2, + 'test_r2_score': test_r2, + }) + + annotation_text = "
".join([ + f"Group: {res['group']} (Train/Test)" + f"
RMSE: {res['train_root_mean_squared']:.3f} / {res['test_root_mean_squared']:.3f}
" + f"Train R²: {res['train_r2_score']:.3f} / {res['test_r2_score']:.3f}
" + for res in scores + ]) + + fig.add_trace(go.Scatter( + x=[0], + y=[0.25], + text=[annotation_text], + mode='text', + textfont=dict(size=12), + showlegend=False + ), row=1, col=2) + + fig.update_layout( + title=f"Intensity over Time for {protein_group}", + plot_bgcolor=colors["plot_bgcolor"], + xaxis_gridcolor=colors["gridcolor"], + yaxis_gridcolor=colors["gridcolor"], + xaxis_linecolor=colors["linecolor"], + yaxis_linecolor=colors["linecolor"], + xaxis_title="Time", + yaxis_title="Intensity", + legend_title="Legend", + autosize=True, + margin=dict(l=100, r=100, t=100, b=50), + legend=dict( + yanchor="top", + y=0.95, + xanchor="right", + x=0.775 + ) + ) + + fig.update_xaxes(showticklabels=False, showgrid=False, zeroline=False, row=1, col=2) + fig.update_yaxes(showticklabels=False, showgrid=False, zeroline=False, row=1, col=2) + + fig.update_annotations(font_size=12) + + return dict( + scores=scores, + plots=[fig], + ) diff --git a/protzilla/methods/data_analysis.py b/protzilla/methods/data_analysis.py index 4733ff90..26d73f77 100644 --- a/protzilla/methods/data_analysis.py +++ b/protzilla/methods/data_analysis.py @@ -19,6 +19,7 @@ time_series_ransac_regression, adfuller_test, time_series_auto_arima, + time_series_arima, ) from protzilla.data_analysis.ptm_analysis import filter_peptides_of_protein, ptms_per_sample, \ ptms_per_protein_and_sample @@ -913,6 +914,37 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict: return inputs +class TimeSeriesARIMA(PlotStep): + display_name = "ARIMA (AutoRegressive Integrated Moving Average)" + operation = "Time series analysis" + method_description = ( + "Perform ARIMA on the time series data for a given protein group." + ) + + input_keys = [ + "input_df", + "metadata_df", + "protein_group", + "seasonal", + "p", + "d", + "q", + "train_size", + "grouping", + ] + output_keys = [ + "scores", + ] + + def method(self, inputs: dict) -> dict: + return time_series_arima(**inputs) + + def insert_dataframes(self, steps: StepManager, inputs) -> dict: + inputs["input_df"] = steps.get_step_output(Step, "peptide_df", inputs["input_df"]) + inputs["metadata_df"] = steps.metadata_df + return inputs + + class PTMsPerSample(DataAnalysisStep): display_name = "PTMs per Sample" operation = "Peptide analysis" diff --git a/ui/runs/form_mapping.py b/ui/runs/form_mapping.py index a1bcd37b..8f4793f1 100644 --- a/ui/runs/form_mapping.py +++ b/ui/runs/form_mapping.py @@ -70,6 +70,7 @@ data_analysis.TimeSeriesRANSACRegression: data_analysis_forms.TimeSeriesRANSACRegressionForm, data_analysis.TimeSeriesADFullerTest: data_analysis_forms.TimeSeriesADFullerTestForm, data_analysis.TimeSeriesAutoARIMA: data_analysis_forms.TimeSeriesAutoARIMAForm, + data_analysis.TimeSeriesARIMA: data_analysis_forms.TimeSeriesARIMAForm, data_preprocessing.ImputationByMinPerSample: data_preprocessing_forms.ImputationByMinPerSampleForms, data_integration.EnrichmentAnalysisGOAnalysisWithString: data_integration_forms.EnrichmentAnalysisGOAnalysisWithStringForm, data_integration.EnrichmentAnalysisGOAnalysisWithEnrichr: data_integration_forms.EnrichmentAnalysisGOAnalysisWithEnrichrForm, diff --git a/ui/runs/forms/data_analysis.py b/ui/runs/forms/data_analysis.py index 9d15c199..5e01e5d0 100644 --- a/ui/runs/forms/data_analysis.py +++ b/ui/runs/forms/data_analysis.py @@ -1332,7 +1332,9 @@ class TimeSeriesADFullerTestForm(MethodForm): "determines how strongly a time series is defined by a trend. The null hypothesis of the test is that the " "time series can be represented by a unit root, which implies that the time series is not stationary. " "The alternative hypothesis is that the time series is stationary. If the p-value is less than the " - "significance level, the null hypothesis can be rejected and the time series is considered stationary." + "significance level, the null hypothesis can be rejected and the time series is considered stationary.
" + "Dickey, D. & Fuller, Wayne. (1979). Distribution of the Estimators for Autoregressive Time Series With a Unit Root." + "JASA. Journal of the American Statistical Association. 74. 10.2307/2286348. " ), ) input_df = CustomChoiceField( @@ -1370,12 +1372,8 @@ def fill_form(self, run: Run) -> None: class TimeSeriesAutoARIMAForm(MethodForm): is_dynamic = True model_info = TextDisplayField( - label="Information about the AutoARIMA model", + label="Citation for AutoARIMA model", text=( - "Auto ARIMA is a function that automatically selects the best-fitting ARIMA model for a time series" - "by iterating over multiple combinations of model parameters to minimize an information criterion like AIC (Akaike Information Criterion)." - "It simplifies the model selection process, handling both seasonal and non-seasonal data," - " and helps in making accurate forecasts." ), ) input_df = CustomChoiceField( @@ -1411,6 +1409,77 @@ class TimeSeriesAutoARIMAForm(MethodForm): ) + def fill_form(self, run: Run) -> None: + self.fields["input_df"].choices = fill_helper.get_choices_for_peptide_df_steps( + run + ) + input_df_instance_id = self.data.get( + "input_df", self.fields["input_df"].choices[0][0] + ) + + self.fields["protein_group"].choices = fill_helper.to_choices( + run.steps.get_step_output( + step_type=Step, + output_key="peptide_df", + instance_identifier=input_df_instance_id, + )["Protein ID"].unique() + ) + + +class TimeSeriesARIMAForm(MethodForm): + is_dynamic = True + """ + model_info = TextDisplayField( + label="Citation for ARIMA model", + text=( + ), + ) + """ + input_df = CustomChoiceField( + choices=[], + label="Peptide dataframe", + ) + protein_group = CustomChoiceField( + choices=[], + label="Protein group: which protein group to perform the AutoARIMA on", + ) + seasonal = CustomChoiceField( + choices=YesNo, + label="Seasonal: Whether the ARIMA model should be seasonal", + initial=YesNo.no + ) + p = CustomNumberField( + label = "The number of lag observations included in the model", + min_value=0, + step_size=1, + initial=1, + ) + d = CustomNumberField( + label = "The number of times that the raw observations are differenced", + min_value=0, + step_size=1, + initial=1, + ) + q = CustomNumberField( + label = "The size of the moving average window", + min_value=1, + step_size=1, + initial=1, + ) + train_size = CustomFloatField( + label="Train size: proportion of the dataset to include in the test split", + min_value=0, + max_value=1, + step_size=0.1, + initial=0.8, + ) + grouping = CustomChoiceField( + choices= TimeSeriesGrouping, + label="Option to select whether regression should be performed on the entire dataset or separately on the control and experimental groups", + initial=TimeSeriesGrouping.with_grouping + ) + + def fill_form(self, run: Run) -> None: self.fields["input_df"].choices = fill_helper.get_choices_for_peptide_df_steps( run From 5c0c157d4f3b899da7bfb239dd0db291f1da864d Mon Sep 17 00:00:00 2001 From: AK Date: Thu, 8 Aug 2024 15:56:48 +0200 Subject: [PATCH 34/52] Fixed RANSAC tests --- .../time_series_regression_analysis.py | 4 +- .../test_time_series_analysis.py | 54 +++++++++++++++---- 2 files changed, 47 insertions(+), 11 deletions(-) diff --git a/protzilla/data_analysis/time_series_regression_analysis.py b/protzilla/data_analysis/time_series_regression_analysis.py index 7777f833..bb9f5d83 100644 --- a/protzilla/data_analysis/time_series_regression_analysis.py +++ b/protzilla/data_analysis/time_series_regression_analysis.py @@ -73,7 +73,7 @@ def time_series_linear_regression( X_group = group_df[["Time"]] y_group = group_df["Intensity"] - X_train, X_test, y_train, y_test = train_test_split(X_group, y_group, test_size=train_size, shuffle=False) + X_train, X_test, y_train, y_test = train_test_split(X_group, y_group, train_size=train_size, shuffle=False) model = LinearRegression() model.fit(X_train, y_train) @@ -314,7 +314,7 @@ def time_series_ransac_regression( }) else: - X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=train_size, shuffle=False) + X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=train_size, shuffle=False) model = RANSACRegressor(base_estimator=LinearRegression()) model.fit(X_train, y_train) diff --git a/tests/protzilla/data_analysis/test_time_series_analysis.py b/tests/protzilla/data_analysis/test_time_series_analysis.py index 4520d3f2..2ee0ff4f 100644 --- a/tests/protzilla/data_analysis/test_time_series_analysis.py +++ b/tests/protzilla/data_analysis/test_time_series_analysis.py @@ -63,7 +63,7 @@ def time_series_test_data(): def test_linear_regression_plot_with_grouping(show_figures, time_series_test_data): test_intensity, test_metadata = time_series_test_data - outputs = time_series_linear_regression(test_intensity, test_metadata, "Protein1", 0.2,"With Grouping") + outputs = time_series_linear_regression(test_intensity, test_metadata, "Protein1", 0.8,"With Grouping") assert "plots" in outputs fig = outputs["plots"][0] if show_figures: @@ -72,14 +72,14 @@ def test_linear_regression_plot_with_grouping(show_figures, time_series_test_dat def test_linear_regression_plot_without_grouping(show_figures, time_series_test_data): test_intensity, test_metadata = time_series_test_data - outputs = time_series_linear_regression(test_intensity, test_metadata, "Protein1", 0.2,"Without Grouping") + outputs = time_series_linear_regression(test_intensity, test_metadata, "Protein1", 0.8,"Without Grouping") assert "plots" in outputs fig = outputs["plots"][0] if show_figures: fig.show() return -def test_linear_regression_plot_invalid_test_size(time_series_test_data): +def test_linear_regression_plot_invalid_train_size(time_series_test_data): test_intensity, test_metadata = time_series_test_data with pytest.raises(ValueError): time_series_linear_regression(test_intensity, test_metadata, "Protein1", 2, "Without Grouping") @@ -87,14 +87,23 @@ def test_linear_regression_plot_invalid_test_size(time_series_test_data): def test_linear_regression_outputs(time_series_test_data): test_intensity, test_metadata = time_series_test_data - outputs = time_series_linear_regression(test_intensity, test_metadata, "Protein1", 0.2, "Without Grouping") + outputs = time_series_linear_regression(test_intensity, test_metadata, "Protein1", 0.8, "Without Grouping") assert "scores" in outputs return def test_ransac_regression_plot_with_grouping(show_figures, time_series_test_data): test_intensity, test_metadata = time_series_test_data - outputs = time_series_ransac_regression(test_intensity, test_metadata, "Protein1", 0.2, "With Grouping") + outputs = time_series_ransac_regression( + test_intensity, + test_metadata, + "Protein1", + 100, + 0.99, + "absolute_error", + 0.8, + "With Grouping" + ) assert "plots" in outputs fig = outputs["plots"][0] if show_figures: @@ -103,22 +112,49 @@ def test_ransac_regression_plot_with_grouping(show_figures, time_series_test_dat def test_ransac_regression_plot_without_grouping(show_figures, time_series_test_data): test_intensity, test_metadata = time_series_test_data - outputs = time_series_ransac_regression(test_intensity, test_metadata, "Protein1", 0.2, "Without Grouping") + outputs = time_series_ransac_regression( + test_intensity, + test_metadata, + "Protein1", + 100, + 0.99, + "absolute_error", + 0.8, + "With Grouping" + ) assert "plots" in outputs fig = outputs["plots"][0] if show_figures: fig.show() return -def test_ransac_plot_invalid_test_size(time_series_test_data): +def test_ransac_plot_invalid_train_size(time_series_test_data): test_intensity, test_metadata = time_series_test_data with pytest.raises(ValueError): - time_series_ransac_regression(test_intensity, test_metadata, "Protein1", 2, "Without Grouping") + time_series_ransac_regression( + test_intensity, + test_metadata, + "Protein1", + 100, + 0.99, + "absolute_error", + 2, + "With Grouping" + ) return def test_ransac_regression_outputs(time_series_test_data): test_intensity, test_metadata = time_series_test_data - outputs = time_series_ransac_regression(test_intensity, test_metadata, "Protein1", 0.2, "Without Grouping") + outputs = time_series_ransac_regression( + test_intensity, + test_metadata, + "Protein1", + 100, + 0.99, + "absolute_error", + 0.8, + "With Grouping" + ) assert "scores" in outputs return From 0b54ee81a787923f6a2cb9e95bdff1c488db0f26 Mon Sep 17 00:00:00 2001 From: AK Date: Wed, 14 Aug 2024 13:06:06 +0200 Subject: [PATCH 35/52] Updated ARIMA so that it supports seasonal parameters --- .../time_series_regression_analysis.py | 12 ++++- protzilla/methods/data_analysis.py | 4 ++ ui/runs/forms/data_analysis.py | 44 ++++++++++++++++--- 3 files changed, 53 insertions(+), 7 deletions(-) diff --git a/protzilla/data_analysis/time_series_regression_analysis.py b/protzilla/data_analysis/time_series_regression_analysis.py index bb9f5d83..52173b23 100644 --- a/protzilla/data_analysis/time_series_regression_analysis.py +++ b/protzilla/data_analysis/time_series_regression_analysis.py @@ -720,6 +720,10 @@ def time_series_arima( p: int, d: int, q: int, + P: int, + D: int, + Q: int, + s: int, train_size: float, grouping: str, ) -> dict: @@ -770,7 +774,7 @@ def time_series_arima( model = ARIMA( train_df, order=(p, d, q), - #seasonal_order=(P, D, Q, m) + seasonal_order=(P, D, Q, s) ) else: model = ARIMA( @@ -836,7 +840,11 @@ def time_series_arima( test_df = test_df.set_index("Time")["Intensity"] if seasonal == "Yes": - model = ARIMA(train_df, order=(p, d, q)) + model = ARIMA( + train_df, + order=(p, d, q), + seasonal_order = (P, D, Q, s), + ) else: model = ARIMA(train_df, order=(p, d, q)) diff --git a/protzilla/methods/data_analysis.py b/protzilla/methods/data_analysis.py index 26d73f77..dcdee6ce 100644 --- a/protzilla/methods/data_analysis.py +++ b/protzilla/methods/data_analysis.py @@ -929,6 +929,10 @@ class TimeSeriesARIMA(PlotStep): "p", "d", "q", + "P", + "D", + "Q", + "s", "train_size", "grouping", ] diff --git a/ui/runs/forms/data_analysis.py b/ui/runs/forms/data_analysis.py index 5e01e5d0..1da3729d 100644 --- a/ui/runs/forms/data_analysis.py +++ b/ui/runs/forms/data_analysis.py @@ -1449,23 +1449,51 @@ class TimeSeriesARIMAForm(MethodForm): initial=YesNo.no ) p = CustomNumberField( - label = "The number of lag observations included in the model", + label = "Autoregressive Order: The number of lag observations included in the model", min_value=0, step_size=1, initial=1, ) d = CustomNumberField( - label = "The number of times that the raw observations are differenced", + label = "Differencing Order: The number of times that the raw observations are differenced", min_value=0, step_size=1, initial=1, ) q = CustomNumberField( - label = "The size of the moving average window", - min_value=1, + label = "Moving Average Order: The size of the moving average window", + min_value=0, step_size=1, initial=1, ) + P = CustomNumberField( + label = "Seasonal Autoregressive Order: The number of seasonal lag observations included in the model", + min_value=0, + step_size=1, + initial=0, + required=False + ) + D = CustomNumberField( + label = "Seasonal Differencing Order: The number of times that the seasonal observations are differenced", + min_value=0, + step_size=1, + initial=0, + required=False + ) + Q = CustomNumberField( + label = "Seasonal Moving Average Order: The size of the seasonal moving average window", + min_value=0, + step_size=1, + initial=0, + required=False + ) + s = CustomNumberField( + label = "Seasonal Period: The number of periods for a single seasonal cycle", + min_value=0, + step_size=1, + initial=0, + required=False + ) train_size = CustomFloatField( label="Train size: proportion of the dataset to include in the test split", min_value=0, @@ -1494,4 +1522,10 @@ def fill_form(self, run: Run) -> None: output_key="peptide_df", instance_identifier=input_df_instance_id, )["Protein ID"].unique() - ) \ No newline at end of file + ) + seasonal = self.data.get("seasonal") + if seasonal == "No": + self.toggle_visibility("P", False) + self.toggle_visibility("D", False) + self.toggle_visibility("Q", False) + self.toggle_visibility("s", False) \ No newline at end of file From e1b77dcf66a2d1a410d64ae657247b595f425e49 Mon Sep 17 00:00:00 2001 From: AK Date: Thu, 15 Aug 2024 12:18:24 +0200 Subject: [PATCH 36/52] Corrected the output text for the scores --- .../data_analysis/time_series_regression_analysis.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/protzilla/data_analysis/time_series_regression_analysis.py b/protzilla/data_analysis/time_series_regression_analysis.py index 52173b23..0b136a76 100644 --- a/protzilla/data_analysis/time_series_regression_analysis.py +++ b/protzilla/data_analysis/time_series_regression_analysis.py @@ -161,7 +161,7 @@ def time_series_linear_regression( annotation_text = "
".join([ f"Group: {res['group']} (Train/Test)" f"
RMSE: {res['train_root_mean_squared']:.3f} / {res['test_root_mean_squared']:.3f}
" - f"Train R²: {res['train_r2_score']:.3f} / {res['test_r2_score']:.3f}
" + f"R²: {res['train_r2_score']:.3f} / {res['test_r2_score']:.3f}
" for res in scores ]) @@ -371,7 +371,7 @@ def time_series_ransac_regression( annotation_text = "
".join([ f"Group: {res['group']} (Train/Test)" f"
RMSE: {res['train_root_mean_squared']:.3f} / {res['test_root_mean_squared']:.3f}
" - f"Train R²: {res['train_r2_score']:.3f} / {res['test_r2_score']:.3f}
" + f"R²: {res['train_r2_score']:.3f} / {res['test_r2_score']:.3f}
" for res in scores ]) @@ -667,7 +667,7 @@ def time_series_auto_arima( annotation_text = "
".join([ f"Group: {res['group']} (Train/Test)" f"
RMSE: {res['train_root_mean_squared']:.3f} / {res['test_root_mean_squared']:.3f}
" - f"Train R²: {res['train_r2_score']:.3f} / {res['test_r2_score']:.3f}
" + f"R²: {res['train_r2_score']:.3f} / {res['test_r2_score']:.3f}
" for res in scores ]) @@ -896,7 +896,7 @@ def time_series_arima( annotation_text = "
".join([ f"Group: {res['group']} (Train/Test)" f"
RMSE: {res['train_root_mean_squared']:.3f} / {res['test_root_mean_squared']:.3f}
" - f"Train R²: {res['train_r2_score']:.3f} / {res['test_r2_score']:.3f}
" + f"R²: {res['train_r2_score']:.3f} / {res['test_r2_score']:.3f}
" for res in scores ]) From d89b2362515ffcf03dd8f7b2c9f9b36e33cc56c7 Mon Sep 17 00:00:00 2001 From: AK Date: Thu, 15 Aug 2024 13:45:49 +0200 Subject: [PATCH 37/52] Implemented tests for auto ARIMA and ARIMA --- .../test_time_series_analysis.py | 184 ++++++++++++++++++ 1 file changed, 184 insertions(+) diff --git a/tests/protzilla/data_analysis/test_time_series_analysis.py b/tests/protzilla/data_analysis/test_time_series_analysis.py index 2ee0ff4f..5e139ce7 100644 --- a/tests/protzilla/data_analysis/test_time_series_analysis.py +++ b/tests/protzilla/data_analysis/test_time_series_analysis.py @@ -5,6 +5,8 @@ time_series_linear_regression, time_series_ransac_regression, adfuller_test, + time_series_auto_arima, + time_series_arima, ) @@ -39,6 +41,13 @@ def time_series_test_data(): ["Sample7", "Protein2", "Gene1", 13], ["Sample7", "Protein3", "Gene1", 3], ["Sample7", "Protein4", "Gene1", 11], + ["Sample1", "Protein1", "Gene2", 10], + ["Sample1", "Protein2", "Gene2", 14], + ["Sample1", "Protein3", "Gene2", 2], + ["Sample1", "Protein4", "Gene2", 10], + ["Sample2", "Protein1", "Gene2", 12], + ["Sample2", "Protein1", "Gene3", 13], + ) test_intensity_df = pd.DataFrame( @@ -168,4 +177,179 @@ def test_adfuller_test(time_series_test_data): assert "critical_values" in outputs assert "is_stationary" in outputs assert "messages" in outputs + return + + +def test_auto_arima_plot_with_grouping(show_figures, time_series_test_data): + test_intensity, test_metadata = time_series_test_data + outputs = time_series_auto_arima( + test_intensity, + test_metadata, + "Protein1", + "No", + 1, + 0.5, + "With Grouping" + ) + assert "plots" in outputs + fig = outputs["plots"][0] + if show_figures: + fig.show() + return + +def test_auto_arima_plot_without_grouping(show_figures, time_series_test_data): + test_intensity, test_metadata = time_series_test_data + outputs = time_series_auto_arima( + test_intensity, + test_metadata, + "Protein1", + "No", + 1, + 0.5, + "With Grouping" + ) + assert "plots" in outputs + fig = outputs["plots"][0] + if show_figures: + fig.show() + return + +def test_auto_arima_plot_invalid_train_size(time_series_test_data): + test_intensity, test_metadata = time_series_test_data + with pytest.raises(ValueError): + time_series_auto_arima( + test_intensity, + test_metadata, + "Protein1", + "No", + 1, + 2, + "With Grouping" + ) + return + + +def test_auto_arima_outputs(time_series_test_data): + test_intensity, test_metadata = time_series_test_data + outputs = time_series_auto_arima( + test_intensity, + test_metadata, + "Protein1", + "No", + 1, + 0.5, + "With Grouping" + ) + assert "scores" in outputs + return + + +def test_arima_plot_with_grouping(show_figures, time_series_test_data): + test_intensity, test_metadata = time_series_test_data + outputs = time_series_arima( + test_intensity, + test_metadata, + "Protein1", + "No", + 1, + 1, + 1, + 0, + 0, + 0, + 0, + 0.5, + "With Grouping" + ) + assert "plots" in outputs + fig = outputs["plots"][0] + if show_figures: + fig.show() + return + +def test_arima_plot_seasonal_with_grouping(show_figures, time_series_test_data): + test_intensity, test_metadata = time_series_test_data + outputs = time_series_arima( + test_intensity, + test_metadata, + "Protein1", + "Yes", + 1, + 1, + 1, + 0, + 0, + 0, + 0, + 0.5, + "With Grouping" + ) + assert "plots" in outputs + fig = outputs["plots"][0] + if show_figures: + fig.show() + return + +def test_arima_plot_without_grouping(show_figures, time_series_test_data): + test_intensity, test_metadata = time_series_test_data + outputs = time_series_arima( + test_intensity, + test_metadata, + "Protein1", + "No", + 1, + 1, + 1, + 0, + 0, + 0, + 0, + 0.5, + "Without Grouping" + ) + assert "plots" in outputs + fig = outputs["plots"][0] + if show_figures: + fig.show() + return + +def test_arima_plot_invalid_train_size(time_series_test_data): + test_intensity, test_metadata = time_series_test_data + with pytest.raises(ValueError): + time_series_arima( + test_intensity, + test_metadata, + "Protein1", + "No", + 1, + 1, + 1, + 0, + 0, + 0, + 0, + 2, + "With Grouping" + ) + return + + +def test_arima_outputs(time_series_test_data): + test_intensity, test_metadata = time_series_test_data + outputs = time_series_arima( + test_intensity, + test_metadata, + "Protein1", + "No", + 1, + 1, + 1, + 0, + 0, + 0, + 0, + 0.5, + "With Grouping" + ) + assert "scores" in outputs return \ No newline at end of file From ec9b78301330f847bd04dd900a2ea3d9e9e40449 Mon Sep 17 00:00:00 2001 From: AK Date: Sun, 18 Aug 2024 19:05:58 +0200 Subject: [PATCH 38/52] Implemented a dynamic field where the user can select the time column and the group column in each time series methods --- .../time_series_regression_analysis.py | 110 +++++++++++------- protzilla/methods/data_analysis.py | 9 ++ protzilla/methods/importing.py | 3 +- ui/runs/forms/data_analysis.py | 44 +++++++ ui/runs/forms/importing.py | 8 +- 5 files changed, 126 insertions(+), 48 deletions(-) diff --git a/protzilla/data_analysis/time_series_regression_analysis.py b/protzilla/data_analysis/time_series_regression_analysis.py index 0b136a76..5f6240c4 100644 --- a/protzilla/data_analysis/time_series_regression_analysis.py +++ b/protzilla/data_analysis/time_series_regression_analysis.py @@ -27,16 +27,20 @@ def time_series_linear_regression( input_df: pd.DataFrame, metadata_df: pd.DataFrame, + time_column_name: str, protein_group: str, - train_size: float = 0.2, + train_size: float, + grouping_column_name: str, grouping: str = None, ): """ Perform linear regression on the time series data for a given protein group. :param input_df: Peptide dataframe which contains the intensity of each sample :param metadata_df: Metadata dataframe which contains the timestamps + :param time_column_name: The name of the column containing the time values :param protein_group: Protein group to perform the analysis on :param train_size: The proportion of the dataset to include in the test split + :param grouping_column_name: The name of the column containing the grouping information :param grouping: Option to select whether regression should be performed on the entire dataset or separately on the control and experimental groups :return: A dictionary containing the root mean squared error and r2 score for the training and test sets @@ -54,23 +58,23 @@ def time_series_linear_regression( copy=False, ) - input_df["Time"] = input_df["Time"].apply(convert_time_to_hours) + input_df[time_column_name] = input_df[time_column_name].apply(convert_time_to_hours) input_df = input_df.interpolate(method='linear', axis=0) input_df = input_df.sample(frac=1, random_state = 42).reset_index(drop=True) - X = input_df[["Time"]] + X = input_df[[time_column_name]] y = input_df["Intensity"] fig = make_subplots(rows=1, cols=2, column_widths=[0.75, 0.25], vertical_spacing=0.025) scores = [] - if grouping == "With Grouping" and "Group" in input_df.columns: - groups = input_df["Group"].unique() + if grouping == "With Grouping" and grouping_column_name in input_df.columns: + groups = input_df[grouping_column_name].unique() for group in groups: - group_df = input_df[input_df["Group"] == group] - X_group = group_df[["Time"]] + group_df = input_df[input_df[grouping_column_name] == group] + X_group = group_df[[time_column_name]] y_group = group_df["Intensity"] X_train, X_test, y_train, y_test = train_test_split(X_group, y_group, train_size=train_size, shuffle=False) @@ -85,15 +89,15 @@ def time_series_linear_regression( train_r2 = r2_score(y_train, y_pred_train) test_r2 = r2_score(y_test, y_pred_test) - train_df = pd.DataFrame({'Time': X_train['Time'], 'Intensity': y_train, 'Predicted': y_pred_train, 'Type': 'Train'}) - test_df = pd.DataFrame({'Time': X_test['Time'], 'Intensity': y_test, 'Predicted': y_pred_test, 'Type': 'Test'}) + train_df = pd.DataFrame({'Time': X_train[time_column_name], 'Intensity': y_train, 'Predicted': y_pred_train, 'Type': 'Train'}) + test_df = pd.DataFrame({'Time': X_test[time_column_name], 'Intensity': y_test, 'Predicted': y_pred_test, 'Type': 'Test'}) plot_df = pd.concat([train_df, test_df]) color = PROTZILLA_DISCRETE_COLOR_SEQUENCE[color_index % len(PROTZILLA_DISCRETE_COLOR_SEQUENCE)] color_index += 3 fig.add_trace(go.Scatter( - x=plot_df['Time'], + x=plot_df[time_column_name], y=plot_df['Intensity'], mode='markers', name=f'Actual Intensity ({group})', @@ -101,7 +105,7 @@ def time_series_linear_regression( ), row=1, col=1) fig.add_trace(go.Scatter( - x=plot_df['Time'], + x=plot_df[time_column_name], y=plot_df['Predicted'], mode='lines', name=f'Predicted Intensity ({group})', @@ -129,12 +133,12 @@ def time_series_linear_regression( train_r2 = r2_score(y_train, y_pred_train) test_r2 = r2_score(y_test, y_pred_test) - train_df = pd.DataFrame({'Time': X_train['Time'], 'Intensity': y_train, 'Predicted': y_pred_train, 'Type': 'Train'}) - test_df = pd.DataFrame({'Time': X_test['Time'], 'Intensity': y_test, 'Predicted': y_pred_test, 'Type': 'Test'}) + train_df = pd.DataFrame({'Time': X_train[time_column_name], 'Intensity': y_train, 'Predicted': y_pred_train, 'Type': 'Train'}) + test_df = pd.DataFrame({'Time': X_test[time_column_name], 'Intensity': y_test, 'Predicted': y_pred_test, 'Type': 'Test'}) plot_df = pd.concat([train_df, test_df]) fig.add_trace(go.Scatter( - x=plot_df['Time'], + x=plot_df[time_column_name], y=plot_df['Intensity'], mode='markers', name='Actual Intensity', @@ -142,7 +146,7 @@ def time_series_linear_regression( ), row=1, col=1) fig.add_trace(go.Scatter( - x=plot_df['Time'], + x=plot_df[time_column_name], y=plot_df['Predicted'], mode='lines', name='Predicted Intensity', @@ -209,19 +213,27 @@ def time_series_linear_regression( def time_series_ransac_regression( input_df: pd.DataFrame, metadata_df: pd.DataFrame, + time_column_name: str, protein_group: str, max_trials: int, stop_probability: float, loss: str, train_size: float, + grouping_column_name: str, grouping: str, ): """ Perform RANSAC regression on the time series data for a given protein group. :param input_df: Peptide dataframe which contains the intensity of each sample :param metadata_df: Metadata dataframe which contains the timestamps + :param time_column_name: The name of the column containing the time values + :param max_trials: The maximum number of iterations to perform + :param stop_probability: The probability to stop the RANSAC algorithm + :param loss: The loss function to use :param protein_group: Protein group to perform the analysis on :param train_size: The proportion of the dataset to include in the test split + :param grouping_column_name: The name of the column containing the grouping information + :param grouping: Option to select whether regression should be performed on the entire dataset or separately on the control and experimental groups :return: A dictionary containing the root mean squared error and r2 score for the training and test sets """ @@ -239,23 +251,23 @@ def time_series_ransac_regression( copy=False, ) - input_df["Time"] = input_df["Time"].apply(convert_time_to_hours) + input_df[time_column_name] = input_df[time_column_name].apply(convert_time_to_hours) input_df = input_df.interpolate(method='linear', axis=0) input_df = input_df.sample(frac=1, random_state = 42).reset_index(drop=True) - X = input_df[["Time"]] + X = input_df[[time_column_name]] y = input_df["Intensity"] fig = make_subplots(rows=1, cols=2, column_widths=[0.75, 0.25], vertical_spacing=0.025) scores = [] - if grouping == "With Grouping" and "Group" in input_df.columns: - groups = input_df["Group"].unique() + if grouping == "With Grouping" and grouping_column_name in input_df.columns: + groups = input_df[grouping_column_name].unique() for group in groups: - group_df = input_df[input_df["Group"] == group] - X_group = group_df[["Time"]] + group_df = input_df[input_df[grouping_column_name] == group] + X_group = group_df[[time_column_name]] y_group = group_df["Intensity"] X_train, X_test, y_train, y_test = train_test_split(X_group, y_group, train_size=train_size, shuffle=False) @@ -272,8 +284,8 @@ def time_series_ransac_regression( train_r2 = r2_score(y_train[inlier_mask], y_pred_train[inlier_mask]) test_r2 = r2_score(y_test, y_pred_test) - train_df = pd.DataFrame({'Time': X_train["Time"], 'Intensity': y_train, 'Predicted': y_pred_train, 'Type': 'Train'}) - test_df = pd.DataFrame({'Time': X_test["Time"], 'Intensity': y_test, 'Predicted': y_pred_test, 'Type': 'Test'}) + train_df = pd.DataFrame({'Time': X_train[time_column_name], 'Intensity': y_train, 'Predicted': y_pred_train, 'Type': 'Train'}) + test_df = pd.DataFrame({'Time': X_test[time_column_name], 'Intensity': y_test, 'Predicted': y_pred_test, 'Type': 'Test'}) train_df['Inlier'] = inlier_mask test_df['Inlier'] = False plot_df = pd.concat([train_df, test_df]) @@ -328,8 +340,8 @@ def time_series_ransac_regression( train_r2 = r2_score(y_train[inlier_mask], y_pred_train[inlier_mask]) test_r2 = r2_score(y_test, y_pred_test) - train_df = pd.DataFrame({'Time': X_train["Time"], 'Intensity': y_train, 'Predicted': y_pred_train, 'Type': 'Train'}) - test_df = pd.DataFrame({'Time': X_test["Time"], 'Intensity': y_test, 'Predicted': y_pred_test, 'Type': 'Test'}) + train_df = pd.DataFrame({'Time': X_train[time_column_name], 'Intensity': y_train, 'Predicted': y_pred_train, 'Type': 'Train'}) + test_df = pd.DataFrame({'Time': X_test[time_column_name], 'Intensity': y_test, 'Predicted': y_pred_test, 'Type': 'Test'}) train_df['Inlier'] = inlier_mask test_df['Inlier'] = False plot_df = pd.concat([train_df, test_df]) @@ -486,20 +498,24 @@ def adfuller_test( def time_series_auto_arima( input_df: pd.DataFrame, metadata_df: pd.DataFrame, + time_column_name: str, protein_group: str, seasonal: str, m: int, train_size: float, + grouping_column_name: str, grouping: str, ) -> dict: """ Perform an automatic ARIMA model selection on the time series data for a given protein group. :param input_df: Peptide dataframe which contains the intensity of each sample :param metadata_df: Metadata dataframe which contains the timestamps + :param time_column_name: The name of the column containing the time values :param protein_group: Protein group to perform the analysis on :param seasonal: Whether the ARIMA model should be seasonal :param m: The number of time steps for a single seasonal period (ignored if seasonal=False) :param train_size: The proportion of the dataset to include in the test split + :param grouping_column_name: The name of the column containing the grouping information :param grouping: Whether to group the data by the 'Group' column :return: A dictionary containing the root mean squared error and r2 score for the training and test sets @@ -527,19 +543,19 @@ def time_series_auto_arima( fig = make_subplots(rows=1, cols=2, column_widths=[0.7, 0.3]) scores = [] - if grouping == "With Grouping" and "Group" in input_df.columns: - groups = input_df["Group"].unique() + if grouping == "With Grouping" and grouping_column_name in input_df.columns: + groups = input_df[grouping_column_name].unique() for group in groups: - group_df = input_df[input_df["Group"] == group] + group_df = input_df[input_df[grouping_column_name] == group] - group_df["Time"] = group_df["Time"].apply(convert_time_to_hours) + group_df[time_column_name] = group_df[time_column_name].apply(convert_time_to_hours) group_df = group_df.interpolate(method='linear', axis=0) train_df_size = int(len(group_df) * train_size) train_df, test_df = group_df[:train_df_size], group_df[train_df_size:] - train_df = train_df.set_index("Time")["Intensity"] - test_df = test_df.set_index("Time")["Intensity"] + train_df = train_df.set_index(time_column_name)["Intensity"] + test_df = test_df.set_index(time_column_name)["Intensity"] # Fit the ARIMA model model = auto_arima( @@ -599,14 +615,14 @@ def time_series_auto_arima( }) else: - input_df["Time"] = input_df["Time"].apply(convert_time_to_hours) + input_df[time_column_name] = input_df[time_column_name].apply(convert_time_to_hours) input_df = input_df.interpolate(method='linear', axis=0) train_size = int(len(input_df) * train_size) train_df, test_df = input_df[:train_size], input_df[train_size:] - train_df = train_df.set_index("Time")["Intensity"] - test_df = test_df.set_index("Time")["Intensity"] + train_df = train_df.set_index(time_column_name)["Intensity"] + test_df = test_df.set_index(time_column_name)["Intensity"] # Fit the ARIMA model model = auto_arima( @@ -715,6 +731,7 @@ def time_series_auto_arima( def time_series_arima( input_df: pd.DataFrame, metadata_df: pd.DataFrame, + time_column_name: str, protein_group: str, seasonal: str, p: int, @@ -725,6 +742,7 @@ def time_series_arima( Q: int, s: int, train_size: float, + grouping_column_name: str, grouping: str, ) -> dict: @@ -732,12 +750,18 @@ def time_series_arima( Perform ARIMA model selection on the time series data for a given protein group. :param input_df: Peptide dataframe which contains the intensity of each sample :param metadata_df: Metadata dataframe which contains the timestamps + :param time_column_name: The name of the column containing the time values :param protein_group: Protein group to perform the analysis on :param seasonal: Whether the ARIMA model should be seasonal :param p: ARIMA p parameter :param d: ARIMA d parameter :param q: ARIMA q parameter + :param P: ARIMA seasonal P parameter + :param D: ARIMA seasonal D parameter + :param Q: ARIMA seasonal Q parameter + :param s: ARIMA seasonal s parameter :param train_size: The proportion of the dataset to include in the test split + :param grouping_column_name: The name of the column containing the grouping information :param grouping: Whether to group the data by the 'Group' column :return: A dictionary containing the root mean squared error and r2 score for the training and test sets @@ -756,19 +780,19 @@ def time_series_arima( fig = make_subplots(rows=1, cols=2, column_widths=[0.7, 0.3]) scores = [] - if grouping == "With Grouping" and "Group" in input_df.columns: - groups = input_df["Group"].unique() + if grouping == "With Grouping" and grouping_column_name in input_df.columns: + groups = input_df[grouping_column_name].unique() for group in groups: - group_df = input_df[input_df["Group"] == group] + group_df = input_df[input_df[grouping_column_name] == group] - group_df["Time"] = group_df["Time"].apply(convert_time_to_hours) + group_df[time_column_name] = group_df[time_column_name].apply(convert_time_to_hours) group_df = group_df.interpolate(method='linear', axis=0) train_df_size = int(len(group_df) * train_size) train_df, test_df = group_df[:train_df_size], group_df[train_df_size:] - train_df = train_df.set_index("Time")["Intensity"] - test_df = test_df.set_index("Time")["Intensity"] + train_df = train_df.set_index(time_column_name)["Intensity"] + test_df = test_df.set_index(time_column_name)["Intensity"] if seasonal == "Yes": model = ARIMA( @@ -830,14 +854,14 @@ def time_series_arima( }) else: - input_df["Time"] = input_df["Time"].apply(convert_time_to_hours) + input_df[time_column_name] = input_df[time_column_name].apply(convert_time_to_hours) input_df = input_df.interpolate(method='linear', axis=0) train_size = int(len(input_df) * train_size) train_df, test_df = input_df[:train_size], input_df[train_size:] - train_df = train_df.set_index("Time")["Intensity"] - test_df = test_df.set_index("Time")["Intensity"] + train_df = train_df.set_index(time_column_name)["Intensity"] + test_df = test_df.set_index(time_column_name)["Intensity"] if seasonal == "Yes": model = ARIMA( diff --git a/protzilla/methods/data_analysis.py b/protzilla/methods/data_analysis.py index 45b9b96e..229c27b6 100644 --- a/protzilla/methods/data_analysis.py +++ b/protzilla/methods/data_analysis.py @@ -806,8 +806,10 @@ class TimeSeriesLinearRegression(PlotStep): input_keys = [ "input_df", "metadata_df", + "time_column_name", "protein_group", "train_size", + "grouping_column_name", "grouping", ] output_keys = [ @@ -831,11 +833,13 @@ class TimeSeriesRANSACRegression(PlotStep): input_keys = [ "input_df", "metadata_df", + "time_column_name", "protein_group", "max_trials", "stop_probability", "loss", "train_size", + "grouping_column_name", "grouping", ] output_keys = [ @@ -858,6 +862,7 @@ class TimeSeriesADFullerTest(DataAnalysisStep): input_keys = [ "input_df", "metadata_df", + "time_column_name", "protein_group", "alpha", ] @@ -887,10 +892,12 @@ class TimeSeriesAutoARIMA(PlotStep): input_keys = [ "input_df", "metadata_df", + "time_column_name", "protein_group", "seasonal", "m", "train_size", + "grouping_column_name", "grouping", ] output_keys = [ @@ -916,6 +923,7 @@ class TimeSeriesARIMA(PlotStep): input_keys = [ "input_df", "metadata_df", + "time_column_name", "protein_group", "seasonal", "p", @@ -926,6 +934,7 @@ class TimeSeriesARIMA(PlotStep): "Q", "s", "train_size", + "grouping_column_name", "grouping", ] output_keys = [ diff --git a/protzilla/methods/importing.py b/protzilla/methods/importing.py index 7cde1ba0..a7af6d42 100644 --- a/protzilla/methods/importing.py +++ b/protzilla/methods/importing.py @@ -96,7 +96,8 @@ class MetadataColumnAssignment(ImportingStep): display_name = "Metadata column assignment" operation = "metadataimport" method_description = ( - "Assign columns to metadata categories, repeatable for each category" + "Protzilla uses a unique metadata column name to identify certain features in the metadata. " + "This step assigns the metadata columns to the correct feature." ) input_keys = [ diff --git a/ui/runs/forms/data_analysis.py b/ui/runs/forms/data_analysis.py index d7de9ce3..899b56ae 100644 --- a/ui/runs/forms/data_analysis.py +++ b/ui/runs/forms/data_analysis.py @@ -1240,6 +1240,7 @@ class TimeSeriesLinearRegressionForm(MethodForm): choices=[], label="Peptide dataframe", ) + time_column_name = CustomChoiceField(choices=[], label="Time: The column name from metadata that represents time") protein_group = CustomChoiceField( choices=[], label="Protein group: which protein group to perform the linear regression on", @@ -1251,6 +1252,7 @@ class TimeSeriesLinearRegressionForm(MethodForm): step_size=0.1, initial=0.8 ) + grouping_column_name = CustomChoiceField(choices=[], label="Grouping from metadata: The column name from metadata that represents the grouping") grouping = CustomChoiceField( choices= TimeSeriesGrouping, label="Option to select whether regression should be performed on the entire dataset or separately on the control and experimental groups", @@ -1265,6 +1267,13 @@ def fill_form(self, run: Run) -> None: input_df_instance_id = self.data.get( "input_df", self.fields["input_df"].choices[0][0] ) + self.fields[ + "time_column_name" + ].choices = fill_helper.get_choices_for_metadata_non_sample_columns(run) + + self.fields[ + "grouping_column_name" + ].choices = fill_helper.get_choices_for_metadata_non_sample_columns(run) self.fields["protein_group"].choices = fill_helper.to_choices( run.steps.get_step_output( @@ -1281,6 +1290,7 @@ class TimeSeriesRANSACRegressionForm(MethodForm): choices=[], label="Peptide dataframe", ) + time_column_name = CustomChoiceField(choices=[], label="Time: The column name from metadata that represents time") protein_group = CustomChoiceField( choices=[], label="Protein group: which protein group to perform the RANSAC regression on", @@ -1310,6 +1320,7 @@ class TimeSeriesRANSACRegressionForm(MethodForm): step_size=0.1, initial=0.8 ) + grouping_column_name = CustomChoiceField(choices=[], label="Grouping from metadata: The column name from metadata that represents the grouping") grouping = CustomChoiceField( choices= TimeSeriesGrouping, label="Option to select whether regression should be performed on the entire dataset or separately on the control and experimental groups", @@ -1325,6 +1336,14 @@ def fill_form(self, run: Run) -> None: "input_df", self.fields["input_df"].choices[0][0] ) + self.fields[ + "time_column_name" + ].choices = fill_helper.get_choices_for_metadata_non_sample_columns(run) + + self.fields[ + "grouping_column_name" + ].choices = fill_helper.get_choices_for_metadata_non_sample_columns(run) + self.fields["protein_group"].choices = fill_helper.to_choices( run.steps.get_step_output( step_type=Step, @@ -1352,6 +1371,7 @@ class TimeSeriesADFullerTestForm(MethodForm): choices=[], label="Peptide dataframe", ) + time_column_name = CustomChoiceField(choices=[], label="Time: which column from metadata that represents time") protein_group = CustomChoiceField( choices=[], label="Protein group: which protein group to perform the ADFuller test on", @@ -1371,6 +1391,10 @@ def fill_form(self, run: Run) -> None: "input_df", self.fields["input_df"].choices[0][0] ) + self.fields[ + "time_column_name" + ].choices = fill_helper.get_choices_for_metadata_non_sample_columns(run) + self.fields["protein_group"].choices = fill_helper.to_choices( run.steps.get_step_output( step_type=Step, @@ -1391,6 +1415,7 @@ class TimeSeriesAutoARIMAForm(MethodForm): choices=[], label="Peptide dataframe", ) + time_column_name = CustomChoiceField(choices=[], label="Time: The column name from metadata that represents time") protein_group = CustomChoiceField( choices=[], label="Protein group: which protein group to perform the AutoARIMA on", @@ -1413,6 +1438,7 @@ class TimeSeriesAutoARIMAForm(MethodForm): step_size=0.1, initial=0.8, ) + grouping_column_name = CustomChoiceField(choices=[], label="Grouping from metadata: The column name from metadata that represents the grouping") grouping = CustomChoiceField( choices= TimeSeriesGrouping, label="Option to select whether regression should be performed on the entire dataset or separately on the control and experimental groups", @@ -1428,6 +1454,14 @@ def fill_form(self, run: Run) -> None: "input_df", self.fields["input_df"].choices[0][0] ) + self.fields[ + "time_column_name" + ].choices = fill_helper.get_choices_for_metadata_non_sample_columns(run) + + self.fields[ + "grouping_column_name" + ].choices = fill_helper.get_choices_for_metadata_non_sample_columns(run) + self.fields["protein_group"].choices = fill_helper.to_choices( run.steps.get_step_output( step_type=Step, @@ -1450,6 +1484,7 @@ class TimeSeriesARIMAForm(MethodForm): choices=[], label="Peptide dataframe", ) + time_column_name = CustomChoiceField(choices=[], label="Time: The column name from metadata that represents time") protein_group = CustomChoiceField( choices=[], label="Protein group: which protein group to perform the AutoARIMA on", @@ -1512,6 +1547,7 @@ class TimeSeriesARIMAForm(MethodForm): step_size=0.1, initial=0.8, ) + grouping_column_name = CustomChoiceField(choices=[], label="Grouping from metadata: The column name from metadata that represents the grouping") grouping = CustomChoiceField( choices= TimeSeriesGrouping, label="Option to select whether regression should be performed on the entire dataset or separately on the control and experimental groups", @@ -1527,6 +1563,14 @@ def fill_form(self, run: Run) -> None: "input_df", self.fields["input_df"].choices[0][0] ) + self.fields[ + "time_column_name" + ].choices = fill_helper.get_choices_for_metadata_non_sample_columns(run) + + self.fields[ + "grouping_column_name" + ].choices = fill_helper.get_choices_for_metadata_non_sample_columns(run) + self.fields["protein_group"].choices = fill_helper.to_choices( run.steps.get_step_output( step_type=Step, diff --git a/ui/runs/forms/importing.py b/ui/runs/forms/importing.py index cc799be3..be961fa1 100644 --- a/ui/runs/forms/importing.py +++ b/ui/runs/forms/importing.py @@ -93,12 +93,12 @@ class MetadataImportMethodDiannForm(MethodForm): class MetadataColumnAssignmentForm(MethodForm): metadata_required_column = CustomChoiceField( choices=EmptyEnum, - label="Missing, but required metadata columns", + label="Columns in Metadata that needs to be assigned", required=False, ) metadata_unknown_column = CustomChoiceField( choices=EmptyEnum, - label="Existing, but unknown metadata columns", + label="Available columns in Metadata that can be assigned", required=False, ) @@ -111,7 +111,7 @@ def fill_form(self, run: Run) -> None: if metadata is not None: self.fields["metadata_required_column"].choices = [ (col, col) - for col in ["Sample", "Group", "Batch"] + for col in ["Sample", "Group", "Batch", "Time"] if col not in metadata.columns ] if len(self.fields["metadata_required_column"].choices) == 0: @@ -122,7 +122,7 @@ def fill_form(self, run: Run) -> None: unknown_columns = list( metadata.columns[ - ~metadata.columns.isin(["Sample", "Group", "Batch"]) + ~metadata.columns.isin(["Sample", "Group", "Batch", "Time"]) ].unique() ) From 4059f581e0f07c8980b689482d72ee5b4c383486 Mon Sep 17 00:00:00 2001 From: AK Date: Sun, 18 Aug 2024 19:27:57 +0200 Subject: [PATCH 39/52] Fixed Tests --- .../test_time_series_analysis.py | 76 ++++++++++++++++--- 1 file changed, 67 insertions(+), 9 deletions(-) diff --git a/tests/protzilla/data_analysis/test_time_series_analysis.py b/tests/protzilla/data_analysis/test_time_series_analysis.py index 5e139ce7..4962eb22 100644 --- a/tests/protzilla/data_analysis/test_time_series_analysis.py +++ b/tests/protzilla/data_analysis/test_time_series_analysis.py @@ -72,7 +72,15 @@ def time_series_test_data(): def test_linear_regression_plot_with_grouping(show_figures, time_series_test_data): test_intensity, test_metadata = time_series_test_data - outputs = time_series_linear_regression(test_intensity, test_metadata, "Protein1", 0.8,"With Grouping") + outputs = time_series_linear_regression( + test_intensity, + test_metadata, + "Time", + "Protein1", # + 0.8, + "Group", + "With Grouping" + ) assert "plots" in outputs fig = outputs["plots"][0] if show_figures: @@ -81,7 +89,15 @@ def test_linear_regression_plot_with_grouping(show_figures, time_series_test_dat def test_linear_regression_plot_without_grouping(show_figures, time_series_test_data): test_intensity, test_metadata = time_series_test_data - outputs = time_series_linear_regression(test_intensity, test_metadata, "Protein1", 0.8,"Without Grouping") + outputs = time_series_linear_regression( + test_intensity, + test_metadata, + "Time", + "Protein1", # + 0.8, + "Group", + "With Grouping" + ) assert "plots" in outputs fig = outputs["plots"][0] if show_figures: @@ -91,12 +107,28 @@ def test_linear_regression_plot_without_grouping(show_figures, time_series_test_ def test_linear_regression_plot_invalid_train_size(time_series_test_data): test_intensity, test_metadata = time_series_test_data with pytest.raises(ValueError): - time_series_linear_regression(test_intensity, test_metadata, "Protein1", 2, "Without Grouping") + time_series_linear_regression( + test_intensity, + test_metadata, + "Time", + "Protein1", # + 2, + "Group", + "With Grouping" + ) return def test_linear_regression_outputs(time_series_test_data): test_intensity, test_metadata = time_series_test_data - outputs = time_series_linear_regression(test_intensity, test_metadata, "Protein1", 0.8, "Without Grouping") + outputs = time_series_linear_regression( + test_intensity, + test_metadata, + "Time", + "Protein1", # + 0.8, + "Group", + "With Grouping" + ) assert "scores" in outputs return @@ -106,11 +138,13 @@ def test_ransac_regression_plot_with_grouping(show_figures, time_series_test_dat outputs = time_series_ransac_regression( test_intensity, test_metadata, + "Time", "Protein1", 100, 0.99, "absolute_error", 0.8, + "Group", "With Grouping" ) assert "plots" in outputs @@ -124,11 +158,13 @@ def test_ransac_regression_plot_without_grouping(show_figures, time_series_test_ outputs = time_series_ransac_regression( test_intensity, test_metadata, + "Time", "Protein1", 100, 0.99, "absolute_error", 0.8, + "Group", "With Grouping" ) assert "plots" in outputs @@ -143,11 +179,13 @@ def test_ransac_plot_invalid_train_size(time_series_test_data): time_series_ransac_regression( test_intensity, test_metadata, + "Time", "Protein1", 100, 0.99, "absolute_error", 2, + "Group", "With Grouping" ) return @@ -157,11 +195,13 @@ def test_ransac_regression_outputs(time_series_test_data): outputs = time_series_ransac_regression( test_intensity, test_metadata, + "Time", "Protein1", 100, 0.99, "absolute_error", 0.8, + "Group", "With Grouping" ) assert "scores" in outputs @@ -185,10 +225,12 @@ def test_auto_arima_plot_with_grouping(show_figures, time_series_test_data): outputs = time_series_auto_arima( test_intensity, test_metadata, + "Time", "Protein1", "No", 1, 0.5, + "Group", "With Grouping" ) assert "plots" in outputs @@ -199,13 +241,15 @@ def test_auto_arima_plot_with_grouping(show_figures, time_series_test_data): def test_auto_arima_plot_without_grouping(show_figures, time_series_test_data): test_intensity, test_metadata = time_series_test_data - outputs = time_series_auto_arima( + outputs = time_series_auto_arima( test_intensity, test_metadata, + "Time", "Protein1", "No", 1, 0.5, + "Group", "With Grouping" ) assert "plots" in outputs @@ -220,10 +264,12 @@ def test_auto_arima_plot_invalid_train_size(time_series_test_data): time_series_auto_arima( test_intensity, test_metadata, + "Time", "Protein1", "No", 1, 2, + "Group", "With Grouping" ) return @@ -231,13 +277,15 @@ def test_auto_arima_plot_invalid_train_size(time_series_test_data): def test_auto_arima_outputs(time_series_test_data): test_intensity, test_metadata = time_series_test_data - outputs = time_series_auto_arima( + outputs = time_series_auto_arima( test_intensity, test_metadata, + "Time", "Protein1", "No", 1, 0.5, + "Group", "With Grouping" ) assert "scores" in outputs @@ -249,6 +297,7 @@ def test_arima_plot_with_grouping(show_figures, time_series_test_data): outputs = time_series_arima( test_intensity, test_metadata, + "Time", "Protein1", "No", 1, @@ -259,6 +308,7 @@ def test_arima_plot_with_grouping(show_figures, time_series_test_data): 0, 0, 0.5, + "Group", "With Grouping" ) assert "plots" in outputs @@ -272,8 +322,9 @@ def test_arima_plot_seasonal_with_grouping(show_figures, time_series_test_data): outputs = time_series_arima( test_intensity, test_metadata, + "Time", "Protein1", - "Yes", + "No", 1, 1, 1, @@ -282,6 +333,7 @@ def test_arima_plot_seasonal_with_grouping(show_figures, time_series_test_data): 0, 0, 0.5, + "Group", "With Grouping" ) assert "plots" in outputs @@ -295,6 +347,7 @@ def test_arima_plot_without_grouping(show_figures, time_series_test_data): outputs = time_series_arima( test_intensity, test_metadata, + "Time", "Protein1", "No", 1, @@ -305,7 +358,8 @@ def test_arima_plot_without_grouping(show_figures, time_series_test_data): 0, 0, 0.5, - "Without Grouping" + "Group", + "With Grouping" ) assert "plots" in outputs fig = outputs["plots"][0] @@ -319,6 +373,7 @@ def test_arima_plot_invalid_train_size(time_series_test_data): time_series_arima( test_intensity, test_metadata, + "Time", "Protein1", "No", 1, @@ -329,6 +384,7 @@ def test_arima_plot_invalid_train_size(time_series_test_data): 0, 0, 2, + "Group", "With Grouping" ) return @@ -339,6 +395,7 @@ def test_arima_outputs(time_series_test_data): outputs = time_series_arima( test_intensity, test_metadata, + "Time", "Protein1", "No", 1, @@ -349,7 +406,8 @@ def test_arima_outputs(time_series_test_data): 0, 0, 0.5, - "With Grouping" + "Group", + "With Grouping", ) assert "scores" in outputs return \ No newline at end of file From 9f624f870946ee54c3f0ce360f4033edc29c33e3 Mon Sep 17 00:00:00 2001 From: AK Date: Thu, 5 Sep 2024 16:23:14 +0200 Subject: [PATCH 40/52] Fixed Time Series Analysis --- protzilla/constants/colors.py | 61 ++++--- .../data_analysis/time_series_plot_peptide.py | 2 +- .../time_series_regression_analysis.py | 169 +++++++++--------- protzilla/methods/data_analysis.py | 21 ++- ui/runs/forms/data_analysis.py | 60 +++---- 5 files changed, 164 insertions(+), 149 deletions(-) diff --git a/protzilla/constants/colors.py b/protzilla/constants/colors.py index 3f33249b..98daf656 100644 --- a/protzilla/constants/colors.py +++ b/protzilla/constants/colors.py @@ -1,23 +1,44 @@ PROTZILLA_DISCRETE_COLOR_SEQUENCE = [ - #Muted Dark Slate - "#252935", - "#4A536A", - '#a4a9b4', -# Muted Indian Red - "#CE5A5A", - "#B04A4A", - "#EBBDBD", -# Muted Light Steel Blue - "#51646f", - "#87A8B9", - "#B7CAD5", - # Muted Sienna - "#804538", - "#8E3325", - "#471912", - #Muted Sandy Brown - "#715236", - "#E2A46D", - "F0D1B6", + # Set 1: Muted Dark Slate + "#252935", "#3A3F50", "#50556A", "#6B7186", "#858DA2", + # Set 2: Muted Indian Red + "#CE5A5A", "#B24C4C", "#9D3F3F", "#E07272", "#F48D8D", + # Set 3: Muted Light Steel Blue + "#51646F", "#6A7D89", "#7F92A0", "#96A9B8", "#ADBFCD", + # Set 4: Muted Sienna + "#804538", "#6F3C31", "#5F342A", "#A05748", "#B66E5E", + # Set 5: Muted Sandy Brown + "#715236", "#63472F", "#57402B", "#96755A", "#A98575", + # Set 6: Muted Olive + "#6E6B48", "#5D5B3E", "#4E4D36", "#89875C", "#A1A16E", + # Set 7: Muted Teal + "#3B6B6A", "#315B5B", "#274C4C", "#507E7E", "#6B9898", + # Set 8: Muted Taupe + "#8B7E74", "#776F65", "#675E56", "#A09085", "#B9AAA1", + # Set 9: Muted Burgundy + "#7B3A4F", "#6A3345", "#582C3C", "#925664", "#A8737E", + # Set 10: Muted Forest Green + "#3D5047", "#35453E", "#2D3B35", "#5F7267", "#7B8D80", + # Set 11: Muted Navy + "#2F3E4C", "#283442", "#222B38", "#485669", "#627185", + # Set 12: Muted Mustard + "#BFA054", "#A98F4A", "#927D3F", "#D7BA75", "#E2CD96", + # Set 13: Muted Dusty Rose + "#C18394", "#AA727E", "#93616C", "#D69BA7", "#E4B8C2", + # Set 14: Muted Lavender + "#8A729D", "#7A638C", "#6A547C", "#A591B3", "#BDA9C8", + # Set 15: Muted Charcoal + "#404040", "#353535", "#2B2B2B", "#585858", "#707070", + # Set 16: Muted Emerald Green + "#4D7456", "#426448", "#37563B", "#6A9177", "#85A990", + # Set 17: Muted Peach + "#D89B83", "#C2866F", "#A7725E", "#E3B39C", "#ECC7B6", + # Set 18: Muted Plum + "#704F6E", "#634464", "#563A59", "#876A87", "#A18AA1", + # Set 19: Muted Periwinkle + "#7E8DAF", "#6F7B98", "#616A82", "#97A3BF", "#B0B9D1", + # Set 20: Muted Coral + "#CC7A5E", "#B26951", "#9A5A45", "#DD937C", "#EBAA99" ] + PROTZILLA_DISCRETE_COLOR_OUTLIER_SEQUENCE = ["#4A536A", "#CE5A5A"] diff --git a/protzilla/data_analysis/time_series_plot_peptide.py b/protzilla/data_analysis/time_series_plot_peptide.py index 5f5ac64e..04f95c32 100644 --- a/protzilla/data_analysis/time_series_plot_peptide.py +++ b/protzilla/data_analysis/time_series_plot_peptide.py @@ -24,7 +24,7 @@ def time_series_plot_peptide( ) -> dict: """ A function to create a graph visualising protein quantifications across all samples - as a line diagram using retention time. It's possible to select one proteingroup + as a line diagram using time. It's possible to select one proteingroup that will be displayed in orange and choose a similarity measurement with a similarity score to get all proteingroups that are similar displayed in another color in this line diagram. All other proteingroups are displayed in the background as a grey polygon. diff --git a/protzilla/data_analysis/time_series_regression_analysis.py b/protzilla/data_analysis/time_series_regression_analysis.py index 5f6240c4..f579bb03 100644 --- a/protzilla/data_analysis/time_series_regression_analysis.py +++ b/protzilla/data_analysis/time_series_regression_analysis.py @@ -5,6 +5,7 @@ import plotly.graph_objects as go from protzilla.data_analysis.time_series_helper import convert_time_to_hours +from protzilla.utilities import default_intensity_column from protzilla.constants.colors import PROTZILLA_DISCRETE_COLOR_SEQUENCE from sklearn.linear_model import LinearRegression, RANSACRegressor @@ -25,7 +26,7 @@ def time_series_linear_regression( - input_df: pd.DataFrame, + intensity_df: pd.DataFrame, metadata_df: pd.DataFrame, time_column_name: str, protein_group: str, @@ -35,7 +36,7 @@ def time_series_linear_regression( ): """ Perform linear regression on the time series data for a given protein group. - :param input_df: Peptide dataframe which contains the intensity of each sample + :param intensity_df: Peptide dataframe which contains the intensity of each sample :param metadata_df: Metadata dataframe which contains the timestamps :param time_column_name: The name of the column containing the time values :param protein_group: Protein group to perform the analysis on @@ -49,33 +50,33 @@ def time_series_linear_regression( if train_size < 0 or train_size > 1: raise ValueError("Test size should be between 0 and 1") - input_df = input_df[input_df['Protein ID'] == protein_group] - - input_df = pd.merge( - left=input_df, + intensity_df = intensity_df[intensity_df['Protein ID'] == protein_group] + intensity_column_name = default_intensity_column(intensity_df) + intensity_df = pd.merge( + left=intensity_df, right=metadata_df, on="Sample", copy=False, ) - input_df[time_column_name] = input_df[time_column_name].apply(convert_time_to_hours) - input_df = input_df.interpolate(method='linear', axis=0) + intensity_df[time_column_name] = intensity_df[str(time_column_name)].apply(convert_time_to_hours) + intensity_df = intensity_df.interpolate(method='linear', axis=0) - input_df = input_df.sample(frac=1, random_state = 42).reset_index(drop=True) + intensity_df = intensity_df.sample(frac=1, random_state = 42).reset_index(drop=True) - X = input_df[[time_column_name]] - y = input_df["Intensity"] + X = intensity_df[[time_column_name]] + y = intensity_df[intensity_column_name] fig = make_subplots(rows=1, cols=2, column_widths=[0.75, 0.25], vertical_spacing=0.025) scores = [] - if grouping == "With Grouping" and grouping_column_name in input_df.columns: - groups = input_df[grouping_column_name].unique() + if grouping == "With Grouping" and grouping_column_name in intensity_df.columns: + groups = intensity_df[grouping_column_name].unique() for group in groups: - group_df = input_df[input_df[grouping_column_name] == group] + group_df = intensity_df[intensity_df[grouping_column_name] == group] X_group = group_df[[time_column_name]] - y_group = group_df["Intensity"] + y_group = group_df[intensity_column_name] X_train, X_test, y_train, y_test = train_test_split(X_group, y_group, train_size=train_size, shuffle=False) model = LinearRegression() @@ -94,7 +95,7 @@ def time_series_linear_regression( plot_df = pd.concat([train_df, test_df]) color = PROTZILLA_DISCRETE_COLOR_SEQUENCE[color_index % len(PROTZILLA_DISCRETE_COLOR_SEQUENCE)] - color_index += 3 + color_index += 5 fig.add_trace(go.Scatter( x=plot_df[time_column_name], @@ -150,7 +151,7 @@ def time_series_linear_regression( y=plot_df['Predicted'], mode='lines', name='Predicted Intensity', - line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[2]) + line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[5]) ), row=1, col=1) scores.append({ @@ -211,7 +212,7 @@ def time_series_linear_regression( def time_series_ransac_regression( - input_df: pd.DataFrame, + intensity_df: pd.DataFrame, metadata_df: pd.DataFrame, time_column_name: str, protein_group: str, @@ -224,7 +225,7 @@ def time_series_ransac_regression( ): """ Perform RANSAC regression on the time series data for a given protein group. - :param input_df: Peptide dataframe which contains the intensity of each sample + :param intensity_df: Peptide dataframe which contains the intensity of each sample :param metadata_df: Metadata dataframe which contains the timestamps :param time_column_name: The name of the column containing the time values :param max_trials: The maximum number of iterations to perform @@ -242,33 +243,34 @@ def time_series_ransac_regression( if train_size < 0 or train_size > 1: raise ValueError("Test size should be between 0 and 1") - input_df = input_df[input_df['Protein ID'] == protein_group] + intensity_df = intensity_df[intensity_df['Protein ID'] == protein_group] + intensity_column_name = default_intensity_column(intensity_df) - input_df = pd.merge( - left=input_df, + intensity_df = pd.merge( + left=intensity_df, right=metadata_df, on="Sample", copy=False, ) - input_df[time_column_name] = input_df[time_column_name].apply(convert_time_to_hours) - input_df = input_df.interpolate(method='linear', axis=0) + intensity_df[time_column_name] = intensity_df[str(time_column_name)].apply(convert_time_to_hours) + intensity_df = intensity_df.interpolate(method='linear', axis=0) - input_df = input_df.sample(frac=1, random_state = 42).reset_index(drop=True) + intensity_df = intensity_df.sample(frac=1, random_state = 42).reset_index(drop=True) - X = input_df[[time_column_name]] - y = input_df["Intensity"] + X = intensity_df[[time_column_name]] + y = intensity_df[intensity_column_name] fig = make_subplots(rows=1, cols=2, column_widths=[0.75, 0.25], vertical_spacing=0.025) scores = [] - if grouping == "With Grouping" and grouping_column_name in input_df.columns: - groups = input_df[grouping_column_name].unique() + if grouping == "With Grouping" and grouping_column_name in intensity_df.columns: + groups = intensity_df[grouping_column_name].unique() for group in groups: - group_df = input_df[input_df[grouping_column_name] == group] + group_df = intensity_df[intensity_df[grouping_column_name] == group] X_group = group_df[[time_column_name]] - y_group = group_df["Intensity"] + y_group = group_df[intensity_column_name] X_train, X_test, y_train, y_test = train_test_split(X_group, y_group, train_size=train_size, shuffle=False) model = RANSACRegressor(max_trials = max_trials, stop_probability = stop_probability, loss = loss, base_estimator=LinearRegression()) @@ -304,7 +306,7 @@ def time_series_ransac_regression( y=plot_df['Predicted'], mode='lines', name='Predicted Intensity', - line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[color_index + 1]) + line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[color_index + 2]) ), row=1, col=1) fig.add_trace(go.Scatter( @@ -312,10 +314,10 @@ def time_series_ransac_regression( y=plot_df[plot_df['Inlier'] == False]['Intensity'], mode='markers', name='Outliers', - marker=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[color_index + 2]) + marker=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[color_index + 4]) ), row=1, col=1) - color_index += 3 + color_index += 5 scores.append({ 'group': group, @@ -368,7 +370,7 @@ def time_series_ransac_regression( y=plot_df[plot_df['Inlier'] == False]['Intensity'], mode='markers', name='Outliers', - marker=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[2]) + marker=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[3]) ), row=1, col=1) scores.append({ @@ -429,14 +431,14 @@ def time_series_ransac_regression( def adfuller_test( - input_df: pd.DataFrame, + intensity_df: pd.DataFrame, metadata_df: pd.DataFrame, protein_group: str, alpha: float = 0.05, ) -> dict: """ Perform the Augmented Dickey-Fuller test to check for stationarity in a time series. - :param input_df: The dataframe containing the time series data. + :param intensity_df: The dataframe containing the time series data. :param metadata_df: The dataframe containing the metadata. :param protein_group: The protein group to perform the test on. :param alpha: The significance level for the test (default is 0.05). @@ -450,19 +452,20 @@ def adfuller_test( """ messages = [] - input_df = input_df[input_df['Protein ID'] == protein_group] + intensity_df = intensity_df[intensity_df['Protein ID'] == protein_group] + intensity_column_name = default_intensity_column(intensity_df) - input_df = pd.merge( - left=input_df, + intensity_df = pd.merge( + left=intensity_df, right=metadata_df, on="Sample", copy=False, ) - input_df = input_df["Intensity"].dropna() + intensity_df = intensity_df[intensity_column_name].dropna() # Perform the ADF test - result = adfuller(input_df) + result = adfuller(intensity_df) test_statistic = result[0] p_value = result[1] critical_values = result[4] @@ -496,7 +499,7 @@ def adfuller_test( def time_series_auto_arima( - input_df: pd.DataFrame, + intensity_df: pd.DataFrame, metadata_df: pd.DataFrame, time_column_name: str, protein_group: str, @@ -508,7 +511,7 @@ def time_series_auto_arima( ) -> dict: """ Perform an automatic ARIMA model selection on the time series data for a given protein group. - :param input_df: Peptide dataframe which contains the intensity of each sample + :param intensity_df: Peptide dataframe which contains the intensity of each sample :param metadata_df: Metadata dataframe which contains the timestamps :param time_column_name: The name of the column containing the time values :param protein_group: Protein group to perform the analysis on @@ -530,11 +533,12 @@ def time_series_auto_arima( else: seasonal = False - input_df = input_df[input_df['Protein ID'] == protein_group] - input_df = input_df.sample(frac=1, random_state=42).reset_index(drop=True) + intensity_df = intensity_df[intensity_df['Protein ID'] == protein_group] + intensity_df = intensity_df.sample(frac=1, random_state=42).reset_index(drop=True) + intensity_column_name = default_intensity_column(intensity_df) - input_df = pd.merge( - left=input_df, + intensity_df = pd.merge( + left=intensity_df, right=metadata_df, on="Sample", copy=False, @@ -543,19 +547,19 @@ def time_series_auto_arima( fig = make_subplots(rows=1, cols=2, column_widths=[0.7, 0.3]) scores = [] - if grouping == "With Grouping" and grouping_column_name in input_df.columns: - groups = input_df[grouping_column_name].unique() + if grouping == "With Grouping" and grouping_column_name in intensity_df.columns: + groups = intensity_df[grouping_column_name].unique() for group in groups: - group_df = input_df[input_df[grouping_column_name] == group] + group_df = intensity_df[intensity_df[grouping_column_name] == group] - group_df[time_column_name] = group_df[time_column_name].apply(convert_time_to_hours) + group_df[time_column_name] = group_df[str(time_column_name)].apply(convert_time_to_hours) group_df = group_df.interpolate(method='linear', axis=0) train_df_size = int(len(group_df) * train_size) train_df, test_df = group_df[:train_df_size], group_df[train_df_size:] - train_df = train_df.set_index(time_column_name)["Intensity"] - test_df = test_df.set_index(time_column_name)["Intensity"] + train_df = train_df.set_index(time_column_name)[intensity_column_name] + test_df = test_df.set_index(time_column_name)[intensity_column_name] # Fit the ARIMA model model = auto_arima( @@ -593,7 +597,7 @@ def time_series_auto_arima( y=forecast, mode='markers', name='Predicted Intensity', - line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[color_index + 2]) + line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[color_index + 3]) ), row=1, col=1) fig.add_trace(go.Scatter( @@ -601,10 +605,10 @@ def time_series_auto_arima( y = forecast_plot, mode = 'lines', name = 'Mean Predicted Intensity', - line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[color_index + 2]) + line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[color_index + 3]) ), row=1, col=1) - color_index += 3 + color_index += 5 scores.append({ 'group': group, @@ -615,14 +619,14 @@ def time_series_auto_arima( }) else: - input_df[time_column_name] = input_df[time_column_name].apply(convert_time_to_hours) - input_df = input_df.interpolate(method='linear', axis=0) + intensity_df[time_column_name] = intensity_df[str(time_column_name)].apply(convert_time_to_hours) + intensity_df = intensity_df.interpolate(method='linear', axis=0) - train_size = int(len(input_df) * train_size) - train_df, test_df = input_df[:train_size], input_df[train_size:] + train_size = int(len(intensity_df) * train_size) + train_df, test_df = intensity_df[:train_size], intensity_df[train_size:] - train_df = train_df.set_index(time_column_name)["Intensity"] - test_df = test_df.set_index(time_column_name)["Intensity"] + train_df = train_df.set_index(time_column_name)[intensity_column_name] + test_df = test_df.set_index(time_column_name)[intensity_column_name] # Fit the ARIMA model model = auto_arima( @@ -729,7 +733,7 @@ def time_series_auto_arima( def time_series_arima( - input_df: pd.DataFrame, + intensity_df: pd.DataFrame, metadata_df: pd.DataFrame, time_column_name: str, protein_group: str, @@ -748,7 +752,7 @@ def time_series_arima( """ Perform ARIMA model selection on the time series data for a given protein group. - :param input_df: Peptide dataframe which contains the intensity of each sample + :param intensity_df: Peptide dataframe which contains the intensity of each sample :param metadata_df: Metadata dataframe which contains the timestamps :param time_column_name: The name of the column containing the time values :param protein_group: Protein group to perform the analysis on @@ -772,27 +776,28 @@ def time_series_arima( if train_size < 0 or train_size > 1: raise ValueError("Train size should be between 0 and 1") - input_df = input_df[input_df['Protein ID'] == protein_group] - input_df = input_df.sample(frac=1, random_state=42).reset_index(drop=True) + intensity_df = intensity_df[intensity_df['Protein ID'] == protein_group] + intensity_df = intensity_df.sample(frac=1, random_state=42).reset_index(drop=True) + intensity_column_name = default_intensity_column(intensity_df) - input_df = pd.merge(left=input_df, right=metadata_df, on="Sample", copy=False) + intensity_df = pd.merge(left=intensity_df, right=metadata_df, on="Sample", copy=False) fig = make_subplots(rows=1, cols=2, column_widths=[0.7, 0.3]) scores = [] - if grouping == "With Grouping" and grouping_column_name in input_df.columns: - groups = input_df[grouping_column_name].unique() + if grouping == "With Grouping" and grouping_column_name in intensity_df.columns: + groups = intensity_df[grouping_column_name].unique() for group in groups: - group_df = input_df[input_df[grouping_column_name] == group] + group_df = intensity_df[intensity_df[grouping_column_name] == group] - group_df[time_column_name] = group_df[time_column_name].apply(convert_time_to_hours) + group_df[time_column_name] = group_df[str(time_column_name)].apply(convert_time_to_hours) group_df = group_df.interpolate(method='linear', axis=0) train_df_size = int(len(group_df) * train_size) train_df, test_df = group_df[:train_df_size], group_df[train_df_size:] - train_df = train_df.set_index(time_column_name)["Intensity"] - test_df = test_df.set_index(time_column_name)["Intensity"] + train_df = train_df.set_index(time_column_name)[intensity_column_name] + test_df = test_df.set_index(time_column_name)[intensity_column_name] if seasonal == "Yes": model = ARIMA( @@ -843,7 +848,7 @@ def time_series_arima( line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[color_index + 2]) ), row=1, col=1) - color_index += 3 + color_index += 5 scores.append({ 'group': group, @@ -854,14 +859,14 @@ def time_series_arima( }) else: - input_df[time_column_name] = input_df[time_column_name].apply(convert_time_to_hours) - input_df = input_df.interpolate(method='linear', axis=0) + intensity_df[time_column_name] = intensity_df[str(time_column_name)].apply(convert_time_to_hours) + intensity_df = intensity_df.interpolate(method='linear', axis=0) - train_size = int(len(input_df) * train_size) - train_df, test_df = input_df[:train_size], input_df[train_size:] + train_size = int(len(intensity_df) * train_size) + train_df, test_df = intensity_df[:train_size], intensity_df[train_size:] - train_df = train_df.set_index(time_column_name)["Intensity"] - test_df = test_df.set_index(time_column_name)["Intensity"] + train_df = train_df.set_index(time_column_name)[intensity_column_name] + test_df = test_df.set_index(time_column_name)[intensity_column_name] if seasonal == "Yes": model = ARIMA( @@ -906,7 +911,7 @@ def time_series_arima( y=forecast_plot, mode='lines', name='Mean Predicted Intensity', - line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[3]) + line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[4]) ), row=1, col=1) scores.append({ diff --git a/protzilla/methods/data_analysis.py b/protzilla/methods/data_analysis.py index 229c27b6..9f5261d1 100644 --- a/protzilla/methods/data_analysis.py +++ b/protzilla/methods/data_analysis.py @@ -804,7 +804,7 @@ class TimeSeriesLinearRegression(PlotStep): "The p-values are corrected for multiple testing.") input_keys = [ - "input_df", + "intensity_df", "metadata_df", "time_column_name", "protein_group", @@ -820,7 +820,7 @@ def method(self, inputs: dict) -> dict: return time_series_linear_regression(**inputs) def insert_dataframes(self, steps: StepManager, inputs) -> dict: - inputs["input_df"] = steps.get_step_output(Step, "peptide_df", inputs["input_df"]) + inputs["intensity_df"] = steps.protein_df inputs["metadata_df"] = steps.metadata_df return inputs @@ -831,7 +831,7 @@ class TimeSeriesRANSACRegression(PlotStep): method_description = " Perform RANSAC regression on the time series data for a given protein group." input_keys = [ - "input_df", + "intensity_df", "metadata_df", "time_column_name", "protein_group", @@ -849,7 +849,7 @@ def method(self, inputs: dict) -> dict: return time_series_ransac_regression(**inputs) def insert_dataframes(self, steps: StepManager, inputs) -> dict: - inputs["input_df"] = steps.get_step_output(Step, "peptide_df", inputs["input_df"]) + inputs["intensity_df"] = steps.protein_df inputs["metadata_df"] = steps.metadata_df return inputs @@ -860,9 +860,8 @@ class TimeSeriesADFullerTest(DataAnalysisStep): method_description = "Perform Augmented Dickey-Fuller test on the time series data for a given protein group." input_keys = [ - "input_df", + "intensity_df", "metadata_df", - "time_column_name", "protein_group", "alpha", ] @@ -877,7 +876,7 @@ def method(self, inputs: dict) -> dict: return adfuller_test(**inputs) def insert_dataframes(self, steps: StepManager, inputs) -> dict: - inputs["input_df"] = steps.get_step_output(Step, "peptide_df", inputs["input_df"]) + inputs["intensity_df"] = steps.protein_df inputs["metadata_df"] = steps.metadata_df return inputs @@ -890,7 +889,7 @@ class TimeSeriesAutoARIMA(PlotStep): ) input_keys = [ - "input_df", + "intensity_df", "metadata_df", "time_column_name", "protein_group", @@ -908,7 +907,7 @@ def method(self, inputs: dict) -> dict: return time_series_auto_arima(**inputs) def insert_dataframes(self, steps: StepManager, inputs) -> dict: - inputs["input_df"] = steps.get_step_output(Step, "peptide_df", inputs["input_df"]) + inputs["intensity_df"] = steps.protein_df inputs["metadata_df"] = steps.metadata_df return inputs @@ -921,7 +920,7 @@ class TimeSeriesARIMA(PlotStep): ) input_keys = [ - "input_df", + "intensity_df", "metadata_df", "time_column_name", "protein_group", @@ -945,7 +944,7 @@ def method(self, inputs: dict) -> dict: return time_series_arima(**inputs) def insert_dataframes(self, steps: StepManager, inputs) -> dict: - inputs["input_df"] = steps.get_step_output(Step, "peptide_df", inputs["input_df"]) + inputs["intensity_df"] = steps.protein_df inputs["metadata_df"] = steps.metadata_df return inputs diff --git a/ui/runs/forms/data_analysis.py b/ui/runs/forms/data_analysis.py index 899b56ae..12328911 100644 --- a/ui/runs/forms/data_analysis.py +++ b/ui/runs/forms/data_analysis.py @@ -1236,9 +1236,9 @@ def fill_form(self, run: Run) -> None: class TimeSeriesLinearRegressionForm(MethodForm): is_dynamic = True - input_df = CustomChoiceField( + intensity_df = CustomChoiceField( choices=[], - label="Peptide dataframe", + label="Intensity dataframe", ) time_column_name = CustomChoiceField(choices=[], label="Time: The column name from metadata that represents time") protein_group = CustomChoiceField( @@ -1261,11 +1261,11 @@ class TimeSeriesLinearRegressionForm(MethodForm): def fill_form(self, run: Run) -> None: - self.fields["input_df"].choices = fill_helper.get_choices_for_peptide_df_steps( + self.fields["intensity_df"].choices = fill_helper.get_choices_for_protein_df_steps( run ) input_df_instance_id = self.data.get( - "input_df", self.fields["input_df"].choices[0][0] + "intensity_df", self.fields["intensity_df"].choices[0][0] ) self.fields[ "time_column_name" @@ -1278,7 +1278,7 @@ def fill_form(self, run: Run) -> None: self.fields["protein_group"].choices = fill_helper.to_choices( run.steps.get_step_output( step_type=Step, - output_key="peptide_df", + output_key="protein_df", instance_identifier=input_df_instance_id, )["Protein ID"].unique() ) @@ -1286,9 +1286,9 @@ def fill_form(self, run: Run) -> None: class TimeSeriesRANSACRegressionForm(MethodForm): is_dynamic = True - input_df = CustomChoiceField( + intensity_df = CustomChoiceField( choices=[], - label="Peptide dataframe", + label="Intensity dataframe", ) time_column_name = CustomChoiceField(choices=[], label="Time: The column name from metadata that represents time") protein_group = CustomChoiceField( @@ -1329,13 +1329,12 @@ class TimeSeriesRANSACRegressionForm(MethodForm): def fill_form(self, run: Run) -> None: - self.fields["input_df"].choices = fill_helper.get_choices_for_peptide_df_steps( + self.fields["intensity_df"].choices = fill_helper.get_choices_for_protein_df_steps( run ) input_df_instance_id = self.data.get( - "input_df", self.fields["input_df"].choices[0][0] + "intensity_df", self.fields["intensity_df"].choices[0][0] ) - self.fields[ "time_column_name" ].choices = fill_helper.get_choices_for_metadata_non_sample_columns(run) @@ -1347,7 +1346,7 @@ def fill_form(self, run: Run) -> None: self.fields["protein_group"].choices = fill_helper.to_choices( run.steps.get_step_output( step_type=Step, - output_key="peptide_df", + output_key="protein_df", instance_identifier=input_df_instance_id, )["Protein ID"].unique() ) @@ -1367,11 +1366,10 @@ class TimeSeriesADFullerTestForm(MethodForm): "JASA. Journal of the American Statistical Association. 74. 10.2307/2286348. " ), ) - input_df = CustomChoiceField( + intensity_df = CustomChoiceField( choices=[], - label="Peptide dataframe", + label="Intensity dataframe", ) - time_column_name = CustomChoiceField(choices=[], label="Time: which column from metadata that represents time") protein_group = CustomChoiceField( choices=[], label="Protein group: which protein group to perform the ADFuller test on", @@ -1384,26 +1382,20 @@ class TimeSeriesADFullerTestForm(MethodForm): ) def fill_form(self, run: Run) -> None: - self.fields["input_df"].choices = fill_helper.get_choices_for_peptide_df_steps( + self.fields["intensity_df"].choices = fill_helper.get_choices_for_protein_df_steps( run ) input_df_instance_id = self.data.get( - "input_df", self.fields["input_df"].choices[0][0] + "intensity_df", self.fields["intensity_df"].choices[0][0] ) - - self.fields[ - "time_column_name" - ].choices = fill_helper.get_choices_for_metadata_non_sample_columns(run) - self.fields["protein_group"].choices = fill_helper.to_choices( run.steps.get_step_output( step_type=Step, - output_key="peptide_df", + output_key="protein_df", instance_identifier=input_df_instance_id, )["Protein ID"].unique() ) - class TimeSeriesAutoARIMAForm(MethodForm): is_dynamic = True model_info = TextDisplayField( @@ -1411,9 +1403,9 @@ class TimeSeriesAutoARIMAForm(MethodForm): text=( ), ) - input_df = CustomChoiceField( + intensity_df = CustomChoiceField( choices=[], - label="Peptide dataframe", + label="Intensity dataframe", ) time_column_name = CustomChoiceField(choices=[], label="Time: The column name from metadata that represents time") protein_group = CustomChoiceField( @@ -1447,13 +1439,12 @@ class TimeSeriesAutoARIMAForm(MethodForm): def fill_form(self, run: Run) -> None: - self.fields["input_df"].choices = fill_helper.get_choices_for_peptide_df_steps( + self.fields["intensity_df"].choices = fill_helper.get_choices_for_protein_df_steps( run ) input_df_instance_id = self.data.get( - "input_df", self.fields["input_df"].choices[0][0] + "intensity_df", self.fields["intensity_df"].choices[0][0] ) - self.fields[ "time_column_name" ].choices = fill_helper.get_choices_for_metadata_non_sample_columns(run) @@ -1465,7 +1456,7 @@ def fill_form(self, run: Run) -> None: self.fields["protein_group"].choices = fill_helper.to_choices( run.steps.get_step_output( step_type=Step, - output_key="peptide_df", + output_key="protein_df", instance_identifier=input_df_instance_id, )["Protein ID"].unique() ) @@ -1480,9 +1471,9 @@ class TimeSeriesARIMAForm(MethodForm): ), ) """ - input_df = CustomChoiceField( + intensity_df = CustomChoiceField( choices=[], - label="Peptide dataframe", + label="Intensity dataframe", ) time_column_name = CustomChoiceField(choices=[], label="Time: The column name from metadata that represents time") protein_group = CustomChoiceField( @@ -1556,13 +1547,12 @@ class TimeSeriesARIMAForm(MethodForm): def fill_form(self, run: Run) -> None: - self.fields["input_df"].choices = fill_helper.get_choices_for_peptide_df_steps( + self.fields["intensity_df"].choices = fill_helper.get_choices_for_protein_df_steps( run ) input_df_instance_id = self.data.get( - "input_df", self.fields["input_df"].choices[0][0] + "intensity_df", self.fields["intensity_df"].choices[0][0] ) - self.fields[ "time_column_name" ].choices = fill_helper.get_choices_for_metadata_non_sample_columns(run) @@ -1574,7 +1564,7 @@ def fill_form(self, run: Run) -> None: self.fields["protein_group"].choices = fill_helper.to_choices( run.steps.get_step_output( step_type=Step, - output_key="peptide_df", + output_key="protein_df", instance_identifier=input_df_instance_id, )["Protein ID"].unique() ) From 671633de78589095c7442a3fed234221845b672d Mon Sep 17 00:00:00 2001 From: AK Date: Sat, 14 Sep 2024 13:05:27 +0200 Subject: [PATCH 41/52] Implemented TMT data import for PROTzilla --- protzilla/importing/ms_data_import.py | 73 +++++++++++++++++++++++++++ protzilla/methods/importing.py | 12 +++++ ui/runs/forms/importing.py | 18 +++++++ 3 files changed, 103 insertions(+) diff --git a/protzilla/importing/ms_data_import.py b/protzilla/importing/ms_data_import.py index fc5cc105..595aacbb 100644 --- a/protzilla/importing/ms_data_import.py +++ b/protzilla/importing/ms_data_import.py @@ -123,6 +123,79 @@ def diann_import(file_path, map_to_uniprot=False, aggregation_method: str ="Sum" return dict(messages=[dict(level=logging.ERROR, msg=msg, trace=format_trace(traceback.format_exception(e)))]) +def tmt_data_import( + file_path: str, intensity_name: str = "Reporter intensity", map_to_uniprot=False, + aggregation_method: str = "Sum" +) -> dict: + try: + # Read the file + df = pd.read_csv( + file_path, + sep="\t", + low_memory=False, + na_values=["", 0], + keep_default_na=True, + ) + + # Debug step: Print the column names to check the actual names in the data + print("Columns in the file:", df.columns.tolist()) + + # Try to handle different possible names for the 'Protein ID' column + protein_column = None + possible_names = ["Majority protein IDs"] + + for name in possible_names: + if name in df.columns: + protein_column = name + break + + if protein_column is None: + raise KeyError("No valid 'Protein ID' or equivalent column found in the data.") + + df = df.rename(columns={protein_column: "Protein ID"}) + + # Extract protein or gene identifiers + protein_groups = df["Protein ID"] + + # Drop columns that are not relevant + columns_to_drop = [ + "Combined Spectral Count", + "Combined Unique Spectral Count", + "Combined Total Spectral Count", + ] + existing_columns = set(df.columns) + columns_to_drop_existing = [col for col in columns_to_drop if col in existing_columns] + df = df.drop(columns=columns_to_drop_existing) + print("Columns after dropping irrelevant ones:", df.columns.tolist()) + + # Use regex to find columns matching the TMT pattern with visits for both NP and T1D samples + intensity_columns = df.filter( + regex=f"{intensity_name} \\d+ (NP\\d{{2}}|TD\\d{{2}})", axis=1 + ) + + # Debug step: Print the intensity columns that were matched + print("Matched intensity columns:", intensity_columns.columns.tolist()) + + # Rename columns to the format 'NPXX_1' or 'T1DXX_1' + intensity_columns.columns = [ + re.sub(f"{intensity_name} (\\d+) (NP\\d{{2}}|TD\\d{{2}})", + lambda m: f"{m.group(2)}_{int(m.group(1)) + 1}", col) for col in intensity_columns.columns + ] + + # Debug step: Print the renamed intensity columns + print("Renamed intensity columns:", intensity_columns.columns.tolist()) + # Add back the protein identifiers to the dataframe + intensity_columns = intensity_columns.assign(**{"Protein ID": protein_groups}) + + # Apply transformation, clean-up, or aggregation (depending on your logic) + return transform_and_clean(intensity_columns, intensity_name, map_to_uniprot, aggregation_method) + + except Exception as e: + msg = f"An error occurred while reading the file: {e.__class__.__name__} {e}. Please provide a valid TMT data file." + return dict(messages=[dict(level=logging.ERROR, msg=msg, trace=format_trace(traceback.format_exception(e)))]) + + + def transform_and_clean( df: pd.DataFrame, intensity_name: str, map_to_uniprot: bool, aggregation_method: str ="Sum" ) -> dict: diff --git a/protzilla/methods/importing.py b/protzilla/methods/importing.py index a7af6d42..f94218f4 100644 --- a/protzilla/methods/importing.py +++ b/protzilla/methods/importing.py @@ -9,6 +9,7 @@ diann_import, max_quant_import, ms_fragger_import, + tmt_data_import, ) from protzilla.importing.peptide_import import peptide_import, evidence_import from protzilla.steps import Step, StepManager @@ -60,6 +61,17 @@ def method(self, inputs): return ms_fragger_import(**inputs) +class TMTImport(ImportingStep): + display_name = "TMT" + operation = "msdataimport" + method_description = "TMT data import" + input_keys = ["file_path", "map_to_uniprot", "aggregation_method"] + output_keys = ["protein_df"] + + def method(self, inputs): + return tmt_data_import(**inputs) + + class MetadataImport(ImportingStep): display_name = "Metadata import" operation = "metadataimport" diff --git a/ui/runs/forms/importing.py b/ui/runs/forms/importing.py index be961fa1..07c8d74c 100644 --- a/ui/runs/forms/importing.py +++ b/ui/runs/forms/importing.py @@ -75,6 +75,24 @@ class MSFraggerImportForm(MethodForm): choices=AggregationMethods, label="Aggregation method", initial="Sum" ) +class TMTImportForm(MethodForm): + file_path = CustomFileField(label="TMT intensities file") + map_to_uniprot = CustomBooleanField( + label="Map to Uniprot IDs using Biomart (online)", required=False + ) + aggregation_method = CustomChoiceField( + choices=AggregationMethods, label="Aggregation method", initial="Sum" + ) + +class DiannImportForm(MethodForm): + file_path = CustomFileField(label="DIA-NN intensities file:") + map_to_uniprot = CustomBooleanField( + label="Map to Uniprot IDs using Biomart (online)", required=False + ) + aggregation_method = CustomChoiceField( + choices=AggregationMethods, label="Aggregation method", initial="Sum" + ) + class MetadataImportForm(MethodForm): file_path = CustomFileField(label="Metadata file") From 1e1b50f7749f173f69f475b637dc7f72be884cf9 Mon Sep 17 00:00:00 2001 From: AK Date: Sat, 14 Sep 2024 13:06:37 +0200 Subject: [PATCH 42/52] Updated TimeQuant plot --- ...s_plot_peptide.py => time_series_plots.py} | 22 ++++++++++--------- protzilla/utilities/transform_dfs.py | 18 ++++++++------- 2 files changed, 22 insertions(+), 18 deletions(-) rename protzilla/data_analysis/{time_series_plot_peptide.py => time_series_plots.py} (89%) diff --git a/protzilla/data_analysis/time_series_plot_peptide.py b/protzilla/data_analysis/time_series_plots.py similarity index 89% rename from protzilla/data_analysis/time_series_plot_peptide.py rename to protzilla/data_analysis/time_series_plots.py index 04f95c32..236c0e5a 100644 --- a/protzilla/data_analysis/time_series_plot_peptide.py +++ b/protzilla/data_analysis/time_series_plots.py @@ -15,9 +15,10 @@ "annotation_proteins_of_interest": "#4A536A", } -def time_series_plot_peptide( - input_df: pd.DataFrame, +def time_quant_plot( + intensity_df: pd.DataFrame, metadata_df: pd.DataFrame, + time_column_name: str, protein_group: str, similarity: float = 1.0, similarity_measure: str = "euclidean distance", @@ -29,9 +30,10 @@ def time_series_plot_peptide( to get all proteingroups that are similar displayed in another color in this line diagram. All other proteingroups are displayed in the background as a grey polygon. - :param input_df: A dataframe in protzilla wide format, where each row + :param intensity_df: A dataframe in protzilla wide format, where each row represents a sample and each column represents a feature. :param metadata_df: A dataframe containing the metadata of the samples. + :param time_column_name: The name of the column in the metadata_df that contains the time information. :param protein_group: Protein IDs as the columnheader of the dataframe :param similarity_measure: method to compare the chosen proteingroup with all others. The two methods are "cosine similarity" and "euclidean distance". @@ -40,15 +42,15 @@ def time_series_plot_peptide( :return: returns a dictionary containing a list with a plotly figure and/or a list of messages """ - input_df = pd.merge( - left=input_df, - right=metadata_df[["Sample", "Time"]], + intensity_df = pd.merge( + left=intensity_df, + right=metadata_df[["Sample", time_column_name]], on="Sample", copy=False, ) - wide_df = input_df.interpolate(method='linear', axis=0) - wide_df = long_to_wide_time(wide_df) if is_long_format(wide_df) else wide_df + wide_df = intensity_df.interpolate(method='linear', axis=0) + wide_df = long_to_wide_time(wide_df, time_column_name=time_column_name) if is_long_format(wide_df, time_column_name=time_column_name) else wide_df if protein_group not in wide_df.columns: @@ -164,14 +166,14 @@ def time_series_plot_peptide( yaxis_gridcolor=colors["gridcolor"], xaxis_linecolor=colors["linecolor"], yaxis_linecolor=colors["linecolor"], - xaxis_title="Time", + xaxis_title=time_column_name, yaxis_title="Intensity", legend_title="Legend", xaxis=dict( tickmode="array", tickangle=0, tickvals=wide_df.index, - ticktext=[wide_df["Time"].unique() for wide_df["Time"] in wide_df.index], + ticktext=[wide_df[time_column_name].unique() for wide_df[time_column_name] in wide_df.index], ), autosize=True, margin=dict(l=100, r=300, t=100, b=100), diff --git a/protzilla/utilities/transform_dfs.py b/protzilla/utilities/transform_dfs.py index a5380b32..1c3b7fbe 100644 --- a/protzilla/utilities/transform_dfs.py +++ b/protzilla/utilities/transform_dfs.py @@ -12,6 +12,7 @@ def long_to_wide(intensity_df: pd.DataFrame, value_name: str | None = None): :param intensity_df: the dataframe that should be transformed into long format :type intensity_df: pd.DataFrame + :param value_name: the name of the column in the metadata_df that contains the intensity information. :return: returns dataframe in wide format suitable for use by packages such as sklearn @@ -23,7 +24,7 @@ def long_to_wide(intensity_df: pd.DataFrame, value_name: str | None = None): ) -def long_to_wide_time(intensity_df: pd.DataFrame, value_name: str = None): +def long_to_wide_time(intensity_df: pd.DataFrame, value_name: str = None, time_column_name: str = None): """ This function transforms the dataframe to a wide format that can be more easily handled by packages such as sklearn. @@ -32,16 +33,18 @@ def long_to_wide_time(intensity_df: pd.DataFrame, value_name: str = None): :param intensity_df: the dataframe that should be transformed into long format :type intensity_df: pd.DataFrame + :param value_name: the name of the column in the metadata_df that contains the intensity information. + :param time_column_name: the name of the column in the metadata_df that contains the time information. :return: returns dataframe in wide format suitable for use by packages such as sklearn :rtype: pd.DataFrame """ - if intensity_df.duplicated(subset=["Time", "Protein ID"]).any(): - intensity_df = intensity_df.groupby(["Time", "Protein ID"]).mean().reset_index() + if intensity_df.duplicated(subset=[time_column_name, "Protein ID"]).any(): + intensity_df = intensity_df.groupby([time_column_name, "Protein ID"]).mean().reset_index() values_name = default_intensity_column(intensity_df) if value_name is None else value_name intensity_df = pd.pivot( - intensity_df, index="Time", columns="Protein ID", values=values_name + intensity_df, index=time_column_name, columns="Protein ID", values=values_name ) intensity_df = intensity_df.fillna(intensity_df.mean()) return intensity_df @@ -81,17 +84,16 @@ def wide_to_long(wide_df: pd.DataFrame, original_long_df: pd.DataFrame): return intensity_df -def is_long_format(df: pd.DataFrame): +def is_long_format(df: pd.DataFrame, time_column_name: str = None): required_columns = {"Sample", "Protein ID"} - additional_columns = {"Gene", "Time"} + additional_columns = {"Gene", time_column_name} return required_columns.issubset(df.columns) and any(col in df.columns for col in additional_columns) def is_intensity_df(df: pd.DataFrame): """ Checks if the dataframe is an intensity dataframe. - An intensity dataframe should have the columns "Sample", "Protein ID" and - and intensity column. + An intensity dataframe should have the columns "Sample", "Protein ID" and intensity column. :param df: the dataframe that should be checked :type df: pd.DataFrame From a3ffe29f1ef6d79a263e17d64ebf5d878aaa9722 Mon Sep 17 00:00:00 2001 From: AK Date: Sat, 14 Sep 2024 13:06:57 +0200 Subject: [PATCH 43/52] Updated a test --- tests/protzilla/data_analysis/test_time_series_plots.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/protzilla/data_analysis/test_time_series_plots.py b/tests/protzilla/data_analysis/test_time_series_plots.py index 12249fb0..46182092 100644 --- a/tests/protzilla/data_analysis/test_time_series_plots.py +++ b/tests/protzilla/data_analysis/test_time_series_plots.py @@ -1,7 +1,7 @@ import pandas as pd import pytest -from protzilla.data_analysis.time_series_plot_peptide import time_series_plot_peptide +from protzilla.data_analysis.time_series_plots import time_quant_plot @pytest.fixture @@ -56,7 +56,7 @@ def time_series_test_data(): def test_time_series_plot(show_figures, time_series_test_data): test_intensity, test_metadata = time_series_test_data - outputs = time_series_plot_peptide(test_intensity, test_metadata, "Protein1") + outputs = time_quant_plot(test_intensity, test_metadata, "Protein1") assert "plots" in outputs fig = outputs["plots"][0] if show_figures: @@ -66,11 +66,11 @@ def test_time_series_plot(show_figures, time_series_test_data): def test_time_series_plot_invalid_euclidean_similarity(time_series_test_data): test_intensity, test_metadata = time_series_test_data with pytest.raises(ValueError): - time_series_plot_peptide(test_intensity, test_metadata, "Protein1", similarity=-1, similarity_measure="euclidean distance") + time_quant_plot(test_intensity, test_metadata, "Protein1", similarity=-1, similarity_measure="euclidean distance") return def test_time_series_plot_invalid_cosine_similarity(time_series_test_data): test_intensity, test_metadata = time_series_test_data with pytest.raises(ValueError): - time_series_plot_peptide(test_intensity, test_metadata, "Protein1", similarity=2, similarity_measure="cosine similarity") + time_quant_plot(test_intensity, test_metadata, "Protein1", similarity=2, similarity_measure="cosine similarity") return \ No newline at end of file From 776299519242fadeed3eb2c257cecfaf7e3d0485 Mon Sep 17 00:00:00 2001 From: AK Date: Sat, 14 Sep 2024 13:07:36 +0200 Subject: [PATCH 44/52] Mapped TMT import --- ui/runs/form_mapping.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ui/runs/form_mapping.py b/ui/runs/form_mapping.py index 8f4793f1..a0e58689 100644 --- a/ui/runs/form_mapping.py +++ b/ui/runs/form_mapping.py @@ -17,6 +17,7 @@ importing.MaxQuantImport: importing_forms.MaxQuantImportForm, importing.DiannImport: importing_forms.DiannImportForm, importing.MsFraggerImport: importing_forms.MSFraggerImportForm, + importing.TMTImport: importing_forms.TMTImportForm, importing.MetadataImport: importing_forms.MetadataImportForm, importing.MetadataImportMethodDiann: importing_forms.MetadataImportMethodDiannForm, importing.MetadataColumnAssignment: importing_forms.MetadataColumnAssignmentForm, @@ -49,7 +50,7 @@ data_analysis.PlotScatterPlot: data_analysis_forms.PlotScatterPlotForm, data_analysis.PlotClustergram: data_analysis_forms.PlotClustergramForm, data_analysis.PlotProtQuant: data_analysis_forms.PlotProtQuantForm, - data_analysis.PlotTimeSeriesPeptide: data_analysis_forms.PlotTimeSeriesForm, + data_analysis.PlotTimeQuant: data_analysis_forms.PlotTimeQuantForm, data_analysis.PlotPrecisionRecallCurve: data_analysis_forms.PlotPrecisionRecallCurveForm, data_analysis.PlotROC: data_analysis_forms.PlotROCCurveForm, data_analysis.ClusteringKMeans: data_analysis_forms.ClusteringKMeansForm, From aaeed09dc2f534d57ddf7503a747fb0de97e0060 Mon Sep 17 00:00:00 2001 From: AK Date: Sat, 14 Sep 2024 13:09:17 +0200 Subject: [PATCH 45/52] Added an option for the user to select the Time and Grouping column names --- .../time_series_regression_analysis.py | 99 ++++++++---- protzilla/methods/data_analysis.py | 50 +++--- ui/runs/forms/data_analysis.py | 148 +++++++++--------- 3 files changed, 171 insertions(+), 126 deletions(-) diff --git a/protzilla/data_analysis/time_series_regression_analysis.py b/protzilla/data_analysis/time_series_regression_analysis.py index f579bb03..cd579295 100644 --- a/protzilla/data_analysis/time_series_regression_analysis.py +++ b/protzilla/data_analysis/time_series_regression_analysis.py @@ -60,7 +60,6 @@ def time_series_linear_regression( ) intensity_df[time_column_name] = intensity_df[str(time_column_name)].apply(convert_time_to_hours) - intensity_df = intensity_df.interpolate(method='linear', axis=0) intensity_df = intensity_df.sample(frac=1, random_state = 42).reset_index(drop=True) @@ -90,8 +89,8 @@ def time_series_linear_regression( train_r2 = r2_score(y_train, y_pred_train) test_r2 = r2_score(y_test, y_pred_test) - train_df = pd.DataFrame({'Time': X_train[time_column_name], 'Intensity': y_train, 'Predicted': y_pred_train, 'Type': 'Train'}) - test_df = pd.DataFrame({'Time': X_test[time_column_name], 'Intensity': y_test, 'Predicted': y_pred_test, 'Type': 'Test'}) + train_df = pd.DataFrame({time_column_name: X_train[time_column_name], 'Intensity': y_train, 'Predicted': y_pred_train, 'Type': 'Train'}) + test_df = pd.DataFrame({time_column_name: X_test[time_column_name], 'Intensity': y_test, 'Predicted': y_pred_test, 'Type': 'Test'}) plot_df = pd.concat([train_df, test_df]) color = PROTZILLA_DISCRETE_COLOR_SEQUENCE[color_index % len(PROTZILLA_DISCRETE_COLOR_SEQUENCE)] @@ -134,8 +133,11 @@ def time_series_linear_regression( train_r2 = r2_score(y_train, y_pred_train) test_r2 = r2_score(y_test, y_pred_test) - train_df = pd.DataFrame({'Time': X_train[time_column_name], 'Intensity': y_train, 'Predicted': y_pred_train, 'Type': 'Train'}) - test_df = pd.DataFrame({'Time': X_test[time_column_name], 'Intensity': y_test, 'Predicted': y_pred_test, 'Type': 'Test'}) + train_df = pd.DataFrame( + {time_column_name: X_train[time_column_name], 'Intensity': y_train, 'Predicted': y_pred_train, + 'Type': 'Train'}) + test_df = pd.DataFrame( + {time_column_name: X_test[time_column_name], 'Intensity': y_test, 'Predicted': y_pred_test, 'Type': 'Test'}) plot_df = pd.concat([train_df, test_df]) fig.add_trace(go.Scatter( @@ -186,7 +188,7 @@ def time_series_linear_regression( yaxis_gridcolor=colors["gridcolor"], xaxis_linecolor=colors["linecolor"], yaxis_linecolor=colors["linecolor"], - xaxis_title="Time", + xaxis_title=time_column_name, yaxis_title="Intensity", legend_title="Legend", autosize=True, @@ -254,7 +256,6 @@ def time_series_ransac_regression( ) intensity_df[time_column_name] = intensity_df[str(time_column_name)].apply(convert_time_to_hours) - intensity_df = intensity_df.interpolate(method='linear', axis=0) intensity_df = intensity_df.sample(frac=1, random_state = 42).reset_index(drop=True) @@ -286,31 +287,31 @@ def time_series_ransac_regression( train_r2 = r2_score(y_train[inlier_mask], y_pred_train[inlier_mask]) test_r2 = r2_score(y_test, y_pred_test) - train_df = pd.DataFrame({'Time': X_train[time_column_name], 'Intensity': y_train, 'Predicted': y_pred_train, 'Type': 'Train'}) - test_df = pd.DataFrame({'Time': X_test[time_column_name], 'Intensity': y_test, 'Predicted': y_pred_test, 'Type': 'Test'}) + train_df = pd.DataFrame({time_column_name: X_train[time_column_name], 'Intensity': y_train, 'Predicted': y_pred_train, 'Type': 'Train'}) + test_df = pd.DataFrame({time_column_name: X_test[time_column_name], 'Intensity': y_test, 'Predicted': y_pred_test, 'Type': 'Test'}) train_df['Inlier'] = inlier_mask test_df['Inlier'] = False plot_df = pd.concat([train_df, test_df]) # Add main plot traces fig.add_trace(go.Scatter( - x=plot_df['Time'], + x=plot_df[time_column_name], y=plot_df['Intensity'], mode='markers', - name='Actual Intensity', + name=f'Actual Intensity ({group})', marker=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[color_index]) ), row=1, col=1) fig.add_trace(go.Scatter( - x=plot_df['Time'], + x=plot_df[time_column_name], y=plot_df['Predicted'], mode='lines', - name='Predicted Intensity', + name=f'Predicted Intensity ({group})', line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[color_index + 2]) ), row=1, col=1) fig.add_trace(go.Scatter( - x=plot_df[plot_df['Inlier'] == False]['Time'], + x=plot_df[plot_df['Inlier'] == False][time_column_name], y=plot_df[plot_df['Inlier'] == False]['Intensity'], mode='markers', name='Outliers', @@ -342,15 +343,15 @@ def time_series_ransac_regression( train_r2 = r2_score(y_train[inlier_mask], y_pred_train[inlier_mask]) test_r2 = r2_score(y_test, y_pred_test) - train_df = pd.DataFrame({'Time': X_train[time_column_name], 'Intensity': y_train, 'Predicted': y_pred_train, 'Type': 'Train'}) - test_df = pd.DataFrame({'Time': X_test[time_column_name], 'Intensity': y_test, 'Predicted': y_pred_test, 'Type': 'Test'}) + train_df = pd.DataFrame({time_column_name: X_train[time_column_name], 'Intensity': y_train, 'Predicted': y_pred_train, 'Type': 'Train'}) + test_df = pd.DataFrame({time_column_name: X_test[time_column_name], 'Intensity': y_test, 'Predicted': y_pred_test, 'Type': 'Test'}) train_df['Inlier'] = inlier_mask test_df['Inlier'] = False plot_df = pd.concat([train_df, test_df]) # Add main plot traces fig.add_trace(go.Scatter( - x=plot_df['Time'], + x=plot_df[time_column_name], y=plot_df['Intensity'], mode='markers', name='Actual Intensity', @@ -358,7 +359,7 @@ def time_series_ransac_regression( ), row=1, col=1) fig.add_trace(go.Scatter( - x=plot_df['Time'], + x=plot_df[time_column_name], y=plot_df['Predicted'], mode='lines', name='Predicted Intensity', @@ -366,7 +367,7 @@ def time_series_ransac_regression( ), row=1, col=1) fig.add_trace(go.Scatter( - x=plot_df[plot_df['Inlier'] == False]['Time'], + x=plot_df[plot_df['Inlier'] == False][time_column_name], y=plot_df[plot_df['Inlier'] == False]['Intensity'], mode='markers', name='Outliers', @@ -405,7 +406,7 @@ def time_series_ransac_regression( yaxis_gridcolor=colors["gridcolor"], xaxis_linecolor=colors["linecolor"], yaxis_linecolor=colors["linecolor"], - xaxis_title="Time", + xaxis_title=time_column_name, yaxis_title="Intensity", legend_title="Legend", autosize=True, @@ -553,7 +554,6 @@ def time_series_auto_arima( group_df = intensity_df[intensity_df[grouping_column_name] == group] group_df[time_column_name] = group_df[str(time_column_name)].apply(convert_time_to_hours) - group_df = group_df.interpolate(method='linear', axis=0) train_df_size = int(len(group_df) * train_size) train_df, test_df = group_df[:train_df_size], group_df[train_df_size:] @@ -574,6 +574,24 @@ def time_series_auto_arima( # Forecast the test set forecast = model.predict(n_periods=test_df.shape[0]) + parameters = model.get_params() + aa_order = parameters['order'] + aa_seasonal_order = parameters['seasonal_order'] + messages = [] + + messages.append( + { + "level": logging.INFO, + "msg": f"Auto Arima Order (p,d,q): {aa_order}.", + } + ) + if seasonal: + messages.append( + { + "level": logging.INFO, + "msg": f"Auto Arima Seasonal Order (P,D,Q,s): {aa_seasonal_order}.", + } + ) test_rmse = np.sqrt(mean_squared_error(test_df, forecast)) test_r2 = r2_score(test_df, forecast) @@ -588,7 +606,7 @@ def time_series_auto_arima( x=test_df.index, y=test_df, mode='markers', - name='Actual Intensity', + name=f'Actual Intensity ({group})', marker=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[color_index]) ), row=1, col=1) @@ -596,7 +614,7 @@ def time_series_auto_arima( x=test_df.index, y=forecast, mode='markers', - name='Predicted Intensity', + name=f'Predicted Intensity ({group})', line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[color_index + 3]) ), row=1, col=1) @@ -604,7 +622,7 @@ def time_series_auto_arima( x = forecast_plot.index, y = forecast_plot, mode = 'lines', - name = 'Mean Predicted Intensity', + name = f'Mean Predicted Intensity ({group})', line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[color_index + 3]) ), row=1, col=1) @@ -620,7 +638,6 @@ def time_series_auto_arima( else: intensity_df[time_column_name] = intensity_df[str(time_column_name)].apply(convert_time_to_hours) - intensity_df = intensity_df.interpolate(method='linear', axis=0) train_size = int(len(intensity_df) * train_size) train_df, test_df = intensity_df[:train_size], intensity_df[train_size:] @@ -641,6 +658,25 @@ def time_series_auto_arima( # Forecast the test set forecast = model.predict(n_periods=test_df.shape[0]) + parameters = model.get_params() + + aa_order = parameters['order'] + aa_seasonal_order = parameters['seasonal_order'] + messages = [] + + messages.append( + { + "level": logging.INFO, + "msg": f"Auto Arima Order (p,d,q): {aa_order}.", + } + ) + if seasonal: + messages.append( + { + "level": logging.INFO, + "msg": f"Auto Arima Seasonal Order (P,D,Q,s): {aa_seasonal_order}.", + } + ) test_rmse = np.sqrt(mean_squared_error(test_df, forecast)) test_r2 = r2_score(test_df, forecast) @@ -707,7 +743,7 @@ def time_series_auto_arima( yaxis_gridcolor=colors["gridcolor"], xaxis_linecolor=colors["linecolor"], yaxis_linecolor=colors["linecolor"], - xaxis_title="Time", + xaxis_title=time_column_name, yaxis_title="Intensity", legend_title="Legend", autosize=True, @@ -725,7 +761,6 @@ def time_series_auto_arima( fig.update_annotations(font_size=12) - return dict( scores=scores, plots=[fig], @@ -791,7 +826,6 @@ def time_series_arima( group_df = intensity_df[intensity_df[grouping_column_name] == group] group_df[time_column_name] = group_df[str(time_column_name)].apply(convert_time_to_hours) - group_df = group_df.interpolate(method='linear', axis=0) train_df_size = int(len(group_df) * train_size) train_df, test_df = group_df[:train_df_size], group_df[train_df_size:] @@ -828,7 +862,7 @@ def time_series_arima( x=test_df.index, y=test_df, mode='markers', - name='Actual Intensity', + name=f'Actual Intensity ({group})', marker=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[color_index]) ), row=1, col=1) @@ -836,7 +870,7 @@ def time_series_arima( x=forecast_plot.index, y=forecast_plot, mode='markers', - name='Predicted Intensity', + name= f'Predicted Intensity ({group})', line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[color_index + 2]) ), row=1, col=1) @@ -844,7 +878,7 @@ def time_series_arima( x = forecast_mean_plot.index, y = forecast_mean_plot, mode = 'lines', - name = 'Mean Predicted Intensity', + name = f'Mean Predicted Intensity ({group})', line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[color_index + 2]) ), row=1, col=1) @@ -860,7 +894,6 @@ def time_series_arima( else: intensity_df[time_column_name] = intensity_df[str(time_column_name)].apply(convert_time_to_hours) - intensity_df = intensity_df.interpolate(method='linear', axis=0) train_size = int(len(intensity_df) * train_size) train_df, test_df = intensity_df[:train_size], intensity_df[train_size:] @@ -945,7 +978,7 @@ def time_series_arima( yaxis_gridcolor=colors["gridcolor"], xaxis_linecolor=colors["linecolor"], yaxis_linecolor=colors["linecolor"], - xaxis_title="Time", + xaxis_title=time_column_name, yaxis_title="Intensity", legend_title="Legend", autosize=True, diff --git a/protzilla/methods/data_analysis.py b/protzilla/methods/data_analysis.py index 9f5261d1..bf05e4cb 100644 --- a/protzilla/methods/data_analysis.py +++ b/protzilla/methods/data_analysis.py @@ -28,7 +28,7 @@ prot_quant_plot, scatter_plot, ) -from protzilla.data_analysis.time_series_plot_peptide import time_series_plot_peptide +from protzilla.data_analysis.time_series_plots import time_quant_plot from protzilla.data_analysis.protein_graphs import peptides_to_isoform, variation_graph from protzilla.data_analysis.ptm_analysis import ( filter_peptides_of_protein, @@ -344,27 +344,6 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict: ) return inputs -class PlotTimeSeriesPeptide(PlotStep): - display_name = "Time Quantification Plot For Peptide" - operation = "plot" - method_description = ( - "Creates a line chart for intensity across Time for protein groups" - ) - - input_keys = ["input_df", "metadata_df", "protein_group", "similarity_measure", "similarity"] - output_keys = [] - - def method(self, inputs: dict) -> dict: - return time_series_plot_peptide(**inputs) - - - def insert_dataframes(self, steps: StepManager, inputs) -> dict: - inputs["input_df"] = steps.get_step_output( - Step, "peptide_df", inputs["input_df"] - ) - inputs["metadata_df"] = steps.metadata_df - return inputs - class PlotPrecisionRecallCurve(PlotStep): display_name = "Precision Recall" @@ -796,6 +775,33 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict: return inputs +class PlotTimeQuant(PlotStep): + display_name = "Time Quantification Plot For Protein" + operation = "Time series analysis" + method_description = ( + "Creates a line chart for intensity across Time for protein groups" + ) + + input_keys = [ + "intensity_df", + "metadata_df", + "time_column_name", + "protein_group", + "similarity_measure", + "similarity" + ] + output_keys = [] + + def method(self, inputs: dict) -> dict: + return time_quant_plot(**inputs) + + + def insert_dataframes(self, steps: StepManager, inputs) -> dict: + inputs["intensity_df"] = steps.protein_df + inputs["metadata_df"] = steps.metadata_df + return inputs + + class TimeSeriesLinearRegression(PlotStep): display_name = "Linear Regression" operation = "Time series analysis" diff --git a/ui/runs/forms/data_analysis.py b/ui/runs/forms/data_analysis.py index 12328911..e682c879 100644 --- a/ui/runs/forms/data_analysis.py +++ b/ui/runs/forms/data_analysis.py @@ -540,77 +540,6 @@ def fill_form(self, run: Run) -> None: self.data["similarity"] = 1 -class PlotTimeSeriesForm(MethodForm): - is_dynamic = True - - input_df = CustomChoiceField( - choices=[], - label="Choose dataframe to be plotted", - ) - protein_group = CustomChoiceField( - choices=[], - label="Protein group: choose highlighted protein group", - ) - similarity_measure = CustomChoiceField( - choices=SimilarityMeasure, - label="Similarity Measurement: choose how to compare protein groups", - initial=SimilarityMeasure.euclidean_distance, - ) - similarity = CustomNumberField( - label="Similarity", min_value=-1, max_value=999, step_size=1, initial=1 - ) - - def fill_form(self, run: Run) -> None: - self.fields["input_df"].choices = fill_helper.get_choices_for_peptide_df_steps( - run - ) - - input_df_instance_id = self.data.get( - "input_df", self.fields["input_df"].choices[0][0] - ) - - self.fields["protein_group"].choices = fill_helper.to_choices( - run.steps.get_step_output( - step_type=Step, - output_key="peptide_df", - instance_identifier=input_df_instance_id, - )["Protein ID"].unique() - ) - - similarity_measure = self.data.get( - "similarity_measure", self.fields["similarity_measure"].choices[0][0] - ) - self.data = self.data.copy() - if similarity_measure == SimilarityMeasure.cosine_similarity: - self.fields["similarity"] = CustomFloatField( - label="Cosine Similarity", - min_value=-1, - max_value=1, - step_size=0.1, - initial=0, - ) - if ( - "similarity" not in self.data - or float(self.data["similarity"]) < -1 - or float(self.data["similarity"]) > 1 - ): - self.data["similarity"] = 0 - else: - self.fields["similarity"] = CustomNumberField( - label="Euclidean Distance", - min_value=0, - max_value=999, - step_size=1, - initial=1, - ) - if ( - "similarity" not in self.data - or float(self.data["similarity"]) < 0 - or float(self.data["similarity"]) > 999 - ): - self.data["similarity"] = 1 - - class PlotPrecisionRecallCurveForm(MethodForm): # Todo: Input plot_title = CustomCharField( @@ -1234,6 +1163,83 @@ def fill_form(self, run: Run) -> None: self.fields["peptide_df"].initial = single_protein_peptides[0] +class PlotTimeQuantForm(MethodForm): + is_dynamic = True + + intensity_df = CustomChoiceField( + choices=[], + label="Choose dataframe to be plotted", + ) + time_column_name = CustomChoiceField(choices=[], label="Time: The column name from metadata that represents time") + protein_group = CustomChoiceField( + choices=[], + label="Protein group: choose highlighted protein group", + ) + similarity_measure = CustomChoiceField( + choices=SimilarityMeasure, + label="Similarity Measurement: choose how to compare protein groups", + initial=SimilarityMeasure.euclidean_distance, + ) + similarity = CustomNumberField( + label="Similarity", min_value=-1, max_value=999, step_size=1, initial=1 + ) + + def fill_form(self, run: Run) -> None: + self.fields["intensity_df"].choices = fill_helper.get_choices_for_protein_df_steps( + run + ) + + input_df_instance_id = self.data.get( + "intensity_df", self.fields["intensity_df"].choices[0][0] + ) + self.fields[ + "time_column_name" + ].choices = fill_helper.get_choices_for_metadata_non_sample_columns(run) + + self.fields["protein_group"].choices = fill_helper.to_choices( + run.steps.get_step_output( + step_type=Step, + output_key="protein_df", + instance_identifier=input_df_instance_id, + )["Protein ID"].unique() + ) + + similarity_measure = self.data.get( + "similarity_measure", self.fields["similarity_measure"].choices[0][0] + ) + self.data = self.data.copy() + if similarity_measure == SimilarityMeasure.cosine_similarity: + self.fields["similarity"] = CustomFloatField( + label="Cosine Similarity", + min_value=-1, + max_value=1, + step_size=0.1, + initial=0, + ) + if ( + "similarity" not in self.data + or float(self.data["similarity"]) < -1 + or float(self.data["similarity"]) > 1 + ): + self.data["similarity"] = 0 + else: + self.fields["similarity"] = CustomNumberField( + label="Euclidean Distance", + min_value=0, + max_value=999, + step_size=1, + initial=1, + ) + if ( + "similarity" not in self.data + or float(self.data["similarity"]) < 0 + or float(self.data["similarity"]) > 999 + ): + self.data["similarity"] = 1 + + + + class TimeSeriesLinearRegressionForm(MethodForm): is_dynamic = True intensity_df = CustomChoiceField( From 25c37d6edd70b6cb0359542dce456529082411b2 Mon Sep 17 00:00:00 2001 From: AK Date: Sat, 14 Sep 2024 14:51:44 +0200 Subject: [PATCH 46/52] Resolved some comments from from Hendrik --- protzilla/data_analysis/time_series_helper.py | 5 +- .../time_series_regression_analysis.py | 64 ++++++++----------- protzilla/methods/data_analysis.py | 18 ++++-- ui/runs/forms/data_analysis.py | 44 +++++-------- 4 files changed, 59 insertions(+), 72 deletions(-) diff --git a/protzilla/data_analysis/time_series_helper.py b/protzilla/data_analysis/time_series_helper.py index 0fb294ed..e643fe93 100644 --- a/protzilla/data_analysis/time_series_helper.py +++ b/protzilla/data_analysis/time_series_helper.py @@ -6,7 +6,10 @@ def convert_time_to_hours(time_str): :param time_str: The time string to convert in format '%H:%M:%S' :return: Number of hours since midnight as a float + """ + """ time_obj = datetime.strptime(time_str, '%H:%M:%S') hours_since_midnight = time_obj.hour + time_obj.minute / 60 + time_obj.second / 3600 - return hours_since_midnight \ No newline at end of file + """ + return time_str \ No newline at end of file diff --git a/protzilla/data_analysis/time_series_regression_analysis.py b/protzilla/data_analysis/time_series_regression_analysis.py index cd579295..984754fe 100644 --- a/protzilla/data_analysis/time_series_regression_analysis.py +++ b/protzilla/data_analysis/time_series_regression_analysis.py @@ -4,7 +4,7 @@ import pandas as pd import plotly.graph_objects as go -from protzilla.data_analysis.time_series_helper import convert_time_to_hours +#from protzilla.data_analysis.time_series_helper import convert_time_to_hours from protzilla.utilities import default_intensity_column from protzilla.constants.colors import PROTZILLA_DISCRETE_COLOR_SEQUENCE @@ -29,10 +29,10 @@ def time_series_linear_regression( intensity_df: pd.DataFrame, metadata_df: pd.DataFrame, time_column_name: str, - protein_group: str, train_size: float, + protein_group: str, + grouping: str, grouping_column_name: str, - grouping: str = None, ): """ Perform linear regression on the time series data for a given protein group. @@ -59,8 +59,6 @@ def time_series_linear_regression( copy=False, ) - intensity_df[time_column_name] = intensity_df[str(time_column_name)].apply(convert_time_to_hours) - intensity_df = intensity_df.sample(frac=1, random_state = 42).reset_index(drop=True) X = intensity_df[[time_column_name]] @@ -222,8 +220,8 @@ def time_series_ransac_regression( stop_probability: float, loss: str, train_size: float, - grouping_column_name: str, grouping: str, + grouping_column_name: str, ): """ Perform RANSAC regression on the time series data for a given protein group. @@ -255,8 +253,6 @@ def time_series_ransac_regression( copy=False, ) - intensity_df[time_column_name] = intensity_df[str(time_column_name)].apply(convert_time_to_hours) - intensity_df = intensity_df.sample(frac=1, random_state = 42).reset_index(drop=True) X = intensity_df[[time_column_name]] @@ -298,7 +294,7 @@ def time_series_ransac_regression( x=plot_df[time_column_name], y=plot_df['Intensity'], mode='markers', - name=f'Actual Intensity ({group})', + name=f'Inliers ({group})', marker=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[color_index]) ), row=1, col=1) @@ -354,7 +350,7 @@ def time_series_ransac_regression( x=plot_df[time_column_name], y=plot_df['Intensity'], mode='markers', - name='Actual Intensity', + name='Inliers', marker=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[0]) ), row=1, col=1) @@ -507,8 +503,8 @@ def time_series_auto_arima( seasonal: str, m: int, train_size: float, - grouping_column_name: str, grouping: str, + grouping_column_name: str, ) -> dict: """ Perform an automatic ARIMA model selection on the time series data for a given protein group. @@ -526,6 +522,7 @@ def time_series_auto_arima( """ color_index = 0 + messages = [] if train_size < 0 or train_size > 1: raise ValueError("Train size should be between 0 and 1") @@ -553,8 +550,6 @@ def time_series_auto_arima( for group in groups: group_df = intensity_df[intensity_df[grouping_column_name] == group] - group_df[time_column_name] = group_df[str(time_column_name)].apply(convert_time_to_hours) - train_df_size = int(len(group_df) * train_size) train_df, test_df = group_df[:train_df_size], group_df[train_df_size:] @@ -575,23 +570,6 @@ def time_series_auto_arima( # Forecast the test set forecast = model.predict(n_periods=test_df.shape[0]) parameters = model.get_params() - aa_order = parameters['order'] - aa_seasonal_order = parameters['seasonal_order'] - messages = [] - - messages.append( - { - "level": logging.INFO, - "msg": f"Auto Arima Order (p,d,q): {aa_order}.", - } - ) - if seasonal: - messages.append( - { - "level": logging.INFO, - "msg": f"Auto Arima Seasonal Order (P,D,Q,s): {aa_seasonal_order}.", - } - ) test_rmse = np.sqrt(mean_squared_error(test_df, forecast)) test_r2 = r2_score(test_df, forecast) @@ -635,10 +613,24 @@ def time_series_auto_arima( 'train_r2_score': train_r2, 'test_r2_score': test_r2, }) + aa_order = parameters['order'] + aa_seasonal_order = parameters['seasonal_order'] - else: - intensity_df[time_column_name] = intensity_df[str(time_column_name)].apply(convert_time_to_hours) + messages.append( + { + "level": logging.INFO, + "msg": f"Auto Arima Order (p,d,q): {aa_order}.", + } + ) + if seasonal: + messages.append( + { + "level": logging.INFO, + "msg": f"Auto Arima Seasonal Order (P,D,Q,s): {aa_seasonal_order}.", + } + ) + else: train_size = int(len(intensity_df) * train_size) train_df, test_df = intensity_df[:train_size], intensity_df[train_size:] @@ -662,7 +654,6 @@ def time_series_auto_arima( aa_order = parameters['order'] aa_seasonal_order = parameters['seasonal_order'] - messages = [] messages.append( { @@ -764,6 +755,7 @@ def time_series_auto_arima( return dict( scores=scores, plots=[fig], + messages=messages, ) @@ -781,8 +773,8 @@ def time_series_arima( Q: int, s: int, train_size: float, - grouping_column_name: str, grouping: str, + grouping_column_name: str, ) -> dict: """ @@ -825,8 +817,6 @@ def time_series_arima( for group in groups: group_df = intensity_df[intensity_df[grouping_column_name] == group] - group_df[time_column_name] = group_df[str(time_column_name)].apply(convert_time_to_hours) - train_df_size = int(len(group_df) * train_size) train_df, test_df = group_df[:train_df_size], group_df[train_df_size:] @@ -893,8 +883,6 @@ def time_series_arima( }) else: - intensity_df[time_column_name] = intensity_df[str(time_column_name)].apply(convert_time_to_hours) - train_size = int(len(intensity_df) * train_size) train_df, test_df = intensity_df[:train_size], intensity_df[train_size:] diff --git a/protzilla/methods/data_analysis.py b/protzilla/methods/data_analysis.py index bf05e4cb..3201cc44 100644 --- a/protzilla/methods/data_analysis.py +++ b/protzilla/methods/data_analysis.py @@ -815,8 +815,8 @@ class TimeSeriesLinearRegression(PlotStep): "time_column_name", "protein_group", "train_size", - "grouping_column_name", "grouping", + "grouping_column_name", ] output_keys = [ "scores", @@ -845,8 +845,8 @@ class TimeSeriesRANSACRegression(PlotStep): "stop_probability", "loss", "train_size", - "grouping_column_name", "grouping", + "grouping_column_name", ] output_keys = [ "scores", @@ -863,7 +863,15 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict: class TimeSeriesADFullerTest(DataAnalysisStep): display_name = "Augmented Dickey-Fuller Test" operation = "Time series analysis" - method_description = "Perform Augmented Dickey-Fuller test on the time series data for a given protein group." + method_description = ( + "The Augmented Dickey-Fuller test is a type of statistical test called a unit root test. The test " + "determines how strongly a time series is defined by a trend. The null hypothesis of the test is that the " + "time series can be represented by a unit root, which implies that the time series is not stationary. " + "The alternative hypothesis is that the time series is stationary. If the p-value is less than the " + "significance level, the null hypothesis can be rejected and the time series is considered stationary." + "Dickey, D. & Fuller, Wayne. (1979). Distribution of the Estimators for Autoregressive Time Series With a Unit Root. " + "JASA. Journal of the American Statistical Association. 74. 10.2307/2286348. " + ) input_keys = [ "intensity_df", @@ -902,8 +910,8 @@ class TimeSeriesAutoARIMA(PlotStep): "seasonal", "m", "train_size", - "grouping_column_name", "grouping", + "grouping_column_name", ] output_keys = [ "scores", @@ -939,8 +947,8 @@ class TimeSeriesARIMA(PlotStep): "Q", "s", "train_size", - "grouping_column_name", "grouping", + "grouping_column_name", ] output_keys = [ "scores", diff --git a/ui/runs/forms/data_analysis.py b/ui/runs/forms/data_analysis.py index e682c879..4cdc84fd 100644 --- a/ui/runs/forms/data_analysis.py +++ b/ui/runs/forms/data_analysis.py @@ -1258,12 +1258,12 @@ class TimeSeriesLinearRegressionForm(MethodForm): step_size=0.1, initial=0.8 ) - grouping_column_name = CustomChoiceField(choices=[], label="Grouping from metadata: The column name from metadata that represents the grouping") grouping = CustomChoiceField( choices= TimeSeriesGrouping, label="Option to select whether regression should be performed on the entire dataset or separately on the control and experimental groups", initial=TimeSeriesGrouping.with_grouping ) + grouping_column_name = CustomChoiceField(choices=[], label="Grouping from metadata: The column name from metadata that represents the grouping") def fill_form(self, run: Run) -> None: @@ -1288,6 +1288,9 @@ def fill_form(self, run: Run) -> None: instance_identifier=input_df_instance_id, )["Protein ID"].unique() ) + grouping = self.data.get("grouping") + if grouping == "Without Grouping": + self.toggle_visibility("grouping_column_name", False) class TimeSeriesRANSACRegressionForm(MethodForm): @@ -1326,12 +1329,12 @@ class TimeSeriesRANSACRegressionForm(MethodForm): step_size=0.1, initial=0.8 ) - grouping_column_name = CustomChoiceField(choices=[], label="Grouping from metadata: The column name from metadata that represents the grouping") grouping = CustomChoiceField( choices= TimeSeriesGrouping, label="Option to select whether regression should be performed on the entire dataset or separately on the control and experimental groups", initial=TimeSeriesGrouping.with_grouping ) + grouping_column_name = CustomChoiceField(choices=[], label="Grouping from metadata: The column name from metadata that represents the grouping") def fill_form(self, run: Run) -> None: @@ -1356,22 +1359,13 @@ def fill_form(self, run: Run) -> None: instance_identifier=input_df_instance_id, )["Protein ID"].unique() ) + grouping = self.data.get("grouping") + if grouping == "Without Grouping": + self.toggle_visibility("grouping_column_name", False) class TimeSeriesADFullerTestForm(MethodForm): is_dynamic = True - test_info = TextDisplayField( - label="Information about the Augmented Dickey-Fuller test", - text=( - "The Augmented Dickey-Fuller test is a type of statistical test called a unit root test. The test " - "determines how strongly a time series is defined by a trend. The null hypothesis of the test is that the " - "time series can be represented by a unit root, which implies that the time series is not stationary. " - "The alternative hypothesis is that the time series is stationary. If the p-value is less than the " - "significance level, the null hypothesis can be rejected and the time series is considered stationary.
" - "Dickey, D. & Fuller, Wayne. (1979). Distribution of the Estimators for Autoregressive Time Series With a Unit Root." - "JASA. Journal of the American Statistical Association. 74. 10.2307/2286348. " - ), - ) intensity_df = CustomChoiceField( choices=[], label="Intensity dataframe", @@ -1404,11 +1398,6 @@ def fill_form(self, run: Run) -> None: class TimeSeriesAutoARIMAForm(MethodForm): is_dynamic = True - model_info = TextDisplayField( - label="Citation for AutoARIMA model", - text=( - ), - ) intensity_df = CustomChoiceField( choices=[], label="Intensity dataframe", @@ -1436,12 +1425,12 @@ class TimeSeriesAutoARIMAForm(MethodForm): step_size=0.1, initial=0.8, ) - grouping_column_name = CustomChoiceField(choices=[], label="Grouping from metadata: The column name from metadata that represents the grouping") grouping = CustomChoiceField( choices= TimeSeriesGrouping, label="Option to select whether regression should be performed on the entire dataset or separately on the control and experimental groups", initial=TimeSeriesGrouping.with_grouping ) + grouping_column_name = CustomChoiceField(choices=[], label="Grouping from metadata: The column name from metadata that represents the grouping") def fill_form(self, run: Run) -> None: @@ -1466,17 +1455,13 @@ def fill_form(self, run: Run) -> None: instance_identifier=input_df_instance_id, )["Protein ID"].unique() ) + grouping = self.data.get("grouping") + if grouping == "Without Grouping": + self.toggle_visibility("grouping_column_name", False) class TimeSeriesARIMAForm(MethodForm): is_dynamic = True - """ - model_info = TextDisplayField( - label="Citation for ARIMA model", - text=( - ), - ) - """ intensity_df = CustomChoiceField( choices=[], label="Intensity dataframe", @@ -1544,12 +1529,12 @@ class TimeSeriesARIMAForm(MethodForm): step_size=0.1, initial=0.8, ) - grouping_column_name = CustomChoiceField(choices=[], label="Grouping from metadata: The column name from metadata that represents the grouping") grouping = CustomChoiceField( choices= TimeSeriesGrouping, label="Option to select whether regression should be performed on the entire dataset or separately on the control and experimental groups", initial=TimeSeriesGrouping.with_grouping ) + grouping_column_name = CustomChoiceField(choices=[], label="Grouping from metadata: The column name from metadata that represents the grouping") def fill_form(self, run: Run) -> None: @@ -1574,6 +1559,9 @@ def fill_form(self, run: Run) -> None: instance_identifier=input_df_instance_id, )["Protein ID"].unique() ) + grouping = self.data.get("grouping") + if grouping == "Without Grouping": + self.toggle_visibility("grouping_column_name", False) seasonal = self.data.get("seasonal") if seasonal == "No": self.toggle_visibility("P", False) From f8b556c805b96fba072e2430effe184f5a22e893 Mon Sep 17 00:00:00 2001 From: AK Date: Sun, 15 Sep 2024 14:56:52 +0200 Subject: [PATCH 47/52] Fixed Tests --- .../test_time_series_analysis.py | 55 ++++++++++--------- .../data_analysis/test_time_series_plots.py | 14 ++--- 2 files changed, 35 insertions(+), 34 deletions(-) diff --git a/tests/protzilla/data_analysis/test_time_series_analysis.py b/tests/protzilla/data_analysis/test_time_series_analysis.py index 4962eb22..20017922 100644 --- a/tests/protzilla/data_analysis/test_time_series_analysis.py +++ b/tests/protzilla/data_analysis/test_time_series_analysis.py @@ -56,13 +56,13 @@ def time_series_test_data(): ) test_metadata_df = ( - ["Sample1", "02:00:00", "1"], - ["Sample2", "06:00:00", "1"], - ["Sample3", "10:00:00", "1"], - ["Sample4", "14:00:00", "1"], - ["Sample5", "2:00:00", "2"], - ["Sample6", "4:00:00", "2"], - ["Sample7", "6:00:00", "2"], + ["Sample1", "2", "1"], + ["Sample2", "6", "1"], + ["Sample3", "7", "1"], + ["Sample4", "8", "1"], + ["Sample5", "2", "2"], + ["Sample6", "6", "2"], + ["Sample7", "7", "2"], ) test_metadata_df = pd.DataFrame( data=test_metadata_df, @@ -76,8 +76,8 @@ def test_linear_regression_plot_with_grouping(show_figures, time_series_test_dat test_intensity, test_metadata, "Time", - "Protein1", # 0.8, + "Protein1", "Group", "With Grouping" ) @@ -93,10 +93,10 @@ def test_linear_regression_plot_without_grouping(show_figures, time_series_test_ test_intensity, test_metadata, "Time", - "Protein1", # 0.8, + "Protein1", + "With Grouping", "Group", - "With Grouping" ) assert "plots" in outputs fig = outputs["plots"][0] @@ -111,10 +111,10 @@ def test_linear_regression_plot_invalid_train_size(time_series_test_data): test_intensity, test_metadata, "Time", - "Protein1", # 2, + "Protein1", + "With Grouping", "Group", - "With Grouping" ) return @@ -124,10 +124,10 @@ def test_linear_regression_outputs(time_series_test_data): test_intensity, test_metadata, "Time", - "Protein1", # 0.8, + "Protein1", + "With Grouping", "Group", - "With Grouping" ) assert "scores" in outputs return @@ -144,8 +144,8 @@ def test_ransac_regression_plot_with_grouping(show_figures, time_series_test_dat 0.99, "absolute_error", 0.8, + "With Grouping", "Group", - "With Grouping" ) assert "plots" in outputs fig = outputs["plots"][0] @@ -164,8 +164,9 @@ def test_ransac_regression_plot_without_grouping(show_figures, time_series_test_ 0.99, "absolute_error", 0.8, + "With Grouping", "Group", - "With Grouping" + ) assert "plots" in outputs fig = outputs["plots"][0] @@ -185,8 +186,8 @@ def test_ransac_plot_invalid_train_size(time_series_test_data): 0.99, "absolute_error", 2, + "With Grouping", "Group", - "With Grouping" ) return @@ -201,8 +202,8 @@ def test_ransac_regression_outputs(time_series_test_data): 0.99, "absolute_error", 0.8, + "With Grouping", "Group", - "With Grouping" ) assert "scores" in outputs return @@ -230,8 +231,8 @@ def test_auto_arima_plot_with_grouping(show_figures, time_series_test_data): "No", 1, 0.5, + "With Grouping", "Group", - "With Grouping" ) assert "plots" in outputs fig = outputs["plots"][0] @@ -249,8 +250,8 @@ def test_auto_arima_plot_without_grouping(show_figures, time_series_test_data): "No", 1, 0.5, + "With Grouping", "Group", - "With Grouping" ) assert "plots" in outputs fig = outputs["plots"][0] @@ -269,8 +270,8 @@ def test_auto_arima_plot_invalid_train_size(time_series_test_data): "No", 1, 2, + "With Grouping", "Group", - "With Grouping" ) return @@ -285,8 +286,8 @@ def test_auto_arima_outputs(time_series_test_data): "No", 1, 0.5, + "With Grouping", "Group", - "With Grouping" ) assert "scores" in outputs return @@ -308,8 +309,8 @@ def test_arima_plot_with_grouping(show_figures, time_series_test_data): 0, 0, 0.5, + "With Grouping", "Group", - "With Grouping" ) assert "plots" in outputs fig = outputs["plots"][0] @@ -333,8 +334,8 @@ def test_arima_plot_seasonal_with_grouping(show_figures, time_series_test_data): 0, 0, 0.5, + "With Grouping", "Group", - "With Grouping" ) assert "plots" in outputs fig = outputs["plots"][0] @@ -358,8 +359,8 @@ def test_arima_plot_without_grouping(show_figures, time_series_test_data): 0, 0, 0.5, + "With Grouping", "Group", - "With Grouping" ) assert "plots" in outputs fig = outputs["plots"][0] @@ -384,8 +385,8 @@ def test_arima_plot_invalid_train_size(time_series_test_data): 0, 0, 2, + "With Grouping", "Group", - "With Grouping" ) return @@ -406,8 +407,8 @@ def test_arima_outputs(time_series_test_data): 0, 0, 0.5, - "Group", "With Grouping", + "Group", ) assert "scores" in outputs return \ No newline at end of file diff --git a/tests/protzilla/data_analysis/test_time_series_plots.py b/tests/protzilla/data_analysis/test_time_series_plots.py index 46182092..ca3fe4fa 100644 --- a/tests/protzilla/data_analysis/test_time_series_plots.py +++ b/tests/protzilla/data_analysis/test_time_series_plots.py @@ -43,10 +43,10 @@ def time_series_test_data(): ) test_metadata_df = ( - ["Sample1", "02:00:00", 1], - ["Sample2", "06:00:00", 1], - ["Sample3", "10:00:00", 1], - ["Sample4", "14:00:00", 1], + ["Sample1", "2", 1], + ["Sample2", "6", 1], + ["Sample3", "7", 1], + ["Sample4", "10", 1], ) test_metadata_df = pd.DataFrame( data=test_metadata_df, @@ -56,7 +56,7 @@ def time_series_test_data(): def test_time_series_plot(show_figures, time_series_test_data): test_intensity, test_metadata = time_series_test_data - outputs = time_quant_plot(test_intensity, test_metadata, "Protein1") + outputs = time_quant_plot(test_intensity, test_metadata, "Time","Protein1") assert "plots" in outputs fig = outputs["plots"][0] if show_figures: @@ -66,11 +66,11 @@ def test_time_series_plot(show_figures, time_series_test_data): def test_time_series_plot_invalid_euclidean_similarity(time_series_test_data): test_intensity, test_metadata = time_series_test_data with pytest.raises(ValueError): - time_quant_plot(test_intensity, test_metadata, "Protein1", similarity=-1, similarity_measure="euclidean distance") + time_quant_plot(test_intensity, test_metadata, "Time", "Protein1", similarity=-1, similarity_measure="euclidean distance") return def test_time_series_plot_invalid_cosine_similarity(time_series_test_data): test_intensity, test_metadata = time_series_test_data with pytest.raises(ValueError): - time_quant_plot(test_intensity, test_metadata, "Protein1", similarity=2, similarity_measure="cosine similarity") + time_quant_plot(test_intensity, test_metadata, "Time","Protein1", similarity=2, similarity_measure="cosine similarity") return \ No newline at end of file From 58ec156c094845fc6912a444821853a31a3e2eb1 Mon Sep 17 00:00:00 2001 From: AK Date: Sun, 15 Sep 2024 15:24:04 +0200 Subject: [PATCH 48/52] Updated a variable name --- protzilla/data_analysis/time_series_plots.py | 12 +- .../time_series_regression_analysis.py | 124 +++++++++--------- protzilla/methods/data_analysis.py | 18 +-- protzilla/utilities/transform_dfs.py | 14 +- ui/runs/forms/data_analysis.py | 44 +++---- 5 files changed, 106 insertions(+), 106 deletions(-) diff --git a/protzilla/data_analysis/time_series_plots.py b/protzilla/data_analysis/time_series_plots.py index 236c0e5a..9250c6a9 100644 --- a/protzilla/data_analysis/time_series_plots.py +++ b/protzilla/data_analysis/time_series_plots.py @@ -18,7 +18,7 @@ def time_quant_plot( intensity_df: pd.DataFrame, metadata_df: pd.DataFrame, - time_column_name: str, + time_column: str, protein_group: str, similarity: float = 1.0, similarity_measure: str = "euclidean distance", @@ -33,7 +33,7 @@ def time_quant_plot( :param intensity_df: A dataframe in protzilla wide format, where each row represents a sample and each column represents a feature. :param metadata_df: A dataframe containing the metadata of the samples. - :param time_column_name: The name of the column in the metadata_df that contains the time information. + :param time_column: The name of the column in the metadata_df that contains the time information. :param protein_group: Protein IDs as the columnheader of the dataframe :param similarity_measure: method to compare the chosen proteingroup with all others. The two methods are "cosine similarity" and "euclidean distance". @@ -44,13 +44,13 @@ def time_quant_plot( intensity_df = pd.merge( left=intensity_df, - right=metadata_df[["Sample", time_column_name]], + right=metadata_df[["Sample", time_column]], on="Sample", copy=False, ) wide_df = intensity_df.interpolate(method='linear', axis=0) - wide_df = long_to_wide_time(wide_df, time_column_name=time_column_name) if is_long_format(wide_df, time_column_name=time_column_name) else wide_df + wide_df = long_to_wide_time(wide_df, time_column=time_column) if is_long_format(wide_df, time_column=time_column) else wide_df if protein_group not in wide_df.columns: @@ -166,14 +166,14 @@ def time_quant_plot( yaxis_gridcolor=colors["gridcolor"], xaxis_linecolor=colors["linecolor"], yaxis_linecolor=colors["linecolor"], - xaxis_title=time_column_name, + xaxis_title=time_column, yaxis_title="Intensity", legend_title="Legend", xaxis=dict( tickmode="array", tickangle=0, tickvals=wide_df.index, - ticktext=[wide_df[time_column_name].unique() for wide_df[time_column_name] in wide_df.index], + ticktext=[wide_df[time_column].unique() for wide_df[time_column] in wide_df.index], ), autosize=True, margin=dict(l=100, r=300, t=100, b=100), diff --git a/protzilla/data_analysis/time_series_regression_analysis.py b/protzilla/data_analysis/time_series_regression_analysis.py index 984754fe..143aa696 100644 --- a/protzilla/data_analysis/time_series_regression_analysis.py +++ b/protzilla/data_analysis/time_series_regression_analysis.py @@ -28,20 +28,20 @@ def time_series_linear_regression( intensity_df: pd.DataFrame, metadata_df: pd.DataFrame, - time_column_name: str, + time_column: str, train_size: float, protein_group: str, grouping: str, - grouping_column_name: str, + grouping_column: str, ): """ Perform linear regression on the time series data for a given protein group. :param intensity_df: Peptide dataframe which contains the intensity of each sample :param metadata_df: Metadata dataframe which contains the timestamps - :param time_column_name: The name of the column containing the time values + :param time_column: The name of the column containing the time values :param protein_group: Protein group to perform the analysis on :param train_size: The proportion of the dataset to include in the test split - :param grouping_column_name: The name of the column containing the grouping information + :param grouping_column: The name of the column containing the grouping information :param grouping: Option to select whether regression should be performed on the entire dataset or separately on the control and experimental groups :return: A dictionary containing the root mean squared error and r2 score for the training and test sets @@ -61,18 +61,18 @@ def time_series_linear_regression( intensity_df = intensity_df.sample(frac=1, random_state = 42).reset_index(drop=True) - X = intensity_df[[time_column_name]] + X = intensity_df[[time_column]] y = intensity_df[intensity_column_name] fig = make_subplots(rows=1, cols=2, column_widths=[0.75, 0.25], vertical_spacing=0.025) scores = [] - if grouping == "With Grouping" and grouping_column_name in intensity_df.columns: - groups = intensity_df[grouping_column_name].unique() + if grouping == "With Grouping" and grouping_column in intensity_df.columns: + groups = intensity_df[grouping_column].unique() for group in groups: - group_df = intensity_df[intensity_df[grouping_column_name] == group] - X_group = group_df[[time_column_name]] + group_df = intensity_df[intensity_df[grouping_column] == group] + X_group = group_df[[time_column]] y_group = group_df[intensity_column_name] X_train, X_test, y_train, y_test = train_test_split(X_group, y_group, train_size=train_size, shuffle=False) @@ -87,15 +87,15 @@ def time_series_linear_regression( train_r2 = r2_score(y_train, y_pred_train) test_r2 = r2_score(y_test, y_pred_test) - train_df = pd.DataFrame({time_column_name: X_train[time_column_name], 'Intensity': y_train, 'Predicted': y_pred_train, 'Type': 'Train'}) - test_df = pd.DataFrame({time_column_name: X_test[time_column_name], 'Intensity': y_test, 'Predicted': y_pred_test, 'Type': 'Test'}) + train_df = pd.DataFrame({time_column: X_train[time_column], 'Intensity': y_train, 'Predicted': y_pred_train, 'Type': 'Train'}) + test_df = pd.DataFrame({time_column: X_test[time_column], 'Intensity': y_test, 'Predicted': y_pred_test, 'Type': 'Test'}) plot_df = pd.concat([train_df, test_df]) color = PROTZILLA_DISCRETE_COLOR_SEQUENCE[color_index % len(PROTZILLA_DISCRETE_COLOR_SEQUENCE)] color_index += 5 fig.add_trace(go.Scatter( - x=plot_df[time_column_name], + x=plot_df[time_column], y=plot_df['Intensity'], mode='markers', name=f'Actual Intensity ({group})', @@ -103,7 +103,7 @@ def time_series_linear_regression( ), row=1, col=1) fig.add_trace(go.Scatter( - x=plot_df[time_column_name], + x=plot_df[time_column], y=plot_df['Predicted'], mode='lines', name=f'Predicted Intensity ({group})', @@ -132,14 +132,14 @@ def time_series_linear_regression( test_r2 = r2_score(y_test, y_pred_test) train_df = pd.DataFrame( - {time_column_name: X_train[time_column_name], 'Intensity': y_train, 'Predicted': y_pred_train, + {time_column: X_train[time_column], 'Intensity': y_train, 'Predicted': y_pred_train, 'Type': 'Train'}) test_df = pd.DataFrame( - {time_column_name: X_test[time_column_name], 'Intensity': y_test, 'Predicted': y_pred_test, 'Type': 'Test'}) + {time_column: X_test[time_column], 'Intensity': y_test, 'Predicted': y_pred_test, 'Type': 'Test'}) plot_df = pd.concat([train_df, test_df]) fig.add_trace(go.Scatter( - x=plot_df[time_column_name], + x=plot_df[time_column], y=plot_df['Intensity'], mode='markers', name='Actual Intensity', @@ -147,7 +147,7 @@ def time_series_linear_regression( ), row=1, col=1) fig.add_trace(go.Scatter( - x=plot_df[time_column_name], + x=plot_df[time_column], y=plot_df['Predicted'], mode='lines', name='Predicted Intensity', @@ -186,7 +186,7 @@ def time_series_linear_regression( yaxis_gridcolor=colors["gridcolor"], xaxis_linecolor=colors["linecolor"], yaxis_linecolor=colors["linecolor"], - xaxis_title=time_column_name, + xaxis_title=time_column, yaxis_title="Intensity", legend_title="Legend", autosize=True, @@ -214,26 +214,26 @@ def time_series_linear_regression( def time_series_ransac_regression( intensity_df: pd.DataFrame, metadata_df: pd.DataFrame, - time_column_name: str, + time_column: str, protein_group: str, max_trials: int, stop_probability: float, loss: str, train_size: float, grouping: str, - grouping_column_name: str, + grouping_column: str, ): """ Perform RANSAC regression on the time series data for a given protein group. :param intensity_df: Peptide dataframe which contains the intensity of each sample :param metadata_df: Metadata dataframe which contains the timestamps - :param time_column_name: The name of the column containing the time values + :param time_column: The name of the column containing the time values :param max_trials: The maximum number of iterations to perform :param stop_probability: The probability to stop the RANSAC algorithm :param loss: The loss function to use :param protein_group: Protein group to perform the analysis on :param train_size: The proportion of the dataset to include in the test split - :param grouping_column_name: The name of the column containing the grouping information + :param grouping_column: The name of the column containing the grouping information :param grouping: Option to select whether regression should be performed on the entire dataset or separately on the control and experimental groups :return: A dictionary containing the root mean squared error and r2 score for the training and test sets @@ -255,18 +255,18 @@ def time_series_ransac_regression( intensity_df = intensity_df.sample(frac=1, random_state = 42).reset_index(drop=True) - X = intensity_df[[time_column_name]] + X = intensity_df[[time_column]] y = intensity_df[intensity_column_name] fig = make_subplots(rows=1, cols=2, column_widths=[0.75, 0.25], vertical_spacing=0.025) scores = [] - if grouping == "With Grouping" and grouping_column_name in intensity_df.columns: - groups = intensity_df[grouping_column_name].unique() + if grouping == "With Grouping" and grouping_column in intensity_df.columns: + groups = intensity_df[grouping_column].unique() for group in groups: - group_df = intensity_df[intensity_df[grouping_column_name] == group] - X_group = group_df[[time_column_name]] + group_df = intensity_df[intensity_df[grouping_column] == group] + X_group = group_df[[time_column]] y_group = group_df[intensity_column_name] X_train, X_test, y_train, y_test = train_test_split(X_group, y_group, train_size=train_size, shuffle=False) @@ -283,15 +283,15 @@ def time_series_ransac_regression( train_r2 = r2_score(y_train[inlier_mask], y_pred_train[inlier_mask]) test_r2 = r2_score(y_test, y_pred_test) - train_df = pd.DataFrame({time_column_name: X_train[time_column_name], 'Intensity': y_train, 'Predicted': y_pred_train, 'Type': 'Train'}) - test_df = pd.DataFrame({time_column_name: X_test[time_column_name], 'Intensity': y_test, 'Predicted': y_pred_test, 'Type': 'Test'}) + train_df = pd.DataFrame({time_column: X_train[time_column], 'Intensity': y_train, 'Predicted': y_pred_train, 'Type': 'Train'}) + test_df = pd.DataFrame({time_column: X_test[time_column], 'Intensity': y_test, 'Predicted': y_pred_test, 'Type': 'Test'}) train_df['Inlier'] = inlier_mask test_df['Inlier'] = False plot_df = pd.concat([train_df, test_df]) # Add main plot traces fig.add_trace(go.Scatter( - x=plot_df[time_column_name], + x=plot_df[time_column], y=plot_df['Intensity'], mode='markers', name=f'Inliers ({group})', @@ -299,7 +299,7 @@ def time_series_ransac_regression( ), row=1, col=1) fig.add_trace(go.Scatter( - x=plot_df[time_column_name], + x=plot_df[time_column], y=plot_df['Predicted'], mode='lines', name=f'Predicted Intensity ({group})', @@ -307,7 +307,7 @@ def time_series_ransac_regression( ), row=1, col=1) fig.add_trace(go.Scatter( - x=plot_df[plot_df['Inlier'] == False][time_column_name], + x=plot_df[plot_df['Inlier'] == False][time_column], y=plot_df[plot_df['Inlier'] == False]['Intensity'], mode='markers', name='Outliers', @@ -339,15 +339,15 @@ def time_series_ransac_regression( train_r2 = r2_score(y_train[inlier_mask], y_pred_train[inlier_mask]) test_r2 = r2_score(y_test, y_pred_test) - train_df = pd.DataFrame({time_column_name: X_train[time_column_name], 'Intensity': y_train, 'Predicted': y_pred_train, 'Type': 'Train'}) - test_df = pd.DataFrame({time_column_name: X_test[time_column_name], 'Intensity': y_test, 'Predicted': y_pred_test, 'Type': 'Test'}) + train_df = pd.DataFrame({time_column: X_train[time_column], 'Intensity': y_train, 'Predicted': y_pred_train, 'Type': 'Train'}) + test_df = pd.DataFrame({time_column: X_test[time_column], 'Intensity': y_test, 'Predicted': y_pred_test, 'Type': 'Test'}) train_df['Inlier'] = inlier_mask test_df['Inlier'] = False plot_df = pd.concat([train_df, test_df]) # Add main plot traces fig.add_trace(go.Scatter( - x=plot_df[time_column_name], + x=plot_df[time_column], y=plot_df['Intensity'], mode='markers', name='Inliers', @@ -355,7 +355,7 @@ def time_series_ransac_regression( ), row=1, col=1) fig.add_trace(go.Scatter( - x=plot_df[time_column_name], + x=plot_df[time_column], y=plot_df['Predicted'], mode='lines', name='Predicted Intensity', @@ -363,7 +363,7 @@ def time_series_ransac_regression( ), row=1, col=1) fig.add_trace(go.Scatter( - x=plot_df[plot_df['Inlier'] == False][time_column_name], + x=plot_df[plot_df['Inlier'] == False][time_column], y=plot_df[plot_df['Inlier'] == False]['Intensity'], mode='markers', name='Outliers', @@ -402,7 +402,7 @@ def time_series_ransac_regression( yaxis_gridcolor=colors["gridcolor"], xaxis_linecolor=colors["linecolor"], yaxis_linecolor=colors["linecolor"], - xaxis_title=time_column_name, + xaxis_title=time_column, yaxis_title="Intensity", legend_title="Legend", autosize=True, @@ -498,24 +498,24 @@ def adfuller_test( def time_series_auto_arima( intensity_df: pd.DataFrame, metadata_df: pd.DataFrame, - time_column_name: str, + time_column: str, protein_group: str, seasonal: str, m: int, train_size: float, grouping: str, - grouping_column_name: str, + grouping_column: str, ) -> dict: """ Perform an automatic ARIMA model selection on the time series data for a given protein group. :param intensity_df: Peptide dataframe which contains the intensity of each sample :param metadata_df: Metadata dataframe which contains the timestamps - :param time_column_name: The name of the column containing the time values + :param time_column: The name of the column containing the time values :param protein_group: Protein group to perform the analysis on :param seasonal: Whether the ARIMA model should be seasonal :param m: The number of time steps for a single seasonal period (ignored if seasonal=False) :param train_size: The proportion of the dataset to include in the test split - :param grouping_column_name: The name of the column containing the grouping information + :param grouping_column: The name of the column containing the grouping information :param grouping: Whether to group the data by the 'Group' column :return: A dictionary containing the root mean squared error and r2 score for the training and test sets @@ -545,16 +545,16 @@ def time_series_auto_arima( fig = make_subplots(rows=1, cols=2, column_widths=[0.7, 0.3]) scores = [] - if grouping == "With Grouping" and grouping_column_name in intensity_df.columns: - groups = intensity_df[grouping_column_name].unique() + if grouping == "With Grouping" and grouping_column in intensity_df.columns: + groups = intensity_df[grouping_column].unique() for group in groups: - group_df = intensity_df[intensity_df[grouping_column_name] == group] + group_df = intensity_df[intensity_df[grouping_column] == group] train_df_size = int(len(group_df) * train_size) train_df, test_df = group_df[:train_df_size], group_df[train_df_size:] - train_df = train_df.set_index(time_column_name)[intensity_column_name] - test_df = test_df.set_index(time_column_name)[intensity_column_name] + train_df = train_df.set_index(time_column)[intensity_column_name] + test_df = test_df.set_index(time_column)[intensity_column_name] # Fit the ARIMA model model = auto_arima( @@ -634,8 +634,8 @@ def time_series_auto_arima( train_size = int(len(intensity_df) * train_size) train_df, test_df = intensity_df[:train_size], intensity_df[train_size:] - train_df = train_df.set_index(time_column_name)[intensity_column_name] - test_df = test_df.set_index(time_column_name)[intensity_column_name] + train_df = train_df.set_index(time_column)[intensity_column_name] + test_df = test_df.set_index(time_column)[intensity_column_name] # Fit the ARIMA model model = auto_arima( @@ -734,7 +734,7 @@ def time_series_auto_arima( yaxis_gridcolor=colors["gridcolor"], xaxis_linecolor=colors["linecolor"], yaxis_linecolor=colors["linecolor"], - xaxis_title=time_column_name, + xaxis_title=time_column, yaxis_title="Intensity", legend_title="Legend", autosize=True, @@ -762,7 +762,7 @@ def time_series_auto_arima( def time_series_arima( intensity_df: pd.DataFrame, metadata_df: pd.DataFrame, - time_column_name: str, + time_column: str, protein_group: str, seasonal: str, p: int, @@ -774,14 +774,14 @@ def time_series_arima( s: int, train_size: float, grouping: str, - grouping_column_name: str, + grouping_column: str, ) -> dict: """ Perform ARIMA model selection on the time series data for a given protein group. :param intensity_df: Peptide dataframe which contains the intensity of each sample :param metadata_df: Metadata dataframe which contains the timestamps - :param time_column_name: The name of the column containing the time values + :param time_column: The name of the column containing the time values :param protein_group: Protein group to perform the analysis on :param seasonal: Whether the ARIMA model should be seasonal :param p: ARIMA p parameter @@ -792,7 +792,7 @@ def time_series_arima( :param Q: ARIMA seasonal Q parameter :param s: ARIMA seasonal s parameter :param train_size: The proportion of the dataset to include in the test split - :param grouping_column_name: The name of the column containing the grouping information + :param grouping_column: The name of the column containing the grouping information :param grouping: Whether to group the data by the 'Group' column :return: A dictionary containing the root mean squared error and r2 score for the training and test sets @@ -812,16 +812,16 @@ def time_series_arima( fig = make_subplots(rows=1, cols=2, column_widths=[0.7, 0.3]) scores = [] - if grouping == "With Grouping" and grouping_column_name in intensity_df.columns: - groups = intensity_df[grouping_column_name].unique() + if grouping == "With Grouping" and grouping_column in intensity_df.columns: + groups = intensity_df[grouping_column].unique() for group in groups: - group_df = intensity_df[intensity_df[grouping_column_name] == group] + group_df = intensity_df[intensity_df[grouping_column] == group] train_df_size = int(len(group_df) * train_size) train_df, test_df = group_df[:train_df_size], group_df[train_df_size:] - train_df = train_df.set_index(time_column_name)[intensity_column_name] - test_df = test_df.set_index(time_column_name)[intensity_column_name] + train_df = train_df.set_index(time_column)[intensity_column_name] + test_df = test_df.set_index(time_column)[intensity_column_name] if seasonal == "Yes": model = ARIMA( @@ -886,8 +886,8 @@ def time_series_arima( train_size = int(len(intensity_df) * train_size) train_df, test_df = intensity_df[:train_size], intensity_df[train_size:] - train_df = train_df.set_index(time_column_name)[intensity_column_name] - test_df = test_df.set_index(time_column_name)[intensity_column_name] + train_df = train_df.set_index(time_column)[intensity_column_name] + test_df = test_df.set_index(time_column)[intensity_column_name] if seasonal == "Yes": model = ARIMA( @@ -966,7 +966,7 @@ def time_series_arima( yaxis_gridcolor=colors["gridcolor"], xaxis_linecolor=colors["linecolor"], yaxis_linecolor=colors["linecolor"], - xaxis_title=time_column_name, + xaxis_title=time_column, yaxis_title="Intensity", legend_title="Legend", autosize=True, diff --git a/protzilla/methods/data_analysis.py b/protzilla/methods/data_analysis.py index 3201cc44..4a1f96d5 100644 --- a/protzilla/methods/data_analysis.py +++ b/protzilla/methods/data_analysis.py @@ -785,7 +785,7 @@ class PlotTimeQuant(PlotStep): input_keys = [ "intensity_df", "metadata_df", - "time_column_name", + "time_column", "protein_group", "similarity_measure", "similarity" @@ -812,11 +812,11 @@ class TimeSeriesLinearRegression(PlotStep): input_keys = [ "intensity_df", "metadata_df", - "time_column_name", + "time_column", "protein_group", "train_size", "grouping", - "grouping_column_name", + "grouping_column", ] output_keys = [ "scores", @@ -839,14 +839,14 @@ class TimeSeriesRANSACRegression(PlotStep): input_keys = [ "intensity_df", "metadata_df", - "time_column_name", + "time_column", "protein_group", "max_trials", "stop_probability", "loss", "train_size", "grouping", - "grouping_column_name", + "grouping_column", ] output_keys = [ "scores", @@ -905,13 +905,13 @@ class TimeSeriesAutoARIMA(PlotStep): input_keys = [ "intensity_df", "metadata_df", - "time_column_name", + "time_column", "protein_group", "seasonal", "m", "train_size", "grouping", - "grouping_column_name", + "grouping_column", ] output_keys = [ "scores", @@ -936,7 +936,7 @@ class TimeSeriesARIMA(PlotStep): input_keys = [ "intensity_df", "metadata_df", - "time_column_name", + "time_column", "protein_group", "seasonal", "p", @@ -948,7 +948,7 @@ class TimeSeriesARIMA(PlotStep): "s", "train_size", "grouping", - "grouping_column_name", + "grouping_column", ] output_keys = [ "scores", diff --git a/protzilla/utilities/transform_dfs.py b/protzilla/utilities/transform_dfs.py index 1c3b7fbe..fdb931e7 100644 --- a/protzilla/utilities/transform_dfs.py +++ b/protzilla/utilities/transform_dfs.py @@ -24,7 +24,7 @@ def long_to_wide(intensity_df: pd.DataFrame, value_name: str | None = None): ) -def long_to_wide_time(intensity_df: pd.DataFrame, value_name: str = None, time_column_name: str = None): +def long_to_wide_time(intensity_df: pd.DataFrame, value_name: str = None, time_column: str = None): """ This function transforms the dataframe to a wide format that can be more easily handled by packages such as sklearn. @@ -34,17 +34,17 @@ def long_to_wide_time(intensity_df: pd.DataFrame, value_name: str = None, time_c long format :type intensity_df: pd.DataFrame :param value_name: the name of the column in the metadata_df that contains the intensity information. - :param time_column_name: the name of the column in the metadata_df that contains the time information. + :param time_column: the name of the column in the metadata_df that contains the time information. :return: returns dataframe in wide format suitable for use by packages such as sklearn :rtype: pd.DataFrame """ - if intensity_df.duplicated(subset=[time_column_name, "Protein ID"]).any(): - intensity_df = intensity_df.groupby([time_column_name, "Protein ID"]).mean().reset_index() + if intensity_df.duplicated(subset=[time_column, "Protein ID"]).any(): + intensity_df = intensity_df.groupby([time_column, "Protein ID"]).mean().reset_index() values_name = default_intensity_column(intensity_df) if value_name is None else value_name intensity_df = pd.pivot( - intensity_df, index=time_column_name, columns="Protein ID", values=values_name + intensity_df, index=time_column, columns="Protein ID", values=values_name ) intensity_df = intensity_df.fillna(intensity_df.mean()) return intensity_df @@ -84,9 +84,9 @@ def wide_to_long(wide_df: pd.DataFrame, original_long_df: pd.DataFrame): return intensity_df -def is_long_format(df: pd.DataFrame, time_column_name: str = None): +def is_long_format(df: pd.DataFrame, time_column: str = None): required_columns = {"Sample", "Protein ID"} - additional_columns = {"Gene", time_column_name} + additional_columns = {"Gene", time_column} return required_columns.issubset(df.columns) and any(col in df.columns for col in additional_columns) diff --git a/ui/runs/forms/data_analysis.py b/ui/runs/forms/data_analysis.py index 4cdc84fd..5f4e18e8 100644 --- a/ui/runs/forms/data_analysis.py +++ b/ui/runs/forms/data_analysis.py @@ -1170,7 +1170,7 @@ class PlotTimeQuantForm(MethodForm): choices=[], label="Choose dataframe to be plotted", ) - time_column_name = CustomChoiceField(choices=[], label="Time: The column name from metadata that represents time") + time_column = CustomChoiceField(choices=[], label="Time: The column name from metadata that represents time") protein_group = CustomChoiceField( choices=[], label="Protein group: choose highlighted protein group", @@ -1193,7 +1193,7 @@ def fill_form(self, run: Run) -> None: "intensity_df", self.fields["intensity_df"].choices[0][0] ) self.fields[ - "time_column_name" + "time_column" ].choices = fill_helper.get_choices_for_metadata_non_sample_columns(run) self.fields["protein_group"].choices = fill_helper.to_choices( @@ -1246,7 +1246,7 @@ class TimeSeriesLinearRegressionForm(MethodForm): choices=[], label="Intensity dataframe", ) - time_column_name = CustomChoiceField(choices=[], label="Time: The column name from metadata that represents time") + time_column = CustomChoiceField(choices=[], label="Time: The column name from metadata that represents time") protein_group = CustomChoiceField( choices=[], label="Protein group: which protein group to perform the linear regression on", @@ -1263,7 +1263,7 @@ class TimeSeriesLinearRegressionForm(MethodForm): label="Option to select whether regression should be performed on the entire dataset or separately on the control and experimental groups", initial=TimeSeriesGrouping.with_grouping ) - grouping_column_name = CustomChoiceField(choices=[], label="Grouping from metadata: The column name from metadata that represents the grouping") + grouping_column = CustomChoiceField(choices=[], label="Grouping from metadata: The column name from metadata that represents the grouping") def fill_form(self, run: Run) -> None: @@ -1274,11 +1274,11 @@ def fill_form(self, run: Run) -> None: "intensity_df", self.fields["intensity_df"].choices[0][0] ) self.fields[ - "time_column_name" + "time_column" ].choices = fill_helper.get_choices_for_metadata_non_sample_columns(run) self.fields[ - "grouping_column_name" + "grouping_column" ].choices = fill_helper.get_choices_for_metadata_non_sample_columns(run) self.fields["protein_group"].choices = fill_helper.to_choices( @@ -1290,7 +1290,7 @@ def fill_form(self, run: Run) -> None: ) grouping = self.data.get("grouping") if grouping == "Without Grouping": - self.toggle_visibility("grouping_column_name", False) + self.toggle_visibility("grouping_column", False) class TimeSeriesRANSACRegressionForm(MethodForm): @@ -1299,7 +1299,7 @@ class TimeSeriesRANSACRegressionForm(MethodForm): choices=[], label="Intensity dataframe", ) - time_column_name = CustomChoiceField(choices=[], label="Time: The column name from metadata that represents time") + time_column = CustomChoiceField(choices=[], label="Time: The column name from metadata that represents time") protein_group = CustomChoiceField( choices=[], label="Protein group: which protein group to perform the RANSAC regression on", @@ -1334,7 +1334,7 @@ class TimeSeriesRANSACRegressionForm(MethodForm): label="Option to select whether regression should be performed on the entire dataset or separately on the control and experimental groups", initial=TimeSeriesGrouping.with_grouping ) - grouping_column_name = CustomChoiceField(choices=[], label="Grouping from metadata: The column name from metadata that represents the grouping") + grouping_column = CustomChoiceField(choices=[], label="Grouping from metadata: The column name from metadata that represents the grouping") def fill_form(self, run: Run) -> None: @@ -1345,11 +1345,11 @@ def fill_form(self, run: Run) -> None: "intensity_df", self.fields["intensity_df"].choices[0][0] ) self.fields[ - "time_column_name" + "time_column" ].choices = fill_helper.get_choices_for_metadata_non_sample_columns(run) self.fields[ - "grouping_column_name" + "grouping_column" ].choices = fill_helper.get_choices_for_metadata_non_sample_columns(run) self.fields["protein_group"].choices = fill_helper.to_choices( @@ -1361,7 +1361,7 @@ def fill_form(self, run: Run) -> None: ) grouping = self.data.get("grouping") if grouping == "Without Grouping": - self.toggle_visibility("grouping_column_name", False) + self.toggle_visibility("grouping_column", False) class TimeSeriesADFullerTestForm(MethodForm): @@ -1402,7 +1402,7 @@ class TimeSeriesAutoARIMAForm(MethodForm): choices=[], label="Intensity dataframe", ) - time_column_name = CustomChoiceField(choices=[], label="Time: The column name from metadata that represents time") + time_column = CustomChoiceField(choices=[], label="Time: The column name from metadata that represents time") protein_group = CustomChoiceField( choices=[], label="Protein group: which protein group to perform the AutoARIMA on", @@ -1430,7 +1430,7 @@ class TimeSeriesAutoARIMAForm(MethodForm): label="Option to select whether regression should be performed on the entire dataset or separately on the control and experimental groups", initial=TimeSeriesGrouping.with_grouping ) - grouping_column_name = CustomChoiceField(choices=[], label="Grouping from metadata: The column name from metadata that represents the grouping") + grouping_column = CustomChoiceField(choices=[], label="Grouping from metadata: The column name from metadata that represents the grouping") def fill_form(self, run: Run) -> None: @@ -1441,11 +1441,11 @@ def fill_form(self, run: Run) -> None: "intensity_df", self.fields["intensity_df"].choices[0][0] ) self.fields[ - "time_column_name" + "time_column" ].choices = fill_helper.get_choices_for_metadata_non_sample_columns(run) self.fields[ - "grouping_column_name" + "grouping_column" ].choices = fill_helper.get_choices_for_metadata_non_sample_columns(run) self.fields["protein_group"].choices = fill_helper.to_choices( @@ -1457,7 +1457,7 @@ def fill_form(self, run: Run) -> None: ) grouping = self.data.get("grouping") if grouping == "Without Grouping": - self.toggle_visibility("grouping_column_name", False) + self.toggle_visibility("grouping_column", False) class TimeSeriesARIMAForm(MethodForm): @@ -1466,7 +1466,7 @@ class TimeSeriesARIMAForm(MethodForm): choices=[], label="Intensity dataframe", ) - time_column_name = CustomChoiceField(choices=[], label="Time: The column name from metadata that represents time") + time_column = CustomChoiceField(choices=[], label="Time: The column name from metadata that represents time") protein_group = CustomChoiceField( choices=[], label="Protein group: which protein group to perform the AutoARIMA on", @@ -1534,7 +1534,7 @@ class TimeSeriesARIMAForm(MethodForm): label="Option to select whether regression should be performed on the entire dataset or separately on the control and experimental groups", initial=TimeSeriesGrouping.with_grouping ) - grouping_column_name = CustomChoiceField(choices=[], label="Grouping from metadata: The column name from metadata that represents the grouping") + grouping_column = CustomChoiceField(choices=[], label="Grouping from metadata: The column name from metadata that represents the grouping") def fill_form(self, run: Run) -> None: @@ -1545,11 +1545,11 @@ def fill_form(self, run: Run) -> None: "intensity_df", self.fields["intensity_df"].choices[0][0] ) self.fields[ - "time_column_name" + "time_column" ].choices = fill_helper.get_choices_for_metadata_non_sample_columns(run) self.fields[ - "grouping_column_name" + "grouping_column" ].choices = fill_helper.get_choices_for_metadata_non_sample_columns(run) self.fields["protein_group"].choices = fill_helper.to_choices( @@ -1561,7 +1561,7 @@ def fill_form(self, run: Run) -> None: ) grouping = self.data.get("grouping") if grouping == "Without Grouping": - self.toggle_visibility("grouping_column_name", False) + self.toggle_visibility("grouping_column", False) seasonal = self.data.get("seasonal") if seasonal == "No": self.toggle_visibility("P", False) From 8dda742f9551eab2c055c7c9e32f050b1144de97 Mon Sep 17 00:00:00 2001 From: AK Date: Tue, 24 Sep 2024 14:03:29 +0200 Subject: [PATCH 49/52] Updated some methods --- protzilla/data_analysis/time_series_plots.py | 12 +- .../time_series_regression_analysis.py | 304 ++++++++++-------- protzilla/methods/data_analysis.py | 1 + ui/runs/forms/data_analysis.py | 4 + user_data/workflows/standard.yaml | 14 +- ...uganash.yaml => workflow_Kuganash-BA.yaml} | 54 ++-- user_data/workflows/workflow_Plot-Thesis.yaml | 67 ---- 7 files changed, 201 insertions(+), 255 deletions(-) rename user_data/workflows/{workflow_BA_Kuganash.yaml => workflow_Kuganash-BA.yaml} (71%) delete mode 100644 user_data/workflows/workflow_Plot-Thesis.yaml diff --git a/protzilla/data_analysis/time_series_plots.py b/protzilla/data_analysis/time_series_plots.py index 9250c6a9..3c5f8059 100644 --- a/protzilla/data_analysis/time_series_plots.py +++ b/protzilla/data_analysis/time_series_plots.py @@ -4,9 +4,9 @@ from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances from protzilla.utilities.transform_dfs import is_long_format, long_to_wide_time +from protzilla.constants.colors import PROTZILLA_DISCRETE_COLOR_SEQUENCE # Define color constants -PROTZILLA_DISCRETE_COLOR_SEQUENCE = ["#636EFA", "#EF553B", "#00CC96", "#AB63FA", "#19D3F3", "#E763FA", "#FECB52", "#FFA15A", "#FF6692", "#B6E880"] colors = { "plot_bgcolor": "white", "gridcolor": "#F1F1F1", @@ -68,7 +68,7 @@ def time_quant_plot( color_mapping = { "A": PROTZILLA_DISCRETE_COLOR_SEQUENCE[0], - "C": PROTZILLA_DISCRETE_COLOR_SEQUENCE[1], + "C": PROTZILLA_DISCRETE_COLOR_SEQUENCE[4], } lower_upper_x = [] @@ -122,7 +122,7 @@ def time_quant_plot( y=wide_df[group], mode="lines", name=group[:15] + "..." if len(group) > 15 else group, - line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[1]), + line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[9]), showlegend=len(similar_groups) <= 7, ) ) @@ -133,7 +133,7 @@ def time_quant_plot( x=[None], y=[None], mode="lines", - line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[1]), + line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[9]), name="Similar Protein Groups", ) ) @@ -147,7 +147,7 @@ def time_quant_plot( y=wide_df[protein_group], mode="lines", name=formatted_protein_name, - line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[2]), + line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[4]), ) ) fig.add_trace( @@ -155,7 +155,7 @@ def time_quant_plot( x=[None], y=[None], mode="markers", - marker=dict(color=color_mapping.get("A")), + marker=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[0]), name="Intensity", ) ) diff --git a/protzilla/data_analysis/time_series_regression_analysis.py b/protzilla/data_analysis/time_series_regression_analysis.py index 143aa696..b8aebce3 100644 --- a/protzilla/data_analysis/time_series_regression_analysis.py +++ b/protzilla/data_analysis/time_series_regression_analysis.py @@ -46,6 +46,7 @@ def time_series_linear_regression( :return: A dictionary containing the root mean squared error and r2 score for the training and test sets """ + messages = [] color_index = 0 if train_size < 0 or train_size > 1: raise ValueError("Test size should be between 0 and 1") @@ -64,7 +65,7 @@ def time_series_linear_regression( X = intensity_df[[time_column]] y = intensity_df[intensity_column_name] - fig = make_subplots(rows=1, cols=2, column_widths=[0.75, 0.25], vertical_spacing=0.025) + fig = go.Figure() scores = [] @@ -100,7 +101,8 @@ def time_series_linear_regression( mode='markers', name=f'Actual Intensity ({group})', marker=dict(color=color) - ), row=1, col=1) + ) + ) fig.add_trace(go.Scatter( x=plot_df[time_column], @@ -108,7 +110,8 @@ def time_series_linear_regression( mode='lines', name=f'Predicted Intensity ({group})', line=dict(color=color) - ), row=1, col=1) + ) + ) scores.append({ 'group': group, @@ -144,7 +147,8 @@ def time_series_linear_regression( mode='markers', name='Actual Intensity', marker=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[0]) - ), row=1, col=1) + ) + ) fig.add_trace(go.Scatter( x=plot_df[time_column], @@ -152,7 +156,8 @@ def time_series_linear_regression( mode='lines', name='Predicted Intensity', line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[5]) - ), row=1, col=1) + ) + ) scores.append({ 'group': 'Overall', @@ -170,15 +175,6 @@ def time_series_linear_regression( for res in scores ]) - fig.add_trace(go.Scatter( - x=[0], - y=[0.25], - text=[annotation_text], - mode='text', - textfont=dict(size=12), - showlegend=False - ), row=1, col=2) - fig.update_layout( title=f"Intensity over Time for {protein_group}", plot_bgcolor=colors["plot_bgcolor"], @@ -190,24 +186,26 @@ def time_series_linear_regression( yaxis_title="Intensity", legend_title="Legend", autosize=True, - margin=dict(l=100, r=100, t=100, b=50), + margin=dict(l=100, r=300, t=100, b=100), legend=dict( - yanchor="top", - y=0.95, - xanchor="right", - x=0.8 + y=1.05, + x=1, + bgcolor = "rgba(255, 255, 255, 0.5)", + orientation = "v", ) ) - # Hide x-axis of the annotation subplot - fig.update_xaxes(showticklabels=False, showgrid=False, zeroline=False, row=1, col=2) - fig.update_yaxes(showticklabels=False, showgrid=False, zeroline=False, row=1, col=2) - - fig.update_annotations(font_size=12) + messages.append( + { + "level": logging.INFO, + "msg": annotation_text, + } + ) return dict( scores=scores, plots=[fig], + messages=messages, ) @@ -238,7 +236,7 @@ def time_series_ransac_regression( :return: A dictionary containing the root mean squared error and r2 score for the training and test sets """ - + messages = [] color_index = 0 if train_size < 0 or train_size > 1: raise ValueError("Test size should be between 0 and 1") @@ -258,7 +256,7 @@ def time_series_ransac_regression( X = intensity_df[[time_column]] y = intensity_df[intensity_column_name] - fig = make_subplots(rows=1, cols=2, column_widths=[0.75, 0.25], vertical_spacing=0.025) + fig = go.Figure() scores = [] @@ -273,30 +271,38 @@ def time_series_ransac_regression( model = RANSACRegressor(max_trials = max_trials, stop_probability = stop_probability, loss = loss, base_estimator=LinearRegression()) model.fit(X_train, y_train) - inlier_mask = model.inlier_mask_ - y_pred_train = model.predict(X_train) y_pred_test = model.predict(X_test) - train_rmse = np.sqrt(mean_squared_error(y_train[inlier_mask], y_pred_train[inlier_mask])) - test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test)) - train_r2 = r2_score(y_train[inlier_mask], y_pred_train[inlier_mask]) - test_r2 = r2_score(y_test, y_pred_test) + inlier_mask_train = model.inlier_mask_ - train_df = pd.DataFrame({time_column: X_train[time_column], 'Intensity': y_train, 'Predicted': y_pred_train, 'Type': 'Train'}) - test_df = pd.DataFrame({time_column: X_test[time_column], 'Intensity': y_test, 'Predicted': y_pred_test, 'Type': 'Test'}) - train_df['Inlier'] = inlier_mask - test_df['Inlier'] = False + # Predict the inliers for the test set + test_inlier_mask = model.predict( + X_test) == y_pred_test + + train_rmse = np.sqrt(mean_squared_error(y_train[inlier_mask_train], y_pred_train[inlier_mask_train])) + test_rmse = np.sqrt(mean_squared_error(y_test[test_inlier_mask], y_pred_test[test_inlier_mask])) + train_r2 = r2_score(y_train[inlier_mask_train], y_pred_train[inlier_mask_train]) + test_r2 = r2_score(y_test[test_inlier_mask], y_pred_test[test_inlier_mask]) + + # Prepare DataFrames for plotting + train_df = pd.DataFrame( + {time_column: X_train[time_column], 'Intensity': y_train, 'Predicted': y_pred_train, 'Type': 'Train'}) + test_df = pd.DataFrame( + {time_column: X_test[time_column], 'Intensity': y_test, 'Predicted': y_pred_test, 'Type': 'Test'}) + train_df['Inlier'] = inlier_mask_train + test_df['Inlier'] = test_inlier_mask plot_df = pd.concat([train_df, test_df]) # Add main plot traces fig.add_trace(go.Scatter( - x=plot_df[time_column], - y=plot_df['Intensity'], + x=plot_df[plot_df['Inlier'] == True][time_column], + y=plot_df[plot_df['Inlier'] == True]['Intensity'], mode='markers', name=f'Inliers ({group})', marker=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[color_index]) - ), row=1, col=1) + ) + ) fig.add_trace(go.Scatter( x=plot_df[time_column], @@ -304,7 +310,8 @@ def time_series_ransac_regression( mode='lines', name=f'Predicted Intensity ({group})', line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[color_index + 2]) - ), row=1, col=1) + ) + ) fig.add_trace(go.Scatter( x=plot_df[plot_df['Inlier'] == False][time_column], @@ -312,7 +319,8 @@ def time_series_ransac_regression( mode='markers', name='Outliers', marker=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[color_index + 4]) - ), row=1, col=1) + ) + ) color_index += 5 @@ -329,30 +337,38 @@ def time_series_ransac_regression( model = RANSACRegressor(base_estimator=LinearRegression()) model.fit(X_train, y_train) - inlier_mask = model.inlier_mask_ y_pred_train = model.predict(X_train) y_pred_test = model.predict(X_test) - train_rmse = np.sqrt(mean_squared_error(y_train[inlier_mask], y_pred_train[inlier_mask])) - test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test)) - train_r2 = r2_score(y_train[inlier_mask], y_pred_train[inlier_mask]) - test_r2 = r2_score(y_test, y_pred_test) + inlier_mask_train = model.inlier_mask_ - train_df = pd.DataFrame({time_column: X_train[time_column], 'Intensity': y_train, 'Predicted': y_pred_train, 'Type': 'Train'}) - test_df = pd.DataFrame({time_column: X_test[time_column], 'Intensity': y_test, 'Predicted': y_pred_test, 'Type': 'Test'}) - train_df['Inlier'] = inlier_mask - test_df['Inlier'] = False + # Predict the inliers for the test set + test_inlier_mask = model.predict(X_test) == y_pred_test + + train_rmse = np.sqrt(mean_squared_error(y_train[inlier_mask_train], y_pred_train[inlier_mask_train])) + test_rmse = np.sqrt(mean_squared_error(y_test[test_inlier_mask], y_pred_test[test_inlier_mask])) + train_r2 = r2_score(y_train[inlier_mask_train], y_pred_train[inlier_mask_train]) + test_r2 = r2_score(y_test[test_inlier_mask], y_pred_test[test_inlier_mask]) + + # Prepare DataFrames for plotting + train_df = pd.DataFrame( + {time_column: X_train[time_column], 'Intensity': y_train, 'Predicted': y_pred_train, 'Type': 'Train'}) + test_df = pd.DataFrame( + {time_column: X_test[time_column], 'Intensity': y_test, 'Predicted': y_pred_test, 'Type': 'Test'}) + train_df['Inlier'] = inlier_mask_train + test_df['Inlier'] = test_inlier_mask plot_df = pd.concat([train_df, test_df]) # Add main plot traces fig.add_trace(go.Scatter( - x=plot_df[time_column], - y=plot_df['Intensity'], + x=plot_df[plot_df['Inlier'] == True][time_column], + y=plot_df[plot_df['Inlier'] == True]['Intensity'], mode='markers', name='Inliers', marker=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[0]) - ), row=1, col=1) + ) + ) fig.add_trace(go.Scatter( x=plot_df[time_column], @@ -360,7 +376,8 @@ def time_series_ransac_regression( mode='lines', name='Predicted Intensity', line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[0]) - ), row=1, col=1) + ) + ) fig.add_trace(go.Scatter( x=plot_df[plot_df['Inlier'] == False][time_column], @@ -368,7 +385,8 @@ def time_series_ransac_regression( mode='markers', name='Outliers', marker=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[3]) - ), row=1, col=1) + ) + ) scores.append({ 'group': 'Overall', @@ -386,15 +404,6 @@ def time_series_ransac_regression( for res in scores ]) - fig.add_trace(go.Scatter( - x=[0], - y=[0.25], - text=[annotation_text], - mode='text', - textfont=dict(size=12), - showlegend=False - ), row=1, col=2) - fig.update_layout( title=f"Intensity over Time for {protein_group}", plot_bgcolor=colors["plot_bgcolor"], @@ -408,36 +417,40 @@ def time_series_ransac_regression( autosize=True, margin=dict(l=100, r=100, t=100, b=50), legend=dict( - yanchor="top", - y=0.95, - xanchor="right", - x=0.8 - ) + x=1.05, + y=1, + bgcolor="rgba(255, 255, 255, 0.5)", + orientation="v", + ), ) - # Hide x-axis of the annotation subplot - fig.update_xaxes(showticklabels=False, showgrid=False, zeroline=False, row=1, col=2) - fig.update_yaxes(showticklabels=False, showgrid=False, zeroline=False, row=1, col=2) - - fig.update_annotations(font_size=12) + messages.append( + { + "level": logging.INFO, + "msg": annotation_text, + } + ) return dict( scores=scores, plots=[fig], + messages=messages ) def adfuller_test( - intensity_df: pd.DataFrame, - metadata_df: pd.DataFrame, - protein_group: str, - alpha: float = 0.05, + intensity_df: pd.DataFrame, + metadata_df: pd.DataFrame, + time_column: str, + protein_group: str, + alpha: float = 0.05, ) -> dict: """ Perform the Augmented Dickey-Fuller test to check for stationarity in a time series. :param intensity_df: The dataframe containing the time series data. :param metadata_df: The dataframe containing the metadata. :param protein_group: The protein group to perform the test on. + :param time_column: The column representing time (e.g., 'visit', 'timepoint'). :param alpha: The significance level for the test (default is 0.05). :return: A dictionary containing: @@ -449,20 +462,27 @@ def adfuller_test( """ messages = [] + # Filter for the specific protein group intensity_df = intensity_df[intensity_df['Protein ID'] == protein_group] intensity_column_name = default_intensity_column(intensity_df) - intensity_df = pd.merge( - left=intensity_df, - right=metadata_df, + # Merge with metadata to include time information + merged_df = pd.merge( + left=intensity_df[["Sample", intensity_column_name]], + right=metadata_df[["Sample", time_column]], on="Sample", copy=False, ) - intensity_df = intensity_df[intensity_column_name].dropna() + # Sort the data by time to ensure it is treated as a time series + merged_df = merged_df.sort_values(by=time_column) + grouped_df = merged_df.groupby(time_column)[intensity_column_name].mean().reset_index() + + # Extract the time series (after aggregation) + time_series = grouped_df[intensity_column_name].dropna() # Perform the ADF test - result = adfuller(intensity_df) + result = adfuller(time_series) test_statistic = result[0] p_value = result[1] critical_values = result[4] @@ -521,8 +541,8 @@ def time_series_auto_arima( :return: A dictionary containing the root mean squared error and r2 score for the training and test sets """ - color_index = 0 messages = [] + color_index = 0 if train_size < 0 or train_size > 1: raise ValueError("Train size should be between 0 and 1") @@ -542,7 +562,7 @@ def time_series_auto_arima( copy=False, ) - fig = make_subplots(rows=1, cols=2, column_widths=[0.7, 0.3]) + fig = go.Figure() scores = [] if grouping == "With Grouping" and grouping_column in intensity_df.columns: @@ -586,23 +606,26 @@ def time_series_auto_arima( mode='markers', name=f'Actual Intensity ({group})', marker=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[color_index]) - ), row=1, col=1) + ) + ) fig.add_trace(go.Scatter( x=test_df.index, y=forecast, mode='markers', name=f'Predicted Intensity ({group})', - line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[color_index + 3]) - ), row=1, col=1) + line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[color_index + 4]) + ) + ) fig.add_trace(go.Scatter( x = forecast_plot.index, y = forecast_plot, mode = 'lines', name = f'Mean Predicted Intensity ({group})', - line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[color_index + 3]) - ), row=1, col=1) + line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[color_index + 4]) + ) + ) color_index += 5 @@ -683,24 +706,27 @@ def time_series_auto_arima( y=test_df, mode='markers', name='Actual Intensity', - marker=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[0]) - ), row=1, col=1) + marker=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[0]) + ) + ) fig.add_trace(go.Scatter( x=test_df.index, y=forecast, mode='markers', name='Predicted Intensity', - line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[2]) - ), row=1, col=1) + line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[3]) + ) + ) fig.add_trace(go.Scatter( x=forecast_plot.index, y=forecast_plot, mode='lines', name='Mean Predicted Intensity', - line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[3]) - ), row=1, col=1) + line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[4]) + ) + ) scores.append({ 'group': 'Overall', @@ -718,14 +744,6 @@ def time_series_auto_arima( for res in scores ]) - fig.add_trace(go.Scatter( - x=[0], - y=[0.25], - text=[annotation_text], - mode='text', - textfont=dict(size=12), - showlegend=False - ), row=1, col=2) fig.update_layout( title=f"Intensity over Time for {protein_group}", @@ -740,17 +758,19 @@ def time_series_auto_arima( autosize=True, margin=dict(l=100, r=100, t=100, b=50), legend=dict( - yanchor="top", - y=0.95, - xanchor="right", - x=0.775 - ) + x=1.05, + y=1, + bgcolor="rgba(255, 255, 255, 0.5)", + orientation="v", + ), ) - fig.update_xaxes(showticklabels=False, showgrid=False, zeroline=False, row=1, col=2) - fig.update_yaxes(showticklabels=False, showgrid=False, zeroline=False, row=1, col=2) - - fig.update_annotations(font_size=12) + messages.append( + { + "level": logging.INFO, + "msg": annotation_text, + } + ) return dict( scores=scores, @@ -797,7 +817,7 @@ def time_series_arima( :return: A dictionary containing the root mean squared error and r2 score for the training and test sets """ - + messages = [] color_index = 0 if train_size < 0 or train_size > 1: @@ -809,7 +829,7 @@ def time_series_arima( intensity_df = pd.merge(left=intensity_df, right=metadata_df, on="Sample", copy=False) - fig = make_subplots(rows=1, cols=2, column_widths=[0.7, 0.3]) + fig = go.Figure() scores = [] if grouping == "With Grouping" and grouping_column in intensity_df.columns: @@ -854,23 +874,26 @@ def time_series_arima( mode='markers', name=f'Actual Intensity ({group})', marker=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[color_index]) - ), row=1, col=1) + ) + ) fig.add_trace(go.Scatter( x=forecast_plot.index, y=forecast_plot, mode='markers', name= f'Predicted Intensity ({group})', - line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[color_index + 2]) - ), row=1, col=1) + line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[color_index + 4]) + ) + ) fig.add_trace(go.Scatter( x = forecast_mean_plot.index, y = forecast_mean_plot, mode = 'lines', name = f'Mean Predicted Intensity ({group})', - line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[color_index + 2]) - ), row=1, col=1) + line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[color_index + 4]) + ) + ) color_index += 5 @@ -917,15 +940,17 @@ def time_series_arima( mode='markers', name='Actual Intensity', marker=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[0]) - ), row=1, col=1) + ) + ) fig.add_trace(go.Scatter( x=test_df.index, y=forecast, mode='markers', name='Predicted Intensity', - line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[2]) - ), row=1, col=1) + line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[3]) + ) + ) fig.add_trace(go.Scatter( x=forecast_plot.index, @@ -933,7 +958,8 @@ def time_series_arima( mode='lines', name='Mean Predicted Intensity', line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[4]) - ), row=1, col=1) + ) + ) scores.append({ 'group': 'Overall', @@ -950,15 +976,6 @@ def time_series_arima( for res in scores ]) - fig.add_trace(go.Scatter( - x=[0], - y=[0.25], - text=[annotation_text], - mode='text', - textfont=dict(size=12), - showlegend=False - ), row=1, col=2) - fig.update_layout( title=f"Intensity over Time for {protein_group}", plot_bgcolor=colors["plot_bgcolor"], @@ -972,19 +989,22 @@ def time_series_arima( autosize=True, margin=dict(l=100, r=100, t=100, b=50), legend=dict( - yanchor="top", - y=0.95, - xanchor="right", - x=0.775 - ) + x=1.05, + y=1, + bgcolor="rgba(255, 255, 255, 0.5)", + orientation="v", + ), ) - fig.update_xaxes(showticklabels=False, showgrid=False, zeroline=False, row=1, col=2) - fig.update_yaxes(showticklabels=False, showgrid=False, zeroline=False, row=1, col=2) - - fig.update_annotations(font_size=12) + messages.append( + { + "level": logging.INFO, + "msg": annotation_text, + } + ) return dict( scores=scores, plots=[fig], + messages=messages, ) diff --git a/protzilla/methods/data_analysis.py b/protzilla/methods/data_analysis.py index 4a1f96d5..4907bbf6 100644 --- a/protzilla/methods/data_analysis.py +++ b/protzilla/methods/data_analysis.py @@ -876,6 +876,7 @@ class TimeSeriesADFullerTest(DataAnalysisStep): input_keys = [ "intensity_df", "metadata_df", + "time_column", "protein_group", "alpha", ] diff --git a/ui/runs/forms/data_analysis.py b/ui/runs/forms/data_analysis.py index 5f4e18e8..2b8fcf64 100644 --- a/ui/runs/forms/data_analysis.py +++ b/ui/runs/forms/data_analysis.py @@ -1370,6 +1370,7 @@ class TimeSeriesADFullerTestForm(MethodForm): choices=[], label="Intensity dataframe", ) + time_column = CustomChoiceField(choices=[], label="Time: The column name from metadata that represents time") protein_group = CustomChoiceField( choices=[], label="Protein group: which protein group to perform the ADFuller test on", @@ -1388,6 +1389,9 @@ def fill_form(self, run: Run) -> None: input_df_instance_id = self.data.get( "intensity_df", self.fields["intensity_df"].choices[0][0] ) + self.fields[ + "time_column" + ].choices = fill_helper.get_choices_for_metadata_non_sample_columns(run) self.fields["protein_group"].choices = fill_helper.to_choices( run.steps.get_step_output( step_type=Step, diff --git a/user_data/workflows/standard.yaml b/user_data/workflows/standard.yaml index 0b93bd80..52d754b7 100644 --- a/user_data/workflows/standard.yaml +++ b/user_data/workflows/standard.yaml @@ -36,20 +36,20 @@ steps: plot_inputs: { } type: OutlierDetectionByLocalOutlierFactor - form_inputs: - percentile: 0.5 + log_base: log2 inputs: { } plot_inputs: - graph_type: Boxplot + graph_type: Histogram group_by: None - visual_transformation: log10 - type: NormalisationByMedian + type: TransformationLog - form_inputs: - log_base: log2 + percentile: 0.5 inputs: { } plot_inputs: - graph_type: Histogram + graph_type: Boxplot group_by: None - type: TransformationLog + visual_transformation: log10 + type: NormalisationByMedian - form_inputs: similarity_measure: euclidean distance inputs: { } diff --git a/user_data/workflows/workflow_BA_Kuganash.yaml b/user_data/workflows/workflow_Kuganash-BA.yaml similarity index 71% rename from user_data/workflows/workflow_BA_Kuganash.yaml rename to user_data/workflows/workflow_Kuganash-BA.yaml index 1a19947c..ef2d26bb 100644 --- a/user_data/workflows/workflow_BA_Kuganash.yaml +++ b/user_data/workflows/workflow_Kuganash-BA.yaml @@ -1,18 +1,12 @@ df_mode: disk_memory steps: - form_inputs: - aggregation_method: Median - intensity_name: Intensity + aggregation_mode: Sum + intensity_name: iBAQ map_to_uniprot: false inputs: {} instance_identifier: MaxQuantImport_1 type: MaxQuantImport -- form_inputs: - intensity_name: Intensity - map_to_uniprot: false - inputs: {} - instance_identifier: EvidenceImport_1 - type: EvidenceImport - form_inputs: feature_orientation: Columns (samples in rows, features in columns) inputs: {} @@ -48,15 +42,6 @@ steps: instance_identifier: OutlierDetectionByLocalOutlierFactor_1 plot_inputs: {} type: OutlierDetectionByLocalOutlierFactor -- form_inputs: - percentile: 0.5 - inputs: {} - instance_identifier: NormalisationByMedian_1 - plot_inputs: - graph_type: Boxplot - group_by: None - visual_transformation: log10 - type: NormalisationByMedian - form_inputs: log_base: log2 inputs: {} @@ -66,24 +51,27 @@ steps: group_by: None type: TransformationLog - form_inputs: - input_df: TransformationLog_1 - protein_group: D3YYU8 - similarity: 1 - similarity_measure: euclidean distance + percentile: 0.5 inputs: {} - instance_identifier: PlotTimeSeries_1 - type: PlotTimeSeriesPeptide -- form_inputs: - input_df: TransformationLog_1 - protein_group: D3YYU8 - test_size: 0.2 + instance_identifier: NormalisationByMedian_1 + plot_inputs: + graph_type: Boxplot + group_by: None + visual_transformation: log10 + type: NormalisationByMedian +- form_inputs: {} + inputs: {} + instance_identifier: PlotTimeQuant_1 + type: PlotTimeQuant +- form_inputs: {} inputs: {} instance_identifier: TimeSeriesLinearRegression_1 type: TimeSeriesLinearRegression -- form_inputs: - input_df: TransformationLog_1 - protein_group: D3YYU8 - test_size: 0.2 +- form_inputs: {} + inputs: {} + instance_identifier: TimeSeriesADFullerTest_1 + type: TimeSeriesADFullerTest +- form_inputs: {} inputs: {} - instance_identifier: TimeSeriesRANSACRegression_1 - type: TimeSeriesRANSACRegression + instance_identifier: TimeSeriesAutoARIMA_1 + type: TimeSeriesAutoARIMA diff --git a/user_data/workflows/workflow_Plot-Thesis.yaml b/user_data/workflows/workflow_Plot-Thesis.yaml deleted file mode 100644 index a3dee9fa..00000000 --- a/user_data/workflows/workflow_Plot-Thesis.yaml +++ /dev/null @@ -1,67 +0,0 @@ -df_mode: disk_memory -steps: - - form_inputs: - intensity_name: iBAQ - map_to_uniprot: false - aggregation_mode: Sum - inputs: { } - type: MaxQuantImport - - form_inputs: {} - inputs: {} - instance_identifier: EvidenceImport_1 - type: EvidenceImport - - form_inputs: - feature_orientation: Columns (samples in rows, features in columns) - inputs: {} - instance_identifier: MetadataImport_1 - type: MetadataImport - - form_inputs: - similarity_measure: euclidean distance - inputs: {} - instance_identifier: PlotTimeSeries_1 - type: PlotTimeSeriesPeptide - - form_inputs: - percentage: 0.5 - inputs: { } - plot_inputs: - graph_type: Bar chart - type: FilterProteinsBySamplesMissing - - form_inputs: - deviation_threshold: 2.0 - inputs: { } - plot_inputs: - graph_type: Bar chart - type: FilterSamplesByProteinIntensitiesSum - - form_inputs: - number_of_neighbours: 5 - inputs: { } - plot_inputs: - graph_type: Boxplot - graph_type_quantities: Bar chart - group_by: None - visual_transformation: log10 - type: ImputationByKNN - - form_inputs: - number_of_neighbors: 20 - inputs: { } - plot_inputs: { } - type: OutlierDetectionByLocalOutlierFactor - - form_inputs: - percentile: 0.5 - inputs: { } - plot_inputs: - graph_type: Boxplot - group_by: None - visual_transformation: log10 - type: NormalisationByMedian - - form_inputs: - log_base: log2 - inputs: { } - plot_inputs: - graph_type: Histogram - group_by: None - type: TransformationLog - - form_inputs: - similarity_measure: euclidean distance - inputs: { } - type: PlotProtQuantPeptide \ No newline at end of file From 0e2044705df5a7aea2e5b2122e3c24e2aacae427 Mon Sep 17 00:00:00 2001 From: AK Date: Tue, 24 Sep 2024 14:04:52 +0200 Subject: [PATCH 50/52] Updated Test --- tests/protzilla/data_analysis/test_time_series_analysis.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/protzilla/data_analysis/test_time_series_analysis.py b/tests/protzilla/data_analysis/test_time_series_analysis.py index 20017922..7bdebbda 100644 --- a/tests/protzilla/data_analysis/test_time_series_analysis.py +++ b/tests/protzilla/data_analysis/test_time_series_analysis.py @@ -211,7 +211,7 @@ def test_ransac_regression_outputs(time_series_test_data): def test_adfuller_test(time_series_test_data): test_intensity, test_metadata = time_series_test_data - outputs = adfuller_test(test_intensity, test_metadata, "Protein1") + outputs = adfuller_test(test_intensity, test_metadata, "Time", "Protein1") assert "test_statistic" in outputs assert "p_value" in outputs From f3b00e3bfbd711858f73a6535cb26c47d6215d43 Mon Sep 17 00:00:00 2001 From: AK Date: Fri, 27 Sep 2024 13:50:12 +0200 Subject: [PATCH 51/52] Removed unwanted lines --- protzilla/data_analysis/time_series_plots.py | 9 --------- .../data_analysis/time_series_regression_analysis.py | 2 -- 2 files changed, 11 deletions(-) diff --git a/protzilla/data_analysis/time_series_plots.py b/protzilla/data_analysis/time_series_plots.py index 3c5f8059..37c8ad34 100644 --- a/protzilla/data_analysis/time_series_plots.py +++ b/protzilla/data_analysis/time_series_plots.py @@ -150,15 +150,6 @@ def time_quant_plot( line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[4]), ) ) - fig.add_trace( - go.Scatter( - x=[None], - y=[None], - mode="markers", - marker=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[0]), - name="Intensity", - ) - ) fig.update_layout( title=f"Time Series of {formatted_protein_name} in all samples", plot_bgcolor=colors["plot_bgcolor"], diff --git a/protzilla/data_analysis/time_series_regression_analysis.py b/protzilla/data_analysis/time_series_regression_analysis.py index b8aebce3..898f82f9 100644 --- a/protzilla/data_analysis/time_series_regression_analysis.py +++ b/protzilla/data_analysis/time_series_regression_analysis.py @@ -4,7 +4,6 @@ import pandas as pd import plotly.graph_objects as go -#from protzilla.data_analysis.time_series_helper import convert_time_to_hours from protzilla.utilities import default_intensity_column from protzilla.constants.colors import PROTZILLA_DISCRETE_COLOR_SEQUENCE @@ -14,7 +13,6 @@ from statsmodels.tsa.arima.model import ARIMA from statsmodels.tsa.stattools import adfuller from pmdarima import auto_arima -from plotly.subplots import make_subplots colors = { "plot_bgcolor": "white", From f8bab05baa3f559a7e5c6b2485496751cbf1dd1a Mon Sep 17 00:00:00 2001 From: AK Date: Fri, 27 Sep 2024 13:54:08 +0200 Subject: [PATCH 52/52] Fixed Tests --- tests/protzilla/test_runner.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/protzilla/test_runner.py b/tests/protzilla/test_runner.py index b5de3148..18080f48 100644 --- a/tests/protzilla/test_runner.py +++ b/tests/protzilla/test_runner.py @@ -94,8 +94,8 @@ def test_runner_imports( 'FilterSamplesByProteinIntensitiesSum', 'ImputationByKNN', 'OutlierDetectionByLocalOutlierFactor', - 'NormalisationByMedian', 'TransformationLog', + 'NormalisationByMedian', 'PlotProtQuant', 'DifferentialExpressionTTest', 'PlotVolcano', @@ -109,8 +109,8 @@ def test_runner_imports( call({'deviation_threshold': 2.0}), call({'number_of_neighbours': 5}), call({'number_of_neighbors': 20}), - call({'percentile': 0.5}), call({'log_base': 'log2'}), + call({'percentile': 0.5}), call({'similarity_measure': 'euclidean distance'}), call({'alpha': 0.05}), call({'fc_threshold': 1}),