From ae8a7b3e34d84baf46274f60a4ccfef3d470692a Mon Sep 17 00:00:00 2001
From: AK <kuganash.ravishanker@student.hpi.de>
Date: Fri, 21 Jun 2024 16:56:19 +0200
Subject: [PATCH 01/52] Fixed import for circadian mouse data

---
 protzilla/importing/ms_data_import.py         |  8 +-
 protzilla/importing/peptide_import.py         |  1 +
 user_data/workflows/workflow_Plot-Thesis.yaml | 94 +++++++++++++++++++
 3 files changed, 102 insertions(+), 1 deletion(-)
 create mode 100644 user_data/workflows/workflow_Plot-Thesis.yaml

diff --git a/protzilla/importing/ms_data_import.py b/protzilla/importing/ms_data_import.py
index d102d7ca..e8cf2df6 100644
--- a/protzilla/importing/ms_data_import.py
+++ b/protzilla/importing/ms_data_import.py
@@ -191,9 +191,15 @@ def clean_protein_groups(protein_groups, map_to_uniprot=True):
     found_ids_per_group = []
     # go through all groups and find the valid proteins
     # non uniprot ids are put into extracted_ids, so they can be mapped
+    extract_protein_id_regex = re.compile(r'\|([^|]+)\|')
+
+    # Function to extract protein IDs from the formatted string
+    def extract_protein_ids(protein_group_str):
+        return extract_protein_id_regex.findall(protein_group_str)
+
     for group in protein_groups:
         found_in_group = set()
-        for protein_id in group.split(";"):
+        for protein_id in extract_protein_ids(group) or group.split(";"):
             if not protein_id.startswith("ENSP") and (
                 match := uniprot_regex.search(protein_id)
             ):
diff --git a/protzilla/importing/peptide_import.py b/protzilla/importing/peptide_import.py
index 2a393fbd..b1c1e96f 100644
--- a/protzilla/importing/peptide_import.py
+++ b/protzilla/importing/peptide_import.py
@@ -87,6 +87,7 @@ def evidence_import(file_path, intensity_name, map_to_uniprot) -> dict:
         "Missed cleavages",
         "PEP",
         "Raw file",
+        "Retention time",
     ]
 
     read = pd.read_csv(
diff --git a/user_data/workflows/workflow_Plot-Thesis.yaml b/user_data/workflows/workflow_Plot-Thesis.yaml
new file mode 100644
index 00000000..0bdccd0f
--- /dev/null
+++ b/user_data/workflows/workflow_Plot-Thesis.yaml
@@ -0,0 +1,94 @@
+df_mode: disk_memory
+steps:
+- form_inputs:
+    feature_orientation: Columns (samples in rows, features in columns)
+  inputs: {}
+  instance_identifier: MetadataImport_1
+  type: MetadataImport
+- form_inputs: {}
+  inputs: {}
+  instance_identifier: EvidenceImport_1
+  type: EvidenceImport
+- form_inputs:
+    percentage: 0.5
+  inputs: {}
+  instance_identifier: FilterProteinsBySamplesMissing_1
+  plot_inputs:
+    graph_type: Bar chart
+  type: FilterProteinsBySamplesMissing
+- form_inputs:
+    deviation_threshold: 2.0
+  inputs: {}
+  instance_identifier: FilterSamplesByProteinIntensitiesSum_1
+  plot_inputs:
+    graph_type: Bar chart
+  type: FilterSamplesByProteinIntensitiesSum
+- form_inputs:
+    number_of_neighbours: 5
+  inputs: {}
+  instance_identifier: ImputationByKNN_1
+  plot_inputs:
+    graph_type: Boxplot
+    graph_type_quantities: Bar chart
+    group_by: None
+    visual_transformation: log10
+  type: ImputationByKNN
+- form_inputs:
+    number_of_neighbors: 20
+  inputs: {}
+  instance_identifier: OutlierDetectionByLocalOutlierFactor_1
+  plot_inputs: {}
+  type: OutlierDetectionByLocalOutlierFactor
+- form_inputs:
+    percentile: 0.5
+  inputs: {}
+  instance_identifier: NormalisationByMedian_1
+  plot_inputs:
+    graph_type: Boxplot
+    group_by: None
+    visual_transformation: log10
+  type: NormalisationByMedian
+- form_inputs:
+    log_base: log2
+  inputs: {}
+  instance_identifier: TransformationLog_1
+  plot_inputs:
+    graph_type: Histogram
+    group_by: None
+  type: TransformationLog
+- form_inputs:
+    similarity_measure: euclidean distance
+  inputs: {}
+  instance_identifier: PlotProtQuant_1
+  type: PlotProtQuant
+- form_inputs:
+    alpha: 0.05
+  inputs: {}
+  instance_identifier: DifferentialExpressionTTest_1
+  type: DifferentialExpressionTTest
+- form_inputs:
+    fc_threshold: 1
+  inputs: {}
+  instance_identifier: PlotVolcano_1
+  type: PlotVolcano
+- form_inputs:
+    differential_expression_threshold: 1
+    direction: both
+    gene_sets_restring: []
+    organism: 9606
+  inputs: {}
+  instance_identifier: EnrichmentAnalysisGOAnalysisWithString_1
+  type: EnrichmentAnalysisGOAnalysisWithString
+- form_inputs:
+    colors: []
+    cutoff: 0.05
+    gene_sets:
+    - Process
+    - Component
+    - Function
+    - KEGG
+    top_terms: 10
+    value: p-value
+  inputs: {}
+  instance_identifier: PlotGOEnrichmentBarPlot_1
+  type: PlotGOEnrichmentBarPlot

From 53d6bafe458e43cb8f33dcbe40529058057c6b34 Mon Sep 17 00:00:00 2001
From: AK <kuganash.ravishanker@student.hpi.de>
Date: Tue, 2 Jul 2024 11:10:34 +0200
Subject: [PATCH 02/52] Plotquantplot for peptide

---
 .../data_preprocessing/peptide_filter.py      | 64 ++++++++++++++
 protzilla/methods/data_analysis.py            | 20 +++++
 protzilla/methods/data_preprocessing.py       |  4 +-
 ui/runs/form_mapping.py                       |  1 +
 ui/runs/forms/data_analysis.py                | 69 ++++++---------
 ui/runs/forms/data_preprocessing.py           |  1 -
 user_data/workflows/workflow_Plot-Thesis.yaml | 86 +------------------
 7 files changed, 115 insertions(+), 130 deletions(-)

diff --git a/protzilla/data_preprocessing/peptide_filter.py b/protzilla/data_preprocessing/peptide_filter.py
index 3b1caee9..67745d15 100644
--- a/protzilla/data_preprocessing/peptide_filter.py
+++ b/protzilla/data_preprocessing/peptide_filter.py
@@ -50,3 +50,67 @@ def by_pep_value_plot(method_inputs, method_outputs, graph_type):
     elif graph_type == "Bar chart":
         fig = create_bar_plot(**value_dict)
     return [fig]
+
+def by_samples_missing(
+    protein_df: pd.DataFrame | None,
+    peptide_df: pd.DataFrame | None,
+    percentage: float = 0.5,
+) -> dict:
+    """
+    This function filters proteins based on the amount of samples with nan values, if the percentage of nan values
+    is below a threshold (percentage).
+
+    :param protein_df: the protein dataframe that should be filtered
+    :param peptide_df: the peptide dataframe that should be filtered in accordance to the intensity dataframe (optional)
+    :param percentage: ranging from 0 to 1. Defining the relative share of samples the proteins need to be present in,
+        in order for the protein to be kept.
+    :return: returns the filtered df as a Dataframe and a dict with a list of Protein IDs that were discarded
+        and a list of Protein IDs that were kept
+    """
+
+    filter_threshold: int = percentage * len(protein_df.Sample.unique())
+    transformed_df = long_to_wide(protein_df)
+
+    remaining_proteins_list = transformed_df.dropna(
+        axis=1, thresh=filter_threshold
+    ).columns.tolist()
+    filtered_proteins_list = (
+        transformed_df.drop(remaining_proteins_list, axis=1).columns.unique().tolist()
+    )
+    filtered_df = protein_df[
+        (protein_df["Protein ID"].isin(remaining_proteins_list))
+    ]
+    filtered_peptide_df = None
+    if peptide_df is not None:
+        filtered_peptide_df = peptide_df[
+            (peptide_df["Protein ID"].isin(remaining_proteins_list))
+        ]
+    return dict(
+        protein_df=filtered_df,
+        peptide_df=filtered_peptide_df,
+        filtered_proteins=filtered_proteins_list,
+        remaining_proteins=remaining_proteins_list,
+    )
+
+
+def _build_pie_bar_plot(remaining_proteins, filtered_proteins, graph_type):
+    if graph_type == "Pie chart":
+        fig = create_pie_plot(
+            values_of_sectors=[
+                len(remaining_proteins),
+                len(filtered_proteins),
+            ],
+            names_of_sectors=["Proteins kept", "Proteins filtered"],
+            heading="Number of Filtered Proteins",
+        )
+    elif graph_type == "Bar chart":
+        fig = create_bar_plot(
+            values_of_sectors=[
+                len(remaining_proteins),
+                len(filtered_proteins),
+            ],
+            names_of_sectors=["Proteins kept", "Proteins filtered"],
+            heading="Number of Filtered Proteins",
+            y_title="Number of Proteins",
+        )
+    return [fig]
\ No newline at end of file
diff --git a/protzilla/methods/data_analysis.py b/protzilla/methods/data_analysis.py
index 79982ea2..c7f110e3 100644
--- a/protzilla/methods/data_analysis.py
+++ b/protzilla/methods/data_analysis.py
@@ -18,6 +18,7 @@
     prot_quant_plot,
     scatter_plot,
 )
+from protzilla.data_analysis.prot_quant_plot_peptide import prot_quant_plot_peptide
 from protzilla.data_analysis.protein_graphs import peptides_to_isoform, variation_graph
 from protzilla.methods.data_preprocessing import TransformationLog
 from protzilla.steps import Plots, Step, StepManager
@@ -252,6 +253,25 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict:
         )
         return inputs
 
+class PlotProtQuantPeptide(PlotStep):
+    display_name = "Protein Quantification Plot For Peptide"
+    operation = "plot"
+    method_description = (
+        "Creates a line chart for intensity across samples for protein groups"
+    )
+
+    input_keys = ["input_df", "protein_group", "similarity_measure", "similarity"]
+    output_keys = []
+
+    def method(self, inputs: dict) -> dict:
+        return prot_quant_plot_peptide(**inputs)
+
+    def insert_dataframes(self, steps: StepManager, inputs) -> dict:
+        inputs["input_df"] = steps.get_step_output(
+            Step, "peptide_df", inputs["input_df"]
+        )
+        return inputs
+
 
 class PlotPrecisionRecallCurve(PlotStep):
     display_name = "Precision Recall"
diff --git a/protzilla/methods/data_preprocessing.py b/protzilla/methods/data_preprocessing.py
index 0565eaf0..50373899 100644
--- a/protzilla/methods/data_preprocessing.py
+++ b/protzilla/methods/data_preprocessing.py
@@ -329,8 +329,8 @@ class FilterPeptidesByPEPThreshold(DataPreprocessingStep):
     operation = "filter_peptides"
     method_description = "Filter by PEP-threshold"
 
-    input_keys = ["protein_df", "peptide_df", "threshold"]
-    output_keys = ["protein_df", "peptide_df", "filtered_peptides"]
+    input_keys = ["peptide_df", "threshold"]
+    output_keys = ["peptide_df", "filtered_peptides"]
 
     def method(self, inputs):
         return peptide_filter.by_pep_value(**inputs)
diff --git a/ui/runs/form_mapping.py b/ui/runs/form_mapping.py
index ab27a219..dcfc6306 100644
--- a/ui/runs/form_mapping.py
+++ b/ui/runs/form_mapping.py
@@ -47,6 +47,7 @@
     data_analysis.PlotScatterPlot: data_analysis_forms.PlotScatterPlotForm,
     data_analysis.PlotClustergram: data_analysis_forms.PlotClustergramForm,
     data_analysis.PlotProtQuant: data_analysis_forms.PlotProtQuantForm,
+    data_analysis.PlotProtQuantPeptide: data_analysis_forms.PlotProtQuantPeptideForm,
     data_analysis.PlotPrecisionRecallCurve: data_analysis_forms.PlotPrecisionRecallCurveForm,
     data_analysis.PlotROC: data_analysis_forms.PlotROCCurveForm,
     data_analysis.ClusteringKMeans: data_analysis_forms.ClusteringKMeansForm,
diff --git a/ui/runs/forms/data_analysis.py b/ui/runs/forms/data_analysis.py
index 09c6a73c..de7c3fa4 100644
--- a/ui/runs/forms/data_analysis.py
+++ b/ui/runs/forms/data_analysis.py
@@ -368,56 +368,35 @@ class PlotProtQuantForm(MethodForm):
         label="Similarity", min_value=-1, max_value=999, step_size=1, initial=1
     )
 
+class PlotProtQuantPeptideForm(MethodForm):
+    is_dynamic = True
+
+    input_df = CustomChoiceField(
+        choices=[],
+        label="Choose dataframe to be plotted",
+    )
+    protein_group = CustomChoiceField(
+        choices=[],
+        label="Protein group: choose highlighted protein group",
+    )
+    similarity_measure = CustomChoiceField(
+        choices=SimilarityMeasure,
+        label="Similarity Measurement: choose how to compare protein groups",
+        initial=SimilarityMeasure.euclidean_distance,
+    )
+    similarity = CustomNumberField(
+        label="Similarity", min_value=-1, max_value=999, step_size=1, initial=1
+    )
+
     def fill_form(self, run: Run) -> None:
-        self.fields["input_df"].choices = fill_helper.get_choices_for_protein_df_steps(
-            run
-        )
+        self.fields["input_df"].choices = fill_helper.get_choices(run, "peptide_df")
 
-        input_df_instance_id = self.data.get(
-            "input_df", self.fields["input_df"].choices[0][0]
-        )
 
         self.fields["protein_group"].choices = fill_helper.to_choices(
-            run.steps.get_step_output(
-                step_type=Step,
-                output_key="protein_df",
-                instance_identifier=input_df_instance_id,
-            )["Protein ID"].unique()
-        )
-
-        similarity_measure = self.data.get(
-            "similarity_measure", self.fields["similarity_measure"].choices[0][0]
+            run.steps.get_step_output(Step, "peptide_df")[
+                "Protein ID"
+            ].unique()
         )
-        self.data = self.data.copy()
-        if similarity_measure == SimilarityMeasure.cosine_similarity:
-            self.fields["similarity"] = CustomFloatField(
-                label="Cosine Similarity",
-                min_value=-1,
-                max_value=1,
-                step_size=0.1,
-                initial=0,
-            )
-            if (
-                    "similarity" not in self.data
-                    or float(self.data["similarity"]) < -1
-                    or float(self.data["similarity"]) > 1
-            ):
-                self.data["similarity"] = 0
-        else:
-            self.fields["similarity"] = CustomNumberField(
-                label="Euclidean Distance",
-                min_value=0,
-                max_value=999,
-                step_size=1,
-                initial=1,
-            )
-            if (
-                    "similarity" not in self.data
-                    or float(self.data["similarity"]) < 0
-                    or float(self.data["similarity"]) > 999
-            ):
-                self.data["similarity"] = 1
-
 
 class PlotPrecisionRecallCurveForm(MethodForm):
     # Todo: Input
diff --git a/ui/runs/forms/data_preprocessing.py b/ui/runs/forms/data_preprocessing.py
index 08590b79..40ca2f78 100644
--- a/ui/runs/forms/data_preprocessing.py
+++ b/ui/runs/forms/data_preprocessing.py
@@ -469,7 +469,6 @@ class FilterPeptidesByPEPThresholdForm(MethodForm):
     threshold = CustomFloatField(
         label="Threshold value for PEP", min_value=0, initial=0
     )
-    peptide_df = CustomChoiceField(choices=EmptyEnum, label="peptide_df")
 
 
 class FilterPeptidesByPEPThresholdPlotForm(MethodForm):
diff --git a/user_data/workflows/workflow_Plot-Thesis.yaml b/user_data/workflows/workflow_Plot-Thesis.yaml
index 0bdccd0f..474c0839 100644
--- a/user_data/workflows/workflow_Plot-Thesis.yaml
+++ b/user_data/workflows/workflow_Plot-Thesis.yaml
@@ -1,94 +1,16 @@
 df_mode: disk_memory
 steps:
-- form_inputs:
-    feature_orientation: Columns (samples in rows, features in columns)
-  inputs: {}
-  instance_identifier: MetadataImport_1
-  type: MetadataImport
 - form_inputs: {}
   inputs: {}
   instance_identifier: EvidenceImport_1
   type: EvidenceImport
 - form_inputs:
-    percentage: 0.5
-  inputs: {}
-  instance_identifier: FilterProteinsBySamplesMissing_1
-  plot_inputs:
-    graph_type: Bar chart
-  type: FilterProteinsBySamplesMissing
-- form_inputs:
-    deviation_threshold: 2.0
-  inputs: {}
-  instance_identifier: FilterSamplesByProteinIntensitiesSum_1
-  plot_inputs:
-    graph_type: Bar chart
-  type: FilterSamplesByProteinIntensitiesSum
-- form_inputs:
-    number_of_neighbours: 5
-  inputs: {}
-  instance_identifier: ImputationByKNN_1
-  plot_inputs:
-    graph_type: Boxplot
-    graph_type_quantities: Bar chart
-    group_by: None
-    visual_transformation: log10
-  type: ImputationByKNN
-- form_inputs:
-    number_of_neighbors: 20
-  inputs: {}
-  instance_identifier: OutlierDetectionByLocalOutlierFactor_1
-  plot_inputs: {}
-  type: OutlierDetectionByLocalOutlierFactor
-- form_inputs:
-    percentile: 0.5
-  inputs: {}
-  instance_identifier: NormalisationByMedian_1
-  plot_inputs:
-    graph_type: Boxplot
-    group_by: None
-    visual_transformation: log10
-  type: NormalisationByMedian
-- form_inputs:
-    log_base: log2
+    feature_orientation: Columns (samples in rows, features in columns)
   inputs: {}
-  instance_identifier: TransformationLog_1
-  plot_inputs:
-    graph_type: Histogram
-    group_by: None
-  type: TransformationLog
+  instance_identifier: MetadataImport_1
+  type: MetadataImport
 - form_inputs:
     similarity_measure: euclidean distance
   inputs: {}
   instance_identifier: PlotProtQuant_1
-  type: PlotProtQuant
-- form_inputs:
-    alpha: 0.05
-  inputs: {}
-  instance_identifier: DifferentialExpressionTTest_1
-  type: DifferentialExpressionTTest
-- form_inputs:
-    fc_threshold: 1
-  inputs: {}
-  instance_identifier: PlotVolcano_1
-  type: PlotVolcano
-- form_inputs:
-    differential_expression_threshold: 1
-    direction: both
-    gene_sets_restring: []
-    organism: 9606
-  inputs: {}
-  instance_identifier: EnrichmentAnalysisGOAnalysisWithString_1
-  type: EnrichmentAnalysisGOAnalysisWithString
-- form_inputs:
-    colors: []
-    cutoff: 0.05
-    gene_sets:
-    - Process
-    - Component
-    - Function
-    - KEGG
-    top_terms: 10
-    value: p-value
-  inputs: {}
-  instance_identifier: PlotGOEnrichmentBarPlot_1
-  type: PlotGOEnrichmentBarPlot
+  type: PlotProtQuantPeptide
\ No newline at end of file

From 786da450da5fe037475fb632de0c926f5cb787d2 Mon Sep 17 00:00:00 2001
From: AK <kuganash.ravishanker@student.hpi.de>
Date: Wed, 10 Jul 2024 08:22:48 +0200
Subject: [PATCH 03/52] Plotquantplot for peptide

---
 protzilla/methods/data_analysis.py            |  1 +
 ui/runs/forms/data_analysis.py                | 48 ++++++++++-
 ui/runs/forms/fill_helper.py                  |  4 +
 user_data/workflows/workflow_Plot-Thesis.yaml | 79 +++++++++++++++----
 4 files changed, 114 insertions(+), 18 deletions(-)

diff --git a/protzilla/methods/data_analysis.py b/protzilla/methods/data_analysis.py
index 8407f4c0..5fffaa67 100644
--- a/protzilla/methods/data_analysis.py
+++ b/protzilla/methods/data_analysis.py
@@ -339,6 +339,7 @@ class PlotProtQuantPeptide(PlotStep):
     def method(self, inputs: dict) -> dict:
         return prot_quant_plot_peptide(**inputs)
 
+
     def insert_dataframes(self, steps: StepManager, inputs) -> dict:
         inputs["input_df"] = steps.get_step_output(
             Step, "peptide_df", inputs["input_df"]
diff --git a/ui/runs/forms/data_analysis.py b/ui/runs/forms/data_analysis.py
index 5a824a08..30845d74 100644
--- a/ui/runs/forms/data_analysis.py
+++ b/ui/runs/forms/data_analysis.py
@@ -535,15 +535,55 @@ class PlotProtQuantPeptideForm(MethodForm):
     )
 
     def fill_form(self, run: Run) -> None:
-        self.fields["input_df"].choices = fill_helper.get_choices(run, "peptide_df")
+        self.fields["input_df"].choices = fill_helper.get_choices_for_peptide_df_steps(
+            run
+        )
 
+        input_df_instance_id = self.data.get(
+            "input_df", self.fields["input_df"].choices[0][0]
+        )
 
         self.fields["protein_group"].choices = fill_helper.to_choices(
-            run.steps.get_step_output(Step, "peptide_df")[
-                "Protein ID"
-            ].unique()
+            run.steps.get_step_output(
+                step_type=Step,
+                output_key="peptide_df",
+                instance_identifier=input_df_instance_id,
+            )["Protein ID"].unique()
         )
 
+        similarity_measure = self.data.get(
+            "similarity_measure", self.fields["similarity_measure"].choices[0][0]
+        )
+        self.data = self.data.copy()
+        if similarity_measure == SimilarityMeasure.cosine_similarity:
+            self.fields["similarity"] = CustomFloatField(
+                label="Cosine Similarity",
+                min_value=-1,
+                max_value=1,
+                step_size=0.1,
+                initial=0,
+            )
+            if (
+                "similarity" not in self.data
+                or float(self.data["similarity"]) < -1
+                or float(self.data["similarity"]) > 1
+            ):
+                self.data["similarity"] = 0
+        else:
+            self.fields["similarity"] = CustomNumberField(
+                label="Euclidean Distance",
+                min_value=0,
+                max_value=999,
+                step_size=1,
+                initial=1,
+            )
+            if (
+                "similarity" not in self.data
+                or float(self.data["similarity"]) < 0
+                or float(self.data["similarity"]) > 999
+            ):
+                self.data["similarity"] = 1
+
 
 class PlotPrecisionRecallCurveForm(MethodForm):
     # Todo: Input
diff --git a/ui/runs/forms/fill_helper.py b/ui/runs/forms/fill_helper.py
index 0b416f8f..1d0ea050 100644
--- a/ui/runs/forms/fill_helper.py
+++ b/ui/runs/forms/fill_helper.py
@@ -14,6 +14,10 @@ def get_choices_for_protein_df_steps(run: Run) -> list[tuple[str, str]]:
     return reversed(to_choices(run.steps.get_instance_identifiers(Step, "protein_df")))
 
 
+def get_choices_for_peptide_df_steps(run: Run) -> list[tuple[str, str]]:
+    return list(reversed(to_choices(run.steps.get_instance_identifiers(Step, "peptide_df"))))  # list, as annotated
+
+
 def get_choices(
     run: Run, output_key: str, step_type: type[Step] = Step
 ) -> list[tuple[str, str]]:
diff --git a/user_data/workflows/workflow_Plot-Thesis.yaml b/user_data/workflows/workflow_Plot-Thesis.yaml
index 474c0839..1758d861 100644
--- a/user_data/workflows/workflow_Plot-Thesis.yaml
+++ b/user_data/workflows/workflow_Plot-Thesis.yaml
@@ -1,16 +1,67 @@
 df_mode: disk_memory
 steps:
-- form_inputs: {}
-  inputs: {}
-  instance_identifier: EvidenceImport_1
-  type: EvidenceImport
-- form_inputs:
-    feature_orientation: Columns (samples in rows, features in columns)
-  inputs: {}
-  instance_identifier: MetadataImport_1
-  type: MetadataImport
-- form_inputs:
-    similarity_measure: euclidean distance
-  inputs: {}
-  instance_identifier: PlotProtQuant_1
-  type: PlotProtQuantPeptide
\ No newline at end of file
+  - form_inputs:
+      intensity_name: iBAQ
+      map_to_uniprot: false
+      aggregation_mode: Sum
+    inputs: { }
+    type: MaxQuantImport
+  - form_inputs: {}
+    inputs: {}
+    instance_identifier: EvidenceImport_1
+    type: EvidenceImport
+  - form_inputs:
+      feature_orientation: Columns (samples in rows, features in columns)
+    inputs: {}
+    instance_identifier: MetadataImport_1
+    type: MetadataImport
+  - form_inputs:
+      similarity_measure: euclidean distance
+    inputs: {}
+    instance_identifier: PlotProtQuant_1
+    type: PlotProtQuantPeptide
+  - form_inputs:
+      percentage: 0.5
+    inputs: { }
+    plot_inputs:
+      graph_type: Bar chart
+    type: FilterProteinsBySamplesMissing
+  - form_inputs:
+      deviation_threshold: 2.0
+    inputs: { }
+    plot_inputs:
+      graph_type: Bar chart
+    type: FilterSamplesByProteinIntensitiesSum
+  - form_inputs:
+      number_of_neighbours: 5
+    inputs: { }
+    plot_inputs:
+      graph_type: Boxplot
+      graph_type_quantities: Bar chart
+      group_by: None
+      visual_transformation: log10
+    type: ImputationByKNN
+  - form_inputs:
+      number_of_neighbors: 20
+    inputs: { }
+    plot_inputs: { }
+    type: OutlierDetectionByLocalOutlierFactor
+  - form_inputs:
+      percentile: 0.5
+    inputs: { }
+    plot_inputs:
+      graph_type: Boxplot
+      group_by: None
+      visual_transformation: log10
+    type: NormalisationByMedian
+  - form_inputs:
+      log_base: log2
+    inputs: { }
+    plot_inputs:
+      graph_type: Histogram
+      group_by: None
+    type: TransformationLog
+  - form_inputs:
+      similarity_measure: euclidean distance
+    inputs: { }
+    type: PlotProtQuantPeptide
\ No newline at end of file

From dc9fc74e317abea886b33d8e4191bbd5156ba079 Mon Sep 17 00:00:00 2001
From: AK <kuganash.ravishanker@student.hpi.de>
Date: Wed, 10 Jul 2024 11:32:13 +0200
Subject: [PATCH 04/52] updated transform_dfs.py so that it supports peptide
 DFs

---
 protzilla/utilities/transform_dfs.py | 34 ++++++++++++++++++++--------
 1 file changed, 25 insertions(+), 9 deletions(-)

diff --git a/protzilla/utilities/transform_dfs.py b/protzilla/utilities/transform_dfs.py
index f3605b08..aa2bc6c0 100644
--- a/protzilla/utilities/transform_dfs.py
+++ b/protzilla/utilities/transform_dfs.py
@@ -17,10 +17,17 @@ def long_to_wide(intensity_df: pd.DataFrame, value_name: str = None):
         packages such as sklearn
     :rtype: pd.DataFrame
     """
+    # Peptide-level frames may hold several rows per (Sample, Protein ID); pivot needs unique keys
+    if intensity_df.duplicated(subset=["Sample", "Protein ID"]).any():
+        keys = ["Sample", "Protein ID"]
+        intensity_df = intensity_df.groupby(keys, as_index=False).mean(numeric_only=True)
+
     values_name = default_intensity_column(intensity_df) if value_name is None else value_name
-    return pd.pivot(
+    intensity_df = pd.pivot(
         intensity_df, index="Sample", columns="Protein ID", values=values_name
     )
+    # Missing values stay NaN: imputing here would hide them from the filtering/imputation steps
+    return intensity_df
 
 
 def wide_to_long(wide_df: pd.DataFrame, original_long_df: pd.DataFrame):
@@ -40,26 +47,35 @@ def wide_to_long(wide_df: pd.DataFrame, original_long_df: pd.DataFrame):
     """
     # Read out info from original dataframe
     intensity_name = default_intensity_column(original_long_df)
-    gene_info = original_long_df["Gene"]
-    # Turn the wide format into the long format
-    intensity_df = pd.melt(
-        wide_df.reset_index(),
+
+    # Metadata columns carried over from the original long dataframe when present
+    additional_columns = ['Gene', 'Modification', 'Retention time']
+    existing_additional_columns = [col for col in additional_columns if col in original_long_df.columns]
+
+    # Melt the wide format back to long format
+    melted_df = pd.melt(
+        wide_df if "Sample" in wide_df.columns else wide_df.reset_index(),
         id_vars="Sample",
         var_name="Protein ID",
         value_name=intensity_name,
     )
-    intensity_df.sort_values(
+    melted_df.sort_values(
         by=["Sample", "Protein ID"],
         ignore_index=True,
         inplace=True,
     )
-    intensity_df.insert(2, "Gene", gene_info)
 
-    return intensity_df
+    # "Gene" returns to its original slot after "Protein ID"; other columns are appended
+    for col in existing_additional_columns:
+        melted_df.insert(2 if col == "Gene" else melted_df.shape[1], col, original_long_df[col])
+
+    return melted_df
 
 
 def is_long_format(df: pd.DataFrame):
-    return set(df.columns[:3]) == {"Sample", "Protein ID", "Gene"}
+    required_columns = {"Sample", "Protein ID"}
+    additional_columns = {"Gene", "Retention time"}
+    return required_columns.issubset(df.columns) and any(col in df.columns for col in additional_columns)
 
 
 def is_intensity_df(df: pd.DataFrame):

From 286023daf8b5fe1fe25dcf4334f7d37f539f7b5d Mon Sep 17 00:00:00 2001
From: AK <kuganash.ravishanker@student.hpi.de>
Date: Wed, 10 Jul 2024 16:55:25 +0200
Subject: [PATCH 05/52] Add prot_quant_plot_peptide.py for peptide-level
 quantification plots

---
 .../data_analysis/prot_quant_plot_peptide.py  | 194 ++++++++++++++++++
 1 file changed, 194 insertions(+)
 create mode 100644 protzilla/data_analysis/prot_quant_plot_peptide.py

diff --git a/protzilla/data_analysis/prot_quant_plot_peptide.py b/protzilla/data_analysis/prot_quant_plot_peptide.py
new file mode 100644
index 00000000..eeefbcfc
--- /dev/null
+++ b/protzilla/data_analysis/prot_quant_plot_peptide.py
@@ -0,0 +1,194 @@
+import pandas as pd
+import plotly.graph_objects as go
+from scipy import stats
+from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
+
+from protzilla.utilities.transform_dfs import is_long_format, long_to_wide_retention_time
+
+# Define color constants
+PROTZILLA_DISCRETE_COLOR_SEQUENCE = ["#636EFA", "#EF553B", "#00CC96", "#AB63FA", "#19D3F3", "#E763FA", "#FECB52", "#FFA15A", "#FF6692", "#B6E880"]
+colors = {
+    "plot_bgcolor": "white",
+    "gridcolor": "#F1F1F1",
+    "linecolor": "#F1F1F1",
+    "annotation_text_color": "#ffffff",
+    "annotation_proteins_of_interest": "#4A536A",
+}
+
+def prot_quant_plot_peptide(
+    input_df: pd.DataFrame,
+    protein_group: str,
+    similarity: float = 1.0,
+    similarity_measure: str = "euclidean distance",
+) -> dict:
+    """
+    A function to create a graph visualising protein quantifications across all samples
+    as a line diagram using retention time and intensity. It's possible to select one proteingroup
+    that will be displayed in orange and choose a similarity measurement with a similarity score
+    to get all proteingroups that are similar displayed in another color in this line diagram.
+    All other proteingroups are displayed in the background as a grey polygon.
+
+    :param input_df: A dataframe in protzilla wide format, where each row
+        represents a sample and each column represents a feature.
+    :param protein_group: Protein IDs as the columnheader of the dataframe
+    :param similarity_measure: method to compare the chosen proteingroup with all others. The two
+        methods are "cosine similarity" and "euclidean distance".
+    :param similarity: similarity score of the chosen similarity measurement method.
+
+    :return: returns a dictionary containing a list with a plotly figure and/or a list of messages
+    """
+    # Ensure the dataframe includes retention time
+    if 'Retention time' not in input_df.columns:
+        raise ValueError("The input dataframe must include a 'Retention time' column.")
+
+    wide_df = input_df.interpolate(method='linear', axis=0)
+    wide_df = long_to_wide_retention_time(wide_df) if is_long_format(wide_df) else  wide_df
+
+    if protein_group not in wide_df.columns:
+        raise ValueError("Please select a valid protein group.")
+    elif similarity_measure == "euclidean distance" and similarity < 0:
+        raise ValueError(
+            "Similarity for euclidean distance should be greater than or equal to 0."
+        )
+    elif similarity_measure == "cosine similarity" and (
+            similarity < -1 or similarity > 1
+    ):
+        raise ValueError("Similarity for cosine similarity should be between -1 and 1")
+
+    fig = go.Figure()
+
+    color_mapping = {
+        "A": PROTZILLA_DISCRETE_COLOR_SEQUENCE[0],
+        "C": PROTZILLA_DISCRETE_COLOR_SEQUENCE[1],
+    }
+
+    lower_upper_x = []
+    lower_upper_y = []
+
+    lower_upper_x.append(wide_df['Retention time'].iloc[0])
+    lower_upper_y.append(wide_df.iloc[0].min())
+
+    for index, row in wide_df.iterrows():
+        lower_upper_x.append(row['Retention time'])
+        lower_upper_y.append(row.max())
+
+    for index, row in reversed(list(wide_df.iterrows())):
+        lower_upper_x.append(row['Retention time'])
+        lower_upper_y.append(row.min())
+
+    fig.add_trace(
+        go.Scatter(
+            x=lower_upper_x,
+            y=lower_upper_y,
+            fill="toself",
+            name="Intensity Range",
+            line=dict(color="silver"),
+        )
+    )
+
+    similar_groups = []
+    for group_to_compare in wide_df.columns:
+        if group_to_compare not in ['Retention time', protein_group]:
+            if similarity_measure == "euclidean distance":
+                distance = euclidean_distances(
+                    stats.zscore(wide_df[protein_group]).values.reshape(1, -1),
+                    stats.zscore(wide_df[group_to_compare]).values.reshape(1, -1),
+                )[0][0]
+            else:
+                distance = cosine_similarity(
+                    stats.zscore(wide_df[protein_group]).values.reshape(1, -1),
+                    stats.zscore(wide_df[group_to_compare]).values.reshape(1, -1),
+                )[0][0]
+            if similarity_measure == "euclidean distance":
+                if distance <= similarity:
+                    similar_groups.append(group_to_compare)
+            else:
+                if distance >= similarity:
+                    similar_groups.append(group_to_compare)
+
+    for group in similar_groups:
+        fig.add_trace(
+            go.Scatter(
+                x=wide_df['Retention time'],
+                y=wide_df[group],
+                mode="lines",
+                name=group[:15] + "..." if len(group) > 15 else group,
+                line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[1]),
+                showlegend=len(similar_groups) <= 7,
+            )
+        )
+
+    if len(similar_groups) > 7:
+        fig.add_trace(
+            go.Scatter(
+                x=[None],
+                y=[None],
+                mode="lines",
+                line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[1]),
+                name="Similar Protein Groups",
+            )
+        )
+
+    formatted_protein_name = (
+        protein_group[:15] + "..." if len(protein_group) > 15 else protein_group
+    )
+    fig.add_trace(
+        go.Scatter(
+            x=wide_df['Retention time'],
+            y=wide_df[protein_group],
+            mode="lines",
+            name=formatted_protein_name,
+            line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[2]),
+        )
+    )
+
+    fig.add_trace(
+        go.Scatter(
+            x=[None],
+            y=[None],
+            mode="markers",
+            marker=dict(color=color_mapping.get("A")),
+            name="Experimental Group",
+        )
+    )
+
+    fig.add_trace(
+        go.Scatter(
+            x=[None],
+            y=[None],
+            mode="markers",
+            marker=dict(color=color_mapping.get("C")),
+            name="Control Group",
+        )
+    )
+
+    fig.update_layout(
+        title=f"Intensity of {formatted_protein_name} across retention time",
+        plot_bgcolor=colors["plot_bgcolor"],
+        xaxis_gridcolor=colors["gridcolor"],
+        yaxis_gridcolor=colors["gridcolor"],
+        xaxis_linecolor=colors["linecolor"],
+        yaxis_linecolor=colors["linecolor"],
+        xaxis_title="Retention Time",
+        yaxis_title="Intensity",
+        legend_title="Legend",
+        xaxis=dict(
+            tickmode="array",
+            tickangle=0,
+            tickvals=sorted(wide_df['Retention time']),
+            ticktext=[
+                f"<span style='font-size: 10px; color:{color_mapping.get(label[0], 'black')}'><b>•</b></span>"
+                for label in wide_df['Retention time']
+            ],
+        ),
+        autosize=True,
+        margin=dict(l=100, r=300, t=100, b=100),
+        legend=dict(
+            x=1.05,
+            y=1,
+            bgcolor="rgba(255, 255, 255, 0.5)",
+            orientation="v",
+        ),
+    )
+
+    return dict(plots=[fig])

From 4b940f0f7631fa69ddc69d1ea976b1cb43078f10 Mon Sep 17 00:00:00 2001
From: AK <kuganash.ravishanker@student.hpi.de>
Date: Wed, 10 Jul 2024 17:55:46 +0200
Subject: [PATCH 06/52] Implemented Protquantplot with retention time instead of
 Intensities

---
 .../data_analysis/prot_quant_plot_peptide.py  | 33 +++++++++----------
 protzilla/utilities/transform_dfs.py          | 25 ++++++++++++++
 2 files changed, 40 insertions(+), 18 deletions(-)

diff --git a/protzilla/data_analysis/prot_quant_plot_peptide.py b/protzilla/data_analysis/prot_quant_plot_peptide.py
index eeefbcfc..7df80807 100644
--- a/protzilla/data_analysis/prot_quant_plot_peptide.py
+++ b/protzilla/data_analysis/prot_quant_plot_peptide.py
@@ -23,7 +23,7 @@ def prot_quant_plot_peptide(
 ) -> dict:
     """
     A function to create a graph visualising protein quantifications across all samples
-    as a line diagram using retention time and intensity. It's possible to select one proteingroup
+    as a line diagram using retention time. It's possible to select one proteingroup
     that will be displayed in orange and choose a similarity measurement with a similarity score
     to get all proteingroups that are similar displayed in another color in this line diagram.
     All other proteingroups are displayed in the background as a grey polygon.
@@ -37,13 +37,10 @@ def prot_quant_plot_peptide(
 
     :return: returns a dictionary containing a list with a plotly figure and/or a list of messages
     """
-    # Ensure the dataframe includes retention time
-    if 'Retention time' not in input_df.columns:
-        raise ValueError("The input dataframe must include a 'Retention time' column.")
-
     wide_df = input_df.interpolate(method='linear', axis=0)
     wide_df = long_to_wide_retention_time(wide_df) if is_long_format(wide_df) else  wide_df
 
+
     if protein_group not in wide_df.columns:
         raise ValueError("Please select a valid protein group.")
     elif similarity_measure == "euclidean distance" and similarity < 0:
@@ -65,15 +62,15 @@ def prot_quant_plot_peptide(
     lower_upper_x = []
     lower_upper_y = []
 
-    lower_upper_x.append(wide_df['Retention time'].iloc[0])
+    lower_upper_x.append(wide_df.index[0])
     lower_upper_y.append(wide_df.iloc[0].min())
 
     for index, row in wide_df.iterrows():
-        lower_upper_x.append(row['Retention time'])
+        lower_upper_x.append(index)
         lower_upper_y.append(row.max())
 
     for index, row in reversed(list(wide_df.iterrows())):
-        lower_upper_x.append(row['Retention time'])
+        lower_upper_x.append(index)
         lower_upper_y.append(row.min())
 
     fig.add_trace(
@@ -81,14 +78,14 @@ def prot_quant_plot_peptide(
             x=lower_upper_x,
             y=lower_upper_y,
             fill="toself",
-            name="Intensity Range",
+            name="Retention time of all protein groups",
             line=dict(color="silver"),
         )
     )
 
     similar_groups = []
     for group_to_compare in wide_df.columns:
-        if group_to_compare not in ['Retention time', protein_group]:
+        if group_to_compare != protein_group:
             if similarity_measure == "euclidean distance":
                 distance = euclidean_distances(
                     stats.zscore(wide_df[protein_group]).values.reshape(1, -1),
@@ -109,7 +106,7 @@ def prot_quant_plot_peptide(
     for group in similar_groups:
         fig.add_trace(
             go.Scatter(
-                x=wide_df['Retention time'],
+                x=wide_df.index,
                 y=wide_df[group],
                 mode="lines",
                 name=group[:15] + "..." if len(group) > 15 else group,
@@ -134,7 +131,7 @@ def prot_quant_plot_peptide(
     )
     fig.add_trace(
         go.Scatter(
-            x=wide_df['Retention time'],
+            x=wide_df.index,
             y=wide_df[protein_group],
             mode="lines",
             name=formatted_protein_name,
@@ -163,22 +160,22 @@ def prot_quant_plot_peptide(
     )
 
     fig.update_layout(
-        title=f"Intensity of {formatted_protein_name} across retention time",
+        title=f"Retention time of {formatted_protein_name} in all samples",
         plot_bgcolor=colors["plot_bgcolor"],
         xaxis_gridcolor=colors["gridcolor"],
         yaxis_gridcolor=colors["gridcolor"],
         xaxis_linecolor=colors["linecolor"],
         yaxis_linecolor=colors["linecolor"],
-        xaxis_title="Retention Time",
-        yaxis_title="Intensity",
+        xaxis_title="Sample",
+        yaxis_title="Retention time",
         legend_title="Legend",
         xaxis=dict(
             tickmode="array",
             tickangle=0,
-            tickvals=sorted(wide_df['Retention time']),
+            tickvals=wide_df.index,
             ticktext=[
                 f"<span style='font-size: 10px; color:{color_mapping.get(label[0], 'black')}'><b>•</b></span>"
-                for label in wide_df['Retention time']
+                for label in wide_df.index
             ],
         ),
         autosize=True,
@@ -191,4 +188,4 @@ def prot_quant_plot_peptide(
         ),
     )
 
-    return dict(plots=[fig])
+    return dict(plots=[fig])
\ No newline at end of file
diff --git a/protzilla/utilities/transform_dfs.py b/protzilla/utilities/transform_dfs.py
index aa2bc6c0..0b1e7976 100644
--- a/protzilla/utilities/transform_dfs.py
+++ b/protzilla/utilities/transform_dfs.py
@@ -29,6 +29,31 @@ def long_to_wide(intensity_df: pd.DataFrame, value_name: str = None):
     intensity_df = intensity_df.fillna(intensity_df.mean())
     return intensity_df
 
+def long_to_wide_retention_time(intensity_df: pd.DataFrame, value_name: str = None):
+    """
+    This function transforms the dataframe to a wide format that
+    can be more easily handled by packages such as sklearn.
+    Each sample gets one row with all observations as columns.
+
+    :param intensity_df: the dataframe that should be transformed into
+        long format
+        :type intensity_df: pd.DataFrame
+
+    :return: returns dataframe in wide format suitable for use by
+        packages such as sklearn
+    :rtype: pd.DataFrame
+    """
+
+    if intensity_df.duplicated(subset=["Sample", "Protein ID"]).any():
+        intensity_df = intensity_df.groupby(["Sample", "Protein ID"]).mean().reset_index()
+        intensity_df = intensity_df.dropna()
+
+    values_name = 'Retention time'
+    intensity_df = pd.pivot(
+        intensity_df, index="Sample", columns="Protein ID", values=values_name
+    )
+    intensity_df = intensity_df.fillna(intensity_df.mean())
+    return intensity_df
 
 def wide_to_long(wide_df: pd.DataFrame, original_long_df: pd.DataFrame):
     """

From 3d2bb3032cc293b9ac48f41da75fcaa3d72f78dd Mon Sep 17 00:00:00 2001
From: AK <kuganash.ravishanker@student.hpi.de>
Date: Thu, 11 Jul 2024 15:29:09 +0200
Subject: [PATCH 07/52] Implemented Timequantplot

---
 ..._peptide.py => time_quant_plot_peptide.py} | 43 ++++++++---------
 protzilla/methods/data_analysis.py            | 13 ++---
 protzilla/utilities/transform_dfs.py          | 47 +++++++------------
 ui/runs/form_mapping.py                       |  2 +-
 ui/runs/forms/data_analysis.py                |  2 +-
 5 files changed, 44 insertions(+), 63 deletions(-)
 rename protzilla/data_analysis/{prot_quant_plot_peptide.py => time_quant_plot_peptide.py} (88%)

diff --git a/protzilla/data_analysis/prot_quant_plot_peptide.py b/protzilla/data_analysis/time_quant_plot_peptide.py
similarity index 88%
rename from protzilla/data_analysis/prot_quant_plot_peptide.py
rename to protzilla/data_analysis/time_quant_plot_peptide.py
index 7df80807..f5921ae4 100644
--- a/protzilla/data_analysis/prot_quant_plot_peptide.py
+++ b/protzilla/data_analysis/time_quant_plot_peptide.py
@@ -3,7 +3,7 @@
 from scipy import stats
 from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
 
-from protzilla.utilities.transform_dfs import is_long_format, long_to_wide_retention_time
+from protzilla.utilities.transform_dfs import is_long_format, long_to_wide_time
 
 # Define color constants
 PROTZILLA_DISCRETE_COLOR_SEQUENCE = ["#636EFA", "#EF553B", "#00CC96", "#AB63FA", "#19D3F3", "#E763FA", "#FECB52", "#FFA15A", "#FF6692", "#B6E880"]
@@ -15,8 +15,9 @@
     "annotation_proteins_of_interest": "#4A536A",
 }
 
-def prot_quant_plot_peptide(
+def time_quant_plot_peptide(
     input_df: pd.DataFrame,
+    metadata_df: pd.DataFrame,
     protein_group: str,
     similarity: float = 1.0,
     similarity_measure: str = "euclidean distance",
@@ -30,6 +31,7 @@ def prot_quant_plot_peptide(
 
     :param input_df: A dataframe in protzilla wide format, where each row
         represents a sample and each column represents a feature.
+    :param metadata_df: A dataframe containing the metadata of the samples.
     :param protein_group: Protein IDs as the columnheader of the dataframe
     :param similarity_measure: method to compare the chosen proteingroup with all others. The two
         methods are "cosine similarity" and "euclidean distance".
@@ -37,8 +39,16 @@ def prot_quant_plot_peptide(
 
     :return: returns a dictionary containing a list with a plotly figure and/or a list of messages
     """
+
+    input_df = pd.merge(
+        left=input_df,
+        right=metadata_df[["Sample", "Time"]],
+        on="Sample",
+        copy=False,
+    )
+
     wide_df = input_df.interpolate(method='linear', axis=0)
-    wide_df = long_to_wide_retention_time(wide_df) if is_long_format(wide_df) else  wide_df
+    wide_df = long_to_wide_time(wide_df) if is_long_format(wide_df) else  wide_df
 
 
     if protein_group not in wide_df.columns:
@@ -78,7 +88,7 @@ def prot_quant_plot_peptide(
             x=lower_upper_x,
             y=lower_upper_y,
             fill="toself",
-            name="Retention time of all protein groups",
+            name="Intensity Range",
             line=dict(color="silver"),
         )
     )
@@ -138,45 +148,30 @@ def prot_quant_plot_peptide(
             line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[2]),
         )
     )
-
     fig.add_trace(
         go.Scatter(
             x=[None],
             y=[None],
             mode="markers",
             marker=dict(color=color_mapping.get("A")),
-            name="Experimental Group",
+            name="Intensity",
         )
     )
-
-    fig.add_trace(
-        go.Scatter(
-            x=[None],
-            y=[None],
-            mode="markers",
-            marker=dict(color=color_mapping.get("C")),
-            name="Control Group",
-        )
-    )
-
     fig.update_layout(
-        title=f"Retention time of {formatted_protein_name} in all samples",
+        title=f"Time Series of {formatted_protein_name} in all samples",
         plot_bgcolor=colors["plot_bgcolor"],
         xaxis_gridcolor=colors["gridcolor"],
         yaxis_gridcolor=colors["gridcolor"],
         xaxis_linecolor=colors["linecolor"],
         yaxis_linecolor=colors["linecolor"],
-        xaxis_title="Sample",
-        yaxis_title="Retention time",
+        xaxis_title="Time",
+        yaxis_title="Intensity",
         legend_title="Legend",
         xaxis=dict(
             tickmode="array",
             tickangle=0,
             tickvals=wide_df.index,
-            ticktext=[
-                f"<span style='font-size: 10px; color:{color_mapping.get(label[0], 'black')}'><b>•</b></span>"
-                for label in wide_df.index
-            ],
+            ticktext=[wide_df["Time"].unique() for wide_df["Time"] in wide_df.index],
         ),
         autosize=True,
         margin=dict(l=100, r=300, t=100, b=100),
diff --git a/protzilla/methods/data_analysis.py b/protzilla/methods/data_analysis.py
index 5fffaa67..bcd705a5 100644
--- a/protzilla/methods/data_analysis.py
+++ b/protzilla/methods/data_analysis.py
@@ -21,7 +21,7 @@
     prot_quant_plot,
     scatter_plot,
 )
-from protzilla.data_analysis.prot_quant_plot_peptide import prot_quant_plot_peptide
+from protzilla.data_analysis.time_quant_plot_peptide import time_quant_plot_peptide
 from protzilla.data_analysis.protein_graphs import peptides_to_isoform, variation_graph
 from protzilla.methods.data_preprocessing import TransformationLog
 from protzilla.steps import Plots, Step, StepManager
@@ -326,24 +326,25 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict:
         )
         return inputs
 
-class PlotProtQuantPeptide(PlotStep):
-    display_name = "Protein Quantification Plot For Peptide"
+class PlotTimeQuantPeptide(PlotStep):
+    display_name = "Time Quantification Plot For Peptide"
     operation = "plot"
     method_description = (
-        "Creates a line chart for intensity across samples for protein groups"
+        "Creates a line chart for intensity across Time for protein groups"
     )
 
-    input_keys = ["input_df", "protein_group", "similarity_measure", "similarity"]
+    input_keys = ["input_df", "metadata_df", "protein_group", "similarity_measure", "similarity"]
     output_keys = []
 
     def method(self, inputs: dict) -> dict:
-        return prot_quant_plot_peptide(**inputs)
+        return time_quant_plot_peptide(**inputs)
 
 
     def insert_dataframes(self, steps: StepManager, inputs) -> dict:
         inputs["input_df"] = steps.get_step_output(
             Step, "peptide_df", inputs["input_df"]
         )
+        inputs["metadata_df"] = steps.metadata_df
         return inputs
 
 
diff --git a/protzilla/utilities/transform_dfs.py b/protzilla/utilities/transform_dfs.py
index 0b1e7976..5e5c5e99 100644
--- a/protzilla/utilities/transform_dfs.py
+++ b/protzilla/utilities/transform_dfs.py
@@ -3,6 +3,7 @@
 from protzilla.utilities import default_intensity_column
 
 
+
 def long_to_wide(intensity_df: pd.DataFrame, value_name: str = None):
     """
     This function transforms the dataframe to a wide format that
@@ -17,19 +18,13 @@ def long_to_wide(intensity_df: pd.DataFrame, value_name: str = None):
         packages such as sklearn
     :rtype: pd.DataFrame
     """
-
-    if intensity_df.duplicated(subset=["Sample", "Protein ID"]).any():
-        intensity_df = intensity_df.groupby(["Sample", "Protein ID"]).mean().reset_index()
-        intensity_df = intensity_df.dropna()
-
     values_name = default_intensity_column(intensity_df) if value_name is None else value_name
-    intensity_df = pd.pivot(
+    return pd.pivot(
         intensity_df, index="Sample", columns="Protein ID", values=values_name
     )
-    intensity_df = intensity_df.fillna(intensity_df.mean())
-    return intensity_df
 
-def long_to_wide_retention_time(intensity_df: pd.DataFrame, value_name: str = None):
+
+def long_to_wide_time(intensity_df: pd.DataFrame, value_name: str = None):
     """
     This function transforms the dataframe to a wide format that
     can be more easily handled by packages such as sklearn.
@@ -43,14 +38,11 @@ def long_to_wide_retention_time(intensity_df: pd.DataFrame, value_name: str = No
         packages such as sklearn
     :rtype: pd.DataFrame
     """
-
-    if intensity_df.duplicated(subset=["Sample", "Protein ID"]).any():
-        intensity_df = intensity_df.groupby(["Sample", "Protein ID"]).mean().reset_index()
-        intensity_df = intensity_df.dropna()
-
-    values_name = 'Retention time'
+    if intensity_df.duplicated(subset=["Time", "Protein ID"]).any():
+        intensity_df = intensity_df.groupby(["Time", "Protein ID"]).mean().reset_index()
+    values_name = default_intensity_column(intensity_df) if value_name is None else value_name
     intensity_df = pd.pivot(
-        intensity_df, index="Sample", columns="Protein ID", values=values_name
+        intensity_df, index="Time", columns="Protein ID", values=values_name
     )
     intensity_df = intensity_df.fillna(intensity_df.mean())
     return intensity_df
@@ -72,34 +64,27 @@ def wide_to_long(wide_df: pd.DataFrame, original_long_df: pd.DataFrame):
     """
     # Read out info from original dataframe
     intensity_name = default_intensity_column(original_long_df)
-
-    # Identify the additional columns from the original long dataframe
-    additional_columns = ['Modification', 'Retention Time']
-    existing_additional_columns = [col for col in additional_columns if col in original_long_df.columns]
-
-    # Melt the wide format back to long format
-    melted_df = pd.melt(
-        wide_df,
+    gene_info = original_long_df["Gene"]
+    # Turn the wide format into the long format
+    intensity_df = pd.melt(
+        wide_df.reset_index(),
         id_vars="Sample",
         var_name="Protein ID",
         value_name=intensity_name,
     )
-    melted_df.sort_values(
+    intensity_df.sort_values(
         by=["Sample", "Protein ID"],
         ignore_index=True,
         inplace=True,
     )
+    intensity_df.insert(2, "Gene", gene_info)
 
-    # Add back the additional columns if they exist in the original dataframe
-    for col in existing_additional_columns:
-        melted_df[col] = original_long_df[col]
-
-    return melted_df
+    return intensity_df
 
 
 def is_long_format(df: pd.DataFrame):
     required_columns = {"Sample", "Protein ID"}
-    additional_columns = {"Gene", "Retention time"}
+    additional_columns = {"Gene", "Time"}
     return required_columns.issubset(df.columns) and any(col in df.columns for col in additional_columns)
 
 
diff --git a/ui/runs/form_mapping.py b/ui/runs/form_mapping.py
index f195f3b6..adac90a1 100644
--- a/ui/runs/form_mapping.py
+++ b/ui/runs/form_mapping.py
@@ -49,7 +49,7 @@
     data_analysis.PlotScatterPlot: data_analysis_forms.PlotScatterPlotForm,
     data_analysis.PlotClustergram: data_analysis_forms.PlotClustergramForm,
     data_analysis.PlotProtQuant: data_analysis_forms.PlotProtQuantForm,
-    data_analysis.PlotProtQuantPeptide: data_analysis_forms.PlotProtQuantPeptideForm,
+    data_analysis.PlotTimeQuantPeptide: data_analysis_forms.PlotTimeQuantPeptideForm,
     data_analysis.PlotPrecisionRecallCurve: data_analysis_forms.PlotPrecisionRecallCurveForm,
     data_analysis.PlotROC: data_analysis_forms.PlotROCCurveForm,
     data_analysis.ClusteringKMeans: data_analysis_forms.ClusteringKMeansForm,
diff --git a/ui/runs/forms/data_analysis.py b/ui/runs/forms/data_analysis.py
index 30845d74..de182b5b 100644
--- a/ui/runs/forms/data_analysis.py
+++ b/ui/runs/forms/data_analysis.py
@@ -514,7 +514,7 @@ def fill_form(self, run: Run) -> None:
                 self.data["similarity"] = 1
 
 
-class PlotProtQuantPeptideForm(MethodForm):
+class PlotTimeQuantPeptideForm(MethodForm):
     is_dynamic = True
 
     input_df = CustomChoiceField(

From 468ac23d78986abfd9c943b89fffe80cfe93293d Mon Sep 17 00:00:00 2001
From: AK <kuganash.ravishanker@student.hpi.de>
Date: Thu, 11 Jul 2024 15:33:02 +0200
Subject: [PATCH 08/52] Renamed the plot to time series plot

---
 ...time_quant_plot_peptide.py => time_series_plot_peptide.py} | 2 +-
 protzilla/methods/data_analysis.py                            | 4 ++--
 ui/runs/form_mapping.py                                       | 2 +-
 ui/runs/forms/data_analysis.py                                | 2 +-
 4 files changed, 5 insertions(+), 5 deletions(-)
 rename protzilla/data_analysis/{time_quant_plot_peptide.py => time_series_plot_peptide.py} (99%)

diff --git a/protzilla/data_analysis/time_quant_plot_peptide.py b/protzilla/data_analysis/time_series_plot_peptide.py
similarity index 99%
rename from protzilla/data_analysis/time_quant_plot_peptide.py
rename to protzilla/data_analysis/time_series_plot_peptide.py
index f5921ae4..5f5ac64e 100644
--- a/protzilla/data_analysis/time_quant_plot_peptide.py
+++ b/protzilla/data_analysis/time_series_plot_peptide.py
@@ -15,7 +15,7 @@
     "annotation_proteins_of_interest": "#4A536A",
 }
 
-def time_quant_plot_peptide(
+def time_series_plot_peptide(
     input_df: pd.DataFrame,
     metadata_df: pd.DataFrame,
     protein_group: str,
diff --git a/protzilla/methods/data_analysis.py b/protzilla/methods/data_analysis.py
index bcd705a5..42a77182 100644
--- a/protzilla/methods/data_analysis.py
+++ b/protzilla/methods/data_analysis.py
@@ -21,7 +21,7 @@
     prot_quant_plot,
     scatter_plot,
 )
-from protzilla.data_analysis.time_quant_plot_peptide import time_quant_plot_peptide
+from protzilla.data_analysis.time_series_plot_peptide import time_quant_plot_peptide
 from protzilla.data_analysis.protein_graphs import peptides_to_isoform, variation_graph
 from protzilla.methods.data_preprocessing import TransformationLog
 from protzilla.steps import Plots, Step, StepManager
@@ -326,7 +326,7 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict:
         )
         return inputs
 
-class PlotTimeQuantPeptide(PlotStep):
+class PlotTimeSeriesPeptide(PlotStep):
     display_name = "Time Quantification Plot For Peptide"
     operation = "plot"
     method_description = (
diff --git a/ui/runs/form_mapping.py b/ui/runs/form_mapping.py
index adac90a1..083676f0 100644
--- a/ui/runs/form_mapping.py
+++ b/ui/runs/form_mapping.py
@@ -49,7 +49,7 @@
     data_analysis.PlotScatterPlot: data_analysis_forms.PlotScatterPlotForm,
     data_analysis.PlotClustergram: data_analysis_forms.PlotClustergramForm,
     data_analysis.PlotProtQuant: data_analysis_forms.PlotProtQuantForm,
-    data_analysis.PlotTimeQuantPeptide: data_analysis_forms.PlotTimeQuantPeptideForm,
+    data_analysis.PlotTimeSeriesPeptide: data_analysis_forms.PlotTimeSeriesForm,
     data_analysis.PlotPrecisionRecallCurve: data_analysis_forms.PlotPrecisionRecallCurveForm,
     data_analysis.PlotROC: data_analysis_forms.PlotROCCurveForm,
     data_analysis.ClusteringKMeans: data_analysis_forms.ClusteringKMeansForm,
diff --git a/ui/runs/forms/data_analysis.py b/ui/runs/forms/data_analysis.py
index de182b5b..de3651b6 100644
--- a/ui/runs/forms/data_analysis.py
+++ b/ui/runs/forms/data_analysis.py
@@ -514,7 +514,7 @@ def fill_form(self, run: Run) -> None:
                 self.data["similarity"] = 1
 
 
-class PlotTimeQuantPeptideForm(MethodForm):
+class PlotTimeSeriesForm(MethodForm):
     is_dynamic = True
 
     input_df = CustomChoiceField(

From 8a419b27f5b2a1cc6e0aae3fe7ffc7ec096f8e24 Mon Sep 17 00:00:00 2001
From: AK <kuganash.ravishanker@student.hpi.de>
Date: Thu, 11 Jul 2024 16:15:38 +0200
Subject: [PATCH 09/52] Fixed Tests

---
 protzilla/importing/peptide_import.py         | 1 -
 protzilla/methods/data_analysis.py            | 4 ++--
 user_data/workflows/workflow_Plot-Thesis.yaml | 4 ++--
 3 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/protzilla/importing/peptide_import.py b/protzilla/importing/peptide_import.py
index e5400204..d38495dd 100644
--- a/protzilla/importing/peptide_import.py
+++ b/protzilla/importing/peptide_import.py
@@ -87,7 +87,6 @@ def evidence_import(file_path, intensity_name, map_to_uniprot) -> dict:
         "Missed cleavages",
         "PEP",
         "Raw file",
-        "Retention time",
     ]
 
     read = pd.read_csv(
diff --git a/protzilla/methods/data_analysis.py b/protzilla/methods/data_analysis.py
index 42a77182..b2bbbbdf 100644
--- a/protzilla/methods/data_analysis.py
+++ b/protzilla/methods/data_analysis.py
@@ -21,7 +21,7 @@
     prot_quant_plot,
     scatter_plot,
 )
-from protzilla.data_analysis.time_series_plot_peptide import time_quant_plot_peptide
+from protzilla.data_analysis.time_series_plot_peptide import time_series_plot_peptide
 from protzilla.data_analysis.protein_graphs import peptides_to_isoform, variation_graph
 from protzilla.methods.data_preprocessing import TransformationLog
 from protzilla.steps import Plots, Step, StepManager
@@ -337,7 +337,7 @@ class PlotTimeSeriesPeptide(PlotStep):
     output_keys = []
 
     def method(self, inputs: dict) -> dict:
-        return time_quant_plot_peptide(**inputs)
+        return time_series_plot_peptide(**inputs)
 
 
     def insert_dataframes(self, steps: StepManager, inputs) -> dict:
diff --git a/user_data/workflows/workflow_Plot-Thesis.yaml b/user_data/workflows/workflow_Plot-Thesis.yaml
index 1758d861..a3dee9fa 100644
--- a/user_data/workflows/workflow_Plot-Thesis.yaml
+++ b/user_data/workflows/workflow_Plot-Thesis.yaml
@@ -18,8 +18,8 @@ steps:
   - form_inputs:
       similarity_measure: euclidean distance
     inputs: {}
-    instance_identifier: PlotProtQuant_1
-    type: PlotProtQuantPeptide
+    instance_identifier: PlotTimeSeries_1
+    type: PlotTimeSeriesPeptide
   - form_inputs:
       percentage: 0.5
     inputs: { }

From 61b6df881bc2d805c9cc89e82afa6597c8667489 Mon Sep 17 00:00:00 2001
From: AK <kuganash.ravishanker@student.hpi.de>
Date: Thu, 11 Jul 2024 16:40:44 +0200
Subject: [PATCH 10/52] Implemented test for time series plot

---
 .../data_analysis/test_time_series_plots.py   | 65 +++++++++++++++++++
 1 file changed, 65 insertions(+)
 create mode 100644 tests/protzilla/data_analysis/test_time_series_plots.py

diff --git a/tests/protzilla/data_analysis/test_time_series_plots.py b/tests/protzilla/data_analysis/test_time_series_plots.py
new file mode 100644
index 00000000..85bfad11
--- /dev/null
+++ b/tests/protzilla/data_analysis/test_time_series_plots.py
@@ -0,0 +1,65 @@
+import numpy as np
+import pandas as pd
+import pytest
+
+from protzilla.data_analysis.time_series_plot_peptide import time_series_plot_peptide
+
+
+@pytest.fixture
+def time_series_test_data():
+    test_intensity_list = (
+        ["Sample1", "Protein1", "Gene1", 20],
+        ["Sample1", "Protein2", "Gene1", 16],
+        ["Sample1", "Protein3", "Gene1", 1],
+        ["Sample1", "Protein4", "Gene1", 14],
+        ["Sample2", "Protein1", "Gene1", 20],
+        ["Sample2", "Protein2", "Gene1", 15],
+        ["Sample2", "Protein3", "Gene1", 2],
+        ["Sample2", "Protein4", "Gene1", 15],
+        ["Sample3", "Protein1", "Gene1", 22],
+        ["Sample3", "Protein2", "Gene1", 14],
+        ["Sample3", "Protein3", "Gene1", 3],
+        ["Sample3", "Protein4", "Gene1", 16],
+        ["Sample4", "Protein1", "Gene1", 8],
+        ["Sample4", "Protein2", "Gene1", 15],
+        ["Sample4", "Protein3", "Gene1", 1],
+        ["Sample4", "Protein4", "Gene1", 9],
+        ["Sample5", "Protein1", "Gene1", 10],
+        ["Sample5", "Protein2", "Gene1", 14],
+        ["Sample5", "Protein3", "Gene1", 2],
+        ["Sample5", "Protein4", "Gene1", 10],
+        ["Sample6", "Protein1", "Gene1", 12],
+        ["Sample6", "Protein2", "Gene1", 13],
+        ["Sample6", "Protein3", "Gene1", 3],
+        ["Sample6", "Protein4", "Gene1", 11],
+        ["Sample7", "Protein1", "Gene1", 12],
+        ["Sample7", "Protein2", "Gene1", 13],
+        ["Sample7", "Protein3", "Gene1", 3],
+        ["Sample7", "Protein4", "Gene1", 11],
+    )
+
+    test_intensity_df = pd.DataFrame(
+        data=test_intensity_list,
+        columns=["Sample", "Protein ID", "Gene", "Intensity"],
+    )
+
+    test_metadata_df = (
+        ["Sample1", "02:00:00", 1],
+        ["Sample2", "06:00:00", 1],
+        ["Sample3", "10:00:00", 1],
+         ["Sample4", "14:00:00", 1],
+    )
+    test_metadata_df = pd.DataFrame(
+        data=test_metadata_df,
+        columns=["Sample", "Time", "Day"],
+    )
+    return test_intensity_df, test_metadata_df
+
+def test_time_series_plot(show_figures, time_series_test_data):
+    test_intensity, test_metadata = time_series_test_data
+    outputs = time_series_plot_peptide(test_intensity, test_metadata, "Protein1")
+    assert "plots" in outputs
+    fig = outputs["plots"][0]
+    if show_figures:
+        fig.show()
+    return
\ No newline at end of file

From 935f0b6f800c43de951209dec09876a4dbfe5565 Mon Sep 17 00:00:00 2001
From: AK <kuganash.ravishanker@student.hpi.de>
Date: Thu, 18 Jul 2024 14:26:27 +0200
Subject: [PATCH 11/52] Implemented time series regression analysis

---
 protzilla/data_analysis/time_series_helper.py |  7 ++
 .../time_series_regression_analysis.py        | 91 +++++++++++++++++++
 protzilla/methods/data_analysis.py            | 23 +++++
 ui/runs/form_mapping.py                       |  1 +
 ui/runs/forms/data_analysis.py                | 20 +++-
 5 files changed, 141 insertions(+), 1 deletion(-)
 create mode 100644 protzilla/data_analysis/time_series_helper.py
 create mode 100644 protzilla/data_analysis/time_series_regression_analysis.py

diff --git a/protzilla/data_analysis/time_series_helper.py b/protzilla/data_analysis/time_series_helper.py
new file mode 100644
index 00000000..b3ebe0c5
--- /dev/null
+++ b/protzilla/data_analysis/time_series_helper.py
@@ -0,0 +1,7 @@
+import pandas as pd
+from datetime import datetime
+
+def convert_time_to_datetime(time_str):
+    time_obj = datetime.strptime(time_str, '%H:%M:%S')
+    seconds_since_midnight = time_obj.second + time_obj.minute * 60 + time_obj.hour * 3600
+    return seconds_since_midnight
diff --git a/protzilla/data_analysis/time_series_regression_analysis.py b/protzilla/data_analysis/time_series_regression_analysis.py
new file mode 100644
index 00000000..3b06c2f3
--- /dev/null
+++ b/protzilla/data_analysis/time_series_regression_analysis.py
@@ -0,0 +1,91 @@
+import logging
+
+import numpy as np
+import pandas as pd
+import plotly.graph_objects as go
+import plotly.express as px
+
+from  protzilla.data_analysis.time_series_helper import convert_time_to_datetime
+
+from sklearn.linear_model import LinearRegression
+from sklearn.model_selection import train_test_split
+from sklearn.metrics import mean_squared_error, r2_score
+
+def time_series_linear_regression(
+        input_df: pd.DataFrame,
+        metadata_df: pd.DataFrame,
+        test_size: float,
+):
+
+    input_df = pd.merge(
+        left=input_df,
+        right=metadata_df,
+        on="Sample",
+        copy=False,
+    )
+
+    input_df["Time"] = input_df["Time"].apply(convert_time_to_datetime)
+    input_df = input_df.interpolate(method='linear', axis=0)
+    X = input_df[["Time"]]
+    y = input_df["Intensity"]
+
+    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, shuffle=False)
+    model = LinearRegression()
+    model.fit(X_train, y_train)
+
+    y_pred_train = model.predict(X_train)
+    y_pred_test = model.predict(X_test)
+
+
+    """
+        train_rmse = np.sqrt(mean_squared_error(y_train, y_pred_train))
+        test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))
+        train_r2 = r2_score(y_train, y_pred_train)
+        test_r2 = r2_score(y_test, y_pred_test)
+        return dict(
+            train_rmse=train_rmse,
+            test_rmse=test_rmse,
+            train_r2=train_r2,
+            test_r2=test_r2,
+        )
+    """
+
+    train_df = pd.DataFrame({'Time': X_train['Time'], 'Intensity': y_train, 'Predicted': y_pred_train, 'Type': 'Train'})
+    test_df = pd.DataFrame({'Time': X_test['Time'], 'Intensity': y_test, 'Predicted': y_pred_test, 'Type': 'Test'})
+    plot_df = pd.concat([train_df, test_df])
+
+    fig = go.Figure()
+
+    fig.add_trace(go.Scatter(
+        x=plot_df['Time'],
+        y=plot_df['Intensity'],
+        mode='markers',
+        name='Actual Intensity',
+        marker=dict(color='blue')
+    ))
+
+    fig.add_trace(go.Scatter(
+        x=plot_df['Time'],
+        y=plot_df['Predicted'],
+        mode='lines',
+        name='Predicted Intensity',
+        line=dict(color='red')
+    ))
+
+    fig.update_layout(
+        title={
+            "text": "<b>Intensity over Time</b>",
+            "font": dict(size=16),
+            "y": 0.98,
+            "x": 0.5,
+            "xanchor": "center",
+            "yanchor": "top",
+        },
+        xaxis_title="Time",
+        yaxis_title="Intensity",
+        plot_bgcolor="white",
+        yaxis={"gridcolor": "lightgrey", "zerolinecolor": "lightgrey"},
+        font=dict(size=14, family="Arial")
+    )
+
+    return dict(plot=[fig])
diff --git a/protzilla/methods/data_analysis.py b/protzilla/methods/data_analysis.py
index b2bbbbdf..b8171a9a 100644
--- a/protzilla/methods/data_analysis.py
+++ b/protzilla/methods/data_analysis.py
@@ -12,6 +12,7 @@
     mann_whitney_test_on_intensity_data
 from protzilla.data_analysis.differential_expression_t_test import t_test
 from protzilla.data_analysis.dimension_reduction import t_sne, umap
+from protzilla.data_analysis.time_series_regression_analysis import time_series_linear_regression
 from protzilla.data_analysis.ptm_analysis import filter_peptides_of_protein, ptms_per_sample, \
     ptms_per_protein_and_sample
 from protzilla.data_analysis.model_evaluation import evaluate_classification_model
@@ -738,6 +739,28 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict:
         return inputs
 
 
+class TimeSeriesLinearRegression(PlotStep):
+    display_name = "Time Series Linear Regression"
+    operation = "Time series analysis"
+    method_description = ("A function to fit a linear model using ordinary least squares for each protein. "
+                                    "The linear model fits the protein intensities on Y axis and the Time on X. "
+                                    "The p-values are corrected for multiple testing.")
+
+    input_keys = [
+        "input_df",
+        "metadata_df",
+        "test_size",
+    ]
+    output_keys = []
+
+    def method(self, inputs: dict) -> dict:
+        return time_series_linear_regression(**inputs)
+
+    def insert_dataframes(self, steps: StepManager, inputs) -> dict:
+        inputs["input_df"] = steps.get_step_output(Step, "peptide_df", inputs["input_df"])
+        inputs["metadata_df"] = steps.metadata_df
+        return inputs
+
 class PTMsPerSample(DataAnalysisStep):
     display_name = "PTMs per Sample"
     operation = "Peptide analysis"
diff --git a/ui/runs/form_mapping.py b/ui/runs/form_mapping.py
index 083676f0..c14d9a20 100644
--- a/ui/runs/form_mapping.py
+++ b/ui/runs/form_mapping.py
@@ -65,6 +65,7 @@
     data_analysis.SelectPeptidesForProtein: data_analysis_forms.SelectPeptidesForProteinForm,
     data_analysis.PTMsPerSample: data_analysis_forms.PTMsPerSampleForm,
     data_analysis.PTMsProteinAndPerSample: data_analysis_forms.PTMsPerProteinAndSampleForm,
+    data_analysis.TimeSeriesLinearRegression: data_analysis_forms.PlotTimeSeriesLinearRegressionForm,
     data_preprocessing.ImputationByMinPerSample: data_preprocessing_forms.ImputationByMinPerSampleForms,
     data_integration.EnrichmentAnalysisGOAnalysisWithString: data_integration_forms.EnrichmentAnalysisGOAnalysisWithStringForm,
     data_integration.EnrichmentAnalysisGOAnalysisWithEnrichr: data_integration_forms.EnrichmentAnalysisGOAnalysisWithEnrichrForm,
diff --git a/ui/runs/forms/data_analysis.py b/ui/runs/forms/data_analysis.py
index de3651b6..9e6d6f29 100644
--- a/ui/runs/forms/data_analysis.py
+++ b/ui/runs/forms/data_analysis.py
@@ -1161,4 +1161,22 @@ def fill_form(self, run: Run) -> None:
             SelectPeptidesForProtein, "peptide_df"
         )
         if single_protein_peptides:
-            self.fields["peptide_df"].initial = single_protein_peptides[0]
\ No newline at end of file
+            self.fields["peptide_df"].initial = single_protein_peptides[0]
+
+
+class PlotTimeSeriesLinearRegressionForm(MethodForm):
+    input_df = CustomChoiceField(
+        choices=[],
+        label="Peptide dataframe containing the peptides of a single protein",
+    )
+    test_size = CustomFloatField(
+        label="Test size",
+        min_value=0,
+        max_value=1,
+        initial=0.2
+    )
+
+    def fill_form(self, run: Run) -> None:
+        self.fields["input_df"].choices = fill_helper.get_choices_for_peptide_df_steps(
+            run
+        )

From de89b56a7215089b35f735387962d845bb868e77 Mon Sep 17 00:00:00 2001
From: AK <kuganash.ravishanker@student.hpi.de>
Date: Fri, 19 Jul 2024 13:41:23 +0200
Subject: [PATCH 12/52] Added protein group selection and metric outputs to time series linear regression

---
 protzilla/data_analysis/time_series_helper.py |  4 +-
 .../time_series_regression_analysis.py        | 70 +++++++++++--------
 protzilla/methods/data_analysis.py            | 10 ++-
 ui/runs/form_mapping.py                       |  2 +-
 ui/runs/forms/data_analysis.py                | 23 +++++-
 5 files changed, 71 insertions(+), 38 deletions(-)

diff --git a/protzilla/data_analysis/time_series_helper.py b/protzilla/data_analysis/time_series_helper.py
index b3ebe0c5..b10b13b6 100644
--- a/protzilla/data_analysis/time_series_helper.py
+++ b/protzilla/data_analysis/time_series_helper.py
@@ -3,5 +3,5 @@
 
 def convert_time_to_datetime(time_str):
     time_obj = datetime.strptime(time_str, '%H:%M:%S')
-    seconds_since_midnight = time_obj.second + time_obj.minute * 60 + time_obj.hour * 3600
-    return seconds_since_midnight
+    hours_since_midnight = time_obj.hour
+    return hours_since_midnight
\ No newline at end of file
diff --git a/protzilla/data_analysis/time_series_regression_analysis.py b/protzilla/data_analysis/time_series_regression_analysis.py
index 3b06c2f3..b3fb2099 100644
--- a/protzilla/data_analysis/time_series_regression_analysis.py
+++ b/protzilla/data_analysis/time_series_regression_analysis.py
@@ -3,20 +3,35 @@
 import numpy as np
 import pandas as pd
 import plotly.graph_objects as go
-import plotly.express as px
 
-from  protzilla.data_analysis.time_series_helper import convert_time_to_datetime
+from protzilla.data_analysis.time_series_helper import convert_time_to_datetime
+from protzilla.constants.colors import PROTZILLA_DISCRETE_COLOR_SEQUENCE
 
 from sklearn.linear_model import LinearRegression
 from sklearn.model_selection import train_test_split
 from sklearn.metrics import mean_squared_error, r2_score
 
+colors = {
+    "plot_bgcolor": "white",
+    "gridcolor": "#F1F1F1",
+    "linecolor": "#F1F1F1",
+    "annotation_text_color": "#ffffff",
+    "annotation_proteins_of_interest": "#4A536A",
+}
+
+
 def time_series_linear_regression(
         input_df: pd.DataFrame,
         metadata_df: pd.DataFrame,
+        protein_group: str,
         test_size: float,
 ):
 
+    if test_size < 0 or test_size > 1 :
+        raise ValueError("Test size should be between 0 and 1")
+
+    input_df = input_df[input_df['Protein ID'] == protein_group]
+
     input_df = pd.merge(
         left=input_df,
         right=metadata_df,
@@ -36,19 +51,10 @@ def time_series_linear_regression(
     y_pred_train = model.predict(X_train)
     y_pred_test = model.predict(X_test)
 
-
-    """
-        train_rmse = np.sqrt(mean_squared_error(y_train, y_pred_train))
-        test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))
-        train_r2 = r2_score(y_train, y_pred_train)
-        test_r2 = r2_score(y_test, y_pred_test)
-        return dict(
-            train_rmse=train_rmse,
-            test_rmse=test_rmse,
-            train_r2=train_r2,
-            test_r2=test_r2,
-        )
-    """
+    train_rmse = np.sqrt(mean_squared_error(y_train, y_pred_train))
+    test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))
+    train_r2 = r2_score(y_train, y_pred_train)
+    test_r2 = r2_score(y_test, y_pred_test)
 
     train_df = pd.DataFrame({'Time': X_train['Time'], 'Intensity': y_train, 'Predicted': y_pred_train, 'Type': 'Train'})
     test_df = pd.DataFrame({'Time': X_test['Time'], 'Intensity': y_test, 'Predicted': y_pred_test, 'Type': 'Test'})
@@ -61,7 +67,7 @@ def time_series_linear_regression(
         y=plot_df['Intensity'],
         mode='markers',
         name='Actual Intensity',
-        marker=dict(color='blue')
+        marker=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[0])
     ))
 
     fig.add_trace(go.Scatter(
@@ -69,23 +75,27 @@ def time_series_linear_regression(
         y=plot_df['Predicted'],
         mode='lines',
         name='Predicted Intensity',
-        line=dict(color='red')
+        line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[2])
     ))
 
     fig.update_layout(
-        title={
-            "text": "<b>Intensity over Time</b>",
-            "font": dict(size=16),
-            "y": 0.98,
-            "x": 0.5,
-            "xanchor": "center",
-            "yanchor": "top",
-        },
-        xaxis_title="Time",
+        title=f"Intensity over Time for {protein_group}",
+        plot_bgcolor=colors["plot_bgcolor"],
+        xaxis_gridcolor=colors["gridcolor"],
+        yaxis_gridcolor=colors["gridcolor"],
+        xaxis_linecolor=colors["linecolor"],
+        yaxis_linecolor=colors["linecolor"],
+        xaxis_title="Time (hours)",
         yaxis_title="Intensity",
-        plot_bgcolor="white",
-        yaxis={"gridcolor": "lightgrey", "zerolinecolor": "lightgrey"},
-        font=dict(size=14, family="Arial")
+        legend_title="Legend",
+        autosize=True,
+        margin=dict(l=100, r=300, t=100, b=100),
     )
 
-    return dict(plot=[fig])
+    return dict(
+        train_root_mean_squared=train_rmse,
+        test_root_mean_squared=test_rmse,
+        train_r2_score=train_r2,
+        test_r2_score=test_r2,
+        plots=[fig],
+    )
diff --git a/protzilla/methods/data_analysis.py b/protzilla/methods/data_analysis.py
index 2bbd0c6a..752bb5c6 100644
--- a/protzilla/methods/data_analysis.py
+++ b/protzilla/methods/data_analysis.py
@@ -799,7 +799,7 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict:
 
 
 class TimeSeriesLinearRegression(PlotStep):
-    display_name = "Time Series Linear Regression"
+    display_name = "Linear Regression"
     operation = "Time series analysis"
     method_description = ("A function to fit a linear model using ordinary least squares for each protein. "
                                     "The linear model fits the protein intensities on Y axis and the Time on X. "
@@ -808,9 +808,15 @@ class TimeSeriesLinearRegression(PlotStep):
     input_keys = [
         "input_df",
         "metadata_df",
+        "protein_group",
         "test_size",
     ]
-    output_keys = []
+    output_keys = [
+        "train_root_mean_squared",
+        "test_root_mean_squared",
+        "train_r2_score",
+        "test_r2_score",
+    ]
 
     def method(self, inputs: dict) -> dict:
         return time_series_linear_regression(**inputs)
diff --git a/ui/runs/form_mapping.py b/ui/runs/form_mapping.py
index b2cb4c8e..a6d350d2 100644
--- a/ui/runs/form_mapping.py
+++ b/ui/runs/form_mapping.py
@@ -66,7 +66,7 @@
     data_analysis.FLEXIQuantLF: data_analysis_forms.FLEXIQuantLFForm,
     data_analysis.PTMsPerSample: data_analysis_forms.PTMsPerSampleForm,
     data_analysis.PTMsProteinAndPerSample: data_analysis_forms.PTMsPerProteinAndSampleForm,
-    data_analysis.TimeSeriesLinearRegression: data_analysis_forms.PlotTimeSeriesLinearRegressionForm,
+    data_analysis.TimeSeriesLinearRegression: data_analysis_forms.TimeSeriesLinearRegressionForm,
     data_preprocessing.ImputationByMinPerSample: data_preprocessing_forms.ImputationByMinPerSampleForms,
     data_integration.EnrichmentAnalysisGOAnalysisWithString: data_integration_forms.EnrichmentAnalysisGOAnalysisWithStringForm,
     data_integration.EnrichmentAnalysisGOAnalysisWithEnrichr: data_integration_forms.EnrichmentAnalysisGOAnalysisWithEnrichrForm,
diff --git a/ui/runs/forms/data_analysis.py b/ui/runs/forms/data_analysis.py
index fa3a2825..7b80d0d0 100644
--- a/ui/runs/forms/data_analysis.py
+++ b/ui/runs/forms/data_analysis.py
@@ -1214,19 +1214,36 @@ def fill_form(self, run: Run) -> None:
             self.fields["peptide_df"].initial = single_protein_peptides[0]
 
 
-class PlotTimeSeriesLinearRegressionForm(MethodForm):
+class TimeSeriesLinearRegressionForm(MethodForm):
     input_df = CustomChoiceField(
         choices=[],
-        label="Peptide dataframe containing the peptides of a single protein",
+        label="Peptide dataframe",
+    )
+    protein_group = CustomChoiceField(
+        choices=[],
+        label="Protein group: which protein group to perform the linear regression on",
     )
     test_size = CustomFloatField(
-        label="Test size",
+        label="Test size: proportion of the dataset to include in the test split",
         min_value=0,
         max_value=1,
+        step_size=0.1,
         initial=0.2
     )
 
+
     def fill_form(self, run: Run) -> None:
         self.fields["input_df"].choices = fill_helper.get_choices_for_peptide_df_steps(
             run
         )
+        input_df_instance_id = self.data.get(
+            "input_df", self.fields["input_df"].choices[0][0]
+        )
+
+        self.fields["protein_group"].choices = fill_helper.to_choices(
+            run.steps.get_step_output(
+                step_type=Step,
+                output_key="peptide_df",
+                instance_identifier=input_df_instance_id,
+            )["Protein ID"].unique()
+        )

From 97ace4af9cfe128f85bceb2f9c99fcde82639a42 Mon Sep 17 00:00:00 2001
From: AK <kuganash.ravishanker@student.hpi.de>
Date: Fri, 19 Jul 2024 13:51:20 +0200
Subject: [PATCH 13/52] Added Docstrings

---
 protzilla/data_analysis/time_series_helper.py            | 6 ++++++
 .../data_analysis/time_series_regression_analysis.py     | 9 +++++++++
 2 files changed, 15 insertions(+)

diff --git a/protzilla/data_analysis/time_series_helper.py b/protzilla/data_analysis/time_series_helper.py
index b10b13b6..e2ead503 100644
--- a/protzilla/data_analysis/time_series_helper.py
+++ b/protzilla/data_analysis/time_series_helper.py
@@ -2,6 +2,12 @@
 from datetime import datetime
 
 def convert_time_to_datetime(time_str):
+    """
+    Convert a time string to its hour of day; minutes and seconds are discarded
+    :param time_str: The time string to convert, formatted as '%H:%M:%S'
+
+    :return: The hour component of the parsed time as an integer
+    """
     time_obj = datetime.strptime(time_str, '%H:%M:%S')
     hours_since_midnight = time_obj.hour
     return hours_since_midnight
\ No newline at end of file
diff --git a/protzilla/data_analysis/time_series_regression_analysis.py b/protzilla/data_analysis/time_series_regression_analysis.py
index b3fb2099..f117bccb 100644
--- a/protzilla/data_analysis/time_series_regression_analysis.py
+++ b/protzilla/data_analysis/time_series_regression_analysis.py
@@ -26,6 +26,15 @@ def time_series_linear_regression(
         protein_group: str,
         test_size: float,
 ):
+    """
+    Perform linear regression on the time series data for a given protein group.
+    :param input_df: Peptide dataframe which contains the intensity of each sample
+    :param metadata_df: Metadata dataframe which contains the timestamps
+    :param protein_group: Protein group to perform the analysis on
+    :param test_size: The proportion of the dataset to include in the test split
+
+    :return: A dictionary containing the root mean squared error and r2 score for the training and test sets
+    """
 
     if test_size < 0 or test_size > 1 :
         raise ValueError("Test size should be between 0 and 1")

From 38eb985c0872f8d39ba4bc85a94ab91ce4207134 Mon Sep 17 00:00:00 2001
From: AK <kuganash.ravishanker@student.hpi.de>
Date: Fri, 19 Jul 2024 14:22:08 +0200
Subject: [PATCH 14/52] Implemented tests for time series linear regression and invalid plot similarity

---
 .../test_time_series_analysis.py              | 79 +++++++++++++++++++
 .../data_analysis/test_time_series_plots.py   |  7 +-
 2 files changed, 85 insertions(+), 1 deletion(-)
 create mode 100644 tests/protzilla/data_analysis/test_time_series_analysis.py

diff --git a/tests/protzilla/data_analysis/test_time_series_analysis.py b/tests/protzilla/data_analysis/test_time_series_analysis.py
new file mode 100644
index 00000000..ff01ba50
--- /dev/null
+++ b/tests/protzilla/data_analysis/test_time_series_analysis.py
@@ -0,0 +1,79 @@
+import pandas as pd
+import pytest
+
+from protzilla.data_analysis.time_series_regression_analysis import time_series_linear_regression
+
+
+@pytest.fixture
+def time_series_test_data():
+    test_intensity_list = (
+        ["Sample1", "Protein1", "Gene1", 20],
+        ["Sample1", "Protein2", "Gene1", 16],
+        ["Sample1", "Protein3", "Gene1", 1],
+        ["Sample1", "Protein4", "Gene1", 14],
+        ["Sample2", "Protein1", "Gene1", 20],
+        ["Sample2", "Protein2", "Gene1", 15],
+        ["Sample2", "Protein3", "Gene1", 2],
+        ["Sample2", "Protein4", "Gene1", 15],
+        ["Sample3", "Protein1", "Gene1", 22],
+        ["Sample3", "Protein2", "Gene1", 14],
+        ["Sample3", "Protein3", "Gene1", 3],
+        ["Sample3", "Protein4", "Gene1", 16],
+        ["Sample4", "Protein1", "Gene1", 8],
+        ["Sample4", "Protein2", "Gene1", 15],
+        ["Sample4", "Protein3", "Gene1", 1],
+        ["Sample4", "Protein4", "Gene1", 9],
+        ["Sample5", "Protein1", "Gene1", 10],
+        ["Sample5", "Protein2", "Gene1", 14],
+        ["Sample5", "Protein3", "Gene1", 2],
+        ["Sample5", "Protein4", "Gene1", 10],
+        ["Sample6", "Protein1", "Gene1", 12],
+        ["Sample6", "Protein2", "Gene1", 13],
+        ["Sample6", "Protein3", "Gene1", 3],
+        ["Sample6", "Protein4", "Gene1", 11],
+        ["Sample7", "Protein1", "Gene1", 12],
+        ["Sample7", "Protein2", "Gene1", 13],
+        ["Sample7", "Protein3", "Gene1", 3],
+        ["Sample7", "Protein4", "Gene1", 11],
+    )
+
+    test_intensity_df = pd.DataFrame(
+        data=test_intensity_list,
+        columns=["Sample", "Protein ID", "Gene", "Intensity"],
+    )
+
+    test_metadata_df = (
+        ["Sample1", "02:00:00", 1],
+        ["Sample2", "06:00:00", 1],
+        ["Sample3", "10:00:00", 1],
+         ["Sample4", "14:00:00", 1],
+    )
+    test_metadata_df = pd.DataFrame(
+        data=test_metadata_df,
+        columns=["Sample", "Time", "Day"],
+    )
+    return test_intensity_df, test_metadata_df
+
+def test_linear_regression_plot(show_figures, time_series_test_data):
+    test_intensity, test_metadata = time_series_test_data
+    outputs = time_series_linear_regression(test_intensity, test_metadata, "Protein1", 0.2)
+    assert "plots" in outputs
+    fig = outputs["plots"][0]
+    if show_figures:
+        fig.show()
+    return
+
+def test_linear_regression_plot_invalid_test_size(time_series_test_data):
+    test_intensity, test_metadata = time_series_test_data
+    with pytest.raises(ValueError):
+        time_series_linear_regression(test_intensity, test_metadata, "Protein1", 2)
+    return
+
+def test_linear_regression_outputs(time_series_test_data):
+    test_intensity, test_metadata = time_series_test_data
+    outputs = time_series_linear_regression(test_intensity, test_metadata, "Protein1", 0.2)
+    assert "train_root_mean_squared" in outputs
+    assert "test_root_mean_squared" in outputs
+    assert "train_r2_score" in outputs
+    assert "test_r2_score" in outputs
+    return
\ No newline at end of file
diff --git a/tests/protzilla/data_analysis/test_time_series_plots.py b/tests/protzilla/data_analysis/test_time_series_plots.py
index 85bfad11..904d46a2 100644
--- a/tests/protzilla/data_analysis/test_time_series_plots.py
+++ b/tests/protzilla/data_analysis/test_time_series_plots.py
@@ -1,4 +1,3 @@
-import numpy as np
 import pandas as pd
 import pytest
 
@@ -62,4 +61,10 @@ def test_time_series_plot(show_figures, time_series_test_data):
     fig = outputs["plots"][0]
     if show_figures:
         fig.show()
+    return
+
+def test_time_series_plot_invalid_similarity(time_series_test_data):
+    test_intensity, test_metadata = time_series_test_data
+    with pytest.raises(ValueError):
+        time_series_plot_peptide(test_intensity, test_metadata, "Protein1", similarity=-1, similarity_measure="euclidean distance")
     return
\ No newline at end of file

From d7522254a7402b8eb52deb9c4a1c01264d609670 Mon Sep 17 00:00:00 2001
From: AK <kuganash.ravishanker@student.hpi.de>
Date: Fri, 19 Jul 2024 14:27:43 +0200
Subject: [PATCH 15/52] Added invalid cosine similarity test for time series plot

---
 tests/protzilla/data_analysis/test_time_series_plots.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/tests/protzilla/data_analysis/test_time_series_plots.py b/tests/protzilla/data_analysis/test_time_series_plots.py
index 904d46a2..12249fb0 100644
--- a/tests/protzilla/data_analysis/test_time_series_plots.py
+++ b/tests/protzilla/data_analysis/test_time_series_plots.py
@@ -63,8 +63,14 @@ def test_time_series_plot(show_figures, time_series_test_data):
         fig.show()
     return
 
-def test_time_series_plot_invalid_similarity(time_series_test_data):
+def test_time_series_plot_invalid_euclidean_similarity(time_series_test_data):
     test_intensity, test_metadata = time_series_test_data
     with pytest.raises(ValueError):
         time_series_plot_peptide(test_intensity, test_metadata, "Protein1", similarity=-1, similarity_measure="euclidean distance")
+    return
+
+def test_time_series_plot_invalid_cosine_similarity(time_series_test_data):
+    test_intensity, test_metadata = time_series_test_data
+    with pytest.raises(ValueError):
+        time_series_plot_peptide(test_intensity, test_metadata, "Protein1", similarity=2, similarity_measure="cosine similarity")
     return
\ No newline at end of file

From 47556f372a88aa4cd1c0fc5b1d48b8db2c235e68 Mon Sep 17 00:00:00 2001
From: AK <kuganash.ravishanker@student.hpi.de>
Date: Fri, 19 Jul 2024 14:38:38 +0200
Subject: [PATCH 16/52] Removed unused pandas and logging imports

---
 protzilla/data_analysis/time_series_helper.py              | 1 -
 protzilla/data_analysis/time_series_regression_analysis.py | 2 --
 2 files changed, 3 deletions(-)

diff --git a/protzilla/data_analysis/time_series_helper.py b/protzilla/data_analysis/time_series_helper.py
index e2ead503..077e7e06 100644
--- a/protzilla/data_analysis/time_series_helper.py
+++ b/protzilla/data_analysis/time_series_helper.py
@@ -1,4 +1,3 @@
-import pandas as pd
 from datetime import datetime
 
 def convert_time_to_datetime(time_str):
diff --git a/protzilla/data_analysis/time_series_regression_analysis.py b/protzilla/data_analysis/time_series_regression_analysis.py
index f117bccb..3785116f 100644
--- a/protzilla/data_analysis/time_series_regression_analysis.py
+++ b/protzilla/data_analysis/time_series_regression_analysis.py
@@ -1,5 +1,3 @@
-import logging
-
 import numpy as np
 import pandas as pd
 import plotly.graph_objects as go

From 4f8737defec2ae3486cbc1b76f9e08e4df56c3e8 Mon Sep 17 00:00:00 2001
From: AK <kuganash.ravishanker@student.hpi.de>
Date: Wed, 24 Jul 2024 15:08:59 +0200
Subject: [PATCH 17/52] Implemented RANSAC regression

---
 .../time_series_regression_analysis.py        | 181 +++++++++++++++++-
 protzilla/methods/data_analysis.py            |  29 ++-
 ui/runs/form_mapping.py                       |   1 +
 ui/runs/forms/data_analysis.py                |  35 ++++
 4 files changed, 238 insertions(+), 8 deletions(-)

diff --git a/protzilla/data_analysis/time_series_regression_analysis.py b/protzilla/data_analysis/time_series_regression_analysis.py
index 3785116f..2d5622ff 100644
--- a/protzilla/data_analysis/time_series_regression_analysis.py
+++ b/protzilla/data_analysis/time_series_regression_analysis.py
@@ -5,15 +5,16 @@
 from protzilla.data_analysis.time_series_helper import convert_time_to_datetime
 from protzilla.constants.colors import PROTZILLA_DISCRETE_COLOR_SEQUENCE
 
-from sklearn.linear_model import LinearRegression
+from sklearn.linear_model import LinearRegression, RANSACRegressor
 from sklearn.model_selection import train_test_split
 from sklearn.metrics import mean_squared_error, r2_score
+from plotly.subplots import make_subplots
 
 colors = {
     "plot_bgcolor": "white",
     "gridcolor": "#F1F1F1",
     "linecolor": "#F1F1F1",
-    "annotation_text_color": "#ffffff",
+    "annotation_text_color": "#4c4c4c",
     "annotation_proteins_of_interest": "#4A536A",
 }
 
@@ -67,15 +68,16 @@ def time_series_linear_regression(
     test_df = pd.DataFrame({'Time': X_test['Time'], 'Intensity': y_test, 'Predicted': y_pred_test, 'Type': 'Test'})
     plot_df = pd.concat([train_df, test_df])
 
-    fig = go.Figure()
+    fig = make_subplots(rows=1, cols=2, column_widths=[0.75, 0.25], vertical_spacing=0.025)
 
+    # Add main plot traces
     fig.add_trace(go.Scatter(
         x=plot_df['Time'],
         y=plot_df['Intensity'],
         mode='markers',
         name='Actual Intensity',
         marker=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[0])
-    ))
+    ), row=1, col=1)
 
     fig.add_trace(go.Scatter(
         x=plot_df['Time'],
@@ -83,8 +85,26 @@ def time_series_linear_regression(
         mode='lines',
         name='Predicted Intensity',
         line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[2])
-    ))
+    ), row=1, col=1)
 
+    # Add annotation text as a separate trace in the subplot
+    annotation_text = (
+        f"Train RMSE: {train_rmse:.3f}<br>"
+        f"Test RMSE: {test_rmse:.3f}<br>"
+        f"Train R²: {train_r2:.3f}<br>"
+        f"Test R²: {test_r2:.3f}"
+    )
+
+    fig.add_trace(go.Scatter(
+        x=[0],
+        y=[0.25],
+        text=[annotation_text],
+        mode='text',
+        textfont=dict(size=12),
+        showlegend=False
+    ), row=1, col=2)
+
+    # Update layout
     fig.update_layout(
         title=f"Intensity over Time for {protein_group}",
         plot_bgcolor=colors["plot_bgcolor"],
@@ -92,13 +112,160 @@ def time_series_linear_regression(
         yaxis_gridcolor=colors["gridcolor"],
         xaxis_linecolor=colors["linecolor"],
         yaxis_linecolor=colors["linecolor"],
-        xaxis_title="Time (hours)",
+        xaxis_title="Time",
         yaxis_title="Intensity",
         legend_title="Legend",
         autosize=True,
-        margin=dict(l=100, r=300, t=100, b=100),
+        margin=dict(l=100, r=100, t=100, b=50),
+        legend=dict(
+            yanchor="top",
+            y=0.95,
+            xanchor="right",
+            x=0.85
+        )
+    )
+
+    # Hide x-axis of the annotation subplot
+    fig.update_xaxes(showticklabels=False, showgrid=False, zeroline=False, row=1, col=2)
+    fig.update_yaxes(showticklabels=False, showgrid=False, zeroline=False, row=1, col=2)
+
+    # Adjust subplot titles
+    fig.update_annotations(font_size=12)
+
+    return dict(
+        train_root_mean_squared=train_rmse,
+        test_root_mean_squared=test_rmse,
+        train_r2_score=train_r2,
+        test_r2_score=test_r2,
+        plots=[fig],
     )
 
+
+def time_series_ransac_regression(
+        input_df: pd.DataFrame,
+        metadata_df: pd.DataFrame,
+        protein_group: str,
+        test_size: float,
+):
+    """
+    Perform RANSAC regression on the time series data for a given protein group.
+    :param input_df: Peptide dataframe which contains the intensity of each sample
+    :param metadata_df: Metadata dataframe which contains the timestamps
+    :param protein_group: Protein group to perform the analysis on
+    :param test_size: The proportion of the dataset to include in the test split
+
+    :return: A dictionary containing the root mean squared error and r2 score for the training and test sets
+    """
+
+    if test_size < 0 or test_size > 1:
+        raise ValueError("Test size should be between 0 and 1")
+
+    input_df = input_df[input_df['Protein ID'] == protein_group]
+
+    input_df = pd.merge(
+        left=input_df,
+        right=metadata_df,
+        on="Sample",
+        copy=False,
+    )
+
+    input_df["Time"] = input_df["Time"].apply(convert_time_to_datetime)
+    input_df = input_df.interpolate(method='linear', axis=0)
+    X = input_df[["Time"]]
+    y = input_df["Intensity"]
+
+    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, shuffle=False)
+    model = RANSACRegressor(base_estimator=LinearRegression())
+    model.fit(X_train, y_train)
+
+    inlier_mask = model.inlier_mask_
+
+    y_pred_train = model.predict(X_train)
+    y_pred_test = model.predict(X_test)
+
+    train_rmse = np.sqrt(mean_squared_error(y_train[inlier_mask], y_pred_train[inlier_mask]))
+    test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))
+    train_r2 = r2_score(y_train[inlier_mask], y_pred_train[inlier_mask])
+    test_r2 = r2_score(y_test, y_pred_test)
+
+    train_df = pd.DataFrame({'Time': X_train["Time"], 'Intensity': y_train, 'Predicted': y_pred_train, 'Type': 'Train'})
+    test_df = pd.DataFrame({'Time': X_test["Time"], 'Intensity': y_test, 'Predicted': y_pred_test, 'Type': 'Test'})
+    train_df['Inlier'] = inlier_mask
+    test_df['Inlier'] = False
+    plot_df = pd.concat([train_df, test_df])
+
+    fig = make_subplots(rows=1, cols=2, column_widths=[0.75, 0.25], vertical_spacing=0.025)
+
+    # Add main plot traces
+    fig.add_trace(go.Scatter(
+        x=plot_df['Time'],
+        y=plot_df['Intensity'],
+        mode='markers',
+        name='Actual Intensity',
+        marker=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[0])
+    ), row=1, col=1)
+
+    fig.add_trace(go.Scatter(
+        x=plot_df['Time'],
+        y=plot_df['Predicted'],
+        mode='lines',
+        name='Predicted Intensity',
+        line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[2])
+    ), row=1, col=1)
+
+    fig.add_trace(go.Scatter(
+        x=plot_df[plot_df['Inlier'] == False]['Time'],
+        y=plot_df[plot_df['Inlier'] == False]['Intensity'],
+        mode='markers',
+        name='Outliers',
+        marker=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[1])
+    ), row=1, col=1)
+
+    # Add annotation text as a separate trace in the subplot
+    annotation_text = (
+        f"Train RMSE: {train_rmse:.3f}<br>"
+        f"Test RMSE: {test_rmse:.3f}<br>"
+        f"Train R²: {train_r2:.3f}<br>"
+        f"Test R²: {test_r2:.3f}"
+    )
+
+    fig.add_trace(go.Scatter(
+        x=[0],
+        y=[0.25],
+        text=[annotation_text],
+        mode='text',
+        textfont=dict(size=12),
+        showlegend=False
+    ), row=1, col=2)
+
+    # Update layout
+    fig.update_layout(
+        title=f"Intensity over Time for {protein_group}",
+        plot_bgcolor=colors["plot_bgcolor"],
+        xaxis_gridcolor=colors["gridcolor"],
+        yaxis_gridcolor=colors["gridcolor"],
+        xaxis_linecolor=colors["linecolor"],
+        yaxis_linecolor=colors["linecolor"],
+        xaxis_title="Time",
+        yaxis_title="Intensity",
+        legend_title="Legend",
+        autosize=True,
+        margin=dict(l=100, r=100, t=100, b=50),
+        legend=dict(
+            yanchor="top",
+            y=0.95,
+            xanchor="right",
+            x=0.85
+        )
+    )
+
+    # Hide x-axis of the annotation subplot
+    fig.update_xaxes(showticklabels=False, showgrid=False, zeroline=False, row=1, col=2)
+    fig.update_yaxes(showticklabels=False, showgrid=False, zeroline=False, row=1, col=2)
+
+    # Adjust subplot titles
+    fig.update_annotations(font_size=12)
+
     return dict(
         train_root_mean_squared=train_rmse,
         test_root_mean_squared=test_rmse,
diff --git a/protzilla/methods/data_analysis.py b/protzilla/methods/data_analysis.py
index 752bb5c6..6f45e7e1 100644
--- a/protzilla/methods/data_analysis.py
+++ b/protzilla/methods/data_analysis.py
@@ -14,7 +14,7 @@
 )
 from protzilla.data_analysis.differential_expression_t_test import t_test
 from protzilla.data_analysis.dimension_reduction import t_sne, umap
-from protzilla.data_analysis.time_series_regression_analysis import time_series_linear_regression
+from protzilla.data_analysis.time_series_regression_analysis import time_series_linear_regression, time_series_ransac_regression
 from protzilla.data_analysis.ptm_analysis import filter_peptides_of_protein, ptms_per_sample, \
     ptms_per_protein_and_sample
 from protzilla.data_analysis.model_evaluation import evaluate_classification_model
@@ -826,6 +826,33 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict:
         inputs["metadata_df"] = steps.metadata_df
         return inputs
 
+
+class TimeSeriesRANSACRegression(PlotStep):
+    display_name = "RANSAC Regression"
+    operation = "Time series analysis"
+    method_description = " Perform RANSAC regression on the time series data for a given protein group."
+
+    input_keys = [
+        "input_df",
+        "metadata_df",
+        "protein_group",
+        "test_size",
+    ]
+    output_keys = [
+        "train_root_mean_squared",
+        "test_root_mean_squared",
+        "train_r2_score",
+        "test_r2_score",
+    ]
+
+    def method(self, inputs: dict) -> dict:
+        return time_series_ransac_regression(**inputs)
+
+    def insert_dataframes(self, steps: StepManager, inputs) -> dict:
+        inputs["input_df"] = steps.get_step_output(Step, "peptide_df", inputs["input_df"])
+        inputs["metadata_df"] = steps.metadata_df
+        return inputs
+
 class PTMsPerSample(DataAnalysisStep):
     display_name = "PTMs per Sample"
     operation = "Peptide analysis"
diff --git a/ui/runs/form_mapping.py b/ui/runs/form_mapping.py
index a6d350d2..079e1569 100644
--- a/ui/runs/form_mapping.py
+++ b/ui/runs/form_mapping.py
@@ -67,6 +67,7 @@
     data_analysis.PTMsPerSample: data_analysis_forms.PTMsPerSampleForm,
     data_analysis.PTMsProteinAndPerSample: data_analysis_forms.PTMsPerProteinAndSampleForm,
     data_analysis.TimeSeriesLinearRegression: data_analysis_forms.TimeSeriesLinearRegressionForm,
+    data_analysis.TimeSeriesRANSACRegression: data_analysis_forms.TimeSeriesRANSACRegressionForm,
     data_preprocessing.ImputationByMinPerSample: data_preprocessing_forms.ImputationByMinPerSampleForms,
     data_integration.EnrichmentAnalysisGOAnalysisWithString: data_integration_forms.EnrichmentAnalysisGOAnalysisWithStringForm,
     data_integration.EnrichmentAnalysisGOAnalysisWithEnrichr: data_integration_forms.EnrichmentAnalysisGOAnalysisWithEnrichrForm,
diff --git a/ui/runs/forms/data_analysis.py b/ui/runs/forms/data_analysis.py
index 7b80d0d0..99d23798 100644
--- a/ui/runs/forms/data_analysis.py
+++ b/ui/runs/forms/data_analysis.py
@@ -1247,3 +1247,38 @@ def fill_form(self, run: Run) -> None:
                 instance_identifier=input_df_instance_id,
             )["Protein ID"].unique()
         )
+
+
+class TimeSeriesRANSACRegressionForm(MethodForm):
+    input_df = CustomChoiceField(
+        choices=[],
+        label="Peptide dataframe",
+    )
+    protein_group = CustomChoiceField(
+        choices=[],
+        label="Protein group: which protein group to perform the RANSAC regression on",
+    )
+    test_size = CustomFloatField(
+        label="Test size: proportion of the dataset to include in the test split",
+        min_value=0,
+        max_value=1,
+        step_size=0.1,
+        initial=0.2
+    )
+
+
+    def fill_form(self, run: Run) -> None:
+        self.fields["input_df"].choices = fill_helper.get_choices_for_peptide_df_steps(
+            run
+        )
+        input_df_instance_id = self.data.get(
+            "input_df", self.fields["input_df"].choices[0][0]
+        )
+
+        self.fields["protein_group"].choices = fill_helper.to_choices(
+            run.steps.get_step_output(
+                step_type=Step,
+                output_key="peptide_df",
+                instance_identifier=input_df_instance_id,
+            )["Protein ID"].unique()
+        )
\ No newline at end of file

From c3fae9b14c97e6504256f59149d17700cf308303 Mon Sep 17 00:00:00 2001
From: selenabr <selena.braune@student.hpi.de>
Date: Thu, 20 Jun 2024 17:33:18 +0200
Subject: [PATCH 18/52]  output field for result

---
 protzilla/methods/data_analysis.py |  4 ++++
 protzilla/steps.py                 | 14 ++++++++++++++
 2 files changed, 18 insertions(+)

diff --git a/protzilla/methods/data_analysis.py b/protzilla/methods/data_analysis.py
index 6f45e7e1..d1f56e00 100644
--- a/protzilla/methods/data_analysis.py
+++ b/protzilla/methods/data_analysis.py
@@ -901,3 +901,7 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict:
             Step, "peptide_df", inputs["peptide_df"]
         )
         return inputs
+
+    def handle_outputs(self, outputs: dict):
+        super().handle_outputs(outputs)
+        self.display_output["required_sample_size"] = outputs["required_sample_size"]
\ No newline at end of file
diff --git a/protzilla/steps.py b/protzilla/steps.py
index d5fb124e..7dec4936 100644
--- a/protzilla/steps.py
+++ b/protzilla/steps.py
@@ -36,6 +36,7 @@ def __init__(self, instance_identifier: str | None = None):
         self.messages: Messages = Messages([])
         self.output: Output = Output()
         self.plots: Plots = Plots()
+        self.display_output: DisplayOutput = DisplayOutput()
         self.instance_identifier = instance_identifier
 
         if self.instance_identifier is None:
@@ -310,6 +311,19 @@ def export(self, format_):
                     exports.append(BytesIO(base64.b64decode(plot)))
         return exports
 
+class DisplayOutput:
+
+    def __init__(self, display_output: dict = None):
+        if display_output is None:
+            display_output = []
+        self.display_output = display_output
+    def __iter__(self):
+        return iter(self.display_output)
+    def __repr__(self):
+        return f"DisplayOutput: {self.display_output}"
+    def __contains__(self, key):
+        return key in self.display_output
+
 
 class StepManager:
     def __repr__(self):

From 67c59c7f8fc6fc8661694f6cb9e0725be01fd889 Mon Sep 17 00:00:00 2001
From: selenabr <selena.braune@student.hpi.de>
Date: Fri, 21 Jun 2024 20:24:11 +0200
Subject: [PATCH 19/52]  further implementation of output field for result

---
 protzilla/methods/data_analysis.py  |  3 +--
 protzilla/steps.py                  | 10 +++++++++-
 ui/runs/templates/runs/details.html |  7 +++++++
 ui/runs/views.py                    |  8 ++++++++
 4 files changed, 25 insertions(+), 3 deletions(-)

diff --git a/protzilla/methods/data_analysis.py b/protzilla/methods/data_analysis.py
index d1f56e00..1c54da36 100644
--- a/protzilla/methods/data_analysis.py
+++ b/protzilla/methods/data_analysis.py
@@ -33,7 +33,7 @@
 )
 from protzilla.data_analysis.ptm_quantification import flexiquant_lf
 from protzilla.methods.data_preprocessing import TransformationLog
-from protzilla.steps import Plots, Step, StepManager
+from protzilla.steps import Plots, Step, StepManager, DisplayOutput
 
 
 class DataAnalysisStep(Step):
@@ -844,7 +844,6 @@ class TimeSeriesRANSACRegression(PlotStep):
         "train_r2_score",
         "test_r2_score",
     ]
-
     def method(self, inputs: dict) -> dict:
         return time_series_ransac_regression(**inputs)
 
diff --git a/protzilla/steps.py b/protzilla/steps.py
index 7dec4936..32ce93b3 100644
--- a/protzilla/steps.py
+++ b/protzilla/steps.py
@@ -315,7 +315,7 @@ class DisplayOutput:
 
     def __init__(self, display_output: dict = None):
         if display_output is None:
-            display_output = []
+            display_output = {}
         self.display_output = display_output
     def __iter__(self):
         return iter(self.display_output)
@@ -323,6 +323,14 @@ def __repr__(self):
         return f"DisplayOutput: {self.display_output}"
     def __contains__(self, key):
         return key in self.display_output
+    def __getitem__(self, key):
+        return self.display_output[key]
+    def __setitem__(self, key, value):
+        self.display_output[key] = value
+    def is_empty(self) -> bool:
+        return len(self.display_output) == 0
+
+
 
 
 class StepManager:
diff --git a/ui/runs/templates/runs/details.html b/ui/runs/templates/runs/details.html
index 5809d356..a930f486 100644
--- a/ui/runs/templates/runs/details.html
+++ b/ui/runs/templates/runs/details.html
@@ -211,6 +211,13 @@ <h3>{{ display_name }}</h3>
                                 {% endif %}
                             </div>
                         {% endif %}
+                        {% if display_output %}
+                            <div>
+                                <label for="display_output">Outputs:</label>
+                                <textarea class="form-control" id="display_output" rows="5" readonly>{{ display_output_result }}
+                                </textarea>
+                            </div>
+                        {% endif %}
                     </div>
                 {% else %}
                     <p>You are at the end of the run. Go back to add more steps of the same section, or add steps of
diff --git a/ui/runs/views.py b/ui/runs/views.py
index b95be756..67635169 100644
--- a/ui/runs/views.py
+++ b/ui/runs/views.py
@@ -121,6 +121,12 @@ def detail(request: HttpRequest, run_name: str):
         and Path(run.current_outputs["graph_path"]).exists()
     )
 
+    display_output_form = (
+        run.steps.current_step.display_output is not None
+        and not run.current_step.display_output.is_empty()
+    )
+    display_output_text = f"{run.current_step.display_output}"
+
     return render(
         request,
         "runs/details.html",
@@ -156,6 +162,8 @@ def detail(request: HttpRequest, run_name: str):
             method_form=method_form,
             is_form_dynamic=method_form.is_dynamic,
             plot_form=plot_form,
+            display_output=display_output_form,
+            display_output_result=display_output_text,
         ),
     )
 

From 20b5e69464137ecc9ee6d33ad378d90469aa3efb Mon Sep 17 00:00:00 2001
From: selenabr <selena.braune@student.hpi.de>
Date: Sun, 23 Jun 2024 02:43:34 +0200
Subject: [PATCH 20/52] display display_output in output field

---
 protzilla/methods/data_analysis.py  | 2 +-
 ui/runs/templates/runs/details.html | 4 ++--
 ui/runs/views.py                    | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/protzilla/methods/data_analysis.py b/protzilla/methods/data_analysis.py
index 1c54da36..081b0d7f 100644
--- a/protzilla/methods/data_analysis.py
+++ b/protzilla/methods/data_analysis.py
@@ -903,4 +903,4 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict:
 
     def handle_outputs(self, outputs: dict):
         super().handle_outputs(outputs)
-        self.display_output["required_sample_size"] = outputs["required_sample_size"]
\ No newline at end of file
+        self.display_output["required_sample_size"] = f"Required Sample Size: {outputs['required_sample_size']}"
diff --git a/ui/runs/templates/runs/details.html b/ui/runs/templates/runs/details.html
index a930f486..361875f7 100644
--- a/ui/runs/templates/runs/details.html
+++ b/ui/runs/templates/runs/details.html
@@ -213,8 +213,8 @@ <h3>{{ display_name }}</h3>
                         {% endif %}
                         {% if display_output %}
                             <div>
-                                <label for="display_output">Outputs:</label>
-                                <textarea class="form-control" id="display_output" rows="5" readonly>{{ display_output_result }}
+                                <label for="display_output"></label>
+                                <textarea class="form-control" id="display_output" rows="1" width="100%" style="resize: none" readonly>{{ display_output_result }}
                                 </textarea>
                             </div>
                         {% endif %}
diff --git a/ui/runs/views.py b/ui/runs/views.py
index 67635169..c4314306 100644
--- a/ui/runs/views.py
+++ b/ui/runs/views.py
@@ -125,7 +125,7 @@ def detail(request: HttpRequest, run_name: str):
         run.steps.current_step.display_output is not None
         and not run.current_step.display_output.is_empty()
     )
-    display_output_text = f"{run.current_step.display_output}"
+    display_output_text = next(iter(run.current_step.display_output.display_output.values()))
 
     return render(
         request,

From 3aa711d808a6326119c85193db0f0cb5a7a0bdec Mon Sep 17 00:00:00 2001
From: selenabr <selena.braune@student.hpi.de>
Date: Tue, 25 Jun 2024 13:25:53 +0200
Subject: [PATCH 21/52] display_output field displayed in the same size and
 position as the other fields

---
 ui/runs/static/runs/style.css       |  7 +++++++
 ui/runs/templates/runs/details.html | 15 ++++++++-------
 ui/runs/views.py                    |  2 +-
 3 files changed, 16 insertions(+), 8 deletions(-)

diff --git a/ui/runs/static/runs/style.css b/ui/runs/static/runs/style.css
index 63d66a0b..477e0f11 100644
--- a/ui/runs/static/runs/style.css
+++ b/ui/runs/static/runs/style.css
@@ -75,3 +75,10 @@ html, body {
 #gsea_enrichment_plot_img {
     width: 800px;
 }
+
+.display-output-textarea {
+    display: flex;
+    width: 100%;
+    height: auto;
+    resize: none;
+}
\ No newline at end of file
diff --git a/ui/runs/templates/runs/details.html b/ui/runs/templates/runs/details.html
index 361875f7..84ec3cfd 100644
--- a/ui/runs/templates/runs/details.html
+++ b/ui/runs/templates/runs/details.html
@@ -209,13 +209,14 @@ <h3>{{ display_name }}</h3>
                                         </div>
                                     </form>
                                 {% endif %}
-                            </div>
-                        {% endif %}
-                        {% if display_output %}
-                            <div>
-                                <label for="display_output"></label>
-                                <textarea class="form-control" id="display_output" rows="1" width="100%" style="resize: none" readonly>{{ display_output_result }}
-                                </textarea>
+                                {% if display_output %}
+                                    <div class="mb-5">
+                                        <label for="display_output"></label>
+                                            <textarea class="form-control display-output-textarea" id="display_output" rows="1"
+                                                      readonly>{{ display_output_result }}
+                                            </textarea>
+                                    </div>
+                                {% endif %}
                             </div>
                         {% endif %}
                     </div>
diff --git a/ui/runs/views.py b/ui/runs/views.py
index c4314306..6d98d025 100644
--- a/ui/runs/views.py
+++ b/ui/runs/views.py
@@ -125,7 +125,7 @@ def detail(request: HttpRequest, run_name: str):
         run.steps.current_step.display_output is not None
         and not run.current_step.display_output.is_empty()
     )
-    display_output_text = next(iter(run.current_step.display_output.display_output.values()))
+    display_output_text = next(iter(run.current_step.display_output.display_output.values()), None)
 
     return render(
         request,

From 2b483f9ceb005979b7ae371e4393daf6b13e1d67 Mon Sep 17 00:00:00 2001
From: AK <kuganash.ravishanker@student.hpi.de>
Date: Thu, 25 Jul 2024 10:38:34 +0200
Subject: [PATCH 22/52] Changed is_dynamic to True

---
 ui/runs/forms/data_analysis.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/ui/runs/forms/data_analysis.py b/ui/runs/forms/data_analysis.py
index 99d23798..fd8de70d 100644
--- a/ui/runs/forms/data_analysis.py
+++ b/ui/runs/forms/data_analysis.py
@@ -1215,6 +1215,7 @@ def fill_form(self, run: Run) -> None:
 
 
 class TimeSeriesLinearRegressionForm(MethodForm):
+    is_dynamic = True
     input_df = CustomChoiceField(
         choices=[],
         label="Peptide dataframe",
@@ -1250,6 +1251,7 @@ def fill_form(self, run: Run) -> None:
 
 
 class TimeSeriesRANSACRegressionForm(MethodForm):
+    is_dynamic = True
     input_df = CustomChoiceField(
         choices=[],
         label="Peptide dataframe",

From 5553ca769cb9f34b3762004e4cbc73780aec81d7 Mon Sep 17 00:00:00 2001
From: AK <kuganash.ravishanker@student.hpi.de>
Date: Thu, 25 Jul 2024 10:59:44 +0200
Subject: [PATCH 23/52] Made some minor changes to the Plot positioning

---
 protzilla/data_analysis/time_series_regression_analysis.py | 3 ++-
 protzilla/methods/data_analysis.py                         | 3 ---
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/protzilla/data_analysis/time_series_regression_analysis.py b/protzilla/data_analysis/time_series_regression_analysis.py
index 2d5622ff..a69eced9 100644
--- a/protzilla/data_analysis/time_series_regression_analysis.py
+++ b/protzilla/data_analysis/time_series_regression_analysis.py
@@ -255,7 +255,7 @@ def time_series_ransac_regression(
             yanchor="top",
             y=0.95,
             xanchor="right",
-            x=0.85
+            x=0.825
         )
     )
 
@@ -273,3 +273,4 @@ def time_series_ransac_regression(
         test_r2_score=test_r2,
         plots=[fig],
     )
+
diff --git a/protzilla/methods/data_analysis.py b/protzilla/methods/data_analysis.py
index 081b0d7f..687987a4 100644
--- a/protzilla/methods/data_analysis.py
+++ b/protzilla/methods/data_analysis.py
@@ -901,6 +901,3 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict:
         )
         return inputs
 
-    def handle_outputs(self, outputs: dict):
-        super().handle_outputs(outputs)
-        self.display_output["required_sample_size"] = f"Required Sample Size: {outputs['required_sample_size']}"

From 14dac5e1ef398812061f8e293d61fb6b15c50658 Mon Sep 17 00:00:00 2001
From: AK <kuganash.ravishanker@student.hpi.de>
Date: Thu, 25 Jul 2024 11:01:38 +0200
Subject: [PATCH 24/52] Made some minor changes to the Plot positioning

---
 protzilla/data_analysis/time_series_regression_analysis.py | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/protzilla/data_analysis/time_series_regression_analysis.py b/protzilla/data_analysis/time_series_regression_analysis.py
index a69eced9..d61c5815 100644
--- a/protzilla/data_analysis/time_series_regression_analysis.py
+++ b/protzilla/data_analysis/time_series_regression_analysis.py
@@ -70,7 +70,6 @@ def time_series_linear_regression(
 
     fig = make_subplots(rows=1, cols=2, column_widths=[0.75, 0.25], vertical_spacing=0.025)
 
-    # Add main plot traces
     fig.add_trace(go.Scatter(
         x=plot_df['Time'],
         y=plot_df['Intensity'],
@@ -104,7 +103,6 @@ def time_series_linear_regression(
         showlegend=False
     ), row=1, col=2)
 
-    # Update layout
     fig.update_layout(
         title=f"Intensity over Time for {protein_group}",
         plot_bgcolor=colors["plot_bgcolor"],
@@ -121,7 +119,7 @@ def time_series_linear_regression(
             yanchor="top",
             y=0.95,
             xanchor="right",
-            x=0.85
+            x=0.825
         )
     )
 
@@ -129,7 +127,6 @@ def time_series_linear_regression(
     fig.update_xaxes(showticklabels=False, showgrid=False, zeroline=False, row=1, col=2)
     fig.update_yaxes(showticklabels=False, showgrid=False, zeroline=False, row=1, col=2)
 
-    # Adjust subplot titles
     fig.update_annotations(font_size=12)
 
     return dict(
@@ -238,7 +235,6 @@ def time_series_ransac_regression(
         showlegend=False
     ), row=1, col=2)
 
-    # Update layout
     fig.update_layout(
         title=f"Intensity over Time for {protein_group}",
         plot_bgcolor=colors["plot_bgcolor"],
@@ -263,7 +259,6 @@ def time_series_ransac_regression(
     fig.update_xaxes(showticklabels=False, showgrid=False, zeroline=False, row=1, col=2)
     fig.update_yaxes(showticklabels=False, showgrid=False, zeroline=False, row=1, col=2)
 
-    # Adjust subplot titles
     fig.update_annotations(font_size=12)
 
     return dict(

From e9c9acf0b6885bceb57ee20b2a1a8726061bf867 Mon Sep 17 00:00:00 2001
From: AK <kuganash.ravishanker@student.hpi.de>
Date: Thu, 25 Jul 2024 11:23:08 +0200
Subject: [PATCH 25/52] Created a thesis Workflow and added some tests for
 RANSAC

---
 .../test_time_series_analysis.py              | 25 ++++++
 user_data/workflows/workflow_BA_Kuganash.yaml | 89 +++++++++++++++++++
 2 files changed, 114 insertions(+)
 create mode 100644 user_data/workflows/workflow_BA_Kuganash.yaml

diff --git a/tests/protzilla/data_analysis/test_time_series_analysis.py b/tests/protzilla/data_analysis/test_time_series_analysis.py
index ff01ba50..74c5d5c1 100644
--- a/tests/protzilla/data_analysis/test_time_series_analysis.py
+++ b/tests/protzilla/data_analysis/test_time_series_analysis.py
@@ -70,6 +70,31 @@ def test_linear_regression_plot_invalid_test_size(time_series_test_data):
     return
 
 def test_linear_regression_outputs(time_series_test_data):
+    test_intensity, test_metadata = time_series_test_data
+    outputs = time_series_linear_regression(test_intensity, test_metadata, "Protein1", 0.2)
+    assert "train_root_mean_squared" in outputs
+    assert "test_root_mean_squared" in outputs
+    assert "train_r2_score" in outputs
+    assert "test_r2_score" in outputs
+    return
+
+
+def test_ransac_regression_plot(show_figures, time_series_test_data):
+    test_intensity, test_metadata = time_series_test_data
+    outputs = time_series_linear_regression(test_intensity, test_metadata, "Protein1", 0.2)
+    assert "plots" in outputs
+    fig = outputs["plots"][0]
+    if show_figures:
+        fig.show()
+    return
+
+def test_linear_ransac_plot_invalid_test_size(time_series_test_data):
+    test_intensity, test_metadata = time_series_test_data
+    with pytest.raises(ValueError):
+        time_series_linear_regression(test_intensity, test_metadata, "Protein1", 2)
+    return
+
+def test_ransac_regression_outputs(time_series_test_data):
     test_intensity, test_metadata = time_series_test_data
     outputs = time_series_linear_regression(test_intensity, test_metadata, "Protein1", 0.2)
     assert "train_root_mean_squared" in outputs
diff --git a/user_data/workflows/workflow_BA_Kuganash.yaml b/user_data/workflows/workflow_BA_Kuganash.yaml
new file mode 100644
index 00000000..1a19947c
--- /dev/null
+++ b/user_data/workflows/workflow_BA_Kuganash.yaml
@@ -0,0 +1,89 @@
+df_mode: disk_memory
+steps:
+- form_inputs:
+    aggregation_method: Median
+    intensity_name: Intensity
+    map_to_uniprot: false
+  inputs: {}
+  instance_identifier: MaxQuantImport_1
+  type: MaxQuantImport
+- form_inputs:
+    intensity_name: Intensity
+    map_to_uniprot: false
+  inputs: {}
+  instance_identifier: EvidenceImport_1
+  type: EvidenceImport
+- form_inputs:
+    feature_orientation: Columns (samples in rows, features in columns)
+  inputs: {}
+  instance_identifier: MetadataImport_1
+  type: MetadataImport
+- form_inputs:
+    percentage: 0.5
+  inputs: {}
+  instance_identifier: FilterProteinsBySamplesMissing_1
+  plot_inputs:
+    graph_type: Bar chart
+  type: FilterProteinsBySamplesMissing
+- form_inputs:
+    deviation_threshold: 2.0
+  inputs: {}
+  instance_identifier: FilterSamplesByProteinIntensitiesSum_1
+  plot_inputs:
+    graph_type: Bar chart
+  type: FilterSamplesByProteinIntensitiesSum
+- form_inputs:
+    number_of_neighbours: 5
+  inputs: {}
+  instance_identifier: ImputationByKNN_1
+  plot_inputs:
+    graph_type: Boxplot
+    graph_type_quantities: Bar chart
+    group_by: None
+    visual_transformation: log10
+  type: ImputationByKNN
+- form_inputs:
+    number_of_neighbors: 20
+  inputs: {}
+  instance_identifier: OutlierDetectionByLocalOutlierFactor_1
+  plot_inputs: {}
+  type: OutlierDetectionByLocalOutlierFactor
+- form_inputs:
+    percentile: 0.5
+  inputs: {}
+  instance_identifier: NormalisationByMedian_1
+  plot_inputs:
+    graph_type: Boxplot
+    group_by: None
+    visual_transformation: log10
+  type: NormalisationByMedian
+- form_inputs:
+    log_base: log2
+  inputs: {}
+  instance_identifier: TransformationLog_1
+  plot_inputs:
+    graph_type: Histogram
+    group_by: None
+  type: TransformationLog
+- form_inputs:
+    input_df: TransformationLog_1
+    protein_group: D3YYU8
+    similarity: 1
+    similarity_measure: euclidean distance
+  inputs: {}
+  instance_identifier: PlotTimeSeries_1
+  type: PlotTimeSeriesPeptide
+- form_inputs:
+    input_df: TransformationLog_1
+    protein_group: D3YYU8
+    test_size: 0.2
+  inputs: {}
+  instance_identifier: TimeSeriesLinearRegression_1
+  type: TimeSeriesLinearRegression
+- form_inputs:
+    input_df: TransformationLog_1
+    protein_group: D3YYU8
+    test_size: 0.2
+  inputs: {}
+  instance_identifier: TimeSeriesRANSACRegression_1
+  type: TimeSeriesRANSACRegression

From 37180af62f7ea667abb3b07d9144ea297dbabf25 Mon Sep 17 00:00:00 2001
From: AK <kuganash.ravishanker@student.hpi.de>
Date: Thu, 25 Jul 2024 14:12:51 +0200
Subject: [PATCH 26/52] Implemented Augmented Dickey-Fuller test to check
 whether a time series is stationary

---
 .../time_series_regression_analysis.py        | 103 ++++++++++++++++++
 protzilla/methods/data_analysis.py            |  32 +++++-
 .../test_time_series_analysis.py              |  20 +++-
 ui/runs/form_mapping.py                       |   1 +
 ui/runs/forms/data_analysis.py                |  34 ++++++
 5 files changed, 184 insertions(+), 6 deletions(-)

diff --git a/protzilla/data_analysis/time_series_regression_analysis.py b/protzilla/data_analysis/time_series_regression_analysis.py
index d61c5815..1af2a6a1 100644
--- a/protzilla/data_analysis/time_series_regression_analysis.py
+++ b/protzilla/data_analysis/time_series_regression_analysis.py
@@ -1,3 +1,5 @@
+import logging
+
 import numpy as np
 import pandas as pd
 import plotly.graph_objects as go
@@ -8,6 +10,8 @@
 from sklearn.linear_model import LinearRegression, RANSACRegressor
 from sklearn.model_selection import train_test_split
 from sklearn.metrics import mean_squared_error, r2_score
+from statsmodels.tsa.arima.model import ARIMA
+from statsmodels.tsa.stattools import adfuller
 from plotly.subplots import make_subplots
 
 colors = {
@@ -269,3 +273,102 @@ def time_series_ransac_regression(
         plots=[fig],
     )
 
+
+def adfuller_test(
+    input_df: pd.DataFrame,
+    metadata_df: pd.DataFrame,
+    protein_group: str,
+    alpha: float = 0.05,
+) -> dict:
+    """
+    Perform the Augmented Dickey-Fuller test to check for stationarity in a time series.
+    :param input_df: The dataframe containing the time series data; its "Protein ID", "Sample" and "Intensity" columns are used.
+    :param metadata_df: The dataframe containing the metadata; it is merged with input_df on the "Sample" column.
+    :param protein_group: The protein group to perform the test on (matched against the "Protein ID" column).
+    :param alpha: The significance level the p-value is compared against (default is 0.05).
+
+    :return: A dictionary containing:
+        - test_statistic: The test statistic from the ADF test.
+        - p_value: The p-value from the ADF test.
+        - critical_values: The critical values for different significance levels.
+        - is_stationary: True if p_value < alpha, i.e. the series is considered stationary.
+        - messages: A list of messages for the user.
+    """
+
+    messages = []
+    input_df = input_df[input_df['Protein ID'] == protein_group]
+
+    input_df = pd.merge(
+        left=input_df,
+        right=metadata_df,
+        on="Sample",
+        copy=False,
+    )
+
+    input_df = input_df["Intensity"].dropna()
+
+    # Run the ADF test on the non-missing intensities; NOTE(review): rows are not sorted by time here - confirm they are chronological
+    result = adfuller(input_df)
+    test_statistic = result[0]
+    p_value = result[1]
+    critical_values = result[4]
+
+    # The series counts as stationary when the p-value falls below the chosen significance level
+    is_stationary = p_value < alpha
+
+    # Report the outcome to the user (INFO if stationary, WARNING otherwise)
+    if is_stationary:
+        messages.append(
+            {
+                "level": logging.INFO,
+                "msg": f"The time series is stationary (p-value: {p_value:.5f}).",
+            }
+        )
+    else:
+        messages.append(
+            {
+                "level": logging.WARNING,
+                "msg": f"The time series is not stationary (p-value: {p_value:.5f}).",
+            }
+        )
+    """
+    fig = go.Figure()
+
+    annotation_text = (
+        f"Test Statistic: {test_statistic:.3f}<br>"
+        f"P-Value: {p_value:.3f}<br>"
+        f"Critical Values:<br>"
+        f"Is Stationary: {is_stationary}"
+    )
+
+    fig.add_trace(
+        go.Scatter(
+            x=[0],
+            y=[0.25],
+            text=[annotation_text],
+            mode='text',
+            textfont=dict(size=12),
+            showlegend=False
+        )
+    )
+
+    fig.update_layout(
+        title=f"Augmented Dickey-Fuller Test for {protein_group}",
+        autosize=True,
+        margin=dict(l=100, r=100, t=100, b=50),
+    )
+
+    # Hide x-axis of the annotation subplot
+    fig.update_xaxes(showticklabels=False, showgrid=False, zeroline=False)
+    fig.update_yaxes(showticklabels=False, showgrid=False, zeroline=False)
+
+    fig.update_annotations(font_size=12)
+    """
+    return dict(
+        test_statistic=test_statistic,
+        p_value=p_value,
+        critical_values=critical_values,
+        is_stationary=is_stationary,
+        messages=messages,
+    )
+
diff --git a/protzilla/methods/data_analysis.py b/protzilla/methods/data_analysis.py
index 687987a4..92844a21 100644
--- a/protzilla/methods/data_analysis.py
+++ b/protzilla/methods/data_analysis.py
@@ -14,7 +14,7 @@
 )
 from protzilla.data_analysis.differential_expression_t_test import t_test
 from protzilla.data_analysis.dimension_reduction import t_sne, umap
-from protzilla.data_analysis.time_series_regression_analysis import time_series_linear_regression, time_series_ransac_regression
+from protzilla.data_analysis.time_series_regression_analysis import time_series_linear_regression, time_series_ransac_regression, adfuller_test
 from protzilla.data_analysis.ptm_analysis import filter_peptides_of_protein, ptms_per_sample, \
     ptms_per_protein_and_sample
 from protzilla.data_analysis.model_evaluation import evaluate_classification_model
@@ -33,7 +33,7 @@
 )
 from protzilla.data_analysis.ptm_quantification import flexiquant_lf
 from protzilla.methods.data_preprocessing import TransformationLog
-from protzilla.steps import Plots, Step, StepManager, DisplayOutput
+from protzilla.steps import Plots, Step, StepManager
 
 
 class DataAnalysisStep(Step):
@@ -852,6 +852,34 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict:
         inputs["metadata_df"] = steps.metadata_df
         return inputs
 
+
+class TimeSeriesADFullerTest(DataAnalysisStep):
+    display_name = "Augmented Dickey-Fuller Test"
+    operation = "Time series analysis"
+    method_description = "Perform Augmented Dickey-Fuller test on the time series data for a given protein group."
+
+    input_keys = [
+        "input_df",
+        "metadata_df",
+        "protein_group",
+        "alpha",
+    ]
+    output_keys = [
+        "test_statistic",
+        "p_value",
+        "critical_values",
+        "is_stationary",
+     ]
+
+    def method(self, inputs: dict) -> dict:
+        return adfuller_test(**inputs)
+
+    def insert_dataframes(self, steps: StepManager, inputs) -> dict:
+        inputs["input_df"] = steps.get_step_output(Step, "peptide_df", inputs["input_df"])
+        inputs["metadata_df"] = steps.metadata_df
+        return inputs
+
+
 class PTMsPerSample(DataAnalysisStep):
     display_name = "PTMs per Sample"
     operation = "Peptide analysis"
diff --git a/tests/protzilla/data_analysis/test_time_series_analysis.py b/tests/protzilla/data_analysis/test_time_series_analysis.py
index 74c5d5c1..5c359d5d 100644
--- a/tests/protzilla/data_analysis/test_time_series_analysis.py
+++ b/tests/protzilla/data_analysis/test_time_series_analysis.py
@@ -1,7 +1,7 @@
 import pandas as pd
 import pytest
 
-from protzilla.data_analysis.time_series_regression_analysis import time_series_linear_regression
+from protzilla.data_analysis.time_series_regression_analysis import time_series_linear_regression, time_series_ransac_regression, adfuller_test
 
 
 @pytest.fixture
@@ -81,7 +81,7 @@ def test_linear_regression_outputs(time_series_test_data):
 
 def test_ransac_regression_plot(show_figures, time_series_test_data):
     test_intensity, test_metadata = time_series_test_data
-    outputs = time_series_linear_regression(test_intensity, test_metadata, "Protein1", 0.2)
+    outputs = time_series_ransac_regression(test_intensity, test_metadata, "Protein1", 0.2)
     assert "plots" in outputs
     fig = outputs["plots"][0]
     if show_figures:
@@ -91,14 +91,26 @@ def test_ransac_regression_plot(show_figures, time_series_test_data):
 def test_linear_ransac_plot_invalid_test_size(time_series_test_data):
     test_intensity, test_metadata = time_series_test_data
     with pytest.raises(ValueError):
-        time_series_linear_regression(test_intensity, test_metadata, "Protein1", 2)
+        time_series_ransac_regression(test_intensity, test_metadata, "Protein1", 2)
     return
 
 def test_ransac_regression_outputs(time_series_test_data):
     test_intensity, test_metadata = time_series_test_data
-    outputs = time_series_linear_regression(test_intensity, test_metadata, "Protein1", 0.2)
+    outputs = time_series_ransac_regression(test_intensity, test_metadata, "Protein1", 0.2)
     assert "train_root_mean_squared" in outputs
     assert "test_root_mean_squared" in outputs
     assert "train_r2_score" in outputs
     assert "test_r2_score" in outputs
+    return
+
+
+def test_adfuller_test(time_series_test_data):
+    test_intensity, test_metadata = time_series_test_data
+    outputs = adfuller_test(test_intensity, test_metadata, "Protein1")
+
+    assert "test_statistic" in outputs
+    assert "p_value" in outputs
+    assert "critical_values" in outputs
+    assert "is_stationary" in outputs
+    assert "messages" in outputs
     return
\ No newline at end of file
diff --git a/ui/runs/form_mapping.py b/ui/runs/form_mapping.py
index 079e1569..bca07db4 100644
--- a/ui/runs/form_mapping.py
+++ b/ui/runs/form_mapping.py
@@ -68,6 +68,7 @@
     data_analysis.PTMsProteinAndPerSample: data_analysis_forms.PTMsPerProteinAndSampleForm,
     data_analysis.TimeSeriesLinearRegression: data_analysis_forms.TimeSeriesLinearRegressionForm,
     data_analysis.TimeSeriesRANSACRegression: data_analysis_forms.TimeSeriesRANSACRegressionForm,
+    data_analysis.TimeSeriesADFullerTest: data_analysis_forms.TimeSeriesADFullerTestForm,
     data_preprocessing.ImputationByMinPerSample: data_preprocessing_forms.ImputationByMinPerSampleForms,
     data_integration.EnrichmentAnalysisGOAnalysisWithString: data_integration_forms.EnrichmentAnalysisGOAnalysisWithStringForm,
     data_integration.EnrichmentAnalysisGOAnalysisWithEnrichr: data_integration_forms.EnrichmentAnalysisGOAnalysisWithEnrichrForm,
diff --git a/ui/runs/forms/data_analysis.py b/ui/runs/forms/data_analysis.py
index fd8de70d..a215f8bf 100644
--- a/ui/runs/forms/data_analysis.py
+++ b/ui/runs/forms/data_analysis.py
@@ -1269,6 +1269,40 @@ class TimeSeriesRANSACRegressionForm(MethodForm):
     )
 
 
+    def fill_form(self, run: Run) -> None:
+        self.fields["input_df"].choices = fill_helper.get_choices_for_peptide_df_steps(
+            run
+        )
+        input_df_instance_id = self.data.get(
+            "input_df", self.fields["input_df"].choices[0][0]
+        )
+
+        self.fields["protein_group"].choices = fill_helper.to_choices(
+            run.steps.get_step_output(
+                step_type=Step,
+                output_key="peptide_df",
+                instance_identifier=input_df_instance_id,
+            )["Protein ID"].unique()
+        )
+
+
+class TimeSeriesADFullerTestForm(MethodForm):
+    is_dynamic = True
+    input_df = CustomChoiceField(
+        choices=[],
+        label="Peptide dataframe",
+    )
+    protein_group = CustomChoiceField(
+        choices=[],
+        label="Protein group: which protein group to perform the ADFuller test on",
+    )
+    alpha = CustomFloatField(
+        label="Significance level",
+        min_value=0,
+        max_value=1,
+        initial=0.05
+    )
+
     def fill_form(self, run: Run) -> None:
         self.fields["input_df"].choices = fill_helper.get_choices_for_peptide_df_steps(
             run

From 21468fc26c3cddecb3e4643075d73b3ef12c04af Mon Sep 17 00:00:00 2001
From: henning <henning.gaertner@student.hpi.de>
Date: Wed, 10 Jul 2024 16:00:05 +0200
Subject: [PATCH 27/52] Implemented the option to do regression on each group

---
 protzilla/constants/colors.py                 |  25 +-
 .../time_series_regression_analysis.py        | 360 +++++++++++-------
 protzilla/methods/data_analysis.py            |  12 +-
 ui/runs/forms/data_analysis.py                |  14 +
 4 files changed, 270 insertions(+), 141 deletions(-)

diff --git a/protzilla/constants/colors.py b/protzilla/constants/colors.py
index eec08b1b..3f33249b 100644
--- a/protzilla/constants/colors.py
+++ b/protzilla/constants/colors.py
@@ -1,8 +1,23 @@
 PROTZILLA_DISCRETE_COLOR_SEQUENCE = [
-    "#4A536A",
-    "#87A8B9",
-    "#CE5A5A",
-    "#8E3325",
-    "#E2A46D",
+    # Muted Dark Slate (every hue in this list contributes three shades)
+    "#252935",
+    "#4A536A",
+    "#a4a9b4",
+    # Muted Indian Red
+    "#CE5A5A",
+    "#B04A4A",
+    "#EBBDBD",
+    # Muted Light Steel Blue
+    "#51646f",
+    "#87A8B9",
+    "#B7CAD5",
+    # Muted Sienna
+    "#804538",
+    "#8E3325",
+    "#471912",
+    # Muted Sandy Brown
+    "#715236",
+    "#E2A46D",
+    "#F0D1B6",
 ]
 PROTZILLA_DISCRETE_COLOR_OUTLIER_SEQUENCE = ["#4A536A", "#CE5A5A"]
diff --git a/protzilla/data_analysis/time_series_regression_analysis.py b/protzilla/data_analysis/time_series_regression_analysis.py
index 1af2a6a1..feb9997f 100644
--- a/protzilla/data_analysis/time_series_regression_analysis.py
+++ b/protzilla/data_analysis/time_series_regression_analysis.py
@@ -27,7 +27,8 @@ def time_series_linear_regression(
         input_df: pd.DataFrame,
         metadata_df: pd.DataFrame,
         protein_group: str,
-        test_size: float,
+        grouping: str = None,
+        test_size: float = 0.2,
 ):
     """
     Perform linear regression on the time series data for a given protein group.
@@ -35,11 +36,12 @@ def time_series_linear_regression(
     :param metadata_df: Metadata dataframe which contains the timestamps
     :param protein_group: Protein group to perform the analysis on
     :param test_size: The proportion of the dataset to include in the test split
+    :param grouping: Option to select whether regression should be performed on the entire dataset or separately on the control and experimental groups
 
     :return: A dictionary containing the root mean squared error and r2 score for the training and test sets
     """
-
-    if test_size < 0 or test_size > 1 :
+    color_index = 0
+    if test_size < 0 or test_size > 1:
         raise ValueError("Test size should be between 0 and 1")
 
     input_df = input_df[input_df['Protein ID'] == protein_group]
@@ -56,47 +58,109 @@ def time_series_linear_regression(
     X = input_df[["Time"]]
     y = input_df["Intensity"]
 
-    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, shuffle=False)
-    model = LinearRegression()
-    model.fit(X_train, y_train)
-
-    y_pred_train = model.predict(X_train)
-    y_pred_test = model.predict(X_test)
-
-    train_rmse = np.sqrt(mean_squared_error(y_train, y_pred_train))
-    test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))
-    train_r2 = r2_score(y_train, y_pred_train)
-    test_r2 = r2_score(y_test, y_pred_test)
-
-    train_df = pd.DataFrame({'Time': X_train['Time'], 'Intensity': y_train, 'Predicted': y_pred_train, 'Type': 'Train'})
-    test_df = pd.DataFrame({'Time': X_test['Time'], 'Intensity': y_test, 'Predicted': y_pred_test, 'Type': 'Test'})
-    plot_df = pd.concat([train_df, test_df])
-
     fig = make_subplots(rows=1, cols=2, column_widths=[0.75, 0.25], vertical_spacing=0.025)
 
-    fig.add_trace(go.Scatter(
-        x=plot_df['Time'],
-        y=plot_df['Intensity'],
-        mode='markers',
-        name='Actual Intensity',
-        marker=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[0])
-    ), row=1, col=1)
+    scores = []
+
+    if grouping == "With Grouping" and "Group" in input_df.columns:
+        groups = input_df["Group"].unique()
+        for group in groups:
+            group_df = input_df[input_df["Group"] == group]
+            X_group = group_df[["Time"]]
+            y_group = group_df["Intensity"]
+
+            X_train, X_test, y_train, y_test = train_test_split(X_group, y_group, test_size=test_size, shuffle=False)
+            model = LinearRegression()
+            model.fit(X_train, y_train)
+
+            y_pred_train = model.predict(X_train)
+            y_pred_test = model.predict(X_test)
+
+            train_rmse = np.sqrt(mean_squared_error(y_train, y_pred_train))
+            test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))
+            train_r2 = r2_score(y_train, y_pred_train)
+            test_r2 = r2_score(y_test, y_pred_test)
+
+            train_df = pd.DataFrame({'Time': X_train['Time'], 'Intensity': y_train, 'Predicted': y_pred_train, 'Type': 'Train'})
+            test_df = pd.DataFrame({'Time': X_test['Time'], 'Intensity': y_test, 'Predicted': y_pred_test, 'Type': 'Test'})
+            plot_df = pd.concat([train_df, test_df])
+
+            color = PROTZILLA_DISCRETE_COLOR_SEQUENCE[color_index % len(PROTZILLA_DISCRETE_COLOR_SEQUENCE)]
+            color_index += 3
+
+            fig.add_trace(go.Scatter(
+                x=plot_df['Time'],
+                y=plot_df['Intensity'],
+                mode='markers',
+                name=f'Actual Intensity ({group})',
+                marker=dict(color=color)
+            ), row=1, col=1)
+
+            fig.add_trace(go.Scatter(
+                x=plot_df['Time'],
+                y=plot_df['Predicted'],
+                mode='lines',
+                name=f'Predicted Intensity ({group})',
+                line=dict(color=color)
+            ), row=1, col=1)
+
+            scores.append({
+                'group': group,
+                'train_root_mean_squared': train_rmse,
+                'test_root_mean_squared': test_rmse,
+                'train_r2_score': train_r2,
+                'test_r2_score': test_r2,
+            })
 
-    fig.add_trace(go.Scatter(
-        x=plot_df['Time'],
-        y=plot_df['Predicted'],
-        mode='lines',
-        name='Predicted Intensity',
-        line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[2])
-    ), row=1, col=1)
+    else:
+        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, shuffle=False)
+        model = LinearRegression()
+        model.fit(X_train, y_train)
+
+        y_pred_train = model.predict(X_train)
+        y_pred_test = model.predict(X_test)
+
+        train_rmse = np.sqrt(mean_squared_error(y_train, y_pred_train))
+        test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))
+        train_r2 = r2_score(y_train, y_pred_train)
+        test_r2 = r2_score(y_test, y_pred_test)
+
+        train_df = pd.DataFrame({'Time': X_train['Time'], 'Intensity': y_train, 'Predicted': y_pred_train, 'Type': 'Train'})
+        test_df = pd.DataFrame({'Time': X_test['Time'], 'Intensity': y_test, 'Predicted': y_pred_test, 'Type': 'Test'})
+        plot_df = pd.concat([train_df, test_df])
+
+        fig.add_trace(go.Scatter(
+            x=plot_df['Time'],
+            y=plot_df['Intensity'],
+            mode='markers',
+            name='Actual Intensity',
+            marker=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[0])
+        ), row=1, col=1)
+
+        fig.add_trace(go.Scatter(
+            x=plot_df['Time'],
+            y=plot_df['Predicted'],
+            mode='lines',
+            name='Predicted Intensity',
+            line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[2])
+        ), row=1, col=1)
+
+        scores.append({
+            'group': 'Overall',
+            'train_root_mean_squared': train_rmse,
+            'test_root_mean_squared': test_rmse,
+            'train_r2_score': train_r2,
+            'test_r2_score': test_r2,
+        })
 
     # Add annotation text as a separate trace in the subplot
-    annotation_text = (
-        f"Train RMSE: {train_rmse:.3f}<br>"
-        f"Test RMSE: {test_rmse:.3f}<br>"
-        f"Train R²: {train_r2:.3f}<br>"
-        f"Test R²: {test_r2:.3f}"
-    )
+    annotation_text = "<br>".join([
+        f"Group: {res['group']}<br>Train RMSE: {res['train_root_mean_squared']:.3f}<br>"
+        f"Test RMSE: {res['test_root_mean_squared']:.3f}<br>"
+        f"Train R²: {res['train_r2_score']:.3f}<br>"
+        f"Test R²: {res['test_r2_score']:.3f}"
+        for res in scores
+    ])
 
     fig.add_trace(go.Scatter(
         x=[0],
@@ -134,10 +198,7 @@ def time_series_linear_regression(
     fig.update_annotations(font_size=12)
 
     return dict(
-        train_root_mean_squared=train_rmse,
-        test_root_mean_squared=test_rmse,
-        train_r2_score=train_r2,
-        test_r2_score=test_r2,
+        scores=scores,
         plots=[fig],
     )
 
@@ -146,6 +207,7 @@ def time_series_ransac_regression(
         input_df: pd.DataFrame,
         metadata_df: pd.DataFrame,
         protein_group: str,
+        grouping: str,
         test_size: float,
 ):
     """
@@ -158,6 +220,7 @@ def time_series_ransac_regression(
     :return: A dictionary containing the root mean squared error and r2 score for the training and test sets
     """
 
+    color_index = 0
     if test_size < 0 or test_size > 1:
         raise ValueError("Test size should be between 0 and 1")
 
@@ -175,60 +238,134 @@ def time_series_ransac_regression(
     X = input_df[["Time"]]
     y = input_df["Intensity"]
 
-    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, shuffle=False)
-    model = RANSACRegressor(base_estimator=LinearRegression())
-    model.fit(X_train, y_train)
-
-    inlier_mask = model.inlier_mask_
-
-    y_pred_train = model.predict(X_train)
-    y_pred_test = model.predict(X_test)
-
-    train_rmse = np.sqrt(mean_squared_error(y_train[inlier_mask], y_pred_train[inlier_mask]))
-    test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))
-    train_r2 = r2_score(y_train[inlier_mask], y_pred_train[inlier_mask])
-    test_r2 = r2_score(y_test, y_pred_test)
-
-    train_df = pd.DataFrame({'Time': X_train["Time"], 'Intensity': y_train, 'Predicted': y_pred_train, 'Type': 'Train'})
-    test_df = pd.DataFrame({'Time': X_test["Time"], 'Intensity': y_test, 'Predicted': y_pred_test, 'Type': 'Test'})
-    train_df['Inlier'] = inlier_mask
-    test_df['Inlier'] = False
-    plot_df = pd.concat([train_df, test_df])
-
     fig = make_subplots(rows=1, cols=2, column_widths=[0.75, 0.25], vertical_spacing=0.025)
 
-    # Add main plot traces
-    fig.add_trace(go.Scatter(
-        x=plot_df['Time'],
-        y=plot_df['Intensity'],
-        mode='markers',
-        name='Actual Intensity',
-        marker=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[0])
-    ), row=1, col=1)
-
-    fig.add_trace(go.Scatter(
-        x=plot_df['Time'],
-        y=plot_df['Predicted'],
-        mode='lines',
-        name='Predicted Intensity',
-        line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[2])
-    ), row=1, col=1)
+    scores = []
+
+    if grouping == "With Grouping" and "Group" in input_df.columns:
+        groups = input_df["Group"].unique()
+        for group in groups:
+            group_df = input_df[input_df["Group"] == group]
+            X_group = group_df[["Time"]]
+            y_group = group_df["Intensity"]
+
+            X_train, X_test, y_train, y_test = train_test_split(X_group, y_group, test_size=test_size, shuffle=False)
+            model = RANSACRegressor(base_estimator=LinearRegression())
+            model.fit(X_train, y_train)
+
+            inlier_mask = model.inlier_mask_
+
+            y_pred_train = model.predict(X_train)
+            y_pred_test = model.predict(X_test)
+
+            train_rmse = np.sqrt(mean_squared_error(y_train[inlier_mask], y_pred_train[inlier_mask]))
+            test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))
+            train_r2 = r2_score(y_train[inlier_mask], y_pred_train[inlier_mask])
+            test_r2 = r2_score(y_test, y_pred_test)
+
+            train_df = pd.DataFrame({'Time': X_train["Time"], 'Intensity': y_train, 'Predicted': y_pred_train, 'Type': 'Train'})
+            test_df = pd.DataFrame({'Time': X_test["Time"], 'Intensity': y_test, 'Predicted': y_pred_test, 'Type': 'Test'})
+            train_df['Inlier'] = inlier_mask
+            test_df['Inlier'] = False
+            plot_df = pd.concat([train_df, test_df])
+
+            # Add main plot traces
+            fig.add_trace(go.Scatter(
+                x=plot_df['Time'],
+                y=plot_df['Intensity'],
+                mode='markers',
+                name='Actual Intensity',
+                marker=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[color_index])
+            ), row=1, col=1)
+
+            fig.add_trace(go.Scatter(
+                x=plot_df['Time'],
+                y=plot_df['Predicted'],
+                mode='lines',
+                name='Predicted Intensity',
+                line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[color_index + 1])
+            ), row=1, col=1)
+
+            fig.add_trace(go.Scatter(
+                x=plot_df[plot_df['Inlier'] == False]['Time'],
+                y=plot_df[plot_df['Inlier'] == False]['Intensity'],
+                mode='markers',
+                name='Outliers',
+                marker=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[color_index + 2])
+            ), row=1, col=1)
+
+            color_index += 3
+
+            scores.append({
+                'group': group,
+                'train_root_mean_squared': train_rmse,
+                'test_root_mean_squared': test_rmse,
+                'train_r2_score': train_r2,
+                'test_r2_score': test_r2,
+            })
 
-    fig.add_trace(go.Scatter(
-        x=plot_df[plot_df['Inlier'] == False]['Time'],
-        y=plot_df[plot_df['Inlier'] == False]['Intensity'],
-        mode='markers',
-        name='Outliers',
-        marker=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[1])
-    ), row=1, col=1)
+    else:
+        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, shuffle=False)
+        model = RANSACRegressor(base_estimator=LinearRegression())
+        model.fit(X_train, y_train)
+
+        inlier_mask = model.inlier_mask_
+
+        y_pred_train = model.predict(X_train)
+        y_pred_test = model.predict(X_test)
+
+        train_rmse = np.sqrt(mean_squared_error(y_train[inlier_mask], y_pred_train[inlier_mask]))
+        test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))
+        train_r2 = r2_score(y_train[inlier_mask], y_pred_train[inlier_mask])
+        test_r2 = r2_score(y_test, y_pred_test)
+
+        train_df = pd.DataFrame({'Time': X_train["Time"], 'Intensity': y_train, 'Predicted': y_pred_train, 'Type': 'Train'})
+        test_df = pd.DataFrame({'Time': X_test["Time"], 'Intensity': y_test, 'Predicted': y_pred_test, 'Type': 'Test'})
+        train_df['Inlier'] = inlier_mask
+        test_df['Inlier'] = False
+        plot_df = pd.concat([train_df, test_df])
+
+        # Add main plot traces
+        fig.add_trace(go.Scatter(
+            x=plot_df['Time'],
+            y=plot_df['Intensity'],
+            mode='markers',
+            name='Actual Intensity',
+            marker=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[0])
+        ), row=1, col=1)
+
+        fig.add_trace(go.Scatter(
+            x=plot_df['Time'],
+            y=plot_df['Predicted'],
+            mode='lines',
+            name='Predicted Intensity',
+            line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[0])
+        ), row=1, col=1)
+
+        fig.add_trace(go.Scatter(
+            x=plot_df[plot_df['Inlier'] == False]['Time'],
+            y=plot_df[plot_df['Inlier'] == False]['Intensity'],
+            mode='markers',
+            name='Outliers',
+            marker=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[2])
+        ), row=1, col=1)
+
+        scores.append({
+            'group': 'Overall',
+            'train_root_mean_squared': train_rmse,
+            'test_root_mean_squared': test_rmse,
+            'train_r2_score': train_r2,
+            'test_r2_score': test_r2,
+        })
 
     # Add annotation text as a separate trace in the subplot
-    annotation_text = (
-        f"Train RMSE: {train_rmse:.3f}<br>"
-        f"Test RMSE: {test_rmse:.3f}<br>"
-        f"Train R²: {train_r2:.3f}<br>"
-        f"Test R²: {test_r2:.3f}"
-    )
+    annotation_text = "<br>".join([
+        f"Group: {res['group']}<br>Train RMSE: {res['train_root_mean_squared']:.3f}<br>"
+        f"Test RMSE: {res['test_root_mean_squared']:.3f}<br>"
+        f"Train R²: {res['train_r2_score']:.3f}<br>"
+        f"Test R²: {res['test_r2_score']:.3f}"
+        for res in scores
+    ])
 
     fig.add_trace(go.Scatter(
         x=[0],
@@ -266,10 +403,7 @@ def time_series_ransac_regression(
     fig.update_annotations(font_size=12)
 
     return dict(
-        train_root_mean_squared=train_rmse,
-        test_root_mean_squared=test_rmse,
-        train_r2_score=train_r2,
-        test_r2_score=test_r2,
+        scores=scores,
         plots=[fig],
     )
 
@@ -295,6 +429,8 @@ def adfuller_test(
         - messages: A list of messages for the user.
     """
 
+    # TODO: Info box for the user
+
     messages = []
     input_df = input_df[input_df['Protein ID'] == protein_group]
 
@@ -331,39 +467,7 @@ def adfuller_test(
                 "msg": f"The time series is not stationary (p-value: {p_value:.5f}).",
             }
         )
-    """
-    fig = go.Figure()
-
-    annotation_text = (
-        f"Test Statistic: {test_statistic:.3f}<br>"
-        f"P-Value: {p_value:.3f}<br>"
-        f"Critical Values:<br>"
-        f"Is Stationary: {is_stationary}"
-    )
-
-    fig.add_trace(
-        go.Scatter(
-            x=[0],
-            y=[0.25],
-            text=[annotation_text],
-            mode='text',
-            textfont=dict(size=12),
-            showlegend=False
-        )
-    )
 
-    fig.update_layout(
-        title=f"Augmented Dickey-Fuller Test for {protein_group}",
-        autosize=True,
-        margin=dict(l=100, r=100, t=100, b=50),
-    )
-
-    # Hide x-axis of the annotation subplot
-    fig.update_xaxes(showticklabels=False, showgrid=False, zeroline=False)
-    fig.update_yaxes(showticklabels=False, showgrid=False, zeroline=False)
-
-    fig.update_annotations(font_size=12)
-    """
     return dict(
         test_statistic=test_statistic,
         p_value=p_value,
diff --git a/protzilla/methods/data_analysis.py b/protzilla/methods/data_analysis.py
index 92844a21..1c3f7093 100644
--- a/protzilla/methods/data_analysis.py
+++ b/protzilla/methods/data_analysis.py
@@ -809,13 +809,11 @@ class TimeSeriesLinearRegression(PlotStep):
         "input_df",
         "metadata_df",
         "protein_group",
+        "grouping",
         "test_size",
     ]
     output_keys = [
-        "train_root_mean_squared",
-        "test_root_mean_squared",
-        "train_r2_score",
-        "test_r2_score",
+        "scores",
     ]
 
     def method(self, inputs: dict) -> dict:
@@ -836,13 +834,11 @@ class TimeSeriesRANSACRegression(PlotStep):
         "input_df",
         "metadata_df",
         "protein_group",
+        "grouping",
         "test_size",
     ]
     output_keys = [
-        "train_root_mean_squared",
-        "test_root_mean_squared",
-        "train_r2_score",
-        "test_r2_score",
+        "scores",
     ]
     def method(self, inputs: dict) -> dict:
         return time_series_ransac_regression(**inputs)
diff --git a/ui/runs/forms/data_analysis.py b/ui/runs/forms/data_analysis.py
index a215f8bf..f3d2e849 100644
--- a/ui/runs/forms/data_analysis.py
+++ b/ui/runs/forms/data_analysis.py
@@ -145,6 +145,10 @@ class DimensionReductionMetric(Enum):
     cosine = "cosine"
     havensine = "havensine"
 
+class TimeSeriesGrouping(Enum):
+    with_grouping = "With Grouping"
+    without_grouping = "Without Grouping"
+
 
 class DifferentialExpressionANOVAForm(MethodForm):
     is_dynamic = True
@@ -1224,6 +1228,11 @@ class TimeSeriesLinearRegressionForm(MethodForm):
         choices=[],
         label="Protein group: which protein group to perform the linear regression on",
     )
+    grouping = CustomChoiceField(
+        choices= TimeSeriesGrouping,
+        label="Option to select whether regression should be performed on the entire dataset or separately on the control and experimental groups",
+        initial=TimeSeriesGrouping.with_grouping
+    )
     test_size = CustomFloatField(
         label="Test size: proportion of the dataset to include in the test split",
         min_value=0,
@@ -1260,6 +1269,11 @@ class TimeSeriesRANSACRegressionForm(MethodForm):
         choices=[],
         label="Protein group: which protein group to perform the RANSAC regression on",
     )
+    grouping = CustomChoiceField(
+        choices= TimeSeriesGrouping,
+        label="Option to select whether regression should be performed on the entire dataset or separately on the control and experimental groups",
+        initial=TimeSeriesGrouping.with_grouping
+    )
     test_size = CustomFloatField(
         label="Test size: proportion of the dataset to include in the test split",
         min_value=0,

From 0153a1a61e9b96c4b8441980bc86c3fa43f696ed Mon Sep 17 00:00:00 2001
From: AK <kuganash.ravishanker@student.hpi.de>
Date: Wed, 31 Jul 2024 12:07:55 +0200
Subject: [PATCH 28/52] Cherry picked Text Field from Henning's BA

---
 ui/runs/forms/custom_fields.py | 31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)

diff --git a/ui/runs/forms/custom_fields.py b/ui/runs/forms/custom_fields.py
index 7171f173..7370b64b 100644
--- a/ui/runs/forms/custom_fields.py
+++ b/ui/runs/forms/custom_fields.py
@@ -1,6 +1,8 @@
+import json
 import logging
 from enum import Enum
 
+import django.forms as forms
 from django.forms import (
     BooleanField,
     CharField,
@@ -126,3 +128,32 @@ class CustomFloatField(FloatField):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         self.widget.attrs.update({"class": "form-control mb-2"})
+
+
+from django import forms
+from django.utils.safestring import mark_safe
+
+
+class TextDisplayWidget(forms.Widget):
+    """Widget that renders its "data-display-text" attribute as static text inside a div."""
+
+    def render(self, name, value, attrs=None, renderer=None):
+        # NOTE(review): the text is inserted unescaped and marked safe, so it must be trusted.
+        display_text = self.attrs.get("data-display-text", "")
+        # Quote the class list: unquoted, only "form-control" is the class and "mb-2" is lost.
+        return mark_safe(f'<div class="form-control mb-2">{display_text}</div>')
+
+
+class TextDisplayField(forms.Field):  # form field whose widget shows a fixed text
+    widget = TextDisplayWidget
+
+    def __init__(self, *args, **kwargs):
+        self.text = kwargs.pop("text", "")  # custom kwarg, removed before forms.Field sees it
+        kwargs["required"] = False  # always optional, whatever the caller passed
+        super().__init__(*args, **kwargs)
+        self.update_text()
+
+    def update_text(self, text=None):
+        if text is not None:  # None keeps the current text and only re-syncs the widget
+            self.text = text
+        self.widget.attrs["data-display-text"] = self.text  # read back by TextDisplayWidget.render

From 31bd7af965043b05e7fbf47243a8a792b6b2a5ee Mon Sep 17 00:00:00 2001
From: AK <kuganash.ravishanker@student.hpi.de>
Date: Wed, 31 Jul 2024 12:46:13 +0200
Subject: [PATCH 29/52] Added info box for ADFuller Test

---
 ui/runs/forms/data_analysis.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/ui/runs/forms/data_analysis.py b/ui/runs/forms/data_analysis.py
index f3d2e849..77b7384a 100644
--- a/ui/runs/forms/data_analysis.py
+++ b/ui/runs/forms/data_analysis.py
@@ -21,6 +21,7 @@
     CustomFloatField,
     CustomMultipleChoiceField,
     CustomNumberField,
+    TextDisplayField
 )
 
 
@@ -1302,6 +1303,16 @@ def fill_form(self, run: Run) -> None:
 
 class TimeSeriesADFullerTestForm(MethodForm):
     is_dynamic = True
+    test_info = TextDisplayField(
+        label="Information about the Augmented Dickey-Fuller test",
+        text=(
+            "The Augmented Dickey-Fuller test is a type of statistical test called a unit root test. The test "
+             "determines how strongly a time series is defined by a trend. The null hypothesis of the test is that the "
+             "time series can be represented by a unit root, which implies that the time series is not stationary. "
+             "The alternative hypothesis is that the time series is stationary. If the p-value is less than the "
+             "significance level, the null hypothesis can be rejected and the time series is considered stationary."
+        ),
+    )
     input_df = CustomChoiceField(
         choices=[],
         label="Peptide dataframe",

From fb476e4a45b34a11ae827bac50a6522865e976b1 Mon Sep 17 00:00:00 2001
From: AK <kuganash.ravishanker@student.hpi.de>
Date: Thu, 1 Aug 2024 10:56:43 +0200
Subject: [PATCH 30/52] Fixed Tests

---
 protzilla/data_analysis/time_series_helper.py | 10 +--
 .../test_time_series_analysis.py              | 65 ++++++++++++-------
 2 files changed, 47 insertions(+), 28 deletions(-)

diff --git a/protzilla/data_analysis/time_series_helper.py b/protzilla/data_analysis/time_series_helper.py
index 077e7e06..0fb294ed 100644
--- a/protzilla/data_analysis/time_series_helper.py
+++ b/protzilla/data_analysis/time_series_helper.py
@@ -1,12 +1,12 @@
 from datetime import datetime
 
-def convert_time_to_datetime(time_str):
+def convert_time_to_hours(time_str):
     """
-    Convert a string time to a datetime object
-    :param time_str: The time string to convert
+    Convert a string time to the number of hours since midnight.
+    :param time_str: The time string to convert in format '%H:%M:%S'
 
-    :return: A datetime object
+    :return: Number of hours since midnight as a float
     """
     time_obj = datetime.strptime(time_str, '%H:%M:%S')
-    hours_since_midnight = time_obj.hour
+    hours_since_midnight = time_obj.hour + time_obj.minute / 60 + time_obj.second / 3600
     return hours_since_midnight
\ No newline at end of file
diff --git a/tests/protzilla/data_analysis/test_time_series_analysis.py b/tests/protzilla/data_analysis/test_time_series_analysis.py
index 5c359d5d..4520d3f2 100644
--- a/tests/protzilla/data_analysis/test_time_series_analysis.py
+++ b/tests/protzilla/data_analysis/test_time_series_analysis.py
@@ -1,7 +1,11 @@
 import pandas as pd
 import pytest
 
-from protzilla.data_analysis.time_series_regression_analysis import time_series_linear_regression, time_series_ransac_regression, adfuller_test
+from protzilla.data_analysis.time_series_regression_analysis import (
+    time_series_linear_regression,
+    time_series_ransac_regression,
+    adfuller_test,
+)
 
 
 @pytest.fixture
@@ -43,20 +47,32 @@ def time_series_test_data():
     )
 
     test_metadata_df = (
-        ["Sample1", "02:00:00", 1],
-        ["Sample2", "06:00:00", 1],
-        ["Sample3", "10:00:00", 1],
-         ["Sample4", "14:00:00", 1],
+        ["Sample1", "02:00:00", "1"],
+        ["Sample2", "06:00:00", "1"],
+        ["Sample3", "10:00:00", "1"],
+         ["Sample4", "14:00:00", "1"],
+        ["Sample5", "2:00:00", "2"],
+        ["Sample6", "4:00:00", "2"],
+        ["Sample7", "6:00:00", "2"],
     )
     test_metadata_df = pd.DataFrame(
         data=test_metadata_df,
-        columns=["Sample", "Time", "Day"],
+        columns=["Sample", "Time", "Group"],
     )
     return test_intensity_df, test_metadata_df
 
-def test_linear_regression_plot(show_figures, time_series_test_data):
+def test_linear_regression_plot_with_grouping(show_figures, time_series_test_data):
     test_intensity, test_metadata = time_series_test_data
-    outputs = time_series_linear_regression(test_intensity, test_metadata, "Protein1", 0.2)
+    outputs = time_series_linear_regression(test_intensity, test_metadata, "Protein1", 0.2,"With Grouping")
+    assert "plots" in outputs
+    fig = outputs["plots"][0]
+    if show_figures:
+        fig.show()
+    return
+
+def test_linear_regression_plot_without_grouping(show_figures, time_series_test_data):
+    test_intensity, test_metadata = time_series_test_data
+    outputs = time_series_linear_regression(test_intensity, test_metadata, "Protein1", 0.2,"Without Grouping")
     assert "plots" in outputs
     fig = outputs["plots"][0]
     if show_figures:
@@ -66,41 +82,44 @@ def test_linear_regression_plot(show_figures, time_series_test_data):
 def test_linear_regression_plot_invalid_test_size(time_series_test_data):
     test_intensity, test_metadata = time_series_test_data
     with pytest.raises(ValueError):
-        time_series_linear_regression(test_intensity, test_metadata, "Protein1", 2)
+        time_series_linear_regression(test_intensity, test_metadata, "Protein1", 2, "Without Grouping")
     return
 
 def test_linear_regression_outputs(time_series_test_data):
     test_intensity, test_metadata = time_series_test_data
-    outputs = time_series_linear_regression(test_intensity, test_metadata, "Protein1", 0.2)
-    assert "train_root_mean_squared" in outputs
-    assert "test_root_mean_squared" in outputs
-    assert "train_r2_score" in outputs
-    assert "test_r2_score" in outputs
+    outputs = time_series_linear_regression(test_intensity, test_metadata, "Protein1", 0.2, "Without Grouping")
+    assert "scores" in outputs
     return
 
 
-def test_ransac_regression_plot(show_figures, time_series_test_data):
+def test_ransac_regression_plot_with_grouping(show_figures, time_series_test_data):
+    test_intensity, test_metadata = time_series_test_data
+    outputs = time_series_ransac_regression(test_intensity, test_metadata, "Protein1", 0.2, "With Grouping")
+    assert "plots" in outputs
+    fig = outputs["plots"][0]
+    if show_figures:
+        fig.show()
+    return
+
+def test_ransac_regression_plot_without_grouping(show_figures, time_series_test_data):
     test_intensity, test_metadata = time_series_test_data
-    outputs = time_series_ransac_regression(test_intensity, test_metadata, "Protein1", 0.2)
+    outputs = time_series_ransac_regression(test_intensity, test_metadata, "Protein1", 0.2, "Without Grouping")
     assert "plots" in outputs
     fig = outputs["plots"][0]
     if show_figures:
         fig.show()
     return
 
-def test_linear_ransac_plot_invalid_test_size(time_series_test_data):
+def test_ransac_plot_invalid_test_size(time_series_test_data):
     test_intensity, test_metadata = time_series_test_data
     with pytest.raises(ValueError):
-        time_series_ransac_regression(test_intensity, test_metadata, "Protein1", 2)
+        time_series_ransac_regression(test_intensity, test_metadata, "Protein1", 2, "Without Grouping")
     return
 
 def test_ransac_regression_outputs(time_series_test_data):
     test_intensity, test_metadata = time_series_test_data
-    outputs = time_series_ransac_regression(test_intensity, test_metadata, "Protein1", 0.2)
-    assert "train_root_mean_squared" in outputs
-    assert "test_root_mean_squared" in outputs
-    assert "train_r2_score" in outputs
-    assert "test_r2_score" in outputs
+    outputs = time_series_ransac_regression(test_intensity, test_metadata, "Protein1", 0.2, "Without Grouping")
+    assert "scores" in outputs
     return
 
 

From f7da4aaffd00d951113956378dcd84803f4e49fe Mon Sep 17 00:00:00 2001
From: AK <kuganash.ravishanker@student.hpi.de>
Date: Thu, 1 Aug 2024 11:00:04 +0200
Subject: [PATCH 31/52] implemented Auto ARIMA

---
 .../time_series_regression_analysis.py        | 198 ++++++++++++++++--
 protzilla/methods/data_analysis.py            |  41 +++-
 requirements.txt                              |   1 +
 ui/runs/form_mapping.py                       |   1 +
 ui/runs/forms/data_analysis.py                |  95 +++++++--
 5 files changed, 306 insertions(+), 30 deletions(-)

diff --git a/protzilla/data_analysis/time_series_regression_analysis.py b/protzilla/data_analysis/time_series_regression_analysis.py
index feb9997f..0e0a43ec 100644
--- a/protzilla/data_analysis/time_series_regression_analysis.py
+++ b/protzilla/data_analysis/time_series_regression_analysis.py
@@ -4,7 +4,7 @@
 import pandas as pd
 import plotly.graph_objects as go
 
-from protzilla.data_analysis.time_series_helper import convert_time_to_datetime
+from protzilla.data_analysis.time_series_helper import convert_time_to_hours
 from protzilla.constants.colors import PROTZILLA_DISCRETE_COLOR_SEQUENCE
 
 from sklearn.linear_model import LinearRegression, RANSACRegressor
@@ -12,6 +12,7 @@
 from sklearn.metrics import mean_squared_error, r2_score
 from statsmodels.tsa.arima.model import ARIMA
 from statsmodels.tsa.stattools import adfuller
+from pmdarima import auto_arima
 from plotly.subplots import make_subplots
 
 colors = {
@@ -27,21 +28,21 @@ def time_series_linear_regression(
         input_df: pd.DataFrame,
         metadata_df: pd.DataFrame,
         protein_group: str,
+        train_size: float = 0.2,
         grouping: str = None,
-        test_size: float = 0.2,
 ):
     """
     Perform linear regression on the time series data for a given protein group.
     :param input_df: Peptide dataframe which contains the intensity of each sample
     :param metadata_df: Metadata dataframe which contains the timestamps
     :param protein_group: Protein group to perform the analysis on
-    :param test_size: The proportion of the dataset to include in the test split
+    :param train_size: The proportion of the dataset to include in the test split
     :param grouping: Option to select whether regression should be performed on the entire dataset or separately on the control and experimental groups
 
     :return: A dictionary containing the root mean squared error and r2 score for the training and test sets
     """
     color_index = 0
-    if test_size < 0 or test_size > 1:
+    if train_size < 0 or train_size > 1:
         raise ValueError("Test size should be between 0 and 1")
 
     input_df = input_df[input_df['Protein ID'] == protein_group]
@@ -53,7 +54,7 @@ def time_series_linear_regression(
         copy=False,
     )
 
-    input_df["Time"] = input_df["Time"].apply(convert_time_to_datetime)
+    input_df["Time"] = input_df["Time"].apply(convert_time_to_hours)
     input_df = input_df.interpolate(method='linear', axis=0)
     X = input_df[["Time"]]
     y = input_df["Intensity"]
@@ -69,7 +70,7 @@ def time_series_linear_regression(
             X_group = group_df[["Time"]]
             y_group = group_df["Intensity"]
 
-            X_train, X_test, y_train, y_test = train_test_split(X_group, y_group, test_size=test_size, shuffle=False)
+            X_train, X_test, y_train, y_test = train_test_split(X_group, y_group, test_size=train_size, shuffle=False)
             model = LinearRegression()
             model.fit(X_train, y_train)
 
@@ -113,7 +114,7 @@ def time_series_linear_regression(
             })
 
     else:
-        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, shuffle=False)
+        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=train_size, shuffle=False)
         model = LinearRegression()
         model.fit(X_train, y_train)
 
@@ -207,21 +208,21 @@ def time_series_ransac_regression(
         input_df: pd.DataFrame,
         metadata_df: pd.DataFrame,
         protein_group: str,
+        train_size: float,
         grouping: str,
-        test_size: float,
 ):
     """
     Perform RANSAC regression on the time series data for a given protein group.
     :param input_df: Peptide dataframe which contains the intensity of each sample
     :param metadata_df: Metadata dataframe which contains the timestamps
     :param protein_group: Protein group to perform the analysis on
-    :param test_size: The proportion of the dataset to include in the test split
+    :param train_size: The proportion of the dataset to include in the test split
 
     :return: A dictionary containing the root mean squared error and r2 score for the training and test sets
     """
 
     color_index = 0
-    if test_size < 0 or test_size > 1:
+    if train_size < 0 or train_size > 1:
         raise ValueError("Test size should be between 0 and 1")
 
     input_df = input_df[input_df['Protein ID'] == protein_group]
@@ -233,7 +234,7 @@ def time_series_ransac_regression(
         copy=False,
     )
 
-    input_df["Time"] = input_df["Time"].apply(convert_time_to_datetime)
+    input_df["Time"] = input_df["Time"].apply(convert_time_to_hours)
     input_df = input_df.interpolate(method='linear', axis=0)
     X = input_df[["Time"]]
     y = input_df["Intensity"]
@@ -249,7 +250,7 @@ def time_series_ransac_regression(
             X_group = group_df[["Time"]]
             y_group = group_df["Intensity"]
 
-            X_train, X_test, y_train, y_test = train_test_split(X_group, y_group, test_size=test_size, shuffle=False)
+            X_train, X_test, y_train, y_test = train_test_split(X_group, y_group, test_size=train_size, shuffle=False)
             model = RANSACRegressor(base_estimator=LinearRegression())
             model.fit(X_train, y_train)
 
@@ -305,7 +306,7 @@ def time_series_ransac_regression(
             })
 
     else:
-        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, shuffle=False)
+        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=train_size, shuffle=False)
         model = RANSACRegressor(base_estimator=LinearRegression())
         model.fit(X_train, y_train)
 
@@ -476,3 +477,174 @@ def adfuller_test(
         messages=messages,
     )
 
+
+def time_series_auto_arima(
+    input_df: pd.DataFrame,
+    metadata_df: pd.DataFrame,
+    protein_group: str,
+    seasonal: str,
+    m: int,
+    train_size: float,
+    forecast_steps: int,
+    grouping: str,
+) -> dict:
+    """
+    Perform an automatic ARIMA model selection on the time series data for a given protein group.
+
+    The first ``train_size`` share of the rows is used to fit the model. The forecast is compared
+    position by position with the rows that follow the training part.
+
+    :param input_df: Peptide dataframe which contains the intensity of each sample
+    :param metadata_df: Metadata dataframe which contains the timestamps
+    :param protein_group: Protein group to perform the analysis on
+    :param seasonal: "Yes" to fit a seasonal ARIMA model, any other value fits a non-seasonal one
+    :param m: The number of time steps for a single seasonal period (ignored if seasonal=False)
+    :param train_size: The proportion of the dataset to include in the training split
+    :param forecast_steps: The number of steps to forecast
+    :param grouping: Accepted for parity with the regression methods; currently unused here
+
+    :return: A dictionary with "scores" (RMSE and r2 score for the training and test sets) and "plots"
+    :raises ValueError: If train_size is not between 0 and 1
+    """
+    if train_size < 0 or train_size > 1:
+        raise ValueError("Train size should be between 0 and 1")
+    # The choice arrives as a string; auto_arima is given a bool.
+    seasonal = seasonal == "Yes"
+
+    input_df = input_df[input_df['Protein ID'] == protein_group]
+
+    input_df = pd.merge(
+        left=input_df,
+        right=metadata_df,
+        on="Sample",
+        copy=False,
+    )
+
+    input_df["Time"] = input_df["Time"].apply(convert_time_to_hours)
+    input_df.set_index("Time", inplace=True)
+    input_df = input_df.interpolate(method='linear', axis=0)
+
+    data = input_df["Intensity"]
+
+    # "Time" is a float index, so a plain ``data[:n]`` would slice by label (hours) instead of by
+    # position. Split by row count with .iloc instead.
+    n_train = int(len(data) * train_size)
+    train, test = data.iloc[:n_train], data.iloc[n_train:]
+
+    # Fit the ARIMA model
+    model = auto_arima(
+        train,
+        seasonal=seasonal,
+        m=m,
+        trace=True,
+        error_action='ignore',
+        suppress_warnings=True,
+        stepwise=True,
+    )
+
+    # The forecast starts right after the training rows, i.e. it lines up with the test rows by
+    # position. Plain arrays are used so that pandas does not try to align on index labels.
+    forecast = np.asarray(model.predict(n_periods=forecast_steps))
+    test_for_comparison = test.iloc[: int(forecast_steps)]
+    actual = test_for_comparison.to_numpy()
+    forecast_for_comparison = forecast[: len(actual)]
+
+    train_prediction = model.predict_in_sample()
+    test_rmse = np.sqrt(mean_squared_error(actual, forecast_for_comparison))
+    test_r2 = r2_score(actual, forecast_for_comparison)
+    train_rmse = np.sqrt(mean_squared_error(train, train_prediction))
+    train_r2 = r2_score(train, train_prediction)
+
+    fig = make_subplots(rows=1, cols=2, column_widths=[0.7, 0.3])
+
+    scores = []
+
+    plot_df = pd.DataFrame({
+        'Time': test_for_comparison.index.to_numpy(),
+        'Intensity': actual,
+        'Predicted': forecast_for_comparison,
+        'Inlier': np.abs(actual - forecast_for_comparison) < (1.5 * np.std(actual)),
+    })
+
+    fig.add_trace(go.Scatter(
+        x=plot_df['Time'],
+        y=plot_df['Intensity'],
+        mode='markers',
+        name='Actual Intensity',
+        marker=dict(color='blue')
+    ), row=1, col=1)
+
+    fig.add_trace(go.Scatter(
+        x=plot_df['Time'],
+        y=plot_df['Predicted'],
+        mode='lines',
+        name='Predicted Intensity',
+        line=dict(color='red')
+    ), row=1, col=1)
+
+    fig.add_trace(go.Scatter(
+        x=plot_df[plot_df['Inlier'] == False]['Time'],
+        y=plot_df[plot_df['Inlier'] == False]['Intensity'],
+        mode='markers',
+        name='Outliers',
+        marker=dict(color='green')
+    ), row=1, col=1)
+
+    scores.append({
+        'group': 'Overall',
+        'train_root_mean_squared': train_rmse,
+        'test_root_mean_squared': test_rmse,
+        'train_r2_score': train_r2,
+        'test_r2_score': test_r2,
+    })
+
+    # Add annotation text as a separate trace in the subplot
+    annotation_text = "<br>".join([
+        f"Group: {res['group']}<br>Train RMSE: {res['train_root_mean_squared']:.3f}<br>"
+        f"Test RMSE: {res['test_root_mean_squared']:.3f}<br>"
+        f"Train R²: {res['train_r2_score']:.3f}<br>"
+        f"Test R²: {res['test_r2_score']:.3f}"
+        for res in scores
+    ])
+
+    fig.add_trace(go.Scatter(
+        x=[0],
+        y=[0.25],
+        text=[annotation_text],
+        mode='text',
+        textfont=dict(size=12),
+        showlegend=False
+    ), row=1, col=2)
+
+    fig.update_layout(
+        title=f"Intensity over Time for {protein_group}",
+        plot_bgcolor=colors["plot_bgcolor"],
+        xaxis_gridcolor=colors["gridcolor"],
+        yaxis_gridcolor=colors["gridcolor"],
+        xaxis_linecolor=colors["linecolor"],
+        yaxis_linecolor=colors["linecolor"],
+        xaxis_title="Time",
+        yaxis_title="Intensity",
+        legend_title="Legend",
+        autosize=True,
+        margin=dict(l=100, r=100, t=100, b=50),
+        legend=dict(
+            yanchor="top",
+            y=0.95,
+            xanchor="right",
+            x=0.825
+        )
+    )
+
+    fig.update_xaxes(showticklabels=False, showgrid=False, zeroline=False, row=1, col=2)
+    fig.update_yaxes(showticklabels=False, showgrid=False, zeroline=False, row=1, col=2)
+
+    fig.update_annotations(font_size=12)
+
+    # The figure is handed back to the caller for rendering; calling fig.show() here would display
+    # it as a side effect of every run.
+
+    return dict(
+        scores=scores,
+        plots=[fig],
+    )
diff --git a/protzilla/methods/data_analysis.py b/protzilla/methods/data_analysis.py
index 1c3f7093..ad84a523 100644
--- a/protzilla/methods/data_analysis.py
+++ b/protzilla/methods/data_analysis.py
@@ -14,7 +14,12 @@
 )
 from protzilla.data_analysis.differential_expression_t_test import t_test
 from protzilla.data_analysis.dimension_reduction import t_sne, umap
-from protzilla.data_analysis.time_series_regression_analysis import time_series_linear_regression, time_series_ransac_regression, adfuller_test
+from protzilla.data_analysis.time_series_regression_analysis import (
+    time_series_linear_regression,
+    time_series_ransac_regression,
+     adfuller_test,
+    time_series_auto_arima,
+)
 from protzilla.data_analysis.ptm_analysis import filter_peptides_of_protein, ptms_per_sample, \
     ptms_per_protein_and_sample
 from protzilla.data_analysis.model_evaluation import evaluate_classification_model
@@ -809,8 +814,8 @@ class TimeSeriesLinearRegression(PlotStep):
         "input_df",
         "metadata_df",
         "protein_group",
+        "train_size",
         "grouping",
-        "test_size",
     ]
     output_keys = [
         "scores",
@@ -834,8 +839,8 @@ class TimeSeriesRANSACRegression(PlotStep):
         "input_df",
         "metadata_df",
         "protein_group",
+        "train_size",
         "grouping",
-        "test_size",
     ]
     output_keys = [
         "scores",
@@ -876,6 +881,36 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict:
         return inputs
 
 
+class TimeSeriesAutoARIMA(PlotStep):  # workflow step that wraps time_series_auto_arima
+    display_name = "Auto ARIMA (AutoRegressive Integrated Moving Average)"
+    operation = "Time series analysis"
+    method_description = (
+        "Perform Auto ARIMA on the time series data for a given protein group."
+    )
+
+    input_keys = [  # names match the keyword parameters of time_series_auto_arima
+        "input_df",
+        "metadata_df",
+        "protein_group",
+        "seasonal",
+        "m",
+        "train_size",
+        "forecast_steps",
+        "grouping",
+    ]
+    output_keys = [
+        "scores",
+    ]
+
+    def method(self, inputs: dict) -> dict:
+        return time_series_auto_arima(**inputs)  # inputs are forwarded as keyword arguments
+
+    def insert_dataframes(self, steps: StepManager, inputs) -> dict:
+        inputs["input_df"] = steps.get_step_output(Step, "peptide_df", inputs["input_df"])  # NOTE(review): presumably resolves the selected step identifier to its peptide_df - confirm
+        inputs["metadata_df"] = steps.metadata_df
+        return inputs
+
+
 class PTMsPerSample(DataAnalysisStep):
     display_name = "PTMs per Sample"
     operation = "Peptide analysis"
diff --git a/requirements.txt b/requirements.txt
index bc175e2a..e7f0c7ed 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -21,6 +21,7 @@ restring==0.1.20
 scikit-learn==1.2.2
 scipy==1.10.1
 statsmodels==0.13.5
+pmdarima==2.0.4
 umap-learn==0.5.3
 Werkzeug==2.2.3
 numba==0.57.0
diff --git a/ui/runs/form_mapping.py b/ui/runs/form_mapping.py
index bca07db4..a1bcd37b 100644
--- a/ui/runs/form_mapping.py
+++ b/ui/runs/form_mapping.py
@@ -69,6 +69,7 @@
     data_analysis.TimeSeriesLinearRegression: data_analysis_forms.TimeSeriesLinearRegressionForm,
     data_analysis.TimeSeriesRANSACRegression: data_analysis_forms.TimeSeriesRANSACRegressionForm,
     data_analysis.TimeSeriesADFullerTest: data_analysis_forms.TimeSeriesADFullerTestForm,
+    data_analysis.TimeSeriesAutoARIMA: data_analysis_forms.TimeSeriesAutoARIMAForm,
     data_preprocessing.ImputationByMinPerSample: data_preprocessing_forms.ImputationByMinPerSampleForms,
     data_integration.EnrichmentAnalysisGOAnalysisWithString: data_integration_forms.EnrichmentAnalysisGOAnalysisWithStringForm,
     data_integration.EnrichmentAnalysisGOAnalysisWithEnrichr: data_integration_forms.EnrichmentAnalysisGOAnalysisWithEnrichrForm,
diff --git a/ui/runs/forms/data_analysis.py b/ui/runs/forms/data_analysis.py
index 77b7384a..960dd33e 100644
--- a/ui/runs/forms/data_analysis.py
+++ b/ui/runs/forms/data_analysis.py
@@ -1229,18 +1229,18 @@ class TimeSeriesLinearRegressionForm(MethodForm):
         choices=[],
         label="Protein group: which protein group to perform the linear regression on",
     )
-    grouping = CustomChoiceField(
-        choices= TimeSeriesGrouping,
-        label="Option to select whether regression should be performed on the entire dataset or separately on the control and experimental groups",
-        initial=TimeSeriesGrouping.with_grouping
-    )
-    test_size = CustomFloatField(
-        label="Test size: proportion of the dataset to include in the test split",
+    train_size = CustomFloatField(
+        label="Train size: proportion of the dataset to include in the test split",
         min_value=0,
         max_value=1,
         step_size=0.1,
         initial=0.2
     )
+    grouping = CustomChoiceField(
+        choices= TimeSeriesGrouping,
+        label="Option to select whether regression should be performed on the entire dataset or separately on the control and experimental groups",
+        initial=TimeSeriesGrouping.with_grouping
+    )
 
 
     def fill_form(self, run: Run) -> None:
@@ -1270,18 +1270,18 @@ class TimeSeriesRANSACRegressionForm(MethodForm):
         choices=[],
         label="Protein group: which protein group to perform the RANSAC regression on",
     )
-    grouping = CustomChoiceField(
-        choices= TimeSeriesGrouping,
-        label="Option to select whether regression should be performed on the entire dataset or separately on the control and experimental groups",
-        initial=TimeSeriesGrouping.with_grouping
-    )
-    test_size = CustomFloatField(
-        label="Test size: proportion of the dataset to include in the test split",
+    train_size = CustomFloatField(
+        label="Train size: proportion of the dataset to include in the test split",
         min_value=0,
         max_value=1,
         step_size=0.1,
         initial=0.2
     )
+    grouping = CustomChoiceField(
+        choices= TimeSeriesGrouping,
+        label="Option to select whether regression should be performed on the entire dataset or separately on the control and experimental groups",
+        initial=TimeSeriesGrouping.with_grouping
+    )
 
 
     def fill_form(self, run: Run) -> None:
@@ -1328,6 +1328,73 @@ class TimeSeriesADFullerTestForm(MethodForm):
         initial=0.05
     )
 
+    def fill_form(self, run: Run) -> None:  # fills the choices of input_df and protein_group from the run
+        self.fields["input_df"].choices = fill_helper.get_choices_for_peptide_df_steps(
+            run
+        )
+        input_df_instance_id = self.data.get(  # submitted value, else the first available choice
+            "input_df", self.fields["input_df"].choices[0][0]  # NOTE(review): evaluated eagerly; IndexError when there are no choices
+        )
+
+        self.fields["protein_group"].choices = fill_helper.to_choices(
+            run.steps.get_step_output(
+                step_type=Step,
+                output_key="peptide_df",
+                instance_identifier=input_df_instance_id,
+            )["Protein ID"].unique()  # distinct protein IDs of the selected peptide dataframe
+        )
+
+
+class TimeSeriesAutoARIMAForm(MethodForm):
+    is_dynamic = True
+    model_info = TextDisplayField(  # static explanatory text; the adjacent literals below are concatenated
+        label="Information about the AutoARIMA model",
+        text=(
+            "Auto ARIMA is a function that automatically selects the best-fitting ARIMA model for a time series "
+            "by iterating over multiple combinations of model parameters to minimize an information criterion like AIC (Akaike Information Criterion). "
+            "It simplifies the model selection process, handling both seasonal and non-seasonal data,"
+            " and helps in making accurate forecasts."
+        ),
+    )
+    input_df = CustomChoiceField(
+        choices=[],
+        label="Peptide dataframe",
+    )
+    protein_group = CustomChoiceField(
+        choices=[],
+        label="Protein group: which protein group to perform the AutoARIMA on",
+    )
+    seasonal = CustomChoiceField(
+        choices=YesNo,
+        label="Seasonal: Whether the ARIMA model should be seasonal",
+        initial=YesNo.no
+    )
+    m = CustomNumberField(
+        label = "The number of time steps for a single seasonal period (ignored if seasonal=No)",
+        min_value=1,
+        step_size=1,
+        initial=1,
+    )
+    train_size = CustomFloatField(  # share of the rows that time_series_auto_arima uses for fitting
+        label="Train size: proportion of the dataset to include in the training split",
+        min_value=0,
+        max_value=1,
+        step_size=0.1,
+        initial=0.8,
+    )
+    forecast_steps = CustomNumberField(
+        label="Number of steps to forecast",
+        min_value=1,
+        step_size=1,
+        initial=10
+    )
+    grouping = CustomChoiceField(
+        choices= TimeSeriesGrouping,
+        label="Option to select whether regression should be performed on the entire dataset or separately on the control and experimental groups",
+        initial=TimeSeriesGrouping.with_grouping
+    )
+
+
     def fill_form(self, run: Run) -> None:
         self.fields["input_df"].choices = fill_helper.get_choices_for_peptide_df_steps(
             run

From 2e22aa6df4c0cdaa02fdc6ff9280e0beea5d0905 Mon Sep 17 00:00:00 2001
From: AK <kuganash.ravishanker@student.hpi.de>
Date: Thu, 8 Aug 2024 10:51:01 +0200
Subject: [PATCH 32/52] implemented Auto ARIMA

---
 .../time_series_regression_analysis.py        | 232 +++++++++++-------
 protzilla/methods/data_analysis.py            |   4 +-
 ui/runs/forms/data_analysis.py                |  32 ++-
 3 files changed, 175 insertions(+), 93 deletions(-)

diff --git a/protzilla/data_analysis/time_series_regression_analysis.py b/protzilla/data_analysis/time_series_regression_analysis.py
index 0e0a43ec..1646b532 100644
--- a/protzilla/data_analysis/time_series_regression_analysis.py
+++ b/protzilla/data_analysis/time_series_regression_analysis.py
@@ -56,6 +56,9 @@ def time_series_linear_regression(
 
     input_df["Time"] = input_df["Time"].apply(convert_time_to_hours)
     input_df = input_df.interpolate(method='linear', axis=0)
+
+    input_df = input_df.sample(frac=1, random_state = 42).reset_index(drop=True)
+
     X = input_df[["Time"]]
     y = input_df["Intensity"]
 
@@ -114,7 +117,7 @@ def time_series_linear_regression(
             })
 
     else:
-        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=train_size, shuffle=False)
+        X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=train_size, shuffle=False)
         model = LinearRegression()
         model.fit(X_train, y_train)
 
@@ -156,10 +159,9 @@ def time_series_linear_regression(
 
     # Add annotation text as a separate trace in the subplot
     annotation_text = "<br>".join([
-        f"Group: {res['group']}<br>Train RMSE: {res['train_root_mean_squared']:.3f}<br>"
-        f"Test RMSE: {res['test_root_mean_squared']:.3f}<br>"
-        f"Train R²: {res['train_r2_score']:.3f}<br>"
-        f"Test R²: {res['test_r2_score']:.3f}"
+        f"Group: {res['group']} (Train/Test)"
+        f"<br>RMSE: {res['train_root_mean_squared']:.3f} / {res['test_root_mean_squared']:.3f}<br>"
+        f"Train R²: {res['train_r2_score']:.3f} / {res['test_r2_score']:.3f}<br>"
         for res in scores
     ])
 
@@ -188,7 +190,7 @@ def time_series_linear_regression(
             yanchor="top",
             y=0.95,
             xanchor="right",
-            x=0.825
+            x=0.8
         )
     )
 
@@ -208,6 +210,9 @@ def time_series_ransac_regression(
         input_df: pd.DataFrame,
         metadata_df: pd.DataFrame,
         protein_group: str,
+        max_trials: int,
+        stop_probability: float,
+        loss: str,
         train_size: float,
         grouping: str,
 ):
@@ -236,6 +241,9 @@ def time_series_ransac_regression(
 
     input_df["Time"] = input_df["Time"].apply(convert_time_to_hours)
     input_df = input_df.interpolate(method='linear', axis=0)
+
+    input_df = input_df.sample(frac=1, random_state = 42).reset_index(drop=True)
+
     X = input_df[["Time"]]
     y = input_df["Intensity"]
 
@@ -250,8 +258,8 @@ def time_series_ransac_regression(
             X_group = group_df[["Time"]]
             y_group = group_df["Intensity"]
 
-            X_train, X_test, y_train, y_test = train_test_split(X_group, y_group, test_size=train_size, shuffle=False)
-            model = RANSACRegressor(base_estimator=LinearRegression())
+            X_train, X_test, y_train, y_test = train_test_split(X_group, y_group, train_size=train_size, shuffle=False)
+            model = RANSACRegressor(max_trials = max_trials, stop_probability = stop_probability, loss = loss, base_estimator=LinearRegression())
             model.fit(X_train, y_train)
 
             inlier_mask = model.inlier_mask_
@@ -361,10 +369,9 @@ def time_series_ransac_regression(
 
     # Add annotation text as a separate trace in the subplot
     annotation_text = "<br>".join([
-        f"Group: {res['group']}<br>Train RMSE: {res['train_root_mean_squared']:.3f}<br>"
-        f"Test RMSE: {res['test_root_mean_squared']:.3f}<br>"
-        f"Train R²: {res['train_r2_score']:.3f}<br>"
-        f"Test R²: {res['test_r2_score']:.3f}"
+        f"Group: {res['group']} (Train/Test)"
+        f"<br>RMSE: {res['train_root_mean_squared']:.3f} / {res['test_root_mean_squared']:.3f}<br>"
+        f"Train R²: {res['train_r2_score']:.3f} / {res['test_r2_score']:.3f}<br>"
         for res in scores
     ])
 
@@ -393,7 +400,7 @@ def time_series_ransac_regression(
             yanchor="top",
             y=0.95,
             xanchor="right",
-            x=0.825
+            x=0.8
         )
     )
 
@@ -485,7 +492,6 @@ def time_series_auto_arima(
     seasonal: str,
     m: int,
     train_size: float,
-    forecast_steps: int,
     grouping: str,
 ) -> dict:
     """
@@ -511,6 +517,7 @@ def time_series_auto_arima(
         seasonal = False
 
     input_df = input_df[input_df['Protein ID'] == protein_group]
+    input_df = input_df.sample(frac=1, random_state=42).reset_index(drop=True)
 
     input_df = pd.merge(
         left=input_df,
@@ -519,92 +526,150 @@ def time_series_auto_arima(
         copy=False,
     )
 
-    input_df["Time"] = input_df["Time"].apply(convert_time_to_hours)
-    input_df.set_index("Time", inplace=True)
-    input_df = input_df.interpolate(method='linear', axis=0)
+    fig = make_subplots(rows=1, cols=2, column_widths=[0.7, 0.3])
+    scores = []
 
-    data = input_df["Intensity"]
+    if grouping == "With Grouping" and "Group" in input_df.columns:
+        groups = input_df["Group"].unique()
+        for group in groups:
+            group_df = input_df[input_df["Group"] == group]
 
-    train_size = int(len(data) * train_size)
-    train, test = data[:train_size], data[train_size:]
+            group_df["Time"] = group_df["Time"].apply(convert_time_to_hours)
+            group_df = group_df.interpolate(method='linear', axis=0)
 
-    # Fit the ARIMA model
-    model = auto_arima(
-        train,
-        seasonal=seasonal,
-        m=m,
-        trace=True,
-        error_action='ignore',
-        suppress_warnings=True,
-        stepwise=True,
-    )
+            train_df_size = int(len(group_df) * train_size)
+            train_df, test_df = group_df[:train_df_size], group_df[train_df_size:]
 
-    # Forecast the test set
-    forecast = model.predict(n_periods=forecast_steps)
+            train_df = train_df.set_index("Time")["Intensity"]
+            test_df = test_df.set_index("Time")["Intensity"]
 
-    last_time = data.index[-1] +1
-    forecast_index = np.arange(last_time, last_time + forecast_steps)
-    forecast_series = pd.Series(forecast, index=forecast_index)
+            # Fit the ARIMA model
+            model = auto_arima(
+                train_df,
+                seasonal=seasonal,
+                m=m,
+                trace=True,
+                error_action='ignore',
+                suppress_warnings=True,
+                stepwise=True,
+            )
 
-    test_for_comparison = test[:forecast_steps]
-    forecast_for_comparison = forecast_series[: len(test_for_comparison)]
+            # Forecast the test set
+            forecast = model.predict(n_periods=test_df.shape[0])
 
+            test_rmse = np.sqrt(mean_squared_error(test_df, forecast))
+            test_r2 = r2_score(test_df, forecast)
+            train_rmse = np.sqrt(mean_squared_error(train_df, model.predict_in_sample()))
+            train_r2 = r2_score(train_df, model.predict_in_sample())
 
+            forecast_reset = forecast.reset_index(drop=True)
+            forecast_plot = pd.Series(forecast_reset.values, index=test_df.index)
+            forecast_plot = forecast_plot.groupby(forecast_plot.index).mean()
 
-    test_rmse = np.sqrt(mean_squared_error(test_for_comparison, forecast_for_comparison))
-    test_r2 = r2_score(test_for_comparison, forecast_for_comparison)
-    train_rmse = np.sqrt(mean_squared_error(train, model.predict_in_sample()))
-    train_r2 = r2_score(train, model.predict_in_sample())
+            fig.add_trace(go.Scatter(
+                x=test_df.index,
+                y=test_df,
+                mode='markers',
+                name='Actual Intensity',
+                marker=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[color_index])
+            ), row=1, col=1)
 
-    fig = make_subplots(rows=1, cols=2, column_widths=[0.7, 0.3])
+            fig.add_trace(go.Scatter(
+                x=test_df.index,
+                y=forecast,
+                mode='markers',
+                name='Predicted Intensity',
+                line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[color_index + 1])
+            ), row=1, col=1)
 
-    scores = []
+            fig.add_trace(go.Scatter(
+                x = forecast_plot.index,
+                y = forecast_plot,
+                mode = 'lines',
+                name = 'Mean Predicted Intensity',
+                line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[color_index + 2])
+            ), row=1, col=1)
 
-    plot_df = pd.DataFrame({
-        'Time': test.index[:forecast_steps],
-        'Intensity': test[:forecast_steps],
-        'Predicted': forecast_series,
-        'Inlier': np.abs(test[:forecast_steps] - forecast_series) < (1.5 * np.std(test[:forecast_steps]))
-    })
+            color_index += 3
 
-    fig.add_trace(go.Scatter(
-        x=plot_df['Time'],
-        y=plot_df['Intensity'],
-        mode='markers',
-        name='Actual Intensity',
-        marker=dict(color='blue')
-    ), row=1, col=1)
+            scores.append({
+                'group': group,
+                 'train_root_mean_squared': train_rmse,
+                'test_root_mean_squared': test_rmse,
+                'train_r2_score': train_r2,
+                'test_r2_score': test_r2,
+            })
 
-    fig.add_trace(go.Scatter(
-        x=plot_df['Time'],
-        y=plot_df['Predicted'],
-        mode='lines',
-        name='Predicted Intensity',
-        line=dict(color='red')
-    ), row=1, col=1)
+    else:
+        input_df["Time"] = input_df["Time"].apply(convert_time_to_hours)
+        input_df = input_df.interpolate(method='linear', axis=0)
+
+        train_size = int(len(input_df) * train_size)
+        train_df, test_df = input_df[:train_size], input_df[train_size:]
+
+        train_df = train_df.set_index("Time")["Intensity"]
+        test_df = test_df.set_index("Time")["Intensity"]
+
+        # Fit the ARIMA model
+        model = auto_arima(
+            train_df,
+            seasonal=seasonal,
+            m=m,
+            trace=True,
+            error_action='ignore',
+            suppress_warnings=True,
+            stepwise=True,
+        )
 
-    fig.add_trace(go.Scatter(
-        x=plot_df[plot_df['Inlier'] == False]['Time'],
-        y=plot_df[plot_df['Inlier'] == False]['Intensity'],
-        mode='markers',
-        name='Outliers',
-        marker=dict(color='green')
-    ), row=1, col=1)
-
-    scores.append({
-        'group': 'Overall',
-         'train_root_mean_squared': train_rmse,
-        'test_root_mean_squared': test_rmse,
-        'train_r2_score': train_r2,
-        'test_r2_score': test_r2,
-    })
+        # Forecast the test set
+        forecast = model.predict(n_periods=test_df.shape[0])
+
+        test_rmse = np.sqrt(mean_squared_error(test_df, forecast))
+        test_r2 = r2_score(test_df, forecast)
+        train_rmse = np.sqrt(mean_squared_error(train_df, model.predict_in_sample()))
+        train_r2 = r2_score(train_df, model.predict_in_sample())
+
+        forecast_reset = forecast.reset_index(drop=True)
+        forecast_plot = pd.Series(forecast_reset.values, index=test_df.index)
+        forecast_plot = forecast_plot.groupby(forecast_plot.index).mean()
+
+        fig.add_trace(go.Scatter(
+            x=test_df.index,
+            y=test_df,
+            mode='markers',
+            name='Actual Intensity',
+            marker=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[0])
+        ), row=1, col=1)
+
+        fig.add_trace(go.Scatter(
+            x=test_df.index,
+            y=forecast,
+            mode='markers',
+            name='Predicted Intensity',
+            line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[2])
+        ), row=1, col=1)
+
+        fig.add_trace(go.Scatter(
+            x=forecast_plot.index,
+            y=forecast_plot,
+            mode='lines',
+            name='Mean Predicted Intensity',
+            line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[3])
+        ), row=1, col=1)
+
+        scores.append({
+            'group': 'Overall',
+            'train_root_mean_squared': train_rmse,
+            'test_root_mean_squared': test_rmse,
+            'train_r2_score': train_r2,
+            'test_r2_score': test_r2,
+        })
 
     # Add annotation text as a separate trace in the subplot
     annotation_text = "<br>".join([
-        f"Group: {res['group']}<br>Train RMSE: {res['train_root_mean_squared']:.3f}<br>"
-        f"Test RMSE: {res['test_root_mean_squared']:.3f}<br>"
-        f"Train R²: {res['train_r2_score']:.3f}<br>"
-        f"Test R²: {res['test_r2_score']:.3f}"
+        f"Group: {res['group']} (Train/Test)"
+        f"<br>RMSE: {res['train_root_mean_squared']:.3f} / {res['test_root_mean_squared']:.3f}<br>"
+        f"Train R²: {res['train_r2_score']:.3f} / {res['test_r2_score']:.3f}<br>"
         for res in scores
     ])
 
@@ -633,7 +698,7 @@ def time_series_auto_arima(
             yanchor="top",
             y=0.95,
             xanchor="right",
-            x=0.825
+            x=0.775
         )
     )
 
@@ -642,7 +707,6 @@ def time_series_auto_arima(
 
     fig.update_annotations(font_size=12)
 
-    fig.show()
 
     return dict(
         scores=scores,
diff --git a/protzilla/methods/data_analysis.py b/protzilla/methods/data_analysis.py
index ad84a523..4733ff90 100644
--- a/protzilla/methods/data_analysis.py
+++ b/protzilla/methods/data_analysis.py
@@ -839,6 +839,9 @@ class TimeSeriesRANSACRegression(PlotStep):
         "input_df",
         "metadata_df",
         "protein_group",
+        "max_trials",
+        "stop_probability",
+        "loss",
         "train_size",
         "grouping",
     ]
@@ -895,7 +898,6 @@ class TimeSeriesAutoARIMA(PlotStep):
         "seasonal",
         "m",
         "train_size",
-        "forecast_steps",
         "grouping",
     ]
     output_keys = [
diff --git a/ui/runs/forms/data_analysis.py b/ui/runs/forms/data_analysis.py
index 960dd33e..9d15c199 100644
--- a/ui/runs/forms/data_analysis.py
+++ b/ui/runs/forms/data_analysis.py
@@ -150,6 +150,10 @@ class TimeSeriesGrouping(Enum):
     with_grouping = "With Grouping"
     without_grouping = "Without Grouping"
 
+class TimeSeriesRANSACLoss(Enum):  # choices offered by the RANSAC form's "loss" field
+    absolute_error = "absolute_error"
+    squared_error = "squared_error"
+
 
 class DifferentialExpressionANOVAForm(MethodForm):
     is_dynamic = True
@@ -1234,7 +1238,7 @@ class TimeSeriesLinearRegressionForm(MethodForm):
         min_value=0,
         max_value=1,
         step_size=0.1,
-        initial=0.2
+        initial=0.8
     )
     grouping = CustomChoiceField(
         choices= TimeSeriesGrouping,
@@ -1270,12 +1274,30 @@ class TimeSeriesRANSACRegressionForm(MethodForm):
         choices=[],
         label="Protein group: which protein group to perform the RANSAC regression on",
     )
+    max_trials = CustomNumberField(
+        label="Max trials: the maximum number of iterations for random sample selection",
+        min_value=1,
+        step_size=1,
+        initial=100,
+    )
+    stop_probability = CustomFloatField(
+        label="Stop Probability: the probability that the algorithm stops after a certain number of iterations if at least one outlier-free set of the training data is sampled",
+        min_value=0,
+        max_value=1,
+        step_size=0.01,
+        initial=0.99
+    )
+    loss = CustomChoiceField(
+        choices= TimeSeriesRANSACLoss,
+        label="Loss function: the loss function to be used for fitting the linear model",
+        initial=TimeSeriesRANSACLoss.absolute_error,
+    )
     train_size = CustomFloatField(
         label="Train size: proportion of the dataset to include in the test split",
         min_value=0,
         max_value=1,
         step_size=0.1,
-        initial=0.2
+        initial=0.8
     )
     grouping = CustomChoiceField(
         choices= TimeSeriesGrouping,
@@ -1382,12 +1404,6 @@ class TimeSeriesAutoARIMAForm(MethodForm):
         step_size=0.1,
         initial=0.8,
     )
-    forecast_steps = CustomNumberField(
-        label="Number of steps to forecast",
-        min_value=1,
-        step_size=1,
-        initial=10
-    )
     grouping = CustomChoiceField(
         choices= TimeSeriesGrouping,
         label="Option to select whether regression should be performed on the entire dataset or separately on the control and experimental groups",

From 82a550ad211716584888984645609a3246e5d7c0 Mon Sep 17 00:00:00 2001
From: AK <kuganash.ravishanker@student.hpi.de>
Date: Thu, 8 Aug 2024 15:39:16 +0200
Subject: [PATCH 33/52] implemented ARIMA

---
 .../time_series_regression_analysis.py        | 226 +++++++++++++++++-
 protzilla/methods/data_analysis.py            |  32 +++
 ui/runs/form_mapping.py                       |   1 +
 ui/runs/forms/data_analysis.py                |  81 ++++++-
 4 files changed, 330 insertions(+), 10 deletions(-)

diff --git a/protzilla/data_analysis/time_series_regression_analysis.py b/protzilla/data_analysis/time_series_regression_analysis.py
index 1646b532..7777f833 100644
--- a/protzilla/data_analysis/time_series_regression_analysis.py
+++ b/protzilla/data_analysis/time_series_regression_analysis.py
@@ -437,8 +437,6 @@ def adfuller_test(
         - messages: A list of messages for the user.
     """
 
-    # TODO: Info box for the user
-
     messages = []
     input_df = input_df[input_df['Protein ID'] == protein_group]
 
@@ -502,7 +500,7 @@ def time_series_auto_arima(
     :param seasonal: Whether the ARIMA model should be seasonal
     :param m: The number of time steps for a single seasonal period (ignored if seasonal=False)
     :param train_size: The proportion of the dataset to include in the test split
-    :param forecast_steps: The number of steps to forecast
+    :param grouping: Whether to group the data by the 'Group' column
 
     :return: A dictionary containing the root mean squared error and r2 score for the training and test sets
     """
@@ -579,7 +577,7 @@ def time_series_auto_arima(
                 y=forecast,
                 mode='markers',
                 name='Predicted Intensity',
-                line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[color_index + 1])
+                line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[color_index + 2])
             ), row=1, col=1)
 
             fig.add_trace(go.Scatter(
@@ -712,3 +710,223 @@ def time_series_auto_arima(
         scores=scores,
         plots=[fig],
     )
+
+
+def time_series_arima(
+    input_df: pd.DataFrame,
+    metadata_df: pd.DataFrame,
+    protein_group: str,
+    seasonal: str,
+    p: int,
+    d: int,
+    q: int,
+    train_size: float,
+    grouping: str,
+) -> dict:
+
+    """
+    Fit an ARIMA(p, d, q) model to one protein group's intensities and score its forecast on a held-out split.
+    :param input_df: Peptide dataframe which contains the intensity of each sample
+    :param metadata_df: Metadata dataframe which contains the timestamps (merged with input_df on "Sample")
+    :param protein_group: 'Protein ID' value selecting the rows to analyse
+    :param seasonal: "Yes" or anything else; currently has no effect (both branches build the same model)
+    :param p: ARIMA p parameter (first element of the model's `order`)
+    :param d: ARIMA d parameter (second element of the model's `order`)
+    :param q: ARIMA q parameter (third element of the model's `order`)
+    :param train_size: Fraction (0 to 1) of the rows, taken from the start, used for fitting; the rest is the test set
+    :param grouping: "With Grouping" fits one model per 'Group' value (if that column exists); otherwise one overall model
+    :raises ValueError: if train_size is not between 0 and 1
+    :return: dict with "scores" (per group: train/test RMSE and R² values) and "plots" (list holding the figure)
+    """
+
+    color_index = 0
+
+    if train_size < 0 or train_size > 1:
+        raise ValueError("Train size should be between 0 and 1")
+
+    input_df = input_df[input_df['Protein ID'] == protein_group]
+    input_df = input_df.sample(frac=1, random_state=42).reset_index(drop=True)  # NOTE(review): rows are shuffled (fixed seed) before the positional train/test split — confirm intent
+
+    input_df = pd.merge(left=input_df, right=metadata_df, on="Sample", copy=False)
+
+    fig = make_subplots(rows=1, cols=2, column_widths=[0.7, 0.3])  # col 1: data traces, col 2: score text
+    scores = []
+
+    if grouping == "With Grouping" and "Group" in input_df.columns:
+        groups = input_df["Group"].unique()
+        for group in groups:
+            group_df = input_df[input_df["Group"] == group]
+
+            group_df["Time"] = group_df["Time"].apply(convert_time_to_hours)  # NOTE(review): assigns into a filtered slice; may trigger pandas' SettingWithCopyWarning
+            group_df = group_df.interpolate(method='linear', axis=0)
+            # Positional split: the leading train_size fraction of rows is used for fitting, the remainder for testing.
+            train_df_size = int(len(group_df) * train_size)
+            train_df, test_df = group_df[:train_df_size], group_df[train_df_size:]
+
+            train_df = train_df.set_index("Time")["Intensity"]
+            test_df = test_df.set_index("Time")["Intensity"]
+            # NOTE(review): both branches below build the same model, so `seasonal` is ignored here.
+            if seasonal == "Yes":
+                model = ARIMA(
+                    train_df,
+                    order=(p, d, q),
+                    #seasonal_order=(P, D, Q, m)
+                )
+            else:
+                model = ARIMA(
+                    train_df,
+                    order=(p, d, q)
+                )
+
+            model_fit = model.fit()
+
+            forecast = model_fit.forecast(steps=len(test_df))  # one forecast step per test row
+
+            test_rmse = np.sqrt(mean_squared_error(test_df, forecast))
+            test_r2 = r2_score(test_df, forecast)
+            train_rmse = np.sqrt(mean_squared_error(train_df, model_fit.fittedvalues))
+            train_r2 = r2_score(train_df, model_fit.fittedvalues)
+            # Re-label the forecast with the test timestamps; duplicates are averaged per timestamp for the line trace.
+            forecast_reset = forecast.reset_index(drop=True)
+            forecast_plot = pd.Series(forecast_reset.values, index=test_df.index)
+            forecast_mean_plot = forecast_plot.groupby(forecast_plot.index).mean()
+
+            fig.add_trace(go.Scatter(
+                x=test_df.index,
+                y=test_df,
+                mode='markers',
+                name='Actual Intensity',
+                marker=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[color_index])
+            ), row=1, col=1)
+
+            fig.add_trace(go.Scatter(
+                x=forecast_plot.index,
+                y=forecast_plot,
+                mode='markers',
+                name='Predicted Intensity',
+                line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[color_index + 2])  # NOTE(review): color set via `line` on a markers-only trace; presumably `marker` was intended — verify
+            ), row=1, col=1)
+
+            fig.add_trace(go.Scatter(
+                x = forecast_mean_plot.index,
+                y = forecast_mean_plot,
+                mode = 'lines',
+                name = 'Mean Predicted Intensity',
+                line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[color_index + 2])
+            ), row=1, col=1)
+
+            color_index += 3  # this group used palette indices color_index and color_index + 2
+
+            scores.append({
+                'group': group,
+                'train_root_mean_squared': train_rmse,
+                'test_root_mean_squared': test_rmse,
+                'train_r2_score': train_r2,
+                'test_r2_score': test_r2,
+            })
+
+    else:
+        input_df["Time"] = input_df["Time"].apply(convert_time_to_hours)
+        input_df = input_df.interpolate(method='linear', axis=0)
+
+        train_size = int(len(input_df) * train_size)  # rebinds train_size from a fraction to a row count
+        train_df, test_df = input_df[:train_size], input_df[train_size:]
+
+        train_df = train_df.set_index("Time")["Intensity"]
+        test_df = test_df.set_index("Time")["Intensity"]
+        # NOTE(review): both branches below build the same model, so `seasonal` is ignored here.
+        if seasonal == "Yes":
+            model = ARIMA(train_df, order=(p, d, q))
+        else:
+            model = ARIMA(train_df, order=(p, d, q))
+
+        model_fit = model.fit()
+
+        forecast = model_fit.forecast(steps=len(test_df))  # one forecast step per test row
+
+        test_rmse = np.sqrt(mean_squared_error(test_df, forecast))
+        test_r2 = r2_score(test_df, forecast)
+        train_rmse = np.sqrt(mean_squared_error(train_df, model_fit.fittedvalues))
+        train_r2 = r2_score(train_df, model_fit.fittedvalues)
+        # Mean forecast per test timestamp, used for the line trace; the raw forecast is plotted as markers.
+        forecast_reset = forecast.reset_index(drop=True)
+        forecast_plot = pd.Series(forecast_reset.values, index=test_df.index)
+        forecast_plot = forecast_plot.groupby(forecast_plot.index).mean()
+
+        fig.add_trace(go.Scatter(
+            x=test_df.index,
+            y=test_df,
+            mode='markers',
+            name='Actual Intensity',
+            marker=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[0])
+        ), row=1, col=1)
+
+        fig.add_trace(go.Scatter(
+            x=test_df.index,
+            y=forecast,
+            mode='markers',
+            name='Predicted Intensity',
+            line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[2])  # NOTE(review): color set via `line` on a markers-only trace — verify
+        ), row=1, col=1)
+
+        fig.add_trace(go.Scatter(
+            x=forecast_plot.index,
+            y=forecast_plot,
+            mode='lines',
+            name='Mean Predicted Intensity',
+            line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[3])
+        ), row=1, col=1)
+
+        scores.append({
+            'group': 'Overall',
+            'train_root_mean_squared': train_rmse,
+            'test_root_mean_squared': test_rmse,
+            'train_r2_score': train_r2,
+            'test_r2_score': test_r2,
+        })
+    # Build one summary entry per scored group and draw it as a text-only trace in the right-hand subplot.
+    annotation_text = "<br>".join([
+        f"Group: {res['group']} (Train/Test)"
+        f"<br>RMSE: {res['train_root_mean_squared']:.3f} / {res['test_root_mean_squared']:.3f}<br>"
+        f"Train R²: {res['train_r2_score']:.3f} / {res['test_r2_score']:.3f}<br>"
+        for res in scores
+    ])
+
+    fig.add_trace(go.Scatter(
+        x=[0],
+        y=[0.25],
+        text=[annotation_text],
+        mode='text',
+        textfont=dict(size=12),
+        showlegend=False
+    ), row=1, col=2)
+
+    fig.update_layout(
+        title=f"Intensity over Time for {protein_group}",
+        plot_bgcolor=colors["plot_bgcolor"],
+        xaxis_gridcolor=colors["gridcolor"],
+        yaxis_gridcolor=colors["gridcolor"],
+        xaxis_linecolor=colors["linecolor"],
+        yaxis_linecolor=colors["linecolor"],
+        xaxis_title="Time",
+        yaxis_title="Intensity",
+        legend_title="Legend",
+        autosize=True,
+        margin=dict(l=100, r=100, t=100, b=50),
+        legend=dict(
+            yanchor="top",
+            y=0.95,
+            xanchor="right",
+            x=0.775
+        )
+    )
+    # Hide tick labels, grid and zero line of the text-only subplot.
+    fig.update_xaxes(showticklabels=False, showgrid=False, zeroline=False, row=1, col=2)
+    fig.update_yaxes(showticklabels=False, showgrid=False, zeroline=False, row=1, col=2)
+
+    fig.update_annotations(font_size=12)
+
+    return dict(
+        scores=scores,
+        plots=[fig],
+    )
diff --git a/protzilla/methods/data_analysis.py b/protzilla/methods/data_analysis.py
index 4733ff90..26d73f77 100644
--- a/protzilla/methods/data_analysis.py
+++ b/protzilla/methods/data_analysis.py
@@ -19,6 +19,7 @@
     time_series_ransac_regression,
      adfuller_test,
     time_series_auto_arima,
+    time_series_arima,
 )
 from protzilla.data_analysis.ptm_analysis import filter_peptides_of_protein, ptms_per_sample, \
     ptms_per_protein_and_sample
@@ -913,6 +914,37 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict:
         return inputs
 
 
+class TimeSeriesARIMA(PlotStep):  # plot step that runs time_series_arima
+    display_name = "ARIMA (AutoRegressive Integrated Moving Average)"
+    operation = "Time series analysis"
+    method_description = (
+        "Perform ARIMA on the time series data for a given protein group."
+    )
+    # These names mirror the parameters of time_series_arima, which method() calls with **inputs.
+    input_keys = [
+        "input_df",
+        "metadata_df",
+        "protein_group",
+        "seasonal",
+        "p",
+        "d",
+        "q",
+        "train_size",
+        "grouping",
+    ]
+    output_keys = [
+        "scores",  # NOTE(review): time_series_arima also returns "plots"; presumably consumed by PlotStep — confirm
+    ]
+
+    def method(self, inputs: dict) -> dict:  # forwards the inputs unchanged as keyword arguments
+        return time_series_arima(**inputs)
+
+    def insert_dataframes(self, steps: StepManager, inputs) -> dict:  # resolves the dataframes the method needs
+        inputs["input_df"] = steps.get_step_output(Step, "peptide_df", inputs["input_df"])
+        inputs["metadata_df"] = steps.metadata_df
+        return inputs
+
+
 class PTMsPerSample(DataAnalysisStep):
     display_name = "PTMs per Sample"
     operation = "Peptide analysis"
diff --git a/ui/runs/form_mapping.py b/ui/runs/form_mapping.py
index a1bcd37b..8f4793f1 100644
--- a/ui/runs/form_mapping.py
+++ b/ui/runs/form_mapping.py
@@ -70,6 +70,7 @@
     data_analysis.TimeSeriesRANSACRegression: data_analysis_forms.TimeSeriesRANSACRegressionForm,
     data_analysis.TimeSeriesADFullerTest: data_analysis_forms.TimeSeriesADFullerTestForm,
     data_analysis.TimeSeriesAutoARIMA: data_analysis_forms.TimeSeriesAutoARIMAForm,
+    data_analysis.TimeSeriesARIMA: data_analysis_forms.TimeSeriesARIMAForm,
     data_preprocessing.ImputationByMinPerSample: data_preprocessing_forms.ImputationByMinPerSampleForms,
     data_integration.EnrichmentAnalysisGOAnalysisWithString: data_integration_forms.EnrichmentAnalysisGOAnalysisWithStringForm,
     data_integration.EnrichmentAnalysisGOAnalysisWithEnrichr: data_integration_forms.EnrichmentAnalysisGOAnalysisWithEnrichrForm,
diff --git a/ui/runs/forms/data_analysis.py b/ui/runs/forms/data_analysis.py
index 9d15c199..5e01e5d0 100644
--- a/ui/runs/forms/data_analysis.py
+++ b/ui/runs/forms/data_analysis.py
@@ -1332,7 +1332,9 @@ class TimeSeriesADFullerTestForm(MethodForm):
              "determines how strongly a time series is defined by a trend. The null hypothesis of the test is that the "
              "time series can be represented by a unit root, which implies that the time series is not stationary. "
              "The alternative hypothesis is that the time series is stationary. If the p-value is less than the "
-             "significance level, the null hypothesis can be rejected and the time series is considered stationary."
+             "significance level, the null hypothesis can be rejected and the time series is considered stationary.<br>"
+             "Dickey, D. & Fuller, Wayne. (1979). Distribution of the Estimators for Autoregressive Time Series With a Unit Root."
+             "JASA. Journal of the American Statistical Association. 74. 10.2307/2286348. "
         ),
     )
     input_df = CustomChoiceField(
@@ -1370,12 +1372,8 @@ def fill_form(self, run: Run) -> None:
 class TimeSeriesAutoARIMAForm(MethodForm):
     is_dynamic = True
     model_info = TextDisplayField(
-        label="Information about the AutoARIMA model",
+        label="Citation for AutoARIMA model",
         text=(
-            "Auto ARIMA is a function that automatically selects the best-fitting ARIMA model for a time series"
-            "by iterating over multiple combinations of model parameters to minimize an information criterion like AIC (Akaike Information Criterion)."
-            "It simplifies the model selection process, handling both seasonal and non-seasonal data,"
-            " and helps in making accurate forecasts."
         ),
     )
     input_df = CustomChoiceField(
@@ -1411,6 +1409,77 @@ class TimeSeriesAutoARIMAForm(MethodForm):
     )
 
 
+    def fill_form(self, run: Run) -> None:
+        self.fields["input_df"].choices = fill_helper.get_choices_for_peptide_df_steps(
+            run
+        )
+        input_df_instance_id = self.data.get(
+            "input_df", self.fields["input_df"].choices[0][0]
+        )
+
+        self.fields["protein_group"].choices = fill_helper.to_choices(
+            run.steps.get_step_output(
+                step_type=Step,
+                output_key="peptide_df",
+                instance_identifier=input_df_instance_id,
+            )["Protein ID"].unique()
+        )
+
+
+class TimeSeriesARIMAForm(MethodForm):
+    is_dynamic = True
+    """
+    model_info = TextDisplayField(
+        label="Citation for ARIMA model",
+        text=(
+        ),
+    )
+    """
+    input_df = CustomChoiceField(
+        choices=[],
+        label="Peptide dataframe",
+    )
+    protein_group = CustomChoiceField(
+        choices=[],
+        label="Protein group: which protein group to perform the AutoARIMA on",
+    )
+    seasonal = CustomChoiceField(
+        choices=YesNo,
+        label="Seasonal: Whether the ARIMA model should be seasonal",
+        initial=YesNo.no
+    )
+    p = CustomNumberField(
+        label = "The number of lag observations included in the model",
+        min_value=0,
+        step_size=1,
+        initial=1,
+    )
+    d = CustomNumberField(
+        label = "The number of times that the raw observations are differenced",
+        min_value=0,
+        step_size=1,
+        initial=1,
+    )
+    q = CustomNumberField(
+        label = "The size of the moving average window",
+        min_value=1,
+        step_size=1,
+        initial=1,
+    )
+    train_size = CustomFloatField(
+        label="Train size: proportion of the dataset to include in the test split",
+        min_value=0,
+        max_value=1,
+        step_size=0.1,
+        initial=0.8,
+    )
+    grouping = CustomChoiceField(
+        choices= TimeSeriesGrouping,
+        label="Option to select whether regression should be performed on the entire dataset or separately on the control and experimental groups",
+        initial=TimeSeriesGrouping.with_grouping
+    )
+
+
     def fill_form(self, run: Run) -> None:
         self.fields["input_df"].choices = fill_helper.get_choices_for_peptide_df_steps(
             run

From 5c0c157d4f3b899da7bfb239dd0db291f1da864d Mon Sep 17 00:00:00 2001
From: AK <kuganash.ravishanker@student.hpi.de>
Date: Thu, 8 Aug 2024 15:56:48 +0200
Subject: [PATCH 34/52] Fixed RANSAC tests

---
 .../time_series_regression_analysis.py        |  4 +-
 .../test_time_series_analysis.py              | 54 +++++++++++++++----
 2 files changed, 47 insertions(+), 11 deletions(-)

diff --git a/protzilla/data_analysis/time_series_regression_analysis.py b/protzilla/data_analysis/time_series_regression_analysis.py
index 7777f833..bb9f5d83 100644
--- a/protzilla/data_analysis/time_series_regression_analysis.py
+++ b/protzilla/data_analysis/time_series_regression_analysis.py
@@ -73,7 +73,7 @@ def time_series_linear_regression(
             X_group = group_df[["Time"]]
             y_group = group_df["Intensity"]
 
-            X_train, X_test, y_train, y_test = train_test_split(X_group, y_group, test_size=train_size, shuffle=False)
+            X_train, X_test, y_train, y_test = train_test_split(X_group, y_group, train_size=train_size, shuffle=False)
             model = LinearRegression()
             model.fit(X_train, y_train)
 
@@ -314,7 +314,7 @@ def time_series_ransac_regression(
             })
 
     else:
-        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=train_size, shuffle=False)
+        X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=train_size, shuffle=False)
         model = RANSACRegressor(base_estimator=LinearRegression())
         model.fit(X_train, y_train)
 
diff --git a/tests/protzilla/data_analysis/test_time_series_analysis.py b/tests/protzilla/data_analysis/test_time_series_analysis.py
index 4520d3f2..2ee0ff4f 100644
--- a/tests/protzilla/data_analysis/test_time_series_analysis.py
+++ b/tests/protzilla/data_analysis/test_time_series_analysis.py
@@ -63,7 +63,7 @@ def time_series_test_data():
 
 def test_linear_regression_plot_with_grouping(show_figures, time_series_test_data):
     test_intensity, test_metadata = time_series_test_data
-    outputs = time_series_linear_regression(test_intensity, test_metadata, "Protein1", 0.2,"With Grouping")
+    outputs = time_series_linear_regression(test_intensity, test_metadata, "Protein1", 0.8,"With Grouping")
     assert "plots" in outputs
     fig = outputs["plots"][0]
     if show_figures:
@@ -72,14 +72,14 @@ def test_linear_regression_plot_with_grouping(show_figures, time_series_test_dat
 
 def test_linear_regression_plot_without_grouping(show_figures, time_series_test_data):
     test_intensity, test_metadata = time_series_test_data
-    outputs = time_series_linear_regression(test_intensity, test_metadata, "Protein1", 0.2,"Without Grouping")
+    outputs = time_series_linear_regression(test_intensity, test_metadata, "Protein1", 0.8,"Without Grouping")
     assert "plots" in outputs
     fig = outputs["plots"][0]
     if show_figures:
         fig.show()
     return
 
-def test_linear_regression_plot_invalid_test_size(time_series_test_data):
+def test_linear_regression_plot_invalid_train_size(time_series_test_data):
     test_intensity, test_metadata = time_series_test_data
     with pytest.raises(ValueError):
         time_series_linear_regression(test_intensity, test_metadata, "Protein1", 2, "Without Grouping")
@@ -87,14 +87,23 @@ def test_linear_regression_plot_invalid_test_size(time_series_test_data):
 
 def test_linear_regression_outputs(time_series_test_data):
     test_intensity, test_metadata = time_series_test_data
-    outputs = time_series_linear_regression(test_intensity, test_metadata, "Protein1", 0.2, "Without Grouping")
+    outputs = time_series_linear_regression(test_intensity, test_metadata, "Protein1", 0.8, "Without Grouping")
     assert "scores" in outputs
     return
 
 
 def test_ransac_regression_plot_with_grouping(show_figures, time_series_test_data):
     test_intensity, test_metadata = time_series_test_data
-    outputs = time_series_ransac_regression(test_intensity, test_metadata, "Protein1", 0.2, "With Grouping")
+    outputs = time_series_ransac_regression(
+        test_intensity,
+        test_metadata,
+        "Protein1",
+        100,
+        0.99,
+        "absolute_error",
+        0.8,
+        "With Grouping"
+    )
     assert "plots" in outputs
     fig = outputs["plots"][0]
     if show_figures:
@@ -103,22 +112,49 @@ def test_ransac_regression_plot_with_grouping(show_figures, time_series_test_dat
 
 def test_ransac_regression_plot_without_grouping(show_figures, time_series_test_data):
     test_intensity, test_metadata = time_series_test_data
-    outputs = time_series_ransac_regression(test_intensity, test_metadata, "Protein1", 0.2, "Without Grouping")
+    outputs = time_series_ransac_regression(
+        test_intensity,
+        test_metadata,
+        "Protein1",
+        100,
+        0.99,
+        "absolute_error",
+        0.8,
+        "Without Grouping"
+    )
     assert "plots" in outputs
     fig = outputs["plots"][0]
     if show_figures:
         fig.show()
     return
 
-def test_ransac_plot_invalid_test_size(time_series_test_data):
+def test_ransac_plot_invalid_train_size(time_series_test_data):
     test_intensity, test_metadata = time_series_test_data
     with pytest.raises(ValueError):
-        time_series_ransac_regression(test_intensity, test_metadata, "Protein1", 2, "Without Grouping")
+        time_series_ransac_regression(
+            test_intensity,
+            test_metadata,
+            "Protein1",
+            100,
+            0.99,
+            "absolute_error",
+            2,
+            "With Grouping"
+        )
     return
 
 def test_ransac_regression_outputs(time_series_test_data):
     test_intensity, test_metadata = time_series_test_data
-    outputs = time_series_ransac_regression(test_intensity, test_metadata, "Protein1", 0.2, "Without Grouping")
+    outputs = time_series_ransac_regression(
+        test_intensity,
+        test_metadata,
+        "Protein1",
+        100,
+        0.99,
+        "absolute_error",
+        0.8,
+        "With Grouping"
+    )
     assert "scores" in outputs
     return
 

From 0b54ee81a787923f6a2cb9e95bdff1c488db0f26 Mon Sep 17 00:00:00 2001
From: AK <kuganash.ravishanker@student.hpi.de>
Date: Wed, 14 Aug 2024 13:06:06 +0200
Subject: [PATCH 35/52] Updated ARIMA so that it supports seasonal parameters

---
 .../time_series_regression_analysis.py        | 12 ++++-
 protzilla/methods/data_analysis.py            |  4 ++
 ui/runs/forms/data_analysis.py                | 44 ++++++++++++++++---
 3 files changed, 53 insertions(+), 7 deletions(-)

diff --git a/protzilla/data_analysis/time_series_regression_analysis.py b/protzilla/data_analysis/time_series_regression_analysis.py
index bb9f5d83..52173b23 100644
--- a/protzilla/data_analysis/time_series_regression_analysis.py
+++ b/protzilla/data_analysis/time_series_regression_analysis.py
@@ -720,6 +720,10 @@ def time_series_arima(
     p: int,
     d: int,
     q: int,
+    P: int,
+    D: int,
+    Q: int,
+    s: int,
     train_size: float,
     grouping: str,
 ) -> dict:
@@ -770,7 +774,7 @@ def time_series_arima(
                 model = ARIMA(
                     train_df,
                     order=(p, d, q),
-                    #seasonal_order=(P, D, Q, m)
+                    seasonal_order=(P, D, Q, s)
                 )
             else:
                 model = ARIMA(
@@ -836,7 +840,11 @@ def time_series_arima(
         test_df = test_df.set_index("Time")["Intensity"]
 
         if seasonal == "Yes":
-            model = ARIMA(train_df, order=(p, d, q))
+            model = ARIMA(
+                train_df,
+                order=(p, d, q),
+                seasonal_order = (P, D, Q, s),
+            )
         else:
             model = ARIMA(train_df, order=(p, d, q))
 
diff --git a/protzilla/methods/data_analysis.py b/protzilla/methods/data_analysis.py
index 26d73f77..dcdee6ce 100644
--- a/protzilla/methods/data_analysis.py
+++ b/protzilla/methods/data_analysis.py
@@ -929,6 +929,10 @@ class TimeSeriesARIMA(PlotStep):
         "p",
         "d",
         "q",
+        "P",
+        "D",
+        "Q",
+        "s",
         "train_size",
         "grouping",
     ]
diff --git a/ui/runs/forms/data_analysis.py b/ui/runs/forms/data_analysis.py
index 5e01e5d0..1da3729d 100644
--- a/ui/runs/forms/data_analysis.py
+++ b/ui/runs/forms/data_analysis.py
@@ -1449,23 +1449,51 @@ class TimeSeriesARIMAForm(MethodForm):
         initial=YesNo.no
     )
     p = CustomNumberField(
-        label = "The number of lag observations included in the model",
+        label = "Autoregressive Order: The number of lag observations included in the model",
         min_value=0,
         step_size=1,
         initial=1,
     )
     d = CustomNumberField(
-        label = "The number of times that the raw observations are differenced",
+        label = "Differencing Order: The number of times that the raw observations are differenced",
         min_value=0,
         step_size=1,
         initial=1,
     )
     q = CustomNumberField(
-        label = "The size of the moving average window",
-        min_value=1,
+        label = "Moving Average Order: The size of the moving average window",
+        min_value=0,
         step_size=1,
         initial=1,
     )
+    P = CustomNumberField(
+        label = "Seasonal Autoregressive Order: The number of seasonal lag observations included in the model",
+        min_value=0,
+        step_size=1,
+        initial=0,
+        required=False
+    )
+    D = CustomNumberField(
+        label = "Seasonal Differencing Order: The number of times that the seasonal observations are differenced",
+        min_value=0,
+        step_size=1,
+        initial=0,
+        required=False
+    )
+    Q = CustomNumberField(
+        label = "Seasonal Moving Average Order: The size of the seasonal moving average window",
+        min_value=0,
+        step_size=1,
+        initial=0,
+        required=False
+    )
+    s = CustomNumberField(
+        label = "Seasonal Period: The number of periods for a single seasonal cycle",
+        min_value=0,
+        step_size=1,
+        initial=0,
+        required=False
+    )
     train_size = CustomFloatField(
         label="Train size: proportion of the dataset to include in the test split",
         min_value=0,
@@ -1494,4 +1522,10 @@ def fill_form(self, run: Run) -> None:
                 output_key="peptide_df",
                 instance_identifier=input_df_instance_id,
             )["Protein ID"].unique()
-        )
\ No newline at end of file
+        )
+        seasonal = self.data.get("seasonal")
+        if seasonal == "No":
+            self.toggle_visibility("P", False)
+            self.toggle_visibility("D", False)
+            self.toggle_visibility("Q", False)
+            self.toggle_visibility("s", False)
\ No newline at end of file

From e1b77dcf66a2d1a410d64ae657247b595f425e49 Mon Sep 17 00:00:00 2001
From: AK <kuganash.ravishanker@student.hpi.de>
Date: Thu, 15 Aug 2024 12:18:24 +0200
Subject: [PATCH 36/52] Corrected the output text for the scores

---
 .../data_analysis/time_series_regression_analysis.py      | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/protzilla/data_analysis/time_series_regression_analysis.py b/protzilla/data_analysis/time_series_regression_analysis.py
index 52173b23..0b136a76 100644
--- a/protzilla/data_analysis/time_series_regression_analysis.py
+++ b/protzilla/data_analysis/time_series_regression_analysis.py
@@ -161,7 +161,7 @@ def time_series_linear_regression(
     annotation_text = "<br>".join([
         f"Group: {res['group']} (Train/Test)"
         f"<br>RMSE: {res['train_root_mean_squared']:.3f} / {res['test_root_mean_squared']:.3f}<br>"
-        f"Train R²: {res['train_r2_score']:.3f} / {res['test_r2_score']:.3f}<br>"
+        f"R²: {res['train_r2_score']:.3f} / {res['test_r2_score']:.3f}<br>"
         for res in scores
     ])
 
@@ -371,7 +371,7 @@ def time_series_ransac_regression(
     annotation_text = "<br>".join([
         f"Group: {res['group']} (Train/Test)"
         f"<br>RMSE: {res['train_root_mean_squared']:.3f} / {res['test_root_mean_squared']:.3f}<br>"
-        f"Train R²: {res['train_r2_score']:.3f} / {res['test_r2_score']:.3f}<br>"
+        f"R²: {res['train_r2_score']:.3f} / {res['test_r2_score']:.3f}<br>"
         for res in scores
     ])
 
@@ -667,7 +667,7 @@ def time_series_auto_arima(
     annotation_text = "<br>".join([
         f"Group: {res['group']} (Train/Test)"
         f"<br>RMSE: {res['train_root_mean_squared']:.3f} / {res['test_root_mean_squared']:.3f}<br>"
-        f"Train R²: {res['train_r2_score']:.3f} / {res['test_r2_score']:.3f}<br>"
+        f"R²: {res['train_r2_score']:.3f} / {res['test_r2_score']:.3f}<br>"
         for res in scores
     ])
 
@@ -896,7 +896,7 @@ def time_series_arima(
     annotation_text = "<br>".join([
         f"Group: {res['group']} (Train/Test)"
         f"<br>RMSE: {res['train_root_mean_squared']:.3f} / {res['test_root_mean_squared']:.3f}<br>"
-        f"Train R²: {res['train_r2_score']:.3f} / {res['test_r2_score']:.3f}<br>"
+        f"R²: {res['train_r2_score']:.3f} / {res['test_r2_score']:.3f}<br>"
         for res in scores
     ])
 

From d89b2362515ffcf03dd8f7b2c9f9b36e33cc56c7 Mon Sep 17 00:00:00 2001
From: AK <kuganash.ravishanker@student.hpi.de>
Date: Thu, 15 Aug 2024 13:45:49 +0200
Subject: [PATCH 37/52] Implemented tests for auto ARIMA and ARIMA

---
 .../test_time_series_analysis.py              | 184 ++++++++++++++++++
 1 file changed, 184 insertions(+)

diff --git a/tests/protzilla/data_analysis/test_time_series_analysis.py b/tests/protzilla/data_analysis/test_time_series_analysis.py
index 2ee0ff4f..5e139ce7 100644
--- a/tests/protzilla/data_analysis/test_time_series_analysis.py
+++ b/tests/protzilla/data_analysis/test_time_series_analysis.py
@@ -5,6 +5,8 @@
     time_series_linear_regression,
     time_series_ransac_regression,
     adfuller_test,
+    time_series_auto_arima,
+    time_series_arima,
 )
 
 
@@ -39,6 +41,13 @@ def time_series_test_data():
         ["Sample7", "Protein2", "Gene1", 13],
         ["Sample7", "Protein3", "Gene1", 3],
         ["Sample7", "Protein4", "Gene1", 11],
+        ["Sample1", "Protein1", "Gene2", 10],
+        ["Sample1", "Protein2", "Gene2", 14],
+        ["Sample1", "Protein3", "Gene2", 2],
+        ["Sample1", "Protein4", "Gene2", 10],
+        ["Sample2", "Protein1", "Gene2", 12],
+        ["Sample2", "Protein1", "Gene3", 13],
+
     )
 
     test_intensity_df = pd.DataFrame(
@@ -168,4 +177,179 @@ def test_adfuller_test(time_series_test_data):
     assert "critical_values" in outputs
     assert "is_stationary" in outputs
     assert "messages" in outputs
+    return
+
+
+def test_auto_arima_plot_with_grouping(show_figures, time_series_test_data):
+    test_intensity, test_metadata = time_series_test_data
+    outputs = time_series_auto_arima(
+        test_intensity,
+        test_metadata,
+        "Protein1",
+        "No",
+        1,
+        0.5,
+        "With Grouping"
+    )
+    assert "plots" in outputs
+    fig = outputs["plots"][0]
+    if show_figures:
+        fig.show()
+    return
+
+def test_auto_arima_plot_without_grouping(show_figures, time_series_test_data):
+    test_intensity, test_metadata = time_series_test_data
+    outputs = time_series_auto_arima(
+        test_intensity,
+        test_metadata,
+        "Protein1",
+        "No",
+        1,
+        0.5,
+        "Without Grouping"
+    )
+    assert "plots" in outputs
+    fig = outputs["plots"][0]
+    if show_figures:
+        fig.show()
+    return
+
+def test_auto_arima_plot_invalid_train_size(time_series_test_data):
+    test_intensity, test_metadata = time_series_test_data
+    with pytest.raises(ValueError):
+        time_series_auto_arima(
+            test_intensity,
+            test_metadata,
+            "Protein1",
+            "No",
+            1,
+            2,
+            "With Grouping"
+        )
+    return
+
+
+def test_auto_arima_outputs(time_series_test_data):
+    test_intensity, test_metadata = time_series_test_data
+    outputs = time_series_auto_arima(
+        test_intensity,
+        test_metadata,
+        "Protein1",
+        "No",
+        1,
+        0.5,
+        "With Grouping"
+    )
+    assert "scores" in outputs
+    return
+
+
+def test_arima_plot_with_grouping(show_figures, time_series_test_data):
+    test_intensity, test_metadata = time_series_test_data
+    outputs = time_series_arima(
+        test_intensity,
+        test_metadata,
+        "Protein1",
+        "No",
+        1,
+        1,
+        1,
+        0,
+        0,
+        0,
+        0,
+        0.5,
+        "With Grouping"
+    )
+    assert "plots" in outputs
+    fig = outputs["plots"][0]
+    if show_figures:
+        fig.show()
+    return
+
+def test_arima_plot_seasonal_with_grouping(show_figures, time_series_test_data):
+    test_intensity, test_metadata = time_series_test_data
+    outputs = time_series_arima(
+        test_intensity,
+        test_metadata,
+        "Protein1",
+        "Yes",
+        1,
+        1,
+        1,
+        0,
+        0,
+        0,
+        0,
+        0.5,
+        "With Grouping"
+    )
+    assert "plots" in outputs
+    fig = outputs["plots"][0]
+    if show_figures:
+        fig.show()
+    return
+
+def test_arima_plot_without_grouping(show_figures, time_series_test_data):
+    test_intensity, test_metadata = time_series_test_data
+    outputs = time_series_arima(
+        test_intensity,
+        test_metadata,
+        "Protein1",
+        "No",
+        1,
+        1,
+        1,
+        0,
+        0,
+        0,
+        0,
+        0.5,
+        "Without Grouping"
+    )
+    assert "plots" in outputs
+    fig = outputs["plots"][0]
+    if show_figures:
+        fig.show()
+    return
+
+def test_arima_plot_invalid_train_size(time_series_test_data):
+    test_intensity, test_metadata = time_series_test_data
+    with pytest.raises(ValueError):
+        time_series_arima(
+            test_intensity,
+            test_metadata,
+            "Protein1",
+            "No",
+            1,
+            1,
+            1,
+            0,
+            0,
+            0,
+            0,
+            2,
+            "With Grouping"
+        )
+    return
+
+
+def test_arima_outputs(time_series_test_data):
+    test_intensity, test_metadata = time_series_test_data
+    outputs = time_series_arima(
+        test_intensity,
+        test_metadata,
+        "Protein1",
+        "No",
+        1,
+        1,
+        1,
+        0,
+        0,
+        0,
+        0,
+        0.5,
+        "With Grouping"
+    )
+    assert "scores" in outputs
     return
\ No newline at end of file

From ec9b78301330f847bd04dd900a2ea3d9e9e40449 Mon Sep 17 00:00:00 2001
From: AK <kuganash.ravishanker@student.hpi.de>
Date: Sun, 18 Aug 2024 19:05:58 +0200
Subject: [PATCH 38/52] Implemented a dynamic field where the user can select
 the time column and the group column in each time series method

---
 .../time_series_regression_analysis.py        | 110 +++++++++++-------
 protzilla/methods/data_analysis.py            |   9 ++
 protzilla/methods/importing.py                |   3 +-
 ui/runs/forms/data_analysis.py                |  44 +++++++
 ui/runs/forms/importing.py                    |   8 +-
 5 files changed, 126 insertions(+), 48 deletions(-)

diff --git a/protzilla/data_analysis/time_series_regression_analysis.py b/protzilla/data_analysis/time_series_regression_analysis.py
index 0b136a76..5f6240c4 100644
--- a/protzilla/data_analysis/time_series_regression_analysis.py
+++ b/protzilla/data_analysis/time_series_regression_analysis.py
@@ -27,16 +27,20 @@
 def time_series_linear_regression(
         input_df: pd.DataFrame,
         metadata_df: pd.DataFrame,
+        time_column_name: str,
         protein_group: str,
-        train_size: float = 0.2,
+        train_size: float,
+        grouping_column_name: str,
         grouping: str = None,
 ):
     """
     Perform linear regression on the time series data for a given protein group.
     :param input_df: Peptide dataframe which contains the intensity of each sample
     :param metadata_df: Metadata dataframe which contains the timestamps
+    :param time_column_name: The name of the column containing the time values
     :param protein_group: Protein group to perform the analysis on
     :param train_size: The proportion of the dataset to include in the test split
+    :param grouping_column_name: The name of the column containing the grouping information
     :param grouping: Option to select whether regression should be performed on the entire dataset or separately on the control and experimental groups
 
     :return: A dictionary containing the root mean squared error and r2 score for the training and test sets
@@ -54,23 +58,23 @@ def time_series_linear_regression(
         copy=False,
     )
 
-    input_df["Time"] = input_df["Time"].apply(convert_time_to_hours)
+    input_df[time_column_name] = input_df[time_column_name].apply(convert_time_to_hours)
     input_df = input_df.interpolate(method='linear', axis=0)
 
     input_df = input_df.sample(frac=1, random_state = 42).reset_index(drop=True)
 
-    X = input_df[["Time"]]
+    X = input_df[[time_column_name]]
     y = input_df["Intensity"]
 
     fig = make_subplots(rows=1, cols=2, column_widths=[0.75, 0.25], vertical_spacing=0.025)
 
     scores = []
 
-    if grouping == "With Grouping" and "Group" in input_df.columns:
-        groups = input_df["Group"].unique()
+    if grouping == "With Grouping" and grouping_column_name in input_df.columns:
+        groups = input_df[grouping_column_name].unique()
         for group in groups:
-            group_df = input_df[input_df["Group"] == group]
-            X_group = group_df[["Time"]]
+            group_df = input_df[input_df[grouping_column_name] == group]
+            X_group = group_df[[time_column_name]]
             y_group = group_df["Intensity"]
 
             X_train, X_test, y_train, y_test = train_test_split(X_group, y_group, train_size=train_size, shuffle=False)
@@ -85,15 +89,15 @@ def time_series_linear_regression(
             train_r2 = r2_score(y_train, y_pred_train)
             test_r2 = r2_score(y_test, y_pred_test)
 
-            train_df = pd.DataFrame({'Time': X_train['Time'], 'Intensity': y_train, 'Predicted': y_pred_train, 'Type': 'Train'})
-            test_df = pd.DataFrame({'Time': X_test['Time'], 'Intensity': y_test, 'Predicted': y_pred_test, 'Type': 'Test'})
+            train_df = pd.DataFrame({'Time': X_train[time_column_name], 'Intensity': y_train, 'Predicted': y_pred_train, 'Type': 'Train'})
+            test_df = pd.DataFrame({'Time': X_test[time_column_name], 'Intensity': y_test, 'Predicted': y_pred_test, 'Type': 'Test'})
             plot_df = pd.concat([train_df, test_df])
 
             color = PROTZILLA_DISCRETE_COLOR_SEQUENCE[color_index % len(PROTZILLA_DISCRETE_COLOR_SEQUENCE)]
             color_index += 3
 
             fig.add_trace(go.Scatter(
-                x=plot_df['Time'],
+                x=plot_df[time_column_name],
                 y=plot_df['Intensity'],
                 mode='markers',
                 name=f'Actual Intensity ({group})',
@@ -101,7 +105,7 @@ def time_series_linear_regression(
             ), row=1, col=1)
 
             fig.add_trace(go.Scatter(
-                x=plot_df['Time'],
+                x=plot_df[time_column_name],
                 y=plot_df['Predicted'],
                 mode='lines',
                 name=f'Predicted Intensity ({group})',
@@ -129,12 +133,12 @@ def time_series_linear_regression(
         train_r2 = r2_score(y_train, y_pred_train)
         test_r2 = r2_score(y_test, y_pred_test)
 
-        train_df = pd.DataFrame({'Time': X_train['Time'], 'Intensity': y_train, 'Predicted': y_pred_train, 'Type': 'Train'})
-        test_df = pd.DataFrame({'Time': X_test['Time'], 'Intensity': y_test, 'Predicted': y_pred_test, 'Type': 'Test'})
+        train_df = pd.DataFrame({'Time': X_train[time_column_name], 'Intensity': y_train, 'Predicted': y_pred_train, 'Type': 'Train'})
+        test_df = pd.DataFrame({'Time': X_test[time_column_name], 'Intensity': y_test, 'Predicted': y_pred_test, 'Type': 'Test'})
         plot_df = pd.concat([train_df, test_df])
 
         fig.add_trace(go.Scatter(
-            x=plot_df['Time'],
+            x=plot_df[time_column_name],
             y=plot_df['Intensity'],
             mode='markers',
             name='Actual Intensity',
@@ -142,7 +146,7 @@ def time_series_linear_regression(
         ), row=1, col=1)
 
         fig.add_trace(go.Scatter(
-            x=plot_df['Time'],
+            x=plot_df[time_column_name],
             y=plot_df['Predicted'],
             mode='lines',
             name='Predicted Intensity',
@@ -209,19 +213,27 @@ def time_series_linear_regression(
 def time_series_ransac_regression(
         input_df: pd.DataFrame,
         metadata_df: pd.DataFrame,
+        time_column_name: str,
         protein_group: str,
         max_trials: int,
         stop_probability: float,
         loss: str,
         train_size: float,
+        grouping_column_name: str,
         grouping: str,
 ):
     """
     Perform RANSAC regression on the time series data for a given protein group.
     :param input_df: Peptide dataframe which contains the intensity of each sample
     :param metadata_df: Metadata dataframe which contains the timestamps
+    :param time_column_name: The name of the column containing the time values
+    :param max_trials: The maximum number of iterations to perform
+    :param stop_probability: The probability to stop the RANSAC algorithm
+    :param loss: The loss function to use
     :param protein_group: Protein group to perform the analysis on
     :param train_size: The proportion of the dataset to include in the test split
+    :param grouping_column_name: The name of the column containing the grouping information
+    :param grouping: Option to select whether regression should be performed on the entire dataset or separately on the control and experimental groups
 
     :return: A dictionary containing the root mean squared error and r2 score for the training and test sets
     """
@@ -239,23 +251,23 @@ def time_series_ransac_regression(
         copy=False,
     )
 
-    input_df["Time"] = input_df["Time"].apply(convert_time_to_hours)
+    input_df[time_column_name] = input_df[time_column_name].apply(convert_time_to_hours)
     input_df = input_df.interpolate(method='linear', axis=0)
 
     input_df = input_df.sample(frac=1, random_state = 42).reset_index(drop=True)
 
-    X = input_df[["Time"]]
+    X = input_df[[time_column_name]]
     y = input_df["Intensity"]
 
     fig = make_subplots(rows=1, cols=2, column_widths=[0.75, 0.25], vertical_spacing=0.025)
 
     scores = []
 
-    if grouping == "With Grouping" and "Group" in input_df.columns:
-        groups = input_df["Group"].unique()
+    if grouping == "With Grouping" and grouping_column_name in input_df.columns:
+        groups = input_df[grouping_column_name].unique()
         for group in groups:
-            group_df = input_df[input_df["Group"] == group]
-            X_group = group_df[["Time"]]
+            group_df = input_df[input_df[grouping_column_name] == group]
+            X_group = group_df[[time_column_name]]
             y_group = group_df["Intensity"]
 
             X_train, X_test, y_train, y_test = train_test_split(X_group, y_group, train_size=train_size, shuffle=False)
@@ -272,8 +284,8 @@ def time_series_ransac_regression(
             train_r2 = r2_score(y_train[inlier_mask], y_pred_train[inlier_mask])
             test_r2 = r2_score(y_test, y_pred_test)
 
-            train_df = pd.DataFrame({'Time': X_train["Time"], 'Intensity': y_train, 'Predicted': y_pred_train, 'Type': 'Train'})
-            test_df = pd.DataFrame({'Time': X_test["Time"], 'Intensity': y_test, 'Predicted': y_pred_test, 'Type': 'Test'})
+            train_df = pd.DataFrame({'Time': X_train[time_column_name], 'Intensity': y_train, 'Predicted': y_pred_train, 'Type': 'Train'})
+            test_df = pd.DataFrame({'Time': X_test[time_column_name], 'Intensity': y_test, 'Predicted': y_pred_test, 'Type': 'Test'})
             train_df['Inlier'] = inlier_mask
             test_df['Inlier'] = False
             plot_df = pd.concat([train_df, test_df])
@@ -328,8 +340,8 @@ def time_series_ransac_regression(
         train_r2 = r2_score(y_train[inlier_mask], y_pred_train[inlier_mask])
         test_r2 = r2_score(y_test, y_pred_test)
 
-        train_df = pd.DataFrame({'Time': X_train["Time"], 'Intensity': y_train, 'Predicted': y_pred_train, 'Type': 'Train'})
-        test_df = pd.DataFrame({'Time': X_test["Time"], 'Intensity': y_test, 'Predicted': y_pred_test, 'Type': 'Test'})
+        train_df = pd.DataFrame({'Time': X_train[time_column_name], 'Intensity': y_train, 'Predicted': y_pred_train, 'Type': 'Train'})
+        test_df = pd.DataFrame({'Time': X_test[time_column_name], 'Intensity': y_test, 'Predicted': y_pred_test, 'Type': 'Test'})
         train_df['Inlier'] = inlier_mask
         test_df['Inlier'] = False
         plot_df = pd.concat([train_df, test_df])
@@ -486,20 +498,24 @@ def adfuller_test(
 def time_series_auto_arima(
     input_df: pd.DataFrame,
     metadata_df: pd.DataFrame,
+    time_column_name: str,
     protein_group: str,
     seasonal: str,
     m: int,
     train_size: float,
+    grouping_column_name: str,
     grouping: str,
 ) -> dict:
     """
     Perform an automatic ARIMA model selection on the time series data for a given protein group.
     :param input_df: Peptide dataframe which contains the intensity of each sample
     :param metadata_df: Metadata dataframe which contains the timestamps
+    :param time_column_name: The name of the column containing the time values
     :param protein_group: Protein group to perform the analysis on
     :param seasonal: Whether the ARIMA model should be seasonal
     :param m: The number of time steps for a single seasonal period (ignored if seasonal=False)
     :param train_size: The proportion of the dataset to include in the test split
+    :param grouping_column_name: The name of the column containing the grouping information
     :param grouping: Whether to group the data by the 'Group' column
 
     :return: A dictionary containing the root mean squared error and r2 score for the training and test sets
@@ -527,19 +543,19 @@ def time_series_auto_arima(
     fig = make_subplots(rows=1, cols=2, column_widths=[0.7, 0.3])
     scores = []
 
-    if grouping == "With Grouping" and "Group" in input_df.columns:
-        groups = input_df["Group"].unique()
+    if grouping == "With Grouping" and grouping_column_name in input_df.columns:
+        groups = input_df[grouping_column_name].unique()
         for group in groups:
-            group_df = input_df[input_df["Group"] == group]
+            group_df = input_df[input_df[grouping_column_name] == group]
 
-            group_df["Time"] = group_df["Time"].apply(convert_time_to_hours)
+            group_df[time_column_name] = group_df[time_column_name].apply(convert_time_to_hours)
             group_df = group_df.interpolate(method='linear', axis=0)
 
             train_df_size = int(len(group_df) * train_size)
             train_df, test_df = group_df[:train_df_size], group_df[train_df_size:]
 
-            train_df = train_df.set_index("Time")["Intensity"]
-            test_df = test_df.set_index("Time")["Intensity"]
+            train_df = train_df.set_index(time_column_name)["Intensity"]
+            test_df = test_df.set_index(time_column_name)["Intensity"]
 
             # Fit the ARIMA model
             model = auto_arima(
@@ -599,14 +615,14 @@ def time_series_auto_arima(
             })
 
     else:
-        input_df["Time"] = input_df["Time"].apply(convert_time_to_hours)
+        input_df[time_column_name] = input_df[time_column_name].apply(convert_time_to_hours)
         input_df = input_df.interpolate(method='linear', axis=0)
 
         train_size = int(len(input_df) * train_size)
         train_df, test_df = input_df[:train_size], input_df[train_size:]
 
-        train_df = train_df.set_index("Time")["Intensity"]
-        test_df = test_df.set_index("Time")["Intensity"]
+        train_df = train_df.set_index(time_column_name)["Intensity"]
+        test_df = test_df.set_index(time_column_name)["Intensity"]
 
         # Fit the ARIMA model
         model = auto_arima(
@@ -715,6 +731,7 @@ def time_series_auto_arima(
 def time_series_arima(
     input_df: pd.DataFrame,
     metadata_df: pd.DataFrame,
+    time_column_name: str,
     protein_group: str,
     seasonal: str,
     p: int,
@@ -725,6 +742,7 @@ def time_series_arima(
     Q: int,
     s: int,
     train_size: float,
+    grouping_column_name: str,
     grouping: str,
 ) -> dict:
 
@@ -732,12 +750,18 @@ def time_series_arima(
     Perform ARIMA model selection on the time series data for a given protein group.
     :param input_df: Peptide dataframe which contains the intensity of each sample
     :param metadata_df: Metadata dataframe which contains the timestamps
+    :param time_column_name: The name of the column containing the time values
     :param protein_group: Protein group to perform the analysis on
     :param seasonal: Whether the ARIMA model should be seasonal
     :param p: ARIMA p parameter
     :param d: ARIMA d parameter
     :param q: ARIMA q parameter
+    :param P: ARIMA seasonal P parameter
+    :param D: ARIMA seasonal D parameter
+    :param Q: ARIMA seasonal Q parameter
+    :param s: ARIMA seasonal s parameter
     :param train_size: The proportion of the dataset to include in the test split
+    :param grouping_column_name: The name of the column containing the grouping information
     :param grouping: Whether to group the data by the 'Group' column
 
     :return: A dictionary containing the root mean squared error and r2 score for the training and test sets
@@ -756,19 +780,19 @@ def time_series_arima(
     fig = make_subplots(rows=1, cols=2, column_widths=[0.7, 0.3])
     scores = []
 
-    if grouping == "With Grouping" and "Group" in input_df.columns:
-        groups = input_df["Group"].unique()
+    if grouping == "With Grouping" and grouping_column_name in input_df.columns:
+        groups = input_df[grouping_column_name].unique()
         for group in groups:
-            group_df = input_df[input_df["Group"] == group]
+            group_df = input_df[input_df[grouping_column_name] == group]
 
-            group_df["Time"] = group_df["Time"].apply(convert_time_to_hours)
+            group_df[time_column_name] = group_df[time_column_name].apply(convert_time_to_hours)
             group_df = group_df.interpolate(method='linear', axis=0)
 
             train_df_size = int(len(group_df) * train_size)
             train_df, test_df = group_df[:train_df_size], group_df[train_df_size:]
 
-            train_df = train_df.set_index("Time")["Intensity"]
-            test_df = test_df.set_index("Time")["Intensity"]
+            train_df = train_df.set_index(time_column_name)["Intensity"]
+            test_df = test_df.set_index(time_column_name)["Intensity"]
 
             if seasonal == "Yes":
                 model = ARIMA(
@@ -830,14 +854,14 @@ def time_series_arima(
             })
 
     else:
-        input_df["Time"] = input_df["Time"].apply(convert_time_to_hours)
+        input_df[time_column_name] = input_df[time_column_name].apply(convert_time_to_hours)
         input_df = input_df.interpolate(method='linear', axis=0)
 
         train_size = int(len(input_df) * train_size)
         train_df, test_df = input_df[:train_size], input_df[train_size:]
 
-        train_df = train_df.set_index("Time")["Intensity"]
-        test_df = test_df.set_index("Time")["Intensity"]
+        train_df = train_df.set_index(time_column_name)["Intensity"]
+        test_df = test_df.set_index(time_column_name)["Intensity"]
 
         if seasonal == "Yes":
             model = ARIMA(
diff --git a/protzilla/methods/data_analysis.py b/protzilla/methods/data_analysis.py
index 45b9b96e..229c27b6 100644
--- a/protzilla/methods/data_analysis.py
+++ b/protzilla/methods/data_analysis.py
@@ -806,8 +806,10 @@ class TimeSeriesLinearRegression(PlotStep):
     input_keys = [
         "input_df",
         "metadata_df",
+        "time_column_name",
         "protein_group",
         "train_size",
+        "grouping_column_name",
         "grouping",
     ]
     output_keys = [
@@ -831,11 +833,13 @@ class TimeSeriesRANSACRegression(PlotStep):
     input_keys = [
         "input_df",
         "metadata_df",
+        "time_column_name",
         "protein_group",
         "max_trials",
         "stop_probability",
         "loss",
         "train_size",
+        "grouping_column_name",
         "grouping",
     ]
     output_keys = [
@@ -858,6 +862,7 @@ class TimeSeriesADFullerTest(DataAnalysisStep):
     input_keys = [
         "input_df",
         "metadata_df",
+        "time_column_name",
         "protein_group",
         "alpha",
     ]
@@ -887,10 +892,12 @@ class TimeSeriesAutoARIMA(PlotStep):
     input_keys = [
         "input_df",
         "metadata_df",
+        "time_column_name",
         "protein_group",
         "seasonal",
         "m",
         "train_size",
+        "grouping_column_name",
         "grouping",
     ]
     output_keys = [
@@ -916,6 +923,7 @@ class TimeSeriesARIMA(PlotStep):
     input_keys = [
         "input_df",
         "metadata_df",
+        "time_column_name",
         "protein_group",
         "seasonal",
         "p",
@@ -926,6 +934,7 @@ class TimeSeriesARIMA(PlotStep):
         "Q",
         "s",
         "train_size",
+        "grouping_column_name",
         "grouping",
     ]
     output_keys = [
diff --git a/protzilla/methods/importing.py b/protzilla/methods/importing.py
index 7cde1ba0..a7af6d42 100644
--- a/protzilla/methods/importing.py
+++ b/protzilla/methods/importing.py
@@ -96,7 +96,8 @@ class MetadataColumnAssignment(ImportingStep):
     display_name = "Metadata column assignment"
     operation = "metadataimport"
     method_description = (
-        "Assign columns to metadata categories, repeatable for each category"
+        "Protzilla uses a unique metadata column name to identify certain features in the metadata. "
+        "This step assigns the metadata columns to the correct feature."
     )
 
     input_keys = [
diff --git a/ui/runs/forms/data_analysis.py b/ui/runs/forms/data_analysis.py
index d7de9ce3..899b56ae 100644
--- a/ui/runs/forms/data_analysis.py
+++ b/ui/runs/forms/data_analysis.py
@@ -1240,6 +1240,7 @@ class TimeSeriesLinearRegressionForm(MethodForm):
         choices=[],
         label="Peptide dataframe",
     )
+    time_column_name = CustomChoiceField(choices=[], label="Time: The column name from metadata that represents time")
     protein_group = CustomChoiceField(
         choices=[],
         label="Protein group: which protein group to perform the linear regression on",
@@ -1251,6 +1252,7 @@ class TimeSeriesLinearRegressionForm(MethodForm):
         step_size=0.1,
         initial=0.8
     )
+    grouping_column_name = CustomChoiceField(choices=[], label="Grouping from metadata: The column name from metadata that represents the grouping")
     grouping = CustomChoiceField(
         choices= TimeSeriesGrouping,
         label="Option to select whether regression should be performed on the entire dataset or separately on the control and experimental groups",
@@ -1265,6 +1267,13 @@ def fill_form(self, run: Run) -> None:
         input_df_instance_id = self.data.get(
             "input_df", self.fields["input_df"].choices[0][0]
         )
+        self.fields[
+            "time_column_name"
+        ].choices = fill_helper.get_choices_for_metadata_non_sample_columns(run)
+
+        self.fields[
+            "grouping_column_name"
+        ].choices = fill_helper.get_choices_for_metadata_non_sample_columns(run)
 
         self.fields["protein_group"].choices = fill_helper.to_choices(
             run.steps.get_step_output(
@@ -1281,6 +1290,7 @@ class TimeSeriesRANSACRegressionForm(MethodForm):
         choices=[],
         label="Peptide dataframe",
     )
+    time_column_name = CustomChoiceField(choices=[], label="Time: The column name from metadata that represents time")
     protein_group = CustomChoiceField(
         choices=[],
         label="Protein group: which protein group to perform the RANSAC regression on",
@@ -1310,6 +1320,7 @@ class TimeSeriesRANSACRegressionForm(MethodForm):
         step_size=0.1,
         initial=0.8
     )
+    grouping_column_name = CustomChoiceField(choices=[], label="Grouping from metadata: The column name from metadata that represents the grouping")
     grouping = CustomChoiceField(
         choices= TimeSeriesGrouping,
         label="Option to select whether regression should be performed on the entire dataset or separately on the control and experimental groups",
@@ -1325,6 +1336,14 @@ def fill_form(self, run: Run) -> None:
             "input_df", self.fields["input_df"].choices[0][0]
         )
 
+        self.fields[
+            "time_column_name"
+        ].choices = fill_helper.get_choices_for_metadata_non_sample_columns(run)
+
+        self.fields[
+            "grouping_column_name"
+        ].choices = fill_helper.get_choices_for_metadata_non_sample_columns(run)
+
         self.fields["protein_group"].choices = fill_helper.to_choices(
             run.steps.get_step_output(
                 step_type=Step,
@@ -1352,6 +1371,7 @@ class TimeSeriesADFullerTestForm(MethodForm):
         choices=[],
         label="Peptide dataframe",
     )
+    time_column_name = CustomChoiceField(choices=[], label="Time: which column from metadata that represents time")
     protein_group = CustomChoiceField(
         choices=[],
         label="Protein group: which protein group to perform the ADFuller test on",
@@ -1371,6 +1391,10 @@ def fill_form(self, run: Run) -> None:
             "input_df", self.fields["input_df"].choices[0][0]
         )
 
+        self.fields[
+            "time_column_name"
+        ].choices = fill_helper.get_choices_for_metadata_non_sample_columns(run)
+
         self.fields["protein_group"].choices = fill_helper.to_choices(
             run.steps.get_step_output(
                 step_type=Step,
@@ -1391,6 +1415,7 @@ class TimeSeriesAutoARIMAForm(MethodForm):
         choices=[],
         label="Peptide dataframe",
     )
+    time_column_name = CustomChoiceField(choices=[], label="Time: The column name from metadata that represents time")
     protein_group = CustomChoiceField(
         choices=[],
         label="Protein group: which protein group to perform the AutoARIMA on",
@@ -1413,6 +1438,7 @@ class TimeSeriesAutoARIMAForm(MethodForm):
         step_size=0.1,
         initial=0.8,
     )
+    grouping_column_name = CustomChoiceField(choices=[], label="Grouping from metadata: The column name from metadata that represents the grouping")
     grouping = CustomChoiceField(
         choices= TimeSeriesGrouping,
         label="Option to select whether regression should be performed on the entire dataset or separately on the control and experimental groups",
@@ -1428,6 +1454,14 @@ def fill_form(self, run: Run) -> None:
             "input_df", self.fields["input_df"].choices[0][0]
         )
 
+        self.fields[
+            "time_column_name"
+        ].choices = fill_helper.get_choices_for_metadata_non_sample_columns(run)
+
+        self.fields[
+            "grouping_column_name"
+        ].choices = fill_helper.get_choices_for_metadata_non_sample_columns(run)
+
         self.fields["protein_group"].choices = fill_helper.to_choices(
             run.steps.get_step_output(
                 step_type=Step,
@@ -1450,6 +1484,7 @@ class TimeSeriesARIMAForm(MethodForm):
         choices=[],
         label="Peptide dataframe",
     )
+    time_column_name = CustomChoiceField(choices=[], label="Time: The column name from metadata that represents time")
     protein_group = CustomChoiceField(
         choices=[],
         label="Protein group: which protein group to perform the AutoARIMA on",
@@ -1512,6 +1547,7 @@ class TimeSeriesARIMAForm(MethodForm):
         step_size=0.1,
         initial=0.8,
     )
+    grouping_column_name = CustomChoiceField(choices=[], label="Grouping from metadata: The column name from metadata that represents the grouping")
     grouping = CustomChoiceField(
         choices= TimeSeriesGrouping,
         label="Option to select whether regression should be performed on the entire dataset or separately on the control and experimental groups",
@@ -1527,6 +1563,14 @@ def fill_form(self, run: Run) -> None:
             "input_df", self.fields["input_df"].choices[0][0]
         )
 
+        self.fields[
+            "time_column_name"
+        ].choices = fill_helper.get_choices_for_metadata_non_sample_columns(run)
+
+        self.fields[
+            "grouping_column_name"
+        ].choices = fill_helper.get_choices_for_metadata_non_sample_columns(run)
+
         self.fields["protein_group"].choices = fill_helper.to_choices(
             run.steps.get_step_output(
                 step_type=Step,
diff --git a/ui/runs/forms/importing.py b/ui/runs/forms/importing.py
index cc799be3..be961fa1 100644
--- a/ui/runs/forms/importing.py
+++ b/ui/runs/forms/importing.py
@@ -93,12 +93,12 @@ class MetadataImportMethodDiannForm(MethodForm):
 class MetadataColumnAssignmentForm(MethodForm):
     metadata_required_column = CustomChoiceField(
         choices=EmptyEnum,
-        label="Missing, but required metadata columns",
+        label="Columns in Metadata that needs to be assigned",
         required=False,
     )
     metadata_unknown_column = CustomChoiceField(
         choices=EmptyEnum,
-        label="Existing, but unknown metadata columns",
+        label="Available columns in Metadata that can be assigned",
         required=False,
     )
 
@@ -111,7 +111,7 @@ def fill_form(self, run: Run) -> None:
         if metadata is not None:
             self.fields["metadata_required_column"].choices = [
                 (col, col)
-                for col in ["Sample", "Group", "Batch"]
+                for col in ["Sample", "Group", "Batch", "Time"]
                 if col not in metadata.columns
             ]
             if len(self.fields["metadata_required_column"].choices) == 0:
@@ -122,7 +122,7 @@ def fill_form(self, run: Run) -> None:
 
             unknown_columns = list(
                 metadata.columns[
-                    ~metadata.columns.isin(["Sample", "Group", "Batch"])
+                    ~metadata.columns.isin(["Sample", "Group", "Batch", "Time"])
                 ].unique()
             )
 

From 4059f581e0f07c8980b689482d72ee5b4c383486 Mon Sep 17 00:00:00 2001
From: AK <kuganash.ravishanker@student.hpi.de>
Date: Sun, 18 Aug 2024 19:27:57 +0200
Subject: [PATCH 39/52] Fixed Tests

---
 .../test_time_series_analysis.py              | 76 ++++++++++++++++---
 1 file changed, 67 insertions(+), 9 deletions(-)

diff --git a/tests/protzilla/data_analysis/test_time_series_analysis.py b/tests/protzilla/data_analysis/test_time_series_analysis.py
index 5e139ce7..4962eb22 100644
--- a/tests/protzilla/data_analysis/test_time_series_analysis.py
+++ b/tests/protzilla/data_analysis/test_time_series_analysis.py
@@ -72,7 +72,15 @@ def time_series_test_data():
 
 def test_linear_regression_plot_with_grouping(show_figures, time_series_test_data):
     test_intensity, test_metadata = time_series_test_data
-    outputs = time_series_linear_regression(test_intensity, test_metadata, "Protein1", 0.8,"With Grouping")
+    outputs = time_series_linear_regression(
+        test_intensity,
+        test_metadata,
+        "Time",
+        "Protein1", #
+        0.8,
+        "Group",
+        "With Grouping"
+    )
     assert "plots" in outputs
     fig = outputs["plots"][0]
     if show_figures:
@@ -81,7 +89,15 @@ def test_linear_regression_plot_with_grouping(show_figures, time_series_test_dat
 
 def test_linear_regression_plot_without_grouping(show_figures, time_series_test_data):
     test_intensity, test_metadata = time_series_test_data
-    outputs = time_series_linear_regression(test_intensity, test_metadata, "Protein1", 0.8,"Without Grouping")
+    outputs = time_series_linear_regression(
+        test_intensity,
+        test_metadata,
+        "Time",
+        "Protein1", #
+        0.8,
+        "Group",
+        "With Grouping"
+    )
     assert "plots" in outputs
     fig = outputs["plots"][0]
     if show_figures:
@@ -91,12 +107,28 @@ def test_linear_regression_plot_without_grouping(show_figures, time_series_test_
 def test_linear_regression_plot_invalid_train_size(time_series_test_data):
     test_intensity, test_metadata = time_series_test_data
     with pytest.raises(ValueError):
-        time_series_linear_regression(test_intensity, test_metadata, "Protein1", 2, "Without Grouping")
+        time_series_linear_regression(
+            test_intensity,
+            test_metadata,
+            "Time",
+            "Protein1",  #
+            2,
+            "Group",
+            "With Grouping"
+        )
     return
 
 def test_linear_regression_outputs(time_series_test_data):
     test_intensity, test_metadata = time_series_test_data
-    outputs = time_series_linear_regression(test_intensity, test_metadata, "Protein1", 0.8, "Without Grouping")
+    outputs = time_series_linear_regression(
+        test_intensity,
+        test_metadata,
+        "Time",
+        "Protein1", #
+        0.8,
+        "Group",
+        "With Grouping"
+    )
     assert "scores" in outputs
     return
 
@@ -106,11 +138,13 @@ def test_ransac_regression_plot_with_grouping(show_figures, time_series_test_dat
     outputs = time_series_ransac_regression(
         test_intensity,
         test_metadata,
+        "Time",
         "Protein1",
         100,
         0.99,
         "absolute_error",
         0.8,
+        "Group",
         "With Grouping"
     )
     assert "plots" in outputs
@@ -124,11 +158,13 @@ def test_ransac_regression_plot_without_grouping(show_figures, time_series_test_
     outputs = time_series_ransac_regression(
         test_intensity,
         test_metadata,
+        "Time",
         "Protein1",
         100,
         0.99,
         "absolute_error",
         0.8,
+        "Group",
         "With Grouping"
     )
     assert "plots" in outputs
@@ -143,11 +179,13 @@ def test_ransac_plot_invalid_train_size(time_series_test_data):
         time_series_ransac_regression(
             test_intensity,
             test_metadata,
+            "Time",
             "Protein1",
             100,
             0.99,
             "absolute_error",
             2,
+            "Group",
             "With Grouping"
         )
     return
@@ -157,11 +195,13 @@ def test_ransac_regression_outputs(time_series_test_data):
     outputs = time_series_ransac_regression(
         test_intensity,
         test_metadata,
+        "Time",
         "Protein1",
         100,
         0.99,
         "absolute_error",
         0.8,
+        "Group",
         "With Grouping"
     )
     assert "scores" in outputs
@@ -185,10 +225,12 @@ def test_auto_arima_plot_with_grouping(show_figures, time_series_test_data):
     outputs = time_series_auto_arima(
         test_intensity,
         test_metadata,
+        "Time",
         "Protein1",
         "No",
         1,
         0.5,
+        "Group",
         "With Grouping"
     )
     assert "plots" in outputs
@@ -199,13 +241,15 @@ def test_auto_arima_plot_with_grouping(show_figures, time_series_test_data):
 
 def test_auto_arima_plot_without_grouping(show_figures, time_series_test_data):
     test_intensity, test_metadata = time_series_test_data
-    outputs = time_series_auto_arima(
+    outputs =  time_series_auto_arima(
         test_intensity,
         test_metadata,
+        "Time",
         "Protein1",
         "No",
         1,
         0.5,
+        "Group",
         "With Grouping"
     )
     assert "plots" in outputs
@@ -220,10 +264,12 @@ def test_auto_arima_plot_invalid_train_size(time_series_test_data):
         time_series_auto_arima(
             test_intensity,
             test_metadata,
+            "Time",
             "Protein1",
             "No",
             1,
             2,
+            "Group",
             "With Grouping"
         )
     return
@@ -231,13 +277,15 @@ def test_auto_arima_plot_invalid_train_size(time_series_test_data):
 
 def test_auto_arima_outputs(time_series_test_data):
     test_intensity, test_metadata = time_series_test_data
-    outputs = time_series_auto_arima(
+    outputs =  time_series_auto_arima(
         test_intensity,
         test_metadata,
+        "Time",
         "Protein1",
         "No",
         1,
         0.5,
+        "Group",
         "With Grouping"
     )
     assert "scores" in outputs
@@ -249,6 +297,7 @@ def test_arima_plot_with_grouping(show_figures, time_series_test_data):
     outputs = time_series_arima(
         test_intensity,
         test_metadata,
+        "Time",
         "Protein1",
         "No",
         1,
@@ -259,6 +308,7 @@ def test_arima_plot_with_grouping(show_figures, time_series_test_data):
         0,
         0,
         0.5,
+        "Group",
         "With Grouping"
     )
     assert "plots" in outputs
@@ -272,8 +322,9 @@ def test_arima_plot_seasonal_with_grouping(show_figures, time_series_test_data):
     outputs = time_series_arima(
         test_intensity,
         test_metadata,
+        "Time",
         "Protein1",
-        "Yes",
+        "No",
         1,
         1,
         1,
@@ -282,6 +333,7 @@ def test_arima_plot_seasonal_with_grouping(show_figures, time_series_test_data):
         0,
         0,
         0.5,
+        "Group",
         "With Grouping"
     )
     assert "plots" in outputs
@@ -295,6 +347,7 @@ def test_arima_plot_without_grouping(show_figures, time_series_test_data):
     outputs = time_series_arima(
         test_intensity,
         test_metadata,
+        "Time",
         "Protein1",
         "No",
         1,
@@ -305,7 +358,8 @@ def test_arima_plot_without_grouping(show_figures, time_series_test_data):
         0,
         0,
         0.5,
-        "Without Grouping"
+        "Group",
+        "With Grouping"
     )
     assert "plots" in outputs
     fig = outputs["plots"][0]
@@ -319,6 +373,7 @@ def test_arima_plot_invalid_train_size(time_series_test_data):
         time_series_arima(
             test_intensity,
             test_metadata,
+            "Time",
             "Protein1",
             "No",
             1,
@@ -329,6 +384,7 @@ def test_arima_plot_invalid_train_size(time_series_test_data):
             0,
             0,
             2,
+            "Group",
             "With Grouping"
         )
     return
@@ -339,6 +395,7 @@ def test_arima_outputs(time_series_test_data):
     outputs = time_series_arima(
         test_intensity,
         test_metadata,
+        "Time",
         "Protein1",
         "No",
         1,
@@ -349,7 +406,8 @@ def test_arima_outputs(time_series_test_data):
         0,
         0,
         0.5,
-        "With Grouping"
+        "Group",
+        "With Grouping",
     )
     assert "scores" in outputs
     return
\ No newline at end of file

From 9f624f870946ee54c3f0ce360f4033edc29c33e3 Mon Sep 17 00:00:00 2001
From: AK <kuganash.ravishanker@student.hpi.de>
Date: Thu, 5 Sep 2024 16:23:14 +0200
Subject: [PATCH 40/52] Fixed Time Series Analysis

---
 protzilla/constants/colors.py                 |  61 ++++---
 .../data_analysis/time_series_plot_peptide.py |   2 +-
 .../time_series_regression_analysis.py        | 169 +++++++++---------
 protzilla/methods/data_analysis.py            |  21 ++-
 ui/runs/forms/data_analysis.py                |  60 +++----
 5 files changed, 164 insertions(+), 149 deletions(-)

diff --git a/protzilla/constants/colors.py b/protzilla/constants/colors.py
index 3f33249b..98daf656 100644
--- a/protzilla/constants/colors.py
+++ b/protzilla/constants/colors.py
@@ -1,23 +1,44 @@
 PROTZILLA_DISCRETE_COLOR_SEQUENCE = [
-    #Muted Dark Slate
-        "#252935",
-        "#4A536A",
-        '#a4a9b4',
-# Muted Indian Red
-        "#CE5A5A",
-        "#B04A4A",
-        "#EBBDBD",
-# Muted Light Steel Blue
-        "#51646f",
-        "#87A8B9",
-        "#B7CAD5",
- # Muted Sienna
-        "#804538",
-        "#8E3325",
-        "#471912",
-    #Muted Sandy Brown
-        "#715236",
-        "#E2A46D",
-        "F0D1B6",
+    # Set 1: Muted Dark Slate
+    "#252935", "#3A3F50", "#50556A", "#6B7186", "#858DA2",
+    # Set 2: Muted Indian Red
+    "#CE5A5A", "#B24C4C", "#9D3F3F", "#E07272", "#F48D8D",
+    # Set 3: Muted Light Steel Blue
+    "#51646F", "#6A7D89", "#7F92A0", "#96A9B8", "#ADBFCD",
+    # Set 4: Muted Sienna
+    "#804538", "#6F3C31", "#5F342A", "#A05748", "#B66E5E",
+    # Set 5: Muted Sandy Brown
+    "#715236", "#63472F", "#57402B", "#96755A", "#A98575",
+    # Set 6: Muted Olive
+    "#6E6B48", "#5D5B3E", "#4E4D36", "#89875C", "#A1A16E",
+    # Set 7: Muted Teal
+    "#3B6B6A", "#315B5B", "#274C4C", "#507E7E", "#6B9898",
+    # Set 8: Muted Taupe
+    "#8B7E74", "#776F65", "#675E56", "#A09085", "#B9AAA1",
+    # Set 9: Muted Burgundy
+    "#7B3A4F", "#6A3345", "#582C3C", "#925664", "#A8737E",
+    # Set 10: Muted Forest Green
+    "#3D5047", "#35453E", "#2D3B35", "#5F7267", "#7B8D80",
+    # Set 11: Muted Navy
+    "#2F3E4C", "#283442", "#222B38", "#485669", "#627185",
+    # Set 12: Muted Mustard
+    "#BFA054", "#A98F4A", "#927D3F", "#D7BA75", "#E2CD96",
+    # Set 13: Muted Dusty Rose
+    "#C18394", "#AA727E", "#93616C", "#D69BA7", "#E4B8C2",
+    # Set 14: Muted Lavender
+    "#8A729D", "#7A638C", "#6A547C", "#A591B3", "#BDA9C8",
+    # Set 15: Muted Charcoal
+    "#404040", "#353535", "#2B2B2B", "#585858", "#707070",
+    # Set 16: Muted Emerald Green
+    "#4D7456", "#426448", "#37563B", "#6A9177", "#85A990",
+    # Set 17: Muted Peach
+    "#D89B83", "#C2866F", "#A7725E", "#E3B39C", "#ECC7B6",
+    # Set 18: Muted Plum
+    "#704F6E", "#634464", "#563A59", "#876A87", "#A18AA1",
+    # Set 19: Muted Periwinkle
+    "#7E8DAF", "#6F7B98", "#616A82", "#97A3BF", "#B0B9D1",
+    # Set 20: Muted Coral
+    "#CC7A5E", "#B26951", "#9A5A45", "#DD937C", "#EBAA99"
 ]
+
 PROTZILLA_DISCRETE_COLOR_OUTLIER_SEQUENCE = ["#4A536A", "#CE5A5A"]
diff --git a/protzilla/data_analysis/time_series_plot_peptide.py b/protzilla/data_analysis/time_series_plot_peptide.py
index 5f5ac64e..04f95c32 100644
--- a/protzilla/data_analysis/time_series_plot_peptide.py
+++ b/protzilla/data_analysis/time_series_plot_peptide.py
@@ -24,7 +24,7 @@ def time_series_plot_peptide(
 ) -> dict:
     """
     A function to create a graph visualising protein quantifications across all samples
-    as a line diagram using retention time. It's possible to select one proteingroup
+    as a line diagram using time. It's possible to select one proteingroup
     that will be displayed in orange and choose a similarity measurement with a similarity score
     to get all proteingroups that are similar displayed in another color in this line diagram.
     All other proteingroups are displayed in the background as a grey polygon.
diff --git a/protzilla/data_analysis/time_series_regression_analysis.py b/protzilla/data_analysis/time_series_regression_analysis.py
index 5f6240c4..f579bb03 100644
--- a/protzilla/data_analysis/time_series_regression_analysis.py
+++ b/protzilla/data_analysis/time_series_regression_analysis.py
@@ -5,6 +5,7 @@
 import plotly.graph_objects as go
 
 from protzilla.data_analysis.time_series_helper import convert_time_to_hours
+from protzilla.utilities import default_intensity_column
 from protzilla.constants.colors import PROTZILLA_DISCRETE_COLOR_SEQUENCE
 
 from sklearn.linear_model import LinearRegression, RANSACRegressor
@@ -25,7 +26,7 @@
 
 
 def time_series_linear_regression(
-        input_df: pd.DataFrame,
+        intensity_df: pd.DataFrame,
         metadata_df: pd.DataFrame,
         time_column_name: str,
         protein_group: str,
@@ -35,7 +36,7 @@ def time_series_linear_regression(
 ):
     """
     Perform linear regression on the time series data for a given protein group.
-    :param input_df: Peptide dataframe which contains the intensity of each sample
+    :param intensity_df: Peptide dataframe which contains the intensity of each sample
     :param metadata_df: Metadata dataframe which contains the timestamps
     :param time_column_name: The name of the column containing the time values
     :param protein_group: Protein group to perform the analysis on
@@ -49,33 +50,33 @@ def time_series_linear_regression(
     if train_size < 0 or train_size > 1:
         raise ValueError("Test size should be between 0 and 1")
 
-    input_df = input_df[input_df['Protein ID'] == protein_group]
-
-    input_df = pd.merge(
-        left=input_df,
+    intensity_df = intensity_df[intensity_df['Protein ID'] == protein_group]
+    intensity_column_name = default_intensity_column(intensity_df)
+    intensity_df = pd.merge(
+        left=intensity_df,
         right=metadata_df,
         on="Sample",
         copy=False,
     )
 
-    input_df[time_column_name] = input_df[time_column_name].apply(convert_time_to_hours)
-    input_df = input_df.interpolate(method='linear', axis=0)
+    intensity_df[time_column_name] = intensity_df[str(time_column_name)].apply(convert_time_to_hours)
+    intensity_df = intensity_df.interpolate(method='linear', axis=0)
 
-    input_df = input_df.sample(frac=1, random_state = 42).reset_index(drop=True)
+    intensity_df = intensity_df.sample(frac=1, random_state = 42).reset_index(drop=True)
 
-    X = input_df[[time_column_name]]
-    y = input_df["Intensity"]
+    X = intensity_df[[time_column_name]]
+    y = intensity_df[intensity_column_name]
 
     fig = make_subplots(rows=1, cols=2, column_widths=[0.75, 0.25], vertical_spacing=0.025)
 
     scores = []
 
-    if grouping == "With Grouping" and grouping_column_name in input_df.columns:
-        groups = input_df[grouping_column_name].unique()
+    if grouping == "With Grouping" and grouping_column_name in intensity_df.columns:
+        groups = intensity_df[grouping_column_name].unique()
         for group in groups:
-            group_df = input_df[input_df[grouping_column_name] == group]
+            group_df = intensity_df[intensity_df[grouping_column_name] == group]
             X_group = group_df[[time_column_name]]
-            y_group = group_df["Intensity"]
+            y_group = group_df[intensity_column_name]
 
             X_train, X_test, y_train, y_test = train_test_split(X_group, y_group, train_size=train_size, shuffle=False)
             model = LinearRegression()
@@ -94,7 +95,7 @@ def time_series_linear_regression(
             plot_df = pd.concat([train_df, test_df])
 
             color = PROTZILLA_DISCRETE_COLOR_SEQUENCE[color_index % len(PROTZILLA_DISCRETE_COLOR_SEQUENCE)]
-            color_index += 3
+            color_index += 5
 
             fig.add_trace(go.Scatter(
                 x=plot_df[time_column_name],
@@ -150,7 +151,7 @@ def time_series_linear_regression(
             y=plot_df['Predicted'],
             mode='lines',
             name='Predicted Intensity',
-            line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[2])
+            line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[5])
         ), row=1, col=1)
 
         scores.append({
@@ -211,7 +212,7 @@ def time_series_linear_regression(
 
 
 def time_series_ransac_regression(
-        input_df: pd.DataFrame,
+        intensity_df: pd.DataFrame,
         metadata_df: pd.DataFrame,
         time_column_name: str,
         protein_group: str,
@@ -224,7 +225,7 @@ def time_series_ransac_regression(
 ):
     """
     Perform RANSAC regression on the time series data for a given protein group.
-    :param input_df: Peptide dataframe which contains the intensity of each sample
+    :param intensity_df: Peptide dataframe which contains the intensity of each sample
     :param metadata_df: Metadata dataframe which contains the timestamps
     :param time_column_name: The name of the column containing the time values
     :param max_trials: The maximum number of iterations to perform
@@ -242,33 +243,34 @@ def time_series_ransac_regression(
     if train_size < 0 or train_size > 1:
         raise ValueError("Test size should be between 0 and 1")
 
-    input_df = input_df[input_df['Protein ID'] == protein_group]
+    intensity_df = intensity_df[intensity_df['Protein ID'] == protein_group]
+    intensity_column_name = default_intensity_column(intensity_df)
 
-    input_df = pd.merge(
-        left=input_df,
+    intensity_df = pd.merge(
+        left=intensity_df,
         right=metadata_df,
         on="Sample",
         copy=False,
     )
 
-    input_df[time_column_name] = input_df[time_column_name].apply(convert_time_to_hours)
-    input_df = input_df.interpolate(method='linear', axis=0)
+    intensity_df[time_column_name] = intensity_df[str(time_column_name)].apply(convert_time_to_hours)
+    intensity_df = intensity_df.interpolate(method='linear', axis=0)
 
-    input_df = input_df.sample(frac=1, random_state = 42).reset_index(drop=True)
+    intensity_df = intensity_df.sample(frac=1, random_state = 42).reset_index(drop=True)
 
-    X = input_df[[time_column_name]]
-    y = input_df["Intensity"]
+    X = intensity_df[[time_column_name]]
+    y = intensity_df[intensity_column_name]
 
     fig = make_subplots(rows=1, cols=2, column_widths=[0.75, 0.25], vertical_spacing=0.025)
 
     scores = []
 
-    if grouping == "With Grouping" and grouping_column_name in input_df.columns:
-        groups = input_df[grouping_column_name].unique()
+    if grouping == "With Grouping" and grouping_column_name in intensity_df.columns:
+        groups = intensity_df[grouping_column_name].unique()
         for group in groups:
-            group_df = input_df[input_df[grouping_column_name] == group]
+            group_df = intensity_df[intensity_df[grouping_column_name] == group]
             X_group = group_df[[time_column_name]]
-            y_group = group_df["Intensity"]
+            y_group = group_df[intensity_column_name]
 
             X_train, X_test, y_train, y_test = train_test_split(X_group, y_group, train_size=train_size, shuffle=False)
             model = RANSACRegressor(max_trials = max_trials, stop_probability = stop_probability, loss = loss, base_estimator=LinearRegression())
@@ -304,7 +306,7 @@ def time_series_ransac_regression(
                 y=plot_df['Predicted'],
                 mode='lines',
                 name='Predicted Intensity',
-                line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[color_index + 1])
+                line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[color_index + 2])
             ), row=1, col=1)
 
             fig.add_trace(go.Scatter(
@@ -312,10 +314,10 @@ def time_series_ransac_regression(
                 y=plot_df[plot_df['Inlier'] == False]['Intensity'],
                 mode='markers',
                 name='Outliers',
-                marker=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[color_index + 2])
+                marker=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[color_index + 4])
             ), row=1, col=1)
 
-            color_index += 3
+            color_index += 5
 
             scores.append({
                 'group': group,
@@ -368,7 +370,7 @@ def time_series_ransac_regression(
             y=plot_df[plot_df['Inlier'] == False]['Intensity'],
             mode='markers',
             name='Outliers',
-            marker=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[2])
+            marker=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[3])
         ), row=1, col=1)
 
         scores.append({
@@ -429,14 +431,14 @@ def time_series_ransac_regression(
 
 
 def adfuller_test(
-    input_df: pd.DataFrame,
+    intensity_df: pd.DataFrame,
     metadata_df: pd.DataFrame,
     protein_group: str,
     alpha: float = 0.05,
 ) -> dict:
     """
     Perform the Augmented Dickey-Fuller test to check for stationarity in a time series.
-    :param input_df: The dataframe containing the time series data.
+    :param intensity_df: The dataframe containing the time series data.
     :param metadata_df: The dataframe containing the metadata.
     :param protein_group: The protein group to perform the test on.
     :param alpha: The significance level for the test (default is 0.05).
@@ -450,19 +452,20 @@ def adfuller_test(
     """
 
     messages = []
-    input_df = input_df[input_df['Protein ID'] == protein_group]
+    intensity_df = intensity_df[intensity_df['Protein ID'] == protein_group]
+    intensity_column_name = default_intensity_column(intensity_df)
 
-    input_df = pd.merge(
-        left=input_df,
+    intensity_df = pd.merge(
+        left=intensity_df,
         right=metadata_df,
         on="Sample",
         copy=False,
     )
 
-    input_df = input_df["Intensity"].dropna()
+    intensity_df = intensity_df[intensity_column_name].dropna()
 
     # Perform the ADF test
-    result = adfuller(input_df)
+    result = adfuller(intensity_df)
     test_statistic = result[0]
     p_value = result[1]
     critical_values = result[4]
@@ -496,7 +499,7 @@ def adfuller_test(
 
 
 def time_series_auto_arima(
-    input_df: pd.DataFrame,
+    intensity_df: pd.DataFrame,
     metadata_df: pd.DataFrame,
     time_column_name: str,
     protein_group: str,
@@ -508,7 +511,7 @@ def time_series_auto_arima(
 ) -> dict:
     """
     Perform an automatic ARIMA model selection on the time series data for a given protein group.
-    :param input_df: Peptide dataframe which contains the intensity of each sample
+    :param intensity_df: Peptide dataframe which contains the intensity of each sample
     :param metadata_df: Metadata dataframe which contains the timestamps
     :param time_column_name: The name of the column containing the time values
     :param protein_group: Protein group to perform the analysis on
@@ -530,11 +533,12 @@ def time_series_auto_arima(
     else:
         seasonal = False
 
-    input_df = input_df[input_df['Protein ID'] == protein_group]
-    input_df = input_df.sample(frac=1, random_state=42).reset_index(drop=True)
+    intensity_df = intensity_df[intensity_df['Protein ID'] == protein_group]
+    intensity_df = intensity_df.sample(frac=1, random_state=42).reset_index(drop=True)
+    intensity_column_name = default_intensity_column(intensity_df)
 
-    input_df = pd.merge(
-        left=input_df,
+    intensity_df = pd.merge(
+        left=intensity_df,
         right=metadata_df,
         on="Sample",
         copy=False,
@@ -543,19 +547,19 @@ def time_series_auto_arima(
     fig = make_subplots(rows=1, cols=2, column_widths=[0.7, 0.3])
     scores = []
 
-    if grouping == "With Grouping" and grouping_column_name in input_df.columns:
-        groups = input_df[grouping_column_name].unique()
+    if grouping == "With Grouping" and grouping_column_name in intensity_df.columns:
+        groups = intensity_df[grouping_column_name].unique()
         for group in groups:
-            group_df = input_df[input_df[grouping_column_name] == group]
+            group_df = intensity_df[intensity_df[grouping_column_name] == group]
 
-            group_df[time_column_name] = group_df[time_column_name].apply(convert_time_to_hours)
+            group_df[time_column_name] = group_df[str(time_column_name)].apply(convert_time_to_hours)
             group_df = group_df.interpolate(method='linear', axis=0)
 
             train_df_size = int(len(group_df) * train_size)
             train_df, test_df = group_df[:train_df_size], group_df[train_df_size:]
 
-            train_df = train_df.set_index(time_column_name)["Intensity"]
-            test_df = test_df.set_index(time_column_name)["Intensity"]
+            train_df = train_df.set_index(time_column_name)[intensity_column_name]
+            test_df = test_df.set_index(time_column_name)[intensity_column_name]
 
             # Fit the ARIMA model
             model = auto_arima(
@@ -593,7 +597,7 @@ def time_series_auto_arima(
                 y=forecast,
                 mode='markers',
                 name='Predicted Intensity',
-                line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[color_index + 2])
+                line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[color_index + 3])
             ), row=1, col=1)
 
             fig.add_trace(go.Scatter(
@@ -601,10 +605,10 @@ def time_series_auto_arima(
                 y = forecast_plot,
                 mode = 'lines',
                 name = 'Mean Predicted Intensity',
-                line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[color_index + 2])
+                line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[color_index + 3])
             ), row=1, col=1)
 
-            color_index += 3
+            color_index += 5
 
             scores.append({
                 'group': group,
@@ -615,14 +619,14 @@ def time_series_auto_arima(
             })
 
     else:
-        input_df[time_column_name] = input_df[time_column_name].apply(convert_time_to_hours)
-        input_df = input_df.interpolate(method='linear', axis=0)
+        intensity_df[time_column_name] = intensity_df[str(time_column_name)].apply(convert_time_to_hours)
+        intensity_df = intensity_df.interpolate(method='linear', axis=0)
 
-        train_size = int(len(input_df) * train_size)
-        train_df, test_df = input_df[:train_size], input_df[train_size:]
+        train_size = int(len(intensity_df) * train_size)
+        train_df, test_df = intensity_df[:train_size], intensity_df[train_size:]
 
-        train_df = train_df.set_index(time_column_name)["Intensity"]
-        test_df = test_df.set_index(time_column_name)["Intensity"]
+        train_df = train_df.set_index(time_column_name)[intensity_column_name]
+        test_df = test_df.set_index(time_column_name)[intensity_column_name]
 
         # Fit the ARIMA model
         model = auto_arima(
@@ -729,7 +733,7 @@ def time_series_auto_arima(
 
 
 def time_series_arima(
-    input_df: pd.DataFrame,
+    intensity_df: pd.DataFrame,
     metadata_df: pd.DataFrame,
     time_column_name: str,
     protein_group: str,
@@ -748,7 +752,7 @@ def time_series_arima(
 
     """
     Perform ARIMA model selection on the time series data for a given protein group.
-    :param input_df: Peptide dataframe which contains the intensity of each sample
+    :param intensity_df: Peptide dataframe which contains the intensity of each sample
     :param metadata_df: Metadata dataframe which contains the timestamps
     :param time_column_name: The name of the column containing the time values
     :param protein_group: Protein group to perform the analysis on
@@ -772,27 +776,28 @@ def time_series_arima(
     if train_size < 0 or train_size > 1:
         raise ValueError("Train size should be between 0 and 1")
 
-    input_df = input_df[input_df['Protein ID'] == protein_group]
-    input_df = input_df.sample(frac=1, random_state=42).reset_index(drop=True)
+    intensity_df = intensity_df[intensity_df['Protein ID'] == protein_group]
+    intensity_df = intensity_df.sample(frac=1, random_state=42).reset_index(drop=True)
+    intensity_column_name = default_intensity_column(intensity_df)
 
-    input_df = pd.merge(left=input_df, right=metadata_df, on="Sample", copy=False)
+    intensity_df = pd.merge(left=intensity_df, right=metadata_df, on="Sample", copy=False)
 
     fig = make_subplots(rows=1, cols=2, column_widths=[0.7, 0.3])
     scores = []
 
-    if grouping == "With Grouping" and grouping_column_name in input_df.columns:
-        groups = input_df[grouping_column_name].unique()
+    if grouping == "With Grouping" and grouping_column_name in intensity_df.columns:
+        groups = intensity_df[grouping_column_name].unique()
         for group in groups:
-            group_df = input_df[input_df[grouping_column_name] == group]
+            group_df = intensity_df[intensity_df[grouping_column_name] == group]
 
-            group_df[time_column_name] = group_df[time_column_name].apply(convert_time_to_hours)
+            group_df[time_column_name] = group_df[str(time_column_name)].apply(convert_time_to_hours)
             group_df = group_df.interpolate(method='linear', axis=0)
 
             train_df_size = int(len(group_df) * train_size)
             train_df, test_df = group_df[:train_df_size], group_df[train_df_size:]
 
-            train_df = train_df.set_index(time_column_name)["Intensity"]
-            test_df = test_df.set_index(time_column_name)["Intensity"]
+            train_df = train_df.set_index(time_column_name)[intensity_column_name]
+            test_df = test_df.set_index(time_column_name)[intensity_column_name]
 
             if seasonal == "Yes":
                 model = ARIMA(
@@ -843,7 +848,7 @@ def time_series_arima(
                 line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[color_index + 2])
             ), row=1, col=1)
 
-            color_index += 3
+            color_index += 5
 
             scores.append({
                 'group': group,
@@ -854,14 +859,14 @@ def time_series_arima(
             })
 
     else:
-        input_df[time_column_name] = input_df[time_column_name].apply(convert_time_to_hours)
-        input_df = input_df.interpolate(method='linear', axis=0)
+        intensity_df[time_column_name] = intensity_df[str(time_column_name)].apply(convert_time_to_hours)
+        intensity_df = intensity_df.interpolate(method='linear', axis=0)
 
-        train_size = int(len(input_df) * train_size)
-        train_df, test_df = input_df[:train_size], input_df[train_size:]
+        train_size = int(len(intensity_df) * train_size)
+        train_df, test_df = intensity_df[:train_size], intensity_df[train_size:]
 
-        train_df = train_df.set_index(time_column_name)["Intensity"]
-        test_df = test_df.set_index(time_column_name)["Intensity"]
+        train_df = train_df.set_index(time_column_name)[intensity_column_name]
+        test_df = test_df.set_index(time_column_name)[intensity_column_name]
 
         if seasonal == "Yes":
             model = ARIMA(
@@ -906,7 +911,7 @@ def time_series_arima(
             y=forecast_plot,
             mode='lines',
             name='Mean Predicted Intensity',
-            line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[3])
+            line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[4])
         ), row=1, col=1)
 
         scores.append({
diff --git a/protzilla/methods/data_analysis.py b/protzilla/methods/data_analysis.py
index 229c27b6..9f5261d1 100644
--- a/protzilla/methods/data_analysis.py
+++ b/protzilla/methods/data_analysis.py
@@ -804,7 +804,7 @@ class TimeSeriesLinearRegression(PlotStep):
                                     "The p-values are corrected for multiple testing.")
 
     input_keys = [
-        "input_df",
+        "intensity_df",
         "metadata_df",
         "time_column_name",
         "protein_group",
@@ -820,7 +820,7 @@ def method(self, inputs: dict) -> dict:
         return time_series_linear_regression(**inputs)
 
     def insert_dataframes(self, steps: StepManager, inputs) -> dict:
-        inputs["input_df"] = steps.get_step_output(Step, "peptide_df", inputs["input_df"])
+        inputs["intensity_df"] = steps.protein_df
         inputs["metadata_df"] = steps.metadata_df
         return inputs
 
@@ -831,7 +831,7 @@ class TimeSeriesRANSACRegression(PlotStep):
     method_description = " Perform RANSAC regression on the time series data for a given protein group."
 
     input_keys = [
-        "input_df",
+        "intensity_df",
         "metadata_df",
         "time_column_name",
         "protein_group",
@@ -849,7 +849,7 @@ def method(self, inputs: dict) -> dict:
         return time_series_ransac_regression(**inputs)
 
     def insert_dataframes(self, steps: StepManager, inputs) -> dict:
-        inputs["input_df"] = steps.get_step_output(Step, "peptide_df", inputs["input_df"])
+        inputs["intensity_df"] = steps.protein_df
         inputs["metadata_df"] = steps.metadata_df
         return inputs
 
@@ -860,9 +860,8 @@ class TimeSeriesADFullerTest(DataAnalysisStep):
     method_description = "Perform Augmented Dickey-Fuller test on the time series data for a given protein group."
 
     input_keys = [
-        "input_df",
+        "intensity_df",
         "metadata_df",
-        "time_column_name",
         "protein_group",
         "alpha",
     ]
@@ -877,7 +876,7 @@ def method(self, inputs: dict) -> dict:
         return adfuller_test(**inputs)
 
     def insert_dataframes(self, steps: StepManager, inputs) -> dict:
-        inputs["input_df"] = steps.get_step_output(Step, "peptide_df", inputs["input_df"])
+        inputs["intensity_df"] = steps.protein_df
         inputs["metadata_df"] = steps.metadata_df
         return inputs
 
@@ -890,7 +889,7 @@ class TimeSeriesAutoARIMA(PlotStep):
     )
 
     input_keys = [
-        "input_df",
+        "intensity_df",
         "metadata_df",
         "time_column_name",
         "protein_group",
@@ -908,7 +907,7 @@ def method(self, inputs: dict) -> dict:
         return time_series_auto_arima(**inputs)
 
     def insert_dataframes(self, steps: StepManager, inputs) -> dict:
-        inputs["input_df"] = steps.get_step_output(Step, "peptide_df", inputs["input_df"])
+        inputs["intensity_df"] = steps.protein_df
         inputs["metadata_df"] = steps.metadata_df
         return inputs
 
@@ -921,7 +920,7 @@ class TimeSeriesARIMA(PlotStep):
     )
 
     input_keys = [
-        "input_df",
+        "intensity_df",
         "metadata_df",
         "time_column_name",
         "protein_group",
@@ -945,7 +944,7 @@ def method(self, inputs: dict) -> dict:
         return time_series_arima(**inputs)
 
     def insert_dataframes(self, steps: StepManager, inputs) -> dict:
-        inputs["input_df"] = steps.get_step_output(Step, "peptide_df", inputs["input_df"])
+        inputs["intensity_df"] = steps.protein_df
         inputs["metadata_df"] = steps.metadata_df
         return inputs
 
diff --git a/ui/runs/forms/data_analysis.py b/ui/runs/forms/data_analysis.py
index 899b56ae..12328911 100644
--- a/ui/runs/forms/data_analysis.py
+++ b/ui/runs/forms/data_analysis.py
@@ -1236,9 +1236,9 @@ def fill_form(self, run: Run) -> None:
 
 class TimeSeriesLinearRegressionForm(MethodForm):
     is_dynamic = True
-    input_df = CustomChoiceField(
+    intensity_df = CustomChoiceField(
         choices=[],
-        label="Peptide dataframe",
+        label="Intensity dataframe",
     )
     time_column_name = CustomChoiceField(choices=[], label="Time: The column name from metadata that represents time")
     protein_group = CustomChoiceField(
@@ -1261,11 +1261,11 @@ class TimeSeriesLinearRegressionForm(MethodForm):
 
 
     def fill_form(self, run: Run) -> None:
-        self.fields["input_df"].choices = fill_helper.get_choices_for_peptide_df_steps(
+        self.fields["intensity_df"].choices = fill_helper.get_choices_for_protein_df_steps(
             run
         )
         input_df_instance_id = self.data.get(
-            "input_df", self.fields["input_df"].choices[0][0]
+            "intensity_df", self.fields["intensity_df"].choices[0][0]
         )
         self.fields[
             "time_column_name"
@@ -1278,7 +1278,7 @@ def fill_form(self, run: Run) -> None:
         self.fields["protein_group"].choices = fill_helper.to_choices(
             run.steps.get_step_output(
                 step_type=Step,
-                output_key="peptide_df",
+                output_key="protein_df",
                 instance_identifier=input_df_instance_id,
             )["Protein ID"].unique()
         )
@@ -1286,9 +1286,9 @@ def fill_form(self, run: Run) -> None:
 
 class TimeSeriesRANSACRegressionForm(MethodForm):
     is_dynamic = True
-    input_df = CustomChoiceField(
+    intensity_df = CustomChoiceField(
         choices=[],
-        label="Peptide dataframe",
+        label="Intensity dataframe",
     )
     time_column_name = CustomChoiceField(choices=[], label="Time: The column name from metadata that represents time")
     protein_group = CustomChoiceField(
@@ -1329,13 +1329,12 @@ class TimeSeriesRANSACRegressionForm(MethodForm):
 
 
     def fill_form(self, run: Run) -> None:
-        self.fields["input_df"].choices = fill_helper.get_choices_for_peptide_df_steps(
+        self.fields["intensity_df"].choices = fill_helper.get_choices_for_protein_df_steps(
             run
         )
         input_df_instance_id = self.data.get(
-            "input_df", self.fields["input_df"].choices[0][0]
+            "intensity_df", self.fields["intensity_df"].choices[0][0]
         )
-
         self.fields[
             "time_column_name"
         ].choices = fill_helper.get_choices_for_metadata_non_sample_columns(run)
@@ -1347,7 +1346,7 @@ def fill_form(self, run: Run) -> None:
         self.fields["protein_group"].choices = fill_helper.to_choices(
             run.steps.get_step_output(
                 step_type=Step,
-                output_key="peptide_df",
+                output_key="protein_df",
                 instance_identifier=input_df_instance_id,
             )["Protein ID"].unique()
         )
@@ -1367,11 +1366,10 @@ class TimeSeriesADFullerTestForm(MethodForm):
              "JASA. Journal of the American Statistical Association. 74. 10.2307/2286348. "
         ),
     )
-    input_df = CustomChoiceField(
+    intensity_df = CustomChoiceField(
         choices=[],
-        label="Peptide dataframe",
+        label="Intensity dataframe",
     )
-    time_column_name = CustomChoiceField(choices=[], label="Time: which column from metadata that represents time")
     protein_group = CustomChoiceField(
         choices=[],
         label="Protein group: which protein group to perform the ADFuller test on",
@@ -1384,26 +1382,20 @@ class TimeSeriesADFullerTestForm(MethodForm):
     )
 
     def fill_form(self, run: Run) -> None:
-        self.fields["input_df"].choices = fill_helper.get_choices_for_peptide_df_steps(
+        self.fields["intensity_df"].choices = fill_helper.get_choices_for_protein_df_steps(
             run
         )
         input_df_instance_id = self.data.get(
-            "input_df", self.fields["input_df"].choices[0][0]
+            "intensity_df", self.fields["intensity_df"].choices[0][0]
         )
-
-        self.fields[
-            "time_column_name"
-        ].choices = fill_helper.get_choices_for_metadata_non_sample_columns(run)
-
         self.fields["protein_group"].choices = fill_helper.to_choices(
             run.steps.get_step_output(
                 step_type=Step,
-                output_key="peptide_df",
+                output_key="protein_df",
                 instance_identifier=input_df_instance_id,
             )["Protein ID"].unique()
         )
 
-
 class TimeSeriesAutoARIMAForm(MethodForm):
     is_dynamic = True
     model_info = TextDisplayField(
@@ -1411,9 +1403,9 @@ class TimeSeriesAutoARIMAForm(MethodForm):
         text=(
         ),
     )
-    input_df = CustomChoiceField(
+    intensity_df = CustomChoiceField(
         choices=[],
-        label="Peptide dataframe",
+        label="Intensity dataframe",
     )
     time_column_name = CustomChoiceField(choices=[], label="Time: The column name from metadata that represents time")
     protein_group = CustomChoiceField(
@@ -1447,13 +1439,12 @@ class TimeSeriesAutoARIMAForm(MethodForm):
 
 
     def fill_form(self, run: Run) -> None:
-        self.fields["input_df"].choices = fill_helper.get_choices_for_peptide_df_steps(
+        self.fields["intensity_df"].choices = fill_helper.get_choices_for_protein_df_steps(
             run
         )
         input_df_instance_id = self.data.get(
-            "input_df", self.fields["input_df"].choices[0][0]
+            "intensity_df", self.fields["intensity_df"].choices[0][0]
         )
-
         self.fields[
             "time_column_name"
         ].choices = fill_helper.get_choices_for_metadata_non_sample_columns(run)
@@ -1465,7 +1456,7 @@ def fill_form(self, run: Run) -> None:
         self.fields["protein_group"].choices = fill_helper.to_choices(
             run.steps.get_step_output(
                 step_type=Step,
-                output_key="peptide_df",
+                output_key="protein_df",
                 instance_identifier=input_df_instance_id,
             )["Protein ID"].unique()
         )
@@ -1480,9 +1471,9 @@ class TimeSeriesARIMAForm(MethodForm):
         ),
     )
     """
-    input_df = CustomChoiceField(
+    intensity_df = CustomChoiceField(
         choices=[],
-        label="Peptide dataframe",
+        label="Intensity dataframe",
     )
     time_column_name = CustomChoiceField(choices=[], label="Time: The column name from metadata that represents time")
     protein_group = CustomChoiceField(
@@ -1556,13 +1547,12 @@ class TimeSeriesARIMAForm(MethodForm):
 
 
     def fill_form(self, run: Run) -> None:
-        self.fields["input_df"].choices = fill_helper.get_choices_for_peptide_df_steps(
+        self.fields["intensity_df"].choices = fill_helper.get_choices_for_protein_df_steps(
             run
         )
         input_df_instance_id = self.data.get(
-            "input_df", self.fields["input_df"].choices[0][0]
+            "intensity_df", self.fields["intensity_df"].choices[0][0]
         )
-
         self.fields[
             "time_column_name"
         ].choices = fill_helper.get_choices_for_metadata_non_sample_columns(run)
@@ -1574,7 +1564,7 @@ def fill_form(self, run: Run) -> None:
         self.fields["protein_group"].choices = fill_helper.to_choices(
             run.steps.get_step_output(
                 step_type=Step,
-                output_key="peptide_df",
+                output_key="protein_df",
                 instance_identifier=input_df_instance_id,
             )["Protein ID"].unique()
         )

From 671633de78589095c7442a3fed234221845b672d Mon Sep 17 00:00:00 2001
From: AK <kuganash.ravishanker@student.hpi.de>
Date: Sat, 14 Sep 2024 13:05:27 +0200
Subject: [PATCH 41/52] Implemented TMT data import for PROTzilla

---
 protzilla/importing/ms_data_import.py | 73 +++++++++++++++++++++++++++
 protzilla/methods/importing.py        | 12 +++++
 ui/runs/forms/importing.py            | 18 +++++++
 3 files changed, 103 insertions(+)

diff --git a/protzilla/importing/ms_data_import.py b/protzilla/importing/ms_data_import.py
index fc5cc105..595aacbb 100644
--- a/protzilla/importing/ms_data_import.py
+++ b/protzilla/importing/ms_data_import.py
@@ -123,6 +123,79 @@ def diann_import(file_path, map_to_uniprot=False, aggregation_method: str ="Sum"
         return dict(messages=[dict(level=logging.ERROR, msg=msg, trace=format_trace(traceback.format_exception(e)))])
 
 
+def tmt_data_import(
+        file_path: str, intensity_name: str = "Reporter intensity", map_to_uniprot=False,
+        aggregation_method: str = "Sum"
+) -> dict:
+    try:
+        # Read the file
+        df = pd.read_csv(
+            file_path,
+            sep="\t",
+            low_memory=False,
+            na_values=["", 0],
+            keep_default_na=True,
+        )
+
+        # Debug step: Print the column names to check the actual names in the data
+        print("Columns in the file:", df.columns.tolist())
+
+        # Try to handle different possible names for the 'Protein ID' column
+        protein_column = None
+        possible_names = ["Majority protein IDs"]
+
+        for name in possible_names:
+            if name in df.columns:
+                protein_column = name
+                break
+
+        if protein_column is None:
+            raise KeyError("No valid 'Protein ID' or equivalent column found in the data.")
+
+        df = df.rename(columns={protein_column: "Protein ID"})
+
+        # Extract protein or gene identifiers
+        protein_groups = df["Protein ID"]
+
+        # Drop columns that are not relevant
+        columns_to_drop = [
+            "Combined Spectral Count",
+            "Combined Unique Spectral Count",
+            "Combined Total Spectral Count",
+        ]
+        existing_columns = set(df.columns)
+        columns_to_drop_existing = [col for col in columns_to_drop if col in existing_columns]
+        df = df.drop(columns=columns_to_drop_existing)
+        print("Columns after dropping irrelevant ones:", df.columns.tolist())
+
+        # Use regex to find columns matching the TMT pattern with visits for both NP and T1D samples
+        intensity_columns = df.filter(
+            regex=f"{intensity_name} \\d+ (NP\\d{{2}}|TD\\d{{2}})", axis=1
+        )
+
+        # Debug step: Print the intensity columns that were matched
+        print("Matched intensity columns:", intensity_columns.columns.tolist())
+
+        # Rename columns to the format 'NPXX_1' or 'T1DXX_1'
+        intensity_columns.columns = [
+            re.sub(f"{intensity_name} (\\d+) (NP\\d{{2}}|TD\\d{{2}})",
+                   lambda m: f"{m.group(2)}_{int(m.group(1)) + 1}", col) for col in intensity_columns.columns
+        ]
+
+        # Debug step: Print the renamed intensity columns
+        print("Renamed intensity columns:", intensity_columns.columns.tolist())
+        # Add back the protein identifiers to the dataframe
+        intensity_columns = intensity_columns.assign(**{"Protein ID": protein_groups})
+
+        # Apply transformation, clean-up, or aggregation (depending on your logic)
+        return transform_and_clean(intensity_columns, intensity_name, map_to_uniprot, aggregation_method)
+
+    except Exception as e:
+        msg = f"An error occurred while reading the file: {e.__class__.__name__} {e}. Please provide a valid TMT data file."
+        return dict(messages=[dict(level=logging.ERROR, msg=msg, trace=format_trace(traceback.format_exception(e)))])
+
+
+
 def transform_and_clean(
     df: pd.DataFrame, intensity_name: str, map_to_uniprot: bool, aggregation_method: str ="Sum"
 ) -> dict:
diff --git a/protzilla/methods/importing.py b/protzilla/methods/importing.py
index a7af6d42..f94218f4 100644
--- a/protzilla/methods/importing.py
+++ b/protzilla/methods/importing.py
@@ -9,6 +9,7 @@
     diann_import,
     max_quant_import,
     ms_fragger_import,
+    tmt_data_import,
 )
 from protzilla.importing.peptide_import import peptide_import, evidence_import
 from protzilla.steps import Step, StepManager
@@ -60,6 +61,17 @@ def method(self, inputs):
         return ms_fragger_import(**inputs)
 
 
+class TMTImport(ImportingStep):  # importing step that delegates to tmt_data_import
+    display_name = "TMT"
+    operation = "msdataimport"
+    method_description = "TMT data import"
+    input_keys = ["file_path", "map_to_uniprot", "aggregation_method"]
+    output_keys = ["protein_df"]
+
+    def method(self, inputs: dict) -> dict:  # inputs are forwarded as keyword arguments
+        return tmt_data_import(**inputs)
+
+
 class MetadataImport(ImportingStep):
     display_name = "Metadata import"
     operation = "metadataimport"
diff --git a/ui/runs/forms/importing.py b/ui/runs/forms/importing.py
index be961fa1..07c8d74c 100644
--- a/ui/runs/forms/importing.py
+++ b/ui/runs/forms/importing.py
@@ -75,6 +75,24 @@ class MSFraggerImportForm(MethodForm):
         choices=AggregationMethods, label="Aggregation method", initial="Sum"
     )
 
+class TMTImportForm(MethodForm):  # field names match TMTImport.input_keys
+    file_path = CustomFileField(label="TMT intensities file")
+    map_to_uniprot = CustomBooleanField(
+        label="Map to Uniprot IDs using Biomart (online)", required=False
+    )
+    aggregation_method = CustomChoiceField(
+        choices=AggregationMethods, label="Aggregation method", initial="Sum"
+    )
+
+class DiannImportForm(MethodForm):  # NOTE(review): confirm this module does not already define DiannImportForm
+    file_path = CustomFileField(label="DIA-NN intensities file:")
+    map_to_uniprot = CustomBooleanField(
+        label="Map to Uniprot IDs using Biomart (online)", required=False
+    )
+    aggregation_method = CustomChoiceField(
+        choices=AggregationMethods, label="Aggregation method", initial="Sum"
+    )
+
 
 class MetadataImportForm(MethodForm):
     file_path = CustomFileField(label="Metadata file")

From 1e1b50f7749f173f69f475b637dc7f72be884cf9 Mon Sep 17 00:00:00 2001
From: AK <kuganash.ravishanker@student.hpi.de>
Date: Sat, 14 Sep 2024 13:06:37 +0200
Subject: [PATCH 42/52] Updated TimeQuant plot

---
 ...s_plot_peptide.py => time_series_plots.py} | 22 ++++++++++---------
 protzilla/utilities/transform_dfs.py          | 18 ++++++++-------
 2 files changed, 22 insertions(+), 18 deletions(-)
 rename protzilla/data_analysis/{time_series_plot_peptide.py => time_series_plots.py} (89%)

diff --git a/protzilla/data_analysis/time_series_plot_peptide.py b/protzilla/data_analysis/time_series_plots.py
similarity index 89%
rename from protzilla/data_analysis/time_series_plot_peptide.py
rename to protzilla/data_analysis/time_series_plots.py
index 04f95c32..236c0e5a 100644
--- a/protzilla/data_analysis/time_series_plot_peptide.py
+++ b/protzilla/data_analysis/time_series_plots.py
@@ -15,9 +15,10 @@
     "annotation_proteins_of_interest": "#4A536A",
 }
 
-def time_series_plot_peptide(
-    input_df: pd.DataFrame,
+def time_quant_plot(
+    intensity_df: pd.DataFrame,
     metadata_df: pd.DataFrame,
+    time_column_name: str,
     protein_group: str,
     similarity: float = 1.0,
     similarity_measure: str = "euclidean distance",
@@ -29,9 +30,10 @@ def time_series_plot_peptide(
     to get all proteingroups that are similar displayed in another color in this line diagram.
     All other proteingroups are displayed in the background as a grey polygon.
 
-    :param input_df: A dataframe in protzilla wide format, where each row
+    :param intensity_df: A dataframe in protzilla wide format, where each row
         represents a sample and each column represents a feature.
     :param metadata_df: A dataframe containing the metadata of the samples.
+    :param time_column_name: The name of the column in the metadata_df that contains the time information.
     :param protein_group: Protein IDs as the columnheader of the dataframe
     :param similarity_measure: method to compare the chosen proteingroup with all others. The two
         methods are "cosine similarity" and "euclidean distance".
@@ -40,15 +42,15 @@ def time_series_plot_peptide(
     :return: returns a dictionary containing a list with a plotly figure and/or a list of messages
     """
 
-    input_df = pd.merge(
-        left=input_df,
-        right=metadata_df[["Sample", "Time"]],
+    intensity_df = pd.merge(
+        left=intensity_df,
+        right=metadata_df[["Sample", time_column_name]],
         on="Sample",
         copy=False,
     )
 
-    wide_df = input_df.interpolate(method='linear', axis=0)
-    wide_df = long_to_wide_time(wide_df) if is_long_format(wide_df) else  wide_df
+    wide_df = intensity_df.interpolate(method='linear', axis=0)
+    wide_df = long_to_wide_time(wide_df, time_column_name=time_column_name) if is_long_format(wide_df, time_column_name=time_column_name) else  wide_df
 
 
     if protein_group not in wide_df.columns:
@@ -164,14 +166,14 @@ def time_series_plot_peptide(
         yaxis_gridcolor=colors["gridcolor"],
         xaxis_linecolor=colors["linecolor"],
         yaxis_linecolor=colors["linecolor"],
-        xaxis_title="Time",
+        xaxis_title=time_column_name,
         yaxis_title="Intensity",
         legend_title="Legend",
         xaxis=dict(
             tickmode="array",
             tickangle=0,
             tickvals=wide_df.index,
-            ticktext=[wide_df["Time"].unique() for wide_df["Time"] in wide_df.index],
+            ticktext=[wide_df[time_column_name].unique() for wide_df[time_column_name] in wide_df.index],
         ),
         autosize=True,
         margin=dict(l=100, r=300, t=100, b=100),
diff --git a/protzilla/utilities/transform_dfs.py b/protzilla/utilities/transform_dfs.py
index a5380b32..1c3b7fbe 100644
--- a/protzilla/utilities/transform_dfs.py
+++ b/protzilla/utilities/transform_dfs.py
@@ -12,6 +12,7 @@ def long_to_wide(intensity_df: pd.DataFrame, value_name: str | None = None):
     :param intensity_df: the dataframe that should be transformed into
         long format
         :type intensity_df: pd.DataFrame
+    :param value_name: the name of the column in intensity_df that contains the intensity values.
 
     :return: returns dataframe in wide format suitable for use by
         packages such as sklearn
@@ -23,7 +24,7 @@ def long_to_wide(intensity_df: pd.DataFrame, value_name: str | None = None):
     )
 
 
-def long_to_wide_time(intensity_df: pd.DataFrame, value_name: str = None):
+def long_to_wide_time(intensity_df: pd.DataFrame, value_name: str = None, time_column_name: str = None):
     """
     This function transforms the dataframe to a wide format that
     can be more easily handled by packages such as sklearn.
@@ -32,16 +33,18 @@ def long_to_wide_time(intensity_df: pd.DataFrame, value_name: str = None):
     :param intensity_df: the dataframe that should be transformed into
         long format
         :type intensity_df: pd.DataFrame
+    :param value_name: the name of the column in intensity_df that contains the intensity values.
+    :param time_column_name: the name of the column in intensity_df that contains the time information.
 
     :return: returns dataframe in wide format suitable for use by
         packages such as sklearn
     :rtype: pd.DataFrame
     """
-    if intensity_df.duplicated(subset=["Time", "Protein ID"]).any():
-        intensity_df = intensity_df.groupby(["Time", "Protein ID"]).mean().reset_index()
+    if intensity_df.duplicated(subset=[time_column_name, "Protein ID"]).any():
+        intensity_df = intensity_df.groupby([time_column_name, "Protein ID"]).mean().reset_index()
     values_name = default_intensity_column(intensity_df) if value_name is None else value_name
     intensity_df = pd.pivot(
-        intensity_df, index="Time", columns="Protein ID", values=values_name
+        intensity_df, index=time_column_name, columns="Protein ID", values=values_name
     )
     intensity_df = intensity_df.fillna(intensity_df.mean())
     return intensity_df
@@ -81,17 +84,16 @@ def wide_to_long(wide_df: pd.DataFrame, original_long_df: pd.DataFrame):
     return intensity_df
 
 
-def is_long_format(df: pd.DataFrame):
+def is_long_format(df: pd.DataFrame, time_column_name: str = None):
     required_columns = {"Sample", "Protein ID"}
-    additional_columns = {"Gene", "Time"}
+    additional_columns = {"Gene", time_column_name}
     return required_columns.issubset(df.columns) and any(col in df.columns for col in additional_columns)
 
 
 def is_intensity_df(df: pd.DataFrame):
     """
     Checks if the dataframe is an intensity dataframe.
-    An intensity dataframe should have the columns "Sample", "Protein ID" and
-    and intensity column.
+    An intensity dataframe should have the columns "Sample", "Protein ID" and an intensity column.
 
     :param df: the dataframe that should be checked
     :type df: pd.DataFrame

From a3ffe29f1ef6d79a263e17d64ebf5d878aaa9722 Mon Sep 17 00:00:00 2001
From: AK <kuganash.ravishanker@student.hpi.de>
Date: Sat, 14 Sep 2024 13:06:57 +0200
Subject: [PATCH 43/52] Updated a test

---
 tests/protzilla/data_analysis/test_time_series_plots.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/protzilla/data_analysis/test_time_series_plots.py b/tests/protzilla/data_analysis/test_time_series_plots.py
index 12249fb0..46182092 100644
--- a/tests/protzilla/data_analysis/test_time_series_plots.py
+++ b/tests/protzilla/data_analysis/test_time_series_plots.py
@@ -1,7 +1,7 @@
 import pandas as pd
 import pytest
 
-from protzilla.data_analysis.time_series_plot_peptide import time_series_plot_peptide
+from protzilla.data_analysis.time_series_plots import time_quant_plot
 
 
 @pytest.fixture
@@ -56,7 +56,7 @@ def time_series_test_data():
 
 def test_time_series_plot(show_figures, time_series_test_data):
     test_intensity, test_metadata = time_series_test_data
-    outputs = time_series_plot_peptide(test_intensity, test_metadata, "Protein1")
+    outputs = time_quant_plot(test_intensity, test_metadata, "Protein1")
     assert "plots" in outputs
     fig = outputs["plots"][0]
     if show_figures:
@@ -66,11 +66,11 @@ def test_time_series_plot(show_figures, time_series_test_data):
 def test_time_series_plot_invalid_euclidean_similarity(time_series_test_data):
     test_intensity, test_metadata = time_series_test_data
     with pytest.raises(ValueError):
-        time_series_plot_peptide(test_intensity, test_metadata, "Protein1", similarity=-1, similarity_measure="euclidean distance")
+        time_quant_plot(test_intensity, test_metadata, "Protein1", similarity=-1, similarity_measure="euclidean distance")
     return
 
 def test_time_series_plot_invalid_cosine_similarity(time_series_test_data):
     test_intensity, test_metadata = time_series_test_data
     with pytest.raises(ValueError):
-        time_series_plot_peptide(test_intensity, test_metadata, "Protein1", similarity=2, similarity_measure="cosine similarity")
+        time_quant_plot(test_intensity, test_metadata, "Protein1", similarity=2, similarity_measure="cosine similarity")
     return
\ No newline at end of file

From 776299519242fadeed3eb2c257cecfaf7e3d0485 Mon Sep 17 00:00:00 2001
From: AK <kuganash.ravishanker@student.hpi.de>
Date: Sat, 14 Sep 2024 13:07:36 +0200
Subject: [PATCH 44/52] Mapped TMT import

---
 ui/runs/form_mapping.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/ui/runs/form_mapping.py b/ui/runs/form_mapping.py
index 8f4793f1..a0e58689 100644
--- a/ui/runs/form_mapping.py
+++ b/ui/runs/form_mapping.py
@@ -17,6 +17,7 @@
     importing.MaxQuantImport: importing_forms.MaxQuantImportForm,
     importing.DiannImport: importing_forms.DiannImportForm,
     importing.MsFraggerImport: importing_forms.MSFraggerImportForm,
+    importing.TMTImport: importing_forms.TMTImportForm,
     importing.MetadataImport: importing_forms.MetadataImportForm,
     importing.MetadataImportMethodDiann: importing_forms.MetadataImportMethodDiannForm,
     importing.MetadataColumnAssignment: importing_forms.MetadataColumnAssignmentForm,
@@ -49,7 +50,7 @@
     data_analysis.PlotScatterPlot: data_analysis_forms.PlotScatterPlotForm,
     data_analysis.PlotClustergram: data_analysis_forms.PlotClustergramForm,
     data_analysis.PlotProtQuant: data_analysis_forms.PlotProtQuantForm,
-    data_analysis.PlotTimeSeriesPeptide: data_analysis_forms.PlotTimeSeriesForm,
+    data_analysis.PlotTimeQuant: data_analysis_forms.PlotTimeQuantForm,
     data_analysis.PlotPrecisionRecallCurve: data_analysis_forms.PlotPrecisionRecallCurveForm,
     data_analysis.PlotROC: data_analysis_forms.PlotROCCurveForm,
     data_analysis.ClusteringKMeans: data_analysis_forms.ClusteringKMeansForm,

From aaeed09dc2f534d57ddf7503a747fb0de97e0060 Mon Sep 17 00:00:00 2001
From: AK <kuganash.ravishanker@student.hpi.de>
Date: Sat, 14 Sep 2024 13:09:17 +0200
Subject: [PATCH 45/52] Added an option for the user to select the Time and
 Grouping column names

---
 .../time_series_regression_analysis.py        |  99 ++++++++----
 protzilla/methods/data_analysis.py            |  50 +++---
 ui/runs/forms/data_analysis.py                | 148 +++++++++---------
 3 files changed, 171 insertions(+), 126 deletions(-)

diff --git a/protzilla/data_analysis/time_series_regression_analysis.py b/protzilla/data_analysis/time_series_regression_analysis.py
index f579bb03..cd579295 100644
--- a/protzilla/data_analysis/time_series_regression_analysis.py
+++ b/protzilla/data_analysis/time_series_regression_analysis.py
@@ -60,7 +60,6 @@ def time_series_linear_regression(
     )
 
     intensity_df[time_column_name] = intensity_df[str(time_column_name)].apply(convert_time_to_hours)
-    intensity_df = intensity_df.interpolate(method='linear', axis=0)
 
     intensity_df = intensity_df.sample(frac=1, random_state = 42).reset_index(drop=True)
 
@@ -90,8 +89,8 @@ def time_series_linear_regression(
             train_r2 = r2_score(y_train, y_pred_train)
             test_r2 = r2_score(y_test, y_pred_test)
 
-            train_df = pd.DataFrame({'Time': X_train[time_column_name], 'Intensity': y_train, 'Predicted': y_pred_train, 'Type': 'Train'})
-            test_df = pd.DataFrame({'Time': X_test[time_column_name], 'Intensity': y_test, 'Predicted': y_pred_test, 'Type': 'Test'})
+            train_df = pd.DataFrame({time_column_name: X_train[time_column_name], 'Intensity': y_train, 'Predicted': y_pred_train, 'Type': 'Train'})
+            test_df = pd.DataFrame({time_column_name: X_test[time_column_name], 'Intensity': y_test, 'Predicted': y_pred_test, 'Type': 'Test'})
             plot_df = pd.concat([train_df, test_df])
 
             color = PROTZILLA_DISCRETE_COLOR_SEQUENCE[color_index % len(PROTZILLA_DISCRETE_COLOR_SEQUENCE)]
@@ -134,8 +133,11 @@ def time_series_linear_regression(
         train_r2 = r2_score(y_train, y_pred_train)
         test_r2 = r2_score(y_test, y_pred_test)
 
-        train_df = pd.DataFrame({'Time': X_train[time_column_name], 'Intensity': y_train, 'Predicted': y_pred_train, 'Type': 'Train'})
-        test_df = pd.DataFrame({'Time': X_test[time_column_name], 'Intensity': y_test, 'Predicted': y_pred_test, 'Type': 'Test'})
+        train_df = pd.DataFrame(
+            {time_column_name: X_train[time_column_name], 'Intensity': y_train, 'Predicted': y_pred_train,
+             'Type': 'Train'})
+        test_df = pd.DataFrame(
+            {time_column_name: X_test[time_column_name], 'Intensity': y_test, 'Predicted': y_pred_test, 'Type': 'Test'})
         plot_df = pd.concat([train_df, test_df])
 
         fig.add_trace(go.Scatter(
@@ -186,7 +188,7 @@ def time_series_linear_regression(
         yaxis_gridcolor=colors["gridcolor"],
         xaxis_linecolor=colors["linecolor"],
         yaxis_linecolor=colors["linecolor"],
-        xaxis_title="Time",
+        xaxis_title=time_column_name,
         yaxis_title="Intensity",
         legend_title="Legend",
         autosize=True,
@@ -254,7 +256,6 @@ def time_series_ransac_regression(
     )
 
     intensity_df[time_column_name] = intensity_df[str(time_column_name)].apply(convert_time_to_hours)
-    intensity_df = intensity_df.interpolate(method='linear', axis=0)
 
     intensity_df = intensity_df.sample(frac=1, random_state = 42).reset_index(drop=True)
 
@@ -286,31 +287,31 @@ def time_series_ransac_regression(
             train_r2 = r2_score(y_train[inlier_mask], y_pred_train[inlier_mask])
             test_r2 = r2_score(y_test, y_pred_test)
 
-            train_df = pd.DataFrame({'Time': X_train[time_column_name], 'Intensity': y_train, 'Predicted': y_pred_train, 'Type': 'Train'})
-            test_df = pd.DataFrame({'Time': X_test[time_column_name], 'Intensity': y_test, 'Predicted': y_pred_test, 'Type': 'Test'})
+            train_df = pd.DataFrame({time_column_name: X_train[time_column_name], 'Intensity': y_train, 'Predicted': y_pred_train, 'Type': 'Train'})
+            test_df = pd.DataFrame({time_column_name: X_test[time_column_name], 'Intensity': y_test, 'Predicted': y_pred_test, 'Type': 'Test'})
             train_df['Inlier'] = inlier_mask
             test_df['Inlier'] = False
             plot_df = pd.concat([train_df, test_df])
 
             # Add main plot traces
             fig.add_trace(go.Scatter(
-                x=plot_df['Time'],
+                x=plot_df[time_column_name],
                 y=plot_df['Intensity'],
                 mode='markers',
-                name='Actual Intensity',
+                name=f'Actual Intensity ({group})',
                 marker=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[color_index])
             ), row=1, col=1)
 
             fig.add_trace(go.Scatter(
-                x=plot_df['Time'],
+                x=plot_df[time_column_name],
                 y=plot_df['Predicted'],
                 mode='lines',
-                name='Predicted Intensity',
+                name=f'Predicted Intensity ({group})',
                 line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[color_index + 2])
             ), row=1, col=1)
 
             fig.add_trace(go.Scatter(
-                x=plot_df[plot_df['Inlier'] == False]['Time'],
+                x=plot_df[plot_df['Inlier'] == False][time_column_name],
                 y=plot_df[plot_df['Inlier'] == False]['Intensity'],
                 mode='markers',
                 name='Outliers',
@@ -342,15 +343,15 @@ def time_series_ransac_regression(
         train_r2 = r2_score(y_train[inlier_mask], y_pred_train[inlier_mask])
         test_r2 = r2_score(y_test, y_pred_test)
 
-        train_df = pd.DataFrame({'Time': X_train[time_column_name], 'Intensity': y_train, 'Predicted': y_pred_train, 'Type': 'Train'})
-        test_df = pd.DataFrame({'Time': X_test[time_column_name], 'Intensity': y_test, 'Predicted': y_pred_test, 'Type': 'Test'})
+        train_df = pd.DataFrame({time_column_name: X_train[time_column_name], 'Intensity': y_train, 'Predicted': y_pred_train, 'Type': 'Train'})
+        test_df = pd.DataFrame({time_column_name: X_test[time_column_name], 'Intensity': y_test, 'Predicted': y_pred_test, 'Type': 'Test'})
         train_df['Inlier'] = inlier_mask
         test_df['Inlier'] = False
         plot_df = pd.concat([train_df, test_df])
 
         # Add main plot traces
         fig.add_trace(go.Scatter(
-            x=plot_df['Time'],
+            x=plot_df[time_column_name],
             y=plot_df['Intensity'],
             mode='markers',
             name='Actual Intensity',
@@ -358,7 +359,7 @@ def time_series_ransac_regression(
         ), row=1, col=1)
 
         fig.add_trace(go.Scatter(
-            x=plot_df['Time'],
+            x=plot_df[time_column_name],
             y=plot_df['Predicted'],
             mode='lines',
             name='Predicted Intensity',
@@ -366,7 +367,7 @@ def time_series_ransac_regression(
         ), row=1, col=1)
 
         fig.add_trace(go.Scatter(
-            x=plot_df[plot_df['Inlier'] == False]['Time'],
+            x=plot_df[plot_df['Inlier'] == False][time_column_name],
             y=plot_df[plot_df['Inlier'] == False]['Intensity'],
             mode='markers',
             name='Outliers',
@@ -405,7 +406,7 @@ def time_series_ransac_regression(
         yaxis_gridcolor=colors["gridcolor"],
         xaxis_linecolor=colors["linecolor"],
         yaxis_linecolor=colors["linecolor"],
-        xaxis_title="Time",
+        xaxis_title=time_column_name,
         yaxis_title="Intensity",
         legend_title="Legend",
         autosize=True,
@@ -553,7 +554,6 @@ def time_series_auto_arima(
             group_df = intensity_df[intensity_df[grouping_column_name] == group]
 
             group_df[time_column_name] = group_df[str(time_column_name)].apply(convert_time_to_hours)
-            group_df = group_df.interpolate(method='linear', axis=0)
 
             train_df_size = int(len(group_df) * train_size)
             train_df, test_df = group_df[:train_df_size], group_df[train_df_size:]
@@ -574,6 +574,24 @@ def time_series_auto_arima(
 
             # Forecast the test set
             forecast = model.predict(n_periods=test_df.shape[0])
+            parameters = model.get_params()
+            aa_order = parameters['order']
+            aa_seasonal_order = parameters['seasonal_order']
+            messages = []
+
+            messages.append(
+                {
+                    "level": logging.INFO,
+                    "msg": f"Auto Arima Order (p,d,q): {aa_order}.",
+                }
+            )
+            if seasonal:
+                messages.append(
+                    {
+                        "level": logging.INFO,
+                        "msg": f"Auto Arima Seasonal Order (P,D,Q,s): {aa_seasonal_order}.",
+                    }
+                )
 
             test_rmse = np.sqrt(mean_squared_error(test_df, forecast))
             test_r2 = r2_score(test_df, forecast)
@@ -588,7 +606,7 @@ def time_series_auto_arima(
                 x=test_df.index,
                 y=test_df,
                 mode='markers',
-                name='Actual Intensity',
+                name=f'Actual Intensity ({group})',
                 marker=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[color_index])
             ), row=1, col=1)
 
@@ -596,7 +614,7 @@ def time_series_auto_arima(
                 x=test_df.index,
                 y=forecast,
                 mode='markers',
-                name='Predicted Intensity',
+                name=f'Predicted Intensity ({group})',
                 line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[color_index + 3])
             ), row=1, col=1)
 
@@ -604,7 +622,7 @@ def time_series_auto_arima(
                 x = forecast_plot.index,
                 y = forecast_plot,
                 mode = 'lines',
-                name = 'Mean Predicted Intensity',
+                name = f'Mean Predicted Intensity ({group})',
                 line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[color_index + 3])
             ), row=1, col=1)
 
@@ -620,7 +638,6 @@ def time_series_auto_arima(
 
     else:
         intensity_df[time_column_name] = intensity_df[str(time_column_name)].apply(convert_time_to_hours)
-        intensity_df = intensity_df.interpolate(method='linear', axis=0)
 
         train_size = int(len(intensity_df) * train_size)
         train_df, test_df = intensity_df[:train_size], intensity_df[train_size:]
@@ -641,6 +658,25 @@ def time_series_auto_arima(
 
         # Forecast the test set
         forecast = model.predict(n_periods=test_df.shape[0])
+        parameters = model.get_params()
+
+        aa_order = parameters['order']
+        aa_seasonal_order = parameters['seasonal_order']
+        messages = []
+
+        messages.append(
+            {
+                "level": logging.INFO,
+                "msg": f"Auto Arima Order (p,d,q): {aa_order}.",
+            }
+        )
+        if seasonal:
+            messages.append(
+                {
+                    "level": logging.INFO,
+                    "msg": f"Auto Arima Seasonal Order (P,D,Q,s): {aa_seasonal_order}.",
+                }
+            )
 
         test_rmse = np.sqrt(mean_squared_error(test_df, forecast))
         test_r2 = r2_score(test_df, forecast)
@@ -707,7 +743,7 @@ def time_series_auto_arima(
         yaxis_gridcolor=colors["gridcolor"],
         xaxis_linecolor=colors["linecolor"],
         yaxis_linecolor=colors["linecolor"],
-        xaxis_title="Time",
+        xaxis_title=time_column_name,
         yaxis_title="Intensity",
         legend_title="Legend",
         autosize=True,
@@ -725,7 +761,6 @@ def time_series_auto_arima(
 
     fig.update_annotations(font_size=12)
 
-
     return dict(
         scores=scores,
         plots=[fig],
@@ -791,7 +826,6 @@ def time_series_arima(
             group_df = intensity_df[intensity_df[grouping_column_name] == group]
 
             group_df[time_column_name] = group_df[str(time_column_name)].apply(convert_time_to_hours)
-            group_df = group_df.interpolate(method='linear', axis=0)
 
             train_df_size = int(len(group_df) * train_size)
             train_df, test_df = group_df[:train_df_size], group_df[train_df_size:]
@@ -828,7 +862,7 @@ def time_series_arima(
                 x=test_df.index,
                 y=test_df,
                 mode='markers',
-                name='Actual Intensity',
+                name=f'Actual Intensity ({group})',
                 marker=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[color_index])
             ), row=1, col=1)
 
@@ -836,7 +870,7 @@ def time_series_arima(
                 x=forecast_plot.index,
                 y=forecast_plot,
                 mode='markers',
-                name='Predicted Intensity',
+                name= f'Predicted Intensity ({group})',
                 line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[color_index + 2])
             ), row=1, col=1)
 
@@ -844,7 +878,7 @@ def time_series_arima(
                 x = forecast_mean_plot.index,
                 y = forecast_mean_plot,
                 mode = 'lines',
-                name = 'Mean Predicted Intensity',
+                name = f'Mean Predicted Intensity ({group})',
                 line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[color_index + 2])
             ), row=1, col=1)
 
@@ -860,7 +894,6 @@ def time_series_arima(
 
     else:
         intensity_df[time_column_name] = intensity_df[str(time_column_name)].apply(convert_time_to_hours)
-        intensity_df = intensity_df.interpolate(method='linear', axis=0)
 
         train_size = int(len(intensity_df) * train_size)
         train_df, test_df = intensity_df[:train_size], intensity_df[train_size:]
@@ -945,7 +978,7 @@ def time_series_arima(
         yaxis_gridcolor=colors["gridcolor"],
         xaxis_linecolor=colors["linecolor"],
         yaxis_linecolor=colors["linecolor"],
-        xaxis_title="Time",
+        xaxis_title=time_column_name,
         yaxis_title="Intensity",
         legend_title="Legend",
         autosize=True,
diff --git a/protzilla/methods/data_analysis.py b/protzilla/methods/data_analysis.py
index 9f5261d1..bf05e4cb 100644
--- a/protzilla/methods/data_analysis.py
+++ b/protzilla/methods/data_analysis.py
@@ -28,7 +28,7 @@
     prot_quant_plot,
     scatter_plot,
 )
-from protzilla.data_analysis.time_series_plot_peptide import time_series_plot_peptide
+from protzilla.data_analysis.time_series_plots import time_quant_plot
 from protzilla.data_analysis.protein_graphs import peptides_to_isoform, variation_graph
 from protzilla.data_analysis.ptm_analysis import (
     filter_peptides_of_protein,
@@ -344,27 +344,6 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict:
         )
         return inputs
 
-class PlotTimeSeriesPeptide(PlotStep):
-    display_name = "Time Quantification Plot For Peptide"
-    operation = "plot"
-    method_description = (
-        "Creates a line chart for intensity across Time for protein groups"
-    )
-
-    input_keys = ["input_df", "metadata_df", "protein_group", "similarity_measure", "similarity"]
-    output_keys = []
-
-    def method(self, inputs: dict) -> dict:
-        return time_series_plot_peptide(**inputs)
-
-
-    def insert_dataframes(self, steps: StepManager, inputs) -> dict:
-        inputs["input_df"] = steps.get_step_output(
-            Step, "peptide_df", inputs["input_df"]
-        )
-        inputs["metadata_df"] = steps.metadata_df
-        return inputs
-
 
 class PlotPrecisionRecallCurve(PlotStep):
     display_name = "Precision Recall"
@@ -796,6 +775,33 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict:
         return inputs
 
 
+class PlotTimeQuant(PlotStep):
+    display_name = "Time Quantification Plot For Protein"
+    operation = "Time series analysis"
+    method_description = (
+        "Creates a line chart for intensity across Time for protein groups"
+    )
+
+    input_keys = [
+        "intensity_df",
+        "metadata_df",
+        "time_column_name",
+        "protein_group",
+        "similarity_measure",
+        "similarity"
+    ]
+    output_keys = []
+
+    def method(self, inputs: dict) -> dict:
+        return time_quant_plot(**inputs)
+
+
+    def insert_dataframes(self, steps: StepManager, inputs) -> dict:
+        inputs["intensity_df"] = steps.protein_df
+        inputs["metadata_df"] = steps.metadata_df
+        return inputs
+
+
 class TimeSeriesLinearRegression(PlotStep):
     display_name = "Linear Regression"
     operation = "Time series analysis"
diff --git a/ui/runs/forms/data_analysis.py b/ui/runs/forms/data_analysis.py
index 12328911..e682c879 100644
--- a/ui/runs/forms/data_analysis.py
+++ b/ui/runs/forms/data_analysis.py
@@ -540,77 +540,6 @@ def fill_form(self, run: Run) -> None:
                 self.data["similarity"] = 1
 
 
-class PlotTimeSeriesForm(MethodForm):
-    is_dynamic = True
-
-    input_df = CustomChoiceField(
-        choices=[],
-        label="Choose dataframe to be plotted",
-    )
-    protein_group = CustomChoiceField(
-        choices=[],
-        label="Protein group: choose highlighted protein group",
-    )
-    similarity_measure = CustomChoiceField(
-        choices=SimilarityMeasure,
-        label="Similarity Measurement: choose how to compare protein groups",
-        initial=SimilarityMeasure.euclidean_distance,
-    )
-    similarity = CustomNumberField(
-        label="Similarity", min_value=-1, max_value=999, step_size=1, initial=1
-    )
-
-    def fill_form(self, run: Run) -> None:
-        self.fields["input_df"].choices = fill_helper.get_choices_for_peptide_df_steps(
-            run
-        )
-
-        input_df_instance_id = self.data.get(
-            "input_df", self.fields["input_df"].choices[0][0]
-        )
-
-        self.fields["protein_group"].choices = fill_helper.to_choices(
-            run.steps.get_step_output(
-                step_type=Step,
-                output_key="peptide_df",
-                instance_identifier=input_df_instance_id,
-            )["Protein ID"].unique()
-        )
-
-        similarity_measure = self.data.get(
-            "similarity_measure", self.fields["similarity_measure"].choices[0][0]
-        )
-        self.data = self.data.copy()
-        if similarity_measure == SimilarityMeasure.cosine_similarity:
-            self.fields["similarity"] = CustomFloatField(
-                label="Cosine Similarity",
-                min_value=-1,
-                max_value=1,
-                step_size=0.1,
-                initial=0,
-            )
-            if (
-                "similarity" not in self.data
-                or float(self.data["similarity"]) < -1
-                or float(self.data["similarity"]) > 1
-            ):
-                self.data["similarity"] = 0
-        else:
-            self.fields["similarity"] = CustomNumberField(
-                label="Euclidean Distance",
-                min_value=0,
-                max_value=999,
-                step_size=1,
-                initial=1,
-            )
-            if (
-                "similarity" not in self.data
-                or float(self.data["similarity"]) < 0
-                or float(self.data["similarity"]) > 999
-            ):
-                self.data["similarity"] = 1
-
-
 class PlotPrecisionRecallCurveForm(MethodForm):
     # Todo: Input
     plot_title = CustomCharField(
@@ -1234,6 +1163,83 @@ def fill_form(self, run: Run) -> None:
             self.fields["peptide_df"].initial = single_protein_peptides[0]
 
 
+class PlotTimeQuantForm(MethodForm):
+    is_dynamic = True
+
+    intensity_df = CustomChoiceField(
+        choices=[],
+        label="Choose dataframe to be plotted",
+    )
+    time_column_name = CustomChoiceField(choices=[], label="Time: The column name from metadata that represents time")
+    protein_group = CustomChoiceField(
+        choices=[],
+        label="Protein group: choose highlighted protein group",
+    )
+    similarity_measure = CustomChoiceField(
+        choices=SimilarityMeasure,
+        label="Similarity Measurement: choose how to compare protein groups",
+        initial=SimilarityMeasure.euclidean_distance,
+    )
+    similarity = CustomNumberField(
+        label="Similarity", min_value=-1, max_value=999, step_size=1, initial=1
+    )
+
+    def fill_form(self, run: Run) -> None:
+        self.fields["intensity_df"].choices = fill_helper.get_choices_for_protein_df_steps(
+            run
+        )
+
+        input_df_instance_id = self.data.get(
+            "intensity_df", self.fields["intensity_df"].choices[0][0]
+        )
+        self.fields[
+            "time_column_name"
+        ].choices = fill_helper.get_choices_for_metadata_non_sample_columns(run)
+
+        self.fields["protein_group"].choices = fill_helper.to_choices(
+            run.steps.get_step_output(
+                step_type=Step,
+                output_key="protein_df",
+                instance_identifier=input_df_instance_id,
+            )["Protein ID"].unique()
+        )
+
+        similarity_measure = self.data.get(
+            "similarity_measure", self.fields["similarity_measure"].choices[0][0]
+        )
+        self.data = self.data.copy()
+        if similarity_measure == SimilarityMeasure.cosine_similarity:
+            self.fields["similarity"] = CustomFloatField(
+                label="Cosine Similarity",
+                min_value=-1,
+                max_value=1,
+                step_size=0.1,
+                initial=0,
+            )
+            if (
+                    "similarity" not in self.data
+                    or float(self.data["similarity"]) < -1
+                    or float(self.data["similarity"]) > 1
+            ):
+                self.data["similarity"] = 0
+        else:
+            self.fields["similarity"] = CustomNumberField(
+                label="Euclidean Distance",
+                min_value=0,
+                max_value=999,
+                step_size=1,
+                initial=1,
+            )
+            if (
+                    "similarity" not in self.data
+                    or float(self.data["similarity"]) < 0
+                    or float(self.data["similarity"]) > 999
+            ):
+                self.data["similarity"] = 1
+
+
+
+
 class TimeSeriesLinearRegressionForm(MethodForm):
     is_dynamic = True
     intensity_df = CustomChoiceField(

From 25c37d6edd70b6cb0359542dce456529082411b2 Mon Sep 17 00:00:00 2001
From: AK <kuganash.ravishanker@student.hpi.de>
Date: Sat, 14 Sep 2024 14:51:44 +0200
Subject: [PATCH 46/52] Resolved some comments from Hendrik

---
 protzilla/data_analysis/time_series_helper.py |  5 +-
 .../time_series_regression_analysis.py        | 64 ++++++++-----------
 protzilla/methods/data_analysis.py            | 18 ++++--
 ui/runs/forms/data_analysis.py                | 44 +++++--------
 4 files changed, 59 insertions(+), 72 deletions(-)

diff --git a/protzilla/data_analysis/time_series_helper.py b/protzilla/data_analysis/time_series_helper.py
index 0fb294ed..e643fe93 100644
--- a/protzilla/data_analysis/time_series_helper.py
+++ b/protzilla/data_analysis/time_series_helper.py
@@ -6,7 +6,10 @@ def convert_time_to_hours(time_str):
     :param time_str: The time string to convert in format '%H:%M:%S'
 
     :return: Number of hours since midnight as a float
+    """
+
     """
     time_obj = datetime.strptime(time_str, '%H:%M:%S')
     hours_since_midnight = time_obj.hour + time_obj.minute / 60 + time_obj.second / 3600
-    return hours_since_midnight
\ No newline at end of file
+    """
+    return time_str
\ No newline at end of file
diff --git a/protzilla/data_analysis/time_series_regression_analysis.py b/protzilla/data_analysis/time_series_regression_analysis.py
index cd579295..984754fe 100644
--- a/protzilla/data_analysis/time_series_regression_analysis.py
+++ b/protzilla/data_analysis/time_series_regression_analysis.py
@@ -4,7 +4,7 @@
 import pandas as pd
 import plotly.graph_objects as go
 
-from protzilla.data_analysis.time_series_helper import convert_time_to_hours
+#from protzilla.data_analysis.time_series_helper import convert_time_to_hours
 from protzilla.utilities import default_intensity_column
 from protzilla.constants.colors import PROTZILLA_DISCRETE_COLOR_SEQUENCE
 
@@ -29,10 +29,10 @@ def time_series_linear_regression(
         intensity_df: pd.DataFrame,
         metadata_df: pd.DataFrame,
         time_column_name: str,
-        protein_group: str,
         train_size: float,
+        protein_group: str,
+        grouping: str,
         grouping_column_name: str,
-        grouping: str = None,
 ):
     """
     Perform linear regression on the time series data for a given protein group.
@@ -59,8 +59,6 @@ def time_series_linear_regression(
         copy=False,
     )
 
-    intensity_df[time_column_name] = intensity_df[str(time_column_name)].apply(convert_time_to_hours)
-
     intensity_df = intensity_df.sample(frac=1, random_state = 42).reset_index(drop=True)
 
     X = intensity_df[[time_column_name]]
@@ -222,8 +220,8 @@ def time_series_ransac_regression(
         stop_probability: float,
         loss: str,
         train_size: float,
-        grouping_column_name: str,
         grouping: str,
+        grouping_column_name: str,
 ):
     """
     Perform RANSAC regression on the time series data for a given protein group.
@@ -255,8 +253,6 @@ def time_series_ransac_regression(
         copy=False,
     )
 
-    intensity_df[time_column_name] = intensity_df[str(time_column_name)].apply(convert_time_to_hours)
-
     intensity_df = intensity_df.sample(frac=1, random_state = 42).reset_index(drop=True)
 
     X = intensity_df[[time_column_name]]
@@ -298,7 +294,7 @@ def time_series_ransac_regression(
                 x=plot_df[time_column_name],
                 y=plot_df['Intensity'],
                 mode='markers',
-                name=f'Actual Intensity ({group})',
+                name=f'Inliers ({group})',
                 marker=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[color_index])
             ), row=1, col=1)
 
@@ -354,7 +350,7 @@ def time_series_ransac_regression(
             x=plot_df[time_column_name],
             y=plot_df['Intensity'],
             mode='markers',
-            name='Actual Intensity',
+            name='Inliers',
             marker=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[0])
         ), row=1, col=1)
 
@@ -507,8 +503,8 @@ def time_series_auto_arima(
     seasonal: str,
     m: int,
     train_size: float,
-    grouping_column_name: str,
     grouping: str,
+    grouping_column_name: str,
 ) -> dict:
     """
     Perform an automatic ARIMA model selection on the time series data for a given protein group.
@@ -526,6 +522,7 @@ def time_series_auto_arima(
     """
 
     color_index = 0
+    messages = []
 
     if train_size < 0 or train_size > 1:
         raise ValueError("Train size should be between 0 and 1")
@@ -553,8 +550,6 @@ def time_series_auto_arima(
         for group in groups:
             group_df = intensity_df[intensity_df[grouping_column_name] == group]
 
-            group_df[time_column_name] = group_df[str(time_column_name)].apply(convert_time_to_hours)
-
             train_df_size = int(len(group_df) * train_size)
             train_df, test_df = group_df[:train_df_size], group_df[train_df_size:]
 
@@ -575,23 +570,6 @@ def time_series_auto_arima(
             # Forecast the test set
             forecast = model.predict(n_periods=test_df.shape[0])
             parameters = model.get_params()
-            aa_order = parameters['order']
-            aa_seasonal_order = parameters['seasonal_order']
-            messages = []
-
-            messages.append(
-                {
-                    "level": logging.INFO,
-                    "msg": f"Auto Arima Order (p,d,q): {aa_order}.",
-                }
-            )
-            if seasonal:
-                messages.append(
-                    {
-                        "level": logging.INFO,
-                        "msg": f"Auto Arima Seasonal Order (P,D,Q,s): {aa_seasonal_order}.",
-                    }
-                )
 
             test_rmse = np.sqrt(mean_squared_error(test_df, forecast))
             test_r2 = r2_score(test_df, forecast)
@@ -635,10 +613,24 @@ def time_series_auto_arima(
                 'train_r2_score': train_r2,
                 'test_r2_score': test_r2,
             })
+        aa_order = parameters['order']
+        aa_seasonal_order = parameters['seasonal_order']
 
-    else:
-        intensity_df[time_column_name] = intensity_df[str(time_column_name)].apply(convert_time_to_hours)
+        messages.append(
+            {
+                "level": logging.INFO,
+                "msg": f"Auto Arima Order (p,d,q): {aa_order}.",
+            }
+        )
+        if seasonal:
+            messages.append(
+                {
+                    "level": logging.INFO,
+                    "msg": f"Auto Arima Seasonal Order (P,D,Q,s): {aa_seasonal_order}.",
+                }
+            )
 
+    else:
         train_size = int(len(intensity_df) * train_size)
         train_df, test_df = intensity_df[:train_size], intensity_df[train_size:]
 
@@ -662,7 +654,6 @@ def time_series_auto_arima(
 
         aa_order = parameters['order']
         aa_seasonal_order = parameters['seasonal_order']
-        messages = []
 
         messages.append(
             {
@@ -764,6 +755,7 @@ def time_series_auto_arima(
     return dict(
         scores=scores,
         plots=[fig],
+        messages=messages,
     )
 
 
@@ -781,8 +773,8 @@ def time_series_arima(
     Q: int,
     s: int,
     train_size: float,
-    grouping_column_name: str,
     grouping: str,
+    grouping_column_name: str,
 ) -> dict:
 
     """
@@ -825,8 +817,6 @@ def time_series_arima(
         for group in groups:
             group_df = intensity_df[intensity_df[grouping_column_name] == group]
 
-            group_df[time_column_name] = group_df[str(time_column_name)].apply(convert_time_to_hours)
-
             train_df_size = int(len(group_df) * train_size)
             train_df, test_df = group_df[:train_df_size], group_df[train_df_size:]
 
@@ -893,8 +883,6 @@ def time_series_arima(
             })
 
     else:
-        intensity_df[time_column_name] = intensity_df[str(time_column_name)].apply(convert_time_to_hours)
-
         train_size = int(len(intensity_df) * train_size)
         train_df, test_df = intensity_df[:train_size], intensity_df[train_size:]
 
diff --git a/protzilla/methods/data_analysis.py b/protzilla/methods/data_analysis.py
index bf05e4cb..3201cc44 100644
--- a/protzilla/methods/data_analysis.py
+++ b/protzilla/methods/data_analysis.py
@@ -815,8 +815,8 @@ class TimeSeriesLinearRegression(PlotStep):
         "time_column_name",
         "protein_group",
         "train_size",
-        "grouping_column_name",
         "grouping",
+        "grouping_column_name",
     ]
     output_keys = [
         "scores",
@@ -845,8 +845,8 @@ class TimeSeriesRANSACRegression(PlotStep):
         "stop_probability",
         "loss",
         "train_size",
-        "grouping_column_name",
         "grouping",
+        "grouping_column_name",
     ]
     output_keys = [
         "scores",
@@ -863,7 +863,15 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict:
 class TimeSeriesADFullerTest(DataAnalysisStep):
     display_name = "Augmented Dickey-Fuller Test"
     operation = "Time series analysis"
-    method_description = "Perform Augmented Dickey-Fuller test on the time series data for a given protein group."
+    method_description = (
+        "The Augmented Dickey-Fuller test is a type of statistical test called a unit root test. The test "
+        "determines how strongly a time series is defined by a trend. The null hypothesis of the test is that the "
+         "time series can be represented by a unit root, which implies that the time series is not stationary. "
+         "The alternative hypothesis is that the time series is stationary. If the p-value is less than the "
+          "significance level, the null hypothesis can be rejected and the time series is considered stationary."
+          "Dickey, D. & Fuller, Wayne. (1979). Distribution of the Estimators for Autoregressive Time Series With a Unit Root. "
+          "JASA. Journal of the American Statistical Association. 74. 10.2307/2286348. "
+    )
 
     input_keys = [
         "intensity_df",
@@ -902,8 +910,8 @@ class TimeSeriesAutoARIMA(PlotStep):
         "seasonal",
         "m",
         "train_size",
-        "grouping_column_name",
         "grouping",
+        "grouping_column_name",
     ]
     output_keys = [
         "scores",
@@ -939,8 +947,8 @@ class TimeSeriesARIMA(PlotStep):
         "Q",
         "s",
         "train_size",
-        "grouping_column_name",
         "grouping",
+        "grouping_column_name",
     ]
     output_keys = [
         "scores",
diff --git a/ui/runs/forms/data_analysis.py b/ui/runs/forms/data_analysis.py
index e682c879..4cdc84fd 100644
--- a/ui/runs/forms/data_analysis.py
+++ b/ui/runs/forms/data_analysis.py
@@ -1258,12 +1258,12 @@ class TimeSeriesLinearRegressionForm(MethodForm):
         step_size=0.1,
         initial=0.8
     )
-    grouping_column_name = CustomChoiceField(choices=[], label="Grouping from metadata: The column name from metadata that represents the grouping")
     grouping = CustomChoiceField(
         choices= TimeSeriesGrouping,
         label="Option to select whether regression should be performed on the entire dataset or separately on the control and experimental groups",
         initial=TimeSeriesGrouping.with_grouping
     )
+    grouping_column_name = CustomChoiceField(choices=[], label="Grouping from metadata: The column name from metadata that represents the grouping")
 
 
     def fill_form(self, run: Run) -> None:
@@ -1288,6 +1288,9 @@ def fill_form(self, run: Run) -> None:
                 instance_identifier=input_df_instance_id,
             )["Protein ID"].unique()
         )
+        grouping = self.data.get("grouping")
+        if grouping == "Without Grouping":
+            self.toggle_visibility("grouping_column_name", False)
 
 
 class TimeSeriesRANSACRegressionForm(MethodForm):
@@ -1326,12 +1329,12 @@ class TimeSeriesRANSACRegressionForm(MethodForm):
         step_size=0.1,
         initial=0.8
     )
-    grouping_column_name = CustomChoiceField(choices=[], label="Grouping from metadata: The column name from metadata that represents the grouping")
     grouping = CustomChoiceField(
         choices= TimeSeriesGrouping,
         label="Option to select whether regression should be performed on the entire dataset or separately on the control and experimental groups",
         initial=TimeSeriesGrouping.with_grouping
     )
+    grouping_column_name = CustomChoiceField(choices=[], label="Grouping from metadata: The column name from metadata that represents the grouping")
 
 
     def fill_form(self, run: Run) -> None:
@@ -1356,22 +1359,13 @@ def fill_form(self, run: Run) -> None:
                 instance_identifier=input_df_instance_id,
             )["Protein ID"].unique()
         )
+        grouping = self.data.get("grouping")
+        if grouping == "Without Grouping":
+            self.toggle_visibility("grouping_column_name", False)
 
 
 class TimeSeriesADFullerTestForm(MethodForm):
     is_dynamic = True
-    test_info = TextDisplayField(
-        label="Information about the Augmented Dickey-Fuller test",
-        text=(
-            "The Augmented Dickey-Fuller test is a type of statistical test called a unit root test. The test "
-             "determines how strongly a time series is defined by a trend. The null hypothesis of the test is that the "
-             "time series can be represented by a unit root, which implies that the time series is not stationary. "
-             "The alternative hypothesis is that the time series is stationary. If the p-value is less than the "
-             "significance level, the null hypothesis can be rejected and the time series is considered stationary.<br>"
-             "Dickey, D. & Fuller, Wayne. (1979). Distribution of the Estimators for Autoregressive Time Series With a Unit Root."
-             "JASA. Journal of the American Statistical Association. 74. 10.2307/2286348. "
-        ),
-    )
     intensity_df = CustomChoiceField(
         choices=[],
         label="Intensity dataframe",
@@ -1404,11 +1398,6 @@ def fill_form(self, run: Run) -> None:
 
 class TimeSeriesAutoARIMAForm(MethodForm):
     is_dynamic = True
-    model_info = TextDisplayField(
-        label="Citation for AutoARIMA model",
-        text=(
-        ),
-    )
     intensity_df = CustomChoiceField(
         choices=[],
         label="Intensity dataframe",
@@ -1436,12 +1425,12 @@ class TimeSeriesAutoARIMAForm(MethodForm):
         step_size=0.1,
         initial=0.8,
     )
-    grouping_column_name = CustomChoiceField(choices=[], label="Grouping from metadata: The column name from metadata that represents the grouping")
     grouping = CustomChoiceField(
         choices= TimeSeriesGrouping,
         label="Option to select whether regression should be performed on the entire dataset or separately on the control and experimental groups",
         initial=TimeSeriesGrouping.with_grouping
     )
+    grouping_column_name = CustomChoiceField(choices=[], label="Grouping from metadata: The column name from metadata that represents the grouping")
 
 
     def fill_form(self, run: Run) -> None:
@@ -1466,17 +1455,13 @@ def fill_form(self, run: Run) -> None:
                 instance_identifier=input_df_instance_id,
             )["Protein ID"].unique()
         )
+        grouping = self.data.get("grouping")
+        if grouping == "Without Grouping":
+            self.toggle_visibility("grouping_column_name", False)
 
 
 class TimeSeriesARIMAForm(MethodForm):
     is_dynamic = True
-    """
-    model_info = TextDisplayField(
-        label="Citation for ARIMA model",
-        text=(
-        ),
-    )
-    """
     intensity_df = CustomChoiceField(
         choices=[],
         label="Intensity dataframe",
@@ -1544,12 +1529,12 @@ class TimeSeriesARIMAForm(MethodForm):
         step_size=0.1,
         initial=0.8,
     )
-    grouping_column_name = CustomChoiceField(choices=[], label="Grouping from metadata: The column name from metadata that represents the grouping")
     grouping = CustomChoiceField(
         choices= TimeSeriesGrouping,
         label="Option to select whether regression should be performed on the entire dataset or separately on the control and experimental groups",
         initial=TimeSeriesGrouping.with_grouping
     )
+    grouping_column_name = CustomChoiceField(choices=[], label="Grouping from metadata: The column name from metadata that represents the grouping")
 
 
     def fill_form(self, run: Run) -> None:
@@ -1574,6 +1559,9 @@ def fill_form(self, run: Run) -> None:
                 instance_identifier=input_df_instance_id,
             )["Protein ID"].unique()
         )
+        grouping = self.data.get("grouping")
+        if grouping == "Without Grouping":
+            self.toggle_visibility("grouping_column_name", False)
         seasonal = self.data.get("seasonal")
         if seasonal == "No":
             self.toggle_visibility("P", False)

From f8b556c805b96fba072e2430effe184f5a22e893 Mon Sep 17 00:00:00 2001
From: AK <kuganash.ravishanker@student.hpi.de>
Date: Sun, 15 Sep 2024 14:56:52 +0200
Subject: [PATCH 47/52] Fixed Tests

---
 .../test_time_series_analysis.py              | 55 ++++++++++---------
 .../data_analysis/test_time_series_plots.py   | 14 ++---
 2 files changed, 35 insertions(+), 34 deletions(-)

diff --git a/tests/protzilla/data_analysis/test_time_series_analysis.py b/tests/protzilla/data_analysis/test_time_series_analysis.py
index 4962eb22..20017922 100644
--- a/tests/protzilla/data_analysis/test_time_series_analysis.py
+++ b/tests/protzilla/data_analysis/test_time_series_analysis.py
@@ -56,13 +56,13 @@ def time_series_test_data():
     )
 
     test_metadata_df = (
-        ["Sample1", "02:00:00", "1"],
-        ["Sample2", "06:00:00", "1"],
-        ["Sample3", "10:00:00", "1"],
-         ["Sample4", "14:00:00", "1"],
-        ["Sample5", "2:00:00", "2"],
-        ["Sample6", "4:00:00", "2"],
-        ["Sample7", "6:00:00", "2"],
+        ["Sample1", "2", "1"],
+        ["Sample2", "6", "1"],
+        ["Sample3", "7", "1"],
+         ["Sample4", "8", "1"],
+        ["Sample5", "2", "2"],
+        ["Sample6", "6", "2"],
+        ["Sample7", "7", "2"],
     )
     test_metadata_df = pd.DataFrame(
         data=test_metadata_df,
@@ -76,8 +76,8 @@ def test_linear_regression_plot_with_grouping(show_figures, time_series_test_dat
         test_intensity,
         test_metadata,
         "Time",
-        "Protein1", #
         0.8,
+        "Protein1",
         "Group",
         "With Grouping"
     )
@@ -93,10 +93,10 @@ def test_linear_regression_plot_without_grouping(show_figures, time_series_test_
         test_intensity,
         test_metadata,
         "Time",
-        "Protein1", #
         0.8,
+        "Protein1",
+        "With Grouping",
         "Group",
-        "With Grouping"
     )
     assert "plots" in outputs
     fig = outputs["plots"][0]
@@ -111,10 +111,10 @@ def test_linear_regression_plot_invalid_train_size(time_series_test_data):
             test_intensity,
             test_metadata,
             "Time",
-            "Protein1",  #
             2,
+            "Protein1",
+            "With Grouping",
             "Group",
-            "With Grouping"
         )
     return
 
@@ -124,10 +124,10 @@ def test_linear_regression_outputs(time_series_test_data):
         test_intensity,
         test_metadata,
         "Time",
-        "Protein1", #
         0.8,
+        "Protein1",
+        "With Grouping",
         "Group",
-        "With Grouping"
     )
     assert "scores" in outputs
     return
@@ -144,8 +144,8 @@ def test_ransac_regression_plot_with_grouping(show_figures, time_series_test_dat
         0.99,
         "absolute_error",
         0.8,
+        "With Grouping",
         "Group",
-        "With Grouping"
     )
     assert "plots" in outputs
     fig = outputs["plots"][0]
@@ -164,8 +164,9 @@ def test_ransac_regression_plot_without_grouping(show_figures, time_series_test_
         0.99,
         "absolute_error",
         0.8,
+        "With Grouping",
         "Group",
-        "With Grouping"
+
     )
     assert "plots" in outputs
     fig = outputs["plots"][0]
@@ -185,8 +186,8 @@ def test_ransac_plot_invalid_train_size(time_series_test_data):
             0.99,
             "absolute_error",
             2,
+            "With Grouping",
             "Group",
-            "With Grouping"
         )
     return
 
@@ -201,8 +202,8 @@ def test_ransac_regression_outputs(time_series_test_data):
         0.99,
         "absolute_error",
         0.8,
+        "With Grouping",
         "Group",
-        "With Grouping"
     )
     assert "scores" in outputs
     return
@@ -230,8 +231,8 @@ def test_auto_arima_plot_with_grouping(show_figures, time_series_test_data):
         "No",
         1,
         0.5,
+        "With Grouping",
         "Group",
-        "With Grouping"
     )
     assert "plots" in outputs
     fig = outputs["plots"][0]
@@ -249,8 +250,8 @@ def test_auto_arima_plot_without_grouping(show_figures, time_series_test_data):
         "No",
         1,
         0.5,
+        "With Grouping",
         "Group",
-        "With Grouping"
     )
     assert "plots" in outputs
     fig = outputs["plots"][0]
@@ -269,8 +270,8 @@ def test_auto_arima_plot_invalid_train_size(time_series_test_data):
             "No",
             1,
             2,
+            "With Grouping",
             "Group",
-            "With Grouping"
         )
     return
 
@@ -285,8 +286,8 @@ def test_auto_arima_outputs(time_series_test_data):
         "No",
         1,
         0.5,
+        "With Grouping",
         "Group",
-        "With Grouping"
     )
     assert "scores" in outputs
     return
@@ -308,8 +309,8 @@ def test_arima_plot_with_grouping(show_figures, time_series_test_data):
         0,
         0,
         0.5,
+        "With Grouping",
         "Group",
-        "With Grouping"
     )
     assert "plots" in outputs
     fig = outputs["plots"][0]
@@ -333,8 +334,8 @@ def test_arima_plot_seasonal_with_grouping(show_figures, time_series_test_data):
         0,
         0,
         0.5,
+        "With Grouping",
         "Group",
-        "With Grouping"
     )
     assert "plots" in outputs
     fig = outputs["plots"][0]
@@ -358,8 +359,8 @@ def test_arima_plot_without_grouping(show_figures, time_series_test_data):
         0,
         0,
         0.5,
+        "With Grouping",
         "Group",
-        "With Grouping"
     )
     assert "plots" in outputs
     fig = outputs["plots"][0]
@@ -384,8 +385,8 @@ def test_arima_plot_invalid_train_size(time_series_test_data):
             0,
             0,
             2,
+            "With Grouping",
             "Group",
-            "With Grouping"
         )
     return
 
@@ -406,8 +407,8 @@ def test_arima_outputs(time_series_test_data):
         0,
         0,
         0.5,
-        "Group",
         "With Grouping",
+        "Group",
     )
     assert "scores" in outputs
     return
\ No newline at end of file
diff --git a/tests/protzilla/data_analysis/test_time_series_plots.py b/tests/protzilla/data_analysis/test_time_series_plots.py
index 46182092..ca3fe4fa 100644
--- a/tests/protzilla/data_analysis/test_time_series_plots.py
+++ b/tests/protzilla/data_analysis/test_time_series_plots.py
@@ -43,10 +43,10 @@ def time_series_test_data():
     )
 
     test_metadata_df = (
-        ["Sample1", "02:00:00", 1],
-        ["Sample2", "06:00:00", 1],
-        ["Sample3", "10:00:00", 1],
-         ["Sample4", "14:00:00", 1],
+        ["Sample1", "2", 1],
+        ["Sample2", "6", 1],
+        ["Sample3", "7", 1],
+         ["Sample4", "10", 1],
     )
     test_metadata_df = pd.DataFrame(
         data=test_metadata_df,
@@ -56,7 +56,7 @@ def time_series_test_data():
 
 def test_time_series_plot(show_figures, time_series_test_data):
     test_intensity, test_metadata = time_series_test_data
-    outputs = time_quant_plot(test_intensity, test_metadata, "Protein1")
+    outputs = time_quant_plot(test_intensity, test_metadata, "Time","Protein1")
     assert "plots" in outputs
     fig = outputs["plots"][0]
     if show_figures:
@@ -66,11 +66,11 @@ def test_time_series_plot(show_figures, time_series_test_data):
 def test_time_series_plot_invalid_euclidean_similarity(time_series_test_data):
     test_intensity, test_metadata = time_series_test_data
     with pytest.raises(ValueError):
-        time_quant_plot(test_intensity, test_metadata, "Protein1", similarity=-1, similarity_measure="euclidean distance")
+        time_quant_plot(test_intensity, test_metadata, "Time", "Protein1", similarity=-1, similarity_measure="euclidean distance")
     return
 
 def test_time_series_plot_invalid_cosine_similarity(time_series_test_data):
     test_intensity, test_metadata = time_series_test_data
     with pytest.raises(ValueError):
-        time_quant_plot(test_intensity, test_metadata, "Protein1", similarity=2, similarity_measure="cosine similarity")
+        time_quant_plot(test_intensity, test_metadata, "Time","Protein1", similarity=2, similarity_measure="cosine similarity")
     return
\ No newline at end of file

From 58ec156c094845fc6912a444821853a31a3e2eb1 Mon Sep 17 00:00:00 2001
From: AK <kuganash.ravishanker@student.hpi.de>
Date: Sun, 15 Sep 2024 15:24:04 +0200
Subject: [PATCH 48/52] Updated a variable name

---
 protzilla/data_analysis/time_series_plots.py  |  12 +-
 .../time_series_regression_analysis.py        | 124 +++++++++---------
 protzilla/methods/data_analysis.py            |  18 +--
 protzilla/utilities/transform_dfs.py          |  14 +-
 ui/runs/forms/data_analysis.py                |  44 +++----
 5 files changed, 106 insertions(+), 106 deletions(-)

diff --git a/protzilla/data_analysis/time_series_plots.py b/protzilla/data_analysis/time_series_plots.py
index 236c0e5a..9250c6a9 100644
--- a/protzilla/data_analysis/time_series_plots.py
+++ b/protzilla/data_analysis/time_series_plots.py
@@ -18,7 +18,7 @@
 def time_quant_plot(
     intensity_df: pd.DataFrame,
     metadata_df: pd.DataFrame,
-    time_column_name: str,
+    time_column: str,
     protein_group: str,
     similarity: float = 1.0,
     similarity_measure: str = "euclidean distance",
@@ -33,7 +33,7 @@ def time_quant_plot(
     :param intensity_df: A dataframe in protzilla wide format, where each row
         represents a sample and each column represents a feature.
     :param metadata_df: A dataframe containing the metadata of the samples.
-    :param time_column_name: The name of the column in the metadata_df that contains the time information.
+    :param time_column: The name of the column in the metadata_df that contains the time information.
     :param protein_group: Protein IDs as the columnheader of the dataframe
     :param similarity_measure: method to compare the chosen proteingroup with all others. The two
         methods are "cosine similarity" and "euclidean distance".
@@ -44,13 +44,13 @@ def time_quant_plot(
 
     intensity_df = pd.merge(
         left=intensity_df,
-        right=metadata_df[["Sample", time_column_name]],
+        right=metadata_df[["Sample", time_column]],
         on="Sample",
         copy=False,
     )
 
     wide_df = intensity_df.interpolate(method='linear', axis=0)
-    wide_df = long_to_wide_time(wide_df, time_column_name=time_column_name) if is_long_format(wide_df, time_column_name=time_column_name) else  wide_df
+    wide_df = long_to_wide_time(wide_df, time_column=time_column) if is_long_format(wide_df, time_column=time_column) else  wide_df
 
 
     if protein_group not in wide_df.columns:
@@ -166,14 +166,14 @@ def time_quant_plot(
         yaxis_gridcolor=colors["gridcolor"],
         xaxis_linecolor=colors["linecolor"],
         yaxis_linecolor=colors["linecolor"],
-        xaxis_title=time_column_name,
+        xaxis_title=time_column,
         yaxis_title="Intensity",
         legend_title="Legend",
         xaxis=dict(
             tickmode="array",
             tickangle=0,
             tickvals=wide_df.index,
-            ticktext=[wide_df[time_column_name].unique() for wide_df[time_column_name] in wide_df.index],
+            ticktext=[wide_df[time_column].unique() for wide_df[time_column] in wide_df.index],
         ),
         autosize=True,
         margin=dict(l=100, r=300, t=100, b=100),
diff --git a/protzilla/data_analysis/time_series_regression_analysis.py b/protzilla/data_analysis/time_series_regression_analysis.py
index 984754fe..143aa696 100644
--- a/protzilla/data_analysis/time_series_regression_analysis.py
+++ b/protzilla/data_analysis/time_series_regression_analysis.py
@@ -28,20 +28,20 @@
 def time_series_linear_regression(
         intensity_df: pd.DataFrame,
         metadata_df: pd.DataFrame,
-        time_column_name: str,
+        time_column: str,
         train_size: float,
         protein_group: str,
         grouping: str,
-        grouping_column_name: str,
+        grouping_column: str,
 ):
     """
     Perform linear regression on the time series data for a given protein group.
     :param intensity_df: Peptide dataframe which contains the intensity of each sample
     :param metadata_df: Metadata dataframe which contains the timestamps
-    :param time_column_name: The name of the column containing the time values
+    :param time_column: The name of the column containing the time values
     :param protein_group: Protein group to perform the analysis on
     :param train_size: The proportion of the dataset to include in the test split
-    :param grouping_column_name: The name of the column containing the grouping information
+    :param grouping_column: The name of the column containing the grouping information
     :param grouping: Option to select whether regression should be performed on the entire dataset or separately on the control and experimental groups
 
     :return: A dictionary containing the root mean squared error and r2 score for the training and test sets
@@ -61,18 +61,18 @@ def time_series_linear_regression(
 
     intensity_df = intensity_df.sample(frac=1, random_state = 42).reset_index(drop=True)
 
-    X = intensity_df[[time_column_name]]
+    X = intensity_df[[time_column]]
     y = intensity_df[intensity_column_name]
 
     fig = make_subplots(rows=1, cols=2, column_widths=[0.75, 0.25], vertical_spacing=0.025)
 
     scores = []
 
-    if grouping == "With Grouping" and grouping_column_name in intensity_df.columns:
-        groups = intensity_df[grouping_column_name].unique()
+    if grouping == "With Grouping" and grouping_column in intensity_df.columns:
+        groups = intensity_df[grouping_column].unique()
         for group in groups:
-            group_df = intensity_df[intensity_df[grouping_column_name] == group]
-            X_group = group_df[[time_column_name]]
+            group_df = intensity_df[intensity_df[grouping_column] == group]
+            X_group = group_df[[time_column]]
             y_group = group_df[intensity_column_name]
 
             X_train, X_test, y_train, y_test = train_test_split(X_group, y_group, train_size=train_size, shuffle=False)
@@ -87,15 +87,15 @@ def time_series_linear_regression(
             train_r2 = r2_score(y_train, y_pred_train)
             test_r2 = r2_score(y_test, y_pred_test)
 
-            train_df = pd.DataFrame({time_column_name: X_train[time_column_name], 'Intensity': y_train, 'Predicted': y_pred_train, 'Type': 'Train'})
-            test_df = pd.DataFrame({time_column_name: X_test[time_column_name], 'Intensity': y_test, 'Predicted': y_pred_test, 'Type': 'Test'})
+            train_df = pd.DataFrame({time_column: X_train[time_column], 'Intensity': y_train, 'Predicted': y_pred_train, 'Type': 'Train'})
+            test_df = pd.DataFrame({time_column: X_test[time_column], 'Intensity': y_test, 'Predicted': y_pred_test, 'Type': 'Test'})
             plot_df = pd.concat([train_df, test_df])
 
             color = PROTZILLA_DISCRETE_COLOR_SEQUENCE[color_index % len(PROTZILLA_DISCRETE_COLOR_SEQUENCE)]
             color_index += 5
 
             fig.add_trace(go.Scatter(
-                x=plot_df[time_column_name],
+                x=plot_df[time_column],
                 y=plot_df['Intensity'],
                 mode='markers',
                 name=f'Actual Intensity ({group})',
@@ -103,7 +103,7 @@ def time_series_linear_regression(
             ), row=1, col=1)
 
             fig.add_trace(go.Scatter(
-                x=plot_df[time_column_name],
+                x=plot_df[time_column],
                 y=plot_df['Predicted'],
                 mode='lines',
                 name=f'Predicted Intensity ({group})',
@@ -132,14 +132,14 @@ def time_series_linear_regression(
         test_r2 = r2_score(y_test, y_pred_test)
 
         train_df = pd.DataFrame(
-            {time_column_name: X_train[time_column_name], 'Intensity': y_train, 'Predicted': y_pred_train,
+            {time_column: X_train[time_column], 'Intensity': y_train, 'Predicted': y_pred_train,
              'Type': 'Train'})
         test_df = pd.DataFrame(
-            {time_column_name: X_test[time_column_name], 'Intensity': y_test, 'Predicted': y_pred_test, 'Type': 'Test'})
+            {time_column: X_test[time_column], 'Intensity': y_test, 'Predicted': y_pred_test, 'Type': 'Test'})
         plot_df = pd.concat([train_df, test_df])
 
         fig.add_trace(go.Scatter(
-            x=plot_df[time_column_name],
+            x=plot_df[time_column],
             y=plot_df['Intensity'],
             mode='markers',
             name='Actual Intensity',
@@ -147,7 +147,7 @@ def time_series_linear_regression(
         ), row=1, col=1)
 
         fig.add_trace(go.Scatter(
-            x=plot_df[time_column_name],
+            x=plot_df[time_column],
             y=plot_df['Predicted'],
             mode='lines',
             name='Predicted Intensity',
@@ -186,7 +186,7 @@ def time_series_linear_regression(
         yaxis_gridcolor=colors["gridcolor"],
         xaxis_linecolor=colors["linecolor"],
         yaxis_linecolor=colors["linecolor"],
-        xaxis_title=time_column_name,
+        xaxis_title=time_column,
         yaxis_title="Intensity",
         legend_title="Legend",
         autosize=True,
@@ -214,26 +214,26 @@ def time_series_linear_regression(
 def time_series_ransac_regression(
         intensity_df: pd.DataFrame,
         metadata_df: pd.DataFrame,
-        time_column_name: str,
+        time_column: str,
         protein_group: str,
         max_trials: int,
         stop_probability: float,
         loss: str,
         train_size: float,
         grouping: str,
-        grouping_column_name: str,
+        grouping_column: str,
 ):
     """
     Perform RANSAC regression on the time series data for a given protein group.
     :param intensity_df: Peptide dataframe which contains the intensity of each sample
     :param metadata_df: Metadata dataframe which contains the timestamps
-    :param time_column_name: The name of the column containing the time values
+    :param time_column: The name of the column containing the time values
     :param max_trials: The maximum number of iterations to perform
     :param stop_probability: The probability to stop the RANSAC algorithm
     :param loss: The loss function to use
     :param protein_group: Protein group to perform the analysis on
     :param train_size: The proportion of the dataset to include in the test split
-    :param grouping_column_name: The name of the column containing the grouping information
+    :param grouping_column: The name of the column containing the grouping information
     :param grouping: Option to select whether regression should be performed on the entire dataset or separately on the control and experimental groups
 
     :return: A dictionary containing the root mean squared error and r2 score for the training and test sets
@@ -255,18 +255,18 @@ def time_series_ransac_regression(
 
     intensity_df = intensity_df.sample(frac=1, random_state = 42).reset_index(drop=True)
 
-    X = intensity_df[[time_column_name]]
+    X = intensity_df[[time_column]]
     y = intensity_df[intensity_column_name]
 
     fig = make_subplots(rows=1, cols=2, column_widths=[0.75, 0.25], vertical_spacing=0.025)
 
     scores = []
 
-    if grouping == "With Grouping" and grouping_column_name in intensity_df.columns:
-        groups = intensity_df[grouping_column_name].unique()
+    if grouping == "With Grouping" and grouping_column in intensity_df.columns:
+        groups = intensity_df[grouping_column].unique()
         for group in groups:
-            group_df = intensity_df[intensity_df[grouping_column_name] == group]
-            X_group = group_df[[time_column_name]]
+            group_df = intensity_df[intensity_df[grouping_column] == group]
+            X_group = group_df[[time_column]]
             y_group = group_df[intensity_column_name]
 
             X_train, X_test, y_train, y_test = train_test_split(X_group, y_group, train_size=train_size, shuffle=False)
@@ -283,15 +283,15 @@ def time_series_ransac_regression(
             train_r2 = r2_score(y_train[inlier_mask], y_pred_train[inlier_mask])
             test_r2 = r2_score(y_test, y_pred_test)
 
-            train_df = pd.DataFrame({time_column_name: X_train[time_column_name], 'Intensity': y_train, 'Predicted': y_pred_train, 'Type': 'Train'})
-            test_df = pd.DataFrame({time_column_name: X_test[time_column_name], 'Intensity': y_test, 'Predicted': y_pred_test, 'Type': 'Test'})
+            train_df = pd.DataFrame({time_column: X_train[time_column], 'Intensity': y_train, 'Predicted': y_pred_train, 'Type': 'Train'})
+            test_df = pd.DataFrame({time_column: X_test[time_column], 'Intensity': y_test, 'Predicted': y_pred_test, 'Type': 'Test'})
             train_df['Inlier'] = inlier_mask
             test_df['Inlier'] = False
             plot_df = pd.concat([train_df, test_df])
 
             # Add main plot traces
             fig.add_trace(go.Scatter(
-                x=plot_df[time_column_name],
+                x=plot_df[time_column],
                 y=plot_df['Intensity'],
                 mode='markers',
                 name=f'Inliers ({group})',
@@ -299,7 +299,7 @@ def time_series_ransac_regression(
             ), row=1, col=1)
 
             fig.add_trace(go.Scatter(
-                x=plot_df[time_column_name],
+                x=plot_df[time_column],
                 y=plot_df['Predicted'],
                 mode='lines',
                 name=f'Predicted Intensity ({group})',
@@ -307,7 +307,7 @@ def time_series_ransac_regression(
             ), row=1, col=1)
 
             fig.add_trace(go.Scatter(
-                x=plot_df[plot_df['Inlier'] == False][time_column_name],
+                x=plot_df[plot_df['Inlier'] == False][time_column],
                 y=plot_df[plot_df['Inlier'] == False]['Intensity'],
                 mode='markers',
                 name='Outliers',
@@ -339,15 +339,15 @@ def time_series_ransac_regression(
         train_r2 = r2_score(y_train[inlier_mask], y_pred_train[inlier_mask])
         test_r2 = r2_score(y_test, y_pred_test)
 
-        train_df = pd.DataFrame({time_column_name: X_train[time_column_name], 'Intensity': y_train, 'Predicted': y_pred_train, 'Type': 'Train'})
-        test_df = pd.DataFrame({time_column_name: X_test[time_column_name], 'Intensity': y_test, 'Predicted': y_pred_test, 'Type': 'Test'})
+        train_df = pd.DataFrame({time_column: X_train[time_column], 'Intensity': y_train, 'Predicted': y_pred_train, 'Type': 'Train'})
+        test_df = pd.DataFrame({time_column: X_test[time_column], 'Intensity': y_test, 'Predicted': y_pred_test, 'Type': 'Test'})
         train_df['Inlier'] = inlier_mask
         test_df['Inlier'] = False
         plot_df = pd.concat([train_df, test_df])
 
         # Add main plot traces
         fig.add_trace(go.Scatter(
-            x=plot_df[time_column_name],
+            x=plot_df[time_column],
             y=plot_df['Intensity'],
             mode='markers',
             name='Inliers',
@@ -355,7 +355,7 @@ def time_series_ransac_regression(
         ), row=1, col=1)
 
         fig.add_trace(go.Scatter(
-            x=plot_df[time_column_name],
+            x=plot_df[time_column],
             y=plot_df['Predicted'],
             mode='lines',
             name='Predicted Intensity',
@@ -363,7 +363,7 @@ def time_series_ransac_regression(
         ), row=1, col=1)
 
         fig.add_trace(go.Scatter(
-            x=plot_df[plot_df['Inlier'] == False][time_column_name],
+            x=plot_df[plot_df['Inlier'] == False][time_column],
             y=plot_df[plot_df['Inlier'] == False]['Intensity'],
             mode='markers',
             name='Outliers',
@@ -402,7 +402,7 @@ def time_series_ransac_regression(
         yaxis_gridcolor=colors["gridcolor"],
         xaxis_linecolor=colors["linecolor"],
         yaxis_linecolor=colors["linecolor"],
-        xaxis_title=time_column_name,
+        xaxis_title=time_column,
         yaxis_title="Intensity",
         legend_title="Legend",
         autosize=True,
@@ -498,24 +498,24 @@ def adfuller_test(
 def time_series_auto_arima(
     intensity_df: pd.DataFrame,
     metadata_df: pd.DataFrame,
-    time_column_name: str,
+    time_column: str,
     protein_group: str,
     seasonal: str,
     m: int,
     train_size: float,
     grouping: str,
-    grouping_column_name: str,
+    grouping_column: str,
 ) -> dict:
     """
     Perform an automatic ARIMA model selection on the time series data for a given protein group.
     :param intensity_df: Peptide dataframe which contains the intensity of each sample
     :param metadata_df: Metadata dataframe which contains the timestamps
-    :param time_column_name: The name of the column containing the time values
+    :param time_column: The name of the column containing the time values
     :param protein_group: Protein group to perform the analysis on
     :param seasonal: Whether the ARIMA model should be seasonal
     :param m: The number of time steps for a single seasonal period (ignored if seasonal=False)
     :param train_size: The proportion of the dataset to include in the test split
-    :param grouping_column_name: The name of the column containing the grouping information
+    :param grouping_column: The name of the column containing the grouping information
     :param grouping: Whether to group the data by the 'Group' column
 
     :return: A dictionary containing the root mean squared error and r2 score for the training and test sets
@@ -545,16 +545,16 @@ def time_series_auto_arima(
     fig = make_subplots(rows=1, cols=2, column_widths=[0.7, 0.3])
     scores = []
 
-    if grouping == "With Grouping" and grouping_column_name in intensity_df.columns:
-        groups = intensity_df[grouping_column_name].unique()
+    if grouping == "With Grouping" and grouping_column in intensity_df.columns:
+        groups = intensity_df[grouping_column].unique()
         for group in groups:
-            group_df = intensity_df[intensity_df[grouping_column_name] == group]
+            group_df = intensity_df[intensity_df[grouping_column] == group]
 
             train_df_size = int(len(group_df) * train_size)
             train_df, test_df = group_df[:train_df_size], group_df[train_df_size:]
 
-            train_df = train_df.set_index(time_column_name)[intensity_column_name]
-            test_df = test_df.set_index(time_column_name)[intensity_column_name]
+            train_df = train_df.set_index(time_column)[intensity_column_name]
+            test_df = test_df.set_index(time_column)[intensity_column_name]
 
             # Fit the ARIMA model
             model = auto_arima(
@@ -634,8 +634,8 @@ def time_series_auto_arima(
         train_size = int(len(intensity_df) * train_size)
         train_df, test_df = intensity_df[:train_size], intensity_df[train_size:]
 
-        train_df = train_df.set_index(time_column_name)[intensity_column_name]
-        test_df = test_df.set_index(time_column_name)[intensity_column_name]
+        train_df = train_df.set_index(time_column)[intensity_column_name]
+        test_df = test_df.set_index(time_column)[intensity_column_name]
 
         # Fit the ARIMA model
         model = auto_arima(
@@ -734,7 +734,7 @@ def time_series_auto_arima(
         yaxis_gridcolor=colors["gridcolor"],
         xaxis_linecolor=colors["linecolor"],
         yaxis_linecolor=colors["linecolor"],
-        xaxis_title=time_column_name,
+        xaxis_title=time_column,
         yaxis_title="Intensity",
         legend_title="Legend",
         autosize=True,
@@ -762,7 +762,7 @@ def time_series_auto_arima(
 def time_series_arima(
     intensity_df: pd.DataFrame,
     metadata_df: pd.DataFrame,
-    time_column_name: str,
+    time_column: str,
     protein_group: str,
     seasonal: str,
     p: int,
@@ -774,14 +774,14 @@ def time_series_arima(
     s: int,
     train_size: float,
     grouping: str,
-    grouping_column_name: str,
+    grouping_column: str,
 ) -> dict:
 
     """
     Perform ARIMA model selection on the time series data for a given protein group.
     :param intensity_df: Peptide dataframe which contains the intensity of each sample
     :param metadata_df: Metadata dataframe which contains the timestamps
-    :param time_column_name: The name of the column containing the time values
+    :param time_column: The name of the column containing the time values
     :param protein_group: Protein group to perform the analysis on
     :param seasonal: Whether the ARIMA model should be seasonal
     :param p: ARIMA p parameter
@@ -792,7 +792,7 @@ def time_series_arima(
     :param Q: ARIMA seasonal Q parameter
     :param s: ARIMA seasonal s parameter
     :param train_size: The proportion of the dataset to include in the test split
-    :param grouping_column_name: The name of the column containing the grouping information
+    :param grouping_column: The name of the column containing the grouping information
     :param grouping: Whether to group the data by the 'Group' column
 
     :return: A dictionary containing the root mean squared error and r2 score for the training and test sets
@@ -812,16 +812,16 @@ def time_series_arima(
     fig = make_subplots(rows=1, cols=2, column_widths=[0.7, 0.3])
     scores = []
 
-    if grouping == "With Grouping" and grouping_column_name in intensity_df.columns:
-        groups = intensity_df[grouping_column_name].unique()
+    if grouping == "With Grouping" and grouping_column in intensity_df.columns:
+        groups = intensity_df[grouping_column].unique()
         for group in groups:
-            group_df = intensity_df[intensity_df[grouping_column_name] == group]
+            group_df = intensity_df[intensity_df[grouping_column] == group]
 
             train_df_size = int(len(group_df) * train_size)
             train_df, test_df = group_df[:train_df_size], group_df[train_df_size:]
 
-            train_df = train_df.set_index(time_column_name)[intensity_column_name]
-            test_df = test_df.set_index(time_column_name)[intensity_column_name]
+            train_df = train_df.set_index(time_column)[intensity_column_name]
+            test_df = test_df.set_index(time_column)[intensity_column_name]
 
             if seasonal == "Yes":
                 model = ARIMA(
@@ -886,8 +886,8 @@ def time_series_arima(
         train_size = int(len(intensity_df) * train_size)
         train_df, test_df = intensity_df[:train_size], intensity_df[train_size:]
 
-        train_df = train_df.set_index(time_column_name)[intensity_column_name]
-        test_df = test_df.set_index(time_column_name)[intensity_column_name]
+        train_df = train_df.set_index(time_column)[intensity_column_name]
+        test_df = test_df.set_index(time_column)[intensity_column_name]
 
         if seasonal == "Yes":
             model = ARIMA(
@@ -966,7 +966,7 @@ def time_series_arima(
         yaxis_gridcolor=colors["gridcolor"],
         xaxis_linecolor=colors["linecolor"],
         yaxis_linecolor=colors["linecolor"],
-        xaxis_title=time_column_name,
+        xaxis_title=time_column,
         yaxis_title="Intensity",
         legend_title="Legend",
         autosize=True,
diff --git a/protzilla/methods/data_analysis.py b/protzilla/methods/data_analysis.py
index 3201cc44..4a1f96d5 100644
--- a/protzilla/methods/data_analysis.py
+++ b/protzilla/methods/data_analysis.py
@@ -785,7 +785,7 @@ class PlotTimeQuant(PlotStep):
     input_keys = [
         "intensity_df",
         "metadata_df",
-        "time_column_name",
+        "time_column",
         "protein_group",
         "similarity_measure",
         "similarity"
@@ -812,11 +812,11 @@ class TimeSeriesLinearRegression(PlotStep):
     input_keys = [
         "intensity_df",
         "metadata_df",
-        "time_column_name",
+        "time_column",
         "protein_group",
         "train_size",
         "grouping",
-        "grouping_column_name",
+        "grouping_column",
     ]
     output_keys = [
         "scores",
@@ -839,14 +839,14 @@ class TimeSeriesRANSACRegression(PlotStep):
     input_keys = [
         "intensity_df",
         "metadata_df",
-        "time_column_name",
+        "time_column",
         "protein_group",
         "max_trials",
         "stop_probability",
         "loss",
         "train_size",
         "grouping",
-        "grouping_column_name",
+        "grouping_column",
     ]
     output_keys = [
         "scores",
@@ -905,13 +905,13 @@ class TimeSeriesAutoARIMA(PlotStep):
     input_keys = [
         "intensity_df",
         "metadata_df",
-        "time_column_name",
+        "time_column",
         "protein_group",
         "seasonal",
         "m",
         "train_size",
         "grouping",
-        "grouping_column_name",
+        "grouping_column",
     ]
     output_keys = [
         "scores",
@@ -936,7 +936,7 @@ class TimeSeriesARIMA(PlotStep):
     input_keys = [
         "intensity_df",
         "metadata_df",
-        "time_column_name",
+        "time_column",
         "protein_group",
         "seasonal",
         "p",
@@ -948,7 +948,7 @@ class TimeSeriesARIMA(PlotStep):
         "s",
         "train_size",
         "grouping",
-        "grouping_column_name",
+        "grouping_column",
     ]
     output_keys = [
         "scores",
diff --git a/protzilla/utilities/transform_dfs.py b/protzilla/utilities/transform_dfs.py
index 1c3b7fbe..fdb931e7 100644
--- a/protzilla/utilities/transform_dfs.py
+++ b/protzilla/utilities/transform_dfs.py
@@ -24,7 +24,7 @@ def long_to_wide(intensity_df: pd.DataFrame, value_name: str | None = None):
     )
 
 
-def long_to_wide_time(intensity_df: pd.DataFrame, value_name: str = None, time_column_name: str = None):
+def long_to_wide_time(intensity_df: pd.DataFrame, value_name: str = None, time_column: str = None):
     """
     This function transforms the dataframe to a wide format that
     can be more easily handled by packages such as sklearn.
@@ -34,17 +34,17 @@ def long_to_wide_time(intensity_df: pd.DataFrame, value_name: str = None, time_c
         long format
         :type intensity_df: pd.DataFrame
     :param value_name: the name of the column in the metadata_df that contains the intensity information.
-    :param time_column_name: the name of the column in the metadata_df that contains the time information.
+    :param time_column: the name of the column in intensity_df that contains the time information.
 
     :return: returns dataframe in wide format suitable for use by
         packages such as sklearn
     :rtype: pd.DataFrame
     """
-    if intensity_df.duplicated(subset=[time_column_name, "Protein ID"]).any():
-        intensity_df = intensity_df.groupby([time_column_name, "Protein ID"]).mean().reset_index()
+    if intensity_df.duplicated(subset=[time_column, "Protein ID"]).any():
+        intensity_df = intensity_df.groupby([time_column, "Protein ID"]).mean().reset_index()
     values_name = default_intensity_column(intensity_df) if value_name is None else value_name
     intensity_df = pd.pivot(
-        intensity_df, index=time_column_name, columns="Protein ID", values=values_name
+        intensity_df, index=time_column, columns="Protein ID", values=values_name
     )
     intensity_df = intensity_df.fillna(intensity_df.mean())
     return intensity_df
@@ -84,9 +84,9 @@ def wide_to_long(wide_df: pd.DataFrame, original_long_df: pd.DataFrame):
     return intensity_df
 
 
-def is_long_format(df: pd.DataFrame, time_column_name: str = None):
+def is_long_format(df: pd.DataFrame, time_column: str = None):
     required_columns = {"Sample", "Protein ID"}
-    additional_columns = {"Gene", time_column_name}
+    additional_columns = {"Gene", time_column}
     return required_columns.issubset(df.columns) and any(col in df.columns for col in additional_columns)
 
 
diff --git a/ui/runs/forms/data_analysis.py b/ui/runs/forms/data_analysis.py
index 4cdc84fd..5f4e18e8 100644
--- a/ui/runs/forms/data_analysis.py
+++ b/ui/runs/forms/data_analysis.py
@@ -1170,7 +1170,7 @@ class PlotTimeQuantForm(MethodForm):
         choices=[],
         label="Choose dataframe to be plotted",
     )
-    time_column_name = CustomChoiceField(choices=[], label="Time: The column name from metadata that represents time")
+    time_column = CustomChoiceField(choices=[], label="Time: The column name from metadata that represents time")
     protein_group = CustomChoiceField(
         choices=[],
         label="Protein group: choose highlighted protein group",
@@ -1193,7 +1193,7 @@ def fill_form(self, run: Run) -> None:
             "intensity_df", self.fields["intensity_df"].choices[0][0]
         )
         self.fields[
-            "time_column_name"
+            "time_column"
         ].choices = fill_helper.get_choices_for_metadata_non_sample_columns(run)
 
         self.fields["protein_group"].choices = fill_helper.to_choices(
@@ -1246,7 +1246,7 @@ class TimeSeriesLinearRegressionForm(MethodForm):
         choices=[],
         label="Intensity dataframe",
     )
-    time_column_name = CustomChoiceField(choices=[], label="Time: The column name from metadata that represents time")
+    time_column = CustomChoiceField(choices=[], label="Time: The column name from metadata that represents time")
     protein_group = CustomChoiceField(
         choices=[],
         label="Protein group: which protein group to perform the linear regression on",
@@ -1263,7 +1263,7 @@ class TimeSeriesLinearRegressionForm(MethodForm):
         label="Option to select whether regression should be performed on the entire dataset or separately on the control and experimental groups",
         initial=TimeSeriesGrouping.with_grouping
     )
-    grouping_column_name = CustomChoiceField(choices=[], label="Grouping from metadata: The column name from metadata that represents the grouping")
+    grouping_column = CustomChoiceField(choices=[], label="Grouping from metadata: The column name from metadata that represents the grouping")
 
 
     def fill_form(self, run: Run) -> None:
@@ -1274,11 +1274,11 @@ def fill_form(self, run: Run) -> None:
             "intensity_df", self.fields["intensity_df"].choices[0][0]
         )
         self.fields[
-            "time_column_name"
+            "time_column"
         ].choices = fill_helper.get_choices_for_metadata_non_sample_columns(run)
 
         self.fields[
-            "grouping_column_name"
+            "grouping_column"
         ].choices = fill_helper.get_choices_for_metadata_non_sample_columns(run)
 
         self.fields["protein_group"].choices = fill_helper.to_choices(
@@ -1290,7 +1290,7 @@ def fill_form(self, run: Run) -> None:
         )
         grouping = self.data.get("grouping")
         if grouping == "Without Grouping":
-            self.toggle_visibility("grouping_column_name", False)
+            self.toggle_visibility("grouping_column", False)
 
 
 class TimeSeriesRANSACRegressionForm(MethodForm):
@@ -1299,7 +1299,7 @@ class TimeSeriesRANSACRegressionForm(MethodForm):
         choices=[],
         label="Intensity dataframe",
     )
-    time_column_name = CustomChoiceField(choices=[], label="Time: The column name from metadata that represents time")
+    time_column = CustomChoiceField(choices=[], label="Time: The column name from metadata that represents time")
     protein_group = CustomChoiceField(
         choices=[],
         label="Protein group: which protein group to perform the RANSAC regression on",
@@ -1334,7 +1334,7 @@ class TimeSeriesRANSACRegressionForm(MethodForm):
         label="Option to select whether regression should be performed on the entire dataset or separately on the control and experimental groups",
         initial=TimeSeriesGrouping.with_grouping
     )
-    grouping_column_name = CustomChoiceField(choices=[], label="Grouping from metadata: The column name from metadata that represents the grouping")
+    grouping_column = CustomChoiceField(choices=[], label="Grouping from metadata: The column name from metadata that represents the grouping")
 
 
     def fill_form(self, run: Run) -> None:
@@ -1345,11 +1345,11 @@ def fill_form(self, run: Run) -> None:
             "intensity_df", self.fields["intensity_df"].choices[0][0]
         )
         self.fields[
-            "time_column_name"
+            "time_column"
         ].choices = fill_helper.get_choices_for_metadata_non_sample_columns(run)
 
         self.fields[
-            "grouping_column_name"
+            "grouping_column"
         ].choices = fill_helper.get_choices_for_metadata_non_sample_columns(run)
 
         self.fields["protein_group"].choices = fill_helper.to_choices(
@@ -1361,7 +1361,7 @@ def fill_form(self, run: Run) -> None:
         )
         grouping = self.data.get("grouping")
         if grouping == "Without Grouping":
-            self.toggle_visibility("grouping_column_name", False)
+            self.toggle_visibility("grouping_column", False)
 
 
 class TimeSeriesADFullerTestForm(MethodForm):
@@ -1402,7 +1402,7 @@ class TimeSeriesAutoARIMAForm(MethodForm):
         choices=[],
         label="Intensity dataframe",
     )
-    time_column_name = CustomChoiceField(choices=[], label="Time: The column name from metadata that represents time")
+    time_column = CustomChoiceField(choices=[], label="Time: The column name from metadata that represents time")
     protein_group = CustomChoiceField(
         choices=[],
         label="Protein group: which protein group to perform the AutoARIMA on",
@@ -1430,7 +1430,7 @@ class TimeSeriesAutoARIMAForm(MethodForm):
         label="Option to select whether regression should be performed on the entire dataset or separately on the control and experimental groups",
         initial=TimeSeriesGrouping.with_grouping
     )
-    grouping_column_name = CustomChoiceField(choices=[], label="Grouping from metadata: The column name from metadata that represents the grouping")
+    grouping_column = CustomChoiceField(choices=[], label="Grouping from metadata: The column name from metadata that represents the grouping")
 
 
     def fill_form(self, run: Run) -> None:
@@ -1441,11 +1441,11 @@ def fill_form(self, run: Run) -> None:
             "intensity_df", self.fields["intensity_df"].choices[0][0]
         )
         self.fields[
-            "time_column_name"
+            "time_column"
         ].choices = fill_helper.get_choices_for_metadata_non_sample_columns(run)
 
         self.fields[
-            "grouping_column_name"
+            "grouping_column"
         ].choices = fill_helper.get_choices_for_metadata_non_sample_columns(run)
 
         self.fields["protein_group"].choices = fill_helper.to_choices(
@@ -1457,7 +1457,7 @@ def fill_form(self, run: Run) -> None:
         )
         grouping = self.data.get("grouping")
         if grouping == "Without Grouping":
-            self.toggle_visibility("grouping_column_name", False)
+            self.toggle_visibility("grouping_column", False)
 
 
 class TimeSeriesARIMAForm(MethodForm):
@@ -1466,7 +1466,7 @@ class TimeSeriesARIMAForm(MethodForm):
         choices=[],
         label="Intensity dataframe",
     )
-    time_column_name = CustomChoiceField(choices=[], label="Time: The column name from metadata that represents time")
+    time_column = CustomChoiceField(choices=[], label="Time: The column name from metadata that represents time")
     protein_group = CustomChoiceField(
         choices=[],
         label="Protein group: which protein group to perform the AutoARIMA on",
@@ -1534,7 +1534,7 @@ class TimeSeriesARIMAForm(MethodForm):
         label="Option to select whether regression should be performed on the entire dataset or separately on the control and experimental groups",
         initial=TimeSeriesGrouping.with_grouping
     )
-    grouping_column_name = CustomChoiceField(choices=[], label="Grouping from metadata: The column name from metadata that represents the grouping")
+    grouping_column = CustomChoiceField(choices=[], label="Grouping from metadata: The column name from metadata that represents the grouping")
 
 
     def fill_form(self, run: Run) -> None:
@@ -1545,11 +1545,11 @@ def fill_form(self, run: Run) -> None:
             "intensity_df", self.fields["intensity_df"].choices[0][0]
         )
         self.fields[
-            "time_column_name"
+            "time_column"
         ].choices = fill_helper.get_choices_for_metadata_non_sample_columns(run)
 
         self.fields[
-            "grouping_column_name"
+            "grouping_column"
         ].choices = fill_helper.get_choices_for_metadata_non_sample_columns(run)
 
         self.fields["protein_group"].choices = fill_helper.to_choices(
@@ -1561,7 +1561,7 @@ def fill_form(self, run: Run) -> None:
         )
         grouping = self.data.get("grouping")
         if grouping == "Without Grouping":
-            self.toggle_visibility("grouping_column_name", False)
+            self.toggle_visibility("grouping_column", False)
         seasonal = self.data.get("seasonal")
         if seasonal == "No":
             self.toggle_visibility("P", False)

From 8dda742f9551eab2c055c7c9e32f050b1144de97 Mon Sep 17 00:00:00 2001
From: AK <kuganash.ravishanker@student.hpi.de>
Date: Tue, 24 Sep 2024 14:03:29 +0200
Subject: [PATCH 49/52] Updated some methods

---
 protzilla/data_analysis/time_series_plots.py  |  12 +-
 .../time_series_regression_analysis.py        | 304 ++++++++++--------
 protzilla/methods/data_analysis.py            |   1 +
 ui/runs/forms/data_analysis.py                |   4 +
 user_data/workflows/standard.yaml             |  14 +-
 ...uganash.yaml => workflow_Kuganash-BA.yaml} |  54 ++--
 user_data/workflows/workflow_Plot-Thesis.yaml |  67 ----
 7 files changed, 201 insertions(+), 255 deletions(-)
 rename user_data/workflows/{workflow_BA_Kuganash.yaml => workflow_Kuganash-BA.yaml} (71%)
 delete mode 100644 user_data/workflows/workflow_Plot-Thesis.yaml

diff --git a/protzilla/data_analysis/time_series_plots.py b/protzilla/data_analysis/time_series_plots.py
index 9250c6a9..3c5f8059 100644
--- a/protzilla/data_analysis/time_series_plots.py
+++ b/protzilla/data_analysis/time_series_plots.py
@@ -4,9 +4,9 @@
 from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
 
 from protzilla.utilities.transform_dfs import is_long_format, long_to_wide_time
+from protzilla.constants.colors import PROTZILLA_DISCRETE_COLOR_SEQUENCE
 
 # Define color constants
-PROTZILLA_DISCRETE_COLOR_SEQUENCE = ["#636EFA", "#EF553B", "#00CC96", "#AB63FA", "#19D3F3", "#E763FA", "#FECB52", "#FFA15A", "#FF6692", "#B6E880"]
 colors = {
     "plot_bgcolor": "white",
     "gridcolor": "#F1F1F1",
@@ -68,7 +68,7 @@ def time_quant_plot(
 
     color_mapping = {
         "A": PROTZILLA_DISCRETE_COLOR_SEQUENCE[0],
-        "C": PROTZILLA_DISCRETE_COLOR_SEQUENCE[1],
+        "C": PROTZILLA_DISCRETE_COLOR_SEQUENCE[4],
     }
 
     lower_upper_x = []
@@ -122,7 +122,7 @@ def time_quant_plot(
                 y=wide_df[group],
                 mode="lines",
                 name=group[:15] + "..." if len(group) > 15 else group,
-                line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[1]),
+                line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[9]),
                 showlegend=len(similar_groups) <= 7,
             )
         )
@@ -133,7 +133,7 @@ def time_quant_plot(
                 x=[None],
                 y=[None],
                 mode="lines",
-                line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[1]),
+                line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[9]),
                 name="Similar Protein Groups",
             )
         )
@@ -147,7 +147,7 @@ def time_quant_plot(
             y=wide_df[protein_group],
             mode="lines",
             name=formatted_protein_name,
-            line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[2]),
+            line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[4]),
         )
     )
     fig.add_trace(
@@ -155,7 +155,7 @@ def time_quant_plot(
             x=[None],
             y=[None],
             mode="markers",
-            marker=dict(color=color_mapping.get("A")),
+            marker=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[0]),
             name="Intensity",
         )
     )
diff --git a/protzilla/data_analysis/time_series_regression_analysis.py b/protzilla/data_analysis/time_series_regression_analysis.py
index 143aa696..b8aebce3 100644
--- a/protzilla/data_analysis/time_series_regression_analysis.py
+++ b/protzilla/data_analysis/time_series_regression_analysis.py
@@ -46,6 +46,7 @@ def time_series_linear_regression(
 
     :return: A dictionary containing the root mean squared error and r2 score for the training and test sets
     """
+    messages = []
     color_index = 0
     if train_size < 0 or train_size > 1:
         raise ValueError("Test size should be between 0 and 1")
@@ -64,7 +65,7 @@ def time_series_linear_regression(
     X = intensity_df[[time_column]]
     y = intensity_df[intensity_column_name]
 
-    fig = make_subplots(rows=1, cols=2, column_widths=[0.75, 0.25], vertical_spacing=0.025)
+    fig = go.Figure()
 
     scores = []
 
@@ -100,7 +101,8 @@ def time_series_linear_regression(
                 mode='markers',
                 name=f'Actual Intensity ({group})',
                 marker=dict(color=color)
-            ), row=1, col=1)
+                )
+            )
 
             fig.add_trace(go.Scatter(
                 x=plot_df[time_column],
@@ -108,7 +110,8 @@ def time_series_linear_regression(
                 mode='lines',
                 name=f'Predicted Intensity ({group})',
                 line=dict(color=color)
-            ), row=1, col=1)
+                )
+            )
 
             scores.append({
                 'group': group,
@@ -144,7 +147,8 @@ def time_series_linear_regression(
             mode='markers',
             name='Actual Intensity',
             marker=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[0])
-        ), row=1, col=1)
+            )
+        )
 
         fig.add_trace(go.Scatter(
             x=plot_df[time_column],
@@ -152,7 +156,8 @@ def time_series_linear_regression(
             mode='lines',
             name='Predicted Intensity',
             line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[5])
-        ), row=1, col=1)
+            )
+        )
 
         scores.append({
             'group': 'Overall',
@@ -170,15 +175,6 @@ def time_series_linear_regression(
         for res in scores
     ])
 
-    fig.add_trace(go.Scatter(
-        x=[0],
-        y=[0.25],
-        text=[annotation_text],
-        mode='text',
-        textfont=dict(size=12),
-        showlegend=False
-    ), row=1, col=2)
-
     fig.update_layout(
         title=f"Intensity over Time for {protein_group}",
         plot_bgcolor=colors["plot_bgcolor"],
@@ -190,24 +186,26 @@ def time_series_linear_regression(
         yaxis_title="Intensity",
         legend_title="Legend",
         autosize=True,
-        margin=dict(l=100, r=100, t=100, b=50),
+        margin=dict(l=100, r=300, t=100, b=100),
         legend=dict(
-            yanchor="top",
-            y=0.95,
-            xanchor="right",
-            x=0.8
+            y=1.05,
+            x=1,
+            bgcolor = "rgba(255, 255, 255, 0.5)",
+            orientation = "v",
         )
     )
 
-    # Hide x-axis of the annotation subplot
-    fig.update_xaxes(showticklabels=False, showgrid=False, zeroline=False, row=1, col=2)
-    fig.update_yaxes(showticklabels=False, showgrid=False, zeroline=False, row=1, col=2)
-
-    fig.update_annotations(font_size=12)
+    messages.append(
+        {
+            "level": logging.INFO,
+            "msg": annotation_text,
+        }
+    )
 
     return dict(
         scores=scores,
         plots=[fig],
+        messages=messages,
     )
 
 
@@ -238,7 +236,7 @@ def time_series_ransac_regression(
 
     :return: A dictionary containing the root mean squared error and r2 score for the training and test sets
     """
-
+    messages = []
     color_index = 0
     if train_size < 0 or train_size > 1:
         raise ValueError("Test size should be between 0 and 1")
@@ -258,7 +256,7 @@ def time_series_ransac_regression(
     X = intensity_df[[time_column]]
     y = intensity_df[intensity_column_name]
 
-    fig = make_subplots(rows=1, cols=2, column_widths=[0.75, 0.25], vertical_spacing=0.025)
+    fig = go.Figure()
 
     scores = []
 
@@ -273,30 +271,38 @@ def time_series_ransac_regression(
             model = RANSACRegressor(max_trials = max_trials, stop_probability = stop_probability, loss = loss, base_estimator=LinearRegression())
             model.fit(X_train, y_train)
 
-            inlier_mask = model.inlier_mask_
-
             y_pred_train = model.predict(X_train)
             y_pred_test = model.predict(X_test)
 
-            train_rmse = np.sqrt(mean_squared_error(y_train[inlier_mask], y_pred_train[inlier_mask]))
-            test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))
-            train_r2 = r2_score(y_train[inlier_mask], y_pred_train[inlier_mask])
-            test_r2 = r2_score(y_test, y_pred_test)
+            inlier_mask_train = model.inlier_mask_
 
-            train_df = pd.DataFrame({time_column: X_train[time_column], 'Intensity': y_train, 'Predicted': y_pred_train, 'Type': 'Train'})
-            test_df = pd.DataFrame({time_column: X_test[time_column], 'Intensity': y_test, 'Predicted': y_pred_test, 'Type': 'Test'})
-            train_df['Inlier'] = inlier_mask
-            test_df['Inlier'] = False
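+            # NOTE(review): this compares model.predict(X_test) with y_pred_test (the same prediction), so the mask is effectively all True and no test-set outliers are detected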
+            test_inlier_mask = model.predict(
+                X_test) == y_pred_test
+
+            train_rmse = np.sqrt(mean_squared_error(y_train[inlier_mask_train], y_pred_train[inlier_mask_train]))
+            test_rmse = np.sqrt(mean_squared_error(y_test[test_inlier_mask], y_pred_test[test_inlier_mask]))
+            train_r2 = r2_score(y_train[inlier_mask_train], y_pred_train[inlier_mask_train])
+            test_r2 = r2_score(y_test[test_inlier_mask], y_pred_test[test_inlier_mask])
+
+            # Prepare DataFrames for plotting
+            train_df = pd.DataFrame(
+                {time_column: X_train[time_column], 'Intensity': y_train, 'Predicted': y_pred_train, 'Type': 'Train'})
+            test_df = pd.DataFrame(
+                {time_column: X_test[time_column], 'Intensity': y_test, 'Predicted': y_pred_test, 'Type': 'Test'})
+            train_df['Inlier'] = inlier_mask_train
+            test_df['Inlier'] = test_inlier_mask
             plot_df = pd.concat([train_df, test_df])
 
             # Add main plot traces
             fig.add_trace(go.Scatter(
-                x=plot_df[time_column],
-                y=plot_df['Intensity'],
+                x=plot_df[plot_df['Inlier'] == True][time_column],
+                y=plot_df[plot_df['Inlier'] == True]['Intensity'],
                 mode='markers',
                 name=f'Inliers ({group})',
                 marker=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[color_index])
-            ), row=1, col=1)
+                )
+            )
 
             fig.add_trace(go.Scatter(
                 x=plot_df[time_column],
@@ -304,7 +310,8 @@ def time_series_ransac_regression(
                 mode='lines',
                 name=f'Predicted Intensity ({group})',
                 line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[color_index + 2])
-            ), row=1, col=1)
+                )
+            )
 
             fig.add_trace(go.Scatter(
                 x=plot_df[plot_df['Inlier'] == False][time_column],
@@ -312,7 +319,8 @@ def time_series_ransac_regression(
                 mode='markers',
                 name='Outliers',
                 marker=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[color_index + 4])
-            ), row=1, col=1)
+                )
+            )
 
             color_index += 5
 
@@ -329,30 +337,38 @@ def time_series_ransac_regression(
         model = RANSACRegressor(base_estimator=LinearRegression())
         model.fit(X_train, y_train)
 
-        inlier_mask = model.inlier_mask_
 
         y_pred_train = model.predict(X_train)
         y_pred_test = model.predict(X_test)
 
-        train_rmse = np.sqrt(mean_squared_error(y_train[inlier_mask], y_pred_train[inlier_mask]))
-        test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))
-        train_r2 = r2_score(y_train[inlier_mask], y_pred_train[inlier_mask])
-        test_r2 = r2_score(y_test, y_pred_test)
+        inlier_mask_train = model.inlier_mask_
 
-        train_df = pd.DataFrame({time_column: X_train[time_column], 'Intensity': y_train, 'Predicted': y_pred_train, 'Type': 'Train'})
-        test_df = pd.DataFrame({time_column: X_test[time_column], 'Intensity': y_test, 'Predicted': y_pred_test, 'Type': 'Test'})
-        train_df['Inlier'] = inlier_mask
-        test_df['Inlier'] = False
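+        # NOTE(review): this compares model.predict(X_test) with y_pred_test (the same prediction), so the mask is effectively all True and no test-set outliers are detected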
+        test_inlier_mask = model.predict(X_test) == y_pred_test
+
+        train_rmse = np.sqrt(mean_squared_error(y_train[inlier_mask_train], y_pred_train[inlier_mask_train]))
+        test_rmse = np.sqrt(mean_squared_error(y_test[test_inlier_mask], y_pred_test[test_inlier_mask]))
+        train_r2 = r2_score(y_train[inlier_mask_train], y_pred_train[inlier_mask_train])
+        test_r2 = r2_score(y_test[test_inlier_mask], y_pred_test[test_inlier_mask])
+
+        # Prepare DataFrames for plotting
+        train_df = pd.DataFrame(
+            {time_column: X_train[time_column], 'Intensity': y_train, 'Predicted': y_pred_train, 'Type': 'Train'})
+        test_df = pd.DataFrame(
+            {time_column: X_test[time_column], 'Intensity': y_test, 'Predicted': y_pred_test, 'Type': 'Test'})
+        train_df['Inlier'] = inlier_mask_train
+        test_df['Inlier'] = test_inlier_mask
         plot_df = pd.concat([train_df, test_df])
 
         # Add main plot traces
         fig.add_trace(go.Scatter(
-            x=plot_df[time_column],
-            y=plot_df['Intensity'],
+            x=plot_df[plot_df['Inlier'] == True][time_column],
+            y=plot_df[plot_df['Inlier'] == True]['Intensity'],
             mode='markers',
             name='Inliers',
             marker=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[0])
-        ), row=1, col=1)
+            )
+        )
 
         fig.add_trace(go.Scatter(
             x=plot_df[time_column],
@@ -360,7 +376,8 @@ def time_series_ransac_regression(
             mode='lines',
             name='Predicted Intensity',
             line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[0])
-        ), row=1, col=1)
+            )
+        )
 
         fig.add_trace(go.Scatter(
             x=plot_df[plot_df['Inlier'] == False][time_column],
@@ -368,7 +385,8 @@ def time_series_ransac_regression(
             mode='markers',
             name='Outliers',
             marker=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[3])
-        ), row=1, col=1)
+            )
+        )
 
         scores.append({
             'group': 'Overall',
@@ -386,15 +404,6 @@ def time_series_ransac_regression(
         for res in scores
     ])
 
-    fig.add_trace(go.Scatter(
-        x=[0],
-        y=[0.25],
-        text=[annotation_text],
-        mode='text',
-        textfont=dict(size=12),
-        showlegend=False
-    ), row=1, col=2)
-
     fig.update_layout(
         title=f"Intensity over Time for {protein_group}",
         plot_bgcolor=colors["plot_bgcolor"],
@@ -408,36 +417,40 @@ def time_series_ransac_regression(
         autosize=True,
         margin=dict(l=100, r=100, t=100, b=50),
         legend=dict(
-            yanchor="top",
-            y=0.95,
-            xanchor="right",
-            x=0.8
-        )
+            x=1.05,
+            y=1,
+            bgcolor="rgba(255, 255, 255, 0.5)",
+            orientation="v",
+        ),
     )
 
-    # Hide x-axis of the annotation subplot
-    fig.update_xaxes(showticklabels=False, showgrid=False, zeroline=False, row=1, col=2)
-    fig.update_yaxes(showticklabels=False, showgrid=False, zeroline=False, row=1, col=2)
-
-    fig.update_annotations(font_size=12)
+    messages.append(
+        {
+            "level": logging.INFO,
+            "msg": annotation_text,
+        }
+    )
 
     return dict(
         scores=scores,
         plots=[fig],
+        messages=messages
     )
 
 
 def adfuller_test(
-    intensity_df: pd.DataFrame,
-    metadata_df: pd.DataFrame,
-    protein_group: str,
-    alpha: float = 0.05,
+        intensity_df: pd.DataFrame,
+        metadata_df: pd.DataFrame,
+        time_column: str,
+        protein_group: str,
+        alpha: float = 0.05,
 ) -> dict:
     """
     Perform the Augmented Dickey-Fuller test to check for stationarity in a time series.
     :param intensity_df: The dataframe containing the time series data.
     :param metadata_df: The dataframe containing the metadata.
     :param protein_group: The protein group to perform the test on.
+    :param time_column: The column representing time (e.g., 'visit', 'timepoint').
     :param alpha: The significance level for the test (default is 0.05).
 
     :return: A dictionary containing:
@@ -449,20 +462,27 @@ def adfuller_test(
     """
 
     messages = []
+    # Filter for the specific protein group
     intensity_df = intensity_df[intensity_df['Protein ID'] == protein_group]
     intensity_column_name = default_intensity_column(intensity_df)
 
-    intensity_df = pd.merge(
-        left=intensity_df,
-        right=metadata_df,
+    # Merge with metadata to include time information
+    merged_df = pd.merge(
+        left=intensity_df[["Sample", intensity_column_name]],
+        right=metadata_df[["Sample", time_column]],
         on="Sample",
         copy=False,
     )
 
-    intensity_df = intensity_df[intensity_column_name].dropna()
+    # Sort the data by time to ensure it is treated as a time series
+    merged_df = merged_df.sort_values(by=time_column)
+    grouped_df = merged_df.groupby(time_column)[intensity_column_name].mean().reset_index()
+
+    # Extract the time series (after aggregation)
+    time_series = grouped_df[intensity_column_name].dropna()
 
     # Perform the ADF test
-    result = adfuller(intensity_df)
+    result = adfuller(time_series)
     test_statistic = result[0]
     p_value = result[1]
     critical_values = result[4]
@@ -521,8 +541,8 @@ def time_series_auto_arima(
     :return: A dictionary containing the root mean squared error and r2 score for the training and test sets
     """
 
-    color_index = 0
     messages = []
+    color_index = 0
 
     if train_size < 0 or train_size > 1:
         raise ValueError("Train size should be between 0 and 1")
@@ -542,7 +562,7 @@ def time_series_auto_arima(
         copy=False,
     )
 
-    fig = make_subplots(rows=1, cols=2, column_widths=[0.7, 0.3])
+    fig = go.Figure()
     scores = []
 
     if grouping == "With Grouping" and grouping_column in intensity_df.columns:
@@ -586,23 +606,26 @@ def time_series_auto_arima(
                 mode='markers',
                 name=f'Actual Intensity ({group})',
                 marker=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[color_index])
-            ), row=1, col=1)
+                )
+            )
 
             fig.add_trace(go.Scatter(
                 x=test_df.index,
                 y=forecast,
                 mode='markers',
                 name=f'Predicted Intensity ({group})',
-                line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[color_index + 3])
-            ), row=1, col=1)
+                line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[color_index + 4])
+                )
+            )
 
             fig.add_trace(go.Scatter(
                 x = forecast_plot.index,
                 y = forecast_plot,
                 mode = 'lines',
                 name = f'Mean Predicted Intensity ({group})',
-                line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[color_index + 3])
-            ), row=1, col=1)
+                line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[color_index + 4])
+                )
+            )
 
             color_index += 5
 
@@ -683,24 +706,27 @@ def time_series_auto_arima(
             y=test_df,
             mode='markers',
             name='Actual Intensity',
-            marker=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[0])
-        ), row=1, col=1)
+           marker=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[0])
+            )
+        )
 
         fig.add_trace(go.Scatter(
             x=test_df.index,
             y=forecast,
             mode='markers',
             name='Predicted Intensity',
-            line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[2])
-        ), row=1, col=1)
+            line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[3])
+            )
+        )
 
         fig.add_trace(go.Scatter(
             x=forecast_plot.index,
             y=forecast_plot,
             mode='lines',
             name='Mean Predicted Intensity',
-            line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[3])
-        ), row=1, col=1)
+            line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[4])
+            )
+        )
 
         scores.append({
             'group': 'Overall',
@@ -718,14 +744,6 @@ def time_series_auto_arima(
         for res in scores
     ])
 
-    fig.add_trace(go.Scatter(
-        x=[0],
-        y=[0.25],
-        text=[annotation_text],
-        mode='text',
-        textfont=dict(size=12),
-        showlegend=False
-    ), row=1, col=2)
 
     fig.update_layout(
         title=f"Intensity over Time for {protein_group}",
@@ -740,17 +758,19 @@ def time_series_auto_arima(
         autosize=True,
         margin=dict(l=100, r=100, t=100, b=50),
         legend=dict(
-            yanchor="top",
-            y=0.95,
-            xanchor="right",
-            x=0.775
-        )
+            x=1.05,
+            y=1,
+            bgcolor="rgba(255, 255, 255, 0.5)",
+            orientation="v",
+        ),
     )
 
-    fig.update_xaxes(showticklabels=False, showgrid=False, zeroline=False, row=1, col=2)
-    fig.update_yaxes(showticklabels=False, showgrid=False, zeroline=False, row=1, col=2)
-
-    fig.update_annotations(font_size=12)
+    messages.append(
+        {
+            "level": logging.INFO,
+            "msg": annotation_text,
+        }
+    )
 
     return dict(
         scores=scores,
@@ -797,7 +817,7 @@ def time_series_arima(
 
     :return: A dictionary containing the root mean squared error and r2 score for the training and test sets
     """
-
+    messages = []
     color_index = 0
 
     if train_size < 0 or train_size > 1:
@@ -809,7 +829,7 @@ def time_series_arima(
 
     intensity_df = pd.merge(left=intensity_df, right=metadata_df, on="Sample", copy=False)
 
-    fig = make_subplots(rows=1, cols=2, column_widths=[0.7, 0.3])
+    fig = go.Figure()
     scores = []
 
     if grouping == "With Grouping" and grouping_column in intensity_df.columns:
@@ -854,23 +874,26 @@ def time_series_arima(
                 mode='markers',
                 name=f'Actual Intensity ({group})',
                 marker=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[color_index])
-            ), row=1, col=1)
+                )
+            )
 
             fig.add_trace(go.Scatter(
                 x=forecast_plot.index,
                 y=forecast_plot,
                 mode='markers',
                 name= f'Predicted Intensity ({group})',
-                line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[color_index + 2])
-            ), row=1, col=1)
+                line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[color_index + 4])
+                )
+            )
 
             fig.add_trace(go.Scatter(
                 x = forecast_mean_plot.index,
                 y = forecast_mean_plot,
                 mode = 'lines',
                 name = f'Mean Predicted Intensity ({group})',
-                line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[color_index + 2])
-            ), row=1, col=1)
+                line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[color_index + 4])
+                )
+            )
 
             color_index += 5
 
@@ -917,15 +940,17 @@ def time_series_arima(
             mode='markers',
             name='Actual Intensity',
             marker=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[0])
-        ), row=1, col=1)
+            )
+        )
 
         fig.add_trace(go.Scatter(
             x=test_df.index,
             y=forecast,
             mode='markers',
             name='Predicted Intensity',
-            line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[2])
-        ), row=1, col=1)
+            line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[3])
+            )
+        )
 
         fig.add_trace(go.Scatter(
             x=forecast_plot.index,
@@ -933,7 +958,8 @@ def time_series_arima(
             mode='lines',
             name='Mean Predicted Intensity',
             line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[4])
-        ), row=1, col=1)
+            )
+        )
 
         scores.append({
             'group': 'Overall',
@@ -950,15 +976,6 @@ def time_series_arima(
         for res in scores
     ])
 
-    fig.add_trace(go.Scatter(
-        x=[0],
-        y=[0.25],
-        text=[annotation_text],
-        mode='text',
-        textfont=dict(size=12),
-        showlegend=False
-    ), row=1, col=2)
-
     fig.update_layout(
         title=f"Intensity over Time for {protein_group}",
         plot_bgcolor=colors["plot_bgcolor"],
@@ -972,19 +989,22 @@ def time_series_arima(
         autosize=True,
         margin=dict(l=100, r=100, t=100, b=50),
         legend=dict(
-            yanchor="top",
-            y=0.95,
-            xanchor="right",
-            x=0.775
-        )
+            x=1.05,
+            y=1,
+            bgcolor="rgba(255, 255, 255, 0.5)",
+            orientation="v",
+        ),
     )
 
-    fig.update_xaxes(showticklabels=False, showgrid=False, zeroline=False, row=1, col=2)
-    fig.update_yaxes(showticklabels=False, showgrid=False, zeroline=False, row=1, col=2)
-
-    fig.update_annotations(font_size=12)
+    messages.append(
+        {
+            "level": logging.INFO,
+            "msg": annotation_text,
+        }
+    )
 
     return dict(
         scores=scores,
         plots=[fig],
+        messages=messages,
     )
diff --git a/protzilla/methods/data_analysis.py b/protzilla/methods/data_analysis.py
index 4a1f96d5..4907bbf6 100644
--- a/protzilla/methods/data_analysis.py
+++ b/protzilla/methods/data_analysis.py
@@ -876,6 +876,7 @@ class TimeSeriesADFullerTest(DataAnalysisStep):
     input_keys = [
         "intensity_df",
         "metadata_df",
+        "time_column",
         "protein_group",
         "alpha",
     ]
diff --git a/ui/runs/forms/data_analysis.py b/ui/runs/forms/data_analysis.py
index 5f4e18e8..2b8fcf64 100644
--- a/ui/runs/forms/data_analysis.py
+++ b/ui/runs/forms/data_analysis.py
@@ -1370,6 +1370,7 @@ class TimeSeriesADFullerTestForm(MethodForm):
         choices=[],
         label="Intensity dataframe",
     )
+    time_column = CustomChoiceField(choices=[], label="Time: The column name from metadata that represents time")
     protein_group = CustomChoiceField(
         choices=[],
         label="Protein group: which protein group to perform the ADFuller test on",
@@ -1388,6 +1389,9 @@ def fill_form(self, run: Run) -> None:
         input_df_instance_id = self.data.get(
             "intensity_df", self.fields["intensity_df"].choices[0][0]
         )
+        self.fields[
+            "time_column"
+        ].choices = fill_helper.get_choices_for_metadata_non_sample_columns(run)
         self.fields["protein_group"].choices = fill_helper.to_choices(
             run.steps.get_step_output(
                 step_type=Step,
diff --git a/user_data/workflows/standard.yaml b/user_data/workflows/standard.yaml
index 0b93bd80..52d754b7 100644
--- a/user_data/workflows/standard.yaml
+++ b/user_data/workflows/standard.yaml
@@ -36,20 +36,20 @@ steps:
     plot_inputs: { }
     type: OutlierDetectionByLocalOutlierFactor
   - form_inputs:
-      percentile: 0.5
+      log_base: log2
     inputs: { }
     plot_inputs:
-      graph_type: Boxplot
+      graph_type: Histogram
       group_by: None
-      visual_transformation: log10
-    type: NormalisationByMedian
+    type: TransformationLog
   - form_inputs:
-      log_base: log2
+      percentile: 0.5
     inputs: { }
     plot_inputs:
-      graph_type: Histogram
+      graph_type: Boxplot
       group_by: None
-    type: TransformationLog
+      visual_transformation: log10
+    type: NormalisationByMedian
   - form_inputs:
       similarity_measure: euclidean distance
     inputs: { }
diff --git a/user_data/workflows/workflow_BA_Kuganash.yaml b/user_data/workflows/workflow_Kuganash-BA.yaml
similarity index 71%
rename from user_data/workflows/workflow_BA_Kuganash.yaml
rename to user_data/workflows/workflow_Kuganash-BA.yaml
index 1a19947c..ef2d26bb 100644
--- a/user_data/workflows/workflow_BA_Kuganash.yaml
+++ b/user_data/workflows/workflow_Kuganash-BA.yaml
@@ -1,18 +1,12 @@
 df_mode: disk_memory
 steps:
 - form_inputs:
-    aggregation_method: Median
-    intensity_name: Intensity
+    aggregation_mode: Sum
+    intensity_name: iBAQ
     map_to_uniprot: false
   inputs: {}
   instance_identifier: MaxQuantImport_1
   type: MaxQuantImport
-- form_inputs:
-    intensity_name: Intensity
-    map_to_uniprot: false
-  inputs: {}
-  instance_identifier: EvidenceImport_1
-  type: EvidenceImport
 - form_inputs:
     feature_orientation: Columns (samples in rows, features in columns)
   inputs: {}
@@ -48,15 +42,6 @@ steps:
   instance_identifier: OutlierDetectionByLocalOutlierFactor_1
   plot_inputs: {}
   type: OutlierDetectionByLocalOutlierFactor
-- form_inputs:
-    percentile: 0.5
-  inputs: {}
-  instance_identifier: NormalisationByMedian_1
-  plot_inputs:
-    graph_type: Boxplot
-    group_by: None
-    visual_transformation: log10
-  type: NormalisationByMedian
 - form_inputs:
     log_base: log2
   inputs: {}
@@ -66,24 +51,27 @@ steps:
     group_by: None
   type: TransformationLog
 - form_inputs:
-    input_df: TransformationLog_1
-    protein_group: D3YYU8
-    similarity: 1
-    similarity_measure: euclidean distance
+    percentile: 0.5
   inputs: {}
-  instance_identifier: PlotTimeSeries_1
-  type: PlotTimeSeriesPeptide
-- form_inputs:
-    input_df: TransformationLog_1
-    protein_group: D3YYU8
-    test_size: 0.2
+  instance_identifier: NormalisationByMedian_1
+  plot_inputs:
+    graph_type: Boxplot
+    group_by: None
+    visual_transformation: log10
+  type: NormalisationByMedian
+- form_inputs: {}
+  inputs: {}
+  instance_identifier: PlotTimeQuant_1
+  type: PlotTimeQuant
+- form_inputs: {}
   inputs: {}
   instance_identifier: TimeSeriesLinearRegression_1
   type: TimeSeriesLinearRegression
-- form_inputs:
-    input_df: TransformationLog_1
-    protein_group: D3YYU8
-    test_size: 0.2
+- form_inputs: {}
+  inputs: {}
+  instance_identifier: TimeSeriesADFullerTest_1
+  type: TimeSeriesADFullerTest
+- form_inputs: {}
   inputs: {}
-  instance_identifier: TimeSeriesRANSACRegression_1
-  type: TimeSeriesRANSACRegression
+  instance_identifier: TimeSeriesAutoARIMA_1
+  type: TimeSeriesAutoARIMA
diff --git a/user_data/workflows/workflow_Plot-Thesis.yaml b/user_data/workflows/workflow_Plot-Thesis.yaml
deleted file mode 100644
index a3dee9fa..00000000
--- a/user_data/workflows/workflow_Plot-Thesis.yaml
+++ /dev/null
@@ -1,67 +0,0 @@
-df_mode: disk_memory
-steps:
-  - form_inputs:
-      intensity_name: iBAQ
-      map_to_uniprot: false
-      aggregation_mode: Sum
-    inputs: { }
-    type: MaxQuantImport
-  - form_inputs: {}
-    inputs: {}
-    instance_identifier: EvidenceImport_1
-    type: EvidenceImport
-  - form_inputs:
-      feature_orientation: Columns (samples in rows, features in columns)
-    inputs: {}
-    instance_identifier: MetadataImport_1
-    type: MetadataImport
-  - form_inputs:
-      similarity_measure: euclidean distance
-    inputs: {}
-    instance_identifier: PlotTimeSeries_1
-    type: PlotTimeSeriesPeptide
-  - form_inputs:
-      percentage: 0.5
-    inputs: { }
-    plot_inputs:
-      graph_type: Bar chart
-    type: FilterProteinsBySamplesMissing
-  - form_inputs:
-      deviation_threshold: 2.0
-    inputs: { }
-    plot_inputs:
-      graph_type: Bar chart
-    type: FilterSamplesByProteinIntensitiesSum
-  - form_inputs:
-      number_of_neighbours: 5
-    inputs: { }
-    plot_inputs:
-      graph_type: Boxplot
-      graph_type_quantities: Bar chart
-      group_by: None
-      visual_transformation: log10
-    type: ImputationByKNN
-  - form_inputs:
-      number_of_neighbors: 20
-    inputs: { }
-    plot_inputs: { }
-    type: OutlierDetectionByLocalOutlierFactor
-  - form_inputs:
-      percentile: 0.5
-    inputs: { }
-    plot_inputs:
-      graph_type: Boxplot
-      group_by: None
-      visual_transformation: log10
-    type: NormalisationByMedian
-  - form_inputs:
-      log_base: log2
-    inputs: { }
-    plot_inputs:
-      graph_type: Histogram
-      group_by: None
-    type: TransformationLog
-  - form_inputs:
-      similarity_measure: euclidean distance
-    inputs: { }
-    type: PlotProtQuantPeptide
\ No newline at end of file

From 0e2044705df5a7aea2e5b2122e3c24e2aacae427 Mon Sep 17 00:00:00 2001
From: AK <kuganash.ravishanker@student.hpi.de>
Date: Tue, 24 Sep 2024 14:04:52 +0200
Subject: [PATCH 50/52] Updated Test

---
 tests/protzilla/data_analysis/test_time_series_analysis.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/protzilla/data_analysis/test_time_series_analysis.py b/tests/protzilla/data_analysis/test_time_series_analysis.py
index 20017922..7bdebbda 100644
--- a/tests/protzilla/data_analysis/test_time_series_analysis.py
+++ b/tests/protzilla/data_analysis/test_time_series_analysis.py
@@ -211,7 +211,7 @@ def test_ransac_regression_outputs(time_series_test_data):
 
 def test_adfuller_test(time_series_test_data):
     test_intensity, test_metadata = time_series_test_data
-    outputs = adfuller_test(test_intensity, test_metadata, "Protein1")
+    outputs = adfuller_test(test_intensity, test_metadata, "Time", "Protein1")
 
     assert "test_statistic" in outputs
     assert "p_value" in outputs

From f3b00e3bfbd711858f73a6535cb26c47d6215d43 Mon Sep 17 00:00:00 2001
From: AK <kuganash.ravishanker@student.hpi.de>
Date: Fri, 27 Sep 2024 13:50:12 +0200
Subject: [PATCH 51/52] Removed unwanted lines

---
 protzilla/data_analysis/time_series_plots.py             | 9 ---------
 .../data_analysis/time_series_regression_analysis.py     | 2 --
 2 files changed, 11 deletions(-)

diff --git a/protzilla/data_analysis/time_series_plots.py b/protzilla/data_analysis/time_series_plots.py
index 3c5f8059..37c8ad34 100644
--- a/protzilla/data_analysis/time_series_plots.py
+++ b/protzilla/data_analysis/time_series_plots.py
@@ -150,15 +150,6 @@ def time_quant_plot(
             line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[4]),
         )
     )
-    fig.add_trace(
-        go.Scatter(
-            x=[None],
-            y=[None],
-            mode="markers",
-            marker=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[0]),
-            name="Intensity",
-        )
-    )
     fig.update_layout(
         title=f"Time Series of {formatted_protein_name} in all samples",
         plot_bgcolor=colors["plot_bgcolor"],
diff --git a/protzilla/data_analysis/time_series_regression_analysis.py b/protzilla/data_analysis/time_series_regression_analysis.py
index b8aebce3..898f82f9 100644
--- a/protzilla/data_analysis/time_series_regression_analysis.py
+++ b/protzilla/data_analysis/time_series_regression_analysis.py
@@ -4,7 +4,6 @@
 import pandas as pd
 import plotly.graph_objects as go
 
-#from protzilla.data_analysis.time_series_helper import convert_time_to_hours
 from protzilla.utilities import default_intensity_column
 from protzilla.constants.colors import PROTZILLA_DISCRETE_COLOR_SEQUENCE
 
@@ -14,7 +13,6 @@
 from statsmodels.tsa.arima.model import ARIMA
 from statsmodels.tsa.stattools import adfuller
 from pmdarima import auto_arima
-from plotly.subplots import make_subplots
 
 colors = {
     "plot_bgcolor": "white",

From f8bab05baa3f559a7e5c6b2485496751cbf1dd1a Mon Sep 17 00:00:00 2001
From: AK <kuganash.ravishanker@student.hpi.de>
Date: Fri, 27 Sep 2024 13:54:08 +0200
Subject: [PATCH 52/52] Fixed Tests

---
 tests/protzilla/test_runner.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/protzilla/test_runner.py b/tests/protzilla/test_runner.py
index b5de3148..18080f48 100644
--- a/tests/protzilla/test_runner.py
+++ b/tests/protzilla/test_runner.py
@@ -94,8 +94,8 @@ def test_runner_imports(
         'FilterSamplesByProteinIntensitiesSum',
         'ImputationByKNN',
         'OutlierDetectionByLocalOutlierFactor',
-        'NormalisationByMedian',
         'TransformationLog',
+        'NormalisationByMedian',
         'PlotProtQuant',
         'DifferentialExpressionTTest',
         'PlotVolcano',
@@ -109,8 +109,8 @@ def test_runner_imports(
         call({'deviation_threshold': 2.0}),
         call({'number_of_neighbours': 5}),
         call({'number_of_neighbors': 20}),
-        call({'percentile': 0.5}),
         call({'log_base': 'log2'}),
+        call({'percentile': 0.5}),
         call({'similarity_measure': 'euclidean distance'}),
         call({'alpha': 0.05}),
         call({'fc_threshold': 1}),