From b1aca5897529d019fb4c4c041d91daa480fe3b54 Mon Sep 17 00:00:00 2001 From: henning Date: Mon, 2 Dec 2024 15:22:37 +0100 Subject: [PATCH 1/6] Add filtering of zero or negative intensity values in transformation.py. Add message warning the user of filtered data in transformation.py. --- .../data_preprocessing/transformation.py | 33 +++++++++++++++-- .../data_preprocessing/test_transformation.py | 36 +++++++++++++++---- 2 files changed, 61 insertions(+), 8 deletions(-) diff --git a/protzilla/data_preprocessing/transformation.py b/protzilla/data_preprocessing/transformation.py index 221b01ab..5da0888a 100644 --- a/protzilla/data_preprocessing/transformation.py +++ b/protzilla/data_preprocessing/transformation.py @@ -1,3 +1,6 @@ +from pyexpat.errors import messages +import logging + import numpy as np import pandas as pd @@ -22,11 +25,37 @@ def by_log(protein_df: pd.DataFrame, peptide_df: pd.DataFrame | None, log_base=" long format with the transformed data and an empty dict. :rtype: Tuple[pandas DataFrame, dict] """ + msg = [] intensity_name = default_intensity_column(protein_df) transformed_df = protein_df.copy() transformed_peptide_df = peptide_df.copy() if peptide_df is not None else None + zero_intensity_index = transformed_df[transformed_df[intensity_name] <= 0].index + untransformable_data_df = transformed_df.loc[zero_intensity_index] + transformed_df.drop(zero_intensity_index, inplace=True) + transformed_df.reset_index(drop=True, inplace=True) + + if transformed_peptide_df is not None: + zero_intensity_peptide_index = transformed_peptide_df[transformed_peptide_df["Intensity"] <= 0].index + untransformable_peptide_data_df = transformed_peptide_df.loc[zero_intensity_peptide_index] + transformed_peptide_df.drop(zero_intensity_peptide_index, inplace=True) + transformed_peptide_df.reset_index(drop=True, inplace=True) + if not untransformable_peptide_data_df.empty: + msg.append(dict( + msg=f"Warning: {len(untransformable_peptide_data_df)} data points of peptide data with zero or negative intensity values were found and will be dropped. " + f"Please adapt your preprocessing pipeline if this is unexpected.", + level=logging.WARNING + ) + ) + + + if not untransformable_data_df.empty: + msg.append(dict( + msg=f"Warning: {len(untransformable_data_df)} data points of {len(untransformable_data_df['Protein ID'])} distinct protein groups with zero or negative intensity values were found and will be dropped. " + f"Please adapt your preprocessing pipeline if this is unexpected.", + level=logging.WARNING + ) + ) - # TODO 41 drop data when intensity is 0 and return them in dict if log_base == "log2": transformed_df[intensity_name] = np.log2(transformed_df[intensity_name]) if transformed_peptide_df is not None: @@ -41,7 +70,7 @@ def by_log(protein_df: pd.DataFrame, peptide_df: pd.DataFrame | None, log_base=" ) else: raise ValueError("Unknown log_base. Known log methods are 'log2' and 'log10'.") - return dict(protein_df=transformed_df, peptide_df=transformed_peptide_df) + return dict(protein_df=transformed_df, peptide_df=transformed_peptide_df, messages=msg) def by_log_plot(method_inputs, method_outputs, graph_type, group_by): diff --git a/tests/protzilla/data_preprocessing/test_transformation.py b/tests/protzilla/data_preprocessing/test_transformation.py index 09827157..cd7b0c10 100644 --- a/tests/protzilla/data_preprocessing/test_transformation.py +++ b/tests/protzilla/data_preprocessing/test_transformation.py @@ -16,8 +16,8 @@ def log2_transformation_df(): ["Sample2", "Protein2", "Gene2", np.nan], ["Sample2", "Protein3", "Gene3", 4], ["Sample2", "Protein4", "Gene4", 4], - ["Sample3", "Protein1", "Gene1", 8], - ["Sample3", "Protein2", "Gene2", 8], + ["Sample3", "Protein1", "Gene1", 0], + ["Sample3", "Protein2", "Gene2", 0], ["Sample3", "Protein3", "Gene3", 8], ["Sample3", "Protein4", "Gene4", 8], ["Sample4", "Protein1", "Gene1", 1024], @@ -42,8 +42,6 @@ def log2_transformation_expected_df(): ["Sample2", "Protein2", "Gene2", np.nan], ["Sample2", "Protein3", "Gene3", 2.0], ["Sample2", "Protein4", "Gene4", 2.0], - ["Sample3", "Protein1", "Gene1", 3.0], - ["Sample3", "Protein2", "Gene2", 3.0], ["Sample3", "Protein3", "Gene3", 3.0], ["Sample3", "Protein4", "Gene4", 3.0], ["Sample4", "Protein1", "Gene1", 10.0], @@ -250,10 +248,36 @@ def test_by_log_without_peptide_df(log2_transformation_df, log_base): def test_log_by_0_transformation(): - # TODO 41 test expected behaviour when 0 occurs in df df = pd.DataFrame( data=(["Sample1", "Protein1", "Gene1", 0.0],), columns=["Sample", "Protein ID", "Gene", "Intensity"], ) - by_log(df, None, log_base="log2") + method_outputs = by_log(df, None, log_base="log2") + assert method_outputs["protein_df"].empty, "The protein DataFrame should be empty." + +def test_log2_transformation_with_negative_values( + log2_transformation_df, peptides_df +): + # Add negative values to the DataFrame with concat + log2_transformation_df = pd.concat( + [log2_transformation_df, pd.DataFrame([["Sample5", "Protein5", "Gene5", -2]], columns=log2_transformation_df.columns)] + ) + peptides_df = pd.concat( + [peptides_df, pd.DataFrame([["Sample5", "Protein1", "Peptide5", -2, 0.037779]], + columns = ["Sample", "Protein ID", "Sequence", "Intensity", "PEP"])] + ) + + method_inputs = { + "protein_df": log2_transformation_df, + "peptide_df": peptides_df, + "log_base": "log2", + } + method_outputs = by_log(**method_inputs) + + result_df = method_outputs["protein_df"] + result_peptide_df = method_outputs["peptide_df"] + + # Check that negative values are removed + assert not (result_df["Intensity"] < 0).any(), "Negative values were not removed from the protein DataFrame" + assert not (result_peptide_df["Intensity"] < 0).any(), "Negative values were not removed from the peptide DataFrame" From 9042ffb667f71b65ffd826873390e5db6e4deaf3 Mon Sep 17 00:00:00 2001 From: henning Date: Mon, 2 Dec 2024 15:28:43 +0100 Subject: [PATCH 2/6] Fix docstring of transformation.py's by_log(..) function --- protzilla/data_preprocessing/transformation.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/protzilla/data_preprocessing/transformation.py b/protzilla/data_preprocessing/transformation.py index 5da0888a..97d4d8b8 100644 --- a/protzilla/data_preprocessing/transformation.py +++ b/protzilla/data_preprocessing/transformation.py @@ -10,9 +10,8 @@ def by_log(protein_df: pd.DataFrame, peptide_df: pd.DataFrame | None, log_base="log10") -> dict: """ - This function log-transforms intensity - DataFrames. Supports log-transformation to the base - of 2 or 10. + This function log-transforms intensity, while ignoring and dropping negative or 0 intensity values. + Supports log-transformation to the base of 2 or 10. :param protein_df: a protein data frame in long format :type protein_df: pd.DataFrame From 4ad5f8f088b877d0dc72538c476b4881c663887c Mon Sep 17 00:00:00 2001 From: henning Date: Mon, 2 Dec 2024 15:29:40 +0100 Subject: [PATCH 3/6] Reformatting --- .../data_preprocessing/transformation.py | 37 ++++++++++++------- 1 file changed, 23 insertions(+), 14 deletions(-) diff --git a/protzilla/data_preprocessing/transformation.py b/protzilla/data_preprocessing/transformation.py index 97d4d8b8..be47dad6 100644 --- a/protzilla/data_preprocessing/transformation.py +++ b/protzilla/data_preprocessing/transformation.py @@ -8,7 +8,9 @@ from protzilla.utilities import default_intensity_column -def by_log(protein_df: pd.DataFrame, peptide_df: pd.DataFrame | None, log_base="log10") -> dict: +def by_log( + protein_df: pd.DataFrame, peptide_df: pd.DataFrame | None, log_base="log10" +) -> dict: """ This function log-transforms intensity, while ignoring and dropping negative or 0 intensity values. Supports log-transformation to the base of 2 or 10. @@ -34,24 +36,29 @@ def by_log(protein_df: pd.DataFrame, peptide_df: pd.DataFrame | None, log_base=" transformed_df.reset_index(drop=True, inplace=True) if transformed_peptide_df is not None: - zero_intensity_peptide_index = transformed_peptide_df[transformed_peptide_df["Intensity"] <= 0].index - untransformable_peptide_data_df = transformed_peptide_df.loc[zero_intensity_peptide_index] + zero_intensity_peptide_index = transformed_peptide_df[ + transformed_peptide_df["Intensity"] <= 0 + ].index + untransformable_peptide_data_df = transformed_peptide_df.loc[ + zero_intensity_peptide_index + ] transformed_peptide_df.drop(zero_intensity_peptide_index, inplace=True) transformed_peptide_df.reset_index(drop=True, inplace=True) if not untransformable_peptide_data_df.empty: - msg.append(dict( - msg=f"Warning: {len(untransformable_peptide_data_df)} data points of peptide data with zero or negative intensity values were found and will be dropped. " - f"Please adapt your preprocessing pipeline if this is unexpected.", - level=logging.WARNING + msg.append( + dict( + msg=f"Warning: {len(untransformable_peptide_data_df)} data points of peptide data with zero or negative intensity values were found and will be dropped. " + f"Please adapt your preprocessing pipeline if this is unexpected.", + level=logging.WARNING, + ) ) - ) - if not untransformable_data_df.empty: - msg.append(dict( - msg=f"Warning: {len(untransformable_data_df)} data points of {len(untransformable_data_df['Protein ID'])} distinct protein groups with zero or negative intensity values were found and will be dropped. " - f"Please adapt your preprocessing pipeline if this is unexpected.", - level=logging.WARNING + msg.append( + dict( + msg=f"Warning: {len(untransformable_data_df)} data points of {len(untransformable_data_df['Protein ID'])} distinct protein groups with zero or negative intensity values were found and will be dropped. " + f"Please adapt your preprocessing pipeline if this is unexpected.", + level=logging.WARNING, ) ) @@ -69,7 +76,9 @@ def by_log(protein_df: pd.DataFrame, peptide_df: pd.DataFrame | None, log_base=" ) else: raise ValueError("Unknown log_base. Known log methods are 'log2' and 'log10'.") - return dict(protein_df=transformed_df, peptide_df=transformed_peptide_df, messages=msg) + return dict( + protein_df=transformed_df, peptide_df=transformed_peptide_df, messages=msg + ) def by_log_plot(method_inputs, method_outputs, graph_type, group_by): From ccaa057c5c86814dd8ffc7ab7efc234611476186 Mon Sep 17 00:00:00 2001 From: henning Date: Mon, 2 Dec 2024 15:30:43 +0100 Subject: [PATCH 4/6] Reformatting --- .../data_preprocessing/test_transformation.py | 30 ++++++++++++++----- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/tests/protzilla/data_preprocessing/test_transformation.py b/tests/protzilla/data_preprocessing/test_transformation.py index cd7b0c10..2610cdbc 100644 --- a/tests/protzilla/data_preprocessing/test_transformation.py +++ b/tests/protzilla/data_preprocessing/test_transformation.py @@ -256,16 +256,26 @@ def test_log_by_0_transformation(): method_outputs = by_log(df, None, log_base="log2") assert method_outputs["protein_df"].empty, "The protein DataFrame should be empty." -def test_log2_transformation_with_negative_values( - log2_transformation_df, peptides_df -): + +def test_log2_transformation_with_negative_values(log2_transformation_df, peptides_df): # Add negative values to the DataFrame with concat log2_transformation_df = pd.concat( - [log2_transformation_df, pd.DataFrame([["Sample5", "Protein5", "Gene5", -2]], columns=log2_transformation_df.columns)] + [ + log2_transformation_df, + pd.DataFrame( + [["Sample5", "Protein5", "Gene5", -2]], + columns=log2_transformation_df.columns, + ), + ] ) peptides_df = pd.concat( - [peptides_df, pd.DataFrame([["Sample5", "Protein1", "Peptide5", -2, 0.037779]], - columns = ["Sample", "Protein ID", "Sequence", "Intensity", "PEP"])] + [ + peptides_df, + pd.DataFrame( + [["Sample5", "Protein1", "Peptide5", -2, 0.037779]], + columns=["Sample", "Protein ID", "Sequence", "Intensity", "PEP"], + ), + ] ) method_inputs = { @@ -279,5 +289,9 @@ def test_log2_transformation_with_negative_values( result_peptide_df = method_outputs["peptide_df"] # Check that negative values are removed - assert not (result_df["Intensity"] < 0).any(), "Negative values were not removed from the protein DataFrame" - assert not (result_peptide_df["Intensity"] < 0).any(), "Negative values were not removed from the peptide DataFrame" + assert not ( + result_df["Intensity"] < 0 + ).any(), "Negative values were not removed from the protein DataFrame" + assert not ( + result_peptide_df["Intensity"] < 0 + ).any(), "Negative values were not removed from the peptide DataFrame" From 141644b298eb3236c8c163e61ba28d2284b9ffd5 Mon Sep 17 00:00:00 2001 From: henning Date: Mon, 2 Dec 2024 15:31:34 +0100 Subject: [PATCH 5/6] Remove unused import --- protzilla/data_preprocessing/transformation.py | 1 - 1 file changed, 1 deletion(-) diff --git a/protzilla/data_preprocessing/transformation.py b/protzilla/data_preprocessing/transformation.py index be47dad6..60082113 100644 --- a/protzilla/data_preprocessing/transformation.py +++ b/protzilla/data_preprocessing/transformation.py @@ -1,4 +1,3 @@ -from pyexpat.errors import messages import logging import numpy as np From db8b358bd68b11e44e48d2433669dc32990f046f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henning=20G=C3=A4rtner?= <104069093+henninggaertner@users.noreply.github.com> Date: Tue, 4 Mar 2025 16:46:18 +0100 Subject: [PATCH 6/6] Run black --- protzilla/data_preprocessing/transformation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/protzilla/data_preprocessing/transformation.py b/protzilla/data_preprocessing/transformation.py index 60082113..6c5be256 100644 --- a/protzilla/data_preprocessing/transformation.py +++ b/protzilla/data_preprocessing/transformation.py @@ -47,7 +47,7 @@ def by_log( msg.append( dict( msg=f"Warning: {len(untransformable_peptide_data_df)} data points of peptide data with zero or negative intensity values were found and will be dropped. " - f"Please adapt your preprocessing pipeline if this is unexpected.", + f"Please adapt your preprocessing workflow if this is unexpected.", level=logging.WARNING, ) )