# nifty/DataTransformer.py (PayneLab/nifty, main branch)

import sys
import pandas as pd
import numpy as np

from Colors import Colors

class DataTransformer:
    """Convert protein-pair rules and a quant table into model-ready tables.

    A rule is a pair ``(protein1, protein2)`` and is written as the string
    ``"protein1>protein2"`` when used as a feature name. The quant table is a
    DataFrame whose columns are looked up by protein name.
    """

    def __init__(self):
        pass

    def vectorize_all_pairs(self, pairs: list, quant_df) -> np.ndarray:
        """Evaluate ``protein1 > protein2`` for every pair on every quant row.

        Missing values are replaced with ``-inf`` before comparing, so a
        missing ``protein1`` always yields 0, and a missing ``protein2``
        yields 1 whenever ``protein1`` has a finite value.

        Args:
            pairs: Sequence of 2-item sequences ``(protein1, protein2)``; both
                names must be columns of ``quant_df``.
            quant_df: DataFrame of quantities, one column per protein.

        Returns:
            ``int8`` array of shape ``(len(pairs), len(quant_df))``; row ``i``
            holds the 0/1 outcome of ``pairs[i]`` for each row of ``quant_df``.

        Raises:
            KeyError: If a protein in ``pairs`` is not a column of ``quant_df``.
        """
        # -inf makes every comparison involving a missing value well defined.
        quant_matrix = quant_df.fillna(-np.inf).to_numpy()

        col_to_idx = {col: i for i, col in enumerate(quant_df.columns)}
        idx1 = [col_to_idx[p[0]] for p in pairs]
        idx2 = [col_to_idx[p[1]] for p in pairs]

        # Fancy-index the columns once per side, then transpose so that each
        # row of the result corresponds to one pair.
        prot1_matrix = quant_matrix[:, idx1].T
        prot2_matrix = quant_matrix[:, idx2].T

        return (prot1_matrix > prot2_matrix).astype(np.int8)

    def filter_rules(self, feature_df, quant_df):
        """Drop rules that mention a protein absent from the quant table.

        Args:
            feature_df: DataFrame with ``Protein1`` and ``Protein2`` columns.
            quant_df: DataFrame whose columns are the available proteins.

        Returns:
            A new DataFrame with only the rows of ``feature_df`` whose two
            proteins are both columns of ``quant_df``. ``feature_df`` itself
            is not modified.

        Raises:
            SystemExit: With status 1 (after printing an error to stderr) if
                no rule survives the filter.
        """
        available = quant_df.columns
        # One vectorized mask instead of re-filtering once per missing protein.
        usable = (
            feature_df['Protein1'].isin(available)
            & feature_df['Protein2'].isin(available)
        )
        updated_feature_df = feature_df[usable].copy()

        if updated_feature_df.empty:
            print(f"{Colors.ERROR}ERROR: All rules filtered out due to missing proteins in the quant table.{Colors.END}", file=sys.stderr, flush=True)
            raise SystemExit(1)

        return updated_feature_df

    def create_feature_table_from_model(self, model):
        """Rebuild the rule table from a model's ``feature_names_in_``.

        Each feature name is split on ``">"``; the first piece becomes
        ``Protein1`` and the second piece becomes ``Protein2``.

        Args:
            model: Object exposing ``feature_names_in_``, an iterable of
                strings of the form ``"protein1>protein2"``.

        Returns:
            DataFrame with columns ``Protein1`` and ``Protein2``, one row per
            feature name, in the same order.

        Raises:
            IndexError: If a feature name contains no ``">"``.
        """
        # Split each name once and reuse the pieces for both columns.
        split_names = [feature.split(">") for feature in model.feature_names_in_]
        return pd.DataFrame({
            "Protein1": [parts[0] for parts in split_names],
            "Protein2": [parts[1] for parts in split_names],
        })

    def add_missing_proteins(self, feature_df, quant_df):
        """Append an all-NaN column for every rule protein the quant table lacks.

        Args:
            feature_df: DataFrame with ``Protein1`` and ``Protein2`` columns.
            quant_df: DataFrame whose columns are the available proteins.

        Returns:
            A copy of ``quant_df`` followed by one NaN-filled column per
            missing protein. New columns appear in first-seen order (all of
            ``Protein1``, then ``Protein2``), so the layout is reproducible
            from run to run. ``quant_df`` itself is not modified.

        Raises:
            SystemExit: With status 1 (after printing an error to stderr) if
                none of the rule proteins is present in ``quant_df``.
        """
        # dict.fromkeys de-duplicates while keeping first-seen order; iterating
        # a set here would make the new column order depend on string hashing.
        proteins = list(dict.fromkeys(
            feature_df['Protein1'].tolist() + feature_df['Protein2'].tolist()
        ))
        missing = [protein for protein in proteins if protein not in quant_df.columns]

        # Also true when there are no rule proteins at all.
        if len(missing) == len(proteins):
            print(f"{Colors.ERROR}ERROR: All proteins in rules are missing in the quant table.{Colors.END}", file=sys.stderr, flush=True)
            raise SystemExit(1)

        if not missing:
            return quant_df.copy()

        # Attach every new column in one concat rather than inserting them one
        # at a time.
        nan_block = pd.DataFrame(np.nan, index=quant_df.index, columns=missing)
        return pd.concat([quant_df, nan_block], axis=1)

    def prep_vectorized_pairs_for_scikitlearn(self, rules, bool_matrix):
        """Turn a pair-by-sample 0/1 matrix into a sample-by-feature DataFrame.

        Args:
            rules: Sequence of rules, each a sequence of protein-name strings;
                rule ``i`` describes row ``i`` of ``bool_matrix``.
            bool_matrix: 2-D array-like with one row per rule and one column
                per sample.

        Returns:
            DataFrame of ``int64`` values with one row per sample and one
            column per rule, named ``">".join(rule)``, in the order of
            ``rules``. An empty DataFrame is returned when ``rules`` is empty.
        """
        pairs = [">".join(rule) for rule in rules]
        if not pairs:
            return pd.DataFrame()

        # Only the rows that have a matching rule are used; transposing puts
        # samples on the rows, and a single astype replaces the per-cell int().
        matrix = np.asarray(bool_matrix)[:len(pairs)]
        return pd.DataFrame(matrix.T.astype(np.int64), columns=pairs)