-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathDataTransformer.py
More file actions
88 lines (63 loc) · 3.13 KB
/
DataTransformer.py
File metadata and controls
88 lines (63 loc) · 3.13 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
import sys
import pandas as pd
import numpy as np
from Colors import Colors
class DataTransformer:
    """Turn protein-pair rules and a quantification table into 0/1 feature tables.

    A rule is a pair ``(protein1, protein2)`` and is written ``"protein1>protein2"``
    when used as a feature/column name. The quantification table is a DataFrame
    with one column per protein. None of the methods read instance state.
    """

    def __init__(self):
        """Create a transformer; there is no state to initialise."""

    def vectorize_all_pairs(self, pairs: list, quant_df) -> np.ndarray:
        """Evaluate ``protein1 > protein2`` for every pair on every row of ``quant_df``.

        Args:
            pairs: Sequence of 2-item sequences ``(protein1, protein2)``. Both
                names must be columns of ``quant_df``.
            quant_df: DataFrame with one column per protein.

        Returns:
            ``np.int8`` array of shape ``(len(pairs), len(quant_df))``. Row ``i``
            corresponds to ``pairs[i]`` (the array itself carries no labels);
            an entry is 1 where protein1's value is strictly greater than
            protein2's, else 0.

        Raises:
            KeyError: If a protein in ``pairs`` is not a column of ``quant_df``.
        """
        # Missing values become -inf: a present value then always beats a
        # missing one, and two missing values compare as "not greater" (0).
        quant_matrix = quant_df.fillna(-np.inf).to_numpy()

        col_to_idx = {col: i for i, col in enumerate(quant_df.columns)}
        # Explicit integer dtype keeps the fancy-indexing valid for empty `pairs`.
        first_idx = np.array([col_to_idx[p[0]] for p in pairs], dtype=np.intp)
        second_idx = np.array([col_to_idx[p[1]] for p in pairs], dtype=np.intp)

        # Transpose so that each row is one pair and each column one table row.
        first_values = quant_matrix[:, first_idx].T
        second_values = quant_matrix[:, second_idx].T
        return (first_values > second_values).astype(np.int8)

    def filter_rules(self, feature_df, quant_df):
        """Drop every rule that references a protein absent from ``quant_df``.

        Args:
            feature_df: DataFrame with ``'Protein1'`` and ``'Protein2'`` columns,
                one rule per row.
            quant_df: DataFrame whose column labels are the available proteins.

        Returns:
            A new DataFrame holding only the rows of ``feature_df`` whose two
            proteins are both columns of ``quant_df``. Original row labels are
            kept. Rows with a null protein name are dropped as well, since a
            null is never a known protein.

        Raises:
            SystemExit: With code 1 (after printing an error to stderr) when no
                rule survives the filter.
        """
        known = quant_df.columns
        # One vectorised mask instead of re-filtering the frame per missing protein.
        keep = feature_df['Protein1'].isin(known) & feature_df['Protein2'].isin(known)
        updated_feature_df = feature_df[keep].copy()

        if len(updated_feature_df) < 1:
            print(f"{Colors.ERROR}ERROR: All rules filtered out due to missing proteins in the quant table.{Colors.END}", file=sys.stderr, flush=True)
            raise SystemExit(1)
        return updated_feature_df

    def create_feature_table_from_model(self, model):
        """Rebuild the rule table from a fitted model's feature names.

        Args:
            model: Object exposing ``feature_names_in_``, an iterable of strings
                of the form ``"protein1>protein2"`` (presumably a fitted
                scikit-learn estimator -- confirm against the caller).

        Returns:
            DataFrame with columns ``'Protein1'`` and ``'Protein2'``, one row
            per feature name, in the same order.

        Raises:
            IndexError: If a feature name contains no ``">"`` separator.
        """
        # Split each name once; only the first two fields are used.
        parts = [feature.split(">") for feature in model.feature_names_in_]
        return pd.DataFrame({
            "Protein1": [fields[0] for fields in parts],
            "Protein2": [fields[1] for fields in parts],
        })

    def add_missing_proteins(self, feature_df, quant_df):
        """Add an all-NaN column for every rule protein missing from ``quant_df``.

        Args:
            feature_df: DataFrame with ``'Protein1'`` and ``'Protein2'`` columns.
            quant_df: DataFrame with one column per protein. It is not modified.

        Returns:
            A new DataFrame: the columns of ``quant_df`` followed by one NaN
            column per missing protein. Missing proteins are appended in order
            of first appearance (all of ``'Protein1'``, then ``'Protein2'``),
            so the column order is reproducible between runs.

        Raises:
            SystemExit: With code 1 (after printing an error to stderr) when
                none of the rule proteins is a column of ``quant_df``.
        """
        # unique() preserves first-appearance order, unlike iterating a set,
        # whose order for strings changes with hash randomisation.
        proteins = pd.concat(
            [feature_df['Protein1'], feature_df['Protein2']], ignore_index=True
        ).unique()
        missing = [protein for protein in proteins if protein not in quant_df.columns]

        # Also true when there are no rule proteins at all.
        if len(missing) == len(proteins):
            print(f"{Colors.ERROR}ERROR: All proteins in rules are missing in the quant table.{Colors.END}", file=sys.stderr, flush=True)
            raise SystemExit(1)

        if not missing:
            return quant_df.copy()

        # Append all new columns at once; inserting them one by one fragments
        # the frame and triggers pandas' PerformanceWarning on wide tables.
        filler = pd.DataFrame(np.nan, index=quant_df.index, columns=missing)
        return pd.concat([quant_df, filler], axis=1)

    def prep_vectorized_pairs_for_scikitlearn(self, rules, bool_matrix):
        """Wrap a pair-by-row 0/1 matrix in a DataFrame with one column per rule.

        Args:
            rules: Sequence of string sequences; each is joined with ``">"`` to
                form its column name.
            bool_matrix: 2-D array whose row ``i`` holds the values for
                ``rules[i]``. Rows beyond ``len(rules)`` are ignored.

        Returns:
            DataFrame of ``int64`` values with one column per rule (in ``rules``
            order) and one row per column of ``bool_matrix``. An empty
            DataFrame when ``rules`` is empty.
        """
        pairs = [">".join(rule) for rule in rules]
        if not pairs:
            return pd.DataFrame()

        # Transpose so rules become columns; cast once instead of per element.
        values = np.asarray(bool_matrix)[:len(pairs)].T.astype(np.int64)
        return pd.DataFrame(values, columns=pairs)