From dc4f944658cb7bb0b8b8eea4664ab744a3b92836 Mon Sep 17 00:00:00 2001 From: Caio Date: Wed, 14 Oct 2020 17:49:21 -0300 Subject: [PATCH 01/10] add gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..ed8ebf5 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +__pycache__ \ No newline at end of file From 1539d8b837edadf1867ed175292091d8a7de9785 Mon Sep 17 00:00:00 2001 From: Caio Date: Thu, 15 Oct 2020 10:39:46 -0300 Subject: [PATCH 02/10] add requirements txt --- requirements.txt | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 requirements.txt diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..8fc7d88 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,10 @@ +click==7.1.2 +joblib==0.17.0 +numpy==1.19.2 +PyQt5==5.15.1 +PyQt5-sip==12.8.1 +scikit-learn==0.19.2 +scipy==1.5.2 +six==1.15.0 +sklearn==0.0 +threadpoolctl==2.1.0 From 54c8cde9e220f43801d7336fd8fbd2ed8e22379b Mon Sep 17 00:00:00 2001 From: Caio Date: Thu, 15 Oct 2020 11:00:56 -0300 Subject: [PATCH 03/10] add command line interface --- antivpp.py | 391 ++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 315 insertions(+), 76 deletions(-) diff --git a/antivpp.py b/antivpp.py index 9b5419d..19682c8 100644 --- a/antivpp.py +++ b/antivpp.py @@ -1,66 +1,295 @@ import sys from PyQt5 import uic, QtWidgets +from sklearn.externals import joblib -qtCreatorFile = "antivpp.ui" # Name of the file here +qtCreatorFile = "antivpp.ui" # Name of the file here Ui_MainWindow, QtBaseClass = uic.loadUiType(qtCreatorFile) + +def read_fasta(fp): + name, seq = None, [] + for line in fp: + line = line.rstrip() + if line.startswith(">"): + if name: + yield (name, ''.join(seq)) + name, seq = line, [] + else: + seq.append(line) + if name: + yield (name, ''.join(seq)) + + +def getResultsSeq(sequence, rfc): + paste_seq = str(sequence) + + kyte_doolittle = {'A': 1.80, 'C': 2.50, 'D': -3.50, 'E': -3.50, 'F': 2.80, + 'G': -0.40, 'H': -3.20, 'I': 4.50, 'K': -3.90, 'L': 3.80, + 'M': 1.90, 'N': -3.50, 'P': -1.60, 'Q': -3.50, 'R': -4.50, + 'S': -0.80, 'T': -0.70, 'V': 4.20, 'W': -0.90, 'Y': -1.30} + + molecular_weigth = {'A': 89.09, 'C': 121.15, 'D': 133.10, 'E': 147.13, 'F': 165.19, + 'G': 75.07, 'H': 155.16, 'I': 131.17, 'K': 146.19, 'L': 131.17, + 'M': 149.21, 'N': 132.12, 'P': 115.13, 'Q': 146.15, 'R': 174.20, + 'S': 105.09, 'T': 119.12, 'V': 117.15, 'W': 204.24, 'Y': 181.19} + + net_charge = {'A': 0, 'C': 0, 'D': -1, 'E': -1, 'F': 0, + 'G': 0, 'H': 0, 'I': 0, 'K': 1, 'L': 0, + 'M': 0, 'N': 0, 'P': 0, 'Q': 0, 'R': 1, + 'S': 0, 'T': 0, 'V': 0, 'W': 0, 'Y': 0} + + net_hydrogen = {'A': 0, 'C': 0, 'D': 1, 'E': 1, 'F': 0, + 'G': 0, 'H': 1, 'I': 0, 'K': 2, 'L': 0, + 'M': 0, 'N': 2, 'P': 0, 'Q': 2, 'R': 4, + 'S': 1, 'T': 1, 'V': 0, 'W': 1, 'Y': 1} + + a_a = round(paste_seq.count("A")/len(paste_seq+str(0.000001)), 3) + c_c = round(paste_seq.count("C")/len(paste_seq+str(0.000001)), 3) + d_d = round(paste_seq.count("D")/len(paste_seq+str(0.000001)), 3) + e_e = round(paste_seq.count("E")/len(paste_seq+str(0.000001)), 3) + f_f = round(paste_seq.count("F")/len(paste_seq+str(0.000001)), 3) + g_g = round(paste_seq.count("G")/len(paste_seq+str(0.000001)), 3) + h_h = round(paste_seq.count("H")/len(paste_seq+str(0.000001)), 3) + i_i = round(paste_seq.count("I")/len(paste_seq+str(0.000001)), 3) + k_k = round(paste_seq.count("K")/len(paste_seq+str(0.000001)), 3) + l_l = round(paste_seq.count("L")/len(paste_seq+str(0.000001)), 3) + m_m = round(paste_seq.count("M")/len(paste_seq+str(0.000001)), 3) + n_n = round(paste_seq.count("N")/len(paste_seq+str(0.000001)), 3) + p_p = round(paste_seq.count("P")/len(paste_seq+str(0.000001)), 3) + q_q = round(paste_seq.count("Q")/len(paste_seq+str(0.000001)), 3) + r_r = round(paste_seq.count("R")/len(paste_seq+str(0.000001)), 3) + s_s = round(paste_seq.count("S")/len(paste_seq+str(0.000001)), 3) + t_t = round(paste_seq.count("T")/len(paste_seq+str(0.000001)), 3) + v_v = round(paste_seq.count("V")/len(paste_seq+str(0.000001)), 3) + w_w = round(paste_seq.count("W")/len(paste_seq+str(0.000001)), 3) + y_y = round(paste_seq.count("Y")/len(paste_seq+str(0.000001)), 3) + + a_kyte = paste_seq.count("A")*kyte_doolittle["A"] + c_kyte = paste_seq.count("C")*kyte_doolittle["C"] + d_kyte = paste_seq.count("D")*kyte_doolittle["D"] + e_kyte = paste_seq.count("E")*kyte_doolittle["E"] + f_kyte = paste_seq.count("F")*kyte_doolittle["F"] + g_kyte = paste_seq.count("G")*kyte_doolittle["G"] + h_kyte = paste_seq.count("H")*kyte_doolittle["H"] + i_kyte = paste_seq.count("I")*kyte_doolittle["I"] + k_kyte = paste_seq.count("K")*kyte_doolittle["K"] + l_kyte = paste_seq.count("L")*kyte_doolittle["L"] + m_kyte = paste_seq.count("M")*kyte_doolittle["M"] + n_kyte = paste_seq.count("N")*kyte_doolittle["N"] + p_kyte = paste_seq.count("P")*kyte_doolittle["P"] + q_kyte = paste_seq.count("Q")*kyte_doolittle["Q"] + r_kyte = paste_seq.count("R")*kyte_doolittle["R"] + s_kyte = paste_seq.count("S")*kyte_doolittle["S"] + t_kyte = paste_seq.count("T")*kyte_doolittle["T"] + v_kyte = paste_seq.count("V")*kyte_doolittle["V"] + w_kyte = paste_seq.count("W")*kyte_doolittle["W"] + y_kyte = paste_seq.count("Y")*kyte_doolittle["Y"] + + a_mw = paste_seq.count("A")*molecular_weigth["A"] + c_mw = paste_seq.count("C")*molecular_weigth["C"] + d_mw = paste_seq.count("D")*molecular_weigth["D"] + e_mw = paste_seq.count("E")*molecular_weigth["E"] + f_mw = paste_seq.count("F")*molecular_weigth["F"] + g_mw = paste_seq.count("G")*molecular_weigth["G"] + h_mw = paste_seq.count("H")*molecular_weigth["H"] + i_mw = paste_seq.count("I")*molecular_weigth["I"] + k_mw = paste_seq.count("K")*molecular_weigth["K"] + l_mw = paste_seq.count("L")*molecular_weigth["L"] + m_mw = paste_seq.count("M")*molecular_weigth["M"] + n_mw = paste_seq.count("N")*molecular_weigth["N"] + p_mw = paste_seq.count("P")*molecular_weigth["P"] + q_mw = paste_seq.count("Q")*molecular_weigth["Q"] + r_mw = paste_seq.count("R")*molecular_weigth["R"] + s_mw = paste_seq.count("S")*molecular_weigth["S"] + t_mw = paste_seq.count("T")*molecular_weigth["T"] + v_mw = paste_seq.count("V")*molecular_weigth["V"] + w_mw = paste_seq.count("W")*molecular_weigth["W"] + y_mw = paste_seq.count("Y")*molecular_weigth["Y"] + + a_charge = paste_seq.count("A")*net_charge["A"] + c_charge = paste_seq.count("C")*net_charge["C"] + d_charge = paste_seq.count("D")*net_charge["D"] + e_charge = paste_seq.count("E")*net_charge["E"] + f_charge = paste_seq.count("F")*net_charge["F"] + g_charge = paste_seq.count("G")*net_charge["G"] + h_charge = paste_seq.count("H")*net_charge["H"] + i_charge = paste_seq.count("I")*net_charge["I"] + k_charge = paste_seq.count("K")*net_charge["K"] + l_charge = paste_seq.count("L")*net_charge["L"] + m_charge = paste_seq.count("M")*net_charge["M"] + n_charge = paste_seq.count("N")*net_charge["N"] + p_charge = paste_seq.count("P")*net_charge["P"] + q_charge = paste_seq.count("Q")*net_charge["Q"] + r_charge = paste_seq.count("R")*net_charge["R"] + s_charge = paste_seq.count("S")*net_charge["S"] + t_charge = paste_seq.count("T")*net_charge["T"] + v_charge = paste_seq.count("V")*net_charge["V"] + w_charge = paste_seq.count("W")*net_charge["W"] + y_charge = paste_seq.count("Y")*net_charge["Y"] + + a_hydrogen = paste_seq.count("A")*net_hydrogen["A"] + c_hydrogen = paste_seq.count("C")*net_hydrogen["C"] + d_hydrogen = paste_seq.count("D")*net_hydrogen["D"] + e_hydrogen = paste_seq.count("E")*net_hydrogen["E"] + f_hydrogen = paste_seq.count("F")*net_hydrogen["F"] + g_hydrogen = paste_seq.count("G")*net_hydrogen["G"] + h_hydrogen = paste_seq.count("H")*net_hydrogen["H"] + i_hydrogen = paste_seq.count("I")*net_hydrogen["I"] + k_hydrogen = paste_seq.count("K")*net_hydrogen["K"] + l_hydrogen = paste_seq.count("L")*net_hydrogen["L"] + m_hydrogen = paste_seq.count("M")*net_hydrogen["M"] + n_hydrogen = paste_seq.count("N")*net_hydrogen["N"] + p_hydrogen = paste_seq.count("P")*net_hydrogen["P"] + q_hydrogen = paste_seq.count("Q")*net_hydrogen["Q"] + r_hydrogen = paste_seq.count("R")*net_hydrogen["R"] + s_hydrogen = paste_seq.count("S")*net_hydrogen["S"] + t_hydrogen = paste_seq.count("T")*net_hydrogen["T"] + v_hydrogen = paste_seq.count("V")*net_hydrogen["V"] + w_hydrogen = paste_seq.count("W")*net_hydrogen["W"] + y_hydrogen = paste_seq.count("Y")*net_hydrogen["Y"] + + # PROPERTIES Q-P + + aliphatic = round((i_i + l_l + v_v), 3) + + negative_charged = round((d_d + e_e), 3) + + total_charged = round((d_d + e_e + k_k + h_h + r_r), 3) + + aromatic = round((f_f + h_h + w_w + y_y), 3) + + polar = round((d_d + e_e + r_r + k_k + q_q + n_n), 3) + + neutral = round((a_a + g_g + h_h + p_p + s_s + t_t + y_y), 3) + + hydrophobic = round((c_c + f_f + i_i + l_l + m_m + v_v + w_w), 3) + + positive_charged = round((k_k + r_r + h_h), 3) + + tiny = round((a_a + c_c + d_d + g_g + s_s + t_t), 3) + + small = round((e_e + h_h + i_i + l_l + k_k + + m_m + n_n + p_p + q_q + v_v), 3) + + large = round((f_f + r_r + w_w + y_y), 3) + + # SCALES + + kyleD = round(( + (a_kyte+c_kyte+d_kyte + + e_kyte+f_kyte+g_kyte + + h_kyte+i_kyte+k_kyte + + l_kyte+m_kyte + n_kyte + + p_kyte+q_kyte+r_kyte + + s_kyte+t_kyte+v_kyte + + w_kyte+y_kyte)/len(paste_seq+str(0.000001)) + ), 3) + + molW = round( + (a_mw+c_mw+d_mw+e_mw + + f_mw+g_mw+h_mw+i_mw + + k_mw+l_mw+m_mw+n_mw + + p_mw+q_mw+r_mw+s_mw + + t_mw+v_mw+w_mw+y_mw), 3) + + netCharge = a_charge+c_charge+d_charge+e_charge+f_charge+g_charge+h_charge+i_charge+k_charge + \ + l_charge+m_charge+n_charge+p_charge+q_charge+r_charge + \ + s_charge+t_charge+v_charge+w_charge+y_charge + + netH = round((a_hydrogen+c_hydrogen+d_hydrogen+e_hydrogen+f_hydrogen+g_hydrogen+h_hydrogen+i_hydrogen+k_hydrogen+l_hydrogen + + m_hydrogen+n_hydrogen+p_hydrogen+q_hydrogen+r_hydrogen+s_hydrogen+t_hydrogen+v_hydrogen+w_hydrogen+y_hydrogen), 3) + + # result = "Probable: " + str(rfc.predict([[netH, netCharge, molW, kyleD, a_a, c_c, d_d, e_e, f_f, g_g, h_h, i_i, k_k, l_l, m_m, n_n, p_p, q_q, r_r, + # s_s, t_t, v_v, w_w, y_y, tiny, small, large, aliphatic, aromatic, total_charged, negative_charged, positive_charged, polar, neutral, hydrophobic]])) + + # self.textpred.setText(str(result)) + # self.textpred1.setText(str(aliphatic)) + # self.textpred2.setText(str(negative_charged)) + # self.textpred3.setText(str(aromatic)) + # self.textpred4.setText(str(polar)) + # self.textpred5.setText(str(neutral)) + # self.textpred6.setText(str(hydrophobic)) + # self.textpred7.setText(str(positive_charged)) + # self.textpred8.setText(str(tiny)) + # self.textpred9.setText(str(small)) + # self.textpred10.setText(str(large)) + # self.textpred11.setText(str(kyleD)) + # self.textpred12.setText(str(molW)) + # self.textpred13.setText(str(netCharge)) + # self.textpred14.setText(str(netH)) + # self.textpred15.setText(str(total_charged)) + # self.textrelat.setText("A: " + str(a_a) + " , " + "C: " + str(c_c) + " , " + "D: " + str(d_d) + " , " + "E: " + str(e_e) + " , " + "F: " + str(f_f) + " , " + "E: " + str(e_e) + " , " + "G: " + str(g_g) + " , " + "I: " + str(i_i) + " , " + "K: " + str(k_k) + " , " + "L: " + str(l_l) + " , " + "M: " + str(m_m) + " , " + "N: " + str(n_n) + " , " + "P: " + str(p_p) + " , " + "Q: " + str(q_q) + " , " + "R: " + str(r_r) + " , " + "S: " + str(s_s) + " , " + "T: " + str(t_t) + " , " + "V: " + str(v_v) + " , " + "W: " + str(w_w) + " , " + "Y: " + str(y_y)) + + result = str(rfc.predict([[netH, netCharge, molW, kyleD, a_a, c_c, d_d, e_e, f_f, g_g, h_h, i_i, k_k, l_l, m_m, n_n, p_p, q_q, r_r, + s_s, t_t, v_v, w_w, y_y, tiny, small, large, aliphatic, aromatic, total_charged, negative_charged, positive_charged, polar, neutral, hydrophobic]])) + + return result + + +def getResultsFile(filename): + rfc = joblib.load('modelo_entrenado_2.pkl') + + with open(filename) as fp: + for name, seq in read_fasta(fp): + print(name, seq, getResultsSeq(seq, rfc)) + + class MyApp(QtWidgets.QMainWindow, Ui_MainWindow): def __init__(self): QtWidgets.QMainWindow.__init__(self) Ui_MainWindow.__init__(self) self.setupUi(self) - self.setFixedSize(self.size()) #Dimensiones fijas - self.bottonpred.clicked.connect(self.calculation) #Esto es para ordenar que cuando se presione vaya a calculo - + self.setFixedSize(self.size()) # Dimensiones fijas + # Esto es para ordenar que cuando se presione vaya a calculo + self.bottonpred.clicked.connect(self.calculation) + def calculation(self): from sklearn.externals import joblib rfc = joblib.load('modelo_entrenado_2.pkl') paste_seq = str(self.pasteseq.toPlainText()) - kyte_doolittle = {'A':1.80,'C':2.50,'D':-3.50,'E':-3.50,'F':2.80, - 'G':-0.40,'H':-3.20,'I':4.50,'K':-3.90,'L':3.80, - 'M':1.90,'N':-3.50,'P':-1.60,'Q':-3.50,'R':-4.50, - 'S':-0.80,'T':-0.70,'V':4.20,'W':-0.90,'Y':-1.30} - - molecular_weigth = {'A':89.09,'C':121.15,'D':133.10,'E':147.13,'F':165.19, - 'G':75.07,'H':155.16,'I':131.17,'K':146.19,'L':131.17, - 'M':149.21,'N':132.12,'P':115.13,'Q':146.15,'R':174.20, - 'S':105.09,'T':119.12,'V':117.15,'W':204.24,'Y':181.19} - - - net_charge = {'A':0,'C':0,'D':-1,'E':-1,'F':0, - 'G':0,'H':0,'I':0,'K':1,'L':0, - 'M':0,'N':0,'P':0,'Q':0,'R':1, - 'S':0,'T':0,'V':0,'W':0,'Y':0} - - net_hydrogen = {'A':0,'C':0,'D':1,'E':1,'F':0, - 'G':0,'H':1,'I':0,'K':2,'L':0, - 'M':0,'N':2,'P':0,'Q':2,'R':4, - 'S':1,'T':1,'V':0,'W':1,'Y':1} - - a_a = round(paste_seq.count("A")/len(paste_seq+str(0.000001)),3) - c_c = round(paste_seq.count("C")/len(paste_seq+str(0.000001)),3) - d_d = round(paste_seq.count("D")/len(paste_seq+str(0.000001)),3) - e_e = round(paste_seq.count("E")/len(paste_seq+str(0.000001)),3) - f_f = round(paste_seq.count("F")/len(paste_seq+str(0.000001)),3) - g_g = round(paste_seq.count("G")/len(paste_seq+str(0.000001)),3) - h_h = round(paste_seq.count("H")/len(paste_seq+str(0.000001)),3) - i_i = round(paste_seq.count("I")/len(paste_seq+str(0.000001)),3) - k_k = round(paste_seq.count("K")/len(paste_seq+str(0.000001)),3) - l_l = round(paste_seq.count("L")/len(paste_seq+str(0.000001)),3) - m_m = round(paste_seq.count("M")/len(paste_seq+str(0.000001)),3) - n_n = round(paste_seq.count("N")/len(paste_seq+str(0.000001)),3) - p_p = round(paste_seq.count("P")/len(paste_seq+str(0.000001)),3) - q_q = round(paste_seq.count("Q")/len(paste_seq+str(0.000001)),3) - r_r = round(paste_seq.count("R")/len(paste_seq+str(0.000001)),3) - s_s = round(paste_seq.count("S")/len(paste_seq+str(0.000001)),3) - t_t = round(paste_seq.count("T")/len(paste_seq+str(0.000001)),3) - v_v = round(paste_seq.count("V")/len(paste_seq+str(0.000001)),3) - w_w = round(paste_seq.count("W")/len(paste_seq+str(0.000001)),3) - y_y = round(paste_seq.count("Y")/len(paste_seq+str(0.000001)),3) + kyte_doolittle = {'A': 1.80, 'C': 2.50, 'D': -3.50, 'E': -3.50, 'F': 2.80, + 'G': -0.40, 'H': -3.20, 'I': 4.50, 'K': -3.90, 'L': 3.80, + 'M': 1.90, 'N': -3.50, 'P': -1.60, 'Q': -3.50, 'R': -4.50, + 'S': -0.80, 'T': -0.70, 'V': 4.20, 'W': -0.90, 'Y': -1.30} + + molecular_weigth = {'A': 89.09, 'C': 121.15, 'D': 133.10, 'E': 147.13, 'F': 165.19, + 'G': 75.07, 'H': 155.16, 'I': 131.17, 'K': 146.19, 'L': 131.17, + 'M': 149.21, 'N': 132.12, 'P': 115.13, 'Q': 146.15, 'R': 174.20, + 'S': 105.09, 'T': 119.12, 'V': 117.15, 'W': 204.24, 'Y': 181.19} + net_charge = {'A': 0, 'C': 0, 'D': -1, 'E': -1, 'F': 0, + 'G': 0, 'H': 0, 'I': 0, 'K': 1, 'L': 0, + 'M': 0, 'N': 0, 'P': 0, 'Q': 0, 'R': 1, + 'S': 0, 'T': 0, 'V': 0, 'W': 0, 'Y': 0} + + net_hydrogen = {'A': 0, 'C': 0, 'D': 1, 'E': 1, 'F': 0, + 'G': 0, 'H': 1, 'I': 0, 'K': 2, 'L': 0, + 'M': 0, 'N': 2, 'P': 0, 'Q': 2, 'R': 4, + 'S': 1, 'T': 1, 'V': 0, 'W': 1, 'Y': 1} + + a_a = round(paste_seq.count("A")/len(paste_seq+str(0.000001)), 3) + c_c = round(paste_seq.count("C")/len(paste_seq+str(0.000001)), 3) + d_d = round(paste_seq.count("D")/len(paste_seq+str(0.000001)), 3) + e_e = round(paste_seq.count("E")/len(paste_seq+str(0.000001)), 3) + f_f = round(paste_seq.count("F")/len(paste_seq+str(0.000001)), 3) + g_g = round(paste_seq.count("G")/len(paste_seq+str(0.000001)), 3) + h_h = round(paste_seq.count("H")/len(paste_seq+str(0.000001)), 3) + i_i = round(paste_seq.count("I")/len(paste_seq+str(0.000001)), 3) + k_k = round(paste_seq.count("K")/len(paste_seq+str(0.000001)), 3) + l_l = round(paste_seq.count("L")/len(paste_seq+str(0.000001)), 3) + m_m = round(paste_seq.count("M")/len(paste_seq+str(0.000001)), 3) + n_n = round(paste_seq.count("N")/len(paste_seq+str(0.000001)), 3) + p_p = round(paste_seq.count("P")/len(paste_seq+str(0.000001)), 3) + q_q = round(paste_seq.count("Q")/len(paste_seq+str(0.000001)), 3) + r_r = round(paste_seq.count("R")/len(paste_seq+str(0.000001)), 3) + s_s = round(paste_seq.count("S")/len(paste_seq+str(0.000001)), 3) + t_t = round(paste_seq.count("T")/len(paste_seq+str(0.000001)), 3) + v_v = round(paste_seq.count("V")/len(paste_seq+str(0.000001)), 3) + w_w = round(paste_seq.count("W")/len(paste_seq+str(0.000001)), 3) + y_y = round(paste_seq.count("Y")/len(paste_seq+str(0.000001)), 3) a_kyte = paste_seq.count("A")*kyte_doolittle["A"] c_kyte = paste_seq.count("C")*kyte_doolittle["C"] @@ -83,7 +312,6 @@ def calculation(self): w_kyte = paste_seq.count("W")*kyte_doolittle["W"] y_kyte = paste_seq.count("Y")*kyte_doolittle["Y"] - a_mw = paste_seq.count("A")*molecular_weigth["A"] c_mw = paste_seq.count("C")*molecular_weigth["C"] d_mw = paste_seq.count("D")*molecular_weigth["D"] @@ -104,7 +332,7 @@ def calculation(self): v_mw = paste_seq.count("V")*molecular_weigth["V"] w_mw = paste_seq.count("W")*molecular_weigth["W"] y_mw = paste_seq.count("Y")*molecular_weigth["Y"] - + a_charge = paste_seq.count("A")*net_charge["A"] c_charge = paste_seq.count("C")*net_charge["C"] d_charge = paste_seq.count("D")*net_charge["D"] @@ -147,44 +375,51 @@ def calculation(self): w_hydrogen = paste_seq.count("W")*net_hydrogen["W"] y_hydrogen = paste_seq.count("Y")*net_hydrogen["Y"] - #PROPERTIES Q-P - - aliphatic = round((i_i + l_l + v_v),3) + # PROPERTIES Q-P + + aliphatic = round((i_i + l_l + v_v), 3) + + negative_charged = round((d_d + e_e), 3) + + total_charged = round((d_d + e_e + k_k + h_h + r_r), 3) + + aromatic = round((f_f + h_h + w_w + y_y), 3) + + polar = round((d_d + e_e + r_r + k_k + q_q + n_n), 3) - negative_charged = round((d_d + e_e),3) + neutral = round((a_a + g_g + h_h + p_p + s_s + t_t + y_y), 3) - total_charged = round((d_d + e_e + k_k + h_h + r_r),3) + hydrophobic = round((c_c + f_f + i_i + l_l + m_m + v_v + w_w), 3) - aromatic = round((f_f + h_h + w_w + y_y),3) + positive_charged = round((k_k + r_r + h_h), 3) - polar = round((d_d + e_e + r_r + k_k + q_q + n_n),3) + tiny = round((a_a + c_c + d_d + g_g + s_s + t_t), 3) - neutral = round((a_a + g_g + h_h + p_p + s_s + t_t + y_y),3) + small = round((e_e + h_h + i_i + l_l + k_k + + m_m + n_n + p_p + q_q + v_v), 3) - hydrophobic = round((c_c + f_f + i_i + l_l + m_m + v_v + w_w),3) + large = round((f_f + r_r + w_w + y_y), 3) - positive_charged = round((k_k + r_r + h_h),3) + # SCALES - tiny = round((a_a + c_c + d_d + g_g + s_s + t_t),3) + kyleD = round(((a_kyte+c_kyte+d_kyte+e_kyte+f_kyte+g_kyte+h_kyte+i_kyte+k_kyte+l_kyte+m_kyte + + n_kyte+p_kyte+q_kyte+r_kyte+s_kyte+t_kyte+v_kyte+w_kyte+y_kyte)/len(paste_seq+str(0.000001))), 3) - small = round((e_e + h_h + i_i + l_l + k_k + m_m + n_n + p_p + q_q + v_v),3) + molW = round((a_mw+c_mw+d_mw+e_mw+f_mw+g_mw+h_mw+i_mw+k_mw + + l_mw+m_mw+n_mw+p_mw+q_mw+r_mw+s_mw+t_mw+v_mw+w_mw+y_mw), 3) - large = round((f_f + r_r + w_w + y_y),3) + netCharge = a_charge+c_charge+d_charge+e_charge+f_charge+g_charge+h_charge+i_charge+k_charge + \ + l_charge+m_charge+n_charge+p_charge+q_charge+r_charge + \ + s_charge+t_charge+v_charge+w_charge+y_charge - #SCALES - - kyleD = round(((a_kyte+c_kyte+d_kyte+e_kyte+f_kyte+g_kyte+h_kyte+i_kyte+k_kyte+l_kyte+m_kyte+n_kyte+p_kyte+q_kyte+r_kyte+s_kyte+t_kyte+v_kyte+w_kyte+y_kyte)/len(paste_seq+str(0.000001))),3) + netH = round((a_hydrogen+c_hydrogen+d_hydrogen+e_hydrogen+f_hydrogen+g_hydrogen+h_hydrogen+i_hydrogen+k_hydrogen+l_hydrogen + + m_hydrogen+n_hydrogen+p_hydrogen+q_hydrogen+r_hydrogen+s_hydrogen+t_hydrogen+v_hydrogen+w_hydrogen+y_hydrogen), 3) - molW = round((a_mw+c_mw+d_mw+e_mw+f_mw+g_mw+h_mw+i_mw+k_mw+l_mw+m_mw+n_mw+p_mw+q_mw+r_mw+s_mw+t_mw+v_mw+w_mw+y_mw),3) - - netCharge = a_charge+c_charge+d_charge+e_charge+f_charge+g_charge+h_charge+i_charge+k_charge+l_charge+m_charge+n_charge+p_charge+q_charge+r_charge+s_charge+t_charge+v_charge+w_charge+y_charge - - netH = round((a_hydrogen+c_hydrogen+d_hydrogen+e_hydrogen+f_hydrogen+g_hydrogen+h_hydrogen+i_hydrogen+k_hydrogen+l_hydrogen+m_hydrogen+n_hydrogen+p_hydrogen+q_hydrogen+r_hydrogen+s_hydrogen+t_hydrogen+v_hydrogen+w_hydrogen+y_hydrogen),3) + result = "Probable: " + str(rfc.predict([[netH, netCharge, molW, kyleD, a_a, c_c, d_d, e_e, f_f, g_g, h_h, i_i, k_k, l_l, m_m, n_n, p_p, q_q, r_r, + s_s, t_t, v_v, w_w, y_y, tiny, small, large, aliphatic, aromatic, total_charged, negative_charged, positive_charged, polar, neutral, hydrophobic]])) - result = "Probable: " + str(rfc.predict([[netH,netCharge,molW,kyleD,a_a,c_c,d_d,e_e,f_f,g_g,h_h,i_i,k_k,l_l,m_m,n_n,p_p,q_q,r_r,s_s,t_t,v_v,w_w,y_y,tiny,small,large,aliphatic,aromatic,total_charged,negative_charged,positive_charged,polar,neutral,hydrophobic]])) - self.textpred.setText(str(result)) - self.textpred1.setText(str(aliphatic)) + self.textpred1.setText(str(aliphatic)) self.textpred2.setText(str(negative_charged)) self.textpred3.setText(str(aromatic)) self.textpred4.setText(str(polar)) @@ -199,11 +434,15 @@ def calculation(self): self.textpred13.setText(str(netCharge)) self.textpred14.setText(str(netH)) self.textpred15.setText(str(total_charged)) - self.textrelat.setText("A: " + str(a_a) + " , " + "C: " + str(c_c) + " , " + "D: " + str(d_d) + " , " + "E: " + str(e_e) + " , " + "F: " + str(f_f) + " , " + "E: " + str(e_e) + " , " + "G: " + str(g_g) + " , " + "I: " + str(i_i) + " , " + "K: " + str(k_k) + " , " + "L: " + str(l_l) + " , " + "M: " + str(m_m) + " , " + "N: " + str(n_n) + " , " + "P: " + str(p_p) + " , " + "Q: " + str(q_q) + " , " + "R: " + str(r_r) + " , " + "S: " + str(s_s) + " , " + "T: " + str(t_t) + " , " + "V: " + str(v_v) + " , " + "W: " + str(w_w) + " , " + "Y: " + str(y_y)) + self.textrelat.setText("A: " + str(a_a) + " , " + "C: " + str(c_c) + " , " + "D: " + str(d_d) + " , " + "E: " + str(e_e) + " , " + "F: " + str(f_f) + " , " + "E: " + str(e_e) + " , " + "G: " + str(g_g) + " , " + "I: " + str(i_i) + " , " + "K: " + str(k_k) + " , " + "L: " + str( + l_l) + " , " + "M: " + str(m_m) + " , " + "N: " + str(n_n) + " , " + "P: " + str(p_p) + " , " + "Q: " + str(q_q) + " , " + "R: " + str(r_r) + " , " + "S: " + str(s_s) + " , " + "T: " + str(t_t) + " , " + "V: " + str(v_v) + " , " + "W: " + str(w_w) + " , " + "Y: " + str(y_y)) if __name__ == "__main__": - app = QtWidgets.QApplication(sys.argv) - window = MyApp() - window.show() - sys.exit(app.exec_()) + if len(sys.argv) == 1: + app = QtWidgets.QApplication(sys.argv) + window = MyApp() + window.show() + sys.exit(app.exec_()) + else: + print(getResultsFile(sys.argv[-1])) From cc47c05543d192198db8b0c67ff33368b25f9131 Mon Sep 17 00:00:00 2001 From: Caio Date: Thu, 15 Oct 2020 11:26:07 -0300 Subject: [PATCH 04/10] change output format to tsv + --help commands --- antivpp.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/antivpp.py b/antivpp.py index 19682c8..a95be50 100644 --- a/antivpp.py +++ b/antivpp.py @@ -231,8 +231,14 @@ def getResultsFile(filename): rfc = joblib.load('modelo_entrenado_2.pkl') with open(filename) as fp: + print('Name\tPredicted_Antiviral\tSequence\n') for name, seq in read_fasta(fp): - print(name, seq, getResultsSeq(seq, rfc)) + result = getResultsSeq(seq, rfc) + print('{}\t{}\t{}'.format( + name.replace('>', ''), + result.replace('[', '').replace(']', '').strip(), + seq + )) class MyApp(QtWidgets.QMainWindow, Ui_MainWindow): @@ -445,4 +451,13 @@ def calculation(self): window.show() sys.exit(app.exec_()) else: - print(getResultsFile(sys.argv[-1])) + if sys.argv[-1] == '--help': + print( + "Usage:\n" + "For UI version:\n" + "\tpython antivpp.py\n" + "For getting results from fasta file:\n" + "\tpython antivpp.py [fasta filename] > [tsv filename]\n" + "\nUse relative paths\n") + else: + print(getResultsFile(sys.argv[-1])) From f4ecbf16f9aefc55701518033e063409e0d5a998 Mon Sep 17 00:00:00 2001 From: Caio Date: Thu, 15 Oct 2020 11:54:30 -0300 Subject: [PATCH 05/10] small cleanup --- .gitignore | 2 +- antivpp.py | 41 +++++++++++++++-------------------------- 2 files changed, 16 insertions(+), 27 deletions(-) diff --git a/.gitignore b/.gitignore index ed8ebf5..bee8a64 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1 @@ -__pycache__ \ No newline at end of file +__pycache__ diff --git a/antivpp.py b/antivpp.py index a95be50..ba96e1d 100644 --- a/antivpp.py +++ b/antivpp.py @@ -197,32 +197,21 @@ def getResultsSeq(sequence, rfc): l_charge+m_charge+n_charge+p_charge+q_charge+r_charge + \ s_charge+t_charge+v_charge+w_charge+y_charge - netH = round((a_hydrogen+c_hydrogen+d_hydrogen+e_hydrogen+f_hydrogen+g_hydrogen+h_hydrogen+i_hydrogen+k_hydrogen+l_hydrogen + - m_hydrogen+n_hydrogen+p_hydrogen+q_hydrogen+r_hydrogen+s_hydrogen+t_hydrogen+v_hydrogen+w_hydrogen+y_hydrogen), 3) - - # result = "Probable: " + str(rfc.predict([[netH, netCharge, molW, kyleD, a_a, c_c, d_d, e_e, f_f, g_g, h_h, i_i, k_k, l_l, m_m, n_n, p_p, q_q, r_r, - # s_s, t_t, v_v, w_w, y_y, tiny, small, large, aliphatic, aromatic, total_charged, negative_charged, positive_charged, polar, neutral, hydrophobic]])) - - # self.textpred.setText(str(result)) - # self.textpred1.setText(str(aliphatic)) - # self.textpred2.setText(str(negative_charged)) - # self.textpred3.setText(str(aromatic)) - # self.textpred4.setText(str(polar)) - # self.textpred5.setText(str(neutral)) - # self.textpred6.setText(str(hydrophobic)) - # self.textpred7.setText(str(positive_charged)) - # self.textpred8.setText(str(tiny)) - # self.textpred9.setText(str(small)) - # self.textpred10.setText(str(large)) - # self.textpred11.setText(str(kyleD)) - # self.textpred12.setText(str(molW)) - # self.textpred13.setText(str(netCharge)) - # self.textpred14.setText(str(netH)) - # self.textpred15.setText(str(total_charged)) - # self.textrelat.setText("A: " + str(a_a) + " , " + "C: " + str(c_c) + " , " + "D: " + str(d_d) + " , " + "E: " + str(e_e) + " , " + "F: " + str(f_f) + " , " + "E: " + str(e_e) + " , " + "G: " + str(g_g) + " , " + "I: " + str(i_i) + " , " + "K: " + str(k_k) + " , " + "L: " + str(l_l) + " , " + "M: " + str(m_m) + " , " + "N: " + str(n_n) + " , " + "P: " + str(p_p) + " , " + "Q: " + str(q_q) + " , " + "R: " + str(r_r) + " , " + "S: " + str(s_s) + " , " + "T: " + str(t_t) + " , " + "V: " + str(v_v) + " , " + "W: " + str(w_w) + " , " + "Y: " + str(y_y)) - - result = str(rfc.predict([[netH, netCharge, molW, kyleD, a_a, c_c, d_d, e_e, f_f, g_g, h_h, i_i, k_k, l_l, m_m, n_n, p_p, q_q, r_r, - s_s, t_t, v_v, w_w, y_y, tiny, small, large, aliphatic, aromatic, total_charged, negative_charged, positive_charged, polar, neutral, hydrophobic]])) + netH = round(( + a_hydrogen+c_hydrogen+d_hydrogen+e_hydrogen + + f_hydrogen+g_hydrogen+h_hydrogen+i_hydrogen + + k_hydrogen+l_hydrogen+m_hydrogen+n_hydrogen + + p_hydrogen+q_hydrogen+r_hydrogen+s_hydrogen + + t_hydrogen+v_hydrogen+w_hydrogen+y_hydrogen), 3) + + result = str(rfc.predict([[ + netH, netCharge, molW, kyleD, + a_a, c_c, d_d, e_e, f_f, g_g, + h_h, i_i, k_k, l_l, m_m, n_n, + p_p, q_q, r_r, s_s, t_t, v_v, w_w, y_y, + tiny, small, large, aliphatic, aromatic, + total_charged, negative_charged, positive_charged, + polar, neutral, hydrophobic]])) return result From 2ca4558ece831b9bf5e0a4744a33bbfe58e324f4 Mon Sep 17 00:00:00 2001 From: Caio Date: Thu, 15 Oct 2020 12:50:59 -0300 Subject: [PATCH 06/10] begining of cleanup of code --- antivpp.py | 129 ++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 127 insertions(+), 2 deletions(-) diff --git a/antivpp.py b/antivpp.py index ba96e1d..59072cb 100644 --- a/antivpp.py +++ b/antivpp.py @@ -21,6 +21,129 @@ def read_fasta(fp): yield (name, ''.join(seq)) +def getResultsSeqClean(sequence, rfc): + paste_seq = str(sequence) + seq_size = len(paste_seq+str(0.000001)) + + kyte_doolittle = {'A': 1.80, 'C': 2.50, 'D': -3.50, 'E': -3.50, 'F': 2.80, + 'G': -0.40, 'H': -3.20, 'I': 4.50, 'K': -3.90, 'L': 3.80, + 'M': 1.90, 'N': -3.50, 'P': -1.60, 'Q': -3.50, 'R': -4.50, + 'S': -0.80, 'T': -0.70, 'V': 4.20, 'W': -0.90, 'Y': -1.30} + + molecular_weigth = {'A': 89.09, 'C': 121.15, 'D': 133.10, 'E': 147.13, 'F': 165.19, + 'G': 75.07, 'H': 155.16, 'I': 131.17, 'K': 146.19, 'L': 131.17, + 'M': 149.21, 'N': 132.12, 'P': 115.13, 'Q': 146.15, 'R': 174.20, + 'S': 105.09, 'T': 119.12, 'V': 117.15, 'W': 204.24, 'Y': 181.19} + + net_charge = {'A': 0, 'C': 0, 'D': -1, 'E': -1, 'F': 0, + 'G': 0, 'H': 0, 'I': 0, 'K': 1, 'L': 0, + 'M': 0, 'N': 0, 'P': 0, 'Q': 0, 'R': 1, + 'S': 0, 'T': 0, 'V': 0, 'W': 0, 'Y': 0} + + net_hydrogen = {'A': 0, 'C': 0, 'D': 1, 'E': 1, 'F': 0, + 'G': 0, 'H': 1, 'I': 0, 'K': 2, 'L': 0, + 'M': 0, 'N': 2, 'P': 0, 'Q': 2, 'R': 4, + 'S': 1, 'T': 1, 'V': 0, 'W': 1, 'Y': 1} + + aa_list = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', + 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y'] + aa_counts = {k: paste_seq.count(k) for k in aa_list} + + a_a = round(aa_counts["A"]/seq_size, 3) + c_c = round(aa_counts["C"]/seq_size, 3) + d_d = round(aa_counts["D"]/seq_size, 3) + e_e = round(aa_counts["E"]/seq_size, 3) + f_f = round(aa_counts["F"]/seq_size, 3) + g_g = round(aa_counts["G"]/seq_size, 3) + h_h = round(aa_counts["H"]/seq_size, 3) + i_i = round(aa_counts["I"]/seq_size, 3) + k_k = round(aa_counts["K"]/seq_size, 3) + l_l = round(aa_counts["L"]/seq_size, 3) + m_m = round(aa_counts["M"]/seq_size, 3) + n_n = round(aa_counts["N"]/seq_size, 3) + p_p = round(aa_counts["P"]/seq_size, 3) + q_q = round(aa_counts["Q"]/seq_size, 3) + r_r = round(aa_counts["R"]/seq_size, 3) + s_s = round(aa_counts["S"]/seq_size, 3) + t_t = round(aa_counts["T"]/seq_size, 3) + v_v = round(aa_counts["V"]/seq_size, 3) + w_w = round(aa_counts["W"]/seq_size, 3) + y_y = round(aa_counts["Y"]/seq_size, 3) + + a_hydrogen = aa_counts["A"]*net_hydrogen["A"] + c_hydrogen = aa_counts["C"]*net_hydrogen["C"] + d_hydrogen = aa_counts["D"]*net_hydrogen["D"] + e_hydrogen = aa_counts["E"]*net_hydrogen["E"] + f_hydrogen = aa_counts["F"]*net_hydrogen["F"] + g_hydrogen = aa_counts["G"]*net_hydrogen["G"] + h_hydrogen = aa_counts["H"]*net_hydrogen["H"] + i_hydrogen = aa_counts["I"]*net_hydrogen["I"] + k_hydrogen = aa_counts["K"]*net_hydrogen["K"] + l_hydrogen = aa_counts["L"]*net_hydrogen["L"] + m_hydrogen = aa_counts["M"]*net_hydrogen["M"] + n_hydrogen = aa_counts["N"]*net_hydrogen["N"] + p_hydrogen = aa_counts["P"]*net_hydrogen["P"] + q_hydrogen = aa_counts["Q"]*net_hydrogen["Q"] + r_hydrogen = aa_counts["R"]*net_hydrogen["R"] + s_hydrogen = aa_counts["S"]*net_hydrogen["S"] + t_hydrogen = aa_counts["T"]*net_hydrogen["T"] + v_hydrogen = aa_counts["V"]*net_hydrogen["V"] + w_hydrogen = aa_counts["W"]*net_hydrogen["W"] + y_hydrogen = aa_counts["Y"]*net_hydrogen["Y"] + + # PROPERTIES Q-P + + aliphatic = round((i_i + l_l + v_v), 3) + + negative_charged = round((d_d + e_e), 3) + + total_charged = round((d_d + e_e + k_k + h_h + r_r), 3) + + aromatic = round((f_f + h_h + w_w + y_y), 3) + + polar = round((d_d + e_e + r_r + k_k + q_q + n_n), 3) + + neutral = round((a_a + g_g + h_h + p_p + s_s + t_t + y_y), 3) + + hydrophobic = round((c_c + f_f + i_i + l_l + m_m + v_v + w_w), 3) + + positive_charged = round((k_k + r_r + h_h), 3) + + tiny = round((a_a + c_c + d_d + g_g + s_s + t_t), 3) + + small = round((e_e + h_h + i_i + l_l + k_k + + m_m + n_n + p_p + q_q + v_v), 3) + + large = round((f_f + r_r + w_w + y_y), 3) + + # SCALES + + kyleD = round(sum([aa_counts[k]*kyte_doolittle[k] + for k in aa_list])/seq_size, 3) + + molW = round(sum([aa_counts[k]*molecular_weigth[k] for k in aa_list]), 3) + + netCharge = sum([aa_counts[k]*net_charge[k] for k in aa_list]) + + netH = round(( + a_hydrogen+c_hydrogen+d_hydrogen+e_hydrogen + + f_hydrogen+g_hydrogen+h_hydrogen+i_hydrogen + + k_hydrogen+l_hydrogen+m_hydrogen+n_hydrogen + + p_hydrogen+q_hydrogen+r_hydrogen+s_hydrogen + + t_hydrogen+v_hydrogen+w_hydrogen+y_hydrogen), 3) + + result = str(rfc.predict([[ + netH, netCharge, molW, kyleD, + a_a, c_c, d_d, e_e, f_f, g_g, + h_h, i_i, k_k, l_l, m_m, n_n, + p_p, q_q, r_r, s_s, t_t, v_v, w_w, y_y, + tiny, small, large, aliphatic, aromatic, + total_charged, negative_charged, positive_charged, + polar, neutral, hydrophobic]])) + + return result + + def getResultsSeq(sequence, rfc): paste_seq = str(sequence) @@ -220,12 +343,14 @@ def getResultsFile(filename): rfc = joblib.load('modelo_entrenado_2.pkl') with open(filename) as fp: - print('Name\tPredicted_Antiviral\tSequence\n') + print('Name\tPredicted_Antiviral\tNew\tSequence\n') for name, seq in read_fasta(fp): result = getResultsSeq(seq, rfc) - print('{}\t{}\t{}'.format( + new_result = getResultsSeq(seq, rfc) + print('{}\t{}\t{}\t{}'.format( name.replace('>', ''), result.replace('[', '').replace(']', '').strip(), + new_result.replace('[', '').replace(']', '').strip(), seq )) From d5d0c6627efca847f2bd8ac8108df4a365039407 Mon Sep 17 00:00:00 2001 From: Caio Date: Thu, 15 Oct 2020 12:54:59 -0300 Subject: [PATCH 07/10] more cleanup --- antivpp.py | 35 +++++------------------------------ 1 file changed, 5 insertions(+), 30 deletions(-) diff --git a/antivpp.py b/antivpp.py index 59072cb..d42208c 100644 --- a/antivpp.py +++ b/antivpp.py @@ -47,6 +47,7 @@ def getResultsSeqClean(sequence, rfc): aa_list = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y'] + aa_counts = {k: paste_seq.count(k) for k in aa_list} a_a = round(aa_counts["A"]/seq_size, 3) @@ -70,27 +71,6 @@ def getResultsSeqClean(sequence, rfc): w_w = round(aa_counts["W"]/seq_size, 3) y_y = round(aa_counts["Y"]/seq_size, 3) - a_hydrogen = aa_counts["A"]*net_hydrogen["A"] - c_hydrogen = aa_counts["C"]*net_hydrogen["C"] - d_hydrogen = aa_counts["D"]*net_hydrogen["D"] - e_hydrogen = aa_counts["E"]*net_hydrogen["E"] - f_hydrogen = aa_counts["F"]*net_hydrogen["F"] - g_hydrogen = aa_counts["G"]*net_hydrogen["G"] - h_hydrogen = aa_counts["H"]*net_hydrogen["H"] - i_hydrogen = aa_counts["I"]*net_hydrogen["I"] - k_hydrogen = aa_counts["K"]*net_hydrogen["K"] - l_hydrogen = aa_counts["L"]*net_hydrogen["L"] - m_hydrogen = aa_counts["M"]*net_hydrogen["M"] - n_hydrogen = aa_counts["N"]*net_hydrogen["N"] - p_hydrogen = aa_counts["P"]*net_hydrogen["P"] - q_hydrogen = aa_counts["Q"]*net_hydrogen["Q"] - r_hydrogen = aa_counts["R"]*net_hydrogen["R"] - s_hydrogen = aa_counts["S"]*net_hydrogen["S"] - t_hydrogen = aa_counts["T"]*net_hydrogen["T"] - v_hydrogen = aa_counts["V"]*net_hydrogen["V"] - w_hydrogen = aa_counts["W"]*net_hydrogen["W"] - y_hydrogen = aa_counts["Y"]*net_hydrogen["Y"] - # PROPERTIES Q-P aliphatic = round((i_i + l_l + v_v), 3) @@ -125,23 +105,18 @@ def getResultsSeqClean(sequence, rfc): netCharge = sum([aa_counts[k]*net_charge[k] for k in aa_list]) - netH = round(( - a_hydrogen+c_hydrogen+d_hydrogen+e_hydrogen + - f_hydrogen+g_hydrogen+h_hydrogen+i_hydrogen + - k_hydrogen+l_hydrogen+m_hydrogen+n_hydrogen + - p_hydrogen+q_hydrogen+r_hydrogen+s_hydrogen + - t_hydrogen+v_hydrogen+w_hydrogen+y_hydrogen), 3) + netH = round(sum([aa_counts[k]*net_hydrogen[k] for k in aa_list]), 3) - result = str(rfc.predict([[ + result = rfc.predict([[ netH, netCharge, molW, kyleD, a_a, c_c, d_d, e_e, f_f, g_g, h_h, i_i, k_k, l_l, m_m, n_n, p_p, q_q, r_r, s_s, t_t, v_v, w_w, y_y, tiny, small, large, aliphatic, aromatic, total_charged, negative_charged, positive_charged, - polar, neutral, hydrophobic]])) + polar, neutral, hydrophobic]]) - return result + return str(result) def getResultsSeq(sequence, rfc): From 6c071fee84a0967b98341a611a7656dc5dc040d0 Mon Sep 17 00:00:00 2001 From: Caio Date: Thu, 15 Oct 2020 13:47:52 -0300 Subject: [PATCH 08/10] final cleanup. VERSION FOR COMPARING IMPLEMENTATIONS --- antivpp.py | 73 ++++++++++++++++++++++-------------------------------- 1 file changed, 30 insertions(+), 43 deletions(-) diff --git a/antivpp.py b/antivpp.py index d42208c..6c84fc2 100644 --- a/antivpp.py +++ b/antivpp.py @@ -21,6 +21,10 @@ def read_fasta(fp): yield (name, ''.join(seq)) +def partialDictRoundedSum(d, keys): + return round(sum([d[k] for k in keys]), 3) + + def getResultsSeqClean(sequence, rfc): paste_seq = str(sequence) seq_size = len(paste_seq+str(0.000001)) @@ -50,56 +54,41 @@ def getResultsSeqClean(sequence, rfc): aa_counts = {k: paste_seq.count(k) for k in aa_list} - a_a = round(aa_counts["A"]/seq_size, 3) - c_c = round(aa_counts["C"]/seq_size, 3) - d_d = round(aa_counts["D"]/seq_size, 3) - e_e = round(aa_counts["E"]/seq_size, 3) - f_f = round(aa_counts["F"]/seq_size, 3) - g_g = round(aa_counts["G"]/seq_size, 3) - h_h = round(aa_counts["H"]/seq_size, 3) - i_i = round(aa_counts["I"]/seq_size, 3) - k_k = round(aa_counts["K"]/seq_size, 3) - l_l = round(aa_counts["L"]/seq_size, 3) - m_m = round(aa_counts["M"]/seq_size, 3) - n_n = round(aa_counts["N"]/seq_size, 3) - p_p = round(aa_counts["P"]/seq_size, 3) - q_q = round(aa_counts["Q"]/seq_size, 3) - r_r = round(aa_counts["R"]/seq_size, 3) - s_s = round(aa_counts["S"]/seq_size, 3) - t_t = round(aa_counts["T"]/seq_size, 3) - v_v = round(aa_counts["V"]/seq_size, 3) - w_w = round(aa_counts["W"]/seq_size, 3) - y_y = round(aa_counts["Y"]/seq_size, 3) + aa_perc = {k: round(aa_counts[k]/seq_size, 3) for k in aa_list} # PROPERTIES Q-P - aliphatic = round((i_i + l_l + v_v), 3) + aliphatic = partialDictRoundedSum(aa_perc, ['I', 'V', 'L']) - negative_charged = round((d_d + e_e), 3) + negative_charged = partialDictRoundedSum(aa_perc, ['D', 'E']) - total_charged = round((d_d + e_e + k_k + h_h + r_r), 3) + total_charged = partialDictRoundedSum(aa_perc, ['D', 'E', 'K', 'H', 'R']) - aromatic = round((f_f + h_h + w_w + y_y), 3) + aromatic = partialDictRoundedSum(aa_perc, ['F', 'H', 'W', 'Y']) - polar = round((d_d + e_e + r_r + k_k + q_q + n_n), 3) + polar = partialDictRoundedSum(aa_perc, ['D', 'E', 'R', 'K', 'Q', 'N']) - neutral = round((a_a + g_g + h_h + p_p + s_s + t_t + y_y), 3) + neutral = partialDictRoundedSum(aa_perc, + ['A', 'G', 'H', 'P', 'S', 'T', 'Y']) - hydrophobic = round((c_c + f_f + i_i + l_l + m_m + v_v + w_w), 3) + hydrophobic = partialDictRoundedSum(aa_perc, + ['C', 'F', 'I', 'L', 'M', 'V', 'W']) - positive_charged = round((k_k + r_r + h_h), 3) + positive_charged = partialDictRoundedSum(aa_perc, ['K', 'R', 'H']) - tiny = round((a_a + c_c + d_d + g_g + s_s + t_t), 3) + tiny = partialDictRoundedSum(aa_perc, ['A', 'C', 'D', 'G', 'S', 'T']) - small = round((e_e + h_h + i_i + l_l + k_k + - m_m + n_n + p_p + q_q + v_v), 3) + small = partialDictRoundedSum(aa_perc, + ['E', 'H', 'I', 'L', 'K', 'M', 'N', 'P', 'Q', 'V']) - large = round((f_f + r_r + w_w + y_y), 3) + large = partialDictRoundedSum(aa_perc, ['F', 'R', 'W', 'Y']) # SCALES - kyleD = round(sum([aa_counts[k]*kyte_doolittle[k] - for k in aa_list])/seq_size, 3) + kyleD = round( + sum( + [aa_counts[k]*kyte_doolittle[k] for k in aa_list] + )/seq_size, 3) molW = round(sum([aa_counts[k]*molecular_weigth[k] for k in aa_list]), 3) @@ -107,14 +96,12 @@ def getResultsSeqClean(sequence, rfc): netH = round(sum([aa_counts[k]*net_hydrogen[k] for k in aa_list]), 3) - result = rfc.predict([[ - netH, netCharge, molW, kyleD, - a_a, c_c, d_d, e_e, f_f, g_g, - h_h, i_i, k_k, l_l, m_m, n_n, - p_p, q_q, r_r, s_s, t_t, v_v, w_w, y_y, - tiny, small, large, aliphatic, aromatic, - total_charged, negative_charged, positive_charged, - polar, neutral, hydrophobic]]) + result = rfc.predict([ + [netH, netCharge, molW, kyleD] + + [v for v in aa_perc.values()] + + [tiny, small, large, aliphatic, aromatic, + total_charged, negative_charged, positive_charged, + polar, neutral, hydrophobic]]) return str(result) @@ -321,7 +308,7 @@ def getResultsFile(filename): print('Name\tPredicted_Antiviral\tNew\tSequence\n') for name, seq in read_fasta(fp): result = getResultsSeq(seq, rfc) - new_result = getResultsSeq(seq, rfc) + new_result = getResultsSeqClean(seq, rfc) print('{}\t{}\t{}\t{}'.format( name.replace('>', ''), result.replace('[', '').replace(']', '').strip(), From 43852cab2526e72a242f88f0bc28d4745bef1110 Mon Sep 17 00:00:00 2001 From: Caio Date: Thu, 15 Oct 2020 13:54:46 -0300 Subject: [PATCH 09/10] Replace old version with refactored one --- antivpp.py | 373 +++++++---------------------------------------------- 1 file changed, 44 insertions(+), 329 deletions(-) diff --git a/antivpp.py b/antivpp.py index 6c84fc2..a7c9882 100644 --- a/antivpp.py +++ b/antivpp.py @@ -25,7 +25,7 @@ def partialDictRoundedSum(d, keys): return round(sum([d[k] for k in keys]), 3) -def getResultsSeqClean(sequence, rfc): +def getResultsSeq(sequence, rfc): paste_seq = str(sequence) seq_size = len(paste_seq+str(0.000001)) @@ -106,213 +106,16 @@ def getResultsSeqClean(sequence, rfc): return str(result) -def getResultsSeq(sequence, rfc): - paste_seq = str(sequence) - - kyte_doolittle = {'A': 1.80, 'C': 2.50, 'D': -3.50, 'E': -3.50, 'F': 2.80, - 'G': -0.40, 'H': -3.20, 'I': 4.50, 'K': -3.90, 'L': 3.80, - 'M': 1.90, 'N': -3.50, 'P': -1.60, 'Q': -3.50, 'R': -4.50, - 'S': -0.80, 'T': -0.70, 'V': 4.20, 'W': -0.90, 'Y': -1.30} - - molecular_weigth = {'A': 89.09, 'C': 121.15, 'D': 133.10, 'E': 147.13, 'F': 165.19, - 'G': 75.07, 'H': 155.16, 'I': 131.17, 'K': 146.19, 'L': 131.17, - 'M': 149.21, 'N': 132.12, 'P': 115.13, 'Q': 146.15, 'R': 174.20, - 'S': 105.09, 'T': 119.12, 'V': 117.15, 'W': 204.24, 'Y': 181.19} - - net_charge = {'A': 0, 'C': 0, 'D': -1, 'E': -1, 'F': 0, - 'G': 0, 'H': 0, 'I': 0, 'K': 1, 'L': 0, - 'M': 0, 'N': 0, 'P': 0, 'Q': 0, 'R': 1, - 'S': 0, 'T': 0, 'V': 0, 'W': 0, 'Y': 0} - - net_hydrogen = {'A': 0, 'C': 0, 'D': 1, 'E': 1, 'F': 0, - 'G': 0, 'H': 1, 'I': 0, 'K': 2, 'L': 0, - 'M': 0, 'N': 2, 'P': 0, 'Q': 2, 'R': 4, - 'S': 1, 'T': 1, 'V': 0, 'W': 1, 'Y': 1} - - a_a = round(paste_seq.count("A")/len(paste_seq+str(0.000001)), 3) - c_c = round(paste_seq.count("C")/len(paste_seq+str(0.000001)), 3) - d_d = round(paste_seq.count("D")/len(paste_seq+str(0.000001)), 3) - e_e = round(paste_seq.count("E")/len(paste_seq+str(0.000001)), 3) - f_f = round(paste_seq.count("F")/len(paste_seq+str(0.000001)), 3) - g_g = round(paste_seq.count("G")/len(paste_seq+str(0.000001)), 3) - h_h = round(paste_seq.count("H")/len(paste_seq+str(0.000001)), 3) - i_i = round(paste_seq.count("I")/len(paste_seq+str(0.000001)), 3) - k_k = round(paste_seq.count("K")/len(paste_seq+str(0.000001)), 3) - l_l = round(paste_seq.count("L")/len(paste_seq+str(0.000001)), 3) - m_m = round(paste_seq.count("M")/len(paste_seq+str(0.000001)), 3) - n_n = round(paste_seq.count("N")/len(paste_seq+str(0.000001)), 3) - p_p = round(paste_seq.count("P")/len(paste_seq+str(0.000001)), 3) - q_q = round(paste_seq.count("Q")/len(paste_seq+str(0.000001)), 3) - r_r = round(paste_seq.count("R")/len(paste_seq+str(0.000001)), 3) - s_s = round(paste_seq.count("S")/len(paste_seq+str(0.000001)), 3) - t_t = round(paste_seq.count("T")/len(paste_seq+str(0.000001)), 3) - v_v = round(paste_seq.count("V")/len(paste_seq+str(0.000001)), 3) - w_w = round(paste_seq.count("W")/len(paste_seq+str(0.000001)), 3) - y_y = round(paste_seq.count("Y")/len(paste_seq+str(0.000001)), 3) - - a_kyte = paste_seq.count("A")*kyte_doolittle["A"] - c_kyte = paste_seq.count("C")*kyte_doolittle["C"] - d_kyte = paste_seq.count("D")*kyte_doolittle["D"] - e_kyte = paste_seq.count("E")*kyte_doolittle["E"] - f_kyte = paste_seq.count("F")*kyte_doolittle["F"] - g_kyte = paste_seq.count("G")*kyte_doolittle["G"] - h_kyte = paste_seq.count("H")*kyte_doolittle["H"] - i_kyte = paste_seq.count("I")*kyte_doolittle["I"] - k_kyte = paste_seq.count("K")*kyte_doolittle["K"] - l_kyte = paste_seq.count("L")*kyte_doolittle["L"] - m_kyte = paste_seq.count("M")*kyte_doolittle["M"] - n_kyte = paste_seq.count("N")*kyte_doolittle["N"] - p_kyte = paste_seq.count("P")*kyte_doolittle["P"] - q_kyte = paste_seq.count("Q")*kyte_doolittle["Q"] - r_kyte = paste_seq.count("R")*kyte_doolittle["R"] - s_kyte = paste_seq.count("S")*kyte_doolittle["S"] - t_kyte = paste_seq.count("T")*kyte_doolittle["T"] - v_kyte = paste_seq.count("V")*kyte_doolittle["V"] - w_kyte = paste_seq.count("W")*kyte_doolittle["W"] - y_kyte = paste_seq.count("Y")*kyte_doolittle["Y"] - - a_mw = paste_seq.count("A")*molecular_weigth["A"] - c_mw = paste_seq.count("C")*molecular_weigth["C"] - d_mw = paste_seq.count("D")*molecular_weigth["D"] - e_mw = paste_seq.count("E")*molecular_weigth["E"] - f_mw = paste_seq.count("F")*molecular_weigth["F"] - g_mw = paste_seq.count("G")*molecular_weigth["G"] - h_mw = paste_seq.count("H")*molecular_weigth["H"] - i_mw = paste_seq.count("I")*molecular_weigth["I"] - k_mw = paste_seq.count("K")*molecular_weigth["K"] - l_mw = paste_seq.count("L")*molecular_weigth["L"] - m_mw = paste_seq.count("M")*molecular_weigth["M"] - n_mw = paste_seq.count("N")*molecular_weigth["N"] - p_mw = paste_seq.count("P")*molecular_weigth["P"] - q_mw = paste_seq.count("Q")*molecular_weigth["Q"] - r_mw = paste_seq.count("R")*molecular_weigth["R"] - s_mw = paste_seq.count("S")*molecular_weigth["S"] - t_mw = paste_seq.count("T")*molecular_weigth["T"] - v_mw = paste_seq.count("V")*molecular_weigth["V"] - w_mw = paste_seq.count("W")*molecular_weigth["W"] - y_mw = paste_seq.count("Y")*molecular_weigth["Y"] - - a_charge = paste_seq.count("A")*net_charge["A"] - c_charge = paste_seq.count("C")*net_charge["C"] - d_charge = paste_seq.count("D")*net_charge["D"] - e_charge = paste_seq.count("E")*net_charge["E"] - f_charge = paste_seq.count("F")*net_charge["F"] - g_charge = paste_seq.count("G")*net_charge["G"] - h_charge = paste_seq.count("H")*net_charge["H"] - i_charge = paste_seq.count("I")*net_charge["I"] - k_charge = paste_seq.count("K")*net_charge["K"] - l_charge = paste_seq.count("L")*net_charge["L"] - m_charge = paste_seq.count("M")*net_charge["M"] - n_charge = paste_seq.count("N")*net_charge["N"] - p_charge = paste_seq.count("P")*net_charge["P"] - q_charge = paste_seq.count("Q")*net_charge["Q"] - r_charge = paste_seq.count("R")*net_charge["R"] - s_charge = paste_seq.count("S")*net_charge["S"] - t_charge = paste_seq.count("T")*net_charge["T"] - v_charge = paste_seq.count("V")*net_charge["V"] - w_charge = paste_seq.count("W")*net_charge["W"] - y_charge = paste_seq.count("Y")*net_charge["Y"] - - a_hydrogen = paste_seq.count("A")*net_hydrogen["A"] - c_hydrogen = paste_seq.count("C")*net_hydrogen["C"] - d_hydrogen = paste_seq.count("D")*net_hydrogen["D"] - e_hydrogen = paste_seq.count("E")*net_hydrogen["E"] - f_hydrogen = paste_seq.count("F")*net_hydrogen["F"] - g_hydrogen = paste_seq.count("G")*net_hydrogen["G"] - h_hydrogen = paste_seq.count("H")*net_hydrogen["H"] - i_hydrogen = paste_seq.count("I")*net_hydrogen["I"] - k_hydrogen = paste_seq.count("K")*net_hydrogen["K"] - l_hydrogen = paste_seq.count("L")*net_hydrogen["L"] - m_hydrogen = paste_seq.count("M")*net_hydrogen["M"] - n_hydrogen = paste_seq.count("N")*net_hydrogen["N"] - p_hydrogen = paste_seq.count("P")*net_hydrogen["P"] - q_hydrogen = paste_seq.count("Q")*net_hydrogen["Q"] - r_hydrogen = paste_seq.count("R")*net_hydrogen["R"] - s_hydrogen = paste_seq.count("S")*net_hydrogen["S"] - t_hydrogen = paste_seq.count("T")*net_hydrogen["T"] - v_hydrogen = paste_seq.count("V")*net_hydrogen["V"] - w_hydrogen = paste_seq.count("W")*net_hydrogen["W"] - y_hydrogen = paste_seq.count("Y")*net_hydrogen["Y"] - - # PROPERTIES Q-P - - aliphatic = round((i_i + l_l + v_v), 3) - - negative_charged = round((d_d + e_e), 3) - - total_charged = round((d_d + e_e + k_k + h_h + r_r), 3) - - aromatic = round((f_f + h_h + w_w + y_y), 3) - - polar = round((d_d + e_e + r_r + k_k + q_q + n_n), 3) - - neutral = round((a_a + g_g + h_h + p_p + s_s + t_t + y_y), 3) - - hydrophobic = round((c_c + f_f + i_i + l_l + m_m + v_v + w_w), 3) - - positive_charged = round((k_k + r_r + h_h), 3) - - tiny = round((a_a + c_c + d_d + g_g + s_s + t_t), 3) - - small = round((e_e + h_h + i_i + l_l + k_k + - m_m + n_n + p_p + q_q + v_v), 3) - - large = round((f_f + r_r + w_w + y_y), 3) - - # SCALES - - kyleD = round(( - (a_kyte+c_kyte+d_kyte + - e_kyte+f_kyte+g_kyte + - h_kyte+i_kyte+k_kyte + - l_kyte+m_kyte + n_kyte + - p_kyte+q_kyte+r_kyte + - s_kyte+t_kyte+v_kyte + - w_kyte+y_kyte)/len(paste_seq+str(0.000001)) - ), 3) - - molW = round( - (a_mw+c_mw+d_mw+e_mw + - f_mw+g_mw+h_mw+i_mw + - k_mw+l_mw+m_mw+n_mw + - p_mw+q_mw+r_mw+s_mw + - t_mw+v_mw+w_mw+y_mw), 3) - - netCharge = a_charge+c_charge+d_charge+e_charge+f_charge+g_charge+h_charge+i_charge+k_charge + \ - l_charge+m_charge+n_charge+p_charge+q_charge+r_charge + \ - s_charge+t_charge+v_charge+w_charge+y_charge - - netH = round(( - a_hydrogen+c_hydrogen+d_hydrogen+e_hydrogen + - f_hydrogen+g_hydrogen+h_hydrogen+i_hydrogen + - k_hydrogen+l_hydrogen+m_hydrogen+n_hydrogen + - p_hydrogen+q_hydrogen+r_hydrogen+s_hydrogen + - t_hydrogen+v_hydrogen+w_hydrogen+y_hydrogen), 3) - - result = str(rfc.predict([[ - netH, netCharge, molW, kyleD, - a_a, c_c, d_d, e_e, f_f, g_g, - h_h, i_i, k_k, l_l, m_m, n_n, - p_p, q_q, r_r, s_s, t_t, v_v, w_w, y_y, - tiny, small, large, aliphatic, aromatic, - total_charged, negative_charged, positive_charged, - polar, neutral, hydrophobic]])) - - return result - - def getResultsFile(filename): rfc = joblib.load('modelo_entrenado_2.pkl') with open(filename) as fp: - print('Name\tPredicted_Antiviral\tNew\tSequence\n') + print('Name\tPredicted_Antiviral\tSequence\n') for name, seq in read_fasta(fp): result = getResultsSeq(seq, rfc) - new_result = getResultsSeqClean(seq, rfc) - print('{}\t{}\t{}\t{}'.format( + print('{}\t{}\t{}'.format( name.replace('>', ''), result.replace('[', '').replace(']', '').strip(), - new_result.replace('[', '').replace(']', '').strip(), seq )) @@ -332,6 +135,8 @@ def calculation(self): paste_seq = str(self.pasteseq.toPlainText()) + seq_size = len(paste_seq+str(0.000001)) + kyte_doolittle = {'A': 1.80, 'C': 2.50, 'D': -3.50, 'E': -3.50, 'F': 2.80, 'G': -0.40, 'H': -3.20, 'I': 4.50, 'K': -3.90, 'L': 3.80, 'M': 1.90, 'N': -3.50, 'P': -1.60, 'Q': -3.50, 'R': -4.50, @@ -352,153 +157,63 @@ def calculation(self): 'M': 0, 'N': 2, 'P': 0, 'Q': 2, 'R': 4, 'S': 1, 'T': 1, 'V': 0, 'W': 1, 'Y': 1} - a_a = round(paste_seq.count("A")/len(paste_seq+str(0.000001)), 3) - c_c = round(paste_seq.count("C")/len(paste_seq+str(0.000001)), 3) - d_d = round(paste_seq.count("D")/len(paste_seq+str(0.000001)), 3) - e_e = round(paste_seq.count("E")/len(paste_seq+str(0.000001)), 3) - f_f = round(paste_seq.count("F")/len(paste_seq+str(0.000001)), 3) - g_g = round(paste_seq.count("G")/len(paste_seq+str(0.000001)), 3) - h_h = round(paste_seq.count("H")/len(paste_seq+str(0.000001)), 3) - i_i = round(paste_seq.count("I")/len(paste_seq+str(0.000001)), 3) - k_k = round(paste_seq.count("K")/len(paste_seq+str(0.000001)), 3) - l_l = round(paste_seq.count("L")/len(paste_seq+str(0.000001)), 3) - m_m = round(paste_seq.count("M")/len(paste_seq+str(0.000001)), 3) - n_n = round(paste_seq.count("N")/len(paste_seq+str(0.000001)), 3) - p_p = round(paste_seq.count("P")/len(paste_seq+str(0.000001)), 3) - q_q = round(paste_seq.count("Q")/len(paste_seq+str(0.000001)), 3) - r_r = round(paste_seq.count("R")/len(paste_seq+str(0.000001)), 3) - s_s = round(paste_seq.count("S")/len(paste_seq+str(0.000001)), 3) - t_t = round(paste_seq.count("T")/len(paste_seq+str(0.000001)), 3) - v_v = round(paste_seq.count("V")/len(paste_seq+str(0.000001)), 3) - w_w = round(paste_seq.count("W")/len(paste_seq+str(0.000001)), 3) - y_y = round(paste_seq.count("Y")/len(paste_seq+str(0.000001)), 3) - - a_kyte = paste_seq.count("A")*kyte_doolittle["A"] - c_kyte = paste_seq.count("C")*kyte_doolittle["C"] - d_kyte = paste_seq.count("D")*kyte_doolittle["D"] - e_kyte = paste_seq.count("E")*kyte_doolittle["E"] - f_kyte = paste_seq.count("F")*kyte_doolittle["F"] - g_kyte = paste_seq.count("G")*kyte_doolittle["G"] - h_kyte = paste_seq.count("H")*kyte_doolittle["H"] - i_kyte = paste_seq.count("I")*kyte_doolittle["I"] - k_kyte = paste_seq.count("K")*kyte_doolittle["K"] - l_kyte = paste_seq.count("L")*kyte_doolittle["L"] - m_kyte = paste_seq.count("M")*kyte_doolittle["M"] - n_kyte = paste_seq.count("N")*kyte_doolittle["N"] - p_kyte = paste_seq.count("P")*kyte_doolittle["P"] - q_kyte = paste_seq.count("Q")*kyte_doolittle["Q"] - r_kyte = paste_seq.count("R")*kyte_doolittle["R"] - s_kyte = paste_seq.count("S")*kyte_doolittle["S"] - t_kyte = paste_seq.count("T")*kyte_doolittle["T"] - v_kyte = paste_seq.count("V")*kyte_doolittle["V"] - w_kyte = paste_seq.count("W")*kyte_doolittle["W"] - y_kyte = paste_seq.count("Y")*kyte_doolittle["Y"] - - a_mw = paste_seq.count("A")*molecular_weigth["A"] - c_mw = paste_seq.count("C")*molecular_weigth["C"] - d_mw = paste_seq.count("D")*molecular_weigth["D"] - e_mw = paste_seq.count("E")*molecular_weigth["E"] - f_mw = paste_seq.count("F")*molecular_weigth["F"] - g_mw = paste_seq.count("G")*molecular_weigth["G"] - h_mw = paste_seq.count("H")*molecular_weigth["H"] - i_mw = paste_seq.count("I")*molecular_weigth["I"] - k_mw = paste_seq.count("K")*molecular_weigth["K"] - l_mw = paste_seq.count("L")*molecular_weigth["L"] - m_mw = paste_seq.count("M")*molecular_weigth["M"] - n_mw = paste_seq.count("N")*molecular_weigth["N"] - p_mw = paste_seq.count("P")*molecular_weigth["P"] - q_mw = paste_seq.count("Q")*molecular_weigth["Q"] - r_mw = paste_seq.count("R")*molecular_weigth["R"] - s_mw = paste_seq.count("S")*molecular_weigth["S"] - t_mw = paste_seq.count("T")*molecular_weigth["T"] - v_mw = paste_seq.count("V")*molecular_weigth["V"] - w_mw = paste_seq.count("W")*molecular_weigth["W"] - y_mw = paste_seq.count("Y")*molecular_weigth["Y"] - - a_charge = paste_seq.count("A")*net_charge["A"] - c_charge = paste_seq.count("C")*net_charge["C"] - d_charge = paste_seq.count("D")*net_charge["D"] - e_charge = paste_seq.count("E")*net_charge["E"] - f_charge = paste_seq.count("F")*net_charge["F"] - g_charge = paste_seq.count("G")*net_charge["G"] - h_charge = paste_seq.count("H")*net_charge["H"] - i_charge = paste_seq.count("I")*net_charge["I"] - k_charge = paste_seq.count("K")*net_charge["K"] - l_charge = paste_seq.count("L")*net_charge["L"] - m_charge = paste_seq.count("M")*net_charge["M"] - n_charge = paste_seq.count("N")*net_charge["N"] - p_charge = paste_seq.count("P")*net_charge["P"] - q_charge = paste_seq.count("Q")*net_charge["Q"] - r_charge = paste_seq.count("R")*net_charge["R"] - s_charge = paste_seq.count("S")*net_charge["S"] - t_charge = paste_seq.count("T")*net_charge["T"] - v_charge = paste_seq.count("V")*net_charge["V"] - w_charge = paste_seq.count("W")*net_charge["W"] - y_charge = paste_seq.count("Y")*net_charge["Y"] - - a_hydrogen = paste_seq.count("A")*net_hydrogen["A"] - c_hydrogen = paste_seq.count("C")*net_hydrogen["C"] - d_hydrogen = paste_seq.count("D")*net_hydrogen["D"] - e_hydrogen = paste_seq.count("E")*net_hydrogen["E"] - f_hydrogen = paste_seq.count("F")*net_hydrogen["F"] - g_hydrogen = paste_seq.count("G")*net_hydrogen["G"] - h_hydrogen = paste_seq.count("H")*net_hydrogen["H"] - i_hydrogen = paste_seq.count("I")*net_hydrogen["I"] - k_hydrogen = paste_seq.count("K")*net_hydrogen["K"] - l_hydrogen = paste_seq.count("L")*net_hydrogen["L"] - m_hydrogen = paste_seq.count("M")*net_hydrogen["M"] - n_hydrogen = paste_seq.count("N")*net_hydrogen["N"] - p_hydrogen = paste_seq.count("P")*net_hydrogen["P"] - q_hydrogen = paste_seq.count("Q")*net_hydrogen["Q"] - r_hydrogen = paste_seq.count("R")*net_hydrogen["R"] - s_hydrogen = paste_seq.count("S")*net_hydrogen["S"] - t_hydrogen = paste_seq.count("T")*net_hydrogen["T"] - v_hydrogen = paste_seq.count("V")*net_hydrogen["V"] - w_hydrogen = paste_seq.count("W")*net_hydrogen["W"] - y_hydrogen = paste_seq.count("Y")*net_hydrogen["Y"] + aa_list = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', + 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y'] + + aa_counts = {k: paste_seq.count(k) for k in aa_list} + + aa_perc = {k: round(aa_counts[k]/seq_size, 3) for k in aa_list} # PROPERTIES Q-P - aliphatic = round((i_i + l_l + v_v), 3) + aliphatic = partialDictRoundedSum(aa_perc, ['I', 'V', 'L']) - negative_charged = round((d_d + e_e), 3) + negative_charged = partialDictRoundedSum(aa_perc, ['D', 'E']) - total_charged = round((d_d + e_e + k_k + h_h + r_r), 3) + total_charged = partialDictRoundedSum( + aa_perc, ['D', 'E', 'K', 'H', 'R']) - aromatic = round((f_f + h_h + w_w + y_y), 3) + aromatic = partialDictRoundedSum(aa_perc, ['F', 'H', 'W', 'Y']) - polar = round((d_d + e_e + r_r + k_k + q_q + n_n), 3) + polar = partialDictRoundedSum(aa_perc, ['D', 'E', 'R', 'K', 'Q', 'N']) - neutral = round((a_a + g_g + h_h + p_p + s_s + t_t + y_y), 3) + neutral = partialDictRoundedSum(aa_perc, + ['A', 'G', 'H', 'P', 'S', 'T', 'Y']) - hydrophobic = round((c_c + f_f + i_i + l_l + m_m + v_v + w_w), 3) + hydrophobic = partialDictRoundedSum(aa_perc, + ['C', 'F', 'I', 'L', 'M', 'V', 'W']) - positive_charged = round((k_k + r_r + h_h), 3) + positive_charged = partialDictRoundedSum(aa_perc, ['K', 'R', 'H']) - tiny = round((a_a + c_c + d_d + g_g + s_s + t_t), 3) + tiny = partialDictRoundedSum(aa_perc, ['A', 'C', 'D', 'G', 'S', 'T']) - small = round((e_e + h_h + i_i + l_l + k_k + - m_m + n_n + p_p + q_q + v_v), 3) + small = partialDictRoundedSum(aa_perc, + ['E', 'H', 'I', 'L', 'K', 'M', 'N', 'P', 'Q', 'V']) - large = round((f_f + r_r + w_w + y_y), 3) + large = partialDictRoundedSum(aa_perc, ['F', 'R', 'W', 'Y']) # SCALES - kyleD = round(((a_kyte+c_kyte+d_kyte+e_kyte+f_kyte+g_kyte+h_kyte+i_kyte+k_kyte+l_kyte+m_kyte + - n_kyte+p_kyte+q_kyte+r_kyte+s_kyte+t_kyte+v_kyte+w_kyte+y_kyte)/len(paste_seq+str(0.000001))), 3) + kyleD = round( + sum( + [aa_counts[k]*kyte_doolittle[k] for k in aa_list] + )/seq_size, 3) + + molW = round(sum([aa_counts[k]*molecular_weigth[k] + for k in aa_list]), 3) - molW = round((a_mw+c_mw+d_mw+e_mw+f_mw+g_mw+h_mw+i_mw+k_mw + - l_mw+m_mw+n_mw+p_mw+q_mw+r_mw+s_mw+t_mw+v_mw+w_mw+y_mw), 3) + netCharge = sum([aa_counts[k]*net_charge[k] for k in aa_list]) - netCharge = a_charge+c_charge+d_charge+e_charge+f_charge+g_charge+h_charge+i_charge+k_charge + \ - l_charge+m_charge+n_charge+p_charge+q_charge+r_charge + \ - s_charge+t_charge+v_charge+w_charge+y_charge + netH = round(sum([aa_counts[k]*net_hydrogen[k] for k in aa_list]), 3) - netH = round((a_hydrogen+c_hydrogen+d_hydrogen+e_hydrogen+f_hydrogen+g_hydrogen+h_hydrogen+i_hydrogen+k_hydrogen+l_hydrogen + - m_hydrogen+n_hydrogen+p_hydrogen+q_hydrogen+r_hydrogen+s_hydrogen+t_hydrogen+v_hydrogen+w_hydrogen+y_hydrogen), 3) + result = rfc.predict([ + [netH, netCharge, molW, kyleD] + + [v for v in aa_perc.values()] + + [tiny, small, large, aliphatic, aromatic, + total_charged, negative_charged, positive_charged, + polar, neutral, hydrophobic]]) - result = "Probable: " + str(rfc.predict([[netH, netCharge, molW, kyleD, a_a, c_c, d_d, e_e, f_f, g_g, h_h, i_i, k_k, l_l, m_m, n_n, p_p, q_q, r_r, - s_s, t_t, v_v, w_w, y_y, tiny, small, large, aliphatic, aromatic, total_charged, negative_charged, positive_charged, polar, neutral, hydrophobic]])) + result = "Probable: " + str(result) self.textpred.setText(str(result)) self.textpred1.setText(str(aliphatic)) @@ -516,8 +231,8 @@ def calculation(self): self.textpred13.setText(str(netCharge)) self.textpred14.setText(str(netH)) self.textpred15.setText(str(total_charged)) - self.textrelat.setText("A: " + str(a_a) + " , " + "C: " + str(c_c) + " , " + "D: " + str(d_d) + " , " + "E: " + str(e_e) + " , " + "F: " + str(f_f) + " , " + "E: " + str(e_e) + " , " + "G: " + str(g_g) + " , " + "I: " + str(i_i) + " , " + "K: " + str(k_k) + " , " + "L: " + str( - l_l) + " , " + "M: " + str(m_m) + " , " + "N: " + str(n_n) + " , " + "P: " + str(p_p) + " , " + "Q: " + str(q_q) + " , " + "R: " + str(r_r) + " , " + "S: " + str(s_s) + " , " + "T: " + str(t_t) + " , " + "V: " + str(v_v) + " , " + "W: " + str(w_w) + " , " + "Y: " + str(y_y)) + self.textrelat.setText(' , '.join( + [k+': '+str(aa_perc[k]) for k in aa_list])) if __name__ == "__main__": From df8efebb90bb50cae2d8bfaab06ff51f7bab4cfc Mon Sep 17 00:00:00 2001 From: Caio Date: Thu, 15 Oct 2020 13:55:21 -0300 Subject: [PATCH 10/10] fix subversion --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 8fc7d88..589d480 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,7 +3,7 @@ joblib==0.17.0 numpy==1.19.2 PyQt5==5.15.1 PyQt5-sip==12.8.1 -scikit-learn==0.19.2 +scikit-learn==0.19.1 scipy==1.5.2 six==1.15.0 sklearn==0.0