From dc4f944658cb7bb0b8b8eea4664ab744a3b92836 Mon Sep 17 00:00:00 2001
From: Caio <ccastro@localhost.localdomain>
Date: Wed, 14 Oct 2020 17:49:21 -0300
Subject: [PATCH 01/10] add gitignore

---
 .gitignore | 1 +
 1 file changed, 1 insertion(+)
 create mode 100644 .gitignore

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..ed8ebf5
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+__pycache__
\ No newline at end of file

From 1539d8b837edadf1867ed175292091d8a7de9785 Mon Sep 17 00:00:00 2001
From: Caio <ccastro@localhost.localdomain>
Date: Thu, 15 Oct 2020 10:39:46 -0300
Subject: [PATCH 02/10] add requirements txt

---
 requirements.txt | 10 ++++++++++
 1 file changed, 10 insertions(+)
 create mode 100644 requirements.txt

diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..8fc7d88
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,10 @@
+click==7.1.2
+joblib==0.17.0
+numpy==1.19.2
+PyQt5==5.15.1
+PyQt5-sip==12.8.1
+scikit-learn==0.19.2
+scipy==1.5.2
+six==1.15.0
+sklearn==0.0
+threadpoolctl==2.1.0

From 54c8cde9e220f43801d7336fd8fbd2ed8e22379b Mon Sep 17 00:00:00 2001
From: Caio <ccastro@localhost.localdomain>
Date: Thu, 15 Oct 2020 11:00:56 -0300
Subject: [PATCH 03/10] add command line interface

---
 antivpp.py | 391 ++++++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 315 insertions(+), 76 deletions(-)

diff --git a/antivpp.py b/antivpp.py
index 9b5419d..19682c8 100644
--- a/antivpp.py
+++ b/antivpp.py
@@ -1,66 +1,295 @@
 import sys
 from PyQt5 import uic, QtWidgets
+from sklearn.externals import joblib
 
-qtCreatorFile = "antivpp.ui" # Name of the file here
+qtCreatorFile = "antivpp.ui"  # Name of the file here
 
 Ui_MainWindow, QtBaseClass = uic.loadUiType(qtCreatorFile)
 
+
+def read_fasta(fp):
+    name, seq = None, []
+    for line in fp:
+        line = line.rstrip()
+        if line.startswith(">"):
+            if name:
+                yield (name, ''.join(seq))
+            name, seq = line, []
+        else:
+            seq.append(line)
+    if name:
+        yield (name, ''.join(seq))
+
+
+def getResultsSeq(sequence, rfc):
+    paste_seq = str(sequence)
+
+    kyte_doolittle = {'A': 1.80, 'C': 2.50, 'D': -3.50, 'E': -3.50, 'F': 2.80,
+                      'G': -0.40, 'H': -3.20, 'I': 4.50, 'K': -3.90, 'L': 3.80,
+                      'M': 1.90, 'N': -3.50, 'P': -1.60, 'Q': -3.50, 'R': -4.50,
+                      'S': -0.80, 'T': -0.70, 'V': 4.20, 'W': -0.90, 'Y': -1.30}
+
+    molecular_weigth = {'A': 89.09, 'C': 121.15, 'D': 133.10, 'E': 147.13, 'F': 165.19,
+                        'G': 75.07, 'H': 155.16, 'I': 131.17, 'K': 146.19, 'L': 131.17,
+                        'M': 149.21, 'N': 132.12, 'P': 115.13, 'Q': 146.15, 'R': 174.20,
+                        'S': 105.09, 'T': 119.12, 'V': 117.15, 'W': 204.24, 'Y': 181.19}
+
+    net_charge = {'A': 0, 'C': 0, 'D': -1, 'E': -1, 'F': 0,
+                  'G': 0, 'H': 0, 'I': 0, 'K': 1, 'L': 0,
+                  'M': 0, 'N': 0, 'P': 0, 'Q': 0, 'R': 1,
+                  'S': 0, 'T': 0, 'V': 0, 'W': 0, 'Y': 0}
+
+    net_hydrogen = {'A': 0, 'C': 0, 'D': 1, 'E': 1, 'F': 0,
+                    'G': 0, 'H': 1, 'I': 0, 'K': 2, 'L': 0,
+                    'M': 0, 'N': 2, 'P': 0, 'Q': 2, 'R': 4,
+                    'S': 1, 'T': 1, 'V': 0, 'W': 1, 'Y': 1}
+
+    a_a = round(paste_seq.count("A")/len(paste_seq+str(0.000001)), 3)
+    c_c = round(paste_seq.count("C")/len(paste_seq+str(0.000001)), 3)
+    d_d = round(paste_seq.count("D")/len(paste_seq+str(0.000001)), 3)
+    e_e = round(paste_seq.count("E")/len(paste_seq+str(0.000001)), 3)
+    f_f = round(paste_seq.count("F")/len(paste_seq+str(0.000001)), 3)
+    g_g = round(paste_seq.count("G")/len(paste_seq+str(0.000001)), 3)
+    h_h = round(paste_seq.count("H")/len(paste_seq+str(0.000001)), 3)
+    i_i = round(paste_seq.count("I")/len(paste_seq+str(0.000001)), 3)
+    k_k = round(paste_seq.count("K")/len(paste_seq+str(0.000001)), 3)
+    l_l = round(paste_seq.count("L")/len(paste_seq+str(0.000001)), 3)
+    m_m = round(paste_seq.count("M")/len(paste_seq+str(0.000001)), 3)
+    n_n = round(paste_seq.count("N")/len(paste_seq+str(0.000001)), 3)
+    p_p = round(paste_seq.count("P")/len(paste_seq+str(0.000001)), 3)
+    q_q = round(paste_seq.count("Q")/len(paste_seq+str(0.000001)), 3)
+    r_r = round(paste_seq.count("R")/len(paste_seq+str(0.000001)), 3)
+    s_s = round(paste_seq.count("S")/len(paste_seq+str(0.000001)), 3)
+    t_t = round(paste_seq.count("T")/len(paste_seq+str(0.000001)), 3)
+    v_v = round(paste_seq.count("V")/len(paste_seq+str(0.000001)), 3)
+    w_w = round(paste_seq.count("W")/len(paste_seq+str(0.000001)), 3)
+    y_y = round(paste_seq.count("Y")/len(paste_seq+str(0.000001)), 3)
+
+    a_kyte = paste_seq.count("A")*kyte_doolittle["A"]
+    c_kyte = paste_seq.count("C")*kyte_doolittle["C"]
+    d_kyte = paste_seq.count("D")*kyte_doolittle["D"]
+    e_kyte = paste_seq.count("E")*kyte_doolittle["E"]
+    f_kyte = paste_seq.count("F")*kyte_doolittle["F"]
+    g_kyte = paste_seq.count("G")*kyte_doolittle["G"]
+    h_kyte = paste_seq.count("H")*kyte_doolittle["H"]
+    i_kyte = paste_seq.count("I")*kyte_doolittle["I"]
+    k_kyte = paste_seq.count("K")*kyte_doolittle["K"]
+    l_kyte = paste_seq.count("L")*kyte_doolittle["L"]
+    m_kyte = paste_seq.count("M")*kyte_doolittle["M"]
+    n_kyte = paste_seq.count("N")*kyte_doolittle["N"]
+    p_kyte = paste_seq.count("P")*kyte_doolittle["P"]
+    q_kyte = paste_seq.count("Q")*kyte_doolittle["Q"]
+    r_kyte = paste_seq.count("R")*kyte_doolittle["R"]
+    s_kyte = paste_seq.count("S")*kyte_doolittle["S"]
+    t_kyte = paste_seq.count("T")*kyte_doolittle["T"]
+    v_kyte = paste_seq.count("V")*kyte_doolittle["V"]
+    w_kyte = paste_seq.count("W")*kyte_doolittle["W"]
+    y_kyte = paste_seq.count("Y")*kyte_doolittle["Y"]
+
+    a_mw = paste_seq.count("A")*molecular_weigth["A"]
+    c_mw = paste_seq.count("C")*molecular_weigth["C"]
+    d_mw = paste_seq.count("D")*molecular_weigth["D"]
+    e_mw = paste_seq.count("E")*molecular_weigth["E"]
+    f_mw = paste_seq.count("F")*molecular_weigth["F"]
+    g_mw = paste_seq.count("G")*molecular_weigth["G"]
+    h_mw = paste_seq.count("H")*molecular_weigth["H"]
+    i_mw = paste_seq.count("I")*molecular_weigth["I"]
+    k_mw = paste_seq.count("K")*molecular_weigth["K"]
+    l_mw = paste_seq.count("L")*molecular_weigth["L"]
+    m_mw = paste_seq.count("M")*molecular_weigth["M"]
+    n_mw = paste_seq.count("N")*molecular_weigth["N"]
+    p_mw = paste_seq.count("P")*molecular_weigth["P"]
+    q_mw = paste_seq.count("Q")*molecular_weigth["Q"]
+    r_mw = paste_seq.count("R")*molecular_weigth["R"]
+    s_mw = paste_seq.count("S")*molecular_weigth["S"]
+    t_mw = paste_seq.count("T")*molecular_weigth["T"]
+    v_mw = paste_seq.count("V")*molecular_weigth["V"]
+    w_mw = paste_seq.count("W")*molecular_weigth["W"]
+    y_mw = paste_seq.count("Y")*molecular_weigth["Y"]
+
+    a_charge = paste_seq.count("A")*net_charge["A"]
+    c_charge = paste_seq.count("C")*net_charge["C"]
+    d_charge = paste_seq.count("D")*net_charge["D"]
+    e_charge = paste_seq.count("E")*net_charge["E"]
+    f_charge = paste_seq.count("F")*net_charge["F"]
+    g_charge = paste_seq.count("G")*net_charge["G"]
+    h_charge = paste_seq.count("H")*net_charge["H"]
+    i_charge = paste_seq.count("I")*net_charge["I"]
+    k_charge = paste_seq.count("K")*net_charge["K"]
+    l_charge = paste_seq.count("L")*net_charge["L"]
+    m_charge = paste_seq.count("M")*net_charge["M"]
+    n_charge = paste_seq.count("N")*net_charge["N"]
+    p_charge = paste_seq.count("P")*net_charge["P"]
+    q_charge = paste_seq.count("Q")*net_charge["Q"]
+    r_charge = paste_seq.count("R")*net_charge["R"]
+    s_charge = paste_seq.count("S")*net_charge["S"]
+    t_charge = paste_seq.count("T")*net_charge["T"]
+    v_charge = paste_seq.count("V")*net_charge["V"]
+    w_charge = paste_seq.count("W")*net_charge["W"]
+    y_charge = paste_seq.count("Y")*net_charge["Y"]
+
+    a_hydrogen = paste_seq.count("A")*net_hydrogen["A"]
+    c_hydrogen = paste_seq.count("C")*net_hydrogen["C"]
+    d_hydrogen = paste_seq.count("D")*net_hydrogen["D"]
+    e_hydrogen = paste_seq.count("E")*net_hydrogen["E"]
+    f_hydrogen = paste_seq.count("F")*net_hydrogen["F"]
+    g_hydrogen = paste_seq.count("G")*net_hydrogen["G"]
+    h_hydrogen = paste_seq.count("H")*net_hydrogen["H"]
+    i_hydrogen = paste_seq.count("I")*net_hydrogen["I"]
+    k_hydrogen = paste_seq.count("K")*net_hydrogen["K"]
+    l_hydrogen = paste_seq.count("L")*net_hydrogen["L"]
+    m_hydrogen = paste_seq.count("M")*net_hydrogen["M"]
+    n_hydrogen = paste_seq.count("N")*net_hydrogen["N"]
+    p_hydrogen = paste_seq.count("P")*net_hydrogen["P"]
+    q_hydrogen = paste_seq.count("Q")*net_hydrogen["Q"]
+    r_hydrogen = paste_seq.count("R")*net_hydrogen["R"]
+    s_hydrogen = paste_seq.count("S")*net_hydrogen["S"]
+    t_hydrogen = paste_seq.count("T")*net_hydrogen["T"]
+    v_hydrogen = paste_seq.count("V")*net_hydrogen["V"]
+    w_hydrogen = paste_seq.count("W")*net_hydrogen["W"]
+    y_hydrogen = paste_seq.count("Y")*net_hydrogen["Y"]
+
+    # PROPERTIES Q-P
+
+    aliphatic = round((i_i + l_l + v_v), 3)
+
+    negative_charged = round((d_d + e_e), 3)
+
+    total_charged = round((d_d + e_e + k_k + h_h + r_r), 3)
+
+    aromatic = round((f_f + h_h + w_w + y_y), 3)
+
+    polar = round((d_d + e_e + r_r + k_k + q_q + n_n), 3)
+
+    neutral = round((a_a + g_g + h_h + p_p + s_s + t_t + y_y), 3)
+
+    hydrophobic = round((c_c + f_f + i_i + l_l + m_m + v_v + w_w), 3)
+
+    positive_charged = round((k_k + r_r + h_h), 3)
+
+    tiny = round((a_a + c_c + d_d + g_g + s_s + t_t), 3)
+
+    small = round((e_e + h_h + i_i + l_l + k_k +
+                   m_m + n_n + p_p + q_q + v_v), 3)
+
+    large = round((f_f + r_r + w_w + y_y), 3)
+
+    # SCALES
+
+    kyleD = round((
+        (a_kyte+c_kyte+d_kyte +
+         e_kyte+f_kyte+g_kyte +
+         h_kyte+i_kyte+k_kyte +
+         l_kyte+m_kyte + n_kyte +
+         p_kyte+q_kyte+r_kyte +
+         s_kyte+t_kyte+v_kyte +
+         w_kyte+y_kyte)/len(paste_seq+str(0.000001))
+    ), 3)
+
+    molW = round(
+        (a_mw+c_mw+d_mw+e_mw +
+         f_mw+g_mw+h_mw+i_mw +
+         k_mw+l_mw+m_mw+n_mw +
+         p_mw+q_mw+r_mw+s_mw +
+         t_mw+v_mw+w_mw+y_mw), 3)
+
+    netCharge = a_charge+c_charge+d_charge+e_charge+f_charge+g_charge+h_charge+i_charge+k_charge + \
+        l_charge+m_charge+n_charge+p_charge+q_charge+r_charge + \
+        s_charge+t_charge+v_charge+w_charge+y_charge
+
+    netH = round((a_hydrogen+c_hydrogen+d_hydrogen+e_hydrogen+f_hydrogen+g_hydrogen+h_hydrogen+i_hydrogen+k_hydrogen+l_hydrogen +
+                  m_hydrogen+n_hydrogen+p_hydrogen+q_hydrogen+r_hydrogen+s_hydrogen+t_hydrogen+v_hydrogen+w_hydrogen+y_hydrogen), 3)
+
+    # result = "Probable: " + str(rfc.predict([[netH, netCharge, molW, kyleD, a_a, c_c, d_d, e_e, f_f, g_g, h_h, i_i, k_k, l_l, m_m, n_n, p_p, q_q, r_r,
+    #                                           s_s, t_t, v_v, w_w, y_y, tiny, small, large, aliphatic, aromatic, total_charged, negative_charged, positive_charged, polar, neutral, hydrophobic]]))
+
+    # self.textpred.setText(str(result))
+    # self.textpred1.setText(str(aliphatic))
+    # self.textpred2.setText(str(negative_charged))
+    # self.textpred3.setText(str(aromatic))
+    # self.textpred4.setText(str(polar))
+    # self.textpred5.setText(str(neutral))
+    # self.textpred6.setText(str(hydrophobic))
+    # self.textpred7.setText(str(positive_charged))
+    # self.textpred8.setText(str(tiny))
+    # self.textpred9.setText(str(small))
+    # self.textpred10.setText(str(large))
+    # self.textpred11.setText(str(kyleD))
+    # self.textpred12.setText(str(molW))
+    # self.textpred13.setText(str(netCharge))
+    # self.textpred14.setText(str(netH))
+    # self.textpred15.setText(str(total_charged))
+    # self.textrelat.setText("A: " + str(a_a) + " , " + "C: " + str(c_c) + " , " + "D: " + str(d_d) + " , " + "E: " + str(e_e) + " , " + "F: " + str(f_f) + " , " + "E: " + str(e_e) + " , " + "G: " + str(g_g) + " , " + "I: " + str(i_i) + " , " + "K: " + str(k_k) + " , " + "L: " + str(l_l) + " , " + "M: " + str(m_m) + " , " + "N: " + str(n_n) + " , " + "P: " + str(p_p) + " , " + "Q: " + str(q_q) + " , " + "R: " + str(r_r) + " , " + "S: " + str(s_s) + " , " + "T: " + str(t_t) + " , " + "V: " + str(v_v) + " , " + "W: " + str(w_w) + " , " + "Y: " + str(y_y))
+
+    result = str(rfc.predict([[netH, netCharge, molW, kyleD, a_a, c_c, d_d, e_e, f_f, g_g, h_h, i_i, k_k, l_l, m_m, n_n, p_p, q_q, r_r,
+                               s_s, t_t, v_v, w_w, y_y, tiny, small, large, aliphatic, aromatic, total_charged, negative_charged, positive_charged, polar, neutral, hydrophobic]]))
+
+    return result
+
+
+def getResultsFile(filename):
+    rfc = joblib.load('modelo_entrenado_2.pkl')
+
+    with open(filename) as fp:
+        for name, seq in read_fasta(fp):
+            print(name, seq, getResultsSeq(seq, rfc))
+
+
 class MyApp(QtWidgets.QMainWindow, Ui_MainWindow):
     def __init__(self):
         QtWidgets.QMainWindow.__init__(self)
         Ui_MainWindow.__init__(self)
         self.setupUi(self)
-        self.setFixedSize(self.size()) #Dimensiones fijas
-        self.bottonpred.clicked.connect(self.calculation) #Esto es para ordenar que cuando se presione vaya a calculo       
-        
+        self.setFixedSize(self.size())  # Dimensiones fijas
+        # Esto es para ordenar que cuando se presione vaya a calculo
+        self.bottonpred.clicked.connect(self.calculation)
+
     def calculation(self):
         from sklearn.externals import joblib
         rfc = joblib.load('modelo_entrenado_2.pkl')
 
         paste_seq = str(self.pasteseq.toPlainText())
 
-        kyte_doolittle = {'A':1.80,'C':2.50,'D':-3.50,'E':-3.50,'F':2.80,
-                     'G':-0.40,'H':-3.20,'I':4.50,'K':-3.90,'L':3.80,
-                     'M':1.90,'N':-3.50,'P':-1.60,'Q':-3.50,'R':-4.50, 
-                     'S':-0.80,'T':-0.70,'V':4.20,'W':-0.90,'Y':-1.30}
-
-        molecular_weigth = {'A':89.09,'C':121.15,'D':133.10,'E':147.13,'F':165.19,
-                    'G':75.07,'H':155.16,'I':131.17,'K':146.19,'L':131.17,
-                    'M':149.21,'N':132.12,'P':115.13,'Q':146.15,'R':174.20, 
-                    'S':105.09,'T':119.12,'V':117.15,'W':204.24,'Y':181.19}
-
-
-        net_charge = {'A':0,'C':0,'D':-1,'E':-1,'F':0,
-                        'G':0,'H':0,'I':0,'K':1,'L':0,
-                        'M':0,'N':0,'P':0,'Q':0,'R':1,
-                        'S':0,'T':0,'V':0,'W':0,'Y':0}
-
-        net_hydrogen = {'A':0,'C':0,'D':1,'E':1,'F':0,
-                        'G':0,'H':1,'I':0,'K':2,'L':0,
-                        'M':0,'N':2,'P':0,'Q':2,'R':4,
-                        'S':1,'T':1,'V':0,'W':1,'Y':1}
-
-        a_a = round(paste_seq.count("A")/len(paste_seq+str(0.000001)),3)
-        c_c = round(paste_seq.count("C")/len(paste_seq+str(0.000001)),3)
-        d_d = round(paste_seq.count("D")/len(paste_seq+str(0.000001)),3)
-        e_e = round(paste_seq.count("E")/len(paste_seq+str(0.000001)),3)
-        f_f = round(paste_seq.count("F")/len(paste_seq+str(0.000001)),3)
-        g_g = round(paste_seq.count("G")/len(paste_seq+str(0.000001)),3)
-        h_h = round(paste_seq.count("H")/len(paste_seq+str(0.000001)),3)
-        i_i = round(paste_seq.count("I")/len(paste_seq+str(0.000001)),3)
-        k_k = round(paste_seq.count("K")/len(paste_seq+str(0.000001)),3)
-        l_l = round(paste_seq.count("L")/len(paste_seq+str(0.000001)),3)
-        m_m = round(paste_seq.count("M")/len(paste_seq+str(0.000001)),3)
-        n_n = round(paste_seq.count("N")/len(paste_seq+str(0.000001)),3)
-        p_p = round(paste_seq.count("P")/len(paste_seq+str(0.000001)),3)
-        q_q = round(paste_seq.count("Q")/len(paste_seq+str(0.000001)),3)
-        r_r = round(paste_seq.count("R")/len(paste_seq+str(0.000001)),3)
-        s_s = round(paste_seq.count("S")/len(paste_seq+str(0.000001)),3)
-        t_t = round(paste_seq.count("T")/len(paste_seq+str(0.000001)),3)
-        v_v = round(paste_seq.count("V")/len(paste_seq+str(0.000001)),3)
-        w_w = round(paste_seq.count("W")/len(paste_seq+str(0.000001)),3)
-        y_y = round(paste_seq.count("Y")/len(paste_seq+str(0.000001)),3)
+        kyte_doolittle = {'A': 1.80, 'C': 2.50, 'D': -3.50, 'E': -3.50, 'F': 2.80,
+                          'G': -0.40, 'H': -3.20, 'I': 4.50, 'K': -3.90, 'L': 3.80,
+                          'M': 1.90, 'N': -3.50, 'P': -1.60, 'Q': -3.50, 'R': -4.50,
+                          'S': -0.80, 'T': -0.70, 'V': 4.20, 'W': -0.90, 'Y': -1.30}
+
+        molecular_weigth = {'A': 89.09, 'C': 121.15, 'D': 133.10, 'E': 147.13, 'F': 165.19,
+                            'G': 75.07, 'H': 155.16, 'I': 131.17, 'K': 146.19, 'L': 131.17,
+                            'M': 149.21, 'N': 132.12, 'P': 115.13, 'Q': 146.15, 'R': 174.20,
+                            'S': 105.09, 'T': 119.12, 'V': 117.15, 'W': 204.24, 'Y': 181.19}
 
+        net_charge = {'A': 0, 'C': 0, 'D': -1, 'E': -1, 'F': 0,
+                      'G': 0, 'H': 0, 'I': 0, 'K': 1, 'L': 0,
+                      'M': 0, 'N': 0, 'P': 0, 'Q': 0, 'R': 1,
+                      'S': 0, 'T': 0, 'V': 0, 'W': 0, 'Y': 0}
+
+        net_hydrogen = {'A': 0, 'C': 0, 'D': 1, 'E': 1, 'F': 0,
+                        'G': 0, 'H': 1, 'I': 0, 'K': 2, 'L': 0,
+                        'M': 0, 'N': 2, 'P': 0, 'Q': 2, 'R': 4,
+                        'S': 1, 'T': 1, 'V': 0, 'W': 1, 'Y': 1}
+
+        a_a = round(paste_seq.count("A")/len(paste_seq+str(0.000001)), 3)
+        c_c = round(paste_seq.count("C")/len(paste_seq+str(0.000001)), 3)
+        d_d = round(paste_seq.count("D")/len(paste_seq+str(0.000001)), 3)
+        e_e = round(paste_seq.count("E")/len(paste_seq+str(0.000001)), 3)
+        f_f = round(paste_seq.count("F")/len(paste_seq+str(0.000001)), 3)
+        g_g = round(paste_seq.count("G")/len(paste_seq+str(0.000001)), 3)
+        h_h = round(paste_seq.count("H")/len(paste_seq+str(0.000001)), 3)
+        i_i = round(paste_seq.count("I")/len(paste_seq+str(0.000001)), 3)
+        k_k = round(paste_seq.count("K")/len(paste_seq+str(0.000001)), 3)
+        l_l = round(paste_seq.count("L")/len(paste_seq+str(0.000001)), 3)
+        m_m = round(paste_seq.count("M")/len(paste_seq+str(0.000001)), 3)
+        n_n = round(paste_seq.count("N")/len(paste_seq+str(0.000001)), 3)
+        p_p = round(paste_seq.count("P")/len(paste_seq+str(0.000001)), 3)
+        q_q = round(paste_seq.count("Q")/len(paste_seq+str(0.000001)), 3)
+        r_r = round(paste_seq.count("R")/len(paste_seq+str(0.000001)), 3)
+        s_s = round(paste_seq.count("S")/len(paste_seq+str(0.000001)), 3)
+        t_t = round(paste_seq.count("T")/len(paste_seq+str(0.000001)), 3)
+        v_v = round(paste_seq.count("V")/len(paste_seq+str(0.000001)), 3)
+        w_w = round(paste_seq.count("W")/len(paste_seq+str(0.000001)), 3)
+        y_y = round(paste_seq.count("Y")/len(paste_seq+str(0.000001)), 3)
 
         a_kyte = paste_seq.count("A")*kyte_doolittle["A"]
         c_kyte = paste_seq.count("C")*kyte_doolittle["C"]
@@ -83,7 +312,6 @@ def calculation(self):
         w_kyte = paste_seq.count("W")*kyte_doolittle["W"]
         y_kyte = paste_seq.count("Y")*kyte_doolittle["Y"]
 
-
         a_mw = paste_seq.count("A")*molecular_weigth["A"]
         c_mw = paste_seq.count("C")*molecular_weigth["C"]
         d_mw = paste_seq.count("D")*molecular_weigth["D"]
@@ -104,7 +332,7 @@ def calculation(self):
         v_mw = paste_seq.count("V")*molecular_weigth["V"]
         w_mw = paste_seq.count("W")*molecular_weigth["W"]
         y_mw = paste_seq.count("Y")*molecular_weigth["Y"]
- 
+
         a_charge = paste_seq.count("A")*net_charge["A"]
         c_charge = paste_seq.count("C")*net_charge["C"]
         d_charge = paste_seq.count("D")*net_charge["D"]
@@ -147,44 +375,51 @@ def calculation(self):
         w_hydrogen = paste_seq.count("W")*net_hydrogen["W"]
         y_hydrogen = paste_seq.count("Y")*net_hydrogen["Y"]
 
-        #PROPERTIES Q-P
-      
-        aliphatic = round((i_i + l_l + v_v),3)
+        # PROPERTIES Q-P
+
+        aliphatic = round((i_i + l_l + v_v), 3)
+
+        negative_charged = round((d_d + e_e), 3)
+
+        total_charged = round((d_d + e_e + k_k + h_h + r_r), 3)
+
+        aromatic = round((f_f + h_h + w_w + y_y), 3)
+
+        polar = round((d_d + e_e + r_r + k_k + q_q + n_n), 3)
 
-        negative_charged = round((d_d + e_e),3)
+        neutral = round((a_a + g_g + h_h + p_p + s_s + t_t + y_y), 3)
 
-        total_charged = round((d_d + e_e + k_k + h_h + r_r),3)
+        hydrophobic = round((c_c + f_f + i_i + l_l + m_m + v_v + w_w), 3)
 
-        aromatic = round((f_f + h_h + w_w + y_y),3)
+        positive_charged = round((k_k + r_r + h_h), 3)
 
-        polar = round((d_d + e_e + r_r + k_k + q_q + n_n),3)
+        tiny = round((a_a + c_c + d_d + g_g + s_s + t_t), 3)
 
-        neutral = round((a_a + g_g + h_h + p_p + s_s + t_t + y_y),3)
+        small = round((e_e + h_h + i_i + l_l + k_k +
+                       m_m + n_n + p_p + q_q + v_v), 3)
 
-        hydrophobic = round((c_c + f_f + i_i + l_l + m_m + v_v + w_w),3)
+        large = round((f_f + r_r + w_w + y_y), 3)
 
-        positive_charged = round((k_k + r_r + h_h),3)
+        # SCALES
 
-        tiny = round((a_a + c_c + d_d + g_g + s_s + t_t),3)
+        kyleD = round(((a_kyte+c_kyte+d_kyte+e_kyte+f_kyte+g_kyte+h_kyte+i_kyte+k_kyte+l_kyte+m_kyte +
+                        n_kyte+p_kyte+q_kyte+r_kyte+s_kyte+t_kyte+v_kyte+w_kyte+y_kyte)/len(paste_seq+str(0.000001))), 3)
 
-        small = round((e_e + h_h + i_i + l_l + k_k + m_m + n_n + p_p + q_q + v_v),3)
+        molW = round((a_mw+c_mw+d_mw+e_mw+f_mw+g_mw+h_mw+i_mw+k_mw +
+                      l_mw+m_mw+n_mw+p_mw+q_mw+r_mw+s_mw+t_mw+v_mw+w_mw+y_mw), 3)
 
-        large = round((f_f + r_r + w_w + y_y),3)
+        netCharge = a_charge+c_charge+d_charge+e_charge+f_charge+g_charge+h_charge+i_charge+k_charge + \
+            l_charge+m_charge+n_charge+p_charge+q_charge+r_charge + \
+            s_charge+t_charge+v_charge+w_charge+y_charge
 
-        #SCALES
-        
-        kyleD = round(((a_kyte+c_kyte+d_kyte+e_kyte+f_kyte+g_kyte+h_kyte+i_kyte+k_kyte+l_kyte+m_kyte+n_kyte+p_kyte+q_kyte+r_kyte+s_kyte+t_kyte+v_kyte+w_kyte+y_kyte)/len(paste_seq+str(0.000001))),3)
+        netH = round((a_hydrogen+c_hydrogen+d_hydrogen+e_hydrogen+f_hydrogen+g_hydrogen+h_hydrogen+i_hydrogen+k_hydrogen+l_hydrogen +
+                      m_hydrogen+n_hydrogen+p_hydrogen+q_hydrogen+r_hydrogen+s_hydrogen+t_hydrogen+v_hydrogen+w_hydrogen+y_hydrogen), 3)
 
-        molW = round((a_mw+c_mw+d_mw+e_mw+f_mw+g_mw+h_mw+i_mw+k_mw+l_mw+m_mw+n_mw+p_mw+q_mw+r_mw+s_mw+t_mw+v_mw+w_mw+y_mw),3)
-      
-        netCharge = a_charge+c_charge+d_charge+e_charge+f_charge+g_charge+h_charge+i_charge+k_charge+l_charge+m_charge+n_charge+p_charge+q_charge+r_charge+s_charge+t_charge+v_charge+w_charge+y_charge
-      
-        netH = round((a_hydrogen+c_hydrogen+d_hydrogen+e_hydrogen+f_hydrogen+g_hydrogen+h_hydrogen+i_hydrogen+k_hydrogen+l_hydrogen+m_hydrogen+n_hydrogen+p_hydrogen+q_hydrogen+r_hydrogen+s_hydrogen+t_hydrogen+v_hydrogen+w_hydrogen+y_hydrogen),3)
+        result = "Probable: " + str(rfc.predict([[netH, netCharge, molW, kyleD, a_a, c_c, d_d, e_e, f_f, g_g, h_h, i_i, k_k, l_l, m_m, n_n, p_p, q_q, r_r,
+                                                  s_s, t_t, v_v, w_w, y_y, tiny, small, large, aliphatic, aromatic, total_charged, negative_charged, positive_charged, polar, neutral, hydrophobic]]))
 
-        result = "Probable: " + str(rfc.predict([[netH,netCharge,molW,kyleD,a_a,c_c,d_d,e_e,f_f,g_g,h_h,i_i,k_k,l_l,m_m,n_n,p_p,q_q,r_r,s_s,t_t,v_v,w_w,y_y,tiny,small,large,aliphatic,aromatic,total_charged,negative_charged,positive_charged,polar,neutral,hydrophobic]]))
-          
         self.textpred.setText(str(result))
-        self.textpred1.setText(str(aliphatic)) 
+        self.textpred1.setText(str(aliphatic))
         self.textpred2.setText(str(negative_charged))
         self.textpred3.setText(str(aromatic))
         self.textpred4.setText(str(polar))
@@ -199,11 +434,15 @@ def calculation(self):
         self.textpred13.setText(str(netCharge))
         self.textpred14.setText(str(netH))
         self.textpred15.setText(str(total_charged))
-        self.textrelat.setText("A: " + str(a_a) + " , " + "C: " + str(c_c) + " , " + "D: " + str(d_d) + " , " + "E: " + str(e_e) + " , " + "F: " + str(f_f) + " , " + "E: " + str(e_e) + " , " + "G: " + str(g_g) + " , " + "I: " + str(i_i) + " , " + "K: " + str(k_k) + " , " + "L: " + str(l_l) + " , " + "M: " + str(m_m) + " , " + "N: " + str(n_n) + " , " + "P: " + str(p_p) + " , " + "Q: " + str(q_q) + " , " + "R: " + str(r_r) + " , " + "S: " + str(s_s) + " , " + "T: " + str(t_t) + " , " + "V: " + str(v_v) + " , " + "W: " + str(w_w) + " , " + "Y: " + str(y_y))
+        self.textrelat.setText("A: " + str(a_a) + " , " + "C: " + str(c_c) + " , " + "D: " + str(d_d) + " , " + "E: " + str(e_e) + " , " + "F: " + str(f_f) + " , " + "E: " + str(e_e) + " , " + "G: " + str(g_g) + " , " + "I: " + str(i_i) + " , " + "K: " + str(k_k) + " , " + "L: " + str(
+            l_l) + " , " + "M: " + str(m_m) + " , " + "N: " + str(n_n) + " , " + "P: " + str(p_p) + " , " + "Q: " + str(q_q) + " , " + "R: " + str(r_r) + " , " + "S: " + str(s_s) + " , " + "T: " + str(t_t) + " , " + "V: " + str(v_v) + " , " + "W: " + str(w_w) + " , " + "Y: " + str(y_y))
 
 
 if __name__ == "__main__":
-    app = QtWidgets.QApplication(sys.argv)
-    window = MyApp()
-    window.show()
-    sys.exit(app.exec_())
+    if len(sys.argv) == 1:
+        app = QtWidgets.QApplication(sys.argv)
+        window = MyApp()
+        window.show()
+        sys.exit(app.exec_())
+    else:
+        print(getResultsFile(sys.argv[-1]))

From cc47c05543d192198db8b0c67ff33368b25f9131 Mon Sep 17 00:00:00 2001
From: Caio <ccastro@localhost.localdomain>
Date: Thu, 15 Oct 2020 11:26:07 -0300
Subject: [PATCH 04/10] change output format to tsv + --help commands

---
 antivpp.py | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

diff --git a/antivpp.py b/antivpp.py
index 19682c8..a95be50 100644
--- a/antivpp.py
+++ b/antivpp.py
@@ -231,8 +231,14 @@ def getResultsFile(filename):
     rfc = joblib.load('modelo_entrenado_2.pkl')
 
     with open(filename) as fp:
+        print('Name\tPredicted_Antiviral\tSequence\n')
         for name, seq in read_fasta(fp):
-            print(name, seq, getResultsSeq(seq, rfc))
+            result = getResultsSeq(seq, rfc)
+            print('{}\t{}\t{}'.format(
+                name.replace('>', ''),
+                result.replace('[', '').replace(']', '').strip(),
+                seq
+            ))
 
 
 class MyApp(QtWidgets.QMainWindow, Ui_MainWindow):
@@ -445,4 +451,13 @@ def calculation(self):
         window.show()
         sys.exit(app.exec_())
     else:
-        print(getResultsFile(sys.argv[-1]))
+        if sys.argv[-1] == '--help':
+            print(
+                "Usage:\n"
+                "For UI version:\n"
+                "\tpython antivpp.py\n"
+                "For getting results from fasta file:\n"
+                "\tpython antivpp.py [fasta filename] > [tsv filename]\n"
+                "\nUse relative paths\n")
+        else:
+            print(getResultsFile(sys.argv[-1]))

From f4ecbf16f9aefc55701518033e063409e0d5a998 Mon Sep 17 00:00:00 2001
From: Caio <ccastro@localhost.localdomain>
Date: Thu, 15 Oct 2020 11:54:30 -0300
Subject: [PATCH 05/10] small cleanup

---
 .gitignore |  2 +-
 antivpp.py | 41 +++++++++++++++--------------------------
 2 files changed, 16 insertions(+), 27 deletions(-)

diff --git a/.gitignore b/.gitignore
index ed8ebf5..bee8a64 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1 @@
-__pycache__
\ No newline at end of file
+__pycache__
diff --git a/antivpp.py b/antivpp.py
index a95be50..ba96e1d 100644
--- a/antivpp.py
+++ b/antivpp.py
@@ -197,32 +197,21 @@ def getResultsSeq(sequence, rfc):
         l_charge+m_charge+n_charge+p_charge+q_charge+r_charge + \
         s_charge+t_charge+v_charge+w_charge+y_charge
 
-    netH = round((a_hydrogen+c_hydrogen+d_hydrogen+e_hydrogen+f_hydrogen+g_hydrogen+h_hydrogen+i_hydrogen+k_hydrogen+l_hydrogen +
-                  m_hydrogen+n_hydrogen+p_hydrogen+q_hydrogen+r_hydrogen+s_hydrogen+t_hydrogen+v_hydrogen+w_hydrogen+y_hydrogen), 3)
-
-    # result = "Probable: " + str(rfc.predict([[netH, netCharge, molW, kyleD, a_a, c_c, d_d, e_e, f_f, g_g, h_h, i_i, k_k, l_l, m_m, n_n, p_p, q_q, r_r,
-    #                                           s_s, t_t, v_v, w_w, y_y, tiny, small, large, aliphatic, aromatic, total_charged, negative_charged, positive_charged, polar, neutral, hydrophobic]]))
-
-    # self.textpred.setText(str(result))
-    # self.textpred1.setText(str(aliphatic))
-    # self.textpred2.setText(str(negative_charged))
-    # self.textpred3.setText(str(aromatic))
-    # self.textpred4.setText(str(polar))
-    # self.textpred5.setText(str(neutral))
-    # self.textpred6.setText(str(hydrophobic))
-    # self.textpred7.setText(str(positive_charged))
-    # self.textpred8.setText(str(tiny))
-    # self.textpred9.setText(str(small))
-    # self.textpred10.setText(str(large))
-    # self.textpred11.setText(str(kyleD))
-    # self.textpred12.setText(str(molW))
-    # self.textpred13.setText(str(netCharge))
-    # self.textpred14.setText(str(netH))
-    # self.textpred15.setText(str(total_charged))
-    # self.textrelat.setText("A: " + str(a_a) + " , " + "C: " + str(c_c) + " , " + "D: " + str(d_d) + " , " + "E: " + str(e_e) + " , " + "F: " + str(f_f) + " , " + "E: " + str(e_e) + " , " + "G: " + str(g_g) + " , " + "I: " + str(i_i) + " , " + "K: " + str(k_k) + " , " + "L: " + str(l_l) + " , " + "M: " + str(m_m) + " , " + "N: " + str(n_n) + " , " + "P: " + str(p_p) + " , " + "Q: " + str(q_q) + " , " + "R: " + str(r_r) + " , " + "S: " + str(s_s) + " , " + "T: " + str(t_t) + " , " + "V: " + str(v_v) + " , " + "W: " + str(w_w) + " , " + "Y: " + str(y_y))
-
-    result = str(rfc.predict([[netH, netCharge, molW, kyleD, a_a, c_c, d_d, e_e, f_f, g_g, h_h, i_i, k_k, l_l, m_m, n_n, p_p, q_q, r_r,
-                               s_s, t_t, v_v, w_w, y_y, tiny, small, large, aliphatic, aromatic, total_charged, negative_charged, positive_charged, polar, neutral, hydrophobic]]))
+    netH = round((
+        a_hydrogen+c_hydrogen+d_hydrogen+e_hydrogen +
+        f_hydrogen+g_hydrogen+h_hydrogen+i_hydrogen +
+        k_hydrogen+l_hydrogen+m_hydrogen+n_hydrogen +
+        p_hydrogen+q_hydrogen+r_hydrogen+s_hydrogen +
+        t_hydrogen+v_hydrogen+w_hydrogen+y_hydrogen), 3)
+
+    result = str(rfc.predict([[
+        netH, netCharge, molW, kyleD,
+        a_a, c_c, d_d, e_e, f_f, g_g,
+        h_h, i_i, k_k, l_l, m_m, n_n,
+        p_p, q_q, r_r, s_s, t_t, v_v, w_w, y_y,
+        tiny, small, large, aliphatic, aromatic,
+        total_charged, negative_charged, positive_charged,
+        polar, neutral, hydrophobic]]))
 
     return result
 

From 2ca4558ece831b9bf5e0a4744a33bbfe58e324f4 Mon Sep 17 00:00:00 2001
From: Caio <ccastro@localhost.localdomain>
Date: Thu, 15 Oct 2020 12:50:59 -0300
Subject: [PATCH 06/10] beginning of cleanup of code

---
 antivpp.py | 129 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 127 insertions(+), 2 deletions(-)

diff --git a/antivpp.py b/antivpp.py
index ba96e1d..59072cb 100644
--- a/antivpp.py
+++ b/antivpp.py
@@ -21,6 +21,129 @@ def read_fasta(fp):
         yield (name, ''.join(seq))
 
 
+def getResultsSeqClean(sequence, rfc):
+    paste_seq = str(sequence)
+    seq_size = len(paste_seq+str(0.000001))
+
+    kyte_doolittle = {'A': 1.80, 'C': 2.50, 'D': -3.50, 'E': -3.50, 'F': 2.80,
+                      'G': -0.40, 'H': -3.20, 'I': 4.50, 'K': -3.90, 'L': 3.80,
+                      'M': 1.90, 'N': -3.50, 'P': -1.60, 'Q': -3.50, 'R': -4.50,
+                      'S': -0.80, 'T': -0.70, 'V': 4.20, 'W': -0.90, 'Y': -1.30}
+
+    molecular_weigth = {'A': 89.09, 'C': 121.15, 'D': 133.10, 'E': 147.13, 'F': 165.19,
+                        'G': 75.07, 'H': 155.16, 'I': 131.17, 'K': 146.19, 'L': 131.17,
+                        'M': 149.21, 'N': 132.12, 'P': 115.13, 'Q': 146.15, 'R': 174.20,
+                        'S': 105.09, 'T': 119.12, 'V': 117.15, 'W': 204.24, 'Y': 181.19}
+
+    net_charge = {'A': 0, 'C': 0, 'D': -1, 'E': -1, 'F': 0,
+                  'G': 0, 'H': 0, 'I': 0, 'K': 1, 'L': 0,
+                  'M': 0, 'N': 0, 'P': 0, 'Q': 0, 'R': 1,
+                  'S': 0, 'T': 0, 'V': 0, 'W': 0, 'Y': 0}
+
+    net_hydrogen = {'A': 0, 'C': 0, 'D': 1, 'E': 1, 'F': 0,
+                    'G': 0, 'H': 1, 'I': 0, 'K': 2, 'L': 0,
+                    'M': 0, 'N': 2, 'P': 0, 'Q': 2, 'R': 4,
+                    'S': 1, 'T': 1, 'V': 0, 'W': 1, 'Y': 1}
+
+    aa_list = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L',
+               'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']
+    aa_counts = {k: paste_seq.count(k) for k in aa_list}
+
+    a_a = round(aa_counts["A"]/seq_size, 3)
+    c_c = round(aa_counts["C"]/seq_size, 3)
+    d_d = round(aa_counts["D"]/seq_size, 3)
+    e_e = round(aa_counts["E"]/seq_size, 3)
+    f_f = round(aa_counts["F"]/seq_size, 3)
+    g_g = round(aa_counts["G"]/seq_size, 3)
+    h_h = round(aa_counts["H"]/seq_size, 3)
+    i_i = round(aa_counts["I"]/seq_size, 3)
+    k_k = round(aa_counts["K"]/seq_size, 3)
+    l_l = round(aa_counts["L"]/seq_size, 3)
+    m_m = round(aa_counts["M"]/seq_size, 3)
+    n_n = round(aa_counts["N"]/seq_size, 3)
+    p_p = round(aa_counts["P"]/seq_size, 3)
+    q_q = round(aa_counts["Q"]/seq_size, 3)
+    r_r = round(aa_counts["R"]/seq_size, 3)
+    s_s = round(aa_counts["S"]/seq_size, 3)
+    t_t = round(aa_counts["T"]/seq_size, 3)
+    v_v = round(aa_counts["V"]/seq_size, 3)
+    w_w = round(aa_counts["W"]/seq_size, 3)
+    y_y = round(aa_counts["Y"]/seq_size, 3)
+
+    a_hydrogen = aa_counts["A"]*net_hydrogen["A"]
+    c_hydrogen = aa_counts["C"]*net_hydrogen["C"]
+    d_hydrogen = aa_counts["D"]*net_hydrogen["D"]
+    e_hydrogen = aa_counts["E"]*net_hydrogen["E"]
+    f_hydrogen = aa_counts["F"]*net_hydrogen["F"]
+    g_hydrogen = aa_counts["G"]*net_hydrogen["G"]
+    h_hydrogen = aa_counts["H"]*net_hydrogen["H"]
+    i_hydrogen = aa_counts["I"]*net_hydrogen["I"]
+    k_hydrogen = aa_counts["K"]*net_hydrogen["K"]
+    l_hydrogen = aa_counts["L"]*net_hydrogen["L"]
+    m_hydrogen = aa_counts["M"]*net_hydrogen["M"]
+    n_hydrogen = aa_counts["N"]*net_hydrogen["N"]
+    p_hydrogen = aa_counts["P"]*net_hydrogen["P"]
+    q_hydrogen = aa_counts["Q"]*net_hydrogen["Q"]
+    r_hydrogen = aa_counts["R"]*net_hydrogen["R"]
+    s_hydrogen = aa_counts["S"]*net_hydrogen["S"]
+    t_hydrogen = aa_counts["T"]*net_hydrogen["T"]
+    v_hydrogen = aa_counts["V"]*net_hydrogen["V"]
+    w_hydrogen = aa_counts["W"]*net_hydrogen["W"]
+    y_hydrogen = aa_counts["Y"]*net_hydrogen["Y"]
+
+    # PROPERTIES Q-P
+
+    aliphatic = round((i_i + l_l + v_v), 3)
+
+    negative_charged = round((d_d + e_e), 3)
+
+    total_charged = round((d_d + e_e + k_k + h_h + r_r), 3)
+
+    aromatic = round((f_f + h_h + w_w + y_y), 3)
+
+    polar = round((d_d + e_e + r_r + k_k + q_q + n_n), 3)
+
+    neutral = round((a_a + g_g + h_h + p_p + s_s + t_t + y_y), 3)
+
+    hydrophobic = round((c_c + f_f + i_i + l_l + m_m + v_v + w_w), 3)
+
+    positive_charged = round((k_k + r_r + h_h), 3)
+
+    tiny = round((a_a + c_c + d_d + g_g + s_s + t_t), 3)
+
+    small = round((e_e + h_h + i_i + l_l + k_k +
+                   m_m + n_n + p_p + q_q + v_v), 3)
+
+    large = round((f_f + r_r + w_w + y_y), 3)
+
+    # SCALES
+
+    kyleD = round(sum([aa_counts[k]*kyte_doolittle[k]
+                       for k in aa_list])/seq_size, 3)
+
+    molW = round(sum([aa_counts[k]*molecular_weigth[k] for k in aa_list]), 3)
+
+    netCharge = sum([aa_counts[k]*net_charge[k] for k in aa_list])
+
+    netH = round((
+        a_hydrogen+c_hydrogen+d_hydrogen+e_hydrogen +
+        f_hydrogen+g_hydrogen+h_hydrogen+i_hydrogen +
+        k_hydrogen+l_hydrogen+m_hydrogen+n_hydrogen +
+        p_hydrogen+q_hydrogen+r_hydrogen+s_hydrogen +
+        t_hydrogen+v_hydrogen+w_hydrogen+y_hydrogen), 3)
+
+    result = str(rfc.predict([[
+        netH, netCharge, molW, kyleD,
+        a_a, c_c, d_d, e_e, f_f, g_g,
+        h_h, i_i, k_k, l_l, m_m, n_n,
+        p_p, q_q, r_r, s_s, t_t, v_v, w_w, y_y,
+        tiny, small, large, aliphatic, aromatic,
+        total_charged, negative_charged, positive_charged,
+        polar, neutral, hydrophobic]]))
+
+    return result
+
+
 def getResultsSeq(sequence, rfc):
     paste_seq = str(sequence)
 
@@ -220,12 +343,14 @@ def getResultsFile(filename):
     rfc = joblib.load('modelo_entrenado_2.pkl')
 
     with open(filename) as fp:
-        print('Name\tPredicted_Antiviral\tSequence\n')
+        print('Name\tPredicted_Antiviral\tNew\tSequence\n')
         for name, seq in read_fasta(fp):
             result = getResultsSeq(seq, rfc)
-            print('{}\t{}\t{}'.format(
+            new_result = getResultsSeq(seq, rfc)
+            print('{}\t{}\t{}\t{}'.format(
                 name.replace('>', ''),
                 result.replace('[', '').replace(']', '').strip(),
+                new_result.replace('[', '').replace(']', '').strip(),
                 seq
             ))
 

From d5d0c6627efca847f2bd8ac8108df4a365039407 Mon Sep 17 00:00:00 2001
From: Caio <ccastro@localhost.localdomain>
Date: Thu, 15 Oct 2020 12:54:59 -0300
Subject: [PATCH 07/10] more cleanup

---
 antivpp.py | 35 +++++------------------------------
 1 file changed, 5 insertions(+), 30 deletions(-)

diff --git a/antivpp.py b/antivpp.py
index 59072cb..d42208c 100644
--- a/antivpp.py
+++ b/antivpp.py
@@ -47,6 +47,7 @@ def getResultsSeqClean(sequence, rfc):
 
     aa_list = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L',
                'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']
+
     aa_counts = {k: paste_seq.count(k) for k in aa_list}
 
     a_a = round(aa_counts["A"]/seq_size, 3)
@@ -70,27 +71,6 @@ def getResultsSeqClean(sequence, rfc):
     w_w = round(aa_counts["W"]/seq_size, 3)
     y_y = round(aa_counts["Y"]/seq_size, 3)
 
-    a_hydrogen = aa_counts["A"]*net_hydrogen["A"]
-    c_hydrogen = aa_counts["C"]*net_hydrogen["C"]
-    d_hydrogen = aa_counts["D"]*net_hydrogen["D"]
-    e_hydrogen = aa_counts["E"]*net_hydrogen["E"]
-    f_hydrogen = aa_counts["F"]*net_hydrogen["F"]
-    g_hydrogen = aa_counts["G"]*net_hydrogen["G"]
-    h_hydrogen = aa_counts["H"]*net_hydrogen["H"]
-    i_hydrogen = aa_counts["I"]*net_hydrogen["I"]
-    k_hydrogen = aa_counts["K"]*net_hydrogen["K"]
-    l_hydrogen = aa_counts["L"]*net_hydrogen["L"]
-    m_hydrogen = aa_counts["M"]*net_hydrogen["M"]
-    n_hydrogen = aa_counts["N"]*net_hydrogen["N"]
-    p_hydrogen = aa_counts["P"]*net_hydrogen["P"]
-    q_hydrogen = aa_counts["Q"]*net_hydrogen["Q"]
-    r_hydrogen = aa_counts["R"]*net_hydrogen["R"]
-    s_hydrogen = aa_counts["S"]*net_hydrogen["S"]
-    t_hydrogen = aa_counts["T"]*net_hydrogen["T"]
-    v_hydrogen = aa_counts["V"]*net_hydrogen["V"]
-    w_hydrogen = aa_counts["W"]*net_hydrogen["W"]
-    y_hydrogen = aa_counts["Y"]*net_hydrogen["Y"]
-
     # PROPERTIES Q-P
 
     aliphatic = round((i_i + l_l + v_v), 3)
@@ -125,23 +105,18 @@ def getResultsSeqClean(sequence, rfc):
 
     netCharge = sum([aa_counts[k]*net_charge[k] for k in aa_list])
 
-    netH = round((
-        a_hydrogen+c_hydrogen+d_hydrogen+e_hydrogen +
-        f_hydrogen+g_hydrogen+h_hydrogen+i_hydrogen +
-        k_hydrogen+l_hydrogen+m_hydrogen+n_hydrogen +
-        p_hydrogen+q_hydrogen+r_hydrogen+s_hydrogen +
-        t_hydrogen+v_hydrogen+w_hydrogen+y_hydrogen), 3)
+    netH = round(sum([aa_counts[k]*net_hydrogen[k] for k in aa_list]), 3)
 
-    result = str(rfc.predict([[
+    result = rfc.predict([[
         netH, netCharge, molW, kyleD,
         a_a, c_c, d_d, e_e, f_f, g_g,
         h_h, i_i, k_k, l_l, m_m, n_n,
         p_p, q_q, r_r, s_s, t_t, v_v, w_w, y_y,
         tiny, small, large, aliphatic, aromatic,
         total_charged, negative_charged, positive_charged,
-        polar, neutral, hydrophobic]]))
+        polar, neutral, hydrophobic]])
 
-    return result
+    return str(result)
 
 
 def getResultsSeq(sequence, rfc):

From 6c071fee84a0967b98341a611a7656dc5dc040d0 Mon Sep 17 00:00:00 2001
From: Caio <ccastro@localhost.localdomain>
Date: Thu, 15 Oct 2020 13:47:52 -0300
Subject: [PATCH 08/10] final cleanup. VERSION FOR COMPARING IMPLEMENTATIONS

---
 antivpp.py | 73 ++++++++++++++++++++++--------------------------------
 1 file changed, 30 insertions(+), 43 deletions(-)

diff --git a/antivpp.py b/antivpp.py
index d42208c..6c84fc2 100644
--- a/antivpp.py
+++ b/antivpp.py
@@ -21,6 +21,10 @@ def read_fasta(fp):
         yield (name, ''.join(seq))
 
 
+def partialDictRoundedSum(d, keys):
+    return round(sum([d[k] for k in keys]), 3)
+
+
 def getResultsSeqClean(sequence, rfc):
     paste_seq = str(sequence)
     seq_size = len(paste_seq+str(0.000001))
@@ -50,56 +54,41 @@ def getResultsSeqClean(sequence, rfc):
 
     aa_counts = {k: paste_seq.count(k) for k in aa_list}
 
-    a_a = round(aa_counts["A"]/seq_size, 3)
-    c_c = round(aa_counts["C"]/seq_size, 3)
-    d_d = round(aa_counts["D"]/seq_size, 3)
-    e_e = round(aa_counts["E"]/seq_size, 3)
-    f_f = round(aa_counts["F"]/seq_size, 3)
-    g_g = round(aa_counts["G"]/seq_size, 3)
-    h_h = round(aa_counts["H"]/seq_size, 3)
-    i_i = round(aa_counts["I"]/seq_size, 3)
-    k_k = round(aa_counts["K"]/seq_size, 3)
-    l_l = round(aa_counts["L"]/seq_size, 3)
-    m_m = round(aa_counts["M"]/seq_size, 3)
-    n_n = round(aa_counts["N"]/seq_size, 3)
-    p_p = round(aa_counts["P"]/seq_size, 3)
-    q_q = round(aa_counts["Q"]/seq_size, 3)
-    r_r = round(aa_counts["R"]/seq_size, 3)
-    s_s = round(aa_counts["S"]/seq_size, 3)
-    t_t = round(aa_counts["T"]/seq_size, 3)
-    v_v = round(aa_counts["V"]/seq_size, 3)
-    w_w = round(aa_counts["W"]/seq_size, 3)
-    y_y = round(aa_counts["Y"]/seq_size, 3)
+    aa_perc = {k: round(aa_counts[k]/seq_size, 3) for k in aa_list}
 
     # PROPERTIES Q-P
 
-    aliphatic = round((i_i + l_l + v_v), 3)
+    aliphatic = partialDictRoundedSum(aa_perc, ['I', 'V', 'L'])
 
-    negative_charged = round((d_d + e_e), 3)
+    negative_charged = partialDictRoundedSum(aa_perc, ['D', 'E'])
 
-    total_charged = round((d_d + e_e + k_k + h_h + r_r), 3)
+    total_charged = partialDictRoundedSum(aa_perc, ['D', 'E', 'K', 'H', 'R'])
 
-    aromatic = round((f_f + h_h + w_w + y_y), 3)
+    aromatic = partialDictRoundedSum(aa_perc, ['F', 'H', 'W', 'Y'])
 
-    polar = round((d_d + e_e + r_r + k_k + q_q + n_n), 3)
+    polar = partialDictRoundedSum(aa_perc, ['D', 'E', 'R', 'K', 'Q', 'N'])
 
-    neutral = round((a_a + g_g + h_h + p_p + s_s + t_t + y_y), 3)
+    neutral = partialDictRoundedSum(aa_perc,
+                                    ['A', 'G', 'H', 'P', 'S', 'T', 'Y'])
 
-    hydrophobic = round((c_c + f_f + i_i + l_l + m_m + v_v + w_w), 3)
+    hydrophobic = partialDictRoundedSum(aa_perc,
+                                        ['C', 'F', 'I', 'L', 'M', 'V', 'W'])
 
-    positive_charged = round((k_k + r_r + h_h), 3)
+    positive_charged = partialDictRoundedSum(aa_perc, ['K', 'R', 'H'])
 
-    tiny = round((a_a + c_c + d_d + g_g + s_s + t_t), 3)
+    tiny = partialDictRoundedSum(aa_perc, ['A', 'C', 'D', 'G', 'S', 'T'])
 
-    small = round((e_e + h_h + i_i + l_l + k_k +
-                   m_m + n_n + p_p + q_q + v_v), 3)
+    small = partialDictRoundedSum(aa_perc,
+                                  ['E', 'H', 'I', 'L', 'K', 'M', 'N', 'P', 'Q', 'V'])
 
-    large = round((f_f + r_r + w_w + y_y), 3)
+    large = partialDictRoundedSum(aa_perc, ['F', 'R', 'W', 'Y'])
 
     # SCALES
 
-    kyleD = round(sum([aa_counts[k]*kyte_doolittle[k]
-                       for k in aa_list])/seq_size, 3)
+    kyleD = round(
+        sum(
+            [aa_counts[k]*kyte_doolittle[k] for k in aa_list]
+        )/seq_size, 3)
 
     molW = round(sum([aa_counts[k]*molecular_weigth[k] for k in aa_list]), 3)
 
@@ -107,14 +96,12 @@ def getResultsSeqClean(sequence, rfc):
 
     netH = round(sum([aa_counts[k]*net_hydrogen[k] for k in aa_list]), 3)
 
-    result = rfc.predict([[
-        netH, netCharge, molW, kyleD,
-        a_a, c_c, d_d, e_e, f_f, g_g,
-        h_h, i_i, k_k, l_l, m_m, n_n,
-        p_p, q_q, r_r, s_s, t_t, v_v, w_w, y_y,
-        tiny, small, large, aliphatic, aromatic,
-        total_charged, negative_charged, positive_charged,
-        polar, neutral, hydrophobic]])
+    result = rfc.predict([
+        [netH, netCharge, molW, kyleD] +
+        [v for v in aa_perc.values()] +
+        [tiny, small, large, aliphatic, aromatic,
+         total_charged, negative_charged, positive_charged,
+         polar, neutral, hydrophobic]])
 
     return str(result)
 
@@ -321,7 +308,7 @@ def getResultsFile(filename):
         print('Name\tPredicted_Antiviral\tNew\tSequence\n')
         for name, seq in read_fasta(fp):
             result = getResultsSeq(seq, rfc)
-            new_result = getResultsSeq(seq, rfc)
+            new_result = getResultsSeqClean(seq, rfc)
             print('{}\t{}\t{}\t{}'.format(
                 name.replace('>', ''),
                 result.replace('[', '').replace(']', '').strip(),

From 43852cab2526e72a242f88f0bc28d4745bef1110 Mon Sep 17 00:00:00 2001
From: Caio <ccastro@localhost.localdomain>
Date: Thu, 15 Oct 2020 13:54:46 -0300
Subject: [PATCH 09/10] Replace old version with refactored one

---
 antivpp.py | 373 +++++++----------------------------------------------
 1 file changed, 44 insertions(+), 329 deletions(-)

diff --git a/antivpp.py b/antivpp.py
index 6c84fc2..a7c9882 100644
--- a/antivpp.py
+++ b/antivpp.py
@@ -25,7 +25,7 @@ def partialDictRoundedSum(d, keys):
     return round(sum([d[k] for k in keys]), 3)
 
 
-def getResultsSeqClean(sequence, rfc):
+def getResultsSeq(sequence, rfc):
     paste_seq = str(sequence)
     seq_size = len(paste_seq+str(0.000001))
 
@@ -106,213 +106,16 @@ def getResultsSeqClean(sequence, rfc):
     return str(result)
 
 
-def getResultsSeq(sequence, rfc):
-    paste_seq = str(sequence)
-
-    kyte_doolittle = {'A': 1.80, 'C': 2.50, 'D': -3.50, 'E': -3.50, 'F': 2.80,
-                      'G': -0.40, 'H': -3.20, 'I': 4.50, 'K': -3.90, 'L': 3.80,
-                      'M': 1.90, 'N': -3.50, 'P': -1.60, 'Q': -3.50, 'R': -4.50,
-                      'S': -0.80, 'T': -0.70, 'V': 4.20, 'W': -0.90, 'Y': -1.30}
-
-    molecular_weigth = {'A': 89.09, 'C': 121.15, 'D': 133.10, 'E': 147.13, 'F': 165.19,
-                        'G': 75.07, 'H': 155.16, 'I': 131.17, 'K': 146.19, 'L': 131.17,
-                        'M': 149.21, 'N': 132.12, 'P': 115.13, 'Q': 146.15, 'R': 174.20,
-                        'S': 105.09, 'T': 119.12, 'V': 117.15, 'W': 204.24, 'Y': 181.19}
-
-    net_charge = {'A': 0, 'C': 0, 'D': -1, 'E': -1, 'F': 0,
-                  'G': 0, 'H': 0, 'I': 0, 'K': 1, 'L': 0,
-                  'M': 0, 'N': 0, 'P': 0, 'Q': 0, 'R': 1,
-                  'S': 0, 'T': 0, 'V': 0, 'W': 0, 'Y': 0}
-
-    net_hydrogen = {'A': 0, 'C': 0, 'D': 1, 'E': 1, 'F': 0,
-                    'G': 0, 'H': 1, 'I': 0, 'K': 2, 'L': 0,
-                    'M': 0, 'N': 2, 'P': 0, 'Q': 2, 'R': 4,
-                    'S': 1, 'T': 1, 'V': 0, 'W': 1, 'Y': 1}
-
-    a_a = round(paste_seq.count("A")/len(paste_seq+str(0.000001)), 3)
-    c_c = round(paste_seq.count("C")/len(paste_seq+str(0.000001)), 3)
-    d_d = round(paste_seq.count("D")/len(paste_seq+str(0.000001)), 3)
-    e_e = round(paste_seq.count("E")/len(paste_seq+str(0.000001)), 3)
-    f_f = round(paste_seq.count("F")/len(paste_seq+str(0.000001)), 3)
-    g_g = round(paste_seq.count("G")/len(paste_seq+str(0.000001)), 3)
-    h_h = round(paste_seq.count("H")/len(paste_seq+str(0.000001)), 3)
-    i_i = round(paste_seq.count("I")/len(paste_seq+str(0.000001)), 3)
-    k_k = round(paste_seq.count("K")/len(paste_seq+str(0.000001)), 3)
-    l_l = round(paste_seq.count("L")/len(paste_seq+str(0.000001)), 3)
-    m_m = round(paste_seq.count("M")/len(paste_seq+str(0.000001)), 3)
-    n_n = round(paste_seq.count("N")/len(paste_seq+str(0.000001)), 3)
-    p_p = round(paste_seq.count("P")/len(paste_seq+str(0.000001)), 3)
-    q_q = round(paste_seq.count("Q")/len(paste_seq+str(0.000001)), 3)
-    r_r = round(paste_seq.count("R")/len(paste_seq+str(0.000001)), 3)
-    s_s = round(paste_seq.count("S")/len(paste_seq+str(0.000001)), 3)
-    t_t = round(paste_seq.count("T")/len(paste_seq+str(0.000001)), 3)
-    v_v = round(paste_seq.count("V")/len(paste_seq+str(0.000001)), 3)
-    w_w = round(paste_seq.count("W")/len(paste_seq+str(0.000001)), 3)
-    y_y = round(paste_seq.count("Y")/len(paste_seq+str(0.000001)), 3)
-
-    a_kyte = paste_seq.count("A")*kyte_doolittle["A"]
-    c_kyte = paste_seq.count("C")*kyte_doolittle["C"]
-    d_kyte = paste_seq.count("D")*kyte_doolittle["D"]
-    e_kyte = paste_seq.count("E")*kyte_doolittle["E"]
-    f_kyte = paste_seq.count("F")*kyte_doolittle["F"]
-    g_kyte = paste_seq.count("G")*kyte_doolittle["G"]
-    h_kyte = paste_seq.count("H")*kyte_doolittle["H"]
-    i_kyte = paste_seq.count("I")*kyte_doolittle["I"]
-    k_kyte = paste_seq.count("K")*kyte_doolittle["K"]
-    l_kyte = paste_seq.count("L")*kyte_doolittle["L"]
-    m_kyte = paste_seq.count("M")*kyte_doolittle["M"]
-    n_kyte = paste_seq.count("N")*kyte_doolittle["N"]
-    p_kyte = paste_seq.count("P")*kyte_doolittle["P"]
-    q_kyte = paste_seq.count("Q")*kyte_doolittle["Q"]
-    r_kyte = paste_seq.count("R")*kyte_doolittle["R"]
-    s_kyte = paste_seq.count("S")*kyte_doolittle["S"]
-    t_kyte = paste_seq.count("T")*kyte_doolittle["T"]
-    v_kyte = paste_seq.count("V")*kyte_doolittle["V"]
-    w_kyte = paste_seq.count("W")*kyte_doolittle["W"]
-    y_kyte = paste_seq.count("Y")*kyte_doolittle["Y"]
-
-    a_mw = paste_seq.count("A")*molecular_weigth["A"]
-    c_mw = paste_seq.count("C")*molecular_weigth["C"]
-    d_mw = paste_seq.count("D")*molecular_weigth["D"]
-    e_mw = paste_seq.count("E")*molecular_weigth["E"]
-    f_mw = paste_seq.count("F")*molecular_weigth["F"]
-    g_mw = paste_seq.count("G")*molecular_weigth["G"]
-    h_mw = paste_seq.count("H")*molecular_weigth["H"]
-    i_mw = paste_seq.count("I")*molecular_weigth["I"]
-    k_mw = paste_seq.count("K")*molecular_weigth["K"]
-    l_mw = paste_seq.count("L")*molecular_weigth["L"]
-    m_mw = paste_seq.count("M")*molecular_weigth["M"]
-    n_mw = paste_seq.count("N")*molecular_weigth["N"]
-    p_mw = paste_seq.count("P")*molecular_weigth["P"]
-    q_mw = paste_seq.count("Q")*molecular_weigth["Q"]
-    r_mw = paste_seq.count("R")*molecular_weigth["R"]
-    s_mw = paste_seq.count("S")*molecular_weigth["S"]
-    t_mw = paste_seq.count("T")*molecular_weigth["T"]
-    v_mw = paste_seq.count("V")*molecular_weigth["V"]
-    w_mw = paste_seq.count("W")*molecular_weigth["W"]
-    y_mw = paste_seq.count("Y")*molecular_weigth["Y"]
-
-    a_charge = paste_seq.count("A")*net_charge["A"]
-    c_charge = paste_seq.count("C")*net_charge["C"]
-    d_charge = paste_seq.count("D")*net_charge["D"]
-    e_charge = paste_seq.count("E")*net_charge["E"]
-    f_charge = paste_seq.count("F")*net_charge["F"]
-    g_charge = paste_seq.count("G")*net_charge["G"]
-    h_charge = paste_seq.count("H")*net_charge["H"]
-    i_charge = paste_seq.count("I")*net_charge["I"]
-    k_charge = paste_seq.count("K")*net_charge["K"]
-    l_charge = paste_seq.count("L")*net_charge["L"]
-    m_charge = paste_seq.count("M")*net_charge["M"]
-    n_charge = paste_seq.count("N")*net_charge["N"]
-    p_charge = paste_seq.count("P")*net_charge["P"]
-    q_charge = paste_seq.count("Q")*net_charge["Q"]
-    r_charge = paste_seq.count("R")*net_charge["R"]
-    s_charge = paste_seq.count("S")*net_charge["S"]
-    t_charge = paste_seq.count("T")*net_charge["T"]
-    v_charge = paste_seq.count("V")*net_charge["V"]
-    w_charge = paste_seq.count("W")*net_charge["W"]
-    y_charge = paste_seq.count("Y")*net_charge["Y"]
-
-    a_hydrogen = paste_seq.count("A")*net_hydrogen["A"]
-    c_hydrogen = paste_seq.count("C")*net_hydrogen["C"]
-    d_hydrogen = paste_seq.count("D")*net_hydrogen["D"]
-    e_hydrogen = paste_seq.count("E")*net_hydrogen["E"]
-    f_hydrogen = paste_seq.count("F")*net_hydrogen["F"]
-    g_hydrogen = paste_seq.count("G")*net_hydrogen["G"]
-    h_hydrogen = paste_seq.count("H")*net_hydrogen["H"]
-    i_hydrogen = paste_seq.count("I")*net_hydrogen["I"]
-    k_hydrogen = paste_seq.count("K")*net_hydrogen["K"]
-    l_hydrogen = paste_seq.count("L")*net_hydrogen["L"]
-    m_hydrogen = paste_seq.count("M")*net_hydrogen["M"]
-    n_hydrogen = paste_seq.count("N")*net_hydrogen["N"]
-    p_hydrogen = paste_seq.count("P")*net_hydrogen["P"]
-    q_hydrogen = paste_seq.count("Q")*net_hydrogen["Q"]
-    r_hydrogen = paste_seq.count("R")*net_hydrogen["R"]
-    s_hydrogen = paste_seq.count("S")*net_hydrogen["S"]
-    t_hydrogen = paste_seq.count("T")*net_hydrogen["T"]
-    v_hydrogen = paste_seq.count("V")*net_hydrogen["V"]
-    w_hydrogen = paste_seq.count("W")*net_hydrogen["W"]
-    y_hydrogen = paste_seq.count("Y")*net_hydrogen["Y"]
-
-    # PROPERTIES Q-P
-
-    aliphatic = round((i_i + l_l + v_v), 3)
-
-    negative_charged = round((d_d + e_e), 3)
-
-    total_charged = round((d_d + e_e + k_k + h_h + r_r), 3)
-
-    aromatic = round((f_f + h_h + w_w + y_y), 3)
-
-    polar = round((d_d + e_e + r_r + k_k + q_q + n_n), 3)
-
-    neutral = round((a_a + g_g + h_h + p_p + s_s + t_t + y_y), 3)
-
-    hydrophobic = round((c_c + f_f + i_i + l_l + m_m + v_v + w_w), 3)
-
-    positive_charged = round((k_k + r_r + h_h), 3)
-
-    tiny = round((a_a + c_c + d_d + g_g + s_s + t_t), 3)
-
-    small = round((e_e + h_h + i_i + l_l + k_k +
-                   m_m + n_n + p_p + q_q + v_v), 3)
-
-    large = round((f_f + r_r + w_w + y_y), 3)
-
-    # SCALES
-
-    kyleD = round((
-        (a_kyte+c_kyte+d_kyte +
-         e_kyte+f_kyte+g_kyte +
-         h_kyte+i_kyte+k_kyte +
-         l_kyte+m_kyte + n_kyte +
-         p_kyte+q_kyte+r_kyte +
-         s_kyte+t_kyte+v_kyte +
-         w_kyte+y_kyte)/len(paste_seq+str(0.000001))
-    ), 3)
-
-    molW = round(
-        (a_mw+c_mw+d_mw+e_mw +
-         f_mw+g_mw+h_mw+i_mw +
-         k_mw+l_mw+m_mw+n_mw +
-         p_mw+q_mw+r_mw+s_mw +
-         t_mw+v_mw+w_mw+y_mw), 3)
-
-    netCharge = a_charge+c_charge+d_charge+e_charge+f_charge+g_charge+h_charge+i_charge+k_charge + \
-        l_charge+m_charge+n_charge+p_charge+q_charge+r_charge + \
-        s_charge+t_charge+v_charge+w_charge+y_charge
-
-    netH = round((
-        a_hydrogen+c_hydrogen+d_hydrogen+e_hydrogen +
-        f_hydrogen+g_hydrogen+h_hydrogen+i_hydrogen +
-        k_hydrogen+l_hydrogen+m_hydrogen+n_hydrogen +
-        p_hydrogen+q_hydrogen+r_hydrogen+s_hydrogen +
-        t_hydrogen+v_hydrogen+w_hydrogen+y_hydrogen), 3)
-
-    result = str(rfc.predict([[
-        netH, netCharge, molW, kyleD,
-        a_a, c_c, d_d, e_e, f_f, g_g,
-        h_h, i_i, k_k, l_l, m_m, n_n,
-        p_p, q_q, r_r, s_s, t_t, v_v, w_w, y_y,
-        tiny, small, large, aliphatic, aromatic,
-        total_charged, negative_charged, positive_charged,
-        polar, neutral, hydrophobic]]))
-
-    return result
-
-
 def getResultsFile(filename):
     rfc = joblib.load('modelo_entrenado_2.pkl')
 
     with open(filename) as fp:
-        print('Name\tPredicted_Antiviral\tNew\tSequence\n')
+        print('Name\tPredicted_Antiviral\tSequence\n')
         for name, seq in read_fasta(fp):
             result = getResultsSeq(seq, rfc)
-            new_result = getResultsSeqClean(seq, rfc)
-            print('{}\t{}\t{}\t{}'.format(
+            print('{}\t{}\t{}'.format(
                 name.replace('>', ''),
                 result.replace('[', '').replace(']', '').strip(),
-                new_result.replace('[', '').replace(']', '').strip(),
                 seq
             ))
 
@@ -332,6 +135,8 @@ def calculation(self):
 
         paste_seq = str(self.pasteseq.toPlainText())
 
+        seq_size = len(paste_seq+str(0.000001))
+
         kyte_doolittle = {'A': 1.80, 'C': 2.50, 'D': -3.50, 'E': -3.50, 'F': 2.80,
                           'G': -0.40, 'H': -3.20, 'I': 4.50, 'K': -3.90, 'L': 3.80,
                           'M': 1.90, 'N': -3.50, 'P': -1.60, 'Q': -3.50, 'R': -4.50,
@@ -352,153 +157,63 @@ def calculation(self):
                         'M': 0, 'N': 2, 'P': 0, 'Q': 2, 'R': 4,
                         'S': 1, 'T': 1, 'V': 0, 'W': 1, 'Y': 1}
 
-        a_a = round(paste_seq.count("A")/len(paste_seq+str(0.000001)), 3)
-        c_c = round(paste_seq.count("C")/len(paste_seq+str(0.000001)), 3)
-        d_d = round(paste_seq.count("D")/len(paste_seq+str(0.000001)), 3)
-        e_e = round(paste_seq.count("E")/len(paste_seq+str(0.000001)), 3)
-        f_f = round(paste_seq.count("F")/len(paste_seq+str(0.000001)), 3)
-        g_g = round(paste_seq.count("G")/len(paste_seq+str(0.000001)), 3)
-        h_h = round(paste_seq.count("H")/len(paste_seq+str(0.000001)), 3)
-        i_i = round(paste_seq.count("I")/len(paste_seq+str(0.000001)), 3)
-        k_k = round(paste_seq.count("K")/len(paste_seq+str(0.000001)), 3)
-        l_l = round(paste_seq.count("L")/len(paste_seq+str(0.000001)), 3)
-        m_m = round(paste_seq.count("M")/len(paste_seq+str(0.000001)), 3)
-        n_n = round(paste_seq.count("N")/len(paste_seq+str(0.000001)), 3)
-        p_p = round(paste_seq.count("P")/len(paste_seq+str(0.000001)), 3)
-        q_q = round(paste_seq.count("Q")/len(paste_seq+str(0.000001)), 3)
-        r_r = round(paste_seq.count("R")/len(paste_seq+str(0.000001)), 3)
-        s_s = round(paste_seq.count("S")/len(paste_seq+str(0.000001)), 3)
-        t_t = round(paste_seq.count("T")/len(paste_seq+str(0.000001)), 3)
-        v_v = round(paste_seq.count("V")/len(paste_seq+str(0.000001)), 3)
-        w_w = round(paste_seq.count("W")/len(paste_seq+str(0.000001)), 3)
-        y_y = round(paste_seq.count("Y")/len(paste_seq+str(0.000001)), 3)
-
-        a_kyte = paste_seq.count("A")*kyte_doolittle["A"]
-        c_kyte = paste_seq.count("C")*kyte_doolittle["C"]
-        d_kyte = paste_seq.count("D")*kyte_doolittle["D"]
-        e_kyte = paste_seq.count("E")*kyte_doolittle["E"]
-        f_kyte = paste_seq.count("F")*kyte_doolittle["F"]
-        g_kyte = paste_seq.count("G")*kyte_doolittle["G"]
-        h_kyte = paste_seq.count("H")*kyte_doolittle["H"]
-        i_kyte = paste_seq.count("I")*kyte_doolittle["I"]
-        k_kyte = paste_seq.count("K")*kyte_doolittle["K"]
-        l_kyte = paste_seq.count("L")*kyte_doolittle["L"]
-        m_kyte = paste_seq.count("M")*kyte_doolittle["M"]
-        n_kyte = paste_seq.count("N")*kyte_doolittle["N"]
-        p_kyte = paste_seq.count("P")*kyte_doolittle["P"]
-        q_kyte = paste_seq.count("Q")*kyte_doolittle["Q"]
-        r_kyte = paste_seq.count("R")*kyte_doolittle["R"]
-        s_kyte = paste_seq.count("S")*kyte_doolittle["S"]
-        t_kyte = paste_seq.count("T")*kyte_doolittle["T"]
-        v_kyte = paste_seq.count("V")*kyte_doolittle["V"]
-        w_kyte = paste_seq.count("W")*kyte_doolittle["W"]
-        y_kyte = paste_seq.count("Y")*kyte_doolittle["Y"]
-
-        a_mw = paste_seq.count("A")*molecular_weigth["A"]
-        c_mw = paste_seq.count("C")*molecular_weigth["C"]
-        d_mw = paste_seq.count("D")*molecular_weigth["D"]
-        e_mw = paste_seq.count("E")*molecular_weigth["E"]
-        f_mw = paste_seq.count("F")*molecular_weigth["F"]
-        g_mw = paste_seq.count("G")*molecular_weigth["G"]
-        h_mw = paste_seq.count("H")*molecular_weigth["H"]
-        i_mw = paste_seq.count("I")*molecular_weigth["I"]
-        k_mw = paste_seq.count("K")*molecular_weigth["K"]
-        l_mw = paste_seq.count("L")*molecular_weigth["L"]
-        m_mw = paste_seq.count("M")*molecular_weigth["M"]
-        n_mw = paste_seq.count("N")*molecular_weigth["N"]
-        p_mw = paste_seq.count("P")*molecular_weigth["P"]
-        q_mw = paste_seq.count("Q")*molecular_weigth["Q"]
-        r_mw = paste_seq.count("R")*molecular_weigth["R"]
-        s_mw = paste_seq.count("S")*molecular_weigth["S"]
-        t_mw = paste_seq.count("T")*molecular_weigth["T"]
-        v_mw = paste_seq.count("V")*molecular_weigth["V"]
-        w_mw = paste_seq.count("W")*molecular_weigth["W"]
-        y_mw = paste_seq.count("Y")*molecular_weigth["Y"]
-
-        a_charge = paste_seq.count("A")*net_charge["A"]
-        c_charge = paste_seq.count("C")*net_charge["C"]
-        d_charge = paste_seq.count("D")*net_charge["D"]
-        e_charge = paste_seq.count("E")*net_charge["E"]
-        f_charge = paste_seq.count("F")*net_charge["F"]
-        g_charge = paste_seq.count("G")*net_charge["G"]
-        h_charge = paste_seq.count("H")*net_charge["H"]
-        i_charge = paste_seq.count("I")*net_charge["I"]
-        k_charge = paste_seq.count("K")*net_charge["K"]
-        l_charge = paste_seq.count("L")*net_charge["L"]
-        m_charge = paste_seq.count("M")*net_charge["M"]
-        n_charge = paste_seq.count("N")*net_charge["N"]
-        p_charge = paste_seq.count("P")*net_charge["P"]
-        q_charge = paste_seq.count("Q")*net_charge["Q"]
-        r_charge = paste_seq.count("R")*net_charge["R"]
-        s_charge = paste_seq.count("S")*net_charge["S"]
-        t_charge = paste_seq.count("T")*net_charge["T"]
-        v_charge = paste_seq.count("V")*net_charge["V"]
-        w_charge = paste_seq.count("W")*net_charge["W"]
-        y_charge = paste_seq.count("Y")*net_charge["Y"]
-
-        a_hydrogen = paste_seq.count("A")*net_hydrogen["A"]
-        c_hydrogen = paste_seq.count("C")*net_hydrogen["C"]
-        d_hydrogen = paste_seq.count("D")*net_hydrogen["D"]
-        e_hydrogen = paste_seq.count("E")*net_hydrogen["E"]
-        f_hydrogen = paste_seq.count("F")*net_hydrogen["F"]
-        g_hydrogen = paste_seq.count("G")*net_hydrogen["G"]
-        h_hydrogen = paste_seq.count("H")*net_hydrogen["H"]
-        i_hydrogen = paste_seq.count("I")*net_hydrogen["I"]
-        k_hydrogen = paste_seq.count("K")*net_hydrogen["K"]
-        l_hydrogen = paste_seq.count("L")*net_hydrogen["L"]
-        m_hydrogen = paste_seq.count("M")*net_hydrogen["M"]
-        n_hydrogen = paste_seq.count("N")*net_hydrogen["N"]
-        p_hydrogen = paste_seq.count("P")*net_hydrogen["P"]
-        q_hydrogen = paste_seq.count("Q")*net_hydrogen["Q"]
-        r_hydrogen = paste_seq.count("R")*net_hydrogen["R"]
-        s_hydrogen = paste_seq.count("S")*net_hydrogen["S"]
-        t_hydrogen = paste_seq.count("T")*net_hydrogen["T"]
-        v_hydrogen = paste_seq.count("V")*net_hydrogen["V"]
-        w_hydrogen = paste_seq.count("W")*net_hydrogen["W"]
-        y_hydrogen = paste_seq.count("Y")*net_hydrogen["Y"]
+        aa_list = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L',
+                   'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']
+
+        aa_counts = {k: paste_seq.count(k) for k in aa_list}
+
+        aa_perc = {k: round(aa_counts[k]/seq_size, 3) for k in aa_list}
 
         # PROPERTIES Q-P
 
-        aliphatic = round((i_i + l_l + v_v), 3)
+        aliphatic = partialDictRoundedSum(aa_perc, ['I', 'V', 'L'])
 
-        negative_charged = round((d_d + e_e), 3)
+        negative_charged = partialDictRoundedSum(aa_perc, ['D', 'E'])
 
-        total_charged = round((d_d + e_e + k_k + h_h + r_r), 3)
+        total_charged = partialDictRoundedSum(
+            aa_perc, ['D', 'E', 'K', 'H', 'R'])
 
-        aromatic = round((f_f + h_h + w_w + y_y), 3)
+        aromatic = partialDictRoundedSum(aa_perc, ['F', 'H', 'W', 'Y'])
 
-        polar = round((d_d + e_e + r_r + k_k + q_q + n_n), 3)
+        polar = partialDictRoundedSum(aa_perc, ['D', 'E', 'R', 'K', 'Q', 'N'])
 
-        neutral = round((a_a + g_g + h_h + p_p + s_s + t_t + y_y), 3)
+        neutral = partialDictRoundedSum(aa_perc,
+                                        ['A', 'G', 'H', 'P', 'S', 'T', 'Y'])
 
-        hydrophobic = round((c_c + f_f + i_i + l_l + m_m + v_v + w_w), 3)
+        hydrophobic = partialDictRoundedSum(aa_perc,
+                                            ['C', 'F', 'I', 'L', 'M', 'V', 'W'])
 
-        positive_charged = round((k_k + r_r + h_h), 3)
+        positive_charged = partialDictRoundedSum(aa_perc, ['K', 'R', 'H'])
 
-        tiny = round((a_a + c_c + d_d + g_g + s_s + t_t), 3)
+        tiny = partialDictRoundedSum(aa_perc, ['A', 'C', 'D', 'G', 'S', 'T'])
 
-        small = round((e_e + h_h + i_i + l_l + k_k +
-                       m_m + n_n + p_p + q_q + v_v), 3)
+        small = partialDictRoundedSum(aa_perc,
+                                      ['E', 'H', 'I', 'L', 'K', 'M', 'N', 'P', 'Q', 'V'])
 
-        large = round((f_f + r_r + w_w + y_y), 3)
+        large = partialDictRoundedSum(aa_perc, ['F', 'R', 'W', 'Y'])
 
         # SCALES
 
-        kyleD = round(((a_kyte+c_kyte+d_kyte+e_kyte+f_kyte+g_kyte+h_kyte+i_kyte+k_kyte+l_kyte+m_kyte +
-                        n_kyte+p_kyte+q_kyte+r_kyte+s_kyte+t_kyte+v_kyte+w_kyte+y_kyte)/len(paste_seq+str(0.000001))), 3)
+        kyleD = round(
+            sum(
+                [aa_counts[k]*kyte_doolittle[k] for k in aa_list]
+            )/seq_size, 3)
+
+        molW = round(sum([aa_counts[k]*molecular_weigth[k]
+                          for k in aa_list]), 3)
 
-        molW = round((a_mw+c_mw+d_mw+e_mw+f_mw+g_mw+h_mw+i_mw+k_mw +
-                      l_mw+m_mw+n_mw+p_mw+q_mw+r_mw+s_mw+t_mw+v_mw+w_mw+y_mw), 3)
+        netCharge = sum([aa_counts[k]*net_charge[k] for k in aa_list])
 
-        netCharge = a_charge+c_charge+d_charge+e_charge+f_charge+g_charge+h_charge+i_charge+k_charge + \
-            l_charge+m_charge+n_charge+p_charge+q_charge+r_charge + \
-            s_charge+t_charge+v_charge+w_charge+y_charge
+        netH = round(sum([aa_counts[k]*net_hydrogen[k] for k in aa_list]), 3)
 
-        netH = round((a_hydrogen+c_hydrogen+d_hydrogen+e_hydrogen+f_hydrogen+g_hydrogen+h_hydrogen+i_hydrogen+k_hydrogen+l_hydrogen +
-                      m_hydrogen+n_hydrogen+p_hydrogen+q_hydrogen+r_hydrogen+s_hydrogen+t_hydrogen+v_hydrogen+w_hydrogen+y_hydrogen), 3)
+        result = rfc.predict([
+            [netH, netCharge, molW, kyleD] +
+            [v for v in aa_perc.values()] +
+            [tiny, small, large, aliphatic, aromatic,
+             total_charged, negative_charged, positive_charged,
+             polar, neutral, hydrophobic]])
 
-        result = "Probable: " + str(rfc.predict([[netH, netCharge, molW, kyleD, a_a, c_c, d_d, e_e, f_f, g_g, h_h, i_i, k_k, l_l, m_m, n_n, p_p, q_q, r_r,
-                                                  s_s, t_t, v_v, w_w, y_y, tiny, small, large, aliphatic, aromatic, total_charged, negative_charged, positive_charged, polar, neutral, hydrophobic]]))
+        result = "Probable: " + str(result)
 
         self.textpred.setText(str(result))
         self.textpred1.setText(str(aliphatic))
@@ -516,8 +231,8 @@ def calculation(self):
         self.textpred13.setText(str(netCharge))
         self.textpred14.setText(str(netH))
         self.textpred15.setText(str(total_charged))
-        self.textrelat.setText("A: " + str(a_a) + " , " + "C: " + str(c_c) + " , " + "D: " + str(d_d) + " , " + "E: " + str(e_e) + " , " + "F: " + str(f_f) + " , " + "E: " + str(e_e) + " , " + "G: " + str(g_g) + " , " + "I: " + str(i_i) + " , " + "K: " + str(k_k) + " , " + "L: " + str(
-            l_l) + " , " + "M: " + str(m_m) + " , " + "N: " + str(n_n) + " , " + "P: " + str(p_p) + " , " + "Q: " + str(q_q) + " , " + "R: " + str(r_r) + " , " + "S: " + str(s_s) + " , " + "T: " + str(t_t) + " , " + "V: " + str(v_v) + " , " + "W: " + str(w_w) + " , " + "Y: " + str(y_y))
+        self.textrelat.setText(' , '.join(
+            [k+': '+str(aa_perc[k]) for k in aa_list]))
 
 
 if __name__ == "__main__":

From df8efebb90bb50cae2d8bfaab06ff51f7bab4cfc Mon Sep 17 00:00:00 2001
From: Caio <ccastro@localhost.localdomain>
Date: Thu, 15 Oct 2020 13:55:21 -0300
Subject: [PATCH 10/10] fix subversion

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 8fc7d88..589d480 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,7 +3,7 @@ joblib==0.17.0
 numpy==1.19.2
 PyQt5==5.15.1
 PyQt5-sip==12.8.1
-scikit-learn==0.19.2
+scikit-learn==0.19.1
 scipy==1.5.2
 six==1.15.0
 sklearn==0.0