-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathMain.py
More file actions
71 lines (59 loc) · 2.34 KB
/
Main.py
File metadata and controls
71 lines (59 loc) · 2.34 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
# -*- coding: utf-8 -*-
"""
Created on Thu Nov 16 14:40:06 2017
@author: KRapes
MAIN
"""
import json_management
import Supervised_Learning
import os
from sklearn.model_selection import train_test_split
# ---------------------------------------------------------------------------
# Stage 1: build the labeled dataset and hold out a validation split.
# ---------------------------------------------------------------------------
print("Preparing the Dataset")
pruning_percent = 0.7

# Remove pickles left behind by an earlier run (presumably caches written by
# json_management -- confirm there).  Each file is removed on its own so that
# a missing 'df.pkl' no longer prevents 'vocabulary.pkl' from being deleted,
# and only "file does not exist" is silently ignored.
for cached_file in ('df.pkl', 'vocabulary.pkl'):
    try:
        os.remove(cached_file)
    except FileNotFoundError:
        # Nothing to clean up for this file.
        pass
    except OSError as err:
        # Stay best-effort (do not abort the run), but make it visible that a
        # stale file may still be on disk.
        print("Warning: could not remove {}: {}".format(cached_file, err))

df = json_management.prepare_df_labeled(pruning_percent)
print("The Dataset Contains {} Unique Messages".format(len(df)))

# Keep only rows whose cluster is not -1 (-1 is treated as "not yet labeled"
# further down the script), then hold out 10% of them for validation.
# NOTE(review): no random_state is passed, so the split differs on every run.
df_human = df[df.cluster != -1]
df_human, df_validation = train_test_split(df_human, test_size=0.10)
print("There are {} training messages and {} validation messages".format(len(df_human),
                                                                         len(df_validation)))
print("")

# Index labels of the held-out rows; passed to best_pruning_percent later on.
validation_idx = list(df_validation.index)
# ---------------------------------------------------------------------------
# Stage 2: choose a classifier, then tune the vocabulary pruning level.
# ---------------------------------------------------------------------------
print("Finding the Best Classifier")
clf, score = Supervised_Learning.best_classifier(df_human, pruning_percent)

# Blank line, heading, the classifier's printed form, blank line.
print("", "The Best Classifier is:", clf, "", sep="\n")

print("Optimizing the Vocabulary")
# best_pruning_percent hands back a classifier together with the pruning
# percentage to use from here on; both names are rebound.
clf, pruning_percent = Supervised_Learning.best_pruning_percent(clf, validation_idx)
print()
# ---------------------------------------------------------------------------
# Stage 3: label the remaining rows with the classifier and report counts.
# ---------------------------------------------------------------------------
print("Labeling the Data")

# Rebuild the dataset with the tuned pruning percentage, then take the rows
# still marked -1 and let the classifier assign them a cluster.  The .copy()
# means the predictions exist only in df_machine, never in df.
df = json_management.prepare_df_labeled(pruning_percent)
df_machine = df[df.cluster == -1].copy()
df_machine = Supervised_Learning.predict_cluster(clf, df_machine)

# Count the predicted classes in df_machine.  Counting in df (as before) only
# saw the rows that already carried a label, so the report did not describe
# the machine-labeled data and "total - l_total" mixed two populations.
# Assumes predict_cluster returns a frame with a 'cluster' column -- TODO confirm.
total = len(df_machine)
motivated = len(df_machine[df_machine['cluster'] == 0])
genuine = len(df_machine[df_machine['cluster'] == 1])
l_total = motivated + genuine

# Guard the percentages: with no rows in cluster 0 or 1 the division would
# raise ZeroDivisionError.
if l_total:
    motivated_pct = round(100 * motivated / l_total, 2)
    genuine_pct = round(100 * genuine / l_total, 2)
else:
    motivated_pct = genuine_pct = 0.0

# Every machine-labeled row that is neither cluster 0 nor 1 is reported as spam.
print("{} Entries Have Now Been Labeled and"
      " {} Entries Have Been Marked As Spam" .format(l_total,
                                                     total - l_total))
print("Finacially Motivated: {}/{} ({}%) "
      "Genuine Expression: {}/{} ({}%)".format(motivated,
                                               l_total,
                                               motivated_pct,
                                               genuine,
                                               l_total,
                                               genuine_pct))
print("")
# ---------------------------------------------------------------------------
# Stage 4: persist the machine-labeled rows and score the held-out split.
# ---------------------------------------------------------------------------
json_management.save_obj(df_machine, 'df_Machine_Labeled')

# Map the held-out rows onto the rebuilt dataset (presumably so they pick up
# features computed with the tuned vocabulary -- confirm in relate_dfs).
df_validation = Supervised_Learning.relate_dfs(df, df_validation)

# Drop rows with cluster -1.  The mask must come from df_validation itself:
# a boolean Series built from df has a different index, so pandas either
# reindexes it with a warning or raises an IndexingError.
df_validation = df_validation[df_validation.cluster != -1]

score = clf.score(list(df_validation.features), list(df_validation.cluster))
print("The Final Validation Score is {}".format(round(score, 2)))
print("Done")