-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathword2vec_baselines.py
More file actions
81 lines (68 loc) · 3.5 KB
/
word2vec_baselines.py
File metadata and controls
81 lines (68 loc) · 3.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
from utils import visualize_representation
from matplotlib import pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler, Imputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import StratifiedKFold
from sklearn.svm import SVC
import numpy as np
from sklearn.metrics import precision_recall_fscore_support
# Load the precomputed word2vec "sum" sentence representations and their labels.
# The .npz archive is produced by an earlier preprocessing step and must contain
# the four arrays referenced below.
npz_file = np.load('word2vec_sum_repr.npz')
data_train = npz_file['data_train']
data_test = npz_file['data_test']
labels_train = npz_file['labels_train']
labels_test = npz_file['labels_test']
# Shared log sink for all experiments in this module; closed at the bottom of
# the script after both experiments have run.
log_file = open('logs_word2vec_baselines.txt', mode='w')
def experiment_3():
    """
    Grid-search a random-forest classifier over the word2vec sum representations.

    Writes the best hyper-parameters, the best cross-validation score, the test
    accuracy and per-class precision/recall/f1/support to the module-level
    ``log_file``.

    Note: execution of this method takes a lot of time: using ec2 c4.8xlarge,
    results yielded after 1 hour. Please refer to 'best_params.txt' to get
    optimal parameters for all experiments.
    :return: None; all results are written to log_file.
    """
    pipeline = Pipeline([('forest', RandomForestClassifier(n_estimators=100))])
    params = dict(
        forest__n_estimators=[100, 200, 300, 600, 900, 1000, 2000, 4000],
        forest__criterion=['gini', 'entropy'],
        forest__max_depth=[None, 100, 50, 75],
        forest__min_samples_split=[2, 4, 8, 10, 100])
    # Old-style (pre-0.18) StratifiedKFold signature: (y, n_folds, ...).
    grid_search = GridSearchCV(pipeline, param_grid=params,
                               cv=StratifiedKFold(labels_train, 3, shuffle=True), n_jobs=-1)
    grid_search.fit(data_train, labels_train)
    print("Best params for word2vec_random forest:", file=log_file)
    print(grid_search.best_params_, file=log_file)
    print(grid_search.best_score_, file=log_file)
    print("Score is", file=log_file)
    # GridSearchCV refits the best configuration on the full training set by
    # default (refit=True), so reuse best_estimator_ instead of re-training a
    # second (potentially 4000-tree) forest from scratch as the original did.
    best_model = grid_search.best_estimator_
    print(best_model.score(data_test, labels_test), file=log_file)
    print('Stats for Word2Vec Random forest: Rows - precision, recall, f1, support; '
          'Columns: environment active lifestyle physical capacity other', file=log_file)
    print(precision_recall_fscore_support(
        labels_test, best_model.predict(data_test),
        labels=['environment', 'active lifestyle', 'physical capacity', 'other']), file=log_file)
def experiment_4():
    """
    Grid-search an SVM classifier over the word2vec sum representations.

    Writes the best hyper-parameters, the best cross-validation score, the test
    accuracy and per-class precision/recall/f1/support to the module-level
    ``log_file``.

    Note: execution of this method takes a relatively small time: using ec2
    c4.8xlarge, results yielded after 5 minutes. Please refer to
    'best_params.txt' to get optimal parameters for all experiments.
    :return: None; all results are written to log_file.
    """
    pipeline = Pipeline([('svm', SVC())])
    params = dict(
        svm__C=[0.1, 1, 5, 10, 50, 100],
        svm__kernel=['rbf', 'poly', 'linear'])
    # Old-style (pre-0.18) StratifiedKFold signature: (y, n_folds, ...).
    grid_search = GridSearchCV(pipeline, param_grid=params,
                               cv=StratifiedKFold(labels_train, 3, shuffle=True), n_jobs=-1)
    grid_search.fit(data_train, labels_train)
    # Fixed copy-paste typo from experiment_3: this is the SVM, not a forest.
    print("Best params for word2vec_svm:", file=log_file)
    print(grid_search.best_params_, file=log_file)
    print(grid_search.best_score_, file=log_file)
    print("Score is", file=log_file)
    # GridSearchCV refits the best configuration on the full training set by
    # default (refit=True); reuse it instead of fitting the pipeline again.
    best_model = grid_search.best_estimator_
    print(best_model.score(data_test, labels_test), file=log_file)
    print('Stats for Word2Vec SVM: Rows - precision, recall, f1, support; '
          'Columns: environment active lifestyle physical capacity other', file=log_file)
    print(precision_recall_fscore_support(
        labels_test, best_model.predict(data_test),
        labels=['environment', 'active lifestyle', 'physical capacity', 'other']), file=log_file)
def statistics():
    """Placeholder for additional statistics reporting; intentionally a no-op."""
# Run both baseline experiments, guaranteeing the shared log file is flushed
# and closed even if one of the experiments raises (the original leaked the
# handle on any exception).
try:
    experiment_3()
    experiment_4()
finally:
    log_file.close()