-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathword2vec_baselines.py
More file actions
81 lines (68 loc) · 3.5 KB
/
word2vec_baselines.py
File metadata and controls
81 lines (68 loc) · 3.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
from utils import visualize_representation
from matplotlib import pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler, Imputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import StratifiedKFold
from sklearn.svm import SVC
import numpy as np
from sklearn.metrics import precision_recall_fscore_support
# Load the precomputed word2vec "sum" sentence representations and their labels.
# The .npz archive is produced by an earlier preprocessing step and must contain
# the four arrays referenced below.
npz_file = np.load('word2vec_sum_repr.npz')
data_train = npz_file['data_train']
data_test = npz_file['data_test']
labels_train = npz_file['labels_train']
labels_test = npz_file['labels_test']
# Shared log sink for all experiments in this module; closed at the bottom of
# the script after both experiments have run.
log_file = open('logs_word2vec_baselines.txt', mode='w')
def experiment_3():
    """
    Grid-search a random-forest classifier over the word2vec sum representations.

    Writes the best hyper-parameters, the best cross-validation score, the test
    accuracy and per-class precision/recall/f1/support to the module-level
    ``log_file``.

    Note: execution of this method takes a lot of time: using ec2 c4.8xlarge,
    results yielded after 1 hour. Please refer to 'best_params.txt' to get
    optimal parameters for all experiments.
    :return: None; all results are written to log_file.
    """
    pipeline = Pipeline([('forest', RandomForestClassifier(n_estimators=100))])
    params = dict(
        forest__n_estimators=[100, 200, 300, 600, 900, 1000, 2000, 4000],
        forest__criterion=['gini', 'entropy'],
        forest__max_depth=[None, 100, 50, 75],
        forest__min_samples_split=[2, 4, 8, 10, 100])
    # Old-style (pre-0.18) StratifiedKFold signature: (y, n_folds, ...).
    grid_search = GridSearchCV(pipeline, param_grid=params,
                               cv=StratifiedKFold(labels_train, 3, shuffle=True), n_jobs=-1)
    grid_search.fit(data_train, labels_train)
    print("Best params for word2vec_random forest:", file=log_file)
    print(grid_search.best_params_, file=log_file)
    print(grid_search.best_score_, file=log_file)
    print("Score is", file=log_file)
    # GridSearchCV refits the best configuration on the full training set by
    # default (refit=True), so reuse best_estimator_ instead of re-training a
    # second (potentially 4000-tree) forest from scratch as the original did.
    best_model = grid_search.best_estimator_
    print(best_model.score(data_test, labels_test), file=log_file)
    print('Stats for Word2Vec Random forest: Rows - precision, recall, f1, support; '
          'Columns: environment active lifestyle physical capacity other', file=log_file)
    print(precision_recall_fscore_support(
        labels_test, best_model.predict(data_test),
        labels=['environment', 'active lifestyle', 'physical capacity', 'other']), file=log_file)
def experiment_4():
    """
    Grid-search an SVM classifier over the word2vec sum representations.

    Writes the best hyper-parameters, the best cross-validation score, the test
    accuracy and per-class precision/recall/f1/support to the module-level
    ``log_file``.

    Note: execution of this method takes a relatively small time: using ec2
    c4.8xlarge, results yielded after 5 minutes. Please refer to
    'best_params.txt' to get optimal parameters for all experiments.
    :return: None; all results are written to log_file.
    """
    pipeline = Pipeline([('svm', SVC())])
    params = dict(
        svm__C=[0.1, 1, 5, 10, 50, 100],
        svm__kernel=['rbf', 'poly', 'linear'])
    # Old-style (pre-0.18) StratifiedKFold signature: (y, n_folds, ...).
    grid_search = GridSearchCV(pipeline, param_grid=params,
                               cv=StratifiedKFold(labels_train, 3, shuffle=True), n_jobs=-1)
    grid_search.fit(data_train, labels_train)
    # Fixed copy-paste typo from experiment_3: this is the SVM, not a forest.
    print("Best params for word2vec_svm:", file=log_file)
    print(grid_search.best_params_, file=log_file)
    print(grid_search.best_score_, file=log_file)
    print("Score is", file=log_file)
    # GridSearchCV refits the best configuration on the full training set by
    # default (refit=True); reuse it instead of fitting the pipeline again.
    best_model = grid_search.best_estimator_
    print(best_model.score(data_test, labels_test), file=log_file)
    print('Stats for Word2Vec SVM: Rows - precision, recall, f1, support; '
          'Columns: environment active lifestyle physical capacity other', file=log_file)
    print(precision_recall_fscore_support(
        labels_test, best_model.predict(data_test),
        labels=['environment', 'active lifestyle', 'physical capacity', 'other']), file=log_file)
def statistics():
    """Placeholder for additional statistics reporting; intentionally a no-op."""
# Run both baseline experiments, guaranteeing the shared log file is flushed
# and closed even if one of the experiments raises (the original leaked the
# handle on any exception).
try:
    experiment_3()
    experiment_4()
finally:
    log_file.close()