-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpreprocessing.py
More file actions
75 lines (62 loc) · 3.09 KB
/
preprocessing.py
File metadata and controls
75 lines (62 loc) · 3.09 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
import numpy as np
import scipy.stats
import matplotlib.pyplot as plt
def add_dates_features(data):
    """Return ``data.x`` with ``data.dates_x`` appended as extra features.

    The two arrays are joined with ``np.hstack``; neither input is modified.
    """
    # the date information becomes additional feature columns
    return np.hstack([data.x, data.dates_x])
def count_zeros(data):
    """Print the number of zero entries in ``data.x`` and plot them per column.

    Two equivalent counts of the zeros are printed side by side as a
    cross-check, then the per-column zero counts are plotted.  The call
    blocks in ``plt.show()`` until the plot window is closed.

    Returns:
        None.
    """
    x = data.x
    is_zero = x == 0

    # count zeros in data, computed in two independent ways
    zero_count = is_zero.sum()
    # total number of entries minus the number of non-zero entries; no
    # ``axis`` argument here, otherwise count_nonzero returns per-row counts
    zero_count1 = x.size - np.count_nonzero(x)
    print(zero_count, zero_count1)

    # count zeros in data per column
    col_zero_count = is_zero.sum(0)
    plt.plot(col_zero_count, '.')
    plt.xlabel('#movie')
    plt.ylabel('number of missing values')
    plt.title('num of zeros in each column')
    plt.show()  # waits for user to close the window to continue
def _fill_zeros_with_mean(x):
    """Return a float copy of ``x`` with zeros replaced by the mean of its non-zero entries."""
    # np.array(...) always copies, so the caller's array is never written to,
    # and the float dtype keeps the mean from being truncated for integer input
    filled = np.array(x, dtype=float)
    observed = filled != 0
    if observed.any():
        filled[~observed] = filled[observed].mean()
    return filled


def imputation(data, imp_type):
    """Fill the zero entries of ``data.x`` and store the result on ``data``.

    Zeros in ``data.x`` are treated as missing values.  ``data.x`` itself is
    never modified.

    Args:
        data: object with a 2-D array attribute ``x``; for ``imp_type == 2``
            it must also have ``y``, a vector with one entry per row of ``x``.
        imp_type: imputation strategy.
            0 -- no imputation; ``data.x_imputed`` is ``data.x`` itself.
            1 -- every zero is replaced by the mean of all non-zero entries.
            2 -- every zero is replaced by that row's average over the columns
                 whose Pearson correlation with ``data.y`` exceeds 0.3 (zeros
                 in those columns are first replaced by the global mean).  If
                 no column passes the threshold the global mean is used.
            Any other value leaves ``data`` untouched.

    Returns:
        None.  Results are written to ``data.x_imputed`` (and, for
        ``imp_type == 2``, the mean-filled matrix to ``data.x_imputed_tmp``).
    """
    if imp_type == 0:
        data.x_imputed = data.x
    elif imp_type == 1:
        data.x_imputed = _fill_zeros_with_mean(data.x)
    elif imp_type == 2:
        # threshold for similarity to "miss congeniality" (test movie)
        corr_th = 0.3
        x = np.asarray(data.x)
        y = np.ravel(data.y)
        n_movies = x.shape[1]

        corr_np = np.zeros(n_movies)
        corr_sp = np.zeros(n_movies)
        pval_sc = np.zeros(n_movies)
        for ind in range(n_movies):
            # raw (unnormalised) correlation: the dot product of the two vectors
            corr_np[ind] = np.dot(x[:, ind], y)
            # pearsonr also returns the 2-tailed p-value;
            # r == mean((x - mean(x)) * (y - mean(y))) / (std(x) * std(y))
            corr_sp[ind], pval_sc[ind] = scipy.stats.pearsonr(x[:, ind], y)

        # corr_sp is used, not corr_np (which is shown only for educational purposes)
        movies = range(n_movies)
        plt.plot(movies, corr_np / corr_np.max(), movies, corr_sp, movies, pval_sc)
        # plt.show()

        # Select by threshold directly: NaN correlations (constant columns)
        # compare False and are excluded, and an empty selection stays empty.
        similar = corr_sp > corr_th

        # mean-filled float copy used as the source of the similar-movie ratings
        data.x_imputed_tmp = _fill_zeros_with_mean(x)

        x_float = np.array(x, dtype=float)
        missing = x_float == 0
        if similar.any():
            # row average over all similar movies of the same user
            per_user = data.x_imputed_tmp[:, similar].mean(axis=1)
            # replacing zero ratings with floats raises the number of errors
            # in the data but reduces the srmse
            data.x_imputed = np.where(missing, per_user[:, None], x_float)
        else:
            # no sufficiently similar movie: fall back to the global mean
            data.x_imputed = np.where(missing, data.x_imputed_tmp, x_float)