python-destin/load_data.py at master · jswiergo/python-destin · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
#  -*- coding: utf-8 -*-
from numpy import *
import cPickle
from random import randrange
import numpy as np
#  Directory that holds the CIFAR python batch files read by the loaders
#  in this module ('data_batch_1' .. 'data_batch_5' and 'test_batch').
#  Edit this path to match the local machine.
cifar_dir = '/home/teddy/Desktop/Cifar/'
# cifar_dir = '/home/eskender/Destin/cifar-10-batches-py/'
#  This module loads the CIFAR batches and extracts the image patches
#  that are fed as input to the lower layer nodes.


def read_cifar_file(fn):
    """Unpickle one CIFAR batch file and return the stored object.

    fn -- path of the pickled batch file.

    Returns whatever object was pickled into the file (the loaders in this
    module index it with the keys 'data' and 'labels').

    Raises IOError/OSError when the file cannot be opened and the usual
    unpickling errors when its content is not a valid pickle.

    NOTE: unpickling can execute arbitrary code; only call this on batch
    files that come from a trusted source.
    """
    # The context manager closes the handle even when unpickling fails.
    with open(fn, 'rb') as batch_file:
        return cPickle.load(batch_file)


def _whitening_matrix(patches, eps=1e-9):
    """Return the symmetric whitening matrix E * (D + eps)^(-1/2) * E^T.

    patches -- 2-D array with one sample per row (already normalised).
    eps     -- small constant added to the eigenvalues before inverting.

    E and D are the eigenvectors / eigenvalues of the covariance matrix of
    the columns of `patches`.
    """
    patch_cov = cov(patches, rowvar=0)
    # The covariance matrix is symmetric, so eigh applies: it guarantees
    # real eigenvalues and orthonormal eigenvectors (plain eig does not).
    d, e = linalg.eigh(patch_cov)
    # Round-off may push tiny eigenvalues slightly below zero; clamp them
    # and regularise only the eigenvalues themselves (the diagonal of D).
    inv_sqrt_d = 1.0 / sqrt(maximum(d, 0.0) + eps)
    # Scaling column k of e by inv_sqrt_d[k] equals e.dot(diag(inv_sqrt_d)).
    return (e * inv_sqrt_d).dot(e.T)


def load_cifar(psz=4):
    """Compute patch normalisation statistics and a whitening matrix.

    Reads the five training batches and the test batch from `cifar_dir`,
    samples 200000 random psz x psz colour patches from the training
    images and derives per-feature mean, standard deviation and the
    whitening matrix of the normalised patches.

    psz -- side length of the square patches, 1 <= psz <= 32.

    Returns a dict with the keys
        'patch_mean' -- per-feature mean, length psz * psz * 3
        'patch_std'  -- per-feature standard deviation, same length
        'whiten_mat' -- (psz*psz*3, psz*psz*3) whitening matrix

    Raises ValueError when psz does not fit into a 32 x 32 image.
    """
    import os  # local import: only needed to build the batch paths

    side = 32
    n_train = 50000
    n_test = 10000
    width = 10000          # images per batch file
    n_patches = 200000

    # Fail before loading the data set instead of deep inside the sampler.
    if not 1 <= psz <= side:
        raise ValueError('psz must be between 1 and %d, got %r' % (side, psz))

    #  gather data
    train_data = empty((n_train, 3072))
    test_data = empty((n_test, 3072))
    train_labels = empty(n_train)
    test_labels = empty(n_test)

    for k in range(5):
        dic = read_cifar_file(
            os.path.join(cifar_dir, 'data_batch_' + str(k + 1)))
        start = k * width
        train_data[start:start + width, :] = dic['data']
        train_labels[start:start + width] = array(dic['labels'])

    dic = read_cifar_file(os.path.join(cifar_dir, 'test_batch'))
    test_data[:, :] = dic['data']
    test_labels[:] = array(dic['labels'])

    #  reshape data into images: every row is stored as (3, 32, 32) and is
    #  viewed as (32, 32, 3); the transpose is a view, no data is copied
    train_data = train_data.reshape(n_train, 3, side, side)
    train_data = train_data.transpose(0, 2, 3, 1)
    test_data = test_data.reshape(n_test, 3, side, side)
    test_data = test_data.transpose(0, 2, 3, 1)

    #  get random patches
    patches = empty((n_patches, psz * psz * 3))
    # A patch may start at offsets 0 .. side - psz inclusive, hence the +1
    # (randrange excludes its upper bound).
    n_offsets = side - psz + 1
    for i in range(n_patches):
        im = randrange(n_train)
        a = randrange(n_offsets)
        b = randrange(n_offsets)
        patches[i] = train_data[im, a:a + psz, b:b + psz, :].reshape(-1)

    #  get statistics
    patch_mean = mean(patches, axis=0)
    patch_std = std(patches, axis=0)

    #  zero mean and unit variance
    patches = (patches - patch_mean) / patch_std

    #  whitening stuff using notation from:
    #  http://web.eecs.utk.edu/~itamar/Papers/ICMLA2012_Derek.pdf
    v = _whitening_matrix(patches, eps=1e-9)

    ret = {}
    # ret['train_data'] = train_data
    # ret['test_data'] = test_data
    # ret['train_labels'] = train_labels
    # ret['test_labels'] = test_labels
    ret['patch_mean'] = patch_mean
    ret['patch_std'] = patch_std
    ret['whiten_mat'] = v

    return ret


def loadCifar(batchNum):
    """Load CIFAR images and labels from the batch files in `cifar_dir`.

    batchNum -- selects what is loaded:
        1 .. 5        the training batch 'data_batch_<batchNum>'
        6             the test batch 'test_batch'
        anything > 6  all five training batches stacked along axis 0

    Returns a (data, labels) pair.  For a single batch both are returned
    exactly as stored in the file; for the whole training set both are
    numpy arrays produced by np.concatenate.

    Any value <= 5 is treated as a training batch number, so values below
    1 fail when the corresponding file cannot be opened.
    """
    import os  # local import: only needed to build the batch paths

    def batch_path(name):
        # os.path.join copes with cifar_dir with or without trailing slash
        return os.path.join(cifar_dir, name)

    #  For training_batches specify numbers 1 to 5
    #  for the test set pass 6
    if batchNum <= 5:
        batch = read_cifar_file(batch_path('data_batch_' + str(batchNum)))
        return batch['data'], batch['labels']
    if batchNum == 6:
        batch = read_cifar_file(batch_path('test_batch'))
        return batch['data'], batch['labels']

    #  here we will get the whole 50,000x3072 dataset
    batches = [read_cifar_file(batch_path('data_batch_' + str(k)))
               for k in range(1, 6)]
    # One concatenate per output instead of re-copying the growing array
    # after every batch.
    data = np.concatenate([b['data'] for b in batches], axis=0)
    labels = np.concatenate([b['labels'] for b in batches], axis=0)
    return data, labels


def return_node_input(input_, Position, Ratio, mode, image_type):
    """Cut the square patch a node receives out of `input_`.

    input_     -- array indexed as [row, column] (plus a trailing channel
                  axis for colour images).
    Position   -- sequence whose first two items are the row and column
                  of the patch's top-left corner.
    Ratio      -- side length of the square patch.
    mode       -- 'Adjacent' selects non-overlapping patches; every other
                  value is reported as not implemented.
    image_type -- 'Color' means three values per pixel, anything else one.

    Returns the patch flattened to shape (1, Ratio * Ratio * depth) in
    'Adjacent' mode and an empty array otherwise.

    Raises ValueError (from reshape) when the selected window does not
    hold exactly Ratio * Ratio * depth values, e.g. when it runs over the
    border of `input_`.
    """
    if mode != 'Adjacent':
        # TODO Overlapping Patch could be fed to a node
        print('Overlapping Patches Are Not Implemented Yet')
        return np.array([])

    #  Non overlapping or Adjacent Patches
    patch_height = Ratio
    patch_width = Ratio
    patch_depth = 3 if image_type == 'Color' else 1

    row, col = Position[0], Position[1]
    window = input_[row:row + patch_height, col:col + patch_width]
    return window.reshape(1, patch_height * patch_width * patch_depth)