deep-speech-command/learn.py at master · stefancosquer/deep-speech-command · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
import os
import random

import numpy as np
import matplotlib.pyplot as plt
from keras.callbacks import EarlyStopping
from keras.layers import MaxPool1D, Conv1D, Dropout, BatchNormalization, GlobalAvgPool1D
from keras.models import Sequential
from keras.layers.core import Dense
from keras.optimizers import Adam
from keras.utils import to_categorical
from scipy.io.wavfile import read

# Number of samples in one network input window; every clip is padded to this
# length before being fed to the model.
# NOTE(review): presumably also the recordings' sample rate (i.e. a one-second
# window) -- the rate returned by the WAV reader is never checked; confirm.
SAMPLERATE = 44100

# Class labels. A label's position in this list is its index in the one-hot
# target vector and in the softmax output.
CLASSES = ['-', '1', '2', '3', '4', '5', '6', '7', '8', '9']

# 1-D convolutional classifier working directly on the raw waveform:
# three Conv1D stages (the first two followed by max-pooling), each with
# batch normalisation and dropout, then global average pooling and a softmax.
model = Sequential([
    Conv1D(64, 88, activation='relu', input_shape=(SAMPLERATE, 1)),
    MaxPool1D(3),
    BatchNormalization(),
    Dropout(0.2),
    Conv1D(64, 4, activation='relu'),
    MaxPool1D(3),
    BatchNormalization(),
    Dropout(0.2),
    Conv1D(128, 4, activation='relu'),
    BatchNormalization(),
    Dropout(0.4),
    GlobalAvgPool1D(),
    # One output unit per class, derived from CLASSES so the output layer and
    # the one-hot targets cannot drift apart.
    Dense(len(CLASSES), activation='softmax'),
])

model.compile(optimizer=Adam(lr=0.001), loss='categorical_crossentropy', metrics=['accuracy'])

model.summary()

## Train

def generator(directory, batch_size, shuffle=False,
              background_path='backgrounds/hospital.wav'):
    """Yield an endless stream of ``(x, y)`` batches built from WAV clips.

    Each clip is padded to ``SAMPLERATE`` samples, mixed with a random
    ``SAMPLERATE``-long excerpt of the background recording and normalised
    to zero mean / unit variance over the whole window.

    Args:
        directory: Folder containing the ``.wav`` clips. The first character
            of each file name is the class label.
        batch_size: Number of clips per yielded batch. When it exceeds the
            number of files, files are reused within the batch.
        shuffle: When true, the file list is reshuffled before every batch,
            so each batch is a fresh random selection. When false, every
            batch uses the same files (only the background excerpt varies).
        background_path: WAV file used as background noise.

    Yields:
        ``(x, y)`` where ``x`` has shape ``(batch_size, SAMPLERATE, 1)`` and
        ``y`` is one-hot with shape ``(batch_size, len(CLASSES))``.

    Raises:
        ValueError: If ``directory`` contains no ``.wav`` file, the
            background is shorter than ``SAMPLERATE`` samples, or a file name
            starts with a character that is not a known class label.

    NOTE(review): clips and background are assumed to be mono (1-D arrays);
    stereo files would need down-mixing first -- confirm against the data.
    """
    _, background = read(background_path)
    if len(background) < SAMPLERATE:
        raise ValueError('background recording is shorter than %d samples: %s'
                         % (SAMPLERATE, background_path))

    files = [name for name in os.listdir(directory) if name.endswith(".wav")]
    if not files:
        raise ValueError('no .wav files found in ' + directory)
    np.random.shuffle(files)

    while True:
        if shuffle:
            np.random.shuffle(files)

        # Allocate fresh arrays for every batch: the consumer may still hold
        # (or have queued) a previously yielded batch, which must not be
        # overwritten while the next one is being filled.
        x = np.zeros((batch_size, SAMPLERATE, 1))
        y = np.zeros((batch_size, len(CLASSES)))

        for i in range(batch_size):
            # Wrap around so a batch larger than the file list still works.
            name = files[i % len(files)]
            _, samples = read(os.path.join(directory, name))

            # Fit the clip into the window: cut anything beyond SAMPLERATE,
            # then pad with one leading zero (when there is room) and fill
            # the remainder with trailing zeros.
            samples = samples[:SAMPLERATE]
            before = min(1, SAMPLERATE - samples.size)
            after = SAMPLERATE - samples.size - before
            samples = np.concatenate((np.zeros(before), samples, np.zeros(after)))

            # Mix with a random background excerpt; the upper bound is
            # exclusive, hence the +1 to allow the last valid offset.
            idx = np.random.randint(0, len(background) - SAMPLERATE + 1)
            samples = samples + background[idx:idx + SAMPLERATE]

            # Normalise; skip the division for a constant signal to avoid NaNs.
            mean = np.mean(samples)
            std = np.std(samples)
            samples = samples - mean
            if std > 0:
                samples = samples / std
            x[i, :, 0] = samples

            # The first character of the file name is the label. Accept a
            # literal class label (including '-') and, as before, a plain
            # digit used directly as the class index.
            label = name[0]
            class_index = CLASSES.index(label) if label in CLASSES else int(label)
            y[i, class_index] = 1

        yield x, y

# Training and validation streams. Only the training stream reshuffles the
# file list between batches.
# NOTE(review): both streams read the same 'recordings' directory, so the
# validation metrics are computed on clips the model is also trained on.
train_batches = generator('recordings', 9, shuffle=True)
validation_batches = generator('recordings', 9)

# Stop once the monitored quantity has not improved for 100 epochs and roll
# the model back to its best weights.
early_stopping = EarlyStopping(patience=100, restore_best_weights=True)

# One batch of 9 clips per epoch for training and for validation.
history = model.fit_generator(
    train_batches,
    steps_per_epoch=1,
    epochs=1000,
    verbose=1,
    callbacks=[early_stopping],
    validation_data=validation_batches,
    validation_steps=1)


## Evaluate

# Per-epoch metrics recorded during training.
history_data = history.history

# Older Keras releases record the accuracy metric as 'acc'/'val_acc'; newer
# ones use the name passed to compile(), i.e. 'accuracy'/'val_accuracy'.
# Support both so plotting does not fail with a KeyError after training.
accuracy_key = 'acc' if 'acc' in history_data else 'accuracy'

# One figure per metric, each showing the training and validation curves.
for metric, title, ylabel in (
        (accuracy_key, 'Model accuracy', 'Accuracy'),
        ('loss', 'Model loss', 'Loss')):
    plt.plot(history_data[metric])
    plt.plot(history_data['val_' + metric])
    plt.title(title)
    plt.ylabel(ylabel)
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Test'], loc='upper left')
    plt.show()

## Test

def test(file, directory='recordings'):
    """Classify one recording with the trained model and print the result.

    The clip is fitted to ``SAMPLERATE`` samples (cut if longer, zero-padded
    at the end if shorter) and then normalised over the whole window, which
    is the same order the training generator uses, so the network sees
    inputs on the scale it was trained on.

    Args:
        file: Name of the WAV file inside ``directory``.
        directory: Folder containing the recording.

    Prints:
        ``"<file> : <index>"`` where ``<index>`` is the position of the
        highest-scoring class in the model output.

    NOTE(review): the clip is assumed to be mono (a 1-D array) -- confirm.
    """
    _, samples = read(os.path.join(directory, file))

    # Fit the clip into the input window before normalising.
    samples = samples[:SAMPLERATE]
    samples = np.concatenate((samples, np.zeros(SAMPLERATE - samples.size)))

    # Zero mean / unit variance; skip the division for a constant signal.
    mean = np.mean(samples)
    std = np.std(samples)
    samples = samples - mean
    if std > 0:
        samples = samples / std

    # The model expects (batch, samples, channels).
    samples = samples.reshape(1, SAMPLERATE, 1)
    predictions = model.predict(samples)
    print(file + ' : ' + str(np.argmax(predictions)))


# Run the classifier on one sample recording for each digit 1-9, in order.
for digit in range(1, 10):
    test('{}_stefan_0.wav'.format(digit))