Commit 0522b58

phone based scheduled sampling
1 parent cf03bda commit 0522b58

3 files changed: +86 −23 lines

asr/models/las/loss.py

Lines changed: 63 additions & 0 deletions
@@ -0,0 +1,63 @@
+import torch
+from torch.nn.modules.loss import _Loss
+
+from asr.utils.misc import onehot2int
+
+class EditDistanceLoss(_Loss):
+    __constants__ = ['reduction']
+
+    def __init__(self, size_average=None, reduce=None, reduction='mean'):
+        super().__init__(size_average, reduce, reduction)
+
+    def forward(self, input, target, input_seq_lens, target_seq_lens):
+        """
+        input: BxTxH, target: BxN, input_seq_lens: B, target_seq_lens: B
+        """
+        batch_size = input.size(0)
+        eds = list()
+        for b in range(batch_size):
+            # argmax-decode each utterance up to its valid length
+            x = torch.argmax(input[b, :input_seq_lens[b]], dim=-1)
+            y = target[b, :target_seq_lens[b]]
+            d = self.calculate_levenshtein(x, y)
+            eds.append(d)
+        loss = torch.FloatTensor(eds)
+
+        if self.reduction == 'none':
+            return loss
+        elif self.reduction == 'mean':
+            return loss.mean()
+        elif self.reduction == 'sum':
+            return loss.sum()
+
+    def calculate_levenshtein(self, seq1, seq2):
+        """
+        the extension of the Wagner-Fischer dynamic programming algorithm
+        with adjacent transpositions (optimal string alignment distance)
+        """
+        # the DP table needs one extra row/column for the empty prefixes
+        size_x, size_y = len(seq1) + 1, len(seq2) + 1
+        matrix = torch.zeros((size_x, size_y))
+        for x in range(size_x):
+            matrix[x, 0] = x
+        for y in range(size_y):
+            matrix[0, y] = y
+
+        for x in range(1, size_x):
+            for y in range(1, size_y):
+                # matrix[x, y] covers seq1[:x] vs seq2[:y], so compare at x-1 / y-1
+                cost = 0 if seq1[x - 1] == seq2[y - 1] else 1
+                comps = torch.LongTensor([
+                    matrix[x - 1, y] + 1,          # deletion
+                    matrix[x, y - 1] + 1,          # insertion
+                    matrix[x - 1, y - 1] + cost,   # substitution
+                ])
+                matrix[x, y] = torch.min(comps)
+                if x > 1 and y > 1 and seq1[x - 1] == seq2[y - 2] and seq1[x - 2] == seq2[y - 1]:
+                    comps = torch.LongTensor([
+                        matrix[x, y],
+                        matrix[x - 2, y - 2] + cost,   # transposition
+                    ])
+                    matrix[x, y] = torch.min(comps)
+
+        return matrix[-1, -1]
+
+if __name__ == "__main__":
+    x = torch.LongTensor([0, 1, 2])
+    y = torch.LongTensor([0, 2, 1, 3])
+    l = EditDistanceLoss()
+    print(l.calculate_levenshtein(x, y))   # tensor(2.): one transposition + one insertion
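
Editor's note on this new loss (not part of the commit): forward() argmax-decodes each utterance up to its valid input length and scores it against the trimmed target, but the per-utterance distances are collected into a fresh FloatTensor, so no gradient can flow back through the network; that is consistent with EditDistanceLoss remaining commented out as a training loss in train.py below. A minimal usage sketch, with every shape and value invented for illustration:

import torch

from asr.models.las.loss import EditDistanceLoss

# hypothetical toy batch: B=2 utterances, T=4 frames, H=5 labels
scores = torch.randn(2, 4, 5)                              # BxTxH label scores
targets = torch.LongTensor([[0, 2, 1, 3], [1, 4, 0, 0]])   # BxN, padded
input_lens = torch.LongTensor([4, 3])
target_lens = torch.LongTensor([4, 2])

criterion = EditDistanceLoss(reduction='mean')
print(criterion(scores, targets, input_lens, target_lens)) # mean edit distance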

asr/models/las/network.py

Lines changed: 11 additions & 15 deletions
@@ -194,6 +194,7 @@ def __init__(self, listen_vec_size, label_vec_size, max_seq_lens=256, sos=None,
         self.eos = label_vec_size - 1 if eos is None else eos
         self.max_seq_lens = max_seq_lens
         self.num_eos = 3
+        self.tfr = 1.

         Hs, Hc, Hy = rnn_hidden_size, listen_vec_size, label_vec_size

@@ -222,6 +223,9 @@ def get_mask(self, h, seq_lens):
             mask[b, seq_lens[b]:] = 0.
         return mask

+    def _is_sample_step(self):
+        return np.random.random_sample() < self.tfr
+
     def forward(self, h, x_seq_lens, y=None, y_seq_lens=None):
         batch_size = h.size(0)
         sos = int2onehot(h.new_full((batch_size, 1), self.sos), num_classes=self.label_vec_size).float()

@@ -261,9 +265,9 @@ def forward(self, h, x_seq_lens, y=None, y_seq_lens=None):
             if y_hats_seq_lens.le(t + 1).all():
                 break

-            if y is None:
+            if y is None or not self._is_sample_step():  # feed back the model's own prediction
                 x = torch.cat([y_hat, c], dim=-1)
-            elif t < y.size(1):  # teach force
+            elif t < y.size(1):  # scheduled sampling: feed the ground-truth label
                 x = torch.cat([y.narrow(1, t, 1), c], dim=-1)
             else:
                 x = torch.cat([eos, c], dim=-1)

@@ -335,7 +339,6 @@ def __init__(self, label_vec_size=p.NUM_CTC_LABELS, listen_vec_size=256,
         self.eos = self.label_vec_size - 1

         self.num_heads = num_attend_heads
-        self.tfr = 1.

         self.listen = Listener(listen_vec_size=listen_vec_size, input_folding=input_folding, rnn_type=nn.LSTM,
                                rnn_hidden_size=listen_vec_size, rnn_num_layers=4, bidirectional=True,

@@ -357,9 +360,6 @@ def forward(self, x, x_seq_lens, y=None, y_seq_lens=None):
         else:
             return self._eval_forward(x, x_seq_lens)

-    def _is_teacher_force(self):
-        return np.random.random_sample() < self.tfr
-
     def _train_forward(self, x, x_seq_lens, y, y_seq_lens):
         # to remove the case of x_seq_lens < y_seq_lens and y_seq_lens > max_seq_lens
         bi = x_seq_lens.gt(y_seq_lens) * y_seq_lens.lt(self.spell.max_seq_lens)

@@ -376,15 +376,11 @@ def _train_forward(self, x, x_seq_lens, y, y_seq_lens):
         ys = nn.utils.rnn.pad_sequence(ys, batch_first=True, padding_value=self.blk)
         ys, ys_seq_lens = ys[bi], y_seq_lens[bi] + self.spell.num_eos

-        if self._is_teacher_force():
-            # speller with teach force rate including noise
-            floor = np.random.random_sample() * 1e-2
-            yss = int2onehot(ys, num_classes=self.label_vec_size, floor=floor).float()
-            noise = torch.rand_like(yss) * 0.1
-            yss = yss * noise
-            y_hats, y_hats_seq_lens, self.attentions = self.spell(h, x_seq_lens, yss, ys_seq_lens)
-        else:
-            y_hats, y_hats_seq_lens, self.attentions = self.spell(h, x_seq_lens)
+        floor = np.random.random_sample() * 0.1
+        yss = int2onehot(ys, num_classes=self.label_vec_size, floor=floor).float()
+        noise = torch.rand_like(yss) * 0.1
+        yss = F.softmax(yss * noise, dim=-1)
+        y_hats, y_hats_seq_lens, self.attentions = self.spell(h, x_seq_lens, yss, ys_seq_lens)

         # add regions to attentions
         self.regions = torch.IntTensor([(frames - 1, labels - 1) for frames, labels in zip(x_seq_lens, ys_seq_lens)])
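
Two mechanics in this diff are worth spelling out. The teacher-forcing rate tfr now lives on the speller, and _is_sample_step() draws a fresh uniform sample at every decoding step, so each step independently feeds the ground-truth label with probability tfr and the model's own previous prediction otherwise; this is per-step scheduled sampling rather than the old per-batch coin flip in _is_teacher_force(). Relatedly, _train_forward() now always builds noised soft targets (a random one-hot floor, multiplicative uniform noise, then a softmax) and hands them to the speller, which makes the sample-or-force decision itself.

TFRScheduler, which anneals tfr during training, is not shown in this commit. A hypothetical stand-in consistent only with the call sites visible in train.py (ranges=(0.9, 0.0), warm_up, epochs, restart, step(), state_dict()/load_state_dict()) might look like this; the real class may differ:

class LinearTFRScheduler:
    # hypothetical sketch, not the project's TFRScheduler: linearly anneals
    # model.tfr from ranges[0] to ranges[1] over `epochs` epochs after a warm-up
    def __init__(self, model, ranges=(0.9, 0.0), warm_up=0, epochs=9, restart=False):
        self.model = model
        self.start, self.end = ranges
        self.warm_up = warm_up
        self.epochs = epochs
        self.restart = restart
        self.last_epoch = -1

    def step(self, epoch=None):
        self.last_epoch = self.last_epoch + 1 if epoch is None else epoch
        t = max(0, self.last_epoch - self.warm_up)
        if self.restart:
            t %= self.epochs                    # saw-tooth: restart the ramp
        frac = min(t / self.epochs, 1.0)
        self.model.tfr = self.start + (self.end - self.start) * frac

    def state_dict(self):
        return {"last_epoch": self.last_epoch}

    def load_state_dict(self, states):
        self.last_epoch = states["last_epoch"]

Under this reading, passing self.model.spell instead of self.model to the scheduler in train.py simply points it at the object that now owns tfr.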

asr/models/las/train.py

Lines changed: 12 additions & 8 deletions
@@ -20,23 +20,26 @@
 from ..trainer import *
 from .network import TFRScheduler, ListenAttendSpell

+#from .loss import EditDistanceLoss
+

 class LASTrainer(NonSplitTrainer):
     """Trainer for ListenAttendSpell model"""

     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)

-        self.loss = nn.NLLLoss(reduction='mean', ignore_index=self.model.blk)
+        self.loss = nn.NLLLoss(reduction='none', ignore_index=self.model.blk)
+        #self.loss = EditDistanceLoss()

-        self.tfr_scheduler = TFRScheduler(self.model, ranges=(0.9, 0.0), warm_up=0, epochs=9, restart=True)
+        self.tfr_scheduler = TFRScheduler(self.model.spell, ranges=(0.9, 0.0), warm_up=0, epochs=9, restart=True)
         #self.tfr_scheduler.step(9)
         if self.states is not None and "tfr_scheduler" in self.states:
             self.tfr_scheduler.load_state_dict(self.states["tfr_scheduler"])

     def train_loop_before_hook(self):
         self.tfr_scheduler.step()
-        logger.debug(f"current tfr = {self.model.tfr:.3e}")
+        logger.debug(f"current tfr = {self.model.spell.tfr:.3e}")

     def train_loop_checkpoint_hook(self):
         self.plot_attention_heatmap()

@@ -68,8 +71,9 @@ def unit_train(self, data):
         ys_hat = ys_hat.float()
         if self.use_cuda:
             ys_lens = ys_lens.cuda()
-        loss = self.loss(ys_hat.transpose(1, 2), ys.long())
-        #loss = self.loss(ys_hat.transpose(1, 2), ys.long()).sum(dim=-1).div(ys_lens.float()).mean()
+        #loss = self.loss(ys_hat.transpose(1, 2), ys.long())
+        loss = self.loss(ys_hat.transpose(1, 2), ys.long()).sum(dim=-1).div(ys_lens.float()).mean()
+        #loss = self.loss(ys_hat, ys.long(), ys_hat_lens, ys_lens)
         #if ys_hat_lens is None:
         #    logger.debug("the batch includes a data with label_lens > max_seq_lens: ignore the entire batch")
         #    loss.mul_(0)

@@ -210,13 +214,13 @@ def batch_train(argv):
         #if i < 2:
         #    trainer.train_epoch(dataloaders["train3"])
         #    trainer.validate(dataloaders["dev"])
-        if i < 10:
+        if i < 30:
             trainer.train_epoch(dataloaders["warmup5"])
             trainer.validate(dataloaders["dev"])
-        elif i < 20:
+        elif i < 50:
             trainer.train_epoch(dataloaders["warmup10"])
             trainer.validate(dataloaders["dev"])
-        elif i < 30:
+        elif i < 60:
             trainer.train_epoch(dataloaders["train10"])
             trainer.validate(dataloaders["dev"])
         else:
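
The loss change is the substantive edit in this file: with reduction='none', the NLL is summed over time for each utterance, divided by that utterance's label length, and only then averaged over the batch, so long utterances no longer dominate the gradient the way they do under a flat reduction='mean' over all label positions. Since ignore_index=blk zeroes the padded positions, the per-utterance sum only counts real labels. A self-contained sketch of the same arithmetic on toy shapes (all names and values illustrative):

import torch
import torch.nn as nn

blk = 0                                     # hypothetical blank/pad index
criterion = nn.NLLLoss(reduction='none', ignore_index=blk)

B, H, T = 2, 5, 7                           # toy batch, label set, label length
ys_hat = torch.log_softmax(torch.randn(B, T, H), dim=-1)   # BxTxH log-probs
ys = torch.randint(1, H, (B, T))            # BxT targets
ys_lens = torch.LongTensor([7, 4])
ys[1, 4:] = blk                             # pad the shorter utterance

# per-position NLL -> sum over time -> length-normalize -> batch mean
nll = criterion(ys_hat.transpose(1, 2), ys)        # BxT
loss = nll.sum(dim=-1).div(ys_lens.float()).mean()
print(loss)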
