Add T5Tokenizer based on SentencePieceTokenizer (#283)

swapnull7 · gpengzhi · commit 4bb70ecc29be · 2020-01-02T16:08:58.000-05:00
* Add text style transfer (#1) * initial commit * bug fixes and adjusting conv inputs * separate forward function for Discriminator and Generator and disable Gen training for debugging * remove debugger statement * bug fix * detaching stuff before accumulating * refactor and add component as optional parameter * Add optimizer for and backprop against encoder * Add in README * Add text style transfer with improvements (#2) * initial commit * bug fixes and adjusting conv inputs * separate forward function for Discriminator and Generator and disable Gen training for debugging * remove debugger statement * bug fix * detaching stuff before accumulating * refactor and add component as optional parameter * Add optimizer for and backprop against encoder * Add in README * more fixes to eval mode * create optimizers so that they can be saved * fix typo * restore optimizers * Update ctrl_gen_model.py * remove tensorflow import * Add text style transfer (#3) * Add text style transfer (#4) * initial commit * bug fixes and adjusting conv inputs * separate forward function for Discriminator and Generator and disable Gen training for debugging * remove debugger statement * bug fix * detaching stuff before accumulating * refactor and add component as optional parameter * Add optimizer for and backprop against encoder * Add in README * more fixes to eval mode * create optimizers so that they can be saved * fix typo * linting issues * add type annotation for encoder * fix linting * Isolate AE in training * works after changing the learning rate * remove debugger * Add text style transfer (#5) * Reviewed changes * linting * Add text style transfer (#6) * initial commit * linting * Fix docs build issue * Fix typo * init_commit * modularize t5 and comment out debugging statements * Add decorators for pretrained_tests * remove changes from text-style-transfer * remove collect variable changes * remove text-style-transfer from docs * more clean up and removing debugger statements * more 
clean up * fix linting * more linting * Update utils.rst * linting and fixing minor bugs in gpt2-tests * skipping pretrained tests * fix documentation error * fix linting * Update gpt2_test.py * refactor gin reading function * revert using identity, use nn.Module instead * Update decoder_base.py * fix type for T5Decoder * fix linting * add a standalone test for T5 * Adding T5 Tokenizer * adding import to __init__.py * linting fix * making review changes * reviewed changes
diff --git a/docs/code/data.rst b/docs/code/data.rst
@@ -37,6 +37,11 @@ Tokenizer
 .. autoclass:: texar.torch.data.XLNetTokenizer
     :members:
 
+:hidden:`T5Tokenizer`
+~~~~~~~~~~~~~~~~~~~~~~~~
+.. autoclass:: texar.torch.data.T5Tokenizer
+    :members:
+
 Vocabulary
 ==========
 
diff --git a/docs/code/modules.rst b/docs/code/modules.rst
@@ -252,6 +252,14 @@ Regressors
 .. autoclass:: texar.torch.modules.XLNetRegressor
     :members:
 
+EncoderDecoders
+================
+
+:hidden:`T5EncoderDecoder`
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+.. autoclass:: texar.torch.modules.T5EncoderDecoder
+    :members:
+
 Pre-trained
 ===========
 
@@ -285,6 +293,11 @@ Pre-trained
 .. autoclass:: texar.torch.modules.PretrainedXLNetMixin
     :members:
 
+:hidden:`PretrainedT5Mixin`
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+.. autoclass:: texar.torch.modules.PretrainedT5Mixin
+    :members:
+
 Connectors
 ==========
 
diff --git a/texar/torch/data/tokenizers/__init__.py b/texar/torch/data/tokenizers/__init__.py
@@ -21,3 +21,4 @@
 from texar.torch.data.tokenizers.tokenizer_base import *
 from texar.torch.data.tokenizers.xlnet_tokenizer import *
 from texar.torch.data.tokenizers.sentencepiece_tokenizer import *
+from texar.torch.data.tokenizers.t5_tokenizer import *
diff --git a/texar/torch/data/tokenizers/bert_tokenizer.py b/texar/torch/data/tokenizers/bert_tokenizer.py
@@ -121,7 +121,7 @@ def __init__(self,
             vocab_file = os.path.join(self.pretrained_model_dir,
                                       self._VOCAB_FILE_MAP['vocab_file']
                                       [self.pretrained_model_name])
-            assert self.pretrained_model_name is not None
+
             if self._MAX_INPUT_SIZE.get(self.pretrained_model_name):
                 self.max_len = self._MAX_INPUT_SIZE[self.pretrained_model_name]
         else:
diff --git a/texar/torch/data/tokenizers/t5_tokenizer.py b/texar/torch/data/tokenizers/t5_tokenizer.py
@@ -0,0 +1,215 @@
+# Copyright 2019 The Texar Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Pre-trained T5 tokenizer.
+"""
+
+from typing import Any, Dict, Optional
+
+import os
+import re
+
+from texar.torch.data.tokenizers.sentencepiece_tokenizer \
+    import SentencePieceTokenizer
+from texar.torch.modules.pretrained.t5 import PretrainedT5Mixin
+
+__all__ = [
+    'T5Tokenizer',
+]
+
+
+class T5Tokenizer(SentencePieceTokenizer, PretrainedT5Mixin):
+    r"""Pre-trained T5 Tokenizer.
+
+    Args:
+        pretrained_model_name (optional): a `str`, the name of
+            pre-trained model (e.g., `T5-Small`). Please refer to
+            :class:`~texar.torch.modules.PretrainedT5Mixin` for
+            all supported models.
+            If None, the model name in :attr:`hparams` is used.
+        cache_dir (optional): the path to a folder in which the
+            pre-trained models will be cached. If `None` (default),
+            a default directory (``texar_data`` folder under user's home
+            directory) will be used.
+        hparams (dict or HParams, optional): Hyperparameters. Missing
+            hyperparameters will be set to default values. See
+            :meth:`default_hparams` for the hyperparameter structure
+            and default values.
+    """
+
+    _IS_PRETRAINED = True
+
+    _VOCAB_FILE_NAMES = {
+        'vocab_file': 'sentencepiece.model'
+    }
+
+    _MAX_INPUT_SIZE = {
+        'T5-Small': 512,
+        'T5-Base': 512,
+        'T5-Large': 512,
+        'T5-3B': 512,
+        'T5-11B': 512
+    }
+
+    # Matches a complete sentinel token and captures its number.
+    _EXTRA_ID_PATTERN = re.compile(r"<extra_id_(\d+)>")
+
+    def __init__(self,
+                 pretrained_model_name: Optional[str] = None,
+                 cache_dir: Optional[str] = None,
+                 hparams=None):
+
+        self.load_pretrained_config(pretrained_model_name, cache_dir, hparams)
+
+        if self.pretrained_model_dir is not None:
+            assert self.pretrained_model_name is not None
+            vocab_file = os.path.join(self.pretrained_model_dir,
+                                      self._VOCAB_FILE_NAMES['vocab_file'])
+
+            max_len = self._MAX_INPUT_SIZE.get(self.pretrained_model_name)
+            if max_len:
+                self.max_len = max_len
+            setattr(self.hparams, 'vocab_file', vocab_file)
+        else:
+            if self.hparams.get('max_len'):
+                self.max_len = self.hparams['max_len']
+
+        # Keep the special tokens supplied through `hparams` and append the
+        # `<extra_id_N>` sentinels, skipping any that are already listed.
+        additional_special_tokens = list(
+            self.hparams['additional_special_tokens'] or [])
+        seen = set(additional_special_tokens)
+        for i in range(self.hparams['extra_ids']):
+            sentinel = "<extra_id_{}>".format(i)
+            if sentinel not in seen:
+                additional_special_tokens.append(sentinel)
+
+        setattr(self.hparams, 'additional_special_tokens',
+                additional_special_tokens)
+
+        super().__init__(hparams=None)
+
+    @staticmethod
+    def default_hparams() -> Dict[str, Any]:
+        r"""Returns a dictionary of hyperparameters with default values.
+
+        * The tokenizer is determined by the constructor argument
+          :attr:`pretrained_model_name` if it's specified. In this case,
+          `hparams` are ignored.
+        * Otherwise, the tokenizer is determined by
+          `hparams['pretrained_model_name']` if it's specified. All other
+          configurations in `hparams` are ignored.
+        * If the above two are `None`, the tokenizer is defined by the
+          configurations in `hparams`.
+
+        .. code-block:: python
+
+            {
+                "pretrained_model_name": "T5-Small",
+                "vocab_file": None,
+                "max_len": 512,
+                "bos_token": None,
+                "eos_token": "</s>",
+                "unk_token": "<unk>",
+                "pad_token": "<pad>",
+                "extra_ids": 100,
+                "additional_special_tokens": [],
+                "name": "t5_tokenizer",
+            }
+
+        Here:
+
+        `"pretrained_model_name"`: str or None
+            The name of the pre-trained T5 model.
+
+        `"vocab_file"`: str or None
+            The path to a sentencepiece vocabulary file.
+
+        `"max_len"`: int or None
+            The maximum sequence length that this model might ever be used with.
+
+        `"bos_token"`: str or None
+            Beginning of sentence token. Set None to disable ``bos_token``.
+
+        `"eos_token"`: str or None
+            End of sentence token. Set None to disable ``eos_token``.
+
+        `"unk_token"`: str or None
+            Unknown token. Set None to disable ``unk_token``.
+
+        `"pad_token"`: str or None
+            Padding token. Set None to disable ``pad_token``.
+
+        `"extra_ids"`: int
+            The number of extra ids appended to the end of the vocabulary for
+            use as sentinels. These tokens are accessible as `<extra_id_{%d}>`
+            where `{%d}` is a number between 0 and extra_ids-1. Extra tokens
+            are indexed from the end of the vocabulary backwards
+            (`<extra_id_0>` is the last token in the vocabulary), as in T5
+            preprocessing; see:
+            `https://github.com/google-research/text-to-text-transfer-transformer/blob/9fd7b14a769417be33bc6c850f9598764913c833/t5/data/preprocessors.py#L2117`
+
+        `"additional_special_tokens"`: list
+            A list of additional special tokens. The `<extra_id_{%d}>`
+            sentinels are appended to this list automatically.
+
+        `"name"`: str
+            Name of the tokenizer.
+        """
+        return {
+            'pretrained_model_name': 'T5-Small',
+            'vocab_file': None,
+            'max_len': 512,
+            'bos_token': None,
+            'eos_token': '</s>',
+            'unk_token': '<unk>',
+            'pad_token': '<pad>',
+            'extra_ids': 100,
+            'additional_special_tokens': [],
+            'name': 't5_tokenizer',
+            '@no_typecheck': ['pretrained_model_name'],
+        }
+
+    @property
+    def vocab_size(self) -> int:
+        r"""The vocabulary size: the number of sentencepiece pieces plus the
+        number of ``extra_ids`` sentinel tokens.
+        """
+        return len(self.sp_model) + self.hparams['extra_ids']
+
+    def _map_token_to_id(self, token: str) -> int:
+        r"""Maps a token to its id.
+
+        A well-formed sentinel ``<extra_id_N>`` with ``N`` below
+        ``hparams['extra_ids']`` is numbered backwards from the end of the
+        vocabulary. Any other token, including a malformed or out-of-range
+        sentinel, is looked up in the sentencepiece model.
+        """
+        match = self._EXTRA_ID_PATTERN.fullmatch(token)
+        if match is not None:
+            num = int(match.group(1))
+            if num < self.hparams['extra_ids']:
+                return self.vocab_size - num - 1
+        return self.sp_model.PieceToId(token)
+
+    def _map_id_to_token(self, index: int) -> str:
+        r"""Maps an id to its token.
+
+        Ids below the sentencepiece model size are looked up in the model;
+        larger ids are rendered as ``<extra_id_N>`` sentinels, with the last
+        id of the vocabulary being ``<extra_id_0>``.
+        """
+        if index < self.sp_model.get_piece_size():
+            return self.sp_model.IdToPiece(index)
+        return "<extra_id_{}>".format(self.vocab_size - 1 - index)
diff --git a/texar/torch/data/tokenizers/t5_tokenizer_test.py b/texar/torch/data/tokenizers/t5_tokenizer_test.py
@@ -0,0 +1,62 @@
+"""
+Unit tests for T5 tokenizer.
+"""
+
+import unittest
+
+import os
+import tempfile
+
+from texar.torch.utils.test import pretrained_test
+from texar.torch.data.tokenizers.t5_tokenizer import T5Tokenizer
+from texar.torch.data.data_utils import maybe_download
+
+
+class T5TokenizerTest(unittest.TestCase):
+    r"""Tests checkpoint loading and text round-trips of ``T5Tokenizer``."""
+
+    def setUp(self):
+        # Download a sentencepiece model file, load a tokenizer from it and
+        # save that tokenizer into the temporary directory.
+        self.tmp_dir = tempfile.TemporaryDirectory()
+        vocab_url = ('https://github.com/google/sentencepiece/blob/master/'
+                     'python/test/test_model.model?raw=true')
+        self.SAMPLE_VOCAB = maybe_download(vocab_url, self.tmp_dir.name)
+
+        self.tokenizer = T5Tokenizer.load(self.SAMPLE_VOCAB)
+        self.tokenizer.save(self.tmp_dir.name)
+
+    def tearDown(self):
+        self.tmp_dir.cleanup()
+
+    @pretrained_test
+    def test_model_loading(self):
+        # Each checkpoint directory must contain the sentencepiece model file.
+        for model_name in T5Tokenizer.available_checkpoints():
+            tokenizer = T5Tokenizer(pretrained_model_name=model_name)
+
+            walked = list(os.walk(tokenizer.pretrained_model_dir))
+            top_level_files = walked[0][2]
+            self.assertIn('sentencepiece.model', top_level_files)
+
+            # Tokenizing must not raise; the result itself is not checked.
+            tokenizer.map_text_to_token(u"This is a test")
+
+    def test_roundtrip(self):
+        tokenizer = T5Tokenizer.load(self.tmp_dir.name)
+
+        # Plain text first, then the same text wrapped in sentinel tokens.
+        samples = (
+            'I saw a girl with a telescope.',
+            '<extra_id_32> I saw a girl with a telescope.<extra_id_74>',
+        )
+        for text in samples:
+            ids = tokenizer.map_text_to_id(text)
+            tokens = tokenizer.map_text_to_token(text)
+
+            self.assertEqual(text, tokenizer.map_id_to_text(ids))
+            self.assertEqual(text, tokenizer.map_token_to_text(tokens))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/texar/torch/data/tokenizers/tokenizer_base.py b/texar/torch/data/tokenizers/tokenizer_base.py
@@ -83,7 +83,12 @@ def __init__(self, hparams):
                     assert isinstance(value, (list, tuple)) and \
                            all(isinstance(v, str) for v in value)
                 else:
-                    assert isinstance(value, str)
+                    if value is not None:
+                        assert isinstance(value, str)
+                    else:
+                        warnings.warn(f"Trying to set None as value special "
+                                      f"token '{key}'. Proceed only if you"
+                                      f" are sure!", UserWarning)
                 setattr(self, key, value)
 
     @classmethod
diff --git a/texar/torch/modules/__init__.py b/texar/torch/modules/__init__.py
@@ -23,3 +23,4 @@
 from texar.torch.modules.networks import *
 from texar.torch.modules.pretrained import *
 from texar.torch.modules.regressors import *
+from texar.torch.modules.encoder_decoders import *
diff --git a/texar/torch/modules/decoders/t5_decoder.py b/texar/torch/modules/decoders/t5_decoder.py
@@ -30,8 +30,8 @@
 
 
 class T5Decoder(TransformerDecoder):
-    r"""T5 decoder that applies multi-head self-attention with #todo rpr for
-    sequence decoding.
+    r"""T5 decoder that applies multi-head self-attention with relative
+    position representation for sequence decoding.
 
     It is a stack of
     :class:`~texar.torch.modules.MultiheadRPRAttention`,
diff --git a/texar/torch/modules/encoder_decoders/__init__.py b/texar/torch/modules/encoder_decoders/__init__.py
@@ -15,5 +15,5 @@
 Modules of Texar library encoders.
 """
 
-from texar.torch.modules.encoder_decoders.t5_encoder_decoder \
-    import T5EncoderDecoder
+from texar.torch.modules.encoder_decoders.encoder_decoder_base import *
+from texar.torch.modules.encoder_decoders.t5_encoder_decoder import *
diff --git a/texar/torch/modules/encoder_decoders/t5_encoder_decoder.py b/texar/torch/modules/encoder_decoders/t5_encoder_decoder.py
@@ -33,7 +33,7 @@
 
 
 class T5EncoderDecoder(EncoderDecoderBase, PretrainedT5Mixin):
-    r"""Pretrained T5 model. Please see
+    r"""The pre-trained T5 model. Please see
     :class:`~texar.torch.modules.PretrainedT5Mixin` for a brief description
     of T5.
 
diff --git a/texar/torch/modules/encoder_decoders/t5_encoder_decoder_test.py b/texar/torch/modules/encoder_decoders/t5_encoder_decoder_test.py
@@ -95,7 +95,7 @@ def test_trainable_variables(self):
 
     @pretrained_test
     def test_t5_eval(self):
-        r"""Tests pretrained model and check it generates
+        r"""Tests pre-trained model and check it generates
         same results everytime.
         """
         hparams = {
@@ -120,7 +120,7 @@ def test_t5_eval(self):
             torch.Size([self.inputs.size()[0], self.max_length, outputs_dim]))
 
         # Check if these value are same consistently. If not, there is something
-        # wrong with the pretrained model.
+        # wrong with the pre-trained model.
         self.assertEqual(
             encoder_output.data[0][3][345].tolist(),
             -0.16204041242599487
diff --git a/texar/torch/modules/pretrained/t5.py b/texar/torch/modules/pretrained/t5.py
diff --git a/texar/torch/modules/pretrained/t5_test.py b/texar/torch/modules/pretrained/t5_test.py