Quick.AI/tokenizers_cpp.h at main · jayden0701/Quick.AI · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
/**
 *  Copyright (c) 2023 by Contributors
 * @file tokenizers_cpp.h
 * @brief A C++ binding to a common set of tokenizers
 * @author Contributors
 * @bug No known bugs
 */
#ifndef TOKENIZERS_CPP_H_
#define TOKENIZERS_CPP_H_

#include <cstddef>
#include <cstdint>
#include <memory>
#include <string>
#include <vector>

namespace tokenizers {

/**
 * @brief Abstract interface shared by all tokenizer backends.
 *
 *  The class declares no public constructor of its own; instances are
 *  obtained through the static FromBlob* factory functions below (HF JSON,
 *  byte-level BPE, SentencePiece, RWKV world), each of which returns an
 *  owning pointer to this interface.
 */
class Tokenizer {
public:
  /** @brief Virtual destructor so implementations can be deleted through a
   *  pointer to this interface (e.g. the std::unique_ptr returned by the
   *  factory functions). */
  virtual ~Tokenizer() = default;

  /**
   * @brief Encode text into ids.
   * @param text The input text.
   * @return The encoded token ids.
   */
  virtual std::vector<int32_t> Encode(const std::string &text) = 0;

  /**
   * @brief Encode text into ids with special tokens option.
   * @param text The input text.
   * @param add_special_tokens Whether to add special tokens.
   * @return The encoded token ids.
   */
  virtual std::vector<int32_t> Encode(const std::string &text,
                                      bool add_special_tokens) = 0;

  /**
   * @brief Encode a batch of texts into ids.
   *
   * The default implementation encodes the texts one at a time with the
   * single-argument Encode(); implementations may override it.
   *
   * @param texts The input texts.
   * @return The encoded token ids, one vector per input text, in input order.
   */
  virtual std::vector<std::vector<int32_t>>
  EncodeBatch(const std::vector<std::string> &texts) {
    // Fall back when the derived class does not implement this function.
    std::vector<std::vector<int32_t>> ret;
    // The output has exactly one entry per input, so allocate once up front.
    ret.reserve(texts.size());
    for (const auto &text : texts) {
      ret.push_back(Encode(text));
    }
    return ret;
  }

  /**
   * @brief Decode token ids into text.
   * @param ids The token ids.
   * @return The decoded text.
   */
  virtual std::string Decode(const std::vector<int32_t> &ids) = 0;

  /**
   * @brief Returns the vocabulary size. Special tokens are considered.
   * @return The vocabulary size.
   */
  virtual size_t GetVocabSize() = 0;

  /**
   * @brief Convert the given id to its corresponding token if it exists. If
   * not, return an empty string.
   * @param token_id The token id to look up.
   * @return The token, or an empty string when the id has no token.
   */
  virtual std::string IdToToken(int32_t token_id) = 0;

  /**
   * @brief Convert the given token to its corresponding id if it exists. If
   * not, return -1.
   * @param token The token to look up.
   * @return The token id, or -1 when the token is not found.
   */
  virtual int32_t TokenToId(const std::string &token) = 0;

  //---------------------------------------------------
  // Factory functions from byte-blobs
  // These factory functions take in-memory blobs
  // so the library can be independent from the filesystem.
  // They are only declared here; the definitions live
  // outside this header.
  //---------------------------------------------------
  /**
   * @brief Create HF tokenizer from a single in-memory json blob.
   *
   * @param json_blob The json blob.
   * @return The created tokenizer.
   */
  static std::unique_ptr<Tokenizer> FromBlobJSON(const std::string &json_blob);
  /**
   * @brief Create BPE tokenizer
   *
   * @param vocab_blob The blob that contains vocabs.
   * @param merges_blob The blob that contains the merges.
   * @param added_tokens The added tokens (defaults to an empty string).
   * @return The created tokenizer.
   */
  static std::unique_ptr<Tokenizer>
  FromBlobByteLevelBPE(const std::string &vocab_blob,
                       const std::string &merges_blob,
                       const std::string &added_tokens = "");
  /**
   * @brief Create SentencePiece.
   *
   * @param model_blob The blob that contains vocabs.
   * @return The created tokenizer.
   */
  static std::unique_ptr<Tokenizer>
  FromBlobSentencePiece(const std::string &model_blob);
  /**
   * @brief Create RWKVWorldTokenizer.
   *
   * @param model_blob The blob that contains vocabs.
   * @return The created tokenizer.
   */
  static std::unique_ptr<Tokenizer>
  FromBlobRWKVWorld(const std::string &model_blob);
};

} // namespace tokenizers
#endif // TOKENIZERS_CPP_H_