/*!
* Copyright (c) 2023 by Contributors
* \file tokenizers_cpp.h
* \brief A C++ binding to common set of tokenizers
*/
#ifndef TOKENIZERS_CPP_H_
#define TOKENIZERS_CPP_H_
#include <tokenizers_c.h>

#include <cstddef>
#include <cstdint>
#include <memory>
#include <string>
#include <tuple>
#include <vector>
namespace tokenizers {
/*!
 * \brief A universal tokenizer interface.
 *
 * Concrete instances are obtained through the factory functions below and
 * may be backed by either a Huggingface tokenizer or sentencepiece,
 * depending on which factory was used.
 */
class Tokenizer {
 public:
  /*! \brief virtual destructor */
  virtual ~Tokenizer() {}

  /*!
   * \brief Encode text into ids.
   * \param text The input text.
   * \returns The encoded token ids.
   */
  virtual std::vector<int32_t> Encode(const std::string& text) = 0;

  /*!
   * \brief Encode a batch of texts into ids.
   * \param texts The input texts.
   * \returns The encoded token ids, one vector per input text.
   *
   * Default implementation encodes each text sequentially via Encode();
   * derived classes may override it with a genuinely batched version.
   */
  virtual std::vector<std::vector<int32_t>> EncodeBatch(const std::vector<std::string>& texts) {
    std::vector<std::vector<int32_t>> encoded;
    encoded.reserve(texts.size());
    for (size_t i = 0; i < texts.size(); ++i) {
      encoded.emplace_back(Encode(texts[i]));
    }
    return encoded;
  }

  /*!
   * \brief Decode token ids into text.
   * \param ids The token ids.
   * \returns The decoded text.
   */
  virtual std::string Decode(const std::vector<int32_t>& ids) = 0;

  /*!
   * \brief Returns the vocabulary size. Special tokens are considered.
   */
  virtual size_t GetVocabSize() = 0;

  /*!
   * \brief Convert the given id to its corresponding token if it exists. If
   * not, return an empty string.
   */
  virtual std::string IdToToken(int32_t token_id) = 0;

  /*!
   * \brief Convert the given token to its corresponding id if it exists. If
   * not, return -1.
   */
  virtual int32_t TokenToId(const std::string& token) = 0;

  //---------------------------------------------------
  // Factory functions from byte-blobs
  // These factory functions take in-memory blobs
  // so the library can be independent from filesystem
  //---------------------------------------------------
  /*!
   * \brief Create HF tokenizer from a single in-memory json blob.
   * \param json_blob The json blob.
   * \return The created tokenizer.
   */
  static std::unique_ptr<Tokenizer> FromBlobJSON(const std::string& json_blob);

  /*!
   * \brief Create BPE tokenizer.
   * \param vocab_blob The blob that contains vocabs.
   * \param merges_blob The blob that contains the merges.
   * \param added_tokens The added tokens.
   * \return The created tokenizer.
   */
  static std::unique_ptr<Tokenizer> FromBlobByteLevelBPE(const std::string& vocab_blob,
                                                         const std::string& merges_blob,
                                                         const std::string& added_tokens = "");

  /*!
   * \brief Create SentencePiece tokenizer.
   * \param model_blob The blob that contains the sentencepiece model.
   * \return The created tokenizer.
   */
  static std::unique_ptr<Tokenizer> FromBlobSentencePiece(const std::string& model_blob);

  /*!
   * \brief Create RWKVWorldTokenizer.
   * \param model_blob The blob that contains vocabs.
   * \return The created tokenizer.
   */
  static std::unique_ptr<Tokenizer> FromBlobRWKVWorld(const std::string& model_blob);
};
/*!
 * \brief Tokenizer implementation backed by Huggingface's tokenizers
 *  library, accessed through the C handle type from tokenizers_c.h.
 *  All member functions are defined out of line (in the implementation file).
 */
class HFTokenizer : public Tokenizer {
 public:
  /*!
   * \brief Wrap a raw C tokenizer handle.
   * \param handle The C handle; presumably owned by this object and released
   *  by the destructor — TODO confirm against the out-of-line definition.
   */
  explicit HFTokenizer(TokenizerHandle handle);
  /*! \brief Copy constructor (defined out of line). */
  HFTokenizer(const HFTokenizer&);
  /*!
   * \brief Move constructor (defined out of line).
   *  NOTE(review): no assignment operators are declared; with a
   *  user-declared move constructor the implicit ones are unavailable —
   *  confirm this rule-of-five state is intended.
   */
  HFTokenizer(HFTokenizer&& other);
  /*! \brief Destructor (defined out of line). */
  ~HFTokenizer();
  // use i32 to be consistent with sentencepiece
  /*!
   * \brief Encode text into ids with control over special tokens.
   * \param text The input text.
   * \param add_special_tokens Whether to add special tokens while encoding.
   * \returns The encoded token ids.
   */
  std::vector<int32_t> Encode(const std::string& text, bool add_special_tokens);
  // use i32 to be consistent with sentencepiece
  /*! \brief Encode text into ids (base-class interface). */
  std::vector<int32_t> Encode(const std::string& text) final;
  // version specific to HFTokenizer, which adds special tokens flag
  /*!
   * \brief Encode a batch of texts into ids with control over special tokens.
   * \param texts The input texts.
   * \param add_special_tokens Whether to add special tokens while encoding.
   * \returns The encoded token ids, one vector per input text.
   */
  std::vector<std::vector<int32_t>> EncodeBatch(const std::vector<std::string>& texts,
                                                bool add_special_tokens);
  /*!
   * \brief Encode a batch of texts, returning both ids and per-text masks.
   * \param texts The input texts.
   * \param add_special_tokens Whether to add special tokens while encoding.
   * \returns A tuple of (token ids, masks); presumably the second element is
   *  the attention mask — confirm against the out-of-line definition.
   */
  std::tuple<std::vector<std::vector<int32_t>>, std::vector<std::vector<int32_t>>>
  EncodeBatchWithMask(const std::vector<std::string>& texts, bool add_special_tokens);
  /*! \brief Encode a batch of texts into ids (base-class interface). */
  std::vector<std::vector<int32_t>> EncodeBatch(const std::vector<std::string>& texts) final;
  // use i32 to be consistent with sentencepiece
  /*!
   * \brief Decode token ids into text with control over special tokens.
   * \param ids The token ids.
   * \param skip_special_tokens Whether to omit special tokens from the output.
   * \returns The decoded text.
   */
  std::string Decode(const std::vector<int32_t>& ids, bool skip_special_tokens);
  /*! \brief Decode token ids into text (base-class interface). */
  std::string Decode(const std::vector<int32_t>& ids) final;
  /*! \brief Returns the vocabulary size. Special tokens are considered. */
  size_t GetVocabSize() final;
  /*! \brief Convert id to its token; empty string if it does not exist. */
  std::string IdToToken(int32_t id) final;
  /*! \brief Convert token to its id; -1 if it does not exist. */
  int32_t TokenToId(const std::string& token) final;
  /*!
   * \brief Create HF tokenizer from a single in-memory json blob.
   *
   * \param json_blob The json blob.
   * \return The created tokenizer.
   */
  static std::unique_ptr<HFTokenizer> FromBlobJSON(const std::string& json_blob);
  /*!
   * \brief Create BPE tokenizer
   *
   * \param vocab_blob The blob that contains vocabs.
   * \param merges_blob The blob that contains the merges.
   * \param added_tokens The added tokens.
   * \return The created tokenizer.
   */
  static std::unique_ptr<HFTokenizer> FromBlobByteLevelBPE(const std::string& vocab_blob,
                                                           const std::string& merges_blob,
                                                           const std::string& added_tokens = "");

 private:
  // internal handle to the underlying C tokenizer object
  TokenizerHandle handle_{nullptr};
};
} // namespace tokenizers
#endif // TOKENIZERS_CPP_H_