// tokenizers.ts (forked from mlc-ai/tokenizers-cpp)
import Module from "./tokenizers_binding";

let binding: any = null;

// Lazily instantiate the WASM binding module on first use.
async function asyncInitTokenizers() {
  if (binding == null) {
    binding = await Module();
  }
}
/**
 * A universal tokenizer backed by either the Hugging Face tokenizers
 * Rust library or SentencePiece.
 */
export class Tokenizer {
  private handle: any;

  private constructor(tokenizer: any) {
    this.handle = tokenizer;
  }

  /**
   * Dispose this tokenizer.
   *
   * Call this function when the tokenizer is no longer needed.
   */
  dispose() {
    this.handle.delete();
  }
  /**
   * Encode text to token ids.
   *
   * @param text Input text.
   * @returns The output token ids.
   */
  encode(text: string): Int32Array {
    const ids = this.handle.Encode(text);
    // Copy the data out of the WASM-backed view before freeing the vector.
    const arr = binding.vecIntToView(ids).slice();
    ids.delete();
    return arr;
  }

  /**
   * Decode token ids into a string.
   *
   * @param ids The input ids.
   * @returns The decoded string.
   */
  decode(ids: Int32Array): string {
    const vec = binding.vecIntFromJSArray(ids);
    const res = this.handle.Decode(vec).slice();
    vec.delete();
    return res;
  }
  /**
   * Create a tokenizer from a JSON blob.
   *
   * @param json An array buffer that contains the JSON text.
   * @returns The tokenizer.
   */
  static async fromJSON(json: ArrayBuffer): Promise<Tokenizer> {
    await asyncInitTokenizers();
    return new Tokenizer(binding.Tokenizer.FromBlobJSON(json));
  }

  /**
   * Create a tokenizer from byte-level BPE blobs.
   *
   * @param vocab The vocab blob.
   * @param merges The merges blob.
   * @param addedTokens The added tokens blob.
   * @returns The tokenizer.
   */
  static async fromByteLevelBPE(
    vocab: ArrayBuffer,
    merges: ArrayBuffer,
    addedTokens = ""
  ): Promise<Tokenizer> {
    await asyncInitTokenizers();
    return new Tokenizer(
      binding.Tokenizer.FromBlobByteLevelBPE(vocab, merges, addedTokens));
  }

  /**
   * Create a tokenizer from a SentencePiece model.
   *
   * @param model The model blob.
   * @returns The tokenizer.
   */
  static async fromSentencePiece(model: ArrayBuffer): Promise<Tokenizer> {
    await asyncInitTokenizers();
    return new Tokenizer(
      binding.Tokenizer.FromBlobSentencePiece(model));
  }
}
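
// Usage sketch, assuming the caller supplies a tokenizer JSON blob (e.g. a
// Hugging Face tokenizer.json); the fetch URL below is illustrative only:
//
//   const buf = await (await fetch("tokenizer.json")).arrayBuffer();
//   const tok = await Tokenizer.fromJSON(buf);
//   const ids = tok.encode("Hello world");
//   console.log(tok.decode(ids));
//   tok.dispose();  // releases the underlying WASM object
//
// fromByteLevelBPE and fromSentencePiece follow the same pattern with their
// respective vocab/merges or SentencePiece model blobs.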