ComfyUI/comfy/text_encoders/llama_tokenizer.py
2024-07-18 17:15:44 -07:00

25 lines
946 B
Python

class LLAMATokenizer:
# todo: not sure why we're not using the tokenizer from transformers for this
@staticmethod
def from_pretrained(path):
return LLAMATokenizer(path)
def __init__(self, tokenizer_path):
import sentencepiece
self.tokenizer = sentencepiece.SentencePieceProcessor(model_file=tokenizer_path) # pylint: disable=unexpected-keyword-arg
self.end = self.tokenizer.eos_id()
self.eos_token_id = self.end
self.eos_token = self.tokenizer.id_to_piece(self.eos_token_id) # pylint: disable=no-member
self._vocab = {
self.tokenizer.id_to_piece(i): i for i in range(self.tokenizer.get_piece_size()) # pylint: disable=no-member
}
def get_vocab(self):
return self._vocab
def __call__(self, string):
out = self.tokenizer.encode(string) # pylint: disable=no-member
out += [self.end]
return {"input_ids": out}