ComfyUI/comfy/text_encoders/llama_tokenizer.py
doctorpangloss a20bf8134d Fix AuraFlow
2024-07-15 15:29:49 -07:00

25 lines
888 B
Python

class LLAMATokenizer:
    """Thin SentencePiece wrapper exposing a transformers-like tokenizer API."""
    # TODO: not sure why we're not using the tokenizer from transformers for this

    @staticmethod
    def from_pretrained(path):
        """Alternate constructor mirroring the transformers ``from_pretrained`` name."""
        return LLAMATokenizer(path)

    def __init__(self, tokenizer_path):
        """Load the SentencePiece model at ``tokenizer_path`` and cache EOS info.

        Also precomputes the full piece -> id vocabulary mapping.
        """
        # Imported lazily so sentencepiece is only required when a tokenizer
        # is actually constructed.
        import sentencepiece
        sp = sentencepiece.SentencePieceProcessor(model_file=tokenizer_path)  # pylint: disable=unexpected-keyword-arg
        self.tokenizer = sp
        self.end = sp.eos_id()
        self.eos_token_id = self.end
        self.eos_token = sp.id_to_piece(self.eos_token_id)
        # piece -> id lookup covering the whole vocabulary
        vocab = {}
        for token_id in range(sp.get_piece_size()):
            vocab[sp.id_to_piece(token_id)] = token_id
        self._vocab = vocab

    def get_vocab(self):
        """Return the piece -> id mapping built at construction time."""
        return self._vocab

    def __call__(self, string):
        """Encode ``string`` into token ids with EOS appended.

        Returns a dict with a single ``"input_ids"`` key, matching the
        transformers calling convention.
        """
        ids = self.tokenizer.encode(string)  # pylint: disable=no-member
        return {"input_ids": ids + [self.end]}