added clap tokenizer

Yousef Rafat 2025-09-28 18:12:30 +03:00
parent 8311b156ad
commit 2ceb9f0fdc
7 changed files with 300482 additions and 3 deletions

clap_tokenizer/merges.txt (new file, 50001 lines)

File diff suppressed because it is too large.

@@ -0,0 +1,51 @@
{
"bos_token": {
"content": "<s>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false
},
"cls_token": {
"content": "<s>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false
},
"eos_token": {
"content": "</s>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false
},
"mask_token": {
"content": "<mask>",
"lstrip": true,
"normalized": false,
"rstrip": false,
"single_word": false
},
"pad_token": {
"content": "<pad>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false
},
"sep_token": {
"content": "</s>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false
},
"unk_token": {
"content": "<unk>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false
}
}
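
The special-tokens map above is the standard RoBERTa set (the tokenizer_config further down declares tokenizer_class RobertaTokenizer and processor_class ClapProcessor). A minimal sketch, not part of this commit, that loads the committed directory with Hugging Face's AutoTokenizer and checks that the specials resolve as expected; it assumes transformers is installed and that the path points at wherever clap_tokenizer/ sits in the checkout:

# Sketch only: verify the special tokens declared above resolve to the
# expected strings and ids. "clap_tokenizer" is assumed to be the directory
# added by this commit, relative to the working directory.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("clap_tokenizer")

print(tok.bos_token, tok.eos_token, tok.pad_token, tok.unk_token, tok.mask_token)
# expected: <s> </s> <pad> <unk> <mask>
print(tok.convert_tokens_to_ids(["<s>", "<pad>", "</s>", "<unk>", "<mask>"]))
# expected ids per added_tokens_decoder: [0, 1, 2, 3, 50264]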

clap_tokenizer/tokenizer.json (new file, 250364 lines)

File diff suppressed because it is too large.

@@ -0,0 +1,63 @@
{
"add_prefix_space": false,
"added_tokens_decoder": {
"0": {
"content": "<s>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"1": {
"content": "<pad>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"2": {
"content": "</s>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"3": {
"content": "<unk>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"50264": {
"content": "<mask>",
"lstrip": true,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
}
},
"bos_token": "<s>",
"clean_up_tokenization_spaces": true,
"cls_token": "<s>",
"eos_token": "</s>",
"errors": "replace",
"extra_special_tokens": {},
"mask_token": "<mask>",
"max_length": null,
"model_max_length": 512,
"pad_to_multiple_of": null,
"pad_token": "<pad>",
"pad_token_type_id": 0,
"padding_side": "right",
"processor_class": "ClapProcessor",
"sep_token": "</s>",
"tokenizer_class": "RobertaTokenizer",
"trim_offsets": true,
"unk_token": "<unk>"
}
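
The config above registers the five special token ids, caps model_max_length at 512, and pads on the right, so a plain tokenizer call handles batching of captions. A minimal sketch, not part of this commit, with placeholder captions (assumes transformers is installed and the same clap_tokenizer/ path as above):

# Sketch only: batch-encode two captions using the config above.
# padding=True pads the shorter caption on the right with <pad> (id 1);
# truncation=True caps sequences at model_max_length (512).
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("clap_tokenizer")

batch = tok(
    ["a dog barking in the distance", "rain falling on a tin roof"],
    padding=True,
    truncation=True,
)
print(batch["input_ids"])
print(batch["attention_mask"])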

File diff suppressed because one or more lines are too long.

@@ -4,7 +4,7 @@
 from . import utils
 from . import sd1_clip
 from . import sdxl_clip
-import comfy.clap_model
+import comfy.text_encoders.clap_model
 import comfy.text_encoders.sd2_clip
 import comfy.text_encoders.sd3_clip
 import comfy.text_encoders.sa_t5
@@ -1317,7 +1317,7 @@ class HunyuanFoley(supported_models_base.BASE):
     def get_model(self, state_dict, prefix="", device=None):
         return model_base.HunyuanFoley(self, device=device)
     def clip_target(self, state_dict={}):
-        return supported_models_base.ClipTarget(comfy.clap_model.ClapLargeTokenizer, comfy.clap_model.ClapTextEncoderModel)
+        return supported_models_base.ClipTarget(comfy.text_encoders.clap_model.ClapLargeTokenizer, comfy.text_encoders.clap_model.ClapTextEncoderModel)
 
 class QwenImage(supported_models_base.BASE):
     unet_config = {


@@ -353,4 +353,3 @@ class ClapLargeTokenizer(sd1_clip.SDTokenizer):
     def __init__(self, embedding_directory=None, tokenizer_data={}):
         tokenizer_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clap_tokenizer")
         super().__init__(tokenizer_path, pad_with_end=False, embedding_size=2048, embedding_key='clap_l', tokenizer_class=AutoTokenizer, has_start_token=False, has_end_token=False, pad_to_max_length=False, max_length=99999999, min_length=1, pad_token=151643, tokenizer_data=tokenizer_data)
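
Downstream, the HunyuanFoley clip_target change above is what hands this class to the model loader. A minimal sketch, not part of this commit, of calling the tokenizer directly; it assumes ComfyUI is importable and that sd1_clip.SDTokenizer exposes its usual tokenize_with_weights() interface (the prompt is a placeholder):

# Sketch only: instantiate the new tokenizer via its moved module path and
# tokenize a prompt. Per the super().__init__ call above, no start/end tokens
# are added (has_start_token/has_end_token are False) and the output is not
# padded to a fixed maximum length (pad_to_max_length=False).
import comfy.text_encoders.clap_model

tokenizer = comfy.text_encoders.clap_model.ClapLargeTokenizer()
tokens = tokenizer.tokenize_with_weights("footsteps on gravel, light wind")
print(tokens)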