mirror of
https://github.com/comfyanonymous/ComfyUI.git
synced 2026-01-28 07:10:15 +08:00
Merge branch 'comfyanonymous:master' into master
This commit is contained in:
commit
09156b577c
@ -18,15 +18,27 @@ from typing_extensions import NotRequired
|
|||||||
from comfy.cli_args import DEFAULT_VERSION_STRING
|
from comfy.cli_args import DEFAULT_VERSION_STRING
|
||||||
|
|
||||||
|
|
||||||
|
def frontend_install_warning_message():
    """Build the user-facing instructions for installing the frontend package.

    Returns:
        A multi-line string containing the pip command (using the current
        interpreter and the requirements.txt one directory above this
        module) followed by an explanation and a hint for portable installs.
    """
    module_dir = os.path.dirname(__file__)
    requirements_file = os.path.abspath(os.path.join(module_dir, '..', 'requirements.txt'))

    # Repeat "-s" in the suggested command only when this interpreter is
    # itself running with the user site directory disabled.
    user_site_flag = "-s " if sys.flags.no_user_site else ""

    return (
        f"Please install the updated requirements.txt file by running:\n"
        f"{sys.executable} {user_site_flag}-m pip install -r {requirements_file}\n\n"
        "This error is happening because the ComfyUI frontend is no longer shipped as part of the main repo but as a pip package instead.\n\n"
        "If you are on the portable package you can run: update\\update_comfyui.bat to solve this problem"
    )
|
||||||
|
|
||||||
try:
|
try:
|
||||||
import comfyui_frontend_package
|
import comfyui_frontend_package
|
||||||
except ImportError:
|
except ImportError:
|
||||||
# TODO: Remove the check after roll out of 0.3.16
|
# TODO: Remove the check after roll out of 0.3.16
|
||||||
req_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'requirements.txt'))
|
logging.error(f"\n\n********** ERROR ***********\n\ncomfyui-frontend-package is not installed. {frontend_install_warning_message()}\n********** ERROR **********\n")
|
||||||
logging.error(f"\n\n********** ERROR ***********\n\ncomfyui-frontend-package is not installed. Please install the updated requirements.txt file by running:\n{sys.executable} -s -m pip install -r {req_path}\n\nThis error is happening because the ComfyUI frontend is no longer shipped as part of the main repo but as a pip package instead.\n\nIf you are on the portable package you can run: update\\update_comfyui.bat to solve this problem\n********** ERROR **********\n")
|
|
||||||
exit(-1)
|
exit(-1)
|
||||||
|
|
||||||
|
|
||||||
|
try:
    # Turn "X.Y.Z" into a tuple of ints so versions compare numerically
    # rather than lexicographically.
    frontend_version = tuple(map(int, comfyui_frontend_package.__version__.split(".")))
except Exception:
    # Best-effort: a missing or non-numeric __version__ falls back to the
    # lowest possible version. Catching Exception (instead of a bare except)
    # lets SystemExit and KeyboardInterrupt propagate.
    frontend_version = (0,)
|
||||||
|
|
||||||
REQUEST_TIMEOUT = 10 # seconds
|
REQUEST_TIMEOUT = 10 # seconds
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -42,7 +42,7 @@ class HunyuanVideoTokenizer:
|
|||||||
self.llama_template = """<|start_header_id|>system<|end_header_id|>\n\nDescribe the video by detailing the following aspects: 1. The main content and theme of the video.2. The color, shape, size, texture, quantity, text, and spatial relationships of the objects.3. Actions, events, behaviors temporal relationships, physical movement changes of the objects.4. background environment, light, style and atmosphere.5. camera angles, movements, and transitions used in the video:<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|>""" # 95 tokens
|
self.llama_template = """<|start_header_id|>system<|end_header_id|>\n\nDescribe the video by detailing the following aspects: 1. The main content and theme of the video.2. The color, shape, size, texture, quantity, text, and spatial relationships of the objects.3. Actions, events, behaviors temporal relationships, physical movement changes of the objects.4. background environment, light, style and atmosphere.5. camera angles, movements, and transitions used in the video:<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|>""" # 95 tokens
|
||||||
self.llama = LLAMA3Tokenizer(embedding_directory=embedding_directory, min_length=1)
|
self.llama = LLAMA3Tokenizer(embedding_directory=embedding_directory, min_length=1)
|
||||||
|
|
||||||
def tokenize_with_weights(self, text, return_word_ids=False, llama_template=None, image_embeds=None, **kwargs):
|
def tokenize_with_weights(self, text, return_word_ids=False, llama_template=None, image_embeds=None, image_interleave=1, **kwargs):
|
||||||
out = {}
|
out = {}
|
||||||
out["l"] = self.clip_l.tokenize_with_weights(text, return_word_ids)
|
out["l"] = self.clip_l.tokenize_with_weights(text, return_word_ids)
|
||||||
|
|
||||||
@ -56,7 +56,7 @@ class HunyuanVideoTokenizer:
|
|||||||
for i in range(len(r)):
|
for i in range(len(r)):
|
||||||
if r[i][0] == 128257:
|
if r[i][0] == 128257:
|
||||||
if image_embeds is not None and embed_count < image_embeds.shape[0]:
|
if image_embeds is not None and embed_count < image_embeds.shape[0]:
|
||||||
r[i] = ({"type": "embedding", "data": image_embeds[embed_count], "original_type": "image"},) + r[i][1:]
|
r[i] = ({"type": "embedding", "data": image_embeds[embed_count], "original_type": "image", "image_interleave": image_interleave},) + r[i][1:]
|
||||||
embed_count += 1
|
embed_count += 1
|
||||||
out["llama"] = llama_text_tokens
|
out["llama"] = llama_text_tokens
|
||||||
return out
|
return out
|
||||||
@ -92,10 +92,10 @@ class HunyuanVideoClipModel(torch.nn.Module):
|
|||||||
llama_out, llama_pooled, llama_extra_out = self.llama.encode_token_weights(token_weight_pairs_llama)
|
llama_out, llama_pooled, llama_extra_out = self.llama.encode_token_weights(token_weight_pairs_llama)
|
||||||
|
|
||||||
template_end = 0
|
template_end = 0
|
||||||
image_start = None
|
extra_template_end = 0
|
||||||
image_end = None
|
|
||||||
extra_sizes = 0
|
extra_sizes = 0
|
||||||
user_end = 9999999999999
|
user_end = 9999999999999
|
||||||
|
images = []
|
||||||
|
|
||||||
tok_pairs = token_weight_pairs_llama[0]
|
tok_pairs = token_weight_pairs_llama[0]
|
||||||
for i, v in enumerate(tok_pairs):
|
for i, v in enumerate(tok_pairs):
|
||||||
@ -112,22 +112,28 @@ class HunyuanVideoClipModel(torch.nn.Module):
|
|||||||
else:
|
else:
|
||||||
if elem.get("original_type") == "image":
|
if elem.get("original_type") == "image":
|
||||||
elem_size = elem.get("data").shape[0]
|
elem_size = elem.get("data").shape[0]
|
||||||
if image_start is None:
|
if template_end > 0:
|
||||||
|
if user_end == -1:
|
||||||
|
extra_template_end += elem_size - 1
|
||||||
|
else:
|
||||||
image_start = i + extra_sizes
|
image_start = i + extra_sizes
|
||||||
image_end = i + elem_size + extra_sizes
|
image_end = i + elem_size + extra_sizes
|
||||||
extra_sizes += elem_size - 1
|
images.append((image_start, image_end, elem.get("image_interleave", 1)))
|
||||||
|
extra_sizes += elem_size - 1
|
||||||
|
|
||||||
if llama_out.shape[1] > (template_end + 2):
|
if llama_out.shape[1] > (template_end + 2):
|
||||||
if tok_pairs[template_end + 1][0] == 271:
|
if tok_pairs[template_end + 1][0] == 271:
|
||||||
template_end += 2
|
template_end += 2
|
||||||
llama_output = llama_out[:, template_end + extra_sizes:user_end + extra_sizes]
|
llama_output = llama_out[:, template_end + extra_sizes:user_end + extra_sizes + extra_template_end]
|
||||||
llama_extra_out["attention_mask"] = llama_extra_out["attention_mask"][:, template_end + extra_sizes:user_end + extra_sizes]
|
llama_extra_out["attention_mask"] = llama_extra_out["attention_mask"][:, template_end + extra_sizes:user_end + extra_sizes + extra_template_end]
|
||||||
if llama_extra_out["attention_mask"].sum() == torch.numel(llama_extra_out["attention_mask"]):
|
if llama_extra_out["attention_mask"].sum() == torch.numel(llama_extra_out["attention_mask"]):
|
||||||
llama_extra_out.pop("attention_mask") # attention mask is useless if no masked elements
|
llama_extra_out.pop("attention_mask") # attention mask is useless if no masked elements
|
||||||
|
|
||||||
if image_start is not None:
|
if len(images) > 0:
|
||||||
image_output = llama_out[:, image_start: image_end]
|
out = []
|
||||||
llama_output = torch.cat([image_output[:, ::2], llama_output], dim=1)
|
for i in images:
|
||||||
|
out.append(llama_out[:, i[0]: i[1]: i[2]])
|
||||||
|
llama_output = torch.cat(out + [llama_output], dim=1)
|
||||||
|
|
||||||
l_out, l_pooled = self.clip_l.encode_token_weights(token_weight_pairs_l)
|
l_out, l_pooled = self.clip_l.encode_token_weights(token_weight_pairs_l)
|
||||||
return llama_output, l_pooled, llama_extra_out
|
return llama_output, l_pooled, llama_extra_out
|
||||||
|
|||||||
@ -57,14 +57,15 @@ class TextEncodeHunyuanVideo_ImageToVideo:
|
|||||||
"clip": ("CLIP", ),
|
"clip": ("CLIP", ),
|
||||||
"clip_vision_output": ("CLIP_VISION_OUTPUT", ),
|
"clip_vision_output": ("CLIP_VISION_OUTPUT", ),
|
||||||
"prompt": ("STRING", {"multiline": True, "dynamicPrompts": True}),
|
"prompt": ("STRING", {"multiline": True, "dynamicPrompts": True}),
|
||||||
|
"image_interleave": ("INT", {"default": 2, "min": 1, "max": 512, "tooltip": "How much the image influences things vs the text prompt. Higher number means more influence from the text prompt."}),
|
||||||
}}
|
}}
|
||||||
RETURN_TYPES = ("CONDITIONING",)
|
RETURN_TYPES = ("CONDITIONING",)
|
||||||
FUNCTION = "encode"
|
FUNCTION = "encode"
|
||||||
|
|
||||||
CATEGORY = "advanced/conditioning"
|
CATEGORY = "advanced/conditioning"
|
||||||
|
|
||||||
def encode(self, clip, clip_vision_output, prompt):
|
def encode(self, clip, clip_vision_output, prompt, image_interleave):
|
||||||
tokens = clip.tokenize(prompt, llama_template=PROMPT_TEMPLATE_ENCODE_VIDEO_I2V, image_embeds=clip_vision_output.mm_projected)
|
tokens = clip.tokenize(prompt, llama_template=PROMPT_TEMPLATE_ENCODE_VIDEO_I2V, image_embeds=clip_vision_output.mm_projected, image_interleave=image_interleave)
|
||||||
return (clip.encode_from_tokens_scheduled(tokens), )
|
return (clip.encode_from_tokens_scheduled(tokens), )
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
20
main.py
20
main.py
@ -139,6 +139,7 @@ from server import BinaryEventTypes
|
|||||||
import nodes
|
import nodes
|
||||||
import comfy.model_management
|
import comfy.model_management
|
||||||
import comfyui_version
|
import comfyui_version
|
||||||
|
import app.frontend_management
|
||||||
|
|
||||||
|
|
||||||
def cuda_malloc_warning():
|
def cuda_malloc_warning():
|
||||||
@ -292,12 +293,29 @@ def start_comfyui(asyncio_loop=None):
|
|||||||
return asyncio_loop, prompt_server, start_all
|
return asyncio_loop, prompt_server, start_all
|
||||||
|
|
||||||
|
|
||||||
|
def warn_frontend_version(frontend_version):
    """Warn when the installed frontend is older than requirements.txt asks for.

    Args:
        frontend_version: Installed frontend version as a tuple of ints.

    The check is best-effort: if requirements.txt cannot be read or parsed,
    no warning is emitted and the failure is only logged at debug level.
    """
    try:
        required_frontend = (0,)
        req_path = os.path.join(os.path.dirname(__file__), 'requirements.txt')
        with open(req_path, 'r', encoding='utf-8') as f:
            # Look for the frontend pin explicitly instead of trusting it to
            # be the first line, so reordering requirements.txt cannot make
            # us compare against another package's version.
            for line in f:
                requirement = line.strip()
                if requirement.startswith('comfyui-frontend-package'):
                    required_frontend = tuple(map(int, requirement.split('=')[-1].split('.')))
                    break
        if frontend_version < required_frontend:
            logging.warning("________________________________________________________________________\nWARNING WARNING WARNING WARNING WARNING\n\nInstalled frontend version {} is lower than the recommended version {}.\n\n{}\n________________________________________________________________________".format('.'.join(map(str, frontend_version)), '.'.join(map(str, required_frontend)), app.frontend_management.frontend_install_warning_message()))
    except Exception:
        # Never let this advisory check block startup, but leave a trace for
        # debugging; SystemExit/KeyboardInterrupt are no longer swallowed.
        logging.debug("Could not compare the frontend version against requirements.txt", exc_info=True)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
# Running directly, just start ComfyUI.
|
# Running directly, just start ComfyUI.
|
||||||
logging.info("ComfyUI version: {}".format(comfyui_version.__version__))
|
logging.info("ComfyUI version: {}".format(comfyui_version.__version__))
|
||||||
|
frontend_version = app.frontend_management.frontend_version
|
||||||
|
logging.info("ComfyUI frontend version: {}".format('.'.join(map(str, frontend_version))))
|
||||||
|
|
||||||
event_loop, _, start_all_func = start_comfyui()
|
event_loop, _, start_all_func = start_comfyui()
|
||||||
try:
|
try:
|
||||||
event_loop.run_until_complete(start_all_func())
|
x = start_all_func()
|
||||||
|
warn_frontend_version(frontend_version)
|
||||||
|
event_loop.run_until_complete(x)
|
||||||
except KeyboardInterrupt:
|
except KeyboardInterrupt:
|
||||||
logging.info("\nStopped server")
|
logging.info("\nStopped server")
|
||||||
|
|
||||||
|
|||||||
@ -1,4 +1,4 @@
|
|||||||
comfyui-frontend-package==1.10.17
|
comfyui-frontend-package==1.11.8
|
||||||
torch
|
torch
|
||||||
torchsde
|
torchsde
|
||||||
torchvision
|
torchvision
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user