from comfy_api.latest import ComfyExtension, io from typing_extensions import override class TextGenerate(io.ComfyNode): @classmethod def define_schema(cls): # Define dynamic combo options for sampling mode sampling_options = [ io.DynamicCombo.Option( key="on", inputs=[ io.Float.Input("temperature", default=0.7, min=0.01, max=2.0, step=0.000001), io.Int.Input("top_k", default=64, min=0, max=1000), io.Float.Input("top_p", default=0.95, min=0.0, max=1.0, step=0.01), io.Float.Input("min_p", default=0.05, min=0.0, max=1.0, step=0.01), io.Float.Input("repetition_penalty", default=1.05, min=0.0, max=5.0, step=0.01), io.Int.Input("seed", default=0, min=0, max=0xffffffffffffffff), io.Float.Input("presence_penalty", optional=True, default=0.0, min=0.0, max=5.0, step=0.01), ] ), io.DynamicCombo.Option( key="off", inputs=[] ), ] return io.Schema( node_id="TextGenerate", category="textgen", search_aliases=["LLM", "gemma"], inputs=[ io.Clip.Input("clip"), io.String.Input("prompt", multiline=True, dynamic_prompts=True, default=""), io.Image.Input("image", optional=True), io.Image.Input("video", optional=True, tooltip="Video frames as image batch (1 FPS recommended)."), io.Audio.Input("audio", optional=True), io.Int.Input("max_length", default=256, min=1, max=2048), io.DynamicCombo.Input("sampling_mode", options=sampling_options, display_name="Sampling Mode"), io.Boolean.Input("thinking", optional=True, default=False, tooltip="Operate in thinking mode if the model supports it."), ], outputs=[ io.String.Output(display_name="generated_text"), ], ) @classmethod def execute(cls, clip, prompt, max_length, sampling_mode, image=None, video=None, audio=None, thinking=False) -> io.NodeOutput: tokens = clip.tokenize(prompt, image=image, video=video, audio=audio, skip_template=False, min_length=1, thinking=thinking) # Get sampling parameters from dynamic combo do_sample = sampling_mode.get("sampling_mode") == "on" temperature = sampling_mode.get("temperature", 1.0) top_k = sampling_mode.get("top_k", 50) top_p = sampling_mode.get("top_p", 1.0) min_p = sampling_mode.get("min_p", 0.0) seed = sampling_mode.get("seed", None) repetition_penalty = sampling_mode.get("repetition_penalty", 1.0) presence_penalty = sampling_mode.get("presence_penalty", 0.0) generated_ids = clip.generate( tokens, do_sample=do_sample, max_length=max_length, temperature=temperature, top_k=top_k, top_p=top_p, min_p=min_p, repetition_penalty=repetition_penalty, presence_penalty=presence_penalty, seed=seed ) generated_text = clip.decode(generated_ids, skip_special_tokens=not thinking) if thinking: # Translate Gemma4 thinking channel markers to standard / tags generated_text = generated_text.replace("<|channel>thought\n", "\n") generated_text = generated_text.replace("", "") # Strip remaining special tokens generated_text = generated_text.replace("", "").replace("", "").strip() return io.NodeOutput(generated_text) LTX2_T2V_SYSTEM_PROMPT = """You are a Creative Assistant. Given a user's raw input prompt describing a scene or concept, expand it into a detailed video generation prompt with specific visuals and integrated audio to guide a text-to-video model. #### Guidelines - Strictly follow all aspects of the user's raw input: include every element requested (style, visuals, motions, actions, camera movement, audio). - If the input is vague, invent concrete details: lighting, textures, materials, scene settings, etc. - For characters: describe gender, clothing, hair, expressions. DO NOT invent unrequested characters. - Use active language: present-progressive verbs ("is walking," "speaking"). If no action specified, describe natural movements. - Maintain chronological flow: use temporal connectors ("as," "then," "while"). - Audio layer: Describe complete soundscape (background audio, ambient sounds, SFX, speech/music when requested). Integrate sounds chronologically alongside actions. Be specific (e.g., "soft footsteps on tile"), not vague (e.g., "ambient sound is present"). - Speech (only when requested): - For ANY speech-related input (talking, conversation, singing, etc.), ALWAYS include exact words in quotes with voice characteristics (e.g., "The man says in an excited voice: 'You won't believe what I just saw!'"). - Specify language if not English and accent if relevant. - Style: Include visual style at the beginning: "Style: