import nodes import node_helpers import torch import comfy.model_management class CLIPTextEncodeHunyuanDiT: @classmethod def INPUT_TYPES(s): return {"required": { "clip": ("CLIP", ), "bert": ("STRING", {"multiline": True, "dynamicPrompts": True}), "mt5xl": ("STRING", {"multiline": True, "dynamicPrompts": True}), }} RETURN_TYPES = ("CONDITIONING",) FUNCTION = "encode" CATEGORY = "advanced/conditioning" def encode(self, clip, bert, mt5xl): tokens = clip.tokenize(bert) tokens["mt5xl"] = clip.tokenize(mt5xl)["mt5xl"] return (clip.encode_from_tokens_scheduled(tokens), ) class EmptyHunyuanLatentVideo: @classmethod def INPUT_TYPES(s): return {"required": { "width": ("INT", {"default": 848, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}), "height": ("INT", {"default": 480, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}), "length": ("INT", {"default": 25, "min": 1, "max": nodes.MAX_RESOLUTION, "step": 4}), "batch_size": ("INT", {"default": 1, "min": 1, "max": 4096})}} RETURN_TYPES = ("LATENT",) FUNCTION = "generate" CATEGORY = "latent/video" def generate(self, width, height, length, batch_size=1): latent = torch.zeros([batch_size, 16, ((length - 1) // 4) + 1, height // 8, width // 8], device=comfy.model_management.intermediate_device()) return ({"samples":latent}, ) PROMPT_TEMPLATE_ENCODE_VIDEO_I2V = ( "<|start_header_id|>system<|end_header_id|>\n\n\nDescribe the video by detailing the following aspects according to the reference image: " "1. The main content and theme of the video." "2. The color, shape, size, texture, quantity, text, and spatial relationships of the objects." "3. Actions, events, behaviors temporal relationships, physical movement changes of the objects." "4. background environment, light, style and atmosphere." "5. camera angles, movements, and transitions used in the video:<|eot_id|>\n\n" "<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|>" "<|start_header_id|>assistant<|end_header_id|>\n\n" ) class TextEncodeHunyuanVideo_ImageToVideo: @classmethod def INPUT_TYPES(s): return {"required": { "clip": ("CLIP", ), "clip_vision_output": ("CLIP_VISION_OUTPUT", ), "prompt": ("STRING", {"multiline": True, "dynamicPrompts": True}), "image_interleave": ("INT", {"default": 2, "min": 1, "max": 512, "tooltip": "How much the image influences things vs the text prompt. Higher number means more influence from the text prompt."}), }} RETURN_TYPES = ("CONDITIONING",) FUNCTION = "encode" CATEGORY = "advanced/conditioning" def encode(self, clip, clip_vision_output, prompt, image_interleave): tokens = clip.tokenize(prompt, llama_template=PROMPT_TEMPLATE_ENCODE_VIDEO_I2V, image_embeds=clip_vision_output.mm_projected, image_interleave=image_interleave) return (clip.encode_from_tokens_scheduled(tokens), ) class HunyuanImageToVideo: @classmethod def INPUT_TYPES(s): return {"required": {"positive": ("CONDITIONING", ), "vae": ("VAE", ), "width": ("INT", {"default": 848, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}), "height": ("INT", {"default": 480, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}), "length": ("INT", {"default": 53, "min": 1, "max": nodes.MAX_RESOLUTION, "step": 4}), "batch_size": ("INT", {"default": 1, "min": 1, "max": 4096}), "guidance_type": (["v1 (concat)", "v2 (replace)"], ) }, "optional": {"start_image": ("IMAGE", ), }} RETURN_TYPES = ("CONDITIONING", "LATENT") RETURN_NAMES = ("positive", "latent") FUNCTION = "encode" CATEGORY = "conditioning/video_models" def encode(self, positive, vae, width, height, length, batch_size, guidance_type, start_image=None): latent = torch.zeros([batch_size, 16, ((length - 1) // 4) + 1, height // 8, width // 8], device=comfy.model_management.intermediate_device()) out_latent = {} if start_image is not None: start_image = comfy.utils.common_upscale(start_image[:length, :, :, :3].movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1) concat_latent_image = vae.encode(start_image) mask = torch.ones((1, 1, latent.shape[2], concat_latent_image.shape[-2], concat_latent_image.shape[-1]), device=start_image.device, dtype=start_image.dtype) mask[:, :, :((start_image.shape[0] - 1) // 4) + 1] = 0.0 if guidance_type == "v1 (concat)": cond = {"concat_latent_image": concat_latent_image, "concat_mask": mask} else: cond = {'guiding_frame_index': 0} latent[:, :, :concat_latent_image.shape[2]] = concat_latent_image out_latent["noise_mask"] = mask positive = node_helpers.conditioning_set_values(positive, cond) out_latent["samples"] = latent return (positive, out_latent) NODE_CLASS_MAPPINGS = { "CLIPTextEncodeHunyuanDiT": CLIPTextEncodeHunyuanDiT, "TextEncodeHunyuanVideo_ImageToVideo": TextEncodeHunyuanVideo_ImageToVideo, "EmptyHunyuanLatentVideo": EmptyHunyuanLatentVideo, "HunyuanImageToVideo": HunyuanImageToVideo, }