From 67c7184b7432105d2db52cc19fc82ccd4aa06fb3 Mon Sep 17 00:00:00 2001 From: Andrew Kvochko Date: Mon, 10 Mar 2025 10:11:48 +0200 Subject: [PATCH] ltxv: relax frame_idx divisibility for single frames. (#7146) This commit relaxes the frame_idx divisibility constraint for single-frame conditionings. For single frames, the index can be arbitrary, while multi-frame conditionings (>= 9 frames) must still start at a frame index that is a multiple of 8. Co-authored-by: Andrew Kvochko --- comfy_extras/nodes_lt.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/comfy_extras/nodes_lt.py b/comfy_extras/nodes_lt.py index b608b940..fdc6c7c1 100644 --- a/comfy_extras/nodes_lt.py +++ b/comfy_extras/nodes_lt.py @@ -99,12 +99,13 @@ class LTXVAddGuide: "negative": ("CONDITIONING", ), "vae": ("VAE",), "latent": ("LATENT",), - "image": ("IMAGE", {"tooltip": "Image or video to condition the latent video on. Must be 8*n + 1 frames." \ + "image": ("IMAGE", {"tooltip": "Image or video to condition the latent video on. Must be 8*n + 1 frames." "If the video is not 8*n + 1 frames, it will be cropped to the nearest 8*n + 1 frames."}), "frame_idx": ("INT", {"default": 0, "min": -9999, "max": 9999, - "tooltip": "Frame index to start the conditioning at. Must be divisible by 8. " \ - "If a frame is not divisible by 8, it will be rounded down to the nearest multiple of 8. " \ - "Negative values are counted from the end of the video."}), + "tooltip": "Frame index to start the conditioning at. For single-frame images or " + "videos with 1-8 frames, any frame_idx value is acceptable. For videos with 9+ " + "frames, frame_idx must be divisible by 8, otherwise it will be rounded down to " + "the nearest multiple of 8. 
Negative values are counted from the end of the video."}), "strength": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 1.0, "step": 0.01}), } } @@ -127,12 +128,13 @@ class LTXVAddGuide: t = vae.encode(encode_pixels) return encode_pixels, t - def get_latent_index(self, cond, latent_length, frame_idx, scale_factors): + def get_latent_index(self, cond, latent_length, guide_length, frame_idx, scale_factors): time_scale_factor, _, _ = scale_factors _, num_keyframes = get_keyframe_idxs(cond) latent_count = latent_length - num_keyframes - frame_idx = frame_idx if frame_idx >= 0 else max((latent_count - 1) * 8 + 1 + frame_idx, 0) - frame_idx = frame_idx // time_scale_factor * time_scale_factor # frame index must be divisible by 8 + frame_idx = frame_idx if frame_idx >= 0 else max((latent_count - 1) * time_scale_factor + 1 + frame_idx, 0) + if guide_length > 1: + frame_idx = frame_idx // time_scale_factor * time_scale_factor # frame index must be divisible by 8 latent_idx = (frame_idx + time_scale_factor - 1) // time_scale_factor @@ -191,7 +193,7 @@ class LTXVAddGuide: _, _, latent_length, latent_height, latent_width = latent_image.shape image, t = self.encode(vae, latent_width, latent_height, image, scale_factors) - frame_idx, latent_idx = self.get_latent_index(positive, latent_length, frame_idx, scale_factors) + frame_idx, latent_idx = self.get_latent_index(positive, latent_length, len(image), frame_idx, scale_factors) assert latent_idx + t.shape[2] <= latent_length, "Conditioning frames exceed the length of the latent sequence." num_prefix_frames = min(self._num_prefix_frames, t.shape[2])