From a618f768e06d72acc5b97d78a9ae209cbae14654 Mon Sep 17 00:00:00 2001
From: comfyanonymous
Date: Sun, 29 Dec 2024 02:26:49 -0500
Subject: [PATCH] Auto reshape 2d to 3d latent for single image generation on
 video model.

---
 comfy/latent_formats.py | 6 ++++++
 comfy/sample.py         | 8 +++++---
 2 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/comfy/latent_formats.py b/comfy/latent_formats.py
index 40234b64..a50a70ae 100644
--- a/comfy/latent_formats.py
+++ b/comfy/latent_formats.py
@@ -3,6 +3,7 @@ import torch
 class LatentFormat:
     scale_factor = 1.0
     latent_channels = 4
+    latent_dimensions = 2
     latent_rgb_factors = None
     latent_rgb_factors_bias = None
     taesd_decoder_name = None
@@ -143,6 +144,7 @@ class SD3(LatentFormat):
 
 class StableAudio1(LatentFormat):
     latent_channels = 64
+    latent_dimensions = 1
 
 class Flux(SD3):
     latent_channels = 16
@@ -178,6 +180,7 @@ class Flux(SD3):
 
 class Mochi(LatentFormat):
     latent_channels = 12
+    latent_dimensions = 3
 
     def __init__(self):
         self.scale_factor = 1.0
@@ -219,6 +222,8 @@ class Mochi(LatentFormat):
 
 class LTXV(LatentFormat):
     latent_channels = 128
+    latent_dimensions = 3
+
     def __init__(self):
         self.latent_rgb_factors = [
             [ 1.1202e-02, -6.3815e-04, -1.0021e-02],
@@ -355,6 +360,7 @@ class LTXV(LatentFormat):
 
 class HunyuanVideo(LatentFormat):
     latent_channels = 16
+    latent_dimensions = 3
     scale_factor = 0.476986
     latent_rgb_factors = [
         [-0.0395, -0.0331, 0.0445],
diff --git a/comfy/sample.py b/comfy/sample.py
index 9974e065..be5a7e24 100644
--- a/comfy/sample.py
+++ b/comfy/sample.py
@@ -25,9 +25,11 @@ def prepare_noise(latent_image, seed, noise_inds=None):
     return noises
 
 def fix_empty_latent_channels(model, latent_image):
-    latent_channels = model.get_model_object("latent_format").latent_channels #Resize the empty latent image so it has the right number of channels
-    if latent_channels != latent_image.shape[1] and torch.count_nonzero(latent_image) == 0:
-        latent_image = comfy.utils.repeat_to_batch_size(latent_image, latent_channels, dim=1)
+    latent_format = model.get_model_object("latent_format") #Resize the empty latent image so it has the right number of channels
+    if latent_format.latent_channels != latent_image.shape[1] and torch.count_nonzero(latent_image) == 0:
+        latent_image = comfy.utils.repeat_to_batch_size(latent_image, latent_format.latent_channels, dim=1)
+    if latent_format.latent_dimensions == 3 and latent_image.ndim == 4:
+        latent_image = latent_image.unsqueeze(2)
     return latent_image
 
 def prepare_sampling(model, noise_shape, positive, negative, noise_mask):