2023-06-09 16:24:24 +00:00
|
|
|
import torch
|
2024-03-10 15:37:08 +00:00
|
|
|
import logging
|
2024-01-03 19:27:11 +00:00
|
|
|
from comfy.ldm.modules.diffusionmodules.openaimodel import UNetModel, Timestep
|
2024-02-16 15:55:08 +00:00
|
|
|
from comfy.ldm.cascade.stage_c import StageC
|
2024-02-16 17:56:11 +00:00
|
|
|
from comfy.ldm.cascade.stage_b import StageB
|
2023-06-09 16:24:24 +00:00
|
|
|
from comfy.ldm.modules.encoders.noise_aug_modules import CLIPEmbeddingNoiseAugmentation
|
2024-01-03 19:27:11 +00:00
|
|
|
from comfy.ldm.modules.diffusionmodules.upscaling import ImageConcatWithNoiseAugmentation
|
2024-06-10 17:26:25 +00:00
|
|
|
from comfy.ldm.modules.diffusionmodules.mmdit import OpenAISignatureMMDITWrapper
|
2023-08-26 15:52:07 +00:00
|
|
|
import comfy.model_management
|
2023-10-25 03:31:12 +00:00
|
|
|
import comfy.conds
|
2023-12-11 23:24:44 +00:00
|
|
|
import comfy.ops
|
2023-07-17 05:22:12 +00:00
|
|
|
from enum import Enum
|
2023-06-26 16:21:07 +00:00
|
|
|
from . import utils
|
2024-06-10 17:26:25 +00:00
|
|
|
import comfy.latent_formats
|
2023-06-09 16:24:24 +00:00
|
|
|
|
2023-07-17 05:22:12 +00:00
|
|
|
class ModelType(Enum):
    """Identifiers for the prediction/sampling families handled by ``model_sampling``.

    The numeric values are part of the public surface; do not renumber.
    """

    # Paired with the EPS parameterization on the default discrete schedule.
    EPS = 1
    # Paired with V_PREDICTION on the default discrete schedule.
    V_PREDICTION = 2
    # Paired with V_PREDICTION on the continuous EDM schedule.
    V_PREDICTION_EDM = 3
    # Paired with EPS on the Stable Cascade schedule.
    STABLE_CASCADE = 4
    # Paired with the EDM parameterization on the continuous EDM schedule.
    EDM = 5
    # Paired with the CONST parameterization on the discrete flow schedule.
    FLOW = 6
|
2023-07-17 05:22:12 +00:00
|
|
|
|
2023-10-31 21:33:43 +00:00
|
|
|
|
2024-02-27 23:03:03 +00:00
|
|
|
from comfy.model_sampling import EPS, V_PREDICTION, EDM, ModelSamplingDiscrete, ModelSamplingContinuousEDM, StableCascadeSampling
|
2023-11-24 00:41:33 +00:00
|
|
|
|
2023-11-01 02:14:32 +00:00
|
|
|
|
2023-10-31 21:33:43 +00:00
|
|
|
def model_sampling(model_config, model_type):
    """Build the sampling helper object for ``model_type``.

    A schedule class (``s``) and a parameterization class (``c``) are picked
    from ``model_type`` and combined into a throwaway ``ModelSampling``
    subclass, which is instantiated with ``model_config``.

    Args:
        model_config: configuration object forwarded to the combined class.
        model_type: a ``ModelType`` member.

    Returns:
        An instance of the dynamically created ``ModelSampling`` class.

    Raises:
        ValueError: if ``model_type`` is not one of the handled members
            (previously this surfaced as an UnboundLocalError on ``c``).
    """
    # Default schedule; individual branches override it where needed.
    s = ModelSamplingDiscrete

    if model_type == ModelType.EPS:
        c = EPS
    elif model_type == ModelType.V_PREDICTION:
        c = V_PREDICTION
    elif model_type == ModelType.V_PREDICTION_EDM:
        c = V_PREDICTION
        s = ModelSamplingContinuousEDM
    elif model_type == ModelType.FLOW:
        c = comfy.model_sampling.CONST
        s = comfy.model_sampling.ModelSamplingDiscreteFlow
    elif model_type == ModelType.STABLE_CASCADE:
        c = EPS
        s = StableCascadeSampling
    elif model_type == ModelType.EDM:
        c = EDM
        s = ModelSamplingContinuousEDM
    else:
        # Fail loudly with a message that names the offending value.
        raise ValueError("unsupported model type: {}".format(model_type))

    class ModelSampling(s, c):
        pass

    return ModelSampling(model_config)
|
|
|
|
|
|
|
|
|
2023-06-09 16:24:24 +00:00
|
|
|
class BaseModel(torch.nn.Module):
    """Wrapper around a diffusion network plus its sampling/latent helpers.

    Holds the network (``diffusion_model``), the sampling helper built by
    ``model_sampling``, the latent format from the model config, and the
    conditioning plumbing (``extra_conds`` / ``encode_adm``) that subclasses
    specialise.
    """

    def __init__(self, model_config, model_type=ModelType.EPS, device=None, unet_model=UNetModel):
        """Create the wrapper and, unless disabled, the underlying network.

        Args:
            model_config: object exposing ``unet_config``, ``latent_format``
                and ``manual_cast_dtype`` (plus the state-dict hooks used by
                the load/save methods).
            model_type: ``ModelType`` member selecting the sampling helper.
            device: forwarded to the network constructor.
            unet_model: callable used to build the network.
        """
        super().__init__()

        unet_config = model_config.unet_config
        self.latent_format = model_config.latent_format
        self.model_config = model_config
        self.manual_cast_dtype = model_config.manual_cast_dtype

        # The network is skipped entirely when the config asks for it; in
        # that case ``self.diffusion_model`` is never set by this method.
        if not unet_config.get("disable_unet_model_creation", False):
            if self.manual_cast_dtype is not None:
                operations = comfy.ops.manual_cast
            else:
                operations = comfy.ops.disable_weight_init
            self.diffusion_model = unet_model(**unet_config, device=device, operations=operations)
        self.model_type = model_type
        self.model_sampling = model_sampling(model_config, model_type)

        # A missing "adm_in_channels" entry is normalised to 0.
        self.adm_channels = unet_config.get("adm_in_channels", None)
        if self.adm_channels is None:
            self.adm_channels = 0

        # Empty until set_inpaint() is called.
        self.concat_keys = ()
        logging.info("model_type {}".format(model_type.name))
        logging.debug("adm {}".format(self.adm_channels))

    def apply_model(self, x, t, c_concat=None, c_crossattn=None, control=None, transformer_options=None, **kwargs):
        """Run the network once and convert its output to a denoised result.

        Args:
            x: noisy input handed to ``model_sampling.calculate_input``.
            t: sigma value(s); also converted to a timestep for the network.
            c_concat: optional tensor concatenated to the input on dim 1.
            c_crossattn: optional context tensor; may be ``None``.
            control: forwarded to the network unchanged.
            transformer_options: dict forwarded to the network; a fresh empty
                dict is used when omitted so calls never share state.
            **kwargs: extra conditioning; entries that expose ``dtype`` and
                are not int/long are cast to the compute dtype.

        Returns:
            The value of ``model_sampling.calculate_denoised`` for this step.
        """
        if transformer_options is None:
            # Avoid a shared mutable default: the dict is handed to the
            # network, which is free to write into it.
            transformer_options = {}

        sigma = t
        xc = self.model_sampling.calculate_input(sigma, x)
        if c_concat is not None:
            xc = torch.cat([xc] + [c_concat], dim=1)

        context = c_crossattn
        dtype = self.get_dtype()

        # A manual cast dtype, when configured, takes precedence over the
        # dtype reported by the network.
        if self.manual_cast_dtype is not None:
            dtype = self.manual_cast_dtype

        xc = xc.to(dtype)
        t = self.model_sampling.timestep(t).float()
        # c_crossattn defaults to None, so only cast when there is a tensor.
        if context is not None:
            context = context.to(dtype)

        extra_conds = {}
        for o in kwargs:
            extra = kwargs[o]
            if hasattr(extra, "dtype"):
                # Integer tensors are left as they are.
                if extra.dtype != torch.int and extra.dtype != torch.long:
                    extra = extra.to(dtype)
            extra_conds[o] = extra

        model_output = self.diffusion_model(xc, t, context=context, control=control, transformer_options=transformer_options, **extra_conds).float()
        return self.model_sampling.calculate_denoised(sigma, model_output, x)

    def get_dtype(self):
        """Return the dtype reported by the underlying network."""
        return self.diffusion_model.dtype

    def is_adm(self):
        """Return True when the config declared a positive ``adm_in_channels``."""
        return self.adm_channels > 0

    def encode_adm(self, **kwargs):
        """Return the ``y`` conditioning; the base model has none (``None``)."""
        return None

    def extra_conds(self, **kwargs):
        """Assemble the conditioning dict for one sampling call.

        Reads (all optional unless noted): ``concat_mask``/``denoise_mask``,
        ``concat_latent_image``/``latent_image``, ``noise``, ``device``
        (required when ``concat_keys`` is non-empty), ``cross_attn``,
        ``cross_attn_controlnet`` and ``noise_concat``.

        Returns:
            dict that may contain ``c_concat``, ``y``, ``c_crossattn`` and
            ``crossattn_controlnet`` entries wrapped in ``comfy.conds`` types.
        """
        out = {}
        if len(self.concat_keys) > 0:
            cond_concat = []
            denoise_mask = kwargs.get("concat_mask", kwargs.get("denoise_mask", None))
            concat_latent_image = kwargs.get("concat_latent_image", None)
            if concat_latent_image is None:
                concat_latent_image = kwargs.get("latent_image", None)
            else:
                # Only an explicitly supplied concat image goes through
                # process_latent_in; the "latent_image" fallback is used as is.
                concat_latent_image = self.process_latent_in(concat_latent_image)

            noise = kwargs.get("noise", None)
            device = kwargs["device"]

            if concat_latent_image.shape[1:] != noise.shape[1:]:
                concat_latent_image = utils.common_upscale(concat_latent_image, noise.shape[-1], noise.shape[-2], "bilinear", "center")

            concat_latent_image = utils.resize_to_batch_size(concat_latent_image, noise.shape[0])

            if denoise_mask is not None:
                # A mask with the same rank as the noise keeps only its first
                # channel before being reshaped to a single-channel batch.
                if len(denoise_mask.shape) == len(noise.shape):
                    denoise_mask = denoise_mask[:,:1]

                denoise_mask = denoise_mask.reshape((-1, 1, denoise_mask.shape[-2], denoise_mask.shape[-1]))
                if denoise_mask.shape[-2:] != noise.shape[-2:]:
                    denoise_mask = utils.common_upscale(denoise_mask, noise.shape[-1], noise.shape[-2], "bilinear", "center")
                denoise_mask = utils.resize_to_batch_size(denoise_mask.round(), noise.shape[0])

            for ck in self.concat_keys:
                if denoise_mask is not None:
                    if ck == "mask":
                        cond_concat.append(denoise_mask.to(device))
                    elif ck == "masked_image":
                        cond_concat.append(concat_latent_image.to(device)) #NOTE: the latent_image should be masked by the mask in pixel space
                else:
                    # No mask supplied: use an all-ones mask and the blank
                    # inpaint image installed by set_inpaint().
                    if ck == "mask":
                        cond_concat.append(torch.ones_like(noise)[:,:1])
                    elif ck == "masked_image":
                        cond_concat.append(self.blank_inpaint_image_like(noise))
            data = torch.cat(cond_concat, dim=1)
            out['c_concat'] = comfy.conds.CONDNoiseShape(data)

        adm = self.encode_adm(**kwargs)
        if adm is not None:
            out['y'] = comfy.conds.CONDRegular(adm)

        cross_attn = kwargs.get("cross_attn", None)
        if cross_attn is not None:
            out['c_crossattn'] = comfy.conds.CONDCrossAttn(cross_attn)

        cross_attn_cnet = kwargs.get("cross_attn_controlnet", None)
        if cross_attn_cnet is not None:
            out['crossattn_controlnet'] = comfy.conds.CONDCrossAttn(cross_attn_cnet)

        # An explicit "noise_concat" replaces any c_concat built above.
        c_concat = kwargs.get("noise_concat", None)
        if c_concat is not None:
            out['c_concat'] = comfy.conds.CONDNoiseShape(c_concat)

        return out

    def load_model_weights(self, sd, unet_prefix=""):
        """Load the network weights found under ``unet_prefix`` in ``sd``.

        Matching keys are popped from ``sd`` (the caller's dict is mutated),
        stripped of the prefix, passed through the config's
        ``process_unet_state_dict`` and loaded non-strictly. Missing and
        unexpected keys are logged as warnings.

        Returns:
            ``self``.
        """
        to_load = {}
        keys = list(sd.keys())
        for k in keys:
            if k.startswith(unet_prefix):
                to_load[k[len(unet_prefix):]] = sd.pop(k)

        to_load = self.model_config.process_unet_state_dict(to_load)
        m, u = self.diffusion_model.load_state_dict(to_load, strict=False)
        if len(m) > 0:
            logging.warning("unet missing: {}".format(m))

        if len(u) > 0:
            logging.warning("unet unexpected: {}".format(u))
        del to_load
        return self

    def process_latent_in(self, latent):
        """Delegate to ``latent_format.process_in``."""
        return self.latent_format.process_in(latent)

    def process_latent_out(self, latent):
        """Delegate to ``latent_format.process_out``."""
        return self.latent_format.process_out(latent)

    def state_dict_for_saving(self, clip_state_dict=None, vae_state_dict=None, clip_vision_state_dict=None):
        """Build a single state dict containing the network and optional extras.

        Each optional state dict is passed through the matching
        ``process_*_state_dict_for_saving`` hook of the model config. When the
        network dtype is float16 the extras are converted to float16 too.
        V-prediction models get an empty ``"v_pred"`` marker tensor.

        Returns:
            The processed network state dict updated with all extras.
        """
        extra_sds = []
        if clip_state_dict is not None:
            extra_sds.append(self.model_config.process_clip_state_dict_for_saving(clip_state_dict))
        if vae_state_dict is not None:
            extra_sds.append(self.model_config.process_vae_state_dict_for_saving(vae_state_dict))
        if clip_vision_state_dict is not None:
            extra_sds.append(self.model_config.process_clip_vision_state_dict_for_saving(clip_vision_state_dict))

        unet_state_dict = self.diffusion_model.state_dict()
        unet_state_dict = self.model_config.process_unet_state_dict_for_saving(unet_state_dict)

        if self.get_dtype() == torch.float16:
            # Lazy map: conversion happens while the extras are merged below.
            extra_sds = map(lambda sd: utils.convert_sd_to(sd, torch.float16), extra_sds)

        if self.model_type == ModelType.V_PREDICTION:
            unet_state_dict["v_pred"] = torch.tensor([])

        for sd in extra_sds:
            unet_state_dict.update(sd)

        return unet_state_dict

    def set_inpaint(self):
        """Switch the model to inpaint conditioning.

        Sets ``concat_keys`` to ``("mask", "masked_image")`` and installs
        ``blank_inpaint_image_like``, which ``extra_conds`` uses when no mask
        is supplied.
        """
        self.concat_keys = ("mask", "masked_image")
        def blank_inpaint_image_like(latent_image):
            blank_image = torch.ones_like(latent_image)
            # these are the values for "zero" in pixel space translated to latent space
            blank_image[:,0] *= 0.8223
            blank_image[:,1] *= -0.6876
            blank_image[:,2] *= 0.6364
            blank_image[:,3] *= 0.1380
            return blank_image
        self.blank_inpaint_image_like = blank_inpaint_image_like

    def memory_required(self, input_shape):
        """Estimate the memory needed for ``input_shape``.

        The estimate scales with ``input_shape[0] * input_shape[2] *
        input_shape[3]``; when xformers or pytorch flash attention is
        enabled it also scales with the size of the compute dtype.
        Units are presumably bytes (note the 1024 * 1024 factor) — confirm
        against the caller.
        """
        if comfy.model_management.xformers_enabled() or comfy.model_management.pytorch_attention_flash_attention():
            dtype = self.get_dtype()
            if self.manual_cast_dtype is not None:
                dtype = self.manual_cast_dtype
            #TODO: this needs to be tweaked
            area = input_shape[0] * input_shape[2] * input_shape[3]
            return (area * comfy.model_management.dtype_size(dtype) / 50) * (1024 * 1024)
        else:
            #TODO: this formula might be too aggressive since I tweaked the sub-quad and split algorithms to use less memory.
            area = input_shape[0] * input_shape[2] * input_shape[3]
            return (((area * 0.6) / 0.9) + 1024) * (1024 * 1024)
|
|
|
|
|
|
|
|
|
2024-01-14 22:25:21 +00:00
|
|
|
def unclip_adm(unclip_conditioning, device, noise_augmentor, noise_augment_merge=0.0, seed=None):
    """Turn unCLIP conditioning entries into a single ADM tensor.

    Every image embed of every entry is passed through ``noise_augmentor`` at
    the entry's noise level, concatenated with the returned noise-level
    embedding on dim 1 and scaled by the entry's ``"strength"``. When more
    than one embed was processed, the results are summed and the leading
    ``noise_augmentor.time_embed.dim`` columns are augmented once more at the
    ``noise_augment_merge`` level (this second pass does not receive ``seed``).

    Args:
        unclip_conditioning: iterable of dicts with ``"clip_vision_output"``,
            ``"strength"`` and ``"noise_augmentation"`` entries.
        device: device the embeds and noise levels are placed on.
        noise_augmentor: callable returning ``(augmented, level_embedding)``.
        noise_augment_merge: noise fraction used for the merge pass.
        seed: forwarded to the per-embed augmentation calls.

    Returns:
        The ADM tensor of the last (or merged) embed.
    """
    def level_for(fraction):
        # Map a 0..1 fraction onto the augmentor's integer noise levels.
        return round((noise_augmentor.max_noise_level - 1) * fraction)

    adm_inputs = []
    for entry in unclip_conditioning:
        for embed in entry["clip_vision_output"].image_embeds:
            strength = entry["strength"]
            fraction = entry["noise_augmentation"]
            level = level_for(fraction)
            c_adm, level_emb = noise_augmentor(
                embed.to(device),
                noise_level=torch.tensor([level], device=device),
                seed=seed,
            )
            adm_out = torch.cat((c_adm, level_emb), 1) * strength
            adm_inputs.append(adm_out)

    if len(adm_inputs) > 1:
        # Several embeds: sum them, then re-augment the embedding part once.
        adm_out = torch.stack(adm_inputs).sum(0)
        level = level_for(noise_augment_merge)
        c_adm, level_emb = noise_augmentor(
            adm_out[:, :noise_augmentor.time_embed.dim],
            noise_level=torch.tensor([level], device=device),
        )
        adm_out = torch.cat((c_adm, level_emb), 1)

    return adm_out
|
2023-06-23 06:14:12 +00:00
|
|
|
|
2023-06-09 16:24:24 +00:00
|
|
|
class SD21UNCLIP(BaseModel):
    """BaseModel variant whose ``y`` conditioning comes from unCLIP image embeds."""

    def __init__(self, model_config, noise_aug_config, model_type=ModelType.V_PREDICTION, device=None):
        """Build the base model and the noise augmentor from ``noise_aug_config``."""
        super().__init__(model_config, model_type, device=device)
        self.noise_augmentor = CLIPEmbeddingNoiseAugmentation(**noise_aug_config)

    def encode_adm(self, **kwargs):
        """Return the ADM tensor for this call.

        Without ``"unclip_conditioning"`` a ``(1, adm_channels)`` zero tensor
        is returned; otherwise the result of ``unclip_adm``. ``"device"`` is
        required in both cases.
        """
        conditioning = kwargs.get("unclip_conditioning", None)
        device = kwargs["device"]

        if conditioning is not None:
            merge_level = kwargs.get("unclip_noise_augment_merge", 0.05)
            # The augmentation seed is offset from the sampling seed by 10.
            aug_seed = kwargs.get("seed", 0) - 10
            return unclip_adm(conditioning, device, self.noise_augmentor, merge_level, aug_seed)

        return torch.zeros((1, self.adm_channels))
|
2023-06-11 08:01:18 +00:00
|
|
|
|
2023-08-18 06:39:23 +00:00
|
|
|
def sdxl_pooled(args, noise_augmentor):
    """Return the pooled conditioning for SDXL-style models.

    When ``args`` has an ``"unclip_conditioning"`` key the first 1280 columns
    of ``unclip_adm``'s result are returned; otherwise ``args["pooled_output"]``
    is returned unchanged.
    """
    if "unclip_conditioning" not in args:
        return args["pooled_output"]

    adm = unclip_adm(
        args.get("unclip_conditioning", None),
        args["device"],
        noise_augmentor,
        seed=args.get("seed", 0) - 10,
    )
    return adm[:,:1280]
|
|
|
|
|
2023-06-22 17:03:50 +00:00
|
|
|
class SDXLRefiner(BaseModel):
    """BaseModel variant conditioned on pooled output, size, crop and aesthetic score."""

    def __init__(self, model_config, model_type=ModelType.EPS, device=None):
        """Build the base model, a 256-wide ``Timestep`` embedder and the noise augmentor."""
        super().__init__(model_config, model_type, device=device)
        self.embedder = Timestep(256)
        augmentor_config = {
            "noise_schedule_config": {"timesteps": 1000, "beta_schedule": "squaredcos_cap_v2"},
            "timestep_dim": 1280,
        }
        self.noise_augmentor = CLIPEmbeddingNoiseAugmentation(**augmentor_config)

    def encode_adm(self, **kwargs):
        """Concatenate the pooled output with the embedded scalar conditions.

        Embedded, in order: height, width, crop_h, crop_w, aesthetic_score.
        The default aesthetic score is 2.5 for ``prompt_type == "negative"``
        and 6 otherwise.
        """
        clip_pooled = sdxl_pooled(kwargs, self.noise_augmentor)
        width = kwargs.get("width", 768)
        height = kwargs.get("height", 768)
        crop_w = kwargs.get("crop_w", 0)
        crop_h = kwargs.get("crop_h", 0)

        is_negative = kwargs.get("prompt_type", "") == "negative"
        default_score = 2.5 if is_negative else 6
        aesthetic_score = kwargs.get("aesthetic_score", default_score)

        scalars = (height, width, crop_h, crop_w, aesthetic_score)
        embedded = [self.embedder(torch.Tensor([value])) for value in scalars]

        # One flattened row, repeated for every row of the pooled output.
        flat = torch.flatten(torch.cat(embedded)).unsqueeze(dim=0).repeat(clip_pooled.shape[0], 1)
        return torch.cat((clip_pooled.to(flat.device), flat), dim=1)
|
|
|
|
|
|
|
|
class SDXL(BaseModel):
    """BaseModel variant conditioned on pooled output plus size, crop and target size."""

    def __init__(self, model_config, model_type=ModelType.EPS, device=None):
        """Build the base model, a 256-wide ``Timestep`` embedder and the noise augmentor."""
        super().__init__(model_config, model_type, device=device)
        self.embedder = Timestep(256)
        augmentor_config = {
            "noise_schedule_config": {"timesteps": 1000, "beta_schedule": "squaredcos_cap_v2"},
            "timestep_dim": 1280,
        }
        self.noise_augmentor = CLIPEmbeddingNoiseAugmentation(**augmentor_config)

    def encode_adm(self, **kwargs):
        """Concatenate the pooled output with the embedded scalar conditions.

        Embedded, in order: height, width, crop_h, crop_w, target_height,
        target_width. The target size defaults to the (possibly defaulted)
        width/height.
        """
        clip_pooled = sdxl_pooled(kwargs, self.noise_augmentor)
        width = kwargs.get("width", 768)
        height = kwargs.get("height", 768)
        crop_w = kwargs.get("crop_w", 0)
        crop_h = kwargs.get("crop_h", 0)
        target_width = kwargs.get("target_width", width)
        target_height = kwargs.get("target_height", height)

        scalars = (height, width, crop_h, crop_w, target_height, target_width)
        embedded = [self.embedder(torch.Tensor([value])) for value in scalars]

        # One flattened row, repeated for every row of the pooled output.
        flat = torch.flatten(torch.cat(embedded)).unsqueeze(dim=0).repeat(clip_pooled.shape[0], 1)
        return torch.cat((clip_pooled.to(flat.device), flat), dim=1)
|
2023-11-24 00:41:33 +00:00
|
|
|
|
|
|
|
class SVD_img2vid(BaseModel):
    """BaseModel variant for image-to-video conditioning."""

    def __init__(self, model_config, model_type=ModelType.V_PREDICTION_EDM, device=None):
        """Build the base model and a 256-wide ``Timestep`` embedder."""
        super().__init__(model_config, model_type, device=device)
        self.embedder = Timestep(256)

    def encode_adm(self, **kwargs):
        """Embed fps (minus one), motion bucket id and augmentation level into one row."""
        fps_id = kwargs.get("fps", 6) - 1
        motion_bucket_id = kwargs.get("motion_bucket_id", 127)
        augmentation = kwargs.get("augmentation_level", 0)

        embedded = [
            self.embedder(torch.Tensor([value]))
            for value in (fps_id, motion_bucket_id, augmentation)
        ]
        return torch.flatten(torch.cat(embedded)).unsqueeze(dim=0)

    def extra_conds(self, **kwargs):
        """Assemble the conditioning dict for one sampling call.

        Produces ``y`` (when ``encode_adm`` returns something), ``c_concat``
        from the concat latent image (zeros when absent), optional
        ``c_crossattn`` and ``time_context``, and ``num_video_frames`` taken
        from the batch size of the noise.
        """
        conds = {}
        adm = self.encode_adm(**kwargs)
        if adm is not None:
            conds['y'] = comfy.conds.CONDRegular(adm)

        concat_latent = kwargs.get("concat_latent_image", None)
        noise = kwargs.get("noise", None)
        # Not used below, but the lookup still fails early when the caller
        # omits "device", as before.
        device = kwargs["device"]

        if concat_latent is None:
            concat_latent = torch.zeros_like(noise)

        if concat_latent.shape[1:] != noise.shape[1:]:
            concat_latent = utils.common_upscale(concat_latent, noise.shape[-1], noise.shape[-2], "bilinear", "center")

        # NOTE: per the change note attached to this line, a list of 3 images
        # [0, 1, 2] used for a 6 frame video is now spread as
        # [0, 0, 1, 1, 2, 2] instead of being tiled as [0, 1, 2, 0, 1, 2].
        concat_latent = utils.resize_to_batch_size(concat_latent, noise.shape[0])

        conds['c_concat'] = comfy.conds.CONDNoiseShape(concat_latent)

        text_context = kwargs.get("cross_attn", None)
        if text_context is not None:
            conds['c_crossattn'] = comfy.conds.CONDCrossAttn(text_context)

        if "time_conditioning" in kwargs:
            conds["time_context"] = comfy.conds.CONDCrossAttn(kwargs["time_conditioning"])

        conds['num_video_frames'] = comfy.conds.CONDConstant(noise.shape[0])
        return conds
|
2023-12-18 08:18:40 +00:00
|
|
|
|
2024-03-18 14:04:51 +00:00
|
|
|
class SV3D_u(SVD_img2vid):
    """SVD_img2vid variant whose ADM carries only the augmentation level."""

    def encode_adm(self, **kwargs):
        """Embed ``augmentation_level`` (default 0) into a single flattened row."""
        augmentation = kwargs.get("augmentation_level", 0)

        level = torch.flatten(torch.Tensor([augmentation]))
        embedded = [self.embedder(level)]

        return torch.flatten(torch.cat(embedded)).unsqueeze(dim=0)
|
|
|
|
|
|
|
|
class SV3D_p(SVD_img2vid):
    """SVD_img2vid variant whose ADM also carries elevation and azimuth."""

    def __init__(self, model_config, model_type=ModelType.V_PREDICTION_EDM, device=None):
        """Build the parent model and an additional 512-wide ``Timestep`` embedder."""
        super().__init__(model_config, model_type, device=device)
        self.embedder_512 = Timestep(512)

    def encode_adm(self, **kwargs):
        """Embed augmentation level, elevation and azimuth, batched like the noise.

        Elevation and azimuth are given in degrees; elevation is embedded as
        ``90 - elevation``. Both are wrapped with ``fmod(360)`` and converted
        to radians before embedding.
        """
        augmentation = kwargs.get("augmentation_level", 0)
        elevation = kwargs.get("elevation", 0) #elevation and azimuth are in degrees here
        azimuth = kwargs.get("azimuth", 0)
        noise = kwargs.get("noise", None)

        augmentation_in = torch.flatten(torch.Tensor([augmentation]))
        elevation_in = torch.deg2rad(torch.fmod(torch.flatten(90 - torch.Tensor([elevation])), 360.0))
        azimuth_in = torch.deg2rad(torch.fmod(torch.flatten(torch.Tensor([azimuth])), 360.0))

        embedded = [
            self.embedder(augmentation_in),
            self.embedder_512(elevation_in),
            self.embedder_512(azimuth_in),
        ]

        batched = [utils.resize_to_batch_size(part, noise.shape[0]) for part in embedded]
        return torch.cat(batched, dim=1)
|
|
|
|
|
|
|
|
|
2023-12-18 08:18:40 +00:00
|
|
|
class Stable_Zero123(BaseModel):
    """BaseModel variant with a linear projection applied to the cross-attention input."""

    def __init__(self, model_config, model_type=ModelType.EPS, device=None, cc_projection_weight=None, cc_projection_bias=None):
        """Build the base model and a ``cc_projection`` layer from the given weights.

        The layer's in/out features are read from ``cc_projection_weight``'s
        shape, so a weight tensor must be supplied despite the ``None`` default.
        """
        super().__init__(model_config, model_type, device=device)
        out_features, in_features = cc_projection_weight.shape[0], cc_projection_weight.shape[1]
        self.cc_projection = comfy.ops.manual_cast.Linear(in_features, out_features, dtype=self.get_dtype(), device=device)
        self.cc_projection.weight.copy_(cc_projection_weight)
        self.cc_projection.bias.copy_(cc_projection_bias)

    def extra_conds(self, **kwargs):
        """Assemble ``c_concat`` and, when present, the projected ``c_crossattn``.

        ``cross_attn`` is passed through ``cc_projection`` only when its last
        dimension is not already 768.
        """
        conds = {}

        concat_latent = kwargs.get("concat_latent_image", None)
        noise = kwargs.get("noise", None)

        if concat_latent is None:
            concat_latent = torch.zeros_like(noise)

        if concat_latent.shape[1:] != noise.shape[1:]:
            concat_latent = utils.common_upscale(concat_latent, noise.shape[-1], noise.shape[-2], "bilinear", "center")

        concat_latent = utils.resize_to_batch_size(concat_latent, noise.shape[0])

        conds['c_concat'] = comfy.conds.CONDNoiseShape(concat_latent)

        context = kwargs.get("cross_attn", None)
        if context is None:
            return conds

        if context.shape[-1] != 768:
            context = self.cc_projection(context)
        conds['c_crossattn'] = comfy.conds.CONDCrossAttn(context)
        return conds
|
2024-01-03 08:30:39 +00:00
|
|
|
|
|
|
|
class SD_X4Upscaler(BaseModel):
    """BaseModel variant conditioned on a concatenated image and a noise level."""

    def __init__(self, model_config, model_type=ModelType.V_PREDICTION, device=None):
        """Build the base model and the image noise augmentor (max noise level 350)."""
        super().__init__(model_config, model_type, device=device)
        schedule = {"linear_start": 0.0001, "linear_end": 0.02}
        self.noise_augmentor = ImageConcatWithNoiseAugmentation(noise_schedule_config=schedule, max_noise_level=350)

    def extra_conds(self, **kwargs):
        """Assemble ``c_concat`` (the image) and ``y`` (the noise level).

        Requires ``"device"`` and ``"seed"``. The image defaults to the first
        three channels of a zero tensor shaped like the noise, and is only
        passed through the augmentor when ``noise_augmentation`` is positive.
        """
        conds = {}

        image = kwargs.get("concat_image", None)
        noise = kwargs.get("noise", None)
        noise_augment = kwargs.get("noise_augmentation", 0.0)
        device = kwargs["device"]
        # The augmentation seed is offset from the sampling seed by 10.
        seed = kwargs["seed"] - 10

        level_value = round((self.noise_augmentor.max_noise_level) * noise_augment)

        if image is None:
            image = torch.zeros_like(noise)[:,:3]

        if image.shape[1:] != noise.shape[1:]:
            image = utils.common_upscale(image.to(device), noise.shape[-1], noise.shape[-2], "bilinear", "center")

        noise_level = torch.tensor([level_value], device=device)
        if noise_augment > 0:
            image, noise_level = self.noise_augmentor(image.to(device), noise_level=noise_level, seed=seed)

        image = utils.resize_to_batch_size(image, noise.shape[0])

        conds['c_concat'] = comfy.conds.CONDNoiseShape(image)
        conds['y'] = comfy.conds.CONDRegular(noise_level)
        return conds
|
2024-02-16 15:55:08 +00:00
|
|
|
|
2024-03-31 05:25:16 +00:00
|
|
|
class IP2P:
    """Mixin supplying ``extra_conds`` for image-conditioned editing models.

    The host class must provide ``process_ip2p_image_in`` and ``encode_adm``.
    """

    def extra_conds(self, **kwargs):
        """Assemble ``c_concat`` from the concat latent image, plus ``y`` if any.

        Requires ``"device"``. A missing image is replaced by zeros shaped
        like the noise; the image is passed through ``process_ip2p_image_in``
        before being wrapped.
        """
        conds = {}

        image = kwargs.get("concat_latent_image", None)
        noise = kwargs.get("noise", None)
        device = kwargs["device"]

        if image is None:
            image = torch.zeros_like(noise)

        if image.shape[1:] != noise.shape[1:]:
            image = utils.common_upscale(image.to(device), noise.shape[-1], noise.shape[-2], "bilinear", "center")

        image = utils.resize_to_batch_size(image, noise.shape[0])

        processed = self.process_ip2p_image_in(image)
        conds['c_concat'] = comfy.conds.CONDNoiseShape(processed)

        adm = self.encode_adm(**kwargs)
        if adm is not None:
            conds['y'] = comfy.conds.CONDRegular(adm)
        return conds
|
|
|
|
|
|
|
|
class SD15_instructpix2pix(IP2P, BaseModel):
    """IP2P model on top of BaseModel; the concat image is used unchanged."""

    def __init__(self, model_config, model_type=ModelType.EPS, device=None):
        """Build the base model and install an identity image pre-processor."""
        super().__init__(model_config, model_type, device=device)

        def passthrough(image):
            return image

        self.process_ip2p_image_in = passthrough
|
|
|
|
|
|
|
|
class SDXL_instructpix2pix(IP2P, SDXL):
    """IP2P model on top of SDXL; image pre-processing depends on the model type."""

    def __init__(self, model_config, model_type=ModelType.EPS, device=None):
        """Build the SDXL model and choose the concat-image pre-processor.

        For ``V_PREDICTION_EDM`` the image goes through the SDXL latent
        format's ``process_in``; for every other type it is used unchanged.
        """
        super().__init__(model_config, model_type, device=device)

        def sdxl_process_in(image):
            # cosxl ip2p: a fresh SDXL latent format is created on every call.
            return comfy.latent_formats.SDXL().process_in(image)

        def passthrough(image):
            # diffusers ip2p
            return image

        use_latent_format = model_type == ModelType.V_PREDICTION_EDM
        self.process_ip2p_image_in = sdxl_process_in if use_latent_format else passthrough
|
2024-03-31 05:25:16 +00:00
|
|
|
|
|
|
|
|
2024-02-16 15:55:08 +00:00
|
|
|
class StableCascade_C(BaseModel):
    """BaseModel variant built on the ``StageC`` network."""

    def __init__(self, model_config, model_type=ModelType.STABLE_CASCADE, device=None):
        """Build the model with ``StageC`` and put the network in eval mode without grads."""
        super().__init__(model_config, model_type, device=device, unet_model=StageC)
        self.diffusion_model.eval().requires_grad_(False)

    def extra_conds(self, **kwargs):
        """Assemble the Stage C conditioning dict.

        Requires ``"pooled_output"`` (may be ``None``). ``clip_img`` is built
        from the weighted image embeds of ``"unclip_conditioning"`` when that
        key is present, otherwise it is a ``(1, 1, 768)`` zero tensor.
        ``sca`` and ``crp`` are single-element zero tensors.
        """
        conds = {}
        pooled = kwargs["pooled_output"]
        if pooled is not None:
            conds['clip_text_pooled'] = comfy.conds.CONDRegular(pooled)

        if "unclip_conditioning" not in kwargs:
            clip_img = torch.zeros((1, 1, 768))
        else:
            weighted = []
            for entry in kwargs["unclip_conditioning"]:
                strength = entry["strength"]
                weighted.append(entry["clip_vision_output"].image_embeds.unsqueeze(0) * strength)
            clip_img = torch.cat(weighted, dim=1)

        conds["clip_img"] = comfy.conds.CONDRegular(clip_img)
        conds["sca"] = comfy.conds.CONDRegular(torch.zeros((1,)))
        conds["crp"] = comfy.conds.CONDRegular(torch.zeros((1,)))

        text_context = kwargs.get("cross_attn", None)
        if text_context is not None:
            conds['clip_text'] = comfy.conds.CONDCrossAttn(text_context)
        return conds
|
|
|
|
|
2024-02-16 17:56:11 +00:00
|
|
|
|
|
|
|
class StableCascade_B(BaseModel):
    """BaseModel variant built on the ``StageB`` network."""

    def __init__(self, model_config, model_type=ModelType.STABLE_CASCADE, device=None):
        """Build the model with ``StageB`` and put the network in eval mode without grads."""
        super().__init__(model_config, model_type, device=device, unet_model=StageB)
        self.diffusion_model.eval().requires_grad_(False)

    def extra_conds(self, **kwargs):
        """Assemble the Stage B conditioning dict.

        Requires ``"pooled_output"`` (may be ``None``). ``effnet`` is the
        value stored under ``"stable_cascade_prior"`` when that key is
        present; otherwise a zero tensor derived from the noise shape is
        created. ``sca`` is a single-element zero tensor.
        """
        out = {}
        noise = kwargs.get("noise", None)

        clip_text_pooled = kwargs["pooled_output"]
        if clip_text_pooled is not None:
            out['clip'] = comfy.conds.CONDRegular(clip_text_pooled)

        if "stable_cascade_prior" in kwargs:
            # Use the supplied value as is; no placeholder tensor is
            # allocated in this case.
            prior = kwargs["stable_cascade_prior"]
        else:
            #size of prior doesn't really matter if zeros because it gets resized but I still want it to get batched
            prior = torch.zeros((1, 16, (noise.shape[2] * 4) // 42, (noise.shape[3] * 4) // 42), dtype=noise.dtype, layout=noise.layout, device=noise.device)

        out["effnet"] = comfy.conds.CONDRegular(prior)
        out["sca"] = comfy.conds.CONDRegular(torch.zeros((1,)))
        return out
|
2024-06-10 17:26:25 +00:00
|
|
|
|
|
|
|
|
|
|
|
class SD3(BaseModel):
    """BaseModel variant built on ``OpenAISignatureMMDITWrapper`` with flow sampling."""

    def __init__(self, model_config, model_type=ModelType.FLOW, device=None):
        """Build the model with ``OpenAISignatureMMDITWrapper`` as the network."""
        super().__init__(model_config, model_type, device=device, unet_model=OpenAISignatureMMDITWrapper)

    def encode_adm(self, **kwargs):
        """Return ``kwargs["pooled_output"]`` unchanged (the key is required)."""
        return kwargs["pooled_output"]

    def extra_conds(self, **kwargs):
        """Extend the base conditioning; ``c_crossattn`` is wrapped as ``CONDRegular``.

        The entry written by the base implementation for ``cross_attn`` is
        replaced here whenever ``cross_attn`` is present.
        """
        conds = super().extra_conds(**kwargs)
        text_context = kwargs.get("cross_attn", None)
        if text_context is None:
            return conds
        conds['c_crossattn'] = comfy.conds.CONDRegular(text_context)
        return conds

    def memory_required(self, input_shape):
        """Estimate the memory needed for ``input_shape``.

        Scales with ``input_shape[0] * input_shape[2] * input_shape[3]``;
        with xformers or pytorch flash attention enabled it also scales with
        the size of the compute dtype.
        """
        attention = comfy.model_management
        if not (attention.xformers_enabled() or attention.pytorch_attention_flash_attention()):
            area = input_shape[0] * input_shape[2] * input_shape[3]
            return (area * 0.3) * (1024 * 1024)

        # A configured manual cast dtype wins over the network's own dtype.
        dtype = self.get_dtype()
        if self.manual_cast_dtype is not None:
            dtype = self.manual_cast_dtype
        #TODO: this probably needs to be tweaked
        area = input_shape[0] * input_shape[2] * input_shape[3]
        return (area * comfy.model_management.dtype_size(dtype) * 0.012) * (1024 * 1024)
|