Build the CLIP vision and text models with comfy.ops.Linear.

comfy/ops.py gains use_comfy_ops(), a context manager that rebinds
torch.nn.Linear to the Linear class defined in comfy/ops.py for the
duration of the with-block and puts the previous value back in a
finally clause. The rebinding is done on the torch.nn module attribute
itself, so any code that looks up torch.nn.Linear while the block is
active sees the replacement.

comfy/clip_vision.py and comfy/sd1_clip.py wrap their existing
modeling_utils.no_init_weights() model construction
(CLIPVisionModelWithProjection / CLIPTextModel) in that context manager.

NOTE(review): leading whitespace inside the hunks was reconstructed from
the hunk line counts and Python block structure; confirm the indentation
depth (in particular the second comfy/sd1_clip.py hunk) against the
target tree, or apply with `git apply --ignore-whitespace`.

diff --git a/comfy/clip_vision.py b/comfy/clip_vision.py
index a95707e4..7a59ef6e 100644
--- a/comfy/clip_vision.py
+++ b/comfy/clip_vision.py
@@ -2,12 +2,14 @@ from transformers import CLIPVisionModelWithProjection, CLIPVisionConfig, CLIPIm
 from .utils import load_torch_file, transformers_convert
 import os
 import torch
+import comfy.ops
 
 class ClipVisionModel():
     def __init__(self, json_config):
         config = CLIPVisionConfig.from_json_file(json_config)
-        with modeling_utils.no_init_weights():
-            self.model = CLIPVisionModelWithProjection(config)
+        with comfy.ops.use_comfy_ops():
+            with modeling_utils.no_init_weights():
+                self.model = CLIPVisionModelWithProjection(config)
         self.processor = CLIPImageProcessor(crop_size=224,
                                             do_center_crop=True,
                                             do_convert_rgb=True,
diff --git a/comfy/ops.py b/comfy/ops.py
index c39b994a..2e72030b 100644
--- a/comfy/ops.py
+++ b/comfy/ops.py
@@ -1,4 +1,5 @@
 import torch
+from contextlib import contextmanager
 
 class Linear(torch.nn.Module):
     def __init__(self, in_features: int, out_features: int, bias: bool = True,
@@ -19,3 +20,13 @@ class Linear(torch.nn.Module):
 class Conv2d(torch.nn.Conv2d):
     def reset_parameters(self):
         return None
+
+
+@contextmanager
+def use_comfy_ops(): # Kind of an ugly hack but I can't think of a better way
+    old_torch_nn_linear = torch.nn.Linear
+    torch.nn.Linear = Linear
+    try:
+        yield
+    finally:
+        torch.nn.Linear = old_torch_nn_linear
diff --git a/comfy/sd1_clip.py b/comfy/sd1_clip.py
index 0df3d9d9..c2d4df09 100644
--- a/comfy/sd1_clip.py
+++ b/comfy/sd1_clip.py
@@ -1,6 +1,7 @@
 import os
 
 from transformers import CLIPTokenizer, CLIPTextModel, CLIPTextConfig, modeling_utils
+import comfy.ops
 import torch
 import traceback
 import zipfile
@@ -38,8 +39,9 @@ class SD1ClipModel(torch.nn.Module, ClipTokenWeightEncoder):
             if textmodel_json_config is None:
                 textmodel_json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "sd1_clip_config.json")
             config = CLIPTextConfig.from_json_file(textmodel_json_config)
-            with modeling_utils.no_init_weights():
-                self.transformer = CLIPTextModel(config)
+            with comfy.ops.use_comfy_ops():
+                with modeling_utils.no_init_weights():
+                    self.transformer = CLIPTextModel(config)
 
         self.device = device
         self.max_length = max_length