mirror of https://github.com/comfyanonymous/ComfyUI.git (synced 2025-04-19 10:53:29 +00:00)
Move text_projection to base clip model.
commit ec96f6d03a (parent 30eb92c3cb)
comfy/sd.py
@@ -564,9 +564,6 @@ class CLIP:
         n.layer_idx = self.layer_idx
         return n
 
-    def load_from_state_dict(self, sd):
-        self.cond_stage_model.load_sd(sd)
-
     def add_patches(self, patches, strength_patch=1.0, strength_model=1.0):
         return self.patcher.add_patches(patches, strength_patch, strength_model)
 
comfy/sd1_clip.py
@@ -66,7 +66,9 @@ class SD1ClipModel(torch.nn.Module, ClipTokenWeightEncoder):
         self.layer = layer
         self.layer_idx = None
         self.empty_tokens = [[49406] + [49407] * 76]
-        self.text_projection = None
+        self.text_projection = torch.nn.Parameter(torch.eye(self.transformer.get_input_embeddings().weight.shape[1]))
+        self.logit_scale = torch.nn.Parameter(torch.tensor(4.6055))
+
         self.layer_norm_hidden_state = True
         if layer == "hidden":
             assert layer_idx is not None
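
The torch.eye initialization above deserves a note (mine, not the commit's): sized to the embedding width, text_projection starts out as an identity matrix, so applying it is a no-op until a checkpoint that actually carries a projection overwrites it. A minimal, self-contained sketch of the pattern, using a stand-in width:

import torch

hidden = 8  # stand-in for the embedding width (1024 for SD2.x, 1280 for CLIP-G)
text_projection = torch.nn.Parameter(torch.eye(hidden))

pooled = torch.randn(2, hidden)       # stand-in for the pooled text output
projected = pooled @ text_projection  # identity matrix => projected == pooled
assert torch.allclose(projected, pooled)
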
@@ -163,6 +165,10 @@ class SD1ClipModel(torch.nn.Module, ClipTokenWeightEncoder):
         return self(tokens)
 
     def load_sd(self, sd):
+        if "text_projection" in sd:
+            self.text_projection[:] = sd.pop("text_projection")
+        if "text_projection.weight" in sd:
+            self.text_projection[:] = sd.pop("text_projection.weight").transpose(0, 1)
         return self.transformer.load_state_dict(sd, strict=False)
 
 def parse_parentheses(string):
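
Why two keys (a note on the hunk above, assuming standard checkpoint layouts): OpenCLIP-style state dicts store "text_projection" as a (hidden, proj) matrix applied as x @ W, while HF transformers-style state dicts store "text_projection.weight" as an nn.Linear weight of shape (proj, hidden) applied as x @ W.T, hence the transpose(0, 1) before copying. The slice assignment param[:] = ... writes in place, keeping the same Parameter object. A small sketch of both points:

import torch

hidden = proj = 8
linear = torch.nn.Linear(hidden, proj, bias=False)  # HF-style storage
x = torch.randn(2, hidden)

w_matrix = linear.weight.transpose(0, 1)  # convert to the matmul convention
assert torch.allclose(linear(x), x @ w_matrix)

text_projection = torch.nn.Parameter(torch.eye(hidden))
with torch.no_grad():
    text_projection[:] = w_matrix  # in-place copy; Parameter identity preserved
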
comfy/sd2_clip.json
@@ -17,7 +17,7 @@
     "num_attention_heads": 16,
     "num_hidden_layers": 24,
     "pad_token_id": 1,
-    "projection_dim": 512,
+    "projection_dim": 1024,
     "torch_dtype": "float32",
     "vocab_size": 49408
 }
comfy/sdxl_clip.py
@@ -11,15 +11,9 @@ class SDXLClipG(sd1_clip.SD1ClipModel):
         textmodel_json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_config_bigg.json")
         super().__init__(device=device, freeze=freeze, layer=layer, layer_idx=layer_idx, textmodel_json_config=textmodel_json_config, textmodel_path=textmodel_path, dtype=dtype)
         self.empty_tokens = [[49406] + [49407] + [0] * 75]
-        self.text_projection = torch.nn.Parameter(torch.empty(1280, 1280))
-        self.logit_scale = torch.nn.Parameter(torch.tensor(4.6055))
         self.layer_norm_hidden_state = False
 
     def load_sd(self, sd):
-        if "text_projection" in sd:
-            self.text_projection[:] = sd.pop("text_projection")
-        if "text_projection.weight" in sd:
-            self.text_projection[:] = sd.pop("text_projection.weight").transpose(0, 1)
         return super().load_sd(sd)
 
 class SDXLClipGTokenizer(sd1_clip.SD1Tokenizer):
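
Net effect of the hunks above: SDXLClipG no longer carries its own text_projection, logit_scale, or projection-loading logic; the base SD1ClipModel builds an identity projection sized from the bigG embedding width (1280), and its load_sd copies in a checkpoint's projection when one is present. A toy illustration of the resulting inheritance (hypothetical Base/BigG classes, not ComfyUI's):

import torch

class Base(torch.nn.Module):
    def __init__(self, width):
        super().__init__()
        # identity projection, overwritten only if the checkpoint has one
        self.text_projection = torch.nn.Parameter(torch.eye(width))

    def load_sd(self, sd):
        if "text_projection" in sd:
            with torch.no_grad():
                self.text_projection[:] = sd.pop("text_projection")
        return sd  # leftover keys; stands in for load_state_dict

class BigG(Base):
    def __init__(self):
        super().__init__(width=4)  # 1280 in the real CLIP-G

g = BigG()
g.load_sd({"text_projection": torch.full((4, 4), 0.5)})
assert torch.allclose(g.text_projection, torch.full((4, 4), 0.5))
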