From 813b8df872fe9e6f5b36dcdadab3d69372d8b2d2 Mon Sep 17 00:00:00 2001
From: Austin Mroz
Date: Thu, 19 Dec 2024 13:17:00 -0600
Subject: [PATCH 1/2] Fix race condition in preview code.

In the previous preview code, when possible, a non-blocking `to`
operation is performed and, immediately after, the output tensor is
used to create an image. If this non-blocking operation has not
completed, PIL makes a copy of the uninitialized memory to produce an
image. Generally, this will contain either zeros or the result of a
previously generated preview. This results in both incorrect output
and wasted computation (unless the memory this output was eventually
copied to is reallocated and displayed instead of a future preview).

To resolve this, the state of the preview generation is tracked with an
event:
- The PIL image is created without a copy
- The preview image is not sent from the server until it is ready
- Completion of this event is polled at a reasonably slow frequency
- A new preview is not created if a previous preview has not completed
---
 latent_preview.py | 27 ++++++++++++++++-----------
 server.py         |  4 ++++
 2 files changed, 20 insertions(+), 11 deletions(-)

diff --git a/latent_preview.py b/latent_preview.py
index 07f9cc68..e3f620de 100644
--- a/latent_preview.py
+++ b/latent_preview.py
@@ -9,20 +9,25 @@ import logging
 
 MAX_PREVIEW_RESOLUTION = args.preview_size
 
-def preview_to_image(latent_image):
-    latents_ubyte = (((latent_image + 1.0) / 2.0).clamp(0, 1)  # change scale from -1..1 to 0..1
-                     .mul(0xFF)  # to 0..255
-                     ).to(device="cpu", dtype=torch.uint8, non_blocking=comfy.model_management.device_supports_non_blocking(latent_image.device))
-
-    return Image.fromarray(latents_ubyte.numpy())
-
 class LatentPreviewer:
     def decode_latent_to_preview(self, x0):
         pass
 
     def decode_latent_to_preview_image(self, preview_format, x0):
-        preview_image = self.decode_latent_to_preview(x0)
-        return ("JPEG", preview_image, MAX_PREVIEW_RESOLUTION)
+        if hasattr(self, 'event') and not self.event.query():
+            # A previous preview is still being processed
+            return None
+        preview_tensor = self.decode_latent_to_preview(x0)
+        latents_ubyte = (((preview_tensor + 1.0) / 2.0).clamp(0, 1)  # change scale from -1..1 to 0..1
+                         .mul(0xFF)  # to 0..255
+                         ).to(device="cpu", dtype=torch.uint8, non_blocking=comfy.model_management.device_supports_non_blocking(preview_tensor.device))
+        latents_rgbx = torch.zeros(latents_ubyte.shape[:2] + (4,), device="cpu", dtype=torch.uint8)
+        latents_rgbx[:,:,:3] = latents_ubyte
+        self.event = torch.cuda.Event()
+        self.event.record()
+        preview_image = Image.frombuffer('RGBX', (latents_ubyte.shape[1], latents_ubyte.shape[0]),
+                                         latents_rgbx.numpy().data, 'raw', 'RGBX', 0, 1)
+        return ("JPEG", preview_image, MAX_PREVIEW_RESOLUTION, self.event)
 
 class TAESDPreviewerImpl(LatentPreviewer):
     def __init__(self, taesd):
@@ -30,7 +35,7 @@ class TAESDPreviewerImpl(LatentPreviewer):
 
     def decode_latent_to_preview(self, x0):
         x_sample = self.taesd.decode(x0[:1])[0].movedim(0, 2)
-        return preview_to_image(x_sample)
+        return x_sample
 
 
 class Latent2RGBPreviewer(LatentPreviewer):
@@ -53,7 +58,7 @@ class Latent2RGBPreviewer(LatentPreviewer):
 
         latent_image = torch.nn.functional.linear(x0.movedim(0, -1), self.latent_rgb_factors, bias=self.latent_rgb_factors_bias)
         # latent_image = x0[0].permute(1, 2, 0) @ self.latent_rgb_factors
-        return preview_to_image(latent_image)
+        return latent_image
 
 
 def get_previewer(device, latent_format):
diff --git a/server.py b/server.py
index ddd71e06..f31d792b 100644
--- a/server.py
+++ b/server.py
@@ -750,6 +750,10 @@ class PromptServer():
         image_type = image_data[0]
         image = image_data[1]
         max_size = image_data[2]
+        if len(image_data) > 3:
+            event = image_data[3]
+            while not event.query():
+                await asyncio.sleep(.01)
         if max_size is not None:
             if hasattr(Image, 'Resampling'):
                 resampling = Image.Resampling.BILINEAR

From ce5afecc362efb1bc9670744e18aeafe730a99d8 Mon Sep 17 00:00:00 2001
From: Austin Mroz
Date: Mon, 23 Dec 2024 16:43:43 -0600
Subject: [PATCH 2/2] Only use events for devices supporting non-blocking transfers

---
 latent_preview.py | 21 +++++++++++++--------
 1 file changed, 13 insertions(+), 8 deletions(-)

diff --git a/latent_preview.py b/latent_preview.py
index e3f620de..06ebf14a 100644
--- a/latent_preview.py
+++ b/latent_preview.py
@@ -18,16 +18,21 @@ class LatentPreviewer:
             # A previous preview is still being processed
             return None
         preview_tensor = self.decode_latent_to_preview(x0)
+        if comfy.model_management.device_supports_non_blocking(preview_tensor.device):
+            latents_ubyte = (((preview_tensor + 1.0) / 2.0).clamp(0, 1)  # change scale from -1..1 to 0..1
+                             .mul(0xFF)  # to 0..255
+                             ).to(device="cpu", dtype=torch.uint8, non_blocking=True)
+            latents_rgbx = torch.zeros(latents_ubyte.shape[:2] + (4,), device="cpu", dtype=torch.uint8)
+            latents_rgbx[:,:,:3] = latents_ubyte
+            self.event = torch.cuda.Event()
+            self.event.record()
+            preview_image = Image.frombuffer('RGBX', (latents_ubyte.shape[1], latents_ubyte.shape[0]),
+                                             latents_rgbx.numpy().data, 'raw', 'RGBX', 0, 1)
+            return ("JPEG", preview_image, MAX_PREVIEW_RESOLUTION, self.event)
         latents_ubyte = (((preview_tensor + 1.0) / 2.0).clamp(0, 1)  # change scale from -1..1 to 0..1
                          .mul(0xFF)  # to 0..255
-                         ).to(device="cpu", dtype=torch.uint8, non_blocking=comfy.model_management.device_supports_non_blocking(preview_tensor.device))
-        latents_rgbx = torch.zeros(latents_ubyte.shape[:2] + (4,), device="cpu", dtype=torch.uint8)
-        latents_rgbx[:,:,:3] = latents_ubyte
-        self.event = torch.cuda.Event()
-        self.event.record()
-        preview_image = Image.frombuffer('RGBX', (latents_ubyte.shape[1], latents_ubyte.shape[0]),
-                                         latents_rgbx.numpy().data, 'raw', 'RGBX', 0, 1)
-        return ("JPEG", preview_image, MAX_PREVIEW_RESOLUTION, self.event)
+                         ).to(device="cpu", dtype=torch.uint8, non_blocking=False)
+        return ("JPEG", Image.fromarray(latents_ubyte.numpy()), MAX_PREVIEW_RESOLUTION)
 
 class TAESDPreviewerImpl(LatentPreviewer):
     def __init__(self, taesd):
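
The standalone sketch below (not part of the patches) illustrates the synchronization pattern both commits rely on, under the assumption that a CUDA device is present: a non-blocking GPU-to-CPU copy may return before the data has landed, so a torch.cuda.Event is recorded after the copy and event.query() is polled before the host buffer is read, mirroring the asyncio.sleep loop added to server.py. Polling query() instead of calling event.synchronize() keeps the event loop free, which is the design choice the server-side change makes. The helper name wait_for_copy and the tensor shape are illustrative only.

import asyncio

import torch


async def wait_for_copy(event, poll_interval=0.01):
    # Poll the CUDA event without blocking the event loop, the same way
    # server.py now polls before sending a preview image to the client.
    while not event.query():
        await asyncio.sleep(poll_interval)


async def main():
    if not torch.cuda.is_available():
        print("No CUDA device available; CPU copies complete synchronously.")
        return

    device_tensor = torch.rand(512, 512, 3, device="cuda")

    # A non-blocking device-to-host copy may return before the data has
    # actually arrived in the destination buffer.
    host_tensor = device_tensor.to("cpu", non_blocking=True)

    # Record an event after enqueueing the copy; it reports completion only
    # once all preceding work on the current stream has finished.
    done = torch.cuda.Event()
    done.record()

    # Reading host_tensor before the event completes could observe zeros or
    # stale data from an earlier copy, which is the race the patch fixes.
    await wait_for_copy(done)
    print("copy finished, safe to read:", host_tensor.mean().item())


if __name__ == "__main__":
    asyncio.run(main())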