Slightly smarter batching behaviour.

Try to keep batch sizes more consistent, which seems to improve performance on
AMD GPUs.
This commit is contained in:
comfyanonymous 2023-02-08 17:09:47 -05:00
parent bbdcf0b737
commit 3fd87cbd21

View File

@ -86,15 +86,21 @@ class CFGDenoiserComplex(torch.nn.Module):
while len(to_run) > 0:
first = to_run[0]
first_shape = first[0][0].shape
to_batch = []
to_batch_temp = []
for x in range(len(to_run)):
if to_run[x][0][0].shape == first_shape:
if to_run[x][0][2].shape == first[0][2].shape:
to_batch += [x]
if (len(to_batch) * first_shape[0] * first_shape[2] * first_shape[3] >= max_total_area):
to_batch_temp += [x]
to_batch_temp.reverse()
to_batch = to_batch_temp[:1]
for i in range(1, len(to_batch_temp) + 1):
batch_amount = to_batch_temp[:len(to_batch_temp)//i]
if (len(batch_amount) * first_shape[0] * first_shape[2] * first_shape[3] < max_total_area):
to_batch = batch_amount
break
to_batch.reverse()
input_x = []
mult = []
c = []