Fix some memory issues in sub quad attention.

2025-06-16 08:15:32 +08:00 · 2023-10-30 15:29:45 -04:00 · 2023-10-30 15:29:45 -04:00 · c837a173fa
commit c837a173fa
parent 125b03eead
1 changed file with 10 additions and 23 deletions
--- a/comfy/ldm/modules/attention.py
+++ b/comfy/ldm/modules/attention.py
@@ -160,32 +160,19 @@ def attention_sub_quad(query, key, value, heads, mask=None):
    mem_free_total, mem_free_torch = model_management.get_free_memory(query.device, True)
    chunk_threshold_bytes = mem_free_torch * 0.5 #Using only this seems to work better on AMD
    kv_chunk_size_min = None
    kv_chunk_size = None
    query_chunk_size = None
-    #not sure at all about the math here
+    for x in [4096, 2048, 1024, 512, 256]:
-    #TODO: tweak this
+        count = mem_free_total / (batch_x_heads * bytes_per_token * x * 4.0)
-    if mem_free_total > 8192 * 1024 * 1024 * 1.3:
+        if count >= k_tokens:
        query_chunk_size_x = 1024 * 4
    elif mem_free_total > 4096 * 1024 * 1024 * 1.3:
        query_chunk_size_x = 1024 * 2
    else:
        query_chunk_size_x = 1024
    kv_chunk_size_min_x = None
    kv_chunk_size_x = (int((chunk_threshold_bytes // (batch_x_heads * bytes_per_token * query_chunk_size_x)) * 2.0) // 1024) * 1024
    if kv_chunk_size_x < 1024:
        kv_chunk_size_x = None
    if chunk_threshold_bytes is not None and qk_matmul_size_bytes <= chunk_threshold_bytes:
        # the big matmul fits into our memory limit; do everything in 1 chunk,
        # i.e. send it down the unchunked fast-path
        query_chunk_size = q_tokens
            kv_chunk_size = k_tokens
-    else:
+            query_chunk_size = x
-        query_chunk_size = query_chunk_size_x
+            break
-        kv_chunk_size = kv_chunk_size_x
+
-        kv_chunk_size_min = kv_chunk_size_min_x
+    if query_chunk_size is None:
        query_chunk_size = 512
    hidden_states = efficient_dot_product_attention(
        query,