From c4bfdba3301eb8dd2000b1b22e4752a662d4c856 Mon Sep 17 00:00:00 2001
From: Huazhong Ji
Date: Fri, 27 Dec 2024 08:36:50 +0800
Subject: [PATCH] Support ascend npu (#5436)

* support ascend npu

Co-authored-by: YukMingLaw
Co-authored-by: starmountain1997
Co-authored-by: Ginray
---
 README.md                 | 10 ++++++++++
 comfy/model_management.py | 41 ++++++++++++++++++++++++++++++++++++++-
 2 files changed, 50 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 37142161..000d7680 100644
--- a/README.md
+++ b/README.md
@@ -224,6 +224,16 @@ You can install ComfyUI in Apple Mac silicon (M1 or M2) with any recent macOS ve
 
 ```pip install torch-directml``` Then you can launch ComfyUI with: ```python main.py --directml```
 
+#### Ascend NPUs
+
+ComfyUI supports models compatible with the Ascend Extension for PyTorch (torch_npu). To get started, ensure your environment meets the prerequisites outlined on the [installation](https://ascend.github.io/docs/sources/ascend/quick_install.html) page. Here's a step-by-step guide tailored to your platform and installation method:
+
+1. If necessary, begin by installing the recommended or a newer kernel version for Linux, as specified on the torch-npu installation page.
+2. Proceed with the installation of the Ascend Basekit, which includes the driver, firmware, and CANN, following the instructions for your specific platform.
+3. Next, install the packages required by torch-npu, following the platform-specific instructions on the [Installation](https://ascend.github.io/docs/sources/pytorch/install.html#pytorch) page.
+4. Finally, follow the [ComfyUI manual installation](#manual-install-windows-linux) guide for Linux. Once all components are installed, you can run ComfyUI as described earlier.
+
+
 # Running
 
 ```python main.py```
diff --git a/comfy/model_management.py b/comfy/model_management.py
index db2a6139..c36c52ff 100644
--- a/comfy/model_management.py
+++ b/comfy/model_management.py
@@ -86,6 +86,13 @@ try:
 except:
     pass
 
+try:
+    import torch_npu
+    _ = torch.npu.device_count()
+    npu_available = torch.npu.is_available()
+except:
+    npu_available = False
+
 if args.cpu:
     cpu_state = CPUState.CPU
 
@@ -97,6 +104,12 @@ def is_intel_xpu():
             return True
     return False
 
+def is_ascend_npu():
+    global npu_available
+    if npu_available:
+        return True
+    return False
+
 def get_torch_device():
     global directml_enabled
     global cpu_state
@@ -110,6 +123,8 @@ def get_torch_device():
     else:
         if is_intel_xpu():
             return torch.device("xpu", torch.xpu.current_device())
+        elif is_ascend_npu():
+            return torch.device("npu", torch.npu.current_device())
         else:
             return torch.device(torch.cuda.current_device())
 
@@ -130,6 +145,12 @@ def get_total_memory(dev=None, torch_total_too=False):
             mem_reserved = stats['reserved_bytes.all.current']
             mem_total_torch = mem_reserved
             mem_total = torch.xpu.get_device_properties(dev).total_memory
+        elif is_ascend_npu():
+            stats = torch.npu.memory_stats(dev)
+            mem_reserved = stats['reserved_bytes.all.current']
+            _, mem_total_npu = torch.npu.mem_get_info(dev)
+            mem_total_torch = mem_reserved
+            mem_total = mem_total_npu
         else:
             stats = torch.cuda.memory_stats(dev)
             mem_reserved = stats['reserved_bytes.all.current']
@@ -209,7 +230,7 @@ try:
     if int(torch_version[0]) >= 2:
         if ENABLE_PYTORCH_ATTENTION == False and args.use_split_cross_attention == False and args.use_quad_cross_attention == False:
             ENABLE_PYTORCH_ATTENTION = True
-    if is_intel_xpu():
+    if is_intel_xpu() or is_ascend_npu():
         if args.use_split_cross_attention == False and args.use_quad_cross_attention == False:
             ENABLE_PYTORCH_ATTENTION = True
 except:
@@ -274,6 +295,8 @@ def get_torch_device_name(device):
             return "{}".format(device.type)
     elif is_intel_xpu():
         return "{} {}".format(device, torch.xpu.get_device_name(device))
+    elif is_ascend_npu():
+        return "{} {}".format(device, torch.npu.get_device_name(device))
     else:
         return "CUDA {}: {}".format(device, torch.cuda.get_device_name(device))
 
@@ -860,6 +883,8 @@ def xformers_enabled():
         return False
     if is_intel_xpu():
         return False
+    if is_ascend_npu():
+        return False
     if directml_enabled:
         return False
     return XFORMERS_IS_AVAILABLE
@@ -884,6 +909,8 @@ def pytorch_attention_flash_attention():
             return True
         if is_intel_xpu():
             return True
+        if is_ascend_npu():
+            return True
     return False
 
 def mac_version():
@@ -923,6 +950,13 @@ def get_free_memory(dev=None, torch_free_too=False):
             mem_free_torch = mem_reserved - mem_active
             mem_free_xpu = torch.xpu.get_device_properties(dev).total_memory - mem_reserved
             mem_free_total = mem_free_xpu + mem_free_torch
+        elif is_ascend_npu():
+            stats = torch.npu.memory_stats(dev)
+            mem_active = stats['active_bytes.all.current']
+            mem_reserved = stats['reserved_bytes.all.current']
+            mem_free_npu, _ = torch.npu.mem_get_info(dev)
+            mem_free_torch = mem_reserved - mem_active
+            mem_free_total = mem_free_npu + mem_free_torch
         else:
             stats = torch.cuda.memory_stats(dev)
             mem_active = stats['active_bytes.all.current']
@@ -984,6 +1018,9 @@ def should_use_fp16(device=None, model_params=0, prioritize_performance=True, ma
     if is_intel_xpu():
         return True
 
+    if is_ascend_npu():
+        return True
+
     if torch.version.hip:
         return True
 
@@ -1081,6 +1118,8 @@ def soft_empty_cache(force=False):
         torch.mps.empty_cache()
     elif is_intel_xpu():
         torch.xpu.empty_cache()
+    elif is_ascend_npu():
+        torch.npu.empty_cache()
     elif torch.cuda.is_available():
         if force or is_nvidia(): #This seems to make things worse on ROCm so I only do it for cuda
             torch.cuda.empty_cache()
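
For reference, here is a minimal standalone sketch (not part of the patch) of how the new NPU path gets exercised, assuming torch and torch_npu are installed per the README steps added above. It only uses the torch.npu calls that the diff itself relies on; the script body and its output formatting are illustrative.

```python
# Minimal probe for the Ascend NPU path added to comfy/model_management.py.
# Assumes torch and torch_npu are installed per the README steps above; only
# torch.npu calls that appear in the patch are used here.
import torch

try:
    import torch_npu  # registers the "npu" device type with PyTorch
    npu_available = torch.npu.is_available()
except Exception:
    npu_available = False

if npu_available:
    dev = torch.device("npu", torch.npu.current_device())
    # Same call the patch uses in get_total_memory()/get_free_memory().
    free, total = torch.npu.mem_get_info(dev)
    print("{} {}: {:.1f}/{:.1f} GiB free".format(
        dev, torch.npu.get_device_name(dev), free / (1024 ** 3), total / (1024 ** 3)))
    torch.npu.empty_cache()  # mirrors the new branch in soft_empty_cache()
else:
    print("No Ascend NPU detected; ComfyUI falls back to its CUDA/CPU code paths.")
```

Gating on a successful torch_npu import plus torch.npu.is_available() mirrors how the patch sets npu_available, so the sketch degrades cleanly on machines without an NPU.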