diff --git a/.ci/nightly/update_windows/update_comfyui_and_python_dependencies.bat b/.ci/nightly/update_windows/update_comfyui_and_python_dependencies.bat
index c345a6992..b4989534f 100755
--- a/.ci/nightly/update_windows/update_comfyui_and_python_dependencies.bat
+++ b/.ci/nightly/update_windows/update_comfyui_and_python_dependencies.bat
@@ -1,3 +1,3 @@
 ..\python_embeded\python.exe .\update.py ..\ComfyUI\
-..\python_embeded\python.exe -s -m pip install --upgrade --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu121 -r ../ComfyUI/requirements.txt pygit2
+..\python_embeded\python.exe -s -m pip install --upgrade --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/cu118 -r ../ComfyUI/requirements.txt pygit2
 pause
diff --git a/.github/workflows/windows_release_nightly_pytorch.yml b/.github/workflows/windows_release_nightly_pytorch.yml
index 4d686ded8..f23cae6d5 100644
--- a/.github/workflows/windows_release_nightly_pytorch.yml
+++ b/.github/workflows/windows_release_nightly_pytorch.yml
@@ -30,7 +30,7 @@ jobs:
             echo 'import site' >> ./python310._pth
             curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py
             ./python.exe get-pip.py
-            python -m pip wheel torch torchvision torchaudio --pre --extra-index-url https://download.pytorch.org/whl/nightly/cu121 -r ../ComfyUI/requirements.txt pygit2 -w ../temp_wheel_dir
+            python -m pip wheel torch torchvision torchaudio --pre --extra-index-url https://download.pytorch.org/whl/nightly/cu118 -r ../ComfyUI/requirements.txt pygit2 -w ../temp_wheel_dir
             ls ../temp_wheel_dir
             ./python.exe -s -m pip install --pre ../temp_wheel_dir/*
             sed -i '1i../ComfyUI' ./python310._pth
diff --git a/README.md b/README.md
index 5b6346a67..3b3824714 100644
--- a/README.md
+++ b/README.md
@@ -7,6 +7,8 @@ A powerful and modular stable diffusion GUI and backend.
 This ui will let you design and execute advanced stable diffusion pipelines using a graph/nodes/flowchart based interface. For some workflow examples and see what ComfyUI can do you can check out:
 ### [ComfyUI Examples](https://comfyanonymous.github.io/ComfyUI_examples/)
 
+### [Installing ComfyUI](#installing)
+
 ## Features
 - Nodes/graph/flowchart interface to experiment and create complex Stable Diffusion workflows without needing to code anything.
 - Fully supports SD1.x and SD2.x
diff --git a/comfy/cli_args.py b/comfy/cli_args.py
index b24054ce0..764427165 100644
--- a/comfy/cli_args.py
+++ b/comfy/cli_args.py
@@ -10,6 +10,7 @@ parser.add_argument("--output-directory", type=str, default=None, help="Set the
 parser.add_argument("--cuda-device", type=int, default=None, metavar="DEVICE_ID", help="Set the id of the cuda device this instance will use.")
 parser.add_argument("--dont-upcast-attention", action="store_true", help="Disable upcasting of attention. Can boost speed but increase the chances of black images.")
 parser.add_argument("--force-fp32", action="store_true", help="Force fp32 (If this makes your GPU work better please report it).")
+parser.add_argument("--directml", type=int, nargs="?", metavar="DIRECTML_DEVICE", const=-1, help="Use torch-directml.")  # flag omitted -> None; bare "--directml" -> -1 (const); "--directml N" -> N
 
 attn_group = parser.add_mutually_exclusive_group()
 attn_group.add_argument("--use-split-cross-attention", action="store_true", help="Use the split cross attention optimization instead of the sub-quadratic one. Ignored when xformers is used.")
diff --git a/comfy/model_management.py b/comfy/model_management.py
index 6e3a03530..9497ae7af 100644
--- a/comfy/model_management.py
+++ b/comfy/model_management.py
@@ -20,6 +20,18 @@ total_vram_available_mb = -1
 accelerate_enabled = False
 xpu_available = False
 
+directml_enabled = False  # set to True below only when --directml was passed
+if args.directml is not None:  # None when the flag is absent (no default is set in cli_args)
+    import torch_directml  # imported only on this path, so it is required only when DirectML is requested
+    directml_enabled = True
+    device_index = args.directml
+    if device_index < 0:  # negative index (e.g. bare --directml gives -1): call device() without an explicit index
+        directml_device = torch_directml.device()
+    else:
+        directml_device = torch_directml.device(device_index)
+    print("Using directml with device:", torch_directml.device_name(device_index))  # NOTE(review): device_index may be negative here - confirm device_name() accepts that
+    # torch_directml.disable_tiled_resources(True)
+
 try:
     import torch
     try:
@@ -217,6 +229,10 @@ def unload_if_low_vram(model):
 
 def get_torch_device():
     global xpu_available
+    global directml_enabled
+    if directml_enabled:
+        global directml_device
+        return directml_device
     if vram_state == VRAMState.MPS:
         return torch.device("mps")
     if vram_state == VRAMState.CPU:
@@ -234,8 +250,14 @@ def get_autocast_device(dev):
 
 
 def xformers_enabled():
+    global xpu_available  # only read in this function
+    global directml_enabled  # only read in this function
     if vram_state == VRAMState.CPU:
         return False
+    if xpu_available:  # xformers is reported as disabled whenever xpu_available is set
+        return False
+    if directml_enabled:  # xformers is reported as disabled whenever DirectML is enabled
+        return False
     return XFORMERS_IS_AVAILABLE
 
 
@@ -251,6 +273,7 @@ def pytorch_attention_enabled():
 
 def get_free_memory(dev=None, torch_free_too=False):
     global xpu_available
+    global directml_enabled
     if dev is None:
         dev = get_torch_device()
 
@@ -258,7 +281,10 @@ def get_free_memory(dev=None, torch_free_too=False):
         mem_free_total = psutil.virtual_memory().available
         mem_free_torch = mem_free_total
     else:
-        if xpu_available:
+        if directml_enabled:
+            mem_free_total = 1024 * 1024 * 1024 #TODO
+            mem_free_torch = mem_free_total
+        elif xpu_available:
             mem_free_total = torch.xpu.get_device_properties(dev).total_memory - torch.xpu.memory_allocated(dev)
             mem_free_torch = mem_free_total
         else:
@@ -293,9 +319,14 @@ def mps_mode():
 
 def should_use_fp16():
     global xpu_available
+    global directml_enabled
+
     if FORCE_FP32:
         return False
 
+    if directml_enabled:
+        return False
+
     if cpu_mode() or mps_mode() or xpu_available:
         return False #TODO ?