diff --git a/comfy/cli_args.py b/comfy/cli_args.py
index 29e5fb159..dc1597d88 100644
--- a/comfy/cli_args.py
+++ b/comfy/cli_args.py
@@ -42,7 +42,7 @@ parser.add_argument("--auto-launch", action="store_true", help="Automatically la
 parser.add_argument("--cuda-device", type=int, default=None, metavar="DEVICE_ID", help="Set the id of the cuda device this instance will use.")
 cm_group = parser.add_mutually_exclusive_group()
 cm_group.add_argument("--cuda-malloc", action="store_true", help="Enable cudaMallocAsync (enabled by default for torch 2.0 and up).")
-cm_group.add_argument("--disable-cuda-malloc", action="store_true", help="Enable cudaMallocAsync.")
+cm_group.add_argument("--disable-cuda-malloc", action="store_true", help="Disable cudaMallocAsync.")
 
 parser.add_argument("--dont-upcast-attention", action="store_true", help="Disable upcasting of attention. Can boost speed but increase the chances of black images.")
 
diff --git a/comfy/k_diffusion/external.py b/comfy/k_diffusion/external.py
index 7335d56c4..c1a137d9c 100644
--- a/comfy/k_diffusion/external.py
+++ b/comfy/k_diffusion/external.py
@@ -91,7 +91,9 @@ class DiscreteSchedule(nn.Module):
         return log_sigma.exp()
 
     def predict_eps_discrete_timestep(self, input, t, **kwargs):
-        sigma = self.t_to_sigma(t.round())
+        if t.dtype != torch.int64 and t.dtype != torch.int32:
+            t = t.round()
+        sigma = self.t_to_sigma(t)
         input = input * ((utils.append_dims(sigma, input.ndim) ** 2 + 1.0) ** 0.5)
         return  (input - self(input, sigma, **kwargs)) / utils.append_dims(sigma, input.ndim)