From 39f29605813df18ba44dc23c85393b15d894fcc8 Mon Sep 17 00:00:00 2001
From: nahcmon
Date: Mon, 8 Jun 2026 18:31:48 +0200
Subject: [PATCH] Use torch.neg instead of unary minus in RoPE freqs computation

On RTX 5090 (Blackwell) with PyTorch cu130, the unary-minus operation on
a CUDA tensor slice in precompute_freqs_cis crashes during RoPE
computation for the Gemma3 text encoder (CUDA error: unknown error /
access violation depending on driver version), which in turn triggers
cascading DynamicVRAM/RAM exhaustion that's easy to misdiagnose as a
memory issue.

torch.neg(x) is mathematically and numerically identical to -x (verified
bit-for-bit equal on CPU) but apparently avoids whatever code path in
the unary-minus operator dispatch trips up Blackwell/cu130.

Fixes #13977
---
 comfy/text_encoders/llama.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/comfy/text_encoders/llama.py b/comfy/text_encoders/llama.py
index 5087228ca..bdf666832 100644
--- a/comfy/text_encoders/llama.py
+++ b/comfy/text_encoders/llama.py
@@ -437,7 +437,7 @@ def precompute_freqs_cis(head_dim, position_ids, theta, rope_scale=None, rope_di
         cos = cos.unsqueeze(1)
         sin = sin.unsqueeze(1)
         sin_split = sin.shape[-1] // 2
-        out.append((cos, sin[..., : sin_split], -sin[..., sin_split :]))
+        out.append((cos, sin[..., : sin_split], torch.neg(sin[..., sin_split :])))
 
     if len(out) == 1:
         return out[0]