Mirror of https://github.com/comfyanonymous/ComfyUI.git (synced 2026-02-04 10:40:30 +08:00)
llama: cast logits as a comfy-weight (#12248)
This was using a different layer's weight with .to(). Change it to use the ops caster when the original layer is a comfy weight, so that it picks up the dynamic_vram and async_offload functionality in full.

Co-authored-by: Rattus <rattus128@gmail.com>
parent affe881354
commit 223364743c
@@ -6,6 +6,7 @@ import math
 from comfy.ldm.modules.attention import optimized_attention_for_device
 import comfy.model_management
+import comfy.ops
 import comfy.ldm.common_dit
 import comfy.clip_model
@@ -794,7 +795,19 @@ class Qwen3_2B_ACE15_lm(BaseLlama, torch.nn.Module):
         self.dtype = dtype
 
     def logits(self, x):
-        return torch.nn.functional.linear(x[:, -1:], self.model.embed_tokens.weight.to(x), None)
+        input = x[:, -1:]
+        module = self.model.embed_tokens
+
+        offload_stream = None
+        if module.comfy_cast_weights:
+            weight, _, offload_stream = comfy.ops.cast_bias_weight(module, input, offloadable=True)
+        else:
+            weight = self.model.embed_tokens.weight.to(x)
+
+        x = torch.nn.functional.linear(input, weight, None)
+
+        comfy.ops.uncast_bias_weight(module, weight, None, offload_stream)
+        return x
 
 class Qwen3_4B(BaseLlama, torch.nn.Module):
     def __init__(self, config_dict, dtype, device, operations):
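For context, below is a minimal standalone sketch of the same pattern the diff applies to logits(). It is an illustration, not ComfyUI's API surface: the helper name tied_linear is made up, module stands in for a layer such as self.model.embed_tokens, and only the calls visible in the diff are used (comfy.ops.cast_bias_weight(module, input, offloadable=True) returning (weight, bias, offload_stream), and comfy.ops.uncast_bias_weight(module, weight, bias, offload_stream) to release it).

import torch
import comfy.ops

def tied_linear(module, x):
    # Sketch only: mirrors the logits() change above under the assumptions stated.
    offload_stream = None
    if getattr(module, "comfy_cast_weights", False):
        # Comfy weight: let the ops caster move/cast it so dynamic_vram and
        # async_offload are honored, and keep the offload stream it returns.
        weight, _, offload_stream = comfy.ops.cast_bias_weight(module, x, offloadable=True)
    else:
        # Plain layer: fall back to a direct dtype/device cast.
        weight = module.weight.to(x)

    out = torch.nn.functional.linear(x, weight, None)

    # Hand the casted weight (and any offload stream) back when done.
    comfy.ops.uncast_bias_weight(module, weight, None, offload_stream)
    return out

The point of the change is the first branch: a bare .weight.to(x) bypasses the ops caster, so manually computed projections like this tied logits head would not pick up dynamic_vram or async_offload.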