Added DummyGPT.

This commit is contained in:
2026-05-01 00:49:45 +02:00
parent ac8852d25e
commit cbdd32faaa
3 changed files with 51 additions and 7 deletions

View File

@ -1,4 +1,3 @@
from typing import final
import torch
@ -15,7 +14,7 @@ class SelfAttention(torch.nn.Module):
self.w_query = torch.nn.Linear(d_in, d_out, bias=bias)
self.w_key = torch.nn.Linear(d_in, d_out, bias=bias)
self.w_val = torch.nn.Linear(d_in, d_out, bias=bias)
self.dropout = torch.nn.Dropout(inplace=True)
self.dropout = torch.nn.Dropout(dropout, inplace=True)
self.register_buffer(
"mask",
torch.triu(torch.ones(ctx_len, ctx_len), diagonal=1).bool(),
@ -54,14 +53,14 @@ class MultiHeadAttention(torch.nn.Module):
self.w_key = torch.nn.Linear(d_in, d_out, bias=bias)
self.w_val = torch.nn.Linear(d_in, d_out, bias=bias)
self.out_proj = torch.nn.Linear(d_out, d_out)
self.dropout = torch.nn.Dropout(inplace=True)
self.dropout = torch.nn.Dropout(dropout, inplace=True)
self.register_buffer(
"mask",
torch.triu(torch.ones(ctx_len, ctx_len), diagonal=1),
torch.triu(torch.ones(ctx_len, ctx_len), diagonal=1).bool(),
)
def forward(self, data: torch.Tensor) -> torch.Tensor:
batches, num_tokens, _features = data.shape
batches, num_tokens, _ = data.shape
queries = self.w_query(data)
keys = self.w_key(data)
values = self.w_val(data)
@ -71,16 +70,22 @@ class MultiHeadAttention(torch.nn.Module):
values_v = values.view(
batches, num_tokens, self.num_heads, self.head_dim
).transpose(1, 2)
# 2, 6, 2, 4 -> 2, 2, 6, 4
# (batches, tokens, heads, out_dim) -> (batches, heads, tokens, out_dim)
queries_v = queries.view(
batches, num_tokens, self.num_heads, self.head_dim
).transpose(1, 2)
# (2, 2, 6, 4) @ (2, 2, 4, 6) -> (2, 2, 6, 6)
# (batches, heads, tokens, out_dim) @ (batches, heads, out_dim, tokens) -> (batches, heads, tokens, tokens)
attn_scores = queries_v @ keys_v.transpose(2, 3)
attn_scores.masked_fill_(
self.mask.bool()[:num_tokens, :num_tokens],
self.mask[:num_tokens, :num_tokens],
-torch.inf,
)
attn_weights = (attn_scores / keys.shape[-1] ** 0.5).softmax(dim=-1)
self.dropout(attn_weights)
# (2, 2, 6, 6) @ (2, 2, 6, 4) -> (2, 2, 6, 4)
# (2, 2, 6, 4) -> T(1,2) -> (2, 6, 2, 4)
context_vec = (attn_weights @ values_v).transpose(1, 2)
context_vec = context_vec.contiguous().view(batches, num_tokens, self.d_out)
context_vec = self.out_proj(context_vec)