Untrained model is done.

This commit is contained in:
2026-05-01 15:05:56 +02:00
parent cbdd32faaa
commit 1c9029ec78
2 changed files with 162 additions and 43 deletions

View File

@@ -1,11 +1,11 @@
from pathlib import Path
import tiktoken
import torch
from llmfs.attn import MultiHeadAttention
from llmfs.datasets.v1 import GPTDataSetV1
from llmfs.gpt import GPTConfig
from llmfs.gpt import DummyGPT, GPTConfig, TransformerBlock
from llmfs.tokenizers import BPETokenizer
DATA_DIR = Path(__file__).parent.parent / "data"
@@ -20,23 +20,61 @@ GPT_CONFIG_124M = GPTConfig(
)
def generate_text_simple(
    model: DummyGPT, idx: torch.Tensor, max_new_tokens: int, context_size: int
) -> torch.Tensor:
    """Append ``max_new_tokens`` greedily chosen token ids to ``idx``.

    On every step the model is called on at most the last ``context_size``
    columns of the running sequence, the scores at the final position are
    turned into probabilities, and the highest-probability id is appended.

    Args:
        model: Callable that maps a 2-D tensor of token ids to per-position
            scores; its output is indexed as ``[:, -1, :]``.
        idx: 2-D tensor of token ids used as the starting context.
        max_new_tokens: Number of ids to append.
        context_size: Maximum number of trailing columns fed to the model.

    Returns:
        ``idx`` with ``max_new_tokens`` extra columns concatenated on dim 1.
    """
    tokens = idx
    for _step in range(max_new_tokens):
        # Only the most recent `context_size` ids are shown to the model.
        window = tokens[:, -context_size:]
        with torch.no_grad():
            scores: torch.Tensor = model(window)
        last_position = scores[:, -1, :]
        probabilities = torch.softmax(last_position, dim=-1)
        next_token = torch.argmax(probabilities, dim=-1, keepdim=True)
        tokens = torch.cat([tokens, next_token], dim=1)
    return tokens
def process_text(text: str):
tokenizer = BPETokenizer.build(text)
print("Buiding tokenizer")
# tokenizer = BPETokenizer.build(text)
tokenizer = tiktoken.encoding_for_model("gpt2")
vocab_size = tokenizer.max_token_value + 1
max_len = 4
ctx_len = max_len
output_dim = 256
token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)
pos_embedding_layer = torch.nn.Embedding(ctx_len, output_dim)
pos_embeddings = pos_embedding_layer(torch.arange(ctx_len))
dataset = GPTDataSetV1.data_loader(
text,
tokenizer,
batch_size=8,
max_len=4,
stride=1,
shuffle=False,
print(f"Tokenizer is ready. Vocab size: {vocab_size}")
batch = torch.stack(
[
torch.tensor(tokenizer.encode("Every effort moves you")),
torch.tensor(tokenizer.encode("Every day holds a")),
],
dim=0,
)
cfg = GPTConfig(
vocab_size=vocab_size,
context_length=1024,
embedding_dim=768,
n_heads=12,
n_layers=12,
dropout=0.1,
qkv_bias=False,
)
gpt = DummyGPT(cfg)
gpt.eval()
start_ctx = "Hello, I am"
encoded = tokenizer.encode(start_ctx)
encoded_tensor = torch.tensor(encoded).unsqueeze(0)
out = generate_text_simple(gpt, encoded_tensor, 6, cfg.context_length)
decoded_text = tokenizer.decode(out.squeeze(0).tolist())
print(decoded_text)
# logits = gpt(batch)
# print(logits)
# print(logits.shape)
# dataset = GPTDataSetV1.data_loader(
# text,
# tokenizer,
# batch_size=8,
# max_len=4,
# stride=1,
# shuffle=False,
# )
# for inps, targs in iter(dataset):
# embeds = token_embedding_layer(inps)
# print(embeds.shape)
@@ -46,32 +84,9 @@ def process_text(text: str):
# tokenizer = BPETokenizer.build(text)
def attn_test():
    """Run MultiHeadAttention once on a tiny hand-written batch and print it.

    The same 6x3 matrix is stacked twice to form the batch.
    """
    rows = [
        [0.43, 0.15, 0.89],
        [0.55, 0.87, 0.66],
        [0.57, 0.85, 0.64],
        [0.22, 0.58, 0.43],
        [0.77, 0.25, 0.10],
        [0.05, 0.80, 0.55],
    ]
    inps = torch.Tensor(rows)
    n_rows, n_cols = inps.shape
    batch = torch.stack([inps, inps], dim=0)
    # NOTE(review): `dropout=True` passes a bool, while TransformerBlock passes
    # a float rate (cfg.dropout) for the same argument -- confirm this is
    # intended.
    attn = MultiHeadAttention(n_cols, 8, n_rows, dropout=True, num_heads=2)
    result = attn(batch)
    print(result)
# Script entry point: reads "the-verdict.txt" from DATA_DIR as text and hands
# it to the demo routines called below.
def main():
raw_text = (DATA_DIR / "the-verdict.txt").read_text()
# process_text(raw_text)
# NOTE(review): this span comes from a rendered diff, so the two calls below
# presumably belong to different revisions (attn_test() looks like the removed
# line, process_text(raw_text) its replacement) -- confirm against the
# checked-out file.
attn_test()
process_text(raw_text)
if __name__ == "__main__":

View File

@@ -2,6 +2,8 @@ from dataclasses import dataclass
import torch
from llmfs.attn import MultiHeadAttention
@dataclass
class GPTConfig:
@@ -14,8 +16,91 @@ class GPTConfig:
qkv_bias: bool
class DummyGPT:
class DummyTransformerBlock(torch.nn.Module):
    """Placeholder block that hands its input back untouched."""

    def __init__(self, config: GPTConfig):
        """Initialise the module; ``config`` is accepted but never read."""
        super().__init__()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return ``x`` itself, with no computation applied."""
        return x
class GELU(torch.nn.Module):
    """Tanh approximation of the Gaussian Error Linear Unit.

    Computes, element-wise,
    ``0.5 * x * (1 + tanh(sqrt(2 / pi) * (x + 0.044715 * x**3)))``,
    which is the formula behind
    ``torch.nn.functional.gelu(x, approximate="tanh")``.
    """

    # Coefficient of the cubic term. The previous value (0.44715) was off by a
    # factor of ten and noticeably distorted the activation (GELU(1) came out
    # near 0.91 instead of about 0.84).
    _CUBIC_COEFF = 0.044715
    # Plain Python float, so no tensor is allocated on every forward call and
    # the constant works with inputs on any device.
    _SQRT_2_OVER_PI = (2.0 / torch.pi) ** 0.5

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply the activation element-wise; the output has the shape of ``x``."""
        inner = self._SQRT_2_OVER_PI * (x + self._CUBIC_COEFF * torch.pow(x, 3))
        return 0.5 * x * (1 + torch.tanh(inner))
class FeedForward(torch.nn.Module):
    """Two-layer position-wise MLP: expand 4x, apply GELU, project back."""

    def __init__(self, cfg: GPTConfig) -> None:
        """Build the layers; only ``cfg.embedding_dim`` is read from ``cfg``."""
        super().__init__()
        width = cfg.embedding_dim
        hidden = 4 * width
        # Construction order is kept (linear, activation, linear) so that the
        # parameter initialisation and the state-dict keys stay the same.
        expand = torch.nn.Linear(width, hidden)
        activation = GELU()
        project = torch.nn.Linear(hidden, width)
        self.layers = torch.nn.Sequential(expand, activation, project)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Run ``x`` through the expand / GELU / project stack."""
        out = self.layers(x)
        return out
class TransformerBlock(torch.nn.Module):
    """Pre-norm transformer block: attention and feed-forward sub-layers.

    Each sub-layer is applied as ``x + dropout(sublayer(norm(x)))``; the same
    dropout module is reused for both residual branches.
    """

    def __init__(self, cfg: GPTConfig) -> None:
        """Build the sub-layers from ``cfg``.

        Reads ``embedding_dim``, ``context_length``, ``dropout``, ``qkv_bias``
        and ``n_heads`` from ``cfg``.
        """
        super().__init__()
        self.att = MultiHeadAttention(
            cfg.embedding_dim,
            cfg.embedding_dim,
            cfg.context_length,
            cfg.dropout,
            cfg.qkv_bias,
            # The configured head count used to be dropped silently, leaving
            # the attention layer on whatever default it defines.
            # NOTE(review): the `num_heads` keyword is taken from its use in
            # attn_test -- confirm it against MultiHeadAttention's signature.
            num_heads=cfg.n_heads,
        )
        self.ff = FeedForward(cfg)
        self.norm1 = NormLayer(cfg.embedding_dim)
        self.norm2 = NormLayer(cfg.embedding_dim)
        self.dropout = torch.nn.Dropout(cfg.dropout)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply the attention branch, then the feed-forward branch.

        Returns a tensor produced by adding each branch's output to its own
        input (residual connections).
        """
        # Attention branch: normalise first, add the untouched input back.
        shortcut = x
        x = self.norm1(x)
        x = self.att(x)
        x = self.dropout(x)
        x = x + shortcut

        # Feed-forward branch, same pattern.
        shortcut = x
        x = self.norm2(x)
        x = self.ff(x)
        x = self.dropout(x)
        x = x + shortcut
        return x
class NormLayer(torch.nn.Module):
    """Layer normalisation over the last dimension with learnable scale/shift.

    Matches ``torch.nn.functional.layer_norm`` over the last dimension: the
    variance is the biased (population) estimate.
    """

    def __init__(self, dim: int, eps: float = 1e-5):
        """Create the layer.

        Args:
            dim: Size of the last dimension of the inputs.
            eps: Constant added to the variance before the square root, so
                the division stays finite when the variance is zero.
        """
        super().__init__()
        self.dim = dim
        self.eps = eps
        self.scale = torch.nn.Parameter(torch.ones(dim))
        self.shift = torch.nn.Parameter(torch.zeros(dim))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Normalise ``x`` along its last dimension, then scale and shift."""
        mean = x.mean(-1, keepdim=True)
        # Biased estimator (divide by N, not N - 1): this is what layer
        # normalisation is defined with, and it stays finite when the last
        # dimension has a single element (the unbiased form yields NaN there).
        var = x.var(-1, keepdim=True, unbiased=False)
        # Makes mean = 0 and variance = 1
        norm_x = (x - mean) / torch.sqrt(var + self.eps)
        return self.scale * norm_x + self.shift
class DummyGPT(torch.nn.Module):
def __init__(self, config: GPTConfig):
super().__init__()
self.tok_embedding = torch.nn.Embedding(
config.vocab_size,
config.embedding_dim,
@@ -24,4 +109,23 @@ class DummyGPT:
config.context_length,
config.embedding_dim,
)
self.dropout = torch.nn.Dropout(config.dropout)
self.drop_emb = torch.nn.Dropout(config.dropout)
self.trf_blocks = torch.nn.Sequential(
*[TransformerBlock(config) for _ in range(config.n_layers)]
)
self.final_norm = NormLayer(config.embedding_dim)
self.out_head = torch.nn.Linear(
config.embedding_dim,
config.vocab_size,
bias=False,
)
def forward(self, x: torch.Tensor) -> torch.Tensor:
    """Map a 2-D tensor of token ids to the output of ``out_head``.

    Token and position embeddings are summed, then passed in order through
    ``drop_emb``, ``trf_blocks``, ``final_norm`` and ``out_head``.
    """
    # Unpacking into two names requires the input to be exactly 2-D.
    _n_rows, n_tokens = x.shape
    positions = torch.arange(n_tokens, device=x.device)
    hidden = self.tok_embedding(x) + self.pos_embedding(positions)
    stages = (self.drop_emb, self.trf_blocks, self.final_norm, self.out_head)
    for stage in stages:
        hidden = stage(hidden)
    return hidden