From 1c9029ec78777ef9b78240ec46379450fdb7c66e Mon Sep 17 00:00:00 2001
From: Pavel Kirilin
Date: Fri, 1 May 2026 15:05:56 +0200
Subject: [PATCH] Untrained model is done.

---
Review notes (text between the three-dash line and the first "diff --git"
is commentary and is not applied):

* Layout: this patch had been collapsed onto a few very long lines. Line
  breaks and indentation were reconstructed from Python syntax and
  cross-checked against the hunk headers and the diffstat below. The
  whitespace of context lines is a best-effort reconstruction; verify it
  against the target tree before applying.
* Fixed in place (no hunk line counts change):
  - llmfs/gpt.py, GELU.forward: the cubic coefficient was 0.44715; the
    tanh approximation of GELU uses 0.044715.
  - llmfs/__main__.py, process_text: "Buiding tokenizer" ->
    "Building tokenizer".
  The "index" lines are kept from the original patch, so the post-image
  blob ids no longer correspond to the edited content.
* NOTE(review), not changed: TransformerBlock passes cfg.dropout and
  cfg.qkv_bias to MultiHeadAttention positionally and never forwards
  cfg.n_heads. The removed attn_test() called it with the keywords
  dropout= and num_heads=. The MultiHeadAttention signature is not part
  of this patch; confirm the 4th/5th positional parameters and the head
  count actually used.
* NOTE(review), not changed: NormLayer.forward normalizes with
  x.var(..., unbiased=True). torch.nn.LayerNorm uses the biased
  estimator; confirm the difference is intended.
* NOTE(review), not changed: in process_text the `text` argument and the
  `batch` tensor are no longer used by live code, and __main__.py imports
  TransformerBlock and BPETokenizer without using them in the lines
  visible here.

 llmfs/__main__.py |  97 +++++++++++++++++++++++------------------
 llmfs/gpt.py      | 108 +++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 162 insertions(+), 43 deletions(-)

diff --git a/llmfs/__main__.py b/llmfs/__main__.py
index 3cf0fe0..25778df 100644
--- a/llmfs/__main__.py
+++ b/llmfs/__main__.py
@@ -1,11 +1,11 @@
 from pathlib import Path
 
+import tiktoken
 import torch
 
-from llmfs.attn import MultiHeadAttention
-from llmfs.datasets.v1 import GPTDataSetV1
-from llmfs.gpt import GPTConfig
+from llmfs.gpt import DummyGPT, GPTConfig, TransformerBlock
 from llmfs.tokenizers import BPETokenizer
 
+
 DATA_DIR = Path(__file__).parent.parent / "data"
 
@@ -20,23 +20,61 @@ GPT_CONFIG_124M = GPTConfig(
 )
 
 
+def generate_text_simple(
+    model: DummyGPT, idx: torch.Tensor, max_new_tokens: int, context_size: int
+) -> torch.Tensor:
+    for _ in range(max_new_tokens):
+        idx_cond = idx[:, -context_size:]
+        with torch.no_grad():
+            logits: torch.Tensor = model(idx_cond)
+        logits = logits[:, -1, :]
+        probs = logits.softmax(dim=-1)
+        idx_next = probs.argmax(dim=-1, keepdim=True)
+        idx = torch.cat((idx, idx_next), dim=1)
+    return idx
+
+
 def process_text(text: str):
-    tokenizer = BPETokenizer.build(text)
+    print("Building tokenizer")
+    # tokenizer = BPETokenizer.build(text)
+    tokenizer = tiktoken.encoding_for_model("gpt2")
     vocab_size = tokenizer.max_token_value + 1
-    max_len = 4
-    ctx_len = max_len
-    output_dim = 256
-    token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)
-    pos_embedding_layer = torch.nn.Embedding(ctx_len, output_dim)
-    pos_embeddings = pos_embedding_layer(torch.arange(ctx_len))
-    dataset = GPTDataSetV1.data_loader(
-        text,
-        tokenizer,
-        batch_size=8,
-        max_len=4,
-        stride=1,
-        shuffle=False,
+    print(f"Tokenizer is ready. Vocab size: {vocab_size}")
+    batch = torch.stack(
+        [
+            torch.tensor(tokenizer.encode("Every effort moves you")),
+            torch.tensor(tokenizer.encode("Every day holds a")),
+        ],
+        dim=0,
     )
+    cfg = GPTConfig(
+        vocab_size=vocab_size,
+        context_length=1024,
+        embedding_dim=768,
+        n_heads=12,
+        n_layers=12,
+        dropout=0.1,
+        qkv_bias=False,
+    )
+    gpt = DummyGPT(cfg)
+    gpt.eval()
+    start_ctx = "Hello, I am"
+    encoded = tokenizer.encode(start_ctx)
+    encoded_tensor = torch.tensor(encoded).unsqueeze(0)
+    out = generate_text_simple(gpt, encoded_tensor, 6, cfg.context_length)
+    decoded_text = tokenizer.decode(out.squeeze(0).tolist())
+    print(decoded_text)
+    # logits = gpt(batch)
+    # print(logits)
+    # print(logits.shape)
+    # dataset = GPTDataSetV1.data_loader(
+    #     text,
+    #     tokenizer,
+    #     batch_size=8,
+    #     max_len=4,
+    #     stride=1,
+    #     shuffle=False,
+    # )
     # for inps, targs in iter(dataset):
     #     embeds = token_embedding_layer(inps)
     #     print(embeds.shape)
@@ -46,32 +84,9 @@ def process_text(text: str):
     # tokenizer = BPETokenizer.build(text)
 
 
-def attn_test():
-    inps = torch.Tensor(
-        [
-            [0.43, 0.15, 0.89],
-            [0.55, 0.87, 0.66],
-            [0.57, 0.85, 0.64],
-            [0.22, 0.58, 0.43],
-            [0.77, 0.25, 0.10],
-            [0.05, 0.80, 0.55],
-        ]
-    )
-    batch = torch.stack((inps, inps), dim=0)
-    attn = MultiHeadAttention(
-        inps.shape[1],
-        8,
-        inps.shape[0],
-        dropout=True,
-        num_heads=2,
-    )
-    print(attn(batch))
-
-
 def main():
     raw_text = (DATA_DIR / "the-verdict.txt").read_text()
-    # process_text(raw_text)
-    attn_test()
+    process_text(raw_text)
 
 
 if __name__ == "__main__":
diff --git a/llmfs/gpt.py b/llmfs/gpt.py
index a1fbbc2..9374e8e 100644
--- a/llmfs/gpt.py
+++ b/llmfs/gpt.py
@@ -2,6 +2,8 @@ from dataclasses import dataclass
 
 import torch
 
+from llmfs.attn import MultiHeadAttention
+
 
 @dataclass
 class GPTConfig:
@@ -14,8 +16,91 @@ class GPTConfig:
     qkv_bias: bool
 
 
-class DummyGPT:
+class DummyTransformerBlock(torch.nn.Module):
     def __init__(self, config: GPTConfig):
+        super().__init__()
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return x
+
+
+class GELU(torch.nn.Module):
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return (
+            0.5
+            * x
+            * (
+                1
+                + torch.tanh(
+                    torch.sqrt(torch.tensor(2.0 / torch.pi))
+                    * (x + 0.044715 * torch.pow(x, 3))
+                )
+            )
+        )
+
+
+class FeedForward(torch.nn.Module):
+    def __init__(self, cfg: GPTConfig) -> None:
+        super().__init__()
+        self.layers = torch.nn.Sequential(
+            torch.nn.Linear(cfg.embedding_dim, 4 * cfg.embedding_dim),
+            GELU(),
+            torch.nn.Linear(cfg.embedding_dim * 4, cfg.embedding_dim),
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return self.layers(x)
+
+
+class TransformerBlock(torch.nn.Module):
+    def __init__(self, cfg: GPTConfig) -> None:
+        super().__init__()
+        self.att = MultiHeadAttention(
+            cfg.embedding_dim,
+            cfg.embedding_dim,
+            cfg.context_length,
+            cfg.dropout,
+            cfg.qkv_bias,
+        )
+        self.ff = FeedForward(cfg)
+        self.norm1 = NormLayer(cfg.embedding_dim)
+        self.norm2 = NormLayer(cfg.embedding_dim)
+        self.dropout = torch.nn.Dropout(cfg.dropout)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        shortcut = x
+        x = self.norm1(x)
+        x = self.att(x)
+        x = self.dropout(x)
+        x = x + shortcut
+
+        shortcut = x
+        x = self.norm2(x)
+        x = self.ff(x)
+        x = self.dropout(x)
+        x = x + shortcut
+        return x
+
+
+class NormLayer(torch.nn.Module):
+    def __init__(self, dim: int, eps: float = 1e-5):
+        super().__init__()
+        self.dim = dim
+        self.eps = eps
+        self.scale = torch.nn.Parameter(torch.ones(dim))
+        self.shift = torch.nn.Parameter(torch.zeros(dim))
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        mean = x.mean(-1, keepdim=True)
+        var = x.var(-1, keepdim=True, unbiased=True)
+        # Makes mean = 0 and variance = 1
+        norm_x = (x - mean) / torch.sqrt(var + self.eps)
+        return self.scale * norm_x + self.shift
+
+
+class DummyGPT(torch.nn.Module):
+    def __init__(self, config: GPTConfig):
+        super().__init__()
         self.tok_embedding = torch.nn.Embedding(
             config.vocab_size,
             config.embedding_dim,
@@ -24,4 +109,23 @@ class DummyGPT:
             config.context_length,
             config.embedding_dim,
         )
-        self.dropout = torch.nn.Dropout(config.dropout)
+        self.drop_emb = torch.nn.Dropout(config.dropout)
+        self.trf_blocks = torch.nn.Sequential(
+            *[TransformerBlock(config) for _ in range(config.n_layers)]
+        )
+        self.final_norm = NormLayer(config.embedding_dim)
+        self.out_head = torch.nn.Linear(
+            config.embedding_dim,
+            config.vocab_size,
+            bias=False,
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        _, seq_len = x.shape
+        res = self.tok_embedding(x) + self.pos_embedding(
+            torch.arange(seq_len, device=x.device)
+        )
+        res = self.drop_emb(res)
+        res = self.trf_blocks(res)
+        res = self.final_norm(res)
+        return self.out_head(res)