From 1c9029ec78777ef9b78240ec46379450fdb7c66e Mon Sep 17 00:00:00 2001
From: Pavel Kirilin <s3riussan@gmail.com>
Date: Fri, 1 May 2026 15:05:56 +0200
Subject: [PATCH] Untrained model is done.

---
 llmfs/__main__.py |  97 +++++++++++++++++++++++------------------
 llmfs/gpt.py      | 108 +++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 162 insertions(+), 43 deletions(-)

diff --git a/llmfs/__main__.py b/llmfs/__main__.py
index 3cf0fe0..25778df 100644
--- a/llmfs/__main__.py
+++ b/llmfs/__main__.py
@@ -1,11 +1,11 @@
 from pathlib import Path
 
+import tiktoken
 import torch
-from llmfs.attn import MultiHeadAttention
-from llmfs.datasets.v1 import GPTDataSetV1
-from llmfs.gpt import GPTConfig
+from llmfs.gpt import DummyGPT, GPTConfig, TransformerBlock
 from llmfs.tokenizers import BPETokenizer
 
+
 DATA_DIR = Path(__file__).parent.parent / "data"
 
 
@@ -20,23 +20,61 @@ GPT_CONFIG_124M = GPTConfig(
 )
 
 
+def generate_text_simple(
+    model: DummyGPT, idx: torch.Tensor, max_new_tokens: int, context_size: int
+) -> torch.Tensor:
+    """Return ``idx`` extended with ``max_new_tokens`` argmax-selected ids."""
+    for _ in range(max_new_tokens):
+        # Only the last ``context_size`` columns of ``idx`` reach the model.
+        with torch.no_grad():
+            step_logits: torch.Tensor = model(idx[:, -context_size:])
+        # Softmax over the final position's logits, then keep the top entry.
+        next_id = step_logits[:, -1, :].softmax(dim=-1).argmax(dim=-1, keepdim=True)
+        idx = torch.cat((idx, next_id), dim=1)
+    return idx
+
+
 def process_text(text: str):
-    tokenizer = BPETokenizer.build(text)
+    print("Buiding tokenizer")
+    # tokenizer = BPETokenizer.build(text)
+    tokenizer = tiktoken.encoding_for_model("gpt2")
     vocab_size = tokenizer.max_token_value + 1
-    max_len = 4
-    ctx_len = max_len
-    output_dim = 256
-    token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)
-    pos_embedding_layer = torch.nn.Embedding(ctx_len, output_dim)
-    pos_embeddings = pos_embedding_layer(torch.arange(ctx_len))
-    dataset = GPTDataSetV1.data_loader(
-        text,
-        tokenizer,
-        batch_size=8,
-        max_len=4,
-        stride=1,
-        shuffle=False,
+    print(f"Tokenizer is ready. Vocab size: {vocab_size}")
+    batch = torch.stack(
+        [
+            torch.tensor(tokenizer.encode("Every effort moves you")),
+            torch.tensor(tokenizer.encode("Every day holds a")),
+        ],
+        dim=0,
     )
+    cfg = GPTConfig(
+        vocab_size=vocab_size,
+        context_length=1024,
+        embedding_dim=768,
+        n_heads=12,
+        n_layers=12,
+        dropout=0.1,
+        qkv_bias=False,
+    )
+    gpt = DummyGPT(cfg)
+    gpt.eval()
+    start_ctx = "Hello, I am"
+    encoded = tokenizer.encode(start_ctx)
+    encoded_tensor = torch.tensor(encoded).unsqueeze(0)
+    out = generate_text_simple(gpt, encoded_tensor, 6, cfg.context_length)
+    decoded_text = tokenizer.decode(out.squeeze(0).tolist())
+    print(decoded_text)
+    # logits = gpt(batch)
+    # print(logits)
+    # print(logits.shape)
+    # dataset = GPTDataSetV1.data_loader(
+    #     text,
+    #     tokenizer,
+    #     batch_size=8,
+    #     max_len=4,
+    #     stride=1,
+    #     shuffle=False,
+    # )
     # for inps, targs in iter(dataset):
     #     embeds = token_embedding_layer(inps)
     #     print(embeds.shape)
@@ -46,32 +84,9 @@ def process_text(text: str):
     # tokenizer = BPETokenizer.build(text)
 
 
-def attn_test():
-    inps = torch.Tensor(
-        [
-            [0.43, 0.15, 0.89],
-            [0.55, 0.87, 0.66],
-            [0.57, 0.85, 0.64],
-            [0.22, 0.58, 0.43],
-            [0.77, 0.25, 0.10],
-            [0.05, 0.80, 0.55],
-        ]
-    )
-    batch = torch.stack((inps, inps), dim=0)
-    attn = MultiHeadAttention(
-        inps.shape[1],
-        8,
-        inps.shape[0],
-        dropout=True,
-        num_heads=2,
-    )
-    print(attn(batch))
-
-
 def main():
     raw_text = (DATA_DIR / "the-verdict.txt").read_text()
-    # process_text(raw_text)
-    attn_test()
+    process_text(raw_text)
 
 
 if __name__ == "__main__":
diff --git a/llmfs/gpt.py b/llmfs/gpt.py
index a1fbbc2..9374e8e 100644
--- a/llmfs/gpt.py
+++ b/llmfs/gpt.py
@@ -2,6 +2,8 @@ from dataclasses import dataclass
 
 import torch
 
+from llmfs.attn import MultiHeadAttention
+
 
 @dataclass
 class GPTConfig:
@@ -14,8 +16,91 @@ class GPTConfig:
     qkv_bias: bool
 
 
-class DummyGPT:
+class DummyTransformerBlock(torch.nn.Module):
     def __init__(self, config: GPTConfig):
+        super().__init__()
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return x
+
+
+class GELU(torch.nn.Module):
+    """GELU activation computed with the tanh approximation."""
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Apply GELU element-wise; the output has the same shape as ``x``.
+
+        Computes 0.5 * x * (1 + tanh(sqrt(2 / pi) * (x + 0.044715 * x**3))).
+        """
+        # The cubic coefficient is 0.044715; 0.44715 is ten times too large
+        # and pulls the curve away from the exact GELU.
+        scale = torch.sqrt(torch.tensor(2.0 / torch.pi))
+        inner = scale * (x + 0.044715 * torch.pow(x, 3))
+        return 0.5 * x * (1 + torch.tanh(inner))
+
+
+class FeedForward(torch.nn.Module):
+    """Linear -> GELU -> Linear stack with a hidden width of 4 * embedding_dim."""
+
+    def __init__(self, cfg: GPTConfig) -> None:
+        super().__init__()
+        dim, hidden = cfg.embedding_dim, 4 * cfg.embedding_dim
+        expand, project = torch.nn.Linear(dim, hidden), torch.nn.Linear(hidden, dim)
+        self.layers = torch.nn.Sequential(expand, GELU(), project)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return self.layers(x)
+
+
+class TransformerBlock(torch.nn.Module):
+    """Attention and feed-forward sub-layers, each wrapped in a residual add."""
+
+    def __init__(self, cfg: GPTConfig) -> None:
+        super().__init__()
+        dim = cfg.embedding_dim
+        # NOTE(review): cfg.n_heads is never forwarded, and cfg.qkv_bias is
+        # passed as the fifth positional argument -- confirm both against the
+        # MultiHeadAttention signature in llmfs/attn.py.
+        self.att = MultiHeadAttention(
+            dim, dim, cfg.context_length, cfg.dropout, cfg.qkv_bias
+        )
+        self.ff = FeedForward(cfg)
+        self.norm1 = NormLayer(dim)
+        self.norm2 = NormLayer(dim)
+        self.dropout = torch.nn.Dropout(cfg.dropout)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Apply both sub-layers; the same Dropout module follows each one."""
+        # Attention path: normalise, attend, drop, then add the input back.
+        # Normalisation happens before the sub-layer, not after the add.
+        attended = self.att(self.norm1(x))
+        x = self.dropout(attended) + x
+
+        # Feed-forward path follows the same normalise/drop/add pattern.
+        transformed = self.ff(self.norm2(x))
+        x = self.dropout(transformed) + x
+        return x
+
+
+class NormLayer(torch.nn.Module):
+    """Layer normalization over the last dimension with learnable scale/shift."""
+
+    def __init__(self, dim: int, eps: float = 1e-5):
+        super().__init__()
+        self.dim = dim
+        self.eps = eps  # added to the variance before the square root
+        self.scale = torch.nn.Parameter(torch.ones(dim))
+        self.shift = torch.nn.Parameter(torch.zeros(dim))
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        # Biased variance (divide by N) matches torch.nn.LayerNorm; no NaN at dim 1.
+        mean, var = x.mean(-1, keepdim=True), x.var(-1, keepdim=True, unbiased=False)
+        return self.scale * ((x - mean) / torch.sqrt(var + self.eps)) + self.shift
+
+
+class DummyGPT(torch.nn.Module):
+    def __init__(self, config: GPTConfig):
+        super().__init__()
         self.tok_embedding = torch.nn.Embedding(
             config.vocab_size,
             config.embedding_dim,
@@ -24,4 +109,23 @@ class DummyGPT:
             config.context_length,
             config.embedding_dim,
         )
-        self.dropout = torch.nn.Dropout(config.dropout)
+        self.drop_emb = torch.nn.Dropout(config.dropout)
+        self.trf_blocks = torch.nn.Sequential(
+            *[TransformerBlock(config) for _ in range(config.n_layers)]
+        )
+        self.final_norm = NormLayer(config.embedding_dim)
+        self.out_head = torch.nn.Linear(
+            config.embedding_dim,
+            config.vocab_size,
+            bias=False,
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Map a 2-D tensor of token ids to the output of ``out_head``."""
+        # Unpacking enforces a 2-D input; the second dim is used as positions.
+        _, n_tokens = x.shape
+        positions = torch.arange(n_tokens, device=x.device)
+        hidden = self.tok_embedding(x) + self.pos_embedding(positions)
+        hidden = self.drop_emb(hidden)
+        hidden = self.trf_blocks(hidden)
+        return self.out_head(self.final_norm(hidden))